mirror of
https://github.com/TorqueGameEngines/Torque3D.git
synced 2026-03-19 12:20:57 +00:00
Merge pull request #1680 from marauder2k9-torque/SIMD-Math-and-ISA-integration
SIMD math and ISA integration
This commit is contained in:
commit
6700cfd993
49 changed files with 5232 additions and 212 deletions
|
|
@ -130,7 +130,7 @@ torqueAddSourceDirectories("windowManager" "windowManager/torque" "windowManager
|
|||
torqueAddSourceDirectories("scene" "scene/culling" "scene/zones" "scene/mixin")
|
||||
|
||||
# Handle math
|
||||
torqueAddSourceDirectories("math" "math/util")
|
||||
torqueAddSourceDirectories("math" "math/util" "math/public" "math/impl") # note impl must skip the .inl files, never use them in engine code.
|
||||
|
||||
# Handle persistence
|
||||
set(TORQUE_INCLUDE_DIRECTORIES ${TORQUE_INCLUDE_DIRECTORIES} "persistence/rapidjson")
|
||||
|
|
@ -496,6 +496,37 @@ else()
|
|||
set_target_properties(${TORQUE_APP_NAME} PROPERTIES LINK_FLAGS "-Wl,-rpath,./")
|
||||
endif()
|
||||
|
||||
string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" ARCH)
|
||||
|
||||
set(IS_X86 FALSE)
|
||||
set(IS_ARM FALSE)
|
||||
|
||||
if(ARCH MATCHES "x86_64|amd64|i[3-6]86")
|
||||
set(IS_X86 TRUE)
|
||||
endif()
|
||||
|
||||
if(ARCH MATCHES "arm64|aarch64")
|
||||
set(IS_ARM TRUE)
|
||||
endif()
|
||||
|
||||
# always available
|
||||
add_math_backend(scalar MATH_SIMD_SCALAR)
|
||||
message(STATUS "Processor: ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
message(STATUS "IS_X86=${IS_X86}")
|
||||
message(STATUS "IS_ARM=${IS_ARM}")
|
||||
|
||||
# x86 family
|
||||
if(IS_X86)
|
||||
add_math_backend(sse2 MATH_SIMD_SSE2)
|
||||
add_math_backend(sse41 MATH_SIMD_SSE41)
|
||||
add_math_backend(avx MATH_SIMD_AVX)
|
||||
add_math_backend(avx2 MATH_SIMD_AVX2)
|
||||
endif()
|
||||
|
||||
# ARM family
|
||||
if(IS_ARM)
|
||||
add_math_backend(neon MATH_SIMD_NEON)
|
||||
endif()
|
||||
|
||||
if(MSVC)
|
||||
# Match projectGenerator naming for executables
|
||||
|
|
|
|||
|
|
@ -3406,7 +3406,7 @@ MatrixF MeshRoad::getNodeTransform( U32 idx )
|
|||
mat.setColumn( 2, node.normal );
|
||||
mat.setColumn( 3, node.point );
|
||||
|
||||
AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!");
|
||||
AssertFatal(mat.determinant() != 0.0f, "no inverse!");
|
||||
|
||||
return mat;
|
||||
}
|
||||
|
|
@ -3456,7 +3456,7 @@ void MeshRoad::calcSliceTransform( U32 idx, MatrixF &mat )
|
|||
mat.setColumn( 2, slice.normal );
|
||||
mat.setColumn( 3, slice.p1 );
|
||||
|
||||
AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!");
|
||||
AssertFatal(mat.determinant() != 0.0f, "no inverse!");
|
||||
}
|
||||
|
||||
F32 MeshRoad::getRoadLength() const
|
||||
|
|
|
|||
|
|
@ -2139,7 +2139,7 @@ MatrixF River::getNodeTransform( U32 idx ) const
|
|||
mat.setColumn( 2, node.normal );
|
||||
mat.setColumn( 3, node.point );
|
||||
|
||||
AssertFatal( m_matF_determinant( mat ) != 0.0f, "no inverse!");
|
||||
AssertFatal( mat.determinant() != 0.0f, "no inverse!");
|
||||
|
||||
return mat;
|
||||
}
|
||||
|
|
|
|||
122
Engine/source/math/impl/float3_impl.inl
Normal file
122
Engine/source/math/impl/float3_impl.inl
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
#pragma once
#include <cmath> // for sqrtf, etc.
#include "../mConstants.h"

// SIMD-backed implementations of 3-component float vector math.
// Each routine widens the 3 floats into a 4-lane register (v_load3_vec),
// operates on the lanes, and writes only the first 3 back out (v_store3).
// NOTE(review): the contents of the 4th lane after v_load3_vec are not
// visible here -- presumably zeroed; confirm in the per-ISA load helper.
namespace math_backend::float3
{
   //----------------------------------------------------------
   // Add two float3 vectors: r = a + b
   inline void float3_add_impl(const float* a, const float* b, float* r)
   {
      f32x4 va = v_load3_vec(a);
      f32x4 vb = v_load3_vec(b);
      f32x4 vr = v_add(va, vb);
      v_store3(r, vr);
   }

   // Subtract: r = a - b
   inline void float3_sub_impl(const float* a, const float* b, float* r)
   {
      f32x4 va = v_load3_vec(a);
      f32x4 vb = v_load3_vec(b);
      f32x4 vr = v_sub(va, vb);
      v_store3(r, vr);
   }

   // Multiply element-wise: r = a * b
   inline void float3_mul_impl(const float* a, const float* b, float* r)
   {
      f32x4 va = v_load3_vec(a);
      f32x4 vb = v_load3_vec(b);
      f32x4 vr = v_mul(va, vb);
      v_store3(r, vr);
   }

   // Multiply by scalar: r = a * s
   inline void float3_mul_scalar_impl(const float* a, float s, float* r)
   {
      f32x4 va = v_load3_vec(a);
      f32x4 vs = v_set1(s);
      f32x4 vr = v_mul(va, vs);
      v_store3(r, vr);
   }

   // Divide element-wise: r = a / b
   inline void float3_div_impl(const float* a, const float* b, float* r)
   {
      f32x4 va = v_load3_vec(a);
      f32x4 vb = v_load3_vec(b);
      f32x4 vr = v_div(va, vb);
      v_store3(r, vr);
   }

   // Divide by scalar: r = a / s
   // (A true per-lane divide; the scalar dispatch path uses a
   // reciprocal-multiply instead -- results may differ in the last ulp.)
   inline void float3_div_scalar_impl(const float* a, float s, float* r)
   {
      f32x4 va = v_load3_vec(a);
      f32x4 vs = v_set1(s);
      f32x4 vr = v_div(va, vs);
      v_store3(r, vr);
   }

   // Dot product: returns scalar
   inline float float3_dot_impl(const float* a, const float* b)
   {
      f32x4 va = v_load3_vec(a);
      f32x4 vb = v_load3_vec(b);
      f32x4 vdot = v_dot3(va, vb);
      return v_extract0(vdot); // first lane is the sum of 3 elements
   }

   // Length squared: dot(a, a)
   inline float float3_length_squared_impl(const float* a)
   {
      return float3_dot_impl(a, a);
   }

   // Length: sqrt(dot(a, a))
   inline float float3_length_impl(const float* a)
   {
      return std::sqrt(float3_length_squared_impl(a));
   }

   // Normalize in-place.
   // NOTE(review): unlike the scalar dispatch path, there is no
   // POINT_EPSILON guard here -- a zero-length input presumably yields
   // non-finite lanes; confirm callers never pass degenerate vectors.
   inline void float3_normalize_impl(float* a)
   {
      f32x4 va = v_load3_vec(a);
      f32x4 vr = v_normalize3(va);
      v_store3(a, vr);
   }

   // Scale a (in place) to have length r: a = normalize(a) * r.
   // Same caveat as float3_normalize_impl: no zero-length guard.
   inline void float3_normalize_mag_impl(float* a, float r)
   {
      f32x4 va = v_load3_vec(a);

      // invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a))
      f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot3(va, va)));

      f32x4 vnorm = v_mul(va, invLen);
      v_store3(a, vnorm);
   }

   // Linear interpolation: r = from + (to - from) * f
   inline void float3_lerp_impl(const float* from, const float* to, float f, float* r)
   {
      f32x4 vfrom = v_load3_vec(from);
      f32x4 vto = v_load3_vec(to);
      f32x4 vf = v_set1(f);
      f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom)));
      v_store3(r, vr);
   }

   // Cross product: r = a x b
   inline void float3_cross_impl(const float* a, const float* b, float* r)
   {
      f32x4 va = v_load3_vec(a);
      f32x4 vb = v_load3_vec(b);
      f32x4 vcross = v_cross(va, vb);
      v_store3(r, vcross);
   }

}
|
||||
123
Engine/source/math/impl/float4_impl.inl
Normal file
123
Engine/source/math/impl/float4_impl.inl
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
#pragma once
#include <cmath> // for sqrtf, etc.
#include "../mConstants.h"

// SIMD-backed implementations of 4-component float vector math.
// Operates on full 4-lane registers via the active ISA backend's
// v_load/v_store and arithmetic helpers.
namespace math_backend::float4
{

   //----------------------------------------------------------
   // Add two float4 vectors: r = a + b
   inline void float4_add_impl(const float* a, const float* b, float* r)
   {
      f32x4 va = v_load(a);
      f32x4 vb = v_load(b);
      f32x4 vr = v_add(va, vb);
      v_store(r, vr);
   }

   // Subtract: r = a - b
   inline void float4_sub_impl(const float* a, const float* b, float* r)
   {
      f32x4 va = v_load(a);
      f32x4 vb = v_load(b);
      f32x4 vr = v_sub(va, vb);
      v_store(r, vr);
   }

   // Multiply element-wise: r = a * b
   inline void float4_mul_impl(const float* a, const float* b, float* r)
   {
      f32x4 va = v_load(a);
      f32x4 vb = v_load(b);
      f32x4 vr = v_mul(va, vb);
      v_store(r, vr);
   }

   // Multiply by scalar: r = a * s
   inline void float4_mul_scalar_impl(const float* a, float s, float* r)
   {
      f32x4 va = v_load(a);
      f32x4 vs = v_set1(s);
      f32x4 vr = v_mul(va, vs);
      v_store(r, vr);
   }

   // Divide element-wise: r = a / b
   inline void float4_div_impl(const float* a, const float* b, float* r)
   {
      f32x4 va = v_load(a);
      f32x4 vb = v_load(b);
      f32x4 vr = v_div(va, vb);
      v_store(r, vr);
   }

   // Divide by scalar: r = a / s
   inline void float4_div_scalar_impl(const float* a, float s, float* r)
   {
      f32x4 va = v_load(a);
      f32x4 vs = v_set1(s);
      f32x4 vr = v_div(va, vs);
      v_store(r, vr);
   }

   // Dot product over all 4 lanes: returns scalar
   inline float float4_dot_impl(const float* a, const float* b)
   {
      f32x4 va = v_load(a);
      f32x4 vb = v_load(b);
      f32x4 vdot = v_dot4(va, vb); // calls ISA-specific implementation
      return v_extract0(vdot);
   }

   // Length squared: dot(a, a)
   inline float float4_length_squared_impl(const float* a)
   {
      return float4_dot_impl(a, a);
   }

   // Length: sqrt(dot(a, a))
   inline float float4_length_impl(const float* a)
   {
      return std::sqrt(float4_length_squared_impl(a));
   }

   // Normalize in-place via refined reciprocal square root.
   // NOTE(review): no zero-length (POINT_EPSILON) guard, unlike the
   // scalar dispatch path -- confirm inputs are never degenerate.
   inline void float4_normalize_impl(float* a)
   {
      f32x4 va = v_load(a);
      f32x4 invLen = v_rsqrt_nr(v_dot4(va, va)); // fully abstracted
      f32x4 vnorm = v_mul(va, invLen);
      v_store(a, vnorm);
   }

   // Scale a (in place) to have length r: a = normalize(a) * r.
   inline void float4_normalize_mag_impl(float* a, float r)
   {
      f32x4 va = v_load(a);

      // invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a))
      f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot4(va, va)));

      f32x4 vnorm = v_mul(va, invLen);
      v_store(a, vnorm);
   }

   // Linear interpolation: r = from + (to - from) * f
   inline void float4_lerp_impl(const float* from, const float* to, float f, float* r)
   {
      f32x4 vfrom = v_load(from);
      f32x4 vto = v_load(to);
      f32x4 vf = v_set1(f);
      f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom)));
      v_store(r, vr);
   }

   // Cross product. NOTE(review): v_cross is presumably the 3-component
   // cross (w handling not visible here); all 4 lanes of the result are
   // stored -- confirm what v_cross leaves in lane 3.
   inline void float4_cross_impl(const float* a, const float* b, float* r)
   {
      f32x4 va = v_load(a);
      f32x4 vb = v_load(b);
      f32x4 vcross = v_cross(va, vb);
      v_store(r, vcross);
   }

} // namespace math_backend::float4
|
||||
424
Engine/source/math/impl/mat44_impl.inl
Normal file
424
Engine/source/math/impl/mat44_impl.inl
Normal file
|
|
@ -0,0 +1,424 @@
|
|||
#pragma once
#include <cmath> // for sqrtf, etc.
#include "../mConstants.h"

// SIMD-backed 4x4 matrix routines operating on row-major float[16] data
// with the translation stored in column 3 (see the m[3]/m[7]/m[11]
// accesses below). All f32x4 / f32x4x4 helpers (m_load, v_cross, ...)
// are supplied by the active ISA backend.
namespace math_backend::mat44
{
   //------------------------------------------------------------------
   // Matrix Transpose (in place)
   inline void mat44_transpose_impl(float* m)
   {
      f32x4x4 ma = m_load(m);
      f32x4x4 mr = m_transpose(ma);
      m_store(m, mr);
   }

   // Determinant. NOTE(review): delegates to m_determinant_affine, so
   // presumably only the upper-left 3x3 contributes (matching the scalar
   // dispatch determinant) -- confirm against callers' expectations.
   inline float mat44_get_determinant(const float* m)
   {
      f32x4x4 ma = m_load(m);
      return v_extract0(m_determinant_affine(ma));
   }

   // Matrix Scale: Float3 (assume w = 1.0f)
   // Multiplies every row by (sx, sy, sz, 1), i.e. scales the first
   // three columns (a post-multiplied scale).
   inline void mat44_scale_impl(float* m, const float* s)
   {
      f32x4x4 ma = m_load(m);
      f32x4 va = v_load3_pos(s);

      ma.r0 = v_mul(ma.r0, va);
      ma.r1 = v_mul(ma.r1, va);
      ma.r2 = v_mul(ma.r2, va);
      ma.r3 = v_mul(ma.r3, va);
      m_store(m, ma);
   }

   // Transform the plane (nx, ny, nz, d) by matrix m under the given
   // non-uniform scale: the normal is pushed through the inverse scale
   // and the rotation part, renormalized, and the distance is recomputed
   // from a transformed point known to lie on the plane.
   inline void mat44_transform_plane_impl(const float* m, const float* scale, const float* plane, float* plane_result)
   {
      f32x4x4 M = m_load(m);

      f32x4 plane_v = v_load(plane);
      f32x4 scale_v = v_load3_vec(scale);
      f32x4 invScale = v_rcp_nr(scale_v);

      // normal = plane.xyz
      f32x4 normal = plane_v;

      // apply Inv(s)
      normal = v_mul(normal, invScale);

      // multiply by Inv(Tr(m)) (only the rotation part matters)
      f32x4 nx = v_mul(v_swizzle_singular_mask(normal, 0), M.r0);
      f32x4 ny = v_mul(v_swizzle_singular_mask(normal, 1), M.r1);
      f32x4 nz = v_mul(v_swizzle_singular_mask(normal, 2), M.r2);

      normal = v_add(v_add(nx, ny), nz);

      normal = v_normalize3(normal);

      // compute point on plane: closest point to the origin is n * -d
      float d = v_extract0(v_swizzle_singular_mask(plane_v, 3));

      f32x4 point = v_mul(plane_v, v_set1(-d));
      point = v_preserve_w(point, v_set1(1.0f)); // force w = 1 for the position transform

      // apply scale
      point = v_mul(point, scale_v);

      // transform point by matrix
      point = m_mul_vec4(M, point);

      // compute new plane distance: d' = -dot(point, normal)
      float newD = -v_extract0(v_dot3(point, normal));

      alignas(16) float n[4];
      v_store(n, normal);

      plane_result[0] = n[0];
      plane_result[1] = n[1];
      plane_result[2] = n[2];
      plane_result[3] = newD;
   }

   // Extract per-axis scale: the length of each of the first three rows.
   inline void mat44_get_scale_impl(const float* m, float* s)
   {
      f32x4x4 ma = m_load(m);

      // squared lengths
      f32x4 len2_x = v_dot3(ma.r0, ma.r0);
      f32x4 len2_y = v_dot3(ma.r1, ma.r1);
      f32x4 len2_z = v_dot3(ma.r2, ma.r2);

      // extract and sqrt -- 1 / rsqrt(x) == sqrt(x), reusing the
      // refined reciprocal-sqrt helper
      s[0] = 1.0f / v_extract0(v_rsqrt_nr(len2_x));
      s[1] = 1.0f / v_extract0(v_rsqrt_nr(len2_y));
      s[2] = 1.0f / v_extract0(v_rsqrt_nr(len2_z));
   }

   // Matrix Scale Uniform: Float value (assume w = 1.0f)
   inline void mat44_scale_uniform(float* m, float s)
   {
      f32x4x4 ma = m_load(m);

      // (s, s, s, 1)
      f32x4 scale = v_set(s, s, s, 1.0f);

      // Scale only rotation rows (xyz part); r3 is left untouched.
      ma.r0 = v_mul(ma.r0, scale);
      ma.r1 = v_mul(ma.r1, scale);
      ma.r2 = v_mul(ma.r2, scale);
      m_store(m, ma);
   }

   // Matrix Inverse (in place), 3x3 part via the cofactor/cross-product
   // identity, translation inverted separately.
   // NOTE(review): the translation below is rebuilt from the ORIGINAL
   // rotation rows (effectively -R^T * T), which is exact only when the
   // 3x3 part is orthonormal; a general inverse would use the scaled
   // cofactor rows (mTemp) instead -- confirm the intended input class.
   inline void mat44_inverse_impl(float* m)
   {
      f32x4x4 ma = m_load(m);

      // Compute cofactors using cross products
      f32x4x4 mTemp;
      mTemp.r0 = v_cross(ma.r1, ma.r2);
      mTemp.r1 = v_cross(ma.r2, ma.r0);
      mTemp.r2 = v_cross(ma.r0, ma.r1);

      // Determinant = dot(ma.r0, c0)
      f32x4 det = v_dot3(ma.r0, mTemp.r0);
      f32x4 invDet = v_rcp_nr(det);

      // Scale cofactors
      mTemp.r0 = v_mul(mTemp.r0, invDet);
      mTemp.r1 = v_mul(mTemp.r1, invDet);
      mTemp.r2 = v_mul(mTemp.r2, invDet);

      // Store inverse 3x3 (transpose of cofactor matrix)

      mTemp = m_transpose(mTemp);
      mTemp.r3 = ma.r3;

      // ---- Translation ----

      // Load original translation
      f32x4 T = v_set(m[3], m[7], m[11], 0.0f);

      // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2)
      f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0));
      result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1)));
      result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2)));
      result = v_mul(result, v_set1(-1.0f));

      m_store(m, mTemp);

      // Store translation
      m[3] = v_extract0(result);
      m[7] = v_extract0(v_swizzle_singular_mask(result, 1));
      m[11] = v_extract0(v_swizzle_singular_mask(result, 2));
   }

   // Matrix Inverse, out-of-place variant of mat44_inverse_impl.
   // Same orthonormality caveat as above applies to the translation.
   inline void mat44_inverse_to_impl(const float* m, float* dst)
   {
      f32x4x4 ma = m_load(m);

      // Compute cofactors using cross products
      f32x4x4 mTemp;
      mTemp.r0 = v_cross(ma.r1, ma.r2);
      mTemp.r1 = v_cross(ma.r2, ma.r0);
      mTemp.r2 = v_cross(ma.r0, ma.r1);

      // Determinant = dot(ma.r0, c0)
      f32x4 det = v_dot3(ma.r0, mTemp.r0);
      f32x4 invDet = v_rcp_nr(det);

      // Scale cofactors
      mTemp.r0 = v_mul(mTemp.r0, invDet);
      mTemp.r1 = v_mul(mTemp.r1, invDet);
      mTemp.r2 = v_mul(mTemp.r2, invDet);

      // Store inverse 3x3 (transpose of cofactor matrix)

      mTemp = m_transpose(mTemp);
      mTemp.r3 = ma.r3;

      // ---- Translation ----

      // Load original translation
      f32x4 T = v_set(m[3], m[7], m[11], 0.0f);

      // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2)
      f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0));
      result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1)));
      result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2)));
      result = v_mul(result, v_set1(-1.0f));

      m_store(dst, mTemp);

      // Store translation
      dst[3] = v_extract0(result);
      dst[7] = v_extract0(v_swizzle_singular_mask(result, 1));
      dst[11] = v_extract0(v_swizzle_singular_mask(result, 2));
   }

   // Matrix Affine Inverse (in place): 3x3 inverted by transposition
   // (valid for orthonormal rotation), translation by -R^T * T.
   inline void mat44_affine_inverse_impl(float* m)
   {
      f32x4x4 ma = m_load(m);

      f32x4x4 mTemp = m_transpose(ma);
      mTemp.r3 = v_set(0, 0, 0, 1);

      // ---- Translation ----
      // Load original translation
      f32x4 T = v_set(m[3], m[7], m[11], 0.0f);

      // Compute -(Tx*ma.r0 + Ty*ma.r1 + Tz*ma.r2)
      f32x4 result = v_mul(ma.r0, v_swizzle_singular_mask(T, 0));
      result = v_add(result, v_mul(ma.r1, v_swizzle_singular_mask(T, 1)));
      result = v_add(result, v_mul(ma.r2, v_swizzle_singular_mask(T, 2)));
      result = v_mul(result, v_set1(-1.0f));

      m_store(m, mTemp);

      // Store translation
      m[3] = v_extract0(result);
      m[7] = v_extract0(v_swizzle_singular_mask(result, 1));
      m[11] = v_extract0(v_swizzle_singular_mask(result, 2));
   }

   // Re-orthonormalize the rotation part (works on columns; column 0's
   // direction is preserved, columns 1 and 2 are rebuilt from it).
   inline void mat44_normalize_impl(float* m)
   {
      // Load the matrix into SIMD registers
      f32x4x4 mat = m_load(m);

      // Transpose: now rows are columns
      mat = m_transpose(mat);

      // Extract columns (which are now rows)
      f32x4 col0 = mat.r0;
      f32x4 col1 = mat.r1;

      // Rebuild orthonormal basis
      f32x4 col2 = v_cross(col0, col1);
      col1 = v_cross(col2, col0);

      // Normalize columns
      col0 = v_normalize3(col0);
      col1 = v_normalize3(col1);
      col2 = v_normalize3(col2);

      // Write back directly into transposed matrix
      mat.r0 = col0;
      mat.r1 = col1;
      mat.r2 = col2;

      // Transpose back to row-major
      mat = m_transpose(mat);

      // Store back
      m_store(m, mat);
   }

   // Matrix Multiply: r = a * b
   inline void mat44_mul_mat44_impl(const float* a, const float* b, float* r)
   {
      f32x4x4 ma = m_load(a);
      f32x4x4 mb = m_load(b);

      f32x4x4 mr = m_mul(ma, mb);
      m_store(r, mr);
   }

   // Vector Multiply: m * p (assume w = 1.0f) -- point transform,
   // translation applies. Only xyz of the result are stored.
   inline void mat44_mul_pos3_impl(const float *m, const float *p, float* r)
   {
      f32x4x4 ma = m_load(m);
      f32x4 va = v_load3_pos(p);
      f32x4 vr = m_mul_vec4(ma, va);
      v_store3(r, vr);
   }

   // Vector Multiply: m * v (assume w = 0.0f) -- direction transform,
   // translation does not apply.
   inline void mat44_mul_vec3_impl(const float* m, const float* v, float* r)
   {
      f32x4x4 ma = m_load(m);
      f32x4 va = v_load3_vec(v);
      f32x4 vr = m_mul_vec3(ma, va);
      v_store3(r, vr);
   }

   // Vector Multiply: m * p (full [4x4] * [1x4])
   inline void mat44_mul_float4_impl(const float* m, const float* p, float* r)
   {
      f32x4x4 ma = m_load(m);
      f32x4 va = v_load(p);
      f32x4 vr = m_mul_vec4(ma, va);
      v_store(r, vr);
   }

   //--------------------------------------------------
   // MATRIX ROTATION FUNCTIONS
   //--------------------------------------------------

   // Rotation about X by `angle` radians (overwrites m entirely).
   inline void mat44_rotation_x_impl(float* m, float angle)
   {
      float c = cosf(angle), s = sinf(angle);
      f32x4x4 mr = m_identity();
      mr.r1 = v_set(0, c, s, 0);
      mr.r2 = v_set(0, -s, c, 0);
      m_store(m, mr);
   }

   // Rotation about Y by `angle` radians (overwrites m entirely).
   inline void mat44_rotation_y_impl(float* m, float angle)
   {
      float c = cosf(angle), s = sinf(angle);
      f32x4x4 mr = m_identity();
      mr.r0 = v_set(c, 0, -s, 0);
      mr.r2 = v_set(s, 0, c, 0);
      m_store(m, mr);
   }

   // Rotation about Z by `angle` radians (overwrites m entirely).
   inline void mat44_rotation_z_impl(float* m, float angle)
   {
      float c = cosf(angle), s = sinf(angle);
      f32x4x4 mr = m_identity();
      mr.r0 = v_set(c, s, 0, 0);
      mr.r1 = v_set(-s, c, 0, 0);
      m_store(m, mr);
   }

   // Compose rotation from Euler angles (pitch=X, yaw=Y, roll=Z),
   // applied as Rz * Ry * Rx.
   // NOTE(review): the (float*)&rx casts rely on f32x4x4 being
   // layout-compatible with float[16] -- confirm per backend.
   inline void mat44_rotation_euler_impl(float* m, float pitch, float yaw, float roll)
   {
      f32x4x4 rx, ry, rz;
      mat44_rotation_x_impl((float*)&rx, pitch);
      mat44_rotation_y_impl((float*)&ry, yaw);
      mat44_rotation_z_impl((float*)&rz, roll);

      f32x4x4 r = m_mul(rz, m_mul(ry, rx));
      m_store(m, r);
   }

   // Build a view matrix looking from `eye` toward `target` with the
   // given `up` hint. Rows are the camera basis vectors with the
   // -dot(axis, eye) translation folded into each row's w lane.
   inline void mat44_lookat_impl(float* m, const float* eye, const float* target, const float* up)
   {
      f32x4 vEye = v_load3_pos(eye);
      f32x4 vTarget = v_load3_pos(target);
      f32x4 vUp = v_load3_vec(up);

      // Forward (z+)
      f32x4 zaxis = v_normalize3(v_sub(vTarget, vEye));

      // Right (x+)
      f32x4 xaxis = v_normalize3(v_cross(vUp, zaxis));

      // Up (y+)
      f32x4 yaxis = v_cross(zaxis, xaxis);

      // Compute translation components: -dot(axis, eye)
      f32x4 t_x = v_mul(v_dot3(xaxis, vEye), v_set1(-1.0f));
      f32x4 t_y = v_mul(v_dot3(yaxis, vEye), v_set1(-1.0f));
      f32x4 t_z = v_mul(v_dot3(zaxis, vEye), v_set1(-1.0f));

      f32x4x4 view;
      view.r0 = v_insert_w(xaxis, t_x);
      view.r1 = v_insert_w(yaxis, t_y);
      view.r2 = v_insert_w(zaxis, t_z);
      view.r3 = v_set(0, 0, 0, 1.0f);

      m_store(m, view);
   }

   // Perspective projection; the coefficients match the classic
   // OpenGL-style matrix (right-handed, clip z in [-1, 1]).
   // fovY is the full vertical field of view in radians.
   inline void mat44_perspective_impl(float* m, float fovY, float aspect, float znear, float zfar)
   {
      float f = 1.0f / tanf(fovY * 0.5f);
      float nf = 1.0f / (znear - zfar);

      f32x4x4 mp = m_zero();
      mp.r0 = v_set(f / aspect, 0, 0, 0);
      mp.r1 = v_set(0, f, 0, 0);
      mp.r2 = v_set(0, 0, (zfar + znear) * nf, 2 * zfar * znear * nf);
      mp.r3 = v_set(0, 0, -1, 0); // row-major projection
      m_store(m, mp);
   }

   // Orthographic projection; coefficients match the classic
   // glOrtho matrix (clip z in [-1, 1]).
   inline void mat44_orthographic_impl(float* m, float left, float right, float bottom, float top, float znear, float zfar)
   {
      f32x4x4 mo = m_zero();
      mo.r0 = v_set(2.0f / (right - left), 0, 0, -(right + left) / (right - left));
      mo.r1 = v_set(0, 2.0f / (top - bottom), 0, -(top + bottom) / (top - bottom));
      mo.r2 = v_set(0, 0, -2.0f / (zfar - znear), -(zfar + znear) / (zfar - znear));
      mo.r3 = v_set(0, 0, 0, 1.0f);
      m_store(m, mo);
   }

   //--------------------------------------------------
   // MATRIX BATCH FUNCTIONS
   //--------------------------------------------------

   // Transform `count` packed xyz points (stride 3 floats) by m.
   // Processes 8 at a time on AVX, then 4 at a time, then a scalar tail.
   inline void mat44_batch_mul_pos3(const float* m, const float* points, int count, float* result)
   {
      int i = 0;
      // AVX has 8 lanes to play with
#if defined(MATH_SIMD_AVX2) || defined(MATH_SIMD_AVX)
      // 8-wide AVX only
      for (; i + 8 <= count; i += 8)
      {
         vec4_batch8 va = load_vec3_batch8(&points[i*3], 1.0f, false);
         vec4_batch8 vr = m_mul_pos3_batch8(m, va);
         store_vec3_batch8(&result[i*3], vr);
      }
#endif // MATH_SIMD_AVX2 || MATH_SIMD_AVX

      // 4-wide
      for (; i + 4 <= count; i += 4)
      {
         vec4_batch4 va = load_vec3_batch4(&points[i * 3], 1.0f, false);
         vec4_batch4 vr = m_mul_pos3_batch4(m, va);
         store_vec3_batch4(&result[i * 3], vr);
      }

      // scalar tail
      for (; i < count; ++i)
      {
         size_t idx = i * 3;
         mat44_mul_pos3_impl(m, &points[idx], &result[idx]);
      }
   }

} // namespace math_backend::mat44
|
||||
500
Engine/source/math/impl/math_c.cpp
Normal file
500
Engine/source/math/impl/math_c.cpp
Normal file
|
|
@ -0,0 +1,500 @@
|
|||
#include "platform/platform.h"
|
||||
#include "math/public/float4_dispatch.h"
|
||||
#include "math/public/float3_dispatch.h"
|
||||
#include "math/public/mat44_dispatch.h"
|
||||
#include "math/mConstants.h"
|
||||
#include "math/mMatrix.h"
|
||||
#include <cmath> // for sqrtf, etc.
|
||||
|
||||
namespace math_backend::float4::dispatch
|
||||
{
|
||||
void install_scalar()
|
||||
{
|
||||
gFloat4.add = [](const float* a, const float* b, float* r) {
|
||||
for (int i = 0; i < 4; i++) r[i] = a[i] + b[i];
|
||||
};
|
||||
|
||||
gFloat4.sub = [](const float* a, const float* b, float* r) {
|
||||
for (int i = 0; i < 4; i++) r[i] = a[i] - b[i];
|
||||
};
|
||||
|
||||
gFloat4.mul = [](const float* a, const float* b, float* r) {
|
||||
for (int i = 0; i < 4; i++) r[i] = a[i] * b[i];
|
||||
};
|
||||
|
||||
gFloat4.mul_scalar = [](const float* a, float s, float* r) {
|
||||
for (int i = 0; i < 4; i++) r[i] = a[i] * s;
|
||||
};
|
||||
|
||||
gFloat4.div = [](const float* a, const float* b, float* r) {
|
||||
for (int i = 0; i < 4; i++) r[i] = a[i] / b[i];
|
||||
};
|
||||
|
||||
gFloat4.div_scalar = [](const float* a, float s, float* r) {
|
||||
float denom = 1.0f / s;
|
||||
for (int i = 0; i < 4; i++) r[i] = a[i] * denom;
|
||||
};
|
||||
|
||||
gFloat4.dot = [](const float* a, const float* b) {
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < 4; i++) sum += a[i] * b[i];
|
||||
return sum;
|
||||
};
|
||||
|
||||
gFloat4.length = [](const float* a) {
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < 4; i++) sum += a[i] * a[i];
|
||||
return std::sqrt(sum);
|
||||
};
|
||||
|
||||
gFloat4.lengthSquared = [](const float* a) {
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < 4; i++) sum += a[i] * a[i];
|
||||
return (sum);
|
||||
};
|
||||
|
||||
gFloat4.normalize = [](float* a) {
|
||||
float len = gFloat4.length(a);
|
||||
if (len > POINT_EPSILON)
|
||||
{
|
||||
float denom = 1.0f / len;
|
||||
for (int i = 0; i < 4; i++)
|
||||
a[i] *= denom;
|
||||
}
|
||||
};
|
||||
|
||||
gFloat4.normalize_mag = [](float* a, float f) {
|
||||
float len = gFloat4.length(a);
|
||||
if (len > POINT_EPSILON)
|
||||
{
|
||||
float denom = f / len;
|
||||
for (int i = 0; i < 4; i++) a[i] *= denom;
|
||||
}
|
||||
};
|
||||
|
||||
gFloat4.lerp = [](const float* from, const float* to, float f, float* r) {
|
||||
for (int i = 0; i < 4; i++) r[i] = from[i] + (to[i] - from[i]) * f;
|
||||
};
|
||||
|
||||
gFloat4.cross = [](const float* a, const float* b, float* r) {
|
||||
const float ax = a[0];
|
||||
const float ay = a[1];
|
||||
const float az = a[2];
|
||||
|
||||
const float bx = b[0];
|
||||
const float by = b[1];
|
||||
const float bz = b[2];
|
||||
|
||||
r[0] = ay * bz - az * by;
|
||||
r[1] = az * bx - ax * bz;
|
||||
r[2] = ax * by - ay * bx;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
namespace math_backend::float3::dispatch
|
||||
{
|
||||
void install_scalar()
|
||||
{
|
||||
gFloat3.add = [](const float* a, const float* b, float* r) {
|
||||
for (int i = 0; i < 3; i++) r[i] = a[i] + b[i];
|
||||
};
|
||||
|
||||
gFloat3.sub = [](const float* a, const float* b, float* r) {
|
||||
for (int i = 0; i < 3; i++) r[i] = a[i] - b[i];
|
||||
};
|
||||
|
||||
gFloat3.mul = [](const float* a, const float* b, float* r) {
|
||||
for (int i = 0; i < 3; i++) r[i] = a[i] * b[i];
|
||||
};
|
||||
|
||||
gFloat3.mul_scalar = [](const float* a, float s, float* r) {
|
||||
for (int i = 0; i < 3; i++) r[i] = a[i] * s;
|
||||
};
|
||||
|
||||
gFloat3.div = [](const float* a, const float* b, float* r) {
|
||||
for (int i = 0; i < 3; i++) r[i] = a[i] / b[i];
|
||||
};
|
||||
|
||||
gFloat3.div_scalar = [](const float* a, float s, float* r) {
|
||||
float denom = 1.0f / s;
|
||||
for (int i = 0; i < 3; i++) r[i] = a[i] * denom;
|
||||
};
|
||||
|
||||
gFloat3.dot = [](const float* a, const float* b) {
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < 3; i++) sum += a[i] * b[i];
|
||||
return sum;
|
||||
};
|
||||
|
||||
gFloat3.length = [](const float* a) {
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < 3; i++) sum += a[i] * a[i];
|
||||
return std::sqrt(sum);
|
||||
};
|
||||
|
||||
gFloat3.lengthSquared = [](const float* a) {
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < 3; i++) sum += a[i] * a[i];
|
||||
return (sum);
|
||||
};
|
||||
|
||||
gFloat3.normalize = [](float* a) {
|
||||
float len = gFloat3.length(a);
|
||||
if (len > POINT_EPSILON)
|
||||
{
|
||||
float denom = 1.0 / len;
|
||||
for (int i = 0; i < 3; i++) a[i] *= denom;
|
||||
}
|
||||
};
|
||||
|
||||
gFloat3.normalize_mag = [](float* a, float f) {
|
||||
float len = gFloat3.length(a);
|
||||
if (len > POINT_EPSILON)
|
||||
{
|
||||
float denom = f / len;
|
||||
for (int i = 0; i < 3; i++) a[i] *= denom;
|
||||
}
|
||||
};
|
||||
|
||||
gFloat3.lerp = [](const float* from, const float* to, float f, float* r) {
|
||||
for (int i = 0; i < 3; i++) r[i] = from[i] + (to[i] - from[i]) * f;
|
||||
};
|
||||
|
||||
gFloat3.cross = [](const float* a, const float* b, float* r) {
|
||||
const float ax = a[0];
|
||||
const float ay = a[1];
|
||||
const float az = a[2];
|
||||
|
||||
const float bx = b[0];
|
||||
const float by = b[1];
|
||||
const float bz = b[2];
|
||||
|
||||
r[0] = ay * bz - az * by;
|
||||
r[1] = az * bx - ax * bz;
|
||||
r[2] = ax * by - ay * bx;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// File-local helper: exchange two floats.
// Marked static to give it internal linkage -- a non-static inline
// 'swap' in a .cpp risks confusion/ambiguity with std::swap and other
// translation units' helpers.
static inline void swap(float& a, float& b)
{
   float temp = a;
   a = b;
   b = temp;
}
|
||||
|
||||
|
||||
namespace math_backend::mat44::dispatch
|
||||
{
|
||||
void install_scalar()
|
||||
{
|
||||
gMat44.transpose = [](float* a) {
|
||||
swap(a[1], a[4]);
|
||||
swap(a[2], a[8]);
|
||||
swap(a[3], a[12]);
|
||||
swap(a[6], a[9]);
|
||||
swap(a[7], a[13]);
|
||||
swap(a[11], a[14]);
|
||||
};
|
||||
|
||||
gMat44.determinant = [](const float* m) {
|
||||
return m[0] * (m[5] * m[10] - m[6] * m[9]) +
|
||||
m[4] * (m[2] * m[9] - m[1] * m[10]) +
|
||||
m[8] * (m[1] * m[6] - m[2] * m[5]);
|
||||
};
|
||||
|
||||
gMat44.mul_vec3 = [](const float* a, const float* b, float* r) {
|
||||
#ifdef TORQUE_COMPILER_GCC
|
||||
const F32 v0 = b[0], v1 = b[1], v2 = b[2];
|
||||
const F32 m0 = a[0], m1 = a[1], m2 = a[2];
|
||||
const F32 m4 = a[4], m5 = a[5], m6 = a[6];
|
||||
const F32 m8 = a[8], m9 = a[9], m10 = a[10];
|
||||
|
||||
r[0] = m0 * v0 + m1 * v1 + m2 * v2;
|
||||
r[1] = m4 * v0 + m5 * v1 + m6 * v2;
|
||||
r[2] = m8 * v0 + m9 * v1 + m10 * v2;
|
||||
#else
|
||||
r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
|
||||
r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2];
|
||||
r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2];
|
||||
#endif
|
||||
|
||||
};
|
||||
|
||||
// In-place inverse of an affine row-major 4x4 (rotation/scale block at
// [0..2,4..6,8..10], translation at indices 3/7/11).
// Uses Cramer's rule on the 3x3 block: Minv = (1/det(M)) * adjoint(M),
// then rotates the negated translation through the inverted block.
gMat44.inverse = [](float* m) {
float det = gMat44.determinant(m);
// BUGFIX: message previously said "non-singular matrix, no inverse";
// the assert fires precisely when the matrix IS singular.
AssertFatal(det != 0.0f, "MatrixF::inverse: singular matrix, no inverse.");
float invDet = 1.0f / det;
float temp[16];

// Adjoint of the 3x3 block, scaled by 1/det.
temp[0] = (m[5] * m[10] - m[6] * m[9]) * invDet;
temp[1] = (m[9] * m[2] - m[10] * m[1]) * invDet;
temp[2] = (m[1] * m[6] - m[2] * m[5]) * invDet;

temp[4] = (m[6] * m[8] - m[4] * m[10]) * invDet;
temp[5] = (m[10] * m[0] - m[8] * m[2]) * invDet;
temp[6] = (m[2] * m[4] - m[0] * m[6]) * invDet;

temp[8] = (m[4] * m[9] - m[5] * m[8]) * invDet;
temp[9] = (m[8] * m[1] - m[9] * m[0]) * invDet;
temp[10] = (m[0] * m[5] - m[1] * m[4]) * invDet;

// Commit the inverted rotation block before transforming the translation.
m[0] = temp[0];
m[1] = temp[1];
m[2] = temp[2];

m[4] = temp[4];
m[5] = temp[5];
m[6] = temp[6];

m[8] = temp[8];
m[9] = temp[9];
m[10] = temp[10];

// invert the translation: t' = Rinv * (-t)
temp[0] = -m[3];
temp[1] = -m[7];
temp[2] = -m[11];
gMat44.mul_vec3(m, temp, &temp[4]);
m[3] = temp[4];
m[7] = temp[5];
m[11] = temp[6];

};
|
||||
|
||||
// Out-of-place inverse of an affine row-major 4x4: writes the inverse of m
// into d (m is untouched). Same Cramer's-rule scheme as gMat44.inverse.
gMat44.inverse_to = [](const float* m, float* d) {
// using Cramers Rule find the Inverse
// Minv = (1/det(M)) * adjoint(M)
F32 det = gMat44.determinant(m);
// NOTE(review): message wording is inverted -- the assert fires when the
// matrix IS singular, not "non-singular".
AssertFatal(det != 0.0f, "MatrixF::inverse: non-singular matrix, no inverse.");

F32 invDet = 1.0f / det;

// Adjoint of the 3x3 rotation/scale block, scaled by 1/det.
d[0] = (m[5] * m[10] - m[6] * m[9]) * invDet;
d[1] = (m[9] * m[2] - m[10] * m[1]) * invDet;
d[2] = (m[1] * m[6] - m[2] * m[5]) * invDet;

d[4] = (m[6] * m[8] - m[4] * m[10]) * invDet;
d[5] = (m[10] * m[0] - m[8] * m[2]) * invDet;
d[6] = (m[2] * m[4] - m[0] * m[6]) * invDet;

d[8] = (m[4] * m[9] - m[5] * m[8]) * invDet;
d[9] = (m[8] * m[1] - m[9] * m[0]) * invDet;
d[10] = (m[0] * m[5] - m[1] * m[4]) * invDet;

// invert the translation: d's translation column = Rinv * (-t).
// temp[0..2] holds -t, temp[3..5] receives the rotated result.
F32 temp[6];
temp[0] = -m[3];
temp[1] = -m[7];
temp[2] = -m[11];
gMat44.mul_vec3(d, temp, &temp[3]);
d[3] = temp[3];
d[7] = temp[4];
d[11] = temp[5];
// Bottom row is copied through unchanged (expected to be 0,0,0,1 for an
// affine matrix).
d[12] = m[12];
d[13] = m[13];
d[14] = m[14];
d[15] = m[15];
};
|
||||
|
||||
// In-place inverse of a rigid transform: transposes the 3x3 block (valid
// only when the rotation part is orthonormal -- assumes no scale/shear,
// TODO confirm callers guarantee this) and rebuilds the translation as
// -R^T * t. Works from a snapshot copy so the in-place writes are safe.
gMat44.affine_inverse = [](float* a) {
F32 temp[16];
dMemcpy(temp, a, 16 * sizeof(F32));

// Transpose rotation (swap the off-diagonal pairs of the 3x3 block).
a[1] = temp[4];
a[4] = temp[1];
a[2] = temp[8];
a[8] = temp[2];
a[6] = temp[9];
a[9] = temp[6];

// New translation column = -(R^T * t), computed from the snapshot.
a[3] = -(temp[0] * temp[3] + temp[4] * temp[7] + temp[8] * temp[11]);
a[7] = -(temp[1] * temp[3] + temp[5] * temp[7] + temp[9] * temp[11]);
a[11] = -(temp[2] * temp[3] + temp[6] * temp[7] + temp[10] * temp[11]);
};
|
||||
|
||||
gMat44.scale = [](float* a, const float* s) {
// Multiply the x/y/z columns of every row by the per-axis scale.
// The w column (indices 3/7/11/15) is intentionally left untouched.
for (int row = 0; row < 16; row += 4)
{
a[row + 0] *= s[0];
a[row + 1] *= s[1];
a[row + 2] *= s[2];
}
};
|
||||
|
||||
gMat44.get_scale = [](const float* a, float* s) {
// The per-axis scale is the Euclidean length of each column of the
// 3x3 block; the w row/column is ignored.
for (int axis = 0; axis < 3; ++axis)
{
const F32 c0 = a[axis];
const F32 c1 = a[axis + 4];
const F32 c2 = a[axis + 8];
s[axis] = sqrt(c0 * c0 + c1 * c1 + c2 * c2);
}
};
|
||||
|
||||
// r = a * b for a row-major 4x4 matrix 'a' and a 4-component vector 'b'.
// 'b' and 'r' must not alias.
gMat44.mul_float4 = [](const float* a, const float* b, float* r) {
AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!");
r[0] = a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
r[1] = a[4] * b[0] + a[5] * b[1] + a[6] * b[2] + a[7] * b[3];
r[2] = a[8] * b[0] + a[9] * b[1] + a[10] * b[2] + a[11] * b[3];
// BUGFIX: the last row previously overwrote r[2] instead of writing r[3],
// leaving r[3] uninitialized and r[2] holding the wrong row's result.
r[3] = a[12] * b[0] + a[13] * b[1] + a[14] * b[2] + a[15] * b[3];
};
|
||||
|
||||
gMat44.mul_pos3 = [](const float* a, const float* b, float* r) {
// Transform a point: rotate by the 3x3 block, then add the translation
// column (indices 3/7/11). 'b' and 'r' must not alias.
AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!");
const float x = b[0];
const float y = b[1];
const float z = b[2];
r[0] = a[0] * x + a[1] * y + a[2] * z + a[3];
r[1] = a[4] * x + a[5] * y + a[6] * z + a[7];
r[2] = a[8] * x + a[9] * y + a[10] * z + a[11];
};
|
||||
|
||||
gMat44.mul_vec3 = [](const float* a, const float* b, float* r) {
// Rotate a direction vector by the 3x3 block; translation is ignored.
// 'b' and 'r' must not alias.
AssertFatal(b != r, "Error, aliasing matrix mul pointers not allowed here!");
const float x = b[0];
const float y = b[1];
const float z = b[2];
r[0] = a[0] * x + a[1] * y + a[2] * z;
r[1] = a[4] * x + a[5] * y + a[6] * z;
r[2] = a[8] * x + a[9] * y + a[10] * z;
};
|
||||
|
||||
// Row-major 4x4 product: mresult = a * b, fully unrolled.
// NOTE(review): unlike mul_float4/mul_vec3 there is no aliasing assert
// here -- mresult must not alias a or b; confirm callers guarantee this.
gMat44.mul_mat44 = [](const float* a, const float* b, float* mresult) {
mresult[0] = a[0]*b[0] + a[1]*b[4] + a[2]*b[8] + a[3]*b[12];
mresult[1] = a[0]*b[1] + a[1]*b[5] + a[2]*b[9] + a[3]*b[13];
mresult[2] = a[0]*b[2] + a[1]*b[6] + a[2]*b[10] + a[3]*b[14];
mresult[3] = a[0]*b[3] + a[1]*b[7] + a[2]*b[11] + a[3]*b[15];

mresult[4] = a[4]*b[0] + a[5]*b[4] + a[6]*b[8] + a[7]*b[12];
mresult[5] = a[4]*b[1] + a[5]*b[5] + a[6]*b[9] + a[7]*b[13];
mresult[6] = a[4]*b[2] + a[5]*b[6] + a[6]*b[10] + a[7]*b[14];
mresult[7] = a[4]*b[3] + a[5]*b[7] + a[6]*b[11] + a[7]*b[15];

mresult[8] = a[8]*b[0] + a[9]*b[4] + a[10]*b[8] + a[11]*b[12];
mresult[9] = a[8]*b[1] + a[9]*b[5] + a[10]*b[9] + a[11]*b[13];
mresult[10]= a[8]*b[2] + a[9]*b[6] + a[10]*b[10]+ a[11]*b[14];
mresult[11]= a[8]*b[3] + a[9]*b[7] + a[10]*b[11]+ a[11]*b[15];

mresult[12]= a[12]*b[0]+ a[13]*b[4]+ a[14]*b[8] + a[15]*b[12];
mresult[13]= a[12]*b[1]+ a[13]*b[5]+ a[14]*b[9] + a[15]*b[13];
mresult[14]= a[12]*b[2]+ a[13]*b[6]+ a[14]*b[10]+ a[15]*b[14];
mresult[15]= a[12]*b[3]+ a[13]*b[7]+ a[14]*b[11]+ a[15]*b[15];
};
|
||||
|
||||
// Transform a plane equation p (normal + d) by matrix m combined with a
// per-axis scale s, writing the resulting plane into presult.
// Normals must be transformed by the inverse-transpose of the full
// transform; the derivation below builds that matrix explicitly.
// NOTE(review): unlike the other gMat44 entries this path round-trips
// through MatrixF/Point3F/PlaneF helpers rather than raw float math --
// confirm invTrMatrix.mul / mulP do not re-enter this dispatch table.
gMat44.transform_plane = [](const F32* m, const F32* s, const F32* p, F32* presult) {
// We take in a matrix, a scale factor, and a plane equation. We want to output
// the resultant normal
// We have T = m*s
// To multiply the normal, we want Inv(Tr(m*s))
// Inv(Tr(ms)) = Inv(Tr(s) * Tr(m))
// = Inv(Tr(m)) * Inv(Tr(s))
//
// Inv(Tr(s)) = Inv(s) = [ 1/x 0 0 0]
// [ 0 1/y 0 0]
// [ 0 0 1/z 0]
// [ 0 0 0 1]
//
// Since m is an affine matrix,
// Tr(m) = [ [ ] 0 ]
// [ [ R ] 0 ]
// [ [ ] 0 ]
// [ [ x y z ] 1 ]
//
// Inv(Tr(m)) = [ [ -1 ] 0 ]
// [ [ R ] 0 ]
// [ [ ] 0 ]
// [ [ A B C ] 1 ]
// Where:
//
// P = (x, y, z)
// A = -(Row(0, r) * P);
// B = -(Row(1, r) * P);
// C = -(Row(2, r) * P);

// Inverse of the diagonal scale matrix.
MatrixF invScale(true);
F32* pScaleElems = invScale;
pScaleElems[MatrixF::idx(0, 0)] = 1.0f / s[0];
pScaleElems[MatrixF::idx(1, 1)] = 1.0f / s[1];
pScaleElems[MatrixF::idx(2, 2)] = 1.0f / s[2];

const Point3F shear(m[MatrixF::idx(3, 0)], m[MatrixF::idx(3, 1)], m[MatrixF::idx(3, 2)]);

const Point3F row0(m[MatrixF::idx(0, 0)], m[MatrixF::idx(0, 1)], m[MatrixF::idx(0, 2)]);
const Point3F row1(m[MatrixF::idx(1, 0)], m[MatrixF::idx(1, 1)], m[MatrixF::idx(1, 2)]);
const Point3F row2(m[MatrixF::idx(2, 0)], m[MatrixF::idx(2, 1)], m[MatrixF::idx(2, 2)]);

const F32 A = -mDot(row0, shear);
const F32 B = -mDot(row1, shear);
const F32 C = -mDot(row2, shear);

// Assemble Inv(Tr(m)) per the derivation above, then fold in Inv(s).
MatrixF invTrMatrix(true);
F32* destMat = invTrMatrix;
destMat[MatrixF::idx(0, 0)] = m[MatrixF::idx(0, 0)];
destMat[MatrixF::idx(1, 0)] = m[MatrixF::idx(1, 0)];
destMat[MatrixF::idx(2, 0)] = m[MatrixF::idx(2, 0)];
destMat[MatrixF::idx(0, 1)] = m[MatrixF::idx(0, 1)];
destMat[MatrixF::idx(1, 1)] = m[MatrixF::idx(1, 1)];
destMat[MatrixF::idx(2, 1)] = m[MatrixF::idx(2, 1)];
destMat[MatrixF::idx(0, 2)] = m[MatrixF::idx(0, 2)];
destMat[MatrixF::idx(1, 2)] = m[MatrixF::idx(1, 2)];
destMat[MatrixF::idx(2, 2)] = m[MatrixF::idx(2, 2)];
destMat[MatrixF::idx(0, 3)] = A;
destMat[MatrixF::idx(1, 3)] = B;
destMat[MatrixF::idx(2, 3)] = C;
invTrMatrix.mul(invScale);

// Transform the normal by the inverse-transpose and re-normalize.
Point3F norm(p[0], p[1], p[2]);
Point3F point = norm * -p[3];
invTrMatrix.mulP(norm);
norm.normalize();

// Transform a point on the plane by the ordinary (scaled) transform.
MatrixF temp;
dMemcpy(temp, m, sizeof(F32) * 16);
point.x *= s[0];
point.y *= s[1];
point.z *= s[2];
temp.mulP(point);

// Rebuild the plane from the transformed point and normal.
PlaneF resultPlane(point, norm);
presult[0] = resultPlane.x;
presult[1] = resultPlane.y;
presult[2] = resultPlane.z;
presult[3] = resultPlane.d;

};
|
||||
|
||||
// Re-orthonormalize the upper-left 3x3 block of a row-major matrix:
// columns 0 and 1 are taken from the matrix, column 2 is rebuilt as
// their cross product, and all three are normalized and stored back.
gMat44.normalize = [](float* a) {
F32 col0[3], col1[3], col2[3];
// extract columns 0 and 1
col0[0] = a[0];
col0[1] = a[4];
col0[2] = a[8];

col1[0] = a[1];
col1[1] = a[5];
col1[2] = a[9];

// BUGFIX: col2 was previously passed to normalize() without ever being
// initialized (an uninitialized read). Rebuild it as col0 x col1 so the
// basis stays orthogonal and right-handed.
col2[0] = col0[1] * col1[2] - col0[2] * col1[1];
col2[1] = col0[2] * col1[0] - col0[0] * col1[2];
col2[2] = col0[0] * col1[1] - col0[1] * col1[0];

math_backend::float3::dispatch::gFloat3.normalize(col0);
math_backend::float3::dispatch::gFloat3.normalize(col1);
math_backend::float3::dispatch::gFloat3.normalize(col2);

// store the normalized columns
a[0] = col0[0];
a[4] = col0[1];
a[8] = col0[2];

a[1] = col1[0];
a[5] = col1[1];
a[9] = col1[2];

a[2] = col2[0];
a[6] = col2[1];
a[10] = col2[2];

};
|
||||
|
||||
gMat44.batch_mul_pos3 = [](const float* m, const float* pts, int count, float* result_ptrs) {
// Scalar fallback: transform each packed xyz triple one at a time.
for (int i = 0; i < count; ++i)
{
const int offset = i * 3;
gMat44.mul_pos3(m, pts + offset, result_ptrs + offset);
}
};
|
||||
}
|
||||
}
|
||||
625
Engine/source/math/isa/avx/avx_intrinsics.h
Normal file
625
Engine/source/math/isa/avx/avx_intrinsics.h
Normal file
|
|
@ -0,0 +1,625 @@
|
|||
#pragma once
|
||||
#include <immintrin.h> // AVX
|
||||
|
||||
namespace
|
||||
{
|
||||
typedef __m128 f32x4;
|
||||
|
||||
//------------------------------------------------------
|
||||
// Load / Store
|
||||
//------------------------------------------------------
|
||||
|
||||
// Load 4 floats from memory into a SIMD register
|
||||
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
|
||||
|
||||
inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
|
||||
|
||||
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
|
||||
|
||||
inline f32x4 v_zero() { return _mm_setzero_ps(); }
|
||||
|
||||
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
|
||||
|
||||
//------------------------------------------------------
|
||||
// Mask helpers
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); }
|
||||
|
||||
inline f32x4 v_swizzle_mask(f32x4 v1, const int x, const int y, const int z, const int w)
|
||||
{
|
||||
__m128i idx = _mm_set_epi32(w, z, y, x);
|
||||
return _mm_permutevar_ps(v1, idx);
|
||||
}
|
||||
|
||||
inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x)
|
||||
{
|
||||
__m128i idx = _mm_set1_epi32(x);
|
||||
return _mm_permutevar_ps(v1, idx);
|
||||
}
|
||||
|
||||
inline f32x4 v_swizzle_lo(f32x4 v1)
|
||||
{
|
||||
return _mm_moveldup_ps(v1);
|
||||
}
|
||||
|
||||
inline f32x4 v_swizzle_hi(f32x4 v1)
|
||||
{
|
||||
return _mm_movehdup_ps(v1);
|
||||
}
|
||||
|
||||
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
|
||||
{
|
||||
return _mm_blend_ps(newv, original, 0b1000);
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Shuffle helpers
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
|
||||
{
|
||||
f32x4 lo = v1;
|
||||
f32x4 hi = v2;
|
||||
|
||||
f32x4 combined_lo = _mm_permutevar_ps(lo, _mm_set_epi32(y, y, x, x));
|
||||
f32x4 combined_hi = _mm_permutevar_ps(hi, _mm_set_epi32(w, w, z, z));
|
||||
|
||||
return _mm_movelh_ps(combined_lo, combined_hi);
|
||||
}
|
||||
|
||||
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2)
|
||||
{
|
||||
return _mm_movelh_ps(v1, v2);
|
||||
}
|
||||
|
||||
inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2)
|
||||
{
|
||||
return _mm_movehl_ps(v2, v1);
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Float3 helpers (safe loading into 4 lanes)
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4 v_load3_vec(const float* p) // w = 0
|
||||
{
|
||||
return _mm_set_ps(0.0f, p[2], p[1], p[0]);
|
||||
}
|
||||
|
||||
inline f32x4 v_load3_pos(const float* p) // w = 1
|
||||
{
|
||||
return _mm_set_ps(1.0f, p[2], p[1], p[0]);
|
||||
}
|
||||
|
||||
inline void v_store3(float* dst, f32x4 v)
|
||||
{
|
||||
alignas(16) float tmp[4]; // temp storage
|
||||
_mm_store_ps(tmp, v); // store all 4 lanes
|
||||
dst[0] = tmp[0];
|
||||
dst[1] = tmp[1];
|
||||
dst[2] = tmp[2];
|
||||
}
|
||||
|
||||
inline f32x4 v_set(float x, float y, float z, float w)
|
||||
{
|
||||
return _mm_set_ps(w, z, y, x);
|
||||
}
|
||||
|
||||
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
|
||||
{
|
||||
return _mm_insert_ps(v, w, 0x30);
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Simple Arithmatic
|
||||
//------------------------------------------------------
|
||||
|
||||
// Element-wise multiply
|
||||
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
|
||||
|
||||
// Element-wise divide
|
||||
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
|
||||
|
||||
// Element-wise add
|
||||
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
|
||||
|
||||
// Element-wise subtract
|
||||
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
|
||||
|
||||
//------------------------------------------------------
|
||||
// Fast recip
|
||||
//------------------------------------------------------
|
||||
|
||||
// Fast recip 1/b
|
||||
// Fast approximate reciprocal 1/b, refined with two Newton-Raphson
// iterations: r' = 2r - b*r*r. (__m128 is the same type as the file's
// f32x4 alias.)
inline __m128 v_rcp_nr(__m128 b)
{
   __m128 r = _mm_rcp_ps(b);
   // Removed an unused 'two' constant -- 2r is formed as r + r below.
   r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
   return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
}
|
||||
|
||||
// Divide fast ( b = recip eg 1/b)
|
||||
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
|
||||
|
||||
inline f32x4 v_rsqrt_nr(f32x4 x)
|
||||
{
|
||||
f32x4 half = _mm_set1_ps(0.5f);
|
||||
f32x4 three = _mm_set1_ps(3.0f);
|
||||
|
||||
f32x4 r = _mm_rsqrt_ps(x);
|
||||
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
|
||||
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Vector intrinsic functions
|
||||
//------------------------------------------------------
|
||||
|
||||
// full dot4
|
||||
inline f32x4 v_dot4(f32x4 a, f32x4 b)
|
||||
{
|
||||
return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes
|
||||
}
|
||||
|
||||
// dot3 (ignores w)
|
||||
inline f32x4 v_dot3(f32x4 a, f32x4 b)
|
||||
{
|
||||
return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into all lanes
|
||||
}
|
||||
|
||||
// cross product xyz only.
|
||||
inline f32x4 v_cross(f32x4 a, f32x4 b)
|
||||
{
|
||||
f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
|
||||
f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
|
||||
|
||||
f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
|
||||
|
||||
return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
|
||||
}
|
||||
|
||||
inline f32x4 v_normalize3(f32x4 v)
|
||||
{
|
||||
const f32x4 zero = _mm_setzero_ps();
|
||||
const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
|
||||
f32x4 dot = v_dot3(v, v);
|
||||
|
||||
f32x4 inv = v_rsqrt_nr(dot);
|
||||
f32x4 isZero = _mm_cmpeq_ps(dot, zero);
|
||||
f32x4 norm = _mm_mul_ps(v, inv);
|
||||
|
||||
return _mm_blendv_ps(norm, fallback, isZero);
|
||||
}
|
||||
|
||||
// adds all 4 lanes together.
|
||||
inline f32x4 v_hadd4(f32x4 a)
|
||||
{
|
||||
// sum all 4 lanes in SSE41
|
||||
__m128 sum = _mm_hadd_ps(a, a);
|
||||
return _mm_hadd_ps(sum, sum);
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix type (row-major 4x4)
|
||||
//------------------------------------------------------
|
||||
|
||||
struct f32x4x4
|
||||
{
|
||||
f32x4 r0;
|
||||
f32x4 r1;
|
||||
f32x4 r2;
|
||||
f32x4 r3;
|
||||
};
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix Load / Store
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
|
||||
{
|
||||
f32x4x4 out;
|
||||
out.r0 = v_load(m + 0);
|
||||
out.r1 = v_load(m + 4);
|
||||
out.r2 = v_load(m + 8);
|
||||
out.r3 = v_load(m + 12);
|
||||
return out;
|
||||
}
|
||||
|
||||
inline void m_store(float* dst, const f32x4x4& m)
|
||||
{
|
||||
v_store(dst + 0, m.r0);
|
||||
v_store(dst + 4, m.r1);
|
||||
v_store(dst + 8, m.r2);
|
||||
v_store(dst + 12, m.r3);
|
||||
}
|
||||
|
||||
inline f32x4x4 m_identity()
|
||||
{
|
||||
f32x4x4 m;
|
||||
m.r0 = _mm_set_ps(0, 0, 0, 1);
|
||||
m.r1 = _mm_set_ps(0, 0, 1, 0);
|
||||
m.r2 = _mm_set_ps(0, 1, 0, 0);
|
||||
m.r3 = _mm_set_ps(1, 0, 0, 0);
|
||||
return m;
|
||||
}
|
||||
|
||||
inline f32x4x4 m_zero()
|
||||
{
|
||||
f32x4 z = v_zero();
|
||||
return { z, z, z, z };
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix × Vector
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
|
||||
{
|
||||
f32x4 x = v_dot4(m.r0, v);
|
||||
f32x4 y = v_dot4(m.r1, v);
|
||||
f32x4 z = v_dot4(m.r2, v);
|
||||
f32x4 w = v_dot4(m.r3, v);
|
||||
|
||||
// combine to a vector
|
||||
return _mm_set_ps(_mm_cvtss_f32(w),
|
||||
_mm_cvtss_f32(z),
|
||||
_mm_cvtss_f32(y),
|
||||
_mm_cvtss_f32(x));
|
||||
}
|
||||
|
||||
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
|
||||
{
|
||||
f32x4 x = v_dot3(m.r0, v);
|
||||
f32x4 y = v_dot3(m.r1, v);
|
||||
f32x4 z = v_dot3(m.r2, v);
|
||||
|
||||
// combine to a vector
|
||||
return _mm_set_ps(0.0f,
|
||||
_mm_cvtss_f32(z),
|
||||
_mm_cvtss_f32(y),
|
||||
_mm_cvtss_f32(x));
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix × Matrix
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
|
||||
{
|
||||
// Transpose B once for dot products
|
||||
f32x4 b0 = b.r0;
|
||||
f32x4 b1 = b.r1;
|
||||
f32x4 b2 = b.r2;
|
||||
f32x4 b3 = b.r3;
|
||||
|
||||
// Transpose (SSE2)
|
||||
_MM_TRANSPOSE4_PS(b0, b1, b2, b3);
|
||||
|
||||
f32x4x4 C;
|
||||
|
||||
auto mul_row = [&](f32x4 arow)
|
||||
{
|
||||
f32x4 x = v_dot4(arow, b0);
|
||||
f32x4 y = v_dot4(arow, b1);
|
||||
f32x4 z = v_dot4(arow, b2);
|
||||
f32x4 w = v_dot4(arow, b3);
|
||||
|
||||
f32x4 xy = _mm_unpacklo_ps(x, y);
|
||||
f32x4 zw = _mm_unpacklo_ps(z, w);
|
||||
return _mm_movelh_ps(xy, zw);
|
||||
};
|
||||
|
||||
C.r0 = mul_row(a.r0);
|
||||
C.r1 = mul_row(a.r1);
|
||||
C.r2 = mul_row(a.r2);
|
||||
C.r3 = mul_row(a.r3);
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Transpose
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4x4 m_transpose(const f32x4x4& m)
|
||||
{
|
||||
f32x4 r0 = m.r0;
|
||||
f32x4 r1 = m.r1;
|
||||
f32x4 r2 = m.r2;
|
||||
f32x4 r3 = m.r3;
|
||||
|
||||
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
|
||||
|
||||
return { r0, r1, r2, r3 };
|
||||
}
|
||||
|
||||
// Inverse of a rigid (rotation + translation, bottom row 0,0,0,1) row-major
// transform: R' = transpose(R), t' = -R' * t.
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
{
   f32x4x4 t = m_transpose(m);

   // Extract translation (the w lane of each rotation row), own w = 0.
   f32x4 T = v_set(
      _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
      _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
      _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
      0.0f
   );

   f32x4 newT = m_mul_vec4(t, T);
   newT = _mm_sub_ps(v_zero(), newT);

   // BUGFIX: each row must receive its own component of the new translation
   // (newT.x / newT.y / newT.z). The old code used v_preserve_w, which
   // blended newT's *w* lane into every row.
   t.r0 = v_insert_w(t.r0, newT);
   t.r1 = v_insert_w(t.r1, v_swizzle_singular_mask(newT, 1));
   t.r2 = v_insert_w(t.r2, v_swizzle_singular_mask(newT, 2));
   t.r3 = v_set(0, 0, 0, 1);

   return t;
}
|
||||
|
||||
// NOTE(review): despite the name, this does NOT compute the general 4x4
// determinant. Because v_cross zeroes the w lane, the horizontal add
// reduces to dot3(a, cross(c,d) + cross(d,b) + cross(b,c)) -- a sum of
// scalar triple products of the xyz parts only. Presumably only valid for
// the affine matrices this engine uses; confirm against callers before
// relying on it for arbitrary matrices. Result is splatted to all lanes.
inline f32x4 m_determinant(const f32x4x4& m)
{
f32x4 a = m.r0;
f32x4 b = m.r1;
f32x4 c = m.r2;
f32x4 d = m.r3;

// Pairwise cross products of the lower three rows.
f32x4 c0 = v_cross(c, d);
f32x4 c1 = v_cross(d, b);
f32x4 c2 = v_cross(b, c);

f32x4 term0 = _mm_mul_ps(a, c0);
f32x4 term1 = _mm_mul_ps(a, c1);
f32x4 term2 = _mm_mul_ps(a, c2);

f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));

// Sum all lanes (cross products have w == 0, so this is a dot3 sum).
return v_hadd4(det);
}
|
||||
|
||||
// Determinant of an affine transform = determinant of the upper-left 3x3
// block, i.e. the scalar triple product r0 . (r1 x r2). Result is splatted
// to all lanes.
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
   // BUGFIX: the old code cleared the w lanes with
   // _mm_and_ps(row, v_mask_xyz()), but v_mask_xyz() holds arithmetic
   // 1.0f/0.0f values, not all-ones bit masks, so the bitwise AND corrupted
   // the xyz lanes. No masking is needed at all: v_cross produces w == 0
   // and v_dot3 only sums the first three lanes.
   f32x4 c0 = v_cross(m.r1, m.r2);
   return v_dot3(m.r0, c0); // splatted determinant
}
|
||||
|
||||
//---------------------------------------------------------
|
||||
// BATCH INTRINSICS
|
||||
//---------------------------------------------------------
|
||||
typedef __m256 f32x8;
|
||||
|
||||
struct vec4_batch8
|
||||
{
|
||||
f32x8 x;
|
||||
f32x8 y;
|
||||
f32x8 z;
|
||||
f32x8 w;
|
||||
};
|
||||
|
||||
struct vec4_batch4
|
||||
{
|
||||
f32x4 x;
|
||||
f32x4 y;
|
||||
f32x4 z;
|
||||
f32x4 w;
|
||||
};
|
||||
|
||||
inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW)
|
||||
{
|
||||
vec4_batch8 r;
|
||||
|
||||
r.x = _mm256_set_ps(ptr[0], ptr[3], ptr[6], ptr[9],
|
||||
ptr[12], ptr[15], ptr[18], ptr[21]);
|
||||
r.y = _mm256_set_ps(ptr[1], ptr[4], ptr[7], ptr[10],
|
||||
ptr[13], ptr[16], ptr[19], ptr[22]);
|
||||
r.z = _mm256_set_ps(ptr[2], ptr[5], ptr[8], ptr[11],
|
||||
ptr[14], ptr[17], ptr[20], ptr[23]);
|
||||
|
||||
if (fillW)
|
||||
{
|
||||
r.w = _mm256_set1_ps(w);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
|
||||
{
|
||||
vec4_batch4 r;
|
||||
|
||||
r.x = _mm_set_ps(ptr[0], ptr[3], ptr[6], ptr[9]);
|
||||
r.y = _mm_set_ps(ptr[1], ptr[4], ptr[7], ptr[10]);
|
||||
r.z = _mm_set_ps(ptr[2], ptr[5], ptr[8], ptr[11]);
|
||||
|
||||
if (fillW)
|
||||
{
|
||||
r.w = _mm_set1_ps(w);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
// Store the result back out to a float array 8 at a time
|
||||
inline void store_f32x8(float* out, f32x8 v)
|
||||
{
|
||||
_mm256_storeu_ps(out, v);
|
||||
}
|
||||
|
||||
// Store the result back out to a float array 4 at a time
|
||||
inline void store_f32x4(float* out, f32x4 v)
|
||||
{
|
||||
_mm_storeu_ps(out, v);
|
||||
}
|
||||
|
||||
// Store the result back to a float3 array with size of 8
|
||||
inline void store_vec3_batch8(float* out, const vec4_batch8& v)
|
||||
{
|
||||
alignas(32) float xs[8];
|
||||
alignas(32) float ys[8];
|
||||
alignas(32) float zs[8];
|
||||
|
||||
_mm256_store_ps(xs, v.x);
|
||||
_mm256_store_ps(ys, v.y);
|
||||
_mm256_store_ps(zs, v.z);
|
||||
|
||||
for (int i = 0; i < 8; ++i)
|
||||
{
|
||||
out[i * 3 + 0] = xs[i];
|
||||
out[i * 3 + 1] = ys[i];
|
||||
out[i * 3 + 2] = zs[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Store the result back to a float3 array with size of 4
|
||||
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
|
||||
{
|
||||
alignas(16) float xs[8];
|
||||
alignas(16) float ys[8];
|
||||
alignas(16) float zs[8];
|
||||
|
||||
_mm_store_ps(xs, v.x);
|
||||
_mm_store_ps(ys, v.y);
|
||||
_mm_store_ps(zs, v.z);
|
||||
|
||||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
out[i * 3 + 0] = xs[i];
|
||||
out[i * 3 + 1] = ys[i];
|
||||
out[i * 3 + 2] = zs[i];
|
||||
}
|
||||
}
|
||||
|
||||
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
|
||||
{
|
||||
f32x4 mulx = _mm_mul_ps(a.x, b.x);
|
||||
f32x4 muly = _mm_mul_ps(a.y, b.y);
|
||||
f32x4 mulz = _mm_mul_ps(a.z, b.z);
|
||||
|
||||
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
|
||||
}
|
||||
|
||||
inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b)
|
||||
{
|
||||
f32x8 mulx = _mm256_mul_ps(a.x, b.x);
|
||||
f32x8 muly = _mm256_mul_ps(a.y, b.y);
|
||||
f32x8 mulz = _mm256_mul_ps(a.z, b.z);
|
||||
|
||||
return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz);
|
||||
}
|
||||
|
||||
// Batch 8 mul_Vec4.
|
||||
inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v)
|
||||
{
|
||||
vec4_batch8 r;
|
||||
|
||||
__m256 m00 = _mm256_set1_ps(m[0]);
|
||||
__m256 m01 = _mm256_set1_ps(m[1]);
|
||||
__m256 m02 = _mm256_set1_ps(m[2]);
|
||||
__m256 m03 = _mm256_set1_ps(m[3]);
|
||||
|
||||
__m256 m10 = _mm256_set1_ps(m[4]);
|
||||
__m256 m11 = _mm256_set1_ps(m[5]);
|
||||
__m256 m12 = _mm256_set1_ps(m[6]);
|
||||
__m256 m13 = _mm256_set1_ps(m[7]);
|
||||
|
||||
__m256 m20 = _mm256_set1_ps(m[8]);
|
||||
__m256 m21 = _mm256_set1_ps(m[9]);
|
||||
__m256 m22 = _mm256_set1_ps(m[10]);
|
||||
__m256 m23 = _mm256_set1_ps(m[11]);
|
||||
|
||||
//
|
||||
// row0 dot
|
||||
//
|
||||
r.x = _mm256_add_ps(
|
||||
_mm256_add_ps(
|
||||
_mm256_mul_ps(v.x, m00),
|
||||
_mm256_mul_ps(v.y, m01)),
|
||||
_mm256_add_ps(
|
||||
_mm256_mul_ps(v.z, m02), m03));
|
||||
|
||||
//
|
||||
// row1 dot
|
||||
//
|
||||
r.y = _mm256_add_ps(
|
||||
_mm256_add_ps(
|
||||
_mm256_mul_ps(v.x, m10),
|
||||
_mm256_mul_ps(v.y, m11)),
|
||||
_mm256_add_ps(
|
||||
_mm256_mul_ps(v.z, m12), m13));
|
||||
|
||||
//
|
||||
// row2 dot
|
||||
//
|
||||
r.z = _mm256_add_ps(
|
||||
_mm256_add_ps(
|
||||
_mm256_mul_ps(v.x, m20),
|
||||
_mm256_mul_ps(v.y, m21)),
|
||||
_mm256_add_ps(
|
||||
_mm256_mul_ps(v.z, m22), m23));
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
// Batch 4 mul_Vec4.
|
||||
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
|
||||
{
|
||||
vec4_batch4 r;
|
||||
|
||||
f32x4 m00 = _mm_set1_ps(m[0]);
|
||||
f32x4 m01 = _mm_set1_ps(m[1]);
|
||||
f32x4 m02 = _mm_set1_ps(m[2]);
|
||||
f32x4 m03 = _mm_set1_ps(m[3]);
|
||||
|
||||
f32x4 m10 = _mm_set1_ps(m[4]);
|
||||
f32x4 m11 = _mm_set1_ps(m[5]);
|
||||
f32x4 m12 = _mm_set1_ps(m[6]);
|
||||
f32x4 m13 = _mm_set1_ps(m[7]);
|
||||
|
||||
f32x4 m20 = _mm_set1_ps(m[8]);
|
||||
f32x4 m21 = _mm_set1_ps(m[9]);
|
||||
f32x4 m22 = _mm_set1_ps(m[10]);
|
||||
f32x4 m23 = _mm_set1_ps(m[11]);
|
||||
|
||||
//
|
||||
// row0 dot
|
||||
//
|
||||
r.x = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.x, m00),
|
||||
_mm_mul_ps(v.y, m01)),
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.z, m02), m03));
|
||||
|
||||
//
|
||||
// row1 dot
|
||||
//
|
||||
r.y = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.x, m10),
|
||||
_mm_mul_ps(v.y, m11)),
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.z, m12), m13));
|
||||
|
||||
//
|
||||
// row2 dot
|
||||
//
|
||||
r.z = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.x, m20),
|
||||
_mm_mul_ps(v.y, m21)),
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.z, m22), m23));
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
}
|
||||
26
Engine/source/math/isa/avx/float3.cpp
Normal file
26
Engine/source/math/isa/avx/float3.cpp
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
#include "avx_intrinsics.h"
|
||||
#include "float3_dispatch.h"
|
||||
#include <immintrin.h> // AVX/AVX2 intrinsics
|
||||
|
||||
#include "float3_impl.inl"
|
||||
|
||||
namespace math_backend::float3::dispatch
|
||||
{
|
||||
// Install AVX backend
|
||||
void install_avx()
|
||||
{
|
||||
gFloat3.add = float3_add_impl;
|
||||
gFloat3.sub = float3_sub_impl;
|
||||
gFloat3.mul = float3_mul_impl;
|
||||
gFloat3.mul_scalar = float3_mul_scalar_impl;
|
||||
gFloat3.div = float3_div_impl;
|
||||
gFloat3.div_scalar = float3_div_scalar_impl;
|
||||
gFloat3.dot = float3_dot_impl;
|
||||
gFloat3.length = float3_length_impl;
|
||||
gFloat3.lengthSquared = float3_length_squared_impl;
|
||||
gFloat3.normalize = float3_normalize_impl;
|
||||
gFloat3.normalize_mag = float3_normalize_mag_impl;
|
||||
gFloat3.lerp = float3_lerp_impl;
|
||||
gFloat3.cross = float3_cross_impl;
|
||||
}
|
||||
}
|
||||
25
Engine/source/math/isa/avx/float4.cpp
Normal file
25
Engine/source/math/isa/avx/float4.cpp
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
#include "avx_intrinsics.h"
|
||||
#include "float4_dispatch.h"
|
||||
|
||||
#include "float4_impl.inl"
|
||||
|
||||
namespace math_backend::float4::dispatch
|
||||
{
|
||||
// Install AVX backend
|
||||
void install_avx()
|
||||
{
|
||||
gFloat4.add = float4_add_impl;
|
||||
gFloat4.sub = float4_sub_impl;
|
||||
gFloat4.mul = float4_mul_impl;
|
||||
gFloat4.mul_scalar = float4_mul_scalar_impl;
|
||||
gFloat4.div = float4_div_impl;
|
||||
gFloat4.div_scalar = float4_div_scalar_impl;
|
||||
gFloat4.dot = float4_dot_impl;
|
||||
gFloat4.length = float4_length_impl;
|
||||
gFloat4.lengthSquared = float4_length_squared_impl;
|
||||
gFloat4.normalize = float4_normalize_impl;
|
||||
gFloat4.normalize_mag = float4_normalize_mag_impl;
|
||||
gFloat4.lerp = float4_lerp_impl;
|
||||
gFloat4.cross = float4_cross_impl;
|
||||
}
|
||||
}
|
||||
26
Engine/source/math/isa/avx/mat44.cpp
Normal file
26
Engine/source/math/isa/avx/mat44.cpp
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
#include "avx_intrinsics.h"
|
||||
#include "mat44_dispatch.h"
|
||||
|
||||
#include "mat44_impl.inl"
|
||||
|
||||
namespace math_backend::mat44::dispatch
|
||||
{
|
||||
void install_avx()
|
||||
{
|
||||
gMat44.transpose = mat44_transpose_impl;
|
||||
gMat44.inverse = mat44_inverse_impl;
|
||||
gMat44.inverse_to = mat44_inverse_to_impl;
|
||||
gMat44.affine_inverse = mat44_affine_inverse_impl;
|
||||
gMat44.mul_mat44 = mat44_mul_mat44_impl;
|
||||
gMat44.mul_pos3 = mat44_mul_pos3_impl;
|
||||
gMat44.mul_vec3 = mat44_mul_vec3_impl;
|
||||
gMat44.mul_float4 = mat44_mul_float4_impl;
|
||||
gMat44.transform_plane = mat44_transform_plane_impl;
|
||||
gMat44.scale = mat44_scale_impl;
|
||||
gMat44.get_scale = mat44_get_scale_impl;
|
||||
gMat44.normalize = mat44_normalize_impl;
|
||||
gMat44.determinant = mat44_get_determinant;
|
||||
|
||||
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
|
||||
}
|
||||
}
|
||||
625
Engine/source/math/isa/avx2/avx2_intrinsics.h
Normal file
625
Engine/source/math/isa/avx2/avx2_intrinsics.h
Normal file
|
|
@ -0,0 +1,625 @@
|
|||
#pragma once
|
||||
#include <immintrin.h> // AVX2
|
||||
|
||||
namespace
|
||||
{
|
||||
typedef __m128 f32x4;
|
||||
|
||||
//------------------------------------------------------
|
||||
// Load / Store
|
||||
//------------------------------------------------------
|
||||
|
||||
// Load 4 floats from memory into a SIMD register
|
||||
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }
|
||||
|
||||
inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }
|
||||
|
||||
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }
|
||||
|
||||
inline f32x4 v_zero() { return _mm_setzero_ps(); }
|
||||
|
||||
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
|
||||
|
||||
//------------------------------------------------------
|
||||
// Mask helpers
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4 v_mask_xyz() { return _mm_blend_ps(_mm_set1_ps(0.0f), _mm_set1_ps(1.0f), 0b0111); }
|
||||
|
||||
inline f32x4 v_swizzle_mask(f32x4 v1,const int x, const int y, const int z, const int w)
|
||||
{
|
||||
__m128i idx = _mm_set_epi32(w, z, y, x);
|
||||
return _mm_permutevar_ps(v1, idx);
|
||||
}
|
||||
|
||||
inline f32x4 v_swizzle_singular_mask(f32x4 v1, const int x)
|
||||
{
|
||||
__m128i idx = _mm_set1_epi32(x);
|
||||
return _mm_permutevar_ps(v1, idx);
|
||||
}
|
||||
|
||||
inline f32x4 v_swizzle_lo(f32x4 v1)
|
||||
{
|
||||
return _mm_moveldup_ps(v1);
|
||||
}
|
||||
|
||||
inline f32x4 v_swizzle_hi(f32x4 v1)
|
||||
{
|
||||
return _mm_movehdup_ps(v1);
|
||||
}
|
||||
|
||||
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
|
||||
{
|
||||
return _mm_blend_ps(newv, original, 0b1000);
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Shuffle helpers
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
|
||||
{
|
||||
f32x4 lo = v1;
|
||||
f32x4 hi = v2;
|
||||
|
||||
f32x4 combined_lo = _mm_permutevar_ps(lo, _mm_set_epi32(y, y, x, x));
|
||||
f32x4 combined_hi = _mm_permutevar_ps(hi, _mm_set_epi32(w, w, z, z));
|
||||
|
||||
return _mm_movelh_ps(combined_lo, combined_hi);
|
||||
}
|
||||
|
||||
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2)
|
||||
{
|
||||
return _mm_movelh_ps(v1, v2);
|
||||
}
|
||||
|
||||
inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2)
|
||||
{
|
||||
return _mm_movehl_ps(v2, v1);
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Float3 helpers (safe loading into 4 lanes)
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4 v_load3_vec(const float* p) // w = 0
|
||||
{
|
||||
return _mm_set_ps(0.0f, p[2], p[1], p[0]);
|
||||
}
|
||||
|
||||
inline f32x4 v_load3_pos(const float* p) // w = 1
|
||||
{
|
||||
return _mm_set_ps(1.0f, p[2], p[1], p[0]);
|
||||
}
|
||||
|
||||
// Write lanes x/y/z of v to dst[0..2]; dst[3] (if any) is left untouched,
// and exactly three floats are written. (f32x4 is a typedef of __m128.)
inline void v_store3(float* dst, __m128 v)
{
    _mm_store_ss(dst + 0, v);                                  // lane 0 -> dst[0]
    _mm_store_ss(dst + 1, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1))); // lane 1
    _mm_store_ss(dst + 2, _mm_movehl_ps(v, v));                // lane 2 -> dst[2]
}
|
||||
|
||||
inline f32x4 v_set(float x, float y, float z, float w)
|
||||
{
|
||||
return _mm_set_ps(w, z, y, x);
|
||||
}
|
||||
|
||||
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
|
||||
{
|
||||
return _mm_insert_ps(v, w, 0x30);
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Simple Arithmatic
|
||||
//------------------------------------------------------
|
||||
|
||||
// Element-wise multiply
|
||||
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }
|
||||
|
||||
// Element-wise divide
|
||||
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }
|
||||
|
||||
// Element-wise add
|
||||
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }
|
||||
|
||||
// Element-wise subtract
|
||||
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
|
||||
|
||||
//------------------------------------------------------
|
||||
// Fast recip
|
||||
//------------------------------------------------------
|
||||
|
||||
// Fast reciprocal 1/b: the hardware estimate (~12-bit) is refined by two
// Newton-Raphson steps, r' = 2r - b*r*r, which roughly doubles the number of
// correct bits per step. (Removed the unused local `two` the original declared;
// the refinement already uses r+r instead of 2*r. f32x4 is a typedef of __m128.)
inline __m128 v_rcp_nr(__m128 b)
{
    __m128 r = _mm_rcp_ps(b);
    for (int i = 0; i < 2; ++i)
        r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
    return r;
}
|
||||
|
||||
// Divide fast ( b = recip eg 1/b)
|
||||
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
|
||||
|
||||
inline f32x4 v_rsqrt_nr(f32x4 x)
|
||||
{
|
||||
f32x4 half = _mm_set1_ps(0.5f);
|
||||
f32x4 three = _mm_set1_ps(3.0f);
|
||||
|
||||
f32x4 r = _mm_rsqrt_ps(x);
|
||||
f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
|
||||
return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Vector intrinsic functions
|
||||
//------------------------------------------------------
|
||||
|
||||
// full dot4
|
||||
inline f32x4 v_dot4(f32x4 a, f32x4 b)
|
||||
{
|
||||
return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes
|
||||
}
|
||||
|
||||
// dot3 (ignores w)
|
||||
inline f32x4 v_dot3(f32x4 a, f32x4 b)
|
||||
{
|
||||
return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into all lanes
|
||||
}
|
||||
|
||||
// cross product xyz only.
|
||||
inline f32x4 v_cross(f32x4 a, f32x4 b)
|
||||
{
|
||||
f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
|
||||
f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
|
||||
|
||||
f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
|
||||
|
||||
return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
|
||||
}
|
||||
|
||||
inline f32x4 v_normalize3(f32x4 v)
|
||||
{
|
||||
const f32x4 zero = _mm_setzero_ps();
|
||||
const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
|
||||
f32x4 dot = v_dot3(v, v);
|
||||
|
||||
f32x4 inv = v_rsqrt_nr(dot);
|
||||
f32x4 isZero = _mm_cmpeq_ps(dot, zero);
|
||||
f32x4 norm = _mm_mul_ps(v, inv);
|
||||
|
||||
return _mm_blendv_ps(norm, fallback, isZero);
|
||||
}
|
||||
|
||||
// adds all 4 lanes together.
|
||||
inline f32x4 v_hadd4(f32x4 a)
|
||||
{
|
||||
// sum all 4 lanes in SSE41
|
||||
__m128 sum = _mm_hadd_ps(a, a);
|
||||
return _mm_hadd_ps(sum, sum);
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix type (row-major 4x4)
|
||||
//------------------------------------------------------
|
||||
|
||||
struct f32x4x4
|
||||
{
|
||||
f32x4 r0;
|
||||
f32x4 r1;
|
||||
f32x4 r2;
|
||||
f32x4 r3;
|
||||
};
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix Load / Store
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
|
||||
{
|
||||
f32x4x4 out;
|
||||
out.r0 = v_load(m + 0);
|
||||
out.r1 = v_load(m + 4);
|
||||
out.r2 = v_load(m + 8);
|
||||
out.r3 = v_load(m + 12);
|
||||
return out;
|
||||
}
|
||||
|
||||
inline void m_store(float* dst, const f32x4x4& m)
|
||||
{
|
||||
v_store(dst + 0, m.r0);
|
||||
v_store(dst + 4, m.r1);
|
||||
v_store(dst + 8, m.r2);
|
||||
v_store(dst + 12, m.r3);
|
||||
}
|
||||
|
||||
inline f32x4x4 m_identity()
|
||||
{
|
||||
f32x4x4 m;
|
||||
m.r0 = _mm_set_ps(0, 0, 0, 1);
|
||||
m.r1 = _mm_set_ps(0, 0, 1, 0);
|
||||
m.r2 = _mm_set_ps(0, 1, 0, 0);
|
||||
m.r3 = _mm_set_ps(1, 0, 0, 0);
|
||||
return m;
|
||||
}
|
||||
|
||||
inline f32x4x4 m_zero()
|
||||
{
|
||||
f32x4 z = v_zero();
|
||||
return { z, z, z, z };
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix × Vector
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
|
||||
{
|
||||
f32x4 x = v_dot4(m.r0, v);
|
||||
f32x4 y = v_dot4(m.r1, v);
|
||||
f32x4 z = v_dot4(m.r2, v);
|
||||
f32x4 w = v_dot4(m.r3, v);
|
||||
|
||||
// combine to a vector
|
||||
return _mm_set_ps(_mm_cvtss_f32(w),
|
||||
_mm_cvtss_f32(z),
|
||||
_mm_cvtss_f32(y),
|
||||
_mm_cvtss_f32(x));
|
||||
}
|
||||
|
||||
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
|
||||
{
|
||||
f32x4 x = v_dot3(m.r0, v);
|
||||
f32x4 y = v_dot3(m.r1, v);
|
||||
f32x4 z = v_dot3(m.r2, v);
|
||||
|
||||
// combine to a vector
|
||||
return _mm_set_ps(0.0f,
|
||||
_mm_cvtss_f32(z),
|
||||
_mm_cvtss_f32(y),
|
||||
_mm_cvtss_f32(x));
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix × Matrix
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
|
||||
{
|
||||
// Transpose B once for dot products
|
||||
f32x4 b0 = b.r0;
|
||||
f32x4 b1 = b.r1;
|
||||
f32x4 b2 = b.r2;
|
||||
f32x4 b3 = b.r3;
|
||||
|
||||
// Transpose (SSE2)
|
||||
_MM_TRANSPOSE4_PS(b0, b1, b2, b3);
|
||||
|
||||
f32x4x4 C;
|
||||
|
||||
auto mul_row = [&](f32x4 arow)
|
||||
{
|
||||
f32x4 x = v_dot4(arow, b0);
|
||||
f32x4 y = v_dot4(arow, b1);
|
||||
f32x4 z = v_dot4(arow, b2);
|
||||
f32x4 w = v_dot4(arow, b3);
|
||||
|
||||
f32x4 xy = _mm_unpacklo_ps(x, y);
|
||||
f32x4 zw = _mm_unpacklo_ps(z, w);
|
||||
return _mm_movelh_ps(xy, zw);
|
||||
};
|
||||
|
||||
C.r0 = mul_row(a.r0);
|
||||
C.r1 = mul_row(a.r1);
|
||||
C.r2 = mul_row(a.r2);
|
||||
C.r3 = mul_row(a.r3);
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Transpose
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4x4 m_transpose(const f32x4x4& m)
|
||||
{
|
||||
f32x4 r0 = m.r0;
|
||||
f32x4 r1 = m.r1;
|
||||
f32x4 r2 = m.r2;
|
||||
f32x4 r3 = m.r3;
|
||||
|
||||
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
|
||||
|
||||
return { r0, r1, r2, r3 };
|
||||
}
|
||||
|
||||
// Inverse of a rigid (orthonormal rotation + translation) row-major transform:
// M^-1 = [ R^T | -R^T * T ], assuming the last row is (0,0,0,1). Translation
// is stored in the w lane of r0..r2 (consistent with m_mul_pos3_batch*, which
// reads m[3], m[7], m[11] as the translation terms).
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
{
    f32x4x4 t = m_transpose(m);

    // Gather the translation column (w lanes of the first three rows); w = 0
    // so the dot products below touch only the rotation part.
    f32x4 T = v_set(
        _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
        _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
        _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
        0.0f
    );

    // newT = -(R^T * T): after transposing, t's upper 3x3 is R^T.
    f32x4 newT = m_mul_vec4(t, T);
    newT = _mm_sub_ps(v_zero(), newT);

    // Place the MATCHING component of newT into each row's w lane.
    // (The previous code blended newT's own w lane into every row, so all
    // three rows received the same — wrong — translation value.)
    t.r0 = _mm_insert_ps(t.r0, newT, 0x30); // newT.x -> r0.w
    t.r1 = _mm_insert_ps(t.r1, newT, 0x70); // newT.y -> r1.w
    t.r2 = _mm_insert_ps(t.r2, newT, 0xB0); // newT.z -> r2.w
    t.r3 = v_set(0, 0, 0, 1);

    return t;
}
|
||||
|
||||
// Determinant of the 4x4 matrix, splatted across all four lanes of the result.
// NOTE(review): this expands the determinant as a . (c×d + d×b + b×c) summed
// over all lanes via v_hadd4, where the crosses are 3D (w lane of each cross
// is 0, see v_cross). That expansion does not reproduce a general 4x4
// determinant; it only matches for particular matrix forms (e.g. a fixed last
// column). Confirm against the scalar mat44 determinant before relying on this
// for arbitrary matrices.
inline f32x4 m_determinant(const f32x4x4& m)
{
    f32x4 a = m.r0;
    f32x4 b = m.r1;
    f32x4 c = m.r2;
    f32x4 d = m.r3;

    // 3D cross products of row pairs (each leaves lane w = 0)
    f32x4 c0 = v_cross(c, d);
    f32x4 c1 = v_cross(d, b);
    f32x4 c2 = v_cross(b, c);

    f32x4 term0 = _mm_mul_ps(a, c0);
    f32x4 term1 = _mm_mul_ps(a, c1);
    f32x4 term2 = _mm_mul_ps(a, c2);

    f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));

    // horizontal sum of all four lanes -> scalar result broadcast to every lane
    return v_hadd4(det);
}
|
||||
|
||||
inline f32x4 m_determinant_affine(const f32x4x4& m)
|
||||
{
|
||||
f32x4 r0 = m.r0;
|
||||
f32x4 r1 = m.r1;
|
||||
f32x4 r2 = m.r2;
|
||||
|
||||
r0 = _mm_and_ps(r0, v_mask_xyz());
|
||||
r1 = _mm_and_ps(r1, v_mask_xyz());
|
||||
r2 = _mm_and_ps(r2, v_mask_xyz());
|
||||
|
||||
f32x4 c0 = v_cross(r1, r2);
|
||||
return v_dot3(r0, c0); // splatted determinant
|
||||
}
|
||||
|
||||
//---------------------------------------------------------
|
||||
// BATCH INTRINSICS
|
||||
//---------------------------------------------------------
|
||||
typedef __m256 f32x8;
|
||||
|
||||
struct vec4_batch8
|
||||
{
|
||||
f32x8 x;
|
||||
f32x8 y;
|
||||
f32x8 z;
|
||||
f32x8 w;
|
||||
};
|
||||
|
||||
struct vec4_batch4
|
||||
{
|
||||
f32x4 x;
|
||||
f32x4 y;
|
||||
f32x4 z;
|
||||
f32x4 w;
|
||||
};
|
||||
|
||||
// Load 8 packed float3 vertices (24 floats) into SoA form, lane i = vertex i.
// _mm256_set_ps takes its arguments HIGH-lane-first, so vertex 0's components
// must be the last arguments. (The original passed vertex 0 first, placing it
// in lane 7 and reversing the batch relative to store_vec3_batch8, which
// writes lane i to vertex i.)
// When fillW is false, r.w is left uninitialized — unchanged behavior; callers
// must set it before use.
inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW)
{
    vec4_batch8 r;

    r.x = _mm256_set_ps(ptr[21], ptr[18], ptr[15], ptr[12],
                        ptr[9],  ptr[6],  ptr[3],  ptr[0]);
    r.y = _mm256_set_ps(ptr[22], ptr[19], ptr[16], ptr[13],
                        ptr[10], ptr[7],  ptr[4],  ptr[1]);
    r.z = _mm256_set_ps(ptr[23], ptr[20], ptr[17], ptr[14],
                        ptr[11], ptr[8],  ptr[5],  ptr[2]);

    if (fillW)
    {
        r.w = _mm256_set1_ps(w);
    }

    return r;
}
|
||||
|
||||
// Load 4 packed float3 vertices (12 floats) into SoA form, lane i = vertex i.
// _mm_set_ps takes its arguments HIGH-lane-first, so vertex 0's components go
// last. (The original passed them first, reversing the batch relative to
// store_vec3_batch4, which writes lane i to vertex i.)
// When fillW is false, r.w is left uninitialized — unchanged behavior.
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
    vec4_batch4 r;

    r.x = _mm_set_ps(ptr[9],  ptr[6], ptr[3], ptr[0]);
    r.y = _mm_set_ps(ptr[10], ptr[7], ptr[4], ptr[1]);
    r.z = _mm_set_ps(ptr[11], ptr[8], ptr[5], ptr[2]);

    if (fillW)
    {
        r.w = _mm_set1_ps(w);
    }

    return r;
}
|
||||
|
||||
// Store the result back out to a float array 8 at a time
|
||||
inline void store_f32x8(float* out, f32x8 v)
|
||||
{
|
||||
_mm256_storeu_ps(out, v);
|
||||
}
|
||||
|
||||
// Store the result back out to a float array 4 at a time
|
||||
inline void store_f32x4(float* out, f32x4 v)
|
||||
{
|
||||
_mm_storeu_ps(out, v);
|
||||
}
|
||||
|
||||
// Store the result back to a float3 array with size of 8
|
||||
inline void store_vec3_batch8(float* out, const vec4_batch8& v)
|
||||
{
|
||||
alignas(32) float xs[8];
|
||||
alignas(32) float ys[8];
|
||||
alignas(32) float zs[8];
|
||||
|
||||
_mm256_store_ps(xs, v.x);
|
||||
_mm256_store_ps(ys, v.y);
|
||||
_mm256_store_ps(zs, v.z);
|
||||
|
||||
for (int i = 0; i < 8; ++i)
|
||||
{
|
||||
out[i * 3 + 0] = xs[i];
|
||||
out[i * 3 + 1] = ys[i];
|
||||
out[i * 3 + 2] = zs[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Store the result back to a float3 array with size of 4
|
||||
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
|
||||
{
|
||||
alignas(16) float xs[8];
|
||||
alignas(16) float ys[8];
|
||||
alignas(16) float zs[8];
|
||||
|
||||
_mm_store_ps(xs, v.x);
|
||||
_mm_store_ps(ys, v.y);
|
||||
_mm_store_ps(zs, v.z);
|
||||
|
||||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
out[i * 3 + 0] = xs[i];
|
||||
out[i * 3 + 1] = ys[i];
|
||||
out[i * 3 + 2] = zs[i];
|
||||
}
|
||||
}
|
||||
|
||||
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
|
||||
{
|
||||
f32x4 mulx = _mm_mul_ps(a.x, b.x);
|
||||
f32x4 muly = _mm_mul_ps(a.y, b.y);
|
||||
f32x4 mulz = _mm_mul_ps(a.z, b.z);
|
||||
|
||||
return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
|
||||
}
|
||||
|
||||
inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b)
|
||||
{
|
||||
f32x8 mulx = _mm256_mul_ps(a.x, b.x);
|
||||
f32x8 muly = _mm256_mul_ps(a.y, b.y);
|
||||
f32x8 mulz = _mm256_mul_ps(a.z, b.z);
|
||||
|
||||
return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz);
|
||||
}
|
||||
|
||||
// Batch 8 mul_Vec4.
|
||||
inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v)
|
||||
{
|
||||
vec4_batch8 r;
|
||||
|
||||
__m256 m00 = _mm256_set1_ps(m[0]);
|
||||
__m256 m01 = _mm256_set1_ps(m[1]);
|
||||
__m256 m02 = _mm256_set1_ps(m[2]);
|
||||
__m256 m03 = _mm256_set1_ps(m[3]);
|
||||
|
||||
__m256 m10 = _mm256_set1_ps(m[4]);
|
||||
__m256 m11 = _mm256_set1_ps(m[5]);
|
||||
__m256 m12 = _mm256_set1_ps(m[6]);
|
||||
__m256 m13 = _mm256_set1_ps(m[7]);
|
||||
|
||||
__m256 m20 = _mm256_set1_ps(m[8]);
|
||||
__m256 m21 = _mm256_set1_ps(m[9]);
|
||||
__m256 m22 = _mm256_set1_ps(m[10]);
|
||||
__m256 m23 = _mm256_set1_ps(m[11]);
|
||||
|
||||
//
|
||||
// row0 dot
|
||||
//
|
||||
r.x = _mm256_add_ps(
|
||||
_mm256_add_ps(
|
||||
_mm256_mul_ps(v.x, m00),
|
||||
_mm256_mul_ps(v.y, m01)),
|
||||
_mm256_add_ps(
|
||||
_mm256_mul_ps(v.z, m02), m03));
|
||||
|
||||
//
|
||||
// row1 dot
|
||||
//
|
||||
r.y = _mm256_add_ps(
|
||||
_mm256_add_ps(
|
||||
_mm256_mul_ps(v.x, m10),
|
||||
_mm256_mul_ps(v.y, m11)),
|
||||
_mm256_add_ps(
|
||||
_mm256_mul_ps(v.z, m12), m13));
|
||||
|
||||
//
|
||||
// row2 dot
|
||||
//
|
||||
r.z = _mm256_add_ps(
|
||||
_mm256_add_ps(
|
||||
_mm256_mul_ps(v.x, m20),
|
||||
_mm256_mul_ps(v.y, m21)),
|
||||
_mm256_add_ps(
|
||||
_mm256_mul_ps(v.z, m22), m23));
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
// Batch 4 mul_Vec4.
|
||||
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
|
||||
{
|
||||
vec4_batch4 r;
|
||||
|
||||
f32x4 m00 = _mm_set1_ps(m[0]);
|
||||
f32x4 m01 = _mm_set1_ps(m[1]);
|
||||
f32x4 m02 = _mm_set1_ps(m[2]);
|
||||
f32x4 m03 = _mm_set1_ps(m[3]);
|
||||
|
||||
f32x4 m10 = _mm_set1_ps(m[4]);
|
||||
f32x4 m11 = _mm_set1_ps(m[5]);
|
||||
f32x4 m12 = _mm_set1_ps(m[6]);
|
||||
f32x4 m13 = _mm_set1_ps(m[7]);
|
||||
|
||||
f32x4 m20 = _mm_set1_ps(m[8]);
|
||||
f32x4 m21 = _mm_set1_ps(m[9]);
|
||||
f32x4 m22 = _mm_set1_ps(m[10]);
|
||||
f32x4 m23 = _mm_set1_ps(m[11]);
|
||||
|
||||
//
|
||||
// row0 dot
|
||||
//
|
||||
r.x = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.x, m00),
|
||||
_mm_mul_ps(v.y, m01)),
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.z, m02), m03));
|
||||
|
||||
//
|
||||
// row1 dot
|
||||
//
|
||||
r.y = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.x, m10),
|
||||
_mm_mul_ps(v.y, m11)),
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.z, m12), m13));
|
||||
|
||||
//
|
||||
// row2 dot
|
||||
//
|
||||
r.z = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.x, m20),
|
||||
_mm_mul_ps(v.y, m21)),
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.z, m22), m23));
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
}
|
||||
26
Engine/source/math/isa/avx2/float3.cpp
Normal file
26
Engine/source/math/isa/avx2/float3.cpp
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
#include "avx2_intrinsics.h"
|
||||
#include "float3_dispatch.h"
|
||||
#include <immintrin.h> // AVX/AVX2 intrinsics
|
||||
|
||||
#include "float3_impl.inl"
|
||||
|
||||
namespace math_backend::float3::dispatch
|
||||
{
|
||||
// Install AVX2 backend
|
||||
void install_avx2()
|
||||
{
|
||||
gFloat3.add = float3_add_impl;
|
||||
gFloat3.sub = float3_sub_impl;
|
||||
gFloat3.mul = float3_mul_impl;
|
||||
gFloat3.mul_scalar = float3_mul_scalar_impl;
|
||||
gFloat3.div = float3_div_impl;
|
||||
gFloat3.div_scalar = float3_div_scalar_impl;
|
||||
gFloat3.dot = float3_dot_impl;
|
||||
gFloat3.length = float3_length_impl;
|
||||
gFloat3.lengthSquared = float3_length_squared_impl;
|
||||
gFloat3.normalize = float3_normalize_impl;
|
||||
gFloat3.normalize_mag = float3_normalize_mag_impl;
|
||||
gFloat3.lerp = float3_lerp_impl;
|
||||
gFloat3.cross = float3_cross_impl;
|
||||
}
|
||||
}
|
||||
25
Engine/source/math/isa/avx2/float4.cpp
Normal file
25
Engine/source/math/isa/avx2/float4.cpp
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
#include "avx2_intrinsics.h"
|
||||
#include "float4_dispatch.h"
|
||||
|
||||
#include "float4_impl.inl"
|
||||
|
||||
namespace math_backend::float4::dispatch
|
||||
{
|
||||
// Install AVX2 backend
|
||||
void install_avx2()
|
||||
{
|
||||
gFloat4.add = float4_add_impl;
|
||||
gFloat4.sub = float4_sub_impl;
|
||||
gFloat4.mul = float4_mul_impl;
|
||||
gFloat4.mul_scalar = float4_mul_scalar_impl;
|
||||
gFloat4.div = float4_div_impl;
|
||||
gFloat4.div_scalar = float4_div_scalar_impl;
|
||||
gFloat4.dot = float4_dot_impl;
|
||||
gFloat4.length = float4_length_impl;
|
||||
gFloat4.lengthSquared = float4_length_squared_impl;
|
||||
gFloat4.normalize = float4_normalize_impl;
|
||||
gFloat4.normalize_mag = float4_normalize_mag_impl;
|
||||
gFloat4.lerp = float4_lerp_impl;
|
||||
gFloat4.cross = float4_cross_impl;
|
||||
}
|
||||
}
|
||||
25
Engine/source/math/isa/avx2/mat44.cpp
Normal file
25
Engine/source/math/isa/avx2/mat44.cpp
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
#include "avx2_intrinsics.h"
|
||||
#include "mat44_dispatch.h"
|
||||
|
||||
#include "mat44_impl.inl"
|
||||
|
||||
namespace math_backend::mat44::dispatch
|
||||
{
|
||||
void install_avx2()
|
||||
{
|
||||
gMat44.transpose = mat44_transpose_impl;
|
||||
gMat44.inverse = mat44_inverse_impl;
|
||||
gMat44.inverse_to = mat44_inverse_to_impl;
|
||||
gMat44.affine_inverse = mat44_affine_inverse_impl;
|
||||
gMat44.mul_mat44 = mat44_mul_mat44_impl;
|
||||
gMat44.mul_pos3 = mat44_mul_pos3_impl;
|
||||
gMat44.mul_vec3 = mat44_mul_vec3_impl;
|
||||
gMat44.mul_float4 = mat44_mul_float4_impl;
|
||||
gMat44.transform_plane = mat44_transform_plane_impl;
|
||||
gMat44.scale = mat44_scale_impl;
|
||||
gMat44.get_scale = mat44_get_scale_impl;
|
||||
gMat44.normalize = mat44_normalize_impl;
|
||||
gMat44.determinant = mat44_get_determinant;
|
||||
gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
|
||||
}
|
||||
}
|
||||
25
Engine/source/math/isa/neon/float3.cpp
Normal file
25
Engine/source/math/isa/neon/float3.cpp
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
#include "neon_intrinsics.h"
|
||||
#include "float3_dispatch.h"
|
||||
|
||||
#include "float3_impl.inl"
|
||||
|
||||
namespace math_backend::float3::dispatch
|
||||
{
|
||||
// Install NEON backend
|
||||
void install_neon()
|
||||
{
|
||||
gFloat3.add = float3_add_impl;
|
||||
gFloat3.sub = float3_sub_impl;
|
||||
gFloat3.mul = float3_mul_impl;
|
||||
gFloat3.mul_scalar = float3_mul_scalar_impl;
|
||||
gFloat3.div = float3_div_impl;
|
||||
gFloat3.div_scalar = float3_div_scalar_impl;
|
||||
gFloat3.dot = float3_dot_impl;
|
||||
gFloat3.length = float3_length_impl;
|
||||
gFloat3.lengthSquared = float3_length_squared_impl;
|
||||
gFloat3.normalize = float3_normalize_impl;
|
||||
gFloat3.normalize_mag = float3_normalize_mag_impl;
|
||||
gFloat3.lerp = float3_lerp_impl;
|
||||
gFloat3.cross = float3_cross_impl;
|
||||
}
|
||||
}
|
||||
25
Engine/source/math/isa/neon/float4.cpp
Normal file
25
Engine/source/math/isa/neon/float4.cpp
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
#include "neon_intrinsics.h"
|
||||
#include "float4_dispatch.h"
|
||||
|
||||
#include "float4_impl.inl"
|
||||
|
||||
namespace math_backend::float4::dispatch
|
||||
{
|
||||
// Install NEON64 backend
|
||||
void install_neon()
|
||||
{
|
||||
gFloat4.add = float4_add_impl;
|
||||
gFloat4.sub = float4_sub_impl;
|
||||
gFloat4.mul = float4_mul_impl;
|
||||
gFloat4.mul_scalar = float4_mul_scalar_impl;
|
||||
gFloat4.div = float4_div_impl;
|
||||
gFloat4.div_scalar = float4_div_scalar_impl;
|
||||
gFloat4.dot = float4_dot_impl;
|
||||
gFloat4.length = float4_length_impl;
|
||||
gFloat4.lengthSquared = float4_length_squared_impl;
|
||||
gFloat4.normalize = float4_normalize_impl;
|
||||
gFloat4.normalize_mag = float4_normalize_mag_impl;
|
||||
gFloat4.lerp = float4_lerp_impl;
|
||||
gFloat4.cross = float4_cross_impl;
|
||||
}
|
||||
}
|
||||
23
Engine/source/math/isa/neon/mat44.cpp
Normal file
23
Engine/source/math/isa/neon/mat44.cpp
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
#include "neon_intrinsics.h"
|
||||
#include "mat44_dispatch.h"
|
||||
|
||||
#include "mat44_impl.inl"
|
||||
|
||||
namespace math_backend::mat44::dispatch
|
||||
{
|
||||
void install_neon()
|
||||
{
|
||||
gMat44.transpose = mat44_transpose_impl;
|
||||
gMat44.inverse = mat44_inverse_impl;
|
||||
gMat44.inverse_to = mat44_inverse_to_impl;
|
||||
gMat44.affine_inverse = mat44_affine_inverse_impl;
|
||||
gMat44.mul_mat44 = mat44_mul_mat44_impl;
|
||||
gMat44.mul_pos3 = mat44_mul_pos3_impl;
|
||||
gMat44.mul_vec3 = mat44_mul_vec3_impl;
|
||||
gMat44.mul_float4 = mat44_mul_float4_impl;
|
||||
gMat44.scale = mat44_scale_impl;
|
||||
gMat44.get_scale = mat44_get_scale_impl;
|
||||
gMat44.normalize = mat44_normalize_impl;
|
||||
gMat44.determinant = mat44_get_determinant;
|
||||
}
|
||||
}
|
||||
504
Engine/source/math/isa/neon/neon_intrinsics.h
Normal file
504
Engine/source/math/isa/neon/neon_intrinsics.h
Normal file
|
|
@ -0,0 +1,504 @@
|
|||
#pragma once
|
||||
#include <arm_neon.h>
|
||||
|
||||
namespace
|
||||
{
|
||||
typedef float32x4_t f32x4;
|
||||
|
||||
//------------------------------------------------------
|
||||
// Load / Store
|
||||
//------------------------------------------------------
|
||||
inline f32x4 v_load(const float* p) { return vld1q_f32(p); }
|
||||
inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); }
|
||||
inline f32x4 v_set1(float s) { return vdupq_n_f32(s); }
|
||||
inline f32x4 v_zero() { return vdupq_n_f32(0.0f); }
|
||||
inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); }
|
||||
|
||||
inline f32x4 v_set(float x, float y, float z, float w)
|
||||
{
|
||||
return { x, y, z, w }; // lane0=x, lane1=y, lane2=z, lane3=w
|
||||
}
|
||||
|
||||
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
|
||||
{
|
||||
// extract scalar value from w lane0
|
||||
float w_val = vgetq_lane_f32(w, 0);
|
||||
|
||||
// broadcast w_val to all lanes (we only need lane3 later)
|
||||
f32x4 w_broadcast = vdupq_n_f32(w_val);
|
||||
|
||||
// mask to select only lane3
|
||||
uint32x4_t mask = {0, 0, 0, 0xFFFFFFFF};
|
||||
|
||||
// vbslq: if mask bit=1, take from first argument, else take from second
|
||||
return vbslq_f32(mask, w_broadcast, v);
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Mask helpers
|
||||
//------------------------------------------------------
|
||||
inline f32x4 v_mask_xyz()
|
||||
{
|
||||
uint32x4_t mask = {
|
||||
0xFFFFFFFF,
|
||||
0xFFFFFFFF,
|
||||
0xFFFFFFFF,
|
||||
0x00000000
|
||||
};
|
||||
|
||||
return vreinterpretq_f32_u32(mask);
|
||||
}
|
||||
|
||||
inline f32x4 v_and(f32x4 a, f32x4 b)
|
||||
{
|
||||
return vreinterpretq_f32_u32(
|
||||
vandq_u32(
|
||||
vreinterpretq_u32_f32(a),
|
||||
vreinterpretq_u32_f32(b)
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
inline f32x4 v_swizzle_singular_mask(f32x4 v, int x)
|
||||
{
|
||||
// base byte index of the float lane
|
||||
uint8x16_t base = vdupq_n_u8((uint8_t)(x * 4));
|
||||
|
||||
// byte offsets inside a float (0,1,2,3 repeated 4 times)
|
||||
const uint8x16_t offsets = {
|
||||
0,1,2,3, 0,1,2,3,
|
||||
0,1,2,3, 0,1,2,3
|
||||
};
|
||||
|
||||
uint8x16_t idx = vaddq_u8(base, offsets);
|
||||
|
||||
return vreinterpretq_f32_u8(
|
||||
vqtbl1q_u8(
|
||||
vreinterpretq_u8_f32(v),
|
||||
idx
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
inline f32x4 v_swizzle_lo(f32x4 v)
|
||||
{
|
||||
float32x4x2_t t = vzipq_f32(v, v);
|
||||
return t.val[0]; // [x, x, z, z]
|
||||
}
|
||||
|
||||
inline f32x4 v_swizzle_hi(f32x4 v)
|
||||
{
|
||||
float32x4x2_t t = vzipq_f32(v, v);
|
||||
return t.val[1]; // [y, y, w, w]
|
||||
}
|
||||
|
||||
// Blend: keep x/y/z of newv, keep w of original.
// The select mask must be a true all-ones bit pattern in lane 3 — the original
// built it from float 1.0f (bit pattern 0x3F800000) reinterpreted to u32, so
// vbslq_f32 mixed individual BITS from both inputs in the w lane instead of
// selecting original's w wholesale. (f32x4 is a typedef of float32x4_t.)
inline float32x4_t v_preserve_w(float32x4_t newv, float32x4_t original)
{
    const uint32x4_t mask = {0u, 0u, 0u, 0xFFFFFFFFu};
    return vbslq_f32(mask, original, newv);
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Float3 helpers
|
||||
//------------------------------------------------------
|
||||
inline f32x4 v_load3_vec(const float* p) // w = 0
|
||||
{
|
||||
float tmp[4] = { p[0], p[1], p[2], 0.0f };
|
||||
return vld1q_f32(tmp);
|
||||
}
|
||||
|
||||
inline f32x4 v_load3_pos(const float* p) // w = 1
|
||||
{
|
||||
float tmp[4] = { p[0], p[1], p[2], 1.0f };
|
||||
return vld1q_f32(tmp);
|
||||
}
|
||||
|
||||
inline void v_store3(float* dst, f32x4 v)
|
||||
{
|
||||
float tmp[4];
|
||||
vst1q_f32(tmp, v);
|
||||
dst[0] = tmp[0];
|
||||
dst[1] = tmp[1];
|
||||
dst[2] = tmp[2];
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Simple Arithmetic
|
||||
//------------------------------------------------------
|
||||
inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
|
||||
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return vdivq_f32(a, b); } // only NEON64
|
||||
inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
|
||||
inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
|
||||
|
||||
//------------------------------------------------------
|
||||
// Fast recip
|
||||
//------------------------------------------------------
|
||||
inline f32x4 v_rcp_nr(f32x4 b)
|
||||
{
|
||||
f32x4 r = vrecpeq_f32(b);
|
||||
r = vmulq_f32(r, vrecpsq_f32(b, r)); // Newton-Raphson
|
||||
r = vmulq_f32(r, vrecpsq_f32(b, r));
|
||||
return r;
|
||||
}
|
||||
|
||||
inline f32x4 v_div(f32x4 a, f32x4 b)
|
||||
{
|
||||
return vmulq_f32(a, v_rcp_nr(b));
|
||||
}
|
||||
|
||||
inline f32x4 v_rsqrt_nr(f32x4 x)
|
||||
{
|
||||
f32x4 r = vrsqrteq_f32(x);
|
||||
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x)); // refine
|
||||
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r,r), x));
|
||||
return r;
|
||||
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Vector intrinsic functions
|
||||
//------------------------------------------------------
|
||||
inline f32x4 v_dot4(f32x4 a, f32x4 b)
|
||||
{
|
||||
f32x4 mul = vmulq_f32(a, b);
|
||||
float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
|
||||
float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
|
||||
return vdupq_n_f32(sum);
|
||||
}
|
||||
|
||||
inline f32x4 v_dot3(f32x4 a, f32x4 b)
|
||||
{
|
||||
float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f};
|
||||
f32x4 mul = vmulq_f32(a, b);
|
||||
mul = vmulq_f32(mul, mask);
|
||||
float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
|
||||
float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
|
||||
return vdupq_n_f32(sum);
|
||||
}
|
||||
|
||||
inline f32x4 v_cross(f32x4 a, f32x4 b)
|
||||
{
|
||||
f32x4 a_yzx = { vgetq_lane_f32(a,1),
|
||||
vgetq_lane_f32(a,2),
|
||||
vgetq_lane_f32(a,0),
|
||||
0.0f };
|
||||
|
||||
f32x4 b_zxy = { vgetq_lane_f32(b,2),
|
||||
vgetq_lane_f32(b,0),
|
||||
vgetq_lane_f32(b,1),
|
||||
0.0f };
|
||||
|
||||
f32x4 a_zxy = { vgetq_lane_f32(a,2),
|
||||
vgetq_lane_f32(a,0),
|
||||
vgetq_lane_f32(a,1),
|
||||
0.0f };
|
||||
|
||||
f32x4 b_yzx = { vgetq_lane_f32(b,1),
|
||||
vgetq_lane_f32(b,2),
|
||||
vgetq_lane_f32(b,0),
|
||||
0.0f };
|
||||
|
||||
return vsubq_f32(
|
||||
vmulq_f32(a_yzx, b_zxy),
|
||||
vmulq_f32(a_zxy, b_yzx)
|
||||
);
|
||||
}
|
||||
|
||||
inline f32x4 v_normalize3(f32x4 v)
|
||||
{
|
||||
const float32x4_t zero = vdupq_n_f32(0.0f);
|
||||
const float32x4_t fallback = {0.0f, 0.0f, 1.0f, 0.0f};
|
||||
|
||||
f32x4 dot = v_dot3(v, v);
|
||||
|
||||
// dot == 0?
|
||||
uint32x4_t isZero = vceqq_f32(dot, zero);
|
||||
|
||||
f32x4 inv = v_rsqrt_nr(dot);
|
||||
f32x4 norm = vmulq_f32(v, inv);
|
||||
|
||||
// Select fallback when zero
|
||||
return vbslq_f32(isZero, fallback, norm);
|
||||
}
|
||||
|
||||
inline f32x4 v_hadd4(f32x4 a)
|
||||
{
|
||||
float32x2_t sum2 = vpadd_f32(vget_low_f32(a), vget_high_f32(a));
|
||||
float sum = vget_lane_f32(sum2,0) + vget_lane_f32(sum2,1);
|
||||
return vdupq_n_f32(sum);
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix type (row-major 4x4)
|
||||
//------------------------------------------------------
|
||||
|
||||
struct f32x4x4
|
||||
{
|
||||
f32x4 r0;
|
||||
f32x4 r1;
|
||||
f32x4 r2;
|
||||
f32x4 r3;
|
||||
};
|
||||
|
||||
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
|
||||
{
|
||||
f32x4x4 out;
|
||||
out.r0 = v_load(m + 0);
|
||||
out.r1 = v_load(m + 4);
|
||||
out.r2 = v_load(m + 8);
|
||||
out.r3 = v_load(m + 12);
|
||||
return out;
|
||||
}
|
||||
|
||||
inline void m_store(float* dst, const f32x4x4& m)
|
||||
{
|
||||
v_store(dst + 0, m.r0);
|
||||
v_store(dst + 4, m.r1);
|
||||
v_store(dst + 8, m.r2);
|
||||
v_store(dst + 12, m.r3);
|
||||
}
|
||||
|
||||
inline f32x4x4 m_identity()
|
||||
{
|
||||
f32x4x4 m;
|
||||
m.r0 = {1,0,0,0};
|
||||
m.r1 = {0,1,0,0};
|
||||
m.r2 = {0,0,1,0};
|
||||
m.r3 = {0,0,0,1};
|
||||
return m;
|
||||
}
|
||||
|
||||
inline f32x4x4 m_zero()
|
||||
{
|
||||
f32x4 z = v_zero();
|
||||
return { z, z, z, z };
|
||||
}
|
||||
|
||||
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
|
||||
{
|
||||
f32x4 x = v_dot4(m.r0, v);
|
||||
f32x4 y = v_dot4(m.r1, v);
|
||||
f32x4 z = v_dot4(m.r2, v);
|
||||
f32x4 w = v_dot4(m.r3, v);
|
||||
|
||||
return {
|
||||
v_extract0(x),
|
||||
v_extract0(y),
|
||||
v_extract0(z),
|
||||
v_extract0(w)
|
||||
};
|
||||
}
|
||||
|
||||
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
|
||||
{
|
||||
f32x4 x = v_dot3(m.r0, v);
|
||||
f32x4 y = v_dot3(m.r1, v);
|
||||
f32x4 z = v_dot3(m.r2, v);
|
||||
|
||||
return {
|
||||
v_extract0(x),
|
||||
v_extract0(y),
|
||||
v_extract0(z),
|
||||
0.0f
|
||||
};
|
||||
}
|
||||
|
||||
// 4x4 transpose built from two vtrnq 2x2 interleaves plus low/high half
// recombination — the classic ARMv7-compatible NEON pattern (AArch64 could
// use vtrn1q/vtrn2q directly).
inline f32x4x4 m_transpose(const f32x4x4& m)
{
   float32x4x2_t t0 = vtrnq_f32(m.r0, m.r1); // interleave rows 0/1
   float32x4x2_t t1 = vtrnq_f32(m.r2, m.r3); // interleave rows 2/3

   float32x2_t a0 = vget_low_f32(t0.val[0]);
   float32x2_t a1 = vget_high_f32(t0.val[0]);
   float32x2_t a2 = vget_low_f32(t1.val[0]);
   float32x2_t a3 = vget_high_f32(t1.val[0]);

   float32x2_t b0 = vget_low_f32(t0.val[1]);
   float32x2_t b1 = vget_high_f32(t0.val[1]);
   float32x2_t b2 = vget_low_f32(t1.val[1]);
   float32x2_t b3 = vget_high_f32(t1.val[1]);

   f32x4x4 out;

   out.r0 = vcombine_f32(a0, a2); // former column 0
   out.r1 = vcombine_f32(b0, b2); // former column 1
   out.r2 = vcombine_f32(a1, a3); // former column 2
   out.r3 = vcombine_f32(b1, b3); // former column 3

   return out;
}

// C = a * b for row-major matrices. b is transposed once so every output
// element becomes a single dot product; rows are rebuilt from lane 0 of the
// splatted dot results.
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
   f32x4x4 bt = m_transpose(b);

   // Compute one output row: a's row dotted against every column of b.
   auto mul_row = [&](f32x4 row)
   {
      f32x4 x = v_dot4(row, bt.r0);
      f32x4 y = v_dot4(row, bt.r1);
      f32x4 z = v_dot4(row, bt.r2);
      f32x4 w = v_dot4(row, bt.r3);

      return f32x4{
         v_extract0(x),
         v_extract0(y),
         v_extract0(z),
         v_extract0(w)
      };
   };

   f32x4x4 C;
   C.r0 = mul_row(a.r0);
   C.r1 = mul_row(a.r1);
   C.r2 = mul_row(a.r2);
   C.r3 = mul_row(a.r3);

   return C;
}
|
||||
|
||||
// Determinant, splatted across all lanes of the result.
// NOTE(review): this evaluates a·(c×d) + a·(d×b) + a·(b×c) over the xyz
// lanes only (v_cross zeroes w), which equals the true 4x4 determinant only
// when the matrix is affine with a (0,0,0,1) last row/column — confirm
// callers never pass a general projective matrix.
inline f32x4 m_determinant(const f32x4x4& m)
{
   f32x4 a = m.r0;
   f32x4 b = m.r1;
   f32x4 c = m.r2;
   f32x4 d = m.r3;

   f32x4 c0 = v_cross(c, d);
   f32x4 c1 = v_cross(d, b);
   f32x4 c2 = v_cross(b, c);

   f32x4 term0 = vmulq_f32(a, c0);
   f32x4 term1 = vmulq_f32(a, c1);
   f32x4 term2 = vmulq_f32(a, c2);

   f32x4 det = vaddq_f32(term0, vaddq_f32(term1, term2));

   // Horizontal sum splats the scalar result into every lane.
   return v_hadd4(det);
}

// Determinant of the upper-left 3x3: scalar triple product r0·(r1×r2),
// splatted across all lanes. Appropriate for affine transforms.
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
   f32x4 r0 = v_and(m.r0, v_mask_xyz());
   f32x4 r1 = v_and(m.r1, v_mask_xyz());
   f32x4 r2 = v_and(m.r2, v_mask_xyz());

   f32x4 c0 = v_cross(r1, r2);
   return v_dot3(r0, c0);
}

// Inverse via rows built from pairwise cross products scaled by
// 1/dot(a, c×d), then transposed.
// NOTE(review): this is NOT the cofactor expansion of an arbitrary 4x4;
// it appears valid only for special forms (e.g. pure rotations) — verify
// against the scalar MatrixF inverse before relying on it generally.
inline f32x4x4 m_inverse(const f32x4x4& m)
{
   f32x4 a = m.r0;
   f32x4 b = m.r1;
   f32x4 c = m.r2;
   f32x4 d = m.r3;

   f32x4 c0 = v_cross(b, c);
   f32x4 c1 = v_cross(c, d);
   f32x4 c2 = v_cross(d, a);
   f32x4 c3 = v_cross(a, b);

   f32x4 det = v_dot4(a, c1);
   f32x4 invDet = v_rcp_nr(det); // approximate 1/det (two NR refinements)

   f32x4x4 adj;
   adj.r0 = vmulq_f32(c1, invDet);
   adj.r1 = vmulq_f32(c2, invDet);
   adj.r2 = vmulq_f32(c3, invDet);
   adj.r3 = vmulq_f32(c0, invDet);

   return m_transpose(adj);
}

// Struct-of-arrays batch of four 4-component vectors: lane i of x/y/z/w
// holds the i-th vector's component.
struct vec4_batch4
{
   f32x4 x;
   f32x4 y;
   f32x4 z;
   f32x4 w;
};
|
||||
|
||||
// De-interleave 4 packed float3s (12 contiguous floats at ptr) into SoA
// lanes. When fillW is true every w lane is set to 'w'; otherwise w is
// zeroed so callers can never observe an uninitialized lane.
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
   vec4_batch4 r;

   r.x = (f32x4){ ptr[0], ptr[3], ptr[6], ptr[9] };
   r.y = (f32x4){ ptr[1], ptr[4], ptr[7], ptr[10] };
   r.z = (f32x4){ ptr[2], ptr[5], ptr[8], ptr[11] };

   // Previously r.w was left uninitialized when fillW was false, which is
   // undefined behavior if any consumer reads it. Zero it instead.
   r.w = fillW ? vdupq_n_f32(w) : vdupq_n_f32(0.0f);

   return r;
}
|
||||
|
||||
// Scatter a SoA batch back out to 4 packed float3s (12 contiguous floats).
// The w lanes are intentionally dropped.
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
{
   alignas(16) float xs[4];
   alignas(16) float ys[4];
   alignas(16) float zs[4];

   vst1q_f32(xs, v.x);
   vst1q_f32(ys, v.y);
   vst1q_f32(zs, v.z);

   // Re-interleave: out = {x0,y0,z0, x1,y1,z1, ...}
   for (int i = 0; i < 4; ++i)
   {
      out[i * 3 + 0] = xs[i];
      out[i * 3 + 1] = ys[i];
      out[i * 3 + 2] = zs[i];
   }
}
|
||||
|
||||
// Transform 4 positions (SoA) by the first 3 rows of a row-major 4x4 given
// as 16 floats: result = M * [v, 1]. m[3]/m[7]/m[11] act as the translation
// column; row 3 is never read.
// NOTE(review): r.w is left unset — consumers such as store_vec3_batch4
// ignore it, but confirm no caller reads it.
// vfmaq_f32 would fuse these mul/add chains but changes rounding, so the
// plain form is kept.
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
   vec4_batch4 r;

   // Broadcast each matrix element across a whole vector (rows 0-2 only).
   float32x4_t m00 = vdupq_n_f32(m[0]);
   float32x4_t m01 = vdupq_n_f32(m[1]);
   float32x4_t m02 = vdupq_n_f32(m[2]);
   float32x4_t m03 = vdupq_n_f32(m[3]);

   float32x4_t m10 = vdupq_n_f32(m[4]);
   float32x4_t m11 = vdupq_n_f32(m[5]);
   float32x4_t m12 = vdupq_n_f32(m[6]);
   float32x4_t m13 = vdupq_n_f32(m[7]);

   float32x4_t m20 = vdupq_n_f32(m[8]);
   float32x4_t m21 = vdupq_n_f32(m[9]);
   float32x4_t m22 = vdupq_n_f32(m[10]);
   float32x4_t m23 = vdupq_n_f32(m[11]);

   // row0 dot: x' = m00*x + m01*y + m02*z + m03
   r.x = vaddq_f32(
      vaddq_f32(
         vmulq_f32(v.x, m00),
         vmulq_f32(v.y, m01)),
      vaddq_f32(
         vmulq_f32(v.z, m02),
         m03));

   // row1 dot
   r.y = vaddq_f32(
      vaddq_f32(
         vmulq_f32(v.x, m10),
         vmulq_f32(v.y, m11)),
      vaddq_f32(
         vmulq_f32(v.z, m12),
         m13));

   // row2 dot
   r.z = vaddq_f32(
      vaddq_f32(
         vmulq_f32(v.x, m20),
         vmulq_f32(v.y, m21)),
      vaddq_f32(
         vmulq_f32(v.z, m22),
         m23));

   return r;
}
|
||||
|
||||
}
|
||||
26
Engine/source/math/isa/sse2/float3.cpp
Normal file
26
Engine/source/math/isa/sse2/float3.cpp
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
#include "sse2_intrinsics.h"
|
||||
#include "float3_dispatch.h"
|
||||
#include <emmintrin.h> // SSE2 intrinsics
|
||||
|
||||
#include "float3_impl.inl"
|
||||
|
||||
namespace math_backend::float3::dispatch
{
   // Install SSE2 backend: rebinds every entry of the global float3
   // dispatch table to the SSE2 implementations pulled in from
   // float3_impl.inl above.
   // NOTE(review): assumed to run once during ISA selection before any
   // other thread calls through gFloat3 — confirm with the caller.
   void install_sse2()
   {
      gFloat3.add = float3_add_impl;
      gFloat3.sub = float3_sub_impl;
      gFloat3.mul = float3_mul_impl;
      gFloat3.mul_scalar = float3_mul_scalar_impl;
      gFloat3.div = float3_div_impl;
      gFloat3.div_scalar = float3_div_scalar_impl;
      gFloat3.dot = float3_dot_impl;
      gFloat3.length = float3_length_impl;
      gFloat3.lengthSquared = float3_length_squared_impl;
      gFloat3.normalize = float3_normalize_impl;
      gFloat3.normalize_mag = float3_normalize_mag_impl;
      gFloat3.lerp = float3_lerp_impl;
      gFloat3.cross = float3_cross_impl;
   }
}
|
||||
26
Engine/source/math/isa/sse2/float4.cpp
Normal file
26
Engine/source/math/isa/sse2/float4.cpp
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
#include "sse2_intrinsics.h"
|
||||
#include "float4_dispatch.h"
|
||||
|
||||
#include "float4_impl.inl"
|
||||
|
||||
|
||||
namespace math_backend::float4::dispatch
{
   // Install SSE2 backend: rebinds every entry of the global float4
   // dispatch table to the SSE2 implementations from float4_impl.inl.
   // NOTE(review): assumed single-threaded, run once at startup — confirm.
   void install_sse2()
   {
      gFloat4.add = float4_add_impl;
      gFloat4.sub = float4_sub_impl;
      gFloat4.mul = float4_mul_impl;
      gFloat4.mul_scalar = float4_mul_scalar_impl;
      gFloat4.div = float4_div_impl;
      gFloat4.div_scalar = float4_div_scalar_impl;
      gFloat4.dot = float4_dot_impl;
      gFloat4.length = float4_length_impl;
      gFloat4.lengthSquared = float4_length_squared_impl;
      gFloat4.normalize = float4_normalize_impl;
      gFloat4.normalize_mag = float4_normalize_mag_impl;
      gFloat4.lerp = float4_lerp_impl;
      gFloat4.cross = float4_cross_impl;
   }
}
|
||||
25
Engine/source/math/isa/sse2/mat44.cpp
Normal file
25
Engine/source/math/isa/sse2/mat44.cpp
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
#include "sse2_intrinsics.h"
|
||||
#include "mat44_dispatch.h"
|
||||
|
||||
#include "mat44_impl.inl"
|
||||
|
||||
namespace math_backend::mat44::dispatch
{
   // Install SSE2 backend: rebinds the global 4x4-matrix dispatch table to
   // the SSE2 implementations from mat44_impl.inl.
   // NOTE(review): assumed single-threaded, run once at startup — confirm.
   void install_sse2()
   {
      gMat44.transpose = mat44_transpose_impl;
      gMat44.inverse = mat44_inverse_impl;
      gMat44.inverse_to = mat44_inverse_to_impl;
      gMat44.affine_inverse = mat44_affine_inverse_impl;
      gMat44.mul_mat44 = mat44_mul_mat44_impl;
      gMat44.mul_pos3 = mat44_mul_pos3_impl;
      gMat44.mul_vec3 = mat44_mul_vec3_impl;
      gMat44.mul_float4 = mat44_mul_float4_impl;
      gMat44.transform_plane = mat44_transform_plane_impl;
      gMat44.scale = mat44_scale_impl;
      gMat44.get_scale = mat44_get_scale_impl;
      gMat44.normalize = mat44_normalize_impl;
      gMat44.determinant = mat44_get_determinant;
      gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
   }
}
|
||||
655
Engine/source/math/isa/sse2/sse2_intrinsics.h
Normal file
655
Engine/source/math/isa/sse2/sse2_intrinsics.h
Normal file
|
|
@ -0,0 +1,655 @@
|
|||
#pragma once
|
||||
#include <emmintrin.h> // SSE2
|
||||
#include <xmmintrin.h> // SSE
|
||||
|
||||
namespace
|
||||
{
|
||||
typedef __m128 f32x4;
|
||||
|
||||
//------------------------------------------------------
|
||||
// Vector/Point Load / Store
|
||||
//------------------------------------------------------
|
||||
|
||||
// Load 4 floats from memory into a SIMD register (unaligned-safe).
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }

// Store all 4 lanes back to (possibly unaligned) memory.
inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }

// Broadcast one scalar into all 4 lanes.
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }

// All lanes zero.
inline f32x4 v_zero() { return _mm_setzero_ps(); }

// Read lane 0 as a scalar.
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }

//------------------------------------------------------
// Float3 helpers (safe loading into 4 lanes)
//------------------------------------------------------

// Load a 3-float direction; w is forced to 0 so it drops out of dot/cross.
inline f32x4 v_load3_vec(const float* p) // w = 0
{
   return _mm_set_ps(0.0f, p[2], p[1], p[0]);
}

// Load a 3-float position; w is forced to 1 so translation applies.
inline f32x4 v_load3_pos(const float* p) // w = 1
{
   return _mm_set_ps(1.0f, p[2], p[1], p[0]);
}

// Build a vector from 4 scalars (_mm_set_ps takes its arguments w-first).
inline f32x4 v_set(float x, float y, float z, float w)
{
   return _mm_set_ps(w, z, y, x);
}

// Result = v with its w lane replaced by w's *w* lane (xyz kept from v).
// NOTE(review): the SSE4.1 backend's v_insert_w copies w's lane 0 instead
// of lane 3 — the two backends disagree; confirm which one callers expect.
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
   f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0));
   return _mm_or_ps( _mm_and_ps(mask, w), _mm_andnot_ps(mask, v));
}

// Store only lanes x/y/z; never writes dst[3].
inline void v_store3(float* dst, f32x4 v)
{
   alignas(16) float tmp[4]; // temp storage
   _mm_store_ps(tmp, v); // store all 4 lanes
   dst[0] = tmp[0];
   dst[1] = tmp[1];
   dst[2] = tmp[2];
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Mask helpers
|
||||
//------------------------------------------------------
|
||||
|
||||
// All-ones bits in the x/y/z lanes, zero bits in w — AND with this to clear
// the w lane while preserving xyz exactly.
inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); }

// Arbitrary runtime-index swizzle (indices must be 0..3). Goes through
// memory, so this is the slow path — use _mm_shuffle_ps for constant masks.
inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w)
{
   alignas(16) float tmp[4];
   _mm_store_ps(tmp, v);

   return _mm_set_ps(
      tmp[w],
      tmp[z],
      tmp[y],
      tmp[x]
   );
}

// Broadcast lane x (runtime index, 0..3) into all lanes via memory.
inline f32x4 v_swizzle_singular_mask(f32x4 v, int x)
{
   alignas(16) float tmp[4];
   _mm_store_ps(tmp, v);
   return _mm_set1_ps(tmp[x]);
}

// Duplicate the even lanes: {x, x, z, z} (SSE2 stand-in for _mm_moveldup_ps).
inline f32x4 v_swizzle_lo(f32x4 v1)
{
   return _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2,2,0,0));
}

// Duplicate the odd lanes: {y, y, w, w} (SSE2 stand-in for _mm_movehdup_ps).
inline f32x4 v_swizzle_hi(f32x4 v1)
{
   return _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 3, 1, 1));
}

// Result = newv's xyz with original's w lane.
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
   f32x4 mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)); // lane3 = 1
   return _mm_or_ps(_mm_and_ps(mask, original), _mm_andnot_ps(mask, newv));
}

//------------------------------------------------------
// Shuffle helpers
//------------------------------------------------------

// Two-source runtime-index shuffle: xy picked from v1, zw from v2. Slow
// memory path; indices must be 0..3.
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
{
   alignas(16) float a[4], b[4];
   _mm_store_ps(a, v1);
   _mm_store_ps(b, v2);

   return _mm_set_ps(
      b[w],
      b[z],
      a[y],
      a[x]
   );
}

// {v1.x, v1.y, v2.x, v2.y}
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2)
{
   return _mm_movelh_ps(v1, v2);
}

// {v1.z, v1.w, v2.z, v2.w}
inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2)
{
   return _mm_movehl_ps(v2, v1);
}
|
||||
|
||||
|
||||
//------------------------------------------------------
|
||||
// Simple Arithmatic
|
||||
//------------------------------------------------------
|
||||
|
||||
// Element-wise multiply
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }

// Element-wise divide (exact IEEE divide; see v_div for the fast path)
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }

// Element-wise add
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }

// Element-wise subtract
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }

// Absolute value: clears the sign bit of every lane.
inline f32x4 v_abs(f32x4 v)
{
   const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
   return _mm_and_ps(v, mask);
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Fast recip
|
||||
//------------------------------------------------------
|
||||
|
||||
// Fast recip 1/b
|
||||
inline f32x4 v_rcp_nr(f32x4 b)
|
||||
{
|
||||
f32x4 r = _mm_rcp_ps(b);
|
||||
f32x4 two = _mm_set1_ps(2.0f);
|
||||
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
|
||||
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
|
||||
}
|
||||
|
||||
// Fast divide a/b computed as a * (approximate 1/b); cheaper than
// _mm_div_ps but not exactly IEEE-rounded — use v_div_exact when exact
// results are required.
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }

// Fast reciprocal square root: hardware estimate plus one Newton-Raphson
// iteration r' = 0.5*r*(3 - x*r*r). x == 0 yields inf (not guarded here).
inline f32x4 v_rsqrt_nr(f32x4 x)
{
   f32x4 half = _mm_set1_ps(0.5f);
   f32x4 three = _mm_set1_ps(3.0f);

   f32x4 r = _mm_rsqrt_ps(x);
   f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
   return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Vector intrinsic functions
|
||||
//------------------------------------------------------
|
||||
|
||||
// full dot4
|
||||
inline f32x4 v_dot4(f32x4 a, f32x4 b)
|
||||
{
|
||||
f32x4 prod = _mm_mul_ps(a, b); // multiply element-wise
|
||||
f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(0, 0, 3, 2));
|
||||
prod = _mm_add_ps(prod, shuf);
|
||||
shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(0, 0, 0, 1));
|
||||
prod = _mm_add_ps(prod, shuf);
|
||||
return prod; // f32x4, all lanes = dot(a,b)
|
||||
}
|
||||
|
||||
// dot3 (ignores w)
// 3-lane dot product; the w lanes are masked to zero first, then two
// shuffle+add passes splat the sum into ALL four lanes.
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
   f32x4 prod = _mm_mul_ps(a, b);
   prod = _mm_and_ps(prod, v_mask_xyz()); // zero w
   f32x4 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1));
   prod = _mm_add_ps(prod, shuf);
   shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2));
   prod = _mm_add_ps(prod, shuf);
   return prod; // f32x4, all lanes = dot(a,b)
}

// cross product xyz only.
// Classic two-shuffle form: shuffle(a*shuf(b) - shuf(a)*b). The result's
// w lane is always 0 (the a.w*b.w terms cancel).
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
   f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
   f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));

   f32x4 c = _mm_sub_ps( _mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));

   return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
}

// Normalize xyz using the fast rsqrt path. A zero-length input selects the
// {0,0,1} fallback axis instead of producing NaNs.
// NOTE(review): only an exact zero is caught; tiny/denormal lengths still
// go through rsqrt — confirm that is acceptable for callers.
inline f32x4 v_normalize3(f32x4 v)
{
   const f32x4 zero = _mm_setzero_ps();
   const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
   f32x4 dot = v_dot3(v, v);

   f32x4 inv = v_rsqrt_nr(dot);
   f32x4 isZero = _mm_cmpeq_ps(dot, zero);

   f32x4 norm = _mm_mul_ps(v, inv);

   // vbsl equivalent: select fallback where isZero, norm elsewhere
   f32x4 result = _mm_or_ps(
      _mm_and_ps(isZero, fallback),
      _mm_andnot_ps(isZero, norm)
   );

   return result;
}

// adds all 4 lanes together; the sum is splatted into every lane.
inline f32x4 v_hadd4(f32x4 a)
{
   // sum all 4 lanes in SSE2
   __m128 shuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs
   __m128 sums = _mm_add_ps(a, shuf);
   shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2));
   return _mm_add_ps(sums, shuf);
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix type (row-major 4x4)
|
||||
//------------------------------------------------------
|
||||
|
||||
// Row-major 4x4 matrix: r0..r3 are the rows.
struct f32x4x4
{
   f32x4 r0;
   f32x4 r1;
   f32x4 r2;
   f32x4 r3;
};

//------------------------------------------------------
// Matrix Load / Store
//------------------------------------------------------

// Load 16 contiguous floats (row-major) into a matrix (unaligned-safe).
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
{
   f32x4x4 out;
   out.r0 = v_load(m + 0);
   out.r1 = v_load(m + 4);
   out.r2 = v_load(m + 8);
   out.r3 = v_load(m + 12);
   return out;
}

// Store all 16 elements back to memory (row-major, unaligned-safe).
inline void m_store(float* dst, const f32x4x4& m)
{
   v_store(dst + 0, m.r0);
   v_store(dst + 4, m.r1);
   v_store(dst + 8, m.r2);
   v_store(dst + 12, m.r3);
}

// Row-major 4x4 identity (_mm_set_ps takes arguments w-first).
inline f32x4x4 m_identity()
{
   f32x4x4 m;
   m.r0 = _mm_set_ps(0, 0, 0, 1);
   m.r1 = _mm_set_ps(0, 0, 1, 0);
   m.r2 = _mm_set_ps(0, 1, 0, 0);
   m.r3 = _mm_set_ps(1, 0, 0, 0);
   return m;
}

// 4x4 matrix with every element set to zero.
inline f32x4x4 m_zero()
{
   f32x4 z = v_zero();
   return { z, z, z, z };
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix × Vector
|
||||
//------------------------------------------------------
|
||||
|
||||
// Transform a 4-component vector by row-major m: result_i = dot4(m.r_i, v).
// Lane 0 of each splatted dot is extracted and the result reassembled.
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
   f32x4 x = v_dot4(m.r0, v);
   f32x4 y = v_dot4(m.r1, v);
   f32x4 z = v_dot4(m.r2, v);
   f32x4 w = v_dot4(m.r3, v);

   // combine to a vector
   return _mm_set_ps(_mm_cvtss_f32(w),
      _mm_cvtss_f32(z),
      _mm_cvtss_f32(y),
      _mm_cvtss_f32(x));
}

// Transform only xyz; the result's w lane is forced to 0 and the matrix
// translation column is ignored (v_dot3 masks w).
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
   f32x4 x = v_dot3(m.r0, v);
   f32x4 y = v_dot3(m.r1, v);
   f32x4 z = v_dot3(m.r2, v);

   // combine to a vector
   return _mm_set_ps(0.0f,
      _mm_cvtss_f32(z),
      _mm_cvtss_f32(y),
      _mm_cvtss_f32(x));
}

//------------------------------------------------------
// Matrix × Matrix
//------------------------------------------------------

// C = a * b for row-major matrices. b is transposed once so every output
// element is a single dot product; rows are repacked with unpack/movelh.
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
   // Transpose B once for dot products
   f32x4 b0 = b.r0;
   f32x4 b1 = b.r1;
   f32x4 b2 = b.r2;
   f32x4 b3 = b.r3;

   // Transpose (SSE2)
   _MM_TRANSPOSE4_PS(b0, b1, b2, b3);

   f32x4x4 C;

   // One output row: a's row dotted against every column of b, then the
   // four scalar results packed back into lanes {x,y,z,w}.
   auto mul_row = [&](f32x4 arow)
   {
      f32x4 x = v_dot4(arow, b0);
      f32x4 y = v_dot4(arow, b1);
      f32x4 z = v_dot4(arow, b2);
      f32x4 w = v_dot4(arow, b3);

      f32x4 xy = _mm_unpacklo_ps(x, y);
      f32x4 zw = _mm_unpacklo_ps(z, w);
      return _mm_movelh_ps(xy, zw);
   };

   C.r0 = mul_row(a.r0);
   C.r1 = mul_row(a.r1);
   C.r2 = mul_row(a.r2);
   C.r3 = mul_row(a.r3);

   return C;
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Transpose
|
||||
//------------------------------------------------------
|
||||
|
||||
// Full 4x4 transpose via the standard SSE shuffle macro.
inline f32x4x4 m_transpose(const f32x4x4& m)
{
   f32x4 r0 = m.r0;
   f32x4 r1 = m.r1;
   f32x4 r2 = m.r2;
   f32x4 r3 = m.r3;

   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);

   return { r0, r1, r2, r3 };
}

// Write columns c0/c1/c2 as the transposed 3x3 into a row-major 4x4 float
// array. Only indices 0-2, 4-6 and 8-10 are written; the translation column
// (3, 7, 11) and the last row (12-15) are deliberately left untouched.
inline void m_set3x3_transpose(float* dst, f32x4 c0, f32x4 c1, f32x4 c2)
{
   dst[0] = v_extract0(c0); // c0.x
   dst[1] = v_extract0(c1); // c1.x
   dst[2] = v_extract0(c2); // c2.x

   dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y
   dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y
   dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y

   dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z
   dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z
   dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z
}
|
||||
|
||||
// Inverse via rows built from pairwise cross products scaled by
// 1/dot(a, c×d), then transposed.
// NOTE(review): this is NOT the cofactor expansion of an arbitrary 4x4;
// it appears valid only for special matrix forms (e.g. pure rotations) —
// verify against the scalar MatrixF inverse before relying on it generally.
inline f32x4x4 m_inverse(const f32x4x4& m)
{
   f32x4 a = m.r0;
   f32x4 b = m.r1;
   f32x4 c = m.r2;
   f32x4 d = m.r3;

   f32x4 c0 = v_cross(b, c);
   f32x4 c1 = v_cross(c, d);
   f32x4 c2 = v_cross(d, a);
   f32x4 c3 = v_cross(a, b);

   f32x4 det = v_dot4(a, c1);
   f32x4 invDet = v_rcp_nr(det); // approximate 1/det (two NR refinements)

   f32x4x4 adj;
   adj.r0 = _mm_mul_ps(c1, invDet);
   adj.r1 = _mm_mul_ps(c2, invDet);
   adj.r2 = _mm_mul_ps(c3, invDet);
   adj.r3 = _mm_mul_ps(c0, invDet);

   return m_transpose(adj);
}
|
||||
|
||||
// Inverse of a rigid transform (orthonormal rotation R + translation T,
// bottom row 0,0,0,1): inverse rotation is R^T and the new translation is
// -R^T * T.
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
{
   f32x4x4 t = m_transpose(m);

   // Extract the translation (column 3 of the row-major matrix).
   f32x4 T = v_set(
      _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
      _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
      _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
      0.0f
   );

   f32x4 newT = m_mul_vec4(t, T);      // R^T * T (rows of t are columns of m)
   newT = _mm_sub_ps(v_zero(), newT);  // negate

   // Write -R^T*T into the w lane of each row. The previous code passed
   // newT straight to v_preserve_w, which copies newT's *w* lane (the dot
   // of the bottom row with T — garbage here) into every row; each row
   // must instead receive its own component of newT, splatted so that
   // v_preserve_w's lane-3 selection picks it up.
   t.r0 = v_preserve_w(t.r0, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(0, 0, 0, 0)));
   t.r1 = v_preserve_w(t.r1, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(1, 1, 1, 1)));
   t.r2 = v_preserve_w(t.r2, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(2, 2, 2, 2)));
   t.r3 = v_set(0, 0, 0, 1);

   return t;
}
|
||||
|
||||
// Inverse of an affine transform (arbitrary invertible 3x3 + translation,
// bottom row 0,0,0,1): invert the 3x3 via the cofactor/cross-product
// identity, then apply -inv3x3 * T as the new translation.
inline f32x4x4 m_inverse_affine(const f32x4x4& m)
{
   // Extract upper 3x3 rows
   f32x4 r0 = m.r0;
   f32x4 r1 = m.r1;
   f32x4 r2 = m.r2;

   // Zero translation
   r0 = _mm_and_ps(r0, v_mask_xyz());
   r1 = _mm_and_ps(r1, v_mask_xyz());
   r2 = _mm_and_ps(r2, v_mask_xyz());

   // Compute cofactors via cross products
   f32x4 c0 = v_cross(r1, r2);
   f32x4 c1 = v_cross(r2, r0);
   f32x4 c2 = v_cross(r0, r1);

   // Determinant (scalar triple product, splatted by v_dot3)
   f32x4 det = v_dot3(r0, c0);

   // Approximate reciprocal determinant
   f32x4 invDet = v_rcp_nr(det);

   // Inverse 3x3 (transpose of cofactor matrix)
   f32x4 i0 = _mm_mul_ps(c0, invDet);
   f32x4 i1 = _mm_mul_ps(c1, invDet);
   f32x4 i2 = _mm_mul_ps(c2, invDet);

   // Transpose 3x3
   f32x4 t0 = i0;
   f32x4 t1 = i1;
   f32x4 t2 = i2;
   f32x4 t3 = _mm_setzero_ps();

   _MM_TRANSPOSE4_PS(t0, t1, t2, t3);

   // Extract translation (column 3 of the row-major matrix)
   alignas(16) float tmp[4];
   tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3)));
   tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3)));
   tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3)));
   tmp[3] = 0.0f;

   f32x4 T = v_load(tmp);

   // New translation = -R' * T
   f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T);
   newT = _mm_sub_ps(v_zero(), newT);

   // Assemble the final matrix: each row gets its OWN component of newT in
   // the w lane. The previous code passed newT directly to v_preserve_w,
   // which copies newT's w lane — always 0 here (t3's xyz are the zeroed
   // cofactor w lanes) — so the inverse translation was silently dropped.
   f32x4x4 out;
   out.r0 = v_preserve_w(t0, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(0, 0, 0, 0)));
   out.r1 = v_preserve_w(t1, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(1, 1, 1, 1)));
   out.r2 = v_preserve_w(t2, _mm_shuffle_ps(newT, newT, _MM_SHUFFLE(2, 2, 2, 2)));
   out.r3 = _mm_set_ps(1, 0, 0, 0);

   return out;
}
|
||||
|
||||
// Determinant, splatted across all lanes of the result.
// NOTE(review): this evaluates a·(c×d) + a·(d×b) + a·(b×c) over xyz lanes
// only (v_cross zeroes w), which equals the true 4x4 determinant only when
// the matrix is affine with a (0,0,0,1) last row/column — confirm callers
// never pass a general projective matrix.
inline f32x4 m_determinant(const f32x4x4& m)
{
   f32x4 a = m.r0;
   f32x4 b = m.r1;
   f32x4 c = m.r2;
   f32x4 d = m.r3;

   f32x4 c0 = v_cross(c, d);
   f32x4 c1 = v_cross(d, b);
   f32x4 c2 = v_cross(b, c);

   f32x4 term0 = _mm_mul_ps(a, c0);
   f32x4 term1 = _mm_mul_ps(a, c1);
   f32x4 term2 = _mm_mul_ps(a, c2);

   f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));

   // Horizontal sum splats the scalar result into every lane.
   return v_hadd4(det);
}

// Determinant of the upper-left 3x3: scalar triple product r0·(r1×r2).
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
   f32x4 r0 = m.r0;
   f32x4 r1 = m.r1;
   f32x4 r2 = m.r2;

   r0 = _mm_and_ps(r0, v_mask_xyz());
   r1 = _mm_and_ps(r1, v_mask_xyz());
   r2 = _mm_and_ps(r2, v_mask_xyz());

   f32x4 c0 = v_cross(r1, r2);
   return v_dot3(r0, c0); // splatted determinant
}
|
||||
|
||||
//---------------------------------------------------------
|
||||
// BATCH INTRINSICS
|
||||
//---------------------------------------------------------
|
||||
|
||||
// Struct-of-arrays batch of four 4-component vectors: lane i of x/y/z/w
// holds the i-th vector's component.
struct vec4_batch4
{
   f32x4 x;
   f32x4 y;
   f32x4 z;
   f32x4 w;
};
|
||||
|
||||
|
||||
// De-interleave 4 packed float3s (12 contiguous floats at ptr) into SoA
// lanes. When fillW is true every w lane is set to 'w'; otherwise w is
// zeroed so callers can never observe an uninitialized lane.
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
   vec4_batch4 r;

   r.x = _mm_set_ps(
      ptr[9], ptr[6], ptr[3], ptr[0]);

   r.y = _mm_set_ps(
      ptr[10], ptr[7], ptr[4], ptr[1]);

   r.z = _mm_set_ps(
      ptr[11], ptr[8], ptr[5], ptr[2]);

   // Previously r.w was left uninitialized when fillW was false, which is
   // undefined behavior if any consumer reads it. Zero it instead.
   r.w = fillW ? _mm_set1_ps(w) : _mm_setzero_ps();

   return r;
}
|
||||
|
||||
// Store the result back out to a float array 4 at a time
|
||||
// Store all 4 lanes to a float array (unaligned-safe).
inline void store_f32x4(float* out, f32x4 v)
{
   _mm_storeu_ps(out, v);
}
|
||||
|
||||
// Store the result back to a float3 array with size of 4
|
||||
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
|
||||
{
|
||||
alignas(16) float xs[8];
|
||||
alignas(16) float ys[8];
|
||||
alignas(16) float zs[8];
|
||||
|
||||
_mm_store_ps(xs, v.x);
|
||||
_mm_store_ps(ys, v.y);
|
||||
_mm_store_ps(zs, v.z);
|
||||
|
||||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
out[i * 3 + 0] = xs[i];
|
||||
out[i * 3 + 1] = ys[i];
|
||||
out[i * 3 + 2] = zs[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Four 3-component dot products at once: lane i = dot3(a_i, b_i).
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
   f32x4 mulx = _mm_mul_ps(a.x, b.x);
   f32x4 muly = _mm_mul_ps(a.y, b.y);
   f32x4 mulz = _mm_mul_ps(a.z, b.z);

   return _mm_add_ps(_mm_add_ps(mulx, muly), mulz);
}

// Transform 4 positions (SoA) by the first 3 rows of a row-major 4x4 given
// as 16 floats: result = M * [v, 1]. m[3]/m[7]/m[11] act as the translation
// column; row 3 is never read.
// NOTE(review): r.w is left unset — consumers such as store_vec3_batch4
// ignore it, but confirm no caller reads it.
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
   vec4_batch4 r;

   // Broadcast each matrix element across a whole vector (rows 0-2 only).
   f32x4 m00 = _mm_set1_ps(m[0]);
   f32x4 m01 = _mm_set1_ps(m[1]);
   f32x4 m02 = _mm_set1_ps(m[2]);
   f32x4 m03 = _mm_set1_ps(m[3]);

   f32x4 m10 = _mm_set1_ps(m[4]);
   f32x4 m11 = _mm_set1_ps(m[5]);
   f32x4 m12 = _mm_set1_ps(m[6]);
   f32x4 m13 = _mm_set1_ps(m[7]);

   f32x4 m20 = _mm_set1_ps(m[8]);
   f32x4 m21 = _mm_set1_ps(m[9]);
   f32x4 m22 = _mm_set1_ps(m[10]);
   f32x4 m23 = _mm_set1_ps(m[11]);

   //
   // row0 dot: x' = m00*x + m01*y + m02*z + m03
   //
   r.x = _mm_add_ps(
      _mm_add_ps(
         _mm_mul_ps(v.x, m00),
         _mm_mul_ps(v.y, m01)),
      _mm_add_ps(
         _mm_mul_ps(v.z, m02), m03));

   //
   // row1 dot
   //
   r.y = _mm_add_ps(
      _mm_add_ps(
         _mm_mul_ps(v.x, m10),
         _mm_mul_ps(v.y, m11)),
      _mm_add_ps(
         _mm_mul_ps(v.z, m12), m13));

   //
   // row2 dot
   //
   r.z = _mm_add_ps(
      _mm_add_ps(
         _mm_mul_ps(v.x, m20),
         _mm_mul_ps(v.y, m21)),
      _mm_add_ps(
         _mm_mul_ps(v.z, m22), m23));

   return r;
}
|
||||
}
|
||||
26
Engine/source/math/isa/sse41/float3.cpp
Normal file
26
Engine/source/math/isa/sse41/float3.cpp
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
#include "sse41_intrinsics.h"
|
||||
#include "float3_dispatch.h"
|
||||
#include <smmintrin.h> // SSE41 intrinsics
|
||||
|
||||
#include "float3_impl.inl"
|
||||
|
||||
namespace math_backend::float3::dispatch
{
   // Install SSE41 backend: rebinds every entry of the global float3
   // dispatch table to the SSE4.1 implementations from float3_impl.inl.
   // NOTE(review): assumed single-threaded, run once at startup — confirm.
   void install_sse41()
   {
      gFloat3.add = float3_add_impl;
      gFloat3.sub = float3_sub_impl;
      gFloat3.mul = float3_mul_impl;
      gFloat3.mul_scalar = float3_mul_scalar_impl;
      gFloat3.div = float3_div_impl;
      gFloat3.div_scalar = float3_div_scalar_impl;
      gFloat3.dot = float3_dot_impl;
      gFloat3.length = float3_length_impl;
      gFloat3.lengthSquared = float3_length_squared_impl;
      gFloat3.normalize = float3_normalize_impl;
      gFloat3.normalize_mag = float3_normalize_mag_impl;
      gFloat3.lerp = float3_lerp_impl;
      gFloat3.cross = float3_cross_impl;
   }
}
|
||||
25
Engine/source/math/isa/sse41/float4.cpp
Normal file
25
Engine/source/math/isa/sse41/float4.cpp
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
#include "sse41_intrinsics.h"
|
||||
#include "float4_dispatch.h"
|
||||
|
||||
#include "float4_impl.inl"
|
||||
|
||||
namespace math_backend::float4::dispatch
{
   // Install SSE41 backend: rebinds every entry of the global float4
   // dispatch table to the SSE4.1 implementations from float4_impl.inl.
   // NOTE(review): assumed single-threaded, run once at startup — confirm.
   void install_sse41()
   {
      gFloat4.add = float4_add_impl;
      gFloat4.sub = float4_sub_impl;
      gFloat4.mul = float4_mul_impl;
      gFloat4.mul_scalar = float4_mul_scalar_impl;
      gFloat4.div = float4_div_impl;
      gFloat4.div_scalar = float4_div_scalar_impl;
      gFloat4.dot = float4_dot_impl;
      gFloat4.length = float4_length_impl;
      gFloat4.lengthSquared = float4_length_squared_impl;
      gFloat4.normalize = float4_normalize_impl;
      gFloat4.normalize_mag = float4_normalize_mag_impl;
      gFloat4.lerp = float4_lerp_impl;
      gFloat4.cross = float4_cross_impl;
   }
}
|
||||
25
Engine/source/math/isa/sse41/mat44.cpp
Normal file
25
Engine/source/math/isa/sse41/mat44.cpp
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
#include "sse41_intrinsics.h"
|
||||
#include "mat44_dispatch.h"
|
||||
|
||||
#include "mat44_impl.inl"
|
||||
|
||||
namespace math_backend::mat44::dispatch
{
   // Install SSE41 backend: rebinds the global 4x4-matrix dispatch table
   // to the SSE4.1 implementations from mat44_impl.inl.
   // NOTE(review): assumed single-threaded, run once at startup — confirm.
   void install_sse41()
   {
      gMat44.transpose = mat44_transpose_impl;
      gMat44.inverse = mat44_inverse_impl;
      gMat44.inverse_to = mat44_inverse_to_impl;
      gMat44.affine_inverse = mat44_affine_inverse_impl;
      gMat44.mul_mat44 = mat44_mul_mat44_impl;
      gMat44.mul_pos3 = mat44_mul_pos3_impl;
      gMat44.mul_vec3 = mat44_mul_vec3_impl;
      gMat44.mul_float4 = mat44_mul_float4_impl;
      gMat44.transform_plane = mat44_transform_plane_impl;
      gMat44.scale = mat44_scale_impl;
      gMat44.get_scale = mat44_get_scale_impl;
      gMat44.normalize = mat44_normalize_impl;
      gMat44.determinant = mat44_get_determinant;
      gMat44.batch_mul_pos3 = mat44_batch_mul_pos3;
   }
}
|
||||
625
Engine/source/math/isa/sse41/sse41_intrinsics.h
Normal file
625
Engine/source/math/isa/sse41/sse41_intrinsics.h
Normal file
|
|
@ -0,0 +1,625 @@
|
|||
#pragma once
|
||||
#include <smmintrin.h> // SSE4.1
|
||||
|
||||
namespace
|
||||
{
|
||||
typedef __m128 f32x4;
|
||||
|
||||
//------------------------------------------------------
|
||||
// Load / Store
|
||||
//------------------------------------------------------
|
||||
|
||||
// Load 4 floats from memory into a SIMD register (unaligned-safe)
inline f32x4 v_load(const float* p) { return _mm_loadu_ps(p); }

// Store all 4 lanes back to (possibly unaligned) memory.
inline void v_store(float* dst, f32x4 v) { _mm_storeu_ps(dst, v); }

// Broadcast one scalar into all 4 lanes.
inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }

// All lanes zero.
inline f32x4 v_zero() { return _mm_setzero_ps(); }

// Read lane 0 as a scalar.
inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }
|
||||
|
||||
//------------------------------------------------------
|
||||
// Mask helpers
|
||||
//------------------------------------------------------
|
||||
|
||||
// All-ones bit mask in the x/y/z lanes, zero in w — AND with this to clear
// the w lane while preserving xyz exactly. The previous implementation
// blended the *float values* 1.0f into xyz (bit pattern 0x3F800000), which
// corrupts any value AND-ed with it instead of preserving it; a bitwise
// mask must be all-ones, matching the SSE2 backend.
inline f32x4 v_mask_xyz() { return _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)); }
|
||||
|
||||
// Arbitrary runtime-index swizzle (indices must be 0..3). Goes through
// memory, so this is the slow path — use _mm_shuffle_ps for constant masks.
inline f32x4 v_swizzle_mask(f32x4 v, int x, int y, int z, int w)
{
   alignas(16) float tmp[4];
   _mm_store_ps(tmp, v);

   return _mm_set_ps(
      tmp[w],
      tmp[z],
      tmp[y],
      tmp[x]
   );
}

// Broadcast lane x (runtime index, 0..3) into all lanes via memory.
inline f32x4 v_swizzle_singular_mask(f32x4 v, int x)
{
   alignas(16) float tmp[4];
   _mm_store_ps(tmp, v);
   return _mm_set1_ps(tmp[x]);
}

// Duplicate the even lanes: {x, x, z, z}.
inline f32x4 v_swizzle_lo(f32x4 v1)
{
   return _mm_moveldup_ps(v1);
}

// Duplicate the odd lanes: {y, y, w, w}.
inline f32x4 v_swizzle_hi(f32x4 v1)
{
   return _mm_movehdup_ps(v1);
}

// Result = newv's xyz with original's w lane (blend mask selects lane 3).
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
   return _mm_blend_ps(newv, original, 0b1000);
}

//------------------------------------------------------
// Shuffle helpers
//------------------------------------------------------

// Two-source runtime-index shuffle: xy picked from v1, zw from v2. Slow
// memory path; indices must be 0..3.
inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w)
{
   alignas(16) float a[4], b[4];
   _mm_store_ps(a, v1);
   _mm_store_ps(b, v2);

   return _mm_set_ps(
      b[w],
      b[z],
      a[y],
      a[x]
   );
}

// {v1.x, v1.y, v2.x, v2.y}
inline f32x4 v_shuffle_hilo(f32x4 v1, f32x4 v2)
{
   return _mm_movelh_ps(v1, v2);
}

// {v1.z, v1.w, v2.z, v2.w}
inline f32x4 v_shuffle_lohi(f32x4 v1, f32x4 v2)
{
   return _mm_movehl_ps(v2, v1);
}
|
||||
|
||||
|
||||
//------------------------------------------------------
|
||||
// Float3 helpers (safe loading into 4 lanes)
|
||||
//------------------------------------------------------
|
||||
|
||||
// Load 3 contiguous floats as a direction vector; w lane = 0.
inline f32x4 v_load3_vec(const float* p) // w = 0
{
   return _mm_set_ps(0.0f, p[2], p[1], p[0]);
}

// Load 3 contiguous floats as a position; w lane = 1.
inline f32x4 v_load3_pos(const float* p) // w = 1
{
   return _mm_set_ps(1.0f, p[2], p[1], p[0]);
}

// Store only the x/y/z lanes; dst needs just 3 floats and the w lane
// is discarded (a full store to dst could stomp adjacent data).
inline void v_store3(float* dst, f32x4 v)
{
   alignas(16) float tmp[4]; // temp storage
   _mm_store_ps(tmp, v);     // store all 4 lanes
   dst[0] = tmp[0];
   dst[1] = tmp[1];
   dst[2] = tmp[2];
}

// Build a vector from 4 scalars.  Note _mm_set_ps takes arguments in
// w,z,y,x order, hence the reversal.
inline f32x4 v_set(float x, float y, float z, float w)
{
   return _mm_set_ps(w, z, y, x);
}

// Replace the w lane of v with lane 0 of w.
// imm8 0x30 = source lane 0 -> destination lane 3, no zeroing.
inline f32x4 v_insert_w(f32x4 v, f32x4 w)
{
   return _mm_insert_ps(v, w, 0x30);
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Simple Arithmatic
|
||||
//------------------------------------------------------
|
||||
|
||||
// Element-wise multiply.
inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }

// Element-wise divide using the exact (slow) hardware divide; see
// v_div below for the fast reciprocal-based variant.
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return _mm_div_ps(a, b); }

// Element-wise add.
inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }

// Element-wise subtract.
inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }
|
||||
|
||||
//------------------------------------------------------
|
||||
// Fast recip
|
||||
//------------------------------------------------------
|
||||
|
||||
// Fast recip 1/b
|
||||
inline f32x4 v_rcp_nr(f32x4 b)
|
||||
{
|
||||
f32x4 r = _mm_rcp_ps(b);
|
||||
f32x4 two = _mm_set1_ps(2.0f);
|
||||
r = _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
|
||||
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(b, _mm_mul_ps(r, r)));
|
||||
}
|
||||
|
||||
// Divide fast ( b = recip eg 1/b)
|
||||
inline f32x4 v_div(f32x4 a, f32x4 b) { return _mm_mul_ps(a, v_rcp_nr(b)); }
|
||||
|
||||
// Fast 1/sqrt(x): hardware estimate refined with one Newton-Raphson
// step, r' = 0.5 * r * (3 - x*r*r).
// NOTE(review): x == 0 yields rsqrt(0) = +inf and the refinement
// produces NaN/inf; callers (e.g. v_normalize3) must guard zero input.
inline f32x4 v_rsqrt_nr(f32x4 x)
{
   f32x4 half = _mm_set1_ps(0.5f);
   f32x4 three = _mm_set1_ps(3.0f);

   f32x4 r = _mm_rsqrt_ps(x);
   f32x4 nr = _mm_mul_ps(_mm_mul_ps(x, r), r);
   return _mm_mul_ps(_mm_mul_ps(half, r), _mm_sub_ps(three, nr));
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Vector intrinsic functions
|
||||
//------------------------------------------------------
|
||||
|
||||
// Full 4-component dot product.  The low nibble 0xF of the dp mask
// broadcasts the result into ALL four lanes (not just lane 0).
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
   return _mm_dp_ps(a, b, 0xFF); // lanes 0-3 multiplied, result splatted
}

// 3-component dot product; the w lanes are ignored (high nibble 0x7).
// Result is broadcast into all four lanes.
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
   return _mm_dp_ps(a, b, 0x7F); // lanes 0-2 multiplied, result splatted
}
|
||||
|
||||
// 3D cross product in the x/y/z lanes, using the "shuffle after the
// subtract" form: cross(a,b) = shuf(a*shuf(b) - shuf(a)*b).
// The w lane of the result is always 0 (a.w*b.w - a.w*b.w cancels).
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
   // Rotate xyz left by one lane; w stays in place.
   f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
   f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));

   f32x4 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));

   // Rotate the intermediate back into place.
   return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
}
|
||||
|
||||
// Normalize the x/y/z lanes.  An exactly-zero vector returns the +Z
// unit vector {0,0,1,0} instead of NaN/inf.
// NOTE(review): the w lane is also multiplied by the reciprocal
// length; callers that care about w must re-set it afterwards.  Only
// length == 0 exactly triggers the fallback - near-zero lengths still
// go through the approximate rsqrt.
inline f32x4 v_normalize3(f32x4 v)
{
   const f32x4 zero = _mm_setzero_ps();
   const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0}
   f32x4 dot = v_dot3(v, v);   // |v|^2 broadcast to all lanes

   f32x4 inv = v_rsqrt_nr(dot);
   f32x4 isZero = _mm_cmpeq_ps(dot, zero);
   f32x4 norm = _mm_mul_ps(v, inv);

   // Per-lane select: fallback where |v|^2 == 0.
   return _mm_blendv_ps(norm, fallback, isZero);
}
|
||||
|
||||
// Horizontal add: the sum of all 4 lanes, broadcast to every lane.
inline f32x4 v_hadd4(f32x4 a)
{
   // Two hadd passes fold 4 lanes to 1.  (_mm_hadd_ps is SSE3.)
   __m128 sum = _mm_hadd_ps(a, a);
   return _mm_hadd_ps(sum, sum);
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix type (row-major 4x4)
|
||||
//------------------------------------------------------
|
||||
|
||||
// Row-major 4x4 matrix held as four row registers.
struct f32x4x4
{
   f32x4 r0; // row 0
   f32x4 r1; // row 1
   f32x4 r2; // row 2
   f32x4 r3; // row 3
};
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix Load / Store
|
||||
//------------------------------------------------------
|
||||
|
||||
inline f32x4x4 m_load(const float* m) // expects 16 floats (row-major)
|
||||
{
|
||||
f32x4x4 out;
|
||||
out.r0 = v_load(m + 0);
|
||||
out.r1 = v_load(m + 4);
|
||||
out.r2 = v_load(m + 8);
|
||||
out.r3 = v_load(m + 12);
|
||||
return out;
|
||||
}
|
||||
|
||||
inline void m_store(float* dst, const f32x4x4& m)
|
||||
{
|
||||
_mm_storeu_ps(dst + 0, m.r0);
|
||||
_mm_storeu_ps(dst + 4, m.r1);
|
||||
_mm_storeu_ps(dst + 8, m.r2);
|
||||
_mm_storeu_ps(dst + 12, m.r3);
|
||||
}
|
||||
|
||||
// Identity matrix.  Remember _mm_set_ps takes lanes in w,z,y,x order,
// so each row below has its 1 in the expected diagonal position.
inline f32x4x4 m_identity()
{
   f32x4x4 m;
   m.r0 = _mm_set_ps(0, 0, 0, 1); // {1,0,0,0}
   m.r1 = _mm_set_ps(0, 0, 1, 0); // {0,1,0,0}
   m.r2 = _mm_set_ps(0, 1, 0, 0); // {0,0,1,0}
   m.r3 = _mm_set_ps(1, 0, 0, 0); // {0,0,0,1}
   return m;
}

// All-zero matrix.
inline f32x4x4 m_zero()
{
   f32x4 z = v_zero();
   return { z, z, z, z };
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix × Vector
|
||||
//------------------------------------------------------
|
||||
|
||||
// M * v for the full 4x4 (row-major): result lane i = dot(row i, v).
// _mm_dp_ps broadcasts each dot product to all four lanes, so the
// results can be merged with cheap blends instead of the previous
// round trip through scalar extraction and _mm_set_ps.  The numeric
// result is bit-identical to the old code.
inline f32x4 m_mul_vec4(const f32x4x4& m, f32x4 v)
{
   f32x4 x = v_dot4(m.r0, v);
   f32x4 y = v_dot4(m.r1, v);
   f32x4 z = v_dot4(m.r2, v);
   f32x4 w = v_dot4(m.r3, v);

   // Pick lane 1 from y, lane 2 from z, lane 3 from w.
   f32x4 r = _mm_blend_ps(x, y, 0b0010);
   r = _mm_blend_ps(r, z, 0b0100);
   return _mm_blend_ps(r, w, 0b1000);
}

// M * v using only the upper-left 3x3 (w lanes ignored); the result's
// w lane is 0, matching the previous implementation.
inline f32x4 m_mul_vec3(const f32x4x4& m, f32x4 v)
{
   f32x4 x = v_dot3(m.r0, v);
   f32x4 y = v_dot3(m.r1, v);
   f32x4 z = v_dot3(m.r2, v);

   f32x4 r = _mm_blend_ps(x, y, 0b0010);
   r = _mm_blend_ps(r, z, 0b0100);
   // imm8 0x08: copy is a no-op, zero-mask bit 3 clears the w lane.
   return _mm_insert_ps(r, r, 0x08);
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Matrix × Matrix
|
||||
//------------------------------------------------------
|
||||
|
||||
// Row-major matrix product C = A * B.
// B is transposed once so that each element C[i][j] becomes a
// row(A,i) . column(B,j) dot of two registers; the four broadcast dot
// results are then packed back into one row by taking lane 0 of each.
inline f32x4x4 m_mul(const f32x4x4& a, const f32x4x4& b)
{
   // Copy B's rows; _MM_TRANSPOSE4_PS transposes its arguments in
   // place.
   f32x4 b0 = b.r0;
   f32x4 b1 = b.r1;
   f32x4 b2 = b.r2;
   f32x4 b3 = b.r3;

   // After this, b0..b3 hold B's columns.  (SSE2 macro.)
   _MM_TRANSPOSE4_PS(b0, b1, b2, b3);

   f32x4x4 C;

   // One output row: { arow.b0, arow.b1, arow.b2, arow.b3 }.
   auto mul_row = [&](f32x4 arow)
   {
      f32x4 x = v_dot4(arow, b0);
      f32x4 y = v_dot4(arow, b1);
      f32x4 z = v_dot4(arow, b2);
      f32x4 w = v_dot4(arow, b3);

      // Pack lane 0 of x,y,z,w into a single register.
      f32x4 xy = _mm_unpacklo_ps(x, y); // {x0,y0,x1,y1}
      f32x4 zw = _mm_unpacklo_ps(z, w); // {z0,w0,z1,w1}
      return _mm_movelh_ps(xy, zw);     // {x0,y0,z0,w0}
   };

   C.r0 = mul_row(a.r0);
   C.r1 = mul_row(a.r1);
   C.r2 = mul_row(a.r2);
   C.r3 = mul_row(a.r3);

   return C;
}
|
||||
|
||||
//------------------------------------------------------
|
||||
// Transpose
|
||||
//------------------------------------------------------
|
||||
|
||||
// Return the transpose of m.  _MM_TRANSPOSE4_PS mutates its arguments
// in place, so it operates on copies of the rows.
inline f32x4x4 m_transpose(const f32x4x4& m)
{
   f32x4 c0 = m.r0, c1 = m.r1, c2 = m.r2, c3 = m.r3;
   _MM_TRANSPOSE4_PS(c0, c1, c2, c3);
   return { c0, c1, c2, c3 };
}
|
||||
|
||||
// Write the transpose of the 3x3 formed by columns c0/c1/c2 into a
// row-major float array with row stride 4: row i of dst receives the
// i-th component of each column.  dst[3], dst[7], dst[11] and the
// fourth row are deliberately left untouched.
inline void m_set3x3_transpose(float* dst, f32x4 c0, f32x4 c1, f32x4 c2)
{
   dst[0] = v_extract0(c0); // c0.x
   dst[1] = v_extract0(c1); // c1.x
   dst[2] = v_extract0(c2); // c2.x

   // Lane 1 (y) of each column, moved to lane 0 via shuffle.
   dst[4] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 1))); // c0.y
   dst[5] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 1))); // c1.y
   dst[6] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 1))); // c2.y

   // Lane 2 (z) of each column.
   dst[8] = _mm_cvtss_f32(_mm_shuffle_ps(c0, c0, _MM_SHUFFLE(3, 3, 3, 2))); // c0.z
   dst[9] = _mm_cvtss_f32(_mm_shuffle_ps(c1, c1, _MM_SHUFFLE(3, 3, 3, 2))); // c1.z
   dst[10] = _mm_cvtss_f32(_mm_shuffle_ps(c2, c2, _MM_SHUFFLE(3, 3, 3, 2))); // c2.z
}
|
||||
|
||||
// NOTE(review): this is NOT a general 4x4 inverse.  v_cross always
// returns w == 0, so 'det' below is the scalar triple product of rows
// a, c, d (a 3x3-style determinant), and the "adjugate" rows are plain
// 3D cross products; a true 4x4 cofactor involves 2x2 minors across
// all four columns.  Verify which matrix classes actually reach this
// function (it looks intended for rotation-like matrices) or route
// general matrices to a full Cramer/Gaussian inverse.  Precision is
// further limited by the approximate v_rcp_nr reciprocal.
inline f32x4x4 m_inverse(const f32x4x4& m)
{
   f32x4 a = m.r0;
   f32x4 b = m.r1;
   f32x4 c = m.r2;
   f32x4 d = m.r3;

   // Cross products of cyclic row pairs; each has w == 0.
   f32x4 c0 = v_cross(b, c);
   f32x4 c1 = v_cross(c, d);
   f32x4 c2 = v_cross(d, a);
   f32x4 c3 = v_cross(a, b);

   // Triple product a . (c x d), broadcast to all lanes.
   f32x4 det = v_dot4(a, c1);
   f32x4 invDet = v_rcp_nr(det);

   f32x4x4 adj;
   adj.r0 = _mm_mul_ps(c1, invDet);
   adj.r1 = _mm_mul_ps(c2, invDet);
   adj.r2 = _mm_mul_ps(c3, invDet);
   adj.r3 = _mm_mul_ps(c0, invDet);

   return m_transpose(adj);
}
|
||||
|
||||
// Inverse of a rigid transform (orthonormal rotation + translation,
// row-major, translation in the w lane of rows 0..2, bottom row
// {0,0,0,1}).  The rotation inverts by transposition; the new
// translation is -R^T * T.
//
// BUGFIX: the old code used v_preserve_w(t.rN, newT), which blends
// newT's *w* lane into every row - all three rows received the same
// (wrong) value, -(|T|^2).  Each row i must instead receive component
// i of newT, done here with per-lane _mm_insert_ps.
inline f32x4x4 m_inverse_rigid(const f32x4x4& m)
{
   f32x4x4 t = m_transpose(m);

   // Gather the translation column (w lanes of rows 0..2), w = 0.
   f32x4 T = v_set(
      _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3))),
      _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3))),
      _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3))),
      0.0f
   );

   // newT = -(R^T * T); rows 0..2 of t have w == 0 for a rigid input,
   // so the dot4 inside m_mul_vec4 is unaffected by T.w.
   f32x4 newT = m_mul_vec4(t, T);
   newT = _mm_sub_ps(v_zero(), newT);

   // Write component i of newT into the w lane of row i.
   // imm8 = srcLane<<6 | dstLane<<4.
   t.r0 = _mm_insert_ps(t.r0, newT, 0x30); // newT.x -> r0.w
   t.r1 = _mm_insert_ps(t.r1, newT, 0x70); // newT.y -> r1.w
   t.r2 = _mm_insert_ps(t.r2, newT, 0xB0); // newT.z -> r2.w
   t.r3 = v_set(0, 0, 0, 1);

   return t;
}
|
||||
|
||||
// Inverse of an affine transform (arbitrary invertible upper 3x3 R
// plus translation in the w lanes of rows 0..2, bottom row {0,0,0,1}):
// out = [ R^-1 | -R^-1 * T ].
//
// Two fixes versus the old code:
// 1. It AND-ed the rows with v_mask_xyz, whose lanes held the float
//    value 1.0f - a bitwise AND with 0x3F800000 corrupts the data.
//    The masking is dropped entirely: v_cross's x/y/z result depends
//    only on input x/y/z, v_dot3 ignores w, and v_cross returns
//    w == 0, so the inverse rows naturally carry w == 0.
// 2. It blended newT's *w* lane into every row via v_preserve_w, so
//    all three rows got the same wrong translation.  Component i of
//    newT now goes into the w lane of row i.
inline f32x4x4 m_inverse_affine(const f32x4x4& m)
{
   f32x4 r0 = m.r0;
   f32x4 r1 = m.r1;
   f32x4 r2 = m.r2;

   // Cofactor rows of the 3x3 via cross products (w lanes == 0).
   f32x4 c0 = v_cross(r1, r2);
   f32x4 c1 = v_cross(r2, r0);
   f32x4 c2 = v_cross(r0, r1);

   // det(upper 3x3) = r0 . (r1 x r2), broadcast to all lanes.
   f32x4 det = v_dot3(r0, c0);

   // Approximate reciprocal (see v_rcp_nr).
   f32x4 invDet = v_rcp_nr(det);

   // Rows of R^-1 before transposition (cofactors / det).
   f32x4 i0 = _mm_mul_ps(c0, invDet);
   f32x4 i1 = _mm_mul_ps(c1, invDet);
   f32x4 i2 = _mm_mul_ps(c2, invDet);

   // Transpose the 3x3; the fourth row stays zero.
   f32x4 t0 = i0;
   f32x4 t1 = i1;
   f32x4 t2 = i2;
   f32x4 t3 = _mm_setzero_ps();

   _MM_TRANSPOSE4_PS(t0, t1, t2, t3);

   // Extract the translation column of m.
   alignas(16) float tmp[4];
   tmp[0] = _mm_cvtss_f32(_mm_shuffle_ps(m.r0, m.r0, _MM_SHUFFLE(3, 3, 3, 3)));
   tmp[1] = _mm_cvtss_f32(_mm_shuffle_ps(m.r1, m.r1, _MM_SHUFFLE(3, 3, 3, 3)));
   tmp[2] = _mm_cvtss_f32(_mm_shuffle_ps(m.r2, m.r2, _MM_SHUFFLE(3, 3, 3, 3)));
   tmp[3] = 0.0f;

   f32x4 T = v_load(tmp);

   // New translation = -(R^-1) * T.
   f32x4 newT = m_mul_vec4({ t0, t1, t2, t3 }, T);
   newT = _mm_sub_ps(v_zero(), newT);

   // Assemble: component i of newT into the w lane of row i.
   // imm8 = srcLane<<6 | dstLane<<4.
   f32x4x4 out;
   out.r0 = _mm_insert_ps(t0, newT, 0x30); // newT.x -> r0.w
   out.r1 = _mm_insert_ps(t1, newT, 0x70); // newT.y -> r1.w
   out.r2 = _mm_insert_ps(t2, newT, 0xB0); // newT.z -> r2.w
   out.r3 = _mm_set_ps(1, 0, 0, 0);        // {0,0,0,1}

   return out;
}
|
||||
|
||||
// NOTE(review): not the textbook 4x4 determinant.  Because v_cross
// returns w == 0, this evaluates the lane-sum of
//    a * (cross(c,d) + cross(d,b) + cross(b,c))
// i.e. dot3(a, cxd + dxb + bxc) - a.w contributes nothing.  That only
// matches the true determinant for special matrix forms; confirm
// against callers or replace with a full cofactor expansion for
// general input.
inline f32x4 m_determinant(const f32x4x4& m)
{
   f32x4 a = m.r0;
   f32x4 b = m.r1;
   f32x4 c = m.r2;
   f32x4 d = m.r3;

   f32x4 c0 = v_cross(c, d);
   f32x4 c1 = v_cross(d, b);
   f32x4 c2 = v_cross(b, c);

   f32x4 term0 = _mm_mul_ps(a, c0);
   f32x4 term1 = _mm_mul_ps(a, c1);
   f32x4 term2 = _mm_mul_ps(a, c2);

   // Sum of the three element-wise products, folded across all lanes.
   f32x4 det = _mm_add_ps(term0, _mm_add_ps(term1, term2));

   return v_hadd4(det);
}
|
||||
|
||||
// Determinant of an affine matrix = determinant of its upper 3x3,
// computed as the triple product r0 . (r1 x r2) and broadcast to all
// four lanes.
//
// BUGFIX: the old code first AND-ed each row with v_mask_xyz, whose
// lanes held the float value 1.0f - a bitwise AND with 0x3F800000
// corrupts the row values.  The masking is also unnecessary: the
// x/y/z of v_cross's result depends only on the inputs' x/y/z, and
// v_dot3 ignores the w lanes entirely, so the translation column
// never influences the result.
inline f32x4 m_determinant_affine(const f32x4x4& m)
{
   f32x4 c0 = v_cross(m.r1, m.r2);
   return v_dot3(m.r0, c0); // splatted determinant
}
|
||||
|
||||
//---------------------------------------------------------
|
||||
// BATCH INTRINSICS
|
||||
//---------------------------------------------------------
|
||||
|
||||
// Structure-of-arrays batch of 4 vectors: lane i of each register
// holds the x/y/z/w component of vector i.
struct vec4_batch4
{
   f32x4 x; // x components of vectors 0..3
   f32x4 y; // y components of vectors 0..3
   f32x4 z; // z components of vectors 0..3
   f32x4 w; // w components of vectors 0..3
};
|
||||
|
||||
|
||||
// Load 4 packed float3s (12 contiguous floats, AoS) into SoA form.
// When fillW is true all w lanes are set to 'w'; otherwise they are
// zeroed.  (The old code left r.w uninitialized when fillW was false,
// which is undefined behavior if a caller ever reads it.)
inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW)
{
   vec4_batch4 r;

   // Gather stride-3 components into per-axis registers.
   r.x = _mm_set_ps(
      ptr[9], ptr[6], ptr[3], ptr[0]);

   r.y = _mm_set_ps(
      ptr[10], ptr[7], ptr[4], ptr[1]);

   r.z = _mm_set_ps(
      ptr[11], ptr[8], ptr[5], ptr[2]);

   r.w = fillW ? _mm_set1_ps(w) : _mm_setzero_ps();

   return r;
}
|
||||
|
||||
// Store the result back out to a float array 4 at a time
|
||||
inline void store_f32x4(float* out, f32x4 v)
|
||||
{
|
||||
_mm_storeu_ps(out, v);
|
||||
}
|
||||
|
||||
// Store the result back to a float3 array with size of 4
|
||||
inline void store_vec3_batch4(float* out, const vec4_batch4& v)
|
||||
{
|
||||
alignas(16) float xs[8];
|
||||
alignas(16) float ys[8];
|
||||
alignas(16) float zs[8];
|
||||
|
||||
_mm_store_ps(xs, v.x);
|
||||
_mm_store_ps(ys, v.y);
|
||||
_mm_store_ps(zs, v.z);
|
||||
|
||||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
out[i * 3 + 0] = xs[i];
|
||||
out[i * 3 + 1] = ys[i];
|
||||
out[i * 3 + 2] = zs[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Per-lane 3-component dot product of two SoA batches: lane i of the
// result is dot3(a[i], b[i]).  The w lanes are ignored.
inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b)
{
   f32x4 acc = _mm_mul_ps(a.x, b.x);
   acc = _mm_add_ps(acc, _mm_mul_ps(a.y, b.y));
   acc = _mm_add_ps(acc, _mm_mul_ps(a.z, b.z));
   return acc;
}
|
||||
|
||||
// Batch 4 mul_Vec4.
|
||||
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
|
||||
{
|
||||
vec4_batch4 r;
|
||||
|
||||
f32x4 m00 = _mm_set1_ps(m[0]);
|
||||
f32x4 m01 = _mm_set1_ps(m[1]);
|
||||
f32x4 m02 = _mm_set1_ps(m[2]);
|
||||
f32x4 m03 = _mm_set1_ps(m[3]);
|
||||
|
||||
f32x4 m10 = _mm_set1_ps(m[4]);
|
||||
f32x4 m11 = _mm_set1_ps(m[5]);
|
||||
f32x4 m12 = _mm_set1_ps(m[6]);
|
||||
f32x4 m13 = _mm_set1_ps(m[7]);
|
||||
|
||||
f32x4 m20 = _mm_set1_ps(m[8]);
|
||||
f32x4 m21 = _mm_set1_ps(m[9]);
|
||||
f32x4 m22 = _mm_set1_ps(m[10]);
|
||||
f32x4 m23 = _mm_set1_ps(m[11]);
|
||||
|
||||
//
|
||||
// row0 dot
|
||||
//
|
||||
r.x = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.x, m00),
|
||||
_mm_mul_ps(v.y, m01)),
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.z, m02), m03));
|
||||
|
||||
//
|
||||
// row1 dot
|
||||
//
|
||||
r.y = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.x, m10),
|
||||
_mm_mul_ps(v.y, m11)),
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.z, m12), m13));
|
||||
|
||||
//
|
||||
// row2 dot
|
||||
//
|
||||
r.z = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.x, m20),
|
||||
_mm_mul_ps(v.y, m21)),
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(v.z, m22), m23));
|
||||
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
|
@ -151,7 +151,7 @@ inline void m_matF_x_vectorF(const F32 *m, const F32 *v, F32 *vresult)
|
|||
inline bool mIsEqual( F32 a, F32 b, const F32 epsilon = __EQUAL_CONST_F )
|
||||
{
|
||||
F32 diff = a - b;
|
||||
return diff > -epsilon && diff < epsilon;
|
||||
return diff > -epsilon && diff < epsilon;
|
||||
}
|
||||
|
||||
inline bool mIsZero(const F32 val, const F32 epsilon = __EQUAL_CONST_F )
|
||||
|
|
@ -207,16 +207,16 @@ inline F32 mFmod(const F32 val, const F32 mod)
|
|||
return fmod(val, mod);
|
||||
}
|
||||
|
||||
inline S32 mRound(const F32 val)
|
||||
{
|
||||
return (S32)floor(val + 0.5f);
|
||||
}
|
||||
inline S32 mRound(const F32 val)
|
||||
{
|
||||
return (S32)floor(val + 0.5f);
|
||||
}
|
||||
|
||||
inline F32 mRound(const F32 val, const S32 n)
|
||||
{
|
||||
S32 place = (S32) pow(10.0f, n);
|
||||
inline F32 mRound(const F32 val, const S32 n)
|
||||
{
|
||||
S32 place = (S32) pow(10.0f, n);
|
||||
|
||||
return mFloor((val*place)+0.5)/place;
|
||||
return mFloor((val*place)+0.5)/place;
|
||||
}
|
||||
|
||||
inline F32 mRoundF(const F32 val, const F32 step)
|
||||
|
|
@ -261,15 +261,15 @@ inline F32 mClampF(F32 val, F32 low, F32 high)
|
|||
|
||||
inline S32 mWrap(S32 val, S32 low, S32 high)
|
||||
{
|
||||
int len = high - low;
|
||||
return low + (val >= 0 ? val % len : -val % len ? len - (-val % len) : 0);
|
||||
int len = high - low;
|
||||
return low + (val >= 0 ? val % len : -val % len ? len - (-val % len) : 0);
|
||||
|
||||
}
|
||||
|
||||
inline F32 mWrapF(F32 val, F32 low, F32 high)
|
||||
{
|
||||
F32 t = fmod(val - low, high - low);
|
||||
return t < 0 ? t + high : t + low;
|
||||
F32 t = fmod(val - low, high - low);
|
||||
return t < 0 ? t + high : t + low;
|
||||
}
|
||||
|
||||
/// Template function for doing a linear interpolation between any two
|
||||
|
|
@ -497,7 +497,7 @@ inline F64 mRadToDeg(F64 r)
|
|||
|
||||
inline bool mIsNaN_F( const F32 x )
|
||||
{
|
||||
// If x is a floating point variable, then (x != x) will be TRUE if x has the value NaN.
|
||||
// If x is a floating point variable, then (x != x) will be TRUE if x has the value NaN.
|
||||
// This is only going to work if the compiler is IEEE 748 compliant.
|
||||
//
|
||||
// Tested and working on VC2k5
|
||||
|
|
|
|||
|
|
@ -152,8 +152,8 @@ static void m_point3F_interpolate_C(const F32 *from, const F32 *to, F32 factor,
|
|||
#ifdef TORQUE_COMPILER_GCC
|
||||
// remove possibility of aliases
|
||||
const F32 inverse = 1.0f - factor;
|
||||
const F32 from0 = from[0], from1 = from[1], from2 = from[2];
|
||||
const F32 to0 = to[0], to1 = to[1], to2 = to[2];
|
||||
const F32 from0 = from[0], from1 = from[1], from2 = from[2];
|
||||
const F32 to0 = to[0], to1 = to[1], to2 = to[2];
|
||||
|
||||
result[0] = from0 * inverse + to0 * factor;
|
||||
result[1] = from1 * inverse + to1 * factor;
|
||||
|
|
@ -182,8 +182,8 @@ static void m_point3D_interpolate_C(const F64 *from, const F64 *to, F64 factor,
|
|||
#ifdef TORQUE_COMPILER_GCC
|
||||
// remove possibility of aliases
|
||||
const F64 inverse = 1.0f - factor;
|
||||
const F64 from0 = from[0], from1 = from[1], from2 = from[2];
|
||||
const F64 to0 = to[0], to1 = to[1], to2 = to[2];
|
||||
const F64 from0 = from[0], from1 = from[1], from2 = from[2];
|
||||
const F64 to0 = to[0], to1 = to[1], to2 = to[2];
|
||||
|
||||
result[0] = from0 * inverse + to0 * factor;
|
||||
result[1] = from1 * inverse + to1 * factor;
|
||||
|
|
@ -436,8 +436,8 @@ static void affineInvertTo(const F32 * m, F32 * out)
|
|||
|
||||
F32 invDet = 1.0f / (m[idx(0,0)] * d1 + m[idx(0,1)] * d2 + m[idx(0,2)] * d3);
|
||||
|
||||
F32 m00 = m[idx(0,0)] * invDet;
|
||||
F32 m01 = m[idx(0,1)] * invDet;
|
||||
F32 m00 = m[idx(0,0)] * invDet;
|
||||
F32 m01 = m[idx(0,1)] * invDet;
|
||||
F32 m02 = m[idx(0,2)] * invDet;
|
||||
|
||||
F32 * result = out;
|
||||
|
|
@ -950,4 +950,3 @@ void mInstallLibrary_C()
|
|||
m_matF_x_scale_x_planeF = m_matF_x_scale_x_planeF_C;
|
||||
m_matF_x_box3F = m_matF_x_box3F_C;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -50,13 +50,16 @@
|
|||
#ifndef _CONSOLE_H_
|
||||
#include "console/console.h"
|
||||
#endif
|
||||
#ifndef _MATH_BACKEND_H_
|
||||
#include "math/public/math_backend.h"
|
||||
#endif
|
||||
|
||||
#ifndef USE_TEMPLATE_MATRIX
|
||||
|
||||
/// 4x4 Matrix Class
|
||||
///
|
||||
/// This runs at F32 precision.
|
||||
|
||||
using namespace math_backend::mat44::dispatch;
|
||||
class MatrixF
|
||||
{
|
||||
friend class MatrixFEngineExport;
|
||||
|
|
@ -128,7 +131,7 @@ public:
|
|||
EulerF toEuler() const;
|
||||
|
||||
F32 determinant() const {
|
||||
return m_matF_determinant(*this);
|
||||
return GetMat44().determinant(*this);
|
||||
}
|
||||
|
||||
/// Compute the inverse of the matrix.
|
||||
|
|
@ -228,6 +231,8 @@ public:
|
|||
void mul( Point4F& p ) const; ///< M * p -> p (full [4x4] * [1x4])
|
||||
void mulP( Point3F& p ) const; ///< M * p -> p (assume w = 1.0f)
|
||||
void mulP( const Point3F &p, Point3F *d) const; ///< M * p -> d (assume w = 1.0f)
|
||||
void batch_mulP(Point3F* points, U32 count) const;
|
||||
void batch_mulP(const Point3F* points, U32 count, Point3F* out) const;
|
||||
void mulV( VectorF& p ) const; ///< M * v -> v (assume w = 0.0f)
|
||||
void mulV( const VectorF &p, Point3F *d) const; ///< M * v -> d (assume w = 0.0f)
|
||||
|
||||
|
|
@ -372,71 +377,66 @@ inline MatrixF& MatrixF::identity()
|
|||
|
||||
inline MatrixF& MatrixF::inverse()
|
||||
{
|
||||
m_matF_inverse(m);
|
||||
GetMat44().inverse(m);
|
||||
return (*this);
|
||||
}
|
||||
|
||||
inline void MatrixF::invertTo( MatrixF *out )
|
||||
{
|
||||
m_matF_invert_to(m,*out);
|
||||
GetMat44().inverse_to(*this, *out);
|
||||
}
|
||||
|
||||
inline MatrixF& MatrixF::affineInverse()
|
||||
{
|
||||
// AssertFatal(isAffine() == true, "Error, this matrix is not an affine transform");
|
||||
m_matF_affineInverse(m);
|
||||
GetMat44().affine_inverse(m);
|
||||
return (*this);
|
||||
}
|
||||
|
||||
inline MatrixF& MatrixF::transpose()
|
||||
{
|
||||
m_matF_transpose(m);
|
||||
GetMat44().transpose(m);
|
||||
return (*this);
|
||||
}
|
||||
|
||||
inline MatrixF& MatrixF::scale(const Point3F& p)
|
||||
{
|
||||
m_matF_scale(m,p);
|
||||
GetMat44().scale(m, p);
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Point3F MatrixF::getScale() const
|
||||
{
|
||||
Point3F scale;
|
||||
scale.x = mSqrt(m[0]*m[0] + m[4] * m[4] + m[8] * m[8]);
|
||||
scale.y = mSqrt(m[1]*m[1] + m[5] * m[5] + m[9] * m[9]);
|
||||
scale.z = mSqrt(m[2]*m[2] + m[6] * m[6] + m[10] * m[10]);
|
||||
GetMat44().get_scale(m, scale);
|
||||
return scale;
|
||||
}
|
||||
|
||||
inline void MatrixF::normalize()
|
||||
{
|
||||
m_matF_normalize(m);
|
||||
GetMat44().normalize(m);
|
||||
}
|
||||
|
||||
inline MatrixF& MatrixF::mul( const MatrixF &a )
|
||||
{ // M * a -> M
|
||||
AssertFatal(&a != this, "MatrixF::mul - a.mul(a) is invalid!");
|
||||
|
||||
MatrixF tempThis(*this);
|
||||
m_matF_x_matF(tempThis, a, *this);
|
||||
GetMat44().mul_mat44(tempThis, a, *this);
|
||||
|
||||
return (*this);
|
||||
}
|
||||
|
||||
inline MatrixF& MatrixF::mulL( const MatrixF &a )
|
||||
{ // a * M -> M
|
||||
AssertFatal(&a != this, "MatrixF::mulL - a.mul(a) is invalid!");
|
||||
|
||||
MatrixF tempThis(*this);
|
||||
m_matF_x_matF(a, tempThis, *this);
|
||||
GetMat44().mul_mat44(a, tempThis, *this);
|
||||
return (*this);
|
||||
}
|
||||
|
||||
inline MatrixF& MatrixF::mul( const MatrixF &a, const MatrixF &b )
|
||||
{ // a * b -> M
|
||||
AssertFatal((&a != this) && (&b != this), "MatrixF::mul - a.mul(a, b) a.mul(b, a) a.mul(a, a) is invalid!");
|
||||
|
||||
m_matF_x_matF(a, b, *this);
|
||||
GetMat44().mul_mat44(a, b, *this);
|
||||
return (*this);
|
||||
}
|
||||
|
||||
|
|
@ -461,7 +461,7 @@ inline MatrixF& MatrixF::mul(const MatrixF &a, const F32 b)
|
|||
inline void MatrixF::mul( Point4F& p ) const
|
||||
{
|
||||
Point4F temp;
|
||||
m_matF_x_point4F(*this, &p.x, &temp.x);
|
||||
GetMat44().mul_float4(*this, p, temp);
|
||||
p = temp;
|
||||
}
|
||||
|
||||
|
|
@ -469,28 +469,62 @@ inline void MatrixF::mulP( Point3F& p) const
|
|||
{
|
||||
// M * p -> d
|
||||
Point3F d;
|
||||
m_matF_x_point3F(*this, &p.x, &d.x);
|
||||
GetMat44().mul_pos3(*this, p, d);
|
||||
p = d;
|
||||
}
|
||||
|
||||
inline void MatrixF::mulP( const Point3F &p, Point3F *d) const
|
||||
{
|
||||
// M * p -> d
|
||||
m_matF_x_point3F(*this, &p.x, &d->x);
|
||||
GetMat44().mul_pos3(*this, p, *d);
|
||||
}
|
||||
|
||||
inline void MatrixF::batch_mulP(Point3F* points, U32 count) const
|
||||
{
|
||||
// Allocate temporary buffer
|
||||
Point3F* temp = new Point3F[count];
|
||||
|
||||
GetMat44().batch_mul_pos3(m, reinterpret_cast<const float*>(points), count, reinterpret_cast<float*>(temp));
|
||||
|
||||
// Copy the results back to out
|
||||
for (U32 i = 0; i < count; ++i)
|
||||
{
|
||||
points[i] = temp[i];
|
||||
}
|
||||
|
||||
// Free temporary buffer
|
||||
delete[] temp;
|
||||
}
|
||||
|
||||
inline void MatrixF::batch_mulP(const Point3F* points, U32 count, Point3F* out) const
|
||||
{
|
||||
// Allocate temporary buffer
|
||||
Point3F* temp = new Point3F[count];
|
||||
|
||||
GetMat44().batch_mul_pos3(m, reinterpret_cast<const float*>(points), count, reinterpret_cast<float*>(temp));
|
||||
|
||||
// Copy the results back to out
|
||||
for (U32 i = 0; i < count; ++i)
|
||||
{
|
||||
out[i] = temp[i];
|
||||
}
|
||||
|
||||
// Free temporary buffer
|
||||
delete[] temp;
|
||||
}
|
||||
|
||||
inline void MatrixF::mulV( VectorF& v) const
|
||||
{
|
||||
// M * v -> v
|
||||
VectorF temp;
|
||||
m_matF_x_vectorF(*this, &v.x, &temp.x);
|
||||
GetMat44().mul_vec3(*this, v, temp);
|
||||
v = temp;
|
||||
}
|
||||
|
||||
inline void MatrixF::mulV( const VectorF &v, Point3F *d) const
|
||||
{
|
||||
// M * v -> d
|
||||
m_matF_x_vectorF(*this, &v.x, &d->x);
|
||||
GetMat44().mul_vec3(*this, v, *d);
|
||||
}
|
||||
|
||||
inline void MatrixF::mul(Box3F& b) const
|
||||
|
|
@ -634,14 +668,14 @@ inline MatrixF operator * ( const MatrixF &m1, const MatrixF &m2 )
|
|||
{
|
||||
// temp = m1 * m2
|
||||
MatrixF temp;
|
||||
m_matF_x_matF(m1, m2, temp);
|
||||
GetMat44().mul_mat44(m1, m2, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline MatrixF& MatrixF::operator *= ( const MatrixF &m1 )
|
||||
{
|
||||
MatrixF tempThis(*this);
|
||||
m_matF_x_matF(tempThis, m1, *this);
|
||||
GetMat44().mul_mat44(tempThis, m1, *this);
|
||||
return (*this);
|
||||
}
|
||||
|
||||
|
|
@ -665,7 +699,9 @@ inline bool MatrixF::isNaN()
|
|||
|
||||
inline void mTransformPlane(const MatrixF& mat, const Point3F& scale, const PlaneF& plane, PlaneF * result)
|
||||
{
|
||||
m_matF_x_scale_x_planeF(mat, &scale.x, &plane.x, &result->x);
|
||||
//m_matF_x_scale_x_planeF(mat, &scale.x, &plane.x, &result->x);
|
||||
|
||||
GetMat44().transform_plane(mat, scale, plane, *result);
|
||||
}
|
||||
|
||||
#else // !USE_TEMPLATE_MATRIX
|
||||
|
|
|
|||
|
|
@ -29,6 +29,11 @@
|
|||
#ifndef _MPOINT2_H_
|
||||
#include "math/mPoint2.h"
|
||||
#endif
|
||||
#ifndef _MATH_BACKEND_H_
|
||||
#include "math/public/math_backend.h"
|
||||
#endif
|
||||
|
||||
#include <array>
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
/// 3D integer point
|
||||
|
|
@ -97,6 +102,7 @@ public:
|
|||
class Point3D;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
using namespace math_backend::float3::dispatch;
|
||||
class Point3F
|
||||
{
|
||||
//-------------------------------------- Public data
|
||||
|
|
@ -497,7 +503,8 @@ inline void Point3F::setMax(const Point3F& _test)
|
|||
inline void Point3F::interpolate(const Point3F& _from, const Point3F& _to, F32 _factor)
|
||||
{
|
||||
AssertFatal(_factor >= 0.0f && _factor <= 1.0f, "Out of bound interpolation factor");
|
||||
m_point3F_interpolate( _from, _to, _factor, *this);
|
||||
|
||||
GetFloat3().lerp(_from, _to, _factor, *this);
|
||||
}
|
||||
|
||||
inline void Point3F::zero()
|
||||
|
|
@ -585,31 +592,27 @@ inline void Point3F::neg()
|
|||
|
||||
inline void Point3F::convolve(const Point3F& c)
|
||||
{
|
||||
x *= c.x;
|
||||
y *= c.y;
|
||||
z *= c.z;
|
||||
GetFloat3().mul(*this, c, *this);
|
||||
}
|
||||
|
||||
inline void Point3F::convolveInverse(const Point3F& c)
|
||||
{
|
||||
x /= c.x;
|
||||
y /= c.y;
|
||||
z /= c.z;
|
||||
GetFloat3().div(*this, c, *this);
|
||||
}
|
||||
|
||||
inline F32 Point3F::lenSquared() const
|
||||
{
|
||||
return (x * x) + (y * y) + (z * z);
|
||||
return GetFloat3().lengthSquared(*this);
|
||||
}
|
||||
|
||||
inline F32 Point3F::len() const
|
||||
{
|
||||
return mSqrt(x*x + y*y + z*z);
|
||||
return GetFloat3().length(*this);
|
||||
}
|
||||
|
||||
inline void Point3F::normalize()
|
||||
{
|
||||
m_point3F_normalize(*this);
|
||||
GetFloat3().normalize(*this);
|
||||
}
|
||||
|
||||
inline F32 Point3F::magnitudeSafe() const
|
||||
|
|
@ -626,18 +629,13 @@ inline F32 Point3F::magnitudeSafe() const
|
|||
|
||||
inline void Point3F::normalizeSafe()
|
||||
{
|
||||
F32 vmag = magnitudeSafe();
|
||||
|
||||
if( vmag > POINT_EPSILON )
|
||||
{
|
||||
*this *= F32(1.0 / vmag);
|
||||
}
|
||||
GetFloat3().normalize(*this);
|
||||
}
|
||||
|
||||
|
||||
inline void Point3F::normalize(F32 val)
|
||||
{
|
||||
m_point3F_normalize_f(*this, val);
|
||||
GetFloat3().normalize_mag(*this, val);
|
||||
}
|
||||
|
||||
inline bool Point3F::operator==(const Point3F& _test) const
|
||||
|
|
@ -652,52 +650,49 @@ inline bool Point3F::operator!=(const Point3F& _test) const
|
|||
|
||||
inline Point3F Point3F::operator+(const Point3F& _add) const
|
||||
{
|
||||
return Point3F(x + _add.x, y + _add.y, z + _add.z);
|
||||
Point3F temp;
|
||||
GetFloat3().add(*this, _add, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3F Point3F::operator-(const Point3F& _rSub) const
|
||||
{
|
||||
return Point3F(x - _rSub.x, y - _rSub.y, z - _rSub.z);
|
||||
Point3F temp;
|
||||
GetFloat3().sub(*this, _rSub, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3F& Point3F::operator+=(const Point3F& _add)
|
||||
{
|
||||
x += _add.x;
|
||||
y += _add.y;
|
||||
z += _add.z;
|
||||
|
||||
GetFloat3().add(*this, _add, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Point3F& Point3F::operator-=(const Point3F& _rSub)
|
||||
{
|
||||
x -= _rSub.x;
|
||||
y -= _rSub.y;
|
||||
z -= _rSub.z;
|
||||
|
||||
GetFloat3().sub(*this, _rSub, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Point3F Point3F::operator*(F32 _mul) const
|
||||
{
|
||||
return Point3F(x * _mul, y * _mul, z * _mul);
|
||||
Point3F temp;
|
||||
GetFloat3().mul_scalar(*this, _mul, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3F Point3F::operator/(F32 _div) const
|
||||
{
|
||||
AssertFatal(_div != 0.0f, "Error, div by zero attempted");
|
||||
|
||||
F32 inv = 1.0f / _div;
|
||||
|
||||
return Point3F(x * inv, y * inv, z * inv);
|
||||
Point3F temp;
|
||||
GetFloat3().div_scalar(*this, _div, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3F& Point3F::operator*=(F32 _mul)
|
||||
{
|
||||
x *= _mul;
|
||||
y *= _mul;
|
||||
z *= _mul;
|
||||
|
||||
GetFloat3().mul_scalar(*this, _mul, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
|
@ -705,39 +700,35 @@ inline Point3F& Point3F::operator/=(F32 _div)
|
|||
{
|
||||
AssertFatal(_div != 0.0f, "Error, div by zero attempted");
|
||||
|
||||
F32 inv = 1.0f / _div;
|
||||
x *= inv;
|
||||
y *= inv;
|
||||
z *= inv;
|
||||
|
||||
GetFloat3().div_scalar(*this, _div, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Point3F Point3F::operator*(const Point3F &_vec) const
|
||||
{
|
||||
return Point3F(x * _vec.x, y * _vec.y, z * _vec.z);
|
||||
Point3F temp;
|
||||
GetFloat3().mul(*this, _vec, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3F& Point3F::operator*=(const Point3F &_vec)
|
||||
{
|
||||
x *= _vec.x;
|
||||
y *= _vec.y;
|
||||
z *= _vec.z;
|
||||
GetFloat3().mul(*this, _vec, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Point3F Point3F::operator/(const Point3F &_vec) const
|
||||
{
|
||||
AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
|
||||
return Point3F(x / _vec.x, y / _vec.y, z / _vec.z);
|
||||
Point3F temp;
|
||||
GetFloat3().div(*this, _vec, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3F& Point3F::operator/=(const Point3F &_vec)
|
||||
{
|
||||
AssertFatal(_vec.x != 0.0f && _vec.y != 0.0f && _vec.z != 0.0f, "Error, div by zero attempted");
|
||||
x /= _vec.x;
|
||||
y /= _vec.y;
|
||||
z /= _vec.z;
|
||||
GetFloat3().div(*this, _vec, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
|
@ -989,7 +980,9 @@ inline Point3I operator*(S32 mul, const Point3I& multiplicand)
|
|||
|
||||
inline Point3F operator*(F32 mul, const Point3F& multiplicand)
|
||||
{
|
||||
return multiplicand * mul;
|
||||
Point3F temp;
|
||||
GetFloat3().mul_scalar(multiplicand, mul, temp);
|
||||
return temp;
|
||||
}
|
||||
|
||||
inline Point3D operator*(F64 mul, const Point3D& multiplicand)
|
||||
|
|
@ -999,7 +992,7 @@ inline Point3D operator*(F64 mul, const Point3D& multiplicand)
|
|||
|
||||
inline F32 mDot(const Point3F &p1, const Point3F &p2)
|
||||
{
|
||||
return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z);
|
||||
return GetFloat3().dot(p1, p2);
|
||||
}
|
||||
|
||||
inline F64 mDot(const Point3D &p1, const Point3D &p2)
|
||||
|
|
@ -1009,9 +1002,7 @@ inline F64 mDot(const Point3D &p1, const Point3D &p2)
|
|||
|
||||
inline void mCross(const Point3F &a, const Point3F &b, Point3F *res)
|
||||
{
|
||||
res->x = (a.y * b.z) - (a.z * b.y);
|
||||
res->y = (a.z * b.x) - (a.x * b.z);
|
||||
res->z = (a.x * b.y) - (a.y * b.x);
|
||||
GetFloat3().cross(a, b, *res);
|
||||
}
|
||||
|
||||
inline void mCross(const Point3D &a, const Point3D &b, Point3D *res)
|
||||
|
|
@ -1024,7 +1015,7 @@ inline void mCross(const Point3D &a, const Point3D &b, Point3D *res)
|
|||
inline Point3F mCross(const Point3F &a, const Point3F &b)
|
||||
{
|
||||
Point3F r;
|
||||
mCross( a, b, &r );
|
||||
GetFloat3().cross(a, b, r);
|
||||
return r;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -26,10 +26,12 @@
|
|||
#ifndef _MMATHFN_H_
|
||||
#include "math/mMathFn.h"
|
||||
#endif
|
||||
|
||||
#ifndef _MPOINT3_H_
|
||||
#include "math/mPoint3.h"
|
||||
#endif
|
||||
#ifndef _MATH_BACKEND_H_
|
||||
#include "math/public/math_backend.h"
|
||||
#endif
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
|
@ -61,6 +63,8 @@ class Point4I
|
|||
/// Uses F32 internally.
|
||||
///
|
||||
/// Useful for representing quaternions and other 4d beasties.
|
||||
using namespace math_backend::float4::dispatch;
|
||||
|
||||
class Point4F
|
||||
{
|
||||
//-------------------------------------- Public data
|
||||
|
|
@ -152,15 +156,12 @@ inline void Point4F::set(F32 _x, F32 _y, F32 _z, F32 _w)
|
|||
|
||||
inline F32 Point4F::len() const
|
||||
{
|
||||
return mSqrt(x*x + y*y + z*z + w*w);
|
||||
return GetFloat4().length(*this);
|
||||
}
|
||||
|
||||
inline void Point4F::interpolate(const Point4F& _from, const Point4F& _to, F32 _factor)
|
||||
{
|
||||
x = (_from.x * (1.0f - _factor)) + (_to.x * _factor);
|
||||
y = (_from.y * (1.0f - _factor)) + (_to.y * _factor);
|
||||
z = (_from.z * (1.0f - _factor)) + (_to.z * _factor);
|
||||
w = (_from.w * (1.0f - _factor)) + (_to.w * _factor);
|
||||
GetFloat4().lerp(_from, _to, _factor, *this);
|
||||
}
|
||||
|
||||
inline void Point4F::zero()
|
||||
|
|
@ -193,55 +194,55 @@ inline Point4F& Point4F::operator/=(F32 scalar)
|
|||
if (mIsZero(scalar))
|
||||
return *this;
|
||||
|
||||
F32 denom = 1 / scalar;
|
||||
|
||||
x *= denom;
|
||||
y *= denom;
|
||||
z *= denom;
|
||||
w *= denom;
|
||||
GetFloat4().div_scalar(*this, scalar, *this);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Point4F Point4F::operator+(const Point4F& _add) const
|
||||
{
|
||||
return Point4F( x + _add.x, y + _add.y, z + _add.z, w + _add.w );
|
||||
Point4F res;
|
||||
GetFloat4().add(*this, _add, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
inline Point4F& Point4F::operator+=(const Point4F& _add)
|
||||
{
|
||||
x += _add.x;
|
||||
y += _add.y;
|
||||
z += _add.z;
|
||||
w += _add.w;
|
||||
|
||||
GetFloat4().add(*this, _add, *this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline Point4F Point4F::operator-(const Point4F& _rSub) const
|
||||
{
|
||||
return Point4F( x - _rSub.x, y - _rSub.y, z - _rSub.z, w - _rSub.w );
|
||||
Point4F res;
|
||||
GetFloat4().sub(*this, _rSub, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
inline Point4F Point4F::operator*(const Point4F &_vec) const
|
||||
{
|
||||
return Point4F(x * _vec.x, y * _vec.y, z * _vec.z, w * _vec.w);
|
||||
Point4F res;
|
||||
GetFloat4().mul(*this, _vec, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
inline Point4F Point4F::operator*(F32 _mul) const
|
||||
{
|
||||
return Point4F(x * _mul, y * _mul, z * _mul, w * _mul);
|
||||
Point4F res;
|
||||
GetFloat4().mul_scalar(*this, _mul, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
inline Point4F Point4F::operator /(F32 t) const
|
||||
{
|
||||
F32 f = 1.0f / t;
|
||||
return Point4F( x * f, y * f, z * f, w * f );
|
||||
Point4F res;
|
||||
GetFloat4().div_scalar(*this, t, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
inline F32 mDot(const Point4F &p1, const Point4F &p2)
|
||||
{
|
||||
return (p1.x*p2.x + p1.y*p2.y + p1.z*p2.z + p1.w*p2.w);
|
||||
return GetFloat4().dot(p1, p2);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
|
|
|||
39
Engine/source/math/public/float3_dispatch.h
Normal file
39
Engine/source/math/public/float3_dispatch.h
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
#ifndef _FLOAT3_DISPATCH_H_
|
||||
#define _FLOAT3_DISPATCH_H_
|
||||
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace math_backend::float3::dispatch
|
||||
{
|
||||
struct Float3Funcs
|
||||
{
|
||||
void (*add)(const float*, const float*, float*) = nullptr;
|
||||
void (*sub)(const float*, const float*, float*) = nullptr;
|
||||
void (*mul)(const float*, const float*, float*) = nullptr;
|
||||
void (*mul_scalar)(const float*, float, float*) = nullptr;
|
||||
void (*div)(const float*, const float*, float*) = nullptr;
|
||||
void (*div_scalar)(const float*, float, float*) = nullptr;
|
||||
float (*dot)(const float*, const float*) = nullptr;
|
||||
float (*length)(const float*) = nullptr;
|
||||
float (*lengthSquared)(const float*) = nullptr;
|
||||
void (*normalize)(float*) = nullptr;
|
||||
void (*normalize_mag)(float*, float) = nullptr;
|
||||
void (*lerp)(const float*, const float*, float, float*) = nullptr;
|
||||
void (*cross)(const float*, const float*, float*) = nullptr;
|
||||
};
|
||||
|
||||
// Global dispatch table
|
||||
extern Float3Funcs gFloat3;
|
||||
|
||||
// Backend installers (defined in ISA libraries)
|
||||
void install_scalar();
|
||||
void install_sse2();
|
||||
void install_sse41();
|
||||
void install_avx();
|
||||
void install_avx2();
|
||||
void install_neon();
|
||||
}
|
||||
|
||||
#endif // !_FLOAT4_DISPATCH_H_
|
||||
39
Engine/source/math/public/float4_dispatch.h
Normal file
39
Engine/source/math/public/float4_dispatch.h
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
#ifndef _FLOAT4_DISPATCH_H_
|
||||
#define _FLOAT4_DISPATCH_H_
|
||||
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace math_backend::float4::dispatch
|
||||
{
|
||||
struct Float4Funcs
|
||||
{
|
||||
void (*add)(const float*, const float*, float*) = nullptr;
|
||||
void (*sub)(const float*, const float*, float*) = nullptr;
|
||||
void (*mul)(const float*, const float*, float*) = nullptr;
|
||||
void (*mul_scalar)(const float*, float, float*) = nullptr;
|
||||
void (*div)(const float*, const float*, float*) = nullptr;
|
||||
void (*div_scalar)(const float*, float, float*) = nullptr;
|
||||
float (*dot)(const float*, const float*) = nullptr;
|
||||
float (*length)(const float*) = nullptr;
|
||||
float (*lengthSquared)(const float*) = nullptr;
|
||||
void (*normalize)(float*) = nullptr;
|
||||
void (*normalize_mag)(float*, float) = nullptr;
|
||||
void (*lerp)(const float*, const float*, float, float*) = nullptr;
|
||||
void (*cross)(const float*, const float*, float*) = nullptr;
|
||||
};
|
||||
|
||||
// Global dispatch table
|
||||
extern Float4Funcs gFloat4;
|
||||
|
||||
// Backend installers (defined in ISA libraries)
|
||||
void install_scalar();
|
||||
void install_sse2();
|
||||
void install_sse41();
|
||||
void install_avx();
|
||||
void install_avx2();
|
||||
void install_neon();
|
||||
}
|
||||
|
||||
#endif // !_FLOAT4_DISPATCH_H_
|
||||
39
Engine/source/math/public/mat44_dispatch.h
Normal file
39
Engine/source/math/public/mat44_dispatch.h
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
#ifndef _MAT44_DISPATCH_H_
|
||||
#define _MAT44_DISPATCH_H_
|
||||
|
||||
|
||||
namespace math_backend::mat44::dispatch
|
||||
{
|
||||
struct Mat44Funcs
|
||||
{
|
||||
void (*transpose)(float*) = nullptr;
|
||||
void (*inverse)(float*) = nullptr;
|
||||
void (*inverse_to)(const float*, float*) = nullptr;
|
||||
void (*affine_inverse)(float*) = nullptr;
|
||||
void (*normalize)(float*) = nullptr;
|
||||
void (*mul_mat44)(const float* a, const float* b, float* r) = nullptr;
|
||||
void (*mul_pos3)(const float* a, const float* b, float* r) = nullptr;
|
||||
void (*mul_vec3)(const float* a, const float* b, float* r) = nullptr;
|
||||
void (*mul_float4)(const float* a, const float* b, float* r) = nullptr;
|
||||
void (*transform_plane)(const float* m, const float* s, const float* p, float* r) = nullptr;
|
||||
float (*determinant)(const float*) = nullptr;
|
||||
void (*scale)(float*, const float*) = nullptr;
|
||||
void (*get_scale)(const float*, float*) = nullptr;
|
||||
|
||||
void (*batch_mul_pos3)(const float* m, const float* pts, int count, float* result_ptrs) = nullptr;
|
||||
};
|
||||
|
||||
// Global dispatch table
|
||||
extern Mat44Funcs gMat44;
|
||||
|
||||
// Backend installers (defined in ISA libraries)
|
||||
void install_scalar();
|
||||
void install_sse2();
|
||||
void install_sse41();
|
||||
void install_avx();
|
||||
void install_avx2();
|
||||
void install_neon();
|
||||
}
|
||||
|
||||
#endif // !_MAT44_DISPATCH_H_
|
||||
103
Engine/source/math/public/math_backend.cpp
Normal file
103
Engine/source/math/public/math_backend.cpp
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
#include "math/public/math_backend.h"
|
||||
|
||||
namespace math_backend::float4::dispatch
|
||||
{
|
||||
// Single definition of the global dispatch table
|
||||
Float4Funcs gFloat4{};
|
||||
}
|
||||
|
||||
namespace math_backend::float3::dispatch
|
||||
{
|
||||
// Single definition of the global dispatch table
|
||||
Float3Funcs gFloat3{};
|
||||
}
|
||||
|
||||
namespace math_backend::mat44::dispatch
|
||||
{
|
||||
Mat44Funcs gMat44{};
|
||||
}
|
||||
|
||||
namespace math_backend
|
||||
{
|
||||
// Use an anonymous namespace for the static initializer
|
||||
namespace {
|
||||
struct ScalarInitializer
|
||||
{
|
||||
ScalarInitializer()
|
||||
{
|
||||
// Install scalar defaults immediately for all types
|
||||
float4::dispatch::install_scalar();
|
||||
float3::dispatch::install_scalar();
|
||||
mat44::dispatch::install_scalar();
|
||||
}
|
||||
};
|
||||
|
||||
// Static instance ensures constructor runs before main()
|
||||
ScalarInitializer g_scalarInitializer;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
math_backend::backend math_backend::choose_backend(U32 cpu_flags)
|
||||
{
|
||||
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
|
||||
|
||||
if (cpu_flags & CPU_PROP_AVX2) return backend::avx2;
|
||||
if (cpu_flags & CPU_PROP_AVX) return backend::avx;
|
||||
if (cpu_flags & CPU_PROP_SSE4_1) return backend::sse41;
|
||||
if (cpu_flags & CPU_PROP_SSE2) return backend::sse2;
|
||||
|
||||
#elif defined(__aarch64__) || defined(__ARM_NEON)
|
||||
|
||||
if (cpu_flags & CPU_PROP_NEON) return backend::neon;
|
||||
|
||||
#endif
|
||||
return backend::scalar;
|
||||
}
|
||||
|
||||
void math_backend::install_from_cpu_flags(uint32_t cpu_flags)
|
||||
{
|
||||
{
|
||||
g_backend = choose_backend(cpu_flags);
|
||||
|
||||
switch (g_backend)
|
||||
{
|
||||
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
|
||||
case backend::avx2:
|
||||
float4::dispatch::install_avx2();
|
||||
float3::dispatch::install_avx2();
|
||||
mat44::dispatch::install_avx2();
|
||||
break;
|
||||
|
||||
case backend::avx:
|
||||
float4::dispatch::install_avx();
|
||||
float3::dispatch::install_avx();
|
||||
mat44::dispatch::install_avx();
|
||||
break;
|
||||
|
||||
case backend::sse41:
|
||||
float4::dispatch::install_sse41();
|
||||
float3::dispatch::install_sse41();
|
||||
mat44::dispatch::install_sse41();
|
||||
break;
|
||||
|
||||
case backend::sse2:
|
||||
float4::dispatch::install_sse2();
|
||||
float3::dispatch::install_sse2();
|
||||
mat44::dispatch::install_sse2();
|
||||
break;
|
||||
#elif defined(__aarch64__) || defined(__ARM_NEON)
|
||||
case backend::neon:
|
||||
float4::dispatch::install_neon();
|
||||
float3::dispatch::install_neon();
|
||||
mat44::dispatch::install_neon();
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
float4::dispatch::install_scalar();
|
||||
float3::dispatch::install_scalar();
|
||||
mat44::dispatch::install_scalar();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
95
Engine/source/math/public/math_backend.h
Normal file
95
Engine/source/math/public/math_backend.h
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
#pragma once
|
||||
#ifndef _MATH_BACKEND_H_
|
||||
#define _MATH_BACKEND_H_
|
||||
|
||||
#ifndef _MCONSTANTS_H_
|
||||
#include "math/mConstants.h"
|
||||
#endif
|
||||
#ifndef _PLATFORMASSERT_H_
|
||||
#include "platform/platformAssert.h"
|
||||
#endif
|
||||
#ifndef _FLOAT4_DISPATCH_H_
|
||||
#include "math/public/float4_dispatch.h"
|
||||
#endif
|
||||
#ifndef _FLOAT3_DISPATCH_H_
|
||||
#include "math/public/float3_dispatch.h"
|
||||
#endif
|
||||
#ifndef _MAT44_DISPATCH_H_
|
||||
#include "math/public/mat44_dispatch.h"
|
||||
#endif
|
||||
|
||||
namespace math_backend
|
||||
{
|
||||
enum class backend
|
||||
{
|
||||
scalar,
|
||||
sse2,
|
||||
sse41,
|
||||
avx,
|
||||
avx2,
|
||||
neon
|
||||
};
|
||||
|
||||
static backend g_backend = backend::scalar;
|
||||
backend choose_backend(U32 cpu_flags);
|
||||
void install_from_cpu_flags(uint32_t cpu_flags);
|
||||
}
|
||||
|
||||
#include <mutex>
|
||||
|
||||
namespace math_backend::float4::dispatch
|
||||
{
|
||||
//--------------------------------------------------
|
||||
// Thread-safe getter, ensures scalar installed if empty
|
||||
//--------------------------------------------------
|
||||
inline Float4Funcs& GetFloat4()
|
||||
{
|
||||
if (gFloat4.mul == nullptr)
|
||||
{
|
||||
static std::once_flag once;
|
||||
std::call_once(once, []{
|
||||
install_scalar();
|
||||
});
|
||||
}
|
||||
return gFloat4;
|
||||
}
|
||||
}
|
||||
|
||||
namespace math_backend::float3::dispatch
|
||||
{
|
||||
//--------------------------------------------------
|
||||
// Thread-safe getter, ensures scalar installed if empty
|
||||
//--------------------------------------------------
|
||||
inline Float3Funcs& GetFloat3()
|
||||
{
|
||||
if (gFloat3.mul == nullptr)
|
||||
{
|
||||
static std::once_flag once;
|
||||
std::call_once(once, []{
|
||||
install_scalar();
|
||||
});
|
||||
}
|
||||
return gFloat3;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
namespace math_backend::mat44::dispatch
|
||||
{
|
||||
//--------------------------------------------------
|
||||
// Thread-safe getter, ensures scalar installed if empty
|
||||
//--------------------------------------------------
|
||||
inline Mat44Funcs& GetMat44()
|
||||
{
|
||||
if (gMat44.mul_mat44 == nullptr)
|
||||
{
|
||||
static std::once_flag once;
|
||||
std::call_once(once, []{
|
||||
install_scalar();
|
||||
});
|
||||
}
|
||||
return gMat44;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // !_MATH_BACKEND_H_
|
||||
|
|
@ -381,8 +381,10 @@ void FrustumData::_update() const
|
|||
|
||||
// Transform the points into the desired culling space.
|
||||
|
||||
for( U32 i = 0; i < mPoints.size(); ++ i )
|
||||
mTransform.mulP( mPoints[ i ] );
|
||||
/*for( U32 i = 0; i < mPoints.size(); ++ i )
|
||||
mTransform.mulP( mPoints[ i ] );*/
|
||||
|
||||
mTransform.batch_mulP(mPoints.address(), mPoints.size());
|
||||
|
||||
// Update the axis aligned bounding box from
|
||||
// the newly transformed points.
|
||||
|
|
|
|||
|
|
@ -75,10 +75,12 @@ enum ProcessorProperties
|
|||
CPU_PROP_SSE4_1 = (1<<7), ///< Supports SSE4_1 instruction set extension.
|
||||
CPU_PROP_SSE4_2 = (1<<8), ///< Supports SSE4_2 instruction set extension.
|
||||
CPU_PROP_AVX = (1<<9), ///< Supports AVX256 instruction set extension.
|
||||
CPU_PROP_MP = (1<<10), ///< This is a multi-processor system.
|
||||
CPU_PROP_LE = (1<<11), ///< This processor is LITTLE ENDIAN.
|
||||
CPU_PROP_64bit = (1<<12), ///< This processor is 64-bit capable
|
||||
CPU_PROP_NEON = (1<<13), ///< Supports the Arm Neon instruction set extension.
|
||||
CPU_PROP_AVX2 = (1<<10), ///< Supports AVX256 instruction set extension.
|
||||
CPU_PROP_AVX512 = (1<<11), ///< Supports AVX512 instruction set extension.
|
||||
CPU_PROP_MP = (1<<12), ///< This is a multi-processor system.
|
||||
CPU_PROP_LE = (1<<13), ///< This processor is LITTLE ENDIAN.
|
||||
CPU_PROP_64bit = (1<<14), ///< This processor is 64-bit capable
|
||||
CPU_PROP_NEON = (1<<15), ///< Supports the Arm Neon instruction set extension.
|
||||
};
|
||||
|
||||
/// Processor info manager.
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
#import "math/mMath.h"
|
||||
#import "core/strings/stringFunctions.h"
|
||||
#include "console/engineAPI.h"
|
||||
#include "math/public/math_backend.h"
|
||||
|
||||
extern void mInstallLibrary_C();
|
||||
|
||||
|
|
@ -107,14 +108,10 @@ void Math::init(U32 properties)
|
|||
|
||||
Con::printf("Math Init:");
|
||||
Con::printf(" Installing Standard C extensions");
|
||||
mInstallLibrary_C();
|
||||
mInstallLibrary_C();
|
||||
|
||||
#ifdef TORQUE_CPU_X86
|
||||
if( properties & CPU_PROP_SSE )
|
||||
{
|
||||
Con::printf( " Installing SSE extensions" );
|
||||
}
|
||||
#endif
|
||||
Con::printf( " Installing ISA extensions" );
|
||||
math_backend::install_from_cpu_flags(properties);
|
||||
|
||||
Con::printf(" ");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@
|
|||
#include "math/mMath.h"
|
||||
#include "core/strings/stringFunctions.h"
|
||||
#include "console/engineAPI.h"
|
||||
#include "math/public/math_backend.h"
|
||||
|
||||
extern void mInstallLibrary_C();
|
||||
extern void mInstallLibrary_ASM();
|
||||
|
|
@ -90,29 +91,14 @@ void Math::init(U32 properties)
|
|||
Con::printf(" Installing Standard C extensions");
|
||||
mInstallLibrary_C();
|
||||
|
||||
Con::printf(" Installing ISA extensions");
|
||||
math_backend::install_from_cpu_flags(properties);
|
||||
|
||||
#if defined(TORQUE_CPU_X32) || defined(TORQUE_CPU_X64)
|
||||
Con::printf(" Installing Assembly extensions");
|
||||
mInstallLibrary_ASM();
|
||||
#endif
|
||||
|
||||
if (properties & CPU_PROP_FPU)
|
||||
{
|
||||
Con::printf(" Installing FPU extensions");
|
||||
}
|
||||
|
||||
if (properties & CPU_PROP_MMX)
|
||||
{
|
||||
Con::printf(" Installing MMX extensions");
|
||||
}
|
||||
|
||||
#if !defined(__MWERKS__) || (__MWERKS__ >= 0x2400)
|
||||
if (properties & CPU_PROP_SSE)
|
||||
{
|
||||
Con::printf(" Installing SSE extensions");
|
||||
}
|
||||
#endif //mwerks>2.4
|
||||
|
||||
Con::printf(" ");
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -72,13 +72,15 @@ enum CpuFlags
|
|||
|
||||
BIT_XSAVE_RESTORE = BIT(27),
|
||||
BIT_AVX = BIT(28),
|
||||
BIT_AVX2 = BIT(5),
|
||||
BIT_AVX512F = BIT(16)
|
||||
};
|
||||
|
||||
static void detectCpuFeatures(Platform::SystemInfo_struct::Processor &processor)
|
||||
{
|
||||
S32 cpuInfo[4];
|
||||
__cpuid(cpuInfo, 1);
|
||||
//U32 eax = cpuInfo[0]; // eax
|
||||
int nIds = cpuInfo[0];
|
||||
U32 edx = cpuInfo[3]; // edx
|
||||
U32 ecx = cpuInfo[2]; // ecx
|
||||
|
||||
|
|
@ -90,6 +92,18 @@ static void detectCpuFeatures(Platform::SystemInfo_struct::Processor &processor)
|
|||
processor.properties |= (ecx & BIT_SSE4_1) ? CPU_PROP_SSE4_1 : 0;
|
||||
processor.properties |= (ecx & BIT_SSE4_2) ? CPU_PROP_SSE4_2 : 0;
|
||||
|
||||
if (nIds >= 7)
|
||||
{
|
||||
int ext[4];
|
||||
__cpuidex(ext, 7, 0);
|
||||
|
||||
if (ext[1] & (BIT_AVX2)) // EBX bit 5
|
||||
processor.properties |= CPU_PROP_AVX2;
|
||||
|
||||
if (ext[1] & (BIT_AVX512F)) // AVX-512 Foundation
|
||||
processor.properties |= CPU_PROP_AVX512;
|
||||
}
|
||||
|
||||
// AVX detection requires that xsaverestore is supported
|
||||
if (ecx & BIT_XSAVE_RESTORE && ecx & BIT_AVX)
|
||||
{
|
||||
|
|
@ -176,6 +190,10 @@ void Processor::init()
|
|||
Con::printf(" SSE4.2 detected" );
|
||||
if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX)
|
||||
Con::printf(" AVX detected");
|
||||
if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX2)
|
||||
Con::printf(" AVX2 detected");
|
||||
if (Platform::SystemInfo.processor.properties & CPU_PROP_AVX512)
|
||||
Con::printf(" AVX512 detected");
|
||||
|
||||
if (Platform::SystemInfo.processor.properties & CPU_PROP_MP)
|
||||
Con::printf(" MultiCore CPU detected [%i cores, %i logical]", Platform::SystemInfo.processor.numPhysicalProcessors, Platform::SystemInfo.processor.numLogicalProcessors);
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@
|
|||
#include "console/engineAPI.h"
|
||||
|
||||
#include "math/mMath.h"
|
||||
|
||||
#include "math/public/math_backend.h"
|
||||
|
||||
extern void mInstallLibrary_C();
|
||||
extern void mInstallLibrary_ASM();
|
||||
|
|
@ -98,24 +98,12 @@ void Math::init(U32 properties)
|
|||
Con::printf(" Installing Standard C extensions");
|
||||
mInstallLibrary_C();
|
||||
|
||||
Con::printf(" Installing ISA extensions");
|
||||
math_backend::install_from_cpu_flags(properties);
|
||||
|
||||
Con::printf(" Installing Assembly extensions");
|
||||
mInstallLibrary_ASM();
|
||||
|
||||
if (properties & CPU_PROP_FPU)
|
||||
{
|
||||
Con::printf(" Installing FPU extensions");
|
||||
}
|
||||
|
||||
if (properties & CPU_PROP_MMX)
|
||||
{
|
||||
Con::printf(" Installing MMX extensions");
|
||||
}
|
||||
|
||||
if (properties & CPU_PROP_SSE)
|
||||
{
|
||||
Con::printf(" Installing SSE extensions");
|
||||
}
|
||||
|
||||
Con::printf(" ");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -250,7 +250,7 @@ MatrixF AssimpAppNode::getNodeTransform(F32 time)
|
|||
// Check for inverted node coordinate spaces => can happen when modelers
|
||||
// use the 'mirror' tool in their 3d app. Shows up as negative <scale>
|
||||
// transforms in the collada model.
|
||||
if (m_matF_determinant(nodeTransform) < 0.0f)
|
||||
if (nodeTransform.determinant() < 0.0f)
|
||||
{
|
||||
// Mark this node as inverted so we can mirror mesh geometry, then
|
||||
// de-invert the transform matrix
|
||||
|
|
|
|||
|
|
@ -1099,7 +1099,7 @@ void ColladaAppMesh::lookupSkinData()
|
|||
// Inverted node coordinate spaces (negative scale factor) are corrected
|
||||
// in ColladaAppNode::getNodeTransform, so need to apply the same operation
|
||||
// here to match
|
||||
if (m_matF_determinant(invBind) < 0.0f)
|
||||
if (invBind.determinant() < 0.0f)
|
||||
initialTransforms[iJoint].scale(Point3F(1, 1, -1));
|
||||
|
||||
initialTransforms[iJoint].mul(invBind);
|
||||
|
|
|
|||
|
|
@ -192,7 +192,7 @@ MatrixF ColladaAppNode::getNodeTransform(F32 time)
|
|||
// Check for inverted node coordinate spaces => can happen when modelers
|
||||
// use the 'mirror' tool in their 3d app. Shows up as negative <scale>
|
||||
// transforms in the collada model.
|
||||
if (m_matF_determinant(nodeTransform) < 0.0f)
|
||||
if (nodeTransform.determinant() < 0.0f)
|
||||
{
|
||||
// Mark this node as inverted so we can mirror mesh geometry, then
|
||||
// de-invert the transform matrix
|
||||
|
|
|
|||
|
|
@ -36,6 +36,8 @@ void FPSTracker::reset()
|
|||
{
|
||||
fpsNext = (F32)Platform::getRealMilliseconds()/1000.0f + mUpdateInterval;
|
||||
|
||||
fpsAccumTime = 0.0f;
|
||||
fpsAccumVirtualTime = 0.0f;
|
||||
fpsRealLast = 0.0f;
|
||||
fpsReal = 0.0f;
|
||||
fpsRealMin = 0.000001f; // Avoid division by zero.
|
||||
|
|
@ -51,42 +53,60 @@ void FPSTracker::update()
|
|||
F32 realSeconds = (F32)Platform::getRealMilliseconds()/1000.0f;
|
||||
F32 virtualSeconds = (F32)Platform::getVirtualMilliseconds()/1000.0f;
|
||||
|
||||
fpsFrames++;
|
||||
if (fpsFrames > 1)
|
||||
if (fpsRealLast == 0.0f)
|
||||
{
|
||||
fpsReal = fpsReal*(1.0-alpha) + (realSeconds-fpsRealLast)*alpha;
|
||||
fpsVirtual = fpsVirtual*(1.0-alpha) + (virtualSeconds-fpsVirtualLast)*alpha;
|
||||
|
||||
if( fpsFrames > 10 ) // Wait a few frames before updating these.
|
||||
{
|
||||
// Update min/max. This is a bit counter-intuitive, as the comparisons are
|
||||
// inversed because these are all one-over-x values.
|
||||
|
||||
if( fpsReal > fpsRealMin )
|
||||
fpsRealMin = fpsReal;
|
||||
if( fpsReal < fpsRealMax )
|
||||
fpsRealMax = fpsReal;
|
||||
}
|
||||
fpsRealLast = realSeconds;
|
||||
fpsVirtualLast = virtualSeconds;
|
||||
return;
|
||||
}
|
||||
|
||||
fpsRealLast = realSeconds;
|
||||
F32 frameTimeReal = realSeconds - fpsRealLast;
|
||||
F32 frameTimeVirtual = virtualSeconds - fpsVirtualLast;
|
||||
|
||||
fpsRealLast = realSeconds;
|
||||
fpsVirtualLast = virtualSeconds;
|
||||
|
||||
// update variables every few frames
|
||||
F32 update = fpsRealLast - fpsNext;
|
||||
if (update > 0.5f)
|
||||
{
|
||||
F32 delta = realSeconds - fpsNext;
|
||||
Con::setVariable( "fps::frameDelta",avar("%g", delta));
|
||||
Con::setVariable( "fps::real", avar( "%4.1f", 1.0f / fpsReal ) );
|
||||
Con::setVariable( "fps::realMin", avar( "%4.1f", 1.0f / fpsRealMin ) );
|
||||
Con::setVariable( "fps::realMax", avar( "%4.1f", 1.0f / fpsRealMax ) );
|
||||
Con::setVariable( "fps::virtual", avar( "%4.1f", 1.0f / fpsVirtual ) );
|
||||
// Accumulate for windowed FPS calculation
|
||||
fpsFrames++;
|
||||
fpsAccumTime += frameTimeReal;
|
||||
fpsAccumVirtualTime += frameTimeVirtual;
|
||||
|
||||
if (update > mUpdateInterval)
|
||||
fpsNext = fpsRealLast + mUpdateInterval;
|
||||
else
|
||||
fpsNext += mUpdateInterval;
|
||||
// Only update console values at interval
|
||||
if (realSeconds >= fpsNext)
|
||||
{
|
||||
fpsReal = 0.0f;
|
||||
fpsVirtual = 0.0f;
|
||||
|
||||
if (fpsAccumTime > 0.0f)
|
||||
fpsReal = fpsFrames / fpsAccumTime;
|
||||
|
||||
if (fpsAccumVirtualTime > 0.0f)
|
||||
fpsVirtual = fpsFrames / fpsAccumVirtualTime;
|
||||
|
||||
// Update min/max correctly (these are FPS now)
|
||||
if (fpsReal > 0.0f)
|
||||
{
|
||||
if (fpsReal < fpsRealMin)
|
||||
fpsRealMin = fpsReal;
|
||||
|
||||
if (fpsReal > fpsRealMax)
|
||||
fpsRealMax = fpsReal;
|
||||
}
|
||||
|
||||
// frameDelta = actual accumulated real time over window
|
||||
Con::setVariable("fps::frameTimeMs", avar("%4.3f", (fpsAccumTime / fpsFrames) * 1000.0f));
|
||||
Con::setVariable("fps::frameDelta", avar("%g", fpsAccumTime));
|
||||
Con::setVariable("fps::real", avar("%4.1f", fpsReal));
|
||||
Con::setVariable("fps::realMin", avar("%4.1f", fpsRealMin));
|
||||
Con::setVariable("fps::realMax", avar("%4.1f", fpsRealMax));
|
||||
Con::setVariable("fps::virtual", avar("%4.1f", fpsVirtual));
|
||||
|
||||
// Reset window
|
||||
fpsFrames = 0;
|
||||
fpsAccumTime = 0.0f;
|
||||
fpsAccumVirtualTime = 0.0f;
|
||||
|
||||
fpsNext = realSeconds + mUpdateInterval;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -27,6 +27,8 @@
|
|||
|
||||
struct FPSTracker
|
||||
{
|
||||
F32 fpsAccumTime;
|
||||
F32 fpsAccumVirtualTime;
|
||||
F32 fpsRealLast;
|
||||
F32 fpsReal;
|
||||
F32 fpsRealMin;
|
||||
|
|
@ -48,4 +50,4 @@ struct FPSTracker
|
|||
|
||||
extern FPSTracker gFPS;
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue