From 88ffdd01cd9aac5cc684d3c89cf4ee54f6c368af Mon Sep 17 00:00:00 2001 From: marauder2k7 Date: Fri, 6 Mar 2026 09:41:35 +0000 Subject: [PATCH] fix linux build --- Engine/source/math/impl/mat44_impl.inl | 6 +-- Engine/source/math/isa/avx/avx_intrinsics.h | 52 +++++++++---------- Engine/source/math/isa/avx2/avx2_intrinsics.h | 52 +++++++++---------- Engine/source/math/isa/neon/neon_intrinsics.h | 26 +++++----- Engine/source/math/isa/sse2/sse2_intrinsics.h | 26 +++++----- .../source/math/isa/sse41/sse41_intrinsics.h | 26 +++++----- Engine/source/math/mMatrix.h | 12 ++--- 7 files changed, 99 insertions(+), 101 deletions(-) diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl index 1bf009a50..da6971d0d 100644 --- a/Engine/source/math/impl/mat44_impl.inl +++ b/Engine/source/math/impl/mat44_impl.inl @@ -395,15 +395,13 @@ namespace math_backend::mat44 inline void mat44_batch_mul_pos3(const float* m, const float* points, int count, float* result) { int i = 0; - f32x4x4 ma = m_load(m); - // AVX has 8 lanes to play with #if defined(MATH_SIMD_AVX2) || defined(MATH_SIMD_AVX) // 8-wide AVX only for (; i + 8 <= count; i += 8) { vec4_batch8 va = load_vec3_batch8(&points[i*3], 1.0f, false); - vec4_batch8 vr = m_mul_pos3_batch8(ma, va); + vec4_batch8 vr = m_mul_pos3_batch8(m, va); store_vec3_batch8(&result[i*3], vr); } #endif // MATH_SIMD_AVX2 || MATH_SIMD_AVX @@ -412,7 +410,7 @@ namespace math_backend::mat44 for (; i + 4 <= count; i += 4) { vec4_batch4 va = load_vec3_batch4(&points[i * 3], 1.0f, false); - vec4_batch4 vr = m_mul_pos3_batch4(ma, va); + vec4_batch4 vr = m_mul_pos3_batch4(m, va); store_vec3_batch4(&result[i * 3], vr); } diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h index 069b15850..40337e8c5 100644 --- a/Engine/source/math/isa/avx/avx_intrinsics.h +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -517,24 +517,24 @@ namespace } // Batch 8 mul_Vec4. - inline vec4_batch8 m_mul_pos3_batch8(const f32x4x4& m, const vec4_batch8& v) + inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v) { vec4_batch8 r; - __m256 m00 = _mm256_set1_ps(m.r0.m128_f32[0]); - __m256 m01 = _mm256_set1_ps(m.r0.m128_f32[1]); - __m256 m02 = _mm256_set1_ps(m.r0.m128_f32[2]); - __m256 m03 = _mm256_set1_ps(m.r0.m128_f32[3]); + __m256 m00 = _mm256_set1_ps(m[0]); + __m256 m01 = _mm256_set1_ps(m[1]); + __m256 m02 = _mm256_set1_ps(m[2]); + __m256 m03 = _mm256_set1_ps(m[3]); - __m256 m10 = _mm256_set1_ps(m.r1.m128_f32[0]); - __m256 m11 = _mm256_set1_ps(m.r1.m128_f32[1]); - __m256 m12 = _mm256_set1_ps(m.r1.m128_f32[2]); - __m256 m13 = _mm256_set1_ps(m.r1.m128_f32[3]); + __m256 m10 = _mm256_set1_ps(m[4]); + __m256 m11 = _mm256_set1_ps(m[5]); + __m256 m12 = _mm256_set1_ps(m[6]); + __m256 m13 = _mm256_set1_ps(m[7]); - __m256 m20 = _mm256_set1_ps(m.r2.m128_f32[0]); - __m256 m21 = _mm256_set1_ps(m.r2.m128_f32[1]); - __m256 m22 = _mm256_set1_ps(m.r2.m128_f32[2]); - __m256 m23 = _mm256_set1_ps(m.r2.m128_f32[3]); + __m256 m20 = _mm256_set1_ps(m[8]); + __m256 m21 = _mm256_set1_ps(m[9]); + __m256 m22 = _mm256_set1_ps(m[10]); + __m256 m23 = _mm256_set1_ps(m[11]); // // row0 dot @@ -570,24 +570,24 @@ namespace } // Batch 4 mul_Vec4. - inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) { vec4_batch4 r; - f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); - f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); - f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); - f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + f32x4 m00 = _mm_set1_ps(m[0]); + f32x4 m01 = _mm_set1_ps(m[1]); + f32x4 m02 = _mm_set1_ps(m[2]); + f32x4 m03 = _mm_set1_ps(m[3]); - f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); - f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); - f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); - f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + f32x4 m10 = _mm_set1_ps(m[4]); + f32x4 m11 = _mm_set1_ps(m[5]); + f32x4 m12 = _mm_set1_ps(m[6]); + f32x4 m13 = _mm_set1_ps(m[7]); - f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); - f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); - f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); - f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + f32x4 m20 = _mm_set1_ps(m[8]); + f32x4 m21 = _mm_set1_ps(m[9]); + f32x4 m22 = _mm_set1_ps(m[10]); + f32x4 m23 = _mm_set1_ps(m[11]); // // row0 dot diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h index a10ba77d8..3291c3521 100644 --- a/Engine/source/math/isa/avx2/avx2_intrinsics.h +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -517,24 +517,24 @@ namespace } // Batch 8 mul_Vec4. - inline vec4_batch8 m_mul_pos3_batch8(const f32x4x4& m, const vec4_batch8& v) + inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v) { vec4_batch8 r; - __m256 m00 = _mm256_set1_ps(m.r0.m128_f32[0]); - __m256 m01 = _mm256_set1_ps(m.r0.m128_f32[1]); - __m256 m02 = _mm256_set1_ps(m.r0.m128_f32[2]); - __m256 m03 = _mm256_set1_ps(m.r0.m128_f32[3]); + __m256 m00 = _mm256_set1_ps(m[0]); + __m256 m01 = _mm256_set1_ps(m[1]); + __m256 m02 = _mm256_set1_ps(m[2]); + __m256 m03 = _mm256_set1_ps(m[3]); - __m256 m10 = _mm256_set1_ps(m.r1.m128_f32[0]); - __m256 m11 = _mm256_set1_ps(m.r1.m128_f32[1]); - __m256 m12 = _mm256_set1_ps(m.r1.m128_f32[2]); - __m256 m13 = _mm256_set1_ps(m.r1.m128_f32[3]); + __m256 m10 = _mm256_set1_ps(m[4]); + __m256 m11 = _mm256_set1_ps(m[5]); + __m256 m12 = _mm256_set1_ps(m[6]); + __m256 m13 = _mm256_set1_ps(m[7]); - __m256 m20 = _mm256_set1_ps(m.r2.m128_f32[0]); - __m256 m21 = _mm256_set1_ps(m.r2.m128_f32[1]); - __m256 m22 = _mm256_set1_ps(m.r2.m128_f32[2]); - __m256 m23 = _mm256_set1_ps(m.r2.m128_f32[3]); + __m256 m20 = _mm256_set1_ps(m[8]); + __m256 m21 = _mm256_set1_ps(m[9]); + __m256 m22 = _mm256_set1_ps(m[10]); + __m256 m23 = _mm256_set1_ps(m[11]); // // row0 dot @@ -570,24 +570,24 @@ namespace } // Batch 4 mul_Vec4. - inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) { vec4_batch4 r; - f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); - f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); - f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); - f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + f32x4 m00 = _mm_set1_ps(m[0]); + f32x4 m01 = _mm_set1_ps(m[1]); + f32x4 m02 = _mm_set1_ps(m[2]); + f32x4 m03 = _mm_set1_ps(m[3]); - f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); - f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); - f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); - f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + f32x4 m10 = _mm_set1_ps(m[4]); + f32x4 m11 = _mm_set1_ps(m[5]); + f32x4 m12 = _mm_set1_ps(m[6]); + f32x4 m13 = _mm_set1_ps(m[7]); - f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); - f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); - f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); - f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + f32x4 m20 = _mm_set1_ps(m[8]); + f32x4 m21 = _mm_set1_ps(m[9]); + f32x4 m22 = _mm_set1_ps(m[10]); + f32x4 m23 = _mm_set1_ps(m[11]); // // row0 dot diff --git a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h index fc93c9c97..e118b1600 100644 --- a/Engine/source/math/isa/neon/neon_intrinsics.h +++ b/Engine/source/math/isa/neon/neon_intrinsics.h @@ -452,24 +452,24 @@ namespace } } - inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) { vec4_batch4 r; - float32x4_t m00 = vdupq_n_f32(m.r0[0]); - float32x4_t m01 = vdupq_n_f32(m.r0[1]); - float32x4_t m02 = vdupq_n_f32(m.r0[2]); - float32x4_t m03 = vdupq_n_f32(m.r0[3]); + float32x4_t m00 = vdupq_n_f32(m[0]); + float32x4_t m01 = vdupq_n_f32(m[1]); + float32x4_t m02 = vdupq_n_f32(m[2]); + float32x4_t m03 = vdupq_n_f32(m[3]); - float32x4_t m10 = vdupq_n_f32(m.r1[0]); - float32x4_t m11 = vdupq_n_f32(m.r1[1]); - float32x4_t m12 = vdupq_n_f32(m.r1[2]); - float32x4_t m13 = vdupq_n_f32(m.r1[3]); + float32x4_t m10 = vdupq_n_f32(m[4]); + float32x4_t m11 = vdupq_n_f32(m[5]); + float32x4_t m12 = vdupq_n_f32(m[6]); + float32x4_t m13 = vdupq_n_f32(m[7]); - float32x4_t m20 = vdupq_n_f32(m.r2[0]); - float32x4_t m21 = vdupq_n_f32(m.r2[1]); - float32x4_t m22 = vdupq_n_f32(m.r2[2]); - float32x4_t m23 = vdupq_n_f32(m.r2[3]); + float32x4_t m20 = vdupq_n_f32(m[8]); + float32x4_t m21 = vdupq_n_f32(m[9]); + float32x4_t m22 = vdupq_n_f32(m[10]); + float32x4_t m23 = vdupq_n_f32(m[11]); // row0 dot r.x = vaddq_f32( diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h index df3852340..a582d3793 100644 --- a/Engine/source/math/isa/sse2/sse2_intrinsics.h +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -601,24 +601,24 @@ namespace } // Batch 4 mul_Vec4. - inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) { vec4_batch4 r; - f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); - f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); - f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); - f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + f32x4 m00 = _mm_set1_ps(m[0]); + f32x4 m01 = _mm_set1_ps(m[1]); + f32x4 m02 = _mm_set1_ps(m[2]); + f32x4 m03 = _mm_set1_ps(m[3]); - f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); - f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); - f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); - f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + f32x4 m10 = _mm_set1_ps(m[4]); + f32x4 m11 = _mm_set1_ps(m[5]); + f32x4 m12 = _mm_set1_ps(m[6]); + f32x4 m13 = _mm_set1_ps(m[7]); - f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); - f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); - f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); - f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + f32x4 m20 = _mm_set1_ps(m[8]); + f32x4 m21 = _mm_set1_ps(m[9]); + f32x4 m22 = _mm_set1_ps(m[10]); + f32x4 m23 = _mm_set1_ps(m[11]); // // row0 dot diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h index 45c056dcf..80dfdbc71 100644 --- a/Engine/source/math/isa/sse41/sse41_intrinsics.h +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -571,24 +571,24 @@ namespace } // Batch 4 mul_Vec4. - inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v) { vec4_batch4 r; - f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); - f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); - f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); - f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + f32x4 m00 = _mm_set1_ps(m[0]); + f32x4 m01 = _mm_set1_ps(m[1]); + f32x4 m02 = _mm_set1_ps(m[2]); + f32x4 m03 = _mm_set1_ps(m[3]); - f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); - f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); - f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); - f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + f32x4 m10 = _mm_set1_ps(m[4]); + f32x4 m11 = _mm_set1_ps(m[5]); + f32x4 m12 = _mm_set1_ps(m[6]); + f32x4 m13 = _mm_set1_ps(m[7]); - f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); - f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); - f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); - f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + f32x4 m20 = _mm_set1_ps(m[8]); + f32x4 m21 = _mm_set1_ps(m[9]); + f32x4 m22 = _mm_set1_ps(m[10]); + f32x4 m23 = _mm_set1_ps(m[11]); // // row0 dot diff --git a/Engine/source/math/mMatrix.h b/Engine/source/math/mMatrix.h index 873ff967a..3b160c94c 100644 --- a/Engine/source/math/mMatrix.h +++ b/Engine/source/math/mMatrix.h @@ -231,8 +231,8 @@ public: void mul( Point4F& p ) const; ///< M * p -> p (full [4x4] * [1x4]) void mulP( Point3F& p ) const; ///< M * p -> p (assume w = 1.0f) void mulP( const Point3F &p, Point3F *d) const; ///< M * p -> d (assume w = 1.0f) - void batch_mulP(Point3F* points, size_t count) const; - void batch_mulP(const Point3F* points, size_t count, Point3F* out) const; + void batch_mulP(Point3F* points, U32 count) const; + void batch_mulP(const Point3F* points, U32 count, Point3F* out) const; void mulV( VectorF& p ) const; ///< M * v -> v (assume w = 0.0f) void mulV( const VectorF &p, Point3F *d) const; ///< M * v -> d (assume w = 0.0f) @@ -479,7 +479,7 @@ inline void MatrixF::mulP( const Point3F &p, Point3F *d) const GetMat44().mul_pos3(*this, p, *d); } -inline void MatrixF::batch_mulP(Point3F* points, size_t count) const +inline void MatrixF::batch_mulP(Point3F* points, U32 count) const { // Allocate temporary buffer Point3F* temp = new Point3F[count]; @@ -487,7 +487,7 @@ inline void MatrixF::batch_mulP(Point3F* points, size_t count) const GetMat44().batch_mul_pos3(m, reinterpret_cast(points), count, reinterpret_cast(temp)); // Copy the results back to out - for (size_t i = 0; i < count; ++i) + for (U32 i = 0; i < count; ++i) { points[i] = temp[i]; } @@ -496,7 +496,7 @@ inline void MatrixF::batch_mulP(Point3F* points, size_t count) const delete[] temp; } -inline void MatrixF::batch_mulP(const Point3F* points, size_t count, Point3F* out) const +inline void MatrixF::batch_mulP(const Point3F* points, U32 count, Point3F* out) const { // Allocate temporary buffer Point3F* temp = new Point3F[count]; @@ -504,7 +504,7 @@ inline void MatrixF::batch_mulP(const Point3F* points, size_t count, Point3F* ou GetMat44().batch_mul_pos3(m, reinterpret_cast(points), count, reinterpret_cast(temp)); // Copy the results back to out - for (size_t i = 0; i < count; ++i) + for (U32 i = 0; i < count; ++i) { out[i] = temp[i]; }