fix linux build

This commit is contained in:
marauder2k7 2026-03-06 09:41:35 +00:00
parent c09d5a4579
commit 88ffdd01cd
7 changed files with 99 additions and 101 deletions

View file

@@ -395,15 +395,13 @@ namespace math_backend::mat44
inline void mat44_batch_mul_pos3(const float* m, const float* points, int count, float* result)
{
int i = 0;
f32x4x4 ma = m_load(m);
// AVX has 8 lanes to play with
#if defined(MATH_SIMD_AVX2) || defined(MATH_SIMD_AVX)
// 8-wide AVX only
for (; i + 8 <= count; i += 8)
{
vec4_batch8 va = load_vec3_batch8(&points[i*3], 1.0f, false);
vec4_batch8 vr = m_mul_pos3_batch8(ma, va);
vec4_batch8 vr = m_mul_pos3_batch8(m, va);
store_vec3_batch8(&result[i*3], vr);
}
#endif // MATH_SIMD_AVX2 || MATH_SIMD_AVX
@@ -412,7 +410,7 @@ namespace math_backend::mat44
for (; i + 4 <= count; i += 4)
{
vec4_batch4 va = load_vec3_batch4(&points[i * 3], 1.0f, false);
vec4_batch4 vr = m_mul_pos3_batch4(ma, va);
vec4_batch4 vr = m_mul_pos3_batch4(m, va);
store_vec3_batch4(&result[i * 3], vr);
}

View file

@@ -517,24 +517,24 @@ namespace
}
// Batch 8 mul_Vec4.
inline vec4_batch8 m_mul_pos3_batch8(const f32x4x4& m, const vec4_batch8& v)
inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v)
{
vec4_batch8 r;
__m256 m00 = _mm256_set1_ps(m.r0.m128_f32[0]);
__m256 m01 = _mm256_set1_ps(m.r0.m128_f32[1]);
__m256 m02 = _mm256_set1_ps(m.r0.m128_f32[2]);
__m256 m03 = _mm256_set1_ps(m.r0.m128_f32[3]);
__m256 m00 = _mm256_set1_ps(m[0]);
__m256 m01 = _mm256_set1_ps(m[1]);
__m256 m02 = _mm256_set1_ps(m[2]);
__m256 m03 = _mm256_set1_ps(m[3]);
__m256 m10 = _mm256_set1_ps(m.r1.m128_f32[0]);
__m256 m11 = _mm256_set1_ps(m.r1.m128_f32[1]);
__m256 m12 = _mm256_set1_ps(m.r1.m128_f32[2]);
__m256 m13 = _mm256_set1_ps(m.r1.m128_f32[3]);
__m256 m10 = _mm256_set1_ps(m[4]);
__m256 m11 = _mm256_set1_ps(m[5]);
__m256 m12 = _mm256_set1_ps(m[6]);
__m256 m13 = _mm256_set1_ps(m[7]);
__m256 m20 = _mm256_set1_ps(m.r2.m128_f32[0]);
__m256 m21 = _mm256_set1_ps(m.r2.m128_f32[1]);
__m256 m22 = _mm256_set1_ps(m.r2.m128_f32[2]);
__m256 m23 = _mm256_set1_ps(m.r2.m128_f32[3]);
__m256 m20 = _mm256_set1_ps(m[8]);
__m256 m21 = _mm256_set1_ps(m[9]);
__m256 m22 = _mm256_set1_ps(m[10]);
__m256 m23 = _mm256_set1_ps(m[11]);
//
// row0 dot
@@ -570,24 +570,24 @@ namespace
}
// Batch 4 mul_Vec4.
inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v)
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]);
f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]);
f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]);
f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]);
f32x4 m00 = _mm_set1_ps(m[0]);
f32x4 m01 = _mm_set1_ps(m[1]);
f32x4 m02 = _mm_set1_ps(m[2]);
f32x4 m03 = _mm_set1_ps(m[3]);
f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]);
f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]);
f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]);
f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]);
f32x4 m10 = _mm_set1_ps(m[4]);
f32x4 m11 = _mm_set1_ps(m[5]);
f32x4 m12 = _mm_set1_ps(m[6]);
f32x4 m13 = _mm_set1_ps(m[7]);
f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]);
f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]);
f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]);
f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]);
f32x4 m20 = _mm_set1_ps(m[8]);
f32x4 m21 = _mm_set1_ps(m[9]);
f32x4 m22 = _mm_set1_ps(m[10]);
f32x4 m23 = _mm_set1_ps(m[11]);
//
// row0 dot

View file

@@ -517,24 +517,24 @@ namespace
}
// Batch 8 mul_Vec4.
inline vec4_batch8 m_mul_pos3_batch8(const f32x4x4& m, const vec4_batch8& v)
inline vec4_batch8 m_mul_pos3_batch8(const float* m, const vec4_batch8& v)
{
vec4_batch8 r;
__m256 m00 = _mm256_set1_ps(m.r0.m128_f32[0]);
__m256 m01 = _mm256_set1_ps(m.r0.m128_f32[1]);
__m256 m02 = _mm256_set1_ps(m.r0.m128_f32[2]);
__m256 m03 = _mm256_set1_ps(m.r0.m128_f32[3]);
__m256 m00 = _mm256_set1_ps(m[0]);
__m256 m01 = _mm256_set1_ps(m[1]);
__m256 m02 = _mm256_set1_ps(m[2]);
__m256 m03 = _mm256_set1_ps(m[3]);
__m256 m10 = _mm256_set1_ps(m.r1.m128_f32[0]);
__m256 m11 = _mm256_set1_ps(m.r1.m128_f32[1]);
__m256 m12 = _mm256_set1_ps(m.r1.m128_f32[2]);
__m256 m13 = _mm256_set1_ps(m.r1.m128_f32[3]);
__m256 m10 = _mm256_set1_ps(m[4]);
__m256 m11 = _mm256_set1_ps(m[5]);
__m256 m12 = _mm256_set1_ps(m[6]);
__m256 m13 = _mm256_set1_ps(m[7]);
__m256 m20 = _mm256_set1_ps(m.r2.m128_f32[0]);
__m256 m21 = _mm256_set1_ps(m.r2.m128_f32[1]);
__m256 m22 = _mm256_set1_ps(m.r2.m128_f32[2]);
__m256 m23 = _mm256_set1_ps(m.r2.m128_f32[3]);
__m256 m20 = _mm256_set1_ps(m[8]);
__m256 m21 = _mm256_set1_ps(m[9]);
__m256 m22 = _mm256_set1_ps(m[10]);
__m256 m23 = _mm256_set1_ps(m[11]);
//
// row0 dot
@@ -570,24 +570,24 @@ namespace
}
// Batch 4 mul_Vec4.
inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v)
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]);
f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]);
f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]);
f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]);
f32x4 m00 = _mm_set1_ps(m[0]);
f32x4 m01 = _mm_set1_ps(m[1]);
f32x4 m02 = _mm_set1_ps(m[2]);
f32x4 m03 = _mm_set1_ps(m[3]);
f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]);
f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]);
f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]);
f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]);
f32x4 m10 = _mm_set1_ps(m[4]);
f32x4 m11 = _mm_set1_ps(m[5]);
f32x4 m12 = _mm_set1_ps(m[6]);
f32x4 m13 = _mm_set1_ps(m[7]);
f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]);
f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]);
f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]);
f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]);
f32x4 m20 = _mm_set1_ps(m[8]);
f32x4 m21 = _mm_set1_ps(m[9]);
f32x4 m22 = _mm_set1_ps(m[10]);
f32x4 m23 = _mm_set1_ps(m[11]);
//
// row0 dot

View file

@@ -452,24 +452,24 @@ namespace
}
}
inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v)
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
float32x4_t m00 = vdupq_n_f32(m.r0[0]);
float32x4_t m01 = vdupq_n_f32(m.r0[1]);
float32x4_t m02 = vdupq_n_f32(m.r0[2]);
float32x4_t m03 = vdupq_n_f32(m.r0[3]);
float32x4_t m00 = vdupq_n_f32(m[0]);
float32x4_t m01 = vdupq_n_f32(m[1]);
float32x4_t m02 = vdupq_n_f32(m[2]);
float32x4_t m03 = vdupq_n_f32(m[3]);
float32x4_t m10 = vdupq_n_f32(m.r1[0]);
float32x4_t m11 = vdupq_n_f32(m.r1[1]);
float32x4_t m12 = vdupq_n_f32(m.r1[2]);
float32x4_t m13 = vdupq_n_f32(m.r1[3]);
float32x4_t m10 = vdupq_n_f32(m[4]);
float32x4_t m11 = vdupq_n_f32(m[5]);
float32x4_t m12 = vdupq_n_f32(m[6]);
float32x4_t m13 = vdupq_n_f32(m[7]);
float32x4_t m20 = vdupq_n_f32(m.r2[0]);
float32x4_t m21 = vdupq_n_f32(m.r2[1]);
float32x4_t m22 = vdupq_n_f32(m.r2[2]);
float32x4_t m23 = vdupq_n_f32(m.r2[3]);
float32x4_t m20 = vdupq_n_f32(m[8]);
float32x4_t m21 = vdupq_n_f32(m[9]);
float32x4_t m22 = vdupq_n_f32(m[10]);
float32x4_t m23 = vdupq_n_f32(m[11]);
// row0 dot
r.x = vaddq_f32(

View file

@@ -601,24 +601,24 @@ namespace
}
// Batch 4 mul_Vec4.
inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v)
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]);
f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]);
f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]);
f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]);
f32x4 m00 = _mm_set1_ps(m[0]);
f32x4 m01 = _mm_set1_ps(m[1]);
f32x4 m02 = _mm_set1_ps(m[2]);
f32x4 m03 = _mm_set1_ps(m[3]);
f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]);
f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]);
f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]);
f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]);
f32x4 m10 = _mm_set1_ps(m[4]);
f32x4 m11 = _mm_set1_ps(m[5]);
f32x4 m12 = _mm_set1_ps(m[6]);
f32x4 m13 = _mm_set1_ps(m[7]);
f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]);
f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]);
f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]);
f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]);
f32x4 m20 = _mm_set1_ps(m[8]);
f32x4 m21 = _mm_set1_ps(m[9]);
f32x4 m22 = _mm_set1_ps(m[10]);
f32x4 m23 = _mm_set1_ps(m[11]);
//
// row0 dot

View file

@@ -571,24 +571,24 @@ namespace
}
// Batch 4 mul_Vec4.
inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v)
inline vec4_batch4 m_mul_pos3_batch4(const float* m, const vec4_batch4& v)
{
vec4_batch4 r;
f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]);
f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]);
f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]);
f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]);
f32x4 m00 = _mm_set1_ps(m[0]);
f32x4 m01 = _mm_set1_ps(m[1]);
f32x4 m02 = _mm_set1_ps(m[2]);
f32x4 m03 = _mm_set1_ps(m[3]);
f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]);
f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]);
f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]);
f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]);
f32x4 m10 = _mm_set1_ps(m[4]);
f32x4 m11 = _mm_set1_ps(m[5]);
f32x4 m12 = _mm_set1_ps(m[6]);
f32x4 m13 = _mm_set1_ps(m[7]);
f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]);
f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]);
f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]);
f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]);
f32x4 m20 = _mm_set1_ps(m[8]);
f32x4 m21 = _mm_set1_ps(m[9]);
f32x4 m22 = _mm_set1_ps(m[10]);
f32x4 m23 = _mm_set1_ps(m[11]);
//
// row0 dot

View file

@@ -231,8 +231,8 @@ public:
void mul( Point4F& p ) const; ///< M * p -> p (full [4x4] * [1x4])
void mulP( Point3F& p ) const; ///< M * p -> p (assume w = 1.0f)
void mulP( const Point3F &p, Point3F *d) const; ///< M * p -> d (assume w = 1.0f)
void batch_mulP(Point3F* points, size_t count) const;
void batch_mulP(const Point3F* points, size_t count, Point3F* out) const;
void batch_mulP(Point3F* points, U32 count) const;
void batch_mulP(const Point3F* points, U32 count, Point3F* out) const;
void mulV( VectorF& p ) const; ///< M * v -> v (assume w = 0.0f)
void mulV( const VectorF &p, Point3F *d) const; ///< M * v -> d (assume w = 0.0f)
@@ -479,7 +479,7 @@ inline void MatrixF::mulP( const Point3F &p, Point3F *d) const
GetMat44().mul_pos3(*this, p, *d);
}
inline void MatrixF::batch_mulP(Point3F* points, size_t count) const
inline void MatrixF::batch_mulP(Point3F* points, U32 count) const
{
// Allocate temporary buffer
Point3F* temp = new Point3F[count];
@@ -487,7 +487,7 @@ inline void MatrixF::batch_mulP(Point3F* points, size_t count) const
GetMat44().batch_mul_pos3(m, reinterpret_cast<const float*>(points), count, reinterpret_cast<float*>(temp));
// Copy the results back to out
for (size_t i = 0; i < count; ++i)
for (U32 i = 0; i < count; ++i)
{
points[i] = temp[i];
}
@@ -496,7 +496,7 @@ inline void MatrixF::batch_mulP(Point3F* points, size_t count) const
delete[] temp;
}
inline void MatrixF::batch_mulP(const Point3F* points, size_t count, Point3F* out) const
inline void MatrixF::batch_mulP(const Point3F* points, U32 count, Point3F* out) const
{
// Allocate temporary buffer
Point3F* temp = new Point3F[count];
@@ -504,7 +504,7 @@ inline void MatrixF::batch_mulP(const Point3F* points, size_t count, Point3F* ou
GetMat44().batch_mul_pos3(m, reinterpret_cast<const float*>(points), count, reinterpret_cast<float*>(temp));
// Copy the results back to out
for (size_t i = 0; i < count; ++i)
for (U32 i = 0; i < count; ++i)
{
out[i] = temp[i];
}