diff --git a/Engine/source/math/impl/mat44_impl.inl b/Engine/source/math/impl/mat44_impl.inl index ab0799b08..c203ecf5e 100644 --- a/Engine/source/math/impl/mat44_impl.inl +++ b/Engine/source/math/impl/mat44_impl.inl @@ -32,6 +32,53 @@ namespace math_backend::mat44 m_store(m, ma); } + inline void mat44_transform_plane_impl(const float* m, const float* scale, const float* plane, float* plane_result) + { + f32x4x4 M = m_load(m); + + f32x4 plane_v = v_load(plane); + f32x4 scale_v = v_load3_vec(scale); + f32x4 invScale = v_rcp_nr(scale_v); + + // normal = plane.xyz + f32x4 normal = plane_v; + + // apply Inv(s) + normal = v_mul(normal, invScale); + + // multiply by Inv(Tr(m)) (only the rotation part matters) + f32x4 nx = v_mul(v_swizzle_singular_mask(normal, 0), M.r0); + f32x4 ny = v_mul(v_swizzle_singular_mask(normal, 1), M.r1); + f32x4 nz = v_mul(v_swizzle_singular_mask(normal, 2), M.r2); + + normal = v_add(v_add(nx, ny), nz); + + normal = v_normalize3(normal); + + // compute point on plane + float d = v_extract0(v_swizzle_singular_mask(plane_v, 3)); + + f32x4 point = v_mul(plane_v, v_set1(-d)); + point = v_preserve_w(point, v_set1(1.0f)); + + // apply scale + point = v_mul(point, scale_v); + + // transform point by matrix + point = m_mul_vec4(M, point); + + // compute new plane distance + float newD = -v_extract0(v_dot3(point, normal)); + + alignas(16) float n[4]; + v_store(n, normal); + + plane_result[0] = n[0]; + plane_result[1] = n[1]; + plane_result[2] = n[2]; + plane_result[3] = newD; + } + inline void mat44_get_scale_impl(const float* m, float* s) { f32x4x4 ma = m_load(m); @@ -341,4 +388,39 @@ namespace math_backend::mat44 m_store(m, mo); } + //-------------------------------------------------- + // MATRIX BATCH FUNCTIONS + //-------------------------------------------------- + + inline void mat44_batch_mul_pos3(const float* m, const float* points, size_t count, float* result) + { + size_t i = 0; + f32x4x4 ma = m_load(m); + + // AVX has 8 lanes to play with +#if defined(MATH_SIMD_AVX2) || defined(MATH_SIMD_AVX) + // 8-wide AVX only + for (; i + 8 <= count; i += 8) + { + vec4_batch8 va = load_vec3_batch8(&points[i*3], 1.0f, false); + vec4_batch8 vr = m_mul_pos3_batch8(ma, va); + store_vec3_batch8(&result[i*3], vr); + } +#endif // MATH_SIMD_AVX2 || MATH_SIMD_AVX + + // 4-wide + for (; i + 4 <= count; i += 4) + { + vec4_batch4 va = load_vec3_batch4(&points[i * 3], 1.0f, false); + vec4_batch4 vr = m_mul_pos3_batch4(ma, va); + store_vec3_batch4(&result[i * 3], vr); + } + + for (; i < count; ++i) + { + size_t idx = i * 3; + mat44_mul_pos3_impl(m, &points[idx], &result[idx]); + } + } + } // namespace math_backend::mat44 diff --git a/Engine/source/math/impl/math_c.cpp b/Engine/source/math/impl/math_c.cpp index ac6358b60..6d45350fa 100644 --- a/Engine/source/math/impl/math_c.cpp +++ b/Engine/source/math/impl/math_c.cpp @@ -3,6 +3,7 @@ #include "math/public/float3_dispatch.h" #include "math/public/mat44_dispatch.h" #include "math/mConstants.h" +#include "math/mMatrix.h" #include // for sqrtf, etc. namespace math_backend::float4::dispatch @@ -375,6 +376,88 @@ namespace math_backend::mat44::dispatch mresult[15]= a[12]*b[3]+ a[13]*b[7]+ a[14]*b[11]+ a[15]*b[15]; }; + gMat44.transform_plane = [](const F32* m, const F32* s, const F32* p, F32* presult) { + // We take in a matrix, a scale factor, and a plane equation. We want to output + // the resultant normal + // We have T = m*s + // To multiply the normal, we want Inv(Tr(m*s)) + // Inv(Tr(ms)) = Inv(Tr(s) * Tr(m)) + // = Inv(Tr(m)) * Inv(Tr(s)) + // + // Inv(Tr(s)) = Inv(s) = [ 1/x 0 0 0] + // [ 0 1/y 0 0] + // [ 0 0 1/z 0] + // [ 0 0 0 1] + // + // Since m is an affine matrix, + // Tr(m) = [ [ ] 0 ] + // [ [ R ] 0 ] + // [ [ ] 0 ] + // [ [ x y z ] 1 ] + // + // Inv(Tr(m)) = [ [ -1 ] 0 ] + // [ [ R ] 0 ] + // [ [ ] 0 ] + // [ [ A B C ] 1 ] + // Where: + // + // P = (x, y, z) + // A = -(Row(0, r) * P); + // B = -(Row(1, r) * P); + // C = -(Row(2, r) * P); + + MatrixF invScale(true); + F32* pScaleElems = invScale; + pScaleElems[MatrixF::idx(0, 0)] = 1.0f / s[0]; + pScaleElems[MatrixF::idx(1, 1)] = 1.0f / s[1]; + pScaleElems[MatrixF::idx(2, 2)] = 1.0f / s[2]; + + const Point3F shear(m[MatrixF::idx(3, 0)], m[MatrixF::idx(3, 1)], m[MatrixF::idx(3, 2)]); + + const Point3F row0(m[MatrixF::idx(0, 0)], m[MatrixF::idx(0, 1)], m[MatrixF::idx(0, 2)]); + const Point3F row1(m[MatrixF::idx(1, 0)], m[MatrixF::idx(1, 1)], m[MatrixF::idx(1, 2)]); + const Point3F row2(m[MatrixF::idx(2, 0)], m[MatrixF::idx(2, 1)], m[MatrixF::idx(2, 2)]); + + const F32 A = -mDot(row0, shear); + const F32 B = -mDot(row1, shear); + const F32 C = -mDot(row2, shear); + + MatrixF invTrMatrix(true); + F32* destMat = invTrMatrix; + destMat[MatrixF::idx(0, 0)] = m[MatrixF::idx(0, 0)]; + destMat[MatrixF::idx(1, 0)] = m[MatrixF::idx(1, 0)]; + destMat[MatrixF::idx(2, 0)] = m[MatrixF::idx(2, 0)]; + destMat[MatrixF::idx(0, 1)] = m[MatrixF::idx(0, 1)]; + destMat[MatrixF::idx(1, 1)] = m[MatrixF::idx(1, 1)]; + destMat[MatrixF::idx(2, 1)] = m[MatrixF::idx(2, 1)]; + destMat[MatrixF::idx(0, 2)] = m[MatrixF::idx(0, 2)]; + destMat[MatrixF::idx(1, 2)] = m[MatrixF::idx(1, 2)]; + destMat[MatrixF::idx(2, 2)] = m[MatrixF::idx(2, 2)]; + destMat[MatrixF::idx(0, 3)] = A; + destMat[MatrixF::idx(1, 3)] = B; + destMat[MatrixF::idx(2, 3)] = C; + invTrMatrix.mul(invScale); + + Point3F norm(p[0], p[1], p[2]); + Point3F point = norm * -p[3]; + invTrMatrix.mulP(norm); + norm.normalize(); + + MatrixF temp; + dMemcpy(temp, m, sizeof(F32) * 16); + point.x *= s[0]; + point.y *= s[1]; + point.z *= s[2]; + temp.mulP(point); + + PlaneF resultPlane(point, norm); + presult[0] = resultPlane.x; + presult[1] = resultPlane.y; + presult[2] = resultPlane.z; + presult[3] = resultPlane.d; + + }; + gMat44.normalize = [](float* a) { F32 col0[3], col1[3], col2[3]; // extract columns 0 and 1 @@ -404,5 +487,14 @@ namespace math_backend::mat44::dispatch a[10] = col2[2]; }; + + gMat44.batch_mul_pos3 = [](const float* m, const float* pts, size_t count, float* result_ptrs) { + size_t i = 0; + for (; i < count; i++) + { + size_t idx = i * 3; + gMat44.mul_pos3(m, &pts[idx], &result_ptrs[idx]); + } + }; } } diff --git a/Engine/source/math/isa/avx/avx_intrinsics.h b/Engine/source/math/isa/avx/avx_intrinsics.h index 28a5fa4fa..069b15850 100644 --- a/Engine/source/math/isa/avx/avx_intrinsics.h +++ b/Engine/source/math/isa/avx/avx_intrinsics.h @@ -57,7 +57,7 @@ namespace // Shuffle helpers //------------------------------------------------------ - inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, const int x, const int y, const int z, const int w) + inline f32x4 v_shuffle_mask(f32x4 v1, f32x4 v2, int x, int y, int z, int w) { f32x4 lo = v1; f32x4 hi = v2; @@ -160,13 +160,13 @@ namespace // full dot4 inline f32x4 v_dot4(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into lane 1 + return _mm_dp_ps(a, b, 0xFF); // f32x4, 4 lanes into all lanes } // dot3 (ignores w) inline f32x4 v_dot3(f32x4 a, f32x4 b) { - return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into lane 1 + return _mm_dp_ps(a, b, 0x7F); // f32x4, 3 last lanes into all lanes } // cross product xyz only. @@ -182,7 +182,7 @@ namespace inline f32x4 v_normalize3(f32x4 v) { - const f32x4 zero = _mm_setzero_ps(); + const f32x4 zero = _mm_setzero_ps(); const f32x4 fallback = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); // {0,0,1,0} f32x4 dot = v_dot3(v, v); @@ -405,13 +405,6 @@ namespace f32x8 w; }; - struct vec3_batch8 - { - f32x8 x; - f32x8 y; - f32x8 z; - }; - struct vec4_batch4 { f32x4 x; @@ -420,47 +413,37 @@ namespace f32x4 w; }; - struct vec3_batch4 + inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW) { - f32x4 x; - f32x4 y; - f32x4 z; - }; + vec4_batch8 r; - inline vec3_batch8 load_vec3_batch8(const float* ptr) - { - // ptr points to x0 - // layout: x y z x y z ... + r.x = _mm256_set_ps(ptr[0], ptr[3], ptr[6], ptr[9], + ptr[12], ptr[15], ptr[18], ptr[21]); + r.y = _mm256_set_ps(ptr[1], ptr[4], ptr[7], ptr[10], + ptr[13], ptr[16], ptr[19], ptr[22]); + r.z = _mm256_set_ps(ptr[2], ptr[5], ptr[8], ptr[11], + ptr[14], ptr[17], ptr[20], ptr[23]); - vec3_batch8 r; - - r.x = _mm256_set_ps( - ptr[21], ptr[18], ptr[15], ptr[12], - ptr[9], ptr[6], ptr[3], ptr[0]); - - r.y = _mm256_set_ps( - ptr[22], ptr[19], ptr[16], ptr[13], - ptr[10], ptr[7], ptr[4], ptr[1]); - - r.z = _mm256_set_ps( - ptr[23], ptr[20], ptr[17], ptr[14], - ptr[11], ptr[8], ptr[5], ptr[2]); + if (fillW) + { + r.w = _mm256_set1_ps(w); + } return r; } - inline vec3_batch4 load_vec3_batch4(const float* ptr) + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) { - vec3_batch4 r; + vec4_batch4 r; - r.x = _mm_set_ps( - ptr[9], ptr[6], ptr[3], ptr[0]); + r.x = _mm_set_ps(ptr[0], ptr[3], ptr[6], ptr[9]); + r.y = _mm_set_ps(ptr[1], ptr[4], ptr[7], ptr[10]); + r.z = _mm_set_ps(ptr[2], ptr[5], ptr[8], ptr[11]); - r.y = _mm_set_ps( - ptr[10], ptr[7], ptr[4], ptr[1]); - - r.z = _mm_set_ps( - ptr[11], ptr[8], ptr[5], ptr[2]); + if (fillW) + { + r.w = _mm_set1_ps(w); + } return r; } @@ -478,7 +461,7 @@ namespace } // Store the result back to a float3 array with size of 8 - inline void store_vec3_batch8(float* out, const vec3_batch8& v) + inline void store_vec3_batch8(float* out, const vec4_batch8& v) { alignas(32) float xs[8]; alignas(32) float ys[8]; @@ -497,7 +480,7 @@ namespace } // Store the result back to a float3 array with size of 4 - inline void store_vec3_batch4(float* out, const vec3_batch4& v) + inline void store_vec3_batch4(float* out, const vec4_batch4& v) { alignas(16) float xs[8]; alignas(16) float ys[8]; @@ -515,7 +498,7 @@ namespace } } - inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b) { f32x4 mulx = _mm_mul_ps(a.x, b.x); f32x4 muly = _mm_mul_ps(a.y, b.y); @@ -524,7 +507,7 @@ namespace return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); } - inline f32x8 vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b) + inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b) { f32x8 mulx = _mm256_mul_ps(a.x, b.x); f32x8 muly = _mm256_mul_ps(a.y, b.y); @@ -533,4 +516,110 @@ namespace return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz); } + // Batch 8 mul_Vec4. + inline vec4_batch8 m_mul_pos3_batch8(const f32x4x4& m, const vec4_batch8& v) + { + vec4_batch8 r; + + __m256 m00 = _mm256_set1_ps(m.r0.m128_f32[0]); + __m256 m01 = _mm256_set1_ps(m.r0.m128_f32[1]); + __m256 m02 = _mm256_set1_ps(m.r0.m128_f32[2]); + __m256 m03 = _mm256_set1_ps(m.r0.m128_f32[3]); + + __m256 m10 = _mm256_set1_ps(m.r1.m128_f32[0]); + __m256 m11 = _mm256_set1_ps(m.r1.m128_f32[1]); + __m256 m12 = _mm256_set1_ps(m.r1.m128_f32[2]); + __m256 m13 = _mm256_set1_ps(m.r1.m128_f32[3]); + + __m256 m20 = _mm256_set1_ps(m.r2.m128_f32[0]); + __m256 m21 = _mm256_set1_ps(m.r2.m128_f32[1]); + __m256 m22 = _mm256_set1_ps(m.r2.m128_f32[2]); + __m256 m23 = _mm256_set1_ps(m.r2.m128_f32[3]); + + // + // row0 dot + // + r.x = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m00), + _mm256_mul_ps(v.y, m01)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m10), + _mm256_mul_ps(v.y, m11)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m20), + _mm256_mul_ps(v.y, m21)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m22), m23)); + + return r; + } + + // Batch 4 mul_Vec4. + inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + { + vec4_batch4 r; + + f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); + f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); + f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); + f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + + f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); + f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); + f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); + f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + + f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); + f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); + f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); + f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + + // + // row0 dot + // + r.x = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m00), + _mm_mul_ps(v.y, m01)), + _mm_add_ps( + _mm_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m10), + _mm_mul_ps(v.y, m11)), + _mm_add_ps( + _mm_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m20), + _mm_mul_ps(v.y, m21)), + _mm_add_ps( + _mm_mul_ps(v.z, m22), m23)); + + return r; + } + } diff --git a/Engine/source/math/isa/avx/mat44.cpp b/Engine/source/math/isa/avx/mat44.cpp index 50d64b6a2..b0fddcd1c 100644 --- a/Engine/source/math/isa/avx/mat44.cpp +++ b/Engine/source/math/isa/avx/mat44.cpp @@ -15,9 +15,12 @@ namespace math_backend::mat44::dispatch gMat44.mul_pos3 = mat44_mul_pos3_impl; gMat44.mul_vec3 = mat44_mul_vec3_impl; gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.transform_plane = mat44_transform_plane_impl; gMat44.scale = mat44_scale_impl; gMat44.get_scale = mat44_get_scale_impl; gMat44.normalize = mat44_normalize_impl; gMat44.determinant = mat44_get_determinant; + + gMat44.batch_mul_pos3 = mat44_batch_mul_pos3; } } diff --git a/Engine/source/math/isa/avx2/avx2_intrinsics.h b/Engine/source/math/isa/avx2/avx2_intrinsics.h index bb367f198..a10ba77d8 100644 --- a/Engine/source/math/isa/avx2/avx2_intrinsics.h +++ b/Engine/source/math/isa/avx2/avx2_intrinsics.h @@ -405,13 +405,6 @@ namespace f32x8 w; }; - struct vec3_batch8 - { - f32x8 x; - f32x8 y; - f32x8 z; - }; - struct vec4_batch4 { f32x4 x; @@ -420,47 +413,37 @@ namespace f32x4 w; }; - struct vec3_batch4 + inline vec4_batch8 load_vec3_batch8(const float* ptr, float w, bool fillW) { - f32x4 x; - f32x4 y; - f32x4 z; - }; + vec4_batch8 r; - inline vec3_batch8 load_vec3_batch8(const float* ptr) - { - // ptr points to x0 - // layout: x y z x y z ... + r.x = _mm256_set_ps(ptr[0], ptr[3], ptr[6], ptr[9], + ptr[12], ptr[15], ptr[18], ptr[21]); + r.y = _mm256_set_ps(ptr[1], ptr[4], ptr[7], ptr[10], + ptr[13], ptr[16], ptr[19], ptr[22]); + r.z = _mm256_set_ps(ptr[2], ptr[5], ptr[8], ptr[11], + ptr[14], ptr[17], ptr[20], ptr[23]); - vec3_batch8 r; - - r.x = _mm256_set_ps( - ptr[21], ptr[18], ptr[15], ptr[12], - ptr[9], ptr[6], ptr[3], ptr[0]); - - r.y = _mm256_set_ps( - ptr[22], ptr[19], ptr[16], ptr[13], - ptr[10], ptr[7], ptr[4], ptr[1]); - - r.z = _mm256_set_ps( - ptr[23], ptr[20], ptr[17], ptr[14], - ptr[11], ptr[8], ptr[5], ptr[2]); + if (fillW) + { + r.w = _mm256_set1_ps(w); + } return r; } - inline vec3_batch4 load_vec3_batch4(const float* ptr) + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) { - vec3_batch4 r; + vec4_batch4 r; - r.x = _mm_set_ps( - ptr[9], ptr[6], ptr[3], ptr[0]); + r.x = _mm_set_ps(ptr[0], ptr[3], ptr[6], ptr[9]); + r.y = _mm_set_ps(ptr[1], ptr[4], ptr[7], ptr[10]); + r.z = _mm_set_ps(ptr[2], ptr[5], ptr[8], ptr[11]); - r.y = _mm_set_ps( - ptr[10], ptr[7], ptr[4], ptr[1]); - - r.z = _mm_set_ps( - ptr[11], ptr[8], ptr[5], ptr[2]); + if (fillW) + { + r.w = _mm_set1_ps(w); + } return r; } @@ -478,7 +461,7 @@ namespace } // Store the result back to a float3 array with size of 8 - inline void store_vec3_batch8(float* out, const vec3_batch8& v) + inline void store_vec3_batch8(float* out, const vec4_batch8& v) { alignas(32) float xs[8]; alignas(32) float ys[8]; @@ -497,7 +480,7 @@ namespace } // Store the result back to a float3 array with size of 4 - inline void store_vec3_batch4(float* out, const vec3_batch4& v) + inline void store_vec3_batch4(float* out, const vec4_batch4& v) { alignas(16) float xs[8]; alignas(16) float ys[8]; @@ -515,7 +498,7 @@ namespace } } - inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b) { f32x4 mulx = _mm_mul_ps(a.x, b.x); f32x4 muly = _mm_mul_ps(a.y, b.y); @@ -524,7 +507,7 @@ namespace return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); } - inline f32x8 vec3_batch8_dot(const vec3_batch8& a, const vec3_batch8& b) + inline f32x8 vec3_batch8_dot(const vec4_batch8& a, const vec4_batch8& b) { f32x8 mulx = _mm256_mul_ps(a.x, b.x); f32x8 muly = _mm256_mul_ps(a.y, b.y); @@ -533,4 +516,110 @@ namespace return _mm256_add_ps(_mm256_add_ps(mulx, muly), mulz); } + // Batch 8 mul_Vec4. + inline vec4_batch8 m_mul_pos3_batch8(const f32x4x4& m, const vec4_batch8& v) + { + vec4_batch8 r; + + __m256 m00 = _mm256_set1_ps(m.r0.m128_f32[0]); + __m256 m01 = _mm256_set1_ps(m.r0.m128_f32[1]); + __m256 m02 = _mm256_set1_ps(m.r0.m128_f32[2]); + __m256 m03 = _mm256_set1_ps(m.r0.m128_f32[3]); + + __m256 m10 = _mm256_set1_ps(m.r1.m128_f32[0]); + __m256 m11 = _mm256_set1_ps(m.r1.m128_f32[1]); + __m256 m12 = _mm256_set1_ps(m.r1.m128_f32[2]); + __m256 m13 = _mm256_set1_ps(m.r1.m128_f32[3]); + + __m256 m20 = _mm256_set1_ps(m.r2.m128_f32[0]); + __m256 m21 = _mm256_set1_ps(m.r2.m128_f32[1]); + __m256 m22 = _mm256_set1_ps(m.r2.m128_f32[2]); + __m256 m23 = _mm256_set1_ps(m.r2.m128_f32[3]); + + // + // row0 dot + // + r.x = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m00), + _mm256_mul_ps(v.y, m01)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m10), + _mm256_mul_ps(v.y, m11)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(v.x, m20), + _mm256_mul_ps(v.y, m21)), + _mm256_add_ps( + _mm256_mul_ps(v.z, m22), m23)); + + return r; + } + + // Batch 4 mul_Vec4. + inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + { + vec4_batch4 r; + + f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); + f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); + f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); + f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + + f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); + f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); + f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); + f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + + f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); + f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); + f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); + f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + + // + // row0 dot + // + r.x = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m00), + _mm_mul_ps(v.y, m01)), + _mm_add_ps( + _mm_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m10), + _mm_mul_ps(v.y, m11)), + _mm_add_ps( + _mm_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m20), + _mm_mul_ps(v.y, m21)), + _mm_add_ps( + _mm_mul_ps(v.z, m22), m23)); + + return r; + } + } diff --git a/Engine/source/math/isa/avx2/mat44.cpp b/Engine/source/math/isa/avx2/mat44.cpp index e6b891e9e..83c90a647 100644 --- a/Engine/source/math/isa/avx2/mat44.cpp +++ b/Engine/source/math/isa/avx2/mat44.cpp @@ -15,9 +15,11 @@ namespace math_backend::mat44::dispatch gMat44.mul_pos3 = mat44_mul_pos3_impl; gMat44.mul_vec3 = mat44_mul_vec3_impl; gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.transform_plane = mat44_transform_plane_impl; gMat44.scale = mat44_scale_impl; gMat44.get_scale = mat44_get_scale_impl; gMat44.normalize = mat44_normalize_impl; gMat44.determinant = mat44_get_determinant; + gMat44.batch_mul_pos3 = mat44_batch_mul_pos3; } } diff --git a/Engine/source/math/isa/neon/neon_intrinsics.h b/Engine/source/math/isa/neon/neon_intrinsics.h index 888adab45..a02da6391 100644 --- a/Engine/source/math/isa/neon/neon_intrinsics.h +++ b/Engine/source/math/isa/neon/neon_intrinsics.h @@ -410,4 +410,95 @@ namespace return m_transpose(adj); } + struct vec4_batch4 + { + f32x4 x; + f32x4 y; + f32x4 z; + f32x4 w; + }; + + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) + { + vec4_batch4 r; + + r.x = (f32x4){ ptr[9], ptr[6], ptr[3], ptr[0] }; + r.y = (f32x4){ ptr[10], ptr[7], ptr[4], ptr[1] }; + r.z = (f32x4){ ptr[11], ptr[8], ptr[5], ptr[2] }; + + if (fillW) + { + r.w = vdupq_n_f32(w); + } + + return r; + } + + inline void store_vec3_batch4(float* out, const vec4_batch4& v) + { + alignas(16) float xs[4]; + alignas(16) float ys[4]; + alignas(16) float zs[4]; + + vst1q_f32(xs, v.x); + vst1q_f32(ys, v.y); + vst1q_f32(zs, v.z); + + for (int i = 0; i < 4; ++i) + { + out[i * 3 + 0] = xs[i]; + out[i * 3 + 1] = ys[i]; + out[i * 3 + 2] = zs[i]; + } + } + + inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + { + vec4_batch4 r; + + float32x4_t m00 = vdupq_n_f32(m.r0.m128_f32[0]); + float32x4_t m01 = vdupq_n_f32(m.r0.m128_f32[1]); + float32x4_t m02 = vdupq_n_f32(m.r0.m128_f32[2]); + float32x4_t m03 = vdupq_n_f32(m.r0.m128_f32[3]); + + float32x4_t m10 = vdupq_n_f32(m.r1.m128_f32[0]); + float32x4_t m11 = vdupq_n_f32(m.r1.m128_f32[1]); + float32x4_t m12 = vdupq_n_f32(m.r1.m128_f32[2]); + float32x4_t m13 = vdupq_n_f32(m.r1.m128_f32[3]); + + float32x4_t m20 = vdupq_n_f32(m.r2.m128_f32[0]); + float32x4_t m21 = vdupq_n_f32(m.r2.m128_f32[1]); + float32x4_t m22 = vdupq_n_f32(m.r2.m128_f32[2]); + float32x4_t m23 = vdupq_n_f32(m.r2.m128_f32[3]); + + // row0 dot + r.x = vaddq_f32( + vaddq_f32( + vmulq_f32(v.x, m00), + vmulq_f32(v.y, m01)), + vaddq_f32( + vmulq_f32(v.z, m02), + m03)); + + // row1 dot + r.y = vaddq_f32( + vaddq_f32( + vmulq_f32(v.x, m10), + vmulq_f32(v.y, m11)), + vaddq_f32( + vmulq_f32(v.z, m12), + m13)); + + // row2 dot + r.z = vaddq_f32( + vaddq_f32( + vmulq_f32(v.x, m20), + vmulq_f32(v.y, m21)), + vaddq_f32( + vmulq_f32(v.z, m22), + m23)); + + return r; + } + } diff --git a/Engine/source/math/isa/sse2/mat44.cpp b/Engine/source/math/isa/sse2/mat44.cpp index 2a84c029f..d8d06ea58 100644 --- a/Engine/source/math/isa/sse2/mat44.cpp +++ b/Engine/source/math/isa/sse2/mat44.cpp @@ -15,9 +15,11 @@ namespace math_backend::mat44::dispatch gMat44.mul_pos3 = mat44_mul_pos3_impl; gMat44.mul_vec3 = mat44_mul_vec3_impl; gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.transform_plane = mat44_transform_plane_impl; gMat44.scale = mat44_scale_impl; gMat44.get_scale = mat44_get_scale_impl; gMat44.normalize = mat44_normalize_impl; gMat44.determinant = mat44_get_determinant; + gMat44.batch_mul_pos3 = mat44_batch_mul_pos3; } } diff --git a/Engine/source/math/isa/sse2/sse2_intrinsics.h b/Engine/source/math/isa/sse2/sse2_intrinsics.h index 0150a78a5..df3852340 100644 --- a/Engine/source/math/isa/sse2/sse2_intrinsics.h +++ b/Engine/source/math/isa/sse2/sse2_intrinsics.h @@ -544,17 +544,10 @@ namespace f32x4 w; }; - struct vec3_batch4 - { - f32x4 x; - f32x4 y; - f32x4 z; - }; - - inline vec3_batch4 load_vec3_batch4(const float* ptr) + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) { - vec3_batch4 r; + vec4_batch4 r; r.x = _mm_set_ps( ptr[9], ptr[6], ptr[3], ptr[0]); @@ -565,6 +558,11 @@ namespace r.z = _mm_set_ps( ptr[11], ptr[8], ptr[5], ptr[2]); + if (fillW) + { + r.w = _mm_set1_ps(w); + } + return r; } @@ -575,7 +573,7 @@ namespace } // Store the result back to a float3 array with size of 4 - inline void store_vec3_batch4(float* out, const vec3_batch4& v) + inline void store_vec3_batch4(float* out, const vec4_batch4& v) { alignas(16) float xs[8]; alignas(16) float ys[8]; @@ -593,7 +591,7 @@ namespace } } - inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b) { f32x4 mulx = _mm_mul_ps(a.x, b.x); f32x4 muly = _mm_mul_ps(a.y, b.y); @@ -601,4 +599,57 @@ namespace return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); } + + // Batch 4 mul_Vec4. + inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + { + vec4_batch4 r; + + f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); + f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); + f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); + f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + + f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); + f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); + f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); + f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + + f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); + f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); + f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); + f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + + // + // row0 dot + // + r.x = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m00), + _mm_mul_ps(v.y, m01)), + _mm_add_ps( + _mm_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m10), + _mm_mul_ps(v.y, m11)), + _mm_add_ps( + _mm_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m20), + _mm_mul_ps(v.y, m21)), + _mm_add_ps( + _mm_mul_ps(v.z, m22), m23)); + + return r; + } } diff --git a/Engine/source/math/isa/sse41/mat44.cpp b/Engine/source/math/isa/sse41/mat44.cpp index 489d495b8..f9e33138b 100644 --- a/Engine/source/math/isa/sse41/mat44.cpp +++ b/Engine/source/math/isa/sse41/mat44.cpp @@ -15,9 +15,11 @@ namespace math_backend::mat44::dispatch gMat44.mul_pos3 = mat44_mul_pos3_impl; gMat44.mul_vec3 = mat44_mul_vec3_impl; gMat44.mul_float4 = mat44_mul_float4_impl; + gMat44.transform_plane = mat44_transform_plane_impl; gMat44.scale = mat44_scale_impl; gMat44.get_scale = mat44_get_scale_impl; gMat44.normalize = mat44_normalize_impl; gMat44.determinant = mat44_get_determinant; + gMat44.batch_mul_pos3 = mat44_batch_mul_pos3; } } diff --git a/Engine/source/math/isa/sse41/sse41_intrinsics.h b/Engine/source/math/isa/sse41/sse41_intrinsics.h index 4134480aa..45c056dcf 100644 --- a/Engine/source/math/isa/sse41/sse41_intrinsics.h +++ b/Engine/source/math/isa/sse41/sse41_intrinsics.h @@ -514,17 +514,10 @@ namespace f32x4 w; }; - struct vec3_batch4 - { - f32x4 x; - f32x4 y; - f32x4 z; - }; - - inline vec3_batch4 load_vec3_batch4(const float* ptr) + inline vec4_batch4 load_vec3_batch4(const float* ptr, float w, bool fillW) { - vec3_batch4 r; + vec4_batch4 r; r.x = _mm_set_ps( ptr[9], ptr[6], ptr[3], ptr[0]); @@ -535,6 +528,11 @@ namespace r.z = _mm_set_ps( ptr[11], ptr[8], ptr[5], ptr[2]); + if (fillW) + { + r.w = _mm_set1_ps(w); + } + return r; } @@ -545,7 +543,7 @@ namespace } // Store the result back to a float3 array with size of 4 - inline void store_vec3_batch4(float* out, const vec3_batch4& v) + inline void store_vec3_batch4(float* out, const vec4_batch4& v) { alignas(16) float xs[8]; alignas(16) float ys[8]; @@ -563,7 +561,7 @@ namespace } } - inline f32x4 vec3_batch4_dot(const vec3_batch4& a, const vec3_batch4& b) + inline f32x4 vec3_batch4_dot(const vec4_batch4& a, const vec4_batch4& b) { f32x4 mulx = _mm_mul_ps(a.x, b.x); f32x4 muly = _mm_mul_ps(a.y, b.y); @@ -571,4 +569,57 @@ namespace return _mm_add_ps(_mm_add_ps(mulx, muly), mulz); } + + // Batch 4 mul_Vec4. + inline vec4_batch4 m_mul_pos3_batch4(const f32x4x4& m, const vec4_batch4& v) + { + vec4_batch4 r; + + f32x4 m00 = _mm_set1_ps(m.r0.m128_f32[0]); + f32x4 m01 = _mm_set1_ps(m.r0.m128_f32[1]); + f32x4 m02 = _mm_set1_ps(m.r0.m128_f32[2]); + f32x4 m03 = _mm_set1_ps(m.r0.m128_f32[3]); + + f32x4 m10 = _mm_set1_ps(m.r1.m128_f32[0]); + f32x4 m11 = _mm_set1_ps(m.r1.m128_f32[1]); + f32x4 m12 = _mm_set1_ps(m.r1.m128_f32[2]); + f32x4 m13 = _mm_set1_ps(m.r1.m128_f32[3]); + + f32x4 m20 = _mm_set1_ps(m.r2.m128_f32[0]); + f32x4 m21 = _mm_set1_ps(m.r2.m128_f32[1]); + f32x4 m22 = _mm_set1_ps(m.r2.m128_f32[2]); + f32x4 m23 = _mm_set1_ps(m.r2.m128_f32[3]); + + // + // row0 dot + // + r.x = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m00), + _mm_mul_ps(v.y, m01)), + _mm_add_ps( + _mm_mul_ps(v.z, m02), m03)); + + // + // row1 dot + // + r.y = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m10), + _mm_mul_ps(v.y, m11)), + _mm_add_ps( + _mm_mul_ps(v.z, m12), m13)); + + // + // row2 dot + // + r.z = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(v.x, m20), + _mm_mul_ps(v.y, m21)), + _mm_add_ps( + _mm_mul_ps(v.z, m22), m23)); + + return r; + } } diff --git a/Engine/source/math/mMatrix.h b/Engine/source/math/mMatrix.h index 92bea5fad..873ff967a 100644 --- a/Engine/source/math/mMatrix.h +++ b/Engine/source/math/mMatrix.h @@ -231,6 +231,8 @@ public: void mul( Point4F& p ) const; ///< M * p -> p (full [4x4] * [1x4]) void mulP( Point3F& p ) const; ///< M * p -> p (assume w = 1.0f) void mulP( const Point3F &p, Point3F *d) const; ///< M * p -> d (assume w = 1.0f) + void batch_mulP(Point3F* points, size_t count) const; + void batch_mulP(const Point3F* points, size_t count, Point3F* out) const; void mulV( VectorF& p ) const; ///< M * v -> v (assume w = 0.0f) void mulV( const VectorF &p, Point3F *d) const; ///< M * v -> d (assume w = 0.0f) @@ -477,6 +479,40 @@ inline void MatrixF::mulP( const Point3F &p, Point3F *d) const GetMat44().mul_pos3(*this, p, *d); } +inline void MatrixF::batch_mulP(Point3F* points, size_t count) const +{ + // Allocate temporary buffer + Point3F* temp = new Point3F[count]; + + GetMat44().batch_mul_pos3(m, reinterpret_cast(points), count, reinterpret_cast(temp)); + + // Copy the results back to out + for (size_t i = 0; i < count; ++i) + { + points[i] = temp[i]; + } + + // Free temporary buffer + delete[] temp; +} + +inline void MatrixF::batch_mulP(const Point3F* points, size_t count, Point3F* out) const +{ + // Allocate temporary buffer + Point3F* temp = new Point3F[count]; + + GetMat44().batch_mul_pos3(m, reinterpret_cast(points), count, reinterpret_cast(temp)); + + // Copy the results back to out + for (size_t i = 0; i < count; ++i) + { + out[i] = temp[i]; + } + + // Free temporary buffer + delete[] temp; +} + inline void MatrixF::mulV( VectorF& v) const { // M * v -> v @@ -663,7 +699,9 @@ inline bool MatrixF::isNaN() inline void mTransformPlane(const MatrixF& mat, const Point3F& scale, const PlaneF& plane, PlaneF * result) { - m_matF_x_scale_x_planeF(mat, &scale.x, &plane.x, &result->x); + //m_matF_x_scale_x_planeF(mat, &scale.x, &plane.x, &result->x); + + GetMat44().transform_plane(mat, scale, plane, *result); } #else // !USE_TEMPLATE_MATRIX diff --git a/Engine/source/math/public/mat44_dispatch.h b/Engine/source/math/public/mat44_dispatch.h index 2c4342aee..6f6ade3ab 100644 --- a/Engine/source/math/public/mat44_dispatch.h +++ b/Engine/source/math/public/mat44_dispatch.h @@ -16,9 +16,12 @@ namespace math_backend::mat44::dispatch void (*mul_pos3)(const float* a, const float* b, float* r) = nullptr; void (*mul_vec3)(const float* a, const float* b, float* r) = nullptr; void (*mul_float4)(const float* a, const float* b, float* r) = nullptr; + void (*transform_plane)(const float* m, const float* s, const float* p, float* r) = nullptr; float (*determinant)(const float*) = nullptr; void (*scale)(float*, const float*) = nullptr; void (*get_scale)(const float*, float*) = nullptr; + + void (*batch_mul_pos3)(const float* m, const float* pts, size_t count, float* result_ptrs) = nullptr; }; // Global dispatch table diff --git a/Engine/source/math/util/frustum.cpp b/Engine/source/math/util/frustum.cpp index 632d7cded..eff17a0c1 100644 --- a/Engine/source/math/util/frustum.cpp +++ b/Engine/source/math/util/frustum.cpp @@ -381,8 +381,10 @@ void FrustumData::_update() const // Transform the points into the desired culling space. - for( U32 i = 0; i < mPoints.size(); ++ i ) - mTransform.mulP( mPoints[ i ] ); + /*for( U32 i = 0; i < mPoints.size(); ++ i ) + mTransform.mulP( mPoints[ i ] );*/ + + mTransform.batch_mulP(mPoints.address(), mPoints.size()); // Update the axis aligned bounding box from // the newly transformed points.