// Torque3D/Engine/source/math/isa/avx2/float3.cpp

#include "float3_dispatch.h"
#include <immintrin.h> // AVX/AVX2 intrinsics

namespace
{
   typedef __m128 f32x4;

   // Load 3 floats into 4-wide SIMD, zero the 4th lane
   inline f32x4 v_load3(const float* p) { return _mm_set_ps(0.0f, p[2], p[1], p[0]); }

   // Store 3 floats from SIMD register back to memory
   inline void v_store3(float* dst, f32x4 v)
   {
      alignas(16) float tmp[4];   // temp storage
      _mm_store_ps(tmp, v);        // store all 4 lanes
      dst[0] = tmp[0];
      dst[1] = tmp[1];
      dst[2] = tmp[2];
   }

   // extract just the first lane.
   inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }

   // Broadcast a single float across all 4 lanes
   inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }

   // Element-wise multiply
   inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }

   // Element-wise divide fast (1/b)
   inline f32x4 v_div_fast(f32x4 a, f32x4 b)
   {
      f32x4 rcp = _mm_rcp_ps(b);
      // Optional refinement here
      return _mm_mul_ps(a, rcp);
   }

   // Element-wise divide (to change from fast use _mm_div_ps(a,b)
   inline f32x4 v_div(f32x4 a, f32x4 b) { return v_div_fast(a, b); }

   // Element-wise add
   inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }

   // Element-wise subtract
   inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }

   // Horizontal sum of all elements (for dot product, length, etc.)
   inline f32x4 v_hadd3(f32x4 a)
   {
      __m128 t1 = _mm_hadd_ps(a, a);  // sums pairs: [a0+a1, a2+a3, ...]
      __m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3
      return t2;
   }

   float float3_dot_avx(const float* a, const float* b)
   {
      f32x4 va = v_load3(a);
      f32x4 vb = v_load3(b);
      __m128 dp = _mm_dp_ps(va, vb, 0x71); // multiply 3 (0x71), sum 3, lowest lane
      return _mm_cvtss_f32(dp);
   }

   inline f32x4 v_cross(f32x4 a, f32x4 b)
   {
      f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
      f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));

      f32x4 c = _mm_sub_ps(
         _mm_mul_ps(a, b_yzx),
         _mm_mul_ps(a_yzx, b)
      );

      return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
   }
}

#include "float3_impl.inl"

namespace math_backend::float3::dispatch
{
   // Points every float3 entry of the gFloat3 function table at the
   // implementations built in this translation unit. All entries use the
   // float3_*_impl functions (presumably provided by float3_impl.inl --
   // confirm) except dot, which uses this file's float3_dot_avx.
   void install_avx2()
   {
      auto& table = gFloat3;

      // Component-wise arithmetic.
      table.add = float3_add_impl;
      table.sub = float3_sub_impl;
      table.mul = float3_mul_impl;
      table.div = float3_div_impl;

      // Vector-with-scalar arithmetic.
      table.mul_scalar = float3_mul_scalar_impl;
      table.div_scalar = float3_div_scalar_impl;

      // Products.
      table.dot = float3_dot_avx;
      table.cross = float3_cross_impl;

      // Length and normalisation.
      table.length = float3_length_impl;
      table.lengthSquared = float3_length_squared_impl;
      table.normalize = float3_normalize_impl;
      table.normalize_mag = float3_normalize_mag_impl;

      // Interpolation.
      table.lerp = float3_lerp_impl;
   }
}
further sse simd additions avx2 float3 added added normalize_magnitude added divide fast to float3 may copy to float4 move static spheremesh to drawSphere (initialize on first use) so platform has a chance to load the math backend 2026-02-26 21:11:31 +00:00			`#include "float3_dispatch.h"`
			`#include <immintrin.h> // AVX/AVX2 intrinsics`

			`namespace`
			`{`
			`typedef __m128 f32x4;`

			`// Load 3 floats into 4-wide SIMD, zero the 4th lane`
			`inline f32x4 v_load3(const float* p) { return _mm_set_ps(0.0f, p[2], p[1], p[0]); }`

			`// Store 3 floats from SIMD register back to memory`
			`inline void v_store3(float* dst, f32x4 v)`
			`{`
			`alignas(16) float tmp[4]; // temp storage`
			`_mm_store_ps(tmp, v); // store all 4 lanes`
			`dst[0] = tmp[0];`
			`dst[1] = tmp[1];`
			`dst[2] = tmp[2];`
			`}`

			`// extract just the first lane.`
			`inline float v_extract0(f32x4 v) { return _mm_cvtss_f32(v); }`

			`// Broadcast a single float across all 4 lanes`
			`inline f32x4 v_set1(float s) { return _mm_set1_ps(s); }`

			`// Element-wise multiply`
			`inline f32x4 v_mul(f32x4 a, f32x4 b) { return _mm_mul_ps(a, b); }`

			`// Element-wise divide fast (1/b)`
			`inline f32x4 v_div_fast(f32x4 a, f32x4 b)`
			`{`
			`f32x4 rcp = _mm_rcp_ps(b);`
			`// Optional refinement here`
			`return _mm_mul_ps(a, rcp);`
			`}`

all float3 and float4 functions and isas completed all options of float3 and float4 functions in isas and math_c neon still to be done but that will be on mac. 2026-02-27 11:28:51 +00:00			`// Element-wise divide (to change from fast use _mm_div_ps(a,b)`
			`inline f32x4 v_div(f32x4 a, f32x4 b) { return v_div_fast(a, b); }`

further sse simd additions avx2 float3 added added normalize_magnitude added divide fast to float3 may copy to float4 move static spheremesh to drawSphere (initialize on first use) so platform has a chance to load the math backend 2026-02-26 21:11:31 +00:00			`// Element-wise add`
			`inline f32x4 v_add(f32x4 a, f32x4 b) { return _mm_add_ps(a, b); }`

			`// Element-wise subtract`
			`inline f32x4 v_sub(f32x4 a, f32x4 b) { return _mm_sub_ps(a, b); }`

			`// Horizontal sum of all elements (for dot product, length, etc.)`
			`inline f32x4 v_hadd3(f32x4 a)`
			`{`
			`__m128 t1 = _mm_hadd_ps(a, a); // sums pairs: [a0+a1, a2+a3, ...]`
			`__m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3`
			`return t2;`
			`}`

			`float float3_dot_avx(const float* a, const float* b)`
			`{`
all float3 and float4 functions and isas completed all options of float3 and float4 functions in isas and math_c neon still to be done but that will be on mac. 2026-02-27 11:28:51 +00:00			`f32x4 va = v_load3(a);`
			`f32x4 vb = v_load3(b);`
			`__m128 dp = _mm_dp_ps(va, vb, 0x71); // multiply 3 (0x71), sum 3, lowest lane`
further sse simd additions avx2 float3 added added normalize_magnitude added divide fast to float3 may copy to float4 move static spheremesh to drawSphere (initialize on first use) so platform has a chance to load the math backend 2026-02-26 21:11:31 +00:00			`return _mm_cvtss_f32(dp);`
			`}`
all float3 and float4 functions and isas completed all options of float3 and float4 functions in isas and math_c neon still to be done but that will be on mac. 2026-02-27 11:28:51 +00:00
			`inline f32x4 v_cross(f32x4 a, f32x4 b)`
			`{`
			`f32x4 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));`
			`f32x4 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));`

			`f32x4 c = _mm_sub_ps(`
			`_mm_mul_ps(a, b_yzx),`
			`_mm_mul_ps(a_yzx, b)`
			`);`

			`return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));`
			`}`
further sse simd additions avx2 float3 added added normalize_magnitude added divide fast to float3 may copy to float4 move static spheremesh to drawSphere (initialize on first use) so platform has a chance to load the math backend 2026-02-26 21:11:31 +00:00			`}`

			`#include "float3_impl.inl"`

			`namespace math_backend::float3::dispatch`
			`{`
			`// Install AVX2 backend`
			`void install_avx2()`
			`{`
			`gFloat3.add = float3_add_impl;`
			`gFloat3.sub = float3_sub_impl;`
			`gFloat3.mul = float3_mul_impl;`
			`gFloat3.mul_scalar = float3_mul_scalar_impl;`
			`gFloat3.div = float3_div_impl;`
			`gFloat3.div_scalar = float3_div_scalar_impl;`
			`gFloat3.dot = float3_dot_avx;`
			`gFloat3.length = float3_length_impl;`
			`gFloat3.lengthSquared = float3_length_squared_impl;`
			`gFloat3.normalize = float3_normalize_impl;`
			`gFloat3.normalize_mag = float3_normalize_mag_impl;`
			`gFloat3.lerp = float3_lerp_impl;`
all float3 and float4 functions and isas completed all options of float3 and float4 functions in isas and math_c neon still to be done but that will be on mac. 2026-02-27 11:28:51 +00:00			`gFloat3.cross = float3_cross_impl;`
further sse simd additions avx2 float3 added added normalize_magnitude added divide fast to float3 may copy to float4 move static spheremesh to drawSphere (initialize on first use) so platform has a chance to load the math backend 2026-02-26 21:11:31 +00:00			`}`
			`}`