engine/platformLinux/linuxMath_ASM.asm

1015 lines
15 KiB
NASM
Raw Normal View History

2024-01-07 04:36:33 +00:00
;
; Assembler FPU fun. Naive implementations
;
; TODO:
; - pipeline with fxch
; - remove redundant loads by leaving on stack
;
segment .data
; scratch array
array times 16 dd 0
; inverse of determinant
inverse_determinant dd 0
segment .text
;
; void m_quatF_set_matF_ASM( F32 x, F32 y, F32 z, F32 w, F32* m )
;
%define in_x [ebp+8]
%define in_y [ebp+12]
%define in_z [ebp+16]
%define in_w [ebp+20]
%define in_m [ebp+24]
%define xs [eax+0]
%define ys [eax+4]
%define zs [eax+8]
%define wx [eax+12]
%define wy [eax+16]
%define wz [eax+20]
%define xx [eax+24]
%define xy [eax+28]
%define xz [eax+32]
%define yy [eax+36]
%define yz [eax+40]
%define zz [eax+44]
global m_quatF_set_matF_ASM
m_quatF_set_matF_ASM:
push ebp
mov ebp, esp
; f -> eax, m -> ecx
mov eax, array
mov ecx, in_m
; xs = x * 2.0f
fld dword in_x
fadd st0, st0
fstp dword xs
; ys = y * 2.0f
fld dword in_y
fadd st0, st0
fstp dword ys
; zs = z * 2.0f
fld dword in_z
fadd st0, st0
fstp dword zs
; load w
fld dword in_w
; wx = w * xs
fld dword xs
fmul st0, st1
fstp dword wx
; wy = w * ys
fld dword ys
fmul st0, st1
fstp dword wy
; wz = w * zs
fld dword zs
fmul st0, st1
fstp dword wz
; pop w
fstp st0
; load x
fld dword in_x
; xx = x * xs
fld dword xs
fmul st0, st1
fstp dword xx
; xy = x * ys
fld dword ys
fmul st0, st1
fstp dword xy
; xz = x * zs
fld dword zs
fmul st0, st1
fstp dword xz
; pop x
fstp st0
; don't bother loading y once, the pop negates
; yy = y * ys
fld dword in_y
fld dword ys
fmulp st1, st0
fstp dword yy
; yz = y * zs
fld dword in_y
fld dword zs
fmulp st1, st0
fstp dword yz
; and forget z
; zz = z * zs
fld dword in_z
fld dword zs
fmulp st1, st0
fstp dword zz
; m[0] = 1.0f - ( yy + zz )
fld1
fld dword yy
fld dword zz
faddp st1, st0
fsubp st1, st0
fstp dword [ecx+0]
; m[4] = xy - wz
fld dword xy
fld dword wz
fsubp st1, st0
fstp dword [ecx+16]
; m[8] = xz + wy
fld dword xz
fld dword wy
faddp st1, st0
fstp dword [ecx+32]
; m[12] = 0.0f
fldz
fstp dword [ecx+48]
; m[1] = xy + wz
fld dword xy
fld dword wz
faddp st1, st0
fstp dword [ecx+4]
; m[5] = 1.0f - ( xx + zz )
fld1
fld dword xx
fld dword zz
faddp st1, st0
fsubp st1, st0
fstp dword [ecx+20]
; m[9] = yz - wx
fld dword yz
fld dword wx
fsubp st1, st0
fstp dword [ecx+36]
; m[13] = 0.0f
fldz
fstp dword [ecx+52]
; m[2] = xz - wy
fld dword xz
fld dword wy
fsubp st1, st0
fstp dword [ecx+8]
; m[6] = yz + wx
fld dword yz
fld dword wx
faddp st1, st0
fstp dword [ecx+24]
; m[10] = 1.0f - ( xx + yy )
fld1
fld dword xx
fld dword yy
faddp st1, st0
fsubp st1, st0
fstp dword [ecx+40]
; m[14] = 0.0f
; m[3] = 0.0f
; m[7] = 0.0f
; m[11] = 0.0f
fldz
fst dword [ecx+56]
fst dword [ecx+12]
fst dword [ecx+28]
fstp dword [ecx+44]
; m[15] = 1.0f
fld1
fstp dword [ecx+60]
pop ebp
ret
;
; F32 m_matF_determinant_ASM( const F32* m )
;
global m_matF_determinant_ASM
m_matF_determinant_ASM:
%define in_m [ebp+8]
push ebp
mov ebp, esp
mov eax, in_m
fld dword [eax+20]
fld dword [eax+40]
fmulp st1, st0
fld dword [eax+24]
fld dword [eax+36]
fmulp st1, st0
fsubp st1, st0
fld dword [eax+0]
fmulp st1, st0
fld dword [eax+8]
fld dword [eax+36]
fmulp st1, st0
fld dword [eax+4]
fld dword [eax+40]
fmulp st1, st0
fsubp st1, st0
fld dword [eax+16]
fmulp st1, st0
fld dword [eax+4]
fld dword [eax+24]
fmulp st1, st0
fld dword [eax+8]
fld dword [eax+20]
fmulp st1, st0
fsubp st1, st0
fld dword [eax+32]
fmulp st1, st0
faddp st1, st0
faddp st1, st0
; leave it on the FPU stack
pop ebp
ret
;
; void m_matF_adjoint_ASM( F32* dst, const F32* src, F32 invDet )
;
%define in_dst [ebp+8]
%define in_src [ebp+12]
%define in_invDet [ebp+16]
global m_matF_adjoint_ASM
m_matF_adjoint_ASM:
push ebp
mov ebp, esp
mov edx, in_dst
mov eax, in_src
; dst[0] = (src[5] * src[10]- src[6] * src[9]) * invDet;
fld dword [eax+20]
fld dword [eax+40]
fmulp st1, st0
fld dword [eax+24]
fld dword [eax+36]
fmulp st1, st0
fsubp st1, st0
fmul dword in_invDet
fstp dword [edx+0]
; dst[1] = (src[9] * src[2] - src[10]* src[1]) * invDet;
fld dword [eax+36]
fld dword [eax+8]
fmulp st1, st0
fld dword [eax+40]
fld dword [eax+4]
fmulp st1, st0
fsubp st1, st0
fmul dword in_invDet
fstp dword [edx+4]
; dst[2] = (src[1] * src[6] - src[2] * src[5]) * invDet;
fld dword [eax+4]
fld dword [eax+24]
fmulp st1, st0
fld dword [eax+8]
fld dword [eax+20]
fmulp st1, st0
fsubp st1, st0
fmul dword in_invDet
fstp dword [edx+8]
; dst[4] = (src[6] * src[8] - src[4] * src[10])* invDet;
fld dword [eax+24]
fld dword [eax+32]
fmulp st1, st0
fld dword [eax+16]
fld dword [eax+40]
fmulp st1, st0
fsubp st1, st0
fmul dword in_invDet
fstp dword [edx+16]
; dst[5] = (src[10]* src[0] - src[8] * src[2]) * invDet;
fld dword [eax+40]
fld dword [eax+0]
fmulp st1, st0
fld dword [eax+32]
fld dword [eax+8]
fmulp st1, st0
fsubp st1, st0
fmul dword in_invDet
fstp dword [edx+20]
; dst[6] = (src[2] * src[4] - src[0] * src[6]) * invDet;
fld dword [eax+8]
fld dword [eax+16]
fmulp st1, st0
fld dword [eax+0]
fld dword [eax+24]
fmulp st1, st0
fsubp st1, st0
fmul dword in_invDet
fstp dword [edx+24]
; dst[8] = (src[4] * src[9] - src[5] * src[8]) * invDet;
fld dword [eax+16]
fld dword [eax+36]
fmulp st1, st0
fld dword [eax+20]
fld dword [eax+32]
fmulp st1, st0
fsubp st1, st0
fmul dword in_invDet
fstp dword [edx+32]
; dst[9] = (src[8] * src[1] - src[9] * src[0]) * invDet;
fld dword [eax+32]
fld dword [eax+4]
fmulp st1, st0
fld dword [eax+36]
fld dword [eax+0]
fmulp st1, st0
fsubp st1, st0
fmul dword in_invDet
fstp dword [edx+36]
; dst[10]= (src[0] * src[5] - src[1] * src[4]) * invDet;
fld dword [eax+0]
fld dword [eax+20]
fmulp st1, st0
fld dword [eax+4]
fld dword [eax+16]
fmulp st1, st0
fsubp st1, st0
fmul dword in_invDet
fstp dword [edx+40]
pop ebp
ret
;
; m_matF_x_vectorF_ASM( const F32* m, const F32* v, F32* r )
;
%define in_m [ebp+8]
%define in_v [ebp+12]
%define in_r [ebp+16]
global m_matF_x_vectorF_ASM
m_matF_x_vectorF_ASM:
push ebp
mov ebp, esp
mov eax, in_m
mov ecx, in_v
mov edx, in_r
; vresult[0] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2];
fld dword [eax]
fld dword [eax+4]
fmul dword [ecx+4]
fxch
fmul dword [ecx]
fld dword [eax+8]
fmul dword [ecx+8]
fxch st2
faddp st1, st0
faddp st1, st0
fstp dword [edx]
; vresult[1] = m[4]*v[0] + m[5]*v[1] + m[6]*v[2];
fld dword [eax+16]
fld dword [eax+20]
fmul dword [ecx+4]
fxch
fmul dword [ecx]
fld dword [eax+24]
fmul dword [ecx+8]
fxch st2
faddp st1, st0
faddp st1, st0
fstp dword [edx+4]
; vresult[2] = m[8]*v[0] + m[9]*v[1] + m[10]*v[2];
fld dword [eax+32]
fld dword [eax+36]
fmul dword [ecx+4]
fxch
fmul dword [ecx]
fld dword [eax+40]
fmul dword [ecx+8]
fxch st2
faddp st1, st0
faddp st1, st0
fstp dword [edx+8]
pop ebp
ret
;
; void m_matF_inverse_ASM( F32* m )
;
%define in_m [ebp+8]
global m_matF_inverse_ASM
m_matF_inverse_ASM:
push ebp
mov ebp, esp
; calculate determinant
push dword in_m
call m_matF_determinant_ASM
add esp, byte 4
; calculate inverse of determinant
fld1
fxch
fdivp st1, st0
fstp dword [inverse_determinant]
; calculate adjoint matrix
push dword [inverse_determinant]
push dword in_m
push dword array
call m_matF_adjoint_ASM
add esp, byte 12
mov eax, array
mov edx, in_m
; assign back
fld dword [eax]
fstp dword [edx]
fld dword [eax+4]
fstp dword [edx+4]
fld dword [eax+8]
fstp dword [edx+8]
fld dword [eax+16]
fstp dword [edx+16]
fld dword [eax+20]
fstp dword [edx+20]
fld dword [eax+24]
fstp dword [edx+24]
fld dword [eax+32]
fstp dword [edx+32]
fld dword [eax+36]
fstp dword [edx+36]
fld dword [eax+40]
fstp dword [edx+40]
; invert the translation
fld dword [edx+12]
fchs
fstp dword [eax]
fld dword [edx+28]
fchs
fstp dword [eax+4]
fld dword [edx+44]
fchs
fstp dword [eax+8]
mov ecx, eax
add ecx, 16
push dword ecx
push dword eax
push edx
call m_matF_x_vectorF_ASM
add esp, byte 12
mov eax, array
mov edx, in_m
fld dword [eax+16]
fstp dword [edx+12]
fld dword [eax+20]
fstp dword [edx+28]
fld dword [eax+24]
fstp dword [edx+44]
pop ebp
ret
;
; void m_matF_affineInverse_ASM( F32* m )
;
%define in_m [ebp+8]
global m_matF_affineInverse_ASM
m_matF_affineInverse_ASM:
push ebp
mov ebp, esp
mov eax, array
mov edx, in_m
; copy to temp
fld dword [edx]
fstp dword [eax]
fld dword [edx+4]
fstp dword [eax+4]
fld dword [edx+8]
fstp dword [eax+8]
fld dword [edx+12]
fstp dword [eax+12]
fld dword [edx+16]
fstp dword [eax+16]
fld dword [edx+20]
fstp dword [eax+20]
fld dword [edx+24]
fstp dword [eax+24]
fld dword [edx+28]
fstp dword [eax+28]
fld dword [edx+32]
fstp dword [eax+32]
fld dword [edx+36]
fstp dword [eax+36]
fld dword [edx+40]
fstp dword [eax+40]
fld dword [edx+44]
fstp dword [eax+44]
fld dword [edx+48]
fstp dword [eax+48]
fld dword [edx+52]
fstp dword [eax+52]
fld dword [edx+56]
fstp dword [eax+56]
fld dword [edx+60]
fstp dword [eax+60]
; switcheroos
fld dword [eax+16]
fstp dword [edx+4]
fld dword [eax+4]
fstp dword [edx+16]
fld dword [eax+32]
fstp dword [edx+8]
fld dword [eax+8]
fstp dword [edx+32]
fld dword [eax+36]
fstp dword [edx+24]
fld dword [eax+24]
fstp dword [edx+36]
; m[3] = -(temp[0]*temp[3] + temp[4]*temp[7] + temp[8]*temp[11]);
fld dword [eax+0]
fld dword [eax+16]
fmul dword [eax+28]
fxch
fmul dword [eax+12]
fld dword [eax+32]
fmul dword [eax+44]
fxch st2
faddp st1, st0
faddp st1, st0
fchs
fstp dword [edx+12]
; m[7] = -(temp[1]*temp[3] + temp[5]*temp[7] + temp[9]*temp[11]);
fld dword [eax+4]
fld dword [eax+20]
fmul dword [eax+28]
fxch
fmul dword [eax+12]
fld dword [eax+36]
fmul dword [eax+44]
fxch st2
faddp st1, st0
faddp st1, st0
fchs
fstp dword [edx+28]
; m[11] = -(temp[2]*temp[3] + temp[6]*temp[7] + temp[10]*temp[11]);
fld dword [eax+8]
fld dword [eax+24]
fmul dword [eax+28]
fxch
fmul dword [eax+12]
fld dword [eax+40]
fmul dword [eax+44]
fxch st2
faddp st1, st0
faddp st1, st0
fchs
fstp dword [edx+44]
pop ebp
ret
;
; void m_matF_x_matF_ASM( const F32* a, const F32* b, F32* mresult )
;
%define in_a [ebp+8]
%define in_b [ebp+12]
%define in_m [ebp+16]
global m_matF_x_matF_ASM
m_matF_x_matF_ASM:
push ebp
mov ebp, esp
mov edx, in_a
mov eax, in_b
mov ecx, in_m
; mresult[0] = a[0]*b[0] + a[1]*b[4] + a[2]*b[8] + a[3]*b[12];
fld dword [edx]
fld dword [edx+4]
fxch
fmul dword [eax]
fxch
fmul dword [eax+16]
faddp st1, st0
fld dword [edx+8]
fmul dword [eax+32]
faddp st1, st0
fld dword [edx+12]
fmul dword [eax+48]
faddp st1, st0
fstp dword [ecx]
; mresult[1] = a[0]*b[1] + a[1]*b[5] + a[2]*b[9] + a[3]*b[13];
fld dword [edx]
fld dword [edx+4]
fxch
fmul dword [eax+4]
fxch
fmul dword [eax+20]
faddp st1, st0
fld dword [edx+8]
fmul dword [eax+36]
faddp st1, st0
fld dword [edx+12]
fmul dword [eax+52]
faddp st1, st0
fstp dword [ecx+4]
; mresult[2] = a[0]*b[2] + a[1]*b[6] + a[2]*b[10] + a[3]*b[14];
fld dword [edx]
fld dword [edx+4]
fxch
fmul dword [eax+8]
fxch
fmul dword [eax+24]
faddp st1, st0
fld dword [edx+8]
fmul dword [eax+40]
faddp st1, st0
fld dword [edx+12]
fmul dword [eax+56]
faddp st1, st0
fstp dword [ecx+8]
; mresult[3] = a[0]*b[3] + a[1]*b[7] + a[2]*b[11] + a[3]*b[15];
fld dword [edx]
fld dword [edx+4]
fxch
fmul dword [eax+12]
fxch
fmul dword [eax+28]
faddp st1, st0
fld dword [edx+8]
fmul dword [eax+44]
faddp st1, st0
fld dword [edx+12]
fmul dword [eax+60]
faddp st1, st0
fstp dword [ecx+12]
; mresult[4] = a[4]*b[0] + a[5]*b[4] + a[6]*b[8] + a[7]*b[12];
fld dword [edx+16]
fld dword [edx+20]
fxch
fmul dword [eax]
fxch
fmul dword [eax+16]
faddp st1, st0
fld dword [edx+24]
fmul dword [eax+32]
faddp st1, st0
fld dword [edx+28]
fmul dword [eax+48]
faddp st1, st0
fstp dword [ecx+16]
; mresult[5] = a[4]*b[1] + a[5]*b[5] + a[6]*b[9] + a[7]*b[13];
fld dword [edx+16]
fld dword [edx+20]
fxch
fmul dword [eax+4]
fxch
fmul dword [eax+20]
faddp st1, st0
fld dword [edx+24]
fmul dword [eax+36]
faddp st1, st0
fld dword [edx+28]
fmul dword [eax+52]
faddp st1, st0
fstp dword [ecx+20]
; mresult[6] = a[4]*b[2] + a[5]*b[6] + a[6]*b[10] + a[7]*b[14];
fld dword [edx+16]
fld dword [edx+20]
fxch
fmul dword [eax+8]
fxch
fmul dword [eax+24]
faddp st1, st0
fld dword [edx+24]
fmul dword [eax+40]
faddp st1, st0
fld dword [edx+28]
fmul dword [eax+56]
faddp st1, st0
fstp dword [ecx+24]
; mresult[7] = a[4]*b[3] + a[5]*b[7] + a[6]*b[11] + a[7]*b[15];
fld dword [edx+16]
fld dword [edx+20]
fxch
fmul dword [eax+12]
fxch
fmul dword [eax+28]
faddp st1, st0
fld dword [edx+24]
fmul dword [eax+44]
faddp st1, st0
fld dword [edx+28]
fmul dword [eax+60]
faddp st1, st0
fstp dword [ecx+28]
; mresult[8] = a[8]*b[0] + a[9]*b[4] + a[10]*b[8] + a[11]*b[12];
fld dword [edx+32]
fld dword [edx+36]
fxch
fmul dword [eax]
fxch
fmul dword [eax+16]
faddp st1, st0
fld dword [edx+40]
fmul dword [eax+32]
faddp st1, st0
fld dword [edx+44]
fmul dword [eax+48]
faddp st1, st0
fstp dword [ecx+32]
; mresult[9] = a[8]*b[1] + a[9]*b[5] + a[10]*b[9] + a[11]*b[13];
fld dword [edx+32]
fld dword [edx+36]
fxch
fmul dword [eax+4]
fxch
fmul dword [eax+20]
faddp st1, st0
fld dword [edx+40]
fmul dword [eax+36]
faddp st1, st0
fld dword [edx+44]
fmul dword [eax+52]
faddp st1, st0
fstp dword [ecx+36]
; mresult[10]= a[8]*b[2] + a[9]*b[6] + a[10]*b[10] + a[11]*b[14];
fld dword [edx+32]
fld dword [edx+36]
fxch
fmul dword [eax+8]
fxch
fmul dword [eax+24]
faddp st1, st0
fld dword [edx+40]
fmul dword [eax+40]
faddp st1, st0
fld dword [edx+44]
fmul dword [eax+56]
faddp st1, st0
fstp dword [ecx+40]
; mresult[11]= a[8]*b[3] + a[9]*b[7] + a[10]*b[11] + a[11]*b[15];
fld dword [edx+32]
fld dword [edx+36]
fxch
fmul dword [eax+12]
fxch
fmul dword [eax+28]
faddp st1, st0
fld dword [edx+40]
fmul dword [eax+44]
faddp st1, st0
fld dword [edx+44]
fmul dword [eax+60]
faddp st1, st0
fstp dword [ecx+44]
; mresult[12]= a[12]*b[0] + a[13]*b[4] + a[14]*b[8] + a[15]*b[12];
fld dword [edx+48]
fld dword [edx+52]
fxch
fmul dword [eax]
fxch
fmul dword [eax+16]
faddp st1, st0
fld dword [edx+56]
fmul dword [eax+32]
faddp st1, st0
fld dword [edx+60]
fmul dword [eax+48]
faddp st1, st0
fstp dword [ecx+48]
; mresult[13]= a[12]*b[1] + a[13]*b[5] + a[14]*b[9] + a[15]*b[13];
fld dword [edx+48]
fld dword [edx+52]
fxch
fmul dword [eax+4]
fxch
fmul dword [eax+20]
faddp st1, st0
fld dword [edx+56]
fmul dword [eax+36]
faddp st1, st0
fld dword [edx+60]
fmul dword [eax+52]
faddp st1, st0
fstp dword [ecx+52]
; mresult[14]= a[12]*b[2] + a[13]*b[6] + a[14]*b[10] + a[15]*b[14];
fld dword [edx+48]
fld dword [edx+52]
fxch
fmul dword [eax+8]
fxch
fmul dword [eax+24]
faddp st1, st0
fld dword [edx+56]
fmul dword [eax+40]
faddp st1, st0
fld dword [edx+60]
fmul dword [eax+56]
faddp st1, st0
fstp dword [ecx+56]
; mresult[15]= a[12]*b[3] + a[13]*b[7] + a[14]*b[11] + a[15]*b[15];
fld dword [edx+48]
fld dword [edx+52]
fxch
fmul dword [eax+12]
fxch
fmul dword [eax+28]
faddp st1, st0
fld dword [edx+56]
fmul dword [eax+44]
faddp st1, st0
fld dword [edx+60]
fmul dword [eax+60]
faddp st1, st0
fstp dword [ecx+60]
pop ebp
ret