engine/platformLinux/mSolver_ASM.asm
2024-01-07 04:36:33 +00:00

388 lines
5.4 KiB
NASM

;
; ASM implementations of mSolver code
;
; externs
extern mSolveCubic_c__FffffPf
; data
segment .data
; float constants
f_quarter dd 0.25
f_half dd 0.5
f_n_half dd -0.5
f_two dd 2.0
f_n_three dd -3.0
f_four dd 4.0
f_n_four dd -4.0
f_n_six dd -6.0
f_eight dd 8.0
; integer constants
i_n_one dd 1
; variables
i dd 0
nreal dd 0
w dd 0
b0 dd 0
b1 dd 0
b2 dd 0
d0 dd 0
d1 dd 0
h dd 0
t dd 0
z dd 0
px dd 0
cubeA dd 0
cubeB dd 0
cubeC dd 0
cubeD dd 0
p dd 0
q dd 0
dis dd 0
phi dd 0
segment .text
; functions
;
; U32 mSolveQuartic_ASM(F32 A, F32 B, F32 C, F32 D, F32 E, F32* x)
;
%define A [ebp+8]
%define B [ebp+12]
%define C [ebp+16]
%define D [ebp+20]
%define E [ebp+24]
%define x [ebp+28]
global mSolveQuartic_ASM
mSolveQuartic_ASM:
push ebp
mov ebp, esp
; if (A==0.0)
; return mSolveCubic(B, C, D, E, x);
fld dword A
fldz
fcompp
fnstsw ax
and ah, 68
xor ah, 64
jne .L1
push dword x
push dword E
push dword D
push dword C
push dword B
call mSolveCubic_c__FffffPf
add esp, byte 20
jmp .L2
.L1:
; w = B/(4*A); /* offset */
fld dword A
fmul dword [f_four]
fld dword B
fxch
fdivp st1, st0
fstp dword [w]
; b2 = -6*square(w) + C/A; /* coeffs. of shifted polynomial */
fld dword [w]
fmul st0, st0
fld dword [f_n_six]
fmulp st1, st0
fld dword C
fld dword A
fdivp st1, st0
faddp st1, st0
fstp dword [b2]
; b1 = (8*square(w) - 2*C/A)*w + D/A;
fld dword [w]
fmul st0, st0
fmul dword [f_eight]
fld dword C
fld dword A
fdivp st1, st0
fmul dword [f_two]
fsubp st1, st0
fmul dword [w]
fld dword D
fld dword A
fdivp st1, st0
faddp st1, st0
fstp dword [b1]
; b0 = ((-3*square(w) + C/A)*w - D/A)*w + E/A;
fld dword [w]
fmul st0, st0
fmul dword [f_n_three]
fld dword C
fld dword A
fdivp st1, st0
faddp st1, st0
fmul dword [w]
fld dword D
fld dword A
fdivp st1, st0
fsubp st1, st0
fmul dword [w]
fld dword E
fld dword A
fdivp st1, st0
faddp st1, st0
fstp dword [b0]
; // cubic resolvent
; F32 cubeA = 1.0;
; F32 cubeB = b2;
; F32 cubeC = -4 * b0;
; F32 cubeD = square(b1) - 4 * b0 * b2;
fld1
fstp dword [cubeA]
fld dword [b2]
fstp dword [cubeB]
fld dword [b0]
fmul dword [f_n_four]
fstp dword [cubeC]
fld dword [b1]
fmul st0, st0
fld dword [f_four]
fmul dword [b0]
fmul dword [b2]
fsubp st1, st0
fstp dword [cubeD]
; mSolveCubic(cubeA, cubeB, cubeC, cubeD, x);
push dword x
push dword [cubeD]
push dword [cubeC]
push dword [cubeB]
push dword [cubeA]
call mSolveCubic_c__FffffPf
add esp, byte 20
mov eax, dword x
; z = x[0];
fld dword [eax+0]
fstp dword [z]
; nreal = 0;
xor dword [nreal], 0
; px = x;
mov dword [px], eax
; t = mSqrt(0.25 * square(z) - b0);
fld dword [z]
fmul st0, st0
fmul dword [f_quarter]
fsub dword [b0]
fsqrt
fstp dword [t]
; for (i=-1; i<=1; i+=2) {
mov ecx, dword [i_n_one]
.L3
; d0 = -0.5*z + i*t; /* coeffs. of quadratic factor */
fld dword [f_n_half]
fmul dword [z]
fld dword [i]
fmul dword [t]
faddp st1, st0
fstp dword [d0]
; d1 = (t!=0.0)? -i*0.5*b1/t : i*mSqrt(-z - b2);
fild dword [i]
fld dword [t]
fldz
fcompp
fnstsw ax
;; if t == 0.0 jump to .L4
;; if C3 is set, jump to .L4
;; if ah & C3 is not-zero, jump to .L4
and ah, 64
jnz .L4
fchs
fmul dword [f_half]
fmul dword [b1]
fdiv dword [t]
jmp .L5
.L4:
fld dword [z]
fchs
fsub dword [b2]
fsqrt
fmulp st1, st0
.L5:
fstp dword [d1]
; h = 0.25 * square(d1) - d0;
fld dword [d1]
fmul st0, st0
fmul dword [f_quarter]
fsub dword [d0]
fst dword [h]
; if (h>=0.0) {
; h = mSqrt(h);
; nreal += 2;
; *px = -0.5*d1 - h - w;
; px++;
; *px = -0.5*d1 + h - w;
; px++;
; }
fldz
fcomp st1
; if h < 0.0, jump to .L6
; if C0 is 0, jump to .L6
; if ( ah & 0x1 ) == 0.0, jump to .L6
fnstsw ax
and ah, 0x1
jz .L6
fsqrt
fstp dword [h]
add dword [nreal], 2
fld dword [d1]
fmul dword [f_n_half]
fsub dword [h]
fsub dword [w]
fstp dword [px]
mov eax, dword [px]
add eax, 4
mov dword [px], eax
fld dword [d1]
fmul dword [f_n_half]
fadd dword [h]
fsub dword [w]
fstp dword [px]
mov eax, dword [px]
add eax, 4
mov dword [px], eax
; }
.L6:
add ecx, 2
cmp ecx, 1
jle near .L3
; // sort results in ascending order
; if (nreal == 4)
; {
mov ecx, dword [nreal]
cmp ecx, 4
jnz near .L7
mov ecx, x
; if (x[2] < x[0])
fld dword [ecx]
fld dword [ecx+8]
fcom st1
fnstsw ax
and ah, 0x1
jz .L8
; swap(x[0], x[2]);
fstp dword [ecx]
fstp dword [ecx+8]
jmp .L9
.L8:
fstp st0
fstp st0
.L9:
; if (x[3] < x[1])
fld dword [ecx+4]
fld dword [ecx+12]
fcom st1
fnstsw ax
and ah, 0x1
jz .L10
; swap(x[1], x[3]);
fstp dword [ecx+4]
fstp dword [ecx+12]
jmp .L11
.L10:
fstp st0
fstp st0
.L11:
; if (x[1] < x[0])
fld dword [ecx]
fld dword [ecx+4]
fcom st1
fnstsw ax
and ah, 0x1
jz .L12
; swap(x[0], x[1]);
fstp dword [ecx]
fstp dword [ecx+4]
jmp .L13
.L12:
fstp st0
fstp st0
.L13:
; if (x[3] < x[2])
fld dword [ecx+12]
fld dword [ecx+8]
fcom st1
fnstsw ax
and ah, 0x1
jz .L14
; swap(x[2], x[3]);
fstp dword [ecx+12]
fstp dword [ecx+8]
jmp .L15
.L14:
fstp st0
fstp st0
.L15:
; if (x[2] < x[1])
fld dword [ecx+4]
fld dword [ecx+8]
fcom st1
fnstsw ax
and ah, 0x1
jz .L16
; swap(x[1], x[2]);
fstp dword [ecx+4]
fstp dword [ecx+8]
jmp .L17
.L16:
fstp st0
fstp st0
.L17
; }
;
.L7
; return(nreal);
mov eax, dword [nreal]
.L2:
pop ebp
ret