mirror of
https://github.com/tribes2/engine.git
synced 2026-01-20 03:34:48 +00:00
388 lines
5.4 KiB
NASM
388 lines
5.4 KiB
NASM
;
|
|
; ASM implementations of mSolver code
|
|
;
|
|
|
|
; externs
|
|
extern mSolveCubic_c__FffffPf
|
|
|
|
; data
|
|
segment .data
|
|
|
|
; float constants
|
|
f_quarter dd 0.25
|
|
f_half dd 0.5
|
|
f_n_half dd -0.5
|
|
f_two dd 2.0
|
|
f_n_three dd -3.0
|
|
f_four dd 4.0
|
|
f_n_four dd -4.0
|
|
f_n_six dd -6.0
|
|
f_eight dd 8.0
|
|
|
|
; integer constants
|
|
i_n_one dd 1
|
|
|
|
; variables
|
|
i dd 0
|
|
nreal dd 0
|
|
w dd 0
|
|
b0 dd 0
|
|
b1 dd 0
|
|
b2 dd 0
|
|
d0 dd 0
|
|
d1 dd 0
|
|
h dd 0
|
|
t dd 0
|
|
z dd 0
|
|
px dd 0
|
|
cubeA dd 0
|
|
cubeB dd 0
|
|
cubeC dd 0
|
|
cubeD dd 0
|
|
|
|
p dd 0
|
|
q dd 0
|
|
dis dd 0
|
|
phi dd 0
|
|
|
|
segment .text
|
|
|
|
; functions
|
|
|
|
;
|
|
; U32 mSolveQuartic_ASM(F32 A, F32 B, F32 C, F32 D, F32 E, F32* x)
|
|
;
|
|
|
|
%define A [ebp+8]
|
|
%define B [ebp+12]
|
|
%define C [ebp+16]
|
|
%define D [ebp+20]
|
|
%define E [ebp+24]
|
|
%define x [ebp+28]
|
|
|
|
global mSolveQuartic_ASM
|
|
|
|
mSolveQuartic_ASM:
|
|
|
|
push ebp
|
|
mov ebp, esp
|
|
|
|
; if (A==0.0)
|
|
; return mSolveCubic(B, C, D, E, x);
|
|
fld dword A
|
|
fldz
|
|
fcompp
|
|
fnstsw ax
|
|
and ah, 68
|
|
xor ah, 64
|
|
jne .L1
|
|
|
|
push dword x
|
|
push dword E
|
|
push dword D
|
|
push dword C
|
|
push dword B
|
|
call mSolveCubic_c__FffffPf
|
|
add esp, byte 20
|
|
jmp .L2
|
|
|
|
.L1:
|
|
; w = B/(4*A); /* offset */
|
|
fld dword A
|
|
fmul dword [f_four]
|
|
fld dword B
|
|
fxch
|
|
fdivp st1, st0
|
|
fstp dword [w]
|
|
|
|
; b2 = -6*square(w) + C/A; /* coeffs. of shifted polynomial */
|
|
fld dword [w]
|
|
fmul st0, st0
|
|
fld dword [f_n_six]
|
|
fmulp st1, st0
|
|
fld dword C
|
|
fld dword A
|
|
fdivp st1, st0
|
|
faddp st1, st0
|
|
fstp dword [b2]
|
|
|
|
; b1 = (8*square(w) - 2*C/A)*w + D/A;
|
|
fld dword [w]
|
|
fmul st0, st0
|
|
fmul dword [f_eight]
|
|
fld dword C
|
|
fld dword A
|
|
fdivp st1, st0
|
|
fmul dword [f_two]
|
|
fsubp st1, st0
|
|
fmul dword [w]
|
|
fld dword D
|
|
fld dword A
|
|
fdivp st1, st0
|
|
faddp st1, st0
|
|
fstp dword [b1]
|
|
|
|
; b0 = ((-3*square(w) + C/A)*w - D/A)*w + E/A;
|
|
fld dword [w]
|
|
fmul st0, st0
|
|
fmul dword [f_n_three]
|
|
fld dword C
|
|
fld dword A
|
|
fdivp st1, st0
|
|
faddp st1, st0
|
|
fmul dword [w]
|
|
fld dword D
|
|
fld dword A
|
|
fdivp st1, st0
|
|
fsubp st1, st0
|
|
fmul dword [w]
|
|
fld dword E
|
|
fld dword A
|
|
fdivp st1, st0
|
|
faddp st1, st0
|
|
fstp dword [b0]
|
|
|
|
; // cubic resolvent
|
|
; F32 cubeA = 1.0;
|
|
; F32 cubeB = b2;
|
|
; F32 cubeC = -4 * b0;
|
|
; F32 cubeD = square(b1) - 4 * b0 * b2;
|
|
fld1
|
|
fstp dword [cubeA]
|
|
fld dword [b2]
|
|
fstp dword [cubeB]
|
|
fld dword [b0]
|
|
fmul dword [f_n_four]
|
|
fstp dword [cubeC]
|
|
fld dword [b1]
|
|
fmul st0, st0
|
|
fld dword [f_four]
|
|
fmul dword [b0]
|
|
fmul dword [b2]
|
|
fsubp st1, st0
|
|
fstp dword [cubeD]
|
|
|
|
; mSolveCubic(cubeA, cubeB, cubeC, cubeD, x);
|
|
push dword x
|
|
push dword [cubeD]
|
|
push dword [cubeC]
|
|
push dword [cubeB]
|
|
push dword [cubeA]
|
|
call mSolveCubic_c__FffffPf
|
|
add esp, byte 20
|
|
|
|
mov eax, dword x
|
|
; z = x[0];
|
|
fld dword [eax+0]
|
|
fstp dword [z]
|
|
; nreal = 0;
|
|
xor dword [nreal], 0
|
|
; px = x;
|
|
mov dword [px], eax
|
|
; t = mSqrt(0.25 * square(z) - b0);
|
|
fld dword [z]
|
|
fmul st0, st0
|
|
fmul dword [f_quarter]
|
|
fsub dword [b0]
|
|
fsqrt
|
|
fstp dword [t]
|
|
|
|
; for (i=-1; i<=1; i+=2) {
|
|
mov ecx, dword [i_n_one]
|
|
.L3
|
|
; d0 = -0.5*z + i*t; /* coeffs. of quadratic factor */
|
|
fld dword [f_n_half]
|
|
fmul dword [z]
|
|
fld dword [i]
|
|
fmul dword [t]
|
|
faddp st1, st0
|
|
fstp dword [d0]
|
|
; d1 = (t!=0.0)? -i*0.5*b1/t : i*mSqrt(-z - b2);
|
|
fild dword [i]
|
|
fld dword [t]
|
|
fldz
|
|
fcompp
|
|
fnstsw ax
|
|
;; if t == 0.0 jump to .L4
|
|
;; if C3 is set, jump to .L4
|
|
;; if ah & C3 is not-zero, jump to .L4
|
|
and ah, 64
|
|
jnz .L4
|
|
|
|
fchs
|
|
fmul dword [f_half]
|
|
fmul dword [b1]
|
|
fdiv dword [t]
|
|
jmp .L5
|
|
|
|
.L4:
|
|
fld dword [z]
|
|
fchs
|
|
fsub dword [b2]
|
|
fsqrt
|
|
fmulp st1, st0
|
|
|
|
.L5:
|
|
fstp dword [d1]
|
|
|
|
; h = 0.25 * square(d1) - d0;
|
|
fld dword [d1]
|
|
fmul st0, st0
|
|
fmul dword [f_quarter]
|
|
fsub dword [d0]
|
|
fst dword [h]
|
|
|
|
; if (h>=0.0) {
|
|
; h = mSqrt(h);
|
|
; nreal += 2;
|
|
; *px = -0.5*d1 - h - w;
|
|
; px++;
|
|
; *px = -0.5*d1 + h - w;
|
|
; px++;
|
|
; }
|
|
fldz
|
|
fcomp st1
|
|
; if h < 0.0, jump to .L6
|
|
; if C0 is 0, jump to .L6
|
|
; if ( ah & 0x1 ) == 0.0, jump to .L6
|
|
fnstsw ax
|
|
and ah, 0x1
|
|
jz .L6
|
|
|
|
fsqrt
|
|
fstp dword [h]
|
|
add dword [nreal], 2
|
|
|
|
fld dword [d1]
|
|
fmul dword [f_n_half]
|
|
fsub dword [h]
|
|
fsub dword [w]
|
|
fstp dword [px]
|
|
mov eax, dword [px]
|
|
add eax, 4
|
|
mov dword [px], eax
|
|
fld dword [d1]
|
|
fmul dword [f_n_half]
|
|
fadd dword [h]
|
|
fsub dword [w]
|
|
fstp dword [px]
|
|
mov eax, dword [px]
|
|
add eax, 4
|
|
mov dword [px], eax
|
|
|
|
; }
|
|
.L6:
|
|
add ecx, 2
|
|
cmp ecx, 1
|
|
jle near .L3
|
|
|
|
; // sort results in ascending order
|
|
; if (nreal == 4)
|
|
; {
|
|
mov ecx, dword [nreal]
|
|
cmp ecx, 4
|
|
jnz near .L7
|
|
|
|
mov ecx, x
|
|
|
|
; if (x[2] < x[0])
|
|
fld dword [ecx]
|
|
fld dword [ecx+8]
|
|
fcom st1
|
|
fnstsw ax
|
|
and ah, 0x1
|
|
jz .L8
|
|
|
|
; swap(x[0], x[2]);
|
|
fstp dword [ecx]
|
|
fstp dword [ecx+8]
|
|
jmp .L9
|
|
|
|
.L8:
|
|
fstp st0
|
|
fstp st0
|
|
|
|
.L9:
|
|
; if (x[3] < x[1])
|
|
fld dword [ecx+4]
|
|
fld dword [ecx+12]
|
|
fcom st1
|
|
fnstsw ax
|
|
and ah, 0x1
|
|
jz .L10
|
|
|
|
; swap(x[1], x[3]);
|
|
fstp dword [ecx+4]
|
|
fstp dword [ecx+12]
|
|
jmp .L11
|
|
|
|
.L10:
|
|
fstp st0
|
|
fstp st0
|
|
|
|
.L11:
|
|
; if (x[1] < x[0])
|
|
fld dword [ecx]
|
|
fld dword [ecx+4]
|
|
fcom st1
|
|
fnstsw ax
|
|
and ah, 0x1
|
|
jz .L12
|
|
|
|
; swap(x[0], x[1]);
|
|
fstp dword [ecx]
|
|
fstp dword [ecx+4]
|
|
jmp .L13
|
|
|
|
.L12:
|
|
fstp st0
|
|
fstp st0
|
|
|
|
.L13:
|
|
; if (x[3] < x[2])
|
|
fld dword [ecx+12]
|
|
fld dword [ecx+8]
|
|
fcom st1
|
|
fnstsw ax
|
|
and ah, 0x1
|
|
jz .L14
|
|
|
|
; swap(x[2], x[3]);
|
|
fstp dword [ecx+12]
|
|
fstp dword [ecx+8]
|
|
jmp .L15
|
|
|
|
.L14:
|
|
fstp st0
|
|
fstp st0
|
|
|
|
.L15:
|
|
; if (x[2] < x[1])
|
|
fld dword [ecx+4]
|
|
fld dword [ecx+8]
|
|
fcom st1
|
|
fnstsw ax
|
|
and ah, 0x1
|
|
jz .L16
|
|
|
|
; swap(x[1], x[2]);
|
|
fstp dword [ecx+4]
|
|
fstp dword [ecx+8]
|
|
jmp .L17
|
|
|
|
.L16:
|
|
fstp st0
|
|
fstp st0
|
|
|
|
.L17
|
|
; }
|
|
;
|
|
|
|
.L7
|
|
; return(nreal);
|
|
mov eax, dword [nreal]
|
|
|
|
.L2:
|
|
pop ebp
|
|
ret
|