; ; ASM implementations of mSolver code ; ; externs extern mSolveCubic_c__FffffPf ; data segment .data ; float constants f_quarter dd 0.25 f_half dd 0.5 f_n_half dd -0.5 f_two dd 2.0 f_n_three dd -3.0 f_four dd 4.0 f_n_four dd -4.0 f_n_six dd -6.0 f_eight dd 8.0 ; integer constants i_n_one dd 1 ; variables i dd 0 nreal dd 0 w dd 0 b0 dd 0 b1 dd 0 b2 dd 0 d0 dd 0 d1 dd 0 h dd 0 t dd 0 z dd 0 px dd 0 cubeA dd 0 cubeB dd 0 cubeC dd 0 cubeD dd 0 p dd 0 q dd 0 dis dd 0 phi dd 0 segment .text ; functions ; ; U32 mSolveQuartic_ASM(F32 A, F32 B, F32 C, F32 D, F32 E, F32* x) ; %define A [ebp+8] %define B [ebp+12] %define C [ebp+16] %define D [ebp+20] %define E [ebp+24] %define x [ebp+28] global mSolveQuartic_ASM mSolveQuartic_ASM: push ebp mov ebp, esp ; if (A==0.0) ; return mSolveCubic(B, C, D, E, x); fld dword A fldz fcompp fnstsw ax and ah, 68 xor ah, 64 jne .L1 push dword x push dword E push dword D push dword C push dword B call mSolveCubic_c__FffffPf add esp, byte 20 jmp .L2 .L1: ; w = B/(4*A); /* offset */ fld dword A fmul dword [f_four] fld dword B fxch fdivp st1, st0 fstp dword [w] ; b2 = -6*square(w) + C/A; /* coeffs. of shifted polynomial */ fld dword [w] fmul st0, st0 fld dword [f_n_six] fmulp st1, st0 fld dword C fld dword A fdivp st1, st0 faddp st1, st0 fstp dword [b2] ; b1 = (8*square(w) - 2*C/A)*w + D/A; fld dword [w] fmul st0, st0 fmul dword [f_eight] fld dword C fld dword A fdivp st1, st0 fmul dword [f_two] fsubp st1, st0 fmul dword [w] fld dword D fld dword A fdivp st1, st0 faddp st1, st0 fstp dword [b1] ; b0 = ((-3*square(w) + C/A)*w - D/A)*w + E/A; fld dword [w] fmul st0, st0 fmul dword [f_n_three] fld dword C fld dword A fdivp st1, st0 faddp st1, st0 fmul dword [w] fld dword D fld dword A fdivp st1, st0 fsubp st1, st0 fmul dword [w] fld dword E fld dword A fdivp st1, st0 faddp st1, st0 fstp dword [b0] ; // cubic resolvent ; F32 cubeA = 1.0; ; F32 cubeB = b2; ; F32 cubeC = -4 * b0; ; F32 cubeD = square(b1) - 4 * b0 * b2; fld1 fstp dword [cubeA] fld dword [b2] fstp dword [cubeB] fld dword [b0] fmul dword [f_n_four] fstp dword [cubeC] fld dword [b1] fmul st0, st0 fld dword [f_four] fmul dword [b0] fmul dword [b2] fsubp st1, st0 fstp dword [cubeD] ; mSolveCubic(cubeA, cubeB, cubeC, cubeD, x); push dword x push dword [cubeD] push dword [cubeC] push dword [cubeB] push dword [cubeA] call mSolveCubic_c__FffffPf add esp, byte 20 mov eax, dword x ; z = x[0]; fld dword [eax+0] fstp dword [z] ; nreal = 0; xor dword [nreal], 0 ; px = x; mov dword [px], eax ; t = mSqrt(0.25 * square(z) - b0); fld dword [z] fmul st0, st0 fmul dword [f_quarter] fsub dword [b0] fsqrt fstp dword [t] ; for (i=-1; i<=1; i+=2) { mov ecx, dword [i_n_one] .L3 ; d0 = -0.5*z + i*t; /* coeffs. of quadratic factor */ fld dword [f_n_half] fmul dword [z] fld dword [i] fmul dword [t] faddp st1, st0 fstp dword [d0] ; d1 = (t!=0.0)? -i*0.5*b1/t : i*mSqrt(-z - b2); fild dword [i] fld dword [t] fldz fcompp fnstsw ax ;; if t == 0.0 jump to .L4 ;; if C3 is set, jump to .L4 ;; if ah & C3 is not-zero, jump to .L4 and ah, 64 jnz .L4 fchs fmul dword [f_half] fmul dword [b1] fdiv dword [t] jmp .L5 .L4: fld dword [z] fchs fsub dword [b2] fsqrt fmulp st1, st0 .L5: fstp dword [d1] ; h = 0.25 * square(d1) - d0; fld dword [d1] fmul st0, st0 fmul dword [f_quarter] fsub dword [d0] fst dword [h] ; if (h>=0.0) { ; h = mSqrt(h); ; nreal += 2; ; *px = -0.5*d1 - h - w; ; px++; ; *px = -0.5*d1 + h - w; ; px++; ; } fldz fcomp st1 ; if h < 0.0, jump to .L6 ; if C0 is 0, jump to .L6 ; if ( ah & 0x1 ) == 0.0, jump to .L6 fnstsw ax and ah, 0x1 jz .L6 fsqrt fstp dword [h] add dword [nreal], 2 fld dword [d1] fmul dword [f_n_half] fsub dword [h] fsub dword [w] fstp dword [px] mov eax, dword [px] add eax, 4 mov dword [px], eax fld dword [d1] fmul dword [f_n_half] fadd dword [h] fsub dword [w] fstp dword [px] mov eax, dword [px] add eax, 4 mov dword [px], eax ; } .L6: add ecx, 2 cmp ecx, 1 jle near .L3 ; // sort results in ascending order ; if (nreal == 4) ; { mov ecx, dword [nreal] cmp ecx, 4 jnz near .L7 mov ecx, x ; if (x[2] < x[0]) fld dword [ecx] fld dword [ecx+8] fcom st1 fnstsw ax and ah, 0x1 jz .L8 ; swap(x[0], x[2]); fstp dword [ecx] fstp dword [ecx+8] jmp .L9 .L8: fstp st0 fstp st0 .L9: ; if (x[3] < x[1]) fld dword [ecx+4] fld dword [ecx+12] fcom st1 fnstsw ax and ah, 0x1 jz .L10 ; swap(x[1], x[3]); fstp dword [ecx+4] fstp dword [ecx+12] jmp .L11 .L10: fstp st0 fstp st0 .L11: ; if (x[1] < x[0]) fld dword [ecx] fld dword [ecx+4] fcom st1 fnstsw ax and ah, 0x1 jz .L12 ; swap(x[0], x[1]); fstp dword [ecx] fstp dword [ecx+4] jmp .L13 .L12: fstp st0 fstp st0 .L13: ; if (x[3] < x[2]) fld dword [ecx+12] fld dword [ecx+8] fcom st1 fnstsw ax and ah, 0x1 jz .L14 ; swap(x[2], x[3]); fstp dword [ecx+12] fstp dword [ecx+8] jmp .L15 .L14: fstp st0 fstp st0 .L15: ; if (x[2] < x[1]) fld dword [ecx+4] fld dword [ecx+8] fcom st1 fnstsw ax and ah, 0x1 jz .L16 ; swap(x[1], x[2]); fstp dword [ecx+4] fstp dword [ecx+8] jmp .L17 .L16: fstp st0 fstp st0 .L17 ; } ; .L7 ; return(nreal); mov eax, dword [nreal] .L2: pop ebp ret