;
; NASM implementation for terrain/blender.cc
;

segment .data

delta_a     times 2 dd 0
delta_b     times 2 dd 0
delta_c     times 2 dd 0
delta_d     times 2 dd 0

alpha_a0    times 2 dd 0
alpha_b0    times 2 dd 0
alpha_c0    times 2 dd 0
alpha_d0    times 2 dd 0
alpha_a1    times 2 dd 0
alpha_b1    times 2 dd 0
alpha_c1    times 2 dd 0
alpha_d1    times 2 dd 0
alpha_a2    times 2 dd 0
alpha_b2    times 2 dd 0
alpha_c2    times 2 dd 0
alpha_d2    times 2 dd 0
alpha_a3    times 2 dd 0
alpha_b3    times 2 dd 0
alpha_c3    times 2 dd 0
alpha_d3    times 2 dd 0

ldelt_a     times 2 dd 0
ldelt_b     times 2 dd 0
ldelt_c     times 2 dd 0
ldelt_d     times 2 dd 0
rdelt_a     times 2 dd 0
rdelt_b     times 2 dd 0
rdelt_c     times 2 dd 0
rdelt_d     times 2 dd 0

zero        times 2 dd 0

; FIXME: get back to this
; redLightMask   times 2 dd 0xf800000000000000
; greenLightMask times 2 dd 0x07c0000000000000
; blueLightMask  times 2 dd 0x003e000000000000
; bluePackMask   times 2 dd 0x003e0000000000000
; greenPackMask  times 2 dd 0x07c0000000000000
; redPackMask    times 2 dd 0x00f8000000000000
redLightMask    times 2 dd 0
greenLightMask  times 2 dd 0
blueLightMask   times 2 dd 0
bluePackMask    times 2 dd 0
greenPackMask   times 2 dd 0
redPackMask     times 2 dd 0

rdeltq      times 2 dd 0
ldeltq      times 2 dd 0

; loop counters -- accessed as dwords below, so they must be dd, not dw
ix          dd 0
iy          dd 0

lpoints     times 4 dd 0

texelsPerLumelShift dd 0
texelsPerLumel      dd 0
texelsPerLumelDiv2  dd 0

segment .text

; global macros
%define dst        [ebp+8]
%define sq_shift   [ebp+12]
%define aoff       [ebp+16]
%define bmp_ptrs   [ebp+20]
%define alpha_ptrs [ebp+24]

;
; void doSquare4( U32 *dst,
;                 int sq_shift,
;                 int *aoff,
;                 U32 **bmp_ptrs,
;                 U8 **alpha_ptrs )
;
global doSquare4
doSquare4:
   ; prologue
   push ebp
   mov ebp, esp

   ; setup ix, iy
   mov eax, 1
   mov cl, sq_shift
   shl eax, cl
   mov dword [iy], eax
   shr eax, 1
   mov dword [ix], eax

   ; actual code from blender.cc
   movd mm1, sq_shift

   ; get alpha values for the corners of the square for each texture type.
   ; replicate the values into 4 words of the qwords.  Also calc vertical
   ; stepping values for the alpha values on left and right edges.
   ;    load alpha value into bh to mul by 256 for precision.  then
   ;    punpcklwd mm0, mm0 followed by punpckldq mm0, mm0
   ;    to replicate the low word into all words of mm0.
   ;    shift down difference by sqshift to divide by pixels per square to get
   ;    increment.
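   ; Note on the fixed-point math (as the code below actually does it):
   ; each 8-bit alpha is shifted left by 7 (not 8), presumably to keep the
   ; value positive for the signed pmulhw, giving 0..0x7F80 (255 -> 0x7F80,
   ; roughly 0.996 as an s0.15 fraction).  pmulhw yields (texel * alpha) >> 16,
   ; e.g. (0x00FF * 0x7F80) >> 16 = 0x7F, and the paddb mm0, mm0 at the end of
   ; the pixel loop doubles that back up to ~0xFE.
   ; aoff[0..3] appear to be the byte offsets of the square's four corners in
   ; the alpha maps, 0/1 forming the top edge and 2/3 the bottom edge, since
   ; the vertical deltas below are (corner2 - corner0) >> sq_shift and
   ; (corner3 - corner1) >> sq_shift.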
   mov esi, aoff
   mov edi, alpha_ptrs

   ; texture a: corner alphas and vertical edge deltas
   mov eax, [edi]
   mov edx, eax
   add eax, [esi]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm2, ebx
   punpcklwd mm2, mm2
   add eax, [esi+8]
   punpckldq mm2, mm2
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm0, ebx
   punpcklwd mm0, mm0
   punpckldq mm0, mm0
   movq [alpha_a0], mm2
   psubw mm0, mm2
   add eax, [esi+4]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm4, ebx
   punpcklwd mm4, mm4
   add eax, [esi+12]
   punpckldq mm4, mm4
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   movd mm3, ebx
   movq [alpha_a1], mm4
   punpcklwd mm3, mm3
   punpckldq mm3, mm3
   psraw mm0, mm1
   psubw mm3, mm4
   movq [ldelt_a], mm0
   psraw mm3, mm1
   movq [rdelt_a], mm3

   ; texture b
   mov eax, [edi+4]
   mov edx, eax
   add eax, [esi]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm2, ebx
   punpcklwd mm2, mm2
   add eax, [esi+8]
   punpckldq mm2, mm2
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm0, ebx
   punpcklwd mm0, mm0
   punpckldq mm0, mm0
   movq [alpha_b0], mm2
   psubw mm0, mm2
   add eax, [esi+4]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm4, ebx
   punpcklwd mm4, mm4
   add eax, [esi+12]
   punpckldq mm4, mm4
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   movd mm3, ebx
   movq [alpha_b1], mm4
   punpcklwd mm3, mm3
   punpckldq mm3, mm3
   psraw mm0, mm1
   psubw mm3, mm4
   movq [ldelt_b], mm0
   psraw mm3, mm1
   movq [rdelt_b], mm3

   ; texture c
   mov eax, [edi+8]
   mov edx, eax
   add eax, [esi]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm2, ebx
   punpcklwd mm2, mm2
   add eax, [esi+8]
   punpckldq mm2, mm2
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm0, ebx
   punpcklwd mm0, mm0
   punpckldq mm0, mm0
   movq [alpha_c0], mm2
   movq [alpha_c2], mm0
   psubw mm0, mm2
   add eax, [esi+4]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm4, ebx
   punpcklwd mm4, mm4
   add eax, [esi+12]
   punpckldq mm4, mm4
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   movd mm3, ebx
   movq [alpha_c1], mm4
   punpcklwd mm3, mm3
   punpckldq mm3, mm3
   psraw mm0, mm1
   psubw mm3, mm4
   movq [ldelt_c], mm0
   psraw mm3, mm1
   movq [rdelt_c], mm3

   ; texture d
   mov eax, [edi+12]
   mov edx, eax
   add eax, [esi]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm2, ebx
   punpcklwd mm2, mm2
   add eax, [esi+8]
   punpckldq mm2, mm2
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm0, ebx
   punpcklwd mm0, mm0
   punpckldq mm0, mm0
   movq [alpha_d0], mm2
   movq [alpha_d2], mm0
   psubw mm0, mm2
   add eax, [esi+4]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm4, ebx
   punpcklwd mm4, mm4
   add eax, [esi+12]
   punpckldq mm4, mm4
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   movd mm3, ebx
   movq [alpha_d1], mm4
   punpcklwd mm3, mm3
   punpckldq mm3, mm3
   psraw mm0, mm1
   psubw mm3, mm4
   movq [ldelt_d], mm0
   psraw mm3, mm1
   movq [rdelt_d], mm3

   ; load the four source bitmap pointers and the starting alphas
   mov esi, bmp_ptrs
   mov eax, [esi]
   mov ebx, [esi+4]
   mov ecx, [esi+8]
   mov edx, [esi+12]
   movq mm0, [alpha_a1]
   movq mm2, [alpha_b1]
   movq mm3, [alpha_c1]
   movq mm4, [alpha_a0]
   movq mm5, [alpha_b0]
   movq mm6, [alpha_c0]
   movq mm7, [alpha_d0]
   mov edi, dst

yloop4:
   ; mm1 should be sq_shift at this point

   ; calculate alpha step increments...word-size steps are replicated
   ; to fill qword.
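   ; Register use in yloop4/xloop4 (as set up above):
   ;   eax, ebx, ecx, edx = texel pointers for textures a..d (bmp_ptrs[0..3])
   ;   edi                = dst
   ;   mm4..mm7           = current left-edge alphas for a..d, stepped by delta_*
   ;   mm0, mm2, mm3      = right-edge alphas for a, b, c ([alpha_d1] is
   ;                        reloaded at the top of each row)
   ;   mm1                = sq_shift
   ; xloop4 blends two 32-bit texels per pass, so [ix] = (1 << sq_shift) / 2.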
   psubw mm0, mm4
   psraw mm0, mm1          ;mm0 = (right-left) >> sq_shift
   movq [delta_a], mm0     ;delta = ainc ainc ainc ainc
   psubw mm2, mm5
   psraw mm2, mm1          ;mm0 = (right-left) >> sq_shift
   movq [delta_b], mm2     ;delta = ainc ainc ainc ainc
   psubw mm3, mm6
   psraw mm3, mm1          ;mm0 = (right-left) >> sq_shift
   movq [delta_c], mm3     ;delta = ainc ainc ainc ainc
   movq mm0, [alpha_d1]
   psubw mm0, mm7
   psraw mm0, mm1          ;mm0 = (right-left) >> sq_shift
   movq [delta_d], mm0     ;delta = ainc ainc ainc ainc

   mov esi, [ix]
   pxor mm2, mm2

xloop4:
   movq mm0, [eax]
   movq mm1, mm0
   punpcklbw mm0, mm2
   pmulhw mm0, mm4
   paddw mm4, [delta_a]
   punpckhbw mm1, mm2
   pmulhw mm1, mm4
   paddw mm4, [delta_a]
   packuswb mm0, mm1

   movq mm3, [ebx]
   movq mm1, mm3
   punpcklbw mm3, mm2
   pmulhw mm3, mm5
   paddw mm5, [delta_b]
   punpckhbw mm1, mm2
   pmulhw mm1, mm5
   paddw mm5, [delta_b]
   packuswb mm3, mm1
   paddb mm0, mm3

   movq mm3, [ecx]
   movq mm1, mm3
   punpcklbw mm3, mm2
   pmulhw mm3, mm6
   paddw mm6, [delta_c]
   punpckhbw mm1, mm2
   pmulhw mm1, mm6
   paddw mm6, [delta_c]
   packuswb mm3, mm1
   paddb mm0, mm3

   movq mm3, [edx]
   movq mm1, mm3
   punpcklbw mm3, mm2
   pmulhw mm3, mm7
   paddw mm7, [delta_d]
   punpckhbw mm1, mm2
   pmulhw mm1, mm7
   paddw mm7, [delta_d]
   packuswb mm3, mm1
   paddb mm0, mm3

   ; double result, to make up for alpha vals being signed (max = 127)
   ; so our math turns out a bit short, example:
   ;    (0x7f00 * 0xff) >> 16 = 0x7e....* 2 = 252...not quite 255
   ;    would have been (0xff00 * 0xff) >> 16 = 0xfe = 254,
   ;    if I could do an unsigned pmulhw...
   ;    pmulhuw is in an intel document I found, but doesn't compile....
   paddb mm0, mm0

   movq [edi], mm0

   add eax, 8
   add ebx, 8
   add ecx, 8
   add edx, 8
   add edi, 8
   dec esi
   jnz near xloop4

   movq mm4, [alpha_a0]
   paddw mm4, [ldelt_a]
   movq [alpha_a0], mm4
   movq mm5, [alpha_b0]
   paddw mm5, [ldelt_b]
   movq [alpha_b0], mm5
   movq mm6, [alpha_c0]
   paddw mm6, [ldelt_c]
   movq [alpha_c0], mm6
   movq mm7, [alpha_d0]
   paddw mm7, [ldelt_d]
   movq [alpha_d0], mm7
   movq mm0, [alpha_d1]
   paddw mm0, [rdelt_d]
   movq [alpha_d1], mm0
   movq mm2, [alpha_b1]
   paddw mm2, [rdelt_b]
   movq [alpha_b1], mm2
   movq mm3, [alpha_c1]
   paddw mm3, [rdelt_c]
   movq [alpha_c1], mm3
   movq mm0, [alpha_a1]
   paddw mm0, [rdelt_a]
   movq [alpha_a1], mm0

   movd mm1, sq_shift      ; top of loop expects this
   dec dword [iy]
   jnz near yloop4

   emms

   ; epilogue
   pop ebp
   ret

;
; void doSquare3( U32 *dst,
;                 int sq_shift,
;                 int *aoff,
;                 U32 **bmp_ptrs,
;                 U8 **alpha_ptrs )
;
global doSquare3
doSquare3:
   ; prologue
   push ebp
   mov ebp, esp

   ; setup ix, iy
   mov eax, 1
   mov cl, sq_shift
   shl eax, cl
   mov dword [iy], eax
   shr eax, 1
   mov dword [ix], eax

   movd mm1, sq_shift

   ; get alpha values for the corners of the square for each texture type.
   ; replicate the values into 4 words of the qwords.  Also calc vertical
   ; stepping values for the alpha values on left and right edges.
   ;    load alpha value into bh to mul by 256 for precision.  then
   ;    punpcklwd mm0, mm0 followed by punpckldq mm0, mm0
   ;    to replicate the low word into all words of mm0.
   ;    shift down difference by sqshift to divide by pixels per square to get
   ;    increment.
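   ; Same corner setup as doSquare4, but for three textures (a..c); the fourth
   ; layer is simply absent, and the inner loop keeps delta_a in mm7 instead of
   ; re-reading it from memory each pass.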
   mov esi, aoff
   mov edi, alpha_ptrs

   ; texture a: corner alphas and vertical edge deltas
   mov eax, [edi]
   mov edx, eax
   add eax, [esi]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm2, ebx
   punpcklwd mm2, mm2
   add eax, [esi+8]
   punpckldq mm2, mm2
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm0, ebx
   punpcklwd mm0, mm0
   punpckldq mm0, mm0
   movq [alpha_a0], mm2
   psubw mm0, mm2
   add eax, [esi+4]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm4, ebx
   punpcklwd mm4, mm4
   add eax, [esi+12]
   punpckldq mm4, mm4
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   movd mm3, ebx
   movq [alpha_a1], mm4
   punpcklwd mm3, mm3
   punpckldq mm3, mm3
   psraw mm0, mm1
   psubw mm3, mm4
   movq [ldelt_a], mm0
   psraw mm3, mm1
   movq [rdelt_a], mm3

   ; texture b
   mov eax, [edi+4]
   mov edx, eax
   add eax, [esi]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm2, ebx
   punpcklwd mm2, mm2
   add eax, [esi+8]
   punpckldq mm2, mm2
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm0, ebx
   punpcklwd mm0, mm0
   punpckldq mm0, mm0
   movq [alpha_b0], mm2
   psubw mm0, mm2
   add eax, [esi+4]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm4, ebx
   punpcklwd mm4, mm4
   add eax, [esi+12]
   punpckldq mm4, mm4
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   movd mm3, ebx
   movq [alpha_b1], mm4
   punpcklwd mm3, mm3
   punpckldq mm3, mm3
   psraw mm0, mm1
   psubw mm3, mm4
   movq [ldelt_b], mm0
   psraw mm3, mm1
   movq [rdelt_b], mm3

   ; texture c
   mov eax, [edi+8]
   mov edx, eax
   add eax, [esi]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm2, ebx
   punpcklwd mm2, mm2
   add eax, [esi+8]
   punpckldq mm2, mm2
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm0, ebx
   punpcklwd mm0, mm0
   punpckldq mm0, mm0
   movq [alpha_c0], mm2
   movq [alpha_c2], mm0
   psubw mm0, mm2
   add eax, [esi+4]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm4, ebx
   punpcklwd mm4, mm4
   add eax, [esi+12]
   punpckldq mm4, mm4
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   movd mm3, ebx
   movq [alpha_c1], mm4
   punpcklwd mm3, mm3
   punpckldq mm3, mm3
   psraw mm0, mm1
   psubw mm3, mm4
   movq [ldelt_c], mm0
   psraw mm3, mm1
   movq [rdelt_c], mm3

   ; load the three source bitmap pointers and the starting alphas
   mov esi, bmp_ptrs
   mov eax, [esi]
   mov ebx, [esi+4]
   mov ecx, [esi+8]
   movq mm0, [alpha_a1]
   movq mm2, [alpha_b1]
   movq mm3, [alpha_c1]
   movq mm4, [alpha_a0]
   movq mm5, [alpha_b0]
   movq mm6, [alpha_c0]
   mov edi, dst

yloop3:
   ; mm1 should be sq_shift at this point
   ; mm0 should be [alpha_a1]
   ; mm2 should be [alpha_b1]
   ; mm3 should be [alpha_c1]

   ; calculate alpha step increments...word-size steps are replicated
   ; to fill qword.
   psubw mm0, mm4
   psraw mm0, mm1          ;mm0 = (right-left) >> sq_shift
   movq [delta_a], mm0     ;delta = ainc ainc ainc ainc
   psubw mm2, mm5
   psraw mm2, mm1          ;mm0 = (right-left) >> sq_shift
   movq [delta_b], mm2     ;delta = ainc ainc ainc ainc
   psubw mm3, mm6
   psraw mm3, mm1          ;mm0 = (right-left) >> sq_shift
   movq [delta_c], mm3     ;delta = ainc ainc ainc ainc

   mov esi, [ix]
   pxor mm2, mm2
   movq mm7, [delta_a]

xloop3:
   movq mm0, [eax]
   movq mm1, mm0
   punpcklbw mm0, mm2
   pmulhw mm0, mm4
   paddw mm4, mm7
   punpckhbw mm1, mm2
   pmulhw mm1, mm4
   paddw mm4, mm7
   packuswb mm0, mm1

   movq mm3, [ebx]
   movq mm1, mm3
   punpcklbw mm3, mm2
   pmulhw mm3, mm5
   paddw mm5, [delta_b]
   punpckhbw mm1, mm2
   pmulhw mm1, mm5
   paddw mm5, [delta_b]
   packuswb mm3, mm1
   paddb mm0, mm3

   movq mm3, [ecx]
   movq mm1, mm3
   punpcklbw mm3, mm2
   pmulhw mm3, mm6
   paddw mm6, [delta_c]
   punpckhbw mm1, mm2
   pmulhw mm1, mm6
   paddw mm6, [delta_c]
   packuswb mm3, mm1
   paddb mm0, mm3

   paddb mm0, mm0
   movq [edi], mm0

   add eax, 8
   add ebx, 8
   add ecx, 8
   add edi, 8
   dec esi
   jnz near xloop3

   movq mm4, [alpha_a0]
   paddw mm4, [ldelt_a]
   movq [alpha_a0], mm4
   movq mm5, [alpha_b0]
   paddw mm5, [ldelt_b]
   movq [alpha_b0], mm5
   movq mm6, [alpha_c0]
   paddw mm6, [ldelt_c]
   movq [alpha_c0], mm6
   movq mm2, [alpha_b1]
   paddw mm2, [rdelt_b]
   movq [alpha_b1], mm2
   movq mm3, [alpha_c1]
   paddw mm3, [rdelt_c]
   movq [alpha_c1], mm3
   movq mm0, [alpha_a1]
   paddw mm0, [rdelt_a]
   movq [alpha_a1], mm0

   movd mm1, sq_shift      ; top of loop expects this
   dec dword [iy]
   jnz near yloop3

   emms

   ; epilogue
   pop ebp
   ret

;
; void doSquare2( U32 *dst,
;                 int sq_shift,
;                 int *aoff,
;                 U32 **bmp_ptrs,
;                 U8 **alpha_ptrs )
;
global doSquare2
doSquare2:
   ; prologue
   push ebp
   mov ebp, esp

   ; setup ix, iy
   mov eax, 1
   mov cl, sq_shift
   shl eax, cl
   mov dword [iy], eax
   shr eax, 1
   mov dword [ix], eax

   movd mm1, sq_shift

   ; get alpha values for the corners of the square for each texture type.
   ; replicate the values into 4 words of the qwords.  Also calc vertical
   ; stepping values for the alpha values on left and right edges.
   ;    punpcklwd mm0, mm0 followed by punpckldq mm0, mm0
   ;    to replicate the low word into all words of mm0.
   ;    shift down difference by sqshift to divide by pixels per square to get
   ;    increment.
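   ; Two-texture variant: with only two layers there are enough free MMX
   ; registers to keep both row deltas resident (mm6 = delta_a, mm7 = delta_b),
   ; so the inner loop only touches memory for the texels and the destination.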
   mov esi, aoff
   mov edi, alpha_ptrs

   ; texture a: corner alphas and vertical edge deltas
   mov eax, [edi]
   mov edx, eax
   add eax, [esi]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm2, ebx
   punpcklwd mm2, mm2
   add eax, [esi+8]
   punpckldq mm2, mm2
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm0, ebx
   punpcklwd mm0, mm0
   punpckldq mm0, mm0
   movq [alpha_a0], mm2
   psubw mm0, mm2
   add eax, [esi+4]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm4, ebx
   punpcklwd mm4, mm4
   add eax, [esi+12]
   punpckldq mm4, mm4
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   movd mm3, ebx
   movq [alpha_a1], mm4
   punpcklwd mm3, mm3
   punpckldq mm3, mm3
   psraw mm0, mm1
   psubw mm3, mm4
   movq [ldelt_a], mm0
   psraw mm3, mm1
   movq [rdelt_a], mm3

   ; texture b
   mov eax, [edi+4]
   mov edx, eax
   add eax, [esi]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm2, ebx
   punpcklwd mm2, mm2
   add eax, [esi+8]
   punpckldq mm2, mm2
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm0, ebx
   punpcklwd mm0, mm0
   punpckldq mm0, mm0
   movq [alpha_b0], mm2
   psubw mm0, mm2
   add eax, [esi+4]
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   mov eax, edx
   movd mm4, ebx
   punpcklwd mm4, mm4
   add eax, [esi+12]
   punpckldq mm4, mm4
   xor ebx, ebx
   mov bl, [eax]
   shl ebx, 7
   movd mm3, ebx
   movq [alpha_b1], mm4
   punpcklwd mm3, mm3
   punpckldq mm3, mm3
   psraw mm0, mm1
   psubw mm3, mm4
   movq [ldelt_b], mm0
   psraw mm3, mm1
   movq [rdelt_b], mm3

   ; load the two source bitmap pointers and the starting alphas
   mov esi, bmp_ptrs
   mov eax, [esi]
   mov ebx, [esi+4]
   movq mm0, [alpha_a1]
   movq mm2, [alpha_b1]
   movq mm4, [alpha_a0]
   movq mm5, [alpha_b0]
   mov edi, dst

yloop2:
   ; mm1 should be sq_shift at this point
   ; mm0 should be [alpha_a1]
   ; mm2 should be [alpha_b1]

   ; calculate alpha step increments...word-size steps are replicated
   ; to fill qword.
   psubw mm0, mm4
   psraw mm0, mm1          ;mm0 = (right-left) >> sq_shift
   movq [delta_a], mm0     ;delta = ainc ainc ainc ainc
   psubw mm2, mm5
   psraw mm2, mm1          ;mm0 = (right-left) >> sq_shift
   movq [delta_b], mm2     ;delta = ainc ainc ainc ainc

   mov esi, [ix]
   pxor mm2, mm2
   movq mm6, [delta_a]
   movq mm7, [delta_b]

xloop2:
   movq mm0, [eax]
   movq mm3, [ebx]
   movq mm1, mm0
   punpcklbw mm0, mm2
   pmulhw mm0, mm4
   paddw mm4, mm6
   punpckhbw mm1, mm2
   pmulhw mm1, mm4
   paddw mm4, mm6
   packuswb mm0, mm1

   movq mm1, mm3
   punpcklbw mm3, mm2
   pmulhw mm3, mm5
   paddw mm5, mm7
   punpckhbw mm1, mm2
   pmulhw mm1, mm5
   paddw mm5, mm7
   packuswb mm3, mm1
   paddb mm0, mm3

   paddb mm0, mm0
   movq [edi], mm0

   add edi, 8
   add eax, 8
   add ebx, 8
   dec esi
   jnz xloop2

   movq mm4, [alpha_a0]
   paddw mm4, [ldelt_a]
   movq [alpha_a0], mm4
   movq mm5, [alpha_b0]
   paddw mm5, [ldelt_b]
   movq [alpha_b0], mm5
   movq mm2, [alpha_b1]
   paddw mm2, [rdelt_b]
   movq [alpha_b1], mm2
   movq mm0, [alpha_a1]
   paddw mm0, [rdelt_a]
   movq [alpha_a1], mm0

   movd mm1, sq_shift      ; top of loop expects this
   dec dword [iy]
   jnz near yloop2

   emms

   ; epilogue
   pop ebp
   ret

global setupLumel
setupLumel:
   ; prologue
   push ebp
   mov ebp, esp

   ; we only need to load the high bits up, they're already 0
   ; in the low bits
   mov dword [redLightMask], 0xf8000000
   mov dword [greenLightMask], 0x07c00000
   mov dword [blueLightMask], 0x003e0000
   mov dword [bluePackMask], 0x003e0000
   mov dword [greenPackMask], 0x07c00000
   mov dword [redPackMask], 0x00f80000

   ; epilogue
   pop ebp
   ret

;
; void doLumel( U16 *dstptr,
;               U32 *srcptr,
;               int nextdstrow,
;               int nextsrcrow )
;
%define dstptr     [ebp+8]
%define srcptr     [ebp+12]
%define nextdstrow [ebp+16]
%define nextsrcrow [ebp+20]

global doLumel
doLumel:
   ; prologue
   push ebp
   mov ebp, esp

   movd mm7, [texelsPerLumelShift]

   movd mm0, [lpoints]
   movq mm4, mm0
   pand mm0, [redLightMask]
   movq mm5, mm4
   pand mm4, [greenLightMask]
   psllq mm0, 31
   pand mm5, [blueLightMask]
   psllq mm4, 20
   paddw mm0, mm4
   psllq mm5, 9
   paddw mm0, mm5          ; mm0 = 0000rrrrggggbbbb qword for lp[0]
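   ; Each lumel light point is expanded the same way: the masks set up in
   ; setupLumel select three adjacent 5-bit fields (red, green, blue) from the
   ; light-point dword, and the psllq/paddw sequence moves each field into the
   ; upper bits of its own 16-bit word so pmulhw can scale texel channels by
   ; the light color.  lp[0]/lp[2] give the left edge and its per-row step
   ; (ldeltq), lp[1]/lp[3] the right edge and its step (rdeltq).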
   movd mm1, [lpoints+8]   ; get lp2
   movq mm4, mm1
   pand mm1, [redLightMask]
   movq mm5, mm4
   pand mm4, [greenLightMask]
   psllq mm1, 31
   pand mm5, [blueLightMask]
   psllq mm4, 20
   paddw mm1, mm4
   psllq mm5, 9
   paddw mm1, mm5          ; mm1 = 0000rrrrggggbbbb qword for lp[2]

   psubw mm1, mm0
   psraw mm1, mm7
   movq [ldeltq], mm1

   movd mm2, [lpoints+4]   ; get lp[1]
   movq mm4, mm2
   pand mm2, [redLightMask]
   movq mm5, mm4
   pand mm4, [greenLightMask]
   psllq mm2, 31
   pand mm5, [blueLightMask]
   psllq mm4, 20
   paddw mm2, mm4
   psllq mm5, 9
   paddw mm2, mm5          ; mm2 = 0000rrrrggggbbbb qword for lp[1]

   movd mm3, [lpoints+12]  ; get lp3
   movq mm4, mm3
   pand mm3, [redLightMask]
   movq mm5, mm4
   pand mm4, [greenLightMask]
   psllq mm3, 31
   pand mm5, [blueLightMask]
   psllq mm4, 20
   paddw mm3, mm4
   psllq mm5, 9
   paddw mm3, mm5          ; mm3 = 0000rrrrggggbbbb qword for lp[3]

   psubw mm3, mm2
   psraw mm3, mm7
   movq [rdeltq], mm3

   mov edi, dstptr
   mov esi, srcptr
   pxor mm6, mm6

   mov eax, [texelsPerLumel]  ; yloop count
   cmp eax, 1
   jne not_special

   ; special case for 1x1 lumel
   movd mm4, [esi]
   punpcklbw mm4, mm6      ; mm6 is expected to be 0 here
   pmulhw mm4, mm0
   paddw mm4, mm4
   movq mm7, mm4
   movq mm6, mm4
   psrlq mm4, 34
   pand mm7, [redPackMask]
   psrlq mm6, 13
   ; MASKALPHA
   ; pand mm4, [bluePackMask]
   psllq mm7, 8
   pand mm6, [greenPackMask]
   paddw mm4, mm7
   paddw mm4, mm6
   movd eax, mm4
   mov [edi], ax
   jmp done

not_special:
   ; mm0 = left at loop start
   ; mm2 = right
yloopL:
   movq mm1, mm0           ;mm1 = start
   movq mm3, mm2
   psubw mm3, mm0
   psraw mm3, mm7          ; mm3 = delta

   mov ebx, [texelsPerLumelDiv2]  ; loop count

xloopL:
   movq mm4, [esi]
   movq mm5, mm4
   punpcklbw mm4, mm6      ; mm6 is expected to be 0 here
   pmulhw mm4, mm1
   paddw mm1, mm3
   punpckhbw mm5, mm6
   pmulhw mm5, mm1
   paddw mm1, mm3
   paddw mm4, mm4
   paddw mm5, mm5

   movq mm7, mm4
   movq mm6, mm4
   psrlq mm4, 34
   pand mm7, [redPackMask]
   psrlq mm6, 13
   ; MASKALPHA
   ; pand mm4, [bluePackMask]
   psllq mm7, 8
   pand mm6, [greenPackMask]
   paddw mm4, mm7
   paddw mm4, mm6

   movq mm7, mm5
   movq mm6, mm5
   psrlq mm5, 34
   pand mm7, [redPackMask]
   psrlq mm6, 13
   ; MASKALPHA
   ; pand mm4, [bluePackMask]
   psllq mm7, 8
   pand mm6, [greenPackMask]
   paddw mm5, mm7
   paddw mm5, mm6
   psllq mm5, 16           ; lazy, I reused code above and must now shift
   paddw mm4, mm5

   ; write 2 16-bit pixels.  I'd consider doing 4 at a time, but
   ; loop count can be as small as 1 at lowest detail already.
   ; could do separate loop I guess.
   movd [edi], mm4
   pxor mm6, mm6

   add edi, 4
   add esi, 8
   dec ebx
   jnz near xloopL

   movd mm7, [texelsPerLumelShift]
   paddw mm0, [ldeltq]
   paddw mm2, [rdeltq]
   add edi, nextdstrow
   add esi, nextsrcrow
   dec eax
   jnz near yloopL

done:
   emms

   ; epilogue
   pop ebp
   ret