;/****************************************************************************
; * $Id$
; *
; ***************************************************************************/

BITS 32

; cglobal <symbol>
; Declares <symbol> as a global (exported) label.
; When PREFIX is defined, the exported name gets a leading underscore and
; every later use of <symbol> in this file is redirected to the underscored
; name through a single-line %define.
%macro cglobal 1
	%ifndef PREFIX
		global %1
	%else
		global _%1
		%define %1 _%1
	%endif
%endmacro

;=============================================================================
; Read only data
;=============================================================================

SECTION .rodata data align=16

ALIGN 16
; Four 16-bit words, each equal to 1 (not referenced by the routines visible in this file).
mmx_one:
	dw 1, 1, 1, 1
; Selects byte 7 of a quadword, i.e. the pixel at the highest address of an
; 8-pixel group. The 0xFF must sit in the LAST byte, not the first one
; (an earlier revision had it in byte 0; the fix is credited to Fizick).
mask:
	times 7 db 0
	db 255
; Eight bytes, each equal to 1: isolates bit 0 of every byte.
mmx_ones:
	db 1, 1, 1, 1, 1, 1, 1, 1
; Four words of 0x00FF: keeps the low (even-addressed) byte of every word.
mask4:
	dw 255, 255, 255, 255
	
;=============================================================================
; Macros
;=============================================================================


;=============================================================================
; Code
;=============================================================================

SECTION .text

; Exported entry points. cglobal makes each label global and, when PREFIX is
; defined, exports it with a leading underscore.
cglobal VerticalBilin_iSSE
cglobal HorizontalBilin_iSSE
cglobal DiagonalBilin_iSSE
cglobal RB2F_iSSE

;
;void VerticalBilin_iSSE(unsigned char *pDst, const unsigned char *pSrc,
;                        int nDstPitch, int nSrcPitch,
;                        int nWidth, int nHeight);
;
; Vertical half-pixel interpolation.
;   rows 0 .. nHeight-2 : dst[y][x] = (src[y][x] + src[y+1][x] + 1) >> 1  (pavgb)
;   row  nHeight-1      : dst[y][x] = src[y][x]                           (plain copy)
;
; Arguments are read from the stack (32-bit, caller cleans up).
; Preserved: esi, edi, ebx.  Clobbered: eax, ecx, edx, mm1, mm2, flags.
; Pixels are handled 8 at a time and at least one group is processed per row,
; so every row is read and written in multiples of 8 bytes.
; nHeight == 1 produces only the copied row; nHeight <= 0 writes nothing.
;

ALIGN 16
VerticalBilin_iSSE:

		push esi
		push edi
		push ebx

		; return address + 3 saved registers => first argument at [esp + 16]
		mov edi, [esp + 16]		; pDst
		mov esi, [esp + 20]		; pSrc, current source row
		mov edx, [esp + 28]		; nSrcPitch
		mov ecx, [esp + 36]		; nHeight
		lea eax, [esi + edx]		; source row below the current one

		; Without this guard "dec ecx" would wrap for nHeight <= 1 and the
		; do-while row loop below would run ~2^32 times.
		cmp ecx, 1
		jl .done			; nHeight <= 0: nothing to do
		je .lastrow			; nHeight == 1: only the copied row
		dec ecx				; nHeight - 1 rows are averaged

.loopy:
		xor ebx, ebx			; ebx = x offset inside the row

.loopx:
		movq mm1, [esi+ebx]		; 8 pixels of row y
		movq mm2, [eax+ebx]		; 8 pixels of row y+1
		pavgb mm1, mm2			; rounded byte average
		movq [edi+ebx], mm1
		add ebx, 8
		cmp ebx, [esp + 32]		; nWidth
		jl .loopx

		add eax, edx			; advance both source rows
		add esi, edx
		add edi, [esp + 24]		; nDstPitch
		dec ecx
		jnz .loopy

.lastrow:
		; the last row has no row below it: copy it unchanged
		xor ebx, ebx

.loopfinal:
		movq mm1, [esi+ebx]
		movq [edi+ebx], mm1
		add ebx, 8
		cmp ebx, [esp + 32]		; nWidth
		jl .loopfinal

.done:
		emms				; leave the FPU/MMX state clean for the caller

		pop ebx
		pop edi
		pop esi

		ret
	
;
;void HorizontalBilin_iSSE(unsigned char *pDst, const unsigned char *pSrc,
;                          int nDstPitch, int nSrcPitch,
;                          int nWidth, int nHeight);
;
; Horizontal half-pixel interpolation:
;   dst[y][x] = (src[y][x] + src[y][x+1] + 1) >> 1          (pavgb)
; The last 8-pixel group of every row is handled separately so that nothing
; past offset nWidth-1 is read; there the right neighbour of the final pixel
; is the pixel itself, so the final column is copied unchanged.
;
; Arguments are read from the stack (32-bit, caller cleans up).
; Preserved: esi, edi, ebx.  Clobbered: ecx, edx, mm1-mm3, mm7, flags.
; Pixels are handled 8 at a time; the edge group is always processed, so a
; row is at least 8 bytes wide. nHeight <= 0 writes nothing.
;

ALIGN 16
HorizontalBilin_iSSE:

		push esi
		push edi
		push ebx

		; return address + 3 saved registers => first argument at [esp + 16]
		mov edi, [esp + 16]		; pDst
		mov esi, [esp + 20]		; pSrc
		mov edx, [esp + 32]		; nWidth
		sub edx, 8			; edx = offset of the last 8-pixel group
		mov ecx, [esp + 36]		; nHeight
		test ecx, ecx
		jle .done			; no rows: "dec ecx / jnz" would wrap otherwise
		movq mm7, [mask]		; 0xFF in byte 7 only

.loopy:
		xor ebx, ebx			; ebx = x offset inside the row
		test edx, edx
		jle .edge			; nWidth <= 8: the row is just the edge group

.loopx:
		movq mm1, [esi+ebx]		; p[x]   .. p[x+7]
		movq mm2, [esi+ebx+1]		; p[x+1] .. p[x+8]
		pavgb mm1, mm2
		movq [edi+ebx], mm1
		add ebx, 8
		cmp ebx, edx
		jl .loopx

.edge:
		movq mm1, [esi+ebx]		; p0 .. p7 (last group of the row)
		movq mm2, mm1
		movq mm3, mm1
		psrlq mm2, 8			; p1 .. p7, 0
		pand mm3, mm7			; 0 .. 0, p7
		por mm3, mm2			; p1 .. p7, p7  (last pixel duplicated)
		pavgb mm1, mm3			; must average with mm3: mm2 has 0 in the last byte
		movq [edi+ebx], mm1

		add esi, [esp + 28]		; nSrcPitch
		add edi, [esp + 24]		; nDstPitch
		dec ecx
		jnz .loopy

.done:
		emms				; leave the FPU/MMX state clean for the caller

		pop ebx
		pop edi
		pop esi

		ret
        
        
;
;void DiagonalBilin_iSSE(unsigned char *pDst, const unsigned char *pSrc,
;                        int nDstPitch, int nSrcPitch,
;                        int nWidth, int nHeight);
;
; Diagonal half-pixel interpolation. With a = src[y][x], b = src[y][x+1],
; c = src[y+1][x], d = src[y+1][x+1]:
;   rows 0 .. nHeight-2 : dst[y][x] = (a + b + c + d + 2) >> 2
;   row  nHeight-1      : dst[y][x] = (a + b + 1) >> 1   (no row below)
; The 4-pixel average is built from three pavgb steps; because pavgb rounds
; up at each step, 1 is subtracted wherever
;   ((a ^ b) | (c ^ d)) & (avg(a,b) ^ avg(c,d)) & 1
; is set, which yields exactly (a + b + c + d + 2) >> 2.
; The last 8-pixel group of every row is handled separately so that nothing
; past offset nWidth-1 is read; there the right neighbour of the final pixel
; is the pixel itself.
;
; Arguments are read from the stack (32-bit, caller cleans up).
; Preserved: esi, edi, ebx.  Clobbered: eax, ecx, edx, mm0-mm7, flags.
; Pixels are handled 8 at a time; the edge group is always processed, so a
; row is at least 8 bytes wide. nHeight <= 0 writes nothing.
;

ALIGN 16
DiagonalBilin_iSSE:

		push esi
		push edi
		push ebx

		; return address + 3 saved registers => first argument at [esp + 16]
		mov edi, [esp + 16]		; pDst
		mov esi, [esp + 20]		; pSrc, current source row
		mov eax, [esp + 28]		; nSrcPitch
		add eax, esi			; eax = source row below the current one
		mov edx, [esp + 32]		; nWidth
		sub edx, 8			; edx = offset of the last 8-pixel group
		mov ecx, [esp + 36]		; nHeight

		movq mm7, [mmx_ones]		; 0x01 in every byte
		movq mm6, [mask]		; 0xFF in byte 7 only

		; Without this guard "dec ecx" would wrap for nHeight <= 1 and the
		; do-while row loop below would run ~2^32 times.
		cmp ecx, 1
		jl near .done			; nHeight <= 0: nothing to do
		je near .lastrow		; nHeight == 1: only the horizontal-only row
		dec ecx				; nHeight - 1 rows use the row below

.loopy:
		xor ebx, ebx			; ebx = x offset inside the row
		test edx, edx
		jle .edge			; nWidth <= 8: the row is just the edge group

.loopx:
		movq mm0, [esi+ebx]		; a
		movq mm1, [esi+ebx+1]		; b
		movq mm2, [eax+ebx]		; c
		movq mm3, [eax+ebx+1]		; d

		movq mm4, mm0
		movq mm5, mm2
		pxor mm4, mm1			; a ^ b
		pxor mm5, mm3			; c ^ d
		pavgb mm0, mm1			; ab = avg(a, b)
		pavgb mm2, mm3			; cd = avg(c, d)
		por mm4, mm5			; (a ^ b) | (c ^ d)
		movq mm5, mm0
		pxor mm5, mm2			; ab ^ cd
		pand mm4, mm7			; keep bit 0 of every byte
		pand mm4, mm5			; rounding correction, 0 or 1 per byte
		pavgb mm0, mm2			; avg(ab, cd)
		psubb mm0, mm4			; undo the double round-up
		movq [edi+ebx], mm0

		add ebx, 8
		cmp ebx, edx
		jl .loopx

.edge:
		movq mm0, [esi+ebx]		; a: last group of the upper row
		movq mm2, [eax+ebx]		; c: last group of the lower row
		movq mm1, mm0
		movq mm3, mm2
		movq mm4, mm0
		movq mm5, mm2

		pand mm4, mm6			; last pixel of a, still in byte 7
		psrlq mm1, 8			; a shifted one pixel, 0 in byte 7
		pand mm5, mm6			; last pixel of c
		psrlq mm3, 8
		por mm1, mm4			; b = a shifted, last pixel duplicated
		por mm3, mm5			; d = c shifted, last pixel duplicated

		movq mm4, mm0
		movq mm5, mm2
		pxor mm4, mm1			; a ^ b
		pxor mm5, mm3			; c ^ d
		pavgb mm0, mm1			; ab
		pavgb mm2, mm3			; cd
		por mm4, mm5
		movq mm5, mm0
		pxor mm5, mm2			; ab ^ cd
		pand mm4, mm5
		pavgb mm0, mm2			; avg(ab, cd)
		pand mm4, mm7			; rounding correction, 0 or 1 per byte
		psubb mm0, mm4			; subtract, exactly as in the main loop (was paddb)
		movq [edi+ebx], mm0

		add esi, [esp + 28]		; nSrcPitch
		add edi, [esp + 24]		; nDstPitch
		add eax, [esp + 28]		; nSrcPitch
		dec ecx
		jnz .loopy

.lastrow:
		; last row: no row below, horizontal average only
		xor ebx, ebx
		test edx, edx
		jle .final_edge			; nWidth <= 8: the row is just the edge group

.final_loopx:
		movq mm1, [esi+ebx]		; p[x]   .. p[x+7]
		movq mm2, [esi+ebx+1]		; p[x+1] .. p[x+8]
		pavgb mm1, mm2
		movq [edi+ebx], mm1
		add ebx, 8
		cmp ebx, edx
		jl .final_loopx

.final_edge:
		movq mm1, [esi+ebx]		; p0 .. p7 (last group of the row)
		movq mm2, mm1
		movq mm3, mm1
		psrlq mm2, 8			; p1 .. p7, 0
		pand mm3, mm6			; 0 .. 0, p7
		por mm3, mm2			; p1 .. p7, p7  (last pixel duplicated)
		pavgb mm1, mm3			; must average with mm3: mm2 has 0 in the last byte
		movq [edi+ebx], mm1

.done:
		emms				; leave the FPU/MMX state clean for the caller

		pop ebx
		pop edi
		pop esi

		ret
        
;
;void RB2F_iSSE(unsigned char *pDst, const unsigned char *pSrc,
;               int nDstPitch, int nSrcPitch,
;               int nWidth, int nHeight);
;
; Reduce-by-two filter. Every output pixel is the rounded mean of a 2x2
; source block:
;   dst[y][x] = (src[2y][2x] + src[2y][2x+1] +
;                src[2y+1][2x] + src[2y+1][2x+1] + 2) >> 2
; nWidth and nHeight count OUTPUT pixels/rows as far as the loops show: the
; column counter is compared against nWidth while the source is indexed with
; twice that offset, and the source advances two rows per output row.
; The 4-pixel average is built from three pavgb steps; because pavgb rounds
; up at each step, 1 is subtracted wherever
;   ((a ^ b) | (c ^ d)) & (avg(a,b) ^ avg(c,d)) & 1
; is set. Only the even-offset results are kept (mask4) and packed to bytes.
; The last group of 4 output pixels of every row is handled separately so
; that only 8 source bytes are read for it (no read at offset +8).
;
; Arguments are read from the stack (32-bit, caller cleans up).
; Preserved: esi, edi, ebx, ebp.  Clobbered: eax, ecx, edx, mm0-mm7, flags.
; Output is produced 4 pixels at a time; the edge group is always processed,
; so an output row is at least 4 bytes wide. nHeight <= 0 writes nothing.
;

ALIGN 16
RB2F_iSSE:

		push esi
		push edi
		push ebx
		push ebp

		; return address + 4 saved registers => first argument at [esp + 20]
		mov edi, [esp + 20]		; pDst
		mov esi, [esp + 24]		; pSrc, upper row of the current row pair
		mov edx, [esp + 32]		; nSrcPitch
		lea eax, [esi + edx]		; lower row of the current row pair
		lea ebp, [edx + edx]		; ebp = 2 * nSrcPitch: step to the next row pair
		mov edx, [esp + 36]		; nWidth
		sub edx, 4			; edx = offset of the last 4-pixel output group
		mov ecx, [esp + 40]		; nHeight
		test ecx, ecx
		jle near .done			; no rows: "dec ecx / jnz" would wrap otherwise

		movq mm7, [mmx_ones]		; 0x01 in every byte
		pxor mm6, mm6			; zero, upper half for packuswb

.loopy:
		xor ebx, ebx			; ebx = output x offset; source offset is 2*ebx
		test edx, edx
		jle .edge			; nWidth <= 4: the row is just the edge group

.loopx:
		movq mm0, [esi+ebx*2]		; a: upper row, offsets 0..7
		movq mm2, [eax+ebx*2]		; c: lower row, offsets 0..7
		movq mm1, [esi+ebx*2+1]		; b: upper row, offsets 1..8
		movq mm3, [eax+ebx*2+1]		; d: lower row, offsets 1..8

		movq mm4, mm0
		movq mm5, mm2
		pxor mm4, mm1			; a ^ b
		pxor mm5, mm3			; c ^ d
		pavgb mm0, mm1			; ab = avg(a, b)
		pavgb mm2, mm3			; cd = avg(c, d)
		por mm4, mm5			; (a ^ b) | (c ^ d)
		movq mm5, mm0
		pxor mm5, mm2			; ab ^ cd
		pand mm4, mm7			; keep bit 0 of every byte
		pand mm4, mm5			; rounding correction, 0 or 1 per byte
		pavgb mm0, mm2			; avg(ab, cd)
		psubb mm0, mm4			; undo the double round-up

		pand mm0, [mask4]		; keep the results at even offsets only
		packuswb mm0, mm6		; 4 words -> 4 bytes in the low dword
		movd [edi+ebx], mm0

		add ebx, 4
		cmp ebx, edx
		jl .loopx

.edge:
		movq mm0, [esi+ebx*2]		; a: last 8 source pixels of the upper row
		movq mm2, [eax+ebx*2]		; c: last 8 source pixels of the lower row
		movq mm1, mm0
		movq mm3, mm2
		movq mm4, mm0
		movq mm5, mm2

		; shift instead of loading at +1; byte 7 becomes 0, but it only
		; feeds an odd-offset result that mask4 discards
		psrlq mm1, 8			; b
		psrlq mm3, 8			; d

		pxor mm4, mm1			; a ^ b
		pxor mm5, mm3			; c ^ d
		pavgb mm0, mm1			; ab
		pavgb mm2, mm3			; cd
		por mm4, mm5
		movq mm5, mm0
		pxor mm5, mm2			; ab ^ cd
		pand mm4, mm7
		pand mm4, mm5			; rounding correction, 0 or 1 per byte
		pavgb mm0, mm2			; avg(ab, cd)
		psubb mm0, mm4			; undo the double round-up

		pand mm0, [mask4]		; keep the results at even offsets only
		packuswb mm0, mm6
		movd [edi+ebx], mm0

		add esi, ebp			; next source row pair
		add edi, [esp + 28]		; nDstPitch
		add eax, ebp
		dec ecx
		jnz .loopy

.done:
		emms				; leave the FPU/MMX state clean for the caller

		pop ebp
		pop ebx
		pop edi
		pop esi

		ret
          
