;/****************************************************************************
; * $Id$
; *
; ***************************************************************************/

BITS 32

;-----------------------------------------------------------------------------
; cglobal <name>
;
; Declares <name> as a global (exported) symbol.
;   - PREFIX not defined: the symbol is exported as written.
;   - PREFIX defined:     the symbol is exported as _<name>, and <name> is
;                         redefined to expand to _<name>, so later uses of
;                         the plain name (including the label definition)
;                         refer to the underscore-prefixed symbol.
;-----------------------------------------------------------------------------
%macro cglobal 1
	%ifndef PREFIX
		global %1
	%else
		global _%1
		%define %1 _%1
	%endif
%endmacro

;=============================================================================
; Read only data
;=============================================================================

SECTION .rodata data align=16

; mmx_one: four 16-bit words, each equal to 1 (8 bytes), placed on a
; 16-byte boundary.
; NOTE(review): nothing in the code visible here references this constant.
ALIGN 16
mmx_one:
	dw 1, 1, 1, 1
	
;=============================================================================
; Macros
;=============================================================================


;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal Sad4x8_iSSE

;-----------------------------------------------------------------------------
;
; unsigned int Sad4x8_iSSE(const unsigned char * cur,
;						   const unsigned char * ref,
;						   const unsigned int cpitch,
;						   const unsigned int rpitch);
;
; Sum of absolute differences between two blocks that are 4 bytes wide and
; 8 rows high.
;
; In:    [esp+ 4] cur     first byte of row 0 of the first block
;        [esp+ 8] ref     first byte of row 0 of the second block
;        [esp+12] cpitch  byte distance between consecutive rows of cur
;        [esp+16] rpitch  byte distance between consecutive rows of ref
;        (offsets are relative to esp on entry; arguments are read from
;        the stack)
; Out:   eax = sum over 8 rows x 4 bytes of |cur - ref|
; Saved: ebx, esi, edi (pushed on entry, restored before return)
; Clobb: ecx, edx, mm0-mm7
;
; Exactly 4 bytes are read from each row (movd), so bytes beyond column 3
; are never touched. movd into an MMX register zero-extends to 64 bits, so
; psadbw sees zeros in the upper four byte lanes of both operands and they
; contribute nothing to the sum; no explicit clearing of the registers is
; needed.
;
; No emms is executed here: the MMX state is left in use on return, so the
; caller must issue emms before running x87 floating-point code.
;
;-----------------------------------------------------------------------------

ALIGN 16
Sad4x8_iSSE:
	push ebx
	push esi
	push edi

	; Three registers were pushed, so the arguments now start at esp+12+4.
	mov eax, [esp+12+ 4]		; cur
	mov ebx, [esp+12+ 8]		; ref
	mov ecx, [esp+12+12]		; cpitch
	mov edx, [esp+12+16]		; rpitch

	lea esi, [ecx+2*ecx]		; 3*cpitch
	lea edi, [edx+2*edx]		; 3*rpitch

	; ---- rows 0..3 --------------------------------------------------------
	movd   mm0, [eax]			; cur row 0
	movd   mm2, [ebx]			; ref row 0
	movd   mm1, [eax+ecx]		; cur row 1
	movd   mm3, [ebx+edx]		; ref row 1

	psadbw mm0, mm2				; SAD row 0
	psadbw mm1, mm3				; SAD row 1

	movd   mm4, [eax+2*ecx]		; cur row 2
	movd   mm5, [ebx+2*edx]		; ref row 2
	movd   mm6, [eax+esi]		; cur row 3
	movd   mm7, [ebx+edi]		; ref row 3

	psadbw mm4, mm5				; SAD row 2
	psadbw mm6, mm7				; SAD row 3

	paddd  mm0, mm1				; 0+1
	paddd  mm4, mm6				; 2+3
	paddd  mm0, mm4				; 0+1+2+3

	; ---- rows 4 and 6 (reachable with the 1x and 3x pitch multiples) ------
	movd   mm3, [eax+4*ecx]		; cur row 4
	movd   mm2, [ebx+4*edx]		; ref row 4
	movd   mm7, [eax+2*esi]		; cur row 6 (2 * 3*cpitch)
	movd   mm5, [ebx+2*edi]		; ref row 6 (2 * 3*rpitch)

	psadbw mm2, mm3				; SAD row 4
	psadbw mm5, mm7				; SAD row 6
	paddd  mm2, mm5				; 4+6

	; ---- rows 5 and 7: build the remaining odd pitch multiples ------------
	lea esi, [esi+2*ecx]		; 5*cpitch
	lea edi, [edi+2*edx]		; 5*rpitch
	lea ecx, [esi+2*ecx]		; 7*cpitch (ecx no longer holds cpitch)
	lea edx, [edi+2*edx]		; 7*rpitch (edx no longer holds rpitch)

	movd   mm6, [eax+esi]		; cur row 5
	movd   mm4, [ebx+edi]		; ref row 5
	movd   mm5, [eax+ecx]		; cur row 7
	movd   mm1, [ebx+edx]		; ref row 7

	psadbw mm6, mm4				; SAD row 5
	psadbw mm5, mm1				; SAD row 7

	; ---- final reduction --------------------------------------------------
	paddd  mm0, mm2				; 0+1+2+3+4+6
	paddd  mm5, mm6				; 5+7
	paddd  mm0, mm5				; 0+1+2+3+4+5+6+7

	pop edi
	pop esi
	pop ebx

	movd   eax, mm0				; return the total
	ret