;/****************************************************************************
; * $Id$
; *
; ***************************************************************************/

BITS 32

; cglobal <symbol>
; Exports <symbol> as a global label. When PREFIX is defined, the exported
; name is _<symbol> and <symbol> is aliased to it, so the rest of the file
; can keep using the undecorated name.
%macro cglobal 1
	%ifndef PREFIX
		global %1
	%else
		global _%1
		%define %1 _%1
	%endif
%endmacro

; __emms expands to the `emms` instruction only when __DEBUG is defined;
; in every other build it expands to nothing.
%ifndef __DEBUG
	%define __emms
%else
	%define __emms emms
%endif

;=============================================================================
; Read only data
;=============================================================================

SECTION .rodata data align=16

; mmx_one: four packed 16-bit words, each equal to 1 (8 bytes, 16-byte aligned).
ALIGN 16
mmx_one:
	dw 1, 1, 1, 1
	
;=============================================================================
; Macros
;=============================================================================


;=============================================================================
; Code
;=============================================================================

SECTION .text

; VAR8x8
; Accumulates, over the 8 rows of an 8x8 byte block, the sum of absolute
; differences (psadbw) between each row and a reference row of 8 bytes.
;
; In:
;   esi  points to the first row of the block
;   ebx  pitch (byte distance between two consecutive rows)
;   ecx  three times the pitch (used to address the 4th and 8th rows)
;   mm7  reference bytes for rows 1, 3, 5, 7 (8 x db mean, or 0 for a plain sum)
;   mm6  reference bytes for rows 2, 4, 6, 8 (same contents as mm7 in practice)
; Out:
;   mm0  total of the eight psadbw results
; Clobbers:
;   mm1..mm5; esi is advanced by four rows (esi + 4 * ebx)
%macro VAR8x8 0
    pxor mm0, mm0				; 0 --> mm0 = sum1 (rows 1, 3, 5, 7)
    pxor mm1, mm1				; 0 --> mm1 = sum2 (rows 2, 4, 6, 8)
    
    movq mm2, [esi]				; first line
    movq mm3, [esi + ebx]		; second line
    
    movq mm4, [esi + ebx * 2]   ; third line
    movq mm5, [esi + ecx]		; fourth line (ecx = 3 * pitch)
    
    psadbw mm2, mm7
    psadbw mm3, mm6
    
    psadbw mm4, mm7
    psadbw mm5, mm6
    
    paddw mm0, mm2
    paddw mm1, mm3
    
    ; Step to the fifth row. The base register must be part of the address:
    ; a bare [4 * ebx] would replace the pointer by 4 * pitch.
    lea esi, [esi + 4 * ebx]
    
    movq mm2, [esi]				; fifth line
    movq mm3, [esi + ebx]		; sixth line
    
    paddw mm0, mm4
    paddw mm1, mm5
    
    movq mm4, [esi + 2 * ebx]	; seventh line
    movq mm5, [esi + ecx]		; eighth line
    
    psadbw mm2, mm7
    psadbw mm3, mm6
    
    paddw mm0, mm2
    paddw mm1, mm3
    
    psadbw mm4, mm7
    psadbw mm5, mm6
    
    paddw mm0, mm4
    paddw mm1, mm5
    
    paddw mm0, mm1				; mm0 <-- sum1 + sum2
%endmacro


cglobal Var8x8_iSSE


;-----------------------------------------------------------------------------
;
; unsigned int Var8x8_iSSE(const unsigned char *pSrc,
;						   int nPitch,
;						   int *pLuma);
;
; Two passes over the 8x8 byte block at pSrc (rows nPitch bytes apart):
;   1. sum of the 64 pixels, stored to *pLuma;
;      mean = (sum + 32) / 64
;   2. sum of |pixel - mean| over the 64 pixels, returned in eax
;
; In:    arguments on the stack ([esp + 4], [esp + 8], [esp + 12] on entry)
; Out:   eax = sum of absolute differences to the mean
; Saved: ebx, esi, edi (pushed / popped here)
; Clobb: ecx, mm0..mm7, flags
; NOTE(review): no emms is executed before returning; presumably the caller
; is responsible for it -- confirm.
;
;-----------------------------------------------------------------------------

ALIGN 16
Var8x8_iSSE:

    push ebx
    push esi
    push edi

    mov esi, [esp + 12 + 4]		; pSrc --> esi
    mov ebx, [esp + 12 + 8]		; nPitch --> ebx
    mov edi, [esp + 12 + 12]	; pLuma --> edi
    
    lea ecx, [ebx + ebx * 2]	; 3 * nPitch --> ecx
    
    ; first pass: with a zero reference, psadbw yields the plain pixel sum
    pxor mm7, mm7				; 0 --> mm7
    pxor mm6, mm6				; 0 --> mm6
    
    VAR8x8
    
    movd eax, mm0				; sum --> eax
    mov [edi], eax				; *pLuma <-- sum (write through the pointer)
    
    ; MMX instructions take no immediate operand, so the rounding is done
    ; in a general-purpose register.
    add eax, 32
    shr eax, 6					; eax <-- (sum + 32) / 64 = mean
    movd mm0, eax
    
    pshufw mm1, mm0, 0			; mm1 contains 4 times the mean (one per word)
    
    movq mm7, mm1
    mov esi, [esp + 12 + 4]		; pSrc --> esi (VAR8x8 moved it)
    psllq mm7, 8				; move each mean into the high byte of its word
    
    por mm7, mm1				; mm7 contains 8 times the mean
    movq mm6, mm7				; mm6 <-- 8 times db mean
    
    ; second pass: sum of absolute differences against the mean
    VAR8x8
    
    movd eax, mm0				; return value
    
    pop edi
    pop esi
    pop ebx
    
    ret
    