// MVTOOLS plugin for Avisynth
// Block motion interpolation function 
// Copyright(c)2005 A.G.Balakhnin aka Fizick

// See legal notice in Copying.txt for more information

// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; version 2 of the License.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
// http://www.gnu.org/copyleft/gpl.html .

#include "MVInter.h"
#include "Padding.h"
//#include "Interpolation.h"

#include "math.h"

// Constructs the interpolation filter.
//   _child     - source clip handed to GenericVideoFilter
//   mvbw, mvfw - backward / forward motion vector clips (mvbw also initialises MVFilter)
//   _time256   - interpolation position, used as a weight out of 256 in ResultBlock
//   _mode      - 0 is the normal output, other values select debug outputs in ResultBlock
//   _thres     - occluded-pixel count per block needed to flag the block; 0 selects nBlkSize*nBlkSize/4
//   _nIdx      - index of the frame storage requested from mvCore
//   nSCD1/2    - forwarded to both vector clip objects (presumably scene change thresholds - TODO confirm)
//   _mmx       - stored only; _isse selects the block copy routines and is forwarded to AddFrames
MVInter::MVInter(PClip _child, PClip mvbw, PClip mvfw, int _time256, int _mode, int _thres, int _nIdx,
                           int nSCD1, int nSCD2, bool _mmx, bool _isse, IScriptEnvironment* env) :
GenericVideoFilter(_child),
MVFilter(mvbw, "MVInter", env),
mvClipB(mvbw, nSCD1, nSCD2, env),
mvClipF(mvfw, nSCD1, nSCD2, env)
{
	time256 = _time256;
	mode = _mode;
	nIdx = _nIdx;
	mmx = _mmx;
	isse = _isse;

	// threshold for count of occlusions per block (0 means "use the default")
	thres = (_thres != 0) ? _thres : nBlkSize*nBlkSize/4;

	// register frame storage once per vector clip
	mvCore->AddFrames(nIdx, MV_BUFFER_FRAMES, mvClipB.GetLevelCount(), nWidth, nHeight,
	                  nPel, nBlkSize, nBlkSize, YUVPLANES, isse);
	mvCore->AddFrames(nIdx, MV_BUFFER_FRAMES, mvClipF.GetLevelCount(), nWidth, nHeight,
	                  nPel, nBlkSize, nBlkSize, YUVPLANES, isse);

	// pick the block copy routines: luma copies nBlkSize, chroma copies half of it;
	// any block size other than 16 or 4 is treated as 8
	switch (nBlkSize)
	{
	case 16:
		if (isse)
		{
			BLITLUMA = Copy16_mmx;
			BLITCHROMA = Copy8_mmx;
		}
		else
		{
			BLITLUMA = Copy_C<16>;
			BLITCHROMA = Copy_C<8>;
		}
		break;
	case 4:
		if (isse)
		{
			BLITLUMA = Copy4_mmx;
			BLITCHROMA = Copy2_mmx;
		}
		else
		{
			BLITLUMA = Copy_C<4>;
			BLITCHROMA = Copy_C<2>;
		}
		break;
	case 8:
	default:
		if (isse)
		{
			BLITLUMA = Copy8_mmx;
			BLITCHROMA = Copy4_mmx;
		}
		else
		{
			BLITLUMA = Copy_C<8>;
			BLITCHROMA = Copy_C<4>;
		}
		break;
	}

	const int lumaSize = nHeight*nWidth;   // one byte per luma pixel
	const int chromaSize = lumaSize/4;     // quarter-size chroma planes
	const int blockCount = nBlkX * nBlkY;  // one byte per block

	// backward / forward full-frame masks
	maskBY = new BYTE [lumaSize];
	maskBU = new BYTE [chromaSize];
	maskBV = new BYTE [chromaSize];
	maskFY = new BYTE [lumaSize];
	maskFU = new BYTE [chromaSize];
	maskFV = new BYTE [chromaSize];

	// combined occlusion masks
	OccY = new BYTE [lumaSize];
	OccU = new BYTE [chromaSize];
	OccV = new BYTE [chromaSize];

	// per-block masks
	smallMaskF = new BYTE [blockCount];
	smallMaskB = new BYTE [blockCount];
	smallMaskO = new BYTE [blockCount];

	// resizers that expand a per-block mask to luma and chroma plane size
	// (an earlier Upsizer/BilinearResampling based path was replaced by SimpleResize)
	upsizer = new SimpleResize(nWidth, nHeight, nBlkX, nBlkY, env->GetCPUFlags());
	upsizerUV = new SimpleResize(nWidth/2, nHeight/2, nBlkX, nBlkY, env->GetCPUFlags());
}

// Releases the resizers and every mask buffer allocated by the constructor.
MVInter::~MVInter()
{
	// single objects created with plain new
	delete upsizer;
	delete upsizerUV;

	// all mask buffers were allocated with new BYTE[...], so they must be
	// released with the array form of delete
	delete[] maskBY;
	delete[] maskBU;
	delete[] maskBV;
	delete[] maskFY;
	delete[] maskFU;
	delete[] maskFV;
	delete[] OccY;
	delete[] OccU;
	delete[] OccV;
	delete[] smallMaskF;
	delete[] smallMaskB;
	delete[] smallMaskO;
}


// Builds a binary per-block mask from a full-size image.
//   image      - top-left pixel of the image; a pixel value of 0 marks an occluded pixel
//   imagePitch - distance in bytes between image rows
//   smallmask  - output, nBlkX*nBlkY bytes, row-major; receives 255 or 0 per block
//   nBlkX/Y    - number of blocks horizontally / vertically
//   nBlkSize   - block width and height in pixels
//   threshold  - a block is flagged (255) when it holds at least this many zero pixels
void MVInter::MakeSmallMask(BYTE *image, int imagePitch, BYTE *smallmask, int nBlkX, int nBlkY, int nBlkSize, int threshold)
{
	// it can be MMX

	for (int by = 0; by < nBlkY; by++)
	{
		const BYTE *blockRow = image + by * nBlkSize * imagePitch; // first pixel row of this block row
		BYTE *maskRow = smallmask + by * nBlkX;

		for (int bx = 0; bx < nBlkX; bx++)
		{
			const BYTE *pix = blockRow + bx * nBlkSize;

			// Count in an int: a 16x16 block holds 256 pixels, which does not fit
			// into a BYTE (a fully occluded block would wrap around to 0).
			int occluded = 0;
			for (int j = 0; j < nBlkSize; j++)
			{
				for (int i = 0; i < nBlkSize; i++)
				{
					if (pix[i] == 0) // 0 is mark of occlusion
						occluded++;
				}
				pix += imagePitch;
			}

			// make small binary mask
			maskRow[bx] = (occluded >= threshold) ? 255 : 0;
		}
	}
}

// Spreads every fully set (255) interior cell of a per-block mask to its eight
// neighbours, in place: horizontal and vertical neighbours are set to 192,
// diagonal neighbours to 144.
//   smallmask - nBlkX*nBlkY bytes, row-major
//   nBlkX/Y   - mask width / height in blocks
// Only cells in rows 1..nBlkY-2 and columns 1..nBlkX-2 are examined, so every
// neighbour index stays inside the buffer.
// NOTE(review): the scan works in place and overwrites neighbours
// unconditionally, so a 255 neighbour that has not been visited yet is lowered
// and will not be spread itself - confirm this is the intended result.
void MVInter::InflateMask(BYTE *smallmask, int nBlkX, int nBlkY)
{
	// Start of row 1. The column offset is supplied by nx alone; adding another
	// +1 here would shift the scan by one column and let the last row's
	// bottom-right neighbour write land one byte past the end of the buffer.
	BYTE *psmallmask = smallmask + nBlkX;

	for (int ny = 1; ny < nBlkY-1; ny++) // skip edges
	{
		for (int nx = 1; nx < nBlkX-1; nx++) // skip edges
		{
			if (psmallmask[nx] == 255)
			{
				// same row
				psmallmask[nx-1] = 192;
				psmallmask[nx+1] = 192;
				// row above
				psmallmask[nx-nBlkX-1] = 144;
				psmallmask[nx-nBlkX] = 192;
				psmallmask[nx-nBlkX+1] = 144;
				// row below
				psmallmask[nx+nBlkX-1] = 144;
				psmallmask[nx+nBlkX] = 192;
				psmallmask[nx+nBlkX+1] = 144;
			}
		}
		psmallmask += nBlkX;
	}
}

// Combines two per-block masks cell by cell: O = F * B / 255 (integer division).
// All three buffers hold nBlkX*nBlkY bytes in row-major order.
void MVInter::MultMasks(BYTE *smallmaskF, BYTE *smallmaskB, BYTE *smallmaskO,  int nBlkX, int nBlkY)
{
	for (int row = 0; row < nBlkY; ++row)
	{
		const int rowStart = row * nBlkX;
		for (int col = 0; col < nBlkX; ++col)
		{
			const int cell = rowStart + col;
			const int product = smallmaskF[cell] * smallmaskB[cell];
			smallmaskO[cell] = product / 255;
		}
	}
}

/*
void MVInter::Reorganize(unsigned char *d, int dp, BYTE **destinations, int nBlkX, int nBlkY, int nBlkSize)
{
	// borrow code from MVMask
	int bhdp = nBlkSize * dp;
	for ( int i = 0; i < nBlkSize; i++ )
	{
		int jdp = 0;
		for ( int j = 0; j < nBlkSize; j++ )
		{
			const unsigned char *dest = destinations[i + j * nBlkSize];
			int kbhdp = 0;
			int krw = 0;
			for ( int k = 0; k < nBlkY; k++ )
			{
				int mbw = 0;
				for ( int m = 0; m < nBlkX; m++ )
				{
					d[mbw + i + kbhdp + jdp] = dest[krw + m];
					mbw += nBlkSize;
				}
				kbhdp += bhdp;
				krw += nBlkX;
			}
			jdp += dp;
		}
	}
}
*/

// Returns the median of three byte values: c clamped to the range spanned by a and b.
inline BYTE MEDIAN(BYTE a, BYTE b, BYTE c)
{
	BYTE lo = a;
	BYTE hi = b;
	if (b < a)
	{
		lo = b;
		hi = a;
	}

	if (c < lo)
		return lo;
	if (c > hi)
		return hi;
	return c;
}

// Writes one blksize x blksize block of the output plane.
//   pDst          - destination block (row distance dst_pitch)
//   pMCB, pMCF    - pixels fetched from the backward / forward compensated planes
//   pRef, pSrc    - co-located pixels of the reference and the source frame
//   maskB, maskF  - backward / forward masks, used as blend weights out of 255
//   pOcc          - combined occlusion mask, used as blend weight out of 255
//   time256       - temporal weight out of 256
//   mode          - 0: normal output; 1..16: debug outputs (see the cases below);
//                   any other value leaves pDst untouched
// Every pointer is advanced by its own pitch after each row.
void MVInter::ResultBlock(BYTE *pDst, int dst_pitch, const BYTE * pMCB, int MCB_pitch, const BYTE * pMCF, int MCF_pitch,
	const BYTE * pRef, int ref_pitch, const BYTE * pSrc, int src_pitch, BYTE *maskB, int maskB_pitch, BYTE *maskF, int maskF_pitch,
	BYTE *pOcc, int Occ_pitch, int blksize, int time256, int mode)
{
	const int invTime = 256 - time256; // weight of the "other" frame

	for (int row = 0; row < blksize; ++row)
	{
		if (mode == 0) // default, best working mode - kept free of per-pixel dispatch
		{
			for (int x = 0; x < blksize; ++x)
			{
				const int fwd   = (maskF[x]*pMCB[x] + (255-maskF[x])*pMCF[x] + 255) >> 8;
				const int bwd   = (maskB[x]*pMCF[x] + (255-maskB[x])*pMCB[x] + 255) >> 8;
				const int plain = (pRef[x]*time256 + pSrc[x]*invTime + 255) >> 8; // simple temporal non-MC average
				const int mc    = (bwd*time256 + fwd*invTime) >> 8;
				pDst[x] = (plain * pOcc[x] + mc * (255 - pOcc[x]) + 255) >> 8;
			}
		}
		else // other (debug) modes
		{
			for (int x = 0; x < blksize; ++x)
			{
				switch (mode)
				{
				case 1: // forward shift
				case 3: // forward shift occlusion mask
					pDst[x] = maskF[x];
					break;

				case 2: // backward shift
				case 4: // backward shift occlusion mask
					pDst[x] = maskB[x];
					break;

				case 5: // fetch forward
					pDst[x] = pMCF[x];
					break;

				case 6: // fetch backward
					pDst[x] = pMCB[x];
					break;

				case 7: // MC fetched average
					pDst[x] = (pMCB[x]*time256 + pMCF[x]*invTime) >> 8;
					break;

				case 8: // static median
				{
					const int fetchedAvg = (pMCB[x]*time256 + pMCF[x]*invTime) >> 8;
					pDst[x] = MEDIAN(pRef[x], pSrc[x], fetchedAvg);
					break;
				}

				case 9: // dynamic median
				{
					const int plainAvg = (pRef[x]*time256 + pSrc[x]*invTime) >> 8;
					pDst[x] = MEDIAN(plainAvg, pMCB[x], pMCF[x]);
					break;
				}

				case 10:
					if (maskB[x] != 0 && maskF[x] != 0)
						pDst[x] = MEDIAN(maskB[x], maskF[x], pMCF[x]);
					else if (maskF[x] != 0)
						pDst[x] = maskB[x];
					else
						pDst[x] = pMCF[x];
					break;

				case 11: // forward-mask blend only
					pDst[x] = (maskF[x]*pMCB[x] + (255-maskF[x])*pMCF[x] + 255) >> 8;
					break;

				case 12: // backward-mask blend only
					pDst[x] = (maskB[x]*pMCF[x] + (255-maskB[x])*pMCB[x] + 255) >> 8;
					break;

				case 13: // time-weighted mix of both blends, no occlusion mask
				{
					const int bwd = (maskB[x]*pMCF[x] + (255-maskB[x])*pMCB[x] + 255) >> 8;
					const int fwd = (maskF[x]*pMCB[x] + (255-maskF[x])*pMCF[x] + 255) >> 8;
					pDst[x] = (bwd*time256 + fwd*invTime) >> 8;
					break;
				}

				case 14: // show the combined occlusion mask
					pDst[x] = pOcc[x];
					break;

				case 15: // like mode 0, but the plain average is computed without the +255 rounding term
				{
					const int fwd   = (maskF[x]*pMCB[x] + (255-maskF[x])*pMCF[x] + 255) >> 8;
					const int bwd   = (maskB[x]*pMCF[x] + (255-maskB[x])*pMCB[x] + 255) >> 8;
					const int plain = (pRef[x]*time256 + pSrc[x]*invTime) >> 8;
					const int mc    = (bwd*time256 + fwd*invTime) >> 8;
					pDst[x] = (plain * pOcc[x] + mc * (255 - pOcc[x]) + 255) >> 8;
					break;
				}

				case 16: // simple temporal non-MC average
					pDst[x] = (pRef[x]*time256 + pSrc[x]*invTime) >> 8;
					break;

				default: // unknown mode: nothing is written
					break;
				}
			}
		}

		// step every plane to its next row
		pDst  += dst_pitch;
		pMCB  += MCB_pitch;
		pMCF  += MCF_pitch;
		pRef  += ref_pitch;
		pSrc  += src_pitch;
		maskB += maskB_pitch;
		maskF += maskF_pitch;
		pOcc  += Occ_pitch;
	}
}


// Produces output frame n.
//
// The backward vector clip is updated for frame n and the forward vector clip
// for frame n+off, where off is the delta reported by the backward clip.
// If either clip is not usable, the source frame is returned unchanged.
// Otherwise a new frame is built in these steps:
//   1. the planes of the source frame (n) and the reference frame (n+off) are
//      attached to the frame groups obtained from mvCore, then padded and refined;
//   2. luma blocks of the source / reference frame are copied into zeroed
//      full-frame buffers at positions shifted by the scaled motion vectors;
//      pixels no block lands on stay 0;
//   3. unless mode is 1 or 2, those buffers are reduced to per-block masks,
//      inflated, resized back to plane size and combined into the Occ masks;
//   4. ResultBlock is called per block for Y, U and V to write the output.
// Chroma is handled with halved coordinates, block size and plane dimensions
// (presumably YV12 - TODO confirm).
PVideoFrame __stdcall MVInter::GetFrame(int n, IScriptEnvironment* env)
{
	PVideoFrame	src	= child->GetFrame(n, env);
   PVideoFrame dst;
   BYTE *pDst[3];
	const BYTE *pRef[3], *pSrc[3];
    int pPitches[3], pRefPitches[3], pSrcPitches[3];

   mvClipB.Update(n, env);// backward from next to current

   int off = mvClipB.GetDeltaFrame(); // integer offset of reference frame
   
   mvClipF.Update(n+off, env);// forward from current to next

   if ( mvClipB.IsUsable() && mvClipF.IsUsable() )
   {
		PVideoFrame ref = child->GetFrame(n + off, env);//  ref for backward compensation
		dst = env->NewVideoFrame(vi);

		MVFrames *pFrames = mvCore->GetFrames(nIdx);
         MVGroupOfFrames *pRefGOFF = pFrames->GetFrame(n); // forward ref
         MVGroupOfFrames *pRefGOFB = pFrames->GetFrame(n + off); // backward ref

         PROFILE_START(MOTION_PROFILE_INTERPOLATION);

         // attach the source frame planes to the forward group, then pad and refine
         pRefGOFF->SetPlane(YRPLAN(src), YPITCH(src), YPLANE);
         pRefGOFF->SetPlane(URPLAN(src), UPITCH(src), UPLANE);
         pRefGOFF->SetPlane(VRPLAN(src), VPITCH(src), VPLANE);
         pRefGOFF->Pad(YUVPLANES);
         pRefGOFF->Refine(YUVPLANES);

         // same for the reference frame and the backward group
         pRefGOFB->SetPlane(YRPLAN(ref), YPITCH(ref), YPLANE);
         pRefGOFB->SetPlane(URPLAN(ref), UPITCH(ref), UPLANE);
         pRefGOFB->SetPlane(VRPLAN(ref), VPITCH(ref), VPLANE);
         pRefGOFB->Pad(YUVPLANES);
         pRefGOFB->Refine(YUVPLANES);

         PROFILE_STOP(MOTION_PROFILE_INTERPOLATION);

         MVPlane *pPlanesB[3];
         MVPlane *pPlanesF[3];


         // destination planes and pitches, index 0/1/2 = Y/U/V
         pDst[0] = YWPLAN(dst);
         pDst[1] = UWPLAN(dst);
         pDst[2] = VWPLAN(dst);
         pPitches[0] = YPITCH(dst);
         pPitches[1] = UPITCH(dst);
         pPitches[2] = VPITCH(dst);
         // planes of frame 0 of each group, used below to fetch compensated blocks
         pPlanesB[0] = pRefGOFB->GetFrame(0)->GetPlane(YPLANE);
         pPlanesB[1] = pRefGOFB->GetFrame(0)->GetPlane(UPLANE);
         pPlanesB[2] = pRefGOFB->GetFrame(0)->GetPlane(VPLANE);

         pPlanesF[0] = pRefGOFF->GetFrame(0)->GetPlane(YPLANE);
         pPlanesF[1] = pRefGOFF->GetFrame(0)->GetPlane(UPLANE);
         pPlanesF[2] = pRefGOFF->GetFrame(0)->GetPlane(VPLANE);

         // read pointers and pitches of the reference frame
         pRef[0] = YRPLAN(ref);
         pRef[1] = URPLAN(ref);
         pRef[2] = VRPLAN(ref);
         pRefPitches[0] = YPITCH(ref);
         pRefPitches[1] = UPITCH(ref);
         pRefPitches[2] = VPITCH(ref);

         // read pointers and pitches of the source frame
         pSrc[0] = YRPLAN(src);
         pSrc[1] = URPLAN(src);
         pSrc[2] = VRPLAN(src);
         pSrcPitches[0] = YPITCH(src);
         pSrcPitches[1] = UPITCH(src);
         pSrcPitches[2] = VPITCH(src);


		// only the luma mask buffers are cleared here
		MemZoneSet(maskBY, 0, nWidth, nHeight, 0, 0, nWidth); // put zeros
		MemZoneSet(maskFY, 0, nWidth, nHeight, 0, 0, nWidth); 


         PROFILE_START(MOTION_PROFILE_COMPENSATION);
		 int blocks = mvClipB.GetBlkCount();

		 // largest start offset at which a block copy of nBlkSize rows still ends inside the mask buffer
		 int maxoffset = nWidth*(nHeight-nBlkSize)-nBlkSize;

		// make forward shifted images by projection to build occlusion mask
         for ( int i = 0; i < blocks; i++ )
         {
            const FakeBlockData &blockF = mvClipF.GetBlock(0, i);
			// destination offset in maskFY: block position minus the vector scaled by time256/256, in whole pixels
			int offset = blockF.GetX() - ((blockF.GetMV().x*(time256))>>8)/nPel + (blockF.GetY() - ((blockF.GetMV().y*(time256))>>8)/nPel)*nWidth;
			// source offset of the unshifted block in the source luma plane
			int offset0 = blockF.GetX() + blockF.GetY()*pSrcPitches[0];
			// NOTE(review): only the linear offset is range-checked; the horizontal
			// position is not checked on its own - confirm row wrap-around is acceptable
			if (offset>= 0 && offset < maxoffset)
				BLITLUMA(maskFY + offset, nWidth, pSrc[0]+offset0, pSrcPitches[0]);//, nBlkSize, nBlkSize
		 }

		 int dummyplane = PLANAR_Y; // always use it

		 // NOTE(review): for mode 1 and 2 the mask-building sections are skipped, so
		 // maskFU/maskFV/maskBU/maskBV and the Occ buffers are not written in this call
		 // although ResultBlock still receives them - confirm this is acceptable for these debug modes
		 if (mode != 1  && mode != 2) {
			 // make small binary mask from  occlusion  regions
			 MakeSmallMask(maskFY, nWidth, smallMaskF, nBlkX, nBlkY, nBlkSize, thres);
			 InflateMask(smallMaskF, nBlkX, nBlkY);
			 // upsize small mask to full frame size
//			upsizer->Resize(smallMaskF, destinations, nBlkX, nBlkX, true);
//			Reorganize(maskFY, nWidth, destinations, nBlkX, nBlkY, nBlkSize);
			  upsizer->SimpleResizeDo(maskFY, nWidth, nHeight, nWidth, smallMaskF, nBlkX, nBlkX, dummyplane);
//			upsizerUV->Resize(smallMaskF, destinationsUV, nBlkX, nBlkX, true);
//			Reorganize(maskFU, nWidth/2, destinationsUV, nBlkX, nBlkY, nBlkSize/2);
//			Reorganize(maskFV, nWidth/2, destinationsUV, nBlkX, nBlkY, nBlkSize/2);
			  upsizerUV->SimpleResizeDo(maskFU, nWidth/2, nHeight/2, nWidth/2, smallMaskF, nBlkX, nBlkX, dummyplane);
			  upsizerUV->SimpleResizeDo(maskFV, nWidth/2, nHeight/2, nWidth/2, smallMaskF, nBlkX, nBlkX, dummyplane);
			// now we have forward fullframe blurred occlusion mask in maskF arrays
		 }


		//  same mask for backward
		for ( int i = 0; i < blocks; i++ )
         {
            const FakeBlockData &blockB = mvClipB.GetBlock(0, i);
			// backward vectors are scaled by the complementary weight (256-time256)/256
			int offset = blockB.GetX() - ((blockB.GetMV().x*(256-time256))>>8)/nPel + (blockB.GetY() - ((blockB.GetMV().y*(256-time256))>>8)/nPel)*nWidth;
			int offset0 = blockB.GetX() + blockB.GetY()*pRefPitches[0];
			if (offset>= 0 && offset < maxoffset)
				BLITLUMA(maskBY + offset, nWidth, pRef[0]+offset0, pRefPitches[0]);//, nBlkSize, nBlkSize
		 }
		if (mode != 1 && mode != 2) {
			 // make small binary mask from  occlusion  regions
			 MakeSmallMask(maskBY, nWidth, smallMaskB, nBlkX, nBlkY, nBlkSize, thres);
			 InflateMask(smallMaskB, nBlkX, nBlkY);
			 // upsize small mask to full frame size
//			upsizer->Resize(smallMaskB, destinations, nBlkX, nBlkX, true);
//			Reorganize(maskBY, nWidth, destinations, nBlkX, nBlkY, nBlkSize);
			  upsizer->SimpleResizeDo(maskBY, nWidth, nHeight, nWidth, smallMaskB, nBlkX, nBlkX, dummyplane);
//			upsizerUV->Resize(smallMaskB, destinationsUV, nBlkX, nBlkX, true);
//			Reorganize(maskBU, nWidth/2, destinationsUV, nBlkX, nBlkY, nBlkSize/2);
//			Reorganize(maskBV, nWidth/2, destinationsUV, nBlkX, nBlkY, nBlkSize/2);
			  upsizerUV->SimpleResizeDo(maskBU, nWidth/2, nHeight/2, nWidth/2, smallMaskB, nBlkX, nBlkX, dummyplane);
			  upsizerUV->SimpleResizeDo(maskBV, nWidth/2, nHeight/2, nWidth/2, smallMaskB, nBlkX, nBlkX, dummyplane);

			// make final (both directions) occlusion mask
			MultMasks(smallMaskF, smallMaskB, smallMaskO,  nBlkX, nBlkY);
			 InflateMask(smallMaskO, nBlkX, nBlkY);
			 // upsize small mask to full frame size
//			upsizer->Resize(smallMaskO, destinations, nBlkX, nBlkX, true);
//			Reorganize(OccY, nWidth, destinations, nBlkX, nBlkY, nBlkSize);
			  upsizer->SimpleResizeDo(OccY, nWidth, nHeight, nWidth, smallMaskO, nBlkX, nBlkX, dummyplane);
//			upsizerUV->Resize(smallMaskO, destinationsUV, nBlkX, nBlkX, true);
//			Reorganize(OccU, nWidth/2, destinationsUV, nBlkX, nBlkY, nBlkSize/2);
//			Reorganize(OccV, nWidth/2, destinationsUV, nBlkX, nBlkY, nBlkSize/2);
			  upsizerUV->SimpleResizeDo(OccU, nWidth/2, nHeight/2, nWidth/2, smallMaskO, nBlkX, nBlkX, dummyplane);
			  upsizerUV->SimpleResizeDo(OccV, nWidth/2, nHeight/2, nWidth/2, smallMaskO, nBlkX, nBlkX, dummyplane);
		}

		// pointers
		// running cursors into the mask buffers, advanced block by block below
		 BYTE * pmaskBY = maskBY;
		 BYTE * pmaskFY = maskFY;
		 BYTE * pmaskBU = maskBU;
		 BYTE * pmaskFU = maskFU;
		 BYTE * pmaskBV = maskBV;
		 BYTE * pmaskFV = maskFV;
		 BYTE * pOccY = OccY;
		 BYTE * pOccU = OccU;
		 BYTE * pOccV = OccV;

		 // fetch image blocks
         for ( int i = 0; i < blocks; i++ )
         {
            const FakeBlockData &blockB = mvClipB.GetBlock(0, i);
            const FakeBlockData &blockF = mvClipF.GetBlock(0, i);

			// For each plane the compensated pixels are fetched at block position * nPel
			// plus the vector scaled by (256-time256)/256 (backward) or time256/256 (forward).
			// luma
            ResultBlock(pDst[0], pPitches[0],
               pPlanesB[0]->GetPointer(blockB.GetX() * nPel + ((blockB.GetMV().x*(256-time256))>>8), blockB.GetY() * nPel + ((blockB.GetMV().y*(256-time256))>>8)), 
               pPlanesB[0]->GetPitch(),
               pPlanesF[0]->GetPointer(blockF.GetX() * nPel + ((blockF.GetMV().x*time256)>>8), blockF.GetY() * nPel + ((blockF.GetMV().y*time256)>>8)), 
               pPlanesF[0]->GetPitch(),
			   pRef[0], pRefPitches[0],
			   pSrc[0], pSrcPitches[0],
			   pmaskBY, nWidth,
			   pmaskFY, nWidth,
			   pOccY, nWidth,
			   nBlkSize, time256, mode);
			// chroma u (coordinates and block size halved)
            ResultBlock(pDst[1], pPitches[1],
               pPlanesB[1]->GetPointer((blockB.GetX() * nPel + ((blockB.GetMV().x*(256-time256))>>8))>>1, (blockB.GetY() * nPel + ((blockB.GetMV().y*(256-time256))>>8))>>1), 
               pPlanesB[1]->GetPitch(),
               pPlanesF[1]->GetPointer((blockF.GetX() * nPel + ((blockF.GetMV().x*time256)>>8))>>1, (blockF.GetY() * nPel + ((blockF.GetMV().y*time256)>>8))>>1), 
               pPlanesF[1]->GetPitch(),
			   pRef[1], pRefPitches[1],
			   pSrc[1], pSrcPitches[1],
			   pmaskBU, nWidth/2,
			   pmaskFU, nWidth/2,
			   pOccU, nWidth/2,
			   (nBlkSize)>>1, time256, mode);
			// chroma v (coordinates and block size halved)
            ResultBlock(pDst[2], pPitches[2],
               pPlanesB[2]->GetPointer((blockB.GetX() * nPel + ((blockB.GetMV().x*(256-time256))>>8))>>1, (blockB.GetY() * nPel + ((blockB.GetMV().y*(256-time256))>>8))>>1), 
               pPlanesB[2]->GetPitch(),
               pPlanesF[2]->GetPointer((blockF.GetX() * nPel + ((blockF.GetMV().x*time256)>>8))>>1, (blockF.GetY() * nPel + ((blockF.GetMV().y*time256)>>8))>>1), 
               pPlanesF[2]->GetPitch(),
			   pRef[2], pRefPitches[2],
			   pSrc[2], pSrcPitches[2],
			   pmaskBV, nWidth/2,
			   pmaskFV, nWidth/2,
			   pOccV, nWidth/2,
			   (nBlkSize)>>1, time256, mode);


            // update pDsts
            // step every cursor one block to the right
            pDst[0] += nBlkSize;
            pDst[1] += nBlkSize >> 1;
            pDst[2] += nBlkSize >> 1;
            pRef[0] += nBlkSize;
            pRef[1] += nBlkSize >> 1;
            pRef[2] += nBlkSize >> 1;
            pSrc[0] += nBlkSize;
            pSrc[1] += nBlkSize >> 1;
            pSrc[2] += nBlkSize >> 1;
			pmaskBY += nBlkSize;
			pmaskBU += nBlkSize>>1;
			pmaskBV += nBlkSize>>1;
			pmaskFY += nBlkSize;
			pmaskFU += nBlkSize>>1;
			pmaskFV += nBlkSize>>1;
			pOccY += nBlkSize;
			pOccU += nBlkSize>>1;
			pOccV += nBlkSize>>1;


            // after the last block of a block row, move every cursor to the start of the next block row
            // (the arithmetic assumes nBlkX*nBlkSize equals nWidth - TODO confirm)
            if ( !((i + 1) % nBlkX)  )
            {
               pDst[0] += nBlkSize * pPitches[0] - nWidth;
               pDst[1] += ( nBlkSize >>1 ) * pPitches[1] - (nWidth >>1);
               pDst[2] += ( nBlkSize >>1 ) * pPitches[2] - (nWidth >>1);
               pRef[0] += nBlkSize * pRefPitches[0] - nWidth;
               pRef[1] += ( nBlkSize >>1 ) * pRefPitches[1] - (nWidth>>1);
               pRef[2] += ( nBlkSize >>1 ) * pRefPitches[2] - (nWidth>>1);
               pSrc[0] += nBlkSize * pSrcPitches[0] - nWidth;
               pSrc[1] += ( nBlkSize >>1 ) * pSrcPitches[1] - (nWidth>>1);
               pSrc[2] += ( nBlkSize >>1 ) * pSrcPitches[2] - (nWidth>>1);
               // mask buffers have a pitch equal to their width, so only the remaining rows are skipped
               pmaskBY += (nBlkSize-1) * nWidth;
               pmaskBU += ((nBlkSize>>1) - 1) * (nWidth>>1);
               pmaskBV += ((nBlkSize>>1) - 1) * (nWidth>>1);
               pmaskFY += (nBlkSize-1) * nWidth;
               pmaskFU += ((nBlkSize>>1) - 1) * (nWidth>>1);
               pmaskFV += ((nBlkSize>>1) - 1) * (nWidth>>1);
               pOccY += (nBlkSize-1) * nWidth;
               pOccU += ((nBlkSize>>1) - 1) * (nWidth>>1);
               pOccV += ((nBlkSize>>1) - 1) * (nWidth>>1);
            }
         }
         // empty the MMX state; the copy routines selected in the constructor may be MMX versions
         _asm emms;
         PROFILE_STOP(MOTION_PROFILE_COMPENSATION);

		return dst;		 
   }
   else 
   {
	   // vectors not usable for this frame: pass the source frame through
	   return src;
   }

}