// Make a motion compensate temporal denoiser

// See legal notice in Copying.txt for more information

// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
// http://www.gnu.org/copyleft/gpl.html .

#include "EncDenoise.h"
#include "CopyCode.h"
#include "Padding.h"

// Builds the filter.
//   _child  : clip to denoise.
//   vectors : motion vector clip; its VideoInfo is forwarded to GenericMotionFilter.
//   sc      : stored in scBehavior; selects what GetFrame outputs when the
//             vectors are not usable.
//   q       : quantizer, clamped to [0, 51].
//   aOff    : offset added to the quantizer to index the alphas / cs tables.
//   bOff    : offset added to the quantizer to index the betas table.
//   lv      : stored in level (not read anywhere in this file).
//   dz      : deadzone; QDQBlock divides by it, so it must not be zero.
// Throws (through env->ThrowError) when the block size is not 8 or dz is 0.
EncDenoise::EncDenoise(PClip _child, PClip vectors, bool sc, int q, int aOff, int bOff, int lv, int dz,
               const MotionParameters &params, IScriptEnvironment* env) :
GenericMotionFilter(_child, params, vectors->GetVideoInfo(), env), mvclip(vectors)
{
	CreateFGOP(&fgop);

   if ( nBlkSize != 8 )
      env->ThrowError("EncDenoise : error, blksize in MVAnalyse must be 8 in order to use this filter");

   // The deadzone is used as a divisor for every processed block; refuse a
   // zero value here instead of faulting on the first usable frame.
   if ( dz == 0 )
      env->ThrowError("EncDenoise : error, deadzone must not be 0");

   scBehavior = sc;

   // One padded luma plane; two of them are kept (reference and current).
   frameSize = (nWidth + 2 * nHPadding) * (nHeight + 2 * nVPadding);

   // Value-initialised (zero-filled) so that any pixel ProcessFrame does not
   // write is deterministic when the deblocker and the final copy read it.
   // NOTE(review): whether the 8x8 blocks always cover the whole frame is not
   // visible from this file - confirm against the vector clip layout.
   buffer = new unsigned char[frameSize * 2]();
   bufferPos = 0;

   aOffset = aOff;
   bOffset = bOff;
   quant = sat(q, 0, 51);
   level = lv;
   deadzone = dz;

   // Last frame number served; -5 means "none yet".
   lastf = -5;
}

// Releases the double frame buffer allocated by the constructor.
// Nothing else is freed here.
EncDenoise::~EncDenoise()
{
    delete[] buffer;
}

// Writes an 8x8 block where each output pixel is the rounded average of a
// source pixel and its right-hand neighbour.
// Reads 9 columns by 8 rows starting at pSrc.
void EncDenoise::InterpolateHorHpel(unsigned char *pDst, int nDstPitch,
                                    const unsigned char *pSrc, int nSrcPitch)
{
    const unsigned char *srcRow = pSrc;
    unsigned char *dstRow = pDst;

    for ( int row = 0; row < 8; ++row )
    {
        for ( int col = 0; col < 8; ++col )
        {
            const int left = srcRow[col];
            const int right = srcRow[col + 1];
            dstRow[col] = (left + right + 1) >> 1;
        }
        srcRow += nSrcPitch;
        dstRow += nDstPitch;
    }
}

// Writes an 8x8 block where each output pixel is the rounded average of a
// source pixel and the pixel directly below it.
// Reads 8 columns by 9 rows starting at pSrc.
void EncDenoise::InterpolateVerHpel(unsigned char *pDst, int nDstPitch,
                                    const unsigned char *pSrc, int nSrcPitch)
{
    const unsigned char *upperRow = pSrc;
    unsigned char *dstRow = pDst;

    for ( int row = 0; row < 8; ++row )
    {
        const unsigned char *lowerRow = upperRow + nSrcPitch;
        for ( int col = 0; col < 8; ++col )
        {
            const int above = upperRow[col];
            const int below = lowerRow[col];
            dstRow[col] = (above + below + 1) >> 1;
        }
        upperRow = lowerRow;
        dstRow += nDstPitch;
    }
}

// Writes an 8x8 block where each output pixel is the rounded average of the
// 2x2 source neighbourhood whose top-left corner is the matching source pixel.
// Reads 9 columns by 9 rows starting at pSrc.
void EncDenoise::InterpolateDiagHpel(unsigned char *pDst, int nDstPitch,
                                     const unsigned char *pSrc, int nSrcPitch)
{
    const unsigned char *upperRow = pSrc;
    unsigned char *dstRow = pDst;

    for ( int row = 0; row < 8; ++row )
    {
        const unsigned char *lowerRow = upperRow + nSrcPitch;
        for ( int col = 0; col < 8; ++col )
        {
            const int sum = upperRow[col] + upperRow[col + 1]
                          + lowerRow[col] + lowerRow[col + 1];
            dstRow[col] = (sum + 2) >> 2;
        }
        upperRow = lowerRow;
        dstRow += nDstPitch;
    }
}

// Copies an 8x8 block of bytes (8 rows of one 8-byte movq each) from pSrc to
// pDst, using the given pitches to step between rows.
//
// The copy is written as MSVC-style inline assembly using MMX registers
// mm0..mm3. Because an address operand can only scale the pitch by 1, 2 or 4,
// the base pointers are advanced twice (to row 2, then to row 3) so that rows
// 3, 5, 6 and 7 can be reached; the trailing comments give the row index of
// every load ("->") and store ("<-").
//
// NOTE(review): no emms is issued in this function, so the MMX state is left
// as is on return - confirm the callers deal with it before any x87 code runs.
void EncDenoise::MoveBlock(unsigned char *pDst, int dstPitch,
                           const unsigned char *pSrc, int srcPitch)
{
    __asm {
        mov eax, [pDst]
        mov ebx, [pSrc]
        mov ecx, [dstPitch]
        mov edx, [srcPitch]

        movq mm0, [ebx]             // 0 ->
        movq mm1, [ebx + edx]       // 1 ->
        movq mm2, [ebx + 2 * edx]   // 2 ->
        movq [eax], mm0             // 0 <-
        movq mm3, [ebx + 4 * edx]   // 4 ->
        movq [eax + ecx], mm1       // 1 <-
        movq [eax + 2 * ecx], mm2   // 2 <-
        movq [eax + 4 * ecx], mm3   // 4 <-

        lea eax, [eax + ecx * 2]    // 2
        lea ebx, [ebx + edx * 2]    // 2

        movq mm0, [ebx + edx]       // 2 + 1 ->
        movq mm1, [ebx + 4 * edx]   // 2 + 4 ->
        movq [eax + ecx], mm0       // 2 + 1 <-
        movq [eax + 4 * ecx], mm1   // 2 + 4 <-

        lea eax, [eax + ecx]        // 3
        lea ebx, [ebx + edx]        // 3

        movq mm0, [ebx + 2 * edx]   // 3 + 2 ->
        movq mm1, [ebx + 4 * edx]   // 3 + 4 ->
        movq [eax + 2 * ecx], mm0   // 3 + 2 <-
        movq [eax + 4 * ecx], mm1   // 3 + 4 <-
    }
}

// Deblocking threshold table with 52 entries (indices 0..51).
// DeblockHorEdge / DeblockVerEdge read alphas[ia] and only filter a pixel
// pair when the absolute step across the edge is below that value. The first
// 16 entries are 0, so no filtering can happen for those indices.
// NOTE(review): values look like the H.264 in-loop filter alpha table - confirm.
const int alphas[52] = {
    0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 4, 4,
    5, 6, 7, 8, 9, 10,
    12, 13, 15, 17, 20,
    22, 25, 28, 32, 36,
    40, 45, 50, 56, 63, 
    71, 80, 90, 101, 113,
    127, 144, 162, 182,
    203, 226, 255, 255
};

// Deblocking threshold table with 52 entries (indices 0..51).
// DeblockHorEdge / DeblockVerEdge read betas[ib] and compare it against the
// absolute differences between pixels on the same side of the edge.
// NOTE(review): values look like the H.264 in-loop filter beta table - confirm.
const int betas[52] = {
    0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 2, 2,
    2, 3, 3, 3, 3, 4,
    4, 4, 6, 6, 
    7, 7, 8, 8, 9, 9,
    10, 10, 11, 11, 12,
    12, 13, 13, 14, 14, 
    15, 15, 16, 16, 17, 
    17, 18, 18
};

// Clipping bound table with 52 entries (indices 0..51).
// DeblockHorEdge / DeblockVerEdge read cs[ia] as the base bound c0 that
// limits how far the deblocker may move a pixel value.
const int cs[52] = {
    0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 
    1, 2, 2, 2, 2, 3, 
    3, 3, 4, 4, 5, 5,
    6, 7, 8, 8, 10, 
    11, 12, 13, 15, 17
};



// Runs the edge filters over a w x h picture in place, on a 4x4 grid.
//   srcp     : top-left pixel of the picture.
//   srcPitch : distance in bytes between two rows.
//   w, h     : picture size in pixels; the grid is walked in steps of 4.
//   q        : quantizer.
//   aOff     : offset added to q to select the alphas / cs entry.
//   bOff     : offset added to q to select the betas entry.
// For every 4x4 cell the edge above it is filtered first (skipped on the top
// row of cells), then the edge on its left (skipped on the first column).
void EncDenoise::DeblockPicture(unsigned char *srcp, int srcPitch, int w, int h,
                                int q, int aOff, int bOff)
{
    // The table indices depend only on the arguments, so compute them once.
    // They are derived from the parameters rather than from the members, so
    // the function does what its signature promises.
    const int indexa = sat(q + aOff, 0, 51);
    const int indexb = sat(q + bOff, 0, 51);

    for ( int j = 0; j < h; j += 4 )
    {
        for ( int i = 0; i < w; i += 4 )
        {
            if ( j > 0 )
                DeblockHorEdge(srcp + i, srcPitch, indexa, indexb);
            if ( i > 0 )
                DeblockVerEdge(srcp + i, srcPitch, indexa, indexb);
        }
        srcp += 4 * srcPitch;
    }
}

// Filters four pixels of the horizontal edge lying just above srcp.
// "p" rows are above the edge (p0 closest), "q" rows are below it
// (q0 is the row srcp points to).
//   ia : index into alphas and cs.
//   ib : index into betas.
// Up to two pixels on each side of the edge are modified in place.
void EncDenoise::DeblockHorEdge(unsigned char *srcp, int srcPitch, int ia, int ib)
{
    const int alpha = alphas[ia];
    const int beta = betas[ib];
    const int c0 = cs[ia];

    unsigned char *rowQ0 = srcp;
    unsigned char *rowQ1 = srcp + srcPitch;
    unsigned char *rowQ2 = srcp + 2 * srcPitch;
    unsigned char *rowP0 = srcp - srcPitch;
    unsigned char *rowP1 = srcp - 2 * srcPitch;
    unsigned char *rowP2 = srcp - 3 * srcPitch;

    for ( int col = 0; col < 4; ++col )
    {
        const int p0 = rowP0[col];
        const int q0 = rowQ0[col];
        if ( abs(p0 - q0) >= alpha )
            continue;

        const int p1 = rowP1[col];
        if ( abs(p1 - p0) >= beta )
            continue;

        const int q1 = rowQ1[col];
        if ( abs(q0 - q1) >= beta )
            continue;

        const int p2 = rowP2[col];
        const int q2 = rowQ2[col];
        const bool flatP = abs(p2 - p0) < beta;
        const bool flatQ = abs(q2 - q0) < beta;

        // The bound on the edge pixels grows by one for each flat side.
        int c = c0;
        if ( flatQ ) c++;
        if ( flatP ) c++;

        const int mid = (p0 + q0 + 1) >> 1;
        const int delta = sat((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -c, c);
        const int deltaP1 = sat((p2 + mid - (p1 << 1)) >> 1, -c0, c0);
        const int deltaQ1 = sat((q2 + mid - (q1 << 1)) >> 1, -c0, c0);

        rowP0[col] = (unsigned char)sat(p0 + delta, 0, 255);
        rowQ0[col] = (unsigned char)sat(q0 - delta, 0, 255);

        // Second-row pixels are only touched on a flat side.
        if ( flatP )
            rowP1[col] = (unsigned char)(p1 + deltaP1);
        if ( flatQ )
            rowQ1[col] = (unsigned char)(q1 + deltaQ1);
    }
}

// Filters four rows of the vertical edge lying just left of srcp.
// Pixels at offsets -1, -2, -3 are on the left of the edge, offsets 0, 1, 2
// are on the right (offset 0 is the pixel srcp points to).
//   ia : index into alphas and cs.
//   ib : index into betas.
// Up to two pixels on each side of the edge are modified in place.
void EncDenoise::DeblockVerEdge(unsigned char *srcp, int srcPitch, int ia, int ib)
{
    const int alpha = alphas[ia];
    const int beta = betas[ib];
    const int c0 = cs[ia];

    unsigned char *px = srcp;
    for ( int row = 0; row < 4; ++row, px += srcPitch )
    {
        const int r0 = px[0];
        const int l0 = px[-1];
        if ( abs(r0 - l0) >= alpha )
            continue;

        const int r1 = px[1];
        if ( abs(r1 - r0) >= beta )
            continue;

        const int l1 = px[-2];
        if ( abs(l0 - l1) >= beta )
            continue;

        const int r2 = px[2];
        const int l2 = px[-3];
        const bool flatRight = abs(r2 - r0) < beta;
        const bool flatLeft = abs(l2 - l0) < beta;

        // The bound on the edge pixels grows by one for each flat side.
        int c = c0;
        if ( flatLeft ) c++;
        if ( flatRight ) c++;

        const int mid = (r0 + l0 + 1) >> 1;
        const int delta = sat((((r0 - l0) << 2) + (l1 - r1) + 4) >> 3, -c, c);
        const int deltaRight1 = sat((r2 + mid - (r1 << 1)) >> 1, -c0, c0);
        const int deltaLeft1 = sat((l2 + mid - (l1 << 1)) >> 1, -c0, c0);

        px[0] = (unsigned char)sat(r0 - delta, 0, 255);
        px[-1] = (unsigned char)sat(l0 + delta, 0, 255);

        // Second-column pixels are only touched on a flat side.
        if ( flatRight )
            px[1] = (unsigned char)(r1 + deltaRight1);
        if ( flatLeft )
            px[-2] = (unsigned char)(l1 + deltaLeft1);
    }
}

// 4x4 integer matrix stored row by row.
// Not referenced anywhere in the visible part of this file.
// NOTE(review): presumably the forward 4x4 transform matrix - confirm or remove.
const int fmatrix[16] = {
    1,  1,  1,  1,
    2,  1, -1, -2,
    1, -1, -1,  1,
    1, -2,  2, -1
};

// Quantisation multipliers, written as three rows of six values.
// QDQBlock multiplies the absolute value of a coefficient by an entry of this
// table; the entry is selected from scan[i][j] and q % 6.
const int mf[18] = {
    13107, 11916, 10082, 9362, 8192, 7282,
    5243, 4660, 4194, 3647, 3355, 2893,
    8066, 7490, 6554, 5825, 5243, 4559
};

// Rescaling factors, written as three rows of six values (same layout as mf).
// QDQBlock multiplies the quantised level by an entry of this table; the
// entry is selected from scan[i][j] and q % 6.
const int v[18] = {
    10, 11, 13, 14, 16, 18, 
    16, 18, 20, 23, 25, 29,
    13, 14, 16, 18, 20, 23
};


// Position class (0, 1 or 2) of each coefficient of a 4x4 block.
// QDQBlock uses scan[i][j] to pick which part of mf and v applies to the
// coefficient at row i, column j.
const int scan[4][4] = {
    {0, 2, 0, 2}, 
    {2, 1, 2, 1},
    {0, 2, 0, 2},
    {2, 1, 2, 1}
};

// 4x4 integer matrix stored row by row.
// Not referenced anywhere in the visible part of this file.
// NOTE(review): the first row ends in 1 where the first column is all 2 -
// looks like a possible typo; verify before using this table.
const int imatrix[16] = {
    2,  2,  2,  1,
    2,  1, -2, -2,
    2, -1, -2,  2,
    2, -2,  2, -1
};

// 4x4 table of per-position weights (25, 20 or 16) stored row by row.
// Not referenced anywhere in the visible part of this file.
const int aaa[16] = {
    25, 20, 25, 20,
    20, 16, 20, 16, 
    25, 20, 25, 20,
    20, 16, 20, 16
};


// Quantises and immediately rescales a 4x4 block of transform coefficients
// in place, which zeroes or rounds small coefficients.
//   dstp : coefficients, overwritten with the rescaled values.
//   q    : quantizer; q / 6 selects the shift, q % 6 the table column.
//   f    : deadzone divisor; the rounding offset is 2^(q/6 + 15) / f.
//          Must not be zero.
void EncDenoise::QDQBlock(__int16 dstp[4][4], int q, int f)
{
    const int qper = q / 6;
    const int qrem = q % 6;
    const int qbits = qper + 15;
    const int offset = (1 << qbits) / f;

    for ( int i = 0; i < 4; i++ )
    {
        for ( int j = 0; j < 4; j++ )
        {
            // mf and v hold three groups of SIX entries (one per q % 6), so
            // the group stride is 6. A stride of 3 makes the groups overlap
            // and never reaches the third group.
            const int idx = scan[i][j] * 6 + qrem;

            const int coef = dstp[i][j];
            const int level = (abs(coef) * mf[idx] + offset) >> qbits;

            // Shift the non-negative magnitude and restore the sign after,
            // so that no negative value is ever left-shifted.
            const int rescaled = (level * v[idx]) << qper;
            dstp[i][j] = (__int16)(( coef < 0 ) ? -rescaled : rescaled);
        }
    }
}

void EncDenoise::ProcessFrame(unsigned char *pDst, unsigned char *pRef, int dstPitch,
                              const unsigned char *pSrc, int srcPitch, int w, int h, int scale)
{
	const FakePlaneOfBlocks &blocks = fgop->GetPlane(0);

    __int16 blocks2[4][4][4];
    unsigned char blocks3[64];
 
    for ( int i = 0; i < blocks.GetBlockCount(); i++)
    {
		int x = blocks[i].GetX();
		int y = blocks[i].GetY();
        int dx = blocks[i].GetMV().x / nPel;
        int dy = blocks[i].GetMV().y / nPel;

        if (( nPel == 2 ) && (( blocks[i].GetMV().x & 1 ) || (blocks[i].GetMV().y & 1)))
        {
            if (( blocks[i].GetMV().x & 1 ) && ( blocks[i].GetMV().y & 1 ))
            {
                int nOffsetX  = (blocks[i].GetMV().x < 0) ? -1 : 0;
                int nOffsetY  = (blocks[i].GetMV().y < 0) ? -1 : 0;
                InterpolateDiagHpel(blocks3, 8, pRef + (x + dx + nOffsetX) + (y + dy + nOffsetY) * dstPitch, dstPitch);
            }
            else if ( blocks[i].GetMV().x & 1 )
            {
                int nOffsetX  = (blocks[i].GetMV().x < 0) ? -1 : 0;
                InterpolateHorHpel(blocks3, 8, pRef + (x + dx + nOffsetX) + (y + dy) * dstPitch, dstPitch);
            }
            else
            {
                int nOffsetY  = (blocks[i].GetMV().y < 0) ? -1 : 0;
                InterpolateVerHpel(blocks3, 8, pRef + (x + dx) + (y + dy + nOffsetY) * dstPitch, dstPitch);
            }
            x264_sub4x4_dct_mmxext(blocks2[0],  pSrc + x + y * srcPitch, srcPitch,
                blocks3, 8);
            x264_sub4x4_dct_mmxext(blocks2[1],  pSrc + x + 4 + y * srcPitch, srcPitch,
                blocks3 + 4, 8);
            x264_sub4x4_dct_mmxext(blocks2[2],  pSrc + x + (y + 4) * srcPitch, srcPitch,
                blocks3 + 32, 8);
            x264_sub4x4_dct_mmxext(blocks2[3],  pSrc + x + 4 + (y + 4) * srcPitch, srcPitch,
                blocks3 + 36, 8);

            MoveBlock(pDst + x + y * dstPitch, dstPitch, blocks3, 8);
        }
        else 
        {
            x264_sub4x4_dct_mmxext(blocks2[0],  pSrc + x + y * srcPitch, srcPitch,
                pRef + (x + dx) + (y + dy) * dstPitch, dstPitch);
            x264_sub4x4_dct_mmxext(blocks2[1],  pSrc + x + 4 + y * srcPitch, srcPitch,
                pRef + (x + dx + 4) + (y + dy) * dstPitch, dstPitch);
            x264_sub4x4_dct_mmxext(blocks2[2],  pSrc + x + (y + 4) * srcPitch, srcPitch,
                pRef + (x + dx) + (y + dy + 4) * dstPitch, dstPitch);
            x264_sub4x4_dct_mmxext(blocks2[3],  pSrc + x + 4 + (y + 4) * srcPitch, srcPitch,
                pRef + (x + dx + 4) + (y + dy + 4) * dstPitch, dstPitch);

            MoveBlock(pDst + x + y * dstPitch, dstPitch, pRef + (x + dx) + (y + dy) * dstPitch, dstPitch);
        }

        

        QDQBlock(blocks2[0], quant, deadzone);
        QDQBlock(blocks2[1], quant, deadzone);
        QDQBlock(blocks2[2], quant, deadzone);
        QDQBlock(blocks2[3], quant, deadzone);

        x264_add4x4_idct_mmxext(pDst + x + y * dstPitch, dstPitch, blocks2[0]);
        x264_add4x4_idct_mmxext(pDst + x + 4 + y * dstPitch, dstPitch, blocks2[1]);
        x264_add4x4_idct_mmxext(pDst + x + (y + 4) * dstPitch, dstPitch, blocks2[2]);
        x264_add4x4_idct_mmxext(pDst + x + 4 + (y + 4) * dstPitch, dstPitch, blocks2[3]);
    }

    DeblockPicture(pDst, dstPitch, w, h, quant, aOffset, bOffset);
}

// Produces output frame n.
//
// Two padded luma planes live in `buffer`; bufferPos says which one is the
// reference for this call, the other one receives the current result. The
// roles are swapped at the end of every call, so the plane written now is the
// reference of the next call.
//
// When the vectors for frame n are usable:
//   - unless the previous call served the neighbour frame (n + 1 for backward
//     vectors, n - 1 otherwise), the neighbour's luma is fetched from the
//     child clip and padded into the reference plane; otherwise the plane
//     left by the previous call is reused;
//   - ProcessFrame builds the luma; chroma is copied from the source frame.
// When they are not usable:
//   - with scBehavior false and the neighbour inside the clip, the neighbour
//     frame is output unchanged;
//   - otherwise the source frame is output unchanged.
//   In both cases the luma that was output is also stored in the current plane.
PVideoFrame __stdcall EncDenoise::GetFrame(int n, IScriptEnvironment* env)
{
	PVideoFrame	src	= child->GetFrame(n, env);
    PVideoFrame dst = env->NewVideoFrame(vi);

    const int lfPitch = nHPadding * 2 + nWidth;
    const int lfOff = nHPadding + nVPadding * lfPitch;

    GetVectorStream(n, env, mvclip, fgop);

    refFrame = buffer + bufferPos * frameSize;
    curFrame = buffer + (1 - bufferPos) * frameSize;

    const int off = ( isBackward ) ? 1 : -1;
    const int nRef = n + off;

    // Frame whose chroma (and, when nothing is filtered, luma) is passed
    // through to the output.
    PVideoFrame pass = src;

	if ( IsUsable(fgop) )
	{
        if ( lastf != nRef )
        {
            // Request the neighbour once and reuse the handle for both the
            // pointer and the pitch.
            PVideoFrame ref = child->GetFrame(nRef, env);
            env->BitBlt(refFrame + lfOff, lfPitch,
                ref->GetReadPtr(PLANAR_Y), ref->GetPitch(PLANAR_Y),
                nWidth, nHeight);
            Padding::PadReferenceFrame(refFrame, lfPitch, nHPadding, nVPadding, nWidth, nHeight);
        }

        ProcessFrame(curFrame + lfOff, refFrame + lfOff, lfPitch, src->GetReadPtr(PLANAR_Y),
            src->GetPitch(PLANAR_Y), nWidth, nHeight, 1);

        env->BitBlt(dst->GetWritePtr(PLANAR_Y), dst->GetPitch(PLANAR_Y),
            curFrame + lfOff, lfPitch, nWidth, nHeight);
    }
    else
    {
        if (( !scBehavior ) && ( nRef >= 0 ) && ( nRef < vi.num_frames ))
            pass = child->GetFrame(nRef, env);

        // Keep the unfiltered luma so it can serve as the next reference.
        env->BitBlt(curFrame + lfOff, lfPitch, pass->GetReadPtr(PLANAR_Y),
            pass->GetPitch(PLANAR_Y), nWidth, nHeight);

        env->BitBlt(dst->GetWritePtr(PLANAR_Y), dst->GetPitch(PLANAR_Y),
            pass->GetReadPtr(PLANAR_Y), pass->GetPitch(PLANAR_Y), nWidth, nHeight);
    }

    // Chroma is never filtered: copy it at half the luma width and height.
    env->BitBlt(dst->GetWritePtr(PLANAR_U), dst->GetPitch(PLANAR_U),
        pass->GetReadPtr(PLANAR_U), pass->GetPitch(PLANAR_U), nWidth / 2, nHeight / 2);

    env->BitBlt(dst->GetWritePtr(PLANAR_V), dst->GetPitch(PLANAR_V),
        pass->GetReadPtr(PLANAR_V), pass->GetPitch(PLANAR_V), nWidth / 2, nHeight / 2);

    Padding::PadReferenceFrame(curFrame, lfPitch, nHPadding, nVPadding, nWidth, nHeight);

    bufferPos = 1 - bufferPos;

    lastf = n;

	return dst;
}