/*
		This program is free software; you can redistribute it and/or modify
		it under the terms of the GNU General Public License as published by
		the Free Software Foundation.

		This program is distributed in the hope that it will be useful,
		but WITHOUT ANY WARRANTY; without even the implied warranty of
		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
		GNU General Public License for more details.

		You should have received a copy of the GNU General Public License
		along with this program; if not, write to the Free Software
		Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

		Original implementation by Donald Graft.
		Optimizations and improvements by Klaus Post.

		The author can be contacted at:
		Donald Graft
		neuron2@comcast.net.
*/

/* IMPORTANT NOTE: This filter must be built with the Intel Compiler.
   The VC++ 6.0 compiler is too stupid to generate correct code. */
/* VC++ 2003 (Toolkit) works fine - Fizick */

#include "avisynth.h"
#include "info.h"
#include "stdio.h"
#include <stdlib.h>
#include <string.h>

/* Edge length, in pixels, of the square blocks whose differences are summed. */
#define BLKSIZE 32
/* Divisor used when a block's summed difference is turned into a percentage. */
#define NORM (235*BLKSIZE*BLKSIZE)
/* Sizes the frame cache (MAX_COPIES+2 entries). */
#define MAX_COPIES 20
/* Version string printed in the debug banner and in the show overlay. */
#define MYVERSION "2.23"

/* One cached frame together with the result of comparing it against the
   first frame of the current run (cache[0]). */
struct FRAMEINFO
{
	/* Frame number this entry was requested for. */
	unsigned int frame_no;
	/* The frame itself. */
	PVideoFrame frame;
	/* Pitch of the packed plane (filled in for YUY2 input). */
	unsigned int pitch;
	/* Pitch of the Y plane (filled in for YV12 input). */
	unsigned int pitchY;
	/* Pitch of the U plane, also used for V (filled in for YV12 input). */
	unsigned int pitchUV;
	/* Read pointer of the packed plane (YUY2 input). */
	unsigned char *frame_ptr;
	/* Read pointers of the Y, U and V planes (YV12 input). */
	unsigned char *frame_ptrY;
	unsigned char *frame_ptrU;
	unsigned char *frame_ptrV;
	/* Percentage difference of the most different block versus cache[0]. */
	double metric;
	/* Left edge (byte offset within a row) of the most different block. */
	int highest_x;
	/* Top edge (row) of the most different block. */
	int highest_y;
};

/* Dup: compares each frame with the frames that follow it, block by block,
   and (when copy=true) replaces runs of near-identical frames by a single
   frame, optionally a blend of the run. The per-frame work is done in
   GetFrame(); the constructor only validates parameters and allocates the
   per-block sum table. */
class Dup : public GenericVideoFilter
{
	double threshold;                       /* percent difference above which frames are not duplicates */
	bool show, copy, debug, chroma, blend;
	int maxcopies;                          /* bounds the forward compare loop in GetFrame() */
	bool have_isse;                         /* CPUF_INTEGER_SSE reported by the environment */
	char log[1024];                         /* log file name, empty when logging is off */
	FILE *logfp;                            /* open log file, or NULL */

	struct FRAMEINFO cache[MAX_COPIES+2];
	PVideoFrame copyframe;
	int cache_count, useframe, last_n;

public:
	/* Validates the parameters (all failures are reported via env->ThrowError),
	   opens the optional log file and allocates the block sum table.
	   _log may be an empty string to disable logging. */
	Dup(PClip _child, double _threshold, bool _chroma, bool _show, bool _copy, int _maxcopies, bool _blend,
		bool _debug, const char *_log, IScriptEnvironment* env) :
		GenericVideoFilter(_child), threshold(_threshold), show(_show), copy(_copy), debug(_debug),
		chroma(_chroma), blend(_blend), maxcopies(_maxcopies), have_isse(false), logfp(NULL),
		cache_count(0), useframe(-1), last_n(-1), xblocks(0), yblocks(0), sum(NULL), highest_sum(0)
	{
		log[0] = 0;
		if (!vi.IsYUY2() && !vi.IsYV12())
			env->ThrowError("Dup: requires YUY2 or YV12 source");
		/* The cache holds MAX_COPIES+2 entries and GetFrame() needs at least
		   one comparison per run, so maxcopies must lie in [0, MAX_COPIES]. */
		if (maxcopies > MAX_COPIES)
			env->ThrowError("Dup: maxcopies must be <= 20");
		if (maxcopies < 0)
			env->ThrowError("Dup: maxcopies must be >= 0");
		if (threshold < 0.0 || threshold > 100.0)
			env->ThrowError("Dup: threshold out of range (0.0-100.0)");
		if (blend == true && copy == false)
			env->ThrowError("Dup: blend=true requires copy=true");

		/* Refuse names that do not fit instead of overflowing the buffer. */
		if (_log == NULL) _log = "";
		if (strlen(_log) >= sizeof(log))
			env->ThrowError("Dup: log file name too long");
		strcpy(log, _log);
		if (*log)
		{
			logfp = fopen(log, "w");
			if (logfp == NULL)
			{
				env->ThrowError("Dup: cannot open log file");
			}
		}
		if (debug)
		{
			char b[80];
			sprintf(b, "Dup %s by Donald Graft/Klaus Post, Copyright 2003-2007\n", MYVERSION);
			OutputDebugString(b);
			if (logfp)
				fprintf(logfp, "%s\n", b);
		}
		have_isse = (env->GetCPUFlags() & CPUF_INTEGER_SSE) != 0;
		copyframe = env->NewVideoFrame(vi);

		/* One sum per BLKSIZE x BLKSIZE block, partial edge blocks included. */
		xblocks = (vi.width+BLKSIZE-1) / BLKSIZE;
		yblocks = (vi.height+BLKSIZE-1) / BLKSIZE;

		sum = (unsigned int *) malloc(xblocks * yblocks * sizeof(unsigned int));
		if (sum == NULL)
		{
			/* The destructor does not run when the constructor throws, so
			   release the log file here. */
			if (logfp)
			{
				fclose(logfp);
				logfp = NULL;
			}
			env->ThrowError("Dup: cannot allocate needed memory");
		}
		/* For safety in case someone came in without doing it. */
		__asm emms;

	}
	~Dup()
	{
		if (sum) free(sum);
		if (logfp)
			fclose(logfp);
	}
	/* Returns frame n of the filtered clip. */
	PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env);
	/* Per-block sums of absolute differences of two planes (32x32 blocks). */
	void isse_scenechange(const BYTE* c_plane, const BYTE* tplane, int height, int width, int pitch,  int t_pitch, int* blk_values);
	/* Same for 16x16 blocks; results are added to blk_values. */
	void isse_scenechange_16(const BYTE* c_plane, const BYTE* tplane, int height, int width, int pitch,  int t_pitch, int* blk_values);
	/* Blends one line of several source planes into dst_plane. */
	void mmx_average_planes(BYTE* dst_plane, const BYTE** src_planes, int width_mod8, int planes, int div);
private:
	/* The class owns sum and logfp; copying would free/close them twice.
	   Declared but intentionally not defined. */
	Dup(const Dup&);
	Dup& operator=(const Dup&);

	int xblocks, yblocks;                   /* block grid dimensions */
	unsigned int *sum, highest_sum;         /* per-block sums and their current maximum */

};

PVideoFrame __stdcall Dup::GetFrame(int n, IScriptEnvironment* env)
{
	int f;
    const unsigned char *srcp;
    const unsigned char *src0p;

    int row_size, row_sizeY, row_sizeUV;
    int height, heightY, heightUV;

	int highest_x, highest_y;
	int i, j, x, y;
	char buf[80];
	int xlim, ylim, xtmp, ytmp;
	unsigned char *dstp;
	double metric;
	int offset_remainX = (vi.width&(~(BLKSIZE-1)));  // Offset into the frame (pixels)
	int offset_remainY = (vi.height/BLKSIZE)*BLKSIZE;  // yposition the remaining pixels start (lines)
	int remainX = vi.width&(BLKSIZE-1) + offset_remainX;       // Where does the remaining pixels end? (pixels)
	int remainY = (vi.height&(BLKSIZE-1)) + offset_remainY;   // Where does the remaining pixels end? (lines)

	PVideoFrame showframe;

	// Detect random access and restart.
	if (n != last_n + 1) useframe = -1;
	last_n = n;

	if (show == true) showframe = env->NewVideoFrame(vi);

	if (n > useframe)
	{
		/* Restart the duplicate detection with the current frame.
		   useframe will end up with the frame number of the last duplicate in
		   the string (if any). If there are no duplicates, it will be the
		   current frame number. */
		cache[0].frame_no = n;
		cache[0].frame = child->GetFrame(n, env);
		if (vi.IsYUY2())
		{
			cache[0].frame_ptr = (unsigned char *) cache[0].frame->GetReadPtr();
			cache[0].pitch = cache[0].frame->GetPitch();
			row_size = cache[0].frame->GetRowSize();
			height = cache[0].frame->GetHeight();
		}
		else
		{
			cache[0].frame_ptrY = (unsigned char *) cache[0].frame->GetReadPtr(PLANAR_Y);
			cache[0].frame_ptrU = (unsigned char *) cache[0].frame->GetReadPtr(PLANAR_U);
			cache[0].frame_ptrV = (unsigned char *) cache[0].frame->GetReadPtr(PLANAR_V);
			cache[0].pitchY = cache[0].frame->GetPitch(PLANAR_Y);
			row_sizeY = cache[0].frame->GetRowSize(PLANAR_Y);
			heightY = cache[0].frame->GetHeight(PLANAR_Y);
			cache[0].pitchUV = cache[0].frame->GetPitch(PLANAR_U);
			row_sizeUV = cache[0].frame->GetRowSize(PLANAR_U);
			heightUV = cache[0].frame->GetHeight(PLANAR_U);
		}
		cache[0].metric = 0.0;
		cache_count = 1;


		/* Compare forward either until a) we break after the first compare because
		   copy=false, b) we hit a frame whose difference exceeds threshold,
		   c) we reach maxcopies, or d) we reach end of clip. */
		for (f = 1; f < maxcopies + 2; f++)
		{
			/* Clear the block sums. */
 			for (i = 0; i < yblocks; i++)
			{
 				for (j = 0; j < xblocks; j++)
				{
					sum[i*xblocks+j] = 0;
				}
			}
			/* Get the next frame to compare against the current frame. */
			cache[f].frame_no = n + f;
			cache[f].frame = child->GetFrame((n + f < vi.num_frames) ? n + f : vi.num_frames - 1, env);
			/* Do the comparison. */
			if (vi.IsYUY2())
			{
				cache[f].frame_ptr = (unsigned char *) cache[f].frame->GetReadPtr();
				cache[f].pitch = cache[f].frame->GetPitch();
				src0p = cache[0].frame_ptr;
				srcp = cache[f].frame_ptr;

				for (y = 0; y < height; y++)
				{
					for (x = 0; x < row_size;)
					{
						sum[(y/BLKSIZE)*xblocks + x/(2*BLKSIZE)] += abs((int)srcp[x] - (int)src0p[x]);
						chroma == true ? x++ : x+=2;
					}
					srcp += cache[f].pitch;
					src0p += cache[0].pitch;
				}
			}
			else
			{
				cache[f].frame_ptrY = (unsigned char *) cache[f].frame->GetReadPtr(PLANAR_Y);
				cache[f].frame_ptrU = (unsigned char *) cache[f].frame->GetReadPtr(PLANAR_U);
				cache[f].frame_ptrV = (unsigned char *) cache[f].frame->GetReadPtr(PLANAR_V);
				cache[f].pitchY = cache[f].frame->GetPitch(PLANAR_Y);
				cache[f].pitchUV = cache[f].frame->GetPitch(PLANAR_U);

				src0p = cache[0].frame_ptrY;
				srcp = cache[f].frame_ptrY;
				if (have_isse == true)
				{
					isse_scenechange(srcp, src0p, heightY, row_sizeY, cache[f].pitchY,  cache[0].pitchY,(int*)sum);
					// Right remaining
					for (y = 0; y < remainY; y++)
					{
						for (x = offset_remainX; x < remainX; x++)
						{
							sum[(y/BLKSIZE)*xblocks + x/BLKSIZE] += abs((int)srcp[x] - (int)src0p[x]);
						}
						srcp += cache[f].pitchY;
						src0p += cache[0].pitchY;
					}
					// Bottom remaining
					src0p = cache[0].frame_ptrY+ (cache[0].pitchY*offset_remainY);
					srcp = cache[f].frame_ptrY + (cache[f].pitchY*offset_remainY);
					for (y = offset_remainY; y < heightY; y++)
					{
						for (x = 0; x < row_sizeY; x++)
						{
							sum[(y/BLKSIZE)*xblocks + x/BLKSIZE] += abs((int)srcp[x] - (int)src0p[x]);
						}
						srcp += cache[f].pitchY;
						src0p += cache[0].pitchY;
					}
				}
				else
				{
					for (y = 0; y < heightY; y++)
					{
						for (x = 0; x < row_sizeY; x++)
						{
							sum[(y/BLKSIZE)*xblocks + x/BLKSIZE] += abs((int)srcp[x] - (int)src0p[x]);
						}
						srcp += cache[f].pitchY;
						src0p += cache[0].pitchY;
					}
				}
				if (chroma == true)
				{
					src0p = cache[0].frame_ptrU;
					srcp = cache[f].frame_ptrU;
					if (have_isse == true)
					{
						isse_scenechange_16(srcp, src0p, heightUV, row_sizeUV, cache[f].pitchUV,  cache[0].pitchUV,(int*)sum);
						// Right remaining
						for (y = 0; y < (remainY>>1); y++)
						{
							for (x = (offset_remainX>>1); x < row_sizeUV; x++)
							{
								sum[2*(y/BLKSIZE)*xblocks + 2*x/BLKSIZE] += abs((int)srcp[x] - (int)src0p[x]);
							}
							srcp += cache[f].pitchUV;
							src0p += cache[0].pitchUV;
						}
						// Bottom remaining
						src0p = cache[0].frame_ptrU+ ((cache[0].pitchUV*offset_remainY)>>1);
						srcp = cache[f].frame_ptrU + ((cache[f].pitchUV*offset_remainY)>>1);
						for (y = (offset_remainY>>1); y < heightUV; y++)
						{
							for (x = 0; x < row_sizeUV; x++)
							{
								sum[2*(y/BLKSIZE)*xblocks + 2*x/BLKSIZE] += abs((int)srcp[x] - (int)src0p[x]);
							}
							srcp += cache[f].pitchUV;
							src0p += cache[0].pitchUV;
						}
					}
					else
					{
						for (y = 0; y < heightUV; y++)
						{
							for (x = 0; x < row_sizeUV; x++)
							{
								sum[(2*y/BLKSIZE)*xblocks + 2*x/BLKSIZE] += abs((int)srcp[x] - (int)src0p[x]);
							}
							srcp += cache[f].pitchUV;
							src0p += cache[0].pitchUV;
						}
					}

					src0p = cache[0].frame_ptrV;
					srcp = cache[f].frame_ptrV;
					if (have_isse == true)
					{
						isse_scenechange_16(srcp, src0p, heightUV, row_sizeUV, cache[f].pitchUV,  cache[0].pitchUV,(int*)sum);
						// Right remaining
						for (y = 0; y < (remainY>>1); y++)
						{
							for (x = (offset_remainX>>1); x < (remainX>>1); x++)
							{
								sum[2*(y/BLKSIZE)*xblocks + 2*x/BLKSIZE] += abs((int)srcp[x] - (int)src0p[x]);
							}
							srcp += cache[f].pitchUV;
							src0p += cache[0].pitchUV;
						}
						// Bottom remaining
						src0p = cache[0].frame_ptrV+ ((cache[0].pitchUV*offset_remainY)>>1);
						srcp = cache[f].frame_ptrV + ((cache[f].pitchUV*offset_remainY)>>1);
						for (y = (offset_remainY>>1); y < (heightUV); y++)
						{
							for (x = 0; x < row_sizeUV; x++)
							{
								sum[2*(y/BLKSIZE)*xblocks + 2*x/BLKSIZE] += abs((int)srcp[x] - (int)src0p[x]);
							}
							srcp += cache[f].pitchUV;
							src0p += cache[0].pitchUV;
						}
					}
					else
					{
						for (y = 0; y < heightUV; y++)
						{
							for (x = 0; x < row_sizeUV; x++)
							{
								sum[(2*y/BLKSIZE)*xblocks + 2*x/BLKSIZE] += abs((int)srcp[x] - (int)src0p[x]);
							}
							srcp += cache[f].pitchUV;
							src0p += cache[0].pitchUV;
						}
					}
				}
			}

			/* Now find the 32x32 block that has the greatest difference. */
			highest_sum = 0;
			highest_x = highest_y = 0;
			for (i = 0; i < yblocks; i++)
			{
				for (j = 0; j < xblocks; j++)
				{
					if (sum[i * xblocks + j] > highest_sum)
					{
						highest_sum = sum[i * xblocks + j];
						if (vi.IsYUY2()) highest_x = j * BLKSIZE * 2;
						else highest_x = j * BLKSIZE;
						highest_y = i * BLKSIZE;
					}
				}
			}
			/* Caluclate the percentage difference for the block and store the results. */
			cache[f].metric = (highest_sum * 100.0) / NORM;
			cache[f].highest_x = highest_x;
			cache[f].highest_y = highest_y;
			cache_count++;
			if (copy == false || cache[f].metric > threshold || n + f >= vi.num_frames - 1) break;
		}

		/* Make needed local copies because once we leave we can't rely on things remaining
		   in the Avisynth frame cache. */
		if (copy == true)
		{
			/* Copy the last frame in the string of duplicates to a local copy. If there are
			   no duplicates, it will be the current frame. Also record the frame number of the
			   last frame in the string of duplicates. */
			useframe = n + cache_count - 2;
			if (vi.IsYUY2())
			{
				env->BitBlt(copyframe->GetWritePtr(), copyframe->GetPitch(),
							cache[cache_count-2].frame->GetReadPtr(), cache[cache_count-2].frame->GetPitch(),
							cache[cache_count-2].frame->GetRowSize(), cache[cache_count-2].frame->GetHeight());
			}
			else
			{
				env->BitBlt(copyframe->GetWritePtr(PLANAR_Y),
							copyframe->GetPitch(PLANAR_Y),
							cache[cache_count-2].frame->GetReadPtr(PLANAR_Y),
							cache[cache_count-2].frame->GetPitch(PLANAR_Y),
							cache[cache_count-2].frame->GetRowSize(PLANAR_Y),
							cache[cache_count-2].frame->GetHeight(PLANAR_Y));
				env->BitBlt(copyframe->GetWritePtr(PLANAR_U),
							copyframe->GetPitch(PLANAR_U),
							cache[cache_count-2].frame->GetReadPtr(PLANAR_U),
							cache[cache_count-2].frame->GetPitch(PLANAR_U),
							cache[cache_count-2].frame->GetRowSize(PLANAR_U),
							cache[cache_count-2].frame->GetHeight(PLANAR_U));
				env->BitBlt(copyframe->GetWritePtr(PLANAR_V),
							copyframe->GetPitch(PLANAR_V),
							cache[cache_count-2].frame->GetReadPtr(PLANAR_V),
							cache[cache_count-2].frame->GetPitch(PLANAR_V),
							cache[cache_count-2].frame->GetRowSize(PLANAR_V),
							cache[cache_count-2].frame->GetHeight(PLANAR_V));
			}

			/* If blend=true, modify copyframe to be a blend of all the frames in the
			   string of duplicates. */
			if (blend == true)
			{
				int i;
				char buf[80];
				int dpitchY, dpitchUV, row_sizeY2, heightY2;
				BYTE* dst_planeY;
				BYTE* dst_planeU;
				BYTE* dst_planeV;
				const BYTE** src_planesY=0;
				const BYTE** src_planesU=0;
				const BYTE** src_planesV=0;
				int* src_pitchY=0;
				int* src_pitchUV=0;
				int planesY=0;
				int planesUV=0;

				/* There's nothing to blend if the current frame is the last frame,
				   or the current frame is the only one in the cache. */
				if (n < vi.num_frames - 1 && cache_count > 2)
				{
					OutputDebugString("Dup: blending...\n");
					if (vi.IsYUY2())
					{
						dst_planeY = copyframe->GetWritePtr();
						src_planesY = new const BYTE*[cache_count];
						src_pitchY = new int[cache_count];
					    dpitchY = copyframe->GetPitch();
						row_sizeY2=row_size;
						heightY2=height;
					}
					else
					{
						dpitchY = copyframe->GetPitch(PLANAR_Y);
						dpitchUV = copyframe->GetPitch(PLANAR_U);
						dst_planeY = copyframe->GetWritePtr(PLANAR_Y);
						dst_planeU = copyframe->GetWritePtr(PLANAR_U);
						dst_planeV = copyframe->GetWritePtr(PLANAR_V);
						src_planesY = new const BYTE*[cache_count];
						src_planesU = new const BYTE*[cache_count];
						src_planesV = new const BYTE*[cache_count];
						src_pitchY = new int[cache_count];
						src_pitchUV = new int[cache_count];
						row_sizeY2=row_sizeY;
						heightY2=heightY;
					}
					for (i = 0; i < cache_count - 2; i++)
					{
						if (debug == true)
						{
							sprintf(buf, "Dup: blending %d into %d\n", cache[i].frame_no, cache[cache_count-2].frame_no);
							OutputDebugString(buf);
							if (logfp)
								fprintf(logfp, "%s\n", buf);
						}
						if (vi.IsYUY2())
						{
							 src_planesY[planesY] = cache[i].frame_ptr;
							 src_pitchY[planesY] = cache[i].pitch;
							 planesY++;
						}
						else  //YV12
						{
							src_planesY[planesY] = cache[i].frame_ptrY;
							src_pitchY[planesY] = cache[i].pitchY;
						    planesY++;

							src_planesU[planesUV] = cache[i].frame_ptrU;
							src_planesV[planesUV] = cache[i].frame_ptrV;
							src_pitchUV[planesUV] = cache[i].pitchUV;
							planesUV++;
						}
					}  // End for i
					// Blend Y
					if (planesY)
					{
						int c_div=32768/(planesY+1); // Fizick - add 1 for dst plane
						for (int j=0;j<heightY2;j++)
						{
							mmx_average_planes(dst_planeY,src_planesY,row_sizeY2, planesY-1, c_div);
							dst_planeY+=dpitchY;
							for (int i=0;i<planesY;i++)
							{
								src_planesY[i]+=src_pitchY[i];
							}
						}
					} // End if planesY
					if (planesUV)
					{
						int c_div=32768/(planesUV+1); // Fizick - add 1 for dst plane
						for (int j=0;j<heightUV;j++)
						{
							mmx_average_planes(dst_planeU,src_planesU,row_sizeUV, planesUV-1, c_div);
							mmx_average_planes(dst_planeV,src_planesV,row_sizeUV, planesUV-1, c_div);
							dst_planeU+=dpitchUV;
							dst_planeV+=dpitchUV;
							for (int i=0;i<planesUV;i++)
							{
								src_planesU[i]+=src_pitchUV[i];
								src_planesV[i]+=src_pitchUV[i];
							}
						}
					} // End if planesUV
					delete[] src_planesY;
					delete[] src_pitchY;
					if (src_planesU) delete[] src_planesU;
					if (src_planesV) delete[] src_planesV;
					if (src_pitchUV) delete[] src_pitchUV;

				} // End if blend
			}

			/* If show=true, make another copy of copyframe. We need this extra
			   copy because we are going to write show data on it and that has to be
			   cleared on each new frame. To clear it, we start again with copyframe
			   each time. */
			if (show == true)
			{
				if (vi.IsYUY2())
				{
					env->BitBlt(showframe->GetWritePtr(), showframe->GetPitch(),
								copyframe->GetReadPtr(), copyframe->GetPitch(),
								copyframe->GetRowSize(), copyframe->GetHeight());
				}
				else
				{
					env->BitBlt(showframe->GetWritePtr(PLANAR_Y),
								showframe->GetPitch(PLANAR_Y),
								copyframe->GetReadPtr(PLANAR_Y),
								copyframe->GetPitch(PLANAR_Y),
								copyframe->GetRowSize(PLANAR_Y),
								copyframe->GetHeight(PLANAR_Y));
					env->BitBlt(showframe->GetWritePtr(PLANAR_U),
								showframe->GetPitch(PLANAR_U),
								copyframe->GetReadPtr(PLANAR_U),
								copyframe->GetPitch(PLANAR_U),
								copyframe->GetRowSize(PLANAR_U),
								copyframe->GetHeight(PLANAR_U));
					env->BitBlt(showframe->GetWritePtr(PLANAR_V),
								showframe->GetPitch(PLANAR_V),
								copyframe->GetReadPtr(PLANAR_V),
								copyframe->GetPitch(PLANAR_V),
								copyframe->GetRowSize(PLANAR_V),
								copyframe->GetHeight(PLANAR_V));
				}
			}
		}
		else if (show == true)
		{
			/* We just need a copy of the current frame. We need this extra
			   copy because we are going to write show data on it and that has to be
			   cleared on each new frame. To clear it, we start again with the current frame
			   each time. */
			if (vi.IsYUY2())
			{
				env->BitBlt(showframe->GetWritePtr(), showframe->GetPitch(),
							cache[0].frame->GetReadPtr(), cache[0].frame->GetPitch(),
							cache[0].frame->GetRowSize(), cache[0].frame->GetHeight());
			}
			else
			{
				env->BitBlt(showframe->GetWritePtr(PLANAR_Y),
							showframe->GetPitch(PLANAR_Y),
							cache[0].frame->GetReadPtr(PLANAR_Y),
							cache[0].frame->GetPitch(PLANAR_Y),
							cache[0].frame->GetRowSize(PLANAR_Y),
							cache[0].frame->GetHeight(PLANAR_Y));
				env->BitBlt(showframe->GetWritePtr(PLANAR_U),
							showframe->GetPitch(PLANAR_U),
							cache[0].frame->GetReadPtr(PLANAR_U),
							cache[0].frame->GetPitch(PLANAR_U),
							cache[0].frame->GetRowSize(PLANAR_U),
							cache[0].frame->GetHeight(PLANAR_U));
				env->BitBlt(showframe->GetWritePtr(PLANAR_V),
							showframe->GetPitch(PLANAR_V),
							cache[0].frame->GetReadPtr(PLANAR_V),
							cache[0].frame->GetPitch(PLANAR_V),
							cache[0].frame->GetRowSize(PLANAR_V),
							cache[0].frame->GetHeight(PLANAR_V));
			}
		}
	}
	else if (show == true)
	{
		/* We are in a string of duplicates and just need to refresh
		   showframe if show=true. */
		if (copy == true)
		{
			/* Refresh from copyframe. */
			if (vi.IsYUY2())
			{
				env->BitBlt(showframe->GetWritePtr(), showframe->GetPitch(),
							copyframe->GetReadPtr(), copyframe->GetPitch(),
							copyframe->GetRowSize(), copyframe->GetHeight());
			}
			else
			{
				env->BitBlt(showframe->GetWritePtr(PLANAR_Y),
							showframe->GetPitch(PLANAR_Y),
							copyframe->GetReadPtr(PLANAR_Y),
							copyframe->GetPitch(PLANAR_Y),
							copyframe->GetRowSize(PLANAR_Y),
							copyframe->GetHeight(PLANAR_Y));
				env->BitBlt(showframe->GetWritePtr(PLANAR_U),
							showframe->GetPitch(PLANAR_U),
							copyframe->GetReadPtr(PLANAR_U),
							copyframe->GetPitch(PLANAR_U),
							copyframe->GetRowSize(PLANAR_U),
							copyframe->GetHeight(PLANAR_U));
				env->BitBlt(showframe->GetWritePtr(PLANAR_V),
							showframe->GetPitch(PLANAR_V),
							copyframe->GetReadPtr(PLANAR_V),
							copyframe->GetPitch(PLANAR_V),
							copyframe->GetRowSize(PLANAR_V),
							copyframe->GetHeight(PLANAR_V));
			}
		}
		else
		{
			/* Refresh from the current frame. */
			if (vi.IsYUY2())
			{
				env->BitBlt(showframe->GetWritePtr(), showframe->GetPitch(),
							cache[0].frame->GetReadPtr(), cache[0].frame->GetPitch(),
							cache[0].frame->GetRowSize(), cache[0].frame->GetHeight());
			}
			else
			{
				env->BitBlt(showframe->GetWritePtr(PLANAR_Y),
							showframe->GetPitch(PLANAR_Y),
							cache[0].frame->GetReadPtr(PLANAR_Y),
							cache[0].frame->GetPitch(PLANAR_Y),
							cache[0].frame->GetRowSize(PLANAR_Y),
							cache[0].frame->GetHeight(PLANAR_Y));
				env->BitBlt(showframe->GetWritePtr(PLANAR_U),
							showframe->GetPitch(PLANAR_U),
							cache[0].frame->GetReadPtr(PLANAR_U),
							cache[0].frame->GetPitch(PLANAR_U),
							cache[0].frame->GetRowSize(PLANAR_U),
							cache[0].frame->GetHeight(PLANAR_U));
				env->BitBlt(showframe->GetWritePtr(PLANAR_V),
							showframe->GetPitch(PLANAR_V),
							cache[0].frame->GetReadPtr(PLANAR_V),
							cache[0].frame->GetPitch(PLANAR_V),
							cache[0].frame->GetRowSize(PLANAR_V),
							cache[0].frame->GetHeight(PLANAR_V));
			}
		}
	}

	if (show == true)
	{
		/* Generate show data overlay. */
		if (copy == true)
		{
			sprintf(buf, "Dup %s", MYVERSION);
			if (vi.IsYUY2()) DrawStringYUY2(showframe, 0, 0, buf);
			else  DrawString(showframe, 0, 0, buf);
			sprintf(buf, "Copyright 2003-2007 Donald Graft/Klaus Post");
			if (vi.IsYUY2()) DrawStringYUY2(showframe, 0, 1, buf);
			else  DrawString(showframe, 0, 1, buf);
			int f = cache[cache_count-2].frame_no;
			if (f >= vi.num_frames) f = vi.num_frames - 1;
			sprintf(buf, "frm %d: diff from frm %d = %2.2f%%", n, cache[0].frame_no, cache[n-cache[0].frame_no].metric);
			if (vi.IsYUY2()) DrawStringYUY2(showframe, 0, 3, buf);
			else  DrawString(showframe, 0, 3, buf);
			sprintf(buf, "Using frm %d", f);
			if (vi.IsYUY2()) DrawStringYUY2(showframe, 0, 4, buf);
			else  DrawString(showframe, 0, 4, buf);
			if (blend == true && cache_count > 2)
			{
				sprintf(buf, "Blended %d through %d",
						cache[0].frame_no, cache[cache_count-2].frame_no);
				if (vi.IsYUY2()) DrawStringYUY2(showframe, 0, 5, buf);
				else  DrawString(showframe, 0, 5, buf);
			}
		}
		else
		{
			highest_x = cache[1].highest_x;
			highest_y = cache[1].highest_y;
			metric = cache[1].metric;
			sprintf(buf, "Dup %s", MYVERSION);
			if (vi.IsYUY2()) DrawStringYUY2(showframe, 0, 0, buf);
			else  DrawString(showframe, 0, 0, buf);
			sprintf(buf, "Copyright 2003-2007 Donald Graft/Klaus Post");
			if (vi.IsYUY2()) DrawStringYUY2(showframe, 0, 1, buf);
			else  DrawString(showframe, 0, 1, buf);
			if (vi.IsYUY2())
			{
				int pitch = showframe->GetPitch();
				dstp = showframe->GetWritePtr();
				xlim = highest_x + 2*BLKSIZE;
				if (xlim > row_size) xlim = row_size;
				ylim = highest_y + BLKSIZE;
				if (ylim > height) ylim = height;
				for (y = highest_y; y < ylim; y++)
				{
					(dstp + y * pitch)[highest_x] = 235;
					xtmp = highest_x+2*(BLKSIZE - 1);
					if (xtmp < row_size)
						(dstp + y * pitch)[xtmp] = 235;
				}
				for (x = highest_x; x < xlim; x+=4)
				{
					(dstp + (highest_y) * pitch)[x] = 235;
					(dstp + (highest_y) * pitch)[x+2] = 235;
					(dstp + (highest_y) * pitch)[x+1] = 128;
					(dstp + (highest_y) * pitch)[x+3] = 128;
					ytmp = highest_y + BLKSIZE - 1;
					if (ytmp < height)
					{
						(dstp + ytmp * pitch)[x] = 235;
						(dstp + ytmp * pitch)[x+2] = 235;
						(dstp + ytmp * pitch)[x+1] = 128;
						(dstp + ytmp * pitch)[x+3] = 128;
					}
				}
				if (metric < threshold)
				{
					for (y = highest_y, x = 0; y < ylim; y++, x++)
					{
						xtmp = highest_x+2*x;
						if (xtmp < row_size)
							(dstp + y * pitch)[highest_x+2*x] = 235;
						xtmp = highest_x+2*(BLKSIZE - 1 - x);
						if (xtmp < row_size)
							(dstp + y * pitch)[highest_x+2*(BLKSIZE - 1 - x)] = 235;
					}
				}
			}
			else
			{
				int pitchY = showframe->GetPitch(PLANAR_Y);
				dstp = showframe->GetWritePtr(PLANAR_Y);
				xlim = highest_x + BLKSIZE;
				if (xlim > row_sizeY) xlim = row_sizeY;
				ylim = highest_y + BLKSIZE;
				if (ylim > heightY) ylim = heightY;
				for (y = highest_y; y < ylim; y++)
				{
					(dstp + y * pitchY)[highest_x] = 235;
					xtmp = highest_x + (BLKSIZE - 1);
					if (xtmp < row_sizeY)
						(dstp + y * pitchY)[xtmp] = 235;
				}
				for (x = highest_x; x < xlim; x++)
				{
					(dstp + (highest_y) * pitchY)[x] = 235;
					ytmp = highest_y + BLKSIZE - 1;
					if (ytmp < heightY)
					{
						(dstp + ytmp * pitchY)[x] = 235;
					}
				}
				if (metric < threshold)
				{
					for (y = highest_y, x = 0; y < ylim; y++, x++)
					{
						xtmp = highest_x+x;
						if (xtmp < row_sizeY)
							(dstp + y * pitchY)[highest_x+x] = 235;
						xtmp = highest_x+(BLKSIZE - 1 - x);
						if (xtmp < row_sizeY)
							(dstp + y * pitchY)[highest_x+(BLKSIZE - 1 - x)] = 235;
					}
				}
			}
			int f = n + 1;
			if (f >= vi.num_frames) f = vi.num_frames - 1;
			sprintf(buf, "frm %d: diff from frm %d = %2.2f%%", n, f, cache[1].metric);
			if (vi.IsYUY2()) DrawStringYUY2(showframe, 0, 3, buf);
			else  DrawString(showframe, 0, 3, buf);
			sprintf(buf, "Using frm %d", cache[0].frame_no);
			if (vi.IsYUY2()) DrawStringYUY2(showframe, 0, 4, buf);
			else  DrawString(showframe, 0, 4, buf);
		}
	}
	if (debug == true)
	{
		/* Generate debug data. */
		if (copy == true)
		{
			int f = cache[cache_count-2].frame_no;
			if (f >= vi.num_frames) f = vi.num_frames - 1;
			sprintf(buf, "Dup: frm %d: Using frm %d", n, f);
			OutputDebugString(buf);
			if (logfp)
				fprintf(logfp, "%s\n", buf);
		}
		else
		{
			int f = n + 1;
			if (f >= vi.num_frames)
				f = vi.num_frames - 1;
			sprintf(buf, "Dup: frm %d: diff from frm %d = %2.2f%%", n, f, cache[1].metric);
			OutputDebugString(buf);
			if (logfp)
				fprintf(logfp, "%s\n", buf);
			sprintf(buf, "Dup: Using frm %d", cache[0].frame_no);
			OutputDebugString(buf);
			if (logfp)
				fprintf(logfp, "%s\n", buf);
		}
	}

	/* Return the appropriate frame. */
	if (n == useframe) useframe = -1;
	if (show == true) return showframe;
	else if (copy == true)
	{
		PVideoFrame copy = env->NewVideoFrame(vi);
		/* Refresh from copyframe. */
		if (vi.IsYUY2())
		{
			env->BitBlt(copy->GetWritePtr(), copy->GetPitch(),
						copyframe->GetReadPtr(), copyframe->GetPitch(),
						copyframe->GetRowSize(), copyframe->GetHeight());
		}
		else
		{
			env->BitBlt(copy->GetWritePtr(PLANAR_Y),
						copy->GetPitch(PLANAR_Y),
						copyframe->GetReadPtr(PLANAR_Y),
						copyframe->GetPitch(PLANAR_Y),
						copyframe->GetRowSize(PLANAR_Y),
						copyframe->GetHeight(PLANAR_Y));
			env->BitBlt(copy->GetWritePtr(PLANAR_U),
						copy->GetPitch(PLANAR_U),
						copyframe->GetReadPtr(PLANAR_U),
						copyframe->GetPitch(PLANAR_U),
						copyframe->GetRowSize(PLANAR_U),
						copyframe->GetHeight(PLANAR_U));
			env->BitBlt(copy->GetWritePtr(PLANAR_V),
						copy->GetPitch(PLANAR_V),
						copyframe->GetReadPtr(PLANAR_V),
						copyframe->GetPitch(PLANAR_V),
						copyframe->GetRowSize(PLANAR_V),
						copyframe->GetHeight(PLANAR_V));
		}
		return copy;
	}
	else return cache[0].frame;
}

/* If line starts with key (which includes the trailing '='), returns a
   pointer to the text following it; otherwise returns NULL. */
static char *DupDefValue(char *line, const char *key)
{
	const size_t len = strlen(key);
	return (strncmp(line, key, len) == 0) ? line + len : NULL;
}

/* Factory for the Dup filter.

   Defaults are threshold=3, chroma=true, show=false, copy=true, maxcopies=20,
   blend=false, debug=false and no log file. They can be overridden by
   "name=value" lines in <plugin dir>\Dup.def (boolean values are true when
   they start with 't'), and those in turn by the script arguments in args. */
AVSValue __cdecl Create_Dup(AVSValue args, void* user_data, IScriptEnvironment* env)
{
	char path[1024];
	char buf[80], *p;
	char log[1024];
	double  threshold = 3;
	bool chroma = true;
	bool show = false;
	bool copy = true;
	int  maxcopies = 20;
	bool blend = false;
	bool debug = false;
	log[0] = 0;

	/* Load user defaults if they exist. */
	try
	{
		static const char def_name[] = "\\Dup.def";
		FILE *f = NULL;

		const char* plugin_dir = env->GetVar("$PluginDir$").AsString();
		/* Only build the path when it fits; sizeof(def_name) counts the
		   terminating NUL. */
		if (plugin_dir != NULL && strlen(plugin_dir) + sizeof(def_name) <= sizeof(path))
		{
			strcpy(path, plugin_dir);
			strcat(path, def_name);
			f = fopen(path, "r");
		}
		if (f != NULL)
		{
			while (fgets(buf, sizeof(buf), f) != NULL)
			{
				if ((p = DupDefValue(buf, "threshold=")) != NULL)
					threshold = atof(p);
				else if ((p = DupDefValue(buf, "chroma=")) != NULL)
					chroma = (*p == 't');
				else if ((p = DupDefValue(buf, "show=")) != NULL)
					show = (*p == 't');
				else if ((p = DupDefValue(buf, "copy=")) != NULL)
					copy = (*p == 't');
				else if ((p = DupDefValue(buf, "maxcopies=")) != NULL)
					maxcopies = atoi(p);
				else if ((p = DupDefValue(buf, "blend=")) != NULL)
					blend = (*p == 't');
				else if ((p = DupDefValue(buf, "debug=")) != NULL)
					debug = (*p == 't');
				else if ((p = DupDefValue(buf, "log=")) != NULL)
				{
					/* fgets keeps the line terminator; it is not part of the
					   file name. buf (80 bytes) always fits into log. */
					p[strcspn(p, "\r\n")] = 0;
					strcpy(log, p);
				}
			}
			fclose(f);
		}
	}
	catch (...)
	{
		// plugin directory not set
		// probably using an older version avisynth
	}

	return new Dup(args[0].AsClip(),
		args[1].AsFloat(threshold),		// threshold for duplicate declaration
		args[2].AsBool(chroma),			// use chroma in differencing
		args[3].AsBool(show),			// show biggest difference area
		args[4].AsBool(copy),			// copy
		args[5].AsInt(maxcopies),		// max successive copies to emit
		args[6].AsBool(blend),			// blend the duplicates
		args[7].AsBool(debug),			// debug
		args[8].AsString(log),			// debug log file
		env);
}

 /***
  * Accumulated differences of two planes.
  *
  * This routine is for testing luma planes.
  * The accumulated differences for each 32x32 box are written directly to blk_values.
  * Boxes not fitting within mod32 width sizes are filled with '0'.
  * (c) 2002, Donald Graft (algorithm)
  * (c) 2003, Klaus Post (ISSE code)
  ***/


// Sums absolute byte differences between c_plane and tplane per BLKSIZE x
// BLKSIZE block and stores one 32-bit result per block at blk_values,
// advancing blk_values by 4 bytes after each store.
//   c_plane, pitch    first plane and its pitch in bytes
//   tplane, t_pitch   second plane and its pitch in bytes
//   height, width     plane size; only the part covered by whole blocks
//                     (hp rows, wp bytes per row) is read
//   blk_values        output; per block row: wp/BLKSIZE sums, followed by one
//                     zeroed slot when width is not a multiple of BLKSIZE
void Dup::isse_scenechange(const BYTE* c_plane, const BYTE* tplane, int height, int width, int pitch, int t_pitch, int* blk_values) {
  // Not referenced anywhere in this function.
  __declspec(align(8)) static __int64 full = 0xffffffffffffffffi64;
  // Width and height rounded down to whole blocks.
  int wp=(width/BLKSIZE)*BLKSIZE;
  int hp=(height/BLKSIZE)*BLKSIZE;
  // 1 when a partial block column exists on the right, else 0.
  int pad_blk=(wp-width!=0);

  // Top line of the block row being processed.
  int y=0;
 __asm {
    // esi/edi: start of the current block row in c_plane/tplane.
    mov esi, c_plane
    mov edi, tplane
    mov ebx,0
    jmp yloopover
    align 16
yloop:
    // End of a block row: emit the slot for the partial column, if any.
    mov eax,[pad_blk]
    cmp eax,0
    je no_pad
    mov eax,blk_values
    // NOTE(review): no operand size is given for this store - confirm the
    // assembler emits the intended 32-bit write.
    mov [eax],0
    add eax,4
    mov blk_values,eax
no_pad:

    // Advance y and both plane pointers by 32 lines.
    mov ebx, [y]
    mov edx, pitch    //copy pitch
    mov ecx, t_pitch    //copy pitch
    add ebx, 32
    shl edx,5
    shl ecx,5
    add edi,ecx     // add pitch to both planes
    add esi,edx
    mov y, ebx
yloopover:
    // Stop once y reaches the last whole block row.
    cmp ebx,[hp]
    jge endframe
    xor ebx, ebx  // X pos.
    align 16
xloop:
    // ebx: byte offset of the current block within the row.
    cmp ebx,[wp]
    jge yloop
    mov eax,ebx       // Width (esi)
    mov ecx,ebx       // Width (edi)

    pxor mm6,mm6   // We maintain two sums, for better pairablility
    pxor mm7,mm7
    // edx counts the 32 lines of the block.
    mov edx, 32
y_loop_inner:
    // One line of the block: 4 x 8 bytes, accumulated alternately in mm6/mm7.
    movq mm0,[esi+eax]
     movq mm2,[esi+eax+8]
    movq mm1,[edi+ecx]
     movq mm3,[edi+ecx+8]
    psadbw mm0,mm1    // Sum of absolute difference
     psadbw mm2,mm3
    paddd mm6,mm0     // Add...
     paddd mm7,mm2
    movq mm0,[esi+eax+16]
     movq mm2,[esi+eax+24]
    movq mm1,[edi+ecx+16]
     movq mm3,[edi+ecx+24]
    psadbw mm0,mm1
     psadbw mm2,mm3
    paddd mm6,mm0
     paddd mm7,mm2

    // Next line in each plane.
    add eax,pitch
    add ecx,t_pitch

    dec edx
    jnz y_loop_inner

    // Combine the two partial sums and store the block total (overwrites
    // whatever the slot held).
    mov eax,blk_values
    paddd mm6,mm7
    movd [eax],mm6
    add eax,4
    mov blk_values,eax

    add ebx,32
    jmp xloop

endframe:
    // Leave MMX state.
    emms
  }
}

 /***
  * Accumulated differences of two planes.
  *
  * This routine is for testing chroma planes.
  * The accumulated differences for each 16x16 box are ADDED to the current values.
  * (c) 2002, Donald Graft (algorithm)
  * (c) 2003, Klaus Post (ISSE code)
  ***/

// Block-wise sum of absolute differences (SAD) between two planes, using 16x16 blocks.
//
//   c_plane, tplane  - the two planes to compare; consecutive rows are pitch and
//                      t_pitch bytes apart respectively.
//   height, width    - plane size; only the area covered by whole 16x16 blocks
//                      is measured.
//   blk_values       - in/out, one int per block in row-major block order. Each
//                      block total is added to the value already stored there.
//                      When width is not a multiple of 16, every block row is
//                      followed by one extra entry that is overwritten with 0.
//
// Uses psadbw on MMX registers, so the CPU must support integer SSE.
void Dup::isse_scenechange_16(const BYTE* c_plane, const BYTE* tplane, int height, int width, int pitch, int t_pitch, int* blk_values) {
  int wp=(width/16)*16;        // width covered by whole blocks
  int hp=(height/16)*16;       // height covered by whole blocks
  int y=0;                     // top row of the current block row
  int pad_blk=(wp-width!=0);   // 1 when a partial block column remains at the right edge
 __asm {
    mov esi, c_plane
    mov edi, tplane
    mov ebx,0                  // ebx = y for the first bounds check
    jmp yloopover
    align 16
yloop:
    // End of a block row: emit the zero padding entry if one is needed.
    mov eax,[pad_blk]
    cmp eax,0
    je no_pad
    mov eax,blk_values
    // Operand size is spelled out: the original "mov [eax],0" left it to the
    // assembler default, while the entry being cleared is a 32-bit int.
    mov dword ptr [eax],0
    add eax,4
    mov blk_values,eax
no_pad:
    // Step both plane pointers down by 16 rows and advance y.
    mov ebx, [y]
    mov edx, pitch    //copy pitch
    mov ecx, t_pitch    //copy pitch
    add ebx, 16
    shl edx,4         // 16 * pitch
    shl ecx,4         // 16 * t_pitch
    add edi,ecx     // add pitch to both planes
    add esi,edx
    mov y, ebx
yloopover:
    cmp ebx,[hp]
    jge endframe      // no whole block row left
    xor ebx, ebx  // X pos.
    align 16
xloop:
    cmp ebx,[wp]
    jge yloop         // block row finished
    mov eax,ebx       // Width (esi)
    mov ecx,ebx       // Width (edi)
    pxor mm6,mm6   // We maintain two sums, for better pairablility
    pxor mm7,mm7
    mov edx, 16       // rows left in this block
y_loop_inner:
    // One block row: 16 bytes handled as two 8-byte psadbw operations.
    movq mm0,[esi+eax]
     movq mm2,[esi+eax+8]
    movq mm1,[edi+ecx]
     movq mm3,[edi+ecx+8]
    psadbw mm0,mm1    // Sum of absolute difference
     psadbw mm2,mm3
    paddd mm6,mm0     // Add...
     paddd mm7,mm2

    add eax,pitch     // next row in both planes
    add ecx,t_pitch

    dec edx
    jnz y_loop_inner

    // Add the block total to the stored value and advance the output pointer.
    mov eax,blk_values
    movd mm5,[eax]
    paddd mm6,mm7
    paddd mm6,mm5
    movd [eax],mm6
    add eax,4
    mov blk_values,eax

    add ebx,16        // next block to the right
    jmp xloop

endframe:
    emms
  }
}

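 /**
  * Blends one line of several frames, equally weighted.
  *
  * The source lines are delivered as an array of pointers; the destination
  * line itself also takes part in the blend.
  *
  * "planes" is the highest index used in the pointer array: entries
  * 0..planes are all blended in.
  * "div" is the reciprocal of the divisor, scaled by 32768
  * (e.g. 32768/3 to average three lines).
  * (c) 2003, Klaus Post
  */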

void Dup::mmx_average_planes(BYTE* dst_plane, const BYTE** src_planes, int width_mod8, int planes, int div) {
  // Averages dst_plane with src_planes[0..planes] in place, 8 bytes per iteration.
  //   dst_plane   - line that is both an input to the blend and its destination.
  //   src_planes  - array of line pointers; indices planes down to 0 are read.
  //   width_mod8  - number of bytes to process; handled in steps of 8.
  //   planes      - highest index used in src_planes; negative means nothing is done.
  //   div         - 16-bit multiplier; each sum becomes (sum*div + div/2) >> 15.
  //                 pmaddwd treats it as a signed word, so it has to stay below 32768.

  // Mask that keeps only the low 16 bits (two result bytes) of a quadword.
  __declspec(align(8)) static __int64 low_ffff = 0x000000000000ffffi64;

  // div replicated into all four 16-bit words, as the pmaddwd multiplier.
  __int64 div64 = (__int64)(div) | ((__int64)(div)<<16) | ((__int64)(div)<<32) | ((__int64)(div)<<48);
  div>>=1;
  // div/2 in both dwords: the term added to each product before the shift by 15.
  __int64 add64 = (__int64)(div) | ((__int64)(div)<<32);

  if (planes<0) return; // Fizick (was plane<=0) to not skip very first frame
  __asm {
    mov esi,dst_plane;
    xor eax,eax          // EAX will be plane offset (all planes).
    align 16
testplane:
    cmp eax, [width_mod8]
    jge outloop

    // Start the running sums from the 8 destination bytes, widened to words.
    movq mm0,[esi+eax]  // Load current frame pixels
     pxor mm2,mm2        // Clear mm2
    movq mm6,mm0
     movq mm7,mm0
    mov edi,[src_planes];  // Address of the src_planes array is now in edi
    mov ebx,[planes]   // How many planes (this will be our counter)
    punpcklbw mm6,mm2    // mm6 = lower 4 pixels as words
     punpckhbw mm7,mm2     // mm7 = upper 4 pixels as words
    lea edi,[edi+ebx*4]  // edi = &src_planes[planes]; the loop walks down to index 0

    align 16
kernel_loop:
    mov edx,[edi]           // pointer to the current source line
    movq mm4,[edx+eax]      // Load 8 pixels from test plane
     pxor mm1,mm1
    movq mm5,mm4
    punpcklbw mm4,mm1         // mm4 = lower pixels
     punpckhbw mm5,mm1        // mm5 = upper pixels
    paddusw mm6,mm4           // accumulate, saturating at 65535
     paddusw mm7,mm5

    sub edi,4
    dec ebx
    jns kernel_loop //  Fizick (was jnz kernel_loop) to not skip very first frame
     // Multiply (or in reality divides) added values
     // Each word sum is widened to a dword (paired with a zero word), multiplied
     // by div via pmaddwd, offset by add64 and shifted right by 15.
    movq mm4,[add64]
    pxor mm5,mm5
     movq mm0,mm6
    movq mm1,mm6
     punpcklwd mm0,mm5         // low,low
    movq mm6,[div64]
    punpckhwd mm1,mm5         // low,high
     movq mm2,mm7
    pmaddwd mm0,mm6
     punpcklwd mm2,mm5         // high,low
     movq mm3,mm7
     paddd mm0,mm4
    pmaddwd mm1,mm6
     punpckhwd mm3,mm5         // high,high
     psrld mm0,15
     paddd mm1,mm4
    pmaddwd mm2,mm6
     packssdw mm0, mm0
     psrld mm1,15
     paddd mm2,mm4
    pmaddwd mm3,mm6
     packssdw mm1, mm1
     psrld mm2,15
     paddd mm3,mm4
    psrld mm3,15
     packssdw mm2, mm2
    packssdw mm3, mm3
     // Narrow to bytes; after the mask each register holds two result pixels
     // in its low 16 bits.
     packuswb mm0,mm5
    packuswb mm1,mm5
     packuswb mm2,mm5
    packuswb mm3,mm5
     movq mm4, [low_ffff]
    pand mm0, mm4;
     pand mm1, mm4;
    pand mm2, mm4;
     pand mm3, mm4;
    // Shift the four pixel pairs into place and merge them into one quadword.
    psllq mm1, 16
    psllq mm2, 32
     por mm0,mm1
    psllq mm3, 48
    por mm2,mm3
    por mm0,mm2
    movq [esi+eax],mm0      // write the 8 blended pixels back to the destination

    add eax,8   // Next 8 pixels
    jmp testplane
outloop:
    emms
  }

}


extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit2(IScriptEnvironment* env)
{
    // Exported plugin entry point: registers the "Dup" script function with
    // the given environment, using Create_Dup as its factory callback.
    // Parameter signature: a clip followed by the optional named arguments
    // threshold (f), chroma/show/copy (b), maxcopies (i), blend/debug (b), log (s).
    const char* const dup_params =
        "c[threshold]f[chroma]b[show]b[copy]b[maxcopies]i[blend]b[debug]b[log]s";

    env->AddFunction("Dup", dup_params, Create_Dup, 0);

    return 0;
}

