/* SCE CONFIDENTIAL
 * PlayStation(R)3 Programmer Tool Runtime Library 475.001
 * Copyright (C) 2011 Sony Computer Entertainment Inc.
 * All Rights Reserved.
 */

#include <cell/spurs.h>
#include <cell/dma.h>
#include <spu_printf.h>

#include <math.h>

// Set to non-zero to enable the spu_printf trace at the top of
// cellSpursTaskMain(). (The name is spelled "VERVOSE" throughout this file.)
#define VERVOSE_DEBUG 0

#include "vision_param.h"
#include "cellImage.h"

// Uncomment to make block_matching_16x16() print its input/output positions
// and the full table of per-offset match scores via spu_printf.
//#define PRINT_SAD_RESULT

// Function-like macro: each argument is evaluated twice, so do not pass
// expressions with side effects.
#define min(a,b) (((a) < (b)) ? (a) : (b))

// DMA tag IDs. cellSpursTaskMain() issues its inbound cellDmaGet transfers on
// the VTASK_TAG_IN tags and waits on VTASK_TAG_OUT[0] after writing the
// result back. Index [1] of VTASK_TAG_OUT is not used in this file.
static const int VTASK_TAG_IN[2] = {3,4};
static const int VTASK_TAG_OUT[2] = {5,6};

// Template edge length in pixels. block_matching_16x16() hard-codes 16 rows
// and also uses this value as the number of bytes per quadword, so it cannot
// be changed independently ("fix" presumably means "fixed" -- TODO confirm).
#define TPLATE_SIZE 16 //fix!!

// Search radius in pixels around the input position, per axis.
#define SEARCHX 8
#define SEARCHY 8
// Number of candidate offsets per axis: -SEARCH .. +SEARCH inclusive.
#define SEARCHX_RANGE (SEARCHX * 2 + 1)
#define SEARCHY_RANGE (SEARCHY * 2 + 1)
// Number of image rows fetched into gImageSlice for one match.
#define SLICE_HEIGHT (SEARCHY_RANGE + TPLATE_SIZE)


// Local copies of the task inputs; all are filled by DMA in
// cellSpursTaskMain(). The "16 byte alignment" notes below refer to the
// types' own declarations, which are not visible in this file.
VisionParam gVisionParam; // 16 byte alignment
CellImage gImage; // 16 byte alignment
// 16x16 template: one quadword (16 bytes) per row.
vec_uchar16 gTemplate[TPLATE_SIZE] __attribute__((aligned(16)));
// Input position (ix, iy) and result (ox, oy, score); written back to main
// memory at the end of cellSpursTaskMain().
BlockMatchingInfo gPosInfo; // 16 byte alignment

// slice image (search range)
// Holds SLICE_HEIGHT rows of 640 bytes each. cellSpursTaskMain() transfers
// SLICE_HEIGHT * width bytes into this buffer without checking width.
// NOTE(review): this appears to require width <= 640 at one byte per pixel --
// confirm against the producer of the image.
vec_uchar16 gImageSlice[SLICE_HEIGHT * 640 / 16] __attribute__((aligned(CELL_IMAGE_DATA_ALIGN)));

// Forward declaration; defined at the bottom of this file.
void block_matching_16x16(vec_uchar16 *ref, vec_uchar16 *image,
						  int width, float x_in, float y_in,
						  float *x_out, float *y_out, float *conf_out);

/**
 * SPURS task entry point: runs one 16x16 block match and writes the result
 * back to main memory.
 *
 * Steps:
 *   1. Fetch the VisionParam block whose address is in the first 64 bits of
 *      argTask.
 *   2. Fetch the CellImage header (data[0]), the 16x16 template (data[1]) and
 *      the BlockMatchingInfo record (data[2]).
 *   3. Fetch SLICE_HEIGHT image rows starting SEARCHY rows above
 *      floor(gPosInfo.iy).
 *   4. Run block_matching_16x16() on that slice and convert the returned y
 *      coordinate from slice-relative back to image-relative.
 *   5. Write gPosInfo back to data[2] and exit the task.
 *
 * @param argTask     first 64 bits hold the address of the VisionParam block
 * @param argTaskSet  unused
 * @return status (always 0); passed to cellSpursTaskExit() first
 *
 * NOTE(review): there is no bounds checking. The slice fetch assumes that
 * floor(iy) - SEARCHY >= 0, that SLICE_HEIGHT rows exist below that row, and
 * that SLICE_HEIGHT * width bytes fit in gImageSlice.
 */
int cellSpursTaskMain(qword argTask, uint64_t argTaskSet)
{
	uint32_t status = 0;
	
	(void)argTaskSet;
	
#if VERVOSE_DEBUG
	spu_printf("# cellSpursTaskMain()...1\n");
	spu_printf("# argTask=0x%x, &argTask=0x%x, argTask[0]=0x%x, argTaskSet=0x%x, &argTaskSet=0x%x\n",
			   (qword)argTask, &argTask, argTask[0], argTaskSet, &argTaskSet);
	spu_printf("# &gVisionParam=0x%x, sizeof(VisionParam)=%d, sizeof(gVisionParam)=%d\n",
			   &gVisionParam, sizeof(VisionParam), sizeof(gVisionParam));
#endif //VERVOSE_DEBUG

	// Parameter block first: everything else is addressed through it.
	cellDmaGet(&gVisionParam, spu_extract((vec_ullong2)argTask, 0),
			   sizeof(gVisionParam), VTASK_TAG_IN[0], 0, 0);
	cellDmaWaitTagStatusAll(1<<VTASK_TAG_IN[0]);

	// Image header (width and data address are needed for the slice fetch).
	cellDmaGet(&gImage, gVisionParam.data[0], sizeof(CellImage), VTASK_TAG_IN[0], 0, 0);
	cellDmaWaitTagStatusAll(1<<VTASK_TAG_IN[0]);

	// Template and position record share one tag and one wait.
	cellDmaGet(gTemplate, gVisionParam.data[1], TPLATE_SIZE * TPLATE_SIZE, VTASK_TAG_IN[0], 0, 0);
	cellDmaGet(&gPosInfo, gVisionParam.data[2], sizeof(gPosInfo), VTASK_TAG_IN[0], 0, 0);
	cellDmaWaitTagStatusAll(1<<VTASK_TAG_IN[0]);

	float x_pos = gPosInfo.ix;
	float y_pos = gPosInfo.iy;
	float *x_pos_o = &gPosInfo.ox;
	float *y_pos_o = &gPosInfo.oy;
	float *conf = &gPosInfo.score;

	int width = gImage.GetWidth();
	
	// Integer row of the input position; the slice starts SEARCHY rows above it.
	float y_floor = floorf(y_pos);
	
	// get first chunk of data
	int dma_pos = ((int)y_floor - SEARCHY) * width;
	cellDmaLargeGet(gImageSlice, gImage.GetData() + dma_pos,
					SLICE_HEIGHT * width, VTASK_TAG_IN[1], 0, 0);
	cellDmaWaitTagStatusAll(1<<VTASK_TAG_IN[1]);

	// Inside the slice the input row sits at row SEARCHY (plus the fractional
	// part of y_pos), so the matcher works in slice-relative y coordinates.
	float y_pos_slice = SEARCHY + (y_pos - y_floor);
	block_matching_16x16(gTemplate, gImageSlice, width, x_pos, y_pos_slice,
						 x_pos_o, y_pos_o, conf);
	// Convert the result back to image-relative y.
	*y_pos_o += y_floor - SEARCHY;
	
	// Write the result back on the OUT tag so the wait below really covers
	// this transfer; the task must not exit while the put is still in flight.
	cellDmaPut(&gPosInfo, gVisionParam.data[2], sizeof(gPosInfo), VTASK_TAG_OUT[0], 0, 0);
	cellDmaWaitTagStatusAll(1<<VTASK_TAG_OUT[0]);
	
	cellSpursTaskExit(status);
	
	return status;
}

// Byte-selection patterns for spu_shuffle(), one per horizontal byte offset
// 0..15. Entry k holds the consecutive byte indices k, k+1, ..., k+15.
// block_matching_16x16() picks the entry from the low 4 bits of the x
// coordinate and applies it to two adjacent quadwords of an image row to
// assemble the 16 bytes that start k bytes into the first quadword.
// (In spu_shuffle, indices 0x00-0x0f select from the first operand and
// 0x10-0x1f from the second.)
static vec_uchar16 shuf_pats[16]=
{
	(vec_uchar16)(vec_uint4){0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f},
	(vec_uchar16)(vec_uint4){0x01020304,0x05060708,0x090a0b0c,0x0d0e0f10},
	(vec_uchar16)(vec_uint4){0x02030405,0x06070809,0x0a0b0c0d,0x0e0f1011},
	(vec_uchar16)(vec_uint4){0x03040506,0x0708090a,0x0b0c0d0e,0x0f101112},
	
	(vec_uchar16)(vec_uint4){0x04050607,0x08090a0b,0x0c0d0e0f,0x10111213},
	(vec_uchar16)(vec_uint4){0x05060708,0x090a0b0c,0x0d0e0f10,0x11121314},
	(vec_uchar16)(vec_uint4){0x06070809,0x0a0b0c0d,0x0e0f1011,0x12131415},
	(vec_uchar16)(vec_uint4){0x0708090a,0x0b0c0d0e,0x0f101112,0x13141516},
	
	(vec_uchar16)(vec_uint4){0x08090a0b,0x0c0d0e0f,0x10111213,0x14151617},
	(vec_uchar16)(vec_uint4){0x090a0b0c,0x0d0e0f10,0x11121314,0x15161718},
	(vec_uchar16)(vec_uint4){0x0a0b0c0d,0x0e0f1011,0x12131415,0x16171819},
	(vec_uchar16)(vec_uint4){0x0b0c0d0e,0x0f101112,0x13141516,0x1718191a},
	
	(vec_uchar16)(vec_uint4){0x0c0d0e0f,0x10111213,0x14151617,0x18191a1b},
	(vec_uchar16)(vec_uint4){0x0d0e0f10,0x11121314,0x15161718,0x191a1b1c},
	(vec_uchar16)(vec_uint4){0x0e0f1011,0x12131415,0x16171819,0x1a1b1c1d},
	(vec_uchar16)(vec_uint4){0x0f101112,0x13141516,0x1718191a,0x1b1c1d1e},
};


//////////////////////////////////////////////////////
// block_matching_16x16
// input upper-left position of template(16x16)
//
// Exhaustive sum-of-absolute-differences (SAD) search of a 16x16 template
// over a (2*SEARCHX+1) x (2*SEARCHY+1) window of integer offsets centred on
// (floor(x_in), floor(y_in)), followed by an optional sub-pixel refinement.
//
//   ref       16 quadwords, one 16-byte row of the template each
//   image     quadword-aligned image rows, `width` bytes per row; indexed
//             in quadwords, so width is expected to be a multiple of 16
//   width     row pitch of `image` in bytes
//   x_in,y_in upper-left corner of the template's expected position
//   x_out,    receive the matched upper-left corner; left equal to
//   y_out     x_in / y_in when the match confidence is not positive
//   conf_out  receives 1 - 2 * bestSAD / (255 * 256): 1.0 is a perfect
//             match, 0.0 is half of the maximum possible SAD
//
// NOTE(review): no bounds checking is done. The caller must ensure the
// whole search window (including one extra quadword to the right of each
// row read) lies inside `image`, and that floor(x_in) - SEARCHX >= 0.
//////////////////////////////////////////////////////
void block_matching_16x16(vec_uchar16 *ref, vec_uchar16 *image,
						  int width, float x_in, float y_in,
						  float *x_out, float *y_out, float *conf_out)
{
	
#ifdef PRINT_SAD_RESULT
	spu_printf("before bm x_in = %f, y_in = %f [corner] \n", x_in, y_in);
#endif
	// Default result: the unchanged input position.
	*x_out = x_in;
	*y_out = y_in;
	// match patch
	// Template rows are kept in individual variables (rather than an array)
	// so the inner loop below stays fully unrolled.
	vec_uchar16 q0, q1, q2, q3, q4, q5, q6, q7;
	vec_uchar16 q8, q9, q10, q11, q12, q13, q14, q15;
	q0 = ref[0];
	q1 = ref[1];
	q2 = ref[2];
	q3 = ref[3];
	q4 = ref[4];
	q5 = ref[5];
	q6 = ref[6];
	q7 = ref[7];
	q8 = ref[8];
	q9 = ref[9];
	q10 = ref[10];
	q11 = ref[11];
	q12 = ref[12];
	q13 = ref[13];
	q14 = ref[14];
	q15 = ref[15];
	// Best offset so far, as indices into the search window (0 .. RANGE-1);
	// the centre (SEARCHX, SEARCHY) corresponds to "no displacement".
	int dx = SEARCHX;
	int dy = SEARCHY;
	// Largest possible SAD for 256 8-bit pixels.
	int best = 255 * 256;
	int i,j;
	// One unaligned 16-byte row per image row of the current search column.
	static vec_uchar16 vsearch_quads[TPLATE_SIZE + SEARCHY_RANGE];
	// SAD for every candidate offset, stored column-major: [i * SEARCHY_RANGE + j].
	static unsigned short totals[SEARCHX_RANGE * SEARCHY_RANGE];
	float xx = floorf(x_in);
	float yy = floorf(y_in);
	int x = (int)xx;
	int y = (int)yy;
	float conf;
	for (i=0; i<SEARCHX_RANGE; i++) { // horizontal search
		
		// generate TPLATE_SIZE+SEARCHY_RANGE quads using proper shufbyte mask, store in memory
		vec_uchar16 shuf_pat = shuf_pats[((x + i - SEARCHX) & 0xf)];
		vec_uchar16 *qbuf = &(image[(x + i - SEARCHX + (y - SEARCHY) * width) / TPLATE_SIZE]);
		for (j=0; j<(TPLATE_SIZE + SEARCHY_RANGE); j++) {
			vsearch_quads[j] = spu_shuffle(qbuf[j * width / TPLATE_SIZE],
										   qbuf[1 + j * width / TPLATE_SIZE], shuf_pat);
		}
		for (j=0; j<SEARCHY_RANGE; j++) { // vertical search
			// Per-byte absolute differences, one quadword per template row.
			vec_uchar16 c0,c1,c2,c3,c4,c5,c6,c7;
			vec_uchar16 c8,c9,c10,c11,c12,c13,c14,c15;
			c0 = spu_absd(vsearch_quads[j + 0], q0);
			c1 = spu_absd(vsearch_quads[j + 1], q1);
			c2 = spu_absd(vsearch_quads[j + 2], q2);
			c3 = spu_absd(vsearch_quads[j + 3], q3);
			c4 = spu_absd(vsearch_quads[j + 4], q4);
			c5 = spu_absd(vsearch_quads[j + 5], q5);
			c6 = spu_absd(vsearch_quads[j + 6], q6);
			c7 = spu_absd(vsearch_quads[j + 7], q7);
			c8 = spu_absd(vsearch_quads[j + 8], q8);
			c9 = spu_absd(vsearch_quads[j + 9], q9);
			c10 = spu_absd(vsearch_quads[j + 10], q10);
			c11 = spu_absd(vsearch_quads[j + 11], q11);
			c12 = spu_absd(vsearch_quads[j + 12], q12);
			c13 = spu_absd(vsearch_quads[j + 13], q13);
			c14 = spu_absd(vsearch_quads[j + 14], q14);
			c15 = spu_absd(vsearch_quads[j + 15], q15);
			// spu_sumb folds each group of 4 bytes into a 16-bit lane, two
			// rows at a time; the adds then combine all 16 rows lane-wise.
			vec_ushort8 s0,s1,s2,s3,s4,s5,s6,s7;
			s0 = spu_sumb(c0, c1);
			s1 = spu_sumb(c2, c3);
			s2 = spu_sumb(c4, c5);
			s3 = spu_sumb(c6, c7);
			s4 = spu_sumb(c8, c9);
			s5 = spu_sumb(c10, c11);
			s6 = spu_sumb(c12, c13);
			s7 = spu_sumb(c14, c15);
			s0 = spu_add(s0, s1);
			s2 = spu_add(s2, s3);
			s4 = spu_add(s4, s5);
			s6 = spu_add(s6, s7);
			s0 = spu_add(s0, s2);
			s4 = spu_add(s4, s6);
			
			// Horizontal sum of the 8 halfword lanes into lane 0: add the
			// vector rotated left by 1..7 halfwords. A single running
			// accumulator is used so that no partial sum can be dropped.
			// The total is at most 255 * 256 and therefore fits in 16 bits.
			vec_ushort8 s = spu_add(s0, s4);
			vec_ushort8 acc = s;
			acc = spu_add(acc, spu_rlqwbyte(s, 2));
			acc = spu_add(acc, spu_rlqwbyte(s, 4));
			acc = spu_add(acc, spu_rlqwbyte(s, 6));
			acc = spu_add(acc, spu_rlqwbyte(s, 8));
			acc = spu_add(acc, spu_rlqwbyte(s, 10));
			acc = spu_add(acc, spu_rlqwbyte(s, 12));
			acc = spu_add(acc, spu_rlqwbyte(s, 14));
			uint16_t tot = spu_extract(acc, 0);
			totals[j + i * SEARCHY_RANGE] = tot;
			
			// "<=" means later candidates win ties.
			if (tot <= best) {
				dx = i;
				dy = j;
				best = tot;
			}
		}
	}
	*conf_out = conf = 1.0f - 2.0f * (float)(best) / (float)(255 * 256);
	
#ifdef PRINT_SAD_RESULT
	for (i=0; i<SEARCHX_RANGE; i++) { // horizontal search
		spu_printf("[sad %d]\t", i);
		for (j=0; j<SEARCHY_RANGE; j++) {
			spu_printf("%f\t", 1.0f-2.0f*(float)(totals[j+i*SEARCHY_RANGE])/(float)(255*256));
		}
		spu_printf("\n");
	}
#endif //PRINT_SAD_RESULT
	
	// Sub-pixel refinement needs all four neighbours of the best cell, so it
	// is only attempted when the best cell is strictly inside the window.
	// The upper bounds are RANGE - 1 so that dx + 1 / dy + 1 stay inside
	// totals[] and inside the same column.
	if (dx > 0 && dx < SEARCHX_RANGE - 1 &&
		dy > 0 && dy < SEARCHY_RANGE - 1) {
		float a, b, c, d, e;
		// b: score at the best cell; a/c: left/right; d/e: above/below.
		b = 1.0f - 2.0f * (float)(best) / (float)(255 * 256);
		if (b > 0.0f) {
			a = 1.0f - 2.0f * (float)(totals[(dx - 1) * SEARCHY_RANGE + dy]) / (float)(255 * 256);
			c = 1.0f - 2.0f * (float)(totals[(dx + 1) * SEARCHY_RANGE + dy]) / (float)(255 * 256);
			d = 1.0f - 2.0f * (float)(totals[dx * SEARCHY_RANGE + dy - 1]) / (float)(255 * 256);
			e = 1.0f - 2.0f * (float)(totals[dx * SEARCHY_RANGE + dy + 1]) / (float)(255 * 256);

#define STEEPNESS 0.00005f
			
			// Reject peaks that are too flat along either axis. This only
			// clears the local conf; *conf_out keeps the raw score.
			if (2.0f * b - a - c < STEEPNESS || 2.0f * b - d - e < STEEPNESS)
				conf = 0.0f;
			if (conf > 0.0f)
			{
				//subpixel linear
				// The steepness test above guarantees both denominators
				// are positive (b is the maximum score).
				xx += (c - a) / (2.0f * (b - min(a, c)))/*+0.5f*/;
				yy += (e - d) / (2.0f * (b - min(d, e)))/*+0.5f*/;
			}
		}
	}

	if (conf > 0.0f) // at least a 50% match
	{
		// Convert window indices back to a displacement from the input cell.
		xx += dx - SEARCHX;
		yy += dy - SEARCHY;
		*x_out = xx;
		*y_out = yy;
	}
	
#ifdef PRINT_SAD_RESULT
	spu_printf("bm x_out = %f, y_out = %f [corner]\n",*x_out, *y_out);
#endif
	
}
