/* SCE CONFIDENTIAL
 * PlayStation(R)3 Programmer Tool Runtime Library 475.001
 * Copyright (C) 2011 Sony Computer Entertainment Inc.
 * All Rights Reserved.
 */

#include <cell/spurs.h>
#include <cell/dma.h>
#include <spu_printf.h>

// Set to 1 to compile in the spu_printf() diagnostics at the top of
// cellSpursTaskMain(). (The identifier is spelled "VERVOSE" throughout this
// file; keep the spelling consistent if it is ever renamed.)
#define VERVOSE_DEBUG 0

#include "vision_param.h"
#include "cellImage.h"
#include "shufflebytes_masks.h"

// Function-like helpers. NOTE: these are macros, so the selected argument is
// evaluated twice -- never pass expressions with side effects (i++, calls).
#define max(a, b) (((a) > (b)) ? (a) : (b))
#define min(a, b) (((a) < (b)) ? (a) : (b))
// Clamps the lvalue x into [a, b] in place. Expands to an assignment
// expression, so use it only as a standalone statement; x is evaluated
// more than once.
#define limit(x, a, b) (x) = min(max((a), (x)), (b))

// DMA tag IDs, indexed by double-buffer index (0 or 1).
// VTASK_TAG_IN[i]  tags the fetch of a source slice into gSrcSliceBuf[i];
// VTASK_TAG_IN[0] is also used for the initial parameter fetch.
// VTASK_TAG_OUT[i] tags the write-back of gDstSliceBuf[i].
// Separate tags per buffer let the task wait on one buffer's transfer while
// the other buffer's transfer is still in flight.
static const int VTASK_TAG_IN[2] = {3, 4};
static const int VTASK_TAG_OUT[2] = {5, 6};

// Local-store copies of the task parameters, filled by DMA in
// cellSpursTaskMain():
//   gVisionParam - fetched from the effective address passed in argTask;
//   gSrcImage    - image descriptor fetched from gVisionParam.data[0];
//   gDstImage    - image descriptor fetched from gVisionParam.data[1].
// NOTE(review): the "16 byte alignment" remarks below state a requirement of
// the DMA targets; no alignment attribute is visible on these declarations,
// so it presumably comes from the type definitions -- confirm in
// vision_param.h / cellImage.h.
VisionParam gVisionParam; // 16 byte alignment
CellImage gSrcImage; // 16 byte alignment
CellImage gDstImage; // 16 byte alignment

// slice of image
// SH is the number of rows in one slice of a 640-byte-wide image; narrower or
// wider images get a different row count chosen in cellSpursTaskMain() so a
// slice never exceeds 640 * SH bytes.
#define SH 24
// Double-buffered source slices: 2 buffers of 640 * SH bytes each
// (640 * SH / 16 vectors of 16 bytes).
vec_uchar16 gSrcSliceBuf[2][640 * SH / 16]     __attribute__((aligned(CELL_IMAGE_DATA_ALIGN))); 
// Double-buffered destination slices: a half-resolution slice has a quarter
// of the source bytes, hence the extra "/ 4".
vec_uchar16 gDstSliceBuf[2][640 * SH / 4 / 16] __attribute__((aligned(CELL_IMAGE_DATA_ALIGN)));

// Forward declaration; the definition follows cellSpursTaskMain().
// Halves a slice in both dimensions: reads `height` rows of `width` bytes
// from srcslice and writes the 2x2-averaged result to dstslice.
void resize_half(const vec_uchar16 *srcslice, int width, int height, vec_uchar16 *dstslice);

/*
 * SPURS task entry point: downscales the source image to half size in both
 * dimensions, streaming it through local store in double-buffered slices.
 *
 * argTask    - its first 64-bit element is the effective address of a
 *              VisionParam; data[0] / data[1] of that structure are the
 *              effective addresses of the source / destination CellImage
 *              descriptors.
 * argTaskSet - unused.
 *
 * The task exits through cellSpursTaskExit() with exit code 0 on success, or
 * 1 when the source dimensions cannot be processed (non-positive size, width
 * not a multiple of 32 bytes, or a row pair that does not fit a slice buffer).
 *
 * Rows are consumed in pairs, so an odd trailing source row is ignored.
 */
int cellSpursTaskMain(qword argTask, uint64_t argTaskSet)
{
	uint32_t status = 0;

#if VERVOSE_DEBUG
	spu_printf("# cellSpursTaskMain()...1\n");
	spu_printf("# argTask=0x%x, &argTask=0x%x, argTask[0]=0x%x, argTaskSet=0x%x, &argTaskSet=0x%x\n",
			   (qword)argTask, &argTask, argTask[0], argTaskSet, &argTaskSet);
	spu_printf("# &gVisionParam=0x%x, sizeof(VisionParam)=%d, sizeof(gVisionParam)=%d\n",
			   &gVisionParam, sizeof(VisionParam), sizeof(gVisionParam));
#endif //VERVOSE_DEBUG

	(void)argTaskSet;

	// The parameter fetches are strictly sequential (each one supplies the
	// address for the next), so they all share a single tag.
	const int paramTag = VTASK_TAG_IN[0];

	cellDmaGet(&gVisionParam, spu_extract((vec_ullong2)argTask, 0),
			   sizeof(gVisionParam), paramTag, 0, 0);
	cellDmaWaitTagStatusAll(1 << paramTag);

	cellDmaGet(&gSrcImage, gVisionParam.data[0], sizeof(CellImage), paramTag, 0, 0);
	cellDmaWaitTagStatusAll(1 << paramTag);

	cellDmaGet(&gDstImage, gVisionParam.data[1], sizeof(CellImage), paramTag, 0, 0);
	cellDmaWaitTagStatusAll(1 << paramTag);

	const int width  = gSrcImage.GetWidth();
	const int height = gSrcImage.GetHeight();

	// resize_half() consumes two 16-byte vectors per output vector, so a row
	// must be a whole number of 32-byte groups. Rejecting width <= 0 here also
	// keeps the slice-height division below well defined.
	if (width <= 0 || height <= 0 || (width % 32) != 0) {
		status = 1;
		cellSpursTaskExit(status);
		return status;
	}

	// Largest row count whose bytes fit one source buffer (640 * SH), capped
	// at 60 and rounded down to an even number because rows are averaged in
	// pairs: an odd count would make resize_half() run past both buffers.
	const int sliceHeight = min(SH * 640 / width, 60) & ~1;
	if (sliceHeight < 2) {
		// Not even one row pair fits into a slice buffer.
		status = 1;
		cellSpursTaskExit(status);
		return status;
	}

	// Only complete row pairs produce output.
	const int usableHeight = height & ~1;
	// Round up so that a final, shorter slice is processed as well.
	const int numSlices = (usableHeight + sliceHeight - 1) / sliceHeight;

	const uint64_t srcEA = gSrcImage.GetData();
	const uint64_t dstEA = gDstImage.GetData();

	int dblBufIdx = 0;

	// get first chunk of data (clamped so a short image is not over-read)
	if (numSlices > 0) {
		cellDmaGet(gSrcSliceBuf[dblBufIdx], srcEA,
				   min(sliceHeight, usableHeight) * width,
				   VTASK_TAG_IN[dblBufIdx], 0, 0);
	}

	for (int sliceNum = 0; sliceNum < numSlices; sliceNum++, dblBufIdx = 1 - dblBufIdx) {
		const int firstRow = sliceNum * sliceHeight;
		const int rows     = min(sliceHeight, usableHeight - firstRow);
		const int nextRow  = firstRow + rows;

		// issue input dma for _next_ slice into other buf; skipped after the
		// last slice so no transfer is left outstanding at task exit
		if (nextRow < usableHeight) {
			const int nextRows = min(sliceHeight, usableHeight - nextRow);
			cellDmaGet(gSrcSliceBuf[1 - dblBufIdx],
					   srcEA + (uint64_t)(nextRow * width),
					   nextRows * width,
					   VTASK_TAG_IN[1 - dblBufIdx], 0, 0);
		}

		// The previous write-back from this dst buffer must have finished
		// before it is overwritten, and the current src slice must have
		// arrived before it is read.
		cellDmaWaitTagStatusAll(1 << VTASK_TAG_OUT[dblBufIdx]);
		cellDmaWaitTagStatusAll(1 << VTASK_TAG_IN[dblBufIdx]);

		resize_half(gSrcSliceBuf[dblBufIdx], width, rows, gDstSliceBuf[dblBufIdx]);

		// start sending current dst buf; a half-size slice is a quarter of
		// the source bytes, for both the offset and the length
		cellDmaPut(gDstSliceBuf[dblBufIdx],
				   dstEA + (uint64_t)(firstRow * width / 4),
				   rows * width / 4,
				   VTASK_TAG_OUT[dblBufIdx], 0, 0);
	}

	// Drain both write-back tags before the task ends.
	cellDmaWaitTagStatusAll((1 << VTASK_TAG_OUT[0]) | (1 << VTASK_TAG_OUT[1]));
	cellSpursTaskExit(status);

	return status;
}


/*
 * Downscales a slice of byte samples to half size in both dimensions by
 * averaging each 2x2 block.
 *
 * srcslice - `height` rows of `width` bytes, rows stored back to back.
 * width    - source row length in bytes; expected to be a multiple of 32,
 *            since two 16-byte source vectors yield one output vector.
 * height   - number of source rows; rows are consumed in pairs.
 * dstslice - receives height / 2 rows of width / 2 bytes.
 *
 * Only complete row pairs and complete vector pairs are processed, so an odd
 * trailing row or vector is skipped rather than read past the end of the
 * slice.
 */
void resize_half(const vec_uchar16 *srcslice, int width, int height, vec_uchar16 *dstslice)
{
	const int vec_w = width / 16;	// 16-byte vectors per source row
	const vec_uchar16 *src = srcslice;
	vec_uchar16 *dst = dstslice;

	for (int j = 0; j + 1 < height; j += 2) {
		for (int i = 0; i + 1 < vec_w; i += 2) {
			// two adjacent vectors from the upper row ...
			const vec_uchar16 val_lo = src[i];
			const vec_uchar16 val_hi = src[i + 1];
			// ... and the two directly below them
			const vec_uchar16 val2_lo = src[i + vec_w];
			const vec_uchar16 val2_hi = src[i + vec_w + 1];
			// average vertically first
			vec_uchar16 vlo = spu_avg(val_lo, val2_lo);
			vec_uchar16 vhi = spu_avg(val_hi, val2_hi);
			// now, average horizontally: rotating every 16-bit lane by 8 bits
			// swaps its two bytes, so averaging with the rotated copy leaves
			// the mean of each horizontal byte pair in both bytes of the lane
			const vec_uchar16 vlo_rot = (vec_uchar16)spu_rl((vec_ushort8)vlo, 8);
			const vec_uchar16 vhi_rot = (vec_uchar16)spu_rl((vec_ushort8)vhi, 8);
			vlo = spu_avg(vlo, vlo_rot);
			vhi = spu_avg(vhi, vhi_rot);
			// pack one byte per 16-bit lane of vlo and vhi into one vector
			// (mask comes from shufflebytes_masks.h)
			dst[i / 2] = spu_shuffle(vlo, vhi, short8_hi_to_char16_mask);
		}
		src += vec_w * 2;	// skip the two source rows just consumed
		dst += vec_w / 2;	// one half-width output row produced
	}
}
