/*   SCE CONFIDENTIAL                                       */
/*   PlayStation(R)3 Programmer Tool Runtime Library 475.001 */
/*   Copyright (C) 2008 Sony Computer Entertainment Inc.    */
/*   All Rights Reserved.                                   */

// Issues CELL_GCM_METHOD_SET_REFERENCE at CELL_GCM_CURRENT with `ref` as its
// argument, then invokes CELL_GCM_DEBUG_FINISH.
// NOTE(review): RESERVE_IMM(2, 1) presumably means "2 words, 1 argument" -- confirm
// against the macro definition.
CELL_GCM_DECL void CELL_GCM_FUNC(SetReferenceCommand)(CELL_GCM_ARGS(uint32_t ref))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_REFERENCE(CELL_GCM_CURRENT, ref);
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Writes the single word produced by CELL_GCM_JUMP(offset) at CELL_GCM_CURRENT
// and advances CELL_GCM_CURRENT by one element.
// `offset` must be a multiple of 4 (asserted). CELL_GCM_DEBUG_FINISH is not
// invoked here.
CELL_GCM_DECL void CELL_GCM_FUNC(SetJumpCommand)(CELL_GCM_ARGS(uint32_t offset))
{
	CELL_GCM_ASM_IN();
	// the two low bits of the target offset must be clear
	CELL_GCM_ASSERT((offset & 3) == 0);

	CELL_GCM_ASM_RESERVE_IMM(1, 1);
	CELL_GCM_CURRENT[0] = CELL_GCM_JUMP(offset);
	CELL_GCM_CURRENT += 1;
	CELL_GCM_ASM_OUT();
}

// Writes the single word produced by CELL_GCM_CALL(offset) at CELL_GCM_CURRENT
// and advances CELL_GCM_CURRENT by one element.
// `offset` must be a multiple of 4 (asserted). CELL_GCM_DEBUG_FINISH is not
// invoked here.
CELL_GCM_DECL void CELL_GCM_FUNC(SetCallCommand)(CELL_GCM_ARGS(uint32_t offset))
{
	CELL_GCM_ASM_IN();
	// the two low bits of the target offset must be clear
	CELL_GCM_ASSERT((offset&3) == 0);

	CELL_GCM_ASM_RESERVE_IMM(1, 1);
	CELL_GCM_CURRENT[0] = CELL_GCM_CALL(offset);
	CELL_GCM_CURRENT += 1;
	CELL_GCM_ASM_OUT();
}

// Writes the single word produced by CELL_GCM_RETURN() at CELL_GCM_CURRENT and
// advances CELL_GCM_CURRENT by one element. Takes no arguments;
// CELL_GCM_DEBUG_FINISH is not invoked here.
CELL_GCM_DECL void CELL_GCM_FUNC(SetReturnCommand)(CELL_GCM_NO_ARGS())
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(1, 0);
	CELL_GCM_CURRENT[0] = CELL_GCM_RETURN();
	CELL_GCM_CURRENT += 1;
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_GET_REPORT with a fixed type argument of 1 and an
// offset of index * 0x10, then invokes CELL_GCM_DEBUG_FINISH.
//   index - slot number; each slot is 0x10 apart in the offset passed on.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTimeStamp)(CELL_GCM_ARGS(uint32_t index))
{
	CELL_GCM_ASM_IN();
	// slot number -> offset (stride 0x10)
	const uint32_t reportOffset = index * 0x10;

	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_GET_REPORT(CELL_GCM_CURRENT, 1, reportOffset);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}


// Issues CELL_GCM_METHOD_SET_ANTI_ALIASING_CONTROL at CELL_GCM_CURRENT, passing
// enable, alphaToCoverage, alphaToOne and sampleMask through unchanged, then
// invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetAntiAliasingControl)(CELL_GCM_ARGS(
	uint32_t enable, uint32_t alphaToCoverage, uint32_t alphaToOne, uint32_t sampleMask))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 4);
	CELL_GCM_METHOD_SET_ANTI_ALIASING_CONTROL(CELL_GCM_CURRENT, 
		enable, alphaToCoverage, alphaToOne, sampleMask);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_CHANNEL_SEMAPHORE_OFFSET (offset = index * 0x10)
// followed by CELL_GCM_METHOD_CHANNEL_SEMAPHORE_ACQUIRE with `value`, then
// invokes CELL_GCM_DEBUG_FINISH.
//   index - label slot; converted to an offset with a stride of 0x10.
//   value - value handed to the ACQUIRE method.
CELL_GCM_DECL void CELL_GCM_FUNC(SetWaitLabel)(CELL_GCM_ARGS(uint8_t index, uint32_t value))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(4, 2);

	// label slot -> offset (stride 0x10)
	const uint32_t labelOffset = index * 0x10;

	CELL_GCM_METHOD_CHANNEL_SEMAPHORE_OFFSET(CELL_GCM_CURRENT, labelOffset);
	CELL_GCM_METHOD_CHANNEL_SEMAPHORE_ACQUIRE(CELL_GCM_CURRENT, value);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_CHANNEL_SEMAPHORE_OFFSET (offset = index * 0x10)
// followed by CELL_GCM_METHOD_CHANNEL_SEMAPHORE_RELEASE with `value`, then
// invokes CELL_GCM_DEBUG_FINISH.
//   index - label slot; converted to an offset with a stride of 0x10.
//   value - value handed to the RELEASE method.
CELL_GCM_DECL void CELL_GCM_FUNC(SetWriteCommandLabel)(CELL_GCM_ARGS(
	uint8_t index, uint32_t value))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(4, 2);

	// label slot -> offset (stride 0x10)
	const uint32_t labelOffset = index * 0x10;

	CELL_GCM_METHOD_CHANNEL_SEMAPHORE_OFFSET(CELL_GCM_CURRENT, labelOffset);
	CELL_GCM_METHOD_CHANNEL_SEMAPHORE_RELEASE(CELL_GCM_CURRENT, value);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_SEMAPHORE_OFFSET (offset = index * 0x10) followed
// by CELL_GCM_METHOD_BACK_END_WRITE_SEMAPHORE_RELEASE, then invokes
// CELL_GCM_DEBUG_FINISH.
// The value handed to the release method is `value` with byte 0 and byte 2
// exchanged; bytes 1 and 3 are passed through untouched.
CELL_GCM_DECL void CELL_GCM_FUNC(SetWriteBackEndLabel)(CELL_GCM_ARGS(uint8_t index, uint32_t value))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(4, 2);

	// swap byte 0 and 2
	const uint32_t keptBytes   = value & 0xff00ff00;      // bytes 1 and 3 stay put
	const uint32_t byte2AsLow  = (value >> 16) & 0xff;    // old byte 2 -> byte 0
	const uint32_t byte0AsHigh = (value & 0xff) << 16;    // old byte 0 -> byte 2
	const uint32_t swapped = keptBytes | byte2AsLow | byte0AsHigh;

	// label slot -> offset (stride 0x10)
	const uint32_t labelOffset = index * 0x10;

	CELL_GCM_METHOD_SET_SEMAPHORE_OFFSET(CELL_GCM_CURRENT, labelOffset);
	CELL_GCM_METHOD_BACK_END_WRITE_SEMAPHORE_RELEASE(CELL_GCM_CURRENT, swapped);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues SET_SEMAPHORE_OFFSET (offset = index * 0x10) followed by
// TEXTURE_READ_SEMAPHORE_RELEASE with `value`, then invokes
// CELL_GCM_DEBUG_FINISH.
// On non-SPU builds this is done with the two CELL_GCM_METHOD_* macros. On SPU
// builds the same four words are assembled in one vector and merged into the
// two 16-byte vectors that the destination range touches.
CELL_GCM_DECL void CELL_GCM_FUNC(SetWriteTextureLabel)(CELL_GCM_ARGS(uint8_t index, uint32_t value))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(4, 2);
	uint32_t offset = 0x10 * index;
#ifdef __SPU__
	uint32_t *ptr = CELL_GCM_CURRENT;
	uint32_t *nptr = CELL_GCM_CURRENT + 4;
	// byte position of ptr within its 16-byte vector
	uint32_t ptroffset = (uint32_t)ptr & 0xf;
	// NOTE(review): ptr/nptr are cast to vector pointers without masking the low
	// address bits; presumably vector loads/stores ignore them on SPU -- confirm.
	vec_uint4 *vptr0 = (vec_uint4*)((uintptr_t)ptr);
	vec_uint4 *vptr1 = (vec_uint4*)((uintptr_t)nptr);
	// read the existing contents so bytes outside the new words can be kept
	vec_uint4 dstVec0 = *vptr0;
	vec_uint4 dstVec1 = *vptr1;
	// advance the current pointer past the four words written below
	CELL_GCM_CURRENT = nptr; 
	// the four words to write: method header, offset, method header, value
	vec_uint4 src0 = (vec_uint4){CELL_GCM_METHOD(CELL_GCM_NV4097_SET_SEMAPHORE_OFFSET, 1),
								 (offset),
								 CELL_GCM_METHOD(CELL_GCM_NV4097_TEXTURE_READ_SEMAPHORE_RELEASE, 1),
								 (value)};
	vec_uint4 mask = (vec_uint4)spu_splats(0xffffffff);
	// presumably: mask0 selects bytes [ptroffset, 16) of a vector; val0/val1 are
	// src0 split at that boundary -- verify against the SPU intrinsics reference
	vec_uint4 mask0 = (vec_uint4)spu_rlmaskqwbyte(mask, -ptroffset);
	vec_uint4 val0 = spu_rlmaskqwbyte(src0, -ptroffset);
	vec_uint4 val1 = spu_slqwbyte(src0, 16 - ptroffset);
	// first vector: old data where mask0 is clear, new data where it is set
	*vptr0 = spu_sel(dstVec0, val0, mask0);
	// second vector: new data where mask0 is clear, old data where it is set
	*vptr1 = spu_sel(val1, dstVec1, mask0);
#else
	CELL_GCM_METHOD_SET_SEMAPHORE_OFFSET(CELL_GCM_CURRENT, offset);
	CELL_GCM_METHOD_TEXTURE_READ_SEMAPHORE_RELEASE(CELL_GCM_CURRENT, value);
#endif
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_INVALIDATE_ZCULL at CELL_GCM_CURRENT. Takes no
// arguments.
// NOTE(review): unlike most other method-emitting functions in this file, this
// one does not invoke CELL_GCM_DEBUG_FINISH before CELL_GCM_ASM_OUT -- confirm
// whether that omission is intentional.
CELL_GCM_DECL void CELL_GCM_FUNC(SetInvalidateZcull)(CELL_GCM_NO_ARGS())
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 0);
	CELL_GCM_METHOD_INVALIDATE_ZCULL(CELL_GCM_CURRENT);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_ALPHA_FUNC_REF at CELL_GCM_CURRENT with `af` and
// `ref` passed through unchanged, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetAlphaFunc)(CELL_GCM_ARGS(uint32_t af, uint32_t ref))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(3, 2);
	CELL_GCM_METHOD_SET_ALPHA_FUNC_REF(CELL_GCM_CURRENT, af, ref);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_BLEND_COLOR with `color` and then
// CELL_GCM_METHOD_SET_BLEND_COLOR2 with `color2`, followed by
// CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetBlendColor)(CELL_GCM_ARGS(
	uint32_t color, uint32_t color2))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(4, 2);
	CELL_GCM_METHOD_SET_BLEND_COLOR(CELL_GCM_CURRENT, color);
	CELL_GCM_METHOD_SET_BLEND_COLOR2(CELL_GCM_CURRENT, color2);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_BLEND_EQUATION at CELL_GCM_CURRENT with `color`
// and `alpha` (each wrapped in CELL_GCM_COMMAND_CAST), then invokes
// CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetBlendEquation)(CELL_GCM_ARGS(
	uint16_t color, uint16_t alpha))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 2);
	CELL_GCM_METHOD_SET_BLEND_EQUATION(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(color), 
		CELL_GCM_COMMAND_CAST(alpha));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_BLEND_FUNC_SFACTOR_DFACTOR at CELL_GCM_CURRENT,
// then invokes CELL_GCM_DEBUG_FINISH.
// Note the macro receives the factors in the order sfcolor, sfalpha, dfcolor,
// dfalpha, which differs from this function's parameter order.
CELL_GCM_DECL void CELL_GCM_FUNC(SetBlendFunc)(CELL_GCM_ARGS(uint16_t sfcolor, 
	uint16_t dfcolor, uint16_t sfalpha, uint16_t dfalpha))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(3, 4);
	CELL_GCM_METHOD_SET_BLEND_FUNC_SFACTOR_DFACTOR(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(sfcolor), 
		CELL_GCM_COMMAND_CAST(sfalpha), 
		CELL_GCM_COMMAND_CAST(dfcolor), 
		CELL_GCM_COMMAND_CAST(dfalpha));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues three methods in order at CELL_GCM_CURRENT: CLEAR_SURFACE with `mask`,
// NO_OPERATION, and WAIT_FOR_IDLE; then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetClearSurface)(CELL_GCM_ARGS(uint32_t mask))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(6, 1);
	CELL_GCM_METHOD_CLEAR_SURFACE(CELL_GCM_CURRENT, mask);

	// hw bug workaround, send nop
	CELL_GCM_METHOD_NO_OPERATION(CELL_GCM_CURRENT);
	
	CELL_GCM_METHOD_WAIT_FOR_IDLE(CELL_GCM_CURRENT);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_COLOR_CLEAR_VALUE at CELL_GCM_CURRENT with
// `color`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetClearColor)(CELL_GCM_ARGS(uint32_t color))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_COLOR_CLEAR_VALUE(CELL_GCM_CURRENT, color);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_ZSTENCIL_CLEAR_VALUE at CELL_GCM_CURRENT with
// `value`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetClearDepthStencil)(CELL_GCM_ARGS(uint32_t value))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_ZSTENCIL_CLEAR_VALUE(CELL_GCM_CURRENT, value);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_COLOR_MASK at CELL_GCM_CURRENT with `mask`, then
// invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetColorMask)(CELL_GCM_ARGS(uint32_t mask))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_COLOR_MASK(CELL_GCM_CURRENT, mask);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_COLOR_MASK_MRT at CELL_GCM_CURRENT with `mask`,
// then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetColorMaskMrt)(CELL_GCM_ARGS(uint32_t mask))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_COLOR_MASK_MRT(CELL_GCM_CURRENT, mask);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_CULL_FACE at CELL_GCM_CURRENT with `cfm`, then
// invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetCullFace)(CELL_GCM_ARGS(uint32_t cfm))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_CULL_FACE(CELL_GCM_CURRENT, cfm);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_DEPTH_FUNC at CELL_GCM_CURRENT with `zf`, then
// invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDepthFunc)(CELL_GCM_ARGS(uint32_t zf))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_DEPTH_FUNC(CELL_GCM_CURRENT, zf);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_DEPTH_MASK at CELL_GCM_CURRENT with `enable`, then
// invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDepthMask)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_DEPTH_MASK(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_FRONT_FACE at CELL_GCM_CURRENT with `dir`, then
// invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetFrontFace)(CELL_GCM_ARGS(uint32_t dir))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_FRONT_FACE(CELL_GCM_CURRENT, dir);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_LINE_WIDTH at CELL_GCM_CURRENT with `width`, then
// invokes CELL_GCM_DEBUG_FINISH.
// When CELL_GCM_BITFIELD is defined, `width` is first masked to its low 9 bits
// (0x1ff); otherwise it is passed through unchanged.
CELL_GCM_DECL void CELL_GCM_FUNC(SetLineWidth)(CELL_GCM_ARGS(uint32_t width))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);

#ifdef CELL_GCM_BITFIELD
	CELL_GCM_METHOD_SET_LINE_WIDTH(CELL_GCM_CURRENT, width & 0x1ff);
#else
	CELL_GCM_METHOD_SET_LINE_WIDTH(CELL_GCM_CURRENT, width);
#endif

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_LINE_SMOOTH_ENABLE at CELL_GCM_CURRENT with
// `enable`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetLineSmoothEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_LINE_SMOOTH_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_LINE_STIPPLE at CELL_GCM_CURRENT with `enable`,
// then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetLineStippleEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_LINE_STIPPLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_LINE_STIPPLE_PATTERN at CELL_GCM_CURRENT, then
// invokes CELL_GCM_DEBUG_FINISH.
//   pattern - dereferenced once; only the first uint16_t it points to is used.
//             Must not be NULL (no check is made here).
//   factor  - passed through CELL_GCM_COMMAND_CAST unchanged.
CELL_GCM_DECL void CELL_GCM_FUNC(SetLineStipplePattern)(CELL_GCM_ARGS(
	const uint16_t* pattern, uint8_t factor))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 2);
	CELL_GCM_METHOD_SET_LINE_STIPPLE_PATTERN(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(*pattern), 
		CELL_GCM_COMMAND_CAST(factor));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_LOGIC_OP at CELL_GCM_CURRENT with `op`, then
// invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetLogicOp)(CELL_GCM_ARGS(uint32_t op))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_LOGIC_OP(CELL_GCM_CURRENT, op);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_POLY_SMOOTH_ENABLE at CELL_GCM_CURRENT with
// `enable`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetPolySmoothEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_POLY_SMOOTH_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_POLYGON_STIPPLE at CELL_GCM_CURRENT with `enable`,
// then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetPolygonStippleEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_POLYGON_STIPPLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_FRONT_POLYGON_MODE at CELL_GCM_CURRENT with
// `mode`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetFrontPolygonMode)(CELL_GCM_ARGS(uint32_t mode))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_FRONT_POLYGON_MODE(CELL_GCM_CURRENT, mode);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_BACK_POLYGON_MODE at CELL_GCM_CURRENT with `mode`,
// then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetBackPolygonMode)(CELL_GCM_ARGS(uint32_t mode))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_BACK_POLYGON_MODE(CELL_GCM_CURRENT, mode);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_SCISSOR_HORIZONTAL_VERTICAL at CELL_GCM_CURRENT,
// then invokes CELL_GCM_DEBUG_FINISH.
// Note the macro receives the values in the order x, w, y, h (horizontal pair
// first, then vertical pair), not in this function's parameter order.
CELL_GCM_DECL void CELL_GCM_FUNC(SetScissor)(CELL_GCM_ARGS(uint16_t x, uint16_t y, uint16_t w, uint16_t h))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(3, 4);
	CELL_GCM_METHOD_SET_SCISSOR_HORIZONTAL_VERTICAL(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(x), 
		CELL_GCM_COMMAND_CAST(w), 
		CELL_GCM_COMMAND_CAST(y), 
		CELL_GCM_COMMAND_CAST(h));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_SHADE_MODE at CELL_GCM_CURRENT with `sm`, then
// invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetShadeMode)(CELL_GCM_ARGS(uint32_t sm))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_SHADE_MODE(CELL_GCM_CURRENT, sm);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_TWO_SIDE_LIGHT_ENABLE at CELL_GCM_CURRENT with
// `enable`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTwoSideLightEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_TWO_SIDE_LIGHT_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_STENCIL_FUNC_REF_MASK at CELL_GCM_CURRENT with
// `func`, `ref` and `mask` passed through unchanged, then invokes
// CELL_GCM_DEBUG_FINISH. Note `ref` is signed (int32_t).
CELL_GCM_DECL void CELL_GCM_FUNC(SetStencilFunc)(CELL_GCM_ARGS(uint32_t func, 
	int32_t ref, uint32_t mask))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(4, 3);
	CELL_GCM_METHOD_SET_STENCIL_FUNC_REF_MASK(CELL_GCM_CURRENT, 
		func, ref, mask);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_BACK_STENCIL_FUNC_REF_MASK at CELL_GCM_CURRENT
// with `func`, `ref` and `mask` passed through unchanged, then invokes
// CELL_GCM_DEBUG_FINISH. Note `ref` is signed (int32_t).
CELL_GCM_DECL void CELL_GCM_FUNC(SetBackStencilFunc)(CELL_GCM_ARGS(uint32_t func, 
	int32_t ref, uint32_t mask))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(4, 3);
	CELL_GCM_METHOD_SET_BACK_STENCIL_FUNC_REF_MASK(CELL_GCM_CURRENT, 
		func, ref, mask);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_STENCIL_MASK at CELL_GCM_CURRENT with `sm`, then
// invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetStencilMask)(CELL_GCM_ARGS(uint32_t sm))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_STENCIL_MASK(CELL_GCM_CURRENT, sm);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_BACK_STENCIL_MASK at CELL_GCM_CURRENT with `sm`,
// then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetBackStencilMask)(CELL_GCM_ARGS(uint32_t sm))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_BACK_STENCIL_MASK(CELL_GCM_CURRENT, sm);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_STENCIL_OP_FAIL_ZFAIL_ZPASS at CELL_GCM_CURRENT
// with `fail`, `depthFail` and `depthPass` passed through unchanged, then
// invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetStencilOp)(CELL_GCM_ARGS(uint32_t fail, 
	uint32_t depthFail, uint32_t depthPass))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(4, 3);
	CELL_GCM_METHOD_SET_STENCIL_OP_FAIL_ZFAIL_ZPASS(CELL_GCM_CURRENT, 
		fail, depthFail, depthPass);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_BACK_STENCIL_OP_FAIL_ZFAIL_ZPASS at
// CELL_GCM_CURRENT with `fail`, `depthFail` and `depthPass` passed through
// unchanged, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetBackStencilOp)(CELL_GCM_ARGS(uint32_t fail, 
	uint32_t depthFail, uint32_t depthPass))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(4, 3);
	CELL_GCM_METHOD_SET_BACK_STENCIL_OP_FAIL_ZFAIL_ZPASS(CELL_GCM_CURRENT, 
		fail, depthFail, depthPass);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_ZMIN_MAX_CONTROL at CELL_GCM_CURRENT with
// cullNearFarEnable, zclampEnable and cullIgnoreW passed through unchanged,
// then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetZMinMaxControl)(CELL_GCM_ARGS(
	const uint32_t cullNearFarEnable, const uint32_t zclampEnable, const uint32_t cullIgnoreW))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 3);
	CELL_GCM_METHOD_SET_ZMIN_MAX_CONTROL(CELL_GCM_CURRENT, cullNearFarEnable, zclampEnable, cullIgnoreW);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_ALPHA_TEST_ENABLE at CELL_GCM_CURRENT with
// `enable`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetAlphaTestEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_ALPHA_TEST_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_BLEND_ENABLE at CELL_GCM_CURRENT with `enable`,
// then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetBlendEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_BLEND_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_BLEND_ENABLE_MRT at CELL_GCM_CURRENT with `mrt1`,
// `mrt2` and `mrt3` passed through unchanged, then invokes
// CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetBlendEnableMrt)(CELL_GCM_ARGS(uint32_t mrt1, 
	uint32_t mrt2, uint32_t mrt3))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 3);
	CELL_GCM_METHOD_SET_BLEND_ENABLE_MRT(CELL_GCM_CURRENT, 
		mrt1, mrt2, mrt3);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_LOGIC_OP_ENABLE at CELL_GCM_CURRENT with `enable`,
// then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetLogicOpEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_LOGIC_OP_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_CULL_FACE_ENABLE at CELL_GCM_CURRENT with
// `enable`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetCullFaceEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_CULL_FACE_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_DEPTH_BOUNDS_TEST_ENABLE at CELL_GCM_CURRENT with
// `enable`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDepthBoundsTestEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_DEPTH_BOUNDS_TEST_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_DEPTH_TEST_ENABLE at CELL_GCM_CURRENT with
// `enable`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDepthTestEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_DEPTH_TEST_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_DITHER_ENABLE at CELL_GCM_CURRENT with `enable`,
// then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDitherEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_DITHER_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_STENCIL_TEST_ENABLE at CELL_GCM_CURRENT with
// `enable`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetStencilTestEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_STENCIL_TEST_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_TWO_SIDED_STENCIL_TEST_ENABLE at CELL_GCM_CURRENT
// with `enable`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTwoSidedStencilTestEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_TWO_SIDED_STENCIL_TEST_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_POLY_OFFSET_FILL_ENABLE at CELL_GCM_CURRENT with
// `enable`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetPolygonOffsetFillEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_POLY_OFFSET_FILL_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_RESTART_INDEX_ENABLE at CELL_GCM_CURRENT with
// `enable`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetRestartIndexEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_RESTART_INDEX_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues two methods at CELL_GCM_CURRENT: SET_POINT_PARAMS_ENABLE with
// `enable`, then SET_POINT_SPRITE_CONTROL with `enable`, `rmode` and
// `texcoord`; followed by CELL_GCM_DEBUG_FINISH.
// The same `enable` value is used for both methods.
CELL_GCM_DECL void CELL_GCM_FUNC(SetPointSpriteControl)(CELL_GCM_ARGS(
	uint32_t enable, uint32_t rmode, uint32_t texcoord))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(4, 3);
	CELL_GCM_METHOD_SET_POINT_PARAMS_ENABLE(CELL_GCM_CURRENT, 
		enable);
	CELL_GCM_METHOD_SET_POINT_SPRITE_CONTROL(CELL_GCM_CURRENT, 
		enable, rmode, texcoord);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_INVALIDATE_L2 at CELL_GCM_CURRENT with `value`, then
// invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetInvalidateTextureCache)(CELL_GCM_ARGS(uint32_t value))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_INVALIDATE_L2(CELL_GCM_CURRENT, value);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_TEXTURE_BORDER_COLOR at CELL_GCM_CURRENT for
// texture unit `index` with `color`, then invokes CELL_GCM_DEBUG_FINISH.
// `index` must be below CELL_GCM_MAX_TEXIMAGE_COUNT (asserted).
CELL_GCM_DECL void CELL_GCM_FUNC(SetTextureBorderColor)(CELL_GCM_ARGS(
	uint8_t index, uint32_t color))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_TEXIMAGE_COUNT);
	CELL_GCM_ASM_RESERVE_IMM(2, 2);
	CELL_GCM_METHOD_SET_TEXTURE_BORDER_COLOR(CELL_GCM_CURRENT, CELL_GCM_COMMAND_CAST(index), color);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_TEXTURE_FILTER at CELL_GCM_CURRENT for texture
// unit `index`, then invokes CELL_GCM_DEBUG_FINISH.
// `index` must be below CELL_GCM_MAX_TEXIMAGE_COUNT (asserted).
// When CELL_GCM_BITFIELD is defined, `bias` is masked to its low 13 bits
// (0x1fff); otherwise it is passed through unchanged. The four trailing macro
// arguments (a_signed, r_signed, g_signed, b_signed) are always 0.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTextureFilter)(CELL_GCM_ARGS(uint8_t index, 
	uint16_t bias, uint8_t min, uint8_t mag, uint8_t conv))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_TEXIMAGE_COUNT);
	CELL_GCM_ASM_RESERVE_IMM(2, 5);

#ifdef	CELL_GCM_BITFIELD
	CELL_GCM_METHOD_SET_TEXTURE_FILTER(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(bias & 0x1fff), 
		CELL_GCM_COMMAND_CAST(min), 
		CELL_GCM_COMMAND_CAST(mag), 
		CELL_GCM_COMMAND_CAST(conv),
		0,0,0,0);
		// a_signed, r_signed, g_signed, b_signed
#else
	CELL_GCM_METHOD_SET_TEXTURE_FILTER(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(bias), 
		CELL_GCM_COMMAND_CAST(min), 
		CELL_GCM_COMMAND_CAST(mag), 
		CELL_GCM_COMMAND_CAST(conv),
		0,0,0,0);
		// a_signed, r_signed, g_signed, b_signed
#endif

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_TEXTURE_ADDRESS at CELL_GCM_CURRENT for texture
// unit `index`, passing wraps, wrapt, wrapr, unsignedRemap, zfunc and gamma
// through CELL_GCM_COMMAND_CAST; then invokes CELL_GCM_DEBUG_FINISH.
// `index` must be below CELL_GCM_MAX_TEXIMAGE_COUNT (asserted).
// NOTE(review): the two trailing macro arguments are hard-coded to 0; their
// meaning is defined by the macro and is not visible here.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTextureAddress)(CELL_GCM_ARGS(uint8_t index, 
	uint8_t wraps, uint8_t wrapt, uint8_t wrapr, uint8_t unsignedRemap, 
	uint8_t zfunc, uint8_t gamma))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_TEXIMAGE_COUNT);
	CELL_GCM_ASM_RESERVE_IMM(2, 7);
	CELL_GCM_METHOD_SET_TEXTURE_ADDRESS(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(wraps), 
		CELL_GCM_COMMAND_CAST(wrapt), 
		CELL_GCM_COMMAND_CAST(wrapr), 
		CELL_GCM_COMMAND_CAST(unsignedRemap), 
		CELL_GCM_COMMAND_CAST(zfunc), 
		CELL_GCM_COMMAND_CAST(gamma), 
		0,
		0);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_TEXTURE_CONTROL0 at CELL_GCM_CURRENT for texture
// unit `index`, then invokes CELL_GCM_DEBUG_FINISH.
// `index` must be below CELL_GCM_MAX_TEXIMAGE_COUNT (asserted).
// When CELL_GCM_BITFIELD is defined, `minlod` and `maxlod` are masked to their
// low 12 bits (0xfff); otherwise they are passed through unchanged.
// NOTE(review): the final macro argument is hard-coded to 0; its meaning is
// defined by the macro and is not visible here.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTextureControl)(CELL_GCM_ARGS(uint8_t index, 
	uint32_t enable, uint16_t minlod, uint16_t maxlod, uint8_t maxaniso))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_TEXIMAGE_COUNT);
	CELL_GCM_ASM_RESERVE_IMM(2, 5);

#ifdef CELL_GCM_BITFIELD
	CELL_GCM_METHOD_SET_TEXTURE_CONTROL0(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		enable, 
		CELL_GCM_COMMAND_CAST(minlod & 0xfff), 
		CELL_GCM_COMMAND_CAST(maxlod & 0xfff), 
		CELL_GCM_COMMAND_CAST(maxaniso),
		CELL_GCM_COMMAND_CAST(0));
#else
	CELL_GCM_METHOD_SET_TEXTURE_CONTROL0(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		enable, 
		CELL_GCM_COMMAND_CAST(minlod), 
		CELL_GCM_COMMAND_CAST(maxlod), 
		CELL_GCM_COMMAND_CAST(maxaniso),
		CELL_GCM_COMMAND_CAST(0));
#endif

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_TEXTURE_CONTROL2 at CELL_GCM_CURRENT for texture
// unit `index`, then invokes CELL_GCM_DEBUG_FINISH.
// `index` must be below CELL_GCM_MAX_TEXIMAGE_COUNT (asserted).
// The packed value is: slope | (iso << 6) | (aniso << 7) | (0x2d << 8).
CELL_GCM_DECL void CELL_GCM_FUNC(SetTextureOptimization)(CELL_GCM_ARGS(uint8_t index, const uint8_t slope, const uint8_t iso, const uint8_t aniso))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_TEXIMAGE_COUNT);
	CELL_GCM_ASM_RESERVE_IMM(2, 4);

	// pack the three inputs together with the fixed 0x2d field at bit 8
	uint32_t control2 = CELL_GCM_COMMAND_CAST(slope)
		| (CELL_GCM_COMMAND_CAST(iso) << 6)
		| (CELL_GCM_COMMAND_CAST(aniso) << 7)
		| (0x2d << 8);

	CELL_GCM_METHOD_SET_TEXTURE_CONTROL2(CELL_GCM_CURRENT, CELL_GCM_COMMAND_CAST(index), control2);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_CYLINDRICAL_WRAP at CELL_GCM_CURRENT with
// `enable`, then invokes CELL_GCM_DEBUG_FINISH.
// `reserved` is accepted but not used.
CELL_GCM_DECL void CELL_GCM_FUNC(SetCylindricalWrap)(CELL_GCM_ARGS(uint32_t enable, uint32_t reserved))
{
	CELL_GCM_ASM_IN();
	// explicitly mark the parameter as unused
	(void)reserved;
	CELL_GCM_ASM_RESERVE_IMM(2, 2);
	CELL_GCM_METHOD_SET_CYLINDRICAL_WRAP(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_INVALIDATE_VERTEX_FILE_3 followed by
// CELL_GCM_METHOD_SET_BEGIN_END with `mode` at CELL_GCM_CURRENT, then invokes
// CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDrawBegin)(CELL_GCM_ARGS(const uint8_t mode))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(6, 1);

	// hw bug workaround, send 3 invalidate vertex file
	CELL_GCM_METHOD_INVALIDATE_VERTEX_FILE_3(CELL_GCM_CURRENT);

	// start draw mode
	CELL_GCM_METHOD_SET_BEGIN_END(CELL_GCM_CURRENT, CELL_GCM_COMMAND_CAST(mode));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_BEGIN_END at CELL_GCM_CURRENT with an argument of
// 0, then invokes CELL_GCM_DEBUG_FINISH. Takes no arguments.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDrawEnd)(CELL_GCM_NO_ARGS())
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 0);

	// end draw mode
	CELL_GCM_METHOD_SET_BEGIN_END(CELL_GCM_CURRENT, 0);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues INVALIDATE_VERTEX_CACHE_FILE once followed by INVALIDATE_VERTEX_FILE
// three times (eight words in total), then invokes CELL_GCM_DEBUG_FINISH.
// On non-SPU builds this is done with the CELL_GCM_METHOD_* macros. On SPU
// builds the same eight words are assembled in two vectors and merged into the
// three 16-byte vectors that the destination range touches.
CELL_GCM_DECL void CELL_GCM_FUNC(SetInvalidateVertexCache)(CELL_GCM_NO_ARGS())
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(8, 0);
#ifdef __SPU__
	uint32_t *ptr = CELL_GCM_CURRENT;
	// advance the current pointer past the eight words written below
	CELL_GCM_CURRENT += 8; 
	// byte position of ptr within its 16-byte vector
	uint32_t offset = (uint32_t)ptr & 0xf;
	// words 0-3: cache-file invalidate + first vertex-file invalidate (args 0)
	vec_uint4 src0 = (vec_uint4){CELL_GCM_METHOD(CELL_GCM_NV4097_INVALIDATE_VERTEX_CACHE_FILE, 1), 0,
								 CELL_GCM_METHOD(CELL_GCM_NV4097_INVALIDATE_VERTEX_FILE, 1), 0};
	// words 4-7: second and third vertex-file invalidate (args 0)
	vec_uint4 src1 = (vec_uint4){CELL_GCM_METHOD(CELL_GCM_NV4097_INVALIDATE_VERTEX_FILE, 1), 0,
								 CELL_GCM_METHOD(CELL_GCM_NV4097_INVALIDATE_VERTEX_FILE, 1), 0};
	// NOTE(review): ptr is cast to vector pointers without masking the low
	// address bits; presumably vector loads/stores ignore them on SPU -- confirm.
	vec_uint4 *vptr0 = (vec_uint4*)((uintptr_t)ptr);
	vec_uint4 *vptr1 = (vec_uint4*)((uintptr_t)ptr + 16);
	vec_uint4 *vptr2 = (vec_uint4*)((uintptr_t)ptr + 32);
	// only the first and last vectors are read back; the middle one is fully
	// overwritten below
	vec_uint4 dstVec0 = *vptr0;
	vec_uint4 dstVec2 = *vptr2;
	vec_uint4 mask = (vec_uint4)spu_splats(0xffffffff);
	// presumably: mask0 selects bytes [offset, 16) of a vector; val0..val3 are
	// src0/src1 split at that boundary -- verify against the SPU intrinsics reference
	vec_uint4 mask0 = (vec_uint4)spu_rlmaskqwbyte(mask, -offset);
	vec_uint4 val0 = spu_rlmaskqwbyte(src0, -offset);
	vec_uint4 val1 = spu_slqwbyte(src0, 16 - offset);
	vec_uint4 val2 = spu_rlmaskqwbyte(src1, -offset);
	vec_uint4 val3 = spu_slqwbyte(src1, 16 - offset);
	// first vector: old data where mask0 is clear, start of src0 where set
	*vptr0 = spu_sel(dstVec0, val0, mask0);
	// middle vector: rest of src0 where mask0 is clear, start of src1 where set
	*vptr1 = spu_sel(val1, val2, mask0);
	// last vector: rest of src1 where mask0 is clear, old data where set
	*vptr2 = spu_sel(val3, dstVec2, mask0);
#else
	CELL_GCM_METHOD_INVALIDATE_VERTEX_CACHE_FILE(CELL_GCM_CURRENT);
	CELL_GCM_METHOD_INVALIDATE_VERTEX_FILE(CELL_GCM_CURRENT); 
	CELL_GCM_METHOD_INVALIDATE_VERTEX_FILE(CELL_GCM_CURRENT);
	CELL_GCM_METHOD_INVALIDATE_VERTEX_FILE(CELL_GCM_CURRENT);
#endif
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_RESTART_INDEX at CELL_GCM_CURRENT with `index`,
// then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetRestartIndex)(CELL_GCM_ARGS(uint32_t index))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_RESTART_INDEX(CELL_GCM_CURRENT, index);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_FREQUENCY_DIVIDER_OPERATION at CELL_GCM_CURRENT
// with `operation` (wrapped in CELL_GCM_COMMAND_CAST), then invokes
// CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetFrequencyDividerOperation)(CELL_GCM_ARGS(uint16_t operation))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_FREQUENCY_DIVIDER_OPERATION(CELL_GCM_CURRENT, CELL_GCM_COMMAND_CAST(operation));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_TRANSFORM_BRANCH_BITS at CELL_GCM_CURRENT with
// `branchBits`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTransformBranchBits)(CELL_GCM_ARGS(uint32_t branchBits))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_TRANSFORM_BRANCH_BITS(CELL_GCM_CURRENT, branchBits);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_VERTEX_ATTRIB_INPUT_MASK at CELL_GCM_CURRENT with
// `mask` (wrapped in CELL_GCM_COMMAND_CAST), then invokes
// CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexAttribInputMask)(CELL_GCM_ARGS(uint16_t mask))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_VERTEX_ATTRIB_INPUT_MASK(CELL_GCM_CURRENT, CELL_GCM_COMMAND_CAST(mask));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_SHADER_PACKER at CELL_GCM_CURRENT with `enable`,
// then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetFragmentProgramGammaEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_SHADER_PACKER(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_USER_CLIP_PLANE_CONTROL at CELL_GCM_CURRENT with
// the six per-plane values plane0..plane5 passed through unchanged, then
// invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetUserClipPlaneControl)(CELL_GCM_ARGS(
	uint32_t plane0, uint32_t plane1, uint32_t plane2, uint32_t plane3, 
	uint32_t plane4, uint32_t plane5))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 6);
	CELL_GCM_METHOD_SET_USER_CLIP_PLANE_CONTROL(CELL_GCM_CURRENT, 
		plane0, plane1, plane2, plane3, plane4, plane5);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_RENDER_ENABLE at CELL_GCM_CURRENT, then invokes
// CELL_GCM_DEBUG_FINISH.
//   mode  - CELL_GCM_CONDITIONAL: a NO_OPERATION is issued first, then
//           SET_RENDER_ENABLE with arguments (2, index * 0x10).
//           Any other value (CELL_GCM_TRUE, CELL_GCM_FALSE): SET_RENDER_ENABLE
//           with arguments (1, 0); `index` is ignored.
//   index - slot number; converted to an offset with a stride of 0x10.
CELL_GCM_DECL void CELL_GCM_FUNC(SetRenderEnable)(CELL_GCM_ARGS(uint8_t mode, uint32_t index))
{
	CELL_GCM_ASM_IN();
	// slot number -> offset (stride 0x10); only used by the conditional path
	const uint32_t conditionOffset = index * 0x10;

	if(mode != CELL_GCM_CONDITIONAL)
	{
		// mode == CELL_GCM_TRUE, CELL_GCM_FALSE
		CELL_GCM_ASM_RESERVE_IMM(2, 2);
		CELL_GCM_METHOD_SET_RENDER_ENABLE(CELL_GCM_CURRENT, 1, 0);
	}
	else
	{
		CELL_GCM_ASM_RESERVE_IMM(4, 2);

		// hw bug workaround, send nop
		CELL_GCM_METHOD_NO_OPERATION(CELL_GCM_CURRENT);
		CELL_GCM_METHOD_SET_RENDER_ENABLE(CELL_GCM_CURRENT, 2, conditionOffset);
	}

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_ZPASS_PIXEL_COUNT_ENABLE at CELL_GCM_CURRENT with
// `enable`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetZpassPixelCountEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_ZPASS_PIXEL_COUNT_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_CLEAR_REPORT_VALUE at CELL_GCM_CURRENT with `type`,
// then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetClearReport)(CELL_GCM_ARGS(uint32_t type))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_CLEAR_REPORT_VALUE(CELL_GCM_CURRENT, type);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}


// Issues CELL_GCM_METHOD_GET_REPORT at CELL_GCM_CURRENT with `type` and an
// offset of index * 0x10, then invokes CELL_GCM_DEBUG_FINISH.
//   type  - passed through unchanged.
//   index - slot number; converted to an offset with a stride of 0x10.
CELL_GCM_DECL void CELL_GCM_FUNC(SetReport)(CELL_GCM_ARGS(uint32_t type, uint32_t index))
{
	CELL_GCM_ASM_IN();
	// slot number -> offset (stride 0x10)
	const uint32_t reportOffset = index * 0x10;

	CELL_GCM_ASM_RESERVE_IMM(2, 2);
	CELL_GCM_METHOD_GET_REPORT(CELL_GCM_CURRENT, type, reportOffset);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_ZCULL_STATS_ENABLE at CELL_GCM_CURRENT with
// `enable`, then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetZcullStatsEnable)(CELL_GCM_ARGS(uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_ZCULL_STATS_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_ZCULL_CONTROL0 at CELL_GCM_CURRENT with `zCullDir`
// and `zCullFormat` (each wrapped in CELL_GCM_COMMAND_CAST), then invokes
// CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetZcullControl)(CELL_GCM_ARGS(const uint8_t zCullDir, const uint8_t zCullFormat))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 2);
	CELL_GCM_METHOD_SET_ZCULL_CONTROL0(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(zCullDir), CELL_GCM_COMMAND_CAST(zCullFormat));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_ZCULL_CONTROL1 at CELL_GCM_CURRENT with
// `moveForwardLimit` and `pushBackLimit` (each wrapped in
// CELL_GCM_COMMAND_CAST), then invokes CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetZcullLimit)(CELL_GCM_ARGS(uint16_t moveForwardLimit, uint16_t pushBackLimit))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 2);
	CELL_GCM_METHOD_SET_ZCULL_CONTROL1(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(moveForwardLimit), CELL_GCM_COMMAND_CAST(pushBackLimit));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_SCULL_CONTROL at CELL_GCM_CURRENT with `sFunc`,
// `sRef` and `sMask` (each wrapped in CELL_GCM_COMMAND_CAST), then invokes
// CELL_GCM_DEBUG_FINISH.
CELL_GCM_DECL void CELL_GCM_FUNC(SetScullControl)(CELL_GCM_ARGS(const uint8_t sFunc, const uint8_t sRef, const uint8_t sMask))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 3);
	CELL_GCM_METHOD_SET_SCULL_CONTROL(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(sFunc), CELL_GCM_COMMAND_CAST(sRef), CELL_GCM_COMMAND_CAST(sMask));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_VERTEX_TEXTURE_ADDRESS at CELL_GCM_CURRENT for
// vertex texture `index` with `wraps` and `wrapt`, then invokes
// CELL_GCM_DEBUG_FINISH.
// `index` must be below CELL_GCM_MAX_VERTEX_TEXTURE (asserted).
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexTextureAddress)(CELL_GCM_ARGS(const uint8_t index, const uint8_t wraps, const uint8_t wrapt))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_VERTEX_TEXTURE);

	CELL_GCM_ASM_RESERVE_IMM(2, 3);
	CELL_GCM_METHOD_SET_VERTEX_TEXTURE_ADDRESS(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(wraps), 
		CELL_GCM_COMMAND_CAST(wrapt));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_VERTEX_TEXTURE_FILTER at CELL_GCM_CURRENT for
// vertex texture `index` with `bias`, then invokes CELL_GCM_DEBUG_FINISH.
// `index` must be below CELL_GCM_MAX_VERTEX_TEXTURE (asserted).
// When CELL_GCM_BITFIELD is defined, `bias` is masked to its low 13 bits
// (0x1fff); otherwise it is passed through unchanged.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexTextureFilter)(CELL_GCM_ARGS(const uint8_t index, const uint16_t bias))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_VERTEX_TEXTURE);

	CELL_GCM_ASM_RESERVE_IMM(2, 2);

#ifdef CELL_GCM_BITFIELD
	CELL_GCM_METHOD_SET_VERTEX_TEXTURE_FILTER(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(bias & 0x1fff));
#else
	CELL_GCM_METHOD_SET_VERTEX_TEXTURE_FILTER(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(bias));
#endif

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Issues CELL_GCM_METHOD_SET_VERTEX_TEXTURE_CONTROL0 at CELL_GCM_CURRENT for
// vertex texture `index` with `enable`, `minLod` and `maxLod`, then invokes
// CELL_GCM_DEBUG_FINISH.
// `index` must be below CELL_GCM_MAX_VERTEX_TEXTURE (asserted).
// When CELL_GCM_BITFIELD is defined, `minLod` and `maxLod` are masked to their
// low 12 bits (0xfff) and wrapped in CELL_GCM_COMMAND_CAST; otherwise all
// values are passed to the macro directly, without CELL_GCM_COMMAND_CAST.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexTextureControl)(CELL_GCM_ARGS(const uint8_t index, const uint32_t enable, const uint16_t minLod, const uint16_t maxLod))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_VERTEX_TEXTURE);

	CELL_GCM_ASM_RESERVE_IMM(2, 4);

#ifdef CELL_GCM_BITFIELD
	CELL_GCM_METHOD_SET_VERTEX_TEXTURE_CONTROL0(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		enable, 
		CELL_GCM_COMMAND_CAST(minLod & 0xfff), 
		CELL_GCM_COMMAND_CAST(maxLod & 0xfff));
#else
	CELL_GCM_METHOD_SET_VERTEX_TEXTURE_CONTROL0(CELL_GCM_CURRENT, index, enable, minLod, maxLod);
#endif

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_VERTEX_TEXTURE_BORDER_COLOR method for vertex texture
// `index` with the given 32-bit color word.
// Asserts index < CELL_GCM_MAX_VERTEX_TEXTURE.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexTextureBorderColor)(CELL_GCM_ARGS(const uint8_t index, const uint32_t color))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_VERTEX_TEXTURE);

	CELL_GCM_ASM_RESERVE_IMM(2, 2);
	CELL_GCM_METHOD_SET_VERTEX_TEXTURE_BORDER_COLOR(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), color);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits a WAIT_FOR_IDLE method followed by a PM_TRIGGER method at
// CELL_GCM_CURRENT. Takes no arguments.
CELL_GCM_DECL void CELL_GCM_FUNC(SetPerfMonTrigger)(CELL_GCM_NO_ARGS())
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(4, 0);
	// the idle wait is emitted before the trigger, in this order
	CELL_GCM_METHOD_WAIT_FOR_IDLE(CELL_GCM_CURRENT);
	CELL_GCM_METHOD_PM_TRIGGER(CELL_GCM_CURRENT);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_FOG_MODE method at CELL_GCM_CURRENT with `mode` passed
// through unchanged.
CELL_GCM_DECL void CELL_GCM_FUNC(SetFogMode)(CELL_GCM_ARGS(const uint32_t mode))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_FOG_MODE(CELL_GCM_CURRENT, mode);
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SURFACE2D_SET_CONTEXT_DMA_IMAGE_DESTIN method. The DMA handle
// written is CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER + location.
// Asserts location <= CELL_GCM_LOCATION_MAIN.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTransferLocation)(CELL_GCM_ARGS(const uint32_t location))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT((location <= CELL_GCM_LOCATION_MAIN));
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SURFACE2D_SET_CONTEXT_DMA_IMAGE_DESTIN(CELL_GCM_CURRENT, CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER + location);
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_CONTROL0 method. Only the lowest bit of `format` is used:
// it is placed at bit 12 and OR'ed with the fixed value 0x00100000.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDepthFormat)(CELL_GCM_ARGS(const uint32_t format))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_CONTROL0(CELL_GCM_CURRENT, ((format&1)<<12) | 0x00100000);
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_REDUCE_DST_COLOR method at CELL_GCM_CURRENT with `enable`
// passed through unchanged.
CELL_GCM_DECL void CELL_GCM_FUNC(SetBlendOptimization)(CELL_GCM_ARGS(const uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_REDUCE_DST_COLOR(CELL_GCM_CURRENT, enable);
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_POLY_OFFSET_LINE_ENABLE method at CELL_GCM_CURRENT with
// `enable` passed through unchanged.
CELL_GCM_DECL void CELL_GCM_FUNC(SetPolygonOffsetLineEnable)(CELL_GCM_ARGS(const uint32_t enable))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_POLY_OFFSET_LINE_ENABLE(CELL_GCM_CURRENT, enable);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_VERTEX_ATTRIB_OUTPUT_MASK method at CELL_GCM_CURRENT with
// `mask` passed through unchanged.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexAttribOutputMask)(CELL_GCM_ARGS(const uint32_t mask))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_VERTEX_ATTRIB_OUTPUT_MASK(CELL_GCM_CURRENT, mask);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_TEXTURE_CONTROL1 method for texture `index`, writing the
// `remap` word unchanged.
// Asserts index < CELL_GCM_MAX_TEXIMAGE_COUNT.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTextureRemap)(CELL_GCM_ARGS(const uint8_t index, const uint32_t remap))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_TEXIMAGE_COUNT);
	CELL_GCM_ASM_RESERVE_IMM(2, 2);
	CELL_GCM_METHOD_SET_TEXTURE_CONTROL1(CELL_GCM_CURRENT, CELL_GCM_COMMAND_CAST(index), remap);
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_TRANSFORM_PROGRAM_START method at CELL_GCM_CURRENT with
// `startSlot` passed through unchanged.
// Unlike most setters in this file, no CELL_GCM_DEBUG_FINISH is issued here.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexProgramStartSlot)(CELL_GCM_ARGS(const uint32_t startSlot))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_TRANSFORM_PROGRAM_START(CELL_GCM_CURRENT, startSlot);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_TRANSFORM_TIMEOUT method with a fixed first value of 0xFFFF
// and a second value chosen from `registerCount`: 32 when registerCount is
// at most 32, otherwise 48.
// Unlike most setters in this file, no CELL_GCM_DEBUG_FINISH is issued here.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexProgramRegisterCount)(CELL_GCM_ARGS(const uint32_t registerCount))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	// how many registers does our vshader use ?
	if (registerCount <= 32)
	{
		CELL_GCM_METHOD_SET_TRANSFORM_TIMEOUT(CELL_GCM_CURRENT, 0xFFFF, 32);
	}
	else
	{
		CELL_GCM_METHOD_SET_TRANSFORM_TIMEOUT(CELL_GCM_CURRENT, 0xFFFF, 48);
	}
	CELL_GCM_ASM_OUT();
}

// Packs the six spread parameters into one 32-bit word and emits it with
// the SET_ANISO_SPREAD method for `index`.
//
// Layout of the packed word, as assembled below:
//   bits  2:0   spacingSelect         (masked to 3 bits)
//   bit   4     reduceSamplesEnable   (masked to 1 bit)
//   bits 10:8   hSpacingSelect        (masked to 3 bits)
//   bit  12     hReduceSamplesEnable  (masked to 1 bit)
//   bits 18:16  vSpacingSelect        (masked to 3 bits)
//   bit  20     vReduceSamplesEnable  (masked to 1 bit)
CELL_GCM_DECL void CELL_GCM_FUNC(SetAnisoSpread)(CELL_GCM_ARGS(const uint8_t index, const uint8_t reduceSamplesEnable, const uint8_t hReduceSamplesEnable, const uint8_t vReduceSamplesEnable, const uint8_t spacingSelect, const uint8_t hSpacingSelect, const uint8_t vSpacingSelect))
{
	CELL_GCM_ASM_IN();
	uint32_t packed;

	// base (non-directional) fields
	packed  = (CELL_GCM_COMMAND_CAST( spacingSelect&0x7)<<0);
	packed |= (CELL_GCM_COMMAND_CAST( reduceSamplesEnable&0x1)<<4);
	// horizontal fields
	packed |= (CELL_GCM_COMMAND_CAST(hSpacingSelect&0x7)<<8);
	packed |= (CELL_GCM_COMMAND_CAST(hReduceSamplesEnable&0x1)<<12);
	// vertical fields
	packed |= (CELL_GCM_COMMAND_CAST(vSpacingSelect&0x7)<<16);
	packed |= (CELL_GCM_COMMAND_CAST(vReduceSamplesEnable&0x1)<<20);

	CELL_GCM_ASM_RESERVE_IMM(2, 7);
	CELL_GCM_METHOD_SET_ANISO_SPREAD(CELL_GCM_CURRENT, CELL_GCM_COMMAND_CAST(index), packed);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_VERTEX_DATA_ARRAY_FORMAT method for vertex attribute
// `index` with the given frequency, stride, size and type values.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexDataArrayFormat)(CELL_GCM_ARGS(const uint8_t index, const uint16_t frequency, const uint8_t stride, const uint8_t size, const uint8_t type))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 5);
	CELL_GCM_METHOD_SET_VERTEX_DATA_ARRAY_FORMAT(CELL_GCM_CURRENT,
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(frequency), 
		CELL_GCM_COMMAND_CAST(stride), 
		CELL_GCM_COMMAND_CAST(size), 
		CELL_GCM_COMMAND_CAST(type));
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_VERTEX_DATA_ARRAY_OFFSET method for vertex attribute
// `index` with the given memory location selector and offset.
// Asserts location <= CELL_GCM_LOCATION_MAIN.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexDataArrayOffset)(CELL_GCM_ARGS(const uint8_t index, const uint8_t location, const uint32_t offset))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT((location <= CELL_GCM_LOCATION_MAIN));
	CELL_GCM_ASM_RESERVE_IMM(2, 3);
	CELL_GCM_METHOD_SET_VERTEX_DATA_ARRAY_OFFSET(CELL_GCM_CURRENT,
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(location), 
		offset);
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_SHADER_PROGRAM method with (location+1, offset).
// Asserts location <= CELL_GCM_LOCATION_MAIN and that `offset` is aligned
// to 64*(location+1) bytes.
CELL_GCM_DECL void CELL_GCM_FUNC(SetUpdateFragmentProgramParameterLocation)(CELL_GCM_ARGS(const uint32_t offset, const uint32_t location))
{
	CELL_GCM_ASM_IN();
	// local memory 64B, main memory 128B alignment restriction
	CELL_GCM_ASSERT((location <= CELL_GCM_LOCATION_MAIN));
	// mask is 63 when location is 0 and 127 when location is 1
	CELL_GCM_ASSERT((offset & (64*(location+1)-1)) == 0);

	CELL_GCM_ASM_RESERVE_IMM(2, 2);
	CELL_GCM_METHOD_SET_SHADER_PROGRAM(CELL_GCM_CURRENT, 
		location+1, offset);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_VERTEX_DATA_BASE method at CELL_GCM_CURRENT with
// `baseOffset` and `baseIndex` passed through unchanged.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexDataBase)(CELL_GCM_ARGS(const uint32_t baseOffset, const uint32_t baseIndex))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(3, 2);
	CELL_GCM_METHOD_SET_VERTEX_DATA_BASE(CELL_GCM_CURRENT, 
		baseOffset, baseIndex);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Builds the shader-control word for a Cg fragment program and emits it
// with the SET_SHADER_CONTROL method.
//
//   prog        Cg binary program; its fragment-program record is located
//               at byte offset ps->program from the start of the binary.
//   controlTxp  shifted and masked into the CONTROL_TXP field.
//   reserved0,
//   reserved1   unused.
//
// Asserts that the binary format revision matches CG_BINARY_FORMAT_REVISION
// and that the program's register count is at most 48. Counts below 2 are
// raised to 2. No CELL_GCM_DEBUG_FINISH is issued by this function.
CELL_GCM_DECL void CELL_GCM_FUNC(SetFragmentProgramControl)(CELL_GCM_ARGS(const CGprogram prog, const uint8_t controlTxp, const uint8_t reserved0, const uint8_t reserved1))
{
	CELL_GCM_ASM_IN();
	const CgBinaryProgram *ps = (const CgBinaryProgram*) prog;
	CgBinaryFragmentProgram *fragment;
	uint32_t control;
	uint32_t registerCount;

	(void)reserved0;
	(void)reserved1;

	// a revision mismatch means the shader must be recompiled offline
	// so that the ucode matches nv40/nv47/rsx
	CELL_GCM_ASSERT(ps->binaryFormatRevision == CG_BINARY_FORMAT_REVISION);
	fragment = (CgBinaryFragmentProgram*) ((char*)ps +  ps->program);

	CELL_GCM_ASM_RESERVE_IMM(2, 4);

	// register count must end up in [2, 48]
	registerCount = fragment->registerCount;
	CELL_GCM_ASSERT(registerCount <= 48);
	if (registerCount < 2)
	{
		registerCount = 2;
	}

	// TXP control field first, then the fixed bit 10 and the register count
	control = (CELL_GCM_COMMAND_CAST(controlTxp) << CELL_GCM_SHIFT_SET_SHADER_CONTROL_CONTROL_TXP) 
		& CELL_GCM_MASK_SET_SHADER_CONTROL_CONTROL_TXP;
	control |= (1<<10) | (registerCount << 24);

	// per-program property flags
	if (fragment->depthReplace)
	{
		control |= 0xE;
	}
	if (!fragment->outputFromH0)
	{
		control |= 0x40;
	}
	if (fragment->pixelKill)
	{
		control |= 0x80;
	}

	CELL_GCM_METHOD_SET_SHADER_CONTROL(CELL_GCM_CURRENT, control);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_CLEAR_ZCULL_SURFACE method at CELL_GCM_CURRENT with `depth`
// and `stencil` passed through unchanged.
CELL_GCM_DECL void CELL_GCM_FUNC(SetClearZcullSurface)(CELL_GCM_ARGS(const uint32_t depth, const uint32_t stencil))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 2);
	CELL_GCM_METHOD_SET_CLEAR_ZCULL_SURFACE(CELL_GCM_CURRENT, 
		depth, stencil);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_ZCULL_ENABLE method at CELL_GCM_CURRENT with `depth` and
// `stencil` passed through unchanged.
CELL_GCM_DECL void CELL_GCM_FUNC(SetZcullEnable)(CELL_GCM_ARGS(const uint32_t depth, const uint32_t stencil))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 2);
	CELL_GCM_METHOD_SET_ZCULL_ENABLE(CELL_GCM_CURRENT, 
		depth, stencil);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the DRIVER_INTERRUPT method at CELL_GCM_CURRENT with `cause` passed
// through unchanged.
CELL_GCM_DECL void CELL_GCM_FUNC(SetUserCommand)(CELL_GCM_ARGS(const uint32_t cause))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_DRIVER_INTERRUPT(CELL_GCM_CURRENT, cause);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_CONTEXT_DMA_REPORT method with the DMA handle selected by
// `location` from a two-entry table.
// Asserts location <= CELL_GCM_LOCATION_MAIN.
CELL_GCM_DECL void CELL_GCM_FUNC(SetReportLocation)(CELL_GCM_ARGS(const uint32_t location))
{
	CELL_GCM_ASM_IN();
	// index 0 -> CELL_GCM_CONTEXT_DMA_TO_MEMORY_GET_REPORT,
	// index 1 -> CELL_GCM_CONTEXT_DMA_REPORT_LOCATION_MAIN
	static const uint32_t handle[2] = {
		CELL_GCM_CONTEXT_DMA_TO_MEMORY_GET_REPORT,
		CELL_GCM_CONTEXT_DMA_REPORT_LOCATION_MAIN
	};

	// NOTE(review): the table lookup below has no bounds check of its own; this
	// presumably relies on CELL_GCM_LOCATION_MAIN being 1 -- confirm.
	CELL_GCM_ASSERT((location <= CELL_GCM_LOCATION_MAIN));
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	CELL_GCM_METHOD_SET_CONTEXT_DMA_REPORT(CELL_GCM_CURRENT, handle[location]);
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the NV4097_CONTEXT_DMA_NOTIFIES method with the handle for notify
// slot `index`.
// Asserts index < CELL_GCM_NOTIFY_MAIN_MAX_COUNT.
CELL_GCM_DECL void CELL_GCM_FUNC(SetNotifyIndex)(CELL_GCM_ARGS(uint32_t index))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT(index < CELL_GCM_NOTIFY_MAIN_MAX_COUNT);

	CELL_GCM_ASM_RESERVE_IMM(2, 1);
	// handles count downwards from CELL_GCM_CONTEXT_DMA_NOTIFY_MAIN_0
	uint32_t handle=CELL_GCM_CONTEXT_DMA_NOTIFY_MAIN_0-index;
	CELL_GCM_METHOD_NV4097_CONTEXT_DMA_NOTIFIES(CELL_GCM_CURRENT, handle);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits an NV4097_NOTIFY method with argument 0, followed by a NO_OPERATION
// method. Takes no arguments.
CELL_GCM_DECL void CELL_GCM_FUNC(SetNotify)(CELL_GCM_NO_ARGS())
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(4, 0);
	CELL_GCM_METHOD_NV4097_NOTIFY(CELL_GCM_CURRENT, 0);
	CELL_GCM_METHOD_NO_OPERATION(CELL_GCM_CURRENT);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_DEPTH_BOUNDS_MIN_MAX method with zmin and zmax.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDepthBounds)(CELL_GCM_ARGS(float zmin, float zmax))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(3, 0);

#ifdef CELL_GCM_ASM
	// ASM build: the floats are handed to the method macro directly
	CELL_GCM_METHOD_SET_DEPTH_BOUNDS_MIN_MAX(CELL_GCM_CURRENT, zmin, zmax);
#else
	// C build: store through .f and read back through .u so the method macro
	// receives integer words (CellGcmCast is presumably a float/uint32 union -- confirm)
	CellGcmCast d0,d1;
	d0.f = zmin;
	d1.f = zmax;
	CELL_GCM_METHOD_SET_DEPTH_BOUNDS_MIN_MAX(CELL_GCM_CURRENT, d0.u, d1.u);
#endif

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_POINT_SIZE method with `size`.
CELL_GCM_DECL void CELL_GCM_FUNC(SetPointSize)(CELL_GCM_ARGS(float size))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 0);

#ifdef CELL_GCM_ASM
	// ASM build: the float is handed to the method macro directly
	CELL_GCM_METHOD_SET_POINT_SIZE(CELL_GCM_CURRENT, size);
#else
	// C build: store through .f and read back through .u so the method macro
	// receives an integer word (CellGcmCast is presumably a float/uint32 union -- confirm)
	CellGcmCast d;
	d.f = size;
	CELL_GCM_METHOD_SET_POINT_SIZE(CELL_GCM_CURRENT, d.u);
#endif

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_POLYGON_OFFSET_SCALE_FACTOR_BIAS method with `factor` and
// `units`, in that order.
CELL_GCM_DECL void CELL_GCM_FUNC(SetPolygonOffset)(CELL_GCM_ARGS(float factor, float units))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(3, 0);

#ifdef CELL_GCM_ASM
	// ASM build: the floats are handed to the method macro directly
	CELL_GCM_METHOD_SET_POLYGON_OFFSET_SCALE_FACTOR_BIAS(CELL_GCM_CURRENT, factor, units);
#else
	// C build: store through .f and read back through .u so the method macro
	// receives integer words (CellGcmCast is presumably a float/uint32 union -- confirm)
	CellGcmCast d0,d1;
	d0.f = factor;
	d1.f = units;
	CELL_GCM_METHOD_SET_POLYGON_OFFSET_SCALE_FACTOR_BIAS(CELL_GCM_CURRENT, d0.u, d1.u);
#endif

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_FOG_PARAMS method with the two parameters p0 and p1.
CELL_GCM_DECL void CELL_GCM_FUNC(SetFogParams)(CELL_GCM_ARGS(const float p0, const float p1))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(3, 0);

#ifdef CELL_GCM_ASM
	// ASM build: the floats are handed to the method macro directly
	CELL_GCM_METHOD_SET_FOG_PARAMS(CELL_GCM_CURRENT, p0, p1);
#else
	// C build: store through .f and read back through .u so the method macro
	// receives integer words (CellGcmCast is presumably a float/uint32 union -- confirm)
	CellGcmCast d0,d1;
	d0.f = p0;
	d1.f = p1;
	CELL_GCM_METHOD_SET_FOG_PARAMS(CELL_GCM_CURRENT, d0.u, d1.u);
#endif

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the SET_CLIP_MIN_MAX method with `min` and `max`.
CELL_GCM_DECL void CELL_GCM_FUNC(SetClipMinMax)(CELL_GCM_ARGS(float min, float max))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(3, 0);

#ifdef CELL_GCM_ASM
	// ASM build: the floats are handed to the method macro directly
	CELL_GCM_METHOD_SET_CLIP_MIN_MAX(CELL_GCM_CURRENT, min, max);
#else

	// C build: store through .f and read back through .u so the method macro
	// receives integer words (CellGcmCast is presumably a float/uint32 union -- confirm)
	CellGcmCast d0,d1;
	d0.f = min;
	d1.f = max;

	CELL_GCM_METHOD_SET_CLIP_MIN_MAX(CELL_GCM_CURRENT,
		d0.u, d1.u);
#endif

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emits the full viewport state: the viewport rectangle, the clip min/max
// range, and the viewport offset/scale vectors.
//
//   x, y, w, h   viewport rectangle
//   min, max     clip range
//   scale        four floats, read as scale[0..3]
//   offset       four floats, read as offset[0..3]
//
// In the C build the offset/scale method is written twice (see the
// workaround comment below).
CELL_GCM_DECL void CELL_GCM_FUNC(SetViewport)(CELL_GCM_ARGS(uint16_t x, 
	uint16_t y, uint16_t w, uint16_t h, float min, float max, const float scale[4], 
	const float offset[4]))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(24, 7);

#ifdef CELL_GCM_ASM
	// ASM build: a single macro receives all arguments
	CELL_GCM_METHOD_SET_VIEWPORT(CELL_GCM_CURRENT, x, y, w, h, min, max, scale, offset);
#else
	// C build: every float is stored through .f and read back through .u so
	// the method macros receive integer words
	CellGcmCast d0,d1;
	d0.f = min;
	d1.f = max;

	CellGcmCast o[4],s[4];
	o[0].f = offset[0];
	o[1].f = offset[1];
	o[2].f = offset[2];
	o[3].f = offset[3];

	s[0].f = scale[0];
	s[1].f = scale[1];
	s[2].f = scale[2];
	s[3].f = scale[3];

	// note the argument order: x, w, y, h (horizontal pair, then vertical pair)
	CELL_GCM_METHOD_SET_VIEWPORT_HORIZONTAL_VERTICAL(CELL_GCM_CURRENT,
		CELL_GCM_COMMAND_CAST(x), 
		CELL_GCM_COMMAND_CAST(w), 
		CELL_GCM_COMMAND_CAST(y), 
		CELL_GCM_COMMAND_CAST(h));
	CELL_GCM_METHOD_SET_CLIP_MIN_MAX(CELL_GCM_CURRENT,
		d0.u, d1.u);
	CELL_GCM_METHOD_SET_VIEWPORT_OFFSET_SCALE(CELL_GCM_CURRENT,
		o[0].u, o[1].u, o[2].u, o[3].u, s[0].u, s[1].u, s[2].u, s[3].u);

	// hw bug workaround, send twice
	CELL_GCM_METHOD_SET_VIEWPORT_OFFSET_SCALE(CELL_GCM_CURRENT,
		o[0].u, o[1].u, o[2].u, o[3].u, s[0].u, s[1].u, s[2].u, s[3].u);
#endif

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}


// Uploads `count` floats from `data` using SET_TRANSFORM_CONSTANT_LOAD
// packets, starting at constant index `first`.
//
// The data is split into packets of 32 floats. Each full packet advances the
// load index by 8, so one index unit covers four floats. A final shorter
// packet carries the remaining (count & 0x1F) floats.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexProgramConstants)(CELL_GCM_ARGS(
	uint32_t first, uint32_t count, const float *data))
{
	CELL_GCM_ASM_IN();

	uint32_t loop = count >> 5;		// number of full 32-float packets
	uint32_t rest = count & 0x1F;	// floats left over for the final packet
	const float * __restrict value = data;

	// each full packet is 34 words (header + load index + 32 values);
	// the tail packet, if any, is 2 + rest words
	CELL_GCM_ASM_RESERVE_REG(loop*34+(rest!=0 ? 2+rest : 0), 3);

#ifdef	CELL_GCM_ASM
	CELL_GCM_METHOD_SET_TRANSFORM_CONSTANT_LOAD_COUNT_VALUE(CELL_GCM_CURRENT, first, count, value);
#else
	uint32_t i;
	for(i=0;i<loop;i++)
	{
		uint32_t loadAt = first + i * 8;

		// method count is 33: the load index plus 32 values
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, 33);
		CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(loadAt);

		// the 32 values are copied as two blocks of 16 floats
		CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[2], value, sizeof(float)*16);
		CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[18], &value[16], sizeof(float)*16);
		CELL_GCM_CURRENT += 34;
		value += 32;
	}

	if(rest)
	{
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, rest+1);
		CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(first + (loop<<3));
		CELL_GCM_CURRENT += 2;

		// remaining values are written one word at a time as raw float bits
		for (i=0; i < rest; ++i)
		{
			CellGcmCast d0;
			d0.f = value[0];
			CELL_GCM_CURRENT[0] = CELL_GCM_ENDIAN_SWAP(d0.u);
			CELL_GCM_CURRENT++;
			value++;
		}
	}

#endif
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Uploads `constCount` four-float constants from `value` using
// SET_TRANSFORM_CONSTANT_LOAD packets, starting at constant index
// `baseConst`.
//
// The constCount*4 floats are split into packets of 32 floats (8 constants).
// A final shorter packet carries any remainder, which is always a multiple
// of four floats.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexProgramParameterBlock)(CELL_GCM_ARGS(uint32_t baseConst, uint32_t constCount, const float * __restrict value))
{
	CELL_GCM_ASM_IN();

	uint32_t blockCount  = (constCount*4) >> 5;		// # 32 blocks
	uint32_t blockRemain = (constCount*4) & 0x1f;		// remainder 

	// each full packet is 34 words (header + load index + 32 values);
	// the tail packet, if any, is 2 + blockRemain words
	CELL_GCM_ASM_RESERVE_REG(blockCount*34+(blockRemain!=0 ? 2+blockRemain : 0), 3);

#ifdef	CELL_GCM_ASM
	CELL_GCM_METHOD_SET_TRANSFORM_CONSTANT_LOAD_COUNT_VALUE(CELL_GCM_CURRENT, baseConst, constCount*4, value);
#else
	uint32_t i;
	for (i=0; i < blockCount; i++)
	{
		uint32_t loadAt = baseConst+i*8;

		// method count is 33: the load index plus 32 values
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, 33);
		CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(loadAt);

		// the 32 values are copied as two blocks of 16 floats
		CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[2], value, sizeof(float)*16);
		CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[18], &value[16], sizeof(float)*16);
		CELL_GCM_CURRENT += 34;
		value += 32;
	}

	if(blockRemain)
	{
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, blockRemain+1);
		CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(baseConst + blockCount*8);
		CELL_GCM_CURRENT += 2;

		// convert the float remainder into a count of four-float constants
		blockRemain >>= 2;
		for (i=0; i < blockRemain; ++i)
		{
			CELL_GCM_MEMCPY(CELL_GCM_CURRENT, value, sizeof(float)*4);
			CELL_GCM_CURRENT += 4;
			value += 4;
		}
	}
#endif
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Writes a single NO_OPERATION method header covering `count` words, then
// advances CELL_GCM_CURRENT past the header and those `count` words.
// The skipped words themselves are not written by this function.
// Asserts count <= CELL_GCM_MAX_METHOD_COUNT.
CELL_GCM_DECL void CELL_GCM_FUNC(SetSkipNop)(CELL_GCM_ARGS(uint32_t count))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT(count <= CELL_GCM_MAX_METHOD_COUNT);
	CELL_GCM_ASM_RESERVE_REG(1+count, 1);
#ifdef	CELL_GCM_ASM
	CELL_GCM_METHOD_SET_SKIP_NOP(CELL_GCM_CURRENT, count);
#else
	// header built with CELL_GCM_METHOD_NI rather than CELL_GCM_METHOD
	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD_NI(CELL_GCM_NV4097_NO_OPERATION, count);
	CELL_GCM_CURRENT += 1+count;
#endif
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

#if (CELL_GCM_UNSAFE==0)
// Invokes CELL_GCM_RESERVE(size) and writes nothing itself.
// Only compiled when CELL_GCM_UNSAFE is 0.
CELL_GCM_DECL void CELL_GCM_FUNC(ReserveMethodSize)(CELL_GCM_ARGS(uint32_t size))
{
	CELL_GCM_RESERVE(size);
}
#endif

// Points the hardware at a fragment program stored at `offset` in the
// memory selected by `location`, and emits the state derived from the Cg
// binary header.
//
// Emitted, in order:
//   1. SET_SHADER_PROGRAM with (location+1, offset & 0x1fffffff)
//   2. SET_VERTEX_ATTRIB_OUTPUT_MASK with the program's attribute input
//      mask OR'ed with 0x20
//   3. one SET_TEX_COORD_CONTROL per bit set in texCoordsInputMask
//
// Asserts: matching binary format revision, location <=
// CELL_GCM_LOCATION_MAIN, a non-zero ucode size that is a multiple of 16,
// a 64-byte aligned offset, and the top three offset bits clear.
// No CELL_GCM_DEBUG_FINISH is issued by this function.
CELL_GCM_DECL void CELL_GCM_FUNC(SetFragmentProgramOffset)(CELL_GCM_ARGS(const CGprogram prog, const uint32_t offset, const uint32_t location))
{
	const CgBinaryProgram *ps = (const CgBinaryProgram*) prog;
	CgBinaryFragmentProgram *fragment;
	uint32_t attribMask;
	uint32_t coordMask;
	uint32_t coord2DMask;
	uint32_t coordCentroidMask;
	uint32_t coordIndex;

	// a revision mismatch means the shader must be recompiled offline
	// so that the ucode matches nv40/nv47/rsx
	CELL_GCM_ASSERT(ps->binaryFormatRevision == CG_BINARY_FORMAT_REVISION);
	CELL_GCM_ASSERT((location <= CELL_GCM_LOCATION_MAIN));
	fragment = (CgBinaryFragmentProgram*) ((char*)ps +  ps->program);

	// validate program size
	CELL_GCM_ASSERT(ps->ucodeSize > 0);
	CELL_GCM_ASSERT((ps->ucodeSize & 15) == 0);
	// SET_SHADER_PROGRAM requires a 64-byte aligned address
	CELL_GCM_ASSERT((offset & 63) == 0);

	CELL_GCM_ASSERT((offset & 0xe0000000) == 0);

	CELL_GCM_RESERVE(4);

	// tell the hardware where to fetch the program from
	CELL_GCM_METHOD_SET_SHADER_PROGRAM(CELL_GCM_CURRENT, location+1, (offset&0x1fffffff));

	// attributes actually referenced by the fragment program
	attribMask = binaryFragmentProgramMaskHelper_unused_guard(0) ? 0 : 0;
	attribMask = fragment->attributeInputMask | 0x20;
	CELL_GCM_METHOD_SET_VERTEX_ATTRIB_OUTPUT_MASK(CELL_GCM_CURRENT, attribMask); 

	// per texture-coordinate control (2D optimisation and centroid flags)
	coordMask = fragment->texCoordsInputMask;
	coord2DMask = fragment->texCoords2D;
	coordCentroidMask = fragment->texCoordsCentroid;
	coordIndex = 0;
	while (coordMask != 0)
	{
		if ((coordMask & 1) != 0)
		{
			// bit 0: 2D flag, bit 4: centroid flag
			uint32_t coordCtrl = (coord2DMask & 1) | ((coordCentroidMask & 1) << 4);
			CELL_GCM_RESERVE(2);
			CELL_GCM_METHOD_SET_TEX_COORD_CONTROL(CELL_GCM_CURRENT, coordIndex,coordCtrl);
		}

		// shift all three masks together so bit 0 always describes coordIndex
		coordMask >>= 1;
		coord2DMask >>= 1;
		coordCentroidMask >>= 1;
		coordIndex++;
	}
}

// Writes a SET_POLYGON_STIPPLE_PATTERN packet: one header word followed by
// the 32 words read from `pattern[0..31]`.
CELL_GCM_DECL void CELL_GCM_FUNC(SetPolygonStipplePattern)(CELL_GCM_ARGS(const uint32_t* pattern))
{
	CELL_GCM_RESERVE(33);
	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_POLYGON_STIPPLE_PATTERN, 32);
	// the 32 pattern words are copied as two blocks of 16
	CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[1],  &pattern[0],  sizeof(uint32_t)*16);
	CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[17], &pattern[16], sizeof(uint32_t)*16);
	CELL_GCM_CURRENT += 33;

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Writes `count` zero words at CELL_GCM_CURRENT and advances the pointer
// past them.
// Asserts count <= 1024.
CELL_GCM_DECL void CELL_GCM_FUNC(SetNopCommand)(CELL_GCM_ARGS(uint32_t count))
{
	CELL_GCM_ASSERT(count <= 1024);
	CELL_GCM_RESERVE(count);

	// fill the reserved words front to back
	uint32_t written = 0;
	while (written < count)
	{
		CELL_GCM_CURRENT[written] = 0;
		++written;
	}

	CELL_GCM_CURRENT += count;
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Selects the source and destination DMA contexts for a copy according to
// `mode` and emits them with COPY2D_SET_CONTEXT_DMA_BUFFER.
//
// "MAIN" maps to CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER and "LOCAL" maps to
// CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER. An unrecognised mode asserts;
// if the assert does not stop execution, both handles are emitted as 0.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTransferDataMode)(CELL_GCM_ARGS(const uint8_t mode))
{
	// zero is what gets emitted when `mode` matches none of the cases
	uint32_t sourceDma = 0;
	uint32_t destinDma = 0;

	switch(mode)
	{
	case CELL_GCM_TRANSFER_MAIN_TO_LOCAL:
		sourceDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		destinDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		break;
	case CELL_GCM_TRANSFER_LOCAL_TO_MAIN:
		sourceDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		destinDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		break;
	case CELL_GCM_TRANSFER_LOCAL_TO_LOCAL:
		sourceDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		destinDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		break;
	case CELL_GCM_TRANSFER_MAIN_TO_MAIN:
		sourceDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		destinDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		break;
	default:
		CELL_GCM_ASSERT(0);
		break;
	}

	CELL_GCM_RESERVE(3);
	CELL_GCM_METHOD_COPY2D_SET_CONTEXT_DMA_BUFFER(
		CELL_GCM_CURRENT,
		sourceDma,
		destinDma);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Emits three methods in order: SET_SEMAPHORE_OFFSET (0x10 * index),
// BACK_END_WRITE_SEMAPHORE_RELEASE, and WAIT_FOR_IDLE.
//
// The released value is `value` with bytes 0 and 2 exchanged; bytes 1 and 3
// are left where they are.
CELL_GCM_DECL void CELL_GCM_FUNC(SetWriteBackEndLabelForConditional)(CELL_GCM_ARGS(uint8_t index, uint32_t value))
{
	CELL_GCM_RESERVE(6);

	// exchange byte 0 with byte 2
	const uint32_t keptBytes  = value & 0xff00ff00;          // bytes 1 and 3 stay put
	const uint32_t byte2Low   = (value >> 16) & 0xff;        // byte 2 moved down to byte 0
	const uint32_t byte0High  = ((value >> 0 ) & 0xff) << 16; // byte 0 moved up to byte 2
	const uint32_t swapped    = keptBytes | byte2Low | byte0High;

	// each label occupies 0x10 bytes
	const uint32_t labelOffset = 0x10 * index;

	CELL_GCM_METHOD_SET_SEMAPHORE_OFFSET(CELL_GCM_CURRENT, labelOffset);
	CELL_GCM_METHOD_BACK_END_WRITE_SEMAPHORE_RELEASE(CELL_GCM_CURRENT, swapped);
	CELL_GCM_METHOD_WAIT_FOR_IDLE(CELL_GCM_CURRENT);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Emits three methods in order: SET_SEMAPHORE_OFFSET (0x10 * index),
// TEXTURE_READ_SEMAPHORE_RELEASE with `value` unchanged, and WAIT_FOR_IDLE.
CELL_GCM_DECL void CELL_GCM_FUNC(SetWriteTextureLabelForConditional)(CELL_GCM_ARGS(uint8_t index, uint32_t value))
{
	CELL_GCM_RESERVE(6);

	// each label occupies 0x10 bytes
	uint32_t offset = 0x10 * index;
	CELL_GCM_METHOD_SET_SEMAPHORE_OFFSET(CELL_GCM_CURRENT, offset);
	CELL_GCM_METHOD_TEXTURE_READ_SEMAPHORE_RELEASE(CELL_GCM_CURRENT, value);
	CELL_GCM_METHOD_WAIT_FOR_IDLE(CELL_GCM_CURRENT);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Loads a Cg vertex program and its default constants.
//
//   prog   Cg binary program header; the vertex-program record is located at
//          byte offset vs->program and the parameter table at
//          vs->parameterArray.
//   ucode  instruction words to upload (4 words per instruction).
//
// Emitted, in order:
//   1. SET_TRANSFORM_PROGRAM_LOAD_START with the program's instruction slot
//   2. SET_TRANSFORM_PROGRAM packets carrying the ucode, 32 words at a time
//   3. SET_VERTEX_ATTRIB_INPUT_MASK
//   4. SET_TRANSFORM_TIMEOUT, chosen from the register count
//   5. SetVertexProgramParameter for every constant/uniform parameter that
//      has a default value
//
// No CELL_GCM_DEBUG_FINISH is issued directly by this function.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexProgram)(CELL_GCM_ARGS(const CGprogram prog, const void *ucode))
{
	const CgBinaryProgram *vs;
	CgBinaryVertexProgram *binaryVertexProgram;
	const CgBinaryParameter *param;
	const uint32_t *rawData;
	uint32_t paramCount;
	uint32_t instCount;
	uint32_t instIndex;

	// check binary format revision -- offline recompile necessary
	// -- enforce the correct ucode for nv40/nv47/rsx
	vs = (const CgBinaryProgram*) prog;
	CELL_GCM_ASSERT(vs->binaryFormatRevision == CG_BINARY_FORMAT_REVISION);
	binaryVertexProgram = (CgBinaryVertexProgram*) ((char*)vs + vs->program);

	rawData = (const uint32_t*)ucode;
	instCount = binaryVertexProgram->instructionCount;
	instIndex = binaryVertexProgram->instructionSlot;

	// check program size
	CELL_GCM_ASSERT(instCount * 16 == vs->ucodeSize);
	CELL_GCM_ASSERT((instIndex + instCount) <= CELL_GCM_VTXPRG_MAX_INST);

	// loop = full packets of 8 instructions; rest = leftover words (4 per instruction)
	uint32_t loop, rest;
	loop = instCount / 8;
	rest = (instCount % 8) * 4;
	// NOTE(review): the constant 7 presumably covers the load-start, input-mask
	// and timeout methods emitted below -- confirm against the macro sizes.
	CELL_GCM_RESERVE(7 + loop*33 + (rest!=0 ? rest+1 : 0));

	// download program to slots [0,len)
	// -- cpu overhead ahead: multiple vertex programs can reside in the instruction
	//    slots. The high level driver/caller is supposed to manage these and only
	//    redownload the instructions as needed.
	CELL_GCM_METHOD_SET_TRANSFORM_PROGRAM_LOAD_START(CELL_GCM_CURRENT, 
		instIndex, 
		instIndex);

	// upload the ucode in the instruction registers
	uint32_t i, j;
	for (i = 0; i < loop; i++)
	{
		// one header word followed by 32 ucode words, copied as two blocks of 16
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_PROGRAM, 32);
		CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[1], &rawData[0], sizeof(uint32_t)*16);
		CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[17], &rawData[16], sizeof(uint32_t)*16);
		CELL_GCM_CURRENT += (1 + 32);
		rawData += 32;
	}
	if (rest > 0)
	{
		// tail packet: remaining words are written one at a time
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_PROGRAM, rest);
		for (j = 0; j < rest; j++)
		{
			CELL_GCM_CURRENT[j+1] = CELL_GCM_ENDIAN_SWAP(rawData[j]);
		}
		CELL_GCM_CURRENT += (1 + rest);
	}

	// curie needs the correct input/output mask
	CELL_GCM_ASSERT((binaryVertexProgram->attributeInputMask & ~0xffff) == 0);
	CELL_GCM_METHOD_SET_VERTEX_ATTRIB_INPUT_MASK(CELL_GCM_CURRENT, 
		binaryVertexProgram->attributeInputMask);

	// how many registers does our vshader use ?
	if (binaryVertexProgram->registerCount <= 32)
	{
		CELL_GCM_METHOD_SET_TRANSFORM_TIMEOUT(CELL_GCM_CURRENT, 0xFFFF, 32);
	}
	else
	{
		CELL_GCM_METHOD_SET_TRANSFORM_TIMEOUT(CELL_GCM_CURRENT, 0xFFFF, 48);
	}

	// set constants and defaults
	// -- cpu overhead ahead: we should not redownload all constants
	//    every time the shader changes!
	param = (const CgBinaryParameter *)((const char *)vs + vs->parameterArray);
	for(paramCount = vs->parameterCount; paramCount-- > 0;)
	{
		// only constant/uniform parameters with a non-zero defaultValue offset are uploaded
		if (param->defaultValue &&
			((param->var == CG_CONSTANT) || (param->var == CG_UNIFORM)))
		{
			// four floats are copied for every parameter
			float value[4];
#ifdef CELL_GCM_LITTLE_ENDIAN
			// use a plain memcpy (no endian swap) because the next function endian-swaps value[]
			memcpy(&value, (const char *)vs + param->defaultValue, sizeof(value));
#else
			CELL_GCM_MEMCPY(&value, (const char *)vs + param->defaultValue, sizeof(value));
#endif
			CELL_GCM_FUNC(SetVertexProgramParameter)(CELL_GCM_ARGS_FUNC((CGparameter)param, value));
		}
		++param;
	}
}

// Binds a fragment program described by a CellCgbFragmentProgramConfiguration
// and emits the state derived from it.
//
//   conf      configuration record; offset, attributeInputMask,
//             texCoordsInputMask, texCoords2D, texCoordsCentroid,
//             registerCount and fragmentControl are read.
//   location  memory location selector; must be <= CELL_GCM_LOCATION_MAIN.
//
// Emitted, in order:
//   1. SET_SHADER_PROGRAM with (location+1, conf->offset & 0x1fffffff)
//   2. SET_VERTEX_ATTRIB_OUTPUT_MASK with conf->attributeInputMask
//   3. one SET_TEX_COORD_CONTROL per bit set in conf->texCoordsInputMask
//   4. SET_SHADER_CONTROL with conf->fragmentControl | (registerCount << 24)
//
// Asserts: valid location, offset aligned to 64*(location+1) bytes, top three
// offset bits clear, and a register count within [2, 48].
CELL_GCM_DECL void CELL_GCM_FUNC(SetFragmentProgramLoadLocation)(CELL_GCM_ARGS(const CellCgbFragmentProgramConfiguration *conf, const uint32_t location))
{
	uint32_t rawData = ((conf->offset)&0x1fffffff);
	uint32_t shCtrl0;
	uint32_t registerCount;
	uint32_t texMask;
	uint32_t inMask;
	uint32_t texMask2D;
	uint32_t texMaskCentroid;
	uint32_t i;

	// location feeds both the alignment mask and the method argument below, so
	// validate it first, as the other location-taking setters in this file do
	CELL_GCM_ASSERT((location <= CELL_GCM_LOCATION_MAIN));

	// local memory 64B, main memory 128B alignment restriction
	CELL_GCM_ASSERT((rawData & (64*(location+1)-1)) == 0);

	CELL_GCM_ASSERT(((conf->offset) & 0xe0000000) == 0);

	CELL_GCM_RESERVE(4);

	// set fragment program address
	CELL_GCM_METHOD_SET_SHADER_PROGRAM(CELL_GCM_CURRENT, 
		location+1, rawData);

	// enable the attributes referenced by the fragment program 
	inMask = conf->attributeInputMask;
	
	CELL_GCM_METHOD_SET_VERTEX_ATTRIB_OUTPUT_MASK(CELL_GCM_CURRENT, inMask); 

	// texture coordinates control
	texMask = conf->texCoordsInputMask;
	texMask2D = conf->texCoords2D;
	texMaskCentroid = conf->texCoordsCentroid;
	for(i=0; texMask; i++)
	{
		if (texMask&1)
		{
			// bit 0: 2D flag, bit 4: centroid flag
			uint32_t hwTexCtrl = (texMask2D & 1) | ((texMaskCentroid & 1) << 4);
			CELL_GCM_RESERVE(2);
			CELL_GCM_METHOD_SET_TEX_COORD_CONTROL(CELL_GCM_CURRENT, i,hwTexCtrl);
		}
		// shift all three masks together so bit 0 always describes coordinate i
		texMask >>= 1;
		texMask2D >>= 1;
		texMaskCentroid >>= 1;
	}

	// shader properties
	registerCount = conf->registerCount;
	CELL_GCM_ASSERT(registerCount <= 48);
	CELL_GCM_ASSERT(registerCount >= 2);
	
	// the clamp below still takes effect whenever the assert above does not
	// stop execution
	if (registerCount < 2)
	{
		// register count must be [2, 48]
		registerCount = 2;
	}
	shCtrl0 = conf->fragmentControl | (registerCount << 24);
	CELL_GCM_RESERVE(2);
	CELL_GCM_METHOD_SET_SHADER_CONTROL(CELL_GCM_CURRENT, shCtrl0);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Loads a vertex program described by a CellCgbVertexProgramConfiguration.
//
//   conf   configuration record; instructionCount, instructionSlot,
//          attributeInputMask and registerCount are read.
//   ucode  instruction words to upload (4 words per instruction); copied
//          without endian conversion (see the comments below).
//
// Emitted, in order: SET_TRANSFORM_PROGRAM_LOAD_START, the ucode in
// SET_TRANSFORM_PROGRAM packets of up to 32 words, SET_VERTEX_ATTRIB_INPUT_MASK
// and SET_TRANSFORM_TIMEOUT.
// Asserts instructionSlot + instructionCount <= CELL_GCM_VTXPRG_MAX_INST.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexProgramLoad)(CELL_GCM_ARGS(const CellCgbVertexProgramConfiguration *conf, const void *ucode))
{
	const uint32_t *rawData;
	uint32_t instCount;
	uint32_t instIndex;
	
	rawData = (const uint32_t*)ucode;
	instCount = conf->instructionCount;
	instIndex = conf->instructionSlot;

	CELL_GCM_ASSERT((instIndex + instCount) <= CELL_GCM_VTXPRG_MAX_INST);

	// loop = full packets of 8 instructions; rest = leftover words (4 per instruction)
	uint32_t loop, rest;
	loop = instCount / 8;
	rest = (instCount % 8) * 4;
	// NOTE(review): the constant 7 presumably covers the load-start, input-mask
	// and timeout methods emitted below -- confirm against the macro sizes.
	CELL_GCM_RESERVE(7 + loop*33 + (rest!=0 ? rest+1 : 0));

	// download program to slots [instIndex,instIndex+len-1]
	CELL_GCM_METHOD_SET_TRANSFORM_PROGRAM_LOAD_START(CELL_GCM_CURRENT, 
		instIndex, 
		instIndex);

	// upload the ucode in the instruction registers
	uint32_t i, j;
	for (i = 0; i < loop; i++)
	{
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_PROGRAM, 32);
#ifdef CELL_GCM_LITTLE_ENDIAN
		// use no endian swap memcpy because cgb ucode is always big endian
		memcpy(&CELL_GCM_CURRENT[1], &rawData[0], sizeof(uint32_t)*32);
#else
		// the 32 words are copied as two blocks of 16
		CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[1], &rawData[0], sizeof(uint32_t)*16);
		CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[17], &rawData[16], sizeof(uint32_t)*16);
#endif
		CELL_GCM_CURRENT += (1 + 32);
		rawData += 32;
	}
	if (rest > 0)
	{
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_PROGRAM, rest);
		for (j = 0; j < rest; j++)
		{
			// direct copy because cgb ucode is always big endian
			CELL_GCM_CURRENT[j+1] = rawData[j];
		}
		CELL_GCM_CURRENT += (1 + rest);
	}

	//set the attribute input mask
	CELL_GCM_METHOD_SET_VERTEX_ATTRIB_INPUT_MASK(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(conf->attributeInputMask));

	// how many registers does our vshader use ?
	if (conf->registerCount <= 32)
	{
		CELL_GCM_METHOD_SET_TRANSFORM_TIMEOUT(CELL_GCM_CURRENT, 0xFFFF, 32);
	}
	else
	{
		CELL_GCM_METHOD_SET_TRANSFORM_TIMEOUT(CELL_GCM_CURRENT, 0xFFFF, 48);
	}

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Uploads `instCount` vertex-program instructions from `ucode` starting at
// instruction slot `loadSlot`.
//
// Emits SET_TRANSFORM_PROGRAM_LOAD followed by the ucode in
// SET_TRANSFORM_PROGRAM packets of up to 32 words (4 words per instruction).
// Asserts loadSlot + instCount <= CELL_GCM_VTXPRG_MAX_INST.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexProgramLoadSlot)(CELL_GCM_ARGS(const uint32_t loadSlot, const uint32_t instCount, const void *ucode))
{
	const uint32_t *rawData;
	rawData = (const uint32_t*)ucode;

	CELL_GCM_ASSERT((loadSlot + instCount) <= CELL_GCM_VTXPRG_MAX_INST);

	// loop = full packets of 8 instructions; rest = leftover words (4 per instruction)
	uint32_t loop, rest;
	loop = instCount / 8;
	rest = (instCount % 8) * 4;
	// NOTE(review): the constant 2 presumably covers the load method emitted
	// below -- confirm against the macro size.
	CELL_GCM_RESERVE(2 + loop*33 + (rest!=0 ? rest+1 : 0));

	// download program to slots [slot,slot+instCount-1]
	CELL_GCM_METHOD_SET_TRANSFORM_PROGRAM_LOAD(CELL_GCM_CURRENT, loadSlot);

	// upload the ucode in the instruction registers
	uint32_t i, j;
	for (i = 0; i < loop; i++)
	{
		// one header word followed by 32 ucode words, copied as two blocks of 16
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_PROGRAM, 32);
		CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[1], &rawData[0], sizeof(uint32_t)*16);
		CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[17], &rawData[16], sizeof(uint32_t)*16);
		CELL_GCM_CURRENT += (1 + 32);
		rawData += 32;
	}
	if (rest > 0)
	{
		// tail packet: remaining words are written one at a time
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_PROGRAM, rest);
		for (j = 0; j < rest; j++)
		{
			CELL_GCM_CURRENT[j+1] = CELL_GCM_ENDIAN_SWAP(rawData[j]);
		}
		CELL_GCM_CURRENT += (1 + rest);
	}

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Emits both SET_VERTEX_DATA_ARRAY_FORMAT and SET_VERTEX_DATA_ARRAY_OFFSET
// for vertex attribute `index` (four command words in total).
// Asserts location <= CELL_GCM_LOCATION_MAIN.
//
// On SPU the four words are assembled in one vector and merged into the two
// 16-byte quadwords that contain the destination; elsewhere the two regular
// method macros are used.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexDataArray)(CELL_GCM_ARGS(uint8_t index, uint16_t frequency, uint8_t stride, uint8_t size, uint8_t type, uint8_t location, uint32_t offset))
{
	CELL_GCM_ASSERT((location <= CELL_GCM_LOCATION_MAIN));
	CELL_GCM_RESERVE(4);
#ifdef __SPU__
	uint32_t *ptr = CELL_GCM_CURRENT;
	uint32_t *nptr = CELL_GCM_CURRENT + 4;
	// byte offset of the write position within its 16-byte quadword
	uint32_t ptroffset = (uint32_t)ptr & 0xf;
	vec_uint4 *vptr0 = (vec_uint4*)((uintptr_t)ptr);
	vec_uint4 *vptr1 = (vec_uint4*)((uintptr_t)nptr);
	// current contents of both quadwords are loaded so they can be merged into
	vec_uint4 dstVec0 = *vptr0;
	vec_uint4 dstVec1 = *vptr1;
	CELL_GCM_CURRENT = nptr; 
	// the four command words: format header, format data, offset header, offset data
	vec_uint4 src0 = (vec_uint4){CELL_GCM_METHOD(CELL_GCM_NV4097_SET_VERTEX_DATA_ARRAY_FORMAT + (index) * 4, 1),
								 (((frequency) << 16) | ((stride) << 8) | ((size) << 4) | (type)),
								 CELL_GCM_METHOD(CELL_GCM_NV4097_SET_VERTEX_DATA_ARRAY_OFFSET + (index) * 4, 1),
								 (((location) << 31) | (offset))};
	// NOTE(review): the shifts below appear to split src0 across the two
	// quadwords according to ptroffset, with mask0 selecting which bytes of
	// each quadword are replaced -- confirm against the SPU intrinsics reference.
	vec_uint4 mask = (vec_uint4)spu_splats(0xffffffff);
	vec_uint4 mask0 = (vec_uint4)spu_rlmaskqwbyte(mask, -ptroffset);
	vec_uint4 val0 = spu_rlmaskqwbyte(src0, -ptroffset);
	vec_uint4 val1 = spu_slqwbyte(src0, 16 - ptroffset);
	*vptr0 = spu_sel(dstVec0, val0, mask0);
	*vptr1 = spu_sel(val1, dstVec1, mask0);
#else
	CELL_GCM_METHOD_SET_VERTEX_DATA_ARRAY_FORMAT(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(frequency), 
		CELL_GCM_COMMAND_CAST(stride), 
		CELL_GCM_COMMAND_CAST(size), 
		CELL_GCM_COMMAND_CAST(type));
	CELL_GCM_METHOD_SET_VERTEX_DATA_ARRAY_OFFSET(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(location), 
		offset);
#endif
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

/*
 * Binds `texture` to fragment texture unit `index` with an explicit border
 * setting.
 *
 * Nine command words are reserved and four methods are emitted, in this
 * order: offset/format, image rect, control3, control1.
 *
 *   index    texture unit; asserted to be < CELL_GCM_MAX_TEXIMAGE_COUNT
 *   texture  texture description; its location is asserted to be
 *            <= CELL_GCM_LOCATION_MAIN
 *   border   border setting, placed at bit 3 of the format word
 */
CELL_GCM_DECL void CELL_GCM_FUNC(SetTextureBorder)(CELL_GCM_ARGS(uint8_t index, 
	const CellGcmTexture *texture, uint8_t border))
{
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_TEXIMAGE_COUNT);
	CELL_GCM_ASSERT((texture->location <= CELL_GCM_LOCATION_MAIN));
	CELL_GCM_RESERVE(9);

	// Pack every data word up front; the method macros below only consume them.
	const uint32_t texOffset = texture->offset;

	// location+1 in the low bits, then cubemap (bit 2), border (bit 3),
	// dimension (bit 4 up), format (bit 8 up), mipmap count (bit 16 up)
	const uint32_t formatWord = (texture->location + 1) | (texture->cubemap << 2) | (border << 3)
		| (texture->dimension << 4) | (texture->format << 8)
		| (texture->mipmap << 16);

	// height in the low half, width in the high half
	const uint32_t rectWord = texture->height | (texture->width << 16);

	const uint32_t remapWord = texture->remap;

	// pitch in the low bits, depth from bit 20 up
	const uint32_t pitchDepthWord = texture->pitch | (texture->depth << 20);

	CELL_GCM_METHOD_SET_TEXTURE_OFFSET_FORMAT(CELL_GCM_CURRENT,
		CELL_GCM_COMMAND_CAST(index),
		texOffset,
		formatWord);
	CELL_GCM_METHOD_SET_TEXTURE_IMAGE_RECT(CELL_GCM_CURRENT,
		CELL_GCM_COMMAND_CAST(index),
		rectWord);
	CELL_GCM_METHOD_SET_TEXTURE_CONTROL3(CELL_GCM_CURRENT,
		CELL_GCM_COMMAND_CAST(index),
		pitchDepthWord);
	CELL_GCM_METHOD_SET_TEXTURE_CONTROL1(CELL_GCM_CURRENT,
		CELL_GCM_COMMAND_CAST(index),
		remapWord);

	CELL_GCM_DEBUG_TEXTURE_CHECK(CELL_GCM_CURRENT,texture,border);
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Queues a wait for flip by delegating to SetWaitLabel with label index 1 and
// value 0.
// NOTE(review): presumably label 1 is the label updated on flip - confirm.
CELL_GCM_DECL void CELL_GCM_FUNC(SetWaitFlip)(CELL_GCM_NO_ARGS())
{
	CELL_GCM_FUNC(SetWaitLabel)(CELL_GCM_ARGS_FUNC(1,0));
}

/*
 * Patches the embedded constants of a fragment program parameter by queueing
 * MAIN_TO_LOCAL transfers from already prepared source data.
 *
 *   prog       Cg binary program; embedded-constant tables are located
 *              relative to its start
 *   param      parameter entry inside that program
 *   offset     base destination offset; each patch location is
 *              offset + ucodeOffset[k]
 *   srcOffset  source offset of the data; it must already hold the
 *              16bit swapped values
 *
 * Scalars and vectors patch from the parameter entry itself. Matrices patch
 * one row per entry, using the entries that directly follow the matrix
 * entry, and the source advances by one row per entry. Entries without an
 * embedded-constant table are skipped. Unknown types assert and emit nothing.
 */
CELL_GCM_DECL void CELL_GCM_FUNC(SetFragmentProgramParameterPointer)(CELL_GCM_ARGS(const CGprogram prog, const CGparameter param, const uint32_t offset, const uint32_t srcOffset))
{
	CgBinaryProgram *binary = (CgBinaryProgram*) prog;
	CgBinaryParameter *entry = (CgBinaryParameter*) param;
	uint32_t srcCursor = srcOffset;

	uint32_t rowCount = 0;   // number of parameter entries to visit
	uint32_t rowBytes = 0;   // bytes transferred per entry
	uint32_t firstRow = 0;   // 1 when the rows live in the entries after `param`

	switch (entry->type)
	{
	case CG_FLOAT:
	case CG_BOOL:
	case CG_FLOAT1:
	case CG_BOOL1:
		rowCount = 1; rowBytes = 4;
		break;
	case CG_FLOAT2:
	case CG_BOOL2:
		rowCount = 1; rowBytes = 8;
		break;
	case CG_FLOAT3:
	case CG_BOOL3:
		rowCount = 1; rowBytes = 12;
		break;
	case CG_FLOAT4:
	case CG_BOOL4:
		rowCount = 1; rowBytes = 16;
		break;
	case CG_FLOAT3x3:
	case CG_BOOL3x3:
		rowCount = 3; rowBytes = 12; firstRow = 1;
		break;
	case CG_FLOAT4x3:
	case CG_BOOL4x3:
		rowCount = 4; rowBytes = 12; firstRow = 1;
		break;
	case CG_FLOAT3x4:
	case CG_BOOL3x4:
		rowCount = 3; rowBytes = 16; firstRow = 1;
		break;
	case CG_FLOAT4x4:
	case CG_BOOL4x4:
		rowCount = 4; rowBytes = 16; firstRow = 1;
		break;
	default:
		CELL_GCM_ASSERT(0);
		return;
	}

	entry += firstRow;
	for (uint32_t row = 0; row < rowCount; row++, entry++, srcCursor += rowBytes)
	{
		CgBinaryEmbeddedConstantOffset ecOffset = entry->embeddedConst;
		if (!ecOffset)
			continue;	// nothing embedded for this entry

		CgBinaryEmbeddedConstant *ec = (CgBinaryEmbeddedConstant*) ((char*)binary + ecOffset);
		for (uint32_t k = 0; k < ec->ucodeCount; k++)
		{
			uint32_t dst = offset + ec->ucodeOffset[k];

			// data at srcCursor needs to be 16bit swapped value
			CELL_GCM_FUNC(SetTransferData)(CELL_GCM_ARGS_FUNC(CELL_GCM_TRANSFER_MAIN_TO_LOCAL, dst, rowBytes, srcCursor, rowBytes, rowBytes, 1));
		}
	}
}

/*
 * Patches the embedded constants of a fragment program parameter with the
 * float data in `value`, using inline transfers.
 *
 *   prog    Cg binary program; embedded-constant tables are located relative
 *           to its start
 *   param   parameter entry inside that program
 *   value   source floats: 1..4 for scalars/vectors, rows * columns for
 *           matrices, consumed row by row
 *   offset  base destination offset; each patch location is
 *           offset + ucodeOffset[k]
 *
 * Every component is passed through cellGcmSwap16Float32 before upload
 * (hardware bug workaround). Scalars and vectors patch from the parameter
 * entry itself. Matrices patch one row per entry, using the entries that
 * directly follow the matrix entry. Entries without an embedded-constant
 * table are skipped, but their row of `value` is still consumed. Unknown
 * types assert and emit nothing.
 */
CELL_GCM_DECL void CELL_GCM_FUNC(SetFragmentProgramParameter)(CELL_GCM_ARGS(const CGprogram prog, const CGparameter param, const float *value, const uint32_t offset))
{
	CgBinaryProgram *binary = (CgBinaryProgram*) prog;
	CgBinaryParameter *entry = (CgBinaryParameter*) param;

	uint32_t rowCount = 0;   // number of parameter entries to visit
	uint32_t rowWidth = 0;   // floats uploaded per entry
	uint32_t firstRow = 0;   // 1 when the rows live in the entries after `param`

	switch (entry->type)
	{
	case CG_FLOAT:
	case CG_BOOL:
	case CG_FLOAT1:
	case CG_BOOL1:
		rowCount = 1; rowWidth = 1;
		break;
	case CG_FLOAT2:
	case CG_BOOL2:
		rowCount = 1; rowWidth = 2;
		break;
	case CG_FLOAT3:
	case CG_BOOL3:
		rowCount = 1; rowWidth = 3;
		break;
	case CG_FLOAT4:
	case CG_BOOL4:
		rowCount = 1; rowWidth = 4;
		break;
	case CG_FLOAT3x3:
	case CG_BOOL3x3:
		rowCount = 3; rowWidth = 3; firstRow = 1;
		break;
	case CG_FLOAT4x3:
	case CG_BOOL4x3:
		rowCount = 4; rowWidth = 3; firstRow = 1;
		break;
	case CG_FLOAT3x4:
	case CG_BOOL3x4:
		rowCount = 3; rowWidth = 4; firstRow = 1;
		break;
	case CG_FLOAT4x4:
	case CG_BOOL4x4:
		rowCount = 4; rowWidth = 4; firstRow = 1;
		break;
	default:
		CELL_GCM_ASSERT(0);
		return;
	}

	entry += firstRow;
	for (uint32_t row = 0; row < rowCount; row++, entry++, value += rowWidth)
	{
		CgBinaryEmbeddedConstantOffset ecOffset = entry->embeddedConst;
		if (!ecOffset)
			continue;	// nothing embedded for this entry

		CgBinaryEmbeddedConstant *ec = (CgBinaryEmbeddedConstant*) ((char*)binary + ecOffset);

		// hw bug workaround, swap 16bit value
		float swapped[4];
		for (uint32_t c = 0; c < rowWidth; c++)
		{
			swapped[c] = cellGcmSwap16Float32(value[c]);
		}

		// write the same swapped row to every ucode location that embeds it
		for (uint32_t k = 0; k < ec->ucodeCount; k++)
		{
			uint32_t dst = offset + ec->ucodeOffset[k];
			CELL_GCM_FUNC(SetInlineTransfer)(CELL_GCM_ARGS_FUNC(dst, swapped, rowWidth));
		}
	}
}

// Binds fragment program `prog` located at `offset` by issuing
// SetFragmentProgramOffset(prog, offset, 0) followed by
// SetFragmentProgramControl(prog, 0, 1, 0).
// The z min/max control handling below is compiled out (#if 0).
CELL_GCM_DECL void CELL_GCM_FUNC(SetFragmentProgram)(CELL_GCM_ARGS(const CGprogram prog, uint32_t offset))
{
	CELL_GCM_FUNC(SetFragmentProgramOffset)(CELL_GCM_ARGS_FUNC(prog, offset, 0));
	CELL_GCM_FUNC(SetFragmentProgramControl)(CELL_GCM_ARGS_FUNC(prog, 0, 1, 0));

	//CELL_GCM_INTERNAL_ASSERT(SetPixelShaderConsistency(!binaryFragmentProgram->outputFromH0));

#if 0  // removed due to expose depth clamp func to apps, only apps use depth replace (which is very few) will be affected
	// depth replace fp should not be znear/zfar culled
	if(binaryFragmentProgram->depthReplace)
		CELL_GCM_METHOD_SET_ZMIN_MAX_CONTROL(CELL_GCM_CURRENT, 0,0,0);
	else
		CELL_GCM_METHOD_SET_ZMIN_MAX_CONTROL(CELL_GCM_CURRENT, 1,0,0);
#endif
}

/*
 * Uploads the value of a vertex program constant parameter.
 *
 *   param  Cg binary parameter entry; must be a constant or uniform bound to
 *          a C register (or have an undefined resource, in which case
 *          nothing is emitted)
 *   value  source floats: 1..4 for scalars/vectors, rows * columns for
 *          matrices, consumed row by row
 *
 * Each constant register is written with a 6-word command: the
 * SET_TRANSFORM_CONSTANT_LOAD header (count 5), the register index, and four
 * data words. Components the type does not provide are written as 0.
 * For matrices the rows are described by the parameter entries that directly
 * follow `param` (p+1, p+2, ...), one register per row. Entries whose
 * resource is undefined or whose resIndex is -1 are skipped.
 * Unsupported types assert and emit nothing.
 */
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexProgramParameter)(CELL_GCM_ARGS(const CGparameter param, const float *value))
{
	CgBinaryParameter *p = (CgBinaryParameter*) param;

	CELL_GCM_ASSERT((p->var == CG_CONSTANT) || (p->var == CG_UNIFORM));
	CELL_GCM_ASSERT((p->res == CG_UNDEFINED) || (p->res == CG_C));
	CELL_GCM_ASSERT(p->resIndex < CELL_GCM_VTXPRG_MAX_CONST);

	// parameter has no resource assigned: nothing to upload
	if (p->res == CG_UNDEFINED)
		return;

	switch (p->type)
	{
	case CG_FLOAT:
	case CG_FLOAT1:
		{
			// one component; y, z, w are zero-filled
			if (p->resIndex != -1) {
				CELL_GCM_ASSERT(p->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
	
				CELL_GCM_RESERVE(6);
				CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, 5);
				CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(p->resIndex);
				CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[2], value, sizeof(float));
				CELL_GCM_CURRENT[3] = 0;
				CELL_GCM_CURRENT[4] = 0;
				CELL_GCM_CURRENT[5] = 0;
				CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
				CELL_GCM_CURRENT+=6;
			}
		}
		break;

	case CG_FLOAT2:
		{
			// two components; z, w are zero-filled
			if (p->resIndex != -1) {
				CELL_GCM_ASSERT(p->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
	
				CELL_GCM_RESERVE(6);
				CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, 5);
				CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(p->resIndex);
				CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[2], value, sizeof(float)*2);
				CELL_GCM_CURRENT[4] = 0;
				CELL_GCM_CURRENT[5] = 0;
				CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
				CELL_GCM_CURRENT+=6;
			}
		}
		break;

	case CG_FLOAT3:
		{
			// three components; w is zero-filled
			if (p->resIndex != -1) {
				CELL_GCM_ASSERT(p->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
	
				CELL_GCM_RESERVE(6);
				CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, 5);
				CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(p->resIndex);
				CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[2], value, sizeof(float)*3);
				CELL_GCM_CURRENT[5] = 0;
				CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
				CELL_GCM_CURRENT+=6;
			}
		}
		break;

	case CG_FLOAT4:
		{
			// all four components come from value
			if (p->resIndex != -1) {
				CELL_GCM_ASSERT(p->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
	
				CELL_GCM_RESERVE(6);
				CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, 5);
				CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(p->resIndex);
				CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[2], value, sizeof(float)*4);
				CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
				CELL_GCM_CURRENT+=6;
			}
		}
		break;

	case CG_FLOAT3x4:
		{
			// set 3 consts (rows at p+1..p+3), 4 components each
			CELL_GCM_ASSERT((p+1)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
			CELL_GCM_ASSERT((p+2)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
			CELL_GCM_ASSERT((p+3)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);

			{
				p++;
				for (int cnt = 0; cnt < 3; cnt++,p++,value += 4) 
				{
					if ((p->res != CG_UNDEFINED) && (p->resIndex != -1)) 
					{
						CELL_GCM_RESERVE(6);
						CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, 5);
						CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(p->resIndex);
						CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[2], value, sizeof(float)*4);
						CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
						CELL_GCM_CURRENT+=6;
					}
				}
			}
		}
		break;

	case CG_FLOAT4x4:
		{
			// set 4 consts (rows at p+1..p+4), 4 components each
			CELL_GCM_ASSERT((p+1)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
			CELL_GCM_ASSERT((p+2)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
			CELL_GCM_ASSERT((p+3)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
			CELL_GCM_ASSERT((p+4)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);

			// Fast path: all four rows have a resource, so the whole matrix is
			// sent as one 18-word command starting at the first row's register.
			// NOTE(review): this assumes the four rows occupy consecutive
			// registers - confirm against the Cg compiler output.
			// NOTE(review): if every row has a resource but any resIndex is -1,
			// nothing at all is uploaded (the per-row fallback is not taken).
			if (((p+1)->res != CG_UNDEFINED) 
				&& ((p+2)->res != CG_UNDEFINED) 
				&& ((p+3)->res != CG_UNDEFINED) 
				&& ((p+4)->res != CG_UNDEFINED)) 
			{
				if (((p+1)->resIndex != -1)
					&& ((p+2)->resIndex != -1)
					&& ((p+3)->resIndex != -1)
					&& ((p+4)->resIndex != -1)) {

					CELL_GCM_RESERVE(18);
					CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, 17);
					CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP((p+1)->resIndex);
					CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[2], value, sizeof(float)*16);
					CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
					CELL_GCM_CURRENT += 18;
				}
			}
			else 
			{
				// at least one row has no resource: upload the remaining rows
				// individually
				p++;
				for (int cnt = 0; cnt < 4; cnt++, p++, value += 4) 
				{
					if ((p->res != CG_UNDEFINED) && (p->resIndex != -1)) 
					{
						CELL_GCM_RESERVE(6);
						CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, 5);
						CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(p->resIndex);
						CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[2], value, sizeof(float)*4);
						CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
						CELL_GCM_CURRENT+=6;
					}
				}
			}
		}
		break;

	case CG_FLOAT3x3:
		{
			// set 3 consts (rows at p+1..p+3), 3 components each, w zero-filled
			CELL_GCM_ASSERT((p+1)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
			CELL_GCM_ASSERT((p+2)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
			CELL_GCM_ASSERT((p+3)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);

			{
				p++;
				for (int cnt = 0; cnt < 3; cnt++, p++, value += 3) 
				{
					if (((p)->res != CG_UNDEFINED) && (p->resIndex != -1)) 
					{
						CELL_GCM_RESERVE(6);
						CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, 5);
						CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(p->resIndex);
						CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[2], value, sizeof(float)*3);
						CELL_GCM_CURRENT[5] = 0;
						CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
						CELL_GCM_CURRENT+=6;
					}
				}
			}
		}
		break;

	case CG_FLOAT4x3:
		{
			// set 4 consts (rows at p+1..p+4), 3 components each, w zero-filled
			CELL_GCM_ASSERT((p+1)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
			CELL_GCM_ASSERT((p+2)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
			CELL_GCM_ASSERT((p+3)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);
			CELL_GCM_ASSERT((p+4)->resIndex < CELL_GCM_VTXPRG_MAX_CONST);

			{
				p++;
				for (int cnt = 0; cnt < 4; cnt++,p++,value += 3) 
				{
					if (((p)->res != CG_UNDEFINED) && (p->resIndex != -1)) 
					{
						CELL_GCM_RESERVE(6);
						CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, 5);
						CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(p->resIndex);
						CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[2], value, sizeof(float)*3);
						CELL_GCM_CURRENT[5] = 0;
						CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
						CELL_GCM_CURRENT+=6;
					}
				}
			}
		}
		break;

	default:
		// unsupported parameter type
		CELL_GCM_ASSERT(0);
		break;
	}
}

// Loads the fragment program described by `conf`; forwards to
// SetFragmentProgramLoadLocation with CELL_GCM_LOCATION_LOCAL as the location.
CELL_GCM_DECL void CELL_GCM_FUNC(SetFragmentProgramLoad)(CELL_GCM_ARGS(const CellCgbFragmentProgramConfiguration *conf))
{
	CELL_GCM_FUNC(SetFragmentProgramLoadLocation)(CELL_GCM_ARGS_FUNC(conf, CELL_GCM_LOCATION_LOCAL));
}

// Forwards to SetUpdateFragmentProgramParameterLocation for the program at
// `offset`, with CELL_GCM_LOCATION_LOCAL as the location.
CELL_GCM_DECL void CELL_GCM_FUNC(SetUpdateFragmentProgramParameter)(CELL_GCM_ARGS(uint32_t offset))
{
	CELL_GCM_FUNC(SetUpdateFragmentProgramParameterLocation)(CELL_GCM_ARGS_FUNC(offset, CELL_GCM_LOCATION_LOCAL));
}

/*
 * Emits the texture filter method for fragment texture unit `index`
 * (two command words reserved).
 *
 *   index          texture unit; asserted to be < CELL_GCM_MAX_TEXIMAGE_COUNT
 *   bias           filter bias; truncated to 13 bits in the bitfield build
 *   min, mag       passed through as the min / mag filter arguments
 *   conv           passed through as the convolution argument
 *   as, rs, gs, bs passed through as the last four macro arguments
 *                  (presumably per-channel signed flags - confirm)
 */
CELL_GCM_DECL void CELL_GCM_FUNC(SetTextureFilterSigned)(CELL_GCM_ARGS(uint8_t index, 
	uint16_t bias, uint8_t min, uint8_t mag, uint8_t conv,
	uint8_t as, uint8_t rs, uint8_t gs, uint8_t bs))
{
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_TEXIMAGE_COUNT);
	CELL_GCM_RESERVE(2);

#ifdef	CELL_GCM_BITFIELD
	// bitfield build: bias is masked to its low 13 bits before being passed on
	CELL_GCM_METHOD_SET_TEXTURE_FILTER(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(bias & 0x1fff), 
		CELL_GCM_COMMAND_CAST(min), 
		CELL_GCM_COMMAND_CAST(mag), 
		CELL_GCM_COMMAND_CAST(conv),
		CELL_GCM_COMMAND_CAST(as),
		CELL_GCM_COMMAND_CAST(rs),
		CELL_GCM_COMMAND_CAST(gs),
		CELL_GCM_COMMAND_CAST(bs));
#else
	// non-bitfield build: bias is passed unmasked
	CELL_GCM_METHOD_SET_TEXTURE_FILTER(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(bias), 
		CELL_GCM_COMMAND_CAST(min), 
		CELL_GCM_COMMAND_CAST(mag), 
		CELL_GCM_COMMAND_CAST(conv),
		CELL_GCM_COMMAND_CAST(as),
		CELL_GCM_COMMAND_CAST(rs),
		CELL_GCM_COMMAND_CAST(gs),
		CELL_GCM_COMMAND_CAST(bs));
#endif

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

/*
 * Emits the texture address method for fragment texture unit `index`,
 * including the anisotropic bias and the signed remap mode
 * (two command words reserved).
 *
 *   index          texture unit; asserted to be < CELL_GCM_MAX_TEXIMAGE_COUNT
 *   wraps, wrapt, wrapr
 *                  passed through as the three wrap arguments
 *   unsignedRemap  must be CELL_GCM_TEXTURE_UNSIGNED_REMAP_NORMAL or _BIASED
 *   zfunc, gamma, anisoBias
 *                  passed through unchanged
 *   signedRemap    must be CELL_GCM_TEXTURE_SIGNED_REMAP_NORMAL or _CLAMPED
 */
CELL_GCM_DECL void CELL_GCM_FUNC(SetTextureAddressAnisoBiasRemap)(CELL_GCM_ARGS(uint8_t index, 
	uint8_t wraps, uint8_t wrapt, uint8_t wrapr, uint8_t unsignedRemap, 
	uint8_t zfunc, uint8_t gamma, uint8_t anisoBias, uint8_t signedRemap))
{
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_TEXIMAGE_COUNT);
	CELL_GCM_ASSERT(signedRemap == CELL_GCM_TEXTURE_SIGNED_REMAP_NORMAL || signedRemap == CELL_GCM_TEXTURE_SIGNED_REMAP_CLAMPED);
	CELL_GCM_ASSERT(unsignedRemap == CELL_GCM_TEXTURE_UNSIGNED_REMAP_NORMAL || unsignedRemap == CELL_GCM_TEXTURE_UNSIGNED_REMAP_BIASED);
	CELL_GCM_RESERVE(2);
	CELL_GCM_METHOD_SET_TEXTURE_ADDRESS(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(wraps), 
		CELL_GCM_COMMAND_CAST(wrapt), 
		CELL_GCM_COMMAND_CAST(wrapr), 
		CELL_GCM_COMMAND_CAST(unsignedRemap), 
		CELL_GCM_COMMAND_CAST(zfunc), 
		CELL_GCM_COMMAND_CAST(gamma), 
		CELL_GCM_COMMAND_CAST(anisoBias),
		CELL_GCM_COMMAND_CAST(signedRemap));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

/*
 * Emits the texture address method for fragment texture unit `index`,
 * including the anisotropic bias (two command words reserved).
 *
 * Takes the same arguments as SetTextureAddressAnisoBiasRemap except that
 * there is no signedRemap parameter: the final macro argument is fixed to 0.
 * Unlike that function, unsignedRemap is not range-checked here.
 */
CELL_GCM_DECL void CELL_GCM_FUNC(SetTextureAddressAnisoBias)(CELL_GCM_ARGS(uint8_t index, 
	uint8_t wraps, uint8_t wrapt, uint8_t wrapr, uint8_t unsignedRemap, 
	uint8_t zfunc, uint8_t gamma, uint8_t anisoBias))
{
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_TEXIMAGE_COUNT);
	CELL_GCM_RESERVE(2);
	CELL_GCM_METHOD_SET_TEXTURE_ADDRESS(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		CELL_GCM_COMMAND_CAST(wraps), 
		CELL_GCM_COMMAND_CAST(wrapt), 
		CELL_GCM_COMMAND_CAST(wrapr), 
		CELL_GCM_COMMAND_CAST(unsignedRemap), 
		CELL_GCM_COMMAND_CAST(zfunc), 
		CELL_GCM_COMMAND_CAST(gamma), 
		CELL_GCM_COMMAND_CAST(anisoBias),
		0);	// signed remap argument fixed to 0

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Binds `texture` to fragment texture unit `index`; forwards to
// SetTextureBorder with the border argument fixed to 0x1.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTexture)(CELL_GCM_ARGS(uint8_t index,
	const CellGcmTexture *texture))
{	
	CELL_GCM_FUNC(SetTextureBorder)(CELL_GCM_ARGS_FUNC(index , texture , 0x1));
}

/*
 * Binds `texture` to vertex texture unit `index`.
 *
 * Seven command words are reserved and three methods are emitted, in this
 * order: offset/format, control3 (pitch), image rect.
 *
 * Asserted preconditions:
 *   - index < CELL_GCM_MAX_VERTEX_TEXTURE
 *   - format is LN|NR|W32_Z32_Y32_X32_FLOAT or LN|NR|X32_FLOAT
 *   - offset is 128-byte aligned
 *   - mipmap is in [1, 13]
 *   - pitch is 16-byte aligned (4-component format) or 4-byte aligned
 *     (1-component format); only checked when CELL_GCM_ASSERT_ENABLE is set
 *
 * NOTE(review): unlike SetTextureBorder, texture->location is not
 * range-checked here, and cubemap/border/depth/remap are not used.
 */
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexTexture)(CELL_GCM_ARGS(const uint8_t index, const CellGcmTexture *texture))
{
	uint32_t offset, format, control3, imagerect;

	// parameter check
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_VERTEX_TEXTURE);
	CELL_GCM_ASSERTS((texture->format == (CELL_GCM_TEXTURE_LN|CELL_GCM_TEXTURE_NR|CELL_GCM_TEXTURE_W32_Z32_Y32_X32_FLOAT)) 
	           || (texture->format == (CELL_GCM_TEXTURE_LN|CELL_GCM_TEXTURE_NR|CELL_GCM_TEXTURE_X32_FLOAT)), "Vertex Texture only support LN_NR_W32_Z32_Y32_X32_FLOAT or LN_NR_X32_FLOAT format" ); 
	CELL_GCM_ASSERTS( (texture->offset & 127)==0, "Texture offset must be 128 byte aligned" );
	CELL_GCM_ASSERTS((texture->mipmap > 0) && (texture->mipmap <=13), "Legal range for mipmap [1, 13]" );

	// format-dependent pitch alignment check, compiled only with asserts enabled
#ifdef CELL_GCM_ASSERT_ENABLE
	if (texture->format == (CELL_GCM_TEXTURE_LN|CELL_GCM_TEXTURE_NR|CELL_GCM_TEXTURE_W32_Z32_Y32_X32_FLOAT))
		CELL_GCM_ASSERTS(((texture->pitch)&15) == 0, "Pitch must be 16B aligned for CELL_GCM_TEXTURE_LN|CELL_GCM_TEXTURE_NR|CELL_GCM_TEXTURE_W32_Z32_Y32_X32_FLOAT");
	else if (texture->format == (CELL_GCM_TEXTURE_LN|CELL_GCM_TEXTURE_NR|CELL_GCM_TEXTURE_X32_FLOAT))
		CELL_GCM_ASSERTS(((texture->pitch)&3) == 0, "Pitch must be 4B aligned for CELL_GCM_TEXTURE_LN|CELL_GCM_TEXTURE_NR|CELL_GCM_TEXTURE_X32_FLOAT");
#endif

	CELL_GCM_RESERVE(7);

	offset = texture->offset;
	// format word: location+1 in the low bits, dimension from bit 4,
	// format from bit 8, mipmap count from bit 16
	format = (texture->location + 1) | (texture->dimension << 4) 
		| (texture->format << 8) | (texture->mipmap << 16);
	// height in the low half, width in the high half
	imagerect = texture->height | (texture->width << 16);
	control3 = texture->pitch;

	CELL_GCM_METHOD_SET_VERTEX_TEXTURE_OFFSET_FORMAT(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index),
		offset,
		format);
	CELL_GCM_METHOD_SET_VERTEX_TEXTURE_CONTROL3(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index),
		control3);
	CELL_GCM_METHOD_SET_VERTEX_TEXTURE_IMAGE_RECT(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index),
		imagerect);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Sets the render target described by `surface`; forwards to SetSurfaceWindow
// with CELL_GCM_WINDOW_ORIGIN_BOTTOM and CELL_GCM_WINDOW_PIXEL_CENTER_HALF.
CELL_GCM_DECL void CELL_GCM_FUNC(SetSurface)(CELL_GCM_ARGS(const CellGcmSurface *surface))
{
	CELL_GCM_FUNC(SetSurfaceWindow)(CELL_GCM_ARGS_FUNC(
		surface, 
		CELL_GCM_WINDOW_ORIGIN_BOTTOM, 
		CELL_GCM_WINDOW_PIXEL_CENTER_HALF));
}

/*
 * Sets the render target described by `surface` together with the shader
 * window origin and pixel-center convention (32 command words reserved).
 *
 *   surface      colour/depth buffer locations, offsets, pitches, formats,
 *                target selection, window position (x, y) and size
 *   origin       forwarded to the shader window method
 *   pixelCenter  forwarded to the shader window method
 *
 * Asserted preconditions: all colour/depth offsets and pitches are 64-byte
 * aligned, every pitch is in [64, 128K), every location is
 * <= CELL_GCM_LOCATION_MAIN, and width/height are in [1, 4096].
 *
 * Emitted in order: context DMA objects for colour A, B, C/D and Z; surface
 * format with pitch/offset for A, B and the Z offset; Z pitch; pitch/offset
 * for C and D; colour target; window offset; surface clip; shader window.
 */
CELL_GCM_DECL void CELL_GCM_FUNC(SetSurfaceWindow)(CELL_GCM_ARGS(
	const CellGcmSurface *surface, const uint32_t origin, const uint32_t pixelCenter))
{
	CELL_GCM_RESERVE(32);

	// alignment restriction, SET_SURFACE_COLOR_XOFFSET, SET_SURFACE_ZETA_OFFSET
	// need to be 64Byte alignment
	CELL_GCM_ASSERT((surface->colorOffset[0] & 63) == 0);
	CELL_GCM_ASSERT((surface->colorOffset[1] & 63) == 0);
	CELL_GCM_ASSERT((surface->colorOffset[2] & 63) == 0);
	CELL_GCM_ASSERT((surface->colorOffset[3] & 63) == 0);
	CELL_GCM_ASSERT((surface->depthOffset & 63) == 0);

	// alignment restriction, SET_SURFACE_PITCH_X needs to be 64byte alignment
	CELL_GCM_ASSERT((surface->colorPitch[0] & 63) == 0);
	CELL_GCM_ASSERT((surface->colorPitch[1] & 63) == 0);
	CELL_GCM_ASSERT((surface->colorPitch[2] & 63) == 0);
	CELL_GCM_ASSERT((surface->colorPitch[3] & 63) == 0);
	CELL_GCM_ASSERT((surface->depthPitch & 63) == 0);

	// pitch >= 64 && pitch < 128K
	CELL_GCM_ASSERT((surface->colorPitch[0] >= 64) && (surface->colorPitch[0] < 128*1024));
	CELL_GCM_ASSERT((surface->colorPitch[1] >= 64) && (surface->colorPitch[1] < 128*1024));
	CELL_GCM_ASSERT((surface->colorPitch[2] >= 64) && (surface->colorPitch[2] < 128*1024));
	CELL_GCM_ASSERT((surface->colorPitch[3] >= 64) && (surface->colorPitch[3] < 128*1024));
	CELL_GCM_ASSERT((surface->depthPitch >= 64) && (surface->depthPitch < 128*1024));

	// Check if "location" is valid
	CELL_GCM_ASSERT((surface->colorLocation[0] <= CELL_GCM_LOCATION_MAIN));
	CELL_GCM_ASSERT((surface->colorLocation[1] <= CELL_GCM_LOCATION_MAIN));
	CELL_GCM_ASSERT((surface->colorLocation[2] <= CELL_GCM_LOCATION_MAIN));
	CELL_GCM_ASSERT((surface->colorLocation[3] <= CELL_GCM_LOCATION_MAIN));
	CELL_GCM_ASSERT((surface->depthLocation <= CELL_GCM_LOCATION_MAIN));

	// Check if "width" is valid
	CELL_GCM_ASSERT((surface->width >= 1) && (surface->width <= 4096));
	// Check if "height" is valid
	CELL_GCM_ASSERT((surface->height >= 1) && (surface->height <= 4096));

	// Set Context Dma
	// (the DMA object handle is the frame-buffer handle plus the location value)
	CELL_GCM_METHOD_SET_CONTEXT_DMA_COLOR_A(CELL_GCM_CURRENT,
		CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER+surface->colorLocation[0]);
	CELL_GCM_METHOD_SET_CONTEXT_DMA_COLOR_B(CELL_GCM_CURRENT,
		CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER+surface->colorLocation[1]);
	CELL_GCM_METHOD_SET_CONTEXT_DMA_COLOR_C_D(CELL_GCM_CURRENT,
		CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER+surface->colorLocation[2],
		CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER+surface->colorLocation[3]);
	CELL_GCM_METHOD_SET_CONTEXT_DMA_Z(CELL_GCM_CURRENT,
		CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER+surface->depthLocation);


	// buffer format, color and z buffers
	// log2Width / log2Height are floor(log2(width / height)), computed as
	// 31 - count-leading-zeros with whatever primitive the platform offers.
#ifdef __SPU__
	uint32_t log2Width = 31 - __builtin_clz((uint32_t)surface->width);
	uint32_t log2Height = 31 - __builtin_clz((uint32_t)surface->height);
#endif
#ifdef __PPU__
#ifdef __SNC__
	// SNC compiler: cntlzw intrinsic
	uint32_t log2Width = 31 - __cntlzw(surface->width);
	uint32_t log2Height = 31 - __cntlzw(surface->height);
#else
	// other PPU compilers: cntlzw via inline asm; the statement expression uses
	// the variable being declared as scratch storage for the asm output
	uint32_t log2Width = 31 - ({__asm__("cntlzw %0,%1" : "=r" (log2Width) : "r" (surface->width)); log2Width;});
	uint32_t log2Height = 31 - ({__asm__("cntlzw %0,%1" : "=r" (log2Height) : "r" (surface->height)); log2Height;});
#endif
#endif
#if defined(__linux__) || defined(WIN32)
	// host builds: portable loop; i ends at the bit length of n (or 1 for n == 0),
	// so i-1 is floor(log2(n)) for n >= 1
	uint32_t n,i;

	n = surface->width;
	for(i=1; (n >> i) > 0; i++){}
	uint32_t log2Width = i-1;

	n = surface->height;
	for(i=1; (n >> i) > 0; i++){}
	uint32_t log2Height = i-1;
#endif

	CELL_GCM_METHOD_SET_SURFACE_FORMAT_PITCH_A_B_OFFSET_A_B_Z(CELL_GCM_CURRENT, 
		surface->colorFormat, surface->depthFormat, surface->antialias, surface->type, log2Width, log2Height,
		surface->colorPitch[0],
		surface->colorOffset[0],
		surface->depthOffset,
		surface->colorOffset[1],
		surface->colorPitch[1]);
	CELL_GCM_METHOD_SET_SURFACE_PITCH_Z(CELL_GCM_CURRENT, 
		surface->depthPitch);
	CELL_GCM_METHOD_SET_SURFACE_PITCH_C_D_OFFSET_C_D(CELL_GCM_CURRENT, 
		surface->colorPitch[2],
		surface->colorPitch[3],
		surface->colorOffset[2],
		surface->colorOffset[3]);
	CELL_GCM_METHOD_SET_SURFACE_COLOR_TARGET(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(surface->colorTarget));

	// Set the window origin
	CELL_GCM_METHOD_SET_WINDOW_OFFSET(CELL_GCM_CURRENT, 
		surface->x, surface->y);

	// surface clips - hw expects origin/size values
	CELL_GCM_METHOD_SET_SURFACE_CLIP_HORIZONTAL_VERTICAL(CELL_GCM_CURRENT, 
		surface->x, surface->width,
		surface->y, surface->height);

	// allow fp's to know x/y position
	// only when height == 4096, it should be 4096 - 1
	// (bit 12 of height is set only for 4096 within the asserted range, so the
	// expression subtracts 1 in exactly that case)
	CELL_GCM_METHOD_SET_SHADER_WINDOW(CELL_GCM_CURRENT,
		surface->height - (((surface->height) & 0x1000) >> 12), origin, pixelCenter);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

/*
 * Copies `sizeInWords` 32-bit words from CPU memory into the command buffer
 * as an inline 2D transfer that writes them to `dstOffset`.
 *
 *   dstOffset    destination offset; must be 4-byte aligned
 *   srcAdr       source words, read in ascending order
 *   sizeInWords  number of words to transfer; asserted to be <= 512
 *
 * The destination surface is set up at the enclosing 64-byte boundary and
 * the remainder of the offset is expressed as a starting pixel. The payload
 * is padded with one zero word when the count is odd, because only even
 * payload lengths are emitted.
 * Reserves 10 + padded payload command words.
 */
CELL_GCM_DECL void CELL_GCM_FUNC(SetInlineTransfer)(CELL_GCM_ARGS(const uint32_t dstOffset, const void *srcAdr, const uint32_t sizeInWords))
{
	// sanity checks
	CELL_GCM_ASSERT((dstOffset & 3) == 0);  // destination must be aligned
	CELL_GCM_ASSERT(sizeInWords < 2*896); // hw/class limit
	CELL_GCM_ASSERT(sizeInWords <= 512);   // our artifical limit

	// handle 64 byte alignment restriction: split the offset into an aligned
	// surface base and a word index inside that 64-byte line
	const uint32_t surfaceBase = dstOffset & ~63;
	const uint32_t startPixel = (dstOffset & 63) >> 2;

	// payload length rounded up to an even number of words
	const uint32_t payloadWords = (sizeInWords + 1) & ~1;

	CELL_GCM_RESERVE(10+payloadWords);

	CELL_GCM_METHOD_SURFACE2D_SET_OFFSET_DESTIN(CELL_GCM_CURRENT,
		surfaceBase);
	CELL_GCM_METHOD_SURFACE2D_SET_COLOR_FORMAT_PITCH(CELL_GCM_CURRENT,
		CELL_GCM_TRANSFER_SURFACE_FORMAT_Y32,
		0x1000, 0x1000);
	CELL_GCM_METHOD_INLINE2D_POINT_SIZE_OUT_IN(CELL_GCM_CURRENT,
		startPixel, 0,
		sizeInWords, 1,
		sizeInWords, 1);

	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV308A_COLOR, payloadWords);
	CELL_GCM_CURRENT += 1;

	// copy data into the command fifo
	const uint32_t *words = (const uint32_t*)srcAdr;
	for (uint32_t i = 0; i < sizeInWords; i++)
	{
		CELL_GCM_CURRENT[i] = CELL_GCM_ENDIAN_SWAP(words[i]);
	}
	CELL_GCM_CURRENT += sizeInWords;

	// odd word count: append the zero pad word
	if (sizeInWords & 1)
	{
		CELL_GCM_CURRENT[0] = 0;
		CELL_GCM_CURRENT += 1;
	}

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

/*
 * Inline transfer of `sizeInWords` words from `srcAdr` to `dstOffset` in the
 * memory selected by `location`.
 *
 * Selects the destination DMA object (frame-buffer handle + location) and
 * then emits the transfer itself through the SetInlineTransfer family.
 * Space for the whole sequence is reserved once up front: 12 words plus the
 * payload rounded up to an even word count.
 *
 *   location  asserted to be <= CELL_GCM_LOCATION_MAIN
 */
CELL_GCM_DECL void CELL_GCM_FUNC(InlineTransfer)(CELL_GCM_ARGS(const uint32_t dstOffset, const void *srcAdr, const uint32_t sizeInWords, const uint8_t location))
{
	CELL_GCM_ASSERT((location <= CELL_GCM_LOCATION_MAIN));
	CELL_GCM_RESERVE(12+((sizeInWords + 1) & ~1));

	CELL_GCM_METHOD_SURFACE2D_SET_CONTEXT_DMA_IMAGE_DESTIN(CELL_GCM_CURRENT, CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER + location);

	// NOTE(review): the non-UNSAFE build calls the "Unsafe" variant. This looks
	// inverted but is presumably deliberate: space was already reserved above,
	// so the nested call should not reserve again, and in the UNSAFE build
	// CELL_GCM_FUNC presumably already resolves to a non-reserving variant.
	// Confirm against the CELL_GCM_FUNC / CELL_GCM_RESERVE definitions.
#if CELL_GCM_UNSAFE
		CELL_GCM_FUNC(SetInlineTransfer)(CELL_GCM_ARGS_FUNC(dstOffset, srcAdr, sizeInWords));
#else
		CELL_GCM_FUNC(SetInlineTransferUnsafe)(CELL_GCM_ARGS_FUNC(dstOffset, srcAdr, sizeInWords));
#endif
}

// Copies a width x height pixel rectangle between two linear (pitched)
// images through the scale2d/surface2d methods, using fixed scale factors
// of 1<<20 on both axes.
//   mode          - CELL_GCM_TRANSFER_* direction; selects the source and
//                   destination DMA contexts.
//   dstOffset, dstPitch, dstX, dstY - destination image and rectangle
//                   origin (offset and pitch are asserted 64-byte aligned).
//   srcOffset, srcPitch, srcX, srcY - source image and rectangle origin.
//   width, height - rectangle size in pixels.
//   bytesPerPixel - 2 (R5G6B5) or 4 (A8R8G8B8); other values assert and
//                   fall back to format 0.
// Returns CELL_OK on completion and CELL_GCM_ERROR_FAILURE for an unknown
// mode (CELL_GCM_RESERVE_RET is given the same failure value).
CELL_GCM_DECL uint32_t CELL_GCM_FUNC(SetTransferImage)(CELL_GCM_ARGS(
	uint8_t mode, uint32_t dstOffset, uint32_t dstPitch, uint32_t dstX,
	uint32_t dstY, uint32_t srcOffset, uint32_t srcPitch, uint32_t srcX,
	uint32_t srcY, uint32_t width, uint32_t height, uint32_t bytesPerPixel))
{
	// large blits are cut at multiples of 1 << 10 destination pixels
	const uint32_t blockDim = 1 << 10;
	const uint32_t blockMask = blockDim - 1;
	uint32_t srcDma;
	uint32_t dstDma;
	uint32_t scaleFormat;
	uint32_t surfaceFormat;
	uint32_t xEnd;
	uint32_t yEnd;
	uint32_t curX;
	uint32_t curY;

	// pick the DMA contexts for the requested direction
	if (mode == CELL_GCM_TRANSFER_LOCAL_TO_LOCAL)
	{
		srcDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		dstDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
	}
	else if (mode == CELL_GCM_TRANSFER_MAIN_TO_LOCAL)
	{
		srcDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		dstDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
	}
	else if (mode == CELL_GCM_TRANSFER_LOCAL_TO_MAIN)
	{
		srcDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		dstDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
	}
	else if (mode == CELL_GCM_TRANSFER_MAIN_TO_MAIN)
	{
		srcDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		dstDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
	}
	else
	{
		return (uint32_t)CELL_GCM_ERROR_FAILURE;
	}

	CELL_GCM_RESERVE_RET(6, (uint32_t)CELL_GCM_ERROR_FAILURE);
	CELL_GCM_METHOD_SURFACE2D_SET_CONTEXT_DMA_IMAGE_DESTIN(CELL_GCM_CURRENT, dstDma);
	CELL_GCM_METHOD_SCALE2D_SET_CONTEXT_DMA_IMAGE(CELL_GCM_CURRENT, srcDma);
	CELL_GCM_METHOD_SCALE2D_SET_CONTEXT_SURFACE(CELL_GCM_CURRENT, CELL_GCM_CONTEXT_SURFACE2D);

	// sanity test
	CELL_GCM_ASSERT((dstOffset & 63) == 0); // 64byte alignment
	CELL_GCM_ASSERT((dstPitch & 63) == 0);  // 64byte alignment
	CELL_GCM_ASSERT(srcPitch < 0xffff);
	CELL_GCM_ASSERT(dstPitch < 0xffff);

	// determine color format (1 byte per pixel, LE_Y8, is not supported on curie)
	if (bytesPerPixel == 2)
	{
		scaleFormat = CELL_GCM_TRANSFER_SCALE_FORMAT_R5G6B5;
		surfaceFormat = CELL_GCM_TRANSFER_SURFACE_FORMAT_R5G6B5;
	}
	else if (bytesPerPixel == 4)
	{
		scaleFormat = CELL_GCM_TRANSFER_SCALE_FORMAT_A8R8G8B8;
		surfaceFormat = CELL_GCM_TRANSFER_SURFACE_FORMAT_A8R8G8B8;
	}
	else
	{
		scaleFormat = 0;
		surfaceFormat = 0;
		CELL_GCM_ASSERT(0);
	}

	// split large blits: walk the destination rectangle block by block
	xEnd = dstX + width;
	yEnd = dstY + height;
	curY = dstY;
	while (curY < yEnd)
	{
		// height of this row of blits: up to the next block boundary or
		// the end of the rectangle, whichever comes first
		const uint32_t blockTop = curY & ~blockMask;
		const uint32_t blockBottom = blockTop + blockDim;
		const uint32_t bltHeight = ((blockBottom < yEnd) ? blockBottom : yEnd) - curY;

		curX = dstX;
		while (curX < xEnd)
		{
			// width of this blit, clipped the same way
			const uint32_t blockLeft = curX & ~blockMask;
			const uint32_t blockRight = blockLeft + blockDim;
			const uint32_t bltWidth = ((blockRight < xEnd) ? blockRight : xEnd) - curX;

			// the destination surface is placed at the block origin, the
			// source offset points at the first pixel of this blit
			const uint32_t dstBlockOffset = bytesPerPixel * blockLeft + dstPitch * blockTop;
			const uint32_t srcBlockOffset = bytesPerPixel * (srcX + curX - dstX) + srcPitch * (srcY + curY - dstY);
			const uint32_t destinOffset = dstOffset + dstBlockOffset;
			const uint32_t inOffset = srcOffset + srcBlockOffset;

			// position of the blit inside the block
			const uint32_t outX = curX - blockLeft;
			const uint32_t outY = curY - blockTop;

			// handle bizarre class behavior: input width is at least 16 and even
			const uint32_t inWidth = (bltWidth < 16) ? 16 : ((bltWidth + 1) & ~1);

			CELL_GCM_RESERVE_RET(20, (uint32_t)CELL_GCM_ERROR_FAILURE);

			// destination surface for this block
			CELL_GCM_METHOD_SURFACE2D_SET_OFFSET_DESTIN(CELL_GCM_CURRENT, destinOffset);
			CELL_GCM_METHOD_SURFACE2D_SET_COLOR_FORMAT_PITCH(CELL_GCM_CURRENT,
				surfaceFormat,
				dstPitch, dstPitch);

			// clip and output rectangles are identical
			CELL_GCM_METHOD_SCALE2D_SET_COLOR_OPERATION_CLIP_IMAGE_OUT_DS_DX_DT_DY(CELL_GCM_CURRENT,
				CELL_GCM_TRANSFER_CONVERSION_TRUNCATE,
				scaleFormat,
				CELL_GCM_TRANSFER_OPERATION_SRCCOPY,
				outX, outY,
				bltWidth, bltHeight,
				outX, outY,
				bltWidth, bltHeight,
				1<<20,
				1<<20);

			// source image description
			CELL_GCM_METHOD_SCALE2D_IMAGE_IN(CELL_GCM_CURRENT,
				inWidth, bltHeight,
				srcPitch, CELL_GCM_TRANSFER_ORIGIN_CORNER, CELL_GCM_TRANSFER_INTERPOLATOR_ZOH,
				inOffset,
				0, 0);

			curX += bltWidth;
		}

		curY += bltHeight;
	}

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);

	return CELL_OK;
}

// Emits the copy2d commands that move rowCount rows of bytesPerRow bytes
// from srcOffset to dstOffset. No DMA context is selected in this function.
//   dstOffset, dstPitch - destination start offset and byte pitch.
//   srcOffset, srcPitch - source start offset and byte pitch.
//   bytesPerRow         - bytes per row (asserted >= 0).
//   rowCount            - number of rows (asserted >= 0).
// Rows are batched 0x7ff at a time and lines are cut into pieces of at most
// 0x3fffff bytes. A pitch outside [-32768, 32767] is handled by issuing
// every row as its own single-line copy. The output offset is set to 0 at
// the end.
CELL_GCM_DECL void CELL_GCM_FUNC(TransferData)(CELL_GCM_ARGS(uint32_t dstOffset, int32_t dstPitch, uint32_t srcOffset, int32_t srcPitch, int32_t bytesPerRow, int32_t rowCount))
{
	const int32_t minPitch = -32768;
	const int32_t maxPitch = 32767;
	const int32_t maxRowsPerBlit = 0x7ff;
	const uint32_t maxBytesPerLine = 0x3fffff;
	uint32_t remaining;	// bytes of the current line not yet issued
	uint32_t chunk;		// bytes issued by the current copy
	uint32_t rows;		// rows issued by the current copy

	// argument check
	CELL_GCM_ASSERT(bytesPerRow >= 0);
	CELL_GCM_ASSERT(rowCount >= 0);

	// tightly packed rows on both sides collapse into one long line
	if ((srcPitch == bytesPerRow) && (dstPitch == bytesPerRow))
	{
		bytesPerRow *= rowCount;
		rowCount = 1;
		srcPitch = 0;
		dstPitch = 0;
	}

	if ((srcPitch >= minPitch) && (srcPitch <= maxPitch) &&
		(dstPitch >= minPitch) && (dstPitch <= maxPitch))
	{
		// regular case: issue batches of rows
		while (rowCount > 0)
		{
			// clamp to limit
			rows = (rowCount > maxRowsPerBlit) ? maxRowsPerBlit : rowCount;

			// issue the batch column piece by column piece
			remaining = bytesPerRow;
			while (remaining > 0)
			{
				// clamp to limit
				chunk = (remaining > maxBytesPerLine) ? maxBytesPerLine : remaining;

				// do the blit
				CELL_GCM_RESERVE(9);
				CELL_GCM_METHOD_COPY2D_OFFSET_PITCH_LINE_FORMAT_NOTIFY(CELL_GCM_CURRENT,
					srcOffset + (bytesPerRow - remaining),
					dstOffset + (bytesPerRow - remaining),
					srcPitch,
					dstPitch,
					chunk,
					rows,
					1, 1,
					0);

				remaining -= chunk;
			}

			// advance to next set of rows
			srcOffset += rows * srcPitch;
			dstOffset += rows * dstPitch;
			rowCount -= rows;
		}
	}
	else
	{
		// unusual pitch values: blit one line at a time with a zero pitch
		// (could improve this case)
		for (; rowCount > 0; --rowCount)
		{
			remaining = bytesPerRow;
			while (remaining > 0)
			{
				// clamp to limit
				chunk = (remaining > maxBytesPerLine) ? maxBytesPerLine : remaining;

				// do the blit
				CELL_GCM_RESERVE(9);
				CELL_GCM_METHOD_COPY2D_OFFSET_PITCH_LINE_FORMAT_NOTIFY(CELL_GCM_CURRENT,
					srcOffset + (bytesPerRow - remaining),
					dstOffset + (bytesPerRow - remaining),
					0,
					0,
					chunk,
					1,
					1, 1,
					0);

				remaining -= chunk;
			}

			dstOffset += dstPitch;
			srcOffset += srcPitch;
		}
	}

	// leave the output offset at zero
	CELL_GCM_RESERVE(2);
	CELL_GCM_METHOD_COPY2D_OFFSET_OUT(CELL_GCM_CURRENT, 0);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Selects the source/destination DMA contexts for `mode` and then emits the
// row copy through TransferData.
//   mode      - CELL_GCM_TRANSFER_* direction.
//   dstOffset, dstPitch, srcOffset, srcPitch, bytesPerRow, rowCount
//             - forwarded unchanged to TransferData (which takes the
//               pitches and counts as signed 32-bit values).
// Returns CELL_OK, or CELL_GCM_ERROR_FAILURE for an unknown mode
// (CELL_GCM_RESERVE_RET is given the same failure value).
CELL_GCM_DECL uint32_t CELL_GCM_FUNC(SetTransferData)(CELL_GCM_ARGS(
	uint8_t mode, uint32_t dstOffset, uint32_t dstPitch, uint32_t srcOffset,
	uint32_t srcPitch, uint32_t bytesPerRow, uint32_t rowCount))
{
	uint32_t srcDma;
	uint32_t dstDma;

	if (mode == CELL_GCM_TRANSFER_MAIN_TO_LOCAL)
	{
		srcDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		dstDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
	}
	else if (mode == CELL_GCM_TRANSFER_LOCAL_TO_MAIN)
	{
		srcDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		dstDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
	}
	else if (mode == CELL_GCM_TRANSFER_LOCAL_TO_LOCAL)
	{
		srcDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		dstDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
	}
	else if (mode == CELL_GCM_TRANSFER_MAIN_TO_MAIN)
	{
		srcDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		dstDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
	}
	else
	{
		return (uint32_t)CELL_GCM_ERROR_FAILURE;
	}

	// set src/dst location
	CELL_GCM_RESERVE_RET(3, (uint32_t)CELL_GCM_ERROR_FAILURE);
	CELL_GCM_METHOD_COPY2D_SET_CONTEXT_DMA_BUFFER(CELL_GCM_CURRENT, srcDma, dstDma);

	// the copy itself
	CELL_GCM_FUNC(TransferData)(CELL_GCM_ARGS_FUNC(dstOffset, dstPitch, srcOffset, srcPitch, bytesPerRow, rowCount));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);

	return CELL_OK;
}

// Copies a width x height rectangle from a linear (pitched) source image
// into a swizzled destination texture of dstWidth x dstHeight texels.
//   dstOffset, dstWidth, dstHeight - destination texture; both dimensions
//       are asserted to be powers of two.
//   dstX, dstY    - rectangle origin inside the destination.
//   srcOffset, srcPitch, srcX, srcY - source image and rectangle origin.
//   width, height - rectangle size (asserted non-zero, < 4096 and inside
//       the destination).
//   bytesPerPixel - 2, 4, 8 or 16. 8 and 16 are rewritten below as 2 or 4
//       four-byte pixels per original pixel (all x quantities are scaled).
//   mode          - CELL_GCM_TRANSFER_* direction; unknown values are
//       treated like CELL_GCM_TRANSFER_MAIN_TO_LOCAL.
// A destination whose (possibly widened) width is at most 2 texels, or whose
// height is 1, is copied with TransferData. Everything else goes through the
// swizzle surface in blocks of at most 1 << 10 texels per axis.
CELL_GCM_DECL void CELL_GCM_FUNC(SetConvertSwizzleFormat)(CELL_GCM_ARGS(
	uint32_t dstOffset, uint32_t dstWidth, uint32_t dstHeight, uint32_t dstX, uint32_t dstY,
	uint32_t srcOffset, uint32_t srcPitch, uint32_t srcX, uint32_t srcY, uint32_t width, uint32_t height, uint32_t bytesPerPixel,
	uint8_t mode))
{
	uint32_t srcHandle,dstHandle;

	// select the DMA contexts; note that `default` shares the
	// MAIN_TO_LOCAL case instead of reporting an error
	switch(mode)
	{
	  case CELL_GCM_TRANSFER_MAIN_TO_LOCAL:
	  default:
		  srcHandle = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		  dstHandle = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		  break;
	  case CELL_GCM_TRANSFER_LOCAL_TO_MAIN:
		  srcHandle = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		  dstHandle = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		  break;
	  case CELL_GCM_TRANSFER_LOCAL_TO_LOCAL:
		  srcHandle = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		  dstHandle = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		  break;
	  case CELL_GCM_TRANSFER_MAIN_TO_MAIN:
		  srcHandle = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		  dstHandle = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		  break;
	}

    uint32_t NV_MEM2MEM_MAX_HEIGHT_VALUE = 2047;
    uint32_t NV_SURFACE_SWIZZLED_MAX_DIM = 10;

	// dstwlog2/dsthlog2 = floor(log2()) of the destination width/height,
	// computed with a count-leading-zeros instruction where one is
	// available and with a shift loop on linux/WIN32
#ifdef __SPU__
	uint32_t dstwlog2 = 31 - __builtin_clz(dstWidth);
	uint32_t dsthlog2 = 31 - __builtin_clz(dstHeight);
#endif
#ifdef __PPU__
#ifdef __SNC__
	uint32_t dstwlog2 = 31 - __cntlzw(dstWidth);
	uint32_t dsthlog2 = 31 - __cntlzw(dstHeight);
#else
	uint32_t dstwlog2 = 31 - ({__asm__("cntlzw %0,%1" : "=r" (dstwlog2) : "r" (dstWidth)); dstwlog2;});
	uint32_t dsthlog2 = 31 - ({__asm__("cntlzw %0,%1" : "=r" (dsthlog2) : "r" (dstHeight)); dsthlog2;});
#endif
#endif
#if defined(__linux__) || defined(WIN32)
	uint32_t n,i;

	n = dstWidth;
	for(i=1; (n >> i) > 0; i++){}
	uint32_t dstwlog2 = i-1;

	n = dstHeight;
	for(i=1; (n >> i) > 0; i++){}
	uint32_t dsthlog2 = i-1;
#endif

    // a few sanity checks
    CELL_GCM_ASSERT(height && width);
    CELL_GCM_ASSERT((width<4096) && (height<4096));
    CELL_GCM_ASSERT((height <= dstHeight) && (width <= dstWidth));
    CELL_GCM_ASSERT(((dstY + height) <= dstHeight) && ((dstX + width) <= dstWidth));
    CELL_GCM_ASSERT(((dstWidth & (dstWidth - 1)) == 0) && (((dstHeight & (dstHeight - 1)) == 0)));
    CELL_GCM_ASSERT(srcPitch < 0xffff);

    // 8 and 16 byte pixels are processed as runs of 4 byte pixels: every
    // horizontal quantity is scaled up and bytesPerPixel is scaled down
    switch (bytesPerPixel)
    {
        case 2:
        case 4:
            break;
        case 8:
            dstWidth <<= 1;
            dstX <<= 1;
            srcX <<= 1;
            width <<= 1;
            bytesPerPixel >>= 1;
            dstwlog2 += 1;
            break;
        case 16:
            dstWidth <<= 2;
            dstX <<= 2;
            srcX <<= 2;
            width <<= 2;
            bytesPerPixel >>= 2;
            dstwlog2 += 2;
            break;
        default:
            CELL_GCM_ASSERT(0);
            break;
    }

    // destination is a 1xN or Nx1 ? => swizzled result is a Nx1 linear texture
    // destination is a 2xN          => col 0 and col1 1 are interleaved
    // (NVXX_CONTEXT_SURFACE_SWIZZLED does not support an 1xN or Nx1 destination)
    if ((dstwlog2 <= 1) || (dsthlog2 == 0))
    {
		// set src/dst location
		CELL_GCM_RESERVE(3);
		CELL_GCM_METHOD_COPY2D_SET_CONTEXT_DMA_BUFFER(CELL_GCM_CURRENT, srcHandle, dstHandle);

        uint32_t dstPitch;
        uint32_t linesLeft;

        // Nx1 mapping is (x_n, ..., x_0)
        // 1xN mapping is (y_n, ..., y_0)
        // 2xN mapping is (y_n, ..., y_0, x_0)

        // get rid of src/dst position
		dstPitch = bytesPerPixel << dstwlog2;
		srcOffset = srcOffset + srcX * bytesPerPixel + srcY * srcPitch;
		dstOffset = dstOffset + dstX * bytesPerPixel + dstY * dstPitch;

        // MEM2MEM maximum height is 2047..
        for(linesLeft = height; linesLeft;)
        {
            // actualHeight = min(NV_MEM2MEM_MAX_HEIGHT_VALUE, linesLeft);
            uint32_t actualHeight = (linesLeft > NV_MEM2MEM_MAX_HEIGHT_VALUE)
                               ?  NV_MEM2MEM_MAX_HEIGHT_VALUE
                               :  linesLeft;

			// todo: this is incorrect for the vid->vid case
			CELL_GCM_FUNC(TransferData)(CELL_GCM_ARGS_FUNC(dstOffset, dstPitch, srcOffset, srcPitch, width*bytesPerPixel, actualHeight));

            srcOffset = srcOffset + actualHeight * srcPitch;
            dstOffset = dstOffset + actualHeight * dstPitch;
            linesLeft -= actualHeight;
        }
        // note: this return skips the CELL_GCM_DEBUG_FINISH at the end of
        // the function
        return;
    }
    else
    {
		// set src/dst location
		CELL_GCM_RESERVE(6);
		CELL_GCM_METHOD_SWIZZLE2D_SET_CONTEXT_DMA_IMAGE(CELL_GCM_CURRENT, dstHandle);
		CELL_GCM_METHOD_SCALE2D_SET_CONTEXT_DMA_IMAGE(CELL_GCM_CURRENT, srcHandle);
		CELL_GCM_METHOD_SCALE2D_SET_CONTEXT_SURFACE(CELL_GCM_CURRENT, CELL_GCM_CONTEXT_SWIZZLE2D);

#ifdef CELL_GCM_ASSERT_ENABLE
        uint32_t origSrcOffset;
#endif
        uint32_t srcFormat;
        uint32_t dstFormat;
        uint32_t logWidthLimit;
        uint32_t logHeightLimit;
        uint32_t yTop;
        uint32_t xEnd;
        uint32_t yEnd;
        uint32_t x;
        uint32_t y;

        // note:
        //  NVXX_CONTEXT_SURFACE_SWIZZLED expects the destination to be 64byte aligned,
        //  and only lods with <= 16 texels (i.e. 4x4, 8x2, 2x8, 16x1, 1x16@16bit) can
        //  cause the lower lods to be unaligned (32texels@16 bit is a multiple of 64).
        //  [note that 2x4 is not handled with mem2mem but 2x4 is supposed to be well
        //   aligned, because the earlier lod 4x8 had 32 texels!]
        //  iow: unaligned lods have a either a width or height of 2 or 1.
        //  -- a assert guards this logic!)
        CELL_GCM_ASSERT((dstWidth >= 4) && (dstHeight >= 2));

        // determine color format
		switch(bytesPerPixel)
		{
		case 2:
			srcFormat = CELL_GCM_TRANSFER_SCALE_FORMAT_R5G6B5;
			dstFormat = CELL_GCM_TRANSFER_SURFACE_FORMAT_R5G6B5;
			break;
		case 4:
			srcFormat = CELL_GCM_TRANSFER_SCALE_FORMAT_A8R8G8B8;
			dstFormat = CELL_GCM_TRANSFER_SURFACE_FORMAT_A8R8G8B8;
			break;
		case 1: // LE_Y8 is not supported on curie
		default:
			srcFormat = 0;
			dstFormat = 0;
			CELL_GCM_ASSERT(0);
			break;
		}

        // The HW cannot handle arbitrarily large blts, so the blit is split into
        // multiple blocks. The regions are aligned to the dst. These are the begin/end
        // of a given block.
        logWidthLimit  = (dstwlog2 > NV_SURFACE_SWIZZLED_MAX_DIM ) ? NV_SURFACE_SWIZZLED_MAX_DIM : dstwlog2;
        logHeightLimit = (dsthlog2 > NV_SURFACE_SWIZZLED_MAX_DIM ) ? NV_SURFACE_SWIZZLED_MAX_DIM : dsthlog2;

        // align the Src Blt to the Dst, that way we can forget about srcX and srcY.
#ifdef CELL_GCM_ASSERT_ENABLE
        origSrcOffset = srcOffset;
#endif
        srcOffset += (srcX - dstX) * bytesPerPixel + (srcY - dstY) * srcPitch;

        // blit limits
        xEnd = dstX + width;
        yEnd = dstY + height;

        // For the top row of blocks, yTop != y
        yTop = dstY & ~((1 << NV_SURFACE_SWIZZLED_MAX_DIM) - 1);
        for(y = dstY; y < yEnd;)
        {
            uint32_t xLeft;
            uint32_t yBottom;
            uint32_t bltHeight;

            // determine actual copy height for this iteration
            yBottom = yTop + (1 << NV_SURFACE_SWIZZLED_MAX_DIM);
            if(yBottom > (1ul << dsthlog2))
            {
                yBottom = (1 << dsthlog2);
            }
            bltHeight = (yBottom > yEnd) ? yEnd - y : yBottom - y;

            // for the left column of blocks, xLeft != x
            xLeft = dstX & ~((1 << NV_SURFACE_SWIZZLED_MAX_DIM) - 1);
            for(x = dstX; x < xEnd;)
            {
                uint32_t xRight;
                uint32_t bltWidth;
                uint32_t blockSrcOffset;
                uint32_t blockDstOffset;
                uint32_t blockX;
                uint32_t blockY;
                uint32_t srcWidth;

                // determine actual copy width for this iteration
                xRight = xLeft + (1 << NV_SURFACE_SWIZZLED_MAX_DIM);
                bltWidth = (xRight > xEnd ) ? xEnd - x : xRight - x;

                // NVXX_CONTEXT_SURFACE_SWIZZLED ignores the lower bits of the 
                // destination offset.
                // NOTE(review): this else-branch is only entered when
                // dstwlog2 > 1 and dsthlog2 != 0, so the two special cases
                // below look unreachable here -- confirm before removing.
				if (!dstwlog2)
				{
					blockDstOffset = dstOffset + yTop * bytesPerPixel;
				}
				else if (!dsthlog2)
				{
					blockDstOffset = dstOffset + xLeft * bytesPerPixel;
				}
				else
				{
					// #'common' bits
					uint32_t log = (dstwlog2 < dsthlog2) ? dstwlog2 : dsthlog2;  
					// # of bits to interleave
					uint32_t doubleLog = log << 1;                     
					// bits to preserve
					uint32_t upperMask = ~((1 << doubleLog) - 1);      
					// bits to interleave
					uint32_t lowerMask = ~upperMask;                   

					// calc offset
					// the low `doubleLog` bits interleave x (even bit
					// positions) and y (odd bit positions); the bits of
					// the longer axis above that are appended linearly
					uint32_t upperU = (xLeft << log) & upperMask;
					uint32_t upperV = (yTop << log) & upperMask;
					uint32_t lower  = ((xLeft & 0x001) <<  0) | ((yTop & 0x001) <<  1)
									| ((xLeft & 0x002) <<  1) | ((yTop & 0x002) <<  2)
									| ((xLeft & 0x004) <<  2) | ((yTop & 0x004) <<  3)
									| ((xLeft & 0x008) <<  3) | ((yTop & 0x008) <<  4)
									| ((xLeft & 0x010) <<  4) | ((yTop & 0x010) <<  5)
									| ((xLeft & 0x020) <<  5) | ((yTop & 0x020) <<  6)
									| ((xLeft & 0x040) <<  6) | ((yTop & 0x040) <<  7)
									| ((xLeft & 0x080) <<  7) | ((yTop & 0x080) <<  8)
									| ((xLeft & 0x100) <<  8) | ((yTop & 0x100) <<  9)
									| ((xLeft & 0x200) <<  9) | ((yTop & 0x200) << 10)
									| ((xLeft & 0x400) << 10) | ((yTop & 0x400) << 11)
									| ((xLeft & 0x800) << 11) | ((yTop & 0x800) << 12);
					CELL_GCM_ASSERT((xLeft < 4096) && (yTop < 4096));
					blockDstOffset = dstOffset + ((lower & lowerMask) | upperU | upperV) * bytesPerPixel;
				}
				
				CELL_GCM_ASSERT((blockDstOffset & 0x3f) == 0); // ** SERIOUS (RENDERING) ERROR **

                // clip - blockX and blockY are the X and Y offsets within this block
                blockX = x & ((1 << NV_SURFACE_SWIZZLED_MAX_DIM) - 1);
                blockY = y & ((1 << NV_SURFACE_SWIZZLED_MAX_DIM) - 1);

                // compute blt location in src
                blockSrcOffset = srcOffset + x * bytesPerPixel + y * srcPitch;
                CELL_GCM_ASSERT(blockSrcOffset >= origSrcOffset);

                // handle bizarre class behavior
                srcWidth = (bltWidth < 16) ? 16 : (bltWidth + 1) & ~1;

                // set dst format/offset
				CELL_GCM_RESERVE(18);
				CELL_GCM_METHOD_SWIZZLE2D_SET_FORMAT_OFFSET(CELL_GCM_CURRENT,
					dstFormat, logWidthLimit, logHeightLimit,
					blockDstOffset);

                // set src + first blit
				CELL_GCM_METHOD_SCALE2D_SET_COLOR_OPERATION_CLIP_IMAGE_OUT_DS_DX_DT_DY(CELL_GCM_CURRENT, 
					CELL_GCM_TRANSFER_CONVERSION_TRUNCATE, 
					srcFormat,
					CELL_GCM_TRANSFER_OPERATION_SRCCOPY,
					blockX, blockY,
					bltWidth, bltHeight,
					blockX, blockY,
					bltWidth, bltHeight,
					1 << 20,
					1 << 20);

				CELL_GCM_METHOD_SCALE2D_IMAGE_IN(CELL_GCM_CURRENT, 
					srcWidth, bltHeight,
					srcPitch, CELL_GCM_TRANSFER_ORIGIN_CORNER, CELL_GCM_TRANSFER_INTERPOLATOR_ZOH,
					blockSrcOffset,
					0, 0);

                // increment in X
                x = xLeft = xRight;
            }
        
            // increment in Y
            y = yTop = yBottom;
        }
    }

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Sets up an inline 2D transfer of `count` words to `offset` but leaves the
// payload unwritten: *pointer receives the position in the command fifo
// where the payload belongs, and the fifo position is advanced past it.
//   offset  - destination offset; asserted to be 4-byte aligned.
//   count   - payload size in words (asserted <= CELL_GCM_MAX_METHOD_COUNT);
//             the space skipped is `count` rounded up to an even value.
//   pointer - out: start of the payload area inside the command fifo.
// Nothing is written into the payload area here, including the padding word
// of an odd count.
CELL_GCM_DECL void CELL_GCM_FUNC(SetInlineTransferPointer)(CELL_GCM_ARGS(const uint32_t offset, const uint32_t count, void **pointer))
{
	// check the limitation
	CELL_GCM_ASSERT((offset & 3) == 0);  // 4B alignment because format Y32
	CELL_GCM_ASSERT(count <= CELL_GCM_MAX_METHOD_COUNT);

	// surface2d offsets must be 64B aligned; the remainder is expressed as
	// a pixel position relative to the aligned offset
	const uint32_t alignedOffset = offset & ~63;
	const uint32_t pixelShift = (offset & 63) >> 2;

	// even width only
	const uint32_t paddedCount = (count + 1) & ~1;

	CELL_GCM_RESERVE(10 + paddedCount);

	CELL_GCM_METHOD_SURFACE2D_SET_OFFSET_DESTIN(CELL_GCM_CURRENT, alignedOffset);
	CELL_GCM_METHOD_SURFACE2D_SET_COLOR_FORMAT_PITCH(CELL_GCM_CURRENT,
		CELL_GCM_TRANSFER_SURFACE_FORMAT_Y32,
		0x1000, 0x1000);
	CELL_GCM_METHOD_INLINE2D_POINT_SIZE_OUT_IN(CELL_GCM_CURRENT,
		pixelShift, 0,
		count, 1,
		count, 1);

	// payload header
	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV308A_COLOR, paddedCount);
	CELL_GCM_CURRENT += 1;

	// hand the payload area to the caller and step over it
	*pointer = CELL_GCM_CURRENT;
	CELL_GCM_CURRENT += paddedCount;
}

// Emits the copy2d pitch/line/format parameters without starting a copy.
//   inPitch, outPitch   - byte pitches, asserted to lie in [-32768, 32767].
//   lineLength          - asserted <= 0x3fffff.
//   lineCount           - asserted <= 2047.
//   inFormat, outFormat - asserted to be 1, 2 or 4.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTransferDataFormat)(CELL_GCM_ARGS(const int32_t inPitch, const int32_t outPitch, const uint32_t lineLength, const uint32_t lineCount, const uint8_t inFormat, const uint8_t outFormat))
{
	// pitch limits
	CELL_GCM_ASSERT((inPitch >= -32768) && (inPitch <= 32767));
	CELL_GCM_ASSERT((outPitch >= -32768) && (outPitch <= 32767));

	// line limits
	CELL_GCM_ASSERT(lineLength <= 0x3fffff);
	CELL_GCM_ASSERT(lineCount <= 2047);

	// format limits
	CELL_GCM_ASSERT((inFormat == 1) || (inFormat == 2) || (inFormat == 4));
	CELL_GCM_ASSERT((outFormat == 1) || (outFormat == 2) || (outFormat == 4));

	// reserve word size
	CELL_GCM_RESERVE(6);

	CELL_GCM_METHOD_COPY2D_PITCH_LINE_FORMAT(CELL_GCM_CURRENT,
		inPitch, outPitch,
		lineLength, lineCount,
		CELL_GCM_COMMAND_CAST(inFormat), CELL_GCM_COMMAND_CAST(outFormat));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Sets the copy2d input/output offsets and issues the buffer-notify method
// that starts the transfer; afterwards the output offset is set back to 0.
//   dstOffset - output (destination) offset.
//   srcOffset - input (source) offset.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTransferDataOffset)(CELL_GCM_ARGS(const uint32_t dstOffset, const uint32_t srcOffset))
{
	// reserve word size for the three methods below
	CELL_GCM_RESERVE(6);

	CELL_GCM_METHOD_COPY2D_OFFSET_IN(CELL_GCM_CURRENT, srcOffset);
	CELL_GCM_METHOD_COPY2D_OFFSET_OUT(CELL_GCM_CURRENT, dstOffset);

	// start the transfer
	CELL_GCM_METHOD_COPY2D_BUFFER_NOTIFY(CELL_GCM_CURRENT, 0);

	// reset the output offset to zero once the transfer has been issued
	CELL_GCM_RESERVE(2);
	CELL_GCM_METHOD_COPY2D_OFFSET_OUT(CELL_GCM_CURRENT, 0);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Binds the DMA contexts and the destination surface kind used by the
// scaled transfer methods.
//   mode    - CELL_GCM_TRANSFER_* direction; an unknown value asserts and
//             uses 0 for both handles.
//   surface - CELL_GCM_TRANSFER_SURFACE (linear 2D surface) or
//             CELL_GCM_TRANSFER_SWIZZLE (swizzled surface); any other value
//             asserts and emits nothing after the reserve.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTransferScaleMode)(CELL_GCM_ARGS(const uint8_t mode, const uint8_t surface))
{
	uint32_t srcDma;
	uint32_t dstDma;

	if (mode == CELL_GCM_TRANSFER_MAIN_TO_LOCAL)
	{
		srcDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		dstDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
	}
	else if (mode == CELL_GCM_TRANSFER_LOCAL_TO_MAIN)
	{
		srcDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		dstDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
	}
	else if (mode == CELL_GCM_TRANSFER_LOCAL_TO_LOCAL)
	{
		srcDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
		dstDma = CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER;
	}
	else if (mode == CELL_GCM_TRANSFER_MAIN_TO_MAIN)
	{
		srcDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
		dstDma = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
	}
	else
	{
		srcDma = 0;
		dstDma = 0;
		CELL_GCM_ASSERT(0);
	}

	// reserve size
	CELL_GCM_RESERVE(6);

	if (surface == CELL_GCM_TRANSFER_SURFACE)
	{
		// scale2d reads from srcDma and writes through the linear 2D surface
		CELL_GCM_METHOD_SCALE2D_SET_CONTEXT_DMA_IMAGE(CELL_GCM_CURRENT, srcDma);
		CELL_GCM_METHOD_SCALE2D_SET_CONTEXT_SURFACE(CELL_GCM_CURRENT, CELL_GCM_CONTEXT_SURFACE2D);
		CELL_GCM_METHOD_SURFACE2D_SET_CONTEXT_DMA_IMAGE_DESTIN(CELL_GCM_CURRENT, dstDma);
	}
	else if (surface == CELL_GCM_TRANSFER_SWIZZLE)
	{
		// scale2d reads from srcDma and writes through the swizzled surface
		CELL_GCM_METHOD_SCALE2D_SET_CONTEXT_DMA_IMAGE(CELL_GCM_CURRENT, srcDma);
		CELL_GCM_METHOD_SCALE2D_SET_CONTEXT_SURFACE(CELL_GCM_CURRENT, CELL_GCM_CONTEXT_SWIZZLE2D);
		CELL_GCM_METHOD_SWIZZLE2D_SET_CONTEXT_DMA_IMAGE(CELL_GCM_CURRENT, dstDma);
	}
	else
	{
		CELL_GCM_ASSERT(0);
	}
}

// Emits one scaled blit from the source described by `scale` into the
// linear 2D surface described by `surface`.
//   scale   - source image, clip/output rectangles and ratios; inW is
//             asserted to be an even value in [2, 2046] and inH <= 2047.
//   surface - destination format, pitch and offset; pitch and offset are
//             asserted to be 64-byte aligned, with pitch >= 64.
// The final IMAGE_IN method starts the transfer.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTransferScaleSurface)(CELL_GCM_ARGS(const CellGcmTransferScale *scale, CellGcmTransferSurface *surface))
{
	// source size limits
	CELL_GCM_ASSERT((scale->inW >= 2) && (scale->inW <= 2046));
	CELL_GCM_ASSERT((scale->inW & 1) == 0);        // multiple of 2
	CELL_GCM_ASSERT(scale->inH <= 2047);

	// destination limits
	CELL_GCM_ASSERT((surface->pitch & 63) == 0);   // 64B alignment
	CELL_GCM_ASSERT(surface->pitch >= 64);
	CELL_GCM_ASSERT((surface->offset & 63) == 0);  // 64B alignment

	// reserve word size
	CELL_GCM_RESERVE(20);

	// destination: Surface2D methods
	CELL_GCM_METHOD_SURFACE2D_SET_COLOR_FORMAT_PITCH_OFFSET(CELL_GCM_CURRENT,
		surface->format, surface->pitch, surface->offset);

	// operation, clip and output rectangles, ratios: Scale2D methods
	CELL_GCM_METHOD_SCALE2D_SET_COLOR_OPERATION_CLIP_IMAGE_OUT_DS_DX_DT_DY(CELL_GCM_CURRENT,
		CELL_GCM_TRANSFER_CONVERSION_TRUNCATE,
		scale->format,
		CELL_GCM_TRANSFER_OPERATION_SRCCOPY,
		scale->clipX, scale->clipY, scale->clipW, scale->clipH,
		scale->outX, scale->outY, scale->outW, scale->outH,
		scale->ratioX, scale->ratioY);

	// source image; this starts the transfer
	CELL_GCM_METHOD_SCALE2D_IMAGE_IN(CELL_GCM_CURRENT,
		scale->inW, scale->inH,
		scale->pitch, scale->origin, scale->interp,
		scale->offset,
		scale->inX, scale->inY);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Emits one scaled blit from the source described by `scale` into the
// swizzled surface described by `swizzle`.
//   scale   - source image, clip/output rectangles and ratios; inW is
//             asserted to be an even value in [2, 2046] and inH <= 2047.
//   swizzle - destination format, width, height and offset; width and
//             height are asserted <= 11 (the original comment describes
//             this as "less than equal 2^11", i.e. they appear to be log2
//             sizes) and offset is asserted 64-byte aligned.
// The final IMAGE_IN method starts the transfer.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTransferScaleSwizzle)(CELL_GCM_ARGS(const CellGcmTransferScale *scale, CellGcmTransferSwizzle *swizzle))
{
	// source size limits
	CELL_GCM_ASSERT((scale->inW >= 2) && (scale->inW <= 2046));
	CELL_GCM_ASSERT((scale->inW & 1) == 0);         // multiple of 2
	CELL_GCM_ASSERT(scale->inH <= 2047);

	// destination limits
	CELL_GCM_ASSERT(swizzle->width <= 11);          // less than or equal to 2^11
	CELL_GCM_ASSERT(swizzle->height <= 11);         // less than or equal to 2^11
	CELL_GCM_ASSERT((swizzle->offset & 63) == 0);   // 64B alignment

	// reserve word size
	CELL_GCM_RESERVE(18);

	// destination: Swizzle2D methods
	CELL_GCM_METHOD_SWIZZLE2D_SET_FORMAT_OFFSET(CELL_GCM_CURRENT,
		swizzle->format, swizzle->width, swizzle->height,
		swizzle->offset);

	// operation, clip and output rectangles, ratios: Scale2D methods
	CELL_GCM_METHOD_SCALE2D_SET_COLOR_OPERATION_CLIP_IMAGE_OUT_DS_DX_DT_DY(CELL_GCM_CURRENT,
		CELL_GCM_TRANSFER_CONVERSION_TRUNCATE,
		scale->format,
		CELL_GCM_TRANSFER_OPERATION_SRCCOPY,
		scale->clipX, scale->clipY, scale->clipW, scale->clipH,
		scale->outX, scale->outY, scale->outW, scale->outH,
		scale->ratioX, scale->ratioY);

	// source image; this starts the transfer
	CELL_GCM_METHOD_SCALE2D_IMAGE_IN(CELL_GCM_CURRENT,
		scale->inW, scale->inH,
		scale->pitch, scale->origin, scale->interp,
		scale->offset,
		scale->inX, scale->inY);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Copies `count` report entries, starting at entry `index`, from the report
// DMA context to `offset` in the host buffer context. Entries are addressed
// in units of 16 bytes.
//   offset - destination offset for the copied entries.
//   index  - first report entry to copy.
//   count  - number of entries; (index + count) is asserted to be < 2048.
// NOTE(review): the assert rejects index + count == 2048; whether that is
// intended depends on the size of the report area -- confirm.
CELL_GCM_DECL void CELL_GCM_FUNC(SetTransferReportData)(CELL_GCM_ARGS(const uint32_t offset, const uint32_t index, const uint32_t count))
{
	// byte position and byte length of the requested entries
	const uint32_t reportByteOffset = index * 16;
	const uint32_t reportByteCount = count * 16;

	CELL_GCM_RESERVE(12);

	CELL_GCM_ASSERT((index + count) < 2048);

	// set context dma report -> main memory
	CELL_GCM_METHOD_COPY2D_SET_CONTEXT_DMA_BUFFER(CELL_GCM_CURRENT,
		CELL_GCM_CONTEXT_DMA_TO_MEMORY_GET_REPORT,
		CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER);

	// copy the entries as one line
	CELL_GCM_METHOD_COPY2D_OFFSET_PITCH_LINE_FORMAT_NOTIFY(CELL_GCM_CURRENT,
		reportByteOffset, offset,	// OffsetIn, OffsetOut
		0, 0,						// PitchIn, PitchOut
		reportByteCount, 1,			// LineLengthIn, LineCount
		1, 1,						// Format in, out
		0);

	// hw bug workaround, reset offset to zero
	CELL_GCM_RESERVE(2);
	CELL_GCM_METHOD_COPY2D_OFFSET_OUT(CELL_GCM_CURRENT, 0);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Draws `count` consecutive vertices starting at vertex `first` with
// primitive type `mode`.
//   mode  - primitive type, passed to SET_BEGIN_END.
//   first - first vertex (asserted < 0xfffff).
//   count - vertex count (asserted > 0 and first + count <= 0xfffff).
// Every DRAW_ARRAYS word holds a start vertex plus (vertex count - 1) in its
// top byte, so the draw is emitted as one leading batch of 1-256 vertices
// followed by full batches of 256, grouped into methods of at most
// CELL_GCM_MAX_METHOD_COUNT words.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDrawArrays)(CELL_GCM_ARGS(uint8_t mode,
	uint32_t first, uint32_t count))
{
	// parameter check
	CELL_GCM_ASSERT(count > 0);
	CELL_GCM_ASSERT(first < 0xfffff);
	CELL_GCM_ASSERT((first+count) <= 0xfffff);

	// split (count - 1) into the leading batch (low byte, already in the
	// "count minus one" form) and the number of full 256-vertex batches
	const uint32_t lastVertex = count - 1;
	const uint32_t headCount = lastVertex & 0xff;
	uint32_t fullBatches = lastVertex >> 8;
	uint32_t j;

	// reserve buffer size
	CELL_GCM_RESERVE(8);

	// hw bug workaround, send 3 invalidate vertex file
	CELL_GCM_METHOD_INVALIDATE_VERTEX_FILE_3(CELL_GCM_CURRENT);

	// begin, then draw the first batch of 1-256...
	CELL_GCM_METHOD_SET_BEGIN_END(CELL_GCM_CURRENT, CELL_GCM_COMMAND_CAST(mode));
	CELL_GCM_METHOD_DRAW_ARRAYS(CELL_GCM_CURRENT, first, headCount);
	first += headCount + 1;

	// ...then the complete batches of 256, at most
	// CELL_GCM_MAX_METHOD_COUNT words per method (hw limit)
	while (fullBatches > 0)
	{
		const uint32_t words = (fullBatches > CELL_GCM_MAX_METHOD_COUNT) ? CELL_GCM_MAX_METHOD_COUNT : fullBatches;

		CELL_GCM_RESERVE(1 + words);

		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD_NI(CELL_GCM_NV4097_DRAW_ARRAYS, words);
		CELL_GCM_CURRENT++;

		for (j = 0; j < words; j++)
		{
			CELL_GCM_CURRENT[0] = CELL_GCM_ENDIAN_SWAP((first) | ((255U)<<24));
			CELL_GCM_CURRENT++;
			first += 256;
		}

		fullBatches -= words;
	}

	// end
	CELL_GCM_RESERVE(2);
	CELL_GCM_METHOD_SET_BEGIN_END(CELL_GCM_CURRENT, 0);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Draws `count` indices from an index buffer with primitive type `mode`.
//   mode     - primitive type, passed to SET_BEGIN_END.
//   count    - number of indices (asserted non-zero and <= 0xfffff after
//              the leading unaligned part has been removed).
//   type     - index type; CELL_GCM_DRAW_INDEX_ARRAY_TYPE_32 means 4-byte
//              indices, anything else is treated as 2-byte indices.
//   location - memory location of the index buffer (asserted to be at most
//              CELL_GCM_LOCATION_MAIN).
//   indicies - offset of the index buffer; the top three bits are asserted
//              clear and the offset must be 2-byte aligned.
// When the buffer does not start on a 128-byte boundary, the indices up to
// the boundary are drawn first so that the rest starts aligned. The rest is
// emitted in batches of 256 indices per DRAW_INDEX_ARRAY word.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDrawIndexArray)(CELL_GCM_ARGS(uint8_t mode,
	uint32_t count, uint8_t type, uint8_t location, uint32_t indicies))
{
	uint32_t startOffset;
	uint32_t bytesToBoundary;
	uint32_t leadingIndexCount;
	uint32_t tailWords;
	uint32_t j;
	uint32_t nextIndex = 0;
	uint32_t remaining = count;

	CELL_GCM_ASSERT((location <= CELL_GCM_LOCATION_MAIN));
	CELL_GCM_ASSERT((indicies & 0xe0000000) == 0);
	startOffset = (indicies & 0x1fffffff);
	/* alignment restriction, SET_INDEX_ARRAY_ADDRESS needs to be 2Byte alignment */
	CELL_GCM_ASSERT((startOffset & 1) == 0);

	// number of indices between the start address and the next 128-byte
	// boundary (0 when the start is already aligned)
	bytesToBoundary = ((startOffset + 127) & ~127) - startOffset;
	if (type == CELL_GCM_DRAW_INDEX_ARRAY_TYPE_32)
	{
		leadingIndexCount = bytesToBoundary >> 2;
	}
	else
	{
		leadingIndexCount = bytesToBoundary >> 1;
	}

	CELL_GCM_RESERVE(7);

	CELL_GCM_METHOD_INVALIDATE_VERTEX_FILE(CELL_GCM_CURRENT);

	// begin
	CELL_GCM_METHOD_SET_INDEX_ARRAY_OFFSET_FORMAT(CELL_GCM_CURRENT,
		CELL_GCM_COMMAND_CAST(location), startOffset, CELL_GCM_COMMAND_CAST(type));
	CELL_GCM_METHOD_SET_BEGIN_END(CELL_GCM_CURRENT,
		CELL_GCM_COMMAND_CAST(mode));

	// send the mis-aligned indices first, thus aligning the rest to a
	// 128 byte boundary
	if ((leadingIndexCount != 0) && (leadingIndexCount < remaining))
	{
		uint32_t leadingMinusOne = leadingIndexCount - 1;
		CELL_GCM_RESERVE(2);
		CELL_GCM_METHOD_DRAW_INDEX_ARRAY(CELL_GCM_CURRENT, nextIndex, leadingMinusOne);
		remaining -= leadingIndexCount;
		nextIndex += leadingIndexCount;
	}

	// avoid writing more than 2047(0x7ff) words per inc method (hw limit)
	CELL_GCM_ASSERT(remaining && (remaining <= 0xfffff)); // hw limit
	while (remaining > 0x7FF00)
	{
		CELL_GCM_RESERVE(1+CELL_GCM_MAX_METHOD_COUNT);

		remaining -= 0x7ff00;
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD_NI(CELL_GCM_NV4097_DRAW_INDEX_ARRAY, CELL_GCM_MAX_METHOD_COUNT);
		CELL_GCM_CURRENT += 1;
		for (j = 0; j < CELL_GCM_MAX_METHOD_COUNT; ++j)
		{
			CELL_GCM_CURRENT[0] = CELL_GCM_ENDIAN_SWAP(0xFF000000 | nextIndex);
			CELL_GCM_CURRENT += 1;
			nextIndex += 0x100;
		}
	}

	// last method: round the remaining count up to whole 256(0x100) batches
	tailWords = (remaining + 0xff) >> 8;

	CELL_GCM_RESERVE(1+tailWords);

	// [nextIndex, nextIndex+0xff] range in DRAW_INDEX_ARRAY
	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD_NI(CELL_GCM_NV4097_DRAW_INDEX_ARRAY, tailWords);
	CELL_GCM_CURRENT += 1;
	while (remaining > 0x100)
	{
		remaining -= 0x100;
		CELL_GCM_CURRENT[0] = CELL_GCM_ENDIAN_SWAP(0xFF000000 | nextIndex);
		CELL_GCM_CURRENT += 1;
		nextIndex += 0x100;
	}

	// remainder indices (1-256), stored as count - 1 in the top byte
	if (remaining)
	{
		--remaining;
		CELL_GCM_CURRENT[0] = CELL_GCM_ENDIAN_SWAP((remaining << 24) | nextIndex);
		CELL_GCM_CURRENT += 1;
	}

	// end
	CELL_GCM_RESERVE(2);
	CELL_GCM_METHOD_SET_BEGIN_END(CELL_GCM_CURRENT, 0);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Draws with vertex data embedded in the command fifo: `count` 32-bit words
// are read from `data` and emitted as INLINE_ARRAY payload between a
// begin(mode) and an end.
//   mode  - primitive type, passed to SET_BEGIN_END.
//   count - number of 32-bit words to send.
//   data  - source words; each is written through CELL_GCM_ENDIAN_SWAP.
// The payload is split into methods of at most CELL_GCM_MAX_METHOD_COUNT
// words.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDrawInlineArray)(CELL_GCM_ARGS(const uint8_t mode, const uint32_t count, const void *data))
{
	const uint32_t *cursor = (const uint32_t *)data;
	uint32_t remaining = count;
	uint32_t j;

	CELL_GCM_RESERVE(6);

	// hw bug workaround, send 3 invalidate vertex file
	CELL_GCM_METHOD_INVALIDATE_VERTEX_FILE_3(CELL_GCM_CURRENT);

	// start draw mode
	CELL_GCM_METHOD_SET_BEGIN_END(CELL_GCM_CURRENT, CELL_GCM_COMMAND_CAST(mode));

	// full methods first, then whatever is left
	while (remaining > 0)
	{
		const uint32_t words = (remaining > CELL_GCM_MAX_METHOD_COUNT) ? CELL_GCM_MAX_METHOD_COUNT : remaining;

		CELL_GCM_RESERVE(1 + words);

		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD_NI(CELL_GCM_NV4097_INLINE_ARRAY, words);
		CELL_GCM_CURRENT++;

		for (j = 0; j < words; j++)
		{
			CELL_GCM_CURRENT[0] = CELL_GCM_ENDIAN_SWAP(*cursor);
			CELL_GCM_CURRENT++;
			cursor++;
		}

		remaining -= words;
	}

	CELL_GCM_RESERVE(2);

	// end draw mode
	CELL_GCM_METHOD_SET_BEGIN_END(CELL_GCM_CURRENT, 0);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Draw using 32-bit indices embedded in the command stream.
//   mode  : primitive mode handed to SET_BEGIN_END to open the draw
//   start : element of data at which reading begins
//   count : number of indices to send (asserted <= 0xfffff)
//   data  : index array; only read
// Indices go out as ARRAY_ELEMENT32 packets of at most
// CELL_GCM_MAX_METHOD_COUNT words, bracketed by begin(mode) / end(0).
CELL_GCM_DECL void CELL_GCM_FUNC(SetDrawInlineIndexArray32)(CELL_GCM_ARGS(const uint8_t mode, const uint32_t start, const uint32_t count, const uint32_t *data))
{
	// hw restriction
	CELL_GCM_ASSERT(count <= 0xfffff);

	const uint32_t *src = data + start;	// next index to emit
	uint32_t remaining = count;		// indices not yet emitted

	// room for the invalidate sequence and the opening begin
	CELL_GCM_RESERVE(6);

	// hw bug workaround, send 3 invalidate vertex file
	CELL_GCM_METHOD_INVALIDATE_VERTEX_FILE_3(CELL_GCM_CURRENT);

	// start draw mode
	CELL_GCM_METHOD_SET_BEGIN_END(CELL_GCM_CURRENT, CELL_GCM_COMMAND_CAST(mode));

	// full-size packets come first; the last iteration carries the remainder
	while(remaining != 0){
		const uint32_t batch = (remaining < CELL_GCM_MAX_METHOD_COUNT) ? remaining : CELL_GCM_MAX_METHOD_COUNT;
		uint32_t k;

		CELL_GCM_RESERVE(1+batch);

		// word 0 is the packet header, words 1..batch are the indices
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD_NI(CELL_GCM_NV4097_ARRAY_ELEMENT32, batch);
		for(k=0;k<batch;k++){
			CELL_GCM_CURRENT[1+k] = CELL_GCM_ENDIAN_SWAP(src[k]);
		}

		CELL_GCM_CURRENT += 1+batch;
		src += batch;
		remaining -= batch;
	}

	// end draw mode
	CELL_GCM_RESERVE(2);
	CELL_GCM_METHOD_SET_BEGIN_END(CELL_GCM_CURRENT, 0);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Draw using 16-bit indices embedded in the command stream.
//   mode  : primitive mode handed to SET_BEGIN_END to open the draw
//   start : element of data at which reading begins
//   count : number of indices to send
//   data  : index array; only read
// When count is odd, the first index is sent alone as a single
// ARRAY_ELEMENT32 word; the remaining (even) indices are packed two per
// word, low half first, into ARRAY_ELEMENT16 packets of at most
// CELL_GCM_MAX_METHOD_COUNT words.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDrawInlineIndexArray16)(CELL_GCM_ARGS(const uint8_t mode, const uint32_t start, const uint32_t count, const uint16_t *data))
{
	const uint32_t odd = count & 1;		// 1 when one index cannot be paired
	const uint32_t lcount = count - odd;	// even number of indices sent as pairs
	const uint16_t *src = data + start;	// next index to emit
	uint32_t pairsLeft = lcount >> 1;	// packed words not yet emitted

	// hw restriction
	CELL_GCM_ASSERT(lcount <= 0xfffff);

	// room for the invalidate sequence and the opening begin
	CELL_GCM_RESERVE(6);

	// hw bug workaround, send 3 invalidate vertex file
	CELL_GCM_METHOD_INVALIDATE_VERTEX_FILE_3(CELL_GCM_CURRENT);

	// start draw mode
	CELL_GCM_METHOD_SET_BEGIN_END(CELL_GCM_CURRENT, CELL_GCM_COMMAND_CAST(mode));

	// unpaired leading index travels widened to a full 32-bit element
	if(odd != 0){
		CELL_GCM_RESERVE(2);

		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD_NI(CELL_GCM_NV4097_ARRAY_ELEMENT32, 1);
		CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(CELL_GCM_COMMAND_CAST(src[0]));
		CELL_GCM_CURRENT += 2;
		src += 1;
	}

	// full-size packets come first; the last iteration carries the remainder
	while(pairsLeft != 0){
		const uint32_t batch = (pairsLeft < CELL_GCM_MAX_METHOD_COUNT) ? pairsLeft : CELL_GCM_MAX_METHOD_COUNT;
		uint32_t k;

		CELL_GCM_RESERVE(1+batch);

		// word 0 is the packet header, words 1..batch each hold two indices
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD_NI(CELL_GCM_NV4097_ARRAY_ELEMENT16, batch);
		for(k=0;k<batch;k++){
			CELL_GCM_CURRENT[1+k] = CELL_GCM_ENDIAN_SWAP(CELL_GCM_COMMAND_CAST(src[0]) | CELL_GCM_COMMAND_CAST(src[1])<<16);
			src += 2;
		}

		CELL_GCM_CURRENT += 1+batch;
		pairsLeft -= batch;
	}

	// end draw mode
	CELL_GCM_RESERVE(2);
	CELL_GCM_METHOD_SET_BEGIN_END(CELL_GCM_CURRENT, 0);

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Open an INLINE_ARRAY packet of count words without filling it.
//   count   : payload size in 32-bit words (asserted <= CELL_GCM_MAX_METHOD_COUNT)
//   pointer : receives the address of the first payload word
// Only the header word is written; the payload words are skipped over, so
// the caller is expected to fill them through *pointer.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDrawInlineArrayPointer)(CELL_GCM_ARGS(const uint32_t count, void **pointer))
{
	CELL_GCM_ASSERT(count <= CELL_GCM_MAX_METHOD_COUNT);
	CELL_GCM_RESERVE(count+1);

	// header at word 0, payload begins at word 1
	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD_NI(CELL_GCM_NV4097_INLINE_ARRAY, count);
	*pointer = &CELL_GCM_CURRENT[1];

	// step past the header and the still-unwritten payload
	CELL_GCM_CURRENT += 1 + count;

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Open a SET_TRANSFORM_CONSTANT_LOAD packet without filling its data.
//   first   : constant index written as the word right after the header
//   count   : number of 32-bit data words to leave room for (asserted <= 32;
//             the range check treats 4 words as one constant)
//   pointer : receives the address of the first data word
// The data words are skipped over, so the caller is expected to fill them
// through *pointer.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexProgramConstantsPointer)(CELL_GCM_ARGS(uint32_t first, uint32_t count, void **pointer))
{
	CELL_GCM_ASSERT(first+count/4 <= CELL_GCM_VTXPRG_MAX_CONST);
	CELL_GCM_ASSERT(count <= 32);
	CELL_GCM_RESERVE(count+2);

	// header announces count+1 words: the load index plus the data
	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, count+1);
	CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(first);
	*pointer = &CELL_GCM_CURRENT[2];

	// step past header, load index and the still-unwritten data
	CELL_GCM_CURRENT += 2 + count;

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Open an ARRAY_ELEMENT32 packet of count words without filling it.
//   count   : payload size in 32-bit words (asserted <= CELL_GCM_MAX_METHOD_COUNT)
//   pointer : receives the address of the first payload word
// Only the header word is written; the payload words are skipped over, so
// the caller is expected to fill them through *pointer.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDrawInlineIndexArray32Pointer)(CELL_GCM_ARGS(const uint32_t count, void **pointer))
{
	CELL_GCM_ASSERT(count <= CELL_GCM_MAX_METHOD_COUNT);
	CELL_GCM_RESERVE(count+1);

	// header at word 0, payload begins at word 1
	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD_NI(CELL_GCM_NV4097_ARRAY_ELEMENT32, count);
	*pointer = &CELL_GCM_CURRENT[1];

	// step past the header and the still-unwritten payload
	CELL_GCM_CURRENT += 1 + count;

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Open an ARRAY_ELEMENT16 packet of count words without filling it.
//   count   : payload size in 32-bit words (asserted <= CELL_GCM_MAX_METHOD_COUNT)
//   pointer : receives the address of the first payload word
// Only the header word is written; the payload words are skipped over, so
// the caller is expected to fill them through *pointer.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDrawInlineIndexArray16Pointer)(CELL_GCM_ARGS(const uint32_t count, void **pointer))
{
	CELL_GCM_ASSERT(count <= CELL_GCM_MAX_METHOD_COUNT);
	CELL_GCM_RESERVE(count+1);

	// header at word 0, payload begins at word 1
	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD_NI(CELL_GCM_NV4097_ARRAY_ELEMENT16, count);
	*pointer = &CELL_GCM_CURRENT[1];

	// step past the header and the still-unwritten payload
	CELL_GCM_CURRENT += 1 + count;

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Open a SET_TRANSFORM_CONSTANT_LOAD packet sized in whole constants,
// without filling its data.
//   baseConst  : constant index written as the word right after the header
//   constCount : number of constants; each takes 4 data words
//   pointer    : receives the address of the first data word
// The data words are skipped over, so the caller is expected to fill them
// through *pointer.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexProgramParameterBlockPointer)(CELL_GCM_ARGS(const uint32_t baseConst, const uint32_t constCount, void **pointer))
{
	// data size in 32-bit words
	const uint32_t count = constCount*4;

	CELL_GCM_ASSERT(baseConst+constCount <= CELL_GCM_VTXPRG_MAX_CONST);
	CELL_GCM_ASSERT(count <= 32);
	CELL_GCM_RESERVE(count+2);

	// header announces count+1 words: the load index plus the data
	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_CONSTANT_LOAD, count+1);
	CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(baseConst);
	*pointer = &CELL_GCM_CURRENT[2];

	// step past header, load index and the still-unwritten data
	CELL_GCM_CURRENT += 2 + count;

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
}

// Variant of SetInlineTransferPointer that first pads the command stream so
// that the address 40 bytes past the write position is 16-byte aligned.
// (Presumably 40 bytes is the header SetInlineTransferPointer emits before
// its payload -- confirm against that function.)
//   offset, count, pointer : forwarded unchanged; count must be a multiple of 4
CELL_GCM_DECL void CELL_GCM_FUNC(SetInlineTransferAlignedPointer)(CELL_GCM_ARGS(const uint32_t offset, const uint32_t count, void **pointer))
{
	CELL_GCM_ASSERT((count & 3) == 0);	// multiple of 4
	CELL_GCM_RESERVE(13 + ((count + 1) & ~1));

	// words by which (write position + 40 bytes) lies past a 16-byte boundary: 0 to 3
	const uint32_t overshoot = (((uint32_t)CELL_GCM_CURRENT + 40) & 15) >> 2;
	// padding words needed to reach the next boundary: 4 to 1
	const uint32_t padWords = 4 - overshoot;

	// Space was reserved above, so the other build flavour calls the
	// *Unsafe entry points directly.
#if CELL_GCM_UNSAFE
	if(overshoot != 0){	// not aligned
		CELL_GCM_FUNC(SetNopCommand)(CELL_GCM_ARGS_FUNC(padWords));
	}
	CELL_GCM_FUNC(SetInlineTransferPointer)(CELL_GCM_ARGS_FUNC(offset, count, pointer));
#else
	if(overshoot != 0){	// not aligned
		CELL_GCM_FUNC(SetNopCommandUnsafe)(CELL_GCM_ARGS_FUNC(padWords));
	}
	CELL_GCM_FUNC(SetInlineTransferPointerUnsafe)(CELL_GCM_ARGS_FUNC(offset, count, pointer));
#endif
}

// Variant of SetVertexProgramConstantsPointer that first pads the command
// stream so that the data area (8 bytes past the packet start: header word
// plus load index) is 16-byte aligned.
//   first, count, pointer : forwarded unchanged; count must be a multiple of 4
// Reserves count data words, 2 header words and up to 3 padding words.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexProgramConstantsAlignedPointer)(CELL_GCM_ARGS(uint32_t first, uint32_t count, void **pointer))
{
	CELL_GCM_ASSERT((count & 3) == 0);	// multiple of 4
	CELL_GCM_RESERVE(count+5);

	// words by which (write position + 8 bytes) lies past a 16-byte boundary: 0 to 3
	const uint32_t overshoot = (((uint32_t)CELL_GCM_CURRENT + 8) & 15) >> 2;
	// padding words needed to reach the next boundary: 4 to 1
	const uint32_t padWords = 4 - overshoot;

	// Space was reserved above, so the other build flavour calls the
	// *Unsafe entry points directly.
#if CELL_GCM_UNSAFE
	if(overshoot != 0){	// not aligned
		CELL_GCM_FUNC(SetNopCommand)(CELL_GCM_ARGS_FUNC(padWords));
	}
	CELL_GCM_FUNC(SetVertexProgramConstantsPointer)(CELL_GCM_ARGS_FUNC(first, count, pointer));
#else
	if(overshoot != 0){	// not aligned
		CELL_GCM_FUNC(SetNopCommandUnsafe)(CELL_GCM_ARGS_FUNC(padWords));
	}
	CELL_GCM_FUNC(SetVertexProgramConstantsPointerUnsafe)(CELL_GCM_ARGS_FUNC(first, count, pointer));
#endif
}

// Variant of SetVertexProgramParameterBlockPointer that first pads the
// command stream so that the data area (8 bytes past the packet start:
// header word plus load index) is 16-byte aligned.
//   baseConst, constCount, pointer : forwarded unchanged
// Reserves 4 data words per constant, 2 header words and up to 3 padding words.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexProgramParameterBlockAlignedPointer)(CELL_GCM_ARGS(const uint32_t baseConst, const uint32_t constCount, void **pointer))
{
	CELL_GCM_RESERVE(constCount*4+5);

	// words by which (write position + 8 bytes) lies past a 16-byte boundary: 0 to 3
	const uint32_t overshoot = (((uint32_t)CELL_GCM_CURRENT + 8) & 15) >> 2;
	// padding words needed to reach the next boundary: 4 to 1
	const uint32_t padWords = 4 - overshoot;

	// Space was reserved above, so the other build flavour calls the
	// *Unsafe entry points directly.
#if CELL_GCM_UNSAFE
	if(overshoot != 0){	// not aligned
		CELL_GCM_FUNC(SetNopCommand)(CELL_GCM_ARGS_FUNC(padWords));
	}
	CELL_GCM_FUNC(SetVertexProgramParameterBlockPointer)(CELL_GCM_ARGS_FUNC(baseConst, constCount, pointer));
#else
	if(overshoot != 0){	// not aligned
		CELL_GCM_FUNC(SetNopCommandUnsafe)(CELL_GCM_ARGS_FUNC(padWords));
	}
	CELL_GCM_FUNC(SetVertexProgramParameterBlockPointerUnsafe)(CELL_GCM_ARGS_FUNC(baseConst, constCount, pointer));
#endif
}

// Variant of SetDrawInlineArrayPointer that first pads the command stream
// so that the payload (4 bytes past the packet start: one header word) is
// 16-byte aligned.
//   count, pointer : forwarded unchanged; count must be a multiple of 4
// Reserves count payload words, 1 header word and up to 3 padding words.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDrawInlineArrayAlignedPointer)(CELL_GCM_ARGS(const uint32_t count, void **pointer))
{
	CELL_GCM_ASSERT((count & 3) == 0);	// multiple of 4
	CELL_GCM_RESERVE(count+4);

	// words by which (write position + 4 bytes) lies past a 16-byte boundary: 0 to 3
	const uint32_t overshoot = (((uint32_t)CELL_GCM_CURRENT + 4) & 15) >> 2;
	// padding words needed to reach the next boundary: 4 to 1
	const uint32_t padWords = 4 - overshoot;

	// Space was reserved above, so the other build flavour calls the
	// *Unsafe entry points directly.
#if CELL_GCM_UNSAFE
	if(overshoot != 0){	// not aligned
		CELL_GCM_FUNC(SetNopCommand)(CELL_GCM_ARGS_FUNC(padWords));
	}
	CELL_GCM_FUNC(SetDrawInlineArrayPointer)(CELL_GCM_ARGS_FUNC(count, pointer));
#else
	if(overshoot != 0){	// not aligned
		CELL_GCM_FUNC(SetNopCommandUnsafe)(CELL_GCM_ARGS_FUNC(padWords));
	}
	CELL_GCM_FUNC(SetDrawInlineArrayPointerUnsafe)(CELL_GCM_ARGS_FUNC(count, pointer));
#endif
}

// Variant of SetDrawInlineIndexArray32Pointer that first pads the command
// stream so that the payload (4 bytes past the packet start: one header
// word) is 16-byte aligned.
//   count, pointer : forwarded unchanged; count must be a multiple of 4
// Reserves count payload words, 1 header word and up to 3 padding words.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDrawInlineIndexArray32AlignedPointer)(CELL_GCM_ARGS(const uint32_t count, void **pointer))
{
	CELL_GCM_ASSERT((count & 3) == 0);	// multiple of 4
	CELL_GCM_RESERVE(count+4);

	// words by which (write position + 4 bytes) lies past a 16-byte boundary: 0 to 3
	const uint32_t overshoot = (((uint32_t)CELL_GCM_CURRENT + 4) & 15) >> 2;
	// padding words needed to reach the next boundary: 4 to 1
	const uint32_t padWords = 4 - overshoot;

	// Space was reserved above, so the other build flavour calls the
	// *Unsafe entry points directly.
#if CELL_GCM_UNSAFE
	if(overshoot != 0){	// not aligned
		CELL_GCM_FUNC(SetNopCommand)(CELL_GCM_ARGS_FUNC(padWords));
	}
	CELL_GCM_FUNC(SetDrawInlineIndexArray32Pointer)(CELL_GCM_ARGS_FUNC(count, pointer));
#else
	if(overshoot != 0){	// not aligned
		CELL_GCM_FUNC(SetNopCommandUnsafe)(CELL_GCM_ARGS_FUNC(padWords));
	}
	CELL_GCM_FUNC(SetDrawInlineIndexArray32PointerUnsafe)(CELL_GCM_ARGS_FUNC(count, pointer));
#endif
}

// Variant of SetDrawInlineIndexArray16Pointer that first pads the command
// stream so that the payload (4 bytes past the packet start: one header
// word) is 16-byte aligned.
//   count, pointer : forwarded unchanged; count must be a multiple of 4
// Reserves count payload words, 1 header word and up to 3 padding words.
CELL_GCM_DECL void CELL_GCM_FUNC(SetDrawInlineIndexArray16AlignedPointer)(CELL_GCM_ARGS(const uint32_t count, void **pointer))
{
	CELL_GCM_ASSERT((count & 3) == 0);	// multiple of 4
	CELL_GCM_RESERVE(count+4);

	// words by which (write position + 4 bytes) lies past a 16-byte boundary: 0 to 3
	const uint32_t overshoot = (((uint32_t)CELL_GCM_CURRENT + 4) & 15) >> 2;
	// padding words needed to reach the next boundary: 4 to 1
	const uint32_t padWords = 4 - overshoot;

	// Space was reserved above, so the other build flavour calls the
	// *Unsafe entry points directly.
#if CELL_GCM_UNSAFE
	if(overshoot != 0){	// not aligned
		CELL_GCM_FUNC(SetNopCommand)(CELL_GCM_ARGS_FUNC(padWords));
	}
	CELL_GCM_FUNC(SetDrawInlineIndexArray16Pointer)(CELL_GCM_ARGS_FUNC(count, pointer));
#else
	if(overshoot != 0){	// not aligned
		CELL_GCM_FUNC(SetNopCommandUnsafe)(CELL_GCM_ARGS_FUNC(padWords));
	}
	CELL_GCM_FUNC(SetDrawInlineIndexArray16PointerUnsafe)(CELL_GCM_ARGS_FUNC(count, pointer));
#endif
}

// Append the wait-for-idle method to the command stream. Takes no arguments
// beyond whatever CELL_GCM_NO_ARGS() supplies.
CELL_GCM_DECL void CELL_GCM_FUNC(SetWaitForIdle)(CELL_GCM_NO_ARGS())
{
	CELL_GCM_ASM_IN();
	// reserve 2 command words (second argument presumably the parameter count -- confirm)
	CELL_GCM_ASM_RESERVE_IMM(2, 0);
	CELL_GCM_METHOD_WAIT_FOR_IDLE(CELL_GCM_CURRENT);
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Write a 4-float vertex attribute value into the command stream.
//   index : attribute slot; selects the method address (16 bytes per slot)
//   v     : the 4 floats to send
// Emits 5 words: one header plus 4 data words.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexData4f)(CELL_GCM_ARGS(uint8_t index, const float v[4]))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(5, 2);
#ifdef CELL_GCM_ASM
	// ASM build: the whole packet is produced by the method macro
	CELL_GCM_METHOD_SET_VERTEX_DATA4F(CELL_GCM_CURRENT, index, v);
#else
	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_VERTEX_DATA4F_M + CELL_GCM_COMMAND_CAST(index) * 16, 4);
	// floats are copied as raw bytes; no CELL_GCM_ENDIAN_SWAP is applied on this path
	CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[1], v, sizeof(float)*4);
	// checked while CELL_GCM_CURRENT still points at the packet header
	CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
	CELL_GCM_CURRENT += 5;
#endif
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Write a 3-float vertex attribute value into the command stream.
//   index : attribute slot; selects the method address (16 bytes per slot)
//   v     : the 3 floats to send
// Emits 4 words: one header plus 3 data words.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexData3f)(CELL_GCM_ARGS(uint8_t index, const float v[3]))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(4, 2);
#ifdef CELL_GCM_ASM
	// ASM build: the whole packet is produced by the method macro
	CELL_GCM_METHOD_SET_VERTEX_DATA3F(CELL_GCM_CURRENT, index, v);
#else
	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_VERTEX_DATA3F_M + CELL_GCM_COMMAND_CAST(index) * 4 * 4, 3);
	// floats are copied as raw bytes; no CELL_GCM_ENDIAN_SWAP is applied on this path
	CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[1], v, sizeof(float)*3);
	// checked while CELL_GCM_CURRENT still points at the packet header
	CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
	CELL_GCM_CURRENT += 4;
#endif
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Write a 2-float vertex attribute value into the command stream.
//   index : attribute slot; selects the method address (8 bytes per slot)
//   v     : the 2 floats to send
// Emits 3 words: one header plus 2 data words.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexData2f)(CELL_GCM_ARGS(uint8_t index, const float v[2]))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(3, 2);
#ifdef CELL_GCM_ASM
	// ASM build: the whole packet is produced by the method macro
	CELL_GCM_METHOD_SET_VERTEX_DATA2F(CELL_GCM_CURRENT, index, v);
#else
	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_VERTEX_DATA2F_M + CELL_GCM_COMMAND_CAST(index) * 4 * 2, 2);
	// floats are copied as raw bytes; no CELL_GCM_ENDIAN_SWAP is applied on this path
	CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[1], v, sizeof(float)*2);
	// checked while CELL_GCM_CURRENT still points at the packet header
	CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
	CELL_GCM_CURRENT += 3;
#endif
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Write a single-float vertex attribute value into the command stream.
//   index : attribute slot; selects the method address (4 bytes per slot)
//   v     : the float to send
// Emits 2 words: one header plus 1 data word.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexData1f)(CELL_GCM_ARGS(uint8_t index, const float v))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 1);
#ifdef CELL_GCM_ASM
	// ASM build: the whole packet is produced by the method macro
	CELL_GCM_METHOD_SET_VERTEX_DATA1F(CELL_GCM_CURRENT, index, v);
#else
	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_VERTEX_DATA1F_M + CELL_GCM_COMMAND_CAST(index) * 4, 1);
	// reinterpret the float's bits as an integer through the CellGcmCast
	// f/u members (presumably a union -- confirm against its declaration)
	CellGcmCast c;
	c.f = v;
	CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP(c.u);
	// checked while CELL_GCM_CURRENT still points at the packet header
	CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
	CELL_GCM_CURRENT += 2;
#endif
	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Write a vertex attribute value made of four signed 16-bit components.
//   index : attribute slot; selects the method address (8 bytes per slot)
//   v     : the 4 components; v[0]/v[2] go in the low half of each data
//           word, v[1]/v[3] in the high half
// Emits 3 words: one header plus 2 data words.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexData4s)(CELL_GCM_ARGS(uint8_t index, const int16_t v[4]))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(3, 2);

	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_VERTEX_DATA4S_M + CELL_GCM_COMMAND_CAST(index) * 4 * 2, 2);
	// Go through uint16_t first so negative values are not sign-extended,
	// then widen to uint32_t before shifting: a uint16_t operand would be
	// promoted to signed int, and shifting a value with bit 15 set left by
	// 16 overflows int.
	CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP((uint32_t)(uint16_t)v[0] | ((uint32_t)(uint16_t)v[1] << 16));
	CELL_GCM_CURRENT[2] = CELL_GCM_ENDIAN_SWAP((uint32_t)(uint16_t)v[2] | ((uint32_t)(uint16_t)v[3] << 16));
	// checked while CELL_GCM_CURRENT still points at the packet header
	CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
	CELL_GCM_CURRENT += 3;

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Write a vertex attribute value made of four signed 16-bit components
// using the SET_VERTEX_DATA_SCALED4S method.
//   index : attribute slot; selects the method address (8 bytes per slot)
//   v     : the 4 components; v[0]/v[2] go in the low half of each data
//           word, v[1]/v[3] in the high half
// Emits 3 words: one header plus 2 data words.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexDataScaled4s)(CELL_GCM_ARGS(uint8_t index, const int16_t v[4]))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(3, 2);

	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_VERTEX_DATA_SCALED4S_M + CELL_GCM_COMMAND_CAST(index) * 4 * 2, 2);
	// Go through uint16_t first so negative values are not sign-extended,
	// then widen to uint32_t before shifting: a uint16_t operand would be
	// promoted to signed int, and shifting a value with bit 15 set left by
	// 16 overflows int.
	CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP((uint32_t)(uint16_t)v[0] | ((uint32_t)(uint16_t)v[1] << 16));
	CELL_GCM_CURRENT[2] = CELL_GCM_ENDIAN_SWAP((uint32_t)(uint16_t)v[2] | ((uint32_t)(uint16_t)v[3] << 16));
	// checked while CELL_GCM_CURRENT still points at the packet header
	CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
	CELL_GCM_CURRENT += 3;

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Write a vertex attribute value made of two signed 16-bit components.
//   index : attribute slot; selects the method address (4 bytes per slot)
//   v     : the 2 components; v[0] goes in the low half of the data word,
//           v[1] in the high half
// Emits 2 words: one header plus 1 data word.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexData2s)(CELL_GCM_ARGS(uint8_t index, const int16_t v[2]))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 2);

	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_VERTEX_DATA2S_M + CELL_GCM_COMMAND_CAST(index) * 4, 1);
	// Go through uint16_t first so negative values are not sign-extended,
	// then widen to uint32_t before shifting: a uint16_t operand would be
	// promoted to signed int, and shifting a value with bit 15 set left by
	// 16 overflows int.
	CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP((uint32_t)(uint16_t)v[0] | ((uint32_t)(uint16_t)v[1] << 16));
	// checked while CELL_GCM_CURRENT still points at the packet header
	CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
	CELL_GCM_CURRENT += 2;

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Write a vertex attribute value made of four unsigned bytes.
//   index : attribute slot; selects the method address (4 bytes per slot)
//   v     : the 4 bytes; v[0] lands in bits 0-7 of the data word and v[3]
//           in bits 24-31
// Emits 2 words: one header plus 1 data word.
CELL_GCM_DECL void CELL_GCM_FUNC(SetVertexData4ub)(CELL_GCM_ARGS(uint8_t index, const uint8_t v[4]))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 2);

	CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_VERTEX_DATA4UB_M + CELL_GCM_COMMAND_CAST(index) * 4 * 1, 1);
	// Widen each byte to uint32_t before shifting: a uint8_t operand would
	// be promoted to signed int, and v[3] << 24 overflows int whenever
	// v[3] >= 0x80.
	CELL_GCM_CURRENT[1] = CELL_GCM_ENDIAN_SWAP((uint32_t)v[0] | ((uint32_t)v[1] << 8) | ((uint32_t)v[2] << 16) | ((uint32_t)v[3] << 24));
	// checked while CELL_GCM_CURRENT still points at the packet header
	CELL_GCM_DEBUG_CHECK(CELL_GCM_CURRENT);
	CELL_GCM_CURRENT += 2;

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emit the TEXTURE_CONTROL0 method for one texture unit, including the
// alpha-kill setting.
//   index     : texture unit (asserted < CELL_GCM_MAX_TEXIMAGE_COUNT)
//   enable    : passed through unchanged
//   minlod    : minimum LOD value
//   maxlod    : maximum LOD value
//   maxaniso  : maximum anisotropy setting
//   alphakill : alpha-kill setting
// NOTE(review): the declared return type is CELL_GCM_FUNC_TYPE but no return
// statement is visible here -- presumably the type or the trailing macros
// account for that; confirm.
CELL_GCM_DECL CELL_GCM_FUNC_TYPE CELL_GCM_FUNC(SetTextureControlAlphaKill)(CELL_GCM_ARGS(const uint8_t index, const uint32_t enable, const uint16_t minlod, const uint16_t maxlod, const uint8_t maxaniso, const uint8_t alphakill))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASSERT(index < CELL_GCM_MAX_TEXIMAGE_COUNT);
	CELL_GCM_ASM_RESERVE_IMM(2, 6);

#ifdef CELL_GCM_BITFIELD
	// bitfield build: the LOD values are masked to their low 12 bits first
	CELL_GCM_METHOD_SET_TEXTURE_CONTROL0(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		enable, 
		CELL_GCM_COMMAND_CAST(minlod & 0xfff), 
		CELL_GCM_COMMAND_CAST(maxlod & 0xfff), 
		CELL_GCM_COMMAND_CAST(maxaniso),
		CELL_GCM_COMMAND_CAST(alphakill));
#else
	// non-bitfield build: the LOD values are passed through unmasked
	CELL_GCM_METHOD_SET_TEXTURE_CONTROL0(CELL_GCM_CURRENT, 
		CELL_GCM_COMMAND_CAST(index), 
		enable, 
		CELL_GCM_COMMAND_CAST(minlod), 
		CELL_GCM_COMMAND_CAST(maxlod), 
		CELL_GCM_COMMAND_CAST(maxaniso),
		CELL_GCM_COMMAND_CAST(alphakill));
#endif

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}

// Emit the SET_NO_PARANOID_TEXTURE_FETCHES method.
//   samplerMask : 16-bit mask, presumably one bit per texture sampler --
//                 confirm against the method definition
// NOTE(review): the declared return type is CELL_GCM_FUNC_TYPE but no return
// statement is visible here -- presumably the type or the trailing macros
// account for that; confirm.
CELL_GCM_DECL CELL_GCM_FUNC_TYPE CELL_GCM_FUNC(SetNoParanoidTextureFetches)(CELL_GCM_ARGS(const uint16_t samplerMask))
{
	CELL_GCM_ASM_IN();
	CELL_GCM_ASM_RESERVE_IMM(2, 2);

	CELL_GCM_METHOD_SET_NO_PARANOID_TEXTURE_FETCHES(CELL_GCM_CURRENT, CELL_GCM_COMMAND_CAST(samplerMask));

	CELL_GCM_DEBUG_FINISH(CELL_GCM_THIS);
	CELL_GCM_ASM_OUT();
}
