// SwiftShader Software Renderer
//
// Copyright(c) 2005-2012 TransGaming Inc.
//
// All rights reserved. No part of this software may be copied, distributed, transmitted,
// transcribed, stored in a retrieval system, translated into any human or computer
// language by any means, or disclosed to third parties without the explicit written
// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
// or implied, including but not limited to any patent rights, are granted to you.
//

#include "QuadRasterizer.hpp"

#include "Math.hpp"
#include "Primitive.hpp"
#include "Renderer.hpp"
#include "Constants.hpp"
#include "Debug.hpp"

namespace sw
{
	// Settings declared extern here, so their definitions live elsewhere.

	// When true (and the draw is single-sampled), rasterize() emits a pre-pass that
	// skips the leading quads of a row pair for which the depth comparison fails
	// on every lane.
	extern bool veryEarlyDepthTest;
	// Selects the comparison used by that pre-pass: CmpLE when true, CmpNLT otherwise.
	extern bool complementaryDepthBuffer;

	// Number of clusters; each cluster's routine starts at row 2 * cluster and
	// advances 2 * clusterCount rows per iteration.
	extern int clusterCount;

	// Forwards the pixel processor state and the pixel shader to the PixelRoutine
	// base class; no additional initialization is performed here.
	QuadRasterizer::QuadRasterizer(const PixelProcessor::State &state, const PixelShader *pixelShader)
		: PixelRoutine(state, pixelShader)
	{
	}

	// Intentionally empty: this class adds no cleanup of its own.
	QuadRasterizer::~QuadRasterizer()
	{
	}

	void QuadRasterizer::generate()
	{
		Function<Void, Pointer<Byte>, Int, Int, Pointer<Byte> > function;
		{
			#if PERF_PROFILE
				Long pixelTime = Ticks();
			#endif

			Pointer<Byte> primitive(function.arg(0));
			Int count(function.arg(1));
			Int cluster(function.arg(2));
			Pointer<Byte> data(function.arg(3));

			Registers r(shader);
			r.constants = *Pointer<Pointer<Byte> >(data + OFFSET(DrawData,constants));
			r.cluster = cluster;
			r.data = data;
			
			Do
			{
				r.primitive = primitive;

				Int yMin = *Pointer<Int>(primitive + OFFSET(Primitive,yMin));
				Int yMax = *Pointer<Int>(primitive + OFFSET(Primitive,yMax));

				Int cluster2 = r.cluster + r.cluster;
				yMin += clusterCount * 2 - 2 - cluster2;
				yMin &= -clusterCount * 2;
				yMin += cluster2;

				If(yMin < yMax)
				{
					rasterize(r, yMin, yMax);
				}

				primitive += sizeof(Primitive) * state.multiSample;
				count--;
			}
			Until(count == 0)

			if(state.occlusionEnabled)
			{
				UInt clusterOcclusion = *Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster);
				clusterOcclusion += r.occlusion;
				*Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster) = clusterOcclusion;
			}

			#if PERF_PROFILE
				r.cycles[PERF_PIXEL] = Ticks() - pixelTime;

				for(int i = 0; i < PERF_TIMERS; i++)
				{
					*Pointer<Long>(data + OFFSET(DrawData,cycles[i]) + 8 * cluster) += r.cycles[i];
				}
			#endif

			Return();
		}

		routine = function(L"PixelRoutine_%0.8X", state.shaderID);
	}

	// Emits the scanline loop for one primitive. Starting at yMin and advancing
	// 2 * clusterCount rows per iteration until y >= yMax, it determines the
	// horizontal extent covered on rows y and y + 1, sets up the per-row
	// interpolation terms and calls quad() for every two-pixel step across that extent.
	//
	// r    - register state; r.primitive, r.data and r.constants are read, and
	//        r.Dz, r.Dw and r.Dv are written.
	// yMin - first row to process; generate() only calls this when yMin < yMax.
	// yMax - end of the row range; the loop ends once y >= yMax.
	//
	// Plain C++ 'if'/'for' statements below test 'state' while this function runs.
	// The capitalized Do/Until/If/Else/For operate on the Int/Float4 values and are
	// presumably the code generator's run-time control flow constructs.
	void QuadRasterizer::rasterize(Registers &r, Int &yMin, Int &yMax)
	{
		Pointer<Byte> cBuffer[4];
		Pointer<Byte> zBuffer;
		Pointer<Byte> sBuffer;

		// Address of row yMin in each active color buffer: base pointer + yMin * pitch
		for(int index = 0; index < 4; index++)
		{
			if(state.colorWriteActive(index))
			{
				cBuffer[index] = *Pointer<Pointer<Byte> >(r.data + OFFSET(DrawData,colorBuffer[index])) + yMin * *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
			}
		}

		// Same for the depth buffer; zBuffer is left unassigned when depth testing is inactive
		if(state.depthTestActive)
		{
			zBuffer = *Pointer<Pointer<Byte> >(r.data + OFFSET(DrawData,depthBuffer)) + yMin * *Pointer<Int>(r.data + OFFSET(DrawData,depthPitchB));
		}

		// Same for the stencil buffer; sBuffer is left unassigned when stencil is inactive
		if(state.stencilActive)
		{
			sBuffer = *Pointer<Pointer<Byte> >(r.data + OFFSET(DrawData,stencilBuffer)) + yMin * *Pointer<Int>(r.data + OFFSET(DrawData,stencilPitchB));
		}

		Int y = yMin;
		
		Do
		{
			Int x0;
			Int x1;
			Int x2;

			// Left bound: the smaller of the outline's 'left' values for rows y and y + 1
			x0 = Int(*Pointer<Short>(r.primitive + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));
			x2 = Int(*Pointer<Short>(r.primitive + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));
			x0 = Min(x0, x2);
			
			// Widen to the leftmost bound over the remaining samples, whose
			// primitives are read at q * sizeof(Primitive) from r.primitive
			for(unsigned int q = 1; q < state.multiSample; q++)
			{
				Int x0q = Int(*Pointer<Short>(r.primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));
				Int x2q = Int(*Pointer<Short>(r.primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));
				x0q = Min(x0q, x2q);

				x0 = Min(x0q, x0);
			}
			
			// Clear the lowest bit so x0 is even; the loops below step x by 2
			x0 &= 0xFFFFFFFE;

			// Right bound: the larger of the outline's 'right' values for rows y and y + 1
			x1 = Int(*Pointer<Short>(r.primitive + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
			x2 = Int(*Pointer<Short>(r.primitive + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
			x1 = Max(x1, x2);

			// Widen to the rightmost bound over the remaining samples
			for(unsigned int q = 1; q < state.multiSample; q++)
			{
				Int x1q = Int(*Pointer<Short>(r.primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
				Int x2q = Int(*Pointer<Short>(r.primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
				x1q = Max(x1q, x2q);

				x1 = Max(x1q, x1);
			}

			// y replicated into four lanes plus the primitive's yQuad offsets
			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,yQuad), 16);

			if(interpolateZ())
			{
				// Per sample: Dz = z.C + y * z.B
				for(unsigned int q = 0; q < state.multiSample; q++)
				{
					// Note: this Float4 'y' shadows the Int 'y' row counter inside this loop body
					Float4 y = yyyy;

					if(state.multiSample > 1)
					{
						// Subtract the entry for sample q from the Constants::Y table
						y -= *Pointer<Float4>(r.constants + OFFSET(Constants,Y) + q * sizeof(float4));
					}

					r.Dz[q] = *Pointer<Float4>(r.primitive + OFFSET(Primitive,z.C), 16) + y * *Pointer<Float4>(r.primitive + OFFSET(Primitive,z.B), 16);
				}
			}

			// Pre-pass that advances x0 past leading quads for which the depth
			// comparison below fails on every lane. Only emitted for single-sampled
			// rendering without stencil, with depth testing active and a
			// LESSEQUAL or LESS compare mode.
			if(veryEarlyDepthTest && state.multiSample == 1)
			{
				if(!state.stencilActive && state.depthTestActive && (state.depthCompareMode == Context::DEPTH_LESSEQUAL || state.depthCompareMode == Context::DEPTH_LESS))   // FIXME: Both modes ok?
				{
					Float4 xxxx = Float4(Float(x0)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,xQuad), 16);

					Pointer<Byte> buffer;
					Int pitch;   // Only assigned and used when the depth buffer is not in quad layout

					if(!state.quadLayoutDepthBuffer)
					{
						// 4 bytes per pixel along the row
						buffer = zBuffer + 4 * x0;
						pitch = *Pointer<Int>(r.data + OFFSET(DrawData,depthPitchB));
					}
					else
					{	
						// 8 bytes per x step in quad layout, i.e. 16 bytes per two-pixel quad
						buffer = zBuffer + 8 * x0;
					}

					For(Int x = x0, x < x1, x += 2)
					{
						// NOTE(review): 'z' is passed to interpolate() inside its own initializer;
						// presumably that argument is ignored when the trailing flags are false - confirm.
						Float4 z = interpolate(xxxx, r.Dz[0], z, r.primitive + OFFSET(Primitive,z), false, false);

						Float4 zValue;
						
						if(!state.quadLayoutDepthBuffer)
						{
							// FIXME: Properly optimizes?
							// Lower two lanes come from this row, upper two from one pitch further on.
							// The -8 presumably compensates for .zw taking the upper half of the
							// Float4 load - confirm.
							zValue.xy = *Pointer<Float4>(buffer);
							zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
						}
						else
						{
							// The four depth values of the quad are read with one 16-byte aligned load
							zValue = *Pointer<Float4>(buffer, 16);
						}

						Int4 zTest;

						if(complementaryDepthBuffer)
						{
							zTest = CmpLE(zValue, z);
						}
						else
						{
							zTest = CmpNLT(zValue, z);
						}

						Int zMask = SignMask(zTest);

						If(zMask == 0)
						{
							// The comparison is false on all four lanes: skip this quad
							x0 += 2;
						}
						Else
						{
							// At least one lane satisfies the comparison: end the pre-pass
							x = x1;
						}

						xxxx += Float4(2);

						if(!state.quadLayoutDepthBuffer)
						{
							buffer += 8;
						}
						else
						{
							buffer += 16;
						}
					}
				}
			}

			// Process the row pair only if some extent remains
			If(x0 < x1)
			{
				if(interpolateW())
				{
					// Dw = w.C + y * w.B
					r.Dw = *Pointer<Float4>(r.primitive + OFFSET(Primitive,w.C), 16) + yyyy * *Pointer<Float4>(r.primitive + OFFSET(Primitive,w.B), 16);
				}

				// Same setup for each enabled component of the 11 interpolants
				for(int interpolant = 0; interpolant < 11; interpolant++)
				{
					int componentCount = interpolant < 10 ? 4 : 1;   // Fog only has one component

					for(int component = 0; component < componentCount; component++)
					{
						if(state.interpolant[interpolant].component & (1 << component))
						{
							r.Dv[interpolant][component] = *Pointer<Float4>(r.primitive + OFFSET(Primitive,V[interpolant][component].C), 16);

							// Components flagged as flat keep only the C term
							if(!(state.interpolant[interpolant].flat & (1 << component)))
							{
								r.Dv[interpolant][component] += yyyy * *Pointer<Float4>(r.primitive + OFFSET(Primitive,V[interpolant][component].B), 16);
							}
						}
					}
				}

				Short4 xLeft[4];
				Short4 xRight[4];

				// Per-sample coverage bounds. One Short4 is loaded from the outline at row y,
				// presumably holding {left, right} for rows y and y + 1. Assuming the usual
				// two-bits-per-lane selector encoding, 0xA0 picks lanes (0, 0, 2, 2) and
				// 0xF5 picks lanes (1, 1, 3, 3). The subtracted constants bias the bounds so
				// that the strict comparisons below test pixels x and x + 1 on each row.
				for(unsigned int q = 0; q < state.multiSample; q++)
				{
					xLeft[q] = *Pointer<Short4>(r.primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline) + y * sizeof(Primitive::Span));
					xRight[q] = xLeft[q];

					xLeft[q] = Swizzle(xLeft[q], 0xA0) - Short4(1, 2, 1, 2);
					xRight[q] = Swizzle(xRight[q], 0xF5) - Short4(0, 1, 0, 1);
				}

				For(Int x = x0, x < x1, x += 2)
				{
					Short4 xxxx = Short4(x);
					Int cMask[4];

					// Coverage mask per sample, limited to the low four bits
					for(unsigned int q = 0; q < state.multiSample; q++)
					{
						Short4 mask = CmpGT(xxxx, xLeft[q]) & CmpGT(xRight[q], xxxx);
						cMask[q] = SignMask(Pack(mask, mask)) & 0x0000000F;
					}

					quad(r, cBuffer, zBuffer, sBuffer, cMask, x, y);
				}
			}

			// Advance the buffer pointers by pitch << (1 + log2(clusterCount)), which equals
			// pitch * 2 * clusterCount when clusterCount is a power of two (presumably always
			// the case - confirm), matching the y step below.
			for(int index = 0; index < 4; index++)
			{
				if(state.colorWriteActive(index))
				{
					cBuffer[index] += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index])) << (1 + sw::log2(clusterCount));   // FIXME: Precompute
				}
			}

			if(state.depthTestActive)
			{
				zBuffer += *Pointer<Int>(r.data + OFFSET(DrawData,depthPitchB)) << (1 + sw::log2(clusterCount));   // FIXME: Precompute
			}

			if(state.stencilActive)
			{
				sBuffer += *Pointer<Int>(r.data + OFFSET(DrawData,stencilPitchB)) << (1 + sw::log2(clusterCount));   // FIXME: Precompute
			}

			// Next row pair for this cluster
			y += 2 * clusterCount;
		}
		Until(y >= yMax)
	}
}
