// SwiftShader Software Renderer
//
// Copyright(c) 2005-2012 TransGaming Inc.
//
// All rights reserved. No part of this software may be copied, distributed, transmitted,
// transcribed, stored in a retrieval system, translated into any human or computer
// language by any means, or disclosed to third parties without the explicit written
// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
// or implied, including but not limited to any patent rights, are granted to you.
//

#include "Renderer.hpp"

#include "Clipper.hpp"
#include "Math.hpp"
#include "FrameBuffer.hpp"
#include "Timer.hpp"
#include "Surface.hpp"
#include "Half.hpp"
#include "Primitive.hpp"
#include "Polygon.hpp"
#include "SwiftConfig.hpp"
#include "MutexLock.hpp"
#include "CPUID.hpp"
#include "Memory.hpp"
#include "Resource.hpp"
#include "Constants.hpp"
#include "Debug.hpp"
#include "Reactor/Reactor.hpp"

#include <malloc.h>
#include <assert.h>

#undef max

// Passed to the SwiftConfig constructor when a Renderer is created.
bool disableServer = true;

#ifndef NDEBUG
// Debug builds only: Renderer::draw() silently drops any draw call whose primitive
// count is below minPrimitives or above maxPrimitives.
unsigned int minPrimitives = 1;
unsigned int maxPrimitives = 1 << 21;
#endif

namespace sw
{
	// Settings defined in other translation units. These four, together with
	// exactColorRounding below, are overwritten by the Renderer constructor.
	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
	extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
	extern bool booleanFaceRegister;
	extern bool fullPixelPositionRegister;

	extern bool forceWindowed;
	extern bool complementaryDepthBuffer;   // When set, Renderer::draw() negates the depth range and uses 1 - near
	extern bool postBlendSRGB;
	extern bool exactColorRounding;
	extern Context::TransparencyAntialiasing transparencyAntialiasing;
	extern bool forceClearRegisters;

	extern bool precacheVertex;
	extern bool precacheSetup;
	extern bool precachePixel;

	int batchSize = 128;    // Primitives per batch; Renderer::draw() divides this by the multisample count
	int threadCount = 1;    // Number of worker threads considered by Renderer::scheduleTask()
	int unitCount = 1;      // Number of primitive units scanned by Renderer::findAvailableTasks()
	int clusterCount = 1;   // Number of pixel clusters scanned by Renderer::findAvailableTasks()

	// NOTE(review): presumably select the accuracy of generated log/exp/rcp/rsq code - confirm.
	// Renderer::threadFunction() enables flush-to-zero and denormals-are-zero when logPrecision < IEEE.
	TranscendentalPrecision logPrecision = ACCURATE;
	TranscendentalPrecision expPrecision = ACCURATE;
	TranscendentalPrecision rcpPrecision = ACCURATE;
	TranscendentalPrecision rsqPrecision = ACCURATE;
	bool perspectiveCorrection = true;

	// Argument bundle that Renderer::threadFunction() receives through its void* parameter.
	struct Parameters
	{
		Renderer *renderer;   // Renderer whose threadLoop() the thread runs
		int threadIndex;      // Index passed on to threadLoop()
	};

	// Creates an unused draw call slot and allocates its DrawData block.
	DrawCall::DrawCall()
	{
		// -1 marks the slot as free; Renderer::draw() searches for this value
		references = -1;
		queries = 0;

		// Non-zero dirty counts make the first draw that uses this slot copy
		// that many shader constants into the DrawData (see Renderer::draw())
		psDirtyConstB = 16;
		psDirtyConstI = 16;
		psDirtyConstF = 224;

		vsDirtyConstB = 16;
		vsDirtyConstI = 16;
		vsDirtyConstF = 256 + 1;

		// Raw allocation, released with deallocate() in the destructor
		data = (DrawData*)allocate(sizeof(DrawData));
		data->constants = &constants;
	}

	// Releases the query list (if one is still attached) and the DrawData block
	// obtained with allocate() in the constructor.
	DrawCall::~DrawCall()
	{
		delete queries;

		deallocate(data);
	}

	// Publishes the coordinate/register conventions to the sw:: globals, resets all
	// scheduling state, creates the pool of draw call slots and applies the initial
	// configuration.
	Renderer::Renderer(Context *context, bool halfIntegerCoordinates, bool symmetricNormalizedDepth, bool booleanFaceRegister, bool fullPixelPositionRegister, bool exactColorRounding) : context(context), VertexProcessor(context), PixelProcessor(context), SetupProcessor(context), viewport()
	{
		// The constructor arguments shadow the globals, hence the explicit qualification
		sw::halfIntegerCoordinates = halfIntegerCoordinates;
		sw::symmetricNormalizedDepth = symmetricNormalizedDepth;
		sw::booleanFaceRegister = booleanFaceRegister;
		sw::fullPixelPositionRegister = fullPixelPositionRegister;
		sw::exactColorRounding = exactColorRounding;

		setRenderTarget(0, 0);
		clipper = new Clipper();

		// Every cached matrix and the clip planes start out stale
		updateViewMatrix = true;
		updateBaseMatrix = true;
		updateProjectionMatrix = true;
		updateClipPlanes = true;

		#if PERF_HUD
			resetTimers();
		#endif

		// No threads, events or batch buffers exist yet
		for(int slot = 0; slot < 16; slot++)
		{
			vertexTask[slot] = 0;

			worker[slot] = 0;
			resume[slot] = 0;
			suspend[slot] = 0;

			triangleBatch[slot] = 0;
			primitiveBatch[slot] = 0;
		}

		threadsAwake = 0;
		resumeApp = new Event();

		// Empty draw list and task queue
		currentDraw = 0;
		nextDraw = 0;

		qHead = 0;
		qSize = 0;

		for(int slot = 0; slot < DRAW_COUNT; slot++)
		{
			DrawCall *call = new DrawCall();

			drawCall[slot] = call;
			drawList[slot] = call;
		}

		for(int unit = 0; unit < 16; unit++)
		{
			primitiveProgress[unit].init();
		}

		for(int cluster = 0; cluster < 16; cluster++)
		{
			pixelProgress[cluster].init();
		}

		clipFlags = 0;

		swiftConfig = new SwiftConfig(disableServer);
		updateConfiguration(true);

		sync = new Resource(0);
	}

	// Releases everything the constructor created: the sync resource, the clipper,
	// the worker threads, the application wake-up event, the draw call slots and
	// the configuration object.
	Renderer::~Renderer()
	{
		sync->destruct();   // Released through destruct() rather than delete

		delete clipper;
		clipper = 0;

		terminateThreads();
		delete resumeApp;

		// Free the slots allocated in the constructor (drawList only aliases them)
		for(int draw = 0; draw < DRAW_COUNT; draw++)
		{
			delete drawCall[draw];
		}

		delete swiftConfig;
	}

	// Forwards a surface-to-surface copy of sRect in 'source' to dRect in 'dest' to the
	// renderer's blitter; 'filter' is passed through unchanged.
	void Renderer::blit(Surface *source, const Rect &sRect, Surface *dest, const Rect &dRect, bool filter)
	{
		blitter.blit(source, sRect, dest, dRect, filter);
	}

	// Captures the current render state into a free DrawCall slot and queues it for the
	// worker threads. One draw call is queued per supersample pass.
	//
	//   drawType    - primitive topology and index format; stored in the context and the draw call
	//   indexOffset - byte offset added to the locked index buffer pointer (only used when an
	//                 index buffer is bound)
	//   count       - number of primitives to render; a count of zero is ignored
	//   update      - when true the vertex/setup/pixel states and routines are re-derived; they
	//                 are also re-derived whenever the multisample mask changes between passes
	void Renderer::draw(Context::DrawType drawType, unsigned int indexOffset, unsigned int count, bool update)
	{
		#ifndef NDEBUG
			if(count < minPrimitives || count > maxPrimitives)
			{
				return;
			}
		#endif

		if(count == 0)
		{
			// An empty draw would be queued with zero batch references. findAvailableTasks()
			// skips such a draw without creating a task, so finishRendering() would never
			// free its slot or release the locks acquired below.
			return;
		}

		context->drawType = drawType;

		updateConfiguration();
		updateClipper();

		int ss = context->getSuperSampleCount();
		int ms = context->getMultiSampleCount();

		for(int q = 0; q < ss; q++)
		{
			// Select the 'ms' sample mask bits that belong to supersample pass 'q'
			int oldMultiSampleMask = context->multiSampleMask;
			context->multiSampleMask = (context->sampleMask >> (ms * q)) & ((unsigned)0xFFFFFFFF >> (32 - ms));

			if(!context->multiSampleMask)
			{
				continue;   // All samples of this pass are masked out
			}

			// Released by finishRendering() once the last batch of this draw has been rendered
			sync->lock(sw::PRIVATE);

			if(update || oldMultiSampleMask != context->multiSampleMask)
			{
				vertexState = VertexProcessor::update();
				setupState = SetupProcessor::update();
				pixelState = PixelProcessor::update();

				vertexRoutine = VertexProcessor::routine(vertexState);
				setupRoutine = SetupProcessor::routine(setupState);
				pixelRoutine = PixelProcessor::routine(pixelState);
			}

			int batch = batchSize / ms;

			// Choose the setup function; wireframe and vertex fill modes process one triangle per batch
			if(context->isDrawTriangle())
			{
				switch(context->fillMode)
				{
				case Context::FILL_SOLID:
					setupPrimitives = setupSolidTriangles;
					break;
				case Context::FILL_WIREFRAME:
					setupPrimitives = setupWireframeTriangle;
					batch = 1;
					break;
				case Context::FILL_VERTEX:
					setupPrimitives = setupVertexTriangle;
					batch = 1;
					break;
				default: ASSERT(false);
				}
			}
			else if(context->isDrawLine())
			{
				setupPrimitives = setupLines;
			}
			else   // Point draw
			{
				setupPrimitives = setupPoints;
			}

			DrawCall *draw = 0;

			// Wait for a draw call slot that is not in use (references == -1);
			// finishRendering() signals resumeApp whenever it frees one
			do
			{
				for(int i = 0; i < DRAW_COUNT; i++)
				{
					if(drawCall[i]->references == -1)
					{
						draw = drawCall[i];
						drawList[nextDraw % DRAW_COUNT] = draw;

						break;
					}
				}

				if(!draw)
				{
					resumeApp->wait();
				}
			}
			while(!draw);

			DrawData *data = draw->data;

			// Give the draw call its own copy of the active query list and keep each query
			// referenced until finishRendering() has accumulated the results
			if(queries.size() != 0)
			{
				for(std::list<Query*>::iterator query = queries.begin(); query != queries.end(); query++)
				{
					atomicIncrement(&(*query)->reference);
				}

				draw->queries = new std::list<Query*>(queries);
			}

			draw->drawType = drawType;
			draw->batchSize = batch;

			// Unbound again in finishRendering()
			vertexRoutine->bind();
			setupRoutine->bind();
			pixelRoutine->bind();

			draw->vertexRoutine = vertexRoutine;
			draw->setupRoutine = setupRoutine;
			draw->pixelRoutine = pixelRoutine;
			draw->vertexPointer = (VertexProcessor::RoutinePointer)vertexRoutine->getEntry();
			draw->setupPointer = (SetupProcessor::RoutinePointer)setupRoutine->getEntry();
			draw->pixelPointer = (PixelProcessor::RoutinePointer)pixelRoutine->getEntry();
			draw->setupPrimitives = setupPrimitives;
			draw->setupState = setupState;

			// Vertex streams: record buffer and stride, and lock each bound resource
			for(int i = 0; i < 16; i++)
			{
				draw->vertexStream[i] = context->input[i].resource;
				data->input[i] = context->input[i].buffer;
				data->stride[i] = context->input[i].stride;

				if(draw->vertexStream[i])
				{
					draw->vertexStream[i]->lock(PUBLIC, PRIVATE);
				}
			}

			if(context->indexBuffer)
			{
				data->indices = (unsigned char*)context->indexBuffer->lock(PUBLIC, PRIVATE) + indexOffset;
			}

			draw->indexBuffer = context->indexBuffer;

			// Slots 0-15 hold pixel samplers, slots 16-19 vertex samplers
			for(int sampler = 0; sampler < 20; sampler++)
			{
				draw->texture[sampler] = 0;
			}

			for(int sampler = 0; sampler < 16; sampler++)
			{
				if(pixelState.sampler[sampler].textureType != TEXTURE_NULL)
				{
					draw->texture[sampler] = context->texture[sampler];
					draw->texture[sampler]->lock(PUBLIC, isReadWriteTexture(sampler) ? MANAGED : PRIVATE);   // If the texture is both read and written, use the same read/write lock as render targets

					data->mipmap[sampler] = context->sampler[sampler].getTextureData();
				}
			}

			// Pixel shader constants: copy only what is flagged dirty for this slot
			if(context->pixelShader)
			{
				if(draw->psDirtyConstF)
				{
					memcpy(&data->ps.cW, PixelProcessor::cW, sizeof(word4) * 4 * (draw->psDirtyConstF < 8 ? draw->psDirtyConstF : 8));
					memcpy(&data->ps.c, PixelProcessor::c, sizeof(float4) * draw->psDirtyConstF);
					draw->psDirtyConstF = 0;
				}

				if(draw->psDirtyConstI)
				{
					memcpy(&data->ps.i, PixelProcessor::i, sizeof(int4) * draw->psDirtyConstI);
					draw->psDirtyConstI = 0;
				}

				if(draw->psDirtyConstB)
				{
					memcpy(&data->ps.b, PixelProcessor::b, sizeof(bool) * draw->psDirtyConstB);
					draw->psDirtyConstB = 0;
				}
			}

			// Texture stage uniforms are only copied for pixel shader versions up to 1.4 (0x0104)
			if(context->pixelShaderVersion() <= 0x0104)
			{
				for(int stage = 0; stage < 8; stage++)
				{
					if(pixelState.textureStage[stage].stageOperation != TextureStage::STAGE_DISABLE || context->pixelShader)
					{
						data->textureStage[stage] = context->textureStage[stage].uniforms;
					}
					else break;
				}
			}

			if(context->vertexShader)
			{
				// Vertex texture samplers are only set up for vertex shader version 3.0 (0x0300) and up
				if(context->vertexShader->getVersion() >= 0x0300)
				{
					for(int sampler = 0; sampler < 4; sampler++)
					{
						if(vertexState.samplerState[sampler].textureType != TEXTURE_NULL)
						{
							draw->texture[16 + sampler] = context->texture[16 + sampler];
							draw->texture[16 + sampler]->lock(PUBLIC, PRIVATE);

							data->mipmap[16 + sampler] = context->sampler[16 + sampler].getTextureData();
						}
					}
				}

				if(draw->vsDirtyConstF)
				{
					memcpy(&data->vs.c, VertexProcessor::c, sizeof(float4) * draw->vsDirtyConstF);
					draw->vsDirtyConstF = 0;
				}

				if(draw->vsDirtyConstI)
				{
					memcpy(&data->vs.i, VertexProcessor::i, sizeof(int4) * draw->vsDirtyConstI);
					draw->vsDirtyConstI = 0;
				}

				if(draw->vsDirtyConstB)
				{
					memcpy(&data->vs.b, VertexProcessor::b, sizeof(bool) * draw->vsDirtyConstB);
					draw->vsDirtyConstB = 0;
				}
			}
			else
			{
				// No vertex shader: copy the 'ff' state instead and flag all vertex shader
				// constants dirty so they are copied again when a shader next uses this slot
				data->ff = ff;

				draw->vsDirtyConstF = 256 + 1;
				draw->vsDirtyConstI = 16;
				draw->vsDirtyConstB = 16;
			}

			if(pixelState.stencilActive)
			{
				data->stencil[0] = stencil;
				data->stencil[1] = stencilCCW;
			}

			if(pixelState.fogActive)
			{
				data->fog = fog;
			}

			if(setupState.isDrawPoint)
			{
				data->point = point;
			}

			data->factor = factor;

			// Alpha-to-coverage: spread one alpha threshold per sample around the alpha reference
			if(pixelState.transparencyAntialiasing == Context::TRANSPARENCY_ALPHA_TO_COVERAGE)
			{
				float ref = (float)context->alphaReference * (1.0f / 255.0f);
				float margin = sw::min(ref, 1.0f - ref);

				if(ms == 4)
				{
					data->a2c0 = replicate(ref - margin * 0.6f);
					data->a2c1 = replicate(ref - margin * 0.2f);
					data->a2c2 = replicate(ref + margin * 0.2f);
					data->a2c3 = replicate(ref + margin * 0.6f);
				}
				else if(ms == 2)
				{
					data->a2c0 = replicate(ref - margin * 0.3f);
					data->a2c1 = replicate(ref + margin * 0.3f);
				}
				else ASSERT(false);
			}

			// Per-cluster occlusion counters; summed into the queries by finishRendering()
			if(pixelState.occlusionEnabled)
			{
				for(int cluster = 0; cluster < clusterCount; cluster++)
				{
					data->occlusion[cluster] = 0;
				}
			}

			#if PERF_PROFILE
				for(int cluster = 0; cluster < clusterCount; cluster++)
				{
					for(int i = 0; i < PERF_TIMERS; i++)
					{
						data->cycles[i][cluster] = 0;
					}
				}
			#endif

			// Viewport
			{
				float W = 0.5f * viewport.width;
				float H = 0.5f * viewport.height;
				float X0 = viewport.x0 + W;
				float Y0 = viewport.y0 + H;
				float N = viewport.minZ;
				float F = viewport.maxZ;
				float Z = F - N;

				if(context->isDrawTriangle(false))
				{
					N += depthBias;
				}

				if(complementaryDepthBuffer)
				{
					Z = -Z;
					N = 1 - N;
				}

				static const float X[5][16] =   // Fragment offsets
				{
					{+0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 1 sample
					{-0.2500f, +0.2500f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 2 samples
					{-0.3000f, +0.1000f, +0.3000f, -0.1000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 4 samples
					{+0.1875f, -0.3125f, +0.3125f, -0.4375f, -0.0625f, +0.4375f, +0.0625f, -0.1875f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 8 samples
					{+0.2553f, -0.1155f, +0.1661f, -0.1828f, +0.2293f, -0.4132f, -0.1773f, -0.0577f, +0.3891f, -0.4656f, +0.4103f, +0.4248f, -0.2109f, +0.3966f, -0.2664f, -0.3872f}    // 16 samples
				};

				static const float Y[5][16] =   // Fragment offsets
				{
					{+0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 1 sample
					{-0.2500f, +0.2500f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 2 samples
					{-0.1000f, -0.3000f, +0.1000f, +0.3000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 4 samples
					{-0.4375f, -0.3125f, -0.1875f, -0.0625f, +0.0625f, +0.1875f, +0.3125f, +0.4375f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 8 samples
					{-0.4503f, +0.1883f, +0.3684f, -0.4668f, -0.0690f, -0.1315f, +0.4999f, +0.0728f, +0.1070f, -0.3086f, +0.3725f, -0.1547f, -0.1102f, -0.3588f, +0.1789f, +0.0269f}    // 16 samples
				};

				int s = sw::log2(ss);   // Row of the offset tables for this supersample count

				data->Wx16 = replicate(W * 16);
				data->Hx16 = replicate(H * 16);
				data->X0x16 = replicate(X0 * 16);
				data->Y0x16 = replicate(Y0 * 16);
				data->XXXX = replicate(X[s][q] / W);
				data->YYYY = replicate(Y[s][q] / H);
				data->halfPixelX = replicate(0.5f / W);
				data->halfPixelY = replicate(0.5f / H);
				data->viewportHeight = abs(viewport.height);
				data->slopeDepthBias = slopeDepthBias;
				data->depthRange = Z;
				data->depthNear = N;
				draw->clipFlags = clipFlags;

				// Only the enabled user clip planes are copied
				if(clipFlags)
				{
					if(clipFlags & Clipper::CLIP_PLANE0) data->clipPlane[0] = clipPlane[0];
					if(clipFlags & Clipper::CLIP_PLANE1) data->clipPlane[1] = clipPlane[1];
					if(clipFlags & Clipper::CLIP_PLANE2) data->clipPlane[2] = clipPlane[2];
					if(clipFlags & Clipper::CLIP_PLANE3) data->clipPlane[3] = clipPlane[3];
					if(clipFlags & Clipper::CLIP_PLANE4) data->clipPlane[4] = clipPlane[4];
					if(clipFlags & Clipper::CLIP_PLANE5) data->clipPlane[5] = clipPlane[5];
				}
			}

			// Target
			{
				// Lock the color and depth/stencil buffers at the slice of this supersample pass;
				// they are unlocked in finishRendering()
				for(int index = 0; index < 4; index++)
				{
					draw->renderTarget[index] = context->renderTarget[index];

					if(draw->renderTarget[index])
					{
						data->colorBuffer[index] = (unsigned int*)context->renderTarget[index]->lockInternal(0, 0, q * ms, LOCK_READWRITE, MANAGED);
						data->colorPitchB[index] = context->renderTarget[index]->getInternalPitchB();
						data->colorSliceB[index] = context->renderTarget[index]->getInternalSliceB();
					}
				}

				draw->depthStencil = context->depthStencil;

				if(draw->depthStencil)
				{
					data->depthBuffer = (float*)context->depthStencil->lockInternal(0, 0, q * ms, LOCK_READWRITE, MANAGED);
					data->depthPitchB = context->depthStencil->getInternalPitchB();
					data->depthSliceB = context->depthStencil->getInternalSliceB();

					data->stencilBuffer = (unsigned char*)context->depthStencil->lockStencil(q * ms, MANAGED);
					data->stencilPitchB = context->depthStencil->getStencilPitchB();
					data->stencilSliceB = context->depthStencil->getStencilSliceB();
				}
			}

			// Scissor
			{
				data->scissorX0 = scissor.x0;
				data->scissorX1 = scissor.x1;
				data->scissorY0 = scissor.y0;
				data->scissorY1 = scissor.y1;
			}

			draw->primitive = 0;
			draw->count = count;

			// One reference per batch; finishRendering() releases the draw when this reaches zero
			draw->references = (count + batch - 1) / batch;

			// Publish the draw call to the scheduler
			mutex.lock();
			nextDraw++;
			mutex.unlock();

			// Wake the first worker thread if every thread is currently suspended
			if(!threadsAwake)
			{
				suspend[0]->wait();

				threadsAwake = 1;
				task[0].type = Task::RESUME;

				resume[0]->signal();
			}
		}
	}

	void Renderer::threadFunction(void *parameters)
	{
		Renderer *renderer = static_cast<Parameters*>(parameters)->renderer;
		int threadIndex = static_cast<Parameters*>(parameters)->threadIndex;

		if(logPrecision < IEEE)
		{
			CPUID::setFlushToZero(true);
			CPUID::setDenormalsAreZero(true);
		}

		renderer->threadLoop(threadIndex);
	}

	// Body of a worker thread: runs tasks until this thread's task becomes SUSPEND,
	// then signals its suspend event and blocks on its resume event. Repeats until
	// exitThreads is set.
	void Renderer::threadLoop(int threadIndex)
	{
		while(!exitThreads)
		{
			taskLoop(threadIndex);

			suspend[threadIndex]->signal();   // Tell whoever wants to wake us that we are parked
			resume[threadIndex]->wait();      // Sleep until draw() or scheduleTask() resumes this thread
		}
	}

	// Repeatedly fetches and executes tasks for the given thread until the scheduler
	// hands it a SUSPEND task.
	void Renderer::taskLoop(int threadIndex)
	{
		while(task[threadIndex].type != Task::SUSPEND)
		{
			scheduleTask(threadIndex);
			executeTask(threadIndex);
		}
	}

	void Renderer::findAvailableTasks()
	{
		// Find pixel tasks
		for(int cluster = 0; cluster < clusterCount; cluster++)
		{
			if(!pixelProgress[cluster].executing)
			{
				for(int unit = 0; unit < unitCount; unit++)
				{
					if(primitiveProgress[unit].references > 0)   // Contains processed primitives
					{
						if(pixelProgress[cluster].drawCall == primitiveProgress[unit].drawCall)
						{
							if(pixelProgress[cluster].processedPrimitives == primitiveProgress[unit].firstPrimitive)   // Previous primitives have been rendered
							{
								Task &task = taskQueue[qHead];
								task.type = Task::PIXELS;
								task.primitiveUnit = unit;
								task.pixelCluster = cluster;

								pixelProgress[cluster].executing = true;

								// Commit to the task queue
								qHead = (qHead + 1) % 32;
								qSize++;

								break;
							}
						}
					}
				}
			}
		}
	
		// Find primitive tasks
		if(currentDraw == nextDraw)
		{
			return;   // No more primitives to process
		}

		for(int unit = 0; unit < unitCount; unit++)
		{
			DrawCall *draw = drawList[currentDraw % DRAW_COUNT];

			if(draw->primitive >= draw->count)
			{
				currentDraw++;

				if(currentDraw == nextDraw)
				{
					return;   // No more primitives to process
				}

				draw = drawList[currentDraw % DRAW_COUNT];
			}

			if(!primitiveProgress[unit].references)   // Task not already being executed and not still in use by a pixel unit
			{
				int primitive = draw->primitive;
				int count = draw->count;
				int batch = draw->batchSize;

				primitiveProgress[unit].drawCall = currentDraw;
				primitiveProgress[unit].firstPrimitive = primitive;
				primitiveProgress[unit].primitiveCount = count - primitive >= batch ? batch : count - primitive;

				draw->primitive += batch;

				Task &task = taskQueue[qHead];
				task.type = Task::PRIMITIVES;
				task.primitiveUnit = unit;

				primitiveProgress[unit].references = -1;

				// Commit to the task queue
				qHead = (qHead + 1) % 32;
				qSize++;
			}
		}
	}

	// Assigns the next task to the calling worker thread. With the scheduler mutex held
	// it tops up the task queue when needed, pops the oldest task and wakes suspended
	// threads for the remaining ones. If no task exists, the thread is marked as
	// suspended instead.
	void Renderer::scheduleTask(int threadIndex)
	{
		mutex.lock();

		// Look for more work only while the queue could not keep all sleeping threads busy
		if((int)qSize < threadCount - threadsAwake + 1)
		{
			findAvailableTasks();
		}

		if(qSize == 0)
		{
			// Nothing to do: this thread goes to sleep
			task[threadIndex].type = Task::SUSPEND;

			threadsAwake--;
		}
		else
		{
			// Pop the oldest entry of the circular queue
			task[threadIndex] = taskQueue[(qHead - qSize) % 32];
			qSize--;

			if(threadsAwake != threadCount)
			{
				int pending = qSize - threadsAwake + 1;

				for(int other = 0; other < threadCount && pending > 0; other++)
				{
					if(task[other].type != Task::SUSPEND)
					{
						continue;
					}

					// Wait until the thread is actually parked before resuming it
					suspend[other]->wait();
					task[other].type = Task::RESUME;
					resume[other]->signal();

					threadsAwake++;
					pending--;
				}
			}
		}

		mutex.unlock();
	}

	void Renderer::executeTask(int threadIndex)
	{
		#if PERF_HUD
			int64_t startTick = Timer::ticks();
		#endif

		switch(task[threadIndex].type)
		{
		case Task::PRIMITIVES:
			{
				int unit = task[threadIndex].primitiveUnit;
				
				int input = primitiveProgress[unit].firstPrimitive;
				int count = primitiveProgress[unit].primitiveCount;
				DrawCall *draw = drawList[primitiveProgress[unit].drawCall % DRAW_COUNT];
				int (*setupPrimitives)(Renderer *renderer, int batch, int count) = draw->setupPrimitives;

				processPrimitiveVertices(unit, input, count, draw->count, threadIndex);

				#if PERF_HUD
					int64_t time = Timer::ticks();
					vertexTime[threadIndex] += time - startTick;
					startTick = time;
				#endif

				int visible = setupPrimitives(this, unit, count);

				primitiveProgress[unit].visible = visible;
				primitiveProgress[unit].references = clusterCount;

				#if PERF_HUD
					setupTime[threadIndex] += Timer::ticks() - startTick;
				#endif
			}
			break;
		case Task::PIXELS:
			{
				int unit = task[threadIndex].primitiveUnit;
				int visible = primitiveProgress[unit].visible;

				if(visible > 0)
				{
					int cluster = task[threadIndex].pixelCluster;
					Primitive *primitive = primitiveBatch[unit];
					DrawCall *draw = drawList[pixelProgress[cluster].drawCall % DRAW_COUNT];
					DrawData *data = draw->data;
					PixelProcessor::RoutinePointer pixelRoutine = draw->pixelPointer;

					pixelRoutine(primitive, visible, cluster, data);
				}

				finishRendering(task[threadIndex]);

				#if PERF_HUD
					pixelTime[threadIndex] += Timer::ticks() - startTick;
				#endif
			}
			break;
		case Task::RESUME:
			break;
		case Task::SUSPEND:
			break;
		default:
			ASSERT(false);
		}
	}

	// Acquires the renderer's sync resource with PUBLIC access and releases it again.
	// draw() locks the same resource with PRIVATE access for every queued draw and
	// finishRendering() unlocks it when a draw completes.
	// NOTE(review): presumably this blocks until all queued draws have finished -
	// depends on Resource lock semantics not shown here; confirm.
	void Renderer::synchronize()
	{
		sync->lock(sw::PUBLIC);
		sync->unlock();
	}

	// Bookkeeping after a pixel task: advances the cluster's progress, drops one reference
	// on the primitive batch and, when the last batch of the draw call is done, delivers
	// query results, unlocks every resource the draw call held and frees its slot.
	void Renderer::finishRendering(Task &pixelTask)
	{
		int unit = pixelTask.primitiveUnit;
		int cluster = pixelTask.pixelCluster;

		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall % DRAW_COUNT];
		DrawData &data = *draw.data;
		int firstPrimitive = primitiveProgress[unit].firstPrimitive;
		int primitiveCount = primitiveProgress[unit].primitiveCount;

		pixelProgress[cluster].processedPrimitives = firstPrimitive + primitiveCount;

		// This cluster has rendered the whole draw call: move on to the next one
		if(pixelProgress[cluster].processedPrimitives >= draw.count)
		{
			pixelProgress[cluster].drawCall++;
			pixelProgress[cluster].processedPrimitives = 0;
		}

		int batchReferences = atomicDecrement(&primitiveProgress[unit].references);

		if(batchReferences == 0)   // Last cluster to finish this batch
		{
			int drawReferences = atomicDecrement(&draw.references);

			if(drawReferences == 0)   // Last batch of the draw call
			{
				#if PERF_PROFILE
					for(int c = 0; c < clusterCount; c++)
					{
						for(int i = 0; i < PERF_TIMERS; i++)
						{
							profiler.cycles[i] += data.cycles[i][c];
						}
					}
				#endif

				if(draw.queries)
				{
					// Add the per-cluster occlusion counts to every query attached to this draw
					std::list<Query*>::iterator it = draw.queries->begin();

					while(it != draw.queries->end())
					{
						Query *query = *it;

						for(int c = 0; c < clusterCount; c++)
						{
							atomicAdd((volatile int*)&query->data, data.occlusion[c]);
						}

						atomicDecrement(&query->reference);

						it++;
					}

					delete draw.queries;
					draw.queries = 0;
				}

				// Release everything draw() locked, in the same order as before
				for(int target = 0; target < 4; target++)
				{
					if(draw.renderTarget[target])
					{
						draw.renderTarget[target]->unlockInternal();
					}
				}

				if(draw.depthStencil)
				{
					draw.depthStencil->unlockInternal();
					draw.depthStencil->unlockStencil();
				}

				for(int sampler = 0; sampler < 16 + 4; sampler++)
				{
					if(draw.texture[sampler])
					{
						draw.texture[sampler]->unlock();
					}
				}

				for(int stream = 0; stream < 16; stream++)
				{
					if(draw.vertexStream[stream])
					{
						draw.vertexStream[stream]->unlock();
					}
				}

				if(draw.indexBuffer)
				{
					draw.indexBuffer->unlock();
				}

				draw.vertexRoutine->unbind();
				draw.setupRoutine->unbind();
				draw.pixelRoutine->unbind();

				sync->unlock();

				// Mark the slot as free and wake draw() in case it is waiting for one
				draw.references = -1;
				resumeApp->signal();
			}
		}

		pixelProgress[cluster].executing = false;
	}

	// File-local helpers for Renderer::processPrimitiveVertices(). Each one expands 'count'
	// primitives of one topology, beginning at primitive number 'start', into triples of
	// vertex indices read from an index buffer with elements of type Index.

	// Point list: one index per primitive, repeated three times.
	template<class Index>
	static void expandIndexedPointList(unsigned int (*triple)[3], const void *indices, unsigned int start, unsigned int count)
	{
		const Index *cursor = (const Index*)indices + start;

		for(unsigned int n = 0; n < count; n++, cursor += 1)
		{
			triple[n][0] = *cursor;
			triple[n][1] = *cursor;
			triple[n][2] = *cursor;
		}
	}

	// Line list: two indices per primitive; the end point is repeated.
	template<class Index>
	static void expandIndexedLineList(unsigned int (*triple)[3], const void *indices, unsigned int start, unsigned int count)
	{
		const Index *cursor = (const Index*)indices + 2 * start;

		for(unsigned int n = 0; n < count; n++, cursor += 2)
		{
			triple[n][0] = cursor[0];
			triple[n][1] = cursor[1];
			triple[n][2] = cursor[1];
		}
	}

	// Line strip: each primitive shares its start point with the previous end point.
	template<class Index>
	static void expandIndexedLineStrip(unsigned int (*triple)[3], const void *indices, unsigned int start, unsigned int count)
	{
		const Index *cursor = (const Index*)indices + start;

		for(unsigned int n = 0; n < count; n++, cursor += 1)
		{
			triple[n][0] = cursor[0];
			triple[n][1] = cursor[1];
			triple[n][2] = cursor[1];
		}
	}

	// Line loop: like a strip, but positions wrap around after 'loop' entries.
	template<class Index>
	static void expandIndexedLineLoop(unsigned int (*triple)[3], const void *indices, unsigned int start, unsigned int count, unsigned int loop)
	{
		const Index *base = (const Index*)indices;

		for(unsigned int n = 0; n < count; n++)
		{
			triple[n][0] = base[(start + n + 0) % loop];
			triple[n][1] = base[(start + n + 1) % loop];
			triple[n][2] = base[(start + n + 1) % loop];
		}
	}

	// Triangle list: three consecutive indices per primitive.
	template<class Index>
	static void expandIndexedTriangleList(unsigned int (*triple)[3], const void *indices, unsigned int start, unsigned int count)
	{
		const Index *cursor = (const Index*)indices + 3 * start;

		for(unsigned int n = 0; n < count; n++, cursor += 3)
		{
			triple[n][0] = cursor[0];
			triple[n][1] = cursor[1];
			triple[n][2] = cursor[2];
		}
	}

	// Triangle strip: the second and third index swap on odd primitives to keep the winding.
	template<class Index>
	static void expandIndexedTriangleStrip(unsigned int (*triple)[3], const void *indices, unsigned int start, unsigned int count)
	{
		const Index *cursor = (const Index*)indices + start;

		for(unsigned int n = 0; n < count; n++, cursor += 1)
		{
			const unsigned int odd = (start + n) & 1;

			triple[n][0] = cursor[0];
			triple[n][1] = cursor[odd + 1];
			triple[n][2] = cursor[(odd ^ 1) + 1];
		}
	}

	// Triangle fan: every primitive uses the first index of the buffer as its third vertex.
	template<class Index>
	static void expandIndexedTriangleFan(unsigned int (*triple)[3], const void *indices, unsigned int start, unsigned int count)
	{
		const Index *base = (const Index*)indices;

		for(unsigned int n = 0; n < count; n++)
		{
			triple[n][0] = base[start + n + 1];
			triple[n][1] = base[start + n + 2];
			triple[n][2] = base[0];
		}
	}

	// Builds the vertex index triples for one batch of primitives and runs the draw call's
	// vertex routine on them.
	//
	//   unit   - primitive unit whose triangle batch receives the output
	//   start  - number of the first primitive of the batch within the draw call
	//   count  - number of primitives in the batch
	//   loop   - total primitive count of the draw call (wrap-around point for line loops)
	//   thread - index of the executing thread; selects the VertexTask (and its vertex cache)
	void Renderer::processPrimitiveVertices(int unit, unsigned int start, unsigned int count, unsigned int loop, int thread)
	{
		Triangle *triangle = triangleBatch[unit];
		DrawCall *draw = drawList[primitiveProgress[unit].drawCall % DRAW_COUNT];
		DrawData *data = draw->data;
		VertexTask *task = vertexTask[thread];

		const void *indices = data->indices;
		VertexProcessor::RoutinePointer vertexRoutine = draw->vertexPointer;

		// Cached vertices belong to one draw call only
		if(task->vertexCache.drawCall != primitiveProgress[unit].drawCall)
		{
			task->vertexCache.clear();
			task->vertexCache.drawCall = primitiveProgress[unit].drawCall;
		}

		unsigned int batch[128][3];   // FIXME: Adjust to dynamic batch size

		switch(draw->drawType)
		{
		// Non-indexed topologies: vertex indices follow directly from the primitive number
		case Context::DRAW_POINTLIST:
			for(unsigned int n = 0; n < count; n++)
			{
				const unsigned int v = start + n;

				batch[n][0] = v;
				batch[n][1] = v;
				batch[n][2] = v;
			}
			break;
		case Context::DRAW_LINELIST:
			for(unsigned int n = 0; n < count; n++)
			{
				const unsigned int v = 2 * start + 2 * n;

				batch[n][0] = v + 0;
				batch[n][1] = v + 1;
				batch[n][2] = v + 1;
			}
			break;
		case Context::DRAW_LINESTRIP:
			for(unsigned int n = 0; n < count; n++)
			{
				const unsigned int v = start + n;

				batch[n][0] = v + 0;
				batch[n][1] = v + 1;
				batch[n][2] = v + 1;
			}
			break;
		case Context::DRAW_LINELOOP:
			for(unsigned int n = 0; n < count; n++)
			{
				const unsigned int v = start + n;

				batch[n][0] = (v + 0) % loop;
				batch[n][1] = (v + 1) % loop;
				batch[n][2] = (v + 1) % loop;
			}
			break;
		case Context::DRAW_TRIANGLELIST:
			for(unsigned int n = 0; n < count; n++)
			{
				const unsigned int v = 3 * start + 3 * n;

				batch[n][0] = v + 0;
				batch[n][1] = v + 1;
				batch[n][2] = v + 2;
			}
			break;
		case Context::DRAW_TRIANGLESTRIP:
			for(unsigned int n = 0; n < count; n++)
			{
				const unsigned int v = start + n;
				const unsigned int odd = v & 1;

				batch[n][0] = v + 0;
				batch[n][1] = v + odd + 1;
				batch[n][2] = v + (odd ^ 1) + 1;
			}
			break;
		case Context::DRAW_TRIANGLEFAN:
			for(unsigned int n = 0; n < count; n++)
			{
				const unsigned int v = start + n;

				batch[n][0] = v + 1;
				batch[n][1] = v + 2;
				batch[n][2] = 0;
			}
			break;

		// Indexed topologies: same patterns, read through 8-, 16- or 32-bit index buffers
		case Context::DRAW_INDEXEDPOINTLIST8:       expandIndexedPointList<unsigned char>(batch, indices, start, count);            break;
		case Context::DRAW_INDEXEDPOINTLIST16:      expandIndexedPointList<unsigned short>(batch, indices, start, count);           break;
		case Context::DRAW_INDEXEDPOINTLIST32:      expandIndexedPointList<unsigned int>(batch, indices, start, count);             break;
		case Context::DRAW_INDEXEDLINELIST8:        expandIndexedLineList<unsigned char>(batch, indices, start, count);             break;
		case Context::DRAW_INDEXEDLINELIST16:       expandIndexedLineList<unsigned short>(batch, indices, start, count);            break;
		case Context::DRAW_INDEXEDLINELIST32:       expandIndexedLineList<unsigned int>(batch, indices, start, count);              break;
		case Context::DRAW_INDEXEDLINESTRIP8:       expandIndexedLineStrip<unsigned char>(batch, indices, start, count);            break;
		case Context::DRAW_INDEXEDLINESTRIP16:      expandIndexedLineStrip<unsigned short>(batch, indices, start, count);           break;
		case Context::DRAW_INDEXEDLINESTRIP32:      expandIndexedLineStrip<unsigned int>(batch, indices, start, count);             break;
		case Context::DRAW_INDEXEDLINELOOP8:        expandIndexedLineLoop<unsigned char>(batch, indices, start, count, loop);       break;
		case Context::DRAW_INDEXEDLINELOOP16:       expandIndexedLineLoop<unsigned short>(batch, indices, start, count, loop);      break;
		case Context::DRAW_INDEXEDLINELOOP32:       expandIndexedLineLoop<unsigned int>(batch, indices, start, count, loop);        break;
		case Context::DRAW_INDEXEDTRIANGLELIST8:    expandIndexedTriangleList<unsigned char>(batch, indices, start, count);         break;
		case Context::DRAW_INDEXEDTRIANGLELIST16:   expandIndexedTriangleList<unsigned short>(batch, indices, start, count);        break;
		case Context::DRAW_INDEXEDTRIANGLELIST32:   expandIndexedTriangleList<unsigned int>(batch, indices, start, count);          break;
		case Context::DRAW_INDEXEDTRIANGLESTRIP8:   expandIndexedTriangleStrip<unsigned char>(batch, indices, start, count);        break;
		case Context::DRAW_INDEXEDTRIANGLESTRIP16:  expandIndexedTriangleStrip<unsigned short>(batch, indices, start, count);       break;
		case Context::DRAW_INDEXEDTRIANGLESTRIP32:  expandIndexedTriangleStrip<unsigned int>(batch, indices, start, count);         break;
		case Context::DRAW_INDEXEDTRIANGLEFAN8:     expandIndexedTriangleFan<unsigned char>(batch, indices, start, count);          break;
		case Context::DRAW_INDEXEDTRIANGLEFAN16:    expandIndexedTriangleFan<unsigned short>(batch, indices, start, count);         break;
		case Context::DRAW_INDEXEDTRIANGLEFAN32:    expandIndexedTriangleFan<unsigned int>(batch, indices, start, count);           break;
		default:
			ASSERT(false);
		}

		// Three vertices per primitive, including the repeated ones of points and lines
		task->count = count * 3;
		vertexRoutine(&triangle->v0, (unsigned int*)&batch, task, data);
	}

	// Clips and sets up 'count' solid triangles taken from renderer->triangleBatch[unit].
	// Accepted triangles are written to renderer->primitiveBatch[unit]; the return value is
	// the number of triangles for which the setup routine reported success.
	int Renderer::setupSolidTriangles(Renderer *renderer, int unit, int count)
	{
		DrawCall &draw = *renderer->drawList[renderer->primitiveProgress[unit].drawCall % DRAW_COUNT];
		const SetupProcessor::State &setupState = draw.setupState;
		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
		const DrawData *data = draw.data;

		const int primitiveStep = setupState.multiSample;   // Output slots advanced per accepted triangle
		const int positionRegister = setupState.positionRegister;

		Triangle *batch = renderer->triangleBatch[unit];
		Primitive *output = renderer->primitiveBatch[unit];
		int accepted = 0;

		for(int index = 0; index < count; index++)
		{
			Triangle &current = batch[index];
			Vertex &a = current.v0;
			Vertex &b = current.v1;
			Vertex &c = current.v2;

			// Skip the triangle unless CLIP_FINITE is the only flag common to all three vertices
			if((a.clipFlags & b.clipFlags & c.clipFlags) != Clipper::CLIP_FINITE)
			{
				continue;
			}

			Polygon polygon(&a.v[positionRegister], &b.v[positionRegister], &c.v[positionRegister]);

			int clipFlagsOr = a.clipFlags | b.clipFlags | c.clipFlags | draw.clipFlags;

			// The clipper is only invoked when a vertex or the draw call carries a flag besides CLIP_FINITE
			if(clipFlagsOr != Clipper::CLIP_FINITE && !renderer->clipper->clip(polygon, clipFlagsOr, draw))
			{
				continue;
			}

			if(setupRoutine(output, &current, &polygon, data))
			{
				output += primitiveStep;
				accepted++;
			}
		}

		return accepted;
	}

	// Sets up the three edges of one triangle as lines (wireframe fill mode).
	// Only triangle[0] of renderer->triangleBatch[unit] is read as input; triangle[1] and
	// triangle[2] are overwritten with the remaining two edges. 'count' is not used.
	// Returns the number of edges accepted by setupLine (0 to 3), or 0 if the triangle is culled.
	int Renderer::setupWireframeTriangle(Renderer *renderer, int unit, int count)
	{
		Triangle *triangle = renderer->triangleBatch[unit];
		Primitive *primitive = renderer->primitiveBatch[unit];
		int visible = 0;

		DrawCall &draw = *renderer->drawList[renderer->primitiveProgress[unit].drawCall % DRAW_COUNT];
		SetupProcessor::State &state = draw.setupState;

		const Vertex &v0 = triangle[0].v0;
		const Vertex &v1 = triangle[0].v1;
		const Vertex &v2 = triangle[0].v2;

		// Determinant of the three (x, y, w) vertex positions; its sign is used for culling
		// and half of it is stored as the area of each emitted primitive
		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w + (v0.x * v2.y - v0.y * v2.x) * v1.w + (v2.x * v1.y - v1.x * v2.y) * v0.w;

		if(state.cullMode == Context::CULL_CLOCKWISE)
		{
			if(d >= 0) return 0;
		}
		else if(state.cullMode == Context::CULL_COUNTERCLOCKWISE)
		{
			if(d <= 0) return 0;
		}

		// Copy attributes
		// triangle[0] already holds edge v0-v1; build edges v1-v2 and v2-v0 in the next two slots
		triangle[1].v0 = v1;
		triangle[1].v1 = v2;
		triangle[2].v0 = v2;
		triangle[2].v1 = v0;

		if(state.color[0][0].flat)   // FIXME
		{
			// Flat shading: give the vertices of the two extra edges the colors of the first vertex
			for(int i = 0; i < 2; i++)
			{
				triangle[1].v0.C[i] = triangle[0].v0.C[i];
				triangle[1].v1.C[i] = triangle[0].v0.C[i];
				triangle[2].v0.C[i] = triangle[0].v0.C[i];
				triangle[2].v1.C[i] = triangle[0].v0.C[i];
			}
		}

		for(int i = 0; i < 3; i++)
		{
			if(setupLine(renderer, *primitive, *triangle, draw))
			{
				primitive->area = 0.5f * d;

				primitive++;
				visible++;
			}

			triangle++;
		}

		return visible;
	}
	
	// Sets up the three corners of one triangle as points (point fill mode).
	// Only triangle[0] of renderer->triangleBatch[unit] is read as input; the v0 of
	// triangle[1] and triangle[2] is overwritten with the second and third corner.
	// 'count' is not used. Returns the number of points accepted by setupPoint (0 to 3),
	// or 0 if the triangle is culled.
	int Renderer::setupVertexTriangle(Renderer *renderer, int unit, int count)
	{
		DrawCall &draw = *renderer->drawList[renderer->primitiveProgress[unit].drawCall % DRAW_COUNT];
		const SetupProcessor::State &state = draw.setupState;

		Triangle *corners = renderer->triangleBatch[unit];
		Primitive *output = renderer->primitiveBatch[unit];

		const Vertex &v0 = corners[0].v0;
		const Vertex &v1 = corners[0].v1;
		const Vertex &v2 = corners[0].v2;

		// Determinant of the three (x, y, w) vertex positions; its sign decides culling
		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w + (v0.x * v2.y - v0.y * v2.x) * v1.w + (v2.x * v1.y - v1.x * v2.y) * v0.w;

		const bool culled = (state.cullMode == Context::CULL_CLOCKWISE && d >= 0) ||
		                    (state.cullMode == Context::CULL_COUNTERCLOCKWISE && d <= 0);

		if(culled)
		{
			return 0;
		}

		// Copy attributes: each slot's v0 becomes the point to set up
		corners[1].v0 = v1;
		corners[2].v0 = v2;

		int accepted = 0;

		for(int corner = 0; corner < 3; corner++)
		{
			if(!setupPoint(renderer, *output, corners[corner], draw))
			{
				continue;
			}

			output->area = 0.5f * d;

			output++;
			accepted++;
		}

		return accepted;
	}

	// Runs setupLine on 'count' entries of renderer->triangleBatch[unit] (each entry's v0/v1
	// are the line endpoints) and returns how many were accepted. The output pointer into
	// renderer->primitiveBatch[unit] advances by the multisample count per accepted line.
	int Renderer::setupLines(Renderer *renderer, int unit, int count)
	{
		DrawCall &draw = *renderer->drawList[renderer->primitiveProgress[unit].drawCall % DRAW_COUNT];
		const int primitiveStep = draw.setupState.multiSample;

		Triangle *lines = renderer->triangleBatch[unit];
		Primitive *output = renderer->primitiveBatch[unit];
		int accepted = 0;

		for(int index = 0; index < count; index++)
		{
			if(!setupLine(renderer, *output, lines[index], draw))
			{
				continue;
			}

			output += primitiveStep;
			accepted++;
		}

		return accepted;
	}

	// Runs setupPoint on 'count' entries of renderer->triangleBatch[unit] (each entry's v0
	// is the point) and returns how many were accepted. The output pointer into
	// renderer->primitiveBatch[unit] advances by the multisample count per accepted point.
	int Renderer::setupPoints(Renderer *renderer, int unit, int count)
	{
		DrawCall &draw = *renderer->drawList[renderer->primitiveProgress[unit].drawCall % DRAW_COUNT];
		const int primitiveStep = draw.setupState.multiSample;

		Triangle *points = renderer->triangleBatch[unit];
		Primitive *output = renderer->primitiveBatch[unit];
		int accepted = 0;

		for(int index = 0; index < count; index++)
		{
			if(!setupPoint(renderer, *output, points[index], draw))
			{
				continue;
			}

			output += primitiveStep;
			accepted++;
		}

		return accepted;
	}

	// Converts a line segment (triangle.v0 to triangle.v1) into a polygon, clips it if needed,
	// and hands it to the draw call's setup routine.
	//
	// renderer  - provides the clipper used when any corner has clip flags set
	// primitive - output primitive passed to the setup routine
	// triangle  - holds the two endpoints in v0 and v1; also passed to the setup routine
	// draw      - draw call supplying the setup routine, setup state, draw data and clip flags
	//
	// Returns false if both endpoints have w <= 0, if the projected endpoints coincide, if all
	// polygon corners share a clip flag, or if clipping rejects the polygon; otherwise returns
	// the result of the setup routine.
	bool Renderer::setupLine(Renderer *renderer, Primitive &primitive, Triangle &triangle, const DrawCall &draw)
	{
		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
		const SetupProcessor::State &state = draw.setupState;
		const DrawData &data = *draw.data;

		Vertex &v0 = triangle.v0;
		Vertex &v1 = triangle.v1;

		int pos = state.positionRegister;

		const float4 &P0 = v0.v[pos];
		const float4 &P1 = v1.v[pos];

		// NOTE(review): only the case where both w values are non-positive is rejected here; if
		// exactly one endpoint has w <= 0 the divisions below still use it — confirm this is intended
		if(P0.w <= 0 && P1.w <= 0)
		{
			return false;
		}

		// Presumably the viewport width and height in pixels (Wx16/Hx16 scaled down by 16) — confirm
		const float W = data.Wx16[0] * (1.0f / 16.0f);
		const float H = data.Hx16[0] * (1.0f / 16.0f);

		// Difference between the perspective-divided endpoints, scaled by W and H
		float dx = W * (P1.x / P1.w - P0.x / P0.w);
		float dy = H * (P1.y / P1.w - P0.y / P0.w);

		// Zero-length line: nothing to draw
		if(dx == 0 && dy == 0)
		{
			return false;
		}

		// This branch is compiled but never executed (constant false condition)
		if(false)   // Rectangle
		{
			float4 P[4];
			int C[4];

			P[0] = P0;
			P[1] = P1;
			P[2] = P1;
			P[3] = P0;

			// Scale (dx, dy) to length 0.5
			float scale = 0.5f / sqrt(dx*dx + dy*dy);

			dx *= scale;
			dy *= scale;

			// Offsets multiplied by each endpoint's w and divided by W or H
			float dx0w = dx * P0.w / W;
			float dy0h = dy * P0.w / H;
			float dx0h = dx * P0.w / H;
			float dy0w = dy * P0.w / W;

			float dx1w = dx * P1.w / W;
			float dy1h = dy * P1.w / H;
			float dx1h = dx * P1.w / H;
			float dy1w = dy * P1.w / W;

			P[0].x += -dy0w + -dx0w;
			P[0].y += -dx0h + +dy0h;
			C[0] = computeClipFlags(P[0], data);

			P[1].x += -dy1w + +dx1w;
			P[1].y += -dx1h + +dy1h;
			C[1] = computeClipFlags(P[1], data);

			P[2].x += +dy1w + +dx1w;
			P[2].y += +dx1h + -dy1h;
			C[2] = computeClipFlags(P[2], data);

			P[3].x += +dy0w + -dx0w;
			P[3].y += +dx0h + +dy0h;
			C[3] = computeClipFlags(P[3], data);

			// Proceed only if CLIP_FINITE is the sole flag common to all four corners
			if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
			{
				Polygon polygon(P, 4);

				int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | draw.clipFlags;

				if(clipFlagsOr != Clipper::CLIP_FINITE)
				{
					if(!renderer->clipper->clip(polygon, clipFlagsOr, draw))
					{
						return false;
					}
				}

				return setupRoutine(&primitive, &triangle, &polygon, &data);
			}
		}
		else   // Diamond test convention
		{
			// Each endpoint is expanded into four points offset along -x, +y, +x and -y:
			// P[0..3] surround P0 and P[4..7] surround P1
			float4 P[8];
			int C[8];

			P[0] = P0;
			P[1] = P0;
			P[2] = P0;
			P[3] = P0;
			P[4] = P1;
			P[5] = P1;
			P[6] = P1;
			P[7] = P1;

			// Offsets of 0.5 / W and 0.5 / H, multiplied by the endpoint's w
			float dx0 = 0.5f * P0.w / W;
			float dy0 = 0.5f * P0.w / H;

			float dx1 = 0.5f * P1.w / W;
			float dy1 = 0.5f * P1.w / H;

			P[0].x += -dx0;
			C[0] = computeClipFlags(P[0], data);

			P[1].y += +dy0;
			C[1] = computeClipFlags(P[1], data);

			P[2].x += +dx0;
			C[2] = computeClipFlags(P[2], data);

			P[3].y += -dy0;
			C[3] = computeClipFlags(P[3], data);

			P[4].x += -dx1;
			C[4] = computeClipFlags(P[4], data);

			P[5].y += +dy1;
			C[5] = computeClipFlags(P[5], data);

			P[6].x += +dx1;
			C[6] = computeClipFlags(P[6], data);

			P[7].y += -dy1;
			C[7] = computeClipFlags(P[7], data);

			// Proceed only if CLIP_FINITE is the sole flag common to all eight points
			if((C[0] & C[1] & C[2] & C[3] & C[4] & C[5] & C[6] & C[7]) == Clipper::CLIP_FINITE)
			{
				// Six of the eight points form the polygon; which six depends on the
				// direction of (dx, dy)
				float4 L[6];

				if(dx > -dy)
				{
					if(dx > dy)   // Right
					{
						L[0] = P[0];
						L[1] = P[1];
						L[2] = P[5];
						L[3] = P[6];
						L[4] = P[7];
						L[5] = P[3];
					}
					else   // Down
					{
						L[0] = P[0];
						L[1] = P[4];
						L[2] = P[5];
						L[3] = P[6];
						L[4] = P[2];
						L[5] = P[3];
					}
				}
				else
				{
					if(dx > dy)   // Up
					{
						L[0] = P[0];
						L[1] = P[1];
						L[2] = P[2];
						L[3] = P[6];
						L[4] = P[7];
						L[5] = P[4];
					}
					else   // Left
					{
						L[0] = P[1];
						L[1] = P[2];
						L[2] = P[3];
						L[3] = P[7];
						L[4] = P[4];
						L[5] = P[5];
					}
				}

				Polygon polygon(L, 6);

				// Flags of all eight points are combined, including the two not used in the polygon
				int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | C[4] | C[5] | C[6] | C[7] | draw.clipFlags;

				if(clipFlagsOr != Clipper::CLIP_FINITE)
				{
					if(!renderer->clipper->clip(polygon, clipFlagsOr, draw))
					{
						return false;
					}
				}

				return setupRoutine(&primitive, &triangle, &polygon, &data);
			}
		}

		// All points share at least one clip flag besides CLIP_FINITE (or lack it)
		return false;
	}

	// Expands the point stored in triangle.v0 into a four-corner polygon, clips it if needed,
	// and hands it to the draw call's setup routine.
	//
	// As a side effect triangle.v1 and triangle.v2 are overwritten with copies of v0 whose
	// X and Y members, respectively, are offset by the rounded half point size times 16.
	//
	// Returns false if all four corners share a clip flag or if clipping rejects the polygon;
	// otherwise returns the result of the setup routine.
	bool Renderer::setupPoint(Renderer *renderer, Primitive &primitive, Triangle &triangle, const DrawCall &draw)
	{
		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
		const SetupProcessor::State &state = draw.setupState;
		const DrawData &data = *draw.data;

		Vertex &vertex = triangle.v0;

		// Register index 0xF selects the draw call's point size instead of a per-vertex one
		const int sizeRegister = state.pointSizeRegister;
		float size = (sizeRegister != 0xF) ? vertex.v[sizeRegister].y : data.point.pointSize[0];

		size = clamp(size, data.point.pointSizeMin, data.point.pointSizeMax);

		float4 corner[4];
		int clip[4];

		const int positionRegister = state.positionRegister;

		for(int i = 0; i < 4; i++)
		{
			corner[i] = vertex.v[positionRegister];
		}

		// Half extents of the point, multiplied by w
		const float X = size * corner[0].w * data.halfPixelX[0];
		const float Y = size * corner[0].w * data.halfPixelY[0];

		// Corner order: (-X, +Y), (+X, +Y), (+X, -Y), (-X, -Y)
		corner[0].x -= X;
		corner[0].y += Y;
		clip[0] = computeClipFlags(corner[0], data);

		corner[1].x += X;
		corner[1].y += Y;
		clip[1] = computeClipFlags(corner[1], data);

		corner[2].x += X;
		corner[2].y -= Y;
		clip[2] = computeClipFlags(corner[2], data);

		corner[3].x -= X;
		corner[3].y -= Y;
		clip[3] = computeClipFlags(corner[3], data);

		triangle.v1 = vertex;
		triangle.v2 = vertex;

		triangle.v1.X += iround(16 * 0.5f * size);
		triangle.v2.Y -= iround(16 * 0.5f * size) * (data.Hx16[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner

		Polygon polygon(corner, 4);

		// Reject unless CLIP_FINITE is the sole flag common to all four corners
		if((clip[0] & clip[1] & clip[2] & clip[3]) != Clipper::CLIP_FINITE)
		{
			return false;
		}

		int clipFlagsOr = clip[0] | clip[1] | clip[2] | clip[3] | draw.clipFlags;

		if(clipFlagsOr != Clipper::CLIP_FINITE && !renderer->clipper->clip(polygon, clipFlagsOr, draw))
		{
			return false;
		}

		return setupRoutine(&primitive, &triangle, &polygon, &data);
	}

	// Computes the clip flags of a position: bits 0-2 are set when x, y or z exceeds w,
	// bit 3 and 4 when x or y is below -w, bit 5 when z is negative. x and y are first
	// offset by halfPixelX/Y times w. CLIP_FINITE is always included.
	unsigned int Renderer::computeClipFlags(const float4 &v, const DrawData &data)
	{
		const float x = v.x + data.halfPixelX[0] * v.w;
		const float y = v.y + data.halfPixelY[0] * v.w;

		unsigned int flags = Clipper::CLIP_FINITE;   // FIXME: xyz finite

		if(x > v.w)    flags |= 1 << 0;
		if(y > v.w)    flags |= 1 << 1;
		if(v.z > v.w)  flags |= 1 << 2;
		if(x < -v.w)   flags |= 1 << 3;
		if(y < -v.w)   flags |= 1 << 4;
		if(v.z < 0)    flags |= 1 << 5;

		return flags;
	}

	// Allocates the per-unit triangle/primitive batches and the per-thread vertex tasks,
	// then creates one worker thread (with its resume/suspend events) per thread index.
	void Renderer::initializeThreads()
	{
		// Both counts are derived from threadCount via ceilPow2 (presumably rounding up to a power of two — confirm)
		unitCount = ceilPow2(threadCount);
		clusterCount = ceilPow2(threadCount);

		for(int i = 0; i < unitCount; i++)
		{
			triangleBatch[i] = (Triangle*)allocate(batchSize * sizeof(Triangle));
			primitiveBatch[i] = (Primitive*)allocate(batchSize * sizeof(Primitive));
		}

		for(int i = 0; i < threadCount; i++)
		{
			vertexTask[i] = (VertexTask*)allocate(sizeof(VertexTask));
			vertexTask[i]->vertexCache.drawCall = -1;   // Presumably marks the vertex cache as belonging to no draw call — confirm

			task[i].type = Task::SUSPEND;

			resume[i] = new Event();
			suspend[i] = new Event();

			// NOTE(review): 'parameters' is a stack local handed to the new thread by pointer;
			// the wait on suspend[i] below presumably guarantees the worker has read it before
			// this iteration ends — confirm against threadFunction
			Parameters parameters;
			parameters.threadIndex = i;
			parameters.renderer = this;

			exitThreads = false;
			worker[i] = new Thread(threadFunction, &parameters);

			// Block until the worker signals suspend[i], then signal the event again
			suspend[i]->wait();
			suspend[i]->signal();
		}
	}

	// Stops and destroys all worker threads and frees the per-thread and per-unit buffers
	// allocated by initializeThreads. Pointers are reset to 0 after being released.
	void Renderer::terminateThreads()
	{
		// Poll until no thread is awake before tearing anything down
		while(threadsAwake != 0)
		{
			Thread::sleep(1);
		}

		for(int thread = 0; thread < threadCount; thread++)
		{
			if(worker[thread])
			{
				// Set the exit flag first, then wake the worker and wait for it to finish
				exitThreads = true;
				resume[thread]->signal();
				worker[thread]->join();
				
				delete worker[thread];
				worker[thread] = 0;
				delete resume[thread];
				resume[thread] = 0;
				delete suspend[thread];
				suspend[thread] = 0;
			}
		
			// Released even if no worker exists for this index
			deallocate(vertexTask[thread]);
			vertexTask[thread] = 0;
		}

		// All 16 batch slots are released, regardless of the current unit count
		for(int i = 0; i < 16; i++)
		{
			deallocate(triangleBatch[i]);
			triangleBatch[i] = 0;

			deallocate(primitiveBatch[i]);
			primitiveBatch[i] = 0;
		}
	}

	// Applies the constants defined inside a vertex shader: every DEF, DEFI and DEFB
	// instruction is forwarded to the matching setVertexShaderConstant* call.
	// A null shader is ignored.
	void Renderer::loadConstants(const VertexShader *vertexShader)
	{
		if(!vertexShader) return;

		const int length = vertexShader->getLength();

		for(int i = 0; i < length; i++)
		{
			const Shader::Instruction *instruction = vertexShader->getInstruction(i);

			switch(instruction->opcode)
			{
			case Shader::OPCODE_DEF:
				{
					float value[4];

					for(int component = 0; component < 4; component++)
					{
						value[component] = instruction->src[0].value[component];
					}

					setVertexShaderConstantF(instruction->dst.index, value);
				}
				break;
			case Shader::OPCODE_DEFI:
				{
					int integer[4];

					for(int component = 0; component < 4; component++)
					{
						integer[component] = instruction->src[0].integer[component];
					}

					setVertexShaderConstantI(instruction->dst.index, integer);
				}
				break;
			case Shader::OPCODE_DEFB:
				{
					int boolean = instruction->src[0].boolean[0];

					setVertexShaderConstantB(instruction->dst.index, &boolean);
				}
				break;
			default:
				break;
			}
		}
	}

	// Applies the constants defined inside a pixel shader: every DEF, DEFI and DEFB
	// instruction is forwarded to the matching setPixelShaderConstant* call.
	// A null shader is ignored.
	void Renderer::loadConstants(const PixelShader *pixelShader)
	{
		if(!pixelShader) return;

		const int length = pixelShader->getLength();

		for(int i = 0; i < length; i++)
		{
			const Shader::Instruction *instruction = pixelShader->getInstruction(i);

			switch(instruction->opcode)
			{
			case Shader::OPCODE_DEF:
				{
					float value[4];

					for(int component = 0; component < 4; component++)
					{
						value[component] = instruction->src[0].value[component];
					}

					setPixelShaderConstantF(instruction->dst.index, value);
				}
				break;
			case Shader::OPCODE_DEFI:
				{
					int integer[4];

					for(int component = 0; component < 4; component++)
					{
						integer[component] = instruction->src[0].integer[component];
					}

					setPixelShaderConstantI(instruction->dst.index, integer);
				}
				break;
			case Shader::OPCODE_DEFB:
				{
					int boolean = instruction->src[0].boolean[0];

					setPixelShaderConstantB(instruction->dst.index, &boolean);
				}
				break;
			default:
				break;
			}
		}
	}

	// Stores the given resource as the context's index buffer.
	void Renderer::setIndexBuffer(Resource *indexBuffer)
	{
		context->indexBuffer = indexBuffer;
	}

	// Stores the given mask as the context's sample mask.
	void Renderer::setMultiSampleMask(unsigned int mask)
	{
		context->sampleMask = mask;
	}

	// Sets the namespace-level sw::transparencyAntialiasing variable.
	// The explicit sw:: qualification is required because the parameter has the same name.
	void Renderer::setTransparencyAntialiasing(Context::TransparencyAntialiasing transparencyAntialiasing)
	{
		sw::transparencyAntialiasing = transparencyAntialiasing;
	}

	// Returns true when the texture bound to 'sampler' is the same resource as one of the
	// first four render targets or the depth-stencil surface currently set on the context.
	bool Renderer::isReadWriteTexture(int sampler)
	{
		for(int target = 0; target < 4; target++)
		{
			if(!context->renderTarget[target])
			{
				continue;   // No render target in this slot
			}

			if(context->texture[sampler] == context->renderTarget[target]->getResource())
			{
				return true;
			}
		}

		return context->depthStencil && context->texture[sampler] == context->depthStencil->getResource();
	}
	
	// Recomputes the enabled clip planes from the user planes when they have been marked
	// dirty, then clears the dirty flag. Does nothing otherwise.
	void Renderer::updateClipper()
	{
		if(!updateClipPlanes)
		{
			return;
		}

		if(!VertexProcessor::isFixedFunction())   // User plane in clip space
		{
			if(clipFlags & Clipper::CLIP_PLANE0) clipPlane[0] = userPlane[0];
			if(clipFlags & Clipper::CLIP_PLANE1) clipPlane[1] = userPlane[1];
			if(clipFlags & Clipper::CLIP_PLANE2) clipPlane[2] = userPlane[2];
			if(clipFlags & Clipper::CLIP_PLANE3) clipPlane[3] = userPlane[3];
			if(clipFlags & Clipper::CLIP_PLANE4) clipPlane[4] = userPlane[4];
			if(clipFlags & Clipper::CLIP_PLANE5) clipPlane[5] = userPlane[5];
		}
		else   // User plane in world space
		{
			// Each enabled user plane is multiplied by the view transform
			const Matrix &scissorWorld = getViewTransform();

			if(clipFlags & Clipper::CLIP_PLANE0) clipPlane[0] = scissorWorld * userPlane[0];
			if(clipFlags & Clipper::CLIP_PLANE1) clipPlane[1] = scissorWorld * userPlane[1];
			if(clipFlags & Clipper::CLIP_PLANE2) clipPlane[2] = scissorWorld * userPlane[2];
			if(clipFlags & Clipper::CLIP_PLANE3) clipPlane[3] = scissorWorld * userPlane[3];
			if(clipFlags & Clipper::CLIP_PLANE4) clipPlane[4] = scissorWorld * userPlane[4];
			if(clipFlags & Clipper::CLIP_PLANE5) clipPlane[5] = scissorWorld * userPlane[5];
		}

		updateClipPlanes = false;
	}

	// Binds a resource to the given sampler slot of the context.
	// The sampler index is asserted to be below 16 + 4.
	void Renderer::setTextureResource(unsigned int sampler, Resource *resource)
	{
		ASSERT(sampler < (16 + 4));

		context->texture[sampler] = resource;
	}

	// Forwards a texture level (face, mipmap level, surface and type) to the context's sampler.
	// Asserts sampler < 16 + 4, face < 6 and level < MIPMAP_LEVELS.
	void Renderer::setTextureLevel(unsigned int sampler, unsigned int face, unsigned int level, Surface *surface, TextureType type)
	{
		ASSERT(sampler < (16 + 4) && face < 6 && level < MIPMAP_LEVELS);
		
		context->sampler[sampler].setTextureLevel(face, level, surface, type);
	}

	// Sets the texture filter of a sampler on the pixel processor when type is
	// SAMPLER_PIXEL, and on the vertex processor for any other sampler type.
	void Renderer::setTextureFilter(SamplerType type, int sampler, FilterType textureFilter)
	{
		if(type != SAMPLER_PIXEL)
		{
			VertexProcessor::setTextureFilter(sampler, textureFilter);
			return;
		}

		PixelProcessor::setTextureFilter(sampler, textureFilter);
	}

	// Sets the mipmap filter of a sampler on the pixel processor when type is
	// SAMPLER_PIXEL, and on the vertex processor for any other sampler type.
	void Renderer::setMipmapFilter(SamplerType type, int sampler, MipmapType mipmapFilter)
	{
		if(type != SAMPLER_PIXEL)
		{
			VertexProcessor::setMipmapFilter(sampler, mipmapFilter);
			return;
		}

		PixelProcessor::setMipmapFilter(sampler, mipmapFilter);
	}

	// Sets the gather-enable state of a sampler on the pixel processor when type is
	// SAMPLER_PIXEL, and on the vertex processor for any other sampler type.
	void Renderer::setGatherEnable(SamplerType type, int sampler, bool enable)
	{
		if(type != SAMPLER_PIXEL)
		{
			VertexProcessor::setGatherEnable(sampler, enable);
			return;
		}

		PixelProcessor::setGatherEnable(sampler, enable);
	}

	// Sets the U addressing mode of a sampler on the pixel processor when type is
	// SAMPLER_PIXEL, and on the vertex processor for any other sampler type.
	void Renderer::setAddressingModeU(SamplerType type, int sampler, AddressingMode addressMode)
	{
		if(type != SAMPLER_PIXEL)
		{
			VertexProcessor::setAddressingModeU(sampler, addressMode);
			return;
		}

		PixelProcessor::setAddressingModeU(sampler, addressMode);
	}

	// Sets the V addressing mode of a sampler on the pixel processor when type is
	// SAMPLER_PIXEL, and on the vertex processor for any other sampler type.
	void Renderer::setAddressingModeV(SamplerType type, int sampler, AddressingMode addressMode)
	{
		if(type != SAMPLER_PIXEL)
		{
			VertexProcessor::setAddressingModeV(sampler, addressMode);
			return;
		}

		PixelProcessor::setAddressingModeV(sampler, addressMode);
	}

	// Sets the W addressing mode of a sampler on the pixel processor when type is
	// SAMPLER_PIXEL, and on the vertex processor for any other sampler type.
	void Renderer::setAddressingModeW(SamplerType type, int sampler, AddressingMode addressMode)
	{
		if(type != SAMPLER_PIXEL)
		{
			VertexProcessor::setAddressingModeW(sampler, addressMode);
			return;
		}

		PixelProcessor::setAddressingModeW(sampler, addressMode);
	}

	// Sets the sRGB-read state of a sampler on the pixel processor when type is
	// SAMPLER_PIXEL, and on the vertex processor for any other sampler type.
	void Renderer::setReadSRGB(SamplerType type, int sampler, bool sRGB)
	{
		if(type != SAMPLER_PIXEL)
		{
			VertexProcessor::setReadSRGB(sampler, sRGB);
			return;
		}

		PixelProcessor::setReadSRGB(sampler, sRGB);
	}

	// Sets the mipmap LOD bias of a sampler on the pixel processor when type is
	// SAMPLER_PIXEL, and on the vertex processor for any other sampler type.
	void Renderer::setMipmapLOD(SamplerType type, int sampler, float bias)
	{
		if(type != SAMPLER_PIXEL)
		{
			VertexProcessor::setMipmapLOD(sampler, bias);
			return;
		}

		PixelProcessor::setMipmapLOD(sampler, bias);
	}

	// Sets the border color of a sampler on the pixel processor when type is
	// SAMPLER_PIXEL, and on the vertex processor for any other sampler type.
	void Renderer::setBorderColor(SamplerType type, int sampler, const Color<float> &borderColor)
	{
		if(type != SAMPLER_PIXEL)
		{
			VertexProcessor::setBorderColor(sampler, borderColor);
			return;
		}

		PixelProcessor::setBorderColor(sampler, borderColor);
	}

	// Sets the maximum anisotropy of a sampler on the pixel processor when type is
	// SAMPLER_PIXEL, and on the vertex processor for any other sampler type.
	void Renderer::setMaxAnisotropy(SamplerType type, int sampler, unsigned int maxAnisotropy)
	{
		if(type != SAMPLER_PIXEL)
		{
			VertexProcessor::setMaxAnisotropy(sampler, maxAnisotropy);
			return;
		}

		PixelProcessor::setMaxAnisotropy(sampler, maxAnisotropy);
	}

	// Forwards the point sprite enable state to the context.
	void Renderer::setPointSpriteEnable(bool pointSpriteEnable)
	{
		context->setPointSpriteEnable(pointSpriteEnable);
	}

	// Forwards the point scale enable state to the context.
	void Renderer::setPointScaleEnable(bool pointScaleEnable)
	{
		context->setPointScaleEnable(pointScaleEnable);
	}

	// Stores the depth bias value.
	void Renderer::setDepthBias(float bias)
	{
		depthBias = bias;
	}

	// Stores the slope depth bias value.
	void Renderer::setSlopeDepthBias(float slopeBias)
	{
		slopeDepthBias = slopeBias;
	}

	// Makes 'shader' the context's pixel shader and applies the constants it defines
	// (loadConstants ignores a null shader).
	void Renderer::setPixelShader(const PixelShader *shader)
	{
		context->pixelShader = shader;

		loadConstants(shader);
	}

	// Makes 'shader' the context's vertex shader and applies the constants it defines
	// (loadConstants ignores a null shader).
	void Renderer::setVertexShader(const VertexShader *shader)
	{
		context->vertexShader = shader;

		loadConstants(shader);
	}

	// Writes 'count' float constants (four floats each) to the pixel processor, starting at
	// register 'index'. Every draw call's psDirtyConstF is raised to at least index + count.
	void Renderer::setPixelShaderConstantF(int index, const float value[4], int count)
	{
		const int dirtyEnd = index + count;

		for(int call = 0; call < DRAW_COUNT; call++)
		{
			if(drawCall[call]->psDirtyConstF < dirtyEnd)
			{
				drawCall[call]->psDirtyConstF = dirtyEnd;
			}
		}

		for(int constant = 0; constant < count; constant++)
		{
			PixelProcessor::setFloatConstant(index + constant, value + 4 * constant);
		}
	}

	// Writes 'count' integer constants (four ints each) to the pixel processor, starting at
	// register 'index'. Every draw call's psDirtyConstI is raised to at least index + count.
	void Renderer::setPixelShaderConstantI(int index, const int value[4], int count)
	{
		const int dirtyEnd = index + count;

		for(int call = 0; call < DRAW_COUNT; call++)
		{
			if(drawCall[call]->psDirtyConstI < dirtyEnd)
			{
				drawCall[call]->psDirtyConstI = dirtyEnd;
			}
		}

		for(int constant = 0; constant < count; constant++)
		{
			PixelProcessor::setIntegerConstant(index + constant, value + 4 * constant);
		}
	}

	// Writes 'count' boolean constants (one int each) to the pixel processor, starting at
	// register 'index'. Every draw call's psDirtyConstB is raised to at least index + count.
	void Renderer::setPixelShaderConstantB(int index, const int *boolean, int count)
	{
		const int dirtyEnd = index + count;

		for(int call = 0; call < DRAW_COUNT; call++)
		{
			if(drawCall[call]->psDirtyConstB < dirtyEnd)
			{
				drawCall[call]->psDirtyConstB = dirtyEnd;
			}
		}

		for(int constant = 0; constant < count; constant++)
		{
			PixelProcessor::setBooleanConstant(index + constant, boolean[constant]);
		}
	}

	// Writes 'count' float constants (four floats each) to the vertex processor, starting at
	// register 'index'. Every draw call's vsDirtyConstF is raised to at least index + count.
	void Renderer::setVertexShaderConstantF(int index, const float value[4], int count)
	{
		const int dirtyEnd = index + count;

		for(int call = 0; call < DRAW_COUNT; call++)
		{
			if(drawCall[call]->vsDirtyConstF < dirtyEnd)
			{
				drawCall[call]->vsDirtyConstF = dirtyEnd;
			}
		}

		for(int constant = 0; constant < count; constant++)
		{
			VertexProcessor::setFloatConstant(index + constant, value + 4 * constant);
		}
	}

	// Writes 'count' integer constants (four ints each) to the vertex processor, starting at
	// register 'index'. Every draw call's vsDirtyConstI is raised to at least index + count.
	void Renderer::setVertexShaderConstantI(int index, const int value[4], int count)
	{
		const int dirtyEnd = index + count;

		for(int call = 0; call < DRAW_COUNT; call++)
		{
			if(drawCall[call]->vsDirtyConstI < dirtyEnd)
			{
				drawCall[call]->vsDirtyConstI = dirtyEnd;
			}
		}

		for(int constant = 0; constant < count; constant++)
		{
			VertexProcessor::setIntegerConstant(index + constant, value + 4 * constant);
		}
	}

	// Writes 'count' boolean constants (one int each) to the vertex processor, starting at
	// register 'index'. Every draw call's vsDirtyConstB is raised to at least index + count.
	void Renderer::setVertexShaderConstantB(int index, const int *boolean, int count)
	{
		const int dirtyEnd = index + count;

		for(int call = 0; call < DRAW_COUNT; call++)
		{
			if(drawCall[call]->vsDirtyConstB < dirtyEnd)
			{
				drawCall[call]->vsDirtyConstB = dirtyEnd;
			}
		}

		for(int constant = 0; constant < count; constant++)
		{
			VertexProcessor::setBooleanConstant(index + constant, boolean[constant]);
		}
	}

	// Forwards model matrix 'i' to the vertex processor.
	// Unlike the view, base and projection setters, this does not mark the clip planes dirty.
	void Renderer::setModelMatrix(const Matrix &M, int i)
	{
		VertexProcessor::setModelMatrix(M, i);
	}

	// Forwards the view matrix to the vertex processor and marks the clip planes
	// as needing an update.
	void Renderer::setViewMatrix(const Matrix &V)
	{
		VertexProcessor::setViewMatrix(V);
		updateClipPlanes = true;
	}

	// Forwards the base matrix to the vertex processor and marks the clip planes
	// as needing an update.
	void Renderer::setBaseMatrix(const Matrix &B)
	{
		VertexProcessor::setBaseMatrix(B);
		updateClipPlanes = true;
	}

	// Forwards the projection matrix to the vertex processor and marks the clip planes
	// as needing an update.
	void Renderer::setProjectionMatrix(const Matrix &P)
	{
		VertexProcessor::setProjectionMatrix(P);
		updateClipPlanes = true;
	}

	// Appends a query to the renderer's query list.
	void Renderer::addQuery(Query *query)
	{
		queries.push_back(query);
	}
	
	// Removes a query from the renderer's query list.
	void Renderer::removeQuery(Query *query)
	{
		queries.remove(query);
	}

	#if PERF_HUD
		// Returns the current thread count (only built when PERF_HUD is enabled).
		int Renderer::getThreadCount()
		{
			return threadCount;
		}
		
		// Returns the vertex time counter of the given thread. The index is not range-checked.
		int64_t Renderer::getVertexTime(int thread)
		{
			return vertexTime[thread];
		}

		// Returns the setup time counter of the given thread. The index is not range-checked.
		int64_t Renderer::getSetupTime(int thread)
		{
			return setupTime[thread];
		}
			
		// Returns the pixel time counter of the given thread. The index is not range-checked.
		int64_t Renderer::getPixelTime(int thread)
		{
			return pixelTime[thread];
		}

		// Zeroes the vertex, setup and pixel time counters of every thread.
		void Renderer::resetTimers()
		{
			for(int index = 0; index < threadCount; index++)
			{
				pixelTime[index] = setupTime[index] = vertexTime[index] = 0;
			}
		}
	#endif

	// Stores a copy of the viewport.
	void Renderer::setViewport(const Viewport &viewport)
	{
		this->viewport = viewport;
	}

	// Stores a copy of the scissor rectangle.
	void Renderer::setScissor(const Rect &scissor)
	{
		this->scissor = scissor;
	}

	// Stores the clip flags shifted left by eight bits, so 'flags' is expected to
	// be given without that offset.
	void Renderer::setClipFlags(int flags)
	{
		clipFlags = flags << 8;   // Bottom 8 bits used by legacy frustum
	}

	// Stores user clip plane 'index' (valid range 0 to 5) and marks the clip planes as
	// needing an update. An out-of-range index asserts and stores nothing, but the
	// update flag is still set.
	void Renderer::setClipPlane(unsigned int index, const float plane[4])
	{
		if(index >= 6)
		{
			ASSERT(false);
		}
		else
		{
			userPlane[index] = plane;
		}

		updateClipPlanes = true;
	}

	void Renderer::updateConfiguration(bool initialUpdate)
	{
		bool newConfiguration = swiftConfig->hasNewConfiguration();

		if(newConfiguration || initialUpdate)
		{
			terminateThreads();

			SwiftConfig::Configuration configuration = {0};
			swiftConfig->getConfiguration(configuration);

			precacheVertex = !newConfiguration && configuration.precache;
			precacheSetup = !newConfiguration && configuration.precache;
			precachePixel = !newConfiguration && configuration.precache;

			VertexProcessor::setRoutineCacheSize(configuration.vertexRoutineCacheSize);
			PixelProcessor::setRoutineCacheSize(configuration.pixelRoutineCacheSize);
			SetupProcessor::setRoutineCacheSize(configuration.setupRoutineCacheSize);

			switch(configuration.textureSampleQuality)
			{
			case 0:  Sampler::setFilterQuality(FILTER_POINT);       break;
			case 1:  Sampler::setFilterQuality(FILTER_LINEAR);      break;
			case 2:  Sampler::setFilterQuality(FILTER_ANISOTROPIC); break;
			default: Sampler::setFilterQuality(FILTER_ANISOTROPIC); break;
			}

			switch(configuration.mipmapQuality)
			{
			case 0:  Sampler::setMipmapQuality(MIPMAP_POINT);  break;
			case 1:  Sampler::setMipmapQuality(MIPMAP_LINEAR); break;
			default: Sampler::setMipmapQuality(MIPMAP_LINEAR); break;
			}

			setPerspectiveCorrection(configuration.perspectiveCorrection);

			switch(configuration.transcendentalPrecision)
			{
			case 0:
				logPrecision = APPROXIMATE;
				expPrecision = APPROXIMATE;
				rcpPrecision = APPROXIMATE;
				rsqPrecision = APPROXIMATE;
				break;
			case 1:
				logPrecision = PARTIAL;
				expPrecision = PARTIAL;
				rcpPrecision = PARTIAL;
				rsqPrecision = PARTIAL;
				break;
			case 2:
				logPrecision = ACCURATE;
				expPrecision = ACCURATE;
				rcpPrecision = ACCURATE;
				rsqPrecision = ACCURATE;
				break;
			case 3:
				logPrecision = WHQL;
				expPrecision = WHQL;
				rcpPrecision = WHQL;
				rsqPrecision = WHQL;
				break;
			case 4:
				logPrecision = IEEE;
				expPrecision = IEEE;
				rcpPrecision = IEEE;
				rsqPrecision = IEEE;
				break;
			default:
				logPrecision = ACCURATE;
				expPrecision = ACCURATE;
				rcpPrecision = ACCURATE;
				rsqPrecision = ACCURATE;
				break;
			}

			switch(configuration.transparencyAntialiasing)
			{
			case 0:  transparencyAntialiasing = Context::TRANSPARENCY_NONE;              break;
			case 1:  transparencyAntialiasing = Context::TRANSPARENCY_ALPHA_TO_COVERAGE; break;
			default: transparencyAntialiasing = Context::TRANSPARENCY_NONE;              break;
			}

			switch(configuration.threadCount)
			{
			case -1: threadCount = CPUID::coreCount();        break;
			case 0:  threadCount = CPUID::processAffinity();  break;
			default: threadCount = configuration.threadCount; break;
			}

			CPUID::setEnableSSE4_1(configuration.enableSSE4_1);
			CPUID::setEnableSSSE3(configuration.enableSSSE3);
			CPUID::setEnableSSE3(configuration.enableSSE3);
			CPUID::setEnableSSE2(configuration.enableSSE2);
			CPUID::setEnableSSE(configuration.enableSSE);

			for(int pass = 0; pass < 10; pass++)
			{
				optimization[pass] = configuration.optimization[pass];
			}

			forceWindowed = configuration.forceWindowed;
			complementaryDepthBuffer = configuration.complementaryDepthBuffer;
			postBlendSRGB = configuration.postBlendSRGB;
			exactColorRounding = configuration.exactColorRounding;
			forceClearRegisters = configuration.forceClearRegisters;

		#ifndef NDEBUG
			minPrimitives = configuration.minPrimitives;
			maxPrimitives = configuration.maxPrimitives;
		#endif
		}

		if(!initialUpdate && !worker[0])
		{
			initializeThreads();
		}
	}
}
