Add SwiftShader source to repo. Oct 6 code drop from TransGaming. Review URL: https://chromereviews.googleplex.com/3846015
diff --git a/src/Shader/VertexRoutine.cpp b/src/Shader/VertexRoutine.cpp new file mode 100644 index 0000000..f28a741 --- /dev/null +++ b/src/Shader/VertexRoutine.cpp
@@ -0,0 +1,605 @@
+// SwiftShader Software Renderer
+//
+// Copyright(c) 2005-2011 TransGaming Inc.
+//
+// All rights reserved. No part of this software may be copied, distributed, transmitted,
+// transcribed, stored in a retrieval system, translated into any human or computer
+// language by any means, or disclosed to third parties without the explicit written
+// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
+// or implied, including but not limited to any patent rights, are granted to you.
+//
+
+#include "VertexRoutine.hpp"
+
+#include "VertexShader.hpp"
+#include "Vertex.hpp"
+#include "Half.hpp"
+#include "Renderer.hpp"
+#include "Constants.hpp"
+#include "Debug.hpp"
+
+namespace sw
+{
+	// Stores a copy of (or reference to) the vertex processor state that drives
+	// code generation; no routine exists until generate() is called.
+	VertexRoutine::VertexRoutine(const VertexProcessor::State &state) : state(state)
+	{
+		routine = 0;
+	}
+
+	// Nothing is released here; 'routine' is not freed by this class in this file.
+	VertexRoutine::~VertexRoutine()
+	{
+	}
+
+	// Builds the vertex processing function and stores the result in 'routine'.
+	//
+	// The generated function takes four byte pointers:
+	//   arg 0 (vertex) - destination; advanced by sizeof(Vertex) per processed index
+	//   arg 1 (batch)  - source indices; advanced by sizeof(unsigned int) per index
+	//   arg 2 (task)   - a VertexTask (provides the vertex cache and the index count)
+	//   arg 3 (data)   - a DrawData (provides constants, input streams, viewport values)
+	//
+	// For each index the tag cache is consulted; on a mismatch a group of four
+	// vertices is read, run through pipeline(), post-transformed, clip-flagged
+	// and written to the cache. The requested vertex is then copied from the
+	// cache to the output.
+	void VertexRoutine::generate()
+	{
+		Function<Void, Pointer<Byte>, Pointer<Byte>, Pointer<Byte>, Pointer<Byte>> function;
+		{
+			Pointer<Byte> vertex(function.arg(0));
+			Pointer<Byte> batch(function.arg(1));
+			Pointer<Byte> task(function.arg(2));
+			Pointer<Byte> data(function.arg(3));
+
+			const bool texldl = state.shaderContainsTexldl;
+
+			Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
+			Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
+			Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
+
+			UInt count = *Pointer<UInt>(task+ OFFSET(VertexTask,count));
+
+			Registers r;
+			r.data = data;
+			r.constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
+
+			// NOTE(review): the loop condition follows the body, so this presumably
+			// executes at least once and 'count' would wrap if it were 0 on entry -
+			// confirm that callers never submit an empty task.
+			Do
+			{
+				UInt index = *Pointer<UInt>(batch);
+				// Bits 2..5 of the index. Used directly as a byte offset into the tag
+				// array (read as UInt below) and, scaled by sizeof(Vertex), as the
+				// first cache slot of a group of four vertices.
+				UInt tagIndex = index & UInt(0x0000003C);
+				// Tag value: the index with its two low bits cleared, i.e. the first
+				// vertex of the group of four. With texldl the exact index is used.
+				UInt indexQ = !texldl ? index & UInt(0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
+
+				// Cache miss: process the group and store it in the cache.
+				If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
+				{
+					*Pointer<UInt>(tagCache + tagIndex) = indexQ;
+
+					readInput(r, indexQ);
+					pipeline(r);   // Not defined in this file
+					postTransform(r);
+					computeClipFlags(r);
+
+					Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
+					writeCache(cacheLine0, r);
+				}
+
+				// Low six bits of the index select the individual cached vertex.
+				UInt cacheIndex = index & UInt(0x0000003F);
+				Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
+				writeVertex(vertex, cacheLine);
+
+				vertex += sizeof(Vertex);
+				batch += sizeof(unsigned int);
+				count--;
+			}
+			Until(count == UInt(0))
+
+			Return();
+		}
+
+		// The routine name embeds state.shaderHash.
+		routine = function(L"VertexRoutine_%0.16llX", state.shaderHash);
+	}
+
+	// Returns the routine produced by generate(), or 0 if generate() has not run.
+	Routine *VertexRoutine::getRoutine()
+	{
+		return routine;
+	}
+
+	// Loads all 16 input registers r.v[0..15]. For each slot the stream base
+	// pointer and stride are fetched from DrawData::input / DrawData::stride and
+	// handed to readStream() together with the per-slot stream description.
+	void VertexRoutine::readInput(Registers &r, UInt &index)
+	{
+		for(int i = 0; i < 16; i++)
+		{
+			Pointer<Byte> input = *Pointer<Pointer<Byte>>(r.data + OFFSET(DrawData,input) + sizeof(void*) * i);
+			UInt stride = *Pointer<UInt>(r.data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);
+
+			r.v[i] = readStream(r, input, stride, state.input[i], index);
+		}
+	}
+
+	// Computes r.clipFlags for the four vertices held in the position output
+	// register (state.positionRegister). May also modify that register:
+	// pre-transformed positions are mapped back using the viewport values in
+	// DrawData, and the super-sampling offset is applied.
+	void VertexRoutine::computeClipFlags(Registers &r)
+	{
+		int pos = state.positionRegister;
+
+		// Backtransform
+		if(state.preTransformed)
+		{
+			Float4 rhw = Float4(1.0f, 1.0f, 1.0f, 1.0f) / r.ow[pos];
+
+			// The stored values are multiplied by 1/16 before use.
+			Float4 W = *Pointer<Float4>(r.data + OFFSET(DrawData,WWWWx16)) * Float4(1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f);
+			Float4 H = *Pointer<Float4>(r.data + OFFSET(DrawData,HHHHx16)) * Float4(1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f);
+			Float4 L = *Pointer<Float4>(r.data + OFFSET(DrawData,LLLLx16)) * Float4(1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f);
+			Float4 T = *Pointer<Float4>(r.data + OFFSET(DrawData,TTTTx16)) * Float4(1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f);
+
+			r.ox[pos] = (r.ox[pos] - L) / W * rhw;
+			r.oy[pos] = (r.oy[pos] - T) / H * rhw;
+			r.oz[pos] = r.oz[pos] * rhw;
+			r.ow[pos] = rhw;
+		}
+
+		if(state.superSampling)
+		{
+			// Offset scaled by w, written back to the position register.
+			r.ox[pos] = r.ox[pos] + *Pointer<Float4>(r.data + OFFSET(DrawData,XXXX)) * r.ow[pos];
+			r.oy[pos] = r.oy[pos] + *Pointer<Float4>(r.data + OFFSET(DrawData,YYYY)) * r.ow[pos];
+		}
+
+		// Local copies: the multi-sampling offset below affects only the
+		// comparisons, not the stored position.
+		Float4 clipX = r.ox[pos];
+		Float4 clipY = r.oy[pos];
+
+		if(state.multiSampling)   // Clip at pixel edges instead of pixel centers
+		{
+			clipX += *Pointer<Float4>(r.data + OFFSET(DrawData,offX)) * r.ow[pos];
+			clipY += *Pointer<Float4>(r.data + OFFSET(DrawData,offY)) * r.ow[pos];
+		}
+
+		// Per-lane comparisons against w (upper bounds), -w (lower x/y bounds)
+		// and 0 (lower z bound).
+		Int4 maxX = CmpLT(r.ow[pos], clipX);
+		Int4 maxY = CmpLT(r.ow[pos], clipY);
+		Int4 maxZ = CmpLT(r.ow[pos], r.oz[pos]);
+
+		Int4 minX = CmpNLE(-r.ow[pos], clipX);
+		Int4 minY = CmpNLE(-r.ow[pos], clipY);
+		Int4 minZ = CmpNLE(Float4(0.0f, 0.0f, 0.0f, 0.0f), r.oz[pos]);
+
+		Int flags;
+
+		// Each sign mask selects an Int entry (byte offset = mask * 4) from the
+		// corresponding table in Constants; the entries are OR-ed together.
+		flags = SignMask(maxX);
+		r.clipFlags = *Pointer<Int>(r.constants + OFFSET(Constants,maxX) + flags * 4);   // FIXME: Array indexing
+		flags = SignMask(maxY);
+		r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,maxY) + flags * 4);
+		flags = SignMask(maxZ);
+		r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,maxZ) + flags * 4);
+		flags = SignMask(minX);
+		r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minX) + flags * 4);
+		flags = SignMask(minY);
+		r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minY) + flags * 4);
+		flags = SignMask(minZ);
+		r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minZ) + flags * 4);
+
+		// Lanes whose |x|, |y| and |z| are all <= Constants::maxPos; uses the
+		// stored position, not the multi-sampling adjusted clipX/clipY.
+		Int4 finiteX = CmpLE(Abs(r.ox[pos]), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
+		Int4 finiteY = CmpLE(Abs(r.oy[pos]), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
+		Int4 finiteZ = CmpLE(Abs(r.oz[pos]), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
+
+		flags = SignMask(finiteX & finiteY & finiteZ);
+		r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,fini) + flags * 4);
+
+		if(state.preTransformed)
+		{
+			// Clears bit 2 in each of the four bytes of the flags word.
+			r.clipFlags &= 0xFBFBFBFB;   // Don't clip against far clip plane
+		}
+	}
+
+	// Reads one attribute for four vertices and converts it to floating point.
+	//
+	//   buffer, stride - stream base address and byte stride
+	//   stream         - element type, component count and normalization flag
+	//   index          - index of the first vertex
+	//
+	// Returns a Color4f in which v.x/v.y/v.z/v.w each hold one component for
+	// the four vertices. Components beyond stream.count are set to the defaults
+	// (0, 0, 0, 1) at the end of the function.
+	Color4f VertexRoutine::readStream(Registers &r, Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
+	{
+		const bool texldl = state.shaderContainsTexldl;
+
+		Color4f v;
+
+		// Addresses of the four vertices. With texldl all four alias the same
+		// vertex (offset 0 instead of stride).
+		Pointer<Byte> source0 = buffer + index * stride;
+		Pointer<Byte> source1 = source0 + (!texldl ? stride : 0);
+		Pointer<Byte> source2 = source1 + (!texldl ? stride : 0);
+		Pointer<Byte> source3 = source2 + (!texldl ? stride : 0);
+
+		// Most cases load one vertex per register and then transpose, presumably
+		// turning the per-vertex layout into the per-component layout.
+		switch(stream.type)
+		{
+		case STREAMTYPE_FLOAT:
+			{
+				if(stream.count == 0)
+				{
+					// Null stream, all default components
+				}
+				else if(stream.count == 1)
+				{
+					// Single component: scalar loads only, no transpose needed.
+					v.x.x = *Pointer<Float>(source0);
+					v.x.y = *Pointer<Float>(source1);
+					v.x.z = *Pointer<Float>(source2);
+					v.x.w = *Pointer<Float>(source3);
+				}
+				else
+				{
+					// Loads a full Float4 per vertex regardless of stream.count (2..4).
+					v.x = *Pointer<Float4>(source0);
+					v.y = *Pointer<Float4>(source1);
+					v.z = *Pointer<Float4>(source2);
+					v.w = *Pointer<Float4>(source3);
+
+					transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+				}
+			}
+			break;
+		case STREAMTYPE_BYTE:
+			{
+				v.x = Float4(*Pointer<Byte4>(source0));
+				v.y = Float4(*Pointer<Byte4>(source1));
+				v.z = Float4(*Pointer<Byte4>(source2));
+				v.w = Float4(*Pointer<Byte4>(source3));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+				if(stream.normalized)
+				{
+					if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
+					if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
+					if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
+					if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
+				}
+			}
+			break;
+		case STREAMTYPE_SBYTE:
+			{
+				v.x = Float4(*Pointer<SByte4>(source0));
+				v.y = Float4(*Pointer<SByte4>(source1));
+				v.z = Float4(*Pointer<SByte4>(source2));
+				v.w = Float4(*Pointer<SByte4>(source3));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+				if(stream.normalized)
+				{
+					if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
+					if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
+					if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
+					if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleSByte));
+				}
+			}
+			break;
+		case STREAMTYPE_COLOR:
+			{
+				// Always scaled by unscaleByte and always treated as four components;
+				// stream.count and stream.normalized are not consulted here.
+				v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
+				v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
+				v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
+				v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleByte));
+
+				transpose4x4(v.x, v.y, v.z, v.w);
+
+				// Swap red and blue
+				Float4 t = v.x;
+				v.x = v.z;
+				v.z = t;
+			}
+			break;
+		case STREAMTYPE_SHORT:
+			{
+				v.x = Float4(*Pointer<Short4>(source0));
+				v.y = Float4(*Pointer<Short4>(source1));
+				v.z = Float4(*Pointer<Short4>(source2));
+				v.w = Float4(*Pointer<Short4>(source3));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+				if(stream.normalized)
+				{
+					if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
+					if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
+					if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
+					if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleShort));
+				}
+			}
+			break;
+		case STREAMTYPE_USHORT:
+			{
+				v.x = Float4(*Pointer<UShort4>(source0));
+				v.y = Float4(*Pointer<UShort4>(source1));
+				v.z = Float4(*Pointer<UShort4>(source2));
+				v.w = Float4(*Pointer<UShort4>(source3));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+				if(stream.normalized)
+				{
+					if(stream.count >= 1) v.x *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
+					if(stream.count >= 2) v.y *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
+					if(stream.count >= 3) v.z *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
+					if(stream.count >= 4) v.w *= *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleUShort));
+				}
+			}
+			break;
+		case STREAMTYPE_UDEC3:
+			{
+				// Three 10-bit fields packed in one 32-bit word (bits 0-9, 10-19,
+				// 20-29). The fields are masked in place and converted to float
+				// without shifting; the y and z scale factors applied after the
+				// transpose (1/0x400, 1/0x100000) undo the bit positions.
+
+				// FIXME: Vectorize
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source0);
+
+					v.x.x = Float(x & 0x000003FF);
+					v.x.y = Float(y & 0x000FFC00);
+					v.x.z = Float(z & 0x3FF00000);
+				}
+
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source1);
+
+					v.y.x = Float(x & 0x000003FF);
+					v.y.y = Float(y & 0x000FFC00);
+					v.y.z = Float(z & 0x3FF00000);
+				}
+
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source2);
+
+					v.z.x = Float(x & 0x000003FF);
+					v.z.y = Float(y & 0x000FFC00);
+					v.z.z = Float(z & 0x3FF00000);
+				}
+
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source3);
+
+					v.w.x = Float(x & 0x000003FF);
+					v.w.y = Float(y & 0x000FFC00);
+					v.w.z = Float(z & 0x3FF00000);
+				}
+
+				transpose4x3(v.x, v.y, v.z, v.w);
+
+				v.y *= Float4(1.0f / 0x00000400, 1.0f / 0x00000400, 1.0f / 0x00000400, 1.0f / 0x00000400);
+				v.z *= Float4(1.0f / 0x00100000, 1.0f / 0x00100000, 1.0f / 0x00100000, 1.0f / 0x00100000);
+			}
+			break;
+		case STREAMTYPE_DEC3N:
+			{
+				// Three 10-bit fields packed in one 32-bit word. Each field is
+				// shifted up into bits 22-31 (so its top bit lands in the sign bit
+				// of the Int) before conversion; the scale applied after the
+				// transpose divides by 0x00400000 (2^22) and by 511.
+
+				// FIXME: Vectorize
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source0);
+
+					v.x.x = Float((x << 22) & 0xFFC00000);
+					v.x.y = Float((y << 12) & 0xFFC00000);
+					v.x.z = Float((z << 2) & 0xFFC00000);
+				}
+
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source1);
+
+					v.y.x = Float((x << 22) & 0xFFC00000);
+					v.y.y = Float((y << 12) & 0xFFC00000);
+					v.y.z = Float((z << 2) & 0xFFC00000);
+				}
+
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source2);
+
+					v.z.x = Float((x << 22) & 0xFFC00000);
+					v.z.y = Float((y << 12) & 0xFFC00000);
+					v.z.z = Float((z << 2) & 0xFFC00000);
+				}
+
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source3);
+
+					v.w.x = Float((x << 22) & 0xFFC00000);
+					v.w.y = Float((y << 12) & 0xFFC00000);
+					v.w.z = Float((z << 2) & 0xFFC00000);
+				}
+
+				transpose4x3(v.x, v.y, v.z, v.w);
+
+				v.x *= Float4(1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f);
+				v.y *= Float4(1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f);
+				v.z *= Float4(1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f);
+			}
+			break;
+		case STREAMTYPE_FIXED:
+			{
+				// 32-bit integers scaled by Constants::unscaleFixed.
+				v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
+				v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
+				v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
+				v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(r.constants + OFFSET(Constants,unscaleFixed));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+			}
+			break;
+		case STREAMTYPE_HALF:
+			{
+				// Each 16-bit value is used as an index (times 4 bytes) into the
+				// Constants::half2float table of Floats. Components are gathered
+				// directly into per-component registers, so no transpose is needed,
+				// and only stream.count components are read.
+				if(stream.count >= 1)
+				{
+					UShort x0 = *Pointer<UShort>(source0 + 0);
+					UShort x1 = *Pointer<UShort>(source1 + 0);
+					UShort x2 = *Pointer<UShort>(source2 + 0);
+					UShort x3 = *Pointer<UShort>(source3 + 0);
+
+					v.x.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x0) * 4);
+					v.x.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x1) * 4);
+					v.x.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x2) * 4);
+					v.x.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(x3) * 4);
+				}
+
+				if(stream.count >= 2)
+				{
+					UShort y0 = *Pointer<UShort>(source0 + 2);
+					UShort y1 = *Pointer<UShort>(source1 + 2);
+					UShort y2 = *Pointer<UShort>(source2 + 2);
+					UShort y3 = *Pointer<UShort>(source3 + 2);
+
+					v.y.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y0) * 4);
+					v.y.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y1) * 4);
+					v.y.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y2) * 4);
+					v.y.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(y3) * 4);
+				}
+
+				if(stream.count >= 3)
+				{
+					UShort z0 = *Pointer<UShort>(source0 + 4);
+					UShort z1 = *Pointer<UShort>(source1 + 4);
+					UShort z2 = *Pointer<UShort>(source2 + 4);
+					UShort z3 = *Pointer<UShort>(source3 + 4);
+
+					v.z.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z0) * 4);
+					v.z.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z1) * 4);
+					v.z.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z2) * 4);
+					v.z.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(z3) * 4);
+				}
+
+				if(stream.count >= 4)
+				{
+					UShort w0 = *Pointer<UShort>(source0 + 6);
+					UShort w1 = *Pointer<UShort>(source1 + 6);
+					UShort w2 = *Pointer<UShort>(source2 + 6);
+					UShort w3 = *Pointer<UShort>(source3 + 6);
+
+					v.w.x = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w0) * 4);
+					v.w.y = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w1) * 4);
+					v.w.z = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w2) * 4);
+					v.w.w = *Pointer<Float>(r.constants + OFFSET(Constants,half2float) + Int(w3) * 4);
+				}
+			}
+			break;
+		case STREAMTYPE_INDICES:
+			{
+				// One 32-bit value per vertex, loaded as Float without conversion.
+				v.x.x = *Pointer<Float>(source0);
+				v.x.y = *Pointer<Float>(source1);
+				v.x.z = *Pointer<Float>(source2);
+				v.x.w = *Pointer<Float>(source3);
+			}
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		// Default values for components the stream does not provide: (0, 0, 0, 1).
+		if(stream.count < 1) v.x = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+		if(stream.count < 2) v.y = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+		if(stream.count < 3) v.z = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+		if(stream.count < 4) v.w = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+
+		return v;
+	}
+
+	// Applies DrawData::posScale and DrawData::posOffset to the x and y of the
+	// position output register. Only done when state.postTransform is set and
+	// the vertices are not pre-transformed. The offset is multiplied by w.
+	void VertexRoutine::postTransform(Registers &r)
+	{
+		int pos = state.positionRegister;
+
+		if(state.postTransform && !state.preTransformed)
+		{
+			Float4 posScale = *Pointer<Float4>(r.data + OFFSET(DrawData,posScale));   // FIXME: Unpack
+
+			r.ox[pos] = r.ox[pos] * posScale.x;
+			r.oy[pos] = r.oy[pos] * posScale.y;
+
+			Float4 posOffset = *Pointer<Float4>(r.data + OFFSET(DrawData,posOffset));   // FIXME: Unpack
+
+			r.ox[pos] = r.ox[pos] + r.ow[pos] * posOffset.x;
+			r.oy[pos] = r.oy[pos] + r.ow[pos] * posOffset.y;
+		}
+	}
+
+	// Stores the four processed vertices into four consecutive Vertex entries
+	// starting at cacheLine: the written output registers (with optional
+	// per-component clamping to [0, 1]), the per-vertex clip flags, and the
+	// projected position (Vertex::X).
+	void VertexRoutine::writeCache(Pointer<Byte> &cacheLine, Registers &r)
+	{
+		Color4f v;
+
+		for(int i = 0; i < 12; i++)
+		{
+			if(state.output[i].write)
+			{
+				v.x = r.ox[i];
+				v.y = r.oy[i];
+				v.z = r.oz[i];
+				v.w = r.ow[i];
+
+				if(state.output[i].xClamp)
+				{
+					v.x = Max(v.x, Float4(0.0f, 0.0f, 0.0f, 0.0f));
+					v.x = Min(v.x, Float4(1.0f, 1.0f, 1.0f, 1.0f));
+				}
+
+				if(state.output[i].yClamp)
+				{
+					v.y = Max(v.y, Float4(0.0f, 0.0f, 0.0f, 0.0f));
+					v.y = Min(v.y, Float4(1.0f, 1.0f, 1.0f, 1.0f));
+				}
+
+				if(state.output[i].zClamp)
+				{
+					v.z = Max(v.z, Float4(0.0f, 0.0f, 0.0f, 0.0f));
+					v.z = Min(v.z, Float4(1.0f, 1.0f, 1.0f, 1.0f));
+				}
+
+				if(state.output[i].wClamp)
+				{
+					v.w = Max(v.w, Float4(0.0f, 0.0f, 0.0f, 0.0f));
+					v.w = Min(v.w, Float4(1.0f, 1.0f, 1.0f, 1.0f));
+				}
+
+				if(state.output[i].write == 0x01)
+				{
+					// Write mask is exactly 0x01: store one scalar (x) per vertex.
+					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
+					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
+					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
+					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
+				}
+				else
+				{
+					// Back to one register per vertex before the 16-byte stores.
+					if(state.output[i].write == 0x02)
+					{
+						transpose2x4(v.x, v.y, v.z, v.w);
+					}
+					else
+					{
+						transpose4x4(v.x, v.y, v.z, v.w);
+					}
+
+					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
+					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
+					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
+					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
+				}
+			}
+		}
+
+		// r.clipFlags holds one byte per vertex; byte n goes to vertex n, stored
+		// as a full Int.
+		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (r.clipFlags >> 0) & 0x0000000FF;   // FIXME: unsigned char Vertex::clipFlags
+		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (r.clipFlags >> 8) & 0x0000000FF;
+		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (r.clipFlags >> 16) & 0x0000000FF;
+		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (r.clipFlags >> 24) & 0x0000000FF;
+
+		int pos = state.positionRegister;
+
+		// Unclamped position, read again from the output registers.
+		v.x = r.ox[pos];
+		v.y = r.oy[pos];
+		v.z = r.oz[pos];
+		v.w = r.ow[pos];
+
+		// In lanes where w compares equal to zero, OR in the bit pattern of 1.0f
+		// so the reciprocal below does not divide by zero.
+		Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0, 0, 0, 0))) & As<Int4>(Float4(1, 1, 1, 1))));
+		Float4 rhw = Float4(1.0f) / w;
+
+		// x and y are divided by w, scaled and offset by the DrawData viewport
+		// values, then rounded to integers. As<Float4> presumably reinterprets the
+		// integer bits rather than converting, so Vertex::X would hold integer
+		// x and y here - confirm against the consumer.
+		v.x = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,LLLLx16)) + v.x * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,WWWWx16))));
+		v.y = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,TTTTx16)) + v.y * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,HHHHx16))));
+		v.z = v.z * rhw;
+		v.w = rhw;
+
+		transpose4x4(v.x, v.y, v.z, v.w);
+
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
+	}
+
+	// Copies one vertex from the cache to the output: every written output
+	// register (as a full Float4, regardless of the write mask), the clip flags
+	// and the projected position.
+	void VertexRoutine::writeVertex(Pointer<Byte> &vertex, Pointer<Byte> &cache)
+	{
+		for(int i = 0; i < 12; i++)
+		{
+			if(state.output[i].write)
+			{
+				*Pointer<Float4>(vertex + OFFSET(Vertex,v[i])) = *Pointer<Float4>(cache + OFFSET(Vertex,v[i]));
+			}
+		}
+
+		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
+		*Pointer<Float4>(vertex + OFFSET(Vertex,X)) = *Pointer<Float4>(cache + OFFSET(Vertex,X));
+	}
+}