Duplicate source files for Vulkan.
The Vulkan implementation needs a directory for each architectural
layer, similar to the OpenGL ES stack. The entire rendering stack is
duplicated, leaving only Reactor common between them:
Renderer -> Device
Shader -> Pipeline
Common -> System
Main -> WSI
Bug b/117152542
Change-Id: I9c26b23654016d637f88ec2416f019ef65b9afbd
Reviewed-on: https://swiftshader-review.googlesource.com/c/21248
Reviewed-by: Alexis Hétu <sugoi@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
new file mode 100644
index 0000000..6522a13
--- /dev/null
+++ b/src/Device/Blitter.cpp
@@ -0,0 +1,1481 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Blitter.hpp"
+
+#include "Shader/ShaderCore.hpp"
+#include "Reactor/Reactor.hpp"
+#include "Common/Memory.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+ Blitter::Blitter() // Allocates the routine cache owned by this blitter.
+ {
+ blitCache = new RoutineCache<State>(1024); // Keyed by blit State; 1024 is presumably the cache capacity — confirm in RoutineCache.
+ }
+
+ Blitter::~Blitter() // Frees the routine cache allocated by the constructor.
+ {
+ delete blitCache;
+ }
+
+ void Blitter::clear(void *pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask) // Fills dRect of dest with the single texel at 'pixel', which is stored in 'format'; rgbaMask selects the channels to write.
+ {
+ // Take the generic route only when the direct-fill fast path declines the request.
+ if(!fastClear(pixel, format, dest, dRect, rgbaMask))
+ {
+ // Wrap the clear value in a temporary 1x1x1 surface and blit it over the destination rectangle.
+ sw::Surface *texel = sw::Surface::create(1, 1, 1, format, pixel, sw::Surface::bytes(format), sw::Surface::bytes(format));
+ SliceRectF center(0.5f, 0.5f, 0.5f, 0.5f, 0); // Zero-sized rectangle at the middle of that texel.
+ blit(texel, center, dest, dRect, {rgbaMask});
+ delete texel;
+ }
+ }
+
+ bool Blitter::fastClear(void *pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask) // Direct-fill clear of dRect; returns false when the request must go through blit() instead.
+ {
+ if(format != FORMAT_A32B32G32R32F) // Only four-float clear values are handled here.
+ {
+ return false;
+ }
+ // Clamp each channel to [0, 1] (NaN becomes 0) before packing: converting an out-of-range float to an integer is undefined, and the generic path in ApplyScaleAndClamp() also clamps float sources written to non-float destinations.
+ float *color = (float*)pixel;
+ float r = !(color[0] > 0.0f) ? 0.0f : (color[0] > 1.0f ? 1.0f : color[0]);
+ float g = !(color[1] > 0.0f) ? 0.0f : (color[1] > 1.0f ? 1.0f : color[1]);
+ float b = !(color[2] > 0.0f) ? 0.0f : (color[2] > 1.0f ? 1.0f : color[2]);
+ float a = !(color[3] > 0.0f) ? 0.0f : (color[3] > 1.0f ? 1.0f : color[3]);
+
+ uint32_t packed; // Clear value encoded in the destination's texel layout.
+
+ switch(dest->getFormat())
+ {
+ case FORMAT_R5G6B5:
+ if((rgbaMask & 0x7) != 0x7) return false; // A partial color mask needs the generic path.
+ packed = ((uint16_t)(31 * b + 0.5f) << 0) |
+ ((uint16_t)(63 * g + 0.5f) << 5) |
+ ((uint16_t)(31 * r + 0.5f) << 11);
+ break;
+ case FORMAT_X8B8G8R8:
+ if((rgbaMask & 0x7) != 0x7) return false; // Alpha bit of the mask is irrelevant: X is always written as 255.
+ packed = ((uint32_t)(255) << 24) |
+ ((uint32_t)(255 * b + 0.5f) << 16) |
+ ((uint32_t)(255 * g + 0.5f) << 8) |
+ ((uint32_t)(255 * r + 0.5f) << 0);
+ break;
+ case FORMAT_A8B8G8R8:
+ if((rgbaMask & 0xF) != 0xF) return false; // All four channels must be enabled.
+ packed = ((uint32_t)(255 * a + 0.5f) << 24) |
+ ((uint32_t)(255 * b + 0.5f) << 16) |
+ ((uint32_t)(255 * g + 0.5f) << 8) |
+ ((uint32_t)(255 * r + 0.5f) << 0);
+ break;
+ case FORMAT_X8R8G8B8:
+ if((rgbaMask & 0x7) != 0x7) return false;
+ packed = ((uint32_t)(255) << 24) |
+ ((uint32_t)(255 * r + 0.5f) << 16) |
+ ((uint32_t)(255 * g + 0.5f) << 8) |
+ ((uint32_t)(255 * b + 0.5f) << 0);
+ break;
+ case FORMAT_A8R8G8B8:
+ if((rgbaMask & 0xF) != 0xF) return false;
+ packed = ((uint32_t)(255 * a + 0.5f) << 24) |
+ ((uint32_t)(255 * r + 0.5f) << 16) |
+ ((uint32_t)(255 * g + 0.5f) << 8) |
+ ((uint32_t)(255 * b + 0.5f) << 0);
+ break;
+ default:
+ return false; // Any other destination format is left to the generic path.
+ }
+
+ bool useDestInternal = !dest->isExternalDirty(); // Presumably targets the internal buffer unless the external copy holds newer data — confirm in Surface.
+ uint8_t *slice = (uint8_t*)dest->lock(dRect.x0, dRect.y0, dRect.slice, sw::LOCK_WRITEONLY, sw::PUBLIC, useDestInternal);
+
+ for(int j = 0; j < dest->getSamples(); j++) // One pass per sample; consecutive samples are getSliceB() bytes apart.
+ {
+ uint8_t *d = slice;
+
+ switch(Surface::bytes(dest->getFormat()))
+ {
+ case 2:
+ for(int i = dRect.y0; i < dRect.y1; i++) // Fill one row of the rectangle per iteration.
+ {
+ sw::clear((uint16_t*)d, packed, dRect.x1 - dRect.x0);
+ d += dest->getPitchB(useDestInternal);
+ }
+ break;
+ case 4:
+ for(int i = dRect.y0; i < dRect.y1; i++)
+ {
+ sw::clear((uint32_t*)d, packed, dRect.x1 - dRect.x0);
+ d += dest->getPitchB(useDestInternal);
+ }
+ break;
+ default:
+ assert(false); // Unreachable: every format accepted above is 2 or 4 bytes per texel.
+ }
+
+ slice += dest->getSliceB(useDestInternal);
+ }
+
+ dest->unlock(useDestInternal);
+
+ return true;
+ }
+
+ void Blitter::blit(Surface *source, const SliceRectF &sourceRect, Surface *dest, const SliceRect &destRect, const Blitter::Options& options) // Copies sourceRect of source onto destRect of dest, resampling as needed.
+ {
+ // Nothing to do for a FORMAT_NULL destination; otherwise we are done if blitReactor() returns true.
+ if(dest->getInternalFormat() == FORMAT_NULL || blitReactor(source, sourceRect, dest, destRect, options))
+ {
+ return;
+ }
+
+ // Fallback: copy texel by texel through Surface::copyInternal().
+ SliceRectF src = sourceRect;
+ SliceRect dst = destRect;
+
+ // Put the destination edges in ascending order, swapping the matching source edges so the mapping is preserved.
+ if(destRect.x0 > destRect.x1)
+ {
+ swap(dst.x0, dst.x1);
+ swap(src.x0, src.x1);
+ }
+
+ if(destRect.y0 > destRect.y1)
+ {
+ swap(dst.y0, dst.y1);
+ swap(src.y0, src.y1);
+ }
+
+ // Both surfaces stay locked for the duration of the copy loop.
+ source->lockInternal(0, 0, src.slice, sw::LOCK_READONLY, sw::PUBLIC);
+ dest->lockInternal(0, 0, dst.slice, sw::LOCK_WRITEONLY, sw::PUBLIC);
+
+ // Distance travelled in the source per destination pixel.
+ float stepX = src.width() / dst.width();
+ float stepY = src.height() / dst.height();
+
+ // Offsets chosen so that destination pixel i samples at src.x0 + (i + 0.5 - dst.x0) * stepX, i.e. at its center.
+ float originX = src.x0 + (0.5f - dst.x0) * stepX;
+ float originY = src.y0 + (0.5f - dst.y0) * stepY;
+
+ // Visit every destination pixel inside the rectangle, row by row.
+ for(int row = dst.y0; row < dst.y1; row++)
+ {
+ float srcY = originY + row * stepY;
+
+ for(int col = dst.x0; col < dst.x1; col++)
+ {
+ float srcX = originX + col * stepX;
+
+ // FIXME: Support RGBA mask
+ dest->copyInternal(source, col, row, srcX, srcY, options.filter);
+ }
+ }
+
+ source->unlockInternal();
+ dest->unlockInternal();
+ }
+
+ void Blitter::blit3D(Surface *source, Surface *dest) // Resamples the whole of source into the whole of dest across width, height and depth.
+ {
+ source->lockInternal(0, 0, 0, sw::LOCK_READONLY, sw::PUBLIC);
+ dest->lockInternal(0, 0, 0, sw::LOCK_WRITEONLY, sw::PUBLIC);
+ // Source texels covered by one destination texel along each axis.
+ float ratioX = static_cast<float>(source->getWidth()) / static_cast<float>(dest->getWidth());
+ float ratioY = static_cast<float>(source->getHeight()) / static_cast<float>(dest->getHeight());
+ float ratioZ = static_cast<float>(source->getDepth()) / static_cast<float>(dest->getDepth());
+
+ for(int slice = 0; slice < dest->getDepth(); slice++)
+ {
+ float srcZ = (slice + 0.5f) * ratioZ; // Source coordinate of the destination texel's center.
+
+ for(int row = 0; row < dest->getHeight(); row++)
+ {
+ float srcY = (row + 0.5f) * ratioY;
+
+ for(int col = 0; col < dest->getWidth(); col++)
+ {
+ float srcX = (col + 0.5f) * ratioX;
+ // The trailing 'true' presumably enables filtering — confirm in Surface::copyInternal.
+ dest->copyInternal(source, col, row, slice, srcX, srcY, srcZ, true);
+ }
+ }
+ }
+
+ source->unlockInternal();
+ dest->unlockInternal();
+ }
+
+ bool Blitter::read(Float4 &c, Pointer<Byte> element, const State &state) // Loads one texel of state.sourceFormat from 'element' into c as floats; returns false for formats not listed below.
+ {
+ c = Float4(0.0f, 0.0f, 0.0f, 1.0f); // Default for channels the format does not store.
+ // Channels are loaded as raw, unnormalized values. Where the format stores no alpha, c.w is either set to the format's maximum component value or left at 1.0f.
+ switch(state.sourceFormat)
+ {
+ case FORMAT_L8:
+ c.xyz = Float(Int(*Pointer<Byte>(element))); // Luminance is replicated to x, y and z.
+ c.w = float(0xFF);
+ break;
+ case FORMAT_A8:
+ c.w = Float(Int(*Pointer<Byte>(element)));
+ break;
+ case FORMAT_R8I:
+ case FORMAT_R8_SNORM:
+ c.x = Float(Int(*Pointer<SByte>(element)));
+ c.w = float(0x7F);
+ break;
+ case FORMAT_R8:
+ case FORMAT_R8UI:
+ c.x = Float(Int(*Pointer<Byte>(element)));
+ c.w = float(0xFF);
+ break;
+ case FORMAT_R16I:
+ c.x = Float(Int(*Pointer<Short>(element)));
+ c.w = float(0x7FFF);
+ break;
+ case FORMAT_R16UI:
+ c.x = Float(Int(*Pointer<UShort>(element)));
+ c.w = float(0xFFFF);
+ break;
+ case FORMAT_R32I:
+ c.x = Float(*Pointer<Int>(element));
+ c.w = float(0x7FFFFFFF);
+ break;
+ case FORMAT_R32UI:
+ c.x = Float(*Pointer<UInt>(element));
+ c.w = float(0xFFFFFFFF);
+ break;
+ case FORMAT_A8R8G8B8:
+ c = Float4(*Pointer<Byte4>(element)).zyxw; // Bytes are stored B, G, R, A; the swizzle yields R, G, B, A.
+ break;
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8_SNORM:
+ c = Float4(*Pointer<SByte4>(element));
+ break;
+ case FORMAT_A8B8G8R8:
+ case FORMAT_A8B8G8R8UI:
+ case FORMAT_SRGB8_A8:
+ c = Float4(*Pointer<Byte4>(element));
+ break;
+ case FORMAT_X8R8G8B8:
+ c = Float4(*Pointer<Byte4>(element)).zyxw;
+ c.w = float(0xFF); // Stored X byte is ignored.
+ break;
+ case FORMAT_R8G8B8:
+ c.z = Float(Int(*Pointer<Byte>(element + 0)));
+ c.y = Float(Int(*Pointer<Byte>(element + 1)));
+ c.x = Float(Int(*Pointer<Byte>(element + 2)));
+ c.w = float(0xFF);
+ break;
+ case FORMAT_B8G8R8:
+ c.x = Float(Int(*Pointer<Byte>(element + 0)));
+ c.y = Float(Int(*Pointer<Byte>(element + 1)));
+ c.z = Float(Int(*Pointer<Byte>(element + 2)));
+ c.w = float(0xFF);
+ break;
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X8B8G8R8_SNORM:
+ c = Float4(*Pointer<SByte4>(element));
+ c.w = float(0x7F);
+ break;
+ case FORMAT_X8B8G8R8:
+ case FORMAT_X8B8G8R8UI:
+ case FORMAT_SRGB8_X8:
+ c = Float4(*Pointer<Byte4>(element));
+ c.w = float(0xFF);
+ break;
+ case FORMAT_A16B16G16R16I:
+ c = Float4(*Pointer<Short4>(element));
+ break;
+ case FORMAT_A16B16G16R16:
+ case FORMAT_A16B16G16R16UI:
+ c = Float4(*Pointer<UShort4>(element));
+ break;
+ case FORMAT_X16B16G16R16I:
+ c = Float4(*Pointer<Short4>(element));
+ c.w = float(0x7FFF);
+ break;
+ case FORMAT_X16B16G16R16UI:
+ c = Float4(*Pointer<UShort4>(element));
+ c.w = float(0xFFFF);
+ break;
+ case FORMAT_A32B32G32R32I:
+ c = Float4(*Pointer<Int4>(element));
+ break;
+ case FORMAT_A32B32G32R32UI:
+ c = Float4(*Pointer<UInt4>(element));
+ break;
+ case FORMAT_X32B32G32R32I:
+ c = Float4(*Pointer<Int4>(element));
+ c.w = float(0x7FFFFFFF);
+ break;
+ case FORMAT_X32B32G32R32UI:
+ c = Float4(*Pointer<UInt4>(element));
+ c.w = float(0xFFFFFFFF);
+ break;
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8_SNORM:
+ c.x = Float(Int(*Pointer<SByte>(element + 0)));
+ c.y = Float(Int(*Pointer<SByte>(element + 1)));
+ c.w = float(0x7F);
+ break;
+ case FORMAT_G8R8:
+ case FORMAT_G8R8UI:
+ c.x = Float(Int(*Pointer<Byte>(element + 0)));
+ c.y = Float(Int(*Pointer<Byte>(element + 1)));
+ c.w = float(0xFF);
+ break;
+ case FORMAT_G16R16I:
+ c.x = Float(Int(*Pointer<Short>(element + 0)));
+ c.y = Float(Int(*Pointer<Short>(element + 2)));
+ c.w = float(0x7FFF);
+ break;
+ case FORMAT_G16R16:
+ case FORMAT_G16R16UI:
+ c.x = Float(Int(*Pointer<UShort>(element + 0)));
+ c.y = Float(Int(*Pointer<UShort>(element + 2)));
+ c.w = float(0xFFFF);
+ break;
+ case FORMAT_G32R32I:
+ c.x = Float(*Pointer<Int>(element + 0));
+ c.y = Float(*Pointer<Int>(element + 4));
+ c.w = float(0x7FFFFFFF);
+ break;
+ case FORMAT_G32R32UI:
+ c.x = Float(*Pointer<UInt>(element + 0));
+ c.y = Float(*Pointer<UInt>(element + 4));
+ c.w = float(0xFFFFFFFF);
+ break;
+ case FORMAT_A32B32G32R32F:
+ c = *Pointer<Float4>(element);
+ break;
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_B32G32R32F:
+ c.z = *Pointer<Float>(element + 8); // Intentional fall-through: x and y are loaded by the next case.
+ case FORMAT_G32R32F:
+ c.x = *Pointer<Float>(element + 0);
+ c.y = *Pointer<Float>(element + 4);
+ break;
+ case FORMAT_R32F:
+ c.x = *Pointer<Float>(element);
+ break;
+ case FORMAT_R5G6B5:
+ c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11))); // Red: top 5 bits.
+ c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5))); // Green: middle 6 bits.
+ c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F))); // Blue: low 5 bits.
+ break;
+ case FORMAT_A2B10G10R10:
+ case FORMAT_A2B10G10R10UI:
+ c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
+ c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
+ c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
+ c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
+ break;
+ case FORMAT_D16:
+ c.x = Float(Int((*Pointer<UShort>(element))));
+ break;
+ case FORMAT_D24S8:
+ case FORMAT_D24X8:
+ c.x = Float(Int((*Pointer<UInt>(element) & UInt(0xFFFFFF00)) >> 8)); // Depth is taken from the upper 24 bits.
+ break;
+ case FORMAT_D32:
+ c.x = Float(Int((*Pointer<UInt>(element)))); // NOTE(review): converts through Int, unlike FORMAT_R32UI above which converts the UInt directly — confirm values above 0x7FFFFFFF behave as intended.
+ break;
+ case FORMAT_D32F_COMPLEMENTARY:
+ case FORMAT_D32FS8_COMPLEMENTARY:
+ c.x = 1.0f - *Pointer<Float>(element); // Stored value is one minus the value returned.
+ break;
+ case FORMAT_D32F:
+ case FORMAT_D32FS8:
+ case FORMAT_D32F_LOCKABLE:
+ case FORMAT_D32FS8_TEXTURE:
+ case FORMAT_D32F_SHADOW:
+ case FORMAT_D32FS8_SHADOW:
+ c.x = *Pointer<Float>(element);
+ break;
+ case FORMAT_S8:
+ c.x = Float(Int(*Pointer<Byte>(element)));
+ break;
+ default:
+ return false; // Unsupported source format.
+ }
+
+ return true;
+ }
+
+ bool Blitter::write(Float4 &c, Pointer<Byte> element, const State &state) // Stores c at 'element' in state.destFormat, honoring the per-channel write mask in state; returns false for formats not listed below.
+ {
+ bool writeR = state.writeRed;
+ bool writeG = state.writeGreen;
+ bool writeB = state.writeBlue;
+ bool writeA = state.writeAlpha;
+ bool writeRGBA = writeR && writeG && writeB && writeA; // No channel masked: a single vector store can be used.
+ // Values are only rounded and stored here; no scaling or clamping is applied in this function.
+ switch(state.destFormat)
+ {
+ case FORMAT_L8:
+ *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x))); // Written regardless of the write mask.
+ break;
+ case FORMAT_A8:
+ if(writeA) { *Pointer<Byte>(element) = Byte(RoundInt(Float(c.w))); }
+ break;
+ case FORMAT_A8R8G8B8:
+ if(writeRGBA)
+ {
+ Short4 c0 = RoundShort4(c.zyxw); // Swizzle to the B, G, R, A storage order.
+ *Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
+ }
+ else
+ {
+ if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
+ if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+ if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
+ if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
+ }
+ break;
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_A8:
+ if(writeRGBA)
+ {
+ Short4 c0 = RoundShort4(c);
+ *Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
+ }
+ else
+ {
+ if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
+ if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+ if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
+ if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
+ }
+ break;
+ case FORMAT_X8R8G8B8:
+ if(writeRGBA)
+ {
+ Short4 c0 = RoundShort4(c.zyxw) | Short4(0x0000, 0x0000, 0x0000, 0x00FF); // Force the unused X byte to 0xFF.
+ *Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
+ }
+ else
+ {
+ if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
+ if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+ if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
+ if(writeA) { *Pointer<Byte>(element + 3) = Byte(0xFF); }
+ }
+ break;
+ case FORMAT_X8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ if(writeRGBA)
+ {
+ Short4 c0 = RoundShort4(c) | Short4(0x0000, 0x0000, 0x0000, 0x00FF);
+ *Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
+ }
+ else
+ {
+ if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
+ if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+ if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
+ if(writeA) { *Pointer<Byte>(element + 3) = Byte(0xFF); }
+ }
+ break;
+ case FORMAT_R8G8B8:
+ if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
+ if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+ if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
+ break;
+ case FORMAT_B8G8R8:
+ if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
+ if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+ if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
+ break;
+ case FORMAT_A32B32G32R32F:
+ if(writeRGBA)
+ {
+ *Pointer<Float4>(element) = c;
+ }
+ else
+ {
+ if(writeR) { *Pointer<Float>(element) = c.x; }
+ if(writeG) { *Pointer<Float>(element + 4) = c.y; }
+ if(writeB) { *Pointer<Float>(element + 8) = c.z; }
+ if(writeA) { *Pointer<Float>(element + 12) = c.w; }
+ }
+ break;
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ if(writeA) { *Pointer<Float>(element + 12) = 1.0f; } // Intentional fall-through: R, G and B are written by the next case.
+ case FORMAT_B32G32R32F:
+ if(writeR) { *Pointer<Float>(element) = c.x; }
+ if(writeG) { *Pointer<Float>(element + 4) = c.y; }
+ if(writeB) { *Pointer<Float>(element + 8) = c.z; }
+ break;
+ case FORMAT_G32R32F:
+ if(writeR && writeG)
+ {
+ *Pointer<Float2>(element) = Float2(c);
+ }
+ else
+ {
+ if(writeR) { *Pointer<Float>(element) = c.x; }
+ if(writeG) { *Pointer<Float>(element + 4) = c.y; }
+ }
+ break;
+ case FORMAT_R32F:
+ if(writeR) { *Pointer<Float>(element) = c.x; }
+ break;
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8_SNORM:
+ if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); } // Intentional fall-through chain: each case below writes one fewer channel.
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X8B8G8R8_SNORM:
+ if(writeA && (state.destFormat == FORMAT_X8B8G8R8I || state.destFormat == FORMAT_X8B8G8R8_SNORM)) // Only the X formats get the constant; the A formats fell through with alpha already written.
+ {
+ *Pointer<SByte>(element + 3) = SByte(0x7F);
+ }
+ if(writeB) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.z))); }
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8_SNORM:
+ if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
+ case FORMAT_R8I:
+ case FORMAT_R8_SNORM:
+ if(writeR) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.x))); }
+ break;
+ case FORMAT_A8B8G8R8UI:
+ if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); } // Intentional fall-through chain, as for the signed formats above.
+ case FORMAT_X8B8G8R8UI:
+ if(writeA && (state.destFormat == FORMAT_X8B8G8R8UI))
+ {
+ *Pointer<Byte>(element + 3) = Byte(0xFF);
+ }
+ if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
+ case FORMAT_G8R8UI:
+ case FORMAT_G8R8:
+ if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+ case FORMAT_R8UI:
+ case FORMAT_R8:
+ if(writeR) { *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x))); }
+ break;
+ case FORMAT_A16B16G16R16I:
+ if(writeRGBA)
+ {
+ *Pointer<Short4>(element) = Short4(RoundInt(c));
+ }
+ else
+ {
+ if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+ if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
+ if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
+ if(writeA) { *Pointer<Short>(element + 6) = Short(RoundInt(Float(c.w))); }
+ }
+ break;
+ case FORMAT_X16B16G16R16I:
+ if(writeRGBA)
+ {
+ *Pointer<Short4>(element) = Short4(RoundInt(c));
+ }
+ else
+ {
+ if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+ if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
+ if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
+ }
+ if(writeA) { *Pointer<Short>(element + 6) = Short(0x7FFF); } // Unused X component gets the 16-bit signed maximum, matching read() and the Int4 overload of write().
+ break;
+ case FORMAT_G16R16I:
+ if(writeR && writeG)
+ {
+ *Pointer<Short2>(element) = Short2(Short4(RoundInt(c)));
+ }
+ else
+ {
+ if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+ if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
+ }
+ break;
+ case FORMAT_R16I:
+ if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+ break;
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_A16B16G16R16:
+ if(writeRGBA)
+ {
+ *Pointer<UShort4>(element) = UShort4(RoundInt(c));
+ }
+ else
+ {
+ if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+ if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
+ if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
+ if(writeA) { *Pointer<UShort>(element + 6) = UShort(RoundInt(Float(c.w))); }
+ }
+ break;
+ case FORMAT_X16B16G16R16UI:
+ if(writeRGBA)
+ {
+ *Pointer<UShort4>(element) = UShort4(RoundInt(c));
+ }
+ else
+ {
+ if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+ if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
+ if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
+ }
+ if(writeA) { *Pointer<UShort>(element + 6) = UShort(0xFFFF); } // Unused X component gets the 16-bit unsigned maximum, matching read() and the Int4 overload of write().
+ break;
+ case FORMAT_G16R16UI:
+ case FORMAT_G16R16:
+ if(writeR && writeG)
+ {
+ *Pointer<UShort2>(element) = UShort2(UShort4(RoundInt(c)));
+ }
+ else
+ {
+ if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+ if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
+ }
+ break;
+ case FORMAT_R16UI:
+ if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+ break;
+ case FORMAT_A32B32G32R32I:
+ if(writeRGBA)
+ {
+ *Pointer<Int4>(element) = RoundInt(c);
+ }
+ else
+ {
+ if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
+ if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
+ if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
+ if(writeA) { *Pointer<Int>(element + 12) = RoundInt(Float(c.w)); }
+ }
+ break;
+ case FORMAT_X32B32G32R32I:
+ if(writeRGBA)
+ {
+ *Pointer<Int4>(element) = RoundInt(c);
+ }
+ else
+ {
+ if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
+ if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
+ if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
+ }
+ if(writeA) { *Pointer<Int>(element + 12) = Int(0x7FFFFFFF); }
+ break;
+ case FORMAT_G32R32I:
+ if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); } // Intentional fall-through to write R.
+ case FORMAT_R32I:
+ if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
+ break;
+ case FORMAT_A32B32G32R32UI:
+ if(writeRGBA)
+ {
+ *Pointer<UInt4>(element) = UInt4(RoundInt(c));
+ }
+ else
+ {
+ if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
+ if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
+ if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
+ if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(RoundInt(Float(c.w))); }
+ }
+ break;
+ case FORMAT_X32B32G32R32UI:
+ if(writeRGBA)
+ {
+ *Pointer<UInt4>(element) = UInt4(RoundInt(c));
+ }
+ else
+ {
+ if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
+ if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
+ if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
+ }
+ if(writeA) { *Pointer<UInt>(element + 12) = UInt(0xFFFFFFFF); } // Single 32-bit store: a UInt4 store at offset 12 would run 12 bytes past the 16-byte texel.
+ break;
+ case FORMAT_G32R32UI:
+ if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); } // Intentional fall-through to write R.
+ case FORMAT_R32UI:
+ if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
+ break;
+ case FORMAT_R5G6B5:
+ if(writeR && writeG && writeB)
+ {
+ *Pointer<UShort>(element) = UShort(RoundInt(Float(c.z)) |
+ (RoundInt(Float(c.y)) << Int(5)) |
+ (RoundInt(Float(c.x)) << Int(11)));
+ }
+ else
+ {
+ unsigned short mask = (writeB ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeR ? 0xF800 : 0x0000); // Bits belonging to the enabled channels.
+ unsigned short unmask = ~mask;
+ *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) | // Read-modify-write: keep the bits of masked-out channels.
+ (UShort(RoundInt(Float(c.z)) |
+ (RoundInt(Float(c.y)) << Int(5)) |
+ (RoundInt(Float(c.x)) << Int(11))) & UShort(mask));
+ }
+ break;
+ case FORMAT_A2B10G10R10:
+ case FORMAT_A2B10G10R10UI:
+ if(writeRGBA)
+ {
+ *Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) |
+ (RoundInt(Float(c.y)) << 10) |
+ (RoundInt(Float(c.z)) << 20) |
+ (RoundInt(Float(c.w)) << 30));
+ }
+ else
+ {
+ unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
+ (writeB ? 0x3FF00000 : 0x0000) |
+ (writeG ? 0x000FFC00 : 0x0000) |
+ (writeR ? 0x000003FF : 0x0000);
+ unsigned int unmask = ~mask;
+ *Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) | // Read-modify-write: keep the bits of masked-out channels.
+ (UInt(RoundInt(Float(c.x)) |
+ (RoundInt(Float(c.y)) << 10) |
+ (RoundInt(Float(c.z)) << 20) |
+ (RoundInt(Float(c.w)) << 30)) & UInt(mask));
+ }
+ break;
+ case FORMAT_D16:
+ *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); // Depth and stencil formats ignore the write mask.
+ break;
+ case FORMAT_D24S8:
+ case FORMAT_D24X8:
+ *Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) << 8); // Depth goes in the upper 24 bits; the low byte is written as zero.
+ break;
+ case FORMAT_D32:
+ *Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)));
+ break;
+ case FORMAT_D32F_COMPLEMENTARY:
+ case FORMAT_D32FS8_COMPLEMENTARY:
+ *Pointer<Float>(element) = 1.0f - c.x; // Stored as one minus the value.
+ break;
+ case FORMAT_D32F:
+ case FORMAT_D32FS8:
+ case FORMAT_D32F_LOCKABLE:
+ case FORMAT_D32FS8_TEXTURE:
+ case FORMAT_D32F_SHADOW:
+ case FORMAT_D32FS8_SHADOW:
+ *Pointer<Float>(element) = c.x;
+ break;
+ case FORMAT_S8:
+ *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x)));
+ break;
+ default:
+ return false; // Unsupported destination format.
+ }
+ return true;
+ }
+
+ bool Blitter::read(Int4 &c, Pointer<Byte> element, const State &state) // Loads one texel of an integer state.sourceFormat from 'element' into c; returns false for formats not listed below.
+ {
+ c = Int4(0, 0, 0, 1); // Default for channels the format does not store.
+ // Each group below is an intentional fall-through chain: a case loads its highest channel, then continues into the narrower formats to load the rest.
+ switch(state.sourceFormat)
+ {
+ case FORMAT_A8B8G8R8I:
+ c = Insert(c, Int(*Pointer<SByte>(element + 3)), 3);
+ case FORMAT_X8B8G8R8I:
+ c = Insert(c, Int(*Pointer<SByte>(element + 2)), 2);
+ case FORMAT_G8R8I:
+ c = Insert(c, Int(*Pointer<SByte>(element + 1)), 1);
+ case FORMAT_R8I:
+ c = Insert(c, Int(*Pointer<SByte>(element)), 0);
+ break;
+ case FORMAT_A8B8G8R8UI:
+ c = Insert(c, Int(*Pointer<Byte>(element + 3)), 3);
+ case FORMAT_X8B8G8R8UI:
+ c = Insert(c, Int(*Pointer<Byte>(element + 2)), 2);
+ case FORMAT_G8R8UI:
+ c = Insert(c, Int(*Pointer<Byte>(element + 1)), 1);
+ case FORMAT_R8UI:
+ c = Insert(c, Int(*Pointer<Byte>(element)), 0);
+ break;
+ case FORMAT_A16B16G16R16I:
+ c = Insert(c, Int(*Pointer<Short>(element + 6)), 3);
+ case FORMAT_X16B16G16R16I:
+ c = Insert(c, Int(*Pointer<Short>(element + 4)), 2);
+ case FORMAT_G16R16I:
+ c = Insert(c, Int(*Pointer<Short>(element + 2)), 1);
+ case FORMAT_R16I:
+ c = Insert(c, Int(*Pointer<Short>(element)), 0);
+ break;
+ case FORMAT_A16B16G16R16UI:
+ c = Insert(c, Int(*Pointer<UShort>(element + 6)), 3);
+ case FORMAT_X16B16G16R16UI:
+ c = Insert(c, Int(*Pointer<UShort>(element + 4)), 2);
+ case FORMAT_G16R16UI:
+ c = Insert(c, Int(*Pointer<UShort>(element + 2)), 1);
+ case FORMAT_R16UI:
+ c = Insert(c, Int(*Pointer<UShort>(element)), 0);
+ break;
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ c = *Pointer<Int4>(element); // Signed and unsigned data share this load; unsigned values are carried bit-for-bit in the Int4.
+ break;
+ case FORMAT_X32B32G32R32I:
+ case FORMAT_X32B32G32R32UI:
+ c = Insert(c, *Pointer<Int>(element + 8), 2); // The stored X component is not read; c.w keeps its default of 1.
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ c = Insert(c, *Pointer<Int>(element + 4), 1);
+ case FORMAT_R32I:
+ case FORMAT_R32UI:
+ c = Insert(c, *Pointer<Int>(element), 0);
+ break;
+ default:
+ return false; // Not an integer format handled by this overload.
+ }
+
+ return true;
+ }
+
+ bool Blitter::write(Int4 &c, Pointer<Byte> element, const State &state) // Stores c at 'element' in an integer state.destFormat, honoring the per-channel write mask in state; returns false for formats not listed below.
+ {
+ bool writeR = state.writeRed;
+ bool writeG = state.writeGreen;
+ bool writeB = state.writeBlue;
+ bool writeA = state.writeAlpha;
+ bool writeRGBA = writeR && writeG && writeB && writeA; // No channel masked: a single vector store can be used.
+ // The 8- and 16-bit groups are intentional fall-through chains; X formats store the component type's maximum value in the unused slot.
+ switch(state.destFormat)
+ {
+ case FORMAT_A8B8G8R8I:
+ if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
+ case FORMAT_X8B8G8R8I:
+ if(writeA && (state.destFormat != FORMAT_A8B8G8R8I)) // Skip the constant when we fell through from the A format.
+ {
+ *Pointer<SByte>(element + 3) = SByte(0x7F);
+ }
+ if(writeB) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 2)); }
+ case FORMAT_G8R8I:
+ if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
+ case FORMAT_R8I:
+ if(writeR) { *Pointer<SByte>(element) = SByte(Extract(c, 0)); }
+ break;
+ case FORMAT_A8B8G8R8UI:
+ if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
+ case FORMAT_X8B8G8R8UI:
+ if(writeA && (state.destFormat != FORMAT_A8B8G8R8UI))
+ {
+ *Pointer<Byte>(element + 3) = Byte(0xFF);
+ }
+ if(writeB) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 2)); }
+ case FORMAT_G8R8UI:
+ if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
+ case FORMAT_R8UI:
+ if(writeR) { *Pointer<Byte>(element) = Byte(Extract(c, 0)); }
+ break;
+ case FORMAT_A16B16G16R16I:
+ if(writeA) { *Pointer<Short>(element + 6) = Short(Extract(c, 3)); }
+ case FORMAT_X16B16G16R16I:
+ if(writeA && (state.destFormat != FORMAT_A16B16G16R16I))
+ {
+ *Pointer<Short>(element + 6) = Short(0x7FFF);
+ }
+ if(writeB) { *Pointer<Short>(element + 4) = Short(Extract(c, 2)); }
+ case FORMAT_G16R16I:
+ if(writeG) { *Pointer<Short>(element + 2) = Short(Extract(c, 1)); }
+ case FORMAT_R16I:
+ if(writeR) { *Pointer<Short>(element) = Short(Extract(c, 0)); }
+ break;
+ case FORMAT_A16B16G16R16UI:
+ if(writeA) { *Pointer<UShort>(element + 6) = UShort(Extract(c, 3)); }
+ case FORMAT_X16B16G16R16UI:
+ if(writeA && (state.destFormat != FORMAT_A16B16G16R16UI))
+ {
+ *Pointer<UShort>(element + 6) = UShort(0xFFFF);
+ }
+ if(writeB) { *Pointer<UShort>(element + 4) = UShort(Extract(c, 2)); }
+ case FORMAT_G16R16UI:
+ if(writeG) { *Pointer<UShort>(element + 2) = UShort(Extract(c, 1)); }
+ case FORMAT_R16UI:
+ if(writeR) { *Pointer<UShort>(element) = UShort(Extract(c, 0)); }
+ break;
+ case FORMAT_A32B32G32R32I:
+ if(writeRGBA)
+ {
+ *Pointer<Int4>(element) = c;
+ }
+ else
+ {
+ if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+ if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
+ if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
+ if(writeA) { *Pointer<Int>(element + 12) = Extract(c, 3); }
+ }
+ break;
+ case FORMAT_X32B32G32R32I:
+ if(writeRGBA)
+ {
+ *Pointer<Int4>(element) = c;
+ }
+ else
+ {
+ if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+ if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
+ if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
+ }
+ if(writeA) { *Pointer<Int>(element + 12) = Int(0x7FFFFFFF); } // Overwrites whatever the vector store put in the X slot.
+ break;
+ case FORMAT_G32R32I:
+ if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+ if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
+ break;
+ case FORMAT_R32I:
+ if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+ break;
+ case FORMAT_A32B32G32R32UI:
+ if(writeRGBA)
+ {
+ *Pointer<UInt4>(element) = As<UInt4>(c);
+ }
+ else
+ {
+ if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
+ if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
+ if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
+ if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(Extract(c, 3)); }
+ }
+ break;
+ case FORMAT_X32B32G32R32UI:
+ if(writeRGBA)
+ {
+ *Pointer<UInt4>(element) = As<UInt4>(c);
+ }
+ else
+ {
+ if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
+ if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
+ if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
+ }
+ if(writeA) { *Pointer<UInt>(element + 12) = UInt(0xFFFFFFFF); } // The X component lives at byte offset 12 (fourth 32-bit slot); offset 3 would clobber R and G.
+ break;
+ case FORMAT_G32R32UI:
+ if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
+ if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
+ break;
+ case FORMAT_R32UI:
+ if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
+ break;
+ default:
+ return false; // Not an integer format handled by this overload.
+ }
+
+ return true;
+ }
+
+ bool Blitter::GetScale(float4 &scale, Format format) // Sets 'scale' to the per-channel factor associated with 'format'; returns false (scale untouched) for formats not listed below.
+ {
+ switch(format)
+ {
+ case FORMAT_L8:
+ case FORMAT_A8:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_X8R8G8B8:
+ case FORMAT_R8:
+ case FORMAT_G8R8:
+ case FORMAT_R8G8B8:
+ case FORMAT_B8G8R8:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ scale = vector(0xFF, 0xFF, 0xFF, 0xFF); // Maximum of an unsigned 8-bit component.
+ break;
+ case FORMAT_R8_SNORM:
+ case FORMAT_G8R8_SNORM:
+ case FORMAT_X8B8G8R8_SNORM:
+ case FORMAT_A8B8G8R8_SNORM:
+ scale = vector(0x7F, 0x7F, 0x7F, 0x7F); // Maximum of a signed 8-bit component.
+ break;
+ case FORMAT_A16B16G16R16:
+ scale = vector(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF); // Maximum of an unsigned 16-bit component.
+ break;
+ case FORMAT_R8I:
+ case FORMAT_R8UI:
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8UI:
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X8B8G8R8UI:
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8UI:
+ case FORMAT_R16I:
+ case FORMAT_R16UI:
+ case FORMAT_G16R16: // NOTE(review): grouped with the scale-1.0 formats while FORMAT_A16B16G16R16 above uses 0xFFFF — confirm this is intended.
+ case FORMAT_G16R16I:
+ case FORMAT_G16R16UI:
+ case FORMAT_X16B16G16R16I:
+ case FORMAT_X16B16G16R16UI:
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_R32I:
+ case FORMAT_R32UI:
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ case FORMAT_X32B32G32R32I:
+ case FORMAT_X32B32G32R32UI:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ case FORMAT_A32B32G32R32F:
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_B32G32R32F:
+ case FORMAT_G32R32F:
+ case FORMAT_R32F:
+ case FORMAT_A2B10G10R10UI:
+ scale = vector(1.0f, 1.0f, 1.0f, 1.0f); // Integer and floating-point formats: values pass through unscaled.
+ break;
+ case FORMAT_R5G6B5:
+ scale = vector(0x1F, 0x3F, 0x1F, 1.0f); // 5-, 6- and 5-bit maxima; no stored alpha.
+ break;
+ case FORMAT_A2B10G10R10:
+ scale = vector(0x3FF, 0x3FF, 0x3FF, 0x03); // 10-bit color maxima and 2-bit alpha maximum.
+ break;
+ case FORMAT_D16:
+ scale = vector(0xFFFF, 0.0f, 0.0f, 0.0f); // Only the x (depth) channel carries data.
+ break;
+ case FORMAT_D24S8:
+ case FORMAT_D24X8:
+ scale = vector(0xFFFFFF, 0.0f, 0.0f, 0.0f);
+ break;
+ case FORMAT_D32:
+ scale = vector(static_cast<float>(0xFFFFFFFF), 0.0f, 0.0f, 0.0f);
+ break;
+ case FORMAT_D32F:
+ case FORMAT_D32FS8:
+ case FORMAT_D32F_COMPLEMENTARY:
+ case FORMAT_D32FS8_COMPLEMENTARY:
+ case FORMAT_D32F_LOCKABLE:
+ case FORMAT_D32FS8_TEXTURE:
+ case FORMAT_D32F_SHADOW:
+ case FORMAT_D32FS8_SHADOW:
+ case FORMAT_S8:
+ scale = vector(1.0f, 1.0f, 1.0f, 1.0f);
+ break;
+ default:
+ return false; // Format has no scale defined here.
+ }
+
+ return true;
+ }
+
+ bool Blitter::ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled)
+ {
+     // Rescales 'value' from the source format's numeric range to the destination format's,
+     // converting between sRGB and linear when requested. Returns false if a scale is unknown.
+     float4 srcRange, dstRange;
+     // If we're clearing a buffer from an int or uint color into a normalized color,
+     // then the whole range of the int or uint color must be scaled between 0 and 1.
+     const bool intToNormalizedClear = state.clearOperation &&
+                                       Surface::isNonNormalizedInteger(state.sourceFormat) &&
+                                       !Surface::isNonNormalizedInteger(state.destFormat);
+
+     if(!intToNormalizedClear)
+     {
+         if(!GetScale(srcRange, state.sourceFormat)) return false;
+     }
+     else if(state.sourceFormat == FORMAT_A32B32G32R32I)
+     {
+         srcRange = replicate(static_cast<float>(0x7FFFFFFF));
+     }
+     else if(state.sourceFormat == FORMAT_A32B32G32R32UI)
+     {
+         srcRange = replicate(static_cast<float>(0xFFFFFFFF));
+     }
+     else
+     {
+         return false;   // Any other integer clear color format is rejected.
+     }
+
+     if(!GetScale(dstRange, state.destFormat)) return false;
+
+     const bool srcSRGB = Surface::isSRGBformat(state.sourceFormat);
+     const bool dstSRGB = Surface::isSRGBformat(state.destFormat);
+     const bool decode = srcSRGB && !preScaled;   // True: sRGB -> linear. False: linear -> sRGB.
+
+     if(state.convertSRGB && (decode || dstSRGB))   // One of the formats is sRGB encoded.
+     {
+         const float4 &divisor = preScaled ? dstRange : srcRange;   // Unapply scale, or apply unscale.
+         value *= Float4(1.0f / divisor.x, 1.0f / divisor.y, 1.0f / divisor.z, 1.0f / divisor.w);
+         value = decode ? sRGBtoLinear(value) : LinearToSRGB(value);
+         value *= Float4(dstRange.x, dstRange.y, dstRange.z, dstRange.w);   // Apply scale
+     }
+     else if(srcRange != dstRange)
+     {
+         value *= Float4(dstRange.x / srcRange.x, dstRange.y / srcRange.y, dstRange.z / srcRange.z, dstRange.w / srcRange.w);
+     }
+
+     if(state.destFormat == FORMAT_X32B32G32R32F_UNSIGNED)
+     {
+         value = Max(value, Float4(0.0f));   // TODO: Only necessary if source is signed.
+     }
+     else if(Surface::isFloatFormat(state.sourceFormat) && !Surface::isFloatFormat(state.destFormat))
+     {
+         const float loX = Surface::isUnsignedComponent(state.destFormat, 0) ? 0.0f : -dstRange.x;
+         const float loY = Surface::isUnsignedComponent(state.destFormat, 1) ? 0.0f : -dstRange.y;
+         const float loZ = Surface::isUnsignedComponent(state.destFormat, 2) ? 0.0f : -dstRange.z;
+         const float loW = Surface::isUnsignedComponent(state.destFormat, 3) ? 0.0f : -dstRange.w;
+         value = Min(value, Float4(dstRange.x, dstRange.y, dstRange.z, dstRange.w));
+         value = Max(value, Float4(loX, loY, loZ, loW));
+     }
+
+     return true;
+ }
+
+ Int Blitter::ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes, bool quadLayout)
+ {
+     // Emits the byte offset of element (x, y) for a surface with the given pitch and element size.
+     if(quadLayout)
+     {
+         // The row pair (y & ~1) selects the pitch multiple; within it the element index is (y & 1) * 2 plus
+         // (x & ~1) * 2 + (x & 1) == (x - (x & 1)) * 2 + (x & 1) == x * 2 - (x & 1) * 2 + (x & 1) == x * 2 - (x & 1)
+         return (y & Int(~1)) * pitchB +
+                ((y & Int(1)) * 2 + x * 2 - (x & Int(1))) * bytes;
+     }
+
+     return y * pitchB + x * bytes;   // Plain row-major layout.
+ }
+
+ Float4 Blitter::LinearToSRGB(Float4 &c)
+ {
+     // Linear segment (input capped at 0.0031308) and power-curve segment; the larger of the two is used.
+     Float4 linearPart = Min(c, Float4(0.0031308f)) * Float4(12.92f);
+     Float4 gammaPart = Float4(1.055f) * power(c, Float4(1.0f / 2.4f)) - Float4(0.055f);
+
+     Float4 encoded = c;   // The w component is passed through unchanged.
+     encoded.xyz = Max(linearPart, gammaPart);
+     return encoded;
+ }
+
+ Float4 Blitter::sRGBtoLinear(Float4 &c)
+ {
+     // Candidate results for the linear segment and for the power-curve segment.
+     Float4 linearPart = c * Float4(1.0f / 12.92f);
+     Float4 gammaPart = power((c + Float4(0.055f)) * Float4(1.0f / 1.055f), Float4(2.4f));
+     // Per-component mask choosing the linear segment where the encoded value is below 0.04045.
+     Int4 useLinear = CmpLT(c, Float4(0.04045f));
+
+     Float4 decoded = c;   // The w component is passed through unchanged.
+     decoded.xyz = As<Float4>((useLinear & As<Int4>(linearPart)) | (~useLinear & As<Int4>(gammaPart)));   // FIXME: IfThenElse()
+     return decoded;
+ }
+
+ Routine *Blitter::generate(const State &state)
+ {
+     // Builds the Reactor routine that performs the blit or clear described by 'state'.
+     // The routine takes a BlitData pointer and, for every destination pixel in
+     // [x0d, x1d) x [y0d, y1d), fetches the source texel (bilinearly filtered when requested),
+     // converts it to the destination format and writes it to each of the destSamples samples.
+     // Returns nullptr when a read, conversion or write is not supported for the formats involved.
+     Function<Void(Pointer<Byte>)> function;
+     {
+         Pointer<Byte> blit(function.Arg<0>());
+
+         // Surface base pointers and row pitches in bytes.
+         Pointer<Byte> source = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData,source));
+         Pointer<Byte> dest = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData,dest));
+         Int sPitchB = *Pointer<Int>(blit + OFFSET(BlitData,sPitchB));
+         Int dPitchB = *Pointer<Int>(blit + OFFSET(BlitData,dPitchB));
+
+         // Source coordinate for destination pixel (0, 0), and the source step per destination pixel.
+         Float x0 = *Pointer<Float>(blit + OFFSET(BlitData,x0));
+         Float y0 = *Pointer<Float>(blit + OFFSET(BlitData,y0));
+         Float w = *Pointer<Float>(blit + OFFSET(BlitData,w));
+         Float h = *Pointer<Float>(blit + OFFSET(BlitData,h));
+
+         // Destination rectangle in pixels; the end coordinates are exclusive.
+         Int x0d = *Pointer<Int>(blit + OFFSET(BlitData,x0d));
+         Int x1d = *Pointer<Int>(blit + OFFSET(BlitData,x1d));
+         Int y0d = *Pointer<Int>(blit + OFFSET(BlitData,y0d));
+         Int y1d = *Pointer<Int>(blit + OFFSET(BlitData,y1d));
+
+         // Source dimensions, used when clamping coordinates to the edge.
+         Int sWidth = *Pointer<Int>(blit + OFFSET(BlitData,sWidth));
+         Int sHeight = *Pointer<Int>(blit + OFFSET(BlitData,sHeight));
+
+         // Format properties are fixed per State, so they select code paths at generation time.
+         bool intSrc = Surface::isNonNormalizedInteger(state.sourceFormat);
+         bool intDst = Surface::isNonNormalizedInteger(state.destFormat);
+         bool intBoth = intSrc && intDst;
+         bool srcQuadLayout = Surface::hasQuadLayout(state.sourceFormat);
+         bool dstQuadLayout = Surface::hasQuadLayout(state.destFormat);
+         int srcBytes = Surface::bytes(state.sourceFormat);
+         int dstBytes = Surface::bytes(state.destFormat);
+
+         // A clear reads its single source texel once, ahead of the pixel loops.
+         bool hasConstantColorI = false;
+         Int4 constantColorI;
+         bool hasConstantColorF = false;
+         Float4 constantColorF;
+         if(state.clearOperation)
+         {
+             if(intBoth) // Integer types
+             {
+                 if(!read(constantColorI, source, state)) return nullptr;
+                 hasConstantColorI = true;
+             }
+             else
+             {
+                 if(!read(constantColorF, source, state)) return nullptr;
+                 hasConstantColorF = true;
+
+                 if(!ApplyScaleAndClamp(constantColorF, state)) return nullptr;
+             }
+         }
+
+         // Walk the destination rectangle row by row, pixel by pixel.
+         For(Int j = y0d, j < y1d, j++)
+         {
+             Float y = state.clearOperation ? RValue<Float>(y0) : y0 + Float(j) * h;
+             Pointer<Byte> destLine = dest + (dstQuadLayout ? j & Int(~1) : RValue<Int>(j)) * dPitchB;
+
+             For(Int i = x0d, i < x1d, i++)
+             {
+                 Float x = state.clearOperation ? RValue<Float>(x0) : x0 + Float(i) * w;
+                 Pointer<Byte> d = destLine + (dstQuadLayout ? (((j & Int(1)) << 1) + (i * 2) - (i & Int(1))) : RValue<Int>(i)) * dstBytes;
+
+                 if(hasConstantColorI)
+                 {
+                     // Integer clears must reach every destination sample, like the float clear below.
+                     for(int s = 0; s < state.destSamples; s++)
+                     {
+                         if(!write(constantColorI, d, state)) return nullptr;
+
+                         d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
+                     }
+                 }
+                 else if(hasConstantColorF)
+                 {
+                     for(int s = 0; s < state.destSamples; s++)
+                     {
+                         if(!write(constantColorF, d, state)) return nullptr;
+
+                         d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
+                     }
+                 }
+                 else if(intBoth) // Integer types do not support filtering
+                 {
+                     Int4 color; // When both formats are true integer types, we don't go to float to avoid losing precision
+                     Int X = Int(x);
+                     Int Y = Int(y);
+
+                     if(state.clampToEdge)
+                     {
+                         X = Clamp(X, 0, sWidth - 1);
+                         Y = Clamp(Y, 0, sHeight - 1);
+                     }
+
+                     Pointer<Byte> s = source + ComputeOffset(X, Y, sPitchB, srcBytes, srcQuadLayout);
+
+                     if(!read(color, s, state)) return nullptr;
+
+                     // Replicate the texel into every destination sample, like the float blit below.
+                     for(int sample = 0; sample < state.destSamples; sample++)
+                     {
+                         if(!write(color, d, state)) return nullptr;
+
+                         d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
+                     }
+                 }
+                 else
+                 {
+                     Float4 color;
+
+                     bool preScaled = false;
+                     if(!state.filter || intSrc)
+                     {
+                         // Unfiltered: read the single texel containing the sample position.
+                         Int X = Int(x);
+                         Int Y = Int(y);
+
+                         if(state.clampToEdge)
+                         {
+                             X = Clamp(X, 0, sWidth - 1);
+                             Y = Clamp(Y, 0, sHeight - 1);
+                         }
+
+                         Pointer<Byte> s = source + ComputeOffset(X, Y, sPitchB, srcBytes, srcQuadLayout);
+
+                         if(!read(color, s, state)) return nullptr;
+                     }
+                     else // Bilinear filtering
+                     {
+                         Float X = x;
+                         Float Y = y;
+
+                         if(state.clampToEdge)
+                         {
+                             X = Min(Max(x, 0.5f), Float(sWidth) - 0.5f);
+                             Y = Min(Max(y, 0.5f), Float(sHeight) - 0.5f);
+                         }
+
+                         // Shift by half a texel so the integer part addresses the top-left tap.
+                         Float u = X - 0.5f;
+                         Float v = Y - 0.5f;
+
+                         Int X0 = Max(Int(u), 0);
+                         Int Y0 = Max(Int(v), 0);
+
+                         Int X1 = X0 + 1;
+                         Int Y1 = Y0 + 1;
+                         X1 = IfThenElse(X1 >= sWidth, X0, X1);
+                         Y1 = IfThenElse(Y1 >= sHeight, Y0, Y1);
+
+                         Pointer<Byte> s00 = source + ComputeOffset(X0, Y0, sPitchB, srcBytes, srcQuadLayout);
+                         Pointer<Byte> s01 = source + ComputeOffset(X1, Y0, sPitchB, srcBytes, srcQuadLayout);
+                         Pointer<Byte> s10 = source + ComputeOffset(X0, Y1, sPitchB, srcBytes, srcQuadLayout);
+                         Pointer<Byte> s11 = source + ComputeOffset(X1, Y1, sPitchB, srcBytes, srcQuadLayout);
+
+                         Float4 c00; if(!read(c00, s00, state)) return nullptr;
+                         Float4 c01; if(!read(c01, s01, state)) return nullptr;
+                         Float4 c10; if(!read(c10, s10, state)) return nullptr;
+                         Float4 c11; if(!read(c11, s11, state)) return nullptr;
+
+                         if(state.convertSRGB && Surface::isSRGBformat(state.sourceFormat)) // sRGB -> RGB
+                         {
+                             // Each tap is converted before interpolation; the final pass is told via preScaled.
+                             if(!ApplyScaleAndClamp(c00, state)) return nullptr;
+                             if(!ApplyScaleAndClamp(c01, state)) return nullptr;
+                             if(!ApplyScaleAndClamp(c10, state)) return nullptr;
+                             if(!ApplyScaleAndClamp(c11, state)) return nullptr;
+                             preScaled = true;
+                         }
+
+                         // Interpolation weights: fractional distance from the top-left tap and its complement.
+                         Float4 fx = Float4(u - Float(X0));
+                         Float4 fy = Float4(v - Float(Y0));
+                         Float4 ix = Float4(1.0f) - fx;
+                         Float4 iy = Float4(1.0f) - fy;
+
+                         color = (c00 * ix + c01 * fx) * iy +
+                                 (c10 * ix + c11 * fx) * fy;
+                     }
+
+                     // Bring the color into the destination format's range (and encoding, if requested).
+                     if(!ApplyScaleAndClamp(color, state, preScaled)) return nullptr;
+
+                     // The same color goes to every sample; samples are dSliceB bytes apart.
+                     for(int s = 0; s < state.destSamples; s++)
+                     {
+                         if(!write(color, d, state)) return nullptr;
+
+                         d += *Pointer<Int>(blit + OFFSET(BlitData,dSliceB));
+                     }
+                 }
+             }
+         }
+     }
+
+     return function(L"BlitRoutine");
+ }
+
+ bool Blitter::blitReactor(Surface *source, const SliceRectF &sourceRect, Surface *dest, const SliceRect &destRect, const Blitter::Options &options)
+ {
+     // Runs the generated routine for this blit's state, creating and caching it on first use; false if none can be generated.
+     ASSERT(!options.clearOperation || ((source->getWidth() == 1) && (source->getHeight() == 1) && (source->getDepth() == 1)));
+
+     // Make the destination rectangle ascending; a flipped axis is carried over to the source rectangle.
+     Rect dRect = destRect;
+     RectF sRect = sourceRect;
+     if(destRect.x0 > destRect.x1)
+     {
+         swap(dRect.x0, dRect.x1);
+         swap(sRect.x0, sRect.x1);
+     }
+     if(destRect.y0 > destRect.y1)
+     {
+         swap(dRect.y0, dRect.y1);
+         swap(sRect.y0, sRect.y1);
+     }
+
+     State state(options);
+     state.clampToEdge = (sourceRect.x0 < 0.0f) || (sourceRect.y0 < 0.0f) ||
+                         (sourceRect.x1 > (float)source->getWidth()) || (sourceRect.y1 > (float)source->getHeight());
+
+     const bool useSourceInternal = !source->isExternalDirty();
+     const bool useDestInternal = !dest->isExternalDirty();
+     const bool isStencil = options.useStencil;
+
+     state.sourceFormat = isStencil ? source->getStencilFormat() : source->getFormat(useSourceInternal);
+     state.destFormat = isStencil ? dest->getStencilFormat() : dest->getFormat(useDestInternal);
+     state.destSamples = dest->getSamples();
+
+     // Look the routine up, or generate and cache it, while holding the lock.
+     criticalSection.lock();
+     Routine *routine = blitCache->query(state);
+     if(!routine)
+     {
+         routine = generate(state);
+         if(routine) blitCache->add(state, routine);
+     }
+     criticalSection.unlock();
+
+     if(!routine) return false;
+
+     void (*blitFunction)(const BlitData *data) = (void(*)(const BlitData*))routine->getEntry();
+     const bool isRGBA = options.writeMask == 0xF;
+     const bool isEntireDest = dest->isEntire(destRect);
+
+     BlitData data;
+     if(isStencil)
+     {
+         data.source = source->lockStencil(0, 0, 0, sw::PUBLIC);
+         data.dest = dest->lockStencil(0, 0, 0, sw::PUBLIC);
+         data.sPitchB = source->getStencilPitchB();
+         data.dPitchB = dest->getStencilPitchB();
+         data.dSliceB = dest->getStencilSliceB();
+     }
+     else
+     {
+         data.source = source->lock(0, 0, sourceRect.slice, sw::LOCK_READONLY, sw::PUBLIC, useSourceInternal);
+         data.dest = dest->lock(0, 0, destRect.slice, isRGBA ? (isEntireDest ? sw::LOCK_DISCARD : sw::LOCK_WRITEONLY) : sw::LOCK_READWRITE, sw::PUBLIC, useDestInternal);
+         data.sPitchB = source->getPitchB(useSourceInternal);
+         data.dPitchB = dest->getPitchB(useDestInternal);
+         data.dSliceB = dest->getSliceB(useDestInternal);
+     }
+
+     data.w = sRect.width() / dRect.width();
+     data.h = sRect.height() / dRect.height();
+     data.x0 = sRect.x0 + (0.5f - dRect.x0) * data.w;
+     data.y0 = sRect.y0 + (0.5f - dRect.y0) * data.h;
+
+     data.x0d = dRect.x0;
+     data.x1d = dRect.x1;
+     data.y0d = dRect.y0;
+     data.y1d = dRect.y1;
+     data.sWidth = source->getWidth();
+     data.sHeight = source->getHeight();
+
+     blitFunction(&data);
+
+     if(isStencil)
+     {
+         source->unlockStencil();
+         dest->unlockStencil();
+     }
+     else
+     {
+         source->unlock(useSourceInternal);
+         dest->unlock(useDestInternal);
+     }
+
+     return true;
+ }
+}
diff --git a/src/Device/Blitter.hpp b/src/Device/Blitter.hpp
new file mode 100644
index 0000000..e3db745
--- /dev/null
+++ b/src/Device/Blitter.hpp
@@ -0,0 +1,121 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Blitter_hpp
+#define sw_Blitter_hpp
+
+#include "Surface.hpp"
+#include "RoutineCache.hpp"
+#include "Reactor/Reactor.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+ class Blitter   // Blits and clears surfaces using Reactor routines generated per State and kept in blitCache.
+ {
+     struct Options
+     {
+         Options() = default;
+         Options(bool filter, bool useStencil, bool convertSRGB)   // Options for a blit; all channels are written.
+             : writeMask(0xF), clearOperation(false), filter(filter), useStencil(useStencil), convertSRGB(convertSRGB), clampToEdge(false) {}
+         Options(unsigned int writeMask)   // Options for a clear limited to the channels in writeMask.
+             : writeMask(writeMask), clearOperation(true), filter(false), useStencil(false), convertSRGB(true), clampToEdge(false) {}
+
+         union   // Channel write enables, addressable per channel or as one mask byte.
+         {
+             struct
+             {
+                 bool writeRed : 1;
+                 bool writeGreen : 1;
+                 bool writeBlue : 1;
+                 bool writeAlpha : 1;
+             };
+             unsigned char writeMask;
+         };
+
+         bool clearOperation : 1;
+         bool filter : 1;
+         bool useStencil : 1;
+         bool convertSRGB : 1;
+         bool clampToEdge : 1;
+     };
+
+     struct State : Options
+     {
+         // operator== uses memcmp(), so padding and unused bits are zeroed before any field is set.
+         State() { memset(static_cast<void*>(this), 0, sizeof(State)); }
+         State(const Options &options) : State()
+         {
+             writeMask = options.writeMask; clearOperation = options.clearOperation; filter = options.filter;
+             useStencil = options.useStencil; convertSRGB = options.convertSRGB; clampToEdge = options.clampToEdge;
+         }
+         bool operator==(const State &state) const { return memcmp(this, &state, sizeof(State)) == 0; }
+
+         Format sourceFormat;
+         Format destFormat;
+         int destSamples;
+     };
+
+     struct BlitData   // Arguments of a generated blit routine; generate() reads the fields through OFFSET().
+     {
+         void *source;
+         void *dest;
+         int sPitchB;   // Source row pitch in bytes.
+         int dPitchB;   // Destination row pitch in bytes.
+         int dSliceB;   // Byte distance between consecutive destination samples.
+
+         float x0;   // Source x for destination pixel 0.
+         float y0;   // Source y for destination pixel 0.
+         float w;    // Source step per destination pixel in x.
+         float h;    // Source step per destination pixel in y.
+
+         int y0d;   // Destination rectangle; the routine loops while j < y1d and i < x1d.
+         int y1d;
+         int x0d;
+         int x1d;
+
+         int sWidth;
+         int sHeight;
+     };
+
+ public:
+     Blitter();
+     virtual ~Blitter();
+
+     void clear(void *pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask);
+     void blit(Surface *source, const SliceRectF &sRect, Surface *dest, const SliceRect &dRect, const Options &options);
+     void blit3D(Surface *source, Surface *dest);
+
+ private:
+     bool fastClear(void *pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask);
+
+     bool read(Float4 &color, Pointer<Byte> element, const State &state);
+     bool write(Float4 &color, Pointer<Byte> element, const State &state);
+     bool read(Int4 &color, Pointer<Byte> element, const State &state);
+     bool write(Int4 &color, Pointer<Byte> element, const State &state);
+     static bool GetScale(float4& scale, Format format);
+     static bool ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled = false);
+     static Int ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes, bool quadLayout);
+     static Float4 LinearToSRGB(Float4 &color);
+     static Float4 sRGBtoLinear(Float4 &color);
+     bool blitReactor(Surface *source, const SliceRectF &sRect, Surface *dest, const SliceRect &dRect, const Options &options);
+     Routine *generate(const State &state);
+
+     RoutineCache<State> *blitCache;   // Generated routines, keyed by State.
+     MutexLock criticalSection;        // Held by blitReactor() around cache lookup and insertion.
+ };
+}
+
+#endif // sw_Blitter_hpp
diff --git a/src/Device/Clipper.cpp b/src/Device/Clipper.cpp
new file mode 100644
index 0000000..a100f05
--- /dev/null
+++ b/src/Device/Clipper.cpp
@@ -0,0 +1,359 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Clipper.hpp"
+
+#include "Polygon.hpp"
+#include "Renderer.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+ // Selects the near plane factor n: -1 for symmetric normalized depth, 0 otherwise.
+ Clipper::Clipper(bool symmetricNormalizedDepth)
+     : n(symmetricNormalizedDepth ? -1.0f : 0.0f)
+ {}
+
+ // The destructor has nothing to do.
+ Clipper::~Clipper()
+ {}
+
+ unsigned int Clipper::computeClipFlags(const float4 &v)
+ {
+     // A plane's flag is set when the coordinate lies beyond +w, below -w, or (near plane) below n * w.
+     const unsigned int beyondPositive = (v.x > v.w ? CLIP_RIGHT : 0) | (v.y > v.w ? CLIP_TOP : 0) | (v.z > v.w ? CLIP_FAR : 0);
+     const unsigned int beyondNegative = (v.x < -v.w ? CLIP_LEFT : 0) | (v.y < -v.w ? CLIP_BOTTOM : 0) | (v.z < n * v.w ? CLIP_NEAR : 0);
+
+     // CLIP_FINITE is currently reported unconditionally.
+     const unsigned int finite = Clipper::CLIP_FINITE;   // FIXME: xyz finite
+     return beyondPositive | beyondNegative | finite;
+ }
+
+ bool Clipper::clip(Polygon &polygon, int clipFlagsOr, const DrawCall &draw)
+ {
+     // Clips 'polygon' against every frustum plane flagged in clipFlagsOr and, when a user plane bit
+     // is set there, against the user planes enabled in draw.clipFlags. Clipping stops once fewer
+     // than three vertices remain. Returns true if at least three vertices are left.
+     if(clipFlagsOr & CLIP_FRUSTUM)
+     {
+         if(clipFlagsOr & CLIP_NEAR) clipNear(polygon);
+         if(polygon.n < 3) return false;
+         if(clipFlagsOr & CLIP_FAR) clipFar(polygon);
+         if(polygon.n < 3) return false;
+         if(clipFlagsOr & CLIP_LEFT) clipLeft(polygon);
+         if(polygon.n < 3) return false;
+         if(clipFlagsOr & CLIP_RIGHT) clipRight(polygon);
+         if(polygon.n < 3) return false;
+         if(clipFlagsOr & CLIP_TOP) clipTop(polygon);
+         if(polygon.n < 3) return false;
+         if(clipFlagsOr & CLIP_BOTTOM) clipBottom(polygon);
+     }
+
+     if(clipFlagsOr & CLIP_USER)
+     {
+         int clipFlags = draw.clipFlags;
+         DrawData &data = *draw.data;
+
+         // CLIP_PLANE0..CLIP_PLANE5 occupy consecutive bits, so plane k is tested with CLIP_PLANE0 << k.
+         for(int plane = 0; plane < 6 && polygon.n >= 3; plane++)
+         {
+             if(clipFlags & (CLIP_PLANE0 << plane))
+             {
+                 clipPlane(polygon, data.clipPlane[plane]);
+             }
+         }
+     }
+
+     return polygon.n >= 3;
+ }
+
+ void Clipper::clipNear(Polygon &polygon)
+ {
+     // Keeps the part of the polygon with z >= n * w; the result goes into the next vertex list.
+     const float4 **in = polygon.P[polygon.i];
+     const float4 **out = polygon.P[polygon.i + 1];
+     int count = 0;
+
+     for(int i = 0; i < polygon.n; i++)
+     {
+         const int next = (i == polygon.n - 1) ? 0 : i + 1;   // The last vertex connects back to the first.
+
+         // Signed distances of both edge endpoints to the plane; negative means outside.
+         const float di = in[i]->z - n * in[i]->w;
+         const float dj = in[next]->z - n * in[next]->w;
+
+         if(di >= 0)
+         {
+             out[count++] = in[i];
+
+             if(dj < 0)   // The edge leaves the kept half-space.
+             {
+                 float4 &cut = polygon.B[polygon.b++];
+                 clipEdge(cut, *in[i], *in[next], di, dj);
+                 out[count++] = &cut;
+             }
+         }
+         else if(dj > 0)   // The edge enters the kept half-space.
+         {
+             float4 &cut = polygon.B[polygon.b++];
+             clipEdge(cut, *in[next], *in[i], dj, di);
+             out[count++] = &cut;
+         }
+     }
+
+     polygon.n = count;
+     polygon.i += 1;
+ }
+
+ void Clipper::clipFar(Polygon &polygon)
+ {
+     // Keeps the part of the polygon with z <= w; the result goes into the next vertex list.
+     const float4 **in = polygon.P[polygon.i];
+     const float4 **out = polygon.P[polygon.i + 1];
+     int count = 0;
+
+     for(int i = 0; i < polygon.n; i++)
+     {
+         const int next = (i == polygon.n - 1) ? 0 : i + 1;   // The last vertex connects back to the first.
+
+         // Signed distances of both edge endpoints to the plane; negative means outside.
+         const float di = in[i]->w - in[i]->z;
+         const float dj = in[next]->w - in[next]->z;
+
+         if(di >= 0)
+         {
+             out[count++] = in[i];
+
+             if(dj < 0)   // The edge leaves the kept half-space.
+             {
+                 float4 &cut = polygon.B[polygon.b++];
+                 clipEdge(cut, *in[i], *in[next], di, dj);
+                 out[count++] = &cut;
+             }
+         }
+         else if(dj > 0)   // The edge enters the kept half-space.
+         {
+             float4 &cut = polygon.B[polygon.b++];
+             clipEdge(cut, *in[next], *in[i], dj, di);
+             out[count++] = &cut;
+         }
+     }
+
+     polygon.n = count;
+     polygon.i += 1;
+ }
+
+ void Clipper::clipLeft(Polygon &polygon)
+ {
+     // Keeps the part of the polygon with x >= -w; the result goes into the next vertex list.
+     const float4 **in = polygon.P[polygon.i];
+     const float4 **out = polygon.P[polygon.i + 1];
+     int count = 0;
+
+     for(int i = 0; i < polygon.n; i++)
+     {
+         const int next = (i == polygon.n - 1) ? 0 : i + 1;   // The last vertex connects back to the first.
+
+         // Signed distances of both edge endpoints to the plane; negative means outside.
+         const float di = in[i]->w + in[i]->x;
+         const float dj = in[next]->w + in[next]->x;
+
+         if(di >= 0)
+         {
+             out[count++] = in[i];
+
+             if(dj < 0)   // The edge leaves the kept half-space.
+             {
+                 float4 &cut = polygon.B[polygon.b++];
+                 clipEdge(cut, *in[i], *in[next], di, dj);
+                 out[count++] = &cut;
+             }
+         }
+         else if(dj > 0)   // The edge enters the kept half-space.
+         {
+             float4 &cut = polygon.B[polygon.b++];
+             clipEdge(cut, *in[next], *in[i], dj, di);
+             out[count++] = &cut;
+         }
+     }
+
+     polygon.n = count;
+     polygon.i += 1;
+ }
+
+ void Clipper::clipRight(Polygon &polygon)
+ {
+     // Keeps the part of the polygon with x <= w; the result goes into the next vertex list.
+     const float4 **in = polygon.P[polygon.i];
+     const float4 **out = polygon.P[polygon.i + 1];
+     int count = 0;
+
+     for(int i = 0; i < polygon.n; i++)
+     {
+         const int next = (i == polygon.n - 1) ? 0 : i + 1;   // The last vertex connects back to the first.
+
+         // Signed distances of both edge endpoints to the plane; negative means outside.
+         const float di = in[i]->w - in[i]->x;
+         const float dj = in[next]->w - in[next]->x;
+
+         if(di >= 0)
+         {
+             out[count++] = in[i];
+
+             if(dj < 0)   // The edge leaves the kept half-space.
+             {
+                 float4 &cut = polygon.B[polygon.b++];
+                 clipEdge(cut, *in[i], *in[next], di, dj);
+                 out[count++] = &cut;
+             }
+         }
+         else if(dj > 0)   // The edge enters the kept half-space.
+         {
+             float4 &cut = polygon.B[polygon.b++];
+             clipEdge(cut, *in[next], *in[i], dj, di);
+             out[count++] = &cut;
+         }
+     }
+
+     polygon.n = count;
+     polygon.i += 1;
+ }
+
+ void Clipper::clipTop(Polygon &polygon)
+ {
+     // Keeps the part of the polygon with y <= w; the result goes into the next vertex list.
+     const float4 **in = polygon.P[polygon.i];
+     const float4 **out = polygon.P[polygon.i + 1];
+     int count = 0;
+
+     for(int i = 0; i < polygon.n; i++)
+     {
+         const int next = (i == polygon.n - 1) ? 0 : i + 1;   // The last vertex connects back to the first.
+
+         // Signed distances of both edge endpoints to the plane; negative means outside.
+         const float di = in[i]->w - in[i]->y;
+         const float dj = in[next]->w - in[next]->y;
+
+         if(di >= 0)
+         {
+             out[count++] = in[i];
+
+             if(dj < 0)   // The edge leaves the kept half-space.
+             {
+                 float4 &cut = polygon.B[polygon.b++];
+                 clipEdge(cut, *in[i], *in[next], di, dj);
+                 out[count++] = &cut;
+             }
+         }
+         else if(dj > 0)   // The edge enters the kept half-space.
+         {
+             float4 &cut = polygon.B[polygon.b++];
+             clipEdge(cut, *in[next], *in[i], dj, di);
+             out[count++] = &cut;
+         }
+     }
+
+     polygon.n = count;
+     polygon.i += 1;
+ }
+
+ void Clipper::clipBottom(Polygon &polygon)
+ {
+     // Keeps the part of the polygon with y >= -w; the result goes into the next vertex list.
+     const float4 **in = polygon.P[polygon.i];
+     const float4 **out = polygon.P[polygon.i + 1];
+     int count = 0;
+
+     for(int i = 0; i < polygon.n; i++)
+     {
+         const int next = (i == polygon.n - 1) ? 0 : i + 1;   // The last vertex connects back to the first.
+
+         // Signed distances of both edge endpoints to the plane; negative means outside.
+         const float di = in[i]->w + in[i]->y;
+         const float dj = in[next]->w + in[next]->y;
+
+         if(di >= 0)
+         {
+             out[count++] = in[i];
+
+             if(dj < 0)   // The edge leaves the kept half-space.
+             {
+                 float4 &cut = polygon.B[polygon.b++];
+                 clipEdge(cut, *in[i], *in[next], di, dj);
+                 out[count++] = &cut;
+             }
+         }
+         else if(dj > 0)   // The edge enters the kept half-space.
+         {
+             float4 &cut = polygon.B[polygon.b++];
+             clipEdge(cut, *in[next], *in[i], dj, di);
+             out[count++] = &cut;
+         }
+     }
+
+     polygon.n = count;
+     polygon.i += 1;
+ }
+
+ void Clipper::clipPlane(Polygon &polygon, const Plane &p)
+ {
+     // Keeps the part of the polygon with A*x + B*y + C*z + D*w >= 0; the result goes into the next vertex list.
+     const float4 **in = polygon.P[polygon.i];
+     const float4 **out = polygon.P[polygon.i + 1];
+     int count = 0;
+
+     for(int i = 0; i < polygon.n; i++)
+     {
+         const int next = (i == polygon.n - 1) ? 0 : i + 1;   // The last vertex connects back to the first.
+
+         // Signed distances of both edge endpoints to the plane; negative means outside.
+         const float di = p.A * in[i]->x + p.B * in[i]->y + p.C * in[i]->z + p.D * in[i]->w;
+         const float dj = p.A * in[next]->x + p.B * in[next]->y + p.C * in[next]->z + p.D * in[next]->w;
+
+         if(di >= 0)
+         {
+             out[count++] = in[i];
+
+             if(dj < 0)   // The edge leaves the kept half-space.
+             {
+                 float4 &cut = polygon.B[polygon.b++];
+                 clipEdge(cut, *in[i], *in[next], di, dj);
+                 out[count++] = &cut;
+             }
+         }
+         else if(dj > 0)   // The edge enters the kept half-space.
+         {
+             float4 &cut = polygon.B[polygon.b++];
+             clipEdge(cut, *in[next], *in[i], dj, di);
+             out[count++] = &cut;
+         }
+     }
+
+     polygon.n = count;
+     polygon.i += 1;
+ }
+
+ inline void Clipper::clipEdge(float4 &Vo, const float4 &Vi, const float4 &Vj, float di, float dj) const
+ {
+     // Vo = (dj * Vi - di * Vj) / (dj - di): the point on edge Vi-Vj whose interpolated distance is zero.
+     const float invSpan = 1.0f / (dj - di);
+     Vo.x = (dj * Vi.x - di * Vj.x) * invSpan;
+     Vo.y = (dj * Vi.y - di * Vj.y) * invSpan;
+     Vo.z = (dj * Vi.z - di * Vj.z) * invSpan;
+     Vo.w = (dj * Vi.w - di * Vj.w) * invSpan;
+ }
+}
diff --git a/src/Device/Clipper.hpp b/src/Device/Clipper.hpp
new file mode 100644
index 0000000..057153a
--- /dev/null
+++ b/src/Device/Clipper.hpp
@@ -0,0 +1,77 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Clipper_hpp
+#define sw_Clipper_hpp
+
+#include "Plane.hpp"
+#include "Common/Types.hpp"
+
+namespace sw
+{
+ struct Polygon;
+ struct DrawCall;
+ struct DrawData;
+
+ class Clipper   // Clips polygons against the view frustum planes and up to six user-defined planes.
+ {
+ public:
+ enum ClipFlags
+ {
+ // Indicates the vertex is outside the respective frustum plane
+ CLIP_RIGHT = 1 << 0,
+ CLIP_TOP = 1 << 1,
+ CLIP_FAR = 1 << 2,
+ CLIP_LEFT = 1 << 3,
+ CLIP_BOTTOM = 1 << 4,
+ CLIP_NEAR = 1 << 5,
+ // Mask covering the six frustum plane bits above.
+ CLIP_FRUSTUM = 0x003F,
+
+ CLIP_FINITE = 1 << 7, // All position coordinates are finite
+
+ // User-defined clipping planes
+ CLIP_PLANE0 = 1 << 8,
+ CLIP_PLANE1 = 1 << 9,
+ CLIP_PLANE2 = 1 << 10,
+ CLIP_PLANE3 = 1 << 11,
+ CLIP_PLANE4 = 1 << 12,
+ CLIP_PLANE5 = 1 << 13,
+ // Mask covering the six user plane bits above.
+ CLIP_USER = 0x3F00
+ };
+ // symmetricNormalizedDepth selects the near plane test: z >= -w when true, z >= 0 when false.
+ Clipper(bool symmetricNormalizedDepth);
+
+ ~Clipper();
+ // computeClipFlags() returns the frustum flags of one vertex; clip() returns true while >= 3 vertices remain.
+ unsigned int computeClipFlags(const float4 &v);
+ bool clip(Polygon &polygon, int clipFlagsOr, const DrawCall &draw);
+
+ private:
+ void clipNear(Polygon &polygon);
+ void clipFar(Polygon &polygon);
+ void clipLeft(Polygon &polygon);
+ void clipRight(Polygon &polygon);
+ void clipTop(Polygon &polygon);
+ void clipBottom(Polygon &polygon);
+ void clipPlane(Polygon &polygon, const Plane &plane);
+ // Writes to Vo the point on edge Vi-Vj computed from the endpoint distances di and dj.
+ void clipEdge(float4 &Vo, const float4 &Vi, const float4 &Vj, float di, float dj) const;
+
+ float n; // Near clip plane factor: the near test is z >= n * w (n is -1 or 0)
+ };
+}
+
+#endif // sw_Clipper_hpp
diff --git a/src/Device/Color.cpp b/src/Device/Color.cpp
new file mode 100644
index 0000000..9ad6767
--- /dev/null
+++ b/src/Device/Color.cpp
@@ -0,0 +1,19 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Color.hpp"
+
+namespace sw
+{
+}
diff --git a/src/Device/Color.hpp b/src/Device/Color.hpp
new file mode 100644
index 0000000..7afe61f
--- /dev/null
+++ b/src/Device/Color.hpp
@@ -0,0 +1,472 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Color_hpp
+#define sw_Color_hpp
+
+#include "Common/Types.hpp"
+#include "Common/Math.hpp"
+
+namespace sw
+{
+ template<class T>   // T is the channel type; this header specializes members for byte, short and float.
+ struct Color
+ {
+ Color();   // Leaves the channels uninitialized.
+ // Conversions between channel types (short channels use a 4.12 fixed-point format).
+ Color(const Color<byte> &c);
+ Color(const Color<short> &c);
+ Color(const Color<float> &c);
+ // From a packed pixel: unsigned short is read as R5G6B5, the wider integers as A8R8G8B8.
+ Color(int c);
+ Color(unsigned short c);
+ Color(unsigned long c);
+ Color(unsigned int c);
+
+ Color(T r, T g, T b, T a = 1);
+ // Packs the color into a 32-bit value laid out as 0xAARRGGBB.
+ operator unsigned int() const;
+ // Channel access by index, in member order: 0 = r, 1 = g, 2 = b, 3 = a.
+ T &operator[](int i);
+ const T &operator[](int i) const;
+
+ Color<T> operator+() const;
+ Color<T> operator-() const;
+
+ Color<T>& operator=(const Color<T>& c);
+
+ Color<T> &operator+=(const Color<T> &c);
+ Color<T> &operator*=(float l);
+ // gradient() returns (c2 - c1) / d per channel; shade() returns c1 + d * (c2 - c1) per channel.
+ static Color<T> gradient(const Color<T> &c1, const Color<T> &c2, float d);
+ static Color<T> shade(const Color<T> &c1, const Color<T> &c2, float d);
+ // Channel-wise arithmetic, defined as free function templates further down in this header.
+ template<class S>
+ friend Color<S> operator+(const Color<S> &c1, const Color<S> &c2);
+ template<class S>
+ friend Color<S> operator-(const Color<S> &c1, const Color<S> &c2);
+
+ template<class S>
+ friend Color<S> operator*(float l, const Color<S> &c);
+ template<class S>
+ friend Color<S> operator*(const Color<S> &c1, const Color<S> &c2);
+ template<class S>
+ friend Color<S> operator/(const Color<S> &c, float l);
+ // Channels; operator[] relies on this declaration order.
+ T r;
+ T g;
+ T b;
+ T a;
+ };
+}
+
+#include "Common/Math.hpp"
+
+namespace sw
+{
+ // Default constructor: the four channels are left uninitialized.
+ template<class T>
+ inline Color<T>::Color()
+ {}
+
+ // Copy constructor for 8-bit colors; every channel is copied as-is.
+ template<>
+ inline Color<byte>::Color(const Color<byte> &c)
+     : r(c.r),
+       g(c.g),
+       b(c.b),
+       a(c.a)
+ {}
+
+ // Narrows a short color to 8 bits: each channel is shifted right by 4 and clamped to [0, 255].
+ template<>
+ inline Color<byte>::Color(const Color<short> &c)
+     : r(clamp(c.r >> 4, 0, 255)),
+       g(clamp(c.g >> 4, 0, 255)),
+       b(clamp(c.b >> 4, 0, 255)),
+       a(clamp(c.a >> 4, 0, 255))
+ {}
+
+ // Quantizes float channels to 8 bits: scale by 256, clamp to [0, 255], then ifloor().
+ template<>
+ inline Color<byte>::Color(const Color<float> &c)
+     : r(ifloor(clamp(c.r * 256.0f, 0.0f, 255.0f))),
+       g(ifloor(clamp(c.g * 256.0f, 0.0f, 255.0f))),
+       b(ifloor(clamp(c.b * 256.0f, 0.0f, 255.0f))),
+       a(ifloor(clamp(c.a * 256.0f, 0.0f, 255.0f)))
+ {}
+
+ // Copy constructor for short colors; every channel is copied as-is.
+ template<>
+ inline Color<short>::Color(const Color<short> &c)
+     : r(c.r),
+       g(c.g),
+       b(c.b),
+       a(c.a)
+ {}
+
+ // Widens an 8-bit color to short by shifting each channel left by 4 bits.
+ template<>
+ inline Color<short>::Color(const Color<byte> &c)
+     : r(c.r << 4),
+       g(c.g << 4),
+       b(c.b << 4),
+       a(c.a << 4)
+ {}
+
+ // Copy constructor for float colors; every channel is copied as-is.
+ template<>
+ inline Color<float>::Color(const Color<float> &c)
+     : r(c.r),
+       g(c.g),
+       b(c.b),
+       a(c.a)
+ {}
+
+ // Converts float channels to short: scale by 4095, clamp to [-4096, 4095], then iround().
+ template<>
+ inline Color<short>::Color(const Color<float> &c)
+     : r(iround(clamp(c.r * 4095.0f, -4096.0f, 4095.0f))),
+       g(iround(clamp(c.g * 4095.0f, -4096.0f, 4095.0f))),
+       b(iround(clamp(c.b * 4095.0f, -4096.0f, 4095.0f))),
+       a(iround(clamp(c.a * 4095.0f, -4096.0f, 4095.0f)))
+ {}
+
+ // Converts 8-bit channels to float by dividing by 255, so 255 maps to 1.0.
+ template<>
+ inline Color<float>::Color(const Color<byte> &c)
+     : r(c.r / 255.0f),
+       g(c.g / 255.0f),
+       b(c.b / 255.0f),
+       a(c.a / 255.0f)
+ {}
+
+ // Converts short channels to float by dividing by 4095, so 4095 maps to 1.0.
+ template<>
+ inline Color<float>::Color(const Color<short> &c)
+     : r(c.r / 4095.0f),
+       g(c.g / 4095.0f),
+       b(c.b / 4095.0f),
+       a(c.a / 4095.0f)
+ {}
+
+ // Unpacks a 16-bit R5G6B5 value; each field is divided by its own mask, so a full field maps to 1. Alpha is 1.
+ template<>
+ inline Color<float>::Color(unsigned short c)
+     : r((float)(c & 0xF800) / (float)0xF800),
+       g((float)(c & 0x07E0) / (float)0x07E0),
+       b((float)(c & 0x001F) / (float)0x001F),
+       a(1)
+ {}
+
+ // Expands a 16-bit R5G6B5 value to the 4.12 fixed-point format: each field is moved to
+ // bits 11 and down and its top bits are repeated below it. Alpha is 0x1000.
+ template<>
+ inline Color<short>::Color(unsigned short c)
+     : r(((c & 0xF800) >> 4) + ((c & 0xF800) >> 9) + ((c & 0xF800) >> 14)),
+       g(((c & 0x07E0) << 1) + ((c & 0x07E0) >> 5)),
+       b(((c & 0x001F) << 7) + ((c & 0x001F) << 2) + ((c & 0x001F) >> 3)),
+       a(0x1000)
+ {}
+
+ // Expands a 16-bit R5G6B5 value to 8 bits per channel, repeating each field's top bits in the low bits. Alpha is 0xFF.
+ template<>
+ inline Color<byte>::Color(unsigned short c)
+     : r((byte)(((c & 0xF800) >> 8) + ((c & 0xE000) >> 13))),
+       g((byte)(((c & 0x07E0) >> 3) + ((c & 0x0600) >> 9))),
+       b((byte)(((c & 0x001F) << 3) + ((c & 0x001C) >> 2))),
+       a(0xFF)
+ {}
+
+ // Unpacks an int laid out as 0xAARRGGBB into float channels scaled by 1/255.
+ template<>
+ inline Color<float>::Color(int c)
+ {
+     const float inv255 = 1.0f / 255.0f;
+     r = inv255 * (float)((c & 0x00FF0000) >> 16);
+     g = inv255 * (float)((c & 0x0000FF00) >> 8);
+     b = inv255 * (float)((c & 0x000000FF) >> 0);
+     a = inv255 * (float)((c & 0xFF000000) >> 24);
+ }
+
+ // Unpacks an int laid out as 0xAARRGGBB into the 4.12 fixed-point format:
+ // each 8-bit field ends up shifted left by 4 bits.
+ template<>
+ inline Color<short>::Color(int c)
+     : r((short)((c & 0x00FF0000) >> 12)),
+       g((short)((c & 0x0000FF00) >> 4)),
+       b((short)((c & 0x000000FF) << 4)),
+       a((short)((c & 0xFF000000) >> 20))
+ {}
+
+ // Splits an int laid out as 0xAARRGGBB into its four 8-bit channels.
+ template<>
+ inline Color<byte>::Color(int c)
+     : r((byte)((c & 0x00FF0000) >> 16)),
+       g((byte)((c & 0x0000FF00) >> 8)),
+       b((byte)((c & 0x000000FF) >> 0)),
+       a((byte)((c & 0xFF000000) >> 24))
+ {}
+
+ // Unpacks an unsigned int laid out as 0xAARRGGBB into float channels scaled by 1/255.
+ template<>
+ inline Color<float>::Color(unsigned int c)
+ {
+     const float inv255 = 1.0f / 255.0f;
+     r = inv255 * (float)((c & 0x00FF0000) >> 16);
+     g = inv255 * (float)((c & 0x0000FF00) >> 8);
+     b = inv255 * (float)((c & 0x000000FF) >> 0);
+     a = inv255 * (float)((c & 0xFF000000) >> 24);
+ }
+
+ // Unpacks an unsigned int laid out as 0xAARRGGBB into the 4.12 fixed-point format:
+ // each 8-bit field ends up shifted left by 4 bits.
+ template<>
+ inline Color<short>::Color(unsigned int c)
+     : r((short)((c & 0x00FF0000) >> 12)),
+       g((short)((c & 0x0000FF00) >> 4)),
+       b((short)((c & 0x000000FF) << 4)),
+       a((short)((c & 0xFF000000) >> 20))
+ {}
+
+ // Splits an unsigned int laid out as 0xAARRGGBB into its four 8-bit channels.
+ template<>
+ inline Color<byte>::Color(unsigned int c)
+     : r((byte)((c & 0x00FF0000) >> 16)),
+       g((byte)((c & 0x0000FF00) >> 8)),
+       b((byte)((c & 0x000000FF) >> 0)),
+       a((byte)((c & 0xFF000000) >> 24))
+ {}
+
+ // Unpacks the low 32 bits of an unsigned long, laid out as 0xAARRGGBB, into float channels scaled by 1/255.
+ template<>
+ inline Color<float>::Color(unsigned long c)
+ {
+     const float inv255 = 1.0f / 255.0f;
+     r = inv255 * (float)((c & 0x00FF0000) >> 16);
+     g = inv255 * (float)((c & 0x0000FF00) >> 8);
+     b = inv255 * (float)((c & 0x000000FF) >> 0);
+     a = inv255 * (float)((c & 0xFF000000) >> 24);
+ }
+
+ // Unpacks the low 32 bits of an unsigned long, laid out as 0xAARRGGBB, into the
+ // 4.12 fixed-point format: each 8-bit field ends up shifted left by 4 bits.
+ template<>
+ inline Color<short>::Color(unsigned long c)
+     : r((short)((c & 0x00FF0000) >> 12)),
+       g((short)((c & 0x0000FF00) >> 4)),
+       b((short)((c & 0x000000FF) << 4)),
+       a((short)((c & 0xFF000000) >> 20))
+ {}
+
+ // Splits the low 32 bits of an unsigned long, laid out as 0xAARRGGBB, into four 8-bit channels.
+ template<>
+ inline Color<byte>::Color(unsigned long c)
+     : r((byte)((c & 0x00FF0000) >> 16)),
+       g((byte)((c & 0x0000FF00) >> 8)),
+       b((byte)((c & 0x000000FF) >> 0)),
+       a((byte)((c & 0xFF000000) >> 24))
+ {}
+
+ // Builds a color directly from its four channel values.
+ template<class T>
+ inline Color<T>::Color(T r_, T g_, T b_, T a_)
+     : r(r_),
+       g(g_),
+       b(b_),
+       a(a_)
+ {}
+
+ template<>
+ inline Color<float>::operator unsigned int() const
+ {
+ return ((unsigned int)min(b * 255.0f, 255.0f) << 0) |
+ ((unsigned int)min(g * 255.0f, 255.0f) << 8) |
+ ((unsigned int)min(r * 255.0f, 255.0f) << 16) |
+ ((unsigned int)min(a * 255.0f, 255.0f) << 24);
+ }
+
+ // Packs as 0xAARRGGBB. Channels are clamped to [0, 255] so a negative channel cannot spill into the other fields.
+ template<> inline Color<short>::operator unsigned int() const
+ {
+     return ((unsigned int)clamp(b >> 4, 0, 255) << 0) |
+            ((unsigned int)clamp(g >> 4, 0, 255) << 8) |
+            ((unsigned int)clamp(r >> 4, 0, 255) << 16) |
+            ((unsigned int)clamp(a >> 4, 0, 255) << 24);
+ }
+
+ // Packs the four 8-bit channels as 0xAARRGGBB; the fields do not overlap.
+ template<> inline Color<byte>::operator unsigned int() const
+ {
+     return (a << 24) +
+            (r << 16) +
+            (g << 8) +
+            (b << 0);
+ }
+
+ // Mutable channel access by index, counted from r (0 = r, 1 = g, 2 = b, 3 = a).
+ // Relies on the channels being declared consecutively in that order.
+ template<class T>
+ inline T &Color<T>::operator[](int i)
+ { return *(&r + i); }
+
+ // Read-only channel access by index, counted from r (0 = r, 1 = g, 2 = b, 3 = a).
+ // Relies on the channels being declared consecutively in that order.
+ template<class T>
+ inline const T &Color<T>::operator[](int i) const
+ { return *(&r + i); }
+
+ // Unary plus: returns a copy of this color.
+ template<class T> inline Color<T> Color<T>::operator+() const
+ {
+     return Color<T>(*this);
+ }
+
+ // Unary minus: returns a color with every channel, alpha included, negated.
+ template<class T> inline Color<T> Color<T>::operator-() const
+ {
+     return Color<T>(-r, -g, -b, -a);
+ }
+
+ // Channel-wise assignment; returns *this so assignments can be chained.
+ template<class T>
+ inline Color<T> &Color<T>::operator=(const Color& c)
+ {
+     r = c.r;   // Red
+     g = c.g;   // Green
+     b = c.b;   // Blue
+     a = c.a;   // Alpha
+     return *this;
+ }
+
+ // Adds c to this color channel by channel; returns *this.
+ template<class T>
+ inline Color<T> &Color<T>::operator+=(const Color &c)
+ {
+     r += c.r;   // Red
+     g += c.g;   // Green
+     b += c.b;   // Blue
+     a += c.a;   // Alpha
+     return *this;
+ }
+
+ // Scales every channel by l, using the free operator*(float, Color),
+ // and returns *this.
+ template<class T>
+ inline Color<T> &Color<T>::operator*=(float l)
+ {
+     return *this = l * *this;
+ }
+
+ // Channel-wise sum of two colors.
+ template<class T>
+ inline Color<T> operator+(const Color<T> &c1, const Color<T> &c2)
+ {
+     Color<T> sum(c1.r + c2.r, c1.g + c2.g, c1.b + c2.b, c1.a + c2.a);
+
+     return sum;
+ }
+
+ // Channel-wise difference of two colors (c1 - c2).
+ template<class T>
+ inline Color<T> operator-(const Color<T> &c1, const Color<T> &c2)
+ {
+     Color<T> difference(c1.r - c2.r, c1.g - c2.g, c1.b - c2.b, c1.a - c2.a);
+
+     return difference;
+ }
+
+ // Scales every channel by l; each product is computed in float and cast back to T.
+ template<class T>
+ inline Color<T> operator*(float l, const Color<T> &c)
+ {
+     return Color<T>((T)(l * c.r),
+                     (T)(l * c.g),
+                     (T)(l * c.b),
+                     (T)(l * c.a));
+ }
+
+ // Channel-wise product of two colors; each product is converted to T.
+ template<class T>
+ inline Color<T> operator*(const Color<T> &c1, const Color<T> &c2)
+ {
+     return Color<T>(c1.r * c2.r,
+                     c1.g * c2.g,
+                     c1.b * c2.b,
+                     c1.a * c2.a);
+ }
+
+ // Channel-wise product of two short colors; each product is shifted right by 12 bits.
+ template<>
+ inline Color<short> operator*(const Color<short> &c1, const Color<short> &c2)
+ {
+     return Color<short>((c1.r * c2.r) >> 12,
+                         (c1.g * c2.g) >> 12,
+                         (c1.b * c2.b) >> 12,
+                         (c1.a * c2.a) >> 12);
+ }
+
+ // Channel-wise product of two 8-bit colors; each product is shifted right by 8 bits.
+ template<>
+ inline Color<byte> operator*(const Color<byte> &c1, const Color<byte> &c2)
+ {
+     return Color<byte>((c1.r * c2.r) >> 8,
+                        (c1.g * c2.g) >> 8,
+                        (c1.b * c2.b) >> 8,
+                        (c1.a * c2.a) >> 8);
+ }
+
+ // Divides every channel by l: multiplies by the reciprocal of l and casts back to T.
+ template<class T>
+ inline Color<T> operator/(const Color<T> &c, float l)
+ {
+     const float reciprocal = 1.0f / l;
+
+     return Color<T>((T)(reciprocal * c.r),
+                     (T)(reciprocal * c.g),
+                     (T)(reciprocal * c.b),
+                     (T)(reciprocal * c.a));
+ }
+
+ // Per-channel slope from c1 to c2 over a distance d: (c2 - c1) * (1 / d).
+ template<class T>
+ inline Color<T> Color<T>::gradient(const Color<T> &c1, const Color<T> &c2, float d)
+ {
+     const float perUnit = 1.0f / d;
+
+     return Color<T>((c2.r - c1.r) * perUnit,
+                     (c2.g - c1.g) * perUnit,
+                     (c2.b - c1.b) * perUnit,
+                     (c2.a - c1.a) * perUnit);
+ }
+
+ // Per-channel interpolation: c1 + d * (c2 - c1), with the scaled difference cast to T before adding.
+ template<class T>
+ inline Color<T> Color<T>::shade(const Color<T> &c1, const Color<T> &c2, float d)
+ {
+     return Color<T>(c1.r + (T)(d * (c2.r - c1.r)),
+                     c1.g + (T)(d * (c2.g - c1.g)),
+                     c1.b + (T)(d * (c2.b - c1.b)),
+                     c1.a + (T)(d * (c2.a - c1.a)));
+ }
+}
+
+#endif // sw_Color_hpp
diff --git a/src/Device/Config.cpp b/src/Device/Config.cpp
new file mode 100644
index 0000000..7cb309a
--- /dev/null
+++ b/src/Device/Config.cpp
@@ -0,0 +1,82 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Config.hpp"
+
+#include "Common/Thread.hpp"
+#include "Common/Timer.hpp"
+
+namespace sw
+{
+ Profiler profiler; // Definition of the global profiler object that Config.hpp declares extern
+
+ Profiler::Profiler()
+ {
+ this->reset(); // Begin with every counter cleared
+ }
+
+ void Profiler::reset() // Zeroes the frame counters and, when PERF_PROFILE is set, every profiling counter
+ {
+ framesSec = 0;
+ framesTotal = 0;
+ FPS = 0; // Note: the timestamp kept as a static local in nextFrame() is not touched here
+
+ #if PERF_PROFILE
+ for(int i = 0; i < PERF_TIMERS; i++) // One accumulator per PERF_* timer index
+ {
+ cycles[i] = 0;
+ }
+
+ ropOperations = 0;
+ ropOperationsTotal = 0;
+ ropOperationsFrame = 0;
+
+ texOperations = 0;
+ texOperationsTotal = 0;
+ texOperationsFrame = 0;
+
+ compressedTex = 0;
+ compressedTexTotal = 0;
+ compressedTexFrame = 0;
+ #endif
+ }
+
+ void Profiler::nextFrame()
+ {
+ #if PERF_PROFILE
+ // Capture this frame's counts and clear the running counters in one step.
+ ropOperationsFrame = sw::atomicExchange(&ropOperations, 0);
+ texOperationsFrame = sw::atomicExchange(&texOperations, 0);
+ compressedTexFrame = sw::atomicExchange(&compressedTex, 0);
+ ropOperationsTotal += ropOperationsFrame;
+ texOperationsTotal += texOperationsFrame;
+ compressedTexTotal += compressedTexFrame;
+ #endif
+
+ // Time of the last FPS update; initialized on the first call.
+ static double lastUpdate = sw::Timer::seconds();
+ const double now = sw::Timer::seconds();
+ const double elapsed = now - lastUpdate;
+ ++framesSec;
+
+ // Refresh the FPS figure once more than a second has elapsed.
+ if(elapsed > 1.0)
+ {
+ FPS = framesSec / elapsed;
+ framesTotal += framesSec;
+ framesSec = 0;
+ lastUpdate = now;
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/Device/Config.hpp b/src/Device/Config.hpp
new file mode 100644
index 0000000..017e38b
--- /dev/null
+++ b/src/Device/Config.hpp
@@ -0,0 +1,103 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Config_hpp
+#define sw_Config_hpp
+
+#include "Common/Types.hpp"
+
+#define PERF_HUD 0 // Display time spent on vertex, setup and pixel processing for each thread
+#define PERF_PROFILE 0 // Profile various pipeline stages and display the timing in SwiftConfig; also compiles in the extra Profiler counters
+
+#define ASTC_SUPPORT 0 // NOTE(review): presumably enables ASTC texture format support when non-zero - confirm
+
+// Worker thread count when not set by SwiftConfig
+// 0 = process affinity count (recommended)
+// 1 = rendering on main thread (no worker threads), useful for debugging
+#ifndef DEFAULT_THREAD_COUNT // Only defined here when not already defined earlier (e.g. by the build)
+#define DEFAULT_THREAD_COUNT 0
+#endif
+
+namespace sw
+{
+ enum // Indices into Profiler::cycles (only present when PERF_PROFILE is set)
+ {
+ PERF_PIXEL,
+ PERF_PIPE,
+ PERF_INTERP,
+ PERF_SHADER,
+ PERF_TEX,
+ PERF_ROP,
+
+ PERF_TIMERS // Count of the entries above; used as the size of Profiler::cycles
+ };
+
+ struct Profiler // Frame-rate statistics plus, with PERF_PROFILE, per-frame operation counters
+ {
+ Profiler(); // Calls reset()
+
+ void reset(); // Sets all counters below to zero
+ void nextFrame(); // Counts a frame; recomputes FPS once more than a second has elapsed since the last update
+
+ int framesSec; // Frames counted since the last FPS update
+ int framesTotal; // Sum of framesSec over all completed FPS updates
+ double FPS; // framesSec divided by the elapsed seconds, as of the last update
+
+ #if PERF_PROFILE
+ double cycles[PERF_TIMERS]; // One accumulator per PERF_* index
+
+ int64_t ropOperations; // Running counter; nextFrame() swaps it to 0
+ int64_t ropOperationsTotal; // Sum of all captured per-frame values
+ int64_t ropOperationsFrame; // Value captured by the most recent nextFrame()
+
+ int64_t texOperations; // Running counter; nextFrame() swaps it to 0
+ int64_t texOperationsTotal; // Sum of all captured per-frame values
+ int64_t texOperationsFrame; // Value captured by the most recent nextFrame()
+
+ int64_t compressedTex; // Running counter; nextFrame() swaps it to 0
+ int64_t compressedTexTotal; // Sum of all captured per-frame values
+ int64_t compressedTexFrame; // Value captured by the most recent nextFrame()
+ #endif
+ };
+
+ extern Profiler profiler; // Global instance; defined in Config.cpp
+
+ enum // Compile-time limits and capacities
+ {
+ OUTLINE_RESOLUTION = 8192, // Maximum vertical resolution of the render target
+ MIPMAP_LEVELS = 14,
+ TEXTURE_IMAGE_UNITS = 16,
+ VERTEX_TEXTURE_IMAGE_UNITS = 16,
+ TOTAL_IMAGE_UNITS = TEXTURE_IMAGE_UNITS + VERTEX_TEXTURE_IMAGE_UNITS, // Sum of the two image unit counts above
+ FRAGMENT_UNIFORM_VECTORS = 264,
+ VERTEX_UNIFORM_VECTORS = 259,
+ MAX_VERTEX_INPUTS = 32, // Context::init() resets this many vertex input streams
+ MAX_VERTEX_OUTPUTS = 34,
+ MAX_FRAGMENT_INPUTS = 32,
+ MAX_FRAGMENT_UNIFORM_BLOCKS = 12,
+ MAX_VERTEX_UNIFORM_BLOCKS = 12,
+ MAX_UNIFORM_BUFFER_BINDINGS = MAX_FRAGMENT_UNIFORM_BLOCKS + MAX_VERTEX_UNIFORM_BLOCKS, // Limited to 127 by SourceParameter.bufferIndex in Shader.hpp
+ MAX_UNIFORM_BLOCK_SIZE = 16384,
+ MAX_CLIP_PLANES = 6,
+ MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS = 64,
+ MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS = 64,
+ MIN_PROGRAM_TEXEL_OFFSET = -8,
+ MAX_PROGRAM_TEXEL_OFFSET = 7,
+ MAX_TEXTURE_LOD = MIPMAP_LEVELS - 2, // Trilinear accesses lod+1
+ RENDERTARGETS = 8, // Number of renderTarget / colorWriteMask slots that Context::init() resets
+ NUM_TEMPORARY_REGISTERS = 4096,
+ };
+}
+
+#endif // sw_Config_hpp
diff --git a/src/Device/Context.cpp b/src/Device/Context.cpp
new file mode 100644
index 0000000..25c5775
--- /dev/null
+++ b/src/Device/Context.cpp
@@ -0,0 +1,1496 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Context.hpp"
+
+#include "Primitive.hpp"
+#include "Surface.hpp"
+#include "Shader/PixelShader.hpp"
+#include "Shader/VertexShader.hpp"
+#include "Common/Memory.hpp"
+#include "Common/Debug.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+ extern bool perspectiveCorrection; // Defined in another translation unit; consulted by Context::perspectiveActive()
+
+ bool halfIntegerCoordinates = false; // Pixel centers are not at integer coordinates
+ bool symmetricNormalizedDepth = false; // [-1, 1] instead of [0, 1]
+ bool booleanFaceRegister = false;
+ bool fullPixelPositionRegister = false;
+ bool leadingVertexFirst = false; // Flat shading uses first vertex, else last
+ bool secondaryColor = false; // Specular lighting is applied after texturing
+ bool colorsDefaultToZero = false;
+
+ bool forceWindowed = false;
+ bool quadLayoutEnabled = false;
+ bool veryEarlyDepthTest = true;
+ bool complementaryDepthBuffer = false;
+ bool postBlendSRGB = false;
+ bool exactColorRounding = false;
+ TransparencyAntialiasing transparencyAntialiasing = TRANSPARENCY_NONE; // Any other value makes Context::alphaTestActive() return true
+ bool forceClearRegisters = false;
+
+ Context::Context()
+ {
+ this->init(); // Load the default state
+ }
+
+ Context::~Context() // Empty: nothing is released here
+ {
+ }
+
+ void *Context::operator new(size_t bytes)
+ {
+ return allocate(static_cast<unsigned int>(bytes)); // Routed through allocate(); paired with deallocate() in operator delete
+ }
+
+ void Context::operator delete(void *pointer, size_t bytes) // 'bytes' is not used
+ {
+ deallocate(pointer); // Counterpart of the allocate() call in operator new
+ }
+
+ bool Context::isDrawPoint(bool fillModeAware) const
+ {
+ switch(drawType)
+ {
+ case DRAW_POINTLIST:
+ case DRAW_INDEXEDPOINTLIST8:
+ case DRAW_INDEXEDPOINTLIST16:
+ case DRAW_INDEXEDPOINTLIST32:
+ return true;
+ // Quads and all line primitives are never drawn as points.
+ case DRAW_QUADLIST:
+ case DRAW_LINELIST:
+ case DRAW_LINESTRIP:
+ case DRAW_LINELOOP:
+ case DRAW_INDEXEDLINELIST8:
+ case DRAW_INDEXEDLINESTRIP8:
+ case DRAW_INDEXEDLINELOOP8:
+ case DRAW_INDEXEDLINELIST16:
+ case DRAW_INDEXEDLINESTRIP16:
+ case DRAW_INDEXEDLINELOOP16:
+ case DRAW_INDEXEDLINELIST32:
+ case DRAW_INDEXEDLINESTRIP32:
+ case DRAW_INDEXEDLINELOOP32:
+ return false;
+ case DRAW_TRIANGLELIST:
+ case DRAW_TRIANGLESTRIP:
+ case DRAW_TRIANGLEFAN:
+ case DRAW_INDEXEDTRIANGLELIST8:
+ case DRAW_INDEXEDTRIANGLESTRIP8:
+ case DRAW_INDEXEDTRIANGLEFAN8:
+ case DRAW_INDEXEDTRIANGLELIST16:
+ case DRAW_INDEXEDTRIANGLESTRIP16:
+ case DRAW_INDEXEDTRIANGLEFAN16:
+ case DRAW_INDEXEDTRIANGLELIST32:
+ case DRAW_INDEXEDTRIANGLESTRIP32:
+ case DRAW_INDEXEDTRIANGLEFAN32:
+ return fillModeAware && (fillMode == FILL_VERTEX); // Triangles count as points only in vertex fill mode
+ default:
+ ASSERT(false);
+ }
+
+ return false;
+ }
+
+ bool Context::isDrawLine(bool fillModeAware) const
+ {
+ switch(drawType)
+ {
+ // Points and quads are never drawn as lines.
+ case DRAW_QUADLIST:
+ case DRAW_POINTLIST:
+ case DRAW_INDEXEDPOINTLIST8:
+ case DRAW_INDEXEDPOINTLIST16:
+ case DRAW_INDEXEDPOINTLIST32:
+ return false;
+ case DRAW_LINELIST:
+ case DRAW_LINESTRIP:
+ case DRAW_LINELOOP:
+ case DRAW_INDEXEDLINELIST8:
+ case DRAW_INDEXEDLINESTRIP8:
+ case DRAW_INDEXEDLINELOOP8:
+ case DRAW_INDEXEDLINELIST16:
+ case DRAW_INDEXEDLINESTRIP16:
+ case DRAW_INDEXEDLINELOOP16:
+ case DRAW_INDEXEDLINELIST32:
+ case DRAW_INDEXEDLINESTRIP32:
+ case DRAW_INDEXEDLINELOOP32:
+ return true;
+ case DRAW_TRIANGLELIST:
+ case DRAW_TRIANGLESTRIP:
+ case DRAW_TRIANGLEFAN:
+ case DRAW_INDEXEDTRIANGLELIST8:
+ case DRAW_INDEXEDTRIANGLESTRIP8:
+ case DRAW_INDEXEDTRIANGLEFAN8:
+ case DRAW_INDEXEDTRIANGLELIST16:
+ case DRAW_INDEXEDTRIANGLESTRIP16:
+ case DRAW_INDEXEDTRIANGLEFAN16:
+ case DRAW_INDEXEDTRIANGLELIST32:
+ case DRAW_INDEXEDTRIANGLESTRIP32:
+ case DRAW_INDEXEDTRIANGLEFAN32:
+ return fillModeAware && (fillMode == FILL_WIREFRAME); // Triangles count as lines only in wireframe fill mode
+ default:
+ ASSERT(false);
+ }
+
+ return false;
+ }
+
+ bool Context::isDrawTriangle(bool fillModeAware) const
+ {
+ switch(drawType)
+ {
+ // Point and line primitives are never drawn as triangles.
+ case DRAW_POINTLIST:
+ case DRAW_INDEXEDPOINTLIST8:
+ case DRAW_INDEXEDPOINTLIST16:
+ case DRAW_INDEXEDPOINTLIST32:
+ case DRAW_LINELIST:
+ case DRAW_LINESTRIP:
+ case DRAW_LINELOOP:
+ case DRAW_INDEXEDLINELIST8:
+ case DRAW_INDEXEDLINESTRIP8:
+ case DRAW_INDEXEDLINELOOP8:
+ case DRAW_INDEXEDLINELIST16:
+ case DRAW_INDEXEDLINESTRIP16:
+ case DRAW_INDEXEDLINELOOP16:
+ case DRAW_INDEXEDLINELIST32:
+ case DRAW_INDEXEDLINESTRIP32:
+ case DRAW_INDEXEDLINELOOP32:
+ return false;
+ case DRAW_TRIANGLELIST:
+ case DRAW_TRIANGLESTRIP:
+ case DRAW_TRIANGLEFAN:
+ case DRAW_INDEXEDTRIANGLELIST8:
+ case DRAW_INDEXEDTRIANGLESTRIP8:
+ case DRAW_INDEXEDTRIANGLEFAN8:
+ case DRAW_INDEXEDTRIANGLELIST16:
+ case DRAW_INDEXEDTRIANGLESTRIP16:
+ case DRAW_INDEXEDTRIANGLEFAN16:
+ case DRAW_INDEXEDTRIANGLELIST32:
+ case DRAW_INDEXEDTRIANGLESTRIP32:
+ case DRAW_INDEXEDTRIANGLEFAN32:
+ case DRAW_QUADLIST: // Quads are broken up into triangles
+ // When fill mode is taken into account, only solid fill counts;
+ // otherwise the primitive type alone decides.
+ return !fillModeAware || (fillMode == FILL_SOLID);
+ default:
+ ASSERT(false);
+ }
+
+ return true;
+ }
+
+ void Context::init() // Assigns default values to the context's render state; called by the constructor
+ {
+ for(int i = 0; i < 8; i++) // Each texture stage gets its sampler and a pointer to the previous stage (null for stage 0)
+ {
+ textureStage[i].init(i, &sampler[i], (i >= 1) ? &textureStage[i - 1] : 0);
+ }
+
+ // Set vertex streams to null stream
+ for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
+ {
+ input[i].defaults();
+ }
+
+ fogStart = 0.0f;
+ fogEnd = 1.0f;
+
+ for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++) textureWrap[i] = 0;
+ for(int i = 0; i < 8; i++) texGen[i] = TEXGEN_PASSTHRU;
+ for(int i = 0; i < 8; i++) textureTransformCount[i] = 0;
+ for(int i = 0; i < 8; i++) textureTransformProject[i] = false;
+ textureWrapActive = false;
+ localViewer = true;
+ normalizeNormals = false;
+
+ for(int i = 0; i < RENDERTARGETS; ++i) // No color, depth or stencil buffers bound initially
+ {
+ renderTarget[i] = nullptr;
+ }
+ depthBuffer = nullptr;
+ stencilBuffer = nullptr;
+
+ stencilEnable = false;
+ stencilCompareMode = STENCIL_ALWAYS;
+ stencilReference = 0;
+ stencilMask = 0xFFFFFFFF;
+ stencilFailOperation = OPERATION_KEEP;
+ stencilPassOperation = OPERATION_KEEP;
+ stencilZFailOperation = OPERATION_KEEP;
+ stencilWriteMask = 0xFFFFFFFF;
+
+ twoSidedStencil = false; // The *CCW stencil fields below receive the same defaults as the fields above
+ stencilCompareModeCCW = STENCIL_ALWAYS;
+ stencilReferenceCCW = 0;
+ stencilMaskCCW = 0xFFFFFFFF;
+ stencilFailOperationCCW = OPERATION_KEEP;
+ stencilPassOperationCCW = OPERATION_KEEP;
+ stencilZFailOperationCCW = OPERATION_KEEP;
+ stencilWriteMaskCCW = 0xFFFFFFFF;
+
+ setGlobalMipmapBias(0); // Stores exp2(0 + 0.5f) in 'bias'
+
+ lightingEnable = true;
+ specularEnable = false;
+ for(int i = 0; i < 8; i++) lightEnable[i] = false;
+ for(int i = 0; i < 8; i++) worldLightPosition[i] = 0;
+
+ alphaCompareMode = ALPHA_ALWAYS;
+ alphaTestEnable = false;
+ fillMode = FILL_SOLID;
+ shadingMode = SHADING_GOURAUD;
+
+ rasterizerDiscard = false;
+
+ depthCompareMode = DEPTH_LESS;
+ depthBufferEnable = true;
+ depthWriteEnable = true;
+
+ alphaBlendEnable = false;
+ sourceBlendFactorState = BLEND_ONE;
+ destBlendFactorState = BLEND_ZERO;
+ blendOperationState = BLENDOP_ADD;
+
+ separateAlphaBlendEnable = false;
+ sourceBlendFactorStateAlpha = BLEND_ONE;
+ destBlendFactorStateAlpha = BLEND_ZERO;
+ blendOperationStateAlpha = BLENDOP_ADD;
+
+ cullMode = CULL_CLOCKWISE;
+ frontFacingCCW = true;
+ alphaReference = 0.0f;
+
+ depthBias = 0.0f;
+ slopeDepthBias = 0.0f;
+
+ for(int i = 0; i < RENDERTARGETS; i++)
+ {
+ colorWriteMask[i] = 0x0000000F; // Low four bits set; presumably one bit per color channel - confirm
+ }
+
+ ambientMaterialSource = MATERIAL_MATERIAL;
+ diffuseMaterialSource = MATERIAL_COLOR1;
+ specularMaterialSource = MATERIAL_COLOR2;
+ emissiveMaterialSource = MATERIAL_MATERIAL;
+ colorVertexEnable = true;
+
+ fogEnable = false;
+ pixelFogMode = FOG_NONE;
+ vertexFogMode = FOG_NONE;
+ wBasedFog = false;
+ rangeFogEnable = false;
+
+ indexedVertexBlendEnable = false;
+ vertexBlendMatrixCount = 0;
+
+ pixelShader = 0; // No shaders set
+ vertexShader = 0;
+
+ instanceID = 0;
+
+ occlusionEnabled = false;
+ transformFeedbackQueryEnabled = false;
+ transformFeedbackEnabled = 0;
+
+ pointSpriteEnable = false;
+ pointScaleEnable = false;
+ lineWidth = 1.0f;
+
+ writeSRGB = false;
+ sampleMask = 0xFFFFFFFF;
+
+ colorLogicOpEnabled = false;
+ logicalOperation = LOGICALOP_COPY;
+ }
+
+ const float &Context::exp2Bias()
+ {
+ return this->bias; // Value last stored by setGlobalMipmapBias()
+ }
+
+ const Point &Context::getLightPosition(int light)
+ {
+ return this->worldLightPosition[light]; // No range check on 'light'
+ }
+
+ void Context::setGlobalMipmapBias(float bias) // The parameter shadows the 'bias' member, hence this-> below
+ {
+ this->bias = exp2(bias + 0.5f); // Stored as 2^(bias + 0.5); read back through exp2Bias()
+ }
+
+ void Context::setLightingEnable(bool lightingEnable) // The parameter shadows the member of the same name
+ {
+ this->lightingEnable = lightingEnable;
+ }
+
+ void Context::setSpecularEnable(bool specularEnable)
+ {
+ this->specularEnable = specularEnable; // Parameter shadows the member
+ }
+
+ void Context::setLightEnable(int light, bool lightEnable)
+ {
+ this->lightEnable[light] = lightEnable; // No range check on 'light'
+ }
+
+ void Context::setLightPosition(int light, Point worldLightPosition)
+ {
+ this->worldLightPosition[light] = worldLightPosition; // No range check on 'light'
+ }
+
+ void Context::setAmbientMaterialSource(MaterialSource ambientMaterialSource)
+ {
+ this->ambientMaterialSource = ambientMaterialSource; // Parameter shadows the member
+ }
+
+ void Context::setDiffuseMaterialSource(MaterialSource diffuseMaterialSource)
+ {
+ this->diffuseMaterialSource = diffuseMaterialSource; // Parameter shadows the member
+ }
+
+ void Context::setSpecularMaterialSource(MaterialSource specularMaterialSource)
+ {
+ this->specularMaterialSource = specularMaterialSource; // Parameter shadows the member
+ }
+
+ void Context::setEmissiveMaterialSource(MaterialSource emissiveMaterialSource)
+ {
+ this->emissiveMaterialSource = emissiveMaterialSource; // Parameter shadows the member
+ }
+
+ void Context::setPointSpriteEnable(bool pointSpriteEnable)
+ {
+ this->pointSpriteEnable = pointSpriteEnable; // Parameter shadows the member
+ }
+
+ void Context::setPointScaleEnable(bool pointScaleEnable)
+ {
+ this->pointScaleEnable = pointScaleEnable; // Parameter shadows the member
+ }
+
+ bool Context::setDepthBufferEnable(bool depthBufferEnable)
+ {
+ const bool changed = (this->depthBufferEnable != depthBufferEnable);
+ this->depthBufferEnable = depthBufferEnable; // Stored unconditionally
+ return changed; // True only when the new value differs from the old one
+ }
+
+ bool Context::setAlphaBlendEnable(bool alphaBlendEnable)
+ {
+ const bool changed = (this->alphaBlendEnable != alphaBlendEnable);
+ this->alphaBlendEnable = alphaBlendEnable; // Stored unconditionally
+ return changed; // True only when the new value differs from the old one
+ }
+
+ bool Context::setSourceBlendFactor(BlendFactor sourceBlendFactor)
+ {
+ const bool changed = (this->sourceBlendFactorState != sourceBlendFactor);
+ this->sourceBlendFactorState = sourceBlendFactor; // Raw state; sourceBlendFactor() derives the effective factor
+ return changed; // True only when the new value differs from the old one
+ }
+
+ bool Context::setDestBlendFactor(BlendFactor destBlendFactor)
+ {
+ const bool changed = (this->destBlendFactorState != destBlendFactor);
+ this->destBlendFactorState = destBlendFactor; // Raw state; destBlendFactor() derives the effective factor
+ return changed; // True only when the new value differs from the old one
+ }
+
+ bool Context::setBlendOperation(BlendOperation blendOperation)
+ {
+ const bool changed = (this->blendOperationState != blendOperation);
+ this->blendOperationState = blendOperation; // Raw state; blendOperation() derives the effective operation
+ return changed; // True only when the new value differs from the old one
+ }
+
+ bool Context::setSeparateAlphaBlendEnable(bool separateAlphaBlendEnable)
+ {
+ const bool changed = (this->separateAlphaBlendEnable != separateAlphaBlendEnable);
+ this->separateAlphaBlendEnable = separateAlphaBlendEnable; // Stored unconditionally
+ return changed; // True only when the new value differs from the old one
+ }
+
+ bool Context::setSourceBlendFactorAlpha(BlendFactor sourceBlendFactorAlpha)
+ {
+ const bool changed = (this->sourceBlendFactorStateAlpha != sourceBlendFactorAlpha);
+ this->sourceBlendFactorStateAlpha = sourceBlendFactorAlpha; // Raw state; sourceBlendFactorAlpha() derives the effective factor
+ return changed; // True only when the new value differs from the old one
+ }
+
+ bool Context::setDestBlendFactorAlpha(BlendFactor destBlendFactorAlpha)
+ {
+ const bool changed = (this->destBlendFactorStateAlpha != destBlendFactorAlpha);
+ this->destBlendFactorStateAlpha = destBlendFactorAlpha; // Raw state; destBlendFactorAlpha() derives the effective factor
+ return changed; // True only when the new value differs from the old one
+ }
+
+ bool Context::setBlendOperationAlpha(BlendOperation blendOperationAlpha)
+ {
+ const bool changed = (this->blendOperationStateAlpha != blendOperationAlpha);
+ this->blendOperationStateAlpha = blendOperationAlpha; // Raw state; blendOperationAlpha() derives the effective operation
+ return changed; // True only when the new value differs from the old one
+ }
+
+ bool Context::setColorWriteMask(int index, int colorWriteMask)
+ {
+ const bool changed = (this->colorWriteMask[index] != colorWriteMask);
+ this->colorWriteMask[index] = colorWriteMask; // No range check on 'index'
+ return changed; // True only when the new value differs from the old one
+ }
+
+ bool Context::setWriteSRGB(bool sRGB)
+ {
+ const bool changed = (this->writeSRGB != sRGB);
+ this->writeSRGB = sRGB; // Stored unconditionally
+ return changed; // True only when the new value differs from the old one
+ }
+
+ bool Context::setColorLogicOpEnabled(bool enabled)
+ {
+ const bool changed = (this->colorLogicOpEnabled != enabled);
+ this->colorLogicOpEnabled = enabled; // Stored unconditionally
+ return changed; // True only when the new value differs from the old one
+ }
+
+ bool Context::setLogicalOperation(LogicalOperation logicalOperation)
+ {
+ const bool changed = (this->logicalOperation != logicalOperation);
+ this->logicalOperation = logicalOperation; // Raw state; colorLogicOp() reports it only while logic ops are enabled
+ return changed; // True only when the new value differs from the old one
+ }
+
+ void Context::setColorVertexEnable(bool colorVertexEnable)
+ {
+ this->colorVertexEnable = colorVertexEnable; // Parameter shadows the member
+ }
+
+ bool Context::fogActive()
+ {
+ // Fog requires color output and a pixel shader model value below
+ // 0x0300; when either check fails, fogEnable is ignored.
+ if(!colorUsed() || pixelShaderModel() >= 0x0300) return false;
+
+ return fogEnable;
+ }
+
+ bool Context::pointSizeActive()
+ {
+ // Always false while a vertex shader is set, and for draws that are not points.
+ if(vertexShader) return false;
+ if(!isDrawPoint(true)) return false;
+
+ // Either a point size input stream, or point scaling of vertices that are not pre-transformed.
+ return input[PointSize] || (!preTransformed && pointScaleActive());
+ }
+
+ FogMode Context::pixelFogActive()
+ {
+ // The configured pixel fog mode is reported only while fog is
+ // active; otherwise the result is FOG_NONE.
+ const bool active = fogActive();
+ if(!active) return FOG_NONE;
+
+ return pixelFogMode;
+ }
+
+ bool Context::depthWriteActive()
+ {
+ // Depth writes need an active depth buffer as well as the write-enable flag.
+ const bool bufferActive = depthBufferActive();
+ return bufferActive && depthWriteEnable;
+ }
+
+ bool Context::alphaTestActive()
+ {
+ // Any transparency antialiasing mode forces the alpha test on.
+ if(transparencyAntialiasing != TRANSPARENCY_NONE) return true;
+ // Otherwise the test must be enabled and must not be one of the two
+ // configurations treated as never rejecting: ALWAYS, or GREATEREQUAL with a reference of exactly 0.0f.
+ const bool neverRejects = (alphaCompareMode == ALPHA_ALWAYS) || (alphaReference == 0.0f && alphaCompareMode == ALPHA_GREATEREQUAL);
+ return alphaTestEnable && !neverRejects;
+ }
+
+ bool Context::depthBufferActive()
+ {
+ return (depthBuffer != nullptr) && depthBufferEnable; // Needs both a depth buffer and the enable flag
+ }
+
+ bool Context::stencilActive()
+ {
+ return (stencilBuffer != nullptr) && stencilEnable; // Needs both a stencil buffer and the enable flag
+ }
+
+ bool Context::vertexLightingActive()
+ {
+ // Lighting is reported only when no vertex shader is set, the
+ // vertices are not pre-transformed, and lighting is enabled.
+ if(vertexShader) return false;
+ if(preTransformed) return false;
+
+ return lightingEnable;
+ }
+
+ bool Context::texCoordActive(int coordinate, int component)
+ {
+ // A texture coordinate component is active when the vertex stage
+ // supplies it and the pixel stage consumes it.
+
+ // Supply side. Active point sprites count as supplying every component.
+ bool supplied = pointSpriteActive();
+
+ if(vertexShader)
+ {
+ if(preTransformed)
+ {
+ supplied = true; // FIXME: Check vertex buffer streams
+ }
+ else if(vertexShader->getOutput(T0 + coordinate, component).usage == Shader::USAGE_TEXCOORD)
+ {
+ supplied = true;
+ }
+ }
+ else
+ {
+ // Without a vertex shader the texture coordinate generation mode decides.
+ switch(texGen[coordinate])
+ {
+ case TEXGEN_NONE:
+ supplied = true;
+ break;
+ case TEXGEN_PASSTHRU:
+ // Bounded by the component count of the referenced input stream.
+ supplied = supplied || (component < input[TexCoord0 + textureStage[coordinate].texCoordIndex].count);
+ break;
+ case TEXGEN_NORMAL:
+ case TEXGEN_POSITION:
+ case TEXGEN_REFLECTION:
+ // These three modes cover components 0 through 2.
+ supplied = supplied || (component <= 2);
+ break;
+ case TEXGEN_SPHEREMAP:
+ // This mode covers components 0 and 1 only.
+ supplied = supplied || (component <= 1);
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ // Demand side. A projection component always counts as consumed.
+ const bool projection = isProjectionComponent(coordinate, component);
+ bool consumed;
+
+ if(pixelShader)
+ consumed = pixelShader->usesTexture(coordinate, component) || projection;
+ else
+ consumed = textureStage[coordinate].usesTexture() || projection;
+
+ // Both sides must agree for the component to be active.
+
+ return supplied && consumed;
+ }
+
+ bool Context::texCoordActive(int coordinate)
+ {
+ // True as soon as any of the four components is active.
+ for(int component = 0; component < 4; component++)
+ if(texCoordActive(coordinate, component)) return true;
+ return false;
+ }
+
+ bool Context::isProjectionComponent(unsigned int coordinate, int component)
+ {
+ // Projection applies only for pixel shader model values up to 0x0103,
+ // for the first eight coordinate sets, and when the set requests it.
+ if(pixelShaderModel() > 0x0103 || coordinate >= 8 || !textureTransformProject[coordinate])
+ {
+ return false;
+ }
+
+ // The projection component is the one at index (count - 1);
+ // a transform count of 0 is treated like a count of 4.
+ switch(textureTransformCount[coordinate])
+ {
+ case 2: return component == 1;
+ case 3: return component == 2;
+ case 0:
+ case 4: return component == 3;
+ default: return false;
+ }
+ }
+
+ bool Context::vertexSpecularActive() // Needs active vertex lighting, the specular flag and an active vertex normal
+ {
+ return vertexLightingActive() && specularEnable && vertexNormalActive();
+ }
+
+ bool Context::vertexNormalActive()
+ {
+ // Reported inactive while a vertex shader is set; otherwise it
+ // depends on the Normal input stream converting to true.
+ if(vertexShader) return false;
+
+ const bool hasNormalStream = input[Normal];
+ return hasNormalStream;
+ }
+
+ bool Context::vertexLightActive(int i)
+ {
+ // A light is reported active only when no vertex shader is set,
+ // lighting as a whole is enabled, and light i itself is enabled.
+ if(vertexShader) return false;
+ if(!lightingEnable) return false;
+
+ return lightEnable[i];
+ }
+
+ MaterialSource Context::vertexDiffuseMaterialSourceActive()
+ {
+ // Falls back to MATERIAL_MATERIAL whenever a vertex shader is set,
+ // per-vertex color is disabled, or the selected color stream
+ // (Color0 for COLOR1, Color1 for COLOR2) is not available.
+ if(vertexShader || !colorVertexEnable) return MATERIAL_MATERIAL;
+
+ switch(diffuseMaterialSource)
+ {
+ case MATERIAL_COLOR1: if(!input[Color0]) return MATERIAL_MATERIAL; break;
+ case MATERIAL_COLOR2: if(!input[Color1]) return MATERIAL_MATERIAL; break;
+ default: break;
+ }
+
+ return diffuseMaterialSource;
+ }
+
+ MaterialSource Context::vertexSpecularMaterialSourceActive()
+ {
+ // Falls back to MATERIAL_MATERIAL whenever a vertex shader is set,
+ // per-vertex color is disabled, or the selected color stream
+ // (Color0 for COLOR1, Color1 for COLOR2) is not available.
+ if(vertexShader || !colorVertexEnable) return MATERIAL_MATERIAL;
+
+ switch(specularMaterialSource)
+ {
+ case MATERIAL_COLOR1: if(!input[Color0]) return MATERIAL_MATERIAL; break;
+ case MATERIAL_COLOR2: if(!input[Color1]) return MATERIAL_MATERIAL; break;
+ default: break;
+ }
+
+ return specularMaterialSource;
+ }
+
+ MaterialSource Context::vertexAmbientMaterialSourceActive()
+ {
+ // Falls back to MATERIAL_MATERIAL whenever a vertex shader is set,
+ // per-vertex color is disabled, or the selected color stream
+ // (Color0 for COLOR1, Color1 for COLOR2) is not available.
+ if(vertexShader || !colorVertexEnable) return MATERIAL_MATERIAL;
+
+ switch(ambientMaterialSource)
+ {
+ case MATERIAL_COLOR1: if(!input[Color0]) return MATERIAL_MATERIAL; break;
+ case MATERIAL_COLOR2: if(!input[Color1]) return MATERIAL_MATERIAL; break;
+ default: break;
+ }
+
+ return ambientMaterialSource;
+ }
+
+ MaterialSource Context::vertexEmissiveMaterialSourceActive()
+ {
+ // Falls back to MATERIAL_MATERIAL whenever a vertex shader is set,
+ // per-vertex color is disabled, or the selected color stream
+ // (Color0 for COLOR1, Color1 for COLOR2) is not available.
+ if(vertexShader || !colorVertexEnable) return MATERIAL_MATERIAL;
+
+ switch(emissiveMaterialSource)
+ {
+ case MATERIAL_COLOR1: if(!input[Color0]) return MATERIAL_MATERIAL; break;
+ case MATERIAL_COLOR2: if(!input[Color1]) return MATERIAL_MATERIAL; break;
+ default: break;
+ }
+
+ return emissiveMaterialSource;
+ }
+
+ bool Context::pointSpriteActive() // Point sprites need a point draw (fill mode aware) and the enable flag
+ {
+ return isDrawPoint(true) && pointSpriteEnable;
+ }
+
+ bool Context::pointScaleActive()
+ {
+ // Reported only when no vertex shader is set, the draw is a point
+ // draw (fill mode aware), and point scaling is enabled.
+ if(vertexShader) return false;
+ if(!isDrawPoint(true)) return false;
+
+ return pointScaleEnable;
+ }
+
+ bool Context::alphaBlendActive()
+ {
+ // Blending must be enabled and there must be color output to blend.
+ if(!alphaBlendEnable || !colorUsed())
+ {
+ return false;
+ }
+
+ // An effective SOURCE operation with a source factor of ONE is
+ // treated as not blending.
+ const bool colorBlends = (blendOperation() != BLENDOP_SOURCE) || (sourceBlendFactor() != BLEND_ONE);
+ bool alphaBlends = colorBlends; // Without separate alpha blending, alpha follows color
+ if(separateAlphaBlendEnable) alphaBlends = (blendOperationAlpha() != BLENDOP_SOURCE) || (sourceBlendFactorAlpha() != BLEND_ONE);
+
+ // Active when either the color or the alpha channel blends.
+ return colorBlends || alphaBlends;
+ }
+
+ LogicalOperation Context::colorLogicOp() // Effective logic op: LOGICALOP_COPY whenever logic ops are disabled
+ {
+ return colorLogicOpEnabled ? logicalOperation : LOGICALOP_COPY;
+ }
+
+ BlendFactor Context::sourceBlendFactor()
+ {
+ // With blending disabled the effective source factor is ONE.
+ if(!alphaBlendEnable) return BLEND_ONE;
+
+ switch(blendOperationState)
+ {
+ case BLENDOP_MIN:
+ case BLENDOP_MAX:
+ return BLEND_ONE; // MIN and MAX always report a factor of ONE
+ case BLENDOP_ADD:
+ case BLENDOP_SUB:
+ case BLENDOP_INVSUB:
+ break; // Use the configured factor
+ default:
+ ASSERT(false);
+ }
+
+ return sourceBlendFactorState;
+ }
+
+ BlendFactor Context::destBlendFactor()
+ {
+ // With blending disabled the effective destination factor is ZERO.
+ if(!alphaBlendEnable) return BLEND_ZERO;
+
+ switch(blendOperationState)
+ {
+ case BLENDOP_MIN:
+ case BLENDOP_MAX:
+ return BLEND_ONE; // MIN and MAX always report a factor of ONE
+ case BLENDOP_ADD:
+ case BLENDOP_SUB:
+ case BLENDOP_INVSUB:
+ break; // Use the configured factor
+ default:
+ ASSERT(false);
+ }
+
+ return destBlendFactorState;
+ }
+
+ BlendOperation Context::blendOperation() // Reduces the configured color blend operation using the effective source/destination factors
+ {
+ if(!alphaBlendEnable) return BLENDOP_SOURCE; // Blending disabled
+
+ switch(blendOperationState)
+ {
+ case BLENDOP_ADD:
+ if(sourceBlendFactor() == BLEND_ZERO)
+ {
+ if(destBlendFactor() == BLEND_ZERO)
+ {
+ return BLENDOP_NULL; // Both factors are ZERO
+ }
+ else
+ {
+ return BLENDOP_DEST; // Only the destination term remains
+ }
+ }
+ else if(sourceBlendFactor() == BLEND_ONE)
+ {
+ if(destBlendFactor() == BLEND_ZERO)
+ {
+ return BLENDOP_SOURCE; // Only the source term remains
+ }
+ else
+ {
+ return BLENDOP_ADD;
+ }
+ }
+ else // Source factor is neither ZERO nor ONE; same outcomes as the ONE branch
+ {
+ if(destBlendFactor() == BLEND_ZERO)
+ {
+ return BLENDOP_SOURCE;
+ }
+ else
+ {
+ return BLENDOP_ADD;
+ }
+ }
+ case BLENDOP_SUB:
+ if(sourceBlendFactor() == BLEND_ZERO)
+ {
+ return BLENDOP_NULL; // Negative, clamped to zero
+ }
+ else if(sourceBlendFactor() == BLEND_ONE)
+ {
+ if(destBlendFactor() == BLEND_ZERO)
+ {
+ return BLENDOP_SOURCE;
+ }
+ else
+ {
+ return BLENDOP_SUB;
+ }
+ }
+ else // Source factor is neither ZERO nor ONE; same outcomes as the ONE branch
+ {
+ if(destBlendFactor() == BLEND_ZERO)
+ {
+ return BLENDOP_SOURCE;
+ }
+ else
+ {
+ return BLENDOP_SUB;
+ }
+ }
+ case BLENDOP_INVSUB:
+ if(sourceBlendFactor() == BLEND_ZERO)
+ {
+ if(destBlendFactor() == BLEND_ZERO)
+ {
+ return BLENDOP_NULL;
+ }
+ else
+ {
+ return BLENDOP_DEST;
+ }
+ }
+ else if(sourceBlendFactor() == BLEND_ONE)
+ {
+ if(destBlendFactor() == BLEND_ZERO)
+ {
+ return BLENDOP_NULL; // Negative, clamped to zero
+ }
+ else
+ {
+ return BLENDOP_INVSUB;
+ }
+ }
+ else // Source factor is neither ZERO nor ONE; same outcomes as the ONE branch
+ {
+ if(destBlendFactor() == BLEND_ZERO)
+ {
+ return BLENDOP_NULL; // Negative, clamped to zero
+ }
+ else
+ {
+ return BLENDOP_INVSUB;
+ }
+ }
+ case BLENDOP_MIN:
+ return BLENDOP_MIN; // MIN and MAX are returned unchanged
+ case BLENDOP_MAX:
+ return BLENDOP_MAX;
+ default:
+ ASSERT(false);
+ }
+
+ return blendOperationState; // Reached only after the default case (unexpected operation value)
+ }
+
+ BlendFactor Context::sourceBlendFactorAlpha()
+ {
+ // Without separate alpha blending, alpha shares the color
+ // channel's effective source factor.
+ if(!separateAlphaBlendEnable)
+ {
+ return sourceBlendFactor();
+ }
+
+ // Separate alpha blending: derive the factor from the alpha state.
+ switch(blendOperationStateAlpha)
+ {
+ case BLENDOP_MIN:
+ case BLENDOP_MAX:
+ return BLEND_ONE; // MIN and MAX always report a factor of ONE
+ case BLENDOP_ADD:
+ case BLENDOP_SUB:
+ case BLENDOP_INVSUB:
+ break; // Use the configured alpha factor
+ default:
+ ASSERT(false);
+ }
+
+ return sourceBlendFactorStateAlpha;
+ }
+
+ BlendFactor Context::destBlendFactorAlpha()
+ {
+ // Without separate alpha blending, alpha shares the color
+ // channel's effective destination factor.
+ if(!separateAlphaBlendEnable)
+ {
+ return destBlendFactor();
+ }
+
+ // Separate alpha blending: derive the factor from the alpha state.
+ switch(blendOperationStateAlpha)
+ {
+ case BLENDOP_MIN:
+ case BLENDOP_MAX:
+ return BLEND_ONE; // MIN and MAX always report a factor of ONE
+ case BLENDOP_ADD:
+ case BLENDOP_SUB:
+ case BLENDOP_INVSUB:
+ break; // Use the configured alpha factor
+ default:
+ ASSERT(false);
+ }
+
+ return destBlendFactorStateAlpha;
+ }
+
+ BlendOperation Context::blendOperationAlpha() // Alpha-channel counterpart of blendOperation()
+ {
+ if(!separateAlphaBlendEnable)
+ {
+ return blendOperation(); // Alpha follows the color channel
+ }
+ else
+ {
+ switch(blendOperationStateAlpha)
+ {
+ case BLENDOP_ADD:
+ if(sourceBlendFactorAlpha() == BLEND_ZERO)
+ {
+ if(destBlendFactorAlpha() == BLEND_ZERO)
+ {
+ return BLENDOP_NULL; // Both factors are ZERO
+ }
+ else
+ {
+ return BLENDOP_DEST; // Only the destination term remains
+ }
+ }
+ else if(sourceBlendFactorAlpha() == BLEND_ONE)
+ {
+ if(destBlendFactorAlpha() == BLEND_ZERO)
+ {
+ return BLENDOP_SOURCE; // Only the source term remains
+ }
+ else
+ {
+ return BLENDOP_ADD;
+ }
+ }
+ else // Source factor is neither ZERO nor ONE; same outcomes as the ONE branch
+ {
+ if(destBlendFactorAlpha() == BLEND_ZERO)
+ {
+ return BLENDOP_SOURCE;
+ }
+ else
+ {
+ return BLENDOP_ADD;
+ }
+ }
+ case BLENDOP_SUB:
+ if(sourceBlendFactorAlpha() == BLEND_ZERO)
+ {
+ return BLENDOP_NULL; // Negative, clamped to zero
+ }
+ else if(sourceBlendFactorAlpha() == BLEND_ONE)
+ {
+ if(destBlendFactorAlpha() == BLEND_ZERO)
+ {
+ return BLENDOP_SOURCE;
+ }
+ else
+ {
+ return BLENDOP_SUB;
+ }
+ }
+ else // Source factor is neither ZERO nor ONE; same outcomes as the ONE branch
+ {
+ if(destBlendFactorAlpha() == BLEND_ZERO)
+ {
+ return BLENDOP_SOURCE;
+ }
+ else
+ {
+ return BLENDOP_SUB;
+ }
+ }
+ case BLENDOP_INVSUB:
+ if(sourceBlendFactorAlpha() == BLEND_ZERO)
+ {
+ if(destBlendFactorAlpha() == BLEND_ZERO)
+ {
+ return BLENDOP_NULL;
+ }
+ else
+ {
+ return BLENDOP_DEST;
+ }
+ }
+ else if(sourceBlendFactorAlpha() == BLEND_ONE)
+ {
+ if(destBlendFactorAlpha() == BLEND_ZERO)
+ {
+ return BLENDOP_NULL; // Negative, clamped to zero
+ }
+ else
+ {
+ return BLENDOP_INVSUB;
+ }
+ }
+ else // Source factor is neither ZERO nor ONE; same outcomes as the ONE branch
+ {
+ if(destBlendFactorAlpha() == BLEND_ZERO)
+ {
+ return BLENDOP_NULL; // Negative, clamped to zero
+ }
+ else
+ {
+ return BLENDOP_INVSUB;
+ }
+ }
+ case BLENDOP_MIN:
+ return BLENDOP_MIN; // MIN and MAX are returned unchanged
+ case BLENDOP_MAX:
+ return BLENDOP_MAX;
+ default:
+ ASSERT(false);
+ }
+
+ return blendOperationStateAlpha; // Reached only after the default case (unexpected operation value)
+ }
+ }
+
+ bool Context::indexedVertexBlendActive()
+ {
+ // Reported inactive whenever a vertex shader is set; otherwise
+ // the enable flag decides.
+ if(vertexShader)
+ return false;
+
+ return indexedVertexBlendEnable;
+ }
+
+ int Context::vertexBlendMatrixCountActive()
+ {
+ // Reported as zero whenever a vertex shader is set; otherwise
+ // the configured matrix count is returned.
+ if(vertexShader)
+ return 0;
+
+ return vertexBlendMatrixCount;
+ }
+
+ bool Context::localViewerActive()
+ {
+ // Reported inactive whenever a vertex shader is set; otherwise
+ // the localViewer flag decides.
+ if(vertexShader)
+ return false;
+
+ return localViewer;
+ }
+
+ bool Context::normalizeNormalsActive()
+ {
+ // Reported inactive whenever a vertex shader is set; otherwise
+ // the normalizeNormals flag decides.
+ if(vertexShader)
+ return false;
+
+ return normalizeNormals;
+ }
+
+ FogMode Context::vertexFogModeActive()
+ {
+ // The configured vertex fog mode is reported only when no vertex
+ // shader is set and fog is active; otherwise FOG_NONE.
+ const bool fixedFunctionFog = !vertexShader && fogActive();
+ if(!fixedFunctionFog) return FOG_NONE;
+
+ return vertexFogMode;
+ }
+
+ bool Context::rangeFogActive()
+ {
+ // Range fog is reported only when no vertex shader is set and
+ // fog is active; then the enable flag decides.
+ const bool fixedFunctionFog = !vertexShader && fogActive();
+ if(!fixedFunctionFog) return false;
+
+ return rangeFogEnable;
+ }
+
+ TexGen Context::texGenActive(int stage)
+ {
+ // The configured generation mode is reported only when no vertex
+ // shader is set and the coordinate is active; otherwise PASSTHRU.
+ const bool fixedFunctionCoord = !vertexShader && texCoordActive(stage);
+ if(!fixedFunctionCoord) return TEXGEN_PASSTHRU;
+
+ return texGen[stage];
+ }
+
+ int Context::textureTransformCountActive(int stage)
+ {
+ // The configured transform count is reported only when no vertex
+ // shader is set and the coordinate is active; otherwise zero.
+ const bool fixedFunctionCoord = !vertexShader && texCoordActive(stage);
+ if(!fixedFunctionCoord) return 0;
+
+ return textureTransformCount[stage];
+ }
+
+ int Context::texCoordIndexActive(int stage)
+ {
+ // The stage's configured coordinate index is reported only when no
+ // vertex shader is set and the coordinate is active; otherwise the stage number itself.
+ const bool fixedFunctionCoord = !vertexShader && texCoordActive(stage);
+ if(!fixedFunctionCoord) return stage;
+
+ return textureStage[stage].texCoordIndex;
+ }
+
+ bool Context::perspectiveActive()
+ {
+ // Perspective correction is reported active only when all hold:
+ //  - color output is in use,
+ //  - the global perspectiveCorrection switch is on,
+ //  - the draw is not rasterized as points (fill mode aware).
+ const bool colorOutput = colorUsed();
+ if(!colorOutput)
+ {
+ return false;
+ }
+
+ if(!perspectiveCorrection || isDrawPoint(true))
+ {
+ return false;
+ }
+
+ return true;
+ }
+
+ bool Context::diffuseUsed() // True if any of the four diffuse components (0..3) is used
+ {
+ return diffuseUsed(0) || diffuseUsed(1) || diffuseUsed(2) || diffuseUsed(3);
+ }
+
+ bool Context::diffuseUsed(int component)
+ {
+ // Without color output the diffuse color is reported as unused.
+ if(!colorUsed()) return false;
+
+ // A pixel shader answers for itself, per component.
+ if(pixelShader) return pixelShader->usesDiffuse(component);
+
+ // Texture stage path: 'component' is not consulted below, so the
+ // answer is the same for every component.
+
+ // Pass 1: directly using the diffuse input color, considering
+ // stages up to (not including) the first disabled one.
+ for(int stage = 0; stage < 8 && !textureStage[stage].isStageDisabled(); stage++)
+ {
+ if(textureStage[stage].usesDiffuse())
+ {
+ return true;
+ }
+ }
+
+ // Pass 2: using the current color (initialized to diffuse) before
+ // it's overwritten. A stage that reads current, or a disabled
+ // stage, means diffuse is used; a stage that writes current first
+ // means it is not.
+ for(int stage = 0; stage < 8; stage++)
+ {
+ if(textureStage[stage].usesCurrent() || textureStage[stage].isStageDisabled())
+ {
+ return true;
+ }
+
+ if(textureStage[stage].writesCurrent())
+ {
+ return false;
+ }
+ }
+
+ // All eight stages passed without reading or overwriting current.
+
+ return true;
+ }
+
+ bool Context::diffuseActive() // True if any of the four diffuse components (0..3) is active
+ {
+ return diffuseActive(0) || diffuseActive(1) || diffuseActive(2) || diffuseActive(3);
+ }
+
+ bool Context::diffuseActive(int component)
+ {
+ // A diffuse component is active when the vertex stage provides it
+ // and the pixel stage requires it; both need color output.
+ if(!colorUsed())
+ {
+ return false;
+ }
+
+ // Supply side: vertex processor provides diffuse component
+ bool supplied;
+
+ if(vertexShader)
+ {
+ supplied = vertexShader->getOutput(C0, component).active();
+ }
+ else
+ {
+ // The Color0 stream provides it directly; lighting can also
+ // provide it, but only for vertices that are not pre-transformed.
+ supplied = input[Color0];
+ if(!supplied && !preTransformed) supplied = lightingEnable;
+ }
+
+ // Demand side: pixel processor requires diffuse component
+ const bool demanded = diffuseUsed(component);
+ return supplied && demanded;
+ }
+
+ bool Context::specularUsed() // True if any of the four specular components (0..3) is used
+ {
+ return Context::specularUsed(0) || Context::specularUsed(1) || Context::specularUsed(2) || Context::specularUsed(3);
+ }
+
+ bool Context::specularUsed(int component)
+ {
+ // Without color output the specular color is reported as unused.
+ if(!colorUsed())
+ {
+ return false;
+ }
+
+ // A pixel shader answers for itself, per component.
+ if(pixelShader) return pixelShader->usesSpecular(component);
+
+ // Texture stage path ('component' is not consulted): used when
+ // specular is enabled, or when any stage before the first disabled
+ // one uses the specular color.
+ bool used = specularEnable;
+
+ for(int stage = 0; stage < 8 && !textureStage[stage].isStageDisabled(); stage++)
+ {
+ if(!used) used = textureStage[stage].usesSpecular();
+ }
+
+ return used;
+ }
+
+ // True when at least one of the four specular color components is active.
+ bool Context::specularActive()
+ {
+     for(int component = 0; component < 4; component++)
+     {
+         if(specularActive(component))
+         {
+             return true;
+         }
+     }
+
+     return false;
+ }
+
+ // A specular component is active when the vertex stage provides it and the
+ // pixel stage uses it (see specularUsed(component)).
+ bool Context::specularActive(int component)
+ {
+     if(!colorUsed()) return false;
+
+     // Supplied by the vertex processor?
+     bool providedByVertex;
+
+     if(vertexShader)
+     {
+         providedByVertex = vertexShader->getOutput(C1, component).active();
+     }
+     else
+     {
+         providedByVertex = input[Color1] || (lightingEnable && specularEnable);
+     }
+
+     // Consumed by the pixel processor?
+     const bool neededByPixel = specularUsed(component);
+
+     return providedByVertex && neededByPixel;
+ }
+
+ // Dispatches on the color index: 0 selects diffuse, any other value selects specular.
+ bool Context::colorActive(int color, int component)
+ {
+     return (color == 0) ? diffuseActive(component) : specularActive(component);
+ }
+
+ // True when any of the eight texture coordinate sets is active.
+ bool Context::textureActive()
+ {
+     int coordinate = 0;
+
+     // Stop at the first active coordinate set
+     while(coordinate < 8 && !textureActive(coordinate))
+     {
+         coordinate++;
+     }
+
+     return coordinate < 8;
+ }
+
+ // True when at least one of the four components of the coordinate set is active.
+ bool Context::textureActive(int coordinate)
+ {
+     for(int component = 0; component < 4; component++)
+     {
+         if(textureActive(coordinate, component))
+         {
+             return true;
+         }
+     }
+
+     return false;
+ }
+
+ // Reports whether one component of a texture coordinate set is needed.
+ //
+ // The component must be provided (texCoordActive) and color output must be in use.
+ // For projected coordinates with pixel shader model <= 1.3 (0x0103), the component
+ // selected by the transform count is always kept. Otherwise a bound pixel shader
+ // decides, and without one the fixed-function stage/sampler state decides.
+ bool Context::textureActive(int coordinate, int component)
+ {
+     if(!colorUsed() || !texCoordActive(coordinate, component))
+     {
+         return false;
+     }
+
+     if(textureTransformProject[coordinate] && pixelShaderModel() <= 0x0103)
+     {
+         const int count = textureTransformCount[coordinate];
+
+         const bool projectionComponent = (count == 2 && component == 1) ||
+                                          (count == 3 && component == 2) ||
+                                          ((count == 4 || count == 0) && component == 3);
+
+         if(projectionComponent)
+         {
+             return true;
+         }
+     }
+
+     if(pixelShader)
+     {
+         return pixelShader->usesTexture(coordinate, component);
+     }
+
+     // Fixed-function pipeline
+     const bool texture = textureStage[coordinate].usesTexture();
+     const bool cube = sampler[coordinate].hasCubeTexture();
+     const bool volume = sampler[coordinate].hasVolumeTexture();
+
+     if(texture)
+     {
+         // A disabled stage at or below this one means this stage is never reached
+         for(int stage = 0; stage <= coordinate; stage++)
+         {
+             if(textureStage[stage].stageOperation == TextureStage::STAGE_DISABLE)
+             {
+                 return false;
+             }
+         }
+     }
+
+     if(component == 0 || component == 1)
+     {
+         return texture;
+     }
+
+     if(component == 2)
+     {
+         return texture && (cube || volume);   // Third coordinate only for cube and volume textures
+     }
+
+     return false;
+ }
+
+ // Shader model of the bound pixel shader, or 0x0000 when none is bound.
+ unsigned short Context::pixelShaderModel() const
+ {
+     if(!pixelShader)
+     {
+         return 0x0000;
+     }
+
+     return pixelShader->getShaderModel();
+ }
+
+ // Shader model of the bound vertex shader, or 0x0000 when none is bound.
+ unsigned short Context::vertexShaderModel() const
+ {
+     if(!vertexShader)
+     {
+         return 0x0000;
+     }
+
+     return vertexShader->getShaderModel();
+ }
+
+ // Multisample count of render target 0, or 1 when no such target is bound.
+ int Context::getMultiSampleCount() const
+ {
+     if(!renderTarget[0])
+     {
+         return 1;
+     }
+
+     return renderTarget[0]->getMultiSampleCount();
+ }
+
+ // Supersample count of render target 0, or 1 when no such target is bound.
+ int Context::getSuperSampleCount() const
+ {
+     if(!renderTarget[0])
+     {
+         return 1;
+     }
+
+     return renderTarget[0]->getSuperSampleCount();
+ }
+
+ // Internal format of the render target in the given slot; FORMAT_NULL for an empty slot.
+ Format Context::renderTargetInternalFormat(int index)
+ {
+     if(!renderTarget[index])
+     {
+         return FORMAT_NULL;
+     }
+
+     return renderTarget[index]->getInternalFormat();
+ }
+
+ // Returns the bitwise OR of the effective color write masks of all render target
+ // slots; zero means no color channel of any target is written.
+ //
+ // Every one of the RENDERTARGETS slots is visited (renderTarget[] and
+ // colorWriteMask[] are sized by that constant) rather than a hard-coded first four,
+ // so targets in higher slots are not ignored.
+ int Context::colorWriteActive()
+ {
+     int mask = 0;
+
+     for(int index = 0; index < RENDERTARGETS; index++)
+     {
+         mask |= colorWriteActive(index);
+     }
+
+     return mask;
+ }
+
+ // Effective color write mask of one render target slot.
+ // Returns 0 when the slot is empty or has FORMAT_NULL, or when blending is set up
+ // to copy the destination unchanged (BLENDOP_DEST with a BLEND_ONE destination
+ // factor, for alpha too if separate alpha blending is enabled). Otherwise returns
+ // colorWriteMask[index].
+ int Context::colorWriteActive(int index)
+ {
+     Surface *target = renderTarget[index];
+
+     if(!target || target->getInternalFormat() == FORMAT_NULL)
+     {
+         return 0;
+     }
+
+     const bool colorKeepsDest = blendOperation() == BLENDOP_DEST && destBlendFactor() == BLEND_ONE;
+
+     if(colorKeepsDest)
+     {
+         const bool alphaKeepsDest = !separateAlphaBlendEnable ||
+                                     (blendOperationAlpha() == BLENDOP_DEST && destBlendFactorAlpha() == BLEND_ONE);
+
+         if(alphaKeepsDest)
+         {
+             return 0;
+         }
+     }
+
+     return colorWriteMask[index];
+ }
+
+ // Color is considered used when something is written to a render target, the alpha
+ // test is active, or the bound pixel shader contains a kill instruction.
+ bool Context::colorUsed()
+ {
+     if(colorWriteActive() || alphaTestActive())
+     {
+         return true;
+     }
+
+     return pixelShader && pixelShader->containsKill();
+ }
+}
diff --git a/src/Device/Context.hpp b/src/Device/Context.hpp
new file mode 100644
index 0000000..d9110d8
--- /dev/null
+++ b/src/Device/Context.hpp
@@ -0,0 +1,542 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Context_hpp
+#define sw_Context_hpp
+
+#include "Sampler.hpp"
+#include "TextureStage.hpp"
+#include "Stream.hpp"
+#include "Point.hpp"
+#include "Vertex.hpp"
+#include "Common/Types.hpp"
+
+namespace sw
+{
+ class Sampler;
+ class Surface;
+ class PixelShader;
+ class VertexShader;
+ struct Triangle;
+ struct Primitive;
+ struct Vertex;
+ class Resource;
+
+ // Slots of the default vertex input stream layout. The values are used as indices
+ // into Context::input (e.g. input[Color0], input[Color1]).
+ enum In // Default input stream semantic
+ {
+ Position = 0,
+ BlendWeight = 1,
+ BlendIndices = 2,
+ Normal = 3,
+ PointSize = 4,
+ Color0 = 5,
+ Color1 = 6,
+ TexCoord0 = 7,
+ TexCoord1 = 8,
+ TexCoord2 = 9,
+ TexCoord3 = 10,
+ TexCoord4 = 11,
+ TexCoord5 = 12,
+ TexCoord6 = 13,
+ TexCoord7 = 14,
+ PositionT = 15
+ };
+
+ // Primitive topology combined with the index type of a draw call.
+ // The low bits (0x00-0x07) select the topology; 0x10/0x20/0x30 select 8-, 16- or
+ // 32-bit indices (0x00 = non-indexed). Indexed variants are the bitwise OR of both.
+ // NOTE(review): ENUM_UNDERLYING_TYPE_UNSIGNED_INT is defined elsewhere; by its name
+ // it presumably fixes the underlying type to unsigned int where supported - confirm.
+ enum DrawType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ // These types must stay ordered by vertices per primitive. Also, if these basic types
+ // are modified, verify the value assigned to task->verticesPerPrimitive in Renderer.cpp
+ DRAW_POINTLIST = 0x00,
+ DRAW_LINELIST = 0x01,
+ DRAW_LINESTRIP = 0x02,
+ DRAW_LINELOOP = 0x03,
+ DRAW_TRIANGLELIST = 0x04,
+ DRAW_TRIANGLESTRIP = 0x05,
+ DRAW_TRIANGLEFAN = 0x06,
+ DRAW_QUADLIST = 0x07,
+
+ // Index type flags (DRAW_NONINDEXED shares the value 0x00 with DRAW_POINTLIST)
+ DRAW_NONINDEXED = 0x00,
+ DRAW_INDEXED8 = 0x10,
+ DRAW_INDEXED16 = 0x20,
+ DRAW_INDEXED32 = 0x30,
+
+ // 8-bit indexed topologies
+ DRAW_INDEXEDPOINTLIST8 = DRAW_POINTLIST | DRAW_INDEXED8,
+ DRAW_INDEXEDLINELIST8 = DRAW_LINELIST | DRAW_INDEXED8,
+ DRAW_INDEXEDLINESTRIP8 = DRAW_LINESTRIP | DRAW_INDEXED8,
+ DRAW_INDEXEDLINELOOP8 = DRAW_LINELOOP | DRAW_INDEXED8,
+ DRAW_INDEXEDTRIANGLELIST8 = DRAW_TRIANGLELIST | DRAW_INDEXED8,
+ DRAW_INDEXEDTRIANGLESTRIP8 = DRAW_TRIANGLESTRIP | DRAW_INDEXED8,
+ DRAW_INDEXEDTRIANGLEFAN8 = DRAW_TRIANGLEFAN | DRAW_INDEXED8,
+
+ // 16-bit indexed topologies
+ DRAW_INDEXEDPOINTLIST16 = DRAW_POINTLIST | DRAW_INDEXED16,
+ DRAW_INDEXEDLINELIST16 = DRAW_LINELIST | DRAW_INDEXED16,
+ DRAW_INDEXEDLINESTRIP16 = DRAW_LINESTRIP | DRAW_INDEXED16,
+ DRAW_INDEXEDLINELOOP16 = DRAW_LINELOOP | DRAW_INDEXED16,
+ DRAW_INDEXEDTRIANGLELIST16 = DRAW_TRIANGLELIST | DRAW_INDEXED16,
+ DRAW_INDEXEDTRIANGLESTRIP16 = DRAW_TRIANGLESTRIP | DRAW_INDEXED16,
+ DRAW_INDEXEDTRIANGLEFAN16 = DRAW_TRIANGLEFAN | DRAW_INDEXED16,
+
+ // 32-bit indexed topologies
+ DRAW_INDEXEDPOINTLIST32 = DRAW_POINTLIST | DRAW_INDEXED32,
+ DRAW_INDEXEDLINELIST32 = DRAW_LINELIST | DRAW_INDEXED32,
+ DRAW_INDEXEDLINESTRIP32 = DRAW_LINESTRIP | DRAW_INDEXED32,
+ DRAW_INDEXEDLINELOOP32 = DRAW_LINELOOP | DRAW_INDEXED32,
+ DRAW_INDEXEDTRIANGLELIST32 = DRAW_TRIANGLELIST | DRAW_INDEXED32,
+ DRAW_INDEXEDTRIANGLESTRIP32 = DRAW_TRIANGLESTRIP | DRAW_INDEXED32,
+ DRAW_INDEXEDTRIANGLEFAN32 = DRAW_TRIANGLEFAN | DRAW_INDEXED32,
+
+ // Largest enumerator value
+ DRAW_LAST = DRAW_INDEXEDTRIANGLEFAN32
+ };
+
+ // Polygon fill modes; stored in Context::fillMode.
+ enum FillMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ FILL_SOLID,
+ FILL_WIREFRAME,
+ FILL_VERTEX,
+
+ // Alias of the final enumerator
+ FILL_LAST = FILL_VERTEX
+ };
+
+ // Shading modes; stored in Context::shadingMode.
+ enum ShadingMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ SHADING_FLAT,
+ SHADING_GOURAUD,
+
+ // Alias of the final enumerator
+ SHADING_LAST = SHADING_GOURAUD
+ };
+
+ // Comparison functions for the depth test; stored in Context::depthCompareMode.
+ enum DepthCompareMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ DEPTH_ALWAYS,
+ DEPTH_NEVER,
+ DEPTH_EQUAL,
+ DEPTH_NOTEQUAL,
+ DEPTH_LESS,
+ DEPTH_LESSEQUAL,
+ DEPTH_GREATER,
+ DEPTH_GREATEREQUAL,
+
+ // Alias of the final enumerator
+ DEPTH_LAST = DEPTH_GREATEREQUAL
+ };
+
+ // Comparison functions for the stencil test; stored in Context::stencilCompareMode
+ // and Context::stencilCompareModeCCW.
+ enum StencilCompareMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ STENCIL_ALWAYS,
+ STENCIL_NEVER,
+ STENCIL_EQUAL,
+ STENCIL_NOTEQUAL,
+ STENCIL_LESS,
+ STENCIL_LESSEQUAL,
+ STENCIL_GREATER,
+ STENCIL_GREATEREQUAL,
+
+ // Alias of the final enumerator
+ STENCIL_LAST = STENCIL_GREATEREQUAL
+ };
+
+ // Stencil buffer update operations; used by the stencil fail / pass / z-fail
+ // operation members of Context (and their CCW counterparts).
+ enum StencilOperation ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ OPERATION_KEEP,
+ OPERATION_ZERO,
+ OPERATION_REPLACE,
+ OPERATION_INCRSAT,
+ OPERATION_DECRSAT,
+ OPERATION_INVERT,
+ OPERATION_INCR,
+ OPERATION_DECR,
+
+ // Alias of the final enumerator
+ OPERATION_LAST = OPERATION_DECR
+ };
+
+ // Comparison functions for the alpha test; stored in Context::alphaCompareMode.
+ enum AlphaCompareMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ ALPHA_ALWAYS,
+ ALPHA_NEVER,
+ ALPHA_EQUAL,
+ ALPHA_NOTEQUAL,
+ ALPHA_LESS,
+ ALPHA_LESSEQUAL,
+ ALPHA_GREATER,
+ ALPHA_GREATEREQUAL,
+
+ // Alias of the final enumerator
+ ALPHA_LAST = ALPHA_GREATEREQUAL
+ };
+
+ // Face culling modes; stored in Context::cullMode.
+ enum CullMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ CULL_NONE,
+ CULL_CLOCKWISE,
+ CULL_COUNTERCLOCKWISE,
+
+ // Alias of the final enumerator
+ CULL_LAST = CULL_COUNTERCLOCKWISE
+ };
+
+ // Blend factors; used for the source/destination factor state of Context,
+ // for color and (when separate alpha blending is enabled) alpha.
+ enum BlendFactor ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ BLEND_ZERO,
+ BLEND_ONE,
+ BLEND_SOURCE,
+ BLEND_INVSOURCE,
+ BLEND_DEST,
+ BLEND_INVDEST,
+ BLEND_SOURCEALPHA,
+ BLEND_INVSOURCEALPHA,
+ BLEND_DESTALPHA,
+ BLEND_INVDESTALPHA,
+ BLEND_SRCALPHASAT,
+ BLEND_CONSTANT,
+ BLEND_INVCONSTANT,
+ BLEND_CONSTANTALPHA,
+ BLEND_INVCONSTANTALPHA,
+
+ // Alias of the final enumerator
+ BLEND_LAST = BLEND_INVCONSTANTALPHA
+ };
+
+ // Blend operations. Context::colorWriteActive(int) treats BLENDOP_DEST combined
+ // with a BLEND_ONE destination factor as "nothing is written".
+ enum BlendOperation ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ BLENDOP_ADD,
+ BLENDOP_SUB,
+ BLENDOP_INVSUB,
+ BLENDOP_MIN,
+ BLENDOP_MAX,
+
+ BLENDOP_SOURCE, // Copy source
+ BLENDOP_DEST, // Copy dest
+ BLENDOP_NULL, // Nullify result
+
+ // Alias of the final enumerator
+ BLENDOP_LAST = BLENDOP_NULL
+ };
+
+ // Logical (bitwise) framebuffer operations; stored in Context::logicalOperation.
+ enum LogicalOperation ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ LOGICALOP_CLEAR,
+ LOGICALOP_SET,
+ LOGICALOP_COPY,
+ LOGICALOP_COPY_INVERTED,
+ LOGICALOP_NOOP,
+ LOGICALOP_INVERT,
+ LOGICALOP_AND,
+ LOGICALOP_NAND,
+ LOGICALOP_OR,
+ LOGICALOP_NOR,
+ LOGICALOP_XOR,
+ LOGICALOP_EQUIV,
+ LOGICALOP_AND_REVERSE,
+ LOGICALOP_AND_INVERTED,
+ LOGICALOP_OR_REVERSE,
+ LOGICALOP_OR_INVERTED,
+
+ // Alias of the final enumerator
+ LOGICALOP_LAST = LOGICALOP_OR_INVERTED
+ };
+
+ // Source selection for the ambient, diffuse, specular and emissive material
+ // members of Context.
+ enum MaterialSource ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ MATERIAL_MATERIAL,
+ MATERIAL_COLOR1,
+ MATERIAL_COLOR2,
+
+ // Alias of the final enumerator
+ MATERIAL_LAST = MATERIAL_COLOR2
+ };
+
+ // Fog modes; stored in Context::pixelFogMode and Context::vertexFogMode.
+ enum FogMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ FOG_NONE,
+ FOG_LINEAR,
+ FOG_EXP,
+ FOG_EXP2,
+
+ // Alias of the final enumerator
+ FOG_LAST = FOG_EXP2
+ };
+
+ // Texture coordinate generation modes; stored per stage in Context::texGen.
+ enum TexGen ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ TEXGEN_PASSTHRU,
+ TEXGEN_NORMAL,
+ TEXGEN_POSITION,
+ TEXGEN_REFLECTION,
+ TEXGEN_SPHEREMAP,
+ TEXGEN_NONE,
+
+ // Alias of the final enumerator
+ TEXGEN_LAST = TEXGEN_NONE
+ };
+
+ // Transparency antialiasing modes.
+ // NOTE(review): no Context member of this type is declared in this header;
+ // the consumer is presumably elsewhere - confirm.
+ enum TransparencyAntialiasing ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ TRANSPARENCY_NONE,
+ TRANSPARENCY_ALPHA_TO_COVERAGE,
+
+ // Alias of the final enumerator
+ TRANSPARENCY_LAST = TRANSPARENCY_ALPHA_TO_COVERAGE
+ };
+
+ // Aggregates the rendering state of a draw: the raw state values set by the API
+ // layer (public data members and set*() methods) plus query methods (*Active(),
+ // *Used()) that derive the effective state from combinations of those values.
+ class Context
+ {
+ public:
+ Context();
+
+ ~Context();
+
+ // Class-specific allocation functions (defined in Context.cpp)
+ void *operator new(size_t bytes);
+ void operator delete(void *pointer, size_t bytes);
+
+ // Primitive class of the current drawType; fillModeAware presumably lets the
+ // fill mode change the answer - confirm against the definitions
+ bool isDrawPoint(bool fillModeAware = false) const;
+ bool isDrawLine(bool fillModeAware = false) const;
+ bool isDrawTriangle(bool fillModeAware = false) const;
+
+ void init();
+
+ const float &exp2Bias(); // NOTE: Needs address for JIT
+
+ const Point &getLightPosition(int light);
+
+ void setGlobalMipmapBias(float bias);
+
+ // Set fixed-function vertex pipeline states
+ void setLightingEnable(bool lightingEnable);
+ void setSpecularEnable(bool specularEnable);
+ void setLightEnable(int light, bool lightEnable);
+ void setLightPosition(int light, Point worldLightPosition);
+
+ void setColorVertexEnable(bool colorVertexEnable);
+ void setAmbientMaterialSource(MaterialSource ambientMaterialSource);
+ void setDiffuseMaterialSource(MaterialSource diffuseMaterialSource);
+ void setSpecularMaterialSource(MaterialSource specularMaterialSource);
+ void setEmissiveMaterialSource(MaterialSource emissiveMaterialSource);
+
+ void setPointSpriteEnable(bool pointSpriteEnable);
+ void setPointScaleEnable(bool pointScaleEnable);
+
+ // Set fixed-function pixel pipeline states, return true when modified
+ bool setDepthBufferEnable(bool depthBufferEnable);
+
+ bool setAlphaBlendEnable(bool alphaBlendEnable);
+ bool setSourceBlendFactor(BlendFactor sourceBlendFactor);
+ bool setDestBlendFactor(BlendFactor destBlendFactor);
+ bool setBlendOperation(BlendOperation blendOperation);
+
+ bool setSeparateAlphaBlendEnable(bool separateAlphaBlendEnable);
+ bool setSourceBlendFactorAlpha(BlendFactor sourceBlendFactorAlpha);
+ bool setDestBlendFactorAlpha(BlendFactor destBlendFactorAlpha);
+ bool setBlendOperationAlpha(BlendOperation blendOperationAlpha);
+
+ bool setColorWriteMask(int index, int colorWriteMask);
+ bool setWriteSRGB(bool sRGB);
+
+ bool setColorLogicOpEnabled(bool colorLogicOpEnabled);
+ bool setLogicalOperation(LogicalOperation logicalOperation);
+
+ // Active fixed-function pixel pipeline states
+ bool fogActive();
+ bool pointSizeActive();
+ FogMode pixelFogActive();
+ bool depthWriteActive();
+ bool alphaTestActive();
+ bool depthBufferActive();
+ bool stencilActive();
+
+ bool perspectiveActive();
+
+ // Active fixed-function vertex pipeline states
+ bool vertexLightingActive();
+ bool texCoordActive(int coordinate, int component);
+ bool texCoordActive(int coordinate);
+ bool isProjectionComponent(unsigned int coordinate, int component);
+ bool vertexSpecularInputActive();
+ bool vertexSpecularActive();
+ bool vertexNormalActive();
+ bool vertexLightActive();
+ bool vertexLightActive(int i);
+ MaterialSource vertexDiffuseMaterialSourceActive();
+ MaterialSource vertexSpecularMaterialSourceActive();
+ MaterialSource vertexAmbientMaterialSourceActive();
+ MaterialSource vertexEmissiveMaterialSourceActive();
+
+ bool pointSpriteActive();
+ bool pointScaleActive();
+
+ bool alphaBlendActive();
+ BlendFactor sourceBlendFactor();
+ BlendFactor destBlendFactor();
+ BlendOperation blendOperation();
+
+ BlendFactor sourceBlendFactorAlpha();
+ BlendFactor destBlendFactorAlpha();
+ BlendOperation blendOperationAlpha();
+
+ LogicalOperation colorLogicOp();
+ LogicalOperation indexLogicOp();
+
+ bool indexedVertexBlendActive();
+ int vertexBlendMatrixCountActive();
+ bool localViewerActive();
+ bool normalizeNormalsActive();
+ FogMode vertexFogModeActive();
+ bool rangeFogActive();
+
+ TexGen texGenActive(int stage);
+ int textureTransformCountActive(int stage);
+ int texCoordIndexActive(int stage);
+
+ // Active context states
+ // *Used(): required by the pixel processor (does not check the vertex processor)
+ // *Active(): provided by the vertex processor and required by the pixel processor
+ bool diffuseUsed(); // Any diffuse component required by the pixel processor
+ bool diffuseUsed(int component); // Given diffuse component required by the pixel processor
+ bool diffuseActive();
+ bool diffuseActive(int component);
+ bool specularUsed();
+ bool specularUsed(int component);
+ bool specularActive();
+ bool specularActive(int component);
+ bool colorActive(int color, int component); // color 0 = diffuse, otherwise specular
+ bool textureActive();
+ bool textureActive(int coordinate);
+ bool textureActive(int coordinate, int component);
+
+ // Shader model of the bound shader, 0x0000 when none is bound
+ unsigned short pixelShaderModel() const;
+ unsigned short vertexShaderModel() const;
+
+ // Taken from renderTarget[0]; 1 when it is not bound
+ int getMultiSampleCount() const;
+ int getSuperSampleCount() const;
+
+ DrawType drawType;
+
+ // Stencil states
+ bool stencilEnable;
+ StencilCompareMode stencilCompareMode;
+ int stencilReference;
+ int stencilMask;
+ StencilOperation stencilFailOperation;
+ StencilOperation stencilPassOperation;
+ StencilOperation stencilZFailOperation;
+ int stencilWriteMask;
+
+ // Second set of stencil states (CCW suffix), alongside the twoSidedStencil flag
+ bool twoSidedStencil;
+ StencilCompareMode stencilCompareModeCCW;
+ int stencilReferenceCCW;
+ int stencilMaskCCW;
+ StencilOperation stencilFailOperationCCW;
+ StencilOperation stencilPassOperationCCW;
+ StencilOperation stencilZFailOperationCCW;
+ int stencilWriteMaskCCW;
+
+ // Pixel processor states
+ AlphaCompareMode alphaCompareMode;
+ bool alphaTestEnable;
+ FillMode fillMode;
+ ShadingMode shadingMode;
+
+ CullMode cullMode;
+ bool frontFacingCCW;
+ float alphaReference;
+
+ float depthBias;
+ float slopeDepthBias;
+
+ TextureStage textureStage[8];
+ Sampler sampler[TOTAL_IMAGE_UNITS];
+
+ Format renderTargetInternalFormat(int index); // FORMAT_NULL when the slot is empty
+ int colorWriteActive(); // Combined (OR) write mask of the render targets
+ int colorWriteActive(int index); // Effective write mask of one render target
+ bool colorUsed(); // Color written, alpha test active, or shader contains kill
+
+ Resource *texture[TOTAL_IMAGE_UNITS];
+ Stream input[MAX_VERTEX_INPUTS];
+ Resource *indexBuffer;
+
+ bool preTransformed; // FIXME: Private
+
+ float fogStart;
+ float fogEnd;
+
+ void computeIllumination();
+
+ bool textureWrapActive;
+ unsigned char textureWrap[TEXTURE_IMAGE_UNITS];
+ TexGen texGen[8];
+ bool localViewer;
+ bool normalizeNormals;
+ int textureTransformCount[8];
+ bool textureTransformProject[8];
+
+ // Output surfaces
+ Surface *renderTarget[RENDERTARGETS];
+ unsigned int renderTargetLayer[RENDERTARGETS];
+ Surface *depthBuffer;
+ unsigned int depthBufferLayer;
+ Surface *stencilBuffer;
+ unsigned int stencilBufferLayer;
+
+ // Fog
+ bool fogEnable;
+ FogMode pixelFogMode;
+ FogMode vertexFogMode;
+ bool wBasedFog;
+ bool rangeFogEnable;
+
+ // Vertex blending
+ bool indexedVertexBlendEnable;
+ int vertexBlendMatrixCount;
+
+ // Shaders
+ const PixelShader *pixelShader;
+ const VertexShader *vertexShader;
+
+ // Global mipmap bias
+ float bias;
+
+ // Instancing
+ int instanceID;
+
+ // Fixed-function vertex pipeline state
+ bool lightingEnable;
+ bool specularEnable;
+ bool lightEnable[8];
+ Point worldLightPosition[8];
+
+ MaterialSource ambientMaterialSource;
+ MaterialSource diffuseMaterialSource;
+ MaterialSource specularMaterialSource;
+ MaterialSource emissiveMaterialSource;
+ bool colorVertexEnable;
+
+ bool occlusionEnabled;
+ bool transformFeedbackQueryEnabled;
+ uint64_t transformFeedbackEnabled;
+
+ // Pixel processor states
+ bool rasterizerDiscard;
+ bool depthBufferEnable;
+ DepthCompareMode depthCompareMode;
+ bool depthWriteEnable;
+
+ // Raw blend state; the sourceBlendFactor()/destBlendFactor()/blendOperation()
+ // methods above return the values the pipeline actually uses
+ bool alphaBlendEnable;
+ BlendFactor sourceBlendFactorState;
+ BlendFactor destBlendFactorState;
+ BlendOperation blendOperationState;
+
+ bool separateAlphaBlendEnable;
+ BlendFactor sourceBlendFactorStateAlpha;
+ BlendFactor destBlendFactorStateAlpha;
+ BlendOperation blendOperationStateAlpha;
+
+ bool pointSpriteEnable;
+ bool pointScaleEnable;
+ float lineWidth;
+
+ int colorWriteMask[RENDERTARGETS]; // RGBA
+ bool writeSRGB;
+ unsigned int sampleMask;
+ unsigned int multiSampleMask;
+
+ bool colorLogicOpEnabled;
+ LogicalOperation logicalOperation;
+ };
+}
+
+#endif // sw_Context_hpp
diff --git a/src/Device/ETC_Decoder.cpp b/src/Device/ETC_Decoder.cpp
new file mode 100644
index 0000000..dbc6276
--- /dev/null
+++ b/src/Device/ETC_Decoder.cpp
@@ -0,0 +1,741 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ETC_Decoder.hpp"
+
+namespace
+{
+ // Clamps value to the unsigned byte range [0, 255].
+ inline int clampByte(int value)
+ {
+     if(value < 0)
+     {
+         return 0;
+     }
+
+     if(value > 255)
+     {
+         return 255;
+     }
+
+     return value;
+ }
+
+ // Clamps value to the signed byte range [-128, 127].
+ inline int clampSByte(int value)
+ {
+     if(value < -128)
+     {
+         return -128;
+     }
+
+     if(value > 127)
+     {
+         return 127;
+     }
+
+     return value;
+ }
+
+ // Clamps an EAC value to [-1023, 1023] for signed data or [0, 2047] for unsigned data.
+ inline int clampEAC(int value, bool isSigned)
+ {
+     int lower = 0;
+     int upper = 2047;
+
+     if(isSigned)
+     {
+         lower = -1023;
+         upper = 1023;
+     }
+
+     if(value < lower)
+     {
+         return lower;
+     }
+
+     return (value > upper) ? upper : value;
+ }
+
+ // One 8-bit-per-channel texel, stored in memory as blue, green, red, alpha.
+ struct bgra8
+ {
+     unsigned char b;
+     unsigned char g;
+     unsigned char r;
+     unsigned char a;
+
+     // Leaves all channels uninitialized
+     inline bgra8()
+     {
+     }
+
+     // Stores the color channels, each clamped to [0, 255]; alpha is left untouched
+     inline void set(int red, int green, int blue)
+     {
+         r = static_cast<unsigned char>(clampByte(red));
+         g = static_cast<unsigned char>(clampByte(green));
+         b = static_cast<unsigned char>(clampByte(blue));
+     }
+
+     // Stores all four channels, each clamped to [0, 255]
+     inline void set(int red, int green, int blue, int alpha)
+     {
+         set(red, green, blue);
+         a = static_cast<unsigned char>(clampByte(alpha));
+     }
+
+     // Overwrites alpha (not clamped, only truncated to a byte) and returns this texel
+     const bgra8& addA(int alpha)
+     {
+         a = static_cast<unsigned char>(alpha);
+         return *this;
+     }
+ };
+
+ // Expands a 4-bit value to 8 bits by replicating it into the low nibble.
+ inline int extend_4to8bits(int x)
+ {
+     const int high = x << 4;
+
+     return high | x;
+ }
+
+ // Expands a 5-bit value to 8 bits by replicating its top 3 bits into the low bits.
+ inline int extend_5to8bits(int x)
+ {
+     const int high = x << 3;
+     const int low = x >> 2;
+
+     return high | low;
+ }
+
+ // Expands a 6-bit value to 8 bits by replicating its top 2 bits into the low bits.
+ inline int extend_6to8bits(int x)
+ {
+     const int high = x << 2;
+     const int low = x >> 4;
+
+     return high | low;
+ }
+
+ // Expands a 7-bit value to 8 bits by replicating its top bit into the low bit.
+ inline int extend_7to8bits(int x)
+ {
+     const int high = x << 1;
+     const int low = x >> 6;
+
+     return high | low;
+ }
+
+ struct ETC2
+ {
+ // Decodes one 4x4 single or dual channel block (signed or unsigned).
+ //   sources    one encoded block per channel (nbChannels entries)
+ //   dest       first output texel of the block; rows are pitch bytes apart
+ //   x, y       position of the block; texels at or beyond w / h are skipped
+ //   isSigned   selects signed decoding and the signed clamp range
+ //   isEAC      stores each channel as an int clamped by clampEAC; otherwise each
+ //              channel is one byte (signed char when isSigned, else unsigned char)
+ static void DecodeBlock(const ETC2** sources, unsigned char *dest, int nbChannels, int x, int y, int w, int h, int pitch, bool isSigned, bool isEAC)
+ {
+     for(int j = 0; j < 4 && (y + j) < h; j++)
+     {
+         unsigned char *row = dest + j * pitch;
+
+         for(int i = 0; i < 4 && (x + i) < w; i++)
+         {
+             for(int c = nbChannels - 1; c >= 0; c--)
+             {
+                 const int element = i * nbChannels + c;
+                 const int value = sources[c]->getSingleChannel(i, j, isSigned, isEAC);
+
+                 if(isEAC)
+                 {
+                     reinterpret_cast<int*>(row)[element] = clampEAC(value, isSigned);
+                 }
+                 else if(isSigned)
+                 {
+                     reinterpret_cast<signed char*>(row)[element] = clampSByte(value);
+                 }
+                 else
+                 {
+                     row[element] = clampByte(value);
+                 }
+             }
+         }
+     }
+ }
+
+ // Decodes one RGB block to bgra8 texels, picking the decoding mode from the block bits.
+ // Without the differential bit (and without punch-through alpha) the block is in
+ // individual mode. Otherwise an out-of-range R, G or B base+delta sum (outside
+ // [0, 31]) selects T, H or planar mode respectively, and in-range sums select
+ // differential mode.
+ void decodeBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, unsigned char alphaValues[4][4], bool punchThroughAlpha) const
+ {
+     // With punch-through alpha, the differential bit serves as the opaque bit
+     const bool nonOpaquePunchThroughAlpha = punchThroughAlpha && !diffbit;
+
+     if(!diffbit && !punchThroughAlpha)
+     {
+         decodeIndividualBlock(dest, x, y, w, h, pitch, alphaValues, nonOpaquePunchThroughAlpha);
+         return;
+     }
+
+     const int red = R + dR;
+     const int green = G + dG;
+     const int blue = B + dB;
+
+     if(red < 0 || red > 31)
+     {
+         decodeTBlock(dest, x, y, w, h, pitch, alphaValues, nonOpaquePunchThroughAlpha);
+     }
+     else if(green < 0 || green > 31)
+     {
+         decodeHBlock(dest, x, y, w, h, pitch, alphaValues, nonOpaquePunchThroughAlpha);
+     }
+     else if(blue < 0 || blue > 31)
+     {
+         decodePlanarBlock(dest, x, y, w, h, pitch, alphaValues);
+     }
+     else
+     {
+         decodeDifferentialBlock(dest, x, y, w, h, pitch, alphaValues, nonOpaquePunchThroughAlpha);
+     }
+ }
+
+ private:
+ // Raw storage of one encoded block (8 bytes). The nested anonymous unions overlay
+ // several bit-field views on the same bytes; which view is valid depends on the
+ // mode chosen by decodeBlock(), or on the caller for single channel blocks.
+ // NOTE(review): bit-field allocation order is implementation-defined; the fields
+ // appear to be listed from the least significant bit of each byte upward - confirm
+ // for every supported compiler.
+ struct
+ {
+ union
+ {
+ // Individual, differential, H and T modes
+ struct
+ {
+ // First four bytes: base colors and mode bits
+ union
+ {
+ // Individual and differential modes
+ struct
+ {
+ union
+ {
+ struct // Individual colors
+ {
+ unsigned char R2 : 4;
+ unsigned char R1 : 4;
+ unsigned char G2 : 4;
+ unsigned char G1 : 4;
+ unsigned char B2 : 4;
+ unsigned char B1 : 4;
+ };
+
+ struct // Differential colors
+ {
+ signed char dR : 3;
+ unsigned char R : 5;
+ signed char dG : 3;
+ unsigned char G : 5;
+ signed char dB : 3;
+ unsigned char B : 5;
+ };
+ };
+
+ // flipbit selects how the block is split into two subblocks,
+ // cw1 / cw2 index the intensity modifier table for each subblock
+ bool flipbit : 1;
+ bool diffbit : 1;
+ unsigned char cw2 : 3;
+ unsigned char cw1 : 3;
+ };
+
+ // T mode
+ struct
+ {
+ // Byte 1
+ unsigned char TR1b : 2;
+ unsigned char TdummyB : 1;
+ unsigned char TR1a : 2;
+ unsigned char TdummyA : 3;
+
+ // Byte 2
+ unsigned char TB1 : 4;
+ unsigned char TG1 : 4;
+
+ // Byte 3
+ unsigned char TG2 : 4;
+ unsigned char TR2 : 4;
+
+ // Byte 4
+ unsigned char Tdb : 1;
+ bool Tflipbit : 1;
+ unsigned char Tda : 2;
+ unsigned char TB2 : 4;
+ };
+
+ // H mode
+ struct
+ {
+ // Byte 1
+ unsigned char HG1a : 3;
+ unsigned char HR1 : 4;
+ unsigned char HdummyA : 1;
+
+ // Byte 2
+ unsigned char HB1b : 2;
+ unsigned char HdummyC : 1;
+ unsigned char HB1a : 1;
+ unsigned char HG1b : 1;
+ unsigned char HdummyB : 3;
+
+ // Byte 3
+ unsigned char HG2a : 3;
+ unsigned char HR2 : 4;
+ unsigned char HB1c : 1;
+
+ // Byte 4
+ unsigned char Hdb : 1;
+ bool Hflipbit : 1;
+ unsigned char Hda : 1;
+ unsigned char HB2 : 4;
+ unsigned char HG2b : 1;
+ };
+ };
+
+ // Last four bytes: per-texel 2-bit indices, split into a most significant
+ // and a least significant bit plane (read by getIndex())
+ unsigned char pixelIndexMSB[2];
+ unsigned char pixelIndexLSB[2];
+ };
+
+ // planar mode
+ struct
+ {
+ // Byte 1
+ unsigned char GO1 : 1;
+ unsigned char RO : 6;
+ unsigned char PdummyA : 1;
+
+ // Byte 2
+ unsigned char BO1 : 1;
+ unsigned char GO2 : 6;
+ unsigned char PdummyB : 1;
+
+ // Byte 3
+ unsigned char BO3a : 2;
+ unsigned char PdummyD : 1;
+ unsigned char BO2 : 2;
+ unsigned char PdummyC : 3;
+
+ // Byte 4
+ unsigned char RH2 : 1;
+ bool Pflipbit : 1;
+ unsigned char RH1 : 5;
+ unsigned char BO3b : 1;
+
+ // Byte 5
+ unsigned char BHa : 1;
+ unsigned char GH : 7;
+
+ // Byte 6
+ unsigned char RVa : 3;
+ unsigned char BHb : 5;
+
+ // Byte 7
+ unsigned char GVa : 5;
+ unsigned char RVb : 3;
+
+ // Byte 8
+ unsigned char BV : 6;
+ unsigned char GVb : 2;
+ };
+
+ // Single channel block
+ struct
+ {
+ // Byte 1: base codeword, readable as unsigned or signed
+ union
+ {
+ unsigned char base_codeword;
+ signed char signed_base_codeword;
+ };
+
+ // Byte 2
+ unsigned char table_index : 4;
+ unsigned char multiplier : 4;
+
+ // Bytes 3-8: sixteen 3-bit fields ma..mp; a field that straddles a byte
+ // boundary is split in two parts (e.g. mc1/mc2, mf1/mf2)
+ unsigned char mc1 : 2;
+ unsigned char mb : 3;
+ unsigned char ma : 3;
+
+ unsigned char mf1 : 1;
+ unsigned char me : 3;
+ unsigned char md : 3;
+ unsigned char mc2 : 1;
+
+ unsigned char mh : 3;
+ unsigned char mg : 3;
+ unsigned char mf2 : 2;
+
+ unsigned char mk1 : 2;
+ unsigned char mj : 3;
+ unsigned char mi : 3;
+
+ unsigned char mn1 : 1;
+ unsigned char mm : 3;
+ unsigned char ml : 3;
+ unsigned char mk2 : 1;
+
+ unsigned char mp : 3;
+ unsigned char mo : 3;
+ unsigned char mn2 : 2;
+ };
+ };
+ };
+
+ // Individual mode: both subblock base colors are stored as independent 4-bit
+ // channels, expanded here to 8 bits before the shared subblock decode.
+ void decodeIndividualBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, unsigned char alphaValues[4][4], bool nonOpaquePunchThroughAlpha) const
+ {
+     const int red1 = extend_4to8bits(R1);
+     const int green1 = extend_4to8bits(G1);
+     const int blue1 = extend_4to8bits(B1);
+
+     const int red2 = extend_4to8bits(R2);
+     const int green2 = extend_4to8bits(G2);
+     const int blue2 = extend_4to8bits(B2);
+
+     decodeIndividualOrDifferentialBlock(dest, x, y, w, h, pitch,
+                                         red1, green1, blue1,
+                                         red2, green2, blue2,
+                                         alphaValues, nonOpaquePunchThroughAlpha);
+ }
+
+ // Differential mode: the first base color is stored as 5-bit channels and the
+ // second as the first plus a signed 3-bit delta; both are expanded to 8 bits
+ // before the shared subblock decode.
+ void decodeDifferentialBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, unsigned char alphaValues[4][4], bool nonOpaquePunchThroughAlpha) const
+ {
+     const int red1 = extend_5to8bits(R);
+     const int green1 = extend_5to8bits(G);
+     const int blue1 = extend_5to8bits(B);
+
+     const int red2 = extend_5to8bits(R + dR);
+     const int green2 = extend_5to8bits(G + dG);
+     const int blue2 = extend_5to8bits(B + dB);
+
+     decodeIndividualOrDifferentialBlock(dest, x, y, w, h, pitch,
+                                         red1, green1, blue1,
+                                         red2, green2, blue2,
+                                         alphaValues, nonOpaquePunchThroughAlpha);
+ }
+
+ // Shared decode for individual and differential mode.
+ // Builds a four-entry palette per subblock (base color plus the intensity modifiers
+ // selected by cw1 / cw2) and writes each texel from the palette of its subblock.
+ // With flipbit set the subblocks are rows 0-1 and rows 2-3; otherwise they are
+ // columns 0-1 and columns 2-3. Texels at or beyond w / h are skipped.
+ void decodeIndividualOrDifferentialBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, int r1, int g1, int b1, int r2, int g2, int b2, unsigned char alphaValues[4][4], bool nonOpaquePunchThroughAlpha) const
+ {
+     // Table 3.17.2 sorted according to table 3.17.3
+     static const int intensityModifierDefault[8][4] =
+     {
+         { 2, 8, -2, -8 },
+         { 5, 17, -5, -17 },
+         { 9, 29, -9, -29 },
+         { 13, 42, -13, -42 },
+         { 18, 60, -18, -60 },
+         { 24, 80, -24, -80 },
+         { 33, 106, -33, -106 },
+         { 47, 183, -47, -183 }
+     };
+
+     // Table C.12, intensity modifier for non opaque punchthrough alpha
+     static const int intensityModifierNonOpaque[8][4] =
+     {
+         { 0, 8, 0, -8 },
+         { 0, 17, 0, -17 },
+         { 0, 29, 0, -29 },
+         { 0, 42, 0, -42 },
+         { 0, 60, 0, -60 },
+         { 0, 80, 0, -80 },
+         { 0, 106, 0, -106 },
+         { 0, 183, 0, -183 }
+     };
+
+     const int (*modifiers)[4] = nonOpaquePunchThroughAlpha ? intensityModifierNonOpaque : intensityModifierDefault;
+
+     bgra8 palette0[4];
+     bgra8 palette1[4];
+
+     for(int k = 0; k < 4; k++)
+     {
+         const int m1 = modifiers[cw1][k];
+         palette0[k].set(r1 + m1, g1 + m1, b1 + m1);
+     }
+
+     for(int k = 0; k < 4; k++)
+     {
+         const int m2 = modifiers[cw2][k];
+         palette1[k].set(r2 + m2, g2 + m2, b2 + m2);
+     }
+
+     for(int j = 0; j < 4 && (y + j) < h; j++)
+     {
+         bgra8 *row = reinterpret_cast<bgra8*>(dest + j * pitch);
+
+         for(int i = 0; i < 4 && (x + i) < w; i++)
+         {
+             // Second subblock: lower half when flipped, right half otherwise
+             const bool secondSubblock = flipbit ? (j >= 2) : (i >= 2);
+             bgra8 *palette = secondSubblock ? palette1 : palette0;
+
+             row[i] = palette[getIndex(i, j)].addA(alphaValues[j][i]);
+         }
+     }
+
+     if(nonOpaquePunchThroughAlpha)
+     {
+         decodePunchThroughAlphaBlock(dest, x, y, w, h, pitch);
+     }
+ }
+
+ // T mode: four paint colors are derived from two 4-bit-per-channel base colors.
+ // The first base color is used as is; the second is used as is and offset by
+ // +/- a distance taken from the table. Each texel selects a paint color through
+ // its 2-bit index. Texels at or beyond w / h are skipped.
+ void decodeTBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, unsigned char alphaValues[4][4], bool nonOpaquePunchThroughAlpha) const
+ {
+     // Table C.8, distance index for T and H modes
+     static const int distance[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };
+
+     const int red1 = extend_4to8bits(TR1a << 2 | TR1b);
+     const int green1 = extend_4to8bits(TG1);
+     const int blue1 = extend_4to8bits(TB1);
+
+     const int red2 = extend_4to8bits(TR2);
+     const int green2 = extend_4to8bits(TG2);
+     const int blue2 = extend_4to8bits(TB2);
+
+     const int delta = distance[Tda << 1 | Tdb];
+
+     bgra8 paint[4];
+     paint[0].set(red1, green1, blue1);
+     paint[1].set(red2 + delta, green2 + delta, blue2 + delta);
+     paint[2].set(red2, green2, blue2);
+     paint[3].set(red2 - delta, green2 - delta, blue2 - delta);
+
+     for(int j = 0; j < 4 && (y + j) < h; j++)
+     {
+         bgra8 *row = reinterpret_cast<bgra8*>(dest + j * pitch);
+
+         for(int i = 0; i < 4 && (x + i) < w; i++)
+         {
+             row[i] = paint[getIndex(i, j)].addA(alphaValues[j][i]);
+         }
+     }
+
+     if(nonOpaquePunchThroughAlpha)
+     {
+         decodePunchThroughAlphaBlock(dest, x, y, w, h, pitch);
+     }
+ }
+
+ // H mode: four paint colors are the two 4-bit-per-channel base colors, each offset
+ // by +/- a distance from the table. The lowest bit of the distance index is not
+ // stored; it is 1 when the first base color, packed as 0xRRGGBB, is greater than
+ // or equal to the second. Texels at or beyond w / h are skipped.
+ void decodeHBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, unsigned char alphaValues[4][4], bool nonOpaquePunchThroughAlpha) const
+ {
+     // Table C.8, distance index for T and H modes
+     static const int distance[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };
+
+     const int red1 = extend_4to8bits(HR1);
+     const int green1 = extend_4to8bits(HG1a << 1 | HG1b);
+     const int blue1 = extend_4to8bits(HB1a << 3 | HB1b << 1 | HB1c);
+
+     const int red2 = extend_4to8bits(HR2);
+     const int green2 = extend_4to8bits(HG2a << 1 | HG2b);
+     const int blue2 = extend_4to8bits(HB2);
+
+     const int packed1 = red1 << 16 | green1 << 8 | blue1;
+     const int packed2 = red2 << 16 | green2 << 8 | blue2;
+     const int orderBit = (packed1 >= packed2) ? 1 : 0;
+
+     const int delta = distance[(Hda << 2) | (Hdb << 1) | orderBit];
+
+     bgra8 paint[4];
+     paint[0].set(red1 + delta, green1 + delta, blue1 + delta);
+     paint[1].set(red1 - delta, green1 - delta, blue1 - delta);
+     paint[2].set(red2 + delta, green2 + delta, blue2 + delta);
+     paint[3].set(red2 - delta, green2 - delta, blue2 - delta);
+
+     for(int j = 0; j < 4 && (y + j) < h; j++)
+     {
+         bgra8 *row = reinterpret_cast<bgra8*>(dest + j * pitch);
+
+         for(int i = 0; i < 4 && (x + i) < w; i++)
+         {
+             row[i] = paint[getIndex(i, j)].addA(alphaValues[j][i]);
+         }
+     }
+
+     if(nonOpaquePunchThroughAlpha)
+     {
+         decodePunchThroughAlphaBlock(dest, x, y, w, h, pitch);
+     }
+ }
+
+		// Decodes a planar mode block: every channel is interpolated from an origin
+		// color (O), a horizontal color (H) and a vertical color (V). Texels outside
+		// the w x h image are skipped; alpha is taken from alphaValues[j][i].
+		void decodePlanarBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, unsigned char alphaValues[4][4]) const
+		{
+			const int ro = extend_6to8bits(RO);
+			const int go = extend_7to8bits(GO1 << 6 | GO2);
+			const int bo = extend_6to8bits(BO1 << 5 | BO2 << 3 | BO3a << 1 | BO3b);
+
+			const int rh = extend_6to8bits(RH1 << 1 | RH2);
+			const int gh = extend_7to8bits(GH);
+			const int bh = extend_6to8bits(BHa << 5 | BHb);
+
+			const int rv = extend_6to8bits(RVa << 3 | RVb);
+			const int gv = extend_7to8bits(GVa << 2 | GVb);
+			const int bv = extend_6to8bits(BV);
+
+			// Per-texel gradients along x (H - O) and along y (V - O).
+			const int rdx = rh - ro, gdx = gh - go, bdx = bh - bo;
+			const int rdy = rv - ro, gdy = gv - go, bdy = bv - bo;
+
+			unsigned char *row = dest;
+
+			for(int j = 0; j < 4 && (y + j) < h; j++, row += pitch)
+			{
+				// Vertical contribution plus the rounding term for the >> 2 below.
+				const int ry = j * rdy + 2;
+				const int gy = j * gdy + 2;
+				const int by = j * bdy + 2;
+
+				bgra8 *texels = (bgra8*)row;
+
+				for(int i = 0; i < 4 && (x + i) < w; i++)
+				{
+					const int r = ((i * rdx + ry) >> 2) + ro;
+					const int g = ((i * gdx + gy) >> 2) + go;
+					const int b = ((i * bdx + by) >> 2) + bo;
+
+					texels[i].set(r, g, b, alphaValues[j][i]);
+				}
+			}
+		}
+
+		// Index for individual, differential, H and T modes.
+		// Returns the 2-bit selector (msb << 1 | lsb) for texel (x, y). Bit number
+		// x * 4 + y picks the bit; bits 0-7 live in element [1] of the LSB/MSB
+		// arrays and bits 8-15 in element [0].
+		inline int getIndex(int x, int y) const
+		{
+			const int bit = x * 4 + y;
+			const int element = 1 - (bit >> 3);
+			const int shift = bit & 7;
+
+			const int low = (pixelIndexLSB[element] >> shift) & 1;
+			const int high = (pixelIndexMSB[element] >> shift) & 1;
+
+			return (high << 1) | low;
+		}
+
+		// Punch-through pass: every in-bounds texel whose pixel index equals 2 is
+		// overwritten with (0, 0, 0, 0); all other texels are left untouched.
+		void decodePunchThroughAlphaBlock(unsigned char *dest, int x, int y, int w, int h, int pitch) const
+		{
+			unsigned char *row = dest;
+
+			for(int j = 0; j < 4 && (y + j) < h; j++, row += pitch)
+			{
+				bgra8 *texels = (bgra8*)row;
+
+				for(int i = 0; i < 4 && (x + i) < w; i++)
+				{
+					if(getIndex(i, j) != 2)   // Only msb == 1 && lsb == 0 is punched through
+					{
+						continue;
+					}
+
+					texels[i].set(0, 0, 0, 0);
+				}
+			}
+		}
+
+		// Single channel utility functions
+
+		// Returns the decoded (unclamped) value of texel (x, y) of a single-channel block.
+		// isSigned selects signed_base_codeword over base_codeword. For EAC output the
+		// codeword is scaled by 8 and offset by 4, and the modifier is scaled by
+		// multiplier * 8, except that a zero multiplier applies the modifier unscaled.
+		inline int getSingleChannel(int x, int y, bool isSigned, bool isEAC) const
+		{
+			int codeword = isSigned ? signed_base_codeword : base_codeword;
+
+			if(!isEAC)
+			{
+				return codeword + getSingleChannelModifier(x, y) * multiplier;
+			}
+
+			if(multiplier == 0)
+			{
+				return codeword * 8 + 4 + getSingleChannelModifier(x, y);
+			}
+
+			return codeword * 8 + 4 + getSingleChannelModifier(x, y) * multiplier * 8;
+		}
+
+ // Returns the modifier-table column for texel (x, y) of a single-channel block.
+ // Texels are numbered x * 4 + y; each one has its own field (ma .. mp). Some
+ // fields are stored in two pieces (e.g. mc1/mc2) and are reassembled here.
+ inline int getSingleChannelIndex(int x, int y) const
+ {
+ switch(x * 4 + y)
+ {
+ case 0: return ma;
+ case 1: return mb;
+ case 2: return mc1 << 1 | mc2;
+ case 3: return md;
+ case 4: return me;
+ case 5: return mf1 << 2 | mf2;
+ case 6: return mg;
+ case 7: return mh;
+ case 8: return mi;
+ case 9: return mj;
+ case 10: return mk1 << 1 | mk2;
+ case 11: return ml;
+ case 12: return mm;
+ case 13: return mn1 << 2 | mn2;
+ case 14: return mo;
+ default: return mp; // 15
+ }
+ }
+
+ // Returns the signed modifier for texel (x, y): the row of the table is chosen
+ // by table_index and the column by getSingleChannelIndex(x, y).
+ inline int getSingleChannelModifier(int x, int y) const
+ {
+ // 16 modifier sets of 8 entries each.
+ static const int modifierTable[16][8] = { { -3, -6, -9, -15, 2, 5, 8, 14 },
+ { -3, -7, -10, -13, 2, 6, 9, 12 },
+ { -2, -5, -8, -13, 1, 4, 7, 12 },
+ { -2, -4, -6, -13, 1, 3, 5, 12 },
+ { -3, -6, -8, -12, 2, 5, 7, 11 },
+ { -3, -7, -9, -11, 2, 6, 8, 10 },
+ { -4, -7, -8, -11, 3, 6, 7, 10 },
+ { -3, -5, -8, -11, 2, 4, 7, 10 },
+ { -2, -6, -8, -10, 1, 5, 7, 9 },
+ { -2, -5, -8, -10, 1, 4, 7, 9 },
+ { -2, -4, -8, -10, 1, 3, 7, 9 },
+ { -2, -5, -7, -10, 1, 4, 6, 9 },
+ { -3, -4, -7, -10, 2, 3, 6, 9 },
+ { -1, -2, -3, -10, 0, 1, 2, 9 },
+ { -4, -6, -8, -9, 3, 5, 7, 8 },
+ { -3, -5, -7, -9, 2, 4, 6, 8 } };
+
+ return modifierTable[table_index][getSingleChannelIndex(x, y)];
+ }
+ };
+}
+
+// Decodes 1 to 4 channel images to 8 bit output
+//
+// The source is walked in 4x4 tiles over the w x h source size; each tile is
+// decoded to dst + y * dstPitch + x * dstBpp and clipped against dstW x dstH.
+// Returns false (without writing anything) for an unrecognized inputType,
+// true otherwise.
+bool ETC_Decoder::Decode(const unsigned char* src, unsigned char *dst, int w, int h, int dstW, int dstH, int dstPitch, int dstBpp, InputType inputType)
+{
+ // sources[0] always tracks the current block; sources[1] is only used for RG.
+ const ETC2* sources[2];
+ sources[0] = (const ETC2*)src;
+
+ // Fully opaque alpha, used as-is by the RGB formats and overwritten per tile for RGBA.
+ unsigned char alphaValues[4][4] = { { 255, 255, 255, 255 }, { 255, 255, 255, 255 }, { 255, 255, 255, 255 }, { 255, 255, 255, 255 } };
+
+ switch(inputType)
+ {
+ case ETC_R_SIGNED:
+ case ETC_R_UNSIGNED:
+ // One block per tile.
+ for(int y = 0; y < h; y += 4)
+ {
+ unsigned char *dstRow = dst + (y * dstPitch);
+ for(int x = 0; x < w; x += 4, sources[0]++)
+ {
+ ETC2::DecodeBlock(sources, dstRow + (x * dstBpp), 1, x, y, dstW, dstH, dstPitch, inputType == ETC_R_SIGNED, true);
+ }
+ }
+ break;
+ case ETC_RG_SIGNED:
+ case ETC_RG_UNSIGNED:
+ // Two consecutive blocks per tile, one per channel.
+ sources[1] = sources[0] + 1;
+ for(int y = 0; y < h; y += 4)
+ {
+ unsigned char *dstRow = dst + (y * dstPitch);
+ for(int x = 0; x < w; x += 4, sources[0] += 2, sources[1] += 2)
+ {
+ ETC2::DecodeBlock(sources, dstRow + (x * dstBpp), 2, x, y, dstW, dstH, dstPitch, inputType == ETC_RG_SIGNED, true);
+ }
+ }
+ break;
+ case ETC_RGB:
+ case ETC_RGB_PUNCHTHROUGH_ALPHA:
+ // One color block per tile; alpha stays at the opaque default.
+ for(int y = 0; y < h; y += 4)
+ {
+ unsigned char *dstRow = dst + (y * dstPitch);
+ for(int x = 0; x < w; x += 4, sources[0]++)
+ {
+ sources[0]->decodeBlock(dstRow + (x * dstBpp), x, y, dstW, dstH, dstPitch, alphaValues, inputType == ETC_RGB_PUNCHTHROUGH_ALPHA);
+ }
+ }
+ break;
+ case ETC_RGBA:
+ for(int y = 0; y < h; y += 4)
+ {
+ unsigned char *dstRow = dst + (y * dstPitch);
+ for(int x = 0; x < w; x += 4)
+ {
+ // Decode Alpha
+ // The alpha block is decoded into the local 4x4 alphaValues array (pitch 4).
+ ETC2::DecodeBlock(&sources[0], &(alphaValues[0][0]), 1, x, y, dstW, dstH, 4, false, false);
+ sources[0]++; // RGBA packets are 128 bits, so move on to the next 64 bit packet to decode the RGB color
+
+ // Decode RGB
+ sources[0]->decodeBlock(dstRow + (x * dstBpp), x, y, dstW, dstH, dstPitch, alphaValues, false);
+ sources[0]++;
+ }
+ }
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
diff --git a/src/Device/ETC_Decoder.hpp b/src/Device/ETC_Decoder.hpp
new file mode 100644
index 0000000..1039b37
--- /dev/null
+++ b/src/Device/ETC_Decoder.hpp
@@ -0,0 +1,41 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+class ETC_Decoder
+{
+public:
+ enum InputType
+ {
+ ETC_R_SIGNED,
+ ETC_R_UNSIGNED,
+ ETC_RG_SIGNED,
+ ETC_RG_UNSIGNED,
+ ETC_RGB,
+ ETC_RGB_PUNCHTHROUGH_ALPHA,
+ ETC_RGBA
+ };
+
+ /// ETC_Decoder::Decode - Decodes 1 to 4 channel images to 8 bit output
+ /// @param src Pointer to ETC2 encoded image
+ /// @param dst Pointer to BGRA, 8 bit output
+ /// @param w src image width
+ /// @param h src image height
+ /// @param dstW dst image width
+ /// @param dstH dst image height
+ /// @param dstPitch dst image pitch (bytes per row)
+ /// @param dstBpp dst image bytes per pixel
+ /// @param inputType src's format
+ /// @return true if the decoding was performed
+ static bool Decode(const unsigned char* src, unsigned char *dst, int w, int h, int dstW, int dstH, int dstPitch, int dstBpp, InputType inputType);
+};
diff --git a/src/Device/LRUCache.hpp b/src/Device/LRUCache.hpp
new file mode 100644
index 0000000..1a1a302
--- /dev/null
+++ b/src/Device/LRUCache.hpp
@@ -0,0 +1,145 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_LRUCache_hpp
+#define sw_LRUCache_hpp
+
+#include "Common/Math.hpp"
+
+namespace sw
+{
+	// Fixed-capacity cache mapping Key to Data*, stored as a ring buffer.
+	// The capacity is rounded up to a power of two so indices can wrap with a mask.
+	// Data must provide bind() and unbind(); the cache calls bind() on every entry
+	// it stores and unbind() on every entry it drops.
+	template<class Key, class Data>
+	class LRUCache
+	{
+	public:
+		LRUCache(int n);
+
+		~LRUCache();
+
+		// The cache owns raw arrays that the destructor deletes; an implicit copy
+		// would alias them and delete (and unbind) everything twice.
+		LRUCache(const LRUCache&) = delete;
+		LRUCache &operator=(const LRUCache&) = delete;
+
+		// Returns the entry stored for 'key', or nullptr when absent.
+		Data *query(const Key &key) const;
+		// Stores 'data' under 'key' as the newest entry and returns 'data'.
+		Data *add(const Key &key, Data *data);
+
+		int getSize() const {return size;}
+		Key &getKey(int i) {return key[i];}
+
+	private:
+		int size;   // Capacity (power of two)
+		int mask;   // size - 1, used to wrap indices
+		int top;    // Slot of the most recently added entry
+		int fill;   // Number of occupied slots
+
+		Key *key;     // Key storage
+		Key **ref;    // Per-slot pointer into 'key'; swapped instead of copying keys
+		Data **data;  // Per-slot cached value
+	};
+}
+
+namespace sw
+{
+	// Creates an empty cache with room for at least n entries (rounded up to a
+	// power of two). The initializer list relies on the members being declared
+	// in the order size, mask, top, fill, key, ref, data.
+	template<class Key, class Data>
+	LRUCache<Key, Data>::LRUCache(int n)
+		: size(ceilPow2(n)), mask(size - 1), top(0), fill(0),
+		  key(new Key[size]), ref(new Key*[size]), data(new Data*[size])
+	{
+		// Every slot starts empty and refers to its own key storage.
+		for(int slot = 0; slot < size; slot++)
+		{
+			data[slot] = nullptr;
+			ref[slot] = key + slot;
+		}
+	}
+
+	// Releases the key storage, unbinds every entry still cached and frees the
+	// slot arrays.
+	template<class Key, class Data>
+	LRUCache<Key, Data>::~LRUCache()
+	{
+		delete[] key;
+		key = nullptr;
+
+		delete[] ref;
+		ref = nullptr;
+
+		for(Data **slot = data; slot != data + size; slot++)
+		{
+			if(*slot == nullptr)
+			{
+				continue;
+			}
+
+			(*slot)->unbind();
+			*slot = nullptr;
+		}
+
+		delete[] data;
+		data = nullptr;
+	}
+
+	// Looks 'key' up, scanning from the newest slot towards the oldest.
+	// A hit that is not already the newest entry is swapped with the slot above
+	// it, so frequently queried entries drift towards the top.
+	// Returns the cached data, or nullptr when the key is not present.
+	template<class Key, class Data>
+	Data *LRUCache<Key, Data>::query(const Key &key) const
+	{
+		for(int i = top; i > top - fill; i--)
+		{
+			const int slot = i & mask;   // Wraps correctly for negative i as well
+
+			if(!(key == *ref[slot]))
+			{
+				continue;
+			}
+
+			Data *found = data[slot];
+
+			if(i != top)
+			{
+				// Move one up
+				const int above = (slot + 1) & mask;
+
+				Data *heldData = data[above];
+				data[above] = data[slot];
+				data[slot] = heldData;
+
+				Key *heldRef = ref[above];
+				ref[above] = ref[slot];
+				ref[slot] = heldRef;
+			}
+
+			return found;
+		}
+
+		return nullptr;   // Not found
+	}
+
+	// Inserts 'data' under 'key' in the slot after the current top, replacing
+	// (and unbinding) whatever was stored there. The new data is bound before
+	// the old one is unbound. Returns 'data'.
+	template<class Key, class Data>
+	Data *LRUCache<Key, Data>::add(const Key &key, Data *data)
+	{
+		top = (top + 1) & mask;
+
+		const int grown = fill + 1;
+		if(grown < size)
+		{
+			fill = grown;
+		}
+		else
+		{
+			fill = size;
+		}
+
+		*ref[top] = key;
+
+		data->bind();
+
+		Data *&slot = this->data[top];
+
+		if(slot)
+		{
+			slot->unbind();
+		}
+
+		slot = data;
+
+		return data;
+	}
+}
+
+#endif // sw_LRUCache_hpp
diff --git a/src/Device/Matrix.cpp b/src/Device/Matrix.cpp
new file mode 100644
index 0000000..0da07e5
--- /dev/null
+++ b/src/Device/Matrix.cpp
@@ -0,0 +1,402 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Matrix.hpp"
+
+#include "Point.hpp"
+#include "Common/Math.hpp"
+
+namespace sw
+{
+ // Returns a matrix with the given values on the main diagonal and zeros elsewhere.
+ Matrix Matrix::diag(float m11, float m22, float m33, float m44)
+ {
+ return Matrix(m11, 0, 0, 0,
+ 0, m22, 0, 0,
+ 0, 0, m33, 0,
+ 0, 0, 0, m44);
+ }
+
+	// Exposes the matrix as a pointer to its first element, i.e. element (1, 1).
+	Matrix::operator float*()
+	{
+		return &m[0][0];
+	}
+
+ // Unary plus: returns a copy of this matrix.
+ Matrix Matrix::operator+() const
+ {
+ return *this;
+ }
+
+	// Unary minus: returns a matrix with every element negated.
+	Matrix Matrix::operator-() const
+	{
+		const Matrix &M = *this;
+		Matrix negated;
+
+		for(int row = 1; row <= 4; row++)
+		{
+			for(int col = 1; col <= 4; col++)
+			{
+				negated(row, col) = -M(row, col);
+			}
+		}
+
+		return negated;
+	}
+
+ // Returns the inverse of this matrix, computed as adjoint / determinant.
+ // There is no singularity check: a zero determinant makes /= scale by 1.0f / 0.
+ Matrix Matrix::operator!() const
+ {
+ const Matrix &M = *this;
+ Matrix I;
+
+ // 2x2 sub-determinants; Mabcd = M(a, b) * M(c, d) - M(c, b) * M(a, d)
+ float M3344 = M(3, 3) * M(4, 4) - M(4, 3) * M(3, 4);
+ float M2344 = M(2, 3) * M(4, 4) - M(4, 3) * M(2, 4);
+ float M2334 = M(2, 3) * M(3, 4) - M(3, 3) * M(2, 4);
+ float M3244 = M(3, 2) * M(4, 4) - M(4, 2) * M(3, 4);
+ float M2244 = M(2, 2) * M(4, 4) - M(4, 2) * M(2, 4);
+ float M2234 = M(2, 2) * M(3, 4) - M(3, 2) * M(2, 4);
+ float M3243 = M(3, 2) * M(4, 3) - M(4, 2) * M(3, 3);
+ float M2243 = M(2, 2) * M(4, 3) - M(4, 2) * M(2, 3);
+ float M2233 = M(2, 2) * M(3, 3) - M(3, 2) * M(2, 3);
+ float M1344 = M(1, 3) * M(4, 4) - M(4, 3) * M(1, 4);
+ float M1334 = M(1, 3) * M(3, 4) - M(3, 3) * M(1, 4);
+ float M1244 = M(1, 2) * M(4, 4) - M(4, 2) * M(1, 4);
+ float M1234 = M(1, 2) * M(3, 4) - M(3, 2) * M(1, 4);
+ float M1243 = M(1, 2) * M(4, 3) - M(4, 2) * M(1, 3);
+ float M1233 = M(1, 2) * M(3, 3) - M(3, 2) * M(1, 3);
+ float M1324 = M(1, 3) * M(2, 4) - M(2, 3) * M(1, 4);
+ float M1224 = M(1, 2) * M(2, 4) - M(2, 2) * M(1, 4);
+ float M1223 = M(1, 2) * M(2, 3) - M(2, 2) * M(1, 3);
+
+ // Adjoint Matrix
+ I(1, 1) = M(2, 2) * M3344 - M(3, 2) * M2344 + M(4, 2) * M2334;
+ I(2, 1) = -M(2, 1) * M3344 + M(3, 1) * M2344 - M(4, 1) * M2334;
+ I(3, 1) = M(2, 1) * M3244 - M(3, 1) * M2244 + M(4, 1) * M2234;
+ I(4, 1) = -M(2, 1) * M3243 + M(3, 1) * M2243 - M(4, 1) * M2233;
+
+ I(1, 2) = -M(1, 2) * M3344 + M(3, 2) * M1344 - M(4, 2) * M1334;
+ I(2, 2) = M(1, 1) * M3344 - M(3, 1) * M1344 + M(4, 1) * M1334;
+ I(3, 2) = -M(1, 1) * M3244 + M(3, 1) * M1244 - M(4, 1) * M1234;
+ I(4, 2) = M(1, 1) * M3243 - M(3, 1) * M1243 + M(4, 1) * M1233;
+
+ I(1, 3) = M(1, 2) * M2344 - M(2, 2) * M1344 + M(4, 2) * M1324;
+ I(2, 3) = -M(1, 1) * M2344 + M(2, 1) * M1344 - M(4, 1) * M1324;
+ I(3, 3) = M(1, 1) * M2244 - M(2, 1) * M1244 + M(4, 1) * M1224;
+ I(4, 3) = -M(1, 1) * M2243 + M(2, 1) * M1243 - M(4, 1) * M1223;
+
+ I(1, 4) = -M(1, 2) * M2334 + M(2, 2) * M1334 - M(3, 2) * M1324;
+ I(2, 4) = M(1, 1) * M2334 - M(2, 1) * M1334 + M(3, 1) * M1324;
+ I(3, 4) = -M(1, 1) * M2234 + M(2, 1) * M1234 - M(3, 1) * M1224;
+ I(4, 4) = M(1, 1) * M2233 - M(2, 1) * M1233 + M(3, 1) * M1223;
+
+ // Division by determinant
+ // The determinant is expanded along the first column of M, reusing the first row of the adjoint.
+ I /= M(1, 1) * I(1, 1) +
+ M(2, 1) * I(1, 2) +
+ M(3, 1) * I(1, 3) +
+ M(4, 1) * I(1, 4);
+
+ return I;
+ }
+
+	// Returns the transpose: element (row, col) of the result is M(col, row).
+	Matrix Matrix::operator~() const
+	{
+		const Matrix &M = *this;
+		Matrix transposed;
+
+		for(int row = 1; row <= 4; row++)
+		{
+			for(int col = 1; col <= 4; col++)
+			{
+				transposed(row, col) = M(col, row);
+			}
+		}
+
+		return transposed;
+	}
+
+	// Adds N to this matrix element by element and returns *this.
+	Matrix &Matrix::operator+=(const Matrix &N)
+	{
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] += N.m[row][col];
+			}
+		}
+
+		return *this;
+	}
+
+	// Subtracts N from this matrix element by element and returns *this.
+	Matrix &Matrix::operator-=(const Matrix &N)
+	{
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] -= N.m[row][col];
+			}
+		}
+
+		return *this;
+	}
+
+	// Multiplies every element by the scalar s and returns *this.
+	Matrix &Matrix::operator*=(float s)
+	{
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] *= s;
+			}
+		}
+
+		return *this;
+	}
+
+	// Replaces this matrix with the product (*this) * M and returns *this.
+	Matrix &Matrix::operator*=(const Matrix &M)
+	{
+		const Matrix product = *this * M;
+
+		*this = product;
+
+		return *this;
+	}
+
+	// Divides every element by s, implemented as a multiplication by the
+	// reciprocal 1.0f / s. Returns *this.
+	Matrix &Matrix::operator/=(float s)
+	{
+		const float reciprocal = 1.0f / s;
+
+		*this *= reciprocal;
+
+		return *this;
+	}
+
+	// Exact element-wise comparison: true only when all 16 elements compare equal.
+	bool operator==(const Matrix &M, const Matrix &N)
+	{
+		for(int row = 1; row <= 4; row++)
+		{
+			for(int col = 1; col <= 4; col++)
+			{
+				if(!(M(row, col) == N(row, col)))
+				{
+					return false;
+				}
+			}
+		}
+
+		return true;
+	}
+
+	// Exact element-wise comparison: true as soon as any element pair differs.
+	bool operator!=(const Matrix &M, const Matrix &N)
+	{
+		for(int row = 1; row <= 4; row++)
+		{
+			for(int col = 1; col <= 4; col++)
+			{
+				if(M(row, col) != N(row, col))
+				{
+					return true;
+				}
+			}
+		}
+
+		return false;
+	}
+
+	// Element-wise sum of two matrices.
+	Matrix operator+(const Matrix &M, const Matrix &N)
+	{
+		Matrix sum;
+
+		for(int row = 1; row <= 4; row++)
+		{
+			for(int col = 1; col <= 4; col++)
+			{
+				sum(row, col) = M(row, col) + N(row, col);
+			}
+		}
+
+		return sum;
+	}
+
+	// Element-wise difference of two matrices.
+	Matrix operator-(const Matrix &M, const Matrix &N)
+	{
+		Matrix difference;
+
+		for(int row = 1; row <= 4; row++)
+		{
+			for(int col = 1; col <= 4; col++)
+			{
+				difference(row, col) = M(row, col) - N(row, col);
+			}
+		}
+
+		return difference;
+	}
+
+	// Scalar * matrix: every element of M multiplied by s.
+	Matrix operator*(float s, const Matrix &M)
+	{
+		Matrix scaled;
+
+		for(int row = 1; row <= 4; row++)
+		{
+			for(int col = 1; col <= 4; col++)
+			{
+				scaled(row, col) = s * M(row, col);
+			}
+		}
+
+		return scaled;
+	}
+
+	// Matrix * scalar: every element of M multiplied by s.
+	Matrix operator*(const Matrix &M, float s)
+	{
+		Matrix scaled;
+
+		for(int row = 1; row <= 4; row++)
+		{
+			for(int col = 1; col <= 4; col++)
+			{
+				scaled(row, col) = M(row, col) * s;
+			}
+		}
+
+		return scaled;
+	}
+
+	// Matrix product M * N: element (row, col) is the dot product of row 'row'
+	// of M with column 'col' of N.
+	Matrix operator*(const Matrix &M, const Matrix &N)
+	{
+		Matrix product;
+
+		for(int row = 1; row <= 4; row++)
+		{
+			for(int col = 1; col <= 4; col++)
+			{
+				// Written out term by term so the summation order stays left to right.
+				product(row, col) = M(row, 1) * N(1, col) + M(row, 2) * N(2, col) + M(row, 3) * N(3, col) + M(row, 4) * N(4, col);
+			}
+		}
+
+		return product;
+	}
+
+	// Matrix / scalar, implemented as a multiplication by the reciprocal 1.0f / s.
+	Matrix operator/(const Matrix &M, float s)
+	{
+		const float reciprocal = 1.0f / s;
+		Matrix quotient = M * reciprocal;
+
+		return quotient;
+	}
+
+	// Matrix * column vector: component k of the result is the dot product of
+	// row k of this matrix with (v.x, v.y, v.z, v.w).
+	float4 Matrix::operator*(const float4 &v) const
+	{
+		const Matrix &M = *this;
+		float dot[4];
+
+		for(int row = 1; row <= 4; row++)
+		{
+			dot[row - 1] = M(row, 1) * v.x + M(row, 2) * v.y + M(row, 3) * v.z + M(row, 4) * v.w;
+		}
+
+		return {dot[0], dot[1], dot[2], dot[3]};
+	}
+
+	// Determinant of the full 4x4 matrix. Forwards the elements in row-major
+	// order to the 16-float overload, which evaluates the same expansion.
+	float Matrix::det(const Matrix &M)
+	{
+		return det(M(1, 1), M(1, 2), M(1, 3), M(1, 4),
+		           M(2, 1), M(2, 2), M(2, 3), M(2, 4),
+		           M(3, 1), M(3, 2), M(3, 3), M(3, 4),
+		           M(4, 1), M(4, 2), M(4, 3), M(4, 4));
+	}
+
+ // Determinant of a 1x1 matrix: the element itself.
+ float Matrix::det(float m11)
+ {
+ return m11;
+ }
+
+ // Determinant of the 2x2 matrix [m11 m12; m21 m22].
+ float Matrix::det(float m11, float m12,
+ float m21, float m22)
+ {
+ return m11 * m22 - m12 * m21;
+ }
+
+	// Determinant of a 3x3 matrix, expanded along the first column.
+	float Matrix::det(float m11, float m12, float m13,
+	                  float m21, float m22, float m23,
+	                  float m31, float m32, float m33)
+	{
+		const float minor11 = m22 * m33 - m32 * m23;
+		const float minor21 = m12 * m33 - m32 * m13;
+		const float minor31 = m12 * m23 - m22 * m13;
+
+		return m11 * minor11 -
+		       m21 * minor21 +
+		       m31 * minor31;
+	}
+
+	// Determinant of a 4x4 matrix, expanded along the first column. The 3x3
+	// minors are built from shared 2x2 determinants of columns 3 and 4.
+	float Matrix::det(float m11, float m12, float m13, float m14,
+	                  float m21, float m22, float m23, float m24,
+	                  float m31, float m32, float m33, float m34,
+	                  float m41, float m42, float m43, float m44)
+	{
+		// dAB: 2x2 determinant of columns 3 and 4 taken from rows A and B.
+		const float d34 = m33 * m44 - m43 * m34;
+		const float d24 = m23 * m44 - m43 * m24;
+		const float d23 = m23 * m34 - m33 * m24;
+		const float d14 = m13 * m44 - m43 * m14;
+		const float d13 = m13 * m34 - m33 * m14;
+		const float d12 = m13 * m24 - m23 * m14;
+
+		const float minor11 = m22 * d34 - m32 * d24 + m42 * d23;
+		const float minor21 = m12 * d34 - m32 * d14 + m42 * d13;
+		const float minor31 = m12 * d24 - m22 * d14 + m42 * d12;
+		const float minor41 = m12 * d23 - m22 * d13 + m32 * d12;
+
+		return m11 * minor11 -
+		       m21 * minor21 +
+		       m31 * minor31 -
+		       m41 * minor41;
+	}
+
+ // Determinant of the three vectors, computed as v1 * (v2 % v3).
+ // NOTE(review): presumably '%' is the cross product and '*' the dot product
+ // (i.e. the scalar triple product) - confirm against Vector.hpp.
+ float Matrix::det(const Vector &v1, const Vector &v2, const Vector &v3)
+ {
+ return v1 * (v2 % v3);
+ }
+
+	// Determinant of the upper-left 3x3 submatrix of M. Forwards to the 9-float
+	// overload, which evaluates the same first-column expansion.
+	float Matrix::det3(const Matrix &M)
+	{
+		return det(M(1, 1), M(1, 2), M(1, 3),
+		           M(2, 1), M(2, 2), M(2, 3),
+		           M(3, 1), M(3, 2), M(3, 3));
+	}
+
+	// Trace: the sum of the four main-diagonal elements.
+	float Matrix::tr(const Matrix &M)
+	{
+		float trace = M(1, 1);
+
+		for(int i = 2; i <= 4; i++)
+		{
+			trace = trace + M(i, i);
+		}
+
+		return trace;
+	}
+
+ // Orthogonalises the three columns of the upper-left 3x3 submatrix in place
+ // (Gram-Schmidt), then divides each column by Vector::N of itself.
+ // The fourth row and column are left untouched. Returns *this.
+ Matrix &Matrix::orthogonalise()
+ {
+ // NOTE: Numerically unstable, won't return exactly the same result when already orthogonal
+
+ Matrix &M = *this;
+
+ // Columns of the 3x3 submatrix
+ Vector v1(M(1, 1), M(2, 1), M(3, 1));
+ Vector v2(M(1, 2), M(2, 2), M(3, 2));
+ Vector v3(M(1, 3), M(2, 3), M(3, 3));
+
+ // Remove from each column its projection onto the preceding ones
+ v2 -= v1 * (v1 * v2) / (v1 * v1);
+ v3 -= v1 * (v1 * v3) / (v1 * v1);
+ v3 -= v2 * (v2 * v3) / (v2 * v2);
+
+ // NOTE(review): Vector::N presumably returns the vector's length - confirm in Vector.hpp
+ v1 /= Vector::N(v1);
+ v2 /= Vector::N(v2);
+ v3 /= Vector::N(v3);
+
+ M(1, 1) = v1.x; M(1, 2) = v2.x; M(1, 3) = v3.x;
+ M(2, 1) = v1.y; M(2, 2) = v2.y; M(2, 3) = v3.y;
+ M(3, 1) = v1.z; M(3, 2) = v2.z; M(3, 3) = v3.z;
+
+ return *this;
+ }
+
+	// Builds a rotation matrix from the three angles in v (one per axis), using
+	// the 3x3 constructor, so the fourth row and column are those of identity.
+	Matrix Matrix::eulerRotate(const Vector &v)
+	{
+		const float cosZ = cos(v.z);
+		const float sinZ = sin(v.z);
+		const float cosX = cos(v.x);
+		const float sinX = sin(v.x);
+		const float cosY = cos(v.y);
+		const float sinY = sin(v.y);
+
+		// Products shared by several elements
+		const float sinXsinY = sinX * sinY;
+		const float sinXcosY = sinX * cosY;
+
+		return Matrix(cosY * cosZ - sinXsinY * sinZ, -cosY * sinZ - sinXsinY * cosZ, -sinY * cosX,
+		              cosX * sinZ,                   cosX * cosZ,                    -sinX,
+		              sinY * cosZ + sinXcosY * sinZ, -sinY * sinZ + sinXcosY * cosZ, cosY * cosX);
+	}
+
+ // Convenience overload: forwards the three angles as a Vector.
+ Matrix Matrix::eulerRotate(float x, float y, float z)
+ {
+ return eulerRotate(Vector(x, y, z));
+ }
+
+ // Returns an identity matrix whose fourth column holds the offset (v.x, v.y, v.z).
+ Matrix Matrix::translate(const Vector &v)
+ {
+ return Matrix(1, 0, 0, v.x,
+ 0, 1, 0, v.y,
+ 0, 0, 1, v.z,
+ 0, 0, 0, 1);
+ }
+
+ // Convenience overload: forwards the offset as a Vector.
+ Matrix Matrix::translate(float x, float y, float z)
+ {
+ return translate(Vector(x, y, z));
+ }
+
+ // Returns a matrix with (v.x, v.y, v.z) on the diagonal of the 3x3 part;
+ // the 3x3 constructor fills the remaining row and column as in identity.
+ Matrix Matrix::scale(const Vector &v)
+ {
+ return Matrix(v.x, 0, 0,
+ 0, v.y, 0,
+ 0, 0, v.z);
+ }
+
+ // Convenience overload: forwards the scale factors as a Vector.
+ Matrix Matrix::scale(float x, float y, float z)
+ {
+ return scale(Vector(x, y, z));
+ }
+
+ // Builds a matrix from three axes derived from the direction v and the fixed
+ // vector (0, 0, 1), and returns the transpose of the column-vector matrix (x, y, z).
+ // NOTE(review): '%' is presumably the cross product and Vector::N the length,
+ // in which case x is undefined when v is parallel to (0, 0, 1) - confirm.
+ Matrix Matrix::lookAt(const Vector &v)
+ {
+ Vector y = v;
+ y /= Vector::N(y);
+
+ Vector x = y % Vector(0, 0, 1);
+ x /= Vector::N(x);
+
+ Vector z = x % y;
+ z /= Vector::N(z);
+
+ return ~Matrix(x, y, z);
+ }
+
+	// Convenience overload: forwards the direction (x, y, z) to lookAt(const Vector&).
+	// This previously called translate() and so returned a translation matrix
+	// instead of a look-at matrix.
+	Matrix Matrix::lookAt(float x, float y, float z)
+	{
+		return lookAt(Vector(x, y, z));
+	}
+}
diff --git a/src/Device/Matrix.hpp b/src/Device/Matrix.hpp
new file mode 100644
index 0000000..41281a6
--- /dev/null
+++ b/src/Device/Matrix.hpp
@@ -0,0 +1,217 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Matrix_hpp
+#define Matrix_hpp
+
+namespace sw
+{
+ struct Vector;
+ struct Point;
+ struct float4;
+
+	// 4x4 single-precision matrix stored in row-major order.
+	struct Matrix
+	{
+		Matrix();   // Leaves the elements uninitialized
+		Matrix(const int i);   // i on the diagonal, zero elsewhere
+		Matrix(const float m[16]);   // 16 consecutive elements, row-major
+		Matrix(const float m[4][4]);
+		// 3x3 part given explicitly; the fourth row and column are those of identity
+		Matrix(float m11, float m12, float m13,
+		       float m21, float m22, float m23,
+		       float m31, float m32, float m33);
+		Matrix(float m11, float m12, float m13, float m14,
+		       float m21, float m22, float m23, float m24,
+		       float m31, float m32, float m33, float m34,
+		       float m41, float m42, float m43, float m44);
+		Matrix(const Vector &v1, const Vector &v2, const Vector &v3);   // Column vectors
+
+		Matrix &operator=(const Matrix &N);
+
+		// Row major order
+		float m[4][4];
+
+		static Matrix diag(float m11, float m22, float m33, float m44);
+
+		operator float*();
+
+		Matrix operator+() const;
+		Matrix operator-() const;
+
+		Matrix operator!() const;   // Inverse
+		Matrix operator~() const;   // Transpose
+
+		Matrix &operator+=(const Matrix &N);
+		Matrix &operator-=(const Matrix &N);
+		Matrix &operator*=(float s);
+		Matrix &operator*=(const Matrix &N);
+		Matrix &operator/=(float s);
+
+		float *operator[](int i);   // Access element [row][col], starting with [0][0]
+		const float *operator[](int i) const;
+
+		float &operator()(int i, int j);   // Access element (row, col), starting with (1, 1)
+		const float &operator()(int i, int j) const;
+
+		friend bool operator==(const Matrix &M, const Matrix &N);
+		friend bool operator!=(const Matrix &M, const Matrix &N);
+
+		friend Matrix operator+(const Matrix &M, const Matrix &N);
+		friend Matrix operator-(const Matrix &M, const Matrix &N);
+		friend Matrix operator*(float s, const Matrix &M);
+		// Declared here so that 'M * s' outside Matrix.cpp selects the scalar
+		// overload. Without it the call converts s through Matrix(const int),
+		// truncating the scalar and performing a matrix product instead.
+		friend Matrix operator*(const Matrix &M, float s);
+		friend Matrix operator*(const Matrix &M, const Matrix &N);
+		friend Matrix operator/(const Matrix &M, float s);
+
+		float4 operator*(const float4 &v) const;
+
+		static float det(const Matrix &M);
+		static float det(float m11);
+		static float det(float m11, float m12,
+		                 float m21, float m22);
+		static float det(float m11, float m12, float m13,
+		                 float m21, float m22, float m23,
+		                 float m31, float m32, float m33);
+		static float det(float m11, float m12, float m13, float m14,
+		                 float m21, float m22, float m23, float m24,
+		                 float m31, float m32, float m33, float m34,
+		                 float m41, float m42, float m43, float m44);
+		static float det(const Vector &v1, const Vector &v2, const Vector &v3);
+		static float det3(const Matrix &M);   // Determinant of the upper-left 3x3 submatrix
+
+		static float tr(const Matrix &M);   // Sum of the diagonal elements
+
+		Matrix &orthogonalise();   // Gram-Schmidt orthogonalisation of 3x3 submatrix
+
+		static Matrix eulerRotate(const Vector &v);
+		static Matrix eulerRotate(float x, float y, float z);
+
+		static Matrix translate(const Vector &v);
+		static Matrix translate(float x, float y, float z);
+
+		static Matrix scale(const Vector &v);
+		static Matrix scale(float x, float y, float z);
+
+		static Matrix lookAt(const Vector &v);
+		static Matrix lookAt(float x, float y, float z);
+	};
+}
+
+#include "Vector.hpp"
+
+namespace sw
+{
+ // Default constructor: intentionally empty, the elements are left uninitialized.
+ inline Matrix::Matrix()
+ {
+ }
+
+	// Builds a matrix with i (converted to float) on the diagonal and zeros
+	// elsewhere; Matrix(1) is the identity.
+	inline Matrix::Matrix(const int i)
+	{
+		const float diagonal = (float)i;
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] = (row == col) ? diagonal : 0.0f;
+			}
+		}
+	}
+
+	// Builds the matrix from 16 consecutive floats in row-major order, so that
+	// element (row, col), counted from 1, is m[4 * (row - 1) + (col - 1)].
+	// Element (3, 2) was previously read from m[8] instead of m[9]; indexing by
+	// formula rules out that kind of slip.
+	inline Matrix::Matrix(const float m[16])
+	{
+		// The parameter 'm' hides the member array, so write through operator().
+		Matrix &M = *this;
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				M(row + 1, col + 1) = m[4 * row + col];
+			}
+		}
+	}
+
+	// Copies all 16 elements from a 4x4 array indexed [row][col].
+	inline Matrix::Matrix(const float m[4][4])
+	{
+		// The parameter 'm' hides the member array, hence the explicit this->.
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				this->m[row][col] = m[row][col];
+			}
+		}
+	}
+
+	// Builds the matrix from an explicit 3x3 part; the fourth row and column
+	// are set to (0, 0, 0, 1).
+	inline Matrix::Matrix(float m11, float m12, float m13,
+	                      float m21, float m22, float m23,
+	                      float m31, float m32, float m33)
+	{
+		const float rows[4][4] =
+		{
+			{m11,  m12,  m13,  0.0f},
+			{m21,  m22,  m23,  0.0f},
+			{m31,  m32,  m33,  0.0f},
+			{0.0f, 0.0f, 0.0f, 1.0f}
+		};
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] = rows[row][col];
+			}
+		}
+	}
+
+	// Builds the matrix from all 16 elements, given row by row.
+	inline Matrix::Matrix(float m11, float m12, float m13, float m14,
+	                      float m21, float m22, float m23, float m24,
+	                      float m31, float m32, float m33, float m34,
+	                      float m41, float m42, float m43, float m44)
+	{
+		const float rows[4][4] =
+		{
+			{m11, m12, m13, m14},
+			{m21, m22, m23, m24},
+			{m31, m32, m33, m34},
+			{m41, m42, m43, m44}
+		};
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] = rows[row][col];
+			}
+		}
+	}
+
+	// Builds the matrix with v1, v2 and v3 as the first three columns; the
+	// fourth row and column are set to (0, 0, 0, 1).
+	inline Matrix::Matrix(const Vector &v1, const Vector &v2, const Vector &v3)
+	{
+		const Vector *columns[3] = {&v1, &v2, &v3};
+
+		for(int col = 0; col < 3; col++)
+		{
+			m[0][col] = columns[col]->x;
+			m[1][col] = columns[col]->y;
+			m[2][col] = columns[col]->z;
+			m[3][col] = 0;
+		}
+
+		m[0][3] = 0;
+		m[1][3] = 0;
+		m[2][3] = 0;
+		m[3][3] = 1;
+	}
+
+	// Copies all 16 elements from N and returns *this.
+	inline Matrix &Matrix::operator=(const Matrix &N)
+	{
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] = N.m[row][col];
+			}
+		}
+
+		return *this;
+	}
+
+ // Returns a pointer to row i (zero-based), so M[row][col] addresses an element.
+ // No bounds checking is performed.
+ inline float *Matrix::operator[](int i)
+ {
+ return m[i];
+ }
+
+ // Read-only variant: returns a pointer to row i (zero-based), without bounds checking.
+ inline const float *Matrix::operator[](int i) const
+ {
+ return m[i];
+ }
+
+ // Returns element (row i, column j) using one-based indices, i.e. m[i - 1][j - 1].
+ // No bounds checking is performed.
+ inline float &Matrix::operator()(int i, int j)
+ {
+ return m[i - 1][j - 1];
+ }
+
+ // Read-only variant: element (row i, column j) with one-based indices, without bounds checking.
+ inline const float &Matrix::operator()(int i, int j) const
+ {
+ return m[i - 1][j - 1];
+ }
+}
+
+#endif // Matrix_hpp
diff --git a/src/Device/PixelProcessor.cpp b/src/Device/PixelProcessor.cpp
new file mode 100644
index 0000000..8bc40c2
--- /dev/null
+++ b/src/Device/PixelProcessor.cpp
@@ -0,0 +1,1212 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "PixelProcessor.hpp"
+
+#include "Surface.hpp"
+#include "Primitive.hpp"
+#include "Shader/PixelPipeline.hpp"
+#include "Shader/PixelProgram.hpp"
+#include "Shader/PixelShader.hpp"
+#include "Shader/Constants.hpp"
+#include "Common/Debug.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+ extern bool complementaryDepthBuffer;   // Defined in another translation unit; not referenced by the code in this file.
+ extern TransparencyAntialiasing transparencyAntialiasing;   // Read by update() when alpha testing is active and more than one sample is used.
+ extern bool perspectiveCorrection;   // Written by setPerspectiveCorrection().
+
+ bool precachePixel = false;   // When true, setRoutineCacheSize() passes the name "sw-pixel" to the routine cache.
+
+ // XOR-folds the raw storage of this States object, one unsigned int at a time.
+ unsigned int PixelProcessor::States::computeHash()
+ {
+     const unsigned int *words = (unsigned int*)this;
+     unsigned int folded = 0;
+     // sizeof(States) / 4 rounds down, so a trailing partial word is left out.
+     for(unsigned int w = 0; w < sizeof(States) / 4; w++)
+     {
+         folded ^= words[w];
+     }
+     return folded;
+ }
+
+ PixelProcessor::State::State()   // Starts from an all-zero object (every byte); operator== and computeHash() work on these raw bytes.
+ {
+ memset(this, 0, sizeof(State));
+ }
+
+ bool PixelProcessor::State::operator==(const State &state) const
+ {
+     // Cheap rejection first: if the stored hashes differ, the byte comparison
+     // is skipped. Otherwise the States parts are compared byte for byte.
+     const States *lhs = static_cast<const States*>(this);
+     const States *rhs = static_cast<const States*>(&state);
+
+     return (hash == state.hash) && (memcmp(lhs, rhs, sizeof(States)) == 0);
+ }
+
+ PixelProcessor::UniformBufferInfo::UniformBufferInfo()
+     : buffer(nullptr),   // No resource bound to this slot yet.
+       offset(0)          // Added to the locked pointer in lockUniformBuffers().
+ {
+ }
+
+ PixelProcessor::PixelProcessor(Context *context) : context(context)
+ {
+     setGlobalMipmapBias(0.0f);   // Round to highest LOD [0.5, 1.0]: -0.5
+                                  // Round to nearest LOD [0.7, 1.4]: 0.0
+                                  // Round to lowest LOD [1.0, 2.0]: 0.5
+
+     routineCache = nullptr;   // Must be null first: setRoutineCacheSize() deletes the previous cache.
+     setRoutineCacheSize(1024);
+ }
+
+ PixelProcessor::~PixelProcessor()   // Deletes the routine cache created by setRoutineCacheSize().
+ {
+     delete routineCache;
+     routineCache = nullptr;
+ }
+
+ void PixelProcessor::setFloatConstant(unsigned int index, const float value[4])   // Stores a 4-component float constant in c[index]; the first 8 constants are also kept as replicated fixed-point values in cW.
+ {
+ if(index < FRAGMENT_UNIFORM_VECTORS)
+ {
+ c[index][0] = value[0];
+ c[index][1] = value[1];
+ c[index][2] = value[2];
+ c[index][3] = value[3];
+ }
+ else ASSERT(false);   // Out-of-range index: nothing is written to c.
+
+ if(index < 8) // ps_1_x constants
+ {
+ // FIXME: Compact into generic function
+ short x = iround(4095 * clamp(value[0], -1.0f, 1.0f));   // Clamp to [-1, 1], then scale so that 1.0 becomes 4095.
+ short y = iround(4095 * clamp(value[1], -1.0f, 1.0f));
+ short z = iround(4095 * clamp(value[2], -1.0f, 1.0f));
+ short w = iround(4095 * clamp(value[3], -1.0f, 1.0f));
+
+ cW[index][0][0] = x;   // Each component is written to four consecutive entries.
+ cW[index][0][1] = x;
+ cW[index][0][2] = x;
+ cW[index][0][3] = x;
+
+ cW[index][1][0] = y;
+ cW[index][1][1] = y;
+ cW[index][1][2] = y;
+ cW[index][1][3] = y;
+
+ cW[index][2][0] = z;
+ cW[index][2][1] = z;
+ cW[index][2][2] = z;
+ cW[index][2][3] = z;
+
+ cW[index][3][0] = w;
+ cW[index][3][1] = w;
+ cW[index][3][2] = w;
+ cW[index][3][3] = w;
+ }
+ }
+
+ // Stores the four components of integer constant `index` (0-15) in i[index].
+ void PixelProcessor::setIntegerConstant(unsigned int index, const int value[4])
+ {
+     const bool knownIndex = (index < 16);
+     if(!knownIndex) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     for(int component = 0; component < 4; component++)   // 'i' is the member array, so the counter gets another name.
+     {
+         i[index][component] = value[component];
+     }
+ }
+
+ // Stores boolean constant `index` (0-15); any non-zero value is stored as true.
+ void PixelProcessor::setBooleanConstant(unsigned int index, int boolean)
+ {
+     const bool knownIndex = (index < 16);
+     if(!knownIndex) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     b[index] = (boolean != 0);
+ }
+
+ void PixelProcessor::setUniformBuffer(int index, sw::Resource* buffer, int offset)   // Binds buffer/offset to slot `index`; lockUniformBuffers() later adds offset to the locked pointer.
+ {
+     if(index < 0 || index >= MAX_UNIFORM_BUFFER_BINDINGS) { ASSERT(false); return; }   // Same policy as the other indexed setters: ASSERT, and change nothing.
+     uniformBufferInfo[index].buffer = buffer; uniformBufferInfo[index].offset = offset;
+ }
+
+ void PixelProcessor::lockUniformBuffers(byte** u, sw::Resource* uniformBuffers[])   // Fills u[] with one locked data pointer (plus offset) per slot and uniformBuffers[] with the bound resources.
+ {
+ for(int i = 0; i < MAX_UNIFORM_BUFFER_BINDINGS; ++i)   // Both output arrays are written for MAX_UNIFORM_BUFFER_BINDINGS entries.
+ {
+ u[i] = uniformBufferInfo[i].buffer ? static_cast<byte*>(uniformBufferInfo[i].buffer->lock(PUBLIC, PRIVATE)) + uniformBufferInfo[i].offset : nullptr;   // Empty slots yield nullptr and are not locked.
+ uniformBuffers[i] = uniformBufferInfo[i].buffer;   // NOTE(review): presumably returned so the caller can unlock later — confirm.
+ }
+ }
+
+ void PixelProcessor::setRenderTarget(int index, Surface *renderTarget, unsigned int layer)   // Records the surface and layer for render target `index` in the context.
+ {
+ context->renderTarget[index] = renderTarget;   // NOTE(review): index is not validated here — presumably callers stay below RENDERTARGETS; confirm.
+ context->renderTargetLayer[index] = layer;
+ }
+
+ void PixelProcessor::setDepthBuffer(Surface *depthBuffer, unsigned int layer)   // Records the depth surface and its layer in the context.
+ {
+ context->depthBuffer = depthBuffer;   // update() queries this surface's internal format when the depth buffer is active.
+ context->depthBufferLayer = layer;
+ }
+
+ void PixelProcessor::setStencilBuffer(Surface *stencilBuffer, unsigned int layer)   // Records the stencil surface and its layer in the context.
+ {
+ context->stencilBuffer = stencilBuffer;
+ context->stencilBufferLayer = layer;
+ }
+
+ // Forwards texCoordIndex to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setTexCoordIndex(unsigned int stage, int texCoordIndex)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setTexCoordIndex(texCoordIndex);
+ }
+
+ // Forwards stageOperation to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setStageOperation(unsigned int stage, TextureStage::StageOperation stageOperation)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setStageOperation(stageOperation);
+ }
+
+ // Forwards firstArgument to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setFirstArgument(unsigned int stage, TextureStage::SourceArgument firstArgument)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setFirstArgument(firstArgument);
+ }
+
+ // Forwards secondArgument to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setSecondArgument(unsigned int stage, TextureStage::SourceArgument secondArgument)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setSecondArgument(secondArgument);
+ }
+
+ // Forwards thirdArgument to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setThirdArgument(unsigned int stage, TextureStage::SourceArgument thirdArgument)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setThirdArgument(thirdArgument);
+ }
+
+ // Forwards stageOperationAlpha to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setStageOperationAlpha(unsigned int stage, TextureStage::StageOperation stageOperationAlpha)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setStageOperationAlpha(stageOperationAlpha);
+ }
+
+ // Forwards firstArgumentAlpha to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setFirstArgumentAlpha(unsigned int stage, TextureStage::SourceArgument firstArgumentAlpha)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setFirstArgumentAlpha(firstArgumentAlpha);
+ }
+
+ // Forwards secondArgumentAlpha to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setSecondArgumentAlpha(unsigned int stage, TextureStage::SourceArgument secondArgumentAlpha)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setSecondArgumentAlpha(secondArgumentAlpha);
+ }
+
+ // Forwards thirdArgumentAlpha to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setThirdArgumentAlpha(unsigned int stage, TextureStage::SourceArgument thirdArgumentAlpha)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setThirdArgumentAlpha(thirdArgumentAlpha);
+ }
+
+ // Forwards firstModifier to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setFirstModifier(unsigned int stage, TextureStage::ArgumentModifier firstModifier)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setFirstModifier(firstModifier);
+ }
+
+ // Forwards secondModifier to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setSecondModifier(unsigned int stage, TextureStage::ArgumentModifier secondModifier)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setSecondModifier(secondModifier);
+ }
+
+ // Forwards thirdModifier to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setThirdModifier(unsigned int stage, TextureStage::ArgumentModifier thirdModifier)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setThirdModifier(thirdModifier);
+ }
+
+ // Forwards firstModifierAlpha to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setFirstModifierAlpha(unsigned int stage, TextureStage::ArgumentModifier firstModifierAlpha)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setFirstModifierAlpha(firstModifierAlpha);
+ }
+
+ // Forwards secondModifierAlpha to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setSecondModifierAlpha(unsigned int stage, TextureStage::ArgumentModifier secondModifierAlpha)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setSecondModifierAlpha(secondModifierAlpha);
+ }
+
+ // Forwards thirdModifierAlpha to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setThirdModifierAlpha(unsigned int stage, TextureStage::ArgumentModifier thirdModifierAlpha)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setThirdModifierAlpha(thirdModifierAlpha);
+ }
+
+ // Forwards destinationArgument to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setDestinationArgument(unsigned int stage, TextureStage::DestinationArgument destinationArgument)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setDestinationArgument(destinationArgument);
+ }
+
+ // Forwards constantColor to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setConstantColor(unsigned int stage, const Color<float> &constantColor)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setConstantColor(constantColor);
+ }
+
+ // Forwards (element, value) to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setBumpmapMatrix(unsigned int stage, int element, float value)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setBumpmapMatrix(element, value);
+ }
+
+ // Forwards the luminance scale to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setLuminanceScale(unsigned int stage, float value)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setLuminanceScale(value);
+ }
+
+ // Forwards the luminance offset to context->textureStage[stage]; stage must be below 8.
+ void PixelProcessor::setLuminanceOffset(unsigned int stage, float value)
+ {
+     const bool knownStage = (stage < 8);
+     if(!knownStage) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->textureStage[stage].setLuminanceOffset(value);
+ }
+
+ // Forwards textureFilter to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setTextureFilter(unsigned int sampler, FilterType textureFilter)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setTextureFilter(textureFilter);
+ }
+
+ // Forwards mipmapFilter to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setMipmapFilter(unsigned int sampler, MipmapType mipmapFilter)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setMipmapFilter(mipmapFilter);
+ }
+
+ // Forwards the gather flag to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setGatherEnable(unsigned int sampler, bool enable)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setGatherEnable(enable);
+ }
+
+ // Forwards the U addressing mode to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setAddressingModeU(unsigned int sampler, AddressingMode addressMode)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setAddressingModeU(addressMode);
+ }
+
+ // Forwards the V addressing mode to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setAddressingModeV(unsigned int sampler, AddressingMode addressMode)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setAddressingModeV(addressMode);
+ }
+
+ // Forwards the W addressing mode to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setAddressingModeW(unsigned int sampler, AddressingMode addressMode)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setAddressingModeW(addressMode);
+ }
+
+ // Forwards the sRGB flag to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setReadSRGB(unsigned int sampler, bool sRGB)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setReadSRGB(sRGB);
+ }
+
+ // Forwards bias to setMipmapLOD() of context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setMipmapLOD(unsigned int sampler, float bias)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setMipmapLOD(bias);
+ }
+
+ // Forwards borderColor to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setBorderColor(unsigned int sampler, const Color<float> &borderColor)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setBorderColor(borderColor);
+ }
+
+ // Forwards maxAnisotropy to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setMaxAnisotropy(unsigned int sampler, float maxAnisotropy)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setMaxAnisotropy(maxAnisotropy);
+ }
+
+ // Forwards highPrecisionFiltering to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setHighPrecisionFiltering(unsigned int sampler, bool highPrecisionFiltering)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setHighPrecisionFiltering(highPrecisionFiltering);
+ }
+
+ // Forwards swizzleR to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setSwizzleR(unsigned int sampler, SwizzleType swizzleR)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setSwizzleR(swizzleR);
+ }
+
+ // Forwards swizzleG to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setSwizzleG(unsigned int sampler, SwizzleType swizzleG)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setSwizzleG(swizzleG);
+ }
+
+ // Forwards swizzleB to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setSwizzleB(unsigned int sampler, SwizzleType swizzleB)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setSwizzleB(swizzleB);
+ }
+
+ // Forwards swizzleA to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setSwizzleA(unsigned int sampler, SwizzleType swizzleA)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setSwizzleA(swizzleA);
+ }
+
+ // Forwards compFunc to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setCompareFunc(unsigned int sampler, CompareFunc compFunc)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setCompareFunc(compFunc);
+ }
+
+ // Forwards baseLevel to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setBaseLevel(unsigned int sampler, int baseLevel)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setBaseLevel(baseLevel);
+ }
+
+ // Forwards maxLevel to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setMaxLevel(unsigned int sampler, int maxLevel)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setMaxLevel(maxLevel);
+ }
+
+ // Forwards minLod to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setMinLod(unsigned int sampler, float minLod)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setMinLod(minLod);
+ }
+
+ // Forwards maxLod to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setMaxLod(unsigned int sampler, float maxLod)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setMaxLod(maxLod);
+ }
+
+ // Forwards the sync-required flag to context->sampler[sampler]; sampler must be below TEXTURE_IMAGE_UNITS.
+ void PixelProcessor::setSyncRequired(unsigned int sampler, bool isSincRequired)
+ {
+     const bool knownSampler = (sampler < TEXTURE_IMAGE_UNITS);
+     if(!knownSampler) { ASSERT(false); return; }   // Caller bug: ASSERT, and change nothing.
+
+     context->sampler[sampler].setSyncRequired(isSincRequired);
+ }
+
+ void PixelProcessor::setWriteSRGB(bool sRGB)   // Forwards the flag to context->setWriteSRGB(); update() additionally requires an sRGB-writable render target 0.
+ {
+ context->setWriteSRGB(sRGB);
+ }
+
+ void PixelProcessor::setColorLogicOpEnabled(bool colorLogicOpEnabled)   // Forwards the flag to context->setColorLogicOpEnabled().
+ {
+ context->setColorLogicOpEnabled(colorLogicOpEnabled);
+ }
+
+ void PixelProcessor::setLogicalOperation(LogicalOperation logicalOperation)   // Forwards the operation to context->setLogicalOperation().
+ {
+ context->setLogicalOperation(logicalOperation);
+ }
+
+ void PixelProcessor::setDepthBufferEnable(bool depthBufferEnable)   // Forwards the flag to context->setDepthBufferEnable().
+ {
+ context->setDepthBufferEnable(depthBufferEnable);
+ }
+
+ void PixelProcessor::setDepthCompare(DepthCompareMode depthCompareMode)   // Stored in the context; update() copies it into the state while the depth buffer is active.
+ {
+ context->depthCompareMode = depthCompareMode;
+ }
+
+ void PixelProcessor::setAlphaCompare(AlphaCompareMode alphaCompareMode)   // Stored in the context; update() copies it into the state while alpha testing is active.
+ {
+ context->alphaCompareMode = alphaCompareMode;
+ }
+
+ void PixelProcessor::setDepthWriteEnable(bool depthWriteEnable)   // Stored directly in context->depthWriteEnable.
+ {
+ context->depthWriteEnable = depthWriteEnable;
+ }
+
+ void PixelProcessor::setAlphaTestEnable(bool alphaTestEnable)   // Stored directly in context->alphaTestEnable.
+ {
+ context->alphaTestEnable = alphaTestEnable;
+ }
+
+ void PixelProcessor::setCullMode(CullMode cullMode, bool frontFacingCCW)   // Stores the cull mode and the front-face winding flag in the context.
+ {
+ context->cullMode = cullMode;
+ context->frontFacingCCW = frontFacingCCW;   // update() copies this into state.frontFaceCCW.
+ }
+
+ void PixelProcessor::setColorWriteMask(int index, int rgbaMask)   // Forwards index and mask to context->setColorWriteMask(); no validation is done here.
+ {
+ context->setColorWriteMask(index, rgbaMask);
+ }
+
+ void PixelProcessor::setStencilEnable(bool stencilEnable)   // Stored directly in context->stencilEnable.
+ {
+ context->stencilEnable = stencilEnable;
+ }
+
+ void PixelProcessor::setStencilCompare(StencilCompareMode stencilCompareMode)   // Stored in the context; update() copies it into the state while stencil is active.
+ {
+ context->stencilCompareMode = stencilCompareMode;
+ }
+
+ void PixelProcessor::setStencilReference(int stencilReference)   // Stores the reference in the context and refreshes the `stencil` member.
+ {
+ context->stencilReference = stencilReference;
+ stencil.set(stencilReference, context->stencilMask, context->stencilWriteMask);   // Reference, mask and write mask are always passed together.
+ }
+
+ void PixelProcessor::setStencilReferenceCCW(int stencilReferenceCCW)   // CCW counterpart: stores the reference in the context and refreshes `stencilCCW`.
+ {
+ context->stencilReferenceCCW = stencilReferenceCCW;
+ stencilCCW.set(stencilReferenceCCW, context->stencilMaskCCW, context->stencilWriteMaskCCW);   // Reference, mask and write mask are always passed together.
+ }
+
+ void PixelProcessor::setStencilMask(int stencilMask)   // Stores the mask in the context and refreshes the `stencil` member.
+ {
+ context->stencilMask = stencilMask;   // update() checks this value against 0xFF.
+ stencil.set(context->stencilReference, stencilMask, context->stencilWriteMask);
+ }
+
+ void PixelProcessor::setStencilMaskCCW(int stencilMaskCCW)   // CCW counterpart: stores the mask in the context and refreshes `stencilCCW`.
+ {
+ context->stencilMaskCCW = stencilMaskCCW;   // update() checks this value against 0xFF when two-sided stencil is on.
+ stencilCCW.set(context->stencilReferenceCCW, stencilMaskCCW, context->stencilWriteMaskCCW);
+ }
+
+ void PixelProcessor::setStencilFailOperation(StencilOperation stencilFailOperation)   // Stored in the context; update() copies it into the state while stencil is active.
+ {
+ context->stencilFailOperation = stencilFailOperation;
+ }
+
+ void PixelProcessor::setStencilPassOperation(StencilOperation stencilPassOperation)   // Stored in the context; update() copies it into the state while stencil is active.
+ {
+ context->stencilPassOperation = stencilPassOperation;
+ }
+
+ void PixelProcessor::setStencilZFailOperation(StencilOperation stencilZFailOperation)   // Stored in the context; update() copies it into the state while stencil is active.
+ {
+ context->stencilZFailOperation = stencilZFailOperation;
+ }
+
+ void PixelProcessor::setStencilWriteMask(int stencilWriteMask)   // Stores the write mask in the context and refreshes the `stencil` member.
+ {
+ context->stencilWriteMask = stencilWriteMask;   // update() checks this value against 0xFF and 0x00.
+ stencil.set(context->stencilReference, context->stencilMask, stencilWriteMask);
+ }
+
+ void PixelProcessor::setStencilWriteMaskCCW(int stencilWriteMaskCCW)   // CCW counterpart: stores the write mask in the context and refreshes `stencilCCW`.
+ {
+ context->stencilWriteMaskCCW = stencilWriteMaskCCW;   // update() checks this value against 0xFF and 0x00 when two-sided stencil is on.
+ stencilCCW.set(context->stencilReferenceCCW, context->stencilMaskCCW, stencilWriteMaskCCW);
+ }
+
+ void PixelProcessor::setTwoSidedStencil(bool enable)   // Stored in the context; when false, update() reuses the non-CCW stencil settings for the CCW fields.
+ {
+ context->twoSidedStencil = enable;
+ }
+
+ void PixelProcessor::setStencilCompareCCW(StencilCompareMode stencilCompareMode)   // Stored in context->stencilCompareModeCCW; only used by update() when two-sided stencil is on.
+ {
+ context->stencilCompareModeCCW = stencilCompareMode;
+ }
+
+ void PixelProcessor::setStencilFailOperationCCW(StencilOperation stencilFailOperation)   // Stored in context->stencilFailOperationCCW; only used by update() when two-sided stencil is on.
+ {
+ context->stencilFailOperationCCW = stencilFailOperation;
+ }
+
+ void PixelProcessor::setStencilPassOperationCCW(StencilOperation stencilPassOperation)   // Stored in context->stencilPassOperationCCW; only used by update() when two-sided stencil is on.
+ {
+ context->stencilPassOperationCCW = stencilPassOperation;
+ }
+
+ void PixelProcessor::setStencilZFailOperationCCW(StencilOperation stencilZFailOperation)   // Stored in context->stencilZFailOperationCCW; only used by update() when two-sided stencil is on.
+ {
+ context->stencilZFailOperationCCW = stencilZFailOperation;
+ }
+
+ void PixelProcessor::setTextureFactor(const Color<float> &textureFactor)   // Converts each channel to fixed point (1.0 becomes 4095) and stores it four times in factor.textureFactor4.
+ {
+ // FIXME: Compact into generic function // FIXME: Clamp
+ short textureFactorR = iround(4095 * textureFactor.r);   // No clamping is applied (see FIXME above); the result is narrowed to short.
+ short textureFactorG = iround(4095 * textureFactor.g);
+ short textureFactorB = iround(4095 * textureFactor.b);
+ short textureFactorA = iround(4095 * textureFactor.a);
+
+ factor.textureFactor4[0][0] = textureFactorR;   // Row 0: red, written to all four entries.
+ factor.textureFactor4[0][1] = textureFactorR;
+ factor.textureFactor4[0][2] = textureFactorR;
+ factor.textureFactor4[0][3] = textureFactorR;
+
+ factor.textureFactor4[1][0] = textureFactorG;   // Row 1: green.
+ factor.textureFactor4[1][1] = textureFactorG;
+ factor.textureFactor4[1][2] = textureFactorG;
+ factor.textureFactor4[1][3] = textureFactorG;
+
+ factor.textureFactor4[2][0] = textureFactorB;   // Row 2: blue.
+ factor.textureFactor4[2][1] = textureFactorB;
+ factor.textureFactor4[2][2] = textureFactorB;
+ factor.textureFactor4[2][3] = textureFactorB;
+
+ factor.textureFactor4[3][0] = textureFactorA;   // Row 3: alpha.
+ factor.textureFactor4[3][1] = textureFactorA;
+ factor.textureFactor4[3][2] = textureFactorA;
+ factor.textureFactor4[3][3] = textureFactorA;
+ }
+
+ void PixelProcessor::setBlendConstant(const Color<float> &blendConstant)   // Stores c and 1 - c per channel, as 16-bit fixed point (4W) and as float (4F), each written four times.
+ {
+     // FIXME: Compact into generic function // FIXME: Clamp
+     word blendConstantR = static_cast<word>(iround(65535 * blendConstant.r));   // Unsigned 16 bits: 65535 * c does not fit a signed short once c > 0.5.
+     word blendConstantG = static_cast<word>(iround(65535 * blendConstant.g));
+     word blendConstantB = static_cast<word>(iround(65535 * blendConstant.b));
+     word blendConstantA = static_cast<word>(iround(65535 * blendConstant.a));
+
+     factor.blendConstant4W[0][0] = blendConstantR;
+     factor.blendConstant4W[0][1] = blendConstantR;
+     factor.blendConstant4W[0][2] = blendConstantR;
+     factor.blendConstant4W[0][3] = blendConstantR;
+
+     factor.blendConstant4W[1][0] = blendConstantG;
+     factor.blendConstant4W[1][1] = blendConstantG;
+     factor.blendConstant4W[1][2] = blendConstantG;
+     factor.blendConstant4W[1][3] = blendConstantG;
+
+     factor.blendConstant4W[2][0] = blendConstantB;
+     factor.blendConstant4W[2][1] = blendConstantB;
+     factor.blendConstant4W[2][2] = blendConstantB;
+     factor.blendConstant4W[2][3] = blendConstantB;
+
+     factor.blendConstant4W[3][0] = blendConstantA;
+     factor.blendConstant4W[3][1] = blendConstantA;
+     factor.blendConstant4W[3][2] = blendConstantA;
+     factor.blendConstant4W[3][3] = blendConstantA;
+
+     // FIXME: Compact into generic function // FIXME: Clamp
+     word invBlendConstantR = static_cast<word>(iround(65535 * (1 - blendConstant.r)));   // Same 16-bit scaling for the complement 1 - c.
+     word invBlendConstantG = static_cast<word>(iround(65535 * (1 - blendConstant.g)));
+     word invBlendConstantB = static_cast<word>(iround(65535 * (1 - blendConstant.b)));
+     word invBlendConstantA = static_cast<word>(iround(65535 * (1 - blendConstant.a)));
+
+     factor.invBlendConstant4W[0][0] = invBlendConstantR;
+     factor.invBlendConstant4W[0][1] = invBlendConstantR;
+     factor.invBlendConstant4W[0][2] = invBlendConstantR;
+     factor.invBlendConstant4W[0][3] = invBlendConstantR;
+
+     factor.invBlendConstant4W[1][0] = invBlendConstantG;
+     factor.invBlendConstant4W[1][1] = invBlendConstantG;
+     factor.invBlendConstant4W[1][2] = invBlendConstantG;
+     factor.invBlendConstant4W[1][3] = invBlendConstantG;
+
+     factor.invBlendConstant4W[2][0] = invBlendConstantB;
+     factor.invBlendConstant4W[2][1] = invBlendConstantB;
+     factor.invBlendConstant4W[2][2] = invBlendConstantB;
+     factor.invBlendConstant4W[2][3] = invBlendConstantB;
+
+     factor.invBlendConstant4W[3][0] = invBlendConstantA;
+     factor.invBlendConstant4W[3][1] = invBlendConstantA;
+     factor.invBlendConstant4W[3][2] = invBlendConstantA;
+     factor.invBlendConstant4W[3][3] = invBlendConstantA;
+
+     factor.blendConstant4F[0][0] = blendConstant.r;   // Float copies are stored unscaled and unclamped.
+     factor.blendConstant4F[0][1] = blendConstant.r;
+     factor.blendConstant4F[0][2] = blendConstant.r;
+     factor.blendConstant4F[0][3] = blendConstant.r;
+
+     factor.blendConstant4F[1][0] = blendConstant.g;
+     factor.blendConstant4F[1][1] = blendConstant.g;
+     factor.blendConstant4F[1][2] = blendConstant.g;
+     factor.blendConstant4F[1][3] = blendConstant.g;
+
+     factor.blendConstant4F[2][0] = blendConstant.b;
+     factor.blendConstant4F[2][1] = blendConstant.b;
+     factor.blendConstant4F[2][2] = blendConstant.b;
+     factor.blendConstant4F[2][3] = blendConstant.b;
+
+     factor.blendConstant4F[3][0] = blendConstant.a;
+     factor.blendConstant4F[3][1] = blendConstant.a;
+     factor.blendConstant4F[3][2] = blendConstant.a;
+     factor.blendConstant4F[3][3] = blendConstant.a;
+
+     factor.invBlendConstant4F[0][0] = 1 - blendConstant.r;
+     factor.invBlendConstant4F[0][1] = 1 - blendConstant.r;
+     factor.invBlendConstant4F[0][2] = 1 - blendConstant.r;
+     factor.invBlendConstant4F[0][3] = 1 - blendConstant.r;
+
+     factor.invBlendConstant4F[1][0] = 1 - blendConstant.g;
+     factor.invBlendConstant4F[1][1] = 1 - blendConstant.g;
+     factor.invBlendConstant4F[1][2] = 1 - blendConstant.g;
+     factor.invBlendConstant4F[1][3] = 1 - blendConstant.g;
+
+     factor.invBlendConstant4F[2][0] = 1 - blendConstant.b;
+     factor.invBlendConstant4F[2][1] = 1 - blendConstant.b;
+     factor.invBlendConstant4F[2][2] = 1 - blendConstant.b;
+     factor.invBlendConstant4F[2][3] = 1 - blendConstant.b;
+
+     factor.invBlendConstant4F[3][0] = 1 - blendConstant.a;
+     factor.invBlendConstant4F[3][1] = 1 - blendConstant.a;
+     factor.invBlendConstant4F[3][2] = 1 - blendConstant.a;
+     factor.invBlendConstant4F[3][3] = 1 - blendConstant.a;
+ }
+
+ void PixelProcessor::setFillMode(FillMode fillMode)   // Stored directly in context->fillMode.
+ {
+ context->fillMode = fillMode;
+ }
+
+ void PixelProcessor::setShadingMode(ShadingMode shadingMode)   // Stored in the context; update() compares it against SHADING_FLAT.
+ {
+ context->shadingMode = shadingMode;
+ }
+
+ void PixelProcessor::setAlphaBlendEnable(bool alphaBlendEnable)   // Forwards the flag to context->setAlphaBlendEnable().
+ {
+ context->setAlphaBlendEnable(alphaBlendEnable);
+ }
+
+ void PixelProcessor::setSourceBlendFactor(BlendFactor sourceBlendFactor)   // Forwards the factor to context->setSourceBlendFactor().
+ {
+ context->setSourceBlendFactor(sourceBlendFactor);
+ }
+
+ void PixelProcessor::setDestBlendFactor(BlendFactor destBlendFactor)   // Forwards the factor to context->setDestBlendFactor().
+ {
+ context->setDestBlendFactor(destBlendFactor);
+ }
+
+ void PixelProcessor::setBlendOperation(BlendOperation blendOperation)   // Forwards the operation to context->setBlendOperation().
+ {
+ context->setBlendOperation(blendOperation);
+ }
+
+ void PixelProcessor::setSeparateAlphaBlendEnable(bool separateAlphaBlendEnable)   // Forwards the flag to context->setSeparateAlphaBlendEnable().
+ {
+ context->setSeparateAlphaBlendEnable(separateAlphaBlendEnable);
+ }
+
+ void PixelProcessor::setSourceBlendFactorAlpha(BlendFactor sourceBlendFactorAlpha)   // Forwards the factor to context->setSourceBlendFactorAlpha().
+ {
+ context->setSourceBlendFactorAlpha(sourceBlendFactorAlpha);
+ }
+
+ void PixelProcessor::setDestBlendFactorAlpha(BlendFactor destBlendFactorAlpha)   // Forwards the factor to context->setDestBlendFactorAlpha().
+ {
+ context->setDestBlendFactorAlpha(destBlendFactorAlpha);
+ }
+
+ void PixelProcessor::setBlendOperationAlpha(BlendOperation blendOperationAlpha)   // Forwards the operation to context->setBlendOperationAlpha().
+ {
+ context->setBlendOperationAlpha(blendOperationAlpha);
+ }
+
+ void PixelProcessor::setAlphaReference(float alphaReference)
+ {
+     context->alphaReference = alphaReference;
+
+     // All four lanes carry the same reference, rescaled by 0x1000 / 0xFF.
+     const word scaled = (word)iround(alphaReference * 0x1000 / 0xFF);
+
+     for(int lane = 0; lane < 4; lane++) factor.alphaReference4[lane] = scaled;
+ }
+
+ void PixelProcessor::setGlobalMipmapBias(float bias)   // Forwards the bias to context->setGlobalMipmapBias(); the constructor calls this with 0.0f.
+ {
+ context->setGlobalMipmapBias(bias);
+ }
+
+ void PixelProcessor::setFogStart(float start)   // Changes only the start; the end is re-read from the context and both go through setFogRanges().
+ {
+ setFogRanges(start, context->fogEnd);
+ }
+
+ void PixelProcessor::setFogEnd(float end)   // Changes only the end; the start is re-read from the context and both go through setFogRanges().
+ {
+ setFogRanges(context->fogStart, end);
+ }
+
+ void PixelProcessor::setFogColor(Color<float> fogColor)   // Stores the RGB fog colour as replicated 16-bit values (color4) and as replicated floats (colorF); alpha is not used.
+ {
+ // TODO: Compact into generic function
+ word fogR = (unsigned short)(65535 * fogColor.r);   // The cast truncates (no rounding) and nothing is clamped here.
+ word fogG = (unsigned short)(65535 * fogColor.g);
+ word fogB = (unsigned short)(65535 * fogColor.b);
+
+ fog.color4[0][0] = fogR;   // Row 0: red, written to all four entries.
+ fog.color4[0][1] = fogR;
+ fog.color4[0][2] = fogR;
+ fog.color4[0][3] = fogR;
+
+ fog.color4[1][0] = fogG;   // Row 1: green.
+ fog.color4[1][1] = fogG;
+ fog.color4[1][2] = fogG;
+ fog.color4[1][3] = fogG;
+
+ fog.color4[2][0] = fogB;   // Row 2: blue.
+ fog.color4[2][1] = fogB;
+ fog.color4[2][2] = fogB;
+ fog.color4[2][3] = fogB;
+
+ fog.colorF[0] = replicate(fogColor.r);   // Float copies are passed through replicate() unscaled.
+ fog.colorF[1] = replicate(fogColor.g);
+ fog.colorF[2] = replicate(fogColor.b);
+ }
+
+ void PixelProcessor::setFogDensity(float fogDensity)   // Precomputes -d * log2(e) and -d * d * log2(e), each passed through replicate().
+ {
+ fog.densityE = replicate(-fogDensity * 1.442695f); // 1/e^x = 2^(-x*1.44)
+ fog.density2E = replicate(-fogDensity * fogDensity * 1.442695f);   // Same base-2 conversion, using the squared density.
+ }
+
+ void PixelProcessor::setPixelFogMode(FogMode fogMode)   // Stored directly in context->pixelFogMode.
+ {
+ context->pixelFogMode = fogMode;
+ }
+
+ void PixelProcessor::setPerspectiveCorrection(bool perspectiveEnable)   // Writes the namespace-scope flag sw::perspectiveCorrection, not a member of this object or of the context.
+ {
+ perspectiveCorrection = perspectiveEnable;
+ }
+
+ void PixelProcessor::setOcclusionEnabled(bool enable)   // Stored in the context; update() copies it into state.occlusionEnabled.
+ {
+ context->occlusionEnabled = enable;
+ }
+
+ void PixelProcessor::setRoutineCacheSize(int cacheSize)   // Deletes the current routine cache and creates a new one with the size clamped to [1, 65536].
+ {
+     delete routineCache;   // Deleting a null pointer is a no-op, which covers the first call from the constructor.
+     routineCache = new RoutineCache<State>(clamp(cacheSize, 1, 65536), precachePixel ? "sw-pixel" : nullptr);
+ }
+
+ void PixelProcessor::setFogRanges(float start, float end)
+ {
+     // The context keeps the values exactly as supplied by the caller.
+     context->fogStart = start;
+     context->fogEnd = end;
+
+     // Hack: ensure there is a small range when start and end coincide.
+     const float farEnd = (start == end) ? end + 0.001f : end;
+
+     // Chosen so that x * scale + offset equals (farEnd - x) / (farEnd - start).
+     const float fogScale = -1.0f / (farEnd - start);
+     const float fogOffset = farEnd * -fogScale;
+
+     fog.scale = replicate(fogScale);
+     fog.offset = replicate(fogOffset);
+ }
+
+ const PixelProcessor::State PixelProcessor::update() const   // Snapshots the context into a hashed State; routine() uses that State as the routine cache key.
+ {
+     State state;   // Zeroed by State::State(); every field not assigned below stays 0.
+
+     if(context->pixelShader)
+     {
+         state.shaderID = context->pixelShader->getSerialID();
+     }
+     else
+     {
+         state.shaderID = 0;
+     }
+
+     state.depthOverride = context->pixelShader && context->pixelShader->depthOverride();
+     state.shaderContainsKill = context->pixelShader ? context->pixelShader->containsKill() : false;
+
+     if(context->alphaTestActive())
+     {
+         state.alphaCompareMode = context->alphaCompareMode;
+
+         state.transparencyAntialiasing = context->getMultiSampleCount() > 1 ? transparencyAntialiasing : TRANSPARENCY_NONE;
+     }
+
+     state.depthWriteEnable = context->depthWriteActive();
+
+     if(context->stencilActive())
+     {
+         state.stencilActive = true;
+         state.stencilCompareMode = context->stencilCompareMode;
+         state.stencilFailOperation = context->stencilFailOperation;
+         state.stencilPassOperation = context->stencilPassOperation;
+         state.stencilZFailOperation = context->stencilZFailOperation;
+         state.noStencilMask = (context->stencilMask == 0xFF);
+         state.noStencilWriteMask = (context->stencilWriteMask == 0xFF);
+         state.stencilWriteMasked = (context->stencilWriteMask == 0x00);
+
+         state.twoSidedStencil = context->twoSidedStencil;   // Without two-sided stencil, the CCW fields mirror the values set above.
+         state.stencilCompareModeCCW = context->twoSidedStencil ? context->stencilCompareModeCCW : state.stencilCompareMode;
+         state.stencilFailOperationCCW = context->twoSidedStencil ? context->stencilFailOperationCCW : state.stencilFailOperation;
+         state.stencilPassOperationCCW = context->twoSidedStencil ? context->stencilPassOperationCCW : state.stencilPassOperation;
+         state.stencilZFailOperationCCW = context->twoSidedStencil ? context->stencilZFailOperationCCW : state.stencilZFailOperation;
+         state.noStencilMaskCCW = context->twoSidedStencil ? (context->stencilMaskCCW == 0xFF) : state.noStencilMask;
+         state.noStencilWriteMaskCCW = context->twoSidedStencil ? (context->stencilWriteMaskCCW == 0xFF) : state.noStencilWriteMask;
+         state.stencilWriteMaskedCCW = context->twoSidedStencil ? (context->stencilWriteMaskCCW == 0x00) : state.stencilWriteMasked;
+     }
+
+     if(context->depthBufferActive())
+     {
+         state.depthTestActive = true;
+         state.depthCompareMode = context->depthCompareMode;
+         state.quadLayoutDepthBuffer = Surface::hasQuadLayout(context->depthBuffer->getInternalFormat());
+     }
+
+     state.occlusionEnabled = context->occlusionEnabled;
+
+     state.fogActive = context->fogActive();
+     state.pixelFogMode = context->pixelFogActive();
+     state.wBasedFog = context->wBasedFog && context->pixelFogActive() != FOG_NONE;
+     state.perspective = context->perspectiveActive();
+     state.depthClamp = (context->depthBias != 0.0f) || (context->slopeDepthBias != 0.0f);
+
+     if(context->alphaBlendActive())
+     {
+         state.alphaBlendActive = true;
+         state.sourceBlendFactor = context->sourceBlendFactor();
+         state.destBlendFactor = context->destBlendFactor();
+         state.blendOperation = context->blendOperation();
+         state.sourceBlendFactorAlpha = context->sourceBlendFactorAlpha();
+         state.destBlendFactorAlpha = context->destBlendFactorAlpha();
+         state.blendOperationAlpha = context->blendOperationAlpha();
+     }
+
+     state.logicalOperation = context->colorLogicOp();
+
+     for(int i = 0; i < RENDERTARGETS; i++)
+     {
+         state.colorWriteMask |= context->colorWriteActive(i) << (4 * i);   // Four mask bits per render target.
+         state.targetFormat[i] = context->renderTargetInternalFormat(i);
+     }
+
+     state.writeSRGB = context->writeSRGB && context->renderTarget[0] && Surface::isSRGBwritable(context->renderTarget[0]->getExternalFormat());
+     state.multiSample = context->getMultiSampleCount();
+     state.multiSampleMask = context->multiSampleMask;
+
+     if(state.multiSample > 1 && context->pixelShader)
+     {
+         state.centroid = context->pixelShader->containsCentroid();
+     }
+
+     state.frontFaceCCW = context->frontFacingCCW;
+
+     if(!context->pixelShader)   // Texture stage state is only captured when no pixel shader is bound.
+     {
+         for(unsigned int i = 0; i < 8; i++)
+         {
+             state.textureStage[i] = context->textureStage[i].textureStageState();
+         }
+
+         state.specularAdd = context->specularActive() && context->specularEnable;
+     }
+
+     for(unsigned int i = 0; i < 16; i++)
+     {
+         if(context->pixelShader)
+         {
+             if(context->pixelShader->usesSampler(i))
+             {
+                 state.sampler[i] = context->sampler[i].samplerState();
+             }
+         }
+         else
+         {
+             if(i < 8 && state.textureStage[i].stageOperation != TextureStage::STAGE_DISABLE)
+             {
+                 state.sampler[i] = context->sampler[i].samplerState();
+             }
+             else break;   // The first disabled stage ends the sampler scan.
+         }
+     }
+
+     const bool point = context->isDrawPoint(true);
+     const bool sprite = context->pointSpriteActive();
+     const bool flatShading = (context->shadingMode == SHADING_FLAT) || point;
+
+     if(context->pixelShaderModel() < 0x0300)
+     {
+         for(int coordinate = 0; coordinate < 8; coordinate++)
+         {
+             for(int component = 0; component < 4; component++)
+             {
+                 if(context->textureActive(coordinate, component))
+                 {
+                     state.texture[coordinate].component |= 1 << component;
+
+                     if(point && !sprite)
+                     {
+                         state.texture[coordinate].flat |= 1 << component;
+                     }
+                 }
+             }
+
+             if(context->textureTransformProject[coordinate] && context->pixelShaderModel() <= 0x0103)
+             {
+                 if(context->textureTransformCount[coordinate] == 2)
+                 {
+                     state.texture[coordinate].project = 1;
+                 }
+                 else if(context->textureTransformCount[coordinate] == 3)
+                 {
+                     state.texture[coordinate].project = 2;
+                 }
+                 else if(context->textureTransformCount[coordinate] == 4 || context->textureTransformCount[coordinate] == 0)
+                 {
+                     state.texture[coordinate].project = 3;
+                 }
+             }
+         }
+
+         for(int color = 0; color < 2; color++)
+         {
+             for(int component = 0; component < 4; component++)
+             {
+                 if(context->colorActive(color, component))
+                 {
+                     state.color[color].component |= 1 << component;
+
+                     if(flatShading)   // flatShading already includes the point case.
+                     {
+                         state.color[color].flat |= 1 << component;
+                     }
+                 }
+             }
+         }
+
+         if(context->fogActive())
+         {
+             state.fog.component = true;
+
+             if(point)
+             {
+                 state.fog.flat = true;
+             }
+         }
+     }
+     else   // NOTE(review): assumes pixelShaderModel() >= 0x0300 implies a bound pixel shader — confirm.
+     {
+         for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+         {
+             for(int component = 0; component < 4; component++)
+             {
+                 const Shader::Semantic &semantic = context->pixelShader->getInput(interpolant, component);
+
+                 if(semantic.active())
+                 {
+                     bool flat = point;   // Default for every usage not listed in the switch below.
+
+                     switch(semantic.usage)
+                     {
+                     case Shader::USAGE_TEXCOORD: flat = point && !sprite; break;
+                     case Shader::USAGE_COLOR: flat = semantic.flat || flatShading; break;
+                     }
+
+                     state.interpolant[interpolant].component |= 1 << component;
+
+                     if(flat)
+                     {
+                         state.interpolant[interpolant].flat |= 1 << component;
+                     }
+                 }
+             }
+         }
+     }
+
+     if(state.centroid)   // Only set above when a pixel shader is bound, so pixelShader is non-null here.
+     {
+         // The centroid flag is stored once per interpolant and always read from
+         // component 0, so a single assignment per interpolant is sufficient (the
+         // former inner per-component loop repeated the same store four times).
+         for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+         {
+             state.interpolant[interpolant].centroid = context->pixelShader->getInput(interpolant, 0).centroid;
+         }
+     }
+
+     state.hash = state.computeHash();   // Must come last: the hash covers everything written above.
+
+     return state;
+ }
+
+ Routine *PixelProcessor::routine(const State &state)
+ {
+     // Cache hit: hand back the routine previously stored for this state.
+     Routine *cached = routineCache->query(state);
+     if(cached)
+     {
+         return cached;
+     }
+
+     // Cache miss: shader models up to 0x0104 are generated by PixelPipeline,
+     // everything else by PixelProgram.
+     QuadRasterizer *generator = nullptr;
+     if(context->pixelShaderModel() <= 0x0104)
+     {
+         generator = new PixelPipeline(state, context->pixelShader);
+     }
+     else
+     {
+         generator = new PixelProgram(state, context->pixelShader);
+     }
+
+     generator->generate();
+     Routine *generated = (*generator)(L"PixelRoutine_%0.8X", state.shaderID);
+     delete generator;
+     routineCache->add(state, generated);
+     return generated;
+ }
+}
diff --git a/src/Device/PixelProcessor.hpp b/src/Device/PixelProcessor.hpp
new file mode 100644
index 0000000..98300de
--- /dev/null
+++ b/src/Device/PixelProcessor.hpp
@@ -0,0 +1,342 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_PixelProcessor_hpp
+#define sw_PixelProcessor_hpp
+
+#include "Context.hpp"
+#include "RoutineCache.hpp"
+
+namespace sw
+{
+ class PixelShader;
+ class Rasterizer;
+ struct Texture;
+ struct DrawData;
+
+ class PixelProcessor
+ {
+ public:
+ struct States // Bit-packed description of the pixel pipeline configuration; State (below) adds the hash and comparison.
+ {
+ unsigned int computeHash(); // update() stores the result in State::hash.
+
+ int shaderID; // Also used by PixelProcessor::routine() to name the generated routine.
+
+ bool depthOverride : 1; // TODO: Eliminate by querying shader.
+ bool shaderContainsKill : 1; // TODO: Eliminate by querying shader.
+
+ DepthCompareMode depthCompareMode : BITS(DEPTH_LAST); // BITS(X_LAST): presumably the bit count needed for the largest enum value - confirm.
+ AlphaCompareMode alphaCompareMode : BITS(ALPHA_LAST);
+ bool depthWriteEnable : 1;
+ bool quadLayoutDepthBuffer : 1;
+
+ bool stencilActive : 1;
+ StencilCompareMode stencilCompareMode : BITS(STENCIL_LAST);
+ StencilOperation stencilFailOperation : BITS(OPERATION_LAST);
+ StencilOperation stencilPassOperation : BITS(OPERATION_LAST);
+ StencilOperation stencilZFailOperation : BITS(OPERATION_LAST);
+ bool noStencilMask : 1;
+ bool noStencilWriteMask : 1;
+ bool stencilWriteMasked : 1;
+ bool twoSidedStencil : 1;
+ StencilCompareMode stencilCompareModeCCW : BITS(STENCIL_LAST); // The *CCW fields mirror the stencil fields above for the second face.
+ StencilOperation stencilFailOperationCCW : BITS(OPERATION_LAST);
+ StencilOperation stencilPassOperationCCW : BITS(OPERATION_LAST);
+ StencilOperation stencilZFailOperationCCW : BITS(OPERATION_LAST);
+ bool noStencilMaskCCW : 1;
+ bool noStencilWriteMaskCCW : 1;
+ bool stencilWriteMaskedCCW : 1;
+
+ bool depthTestActive : 1;
+ bool fogActive : 1;
+ FogMode pixelFogMode : BITS(FOG_LAST);
+ bool specularAdd : 1;
+ bool occlusionEnabled : 1;
+ bool wBasedFog : 1;
+ bool perspective : 1;
+ bool depthClamp : 1;
+
+ bool alphaBlendActive : 1;
+ BlendFactor sourceBlendFactor : BITS(BLEND_LAST);
+ BlendFactor destBlendFactor : BITS(BLEND_LAST);
+ BlendOperation blendOperation : BITS(BLENDOP_LAST);
+ BlendFactor sourceBlendFactorAlpha : BITS(BLEND_LAST);
+ BlendFactor destBlendFactorAlpha : BITS(BLEND_LAST);
+ BlendOperation blendOperationAlpha : BITS(BLENDOP_LAST);
+
+ unsigned int colorWriteMask : RENDERTARGETS * 4; // Four component bit masks
+ Format targetFormat[RENDERTARGETS];
+ bool writeSRGB : 1;
+ unsigned int multiSample : 3; // Sample count; the rasterizer loops q = 0 .. multiSample - 1.
+ unsigned int multiSampleMask : 4;
+ TransparencyAntialiasing transparencyAntialiasing : BITS(TRANSPARENCY_LAST);
+ bool centroid : 1;
+ bool frontFaceCCW : 1;
+
+ LogicalOperation logicalOperation : BITS(LOGICALOP_LAST);
+
+ Sampler::State sampler[TEXTURE_IMAGE_UNITS];
+ TextureStage::State textureStage[8];
+
+ struct Interpolant // Per-input interpolation flags.
+ {
+ unsigned char component : 4; // Bit n set: component n is active and gets a per-row start value.
+ unsigned char flat : 4; // Bit n set: component n keeps its constant term only (no x/y gradient is added).
+ unsigned char project : 2;
+ bool centroid : 1;
+ };
+
+ union // Two views of the same storage: named members and a flat array.
+ {
+ struct
+ {
+ Interpolant color[2];
+ Interpolant texture[8];
+ Interpolant fog;
+ };
+
+ Interpolant interpolant[MAX_FRAGMENT_INPUTS];
+ };
+ };
+
+ struct State : States // States plus a cached hash; this is the key type of the routine cache.
+ {
+ State();
+
+ bool operator==(const State &state) const;
+
+ int colorWriteActive(int index) const // Returns the 4-bit channel write mask of render target 'index' (zero when nothing is written).
+ {
+ return (colorWriteMask >> (index * 4)) & 0xF;
+ }
+
+ bool alphaTestActive() const // True unless the alpha compare always passes and no transparency antialiasing is selected.
+ {
+ return (alphaCompareMode != ALPHA_ALWAYS) || (transparencyAntialiasing != TRANSPARENCY_NONE);
+ }
+
+ bool pixelFogActive() const // True when any per-pixel fog mode is selected.
+ {
+ return pixelFogMode != FOG_NONE;
+ }
+
+ unsigned int hash; // Set by PixelProcessor::update() from computeHash().
+ };
+
+ struct Stencil // Stencil reference and masks, each byte value replicated into all eight bytes of a 64-bit word.
+ {
+ int64_t testMaskQ;
+ int64_t referenceMaskedQ;
+ int64_t referenceMaskedSignedQ;
+ int64_t writeMaskQ;
+ int64_t invWriteMaskQ;
+ int64_t referenceQ;
+
+ void set(int reference, int testMask, int writeMask) // Recomputes every derived field; only the low 8 bits of each argument are used.
+ {
+ referenceQ = replicate(reference);
+ testMaskQ = replicate(testMask);
+ writeMaskQ = replicate(writeMask);
+ invWriteMaskQ = ~writeMaskQ;
+ referenceMaskedQ = referenceQ & testMaskQ;
+ referenceMaskedSignedQ = replicate(((reference & testMask) + 0x80) & 0xFF); // Masked reference biased by 0x80 (mod 256); presumably for signed byte compares - confirm.
+ }
+
+ static int64_t replicate(int b) // Copies the low byte of 'b' into each of the eight bytes of the result.
+ {
+ int64_t w = b & 0xFF;
+
+ return (w << 0) | (w << 8) | (w << 16) | (w << 24) | (w << 32) | (w << 40) | (w << 48) | (w << 56);
+ }
+ };
+
+ struct Fog // Fog constants in vector form; stored in the 'fog' member of PixelProcessor.
+ {
+ float4 scale;
+ float4 offset;
+ word4 color4[3]; // Fog color, 16-bit lanes; presumably one entry per RGB channel - confirm.
+ float4 colorF[3]; // Fog color, float lanes; presumably one entry per RGB channel - confirm.
+ float4 densityE;
+ float4 density2E;
+ };
+
+ struct Factor // Constant factors in vector form; stored in the 'factor' member of PixelProcessor.
+ {
+ word4 textureFactor4[4];
+
+ word4 alphaReference4;
+
+ word4 blendConstant4W[4]; // Blend constant, 16-bit lanes.
+ float4 blendConstant4F[4]; // Blend constant, float lanes.
+ word4 invBlendConstant4W[4]; // Presumably the complement of the blend constant - confirm in setBlendConstant().
+ float4 invBlendConstant4F[4];
+ };
+
+ public:
+ typedef void (*RoutinePointer)(const Primitive *primitive, int count, int thread, DrawData *draw); // Presumably the entry point signature of a generated pixel routine - confirm.
+
+ PixelProcessor(Context *context);
+
+ virtual ~PixelProcessor();
+
+ void setFloatConstant(unsigned int index, const float value[4]);
+ void setIntegerConstant(unsigned int index, const int value[4]);
+ void setBooleanConstant(unsigned int index, int boolean);
+
+ void setUniformBuffer(int index, sw::Resource* buffer, int offset);
+ void lockUniformBuffers(byte** u, sw::Resource* uniformBuffers[]);
+
+ void setRenderTarget(int index, Surface *renderTarget, unsigned int layer = 0);
+ void setDepthBuffer(Surface *depthBuffer, unsigned int layer = 0);
+ void setStencilBuffer(Surface *stencilBuffer, unsigned int layer = 0);
+ // Texture stage setters; 'stage' indexes a texture stage.
+ void setTexCoordIndex(unsigned int stage, int texCoordIndex);
+ void setStageOperation(unsigned int stage, TextureStage::StageOperation stageOperation);
+ void setFirstArgument(unsigned int stage, TextureStage::SourceArgument firstArgument);
+ void setSecondArgument(unsigned int stage, TextureStage::SourceArgument secondArgument);
+ void setThirdArgument(unsigned int stage, TextureStage::SourceArgument thirdArgument);
+ void setStageOperationAlpha(unsigned int stage, TextureStage::StageOperation stageOperationAlpha);
+ void setFirstArgumentAlpha(unsigned int stage, TextureStage::SourceArgument firstArgumentAlpha);
+ void setSecondArgumentAlpha(unsigned int stage, TextureStage::SourceArgument secondArgumentAlpha);
+ void setThirdArgumentAlpha(unsigned int stage, TextureStage::SourceArgument thirdArgumentAlpha);
+ void setFirstModifier(unsigned int stage, TextureStage::ArgumentModifier firstModifier);
+ void setSecondModifier(unsigned int stage, TextureStage::ArgumentModifier secondModifier);
+ void setThirdModifier(unsigned int stage, TextureStage::ArgumentModifier thirdModifier);
+ void setFirstModifierAlpha(unsigned int stage, TextureStage::ArgumentModifier firstModifierAlpha);
+ void setSecondModifierAlpha(unsigned int stage, TextureStage::ArgumentModifier secondModifierAlpha);
+ void setThirdModifierAlpha(unsigned int stage, TextureStage::ArgumentModifier thirdModifierAlpha);
+ void setDestinationArgument(unsigned int stage, TextureStage::DestinationArgument destinationArgument);
+ void setConstantColor(unsigned int stage, const Color<float> &constantColor);
+ void setBumpmapMatrix(unsigned int stage, int element, float value);
+ void setLuminanceScale(unsigned int stage, float value);
+ void setLuminanceOffset(unsigned int stage, float value);
+ // Sampler setters; 'sampler' indexes a texture sampler.
+ void setTextureFilter(unsigned int sampler, FilterType textureFilter);
+ void setMipmapFilter(unsigned int sampler, MipmapType mipmapFilter);
+ void setGatherEnable(unsigned int sampler, bool enable);
+ void setAddressingModeU(unsigned int sampler, AddressingMode addressingMode);
+ void setAddressingModeV(unsigned int sampler, AddressingMode addressingMode);
+ void setAddressingModeW(unsigned int sampler, AddressingMode addressingMode);
+ void setReadSRGB(unsigned int sampler, bool sRGB);
+ void setMipmapLOD(unsigned int sampler, float bias);
+ void setBorderColor(unsigned int sampler, const Color<float> &borderColor);
+ void setMaxAnisotropy(unsigned int sampler, float maxAnisotropy);
+ void setHighPrecisionFiltering(unsigned int sampler, bool highPrecisionFiltering);
+ void setSwizzleR(unsigned int sampler, SwizzleType swizzleR);
+ void setSwizzleG(unsigned int sampler, SwizzleType swizzleG);
+ void setSwizzleB(unsigned int sampler, SwizzleType swizzleB);
+ void setSwizzleA(unsigned int sampler, SwizzleType swizzleA);
+ void setCompareFunc(unsigned int sampler, CompareFunc compare);
+ void setBaseLevel(unsigned int sampler, int baseLevel);
+ void setMaxLevel(unsigned int sampler, int maxLevel);
+ void setMinLod(unsigned int sampler, float minLod);
+ void setMaxLod(unsigned int sampler, float maxLod);
+ void setSyncRequired(unsigned int sampler, bool isSincRequired); // NOTE(review): 'isSincRequired' looks like a typo for 'isSyncRequired'.
+ // Depth, alpha test, culling and color write setters.
+ void setWriteSRGB(bool sRGB);
+ void setDepthBufferEnable(bool depthBufferEnable);
+ void setDepthCompare(DepthCompareMode depthCompareMode);
+ void setAlphaCompare(AlphaCompareMode alphaCompareMode);
+ void setDepthWriteEnable(bool depthWriteEnable);
+ void setAlphaTestEnable(bool alphaTestEnable);
+ void setCullMode(CullMode cullMode, bool frontFacingCCW);
+ void setColorWriteMask(int index, int rgbaMask);
+
+ void setColorLogicOpEnabled(bool colorLogicOpEnabled);
+ void setLogicalOperation(LogicalOperation logicalOperation);
+ // Stencil setters; the *CCW variants configure the second face.
+ void setStencilEnable(bool stencilEnable);
+ void setStencilCompare(StencilCompareMode stencilCompareMode);
+ void setStencilReference(int stencilReference);
+ void setStencilMask(int stencilMask);
+ void setStencilFailOperation(StencilOperation stencilFailOperation);
+ void setStencilPassOperation(StencilOperation stencilPassOperation);
+ void setStencilZFailOperation(StencilOperation stencilZFailOperation);
+ void setStencilWriteMask(int stencilWriteMask);
+ void setTwoSidedStencil(bool enable);
+ void setStencilCompareCCW(StencilCompareMode stencilCompareMode);
+ void setStencilReferenceCCW(int stencilReference);
+ void setStencilMaskCCW(int stencilMask);
+ void setStencilFailOperationCCW(StencilOperation stencilFailOperation);
+ void setStencilPassOperationCCW(StencilOperation stencilPassOperation);
+ void setStencilZFailOperationCCW(StencilOperation stencilZFailOperation);
+ void setStencilWriteMaskCCW(int stencilWriteMask);
+
+ void setTextureFactor(const Color<float> &textureFactor);
+ void setBlendConstant(const Color<float> &blendConstant);
+
+ void setFillMode(FillMode fillMode);
+ void setShadingMode(ShadingMode shadingMode);
+ // Blending setters.
+ void setAlphaBlendEnable(bool alphaBlendEnable);
+ void setSourceBlendFactor(BlendFactor sourceBlendFactor);
+ void setDestBlendFactor(BlendFactor destBlendFactor);
+ void setBlendOperation(BlendOperation blendOperation);
+
+ void setSeparateAlphaBlendEnable(bool separateAlphaBlendEnable);
+ void setSourceBlendFactorAlpha(BlendFactor sourceBlendFactorAlpha);
+ void setDestBlendFactorAlpha(BlendFactor destBlendFactorAlpha);
+ void setBlendOperationAlpha(BlendOperation blendOperationAlpha);
+
+ void setAlphaReference(float alphaReference);
+
+ void setGlobalMipmapBias(float bias);
+ // Fog setters.
+ void setFogStart(float start);
+ void setFogEnd(float end);
+ void setFogColor(Color<float> fogColor); // NOTE(review): taken by value, unlike the other Color<float> setters which take const references.
+ void setFogDensity(float fogDensity);
+ void setPixelFogMode(FogMode fogMode);
+
+ void setPerspectiveCorrection(bool perspectiveCorrection);
+
+ void setOcclusionEnabled(bool enable);
+
+ protected:
+ const State update() const; // Builds the State (including its hash) from the current context.
+ Routine *routine(const State &state); // Returns the cached routine for 'state', generating it on a miss.
+ void setRoutineCacheSize(int routineCacheSize);
+
+ // Shader constants
+ word4 cW[8][4];
+ float4 c[FRAGMENT_UNIFORM_VECTORS];
+ int4 i[16];
+ bool b[16];
+
+ // Other semi-constants
+ Stencil stencil;
+ Stencil stencilCCW;
+ Fog fog;
+ Factor factor;
+
+ private:
+ struct UniformBufferInfo // One uniform buffer binding: the resource and an offset into it.
+ {
+ UniformBufferInfo();
+
+ Resource* buffer;
+ int offset;
+ };
+ UniformBufferInfo uniformBufferInfo[MAX_UNIFORM_BUFFER_BINDINGS];
+
+ void setFogRanges(float start, float end);
+
+ Context *const context; // Pointer fixed at construction; routine() and update() read shader information through it.
+
+ RoutineCache<State> *routineCache; // Queried and filled by routine().
+ };
+}
+
+#endif // sw_PixelProcessor_hpp
diff --git a/src/Device/Plane.cpp b/src/Device/Plane.cpp
new file mode 100644
index 0000000..095b7f2
--- /dev/null
+++ b/src/Device/Plane.cpp
@@ -0,0 +1,60 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Plane.hpp"
+
+#include "Matrix.hpp"
+
+namespace sw
+{
+ Plane::Plane() // Leaves A, B, C and D uninitialized.
+ {
+ }
+
+ Plane::Plane(float p_A, float p_B, float p_C, float p_D) // Builds the plane from its four coefficients.
+ {
+ A = p_A;
+ B = p_B;
+ C = p_C;
+ D = p_D;
+ }
+
+ Plane::Plane(const float ABCD[4]) // Builds the plane from an array holding A, B, C, D in that order.
+ {
+ A = ABCD[0];
+ B = ABCD[1];
+ C = ABCD[2];
+ D = ABCD[3];
+ }
+
+ Plane operator*(const Plane &p, const Matrix &T) // Transforms plane 'p' by 'T' (post-multiply form).
+ {
+ Matrix M = !T; // operator! on Matrix: presumably the inverse - confirm in Matrix.hpp.
+ // Each output coefficient is the dot product of one ROW of M with (A, B, C, D).
+ return Plane(p.A * M(1, 1) + p.B * M(1, 2) + p.C * M(1, 3) + p.D * M(1, 4),
+ p.A * M(2, 1) + p.B * M(2, 2) + p.C * M(2, 3) + p.D * M(2, 4),
+ p.A * M(3, 1) + p.B * M(3, 2) + p.C * M(3, 3) + p.D * M(3, 4),
+ p.A * M(4, 1) + p.B * M(4, 2) + p.C * M(4, 3) + p.D * M(4, 4));
+ }
+
+ Plane operator*(const Matrix &T, const Plane &p) // Transforms plane 'p' by 'T' (pre-multiply form).
+ {
+ Matrix M = !T; // operator! on Matrix: presumably the inverse - confirm in Matrix.hpp.
+ // Each output coefficient is the dot product of one COLUMN of M with (A, B, C, D).
+ return Plane(M(1, 1) * p.A + M(2, 1) * p.B + M(3, 1) * p.C + M(4, 1) * p.D,
+ M(1, 2) * p.A + M(2, 2) * p.B + M(3, 2) * p.C + M(4, 2) * p.D,
+ M(1, 3) * p.A + M(2, 3) * p.B + M(3, 3) * p.C + M(4, 3) * p.D,
+ M(1, 4) * p.A + M(2, 4) * p.B + M(3, 4) * p.C + M(4, 4) * p.D);
+ }
+}
diff --git a/src/Device/Plane.hpp b/src/Device/Plane.hpp
new file mode 100644
index 0000000..962b9ae
--- /dev/null
+++ b/src/Device/Plane.hpp
@@ -0,0 +1,40 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Plane_hpp
+#define Plane_hpp
+
+#include "Vector.hpp"
+
+namespace sw
+{
+ struct Matrix;
+
+ struct Plane // Four plane coefficients A, B, C, D.
+ {
+ float A;
+ float B;
+ float C;
+ float D;
+
+ Plane(); // Does not initialize the coefficients.
+ Plane(float A, float B, float C, float D); // Plane equation
+ Plane(const float ABCD[4]); // Reads A, B, C, D from ABCD[0..3].
+
+ friend Plane operator*(const Plane &p, const Matrix &A); // Transform plane by matrix (post-multiply)
+ friend Plane operator*(const Matrix &A, const Plane &p); // Transform plane by matrix (pre-multiply)
+ };
+}
+
+#endif // Plane_hpp
diff --git a/src/Device/Point.cpp b/src/Device/Point.cpp
new file mode 100644
index 0000000..e7e33dd
--- /dev/null
+++ b/src/Device/Point.cpp
@@ -0,0 +1,92 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Point.hpp"
+
+#include "Matrix.hpp"
+
+namespace sw
+{
+ Point &Point::operator+=(const Vector &v) // Translates this point by v in place.
+ {
+ x += v.x;
+ y += v.y;
+ z += v.z;
+
+ return *this;
+ }
+
+ Point &Point::operator-=(const Vector &v) // Translates this point by -v in place.
+ {
+ x -= v.x;
+ y -= v.y;
+ z -= v.z;
+
+ return *this;
+ }
+
+ Point operator+(const Point &P, const Vector &v) // Returns P translated by v.
+ {
+ return Point(P.x + v.x, P.y + v.y, P.z + v.z);
+ }
+
+ Point operator-(const Point &P, const Vector &v) // Returns P translated by -v.
+ {
+ return Point(P.x - v.x, P.y - v.y, P.z - v.z);
+ }
+
+ Vector operator-(const Point &P, const Point &Q) // Returns the vector from Q to P.
+ {
+ return Vector(P.x - Q.x, P.y - Q.y, P.z - Q.z);
+ }
+
+ Point operator*(const Matrix &M, const Point &P) // Column-vector transform; adds column 4 of M (rows 1-3) as translation.
+ {
+ return Point(M(1, 1) * P.x + M(1, 2) * P.y + M(1, 3) * P.z + M(1, 4),
+ M(2, 1) * P.x + M(2, 2) * P.y + M(2, 3) * P.z + M(2, 4),
+ M(3, 1) * P.x + M(3, 2) * P.y + M(3, 3) * P.z + M(3, 4));
+ }
+
+ Point operator*(const Point &P, const Matrix &M) // Row-vector transform using only the upper-left 3x3 of M.
+ { // NOTE(review): row 4 of M is not added here, unlike column 4 in M * P above - confirm this is intended.
+ return Point(P.x * M(1, 1) + P.y * M(2, 1) + P.z * M(3, 1),
+ P.x * M(1, 2) + P.y * M(2, 2) + P.z * M(3, 2),
+ P.x * M(1, 3) + P.y * M(2, 3) + P.z * M(3, 3));
+ }
+
+ Point &operator*=(Point &P, const Matrix &M) // In-place form of P * M.
+ {
+ return P = P * M;
+ }
+
+ float Point::d(const Point &P) const // Vector::N of (*this - P); declared in Point.hpp as the distance.
+ {
+ return Vector::N(*this - P);
+ }
+
+ float Point::d2(const Point &P) const // Vector::N2 of (*this - P); declared in Point.hpp as the squared distance.
+ {
+ return Vector::N2(*this - P);
+ }
+
+ float Point::d(const Point &P, const Point &Q) // Static form: Vector::N of (P - Q).
+ {
+ return Vector::N(P - Q);
+ }
+
+ float Point::d2(const Point &P, const Point &Q) // Static form: Vector::N2 of (P - Q).
+ {
+ return Vector::N2(P - Q);
+ }
+}
diff --git a/src/Device/Point.hpp b/src/Device/Point.hpp
new file mode 100644
index 0000000..85198c5
--- /dev/null
+++ b/src/Device/Point.hpp
@@ -0,0 +1,139 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Point_hpp
+#define Point_hpp
+
+namespace sw
+{
+ struct Vector;
+ struct Matrix;
+
+ struct Point // Three float coordinates, accessible as x/y/z or as p[0..2].
+ {
+ Point(); // Leaves the coordinates uninitialized.
+ Point(const int i); // Sets all three coordinates to (float)i; not explicit, so it also converts implicitly from int.
+ Point(const Point &P);
+ Point(const Vector &v); // Copies v.x, v.y, v.z.
+ Point(float Px, float Py, float Pz);
+
+ Point &operator=(const Point &P);
+
+ union // Array view and named view of the same three floats.
+ {
+ float p[3];
+
+ struct
+ {
+ float x;
+ float y;
+ float z;
+ };
+ };
+ // Both operator[] and operator() index p[] directly, zero-based and unchecked.
+ float &operator[](int i);
+ float &operator()(int i);
+
+ const float &operator[](int i) const;
+ const float &operator()(int i) const;
+
+ Point &operator+=(const Vector &v);
+ Point &operator-=(const Vector &v);
+
+ friend Point operator+(const Point &P, const Vector &v);
+ friend Point operator-(const Point &P, const Vector &v);
+
+ friend Vector operator-(const Point &P, const Point &Q);
+
+ friend Point operator*(const Matrix &M, const Point& P);
+ friend Point operator*(const Point &P, const Matrix &M);
+ friend Point &operator*=(Point &P, const Matrix &M);
+
+ float d(const Point &P) const; // Distance between two points
+ float d2(const Point &P) const; // Squared distance between two points
+
+ static float d(const Point &P, const Point &Q); // Distance between two points
+ static float d2(const Point &P, const Point &Q); // Squared distance between two points
+ };
+}
+
+#include "Vector.hpp"
+
+namespace sw
+{
+ inline Point::Point() // Intentionally empty: x, y and z are left uninitialized.
+ {
+ }
+
+ inline Point::Point(const int i) // Sets every coordinate to the same value, (float)i.
+ {
+ const float s = (float)i;
+
+ x = s;
+ y = s;
+ z = s;
+ }
+
+ inline Point::Point(const Point &P) // Member-wise copy.
+ {
+ x = P.x;
+ y = P.y;
+ z = P.z;
+ }
+
+ inline Point::Point(const Vector &v) // Takes the coordinates of v.
+ {
+ x = v.x;
+ y = v.y;
+ z = v.z;
+ }
+
+ inline Point::Point(float P_x, float P_y, float P_z) // Builds the point from its three coordinates.
+ {
+ x = P_x;
+ y = P_y;
+ z = P_z;
+ }
+
+ inline Point &Point::operator=(const Point &P) // Member-wise assignment.
+ {
+ x = P.x;
+ y = P.y;
+ z = P.z;
+
+ return *this;
+ }
+
+ inline float &Point::operator()(int i) // Zero-based, unchecked access to p[i].
+ {
+ return p[i];
+ }
+
+ inline float &Point::operator[](int i) // Zero-based, unchecked access to p[i].
+ {
+ return p[i];
+ }
+
+ inline const float &Point::operator()(int i) const // Read-only, zero-based, unchecked access to p[i].
+ {
+ return p[i];
+ }
+
+ inline const float &Point::operator[](int i) const // Read-only, zero-based, unchecked access to p[i].
+ {
+ return p[i];
+ }
+}
+
+#endif // Point_hpp
diff --git a/src/Device/Polygon.hpp b/src/Device/Polygon.hpp
new file mode 100644
index 0000000..8ee8562
--- /dev/null
+++ b/src/Device/Polygon.hpp
@@ -0,0 +1,56 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Polygon_hpp
+#define sw_Polygon_hpp
+
+#include "Vertex.hpp"
+
+namespace sw
+{
+ struct Polygon // Vertex pointer lists for a polygon, with room for vertices created by clipping.
+ {
+ Polygon(const float4 *P0, const float4 *P1, const float4 *P2) // Starts as a triangle; stores the pointers, does not copy the vertices.
+ {
+ P[0][0] = P0;
+ P[0][1] = P1;
+ P[0][2] = P2;
+
+ n = 3;
+ i = 0;
+ b = 0;
+ }
+
+ Polygon(const float4 *P, int n) // Starts from n consecutive vertices at P; n is not checked against the capacity of 16.
+ {
+ for(int i = 0; i < n; i++) // Local 'i' and parameters 'P'/'n' hide the members, hence the this-> qualifiers.
+ {
+ this->P[0][i] = &P[i];
+ }
+
+ this->n = n;
+ this->i = 0;
+ this->b = 0;
+ }
+
+ float4 B[16]; // Buffer for clipped vertices
+ const float4 *P[16][16]; // Pointers to clipped polygon's vertices
+
+ int n; // Number of vertices
+ int i; // Level of P to use
+ int b; // Next available new vertex
+ };
+}
+
+#endif // sw_Polygon_hpp
diff --git a/src/Device/Primitive.hpp b/src/Device/Primitive.hpp
new file mode 100644
index 0000000..52daa18
--- /dev/null
+++ b/src/Device/Primitive.hpp
@@ -0,0 +1,80 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Primitive_hpp
+#define sw_Primitive_hpp
+
+#include "Vertex.hpp"
+#include "Main/Config.hpp"
+
+namespace sw
+{
+ struct Triangle // Three vertices.
+ {
+ Vertex v0;
+ Vertex v1;
+ Vertex v2;
+ };
+
+ struct PlaneEquation // z = A * x + B * y + C
+ {
+ float4 A;
+ float4 B;
+ float4 C;
+ };
+
+ struct Primitive // Per-primitive setup data read by the generated rasterizer code through OFFSET(Primitive, ...).
+ {
+ int yMin; // The rasterizer processes rows while y < yMax, starting at or near yMin.
+ int yMax;
+
+ float4 xQuad; // Added to the replicated x coordinate before interpolation.
+ float4 yQuad; // Added to the replicated y coordinate before interpolation.
+
+ PlaneEquation z;
+ PlaneEquation w;
+
+ union // Named view (C, T, f) and indexed view (V) of the same plane equations.
+ {
+ struct
+ {
+ PlaneEquation C[2][4];
+ PlaneEquation T[8][4];
+ PlaneEquation f;
+ };
+
+ PlaneEquation V[MAX_FRAGMENT_INPUTS][4];
+ };
+
+ float area;
+
+ // Masks for two-sided stencil
+ int64_t clockwiseMask;
+ int64_t invClockwiseMask;
+
+ struct Span // Horizontal extent of one row; covered pixels satisfy left <= x < right.
+ {
+ unsigned short left;
+ unsigned short right;
+ };
+
+ // The rasterizer adds a zero length span to the top and bottom of the polygon to allow
+ // for 2x2 pixel processing. We need an even number of spans to keep accesses aligned.
+ Span outlineUnderflow[2];
+ Span outline[OUTLINE_RESOLUTION];
+ Span outlineOverflow[2];
+ };
+}
+
+#endif // sw_Primitive_hpp
diff --git a/src/Device/QuadRasterizer.cpp b/src/Device/QuadRasterizer.cpp
new file mode 100644
index 0000000..6b319b4
--- /dev/null
+++ b/src/Device/QuadRasterizer.cpp
@@ -0,0 +1,350 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "QuadRasterizer.hpp"
+
+#include "Primitive.hpp"
+#include "Renderer.hpp"
+#include "Shader/Constants.hpp"
+#include "Common/Math.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+ extern bool veryEarlyDepthTest; // Enables the depth pre-pass in rasterize() that skips leading quads failing the depth test.
+ extern bool complementaryDepthBuffer; // Selects which comparison that pre-pass uses (CmpLE vs CmpNLT).
+ extern bool fullPixelPositionRegister; // With a shader that declares vPos, forces z and w interpolation (see interpolateZ/W).
+
+ extern int clusterCount; // NOTE(review): generate() and rasterize() declare locals of the same name, which hide this one there.
+
+ QuadRasterizer::QuadRasterizer(const PixelProcessor::State &state, const PixelShader *pixelShader) : state(state), shader(pixelShader) // 'state' is bound as a reference member, so the caller's State must outlive this object.
+ {
+ }
+
+ QuadRasterizer::~QuadRasterizer() // Nothing to release: the state reference and shader pointer are not owned.
+ {
+ }
+
+ void QuadRasterizer::generate() // Emits the routine body: for each of 'count' primitives, rasterize the row pairs belonging to this cluster.
+ { // 'data', 'primitive', 'count' and 'cluster' are not declared in this class; presumably inherited from Rasterizer - confirm.
+ #if PERF_PROFILE
+ for(int i = 0; i < PERF_TIMERS; i++)
+ {
+ cycles[i] = 0;
+ }
+
+ Long pixelTime = Ticks();
+ #endif
+
+ constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
+ occlusion = 0;
+ int clusterCount = Renderer::getClusterCount(); // Plain C++ value read while generating; hides the namespace-scope extern.
+
+ Do
+ {
+ Int yMin = *Pointer<Int>(primitive + OFFSET(Primitive,yMin));
+ Int yMax = *Pointer<Int>(primitive + OFFSET(Primitive,yMax));
+ // Snap yMin to a row equal to 2 * cluster modulo 2 * clusterCount; the mask assumes clusterCount is a power of two.
+ Int cluster2 = cluster + cluster;
+ yMin += clusterCount * 2 - 2 - cluster2;
+ yMin &= -clusterCount * 2;
+ yMin += cluster2;
+
+ If(yMin < yMax)
+ {
+ rasterize(yMin, yMax);
+ }
+
+ primitive += sizeof(Primitive) * state.multiSample; // Each primitive occupies multiSample consecutive Primitive records.
+ count--;
+ }
+ Until(count == 0)
+
+ if(state.occlusionEnabled)
+ { // Add this invocation's occlusion count to the cluster's 4-byte slot in DrawData::occlusion.
+ UInt clusterOcclusion = *Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster);
+ clusterOcclusion += occlusion;
+ *Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster) = clusterOcclusion;
+ }
+
+ #if PERF_PROFILE
+ cycles[PERF_PIXEL] = Ticks() - pixelTime;
+
+ for(int i = 0; i < PERF_TIMERS; i++)
+ {
+ *Pointer<Long>(data + OFFSET(DrawData,cycles[i]) + 8 * cluster) += cycles[i];
+ }
+ #endif
+
+ Return();
+ }
+
+ void QuadRasterizer::rasterize(Int &yMin, Int &yMax) // Emits the loop over row pairs [y, y + 1] of one primitive, calling quad() for every 2x2 block in the span.
+ { // Lower-case if/for run while generating code; capitalized If/For/Do presumably emit run-time control flow (Reactor) - confirm.
+ Pointer<Byte> cBuffer[RENDERTARGETS];
+ Pointer<Byte> zBuffer;
+ Pointer<Byte> sBuffer;
+
+ for(int index = 0; index < RENDERTARGETS; index++) // Row pointers start at row yMin of each buffer that is in use.
+ {
+ if(state.colorWriteActive(index))
+ {
+ cBuffer[index] = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,colorBuffer[index])) + yMin * *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+ }
+ }
+
+ if(state.depthTestActive)
+ {
+ zBuffer = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,depthBuffer)) + yMin * *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+ }
+
+ if(state.stencilActive)
+ {
+ sBuffer = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,stencilBuffer)) + yMin * *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB));
+ }
+
+ Int y = yMin;
+
+ Do
+ { // x0: smallest left edge over rows y and y + 1 and over all samples. NOTE(review): Span fields are unsigned short but are read as Short.
+ Int x0a = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));
+ Int x0b = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));
+ Int x0 = Min(x0a, x0b);
+
+ for(unsigned int q = 1; q < state.multiSample; q++)
+ {
+ x0a = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));
+ x0b = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));
+ x0 = Min(x0, Min(x0a, x0b));
+ }
+
+ x0 &= 0xFFFFFFFE; // Clear bit 0 so the span starts on an even x (quad boundary).
+ // x1: largest right edge over rows y and y + 1 and over all samples.
+ Int x1a = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
+ Int x1b = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
+ Int x1 = Max(x1a, x1b);
+
+ for(unsigned int q = 1; q < state.multiSample; q++)
+ {
+ x1a = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
+ x1b = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
+ x1 = Max(x1, Max(x1a, x1b));
+ }
+
+ Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
+
+ if(interpolateZ())
+ {
+ for(unsigned int q = 0; q < state.multiSample; q++) // Per-sample row start of z: C + y * B.
+ {
+ Float4 y = yyyy; // Hides the outer Int y inside this loop body.
+
+ if(state.multiSample > 1)
+ {
+ y -= *Pointer<Float4>(constants + OFFSET(Constants,Y) + q * sizeof(float4)); // Per-sample y offset from the constants table.
+ }
+
+ Dz[q] = *Pointer<Float4>(primitive + OFFSET(Primitive,z.C), 16) + y * *Pointer<Float4>(primitive + OFFSET(Primitive,z.B), 16);
+ }
+ }
+
+ if(veryEarlyDepthTest && state.multiSample == 1 && !state.depthOverride) // Pre-pass: advance x0 past leading quads that fail the depth test entirely.
+ {
+ if(!state.stencilActive && state.depthTestActive && (state.depthCompareMode == DEPTH_LESSEQUAL || state.depthCompareMode == DEPTH_LESS)) // FIXME: Both modes ok?
+ {
+ Float4 xxxx = Float4(Float(x0)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
+
+ Pointer<Byte> buffer;
+ Int pitch; // Only assigned and used for the linear (non-quad) depth layout.
+
+ if(!state.quadLayoutDepthBuffer)
+ {
+ buffer = zBuffer + 4 * x0;
+ pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+ }
+ else
+ {
+ buffer = zBuffer + 8 * x0;
+ }
+
+ For(Int x = x0, x < x1, x += 2)
+ {
+ Float4 z = interpolate(xxxx, Dz[0], z, primitive + OFFSET(Primitive,z), false, false, state.depthClamp); // NOTE(review): 'z' is passed as rhw in its own initializer; interpolate() ignores rhw when perspective is false.
+
+ Float4 zValue;
+
+ if(!state.quadLayoutDepthBuffer)
+ {
+ // FIXME: Properly optimizes?
+ zValue.xy = *Pointer<Float4>(buffer);
+ zValue.zw = *Pointer<Float4>(buffer + pitch - 8); // The zw half of a Float4 at (buffer + pitch - 8) starts at buffer + pitch, i.e. the next row.
+ }
+ else
+ {
+ zValue = *Pointer<Float4>(buffer, 16);
+ }
+
+ Int4 zTest;
+
+ if(complementaryDepthBuffer)
+ {
+ zTest = CmpLE(zValue, z);
+ }
+ else
+ {
+ zTest = CmpNLT(zValue, z);
+ }
+
+ Int zMask = SignMask(zTest);
+
+ If(zMask == 0) // No pixel of this quad passes: skip it by moving the span start.
+ {
+ x0 += 2;
+ }
+ Else // First quad with a passing pixel: end the pre-pass.
+ {
+ x = x1;
+ }
+
+ xxxx += Float4(2);
+
+ if(!state.quadLayoutDepthBuffer)
+ {
+ buffer += 8;
+ }
+ else
+ {
+ buffer += 16;
+ }
+ }
+ }
+ }
+
+ If(x0 < x1)
+ {
+ if(interpolateW())
+ {
+ Dw = *Pointer<Float4>(primitive + OFFSET(Primitive,w.C), 16) + yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive,w.B), 16);
+ }
+
+ for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++) // Row start of every active input component: C, plus y * B unless flat.
+ {
+ for(int component = 0; component < 4; component++)
+ {
+ if(state.interpolant[interpolant].component & (1 << component))
+ {
+ Dv[interpolant][component] = *Pointer<Float4>(primitive + OFFSET(Primitive,V[interpolant][component].C), 16);
+
+ if(!(state.interpolant[interpolant].flat & (1 << component)))
+ {
+ Dv[interpolant][component] += yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive,V[interpolant][component].B), 16);
+ }
+ }
+ }
+ }
+
+ if(state.fog.component)
+ {
+ Df = *Pointer<Float4>(primitive + OFFSET(Primitive,f.C), 16);
+
+ if(!state.fog.flat)
+ {
+ Df += yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive,f.B), 16);
+ }
+ }
+
+ Short4 xLeft[4];
+ Short4 xRight[4];
+
+ for(unsigned int q = 0; q < state.multiSample; q++)
+ { // One Short4 load covers the spans of rows y and y + 1: (left, right, left, right).
+ xLeft[q] = *Pointer<Short4>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline) + y * sizeof(Primitive::Span));
+ xRight[q] = xLeft[q];
+
+ xLeft[q] = Swizzle(xLeft[q], 0xA0) - Short4(1, 2, 1, 2); // Presumably replicates both left edges, biased per quad pixel - confirm the Swizzle encoding.
+ xRight[q] = Swizzle(xRight[q], 0xF5) - Short4(0, 1, 0, 1); // Presumably replicates both right edges, biased per quad pixel - confirm the Swizzle encoding.
+ }
+
+ For(Int x = x0, x < x1, x += 2)
+ {
+ Short4 xxxx = Short4(x);
+ Int cMask[4];
+
+ for(unsigned int q = 0; q < state.multiSample; q++) // 4-bit coverage mask of the 2x2 quad for sample q.
+ {
+ Short4 mask = CmpGT(xxxx, xLeft[q]) & CmpGT(xRight[q], xxxx);
+ cMask[q] = SignMask(PackSigned(mask, mask)) & 0x0000000F;
+ }
+
+ quad(cBuffer, zBuffer, sBuffer, cMask, x, y);
+ }
+ }
+
+ int clusterCount = Renderer::getClusterCount(); // Local value; hides the namespace-scope extern of the same name.
+ // Advance every active buffer pointer by 2 * clusterCount rows (pitch << (1 + log2(clusterCount))).
+ for(int index = 0; index < RENDERTARGETS; index++)
+ {
+ if(state.colorWriteActive(index))
+ {
+ cBuffer[index] += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])) << (1 + sw::log2(clusterCount)); // FIXME: Precompute
+ }
+ }
+
+ if(state.depthTestActive)
+ {
+ zBuffer += *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)) << (1 + sw::log2(clusterCount)); // FIXME: Precompute
+ }
+
+ if(state.stencilActive)
+ {
+ sBuffer += *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB)) << (1 + sw::log2(clusterCount)); // FIXME: Precompute
+ }
+
+ y += 2 * clusterCount; // Next row pair for this cluster.
+ }
+ Until(y >= yMax)
+ }
+
+ Float4 QuadRasterizer::interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective, bool clamp) // Evaluates D + x * A, optionally multiplied by rhw and clamped to [0, 1].
+ {
+ Float4 interpolant = D; // D is the row start value (constant term plus y contribution) computed by the caller.
+
+ if(!flat)
+ {
+ interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation, A), 16); // Add the x gradient from the plane equation.
+
+ if(perspective)
+ {
+ interpolant *= rhw; // rhw is only read on this path.
+ }
+ }
+
+ if(clamp)
+ {
+ interpolant = Min(Max(interpolant, Float4(0.0f)), Float4(1.0f));
+ }
+
+ return interpolant;
+ }
+
+ bool QuadRasterizer::interpolateZ() const // True when z is needed: depth test, pixel fog, or a shader declaring vPos with fullPixelPositionRegister set.
+ {
+ return state.depthTestActive || state.pixelFogActive() || (shader && shader->isVPosDeclared() && fullPixelPositionRegister);
+ }
+
+ bool QuadRasterizer::interpolateW() const // True when w is needed: perspective correction, or a shader declaring vPos with fullPixelPositionRegister set.
+ {
+ return state.perspective || (shader && shader->isVPosDeclared() && fullPixelPositionRegister);
+ }
+}
diff --git a/src/Device/QuadRasterizer.hpp b/src/Device/QuadRasterizer.hpp
new file mode 100644
index 0000000..1d7681d
--- /dev/null
+++ b/src/Device/QuadRasterizer.hpp
@@ -0,0 +1,61 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_QuadRasterizer_hpp
+#define sw_QuadRasterizer_hpp
+
+#include "Rasterizer.hpp"
+#include "Shader/ShaderCore.hpp"
+#include "Shader/PixelShader.hpp"
+#include "Common/Types.hpp"
+
+namespace sw
+{
+ class QuadRasterizer : public Rasterizer // Rasterizer whose per-quad work is supplied by a subclass through quad()
+ {
+ public:
+ QuadRasterizer(const PixelProcessor::State &state, const PixelShader *shader);
+ virtual ~QuadRasterizer();
+
+ void generate();
+
+ protected:
+ Pointer<Byte> constants;
+
+ Float4 Dz[4];
+ Float4 Dw;
+ Float4 Dv[MAX_FRAGMENT_INPUTS][4]; // One group of four Float4 values per fragment input
+ Float4 Df;
+
+ UInt occlusion;
+
+#if PERF_PROFILE
+ Long cycles[PERF_TIMERS]; // Only present in PERF_PROFILE builds
+#endif
+
+ virtual void quad(Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y) = 0; // Called by rasterize() once per step of 2 in x, with one coverage mask per multisample
+
+ bool interpolateZ() const; // Depends on depth test, pixel fog and vPos usage
+ bool interpolateW() const; // Depends on perspective and vPos usage
+ Float4 interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective, bool clamp); // D (+ x * A unless flat) (* rhw if perspective), optionally clamped to [0, 1]
+
+ const PixelProcessor::State &state; // Reference member: the referenced State has to outlive this object
+ const PixelShader *const shader; // May be null; interpolateZ()/interpolateW() check before dereferencing
+
+ private:
+ void rasterize(Int &yMin, Int &yMax);
+ };
+}
+
+#endif // sw_QuadRasterizer_hpp
diff --git a/src/Device/Rasterizer.hpp b/src/Device/Rasterizer.hpp
new file mode 100644
index 0000000..3811a25
--- /dev/null
+++ b/src/Device/Rasterizer.hpp
@@ -0,0 +1,38 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Rasterizer_hpp
+#define sw_Rasterizer_hpp
+
+#include "Context.hpp"
+#include "PixelProcessor.hpp"
+#include "Main/Config.hpp"
+
+namespace sw
+{
+ class Rasterizer : public Function<Void(Pointer<Byte>, Int, Int, Pointer<Byte>)> // Base class for rasterizer routines taking (primitive, count, cluster, data)
+ {
+ public:
+ Rasterizer() : primitive(Arg<0>()), count(Arg<1>()), cluster(Arg<2>()), data(Arg<3>()) {} // Bind the four routine arguments to named members
+ virtual ~Rasterizer() {} // Virtual because QuadRasterizer derives from this class
+
+ protected:
+ Pointer<Byte> primitive; // Routine argument 0
+ Int count; // Routine argument 1
+ Int cluster; // Routine argument 2
+ Pointer<Byte> data; // Routine argument 3
+ };
+}
+
+#endif // sw_Rasterizer_hpp
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
new file mode 100644
index 0000000..e7ec20a
--- /dev/null
+++ b/src/Device/Renderer.cpp
@@ -0,0 +1,2854 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Renderer.hpp"
+
+#include "Clipper.hpp"
+#include "Surface.hpp"
+#include "Primitive.hpp"
+#include "Polygon.hpp"
+#include "Main/FrameBuffer.hpp"
+#include "Main/SwiftConfig.hpp"
+#include "Reactor/Reactor.hpp"
+#include "Shader/Constants.hpp"
+#include "Common/MutexLock.hpp"
+#include "Common/CPUID.hpp"
+#include "Common/Memory.hpp"
+#include "Common/Resource.hpp"
+#include "Common/Half.hpp"
+#include "Common/Math.hpp"
+#include "Common/Timer.hpp"
+#include "Common/Debug.hpp"
+
+#undef max
+
+bool disableServer = true; // Passed to the SwiftConfig constructor in Renderer::Renderer()
+
+#ifndef NDEBUG
+unsigned int minPrimitives = 1; // Debug builds only: Renderer::draw() returns early when count is below this
+unsigned int maxPrimitives = 1 << 21; // Debug builds only: Renderer::draw() returns early when count is above this
+#endif
+
+namespace sw
+{
+ extern bool halfIntegerCoordinates; // Pixel centers are not at integer coordinates
+ extern bool symmetricNormalizedDepth; // [-1, 1] instead of [0, 1]
+ extern bool booleanFaceRegister;
+ extern bool fullPixelPositionRegister; // Read by QuadRasterizer::interpolateZ()/interpolateW() when the shader declares vPos
+ extern bool leadingVertexFirst; // Flat shading uses first vertex, else last
+ extern bool secondaryColor; // Specular lighting is applied after texturing
+ extern bool colorsDefaultToZero;
+
+ extern bool forceWindowed;
+ extern bool complementaryDepthBuffer; // When set, Renderer::draw() negates the depth range and uses 1 - near
+ extern bool postBlendSRGB;
+ extern bool exactColorRounding; // Assigned by setGlobalRenderingSettings()
+ extern TransparencyAntialiasing transparencyAntialiasing;
+ extern bool forceClearRegisters;
+
+ extern bool precacheVertex;
+ extern bool precacheSetup;
+ extern bool precachePixel;
+
+ static const int batchSize = 128; // Renderer::draw() divides this by the multisample count to get the primitives per batch
+ AtomicInt threadCount(1); // Compared against threadsAwake by the scheduler
+ AtomicInt Renderer::unitCount(1); // Number of primitiveProgress units scanned by findAvailableTasks()
+ AtomicInt Renderer::clusterCount(1); // Number of pixelProgress clusters scanned by findAvailableTasks()
+
+ TranscendentalPrecision logPrecision = ACCURATE; // threadFunction() enables flush-to-zero and denormals-are-zero when this is below IEEE
+ TranscendentalPrecision expPrecision = ACCURATE;
+ TranscendentalPrecision rcpPrecision = ACCURATE;
+ TranscendentalPrecision rsqPrecision = ACCURATE;
+ bool perspectiveCorrection = true;
+
+ static void setGlobalRenderingSettings(Conventions conventions, bool exactColorRounding) // Copies the conventions into the sw:: globals; only the first call has any effect
+ {
+ static bool alreadyLatched = false;
+ if(alreadyLatched)
+ {
+ return; // The settings from the first call stay in force
+ }
+ sw::halfIntegerCoordinates = conventions.halfIntegerCoordinates;
+ sw::symmetricNormalizedDepth = conventions.symmetricNormalizedDepth;
+ sw::booleanFaceRegister = conventions.booleanFaceRegister;
+ sw::fullPixelPositionRegister = conventions.fullPixelPositionRegister;
+ sw::leadingVertexFirst = conventions.leadingVertexFirst;
+ sw::secondaryColor = conventions.secondaryColor;
+ sw::colorsDefaultToZero = conventions.colorsDefaultToZero;
+ sw::exactColorRounding = exactColorRounding;
+ alreadyLatched = true; // Set last, after every global has been written
+ }
+
+ struct Parameters // Argument bundle read by Renderer::threadFunction() through its void* parameter
+ {
+ Renderer *renderer; // Renderer whose threadLoop() the thread runs
+ int threadIndex; // Index passed to threadLoop()
+ };
+
+ DrawCall::DrawCall() // Creates a free draw call slot with its own DrawData allocation
+ {
+ queries = 0; // No query list until Renderer::draw() attaches one
+
+ vsDirtyConstF = VERTEX_UNIFORM_VECTORS + 1; // Non-zero dirty counts make the first draw() with a vertex shader copy that many constants
+ vsDirtyConstI = 16;
+ vsDirtyConstB = 16;
+
+ psDirtyConstF = FRAGMENT_UNIFORM_VECTORS; // Same for pixel shader constants
+ psDirtyConstI = 16;
+ psDirtyConstB = 16;
+
+ references = -1; // -1 marks the slot as free; Renderer::draw() searches for this value
+
+ data = (DrawData*)allocate(sizeof(DrawData)); // Released with deallocate() in the destructor
+ data->constants = &constants;
+ }
+
+ DrawCall::~DrawCall() // Frees the query list (if any) and the DrawData allocation
+ {
+ delete queries; // Null unless a draw with queries is still attached
+
+ deallocate(data); // Pairs with allocate() in the constructor
+ }
+
+ Renderer::Renderer(Context *context, Conventions conventions, bool exactColorRounding) : VertexProcessor(context), PixelProcessor(context), SetupProcessor(context), context(context), viewport() // Sets up the clipper, blitter, draw call pool and scheduler state
+ {
+ setGlobalRenderingSettings(conventions, exactColorRounding); // Only the first call process-wide takes effect (guarded by a static flag)
+
+ setRenderTarget(0, 0);
+ clipper = new Clipper(symmetricNormalizedDepth); // Reads the sw:: global rather than conventions directly
+ blitter = new Blitter;
+
+ updateViewMatrix = true;
+ updateBaseMatrix = true;
+ updateProjectionMatrix = true;
+ updateClipPlanes = true;
+
+ #if PERF_HUD
+ resetTimers();
+ #endif
+
+ for(int i = 0; i < 16; i++) // Clear all 16 per-thread slots
+ {
+ vertexTask[i] = 0;
+
+ worker[i] = 0;
+ resume[i] = 0;
+ suspend[i] = 0;
+ }
+
+ threadsAwake = 0;
+ resumeApp = new Event(); // Signalled by finishRendering() when a draw call slot is released
+
+ currentDraw = 0;
+ nextDraw = 0;
+
+ qHead = 0; // Empty task queue
+ qSize = 0;
+
+ for(int i = 0; i < 16; i++)
+ {
+ triangleBatch[i] = 0;
+ primitiveBatch[i] = 0;
+ }
+
+ for(int draw = 0; draw < DRAW_COUNT; draw++) // Pool of draw calls; each one starts out free (references == -1)
+ {
+ drawCall[draw] = new DrawCall();
+ drawList[draw] = drawCall[draw];
+ }
+
+ for(int unit = 0; unit < 16; unit++)
+ {
+ primitiveProgress[unit].init();
+ }
+
+ for(int cluster = 0; cluster < 16; cluster++)
+ {
+ pixelProgress[cluster].init();
+ }
+
+ clipFlags = 0;
+
+ swiftConfig = new SwiftConfig(disableServer);
+ updateConfiguration(true);
+
+ sync = new Resource(0); // Locked PRIVATE for every queued draw and PUBLIC by synchronize()
+ }
+
+ Renderer::~Renderer() // Releases everything the constructor created and stops the worker threads
+ {
+ sync->destruct(); // Resource is released through destruct() rather than delete
+
+ delete clipper;
+ clipper = nullptr;
+
+ delete blitter;
+ blitter = nullptr;
+
+ terminateThreads(); // NOTE(review): runs after clipper/blitter are deleted - confirm no worker can still use them here
+ delete resumeApp;
+
+ for(int draw = 0; draw < DRAW_COUNT; draw++) // Destroy the pool allocated in the constructor
+ {
+ delete drawCall[draw];
+ }
+
+ delete swiftConfig;
+ }
+
+ // This object has to be mem aligned
+ void* Renderer::operator new(size_t size) // Allocates storage for exactly one Renderer through sw::allocate with an alignment argument of 16
+ {
+ ASSERT(size == sizeof(Renderer)); // This operator can't be called from a derived class
+ return sw::allocate(sizeof(Renderer), 16); // Freed by the matching operator delete
+ }
+
+ void Renderer::operator delete(void * mem) // Counterpart of operator new: returns the storage through sw::deallocate
+ {
+ sw::deallocate(mem);
+ }
+
+ void Renderer::draw(DrawType drawType, unsigned int indexOffset, unsigned int count, bool update) // Queues one draw call per supersample pass, snapshotting all state into a free DrawCall, then wakes the scheduler
+ {
+ #ifndef NDEBUG
+ if(count < minPrimitives || count > maxPrimitives) // Debug-only filter on the primitive count
+ {
+ return;
+ }
+ #endif
+
+ context->drawType = drawType;
+
+ updateConfiguration();
+ updateClipper();
+
+ int ss = context->getSuperSampleCount();
+ int ms = context->getMultiSampleCount();
+ bool requiresSync = false; // Set when any bound sampler or surface reports requiresSync()
+
+ for(int q = 0; q < ss; q++) // One queued draw call per supersample pass
+ {
+ unsigned int oldMultiSampleMask = context->multiSampleMask;
+ context->multiSampleMask = (context->sampleMask >> (ms * q)) & ((unsigned)0xFFFFFFFF >> (32 - ms)); // Select the ms sample mask bits that belong to pass q
+
+ if(!context->multiSampleMask)
+ {
+ continue; // No samples enabled for this pass
+ }
+
+ sync->lock(sw::PRIVATE); // Released by finishRendering() once this draw call has completed
+
+ if(update || oldMultiSampleMask != context->multiSampleMask) // Rebuild states and routines only when requested or when the mask changed
+ {
+ vertexState = VertexProcessor::update(drawType);
+ setupState = SetupProcessor::update();
+ pixelState = PixelProcessor::update();
+
+ vertexRoutine = VertexProcessor::routine(vertexState);
+ setupRoutine = SetupProcessor::routine(setupState);
+ pixelRoutine = PixelProcessor::routine(pixelState);
+ }
+
+ int batch = batchSize / ms; // Primitives per batch; forced to 1 for wireframe and vertex fill modes below
+
+ int (Renderer::*setupPrimitives)(int batch, int count);
+
+ if(context->isDrawTriangle())
+ {
+ switch(context->fillMode)
+ {
+ case FILL_SOLID:
+ setupPrimitives = &Renderer::setupSolidTriangles;
+ break;
+ case FILL_WIREFRAME:
+ setupPrimitives = &Renderer::setupWireframeTriangle;
+ batch = 1;
+ break;
+ case FILL_VERTEX:
+ setupPrimitives = &Renderer::setupVertexTriangle;
+ batch = 1;
+ break;
+ default: ASSERT(false); // Unknown fill mode
+ sync->unlock(); // Nothing gets queued on this path, so finishRendering() would never release the lock taken above
+ return;
+ }
+ }
+ else if(context->isDrawLine())
+ {
+ setupPrimitives = &Renderer::setupLines;
+ }
+ else // Point draw
+ {
+ setupPrimitives = &Renderer::setupPoints;
+ }
+
+ DrawCall *draw = nullptr;
+
+ do // Find a free draw call slot, waiting for one to be released if necessary
+ {
+ for(int i = 0; i < DRAW_COUNT; i++)
+ {
+ if(drawCall[i]->references == -1) // -1 marks a free slot
+ {
+ draw = drawCall[i];
+ drawList[nextDraw & DRAW_COUNT_BITS] = draw;
+
+ break;
+ }
+ }
+
+ if(!draw)
+ {
+ resumeApp->wait(); // All slots busy: finishRendering() signals resumeApp when it frees one
+ }
+ }
+ while(!draw);
+
+ DrawData *data = draw->data;
+
+ if(queries.size() != 0)
+ {
+ draw->queries = new std::list<Query*>(); // Deleted by finishRendering()
+ bool includePrimitivesWrittenQueries = vertexState.transformFeedbackQueryEnabled && vertexState.transformFeedbackEnabled;
+ for(auto &query : queries)
+ {
+ if(includePrimitivesWrittenQueries || (query->type != Query::TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN))
+ {
+ ++query->reference; // Atomic
+ draw->queries->push_back(query);
+ }
+ }
+ }
+
+ draw->drawType = drawType;
+ draw->batchSize = batch;
+
+ vertexRoutine->bind(); // Matching unbind() calls are in finishRendering()
+ setupRoutine->bind();
+ pixelRoutine->bind();
+
+ draw->vertexRoutine = vertexRoutine;
+ draw->setupRoutine = setupRoutine;
+ draw->pixelRoutine = pixelRoutine;
+ draw->vertexPointer = (VertexProcessor::RoutinePointer)vertexRoutine->getEntry();
+ draw->setupPointer = (SetupProcessor::RoutinePointer)setupRoutine->getEntry();
+ draw->pixelPointer = (PixelProcessor::RoutinePointer)pixelRoutine->getEntry();
+ draw->setupPrimitives = setupPrimitives;
+ draw->setupState = setupState;
+
+ for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
+ {
+ draw->vertexStream[i] = context->input[i].resource;
+ data->input[i] = context->input[i].buffer;
+ data->stride[i] = context->input[i].stride;
+
+ if(draw->vertexStream[i])
+ {
+ draw->vertexStream[i]->lock(PUBLIC, PRIVATE); // Unlocked in finishRendering()
+ }
+ }
+
+ if(context->indexBuffer)
+ {
+ data->indices = (unsigned char*)context->indexBuffer->lock(PUBLIC, PRIVATE) + indexOffset; // indexOffset is applied in bytes
+ }
+
+ draw->indexBuffer = context->indexBuffer;
+
+ for(int sampler = 0; sampler < TOTAL_IMAGE_UNITS; sampler++)
+ {
+ draw->texture[sampler] = 0; // finishRendering() only unlocks non-null entries
+ }
+
+ for(int sampler = 0; sampler < TEXTURE_IMAGE_UNITS; sampler++)
+ {
+ if(pixelState.sampler[sampler].textureType != TEXTURE_NULL)
+ {
+ draw->texture[sampler] = context->texture[sampler];
+ draw->texture[sampler]->lock(PUBLIC, isReadWriteTexture(sampler) ? MANAGED : PRIVATE); // If the texture is both read and written, use the same read/write lock as render targets
+
+ data->mipmap[sampler] = context->sampler[sampler].getTextureData();
+
+ requiresSync |= context->sampler[sampler].requiresSync();
+ }
+ }
+
+ if(context->pixelShader)
+ {
+ if(draw->psDirtyConstF)
+ {
+ memcpy(&data->ps.cW, PixelProcessor::cW, sizeof(word4) * 4 * (draw->psDirtyConstF < 8 ? draw->psDirtyConstF : 8)); // The cW copy is capped at 8 constants
+ memcpy(&data->ps.c, PixelProcessor::c, sizeof(float4) * draw->psDirtyConstF);
+ draw->psDirtyConstF = 0;
+ }
+
+ if(draw->psDirtyConstI)
+ {
+ memcpy(&data->ps.i, PixelProcessor::i, sizeof(int4) * draw->psDirtyConstI);
+ draw->psDirtyConstI = 0;
+ }
+
+ if(draw->psDirtyConstB)
+ {
+ memcpy(&data->ps.b, PixelProcessor::b, sizeof(bool) * draw->psDirtyConstB);
+ draw->psDirtyConstB = 0;
+ }
+
+ PixelProcessor::lockUniformBuffers(data->ps.u, draw->pUniformBuffers);
+ }
+ else
+ {
+ for(int i = 0; i < MAX_UNIFORM_BUFFER_BINDINGS; i++)
+ {
+ draw->pUniformBuffers[i] = nullptr; // Nothing for finishRendering() to unlock
+ }
+ }
+
+ if(context->pixelShaderModel() <= 0x0104)
+ {
+ for(int stage = 0; stage < 8; stage++)
+ {
+ if(pixelState.textureStage[stage].stageOperation != TextureStage::STAGE_DISABLE || context->pixelShader)
+ {
+ data->textureStage[stage] = context->textureStage[stage].uniforms;
+ }
+ else break; // Stop at the first disabled stage when no pixel shader is bound
+ }
+ }
+
+ if(context->vertexShader)
+ {
+ if(context->vertexShader->getShaderModel() >= 0x0300)
+ {
+ for(int sampler = 0; sampler < VERTEX_TEXTURE_IMAGE_UNITS; sampler++)
+ {
+ if(vertexState.sampler[sampler].textureType != TEXTURE_NULL)
+ {
+ draw->texture[TEXTURE_IMAGE_UNITS + sampler] = context->texture[TEXTURE_IMAGE_UNITS + sampler]; // Vertex samplers are stored after the pixel samplers
+ draw->texture[TEXTURE_IMAGE_UNITS + sampler]->lock(PUBLIC, PRIVATE);
+
+ data->mipmap[TEXTURE_IMAGE_UNITS + sampler] = context->sampler[TEXTURE_IMAGE_UNITS + sampler].getTextureData();
+
+ requiresSync |= context->sampler[TEXTURE_IMAGE_UNITS + sampler].requiresSync();
+ }
+ }
+ }
+
+ if(draw->vsDirtyConstF)
+ {
+ memcpy(&data->vs.c, VertexProcessor::c, sizeof(float4) * draw->vsDirtyConstF);
+ draw->vsDirtyConstF = 0;
+ }
+
+ if(draw->vsDirtyConstI)
+ {
+ memcpy(&data->vs.i, VertexProcessor::i, sizeof(int4) * draw->vsDirtyConstI);
+ draw->vsDirtyConstI = 0;
+ }
+
+ if(draw->vsDirtyConstB)
+ {
+ memcpy(&data->vs.b, VertexProcessor::b, sizeof(bool) * draw->vsDirtyConstB);
+ draw->vsDirtyConstB = 0;
+ }
+
+ if(context->vertexShader->isInstanceIdDeclared())
+ {
+ data->instanceID = context->instanceID;
+ }
+
+ VertexProcessor::lockUniformBuffers(data->vs.u, draw->vUniformBuffers);
+ VertexProcessor::lockTransformFeedbackBuffers(data->vs.t, data->vs.reg, data->vs.row, data->vs.col, data->vs.str, draw->transformFeedbackBuffers);
+ }
+ else
+ {
+ data->ff = ff;
+
+ draw->vsDirtyConstF = VERTEX_UNIFORM_VECTORS + 1; // Same values as the DrawCall constructor: the next vertex shader draw copies the constants again
+ draw->vsDirtyConstI = 16;
+ draw->vsDirtyConstB = 16;
+
+ for(int i = 0; i < MAX_UNIFORM_BUFFER_BINDINGS; i++)
+ {
+ draw->vUniformBuffers[i] = nullptr;
+ }
+
+ for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
+ {
+ draw->transformFeedbackBuffers[i] = nullptr;
+ }
+ }
+
+ if(pixelState.stencilActive)
+ {
+ data->stencil[0] = stencil;
+ data->stencil[1] = stencilCCW;
+ }
+
+ if(pixelState.fogActive)
+ {
+ data->fog = fog;
+ }
+
+ if(setupState.isDrawPoint)
+ {
+ data->point = point;
+ }
+
+ data->lineWidth = context->lineWidth;
+
+ data->factor = factor;
+
+ if(pixelState.transparencyAntialiasing == TRANSPARENCY_ALPHA_TO_COVERAGE)
+ {
+ float ref = context->alphaReference * (1.0f / 255.0f); // Scale the alpha reference by 1/255
+ float margin = sw::min(ref, 1.0f - ref); // Distance from ref to the nearer end of [0, 1]
+
+ if(ms == 4)
+ {
+ data->a2c0 = replicate(ref - margin * 0.6f); // Four thresholds spread symmetrically around ref
+ data->a2c1 = replicate(ref - margin * 0.2f);
+ data->a2c2 = replicate(ref + margin * 0.2f);
+ data->a2c3 = replicate(ref + margin * 0.6f);
+ }
+ else if(ms == 2)
+ {
+ data->a2c0 = replicate(ref - margin * 0.3f); // Two thresholds spread symmetrically around ref
+ data->a2c1 = replicate(ref + margin * 0.3f);
+ }
+ else ASSERT(false); // Only 2 and 4 samples are handled for alpha-to-coverage
+ }
+
+ if(pixelState.occlusionEnabled)
+ {
+ for(int cluster = 0; cluster < clusterCount; cluster++)
+ {
+ data->occlusion[cluster] = 0; // Per-cluster counters, summed into the queries by finishRendering()
+ }
+ }
+
+ #if PERF_PROFILE
+ for(int cluster = 0; cluster < clusterCount; cluster++)
+ {
+ for(int i = 0; i < PERF_TIMERS; i++)
+ {
+ data->cycles[i][cluster] = 0;
+ }
+ }
+ #endif
+
+ // Viewport
+ {
+ float W = 0.5f * viewport.width; // Half extents
+ float H = 0.5f * viewport.height;
+ float X0 = viewport.x0 + W; // Viewport center
+ float Y0 = viewport.y0 + H;
+ float N = viewport.minZ;
+ float F = viewport.maxZ;
+ float Z = F - N;
+
+ if(context->isDrawTriangle(false))
+ {
+ N += context->depthBias;
+ }
+
+ if(complementaryDepthBuffer)
+ {
+ Z = -Z;
+ N = 1 - N;
+ }
+
+ static const float X[5][16] = // Fragment offsets
+ {
+ {+0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f}, // 1 sample
+ {-0.2500f, +0.2500f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f}, // 2 samples
+ {-0.3000f, +0.1000f, +0.3000f, -0.1000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f}, // 4 samples
+ {+0.1875f, -0.3125f, +0.3125f, -0.4375f, -0.0625f, +0.4375f, +0.0625f, -0.1875f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f}, // 8 samples
+ {+0.2553f, -0.1155f, +0.1661f, -0.1828f, +0.2293f, -0.4132f, -0.1773f, -0.0577f, +0.3891f, -0.4656f, +0.4103f, +0.4248f, -0.2109f, +0.3966f, -0.2664f, -0.3872f} // 16 samples
+ };
+
+ static const float Y[5][16] = // Fragment offsets
+ {
+ {+0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f}, // 1 sample
+ {-0.2500f, +0.2500f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f}, // 2 samples
+ {-0.1000f, -0.3000f, +0.1000f, +0.3000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f}, // 4 samples
+ {-0.4375f, -0.3125f, -0.1875f, -0.0625f, +0.0625f, +0.1875f, +0.3125f, +0.4375f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f}, // 8 samples
+ {-0.4503f, +0.1883f, +0.3684f, -0.4668f, -0.0690f, -0.1315f, +0.4999f, +0.0728f, +0.1070f, -0.3086f, +0.3725f, -0.1547f, -0.1102f, -0.3588f, +0.1789f, +0.0269f} // 16 samples
+ };
+
+ int s = sw::log2(ss); // Row of the offset tables for this supersample count
+
+ data->Wx16 = replicate(W * 16);
+ data->Hx16 = replicate(H * 16);
+ data->X0x16 = replicate(X0 * 16 - 8);
+ data->Y0x16 = replicate(Y0 * 16 - 8);
+ data->XXXX = replicate(X[s][q] / W); // Offset of pass q, divided by the half extent
+ data->YYYY = replicate(Y[s][q] / H);
+ data->halfPixelX = replicate(0.5f / W);
+ data->halfPixelY = replicate(0.5f / H);
+ data->viewportHeight = abs(viewport.height);
+ data->slopeDepthBias = context->slopeDepthBias;
+ data->depthRange = Z;
+ data->depthNear = N;
+ draw->clipFlags = clipFlags;
+
+ if(clipFlags)
+ {
+ if(clipFlags & Clipper::CLIP_PLANE0) data->clipPlane[0] = clipPlane[0]; // Only the planes enabled in clipFlags are copied
+ if(clipFlags & Clipper::CLIP_PLANE1) data->clipPlane[1] = clipPlane[1];
+ if(clipFlags & Clipper::CLIP_PLANE2) data->clipPlane[2] = clipPlane[2];
+ if(clipFlags & Clipper::CLIP_PLANE3) data->clipPlane[3] = clipPlane[3];
+ if(clipFlags & Clipper::CLIP_PLANE4) data->clipPlane[4] = clipPlane[4];
+ if(clipFlags & Clipper::CLIP_PLANE5) data->clipPlane[5] = clipPlane[5];
+ }
+ }
+
+ // Target
+ {
+ for(int index = 0; index < RENDERTARGETS; index++)
+ {
+ draw->renderTarget[index] = context->renderTarget[index];
+
+ if(draw->renderTarget[index])
+ {
+ unsigned int layer = context->renderTargetLayer[index];
+ requiresSync |= context->renderTarget[index]->requiresSync();
+ data->colorBuffer[index] = (unsigned int*)context->renderTarget[index]->lockInternal(0, 0, layer, LOCK_READWRITE, MANAGED); // Unlocked in finishRendering()
+ data->colorBuffer[index] += q * ms * context->renderTarget[index]->getSliceB(true); // Advance to the samples of pass q
+ data->colorPitchB[index] = context->renderTarget[index]->getInternalPitchB();
+ data->colorSliceB[index] = context->renderTarget[index]->getInternalSliceB();
+ }
+ }
+
+ draw->depthBuffer = context->depthBuffer;
+ draw->stencilBuffer = context->stencilBuffer;
+
+ if(draw->depthBuffer)
+ {
+ unsigned int layer = context->depthBufferLayer;
+ requiresSync |= context->depthBuffer->requiresSync();
+ data->depthBuffer = (float*)context->depthBuffer->lockInternal(0, 0, layer, LOCK_READWRITE, MANAGED); // Unlocked in finishRendering()
+ data->depthBuffer += q * ms * context->depthBuffer->getSliceB(true);
+ data->depthPitchB = context->depthBuffer->getInternalPitchB();
+ data->depthSliceB = context->depthBuffer->getInternalSliceB();
+ }
+
+ if(draw->stencilBuffer)
+ {
+ unsigned int layer = context->stencilBufferLayer;
+ requiresSync |= context->stencilBuffer->requiresSync();
+ data->stencilBuffer = (unsigned char*)context->stencilBuffer->lockStencil(0, 0, layer, MANAGED); // Unlocked in finishRendering()
+ data->stencilBuffer += q * ms * context->stencilBuffer->getSliceB(true);
+ data->stencilPitchB = context->stencilBuffer->getStencilPitchB();
+ data->stencilSliceB = context->stencilBuffer->getStencilSliceB();
+ }
+ }
+
+ // Scissor
+ {
+ data->scissorX0 = scissor.x0;
+ data->scissorX1 = scissor.x1;
+ data->scissorY0 = scissor.y0;
+ data->scissorY1 = scissor.y1;
+ }
+
+ draw->primitive = 0; // Next primitive to hand out; advanced by findAvailableTasks()
+ draw->count = count;
+
+ draw->references = (count + batch - 1) / batch; // Number of batches, rounded up
+
+ schedulerMutex.lock();
+ ++nextDraw; // Atomic
+ schedulerMutex.unlock();
+
+ #ifndef NDEBUG
+ if(threadCount == 1) // Use main thread for draw execution
+ {
+ threadsAwake = 1;
+ task[0].type = Task::RESUME;
+
+ taskLoop(0);
+ }
+ else
+ #endif
+ {
+ if(!threadsAwake) // Only thread 0 is woken here; scheduleTask() wakes further threads as work is queued
+ {
+ suspend[0]->wait();
+
+ threadsAwake = 1;
+ task[0].type = Task::RESUME;
+
+ resume[0]->signal();
+ }
+ }
+ }
+
+ // TODO(sugoi): This is a temporary brute-force workaround to ensure IOSurface synchronization.
+ if(requiresSync)
+ {
+ synchronize();
+ }
+ }
+
+ void Renderer::clear(void *value, Format format, Surface *dest, const Rect &clearRect, unsigned int rgbaMask) // Forwards the clear request unchanged to the blitter
+ {
+ blitter->clear(value, format, dest, clearRect, rgbaMask);
+ }
+
+ void Renderer::blit(Surface *source, const SliceRectF &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil, bool sRGBconversion) // Forwards to the blitter, packing the three flags into its options argument
+ {
+ blitter->blit(source, sRect, dest, dRect, {filter, isStencil, sRGBconversion}); // Brace-initializes the options from the flags, in this order
+ }
+
+ void Renderer::blit3D(Surface *source, Surface *dest) // Forwards the 3D blit unchanged to the blitter
+ {
+ blitter->blit3D(source, dest);
+ }
+
+ void Renderer::threadFunction(void *parameters) // Thread entry point: unpacks a Parameters struct and runs that renderer's threadLoop()
+ {
+ Renderer *renderer = static_cast<Parameters*>(parameters)->renderer; // Both fields are copied out before anything else runs
+ int threadIndex = static_cast<Parameters*>(parameters)->threadIndex;
+
+ if(logPrecision < IEEE)
+ {
+ CPUID::setFlushToZero(true); // Applied from inside the worker thread itself, once, before the loop starts
+ CPUID::setDenormalsAreZero(true);
+ }
+
+ renderer->threadLoop(threadIndex); // Returns once exitThreads is set
+ }
+
+ void Renderer::threadLoop(int threadIndex) // Worker body: run tasks until suspended, park, and repeat until exitThreads is set
+ {
+ while(!exitThreads)
+ {
+ taskLoop(threadIndex); // Returns when this thread's task type becomes SUSPEND
+
+ suspend[threadIndex]->signal(); // Announce that this thread has parked; wakers wait on this event first
+ resume[threadIndex]->wait(); // Sleep until draw() or scheduleTask() signals resume
+ }
+ }
+
+ void Renderer::taskLoop(int threadIndex) // Alternates scheduling and executing tasks for this thread until it is assigned SUSPEND
+ {
+ while(task[threadIndex].type != Task::SUSPEND) // Callers set the type to RESUME before entering
+ {
+ scheduleTask(threadIndex); // Picks the next task, or SUSPEND when the queue is empty
+ executeTask(threadIndex); // RESUME and SUSPEND tasks do nothing here
+ }
+ }
+
+ void Renderer::findAvailableTasks() // Appends runnable pixel and primitive tasks to the task queue; scheduleTask() calls it with schedulerMutex held
+ {
+ // Find pixel tasks
+ for(int cluster = 0; cluster < clusterCount; cluster++)
+ {
+ if(!pixelProgress[cluster].executing) // At most one pixel task in flight per cluster
+ {
+ for(int unit = 0; unit < unitCount; unit++)
+ {
+ if(primitiveProgress[unit].references > 0) // Contains processed primitives
+ {
+ if(pixelProgress[cluster].drawCall == primitiveProgress[unit].drawCall) // The batch belongs to the draw call this cluster is working on
+ {
+ if(pixelProgress[cluster].processedPrimitives == primitiveProgress[unit].firstPrimitive) // Previous primitives have been rendered
+ {
+ Task &task = taskQueue[qHead];
+ task.type = Task::PIXELS;
+ task.primitiveUnit = unit;
+ task.pixelCluster = cluster;
+
+ pixelProgress[cluster].executing = true; // Cleared at the end of finishRendering()
+
+ // Commit to the task queue
+ qHead = (qHead + 1) & TASK_COUNT_BITS;
+ qSize++;
+
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Find primitive tasks
+ if(currentDraw == nextDraw)
+ {
+ return; // No more primitives to process
+ }
+
+ for(int unit = 0; unit < unitCount; unit++)
+ {
+ DrawCall *draw = drawList[currentDraw & DRAW_COUNT_BITS];
+
+ int primitive = draw->primitive;
+ int count = draw->count;
+
+ if(primitive >= count) // Every batch of the current draw has been handed out: move on to the next draw
+ {
+ ++currentDraw; // Atomic
+
+ if(currentDraw == nextDraw)
+ {
+ return; // No more primitives to process
+ }
+
+ draw = drawList[currentDraw & DRAW_COUNT_BITS];
+ }
+
+ if(!primitiveProgress[unit].references) // Task not already being executed and not still in use by a pixel unit
+ {
+ primitive = draw->primitive; // Re-read, since draw may have changed above
+ count = draw->count;
+ int batch = draw->batchSize;
+
+ primitiveProgress[unit].drawCall = currentDraw;
+ primitiveProgress[unit].firstPrimitive = primitive;
+ primitiveProgress[unit].primitiveCount = count - primitive >= batch ? batch : count - primitive; // The last batch may be short
+
+ draw->primitive += batch;
+
+ Task &task = taskQueue[qHead];
+ task.type = Task::PRIMITIVES;
+ task.primitiveUnit = unit;
+
+ primitiveProgress[unit].references = -1; // Claimed; executeTask() sets this to clusterCount once the batch has been set up
+
+ // Commit to the task queue
+ qHead = (qHead + 1) & TASK_COUNT_BITS;
+ qSize++;
+ }
+ }
+ }
+
+ void Renderer::scheduleTask(int threadIndex) // Assigns this thread its next task (SUSPEND if none) and wakes parked threads when work is queued
+ {
+ schedulerMutex.lock(); // Held for the whole function; released on the last line
+
+ int curThreadsAwake = threadsAwake;
+
+ if((int)qSize < threadCount - curThreadsAwake + 1) // Refill when fewer tasks are queued than there are sleeping threads plus this one
+ {
+ findAvailableTasks();
+ }
+
+ if(qSize != 0)
+ {
+ task[threadIndex] = taskQueue[(qHead - qSize) & TASK_COUNT_BITS]; // Oldest entry: qHead is the next write slot and qSize entries precede it
+ qSize--;
+
+ if(curThreadsAwake != threadCount)
+ {
+ int wakeup = qSize - curThreadsAwake + 1; // Number of parked threads to wake for the remaining tasks
+
+ for(int i = 0; i < threadCount && wakeup > 0; i++)
+ {
+ if(task[i].type == Task::SUSPEND)
+ {
+ suspend[i]->wait(); // Wait until thread i has signalled that it parked (see threadLoop())
+ task[i].type = Task::RESUME;
+ resume[i]->signal();
+
+ ++threadsAwake; // Atomic
+ wakeup--;
+ }
+ }
+ }
+ }
+ else
+ {
+ task[threadIndex].type = Task::SUSPEND; // Makes taskLoop() return for this thread
+
+ --threadsAwake; // Atomic
+ }
+
+ schedulerMutex.unlock();
+ }
+
+ void Renderer::executeTask(int threadIndex) // Runs the task assigned to this thread: vertex processing plus setup for PRIMITIVES, the pixel routine for PIXELS
+ {
+ #if PERF_HUD
+ int64_t startTick = Timer::ticks();
+ #endif
+
+ switch(task[threadIndex].type)
+ {
+ case Task::PRIMITIVES:
+ {
+ int unit = task[threadIndex].primitiveUnit;
+
+ int input = primitiveProgress[unit].firstPrimitive;
+ int count = primitiveProgress[unit].primitiveCount;
+ DrawCall *draw = drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+ int (Renderer::*setupPrimitives)(int batch, int count) = draw->setupPrimitives; // Chosen by draw() from the primitive type and fill mode
+
+ processPrimitiveVertices(unit, input, count, draw->count, threadIndex);
+
+ #if PERF_HUD
+ int64_t time = Timer::ticks();
+ vertexTime[threadIndex] += time - startTick;
+ startTick = time;
+ #endif
+
+ int visible = 0; // Stays 0 when rasterizerDiscard is set, so the pixel tasks skip the pixel routine
+
+ if(!draw->setupState.rasterizerDiscard)
+ {
+ visible = (this->*setupPrimitives)(unit, count);
+ }
+
+ primitiveProgress[unit].visible = visible;
+ primitiveProgress[unit].references = clusterCount; // One reference per pixel cluster; finishRendering() counts them down
+
+ #if PERF_HUD
+ setupTime[threadIndex] += Timer::ticks() - startTick;
+ #endif
+ }
+ break;
+ case Task::PIXELS:
+ {
+ int unit = task[threadIndex].primitiveUnit;
+ int visible = primitiveProgress[unit].visible;
+
+ if(visible > 0) // Nothing to draw otherwise, but the bookkeeping below still has to run
+ {
+ int cluster = task[threadIndex].pixelCluster;
+ Primitive *primitive = primitiveBatch[unit];
+ DrawCall *draw = drawList[pixelProgress[cluster].drawCall & DRAW_COUNT_BITS];
+ DrawData *data = draw->data;
+ PixelProcessor::RoutinePointer pixelRoutine = draw->pixelPointer;
+
+ pixelRoutine(primitive, visible, cluster, data);
+ }
+
+ finishRendering(task[threadIndex]); // Always called, even when nothing was visible
+
+ #if PERF_HUD
+ pixelTime[threadIndex] += Timer::ticks() - startTick;
+ #endif
+ }
+ break;
+ case Task::RESUME: // Nothing to execute
+ break;
+ case Task::SUSPEND: // Nothing to execute; taskLoop() exits on this type
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ void Renderer::synchronize() // Takes and immediately releases the PUBLIC lock on sync
+ {
+ sync->lock(sw::PUBLIC); // draw() holds sync PRIVATE per queued draw, so this presumably waits for outstanding draws - confirm Resource semantics
+ sync->unlock();
+ }
+
+ void Renderer::finishRendering(Task &pixelTask) // Bookkeeping after a pixel task; the final task of a draw call also releases every resource draw() locked
+ {
+ int unit = pixelTask.primitiveUnit;
+ int cluster = pixelTask.pixelCluster;
+
+ DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+ DrawData &data = *draw.data;
+ int primitive = primitiveProgress[unit].firstPrimitive;
+ int count = primitiveProgress[unit].primitiveCount;
+ int processedPrimitives = primitive + count;
+
+ pixelProgress[cluster].processedPrimitives = processedPrimitives; // Lets findAvailableTasks() match this cluster with the next batch
+
+ if(pixelProgress[cluster].processedPrimitives >= draw.count) // This cluster has finished the whole draw call
+ {
+ ++pixelProgress[cluster].drawCall; // Atomic
+ pixelProgress[cluster].processedPrimitives = 0;
+ }
+
+ int ref = primitiveProgress[unit].references--; // Atomic
+
+ if(ref == 0) // NOTE(review): only reachable if the postfix -- above yields the decremented value - confirm against AtomicInt
+ {
+ ref = draw.references--; // Atomic
+
+ if(ref == 0) // Last batch of the draw call: release everything
+ {
+ #if PERF_PROFILE
+ for(int cluster = 0; cluster < clusterCount; cluster++)
+ {
+ for(int i = 0; i < PERF_TIMERS; i++)
+ {
+ profiler.cycles[i] += data.cycles[i][cluster];
+ }
+ }
+ #endif
+
+ if(draw.queries)
+ {
+ for(auto &query : *(draw.queries))
+ {
+ switch(query->type)
+ {
+ case Query::FRAGMENTS_PASSED:
+ for(int cluster = 0; cluster < clusterCount; cluster++) // Shadows the outer cluster; sums the per-cluster counters
+ {
+ query->data += data.occlusion[cluster];
+ }
+ break;
+ case Query::TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
+ query->data += processedPrimitives;
+ break;
+ default:
+ break;
+ }
+
+ --query->reference; // Atomic
+ }
+
+ delete draw.queries; // Allocated by draw()
+ draw.queries = 0;
+ }
+
+ for(int i = 0; i < RENDERTARGETS; i++)
+ {
+ if(draw.renderTarget[i])
+ {
+ draw.renderTarget[i]->unlockInternal(); // Pairs with lockInternal() in draw()
+ }
+ }
+
+ if(draw.depthBuffer)
+ {
+ draw.depthBuffer->unlockInternal();
+ }
+
+ if(draw.stencilBuffer)
+ {
+ draw.stencilBuffer->unlockStencil(); // Pairs with lockStencil() in draw()
+ }
+
+ for(int i = 0; i < TOTAL_IMAGE_UNITS; i++)
+ {
+ if(draw.texture[i])
+ {
+ draw.texture[i]->unlock();
+ }
+ }
+
+ for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
+ {
+ if(draw.vertexStream[i])
+ {
+ draw.vertexStream[i]->unlock();
+ }
+ }
+
+ if(draw.indexBuffer)
+ {
+ draw.indexBuffer->unlock();
+ }
+
+ for(int i = 0; i < MAX_UNIFORM_BUFFER_BINDINGS; i++)
+ {
+ if(draw.pUniformBuffers[i])
+ {
+ draw.pUniformBuffers[i]->unlock();
+ }
+ if(draw.vUniformBuffers[i])
+ {
+ draw.vUniformBuffers[i]->unlock();
+ }
+ }
+
+ for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
+ {
+ if(draw.transformFeedbackBuffers[i])
+ {
+ draw.transformFeedbackBuffers[i]->unlock();
+ }
+ }
+
+ draw.vertexRoutine->unbind(); // Pairs with the bind() calls in draw()
+ draw.setupRoutine->unbind();
+ draw.pixelRoutine->unbind();
+
+ sync->unlock(); // Pairs with sync->lock(sw::PRIVATE) in draw()
+
+ draw.references = -1; // Mark the slot as free again
+ resumeApp->signal(); // Wake draw() if it is waiting for a free slot
+ }
+ }
+
+ pixelProgress[cluster].executing = false; // Allows findAvailableTasks() to schedule this cluster again
+ }
+
+ void Renderer::processPrimitiveVertices(int unit, unsigned int start, unsigned int triangleCount, unsigned int loop, int thread) // Fills a batch of three vertex indices per primitive for the unit's current draw call, then invokes the draw call's vertex routine on it
+ {
+ Triangle *triangle = triangleBatch[unit];
+ int primitiveDrawCall = primitiveProgress[unit].drawCall;
+ DrawCall *draw = drawList[primitiveDrawCall & DRAW_COUNT_BITS]; // The draw call number is masked to select a drawList slot
+ DrawData *data = draw->data;
+ VertexTask *task = vertexTask[thread];
+
+ const void *indices = data->indices; // Only dereferenced by the DRAW_INDEXED* cases below
+ VertexProcessor::RoutinePointer vertexRoutine = draw->vertexPointer;
+
+ if(task->vertexCache.drawCall != primitiveDrawCall) // The thread's vertex cache was filled for a different draw call: clear it and retag it
+ {
+ task->vertexCache.clear();
+ task->vertexCache.drawCall = primitiveDrawCall;
+ }
+
+ unsigned int batch[128][3]; // One index triplet per primitive; the loops below write up to triangleCount entries, so it must not exceed 128. FIXME: Adjust to dynamic batch size
+
+ switch(draw->drawType)
+ {
+ case DRAW_POINTLIST: // Non-indexed point: the same vertex index fills all three slots
+ {
+ unsigned int index = start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index;
+ batch[i][1] = index;
+ batch[i][2] = index;
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_LINELIST: // Non-indexed line: two vertices per primitive, the second one is duplicated into the third slot
+ {
+ unsigned int index = 2 * start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index + 0;
+ batch[i][1] = index + 1;
+ batch[i][2] = index + 1;
+
+ index += 2;
+ }
+ }
+ break;
+ case DRAW_LINESTRIP: // Consecutive lines share a vertex: the index advances by one per primitive
+ {
+ unsigned int index = start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index + 0;
+ batch[i][1] = index + 1;
+ batch[i][2] = index + 1;
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_LINELOOP: // Same as a line strip, but vertex indices are taken modulo 'loop'
+ {
+ unsigned int index = start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = (index + 0) % loop;
+ batch[i][1] = (index + 1) % loop;
+ batch[i][2] = (index + 1) % loop;
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_TRIANGLELIST: // Three consecutive vertices per primitive
+ {
+ unsigned int index = 3 * start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index + 0;
+ batch[i][1] = index + 1;
+ batch[i][2] = index + 2;
+
+ index += 3;
+ }
+ }
+ break;
+ case DRAW_TRIANGLESTRIP: // Two of the three vertices are swapped for odd 'index'
+ {
+ unsigned int index = start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ if(leadingVertexFirst) // Vertex index + 0 stays in slot 0; slots 1 and 2 swap for odd 'index'
+ {
+ batch[i][0] = index + 0;
+ batch[i][1] = index + (index & 1) + 1;
+ batch[i][2] = index + (~index & 1) + 1;
+ }
+ else // Vertex index + 2 stays in slot 2; slots 0 and 1 swap for odd 'index'
+ {
+ batch[i][0] = index + (index & 1);
+ batch[i][1] = index + (~index & 1);
+ batch[i][2] = index + 2;
+ }
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_TRIANGLEFAN: // Every primitive uses vertex 0 plus two consecutive vertices
+ {
+ unsigned int index = start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ if(leadingVertexFirst) // Vertex 0 goes in the last slot
+ {
+ batch[i][0] = index + 1;
+ batch[i][1] = index + 2;
+ batch[i][2] = 0;
+ }
+ else // Vertex 0 goes in the first slot
+ {
+ batch[i][0] = 0;
+ batch[i][1] = index + 1;
+ batch[i][2] = index + 2;
+ }
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_INDEXEDPOINTLIST8: // Indexed variants read vertex indices from data->indices, here as unsigned char elements
+ {
+ const unsigned char *index = (const unsigned char*)indices + start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = *index;
+ batch[i][1] = *index;
+ batch[i][2] = *index;
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_INDEXEDPOINTLIST16: // Same as above with unsigned short index elements
+ {
+ const unsigned short *index = (const unsigned short*)indices + start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = *index;
+ batch[i][1] = *index;
+ batch[i][2] = *index;
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_INDEXEDPOINTLIST32: // Same as above with unsigned int index elements
+ {
+ const unsigned int *index = (const unsigned int*)indices + start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = *index;
+ batch[i][1] = *index;
+ batch[i][2] = *index;
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_INDEXEDLINELIST8: // Two index elements per line; the second is duplicated into the third slot
+ {
+ const unsigned char *index = (const unsigned char*)indices + 2 * start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[0];
+ batch[i][1] = index[1];
+ batch[i][2] = index[1];
+
+ index += 2;
+ }
+ }
+ break;
+ case DRAW_INDEXEDLINELIST16:
+ {
+ const unsigned short *index = (const unsigned short*)indices + 2 * start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[0];
+ batch[i][1] = index[1];
+ batch[i][2] = index[1];
+
+ index += 2;
+ }
+ }
+ break;
+ case DRAW_INDEXEDLINELIST32:
+ {
+ const unsigned int *index = (const unsigned int*)indices + 2 * start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[0];
+ batch[i][1] = index[1];
+ batch[i][2] = index[1];
+
+ index += 2;
+ }
+ }
+ break;
+ case DRAW_INDEXEDLINESTRIP8: // The index pointer advances by one element per line, so index[1] reads one element past the current primitive's first
+ {
+ const unsigned char *index = (const unsigned char*)indices + start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[0];
+ batch[i][1] = index[1];
+ batch[i][2] = index[1];
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_INDEXEDLINESTRIP16:
+ {
+ const unsigned short *index = (const unsigned short*)indices + start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[0];
+ batch[i][1] = index[1];
+ batch[i][2] = index[1];
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_INDEXEDLINESTRIP32:
+ {
+ const unsigned int *index = (const unsigned int*)indices + start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[0];
+ batch[i][1] = index[1];
+ batch[i][2] = index[1];
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_INDEXEDLINELOOP8: // The position within the index buffer (not the fetched vertex index) is taken modulo 'loop'
+ {
+ const unsigned char *index = (const unsigned char*)indices;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[(start + i + 0) % loop];
+ batch[i][1] = index[(start + i + 1) % loop];
+ batch[i][2] = index[(start + i + 1) % loop];
+ }
+ }
+ break;
+ case DRAW_INDEXEDLINELOOP16:
+ {
+ const unsigned short *index = (const unsigned short*)indices;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[(start + i + 0) % loop];
+ batch[i][1] = index[(start + i + 1) % loop];
+ batch[i][2] = index[(start + i + 1) % loop];
+ }
+ }
+ break;
+ case DRAW_INDEXEDLINELOOP32:
+ {
+ const unsigned int *index = (const unsigned int*)indices;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[(start + i + 0) % loop];
+ batch[i][1] = index[(start + i + 1) % loop];
+ batch[i][2] = index[(start + i + 1) % loop];
+ }
+ }
+ break;
+ case DRAW_INDEXEDTRIANGLELIST8: // Three index elements per triangle
+ {
+ const unsigned char *index = (const unsigned char*)indices + 3 * start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[0];
+ batch[i][1] = index[1];
+ batch[i][2] = index[2];
+
+ index += 3;
+ }
+ }
+ break;
+ case DRAW_INDEXEDTRIANGLELIST16:
+ {
+ const unsigned short *index = (const unsigned short*)indices + 3 * start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[0];
+ batch[i][1] = index[1];
+ batch[i][2] = index[2];
+
+ index += 3;
+ }
+ }
+ break;
+ case DRAW_INDEXEDTRIANGLELIST32:
+ {
+ const unsigned int *index = (const unsigned int*)indices + 3 * start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[0];
+ batch[i][1] = index[1];
+ batch[i][2] = index[2];
+
+ index += 3;
+ }
+ }
+ break;
+ case DRAW_INDEXEDTRIANGLESTRIP8: // Slots 1 and 2 swap when (start + i) is odd; unlike DRAW_TRIANGLESTRIP this path does not consult leadingVertexFirst
+ {
+ const unsigned char *index = (const unsigned char*)indices + start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[0];
+ batch[i][1] = index[((start + i) & 1) + 1];
+ batch[i][2] = index[(~(start + i) & 1) + 1];
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_INDEXEDTRIANGLESTRIP16:
+ {
+ const unsigned short *index = (const unsigned short*)indices + start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[0];
+ batch[i][1] = index[((start + i) & 1) + 1];
+ batch[i][2] = index[(~(start + i) & 1) + 1];
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_INDEXEDTRIANGLESTRIP32:
+ {
+ const unsigned int *index = (const unsigned int*)indices + start;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[0];
+ batch[i][1] = index[((start + i) & 1) + 1];
+ batch[i][2] = index[(~(start + i) & 1) + 1];
+
+ index += 1;
+ }
+ }
+ break;
+ case DRAW_INDEXEDTRIANGLEFAN8: // Index element 0 always goes in the last slot; unlike DRAW_TRIANGLEFAN this path does not consult leadingVertexFirst
+ {
+ const unsigned char *index = (const unsigned char*)indices;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[start + i + 1];
+ batch[i][1] = index[start + i + 2];
+ batch[i][2] = index[0];
+ }
+ }
+ break;
+ case DRAW_INDEXEDTRIANGLEFAN16:
+ {
+ const unsigned short *index = (const unsigned short*)indices;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[start + i + 1];
+ batch[i][1] = index[start + i + 2];
+ batch[i][2] = index[0];
+ }
+ }
+ break;
+ case DRAW_INDEXEDTRIANGLEFAN32:
+ {
+ const unsigned int *index = (const unsigned int*)indices;
+
+ for(unsigned int i = 0; i < triangleCount; i++)
+ {
+ batch[i][0] = index[start + i + 1];
+ batch[i][1] = index[start + i + 2];
+ batch[i][2] = index[0];
+ }
+ }
+ break;
+ case DRAW_QUADLIST: // Each group of four vertices produces two triangles: (0, 1, 2) and (0, 2, 3)
+ {
+ unsigned int index = 4 * start / 2; // NOTE(review): presumably 'start' counts triangles, two per quad - confirm against the caller
+
+ for(unsigned int i = 0; i < triangleCount; i += 2) // Writes entries i and i + 1 in every iteration, also when triangleCount is odd
+ {
+ batch[i+0][0] = index + 0;
+ batch[i+0][1] = index + 1;
+ batch[i+0][2] = index + 2;
+
+ batch[i+1][0] = index + 0;
+ batch[i+1][1] = index + 2;
+ batch[i+1][2] = index + 3;
+
+ index += 4;
+ }
+ }
+ break;
+ default: // Unhandled draw type: assert and return without invoking the vertex routine
+ ASSERT(false);
+ return;
+ }
+
+ task->primitiveStart = start;
+ task->vertexCount = triangleCount * 3; // Three batch entries per primitive
+ vertexRoutine(&triangle->v0, (unsigned int*)&batch, task, data); // Receives the address of the unit's first triangle vertex, the flattened index batch, the thread's task and the draw data
+ }
+
+	// Runs the setup routine of the unit's current draw call on 'count' triangles from the unit's triangle batch.
+	// A triangle is skipped when the AND of its vertex clip flags differs from Clipper::CLIP_FINITE,
+	// or when the clipper rejects its polygon. The output pointer advances by state.multiSample
+	// entries for every triangle the setup routine accepts.
+	// Returns the number of accepted triangles.
+	int Renderer::setupSolidTriangles(int unit, int count)
+	{
+		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+		SetupProcessor::State &state = draw.setupState;
+		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
+		const DrawData *data = draw.data;
+
+		const int samples = state.multiSample;
+		const int position = state.positionRegister;
+
+		Triangle *input = triangleBatch[unit];
+		Primitive *output = primitiveBatch[unit];
+		int accepted = 0;
+
+		for(int n = 0; n < count; n++)
+		{
+			Triangle *current = input + n;
+
+			Vertex &a = current->v0;
+			Vertex &b = current->v1;
+			Vertex &c = current->v2;
+
+			if((a.clipFlags & b.clipFlags & c.clipFlags) != Clipper::CLIP_FINITE)
+			{
+				continue;
+			}
+
+			Polygon polygon(&a.v[position], &b.v[position], &c.v[position]);
+
+			int combinedFlags = a.clipFlags | b.clipFlags | c.clipFlags | draw.clipFlags;
+
+			// Clipping is only attempted when some flag other than CLIP_FINITE is set
+			if(combinedFlags != Clipper::CLIP_FINITE && !clipper->clip(polygon, combinedFlags, draw))
+			{
+				continue;
+			}
+
+			if(setupRoutine(output, current, &polygon, data))
+			{
+				output += samples;
+				accepted++;
+			}
+		}
+
+		return accepted;
+	}
+
+	// Sets up the three edges of the unit's first triangle as lines.
+	// The triangle is dropped (returning 0) depending on the sign of 'd' and the cull mode.
+	// Batch entries 1 and 2 are overwritten with the (v1, v2) and (v2, v0) edges;
+	// entry 0 already holds (v0, v1). 'count' is not used.
+	// Returns the number of edges accepted by setupLine() (0 to 3).
+	int Renderer::setupWireframeTriangle(int unit, int count)
+	{
+		Triangle *edges = triangleBatch[unit];
+		Primitive *output = primitiveBatch[unit];
+
+		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+		SetupProcessor::State &state = draw.setupState;
+
+		const Vertex &v0 = edges[0].v0;
+		const Vertex &v1 = edges[0].v1;
+		const Vertex &v2 = edges[0].v2;
+
+		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w + (v0.x * v2.y - v0.y * v2.x) * v1.w + (v2.x * v1.y - v1.x * v2.y) * v0.w;
+
+		const bool cullClockwise = (state.cullMode == CULL_CLOCKWISE);
+		const bool cullCounterClockwise = !cullClockwise && (state.cullMode == CULL_COUNTERCLOCKWISE);
+
+		if((cullClockwise && d >= 0) || (cullCounterClockwise && d <= 0))
+		{
+			return 0;
+		}
+
+		// Build the remaining two edges from the first triangle's vertices
+		edges[1].v0 = v1;
+		edges[1].v1 = v2;
+		edges[2].v0 = v2;
+		edges[2].v1 = v0;
+
+		if(state.color[0][0].flat)   // FIXME
+		{
+			// Give every vertex of the added edges the colors of the first triangle's v0
+			for(int c = 0; c < 2; c++)
+			{
+				edges[1].v0.C[c] = edges[0].v0.C[c];
+				edges[1].v1.C[c] = edges[0].v0.C[c];
+				edges[2].v0.C[c] = edges[0].v0.C[c];
+				edges[2].v1.C[c] = edges[0].v0.C[c];
+			}
+		}
+
+		int visible = 0;
+
+		for(int edge = 0; edge < 3; edge++)
+		{
+			if(setupLine(*output, edges[edge], draw))
+			{
+				output->area = 0.5f * d;
+
+				output++;
+				visible++;
+			}
+		}
+
+		return visible;
+	}
+
+	// Sets up the three vertices of the unit's first triangle as points.
+	// The triangle is dropped (returning 0) depending on the sign of 'd' and the cull mode.
+	// Batch entries 1 and 2 receive v1 and v2 as their first vertex; 'count' is not used.
+	// Returns the number of points accepted by setupPoint() (0 to 3).
+	int Renderer::setupVertexTriangle(int unit, int count)
+	{
+		Triangle *corners = triangleBatch[unit];
+		Primitive *output = primitiveBatch[unit];
+
+		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+		SetupProcessor::State &state = draw.setupState;
+
+		const Vertex &v0 = corners[0].v0;
+		const Vertex &v1 = corners[0].v1;
+		const Vertex &v2 = corners[0].v2;
+
+		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w + (v0.x * v2.y - v0.y * v2.x) * v1.w + (v2.x * v1.y - v1.x * v2.y) * v0.w;
+
+		const bool cullClockwise = (state.cullMode == CULL_CLOCKWISE);
+		const bool cullCounterClockwise = !cullClockwise && (state.cullMode == CULL_COUNTERCLOCKWISE);
+
+		if((cullClockwise && d >= 0) || (cullCounterClockwise && d <= 0))
+		{
+			return 0;
+		}
+
+		// setupPoint() reads the point from v0 of each entry
+		corners[1].v0 = v1;
+		corners[2].v0 = v2;
+
+		int visible = 0;
+
+		for(int corner = 0; corner < 3; corner++)
+		{
+			if(setupPoint(*output, corners[corner], draw))
+			{
+				output->area = 0.5f * d;
+
+				output++;
+				visible++;
+			}
+		}
+
+		return visible;
+	}
+
+	// Calls setupLine() for 'count' entries of the unit's triangle batch.
+	// The output pointer advances by state.multiSample entries for every accepted line.
+	// Returns the number of accepted lines.
+	int Renderer::setupLines(int unit, int count)
+	{
+		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+		SetupProcessor::State &state = draw.setupState;
+		const int samples = state.multiSample;
+
+		Triangle *lines = triangleBatch[unit];
+		Primitive *output = primitiveBatch[unit];
+		int accepted = 0;
+
+		for(int n = 0; n < count; n++)
+		{
+			if(!setupLine(*output, lines[n], draw))
+			{
+				continue;
+			}
+
+			output += samples;
+			accepted++;
+		}
+
+		return accepted;
+	}
+
+	// Calls setupPoint() for 'count' entries of the unit's triangle batch.
+	// The output pointer advances by state.multiSample entries for every accepted point.
+	// Returns the number of accepted points.
+	int Renderer::setupPoints(int unit, int count)
+	{
+		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+		SetupProcessor::State &state = draw.setupState;
+		const int samples = state.multiSample;
+
+		Triangle *points = triangleBatch[unit];
+		Primitive *output = primitiveBatch[unit];
+		int accepted = 0;
+
+		for(int n = 0; n < count; n++)
+		{
+			if(!setupPoint(*output, points[n], draw))
+			{
+				continue;
+			}
+
+			output += samples;
+			accepted++;
+		}
+
+		return accepted;
+	}
+
+ bool Renderer::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw) // Expands the line from triangle.v0 to triangle.v1 into a polygon, clips it if needed and runs the setup routine; returns false when the line is rejected
+ {
+ const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
+ const SetupProcessor::State &state = draw.setupState;
+ const DrawData &data = *draw.data;
+
+ float lineWidth = data.lineWidth;
+
+ Vertex &v0 = triangle.v0;
+ Vertex &v1 = triangle.v1;
+
+ int pos = state.positionRegister;
+
+ const float4 &P0 = v0.v[pos];
+ const float4 &P1 = v1.v[pos];
+
+ if(P0.w <= 0 && P1.w <= 0) // Both endpoints have a non-positive w: reject
+ {
+ return false;
+ }
+
+ const float W = data.Wx16[0] * (1.0f / 16.0f); // Wx16 and Hx16 are scaled down by 16
+ const float H = data.Hx16[0] * (1.0f / 16.0f);
+
+ float dx = W * (P1.x / P1.w - P0.x / P0.w); // Difference of the endpoints after dividing by w, scaled by W and H
+ float dy = H * (P1.y / P1.w - P0.y / P0.w);
+
+ if(dx == 0 && dy == 0) // Zero-length line: reject
+ {
+ return false;
+ }
+
+ if(state.multiSample > 1) // Rectangle
+ {
+ float4 P[4];
+ int C[4];
+
+ P[0] = P0;
+ P[1] = P1;
+ P[2] = P1;
+ P[3] = P0;
+
+ float scale = lineWidth * 0.5f / sqrt(dx*dx + dy*dy); // Rescales (dx, dy) to a length of half the line width
+
+ dx *= scale;
+ dy *= scale;
+
+ float dx0h = dx * P0.w / H; // Offsets are multiplied by each endpoint's w and divided by H or W
+ float dy0w = dy * P0.w / W;
+
+ float dx1h = dx * P1.w / H;
+ float dy1w = dy * P1.w / W;
+
+ P[0].x += -dy0w; // Corners are displaced perpendicular to the line: x by -/+dy, y by +/-dx
+ P[0].y += +dx0h;
+ C[0] = clipper->computeClipFlags(P[0]);
+
+ P[1].x += -dy1w;
+ P[1].y += +dx1h;
+ C[1] = clipper->computeClipFlags(P[1]);
+
+ P[2].x += +dy1w;
+ P[2].y += -dx1h;
+ C[2] = clipper->computeClipFlags(P[2]);
+
+ P[3].x += +dy0w;
+ P[3].y += -dx0h;
+ C[3] = clipper->computeClipFlags(P[3]);
+
+ if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE) // Otherwise falls through to the final 'return false'
+ {
+ Polygon polygon(P, 4);
+
+ int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | draw.clipFlags;
+
+ if(clipFlagsOr != Clipper::CLIP_FINITE) // Some clip flag other than CLIP_FINITE is set: clip the polygon
+ {
+ if(!clipper->clip(polygon, clipFlagsOr, draw))
+ {
+ return false;
+ }
+ }
+
+ return setupRoutine(&primitive, &triangle, &polygon, &data);
+ }
+ }
+ else // Diamond test convention
+ {
+ float4 P[8]; // P[0..3] are the diamond corners around P0, P[4..7] those around P1
+ int C[8];
+
+ P[0] = P0;
+ P[1] = P0;
+ P[2] = P0;
+ P[3] = P0;
+ P[4] = P1;
+ P[5] = P1;
+ P[6] = P1;
+ P[7] = P1;
+
+ float dx0 = lineWidth * 0.5f * P0.w / W; // Half the line width, multiplied by the endpoint's w and divided by W or H
+ float dy0 = lineWidth * 0.5f * P0.w / H;
+
+ float dx1 = lineWidth * 0.5f * P1.w / W;
+ float dy1 = lineWidth * 0.5f * P1.w / H;
+
+ P[0].x += -dx0; // Corner order per endpoint: -x, +y, +x, -y
+ C[0] = clipper->computeClipFlags(P[0]);
+
+ P[1].y += +dy0;
+ C[1] = clipper->computeClipFlags(P[1]);
+
+ P[2].x += +dx0;
+ C[2] = clipper->computeClipFlags(P[2]);
+
+ P[3].y += -dy0;
+ C[3] = clipper->computeClipFlags(P[3]);
+
+ P[4].x += -dx1;
+ C[4] = clipper->computeClipFlags(P[4]);
+
+ P[5].y += +dy1;
+ C[5] = clipper->computeClipFlags(P[5]);
+
+ P[6].x += +dx1;
+ C[6] = clipper->computeClipFlags(P[6]);
+
+ P[7].y += -dy1;
+ C[7] = clipper->computeClipFlags(P[7]);
+
+ if((C[0] & C[1] & C[2] & C[3] & C[4] & C[5] & C[6] & C[7]) == Clipper::CLIP_FINITE) // Otherwise falls through to the final 'return false'
+ {
+ float4 L[6]; // Six of the eight diamond corners, chosen by comparing dx with dy and -dy
+
+ if(dx > -dy)
+ {
+ if(dx > dy) // Right
+ {
+ L[0] = P[0];
+ L[1] = P[1];
+ L[2] = P[5];
+ L[3] = P[6];
+ L[4] = P[7];
+ L[5] = P[3];
+ }
+ else // Down
+ {
+ L[0] = P[0];
+ L[1] = P[4];
+ L[2] = P[5];
+ L[3] = P[6];
+ L[4] = P[2];
+ L[5] = P[3];
+ }
+ }
+ else
+ {
+ if(dx > dy) // Up
+ {
+ L[0] = P[0];
+ L[1] = P[1];
+ L[2] = P[2];
+ L[3] = P[6];
+ L[4] = P[7];
+ L[5] = P[4];
+ }
+ else // Left
+ {
+ L[0] = P[1];
+ L[1] = P[2];
+ L[2] = P[3];
+ L[3] = P[7];
+ L[4] = P[4];
+ L[5] = P[5];
+ }
+ }
+
+ Polygon polygon(L, 6);
+
+ int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | C[4] | C[5] | C[6] | C[7] | draw.clipFlags; // Uses the flags of all eight corners, including the two not copied into L
+
+ if(clipFlagsOr != Clipper::CLIP_FINITE)
+ {
+ if(!clipper->clip(polygon, clipFlagsOr, draw))
+ {
+ return false;
+ }
+ }
+
+ return setupRoutine(&primitive, &triangle, &polygon, &data);
+ }
+ }
+
+ return false; // The combined clip flag test failed in whichever branch was taken
+ }
+
+	// Expands the point stored in triangle.v0 into a four-corner polygon, clips it if needed
+	// and runs the setup routine. The size comes from the vertex's point size register when
+	// one is in use, otherwise from data.point.pointSize[0], and is clamped to
+	// [pointSizeMin, pointSizeMax]. triangle.v1 and triangle.v2 are overwritten with copies
+	// of v0 displaced in X and Y respectively.
+	// Returns false when the point is rejected.
+	bool Renderer::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
+	{
+		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
+		const SetupProcessor::State &state = draw.setupState;
+		const DrawData &data = *draw.data;
+
+		Vertex &v = triangle.v0;
+
+		int sizeRegister = state.pointSizeRegister;
+		float pSize = (state.pointSizeRegister != Unused) ? v.v[sizeRegister].y : data.point.pointSize[0];
+		pSize = clamp(pSize, data.point.pointSizeMin, data.point.pointSizeMax);
+
+		int pos = state.positionRegister;
+
+		float4 P[4];
+		int C[4];
+
+		for(int corner = 0; corner < 4; corner++)
+		{
+			P[corner] = v.v[pos];
+		}
+
+		const float X = pSize * P[0].w * data.halfPixelX[0];
+		const float Y = pSize * P[0].w * data.halfPixelY[0];
+
+		// Corner order: (-X, +Y), (+X, +Y), (+X, -Y), (-X, -Y)
+		P[0].x -= X;
+		P[0].y += Y;
+		C[0] = clipper->computeClipFlags(P[0]);
+
+		P[1].x += X;
+		P[1].y += Y;
+		C[1] = clipper->computeClipFlags(P[1]);
+
+		P[2].x += X;
+		P[2].y -= Y;
+		C[2] = clipper->computeClipFlags(P[2]);
+
+		P[3].x -= X;
+		P[3].y -= Y;
+		C[3] = clipper->computeClipFlags(P[3]);
+
+		triangle.v1 = triangle.v0;
+		triangle.v2 = triangle.v0;
+
+		triangle.v1.X += iround(16 * 0.5f * pSize);
+		triangle.v2.Y -= iround(16 * 0.5f * pSize) * (data.Hx16[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
+
+		Polygon polygon(P, 4);
+
+		if((C[0] & C[1] & C[2] & C[3]) != Clipper::CLIP_FINITE)
+		{
+			return false;
+		}
+
+		int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | draw.clipFlags;
+
+		// Clipping is only attempted when some flag other than CLIP_FINITE is set
+		if(clipFlagsOr != Clipper::CLIP_FINITE && !clipper->clip(polygon, clipFlagsOr, draw))
+		{
+			return false;
+		}
+
+		return setupRoutine(&primitive, &triangle, &polygon, &data);
+	}
+
+	// Allocates the per-unit triangle and primitive batches and the per-thread vertex tasks,
+	// then creates the resume/suspend events and one worker thread for each of the
+	// 'threadCount' threads. unitCount and clusterCount are both set to ceilPow2(threadCount).
+	void Renderer::initializeThreads()
+	{
+		unitCount = ceilPow2(threadCount);
+		clusterCount = ceilPow2(threadCount);
+
+		for(int i = 0; i < unitCount; i++)
+		{
+			triangleBatch[i] = (Triangle*)allocate(batchSize * sizeof(Triangle));
+			primitiveBatch[i] = (Primitive*)allocate(batchSize * sizeof(Primitive));
+		}
+
+		for(int i = 0; i < threadCount; i++)
+		{
+			vertexTask[i] = (VertexTask*)allocate(sizeof(VertexTask));
+			vertexTask[i]->vertexCache.drawCall = -1;   // Differs from any draw call number compared against it, so the cache is cleared on first use
+
+			task[i].type = Task::SUSPEND;
+
+			resume[i] = new Event();
+			suspend[i] = new Event();
+
+			// 'parameters' lives on this stack frame and is passed to the new thread by address.
+			// NOTE(review): the wait() below presumably blocks until the thread has read it - confirm against threadFunction.
+			Parameters parameters;
+			parameters.threadIndex = i;
+			parameters.renderer = this;
+
+			exitThreads = false;
+			worker[i] = new Thread(threadFunction, &parameters);   // Was the mis-encoded token "¶meters" (a decoded "&para" entity)
+
+			suspend[i]->wait();
+			suspend[i]->signal();
+		}
+	}
+
+	// Polls with Thread::sleep(1) until threadsAwake is zero, then for every thread index that has
+	// a worker: sets exitThreads, signals its resume event, joins it and deletes the worker
+	// and both events. Vertex tasks and the first 16 triangle/primitive batches are
+	// deallocated and their pointers reset to 0.
+	void Renderer::terminateThreads()
+	{
+		while(threadsAwake != 0)
+		{
+			Thread::sleep(1);
+		}
+
+		for(int t = 0; t < threadCount; t++)
+		{
+			if(worker[t] != 0)
+			{
+				exitThreads = true;
+				resume[t]->signal();
+				worker[t]->join();
+
+				delete worker[t];
+				delete resume[t];
+				delete suspend[t];
+
+				worker[t] = 0;
+				resume[t] = 0;
+				suspend[t] = 0;
+			}
+
+			deallocate(vertexTask[t]);
+			vertexTask[t] = 0;
+		}
+
+		for(int batch = 0; batch < 16; batch++)
+		{
+			deallocate(triangleBatch[batch]);
+			deallocate(primitiveBatch[batch]);
+
+			triangleBatch[batch] = 0;
+			primitiveBatch[batch] = 0;
+		}
+	}
+
+	// Scans the vertex shader's instructions and applies every DEF, DEFI and DEFB instruction
+	// as a float, integer or boolean vertex shader constant at the instruction's
+	// destination index. Does nothing for a null shader.
+	void Renderer::loadConstants(const VertexShader *vertexShader)
+	{
+		if(!vertexShader)
+		{
+			return;
+		}
+
+		const size_t length = vertexShader->getLength();
+
+		for(size_t n = 0; n < length; n++)
+		{
+			const Shader::Instruction *instruction = vertexShader->getInstruction(n);
+
+			switch(instruction->opcode)
+			{
+			case Shader::OPCODE_DEF:
+				{
+					int index = instruction->dst.index;
+					float value[4];
+
+					for(int c = 0; c < 4; c++)
+					{
+						value[c] = instruction->src[0].value[c];
+					}
+
+					setVertexShaderConstantF(index, value);
+				}
+				break;
+			case Shader::OPCODE_DEFI:
+				{
+					int index = instruction->dst.index;
+					int integer[4];
+
+					for(int c = 0; c < 4; c++)
+					{
+						integer[c] = instruction->src[0].integer[c];
+					}
+
+					setVertexShaderConstantI(index, integer);
+				}
+				break;
+			case Shader::OPCODE_DEFB:
+				{
+					int index = instruction->dst.index;
+					int boolean = instruction->src[0].boolean[0];
+
+					setVertexShaderConstantB(index, &boolean);
+				}
+				break;
+			default:
+				break;
+			}
+		}
+	}
+
+	// Scans the pixel shader's instructions and applies every DEF, DEFI and DEFB instruction
+	// as a float, integer or boolean pixel shader constant at the instruction's
+	// destination index. Does nothing for a null shader.
+	void Renderer::loadConstants(const PixelShader *pixelShader)
+	{
+		if(!pixelShader)
+		{
+			return;
+		}
+
+		const size_t length = pixelShader->getLength();
+
+		for(size_t n = 0; n < length; n++)
+		{
+			const Shader::Instruction *instruction = pixelShader->getInstruction(n);
+
+			switch(instruction->opcode)
+			{
+			case Shader::OPCODE_DEF:
+				{
+					int index = instruction->dst.index;
+					float value[4];
+
+					for(int c = 0; c < 4; c++)
+					{
+						value[c] = instruction->src[0].value[c];
+					}
+
+					setPixelShaderConstantF(index, value);
+				}
+				break;
+			case Shader::OPCODE_DEFI:
+				{
+					int index = instruction->dst.index;
+					int integer[4];
+
+					for(int c = 0; c < 4; c++)
+					{
+						integer[c] = instruction->src[0].integer[c];
+					}
+
+					setPixelShaderConstantI(index, integer);
+				}
+				break;
+			case Shader::OPCODE_DEFB:
+				{
+					int index = instruction->dst.index;
+					int boolean = instruction->src[0].boolean[0];
+
+					setPixelShaderConstantB(index, &boolean);
+				}
+				break;
+			default:
+				break;
+			}
+		}
+	}
+
+	// Stores the given resource as the context's index buffer.
+	void Renderer::setIndexBuffer(Resource *indexBuffer)
+	{
+		context->indexBuffer = indexBuffer;
+	}
+
+	// Stores 'mask' as the context's sample mask.
+	void Renderer::setMultiSampleMask(unsigned int mask)
+	{
+		context->sampleMask = mask;
+	}
+
+	// Writes the mode to the namespace-scope sw::transparencyAntialiasing variable
+	// (the qualification distinguishes it from the parameter of the same name).
+	void Renderer::setTransparencyAntialiasing(TransparencyAntialiasing transparencyAntialiasing)
+	{
+		sw::transparencyAntialiasing = transparencyAntialiasing;
+	}
+
+	// Returns true when the texture resource bound to 'sampler' is also the resource of
+	// one of the context's render targets or of its depth buffer.
+	bool Renderer::isReadWriteTexture(int sampler)
+	{
+		for(int target = 0; target < RENDERTARGETS; target++)
+		{
+			if(!context->renderTarget[target])
+			{
+				continue;
+			}
+
+			if(context->texture[sampler] == context->renderTarget[target]->getResource())
+			{
+				return true;
+			}
+		}
+
+		return context->depthBuffer && context->texture[sampler] == context->depthBuffer->getResource();
+	}
+
+	// Recomputes the clip planes from the user planes when updateClipPlanes is set, then clears it.
+	// Only planes whose bit is set in clipFlags are written. With a fixed-function vertex
+	// processor each user plane is multiplied by the view transform; otherwise it is copied as is.
+	void Renderer::updateClipper()
+	{
+		if(!updateClipPlanes)
+		{
+			return;
+		}
+
+		const int planeBit[6] =
+		{
+			Clipper::CLIP_PLANE0,
+			Clipper::CLIP_PLANE1,
+			Clipper::CLIP_PLANE2,
+			Clipper::CLIP_PLANE3,
+			Clipper::CLIP_PLANE4,
+			Clipper::CLIP_PLANE5
+		};
+
+		if(VertexProcessor::isFixedFunction())   // User plane in world space
+		{
+			const Matrix &scissorWorld = getViewTransform();
+
+			for(int plane = 0; plane < 6; plane++)
+			{
+				if(clipFlags & planeBit[plane])
+				{
+					clipPlane[plane] = scissorWorld * userPlane[plane];
+				}
+			}
+		}
+		else   // User plane in clip space
+		{
+			for(int plane = 0; plane < 6; plane++)
+			{
+				if(clipFlags & planeBit[plane])
+				{
+					clipPlane[plane] = userPlane[plane];
+				}
+			}
+		}
+
+		updateClipPlanes = false;
+	}
+
+	// Stores 'resource' as the texture of the given sampler in the context.
+	// Asserts that 'sampler' is below TOTAL_IMAGE_UNITS.
+	void Renderer::setTextureResource(unsigned int sampler, Resource *resource)
+	{
+		ASSERT(sampler < TOTAL_IMAGE_UNITS);
+
+		context->texture[sampler] = resource;
+	}
+
+	// Forwards the surface for one face and mipmap level to the context's sampler object.
+	// Asserts that 'sampler', 'face' and 'level' are below TOTAL_IMAGE_UNITS, 6 and MIPMAP_LEVELS.
+	void Renderer::setTextureLevel(unsigned int sampler, unsigned int face, unsigned int level, Surface *surface, TextureType type)
+	{
+		ASSERT(sampler < TOTAL_IMAGE_UNITS && face < 6 && level < MIPMAP_LEVELS);
+
+		context->sampler[sampler].setTextureLevel(face, level, surface, type);
+	}
+
+	// Forwards the texture filter of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setTextureFilter(SamplerType type, int sampler, FilterType textureFilter)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setTextureFilter(sampler, textureFilter);
+			return;
+		}
+
+		PixelProcessor::setTextureFilter(sampler, textureFilter);
+	}
+
+	// Forwards the mipmap filter of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setMipmapFilter(SamplerType type, int sampler, MipmapType mipmapFilter)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setMipmapFilter(sampler, mipmapFilter);
+			return;
+		}
+
+		PixelProcessor::setMipmapFilter(sampler, mipmapFilter);
+	}
+
+	// Forwards the gather enable flag of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setGatherEnable(SamplerType type, int sampler, bool enable)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setGatherEnable(sampler, enable);
+			return;
+		}
+
+		PixelProcessor::setGatherEnable(sampler, enable);
+	}
+
+	// Forwards the U addressing mode of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setAddressingModeU(SamplerType type, int sampler, AddressingMode addressMode)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setAddressingModeU(sampler, addressMode);
+			return;
+		}
+
+		PixelProcessor::setAddressingModeU(sampler, addressMode);
+	}
+
+	// Forwards the V addressing mode of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setAddressingModeV(SamplerType type, int sampler, AddressingMode addressMode)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setAddressingModeV(sampler, addressMode);
+			return;
+		}
+
+		PixelProcessor::setAddressingModeV(sampler, addressMode);
+	}
+
+	// Forwards the W addressing mode of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setAddressingModeW(SamplerType type, int sampler, AddressingMode addressMode)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setAddressingModeW(sampler, addressMode);
+			return;
+		}
+
+		PixelProcessor::setAddressingModeW(sampler, addressMode);
+	}
+
+	// Forwards the sRGB read flag of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setReadSRGB(SamplerType type, int sampler, bool sRGB)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setReadSRGB(sampler, sRGB);
+			return;
+		}
+
+		PixelProcessor::setReadSRGB(sampler, sRGB);
+	}
+
+	// Forwards the mipmap LOD bias of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setMipmapLOD(SamplerType type, int sampler, float bias)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setMipmapLOD(sampler, bias);
+			return;
+		}
+
+		PixelProcessor::setMipmapLOD(sampler, bias);
+	}
+
+	// Forwards the border color of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setBorderColor(SamplerType type, int sampler, const Color<float> &borderColor)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setBorderColor(sampler, borderColor);
+			return;
+		}
+
+		PixelProcessor::setBorderColor(sampler, borderColor);
+	}
+
+	// Forwards the maximum anisotropy of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setMaxAnisotropy(SamplerType type, int sampler, float maxAnisotropy)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setMaxAnisotropy(sampler, maxAnisotropy);
+			return;
+		}
+
+		PixelProcessor::setMaxAnisotropy(sampler, maxAnisotropy);
+	}
+
+	// Forwards the high precision filtering flag of 'sampler' to PixelProcessor when 'type' is
+	// SAMPLER_PIXEL, and to VertexProcessor for any other sampler type.
+	void Renderer::setHighPrecisionFiltering(SamplerType type, int sampler, bool highPrecisionFiltering)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setHighPrecisionFiltering(sampler, highPrecisionFiltering);
+			return;
+		}
+
+		PixelProcessor::setHighPrecisionFiltering(sampler, highPrecisionFiltering);
+	}
+
+	// Forwards the red channel swizzle of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setSwizzleR(SamplerType type, int sampler, SwizzleType swizzleR)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setSwizzleR(sampler, swizzleR);
+			return;
+		}
+
+		PixelProcessor::setSwizzleR(sampler, swizzleR);
+	}
+
+	// Forwards the green channel swizzle of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setSwizzleG(SamplerType type, int sampler, SwizzleType swizzleG)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setSwizzleG(sampler, swizzleG);
+			return;
+		}
+
+		PixelProcessor::setSwizzleG(sampler, swizzleG);
+	}
+
+	// Forwards the blue channel swizzle of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setSwizzleB(SamplerType type, int sampler, SwizzleType swizzleB)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setSwizzleB(sampler, swizzleB);
+			return;
+		}
+
+		PixelProcessor::setSwizzleB(sampler, swizzleB);
+	}
+
+	// Forwards the alpha channel swizzle of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setSwizzleA(SamplerType type, int sampler, SwizzleType swizzleA)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setSwizzleA(sampler, swizzleA);
+			return;
+		}
+
+		PixelProcessor::setSwizzleA(sampler, swizzleA);
+	}
+
+	// Forwards the compare function of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setCompareFunc(SamplerType type, int sampler, CompareFunc compFunc)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setCompareFunc(sampler, compFunc);
+			return;
+		}
+
+		PixelProcessor::setCompareFunc(sampler, compFunc);
+	}
+
+	// Forwards the base level of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setBaseLevel(SamplerType type, int sampler, int baseLevel)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setBaseLevel(sampler, baseLevel);
+			return;
+		}
+
+		PixelProcessor::setBaseLevel(sampler, baseLevel);
+	}
+
+	// Forwards the maximum level of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setMaxLevel(SamplerType type, int sampler, int maxLevel)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setMaxLevel(sampler, maxLevel);
+			return;
+		}
+
+		PixelProcessor::setMaxLevel(sampler, maxLevel);
+	}
+
+	// Forwards the minimum LOD of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setMinLod(SamplerType type, int sampler, float minLod)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setMinLod(sampler, minLod);
+			return;
+		}
+
+		PixelProcessor::setMinLod(sampler, minLod);
+	}
+
+	// Forwards the maximum LOD of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setMaxLod(SamplerType type, int sampler, float maxLod)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setMaxLod(sampler, maxLod);
+			return;
+		}
+
+		PixelProcessor::setMaxLod(sampler, maxLod);
+	}
+
+	// Forwards the sync-required flag of 'sampler' to PixelProcessor when 'type' is SAMPLER_PIXEL,
+	// and to VertexProcessor for any other sampler type.
+	void Renderer::setSyncRequired(SamplerType type, int sampler, bool syncRequired)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setSyncRequired(sampler, syncRequired);
+			return;
+		}
+
+		PixelProcessor::setSyncRequired(sampler, syncRequired);
+	}
+
+	// Passes the point sprite enable flag on to the context.
+	void Renderer::setPointSpriteEnable(bool pointSpriteEnable)
+	{
+		context->setPointSpriteEnable(pointSpriteEnable);
+	}
+
+	// Passes the point scale enable flag on to the context.
+	void Renderer::setPointScaleEnable(bool pointScaleEnable)
+	{
+		context->setPointScaleEnable(pointScaleEnable);
+	}
+
+	// Stores 'width' as the context's line width.
+	void Renderer::setLineWidth(float width)
+	{
+		context->lineWidth = width;
+	}
+
+	// Stores 'bias' as the context's depth bias.
+	void Renderer::setDepthBias(float bias)
+	{
+		context->depthBias = bias;
+	}
+
+	// Stores 'slopeBias' as the context's slope depth bias.
+	void Renderer::setSlopeDepthBias(float slopeBias)
+	{
+		context->slopeDepthBias = slopeBias;
+	}
+
+	// Stores the rasterizer discard flag in the context.
+	void Renderer::setRasterizerDiscard(bool rasterizerDiscard)
+	{
+		context->rasterizerDiscard = rasterizerDiscard;
+	}
+
+	// Stores 'shader' as the context's pixel shader, then applies the constants
+	// it defines through loadConstants().
+	void Renderer::setPixelShader(const PixelShader *shader)
+	{
+		context->pixelShader = shader;
+
+		loadConstants(shader);
+	}
+
+	// Stores 'shader' as the context's vertex shader, then applies the constants
+	// it defines through loadConstants().
+	void Renderer::setVertexShader(const VertexShader *shader)
+	{
+		context->vertexShader = shader;
+
+		loadConstants(shader);
+	}
+
+	// Sets 'count' consecutive float pixel shader constants starting at 'index';
+	// 'value' supplies four floats per constant. Every draw call's psDirtyConstF is raised
+	// to index + count when it is lower.
+	void Renderer::setPixelShaderConstantF(unsigned int index, const float value[4], unsigned int count)
+	{
+		const unsigned int dirtyEnd = index + count;
+
+		for(unsigned int d = 0; d < DRAW_COUNT; d++)
+		{
+			if(drawCall[d]->psDirtyConstF < dirtyEnd)
+			{
+				drawCall[d]->psDirtyConstF = dirtyEnd;
+			}
+		}
+
+		for(unsigned int c = 0; c < count; c++)
+		{
+			PixelProcessor::setFloatConstant(index + c, value + 4 * c);
+		}
+	}
+
+	// Sets 'count' consecutive integer pixel shader constants starting at 'index';
+	// 'value' supplies four integers per constant. Every draw call's psDirtyConstI is raised
+	// to index + count when it is lower.
+	void Renderer::setPixelShaderConstantI(unsigned int index, const int value[4], unsigned int count)
+	{
+		const unsigned int dirtyEnd = index + count;
+
+		for(unsigned int d = 0; d < DRAW_COUNT; d++)
+		{
+			if(drawCall[d]->psDirtyConstI < dirtyEnd)
+			{
+				drawCall[d]->psDirtyConstI = dirtyEnd;
+			}
+		}
+
+		for(unsigned int c = 0; c < count; c++)
+		{
+			PixelProcessor::setIntegerConstant(index + c, value + 4 * c);
+		}
+	}
+
+	// Sets 'count' consecutive boolean pixel shader constants starting at 'index';
+	// 'boolean' supplies one int per constant. Every draw call's psDirtyConstB is raised
+	// to index + count when it is lower.
+	void Renderer::setPixelShaderConstantB(unsigned int index, const int *boolean, unsigned int count)
+	{
+		const unsigned int dirtyEnd = index + count;
+
+		for(unsigned int d = 0; d < DRAW_COUNT; d++)
+		{
+			if(drawCall[d]->psDirtyConstB < dirtyEnd)
+			{
+				drawCall[d]->psDirtyConstB = dirtyEnd;
+			}
+		}
+
+		for(unsigned int c = 0; c < count; c++)
+		{
+			PixelProcessor::setBooleanConstant(index + c, boolean[c]);
+		}
+	}
+
+	// Sets 'count' consecutive float vertex shader constants starting at 'index';
+	// 'value' supplies four floats per constant. Every draw call's vsDirtyConstF is raised
+	// to index + count when it is lower.
+	void Renderer::setVertexShaderConstantF(unsigned int index, const float value[4], unsigned int count)
+	{
+		const unsigned int dirtyEnd = index + count;
+
+		for(unsigned int d = 0; d < DRAW_COUNT; d++)
+		{
+			if(drawCall[d]->vsDirtyConstF < dirtyEnd)
+			{
+				drawCall[d]->vsDirtyConstF = dirtyEnd;
+			}
+		}
+
+		for(unsigned int c = 0; c < count; c++)
+		{
+			VertexProcessor::setFloatConstant(index + c, value + 4 * c);
+		}
+	}
+
+	// Sets 'count' consecutive integer vertex shader constants starting at 'index';
+	// 'value' supplies four integers per constant. Every draw call's vsDirtyConstI is raised
+	// to index + count when it is lower.
+	void Renderer::setVertexShaderConstantI(unsigned int index, const int value[4], unsigned int count)
+	{
+		const unsigned int dirtyEnd = index + count;
+
+		for(unsigned int d = 0; d < DRAW_COUNT; d++)
+		{
+			if(drawCall[d]->vsDirtyConstI < dirtyEnd)
+			{
+				drawCall[d]->vsDirtyConstI = dirtyEnd;
+			}
+		}
+
+		for(unsigned int c = 0; c < count; c++)
+		{
+			VertexProcessor::setIntegerConstant(index + c, value + 4 * c);
+		}
+	}
+
+	// Sets 'count' consecutive boolean vertex shader constants starting at 'index';
+	// 'boolean' supplies one int per constant. Every draw call's vsDirtyConstB is raised
+	// to index + count when it is lower.
+	void Renderer::setVertexShaderConstantB(unsigned int index, const int *boolean, unsigned int count)
+	{
+		const unsigned int dirtyEnd = index + count;
+
+		for(unsigned int d = 0; d < DRAW_COUNT; d++)
+		{
+			if(drawCall[d]->vsDirtyConstB < dirtyEnd)
+			{
+				drawCall[d]->vsDirtyConstB = dirtyEnd;
+			}
+		}
+
+		for(unsigned int c = 0; c < count; c++)
+		{
+			VertexProcessor::setBooleanConstant(index + c, boolean[c]);
+		}
+	}
+
+ void Renderer::setModelMatrix(const Matrix &M, int i) // Forwards model matrix M for index i to VertexProcessor
+ {
+ VertexProcessor::setModelMatrix(M, i);
+ }
+
+ void Renderer::setViewMatrix(const Matrix &V) // Forwards V to VertexProcessor and flags the clip planes for update
+ {
+ VertexProcessor::setViewMatrix(V);
+ updateClipPlanes = true; // NOTE(review): presumably because clipPlane[] is derived from this matrix - confirm in updateClipper()
+ }
+
+ void Renderer::setBaseMatrix(const Matrix &B) // Forwards B to VertexProcessor and flags the clip planes for update
+ {
+ VertexProcessor::setBaseMatrix(B);
+ updateClipPlanes = true; // NOTE(review): presumably because clipPlane[] is derived from this matrix - confirm in updateClipper()
+ }
+
+ void Renderer::setProjectionMatrix(const Matrix &P) // Forwards P to VertexProcessor and flags the clip planes for update
+ {
+ VertexProcessor::setProjectionMatrix(P);
+ updateClipPlanes = true; // NOTE(review): presumably because clipPlane[] is derived from this matrix - confirm in updateClipper()
+ }
+
+ void Renderer::addQuery(Query *query) // Appends the pointer to the 'queries' list; no duplicate or null check
+ {
+ queries.push_back(query);
+ }
+
+ void Renderer::removeQuery(Query *query) // Drops the pointer from the 'queries' list; the Query itself is not deleted
+ {
+ queries.remove(query); // std::list::remove erases every element equal to 'query'
+ }
+
+ #if PERF_HUD
+ int Renderer::getThreadCount() // Returns the current threadCount (only built when PERF_HUD is set)
+ {
+ return threadCount;
+ }
+
+ int64_t Renderer::getVertexTime(int thread) // Returns vertexTime[thread]; 'thread' is not range-checked here
+ {
+ return vertexTime[thread];
+ }
+
+ int64_t Renderer::getSetupTime(int thread) // Returns setupTime[thread]; 'thread' is not range-checked here
+ {
+ return setupTime[thread];
+ }
+
+ int64_t Renderer::getPixelTime(int thread) // Returns pixelTime[thread]; 'thread' is not range-checked here
+ {
+ return pixelTime[thread];
+ }
+
+ void Renderer::resetTimers()
+ {
+ // Zero the vertex, setup and pixel timers of the first threadCount threads.
+ int t = threadCount;
+ while(t-- > 0)
+ {
+ vertexTime[t] = setupTime[t] = pixelTime[t] = 0;
+ }
+ }
+ #endif
+
+ void Renderer::setViewport(const Viewport &viewport) // Stores a copy of the viewport; no validation is done here
+ {
+ this->viewport = viewport;
+ }
+
+ void Renderer::setScissor(const Rect &scissor) // Stores a copy of the scissor rectangle; no validation is done here
+ {
+ this->scissor = scissor;
+ }
+
+ void Renderer::setClipFlags(int flags) // Stores 'flags' shifted into bits 8 and up of clipFlags
+ {
+ clipFlags = flags << 8; // Bottom 8 bits used by legacy frustum
+ }
+
+ void Renderer::setClipPlane(unsigned int index, const float plane[4]) // Stores user clip plane 'index' and flags the clip planes for update
+ {
+ if(index < MAX_CLIP_PLANES)
+ {
+ userPlane[index] = plane; // Four floats converted to a Plane on assignment
+ }
+ else ASSERT(false); // Out-of-range index: asserts; the plane is not stored
+
+ updateClipPlanes = true; // Set even when the index was rejected
+ }
+
+ void Renderer::updateConfiguration(bool initialUpdate) // Re-reads SwiftConfig settings when they changed (or on the initial update) and applies them
+ {
+ bool newConfiguration = swiftConfig->hasNewConfiguration();
+
+ if(newConfiguration || initialUpdate)
+ {
+ terminateThreads(); // Workers are stopped before any of the settings below are changed
+
+ SwiftConfig::Configuration configuration = {};
+ swiftConfig->getConfiguration(configuration);
+
+ precacheVertex = !newConfiguration && configuration.precache; // Within this branch, !newConfiguration means initialUpdate only
+ precacheSetup = !newConfiguration && configuration.precache;
+ precachePixel = !newConfiguration && configuration.precache;
+
+ VertexProcessor::setRoutineCacheSize(configuration.vertexRoutineCacheSize);
+ PixelProcessor::setRoutineCacheSize(configuration.pixelRoutineCacheSize);
+ SetupProcessor::setRoutineCacheSize(configuration.setupRoutineCacheSize);
+
+ switch(configuration.textureSampleQuality) // Unknown values fall back to anisotropic
+ {
+ case 0: Sampler::setFilterQuality(FILTER_POINT); break;
+ case 1: Sampler::setFilterQuality(FILTER_LINEAR); break;
+ case 2: Sampler::setFilterQuality(FILTER_ANISOTROPIC); break;
+ default: Sampler::setFilterQuality(FILTER_ANISOTROPIC); break;
+ }
+
+ switch(configuration.mipmapQuality) // Unknown values fall back to linear
+ {
+ case 0: Sampler::setMipmapQuality(MIPMAP_POINT); break;
+ case 1: Sampler::setMipmapQuality(MIPMAP_LINEAR); break;
+ default: Sampler::setMipmapQuality(MIPMAP_LINEAR); break;
+ }
+
+ setPerspectiveCorrection(configuration.perspectiveCorrection);
+
+ switch(configuration.transcendentalPrecision) // One setting drives all four precision globals; unknown values fall back to ACCURATE
+ {
+ case 0:
+ logPrecision = APPROXIMATE;
+ expPrecision = APPROXIMATE;
+ rcpPrecision = APPROXIMATE;
+ rsqPrecision = APPROXIMATE;
+ break;
+ case 1:
+ logPrecision = PARTIAL;
+ expPrecision = PARTIAL;
+ rcpPrecision = PARTIAL;
+ rsqPrecision = PARTIAL;
+ break;
+ case 2:
+ logPrecision = ACCURATE;
+ expPrecision = ACCURATE;
+ rcpPrecision = ACCURATE;
+ rsqPrecision = ACCURATE;
+ break;
+ case 3:
+ logPrecision = WHQL;
+ expPrecision = WHQL;
+ rcpPrecision = WHQL;
+ rsqPrecision = WHQL;
+ break;
+ case 4:
+ logPrecision = IEEE;
+ expPrecision = IEEE;
+ rcpPrecision = IEEE;
+ rsqPrecision = IEEE;
+ break;
+ default:
+ logPrecision = ACCURATE;
+ expPrecision = ACCURATE;
+ rcpPrecision = ACCURATE;
+ rsqPrecision = ACCURATE;
+ break;
+ }
+
+ switch(configuration.transparencyAntialiasing) // Unknown values fall back to none
+ {
+ case 0: transparencyAntialiasing = TRANSPARENCY_NONE; break;
+ case 1: transparencyAntialiasing = TRANSPARENCY_ALPHA_TO_COVERAGE; break;
+ default: transparencyAntialiasing = TRANSPARENCY_NONE; break;
+ }
+
+ switch(configuration.threadCount) // NOTE(review): the value is not clamped here although worker[] holds 16 entries - confirm it is limited elsewhere
+ {
+ case -1: threadCount = CPUID::coreCount(); break;
+ case 0: threadCount = CPUID::processAffinity(); break;
+ default: threadCount = configuration.threadCount; break;
+ }
+
+ CPUID::setEnableSSE4_1(configuration.enableSSE4_1);
+ CPUID::setEnableSSSE3(configuration.enableSSSE3);
+ CPUID::setEnableSSE3(configuration.enableSSE3);
+ CPUID::setEnableSSE2(configuration.enableSSE2);
+ CPUID::setEnableSSE(configuration.enableSSE);
+
+ for(int pass = 0; pass < 10; pass++) // Copies exactly 10 entries; assumes both arrays hold at least 10 - TODO confirm
+ {
+ optimization[pass] = configuration.optimization[pass];
+ }
+
+ forceWindowed = configuration.forceWindowed;
+ complementaryDepthBuffer = configuration.complementaryDepthBuffer;
+ postBlendSRGB = configuration.postBlendSRGB;
+ exactColorRounding = configuration.exactColorRounding;
+ forceClearRegisters = configuration.forceClearRegisters;
+
+ #ifndef NDEBUG
+ minPrimitives = configuration.minPrimitives;
+ maxPrimitives = configuration.maxPrimitives;
+ #endif
+ }
+
+ if(!initialUpdate && !worker[0]) // NOTE(review): a null worker[0] presumably means no threads are running - confirm against terminateThreads()
+ {
+ initializeThreads(); // Not done on the initial update
+ }
+ }
+}
diff --git a/src/Device/Renderer.hpp b/src/Device/Renderer.hpp
new file mode 100644
index 0000000..ce22866
--- /dev/null
+++ b/src/Device/Renderer.hpp
@@ -0,0 +1,508 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Renderer_hpp
+#define sw_Renderer_hpp
+
+#include "VertexProcessor.hpp"
+#include "PixelProcessor.hpp"
+#include "SetupProcessor.hpp"
+#include "Plane.hpp"
+#include "Blitter.hpp"
+#include "Common/MutexLock.hpp"
+#include "Common/Thread.hpp"
+#include "Main/Config.hpp"
+
+#include <list>
+
+namespace sw
+{
+ class Clipper; // Forward declarations: these types are only used through pointers or references in this header
+ struct DrawCall; // Defined at the end of this header
+ class PixelShader;
+ class VertexShader;
+ class SwiftConfig;
+ struct Task; // NOTE(review): distinct from the nested Renderer::Task declared below - confirm this namespace-scope declaration is still needed
+ class Resource;
+ struct Constants;
+
+ enum TranscendentalPrecision // Enumerator values 0..4 match the configuration values mapped in Renderer::updateConfiguration()
+ {
+ APPROXIMATE,
+ PARTIAL, // 2^-10
+ ACCURATE,
+ WHQL, // 2^-21
+ IEEE // 2^-23
+ };
+
+ extern TranscendentalPrecision logPrecision; // These four precision globals are assigned together in Renderer::updateConfiguration()
+ extern TranscendentalPrecision expPrecision;
+ extern TranscendentalPrecision rcpPrecision;
+ extern TranscendentalPrecision rsqPrecision;
+ extern bool perspectiveCorrection; // Defined outside this header
+
+ struct Conventions // Set of API convention flags; passed by value to the Renderer constructor
+ {
+ bool halfIntegerCoordinates;
+ bool symmetricNormalizedDepth;
+ bool booleanFaceRegister;
+ bool fullPixelPositionRegister;
+ bool leadingVertexFirst;
+ bool secondaryColor;
+ bool colorsDefaultToZero;
+ };
+
+ static const Conventions OpenGL = // Preset; 'static' in a header gives every including translation unit its own copy
+ {
+ true, // halfIntegerCoordinates
+ true, // symmetricNormalizedDepth
+ true, // booleanFaceRegister
+ true, // fullPixelPositionRegister
+ false, // leadingVertexFirst
+ false, // secondaryColor
+ true, // colorsDefaultToZero
+ };
+
+ static const Conventions Direct3D = // Preset; every flag is the opposite of the OpenGL preset
+ {
+ false, // halfIntegerCoordinates
+ false, // symmetricNormalizedDepth
+ false, // booleanFaceRegister
+ false, // fullPixelPositionRegister
+ true, // leadingVertexFirst
+ true, // secondaryColor
+ false, // colorsDefaultToZero
+ };
+
+ struct Query // Counter object registered with Renderer::addQuery() / removeQuery()
+ {
+ enum Type { FRAGMENTS_PASSED, TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN };
+
+ Query(Type type) : building(false), reference(0), data(0), type(type) // Starts idle with zeroed counters
+ {
+ }
+
+ void begin() // Marks the query as building and resets the accumulated data
+ {
+ building = true;
+ data = 0;
+ }
+
+ void end() // Stops building; 'data' keeps its value
+ {
+ building = false;
+ }
+
+ bool building; // True between begin() and end()
+ AtomicInt reference; // NOTE(review): presumably counts draw calls still referencing this query - confirm in Renderer.cpp
+ AtomicInt data; // Accumulated result; reset by begin()
+
+ const Type type; // Fixed at construction
+ };
+
+ struct DrawData // Data block of a draw call; DrawCall holds a pointer to one (DrawCall::data)
+ {
+ const Constants *constants;
+
+ const void *input[MAX_VERTEX_INPUTS]; // One pointer per vertex input
+ unsigned int stride[MAX_VERTEX_INPUTS]; // One stride per vertex input
+ Texture mipmap[TOTAL_IMAGE_UNITS]; // One Texture (see Sampler.hpp) per image unit
+ const void *indices;
+
+ struct VS // Vertex shader constants and buffer pointers
+ {
+ float4 c[VERTEX_UNIFORM_VECTORS + 1]; // One extra for indices out of range, c[VERTEX_UNIFORM_VECTORS] = {0, 0, 0, 0}
+ byte* u[MAX_UNIFORM_BUFFER_BINDINGS];
+ byte* t[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS];
+ unsigned int reg[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS]; // Offset used when reading from registers, in components
+ unsigned int row[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS]; // Number of rows to read
+ unsigned int col[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS]; // Number of columns to read
+ unsigned int str[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS]; // Number of components between each varying in output buffer
+ int4 i[16]; // Integer constants
+ bool b[16]; // Boolean constants
+ };
+
+ struct PS // Pixel shader constants and buffer pointers
+ {
+ word4 cW[8][4];
+ float4 c[FRAGMENT_UNIFORM_VECTORS];
+ byte* u[MAX_UNIFORM_BUFFER_BINDINGS];
+ int4 i[16]; // Integer constants
+ bool b[16]; // Boolean constants
+ };
+
+ union // vs and ff share the same storage
+ {
+ VS vs;
+ VertexProcessor::FixedFunction ff;
+ };
+
+ PS ps;
+
+ int instanceID;
+
+ VertexProcessor::PointSprite point;
+ float lineWidth;
+
+ PixelProcessor::Stencil stencil[2]; // clockwise, counterclockwise
+ PixelProcessor::Stencil stencilCCW; // NOTE(review): looks redundant with stencil[1] - confirm which one is actually read
+ PixelProcessor::Fog fog;
+ PixelProcessor::Factor factor;
+ unsigned int occlusion[16]; // Number of pixels passing depth test
+
+ #if PERF_PROFILE
+ int64_t cycles[PERF_TIMERS][16];
+ #endif
+
+ TextureStage::Uniforms textureStage[8];
+
+ float4 Wx16;
+ float4 Hx16;
+ float4 X0x16;
+ float4 Y0x16;
+ float4 XXXX;
+ float4 YYYY;
+ float4 halfPixelX;
+ float4 halfPixelY;
+ float viewportHeight;
+ float slopeDepthBias;
+ float depthRange;
+ float depthNear;
+ Plane clipPlane[6];
+
+ unsigned int *colorBuffer[RENDERTARGETS]; // One entry per render target, as are the pitch and slice arrays below
+ int colorPitchB[RENDERTARGETS];
+ int colorSliceB[RENDERTARGETS];
+ float *depthBuffer;
+ int depthPitchB;
+ int depthSliceB;
+ unsigned char *stencilBuffer;
+ int stencilPitchB;
+ int stencilSliceB;
+
+ int scissorX0;
+ int scissorX1;
+ int scissorY0;
+ int scissorY1;
+
+ float4 a2c0;
+ float4 a2c1;
+ float4 a2c2;
+ float4 a2c3;
+ };
+
+ struct Viewport // Viewport rectangle plus depth range, as stored by Renderer::setViewport()
+ {
+ float x0; // Origin
+ float y0;
+ float width; // Extent
+ float height;
+ float minZ; // Depth range
+ float maxZ;
+ };
+
+ class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor // Inherits all three processing stages and adds draw-call and task bookkeeping
+ {
+ struct Task // Unit of work handed to a worker thread
+ {
+ enum Type
+ {
+ PRIMITIVES,
+ PIXELS,
+
+ RESUME,
+ SUSPEND
+ };
+
+ AtomicInt type; // Holds a Type value
+ AtomicInt primitiveUnit;
+ AtomicInt pixelCluster;
+ };
+
+ struct PrimitiveProgress
+ {
+ void init() // Resets every counter to 0
+ {
+ drawCall = 0;
+ firstPrimitive = 0;
+ primitiveCount = 0;
+ visible = 0;
+ references = 0;
+ }
+
+ AtomicInt drawCall;
+ AtomicInt firstPrimitive;
+ AtomicInt primitiveCount;
+ AtomicInt visible;
+ AtomicInt references;
+ };
+
+ struct PixelProgress
+ {
+ void init() // Resets the counters and clears the executing flag
+ {
+ drawCall = 0;
+ processedPrimitives = 0;
+ executing = false;
+ }
+
+ AtomicInt drawCall;
+ AtomicInt processedPrimitives;
+ AtomicInt executing; // Used as a boolean flag
+ };
+
+ public:
+ Renderer(Context *context, Conventions conventions, bool exactColorRounding);
+
+ virtual ~Renderer();
+
+ void *operator new(size_t size); // Class-specific allocation; defined outside this header
+ void operator delete(void * mem);
+
+ void draw(DrawType drawType, unsigned int indexOffset, unsigned int count, bool update = true);
+
+ void clear(void *value, Format format, Surface *dest, const Rect &rect, unsigned int rgbaMask);
+ void blit(Surface *source, const SliceRectF &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil = false, bool sRGBconversion = true);
+ void blit3D(Surface *source, Surface *dest);
+
+ void setIndexBuffer(Resource *indexBuffer);
+
+ void setMultiSampleMask(unsigned int mask);
+ void setTransparencyAntialiasing(TransparencyAntialiasing transparencyAntialiasing);
+
+ void setTextureResource(unsigned int sampler, Resource *resource);
+ void setTextureLevel(unsigned int sampler, unsigned int face, unsigned int level, Surface *surface, TextureType type);
+
+ void setTextureFilter(SamplerType type, int sampler, FilterType textureFilter); // Sampler state setters: 'type' selects pixel or vertex samplers, 'sampler' the index
+ void setMipmapFilter(SamplerType type, int sampler, MipmapType mipmapFilter);
+ void setGatherEnable(SamplerType type, int sampler, bool enable);
+ void setAddressingModeU(SamplerType type, int sampler, AddressingMode addressingMode);
+ void setAddressingModeV(SamplerType type, int sampler, AddressingMode addressingMode);
+ void setAddressingModeW(SamplerType type, int sampler, AddressingMode addressingMode);
+ void setReadSRGB(SamplerType type, int sampler, bool sRGB);
+ void setMipmapLOD(SamplerType type, int sampler, float bias);
+ void setBorderColor(SamplerType type, int sampler, const Color<float> &borderColor);
+ void setMaxAnisotropy(SamplerType type, int sampler, float maxAnisotropy);
+ void setHighPrecisionFiltering(SamplerType type, int sampler, bool highPrecisionFiltering);
+ void setSwizzleR(SamplerType type, int sampler, SwizzleType swizzleR);
+ void setSwizzleG(SamplerType type, int sampler, SwizzleType swizzleG);
+ void setSwizzleB(SamplerType type, int sampler, SwizzleType swizzleB);
+ void setSwizzleA(SamplerType type, int sampler, SwizzleType swizzleA);
+ void setCompareFunc(SamplerType type, int sampler, CompareFunc compare);
+ void setBaseLevel(SamplerType type, int sampler, int baseLevel);
+ void setMaxLevel(SamplerType type, int sampler, int maxLevel);
+ void setMinLod(SamplerType type, int sampler, float minLod);
+ void setMaxLod(SamplerType type, int sampler, float maxLod);
+ void setSyncRequired(SamplerType type, int sampler, bool syncRequired);
+
+ void setPointSpriteEnable(bool pointSpriteEnable);
+ void setPointScaleEnable(bool pointScaleEnable);
+ void setLineWidth(float width);
+
+ void setDepthBias(float bias);
+ void setSlopeDepthBias(float slopeBias);
+
+ void setRasterizerDiscard(bool rasterizerDiscard);
+
+ // Programmable pipelines
+ void setPixelShader(const PixelShader *shader);
+ void setVertexShader(const VertexShader *shader);
+
+ void setPixelShaderConstantF(unsigned int index, const float value[4], unsigned int count = 1); // 'value' supplies 4 * count floats
+ void setPixelShaderConstantI(unsigned int index, const int value[4], unsigned int count = 1); // 'value' supplies 4 * count ints
+ void setPixelShaderConstantB(unsigned int index, const int *boolean, unsigned int count = 1); // 'boolean' supplies count ints
+
+ void setVertexShaderConstantF(unsigned int index, const float value[4], unsigned int count = 1); // 'value' supplies 4 * count floats
+ void setVertexShaderConstantI(unsigned int index, const int value[4], unsigned int count = 1); // 'value' supplies 4 * count ints
+ void setVertexShaderConstantB(unsigned int index, const int *boolean, unsigned int count = 1); // 'boolean' supplies count ints
+
+ // Viewport & Clipper
+ void setViewport(const Viewport &viewport);
+ void setScissor(const Rect &scissor);
+ void setClipFlags(int flags);
+ void setClipPlane(unsigned int index, const float plane[4]);
+
+ // Partial transform
+ void setModelMatrix(const Matrix &M, int i = 0);
+ void setViewMatrix(const Matrix &V);
+ void setBaseMatrix(const Matrix &B);
+ void setProjectionMatrix(const Matrix &P);
+
+ void addQuery(Query *query);
+ void removeQuery(Query *query);
+
+ void synchronize();
+
+ #if PERF_HUD
+ // Performance timers
+ int getThreadCount();
+ int64_t getVertexTime(int thread);
+ int64_t getSetupTime(int thread);
+ int64_t getPixelTime(int thread);
+ void resetTimers();
+ #endif
+
+ static int getClusterCount() { return clusterCount; }
+
+ private:
+ static void threadFunction(void *parameters);
+ void threadLoop(int threadIndex);
+ void taskLoop(int threadIndex);
+ void findAvailableTasks();
+ void scheduleTask(int threadIndex);
+ void executeTask(int threadIndex);
+ void finishRendering(Task &pixelTask);
+
+ void processPrimitiveVertices(int unit, unsigned int start, unsigned int count, unsigned int loop, int thread);
+
+ int setupSolidTriangles(int batch, int count); // The setup* functions below match the DrawCall::setupPrimitives member-pointer signature
+ int setupWireframeTriangle(int batch, int count);
+ int setupVertexTriangle(int batch, int count);
+ int setupLines(int batch, int count);
+ int setupPoints(int batch, int count);
+
+ bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
+ bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
+
+ bool isReadWriteTexture(int sampler);
+ void updateClipper();
+ void updateConfiguration(bool initialUpdate = false);
+ void initializeThreads();
+ void terminateThreads();
+
+ void loadConstants(const VertexShader *vertexShader);
+ void loadConstants(const PixelShader *pixelShader);
+
+ Context *context;
+ Clipper *clipper;
+ Blitter *blitter;
+ Viewport viewport;
+ Rect scissor;
+ int clipFlags;
+
+ Triangle *triangleBatch[16];
+ Primitive *primitiveBatch[16];
+
+ // User-defined clipping planes
+ Plane userPlane[MAX_CLIP_PLANES];
+ Plane clipPlane[MAX_CLIP_PLANES]; // Transformed to clip space
+ bool updateClipPlanes; // Set by setClipPlane() and the view/base/projection matrix setters
+
+ AtomicInt exitThreads;
+ AtomicInt threadsAwake;
+ Thread *worker[16];
+ Event *resume[16]; // Events for resuming threads
+ Event *suspend[16]; // Events for suspending threads
+ Event *resumeApp; // Event for resuming the application thread
+
+ PrimitiveProgress primitiveProgress[16];
+ PixelProgress pixelProgress[16];
+ Task task[16]; // Current tasks for threads
+
+ enum {
+ DRAW_COUNT = 16, // Number of draw calls buffered (must be power of 2)
+ DRAW_COUNT_BITS = DRAW_COUNT - 1, // Despite the name this is the mask 0xF, usable to wrap an index modulo DRAW_COUNT
+ };
+ DrawCall *drawCall[DRAW_COUNT];
+ DrawCall *drawList[DRAW_COUNT];
+
+ AtomicInt currentDraw;
+ AtomicInt nextDraw;
+
+ enum {
+ TASK_COUNT = 32, // Size of the task queue (must be power of 2)
+ TASK_COUNT_BITS = TASK_COUNT - 1, // Despite the name this is the mask 0x1F, usable to wrap an index modulo TASK_COUNT
+ };
+ Task taskQueue[TASK_COUNT];
+ AtomicInt qHead;
+ AtomicInt qSize;
+
+ static AtomicInt unitCount;
+ static AtomicInt clusterCount; // Exposed through getClusterCount()
+
+ MutexLock schedulerMutex;
+
+ #if PERF_HUD
+ int64_t vertexTime[16];
+ int64_t setupTime[16];
+ int64_t pixelTime[16];
+ #endif
+
+ VertexTask *vertexTask[16];
+
+ SwiftConfig *swiftConfig;
+
+ std::list<Query*> queries; // Maintained by addQuery() / removeQuery()
+ Resource *sync;
+
+ VertexProcessor::State vertexState;
+ SetupProcessor::State setupState;
+ PixelProcessor::State pixelState;
+
+ Routine *vertexRoutine;
+ Routine *setupRoutine;
+ Routine *pixelRoutine;
+ };
+
+ struct DrawCall // One buffered draw; Renderer keeps DRAW_COUNT of these in drawCall[]
+ {
+ DrawCall();
+
+ ~DrawCall();
+
+ AtomicInt drawType;
+ AtomicInt batchSize;
+
+ Routine *vertexRoutine;
+ Routine *setupRoutine;
+ Routine *pixelRoutine;
+
+ VertexProcessor::RoutinePointer vertexPointer;
+ SetupProcessor::RoutinePointer setupPointer;
+ PixelProcessor::RoutinePointer pixelPointer;
+
+ int (Renderer::*setupPrimitives)(int batch, int count); // Points at one of Renderer's setup* member functions
+ SetupProcessor::State setupState;
+
+ Resource *vertexStream[MAX_VERTEX_INPUTS];
+ Resource *indexBuffer;
+ Surface *renderTarget[RENDERTARGETS];
+ Surface *depthBuffer;
+ Surface *stencilBuffer;
+ Resource *texture[TOTAL_IMAGE_UNITS];
+ Resource* pUniformBuffers[MAX_UNIFORM_BUFFER_BINDINGS];
+ Resource* vUniformBuffers[MAX_UNIFORM_BUFFER_BINDINGS];
+ Resource* transformFeedbackBuffers[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS];
+
+ unsigned int vsDirtyConstF; // Raised to index + count by Renderer::setVertexShaderConstantF()
+ unsigned int vsDirtyConstI; // Raised to index + count by Renderer::setVertexShaderConstantI()
+ unsigned int vsDirtyConstB; // Raised to index + count by Renderer::setVertexShaderConstantB()
+
+ unsigned int psDirtyConstF; // Raised to index + count by Renderer::setPixelShaderConstantF()
+ unsigned int psDirtyConstI; // Raised to index + count by Renderer::setPixelShaderConstantI()
+ unsigned int psDirtyConstB; // Raised to index + count by Renderer::setPixelShaderConstantB()
+
+ std::list<Query*> *queries;
+
+ AtomicInt clipFlags;
+
+ AtomicInt primitive; // Current primitive to enter pipeline
+ AtomicInt count; // Number of primitives to render
+ AtomicInt references; // Remaining references to this draw call, 0 when done drawing, -1 when resources unlocked and slot is free
+
+ DrawData *data;
+ };
+}
+
+#endif // sw_Renderer_hpp
diff --git a/src/Device/RoutineCache.hpp b/src/Device/RoutineCache.hpp
new file mode 100644
index 0000000..74dd842
--- /dev/null
+++ b/src/Device/RoutineCache.hpp
@@ -0,0 +1,49 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_RoutineCache_hpp
+#define sw_RoutineCache_hpp
+
+#include "LRUCache.hpp"
+
+#include "Reactor/Reactor.hpp"
+
+namespace sw
+{
+ template<class State>
+ class RoutineCache : public LRUCache<State, Routine> // LRU cache of Routine objects keyed by State
+ {
+ public:
+ RoutineCache(int n, const char *precache = 0); // n is forwarded to LRUCache; 'precache' is only stored
+ ~RoutineCache();
+
+ private:
+ const char *precache; // Stored pointer, not copied; not used anywhere in this header
+ #if defined(_WIN32)
+ HMODULE precacheDLL; // NOTE(review): never initialized or used in this header - confirm whether it can be removed
+ #endif
+ };
+
+ template<class State>
+ RoutineCache<State>::RoutineCache(int n, const char *precache) : LRUCache<State, Routine>(n), precache(precache) // NOTE(review): precacheDLL (_WIN32 only) is left uninitialized
+ {
+ }
+
+ template<class State>
+ RoutineCache<State>::~RoutineCache() // Empty: performs no cleanup of its own
+ {
+ }
+}
+
+#endif // sw_RoutineCache_hpp
diff --git a/src/Device/Sampler.cpp b/src/Device/Sampler.cpp
new file mode 100644
index 0000000..efac4c6
--- /dev/null
+++ b/src/Device/Sampler.cpp
@@ -0,0 +1,514 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Sampler.hpp"
+
+#include "Context.hpp"
+#include "Surface.hpp"
+#include "Shader/PixelRoutine.hpp"
+#include "Common/Debug.hpp"
+
+#include <memory.h>
+#include <string.h>
+
+namespace sw
+{
+ FilterType Sampler::maximumTextureFilterQuality = FILTER_LINEAR; // Upper bound applied by setTextureFilter(); changed through setFilterQuality()
+ MipmapType Sampler::maximumMipmapFilterQuality = MIPMAP_POINT; // Upper bound applied by setMipmapFilter(); changed through setMipmapQuality()
+
+ Sampler::State::State() // Zero-fills the whole object, padding bytes included
+ {
+ memset(this, 0, sizeof(State));
+ }
+
+ Sampler::Sampler() // Starts with no texture bound (TEXTURE_NULL) and default sampling state
+ {
+ // FIXME: Mipmap::init
+ static const unsigned int zero = 0x00FF00FF; // Placeholder texel every buffer points at initially; note the value is not 0 despite the name
+
+ for(int level = 0; level < MIPMAP_LEVELS; level++)
+ {
+ Mipmap &mipmap = texture.mipmap[level];
+
+ memset(&mipmap, 0, sizeof(Mipmap));
+
+ for(int face = 0; face < 6; face++)
+ {
+ mipmap.buffer[face] = &zero; // All levels share one pointer, so mipmapFilter() reports MIPMAP_NONE until a level is set
+ }
+ }
+
+ externalTextureFormat = FORMAT_NULL;
+ internalTextureFormat = FORMAT_NULL;
+ textureType = TEXTURE_NULL;
+
+ textureFilter = FILTER_LINEAR;
+ addressingModeU = ADDRESSING_WRAP;
+ addressingModeV = ADDRESSING_WRAP;
+ addressingModeW = ADDRESSING_WRAP;
+ mipmapFilterState = MIPMAP_NONE;
+ sRGB = false;
+ gather = false;
+ highPrecisionFiltering = false;
+ border = 0;
+
+ swizzleR = SWIZZLE_RED;
+ swizzleG = SWIZZLE_GREEN;
+ swizzleB = SWIZZLE_BLUE;
+ swizzleA = SWIZZLE_ALPHA;
+
+ compare = COMPARE_BYPASS;
+
+ texture.LOD = 0.0f;
+ exp2LOD = 1.0f; // Kept equal to exp2(texture.LOD); see setMipmapLOD()
+
+ texture.baseLevel = 0;
+ texture.maxLevel = 1000;
+ texture.maxLod = MAX_TEXTURE_LOD;
+ texture.minLod = 0; // NOTE(review): syncRequired and texture.maxAnisotropy are not assigned in this constructor - confirm they are set before being read
+ }
+
+ Sampler::~Sampler() // Empty: performs no explicit cleanup
+ {
+ }
+
+ Sampler::State Sampler::samplerState() const
+ {
+ State state; // Zero-filled by State's constructor
+
+ // With no texture bound, report the all-zero state.
+ if(textureType == TEXTURE_NULL) return state;
+
+ state.textureType = textureType;
+ state.textureFormat = internalTextureFormat;
+ state.sRGB = (sRGB && Surface::isSRGBreadable(externalTextureFormat)) || Surface::isSRGBformat(internalTextureFormat);
+ state.highPrecisionFiltering = highPrecisionFiltering;
+ state.swizzleR = swizzleR;
+ state.swizzleG = swizzleG;
+ state.swizzleB = swizzleB;
+ state.swizzleA = swizzleA;
+ state.textureFilter = getTextureFilter();
+ state.mipmapFilter = mipmapFilter();
+ state.addressingModeU = getAddressingModeU();
+ state.addressingModeV = getAddressingModeV();
+ state.addressingModeW = getAddressingModeW();
+ state.compare = getCompareFunc();
+
+ #if PERF_PROFILE
+ state.compressedFormat = Surface::isCompressed(externalTextureFormat);
+ #endif
+
+ return state;
+ }
+
+ void Sampler::setTextureLevel(int face, int level, Surface *surface, TextureType type) // Binds 'surface' as the given face/level; a null surface only updates textureType
+ {
+ if(surface)
+ {
+ Mipmap &mipmap = texture.mipmap[level]; // 'level' is not range-checked here
+
+ border = surface->getBorder();
+ mipmap.buffer[face] = surface->lockInternal(-border, -border, 0, LOCK_UNLOCKED, PRIVATE); // Buffer pointer requested at (-border, -border, 0)
+
+ if(face == 0) // Formats and dimensions are taken from face 0 only
+ {
+ externalTextureFormat = surface->getExternalFormat();
+ internalTextureFormat = surface->getInternalFormat();
+
+ int width = surface->getWidth();
+ int height = surface->getHeight();
+ int depth = surface->getDepth();
+ int pitchP = surface->getInternalPitchP();
+ int sliceP = surface->getInternalSliceP();
+
+ if(level == 0) // The LOD-scaled sizes use the level 0 dimensions multiplied by exp2LOD
+ {
+ texture.widthHeightLOD[0] = width * exp2LOD;
+ texture.widthHeightLOD[1] = width * exp2LOD;
+ texture.widthHeightLOD[2] = height * exp2LOD;
+ texture.widthHeightLOD[3] = height * exp2LOD;
+
+ texture.widthLOD[0] = width * exp2LOD;
+ texture.widthLOD[1] = width * exp2LOD;
+ texture.widthLOD[2] = width * exp2LOD;
+ texture.widthLOD[3] = width * exp2LOD;
+
+ texture.heightLOD[0] = height * exp2LOD;
+ texture.heightLOD[1] = height * exp2LOD;
+ texture.heightLOD[2] = height * exp2LOD;
+ texture.heightLOD[3] = height * exp2LOD;
+
+ texture.depthLOD[0] = depth * exp2LOD;
+ texture.depthLOD[1] = depth * exp2LOD;
+ texture.depthLOD[2] = depth * exp2LOD;
+ texture.depthLOD[3] = depth * exp2LOD;
+ }
+
+ if(Surface::isFloatFormat(internalTextureFormat)) // fWidth/fHeight/fDepth are only filled in for floating-point formats
+ {
+ mipmap.fWidth[0] = (float)width / 65536.0f;
+ mipmap.fWidth[1] = (float)width / 65536.0f;
+ mipmap.fWidth[2] = (float)width / 65536.0f;
+ mipmap.fWidth[3] = (float)width / 65536.0f;
+
+ mipmap.fHeight[0] = (float)height / 65536.0f;
+ mipmap.fHeight[1] = (float)height / 65536.0f;
+ mipmap.fHeight[2] = (float)height / 65536.0f;
+ mipmap.fHeight[3] = (float)height / 65536.0f;
+
+ mipmap.fDepth[0] = (float)depth / 65536.0f;
+ mipmap.fDepth[1] = (float)depth / 65536.0f;
+ mipmap.fDepth[2] = (float)depth / 65536.0f;
+ mipmap.fDepth[3] = (float)depth / 65536.0f;
+ }
+
+ short halfTexelU = 0x8000 / width; // Integer division; assumes width, height and depth are non-zero
+ short halfTexelV = 0x8000 / height;
+ short halfTexelW = 0x8000 / depth;
+
+ mipmap.uHalf[0] = halfTexelU;
+ mipmap.uHalf[1] = halfTexelU;
+ mipmap.uHalf[2] = halfTexelU;
+ mipmap.uHalf[3] = halfTexelU;
+
+ mipmap.vHalf[0] = halfTexelV;
+ mipmap.vHalf[1] = halfTexelV;
+ mipmap.vHalf[2] = halfTexelV;
+ mipmap.vHalf[3] = halfTexelV;
+
+ mipmap.wHalf[0] = halfTexelW;
+ mipmap.wHalf[1] = halfTexelW;
+ mipmap.wHalf[2] = halfTexelW;
+ mipmap.wHalf[3] = halfTexelW;
+
+ mipmap.width[0] = width; // int narrowed to short (Mipmap::width is short[4])
+ mipmap.width[1] = width;
+ mipmap.width[2] = width;
+ mipmap.width[3] = width;
+
+ mipmap.height[0] = height;
+ mipmap.height[1] = height;
+ mipmap.height[2] = height;
+ mipmap.height[3] = height;
+
+ mipmap.depth[0] = depth;
+ mipmap.depth[1] = depth;
+ mipmap.depth[2] = depth;
+ mipmap.depth[3] = depth;
+
+ mipmap.onePitchP[0] = 1; // Interleaved as {1, pitchP, 1, pitchP}
+ mipmap.onePitchP[1] = pitchP;
+ mipmap.onePitchP[2] = 1;
+ mipmap.onePitchP[3] = pitchP;
+
+ mipmap.pitchP[0] = pitchP;
+ mipmap.pitchP[1] = pitchP;
+ mipmap.pitchP[2] = pitchP;
+ mipmap.pitchP[3] = pitchP;
+
+ mipmap.sliceP[0] = sliceP;
+ mipmap.sliceP[1] = sliceP;
+ mipmap.sliceP[2] = sliceP;
+ mipmap.sliceP[3] = sliceP;
+
+ if(internalTextureFormat == FORMAT_YV12_BT601 ||
+ internalTextureFormat == FORMAT_YV12_BT709 ||
+ internalTextureFormat == FORMAT_YV12_JFIF)
+ {
+ unsigned int YStride = pitchP;
+ unsigned int YSize = YStride * height;
+ unsigned int CStride = align<16>(YStride / 2); // Half the Y stride, rounded by align<16>
+ unsigned int CSize = CStride * height / 2;
+
+ mipmap.buffer[1] = (byte*)mipmap.buffer[0] + YSize; // Starts YSize bytes after buffer[0]
+ mipmap.buffer[2] = (byte*)mipmap.buffer[1] + CSize; // Starts CSize bytes after buffer[1]
+
+ texture.mipmap[1].width[0] = width / 2; // Level 1 is overwritten with the half-size dimensions, whatever 'level' is
+ texture.mipmap[1].width[1] = width / 2;
+ texture.mipmap[1].width[2] = width / 2;
+ texture.mipmap[1].width[3] = width / 2;
+ texture.mipmap[1].height[0] = height / 2;
+ texture.mipmap[1].height[1] = height / 2;
+ texture.mipmap[1].height[2] = height / 2;
+ texture.mipmap[1].height[3] = height / 2;
+ texture.mipmap[1].onePitchP[0] = 1;
+ texture.mipmap[1].onePitchP[1] = CStride;
+ texture.mipmap[1].onePitchP[2] = 1;
+ texture.mipmap[1].onePitchP[3] = CStride;
+ }
+ }
+ }
+
+ textureType = type; // Updated even when 'surface' is null
+ }
+
+ void Sampler::setTextureFilter(FilterType textureFilter) // Capped at the maximum quality in effect at call time; a later setFilterQuality() does not re-cap the stored value
+ {
+ this->textureFilter = (FilterType)min(textureFilter, maximumTextureFilterQuality);
+ }
+
+ void Sampler::setMipmapFilter(MipmapType mipmapFilter) // Capped at the maximum quality in effect at call time; a later setMipmapQuality() does not re-cap the stored value
+ {
+ mipmapFilterState = (MipmapType)min(mipmapFilter, maximumMipmapFilterQuality);
+ }
+
+ void Sampler::setGatherEnable(bool enable) // Stores the gather flag read by getTextureFilter()
+ {
+ gather = enable;
+ }
+
+ void Sampler::setAddressingModeU(AddressingMode addressingMode) // Stored as-is; getAddressingModeU() overrides it for cube textures
+ {
+ addressingModeU = addressingMode;
+ }
+
+ void Sampler::setAddressingModeV(AddressingMode addressingMode) // Stored as-is; getAddressingModeV() overrides it for cube textures
+ {
+ addressingModeV = addressingMode;
+ }
+
+ void Sampler::setAddressingModeW(AddressingMode addressingMode) // Stored as-is; getAddressingModeW() overrides it for 2D, 2D array, cube and rectangle textures
+ {
+ addressingModeW = addressingMode;
+ }
+
+ void Sampler::setReadSRGB(bool sRGB) // Stores the sRGB read request; samplerState() also checks whether the format is sRGB-readable
+ {
+ this->sRGB = sRGB;
+ }
+
+ void Sampler::setBorderColor(const Color<float> &borderColor) // Stores the border color as 16-bit fixed point (borderColor4) and as float (borderColorF), each replicated four times
+ {
+ // FIXME: Compact into generic function. Components are clamped to [0, 1] first; out-of-range values would otherwise wrap when narrowed to 16 bits.
+ short r = iround(0xFFFF * clamp(borderColor.r, 0.0f, 1.0f));
+ short g = iround(0xFFFF * clamp(borderColor.g, 0.0f, 1.0f));
+ short b = iround(0xFFFF * clamp(borderColor.b, 0.0f, 1.0f));
+ short a = iround(0xFFFF * clamp(borderColor.a, 0.0f, 1.0f));
+
+ texture.borderColor4[0][0] = texture.borderColor4[0][1] = texture.borderColor4[0][2] = texture.borderColor4[0][3] = r;
+ texture.borderColor4[1][0] = texture.borderColor4[1][1] = texture.borderColor4[1][2] = texture.borderColor4[1][3] = g;
+ texture.borderColor4[2][0] = texture.borderColor4[2][1] = texture.borderColor4[2][2] = texture.borderColor4[2][3] = b;
+ texture.borderColor4[3][0] = texture.borderColor4[3][1] = texture.borderColor4[3][2] = texture.borderColor4[3][3] = a;
+
+ texture.borderColorF[0][0] = texture.borderColorF[0][1] = texture.borderColorF[0][2] = texture.borderColorF[0][3] = borderColor.r; // The float copy is deliberately left unclamped
+ texture.borderColorF[1][0] = texture.borderColorF[1][1] = texture.borderColorF[1][2] = texture.borderColorF[1][3] = borderColor.g;
+ texture.borderColorF[2][0] = texture.borderColorF[2][1] = texture.borderColorF[2][2] = texture.borderColorF[2][3] = borderColor.b;
+ texture.borderColorF[3][0] = texture.borderColorF[3][1] = texture.borderColorF[3][2] = texture.borderColorF[3][3] = borderColor.a;
+ }
+
+ void Sampler::setMaxAnisotropy(float maxAnisotropy) // Stored unvalidated; exactly 1.0f makes getTextureFilter() cap the filter at FILTER_LINEAR
+ {
+ texture.maxAnisotropy = maxAnisotropy;
+ }
+
+ void Sampler::setHighPrecisionFiltering(bool highPrecisionFiltering) // Stores the flag copied into State by samplerState()
+ {
+ this->highPrecisionFiltering = highPrecisionFiltering;
+ }
+
+ void Sampler::setSwizzleR(SwizzleType swizzleR) // Stores the swizzle for the red output; default is SWIZZLE_RED
+ {
+ this->swizzleR = swizzleR;
+ }
+
+ void Sampler::setSwizzleG(SwizzleType swizzleG) // Stores the swizzle for the green output; default is SWIZZLE_GREEN
+ {
+ this->swizzleG = swizzleG;
+ }
+
+ void Sampler::setSwizzleB(SwizzleType swizzleB) // Stores the swizzle for the blue output; default is SWIZZLE_BLUE
+ {
+ this->swizzleB = swizzleB;
+ }
+
+ void Sampler::setSwizzleA(SwizzleType swizzleA) // Stores the swizzle for the alpha output; default is SWIZZLE_ALPHA
+ {
+ this->swizzleA = swizzleA;
+ }
+
+ void Sampler::setCompareFunc(CompareFunc compare) // Stored as-is; getCompareFunc() may override it
+ {
+ this->compare = compare;
+ }
+
+ void Sampler::setBaseLevel(int baseLevel) // Stored unvalidated in texture.baseLevel
+ {
+ texture.baseLevel = baseLevel;
+ }
+
+ void Sampler::setMaxLevel(int maxLevel) // Stored unvalidated in texture.maxLevel; the constructor default is 1000
+ {
+ texture.maxLevel = maxLevel;
+ }
+
+ void Sampler::setMinLod(float minLod) // Clamped to [0, MAX_TEXTURE_LOD] before being stored
+ {
+ texture.minLod = clamp(minLod, 0.0f, (float)(MAX_TEXTURE_LOD));
+ }
+
+ void Sampler::setMaxLod(float maxLod) // Clamped to [0, MAX_TEXTURE_LOD] before being stored; not compared against minLod
+ {
+ texture.maxLod = clamp(maxLod, 0.0f, (float)(MAX_TEXTURE_LOD));
+ }
+
+ void Sampler::setFilterQuality(FilterType maximumFilterQuality) // Sets the class-wide cap used by later setTextureFilter() calls; already stored filters are unchanged
+ {
+ Sampler::maximumTextureFilterQuality = maximumFilterQuality;
+ }
+
+ void Sampler::setMipmapQuality(MipmapType maximumFilterQuality) // Sets the class-wide cap used by later setMipmapFilter() calls; already stored filters are unchanged
+ {
+ Sampler::maximumMipmapFilterQuality = maximumFilterQuality;
+ }
+
+ void Sampler::setMipmapLOD(float LOD) // Stores the LOD and caches exp2(LOD)
+ {
+ texture.LOD = LOD;
+ exp2LOD = exp2(LOD); // The *LOD size fields are not recomputed here; setTextureLevel() does that for level 0
+ }
+
+ bool Sampler::hasTexture() const // True once textureType is anything other than TEXTURE_NULL
+ {
+ return textureType != TEXTURE_NULL;
+ }
+
+ bool Sampler::hasUnsignedTexture() const
+ {
+ for(int component = 0; component < 4; component++) // All four components must be unsigned
+ if(!Surface::isUnsignedComponent(internalTextureFormat, component)) return false;
+
+ return true;
+ }
+
+ bool Sampler::hasCubeTexture() const // True only for TEXTURE_CUBE
+ {
+ return textureType == TEXTURE_CUBE;
+ }
+
+ bool Sampler::hasVolumeTexture() const // True for TEXTURE_3D and also for TEXTURE_2D_ARRAY
+ {
+ return textureType == TEXTURE_3D || textureType == TEXTURE_2D_ARRAY;
+ }
+
+ void Sampler::setSyncRequired(bool isSyncRequired) // Stores the flag returned by requiresSync()
+ {
+ syncRequired = isSyncRequired;
+ }
+
+ bool Sampler::requiresSync() const // Returns the flag set by setSyncRequired(); NOTE(review): the constructor does not assign it - confirm it has a default
+ {
+ return syncRequired;
+ }
+
+ const Texture &Sampler::getTextureData() // Returns a reference to the member, not a copy
+ {
+ return texture;
+ }
+
+ MipmapType Sampler::mipmapFilter() const
+ {
+ // Mipmapping disabled by state: nothing to check.
+ if(mipmapFilterState == MIPMAP_NONE) return MIPMAP_NONE;
+
+ const void *base = texture.mipmap[0].buffer[0];
+
+ // A level whose first buffer differs from level 0's means there is more than one mipmap level.
+ for(int level = 1; level < MIPMAP_LEVELS; level++)
+ {
+ if(texture.mipmap[level].buffer[0] != base) return mipmapFilterState;
+ }
+
+ // Only one mipmap level
+ return MIPMAP_NONE;
+ }
+
+ TextureType Sampler::getTextureType() const // Returns the type last passed to setTextureLevel(), or TEXTURE_NULL initially
+ {
+ return textureType;
+ }
+
+ FilterType Sampler::getTextureFilter() const
+ {
+ const Mipmap &base = texture.mipmap[0];
+ const bool singleTexel = base.width[0] == 1 && base.height[0] == 1 && base.depth[0] == 1;
+
+ // Don't filter 1x1 textures.
+ if(singleTexel && mipmapFilter() == MIPMAP_NONE)
+ {
+ return FILTER_POINT;
+ }
+
+ // Gather replaces the configured filter only for single-component formats.
+ const bool useGather = gather && Surface::componentCount(internalTextureFormat) == 1;
+ FilterType filter = useGather ? FILTER_GATHER : textureFilter;
+
+ // Anything above linear is only kept for 2D textures whose maxAnisotropy is not exactly 1.
+ const bool capAtLinear = textureType != TEXTURE_2D || texture.maxAnisotropy == 1.0f;
+
+ if(capAtLinear)
+ {
+ return (FilterType)min(filter, FILTER_LINEAR);
+ }
+
+ return filter;
+ }
+
+ AddressingMode Sampler::getAddressingModeU() const
+ {
+ // Non-cube textures use the mode stored by setAddressingModeU().
+ if(textureType != TEXTURE_CUBE) return addressingModeU;
+
+ // Cube textures override it: seamless when 'border' is non-zero, clamp otherwise.
+ if(border) return ADDRESSING_SEAMLESS;
+ return ADDRESSING_CLAMP;
+ }
+
+ AddressingMode Sampler::getAddressingModeV() const
+ {
+ // Non-cube textures use the mode stored by setAddressingModeV().
+ if(textureType != TEXTURE_CUBE) return addressingModeV;
+
+ // Cube textures override it: seamless when 'border' is non-zero, clamp otherwise.
+ if(border) return ADDRESSING_SEAMLESS;
+ return ADDRESSING_CLAMP;
+ }
+
+ AddressingMode Sampler::getAddressingModeW() const
+ {
+ // These texture types always report ADDRESSING_LAYER, regardless of addressingModeW.
+ switch(textureType)
+ {
+ case TEXTURE_2D_ARRAY:
+ case TEXTURE_2D:
+ case TEXTURE_CUBE:
+ case TEXTURE_RECTANGLE: return ADDRESSING_LAYER;
+ default: return addressingModeW;
+ }
+ }
+
+ CompareFunc Sampler::getCompareFunc() const
+ {
+ // Gather filtering bypasses the comparison; this check takes precedence.
+ const bool gathering = (getTextureFilter() == FILTER_GATHER);
+
+ // FORMAT_D32FS8_SHADOW forces a less-or-equal comparison.
+ const bool shadowFormat = (internalTextureFormat == FORMAT_D32FS8_SHADOW);
+
+ if(gathering) return COMPARE_BYPASS;
+ if(shadowFormat) return COMPARE_LESSEQUAL;
+
+ // Otherwise use the function stored by setCompareFunc().
+ return compare;
+ }
+}
diff --git a/src/Device/Sampler.hpp b/src/Device/Sampler.hpp
new file mode 100644
index 0000000..af225c5
--- /dev/null
+++ b/src/Device/Sampler.hpp
@@ -0,0 +1,248 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Sampler_hpp
+#define sw_Sampler_hpp
+
+#include "Main/Config.hpp"
+#include "Renderer/Surface.hpp"
+#include "Common/Types.hpp"
+
+namespace sw
+{
+ // Data for one mipmap level of a texture: source buffers plus the level's
+ // dimensions and pitches in several pre-converted forms.
+ // Sampler::mipmapFilter() compares buffer[0] across levels, and
+ // Sampler::getTextureFilter() reads element 0 of width/height/depth.
+ // NOTE(review): the six buffer slots presumably correspond to cube faces and
+ // the 4-element arrays to values replicated for vector code - confirm against
+ // Sampler::setTextureLevel() before relying on it.
+ struct Mipmap
+ {
+ const void *buffer[6];
+
+ // Dimensions as floating-point vectors.
+ float4 fWidth;
+ float4 fHeight;
+ float4 fDepth;
+
+ // Dimensions and derived values as 16-bit integers.
+ short uHalf[4];
+ short vHalf[4];
+ short wHalf[4];
+ short width[4];
+ short height[4];
+ short depth[4];
+ short onePitchP[4];
+ int4 pitchP;
+ int4 sliceP;
+ };
+
+ // Texture description held by a Sampler (see Sampler::texture): the mipmap
+ // chain plus level-of-detail, border color and anisotropy parameters.
+ struct Texture
+ {
+ // One entry per mipmap level; entries that all share level 0's buffer are
+ // treated as "no mipmapping" by Sampler::mipmapFilter().
+ Mipmap mipmap[MIPMAP_LEVELS];
+
+ float LOD;
+ float4 widthHeightLOD;
+ float4 widthLOD;
+ float4 heightLOD;
+ float4 depthLOD;
+
+ // Border color stored both as 16-bit words and as floats.
+ word4 borderColor4[4];
+ float4 borderColorF[4];
+ // A value of exactly 1.0f makes Sampler::getTextureFilter() cap the filter
+ // at FILTER_LINEAR.
+ float maxAnisotropy;
+ int baseLevel;
+ int maxLevel;
+ float minLod;
+ float maxLod;
+ };
+
+ // Category of sampler.
+ // NOTE(review): presumably distinguishes samplers used by the pixel stage from
+ // those used by the vertex stage - confirm against the callers.
+ enum SamplerType
+ {
+ SAMPLER_PIXEL,
+ SAMPLER_VERTEX
+ };
+
+ // Kind of texture attached to a sampler.
+ // TEXTURE_LAST must alias the highest enumerator: Sampler::State sizes its
+ // textureType bit-field with BITS(TEXTURE_LAST).
+ enum TextureType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ TEXTURE_NULL,
+ TEXTURE_2D,
+ TEXTURE_RECTANGLE,
+ TEXTURE_CUBE,
+ TEXTURE_3D,
+ TEXTURE_2D_ARRAY,
+
+ TEXTURE_LAST = TEXTURE_2D_ARRAY
+ };
+
+ // Texture filtering modes.
+ // The enumerator order is significant: Sampler::getTextureFilter() caps the
+ // filter with min(filter, FILTER_LINEAR), so anything that must be demoted to
+ // linear has to be declared after FILTER_LINEAR.
+ // FILTER_LAST must alias the highest enumerator (used with BITS() to size
+ // bit-fields in Sampler::State).
+ enum FilterType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ FILTER_POINT,
+ FILTER_GATHER,
+ FILTER_MIN_POINT_MAG_LINEAR,
+ FILTER_MIN_LINEAR_MAG_POINT,
+ FILTER_LINEAR,
+ FILTER_ANISOTROPIC,
+
+ FILTER_LAST = FILTER_ANISOTROPIC
+ };
+
+ // Filtering between mipmap levels. MIPMAP_NONE is also what
+ // Sampler::mipmapFilter() reports when all levels share one buffer.
+ // MIPMAP_LAST must alias the highest enumerator.
+ enum MipmapType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ MIPMAP_NONE,
+ MIPMAP_POINT,
+ MIPMAP_LINEAR,
+
+ MIPMAP_LAST = MIPMAP_LINEAR
+ };
+
+ // Texture coordinate addressing modes.
+ // ADDRESSING_SEAMLESS and ADDRESSING_CLAMP are what the U/V getters of Sampler
+ // return for cube textures, and ADDRESSING_LAYER is what the W getter returns
+ // for 2D, 2D-array, cube and rectangle textures.
+ // ADDRESSING_LAST must alias the highest enumerator (used with BITS() to size
+ // bit-fields in Sampler::State).
+ enum AddressingMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ ADDRESSING_WRAP,
+ ADDRESSING_CLAMP,
+ ADDRESSING_MIRROR,
+ ADDRESSING_MIRRORONCE,
+ ADDRESSING_BORDER, // Single color
+ ADDRESSING_SEAMLESS, // Border of pixels
+ ADDRESSING_LAYER,
+ ADDRESSING_TEXELFETCH,
+
+ ADDRESSING_LAST = ADDRESSING_TEXELFETCH
+ };
+
+ // Comparison applied when sampling with depth compare.
+ // Sampler::getCompareFunc() returns COMPARE_BYPASS for gather filtering and
+ // COMPARE_LESSEQUAL for FORMAT_D32FS8_SHADOW regardless of the configured value.
+ // COMPARE_LAST must alias the highest enumerator (used with BITS() to size
+ // bit-fields in Sampler::State).
+ enum CompareFunc ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ COMPARE_BYPASS,
+ COMPARE_LESSEQUAL,
+ COMPARE_GREATEREQUAL,
+ COMPARE_LESS,
+ COMPARE_GREATER,
+ COMPARE_EQUAL,
+ COMPARE_NOTEQUAL,
+ COMPARE_ALWAYS,
+ COMPARE_NEVER,
+
+ COMPARE_LAST = COMPARE_NEVER
+ };
+
+ // Source selected for one output channel of a sampled texel: one of the four
+ // texel channels, or the constants zero / one.
+ // SWIZZLE_LAST must alias the highest enumerator (used with BITS() to size
+ // bit-fields in Sampler::State).
+ enum SwizzleType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ SWIZZLE_RED,
+ SWIZZLE_GREEN,
+ SWIZZLE_BLUE,
+ SWIZZLE_ALPHA,
+ SWIZZLE_ZERO,
+ SWIZZLE_ONE,
+
+ SWIZZLE_LAST = SWIZZLE_ONE
+ };
+
+ // Holds the texture attached to a sampling unit together with all of its
+ // sampling parameters (filtering, addressing, swizzle, compare, LOD limits).
+ class Sampler
+ {
+ public:
+     // Bit-packed snapshot of the sampling configuration, as returned by
+     // samplerState(). Each enum field is sized with BITS(<its own>_LAST).
+     struct State
+     {
+         State();
+
+         TextureType textureType        : BITS(TEXTURE_LAST);
+         Format textureFormat           : BITS(FORMAT_LAST);
+         FilterType textureFilter       : BITS(FILTER_LAST);
+         AddressingMode addressingModeU : BITS(ADDRESSING_LAST);
+         AddressingMode addressingModeV : BITS(ADDRESSING_LAST);
+         AddressingMode addressingModeW : BITS(ADDRESSING_LAST);
+         // Sized from MipmapType's own range; this field used to be sized with
+         // FILTER_LAST, which belongs to FilterType.
+         MipmapType mipmapFilter        : BITS(MIPMAP_LAST);
+         bool sRGB                      : 1;
+         SwizzleType swizzleR           : BITS(SWIZZLE_LAST);
+         SwizzleType swizzleG           : BITS(SWIZZLE_LAST);
+         SwizzleType swizzleB           : BITS(SWIZZLE_LAST);
+         SwizzleType swizzleA           : BITS(SWIZZLE_LAST);
+         bool highPrecisionFiltering    : 1;
+         CompareFunc compare            : BITS(COMPARE_LAST);
+
+         #if PERF_PROFILE
+         bool compressedFormat : 1;
+         #endif
+     };
+
+     Sampler();
+
+     ~Sampler();
+
+     // Returns the current configuration as a State snapshot.
+     State samplerState() const;
+
+     // Attaches 'surface' as mipmap 'level' of 'face' and records the texture type.
+     void setTextureLevel(int face, int level, Surface *surface, TextureType type);
+
+     // Per-sampler parameter setters.
+     void setTextureFilter(FilterType textureFilter);
+     void setMipmapFilter(MipmapType mipmapFilter);
+     void setGatherEnable(bool enable);
+     void setAddressingModeU(AddressingMode addressingMode);
+     void setAddressingModeV(AddressingMode addressingMode);
+     void setAddressingModeW(AddressingMode addressingMode);
+     void setReadSRGB(bool sRGB);
+     void setBorderColor(const Color<float> &borderColor);
+     void setMaxAnisotropy(float maxAnisotropy);
+     void setHighPrecisionFiltering(bool highPrecisionFiltering);
+     void setSwizzleR(SwizzleType swizzleR);
+     void setSwizzleG(SwizzleType swizzleG);
+     void setSwizzleB(SwizzleType swizzleB);
+     void setSwizzleA(SwizzleType swizzleA);
+     void setCompareFunc(CompareFunc compare);
+     void setBaseLevel(int baseLevel);
+     void setMaxLevel(int maxLevel);
+     void setMinLod(float minLod);
+     void setMaxLod(float maxLod);
+     void setSyncRequired(bool isSyncRequired);
+
+     // Process-wide quality limits (stored in the static members below).
+     static void setFilterQuality(FilterType maximumFilterQuality);
+     static void setMipmapQuality(MipmapType maximumFilterQuality);
+     void setMipmapLOD(float lod);
+
+     bool hasTexture() const;
+     bool hasUnsignedTexture() const;
+     bool hasCubeTexture() const;
+     bool hasVolumeTexture() const;
+     bool requiresSync() const;
+
+     const Texture &getTextureData();
+
+ private:
+     // Resolve the *effective* value of a parameter, i.e. the configured value
+     // after the overrides implemented in Sampler.cpp have been applied (for
+     // example cube textures overriding the U/V addressing modes).
+     MipmapType mipmapFilter() const;
+     TextureType getTextureType() const;
+     FilterType getTextureFilter() const;
+     AddressingMode getAddressingModeU() const;
+     AddressingMode getAddressingModeV() const;
+     AddressingMode getAddressingModeW() const;
+     CompareFunc getCompareFunc() const;
+
+     Format externalTextureFormat;
+     Format internalTextureFormat;
+     TextureType textureType;
+
+     // Configured (not yet resolved) parameters.
+     FilterType textureFilter;
+     AddressingMode addressingModeU;
+     AddressingMode addressingModeV;
+     AddressingMode addressingModeW;
+     MipmapType mipmapFilterState;
+     bool sRGB;
+     bool gather;
+     bool highPrecisionFiltering;
+     bool syncRequired;
+     // Non-zero selects seamless addressing for cube textures (see the U/V getters).
+     int border;
+
+     SwizzleType swizzleR;
+     SwizzleType swizzleG;
+     SwizzleType swizzleB;
+     SwizzleType swizzleA;
+     CompareFunc compare;
+
+     Texture texture;
+     float exp2LOD;
+
+     static FilterType maximumTextureFilterQuality;
+     static MipmapType maximumMipmapFilterQuality;
+ };
+}
+
+#endif // sw_Sampler_hpp
diff --git a/src/Device/SetupProcessor.cpp b/src/Device/SetupProcessor.cpp
new file mode 100644
index 0000000..c5c2a16
--- /dev/null
+++ b/src/Device/SetupProcessor.cpp
@@ -0,0 +1,248 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "SetupProcessor.hpp"
+
+#include "Primitive.hpp"
+#include "Polygon.hpp"
+#include "Context.hpp"
+#include "Renderer.hpp"
+#include "Shader/SetupRoutine.hpp"
+#include "Shader/Constants.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+ // Configuration flags defined in other translation units.
+ extern bool complementaryDepthBuffer;
+ // Read by SetupProcessor::update() when deciding whether a declared vPos
+ // register requires Z and W to be interpolated.
+ extern bool fullPixelPositionRegister;
+
+ // When true, SetupProcessor::setRoutineCacheSize() passes the name "sw-setup"
+ // to the routine cache; otherwise it passes a null name.
+ bool precacheSetup = false;
+
+ // XOR-folds the States object, viewed as an array of 32-bit words, into a
+ // single hash value. Bytes past the last whole word (if any) are ignored.
+ unsigned int SetupProcessor::States::computeHash()
+ {
+     const unsigned int wordCount = sizeof(States) / 4;
+     const unsigned int *words = (unsigned int*)this;
+
+     unsigned int result = 0;
+     unsigned int index = 0;
+
+     while(index < wordCount)
+     {
+         result ^= words[index++];
+     }
+
+     return result;
+ }
+
+ // Zero-fills the entire object, hash included. The argument is not used.
+ // NOTE(review): clearing every byte presumably exists so that the byte-wise
+ // computeHash() and operator== see deterministic padding - confirm.
+ SetupProcessor::State::State(int i)
+ {
+     memset(this, 0, sizeof(*this));
+ }
+
+ // Two states are equal when their hashes match and their States sub-objects
+ // are byte-for-byte identical. The hash is checked first as a cheap reject.
+ bool SetupProcessor::State::operator==(const State &state) const
+ {
+     const States *lhs = static_cast<const States*>(this);
+     const States *rhs = static_cast<const States*>(&state);
+
+     return hash == state.hash && memcmp(lhs, rhs, sizeof(States)) == 0;
+ }
+
+ // Binds the processor to 'context' and creates a routine cache of 1024 entries.
+ // routineCache must be null before setRoutineCacheSize() runs because that
+ // function deletes the previous cache first.
+ SetupProcessor::SetupProcessor(Context *context) : context(context), routineCache(0)
+ {
+     setRoutineCacheSize(1024);
+ }
+
+ // Releases the routine cache and leaves the pointer null.
+ SetupProcessor::~SetupProcessor()
+ {
+     RoutineCache<State> *const cache = routineCache;
+     routineCache = 0;
+
+     delete cache;
+ }
+
+ // Builds the setup State that corresponds to the current Context: primitive
+ // type flags, interpolation requirements, position/point-size registers and,
+ // for every interpolant component, which vertex attribute feeds it and whether
+ // it is flat-shaded or wrapped. The hash is computed last, once all fields are
+ // final.
+ //
+ // Note that States::gradient shares storage with States::color, ::texture and
+ // ::fog through an anonymous union, so the writes below are order-sensitive.
+ SetupProcessor::State SetupProcessor::update() const
+ {
+ State state;
+
+ // The pixel shader reads Z/W from the position register only when vPos is
+ // declared and the full register is enabled globally.
+ bool vPosZW = (context->pixelShader && context->pixelShader->isVPosDeclared() && fullPixelPositionRegister);
+
+ state.isDrawPoint = context->isDrawPoint(true);
+ state.isDrawLine = context->isDrawLine(true);
+ state.isDrawTriangle = context->isDrawTriangle(false);
+ state.isDrawSolidTriangle = context->isDrawTriangle(true);
+ state.interpolateZ = context->depthBufferActive() || context->pixelFogActive() != FOG_NONE || vPosZW;
+ state.interpolateW = context->perspectiveActive() || vPosZW;
+ state.perspective = context->perspectiveActive();
+ state.pointSprite = context->pointSpriteActive();
+ state.cullMode = context->cullMode;
+ state.twoSidedStencil = context->stencilActive() && context->twoSidedStencil;
+ state.slopeDepthBias = context->slopeDepthBias != 0.0f;
+ state.vFace = context->pixelShader && context->pixelShader->isVFaceDeclared();
+
+ // Defaults, overridden below when a vertex shader is bound or point size is active.
+ state.positionRegister = Pos;
+ state.pointSizeRegister = Unused;
+
+ state.multiSample = context->getMultiSampleCount();
+ state.rasterizerDiscard = context->rasterizerDiscard;
+
+ if(context->vertexShader)
+ {
+ state.positionRegister = context->vertexShader->getPositionRegister();
+ state.pointSizeRegister = context->vertexShader->getPointSizeRegister();
+ }
+ else if(context->pointSizeActive())
+ {
+ state.pointSizeRegister = Pts;
+ }
+
+ // Start with every interpolant component marked as unused.
+ for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+ {
+ for(int component = 0; component < 4; component++)
+ {
+ state.gradient[interpolant][component].attribute = Unused;
+ state.gradient[interpolant][component].flat = false;
+ state.gradient[interpolant][component].wrap = false;
+ }
+ }
+
+ state.fog.attribute = Unused;
+ state.fog.flat = false;
+ state.fog.wrap = false;
+
+ const bool point = context->isDrawPoint(true);
+ const bool sprite = context->pointSpriteActive();
+ const bool flatShading = (context->shadingMode == SHADING_FLAT) || point;
+
+ // Case 1: both shaders bound. Match each active pixel shader input to the
+ // vertex shader output carrying the same semantic.
+ if(context->vertexShader && context->pixelShader)
+ {
+ for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+ {
+ for(int component = 0; component < 4; component++)
+ {
+ // Projection components take their semantic from the preceding component.
+ int project = context->isProjectionComponent(interpolant - 2, component) ? 1 : 0;
+ const Shader::Semantic& semantic = context->pixelShader->getInput(interpolant, component - project);
+
+ if(semantic.active())
+ {
+ // Falls back to the interpolant's own index when no output matches.
+ int input = interpolant;
+ for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
+ {
+ if(semantic == context->vertexShader->getOutput(i, component - project))
+ {
+ input = i;
+ break;
+ }
+ }
+
+ bool flat = point;
+
+ switch(semantic.usage)
+ {
+ case Shader::USAGE_TEXCOORD: flat = point && !sprite; break;
+ case Shader::USAGE_COLOR: flat = semantic.flat || flatShading; break;
+ }
+
+ state.gradient[interpolant][component].attribute = input;
+ state.gradient[interpolant][component].flat = flat;
+ }
+ }
+ }
+ }
+ // Case 2: pre-transformed vertices with a pixel shader. Inputs map directly
+ // onto the fixed texture-coordinate / color attributes by semantic index.
+ else if(context->preTransformed && context->pixelShader)
+ {
+ for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+ {
+ for(int component = 0; component < 4; component++)
+ {
+ const Shader::Semantic& semantic = context->pixelShader->getInput(interpolant, component);
+
+ switch(semantic.usage)
+ {
+ case 0xFF:
+ break;
+ case Shader::USAGE_TEXCOORD:
+ state.gradient[interpolant][component].attribute = T0 + semantic.index;
+ state.gradient[interpolant][component].flat = semantic.flat || (point && !sprite);
+ break;
+ case Shader::USAGE_COLOR:
+ state.gradient[interpolant][component].attribute = C0 + semantic.index;
+ state.gradient[interpolant][component].flat = semantic.flat || flatShading;
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+ }
+ }
+ // Case 3: pixel shader model below 3.0. Uses the named texture[] and color[]
+ // views of the union instead of gradient[].
+ else if(context->pixelShaderModel() < 0x0300)
+ {
+ for(int coordinate = 0; coordinate < 8; coordinate++)
+ {
+ for(int component = 0; component < 4; component++)
+ {
+ if(context->textureActive(coordinate, component))
+ {
+ state.texture[coordinate][component].attribute = T0 + coordinate;
+ state.texture[coordinate][component].flat = point && !sprite;
+ state.texture[coordinate][component].wrap = (context->textureWrap[coordinate] & (1 << component)) != 0;
+ }
+ }
+ }
+
+ for(int color = 0; color < 2; color++)
+ {
+ for(int component = 0; component < 4; component++)
+ {
+ if(context->colorActive(color, component))
+ {
+ state.color[color][component].attribute = C0 + color;
+ state.color[color][component].flat = flatShading;
+ }
+ }
+ }
+ }
+ else ASSERT(false);
+
+ if(context->fogActive())
+ {
+ state.fog.attribute = Fog;
+ state.fog.flat = point;
+ }
+
+ // Must be last: the hash covers every field written above.
+ state.hash = state.computeHash();
+
+ return state;
+ }
+
+ // Returns the setup routine for 'state'. A cached routine is reused when
+ // available; otherwise one is generated and added to the cache.
+ Routine *SetupProcessor::routine(const State &state)
+ {
+     if(Routine *cached = routineCache->query(state))
+     {
+         return cached;
+     }
+
+     // Cache miss: the generator is only needed while building the routine.
+     SetupRoutine *generator = new SetupRoutine(state);
+     generator->generate();
+     Routine *generated = generator->getRoutine();
+     delete generator;
+
+     routineCache->add(state, generated);
+
+     return generated;
+ }
+
+ // Replaces the routine cache with a new, empty one. The requested size is
+ // clamped to [1, 65536]. The cache is given the name "sw-setup" when
+ // precacheSetup is enabled, and a null name otherwise.
+ void SetupProcessor::setRoutineCacheSize(int cacheSize)
+ {
+     const int entries = clamp(cacheSize, 1, 65536);
+     const char *const precacheName = precacheSetup ? "sw-setup" : 0;
+
+     delete routineCache;
+     routineCache = new RoutineCache<State>(entries, precacheName);
+ }
+}
diff --git a/src/Device/SetupProcessor.hpp b/src/Device/SetupProcessor.hpp
new file mode 100644
index 0000000..be0adc7
--- /dev/null
+++ b/src/Device/SetupProcessor.hpp
@@ -0,0 +1,105 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_SetupProcessor_hpp
+#define sw_SetupProcessor_hpp
+
+#include "Context.hpp"
+#include "RoutineCache.hpp"
+#include "Shader/VertexShader.hpp"
+#include "Shader/PixelShader.hpp"
+#include "Common/Types.hpp"
+
+namespace sw
+{
+ struct Primitive;
+ struct Triangle;
+ struct Polygon;
+ struct Vertex;
+ struct DrawCall;
+ struct DrawData;
+
+ // Derives the primitive-setup State from a Context and maps each distinct
+ // State to a generated setup Routine, which is kept in a cache.
+ class SetupProcessor
+ {
+ public:
+ // Bit-packed description of everything the setup routine depends on.
+ // Hashed and compared as raw memory (see computeHash() and State::operator==).
+ struct States
+ {
+ // XORs the object's 32-bit words together.
+ unsigned int computeHash();
+
+ bool isDrawPoint : 1;
+ bool isDrawLine : 1;
+ bool isDrawTriangle : 1;
+ bool isDrawSolidTriangle : 1;
+ bool interpolateZ : 1;
+ bool interpolateW : 1;
+ bool perspective : 1;
+ bool pointSprite : 1;
+ unsigned int positionRegister : BITS(VERTEX_OUTPUT_LAST);
+ unsigned int pointSizeRegister : BITS(VERTEX_OUTPUT_LAST);
+ CullMode cullMode : BITS(CULL_LAST);
+ bool twoSidedStencil : 1;
+ bool slopeDepthBias : 1;
+ bool vFace : 1;
+ unsigned int multiSample : 3; // 1, 2 or 4
+ bool rasterizerDiscard : 1;
+
+ // Source of one interpolant component: the vertex attribute feeding it
+ // and how it is interpolated.
+ struct Gradient
+ {
+ unsigned char attribute : BITS(VERTEX_OUTPUT_LAST);
+ bool flat : 1;
+ bool wrap : 1;
+ };
+
+ // Two views of the same storage: named color/texture/fog entries, or a
+ // generic per-interpolant array. SetupProcessor::update() writes through
+ // both views depending on which shaders are bound.
+ union
+ {
+ struct
+ {
+ Gradient color[2][4];
+ Gradient texture[8][4];
+ Gradient fog;
+ };
+
+ Gradient gradient[MAX_FRAGMENT_INPUTS][4];
+ };
+ };
+
+ // States plus its precomputed hash; used as the routine cache key.
+ struct State : States
+ {
+ // Zero-fills the object; the argument is unused.
+ State(int i = 0);
+
+ bool operator==(const State &states) const;
+
+ unsigned int hash;
+ };
+
+ // Signature of the generated setup routine.
+ typedef bool (*RoutinePointer)(Primitive *primitive, const Triangle *triangle, const Polygon *polygon, const DrawData *draw);
+
+ SetupProcessor(Context *context);
+
+ ~SetupProcessor();
+
+ protected:
+ // Computes the State for the current contents of the context.
+ State update() const;
+ // Returns the (possibly cached) routine generated for 'state'.
+ Routine *routine(const State &state);
+
+ // Discards the current cache and creates a new one of the given size.
+ void setRoutineCacheSize(int cacheSize);
+
+ private:
+ Context *const context;
+
+ RoutineCache<State> *routineCache;
+ };
+}
+
+#endif // sw_SetupProcessor_hpp
diff --git a/src/Device/Stream.hpp b/src/Device/Stream.hpp
new file mode 100644
index 0000000..969d8b2
--- /dev/null
+++ b/src/Device/Stream.hpp
@@ -0,0 +1,105 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Stream_hpp
+#define sw_Stream_hpp
+
+#include "Common/Types.hpp"
+
+namespace sw
+{
+ class Resource;
+
+ // Element encodings a vertex input Stream can declare (see Stream::define()).
+ // STREAMTYPE_LAST must alias the highest enumerator.
+ enum StreamType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+ {
+ STREAMTYPE_COLOR, // 4 normalized unsigned bytes, ZYXW order
+ STREAMTYPE_UDEC3, // 3 unsigned 10-bit fields
+ STREAMTYPE_DEC3N, // 3 normalized signed 10-bit fields
+ STREAMTYPE_INDICES, // 4 unsigned bytes, stored unconverted into X component
+ STREAMTYPE_FLOAT, // Normalization ignored
+ STREAMTYPE_BYTE,
+ STREAMTYPE_SBYTE,
+ STREAMTYPE_SHORT,
+ STREAMTYPE_USHORT,
+ STREAMTYPE_INT,
+ STREAMTYPE_UINT,
+ STREAMTYPE_FIXED, // Normalization ignored (16.16 format)
+ STREAMTYPE_HALF, // Normalization ignored
+ STREAMTYPE_2_10_10_10_INT,
+ STREAMTYPE_2_10_10_10_UINT,
+
+ STREAMTYPE_LAST = STREAMTYPE_2_10_10_10_UINT
+ };
+
+ // Location of a vertex stream's data: the owning resource, a pointer to the
+ // data and the stride between consecutive elements. Plain aggregate with no
+ // constructor; Stream derives from it and fills these fields in.
+ struct StreamResource
+ {
+ Resource *resource;
+ const void *buffer;
+ unsigned int stride;
+ };
+
+ // Describes one vertex input stream: where its data lives (inherited from
+ // StreamResource) and how each element is encoded (type, component count,
+ // normalization). A stream with a component count of zero converts to false.
+ struct Stream : public StreamResource
+ {
+     // Binds the stream to a resource/buffer/stride. The element description
+     // starts out empty (float, zero components, not normalized) until
+     // define() or defaults() is called.
+     Stream(Resource *resource = 0, const void *buffer = 0, unsigned int stride = 0)
+     {
+         this->resource = resource;
+         this->buffer = buffer;
+         this->stride = stride;
+
+         // These were previously left uninitialized, so operator bool() read an
+         // indeterminate 'count' on a freshly constructed Stream.
+         type = STREAMTYPE_FLOAT;
+         count = 0;
+         normalized = false;
+     }
+
+     // Sets the element description, keeping the current buffer.
+     // Returns *this to allow chaining.
+     Stream &define(StreamType type, unsigned int count, bool normalized = false)
+     {
+         this->type = type;
+         this->count = count;
+         this->normalized = normalized;
+
+         return *this;
+     }
+
+     // Sets both the data pointer and the element description.
+     // Returns *this to allow chaining.
+     Stream &define(const void *buffer, StreamType type, unsigned int count, bool normalized = false)
+     {
+         this->buffer = buffer;
+         this->type = type;
+         this->count = count;
+         this->normalized = normalized;
+
+         return *this;
+     }
+
+     // Resets the stream to "no data": no resource, zero stride, zero components,
+     // and a buffer pointing at a shared constant (0, 0, 0, 1) vector.
+     Stream &defaults()
+     {
+         static const float4 null = {0, 0, 0, 1};
+
+         resource = 0;
+         buffer = &null;
+         stride = 0;
+         type = STREAMTYPE_FLOAT;
+         count = 0;
+         normalized = false;
+
+         return *this;
+     }
+
+     operator bool() const   // Returns true if stream contains data
+     {
+         return count != 0;
+     }
+
+     StreamType type;
+     unsigned char count;   // Number of components per element; 0 means no data
+     bool normalized;
+ };
+}
+
+#endif // sw_Stream_hpp
diff --git a/src/Device/Surface.cpp b/src/Device/Surface.cpp
new file mode 100644
index 0000000..e06f2bd
--- /dev/null
+++ b/src/Device/Surface.cpp
@@ -0,0 +1,6217 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Surface.hpp"
+
+#include "Color.hpp"
+#include "Context.hpp"
+#include "ETC_Decoder.hpp"
+#include "Renderer.hpp"
+#include "Common/Half.hpp"
+#include "Common/Memory.hpp"
+#include "Common/CPUID.hpp"
+#include "Common/Resource.hpp"
+#include "Common/Debug.hpp"
+#include "Reactor/Reactor.hpp"
+
+#if defined(__i386__) || defined(__x86_64__)
+ #include <xmmintrin.h>
+ #include <emmintrin.h>
+#endif
+
+#undef min
+#undef max
+
+namespace sw
+{
+ // Configuration flags defined in other translation units.
+ extern bool quadLayoutEnabled;
+ extern bool complementaryDepthBuffer;
+ extern TranscendentalPrecision logPrecision;
+
+ // Shared palette used when reading paletted formats: Buffer::read() indexes it
+ // with the 8-bit pixel value and unpacks the entry as 8-bit R, G, B (and A for
+ // FORMAT_P8) from the low byte upwards. Null until a palette is installed.
+ unsigned int *Surface::palette = 0;
+ unsigned int Surface::paletteID = 0;
+
+ // Writes 'color' to pixel (x, y) of slice z, once for each of the buffer's
+ // samples. x and y may extend into the border region; z must be in [0, depth).
+ void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
+ {
+     ASSERT((x >= -border) && (x < (width + border)));
+     ASSERT((y >= -border) && (y < (height + border)));
+     ASSERT((z >= 0) && (z < depth));
+
+     // Each depth slice holds 'samples' consecutive planes of sliceB bytes.
+     byte *sample = (byte*)buffer;
+     sample += (x + border) * bytes;
+     sample += (y + border) * pitchB;
+     sample += z * samples * sliceB;
+
+     for(int remaining = samples; remaining > 0; remaining--, sample += sliceB)
+     {
+         write(sample, color);
+     }
+ }
+
+ // Writes 'color' to pixel (x, y) of the first slice, once for each of the
+ // buffer's samples. x and y may extend into the border region.
+ void Surface::Buffer::write(int x, int y, const Color<float> &color)
+ {
+     ASSERT((x >= -border) && (x < (width + border)));
+     ASSERT((y >= -border) && (y < (height + border)));
+
+     byte *sample = (byte*)buffer;
+     sample += (x + border) * bytes;
+     sample += (y + border) * pitchB;
+
+     // Consecutive samples of the same pixel are sliceB bytes apart.
+     for(int remaining = samples; remaining > 0; remaining--, sample += sliceB)
+     {
+         write(sample, color);
+     }
+ }
+
+ // Encodes 'color' in this buffer's pixel format and stores it at 'element'.
+ //
+ // For sRGB formats the red, green and blue channels are first converted with
+ // linearToSRGB(); alpha is stored as given. Formats without a case below cannot
+ // be written through this path and trigger ASSERT(false).
+ //
+ // element - destination address of one pixel/sample inside this buffer.
+ // color   - source color.
+ //
+ // NOTE(review): unorm<n>/snorm<n>/ucast<n>/scast<n> are defined elsewhere; the
+ // packing below assumes each returns a value that fits in n bits - confirm.
+ inline void Surface::Buffer::write(void *element, const Color<float> &color)
+ {
+     float r = color.r;
+     float g = color.g;
+     float b = color.b;
+     float a = color.a;
+
+     if(isSRGBformat(format))
+     {
+         r = linearToSRGB(r);
+         g = linearToSRGB(g);
+         b = linearToSRGB(b);
+     }
+
+     switch(format)
+     {
+     case FORMAT_A8:
+         *(unsigned char*)element = unorm<8>(a);
+         break;
+     case FORMAT_R8_SNORM:
+         *(char*)element = snorm<8>(r);
+         break;
+     case FORMAT_R8:
+         *(unsigned char*)element = unorm<8>(r);
+         break;
+     case FORMAT_R8I:
+         *(char*)element = scast<8>(r);
+         break;
+     case FORMAT_R8UI:
+         *(unsigned char*)element = ucast<8>(r);
+         break;
+     case FORMAT_R16I:
+         *(short*)element = scast<16>(r);
+         break;
+     case FORMAT_R16UI:
+         *(unsigned short*)element = ucast<16>(r);
+         break;
+     case FORMAT_R32I:
+         *(int*)element = static_cast<int>(r);
+         break;
+     case FORMAT_R32UI:
+         *(unsigned int*)element = static_cast<unsigned int>(r);
+         break;
+     case FORMAT_R3G3B2:
+         *(unsigned char*)element = (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
+         break;
+     case FORMAT_A8R3G3B2:
+         *(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
+         break;
+     case FORMAT_X4R4G4B4:
+         *(unsigned short*)element = 0xF000 | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
+         break;
+     case FORMAT_A4R4G4B4:
+         *(unsigned short*)element = (unorm<4>(a) << 12) | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
+         break;
+     case FORMAT_R4G4B4A4:
+         *(unsigned short*)element = (unorm<4>(r) << 12) | (unorm<4>(g) << 8) | (unorm<4>(b) << 4) | (unorm<4>(a) << 0);
+         break;
+     case FORMAT_R5G6B5:
+         *(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<6>(g) << 5) | (unorm<5>(b) << 0);
+         break;
+     case FORMAT_A1R5G5B5:
+         *(unsigned short*)element = (unorm<1>(a) << 15) | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
+         break;
+     case FORMAT_R5G5B5A1:
+         // Alpha occupies only bit 0; a 5-bit alpha value would spill into the
+         // blue field, so it is quantized to a single bit.
+         *(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<5>(g) << 6) | (unorm<5>(b) << 1) | (unorm<1>(a) << 0);
+         break;
+     case FORMAT_X1R5G5B5:
+         *(unsigned short*)element = 0x8000 | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
+         break;
+     case FORMAT_A8R8G8B8:
+         *(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
+         break;
+     case FORMAT_X8R8G8B8:
+         *(unsigned int*)element = 0xFF000000 | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
+         break;
+     case FORMAT_A8B8G8R8_SNORM:
+         *(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(a)) << 24) |
+                                   (static_cast<unsigned int>(snorm<8>(b)) << 16) |
+                                   (static_cast<unsigned int>(snorm<8>(g)) << 8) |
+                                   (static_cast<unsigned int>(snorm<8>(r)) << 0);
+         break;
+     case FORMAT_A8B8G8R8:
+     case FORMAT_SRGB8_A8:
+         *(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
+         break;
+     case FORMAT_A8B8G8R8I:
+         *(unsigned int*)element = (static_cast<unsigned int>(scast<8>(a)) << 24) |
+                                   (static_cast<unsigned int>(scast<8>(b)) << 16) |
+                                   (static_cast<unsigned int>(scast<8>(g)) << 8) |
+                                   (static_cast<unsigned int>(scast<8>(r)) << 0);
+         break;
+     case FORMAT_A8B8G8R8UI:
+         *(unsigned int*)element = (ucast<8>(a) << 24) | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
+         break;
+     case FORMAT_X8B8G8R8_SNORM:
+         *(unsigned int*)element = 0x7F000000 |
+                                   (static_cast<unsigned int>(snorm<8>(b)) << 16) |
+                                   (static_cast<unsigned int>(snorm<8>(g)) << 8) |
+                                   (static_cast<unsigned int>(snorm<8>(r)) << 0);
+         break;
+     case FORMAT_X8B8G8R8:
+     case FORMAT_SRGB8_X8:
+         *(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
+         break;
+     case FORMAT_X8B8G8R8I:
+         *(unsigned int*)element = 0x7F000000 |
+                                   (static_cast<unsigned int>(scast<8>(b)) << 16) |
+                                   (static_cast<unsigned int>(scast<8>(g)) << 8) |
+                                   (static_cast<unsigned int>(scast<8>(r)) << 0);
+         // Must not fall through: the unsigned case below would overwrite the
+         // signed encoding that was just stored.
+         break;
+     case FORMAT_X8B8G8R8UI:
+         *(unsigned int*)element = 0xFF000000 | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
+         break;
+     case FORMAT_A2R10G10B10:
+         *(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(r) << 20) | (unorm<10>(g) << 10) | (unorm<10>(b) << 0);
+         break;
+     case FORMAT_A2B10G10R10:
+     case FORMAT_A2B10G10R10UI:
+         // NOTE(review): the UI variant shares the normalized encoding here,
+         // unlike the other *UI formats which use ucast - confirm this is intended.
+         *(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(b) << 20) | (unorm<10>(g) << 10) | (unorm<10>(r) << 0);
+         break;
+     case FORMAT_G8R8_SNORM:
+         *(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(g)) << 8) |
+                                     (static_cast<unsigned short>(snorm<8>(r)) << 0);
+         break;
+     case FORMAT_G8R8:
+         *(unsigned short*)element = (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
+         break;
+     case FORMAT_G8R8I:
+         *(unsigned short*)element = (static_cast<unsigned short>(scast<8>(g)) << 8) |
+                                     (static_cast<unsigned short>(scast<8>(r)) << 0);
+         break;
+     case FORMAT_G8R8UI:
+         *(unsigned short*)element = (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
+         break;
+     case FORMAT_G16R16:
+         *(unsigned int*)element = (unorm<16>(g) << 16) | (unorm<16>(r) << 0);
+         break;
+     case FORMAT_G16R16I:
+         *(unsigned int*)element = (static_cast<unsigned int>(scast<16>(g)) << 16) |
+                                   (static_cast<unsigned int>(scast<16>(r)) << 0);
+         break;
+     case FORMAT_G16R16UI:
+         *(unsigned int*)element = (ucast<16>(g) << 16) | (ucast<16>(r) << 0);
+         break;
+     case FORMAT_G32R32I:
+     case FORMAT_G32R32UI:
+         ((unsigned int*)element)[0] = static_cast<unsigned int>(r);
+         ((unsigned int*)element)[1] = static_cast<unsigned int>(g);
+         break;
+     case FORMAT_A16B16G16R16:
+         ((unsigned short*)element)[0] = unorm<16>(r);
+         ((unsigned short*)element)[1] = unorm<16>(g);
+         ((unsigned short*)element)[2] = unorm<16>(b);
+         ((unsigned short*)element)[3] = unorm<16>(a);
+         break;
+     case FORMAT_A16B16G16R16I:
+         ((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
+         ((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
+         ((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
+         ((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(a));
+         break;
+     case FORMAT_A16B16G16R16UI:
+         ((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
+         ((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
+         ((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
+         ((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(a));
+         break;
+     case FORMAT_X16B16G16R16I:
+         ((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
+         ((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
+         ((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
+         break;
+     case FORMAT_X16B16G16R16UI:
+         ((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
+         ((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
+         ((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
+         break;
+     case FORMAT_A32B32G32R32I:
+     case FORMAT_A32B32G32R32UI:
+         ((unsigned int*)element)[0] = static_cast<unsigned int>(r);
+         ((unsigned int*)element)[1] = static_cast<unsigned int>(g);
+         ((unsigned int*)element)[2] = static_cast<unsigned int>(b);
+         ((unsigned int*)element)[3] = static_cast<unsigned int>(a);
+         break;
+     case FORMAT_X32B32G32R32I:
+     case FORMAT_X32B32G32R32UI:
+         ((unsigned int*)element)[0] = static_cast<unsigned int>(r);
+         ((unsigned int*)element)[1] = static_cast<unsigned int>(g);
+         ((unsigned int*)element)[2] = static_cast<unsigned int>(b);
+         break;
+     case FORMAT_V8U8:
+         *(unsigned short*)element = (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
+         break;
+     case FORMAT_L6V5U5:
+         *(unsigned short*)element = (unorm<6>(b) << 10) | (snorm<5>(g) << 5) | (snorm<5>(r) << 0);
+         break;
+     case FORMAT_Q8W8V8U8:
+         *(unsigned int*)element = (snorm<8>(a) << 24) | (snorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
+         break;
+     case FORMAT_X8L8V8U8:
+         *(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
+         break;
+     case FORMAT_V16U16:
+         *(unsigned int*)element = (snorm<16>(g) << 16) | (snorm<16>(r) << 0);
+         break;
+     case FORMAT_A2W10V10U10:
+         *(unsigned int*)element = (unorm<2>(a) << 30) | (snorm<10>(b) << 20) | (snorm<10>(g) << 10) | (snorm<10>(r) << 0);
+         break;
+     case FORMAT_A16W16V16U16:
+         ((unsigned short*)element)[0] = snorm<16>(r);
+         ((unsigned short*)element)[1] = snorm<16>(g);
+         ((unsigned short*)element)[2] = snorm<16>(b);
+         ((unsigned short*)element)[3] = unorm<16>(a);
+         break;
+     case FORMAT_Q16W16V16U16:
+         ((unsigned short*)element)[0] = snorm<16>(r);
+         ((unsigned short*)element)[1] = snorm<16>(g);
+         ((unsigned short*)element)[2] = snorm<16>(b);
+         ((unsigned short*)element)[3] = snorm<16>(a);
+         break;
+     case FORMAT_R8G8B8:
+         ((unsigned char*)element)[0] = unorm<8>(b);
+         ((unsigned char*)element)[1] = unorm<8>(g);
+         ((unsigned char*)element)[2] = unorm<8>(r);
+         break;
+     case FORMAT_B8G8R8:
+         ((unsigned char*)element)[0] = unorm<8>(r);
+         ((unsigned char*)element)[1] = unorm<8>(g);
+         ((unsigned char*)element)[2] = unorm<8>(b);
+         break;
+     case FORMAT_R16F:
+         *(half*)element = (half)r;
+         break;
+     case FORMAT_A16F:
+         *(half*)element = (half)a;
+         break;
+     case FORMAT_G16R16F:
+         ((half*)element)[0] = (half)r;
+         ((half*)element)[1] = (half)g;
+         break;
+     case FORMAT_X16B16G16R16F_UNSIGNED:
+         r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
+         // Fall through to FORMAT_X16B16G16R16F.
+     case FORMAT_X16B16G16R16F:
+         ((half*)element)[3] = 1.0f;
+         // Fall through to FORMAT_B16G16R16F.
+     case FORMAT_B16G16R16F:
+         ((half*)element)[0] = (half)r;
+         ((half*)element)[1] = (half)g;
+         ((half*)element)[2] = (half)b;
+         break;
+     case FORMAT_A16B16G16R16F:
+         ((half*)element)[0] = (half)r;
+         ((half*)element)[1] = (half)g;
+         ((half*)element)[2] = (half)b;
+         ((half*)element)[3] = (half)a;
+         break;
+     case FORMAT_A32F:
+         *(float*)element = a;
+         break;
+     case FORMAT_R32F:
+         *(float*)element = r;
+         break;
+     case FORMAT_G32R32F:
+         ((float*)element)[0] = r;
+         ((float*)element)[1] = g;
+         break;
+     case FORMAT_X32B32G32R32F_UNSIGNED:
+         r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
+         // Fall through to FORMAT_X32B32G32R32F.
+     case FORMAT_X32B32G32R32F:
+         ((float*)element)[3] = 1.0f;
+         // Fall through to FORMAT_B32G32R32F.
+     case FORMAT_B32G32R32F:
+         ((float*)element)[0] = r;
+         ((float*)element)[1] = g;
+         ((float*)element)[2] = b;
+         break;
+     case FORMAT_A32B32G32R32F:
+         ((float*)element)[0] = r;
+         ((float*)element)[1] = g;
+         ((float*)element)[2] = b;
+         ((float*)element)[3] = a;
+         break;
+     case FORMAT_D32F:
+     case FORMAT_D32FS8:
+     case FORMAT_D32F_LOCKABLE:
+     case FORMAT_D32FS8_TEXTURE:
+     case FORMAT_D32F_SHADOW:
+     case FORMAT_D32FS8_SHADOW:
+         *((float*)element) = r;
+         break;
+     case FORMAT_D32F_COMPLEMENTARY:
+     case FORMAT_D32FS8_COMPLEMENTARY:
+         // Complementary depth formats store one minus the depth value.
+         *((float*)element) = 1 - r;
+         break;
+     case FORMAT_S8:
+         *((unsigned char*)element) = unorm<8>(r);
+         break;
+     case FORMAT_L8:
+         *(unsigned char*)element = unorm<8>(r);
+         break;
+     case FORMAT_A4L4:
+         *(unsigned char*)element = (unorm<4>(a) << 4) | (unorm<4>(r) << 0);
+         break;
+     case FORMAT_L16:
+         *(unsigned short*)element = unorm<16>(r);
+         break;
+     case FORMAT_A8L8:
+         *(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<8>(r) << 0);
+         break;
+     case FORMAT_L16F:
+         *(half*)element = (half)r;
+         break;
+     case FORMAT_A16L16F:
+         ((half*)element)[0] = (half)r;
+         ((half*)element)[1] = (half)a;
+         break;
+     case FORMAT_L32F:
+         *(float*)element = r;
+         break;
+     case FORMAT_A32L32F:
+         ((float*)element)[0] = r;
+         ((float*)element)[1] = a;
+         break;
+     default:
+         ASSERT(false);
+     }
+ }
+
+	Color<float> Surface::Buffer::read(int x, int y, int z) const
+	{
+		// x and y may reach into the border region; z must name an existing slice.
+		ASSERT((-border <= x) && (x < (width + border)));
+		ASSERT((-border <= y) && (y < (height + border)));
+		ASSERT((0 <= z) && (z < depth));
+
+		void *texel = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
+		return read(texel);
+	}
+
+	Color<float> Surface::Buffer::read(int x, int y) const
+	{
+		// Two-coordinate lookup: same x/y addressing as read(x, y, z), without a slice offset.
+		ASSERT((-border <= x) && (x < (width + border)));
+		ASSERT((-border <= y) && (y < (height + border)));
+
+		void *texel = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB;
+		return read(texel);
+	}
+
+	inline Color<float> Surface::Buffer::read(void *element) const   // Decodes the texel stored at 'element' (laid out per this buffer's 'format') into a float RGBA color.
+	{
+		float r = 0.0f;   // Channels the format does not provide keep these defaults: (0, 0, 0, 1).
+		float g = 0.0f;
+		float b = 0.0f;
+		float a = 1.0f;
+		// Packed unsigned-normalized fields are decoded as (value & mask) * (1.0f / mask), which normalizes without shifting.
+		switch(format)
+		{
+		case FORMAT_P8:
+			{
+				ASSERT(palette);
+
+				unsigned int abgr = palette[*(unsigned char*)element];
+
+				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
+				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
+				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
+				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
+			}
+			break;
+		case FORMAT_A8P8:
+			{
+				ASSERT(palette);
+
+				unsigned int bgr = palette[((unsigned char*)element)[0]];
+
+				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
+				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
+				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
+				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
+			}
+			break;
+		case FORMAT_A8:
+			r = 0;
+			g = 0;
+			b = 0;
+			a = *(unsigned char*)element * (1.0f / 0xFF);
+			break;
+		case FORMAT_R8_SNORM:
+			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
+			break;
+		case FORMAT_R8:
+			r = *(unsigned char*)element * (1.0f / 0xFF);
+			break;
+		case FORMAT_R8I:
+			r = *(signed char*)element;
+			break;
+		case FORMAT_R8UI:
+			r = *(unsigned char*)element;
+			break;
+		case FORMAT_R3G3B2:
+			{
+				unsigned char rgb = *(unsigned char*)element;
+
+				r = (rgb & 0xE0) * (1.0f / 0xE0);
+				g = (rgb & 0x1C) * (1.0f / 0x1C);
+				b = (rgb & 0x03) * (1.0f / 0x03);
+			}
+			break;
+		case FORMAT_A8R3G3B2:
+			{
+				unsigned short argb = *(unsigned short*)element;
+
+				a = (argb & 0xFF00) * (1.0f / 0xFF00);
+				r = (argb & 0x00E0) * (1.0f / 0x00E0);
+				g = (argb & 0x001C) * (1.0f / 0x001C);
+				b = (argb & 0x0003) * (1.0f / 0x0003);
+			}
+			break;
+		case FORMAT_X4R4G4B4:
+			{
+				unsigned short rgb = *(unsigned short*)element;
+
+				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
+				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
+				b = (rgb & 0x000F) * (1.0f / 0x000F);
+			}
+			break;
+		case FORMAT_A4R4G4B4:
+			{
+				unsigned short argb = *(unsigned short*)element;
+
+				a = (argb & 0xF000) * (1.0f / 0xF000);
+				r = (argb & 0x0F00) * (1.0f / 0x0F00);
+				g = (argb & 0x00F0) * (1.0f / 0x00F0);
+				b = (argb & 0x000F) * (1.0f / 0x000F);
+			}
+			break;
+		case FORMAT_R4G4B4A4:
+			{
+				unsigned short rgba = *(unsigned short*)element;
+
+				r = (rgba & 0xF000) * (1.0f / 0xF000);
+				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
+				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
+				a = (rgba & 0x000F) * (1.0f / 0x000F);
+			}
+			break;
+		case FORMAT_R5G6B5:
+			{
+				unsigned short rgb = *(unsigned short*)element;
+
+				r = (rgb & 0xF800) * (1.0f / 0xF800);
+				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
+				b = (rgb & 0x001F) * (1.0f / 0x001F);
+			}
+			break;
+		case FORMAT_A1R5G5B5:
+			{
+				unsigned short argb = *(unsigned short*)element;
+
+				a = (argb & 0x8000) * (1.0f / 0x8000);
+				r = (argb & 0x7C00) * (1.0f / 0x7C00);
+				g = (argb & 0x03E0) * (1.0f / 0x03E0);
+				b = (argb & 0x001F) * (1.0f / 0x001F);
+			}
+			break;
+		case FORMAT_R5G5B5A1:
+			{
+				unsigned short rgba = *(unsigned short*)element;
+
+				r = (rgba & 0xF800) * (1.0f / 0xF800);
+				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
+				b = (rgba & 0x003E) * (1.0f / 0x003E);
+				a = (rgba & 0x0001) * (1.0f / 0x0001);
+			}
+			break;
+		case FORMAT_X1R5G5B5:
+			{
+				unsigned short xrgb = *(unsigned short*)element;
+
+				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
+				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
+				b = (xrgb & 0x001F) * (1.0f / 0x001F);
+			}
+			break;
+		case FORMAT_A8R8G8B8:
+			{
+				unsigned int argb = *(unsigned int*)element;
+
+				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
+				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
+				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
+				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
+			}
+			break;
+		case FORMAT_X8R8G8B8:
+			{
+				unsigned int xrgb = *(unsigned int*)element;
+
+				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
+				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
+				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
+			}
+			break;
+		case FORMAT_A8B8G8R8_SNORM:
+			{
+				signed char* abgr = (signed char*)element;
+
+				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
+				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
+				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
+				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
+			}
+			break;
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_A8:
+			{
+				unsigned int abgr = *(unsigned int*)element;
+
+				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
+				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
+				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
+				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
+			}
+			break;
+		case FORMAT_A8B8G8R8I:
+			{
+				signed char* abgr = (signed char*)element;
+
+				r = abgr[0];
+				g = abgr[1];
+				b = abgr[2];
+				a = abgr[3];
+			}
+			break;
+		case FORMAT_A8B8G8R8UI:
+			{
+				unsigned char* abgr = (unsigned char*)element;
+
+				r = abgr[0];
+				g = abgr[1];
+				b = abgr[2];
+				a = abgr[3];
+			}
+			break;
+		case FORMAT_X8B8G8R8_SNORM:
+			{
+				signed char* bgr = (signed char*)element;
+
+				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
+				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
+				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
+			}
+			break;
+		case FORMAT_X8B8G8R8:
+		case FORMAT_SRGB8_X8:
+			{
+				unsigned int xbgr = *(unsigned int*)element;
+
+				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
+				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
+				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
+			}
+			break;
+		case FORMAT_X8B8G8R8I:
+			{
+				signed char* bgr = (signed char*)element;
+
+				r = bgr[0];
+				g = bgr[1];
+				b = bgr[2];
+			}
+			break;
+		case FORMAT_X8B8G8R8UI:
+			{
+				unsigned char* bgr = (unsigned char*)element;
+
+				r = bgr[0];
+				g = bgr[1];
+				b = bgr[2];
+			}
+			break;
+		case FORMAT_G8R8_SNORM:
+			{
+				signed char* gr = (signed char*)element;
+
+				r = max(gr[0] * (1.0f / 0x7F), -1.0f);   // Signed normalized, decoded like the other *_SNORM cases: -128 clamps to -1.0.
+				g = max(gr[1] * (1.0f / 0x7F), -1.0f);   // Masking the signed byte with 0xFF00 / 0x00FF (as before) lost the magnitude or the sign.
+			}
+			break;
+		case FORMAT_G8R8:
+			{
+				unsigned short gr = *(unsigned short*)element;
+
+				g = (gr & 0xFF00) * (1.0f / 0xFF00);
+				r = (gr & 0x00FF) * (1.0f / 0x00FF);
+			}
+			break;
+		case FORMAT_G8R8I:
+			{
+				signed char* gr = (signed char*)element;
+
+				r = gr[0];
+				g = gr[1];
+			}
+			break;
+		case FORMAT_G8R8UI:
+			{
+				unsigned char* gr = (unsigned char*)element;
+
+				r = gr[0];
+				g = gr[1];
+			}
+			break;
+		case FORMAT_R16I:
+			r = *((short*)element);
+			break;
+		case FORMAT_R16UI:
+			r = *((unsigned short*)element);
+			break;
+		case FORMAT_G16R16I:
+			{
+				short* gr = (short*)element;
+
+				r = gr[0];
+				g = gr[1];
+			}
+			break;
+		case FORMAT_G16R16:
+			{
+				unsigned int gr = *(unsigned int*)element;
+
+				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
+				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
+			}
+			break;
+		case FORMAT_G16R16UI:
+			{
+				unsigned short* gr = (unsigned short*)element;
+
+				r = gr[0];
+				g = gr[1];
+			}
+			break;
+		case FORMAT_A2R10G10B10:
+			{
+				unsigned int argb = *(unsigned int*)element;
+
+				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
+				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
+				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
+				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
+			}
+			break;
+		case FORMAT_A2B10G10R10:
+			{
+				unsigned int abgr = *(unsigned int*)element;
+
+				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
+				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
+				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
+				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
+			}
+			break;
+		case FORMAT_A2B10G10R10UI:
+			{
+				unsigned int abgr = *(unsigned int*)element;
+
+				a = static_cast<float>((abgr & 0xC0000000) >> 30);
+				b = static_cast<float>((abgr & 0x3FF00000) >> 20);
+				g = static_cast<float>((abgr & 0x000FFC00) >> 10);
+				r = static_cast<float>(abgr & 0x000003FF);
+			}
+			break;
+		case FORMAT_A16B16G16R16I:
+			{
+				short* abgr = (short*)element;
+
+				r = abgr[0];
+				g = abgr[1];
+				b = abgr[2];
+				a = abgr[3];
+			}
+			break;
+		case FORMAT_A16B16G16R16:
+			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
+			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
+			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
+			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
+			break;
+		case FORMAT_A16B16G16R16UI:
+			{
+				unsigned short* abgr = (unsigned short*)element;
+
+				r = abgr[0];
+				g = abgr[1];
+				b = abgr[2];
+				a = abgr[3];
+			}
+			break;
+		case FORMAT_X16B16G16R16I:
+			{
+				short* bgr = (short*)element;
+
+				r = bgr[0];
+				g = bgr[1];
+				b = bgr[2];
+			}
+			break;
+		case FORMAT_X16B16G16R16UI:
+			{
+				unsigned short* bgr = (unsigned short*)element;
+
+				r = bgr[0];
+				g = bgr[1];
+				b = bgr[2];
+			}
+			break;
+		case FORMAT_A32B32G32R32I:
+			{
+				int* abgr = (int*)element;
+
+				r = static_cast<float>(abgr[0]);
+				g = static_cast<float>(abgr[1]);
+				b = static_cast<float>(abgr[2]);
+				a = static_cast<float>(abgr[3]);
+			}
+			break;
+		case FORMAT_A32B32G32R32UI:
+			{
+				unsigned int* abgr = (unsigned int*)element;
+
+				r = static_cast<float>(abgr[0]);
+				g = static_cast<float>(abgr[1]);
+				b = static_cast<float>(abgr[2]);
+				a = static_cast<float>(abgr[3]);
+			}
+			break;
+		case FORMAT_X32B32G32R32I:
+			{
+				int* bgr = (int*)element;
+
+				r = static_cast<float>(bgr[0]);
+				g = static_cast<float>(bgr[1]);
+				b = static_cast<float>(bgr[2]);
+			}
+			break;
+		case FORMAT_X32B32G32R32UI:
+			{
+				unsigned int* bgr = (unsigned int*)element;
+
+				r = static_cast<float>(bgr[0]);
+				g = static_cast<float>(bgr[1]);
+				b = static_cast<float>(bgr[2]);
+			}
+			break;
+		case FORMAT_G32R32I:
+			{
+				int* gr = (int*)element;
+
+				r = static_cast<float>(gr[0]);
+				g = static_cast<float>(gr[1]);
+			}
+			break;
+		case FORMAT_G32R32UI:
+			{
+				unsigned int* gr = (unsigned int*)element;
+
+				r = static_cast<float>(gr[0]);
+				g = static_cast<float>(gr[1]);
+			}
+			break;
+		case FORMAT_R32I:
+			r = static_cast<float>(*((int*)element));
+			break;
+		case FORMAT_R32UI:
+			r = static_cast<float>(*((unsigned int*)element));
+			break;
+		case FORMAT_V8U8:
+			{
+				unsigned short vu = *(unsigned short*)element;
+
+				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);   // Moving the field to the top bits of an int makes its sign bit the int's sign bit.
+				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
+			}
+			break;
+		case FORMAT_L6V5U5:
+			{
+				unsigned short lvu = *(unsigned short*)element;
+
+				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
+				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
+				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
+			}
+			break;
+		case FORMAT_Q8W8V8U8:
+			{
+				unsigned int qwvu = *(unsigned int*)element;
+
+				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
+				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
+				b = ((int)(qwvu & 0x00FF0000) << 8) * (1.0f / 0x7F000000);
+				a = ((int)(qwvu & 0xFF000000) << 0) * (1.0f / 0x7F000000);
+			}
+			break;
+		case FORMAT_X8L8V8U8:
+			{
+				unsigned int xlvu = *(unsigned int*)element;
+
+				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
+				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
+				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
+			}
+			break;
+		case FORMAT_R8G8B8:
+			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
+			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
+			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
+			break;
+		case FORMAT_B8G8R8:
+			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
+			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
+			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
+			break;
+		case FORMAT_V16U16:
+			{
+				unsigned int vu = *(unsigned int*)element;
+
+				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
+				g = ((int)(vu & 0xFFFF0000) << 0) * (1.0f / 0x7FFF0000);
+			}
+			break;
+		case FORMAT_A2W10V10U10:
+			{
+				unsigned int awvu = *(unsigned int*)element;
+
+				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
+				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
+				b = ((int)(awvu & 0x3FF00000) << 2) * (1.0f / 0x7FC00000);
+				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
+			}
+			break;
+		case FORMAT_A16W16V16U16:
+			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
+			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
+			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
+			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
+			break;
+		case FORMAT_Q16W16V16U16:
+			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
+			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
+			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
+			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
+			break;
+		case FORMAT_L8:
+			r =   // Luminance is replicated into red, green and blue.
+			g =
+			b = *(unsigned char*)element * (1.0f / 0xFF);
+			break;
+		case FORMAT_A4L4:
+			{
+				unsigned char al = *(unsigned char*)element;
+
+				r =
+				g =
+				b = (al & 0x0F) * (1.0f / 0x0F);
+				a = (al & 0xF0) * (1.0f / 0xF0);
+			}
+			break;
+		case FORMAT_L16:
+			r =
+			g =
+			b = *(unsigned short*)element * (1.0f / 0xFFFF);
+			break;
+		case FORMAT_A8L8:
+			r =
+			g =
+			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
+			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
+			break;
+		case FORMAT_L16F:
+			r =
+			g =
+			b = *(half*)element;
+			break;
+		case FORMAT_A16L16F:
+			r =
+			g =
+			b = ((half*)element)[0];
+			a = ((half*)element)[1];
+			break;
+		case FORMAT_L32F:
+			r =
+			g =
+			b = *(float*)element;
+			break;
+		case FORMAT_A32L32F:
+			r =
+			g =
+			b = ((float*)element)[0];
+			a = ((float*)element)[1];
+			break;
+		case FORMAT_A16F:
+			a = *(half*)element;
+			break;
+		case FORMAT_R16F:
+			r = *(half*)element;
+			break;
+		case FORMAT_G16R16F:
+			r = ((half*)element)[0];
+			g = ((half*)element)[1];
+			break;
+		case FORMAT_X16B16G16R16F:
+		case FORMAT_X16B16G16R16F_UNSIGNED:
+		case FORMAT_B16G16R16F:
+			r = ((half*)element)[0];
+			g = ((half*)element)[1];
+			b = ((half*)element)[2];
+			break;
+		case FORMAT_A16B16G16R16F:
+			r = ((half*)element)[0];
+			g = ((half*)element)[1];
+			b = ((half*)element)[2];
+			a = ((half*)element)[3];
+			break;
+		case FORMAT_A32F:
+			a = *(float*)element;
+			break;
+		case FORMAT_R32F:
+			r = *(float*)element;
+			break;
+		case FORMAT_G32R32F:
+			r = ((float*)element)[0];
+			g = ((float*)element)[1];
+			break;
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		case FORMAT_B32G32R32F:
+			r = ((float*)element)[0];
+			g = ((float*)element)[1];
+			b = ((float*)element)[2];
+			break;
+		case FORMAT_A32B32G32R32F:
+			r = ((float*)element)[0];
+			g = ((float*)element)[1];
+			b = ((float*)element)[2];
+			a = ((float*)element)[3];
+			break;
+		case FORMAT_D32F:
+		case FORMAT_D32FS8:
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+			r = *(float*)element;   // The depth value is broadcast to all four channels.
+			g = r;
+			b = r;
+			a = r;
+			break;
+		case FORMAT_D32F_COMPLEMENTARY:
+		case FORMAT_D32FS8_COMPLEMENTARY:
+			r = 1.0f - *(float*)element;   // Complementary depth is stored as 1 - depth.
+			g = r;
+			b = r;
+			a = r;
+			break;
+		case FORMAT_S8:
+			r = *(unsigned char*)element * (1.0f / 0xFF);
+			break;
+		default:
+			ASSERT(false);   // Format has no decoder here.
+		}
+		// For sRGB formats only the color channels are converted to linear; alpha is returned as stored.
+		if(isSRGBformat(format))
+		{
+			r = sRGBtoLinear(r);
+			g = sRGBtoLinear(g);
+			b = sRGBtoLinear(b);
+		}
+
+		return Color<float>(r, g, b, a);
+	}
+
+	Color<float> Surface::Buffer::sample(float x, float y, float z) const   // Trilinearly filtered read at (x, y, z); texel centers lie at integer + 0.5.
+	{
+		x -= 0.5f;
+		y -= 0.5f;
+		z -= 0.5f;
+
+		int x0 = clamp((int)x, 0, width - 1);
+		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;   // Past the last texel both taps read the same texel.
+
+		int y0 = clamp((int)y, 0, height - 1);
+		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
+
+		int z0 = clamp((int)z, 0, depth - 1);
+		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
+
+		Color<float> c000 = read(x0, y0, z0);
+		Color<float> c100 = read(x1, y0, z0);
+		Color<float> c010 = read(x0, y1, z0);
+		Color<float> c110 = read(x1, y1, z0);
+		Color<float> c001 = read(x0, y0, z1);
+		Color<float> c101 = read(x1, y0, z1);
+		Color<float> c011 = read(x0, y1, z1);
+		Color<float> c111 = read(x1, y1, z1);
+		// Before the first texel center the fraction would be negative and the weights would extrapolate; clamp to the edge texel instead.
+		float fx = max(x - x0, 0.0f);
+		float fy = max(y - y0, 0.0f);
+		float fz = max(z - z0, 0.0f);
+
+		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
+		c100 *= fx * (1 - fy) * (1 - fz);
+		c010 *= (1 - fx) * fy * (1 - fz);
+		c110 *= fx * fy * (1 - fz);
+		c001 *= (1 - fx) * (1 - fy) * fz;
+		c101 *= fx * (1 - fy) * fz;
+		c011 *= (1 - fx) * fy * fz;
+		c111 *= fx * fy * fz;
+
+		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
+	}
+
+	Color<float> Surface::Buffer::sample(float x, float y, int layer) const   // Bilinearly filtered read at (x, y) within slice 'layer'; texel centers lie at integer + 0.5.
+	{
+		x -= 0.5f;
+		y -= 0.5f;
+
+		int x0 = clamp((int)x, 0, width - 1);
+		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;   // Past the last texel both taps read the same texel.
+
+		int y0 = clamp((int)y, 0, height - 1);
+		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
+
+		Color<float> c00 = read(x0, y0, layer);
+		Color<float> c10 = read(x1, y0, layer);
+		Color<float> c01 = read(x0, y1, layer);
+		Color<float> c11 = read(x1, y1, layer);
+		// Before the first texel center the fraction would be negative and the weights would extrapolate; clamp to the edge texel instead.
+		float fx = max(x - x0, 0.0f);
+		float fy = max(y - y0, 0.0f);
+
+		c00 *= (1 - fx) * (1 - fy);
+		c10 *= fx * (1 - fy);
+		c01 *= (1 - fx) * fy;
+		c11 *= fx * fy;
+
+		return c00 + c10 + c01 + c11;
+	}
+
+ void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock) // Records the lock mode and returns the address of the element (or compressed block) at (x, y, z); nullptr if no buffer is allocated.
+ {
+ this->lock = lock;
+ // Write-capable lock modes flag this buffer as dirty; read-only style modes leave the flag alone.
+ switch(lock)
+ {
+ case LOCK_UNLOCKED:
+ case LOCK_READONLY:
+ case LOCK_UPDATE:
+ break;
+ case LOCK_WRITEONLY:
+ case LOCK_READWRITE:
+ case LOCK_DISCARD:
+ dirty = true;
+ break;
+ default:
+ ASSERT(false);
+ }
+ // The lock mode and dirty flag are updated above even when there is no storage to address.
+ if(buffer)
+ {
+ x += border; // Offset by the border width before computing the address.
+ y += border;
+ // Compressed formats: x and y are divided by the block footprint and each block step in x advances 8 or 16 bytes.
+ switch(format)
+ {
+ case FORMAT_DXT1:
+ case FORMAT_ATI1:
+ case FORMAT_ETC1:
+ case FORMAT_R11_EAC:
+ case FORMAT_SIGNED_R11_EAC:
+ case FORMAT_RGB8_ETC2:
+ case FORMAT_SRGB8_ETC2:
+ case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+ case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+ return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
+ case FORMAT_RG11_EAC:
+ case FORMAT_SIGNED_RG11_EAC:
+ case FORMAT_RGBA8_ETC2_EAC:
+ case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
+ case FORMAT_RGBA_ASTC_4x4_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
+ return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
+ case FORMAT_RGBA_ASTC_5x4_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
+ return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
+ case FORMAT_RGBA_ASTC_5x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
+ return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
+ case FORMAT_RGBA_ASTC_6x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
+ return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
+ case FORMAT_RGBA_ASTC_6x6_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
+ return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
+ case FORMAT_RGBA_ASTC_8x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
+ return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
+ case FORMAT_RGBA_ASTC_8x6_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
+ return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
+ case FORMAT_RGBA_ASTC_8x8_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
+ return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
+ case FORMAT_RGBA_ASTC_10x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
+ return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
+ case FORMAT_RGBA_ASTC_10x6_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
+ return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
+ case FORMAT_RGBA_ASTC_10x8_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
+ return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
+ case FORMAT_RGBA_ASTC_10x10_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
+ return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
+ case FORMAT_RGBA_ASTC_12x10_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
+ return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
+ case FORMAT_RGBA_ASTC_12x12_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
+ return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
+ case FORMAT_DXT3:
+ case FORMAT_DXT5:
+ case FORMAT_ATI2:
+ return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
+ default:
+ return (unsigned char*)buffer + x * bytes + y * pitchB + z * samples * sliceB; // Uncompressed path; unlike the cases above, the slice stride is scaled by the sample count.
+ }
+ }
+
+ return nullptr;
+ }
+
+ void Surface::Buffer::unlockRect() // Resets the lock mode recorded by lockRect(); the dirty flag is not touched.
+ {
+ lock = LOCK_UNLOCKED;
+ }
+
+ class SurfaceImplementation : public Surface // Concrete Surface returned by Surface::create(); adds no state of its own.
+ {
+ public:
+ SurfaceImplementation(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
+ : Surface(width, height, depth, format, pixels, pitch, slice) {}
+ SurfaceImplementation(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0)
+ : Surface(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchP) {}
+ ~SurfaceImplementation() override {};
+ // Forwards unchanged to the base-class implementation.
+ void *lockInternal(int x, int y, int z, Lock lock, Accessor client) override
+ {
+ return Surface::lockInternal(x, y, z, lock, client);
+ }
+ // Forwards unchanged to the base-class implementation.
+ void unlockInternal() override
+ {
+ Surface::unlockInternal();
+ }
+ };
+
+ Surface *Surface::create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) // Factory for a surface over caller-provided pixel memory; returns a new SurfaceImplementation.
+ {
+ return new SurfaceImplementation(width, height, depth, format, pixels, pitch, slice);
+ }
+
+ Surface *Surface::create(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided) // Factory for a surface whose buffers are allocated on demand; returns a new SurfaceImplementation.
+ {
+ return new SurfaceImplementation(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchPprovided);
+ }
+
+ Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false) // Builds a surface whose external buffer is the caller's 'pixels' memory.
+ {
+ resource = new Resource(0);
+ hasParent = false; // The resource was created here, so the destructor destroys it.
+ ownExternal = false; // The destructor will not deallocate 'pixels'.
+ depth = max(1, depth); // A depth below 1 is treated as a single slice.
+ // External buffer: the caller's memory, using the byte pitch and slice size that were passed in.
+ external.buffer = pixels;
+ external.width = width;
+ external.height = height;
+ external.depth = depth;
+ external.samples = 1;
+ external.format = format;
+ external.bytes = bytes(external.format);
+ external.pitchB = pitch;
+ external.pitchP = external.bytes ? pitch / external.bytes : 0; // Guard against division by zero for formats whose bytes() is 0.
+ external.sliceB = slice;
+ external.sliceP = external.bytes ? slice / external.bytes : 0;
+ external.border = 0;
+ external.lock = LOCK_UNLOCKED;
+ external.dirty = true; // Makes lockInternal() update the internal buffer from the external one (unless locked with LOCK_DISCARD).
+ // Internal buffer: not allocated yet; uses the format chosen by selectInternalFormat() and computed pitches.
+ internal.buffer = nullptr;
+ internal.width = width;
+ internal.height = height;
+ internal.depth = depth;
+ internal.samples = 1;
+ internal.format = selectInternalFormat(format);
+ internal.bytes = bytes(internal.format);
+ internal.pitchB = pitchB(internal.width, 0, internal.format, false);
+ internal.pitchP = pitchP(internal.width, 0, internal.format, false);
+ internal.sliceB = sliceB(internal.width, internal.height, 0, internal.format, false);
+ internal.sliceP = sliceP(internal.width, internal.height, 0, internal.format, false);
+ internal.border = 0;
+ internal.lock = LOCK_UNLOCKED;
+ internal.dirty = false;
+ // Stencil buffer: FORMAT_S8 when isStencil(format) holds, otherwise FORMAT_NULL; not allocated yet.
+ stencil.buffer = nullptr;
+ stencil.width = width;
+ stencil.height = height;
+ stencil.depth = depth;
+ stencil.samples = 1;
+ stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
+ stencil.bytes = bytes(stencil.format);
+ stencil.pitchB = pitchB(stencil.width, 0, stencil.format, false);
+ stencil.pitchP = pitchP(stencil.width, 0, stencil.format, false);
+ stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, false);
+ stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, false);
+ stencil.border = 0;
+ stencil.lock = LOCK_UNLOCKED;
+ stencil.dirty = false;
+
+ dirtyContents = true;
+ paletteUsed = 0;
+ }
+
+ Surface::Surface(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget) // Builds a surface whose buffers start unallocated; 'texture' (may be null) supplies the shared resource.
+ {
+ resource = texture ? texture : new Resource(0); // Without a parent texture the surface creates its own resource.
+ hasParent = texture != nullptr;
+ ownExternal = true; // The destructor deallocates the external buffer.
+ depth = max(1, depth); // Depth and sample count are raised to at least 1.
+ samples = max(1, samples);
+ // External buffer: no border; a non-zero pitchPprovided (in pixels) overrides the computed pitch, slice sizes are always computed.
+ external.buffer = nullptr;
+ external.width = width;
+ external.height = height;
+ external.depth = depth;
+ external.samples = (short)samples;
+ external.format = format;
+ external.bytes = bytes(external.format);
+ external.pitchB = !pitchPprovided ? pitchB(external.width, 0, external.format, renderTarget && !texture) : pitchPprovided * external.bytes;
+ external.pitchP = !pitchPprovided ? pitchP(external.width, 0, external.format, renderTarget && !texture) : pitchPprovided;
+ external.sliceB = sliceB(external.width, external.height, 0, external.format, renderTarget && !texture);
+ external.sliceP = sliceP(external.width, external.height, 0, external.format, renderTarget && !texture);
+ external.border = 0;
+ external.lock = LOCK_UNLOCKED;
+ external.dirty = false;
+ // Internal buffer: the only one that carries 'border'; uses the format chosen by selectInternalFormat().
+ internal.buffer = nullptr;
+ internal.width = width;
+ internal.height = height;
+ internal.depth = depth;
+ internal.samples = (short)samples;
+ internal.format = selectInternalFormat(format);
+ internal.bytes = bytes(internal.format);
+ internal.pitchB = !pitchPprovided ? pitchB(internal.width, border, internal.format, renderTarget) : pitchPprovided * internal.bytes;
+ internal.pitchP = !pitchPprovided ? pitchP(internal.width, border, internal.format, renderTarget) : pitchPprovided;
+ internal.sliceB = sliceB(internal.width, internal.height, border, internal.format, renderTarget);
+ internal.sliceP = sliceP(internal.width, internal.height, border, internal.format, renderTarget);
+ internal.border = (short)border;
+ internal.lock = LOCK_UNLOCKED;
+ internal.dirty = false;
+ // Stencil buffer: FORMAT_S8 when isStencil(format) holds, otherwise FORMAT_NULL; pitchPprovided is not applied here.
+ stencil.buffer = nullptr;
+ stencil.width = width;
+ stencil.height = height;
+ stencil.depth = depth;
+ stencil.samples = (short)samples;
+ stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
+ stencil.bytes = bytes(stencil.format);
+ stencil.pitchB = pitchB(stencil.width, 0, stencil.format, renderTarget);
+ stencil.pitchP = pitchP(stencil.width, 0, stencil.format, renderTarget);
+ stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, renderTarget);
+ stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, renderTarget);
+ stencil.border = 0;
+ stencil.lock = LOCK_UNLOCKED;
+ stencil.dirty = false;
+
+ dirtyContents = true;
+ paletteUsed = 0;
+ }
+
+ Surface::~Surface() // Releases the surface's own resource (if any) and the buffers it is responsible for.
+ {
+ // sync() must be called before this destructor to ensure all locks have been released.
+ // We can't call it here because the parent resource may already have been destroyed.
+ ASSERT(isUnlocked());
+ // Only a resource created by the constructor (no parent texture) is destroyed here.
+ if(!hasParent)
+ {
+ resource->destruct();
+ }
+ // Caller-provided pixel memory (ownExternal == false) is left alone.
+ if(ownExternal)
+ {
+ deallocate(external.buffer);
+ }
+ // The internal buffer may alias the external one (see lockExternal()/lockInternal()); skip it in that case.
+ if(internal.buffer != external.buffer)
+ {
+ deallocate(internal.buffer);
+ }
+
+ deallocate(stencil.buffer);
+
+ external.buffer = nullptr;
+ internal.buffer = nullptr;
+ stencil.buffer = nullptr;
+ }
+
+ void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client) // Locks the resource for 'client' and returns the address of (x, y, z) in the external buffer, refreshed from the internal buffer if needed.
+ {
+ resource->lock(client);
+ // Create the external buffer on first use, sharing the internal allocation when identicalBuffers() allows it.
+ if(!external.buffer)
+ {
+ if(internal.buffer && identicalBuffers())
+ {
+ external.buffer = internal.buffer;
+ }
+ else
+ {
+ external.buffer = allocateBuffer(external.width, external.height, external.depth, external.border, external.samples, external.format);
+ }
+ }
+ // Pending internal changes are copied out first; LOCK_DISCARD skips the copy but still clears the flag.
+ if(internal.dirty)
+ {
+ if(lock != LOCK_DISCARD)
+ {
+ update(external, internal);
+ }
+
+ internal.dirty = false;
+ }
+ // Write-capable locks set dirtyContents; modes not listed here trip the assertion.
+ switch(lock)
+ {
+ case LOCK_READONLY:
+ break;
+ case LOCK_WRITEONLY:
+ case LOCK_READWRITE:
+ case LOCK_DISCARD:
+ dirtyContents = true;
+ break;
+ default:
+ ASSERT(false);
+ }
+ // Buffer::lockRect() records the mode and, for write-capable locks, marks the external buffer dirty.
+ return external.lockRect(x, y, z, lock);
+ }
+
+ void Surface::unlockExternal() // Counterpart of lockExternal().
+ {
+ external.unlockRect();
+ // Release the resource lock taken in lockExternal().
+ resource->unlock();
+ }
+
+ void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client) // Returns the address of (x, y, z) in the internal buffer, refreshed from the external buffer if needed.
+ {
+ if(lock != LOCK_UNLOCKED) // LOCK_UNLOCKED requests the address without taking the resource lock.
+ {
+ resource->lock(client);
+ }
+ // Create the internal buffer on first use, sharing the external allocation when identicalBuffers() allows it.
+ if(!internal.buffer)
+ {
+ if(external.buffer && identicalBuffers())
+ {
+ internal.buffer = external.buffer;
+ }
+ else
+ {
+ internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.border, internal.samples, internal.format);
+ }
+ }
+
+ // FIXME: WHQL requires conversion to lower external precision and back
+ if(logPrecision >= WHQL)
+ {
+ if(internal.dirty && renderTarget && internal.format != external.format)
+ {
+ if(lock != LOCK_DISCARD)
+ {
+ switch(external.format)
+ {
+ case FORMAT_R3G3B2:
+ case FORMAT_A8R3G3B2:
+ case FORMAT_A1R5G5B5:
+ case FORMAT_A2R10G10B10:
+ case FORMAT_A2B10G10R10:
+ lockExternal(0, 0, 0, LOCK_READWRITE, client); // Copies internal -> external and marks external dirty, so the update below copies it back.
+ unlockExternal();
+ break;
+ default:
+ // Difference passes WHQL
+ break;
+ }
+ }
+ }
+ }
+ // Refresh from the external buffer when it changed, or when a paletted format was last converted with a different palette ID.
+ if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
+ {
+ if(lock != LOCK_DISCARD)
+ {
+ update(internal, external);
+ }
+
+ external.dirty = false;
+ paletteUsed = Surface::paletteID;
+ }
+ // Write-capable locks set dirtyContents; unknown modes trip the assertion.
+ switch(lock)
+ {
+ case LOCK_UNLOCKED:
+ case LOCK_READONLY:
+ break;
+ case LOCK_WRITEONLY:
+ case LOCK_READWRITE:
+ case LOCK_DISCARD:
+ dirtyContents = true;
+ break;
+ default:
+ ASSERT(false);
+ }
+ // NOTE(review): resolve() is defined outside this chunk; presumably it resolves multisampled contents before a public read - confirm.
+ if(lock == LOCK_READONLY && client == PUBLIC)
+ {
+ resolve();
+ }
+ // Buffer::lockRect() records the mode and, for write-capable locks, marks the internal buffer dirty.
+ return internal.lockRect(x, y, z, lock);
+ }
+
+ void Surface::unlockInternal() // Counterpart of lockInternal().
+ {
+ internal.unlockRect();
+ // Unconditional, whereas lockInternal() skips resource->lock() for LOCK_UNLOCKED.
+ resource->unlock();
+ }
+
+ void *Surface::lockStencil(int x, int y, int front, Accessor client) // Locks the resource and returns the address of (x, y) in stencil slice 'front', or nullptr when there is no stencil format.
+ {
+ resource->lock(client);
+ // NOTE(review): the nullptr return below leaves the resource locked; presumably callers always pair this with unlockStencil() - confirm.
+ if(stencil.format == FORMAT_NULL)
+ {
+ return nullptr;
+ }
+ // The stencil buffer is allocated on first lock.
+ if(!stencil.buffer)
+ {
+ stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.border, stencil.samples, stencil.format);
+ }
+
+ return stencil.lockRect(x, y, front, LOCK_READWRITE); // FIXME
+ }
+
+ void Surface::unlockStencil() // Counterpart of lockStencil().
+ {
+ stencil.unlockRect();
+ // Release the resource lock taken in lockStencil().
+ resource->unlock();
+ }
+
+ int Surface::bytes(Format format)
+ {
+ switch(format)
+ {
+ case FORMAT_NULL: return 0;
+ case FORMAT_P8: return 1;
+ case FORMAT_A8P8: return 2;
+ case FORMAT_A8: return 1;
+ case FORMAT_R8I: return 1;
+ case FORMAT_R8: return 1;
+ case FORMAT_R3G3B2: return 1;
+ case FORMAT_R16I: return 2;
+ case FORMAT_R16UI: return 2;
+ case FORMAT_A8R3G3B2: return 2;
+ case FORMAT_R5G6B5: return 2;
+ case FORMAT_A1R5G5B5: return 2;
+ case FORMAT_X1R5G5B5: return 2;
+ case FORMAT_R5G5B5A1: return 2;
+ case FORMAT_X4R4G4B4: return 2;
+ case FORMAT_A4R4G4B4: return 2;
+ case FORMAT_R4G4B4A4: return 2;
+ case FORMAT_R8G8B8: return 3;
+ case FORMAT_B8G8R8: return 3;
+ case FORMAT_R32I: return 4;
+ case FORMAT_R32UI: return 4;
+ case FORMAT_X8R8G8B8: return 4;
+ // case FORMAT_X8G8R8B8Q: return 4;
+ case FORMAT_A8R8G8B8: return 4;
+ // case FORMAT_A8G8R8B8Q: return 4;
+ case FORMAT_X8B8G8R8I: return 4;
+ case FORMAT_X8B8G8R8: return 4;
+ case FORMAT_SRGB8_X8: return 4;
+ case FORMAT_SRGB8_A8: return 4;
+ case FORMAT_A8B8G8R8I: return 4;
+ case FORMAT_R8UI: return 1;
+ case FORMAT_G8R8UI: return 2;
+ case FORMAT_X8B8G8R8UI: return 4;
+ case FORMAT_A8B8G8R8UI: return 4;
+ case FORMAT_A8B8G8R8: return 4;
+ case FORMAT_R8_SNORM: return 1;
+ case FORMAT_G8R8_SNORM: return 2;
+ case FORMAT_X8B8G8R8_SNORM: return 4;
+ case FORMAT_A8B8G8R8_SNORM: return 4;
+ case FORMAT_A2R10G10B10: return 4;
+ case FORMAT_A2B10G10R10: return 4;
+ case FORMAT_A2B10G10R10UI: return 4;
+ case FORMAT_G8R8I: return 2;
+ case FORMAT_G8R8: return 2;
+ case FORMAT_G16R16I: return 4;
+ case FORMAT_G16R16UI: return 4;
+ case FORMAT_G16R16: return 4;
+ case FORMAT_G32R32I: return 8;
+ case FORMAT_G32R32UI: return 8;
+ case FORMAT_X16B16G16R16I: return 8;
+ case FORMAT_X16B16G16R16UI: return 8;
+ case FORMAT_A16B16G16R16I: return 8;
+ case FORMAT_A16B16G16R16UI: return 8;
+ case FORMAT_A16B16G16R16: return 8;
+ case FORMAT_X32B32G32R32I: return 16;
+ case FORMAT_X32B32G32R32UI: return 16;
+ case FORMAT_A32B32G32R32I: return 16;
+ case FORMAT_A32B32G32R32UI: return 16;
+ // Compressed formats
+ case FORMAT_DXT1: return 2; // Column of four pixels
+ case FORMAT_DXT3: return 4; // Column of four pixels
+ case FORMAT_DXT5: return 4; // Column of four pixels
+ case FORMAT_ATI1: return 2; // Column of four pixels
+ case FORMAT_ATI2: return 4; // Column of four pixels
+ case FORMAT_ETC1: return 2; // Column of four pixels
+ case FORMAT_R11_EAC: return 2;
+ case FORMAT_SIGNED_R11_EAC: return 2;
+ case FORMAT_RG11_EAC: return 4;
+ case FORMAT_SIGNED_RG11_EAC: return 4;
+ case FORMAT_RGB8_ETC2: return 2;
+ case FORMAT_SRGB8_ETC2: return 2;
+ case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2: return 2;
+ case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: return 2;
+ case FORMAT_RGBA8_ETC2_EAC: return 4;
+ case FORMAT_SRGB8_ALPHA8_ETC2_EAC: return 4;
+ case FORMAT_RGBA_ASTC_4x4_KHR:
+ case FORMAT_RGBA_ASTC_5x4_KHR:
+ case FORMAT_RGBA_ASTC_5x5_KHR:
+ case FORMAT_RGBA_ASTC_6x5_KHR:
+ case FORMAT_RGBA_ASTC_6x6_KHR:
+ case FORMAT_RGBA_ASTC_8x5_KHR:
+ case FORMAT_RGBA_ASTC_8x6_KHR:
+ case FORMAT_RGBA_ASTC_8x8_KHR:
+ case FORMAT_RGBA_ASTC_10x5_KHR:
+ case FORMAT_RGBA_ASTC_10x6_KHR:
+ case FORMAT_RGBA_ASTC_10x8_KHR:
+ case FORMAT_RGBA_ASTC_10x10_KHR:
+ case FORMAT_RGBA_ASTC_12x10_KHR:
+ case FORMAT_RGBA_ASTC_12x12_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
+ // Bumpmap formats
+ case FORMAT_V8U8: return 2;
+ case FORMAT_L6V5U5: return 2;
+ case FORMAT_Q8W8V8U8: return 4;
+ case FORMAT_X8L8V8U8: return 4;
+ case FORMAT_A2W10V10U10: return 4;
+ case FORMAT_V16U16: return 4;
+ case FORMAT_A16W16V16U16: return 8;
+ case FORMAT_Q16W16V16U16: return 8;
+ // Luminance formats
+ case FORMAT_L8: return 1;
+ case FORMAT_A4L4: return 1;
+ case FORMAT_L16: return 2;
+ case FORMAT_A8L8: return 2;
+ case FORMAT_L16F: return 2;
+ case FORMAT_A16L16F: return 4;
+ case FORMAT_L32F: return 4;
+ case FORMAT_A32L32F: return 8;
+ // Floating-point formats
+ case FORMAT_A16F: return 2;
+ case FORMAT_R16F: return 2;
+ case FORMAT_G16R16F: return 4;
+ case FORMAT_B16G16R16F: return 6;
+ case FORMAT_X16B16G16R16F: return 8;
+ case FORMAT_A16B16G16R16F: return 8;
+ case FORMAT_X16B16G16R16F_UNSIGNED: return 8;
+ case FORMAT_A32F: return 4;
+ case FORMAT_R32F: return 4;
+ case FORMAT_G32R32F: return 8;
+ case FORMAT_B32G32R32F: return 12;
+ case FORMAT_X32B32G32R32F: return 16;
+ case FORMAT_A32B32G32R32F: return 16;
+ case FORMAT_X32B32G32R32F_UNSIGNED: return 16;
+ // Depth/stencil formats
+ case FORMAT_D16: return 2;
+ case FORMAT_D32: return 4;
+ case FORMAT_D24X8: return 4;
+ case FORMAT_D24S8: return 4;
+ case FORMAT_D24FS8: return 4;
+ case FORMAT_D32F: return 4;
+ case FORMAT_D32FS8: return 4;
+ case FORMAT_D32F_COMPLEMENTARY: return 4;
+ case FORMAT_D32FS8_COMPLEMENTARY: return 4;
+ case FORMAT_D32F_LOCKABLE: return 4;
+ case FORMAT_D32FS8_TEXTURE: return 4;
+ case FORMAT_D32F_SHADOW: return 4;
+ case FORMAT_D32FS8_SHADOW: return 4;
+ case FORMAT_DF24S8: return 4;
+ case FORMAT_DF16S8: return 2;
+ case FORMAT_INTZ: return 4;
+ case FORMAT_S8: return 1;
+ case FORMAT_YV12_BT601: return 1; // Y plane only
+ case FORMAT_YV12_BT709: return 1; // Y plane only
+ case FORMAT_YV12_JFIF: return 1; // Y plane only
+ default:
+ ASSERT(false);
+ }
+
+ return 0;
+ }
+
+ // Returns the pitch in bytes for a surface of the given width and border.
+ // For block-compressed formats the pitch covers one row of blocks (or, for
+ // the ATI formats, one texel row's share of the blocks); for all other
+ // formats it is the per-texel byte size times the padded width.
+ int Surface::pitchB(int width, int border, Format format, bool target)
+ {
+     width += 2 * border;
+
+     // Render targets (and depth/stencil surfaces) are processed in 2x2 quads,
+     // so their width is rounded up to an even number of texels.
+     if(target || isDepth(format) || isStencil(format))
+     {
+         width = align<2>(width);
+     }
+
+     int blockWidth = 0;   // Texels spanned horizontally by one block
+     int blockBytes = 0;   // Bytes each block contributes to the pitch
+
+     switch(format)
+     {
+     // 64 bit per 4x4 block, pitch computed per 4 rows
+     case FORMAT_DXT1:
+     case FORMAT_ETC1:
+     case FORMAT_R11_EAC:
+     case FORMAT_SIGNED_R11_EAC:
+     case FORMAT_RGB8_ETC2:
+     case FORMAT_SRGB8_ETC2:
+     case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+     case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+         blockWidth = 4;
+         blockBytes = 8;
+         break;
+     // 128 bit per 4x4 block, pitch computed per 4 rows
+     case FORMAT_RG11_EAC:
+     case FORMAT_SIGNED_RG11_EAC:
+     case FORMAT_RGBA8_ETC2_EAC:
+     case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
+     case FORMAT_RGBA_ASTC_4x4_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
+     case FORMAT_DXT3:
+     case FORMAT_DXT5:
+         blockWidth = 4;
+         blockBytes = 16;
+         break;
+     // Wider ASTC footprints: still 128 bit per block
+     case FORMAT_RGBA_ASTC_5x4_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
+     case FORMAT_RGBA_ASTC_5x5_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
+         blockWidth = 5;
+         blockBytes = 16;
+         break;
+     case FORMAT_RGBA_ASTC_6x5_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
+     case FORMAT_RGBA_ASTC_6x6_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
+         blockWidth = 6;
+         blockBytes = 16;
+         break;
+     case FORMAT_RGBA_ASTC_8x5_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
+     case FORMAT_RGBA_ASTC_8x6_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
+     case FORMAT_RGBA_ASTC_8x8_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
+         blockWidth = 8;
+         blockBytes = 16;
+         break;
+     case FORMAT_RGBA_ASTC_10x5_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
+     case FORMAT_RGBA_ASTC_10x6_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
+     case FORMAT_RGBA_ASTC_10x8_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
+     case FORMAT_RGBA_ASTC_10x10_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
+         blockWidth = 10;
+         blockBytes = 16;
+         break;
+     case FORMAT_RGBA_ASTC_12x10_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
+     case FORMAT_RGBA_ASTC_12x12_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
+         blockWidth = 12;
+         blockBytes = 16;
+         break;
+     // 64 bit per 4x4 block, pitch computed per row
+     case FORMAT_ATI1:
+         blockWidth = 4;
+         blockBytes = 2;
+         break;
+     // 128 bit per 4x4 block, pitch computed per row
+     case FORMAT_ATI2:
+         blockWidth = 4;
+         blockBytes = 4;
+         break;
+     case FORMAT_YV12_BT601:
+     case FORMAT_YV12_BT709:
+     case FORMAT_YV12_JFIF:
+         return align<16>(width);
+     default:
+         return bytes(format) * width;
+     }
+
+     // Round the width up to a whole number of blocks.
+     return blockBytes * ((width + blockWidth - 1) / blockWidth);
+ }
+
+ // Returns the pitch expressed in units of bytes(format) rather than bytes.
+ // Formats for which bytes() reports no positive size yield 0.
+ int Surface::pitchP(int width, int border, Format format, bool target)
+ {
+     const int bytesPerElement = bytes(format);
+
+     if(bytesPerElement <= 0)
+     {
+         return 0;   // Avoid dividing by zero
+     }
+
+     return pitchB(width, border, format, target) / bytesPerElement;
+ }
+
+ // Returns the size in bytes of one slice: the pitch from pitchB() multiplied
+ // by the number of pitch units needed to cover the padded height.
+ int Surface::sliceB(int width, int height, int border, Format format, bool target)
+ {
+     height += 2 * border;
+
+     // Render targets (and depth/stencil surfaces) are processed in 2x2 quads,
+     // so their height is rounded up to an even number of rows.
+     if(target || isDepth(format) || isStencil(format))
+     {
+         height = align<2>(height);
+     }
+
+     int rowsPerPitch = 1;   // Texel rows covered by one pitch unit
+
+     switch(format)
+     {
+     // Pitch computed per 4 rows
+     case FORMAT_DXT1:
+     case FORMAT_DXT3:
+     case FORMAT_DXT5:
+     case FORMAT_ETC1:
+     case FORMAT_R11_EAC:
+     case FORMAT_SIGNED_R11_EAC:
+     case FORMAT_RG11_EAC:
+     case FORMAT_SIGNED_RG11_EAC:
+     case FORMAT_RGB8_ETC2:
+     case FORMAT_SRGB8_ETC2:
+     case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+     case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+     case FORMAT_RGBA8_ETC2_EAC:
+     case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
+     case FORMAT_RGBA_ASTC_4x4_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
+     case FORMAT_RGBA_ASTC_5x4_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
+         rowsPerPitch = 4;
+         break;
+     // Pitch computed per 5 rows
+     case FORMAT_RGBA_ASTC_5x5_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
+     case FORMAT_RGBA_ASTC_6x5_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
+     case FORMAT_RGBA_ASTC_8x5_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
+     case FORMAT_RGBA_ASTC_10x5_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
+         rowsPerPitch = 5;
+         break;
+     // Pitch computed per 6 rows
+     case FORMAT_RGBA_ASTC_6x6_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
+     case FORMAT_RGBA_ASTC_8x6_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
+     case FORMAT_RGBA_ASTC_10x6_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
+         rowsPerPitch = 6;
+         break;
+     // Pitch computed per 8 rows
+     case FORMAT_RGBA_ASTC_8x8_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
+     case FORMAT_RGBA_ASTC_10x8_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
+         rowsPerPitch = 8;
+         break;
+     // Pitch computed per 10 rows
+     case FORMAT_RGBA_ASTC_10x10_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
+     case FORMAT_RGBA_ASTC_12x10_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
+         rowsPerPitch = 10;
+         break;
+     // Pitch computed per 12 rows
+     case FORMAT_RGBA_ASTC_12x12_KHR:
+     case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
+         rowsPerPitch = 12;
+         break;
+     // Pitch computed per row, but the height is padded to whole 4-row blocks
+     case FORMAT_ATI1:
+     case FORMAT_ATI2:
+         return pitchB(width, border, format, target) * align<4>(height);
+     // Pitch computed per row
+     default:
+         break;
+     }
+
+     // Round the height up to a whole number of pitch units.
+     return pitchB(width, border, format, target) * ((height + rowsPerPitch - 1) / rowsPerPitch);
+ }
+
+ // Returns the slice size expressed in units of bytes(format) rather than bytes.
+ // Formats for which bytes() reports no positive size yield 0.
+ int Surface::sliceP(int width, int height, int border, Format format, bool target)
+ {
+     const int bytesPerElement = bytes(format);
+
+     if(bytesPerElement <= 0)
+     {
+         return 0;   // Avoid dividing by zero
+     }
+
+     return sliceB(width, height, border, format, target) / bytesPerElement;
+ }
+
+ // Brings 'destination' up to date from 'source', selecting a decoder by the
+ // source format. Nothing happens when both refer to the same storage.
+ //
+ // FIXME: None of the specialized decoders below check the destination format.
+ void Surface::update(Buffer &destination, Buffer &source)
+ {
+     // ASSERT(source.lock != LOCK_UNLOCKED);
+     // ASSERT(destination.lock != LOCK_UNLOCKED);
+
+     if(destination.buffer == source.buffer)
+     {
+         return;   // Shared storage, nothing to convert
+     }
+
+     ASSERT(source.dirty && !destination.dirty);
+
+     switch(source.format)
+     {
+     // Packed and palettized formats
+     case FORMAT_R8G8B8: decodeR8G8B8(destination, source); break;
+     case FORMAT_X1R5G5B5: decodeX1R5G5B5(destination, source); break;
+     case FORMAT_A1R5G5B5: decodeA1R5G5B5(destination, source); break;
+     case FORMAT_X4R4G4B4: decodeX4R4G4B4(destination, source); break;
+     case FORMAT_A4R4G4B4: decodeA4R4G4B4(destination, source); break;
+     case FORMAT_P8: decodeP8(destination, source); break;
+     // S3TC / ATI block formats
+     case FORMAT_DXT1: decodeDXT1(destination, source); break;
+     case FORMAT_DXT3: decodeDXT3(destination, source); break;
+     case FORMAT_DXT5: decodeDXT5(destination, source); break;
+     case FORMAT_ATI1: decodeATI1(destination, source); break;
+     case FORMAT_ATI2: decodeATI2(destination, source); break;
+     // EAC
+     case FORMAT_R11_EAC: decodeEAC(destination, source, 1, false); break;
+     case FORMAT_SIGNED_R11_EAC: decodeEAC(destination, source, 1, true); break;
+     case FORMAT_RG11_EAC: decodeEAC(destination, source, 2, false); break;
+     case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true); break;
+     // ETC1 (handled by the same decoder call as RGB8 ETC2) and ETC2
+     case FORMAT_ETC1:
+     case FORMAT_RGB8_ETC2: decodeETC2(destination, source, 0, false); break;
+     case FORMAT_SRGB8_ETC2: decodeETC2(destination, source, 0, true); break;
+     case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, false); break;
+     case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true); break;
+     case FORMAT_RGBA8_ETC2_EAC: decodeETC2(destination, source, 8, false); break;
+     case FORMAT_SRGB8_ALPHA8_ETC2_EAC: decodeETC2(destination, source, 8, true); break;
+     // ASTC, linear
+     case FORMAT_RGBA_ASTC_4x4_KHR: decodeASTC(destination, source, 4, 4, 1, false); break;
+     case FORMAT_RGBA_ASTC_5x4_KHR: decodeASTC(destination, source, 5, 4, 1, false); break;
+     case FORMAT_RGBA_ASTC_5x5_KHR: decodeASTC(destination, source, 5, 5, 1, false); break;
+     case FORMAT_RGBA_ASTC_6x5_KHR: decodeASTC(destination, source, 6, 5, 1, false); break;
+     case FORMAT_RGBA_ASTC_6x6_KHR: decodeASTC(destination, source, 6, 6, 1, false); break;
+     case FORMAT_RGBA_ASTC_8x5_KHR: decodeASTC(destination, source, 8, 5, 1, false); break;
+     case FORMAT_RGBA_ASTC_8x6_KHR: decodeASTC(destination, source, 8, 6, 1, false); break;
+     case FORMAT_RGBA_ASTC_8x8_KHR: decodeASTC(destination, source, 8, 8, 1, false); break;
+     case FORMAT_RGBA_ASTC_10x5_KHR: decodeASTC(destination, source, 10, 5, 1, false); break;
+     case FORMAT_RGBA_ASTC_10x6_KHR: decodeASTC(destination, source, 10, 6, 1, false); break;
+     case FORMAT_RGBA_ASTC_10x8_KHR: decodeASTC(destination, source, 10, 8, 1, false); break;
+     case FORMAT_RGBA_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, false); break;
+     case FORMAT_RGBA_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, false); break;
+     case FORMAT_RGBA_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, false); break;
+     // ASTC, sRGB
+     case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR: decodeASTC(destination, source, 4, 4, 1, true); break;
+     case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR: decodeASTC(destination, source, 5, 4, 1, true); break;
+     case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR: decodeASTC(destination, source, 5, 5, 1, true); break;
+     case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR: decodeASTC(destination, source, 6, 5, 1, true); break;
+     case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR: decodeASTC(destination, source, 6, 6, 1, true); break;
+     case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR: decodeASTC(destination, source, 8, 5, 1, true); break;
+     case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR: decodeASTC(destination, source, 8, 6, 1, true); break;
+     case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR: decodeASTC(destination, source, 8, 8, 1, true); break;
+     case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR: decodeASTC(destination, source, 10, 5, 1, true); break;
+     case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR: decodeASTC(destination, source, 10, 6, 1, true); break;
+     case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR: decodeASTC(destination, source, 10, 8, 1, true); break;
+     case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true); break;
+     case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true); break;
+     case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true); break;
+     // Everything else goes through the generic copy/convert path
+     default: genericUpdate(destination, source); break;
+     }
+ }
+
+ // Copies the region common to both buffers from 'source' to 'destination'.
+ // Rows are copied verbatim when the formats match; otherwise every element
+ // is read through source.read() and stored through destination.write().
+ void Surface::genericUpdate(Buffer &destination, Buffer &source)
+ {
+     unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+     unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+     // Clamp the copied volume to what both buffers can hold.
+     const int depth = min(destination.depth, source.depth);
+     const int height = min(destination.height, source.height);
+     const int width = min(destination.width, source.width);
+     const int rowBytes = width * source.bytes;
+
+     for(int z = 0; z < depth; z++, srcSlice += source.sliceB, dstSlice += destination.sliceB)
+     {
+         unsigned char *srcRow = srcSlice;
+         unsigned char *dstRow = dstSlice;
+
+         for(int y = 0; y < height; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+         {
+             if(source.format == destination.format)
+             {
+                 // Same layout on both sides: raw row copy
+                 memcpy(dstRow, srcRow, rowBytes);
+                 continue;
+             }
+
+             // Different formats: convert element by element through Color<float>
+             unsigned char *srcElement = srcRow;
+             unsigned char *dstElement = dstRow;
+
+             for(int x = 0; x < width; x++, srcElement += source.bytes, dstElement += destination.bytes)
+             {
+                 Color<float> texel = source.read(srcElement);
+                 destination.write(dstElement, texel);
+             }
+         }
+     }
+
+     source.unlockRect();
+     destination.unlockRect();
+ }
+
+ // Expands 3-byte source elements (bytes read in the order B, G, R) into
+ // 32-bit words laid out as 0xFFRRGGBB, i.e. with alpha forced to opaque.
+ // NOTE(review): the destination is written as 32-bit words without checking
+ // its format - presumably always a 32-bit ARGB layout; confirm against callers.
+ void Surface::decodeR8G8B8(Buffer &destination, Buffer &source)
+ {
+     unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+     unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+     // Only the region common to both buffers is converted.
+     const int depth = min(destination.depth, source.depth);
+     const int height = min(destination.height, source.height);
+     const int width = min(destination.width, source.width);
+
+     for(int z = 0; z < depth; z++, srcSlice += source.sliceB, dstSlice += destination.sliceB)
+     {
+         unsigned char *srcRow = srcSlice;
+         unsigned char *dstRow = dstSlice;
+
+         for(int y = 0; y < height; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+         {
+             unsigned char *src = srcRow;
+             unsigned char *dst = dstRow;
+
+             for(int x = 0; x < width; x++, src += source.bytes, dst += destination.bytes)
+             {
+                 const unsigned int blue = src[0];
+                 const unsigned int green = src[1];
+                 const unsigned int red = src[2];
+
+                 *(unsigned int*)dst = 0xFF000000 | (red << 16) | (green << 8) | (blue << 0);
+             }
+         }
+     }
+
+     source.unlockRect();
+     destination.unlockRect();
+ }
+
+ // Expands 16-bit X1R5G5B5 elements into 32-bit words with opaque alpha.
+ // Each 5-bit channel is rescaled to 8 bits with a fixed-point multiply,
+ // a rounding bias and a shift, leaving it in its final bit position.
+ void Surface::decodeX1R5G5B5(Buffer &destination, Buffer &source)
+ {
+     unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+     unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+     // Only the region common to both buffers is converted.
+     const int depth = min(destination.depth, source.depth);
+     const int height = min(destination.height, source.height);
+     const int width = min(destination.width, source.width);
+
+     for(int z = 0; z < depth; z++, srcSlice += source.sliceB, dstSlice += destination.sliceB)
+     {
+         unsigned char *srcRow = srcSlice;
+         unsigned char *dstRow = dstSlice;
+
+         for(int y = 0; y < height; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+         {
+             unsigned char *src = srcRow;
+             unsigned char *dst = dstRow;
+
+             for(int x = 0; x < width; x++, src += source.bytes, dst += destination.bytes)
+             {
+                 const unsigned int packed = *(unsigned short*)src;
+
+                 const unsigned int red = (((packed & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
+                 const unsigned int green = (((packed & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
+                 const unsigned int blue = (((packed & 0x001F) * 2106 + 0x80) >> 8);
+
+                 *(unsigned int*)dst = 0xFF000000 | red | green | blue;
+             }
+         }
+     }
+
+     source.unlockRect();
+     destination.unlockRect();
+ }
+
+ // Expands 16-bit A1R5G5B5 elements into 32-bit words. The single alpha bit
+ // becomes either 0x00 or 0xFF in the top byte; each 5-bit color channel is
+ // rescaled to 8 bits with a fixed-point multiply, rounding bias and shift.
+ void Surface::decodeA1R5G5B5(Buffer &destination, Buffer &source)
+ {
+     unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+     unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+     // Only the region common to both buffers is converted.
+     const int depth = min(destination.depth, source.depth);
+     const int height = min(destination.height, source.height);
+     const int width = min(destination.width, source.width);
+
+     for(int z = 0; z < depth; z++, srcSlice += source.sliceB, dstSlice += destination.sliceB)
+     {
+         unsigned char *srcRow = srcSlice;
+         unsigned char *dstRow = dstSlice;
+
+         for(int y = 0; y < height; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+         {
+             unsigned char *src = srcRow;
+             unsigned char *dst = dstRow;
+
+             for(int x = 0; x < width; x++, src += source.bytes, dst += destination.bytes)
+             {
+                 const unsigned int packed = *(unsigned short*)src;
+
+                 // 0x8000 * 130560 == 0xFF000000, so a set alpha bit yields full alpha.
+                 const unsigned int alpha = (packed & 0x8000) * 130560;
+                 const unsigned int red = (((packed & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
+                 const unsigned int green = (((packed & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
+                 const unsigned int blue = (((packed & 0x001F) * 2106 + 0x80) >> 8);
+
+                 *(unsigned int*)dst = alpha | red | green | blue;
+             }
+         }
+     }
+
+     source.unlockRect();
+     destination.unlockRect();
+ }
+
+ // Expands 16-bit X4R4G4B4 elements into 32-bit words with opaque alpha.
+ // Each 4-bit channel is widened to 8 bits by replicating the nibble
+ // (a multiply by 0x11 at the channel's position).
+ void Surface::decodeX4R4G4B4(Buffer &destination, Buffer &source)
+ {
+     unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+     unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+     // Only the region common to both buffers is converted.
+     const int depth = min(destination.depth, source.depth);
+     const int height = min(destination.height, source.height);
+     const int width = min(destination.width, source.width);
+
+     for(int z = 0; z < depth; z++, srcSlice += source.sliceB, dstSlice += destination.sliceB)
+     {
+         unsigned char *srcRow = srcSlice;
+         unsigned char *dstRow = dstSlice;
+
+         for(int y = 0; y < height; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+         {
+             unsigned char *src = srcRow;
+             unsigned char *dst = dstRow;
+
+             for(int x = 0; x < width; x++, src += source.bytes, dst += destination.bytes)
+             {
+                 const unsigned int packed = *(unsigned short*)src;
+
+                 const unsigned int red = ((packed & 0x0F00) * 0x00001100) & 0x00FF0000;
+                 const unsigned int green = ((packed & 0x00F0) * 0x00000110) & 0x0000FF00;
+                 const unsigned int blue = (packed & 0x000F) * 0x00000011;
+
+                 *(unsigned int*)dst = 0xFF000000 | red | green | blue;
+             }
+         }
+     }
+
+     source.unlockRect();
+     destination.unlockRect();
+ }
+
+ // Expands 16-bit A4R4G4B4 elements into 32-bit words. All four 4-bit
+ // channels are widened to 8 bits by replicating the nibble (a multiply
+ // by 0x11 at the channel's position).
+ void Surface::decodeA4R4G4B4(Buffer &destination, Buffer &source)
+ {
+     unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+     unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+     // Only the region common to both buffers is converted.
+     const int depth = min(destination.depth, source.depth);
+     const int height = min(destination.height, source.height);
+     const int width = min(destination.width, source.width);
+
+     for(int z = 0; z < depth; z++, srcSlice += source.sliceB, dstSlice += destination.sliceB)
+     {
+         unsigned char *srcRow = srcSlice;
+         unsigned char *dstRow = dstSlice;
+
+         for(int y = 0; y < height; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+         {
+             unsigned char *src = srcRow;
+             unsigned char *dst = dstRow;
+
+             for(int x = 0; x < width; x++, src += source.bytes, dst += destination.bytes)
+             {
+                 const unsigned int packed = *(unsigned short*)src;
+
+                 const unsigned int alpha = ((packed & 0xF000) * 0x00011000) & 0xFF000000;
+                 const unsigned int red = ((packed & 0x0F00) * 0x00001100) & 0x00FF0000;
+                 const unsigned int green = ((packed & 0x00F0) * 0x00000110) & 0x0000FF00;
+                 const unsigned int blue = (packed & 0x000F) * 0x00000011;
+
+                 *(unsigned int*)dst = alpha | red | green | blue;
+             }
+         }
+     }
+
+     source.unlockRect();
+     destination.unlockRect();
+ }
+
+ // Expands 8-bit palette indices into 32-bit words. Each index is looked up
+ // in 'palette'; the looked-up value has its lowest and third bytes swapped
+ // (treated as ABGR in, ARGB out) while alpha and green stay in place.
+ void Surface::decodeP8(Buffer &destination, Buffer &source)
+ {
+     unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+     unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+     // Only the region common to both buffers is converted.
+     const int depth = min(destination.depth, source.depth);
+     const int height = min(destination.height, source.height);
+     const int width = min(destination.width, source.width);
+
+     for(int z = 0; z < depth; z++, srcSlice += source.sliceB, dstSlice += destination.sliceB)
+     {
+         unsigned char *srcRow = srcSlice;
+         unsigned char *dstRow = dstSlice;
+
+         for(int y = 0; y < height; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+         {
+             unsigned char *src = srcRow;
+             unsigned char *dst = dstRow;
+
+             for(int x = 0; x < width; x++, src += source.bytes, dst += destination.bytes)
+             {
+                 const unsigned int entry = palette[*(unsigned char*)src];
+
+                 const unsigned int red = (entry & 0x000000FF) << 16;
+                 const unsigned int green = (entry & 0x0000FF00) << 0;
+                 const unsigned int blue = (entry & 0x00FF0000) >> 16;
+                 const unsigned int alpha = (entry & 0xFF000000) >> 0;
+
+                 *(unsigned int*)dst = alpha | red | green | blue;
+             }
+         }
+     }
+
+     source.unlockRect();
+     destination.unlockRect();
+ }
+
+ // Decodes DXT1 blocks from 'external' into 32-bit texels in 'internal'.
+ // Blocks are consumed sequentially, one per 4x4 tile of the external
+ // dimensions; texels falling outside the internal width/height are skipped.
+ void Surface::decodeDXT1(Buffer &internal, Buffer &external)
+ {
+     unsigned int *sliceOut = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
+     const DXT1 *block = (const DXT1*)external.lockRect(0, 0, 0, LOCK_READONLY);
+
+     for(int z = 0; z < external.depth; z++)
+     {
+         unsigned int *texels = sliceOut;
+
+         for(int y = 0; y < external.height; y += 4)
+         {
+             for(int x = 0; x < external.width; x += 4, block++)
+             {
+                 // Four-entry color table: two stored endpoints plus two derived entries
+                 Color<byte> colors[4];
+
+                 colors[0] = block->c0;
+                 colors[1] = block->c1;
+
+                 if(block->c0 > block->c1)   // No transparency
+                 {
+                     // colors[2] = 2 / 3 * c0 + 1 / 3 * c1
+                     colors[2].r = (byte)((2 * (word)colors[0].r + (word)colors[1].r + 1) / 3);
+                     colors[2].g = (byte)((2 * (word)colors[0].g + (word)colors[1].g + 1) / 3);
+                     colors[2].b = (byte)((2 * (word)colors[0].b + (word)colors[1].b + 1) / 3);
+                     colors[2].a = 0xFF;
+
+                     // colors[3] = 1 / 3 * c0 + 2 / 3 * c1
+                     colors[3].r = (byte)(((word)colors[0].r + 2 * (word)colors[1].r + 1) / 3);
+                     colors[3].g = (byte)(((word)colors[0].g + 2 * (word)colors[1].g + 1) / 3);
+                     colors[3].b = (byte)(((word)colors[0].b + 2 * (word)colors[1].b + 1) / 3);
+                     colors[3].a = 0xFF;
+                 }
+                 else   // colors[3] is transparent
+                 {
+                     // colors[2] = 1 / 2 * c0 + 1 / 2 * c1
+                     colors[2].r = (byte)(((word)colors[0].r + (word)colors[1].r) / 2);
+                     colors[2].g = (byte)(((word)colors[0].g + (word)colors[1].g) / 2);
+                     colors[2].b = (byte)(((word)colors[0].b + (word)colors[1].b) / 2);
+                     colors[2].a = 0xFF;
+
+                     colors[3].r = 0;
+                     colors[3].g = 0;
+                     colors[3].b = 0;
+                     colors[3].a = 0;
+                 }
+
+                 for(int row = 0; row < 4 && (y + row) < internal.height; row++)
+                 {
+                     for(int col = 0; col < 4 && (x + col) < internal.width; col++)
+                     {
+                         // Two selector bits per texel, in row-major order within the block
+                         const int texel = col + row * 4;
+                         const unsigned int selector = (unsigned int)(block->lut >> 2 * texel) % 4;
+
+                         texels[(x + col) + (y + row) * internal.pitchP] = colors[selector];
+                     }
+                 }
+             }
+         }
+
+         // Advance the output by one slice, measured in bytes
+         sliceOut = (unsigned int*)((byte*)sliceOut + internal.sliceB);
+     }
+
+     external.unlockRect();
+     internal.unlockRect();
+ }
+
+ // Decodes DXT3 blocks from 'external' into 32-bit texels in 'internal'.
+ // Color comes from a four-entry interpolated table; alpha is stored
+ // explicitly as 4 bits per texel and widened to 8 bits by nibble replication.
+ // Texels falling outside the internal width/height are skipped.
+ void Surface::decodeDXT3(Buffer &internal, Buffer &external)
+ {
+     unsigned int *sliceOut = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
+     const DXT3 *block = (const DXT3*)external.lockRect(0, 0, 0, LOCK_READONLY);
+
+     for(int z = 0; z < external.depth; z++)
+     {
+         unsigned int *texels = sliceOut;
+
+         for(int y = 0; y < external.height; y += 4)
+         {
+             for(int x = 0; x < external.width; x += 4, block++)
+             {
+                 Color<byte> colors[4];
+
+                 colors[0] = block->c0;
+                 colors[1] = block->c1;
+
+                 // colors[2] = 2 / 3 * c0 + 1 / 3 * c1
+                 colors[2].r = (byte)((2 * (word)colors[0].r + (word)colors[1].r + 1) / 3);
+                 colors[2].g = (byte)((2 * (word)colors[0].g + (word)colors[1].g + 1) / 3);
+                 colors[2].b = (byte)((2 * (word)colors[0].b + (word)colors[1].b + 1) / 3);
+
+                 // colors[3] = 1 / 3 * c0 + 2 / 3 * c1
+                 colors[3].r = (byte)(((word)colors[0].r + 2 * (word)colors[1].r + 1) / 3);
+                 colors[3].g = (byte)(((word)colors[0].g + 2 * (word)colors[1].g + 1) / 3);
+                 colors[3].b = (byte)(((word)colors[0].b + 2 * (word)colors[1].b + 1) / 3);
+
+                 for(int row = 0; row < 4 && (y + row) < internal.height; row++)
+                 {
+                     for(int col = 0; col < 4 && (x + col) < internal.width; col++)
+                     {
+                         const int texel = col + row * 4;
+
+                         // 4-bit alpha, duplicated into both nibbles of the top byte
+                         unsigned int alpha = (unsigned int)(block->a >> 4 * texel) & 0x0F;
+                         unsigned int argb = (colors[(unsigned int)(block->lut >> 2 * texel) % 4] & 0x00FFFFFF) | ((alpha << 28) + (alpha << 24));
+
+                         texels[(x + col) + (y + row) * internal.pitchP] = argb;
+                     }
+                 }
+             }
+         }
+
+         // Advance the output by one slice, measured in bytes
+         sliceOut = (unsigned int*)((byte*)sliceOut + internal.sliceB);
+     }
+
+     external.unlockRect();
+     internal.unlockRect();
+ }
+
+ // Decodes DXT5 blocks from 'external' into 32-bit texels in 'internal'.
+ // Color comes from a four-entry interpolated table; alpha comes from an
+ // eight-entry table built from two stored endpoints and selected with
+ // 3 bits per texel. Texels outside the internal width/height are skipped.
+ void Surface::decodeDXT5(Buffer &internal, Buffer &external)
+ {
+     unsigned int *sliceOut = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
+     const DXT5 *block = (const DXT5*)external.lockRect(0, 0, 0, LOCK_READONLY);
+
+     for(int z = 0; z < external.depth; z++)
+     {
+         unsigned int *texels = sliceOut;
+
+         for(int y = 0; y < external.height; y += 4)
+         {
+             for(int x = 0; x < external.width; x += 4, block++)
+             {
+                 Color<byte> colors[4];
+
+                 colors[0] = block->c0;
+                 colors[1] = block->c1;
+
+                 // colors[2] = 2 / 3 * c0 + 1 / 3 * c1
+                 colors[2].r = (byte)((2 * (word)colors[0].r + (word)colors[1].r + 1) / 3);
+                 colors[2].g = (byte)((2 * (word)colors[0].g + (word)colors[1].g + 1) / 3);
+                 colors[2].b = (byte)((2 * (word)colors[0].b + (word)colors[1].b + 1) / 3);
+
+                 // colors[3] = 1 / 3 * c0 + 2 / 3 * c1
+                 colors[3].r = (byte)(((word)colors[0].r + 2 * (word)colors[1].r + 1) / 3);
+                 colors[3].g = (byte)(((word)colors[0].g + 2 * (word)colors[1].g + 1) / 3);
+                 colors[3].b = (byte)(((word)colors[0].b + 2 * (word)colors[1].b + 1) / 3);
+
+                 byte alphas[8];
+
+                 alphas[0] = block->a0;
+                 alphas[1] = block->a1;
+
+                 if(alphas[0] > alphas[1])
+                 {
+                     // Six interpolated values, weights running from 6:1 to 1:6, rounded
+                     for(int k = 2; k < 8; k++)
+                     {
+                         alphas[k] = (byte)(((8 - k) * (word)alphas[0] + (k - 1) * (word)alphas[1] + 3) / 7);
+                     }
+                 }
+                 else
+                 {
+                     // Four interpolated values, weights running from 4:1 to 1:4, rounded
+                     for(int k = 2; k < 6; k++)
+                     {
+                         alphas[k] = (byte)(((6 - k) * (word)alphas[0] + (k - 1) * (word)alphas[1] + 2) / 5);
+                     }
+
+                     // The last two entries are the fixed extremes
+                     alphas[6] = 0;
+                     alphas[7] = 0xFF;
+                 }
+
+                 for(int row = 0; row < 4 && (y + row) < internal.height; row++)
+                 {
+                     for(int col = 0; col < 4 && (x + col) < internal.width; col++)
+                     {
+                         const int texel = col + row * 4;
+
+                         // The 3-bit alpha selectors start at bit 16 of 'alut'
+                         unsigned int alphaBits = (unsigned int)alphas[(unsigned int)(block->alut >> (16 + 3 * texel)) % 8] << 24;
+                         unsigned int argb = (colors[(block->clut >> 2 * texel) % 4] & 0x00FFFFFF) | alphaBits;
+
+                         texels[(x + col) + (y + row) * internal.pitchP] = argb;
+                     }
+                 }
+             }
+         }
+
+         // Advance the output by one slice, measured in bytes
+         sliceOut = (unsigned int*)((byte*)sliceOut + internal.sliceB);
+     }
+
+     external.unlockRect();
+     internal.unlockRect();
+ }
+
+ // Decodes ATI1 blocks from 'external' into one byte per texel in 'internal'.
+ // Each block carries two endpoints from which an eight-entry table is
+ // built; 3 bits per texel select the entry. Texels outside the internal
+ // width/height are skipped.
+ void Surface::decodeATI1(Buffer &internal, Buffer &external)
+ {
+     byte *sliceOut = (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
+     const ATI1 *block = (const ATI1*)external.lockRect(0, 0, 0, LOCK_READONLY);
+
+     for(int z = 0; z < external.depth; z++, sliceOut += internal.sliceB)
+     {
+         byte *texels = sliceOut;
+
+         for(int y = 0; y < external.height; y += 4)
+         {
+             for(int x = 0; x < external.width; x += 4, block++)
+             {
+                 byte reds[8];
+
+                 reds[0] = block->r0;
+                 reds[1] = block->r1;
+
+                 if(reds[0] > reds[1])
+                 {
+                     // Six interpolated values, weights running from 6:1 to 1:6, rounded
+                     for(int k = 2; k < 8; k++)
+                     {
+                         reds[k] = (byte)(((8 - k) * (word)reds[0] + (k - 1) * (word)reds[1] + 3) / 7);
+                     }
+                 }
+                 else
+                 {
+                     // Four interpolated values, weights running from 4:1 to 1:4, rounded
+                     for(int k = 2; k < 6; k++)
+                     {
+                         reds[k] = (byte)(((6 - k) * (word)reds[0] + (k - 1) * (word)reds[1] + 2) / 5);
+                     }
+
+                     // The last two entries are the fixed extremes
+                     reds[6] = 0;
+                     reds[7] = 0xFF;
+                 }
+
+                 for(int row = 0; row < 4 && (y + row) < internal.height; row++)
+                 {
+                     for(int col = 0; col < 4 && (x + col) < internal.width; col++)
+                     {
+                         // The 3-bit selectors start at bit 16 of 'rlut'
+                         const int texel = col + row * 4;
+
+                         texels[(x + col) + (y + row) * internal.pitchP] = reds[(unsigned int)(block->rlut >> (16 + 3 * texel)) % 8];
+                     }
+                 }
+             }
+         }
+     }
+
+     external.unlockRect();
+     internal.unlockRect();
+ }
+
+ // Decodes ATI2 blocks from 'external' into one 16-bit word per texel in
+ // 'internal', with the X channel in the low byte and the Y channel in the
+ // high byte. Each channel has two endpoints expanded into an eight-entry
+ // table selected with 3 bits per texel. Texels outside the internal
+ // width/height are skipped.
+ void Surface::decodeATI2(Buffer &internal, Buffer &external)
+ {
+     word *sliceOut = (word*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
+     const ATI2 *block = (const ATI2*)external.lockRect(0, 0, 0, LOCK_READONLY);
+
+     for(int z = 0; z < external.depth; z++)
+     {
+         word *texels = sliceOut;
+
+         for(int y = 0; y < external.height; y += 4)
+         {
+             for(int x = 0; x < external.width; x += 4, block++)
+             {
+                 // First channel table
+                 byte X[8];
+
+                 X[0] = block->x0;
+                 X[1] = block->x1;
+
+                 if(X[0] > X[1])
+                 {
+                     // Six interpolated values, weights running from 6:1 to 1:6, rounded
+                     for(int k = 2; k < 8; k++)
+                     {
+                         X[k] = (byte)(((8 - k) * (word)X[0] + (k - 1) * (word)X[1] + 3) / 7);
+                     }
+                 }
+                 else
+                 {
+                     // Four interpolated values, weights running from 4:1 to 1:4, rounded
+                     for(int k = 2; k < 6; k++)
+                     {
+                         X[k] = (byte)(((6 - k) * (word)X[0] + (k - 1) * (word)X[1] + 2) / 5);
+                     }
+
+                     X[6] = 0;
+                     X[7] = 0xFF;
+                 }
+
+                 // Second channel table, built the same way
+                 byte Y[8];
+
+                 Y[0] = block->y0;
+                 Y[1] = block->y1;
+
+                 if(Y[0] > Y[1])
+                 {
+                     for(int k = 2; k < 8; k++)
+                     {
+                         Y[k] = (byte)(((8 - k) * (word)Y[0] + (k - 1) * (word)Y[1] + 3) / 7);
+                     }
+                 }
+                 else
+                 {
+                     for(int k = 2; k < 6; k++)
+                     {
+                         Y[k] = (byte)(((6 - k) * (word)Y[0] + (k - 1) * (word)Y[1] + 2) / 5);
+                     }
+
+                     Y[6] = 0;
+                     Y[7] = 0xFF;
+                 }
+
+                 for(int row = 0; row < 4 && (y + row) < internal.height; row++)
+                 {
+                     for(int col = 0; col < 4 && (x + col) < internal.width; col++)
+                     {
+                         // The 3-bit selectors start at bit 16 of each lookup field
+                         const int texel = col + row * 4;
+
+                         word low = X[(unsigned int)(block->xlut >> (16 + 3 * texel)) % 8];
+                         word high = Y[(unsigned int)(block->ylut >> (16 + 3 * texel)) % 8];
+
+                         texels[(x + col) + (y + row) * internal.pitchP] = (high << 8) + low;
+                     }
+                 }
+             }
+         }
+
+         // Advance the output by one slice, measured in bytes
+         sliceOut = (word*)((byte*)sliceOut + internal.sliceB);
+     }
+
+     external.unlockRect();
+     internal.unlockRect();
+ }
+
+ // Decodes ETC2 data from 'external' into 'internal'; when isSRGB is set, the first three bytes of each decoded texel are then mapped from sRGB to linear.
+ void Surface::decodeETC2(Buffer &internal, Buffer &external, int nbAlphaBits, bool isSRGB)
+ {
+     const byte *encoded = (const byte*)external.lockRect(0, 0, 0, LOCK_READONLY);
+     byte *decoded = (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
+     ETC_Decoder::Decode(encoded, decoded, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
+                         (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
+     external.unlockRect();
+     internal.unlockRect();
+
+     if(!isSRGB) return;
+
+     // 256-entry sRGB -> linear lookup table, filled in on the first call that needs it.
+     static byte linearFromSRGB[256];
+     static bool tableBuilt = false;
+     if(!tableBuilt)
+     {
+         for(int code = 0; code < 256; code++)
+         {
+             linearFromSRGB[code] = static_cast<byte>(sRGBtoLinear(static_cast<float>(code) / 255.0f) * 255.0f + 0.5f);
+         }
+         tableBuilt = true;
+     }
+     // Perform sRGB conversion in place after decoding
+     byte *base = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
+     for(int y = 0; y < internal.height; y++)
+     {
+         byte *texel = base + y * internal.pitchB;
+         for(int x = 0; x < internal.width; x++, texel += internal.bytes)
+         {
+             texel[0] = linearFromSRGB[texel[0]];
+             texel[1] = linearFromSRGB[texel[1]];
+             texel[2] = linearFromSRGB[texel[2]];
+         }
+     }
+     internal.unlockRect();
+ }
+
+ // Decodes one- or two-channel EAC data from 'external' into 'internal', then rewrites each decoded integer channel in place as a float clamped to [-1, 1].
+ void Surface::decodeEAC(Buffer &internal, Buffer &external, int nbChannels, bool isSigned)
+ {
+     ASSERT(nbChannels == 1 || nbChannels == 2);
+     byte *decoded = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
+     const byte *encoded = (const byte*)external.lockRect(0, 0, 0, LOCK_READONLY);
+     ETC_Decoder::Decode(encoded, decoded, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
+                         (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
+     external.unlockRect();
+
+     // FIXME: We convert EAC data to float, until signed short internal formats are supported
+     //        This code can be removed if ETC2 images are decoded to internal 16 bit signed R/RG formats
+     const float scale = isSigned ? (1.0f / (8.0f * 127.875f)) : (1.0f / (8.0f * 255.875f));
+     for(int row = 0; row < internal.height; row++)
+     {
+         byte *line = decoded + row * internal.pitchB;
+         for(int col = internal.width; col-- > 0; )   // Right to left, as in the decode order used so far.
+         {
+             int *raw = reinterpret_cast<int*>(line + col * internal.bytes);
+             float *converted = reinterpret_cast<float*>(raw);   // Same storage, reinterpreted as float.
+             for(int channel = nbChannels; channel-- > 0; )
+             {
+                 converted[channel] = clamp(static_cast<float>(raw[channel]) * scale, -1.0f, 1.0f);
+             }
+         }
+     }
+     internal.unlockRect();
+ }
+
+ void Surface::decodeASTC(Buffer &internal, Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
+ { // Empty body: nothing is decoded and neither buffer is touched. NOTE(review): looks like a placeholder for ASTC support - confirm.
+ }
+
+ size_t Surface::size(int width, int height, int depth, int border, int samples, Format format)  // Byte size computed for a buffer with the given dimensions, border, sample count and format.
+ {
+     if(samples < 1) samples = 1;
+
+     switch(format)
+     {
+     case FORMAT_YV12_BT601:
+     case FORMAT_YV12_BT709:
+     case FORMAT_YV12_JFIF:
+         {
+             const int paddedWidth = width + 2 * border;
+             const int paddedHeight = height + 2 * border;
+
+             size_t lumaStride = align<16>(paddedWidth);
+             size_t lumaSize = lumaStride * paddedHeight;
+             size_t chromaStride = align<16>(lumaStride / 2);
+             size_t chromaSize = chromaStride * paddedHeight / 2;
+
+             return lumaSize + 2 * chromaSize;   // One luma plane followed by two chroma planes.
+         }
+     default:
+         {
+             uint64_t total = (uint64_t)sliceB(width, height, border, format, true) * depth * samples;
+
+             // FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
+             // and stencil operations also read 8 bytes per four 8-bit stencil values,
+             // so we have to allocate 4 extra bytes to avoid buffer overruns.
+             total += 4;
+
+             // We can only sample buffers smaller than 2 GiB.
+             // Force an out-of-memory if larger, or let the caller report an error.
+             return total < 0x80000000u ? (size_t)total : std::numeric_limits<size_t>::max();
+         }
+     }
+ }
+
+ bool Surface::isStencil(Format format)  // True when 'format' is one of the stencil-carrying formats listed below.
+ {
+     switch(format)
+     {
+     case FORMAT_S8:
+     case FORMAT_INTZ:
+     case FORMAT_D24S8:
+     case FORMAT_D24FS8:
+     case FORMAT_DF24S8:
+     case FORMAT_DF16S8:
+     case FORMAT_D32FS8:
+     case FORMAT_D32FS8_COMPLEMENTARY:
+     case FORMAT_D32FS8_TEXTURE:
+     case FORMAT_D32FS8_SHADOW:
+         return true;
+     // Depth-only formats are spelled out for readability; they share the result of 'default'.
+     case FORMAT_D16:
+     case FORMAT_D32:
+     case FORMAT_D24X8:
+     case FORMAT_D32F:
+     case FORMAT_D32F_COMPLEMENTARY:
+     case FORMAT_D32F_LOCKABLE:
+     case FORMAT_D32F_SHADOW:
+     default:
+         return false;
+     }
+ }
+
+ bool Surface::isDepth(Format format)  // True when 'format' is one of the depth-carrying formats listed below.
+ {
+     switch(format)
+     {
+     case FORMAT_D16:
+     case FORMAT_D32:
+     case FORMAT_D24X8:
+     case FORMAT_D24S8:
+     case FORMAT_D24FS8:
+     case FORMAT_DF24S8:
+     case FORMAT_DF16S8:
+     case FORMAT_INTZ:
+     case FORMAT_D32F:
+     case FORMAT_D32F_COMPLEMENTARY:
+     case FORMAT_D32F_LOCKABLE:
+     case FORMAT_D32F_SHADOW:
+     case FORMAT_D32FS8:
+     case FORMAT_D32FS8_COMPLEMENTARY:
+     case FORMAT_D32FS8_TEXTURE:
+     case FORMAT_D32FS8_SHADOW:
+         return true;
+     // Stencil-only is listed explicitly; it shares the result of 'default'.
+     case FORMAT_S8:
+     default:
+         return false;
+     }
+ }
+
+ // Reports whether 'format' is one of the formats this file treats as stored
+ // in quad layout (see the quad addressing in clearDepth/clearStencil).
+ bool Surface::hasQuadLayout(Format format)
+ {
+     switch(format)
+     {
+     case FORMAT_D16:
+     case FORMAT_D32:
+     case FORMAT_D24X8:
+     case FORMAT_D24S8:
+     case FORMAT_D24FS8:
+     case FORMAT_DF24S8:
+     case FORMAT_DF16S8:
+     case FORMAT_INTZ:
+     case FORMAT_S8:
+     case FORMAT_D32F:
+     case FORMAT_D32FS8:
+     case FORMAT_D32F_COMPLEMENTARY:
+     case FORMAT_D32FS8_COMPLEMENTARY:
+     case FORMAT_A8G8R8B8Q:
+     case FORMAT_X8G8R8B8Q:
+         return true;
+     case FORMAT_D32F_LOCKABLE:
+     case FORMAT_D32FS8_TEXTURE:
+     case FORMAT_D32F_SHADOW:
+     case FORMAT_D32FS8_SHADOW:
+     default:
+         return false;
+     }
+ }
+
+ // Reports whether 'format' is one of the two palette formats,
+ // FORMAT_P8 or FORMAT_A8P8.
+ //
+ // Every other format yields false.
+ bool Surface::isPalette(Format format)
+ {
+     const bool plainPalette = (format == FORMAT_P8);
+     const bool paletteWithAlpha = (format == FORMAT_A8P8);
+
+     return plainPalette || paletteWithAlpha;
+ }
+
+ bool Surface::isFloatFormat(Format format) // Classifies 'format': false for the first group of labels below, true for the second group.
+ {
+ switch(format)
+ {
+ case FORMAT_R5G6B5: // First group: every label down to 'return false' is reported as non-float.
+ case FORMAT_R8G8B8:
+ case FORMAT_B8G8R8:
+ case FORMAT_X8R8G8B8:
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_R8UI:
+ case FORMAT_G8R8UI:
+ case FORMAT_X8B8G8R8UI:
+ case FORMAT_A8B8G8R8UI:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8:
+ case FORMAT_A2B10G10R10:
+ case FORMAT_A2B10G10R10UI:
+ case FORMAT_R8_SNORM:
+ case FORMAT_G8R8_SNORM:
+ case FORMAT_X8B8G8R8_SNORM:
+ case FORMAT_A8B8G8R8_SNORM:
+ case FORMAT_R16I:
+ case FORMAT_R16UI:
+ case FORMAT_G16R16I:
+ case FORMAT_G16R16UI:
+ case FORMAT_G16R16:
+ case FORMAT_X16B16G16R16I:
+ case FORMAT_X16B16G16R16UI:
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_A16B16G16R16:
+ case FORMAT_V8U8:
+ case FORMAT_Q8W8V8U8:
+ case FORMAT_X8L8V8U8:
+ case FORMAT_V16U16:
+ case FORMAT_A16W16V16U16:
+ case FORMAT_Q16W16V16U16:
+ case FORMAT_A8:
+ case FORMAT_R8I:
+ case FORMAT_R8:
+ case FORMAT_S8:
+ case FORMAT_L8:
+ case FORMAT_L16:
+ case FORMAT_A8L8:
+ case FORMAT_YV12_BT601:
+ case FORMAT_YV12_BT709:
+ case FORMAT_YV12_JFIF:
+ case FORMAT_R32I:
+ case FORMAT_R32UI:
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ case FORMAT_X32B32G32R32I:
+ case FORMAT_X32B32G32R32UI:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ return false;
+ case FORMAT_R16F: // Second group: every label down to 'return true' is reported as floating-point.
+ case FORMAT_G16R16F:
+ case FORMAT_B16G16R16F:
+ case FORMAT_X16B16G16R16F:
+ case FORMAT_A16B16G16R16F:
+ case FORMAT_X16B16G16R16F_UNSIGNED:
+ case FORMAT_R32F:
+ case FORMAT_G32R32F:
+ case FORMAT_B32G32R32F:
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_A32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_D32F:
+ case FORMAT_D32FS8:
+ case FORMAT_D32F_COMPLEMENTARY:
+ case FORMAT_D32FS8_COMPLEMENTARY:
+ case FORMAT_D32F_LOCKABLE:
+ case FORMAT_D32FS8_TEXTURE:
+ case FORMAT_D32F_SHADOW:
+ case FORMAT_D32FS8_SHADOW:
+ case FORMAT_L16F:
+ case FORMAT_A16L16F:
+ case FORMAT_L32F:
+ case FORMAT_A32L32F:
+ return true;
+ default: // Formats missing from both groups (for example the compressed ones) are treated as a caller error.
+ ASSERT(false);
+ }
+ // Only reached through 'default', i.e. when ASSERT(false) does not stop execution.
+ return false;
+ }
+
+ bool Surface::isUnsignedComponent(Format format, int component) // Reports whether component index 'component' of 'format' is classified as unsigned.
+ {
+ switch(format)
+ {
+ case FORMAT_NULL: // First group: every component is reported as unsigned.
+ case FORMAT_R5G6B5:
+ case FORMAT_R8G8B8:
+ case FORMAT_B8G8R8:
+ case FORMAT_X8R8G8B8:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ case FORMAT_G8R8:
+ case FORMAT_A2B10G10R10:
+ case FORMAT_A2B10G10R10UI:
+ case FORMAT_R16UI:
+ case FORMAT_G16R16:
+ case FORMAT_G16R16UI:
+ case FORMAT_X16B16G16R16UI:
+ case FORMAT_A16B16G16R16:
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_R32UI:
+ case FORMAT_G32R32UI:
+ case FORMAT_X32B32G32R32UI:
+ case FORMAT_A32B32G32R32UI:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_R8UI:
+ case FORMAT_G8R8UI:
+ case FORMAT_X8B8G8R8UI:
+ case FORMAT_A8B8G8R8UI:
+ case FORMAT_D32F:
+ case FORMAT_D32FS8:
+ case FORMAT_D32F_COMPLEMENTARY:
+ case FORMAT_D32FS8_COMPLEMENTARY:
+ case FORMAT_D32F_LOCKABLE:
+ case FORMAT_D32FS8_TEXTURE:
+ case FORMAT_D32F_SHADOW:
+ case FORMAT_D32FS8_SHADOW:
+ case FORMAT_A8:
+ case FORMAT_R8:
+ case FORMAT_L8:
+ case FORMAT_L16:
+ case FORMAT_A8L8:
+ case FORMAT_YV12_BT601:
+ case FORMAT_YV12_BT709:
+ case FORMAT_YV12_JFIF:
+ return true;
+ case FORMAT_A8B8G8R8I: // Second group: no component is reported as unsigned.
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A8B8G8R8_SNORM:
+ case FORMAT_Q8W8V8U8:
+ case FORMAT_Q16W16V16U16:
+ case FORMAT_A32B32G32R32F:
+ return false;
+ case FORMAT_R32F: // Third group: only component 0 is signed.
+ case FORMAT_R8I:
+ case FORMAT_R16I:
+ case FORMAT_R32I:
+ case FORMAT_R8_SNORM:
+ return component >= 1; // Indices 1 and up are reported as unsigned.
+ case FORMAT_V8U8: // Fourth group: components 0 and 1 are signed.
+ case FORMAT_X8L8V8U8:
+ case FORMAT_V16U16:
+ case FORMAT_G32R32F:
+ case FORMAT_G8R8I:
+ case FORMAT_G16R16I:
+ case FORMAT_G32R32I:
+ case FORMAT_G8R8_SNORM:
+ return component >= 2; // Indices 2 and up are reported as unsigned.
+ case FORMAT_A16W16V16U16: // Fifth group: components 0 through 2 are signed.
+ case FORMAT_B32G32R32F:
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X16B16G16R16I:
+ case FORMAT_X32B32G32R32I:
+ case FORMAT_X8B8G8R8_SNORM:
+ return component >= 3; // Only index 3 and up is reported as unsigned.
+ default: // Formats missing from every group are treated as a caller error.
+ ASSERT(false);
+ }
+ // Only reached through 'default', i.e. when ASSERT(false) does not stop execution.
+ return false;
+ }
+
+ bool Surface::isSRGBreadable(Format format)  // True when 'format' is in the sRGB-readable list below.
+ {
+     // Keep in sync with Capabilities::isSRGBreadable
+     switch(format)
+     {
+     case FORMAT_R8G8B8:       // 8 bits per channel
+     case FORMAT_X8R8G8B8:
+     case FORMAT_A8R8G8B8:
+     case FORMAT_X8B8G8R8:
+     case FORMAT_A8B8G8R8:
+     case FORMAT_SRGB8_X8:
+     case FORMAT_SRGB8_A8:
+     case FORMAT_R5G6B5:       // Packed 16-bit
+     case FORMAT_X1R5G5B5:
+     case FORMAT_A1R5G5B5:
+     case FORMAT_A4R4G4B4:
+     case FORMAT_L8:           // Luminance
+     case FORMAT_A8L8:
+     case FORMAT_DXT1:         // Block-compressed
+     case FORMAT_DXT3:
+     case FORMAT_DXT5:
+     case FORMAT_ATI1:
+     case FORMAT_ATI2:
+         return true;
+     default:
+         return false;
+     }
+ }
+
+ bool Surface::isSRGBwritable(Format format)  // True when 'format' is in the sRGB-writable list below.
+ {
+     // Keep in sync with Capabilities::isSRGBwritable
+     switch(format)
+     {
+     case FORMAT_NULL:
+     case FORMAT_R5G6B5:
+     case FORMAT_X8R8G8B8:
+     case FORMAT_A8R8G8B8:
+     case FORMAT_X8B8G8R8:
+     case FORMAT_A8B8G8R8:
+     case FORMAT_SRGB8_X8:
+     case FORMAT_SRGB8_A8:
+         return true;
+     default:
+         return false;
+     }
+ }
+
+ // Reports whether 'format' is one of the two explicit sRGB formats,
+ // FORMAT_SRGB8_X8 or FORMAT_SRGB8_A8.
+ //
+ // Every other format yields false.
+ bool Surface::isSRGBformat(Format format)
+ {
+     const bool withoutAlpha = (format == FORMAT_SRGB8_X8);
+     const bool withAlpha = (format == FORMAT_SRGB8_A8);
+
+     return withoutAlpha || withAlpha;
+ }
+
+ bool Surface::isCompressed(Format format) // Returns true when 'format' is one of the compressed formats listed below, false otherwise.
+ {
+ switch(format)
+ {
+ case FORMAT_DXT1: // DXT and ATI families
+ case FORMAT_DXT3:
+ case FORMAT_DXT5:
+ case FORMAT_ATI1:
+ case FORMAT_ATI2:
+ case FORMAT_ETC1: // ETC1, EAC and ETC2 families
+ case FORMAT_R11_EAC:
+ case FORMAT_SIGNED_R11_EAC:
+ case FORMAT_RG11_EAC:
+ case FORMAT_SIGNED_RG11_EAC:
+ case FORMAT_RGB8_ETC2:
+ case FORMAT_SRGB8_ETC2:
+ case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+ case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+ case FORMAT_RGBA8_ETC2_EAC:
+ case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
+ case FORMAT_RGBA_ASTC_4x4_KHR: // ASTC, one label per block size
+ case FORMAT_RGBA_ASTC_5x4_KHR:
+ case FORMAT_RGBA_ASTC_5x5_KHR:
+ case FORMAT_RGBA_ASTC_6x5_KHR:
+ case FORMAT_RGBA_ASTC_6x6_KHR:
+ case FORMAT_RGBA_ASTC_8x5_KHR:
+ case FORMAT_RGBA_ASTC_8x6_KHR:
+ case FORMAT_RGBA_ASTC_8x8_KHR:
+ case FORMAT_RGBA_ASTC_10x5_KHR:
+ case FORMAT_RGBA_ASTC_10x6_KHR:
+ case FORMAT_RGBA_ASTC_10x8_KHR:
+ case FORMAT_RGBA_ASTC_10x10_KHR:
+ case FORMAT_RGBA_ASTC_12x10_KHR:
+ case FORMAT_RGBA_ASTC_12x12_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR: // sRGB ASTC, same block sizes
+ case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
+ return true;
+ default: // Everything not listed above is reported as uncompressed.
+ return false;
+ }
+ }
+
+ bool Surface::isSignedNonNormalizedInteger(Format format)  // True for the formats carrying the 'I' suffix listed below.
+ {
+     switch(format)
+     {
+     case FORMAT_R8I:              // One channel
+     case FORMAT_R16I:
+     case FORMAT_R32I:
+     case FORMAT_G8R8I:            // Two channels
+     case FORMAT_G16R16I:
+     case FORMAT_G32R32I:
+     case FORMAT_X8B8G8R8I:        // X variants
+     case FORMAT_X16B16G16R16I:
+     case FORMAT_X32B32G32R32I:
+     case FORMAT_A8B8G8R8I:        // A variants
+     case FORMAT_A16B16G16R16I:
+     case FORMAT_A32B32G32R32I:
+         return true;
+     default:
+         return false;
+     }
+ }
+
+ bool Surface::isUnsignedNonNormalizedInteger(Format format)  // True for the formats carrying the 'UI' suffix listed below.
+ {
+     switch(format)
+     {
+     case FORMAT_R8UI:             // One channel
+     case FORMAT_R16UI:
+     case FORMAT_R32UI:
+     case FORMAT_G8R8UI:           // Two channels
+     case FORMAT_G16R16UI:
+     case FORMAT_G32R32UI:
+     case FORMAT_X8B8G8R8UI:       // X variants
+     case FORMAT_X16B16G16R16UI:
+     case FORMAT_X32B32G32R32UI:
+     case FORMAT_A8B8G8R8UI:       // A variants
+     case FORMAT_A16B16G16R16UI:
+     case FORMAT_A32B32G32R32UI:
+         return true;
+     default:
+         return false;
+     }
+ }
+
+ bool Surface::isNonNormalizedInteger(Format format)  // True when 'format' is a signed or an unsigned non-normalized integer format.
+ {
+     const bool signedInteger = isSignedNonNormalizedInteger(format);
+     return signedInteger || isUnsignedNonNormalizedInteger(format);
+ }
+
+ // True for formats that are neither compressed, depth, stencil, non-normalized integer nor floating-point.
+ bool Surface::isNormalizedInteger(Format format)
+ {
+     // isFloatFormat() hits ASSERT(false) for formats missing from its list (compressed ones, D16, D24S8, ...), so it runs last, after the other tests have rejected those.
+     return !isCompressed(format) && !isDepth(format) && !isStencil(format) &&
+            !isNonNormalizedInteger(format) &&
+            !isFloatFormat(format);
+ }
+
+ int Surface::componentCount(Format format) // Returns the component count recorded for 'format' in the table below.
+ {
+ switch(format)
+ {
+ case FORMAT_R5G6B5: return 3;
+ case FORMAT_X8R8G8B8: return 3; // X-prefixed formats report 3 throughout this table, A-prefixed ones 4.
+ case FORMAT_X8B8G8R8I: return 3;
+ case FORMAT_X8B8G8R8: return 3;
+ case FORMAT_A8R8G8B8: return 4;
+ case FORMAT_SRGB8_X8: return 3;
+ case FORMAT_SRGB8_A8: return 4;
+ case FORMAT_A8B8G8R8I: return 4;
+ case FORMAT_A8B8G8R8: return 4;
+ case FORMAT_G8R8I: return 2;
+ case FORMAT_G8R8: return 2;
+ case FORMAT_R8_SNORM: return 1;
+ case FORMAT_G8R8_SNORM: return 2;
+ case FORMAT_X8B8G8R8_SNORM:return 3;
+ case FORMAT_A8B8G8R8_SNORM:return 4;
+ case FORMAT_R8UI: return 1;
+ case FORMAT_G8R8UI: return 2;
+ case FORMAT_X8B8G8R8UI: return 3;
+ case FORMAT_A8B8G8R8UI: return 4;
+ case FORMAT_A2B10G10R10: return 4;
+ case FORMAT_A2B10G10R10UI: return 4;
+ case FORMAT_G16R16I: return 2;
+ case FORMAT_G16R16UI: return 2;
+ case FORMAT_G16R16: return 2;
+ case FORMAT_G32R32I: return 2;
+ case FORMAT_G32R32UI: return 2;
+ case FORMAT_X16B16G16R16I: return 3;
+ case FORMAT_X16B16G16R16UI: return 3;
+ case FORMAT_A16B16G16R16I: return 4;
+ case FORMAT_A16B16G16R16UI: return 4;
+ case FORMAT_A16B16G16R16: return 4;
+ case FORMAT_X32B32G32R32I: return 3;
+ case FORMAT_X32B32G32R32UI: return 3;
+ case FORMAT_A32B32G32R32I: return 4;
+ case FORMAT_A32B32G32R32UI: return 4;
+ case FORMAT_V8U8: return 2;
+ case FORMAT_Q8W8V8U8: return 4;
+ case FORMAT_X8L8V8U8: return 3;
+ case FORMAT_V16U16: return 2;
+ case FORMAT_A16W16V16U16: return 4;
+ case FORMAT_Q16W16V16U16: return 4;
+ case FORMAT_R32F: return 1;
+ case FORMAT_G32R32F: return 2;
+ case FORMAT_X32B32G32R32F: return 3;
+ case FORMAT_A32B32G32R32F: return 4;
+ case FORMAT_X32B32G32R32F_UNSIGNED: return 3;
+ case FORMAT_D32F: return 1; // The depth formats listed here report a single component, including the ones with an S8 part.
+ case FORMAT_D32FS8: return 1;
+ case FORMAT_D32F_LOCKABLE: return 1;
+ case FORMAT_D32FS8_TEXTURE: return 1;
+ case FORMAT_D32F_SHADOW: return 1;
+ case FORMAT_D32FS8_SHADOW: return 1;
+ case FORMAT_A8: return 1;
+ case FORMAT_R8I: return 1;
+ case FORMAT_R8: return 1;
+ case FORMAT_R16I: return 1;
+ case FORMAT_R16UI: return 1;
+ case FORMAT_R32I: return 1;
+ case FORMAT_R32UI: return 1;
+ case FORMAT_L8: return 1;
+ case FORMAT_L16: return 1;
+ case FORMAT_A8L8: return 2;
+ case FORMAT_YV12_BT601: return 3;
+ case FORMAT_YV12_BT709: return 3;
+ case FORMAT_YV12_JFIF: return 3;
+ default: // Formats missing from the table are treated as a caller error. NOTE(review): the 16-bit float formats and D32F_COMPLEMENTARY are not listed - confirm that is intended.
+ ASSERT(false);
+ }
+ // Only reached through 'default', i.e. when ASSERT(false) does not stop execution.
+ return 1;
+ }
+
+ void *Surface::allocateBuffer(int width, int height, int depth, int border, int samples, Format format)  // Allocates as many bytes as size() reports for these parameters.
+ {
+     return allocate(Surface::size(width, height, depth, border, samples, format));
+ }
+
+ void Surface::memfill4(void *buffer, int pattern, int bytes) // Fills 'bytes' bytes starting at 'buffer' with 'pattern', using the widest store the current alignment allows.
+ {
+ while((size_t)buffer & 0x1 && bytes >= 1) // Odd address: single-byte stores until 'buffer' is 2-byte aligned.
+ {
+ *(char*)buffer = (char)pattern; // Only the low 8 bits of 'pattern' are stored.
+ (char*&)buffer += 1;
+ bytes -= 1;
+ }
+ // NOTE(review): narrow stores always take the low bits of 'pattern', so this presumably expects a pattern that repeats at the element size - confirm against callers.
+ while((size_t)buffer & 0x3 && bytes >= 2) // Not yet 4-byte aligned: 16-bit stores of the low half of 'pattern'.
+ {
+ *(short*)buffer = (short)pattern;
+ (short*&)buffer += 1;
+ bytes -= 2;
+ }
+
+ #if defined(__i386__) || defined(__x86_64__)
+ if(CPUID::supportsSSE())
+ {
+ while((size_t)buffer & 0xF && bytes >= 4) // 32-bit stores until 'buffer' is 16-byte aligned.
+ {
+ *(int*)buffer = pattern;
+ (int*&)buffer += 1;
+ bytes -= 4;
+ }
+
+ __m128 quad = _mm_set_ps1((float&)pattern); // The 32 pattern bits, reinterpreted (not converted) as float, replicated in all four lanes.
+
+ float *pointer = (float*)buffer;
+ int qxwords = bytes / 64; // Number of whole 64-byte chunks left.
+ bytes -= qxwords * 64;
+
+ while(qxwords--)
+ {
+ _mm_stream_ps(pointer + 0, quad); // Four 16-byte streaming stores cover one 64-byte chunk.
+ _mm_stream_ps(pointer + 4, quad);
+ _mm_stream_ps(pointer + 8, quad);
+ _mm_stream_ps(pointer + 12, quad);
+
+ pointer += 16;
+ }
+
+ buffer = pointer; // Continue with the scalar tail right after the streamed region.
+ }
+ #endif
+ // Remaining bytes (everything, when the SSE path is not taken) are written with progressively narrower stores.
+ while(bytes >= 4)
+ {
+ *(int*)buffer = (int)pattern;
+ (int*&)buffer += 1;
+ bytes -= 4;
+ }
+
+ while(bytes >= 2)
+ {
+ *(short*)buffer = (short)pattern;
+ (short*&)buffer += 1;
+ bytes -= 2;
+ }
+
+ while(bytes >= 1)
+ {
+ *(char*)buffer = (char)pattern;
+ (char*&)buffer += 1;
+ bytes -= 1;
+ }
+ }
+
+ void Surface::sync()  // Takes the resource's exclusive lock and releases it again straight away.
+ {
+     this->resource->lock(EXCLUSIVE);
+     this->resource->unlock();
+ }
+
+ bool Surface::isEntire(const Rect& rect) const  // True when 'rect' spans the full internal width and height and the internal buffer has depth 1.
+ {
+     return (internal.depth == 1 && rect.x0 == 0 && rect.x1 == internal.width && rect.y0 == 0 && rect.y1 == internal.height);
+ }
+
+ Rect Surface::getRect() const  // Rectangle from (0, 0) to (internal.width, internal.height).
+ {
+     return Rect(0, 0, this->internal.width, this->internal.height);
+ }
+
+ // Writes 'depth' into the rectangle (x0, y0, width, height) of the internal
+ // buffer, for every sample. The rectangle is clipped to the surface first;
+ // empty or fully outside rectangles are ignored. Quad-layout formats keep
+ // each 2x2 pixel quad in four consecutive floats and, when
+ // complementaryDepthBuffer is set, store 1 - depth instead.
+ void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
+ {
+     if(width <= 0 || height <= 0 || internal.format == FORMAT_NULL)
+     {
+         return;
+     }
+
+     // Not overlapping (a rectangle that only touches an edge has no pixels to clear)
+     if(x0 >= internal.width) return;
+     if(y0 >= internal.height) return;
+     if(x0 + width <= 0) return;
+     if(y0 + height <= 0) return;
+
+     // Clip against dimensions (width and height stay positive thanks to the tests above)
+     if(x0 < 0) {width += x0; x0 = 0;}
+     if(x0 + width > internal.width) width = internal.width - x0;
+     if(y0 < 0) {height += y0; y0 = 0;}
+     if(y0 + height > internal.height) height = internal.height - y0;
+
+     const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
+     const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;   // Nothing needs preserving when everything is overwritten.
+
+     int x1 = x0 + width;
+     int y1 = y0 + height;
+
+     if(!hasQuadLayout(internal.format))
+     {
+         float *target = (float*)lockInternal(x0, y0, 0, lock, PUBLIC);
+
+         for(int z = 0; z < internal.samples; z++)
+         {
+             float *row = target;
+             for(int y = y0; y < y1; y++)
+             {
+                 memfill4(row, (int&)depth, width * sizeof(float));   // The float's bit pattern is the fill word.
+                 row += internal.pitchP;
+             }
+             target += internal.sliceP;
+         }
+
+         unlockInternal();
+     }
+     else   // Quad layout
+     {
+         if(complementaryDepthBuffer)
+         {
+             depth = 1 - depth;
+         }
+
+         float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
+
+         int oddX0 = (x0 & ~1) * 2 + (x0 & 1);   // Element index of pixel x0 within a quad row pair.
+         int oddX1 = (x1 & ~1) * 2;              // Element index of the quad containing x1.
+         int evenX0 = ((x0 + 1) & ~1) * 2;       // First quad that lies fully inside the rectangle.
+         int evenBytes = (oddX1 - evenX0) * sizeof(float);
+
+         for(int z = 0; z < internal.samples; z++)
+         {
+             for(int y = y0; y < y1; y++)
+             {
+                 float *target = buffer + (y & ~1) * internal.pitchP + (y & 1) * 2;
+
+                 if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
+                 {
+                     if((x0 & 1) != 0)
+                     {
+                         target[oddX0 + 0] = depth;
+                         target[oddX0 + 2] = depth;
+                     }
+
+                     // for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
+                     // {
+                     //     target[x2 + 0] = depth;
+                     //     target[x2 + 1] = depth;
+                     //     target[x2 + 2] = depth;
+                     //     target[x2 + 3] = depth;
+                     // }
+
+                     // __asm
+                     // {
+                     //     movss xmm0, depth
+                     //     shufps xmm0, xmm0, 0x00
+                     //
+                     //     mov eax, x0
+                     //     add eax, 1
+                     //     and eax, 0xFFFFFFFE
+                     //     cmp eax, x1
+                     //     jge qEnd
+                     //
+                     //     mov edi, target
+                     //
+                     // qLoop:
+                     //     movntps [edi+8*eax], xmm0
+                     //
+                     //     add eax, 2
+                     //     cmp eax, x1
+                     //     jl qLoop
+                     // qEnd:
+                     // }
+
+                     memfill4(&target[evenX0], (int&)depth, evenBytes);
+
+                     if((x1 & 1) != 0)
+                     {
+                         target[oddX1 + 0] = depth;
+                         target[oddX1 + 2] = depth;
+                     }
+
+                     y++;   // The row below was filled together with this one.
+                 }
+                 else
+                 {
+                     for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
+                     {
+                         target[i] = depth;
+                     }
+                 }
+             }
+
+             buffer += internal.sliceP;
+         }
+
+         unlockInternal();
+     }
+ }
+
+ // Writes the masked stencil value (s & mask) into the rectangle
+ // (x0, y0, width, height) of the stencil buffer, for every sample. Bits
+ // outside 'mask' keep their previous value. The rectangle is clipped to the
+ // surface first; empty or fully outside rectangles are ignored. The stencil
+ // buffer is addressed as 2x2 quads (see the index computations below).
+ void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
+ {
+     if(mask == 0 || width <= 0 || height <= 0 || stencil.format == FORMAT_NULL)
+     {
+         return;
+     }
+
+     // Not overlapping (a rectangle that only touches an edge has no pixels to clear)
+     if(x0 >= internal.width) return;
+     if(y0 >= internal.height) return;
+     if(x0 + width <= 0) return;
+     if(y0 + height <= 0) return;
+
+     // Clip against dimensions (width and height stay positive thanks to the tests above)
+     if(x0 < 0) {width += x0; x0 = 0;}
+     if(x0 + width > internal.width) width = internal.width - x0;
+     if(y0 < 0) {height += y0; y0 = 0;}
+     if(y0 + height > internal.height) height = internal.height - y0;
+
+     int x1 = x0 + width;
+     int y1 = y0 + height;
+
+     int oddX0 = (x0 & ~1) * 2 + (x0 & 1);   // Element index of pixel x0 within a quad row pair.
+     int oddX1 = (x1 & ~1) * 2;              // Element index of the quad containing x1.
+     int evenX0 = ((x0 + 1) & ~1) * 2;       // First quad that lies fully inside the rectangle.
+     int evenBytes = oddX1 - evenX0;
+
+     unsigned char maskedS = s & mask;
+     unsigned char invMask = ~mask;
+     unsigned int fill = maskedS;
+     fill = fill | (fill << 8) | (fill << 16) | (fill << 24);   // Replicate the byte into a 32-bit fill word.
+
+     char *buffer = (char*)lockStencil(0, 0, 0, PUBLIC);
+
+     // Stencil buffers are assumed to use quad layout
+     for(int z = 0; z < stencil.samples; z++)
+     {
+         for(int y = y0; y < y1; y++)
+         {
+             char *target = buffer + (y & ~1) * stencil.pitchP + (y & 1) * 2;
+
+             if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
+             {
+                 if((x0 & 1) != 0)
+                 {
+                     target[oddX0 + 0] = fill;
+                     target[oddX0 + 2] = fill;
+                 }
+
+                 memfill4(&target[evenX0], fill, evenBytes);
+
+                 if((x1 & 1) != 0)
+                 {
+                     target[oddX1 + 0] = fill;
+                     target[oddX1 + 2] = fill;
+                 }
+
+                 y++;   // The row below was filled together with this one.
+             }
+             else
+             {
+                 for(int x = x0; x < x1; x++)
+                 {
+                     int i = (x & ~1) * 2 + (x & 1);
+                     target[i] = maskedS | (target[i] & invMask);   // Keep the bits the mask does not cover.
+                 }
+             }
+         }
+
+         buffer += stencil.sliceP;
+     }
+
+     unlockStencil();
+ }
+
+ void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)  // Writes 'color' to every texel of the given rectangle, in the internal buffer if it is dirty and in the external one otherwise.
+ {
+     unsigned char *row;
+     Buffer *buffer;
+
+     if(internal.dirty)
+     {
+         row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
+         buffer = &internal;
+     }
+     else
+     {
+         row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
+         buffer = &external;
+     }
+
+     if(buffer->bytes == 1 || buffer->bytes == 2 || buffer->bytes == 4)   // Texel sizes that tile a 32-bit word; 3-byte texels cannot and take the generic path.
+     {
+         unsigned int c = 0;   // Zero first: write() presumably stores only buffer->bytes bytes, and the rest is OR-ed in below.
+         buffer->write(&c, color);
+         // Replicate narrow texels until the word holds a whole number of them.
+         if(buffer->bytes <= 1) c = (c << 8) | c;
+         if(buffer->bytes <= 2) c = (c << 16) | c;
+
+         for(int y = 0; y < height; y++)
+         {
+             memfill4(row, (int)c, width * buffer->bytes);
+
+             row += buffer->pitchB;
+         }
+     }
+     else   // Generic
+     {
+         for(int y = 0; y < height; y++)
+         {
+             unsigned char *element = row;
+
+             for(int x = 0; x < width; x++)
+             {
+                 buffer->write(element, color);
+
+                 element += buffer->bytes;
+             }
+
+             row += buffer->pitchB;
+         }
+     }
+
+     if(buffer == &internal)
+     {
+         unlockInternal();
+     }
+     else
+     {
+         unlockExternal();
+     }
+ }
+
+ void Surface::copyInternal(const Surface *source, int x, int y, float srcX, float srcY, bool filter)  // Copies one texel from source's internal buffer to (x, y) of this one; both must already be locked.
+ {
+     ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
+
+     sw::Color<float> texel;
+
+     if(filter)   // Bilinear filtering
+     {
+         texel = source->internal.sample(srcX, srcY, 0);
+     }
+     else   // Direct read at the truncated coordinates
+     {
+         texel = source->internal.read((int)srcX, (int)srcY, 0);
+     }
+
+     internal.write(x, y, texel);
+ }
+
+ void Surface::copyInternal(const Surface *source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)  // 3D variant: copies one texel to (x, y, z); both internal buffers must already be locked.
+ {
+     ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
+
+     sw::Color<float> texel;
+
+     if(filter)   // Bilinear filtering
+     {
+         texel = source->internal.sample(srcX, srcY, srcZ);
+     }
+     else   // Direct read at the truncated coordinates
+     {
+         texel = source->internal.read((int)srcX, (int)srcY, int(srcZ));
+     }
+
+     internal.write(x, y, z, texel);
+ }
+
+ void Surface::copyCubeEdge(Edge dstEdge, Surface *src, Edge srcEdge) // Copies the texels along 'srcEdge' of 'src' into the border along 'dstEdge' of this surface.
+ {
+ Surface *dst = this;
+
+ // Figure out if the edges to be copied in reverse order respectively from one another
+ // The copy should be reversed whenever the same edges are contiguous or if we're
+ // copying top <-> right or bottom <-> left. This is explained by the layout, which is:
+ //
+ // | +y |
+ // | -x | +z | +x | -z |
+ // | -y |
+
+ bool reverse = (srcEdge == dstEdge) ||
+ ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
+ ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
+ ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
+ ((srcEdge == LEFT) && (dstEdge == BOTTOM));
+
+ int srcBytes = src->bytes(src->Surface::getInternalFormat()); // Texel size in bytes of the source's internal format.
+ int srcPitch = src->getInternalPitchB();
+ int dstBytes = dst->bytes(dst->Surface::getInternalFormat());
+ int dstPitch = dst->getInternalPitchB();
+
+ int srcW = src->getWidth();
+ int srcH = src->getHeight();
+ int dstW = dst->getWidth();
+ int dstH = dst->getHeight();
+
+ ASSERT(srcW == srcH && dstW == dstH && srcW == dstW && srcBytes == dstBytes); // Both faces must be square, equally sized and share a texel size.
+
+ // Src is expressed in the regular [0, width-1], [0, height-1] space
+ int srcDelta = ((srcEdge == TOP) || (srcEdge == BOTTOM)) ? srcBytes : srcPitch; // Step along a row for horizontal edges, along a column for vertical ones.
+ int srcStart = ((srcEdge == BOTTOM) ? srcPitch * (srcH - 1) : ((srcEdge == RIGHT) ? srcBytes * (srcW - 1) : 0)); // Byte offset of the first texel on the source edge.
+
+ // Dst contains borders, so it is expressed in the [-1, width+1], [-1, height+1] space
+ int dstDelta = (((dstEdge == TOP) || (dstEdge == BOTTOM)) ? dstBytes : dstPitch) * (reverse ? -1 : 1); // Negative when the destination is walked backwards.
+ int dstStart = ((dstEdge == BOTTOM) ? dstPitch * (dstH + 1) : ((dstEdge == RIGHT) ? dstBytes * (dstW + 1) : 0)) + (reverse ? dstW * -dstDelta : dstDelta); // The trailing term moves off the corner texel, to the far end when reversed.
+
+ char *srcBuf = (char*)src->lockInternal(0, 0, 0, sw::LOCK_READONLY, sw::PRIVATE) + srcStart;
+ char *dstBuf = (char*)dst->lockInternal(-1, -1, 0, sw::LOCK_READWRITE, sw::PRIVATE) + dstStart; // Locked at (-1, -1) so offsets are relative to the border corner.
+
+ for(int i = 0; i < srcW; ++i, dstBuf += dstDelta, srcBuf += srcDelta) // One texel per iteration along the edge.
+ {
+ memcpy(dstBuf, srcBuf, srcBytes);
+ }
+
+ if(dstEdge == LEFT || dstEdge == RIGHT)
+ {
+ // TOP and BOTTOM are already set, let's average out the corners
+ int x0 = (dstEdge == RIGHT) ? dstW : -1; // Border column of the corner.
+ int y0 = -1;
+ int x1 = (dstEdge == RIGHT) ? dstW - 1 : 0; // Nearest interior column.
+ int y1 = 0;
+ dst->computeCubeCorner(x0, y0, x1, y1); // Corner in the top border row.
+ y0 = dstH;
+ y1 = dstH - 1;
+ dst->computeCubeCorner(x0, y0, x1, y1); // Corner in the bottom border row.
+ }
+
+ src->unlockInternal();
+ dst->unlockInternal();
+ }
+
+ void Surface::computeCubeCorner(int x0, int y0, int x1, int y1)  // Sets texel (x0, y0) to the average of (x0, y1), (x1, y0) and (x1, y1); the internal buffer must be locked.
+ {
+     ASSERT(internal.lock != LOCK_UNLOCKED);
+
+     sw::Color<float> average = internal.read(x0, y1);
+     average += internal.read(x1, y0);
+     average += internal.read(x1, y1);
+     average *= (1.0f / 3.0f);   // Three contributing texels.
+
+     internal.write(x0, y0, average);
+ }
+
+ bool Surface::hasStencil() const  // True when the external format is a stencil format according to isStencil().
+ {
+     return Surface::isStencil(this->external.format);
+ }
+
+ bool Surface::hasDepth() const  // True when the external format is a depth format according to isDepth().
+ {
+     return Surface::isDepth(this->external.format);
+ }
+
+ bool Surface::hasPalette() const  // True when the external format is a palette format according to isPalette().
+ {
+     return Surface::isPalette(this->external.format);
+ }
+
+ bool Surface::isRenderTarget() const  // Returns the stored renderTarget flag.
+ {
+     return this->renderTarget;
+ }
+
+ bool Surface::hasDirtyContents() const  // Returns the stored dirtyContents flag.
+ {
+     return this->dirtyContents;
+ }
+
+ void Surface::markContentsClean()  // Resets the dirtyContents flag.
+ {
+     this->dirtyContents = false;
+ }
+
+ Resource *Surface::getResource()  // Returns the resource pointer held by this surface.
+ {
+     return this->resource;
+ }
+
+ bool Surface::identicalBuffers() const  // True when the internal and external buffers agree on format, dimensions, strides, border and sample count.
+ {
+     return internal.format == external.format &&
+            internal.width == external.width &&
+            internal.height == external.height &&
+            internal.depth == external.depth &&
+            internal.border == external.border &&
+            internal.samples == external.samples &&
+            internal.pitchB == external.pitchB &&
+            internal.sliceB == external.sliceB;
+ }
+
+ Format Surface::selectInternalFormat(Format format) const
+ {
+ switch(format)
+ {
+ case FORMAT_NULL:
+ return FORMAT_NULL;
+ case FORMAT_P8:
+ case FORMAT_A8P8:
+ case FORMAT_A4R4G4B4:
+ case FORMAT_A1R5G5B5:
+ case FORMAT_A8R3G3B2:
+ return FORMAT_A8R8G8B8;
+ case FORMAT_A8:
+ return FORMAT_A8;
+ case FORMAT_R8I:
+ return FORMAT_R8I;
+ case FORMAT_R8UI:
+ return FORMAT_R8UI;
+ case FORMAT_R8_SNORM:
+ return FORMAT_R8_SNORM;
+ case FORMAT_R8:
+ return FORMAT_R8;
+ case FORMAT_R16I:
+ return FORMAT_R16I;
+ case FORMAT_R16UI:
+ return FORMAT_R16UI;
+ case FORMAT_R32I:
+ return FORMAT_R32I;
+ case FORMAT_R32UI:
+ return FORMAT_R32UI;
+ case FORMAT_X16B16G16R16I:
+ return FORMAT_X16B16G16R16I;
+ case FORMAT_A16B16G16R16I:
+ return FORMAT_A16B16G16R16I;
+ case FORMAT_X16B16G16R16UI:
+ return FORMAT_X16B16G16R16UI;
+ case FORMAT_A16B16G16R16UI:
+ return FORMAT_A16B16G16R16UI;
+ case FORMAT_A2R10G10B10:
+ case FORMAT_A2B10G10R10:
+ case FORMAT_A16B16G16R16:
+ return FORMAT_A16B16G16R16;
+ case FORMAT_A2B10G10R10UI:
+ return FORMAT_A16B16G16R16UI;
+ case FORMAT_X32B32G32R32I:
+ return FORMAT_X32B32G32R32I;
+ case FORMAT_A32B32G32R32I:
+ return FORMAT_A32B32G32R32I;
+ case FORMAT_X32B32G32R32UI:
+ return FORMAT_X32B32G32R32UI;
+ case FORMAT_A32B32G32R32UI:
+ return FORMAT_A32B32G32R32UI;
+ case FORMAT_G8R8I:
+ return FORMAT_G8R8I;
+ case FORMAT_G8R8UI:
+ return FORMAT_G8R8UI;
+ case FORMAT_G8R8_SNORM:
+ return FORMAT_G8R8_SNORM;
+ case FORMAT_G8R8:
+ return FORMAT_G8R8;
+ case FORMAT_G16R16I:
+ return FORMAT_G16R16I;
+ case FORMAT_G16R16UI:
+ return FORMAT_G16R16UI;
+ case FORMAT_G16R16:
+ return FORMAT_G16R16;
+ case FORMAT_G32R32I:
+ return FORMAT_G32R32I;
+ case FORMAT_G32R32UI:
+ return FORMAT_G32R32UI;
+ case FORMAT_A8R8G8B8:
+ if(lockable || !quadLayoutEnabled)
+ {
+ return FORMAT_A8R8G8B8;
+ }
+ else
+ {
+ return FORMAT_A8G8R8B8Q;
+ }
+ case FORMAT_A8B8G8R8I:
+ return FORMAT_A8B8G8R8I;
+ case FORMAT_A8B8G8R8UI:
+ return FORMAT_A8B8G8R8UI;
+ case FORMAT_A8B8G8R8_SNORM:
+ return FORMAT_A8B8G8R8_SNORM;
+ case FORMAT_R5G5B5A1:
+ case FORMAT_R4G4B4A4:
+ case FORMAT_A8B8G8R8:
+ return FORMAT_A8B8G8R8;
+ case FORMAT_R5G6B5:
+ return FORMAT_R5G6B5;
+ case FORMAT_R3G3B2:
+ case FORMAT_R8G8B8:
+ case FORMAT_X4R4G4B4:
+ case FORMAT_X1R5G5B5:
+ case FORMAT_X8R8G8B8:
+ if(lockable || !quadLayoutEnabled)
+ {
+ return FORMAT_X8R8G8B8;
+ }
+ else
+ {
+ return FORMAT_X8G8R8B8Q;
+ }
+ case FORMAT_X8B8G8R8I:
+ return FORMAT_X8B8G8R8I;
+ case FORMAT_X8B8G8R8UI:
+ return FORMAT_X8B8G8R8UI;
+ case FORMAT_X8B8G8R8_SNORM:
+ return FORMAT_X8B8G8R8_SNORM;
+ case FORMAT_B8G8R8:
+ case FORMAT_X8B8G8R8:
+ return FORMAT_X8B8G8R8;
+ case FORMAT_SRGB8_X8:
+ return FORMAT_SRGB8_X8;
+ case FORMAT_SRGB8_A8:
+ return FORMAT_SRGB8_A8;
+ // Compressed formats
+ case FORMAT_DXT1:
+ case FORMAT_DXT3:
+ case FORMAT_DXT5:
+ case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+ case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+ case FORMAT_RGBA8_ETC2_EAC:
+ case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
+ case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
+ case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
+ return FORMAT_A8R8G8B8;
+ case FORMAT_RGBA_ASTC_4x4_KHR:
+ case FORMAT_RGBA_ASTC_5x4_KHR:
+ case FORMAT_RGBA_ASTC_5x5_KHR:
+ case FORMAT_RGBA_ASTC_6x5_KHR:
+ case FORMAT_RGBA_ASTC_6x6_KHR:
+ case FORMAT_RGBA_ASTC_8x5_KHR:
+ case FORMAT_RGBA_ASTC_8x6_KHR:
+ case FORMAT_RGBA_ASTC_8x8_KHR:
+ case FORMAT_RGBA_ASTC_10x5_KHR:
+ case FORMAT_RGBA_ASTC_10x6_KHR:
+ case FORMAT_RGBA_ASTC_10x8_KHR:
+ case FORMAT_RGBA_ASTC_10x10_KHR:
+ case FORMAT_RGBA_ASTC_12x10_KHR:
+ case FORMAT_RGBA_ASTC_12x12_KHR:
+ // ASTC supports HDR, so a floating point format is required to represent it properly
+ return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
+ case FORMAT_ATI1:
+ return FORMAT_R8;
+ case FORMAT_R11_EAC:
+ case FORMAT_SIGNED_R11_EAC:
+ return FORMAT_R32F; // FIXME: Signed 8bit format would be sufficient
+ case FORMAT_ATI2:
+ return FORMAT_G8R8;
+ case FORMAT_RG11_EAC:
+ case FORMAT_SIGNED_RG11_EAC:
+ return FORMAT_G32R32F; // FIXME: Signed 8bit format would be sufficient
+ case FORMAT_ETC1:
+ case FORMAT_RGB8_ETC2:
+ case FORMAT_SRGB8_ETC2:
+ return FORMAT_X8R8G8B8;
+ // Bumpmap formats
+ case FORMAT_V8U8: return FORMAT_V8U8;
+ case FORMAT_L6V5U5: return FORMAT_X8L8V8U8;
+ case FORMAT_Q8W8V8U8: return FORMAT_Q8W8V8U8;
+ case FORMAT_X8L8V8U8: return FORMAT_X8L8V8U8;
+ case FORMAT_V16U16: return FORMAT_V16U16;
+ case FORMAT_A2W10V10U10: return FORMAT_A16W16V16U16;
+ case FORMAT_Q16W16V16U16: return FORMAT_Q16W16V16U16;
+ // Floating-point formats
+ case FORMAT_A16F: return FORMAT_A32B32G32R32F;
+ case FORMAT_R16F: return FORMAT_R32F;
+ case FORMAT_G16R16F: return FORMAT_G32R32F;
+ case FORMAT_B16G16R16F: return FORMAT_X32B32G32R32F;
+ case FORMAT_X16B16G16R16F: return FORMAT_X32B32G32R32F;
+ case FORMAT_A16B16G16R16F: return FORMAT_A32B32G32R32F;
+ case FORMAT_X16B16G16R16F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
+ case FORMAT_A32F: return FORMAT_A32B32G32R32F;
+ case FORMAT_R32F: return FORMAT_R32F;
+ case FORMAT_G32R32F: return FORMAT_G32R32F;
+ case FORMAT_B32G32R32F: return FORMAT_X32B32G32R32F;
+ case FORMAT_X32B32G32R32F: return FORMAT_X32B32G32R32F;
+ case FORMAT_A32B32G32R32F: return FORMAT_A32B32G32R32F;
+ case FORMAT_X32B32G32R32F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
+ // Luminance formats
+ case FORMAT_L8: return FORMAT_L8;
+ case FORMAT_A4L4: return FORMAT_A8L8;
+ case FORMAT_L16: return FORMAT_L16;
+ case FORMAT_A8L8: return FORMAT_A8L8;
+ case FORMAT_L16F: return FORMAT_X32B32G32R32F;
+ case FORMAT_A16L16F: return FORMAT_A32B32G32R32F;
+ case FORMAT_L32F: return FORMAT_X32B32G32R32F;
+ case FORMAT_A32L32F: return FORMAT_A32B32G32R32F;
+ // Depth/stencil formats
+ case FORMAT_D16:
+ case FORMAT_D32:
+ case FORMAT_D24X8:
+ if(hasParent) // Texture
+ {
+ return FORMAT_D32F_SHADOW;
+ }
+ else if(complementaryDepthBuffer)
+ {
+ return FORMAT_D32F_COMPLEMENTARY;
+ }
+ else
+ {
+ return FORMAT_D32F;
+ }
+ case FORMAT_D24S8:
+ case FORMAT_D24FS8:
+ if(hasParent) // Texture
+ {
+ return FORMAT_D32FS8_SHADOW;
+ }
+ else if(complementaryDepthBuffer)
+ {
+ return FORMAT_D32FS8_COMPLEMENTARY;
+ }
+ else
+ {
+ return FORMAT_D32FS8;
+ }
+ case FORMAT_D32F: return FORMAT_D32F;
+ case FORMAT_D32FS8: return FORMAT_D32FS8;
+ case FORMAT_D32F_LOCKABLE: return FORMAT_D32F_LOCKABLE;
+ case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
+ case FORMAT_INTZ: return FORMAT_D32FS8_TEXTURE;
+ case FORMAT_DF24S8: return FORMAT_D32FS8_SHADOW;
+ case FORMAT_DF16S8: return FORMAT_D32FS8_SHADOW;
+ case FORMAT_S8: return FORMAT_S8;
+ // YUV formats
+ case FORMAT_YV12_BT601: return FORMAT_YV12_BT601;
+ case FORMAT_YV12_BT709: return FORMAT_YV12_BT709;
+ case FORMAT_YV12_JFIF: return FORMAT_YV12_JFIF;
+ default:
+ ASSERT(false);
+ }
+
+ return FORMAT_NULL;
+ }
+
+ void Surface::setTexturePalette(unsigned int *palette)   // Records 'palette' in Surface::palette and advances Surface::paletteID.
+ {
+ Surface::palette = palette;   // Qualified name is required: the parameter shadows the member. Only the pointer is stored.
+ ++Surface::paletteID;   // Incremented on every call, even when the same pointer is passed again (presumably a change marker - confirm with readers of paletteID).
+ }
+
+ void Surface::resolve()
+ {
+ if(internal.samples <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
+ {
+ return;
+ }
+
+ ASSERT(internal.depth == 1); // Unimplemented
+
+ void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
+
+ int width = internal.width;
+ int height = internal.height;
+ int pitch = internal.pitchB;
+ int slice = internal.sliceB;
+
+ unsigned char *source0 = (unsigned char*)source;
+ unsigned char *source1 = source0 + slice;
+ unsigned char *source2 = source1 + slice;
+ unsigned char *source3 = source2 + slice;
+ unsigned char *source4 = source3 + slice;
+ unsigned char *source5 = source4 + slice;
+ unsigned char *source6 = source5 + slice;
+ unsigned char *source7 = source6 + slice;
+ unsigned char *source8 = source7 + slice;
+ unsigned char *source9 = source8 + slice;
+ unsigned char *sourceA = source9 + slice;
+ unsigned char *sourceB = sourceA + slice;
+ unsigned char *sourceC = sourceB + slice;
+ unsigned char *sourceD = sourceC + slice;
+ unsigned char *sourceE = sourceD + slice;
+ unsigned char *sourceF = sourceE + slice;
+
+ if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
+ internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
+ internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
+ {
+ #if defined(__i386__) || defined(__x86_64__)
+ if(CPUID::supportsSSE2() && (width % 4) == 0)
+ {
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 4)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+
+ c0 = _mm_avg_epu8(c0, c1);
+
+ _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 4)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+ __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+ __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+
+ c0 = _mm_avg_epu8(c0, c1);
+ c2 = _mm_avg_epu8(c2, c3);
+ c0 = _mm_avg_epu8(c0, c2);
+
+ _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 4)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+ __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+ __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+ __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+ __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+ __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+ __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+
+ c0 = _mm_avg_epu8(c0, c1);
+ c2 = _mm_avg_epu8(c2, c3);
+ c4 = _mm_avg_epu8(c4, c5);
+ c6 = _mm_avg_epu8(c6, c7);
+ c0 = _mm_avg_epu8(c0, c2);
+ c4 = _mm_avg_epu8(c4, c6);
+ c0 = _mm_avg_epu8(c0, c4);
+
+ _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 4)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+ __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+ __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+ __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+ __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+ __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+ __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+ __m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
+ __m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
+ __m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
+ __m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
+ __m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
+ __m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
+ __m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
+ __m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
+
+ c0 = _mm_avg_epu8(c0, c1);
+ c2 = _mm_avg_epu8(c2, c3);
+ c4 = _mm_avg_epu8(c4, c5);
+ c6 = _mm_avg_epu8(c6, c7);
+ c8 = _mm_avg_epu8(c8, c9);
+ cA = _mm_avg_epu8(cA, cB);
+ cC = _mm_avg_epu8(cC, cD);
+ cE = _mm_avg_epu8(cE, cF);
+ c0 = _mm_avg_epu8(c0, c2);
+ c4 = _mm_avg_epu8(c4, c6);
+ c8 = _mm_avg_epu8(c8, cA);
+ cC = _mm_avg_epu8(cC, cE);
+ c0 = _mm_avg_epu8(c0, c4);
+ c8 = _mm_avg_epu8(c8, cC);
+ c0 = _mm_avg_epu8(c0, c8);
+
+ _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+ }
+ else
+ #endif
+ {
+ #define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
+
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+ unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+
+ c0 = AVERAGE(c0, c1);
+
+ *(unsigned int*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+ unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+ unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+ unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+
+ c0 = AVERAGE(c0, c1);
+ c2 = AVERAGE(c2, c3);
+ c0 = AVERAGE(c0, c2);
+
+ *(unsigned int*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+ unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+ unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+ unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+ unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
+ unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
+ unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
+ unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
+
+ c0 = AVERAGE(c0, c1);
+ c2 = AVERAGE(c2, c3);
+ c4 = AVERAGE(c4, c5);
+ c6 = AVERAGE(c6, c7);
+ c0 = AVERAGE(c0, c2);
+ c4 = AVERAGE(c4, c6);
+ c0 = AVERAGE(c0, c4);
+
+ *(unsigned int*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+ unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+ unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+ unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+ unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
+ unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
+ unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
+ unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
+ unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
+ unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
+ unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
+ unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
+ unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
+ unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
+ unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
+ unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
+
+ c0 = AVERAGE(c0, c1);
+ c2 = AVERAGE(c2, c3);
+ c4 = AVERAGE(c4, c5);
+ c6 = AVERAGE(c6, c7);
+ c8 = AVERAGE(c8, c9);
+ cA = AVERAGE(cA, cB);
+ cC = AVERAGE(cC, cD);
+ cE = AVERAGE(cE, cF);
+ c0 = AVERAGE(c0, c2);
+ c4 = AVERAGE(c4, c6);
+ c8 = AVERAGE(c8, cA);
+ cC = AVERAGE(cC, cE);
+ c0 = AVERAGE(c0, c4);
+ c8 = AVERAGE(c8, cC);
+ c0 = AVERAGE(c0, c8);
+
+ *(unsigned int*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+
+ #undef AVERAGE
+ }
+ }
+ else if(internal.format == FORMAT_G16R16)
+ {
+
+ #if defined(__i386__) || defined(__x86_64__)
+ if(CPUID::supportsSSE2() && (width % 4) == 0)
+ {
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 4)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+
+ c0 = _mm_avg_epu16(c0, c1);
+
+ _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 4)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+ __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+ __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+
+ c0 = _mm_avg_epu16(c0, c1);
+ c2 = _mm_avg_epu16(c2, c3);
+ c0 = _mm_avg_epu16(c0, c2);
+
+ _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 4)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+ __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+ __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+ __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+ __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+ __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+ __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+
+ c0 = _mm_avg_epu16(c0, c1);
+ c2 = _mm_avg_epu16(c2, c3);
+ c4 = _mm_avg_epu16(c4, c5);
+ c6 = _mm_avg_epu16(c6, c7);
+ c0 = _mm_avg_epu16(c0, c2);
+ c4 = _mm_avg_epu16(c4, c6);
+ c0 = _mm_avg_epu16(c0, c4);
+
+ _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 4)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+ __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+ __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+ __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+ __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+ __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+ __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+ __m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
+ __m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
+ __m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
+ __m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
+ __m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
+ __m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
+ __m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
+ __m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
+
+ c0 = _mm_avg_epu16(c0, c1);
+ c2 = _mm_avg_epu16(c2, c3);
+ c4 = _mm_avg_epu16(c4, c5);
+ c6 = _mm_avg_epu16(c6, c7);
+ c8 = _mm_avg_epu16(c8, c9);
+ cA = _mm_avg_epu16(cA, cB);
+ cC = _mm_avg_epu16(cC, cD);
+ cE = _mm_avg_epu16(cE, cF);
+ c0 = _mm_avg_epu16(c0, c2);
+ c4 = _mm_avg_epu16(c4, c6);
+ c8 = _mm_avg_epu16(c8, cA);
+ cC = _mm_avg_epu16(cC, cE);
+ c0 = _mm_avg_epu16(c0, c4);
+ c8 = _mm_avg_epu16(c8, cC);
+ c0 = _mm_avg_epu16(c0, c8);
+
+ _mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+ }
+ else
+ #endif
+ {
+ #define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
+
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+ unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+
+ c0 = AVERAGE(c0, c1);
+
+ *(unsigned int*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+ unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+ unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+ unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+
+ c0 = AVERAGE(c0, c1);
+ c2 = AVERAGE(c2, c3);
+ c0 = AVERAGE(c0, c2);
+
+ *(unsigned int*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+ unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+ unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+ unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+ unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
+ unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
+ unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
+ unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
+
+ c0 = AVERAGE(c0, c1);
+ c2 = AVERAGE(c2, c3);
+ c4 = AVERAGE(c4, c5);
+ c6 = AVERAGE(c6, c7);
+ c0 = AVERAGE(c0, c2);
+ c4 = AVERAGE(c4, c6);
+ c0 = AVERAGE(c0, c4);
+
+ *(unsigned int*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+ unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+ unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+ unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+ unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
+ unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
+ unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
+ unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
+ unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
+ unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
+ unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
+ unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
+ unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
+ unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
+ unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
+ unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
+
+ c0 = AVERAGE(c0, c1);
+ c2 = AVERAGE(c2, c3);
+ c4 = AVERAGE(c4, c5);
+ c6 = AVERAGE(c6, c7);
+ c8 = AVERAGE(c8, c9);
+ cA = AVERAGE(cA, cB);
+ cC = AVERAGE(cC, cD);
+ cE = AVERAGE(cE, cF);
+ c0 = AVERAGE(c0, c2);
+ c4 = AVERAGE(c4, c6);
+ c8 = AVERAGE(c8, cA);
+ cC = AVERAGE(cC, cE);
+ c0 = AVERAGE(c0, c4);
+ c8 = AVERAGE(c8, cC);
+ c0 = AVERAGE(c0, c8);
+
+ *(unsigned int*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+
+ #undef AVERAGE
+ }
+ }
+ else if(internal.format == FORMAT_A16B16G16R16)
+ {
+ #if defined(__i386__) || defined(__x86_64__)
+ if(CPUID::supportsSSE2() && (width % 2) == 0)
+ {
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 2)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+
+ c0 = _mm_avg_epu16(c0, c1);
+
+ _mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 2)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+ __m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
+ __m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
+
+ c0 = _mm_avg_epu16(c0, c1);
+ c2 = _mm_avg_epu16(c2, c3);
+ c0 = _mm_avg_epu16(c0, c2);
+
+ _mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 2)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+ __m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
+ __m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
+ __m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
+ __m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
+ __m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
+ __m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
+
+ c0 = _mm_avg_epu16(c0, c1);
+ c2 = _mm_avg_epu16(c2, c3);
+ c4 = _mm_avg_epu16(c4, c5);
+ c6 = _mm_avg_epu16(c6, c7);
+ c0 = _mm_avg_epu16(c0, c2);
+ c4 = _mm_avg_epu16(c4, c6);
+ c0 = _mm_avg_epu16(c0, c4);
+
+ _mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 2)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+ __m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
+ __m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
+ __m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
+ __m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
+ __m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
+ __m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
+ __m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
+ __m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
+ __m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
+ __m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
+ __m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
+ __m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
+ __m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
+ __m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
+
+ c0 = _mm_avg_epu16(c0, c1);
+ c2 = _mm_avg_epu16(c2, c3);
+ c4 = _mm_avg_epu16(c4, c5);
+ c6 = _mm_avg_epu16(c6, c7);
+ c8 = _mm_avg_epu16(c8, c9);
+ cA = _mm_avg_epu16(cA, cB);
+ cC = _mm_avg_epu16(cC, cD);
+ cE = _mm_avg_epu16(cE, cF);
+ c0 = _mm_avg_epu16(c0, c2);
+ c4 = _mm_avg_epu16(c4, c6);
+ c8 = _mm_avg_epu16(c8, cA);
+ cC = _mm_avg_epu16(cC, cE);
+ c0 = _mm_avg_epu16(c0, c4);
+ c8 = _mm_avg_epu16(c8, cC);
+ c0 = _mm_avg_epu16(c0, c8);
+
+ _mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+ }
+ else
+ #endif
+ {
+ #define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
+
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < 2 * width; x++)
+ {
+ unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+ unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+
+ c0 = AVERAGE(c0, c1);
+
+ *(unsigned int*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < 2 * width; x++)
+ {
+ unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+ unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+ unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+ unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+
+ c0 = AVERAGE(c0, c1);
+ c2 = AVERAGE(c2, c3);
+ c0 = AVERAGE(c0, c2);
+
+ *(unsigned int*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < 2 * width; x++)
+ {
+ unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+ unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+ unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+ unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+ unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
+ unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
+ unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
+ unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
+
+ c0 = AVERAGE(c0, c1);
+ c2 = AVERAGE(c2, c3);
+ c4 = AVERAGE(c4, c5);
+ c6 = AVERAGE(c6, c7);
+ c0 = AVERAGE(c0, c2);
+ c4 = AVERAGE(c4, c6);
+ c0 = AVERAGE(c0, c4);
+
+ *(unsigned int*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < 2 * width; x++)
+ {
+ unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+ unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+ unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+ unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+ unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
+ unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
+ unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
+ unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
+ unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
+ unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
+ unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
+ unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
+ unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
+ unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
+ unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
+ unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
+
+ c0 = AVERAGE(c0, c1);
+ c2 = AVERAGE(c2, c3);
+ c4 = AVERAGE(c4, c5);
+ c6 = AVERAGE(c6, c7);
+ c8 = AVERAGE(c8, c9);
+ cA = AVERAGE(cA, cB);
+ cC = AVERAGE(cC, cD);
+ cE = AVERAGE(cE, cF);
+ c0 = AVERAGE(c0, c2);
+ c4 = AVERAGE(c4, c6);
+ c8 = AVERAGE(c8, cA);
+ cC = AVERAGE(cC, cE);
+ c0 = AVERAGE(c0, c4);
+ c8 = AVERAGE(c8, cC);
+ c0 = AVERAGE(c0, c8);
+
+ *(unsigned int*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+
+ #undef AVERAGE
+ }
+ }
+ else if(internal.format == FORMAT_R32F)
+ {
+ #if defined(__i386__) || defined(__x86_64__)
+ if(CPUID::supportsSSE() && (width % 4) == 0)
+ {
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 4)
+ {
+ __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+ __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+
+ c0 = _mm_add_ps(c0, c1);
+ c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
+
+ _mm_store_ps((float*)(source0 + 4 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 4)
+ {
+ __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+ __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+ __m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
+ __m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
+
+ c0 = _mm_add_ps(c0, c1);
+ c2 = _mm_add_ps(c2, c3);
+ c0 = _mm_add_ps(c0, c2);
+ c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
+
+ _mm_store_ps((float*)(source0 + 4 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 4)
+ {
+ __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+ __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+ __m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
+ __m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
+ __m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
+ __m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
+ __m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
+ __m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
+
+ c0 = _mm_add_ps(c0, c1);
+ c2 = _mm_add_ps(c2, c3);
+ c4 = _mm_add_ps(c4, c5);
+ c6 = _mm_add_ps(c6, c7);
+ c0 = _mm_add_ps(c0, c2);
+ c4 = _mm_add_ps(c4, c6);
+ c0 = _mm_add_ps(c0, c4);
+ c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
+
+ _mm_store_ps((float*)(source0 + 4 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 4)
+ {
+ __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+ __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+ __m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
+ __m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
+ __m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
+ __m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
+ __m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
+ __m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
+ __m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
+ __m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
+ __m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
+ __m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
+ __m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
+ __m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
+ __m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
+ __m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
+
+ c0 = _mm_add_ps(c0, c1);
+ c2 = _mm_add_ps(c2, c3);
+ c4 = _mm_add_ps(c4, c5);
+ c6 = _mm_add_ps(c6, c7);
+ c8 = _mm_add_ps(c8, c9);
+ cA = _mm_add_ps(cA, cB);
+ cC = _mm_add_ps(cC, cD);
+ cE = _mm_add_ps(cE, cF);
+ c0 = _mm_add_ps(c0, c2);
+ c4 = _mm_add_ps(c4, c6);
+ c8 = _mm_add_ps(c8, cA);
+ cC = _mm_add_ps(cC, cE);
+ c0 = _mm_add_ps(c0, c4);
+ c8 = _mm_add_ps(c8, cC);
+ c0 = _mm_add_ps(c0, c8);
+ c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
+
+ _mm_store_ps((float*)(source0 + 4 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+ }
+ else
+ #endif
+ {
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ float c0 = *(float*)(source0 + 4 * x);
+ float c1 = *(float*)(source1 + 4 * x);
+
+ c0 = c0 + c1;
+ c0 *= 1.0f / 2.0f;
+
+ *(float*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ float c0 = *(float*)(source0 + 4 * x);
+ float c1 = *(float*)(source1 + 4 * x);
+ float c2 = *(float*)(source2 + 4 * x);
+ float c3 = *(float*)(source3 + 4 * x);
+
+ c0 = c0 + c1;
+ c2 = c2 + c3;
+ c0 = c0 + c2;
+ c0 *= 1.0f / 4.0f;
+
+ *(float*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ float c0 = *(float*)(source0 + 4 * x);
+ float c1 = *(float*)(source1 + 4 * x);
+ float c2 = *(float*)(source2 + 4 * x);
+ float c3 = *(float*)(source3 + 4 * x);
+ float c4 = *(float*)(source4 + 4 * x);
+ float c5 = *(float*)(source5 + 4 * x);
+ float c6 = *(float*)(source6 + 4 * x);
+ float c7 = *(float*)(source7 + 4 * x);
+
+ c0 = c0 + c1;
+ c2 = c2 + c3;
+ c4 = c4 + c5;
+ c6 = c6 + c7;
+ c0 = c0 + c2;
+ c4 = c4 + c6;
+ c0 = c0 + c4;
+ c0 *= 1.0f / 8.0f;
+
+ *(float*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ float c0 = *(float*)(source0 + 4 * x);
+ float c1 = *(float*)(source1 + 4 * x);
+ float c2 = *(float*)(source2 + 4 * x);
+ float c3 = *(float*)(source3 + 4 * x);
+ float c4 = *(float*)(source4 + 4 * x);
+ float c5 = *(float*)(source5 + 4 * x);
+ float c6 = *(float*)(source6 + 4 * x);
+ float c7 = *(float*)(source7 + 4 * x);
+ float c8 = *(float*)(source8 + 4 * x);
+ float c9 = *(float*)(source9 + 4 * x);
+ float cA = *(float*)(sourceA + 4 * x);
+ float cB = *(float*)(sourceB + 4 * x);
+ float cC = *(float*)(sourceC + 4 * x);
+ float cD = *(float*)(sourceD + 4 * x);
+ float cE = *(float*)(sourceE + 4 * x);
+ float cF = *(float*)(sourceF + 4 * x);
+
+ c0 = c0 + c1;
+ c2 = c2 + c3;
+ c4 = c4 + c5;
+ c6 = c6 + c7;
+ c8 = c8 + c9;
+ cA = cA + cB;
+ cC = cC + cD;
+ cE = cE + cF;
+ c0 = c0 + c2;
+ c4 = c4 + c6;
+ c8 = c8 + cA;
+ cC = cC + cE;
+ c0 = c0 + c4;
+ c8 = c8 + cC;
+ c0 = c0 + c8;
+ c0 *= 1.0f / 16.0f;
+
+ *(float*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+ }
+ }
+ else if(internal.format == FORMAT_G32R32F)
+ {
+ #if defined(__i386__) || defined(__x86_64__)
+ if(CPUID::supportsSSE() && (width % 2) == 0)
+ {
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 2)
+ {
+ __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+ __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+
+ c0 = _mm_add_ps(c0, c1);
+ c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
+
+ _mm_store_ps((float*)(source0 + 8 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 2)
+ {
+ __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+ __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+ __m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
+ __m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
+
+ c0 = _mm_add_ps(c0, c1);
+ c2 = _mm_add_ps(c2, c3);
+ c0 = _mm_add_ps(c0, c2);
+ c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
+
+ _mm_store_ps((float*)(source0 + 8 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 2)
+ {
+ __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+ __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+ __m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
+ __m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
+ __m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
+ __m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
+ __m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
+ __m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
+
+ c0 = _mm_add_ps(c0, c1);
+ c2 = _mm_add_ps(c2, c3);
+ c4 = _mm_add_ps(c4, c5);
+ c6 = _mm_add_ps(c6, c7);
+ c0 = _mm_add_ps(c0, c2);
+ c4 = _mm_add_ps(c4, c6);
+ c0 = _mm_add_ps(c0, c4);
+ c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
+
+ _mm_store_ps((float*)(source0 + 8 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 2)
+ {
+ __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+ __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+ __m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
+ __m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
+ __m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
+ __m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
+ __m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
+ __m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
+ __m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
+ __m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
+ __m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
+ __m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
+ __m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
+ __m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
+ __m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
+ __m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
+
+ c0 = _mm_add_ps(c0, c1);
+ c2 = _mm_add_ps(c2, c3);
+ c4 = _mm_add_ps(c4, c5);
+ c6 = _mm_add_ps(c6, c7);
+ c8 = _mm_add_ps(c8, c9);
+ cA = _mm_add_ps(cA, cB);
+ cC = _mm_add_ps(cC, cD);
+ cE = _mm_add_ps(cE, cF);
+ c0 = _mm_add_ps(c0, c2);
+ c4 = _mm_add_ps(c4, c6);
+ c8 = _mm_add_ps(c8, cA);
+ cC = _mm_add_ps(cC, cE);
+ c0 = _mm_add_ps(c0, c4);
+ c8 = _mm_add_ps(c8, cC);
+ c0 = _mm_add_ps(c0, c8);
+ c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
+
+ _mm_store_ps((float*)(source0 + 8 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+ }
+ else
+ #endif
+ {
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < 2 * width; x++)
+ {
+ float c0 = *(float*)(source0 + 4 * x);
+ float c1 = *(float*)(source1 + 4 * x);
+
+ c0 = c0 + c1;
+ c0 *= 1.0f / 2.0f;
+
+ *(float*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < 2 * width; x++)
+ {
+ float c0 = *(float*)(source0 + 4 * x);
+ float c1 = *(float*)(source1 + 4 * x);
+ float c2 = *(float*)(source2 + 4 * x);
+ float c3 = *(float*)(source3 + 4 * x);
+
+ c0 = c0 + c1;
+ c2 = c2 + c3;
+ c0 = c0 + c2;
+ c0 *= 1.0f / 4.0f;
+
+ *(float*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < 2 * width; x++)
+ {
+ float c0 = *(float*)(source0 + 4 * x);
+ float c1 = *(float*)(source1 + 4 * x);
+ float c2 = *(float*)(source2 + 4 * x);
+ float c3 = *(float*)(source3 + 4 * x);
+ float c4 = *(float*)(source4 + 4 * x);
+ float c5 = *(float*)(source5 + 4 * x);
+ float c6 = *(float*)(source6 + 4 * x);
+ float c7 = *(float*)(source7 + 4 * x);
+
+ c0 = c0 + c1;
+ c2 = c2 + c3;
+ c4 = c4 + c5;
+ c6 = c6 + c7;
+ c0 = c0 + c2;
+ c4 = c4 + c6;
+ c0 = c0 + c4;
+ c0 *= 1.0f / 8.0f;
+
+ *(float*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < 2 * width; x++)
+ {
+ float c0 = *(float*)(source0 + 4 * x);
+ float c1 = *(float*)(source1 + 4 * x);
+ float c2 = *(float*)(source2 + 4 * x);
+ float c3 = *(float*)(source3 + 4 * x);
+ float c4 = *(float*)(source4 + 4 * x);
+ float c5 = *(float*)(source5 + 4 * x);
+ float c6 = *(float*)(source6 + 4 * x);
+ float c7 = *(float*)(source7 + 4 * x);
+ float c8 = *(float*)(source8 + 4 * x);
+ float c9 = *(float*)(source9 + 4 * x);
+ float cA = *(float*)(sourceA + 4 * x);
+ float cB = *(float*)(sourceB + 4 * x);
+ float cC = *(float*)(sourceC + 4 * x);
+ float cD = *(float*)(sourceD + 4 * x);
+ float cE = *(float*)(sourceE + 4 * x);
+ float cF = *(float*)(sourceF + 4 * x);
+
+ c0 = c0 + c1;
+ c2 = c2 + c3;
+ c4 = c4 + c5;
+ c6 = c6 + c7;
+ c8 = c8 + c9;
+ cA = cA + cB;
+ cC = cC + cD;
+ cE = cE + cF;
+ c0 = c0 + c2;
+ c4 = c4 + c6;
+ c8 = c8 + cA;
+ cC = cC + cE;
+ c0 = c0 + c4;
+ c8 = c8 + cC;
+ c0 = c0 + c8;
+ c0 *= 1.0f / 16.0f;
+
+ *(float*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+ }
+ }
+ else if(internal.format == FORMAT_A32B32G32R32F ||
+ internal.format == FORMAT_X32B32G32R32F ||
+ internal.format == FORMAT_X32B32G32R32F_UNSIGNED)
+ {
+ #if defined(__i386__) || defined(__x86_64__)
+ if(CPUID::supportsSSE())
+ {
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+ __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+
+ c0 = _mm_add_ps(c0, c1);
+ c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
+
+ _mm_store_ps((float*)(source0 + 16 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+ __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+ __m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
+ __m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
+
+ c0 = _mm_add_ps(c0, c1);
+ c2 = _mm_add_ps(c2, c3);
+ c0 = _mm_add_ps(c0, c2);
+ c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
+
+ _mm_store_ps((float*)(source0 + 16 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+ __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+ __m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
+ __m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
+ __m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
+ __m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
+ __m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
+ __m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
+
+ c0 = _mm_add_ps(c0, c1);
+ c2 = _mm_add_ps(c2, c3);
+ c4 = _mm_add_ps(c4, c5);
+ c6 = _mm_add_ps(c6, c7);
+ c0 = _mm_add_ps(c0, c2);
+ c4 = _mm_add_ps(c4, c6);
+ c0 = _mm_add_ps(c0, c4);
+ c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
+
+ _mm_store_ps((float*)(source0 + 16 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+ __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+ __m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
+ __m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
+ __m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
+ __m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
+ __m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
+ __m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
+ __m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
+ __m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
+ __m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
+ __m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
+ __m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
+ __m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
+ __m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
+ __m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
+
+ c0 = _mm_add_ps(c0, c1);
+ c2 = _mm_add_ps(c2, c3);
+ c4 = _mm_add_ps(c4, c5);
+ c6 = _mm_add_ps(c6, c7);
+ c8 = _mm_add_ps(c8, c9);
+ cA = _mm_add_ps(cA, cB);
+ cC = _mm_add_ps(cC, cD);
+ cE = _mm_add_ps(cE, cF);
+ c0 = _mm_add_ps(c0, c2);
+ c4 = _mm_add_ps(c4, c6);
+ c8 = _mm_add_ps(c8, cA);
+ cC = _mm_add_ps(cC, cE);
+ c0 = _mm_add_ps(c0, c4);
+ c8 = _mm_add_ps(c8, cC);
+ c0 = _mm_add_ps(c0, c8);
+ c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
+
+ _mm_store_ps((float*)(source0 + 16 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+ }
+ else
+ #endif
+ {
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < 4 * width; x++)
+ {
+ float c0 = *(float*)(source0 + 4 * x);
+ float c1 = *(float*)(source1 + 4 * x);
+
+ c0 = c0 + c1;
+ c0 *= 1.0f / 2.0f;
+
+ *(float*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < 4 * width; x++)
+ {
+ float c0 = *(float*)(source0 + 4 * x);
+ float c1 = *(float*)(source1 + 4 * x);
+ float c2 = *(float*)(source2 + 4 * x);
+ float c3 = *(float*)(source3 + 4 * x);
+
+ c0 = c0 + c1;
+ c2 = c2 + c3;
+ c0 = c0 + c2;
+ c0 *= 1.0f / 4.0f;
+
+ *(float*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < 4 * width; x++)
+ {
+ float c0 = *(float*)(source0 + 4 * x);
+ float c1 = *(float*)(source1 + 4 * x);
+ float c2 = *(float*)(source2 + 4 * x);
+ float c3 = *(float*)(source3 + 4 * x);
+ float c4 = *(float*)(source4 + 4 * x);
+ float c5 = *(float*)(source5 + 4 * x);
+ float c6 = *(float*)(source6 + 4 * x);
+ float c7 = *(float*)(source7 + 4 * x);
+
+ c0 = c0 + c1;
+ c2 = c2 + c3;
+ c4 = c4 + c5;
+ c6 = c6 + c7;
+ c0 = c0 + c2;
+ c4 = c4 + c6;
+ c0 = c0 + c4;
+ c0 *= 1.0f / 8.0f;
+
+ *(float*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < 4 * width; x++)
+ {
+ float c0 = *(float*)(source0 + 4 * x);
+ float c1 = *(float*)(source1 + 4 * x);
+ float c2 = *(float*)(source2 + 4 * x);
+ float c3 = *(float*)(source3 + 4 * x);
+ float c4 = *(float*)(source4 + 4 * x);
+ float c5 = *(float*)(source5 + 4 * x);
+ float c6 = *(float*)(source6 + 4 * x);
+ float c7 = *(float*)(source7 + 4 * x);
+ float c8 = *(float*)(source8 + 4 * x);
+ float c9 = *(float*)(source9 + 4 * x);
+ float cA = *(float*)(sourceA + 4 * x);
+ float cB = *(float*)(sourceB + 4 * x);
+ float cC = *(float*)(sourceC + 4 * x);
+ float cD = *(float*)(sourceD + 4 * x);
+ float cE = *(float*)(sourceE + 4 * x);
+ float cF = *(float*)(sourceF + 4 * x);
+
+ c0 = c0 + c1;
+ c2 = c2 + c3;
+ c4 = c4 + c5;
+ c6 = c6 + c7;
+ c8 = c8 + c9;
+ cA = cA + cB;
+ cC = cC + cD;
+ cE = cE + cF;
+ c0 = c0 + c2;
+ c4 = c4 + c6;
+ c8 = c8 + cA;
+ cC = cC + cE;
+ c0 = c0 + c4;
+ c8 = c8 + cC;
+ c0 = c0 + c8;
+ c0 *= 1.0f / 16.0f;
+
+ *(float*)(source0 + 4 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+ }
+ }
+ else if(internal.format == FORMAT_R5G6B5)
+ {
+ #if defined(__i386__) || defined(__x86_64__)
+ if(CPUID::supportsSSE2() && (width % 8) == 0)
+ {
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 8)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+
+ static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+ static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+ __m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+
+ c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+ c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+ c1 = _mm_avg_epu16(c0__g_, c1__g_);
+ c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+ c0 = _mm_or_si128(c0, c1);
+
+ _mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 8)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+ __m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+ __m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+
+ static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+ static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+ __m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+
+ c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+ c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+ c0 = _mm_avg_epu8(c0, c2);
+ c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+ c1 = _mm_avg_epu16(c0__g_, c1__g_);
+ c3 = _mm_avg_epu16(c2__g_, c3__g_);
+ c1 = _mm_avg_epu16(c1, c3);
+ c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+ c0 = _mm_or_si128(c0, c1);
+
+ _mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 8)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+ __m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+ __m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+ __m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
+ __m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
+ __m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
+ __m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
+
+ static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+ static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+ __m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
+
+ c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+ c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+ c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
+ c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
+ c0 = _mm_avg_epu8(c0, c2);
+ c4 = _mm_avg_epu8(c4, c6);
+ c0 = _mm_avg_epu8(c0, c4);
+ c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+ c1 = _mm_avg_epu16(c0__g_, c1__g_);
+ c3 = _mm_avg_epu16(c2__g_, c3__g_);
+ c5 = _mm_avg_epu16(c4__g_, c5__g_);
+ c7 = _mm_avg_epu16(c6__g_, c7__g_);
+ c1 = _mm_avg_epu16(c1, c3);
+ c5 = _mm_avg_epu16(c5, c7);
+ c1 = _mm_avg_epu16(c1, c5);
+ c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+ c0 = _mm_or_si128(c0, c1);
+
+ _mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x += 8)
+ {
+ __m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+ __m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+ __m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+ __m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+ __m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
+ __m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
+ __m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
+ __m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
+ __m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
+ __m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
+ __m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
+ __m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
+ __m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
+ __m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
+ __m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
+ __m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
+
+ static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+ static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+ __m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
+ __m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
+ __m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
+ __m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
+ __m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
+ __m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
+ __m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
+ __m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
+ __m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
+ __m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
+ __m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
+ __m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
+ __m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
+ __m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
+ __m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
+
+ c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+ c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+ c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
+ c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
+ c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
+ cA = _mm_avg_epu8(cA_r_b, cB_r_b);
+ cC = _mm_avg_epu8(cC_r_b, cD_r_b);
+ cE = _mm_avg_epu8(cE_r_b, cF_r_b);
+ c0 = _mm_avg_epu8(c0, c2);
+ c4 = _mm_avg_epu8(c4, c6);
+ c8 = _mm_avg_epu8(c8, cA);
+ cC = _mm_avg_epu8(cC, cE);
+ c0 = _mm_avg_epu8(c0, c4);
+ c8 = _mm_avg_epu8(c8, cC);
+ c0 = _mm_avg_epu8(c0, c8);
+ c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+ c1 = _mm_avg_epu16(c0__g_, c1__g_);
+ c3 = _mm_avg_epu16(c2__g_, c3__g_);
+ c5 = _mm_avg_epu16(c4__g_, c5__g_);
+ c7 = _mm_avg_epu16(c6__g_, c7__g_);
+ c9 = _mm_avg_epu16(c8__g_, c9__g_);
+ cB = _mm_avg_epu16(cA__g_, cB__g_);
+ cD = _mm_avg_epu16(cC__g_, cD__g_);
+ cF = _mm_avg_epu16(cE__g_, cF__g_);
+ c1 = _mm_avg_epu8(c1, c3);
+ c5 = _mm_avg_epu8(c5, c7);
+ c9 = _mm_avg_epu8(c9, cB);
+ cD = _mm_avg_epu8(cD, cF);
+ c1 = _mm_avg_epu8(c1, c5);
+ c9 = _mm_avg_epu8(c9, cD);
+ c1 = _mm_avg_epu8(c1, c9);
+ c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+ c0 = _mm_or_si128(c0, c1);
+
+ _mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+ }
+ else
+ #endif
+ {
+ #define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
+
+ if(internal.samples == 2)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
+ unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
+
+ c0 = AVERAGE(c0, c1);
+
+ *(unsigned short*)(source0 + 2 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ }
+ }
+ else if(internal.samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
+ unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
+ unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
+ unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
+
+ c0 = AVERAGE(c0, c1);
+ c2 = AVERAGE(c2, c3);
+ c0 = AVERAGE(c0, c2);
+
+ *(unsigned short*)(source0 + 2 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ }
+ }
+ else if(internal.samples == 8)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
+ unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
+ unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
+ unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
+ unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
+ unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
+ unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
+ unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
+
+ c0 = AVERAGE(c0, c1);
+ c2 = AVERAGE(c2, c3);
+ c4 = AVERAGE(c4, c5);
+ c6 = AVERAGE(c6, c7);
+ c0 = AVERAGE(c0, c2);
+ c4 = AVERAGE(c4, c6);
+ c0 = AVERAGE(c0, c4);
+
+ *(unsigned short*)(source0 + 2 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ }
+ }
+ else if(internal.samples == 16)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
+ unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
+ unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
+ unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
+ unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
+ unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
+ unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
+ unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
+ unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
+ unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
+ unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
+ unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
+ unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
+ unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
+ unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
+ unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
+
+ c0 = AVERAGE(c0, c1);
+ c2 = AVERAGE(c2, c3);
+ c4 = AVERAGE(c4, c5);
+ c6 = AVERAGE(c6, c7);
+ c8 = AVERAGE(c8, c9);
+ cA = AVERAGE(cA, cB);
+ cC = AVERAGE(cC, cD);
+ cE = AVERAGE(cE, cF);
+ c0 = AVERAGE(c0, c2);
+ c4 = AVERAGE(c4, c6);
+ c8 = AVERAGE(c8, cA);
+ cC = AVERAGE(cC, cE);
+ c0 = AVERAGE(c0, c4);
+ c8 = AVERAGE(c8, cC);
+ c0 = AVERAGE(c0, c8);
+
+ *(unsigned short*)(source0 + 2 * x) = c0;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ source4 += pitch;
+ source5 += pitch;
+ source6 += pitch;
+ source7 += pitch;
+ source8 += pitch;
+ source9 += pitch;
+ sourceA += pitch;
+ sourceB += pitch;
+ sourceC += pitch;
+ sourceD += pitch;
+ sourceE += pitch;
+ sourceF += pitch;
+ }
+ }
+ else ASSERT(false);
+
+ #undef AVERAGE
+ }
+ }
+ else
+ {
+ // UNIMPLEMENTED();
+ }
+ }
+}
diff --git a/src/Device/Surface.hpp b/src/Device/Surface.hpp
new file mode 100644
index 0000000..10c5364
--- /dev/null
+++ b/src/Device/Surface.hpp
@@ -0,0 +1,665 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Surface_hpp
+#define sw_Surface_hpp
+
+#include "Color.hpp"
+#include "Main/Config.hpp"
+#include "Common/Resource.hpp"
+
+namespace sw
+{
+ class Resource;
+
+	template <typename T> struct RectT
+	{
+		T x0;   // Inclusive
+		T y0;   // Inclusive
+		T x1;   // Exclusive
+		T y1;   // Exclusive
+
+		// The default constructor does not assign the four coordinates.
+		RectT() {}
+		RectT(T x0i, T y0i, T x1i, T y1i) : x0(x0i), y0(y0i), x1(x1i), y1(y1i) {}
+
+		T width() const  { return x1 - x0; }   // Exclusive end minus inclusive start.
+		T height() const { return y1 - y0; }
+
+		// Passes every coordinate through clamp() with the bounds of its own axis.
+		void clip(T minX, T minY, T maxX, T maxY)
+		{
+			x0 = clamp(x0, minX, maxX);   x1 = clamp(x1, minX, maxX);
+			y0 = clamp(y0, minY, maxY);   y1 = clamp(y1, minY, maxY);
+		}
+	};
+
+ typedef RectT<int> Rect;
+ typedef RectT<float> RectF;
+
+	template<typename T> struct SliceRectT : public RectT<T>
+	{
+		SliceRectT() : slice(0) {}   // Coordinates are whatever RectT<T>'s default constructor leaves.
+		SliceRectT(const RectT<T>& rect) : SliceRectT(rect, 0) {}   // Implicit conversion from a plain rectangle; slice 0.
+		SliceRectT(const RectT<T>& rect, int s) : RectT<T>(rect), slice(s) {}
+		SliceRectT(T x0, T y0, T x1, T y1, int s) : RectT<T>(x0, y0, x1, y1), slice(s) {}
+		int slice;   // Integer carried alongside the rectangle; 0 unless a constructor is given another value.
+	};
+
+ typedef SliceRectT<int> SliceRect;
+ typedef SliceRectT<float> SliceRectF;
+
+ enum Format : unsigned char // Stored in one byte. Values are implicit, so inserting or reordering entries renumbers everything after them.
+ {
+ FORMAT_NULL, // First enumerator, so its value is 0.
+
+ FORMAT_A8,
+ FORMAT_R8I,
+ FORMAT_R8UI,
+ FORMAT_R8_SNORM,
+ FORMAT_R8,
+ FORMAT_R16I,
+ FORMAT_R16UI,
+ FORMAT_R32I,
+ FORMAT_R32UI,
+ FORMAT_R3G3B2,
+ FORMAT_A8R3G3B2,
+ FORMAT_X4R4G4B4,
+ FORMAT_A4R4G4B4,
+ FORMAT_R4G4B4A4,
+ FORMAT_R5G6B5,
+ FORMAT_R8G8B8,
+ FORMAT_B8G8R8,
+ FORMAT_X8R8G8B8,
+ FORMAT_A8R8G8B8,
+ FORMAT_X8B8G8R8I,
+ FORMAT_X8B8G8R8UI,
+ FORMAT_X8B8G8R8_SNORM,
+ FORMAT_X8B8G8R8,
+ FORMAT_A8B8G8R8I,
+ FORMAT_A8B8G8R8UI,
+ FORMAT_A8B8G8R8_SNORM,
+ FORMAT_A8B8G8R8,
+ FORMAT_SRGB8_X8,
+ FORMAT_SRGB8_A8,
+ FORMAT_X1R5G5B5,
+ FORMAT_A1R5G5B5,
+ FORMAT_R5G5B5A1,
+ FORMAT_G8R8I,
+ FORMAT_G8R8UI,
+ FORMAT_G8R8_SNORM,
+ FORMAT_G8R8,
+ FORMAT_G16R16,
+ FORMAT_G16R16I,
+ FORMAT_G16R16UI,
+ FORMAT_G32R32I,
+ FORMAT_G32R32UI,
+ FORMAT_A2R10G10B10,
+ FORMAT_A2B10G10R10,
+ FORMAT_A2B10G10R10UI,
+ FORMAT_A16B16G16R16,
+ FORMAT_X16B16G16R16I,
+ FORMAT_X16B16G16R16UI,
+ FORMAT_A16B16G16R16I,
+ FORMAT_A16B16G16R16UI,
+ FORMAT_X32B32G32R32I,
+ FORMAT_X32B32G32R32UI,
+ FORMAT_A32B32G32R32I,
+ FORMAT_A32B32G32R32UI,
+ // Paletted formats
+ FORMAT_P8,
+ FORMAT_A8P8,
+ // Compressed formats
+ FORMAT_DXT1,
+ FORMAT_DXT3,
+ FORMAT_DXT5,
+ FORMAT_ATI1,
+ FORMAT_ATI2,
+ FORMAT_ETC1,
+ FORMAT_R11_EAC,
+ FORMAT_SIGNED_R11_EAC,
+ FORMAT_RG11_EAC,
+ FORMAT_SIGNED_RG11_EAC,
+ FORMAT_RGB8_ETC2,
+ FORMAT_SRGB8_ETC2,
+ FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2,
+ FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2,
+ FORMAT_RGBA8_ETC2_EAC,
+ FORMAT_SRGB8_ALPHA8_ETC2_EAC,
+ FORMAT_RGBA_ASTC_4x4_KHR,
+ FORMAT_RGBA_ASTC_5x4_KHR,
+ FORMAT_RGBA_ASTC_5x5_KHR,
+ FORMAT_RGBA_ASTC_6x5_KHR,
+ FORMAT_RGBA_ASTC_6x6_KHR,
+ FORMAT_RGBA_ASTC_8x5_KHR,
+ FORMAT_RGBA_ASTC_8x6_KHR,
+ FORMAT_RGBA_ASTC_8x8_KHR,
+ FORMAT_RGBA_ASTC_10x5_KHR,
+ FORMAT_RGBA_ASTC_10x6_KHR,
+ FORMAT_RGBA_ASTC_10x8_KHR,
+ FORMAT_RGBA_ASTC_10x10_KHR,
+ FORMAT_RGBA_ASTC_12x10_KHR,
+ FORMAT_RGBA_ASTC_12x12_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR,
+ FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR,
+ // Floating-point formats
+ FORMAT_A16F,
+ FORMAT_R16F,
+ FORMAT_G16R16F,
+ FORMAT_B16G16R16F,
+ FORMAT_X16B16G16R16F,
+ FORMAT_A16B16G16R16F,
+ FORMAT_X16B16G16R16F_UNSIGNED,
+ FORMAT_A32F,
+ FORMAT_R32F,
+ FORMAT_G32R32F,
+ FORMAT_B32G32R32F,
+ FORMAT_X32B32G32R32F,
+ FORMAT_A32B32G32R32F,
+ FORMAT_X32B32G32R32F_UNSIGNED,
+ // Bump map formats
+ FORMAT_V8U8,
+ FORMAT_L6V5U5,
+ FORMAT_Q8W8V8U8,
+ FORMAT_X8L8V8U8,
+ FORMAT_A2W10V10U10,
+ FORMAT_V16U16,
+ FORMAT_A16W16V16U16,
+ FORMAT_Q16W16V16U16,
+ // Luminance formats
+ FORMAT_L8,
+ FORMAT_A4L4,
+ FORMAT_L16,
+ FORMAT_A8L8,
+ FORMAT_L16F,
+ FORMAT_A16L16F,
+ FORMAT_L32F,
+ FORMAT_A32L32F,
+ // Depth/stencil formats
+ FORMAT_D16,
+ FORMAT_D32,
+ FORMAT_D24X8,
+ FORMAT_D24S8,
+ FORMAT_D24FS8,
+ FORMAT_D32F, // Quad layout
+ FORMAT_D32FS8, // Quad layout
+ FORMAT_D32F_COMPLEMENTARY, // Quad layout, 1 - z
+ FORMAT_D32FS8_COMPLEMENTARY, // Quad layout, 1 - z
+ FORMAT_D32F_LOCKABLE, // Linear layout
+ FORMAT_D32FS8_TEXTURE, // Linear layout, no PCF
+ FORMAT_D32F_SHADOW, // Linear layout, PCF
+ FORMAT_D32FS8_SHADOW, // Linear layout, PCF
+ FORMAT_DF24S8,
+ FORMAT_DF16S8,
+ FORMAT_INTZ,
+ FORMAT_S8,
+ // Quad layout framebuffer
+ FORMAT_X8G8R8B8Q,
+ FORMAT_A8G8R8B8Q,
+ // YUV formats
+ FORMAT_YV12_BT601,
+ FORMAT_YV12_BT709,
+ FORMAT_YV12_JFIF, // Full-swing BT.601
+
+ FORMAT_LAST = FORMAT_YV12_JFIF // Alias of the final enumerator, not a distinct format; update it when appending new formats.
+ };
+
+ enum Lock // Passed to the Surface lock*() entry points and stored per Buffer in its 'lock' member.
+ {
+ LOCK_UNLOCKED, // First enumerator (value 0); the value Surface::isUnlocked() compares each buffer's lock against.
+ LOCK_READONLY,
+ LOCK_WRITEONLY,
+ LOCK_READWRITE,
+ LOCK_DISCARD,
+ LOCK_UPDATE // Write access which doesn't dirty the buffer, because it's being updated with the sibling's data.
+ };
+
+ class [[clang::lto_visibility_public]] Surface // Abstract (pure virtual destructor and lockInternal/unlockInternal); holds external, internal and stencil Buffers.
+ {
+ private:
+ struct Buffer // Description of one pixel buffer; every member is private and reachable only through the 'friend Surface' grant.
+ {
+ friend Surface;
+
+ private:
+ void write(int x, int y, int z, const Color<float> &color);
+ void write(int x, int y, const Color<float> &color);
+ void write(void *element, const Color<float> &color);
+ Color<float> read(int x, int y, int z) const;
+ Color<float> read(int x, int y) const;
+ Color<float> read(void *element) const;
+ Color<float> sample(float x, float y, float z) const;
+ Color<float> sample(float x, float y, int layer) const;
+
+ void *lockRect(int x, int y, int z, Lock lock);
+ void unlockRect();
+
+ void *buffer;
+ int width;
+ int height;
+ int depth;
+ short border;
+ short samples;
+
+ int bytes;
+ int pitchB; // Exposed through Surface::get*PitchB(); presumably the 'B' suffix means bytes - TODO confirm.
+ int pitchP; // Exposed through Surface::get*PitchP(); presumably the 'P' suffix means pixels - TODO confirm.
+ int sliceB;
+ int sliceP;
+
+ Format format;
+ AtomicInt lock; // Compared against LOCK_UNLOCKED by Surface::isUnlocked().
+
+ bool dirty; // Sibling internal/external buffer doesn't match.
+ };
+
+ protected:
+ Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice); // Constructors are protected; the public static create() overloads mirror their parameter lists.
+ Surface(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0);
+
+ public:
+ static Surface *create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice);
+ static Surface *create(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0);
+
+ virtual ~Surface() = 0; // Pure virtual destructor: Surface cannot be instantiated directly.
+
+ inline void *lock(int x, int y, int z, Lock lock, Accessor client, bool internal = false); // Dispatches to lockInternal() or lockExternal() according to 'internal'.
+ inline void unlock(bool internal = false); // Dispatches to unlockInternal() or unlockExternal().
+ inline int getWidth() const; // Width, height and depth are read from the external buffer.
+ inline int getHeight() const;
+ inline int getDepth() const;
+ inline int getBorder() const; // Read from the internal buffer.
+ inline Format getFormat(bool internal = false) const; // The getters taking 'internal' pick the internal or external variant below.
+ inline int getPitchB(bool internal = false) const;
+ inline int getPitchP(bool internal = false) const;
+ inline int getSliceB(bool internal = false) const;
+ inline int getSliceP(bool internal = false) const;
+
+ void *lockExternal(int x, int y, int z, Lock lock, Accessor client);
+ void unlockExternal();
+ inline Format getExternalFormat() const;
+ inline int getExternalPitchB() const;
+ inline int getExternalPitchP() const;
+ inline int getExternalSliceB() const;
+ inline int getExternalSliceP() const;
+
+ virtual void *lockInternal(int x, int y, int z, Lock lock, Accessor client) = 0; // Pure virtual: supplied by the concrete surface type.
+ virtual void unlockInternal() = 0;
+ inline Format getInternalFormat() const;
+ inline int getInternalPitchB() const;
+ inline int getInternalPitchP() const;
+ inline int getInternalSliceB() const;
+ inline int getInternalSliceP() const;
+
+ void *lockStencil(int x, int y, int front, Accessor client);
+ void unlockStencil();
+ inline Format getStencilFormat() const;
+ inline int getStencilPitchB() const;
+ inline int getStencilSliceB() const;
+
+ void sync(); // Wait for lock(s) to be released.
+ virtual bool requiresSync() const { return false; } // Base implementation always answers false; virtual so derived surfaces can override.
+ inline bool isUnlocked() const; // Only reliable after sync().
+
+ inline int getSamples() const; // internal.samples, unmodified.
+ inline int getMultiSampleCount() const; // internal.samples capped at 4.
+ inline int getSuperSampleCount() const; // internal.samples / 4 when samples exceeds 4, otherwise 1.
+
+ bool isEntire(const Rect& rect) const;
+ Rect getRect() const;
+ void clearDepth(float depth, int x0, int y0, int width, int height);
+ void clearStencil(unsigned char stencil, unsigned char mask, int x0, int y0, int width, int height);
+ void fill(const Color<float> &color, int x0, int y0, int width, int height);
+
+ Color<float> readExternal(int x, int y, int z) const;
+ Color<float> readExternal(int x, int y) const;
+ Color<float> sampleExternal(float x, float y, float z) const;
+ Color<float> sampleExternal(float x, float y) const;
+ void writeExternal(int x, int y, int z, const Color<float> &color);
+ void writeExternal(int x, int y, const Color<float> &color);
+
+ void copyInternal(const Surface* src, int x, int y, float srcX, float srcY, bool filter);
+ void copyInternal(const Surface* src, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter);
+
+ enum Edge { TOP, BOTTOM, RIGHT, LEFT };
+ void copyCubeEdge(Edge dstEdge, Surface *src, Edge srcEdge);
+ void computeCubeCorner(int x0, int y0, int x1, int y1);
+
+ bool hasStencil() const;
+ bool hasDepth() const;
+ bool hasPalette() const;
+ bool isRenderTarget() const;
+
+ bool hasDirtyContents() const;
+ void markContentsClean();
+ inline bool isExternalDirty() const; // True only when an external buffer exists, differs from the internal one, and is flagged dirty.
+ Resource *getResource();
+
+ static int bytes(Format format); // The static helpers below take the Format explicitly and need no Surface instance.
+ static int pitchB(int width, int border, Format format, bool target);
+ static int pitchP(int width, int border, Format format, bool target);
+ static int sliceB(int width, int height, int border, Format format, bool target);
+ static int sliceP(int width, int height, int border, Format format, bool target);
+ static size_t size(int width, int height, int depth, int border, int samples, Format format);
+
+ static bool isStencil(Format format);
+ static bool isDepth(Format format);
+ static bool hasQuadLayout(Format format);
+ static bool isPalette(Format format);
+
+ static bool isFloatFormat(Format format);
+ static bool isUnsignedComponent(Format format, int component);
+ static bool isSRGBreadable(Format format);
+ static bool isSRGBwritable(Format format);
+ static bool isSRGBformat(Format format);
+ static bool isCompressed(Format format);
+ static bool isSignedNonNormalizedInteger(Format format);
+ static bool isUnsignedNonNormalizedInteger(Format format);
+ static bool isNonNormalizedInteger(Format format);
+ static bool isNormalizedInteger(Format format);
+ static int componentCount(Format format);
+
+ static void setTexturePalette(unsigned int *palette);
+
+ private:
+ sw::Resource *resource;
+
+ typedef unsigned char byte; // Fixed-width aliases used by the compressed block structs below.
+ typedef unsigned short word;
+ typedef unsigned int dword;
+ typedef uint64_t qword;
+
+ struct DXT1 // Two 16-bit values followed by a 32-bit lookup table.
+ {
+ word c0;
+ word c1;
+ dword lut;
+ };
+
+ struct DXT3 // A 64-bit 'a' field followed by the same three fields as DXT1.
+ {
+ qword a;
+
+ word c0;
+ word c1;
+ dword lut;
+ };
+
+ struct DXT5
+ {
+ union // a0/a1 overlay the first two bytes of alut.
+ {
+ struct
+ {
+ byte a0;
+ byte a1;
+ };
+
+ qword alut; // Skip first 16 bit
+ };
+
+ word c0;
+ word c1;
+ dword clut;
+ };
+
+ struct ATI2 // Two unions of the same shape, one for 'y' and one for 'x'.
+ {
+ union
+ {
+ struct
+ {
+ byte y0;
+ byte y1;
+ };
+
+ qword ylut; // Skip first 16 bit
+ };
+
+ union
+ {
+ struct
+ {
+ byte x0;
+ byte x1;
+ };
+
+ qword xlut; // Skip first 16 bit
+ };
+ };
+
+ struct ATI1 // A single union of the same shape as each half of ATI2.
+ {
+ union
+ {
+ struct
+ {
+ byte r0;
+ byte r1;
+ };
+
+ qword rlut; // Skip first 16 bit
+ };
+ };
+
+ static void decodeR8G8B8(Buffer &destination, Buffer &source);
+ static void decodeX1R5G5B5(Buffer &destination, Buffer &source);
+ static void decodeA1R5G5B5(Buffer &destination, Buffer &source);
+ static void decodeX4R4G4B4(Buffer &destination, Buffer &source);
+ static void decodeA4R4G4B4(Buffer &destination, Buffer &source);
+ static void decodeP8(Buffer &destination, Buffer &source);
+
+ static void decodeDXT1(Buffer &internal, Buffer &external);
+ static void decodeDXT3(Buffer &internal, Buffer &external);
+ static void decodeDXT5(Buffer &internal, Buffer &external);
+ static void decodeATI1(Buffer &internal, Buffer &external);
+ static void decodeATI2(Buffer &internal, Buffer &external);
+ static void decodeEAC(Buffer &internal, Buffer &external, int nbChannels, bool isSigned);
+ static void decodeETC2(Buffer &internal, Buffer &external, int nbAlphaBits, bool isSRGB);
+ static void decodeASTC(Buffer &internal, Buffer &external, int xSize, int ySize, int zSize, bool isSRGB);
+
+ static void update(Buffer &destination, Buffer &source);
+ static void genericUpdate(Buffer &destination, Buffer &source);
+ static void *allocateBuffer(int width, int height, int depth, int border, int samples, Format format);
+ static void memfill4(void *buffer, int pattern, int bytes);
+
+ bool identicalBuffers() const;
+ Format selectInternalFormat(Format format) const;
+
+ void resolve();
+
+ Buffer external; // Source of getWidth()/getHeight()/getDepth() and the getExternal*() accessors.
+ Buffer internal; // Source of getBorder(), the sample-count getters and the getInternal*() accessors.
+ Buffer stencil; // Source of the getStencil*() accessors.
+
+ const bool lockable;
+ const bool renderTarget;
+
+ bool dirtyContents; // Sibling surfaces need updating (mipmaps / cube borders).
+ unsigned int paletteUsed;
+
+ static unsigned int *palette; // FIXME: Not multi-device safe
+ static unsigned int paletteID;
+
+ bool hasParent;
+ bool ownExternal;
+ };
+}
+
+#undef min
+#undef max
+
+namespace sw // Out-of-class bodies for the Surface members declared 'inline' in the class above.
+{
+ void *Surface::lock(int x, int y, int z, Lock lock, Accessor client, bool internal) // Forwards to the internal or external lock routine.
+ {
+ return internal ? lockInternal(x, y, z, lock, client) : lockExternal(x, y, z, lock, client);
+ }
+
+ void Surface::unlock(bool internal) // Forwards to the matching unlock routine.
+ {
+ return internal ? unlockInternal() : unlockExternal(); // Both operands are void, so 'return' of the conditional is well-formed.
+ }
+
+ int Surface::getWidth() const // Dimensions come from the external buffer.
+ {
+ return external.width;
+ }
+
+ int Surface::getHeight() const
+ {
+ return external.height;
+ }
+
+ int Surface::getDepth() const
+ {
+ return external.depth;
+ }
+
+ int Surface::getBorder() const // Unlike the dimensions, the border is read from the internal buffer.
+ {
+ return internal.border;
+ }
+
+ Format Surface::getFormat(bool internal) const // The five selectors below choose between the Internal and External getter.
+ {
+ return internal ? getInternalFormat() : getExternalFormat();
+ }
+
+ int Surface::getPitchB(bool internal) const
+ {
+ return internal ? getInternalPitchB() : getExternalPitchB();
+ }
+
+ int Surface::getPitchP(bool internal) const
+ {
+ return internal ? getInternalPitchP() : getExternalPitchP();
+ }
+
+ int Surface::getSliceB(bool internal) const
+ {
+ return internal ? getInternalSliceB() : getExternalSliceB();
+ }
+
+ int Surface::getSliceP(bool internal) const
+ {
+ return internal ? getInternalSliceP() : getExternalSliceP();
+ }
+
+ Format Surface::getExternalFormat() const // Plain field reads of the external buffer.
+ {
+ return external.format;
+ }
+
+ int Surface::getExternalPitchB() const
+ {
+ return external.pitchB;
+ }
+
+ int Surface::getExternalPitchP() const
+ {
+ return external.pitchP;
+ }
+
+ int Surface::getExternalSliceB() const
+ {
+ return external.sliceB;
+ }
+
+ int Surface::getExternalSliceP() const
+ {
+ return external.sliceP;
+ }
+
+ Format Surface::getInternalFormat() const // Plain field reads of the internal buffer.
+ {
+ return internal.format;
+ }
+
+ int Surface::getInternalPitchB() const
+ {
+ return internal.pitchB;
+ }
+
+ int Surface::getInternalPitchP() const
+ {
+ return internal.pitchP;
+ }
+
+ int Surface::getInternalSliceB() const
+ {
+ return internal.sliceB;
+ }
+
+ int Surface::getInternalSliceP() const
+ {
+ return internal.sliceP;
+ }
+
+ Format Surface::getStencilFormat() const // Plain field reads of the stencil buffer.
+ {
+ return stencil.format;
+ }
+
+ int Surface::getStencilPitchB() const
+ {
+ return stencil.pitchB;
+ }
+
+ int Surface::getStencilSliceB() const
+ {
+ return stencil.sliceB;
+ }
+
+ int Surface::getSamples() const // Raw sample count of the internal buffer, widened from short.
+ {
+ return internal.samples;
+ }
+
+ int Surface::getMultiSampleCount() const // Sample count capped at 4.
+ {
+ return sw::min((int)internal.samples, 4); // Qualified sw::min; any min/max macros were #undef'd just above this namespace.
+ }
+
+ int Surface::getSuperSampleCount() const // samples / 4 (integer division) above 4 samples, otherwise 1.
+ {
+ return internal.samples > 4 ? internal.samples / 4 : 1;
+ }
+
+ bool Surface::isUnlocked() const // True only when all three buffers report LOCK_UNLOCKED.
+ {
+ return external.lock == LOCK_UNLOCKED &&
+ internal.lock == LOCK_UNLOCKED &&
+ stencil.lock == LOCK_UNLOCKED;
+ }
+
+ bool Surface::isExternalDirty() const // Requires a non-null external buffer that is distinct from the internal one and flagged dirty.
+ {
+ return external.buffer && external.buffer != internal.buffer && external.dirty;
+ }
+}
+
+#endif // sw_Surface_hpp
diff --git a/src/Device/SwiftConfig.cpp b/src/Device/SwiftConfig.cpp
new file mode 100644
index 0000000..1c22394
--- /dev/null
+++ b/src/Device/SwiftConfig.cpp
@@ -0,0 +1,822 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "SwiftConfig.hpp"
+
+#include "Config.hpp"
+#include "Common/Configurator.hpp"
+#include "Common/Debug.hpp"
+#include "Common/Version.h"
+
+#include <sstream>
+#include <stdio.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <algorithm>
+
+namespace sw
+{
+ extern Profiler profiler;
+
+	std::string itoa(int number)
+	{
+		std::ostringstream formatted;   // Stream insertion produces the integer's text form.
+		formatted << number;
+		return formatted.str();
+	}
+
+	std::string ftoa(double number)
+	{
+		std::ostringstream formatted;   // Uses the stream's default floating-point formatting.
+		formatted << number;
+		return formatted.str();
+	}
+
+ SwiftConfig::SwiftConfig(bool disableServerOverride) : listenSocket(0) // Loads the configuration and, unless it disables the server, starts it.
+ {
+ readConfiguration(disableServerOverride);
+
+ if(!disableServerOverride) // The configuration is written back only when no override was requested.
+ {
+ writeConfiguration();
+ }
+
+ receiveBuffer = 0; // A null buffer is what destroyServer() tests to decide whether a server exists.
+
+ if(!config.disableServer)
+ {
+ createServer(); // Allocates receiveBuffer, opens the listening socket and starts the server thread.
+ }
+ }
+
+ SwiftConfig::~SwiftConfig()
+ {
+ destroyServer(); // Does nothing when receiveBuffer is null, i.e. when no server is currently set up.
+ }
+
+ void SwiftConfig::createServer() // Sets up everything destroyServer() later tears down.
+ {
+ bufferLength = 16 * 1024; // 16384-byte receive buffer.
+ receiveBuffer = new char[bufferLength];
+
+ Socket::startup();
+ listenSocket = new Socket("localhost", "8080"); // Host and port are hard-coded.
+ listenSocket->listen();
+
+ terminate = false; // Cleared before the thread is created; serverLoop() polls this flag.
+ serverThread = new Thread(serverRoutine, this); // serverRoutine casts its parameter back to SwiftConfig*, so 'this' is presumably forwarded to it.
+ }
+
+	void SwiftConfig::destroyServer()
+	{
+		if(!receiveBuffer)   // No buffer means no server is running: nothing to tear down.
+		{
+			return;
+		}
+
+		terminate = true;   // Ask serverLoop() to exit, then wait for the thread before freeing what it uses.
+		serverThread->join();
+		delete serverThread;
+		delete listenSocket;
+		listenSocket = 0;
+
+		Socket::cleanup();
+		delete[] receiveBuffer;
+		receiveBuffer = 0;   // Makes a repeated call fall into the guard above.
+	}
+
+	bool SwiftConfig::hasNewConfiguration(bool reset)
+	{
+		const bool pending = newConfig;   // Snapshot taken before the flag is optionally cleared.
+		if(!reset)
+		{
+			return pending;   // Query only: the flag is left untouched.
+		}
+
+		newConfig = false;
+		return pending;
+	}
+
+ void SwiftConfig::getConfiguration(Configuration &configuration) // Copies the current settings into the caller's object.
+ {
+ criticalSection.lock(); // Same lock respond() holds while it applies a POSTed configuration.
+ configuration = config;
+ criticalSection.unlock();
+ }
+
+	void SwiftConfig::serverRoutine(void *parameters)
+	{
+		// Thread entry point: 'parameters' is the SwiftConfig instance passed by createServer().
+		SwiftConfig *self = static_cast<SwiftConfig*>(parameters);
+		self->serverLoop();
+	}
+
+	void SwiftConfig::serverLoop()   // Body of the server thread (entered via serverRoutine); runs until 'terminate' is set.
+	{
+		readConfiguration();
+
+		while(!terminate)
+		{
+			if(listenSocket->select(100000))   // Bounded wait (presumably a timeout - TODO confirm units) so 'terminate' is re-checked.
+			{
+				Socket *clientSocket = listenSocket->accept();
+				int bytesReceived = 1;   // Non-zero so the receive loop below is entered at least once.
+
+				while(bytesReceived > 0 && !terminate)
+				{
+					if(clientSocket->select(10))
+					{
+						// Reserve the last byte: a completely full read must still leave room for the terminator written below.
+						bytesReceived = clientSocket->receive(receiveBuffer, bufferLength - 1);
+						if(bytesReceived > 0)
+						{
+							receiveBuffer[bytesReceived] = 0;   // Index is at most bufferLength - 1, i.e. inside the buffer.
+
+							respond(clientSocket, receiveBuffer);
+						}
+					}
+				}
+
+				delete clientSocket;
+			}
+		}
+	}
+
+	bool match(const char **url, const char *string)
+	{
+		const size_t prefixLength = strlen(string);
+		const bool hasPrefix = (strncmp(*url, string, prefixLength) == 0);
+
+		if(!hasPrefix)
+		{
+			return false;   // *url is left where it was.
+		}
+
+		*url += prefixLength;   // Consume the matched prefix so the next match() continues after it.
+		return true;
+	}
+
+	void SwiftConfig::respond(Socket *clientSocket, const char *request)   // Routes one request; every path ends in exactly one send().
+	{
+		if(match(&request, "GET /"))   // Each successful match() advances 'request' past the matched text.
+		{
+			if(match(&request, "swiftshader") || match(&request, "swiftconfig"))
+			{
+				if(match(&request, " ") || match(&request, "/ "))
+				{
+					return send(clientSocket, OK, page());
+				}
+			}
+		}
+		else if(match(&request, "POST /"))
+		{
+			if(match(&request, "swiftshader") || match(&request, "swiftconfig"))
+			{
+				if(match(&request, " ") || match(&request, "/ "))
+				{
+					criticalSection.lock();   // Held while 'config' is modified; getConfiguration() takes the same lock.
+
+					const char *postData = strstr(request, "\r\n\r\n");   // A blank line separates the headers from the body.
+					postData = postData ? postData + 4 : 0;
+
+					if(postData && *postData != '\0')   // Non-empty body arrived in the same packet.
+					{
+						parsePost(postData);
+					}
+					else   // POST data in next packet
+					{
+						// Reserve the last byte so the terminator below can never land one past the end of the buffer.
+						int bytesReceived = clientSocket->receive(receiveBuffer, bufferLength - 1);
+						if(bytesReceived > 0)
+						{
+							receiveBuffer[bytesReceived] = 0;
+							parsePost(receiveBuffer);
+						}
+					}
+
+					writeConfiguration();
+					newConfig = true;   // Reported to callers by hasNewConfiguration().
+
+					if(config.disableServer)
+					{
+						destroyServer();   // NOTE(review): respond() is called from serverLoop(), so this joins the server thread from itself and frees receiveBuffer - confirm this is safe.
+					}
+
+					criticalSection.unlock();
+
+					return send(clientSocket, OK, page());
+				}
+				else if(match(&request, "/profile "))
+				{
+					return send(clientSocket, OK, profile());
+				}
+			}
+		}
+
+		return send(clientSocket, NotFound);   // Anything unrecognised falls through to here.
+	}
+
+ std::string SwiftConfig::page()
+ {
+ std::string html;
+
+ const std::string selected = "selected='selected'";
+ const std::string checked = "checked='checked'";
+ const std::string empty = "";
+
+ html += "<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01//EN' 'http://www.w3.org/TR/html4/strict.dtd'>\n";
+ html += "<html>\n";
+ html += "<head>\n";
+ html += "<meta http-equiv='content-type' content='text/html; charset=UTF-8'>\n";
+ html += "<title>SwiftShader Configuration Panel</title>\n";
+ html += "</head>\n";
+ html += "<body>\n";
+ html += "<script type='text/javascript'>\n";
+ html += "request();\n";
+ html += "function request()\n";
+ html += "{\n";
+ html += "var xhr = new XMLHttpRequest();\n";
+ html += "xhr.open('POST', '/swiftshader/profile', true);\n";
+ html += "xhr.onreadystatechange = function()\n";
+ html += "{\n";
+ html += "if(xhr.readyState == 4 && xhr.status == 200)\n";
+ html += "{\n";
+ html += "document.getElementById('profile').innerHTML = xhr.responseText;\n";
+ html += "setTimeout('request()', 1000);\n";
+ html += "}\n";
+ html += "}\n";
+ html += "xhr.send();\n";
+ html += "}\n";
+ html += "</script>\n";
+ html += "<form method='POST' action=''>\n";
+ html += "<h1>SwiftShader Configuration Panel</h1>\n";
+ html += "<div id='profile'>" + profile() + "</div>\n";
+ html += "<hr><p>\n";
+ html += "<input type='submit' value='Apply changes' title='Click to apply all settings.'>\n";
+ // html += "<input type='reset' value='Reset changes' title='Click to reset your changes to the previous value.'>\n";
+ html += "</p><hr>\n";
+ html += "<h2><em>Device capabilities</em></h2>\n";
+ html += "<table>\n";
+ html += "<tr><td>Build revision:</td><td>" REVISION_STRING "</td></tr>\n";
+ html += "<tr><td>Pixel shader model:</td><td><select name='pixelShaderVersion' title='The highest version of pixel shader supported by SwiftShader. Lower versions might be faster if supported by the application. Only effective after restarting the application.'>\n";
+ html += "<option value='0'" + (config.pixelShaderVersion == 0 ? selected : empty) + ">0.0</option>\n";
+ html += "<option value='11'" + (config.pixelShaderVersion == 11 ? selected : empty) + ">1.1</option>\n";
+ html += "<option value='12'" + (config.pixelShaderVersion == 12 ? selected : empty) + ">1.2</option>\n";
+ html += "<option value='13'" + (config.pixelShaderVersion == 13 ? selected : empty) + ">1.3</option>\n";
+ html += "<option value='14'" + (config.pixelShaderVersion == 14 ? selected : empty) + ">1.4</option>\n";
+ html += "<option value='20'" + (config.pixelShaderVersion == 20 ? selected : empty) + ">2.0</option>\n";
+ html += "<option value='21'" + (config.pixelShaderVersion == 21 ? selected : empty) + ">2.x</option>\n";
+ html += "<option value='30'" + (config.pixelShaderVersion == 30 ? selected : empty) + ">3.0 (default)</option>\n";
+ html += "</select></td></tr>\n";
+ html += "<tr><td>Vertex shader model:</td><td><select name='vertexShaderVersion' title='The highest version of vertex shader supported by SwiftShader. Lower versions might be faster if supported by the application. Only effective after restarting the application.'>\n";
+ html += "<option value='0'" + (config.vertexShaderVersion == 0 ? selected : empty) + ">0.0</option>\n";
+ html += "<option value='11'" + (config.vertexShaderVersion == 11 ? selected : empty) + ">1.1</option>\n";
+ html += "<option value='20'" + (config.vertexShaderVersion == 20 ? selected : empty) + ">2.0</option>\n";
+ html += "<option value='21'" + (config.vertexShaderVersion == 21 ? selected : empty) + ">2.x</option>\n";
+ html += "<option value='30'" + (config.vertexShaderVersion == 30 ? selected : empty) + ">3.0 (default)</option>\n";
+ html += "</select></td></tr>\n";
+ html += "<tr><td>Texture memory:</td><td><select name='textureMemory' title='The maximum amount of memory used for textures and other resources.'>\n";
+ html += "<option value='128'" + (config.textureMemory == 128 ? selected : empty) + ">128 MB</option>\n";
+ html += "<option value='256'" + (config.textureMemory == 256 ? selected : empty) + ">256 MB (default)</option>\n";
+ html += "<option value='512'" + (config.textureMemory == 512 ? selected : empty) + ">512 MB</option>\n";
+ html += "<option value='1024'" + (config.textureMemory == 1024 ? selected : empty) + ">1024 MB</option>\n";
+ html += "<option value='2048'" + (config.textureMemory == 2048 ? selected : empty) + ">2048 MB</option>\n";
+ html += "</select></td></tr>\n";
+ html += "<tr><td>Device identifier:</td><td><select name='identifier' title='The information used by some applications to determine device capabilities.'>\n";
+ html += "<option value='0'" + (config.identifier == 0 ? selected : empty) + ">Google SwiftShader (default)</option>\n";
+ html += "<option value='1'" + (config.identifier == 1 ? selected : empty) + ">NVIDIA GeForce 7900 GS</option>\n";
+ html += "<option value='2'" + (config.identifier == 2 ? selected : empty) + ">ATI Mobility Radeon X1600</option>\n";
+ html += "<option value='3'" + (config.identifier == 3 ? selected : empty) + ">Intel GMA X3100</option>\n";
+ html += "<option value='4'" + (config.identifier == 4 ? selected : empty) + ">System device</option>\n";
+ html += "</select></td></tr>\n";
+ html += "</table>\n";
+ html += "<h2><em>Cache sizes</em></h2>\n";
+ html += "<table>\n";
+ html += "<tr><td>Vertex routine cache size:</td><td><select name='vertexRoutineCacheSize' title='The number of dynamically generated vertex processing routines being cached for reuse. Lower numbers save memory but require more routines to be regenerated.'>\n";
+ html += "<option value='64'" + (config.vertexRoutineCacheSize == 64 ? selected : empty) + ">64</option>\n";
+ html += "<option value='128'" + (config.vertexRoutineCacheSize == 128 ? selected : empty) + ">128</option>\n";
+ html += "<option value='256'" + (config.vertexRoutineCacheSize == 256 ? selected : empty) + ">256</option>\n";
+ html += "<option value='512'" + (config.vertexRoutineCacheSize == 512 ? selected : empty) + ">512</option>\n";
+ html += "<option value='1024'" + (config.vertexRoutineCacheSize == 1024 ? selected : empty) + ">1024 (default)</option>\n";
+ html += "<option value='2048'" + (config.vertexRoutineCacheSize == 2048 ? selected : empty) + ">2048</option>\n";
+ html += "<option value='4096'" + (config.vertexRoutineCacheSize == 4096 ? selected : empty) + ">4096</option>\n";
+ html += "</select></td>\n";
+ html += "</tr>\n";
+ html += "<tr><td>Pixel routine cache size:</td><td><select name='pixelRoutineCacheSize' title='The number of dynamically generated pixel processing routines being cached for reuse. Lower numbers save memory but require more routines to be regenerated.'>\n";
+ html += "<option value='64'" + (config.pixelRoutineCacheSize == 64 ? selected : empty) + ">64</option>\n";
+ html += "<option value='128'" + (config.pixelRoutineCacheSize == 128 ? selected : empty) + ">128</option>\n";
+ html += "<option value='256'" + (config.pixelRoutineCacheSize == 256 ? selected : empty) + ">256</option>\n";
+ html += "<option value='512'" + (config.pixelRoutineCacheSize == 512 ? selected : empty) + ">512</option>\n";
+ html += "<option value='1024'" + (config.pixelRoutineCacheSize == 1024 ? selected : empty) + ">1024 (default)</option>\n";
+ html += "<option value='2048'" + (config.pixelRoutineCacheSize == 2048 ? selected : empty) + ">2048</option>\n";
+ html += "<option value='4096'" + (config.pixelRoutineCacheSize == 4096 ? selected : empty) + ">4096</option>\n";
+ html += "</select></td>\n";
+ html += "</tr>\n";
+ html += "<tr><td>Setup routine cache size:</td><td><select name='setupRoutineCacheSize' title='The number of dynamically generated primitive setup routines being cached for reuse. Lower numbers save memory but require more routines to be regenerated.'>\n";
+ html += "<option value='64'" + (config.setupRoutineCacheSize == 64 ? selected : empty) + ">64</option>\n";
+ html += "<option value='128'" + (config.setupRoutineCacheSize == 128 ? selected : empty) + ">128</option>\n";
+ html += "<option value='256'" + (config.setupRoutineCacheSize == 256 ? selected : empty) + ">256</option>\n";
+ html += "<option value='512'" + (config.setupRoutineCacheSize == 512 ? selected : empty) + ">512</option>\n";
+ html += "<option value='1024'" + (config.setupRoutineCacheSize == 1024 ? selected : empty) + ">1024 (default)</option>\n";
+ html += "<option value='2048'" + (config.setupRoutineCacheSize == 2048 ? selected : empty) + ">2048</option>\n";
+ html += "<option value='4096'" + (config.setupRoutineCacheSize == 4096 ? selected : empty) + ">4096</option>\n";
+ html += "</select></td>\n";
+ html += "</tr>\n";
+ html += "<tr><td>Vertex cache size:</td><td><select name='vertexCacheSize' title='The number of processed vertices being cached for reuse. Lower numbers save memory but require more vertices to be reprocessed.'>\n";
+ html += "<option value='64'" + (config.vertexCacheSize == 64 ? selected : empty) + ">64 (default)</option>\n";
+ html += "</select></td>\n";
+ html += "</tr>\n";
+ html += "</table>\n";
+ html += "<h2><em>Quality</em></h2>\n";
+ html += "<table>\n";
+ html += "<tr><td>Maximum texture sampling quality:</td><td><select name='textureSampleQuality' title='The maximum texture filtering quality. Lower settings can be faster but cause visual artifacts.'>\n";
+ html += "<option value='0'" + (config.textureSampleQuality == 0 ? selected : empty) + ">Point</option>\n";
+ html += "<option value='1'" + (config.textureSampleQuality == 1 ? selected : empty) + ">Linear</option>\n";
+ html += "<option value='2'" + (config.textureSampleQuality == 2 ? selected : empty) + ">Anisotropic (default)</option>\n";
+ html += "</select></td>\n";
+ html += "</tr>\n";
+ html += "<tr><td>Maximum mipmapping quality:</td><td><select name='mipmapQuality' title='The maximum mipmap filtering quality. Higher settings can be more visually appealing but are slower.'>\n";
+ html += "<option value='0'" + (config.mipmapQuality == 0 ? selected : empty) + ">Point</option>\n";
+ html += "<option value='1'" + (config.mipmapQuality == 1 ? selected : empty) + ">Linear (default)</option>\n";
+ html += "</select></td>\n";
+ html += "</tr>\n";
+ html += "<tr><td>Perspective correction:</td><td><select name='perspectiveCorrection' title='Enables or disables perspective correction. Disabling it is faster but can causes distortion. Recommended for 2D applications only.'>\n";
+ html += "<option value='0'" + (config.perspectiveCorrection == 0 ? selected : empty) + ">Off</option>\n";
+ html += "<option value='1'" + (config.perspectiveCorrection == 1 ? selected : empty) + ">On (default)</option>\n";
+ html += "</select></td>\n";
+ html += "</tr>\n";
+ html += "<tr><td>Transcendental function precision:</td><td><select name='transcendentalPrecision' title='The precision at which log/exp/pow/rcp/rsq/nrm shader instructions are computed. Lower settings can be faster but cause visual artifacts.'>\n";
+ html += "<option value='0'" + (config.transcendentalPrecision == 0 ? selected : empty) + ">Approximate</option>\n";
+ html += "<option value='1'" + (config.transcendentalPrecision == 1 ? selected : empty) + ">Partial</option>\n";
+ html += "<option value='2'" + (config.transcendentalPrecision == 2 ? selected : empty) + ">Accurate (default)</option>\n";
+ html += "<option value='3'" + (config.transcendentalPrecision == 3 ? selected : empty) + ">WHQL</option>\n";
+ html += "<option value='4'" + (config.transcendentalPrecision == 4 ? selected : empty) + ">IEEE</option>\n";
+ html += "</select></td>\n";
+ html += "</tr>\n";
+ html += "<tr><td>Transparency anti-aliasing:</td><td><select name='transparencyAntialiasing' title='The technique used to anti-alias alpha-tested transparent textures.'>\n";
+ html += "<option value='0'" + (config.transparencyAntialiasing == 0 ? selected : empty) + ">None (default)</option>\n";
+ html += "<option value='1'" + (config.transparencyAntialiasing == 1 ? selected : empty) + ">Alpha-to-Coverage</option>\n";
+ html += "</select></td>\n";
+ html += "</table>\n";
+ html += "<h2><em>Processor settings</em></h2>\n";
+ html += "<table>\n";
+ html += "<tr><td>Number of threads:</td><td><select name='threadCount' title='The number of rendering threads to be used.'>\n";
+ html += "<option value='-1'" + (config.threadCount == -1 ? selected : empty) + ">Core count</option>\n";
+ html += "<option value='0'" + (config.threadCount == 0 ? selected : empty) + ">Process affinity (default)</option>\n";
+ html += "<option value='1'" + (config.threadCount == 1 ? selected : empty) + ">1</option>\n";
+ html += "<option value='2'" + (config.threadCount == 2 ? selected : empty) + ">2</option>\n";
+ html += "<option value='3'" + (config.threadCount == 3 ? selected : empty) + ">3</option>\n";
+ html += "<option value='4'" + (config.threadCount == 4 ? selected : empty) + ">4</option>\n";
+ html += "<option value='5'" + (config.threadCount == 5 ? selected : empty) + ">5</option>\n";
+ html += "<option value='6'" + (config.threadCount == 6 ? selected : empty) + ">6</option>\n";
+ html += "<option value='7'" + (config.threadCount == 7 ? selected : empty) + ">7</option>\n";
+ html += "<option value='8'" + (config.threadCount == 8 ? selected : empty) + ">8</option>\n";
+ html += "<option value='9'" + (config.threadCount == 9 ? selected : empty) + ">9</option>\n";
+ html += "<option value='10'" + (config.threadCount == 10 ? selected : empty) + ">10</option>\n";
+ html += "<option value='11'" + (config.threadCount == 11 ? selected : empty) + ">11</option>\n";
+ html += "<option value='12'" + (config.threadCount == 12 ? selected : empty) + ">12</option>\n";
+ html += "<option value='13'" + (config.threadCount == 13 ? selected : empty) + ">13</option>\n";
+ html += "<option value='14'" + (config.threadCount == 14 ? selected : empty) + ">14</option>\n";
+ html += "<option value='15'" + (config.threadCount == 15 ? selected : empty) + ">15</option>\n";
+ html += "<option value='16'" + (config.threadCount == 16 ? selected : empty) + ">16</option>\n";
+ html += "</select></td></tr>\n";
+ html += "<tr><td>Enable SSE:</td><td><input name = 'enableSSE' type='checkbox'" + (config.enableSSE ? checked : empty) + " disabled='disabled' title='If checked enables the use of SSE instruction set extentions if supported by the CPU.'></td></tr>";
+ html += "<tr><td>Enable SSE2:</td><td><input name = 'enableSSE2' type='checkbox'" + (config.enableSSE2 ? checked : empty) + " title='If checked enables the use of SSE2 instruction set extentions if supported by the CPU.'></td></tr>";
+ html += "<tr><td>Enable SSE3:</td><td><input name = 'enableSSE3' type='checkbox'" + (config.enableSSE3 ? checked : empty) + " title='If checked enables the use of SSE3 instruction set extentions if supported by the CPU.'></td></tr>";
+ html += "<tr><td>Enable SSSE3:</td><td><input name = 'enableSSSE3' type='checkbox'" + (config.enableSSSE3 ? checked : empty) + " title='If checked enables the use of SSSE3 instruction set extentions if supported by the CPU.'></td></tr>";
+ html += "<tr><td>Enable SSE4.1:</td><td><input name = 'enableSSE4_1' type='checkbox'" + (config.enableSSE4_1 ? checked : empty) + " title='If checked enables the use of SSE4.1 instruction set extentions if supported by the CPU.'></td></tr>";
+ html += "</table>\n";
+ html += "<h2><em>Compiler optimizations</em></h2>\n";
+ html += "<table>\n";
+
+ for(int pass = 0; pass < 10; pass++)
+ {
+ html += "<tr><td>Optimization pass " + itoa(pass + 1) + ":</td><td><select name='optimization" + itoa(pass + 1) + "' title='An optimization pass for the shader compiler.'>\n";
+ html += "<option value='0'" + (config.optimization[pass] == 0 ? selected : empty) + ">Disabled" + (pass > 0 ? " (default)" : "") + "</option>\n";
+ html += "<option value='1'" + (config.optimization[pass] == 1 ? selected : empty) + ">Instruction Combining" + (pass == 0 ? " (default)" : "") + "</option>\n";
+ html += "<option value='2'" + (config.optimization[pass] == 2 ? selected : empty) + ">Control Flow Simplification</option>\n";
+ html += "<option value='3'" + (config.optimization[pass] == 3 ? selected : empty) + ">Loop Invariant Code Motion</option>\n";
+ html += "<option value='4'" + (config.optimization[pass] == 4 ? selected : empty) + ">Aggressive Dead Code Elimination</option>\n";
+ html += "<option value='5'" + (config.optimization[pass] == 5 ? selected : empty) + ">Global Value Numbering</option>\n";
+ html += "<option value='6'" + (config.optimization[pass] == 6 ? selected : empty) + ">Commutative Expressions Reassociation</option>\n";
+ html += "<option value='7'" + (config.optimization[pass] == 7 ? selected : empty) + ">Dead Store Elimination</option>\n";
+ html += "<option value='8'" + (config.optimization[pass] == 8 ? selected : empty) + ">Sparse Conditional Copy Propagation</option>\n";
+ html += "<option value='9'" + (config.optimization[pass] == 9 ? selected : empty) + ">Scalar Replacement of Aggregates</option>\n";
+ html += "</select></td></tr>\n";
+ }
+
+ html += "</table>\n";
+ html += "<h2><em>Testing & Experimental</em></h2>\n";
+ html += "<table>\n";
+ html += "<tr><td>Disable SwiftConfig server:</td><td><input name = 'disableServer' type='checkbox'" + (config.disableServer == true ? checked : empty) + " title='If checked disables the web browser based control panel.'></td></tr>";
+ html += "<tr><td>Force windowed mode:</td><td><input name = 'forceWindowed' type='checkbox'" + (config.forceWindowed == true ? checked : empty) + " title='If checked prevents the application from switching to full-screen mode.'></td></tr>";
+ html += "<tr><td>Complementary depth buffer:</td><td><input name = 'complementaryDepthBuffer' type='checkbox'" + (config.complementaryDepthBuffer == true ? checked : empty) + " title='If checked causes 1 - z to be stored in the depth buffer.'></td></tr>";
+ html += "<tr><td>Post alpha blend sRGB conversion:</td><td><input name = 'postBlendSRGB' type='checkbox'" + (config.postBlendSRGB == true ? checked : empty) + " title='If checked alpha blending is performed in linear color space.'></td></tr>";
+ html += "<tr><td>Exact color rounding:</td><td><input name = 'exactColorRounding' type='checkbox'" + (config.exactColorRounding == true ? checked : empty) + " title='If checked color rounding is done at high accuracy.'></td></tr>";
+ html += "<tr><td>Disable alpha display formats:</td><td><input name = 'disableAlphaMode' type='checkbox'" + (config.disableAlphaMode == true ? checked : empty) + " title='If checked the device does not advertise the A8R8G8B8 display mode.'></td></tr>";
+ html += "<tr><td>Disable 10-bit display formats:</td><td><input name = 'disable10BitMode' type='checkbox'" + (config.disable10BitMode == true ? checked : empty) + " title='If checked the device does not advertise the A2R10G10B10 display mode.'></td></tr>";
+ html += "<tr><td>Frame-buffer API:</td><td><select name='frameBufferAPI' title='The API used for displaying the rendered result on screen (requires restart).'>\n";
+ html += "<option value='0'" + (config.frameBufferAPI == 0 ? selected : empty) + ">DirectDraw (default)</option>\n";
+ html += "<option value='1'" + (config.frameBufferAPI == 1 ? selected : empty) + ">GDI</option>\n";
+ html += "</select></td>\n";
+ html += "<tr><td>DLL precaching:</td><td><input name = 'precache' type='checkbox'" + (config.precache == true ? checked : empty) + " title='If checked dynamically generated routines will be stored in a DLL for faster loading on application restart.'></td></tr>";
+ html += "<tr><td>Shadow mapping extensions:</td><td><select name='shadowMapping' title='Features that may accelerate or improve the quality of shadow mapping.'>\n";
+ html += "<option value='0'" + (config.shadowMapping == 0 ? selected : empty) + ">None</option>\n";
+ html += "<option value='1'" + (config.shadowMapping == 1 ? selected : empty) + ">Fetch4</option>\n";
+ html += "<option value='2'" + (config.shadowMapping == 2 ? selected : empty) + ">DST</option>\n";
+ html += "<option value='3'" + (config.shadowMapping == 3 ? selected : empty) + ">Fetch4 & DST (default)</option>\n";
+ html += "</select></td>\n";
+ html += "<tr><td>Force clearing registers that have no default value:</td><td><input name = 'forceClearRegisters' type='checkbox'" + (config.forceClearRegisters == true ? checked : empty) + " title='Initializes shader register values to 0 even if they have no default.'></td></tr>";
+ html += "</table>\n";
+ #ifndef NDEBUG
+ html += "<h2><em>Debugging</em></h2>\n";
+ html += "<table>\n";
+ html += "<tr><td>Minimum primitives:</td><td><input type='text' size='10' maxlength='10' name='minPrimitives' value='" + itoa(config.minPrimitives) + "'></td></tr>\n";
+ html += "<tr><td>Maximum primitives:</td><td><input type='text' size='10' maxlength='10' name='maxPrimitives' value='" + itoa(config.maxPrimitives) + "'></td></tr>\n";
+ html += "</table>\n";
+ #endif
+ html += "<hr><p>\n";
+ html += "<span style='font-size:10pt'>Hover the mouse pointer over a control to get additional information.</span><br>\n";
+ html += "<span style='font-size:10pt'>Some settings can be applied interactively, some need a restart of the application.</span><br>\n";
+ html += "<span style='font-size:10pt'>Removing the SwiftShader.ini file results in resetting the options to their default.</span></p>\n";
+ html += "</form>\n";
+ html += "</body>\n";
+ html += "</html>\n";
+
+ profiler.reset();
+
+ return html;
+ }
+
+	// Builds the HTML fragment holding the live statistics (FPS and frame count).
+	// When PERF_PROFILE is enabled it also appends the operation counters and a
+	// stacked bar showing how the pixel cycles are divided between the stages,
+	// and then clears the cycle counters for the next measurement.
+	std::string SwiftConfig::profile()
+	{
+		std::string report;
+
+		report += "<p>FPS: " + ftoa(profiler.FPS) + "</p>\n";
+		report += "<p>Frame: " + itoa(profiler.framesTotal) + "</p>\n";
+
+		#if PERF_PROFILE
+			// Shares of the pixel cycles in tenths of a percent; this doubles as
+			// the pixel width of each segment of the 1000px wide bar.
+			int texShare = (int)(1000 * profiler.cycles[PERF_TEX] / profiler.cycles[PERF_PIXEL] + 0.5);
+			int shaderShare = (int)(1000 * profiler.cycles[PERF_SHADER] / profiler.cycles[PERF_PIXEL] + 0.5);
+			int pipeShare = (int)(1000 * profiler.cycles[PERF_PIPE] / profiler.cycles[PERF_PIXEL] + 0.5);
+			int ropShare = (int)(1000 * profiler.cycles[PERF_ROP] / profiler.cycles[PERF_PIXEL] + 0.5);
+			int interpShare = (int)(1000 * profiler.cycles[PERF_INTERP] / profiler.cycles[PERF_PIXEL] + 0.5);
+
+			// Whatever is not pipeline time is attributed to rasterization. This
+			// must be computed before the nested shares are subtracted below.
+			int rastShare = 1000 - pipeShare;
+
+			// Make the shares exclusive: the pipe value includes shader, rop and
+			// interpolation, and the shader value includes texture sampling.
+			pipeShare -= shaderShare + ropShare + interpShare;
+			shaderShare -= texShare;
+
+			double texPercent = (double)texShare / 10;
+			double shaderPercent = (double)shaderShare / 10;
+			double pipePercent = (double)pipeShare / 10;
+			double ropPercent = (double)ropShare / 10;
+			double interpPercent = (double)interpShare / 10;
+			double rastPercent = (double)rastShare / 10;
+
+			// Clamp to one frame so the averages never divide by zero.
+			const int frameCount = std::max(profiler.framesTotal, 1);
+
+			double ropAverage = profiler.ropOperationsTotal / frameCount / 1.0e6f;
+			double compressedTexAverage = profiler.compressedTexTotal / frameCount / 1.0e6f;
+			double texAverage = profiler.texOperationsTotal / frameCount / 1.0e6f;
+
+			report += "<p>Raster operations (million): " + ftoa(profiler.ropOperationsFrame / 1.0e6f) + " (current), " + ftoa(ropAverage) + " (average)</p>\n";
+			report += "<p>Texture operations (million): " + ftoa(profiler.texOperationsFrame / 1.0e6f) + " (current), " + ftoa(texAverage) + " (average)</p>\n";
+			report += "<p>Compressed texture operations (million): " + ftoa(profiler.compressedTexFrame / 1.0e6f) + " (current), " + ftoa(compressedTexAverage) + " (average)</p>\n";
+			report += "<div id='profile' style='position:relative; width:1010px; height:50px; background-color:silver;'>";
+			report += "<div style='position:relative; width:1000px; height:40px; background-color:white; left:5px; top:5px;'>";
+			report += "<div style='position:relative; float:left; width:" + itoa(rastShare) + "px; height:40px; border-style:none; text-align:center; line-height:40px; background-color:#FFFF7F; overflow:hidden;'>" + ftoa(rastPercent) + "% rast</div>\n";
+			report += "<div style='position:relative; float:left; width:" + itoa(pipeShare) + "px; height:40px; border-style:none; text-align:center; line-height:40px; background-color:#FF7F7F; overflow:hidden;'>" + ftoa(pipePercent) + "% pipe</div>\n";
+			report += "<div style='position:relative; float:left; width:" + itoa(interpShare) + "px; height:40px; border-style:none; text-align:center; line-height:40px; background-color:#7FFFFF; overflow:hidden;'>" + ftoa(interpPercent) + "% interp</div>\n";
+			report += "<div style='position:relative; float:left; width:" + itoa(shaderShare) + "px; height:40px; border-style:none; text-align:center; line-height:40px; background-color:#7FFF7F; overflow:hidden;'>" + ftoa(shaderPercent) + "% shader</div>\n";
+			report += "<div style='position:relative; float:left; width:" + itoa(texShare) + "px; height:40px; border-style:none; text-align:center; line-height:40px; background-color:#FF7FFF; overflow:hidden;'>" + ftoa(texPercent) + "% tex</div>\n";
+			report += "<div style='position:relative; float:left; width:" + itoa(ropShare) + "px; height:40px; border-style:none; text-align:center; line-height:40px; background-color:#7F7FFF; overflow:hidden;'>" + ftoa(ropPercent) + "% rop</div>\n";
+			report += "</div></div>\n";
+
+			// Restart the cycle measurement for the next report.
+			for(int timer = 0; timer < PERF_TIMERS; timer++)
+			{
+				profiler.cycles[timer] = 0;
+			}
+		#endif
+
+		return report;
+	}
+
+	// Sends an HTTP/1.1 response over 'clientSocket'.
+	//
+	// clientSocket - connection the response is written to.
+	// code         - OK or NotFound; selects the status line.
+	// body         - HTML payload; its size is reported in Content-Length.
+	void SwiftConfig::send(Socket *clientSocket, Status code, std::string body)
+	{
+		std::string status;
+		char header[1024];
+
+		switch(code)
+		{
+		case OK:       status += "HTTP/1.1 200 OK\r\n";        break;
+		case NotFound: status += "HTTP/1.1 404 Not Found\r\n"; break;
+		}
+
+		// body.size() is a size_t, which must be printed with %zu (%zd is for the
+		// signed counterpart). snprintf keeps the write within 'header'.
+		snprintf(header, sizeof(header), "Content-Type: text/html; charset=UTF-8\r\n"
+		                                 "Content-Length: %zu\r\n"
+		                                 "Host: localhost\r\n"
+		                                 "\r\n", body.size());
+
+		std::string message = status + header + body;
+		clientSocket->send(message.c_str(), (int)message.length());
+	}
+
+	// Returns true when 'text' begins with 'prefix'. Used to recognize a form
+	// field at the current parse position only, rather than anywhere in the
+	// remainder of the request.
+	static bool hasFieldPrefix(const char *text, const char *prefix)
+	{
+		return strncmp(text, prefix, strlen(prefix)) == 0;
+	}
+
+	// Applies the settings contained in the body of a form POST to 'config'.
+	//
+	// post - null-terminated string of '&'-separated "name=value" fields.
+	//
+	// The request comes from a network socket, so every field is validated:
+	// a numeric field is only accepted when its value was actually converted,
+	// and the optimization pass index is range-checked before it is used to
+	// index config.optimization.
+	void SwiftConfig::parsePost(const char *post)
+	{
+		// Only enabled checkboxes appear in the POST, so start from "unchecked".
+		// enableSSE is the exception: it is always forced on.
+		config.enableSSE = true;
+		config.enableSSE2 = false;
+		config.enableSSE3 = false;
+		config.enableSSSE3 = false;
+		config.enableSSE4_1 = false;
+		config.disableServer = false;
+		config.forceWindowed = false;
+		config.complementaryDepthBuffer = false;
+		config.postBlendSRGB = false;
+		config.exactColorRounding = false;
+		config.disableAlphaMode = false;
+		config.disable10BitMode = false;
+		config.precache = false;
+		config.forceClearRegisters = false;
+
+		const int passCount = (int)(sizeof(config.optimization) / sizeof(config.optimization[0]));
+
+		while(*post != 0)
+		{
+			int integer = 0;
+			int index = 0;
+
+			// sscanf returns EOF (non-zero) when the input ends before the first
+			// conversion, so compare against the expected conversion count
+			// instead of testing for "non-zero".
+			if(sscanf(post, "pixelShaderVersion=%d", &integer) == 1)
+			{
+				config.pixelShaderVersion = integer;
+			}
+			else if(sscanf(post, "vertexShaderVersion=%d", &integer) == 1)
+			{
+				config.vertexShaderVersion = integer;
+			}
+			else if(sscanf(post, "textureMemory=%d", &integer) == 1)
+			{
+				config.textureMemory = integer;
+			}
+			else if(sscanf(post, "identifier=%d", &integer) == 1)
+			{
+				config.identifier = integer;
+			}
+			else if(sscanf(post, "vertexRoutineCacheSize=%d", &integer) == 1)
+			{
+				config.vertexRoutineCacheSize = integer;
+			}
+			else if(sscanf(post, "pixelRoutineCacheSize=%d", &integer) == 1)
+			{
+				config.pixelRoutineCacheSize = integer;
+			}
+			else if(sscanf(post, "setupRoutineCacheSize=%d", &integer) == 1)
+			{
+				config.setupRoutineCacheSize = integer;
+			}
+			else if(sscanf(post, "vertexCacheSize=%d", &integer) == 1)
+			{
+				config.vertexCacheSize = integer;
+			}
+			else if(sscanf(post, "textureSampleQuality=%d", &integer) == 1)
+			{
+				config.textureSampleQuality = integer;
+			}
+			else if(sscanf(post, "mipmapQuality=%d", &integer) == 1)
+			{
+				config.mipmapQuality = integer;
+			}
+			else if(sscanf(post, "perspectiveCorrection=%d", &integer) == 1)
+			{
+				config.perspectiveCorrection = integer != 0;
+			}
+			else if(sscanf(post, "transcendentalPrecision=%d", &integer) == 1)
+			{
+				config.transcendentalPrecision = integer;
+			}
+			else if(sscanf(post, "transparencyAntialiasing=%d", &integer) == 1)
+			{
+				config.transparencyAntialiasing = integer;
+			}
+			else if(sscanf(post, "threadCount=%d", &integer) == 1)
+			{
+				config.threadCount = integer;
+			}
+			else if(sscanf(post, "frameBufferAPI=%d", &integer) == 1)
+			{
+				config.frameBufferAPI = integer;
+			}
+			else if(sscanf(post, "shadowMapping=%d", &integer) == 1)
+			{
+				config.shadowMapping = integer;
+			}
+			else if(hasFieldPrefix(post, "enableSSE=on"))
+			{
+				config.enableSSE = true;
+			}
+			else if(hasFieldPrefix(post, "enableSSE2=on"))
+			{
+				// Each instruction set extension requires the previous one.
+				if(config.enableSSE)
+				{
+					config.enableSSE2 = true;
+				}
+			}
+			else if(hasFieldPrefix(post, "enableSSE3=on"))
+			{
+				if(config.enableSSE2)
+				{
+					config.enableSSE3 = true;
+				}
+			}
+			else if(hasFieldPrefix(post, "enableSSSE3=on"))
+			{
+				if(config.enableSSE3)
+				{
+					config.enableSSSE3 = true;
+				}
+			}
+			else if(hasFieldPrefix(post, "enableSSE4_1=on"))
+			{
+				if(config.enableSSSE3)
+				{
+					config.enableSSE4_1 = true;
+				}
+			}
+			else if(sscanf(post, "optimization%d=%d", &index, &integer) == 2)
+			{
+				// The pass number is 1-based and comes from the request; ignore
+				// it when it does not address an element of config.optimization.
+				if(index >= 1 && index <= passCount)
+				{
+					config.optimization[index - 1] = (Optimization)integer;
+				}
+			}
+			else if(hasFieldPrefix(post, "disableServer=on"))
+			{
+				config.disableServer = true;
+			}
+			else if(hasFieldPrefix(post, "forceWindowed=on"))
+			{
+				config.forceWindowed = true;
+			}
+			else if(hasFieldPrefix(post, "complementaryDepthBuffer=on"))
+			{
+				config.complementaryDepthBuffer = true;
+			}
+			else if(hasFieldPrefix(post, "postBlendSRGB=on"))
+			{
+				config.postBlendSRGB = true;
+			}
+			else if(hasFieldPrefix(post, "exactColorRounding=on"))
+			{
+				config.exactColorRounding = true;
+			}
+			else if(hasFieldPrefix(post, "disableAlphaMode=on"))
+			{
+				config.disableAlphaMode = true;
+			}
+			else if(hasFieldPrefix(post, "disable10BitMode=on"))
+			{
+				config.disable10BitMode = true;
+			}
+			else if(hasFieldPrefix(post, "precache=on"))
+			{
+				config.precache = true;
+			}
+			else if(hasFieldPrefix(post, "forceClearRegisters=on"))
+			{
+				config.forceClearRegisters = true;
+			}
+			#ifndef NDEBUG
+			else if(sscanf(post, "minPrimitives=%d", &integer) == 1)
+			{
+				config.minPrimitives = integer;
+			}
+			else if(sscanf(post, "maxPrimitives=%d", &integer) == 1)
+			{
+				config.maxPrimitives = integer;
+			}
+			#endif
+			else
+			{
+				// Unknown or malformed field.
+				ASSERT(false);
+			}
+
+			// Advance to the character following the next '&', or to the
+			// terminating null when this was the last field.
+			do
+			{
+				post++;
+			}
+			while(post[-1] != '&' && *post != 0);
+		}
+	}
+
+	// Loads every setting from SwiftShader.ini into 'config', using the built-in
+	// default for any key the file does not provide, and updates 'newConfig'.
+	//
+	// disableServerOverride - when true, config.disableServer is forced to true
+	//                         regardless of the value stored in the file.
+	void SwiftConfig::readConfiguration(bool disableServerOverride)
+	{
+		Configurator settings("SwiftShader.ini");
+
+		// [Capabilities]
+		config.pixelShaderVersion = settings.getInteger("Capabilities", "PixelShaderVersion", 30);
+		config.vertexShaderVersion = settings.getInteger("Capabilities", "VertexShaderVersion", 30);
+		config.textureMemory = settings.getInteger("Capabilities", "TextureMemory", 256);
+		config.identifier = settings.getInteger("Capabilities", "Identifier", 0);
+
+		// [Caches]
+		config.vertexRoutineCacheSize = settings.getInteger("Caches", "VertexRoutineCacheSize", 1024);
+		config.pixelRoutineCacheSize = settings.getInteger("Caches", "PixelRoutineCacheSize", 1024);
+		config.setupRoutineCacheSize = settings.getInteger("Caches", "SetupRoutineCacheSize", 1024);
+		config.vertexCacheSize = settings.getInteger("Caches", "VertexCacheSize", 64);
+
+		// [Quality]
+		config.textureSampleQuality = settings.getInteger("Quality", "TextureSampleQuality", 2);
+		config.mipmapQuality = settings.getInteger("Quality", "MipmapQuality", 1);
+		config.perspectiveCorrection = settings.getBoolean("Quality", "PerspectiveCorrection", true);
+		config.transcendentalPrecision = settings.getInteger("Quality", "TranscendentalPrecision", 2);
+		config.transparencyAntialiasing = settings.getInteger("Quality", "TransparencyAntialiasing", 0);
+
+		// [Processor]
+		config.threadCount = settings.getInteger("Processor", "ThreadCount", DEFAULT_THREAD_COUNT);
+		config.enableSSE = settings.getBoolean("Processor", "EnableSSE", true);
+		config.enableSSE2 = settings.getBoolean("Processor", "EnableSSE2", true);
+		config.enableSSE3 = settings.getBoolean("Processor", "EnableSSE3", true);
+		config.enableSSSE3 = settings.getBoolean("Processor", "EnableSSSE3", true);
+		config.enableSSE4_1 = settings.getBoolean("Processor", "EnableSSE4_1", true);
+
+		// [Optimization]: only the first pass is enabled by default.
+		for(int i = 0; i < 10; i++)
+		{
+			config.optimization[i] = (Optimization)settings.getInteger("Optimization", "OptimizationPass" + itoa(i + 1), i == 0 ? InstructionCombining : Disabled);
+		}
+
+		// [Testing]
+		config.disableServer = settings.getBoolean("Testing", "DisableServer", false);
+		config.forceWindowed = settings.getBoolean("Testing", "ForceWindowed", false);
+		config.complementaryDepthBuffer = settings.getBoolean("Testing", "ComplementaryDepthBuffer", false);
+		config.postBlendSRGB = settings.getBoolean("Testing", "PostBlendSRGB", false);
+		config.exactColorRounding = settings.getBoolean("Testing", "ExactColorRounding", true);
+		config.disableAlphaMode = settings.getBoolean("Testing", "DisableAlphaMode", false);
+		config.disable10BitMode = settings.getBoolean("Testing", "Disable10BitMode", false);
+		config.frameBufferAPI = settings.getInteger("Testing", "FrameBufferAPI", 0);
+		config.precache = settings.getBoolean("Testing", "Precache", false);
+		config.shadowMapping = settings.getInteger("Testing", "ShadowMapping", 3);
+		config.forceClearRegisters = settings.getBoolean("Testing", "ForceClearRegisters", false);
+
+		// The debugging limits are not stored in the file.
+		#ifndef NDEBUG
+			config.minPrimitives = 1;
+			config.maxPrimitives = 1 << 21;
+		#endif
+
+		// The configuration counts as new when the file exists and its
+		// modification time is more than one second away from the time that
+		// was recorded inside the file.
+		struct stat fileInfo;
+		int recordedTime = settings.getInteger("LastModified", "Time", 0);
+
+		bool fileExists = stat("SwiftShader.ini", &fileInfo) == 0;
+		newConfig = fileExists && abs((int)fileInfo.st_mtime - recordedTime) > 1;
+
+		if(disableServerOverride)
+		{
+			config.disableServer = true;
+		}
+	}
+
+	// Stores the current contents of 'config' in SwiftShader.ini, together with
+	// the current time under [LastModified], and writes the file with a short
+	// explanatory header.
+	void SwiftConfig::writeConfiguration()
+	{
+		Configurator settings("SwiftShader.ini");
+
+		// [Capabilities]
+		settings.addValue("Capabilities", "PixelShaderVersion", itoa(config.pixelShaderVersion));
+		settings.addValue("Capabilities", "VertexShaderVersion", itoa(config.vertexShaderVersion));
+		settings.addValue("Capabilities", "TextureMemory", itoa(config.textureMemory));
+		settings.addValue("Capabilities", "Identifier", itoa(config.identifier));
+
+		// [Caches]
+		settings.addValue("Caches", "VertexRoutineCacheSize", itoa(config.vertexRoutineCacheSize));
+		settings.addValue("Caches", "PixelRoutineCacheSize", itoa(config.pixelRoutineCacheSize));
+		settings.addValue("Caches", "SetupRoutineCacheSize", itoa(config.setupRoutineCacheSize));
+		settings.addValue("Caches", "VertexCacheSize", itoa(config.vertexCacheSize));
+
+		// [Quality]
+		settings.addValue("Quality", "TextureSampleQuality", itoa(config.textureSampleQuality));
+		settings.addValue("Quality", "MipmapQuality", itoa(config.mipmapQuality));
+		settings.addValue("Quality", "PerspectiveCorrection", itoa(config.perspectiveCorrection));
+		settings.addValue("Quality", "TranscendentalPrecision", itoa(config.transcendentalPrecision));
+		settings.addValue("Quality", "TransparencyAntialiasing", itoa(config.transparencyAntialiasing));
+
+		// [Processor]: EnableSSE is deliberately left out of the file.
+		settings.addValue("Processor", "ThreadCount", itoa(config.threadCount));
+	//	settings.addValue("Processor", "EnableSSE", itoa(config.enableSSE));
+		settings.addValue("Processor", "EnableSSE2", itoa(config.enableSSE2));
+		settings.addValue("Processor", "EnableSSE3", itoa(config.enableSSE3));
+		settings.addValue("Processor", "EnableSSSE3", itoa(config.enableSSSE3));
+		settings.addValue("Processor", "EnableSSE4_1", itoa(config.enableSSE4_1));
+
+		// [Optimization]: keys are numbered from 1.
+		for(int i = 0; i < 10; i++)
+		{
+			settings.addValue("Optimization", "OptimizationPass" + itoa(i + 1), itoa(config.optimization[i]));
+		}
+
+		// [Testing]
+		settings.addValue("Testing", "DisableServer", itoa(config.disableServer));
+		settings.addValue("Testing", "ForceWindowed", itoa(config.forceWindowed));
+		settings.addValue("Testing", "ComplementaryDepthBuffer", itoa(config.complementaryDepthBuffer));
+		settings.addValue("Testing", "PostBlendSRGB", itoa(config.postBlendSRGB));
+		settings.addValue("Testing", "ExactColorRounding", itoa(config.exactColorRounding));
+		settings.addValue("Testing", "DisableAlphaMode", itoa(config.disableAlphaMode));
+		settings.addValue("Testing", "Disable10BitMode", itoa(config.disable10BitMode));
+		settings.addValue("Testing", "FrameBufferAPI", itoa(config.frameBufferAPI));
+		settings.addValue("Testing", "Precache", itoa(config.precache));
+		settings.addValue("Testing", "ShadowMapping", itoa(config.shadowMapping));
+		settings.addValue("Testing", "ForceClearRegisters", itoa(config.forceClearRegisters));
+
+		// readConfiguration() compares this time stamp with the file's
+		// modification time.
+		settings.addValue("LastModified", "Time", itoa((int)time(0)));
+
+		settings.writeFile("SwiftShader Configuration File\n"
+		                   ";\n"
+		                   "; To get an overview of the valid settings and their meaning,\n"
+		                   "; run the application in windowed mode and open the\n"
+		                   "; SwiftConfig application or go to http://localhost:8080/swiftconfig.");
+	}
+}
diff --git a/src/Device/SwiftConfig.hpp b/src/Device/SwiftConfig.hpp
new file mode 100644
index 0000000..233b438
--- /dev/null
+++ b/src/Device/SwiftConfig.hpp
@@ -0,0 +1,115 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_SwiftConfig_hpp
+#define sw_SwiftConfig_hpp
+
+#include "Reactor/Nucleus.hpp"
+
+#include "Common/Thread.hpp"
+#include "Common/MutexLock.hpp"
+#include "Common/Socket.hpp"
+
+#include <string>
+
+namespace sw
+{
+ // Holds the renderer settings, loads and stores them in SwiftShader.ini, and
+ // serves an HTML control panel through which they can be viewed and changed.
+ class SwiftConfig
+ {
+ public:
+ // The complete set of settings. The values in parentheses are the defaults
+ // used by readConfiguration() when SwiftShader.ini has no entry for them.
+ struct Configuration
+ {
+ // [Capabilities]
+ int pixelShaderVersion; // (30)
+ int vertexShaderVersion; // (30)
+ int textureMemory; // (256)
+ int identifier; // (0)
+ // [Caches]
+ int vertexRoutineCacheSize; // (1024)
+ int pixelRoutineCacheSize; // (1024)
+ int setupRoutineCacheSize; // (1024)
+ int vertexCacheSize; // Number of processed vertices cached for reuse (64)
+ // [Quality]
+ int textureSampleQuality; // 0 = point, 1 = linear, 2 = anisotropic (2)
+ int mipmapQuality; // 0 = point, 1 = linear (1)
+ bool perspectiveCorrection; // (true)
+ int transcendentalPrecision; // 0 = approximate, 1 = partial, 2 = accurate, 3 = WHQL, 4 = IEEE (2)
+ // [Processor]
+ int threadCount; // -1 = core count, 0 = process affinity, otherwise the number of threads
+ bool enableSSE; // Always forced to true by parsePost(); not written to the file
+ bool enableSSE2;
+ bool enableSSE3;
+ bool enableSSSE3;
+ bool enableSSE4_1;
+ // [Optimization]
+ Optimization optimization[10]; // One entry per shader compiler optimization pass
+ // [Testing]
+ bool disableServer; // Disables the web browser based control panel
+ // NOTE(review): readConfiguration() and parsePost() do not assign
+ // keepSystemCursor - confirm it is initialized before it is read.
+ bool keepSystemCursor;
+ bool forceWindowed; // Prevents switching to full-screen mode
+ bool complementaryDepthBuffer; // Stores 1 - z in the depth buffer
+ bool postBlendSRGB; // Performs alpha blending in linear color space
+ bool exactColorRounding; // (true)
+ bool disableAlphaMode; // Does not advertise the A8R8G8B8 display mode
+ bool disable10BitMode; // Does not advertise the A2R10G10B10 display mode
+ int transparencyAntialiasing; // 0 = none, 1 = alpha-to-coverage (0); stored under [Quality]
+ int frameBufferAPI; // 0 = DirectDraw, 1 = GDI (0)
+ bool precache; // Stores generated routines in a DLL for faster loading on restart
+ int shadowMapping; // 0 = none, 1 = Fetch4, 2 = DST, 3 = Fetch4 & DST (3)
+ bool forceClearRegisters; // Initializes shader registers that have no default value to 0
+ // Debugging limits; only available in builds without NDEBUG and never
+ // stored in the file (reset to 1 and 1 << 21 by readConfiguration()).
+ #ifndef NDEBUG
+ unsigned int minPrimitives;
+ unsigned int maxPrimitives;
+ #endif
+ };
+
+ // disableServerOverride is forwarded to readConfiguration() - presumably;
+ // the constructor body is not part of this header, confirm there.
+ SwiftConfig(bool disableServerOverride);
+
+ ~SwiftConfig();
+
+ // NOTE(review): presumably reports (and with 'reset' clears) the newConfig
+ // flag - confirm against the implementation.
+ bool hasNewConfiguration(bool reset = true);
+ // NOTE(review): presumably copies the current settings into
+ // 'configuration' - confirm against the implementation.
+ void getConfiguration(Configuration &configuration);
+
+ private:
+ // HTTP status codes supported by send().
+ enum Status
+ {
+ OK = 200,
+ NotFound = 404
+ };
+
+ void createServer();
+ void destroyServer();
+
+ static void serverRoutine(void *parameters);
+
+ void serverLoop();
+ void respond(Socket *clientSocket, const char *request);
+ // Returns the HTML of the control panel page.
+ std::string page();
+ // Returns the HTML fragment with the FPS, the frame count and, when
+ // PERF_PROFILE is enabled, the profiling statistics.
+ std::string profile();
+ // Sends an HTTP response with the given status and HTML body.
+ void send(Socket *clientSocket, Status code, std::string body = "");
+ // Applies the '&'-separated "name=value" fields of a form POST to 'config'.
+ void parsePost(const char *post);
+
+ // Loads 'config' from SwiftShader.ini and updates 'newConfig'; a true
+ // argument forces config.disableServer to true.
+ void readConfiguration(bool disableServerOverride = false);
+ // Stores 'config' and the current time in SwiftShader.ini.
+ void writeConfiguration();
+
+ Configuration config;
+
+ Thread *serverThread;
+ volatile bool terminate;
+ MutexLock criticalSection; // Protects reading and writing the configuration settings
+
+ // Set by readConfiguration() when SwiftShader.ini exists and its modification
+ // time differs by more than one second from the time recorded inside it.
+ bool newConfig;
+
+ Socket *listenSocket;
+
+ int bufferLength;
+ char *receiveBuffer;
+ };
+}
+
+#endif // sw_SwiftConfig_hpp
diff --git a/src/Device/TextureStage.cpp b/src/Device/TextureStage.cpp
new file mode 100644
index 0000000..0327478
--- /dev/null
+++ b/src/Device/TextureStage.cpp
@@ -0,0 +1,412 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "TextureStage.hpp"
+
+#include "Sampler.hpp"
+#include "Common/Debug.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+ // Clears the whole State object to zero bytes, so every bit-field starts at 0.
+ TextureStage::State::State()
+ {
+     memset(this, 0, sizeof(*this));
+ }
+
+ // Creates a stage with no sampler and no previous stage attached; init() supplies both.
+ TextureStage::TextureStage()
+     : sampler(nullptr)
+     , previousStage(nullptr)
+ {
+ }
+
+ // Nothing to release: the sampler and previous-stage pointers are never deleted here.
+ TextureStage::~TextureStage() = default;
+
+ // Resets the stage to its default configuration and records its index, sampler and predecessor.
+ // Stage 0 defaults to MODULATE (color) and SELECTARG1 (alpha); every other stage defaults to DISABLE.
+ void TextureStage::init(int stage, const Sampler *sampler, const TextureStage *previousStage)
+ {
+     const bool isFirstStage = (stage == 0);
+
+     // Identity and links
+     this->stage = stage;
+     this->sampler = sampler;
+     this->previousStage = previousStage;
+     texCoordIndex = stage;
+
+     // Color combiner
+     stageOperation = isFirstStage ? STAGE_MODULATE : STAGE_DISABLE;
+     firstArgument = SOURCE_TEXTURE;
+     secondArgument = SOURCE_CURRENT;
+     thirdArgument = SOURCE_CURRENT;
+     firstModifier = MODIFIER_COLOR;
+     secondModifier = MODIFIER_COLOR;
+     thirdModifier = MODIFIER_COLOR;
+
+     // Alpha combiner
+     stageOperationAlpha = isFirstStage ? STAGE_SELECTARG1 : STAGE_DISABLE;
+     firstArgumentAlpha = SOURCE_DIFFUSE;
+     secondArgumentAlpha = SOURCE_CURRENT;
+     thirdArgumentAlpha = SOURCE_CURRENT;
+     firstModifierAlpha = MODIFIER_COLOR;
+     secondModifierAlpha = MODIFIER_COLOR;
+     thirdModifierAlpha = MODIFIER_COLOR;
+
+     destinationArgument = DESTINATION_CURRENT;
+ }
+
+ // Builds the compact State snapshot of this stage.
+ // A disabled stage yields the all-zero State produced by State's constructor.
+ TextureStage::State TextureStage::textureStageState() const
+ {
+     State snapshot;
+
+     if(isStageDisabled())
+     {
+         return snapshot;
+     }
+
+     const bool sampled = usesTexture();
+
+     // Color combiner
+     snapshot.stageOperation = stageOperation;
+     snapshot.firstArgument = firstArgument;
+     snapshot.secondArgument = secondArgument;
+     snapshot.thirdArgument = thirdArgument;
+     snapshot.firstModifier = firstModifier;
+     snapshot.secondModifier = secondModifier;
+     snapshot.thirdModifier = thirdModifier;
+
+     // Alpha combiner
+     snapshot.stageOperationAlpha = stageOperationAlpha;
+     snapshot.firstArgumentAlpha = firstArgumentAlpha;
+     snapshot.secondArgumentAlpha = secondArgumentAlpha;
+     snapshot.thirdArgumentAlpha = thirdArgumentAlpha;
+     snapshot.firstModifierAlpha = firstModifierAlpha;
+     snapshot.secondModifierAlpha = secondModifierAlpha;
+     snapshot.thirdModifierAlpha = thirdModifierAlpha;
+
+     snapshot.destinationArgument = destinationArgument;
+     snapshot.texCoordIndex = texCoordIndex;
+
+     // Set when the texture is unsigned or when the stage does not read a texture at all.
+     snapshot.cantUnderflow = sampler->hasUnsignedTexture() || !sampled;
+     snapshot.usesTexture = sampled;
+
+     return snapshot;
+ }
+
+ // Stores the constant color as fixed-point values (component * 4095, rounded).
+ // Row 0 holds red, row 1 green, row 2 blue, row 3 alpha; each row replicates its value across all 4 lanes.
+ void TextureStage::setConstantColor(const Color<float> &constantColor)
+ {
+     // FIXME: Compact into generic function // FIXME: Clamp
+     const short channel[4] =
+     {
+         static_cast<short>(iround(4095 * constantColor.r)),
+         static_cast<short>(iround(4095 * constantColor.g)),
+         static_cast<short>(iround(4095 * constantColor.b)),
+         static_cast<short>(iround(4095 * constantColor.a))
+     };
+
+     for(int component = 0; component < 4; component++)
+     {
+         for(int lane = 0; lane < 4; lane++)
+         {
+             uniforms.constantColor4[component][lane] = channel[component];
+         }
+     }
+ }
+
+ // Writes one element of the 2x2 bump-map matrix, addressed row-major (element / 2, element % 2).
+ // The value is stored both as float and as fixed point (value * 4095, rounded), replicated across 4 lanes.
+ void TextureStage::setBumpmapMatrix(int element, float value)
+ {
+     const int row = element / 2;
+     const int column = element % 2;
+     const auto fixedPoint = iround(4095 * value);
+
+     for(int lane = 0; lane < 4; lane++)
+     {
+         uniforms.bumpmapMatrix4F[row][column][lane] = value;
+     }
+
+     for(int lane = 0; lane < 4; lane++)
+     {
+         uniforms.bumpmapMatrix4W[row][column][lane] = fixedPoint;
+     }
+ }
+
+ // Stores the luminance scale as fixed point (value * 4095, rounded) in all 4 lanes.
+ void TextureStage::setLuminanceScale(float value)
+ {
+     const short fixedPoint = iround(4095 * value);
+
+     for(int lane = 0; lane < 4; lane++)
+     {
+         uniforms.luminanceScale4[lane] = fixedPoint;
+     }
+ }
+
+ // Stores the luminance offset as fixed point (value * 4095, rounded) in all 4 lanes.
+ void TextureStage::setLuminanceOffset(float value)
+ {
+     const short fixedPoint = iround(4095 * value);
+
+     for(int lane = 0; lane < 4; lane++)
+     {
+         uniforms.luminanceOffset4[lane] = fixedPoint;
+     }
+ }
+
+ // Selects the texture coordinate set read by this stage; the index must be below 8.
+ void TextureStage::setTexCoordIndex(unsigned int texCoordIndex)
+ {
+     ASSERT(texCoordIndex < 8);
+
+     TextureStage::texCoordIndex = texCoordIndex;
+ }
+
+ // --- Color combiner: operation and its three arguments ---
+
+ void TextureStage::setStageOperation(StageOperation stageOperation)
+ {
+     TextureStage::stageOperation = stageOperation;
+ }
+
+ void TextureStage::setFirstArgument(SourceArgument firstArgument)
+ {
+     TextureStage::firstArgument = firstArgument;
+ }
+
+ void TextureStage::setSecondArgument(SourceArgument secondArgument)
+ {
+     TextureStage::secondArgument = secondArgument;
+ }
+
+ void TextureStage::setThirdArgument(SourceArgument thirdArgument)
+ {
+     TextureStage::thirdArgument = thirdArgument;
+ }
+
+ // --- Alpha combiner: operation and its three arguments ---
+
+ void TextureStage::setStageOperationAlpha(StageOperation stageOperationAlpha)
+ {
+     TextureStage::stageOperationAlpha = stageOperationAlpha;
+ }
+
+ void TextureStage::setFirstArgumentAlpha(SourceArgument firstArgumentAlpha)
+ {
+     TextureStage::firstArgumentAlpha = firstArgumentAlpha;
+ }
+
+ void TextureStage::setSecondArgumentAlpha(SourceArgument secondArgumentAlpha)
+ {
+     TextureStage::secondArgumentAlpha = secondArgumentAlpha;
+ }
+
+ void TextureStage::setThirdArgumentAlpha(SourceArgument thirdArgumentAlpha)
+ {
+     TextureStage::thirdArgumentAlpha = thirdArgumentAlpha;
+ }
+
+ // --- Modifiers applied to the color arguments ---
+
+ void TextureStage::setFirstModifier(ArgumentModifier firstModifier)
+ {
+     TextureStage::firstModifier = firstModifier;
+ }
+
+ void TextureStage::setSecondModifier(ArgumentModifier secondModifier)
+ {
+     TextureStage::secondModifier = secondModifier;
+ }
+
+ void TextureStage::setThirdModifier(ArgumentModifier thirdModifier)
+ {
+     TextureStage::thirdModifier = thirdModifier;
+ }
+
+ // --- Modifiers applied to the alpha arguments ---
+
+ void TextureStage::setFirstModifierAlpha(ArgumentModifier firstModifierAlpha)
+ {
+     TextureStage::firstModifierAlpha = firstModifierAlpha;
+ }
+
+ void TextureStage::setSecondModifierAlpha(ArgumentModifier secondModifierAlpha)
+ {
+     TextureStage::secondModifierAlpha = secondModifierAlpha;
+ }
+
+ void TextureStage::setThirdModifierAlpha(ArgumentModifier thirdModifierAlpha)
+ {
+     TextureStage::thirdModifierAlpha = thirdModifierAlpha;
+ }
+
+ // Selects the destination argument (DESTINATION_CURRENT or DESTINATION_TEMP) of this stage.
+ void TextureStage::setDestinationArgument(DestinationArgument destinationArgument)
+ {
+     TextureStage::destinationArgument = destinationArgument;
+ }
+
+ // Reports whether the color combiner of this stage reads the given source argument.
+ // Only the arguments that the current color operation actually consumes are considered.
+ bool TextureStage::usesColor(SourceArgument source) const
+ {
+     switch(stageOperation)
+     {
+     // Single-argument operations
+     case STAGE_SELECTARG1:
+     case STAGE_PREMODULATE:
+         return firstArgument == source;
+     case STAGE_SELECTARG2:
+         return secondArgument == source;
+     case STAGE_SELECTARG3:
+         return thirdArgument == source;
+     default:
+         break;
+     }
+
+     // Every remaining operation reads at least the first two arguments.
+     if(firstArgument == source || secondArgument == source)
+     {
+         return true;
+     }
+
+     // Only MULTIPLYADD and LERP also read the third argument.
+     const bool readsThird = (stageOperation == STAGE_MULTIPLYADD || stageOperation == STAGE_LERP);
+
+     return readsThird && thirdArgument == source;
+ }
+
+ // Reports whether this stage reads the alpha channel of the given source argument.
+ // Always false while the alpha operation is STAGE_DISABLE. Otherwise three things are checked in turn:
+ //  1. color blend operations that implicitly take their factor from the source's alpha,
+ //  2. color arguments equal to the source with an ALPHA/INVALPHA modifier,
+ //  3. the arguments consumed by the alpha operation itself.
+ bool TextureStage::usesAlpha(SourceArgument source) const
+ {
+     if(stageOperationAlpha == STAGE_DISABLE)
+     {
+         return false;
+     }
+
+     // 1. Color operations that blend using the alpha of a fixed source
+     switch(source)
+     {
+     case SOURCE_TEXTURE:
+         if(stageOperation == STAGE_BLENDTEXTUREALPHA || stageOperation == STAGE_BLENDTEXTUREALPHAPM) return true;
+         break;
+     case SOURCE_CURRENT:
+         if(stageOperation == STAGE_BLENDCURRENTALPHA) return true;
+         break;
+     case SOURCE_DIFFUSE:
+         if(stageOperation == STAGE_BLENDDIFFUSEALPHA) return true;
+         break;
+     case SOURCE_TFACTOR:
+         if(stageOperation == STAGE_BLENDFACTORALPHA) return true;
+         break;
+     default:
+         break;
+     }
+
+     // 2. Color arguments that name the source and replicate its alpha
+     const bool firstReadsAlpha = firstArgument == source && (firstModifier == MODIFIER_ALPHA || firstModifier == MODIFIER_INVALPHA);
+     const bool secondReadsAlpha = secondArgument == source && (secondModifier == MODIFIER_ALPHA || secondModifier == MODIFIER_INVALPHA);
+     const bool thirdReadsAlpha = thirdArgument == source && (thirdModifier == MODIFIER_ALPHA || thirdModifier == MODIFIER_INVALPHA);
+
+     switch(stageOperation)
+     {
+     // Single-argument operations
+     case STAGE_SELECTARG1:
+     case STAGE_PREMODULATE:
+         if(firstReadsAlpha) return true;
+         break;
+     case STAGE_SELECTARG2:
+         if(secondReadsAlpha) return true;
+         break;
+     case STAGE_SELECTARG3:
+         if(thirdReadsAlpha) return true;
+         break;
+     default:
+         // Two arguments or more
+         if(firstReadsAlpha || secondReadsAlpha) return true;
+
+         // Three arguments
+         if((stageOperation == STAGE_MULTIPLYADD || stageOperation == STAGE_LERP) && thirdReadsAlpha) return true;
+         break;
+     }
+
+     // 3. Arguments consumed by the alpha operation
+     switch(stageOperationAlpha)
+     {
+     case STAGE_SELECTARG1:
+     case STAGE_PREMODULATE:
+         return firstArgumentAlpha == source;
+     case STAGE_SELECTARG2:
+         return secondArgumentAlpha == source;
+     case STAGE_SELECTARG3:
+         return thirdArgumentAlpha == source;
+     default:
+         break;
+     }
+
+     if(firstArgumentAlpha == source || secondArgumentAlpha == source)
+     {
+         return true;
+     }
+
+     const bool alphaReadsThird = (stageOperationAlpha == STAGE_MULTIPLYADD || stageOperationAlpha == STAGE_LERP);
+
+     return alphaReadsThird && thirdArgumentAlpha == source;
+ }
+
+ // True when either the color or the alpha path of this stage reads the given source.
+ bool TextureStage::uses(SourceArgument source) const
+ {
+     if(usesColor(source))
+     {
+         return true;
+     }
+
+     return usesAlpha(source);
+ }
+
+ // True when the stage reads SOURCE_CURRENT or either operation is BLENDCURRENTALPHA.
+ bool TextureStage::usesCurrent() const
+ {
+     if(uses(SOURCE_CURRENT))
+     {
+         return true;
+     }
+
+     return stageOperation == STAGE_BLENDCURRENTALPHA || stageOperationAlpha == STAGE_BLENDCURRENTALPHA;
+ }
+
+ // True when the stage reads SOURCE_DIFFUSE or either operation is BLENDDIFFUSEALPHA.
+ bool TextureStage::usesDiffuse() const
+ {
+     if(uses(SOURCE_DIFFUSE))
+     {
+         return true;
+     }
+
+     return stageOperation == STAGE_BLENDDIFFUSEALPHA || stageOperationAlpha == STAGE_BLENDDIFFUSEALPHA;
+ }
+
+ // True when the stage reads SOURCE_SPECULAR.
+ bool TextureStage::usesSpecular() const
+ {
+     const bool readsSpecular = uses(SOURCE_SPECULAR);
+
+     return readsSpecular;
+ }
+
+ // True when this stage needs its texture: it reads SOURCE_TEXTURE, one of its operations blends
+ // with the texture alpha, or the previous stage (if any) uses PREMODULATE for color or alpha.
+ bool TextureStage::usesTexture() const
+ {
+     if(uses(SOURCE_TEXTURE))
+     {
+         return true;
+     }
+
+     const bool colorBlendsTextureAlpha = stageOperation == STAGE_BLENDTEXTUREALPHA || stageOperation == STAGE_BLENDTEXTUREALPHAPM;
+     const bool alphaBlendsTextureAlpha = stageOperationAlpha == STAGE_BLENDTEXTUREALPHA || stageOperationAlpha == STAGE_BLENDTEXTUREALPHAPM;
+
+     if(colorBlendsTextureAlpha || alphaBlendsTextureAlpha)
+     {
+         return true;
+     }
+
+     if(!previousStage)
+     {
+         return false;
+     }
+
+     return previousStage->stageOperation == STAGE_PREMODULATE || previousStage->stageOperationAlpha == STAGE_PREMODULATE;
+ }
+
+ // A stage is disabled when its color operation is STAGE_DISABLE, when it needs a texture that the
+ // sampler does not have, or when any stage before it in the chain is disabled.
+ bool TextureStage::isStageDisabled() const
+ {
+     if(stageOperation == STAGE_DISABLE)
+     {
+         return true;
+     }
+
+     if(!sampler->hasTexture() && usesTexture())
+     {
+         return true;
+     }
+
+     // This stage is fine on its own; defer to the previous stage, if there is one.
+     return previousStage ? previousStage->isStageDisabled() : false;
+ }
+
+ // True when the stage is enabled, targets DESTINATION_CURRENT and is not a bump-environment-map operation.
+ bool TextureStage::writesCurrent() const
+ {
+     if(isStageDisabled())
+     {
+         return false;
+     }
+
+     if(destinationArgument != DESTINATION_CURRENT)
+     {
+         return false;
+     }
+
+     const bool isBumpMapOperation = (stageOperation == STAGE_BUMPENVMAP || stageOperation == STAGE_BUMPENVMAPLUMINANCE);
+
+     return !isBumpMapOperation;
+ }
+}
diff --git a/src/Device/TextureStage.hpp b/src/Device/TextureStage.hpp
new file mode 100644
index 0000000..2c9ecbd
--- /dev/null
+++ b/src/Device/TextureStage.hpp
@@ -0,0 +1,198 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_TextureStage_hpp
+#define sw_TextureStage_hpp
+
+#include "Common/Types.hpp"
+#include "Common/Math.hpp"
+#include "Renderer/Color.hpp"
+
+namespace sw
+{
+ class Sampler;
+ class PixelRoutine;
+ class Context;
+
+ // Configuration of one texture stage: a color combiner and an alpha combiner, each with an
+ // operation and up to three source arguments (with modifiers), plus a destination argument.
+ // textureStageState() condenses the configuration into the bit-packed State struct.
+ class TextureStage
+ {
+ friend class Context; // FIXME
+
+ public:
+ // Combiner operations. Enumerator values determine the bit-field widths in State (via STAGE_LAST).
+ enum StageOperation
+ {
+ STAGE_DISABLE,
+ STAGE_SELECTARG1,
+ STAGE_SELECTARG2,
+ STAGE_SELECTARG3,
+ STAGE_MODULATE,
+ STAGE_MODULATE2X,
+ STAGE_MODULATE4X,
+ STAGE_ADD,
+ STAGE_ADDSIGNED,
+ STAGE_ADDSIGNED2X,
+ STAGE_SUBTRACT,
+ STAGE_ADDSMOOTH,
+ STAGE_MULTIPLYADD,
+ STAGE_LERP,
+ STAGE_DOT3,
+ STAGE_BLENDCURRENTALPHA,
+ STAGE_BLENDDIFFUSEALPHA,
+ STAGE_BLENDFACTORALPHA,
+ STAGE_BLENDTEXTUREALPHA,
+ STAGE_BLENDTEXTUREALPHAPM,
+ STAGE_PREMODULATE,
+ STAGE_MODULATEALPHA_ADDCOLOR,
+ STAGE_MODULATECOLOR_ADDALPHA,
+ STAGE_MODULATEINVALPHA_ADDCOLOR,
+ STAGE_MODULATEINVCOLOR_ADDALPHA,
+ STAGE_BUMPENVMAP,
+ STAGE_BUMPENVMAPLUMINANCE,
+
+ STAGE_LAST = STAGE_BUMPENVMAPLUMINANCE
+ };
+
+ // Inputs an operation argument can read from.
+ enum SourceArgument
+ {
+ SOURCE_TEXTURE,
+ SOURCE_CONSTANT,
+ SOURCE_CURRENT,
+ SOURCE_DIFFUSE,
+ SOURCE_SPECULAR,
+ SOURCE_TEMP,
+ SOURCE_TFACTOR,
+
+ SOURCE_LAST = SOURCE_TFACTOR
+ };
+
+ // Where the result of a stage is written.
+ enum DestinationArgument
+ {
+ DESTINATION_CURRENT,
+ DESTINATION_TEMP,
+
+ DESTINATION_LAST = DESTINATION_TEMP
+ };
+
+ // Modifier applied to an argument; ALPHA and INVALPHA make the argument read the source's alpha.
+ enum ArgumentModifier
+ {
+ MODIFIER_COLOR,
+ MODIFIER_INVCOLOR,
+ MODIFIER_ALPHA,
+ MODIFIER_INVALPHA,
+
+ MODIFIER_LAST = MODIFIER_INVALPHA
+ };
+
+ // Bit-packed snapshot of the stage configuration, as returned by textureStageState().
+ // The constructor zero-fills the whole struct with memset.
+ struct State
+ {
+ State();
+
+ unsigned int stageOperation : BITS(STAGE_LAST);
+ unsigned int firstArgument : BITS(SOURCE_LAST);
+ unsigned int secondArgument : BITS(SOURCE_LAST);
+ unsigned int thirdArgument : BITS(SOURCE_LAST);
+ unsigned int stageOperationAlpha : BITS(STAGE_LAST);
+ unsigned int firstArgumentAlpha : BITS(SOURCE_LAST);
+ unsigned int secondArgumentAlpha : BITS(SOURCE_LAST);
+ unsigned int thirdArgumentAlpha : BITS(SOURCE_LAST);
+ unsigned int firstModifier : BITS(MODIFIER_LAST);
+ unsigned int secondModifier : BITS(MODIFIER_LAST);
+ unsigned int thirdModifier : BITS(MODIFIER_LAST);
+ unsigned int firstModifierAlpha : BITS(MODIFIER_LAST);
+ unsigned int secondModifierAlpha : BITS(MODIFIER_LAST);
+ unsigned int thirdModifierAlpha : BITS(MODIFIER_LAST);
+ unsigned int destinationArgument : BITS(DESTINATION_LAST);
+ // Sized for indices up to 7; setTexCoordIndex() asserts the index is below 8.
+ unsigned int texCoordIndex : BITS(7);
+
+ // Set when the sampler's texture is unsigned or the stage does not use a texture.
+ unsigned int cantUnderflow : 1;
+ unsigned int usesTexture : 1;
+ };
+
+ // Per-stage constants written by the setConstantColor/setBumpmapMatrix/setLuminance* setters.
+ // The word4 members hold fixed-point values (float * 4095, rounded).
+ struct Uniforms
+ {
+ // One row per component (r, g, b, a), each replicated across its 4 lanes.
+ word4 constantColor4[4];
+ float4 bumpmapMatrix4F[2][2];
+ word4 bumpmapMatrix4W[2][2];
+ word4 luminanceScale4;
+ word4 luminanceOffset4;
+ };
+
+ TextureStage();
+
+ ~TextureStage();
+
+ // Resets the stage to its defaults and records its index, sampler and preceding stage.
+ void init(int stage, const Sampler *sampler, const TextureStage *previousStage);
+
+ // Returns the packed configuration; all-zero when the stage is disabled.
+ State textureStageState() const;
+
+ void setConstantColor(const Color<float> &constantColor);
+ // element selects the matrix entry as [element / 2][element % 2].
+ void setBumpmapMatrix(int element, float value);
+ void setLuminanceScale(float value);
+ void setLuminanceOffset(float value);
+
+ void setTexCoordIndex(unsigned int texCoordIndex);
+ void setStageOperation(StageOperation stageOperation);
+ void setFirstArgument(SourceArgument firstArgument);
+ void setSecondArgument(SourceArgument secondArgument);
+ void setThirdArgument(SourceArgument thirdArgument);
+ void setStageOperationAlpha(StageOperation stageOperationAlpha);
+ void setFirstArgumentAlpha(SourceArgument firstArgumentAlpha);
+ void setSecondArgumentAlpha(SourceArgument secondArgumentAlpha);
+ void setThirdArgumentAlpha(SourceArgument thirdArgumentAlpha);
+ void setFirstModifier(ArgumentModifier firstModifier);
+ void setSecondModifier(ArgumentModifier secondModifier);
+ void setThirdModifier(ArgumentModifier thirdModifier);
+ void setFirstModifierAlpha(ArgumentModifier firstModifierAlpha);
+ void setSecondModifierAlpha(ArgumentModifier secondModifierAlpha);
+ void setThirdModifierAlpha(ArgumentModifier thirdModifierAlpha);
+ void setDestinationArgument(DestinationArgument destinationArgument);
+
+ Uniforms uniforms; // FIXME: Private
+
+ private:
+ // Queries about which inputs the current configuration reads.
+ bool usesColor(SourceArgument source) const;
+ bool usesAlpha(SourceArgument source) const;
+ bool uses(SourceArgument source) const;
+ bool usesCurrent() const;
+ bool usesDiffuse() const;
+ bool usesSpecular() const;
+ bool usesTexture() const;
+ // True when this stage or any stage before it in the chain is disabled.
+ bool isStageDisabled() const;
+ bool writesCurrent() const;
+
+ // Index of this stage, as passed to init().
+ int stage;
+
+ StageOperation stageOperation;
+ SourceArgument firstArgument;
+ SourceArgument secondArgument;
+ SourceArgument thirdArgument;
+ StageOperation stageOperationAlpha;
+ SourceArgument firstArgumentAlpha;
+ SourceArgument secondArgumentAlpha;
+ SourceArgument thirdArgumentAlpha;
+ ArgumentModifier firstModifier;
+ ArgumentModifier secondModifier;
+ ArgumentModifier thirdModifier;
+ ArgumentModifier firstModifierAlpha;
+ ArgumentModifier secondModifierAlpha;
+ ArgumentModifier thirdModifierAlpha;
+ DestinationArgument destinationArgument;
+
+ int texCoordIndex;
+ // Not owned; both pointers are null until init() is called.
+ const Sampler *sampler;
+ const TextureStage *previousStage;
+ };
+}
+
+#endif // sw_TextureStage_hpp
diff --git a/src/Device/Triangle.hpp b/src/Device/Triangle.hpp
new file mode 100644
index 0000000..8a91fab
--- /dev/null
+++ b/src/Device/Triangle.hpp
@@ -0,0 +1,30 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Triangle_hpp
+#define sw_Triangle_hpp
+
+#include "Vertex.hpp"
+
+namespace sw
+{
+ // Plain aggregate of the three vertices of one triangle.
+ struct Triangle
+ {
+ Vertex V0;
+ Vertex V1;
+ Vertex V2;
+ };
+}
+
+#endif // sw_Triangle_hpp
diff --git a/src/Device/Vector.cpp b/src/Device/Vector.cpp
new file mode 100644
index 0000000..4a02534
--- /dev/null
+++ b/src/Device/Vector.cpp
@@ -0,0 +1,175 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Vector.hpp"
+
+#include "Matrix.hpp"
+#include "Common/Math.hpp"
+
+namespace sw
+{
+ // Unary plus: returns an unchanged copy of the vector.
+ Vector Vector::operator+() const
+ {
+     const Vector &self = *this;
+
+     return self;
+ }
+
+ // Unary minus: returns the vector with every component negated.
+ Vector Vector::operator-() const
+ {
+     const Vector &self = *this;
+
+     return Vector(-self.x, -self.y, -self.z);
+ }
+
+ // Component-wise in-place addition.
+ Vector &Vector::operator+=(const Vector &v)
+ {
+     x += v.x; y += v.y; z += v.z;
+
+     return *this;
+ }
+
+ // Component-wise in-place subtraction.
+ Vector &Vector::operator-=(const Vector &v)
+ {
+     x -= v.x; y -= v.y; z -= v.z;
+
+     return *this;
+ }
+
+ // In-place scaling of every component by s.
+ Vector &Vector::operator*=(float s)
+ {
+     x *= s; y *= s; z *= s;
+
+     return *this;
+ }
+
+ // In-place division by s, implemented as multiplication by the reciprocal of s.
+ Vector &Vector::operator/=(float s)
+ {
+     const float reciprocal = 1.0f / s;
+
+     return operator*=(reciprocal);
+ }
+
+ // Exact component-wise equality (no tolerance).
+ bool operator==(const Vector &u, const Vector &v)
+ {
+     return u.x == v.x && u.y == v.y && u.z == v.z;
+ }
+
+ // True when at least one component differs exactly.
+ bool operator!=(const Vector &u, const Vector &v)
+ {
+     return u.x != v.x || u.y != v.y || u.z != v.z;
+ }
+
+ // Orders vectors by magnitude: true when u is strictly longer than v.
+ // Squared norms are compared, which ranks lengths identically without needing a square root.
+ // Note: writing this as (u^2) > (v^2) does not square anything. operator^ is the angle between
+ // two vectors, and the int 2 converts implicitly to Vector(2, 2, 2).
+ bool operator>(const Vector &u, const Vector &v)
+ {
+     return Vector::N2(u) > Vector::N2(v);
+ }
+
+ // Orders vectors by magnitude: true when u is strictly shorter than v.
+ // Uses squared norms for the same reason as operator>.
+ bool operator<(const Vector &u, const Vector &v)
+ {
+     return Vector::N2(u) < Vector::N2(v);
+ }
+
+ // Component-wise sum.
+ Vector operator+(const Vector &u, const Vector &v)
+ {
+     const float x = u.x + v.x;
+     const float y = u.y + v.y;
+     const float z = u.z + v.z;
+
+     return Vector(x, y, z);
+ }
+
+ // Component-wise difference.
+ Vector operator-(const Vector &u, const Vector &v)
+ {
+     const float x = u.x - v.x;
+     const float y = u.y - v.y;
+     const float z = u.z - v.z;
+
+     return Vector(x, y, z);
+ }
+
+ // Dot product.
+ float operator*(const Vector &u, const Vector &v)
+ {
+     const float dot = u.x * v.x + u.y * v.y + u.z * v.z;
+
+     return dot;
+ }
+
+ // Scalar times vector.
+ Vector operator*(float s, const Vector &v)
+ {
+     const float x = s * v.x;
+     const float y = s * v.y;
+     const float z = s * v.z;
+
+     return Vector(x, y, z);
+ }
+
+ // Vector times scalar.
+ Vector operator*(const Vector &v, float s)
+ {
+     const float x = v.x * s;
+     const float y = v.y * s;
+     const float z = v.z * s;
+
+     return Vector(x, y, z);
+ }
+
+ // Vector divided by scalar, computed as multiplication by the reciprocal of s.
+ Vector operator/(const Vector &v, float s)
+ {
+     const float reciprocal = 1.0f / s;
+
+     return Vector(v.x * reciprocal, v.y * reciprocal, v.z * reciprocal);
+ }
+
+ // Angle between u and v, in radians: acos of the dot product of the two normalized vectors.
+ // A zero-length input still yields NaN (division by a zero norm); only rounding overshoot is handled.
+ float operator^(const Vector &u, const Vector &v)
+ {
+     // (u / |u|) dot v, divided by |v|
+     float cosine = u / Vector::N(u) * v / Vector::N(v);
+
+     // Rounding can push the cosine of (anti)parallel vectors slightly outside [-1, 1],
+     // where acos has a domain error and returns NaN. NaN inputs fail both tests and pass through.
+     if(cosine > 1.0f)
+     {
+         cosine = 1.0f;
+     }
+     else if(cosine < -1.0f)
+     {
+         cosine = -1.0f;
+     }
+
+     return acos(cosine);
+ }
+
+ // Cross product u x v.
+ Vector operator%(const Vector &u, const Vector &v)
+ {
+     const float x = u.y * v.z - u.z * v.y;
+     const float y = u.z * v.x - u.x * v.z;
+     const float z = u.x * v.y - u.y * v.x;
+
+     return Vector(x, y, z);
+ }
+
+ // Matrix times column vector, using only elements (1..3, 1..3) of M.
+ Vector operator*(const Matrix &M, const Vector &v)
+ {
+     const float x = M(1, 1) * v.x + M(1, 2) * v.y + M(1, 3) * v.z;
+     const float y = M(2, 1) * v.x + M(2, 2) * v.y + M(2, 3) * v.z;
+     const float z = M(3, 1) * v.x + M(3, 2) * v.y + M(3, 3) * v.z;
+
+     return Vector(x, y, z);
+ }
+
+ // Row vector times matrix. Unlike M * v, this form also adds the fourth row of M (elements (4, 1..3)).
+ Vector operator*(const Vector &v, const Matrix &M)
+ {
+     const float x = v.x * M(1, 1) + v.y * M(2, 1) + v.z * M(3, 1) + M(4, 1);
+     const float y = v.x * M(1, 2) + v.y * M(2, 2) + v.z * M(3, 2) + M(4, 2);
+     const float z = v.x * M(1, 3) + v.y * M(2, 3) + v.z * M(3, 3) + M(4, 3);
+
+     return Vector(x, y, z);
+ }
+
+ // In-place form of v * M.
+ Vector &operator*=(Vector &v, const Matrix &M)
+ {
+     v = v * M;
+
+     return v;
+ }
+
+ // Euclidean norm (length) of v.
+ float Vector::N(const Vector &v)
+ {
+     const float squaredLength = v.x*v.x + v.y*v.y + v.z*v.z;
+
+     return sqrt(squaredLength);
+ }
+
+ // Squared Euclidean norm of v.
+ float Vector::N2(const Vector &v)
+ {
+     const float squaredLength = v.x*v.x + v.y*v.y + v.z*v.z;
+
+     return squaredLength;
+ }
+
+ // Linear interpolation between u (t = 0) and v (t = 1), applied to each component independently.
+ // t is not clamped, so values outside [0, 1] extrapolate.
+ // NOTE(review): Vector.hpp declares lerp as a static member of Vector, but this definition is a
+ // free function in namespace sw - confirm which one callers are meant to use.
+ Vector lerp(const Vector &u, const Vector &v, float t)
+ {
+     const float x = u.x + t * (v.x - u.x);
+     const float y = u.y + t * (v.y - u.y);
+     const float z = u.z + t * (v.z - u.z);   // z must interpolate from u.z, not u.x
+
+     return Vector(x, y, z);
+ }
+}
diff --git a/src/Device/Vector.hpp b/src/Device/Vector.hpp
new file mode 100644
index 0000000..e7f261d
--- /dev/null
+++ b/src/Device/Vector.hpp
@@ -0,0 +1,153 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Vector_hpp
+#define Vector_hpp
+
+namespace sw
+{
+ struct Point;
+ struct Matrix;
+ struct Plane;
+
+ // Three-component float vector with arithmetic operators.
+ // The components can be accessed by name (x, y, z) or by index through the array v,
+ // which shares storage with them in an anonymous union.
+ struct Vector
+ {
+ // Leaves the components uninitialized.
+ Vector();
+ // Sets all three components to i; not explicit, so an int converts implicitly to a Vector.
+ Vector(const int i);
+ Vector(const Vector &v);
+ Vector(const Point &p);
+ Vector(float v_x, float v_y, float v_z);
+
+ Vector &operator=(const Vector &v);
+
+ union
+ {
+ float v[3];
+
+ struct
+ {
+ float x;
+ float y;
+ float z;
+ };
+ };
+
+ // Both accessors return v[i] directly, with no bounds check.
+ float &operator[](int i);
+ float &operator()(int i);
+
+ const float &operator[](int i) const;
+ const float &operator()(int i) const;
+
+ Vector operator+() const;
+ Vector operator-() const;
+
+ Vector &operator+=(const Vector &v);
+ Vector &operator-=(const Vector &v);
+ Vector &operator*=(float s);
+ Vector &operator/=(float s);
+
+ friend bool operator==(const Vector &u, const Vector &v);
+ friend bool operator!=(const Vector &u, const Vector &v);
+
+ friend Vector operator+(const Vector &u, const Vector &v);
+ friend Vector operator-(const Vector &u, const Vector &v);
+ friend float operator*(const Vector &u, const Vector &v); // Dot product
+ friend Vector operator*(float s, const Vector &v);
+ friend Vector operator*(const Vector &v, float s);
+ friend Vector operator/(const Vector &v, float s);
+ friend float operator^(const Vector &u, const Vector &v); // Angle between vectors
+ friend Vector operator%(const Vector &u, const Vector &v); // Cross product
+
+ friend Vector operator*(const Matrix &M, const Vector& v);
+ friend Vector operator*(const Vector &v, const Matrix &M);
+ friend Vector &operator*=(Vector &v, const Matrix &M);
+
+ static float N(const Vector &v); // Norm
+ static float N2(const Vector &v); // Squared norm
+
+ // NOTE(review): no definitions of mirror, reflect or the static lerp appear in Vector.cpp - confirm where they live.
+ static Vector mirror(const Vector &v, const Plane &p);
+ static Vector reflect(const Vector &v, const Plane &p);
+ static Vector lerp(const Vector &u, const Vector &v, float t);
+ };
+}
+
+#include "Point.hpp"
+
+namespace sw
+{
+ // Default constructor: deliberately leaves x, y and z uninitialized.
+ inline Vector::Vector()
+ {
+ }
+
+ // Broadcasts the integer i (converted to float) to all three components.
+ inline Vector::Vector(const int i)
+ {
+     const float s = (float)i;
+
+     x = s; y = s; z = s;
+ }
+
+ // Copies the three components of v.
+ inline Vector::Vector(const Vector &v)
+ {
+     x = v.x; y = v.y; z = v.z;
+ }
+
+ // Takes the x, y and z coordinates of the point P as vector components.
+ inline Vector::Vector(const Point &P)
+ {
+     x = P.x; y = P.y; z = P.z;
+ }
+
+ // Builds a vector from its three components.
+ inline Vector::Vector(float v_x, float v_y, float v_z)
+ {
+     x = v_x; y = v_y; z = v_z;
+ }
+
+ // Component-wise copy assignment.
+ inline Vector &Vector::operator=(const Vector &v)
+ {
+     x = v.x; y = v.y; z = v.z;
+
+     return *this;
+ }
+
+ // Indexed component access; i is used directly as an index into v (no bounds check).
+ inline float &Vector::operator()(int i)
+ {
+     float &component = v[i];
+
+     return component;
+ }
+
+ // Same access as operator().
+ inline float &Vector::operator[](int i)
+ {
+     float &component = v[i];
+
+     return component;
+ }
+
+ // Read-only indexed component access.
+ inline const float &Vector::operator()(int i) const
+ {
+     const float &component = v[i];
+
+     return component;
+ }
+
+ // Same access as the const operator().
+ inline const float &Vector::operator[](int i) const
+ {
+     const float &component = v[i];
+
+     return component;
+ }
+}
+
+#endif // Vector_hpp
diff --git a/src/Device/Vertex.hpp b/src/Device/Vertex.hpp
new file mode 100644
index 0000000..9ae8d14
--- /dev/null
+++ b/src/Device/Vertex.hpp
@@ -0,0 +1,98 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Vertex_hpp
+#define Vertex_hpp
+
+#include "Color.hpp"
+#include "Common/Types.hpp"
+#include "Main/Config.hpp"
+
+namespace sw
+{
+ // Vertex output slots. Two naming schemes share the same numeric range: the fixed
+ // semantics (Pos, C0, ...) and the generic ones (V0 .. Vn_1).
+ // NOTE(review): the values look like indices into Vertex::v - confirm.
+ enum Out
+ {
+ // Default vertex output semantics
+ Pos = 0,
+ C0 = 1, // Diffuse
+ C1 = 2, // Specular
+ T0 = 3,
+ T1 = 4,
+ T2 = 5,
+ T3 = 6,
+ T4 = 7,
+ T5 = 8,
+ T6 = 9,
+ T7 = 10,
+ Fog = 11, // x component
+ Pts = Fog, // y component
+
+ // Variable semantics
+ V0 = 0,
+ Vn_1 = MAX_VERTEX_OUTPUTS - 1,
+
+ // One past the last valid slot (equals MAX_VERTEX_OUTPUTS).
+ Unused,
+ VERTEX_OUTPUT_LAST = Unused,
+ };
+
+ // Four-component texture coordinate.
+ struct UVWQ
+ {
+ float u;
+ float v;
+ float w;
+ float q;
+
+ // Indexes the components as if they were an array starting at u (0 = u, 1 = v, 2 = w, 3 = q).
+ // Relies on the four floats being laid out contiguously in declaration order; no bounds check.
+ float &operator[](int i)
+ {
+ return (&u)[i];
+ }
+ };
+
+ // Vertex record declared through the ALIGN(16, ...) macro.
+ // The outputs are stored in a union: either viewed through the fixed-semantics struct
+ // or as the generic float4 array v. The projected coordinates and clip flags follow the union.
+ ALIGN(16, struct Vertex
+ {
+ union
+ {
+ struct // Fixed semantics
+ {
+ // Position
+ float x;
+ float y;
+ float z;
+ float w;
+
+ float4 C[2]; // Diffuse and specular color
+
+ UVWQ T[8]; // Texture coordinates
+
+ float f; // Fog
+ float pSize; // Point size
+ };
+
+ float4 v[MAX_VERTEX_OUTPUTS]; // Generic components using semantic declaration
+ };
+
+ // Projected coordinates
+ int X;
+ int Y;
+ float Z;
+ float W;
+
+ int clipFlags;
+ // Explicit padding; the static_assert after this struct requires sizeof(Vertex) to be a multiple of 16.
+ int padding[3];
+ });
+
+ static_assert((sizeof(Vertex) & 0x0000000F) == 0, "Vertex size not a multiple of 16 bytes (alignment requirement)");
+}
+
+#endif // Vertex_hpp
diff --git a/src/Device/VertexProcessor.cpp b/src/Device/VertexProcessor.cpp
new file mode 100644
index 0000000..976ea2b
--- /dev/null
+++ b/src/Device/VertexProcessor.cpp
@@ -0,0 +1,1118 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "VertexProcessor.hpp"
+
+#include "Shader/VertexPipeline.hpp"
+#include "Shader/VertexProgram.hpp"
+#include "Shader/VertexShader.hpp"
+#include "Shader/PixelShader.hpp"
+#include "Shader/Constants.hpp"
+#include "Common/Math.hpp"
+#include "Common/Debug.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+ bool precacheVertex = false; // When true, setRoutineCacheSize() passes the name "sw-vertex" to the routine cache instead of 0
+
+ void VertexCache::clear()
+ {
+     // Stamp each of the 16 tag entries with 0x80000000
+     // (presumably a value no real vertex index matches -- TODO confirm).
+     for(unsigned int &slotTag : tag)
+         slotTag = 0x80000000;
+ }
+
+ unsigned int VertexProcessor::States::computeHash()
+ {
+     // XOR-fold the object, viewed as sizeof(States) / 4 32-bit words, into one value.
+     const unsigned int *words = (unsigned int*)this;
+     const unsigned int wordCount = sizeof(States) / 4;
+     unsigned int result = 0;
+     for(const unsigned int *word = words; word != words + wordCount; ++word)
+     {
+         result ^= *word;
+     }
+     return result;
+ }
+
+ VertexProcessor::State::State() // Starts from an all-zero state
+ {
+ memset(this, 0, sizeof(State)); // Zero every byte so the word-wise hash and the memcmp in operator== see defined values
+ }
+
+ bool VertexProcessor::State::operator==(const State &state) const
+ {
+     // Cheap reject on the stored hash first; only matching hashes pay for the
+     // byte-wise comparison of the States portion of both objects.
+     const States *lhs = static_cast<const States*>(this);
+     const States *rhs = static_cast<const States*>(&state);
+
+     return (hash == state.hash) && (memcmp(lhs, rhs, sizeof(States)) == 0);
+ }
+
+ VertexProcessor::TransformFeedbackInfo::TransformFeedbackInfo()
+ {
+     // Start out unbound: no buffer, and every location/layout field zeroed.
+     buffer = nullptr;
+
+     offset = 0;
+     stride = 0;
+     reg = 0; row = 0; col = 0;
+ }
+
+ VertexProcessor::UniformBufferInfo::UniformBufferInfo() // Starts out with no buffer bound
+ {
+ buffer = nullptr; // lockUniformBuffers() reports nullptr for bindings whose buffer is null
+ offset = 0;
+ }
+
+ VertexProcessor::VertexProcessor(Context *context) : context(context)
+ {
+     // Matrix state. The model, view and base matrices are assigned 1 and the
+     // projection matrix plus all concatenated products are assigned 0
+     // (what those scalars mean is defined by Matrix -- presumably identity and zero).
+     V = 1;
+     B = 1;
+     P = 0;
+     PB = 0;
+     PBV = 0;
+
+     for(int m = 0; m < 12; m++)
+     {
+         M[m] = 1;
+         PBVM[m] = 0;
+         updateModelMatrix[m] = true;
+     }
+
+     // Lighting defaults: lighting enabled, specular disabled, and each of the
+     // eight lights switched off with its position set to 0.
+     setLightingEnable(true);
+     setSpecularEnable(false);
+
+     for(int light = 0; light < 8; light++)
+     {
+         setLightEnable(light, false);
+         setLightPosition(light, 0);
+     }
+
+     // Flag all derived matrix and lighting data as stale.
+     updateMatrix = true;
+     updateViewMatrix = true;
+     updateBaseMatrix = true;
+     updateProjectionMatrix = true;
+     updateLighting = true;
+
+     // setRoutineCacheSize() deletes the current cache before allocating a new
+     // one, so the pointer has to be null going in.
+     routineCache = nullptr;
+     setRoutineCacheSize(1024);
+ }
+
+ VertexProcessor::~VertexProcessor() // Releases the routine cache allocated by setRoutineCacheSize()
+ {
+ delete routineCache;
+ routineCache = 0; // Leave no dangling pointer behind
+ }
+
+ void VertexProcessor::setInputStream(int index, const Stream &stream) // Copies 'stream' into vertex input slot 'index' of the context
+ {
+ context->input[index] = stream; // NOTE(review): index is not range-checked here
+ }
+
+ void VertexProcessor::resetInputStreams(bool preTransformed)
+ {
+     // Put every vertex input stream back to defaults(), then store the new pre-transformed flag.
+     for(int stream = 0; stream < MAX_VERTEX_INPUTS; ++stream)
+     {
+         context->input[stream].defaults();
+     }
+     context->preTransformed = preTransformed;
+ }
+
+ void VertexProcessor::setFloatConstant(unsigned int index, const float value[4])
+ {
+     // Copies a four-component float constant into c[index]; indices at or
+     // beyond VERTEX_UNIFORM_VECTORS assert and write nothing.
+     if(index >= VERTEX_UNIFORM_VECTORS) { ASSERT(false); return; }
+
+     for(int component = 0; component < 4; component++)
+     {
+         c[index][component] = value[component];
+     }
+ }
+
+ void VertexProcessor::setIntegerConstant(unsigned int index, const int integer[4])
+ {
+     // Copies a four-component integer constant into i[index]; only indices
+     // 0..15 are accepted, anything else asserts and writes nothing.
+     if(index >= 16) { ASSERT(false); return; }
+
+     for(int component = 0; component < 4; component++)
+     {
+         i[index][component] = integer[component];
+     }
+ }
+
+ void VertexProcessor::setBooleanConstant(unsigned int index, int boolean)
+ {
+     // Any non-zero value is stored as true; indices 16 and up assert and write nothing.
+     if(index >= 16) { ASSERT(false); return; }
+
+     const bool asBool = (boolean != 0);
+     b[index] = asBool;
+ }
+
+ void VertexProcessor::setUniformBuffer(int index, sw::Resource* buffer, int offset) // Records the buffer and offset for uniform buffer binding 'index'
+ {
+ uniformBufferInfo[index].buffer = buffer; // NOTE(review): index is not range-checked; lockUniformBuffers() walks MAX_UNIFORM_BUFFER_BINDINGS entries
+ uniformBufferInfo[index].offset = offset; // Added to the locked data pointer in lockUniformBuffers()
+ }
+
+ void VertexProcessor::lockUniformBuffers(byte** u, sw::Resource* uniformBuffers[])
+ {
+     for(int slot = 0; slot < MAX_UNIFORM_BUFFER_BINDINGS; ++slot) {   // One pass over every binding
+         auto &binding = uniformBufferInfo[slot];
+         u[slot] = binding.buffer ? static_cast<byte*>(binding.buffer->lock(PUBLIC, PRIVATE)) + binding.offset : nullptr;   // Locked data + offset, or nullptr when unbound
+         uniformBuffers[slot] = binding.buffer;   // The resource itself is reported alongside
+     }
+ }
+
+ void VertexProcessor::setTransformFeedbackBuffer(int index, sw::Resource* buffer, int offset, unsigned int reg, unsigned int row, unsigned int col, unsigned int stride)
+ {
+     auto &info = transformFeedbackInfo[index];   // NOTE(review): index is not range-checked here
+     info.buffer = buffer;
+     info.offset = offset;
+     info.stride = stride;
+     info.reg = reg;
+     info.row = row; info.col = col;
+ }
+
+ void VertexProcessor::lockTransformFeedbackBuffers(byte** t, unsigned int* v, unsigned int* r, unsigned int* c, unsigned int* s, sw::Resource* transformFeedbackBuffers[])
+ {
+     // Per entry: locked data pointer plus offset (nullptr when unbound), the resource, and its reg/row/col/stride.
+     for(int n = 0; n < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; ++n)
+     {
+         auto &info = transformFeedbackInfo[n];
+         t[n] = info.buffer ? static_cast<byte*>(info.buffer->lock(PUBLIC, PRIVATE)) + info.offset : nullptr;
+         transformFeedbackBuffers[n] = info.buffer;
+         v[n] = info.reg; r[n] = info.row;
+         c[n] = info.col; s[n] = info.stride;
+     }
+ }
+
+ void VertexProcessor::setModelMatrix(const Matrix &M, int i)
+ {
+     // Twelve model matrices are available; an index of 12 or more asserts.
+     // NOTE(review): a negative i is not rejected by this check.
+     if(i >= 12) { ASSERT(false); return; }
+
+     this->M[i] = M;
+
+     // Both the concatenated transforms and the lighting data are now stale.
+     updateModelMatrix[i] = true;
+     updateMatrix = true; updateLighting = true;
+ }
+
+ void VertexProcessor::setViewMatrix(const Matrix &V) // Replaces the view matrix
+ {
+ this->V = V; // The parameter shadows the member of the same name
+
+ updateMatrix = true; // Makes the next updateTransform() do work
+ updateViewMatrix = true; // updateTransform() then rebuilds PBV and the active PBVM products
+ }
+
+ void VertexProcessor::setBaseMatrix(const Matrix &B) // Replaces the base matrix
+ {
+ this->B = B; // The parameter shadows the member of the same name
+
+ updateMatrix = true; // Makes the next updateTransform() do work
+ updateBaseMatrix = true; // updateTransform() then rebuilds PB, PBV and the active PBVM products
+ }
+
+ void VertexProcessor::setProjectionMatrix(const Matrix &P)
+ {
+     // Fog is W-based unless P[3] is exactly (0, 0, 0, 1); the transforms are flagged for a rebuild.
+     this->P = P;
+     const bool plainLastRow = (P[3][0] == 0.0f) && (P[3][1] == 0.0f) && (P[3][2] == 0.0f) && (P[3][3] == 1.0f);
+     context->wBasedFog = !plainLastRow;
+     updateMatrix = updateProjectionMatrix = true;
+ }
+
+ void VertexProcessor::setLightingEnable(bool lightingEnable) // Forwards the lighting switch to the context
+ {
+ context->setLightingEnable(lightingEnable);
+
+ updateLighting = true; // update() recomputes the view-space light positions while this is set
+ }
+
+ void VertexProcessor::setLightEnable(unsigned int light, bool lightEnable)
+ {
+     // Lights 0..7 are forwarded to the context; any other index asserts.
+     // The lighting-dirty flag is raised in both cases.
+     const bool validLight = (light < 8);
+     if(validLight) context->setLightEnable(light, lightEnable);
+     else ASSERT(false);
+
+     updateLighting = true;
+ }
+
+ void VertexProcessor::setSpecularEnable(bool specularEnable) // Forwards the specular switch to the context
+ {
+ context->setSpecularEnable(specularEnable);
+
+ updateLighting = true; // update() recomputes the view-space light positions while this is set
+ }
+
+ void VertexProcessor::setLightPosition(unsigned int light, const Point &lightPosition)
+ {
+     // Positions of lights 0..7 are forwarded to the context; any other index
+     // asserts. The lighting-dirty flag is raised in both cases.
+     const bool validLight = (light < 8);
+     if(validLight) context->setLightPosition(light, lightPosition);
+     else ASSERT(false);
+
+     updateLighting = true;
+ }
+
+ void VertexProcessor::setLightDiffuse(unsigned int light, const Color<float> &lightDiffuse)
+ {
+     // Unpacks r, g, b, a into components 0..3 of ff.lightDiffuse[light]; lights beyond 7 assert.
+     if(light >= 8) { ASSERT(false); return; }
+
+     auto &dst = ff.lightDiffuse[light];
+     dst[0] = lightDiffuse.r;
+     dst[1] = lightDiffuse.g;
+     dst[2] = lightDiffuse.b;
+     dst[3] = lightDiffuse.a;
+ }
+
+ void VertexProcessor::setLightSpecular(unsigned int light, const Color<float> &lightSpecular)
+ {
+     // Unpacks r, g, b, a into components 0..3 of ff.lightSpecular[light]; lights beyond 7 assert.
+     if(light >= 8) { ASSERT(false); return; }
+
+     auto &dst = ff.lightSpecular[light];
+     dst[0] = lightSpecular.r;
+     dst[1] = lightSpecular.g;
+     dst[2] = lightSpecular.b;
+     dst[3] = lightSpecular.a;
+ }
+
+ void VertexProcessor::setLightAmbient(unsigned int light, const Color<float> &lightAmbient)
+ {
+     // Unpacks r, g, b, a into components 0..3 of ff.lightAmbient[light]; lights beyond 7 assert.
+     if(light >= 8) { ASSERT(false); return; }
+
+     auto &dst = ff.lightAmbient[light];
+     dst[0] = lightAmbient.r;
+     dst[1] = lightAmbient.g;
+     dst[2] = lightAmbient.b;
+     dst[3] = lightAmbient.a;
+ }
+
+ void VertexProcessor::setLightAttenuation(unsigned int light, float constant, float linear, float quadratic)
+ {
+     // Each attenuation coefficient is passed through replicate() before being
+     // stored for the given light; lights beyond 7 assert and store nothing.
+     if(light >= 8) { ASSERT(false); return; }
+
+     ff.attenuationConstant[light] = replicate(constant);
+     ff.attenuationLinear[light] = replicate(linear);
+     ff.attenuationQuadratic[light] = replicate(quadratic);
+ }
+
+ void VertexProcessor::setLightRange(unsigned int light, float lightRange)
+ {
+     // Stores the range for one of the lights 0..7; any other index asserts.
+     if(light >= 8) { ASSERT(false); return; }
+
+     auto &range = ff.lightRange[light];
+     range = lightRange;
+ }
+
+ void VertexProcessor::setFogEnable(bool fogEnable) // Stores the fog switch in the context
+ {
+ context->fogEnable = fogEnable;
+ }
+
+ void VertexProcessor::setVertexFogMode(FogMode fogMode) // Stores the vertex fog mode in the context
+ {
+ context->vertexFogMode = fogMode;
+ }
+
+ void VertexProcessor::setInstanceID(int instanceID) // Stores the instance ID in the context
+ {
+ context->instanceID = instanceID;
+ }
+
+ void VertexProcessor::setColorVertexEnable(bool colorVertexEnable) // Thin forwarder to Context::setColorVertexEnable()
+ {
+ context->setColorVertexEnable(colorVertexEnable);
+ }
+
+ void VertexProcessor::setDiffuseMaterialSource(MaterialSource diffuseMaterialSource) // Thin forwarder to Context::setDiffuseMaterialSource()
+ {
+ context->setDiffuseMaterialSource(diffuseMaterialSource);
+ }
+
+ void VertexProcessor::setSpecularMaterialSource(MaterialSource specularMaterialSource) // Thin forwarder to Context::setSpecularMaterialSource()
+ {
+ context->setSpecularMaterialSource(specularMaterialSource);
+ }
+
+ void VertexProcessor::setAmbientMaterialSource(MaterialSource ambientMaterialSource) // Thin forwarder to Context::setAmbientMaterialSource()
+ {
+ context->setAmbientMaterialSource(ambientMaterialSource);
+ }
+
+ void VertexProcessor::setEmissiveMaterialSource(MaterialSource emissiveMaterialSource) // Thin forwarder to Context::setEmissiveMaterialSource()
+ {
+ context->setEmissiveMaterialSource(emissiveMaterialSource);
+ }
+
+ void VertexProcessor::setGlobalAmbient(const Color<float> &globalAmbient)
+ {
+     // Unpacks r, g, b, a into components 0..3 of ff.globalAmbient.
+     auto &dst = ff.globalAmbient;
+     dst[0] = globalAmbient.r; dst[1] = globalAmbient.g;
+     dst[2] = globalAmbient.b; dst[3] = globalAmbient.a;
+ }
+
+ void VertexProcessor::setMaterialEmission(const Color<float> &emission)
+ {
+     // Unpacks r, g, b, a into components 0..3 of ff.materialEmission.
+     auto &dst = ff.materialEmission;
+     dst[0] = emission.r; dst[1] = emission.g;
+     dst[2] = emission.b; dst[3] = emission.a;
+ }
+
+ void VertexProcessor::setMaterialAmbient(const Color<float> &materialAmbient)
+ {
+     // Unpacks r, g, b, a into components 0..3 of ff.materialAmbient.
+     auto &dst = ff.materialAmbient;
+     dst[0] = materialAmbient.r; dst[1] = materialAmbient.g;
+     dst[2] = materialAmbient.b; dst[3] = materialAmbient.a;
+ }
+
+ void VertexProcessor::setMaterialDiffuse(const Color<float> &diffuseColor)
+ {
+     // Unpacks r, g, b, a into components 0..3 of ff.materialDiffuse.
+     auto &dst = ff.materialDiffuse;
+     dst[0] = diffuseColor.r; dst[1] = diffuseColor.g;
+     dst[2] = diffuseColor.b; dst[3] = diffuseColor.a;
+ }
+
+ void VertexProcessor::setMaterialSpecular(const Color<float> &specularColor)
+ {
+     // Unpacks r, g, b, a into components 0..3 of ff.materialSpecular.
+     auto &dst = ff.materialSpecular;
+     dst[0] = specularColor.r; dst[1] = specularColor.g;
+     dst[2] = specularColor.b; dst[3] = specularColor.a;
+ }
+
+ void VertexProcessor::setMaterialShininess(float specularPower) // Stores the specular power as ff.materialShininess
+ {
+ ff.materialShininess = specularPower;
+ }
+
+ void VertexProcessor::setLightViewPosition(unsigned int light, const Point &P)
+ {
+     // Stores (P.x, P.y, P.z, 1) as the position of lights 0..7; any other index asserts.
+     if(light >= 8) { ASSERT(false); return; }
+
+     auto &dst = ff.lightPosition[light];
+     dst[0] = P.x;
+     dst[1] = P.y;
+     dst[2] = P.z;
+     dst[3] = 1;
+ }
+
+ void VertexProcessor::setRangeFogEnable(bool enable) // Stores the range fog switch in the context
+ {
+ context->rangeFogEnable = enable;
+ }
+
+ void VertexProcessor::setIndexedVertexBlendEnable(bool indexedVertexBlendEnable) // When set, updateTransform() treats all 12 model matrices as active
+ {
+ context->indexedVertexBlendEnable = indexedVertexBlendEnable;
+ }
+
+ void VertexProcessor::setVertexBlendMatrixCount(unsigned int vertexBlendMatrixCount)
+ {
+     // Counts of 0..4 are stored in the context; larger values assert and
+     // leave the previous count in place.
+     if(vertexBlendMatrixCount > 4) { ASSERT(false); return; }
+
+     context->vertexBlendMatrixCount = vertexBlendMatrixCount;
+ }
+
+ void VertexProcessor::setTextureWrap(unsigned int stage, int mask)
+ {
+     // Store the wrap mask for a valid stage (invalid stages assert), then
+     // recompute whether any stage at all has a non-zero mask.
+     if(stage < TEXTURE_IMAGE_UNITS) context->textureWrap[stage] = mask;
+     else ASSERT(false);
+
+     bool anyActive = false;
+     for(int unit = 0; unit < TEXTURE_IMAGE_UNITS; unit++)
+     {
+         anyActive |= (context->textureWrap[unit] != 0x00);
+     }
+
+     context->textureWrapActive = anyActive;
+ }
+
+ void VertexProcessor::setTexGen(unsigned int stage, TexGen texGen)
+ {
+     // Stores the texture coordinate generation mode for stages 0..7;
+     // any other stage asserts and stores nothing.
+     if(stage >= 8) { ASSERT(false); return; }
+
+     context->texGen[stage] = texGen;
+ }
+
+ void VertexProcessor::setLocalViewer(bool localViewer) // Stores the local viewer switch in the context
+ {
+ context->localViewer = localViewer;
+ }
+
+ void VertexProcessor::setNormalizeNormals(bool normalizeNormals) // Stores the normalize-normals switch in the context
+ {
+ context->normalizeNormals = normalizeNormals;
+ }
+
+ void VertexProcessor::setTextureMatrix(int stage, const Matrix &T)
+ {
+     // Element-wise copy of the 4x4 matrix T into ff.textureTransform[stage].
+     // NOTE(review): stage is not range-checked here.
+     auto &dst = ff.textureTransform[stage];
+
+     for(int m = 0; m < 4; m++)
+         for(int n = 0; n < 4; n++)
+             dst[m][n] = T[m][n];
+ }
+
+ void VertexProcessor::setTextureTransform(int stage, int count, bool project) // Stores the texture transform count and projection flag for 'stage'
+ {
+ context->textureTransformCount[stage] = count; // NOTE(review): stage is not range-checked here
+ context->textureTransformProject[stage] = project;
+ }
+
+ void VertexProcessor::setTextureFilter(unsigned int sampler, FilterType textureFilter)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setTextureFilter(textureFilter);
+ }
+
+ void VertexProcessor::setMipmapFilter(unsigned int sampler, MipmapType mipmapFilter)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setMipmapFilter(mipmapFilter);
+ }
+
+ void VertexProcessor::setGatherEnable(unsigned int sampler, bool enable)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setGatherEnable(enable);
+ }
+
+ void VertexProcessor::setAddressingModeU(unsigned int sampler, AddressingMode addressMode)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setAddressingModeU(addressMode);
+ }
+
+ void VertexProcessor::setAddressingModeV(unsigned int sampler, AddressingMode addressMode)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setAddressingModeV(addressMode);
+ }
+
+ void VertexProcessor::setAddressingModeW(unsigned int sampler, AddressingMode addressMode)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setAddressingModeW(addressMode);
+ }
+
+ void VertexProcessor::setReadSRGB(unsigned int sampler, bool sRGB)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setReadSRGB(sRGB);
+ }
+
+ void VertexProcessor::setMipmapLOD(unsigned int sampler, float bias)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setMipmapLOD(bias);
+ }
+
+ void VertexProcessor::setBorderColor(unsigned int sampler, const Color<float> &borderColor)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setBorderColor(borderColor);
+ }
+
+ void VertexProcessor::setMaxAnisotropy(unsigned int sampler, float maxAnisotropy)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setMaxAnisotropy(maxAnisotropy);
+ }
+
+ void VertexProcessor::setHighPrecisionFiltering(unsigned int sampler, bool highPrecisionFiltering)
+ {
+     // Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n], which is how the other
+     // vertex sampler setters and update() address it; use the same bound and offset here.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     context->sampler[TEXTURE_IMAGE_UNITS + sampler].setHighPrecisionFiltering(highPrecisionFiltering);
+ }
+
+ void VertexProcessor::setSwizzleR(unsigned int sampler, SwizzleType swizzleR)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setSwizzleR(swizzleR);
+ }
+
+ void VertexProcessor::setSwizzleG(unsigned int sampler, SwizzleType swizzleG)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setSwizzleG(swizzleG);
+ }
+
+ void VertexProcessor::setSwizzleB(unsigned int sampler, SwizzleType swizzleB)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setSwizzleB(swizzleB);
+ }
+
+ void VertexProcessor::setSwizzleA(unsigned int sampler, SwizzleType swizzleA)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setSwizzleA(swizzleA);
+ }
+
+ void VertexProcessor::setCompareFunc(unsigned int sampler, CompareFunc compFunc)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setCompareFunc(compFunc);
+ }
+
+ void VertexProcessor::setBaseLevel(unsigned int sampler, int baseLevel)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setBaseLevel(baseLevel);
+ }
+
+ void VertexProcessor::setMaxLevel(unsigned int sampler, int maxLevel)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setMaxLevel(maxLevel);
+ }
+
+ void VertexProcessor::setMinLod(unsigned int sampler, float minLod)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setMinLod(minLod);
+ }
+
+ void VertexProcessor::setMaxLod(unsigned int sampler, float maxLod)
+ {
+     // Vertex sampler n is addressed as context->sampler[TEXTURE_IMAGE_UNITS + n]; out-of-range n asserts.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     auto &vertexSampler = context->sampler[TEXTURE_IMAGE_UNITS + sampler];
+     vertexSampler.setMaxLod(maxLod);
+ }
+
+ void VertexProcessor::setSyncRequired(unsigned int sampler, bool isSincRequired)
+ {
+     // Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n], which is how the other
+     // vertex sampler setters and update() address it; use the same bound and offset here.
+     if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS) { ASSERT(false); return; }
+
+     context->sampler[TEXTURE_IMAGE_UNITS + sampler].setSyncRequired(isSincRequired);
+ }
+
+ void VertexProcessor::setPointSize(float pointSize) // Stores the point size
+ {
+ point.pointSize = replicate(pointSize); // Passed through replicate(), unlike the other point parameters which are stored directly
+ }
+
+ void VertexProcessor::setPointSizeMin(float pointSizeMin) // Stores the minimum point size
+ {
+ point.pointSizeMin = pointSizeMin;
+ }
+
+ void VertexProcessor::setPointSizeMax(float pointSizeMax) // Stores the maximum point size
+ {
+ point.pointSizeMax = pointSizeMax;
+ }
+
+ void VertexProcessor::setPointScaleA(float pointScaleA) // Stores point scale coefficient A
+ {
+ point.pointScaleA = pointScaleA;
+ }
+
+ void VertexProcessor::setPointScaleB(float pointScaleB) // Stores point scale coefficient B
+ {
+ point.pointScaleB = pointScaleB;
+ }
+
+ void VertexProcessor::setPointScaleC(float pointScaleC) // Stores point scale coefficient C
+ {
+ point.pointScaleC = pointScaleC;
+ }
+
+ void VertexProcessor::setTransformFeedbackQueryEnabled(bool enable) // Stores the flag in the context; update() copies it into the State
+ {
+ context->transformFeedbackQueryEnabled = enable;
+ }
+
+ void VertexProcessor::enableTransformFeedback(uint64_t enable) // Stores the value in the context; update() copies it into the State
+ {
+ context->transformFeedbackEnabled = enable; // NOTE(review): a 64-bit parameter suggests a bit mask rather than a boolean -- confirm against callers
+ }
+
+ const Matrix &VertexProcessor::getModelTransform(int i) // Returns the concatenated PBVM matrix for model matrix i
+ {
+ updateTransform(); // Bring the cached products up to date first (no-op unless updateMatrix is set)
+ return PBVM[i]; // NOTE(review): i is not range-checked, and updateTransform() only refreshes the active matrices
+ }
+
+ const Matrix &VertexProcessor::getViewTransform() // Returns the concatenated PBV matrix
+ {
+ updateTransform(); // Bring the cached products up to date first (no-op unless updateMatrix is set)
+ return PBV;
+ }
+
+ bool VertexProcessor::isFixedFunction() // True when the context has no vertex shader set
+ {
+ return !context->vertexShader;
+ }
+
+ void VertexProcessor::setTransform(const Matrix &M, int i) // Stores M transposed in ff.transformT[i]
+ {
+ ff.transformT[i][0][0] = M[0][0]; // Pattern throughout: destination [a][b] receives M[b][a]
+ ff.transformT[i][0][1] = M[1][0];
+ ff.transformT[i][0][2] = M[2][0];
+ ff.transformT[i][0][3] = M[3][0];
+
+ ff.transformT[i][1][0] = M[0][1];
+ ff.transformT[i][1][1] = M[1][1];
+ ff.transformT[i][1][2] = M[2][1];
+ ff.transformT[i][1][3] = M[3][1];
+
+ ff.transformT[i][2][0] = M[0][2];
+ ff.transformT[i][2][1] = M[1][2];
+ ff.transformT[i][2][2] = M[2][2];
+ ff.transformT[i][2][3] = M[3][2];
+
+ ff.transformT[i][3][0] = M[0][3];
+ ff.transformT[i][3][1] = M[1][3];
+ ff.transformT[i][3][2] = M[2][3];
+ ff.transformT[i][3][3] = M[3][3];
+ }
+
+ void VertexProcessor::setCameraTransform(const Matrix &M, int i) // Stores M transposed in ff.cameraTransformT[i]
+ {
+ ff.cameraTransformT[i][0][0] = M[0][0]; // Pattern throughout: destination [a][b] receives M[b][a]
+ ff.cameraTransformT[i][0][1] = M[1][0];
+ ff.cameraTransformT[i][0][2] = M[2][0];
+ ff.cameraTransformT[i][0][3] = M[3][0];
+
+ ff.cameraTransformT[i][1][0] = M[0][1];
+ ff.cameraTransformT[i][1][1] = M[1][1];
+ ff.cameraTransformT[i][1][2] = M[2][1];
+ ff.cameraTransformT[i][1][3] = M[3][1];
+
+ ff.cameraTransformT[i][2][0] = M[0][2];
+ ff.cameraTransformT[i][2][1] = M[1][2];
+ ff.cameraTransformT[i][2][2] = M[2][2];
+ ff.cameraTransformT[i][2][3] = M[3][2];
+
+ ff.cameraTransformT[i][3][0] = M[0][3];
+ ff.cameraTransformT[i][3][1] = M[1][3];
+ ff.cameraTransformT[i][3][2] = M[2][3];
+ ff.cameraTransformT[i][3][3] = M[3][3];
+ }
+
+ void VertexProcessor::setNormalTransform(const Matrix &M, int i) // Stores M transposed in ff.normalTransformT[i]
+ {
+ ff.normalTransformT[i][0][0] = M[0][0]; // Pattern throughout: destination [a][b] receives M[b][a]
+ ff.normalTransformT[i][0][1] = M[1][0];
+ ff.normalTransformT[i][0][2] = M[2][0];
+ ff.normalTransformT[i][0][3] = M[3][0];
+
+ ff.normalTransformT[i][1][0] = M[0][1];
+ ff.normalTransformT[i][1][1] = M[1][1];
+ ff.normalTransformT[i][1][2] = M[2][1];
+ ff.normalTransformT[i][1][3] = M[3][1];
+
+ ff.normalTransformT[i][2][0] = M[0][2];
+ ff.normalTransformT[i][2][1] = M[1][2];
+ ff.normalTransformT[i][2][2] = M[2][2];
+ ff.normalTransformT[i][2][3] = M[3][2];
+
+ ff.normalTransformT[i][3][0] = M[0][3];
+ ff.normalTransformT[i][3][1] = M[1][3];
+ ff.normalTransformT[i][3][2] = M[2][3];
+ ff.normalTransformT[i][3][3] = M[3][3];
+ }
+
+ void VertexProcessor::updateTransform() // Rebuilds the stale concatenated matrices and uploads them to the fixed-function constants
+ {
+ if(!updateMatrix) return; // Nothing changed since the last call
+
+ int activeMatrices = context->indexedVertexBlendEnable ? 12 : max(context->vertexBlendMatrixCount, 1); // All 12 with indexed blending, otherwise the blend count but at least 1
+
+ if(updateProjectionMatrix) // P changed: rebuild PB, PBV and every active PBVM
+ {
+ PB = P * B;
+ PBV = PB * V;
+
+ for(int i = 0; i < activeMatrices; i++)
+ {
+ PBVM[i] = PBV * M[i];
+ updateModelMatrix[i] = false;
+ }
+
+ updateProjectionMatrix = false;
+ updateBaseMatrix = false; // Already covered by the rebuild above
+ updateViewMatrix = false; // Already covered by the rebuild above
+ }
+
+ if(updateBaseMatrix) // B changed (and P did not): same rebuild chain starting from PB
+ {
+ PB = P * B;
+ PBV = PB * V;
+
+ for(int i = 0; i < activeMatrices; i++)
+ {
+ PBVM[i] = PBV * M[i];
+ updateModelMatrix[i] = false;
+ }
+
+ updateBaseMatrix = false;
+ updateViewMatrix = false; // Already covered by the rebuild above
+ }
+
+ if(updateViewMatrix) // Only V changed: PB is still valid, rebuild PBV and the active PBVM
+ {
+ PBV = PB * V;
+
+ for(int i = 0; i < activeMatrices; i++)
+ {
+ PBVM[i] = PBV * M[i];
+ updateModelMatrix[i] = false;
+ }
+
+ updateViewMatrix = false;
+ }
+
+ for(int i = 0; i < activeMatrices; i++) // Model matrices that changed on their own
+ {
+ if(updateModelMatrix[i])
+ {
+ PBVM[i] = PBV * M[i];
+ updateModelMatrix[i] = false;
+ }
+ }
+
+ for(int i = 0; i < activeMatrices; i++) // Upload all active matrices, whether or not they changed
+ {
+ setTransform(PBVM[i], i);
+ setCameraTransform(B * V * M[i], i); // Same chain as PBVM but without the projection matrix
+ setNormalTransform(~!(B * V * M[i]), i); // Presumably the inverse-transpose -- confirm Matrix operator~ and operator!
+ }
+
+ updateMatrix = false;
+ }
+
+ void VertexProcessor::setRoutineCacheSize(int cacheSize) // Replaces the routine cache with a new, empty one
+ {
+ delete routineCache; // Any routines held by the previous cache go with it
+ routineCache = new RoutineCache<State>(clamp(cacheSize, 1, 65536), precacheVertex ? "sw-vertex" : 0); // Size clamped to [1, 65536]; named "sw-vertex" only when precacheVertex is set
+ }
+
+ const VertexProcessor::State VertexProcessor::update(DrawType drawType) // Refreshes fixed-function constants and snapshots the context into a State (the key used by routine())
+ {
+ if(isFixedFunction()) // Matrix and light constants are only refreshed when no vertex shader is set
+ {
+ updateTransform();
+
+ if(updateLighting)
+ {
+ for(int i = 0; i < 8; i++)
+ {
+ if(context->vertexLightActive(i))
+ {
+ // Light position in camera coordinates
+ setLightViewPosition(i, B * V * context->getLightPosition(i));
+ }
+ }
+
+ updateLighting = false;
+ }
+ }
+
+ State state; // Zero-filled by its constructor; fields not assigned below stay 0
+
+ if(context->vertexShader)
+ {
+ state.shaderID = context->vertexShader->getSerialID();
+ }
+ else
+ {
+ state.shaderID = 0;
+ }
+
+ state.fixedFunction = !context->vertexShader && context->pixelShaderModel() < 0x0300; // Requires both: no vertex shader and a pixel shader model below 0x0300
+ state.textureSampling = context->vertexShader ? context->vertexShader->containsTextureSampling() : false;
+ state.positionRegister = context->vertexShader ? context->vertexShader->getPositionRegister() : Pos; // Defaults to the fixed Pos output
+ state.pointSizeRegister = context->vertexShader ? context->vertexShader->getPointSizeRegister() : Pts; // Defaults to the fixed Pts output
+
+ state.vertexBlendMatrixCount = context->vertexBlendMatrixCountActive();
+ state.indexedVertexBlendEnable = context->indexedVertexBlendActive();
+ state.vertexNormalActive = context->vertexNormalActive();
+ state.normalizeNormals = context->normalizeNormalsActive();
+ state.vertexLightingActive = context->vertexLightingActive();
+ state.diffuseActive = context->diffuseActive();
+ state.specularActive = context->specularActive();
+ state.vertexSpecularActive = context->vertexSpecularActive();
+
+ state.vertexLightActive = context->vertexLightActive(0) << 0 | // Bit n is set when light n is active
+ context->vertexLightActive(1) << 1 |
+ context->vertexLightActive(2) << 2 |
+ context->vertexLightActive(3) << 3 |
+ context->vertexLightActive(4) << 4 |
+ context->vertexLightActive(5) << 5 |
+ context->vertexLightActive(6) << 6 |
+ context->vertexLightActive(7) << 7;
+
+ state.vertexDiffuseMaterialSourceActive = context->vertexDiffuseMaterialSourceActive();
+ state.vertexSpecularMaterialSourceActive = context->vertexSpecularMaterialSourceActive();
+ state.vertexAmbientMaterialSourceActive = context->vertexAmbientMaterialSourceActive();
+ state.vertexEmissiveMaterialSourceActive = context->vertexEmissiveMaterialSourceActive();
+ state.fogActive = context->fogActive();
+ state.vertexFogMode = context->vertexFogModeActive();
+ state.rangeFogActive = context->rangeFogActive();
+ state.localViewerActive = context->localViewerActive();
+ state.pointSizeActive = context->pointSizeActive();
+ state.pointScaleActive = context->pointScaleActive();
+
+ state.preTransformed = context->preTransformed;
+ state.superSampling = context->getSuperSampleCount() > 1;
+ state.multiSampling = context->getMultiSampleCount() > 1;
+
+ state.transformFeedbackQueryEnabled = context->transformFeedbackQueryEnabled;
+ state.transformFeedbackEnabled = context->transformFeedbackEnabled;
+
+ // Note: Quads aren't handled for verticesPerPrimitive, but verticesPerPrimitive is used for transform feedback,
+ // which is an OpenGL ES 3.0 feature, and OpenGL ES 3.0 doesn't support quads as a primitive type.
+ DrawType type = static_cast<DrawType>(static_cast<unsigned int>(drawType) & 0xF); // Keep only the low four bits of the draw type
+ state.verticesPerPrimitive = 1 + (type >= DRAW_LINELIST) + (type >= DRAW_TRIANGLELIST); // 1, plus one for each threshold reached (assumes DRAW_LINELIST < DRAW_TRIANGLELIST)
+
+ for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
+ {
+ state.input[i].type = context->input[i].type;
+ state.input[i].count = context->input[i].count;
+ state.input[i].normalized = context->input[i].normalized;
+ state.input[i].attribType = context->vertexShader ? context->vertexShader->getAttribType(i) : VertexShader::ATTRIBTYPE_FLOAT; // Float attributes when there is no shader to ask
+ }
+
+ if(!context->vertexShader) // No shader: record the per-stage texture coordinate state
+ {
+ for(int i = 0; i < 8; i++)
+ {
+ // state.textureState[i].vertexTextureActive = context->vertexTextureActive(i, 0);
+ state.textureState[i].texGenActive = context->texGenActive(i);
+ state.textureState[i].textureTransformCountActive = context->textureTransformCountActive(i);
+ state.textureState[i].texCoordIndexActive = context->texCoordIndexActive(i);
+ }
+ }
+ else // Shader: record the state of the samplers it actually uses
+ {
+ for(unsigned int i = 0; i < VERTEX_TEXTURE_IMAGE_UNITS; i++)
+ {
+ if(context->vertexShader->usesSampler(i))
+ {
+ state.sampler[i] = context->sampler[TEXTURE_IMAGE_UNITS + i].samplerState(); // Vertex sampler i is stored at offset TEXTURE_IMAGE_UNITS
+ }
+ }
+ }
+
+ if(context->vertexShader) // FIXME: Also when pre-transformed?
+ {
+ for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++) // Write masks come straight from the shader's output declarations
+ {
+ state.output[i].xWrite = context->vertexShader->getOutput(i, 0).active();
+ state.output[i].yWrite = context->vertexShader->getOutput(i, 1).active();
+ state.output[i].zWrite = context->vertexShader->getOutput(i, 2).active();
+ state.output[i].wWrite = context->vertexShader->getOutput(i, 3).active();
+ }
+ }
+ else if(!context->preTransformed || context->pixelShaderModel() < 0x0300) // No shader; write masks derived from the active fixed-function features
+ {
+ state.output[Pos].write = 0xF;
+
+ if(context->diffuseActive() && (context->lightingEnable || context->input[Color0]))
+ {
+ state.output[C0].write = 0xF;
+ }
+
+ if(context->specularActive())
+ {
+ state.output[C1].write = 0xF;
+ }
+
+ for(int stage = 0; stage < 8; stage++) // One write-mask bit per active texture coordinate component
+ {
+ if(context->texCoordActive(stage, 0)) state.output[T0 + stage].write |= 0x01;
+ if(context->texCoordActive(stage, 1)) state.output[T0 + stage].write |= 0x02;
+ if(context->texCoordActive(stage, 2)) state.output[T0 + stage].write |= 0x04;
+ if(context->texCoordActive(stage, 3)) state.output[T0 + stage].write |= 0x08;
+ }
+
+ if(context->fogActive())
+ {
+ state.output[Fog].xWrite = true; // Fog uses the x component
+ }
+
+ if(context->pointSizeActive())
+ {
+ state.output[Pts].yWrite = true; // Point size uses the y component of the same output
+ }
+ }
+ else // No shader, pre-transformed input and pixel shader model 0x0300 or above: write masks follow the inputs that are present
+ {
+ state.output[Pos].write = 0xF;
+
+ for(int i = 0; i < 2; i++)
+ {
+ if(context->input[Color0 + i])
+ {
+ state.output[C0 + i].write = 0xF;
+ }
+ }
+
+ for(int i = 0; i < 8; i++)
+ {
+ if(context->input[TexCoord0 + i])
+ {
+ state.output[T0 + i].write = 0xF;
+ }
+ }
+
+ if(context->input[PointSize])
+ {
+ state.output[Pts].yWrite = true;
+ }
+ }
+
+ if(context->vertexShaderModel() < 0x0300) // Below model 0x0300 both colors and the fog factor are flagged for clamping
+ {
+ state.output[C0].clamp = 0xF;
+ state.output[C1].clamp = 0xF;
+ state.output[Fog].xClamp = true;
+ }
+
+ state.hash = state.computeHash(); // Computed last, once every other field has its final value
+
+ return state;
+ }
+
+ Routine *VertexProcessor::routine(const State &state)
+ {
+     // Fast path: a routine for this exact state is already in the cache.
+     if(Routine *cached = routineCache->query(state))
+     {
+         return cached;
+     }
+
+     // Cache miss: build the routine with the generator that matches the state.
+     VertexRoutine *generator = nullptr;
+     if(!state.fixedFunction)
+     {
+         generator = new VertexProgram(state, context->vertexShader);
+     }
+     else
+     {
+         generator = new VertexPipeline(state);
+     }
+
+     generator->generate();
+     Routine *generated = (*generator)(L"VertexRoutine_%0.8X", state.shaderID);
+     delete generator;
+
+     routineCache->add(state, generated);
+     return generated;
+ }
+}
diff --git a/src/Device/VertexProcessor.hpp b/src/Device/VertexProcessor.hpp
new file mode 100644
index 0000000..277a155
--- /dev/null
+++ b/src/Device/VertexProcessor.hpp
@@ -0,0 +1,352 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_VertexProcessor_hpp
+#define sw_VertexProcessor_hpp
+
+#include "Matrix.hpp"
+#include "Context.hpp"
+#include "RoutineCache.hpp"
+#include "Shader/VertexShader.hpp"
+
+namespace sw
+{
+ struct DrawData;
+
+ struct VertexCache // FIXME: Variable size. Fixed-size store of processed vertices embedded in every VertexTask.
+ {
+ void clear();
+
+ Vertex vertex[16][4];  // 16 entries holding 4 Vertex structures each.
+ unsigned int tag[16];  // One tag per entry of 'vertex'; presumably identifies which vertex is cached there - confirm in clear() and the vertex routine.
+
+ int drawCall;  // NOTE(review): presumably the draw call the cached contents belong to - confirm against the code that writes it.
+ };
+
+ struct VertexTask  // Work item for vertex processing; a VertexTask pointer is one of the arguments of VertexProcessor::RoutinePointer.
+ {
+ unsigned int vertexCount;
+ unsigned int primitiveStart;
+ VertexCache vertexCache;  // Stored by value, so each task carries its own cache.
+ };
+
+ class VertexProcessor  // Holds vertex processing state (constants, matrices, lighting, samplers) and builds/caches the Routine matching it.
+ {
+ public:
+ struct States  // Fields describing a vertex routine; State (below) derives from this and adds the stored hash.
+ {
+ unsigned int computeHash();  // update() stores the result in State::hash.
+
+ uint64_t shaderID;  // Also formatted into the routine name by routine().
+
+ bool fixedFunction : 1; // TODO: Eliminate by querying shader.
+ bool textureSampling : 1; // TODO: Eliminate by querying shader.
+ unsigned int positionRegister : BITS(MAX_VERTEX_OUTPUTS); // TODO: Eliminate by querying shader.
+ unsigned int pointSizeRegister : BITS(MAX_VERTEX_OUTPUTS); // TODO: Eliminate by querying shader.
+
+ unsigned int vertexBlendMatrixCount : 3;
+ bool indexedVertexBlendEnable : 1;
+ bool vertexNormalActive : 1;
+ bool normalizeNormals : 1;
+ bool vertexLightingActive : 1;
+ bool diffuseActive : 1;
+ bool specularActive : 1;
+ bool vertexSpecularActive : 1;
+ unsigned int vertexLightActive : 8;  // 8 bits; presumably one per light, matching the 8-entry light arrays in FixedFunction - confirm.
+ MaterialSource vertexDiffuseMaterialSourceActive : BITS(MATERIAL_LAST);
+ MaterialSource vertexSpecularMaterialSourceActive : BITS(MATERIAL_LAST);
+ MaterialSource vertexAmbientMaterialSourceActive : BITS(MATERIAL_LAST);
+ MaterialSource vertexEmissiveMaterialSourceActive : BITS(MATERIAL_LAST);
+ bool fogActive : 1;
+ FogMode vertexFogMode : BITS(FOG_LAST);
+ bool rangeFogActive : 1;
+ bool localViewerActive : 1;
+ bool pointSizeActive : 1;
+ bool pointScaleActive : 1;
+ bool transformFeedbackQueryEnabled : 1;
+ uint64_t transformFeedbackEnabled : 64;
+ unsigned char verticesPerPrimitive : 2; // 1 (points), 2 (lines) or 3 (triangles)
+
+ bool preTransformed : 1;
+ bool superSampling : 1;
+ bool multiSampling : 1;
+
+ struct TextureState  // Per-stage fixed-function texture coordinate state (see textureState[8] below).
+ {
+ TexGen texGenActive : BITS(TEXGEN_LAST);
+ unsigned char textureTransformCountActive : 3;
+ unsigned char texCoordIndexActive : 3;
+ };
+
+ TextureState textureState[8];  // One entry per texture stage; update() also iterates 8 stages.
+
+ Sampler::State sampler[VERTEX_TEXTURE_IMAGE_UNITS];
+
+ struct Input  // Description of one vertex input stream.
+ {
+ operator bool() const // Returns true if stream contains data
+ {
+ return count != 0;
+ }
+
+ StreamType type : BITS(STREAMTYPE_LAST);
+ unsigned int count : 3;  // Zero means the stream is unused (see operator bool above).
+ bool normalized : 1;
+ unsigned int attribType : BITS(VertexShader::ATTRIBTYPE_LAST);
+ };
+
+ struct Output  // Per-output-register write and clamp masks.
+ {
+ union
+ {
+ unsigned char write : 4;  // All four component flags at once; update() assigns 0xF or ORs in 0x01/0x02/0x04/0x08.
+
+ struct
+ {
+ unsigned char xWrite : 1;  // Per-component view of 'write'; which bit each flag aliases depends on the compiler's bit-field layout.
+ unsigned char yWrite : 1;
+ unsigned char zWrite : 1;
+ unsigned char wWrite : 1;
+ };
+ };
+
+ union
+ {
+ unsigned char clamp : 4;  // All four clamp flags at once; update() sets 0xF on C0/C1 when vertexShaderModel() < 0x0300.
+
+ struct
+ {
+ unsigned char xClamp : 1;  // update() sets this on the Fog output when vertexShaderModel() < 0x0300.
+ unsigned char yClamp : 1;
+ unsigned char zClamp : 1;
+ unsigned char wClamp : 1;
+ };
+ };
+ };
+
+ Input input[MAX_VERTEX_INPUTS];
+ Output output[MAX_VERTEX_OUTPUTS];
+ };
+
+ struct State : States  // States plus its hash; this is the key type of routineCache.
+ {
+ State();
+
+ bool operator==(const State &state) const;
+
+ unsigned int hash;  // Assigned from computeHash() at the end of update().
+ };
+
+ struct FixedFunction  // Constant data for the fixed-function path: transforms, 8 lights and material colors.
+ {
+ float4 transformT[12][4];  // 12 matrices of 4 float4 rows; the count matches M[12] / PBVM[12] below.
+ float4 cameraTransformT[12][4];
+ float4 normalTransformT[12][4];
+ float4 textureTransform[8][4];  // One 4-row matrix per texture stage.
+
+ float4 lightPosition[8];  // All light arrays have 8 entries, one per light.
+ float4 lightAmbient[8];
+ float4 lightSpecular[8];
+ float4 lightDiffuse[8];
+ float4 attenuationConstant[8];
+ float4 attenuationLinear[8];
+ float4 attenuationQuadratic[8];
+ float lightRange[8];
+ float4 materialDiffuse;
+ float4 materialSpecular;
+ float materialShininess;
+ float4 globalAmbient;
+ float4 materialEmission;
+ float4 materialAmbient;
+ };
+
+ struct PointSprite  // Point size parameters (see the setPointSize*/setPointScale* setters).
+ {
+ float4 pointSize;
+ float pointSizeMin;
+ float pointSizeMax;
+ float pointScaleA;
+ float pointScaleB;
+ float pointScaleC;
+ };
+
+ typedef void (*RoutinePointer)(Vertex *output, unsigned int *batch, VertexTask *vertexTask, DrawData *draw);  // Presumably the entry point signature of generated vertex routines - confirm at the call site.
+
+ VertexProcessor(Context *context);
+
+ virtual ~VertexProcessor();
+
+ void setInputStream(int index, const Stream &stream);
+ void resetInputStreams(bool preTransformed);
+
+ void setFloatConstant(unsigned int index, const float value[4]);
+ void setIntegerConstant(unsigned int index, const int integer[4]);
+ void setBooleanConstant(unsigned int index, int boolean);
+
+ void setUniformBuffer(int index, sw::Resource* uniformBuffer, int offset);
+ void lockUniformBuffers(byte** u, sw::Resource* uniformBuffers[]);
+
+ void setTransformFeedbackBuffer(int index, sw::Resource* transformFeedbackBuffer, int offset, unsigned int reg, unsigned int row, unsigned int col, unsigned int stride);
+ void lockTransformFeedbackBuffers(byte** t, unsigned int* v, unsigned int* r, unsigned int* c, unsigned int* s, sw::Resource* transformFeedbackBuffers[]);
+
+ // Transformations
+ void setModelMatrix(const Matrix &M, int i = 0);
+ void setViewMatrix(const Matrix &V);
+ void setBaseMatrix(const Matrix &B);
+ void setProjectionMatrix(const Matrix &P);
+
+ // Lighting
+ void setLightingEnable(bool lightingEnable);
+ void setLightEnable(unsigned int light, bool lightEnable);
+ void setSpecularEnable(bool specularEnable);
+
+ void setGlobalAmbient(const Color<float> &globalAmbient);
+ void setLightPosition(unsigned int light, const Point &lightPosition);
+ void setLightViewPosition(unsigned int light, const Point &lightPosition);
+ void setLightDiffuse(unsigned int light, const Color<float> &lightDiffuse);
+ void setLightSpecular(unsigned int light, const Color<float> &lightSpecular);
+ void setLightAmbient(unsigned int light, const Color<float> &lightAmbient);
+ void setLightAttenuation(unsigned int light, float constant, float linear, float quadratic);
+ void setLightRange(unsigned int light, float lightRange);
+
+ void setInstanceID(int instanceID);
+
+ void setFogEnable(bool fogEnable);
+ void setVertexFogMode(FogMode fogMode);
+ void setRangeFogEnable(bool enable);
+
+ void setColorVertexEnable(bool colorVertexEnable);
+ void setDiffuseMaterialSource(MaterialSource diffuseMaterialSource);
+ void setSpecularMaterialSource(MaterialSource specularMaterialSource);
+ void setAmbientMaterialSource(MaterialSource ambientMaterialSource);
+ void setEmissiveMaterialSource(MaterialSource emissiveMaterialSource);
+
+ void setMaterialEmission(const Color<float> &emission);
+ void setMaterialAmbient(const Color<float> &materialAmbient);
+ void setMaterialDiffuse(const Color<float> &diffuseColor);
+ void setMaterialSpecular(const Color<float> &specularColor);
+ void setMaterialShininess(float specularPower);
+
+ void setIndexedVertexBlendEnable(bool indexedVertexBlendEnable);
+ void setVertexBlendMatrixCount(unsigned int vertexBlendMatrixCount);
+
+ void setTextureWrap(unsigned int stage, int mask);
+ void setTexGen(unsigned int stage, TexGen texGen);
+ void setLocalViewer(bool localViewer);
+ void setNormalizeNormals(bool normalizeNormals);
+ void setTextureMatrix(int stage, const Matrix &T);
+ void setTextureTransform(int stage, int count, bool project);
+
+ void setTextureFilter(unsigned int sampler, FilterType textureFilter);
+ void setMipmapFilter(unsigned int sampler, MipmapType mipmapFilter);
+ void setGatherEnable(unsigned int sampler, bool enable);
+ void setAddressingModeU(unsigned int sampler, AddressingMode addressingMode);
+ void setAddressingModeV(unsigned int sampler, AddressingMode addressingMode);
+ void setAddressingModeW(unsigned int sampler, AddressingMode addressingMode);
+ void setReadSRGB(unsigned int sampler, bool sRGB);
+ void setMipmapLOD(unsigned int sampler, float bias);
+ void setBorderColor(unsigned int sampler, const Color<float> &borderColor);
+ void setMaxAnisotropy(unsigned int stage, float maxAnisotropy);
+ void setHighPrecisionFiltering(unsigned int sampler, bool highPrecisionFiltering);
+ void setSwizzleR(unsigned int sampler, SwizzleType swizzleR);
+ void setSwizzleG(unsigned int sampler, SwizzleType swizzleG);
+ void setSwizzleB(unsigned int sampler, SwizzleType swizzleB);
+ void setSwizzleA(unsigned int sampler, SwizzleType swizzleA);
+ void setCompareFunc(unsigned int sampler, CompareFunc compare);
+ void setBaseLevel(unsigned int sampler, int baseLevel);
+ void setMaxLevel(unsigned int sampler, int maxLevel);
+ void setMinLod(unsigned int sampler, float minLod);
+ void setMaxLod(unsigned int sampler, float maxLod);
+ void setSyncRequired(unsigned int sampler, bool isSincRequired);  // NOTE(review): "isSincRequired" looks like a typo for "isSyncRequired".
+
+ void setPointSize(float pointSize);
+ void setPointSizeMin(float pointSizeMin);
+ void setPointSizeMax(float pointSizeMax);
+ void setPointScaleA(float pointScaleA);
+ void setPointScaleB(float pointScaleB);
+ void setPointScaleC(float pointScaleC);
+
+ void setTransformFeedbackQueryEnabled(bool enable);
+ void enableTransformFeedback(uint64_t enable);
+
+ protected:
+ const Matrix &getModelTransform(int i);
+ const Matrix &getViewTransform();
+
+ const State update(DrawType drawType);  // Builds the State for the current context, including its hash, and returns it by value.
+ Routine *routine(const State &state);  // Looks 'state' up in routineCache; on a miss generates the routine and adds it to the cache.
+
+ bool isFixedFunction();
+ void setRoutineCacheSize(int cacheSize);
+
+ // Shader constants
+ float4 c[VERTEX_UNIFORM_VECTORS + 1]; // One extra for indices out of range, c[VERTEX_UNIFORM_VECTORS] = {0, 0, 0, 0}
+ int4 i[16];
+ bool b[16];
+
+ PointSprite point;
+ FixedFunction ff;
+
+ private:
+ struct UniformBufferInfo  // Binding of one uniform buffer: the resource and an offset into it.
+ {
+ UniformBufferInfo();
+
+ Resource* buffer;
+ int offset;
+ };
+ UniformBufferInfo uniformBufferInfo[MAX_UNIFORM_BUFFER_BINDINGS];
+
+ struct TransformFeedbackInfo  // Binding of one transform feedback output; fields mirror the setTransformFeedbackBuffer() parameters.
+ {
+ TransformFeedbackInfo();
+
+ Resource* buffer;
+ unsigned int offset;
+ unsigned int reg;
+ unsigned int row;
+ unsigned int col;
+ unsigned int stride;
+ };
+ TransformFeedbackInfo transformFeedbackInfo[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS];
+
+ void updateTransform();
+ void setTransform(const Matrix &M, int i);
+ void setCameraTransform(const Matrix &M, int i);
+ void setNormalTransform(const Matrix &M, int i);
+
+ Context *const context;  // Read by update() (inputs, active features) and by routine() (context->vertexShader).
+
+ RoutineCache<State> *routineCache;  // Queried and filled by routine().
+
+ protected:
+ Matrix M[12]; // Model/Geometry/World matrix
+ Matrix V; // View/Camera/Eye matrix
+ Matrix B; // Base matrix
+ Matrix P; // Projection matrix
+ Matrix PB; // P * B
+ Matrix PBV; // P * B * V
+ Matrix PBVM[12]; // P * B * V * M
+
+ // Update hierarchy
+ bool updateMatrix;
+ bool updateModelMatrix[12];  // One flag per entry of M.
+ bool updateViewMatrix;
+ bool updateBaseMatrix;
+ bool updateProjectionMatrix;
+ bool updateLighting;
+ };
+}
+
+#endif // sw_VertexProcessor_hpp
diff --git a/src/Pipeline/Constants.cpp b/src/Pipeline/Constants.cpp
new file mode 100644
index 0000000..06dda32
--- /dev/null
+++ b/src/Pipeline/Constants.cpp
@@ -0,0 +1,362 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Constants.hpp"
+
+#include "Common/Math.hpp"
+#include "Common/Half.hpp"
+
+#include <memory.h>
+
+namespace sw
+{
+ Constants constants;
+
+ Constants::Constants()  // Fills every lookup table of the Constants structure, either by copying a literal table or by computing it.
+ {
+ static const unsigned int transposeBit0[16] =  // Entry i moves bit k of i to bit 4*k (bit 0 of each of the four low nibbles).
+ {
+ 0x00000000,
+ 0x00000001,
+ 0x00000010,
+ 0x00000011,
+ 0x00000100,
+ 0x00000101,
+ 0x00000110,
+ 0x00000111,
+ 0x00001000,
+ 0x00001001,
+ 0x00001010,
+ 0x00001011,
+ 0x00001100,
+ 0x00001101,
+ 0x00001110,
+ 0x00001111
+ };
+
+ static const unsigned int transposeBit1[16] =  // Same spreading as transposeBit0, but into bit 4*k + 1.
+ {
+ 0x00000000,
+ 0x00000002,
+ 0x00000020,
+ 0x00000022,
+ 0x00000200,
+ 0x00000202,
+ 0x00000220,
+ 0x00000222,
+ 0x00002000,
+ 0x00002002,
+ 0x00002020,
+ 0x00002022,
+ 0x00002200,
+ 0x00002202,
+ 0x00002220,
+ 0x00002222
+ };
+
+ static const unsigned int transposeBit2[16] =  // Same spreading as transposeBit0, but into bit 4*k + 2.
+ {
+ 0x00000000,
+ 0x00000004,
+ 0x00000040,
+ 0x00000044,
+ 0x00000400,
+ 0x00000404,
+ 0x00000440,
+ 0x00000444,
+ 0x00004000,
+ 0x00004004,
+ 0x00004040,
+ 0x00004044,
+ 0x00004400,
+ 0x00004404,
+ 0x00004440,
+ 0x00004444
+ };
+
+ memcpy(&this->transposeBit0, transposeBit0, sizeof(transposeBit0));
+ memcpy(&this->transposeBit1, transposeBit1, sizeof(transposeBit1));
+ memcpy(&this->transposeBit2, transposeBit2, sizeof(transposeBit2));
+
+ static const ushort4 cWeight[17] =  // Entry n holds 0xFFFF / n in all four lanes; entry 0 duplicates entry 1.
+ {
+ {0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}, // 0xFFFF / 1 = 0xFFFF
+ {0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}, // 0xFFFF / 1 = 0xFFFF
+ {0x8000, 0x8000, 0x8000, 0x8000}, // 0xFFFF / 2 = 0x8000
+ {0x5555, 0x5555, 0x5555, 0x5555}, // 0xFFFF / 3 = 0x5555
+ {0x4000, 0x4000, 0x4000, 0x4000}, // 0xFFFF / 4 = 0x4000
+ {0x3333, 0x3333, 0x3333, 0x3333}, // 0xFFFF / 5 = 0x3333
+ {0x2AAA, 0x2AAA, 0x2AAA, 0x2AAA}, // 0xFFFF / 6 = 0x2AAA
+ {0x2492, 0x2492, 0x2492, 0x2492}, // 0xFFFF / 7 = 0x2492
+ {0x2000, 0x2000, 0x2000, 0x2000}, // 0xFFFF / 8 = 0x2000
+ {0x1C71, 0x1C71, 0x1C71, 0x1C71}, // 0xFFFF / 9 = 0x1C71
+ {0x1999, 0x1999, 0x1999, 0x1999}, // 0xFFFF / 10 = 0x1999
+ {0x1745, 0x1745, 0x1745, 0x1745}, // 0xFFFF / 11 = 0x1745
+ {0x1555, 0x1555, 0x1555, 0x1555}, // 0xFFFF / 12 = 0x1555
+ {0x13B1, 0x13B1, 0x13B1, 0x13B1}, // 0xFFFF / 13 = 0x13B1
+ {0x1249, 0x1249, 0x1249, 0x1249}, // 0xFFFF / 14 = 0x1249
+ {0x1111, 0x1111, 0x1111, 0x1111}, // 0xFFFF / 15 = 0x1111
+ {0x1000, 0x1000, 0x1000, 0x1000}, // 0xFFFF / 16 = 0x1000
+ };
+
+ static const float4 uvWeight[17] =  // Entry n holds 1 / n in all four lanes; entry 0 duplicates entry 1.
+ {
+ {1.0f / 1.0f, 1.0f / 1.0f, 1.0f / 1.0f, 1.0f / 1.0f},
+ {1.0f / 1.0f, 1.0f / 1.0f, 1.0f / 1.0f, 1.0f / 1.0f},
+ {1.0f / 2.0f, 1.0f / 2.0f, 1.0f / 2.0f, 1.0f / 2.0f},
+ {1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f},
+ {1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f},
+ {1.0f / 5.0f, 1.0f / 5.0f, 1.0f / 5.0f, 1.0f / 5.0f},
+ {1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f},
+ {1.0f / 7.0f, 1.0f / 7.0f, 1.0f / 7.0f, 1.0f / 7.0f},
+ {1.0f / 8.0f, 1.0f / 8.0f, 1.0f / 8.0f, 1.0f / 8.0f},
+ {1.0f / 9.0f, 1.0f / 9.0f, 1.0f / 9.0f, 1.0f / 9.0f},
+ {1.0f / 10.0f, 1.0f / 10.0f, 1.0f / 10.0f, 1.0f / 10.0f},
+ {1.0f / 11.0f, 1.0f / 11.0f, 1.0f / 11.0f, 1.0f / 11.0f},
+ {1.0f / 12.0f, 1.0f / 12.0f, 1.0f / 12.0f, 1.0f / 12.0f},
+ {1.0f / 13.0f, 1.0f / 13.0f, 1.0f / 13.0f, 1.0f / 13.0f},
+ {1.0f / 14.0f, 1.0f / 14.0f, 1.0f / 14.0f, 1.0f / 14.0f},
+ {1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f},
+ {1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f},
+ };
+
+ static const float4 uvStart[17] =  // Entry n holds -(n - 1) / (2 * n) in all four lanes; entry 0 duplicates entry 1.
+ {
+ {-0.0f / 2.0f, -0.0f / 2.0f, -0.0f / 2.0f, -0.0f / 2.0f},
+ {-0.0f / 2.0f, -0.0f / 2.0f, -0.0f / 2.0f, -0.0f / 2.0f},
+ {-1.0f / 4.0f, -1.0f / 4.0f, -1.0f / 4.0f, -1.0f / 4.0f},
+ {-2.0f / 6.0f, -2.0f / 6.0f, -2.0f / 6.0f, -2.0f / 6.0f},
+ {-3.0f / 8.0f, -3.0f / 8.0f, -3.0f / 8.0f, -3.0f / 8.0f},
+ {-4.0f / 10.0f, -4.0f / 10.0f, -4.0f / 10.0f, -4.0f / 10.0f},
+ {-5.0f / 12.0f, -5.0f / 12.0f, -5.0f / 12.0f, -5.0f / 12.0f},
+ {-6.0f / 14.0f, -6.0f / 14.0f, -6.0f / 14.0f, -6.0f / 14.0f},
+ {-7.0f / 16.0f, -7.0f / 16.0f, -7.0f / 16.0f, -7.0f / 16.0f},
+ {-8.0f / 18.0f, -8.0f / 18.0f, -8.0f / 18.0f, -8.0f / 18.0f},
+ {-9.0f / 20.0f, -9.0f / 20.0f, -9.0f / 20.0f, -9.0f / 20.0f},
+ {-10.0f / 22.0f, -10.0f / 22.0f, -10.0f / 22.0f, -10.0f / 22.0f},
+ {-11.0f / 24.0f, -11.0f / 24.0f, -11.0f / 24.0f, -11.0f / 24.0f},
+ {-12.0f / 26.0f, -12.0f / 26.0f, -12.0f / 26.0f, -12.0f / 26.0f},
+ {-13.0f / 28.0f, -13.0f / 28.0f, -13.0f / 28.0f, -13.0f / 28.0f},
+ {-14.0f / 30.0f, -14.0f / 30.0f, -14.0f / 30.0f, -14.0f / 30.0f},
+ {-15.0f / 32.0f, -15.0f / 32.0f, -15.0f / 32.0f, -15.0f / 32.0f},
+ };
+
+ memcpy(&this->cWeight, cWeight, sizeof(cWeight));
+ memcpy(&this->uvWeight, uvWeight, sizeof(uvWeight));
+ memcpy(&this->uvStart, uvStart, sizeof(uvStart));
+
+ static const unsigned int occlusionCount[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};  // Number of set bits in the 4-bit index.
+
+ memcpy(&this->occlusionCount, &occlusionCount, sizeof(occlusionCount));
+
+ for(int i = 0; i < 16; i++)  // i is a 4-bit mask; -(bit) yields 0 or -1, so each element is all ones when its bit of i is set.
+ {
+ maskB4Q[i][0] = -(i >> 0 & 1);
+ maskB4Q[i][1] = -(i >> 1 & 1);
+ maskB4Q[i][2] = -(i >> 2 & 1);
+ maskB4Q[i][3] = -(i >> 3 & 1);
+ maskB4Q[i][4] = -(i >> 0 & 1);  // Elements 4..7 repeat the pattern of elements 0..3.
+ maskB4Q[i][5] = -(i >> 1 & 1);
+ maskB4Q[i][6] = -(i >> 2 & 1);
+ maskB4Q[i][7] = -(i >> 3 & 1);
+
+ invMaskB4Q[i][0] = ~maskB4Q[i][0];  // Every invMask* table is the bitwise complement of its mask* counterpart.
+ invMaskB4Q[i][1] = ~maskB4Q[i][1];
+ invMaskB4Q[i][2] = ~maskB4Q[i][2];
+ invMaskB4Q[i][3] = ~maskB4Q[i][3];
+ invMaskB4Q[i][4] = ~maskB4Q[i][4];
+ invMaskB4Q[i][5] = ~maskB4Q[i][5];
+ invMaskB4Q[i][6] = ~maskB4Q[i][6];
+ invMaskB4Q[i][7] = ~maskB4Q[i][7];
+
+ maskW4Q[i][0] = -(i >> 0 & 1);
+ maskW4Q[i][1] = -(i >> 1 & 1);
+ maskW4Q[i][2] = -(i >> 2 & 1);
+ maskW4Q[i][3] = -(i >> 3 & 1);
+
+ invMaskW4Q[i][0] = ~maskW4Q[i][0];
+ invMaskW4Q[i][1] = ~maskW4Q[i][1];
+ invMaskW4Q[i][2] = ~maskW4Q[i][2];
+ invMaskW4Q[i][3] = ~maskW4Q[i][3];
+
+ maskD4X[i][0] = -(i >> 0 & 1);
+ maskD4X[i][1] = -(i >> 1 & 1);
+ maskD4X[i][2] = -(i >> 2 & 1);
+ maskD4X[i][3] = -(i >> 3 & 1);
+
+ invMaskD4X[i][0] = ~maskD4X[i][0];
+ invMaskD4X[i][1] = ~maskD4X[i][1];
+ invMaskD4X[i][2] = ~maskD4X[i][2];
+ invMaskD4X[i][3] = ~maskD4X[i][3];
+
+ maskQ0Q[i] = -(i >> 0 & 1);  // maskQnQ[i] is a single value selected by bit n of i.
+ maskQ1Q[i] = -(i >> 1 & 1);
+ maskQ2Q[i] = -(i >> 2 & 1);
+ maskQ3Q[i] = -(i >> 3 & 1);
+
+ invMaskQ0Q[i] = ~maskQ0Q[i];
+ invMaskQ1Q[i] = ~maskQ1Q[i];
+ invMaskQ2Q[i] = ~maskQ2Q[i];
+ invMaskQ3Q[i] = ~maskQ3Q[i];
+
+ maskX0X[i][0] = maskX0X[i][1] = maskX0X[i][2] = maskX0X[i][3] = -(i >> 0 & 1);  // maskXnX[i] replicates bit n of i into all four elements.
+ maskX1X[i][0] = maskX1X[i][1] = maskX1X[i][2] = maskX1X[i][3] = -(i >> 1 & 1);
+ maskX2X[i][0] = maskX2X[i][1] = maskX2X[i][2] = maskX2X[i][3] = -(i >> 2 & 1);
+ maskX3X[i][0] = maskX3X[i][1] = maskX3X[i][2] = maskX3X[i][3] = -(i >> 3 & 1);
+
+ invMaskX0X[i][0] = invMaskX0X[i][1] = invMaskX0X[i][2] = invMaskX0X[i][3] = ~maskX0X[i][0];
+ invMaskX1X[i][0] = invMaskX1X[i][1] = invMaskX1X[i][2] = invMaskX1X[i][3] = ~maskX1X[i][0];
+ invMaskX2X[i][0] = invMaskX2X[i][1] = invMaskX2X[i][2] = invMaskX2X[i][3] = ~maskX2X[i][0];
+ invMaskX3X[i][0] = invMaskX3X[i][1] = invMaskX3X[i][2] = invMaskX3X[i][3] = ~maskX3X[i][0];
+
+ maskD01Q[i][0] = -(i >> 0 & 1);  // The 01 tables use bits 0 and 1 of i, the 23 tables bits 2 and 3.
+ maskD01Q[i][1] = -(i >> 1 & 1);
+ maskD23Q[i][0] = -(i >> 2 & 1);
+ maskD23Q[i][1] = -(i >> 3 & 1);
+
+ invMaskD01Q[i][0] = ~maskD01Q[i][0];
+ invMaskD01Q[i][1] = ~maskD01Q[i][1];
+ invMaskD23Q[i][0] = ~maskD23Q[i][0];
+ invMaskD23Q[i][1] = ~maskD23Q[i][1];
+
+ maskQ01X[i][0] = -(i >> 0 & 1);
+ maskQ01X[i][1] = -(i >> 1 & 1);
+ maskQ23X[i][0] = -(i >> 2 & 1);
+ maskQ23X[i][1] = -(i >> 3 & 1);
+
+ invMaskQ01X[i][0] = ~maskQ01X[i][0];
+ invMaskQ01X[i][1] = ~maskQ01X[i][1];
+ invMaskQ23X[i][0] = ~maskQ23X[i][0];
+ invMaskQ23X[i][1] = ~maskQ23X[i][1];
+ }
+
+ for(int i = 0; i < 8; i++)  // i is a 3-bit mask selecting which fields of a 5:6:5 packed value are kept.
+ {
+ mask565Q[i][0] =
+ mask565Q[i][1] =
+ mask565Q[i][2] =
+ mask565Q[i][3] = (i & 0x1 ? 0x001F : 0) | (i & 0x2 ? 0x07E0 : 0) | (i & 0x4 ? 0xF800 : 0);  // Bit 0 -> low 5 bits, bit 1 -> middle 6 bits, bit 2 -> high 5 bits.
+ }
+
+ for(int i = 0; i < 4; i++)  // i is a 2-bit mask; the two-element pattern is stored twice in each entry.
+ {
+ maskW01Q[i][0] = -(i >> 0 & 1);
+ maskW01Q[i][1] = -(i >> 1 & 1);
+ maskW01Q[i][2] = -(i >> 0 & 1);
+ maskW01Q[i][3] = -(i >> 1 & 1);
+
+ maskD01X[i][0] = -(i >> 0 & 1);
+ maskD01X[i][1] = -(i >> 1 & 1);
+ maskD01X[i][2] = -(i >> 0 & 1);
+ maskD01X[i][3] = -(i >> 1 & 1);
+ }
+
+ for(int i = 0; i < 256; i++)  // 8-bit input normalized by 0xFF, result scaled to 16 bits; the + 0.5f rounds to nearest before truncation.
+ {
+ sRGBtoLinear8_16[i] = (unsigned short)(sw::sRGBtoLinear((float)i / 0xFF) * 0xFFFF + 0.5f);
+ }
+
+ for(int i = 0; i < 64; i++)  // Same conversion for 6-bit inputs (normalized by 0x3F).
+ {
+ sRGBtoLinear6_16[i] = (unsigned short)(sw::sRGBtoLinear((float)i / 0x3F) * 0xFFFF + 0.5f);
+ }
+
+ for(int i = 0; i < 32; i++)  // Same conversion for 5-bit inputs (normalized by 0x1F).
+ {
+ sRGBtoLinear5_16[i] = (unsigned short)(sw::sRGBtoLinear((float)i / 0x1F) * 0xFFFF + 0.5f);
+ }
+
+ for(int i = 0; i < 0x1000; i++)  // 12-bit inputs in both directions; unlike the loops above the result is clamped to [0, 0xFFFF] before the cast.
+ {
+ linearToSRGB12_16[i] = (unsigned short)(clamp(sw::linearToSRGB((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));
+ sRGBtoLinear12_16[i] = (unsigned short)(clamp(sw::sRGBtoLinear((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));
+ }
+
+ for(int q = 0; q < 4; q++)  // q indexes the four X/Y offsets below, c is a 4-bit mask, i is the lane.
+ {
+ for(int c = 0; c < 16; c++)
+ {
+ for(int i = 0; i < 4; i++)
+ {
+ const float X[4] = {+0.3125f, -0.3125f, -0.1250f, +0.1250f};  // NOTE(review): opposite signs to the static X table copied to this->X below - presumably intentional, confirm.
+ const float Y[4] = {+0.1250f, -0.1250f, +0.3125f, -0.3125f};  // NOTE(review): likewise opposite in sign to the static Y table below.
+
+ sampleX[q][c][i] = c & (1 << i) ? X[q] : 0.0f;  // Lane i gets offset q only when bit i of c is set.
+ sampleY[q][c][i] = c & (1 << i) ? Y[q] : 0.0f;
+ weight[c][i] = c & (1 << i) ? 1.0f : 0.0f;  // Independent of q, so the same value is rewritten on every q iteration.
+ }
+ }
+ }
+
+ const int Xf[4] = {-5, +5, +2, -2}; // Fragment offsets
+ const int Yf[4] = {-2, +2, -5, +5}; // Fragment offsets
+
+ memcpy(&this->Xf, &Xf, sizeof(Xf));
+ memcpy(&this->Yf, &Yf, sizeof(Yf));
+
+ static const float4 X[4] = {{-0.3125f, -0.3125f, -0.3125f, -0.3125f},  // These values equal Xf / 16, replicated across the four lanes.
+ {+0.3125f, +0.3125f, +0.3125f, +0.3125f},
+ {+0.1250f, +0.1250f, +0.1250f, +0.1250f},
+ {-0.1250f, -0.1250f, -0.1250f, -0.1250f}};
+
+ static const float4 Y[4] = {{-0.1250f, -0.1250f, -0.1250f, -0.1250f},  // These values equal Yf / 16, replicated across the four lanes.
+ {+0.1250f, +0.1250f, +0.1250f, +0.1250f},
+ {-0.3125f, -0.3125f, -0.3125f, -0.3125f},
+ {+0.3125f, +0.3125f, +0.3125f, +0.3125f}};
+
+ memcpy(&this->X, &X, sizeof(X));
+ memcpy(&this->Y, &Y, sizeof(Y));
+
+ const dword maxX[16] = {0x00000000, 0x00000001, 0x00000100, 0x00000101, 0x00010000, 0x00010001, 0x00010100, 0x00010101, 0x01000000, 0x01000001, 0x01000100, 0x01000101, 0x01010000, 0x01010001, 0x01010100, 0x01010101};  // Bit k of the index sets bit 0 of byte k.
+ const dword maxY[16] = {0x00000000, 0x00000002, 0x00000200, 0x00000202, 0x00020000, 0x00020002, 0x00020200, 0x00020202, 0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02020000, 0x02020002, 0x02020200, 0x02020202};  // Same, bit 1 of each byte.
+ const dword maxZ[16] = {0x00000000, 0x00000004, 0x00000400, 0x00000404, 0x00040000, 0x00040004, 0x00040400, 0x00040404, 0x04000000, 0x04000004, 0x04000400, 0x04000404, 0x04040000, 0x04040004, 0x04040400, 0x04040404};  // Same, bit 2 of each byte.
+ const dword minX[16] = {0x00000000, 0x00000008, 0x00000800, 0x00000808, 0x00080000, 0x00080008, 0x00080800, 0x00080808, 0x08000000, 0x08000008, 0x08000800, 0x08000808, 0x08080000, 0x08080008, 0x08080800, 0x08080808};  // Same, bit 3 of each byte.
+ const dword minY[16] = {0x00000000, 0x00000010, 0x00001000, 0x00001010, 0x00100000, 0x00100010, 0x00101000, 0x00101010, 0x10000000, 0x10000010, 0x10001000, 0x10001010, 0x10100000, 0x10100010, 0x10101000, 0x10101010};  // Same, bit 4 of each byte.
+ const dword minZ[16] = {0x00000000, 0x00000020, 0x00002000, 0x00002020, 0x00200000, 0x00200020, 0x00202000, 0x00202020, 0x20000000, 0x20000020, 0x20002000, 0x20002020, 0x20200000, 0x20200020, 0x20202000, 0x20202020};  // Same, bit 5 of each byte.
+ const dword fini[16] = {0x00000000, 0x00000080, 0x00008000, 0x00008080, 0x00800000, 0x00800080, 0x00808000, 0x00808080, 0x80000000, 0x80000080, 0x80008000, 0x80008080, 0x80800000, 0x80800080, 0x80808000, 0x80808080};  // Same, bit 7 of each byte (bit 6 is unused by these tables).
+
+ memcpy(&this->maxX, &maxX, sizeof(maxX));
+ memcpy(&this->maxY, &maxY, sizeof(maxY));
+ memcpy(&this->maxZ, &maxZ, sizeof(maxZ));
+ memcpy(&this->minX, &minX, sizeof(minX));
+ memcpy(&this->minY, &minY, sizeof(minY));
+ memcpy(&this->minZ, &minZ, sizeof(minZ));
+ memcpy(&this->fini, &fini, sizeof(fini));
+
+ static const dword4 maxPos = {0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFE};  // 0x7F7FFFFF is the IEEE-754 single-precision bit pattern of the largest finite float; the last element is one step below it.
+
+ memcpy(&this->maxPos, &maxPos, sizeof(maxPos));
+
+ static const float4 unscaleByte = {1.0f / 0xFF, 1.0f / 0xFF, 1.0f / 0xFF, 1.0f / 0xFF};  // Each unscale* constant is the reciprocal of the divisor shown, replicated across four lanes.
+ static const float4 unscaleSByte = {1.0f / 0x7F, 1.0f / 0x7F, 1.0f / 0x7F, 1.0f / 0x7F};
+ static const float4 unscaleShort = {1.0f / 0x7FFF, 1.0f / 0x7FFF, 1.0f / 0x7FFF, 1.0f / 0x7FFF};
+ static const float4 unscaleUShort = {1.0f / 0xFFFF, 1.0f / 0xFFFF, 1.0f / 0xFFFF, 1.0f / 0xFFFF};
+ static const float4 unscaleInt = {1.0f / 0x7FFFFFFF, 1.0f / 0x7FFFFFFF, 1.0f / 0x7FFFFFFF, 1.0f / 0x7FFFFFFF};
+ static const float4 unscaleUInt = {1.0f / 0xFFFFFFFF, 1.0f / 0xFFFFFFFF, 1.0f / 0xFFFFFFFF, 1.0f / 0xFFFFFFFF};
+ static const float4 unscaleFixed = {1.0f / 0x00010000, 1.0f / 0x00010000, 1.0f / 0x00010000, 1.0f / 0x00010000};  // Reciprocal of 65536.
+
+ memcpy(&this->unscaleByte, &unscaleByte, sizeof(unscaleByte));
+ memcpy(&this->unscaleSByte, &unscaleSByte, sizeof(unscaleSByte));
+ memcpy(&this->unscaleShort, &unscaleShort, sizeof(unscaleShort));
+ memcpy(&this->unscaleUShort, &unscaleUShort, sizeof(unscaleUShort));
+ memcpy(&this->unscaleInt, &unscaleInt, sizeof(unscaleInt));
+ memcpy(&this->unscaleUInt, &unscaleUInt, sizeof(unscaleUInt));
+ memcpy(&this->unscaleFixed, &unscaleFixed, sizeof(unscaleFixed));
+
+ for(int i = 0; i <= 0xFFFF; i++)  // One table entry for every 16-bit pattern.
+ {
+ half2float[i] = (float)reinterpret_cast<half&>(i);  // NOTE(review): reads the int's storage as a half, so it presumably relies on half being the low-order bytes of i (little-endian) - confirm.
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/Pipeline/Constants.hpp b/src/Pipeline/Constants.hpp
new file mode 100644
index 0000000..6b70e04
--- /dev/null
+++ b/src/Pipeline/Constants.hpp
@@ -0,0 +1,113 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Constants_hpp
+#define sw_Constants_hpp
+
+#include "Common/Types.hpp"
+
+namespace sw
+{
+ struct Constants  // Lookup tables and vector constants; a single instance is declared as 'extern Constants constants'.
+ {
+ Constants();
+
+ unsigned int transposeBit0[16];  // The transposeBit tables are indexed by a 4-bit value.
+ unsigned int transposeBit1[16];
+ unsigned int transposeBit2[16];
+
+ ushort4 cWeight[17];  // The three weight/start tables have 17 entries (indices 0..16).
+ float4 uvWeight[17];
+ float4 uvStart[17];
+
+ unsigned int occlusionCount[16];
+
+ byte8 maskB4Q[16];  // mask*/invMask* tables with 16 entries are indexed by a 4-bit mask.
+ byte8 invMaskB4Q[16];
+ word4 maskW4Q[16];
+ word4 invMaskW4Q[16];
+ dword4 maskD4X[16];
+ dword4 invMaskD4X[16];
+ qword maskQ0Q[16];
+ qword maskQ1Q[16];
+ qword maskQ2Q[16];
+ qword maskQ3Q[16];
+ qword invMaskQ0Q[16];
+ qword invMaskQ1Q[16];
+ qword invMaskQ2Q[16];
+ qword invMaskQ3Q[16];
+ dword4 maskX0X[16];
+ dword4 maskX1X[16];
+ dword4 maskX2X[16];
+ dword4 maskX3X[16];
+ dword4 invMaskX0X[16];
+ dword4 invMaskX1X[16];
+ dword4 invMaskX2X[16];
+ dword4 invMaskX3X[16];
+ dword2 maskD01Q[16];
+ dword2 maskD23Q[16];
+ dword2 invMaskD01Q[16];
+ dword2 invMaskD23Q[16];
+ qword2 maskQ01X[16];
+ qword2 maskQ23X[16];
+ qword2 invMaskQ01X[16];
+ qword2 invMaskQ23X[16];
+ word4 maskW01Q[4];  // Indexed by a 2-bit mask.
+ dword4 maskD01X[4];  // Indexed by a 2-bit mask.
+ word4 mask565Q[8];  // Indexed by a 3-bit mask.
+
+ unsigned short sRGBtoLinear8_16[256];  // Naming is <input bits>_<output bits>: indexed by an 8-bit value, holds 16-bit results.
+ unsigned short sRGBtoLinear6_16[64];
+ unsigned short sRGBtoLinear5_16[32];
+
+ unsigned short linearToSRGB12_16[4096];  // Indexed by a 12-bit value.
+ unsigned short sRGBtoLinear12_16[4096];
+
+ // Centroid parameters
+ float4 sampleX[4][16];  // First index has 4 entries, second is a 4-bit mask.
+ float4 sampleY[4][16];
+ float4 weight[16];
+
+ // Fragment offsets
+ int Xf[4];
+ int Yf[4];
+
+ float4 X[4];
+ float4 Y[4];
+
+ dword maxX[16];  // The max*/min*/fini tables are indexed by a 4-bit value.
+ dword maxY[16];
+ dword maxZ[16];
+ dword minX[16];
+ dword minY[16];
+ dword minZ[16];
+ dword fini[16];
+
+ dword4 maxPos;
+
+ float4 unscaleByte;  // unscale* members are four-lane scale factors; NOTE(review): presumably used to normalize integer vertex data - confirm at the use sites.
+ float4 unscaleSByte;
+ float4 unscaleShort;
+ float4 unscaleUShort;
+ float4 unscaleInt;
+ float4 unscaleUInt;
+ float4 unscaleFixed;
+
+ float half2float[65536];  // One float per 16-bit pattern; 65536 floats make this member alone 256 KiB with 4-byte floats.
+ };
+
+ extern Constants constants;
+}
+
+#endif // sw_Constants_hpp
diff --git a/src/Pipeline/PixelPipeline.cpp b/src/Pipeline/PixelPipeline.cpp
new file mode 100644
index 0000000..d4faebd
--- /dev/null
+++ b/src/Pipeline/PixelPipeline.cpp
@@ -0,0 +1,1959 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "PixelPipeline.hpp"
+#include "SamplerCore.hpp"
+#include "Renderer/Renderer.hpp"
+
+namespace sw
+{
+ extern bool postBlendSRGB;
+
+ void PixelPipeline::setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w)
+ {
+ // Seeds diffuse from v[0] and specular from v[1]; a channel missing from the component mask gets 0x1000 (diffuse) or 0x0000 (specular). x, y, z and w are not used.
+ if((state.color[0].component & 0x1) == 0) { diffuse.x = Short4(0x1000); } else { diffuse.x = convertFixed12(v[0].x); }
+ if((state.color[0].component & 0x2) == 0) { diffuse.y = Short4(0x1000); } else { diffuse.y = convertFixed12(v[0].y); }
+ if((state.color[0].component & 0x4) == 0) { diffuse.z = Short4(0x1000); } else { diffuse.z = convertFixed12(v[0].z); }
+ if((state.color[0].component & 0x8) == 0) { diffuse.w = Short4(0x1000); } else { diffuse.w = convertFixed12(v[0].w); }
+ if((state.color[1].component & 0x1) == 0) { specular.x = Short4(0x0000); } else { specular.x = convertFixed12(v[1].x); }
+ if((state.color[1].component & 0x2) == 0) { specular.y = Short4(0x0000); } else { specular.y = convertFixed12(v[1].y); }
+ if((state.color[1].component & 0x4) == 0) { specular.z = Short4(0x0000); } else { specular.z = convertFixed12(v[1].z); }
+ if((state.color[1].component & 0x8) == 0) { specular.w = Short4(0x0000); } else { specular.w = convertFixed12(v[1].w); }
+ }
+
+ void PixelPipeline::fixedFunction()
+ {
+ // Fixed-function path: start from the diffuse color, run the texture stage cascade, then apply specular.
+ current = diffuse;
+ Vector4s tempRegister(0x0000, 0x0000, 0x0000, 0x0000);
+
+ // Up to eight stages are evaluated in order; the first disabled stage ends the cascade.
+ for(int index = 0; index < 8; index++)
+ {
+ const bool stageDisabled = (state.textureStage[index].stageOperation == TextureStage::STAGE_DISABLE);
+ if(stageDisabled) break;
+
+ // Left unassigned when the stage does not reference a texture.
+ Vector4s texel;
+
+ // The stage index is passed for both arguments of sampleTexture.
+ if(state.textureStage[index].usesTexture) texel = sampleTexture(index, index);
+
+ blendTexture(tempRegister, texel, index);
+ }
+
+ // Specular is combined into the result once, after all stages.
+ specularPixel(current, specular);
+ }
+
+ void PixelPipeline::applyShader(Int cMask[4]) // Emits code for each instruction of the bound pixel shader; cMask is only passed on to TEXKILL.
+ {
+ if(!shader) // No pixel shader bound: run the fixed-function texture stages instead.
+ {
+ fixedFunction();
+ return;
+ }
+
+ int pad = 0; // Count number of texm3x3pad instructions
+ Vector4s dPairing; // Destination for first pairing instruction
+
+ for(size_t i = 0; i < shader->getLength(); i++) // Instructions are processed strictly in program order.
+ {
+ const Shader::Instruction *instruction = shader->getInstruction(i);
+ Shader::Opcode opcode = instruction->opcode;
+
+ // #ifndef NDEBUG // FIXME: Centralize debug output control
+ // shader->printInstruction(i, "debug.txt");
+ // #endif
+
+ if(opcode == Shader::OPCODE_DCL || opcode == Shader::OPCODE_DEF || opcode == Shader::OPCODE_DEFI || opcode == Shader::OPCODE_DEFB) // Declarations and constant definitions emit no code.
+ {
+ continue;
+ }
+
+ const Dst &dst = instruction->dst;
+ const Src &src0 = instruction->src[0];
+ const Src &src1 = instruction->src[1];
+ const Src &src2 = instruction->src[2];
+
+ unsigned short shaderModel = shader->getShaderModel(); // Compared against 0x0104 below (presumably the ps_1_4 encoding -- TODO confirm).
+ bool pairing = i + 1 < shader->getLength() && shader->getInstruction(i + 1)->coissue; // First instruction of pair
+ bool coissue = instruction->coissue; // Second instruction of pair
+
+ Vector4s d;
+ Vector4s s0;
+ Vector4s s1;
+ Vector4s s2;
+
+ if(src0.type != Shader::PARAMETER_VOID) s0 = fetchRegister(src0); // Only operands the instruction actually has are fetched.
+ if(src1.type != Shader::PARAMETER_VOID) s1 = fetchRegister(src1);
+ if(src2.type != Shader::PARAMETER_VOID) s2 = fetchRegister(src2);
+
+ Float4 x = shaderModel < 0x0104 ? v[2 + dst.index].x : v[2 + src0.index].x; // x/y/z/w: components of interpolant v[2 + index]; index is taken from dst below 0x0104, from src0 otherwise.
+ Float4 y = shaderModel < 0x0104 ? v[2 + dst.index].y : v[2 + src0.index].y;
+ Float4 z = shaderModel < 0x0104 ? v[2 + dst.index].z : v[2 + src0.index].z;
+ Float4 w = shaderModel < 0x0104 ? v[2 + dst.index].w : v[2 + src0.index].w;
+
+ switch(opcode)
+ {
+ case Shader::OPCODE_PS_1_0: break;
+ case Shader::OPCODE_PS_1_1: break;
+ case Shader::OPCODE_PS_1_2: break;
+ case Shader::OPCODE_PS_1_3: break;
+ case Shader::OPCODE_PS_1_4: break;
+
+ case Shader::OPCODE_DEF: break; // Never reached: DEF is skipped by the continue above.
+
+ case Shader::OPCODE_NOP: break;
+ case Shader::OPCODE_MOV: MOV(d, s0); break;
+ case Shader::OPCODE_ADD: ADD(d, s0, s1); break;
+ case Shader::OPCODE_SUB: SUB(d, s0, s1); break;
+ case Shader::OPCODE_MAD: MAD(d, s0, s1, s2); break;
+ case Shader::OPCODE_MUL: MUL(d, s0, s1); break;
+ case Shader::OPCODE_DP3: DP3(d, s0, s1); break;
+ case Shader::OPCODE_DP4: DP4(d, s0, s1); break;
+ case Shader::OPCODE_LRP: LRP(d, s0, s1, s2); break;
+ case Shader::OPCODE_TEXCOORD:
+ if(shaderModel < 0x0104)
+ {
+ TEXCOORD(d, x, y, z, dst.index);
+ }
+ else
+ {
+ if((src0.swizzle & 0x30) == 0x20) // .xyz
+ {
+ TEXCRD(d, x, y, z, src0.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
+ }
+ else // .xyw
+ {
+ TEXCRD(d, x, y, w, src0.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
+ }
+ }
+ break;
+ case Shader::OPCODE_TEXKILL:
+ if(shaderModel < 0x0104)
+ {
+ TEXKILL(cMask, x, y, z);
+ }
+ else if(shaderModel == 0x0104)
+ {
+ if(dst.type == Shader::PARAMETER_TEXTURE)
+ {
+ TEXKILL(cMask, x, y, z);
+ }
+ else
+ {
+ TEXKILL(cMask, rs[dst.index]);
+ }
+ }
+ else ASSERT(false); // Shader models above 0x0104 are not handled here.
+ break;
+ case Shader::OPCODE_TEX:
+ if(shaderModel < 0x0104)
+ {
+ TEX(d, x, y, z, dst.index, false);
+ }
+ else if(shaderModel == 0x0104)
+ {
+ if(src0.type == Shader::PARAMETER_TEXTURE)
+ {
+ if((src0.swizzle & 0x30) == 0x20) // .xyz
+ {
+ TEX(d, x, y, z, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
+ }
+ else // .xyw
+ {
+ TEX(d, x, y, w, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
+ }
+ }
+ else
+ {
+ TEXLD(d, s0, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
+ }
+ }
+ else ASSERT(false); // Shader models above 0x0104 are not handled here.
+ break;
+ case Shader::OPCODE_TEXBEM: TEXBEM(d, s0, x, y, z, dst.index); break;
+ case Shader::OPCODE_TEXBEML: TEXBEML(d, s0, x, y, z, dst.index); break;
+ case Shader::OPCODE_TEXREG2AR: TEXREG2AR(d, s0, dst.index); break;
+ case Shader::OPCODE_TEXREG2GB: TEXREG2GB(d, s0, dst.index); break;
+ case Shader::OPCODE_TEXM3X2PAD: TEXM3X2PAD(x, y, z, s0, 0, src0.modifier == Shader::MODIFIER_SIGN); break;
+ case Shader::OPCODE_TEXM3X2TEX: TEXM3X2TEX(d, x, y, z, dst.index, s0, src0.modifier == Shader::MODIFIER_SIGN); break;
+ case Shader::OPCODE_TEXM3X3PAD: TEXM3X3PAD(x, y, z, s0, pad++ % 2, src0.modifier == Shader::MODIFIER_SIGN); break; // pad++ % 2 alternates the fifth argument between 0 and 1 on successive pads.
+ case Shader::OPCODE_TEXM3X3TEX: TEXM3X3TEX(d, x, y, z, dst.index, s0, src0.modifier == Shader::MODIFIER_SIGN); break;
+ case Shader::OPCODE_TEXM3X3SPEC: TEXM3X3SPEC(d, x, y, z, dst.index, s0, s1); break;
+ case Shader::OPCODE_TEXM3X3VSPEC: TEXM3X3VSPEC(d, x, y, z, dst.index, s0); break;
+ case Shader::OPCODE_CND: CND(d, s0, s1, s2); break;
+ case Shader::OPCODE_TEXREG2RGB: TEXREG2RGB(d, s0, dst.index); break;
+ case Shader::OPCODE_TEXDP3TEX: TEXDP3TEX(d, x, y, z, dst.index, s0); break;
+ case Shader::OPCODE_TEXM3X2DEPTH: TEXM3X2DEPTH(d, x, y, z, s0, src0.modifier == Shader::MODIFIER_SIGN); break;
+ case Shader::OPCODE_TEXDP3: TEXDP3(d, x, y, z, s0); break;
+ case Shader::OPCODE_TEXM3X3: TEXM3X3(d, x, y, z, s0, src0.modifier == Shader::MODIFIER_SIGN); break;
+ case Shader::OPCODE_TEXDEPTH: TEXDEPTH(); break;
+ case Shader::OPCODE_CMP0: CMP(d, s0, s1, s2); break;
+ case Shader::OPCODE_BEM: BEM(d, s0, s1, dst.index); break;
+ case Shader::OPCODE_PHASE: break;
+ case Shader::OPCODE_END: break;
+ default:
+ ASSERT(false);
+ }
+
+ if(dst.type != Shader::PARAMETER_VOID && opcode != Shader::OPCODE_TEXKILL) // Post-process and write back d, except for TEXKILL and instructions without a destination.
+ {
+ if(dst.shift > 0) // Positive shift of 1..3: applied as repeated saturating doubling of the masked channels.
+ {
+ if(dst.mask & 0x1) { d.x = AddSat(d.x, d.x); if(dst.shift > 1) d.x = AddSat(d.x, d.x); if(dst.shift > 2) d.x = AddSat(d.x, d.x); }
+ if(dst.mask & 0x2) { d.y = AddSat(d.y, d.y); if(dst.shift > 1) d.y = AddSat(d.y, d.y); if(dst.shift > 2) d.y = AddSat(d.y, d.y); }
+ if(dst.mask & 0x4) { d.z = AddSat(d.z, d.z); if(dst.shift > 1) d.z = AddSat(d.z, d.z); if(dst.shift > 2) d.z = AddSat(d.z, d.z); }
+ if(dst.mask & 0x8) { d.w = AddSat(d.w, d.w); if(dst.shift > 1) d.w = AddSat(d.w, d.w); if(dst.shift > 2) d.w = AddSat(d.w, d.w); }
+ }
+ else if(dst.shift < 0) // Negative shift: shift the masked channels right by -dst.shift.
+ {
+ if(dst.mask & 0x1) d.x = d.x >> -dst.shift;
+ if(dst.mask & 0x2) d.y = d.y >> -dst.shift;
+ if(dst.mask & 0x4) d.z = d.z >> -dst.shift;
+ if(dst.mask & 0x8) d.w = d.w >> -dst.shift;
+ }
+
+ if(dst.saturate) // Clamp the masked channels to [0x0000, 0x1000].
+ {
+ if(dst.mask & 0x1) { d.x = Min(d.x, Short4(0x1000)); d.x = Max(d.x, Short4(0x0000)); }
+ if(dst.mask & 0x2) { d.y = Min(d.y, Short4(0x1000)); d.y = Max(d.y, Short4(0x0000)); }
+ if(dst.mask & 0x4) { d.z = Min(d.z, Short4(0x1000)); d.z = Max(d.z, Short4(0x0000)); }
+ if(dst.mask & 0x8) { d.w = Min(d.w, Short4(0x1000)); d.w = Max(d.w, Short4(0x0000)); }
+ }
+
+ if(pairing) // First of a co-issued pair: hold the result in dPairing; it is written when the second instruction is processed.
+ {
+ if(dst.mask & 0x1) dPairing.x = d.x;
+ if(dst.mask & 0x2) dPairing.y = d.y;
+ if(dst.mask & 0x4) dPairing.z = d.z;
+ if(dst.mask & 0x8) dPairing.w = d.w;
+ }
+
+ if(coissue) // Second of the pair: write the held first result before this instruction's own result.
+ {
+ const Dst &dst = shader->getInstruction(i - 1)->dst; // NOTE: shadows the outer dst; this is the previous instruction's destination.
+
+ writeDestination(dPairing, dst);
+ }
+
+ if(!pairing)
+ {
+ writeDestination(d, dst);
+ }
+ }
+ }
+
+ current.x = Min(current.x, Short4(0x0FFF)); current.x = Max(current.x, Short4(0x0000)); // After the last instruction, clamp every channel of current to [0x0000, 0x0FFF].
+ current.y = Min(current.y, Short4(0x0FFF)); current.y = Max(current.y, Short4(0x0000));
+ current.z = Min(current.z, Short4(0x0FFF)); current.z = Max(current.z, Short4(0x0000));
+ current.w = Min(current.w, Short4(0x0FFF)); current.w = Max(current.w, Short4(0x0000));
+ }
+
+ Bool PixelPipeline::alphaTest(Int cMask[4])
+ {
+ // Updates the per-sample coverage masks from current.w and reports whether any sample is still covered.
+ if(!state.alphaTestActive()) return true;
+
+ Int aMask;
+
+ switch(state.transparencyAntialiasing)
+ {
+ case TRANSPARENCY_NONE:
+ // Plain alpha test: one mask computed from current.w is ANDed into every sample.
+ PixelRoutine::alphaTest(aMask, current.w);
+
+ for(unsigned int sample = 0; sample < state.multiSample; sample++)
+ {
+ cMask[sample] &= aMask;
+ }
+ break;
+ case TRANSPARENCY_ALPHA_TO_COVERAGE:
+ {
+ // Alpha is converted from fixed point (0x1000 == 1.0) to float first.
+ Float4 alpha = Float4(current.w) * Float4(1.0f / 0x1000);
+
+ alphaToCoverage(cMask, alpha);
+ }
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ // OR the sample masks together: the test passes if any bit survives.
+ Int anyCovered = cMask[0];
+ for(unsigned int sample = 1; sample < state.multiSample; sample++) anyCovered = anyCovered | cMask[sample];
+ return anyCovered != 0x0;
+ }
+
+ void PixelPipeline::rasterOperation(Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
+ {
+ // Emits fog, blending and the color write for render target 0; nothing is generated when writes to it are disabled.
+ if(!state.colorWriteActive(0)) return;
+
+ Vector4f oC;
+
+ switch(state.targetFormat[0])
+ {
+ // Formats handled with the 16-bit integer color in `current`.
+ case FORMAT_R5G6B5:
+ case FORMAT_X8R8G8B8:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_A8:
+ case FORMAT_G16R16:
+ case FORMAT_A16B16G16R16:
+ if(state.writeSRGB && !postBlendSRGB)
+ {
+ linearToSRGB12_16(current);
+ }
+ else
+ {
+ // No sRGB conversion at this point: each channel is just shifted left by four bits.
+ current.x <<= 4;
+ current.y <<= 4;
+ current.z <<= 4;
+ current.w <<= 4;
+ }
+
+ if(state.targetFormat[0] == FORMAT_R5G6B5) // Keep only the top 5/6/5 bits of x/y/z.
+ {
+ current.x &= Short4(0xF800u);
+ current.y &= Short4(0xFC00u);
+ current.z &= Short4(0xF800u);
+ }
+
+ fogBlend(current, fog);
+
+ // Sample n is addressed at cBuffer[0] + n * colorSliceB[0].
+ for(unsigned int sample = 0; sample < state.multiSample; sample++)
+ {
+ Pointer<Byte> slice = cBuffer[0] + sample * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[0]));
+ Vector4s pixel = current;
+
+ if(!(state.multiSampleMask & (1 << sample))) continue;
+
+ alphaBlend(0, slice, pixel, x);
+ logicOperation(0, slice, pixel, x);
+ writeColor(0, slice, x, pixel, sMask[sample], zMask[sample], cMask[sample]);
+ }
+ break;
+ // Formats handled in floating point via oC; no logic operation is applied on this path.
+ case FORMAT_R32F:
+ case FORMAT_G32R32F:
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_A32B32G32R32F:
+ // case FORMAT_X32B32G32R32F_UNSIGNED: // Not renderable in any fixed-function API.
+ convertSigned12(oC, current);
+ PixelRoutine::fogBlend(oC, fog);
+
+ for(unsigned int sample = 0; sample < state.multiSample; sample++)
+ {
+ Pointer<Byte> slice = cBuffer[0] + sample * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[0]));
+ Vector4f pixel = oC;
+
+ if(!(state.multiSampleMask & (1 << sample))) continue;
+
+ alphaBlend(0, slice, pixel, x);
+ writeColor(0, slice, x, pixel, sMask[sample], zMask[sample], cMask[sample]);
+ }
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ void PixelPipeline::blendTexture(Vector4s &temp, Vector4s &texture, int stage)
+ {
+ Vector4s *arg1 = nullptr;
+ Vector4s *arg2 = nullptr;
+ Vector4s *arg3 = nullptr;
+ Vector4s res;
+
+ Vector4s constant;
+ Vector4s tfactor;
+
+ const TextureStage::State &textureStage = state.textureStage[stage];
+
+ if(textureStage.firstArgument == TextureStage::SOURCE_CONSTANT ||
+ textureStage.firstArgumentAlpha == TextureStage::SOURCE_CONSTANT ||
+ textureStage.secondArgument == TextureStage::SOURCE_CONSTANT ||
+ textureStage.secondArgumentAlpha == TextureStage::SOURCE_CONSTANT ||
+ textureStage.thirdArgument == TextureStage::SOURCE_CONSTANT ||
+ textureStage.thirdArgumentAlpha == TextureStage::SOURCE_CONSTANT)
+ {
+ constant.x = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[0]));
+ constant.y = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[1]));
+ constant.z = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[2]));
+ constant.w = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[3]));
+ }
+
+ if(textureStage.firstArgument == TextureStage::SOURCE_TFACTOR ||
+ textureStage.firstArgumentAlpha == TextureStage::SOURCE_TFACTOR ||
+ textureStage.secondArgument == TextureStage::SOURCE_TFACTOR ||
+ textureStage.secondArgumentAlpha == TextureStage::SOURCE_TFACTOR ||
+ textureStage.thirdArgument == TextureStage::SOURCE_TFACTOR ||
+ textureStage.thirdArgumentAlpha == TextureStage::SOURCE_TFACTOR)
+ {
+ tfactor.x = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[0]));
+ tfactor.y = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[1]));
+ tfactor.z = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[2]));
+ tfactor.w = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]));
+ }
+
+ // Premodulate
+ if(stage > 0 && textureStage.usesTexture)
+ {
+ if(state.textureStage[stage - 1].stageOperation == TextureStage::STAGE_PREMODULATE)
+ {
+ current.x = MulHigh(current.x, texture.x) << 4;
+ current.y = MulHigh(current.y, texture.y) << 4;
+ current.z = MulHigh(current.z, texture.z) << 4;
+ }
+
+ if(state.textureStage[stage - 1].stageOperationAlpha == TextureStage::STAGE_PREMODULATE)
+ {
+ current.w = MulHigh(current.w, texture.w) << 4;
+ }
+ }
+
+ if(luminance)
+ {
+ texture.x = MulHigh(texture.x, L) << 4;
+ texture.y = MulHigh(texture.y, L) << 4;
+ texture.z = MulHigh(texture.z, L) << 4;
+
+ luminance = false;
+ }
+
+ switch(textureStage.firstArgument)
+ {
+ case TextureStage::SOURCE_TEXTURE: arg1 = &texture; break;
+ case TextureStage::SOURCE_CONSTANT: arg1 = &constant; break;
+ case TextureStage::SOURCE_CURRENT: arg1 = &current; break;
+ case TextureStage::SOURCE_DIFFUSE: arg1 = &diffuse; break;
+ case TextureStage::SOURCE_SPECULAR: arg1 = &specular; break;
+ case TextureStage::SOURCE_TEMP: arg1 = &temp; break;
+ case TextureStage::SOURCE_TFACTOR: arg1 = &tfactor; break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.secondArgument)
+ {
+ case TextureStage::SOURCE_TEXTURE: arg2 = &texture; break;
+ case TextureStage::SOURCE_CONSTANT: arg2 = &constant; break;
+ case TextureStage::SOURCE_CURRENT: arg2 = &current; break;
+ case TextureStage::SOURCE_DIFFUSE: arg2 = &diffuse; break;
+ case TextureStage::SOURCE_SPECULAR: arg2 = &specular; break;
+ case TextureStage::SOURCE_TEMP: arg2 = &temp; break;
+ case TextureStage::SOURCE_TFACTOR: arg2 = &tfactor; break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.thirdArgument)
+ {
+ case TextureStage::SOURCE_TEXTURE: arg3 = &texture; break;
+ case TextureStage::SOURCE_CONSTANT: arg3 = &constant; break;
+ case TextureStage::SOURCE_CURRENT: arg3 = &current; break;
+ case TextureStage::SOURCE_DIFFUSE: arg3 = &diffuse; break;
+ case TextureStage::SOURCE_SPECULAR: arg3 = &specular; break;
+ case TextureStage::SOURCE_TEMP: arg3 = &temp; break;
+ case TextureStage::SOURCE_TFACTOR: arg3 = &tfactor; break;
+ default:
+ ASSERT(false);
+ }
+
+ Vector4s mod1;
+ Vector4s mod2;
+ Vector4s mod3;
+
+ switch(textureStage.firstModifier)
+ {
+ case TextureStage::MODIFIER_COLOR:
+ break;
+ case TextureStage::MODIFIER_INVCOLOR:
+ mod1.x = SubSat(Short4(0x1000), arg1->x);
+ mod1.y = SubSat(Short4(0x1000), arg1->y);
+ mod1.z = SubSat(Short4(0x1000), arg1->z);
+ mod1.w = SubSat(Short4(0x1000), arg1->w);
+
+ arg1 = &mod1;
+ break;
+ case TextureStage::MODIFIER_ALPHA:
+ mod1.x = arg1->w;
+ mod1.y = arg1->w;
+ mod1.z = arg1->w;
+ mod1.w = arg1->w;
+
+ arg1 = &mod1;
+ break;
+ case TextureStage::MODIFIER_INVALPHA:
+ mod1.x = SubSat(Short4(0x1000), arg1->w);
+ mod1.y = SubSat(Short4(0x1000), arg1->w);
+ mod1.z = SubSat(Short4(0x1000), arg1->w);
+ mod1.w = SubSat(Short4(0x1000), arg1->w);
+
+ arg1 = &mod1;
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.secondModifier)
+ {
+ case TextureStage::MODIFIER_COLOR:
+ break;
+ case TextureStage::MODIFIER_INVCOLOR:
+ mod2.x = SubSat(Short4(0x1000), arg2->x);
+ mod2.y = SubSat(Short4(0x1000), arg2->y);
+ mod2.z = SubSat(Short4(0x1000), arg2->z);
+ mod2.w = SubSat(Short4(0x1000), arg2->w);
+
+ arg2 = &mod2;
+ break;
+ case TextureStage::MODIFIER_ALPHA:
+ mod2.x = arg2->w;
+ mod2.y = arg2->w;
+ mod2.z = arg2->w;
+ mod2.w = arg2->w;
+
+ arg2 = &mod2;
+ break;
+ case TextureStage::MODIFIER_INVALPHA:
+ mod2.x = SubSat(Short4(0x1000), arg2->w);
+ mod2.y = SubSat(Short4(0x1000), arg2->w);
+ mod2.z = SubSat(Short4(0x1000), arg2->w);
+ mod2.w = SubSat(Short4(0x1000), arg2->w);
+
+ arg2 = &mod2;
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.thirdModifier)
+ {
+ case TextureStage::MODIFIER_COLOR:
+ break;
+ case TextureStage::MODIFIER_INVCOLOR:
+ mod3.x = SubSat(Short4(0x1000), arg3->x);
+ mod3.y = SubSat(Short4(0x1000), arg3->y);
+ mod3.z = SubSat(Short4(0x1000), arg3->z);
+ mod3.w = SubSat(Short4(0x1000), arg3->w);
+
+ arg3 = &mod3;
+ break;
+ case TextureStage::MODIFIER_ALPHA:
+ mod3.x = arg3->w;
+ mod3.y = arg3->w;
+ mod3.z = arg3->w;
+ mod3.w = arg3->w;
+
+ arg3 = &mod3;
+ break;
+ case TextureStage::MODIFIER_INVALPHA:
+ mod3.x = SubSat(Short4(0x1000), arg3->w);
+ mod3.y = SubSat(Short4(0x1000), arg3->w);
+ mod3.z = SubSat(Short4(0x1000), arg3->w);
+ mod3.w = SubSat(Short4(0x1000), arg3->w);
+
+ arg3 = &mod3;
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.stageOperation)
+ {
+ case TextureStage::STAGE_DISABLE:
+ break;
+ case TextureStage::STAGE_SELECTARG1: // Arg1
+ res.x = arg1->x;
+ res.y = arg1->y;
+ res.z = arg1->z;
+ break;
+ case TextureStage::STAGE_SELECTARG2: // Arg2
+ res.x = arg2->x;
+ res.y = arg2->y;
+ res.z = arg2->z;
+ break;
+ case TextureStage::STAGE_SELECTARG3: // Arg3
+ res.x = arg3->x;
+ res.y = arg3->y;
+ res.z = arg3->z;
+ break;
+ case TextureStage::STAGE_MODULATE: // Arg1 * Arg2
+ res.x = MulHigh(arg1->x, arg2->x) << 4;
+ res.y = MulHigh(arg1->y, arg2->y) << 4;
+ res.z = MulHigh(arg1->z, arg2->z) << 4;
+ break;
+ case TextureStage::STAGE_MODULATE2X: // Arg1 * Arg2 * 2
+ res.x = MulHigh(arg1->x, arg2->x) << 5;
+ res.y = MulHigh(arg1->y, arg2->y) << 5;
+ res.z = MulHigh(arg1->z, arg2->z) << 5;
+ break;
+ case TextureStage::STAGE_MODULATE4X: // Arg1 * Arg2 * 4
+ res.x = MulHigh(arg1->x, arg2->x) << 6;
+ res.y = MulHigh(arg1->y, arg2->y) << 6;
+ res.z = MulHigh(arg1->z, arg2->z) << 6;
+ break;
+ case TextureStage::STAGE_ADD: // Arg1 + Arg2
+ res.x = AddSat(arg1->x, arg2->x);
+ res.y = AddSat(arg1->y, arg2->y);
+ res.z = AddSat(arg1->z, arg2->z);
+ break;
+ case TextureStage::STAGE_ADDSIGNED: // Arg1 + Arg2 - 0.5
+ res.x = AddSat(arg1->x, arg2->x);
+ res.y = AddSat(arg1->y, arg2->y);
+ res.z = AddSat(arg1->z, arg2->z);
+
+ res.x = SubSat(res.x, Short4(0x0800));
+ res.y = SubSat(res.y, Short4(0x0800));
+ res.z = SubSat(res.z, Short4(0x0800));
+ break;
+ case TextureStage::STAGE_ADDSIGNED2X: // (Arg1 + Arg2 - 0.5) << 1
+ res.x = AddSat(arg1->x, arg2->x);
+ res.y = AddSat(arg1->y, arg2->y);
+ res.z = AddSat(arg1->z, arg2->z);
+
+ res.x = SubSat(res.x, Short4(0x0800));
+ res.y = SubSat(res.y, Short4(0x0800));
+ res.z = SubSat(res.z, Short4(0x0800));
+
+ res.x = AddSat(res.x, res.x);
+ res.y = AddSat(res.y, res.y);
+ res.z = AddSat(res.z, res.z);
+ break;
+ case TextureStage::STAGE_SUBTRACT: // Arg1 - Arg2
+ res.x = SubSat(arg1->x, arg2->x);
+ res.y = SubSat(arg1->y, arg2->y);
+ res.z = SubSat(arg1->z, arg2->z);
+ break;
+ case TextureStage::STAGE_ADDSMOOTH: // Arg1 + Arg2 - Arg1 * Arg2
+ {
+ Short4 tmp;
+
+ tmp = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(arg1->x, arg2->x); res.x = SubSat(res.x, tmp);
+ tmp = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(arg1->y, arg2->y); res.y = SubSat(res.y, tmp);
+ tmp = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(arg1->z, arg2->z); res.z = SubSat(res.z, tmp);
+ }
+ break;
+ case TextureStage::STAGE_MULTIPLYADD: // Arg3 + Arg1 * Arg2
+ res.x = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(res.x, arg3->x);
+ res.y = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(res.y, arg3->y);
+ res.z = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(res.z, arg3->z);
+ break;
+ case TextureStage::STAGE_LERP: // Arg3 * (Arg1 - Arg2) + Arg2
+ res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, arg3->x) << 4; res.x = AddSat(res.x, arg2->x);
+ res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, arg3->y) << 4; res.y = AddSat(res.y, arg2->y);
+ res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, arg3->z) << 4; res.z = AddSat(res.z, arg2->z);
+ break;
+ case TextureStage::STAGE_DOT3: // 2 * (Arg1.x - 0.5) * 2 * (Arg2.x - 0.5) + 2 * (Arg1.y - 0.5) * 2 * (Arg2.y - 0.5) + 2 * (Arg1.z - 0.5) * 2 * (Arg2.z - 0.5)
+ {
+ Short4 tmp;
+
+ res.x = SubSat(arg1->x, Short4(0x0800)); tmp = SubSat(arg2->x, Short4(0x0800)); res.x = MulHigh(res.x, tmp);
+ res.y = SubSat(arg1->y, Short4(0x0800)); tmp = SubSat(arg2->y, Short4(0x0800)); res.y = MulHigh(res.y, tmp);
+ res.z = SubSat(arg1->z, Short4(0x0800)); tmp = SubSat(arg2->z, Short4(0x0800)); res.z = MulHigh(res.z, tmp);
+
+ res.x = res.x << 6;
+ res.y = res.y << 6;
+ res.z = res.z << 6;
+
+ res.x = AddSat(res.x, res.y);
+ res.x = AddSat(res.x, res.z);
+
+ // Clamp to [0, 1]
+ res.x = Max(res.x, Short4(0x0000));
+ res.x = Min(res.x, Short4(0x1000));
+
+ res.y = res.x;
+ res.z = res.x;
+ res.w = res.x;
+ }
+ break;
+ case TextureStage::STAGE_BLENDCURRENTALPHA: // Alpha * (Arg1 - Arg2) + Arg2
+ res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, current.w) << 4; res.x = AddSat(res.x, arg2->x);
+ res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, current.w) << 4; res.y = AddSat(res.y, arg2->y);
+ res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, current.w) << 4; res.z = AddSat(res.z, arg2->z);
+ break;
+ case TextureStage::STAGE_BLENDDIFFUSEALPHA: // Alpha * (Arg1 - Arg2) + Arg2
+ res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, diffuse.w) << 4; res.x = AddSat(res.x, arg2->x);
+ res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, diffuse.w) << 4; res.y = AddSat(res.y, arg2->y);
+ res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, diffuse.w) << 4; res.z = AddSat(res.z, arg2->z);
+ break;
+ case TextureStage::STAGE_BLENDFACTORALPHA: // Alpha * (Arg1 - Arg2) + Arg2
+ res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.x = AddSat(res.x, arg2->x);
+ res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.y = AddSat(res.y, arg2->y);
+ res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.z = AddSat(res.z, arg2->z);
+ break;
+ case TextureStage::STAGE_BLENDTEXTUREALPHA: // Alpha * (Arg1 - Arg2) + Arg2
+ res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, texture.w) << 4; res.x = AddSat(res.x, arg2->x);
+ res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, texture.w) << 4; res.y = AddSat(res.y, arg2->y);
+ res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, texture.w) << 4; res.z = AddSat(res.z, arg2->z);
+ break;
+ case TextureStage::STAGE_BLENDTEXTUREALPHAPM: // Arg1 + Arg2 * (1 - Alpha)
+ res.x = SubSat(Short4(0x1000), texture.w); res.x = MulHigh(res.x, arg2->x) << 4; res.x = AddSat(res.x, arg1->x);
+ res.y = SubSat(Short4(0x1000), texture.w); res.y = MulHigh(res.y, arg2->y) << 4; res.y = AddSat(res.y, arg1->y);
+ res.z = SubSat(Short4(0x1000), texture.w); res.z = MulHigh(res.z, arg2->z) << 4; res.z = AddSat(res.z, arg1->z);
+ break;
+ case TextureStage::STAGE_PREMODULATE:
+ res.x = arg1->x;
+ res.y = arg1->y;
+ res.z = arg1->z;
+ break;
+ case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR: // Arg1 + Arg1.w * Arg2
+ res.x = MulHigh(arg1->w, arg2->x) << 4; res.x = AddSat(res.x, arg1->x);
+ res.y = MulHigh(arg1->w, arg2->y) << 4; res.y = AddSat(res.y, arg1->y);
+ res.z = MulHigh(arg1->w, arg2->z) << 4; res.z = AddSat(res.z, arg1->z);
+ break;
+ case TextureStage::STAGE_MODULATECOLOR_ADDALPHA: // Arg1 * Arg2 + Arg1.w
+ res.x = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(res.x, arg1->w);
+ res.y = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(res.y, arg1->w);
+ res.z = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(res.z, arg1->w);
+ break;
+ case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR: // (1 - Arg1.w) * Arg2 + Arg1
+ {
+ Short4 tmp;
+
+ res.x = AddSat(arg1->x, arg2->x); tmp = MulHigh(arg1->w, arg2->x) << 4; res.x = SubSat(res.x, tmp);
+ res.y = AddSat(arg1->y, arg2->y); tmp = MulHigh(arg1->w, arg2->y) << 4; res.y = SubSat(res.y, tmp);
+ res.z = AddSat(arg1->z, arg2->z); tmp = MulHigh(arg1->w, arg2->z) << 4; res.z = SubSat(res.z, tmp);
+ }
+ break;
+ case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA: // (1 - Arg1) * Arg2 + Arg1.w
+ {
+ Short4 tmp;
+
+ res.x = AddSat(arg1->w, arg2->x); tmp = MulHigh(arg1->x, arg2->x) << 4; res.x = SubSat(res.x, tmp);
+ res.y = AddSat(arg1->w, arg2->y); tmp = MulHigh(arg1->y, arg2->y) << 4; res.y = SubSat(res.y, tmp);
+ res.z = AddSat(arg1->w, arg2->z); tmp = MulHigh(arg1->z, arg2->z) << 4; res.z = SubSat(res.z, tmp);
+ }
+ break;
+ case TextureStage::STAGE_BUMPENVMAP:
+ {
+ du = Float4(texture.x) * Float4(1.0f / 0x0FE0);
+ dv = Float4(texture.y) * Float4(1.0f / 0x0FE0);
+
+ Float4 du2;
+ Float4 dv2;
+
+ du2 = du;
+ dv2 = dv;
+ du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
+ dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
+ du += dv2;
+ dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
+ du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
+ dv += du2;
+
+ perturbate = true;
+
+ res.x = current.x;
+ res.y = current.y;
+ res.z = current.z;
+ res.w = current.w;
+ }
+ break;
+ case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
+ {
+ du = Float4(texture.x) * Float4(1.0f / 0x0FE0);
+ dv = Float4(texture.y) * Float4(1.0f / 0x0FE0);
+
+ Float4 du2;
+ Float4 dv2;
+
+ du2 = du;
+ dv2 = dv;
+
+ du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
+ dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
+ du += dv2;
+ dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
+ du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
+ dv += du2;
+
+ perturbate = true;
+
+ L = texture.z;
+ L = MulHigh(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceScale4)));
+ L = L << 4;
+ L = AddSat(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceOffset4)));
+ L = Max(L, Short4(0x0000));
+ L = Min(L, Short4(0x1000));
+
+ luminance = true;
+
+ res.x = current.x;
+ res.y = current.y;
+ res.z = current.z;
+ res.w = current.w;
+ }
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ if(textureStage.stageOperation != TextureStage::STAGE_DOT3)
+ {
+ switch(textureStage.firstArgumentAlpha)
+ {
+ case TextureStage::SOURCE_TEXTURE: arg1 = &texture; break;
+ case TextureStage::SOURCE_CONSTANT: arg1 = &constant; break;
+ case TextureStage::SOURCE_CURRENT: arg1 = &current; break;
+ case TextureStage::SOURCE_DIFFUSE: arg1 = &diffuse; break;
+ case TextureStage::SOURCE_SPECULAR: arg1 = &specular; break;
+ case TextureStage::SOURCE_TEMP: arg1 = &temp; break;
+ case TextureStage::SOURCE_TFACTOR: arg1 = &tfactor; break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.secondArgumentAlpha)
+ {
+ case TextureStage::SOURCE_TEXTURE: arg2 = &texture; break;
+ case TextureStage::SOURCE_CONSTANT: arg2 = &constant; break;
+ case TextureStage::SOURCE_CURRENT: arg2 = &current; break;
+ case TextureStage::SOURCE_DIFFUSE: arg2 = &diffuse; break;
+ case TextureStage::SOURCE_SPECULAR: arg2 = &specular; break;
+ case TextureStage::SOURCE_TEMP: arg2 = &temp; break;
+ case TextureStage::SOURCE_TFACTOR: arg2 = &tfactor; break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.thirdArgumentAlpha)
+ {
+ case TextureStage::SOURCE_TEXTURE: arg3 = &texture; break;
+ case TextureStage::SOURCE_CONSTANT: arg3 = &constant; break;
+ case TextureStage::SOURCE_CURRENT: arg3 = &current; break;
+ case TextureStage::SOURCE_DIFFUSE: arg3 = &diffuse; break;
+ case TextureStage::SOURCE_SPECULAR: arg3 = &specular; break;
+ case TextureStage::SOURCE_TEMP: arg3 = &temp; break;
+ case TextureStage::SOURCE_TFACTOR: arg3 = &tfactor; break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.firstModifierAlpha) // FIXME: Check if actually used
+ {
+ case TextureStage::MODIFIER_COLOR:
+ break;
+ case TextureStage::MODIFIER_INVCOLOR:
+ mod1.w = SubSat(Short4(0x1000), arg1->w);
+
+ arg1 = &mod1;
+ break;
+ case TextureStage::MODIFIER_ALPHA:
+ // Redundant
+ break;
+ case TextureStage::MODIFIER_INVALPHA:
+ mod1.w = SubSat(Short4(0x1000), arg1->w);
+
+ arg1 = &mod1;
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.secondModifierAlpha) // FIXME: Check if actually used
+ {
+ case TextureStage::MODIFIER_COLOR:
+ break;
+ case TextureStage::MODIFIER_INVCOLOR:
+ mod2.w = SubSat(Short4(0x1000), arg2->w);
+
+ arg2 = &mod2;
+ break;
+ case TextureStage::MODIFIER_ALPHA:
+ // Redundant
+ break;
+ case TextureStage::MODIFIER_INVALPHA:
+ mod2.w = SubSat(Short4(0x1000), arg2->w);
+
+ arg2 = &mod2;
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.thirdModifierAlpha) // FIXME: Check if actually used
+ {
+ case TextureStage::MODIFIER_COLOR:
+ break;
+ case TextureStage::MODIFIER_INVCOLOR:
+ mod3.w = SubSat(Short4(0x1000), arg3->w);
+
+ arg3 = &mod3;
+ break;
+ case TextureStage::MODIFIER_ALPHA:
+ // Redundant
+ break;
+ case TextureStage::MODIFIER_INVALPHA:
+ mod3.w = SubSat(Short4(0x1000), arg3->w);
+
+ arg3 = &mod3;
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.stageOperationAlpha)
+ {
+ case TextureStage::STAGE_DISABLE:
+ break;
+ case TextureStage::STAGE_SELECTARG1: // Arg1
+ res.w = arg1->w;
+ break;
+ case TextureStage::STAGE_SELECTARG2: // Arg2
+ res.w = arg2->w;
+ break;
+ case TextureStage::STAGE_SELECTARG3: // Arg3
+ res.w = arg3->w;
+ break;
+ case TextureStage::STAGE_MODULATE: // Arg1 * Arg2
+ res.w = MulHigh(arg1->w, arg2->w) << 4;
+ break;
+ case TextureStage::STAGE_MODULATE2X: // Arg1 * Arg2 * 2
+ res.w = MulHigh(arg1->w, arg2->w) << 5;
+ break;
+ case TextureStage::STAGE_MODULATE4X: // Arg1 * Arg2 * 4
+ res.w = MulHigh(arg1->w, arg2->w) << 6;
+ break;
+ case TextureStage::STAGE_ADD: // Arg1 + Arg2
+ res.w = AddSat(arg1->w, arg2->w);
+ break;
+ case TextureStage::STAGE_ADDSIGNED: // Arg1 + Arg2 - 0.5
+ res.w = AddSat(arg1->w, arg2->w);
+ res.w = SubSat(res.w, Short4(0x0800));
+ break;
+ case TextureStage::STAGE_ADDSIGNED2X: // (Arg1 + Arg2 - 0.5) << 1
+ res.w = AddSat(arg1->w, arg2->w);
+ res.w = SubSat(res.w, Short4(0x0800));
+ res.w = AddSat(res.w, res.w);
+ break;
+ case TextureStage::STAGE_SUBTRACT: // Arg1 - Arg2
+ res.w = SubSat(arg1->w, arg2->w);
+ break;
+ case TextureStage::STAGE_ADDSMOOTH: // Arg1 + Arg2 - Arg1 * Arg2
+ {
+ Short4 tmp;
+
+ tmp = MulHigh(arg1->w, arg2->w) << 4; res.w = AddSat(arg1->w, arg2->w); res.w = SubSat(res.w, tmp);
+ }
+ break;
+ case TextureStage::STAGE_MULTIPLYADD: // Arg3 + Arg1 * Arg2
+ res.w = MulHigh(arg1->w, arg2->w) << 4; res.w = AddSat(res.w, arg3->w);
+ break;
+ case TextureStage::STAGE_LERP: // Arg3 * (Arg1 - Arg2) + Arg2
+ res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, arg3->w) << 4; res.w = AddSat(res.w, arg2->w);
+ break;
+ case TextureStage::STAGE_DOT3:
+ break; // Already computed in color channel
+ case TextureStage::STAGE_BLENDCURRENTALPHA: // Alpha * (Arg1 - Arg2) + Arg2
+ res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, current.w) << 4; res.w = AddSat(res.w, arg2->w);
+ break;
+ case TextureStage::STAGE_BLENDDIFFUSEALPHA: // Arg1 * (Alpha) + Arg2 * (1 - Alpha)
+ res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, diffuse.w) << 4; res.w = AddSat(res.w, arg2->w);
+ break;
+ case TextureStage::STAGE_BLENDFACTORALPHA:
+ res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.w = AddSat(res.w, arg2->w);
+ break;
+ case TextureStage::STAGE_BLENDTEXTUREALPHA: // Arg1 * (Alpha) + Arg2 * (1 - Alpha)
+ res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, texture.w) << 4; res.w = AddSat(res.w, arg2->w);
+ break;
+ case TextureStage::STAGE_BLENDTEXTUREALPHAPM: // Arg1 + Arg2 * (1 - Alpha)
+ res.w = SubSat(Short4(0x1000), texture.w); res.w = MulHigh(res.w, arg2->w) << 4; res.w = AddSat(res.w, arg1->w);
+ break;
+ case TextureStage::STAGE_PREMODULATE:
+ res.w = arg1->w;
+ break;
+ case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
+ case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
+ case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
+ case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
+ case TextureStage::STAGE_BUMPENVMAP:
+ case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
+ break; // Invalid alpha operations
+ default:
+ ASSERT(false);
+ }
+ }
+
+ // Clamp result to [0, 1]
+
+ switch(textureStage.stageOperation)
+ {
+ case TextureStage::STAGE_DISABLE:
+ case TextureStage::STAGE_SELECTARG1:
+ case TextureStage::STAGE_SELECTARG2:
+ case TextureStage::STAGE_SELECTARG3:
+ case TextureStage::STAGE_MODULATE:
+ case TextureStage::STAGE_MODULATE2X:
+ case TextureStage::STAGE_MODULATE4X:
+ case TextureStage::STAGE_ADD:
+ case TextureStage::STAGE_MULTIPLYADD:
+ case TextureStage::STAGE_LERP:
+ case TextureStage::STAGE_BLENDCURRENTALPHA:
+ case TextureStage::STAGE_BLENDDIFFUSEALPHA:
+ case TextureStage::STAGE_BLENDFACTORALPHA:
+ case TextureStage::STAGE_BLENDTEXTUREALPHA:
+ case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
+ case TextureStage::STAGE_DOT3: // Already clamped
+ case TextureStage::STAGE_PREMODULATE:
+ case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
+ case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
+ case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
+ case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
+ case TextureStage::STAGE_BUMPENVMAP:
+ case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
+ if(state.textureStage[stage].cantUnderflow)
+ {
+ break; // Can't go below zero
+ }
+ case TextureStage::STAGE_ADDSIGNED:
+ case TextureStage::STAGE_ADDSIGNED2X:
+ case TextureStage::STAGE_SUBTRACT:
+ case TextureStage::STAGE_ADDSMOOTH:
+ res.x = Max(res.x, Short4(0x0000));
+ res.y = Max(res.y, Short4(0x0000));
+ res.z = Max(res.z, Short4(0x0000));
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.stageOperationAlpha)
+ {
+ case TextureStage::STAGE_DISABLE:
+ case TextureStage::STAGE_SELECTARG1:
+ case TextureStage::STAGE_SELECTARG2:
+ case TextureStage::STAGE_SELECTARG3:
+ case TextureStage::STAGE_MODULATE:
+ case TextureStage::STAGE_MODULATE2X:
+ case TextureStage::STAGE_MODULATE4X:
+ case TextureStage::STAGE_ADD:
+ case TextureStage::STAGE_MULTIPLYADD:
+ case TextureStage::STAGE_LERP:
+ case TextureStage::STAGE_BLENDCURRENTALPHA:
+ case TextureStage::STAGE_BLENDDIFFUSEALPHA:
+ case TextureStage::STAGE_BLENDFACTORALPHA:
+ case TextureStage::STAGE_BLENDTEXTUREALPHA:
+ case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
+ case TextureStage::STAGE_DOT3: // Already clamped
+ case TextureStage::STAGE_PREMODULATE:
+ case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
+ case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
+ case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
+ case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
+ case TextureStage::STAGE_BUMPENVMAP:
+ case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
+ if(state.textureStage[stage].cantUnderflow)
+ {
+ break; // Can't go below zero
+ }
+ case TextureStage::STAGE_ADDSIGNED:
+ case TextureStage::STAGE_ADDSIGNED2X:
+ case TextureStage::STAGE_SUBTRACT:
+ case TextureStage::STAGE_ADDSMOOTH:
+ res.w = Max(res.w, Short4(0x0000));
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.stageOperation)
+ {
+ case TextureStage::STAGE_DISABLE:
+ case TextureStage::STAGE_SELECTARG1:
+ case TextureStage::STAGE_SELECTARG2:
+ case TextureStage::STAGE_SELECTARG3:
+ case TextureStage::STAGE_MODULATE:
+ case TextureStage::STAGE_SUBTRACT:
+ case TextureStage::STAGE_ADDSMOOTH:
+ case TextureStage::STAGE_LERP:
+ case TextureStage::STAGE_BLENDCURRENTALPHA:
+ case TextureStage::STAGE_BLENDDIFFUSEALPHA:
+ case TextureStage::STAGE_BLENDFACTORALPHA:
+ case TextureStage::STAGE_BLENDTEXTUREALPHA:
+ case TextureStage::STAGE_DOT3: // Already clamped
+ case TextureStage::STAGE_PREMODULATE:
+ case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
+ case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
+ case TextureStage::STAGE_BUMPENVMAP:
+ case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
+ break; // Can't go above one
+ case TextureStage::STAGE_MODULATE2X:
+ case TextureStage::STAGE_MODULATE4X:
+ case TextureStage::STAGE_ADD:
+ case TextureStage::STAGE_ADDSIGNED:
+ case TextureStage::STAGE_ADDSIGNED2X:
+ case TextureStage::STAGE_MULTIPLYADD:
+ case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
+ case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
+ case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
+ res.x = Min(res.x, Short4(0x1000));
+ res.y = Min(res.y, Short4(0x1000));
+ res.z = Min(res.z, Short4(0x1000));
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.stageOperationAlpha)
+ {
+ case TextureStage::STAGE_DISABLE:
+ case TextureStage::STAGE_SELECTARG1:
+ case TextureStage::STAGE_SELECTARG2:
+ case TextureStage::STAGE_SELECTARG3:
+ case TextureStage::STAGE_MODULATE:
+ case TextureStage::STAGE_SUBTRACT:
+ case TextureStage::STAGE_ADDSMOOTH:
+ case TextureStage::STAGE_LERP:
+ case TextureStage::STAGE_BLENDCURRENTALPHA:
+ case TextureStage::STAGE_BLENDDIFFUSEALPHA:
+ case TextureStage::STAGE_BLENDFACTORALPHA:
+ case TextureStage::STAGE_BLENDTEXTUREALPHA:
+ case TextureStage::STAGE_DOT3: // Already clamped
+ case TextureStage::STAGE_PREMODULATE:
+ case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
+ case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
+ case TextureStage::STAGE_BUMPENVMAP:
+ case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
+ break; // Can't go above one
+ case TextureStage::STAGE_MODULATE2X:
+ case TextureStage::STAGE_MODULATE4X:
+ case TextureStage::STAGE_ADD:
+ case TextureStage::STAGE_ADDSIGNED:
+ case TextureStage::STAGE_ADDSIGNED2X:
+ case TextureStage::STAGE_MULTIPLYADD:
+ case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
+ case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
+ case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
+ res.w = Min(res.w, Short4(0x1000));
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ switch(textureStage.destinationArgument)
+ {
+ case TextureStage::DESTINATION_CURRENT:
+ current.x = res.x;
+ current.y = res.y;
+ current.z = res.z;
+ current.w = res.w;
+ break;
+ case TextureStage::DESTINATION_TEMP:
+ temp.x = res.x;
+ temp.y = res.y;
+ temp.z = res.z;
+ temp.w = res.w;
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ // Blends the fog color into the RGB channels of 'current' in place:
+ //   current = current * fog + fogColor * (1 - fog)
+ // where 'fog' is the factor 'f' converted by convertFixed16(). The alpha
+ // channel (current.w) is not modified. Returns immediately when
+ // state.fogActive is false.
+ void PixelPipeline::fogBlend(Vector4s &current, Float4 &f)
+ {
+     if(!state.fogActive)
+     {
+         return;
+     }
+
+     if(state.pixelFogMode != FOG_NONE)
+     {
+         // NOTE(review): presumably replaces 'f' with the per-pixel fog factor - confirm against pixelFog().
+         pixelFog(f);
+     }
+
+     UShort4 fog = convertFixed16(f, true);
+
+     // Scale the color by the fog factor (MulHigh yields the high half of the product).
+     current.x = As<Short4>(MulHigh(As<UShort4>(current.x), fog));
+     current.y = As<Short4>(MulHigh(As<UShort4>(current.y), fog));
+     current.z = As<Short4>(MulHigh(As<UShort4>(current.z), fog));
+
+     UShort4 invFog = UShort4(0xFFFFu) - fog;
+
+     // Add the fog color weighted by the complementary factor.
+     current.x += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(data + OFFSET(DrawData, fog.color4[0]))));
+     current.y += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(data + OFFSET(DrawData, fog.color4[1]))));
+     current.z += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(data + OFFSET(DrawData, fog.color4[2]))));
+ }
+
+ // Adds the specular color to the RGB channels of 'current' with saturation.
+ // Alpha is left untouched. Does nothing unless state.specularAdd is set.
+ void PixelPipeline::specularPixel(Vector4s &current, Vector4s &specular)
+ {
+     if(!state.specularAdd)
+     {
+         return;
+     }
+
+     current.x = AddSat(current.x, specular.x);
+     current.y = AddSat(current.y, specular.y);
+     current.z = AddSat(current.z, specular.z);
+ }
+
+ // Samples the texture of 'stage' using the interpolated coordinate set
+ // v[2 + coordinates]. If a perturbation is pending (perturbate is set), the
+ // du/dv offsets are added to the first two coordinates and the flag is cleared.
+ Vector4s PixelPipeline::sampleTexture(int coordinates, int stage, bool project)
+ {
+     const int interpolant = 2 + coordinates;
+
+     Float4 coordX = v[interpolant].x;
+     Float4 coordY = v[interpolant].y;
+     Float4 coordZ = v[interpolant].z;
+     Float4 coordW = v[interpolant].w;
+
+     if(perturbate)
+     {
+         coordX += du;
+         coordY += dv;
+
+         // The offsets apply to a single lookup only.
+         perturbate = false;
+     }
+
+     return sampleTexture(stage, coordX, coordY, coordZ, coordW, project);
+ }
+
+ // Samples the texture of the given stage at (u, v, w) and returns the color.
+ // When 'project' is set, the three coordinates are first multiplied by the
+ // reciprocal of q. With PERF_PROFILE enabled the elapsed ticks are added to
+ // cycles[PERF_TEX].
+ Vector4s PixelPipeline::sampleTexture(int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, bool project)
+ {
+     Vector4s color;
+
+     #if PERF_PROFILE
+         Long texTime = Ticks();
+     #endif
+
+     // Passed to the sampler without being assigned here.
+     Vector4f dsx;
+     Vector4f dsy;
+
+     Pointer<Byte> textureData = data + OFFSET(DrawData, mipmap) + stage * sizeof(Texture);
+
+     if(project)
+     {
+         Float4 rq = reciprocal(q);
+
+         Float4 projectedU = u * rq;
+         Float4 projectedV = v * rq;
+         Float4 projectedW = w * rq;
+
+         color = SamplerCore(constants, state.sampler[stage]).sampleTexture(textureData, projectedU, projectedV, projectedW, q, q, dsx, dsy);
+     }
+     else
+     {
+         color = SamplerCore(constants, state.sampler[stage]).sampleTexture(textureData, u, v, w, q, q, dsx, dsy);
+     }
+
+     #if PERF_PROFILE
+         cycles[PERF_TEX] += Ticks() - texTime;
+     #endif
+
+     return color;
+ }
+
+ // Converts floating-point values to 16-bit fixed point in which 1.0 maps to
+ // 0x1000. The scaled value is rounded; no clamping is applied here.
+ Short4 PixelPipeline::convertFixed12(RValue<Float4> cf)
+ {
+     Float4 scaled = cf * Float4(0x1000);
+
+     return RoundShort4(scaled);
+ }
+
+ // Converts all four components of 'cf' to fixed point (1.0 -> 0x1000) and
+ // stores them in the matching components of 'cs'.
+ void PixelPipeline::convertFixed12(Vector4s &cs, Vector4f &cf)
+ {
+     // Color channels
+     cs.x = convertFixed12(cf.x);
+     cs.y = convertFixed12(cf.y);
+     cs.z = convertFixed12(cf.z);
+
+     // Alpha channel
+     cs.w = convertFixed12(cf.w);
+ }
+
+ // Converts fixed-point values back to floating point by multiplying with
+ // 1 / 0x0FFE (note: not 1 / 0x1000, the scale used by convertFixed12).
+ Float4 PixelPipeline::convertSigned12(Short4 &cs)
+ {
+     Float4 value = Float4(cs);
+
+     return value * Float4(1.0f / 0x0FFE);
+ }
+
+ // Converts all four fixed-point components of 'cs' to floating point and
+ // stores them in the matching components of 'cf'.
+ void PixelPipeline::convertSigned12(Vector4f &cf, Vector4s &cs)
+ {
+     // Color channels
+     cf.x = convertSigned12(cs.x);
+     cf.y = convertSigned12(cs.y);
+     cf.z = convertSigned12(cs.z);
+
+     // Alpha channel
+     cf.w = convertSigned12(cs.w);
+ }
+
+ // Stores 'd' into the register selected by dst.type / dst.index. Only the
+ // components whose bit is set in dst.mask (bit 0 = x ... bit 3 = w) are written.
+ // Constant registers and unknown register types trigger an assertion.
+ void PixelPipeline::writeDestination(Vector4s &d, const Dst &dst)
+ {
+     switch(dst.type)
+     {
+     case Shader::PARAMETER_TEMP:
+         if(dst.mask & 0x1) { rs[dst.index].x = d.x; }
+         if(dst.mask & 0x2) { rs[dst.index].y = d.y; }
+         if(dst.mask & 0x4) { rs[dst.index].z = d.z; }
+         if(dst.mask & 0x8) { rs[dst.index].w = d.w; }
+         break;
+     case Shader::PARAMETER_INPUT:
+     case Shader::PARAMETER_COLOROUT:
+         // Both register types are backed by the vs[] array.
+         if(dst.mask & 0x1) { vs[dst.index].x = d.x; }
+         if(dst.mask & 0x2) { vs[dst.index].y = d.y; }
+         if(dst.mask & 0x4) { vs[dst.index].z = d.z; }
+         if(dst.mask & 0x8) { vs[dst.index].w = d.w; }
+         break;
+     case Shader::PARAMETER_TEXTURE:
+         if(dst.mask & 0x1) { ts[dst.index].x = d.x; }
+         if(dst.mask & 0x2) { ts[dst.index].y = d.y; }
+         if(dst.mask & 0x4) { ts[dst.index].z = d.z; }
+         if(dst.mask & 0x8) { ts[dst.index].w = d.w; }
+         break;
+     case Shader::PARAMETER_CONST:
+         ASSERT(false);   // Constants cannot be written
+         break;
+     default:
+         ASSERT(false);
+     }
+ }
+
+ // Reads a source operand: selects the register named by src.type / src.index,
+ // applies the swizzle (two bits per destination component) and then the source
+ // modifier. Void and float-literal operands return rs[0] as a dummy value.
+ Vector4s PixelPipeline::fetchRegister(const Src &src)
+ {
+     const int index = src.index;
+
+     Vector4s *source;
+     Vector4s constant;
+
+     switch(src.type)
+     {
+     case Shader::PARAMETER_TEMP:
+         source = &rs[index];
+         break;
+     case Shader::PARAMETER_INPUT:
+         source = &vs[index];
+         break;
+     case Shader::PARAMETER_CONST:
+         // Constants live in the draw data; load them into a local copy.
+         constant.x = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[index][0]));
+         constant.y = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[index][1]));
+         constant.z = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[index][2]));
+         constant.w = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[index][3]));
+
+         source = &constant;
+         break;
+     case Shader::PARAMETER_TEXTURE:
+         source = &ts[index];
+         break;
+     case Shader::PARAMETER_VOID:
+         return rs[0];   // Dummy
+     case Shader::PARAMETER_FLOAT4LITERAL:
+         return rs[0];   // Dummy
+     default:
+         ASSERT(false);
+         return rs[0];
+     }
+
+     // Swizzled views of the selected register.
+     const Short4 &x = (*source)[(src.swizzle >> 0) & 0x3];
+     const Short4 &y = (*source)[(src.swizzle >> 2) & 0x3];
+     const Short4 &z = (*source)[(src.swizzle >> 4) & 0x3];
+     const Short4 &w = (*source)[(src.swizzle >> 6) & 0x3];
+
+     Vector4s result;
+
+     switch(src.modifier)
+     {
+     case Shader::MODIFIER_NONE:
+     case Shader::MODIFIER_DZ:   // Projection performed by texture sampler
+     case Shader::MODIFIER_DW:   // Projection performed by texture sampler
+         result.x = x;
+         result.y = y;
+         result.z = z;
+         result.w = w;
+         break;
+     case Shader::MODIFIER_BIAS:   // x - 0x0800
+         result.x = SubSat(x, Short4(0x0800));
+         result.y = SubSat(y, Short4(0x0800));
+         result.z = SubSat(z, Short4(0x0800));
+         result.w = SubSat(w, Short4(0x0800));
+         break;
+     case Shader::MODIFIER_BIAS_NEGATE:   // 0x0800 - x
+         result.x = SubSat(Short4(0x0800), x);
+         result.y = SubSat(Short4(0x0800), y);
+         result.z = SubSat(Short4(0x0800), z);
+         result.w = SubSat(Short4(0x0800), w);
+         break;
+     case Shader::MODIFIER_COMPLEMENT:   // 0x1000 - x
+         result.x = SubSat(Short4(0x1000), x);
+         result.y = SubSat(Short4(0x1000), y);
+         result.z = SubSat(Short4(0x1000), z);
+         result.w = SubSat(Short4(0x1000), w);
+         break;
+     case Shader::MODIFIER_NEGATE:
+         result.x = -x;
+         result.y = -y;
+         result.z = -z;
+         result.w = -w;
+         break;
+     case Shader::MODIFIER_X2:   // x + x
+         result.x = AddSat(x, x);
+         result.y = AddSat(y, y);
+         result.z = AddSat(z, z);
+         result.w = AddSat(w, w);
+         break;
+     case Shader::MODIFIER_X2_NEGATE:   // -(x + x)
+         result.x = -AddSat(x, x);
+         result.y = -AddSat(y, y);
+         result.z = -AddSat(z, z);
+         result.w = -AddSat(w, w);
+         break;
+     case Shader::MODIFIER_SIGN:   // (x - 0x0800) doubled
+         result.x = SubSat(x, Short4(0x0800));
+         result.y = SubSat(y, Short4(0x0800));
+         result.z = SubSat(z, Short4(0x0800));
+         result.w = SubSat(w, Short4(0x0800));
+
+         result.x = AddSat(result.x, result.x);
+         result.y = AddSat(result.y, result.y);
+         result.z = AddSat(result.z, result.z);
+         result.w = AddSat(result.w, result.w);
+         break;
+     case Shader::MODIFIER_SIGN_NEGATE:   // (0x0800 - x) doubled
+         result.x = SubSat(Short4(0x0800), x);
+         result.y = SubSat(Short4(0x0800), y);
+         result.z = SubSat(Short4(0x0800), z);
+         result.w = SubSat(Short4(0x0800), w);
+
+         result.x = AddSat(result.x, result.x);
+         result.y = AddSat(result.y, result.y);
+         result.z = AddSat(result.z, result.z);
+         result.w = AddSat(result.w, result.w);
+         break;
+     default:
+         ASSERT(false);
+     }
+
+     const bool doubled = (src.modifier == Shader::MODIFIER_X2 || src.modifier == Shader::MODIFIER_X2_NEGATE);
+
+     if(src.type == Shader::PARAMETER_CONST && doubled)
+     {
+         // Doubled constants are limited to [-0x1000, 0x1000].
+         result.x = Min(result.x, Short4(0x1000));
+         result.x = Max(result.x, Short4(-0x1000));
+
+         result.y = Min(result.y, Short4(0x1000));
+         result.y = Max(result.y, Short4(-0x1000));
+
+         result.z = Min(result.z, Short4(0x1000));
+         result.z = Max(result.z, Short4(-0x1000));
+
+         result.w = Min(result.w, Short4(0x1000));
+         result.w = Max(result.w, Short4(-0x1000));
+     }
+
+     return result;
+ }
+
+ // dst = src0, copied component by component (x, y, z, w).
+ void PixelPipeline::MOV(Vector4s &dst, Vector4s &src0)
+ {
+     for(int component = 0; component < 4; component++)
+     {
+         dst[component] = src0[component];
+     }
+ }
+
+ // dst = src0 + src1, using saturating addition on every component.
+ void PixelPipeline::ADD(Vector4s &dst, Vector4s &src0, Vector4s &src1)
+ {
+     for(int component = 0; component < 4; component++)
+     {
+         dst[component] = AddSat(src0[component], src1[component]);
+     }
+ }
+
+ // dst = src0 - src1, using saturating subtraction on every component.
+ void PixelPipeline::SUB(Vector4s &dst, Vector4s &src0, Vector4s &src1)
+ {
+     for(int component = 0; component < 4; component++)
+     {
+         dst[component] = SubSat(src0[component], src1[component]);
+     }
+ }
+
+ // dst = src0 * src1 + src2 in fixed point. The high half of the product is
+ // doubled four times with saturation (a saturating multiply by 16) before the
+ // saturating add of src2. Results are written straight into dst at each step.
+ void PixelPipeline::MAD(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
+ {
+     // FIXME: Long fixed-point multiply fixup
+     for(int component = 0; component < 4; component++)
+     {
+         Short4 &result = dst[component];
+
+         result = MulHigh(src0[component], src1[component]);
+
+         for(int doubling = 0; doubling < 4; doubling++)
+         {
+             result = AddSat(result, result);
+         }
+
+         result = AddSat(result, src2[component]);
+     }
+ }
+
+ // dst = src0 * src1 in fixed point. The high half of the product is doubled
+ // four times with saturation (a saturating multiply by 16). Results are
+ // written straight into dst at each step.
+ void PixelPipeline::MUL(Vector4s &dst, Vector4s &src0, Vector4s &src1)
+ {
+     // FIXME: Long fixed-point multiply fixup
+     for(int component = 0; component < 4; component++)
+     {
+         Short4 &result = dst[component];
+
+         result = MulHigh(src0[component], src1[component]);
+
+         for(int doubling = 0; doubling < 4; doubling++)
+         {
+             result = AddSat(result, result);
+         }
+     }
+ }
+
+ // Three-component fixed-point dot product of src0 and src1 (x, y, z). The
+ // saturated sum is replicated to all four components of dst.
+ void PixelPipeline::DP3(Vector4s &dst, Vector4s &src0, Vector4s &src1)
+ {
+     Short4 sum;
+     Short4 term;
+
+     // FIXME: Long fixed-point multiply fixup
+     sum = MulHigh(src0.x, src1.x);
+     for(int doubling = 0; doubling < 4; doubling++)
+     {
+         sum = AddSat(sum, sum);
+     }
+
+     term = MulHigh(src0.y, src1.y);
+     for(int doubling = 0; doubling < 4; doubling++)
+     {
+         term = AddSat(term, term);
+     }
+     sum = AddSat(sum, term);
+
+     term = MulHigh(src0.z, src1.z);
+     for(int doubling = 0; doubling < 4; doubling++)
+     {
+         term = AddSat(term, term);
+     }
+     sum = AddSat(sum, term);
+
+     dst.x = sum;
+     dst.y = sum;
+     dst.z = sum;
+     dst.w = sum;
+ }
+
+ // Four-component fixed-point dot product of src0 and src1. The saturated sum
+ // is replicated to all four components of dst.
+ void PixelPipeline::DP4(Vector4s &dst, Vector4s &src0, Vector4s &src1)
+ {
+     Short4 sum;
+     Short4 term;
+
+     // FIXME: Long fixed-point multiply fixup
+     sum = MulHigh(src0.x, src1.x);
+     for(int doubling = 0; doubling < 4; doubling++)
+     {
+         sum = AddSat(sum, sum);
+     }
+
+     term = MulHigh(src0.y, src1.y);
+     for(int doubling = 0; doubling < 4; doubling++)
+     {
+         term = AddSat(term, term);
+     }
+     sum = AddSat(sum, term);
+
+     term = MulHigh(src0.z, src1.z);
+     for(int doubling = 0; doubling < 4; doubling++)
+     {
+         term = AddSat(term, term);
+     }
+     sum = AddSat(sum, term);
+
+     term = MulHigh(src0.w, src1.w);
+     for(int doubling = 0; doubling < 4; doubling++)
+     {
+         term = AddSat(term, term);
+     }
+     sum = AddSat(sum, term);
+
+     dst.x = sum;
+     dst.y = sum;
+     dst.z = sum;
+     dst.w = sum;
+ }
+
+ // Linear interpolation in fixed point: dst = src0 * (src1 - src2) + src2.
+ // The high half of the product is doubled four times with saturation before
+ // src2 is added. Results are written straight into dst at each step.
+ void PixelPipeline::LRP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
+ {
+     // FIXME: Long fixed-point multiply fixup
+     for(int component = 0; component < 4; component++)
+     {
+         Short4 &result = dst[component];
+
+         result = SubSat(src1[component], src2[component]);
+         result = MulHigh(result, src0[component]);
+
+         for(int doubling = 0; doubling < 4; doubling++)
+         {
+             result = AddSat(result, result);
+         }
+
+         result = AddSat(result, src2[component]);
+     }
+ }
+
+ // Writes the texture coordinates (u, v, s) to dst as fixed-point values
+ // clamped to [0, 1]. A coordinate whose bit is not set in the interpolant's
+ // component mask is written as zero. dst.w is always set to 0x1000.
+ void PixelPipeline::TEXCOORD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate)
+ {
+     const int components = state.interpolant[2 + coordinate].component;
+
+     if(components & 0x01)
+     {
+         Float4 clampedU = Max(u, Float4(0.0f));
+         clampedU = Min(clampedU, Float4(1.0f));
+         dst.x = convertFixed12(clampedU);
+     }
+     else
+     {
+         dst.x = Short4(0x0000);
+     }
+
+     if(components & 0x02)
+     {
+         Float4 clampedV = Max(v, Float4(0.0f));
+         clampedV = Min(clampedV, Float4(1.0f));
+         dst.y = convertFixed12(clampedV);
+     }
+     else
+     {
+         dst.y = Short4(0x0000);
+     }
+
+     if(components & 0x04)
+     {
+         Float4 clampedS = Max(s, Float4(0.0f));
+         clampedS = Min(clampedS, Float4(1.0f));
+         dst.z = convertFixed12(clampedS);
+     }
+     else
+     {
+         dst.z = Short4(0x0000);
+     }
+
+     dst.w = Short4(0x1000);
+ }
+
+ // Writes the texture coordinates (u, v, s) to dst.xyz as fixed-point values
+ // (scaled by 0x1000 and clamped to the signed 16-bit range). With 'project'
+ // set, u and v are first multiplied by the approximate reciprocal of s.
+ // A coordinate whose bit is not set in the interpolant's component mask is
+ // written as zero. dst.w is not written.
+ void PixelPipeline::TEXCRD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate, bool project)
+ {
+     Float4 scaledU = u;
+     Float4 scaledV = v;
+     Float4 scaledS = s;
+
+     if(project)
+     {
+         scaledU *= Rcp_pp(s);
+         scaledV *= Rcp_pp(s);
+     }
+
+     const int components = state.interpolant[2 + coordinate].component;
+
+     if(components & 0x01)
+     {
+         scaledU *= Float4(0x1000);
+         scaledU = Max(scaledU, Float4(-0x8000));
+         scaledU = Min(scaledU, Float4(0x7FFF));
+         dst.x = RoundShort4(scaledU);
+     }
+     else
+     {
+         dst.x = Short4(0x0000);
+     }
+
+     if(components & 0x02)
+     {
+         scaledV *= Float4(0x1000);
+         scaledV = Max(scaledV, Float4(-0x8000));
+         scaledV = Min(scaledV, Float4(0x7FFF));
+         dst.y = RoundShort4(scaledV);
+     }
+     else
+     {
+         dst.y = Short4(0x0000);
+     }
+
+     if(components & 0x04)
+     {
+         scaledS *= Float4(0x1000);
+         scaledS = Max(scaledS, Float4(-0x8000));
+         scaledS = Min(scaledS, Float4(0x7FFF));
+         dst.z = RoundShort4(scaledS);
+     }
+     else
+     {
+         dst.z = Short4(0x0000);
+     }
+ }
+
+ // Computes the dot product of (u, v, s) with src through TEXM3X3PAD (row 0,
+ // result in u_), converts it to fixed point and replicates it to all four
+ // components of dst.
+ void PixelPipeline::TEXDP3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src)
+ {
+     TEXM3X3PAD(u, v, s, src, 0, false);
+
+     Short4 dot = convertFixed12(u_);
+
+     dst.x = dot;
+     dst.y = dot;
+     dst.z = dot;
+     dst.w = dot;
+ }
+
+ // Computes row 0 of the matrix product via TEXM3X3PAD (result in the member
+ // u_), zeroes the members v_ and w_, and samples the texture of 'stage' at
+ // (u_, 0, 0) into dst.
+ void PixelPipeline::TEXDP3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0)
+ {
+ TEXM3X3PAD(u, v, s, src0, 0, false);
+
+ // Only the first coordinate carries the dot product; the others are zero.
+ v_ = Float4(0.0f);
+ w_ = Float4(0.0f);
+
+ dst = sampleTexture(stage, u_, v_, w_, w_);
+ }
+
+ // Clears the coverage bits of every pixel for which any of u, v or s is not
+ // greater than or equal to zero. The resulting mask is ANDed into cMask for
+ // each of the state.multiSample samples.
+ void PixelPipeline::TEXKILL(Int cMask[4], Float4 &u, Float4 &v, Float4 &s)
+ {
+     // A pixel survives only if all three coordinates pass the >= 0 test.
+     Int kill = SignMask(CmpNLT(u, Float4(0.0f))) &
+                SignMask(CmpNLT(v, Float4(0.0f))) &
+                SignMask(CmpNLT(s, Float4(0.0f)));
+
+     unsigned int sample = 0;
+
+     while(sample < state.multiSample)
+     {
+         cMask[sample] &= kill;
+         sample++;
+     }
+ }
+
+ // Fixed-point variant: ORs src.x, src.y and src.z so the sign bit is set when
+ // any of them is negative, then clears the coverage bits of those pixels. The
+ // resulting mask is ANDed into cMask for each of the state.multiSample samples.
+ void PixelPipeline::TEXKILL(Int cMask[4], Vector4s &src)
+ {
+     Short4 anyNegative = src.x | src.y | src.z;
+
+     // Invert the low four sign bits: set bits mark pixels that are kept.
+     Int kill = SignMask(PackSigned(anyNegative, anyNegative)) ^ 0x0000000F;
+
+     unsigned int sample = 0;
+
+     while(sample < state.multiSample)
+     {
+         cMask[sample] &= kill;
+         sample++;
+     }
+ }
+
+ // Samples 'sampler' at (u, v, s) into dst. The third coordinate is also
+ // passed as the projection divisor used when 'project' is set.
+ void PixelPipeline::TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int sampler, bool project)
+ {
+     Float4 &divisor = s;
+
+     dst = sampleTexture(sampler, u, v, s, divisor, project);
+ }
+
+ // Samples 'sampler' using the x, y and z components of the fixed-point
+ // register 'src' (converted to floating point) as texture coordinates.
+ void PixelPipeline::TEXLD(Vector4s &dst, Vector4s &src, int sampler, bool project)
+ {
+     Float4 coordU = convertSigned12(src.x);
+     Float4 coordV = convertSigned12(src.y);
+     Float4 coordS = convertSigned12(src.z);
+
+     dst = sampleTexture(sampler, coordU, coordV, coordS, coordS, project);
+ }
+
+ // Bump environment-map lookup: transforms (src.x, src.y) by the stage's 2x2
+ // bump-map matrix, adds the result to (u, v) and samples the texture of
+ // 'stage' at the perturbed coordinates.
+ //   offsetU = src.x * M[0][0] + src.y * M[1][0]
+ //   offsetV = src.y * M[1][1] + src.x * M[0][1]
+ void PixelPipeline::TEXBEM(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage)
+ {
+     Float4 offsetU = convertSigned12(src.x);
+     Float4 offsetV = convertSigned12(src.y);
+
+     // Unscaled copies for the cross terms.
+     Float4 crossU = offsetU;
+     Float4 crossV = offsetV;
+
+     offsetU *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
+     crossV *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
+     offsetU += crossV;
+
+     offsetV *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
+     crossU *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
+     offsetV += crossU;
+
+     Float4 perturbedU = u + offsetU;
+     Float4 perturbedV = v + offsetV;
+
+     dst = sampleTexture(stage, perturbedU, perturbedV, s, s);
+ }
+
+ // Bump environment-map lookup with luminance: performs the same perturbed
+ // lookup as TEXBEM, then scales dst.xyz by a luminance factor
+ //   L = clamp(src.z * luminanceScale + luminanceOffset, 0, 0x1000).
+ // dst.w is left as sampled.
+ void PixelPipeline::TEXBEML(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage)
+ {
+     Float4 offsetU = convertSigned12(src.x);
+     Float4 offsetV = convertSigned12(src.y);
+
+     // Unscaled copies for the cross terms.
+     Float4 crossU = offsetU;
+     Float4 crossV = offsetV;
+
+     offsetU *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
+     crossV *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
+     offsetU += crossV;
+
+     offsetV *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
+     crossU *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
+     offsetV += crossU;
+
+     Float4 perturbedU = u + offsetU;
+     Float4 perturbedV = v + offsetV;
+
+     dst = sampleTexture(stage, perturbedU, perturbedV, s, s);
+
+     // Luminance factor, kept in a local (the class member L is not touched).
+     Short4 luminanceFactor;
+
+     luminanceFactor = src.z;
+     luminanceFactor = MulHigh(luminanceFactor, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceScale4)));
+     luminanceFactor = luminanceFactor << 4;
+     luminanceFactor = AddSat(luminanceFactor, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceOffset4)));
+     luminanceFactor = Max(luminanceFactor, Short4(0x0000));
+     luminanceFactor = Min(luminanceFactor, Short4(0x1000));
+
+     dst.x = MulHigh(dst.x, luminanceFactor);
+     dst.x = dst.x << 4;
+
+     dst.y = MulHigh(dst.y, luminanceFactor);
+     dst.y = dst.y << 4;
+
+     dst.z = MulHigh(dst.z, luminanceFactor);
+     dst.z = dst.z << 4;
+ }
+
+ // Samples the texture of 'stage' using src0.w (alpha) and src0.x (red) as the
+ // first two coordinates; src0.z supplies the third.
+ void PixelPipeline::TEXREG2AR(Vector4s &dst, Vector4s &src0, int stage)
+ {
+     Float4 coordU = convertSigned12(src0.w);
+     Float4 coordV = convertSigned12(src0.x);
+     Float4 coordS = convertSigned12(src0.z);
+
+     dst = sampleTexture(stage, coordU, coordV, coordS, coordS);
+ }
+
+ // Samples the texture of 'stage' using src0.y (green) and src0.z (blue) as the
+ // first two coordinates; the third coordinate is a copy of the second.
+ void PixelPipeline::TEXREG2GB(Vector4s &dst, Vector4s &src0, int stage)
+ {
+     Float4 coordU = convertSigned12(src0.y);
+     Float4 coordV = convertSigned12(src0.z);
+     Float4 coordS = coordV;
+
+     dst = sampleTexture(stage, coordU, coordV, coordS, coordS);
+ }
+
+ // Samples the texture of 'stage' using src0.x, src0.y and src0.z (red, green,
+ // blue) as the three texture coordinates.
+ void PixelPipeline::TEXREG2RGB(Vector4s &dst, Vector4s &src0, int stage)
+ {
+     Float4 coordU = convertSigned12(src0.x);
+     Float4 coordV = convertSigned12(src0.y);
+     Float4 coordS = convertSigned12(src0.z);
+
+     dst = sampleTexture(stage, coordU, coordV, coordS, coordS);
+ }
+
+ // Computes row 1 of the matrix product via TEXM3X2PAD (result in the member
+ // v_), divides the member u_ by it using an approximate reciprocal, and
+ // stores the quotient in oDepth. 'dst' is not written by this function.
+ // NOTE(review): u_ is expected to hold the row 0 result from an earlier
+ // TEXM3X2PAD call - confirm against the caller.
+ void PixelPipeline::TEXM3X2DEPTH(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src, bool signedScaling)
+ {
+ TEXM3X2PAD(u, v, s, src, 1, signedScaling);
+
+ // z / w
+ u_ *= Rcp_pp(v_); // FIXME: Set result to 1.0 when division by zero
+
+ oDepth = u_;
+ }
+
+ // Computes one row of a 3x2 matrix product. Forwards all arguments unchanged
+ // to TEXM3X3PAD, which stores the result in u_, v_ or w_ depending on 'component'.
+ void PixelPipeline::TEXM3X2PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling)
+ {
+ TEXM3X3PAD(u, v, s, src0, component, signedScaling);
+ }
+
+ // Computes row 1 of the matrix product via TEXM3X2PAD (result in the member
+ // v_), zeroes the member w_, and samples the texture of 'stage' at
+ // (u_, v_, 0) into dst.
+ // NOTE(review): u_ is expected to hold the row 0 result from an earlier
+ // TEXM3X2PAD call - confirm against the caller.
+ void PixelPipeline::TEXM3X2TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling)
+ {
+ TEXM3X2PAD(u, v, s, src0, 1, signedScaling);
+
+ // Third coordinate is unused for a 3x2 product.
+ w_ = Float4(0.0f);
+
+ dst = sampleTexture(stage, u_, v_, w_, w_);
+ }
+
+ // Computes row 2 of the matrix product via TEXM3X3PAD (result in w_) and
+ // writes (u_, v_, w_) to dst.xyz in fixed point. dst.w is set to 0x1000.
+ void PixelPipeline::TEXM3X3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, bool signedScaling)
+ {
+     TEXM3X3PAD(u, v, s, src0, 2, signedScaling);
+
+     dst.x = convertFixed12(u_);
+     dst.y = convertFixed12(v_);
+     dst.z = convertFixed12(w_);
+     dst.w = Short4(0x1000);
+ }
+
+ // Computes one row of a matrix product: the dot product of (u, v, s) with
+ // src0.xyz, scaled by 1 / 0x1000, stored in u_, v_ or w_ for component 0, 1
+ // or 2. The float copies of src0 (U, V, W) are refreshed for component 0 or
+ // whenever 'signedScaling' differs from the previous call.
+ void PixelPipeline::TEXM3X3PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling)
+ {
+     const bool reload = (component == 0) || (previousScaling != signedScaling);   // FIXME: Other source modifiers?
+
+     if(reload)
+     {
+         U = Float4(src0.x);
+         V = Float4(src0.y);
+         W = Float4(src0.z);
+
+         previousScaling = signedScaling;
+     }
+
+     Float4 dot = U * u + V * v + W * s;
+
+     dot *= Float4(1.0f / 0x1000);
+
+     if(component == 0)
+     {
+         u_ = dot;
+     }
+     else if(component == 1)
+     {
+         v_ = dot;
+     }
+     else if(component == 2)
+     {
+         w_ = dot;
+     }
+     else
+     {
+         ASSERT(false);
+     }
+ }
+
+ // Computes row 2 of the matrix product via TEXM3X3PAD, treats (u_, v_, w_) as
+ // a normal N, reflects the eye vector E (taken from src1.xyz, converted to
+ // floating point) about it, and samples the texture of 'stage' with the
+ // reflection vector. The members u_, v_ and w_ are overwritten in the process.
+ void PixelPipeline::TEXM3X3SPEC(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, Vector4s &src1)
+ {
+ TEXM3X3PAD(u, v, s, src0, 2, false);
+
+ Float4 E[3]; // Eye vector
+
+ E[0] = Float4(src1.x) * Float4(1.0f / 0x0FFE);
+ E[1] = Float4(src1.y) * Float4(1.0f / 0x0FFE);
+ E[2] = Float4(src1.z) * Float4(1.0f / 0x0FFE);
+
+ // Reflection
+ Float4 u__;
+ Float4 v__;
+ Float4 w__;
+
+ // (u'', v'', w'') = 2 * (N . E) * N - E * (N . N)
+ // u__ accumulates N . E
+ u__ = u_ * E[0];
+ v__ = v_ * E[1];
+ w__ = w_ * E[2];
+ u__ += v__ + w__;
+ // Double it and broadcast to all three components: 2 * (N . E)
+ u__ += u__;
+ v__ = u__;
+ w__ = u__;
+ // Multiply by N: 2 * (N . E) * N
+ u__ *= u_;
+ v__ *= v_;
+ w__ *= w_;
+ // u_ becomes N . N (v_ and w_ are left holding their squares)
+ u_ *= u_;
+ v_ *= v_;
+ w_ *= w_;
+ u_ += v_ + w_;
+ // Subtract E * (N . N)
+ u__ -= E[0] * u_;
+ v__ -= E[1] * u_;
+ w__ -= E[2] * u_;
+
+ dst = sampleTexture(stage, u__, v__, w__, w__);
+ }
+
+ // Computes row 2 of the matrix product via TEXM3X3PAD (result in the member
+ // w_) and samples the texture of 'stage' at (u_, v_, w_) into dst.
+ // NOTE(review): u_ and v_ are expected to hold the row 0 and row 1 results
+ // from earlier TEXM3X3PAD calls - confirm against the caller.
+ void PixelPipeline::TEXM3X3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling)
+ {
+ TEXM3X3PAD(u, v, s, src0, 2, signedScaling);
+
+ dst = sampleTexture(stage, u_, v_, w_, w_);
+ }
+
+ // Same reflection lookup as TEXM3X3SPEC, except that the eye vector E is
+ // assembled from the w components of the interpolants for stages
+ // stage - 2, stage - 1 and stage. The members u_, v_ and w_ are overwritten
+ // in the process.
+ void PixelPipeline::TEXM3X3VSPEC(Vector4s &dst, Float4 &x, Float4 &y, Float4 &z, int stage, Vector4s &src0)
+ {
+ TEXM3X3PAD(x, y, z, src0, 2, false);
+
+ Float4 E[3]; // Eye vector
+
+ // 'v' here is the interpolant array member, not a texture coordinate.
+ E[0] = v[2 + stage - 2].w;
+ E[1] = v[2 + stage - 1].w;
+ E[2] = v[2 + stage - 0].w;
+
+ // Reflection
+ Float4 u__;
+ Float4 v__;
+ Float4 w__;
+
+ // (u'', v'', w'') = 2 * (N . E) * N - E * (N . N)
+ // u__ accumulates N . E
+ u__ = u_ * E[0];
+ v__ = v_ * E[1];
+ w__ = w_ * E[2];
+ u__ += v__ + w__;
+ // Double it and broadcast to all three components: 2 * (N . E)
+ u__ += u__;
+ v__ = u__;
+ w__ = u__;
+ // Multiply by N: 2 * (N . E) * N
+ u__ *= u_;
+ v__ *= v_;
+ w__ *= w_;
+ // u_ becomes N . N (v_ and w_ are left holding their squares)
+ u_ *= u_;
+ v_ *= v_;
+ w_ *= w_;
+ u_ += v_ + w_;
+ // Subtract E * (N . N)
+ u__ -= E[0] * u_;
+ v__ -= E[1] * u_;
+ w__ -= E[2] * u_;
+
+ dst = sampleTexture(stage, u__, v__, w__, w__);
+ }
+
+ // Sets oDepth to rs[5].x / rs[5].y, computed with an approximate reciprocal.
+ // The members u_ and v_ are used as scratch storage and are overwritten.
+ void PixelPipeline::TEXDEPTH()
+ {
+ u_ = Float4(rs[5].x);
+ v_ = Float4(rs[5].y);
+
+ // z / w
+ u_ *= Rcp_pp(v_); // FIXME: Set result to 1.0 when division by zero
+
+ oDepth = u_;
+ }
+
+ // Conditional select per component: dst = (src0 > 0x0800) ? src1 : src2.
+ // The comparison yields an all-ones / all-zeros lane mask, which is used to
+ // blend the two candidates with bitwise operations.
+ void PixelPipeline::CND(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
+ {
+     for(int component = 0; component < 4; component++)
+     {
+         Short4 mask;
+         mask = src0[component];
+         mask = CmpGT(mask, Short4(0x0800));
+
+         Short4 whenTrue;
+         whenTrue = src1[component];
+         whenTrue = whenTrue & mask;
+
+         Short4 selected = ~mask & src2[component];
+         selected = selected | whenTrue;
+
+         dst[component] = selected;
+     }
+ }
+
+ // Compare-and-select per component: dst = (src0 < 0) ? src2 : src1.
+ // The comparison yields an all-ones / all-zeros lane mask, which is used to
+ // blend the two candidates with bitwise operations.
+ void PixelPipeline::CMP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
+ {
+     for(int component = 0; component < 4; component++)
+     {
+         Short4 negative = CmpGT(Short4(0x0000), src0[component]);
+
+         Short4 whenNegative;
+         whenNegative = src2[component];
+         whenNegative &= negative;
+
+         Short4 selected = ~negative & src1[component];
+         selected |= whenNegative;
+
+         dst[component] = selected;
+     }
+ }
+
+ // Fixed-point bump environment-map transform. Only dst.x and dst.y are written:
+ //   dst.x = src0.x + BUMPENVMAT00(stage) * src1.x + BUMPENVMAT10(stage) * src1.y
+ //   dst.y = src0.y + BUMPENVMAT01(stage) * src1.x + BUMPENVMAT11(stage) * src1.y
+ void PixelPipeline::BEM(Vector4s &dst, Vector4s &src0, Vector4s &src1, int stage)
+ {
+     // Column 0 of the matrix produces dst.x, column 1 produces dst.y.
+     for(int column = 0; column < 2; column++)
+     {
+         Short4 sum;
+         Short4 term;
+
+         sum = MulHigh(src1.x, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[0][column])));
+         sum = sum << 4;   // FIXME: Matrix components range? Overflow hazard.
+
+         term = MulHigh(src1.y, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[1][column])));
+         term = term << 4;   // FIXME: Matrix components range? Overflow hazard.
+
+         sum = AddSat(sum, term);
+         sum = AddSat(sum, src0[column]);
+
+         dst[column] = sum;
+     }
+ }
+}
+
diff --git a/src/Pipeline/PixelPipeline.hpp b/src/Pipeline/PixelPipeline.hpp
new file mode 100644
index 0000000..66f0ec7
--- /dev/null
+++ b/src/Pipeline/PixelPipeline.hpp
@@ -0,0 +1,114 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_PixelPipeline_hpp
+#define sw_PixelPipeline_hpp
+
+#include "PixelRoutine.hpp"
+
+namespace sw
+{
+ class PixelPipeline : public PixelRoutine // Pixel routine whose registers and instructions operate on Vector4s values
+ {
+ public:
+ PixelPipeline(const PixelProcessor::State &state, const PixelShader *shader) :
+ PixelRoutine(state, shader), current(rs[0]), diffuse(vs[0]), specular(vs[1]), perturbate(false), luminance(false), previousScaling(false) {}
+ virtual ~PixelPipeline() {}
+
+ protected:
+ virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w);
+ virtual void applyShader(Int cMask[4]);
+ virtual Bool alphaTest(Int cMask[4]);
+ virtual void rasterOperation(Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]);
+
+ private:
+ Vector4s &current; // Alias of rs[0], bound in the constructor
+ Vector4s &diffuse; // Alias of vs[0], bound in the constructor
+ Vector4s &specular; // Alias of vs[1], bound in the constructor
+
+ Vector4s rs[6]; // rs[0] doubles as 'current'
+ Vector4s vs[2]; // vs[0] is 'diffuse', vs[1] is 'specular'
+ Vector4s ts[6];
+
+ // bem(l) offsets and luminance
+ Float4 du;
+ Float4 dv;
+ Short4 L;
+
+ // texm3x3 temporaries
+ Float4 u_; // FIXME
+ Float4 v_; // FIXME
+ Float4 w_; // FIXME
+ Float4 U; // FIXME
+ Float4 V; // FIXME
+ Float4 W; // FIXME
+
+ void fixedFunction();
+ void blendTexture(Vector4s &temp, Vector4s &texture, int stage);
+ void fogBlend(Vector4s &current, Float4 &fog);
+ void specularPixel(Vector4s &current, Vector4s &specular);
+
+ Vector4s sampleTexture(int coordinates, int sampler, bool project = false);
+ Vector4s sampleTexture(int sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, bool project = false);
+
+ Short4 convertFixed12(RValue<Float4> cf);
+ void convertFixed12(Vector4s &cs, Vector4f &cf);
+ Float4 convertSigned12(Short4 &cs);
+ void convertSigned12(Vector4f &cf, Vector4s &cs);
+
+ void writeDestination(Vector4s &d, const Dst &dst);
+ Vector4s fetchRegister(const Src &src);
+
+ // Instructions
+ void MOV(Vector4s &dst, Vector4s &src0);
+ void ADD(Vector4s &dst, Vector4s &src0, Vector4s &src1);
+ void SUB(Vector4s &dst, Vector4s &src0, Vector4s &src1);
+ void MAD(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2);
+ void MUL(Vector4s &dst, Vector4s &src0, Vector4s &src1);
+ void DP3(Vector4s &dst, Vector4s &src0, Vector4s &src1);
+ void DP4(Vector4s &dst, Vector4s &src0, Vector4s &src1);
+ void LRP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2);
+ void TEXCOORD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate);
+ void TEXCRD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate, bool project);
+ void TEXDP3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src);
+ void TEXDP3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0);
+ void TEXKILL(Int cMask[4], Float4 &u, Float4 &v, Float4 &s);
+ void TEXKILL(Int cMask[4], Vector4s &dst);
+ void TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, bool project);
+ void TEXLD(Vector4s &dst, Vector4s &src, int stage, bool project);
+ void TEXBEM(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage);
+ void TEXBEML(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage);
+ void TEXREG2AR(Vector4s &dst, Vector4s &src0, int stage);
+ void TEXREG2GB(Vector4s &dst, Vector4s &src0, int stage);
+ void TEXREG2RGB(Vector4s &dst, Vector4s &src0, int stage);
+ void TEXM3X2DEPTH(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src, bool signedScaling);
+ void TEXM3X2PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling);
+ void TEXM3X2TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling);
+ void TEXM3X3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, bool signedScaling);
+ void TEXM3X3PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling);
+ void TEXM3X3SPEC(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, Vector4s &src1);
+ void TEXM3X3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling);
+ void TEXM3X3VSPEC(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0);
+ void TEXDEPTH();
+ void CND(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2);
+ void CMP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2);
+ void BEM(Vector4s &dst, Vector4s &src0, Vector4s &src1, int stage);
+
+ bool perturbate; // Initialized to false by the constructor
+ bool luminance; // Initialized to false by the constructor
+ bool previousScaling; // Initialized to false by the constructor
+ };
+}
+
+#endif
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
new file mode 100644
index 0000000..473712b
--- /dev/null
+++ b/src/Pipeline/PixelProgram.cpp
@@ -0,0 +1,1850 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "PixelProgram.hpp"
+
+#include "SamplerCore.hpp"
+#include "Renderer/Primitive.hpp"
+#include "Renderer/Renderer.hpp"
+
+namespace sw
+{
+ extern bool postBlendSRGB; // Read by rasterOperation(): when false, writeSRGB colors for non-sRGB targets are converted to sRGB before blending
+ extern bool booleanFaceRegister; // Read by setBuiltins(): when true, vFace holds a comparison mask instead of the primitive's area value
+ extern bool halfIntegerCoordinates; // Pixel centers are not at integer coordinates
+ extern bool fullPixelPositionRegister; // Read by setBuiltins(): when true, vPos.z and vPos.w are filled in as well
+
+ void PixelProgram::setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w)
+ {
+ // vPos and vFace are only set up for shader model 3.0 and above.
+ if(shader->getShaderModel() < 0x0300)
+ {
+ return;
+ }
+
+ if(shader->isVPosDeclared())
+ {
+ // Lane offsets (0,1,0,1) in x and (0,0,1,1) in y cover a 2x2 block; 'center' shifts them by half a pixel.
+ const float center = halfIntegerCoordinates ? 0.5f : 0.0f;
+
+ vPos.x = Float4(Float(x)) + Float4(center, 1.0f + center, center, 1.0f + center);
+ vPos.y = Float4(Float(y)) + Float4(center, center, 1.0f + center, 1.0f + center);
+
+ if(fullPixelPositionRegister)
+ {
+ vPos.z = z[0]; // FIXME: Centroid?
+ vPos.w = w; // FIXME: Centroid?
+ }
+ }
+
+ if(shader->isVFaceDeclared())
+ {
+ Float4 facing = *Pointer<Float>(primitive + OFFSET(Primitive, area));
+
+ if(booleanFaceRegister)
+ {
+ // Replace the area by a comparison mask; the compare direction follows state.frontFaceCCW.
+ Int4 mask = state.frontFaceCCW ? CmpNLT(facing, Float4(0.0f)) : CmpLT(facing, Float4(0.0f));
+ facing = As<Float4>(mask);
+ }
+
+ vFace.x = facing;
+ vFace.y = facing;
+ vFace.z = facing;
+ vFace.w = facing;
+ }
+ }
+
+ void PixelProgram::applyShader(Int cMask[4]) // Emits code for every shader instruction, then copies the color outputs into c[] and clamps them
+ {
+ enableIndex = 0;
+ stackIndex = 0;
+
+ if(shader->containsLeaveInstruction())
+ {
+ enableLeave = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF); // Start with all four lanes enabled in the leave mask
+ }
+
+ for(int i = 0; i < RENDERTARGETS; i++) // Zero the outputs of render targets that have a format
+ {
+ if(state.targetFormat[i] != FORMAT_NULL)
+ {
+ oC[i] = Vector4f(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ }
+
+ // Create all call site return blocks up front
+ for(size_t i = 0; i < shader->getLength(); i++)
+ {
+ const Shader::Instruction *instruction = shader->getInstruction(i);
+ Shader::Opcode opcode = instruction->opcode;
+
+ if(opcode == Shader::OPCODE_CALL || opcode == Shader::OPCODE_CALLNZ)
+ {
+ const Dst &dst = instruction->dst;
+
+ ASSERT(callRetBlock[dst.label].size() == dst.callSite); // Call sites of a label are expected to be numbered in instruction order
+ callRetBlock[dst.label].push_back(Nucleus::createBasicBlock());
+ }
+ }
+
+ bool broadcastColor0 = true; // Stays true only while every color write targets oC[0] through a non-relative index
+
+ for(size_t i = 0; i < shader->getLength(); i++)
+ {
+ const Shader::Instruction *instruction = shader->getInstruction(i);
+ Shader::Opcode opcode = instruction->opcode;
+
+ if(opcode == Shader::OPCODE_DCL || opcode == Shader::OPCODE_DEF || opcode == Shader::OPCODE_DEFI || opcode == Shader::OPCODE_DEFB)
+ {
+ continue; // Declarations and constant definitions are skipped here
+ }
+
+ const Dst &dst = instruction->dst;
+ const Src &src0 = instruction->src[0];
+ const Src &src1 = instruction->src[1];
+ const Src &src2 = instruction->src[2];
+ const Src &src3 = instruction->src[3];
+ const Src &src4 = instruction->src[4];
+
+ bool predicate = instruction->predicate;
+ Control control = instruction->control;
+ bool pp = dst.partialPrecision;
+ bool project = instruction->project;
+ bool bias = instruction->bias;
+
+ Vector4f d; // Result of this instruction, written back to the destination register below
+ Vector4f s0;
+ Vector4f s1;
+ Vector4f s2;
+ Vector4f s3;
+ Vector4f s4;
+
+ if(opcode == Shader::OPCODE_TEXKILL) // Takes destination as input
+ {
+ if(dst.type == Shader::PARAMETER_TEXTURE)
+ {
+ d.x = v[2 + dst.index].x;
+ d.y = v[2 + dst.index].y;
+ d.z = v[2 + dst.index].z;
+ d.w = v[2 + dst.index].w;
+ }
+ else
+ {
+ d = r[dst.index];
+ }
+ }
+
+ if(src0.type != Shader::PARAMETER_VOID) s0 = fetchRegister(src0);
+ if(src1.type != Shader::PARAMETER_VOID) s1 = fetchRegister(src1);
+ if(src2.type != Shader::PARAMETER_VOID) s2 = fetchRegister(src2);
+ if(src3.type != Shader::PARAMETER_VOID) s3 = fetchRegister(src3);
+ if(src4.type != Shader::PARAMETER_VOID) s4 = fetchRegister(src4);
+
+ switch(opcode) // Dispatch to the routine implementing the opcode; some take the raw Src operand (src0/src1) instead of the fetched value
+ {
+ case Shader::OPCODE_PS_2_0: break;
+ case Shader::OPCODE_PS_2_x: break;
+ case Shader::OPCODE_PS_3_0: break;
+ case Shader::OPCODE_DEF: break;
+ case Shader::OPCODE_DCL: break;
+ case Shader::OPCODE_NOP: break;
+ case Shader::OPCODE_MOV: mov(d, s0); break;
+ case Shader::OPCODE_NEG: neg(d, s0); break;
+ case Shader::OPCODE_INEG: ineg(d, s0); break;
+ case Shader::OPCODE_F2B: f2b(d, s0); break;
+ case Shader::OPCODE_B2F: b2f(d, s0); break;
+ case Shader::OPCODE_F2I: f2i(d, s0); break;
+ case Shader::OPCODE_I2F: i2f(d, s0); break;
+ case Shader::OPCODE_F2U: f2u(d, s0); break;
+ case Shader::OPCODE_U2F: u2f(d, s0); break;
+ case Shader::OPCODE_I2B: i2b(d, s0); break;
+ case Shader::OPCODE_B2I: b2i(d, s0); break;
+ case Shader::OPCODE_ADD: add(d, s0, s1); break;
+ case Shader::OPCODE_IADD: iadd(d, s0, s1); break;
+ case Shader::OPCODE_SUB: sub(d, s0, s1); break;
+ case Shader::OPCODE_ISUB: isub(d, s0, s1); break;
+ case Shader::OPCODE_MUL: mul(d, s0, s1); break;
+ case Shader::OPCODE_IMUL: imul(d, s0, s1); break;
+ case Shader::OPCODE_MAD: mad(d, s0, s1, s2); break;
+ case Shader::OPCODE_IMAD: imad(d, s0, s1, s2); break;
+ case Shader::OPCODE_DP1: dp1(d, s0, s1); break;
+ case Shader::OPCODE_DP2: dp2(d, s0, s1); break;
+ case Shader::OPCODE_DP2ADD: dp2add(d, s0, s1, s2); break;
+ case Shader::OPCODE_DP3: dp3(d, s0, s1); break;
+ case Shader::OPCODE_DP4: dp4(d, s0, s1); break;
+ case Shader::OPCODE_DET2: det2(d, s0, s1); break;
+ case Shader::OPCODE_DET3: det3(d, s0, s1, s2); break;
+ case Shader::OPCODE_DET4: det4(d, s0, s1, s2, s3); break;
+ case Shader::OPCODE_CMP0: cmp0(d, s0, s1, s2); break;
+ case Shader::OPCODE_ICMP: icmp(d, s0, s1, control); break;
+ case Shader::OPCODE_UCMP: ucmp(d, s0, s1, control); break;
+ case Shader::OPCODE_SELECT: select(d, s0, s1, s2); break;
+ case Shader::OPCODE_EXTRACT: extract(d.x, s0, s1.x); break;
+ case Shader::OPCODE_INSERT: insert(d, s0, s1.x, s2.x); break;
+ case Shader::OPCODE_FRC: frc(d, s0); break;
+ case Shader::OPCODE_TRUNC: trunc(d, s0); break;
+ case Shader::OPCODE_FLOOR: floor(d, s0); break;
+ case Shader::OPCODE_ROUND: round(d, s0); break;
+ case Shader::OPCODE_ROUNDEVEN: roundEven(d, s0); break;
+ case Shader::OPCODE_CEIL: ceil(d, s0); break;
+ case Shader::OPCODE_EXP2X: exp2x(d, s0, pp); break;
+ case Shader::OPCODE_EXP2: exp2(d, s0, pp); break;
+ case Shader::OPCODE_LOG2X: log2x(d, s0, pp); break;
+ case Shader::OPCODE_LOG2: log2(d, s0, pp); break;
+ case Shader::OPCODE_EXP: exp(d, s0, pp); break;
+ case Shader::OPCODE_LOG: log(d, s0, pp); break;
+ case Shader::OPCODE_RCPX: rcpx(d, s0, pp); break;
+ case Shader::OPCODE_DIV: div(d, s0, s1); break;
+ case Shader::OPCODE_IDIV: idiv(d, s0, s1); break;
+ case Shader::OPCODE_UDIV: udiv(d, s0, s1); break;
+ case Shader::OPCODE_MOD: mod(d, s0, s1); break;
+ case Shader::OPCODE_IMOD: imod(d, s0, s1); break;
+ case Shader::OPCODE_UMOD: umod(d, s0, s1); break;
+ case Shader::OPCODE_SHL: shl(d, s0, s1); break;
+ case Shader::OPCODE_ISHR: ishr(d, s0, s1); break;
+ case Shader::OPCODE_USHR: ushr(d, s0, s1); break;
+ case Shader::OPCODE_RSQX: rsqx(d, s0, pp); break;
+ case Shader::OPCODE_SQRT: sqrt(d, s0, pp); break;
+ case Shader::OPCODE_RSQ: rsq(d, s0, pp); break;
+ case Shader::OPCODE_LEN2: len2(d.x, s0, pp); break;
+ case Shader::OPCODE_LEN3: len3(d.x, s0, pp); break;
+ case Shader::OPCODE_LEN4: len4(d.x, s0, pp); break;
+ case Shader::OPCODE_DIST1: dist1(d.x, s0, s1, pp); break;
+ case Shader::OPCODE_DIST2: dist2(d.x, s0, s1, pp); break;
+ case Shader::OPCODE_DIST3: dist3(d.x, s0, s1, pp); break;
+ case Shader::OPCODE_DIST4: dist4(d.x, s0, s1, pp); break;
+ case Shader::OPCODE_MIN: min(d, s0, s1); break;
+ case Shader::OPCODE_IMIN: imin(d, s0, s1); break;
+ case Shader::OPCODE_UMIN: umin(d, s0, s1); break;
+ case Shader::OPCODE_MAX: max(d, s0, s1); break;
+ case Shader::OPCODE_IMAX: imax(d, s0, s1); break;
+ case Shader::OPCODE_UMAX: umax(d, s0, s1); break;
+ case Shader::OPCODE_LRP: lrp(d, s0, s1, s2); break;
+ case Shader::OPCODE_STEP: step(d, s0, s1); break;
+ case Shader::OPCODE_SMOOTH: smooth(d, s0, s1, s2); break;
+ case Shader::OPCODE_ISINF: isinf(d, s0); break;
+ case Shader::OPCODE_ISNAN: isnan(d, s0); break;
+ case Shader::OPCODE_FLOATBITSTOINT:
+ case Shader::OPCODE_FLOATBITSTOUINT:
+ case Shader::OPCODE_INTBITSTOFLOAT:
+ case Shader::OPCODE_UINTBITSTOFLOAT: d = s0; break; // Bit-pattern reinterpretations are a plain copy
+ case Shader::OPCODE_PACKSNORM2x16: packSnorm2x16(d, s0); break;
+ case Shader::OPCODE_PACKUNORM2x16: packUnorm2x16(d, s0); break;
+ case Shader::OPCODE_PACKHALF2x16: packHalf2x16(d, s0); break;
+ case Shader::OPCODE_UNPACKSNORM2x16: unpackSnorm2x16(d, s0); break;
+ case Shader::OPCODE_UNPACKUNORM2x16: unpackUnorm2x16(d, s0); break;
+ case Shader::OPCODE_UNPACKHALF2x16: unpackHalf2x16(d, s0); break;
+ case Shader::OPCODE_POWX: powx(d, s0, s1, pp); break;
+ case Shader::OPCODE_POW: pow(d, s0, s1, pp); break;
+ case Shader::OPCODE_SGN: sgn(d, s0); break;
+ case Shader::OPCODE_ISGN: isgn(d, s0); break;
+ case Shader::OPCODE_CRS: crs(d, s0, s1); break;
+ case Shader::OPCODE_FORWARD1: forward1(d, s0, s1, s2); break;
+ case Shader::OPCODE_FORWARD2: forward2(d, s0, s1, s2); break;
+ case Shader::OPCODE_FORWARD3: forward3(d, s0, s1, s2); break;
+ case Shader::OPCODE_FORWARD4: forward4(d, s0, s1, s2); break;
+ case Shader::OPCODE_REFLECT1: reflect1(d, s0, s1); break;
+ case Shader::OPCODE_REFLECT2: reflect2(d, s0, s1); break;
+ case Shader::OPCODE_REFLECT3: reflect3(d, s0, s1); break;
+ case Shader::OPCODE_REFLECT4: reflect4(d, s0, s1); break;
+ case Shader::OPCODE_REFRACT1: refract1(d, s0, s1, s2.x); break;
+ case Shader::OPCODE_REFRACT2: refract2(d, s0, s1, s2.x); break;
+ case Shader::OPCODE_REFRACT3: refract3(d, s0, s1, s2.x); break;
+ case Shader::OPCODE_REFRACT4: refract4(d, s0, s1, s2.x); break;
+ case Shader::OPCODE_NRM2: nrm2(d, s0, pp); break;
+ case Shader::OPCODE_NRM3: nrm3(d, s0, pp); break;
+ case Shader::OPCODE_NRM4: nrm4(d, s0, pp); break;
+ case Shader::OPCODE_ABS: abs(d, s0); break;
+ case Shader::OPCODE_IABS: iabs(d, s0); break;
+ case Shader::OPCODE_SINCOS: sincos(d, s0, pp); break;
+ case Shader::OPCODE_COS: cos(d, s0, pp); break;
+ case Shader::OPCODE_SIN: sin(d, s0, pp); break;
+ case Shader::OPCODE_TAN: tan(d, s0, pp); break;
+ case Shader::OPCODE_ACOS: acos(d, s0, pp); break;
+ case Shader::OPCODE_ASIN: asin(d, s0, pp); break;
+ case Shader::OPCODE_ATAN: atan(d, s0, pp); break;
+ case Shader::OPCODE_ATAN2: atan2(d, s0, s1, pp); break;
+ case Shader::OPCODE_COSH: cosh(d, s0, pp); break;
+ case Shader::OPCODE_SINH: sinh(d, s0, pp); break;
+ case Shader::OPCODE_TANH: tanh(d, s0, pp); break;
+ case Shader::OPCODE_ACOSH: acosh(d, s0, pp); break;
+ case Shader::OPCODE_ASINH: asinh(d, s0, pp); break;
+ case Shader::OPCODE_ATANH: atanh(d, s0, pp); break;
+ case Shader::OPCODE_M4X4: M4X4(d, s0, src1); break;
+ case Shader::OPCODE_M4X3: M4X3(d, s0, src1); break;
+ case Shader::OPCODE_M3X4: M3X4(d, s0, src1); break;
+ case Shader::OPCODE_M3X3: M3X3(d, s0, src1); break;
+ case Shader::OPCODE_M3X2: M3X2(d, s0, src1); break;
+ case Shader::OPCODE_TEX: TEX(d, s0, src1, project, bias); break;
+ case Shader::OPCODE_TEXLDD: TEXGRAD(d, s0, src1, s2, s3); break;
+ case Shader::OPCODE_TEXLDL: TEXLOD(d, s0, src1, s0.w); break;
+ case Shader::OPCODE_TEXLOD: TEXLOD(d, s0, src1, s2.x); break;
+ case Shader::OPCODE_TEXSIZE: TEXSIZE(d, s0.x, src1); break;
+ case Shader::OPCODE_TEXKILL: TEXKILL(cMask, d, dst.mask); break;
+ case Shader::OPCODE_TEXOFFSET: TEXOFFSET(d, s0, src1, s2); break;
+ case Shader::OPCODE_TEXLODOFFSET: TEXLODOFFSET(d, s0, src1, s2, s3.x); break;
+ case Shader::OPCODE_TEXELFETCH: TEXELFETCH(d, s0, src1, s2.x); break;
+ case Shader::OPCODE_TEXELFETCHOFFSET: TEXELFETCHOFFSET(d, s0, src1, s2, s3.x); break;
+ case Shader::OPCODE_TEXGRAD: TEXGRAD(d, s0, src1, s2, s3); break;
+ case Shader::OPCODE_TEXGRADOFFSET: TEXGRADOFFSET(d, s0, src1, s2, s3, s4); break;
+ case Shader::OPCODE_TEXBIAS: TEXBIAS(d, s0, src1, s2.x); break;
+ case Shader::OPCODE_TEXOFFSETBIAS: TEXOFFSETBIAS(d, s0, src1, s2, s3.x); break;
+ case Shader::OPCODE_DISCARD: DISCARD(cMask, instruction); break;
+ case Shader::OPCODE_DFDX: DFDX(d, s0); break;
+ case Shader::OPCODE_DFDY: DFDY(d, s0); break;
+ case Shader::OPCODE_FWIDTH: FWIDTH(d, s0); break;
+ case Shader::OPCODE_BREAK: BREAK(); break;
+ case Shader::OPCODE_BREAKC: BREAKC(s0, s1, control); break;
+ case Shader::OPCODE_BREAKP: BREAKP(src0); break;
+ case Shader::OPCODE_CONTINUE: CONTINUE(); break;
+ case Shader::OPCODE_TEST: TEST(); break;
+ case Shader::OPCODE_CALL: CALL(dst.label, dst.callSite); break;
+ case Shader::OPCODE_CALLNZ: CALLNZ(dst.label, dst.callSite, src0); break;
+ case Shader::OPCODE_ELSE: ELSE(); break;
+ case Shader::OPCODE_ENDIF: ENDIF(); break;
+ case Shader::OPCODE_ENDLOOP: ENDLOOP(); break;
+ case Shader::OPCODE_ENDREP: ENDREP(); break;
+ case Shader::OPCODE_ENDWHILE: ENDWHILE(); break;
+ case Shader::OPCODE_ENDSWITCH: ENDSWITCH(); break;
+ case Shader::OPCODE_IF: IF(src0); break;
+ case Shader::OPCODE_IFC: IFC(s0, s1, control); break;
+ case Shader::OPCODE_LABEL: LABEL(dst.index); break;
+ case Shader::OPCODE_LOOP: LOOP(src1); break;
+ case Shader::OPCODE_REP: REP(src0); break;
+ case Shader::OPCODE_WHILE: WHILE(src0); break;
+ case Shader::OPCODE_SWITCH: SWITCH(); break;
+ case Shader::OPCODE_RET: RET(); break;
+ case Shader::OPCODE_LEAVE: LEAVE(); break;
+ case Shader::OPCODE_CMP: cmp(d, s0, s1, control); break;
+ case Shader::OPCODE_ALL: all(d.x, s0); break;
+ case Shader::OPCODE_ANY: any(d.x, s0); break;
+ case Shader::OPCODE_NOT: bitwise_not(d, s0); break;
+ case Shader::OPCODE_OR: bitwise_or(d, s0, s1); break;
+ case Shader::OPCODE_XOR: bitwise_xor(d, s0, s1); break;
+ case Shader::OPCODE_AND: bitwise_and(d, s0, s1); break;
+ case Shader::OPCODE_EQ: equal(d, s0, s1); break;
+ case Shader::OPCODE_NE: notEqual(d, s0, s1); break;
+ case Shader::OPCODE_END: break;
+ default:
+ ASSERT(false);
+ }
+
+ if(dst.type != Shader::PARAMETER_VOID && dst.type != Shader::PARAMETER_LABEL && opcode != Shader::OPCODE_TEXKILL && opcode != Shader::OPCODE_NOP) // Write-back only for instructions with a register destination
+ {
+ if(dst.saturate) // Clamp the written components to [0, 1]
+ {
+ if(dst.x) d.x = Max(d.x, Float4(0.0f));
+ if(dst.y) d.y = Max(d.y, Float4(0.0f));
+ if(dst.z) d.z = Max(d.z, Float4(0.0f));
+ if(dst.w) d.w = Max(d.w, Float4(0.0f));
+
+ if(dst.x) d.x = Min(d.x, Float4(1.0f));
+ if(dst.y) d.y = Min(d.y, Float4(1.0f));
+ if(dst.z) d.z = Min(d.z, Float4(1.0f));
+ if(dst.w) d.w = Min(d.w, Float4(1.0f));
+ }
+
+ if(instruction->isPredicated()) // Merge the result with the destination's current contents according to the lane enable masks
+ {
+ Vector4f pDst; // FIXME: Rename
+
+ switch(dst.type) // Read back the current destination value for the components being written
+ {
+ case Shader::PARAMETER_TEMP:
+ if(dst.rel.type == Shader::PARAMETER_VOID)
+ {
+ if(dst.x) pDst.x = r[dst.index].x;
+ if(dst.y) pDst.y = r[dst.index].y;
+ if(dst.z) pDst.z = r[dst.index].z;
+ if(dst.w) pDst.w = r[dst.index].w;
+ }
+ else if(!dst.rel.dynamic)
+ {
+ Int a = dst.index + relativeAddress(dst.rel);
+
+ if(dst.x) pDst.x = r[a].x;
+ if(dst.y) pDst.y = r[a].y;
+ if(dst.z) pDst.z = r[a].z;
+ if(dst.w) pDst.w = r[a].w;
+ }
+ else
+ {
+ Int4 a = dst.index + dynamicAddress(dst.rel);
+
+ if(dst.x) pDst.x = r[a].x;
+ if(dst.y) pDst.y = r[a].y;
+ if(dst.z) pDst.z = r[a].z;
+ if(dst.w) pDst.w = r[a].w;
+ }
+ break;
+ case Shader::PARAMETER_COLOROUT:
+ if(dst.rel.type == Shader::PARAMETER_VOID)
+ {
+ if(dst.x) pDst.x = oC[dst.index].x;
+ if(dst.y) pDst.y = oC[dst.index].y;
+ if(dst.z) pDst.z = oC[dst.index].z;
+ if(dst.w) pDst.w = oC[dst.index].w;
+ }
+ else if(!dst.rel.dynamic)
+ {
+ Int a = dst.index + relativeAddress(dst.rel);
+
+ if(dst.x) pDst.x = oC[a].x;
+ if(dst.y) pDst.y = oC[a].y;
+ if(dst.z) pDst.z = oC[a].z;
+ if(dst.w) pDst.w = oC[a].w;
+ }
+ else
+ {
+ Int4 a = dst.index + dynamicAddress(dst.rel);
+
+ if(dst.x) pDst.x = oC[a].x;
+ if(dst.y) pDst.y = oC[a].y;
+ if(dst.z) pDst.z = oC[a].z;
+ if(dst.w) pDst.w = oC[a].w;
+ }
+ break;
+ case Shader::PARAMETER_PREDICATE:
+ if(dst.x) pDst.x = p0.x;
+ if(dst.y) pDst.y = p0.y;
+ if(dst.z) pDst.z = p0.z;
+ if(dst.w) pDst.w = p0.w;
+ break;
+ case Shader::PARAMETER_DEPTHOUT:
+ pDst.x = oDepth;
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ Int4 enable = enableMask(instruction);
+
+ Int4 xEnable = enable;
+ Int4 yEnable = enable;
+ Int4 zEnable = enable;
+ Int4 wEnable = enable;
+
+ if(predicate) // Further restrict each component's mask by the (swizzled, optionally inverted) predicate register
+ {
+ unsigned char pSwizzle = instruction->predicateSwizzle;
+
+ Float4 xPredicate = p0[(pSwizzle >> 0) & 0x03];
+ Float4 yPredicate = p0[(pSwizzle >> 2) & 0x03];
+ Float4 zPredicate = p0[(pSwizzle >> 4) & 0x03];
+ Float4 wPredicate = p0[(pSwizzle >> 6) & 0x03];
+
+ if(!instruction->predicateNot)
+ {
+ if(dst.x) xEnable = xEnable & As<Int4>(xPredicate);
+ if(dst.y) yEnable = yEnable & As<Int4>(yPredicate);
+ if(dst.z) zEnable = zEnable & As<Int4>(zPredicate);
+ if(dst.w) wEnable = wEnable & As<Int4>(wPredicate);
+ }
+ else
+ {
+ if(dst.x) xEnable = xEnable & ~As<Int4>(xPredicate);
+ if(dst.y) yEnable = yEnable & ~As<Int4>(yPredicate);
+ if(dst.z) zEnable = zEnable & ~As<Int4>(zPredicate);
+ if(dst.w) wEnable = wEnable & ~As<Int4>(wPredicate);
+ }
+ }
+
+ if(dst.x) d.x = As<Float4>(As<Int4>(d.x) & xEnable); // Keep the new value only in enabled lanes
+ if(dst.y) d.y = As<Float4>(As<Int4>(d.y) & yEnable);
+ if(dst.z) d.z = As<Float4>(As<Int4>(d.z) & zEnable);
+ if(dst.w) d.w = As<Float4>(As<Int4>(d.w) & wEnable);
+
+ if(dst.x) d.x = As<Float4>(As<Int4>(d.x) | (As<Int4>(pDst.x) & ~xEnable)); // Disabled lanes keep the previous destination value
+ if(dst.y) d.y = As<Float4>(As<Int4>(d.y) | (As<Int4>(pDst.y) & ~yEnable));
+ if(dst.z) d.z = As<Float4>(As<Int4>(d.z) | (As<Int4>(pDst.z) & ~zEnable));
+ if(dst.w) d.w = As<Float4>(As<Int4>(d.w) | (As<Int4>(pDst.w) & ~wEnable));
+ }
+
+ switch(dst.type) // Store the (possibly merged) result into the destination register
+ {
+ case Shader::PARAMETER_TEMP:
+ if(dst.rel.type == Shader::PARAMETER_VOID)
+ {
+ if(dst.x) r[dst.index].x = d.x;
+ if(dst.y) r[dst.index].y = d.y;
+ if(dst.z) r[dst.index].z = d.z;
+ if(dst.w) r[dst.index].w = d.w;
+ }
+ else if(!dst.rel.dynamic)
+ {
+ Int a = dst.index + relativeAddress(dst.rel);
+
+ if(dst.x) r[a].x = d.x;
+ if(dst.y) r[a].y = d.y;
+ if(dst.z) r[a].z = d.z;
+ if(dst.w) r[a].w = d.w;
+ }
+ else
+ {
+ Int4 a = dst.index + dynamicAddress(dst.rel);
+
+ if(dst.x) r.scatter_x(a, d.x);
+ if(dst.y) r.scatter_y(a, d.y);
+ if(dst.z) r.scatter_z(a, d.z);
+ if(dst.w) r.scatter_w(a, d.w);
+ }
+ break;
+ case Shader::PARAMETER_COLOROUT:
+ if(dst.rel.type == Shader::PARAMETER_VOID)
+ {
+ broadcastColor0 = (dst.index == 0) && broadcastColor0; // A write to any output other than 0 disables broadcasting
+
+ if(dst.x) oC[dst.index].x = d.x;
+ if(dst.y) oC[dst.index].y = d.y;
+ if(dst.z) oC[dst.index].z = d.z;
+ if(dst.w) oC[dst.index].w = d.w;
+ }
+ else if(!dst.rel.dynamic)
+ {
+ broadcastColor0 = false;
+ Int a = dst.index + relativeAddress(dst.rel);
+
+ if(dst.x) oC[a].x = d.x;
+ if(dst.y) oC[a].y = d.y;
+ if(dst.z) oC[a].z = d.z;
+ if(dst.w) oC[a].w = d.w;
+ }
+ else
+ {
+ broadcastColor0 = false;
+ Int4 a = dst.index + dynamicAddress(dst.rel);
+
+ if(dst.x) oC.scatter_x(a, d.x);
+ if(dst.y) oC.scatter_y(a, d.y);
+ if(dst.z) oC.scatter_z(a, d.z);
+ if(dst.w) oC.scatter_w(a, d.w);
+ }
+ break;
+ case Shader::PARAMETER_PREDICATE:
+ if(dst.x) p0.x = d.x;
+ if(dst.y) p0.y = d.y;
+ if(dst.z) p0.z = d.z;
+ if(dst.w) p0.w = d.w;
+ break;
+ case Shader::PARAMETER_DEPTHOUT:
+ oDepth = d.x;
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+ }
+
+ if(currentLabel != -1) // NOTE(review): presumably code emission ended inside a labelled subroutine body - confirm
+ {
+ Nucleus::setInsertBlock(returnBlock);
+ }
+
+ if(broadcastColor0) // Only oC[0] was written: replicate it to every render target
+ {
+ for(int i = 0; i < RENDERTARGETS; i++)
+ {
+ c[i] = oC[0];
+ }
+ }
+ else
+ {
+ for(int i = 0; i < RENDERTARGETS; i++)
+ {
+ c[i] = oC[i];
+ }
+ }
+
+ clampColor(c);
+
+ if(state.depthOverride)
+ {
+ oDepth = Min(Max(oDepth, Float4(0.0f)), Float4(1.0f)); // Clamp the shader-written depth to [0, 1]
+ }
+ }
+
+ Bool PixelProgram::alphaTest(Int cMask[4])
+ {
+ // With the alpha test disabled every pixel passes and cMask is left untouched.
+ if(!state.alphaTestActive())
+ {
+ return true;
+ }
+
+ if(state.transparencyAntialiasing == TRANSPARENCY_ALPHA_TO_COVERAGE)
+ {
+ alphaToCoverage(cMask, c[0].w);
+ }
+ else if(state.transparencyAntialiasing == TRANSPARENCY_NONE)
+ {
+ // The base-class test takes the alpha of output 0 scaled by 0x1000 and rounded to 16-bit integers.
+ Short4 scaledAlpha = RoundShort4(c[0].w * Float4(0x1000));
+ Int testMask;
+ PixelRoutine::alphaTest(testMask, scaledAlpha);
+
+ for(unsigned int sample = 0; sample < state.multiSample; sample++)
+ {
+ cMask[sample] = cMask[sample] & testMask;
+ }
+ }
+ else ASSERT(false);
+
+ // Passes when any bit is still set in any sample's coverage mask.
+ Int remaining = cMask[0];
+ for(unsigned int sample = 1; sample < state.multiSample; sample++)
+ {
+ remaining = remaining | cMask[sample];
+ }
+
+ return remaining != 0x0;
+ }
+
+ void PixelProgram::rasterOperation(Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]) // Blends and writes c[] to every render target with active color writes, once per sample
+ {
+ for(int index = 0; index < RENDERTARGETS; index++)
+ {
+ if(!state.colorWriteActive(index))
+ {
+ continue;
+ }
+
+ if(!postBlendSRGB && state.writeSRGB && !isSRGB(index)) // Convert RGB (not alpha) to sRGB before blending
+ {
+ c[index].x = linearToSRGB(c[index].x);
+ c[index].y = linearToSRGB(c[index].y);
+ c[index].z = linearToSRGB(c[index].z);
+ }
+
+ if(index == 0) // Fog is only blended into render target 0
+ {
+ fogBlend(c[index], fog);
+ }
+
+ switch(state.targetFormat[index])
+ {
+ case FORMAT_R5G6B5: // These formats are blended and written from a 16-bit-per-channel Vector4s color
+ case FORMAT_X8R8G8B8:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ case FORMAT_G8R8:
+ case FORMAT_R8:
+ case FORMAT_A8:
+ case FORMAT_G16R16:
+ case FORMAT_A16B16G16R16:
+ for(unsigned int q = 0; q < state.multiSample; q++)
+ {
+ Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index])); // Sample q lives q * colorSliceB[index] bytes into the target
+ Vector4s color;
+
+ if(state.targetFormat[index] == FORMAT_R5G6B5)
+ {
+ color.x = UShort4(c[index].x * Float4(0xFBFF), false);
+ color.y = UShort4(c[index].y * Float4(0xFDFF), false);
+ color.z = UShort4(c[index].z * Float4(0xFBFF), false);
+ color.w = UShort4(c[index].w * Float4(0xFFFF), false);
+ }
+ else
+ {
+ color.x = convertFixed16(c[index].x, false);
+ color.y = convertFixed16(c[index].y, false);
+ color.z = convertFixed16(c[index].z, false);
+ color.w = convertFixed16(c[index].w, false);
+ }
+
+ if(state.multiSampleMask & (1 << q)) // Samples excluded by the multisample mask are not written
+ {
+ alphaBlend(index, buffer, color, x);
+ logicOperation(index, buffer, color, x);
+ writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+ }
+ }
+ break;
+ case FORMAT_R32F: // These formats are blended and written from the Vector4f color; no logic operation is applied
+ case FORMAT_G32R32F:
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_A32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_R32I:
+ case FORMAT_G32R32I:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_R32UI:
+ case FORMAT_G32R32UI:
+ case FORMAT_A32B32G32R32UI:
+ case FORMAT_R16I:
+ case FORMAT_G16R16I:
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_R16UI:
+ case FORMAT_G16R16UI:
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_R8I:
+ case FORMAT_G8R8I:
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_R8UI:
+ case FORMAT_G8R8UI:
+ case FORMAT_A8B8G8R8UI:
+ for(unsigned int q = 0; q < state.multiSample; q++)
+ {
+ Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
+ Vector4f color = c[index];
+
+ if(state.multiSampleMask & (1 << q))
+ {
+ alphaBlend(index, buffer, color, x);
+ writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+ }
+ }
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+ }
+
+ Vector4f PixelProgram::sampleTexture(const Src &sampler, Vector4f &uvwq, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function) // Samples through a sampler operand and applies the operand's swizzle to the result
+ {
+ Vector4f tmp;
+
+ if(sampler.type == Shader::PARAMETER_SAMPLER && sampler.rel.type == Shader::PARAMETER_VOID) // Sampler index known while generating code
+ {
+ tmp = sampleTexture(sampler.index, uvwq, bias, dsx, dsy, offset, function);
+ }
+ else
+ {
+ Int index = As<Int>(Float(fetchRegister(sampler).x.x)); // Run-time sampler index: the bits of one lane of the fetched register's x component
+
+ for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++) // Emit one conditional sampling path per sampler the shader uses
+ {
+ if(shader->usesSampler(i))
+ {
+ If(index == i)
+ {
+ tmp = sampleTexture(i, uvwq, bias, dsx, dsy, offset, function);
+ // FIXME: When the sampler states are the same, we could use one sampler and just index the texture
+ }
+ }
+ }
+ } // NOTE(review): tmp is never assigned if index matches no used sampler - confirm callers guarantee a match
+
+ Vector4f c;
+ c.x = tmp[(sampler.swizzle >> 0) & 0x3]; // Two swizzle bits per output component select the source component
+ c.y = tmp[(sampler.swizzle >> 2) & 0x3];
+ c.z = tmp[(sampler.swizzle >> 4) & 0x3];
+ c.w = tmp[(sampler.swizzle >> 6) & 0x3];
+
+ return c;
+ }
+
+ Vector4f PixelProgram::sampleTexture(int samplerIndex, Vector4f &uvwq, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function)
+ {
+ #if PERF_PROFILE
+ Long startTicks = Ticks();
+ #endif
+
+ // Texture data for this sampler: the samplerIndex-th Texture entry starting at DrawData::mipmap.
+ Pointer<Byte> textureData = data + OFFSET(DrawData, mipmap) + samplerIndex * sizeof(Texture);
+ Vector4f texel = SamplerCore(constants, state.sampler[samplerIndex]).sampleTexture(textureData, uvwq.x, uvwq.y, uvwq.z, uvwq.w, bias, dsx, dsy, offset, function);
+ #if PERF_PROFILE
+ cycles[PERF_TEX] += Ticks() - startTicks;
+ #endif
+
+ return texel;
+ }
+
+ void PixelProgram::clampColor(Vector4f oC[RENDERTARGETS])
+ {
+ for(int i = 0; i < RENDERTARGETS; i++)
+ {
+ // Target 0 is also processed when only the alpha test is active, even if its color writes are off.
+ const bool consumed = state.colorWriteActive(i) || (i == 0 && state.alphaTestActive());
+ if(!consumed) continue;
+ Vector4f &color = oC[i];
+
+ switch(state.targetFormat[i])
+ {
+ case FORMAT_NULL:
+ break;
+ case FORMAT_R5G6B5:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_X8R8G8B8:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ case FORMAT_G8R8:
+ case FORMAT_R8:
+ case FORMAT_A8:
+ case FORMAT_G16R16:
+ case FORMAT_A16B16G16R16:
+ color.x = Min(Max(color.x, Float4(0.0f)), Float4(1.0f)); // Clamp to [0, 1]
+ color.y = Min(Max(color.y, Float4(0.0f)), Float4(1.0f));
+ color.z = Min(Max(color.z, Float4(0.0f)), Float4(1.0f));
+ color.w = Min(Max(color.w, Float4(0.0f)), Float4(1.0f));
+ break;
+ case FORMAT_R32F:
+ case FORMAT_G32R32F:
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_A32B32G32R32F:
+ case FORMAT_R32I:
+ case FORMAT_G32R32I:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_R32UI:
+ case FORMAT_G32R32UI:
+ case FORMAT_A32B32G32R32UI:
+ case FORMAT_R16I:
+ case FORMAT_G16R16I:
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_R16UI:
+ case FORMAT_G16R16UI:
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_R8I:
+ case FORMAT_G8R8I:
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_R8UI:
+ case FORMAT_G8R8UI:
+ case FORMAT_A8B8G8R8UI:
+ break;
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ color.x = Max(color.x, Float4(0.0f)); // Only the lower bound is enforced
+ color.y = Max(color.y, Float4(0.0f));
+ color.z = Max(color.z, Float4(0.0f));
+ color.w = Max(color.w, Float4(0.0f));
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+ }
+
+ Int4 PixelProgram::enableMask(const Shader::Instruction *instruction)
+ {
+ // Start from the enableStack entry at enableIndex for instructions flagged analysisBranch, otherwise from an all-ones mask.
+ Int4 mask = instruction->analysisBranch ? Int4(enableStack[enableIndex]) : Int4(0xFFFFFFFF);
+ // The break/continue/leave masks are not applied while whileTest is set.
+ if(whileTest) return mask;
+
+ if(shader->containsBreakInstruction() && instruction->analysisBreak)
+ {
+ mask = mask & enableBreak;
+ }
+
+ if(shader->containsContinueInstruction() && instruction->analysisContinue)
+ {
+ mask = mask & enableContinue;
+ }
+
+ if(shader->containsLeaveInstruction() && instruction->analysisLeave)
+ {
+ mask = mask & enableLeave;
+ }
+
+ return mask;
+ }
+
+	Vector4f PixelProgram::fetchRegister(const Src &src, unsigned int offset)   // Reads source operand 'src' (register index advanced by 'offset'), then applies its swizzle and modifier
+	{
+		Vector4f reg;
+		unsigned int i = src.index + offset;   // Static register index before any relative addressing
+
+		switch(src.type)
+		{
+		case Shader::PARAMETER_TEMP:
+			if(src.rel.type == Shader::PARAMETER_VOID)   // Not relative
+			{
+				reg = r[i];
+			}
+			else if(!src.rel.dynamic)   // Relative, with a single scalar offset shared by all lanes
+			{
+				reg = r[i + relativeAddress(src.rel, src.bufferIndex)];
+			}
+			else   // Relative, with a per-lane (Int4) offset
+			{
+				reg = r[i + dynamicAddress(src.rel)];
+			}
+			break;
+		case Shader::PARAMETER_INPUT:
+			if(src.rel.type == Shader::PARAMETER_VOID) // Not relative
+			{
+				reg = v[i];
+			}
+			else if(!src.rel.dynamic)   // Relative, scalar offset
+			{
+				reg = v[i + relativeAddress(src.rel, src.bufferIndex)];
+			}
+			else   // Relative, per-lane offset
+			{
+				reg = v[i + dynamicAddress(src.rel)];
+			}
+			break;
+		case Shader::PARAMETER_CONST:
+			reg = readConstant(src, offset);   // readConstant() adds 'offset' to src.index itself
+			break;
+		case Shader::PARAMETER_TEXTURE:
+			reg = v[2 + i];   // Texture registers are read from the input array starting at v[2] (presumably after two color inputs - confirm)
+			break;
+		case Shader::PARAMETER_MISCTYPE:
+			if(src.index == Shader::VPosIndex) reg = vPos;   // Uses src.index, not i, so 'offset' is ignored here
+			if(src.index == Shader::VFaceIndex) reg = vFace;
+			break;
+		case Shader::PARAMETER_SAMPLER:
+			if(src.rel.type == Shader::PARAMETER_VOID)
+			{
+				reg.x = As<Float4>(Int4(i));   // Sampler index stored as raw integer bits in reg.x
+			}
+			else if(src.rel.type == Shader::PARAMETER_TEMP)
+			{
+				reg.x = As<Float4>(Int4(i) + As<Int4>(r[src.rel.index].x));   // Index offset by the integer bits of a temporary's x component
+			}
+			return reg;   // Returned directly: no swizzle or modifier is applied to sampler operands
+		case Shader::PARAMETER_PREDICATE: return reg; // Dummy
+		case Shader::PARAMETER_VOID: return reg; // Dummy
+		case Shader::PARAMETER_FLOAT4LITERAL:
+			reg.x = Float4(src.value[0]);   // Each literal component is replicated across the four lanes
+			reg.y = Float4(src.value[1]);
+			reg.z = Float4(src.value[2]);
+			reg.w = Float4(src.value[3]);
+			break;
+		case Shader::PARAMETER_CONSTINT: return reg; // Dummy
+		case Shader::PARAMETER_CONSTBOOL: return reg; // Dummy
+		case Shader::PARAMETER_LOOP: return reg; // Dummy
+		case Shader::PARAMETER_COLOROUT:
+			if(src.rel.type == Shader::PARAMETER_VOID) // Not relative
+			{
+				reg = oC[i];
+			}
+			else if(!src.rel.dynamic)   // Relative, scalar offset
+			{
+				reg = oC[i + relativeAddress(src.rel, src.bufferIndex)];
+			}
+			else   // Relative, per-lane offset
+			{
+				reg = oC[i + dynamicAddress(src.rel)];
+			}
+			break;
+		case Shader::PARAMETER_DEPTHOUT:
+			reg.x = oDepth;   // Only x is assigned; the other components keep their default-constructed state
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		const Float4 &x = reg[(src.swizzle >> 0) & 0x3];   // The swizzle packs one 2-bit component selector per destination component
+		const Float4 &y = reg[(src.swizzle >> 2) & 0x3];
+		const Float4 &z = reg[(src.swizzle >> 4) & 0x3];
+		const Float4 &w = reg[(src.swizzle >> 6) & 0x3];
+
+		Vector4f mod;
+
+		switch(src.modifier)
+		{
+		case Shader::MODIFIER_NONE:
+			mod.x = x;
+			mod.y = y;
+			mod.z = z;
+			mod.w = w;
+			break;
+		case Shader::MODIFIER_NEGATE:
+			mod.x = -x;
+			mod.y = -y;
+			mod.z = -z;
+			mod.w = -w;
+			break;
+		case Shader::MODIFIER_ABS:
+			mod.x = Abs(x);
+			mod.y = Abs(y);
+			mod.z = Abs(z);
+			mod.w = Abs(w);
+			break;
+		case Shader::MODIFIER_ABS_NEGATE:
+			mod.x = -Abs(x);
+			mod.y = -Abs(y);
+			mod.z = -Abs(z);
+			mod.w = -Abs(w);
+			break;
+		case Shader::MODIFIER_NOT:
+			mod.x = As<Float4>(As<Int4>(x) ^ Int4(0xFFFFFFFF));   // Bitwise complement of the raw 32-bit lanes
+			mod.y = As<Float4>(As<Int4>(y) ^ Int4(0xFFFFFFFF));
+			mod.z = As<Float4>(As<Int4>(z) ^ Int4(0xFFFFFFFF));
+			mod.w = As<Float4>(As<Int4>(w) ^ Int4(0xFFFFFFFF));
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		return mod;
+	}
+
+	RValue<Pointer<Byte>> PixelProgram::uniformAddress(int bufferIndex, unsigned int index)
+	{
+		if(bufferIndex != -1)
+		{
+			// Load the pointer stored in ps.u[bufferIndex]; 'index' is added to that Byte pointer unscaled.
+			return *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, ps.u[bufferIndex])) + index;
+		}
+
+		// A buffer index of -1 selects the constant array ps.c embedded in the draw data.
+		return data + OFFSET(DrawData, ps.c[index]);
+	}
+
+	RValue<Pointer<Byte>> PixelProgram::uniformAddress(int bufferIndex, unsigned int index, Int& offset)   // Overload taking an additional run-time offset
+	{
+		return uniformAddress(bufferIndex, index) + offset * sizeof(float4);   // 'offset' is counted in float4-sized elements
+	}
+
+	Vector4f PixelProgram::readConstant(const Src &src, unsigned int offset)   // Reads constant operand 'src' (index advanced by 'offset') as one Float4 per component
+	{
+		Vector4f c;
+		unsigned int i = src.index + offset;   // Static constant index before any relative addressing
+
+		if(src.rel.type == Shader::PARAMETER_VOID) // Not relative
+		{
+			c.x = c.y = c.z = c.w = *Pointer<Float4>(uniformAddress(src.bufferIndex, i));   // Load the float4 once into all four components
+
+			c.x = c.x.xxxx;   // Replicate each scalar across the four lanes
+			c.y = c.y.yyyy;
+			c.z = c.z.zzzz;
+			c.w = c.w.wwww;
+
+			if(shader->containsDefineInstruction()) // Constant may be known at compile time
+			{
+				for(size_t j = 0; j < shader->getLength(); j++)   // Linear scan of the shader for a DEF of register i
+				{
+					const Shader::Instruction &instruction = *shader->getInstruction(j);
+
+					if(instruction.opcode == Shader::OPCODE_DEF)
+					{
+						if(instruction.dst.index == i)
+						{
+							c.x = Float4(instruction.src[0].value[0]);   // The literal from the DEF replaces the value loaded above
+							c.y = Float4(instruction.src[0].value[1]);
+							c.z = Float4(instruction.src[0].value[2]);
+							c.w = Float4(instruction.src[0].value[3]);
+
+							break;   // Only the first matching DEF is used
+						}
+					}
+				}
+			}
+		}
+		else if(!src.rel.dynamic || src.rel.type == Shader::PARAMETER_LOOP)   // Relative with one scalar offset shared by all lanes
+		{
+			Int a = relativeAddress(src.rel, src.bufferIndex);
+
+			c.x = c.y = c.z = c.w = *Pointer<Float4>(uniformAddress(src.bufferIndex, i, a));
+
+			c.x = c.x.xxxx;
+			c.y = c.y.yyyy;
+			c.z = c.z.zzzz;
+			c.w = c.w.wwww;
+		}
+		else   // Dynamic: each of the four lanes may address a different constant
+		{
+			int component = src.rel.swizzle & 0x03;   // Component of the relative register that holds the index
+			Float4 a;
+
+			switch(src.rel.type)
+			{
+			case Shader::PARAMETER_TEMP: a = r[src.rel.index][component]; break;
+			case Shader::PARAMETER_INPUT: a = v[src.rel.index][component]; break;
+			case Shader::PARAMETER_OUTPUT: a = oC[src.rel.index][component]; break;
+			case Shader::PARAMETER_CONST: a = *Pointer<Float>(uniformAddress(src.bufferIndex, src.rel.index) + component * sizeof(float)); break;
+			case Shader::PARAMETER_MISCTYPE:
+				switch(src.rel.index)
+				{
+				case Shader::VPosIndex: a = vPos.x; break;
+				case Shader::VFaceIndex: a = vFace.x; break;
+				default: ASSERT(false);
+				}
+				break;
+			default: ASSERT(false);
+			}
+
+			Int4 index = Int4(i) + As<Int4>(a) * Int4(src.rel.scale);   // The bits of 'a' are reinterpreted as integers, not converted from float
+
+			index = Min(As<UInt4>(index), UInt4(VERTEX_UNIFORM_VECTORS)); // Clamp to constant register range, c[VERTEX_UNIFORM_VECTORS] = {0, 0, 0, 0}. NOTE(review): this uses the VERTEX_ limit inside the pixel program and is also applied when bufferIndex != -1 - confirm this is intended
+
+			Int index0 = Extract(index, 0);
+			Int index1 = Extract(index, 1);
+			Int index2 = Extract(index, 2);
+			Int index3 = Extract(index, 3);
+
+			c.x = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index0), 16);   // One float4 load per lane
+			c.y = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index1), 16);
+			c.z = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index2), 16);
+			c.w = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index3), 16);
+
+			transpose4x4(c.x, c.y, c.z, c.w);   // Turn the four per-lane constants into per-component vectors
+		}
+
+		return c;
+	}
+
+	Int PixelProgram::relativeAddress(const Shader::Relative &rel, int bufferIndex)
+	{
+		// Resolves a non-dynamic relative address to one scalar offset. For the
+		// temporary, input and output cases only lane 0 of the register's x
+		// component is used, reinterpreted as an integer and multiplied by rel.scale.
+		ASSERT(!rel.dynamic);
+
+		switch(rel.type)
+		{
+		case Shader::PARAMETER_TEMP:
+			return As<Int>(Extract(r[rel.index].x, 0)) * rel.scale;
+		case Shader::PARAMETER_INPUT:
+			return As<Int>(Extract(v[rel.index].x, 0)) * rel.scale;
+		case Shader::PARAMETER_OUTPUT:
+			return As<Int>(Extract(oC[rel.index].x, 0)) * rel.scale;
+		case Shader::PARAMETER_CONST:
+			// Reads the first 32-bit word of the constant as an integer.
+			return *Pointer<Int>(uniformAddress(bufferIndex, rel.index)) * rel.scale;
+		case Shader::PARAMETER_LOOP:
+			// The loop counter is returned as is; rel.scale is not applied.
+			return aL[loopDepth];
+		default:
+			ASSERT(false);
+			break;
+		}
+
+		return 0;   // Only reached for an unsupported rel.type
+	}
+
+	Int4 PixelProgram::dynamicAddress(const Shader::Relative &rel)   // Per-lane relative offset: one integer per lane, scaled by rel.scale
+	{
+		int component = rel.swizzle & 0x03;   // Component of the relative register that holds the index
+		Float4 a;
+
+		switch(rel.type)
+		{
+		case Shader::PARAMETER_TEMP: a = r[rel.index][component]; break;
+		case Shader::PARAMETER_INPUT: a = v[rel.index][component]; break;
+		case Shader::PARAMETER_OUTPUT: a = oC[rel.index][component]; break;
+		case Shader::PARAMETER_MISCTYPE:
+			switch(rel.index)
+			{
+			case Shader::VPosIndex: a = vPos.x; break;
+			case Shader::VFaceIndex: a = vFace.x; break;
+			default: ASSERT(false);
+			}
+			break;
+		default: ASSERT(false);   // Unlike readConstant(), PARAMETER_CONST is not handled here
+		}
+
+		return As<Int4>(a) * Int4(rel.scale);   // The bits of 'a' are reinterpreted as integers, not converted from float
+	}
+
+	Float4 PixelProgram::linearToSRGB(const Float4 &x) // Approximates x^(1.0/2.2)
+	{
+		Float4 sqrtx = Rcp_pp(RcpSqrt_pp(x));   // sqrt(x) obtained as the reciprocal of the reciprocal square root (the _pp variants are presumably reduced precision - confirm)
+		Float4 sRGB = sqrtx * Float4(1.14f) - x * Float4(0.14f);   // 1.14 * sqrt(x) - 0.14 * x
+
+		return Min(Max(sRGB, Float4(0.0f)), Float4(1.0f));   // Clamp the result to [0, 1]
+	}
+
+	void PixelProgram::M3X2(Vector4f &dst, Vector4f &src0, const Src &src1)   // Two 3-component dot products of src0 with consecutive registers starting at src1
+	{
+		Vector4f row0 = fetchRegister(src1, 0);   // Row n is register src1.index + n
+		Vector4f row1 = fetchRegister(src1, 1);
+
+		dst.x = dot3(src0, row0);
+		dst.y = dot3(src0, row1);   // dst.z and dst.w are not written
+	}
+
+	void PixelProgram::M3X3(Vector4f &dst, Vector4f &src0, const Src &src1)   // Three 3-component dot products of src0 with consecutive registers starting at src1
+	{
+		Vector4f row0 = fetchRegister(src1, 0);   // Row n is register src1.index + n
+		Vector4f row1 = fetchRegister(src1, 1);
+		Vector4f row2 = fetchRegister(src1, 2);
+
+		dst.x = dot3(src0, row0);
+		dst.y = dot3(src0, row1);
+		dst.z = dot3(src0, row2);   // dst.w is not written
+	}
+
+	void PixelProgram::M3X4(Vector4f &dst, Vector4f &src0, const Src &src1)   // Four 3-component dot products of src0 with consecutive registers starting at src1
+	{
+		Vector4f row0 = fetchRegister(src1, 0);   // Row n is register src1.index + n
+		Vector4f row1 = fetchRegister(src1, 1);
+		Vector4f row2 = fetchRegister(src1, 2);
+		Vector4f row3 = fetchRegister(src1, 3);
+
+		dst.x = dot3(src0, row0);
+		dst.y = dot3(src0, row1);
+		dst.z = dot3(src0, row2);
+		dst.w = dot3(src0, row3);
+	}
+
+	void PixelProgram::M4X3(Vector4f &dst, Vector4f &src0, const Src &src1)   // Three 4-component dot products of src0 with consecutive registers starting at src1
+	{
+		Vector4f row0 = fetchRegister(src1, 0);   // Row n is register src1.index + n
+		Vector4f row1 = fetchRegister(src1, 1);
+		Vector4f row2 = fetchRegister(src1, 2);
+
+		dst.x = dot4(src0, row0);
+		dst.y = dot4(src0, row1);
+		dst.z = dot4(src0, row2);   // dst.w is not written
+	}
+
+	void PixelProgram::M4X4(Vector4f &dst, Vector4f &src0, const Src &src1)   // Four 4-component dot products of src0 with consecutive registers starting at src1
+	{
+		Vector4f row0 = fetchRegister(src1, 0);   // Row n is register src1.index + n
+		Vector4f row1 = fetchRegister(src1, 1);
+		Vector4f row2 = fetchRegister(src1, 2);
+		Vector4f row3 = fetchRegister(src1, 3);
+
+		dst.x = dot4(src0, row0);
+		dst.y = dot4(src0, row1);
+		dst.z = dot4(src0, row2);
+		dst.w = dot4(src0, row3);
+	}
+
+	void PixelProgram::TEX(Vector4f &dst, Vector4f &src0, const Src &src1, bool project, bool bias)   // Samples sampler src1 at coordinates src0, optionally projected or biased
+	{
+		if(project)   // Note: the 'bias' flag is not consulted on this path
+		{
+			Vector4f proj;
+			Float4 rw = reciprocal(src0.w);
+			proj.x = src0.x * rw;   // Divide x, y, z by w; proj.w is left unassigned
+			proj.y = src0.y * rw;
+			proj.z = src0.z * rw;
+
+			dst = sampleTexture(src1, proj, src0.x, (src0), (src0), (src0), Implicit);   // src0.x / src0 fill the bias, derivative and offset parameters (presumably unused for Implicit - confirm)
+		}
+		else
+		{
+			dst = sampleTexture(src1, src0, src0.x, (src0), (src0), (src0), bias ? Bias : Implicit);   // NOTE(review): with Bias, the value passed as bias is src0.x - confirm against sampleTexture
+		}
+	}
+
+	void PixelProgram::TEXOFFSET(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &offset)   // Implicit-LOD sample with a texel offset
+	{
+		dst = sampleTexture(src1, src0, (src0.x), (src0), (src0), offset, {Implicit, Offset});   // src0.x / src0 fill the bias and derivative parameters
+	}
+
+	void PixelProgram::TEXLODOFFSET(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &offset, Float4 &lod)   // Explicit-LOD sample with a texel offset
+	{
+		dst = sampleTexture(src1, src0, lod, (src0), (src0), offset, {Lod, Offset});   // 'lod' travels in the bias parameter; src0 fills the derivative parameters
+	}
+
+	void PixelProgram::TEXBIAS(Vector4f &dst, Vector4f &src0, const Src &src1, Float4 &bias)   // Sample using the Bias sampler function
+	{
+		dst = sampleTexture(src1, src0, bias, (src0), (src0), (src0), Bias);   // src0 fills the derivative and offset parameters
+	}
+
+	void PixelProgram::TEXOFFSETBIAS(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &offset, Float4 &bias)   // Biased sample with a texel offset
+	{
+		dst = sampleTexture(src1, src0, bias, (src0), (src0), offset, {Bias, Offset});   // src0 fills the derivative parameters
+	}
+
+	void PixelProgram::TEXELFETCH(Vector4f &dst, Vector4f &src0, const Src& src1, Float4 &lod)   // Sample using the Fetch sampler function
+	{
+		dst = sampleTexture(src1, src0, lod, (src0), (src0), (src0), Fetch);   // 'lod' travels in the bias parameter; src0 fills the derivative and offset parameters
+	}
+
+	void PixelProgram::TEXELFETCHOFFSET(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &offset, Float4 &lod)   // Fetch with a texel offset
+	{
+		dst = sampleTexture(src1, src0, lod, (src0), (src0), offset, {Fetch, Offset});   // 'lod' travels in the bias parameter; src0 fills the derivative parameters
+	}
+
+	void PixelProgram::TEXGRAD(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &dsx, Vector4f &dsy)   // Sample with caller-supplied derivatives dsx/dsy
+	{
+		dst = sampleTexture(src1, src0, (src0.x), dsx, dsy, (src0), Grad);   // src0.x / src0 fill the bias and offset parameters
+	}
+
+	void PixelProgram::TEXGRADOFFSET(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &dsx, Vector4f &dsy, Vector4f &offset)   // Sample with caller-supplied derivatives and a texel offset
+	{
+		dst = sampleTexture(src1, src0, (src0.x), dsx, dsy, offset, {Grad, Offset});   // src0.x fills the bias parameter
+	}
+
+	void PixelProgram::TEXLOD(Vector4f &dst, Vector4f &src0, const Src &src1, Float4 &lod)   // Sample using the Lod sampler function
+	{
+		dst = sampleTexture(src1, src0, lod, (src0), (src0), (src0), Lod);   // 'lod' travels in the bias parameter; src0 fills the derivative and offset parameters
+	}
+
+	void PixelProgram::TEXSIZE(Vector4f &dst, Float4 &lod, const Src &src1)   // Queries the texture size for sampler src1 at level 'lod'
+	{
+		Pointer<Byte> texture = data + OFFSET(DrawData, mipmap) + src1.index * sizeof(Texture);   // Texture entry src1.index inside DrawData::mipmap; only the static sampler index is used
+		dst = SamplerCore::textureSize(texture, lod);
+	}
+
+	void PixelProgram::TEXKILL(Int cMask[4], Vector4f &src, unsigned char mask)   // Removes lanes from the coverage masks based on the components of 'src' selected by 'mask'
+	{
+		Int kill = -1;   // Starts with all bits set; each selected component can only clear bits
+
+		if(mask & 0x1) kill &= SignMask(CmpNLT(src.x, Float4(0.0f)));   // A lane stays set only where the component is not less than zero
+		if(mask & 0x2) kill &= SignMask(CmpNLT(src.y, Float4(0.0f)));
+		if(mask & 0x4) kill &= SignMask(CmpNLT(src.z, Float4(0.0f)));
+		if(mask & 0x8) kill &= SignMask(CmpNLT(src.w, Float4(0.0f)));
+
+		// FIXME: Dynamic branching affects TEXKILL?
+		// if(shader->containsDynamicBranching())
+		// {
+		// kill = ~SignMask(enableMask());
+		// }
+
+		for(unsigned int q = 0; q < state.multiSample; q++)   // Apply the same lane mask to every sample's coverage mask
+		{
+			cMask[q] &= kill;
+		}
+
+		// FIXME: Branch to end of shader if all killed?
+	}
+
+	void PixelProgram::DISCARD(Int cMask[4], const Shader::Instruction *instruction)   // Clears coverage for the lanes that execute this discard
+	{
+		Int kill = 0;   // Without dynamic branching every lane is cleared
+
+		if(shader->containsDynamicBranching())
+		{
+			kill = ~SignMask(enableMask(instruction));   // Keep only the lanes that are not currently enabled
+		}
+
+		for(unsigned int q = 0; q < state.multiSample; q++)   // Apply the same lane mask to every sample's coverage mask
+		{
+			cMask[q] &= kill;
+		}
+
+		// FIXME: Branch to end of shader if all killed?
+	}
+
+	void PixelProgram::DFDX(Vector4f &dst, Vector4f &src)   // Per-component difference between lanes 1 and 0, and between lanes 3 and 2
+	{
+		dst.x = src.x.yyww - src.x.xxzz;   // Result lanes: (l1-l0, l1-l0, l3-l2, l3-l2); presumably lanes form a 2x2 quad with 0,1 on the top row - confirm
+		dst.y = src.y.yyww - src.y.xxzz;
+		dst.z = src.z.yyww - src.z.xxzz;
+		dst.w = src.w.yyww - src.w.xxzz;
+	}
+
+	void PixelProgram::DFDY(Vector4f &dst, Vector4f &src)   // Per-component difference between lanes 2 and 0, and between lanes 3 and 1
+	{
+		dst.x = src.x.zwzw - src.x.xyxy;   // Result lanes: (l2-l0, l3-l1, l2-l0, l3-l1)
+		dst.y = src.y.zwzw - src.y.xyxy;
+		dst.z = src.z.zwzw - src.z.xyxy;
+		dst.w = src.w.zwzw - src.w.xyxy;
+	}
+
+	void PixelProgram::FWIDTH(Vector4f &dst, Vector4f &src)   // Sum of the absolute lane differences computed by DFDX and DFDY
+	{
+		// abs(dFdx(src)) + abs(dFdy(src));
+		dst.x = Abs(src.x.yyww - src.x.xxzz) + Abs(src.x.zwzw - src.x.xyxy);   // Same swizzle patterns as DFDX() and DFDY()
+		dst.y = Abs(src.y.yyww - src.y.xxzz) + Abs(src.y.zwzw - src.y.xyxy);
+		dst.z = Abs(src.z.yyww - src.z.xxzz) + Abs(src.z.zwzw - src.z.xyxy);
+		dst.w = Abs(src.w.yyww - src.w.xxzz) + Abs(src.w.zwzw - src.w.xyxy);
+	}
+
+	void PixelProgram::BREAK()   // Unconditional break for the currently enabled lanes
+	{
+		enableBreak = enableBreak & ~enableStack[enableIndex];   // Clear the break-enable bits of every lane enabled at the current nesting level
+	}
+
+	void PixelProgram::BREAKC(Vector4f &src0, Vector4f &src1, Control control)   // Break for lanes where the comparison of src0.x with src1.x holds
+	{
+		Int4 condition;
+
+		switch(control)   // Only the x components are compared
+		{
+		case Shader::CONTROL_GT: condition = CmpNLE(src0.x, src1.x); break;   // "Not less or equal"
+		case Shader::CONTROL_EQ: condition = CmpEQ(src0.x, src1.x); break;
+		case Shader::CONTROL_GE: condition = CmpNLT(src0.x, src1.x); break;   // "Not less than"
+		case Shader::CONTROL_LT: condition = CmpLT(src0.x, src1.x); break;
+		case Shader::CONTROL_NE: condition = CmpNEQ(src0.x, src1.x); break;
+		case Shader::CONTROL_LE: condition = CmpLE(src0.x, src1.x); break;
+		default:
+			ASSERT(false);   // 'condition' is left unassigned on this path
+		}
+
+		BREAK(condition);
+	}
+
+	void PixelProgram::BREAKP(const Src &predicateRegister) // FIXME: Factor out parts common with BREAKC
+	{
+		Int4 condition = As<Int4>(p0[predicateRegister.swizzle & 0x3]);   // Predicate component chosen by the low two swizzle bits, used as a lane mask
+
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			condition = ~condition;   // Negated predicate
+		}
+
+		BREAK(condition);
+	}
+
+	void PixelProgram::BREAK(Int4 &condition)   // Conditional break; note that 'condition' is modified in place
+	{
+		condition &= enableStack[enableIndex];   // Only currently enabled lanes can break
+
+		enableBreak = enableBreak & ~condition;   // Clear the break-enable bits of the lanes that break
+	}
+
+	void PixelProgram::CONTINUE()   // Continue for the currently enabled lanes
+	{
+		enableContinue = enableContinue & ~enableStack[enableIndex];   // Clear the continue-enable bits of every lane enabled at the current nesting level
+	}
+
+	void PixelProgram::TEST()   // Sets whileTest, which makes enableMask() skip the break/continue/leave masks
+	{
+		whileTest = true;   // Cleared again by ENDWHILE()
+	}
+
+	void PixelProgram::CALL(int labelIndex, int callSiteIndex)   // Unconditional call to the subroutine at label 'labelIndex'
+	{
+		if(!labelBlock[labelIndex])   // The label's block is created on first reference (a call may precede LABEL())
+		{
+			labelBlock[labelIndex] = Nucleus::createBasicBlock();
+		}
+
+		if(callRetBlock[labelIndex].size() > 1)   // Several return sites: record which one to return to (popped by RET())
+		{
+			callStack[stackIndex++] = UInt(callSiteIndex);
+		}
+
+		Int4 restoreLeave = enableLeave;   // Saved so the leave mask can be restored after the call
+
+		Nucleus::createBr(labelBlock[labelIndex]);   // Jump to the subroutine
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);   // Code emitted from here on goes into this call site's return block
+
+		enableLeave = restoreLeave;
+	}
+
+	void PixelProgram::CALLNZ(int labelIndex, int callSiteIndex, const Src &src)
+	{
+		// Conditional call: forward to the variant matching the condition register kind.
+		switch(src.type)
+		{
+		case Shader::PARAMETER_CONSTBOOL: CALLNZb(labelIndex, callSiteIndex, src); break;
+		case Shader::PARAMETER_PREDICATE: CALLNZp(labelIndex, callSiteIndex, src); break;
+		default:
+			ASSERT(false);   // No other condition register kinds are handled
+			break;
+		}
+	}
+
+	void PixelProgram::CALLNZb(int labelIndex, int callSiteIndex, const Src &boolRegister)   // Call taken when the boolean constant register is non-zero
+	{
+		Bool condition = (*Pointer<Byte>(data + OFFSET(DrawData, ps.b[boolRegister.index])) != Byte(0)); // FIXME
+
+		if(boolRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			condition = !condition;   // Negated boolean
+		}
+
+		if(!labelBlock[labelIndex])   // The label's block is created on first reference
+		{
+			labelBlock[labelIndex] = Nucleus::createBasicBlock();
+		}
+
+		if(callRetBlock[labelIndex].size() > 1)   // Several return sites: record which one to return to (popped by RET())
+		{
+			callStack[stackIndex++] = UInt(callSiteIndex);
+		}
+
+		Int4 restoreLeave = enableLeave;   // Saved so the leave mask can be restored after the call
+
+		branch(condition, labelBlock[labelIndex], callRetBlock[labelIndex][callSiteIndex]);   // Enter the subroutine, or go straight to the return block
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);   // Code emitted from here on goes into this call site's return block
+
+		enableLeave = restoreLeave;
+	}
+
+	void PixelProgram::CALLNZp(int labelIndex, int callSiteIndex, const Src &predicateRegister)   // Call executed for lanes whose predicate is set
+	{
+		Int4 condition = As<Int4>(p0[predicateRegister.swizzle & 0x3]);   // Predicate component chosen by the low two swizzle bits, used as a lane mask
+
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			condition = ~condition;   // Negated predicate
+		}
+
+		condition &= enableStack[enableIndex];   // Only currently enabled lanes can take the call
+
+		if(!labelBlock[labelIndex])   // The label's block is created on first reference
+		{
+			labelBlock[labelIndex] = Nucleus::createBasicBlock();
+		}
+
+		if(callRetBlock[labelIndex].size() > 1)   // Several return sites: record which one to return to (popped by RET())
+		{
+			callStack[stackIndex++] = UInt(callSiteIndex);
+		}
+
+		enableIndex++;   // Push the call's lane mask for the duration of the subroutine
+		enableStack[enableIndex] = condition;
+		Int4 restoreLeave = enableLeave;   // Saved so the leave mask can be restored after the call
+
+		Bool notAllFalse = SignMask(condition) != 0;   // Skip the call entirely when no lane takes it
+		branch(notAllFalse, labelBlock[labelIndex], callRetBlock[labelIndex][callSiteIndex]);
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);   // Code emitted from here on goes into this call site's return block
+
+		enableIndex--;   // Pop the call's lane mask
+		enableLeave = restoreLeave;
+	}
+
+	void PixelProgram::ELSE()   // Ends the 'true' part of the innermost IF and begins its 'false' part
+	{
+		ifDepth--;   // Step down to address the innermost open IF; restored at the end
+
+		BasicBlock *falseBlock = ifFalseBlock[ifDepth];
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		if(isConditionalIf[ifDepth])   // IF opened through IF(Int4 &), i.e. with a per-lane mask
+		{
+			Int4 condition = ~enableStack[enableIndex] & enableStack[enableIndex - 1];   // Lanes enabled in the parent scope that did not take the 'true' part
+			Bool notAllFalse = SignMask(condition) != 0;
+
+			branch(notAllFalse, falseBlock, endBlock);   // Skip the 'false' part when no lane needs it
+
+			enableStack[enableIndex] = ~enableStack[enableIndex] & enableStack[enableIndex - 1];   // Same mask as 'condition', stored as the new top of the enable stack
+		}
+		else   // IF opened through IFb(): uniform branch, no lane mask involved
+		{
+			Nucleus::createBr(endBlock);
+			Nucleus::setInsertBlock(falseBlock);
+		}
+
+		ifFalseBlock[ifDepth] = endBlock;   // ENDIF() will branch to and continue in this block
+
+		ifDepth++;
+	}
+
+	void PixelProgram::ENDIF()   // Closes the innermost IF / ELSE construct
+	{
+		ifDepth--;
+
+		BasicBlock *endBlock = ifFalseBlock[ifDepth];   // Either the IF's false block or the end block installed by ELSE()
+
+		Nucleus::createBr(endBlock);
+		Nucleus::setInsertBlock(endBlock);   // Subsequent code is emitted after the construct
+
+		if(isConditionalIf[ifDepth])
+		{
+			enableIndex--;   // Pop the lane mask pushed by IF(Int4 &)
+		}
+	}
+
+	void PixelProgram::ENDLOOP()   // Closes the innermost LOOP
+	{
+		loopRepDepth--;
+
+		aL[loopDepth] = aL[loopDepth] + increment[loopDepth]; // FIXME: +=
+
+		BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+		BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(testBlock);   // Back edge to the iteration test
+		Nucleus::setInsertBlock(endBlock);   // Subsequent code is emitted after the loop
+
+		loopDepth--;
+		enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);   // Re-enable all lanes for break after the loop
+	}
+
+	void PixelProgram::ENDREP()   // Closes the innermost REP; same as ENDLOOP() without the aL increment
+	{
+		loopRepDepth--;
+
+		BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+		BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(testBlock);   // Back edge to the iteration test
+		Nucleus::setInsertBlock(endBlock);   // Subsequent code is emitted after the loop
+
+		loopDepth--;
+		enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);   // Re-enable all lanes for break after the loop
+	}
+
+	void PixelProgram::ENDWHILE()   // Closes the innermost WHILE
+	{
+		loopRepDepth--;
+
+		BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+		BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(testBlock);   // Back edge to the condition test
+		Nucleus::setInsertBlock(endBlock);   // Subsequent code is emitted after the loop
+
+		enableIndex--;   // Pop the lane mask pushed by WHILE()
+		whileTest = false;   // Undo TEST()
+	}
+
+	void PixelProgram::ENDSWITCH()   // Closes the innermost SWITCH
+	{
+		loopRepDepth--;
+
+		BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];   // Block created by SWITCH(); it restores enableBreak
+
+		Nucleus::createBr(endBlock);
+		Nucleus::setInsertBlock(endBlock);   // Subsequent code is emitted after the switch
+	}
+
+	void PixelProgram::IF(const Src &src)
+	{
+		// Selects the IF flavor from the kind of register supplying the condition.
+		switch(src.type)
+		{
+		case Shader::PARAMETER_CONSTBOOL: IFb(src); break;
+		case Shader::PARAMETER_PREDICATE: IFp(src); break;
+		default:
+			{
+				// Any other source: the x component's bits are used as the per-lane mask.
+				Int4 condition = As<Int4>(fetchRegister(src).x);
+				IF(condition);
+			}
+			break;
+		}
+	}
+
+	void PixelProgram::IFb(const Src &boolRegister)   // IF on a boolean constant register: one uniform branch, no lane mask
+	{
+		ASSERT(ifDepth < 24 + 4);   // NOTE(review): ifFalseBlock/isConditionalIf are declared with 24 + 24 entries and IF(Int4 &) has no such check - confirm the intended limit
+
+		Bool condition = (*Pointer<Byte>(data + OFFSET(DrawData, ps.b[boolRegister.index])) != Byte(0)); // FIXME
+
+		if(boolRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			condition = !condition;   // Negated boolean
+		}
+
+		BasicBlock *trueBlock = Nucleus::createBasicBlock();
+		BasicBlock *falseBlock = Nucleus::createBasicBlock();
+
+		branch(condition, trueBlock, falseBlock);
+
+		isConditionalIf[ifDepth] = false;   // Tells ELSE()/ENDIF() that no enable stack entry was pushed
+		ifFalseBlock[ifDepth] = falseBlock;
+
+		ifDepth++;
+	}
+
+	void PixelProgram::IFp(const Src &predicateRegister)   // IF on a predicate register component
+	{
+		Int4 condition = As<Int4>(p0[predicateRegister.swizzle & 0x3]);   // Predicate component chosen by the low two swizzle bits, used as a lane mask
+
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			condition = ~condition;   // Negated predicate
+		}
+
+		IF(condition);
+	}
+
+	void PixelProgram::IFC(Vector4f &src0, Vector4f &src1, Control control)   // IF on a per-lane comparison of src0.x with src1.x
+	{
+		Int4 condition;
+
+		switch(control)   // Only the x components are compared
+		{
+		case Shader::CONTROL_GT: condition = CmpNLE(src0.x, src1.x); break;   // "Not less or equal"
+		case Shader::CONTROL_EQ: condition = CmpEQ(src0.x, src1.x); break;
+		case Shader::CONTROL_GE: condition = CmpNLT(src0.x, src1.x); break;   // "Not less than"
+		case Shader::CONTROL_LT: condition = CmpLT(src0.x, src1.x); break;
+		case Shader::CONTROL_NE: condition = CmpNEQ(src0.x, src1.x); break;
+		case Shader::CONTROL_LE: condition = CmpLE(src0.x, src1.x); break;
+		default:
+			ASSERT(false);   // 'condition' is left unassigned on this path
+		}
+
+		IF(condition);
+	}
+
+	void PixelProgram::IF(Int4 &condition)   // Opens an IF driven by a per-lane mask; note that 'condition' is modified in place
+	{
+		condition &= enableStack[enableIndex];   // Only currently enabled lanes can enter the 'true' part
+
+		enableIndex++;   // Push the new lane mask; popped by ENDIF()
+		enableStack[enableIndex] = condition;
+
+		BasicBlock *trueBlock = Nucleus::createBasicBlock();
+		BasicBlock *falseBlock = Nucleus::createBasicBlock();
+
+		Bool notAllFalse = SignMask(condition) != 0;   // Skip the 'true' part when no lane takes it
+
+		branch(notAllFalse, trueBlock, falseBlock);
+
+		isConditionalIf[ifDepth] = true;   // Tells ELSE()/ENDIF() that an enable stack entry was pushed
+		ifFalseBlock[ifDepth] = falseBlock;
+
+		ifDepth++;
+	}
+
+	void PixelProgram::LABEL(int labelIndex)   // Starts emitting code for the subroutine at label 'labelIndex'
+	{
+		if(!labelBlock[labelIndex])   // Not yet created by an earlier CALL/CALLNZ
+		{
+			labelBlock[labelIndex] = Nucleus::createBasicBlock();
+		}
+
+		Nucleus::setInsertBlock(labelBlock[labelIndex]);
+		currentLabel = labelIndex;   // Read by RET() to find this subroutine's return sites
+	}
+
+	void PixelProgram::LOOP(const Src &integerRegister)   // Opens a counted loop configured by an integer constant register
+	{
+		loopDepth++;
+
+		iteration[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData, ps.i[integerRegister.index][0]));   // Element 0: iteration count
+		aL[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData, ps.i[integerRegister.index][1]));   // Element 1: initial loop counter value
+		increment[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData, ps.i[integerRegister.index][2]));   // Element 2: amount added to aL by ENDLOOP()
+
+		// If(increment[loopDepth] == 0)
+		// {
+		// increment[loopDepth] = 1;
+		// }
+
+		BasicBlock *loopBlock = Nucleus::createBasicBlock();
+		BasicBlock *testBlock = Nucleus::createBasicBlock();
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		loopRepTestBlock[loopRepDepth] = testBlock;   // Looked up again by ENDLOOP()
+		loopRepEndBlock[loopRepDepth] = endBlock;
+
+		// FIXME: jump(testBlock)
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(testBlock);
+
+		branch(iteration[loopDepth] > 0, loopBlock, endBlock);   // Run the body while iterations remain
+		Nucleus::setInsertBlock(loopBlock);
+
+		iteration[loopDepth] = iteration[loopDepth] - 1; // FIXME: --
+
+		loopRepDepth++;
+	}
+
+	void PixelProgram::REP(const Src &integerRegister)   // Opens a repeat block whose count comes from an integer constant register
+	{
+		loopDepth++;
+
+		iteration[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData, ps.i[integerRegister.index][0]));   // Element 0: iteration count
+		aL[loopDepth] = aL[loopDepth - 1];   // The loop counter is carried over from the enclosing level
+
+		BasicBlock *loopBlock = Nucleus::createBasicBlock();
+		BasicBlock *testBlock = Nucleus::createBasicBlock();
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		loopRepTestBlock[loopRepDepth] = testBlock;   // Looked up again by ENDREP()
+		loopRepEndBlock[loopRepDepth] = endBlock;
+
+		// FIXME: jump(testBlock)
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(testBlock);
+
+		branch(iteration[loopDepth] > 0, loopBlock, endBlock);   // Run the body while iterations remain
+		Nucleus::setInsertBlock(loopBlock);
+
+		iteration[loopDepth] = iteration[loopDepth] - 1; // FIXME: --
+
+		loopRepDepth++;
+	}
+
+	void PixelProgram::WHILE(const Src &temporaryRegister)   // Opens a loop that runs while any enabled lane's condition register is set
+	{
+		enableIndex++;   // Reserve an enable stack entry for the loop; popped by ENDWHILE()
+
+		BasicBlock *loopBlock = Nucleus::createBasicBlock();
+		BasicBlock *testBlock = Nucleus::createBasicBlock();
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		loopRepTestBlock[loopRepDepth] = testBlock;   // Looked up again by ENDWHILE()
+		loopRepEndBlock[loopRepDepth] = endBlock;
+
+		Int4 restoreBreak = enableBreak;   // Values on entry to the loop
+		Int4 restoreContinue = enableContinue;
+
+		// TODO: jump(testBlock)
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(testBlock);
+		enableContinue = restoreContinue;   // Emitted in the test block, so the continue mask is reset before every test
+
+		const Vector4f &src = fetchRegister(temporaryRegister);
+		Int4 condition = As<Int4>(src.x);   // The x component's bits are used as the per-lane mask
+		condition &= enableStack[enableIndex - 1];   // Restrict to lanes enabled in the enclosing scope
+		if(shader->containsLeaveInstruction()) condition &= enableLeave;
+		if(shader->containsBreakInstruction()) condition &= enableBreak;
+		enableStack[enableIndex] = condition;
+
+		Bool notAllFalse = SignMask(condition) != 0;   // Leave the loop once no lane remains active
+		branch(notAllFalse, loopBlock, endBlock);
+
+		Nucleus::setInsertBlock(endBlock);
+		enableBreak = restoreBreak;   // Emitted in the end block: the break mask is restored on loop exit
+
+		Nucleus::setInsertBlock(loopBlock);   // The loop body is emitted from here on
+
+		loopRepDepth++;
+	}
+
+	void PixelProgram::SWITCH()   // Opens a switch construct; shares the loop/rep block stacks so it can be closed by ENDSWITCH()
+	{
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		loopRepTestBlock[loopRepDepth] = nullptr;   // A switch has no test block
+		loopRepEndBlock[loopRepDepth] = endBlock;
+
+		Int4 restoreBreak = enableBreak;   // Value on entry to the switch
+
+		BasicBlock *currentBlock = Nucleus::getInsertBlock();
+
+		Nucleus::setInsertBlock(endBlock);
+		enableBreak = restoreBreak;   // Emitted in the end block: the break mask is restored when the switch is left
+
+		Nucleus::setInsertBlock(currentBlock);   // Continue emitting where we were
+
+		loopRepDepth++;
+	}
+
+	void PixelProgram::RET()   // Return from the main program (no label seen yet) or from the current subroutine
+	{
+		if(currentLabel == -1)   // No LABEL() has been processed: this is the main program's return
+		{
+			returnBlock = Nucleus::createBasicBlock();
+			Nucleus::createBr(returnBlock);
+		}
+		else
+		{
+			BasicBlock *unreachableBlock = Nucleus::createBasicBlock();
+
+			if(callRetBlock[currentLabel].size() > 1) // Pop the return destination from the call stack
+			{
+				// FIXME: Encapsulate
+				UInt index = callStack[--stackIndex];
+
+				Value *value = index.loadValue();
+				SwitchCases *switchCases = Nucleus::createSwitch(value, unreachableBlock, (int)callRetBlock[currentLabel].size());   // unreachableBlock is the default target
+
+				for(unsigned int i = 0; i < callRetBlock[currentLabel].size(); i++)   // One case per call site index
+				{
+					Nucleus::addSwitchCase(switchCases, i, callRetBlock[currentLabel][i]);
+				}
+			}
+			else if(callRetBlock[currentLabel].size() == 1) // Jump directly to the unique return destination
+			{
+				Nucleus::createBr(callRetBlock[currentLabel][0]);
+			}
+			else // Function isn't called
+			{
+				Nucleus::createBr(unreachableBlock);
+			}
+
+			Nucleus::setInsertBlock(unreachableBlock);
+			Nucleus::createUnreachable();   // Terminates the block that follows the return
+		}
+	}
+
+	void PixelProgram::LEAVE()   // Marks the currently enabled lanes as having left
+	{
+		enableLeave = enableLeave & ~enableStack[enableIndex];   // Clear the leave-enable bits of every lane enabled at the current nesting level
+
+		// FIXME: Return from function if all instances left
+		// FIXME: Use enableLeave in other control-flow constructs
+	}
+}
diff --git a/src/Pipeline/PixelProgram.hpp b/src/Pipeline/PixelProgram.hpp
new file mode 100644
index 0000000..240938d
--- /dev/null
+++ b/src/Pipeline/PixelProgram.hpp
@@ -0,0 +1,170 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_PixelProgram_hpp
+#define sw_PixelProgram_hpp
+
+#include "PixelRoutine.hpp"
+#include "SamplerCore.hpp"
+
+namespace sw
+{
+	class PixelProgram : public PixelRoutine   // Pixel routine generator that translates a PixelShader's instructions into Reactor code
+	{
+	public:
+		PixelProgram(const PixelProcessor::State &state, const PixelShader *shader) :   // 'shader' is dereferenced unconditionally, so it must not be null
+			PixelRoutine(state, shader), r(shader->indirectAddressableTemporaries),
+			loopDepth(-1), ifDepth(0), loopRepDepth(0), currentLabel(-1), whileTest(false)
+		{
+			for(int i = 0; i < 2048; ++i)   // No label has a basic block until CALL/CALLNZ/LABEL creates one
+			{
+				labelBlock[i] = nullptr;
+			}
+
+			enableStack[0] = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);   // All lanes enabled at the outermost level
+
+			if(shader->containsBreakInstruction())
+			{
+				enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+			}
+
+			if(shader->containsContinueInstruction())
+			{
+				enableContinue = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+			}
+		}
+
+		virtual ~PixelProgram() {}
+
+	protected:
+		virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w);
+		virtual void applyShader(Int cMask[4]);
+		virtual Bool alphaTest(Int cMask[4]);
+		virtual void rasterOperation(Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]);
+
+	private:
+		// Temporary registers
+		RegisterArray<NUM_TEMPORARY_REGISTERS> r;
+
+		// Color outputs
+		Vector4f c[RENDERTARGETS];
+		RegisterArray<RENDERTARGETS, true> oC;
+
+		// Shader variables
+		Vector4f vPos;
+		Vector4f vFace;
+
+		// DX9 specific variables
+		Vector4f p0;   // Predicate register
+		Array<Int, 4> aL;   // Loop counter per loop nesting level
+		Array<Int, 4> increment;   // Per-iteration increment of aL (LOOP only)
+		Array<Int, 4> iteration;   // Remaining iterations per loop nesting level
+
+		Int loopDepth; // FIXME: Add support for switch
+		Int stackIndex; // FIXME: Inc/decrement callStack
+		Array<UInt, 16> callStack;   // Call site indices pushed by CALL/CALLNZ and popped by RET
+
+		// Per pixel based on conditions reached
+		Int enableIndex;   // Index of the current top of enableStack
+		Array<Int4, 1 + 24> enableStack;   // Entry 0 is the outermost, all-enabled level
+		Int4 enableBreak;
+		Int4 enableContinue;
+		Int4 enableLeave;
+
+		Vector4f sampleTexture(const Src &sampler, Vector4f &uvwq, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function);
+		Vector4f sampleTexture(int samplerIndex, Vector4f &uvwq, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function);
+
+		// Raster operations
+		void clampColor(Vector4f oC[RENDERTARGETS]);
+
+		Int4 enableMask(const Shader::Instruction *instruction);
+
+		Vector4f fetchRegister(const Src &src, unsigned int offset = 0);
+		Vector4f readConstant(const Src &src, unsigned int offset = 0);
+		RValue<Pointer<Byte>> uniformAddress(int bufferIndex, unsigned int index);
+		RValue<Pointer<Byte>> uniformAddress(int bufferIndex, unsigned int index, Int& offset);
+		Int relativeAddress(const Shader::Relative &rel, int bufferIndex = -1);
+		Int4 dynamicAddress(const Shader::Relative &rel);
+
+		Float4 linearToSRGB(const Float4 &x);
+
+		// Instructions
+		typedef Shader::Control Control;
+
+		void M3X2(Vector4f &dst, Vector4f &src0, const Src &src1);
+		void M3X3(Vector4f &dst, Vector4f &src0, const Src &src1);
+		void M3X4(Vector4f &dst, Vector4f &src0, const Src &src1);
+		void M4X3(Vector4f &dst, Vector4f &src0, const Src &src1);
+		void M4X4(Vector4f &dst, Vector4f &src0, const Src &src1);
+		void TEX(Vector4f &dst, Vector4f &src0, const Src &src1, bool project, bool bias);
+		void TEXLOD(Vector4f &dst, Vector4f &src0, const Src &src1, Float4 &lod);
+		void TEXBIAS(Vector4f &dst, Vector4f &src0, const Src &src1, Float4 &bias);
+		void TEXSIZE(Vector4f &dst, Float4 &lod, const Src &src1);
+		void TEXKILL(Int cMask[4], Vector4f &src, unsigned char mask);
+		void TEXOFFSET(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &offset);
+		void TEXOFFSETBIAS(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &offset, Float4 &bias);
+		void TEXLODOFFSET(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &offset, Float4 &lod);
+		void TEXELFETCH(Vector4f &dst, Vector4f &src, const Src &, Float4 &lod);
+		void TEXELFETCHOFFSET(Vector4f &dst, Vector4f &src, const Src &, Vector4f &offset, Float4 &lod);
+		void TEXGRAD(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &dsx, Vector4f &dsy);
+		void TEXGRADOFFSET(Vector4f &dst, Vector4f &src, const Src &, Vector4f &dsx, Vector4f &dsy, Vector4f &offset);
+		void DISCARD(Int cMask[4], const Shader::Instruction *instruction);
+		void DFDX(Vector4f &dst, Vector4f &src);
+		void DFDY(Vector4f &dst, Vector4f &src);
+		void FWIDTH(Vector4f &dst, Vector4f &src);
+		void BREAK();
+		void BREAKC(Vector4f &src0, Vector4f &src1, Control);
+		void BREAKP(const Src &predicateRegister);
+		void BREAK(Int4 &condition);
+		void CONTINUE();
+		void TEST();
+		void CALL(int labelIndex, int callSiteIndex);
+		void CALLNZ(int labelIndex, int callSiteIndex, const Src &src);
+		void CALLNZb(int labelIndex, int callSiteIndex, const Src &boolRegister);
+		void CALLNZp(int labelIndex, int callSiteIndex, const Src &predicateRegister);
+		void ELSE();
+		void ENDIF();
+		void ENDLOOP();
+		void ENDREP();
+		void ENDWHILE();
+		void ENDSWITCH();
+		void IF(const Src &src);
+		void IFb(const Src &boolRegister);
+		void IFp(const Src &predicateRegister);
+		void IFC(Vector4f &src0, Vector4f &src1, Control);
+		void IF(Int4 &condition);
+		void LABEL(int labelIndex);
+		void LOOP(const Src &integerRegister);
+		void REP(const Src &integerRegister);
+		void WHILE(const Src &temporaryRegister);
+		void SWITCH();
+		void RET();
+		void LEAVE();
+
+		int ifDepth;   // Number of currently open IF constructs
+		int loopRepDepth;   // Number of currently open LOOP/REP/WHILE/SWITCH constructs
+		int currentLabel;   // Label of the subroutine being emitted, or -1 before the first LABEL
+		bool whileTest;   // Set by TEST, cleared by ENDWHILE
+
+		BasicBlock *ifFalseBlock[24 + 24];   // Per open IF: block to continue in at ELSE/ENDIF
+		BasicBlock *loopRepTestBlock[4];   // Per open loop: block holding the loop test (nullptr for SWITCH)
+		BasicBlock *loopRepEndBlock[4];   // Per open loop/switch: block following the construct
+		BasicBlock *labelBlock[2048];   // Entry block per label, created on first use
+		std::vector<BasicBlock*> callRetBlock[2048];   // Per label: one return block per call site
+		BasicBlock *returnBlock = nullptr;   // Assigned by RET in the main program; null until then instead of indeterminate
+		bool isConditionalIf[24 + 24];   // Per open IF: true when an enableStack entry was pushed
+	};
+}
+
+#endif
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
new file mode 100644
index 0000000..146e42d
--- /dev/null
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -0,0 +1,2724 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "PixelRoutine.hpp"
+
+#include "SamplerCore.hpp"
+#include "Constants.hpp"
+#include "Renderer/Renderer.hpp"
+#include "Renderer/QuadRasterizer.hpp"
+#include "Renderer/Surface.hpp"
+#include "Renderer/Primitive.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+ extern bool complementaryDepthBuffer; // Defined elsewhere; when set, depthTest() mirrors its comparisons and shader depth is taken as 1 - oDepth.
+ extern bool postBlendSRGB; // Defined elsewhere; combined with state.writeSRGB it makes readPixel() linearize the destination color.
+ extern bool exactColorRounding;
+ extern bool forceClearRegisters; // Defined elsewhere; forces the constructor to zero every fragment input register.
+
+ PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader)
+ : QuadRasterizer(state, shader), v(shader && shader->indirectAddressableInput)
+ { // Forwards state/shader to QuadRasterizer; 'v' is constructed with a flag that is true only when a shader exists and reports indirectAddressableInput.
+ if(!shader || shader->getShaderModel() < 0x0200 || forceClearRegisters) // No shader, shader model below 0x0200, or clearing explicitly forced.
+ {
+ for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++) // Zero all four components of every fragment input register.
+ {
+ v[i].x = Float4(0.0f);
+ v[i].y = Float4(0.0f);
+ v[i].z = Float4(0.0f);
+ v[i].w = Float4(0.0f);
+ }
+ }
+ }
+
+ PixelRoutine::~PixelRoutine()
+ { // Intentionally empty: the destructor body performs no work of its own.
+ }
+
+ void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
+ { // Emits the per-quad pipeline in order: stencil test, depth interpolation/test, input interpolation, shader, alpha test, depth and color writes, stencil write.
+ #if PERF_PROFILE
+ Long pipeTime = Ticks();
+ #endif
+ // The depth test may run before shading only when neither state.depthOverride nor the alpha test is active.
+ const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
+
+ Int zMask[4]; // Depth mask
+ Int sMask[4]; // Stencil mask
+ // Both masks start out equal to the incoming coverage mask of each sample.
+ for(unsigned int q = 0; q < state.multiSample; q++)
+ {
+ zMask[q] = cMask[q];
+ sMask[q] = cMask[q];
+ }
+ // Stencil test per sample; it writes sMask[q] (and is a no-op when state.stencilActive is false).
+ for(unsigned int q = 0; q < state.multiSample; q++)
+ {
+ stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
+ }
+
+ Float4 f; // Fog value, only assigned below when state.fog.component is set.
+ Float4 rhwCentroid;
+ // Quad x coordinates: scalar x broadcast plus the per-primitive xQuad offsets.
+ Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
+
+ if(interpolateZ())
+ {
+ for(unsigned int q = 0; q < state.multiSample; q++)
+ {
+ Float4 x = xxxx; // Shadows the Int parameter 'x' inside this loop body only.
+
+ if(state.multiSample > 1)
+ {
+ x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4)); // Per-sample X entry from the Constants table.
+ }
+
+ z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false, state.depthClamp);
+ }
+ }
+
+ Bool depthPass = false;
+
+ if(earlyDepthTest)
+ {
+ for(unsigned int q = 0; q < state.multiSample; q++)
+ {
+ depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]); // Passes if any sample passes.
+ }
+ }
+ // Run the rest when the early test passed, or unconditionally when the depth test is deferred.
+ If(depthPass || Bool(!earlyDepthTest))
+ {
+ #if PERF_PROFILE
+ Long interpTime = Ticks();
+ #endif
+
+ Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
+
+ // Centroid locations
+ Float4 XXXX = Float4(0.0f);
+ Float4 YYYY = Float4(0.0f);
+
+ if(state.centroid)
+ {
+ Float4 WWWW(1.0e-9f); // Small non-zero seed; presumably keeps the reciprocal below finite when no sample is covered.
+ // Accumulate table entries selected by each sample's coverage mask (16-byte stride per mask value).
+ for(unsigned int q = 0; q < state.multiSample; q++)
+ {
+ XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
+ YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
+ WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
+ }
+ // Normalize the accumulated offsets by the accumulated weight, then add the quad position.
+ WWWW = Rcp_pp(WWWW);
+ XXXX *= WWWW;
+ YYYY *= WWWW;
+
+ XXXX += xxxx;
+ YYYY += yyyy;
+ }
+
+ if(interpolateW())
+ {
+ w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false, false);
+ rhw = reciprocal(w, false, false, true);
+
+ if(state.centroid)
+ {
+ rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
+ }
+ }
+ // Interpolate every enabled component of every fragment input, then apply the optional projection.
+ for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+ {
+ for(int component = 0; component < 4; component++)
+ {
+ if(state.interpolant[interpolant].component & (1 << component)) // Bit 'component' set: this component is in use.
+ {
+ if(!state.interpolant[interpolant].centroid)
+ {
+ v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective, false);
+ }
+ else
+ {
+ v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
+ }
+ }
+ }
+
+ Float4 rcp;
+ // 'project' selects the divisor component: 1 -> y, 2 -> z, 3 -> w; the components before it are divided by it.
+ switch(state.interpolant[interpolant].project)
+ {
+ case 0:
+ break;
+ case 1:
+ rcp = reciprocal(v[interpolant].y);
+ v[interpolant].x = v[interpolant].x * rcp;
+ break;
+ case 2:
+ rcp = reciprocal(v[interpolant].z);
+ v[interpolant].x = v[interpolant].x * rcp;
+ v[interpolant].y = v[interpolant].y * rcp;
+ break;
+ case 3:
+ rcp = reciprocal(v[interpolant].w);
+ v[interpolant].x = v[interpolant].x * rcp;
+ v[interpolant].y = v[interpolant].y * rcp;
+ v[interpolant].z = v[interpolant].z * rcp;
+ break;
+ }
+ }
+
+ if(state.fog.component)
+ {
+ f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective, false);
+ }
+
+ setBuiltins(x, y, z, w);
+
+ #if PERF_PROFILE
+ cycles[PERF_INTERP] += Ticks() - interpTime;
+ #endif
+
+ Bool alphaPass = true; // Stays true when no color output is used.
+
+ if(colorUsed())
+ {
+ #if PERF_PROFILE
+ Long shaderTime = Ticks();
+ #endif
+
+ applyShader(cMask);
+
+ #if PERF_PROFILE
+ cycles[PERF_SHADER] += Ticks() - shaderTime;
+ #endif
+
+ alphaPass = alphaTest(cMask);
+ // The shader (kill) or the alpha test may have cleared coverage bits; propagate that to the depth and stencil masks.
+ if((shader && shader->containsKill()) || state.alphaTestActive())
+ {
+ for(unsigned int q = 0; q < state.multiSample; q++)
+ {
+ zMask[q] &= cMask[q];
+ sMask[q] &= cMask[q];
+ }
+ }
+ }
+
+ If(alphaPass)
+ {
+ if(!earlyDepthTest) // Deferred depth test, now that the shader and alpha test have run.
+ {
+ for(unsigned int q = 0; q < state.multiSample; q++)
+ {
+ depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
+ }
+ }
+
+ #if PERF_PROFILE
+ Long ropTime = Ticks();
+ #endif
+ // With an early depth test we are already inside the block guarded by its result, so the condition is forced true.
+ If(depthPass || Bool(earlyDepthTest))
+ {
+ for(unsigned int q = 0; q < state.multiSample; q++)
+ {
+ if(state.multiSampleMask & (1 << q)) // Only samples enabled in the multisample mask are written.
+ {
+ writeDepth(zBuffer, q, x, z[q], zMask[q]);
+
+ if(state.occlusionEnabled)
+ {
+ occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q])); // Table lookup indexed by the combined mask (4-byte entries).
+ }
+ }
+ }
+
+ if(colorUsed())
+ {
+ #if PERF_PROFILE
+ AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
+ #endif
+
+ rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
+ }
+ }
+
+ #if PERF_PROFILE
+ cycles[PERF_ROP] += Ticks() - ropTime;
+ #endif
+ }
+ }
+ // The stencil write is emitted outside the depth/alpha conditionals, so it runs even when those tests fail.
+ for(unsigned int q = 0; q < state.multiSample; q++)
+ {
+ if(state.multiSampleMask & (1 << q))
+ {
+ writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
+ }
+ }
+
+ #if PERF_PROFILE
+ cycles[PERF_PIPE] += Ticks() - pipeTime;
+ #endif
+ }
+
+ Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
+ { // Evaluates the plane equation C + x*A + y*B at (x, y); 'flat' returns C alone, 'perspective' multiplies the result by rhw.
+ Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16); // Constant term.
+
+ if(!flat)
+ {
+ interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
+ y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
+
+ if(perspective) // Only applied to non-flat interpolants.
+ {
+ interpolant *= rhw;
+ }
+ }
+
+ return interpolant;
+ }
+
+ void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
+ { // Emits the stencil comparison for sample q of the quad at x and stores the pass mask, restricted to cMask, in sMask.
+ if(!state.stencilActive)
+ {
+ return; // sMask is left untouched when stencil is inactive.
+ }
+
+ // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
+
+ Pointer<Byte> buffer = sBuffer + 2 * x; // Two bytes per x step.
+
+ if(q > 0)
+ {
+ buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB)); // Advance to the slice of sample q.
+ }
+
+ Byte8 value = *Pointer<Byte8>(buffer);
+ Byte8 valueCCW = value; // Unmasked copy for the counter-clockwise face state.
+
+ if(!state.noStencilMask)
+ {
+ value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
+ }
+
+ stencilTest(value, state.stencilCompareMode, false); // Replaces 'value' with the comparison result.
+
+ if(state.twoSidedStencil)
+ {
+ if(!state.noStencilMaskCCW)
+ {
+ valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
+ }
+
+ stencilTest(valueCCW, state.stencilCompareModeCCW, true);
+ // Select between the two results using the primitive's clockwiseMask / invClockwiseMask.
+ value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
+ valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
+ value |= valueCCW;
+ }
+
+ sMask = SignMask(value) & cMask;
+ }
+
+ void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
+ { // Replaces 'value' (masked stencil buffer bytes) in place with the per-byte result of comparing the reference of face 'CCW' against it.
+ Byte8 equal; // Scratch register for the modes that need a second comparison or operand.
+ // In the case comments below, 'a' is the reference and 'b' the buffer value; adding 0x80 presumably biases bytes so the signed CmpGT orders them as unsigned.
+ switch(stencilCompareMode)
+ {
+ case STENCIL_ALWAYS:
+ value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ break;
+ case STENCIL_NEVER:
+ value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ break;
+ case STENCIL_LESS: // a < b ~ b > a
+ value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
+ break;
+ case STENCIL_EQUAL:
+ value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
+ break;
+ case STENCIL_NOTEQUAL: // a != b ~ !(a == b)
+ value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
+ value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ break;
+ case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
+ equal = value;
+ equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
+ value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
+ value |= equal;
+ break;
+ case STENCIL_GREATER: // a > b
+ equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)); // 'equal' is reused here to hold the signed reference.
+ value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
+ value = equal;
+ break;
+ case STENCIL_GREATEREQUAL: // a >= b ~ !(a < b) ~ !(b > a)
+ value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
+ value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
+ { // Emits the depth comparison for sample q, stores the pass mask in zMask and returns whether any pixel of the quad passed.
+ if(!state.depthTestActive)
+ {
+ return true; // zMask is left untouched when the depth test is inactive.
+ }
+
+ Float4 Z = z;
+ // A shader that overrides depth supplies oDepth instead of the interpolated z.
+ if(shader && shader->depthOverride())
+ {
+ if(complementaryDepthBuffer)
+ {
+ Z = Float4(1.0f) - oDepth;
+ }
+ else
+ {
+ Z = oDepth;
+ }
+ }
+
+ Pointer<Byte> buffer;
+ Int pitch; // Only assigned (and only used) in the non-quad-layout path.
+
+ if(!state.quadLayoutDepthBuffer)
+ {
+ buffer = zBuffer + 4 * x;
+ pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+ }
+ else
+ {
+ buffer = zBuffer + 8 * x;
+ }
+
+ if(q > 0)
+ {
+ buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB)); // Advance to the slice of sample q.
+ }
+
+ Float4 zValue;
+ // The load is skipped only for DEPTH_NEVER with depth writes enabled; for DEPTH_ALWAYS the loaded value is not used by the switch below.
+ if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
+ {
+ if(!state.quadLayoutDepthBuffer)
+ {
+ // FIXME: Properly optimizes?
+ zValue.xy = *Pointer<Float4>(buffer);
+ zValue.zw = *Pointer<Float4>(buffer + pitch - 8); // Offset by -8 so that the .zw lanes presumably map to the first two floats of the next row.
+ }
+ else
+ {
+ zValue = *Pointer<Float4>(buffer, 16);
+ }
+ }
+
+ Int4 zTest;
+ // With complementaryDepthBuffer set, each ordered comparison uses the mirrored operator.
+ switch(state.depthCompareMode)
+ {
+ case DEPTH_ALWAYS:
+ // Optimized
+ break;
+ case DEPTH_NEVER:
+ // Optimized
+ break;
+ case DEPTH_EQUAL:
+ zTest = CmpEQ(zValue, Z);
+ break;
+ case DEPTH_NOTEQUAL:
+ zTest = CmpNEQ(zValue, Z);
+ break;
+ case DEPTH_LESS:
+ if(complementaryDepthBuffer)
+ {
+ zTest = CmpLT(zValue, Z);
+ }
+ else
+ {
+ zTest = CmpNLE(zValue, Z);
+ }
+ break;
+ case DEPTH_GREATEREQUAL:
+ if(complementaryDepthBuffer)
+ {
+ zTest = CmpNLT(zValue, Z);
+ }
+ else
+ {
+ zTest = CmpLE(zValue, Z);
+ }
+ break;
+ case DEPTH_LESSEQUAL:
+ if(complementaryDepthBuffer)
+ {
+ zTest = CmpLE(zValue, Z);
+ }
+ else
+ {
+ zTest = CmpNLT(zValue, Z);
+ }
+ break;
+ case DEPTH_GREATER:
+ if(complementaryDepthBuffer)
+ {
+ zTest = CmpNLE(zValue, Z);
+ }
+ else
+ {
+ zTest = CmpLT(zValue, Z);
+ }
+ break;
+ default:
+ ASSERT(false);
+ }
+ // Second switch turns the comparison result into a mask; ALWAYS/NEVER bypass zTest entirely.
+ switch(state.depthCompareMode)
+ {
+ case DEPTH_ALWAYS:
+ zMask = cMask;
+ break;
+ case DEPTH_NEVER:
+ zMask = 0x0;
+ break;
+ default:
+ zMask = SignMask(zTest) & cMask;
+ break;
+ }
+
+ if(state.stencilActive)
+ {
+ zMask &= sMask; // Pixels that failed the stencil test cannot pass depth.
+ }
+
+ return zMask != 0;
+ }
+
+ void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
+ { // Compares the four 16-bit alpha values against factor.alphaReference4 using state.alphaCompareMode and writes the result mask to aMask.
+ Short4 cmp;
+ Short4 equal;
+ // In the case comments below, 'a' is alpha and 'b' the reference; every non-trivial mode packs 'cmp' and takes its sign mask.
+ switch(state.alphaCompareMode)
+ {
+ case ALPHA_ALWAYS:
+ aMask = 0xF;
+ break;
+ case ALPHA_NEVER:
+ aMask = 0x0;
+ break;
+ case ALPHA_EQUAL:
+ cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
+ aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
+ break;
+ case ALPHA_NOTEQUAL: // a != b ~ !(a == b)
+ cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME
+ aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
+ break;
+ case ALPHA_LESS: // a < b ~ b > a
+ cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
+ aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
+ break;
+ case ALPHA_GREATEREQUAL: // a >= b ~ (a > b) || (a == b) ~ !(b > a) // TODO: Approximate
+ equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
+ cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
+ cmp |= equal;
+ aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
+ break;
+ case ALPHA_LESSEQUAL: // a <= b ~ !(a > b)
+ cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME
+ aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
+ break;
+ case ALPHA_GREATER: // a > b
+ cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
+ aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
+ { // Narrows the coverage mask of each of the four samples to the pixels whose alpha is not less than that sample's threshold (a2c0..a2c3).
+ Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
+ Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
+ Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
+ Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
+ // Collapse each per-pixel comparison result into a bit mask.
+ Int aMask0 = SignMask(coverage0);
+ Int aMask1 = SignMask(coverage1);
+ Int aMask2 = SignMask(coverage2);
+ Int aMask3 = SignMask(coverage3);
+ // All four entries are updated unconditionally, regardless of state.multiSample.
+ cMask[0] &= aMask0;
+ cMask[1] &= aMask1;
+ cMask[2] &= aMask2;
+ cMask[3] &= aMask3;
+ }
+
+ void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
+ { // Blends the RGB of c0 towards the fog color: c0 = (c0 - fogColor) * fog + fogColor. Alpha (c0.w) is not modified.
+ if(!state.fogActive)
+ {
+ return;
+ }
+
+ if(state.pixelFogMode != FOG_NONE)
+ {
+ pixelFog(fog); // Overwrites 'fog' with the per-pixel fog factor.
+ // Clamp the factor to [0, 1].
+ fog = Min(fog, Float4(1.0f));
+ fog = Max(fog, Float4(0.0f));
+ }
+
+ c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
+ c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
+ c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
+
+ c0.x *= fog;
+ c0.y *= fog;
+ c0.z *= fog;
+
+ c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
+ c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
+ c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
+ }
+
+ void PixelRoutine::pixelFog(Float4 &visibility)
+ { // Computes the per-pixel fog factor for state.pixelFogMode into 'visibility'; with FOG_NONE the argument is left unchanged.
+ Float4 &zw = visibility; // Alias: the result is built in place in the caller's variable.
+
+ if(state.pixelFogMode != FOG_NONE)
+ {
+ if(state.wBasedFog)
+ {
+ zw = rhw;
+ }
+ else
+ {
+ if(complementaryDepthBuffer)
+ {
+ zw = Float4(1.0f) - z[0];
+ }
+ else
+ {
+ zw = z[0]; // Only the first sample's depth is used.
+ }
+ }
+ }
+
+ switch(state.pixelFogMode)
+ {
+ case FOG_NONE:
+ break;
+ case FOG_LINEAR: // zw * scale + offset
+ zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
+ zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
+ break;
+ case FOG_EXP: // exponential2(zw * densityE)
+ zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
+ zw = exponential2(zw, true);
+ break;
+ case FOG_EXP2: // exponential2(zw * zw * density2E)
+ zw *= zw;
+ zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
+ zw = exponential2(zw, true);
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
+ { // Writes the quad's depth for sample q, keeping the existing buffer value for pixels not selected by zMask.
+ if(!state.depthWriteEnable)
+ {
+ return;
+ }
+
+ Float4 Z = z;
+ // A shader that overrides depth supplies oDepth instead of the interpolated z.
+ if(shader && shader->depthOverride())
+ {
+ if(complementaryDepthBuffer)
+ {
+ Z = Float4(1.0f) - oDepth;
+ }
+ else
+ {
+ Z = oDepth;
+ }
+ }
+
+ Pointer<Byte> buffer;
+ Int pitch; // Only assigned (and only used) in the non-quad-layout path.
+
+ if(!state.quadLayoutDepthBuffer)
+ {
+ buffer = zBuffer + 4 * x;
+ pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+ }
+ else
+ {
+ buffer = zBuffer + 8 * x;
+ }
+
+ if(q > 0)
+ {
+ buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB)); // Advance to the slice of sample q.
+ }
+
+ Float4 zValue;
+ // depthWriteEnable is known to be true here (early return above), so this condition reduces to depthCompareMode != DEPTH_NEVER.
+ if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
+ {
+ if(!state.quadLayoutDepthBuffer)
+ {
+ // FIXME: Properly optimizes?
+ zValue.xy = *Pointer<Float4>(buffer);
+ zValue.zw = *Pointer<Float4>(buffer + pitch - 8); // Offset by -8 so that the .zw lanes presumably map to the first two floats of the next row.
+ }
+ else
+ {
+ zValue = *Pointer<Float4>(buffer, 16);
+ }
+ }
+ // Bitwise select: new depth where maskD4X[zMask] is set, existing depth where invMaskD4X[zMask] is set.
+ Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
+ zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
+ Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
+
+ if(!state.quadLayoutDepthBuffer)
+ {
+ // FIXME: Properly optimizes?
+ *Pointer<Float2>(buffer) = Float2(Z.xy); // First row of the quad.
+ *Pointer<Float2>(buffer + pitch) = Float2(Z.zw); // Second row, one pitch further.
+ }
+ else
+ {
+ *Pointer<Float4>(buffer, 16) = Z;
+ }
+ }
+
+ void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
+ { // Applies the configured stencil operations for sample q and stores the result, leaving pixels outside cMask unchanged.
+ if(!state.stencilActive)
+ {
+ return;
+ }
+ // Nothing to write when every operation of each face in use is KEEP.
+ if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
+ {
+ if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
+ {
+ return;
+ }
+ }
+ // Likewise when the stencilWriteMasked flag is set for each face in use.
+ if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
+ {
+ return;
+ }
+
+ Pointer<Byte> buffer = sBuffer + 2 * x; // Two bytes per x step.
+
+ if(q > 0)
+ {
+ buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB)); // Advance to the slice of sample q.
+ }
+
+ Byte8 bufferValue = *Pointer<Byte8>(buffer);
+
+ Byte8 newValue;
+ stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
+
+ if(!state.noStencilWriteMask)
+ { // Merge: new bits where writeMaskQ is set, original bits where invWriteMaskQ is set.
+ Byte8 maskedValue = bufferValue;
+ newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
+ maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
+ newValue |= maskedValue;
+ }
+
+ if(state.twoSidedStencil)
+ {
+ Byte8 newValueCCW;
+
+ stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
+
+ if(!state.noStencilWriteMaskCCW)
+ {
+ Byte8 maskedValue = bufferValue;
+ newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
+ maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
+ newValueCCW |= maskedValue;
+ }
+ // Select between the two faces' results using the primitive's clockwiseMask / invClockwiseMask.
+ newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
+ newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
+ newValue |= newValueCCW;
+ }
+ // Keep the original buffer contents for pixels not covered by cMask (8-byte table entries indexed by the mask).
+ newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
+ bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
+ newValue |= bufferValue;
+
+ *Pointer<Byte4>(buffer) = Byte4(newValue); // Only four bytes are stored back, although eight were loaded.
+ }
+
+ void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
+ { // Computes the pass / depth-fail / stencil-fail results and merges them per pixel into newValue using zMask and sMask.
+ Byte8 &pass = newValue; // The pass result is built directly in the output.
+ Byte8 fail;
+ Byte8 zFail;
+
+ stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
+ // zFail and fail are only computed when their operation differs, i.e. when the merge below needs them.
+ if(stencilZFailOperation != stencilPassOperation)
+ {
+ stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
+ }
+
+ if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
+ {
+ stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
+ }
+ // When all three operations are identical no merging is needed and 'pass' is already the answer.
+ if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
+ {
+ if(state.depthTestActive && stencilZFailOperation != stencilPassOperation) // zMask valid and values not the same
+ {
+ pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
+ zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
+ pass |= zFail;
+ }
+ // Pixels outside sMask take the stencil-fail result.
+ pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
+ fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
+ pass |= fail;
+ }
+ }
+
+ void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
+ { // Applies a single stencil operation to all eight bytes of bufferValue and writes the result to output; CCW selects the face's reference value.
+ switch(operation)
+ {
+ case OPERATION_KEEP:
+ output = bufferValue;
+ break;
+ case OPERATION_ZERO:
+ output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ break;
+ case OPERATION_REPLACE:
+ output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
+ break;
+ case OPERATION_INCRSAT: // Saturating increment.
+ output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
+ break;
+ case OPERATION_DECRSAT: // Saturating decrement.
+ output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
+ break;
+ case OPERATION_INVERT:
+ output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ break;
+ case OPERATION_INCR: // Plain (non-saturating) increment.
+ output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
+ break;
+ case OPERATION_DECR: // Plain (non-saturating) decrement.
+ output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
+ { // Fills blendFactor.x/y/z with the RGB factor selected by blendFactorActive; 'current' is the source color, 'pixel' the destination. The .w component is never written here.
+ switch(blendFactorActive)
+ {
+ case BLEND_ZERO:
+ // Optimized
+ break;
+ case BLEND_ONE:
+ // Optimized
+ break;
+ case BLEND_SOURCE:
+ blendFactor.x = current.x;
+ blendFactor.y = current.y;
+ blendFactor.z = current.z;
+ break;
+ case BLEND_INVSOURCE:
+ blendFactor.x = Short4(0xFFFFu) - current.x;
+ blendFactor.y = Short4(0xFFFFu) - current.y;
+ blendFactor.z = Short4(0xFFFFu) - current.z;
+ break;
+ case BLEND_DEST:
+ blendFactor.x = pixel.x;
+ blendFactor.y = pixel.y;
+ blendFactor.z = pixel.z;
+ break;
+ case BLEND_INVDEST:
+ blendFactor.x = Short4(0xFFFFu) - pixel.x;
+ blendFactor.y = Short4(0xFFFFu) - pixel.y;
+ blendFactor.z = Short4(0xFFFFu) - pixel.z;
+ break;
+ case BLEND_SOURCEALPHA:
+ blendFactor.x = current.w;
+ blendFactor.y = current.w;
+ blendFactor.z = current.w;
+ break;
+ case BLEND_INVSOURCEALPHA:
+ blendFactor.x = Short4(0xFFFFu) - current.w;
+ blendFactor.y = Short4(0xFFFFu) - current.w;
+ blendFactor.z = Short4(0xFFFFu) - current.w;
+ break;
+ case BLEND_DESTALPHA:
+ blendFactor.x = pixel.w;
+ blendFactor.y = pixel.w;
+ blendFactor.z = pixel.w;
+ break;
+ case BLEND_INVDESTALPHA:
+ blendFactor.x = Short4(0xFFFFu) - pixel.w;
+ blendFactor.y = Short4(0xFFFFu) - pixel.w;
+ blendFactor.z = Short4(0xFFFFu) - pixel.w;
+ break;
+ case BLEND_SRCALPHASAT: // min(source alpha, 1 - destination alpha), compared as unsigned 16-bit values.
+ blendFactor.x = Short4(0xFFFFu) - pixel.w;
+ blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
+ blendFactor.y = blendFactor.x;
+ blendFactor.z = blendFactor.x;
+ break;
+ case BLEND_CONSTANT:
+ blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
+ blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
+ blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
+ break;
+ case BLEND_INVCONSTANT:
+ blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
+ blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
+ blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
+ break;
+ case BLEND_CONSTANTALPHA: // Element [3] of the constant is replicated to all three channels.
+ blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
+ blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
+ blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
+ break;
+ case BLEND_INVCONSTANTALPHA:
+ blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+ blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+ blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
+ { // Fills blendFactor.w with the alpha factor selected by blendFactorAlphaActive; 'current' is the source color, 'pixel' the destination. x/y/z are never written here.
+ switch(blendFactorAlphaActive)
+ {
+ case BLEND_ZERO:
+ // Optimized
+ break;
+ case BLEND_ONE:
+ // Optimized
+ break;
+ case BLEND_SOURCE: // For the alpha channel the color factors resolve to the matching alpha value.
+ blendFactor.w = current.w;
+ break;
+ case BLEND_INVSOURCE:
+ blendFactor.w = Short4(0xFFFFu) - current.w;
+ break;
+ case BLEND_DEST:
+ blendFactor.w = pixel.w;
+ break;
+ case BLEND_INVDEST:
+ blendFactor.w = Short4(0xFFFFu) - pixel.w;
+ break;
+ case BLEND_SOURCEALPHA:
+ blendFactor.w = current.w;
+ break;
+ case BLEND_INVSOURCEALPHA:
+ blendFactor.w = Short4(0xFFFFu) - current.w;
+ break;
+ case BLEND_DESTALPHA:
+ blendFactor.w = pixel.w;
+ break;
+ case BLEND_INVDESTALPHA:
+ blendFactor.w = Short4(0xFFFFu) - pixel.w;
+ break;
+ case BLEND_SRCALPHASAT: // The alpha factor for this mode is the constant 0xFFFF.
+ blendFactor.w = Short4(0xFFFFu);
+ break;
+ case BLEND_CONSTANT:
+ case BLEND_CONSTANTALPHA:
+ blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
+ break;
+ case BLEND_INVCONSTANT:
+ case BLEND_INVCONSTANTALPHA:
+ blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ bool PixelRoutine::isSRGB(int index) const
+ { // Reports whether the format of render target 'index' is classified as sRGB by Surface::isSRGBformat().
+ return Surface::isSRGBformat(state.targetFormat[index]);
+ }
+
+ void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
+ { // Loads the destination pixels at x from two consecutive rows of render target 'index' into per-channel 16-bit vectors (pixel.x/y/z/w).
+ Short4 c01; // Raw data from the first row (or both rows packed, depending on the format).
+ Short4 c23; // Raw data from the second row.
+ Pointer<Byte> buffer;
+ Pointer<Byte> buffer2;
+ // The second row is always reached by adding colorPitchB[index] to the first row's address.
+ switch(state.targetFormat[index])
+ {
+ case FORMAT_R5G6B5:
+ buffer = cBuffer + 2 * x;
+ buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
+ // Mask out each field and shift it up so it occupies the most significant bits of its 16-bit lane.
+ pixel.x = c01 & Short4(0xF800u);
+ pixel.y = (c01 & Short4(0x07E0u)) << 5;
+ pixel.z = (c01 & Short4(0x001Fu)) << 11;
+ pixel.w = Short4(0xFFFFu);
+ break;
+ case FORMAT_A8R8G8B8: // Unpack sequence presumably transposes the interleaved bytes into channels, then widens each byte by pairing it with itself.
+ buffer = cBuffer + 4 * x;
+ c01 = *Pointer<Short4>(buffer);
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ c23 = *Pointer<Short4>(buffer);
+ pixel.z = c01;
+ pixel.y = c01;
+ pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+ pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+ pixel.x = pixel.z;
+ pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+ pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+ pixel.y = pixel.z;
+ pixel.w = pixel.x;
+ pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
+ pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+ pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
+ pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+ break;
+ case FORMAT_A8B8G8R8: // Same load and transpose as A8R8G8B8; only the final x/z assignments are swapped.
+ case FORMAT_SRGB8_A8:
+ buffer = cBuffer + 4 * x;
+ c01 = *Pointer<Short4>(buffer);
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ c23 = *Pointer<Short4>(buffer);
+ pixel.z = c01;
+ pixel.y = c01;
+ pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+ pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+ pixel.x = pixel.z;
+ pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+ pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+ pixel.y = pixel.z;
+ pixel.w = pixel.x;
+ pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
+ pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+ pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+ pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+ break;
+ case FORMAT_A8: // One 16-bit load per row goes into alpha; the color channels are set to zero.
+ buffer = cBuffer + 1 * x;
+ pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
+ pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+ pixel.x = Short4(0x0000);
+ pixel.y = Short4(0x0000);
+ pixel.z = Short4(0x0000);
+ break;
+ case FORMAT_R8: // One 16-bit load per row goes into red; green/blue are zero and alpha is 0xFFFF.
+ buffer = cBuffer + 1 * x;
+ pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
+ pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
+ pixel.y = Short4(0x0000);
+ pixel.z = Short4(0x0000);
+ pixel.w = Short4(0xFFFFu);
+ break;
+ case FORMAT_X8R8G8B8: // As A8R8G8B8, but alpha is forced to 0xFFFF instead of being read.
+ buffer = cBuffer + 4 * x;
+ c01 = *Pointer<Short4>(buffer);
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ c23 = *Pointer<Short4>(buffer);
+ pixel.z = c01;
+ pixel.y = c01;
+ pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+ pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+ pixel.x = pixel.z;
+ pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+ pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+ pixel.y = pixel.z;
+ pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
+ pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+ pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
+ pixel.w = Short4(0xFFFFu);
+ break;
+ case FORMAT_G8R8: // Each 16-bit lane holds two 8-bit channels; each is replicated into both bytes of its own lane.
+ buffer = cBuffer + 2 * x;
+ c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
+ pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
+ pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
+ pixel.z = Short4(0x0000u);
+ pixel.w = Short4(0xFFFFu);
+ break;
+ case FORMAT_X8B8G8R8: // As A8B8G8R8, but alpha is forced to 0xFFFF instead of being read.
+ case FORMAT_SRGB8_X8:
+ buffer = cBuffer + 4 * x;
+ c01 = *Pointer<Short4>(buffer);
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ c23 = *Pointer<Short4>(buffer);
+ pixel.z = c01;
+ pixel.y = c01;
+ pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+ pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+ pixel.x = pixel.z;
+ pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+ pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+ pixel.y = pixel.z;
+ pixel.w = pixel.x;
+ pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
+ pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+ pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+ pixel.w = Short4(0xFFFFu);
+ break;
+ case FORMAT_A8G8R8B8Q: // Not implemented; the commented-out lines are kept as a sketch of the intended reads.
+ UNIMPLEMENTED();
+ // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+ // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+ // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
+ // pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
+ break;
+ case FORMAT_X8G8R8B8Q: // Not implemented; the commented-out lines are kept as a sketch of the intended reads.
+ UNIMPLEMENTED();
+ // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+ // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+ // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
+ // pixel.w = Short4(0xFFFFu);
+ break;
+ case FORMAT_A16B16G16R16: // Two 8-byte pixels per row are loaded as-is, then transposed into per-channel vectors.
+ buffer = cBuffer;
+ pixel.x = *Pointer<Short4>(buffer + 8 * x);
+ pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ pixel.z = *Pointer<Short4>(buffer + 8 * x);
+ pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
+ transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
+ break;
+ case FORMAT_G16R16: // Two 4-byte pixels per row; the unpack sequence separates the two channels.
+ buffer = cBuffer;
+ pixel.x = *Pointer<Short4>(buffer + 4 * x);
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ pixel.y = *Pointer<Short4>(buffer + 4 * x);
+ pixel.z = pixel.x;
+ pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
+ pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
+ pixel.y = pixel.z;
+ pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
+ pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
+ pixel.z = Short4(0xFFFFu); // NOTE(review): the absent blue channel reads as 0xFFFF here, whereas G8R8 uses 0x0000 — confirm this is intended.
+ pixel.w = Short4(0xFFFFu);
+ break;
+ default:
+ ASSERT(false);
+ }
+ // Convert to linear when the target format is sRGB, or when post-blend sRGB writing is active.
+ if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
+ {
+ sRGBtoLinear16_12_16(pixel);
+ }
+ }
+
+	// Emits the 16-bit-per-channel blend of the incoming color with the color
+	// already stored in the render target.
+	//
+	// index   - render target index, forwarded to readPixel().
+	// cBuffer - color buffer pointer, forwarded to readPixel().
+	// current - in/out: the incoming color on entry, the blended color on exit.
+	// x       - horizontal position, forwarded to readPixel().
+	//
+	// Nothing is emitted when state.alphaBlendActive is false. The color
+	// channels (x, y, z) and the alpha channel (w) use independent factors and
+	// blend operations.
+	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
+	{
+		if(!state.alphaBlendActive)
+		{
+			return;
+		}
+
+		// Destination color currently stored in the render target.
+		Vector4s pixel;
+		readPixel(index, cBuffer, x, pixel);
+
+		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
+		Vector4s sourceFactor;
+		Vector4s destFactor;
+
+		blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
+		blendFactor(destFactor, current, pixel, state.destBlendFactor);
+
+		// The factor multiplication is skipped for BLEND_ONE and BLEND_ZERO.
+		// NOTE(review): BLEND_ZERO leaves the operand unscaled here; presumably
+		// the blend state is reduced elsewhere so this is never observable - confirm.
+		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
+		{
+			current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
+			current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
+			current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
+		}
+
+		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
+		{
+			pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
+			pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
+			pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
+		}
+
+		// Combine the scaled source and destination colors (unsigned arithmetic).
+		switch(state.blendOperation)
+		{
+		case BLENDOP_ADD:
+			current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
+			current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
+			current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
+			break;
+		case BLENDOP_SUB:
+			current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
+			current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
+			current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
+			break;
+		case BLENDOP_INVSUB:
+			// Destination minus source.
+			current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
+			current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
+			current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
+			break;
+		case BLENDOP_MIN:
+			current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
+			current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
+			current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
+			break;
+		case BLENDOP_MAX:
+			current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
+			current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
+			current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
+			break;
+		case BLENDOP_SOURCE:
+			// No operation
+			break;
+		case BLENDOP_DEST:
+			current.x = pixel.x;
+			current.y = pixel.y;
+			current.z = pixel.z;
+			break;
+		case BLENDOP_NULL:
+			current.x = Short4(0x0000);
+			current.y = Short4(0x0000);
+			current.z = Short4(0x0000);
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		// Alpha channel: same sequence with the alpha-specific factors and operation.
+		blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
+		blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
+
+		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
+		{
+			current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
+		}
+
+		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
+		{
+			pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
+		}
+
+		switch(state.blendOperationAlpha)
+		{
+		case BLENDOP_ADD:
+			current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
+			break;
+		case BLENDOP_SUB:
+			current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
+			break;
+		case BLENDOP_INVSUB:
+			current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
+			break;
+		case BLENDOP_MIN:
+			current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
+			break;
+		case BLENDOP_MAX:
+			current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
+			break;
+		case BLENDOP_SOURCE:
+			// No operation
+			break;
+		case BLENDOP_DEST:
+			current.w = pixel.w;
+			break;
+		case BLENDOP_NULL:
+			current.w = Short4(0x0000);
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Emits the bitwise logical operation selected by state.logicalOperation
+	// between the incoming color and the color stored in the render target.
+	//
+	// index   - render target index, forwarded to readPixel().
+	// cBuffer - color buffer pointer, forwarded to readPixel().
+	// current - in/out: the incoming (source) color on entry, the result on exit.
+	// x       - horizontal position, forwarded to readPixel().
+	//
+	// Only the x, y and z channels of 'current' are modified; w is left as is.
+	// LOGICALOP_COPY returns immediately without reading the destination.
+	void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
+	{
+		if(state.logicalOperation == LOGICALOP_COPY)
+		{
+			return;
+		}
+
+		// Destination color currently stored in the render target.
+		Vector4s pixel;
+		readPixel(index, cBuffer, x, pixel);
+
+		// In the cases below 'current' is the source and 'pixel' the destination.
+		switch(state.logicalOperation)
+		{
+		case LOGICALOP_CLEAR:
+			current.x = UShort4(0);
+			current.y = UShort4(0);
+			current.z = UShort4(0);
+			break;
+		case LOGICALOP_SET:
+			current.x = UShort4(0xFFFFu);
+			current.y = UShort4(0xFFFFu);
+			current.z = UShort4(0xFFFFu);
+			break;
+		case LOGICALOP_COPY:
+			ASSERT(false);   // Optimized out
+			break;
+		case LOGICALOP_COPY_INVERTED:
+			current.x = ~current.x;
+			current.y = ~current.y;
+			current.z = ~current.z;
+			break;
+		case LOGICALOP_NOOP:
+			// Keep the destination unchanged.
+			current.x = pixel.x;
+			current.y = pixel.y;
+			current.z = pixel.z;
+			break;
+		case LOGICALOP_INVERT:
+			current.x = ~pixel.x;
+			current.y = ~pixel.y;
+			current.z = ~pixel.z;
+			break;
+		case LOGICALOP_AND:
+			current.x = pixel.x & current.x;
+			current.y = pixel.y & current.y;
+			current.z = pixel.z & current.z;
+			break;
+		case LOGICALOP_NAND:
+			current.x = ~(pixel.x & current.x);
+			current.y = ~(pixel.y & current.y);
+			current.z = ~(pixel.z & current.z);
+			break;
+		case LOGICALOP_OR:
+			current.x = pixel.x | current.x;
+			current.y = pixel.y | current.y;
+			current.z = pixel.z | current.z;
+			break;
+		case LOGICALOP_NOR:
+			current.x = ~(pixel.x | current.x);
+			current.y = ~(pixel.y | current.y);
+			current.z = ~(pixel.z | current.z);
+			break;
+		case LOGICALOP_XOR:
+			current.x = pixel.x ^ current.x;
+			current.y = pixel.y ^ current.y;
+			current.z = pixel.z ^ current.z;
+			break;
+		case LOGICALOP_EQUIV:
+			current.x = ~(pixel.x ^ current.x);
+			current.y = ~(pixel.y ^ current.y);
+			current.z = ~(pixel.z ^ current.z);
+			break;
+		case LOGICALOP_AND_REVERSE:
+			// source & ~destination
+			current.x = ~pixel.x & current.x;
+			current.y = ~pixel.y & current.y;
+			current.z = ~pixel.z & current.z;
+			break;
+		case LOGICALOP_AND_INVERTED:
+			// ~source & destination
+			current.x = pixel.x & ~current.x;
+			current.y = pixel.y & ~current.y;
+			current.z = pixel.z & ~current.z;
+			break;
+		case LOGICALOP_OR_REVERSE:
+			// source | ~destination
+			current.x = ~pixel.x | current.x;
+			current.y = ~pixel.y | current.y;
+			current.z = ~pixel.z | current.z;
+			break;
+		case LOGICALOP_OR_INVERTED:
+			// ~source | destination
+			current.x = pixel.x | ~current.x;
+			current.y = pixel.y | ~current.y;
+			current.z = pixel.z | ~current.z;
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Emits the code that stores a 16-bit-per-channel color into render target
+	// 'index', honoring the color write mask and the per-pixel coverage masks.
+	//
+	// index   - render target index; selects state.targetFormat[index] and the
+	//           row pitch DrawData::colorPitchB[index].
+	// cBuffer - pointer to the color buffer row being written.
+	// x       - horizontal position; scaled by the format's bytes per pixel.
+	// current - color to write. It is converted and repacked in place, so its
+	//           contents are not meaningful to the caller afterwards.
+	// sMask   - stencil mask, applied only when state.stencilActive.
+	// zMask   - depth mask, used as the base mask when state.depthTestActive.
+	// cMask   - coverage mask, used as the base mask otherwise.
+	//
+	// Steps: optional linear-to-sRGB conversion, optional rounding adjustment,
+	// packing into the target format's memory layout, then a masked
+	// read-modify-write of two rows (cBuffer and cBuffer + colorPitchB[index]).
+	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
+	{
+		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
+		{
+			linearToSRGB16_12_16(current);
+		}
+
+		if(exactColorRounding)
+		{
+			// Adjust the 16-bit values before the truncating shifts/masks below.
+			switch(state.targetFormat[index])
+			{
+			case FORMAT_R5G6B5:
+				// Add half of the least significant kept bit (5/6/5 bits kept).
+				current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
+				current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
+				current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
+				break;
+			case FORMAT_X8G8R8B8Q:
+			case FORMAT_A8G8R8B8Q:
+			case FORMAT_X8R8G8B8:
+			case FORMAT_X8B8G8R8:
+			case FORMAT_A8R8G8B8:
+			case FORMAT_A8B8G8R8:
+			case FORMAT_SRGB8_X8:
+			case FORMAT_SRGB8_A8:
+			case FORMAT_G8R8:
+			case FORMAT_R8:
+				// c - (c >> 8) + 0x80, applied ahead of the later >> 8.
+				current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
+				current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
+				current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
+				current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
+				break;
+			default:
+				break;
+			}
+		}
+
+		int rgbaWriteMask = state.colorWriteActive(index);
+		// Same mask with bit 0 and bit 2 swapped (bits 1 and 3 kept), for BGRA-ordered layouts.
+		int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
+
+		// Pack the channels into the memory layout of the target format.
+		switch(state.targetFormat[index])
+		{
+		case FORMAT_R5G6B5:
+			{
+				current.x = current.x & Short4(0xF800u);
+				current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
+				current.z = As<UShort4>(current.z) >> 11;
+
+				current.x = current.x | current.y | current.z;
+			}
+			break;
+		case FORMAT_X8G8R8B8Q:
+			UNIMPLEMENTED();
+		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+
+		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
+		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
+			break;
+		case FORMAT_A8G8R8B8Q:
+			UNIMPLEMENTED();
+		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+		//	current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+
+		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
+		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
+			break;
+		case FORMAT_X8R8G8B8:
+		case FORMAT_A8R8G8B8:
+			if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
+			{
+				// Alpha is not written: the green value is duplicated into the fourth byte.
+				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+
+				current.z = As<Short4>(PackUnsigned(current.z, current.x));
+				current.y = As<Short4>(PackUnsigned(current.y, current.y));
+
+				current.x = current.z;
+				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+				current.y = current.z;
+				current.z = As<Short4>(UnpackLow(current.z, current.x));
+				current.y = As<Short4>(UnpackHigh(current.y, current.x));
+			}
+			else
+			{
+				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+
+				current.z = As<Short4>(PackUnsigned(current.z, current.x));
+				current.y = As<Short4>(PackUnsigned(current.y, current.w));
+
+				current.x = current.z;
+				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+				current.y = current.z;
+				current.z = As<Short4>(UnpackLow(current.z, current.x));
+				current.y = As<Short4>(UnpackHigh(current.y, current.x));
+			}
+			break;
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+			// Same as above with the first and third channels swapped when packing.
+			if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
+			{
+				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+
+				current.z = As<Short4>(PackUnsigned(current.x, current.z));
+				current.y = As<Short4>(PackUnsigned(current.y, current.y));
+
+				current.x = current.z;
+				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+				current.y = current.z;
+				current.z = As<Short4>(UnpackLow(current.z, current.x));
+				current.y = As<Short4>(UnpackHigh(current.y, current.x));
+			}
+			else
+			{
+				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+
+				current.z = As<Short4>(PackUnsigned(current.x, current.z));
+				current.y = As<Short4>(PackUnsigned(current.y, current.w));
+
+				current.x = current.z;
+				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+				current.y = current.z;
+				current.z = As<Short4>(UnpackLow(current.z, current.x));
+				current.y = As<Short4>(UnpackHigh(current.y, current.x));
+			}
+			break;
+		case FORMAT_G8R8:
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+			current.x = As<Short4>(PackUnsigned(current.x, current.x));
+			current.y = As<Short4>(PackUnsigned(current.y, current.y));
+			current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
+			break;
+		case FORMAT_R8:
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.x = As<Short4>(PackUnsigned(current.x, current.x));
+			break;
+		case FORMAT_A8:
+			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+			current.w = As<Short4>(PackUnsigned(current.w, current.w));
+			break;
+		case FORMAT_G16R16:
+			current.z = current.x;
+			current.x = As<Short4>(UnpackLow(current.x, current.y));
+			current.z = As<Short4>(UnpackHigh(current.z, current.y));
+			current.y = current.z;
+			break;
+		case FORMAT_A16B16G16R16:
+			transpose4x4(current.x, current.y, current.z, current.w);
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		// For the 32-bit formats the packing above leaves the data for the first
+		// written row in current.z and for the second row in current.y.
+		Short4 c01 = current.z;
+		Short4 c23 = current.y;
+
+		Int xMask;   // Combination of all masks
+
+		if(state.depthTestActive)
+		{
+			xMask = zMask;
+		}
+		else
+		{
+			xMask = cMask;
+		}
+
+		if(state.stencilActive)
+		{
+			xMask &= sMask;
+		}
+
+		// Masked read-modify-write. Each case first merges the color write mask
+		// (if partial), then the per-pixel mask selected through xMask.
+		switch(state.targetFormat[index])
+		{
+		case FORMAT_R5G6B5:
+			{
+				Pointer<Byte> buffer = cBuffer + 2 * x;
+				Int value = *Pointer<Int>(buffer);
+
+				// Note: these Int locals intentionally shadow the Short4 c01/c23 above.
+				Int c01 = Extract(As<Int2>(current.x), 0);
+
+				if((bgraWriteMask & 0x00000007) != 0x00000007)
+				{
+					Int masked = value;
+					c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
+					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
+					c01 |= masked;
+				}
+
+				c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
+				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
+				c01 |= value;
+				*Pointer<Int>(buffer) = c01;
+
+				// Second row.
+				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+				value = *Pointer<Int>(buffer);
+
+				Int c23 = Extract(As<Int2>(current.x), 1);
+
+				if((bgraWriteMask & 0x00000007) != 0x00000007)
+				{
+					Int masked = value;
+					c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
+					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
+					c23 |= masked;
+				}
+
+				c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
+				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
+				c23 |= value;
+				*Pointer<Int>(buffer) = c23;
+			}
+			break;
+		case FORMAT_A8G8R8B8Q:
+		case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
+			UNIMPLEMENTED();
+		//	value = *Pointer<Short4>(cBuffer + 8 * x + 0);
+
+		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
+		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
+		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
+		//	{
+		//		Short4 masked = value;
+		//		c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+		//		c01 |= masked;
+		//	}
+
+		//	c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+		//	c01 |= value;
+		//	*Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
+
+		//	value = *Pointer<Short4>(cBuffer + 8 * x + 8);
+
+		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
+		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
+		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
+		//	{
+		//		Short4 masked = value;
+		//		c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+		//		c23 |= masked;
+		//	}
+
+		//	c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+		//	c23 |= value;
+		//	*Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
+			break;
+		case FORMAT_A8R8G8B8:
+		case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
+			{
+				Pointer<Byte> buffer = cBuffer + x * 4;
+				Short4 value = *Pointer<Short4>(buffer);
+
+				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
+				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
+				    (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
+				{
+					Short4 masked = value;
+					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+					c01 |= masked;
+				}
+
+				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+				c01 |= value;
+				*Pointer<Short4>(buffer) = c01;
+
+				// Second row.
+				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+				value = *Pointer<Short4>(buffer);
+
+				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
+				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
+				    (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
+				{
+					Short4 masked = value;
+					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+					c23 |= masked;
+				}
+
+				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+				c23 |= value;
+				*Pointer<Short4>(buffer) = c23;
+			}
+			break;
+		case FORMAT_A8B8G8R8:
+		case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+			{
+				Pointer<Byte> buffer = cBuffer + x * 4;
+				Short4 value = *Pointer<Short4>(buffer);
+
+				// True when only a subset of the format's channels is written.
+				// (Renamed from 'masked' so the Short4 locals below no longer shadow it.)
+				bool needsChannelMask = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
+				                        (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
+				                         ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F)));   // FIXME: Need for masking when XBGR && Fh?
+
+				if(needsChannelMask)
+				{
+					Short4 masked = value;
+					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
+					c01 |= masked;
+				}
+
+				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+				c01 |= value;
+				*Pointer<Short4>(buffer) = c01;
+
+				// Second row.
+				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+				value = *Pointer<Short4>(buffer);
+
+				if(needsChannelMask)
+				{
+					Short4 masked = value;
+					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
+					c23 |= masked;
+				}
+
+				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+				c23 |= value;
+				*Pointer<Short4>(buffer) = c23;
+			}
+			break;
+		case FORMAT_G8R8:
+			if((rgbaWriteMask & 0x00000003) != 0x0)
+			{
+				Pointer<Byte> buffer = cBuffer + 2 * x;
+				Int2 value;
+				value = Insert(value, *Pointer<Int>(buffer), 0);
+				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+				value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
+
+				Int2 packedCol = As<Int2>(current.x);
+
+				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
+				if((rgbaWriteMask & 0x3) != 0x3)
+				{
+					// 5 * m maps a 2-bit mask m onto both byte pairs of the maskB4Q index (0b01 -> 0b0101, 0b10 -> 0b1010).
+					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
+					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+					mergedMask &= rgbaMask;
+				}
+
+				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
+
+				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
+				*Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
+			}
+			break;
+		case FORMAT_R8:
+			if(rgbaWriteMask & 0x00000001)
+			{
+				Pointer<Byte> buffer = cBuffer + 1 * x;
+				Short4 value;
+				value = Insert(value, *Pointer<Short>(buffer), 0);
+				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
+
+				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
+				current.x |= value;
+
+				*Pointer<Short>(buffer) = Extract(current.x, 0);
+				*Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
+			}
+			break;
+		case FORMAT_A8:
+			if(rgbaWriteMask & 0x00000008)
+			{
+				Pointer<Byte> buffer = cBuffer + 1 * x;
+				Short4 value;
+				value = Insert(value, *Pointer<Short>(buffer), 0);
+				Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
+
+				current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
+				current.w |= value;
+
+				*Pointer<Short>(buffer) = Extract(current.w, 0);
+				*Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
+			}
+			break;
+		case FORMAT_G16R16:
+			{
+				Pointer<Byte> buffer = cBuffer + 4 * x;
+
+				Short4 value = *Pointer<Short4>(buffer);
+
+				if((rgbaWriteMask & 0x00000003) != 0x00000003)
+				{
+					Short4 masked = value;
+					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
+					current.x |= masked;
+				}
+
+				current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+				current.x |= value;
+				*Pointer<Short4>(buffer) = current.x;
+
+				// Second row.
+				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+
+				value = *Pointer<Short4>(buffer);
+
+				if((rgbaWriteMask & 0x00000003) != 0x00000003)
+				{
+					Short4 masked = value;
+					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
+					current.y |= masked;
+				}
+
+				current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+				current.y |= value;
+				*Pointer<Short4>(buffer) = current.y;
+			}
+			break;
+		case FORMAT_A16B16G16R16:
+			{
+				// After the transpose above, each of x/y/z/w holds one whole 64-bit pixel;
+				// maskQ0Q..maskQ3Q select the per-pixel mask for each of them in turn.
+				Pointer<Byte> buffer = cBuffer + 8 * x;
+
+				{
+					Short4 value = *Pointer<Short4>(buffer);
+
+					if(rgbaWriteMask != 0x0000000F)
+					{
+						Short4 masked = value;
+						current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+						current.x |= masked;
+					}
+
+					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
+					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
+					current.x |= value;
+					*Pointer<Short4>(buffer) = current.x;
+				}
+
+				{
+					Short4 value = *Pointer<Short4>(buffer + 8);
+
+					if(rgbaWriteMask != 0x0000000F)
+					{
+						Short4 masked = value;
+						current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+						current.y |= masked;
+					}
+
+					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
+					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
+					current.y |= value;
+					*Pointer<Short4>(buffer + 8) = current.y;
+				}
+
+				// Second row.
+				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+
+				{
+					Short4 value = *Pointer<Short4>(buffer);
+
+					if(rgbaWriteMask != 0x0000000F)
+					{
+						Short4 masked = value;
+						current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+						current.z |= masked;
+					}
+
+					current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
+					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
+					current.z |= value;
+					*Pointer<Short4>(buffer) = current.z;
+				}
+
+				{
+					Short4 value = *Pointer<Short4>(buffer + 8);
+
+					if(rgbaWriteMask != 0x0000000F)
+					{
+						Short4 masked = value;
+						current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+						current.w |= masked;
+					}
+
+					current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
+					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
+					current.w |= value;
+					*Pointer<Short4>(buffer + 8) = current.w;
+				}
+			}
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Computes the floating-point color (x, y, z) blend factor for the given
+	// blend factor mode.
+	//
+	// blendFactor       - output; only x, y and z are written, w is untouched.
+	// oC                - incoming (source) color.
+	// pixel             - color read from the render target (destination).
+	// blendFactorActive - factor mode to evaluate.
+	//
+	// BLEND_ZERO and BLEND_ONE write nothing.
+	void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
+	{
+		if(blendFactorActive == BLEND_ZERO || blendFactorActive == BLEND_ONE)
+		{
+			return;   // Optimized
+		}
+
+		switch(blendFactorActive)
+		{
+		// Per-channel factors.
+		case BLEND_SOURCE:
+			blendFactor.x = oC.x;
+			blendFactor.y = oC.y;
+			blendFactor.z = oC.z;
+			break;
+		case BLEND_INVSOURCE:
+			blendFactor.x = Float4(1.0f) - oC.x;
+			blendFactor.y = Float4(1.0f) - oC.y;
+			blendFactor.z = Float4(1.0f) - oC.z;
+			break;
+		case BLEND_DEST:
+			blendFactor.x = pixel.x;
+			blendFactor.y = pixel.y;
+			blendFactor.z = pixel.z;
+			break;
+		case BLEND_INVDEST:
+			blendFactor.x = Float4(1.0f) - pixel.x;
+			blendFactor.y = Float4(1.0f) - pixel.y;
+			blendFactor.z = Float4(1.0f) - pixel.z;
+			break;
+
+		// Factors replicated across all three color channels.
+		case BLEND_SOURCEALPHA:
+			blendFactor.x = blendFactor.y = blendFactor.z = oC.w;
+			break;
+		case BLEND_INVSOURCEALPHA:
+			blendFactor.x = blendFactor.y = blendFactor.z = Float4(1.0f) - oC.w;
+			break;
+		case BLEND_DESTALPHA:
+			blendFactor.x = blendFactor.y = blendFactor.z = pixel.w;
+			break;
+		case BLEND_INVDESTALPHA:
+			blendFactor.x = blendFactor.y = blendFactor.z = Float4(1.0f) - pixel.w;
+			break;
+		case BLEND_SRCALPHASAT:
+			// min(1 - destination alpha, source alpha)
+			blendFactor.x = blendFactor.y = blendFactor.z = Min(Float4(1.0f) - pixel.w, oC.w);
+			break;
+
+		// Constant factors, read from the draw data.
+		case BLEND_CONSTANT:
+			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
+			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
+			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
+			break;
+		case BLEND_INVCONSTANT:
+			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
+			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
+			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Computes the floating-point alpha (w) blend factor for the given blend
+	// factor mode.
+	//
+	// blendFactor            - output; only w is written.
+	// oC                     - incoming (source) color.
+	// pixel                  - color read from the render target (destination).
+	// blendFactorAlphaActive - factor mode to evaluate.
+	//
+	// BLEND_ZERO and BLEND_ONE write nothing. For alpha, the color and alpha
+	// variants of a factor (e.g. SOURCE and SOURCEALPHA) give the same result.
+	void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
+	{
+		if(blendFactorAlphaActive == BLEND_ZERO || blendFactorAlphaActive == BLEND_ONE)
+		{
+			return;   // Optimized
+		}
+
+		switch(blendFactorAlphaActive)
+		{
+		case BLEND_SOURCE:
+		case BLEND_SOURCEALPHA:
+			blendFactor.w = oC.w;
+			break;
+		case BLEND_INVSOURCE:
+		case BLEND_INVSOURCEALPHA:
+			blendFactor.w = Float4(1.0f) - oC.w;
+			break;
+		case BLEND_DEST:
+		case BLEND_DESTALPHA:
+			blendFactor.w = pixel.w;
+			break;
+		case BLEND_INVDEST:
+		case BLEND_INVDESTALPHA:
+			blendFactor.w = Float4(1.0f) - pixel.w;
+			break;
+		case BLEND_SRCALPHASAT:
+			// The saturate factor is 1 for the alpha channel.
+			blendFactor.w = Float4(1.0f);
+			break;
+		case BLEND_CONSTANT:
+			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
+			break;
+		case BLEND_INVCONSTANT:
+			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+ void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
+ {
+ if(!state.alphaBlendActive)
+ {
+ return;
+ }
+
+ Pointer<Byte> buffer;
+ Vector4f pixel;
+
+ Vector4s color;
+ Short4 c01;
+ Short4 c23;
+
+ Float4 one;
+ if(Surface::isFloatFormat(state.targetFormat[index]))
+ {
+ one = Float4(1.0f);
+ }
+ else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
+ {
+ one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
+ }
+
+ switch(state.targetFormat[index])
+ {
+ case FORMAT_R32I:
+ case FORMAT_R32UI:
+ case FORMAT_R32F:
+ buffer = cBuffer;
+ // FIXME: movlps
+ pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
+ pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
+ buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+ // FIXME: movhps
+ pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
+ pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
+ pixel.y = pixel.z = pixel.w = one;
+ break;
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ case FORMAT_G32R32F:
+ buffer = cBuffer;
+ pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
+ buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+ pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
+ pixel.z = pixel.x;
+ pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
+ pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
+ pixel.y = pixel.z;
+ pixel.z = pixel.w = one;
+ break;
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_A32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ buffer = cBuffer;
+ pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
+ pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
+ buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+ pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
+ pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
+ transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
+ if(state.targetFormat[index] == FORMAT_X32B32G32R32F ||
+ state.targetFormat[index] == FORMAT_X32B32G32R32F_UNSIGNED)
+ {
+ pixel.w = Float4(1.0f);
+ }
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
+ {
+ sRGBtoLinear(pixel.x);
+ sRGBtoLinear(pixel.y);
+ sRGBtoLinear(pixel.z);
+ }
+
+ // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
+ Vector4f sourceFactor;
+ Vector4f destFactor;
+
+ blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
+ blendFactor(destFactor, oC, pixel, state.destBlendFactor);
+
+ if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
+ {
+ oC.x *= sourceFactor.x;
+ oC.y *= sourceFactor.y;
+ oC.z *= sourceFactor.z;
+ }
+
+ if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
+ {
+ pixel.x *= destFactor.x;
+ pixel.y *= destFactor.y;
+ pixel.z *= destFactor.z;
+ }
+
+ switch(state.blendOperation)
+ {
+ case BLENDOP_ADD:
+ oC.x += pixel.x;
+ oC.y += pixel.y;
+ oC.z += pixel.z;
+ break;
+ case BLENDOP_SUB:
+ oC.x -= pixel.x;
+ oC.y -= pixel.y;
+ oC.z -= pixel.z;
+ break;
+ case BLENDOP_INVSUB:
+ oC.x = pixel.x - oC.x;
+ oC.y = pixel.y - oC.y;
+ oC.z = pixel.z - oC.z;
+ break;
+ case BLENDOP_MIN:
+ oC.x = Min(oC.x, pixel.x);
+ oC.y = Min(oC.y, pixel.y);
+ oC.z = Min(oC.z, pixel.z);
+ break;
+ case BLENDOP_MAX:
+ oC.x = Max(oC.x, pixel.x);
+ oC.y = Max(oC.y, pixel.y);
+ oC.z = Max(oC.z, pixel.z);
+ break;
+ case BLENDOP_SOURCE:
+ // No operation
+ break;
+ case BLENDOP_DEST:
+ oC.x = pixel.x;
+ oC.y = pixel.y;
+ oC.z = pixel.z;
+ break;
+ case BLENDOP_NULL:
+ oC.x = Float4(0.0f);
+ oC.y = Float4(0.0f);
+ oC.z = Float4(0.0f);
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
+ blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
+
+ if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
+ {
+ oC.w *= sourceFactor.w;
+ }
+
+ if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
+ {
+ pixel.w *= destFactor.w;
+ }
+
+ switch(state.blendOperationAlpha)
+ {
+ case BLENDOP_ADD:
+ oC.w += pixel.w;
+ break;
+ case BLENDOP_SUB:
+ oC.w -= pixel.w;
+ break;
+ case BLENDOP_INVSUB:
+ pixel.w -= oC.w;
+ oC.w = pixel.w;
+ break;
+ case BLENDOP_MIN:
+ oC.w = Min(oC.w, pixel.w);
+ break;
+ case BLENDOP_MAX:
+ oC.w = Max(oC.w, pixel.w);
+ break;
+ case BLENDOP_SOURCE:
+ // No operation
+ break;
+ case BLENDOP_DEST:
+ oC.w = pixel.w;
+ break;
+ case BLENDOP_NULL:
+ oC.w = Float4(0.0f);
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask) // Emits the masked write of oC into color target 'index': two pixels at x in the row at cBuffer and two in the row colorPitchB[index] bytes further.
+ { // Note: oC is rearranged in place by the first switch, so its contents are not preserved for the caller.
+ switch(state.targetFormat[index]) // Step 1: bring oC from one-register-per-component into the memory order of the target format.
+ {
+ case FORMAT_R32F:
+ case FORMAT_R32I:
+ case FORMAT_R32UI:
+ case FORMAT_R16I:
+ case FORMAT_R16UI:
+ case FORMAT_R8I:
+ case FORMAT_R8UI:
+ break; // Single-component formats: only oC.x is stored below, no rearrangement needed.
+ case FORMAT_G32R32F:
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ case FORMAT_G16R16I:
+ case FORMAT_G16R16UI:
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8UI:
+ oC.z = oC.x; // oC.z is used as scratch and is clobbered.
+ oC.x = UnpackLow(oC.x, oC.y); // oC.x = UnpackLow of the original x and y.
+ oC.z = UnpackHigh(oC.z, oC.y);
+ oC.y = oC.z; // oC.y = UnpackHigh of the original x and y.
+ break;
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_A32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8UI:
+ transpose4x4(oC.x, oC.y, oC.z, oC.w); // After this, each of oC.x..oC.w is stored below as one pixel.
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ int rgbaWriteMask = state.colorWriteActive(index); // Component write mask: bit 0 is tested for the first component, 0x3 for two components, 0xF for all four.
+
+ Int xMask; // Combination of all masks
+
+ if(state.depthTestActive)
+ {
+ xMask = zMask; // With depth testing active the depth mask is the starting point...
+ }
+ else
+ {
+ xMask = cMask; // ...otherwise the coverage mask is.
+ }
+
+ if(state.stencilActive)
+ {
+ xMask &= sMask; // Stencil additionally restricts the pixels written.
+ }
+
+ Pointer<Byte> buffer; // Write cursor; moved between the two rows inside each case.
+ Float4 value; // Current buffer contents, used to keep lanes that must not be overwritten.
+
+ switch(state.targetFormat[index]) // Step 2: read-modify-write per format.
+ {
+ case FORMAT_R32F:
+ case FORMAT_R32I:
+ case FORMAT_R32UI:
+ if(rgbaWriteMask & 0x00000001)
+ {
+ buffer = cBuffer + 4 * x; // 4 bytes per pixel
+
+ // FIXME: movlps
+ value.x = *Pointer<Float>(buffer + 0); // First row: pixels x and x + 1.
+ value.y = *Pointer<Float>(buffer + 4);
+
+ buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); // Step to the second row.
+
+ // FIXME: movhps
+ value.z = *Pointer<Float>(buffer + 0); // Second row: pixels x and x + 1.
+ value.w = *Pointer<Float>(buffer + 4);
+
+ oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16)); // Keep new values where the xMask table entry has bits set (presumably one 32-bit lane per pixel).
+ value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16)); // Keep old values in the remaining lanes.
+ oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+
+ // FIXME: movhps
+ *Pointer<Float>(buffer + 0) = oC.x.z; // buffer still points at the second row, so it is written first.
+ *Pointer<Float>(buffer + 4) = oC.x.w;
+
+ buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); // Back to the first row.
+
+ // FIXME: movlps
+ *Pointer<Float>(buffer + 0) = oC.x.x;
+ *Pointer<Float>(buffer + 4) = oC.x.y;
+ }
+ break;
+ case FORMAT_R16I:
+ case FORMAT_R16UI:
+ if(rgbaWriteMask & 0x00000001)
+ {
+ buffer = cBuffer + 2 * x; // 2 bytes per pixel
+
+ UShort4 xyzw; // Read as an Insert operand before being assigned; both Int2 lanes are overwritten by the two Inserts below.
+ xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0)); // First row: two 16-bit pixels loaded as one Int.
+
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+ xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1)); // Second row.
+ value = As<Float4>(Int4(xyzw)); // Widen the four 16-bit values to 32-bit lanes so the same 32-bit masks can be used.
+
+ oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
+ value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
+ oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+
+ if(state.targetFormat[index] == FORMAT_R16I)
+ {
+ Float component = oC.x.z; // The lane holds integer bits; As<Int> reinterprets them and Short() narrows to 16 bits.
+ *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
+ component = oC.x.w;
+ *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
+
+ buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); // Back to the first row.
+
+ component = oC.x.x;
+ *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
+ component = oC.x.y;
+ *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
+ }
+ else // FORMAT_R16UI
+ {
+ Float component = oC.x.z; // Same sequence as the signed branch, narrowing through UShort instead.
+ *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
+ component = oC.x.w;
+ *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
+
+ buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+ component = oC.x.x;
+ *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
+ component = oC.x.y;
+ *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
+ }
+ }
+ break;
+ case FORMAT_R8I:
+ case FORMAT_R8UI:
+ if(rgbaWriteMask & 0x00000001)
+ {
+ buffer = cBuffer + x; // 1 byte per pixel
+
+ UInt xyzw, packedCol;
+
+ xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF; // Low 16 bits: the two first-row pixels.
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ xyzw |= UInt(*Pointer<UShort>(buffer)) << 16; // High 16 bits: the two second-row pixels.
+
+ Short4 tmpCol = Short4(As<Int4>(oC.x)); // Narrow the four 32-bit lanes to 16 bits...
+ if(state.targetFormat[index] == FORMAT_R8I)
+ {
+ tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol)); // ...then to bytes with the signed pack.
+ }
+ else
+ {
+ tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol)); // ...then to bytes with the unsigned pack.
+ }
+ packedCol = Extract(As<Int2>(tmpCol), 0); // The four packed bytes as one 32-bit value.
+
+ packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) | // New bytes where the xMask entry of maskB4Q has bits set.
+ (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask)); // Old bytes elsewhere.
+
+ *Pointer<UShort>(buffer) = UShort(packedCol >> 16); // Second row first (buffer still points there).
+ buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ *Pointer<UShort>(buffer) = UShort(packedCol);
+ }
+ break;
+ case FORMAT_G32R32F:
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ buffer = cBuffer + 8 * x; // 8 bytes per pixel; unlike the narrower formats this case is not skipped when no component is enabled.
+
+ value = *Pointer<Float4>(buffer); // First row: two pixels of two components each.
+
+ if((rgbaWriteMask & 0x00000003) != 0x00000003) // Only one (or none) of the two components is writable.
+ {
+ Float4 masked = value;
+ oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
+ masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0]))); // Complementary table entry selects the components to keep from the buffer.
+ oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
+ }
+
+ oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16)); // Per-pixel masking for the first row (maskQ01X).
+ value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
+ oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+ *Pointer<Float4>(buffer) = oC.x;
+
+ buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); // Second row.
+
+ value = *Pointer<Float4>(buffer);
+
+ if((rgbaWriteMask & 0x00000003) != 0x00000003)
+ {
+ Float4 masked;
+
+ masked = value;
+ oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
+ masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
+ oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
+ }
+
+ oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16)); // Per-pixel masking for the second row (maskQ23X).
+ value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
+ oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
+ *Pointer<Float4>(buffer) = oC.y;
+ break;
+ case FORMAT_G16R16I:
+ case FORMAT_G16R16UI:
+ if((rgbaWriteMask & 0x00000003) != 0x0)
+ {
+ buffer = cBuffer + 4 * x; // 4 bytes per pixel
+
+ UInt2 rgbaMask; // Assigned only when the component mask is partial; the second-row use is guarded by the same condition.
+ UShort4 packedCol = UShort4(As<Int4>(oC.x)); // Narrow the four 32-bit lanes of the first row to 16 bits.
+ UShort4 value = *Pointer<UShort4>(buffer); // Shadows the outer Float4 'value' for the rest of this block.
+ UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8); // Per-pixel mask for the first row.
+ if((rgbaWriteMask & 0x3) != 0x3)
+ {
+ Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0])); // 32 bits of the 16-bit-lane mask table = one pixel's two components.
+ rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask)); // Same component mask for both pixels of a row.
+ mergedMask &= rgbaMask;
+ }
+ *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
+
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); // Second row.
+
+ packedCol = UShort4(As<Int4>(oC.y));
+ value = *Pointer<UShort4>(buffer);
+ mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8); // Per-pixel mask for the second row.
+ if((rgbaWriteMask & 0x3) != 0x3)
+ {
+ mergedMask &= rgbaMask;
+ }
+ *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
+ }
+ break;
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8UI:
+ if((rgbaWriteMask & 0x00000003) != 0x0)
+ {
+ buffer = cBuffer + 2 * x; // 2 bytes per pixel
+
+ Int2 xyzw, packedCol; // xyzw is read as an Insert operand before assignment; both lanes are overwritten below.
+
+ xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0); // NOTE(review): loads 4 bytes per row although only 4 bytes per row are written back - matches the stores below.
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
+
+ if(state.targetFormat[index] == FORMAT_G8R8I)
+ {
+ packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y)))); // oC.x supplies lane 0 (first row), oC.y lane 1 (second row).
+ }
+ else
+ {
+ packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+ }
+
+ UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8); // Per-pixel mask at 16-bit (one pixel) granularity.
+ if((rgbaWriteMask & 0x3) != 0x3)
+ {
+ Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0])); // 5 * m == m | (m << 2) for a 2-bit m: the component mask is repeated for the second pixel of the row.
+ UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+ mergedMask &= rgbaMask;
+ }
+
+ packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
+
+ *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1)); // Second row first (buffer still points there).
+ buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
+ }
+ break;
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_A32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ buffer = cBuffer + 16 * x; // 16 bytes per pixel; the four braced sections below handle one pixel each.
+
+ { // First row, pixel x: oC.x
+ value = *Pointer<Float4>(buffer, 16);
+
+ if(rgbaWriteMask != 0x0000000F)
+ {
+ Float4 masked = value;
+ oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); // Here maskD4X is indexed by the component mask: one 32-bit lane per component.
+ masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+ oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
+ }
+
+ oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16)); // Whole-pixel mask for pixel 0.
+ value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
+ oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+ *Pointer<Float4>(buffer, 16) = oC.x;
+ }
+
+ { // First row, pixel x + 1: oC.y
+ value = *Pointer<Float4>(buffer + 16, 16);
+
+ if(rgbaWriteMask != 0x0000000F)
+ {
+ Float4 masked = value;
+ oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+ masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+ oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
+ }
+
+ oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16)); // Whole-pixel mask for pixel 1.
+ value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
+ oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
+ *Pointer<Float4>(buffer + 16, 16) = oC.y;
+ }
+
+ buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); // Second row.
+
+ { // Second row, pixel x: oC.z
+ value = *Pointer<Float4>(buffer, 16);
+
+ if(rgbaWriteMask != 0x0000000F)
+ {
+ Float4 masked = value;
+ oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+ masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+ oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
+ }
+
+ oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16)); // Whole-pixel mask for pixel 2.
+ value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
+ oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
+ *Pointer<Float4>(buffer, 16) = oC.z;
+ }
+
+ { // Second row, pixel x + 1: oC.w
+ value = *Pointer<Float4>(buffer + 16, 16);
+
+ if(rgbaWriteMask != 0x0000000F)
+ {
+ Float4 masked = value;
+ oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+ masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+ oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
+ }
+
+ oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16)); // Whole-pixel mask for pixel 3.
+ value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
+ oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
+ *Pointer<Float4>(buffer + 16, 16) = oC.w;
+ }
+ break;
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_A16B16G16R16UI:
+ if((rgbaWriteMask & 0x0000000F) != 0x0)
+ {
+ buffer = cBuffer + 8 * x; // 8 bytes per pixel
+
+ UInt4 rgbaMask; // Assigned only when the component mask is partial; the second-row use is guarded by the same condition.
+ UShort8 value = *Pointer<UShort8>(buffer); // Shadows the outer Float4 'value'; first row, two pixels.
+ UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))); // Pixels 0 and 1 narrowed to 16 bits per component.
+ UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16); // Per-pixel mask for the first row.
+ if((rgbaWriteMask & 0xF) != 0xF)
+ {
+ UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0])); // Component mask at 16-bit granularity for one pixel.
+ rgbaMask = UInt4(tmpMask, tmpMask); // Repeated for the second pixel of the row.
+ mergedMask &= rgbaMask;
+ }
+ *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
+
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); // Second row.
+
+ value = *Pointer<UShort8>(buffer);
+ packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w))); // Pixels 2 and 3.
+ mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16); // Per-pixel mask for the second row.
+ if((rgbaWriteMask & 0xF) != 0xF)
+ {
+ mergedMask &= rgbaMask;
+ }
+ *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
+ }
+ break;
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8UI:
+ if((rgbaWriteMask & 0x0000000F) != 0x0)
+ {
+ UInt2 value, packedCol, mergedMask; // 'value' shadows the outer Float4 'value'.
+
+ buffer = cBuffer + 4 * x; // 4 bytes per pixel
+
+ if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
+ {
+ packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y)))); // Pixels 0 and 1 packed to bytes (signed pack).
+ }
+ else
+ {
+ packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y)))); // Pixels 0 and 1 packed to bytes (unsigned pack).
+ }
+ value = *Pointer<UInt2>(buffer, 16); // NOTE(review): the second argument (16) looks like an alignment hint, but this is an 8-byte access at cBuffer + 4 * x and the matching store passes none - confirm.
+ mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8); // Per-pixel mask for the first row.
+ if(rgbaWriteMask != 0xF)
+ {
+ mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0])); // Component mask at byte granularity.
+ }
+ *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
+
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); // Second row.
+
+ if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
+ {
+ packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w)))); // Pixels 2 and 3.
+ }
+ else
+ {
+ packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
+ }
+ value = *Pointer<UInt2>(buffer, 16); // NOTE(review): same 16 argument as for the first row - confirm.
+ mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8); // Per-pixel mask for the second row.
+ if(rgbaWriteMask != 0xF)
+ {
+ mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
+ }
+ *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
+ }
+ break;
+ default:
+ ASSERT(false); // Any other target format is not handled by this floating-point/integer path.
+ }
+ }
+
+ UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate) // Scales cf by 0xFFFF (65535) and converts the product to a UShort4.
+ {
+ return UShort4(cf * Float4(0xFFFF), saturate); // 'saturate' is handed unchanged to the UShort4 conversion; cf itself is not modified.
+ }
+
+ void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c) // Replaces c.x, c.y and c.z in place by table lookups in Constants::sRGBtoLinear12_16; c.w is left untouched.
+ {
+ Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16); // Base address of the lookup table.
+
+ c.x = As<UShort4>(c.x) >> 4; // Shift as unsigned: drops the low 4 bits so each 16-bit lane becomes a 12-bit table index.
+ c.y = As<UShort4>(c.y) >> 4;
+ c.z = As<UShort4>(c.z) >> 4;
+
+ c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0); // One lane at a time: index * 2 because entries are read as 2-byte Shorts.
+ c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1); // Each lookup reads only its own, not yet replaced, lane.
+ c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
+ c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
+
+ c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
+ c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
+ c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
+ c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
+
+ c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
+ c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
+ c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
+ c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
+ }
+
+ void PixelRoutine::linearToSRGB16_12_16(Vector4s &c) // 16-bit entry point: reduces c.x, c.y, c.z to 12-bit indices, then defers to linearToSRGB12_16(). c.w is left untouched.
+ {
+ c.x = As<UShort4>(c.x) >> 4; // Shift as unsigned: drops the low 4 bits of every 16-bit lane.
+ c.y = As<UShort4>(c.y) >> 4;
+ c.z = As<UShort4>(c.z) >> 4;
+
+ linearToSRGB12_16(c); // Performs the actual table lookups, in place.
+ }
+
+ void PixelRoutine::linearToSRGB12_16(Vector4s &c) // Replaces every lane of c.x, c.y and c.z in place by the Constants::linearToSRGB12_16 entry it indexes; c.w is left untouched.
+ {
+ Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16); // Base address of the lookup table.
+
+ c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0); // Lane value is used directly as index; * 2 because entries are read as 2-byte Shorts.
+ c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1); // Each lookup reads only its own, not yet replaced, lane.
+ c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
+ c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
+
+ c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
+ c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
+ c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
+ c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
+
+ c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
+ c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
+ c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
+ c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
+ }
+
+ Float4 PixelRoutine::sRGBtoLinear(const Float4 &x) // Approximates x^2.2; the result is returned, x itself is never modified
+ {
+     Float4 squared = x * x;
+     Float4 weighted = squared * Float4(0.73f) + squared * x * Float4(0.27f);   // 0.73 * x^2 + 0.27 * x^3
+     Float4 nonNegative = Max(weighted, Float4(0.0f));
+     return Min(nonNegative, Float4(1.0f));   // Clamped to [0, 1]
+ }
+
+ bool PixelRoutine::colorUsed() // True when any of the three state conditions below holds.
+ {
+ return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill; // Non-zero color write mask, active alpha test, or a shader flagged as containing a kill.
+ }
+}
diff --git a/src/Pipeline/PixelRoutine.hpp b/src/Pipeline/PixelRoutine.hpp
new file mode 100644
index 0000000..1cd076e
--- /dev/null
+++ b/src/Pipeline/PixelRoutine.hpp
@@ -0,0 +1,93 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_PixelRoutine_hpp
+#define sw_PixelRoutine_hpp
+
+#include "Renderer/QuadRasterizer.hpp"
+
+namespace sw
+{
+ class PixelShader;
+ class SamplerCore;
+
+ class PixelRoutine : public sw::QuadRasterizer, public ShaderCore   // Abstract: derived classes must implement the four pure virtual hooks below.
+ {
+ public:
+     PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader);
+
+     virtual ~PixelRoutine();
+
+ protected:
+     Float4 z[4];   // Multisampled z
+     Float4 w;      // Used as is
+     Float4 rhw;    // Reciprocal w
+
+     RegisterArray<MAX_FRAGMENT_INPUTS> v;   // Varying registers
+
+     // Depth output
+     Float4 oDepth;
+
+     typedef Shader::SourceParameter Src;
+     typedef Shader::DestinationParameter Dst;
+
+     virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w) = 0;
+     virtual void applyShader(Int cMask[4]) = 0;
+     virtual Bool alphaTest(Int cMask[4]) = 0;
+     virtual void rasterOperation(Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]) = 0;
+
+     virtual void quad(Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y);
+
+     void alphaTest(Int &aMask, Short4 &alpha);
+     void alphaToCoverage(Int cMask[4], Float4 &alpha);
+     void fogBlend(Vector4f &c0, Float4 &fog);
+     void pixelFog(Float4 &visibility);
+
+     // Raster operations (Vector4s overloads take fixed-point color, Vector4f overloads floating-point/integer color)
+     void alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x);
+     void logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x);
+     void writeColor(int index, Pointer<Byte> &cBuffer, Int &i, Vector4s &current, Int &sMask, Int &zMask, Int &cMask);
+     void alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x);   // Blends the pixel read from cBuffer into oC
+     void writeColor(int index, Pointer<Byte> &cBuffer, Int &i, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask);   // Masked store of oC; rearranges oC in place
+
+     bool isSRGB(int index) const;
+     UShort4 convertFixed16(Float4 &cf, bool saturate = true);   // cf * 0xFFFF converted to UShort4
+     void linearToSRGB12_16(Vector4s &c);   // In-place table lookup on c.x, c.y, c.z
+
+ private:
+     Float4 interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective);
+     void stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask);
+     void stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW);
+     void stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask);
+     void stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW);
+     Bool depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask);
+
+     // Raster operations
+     void blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive);
+     void blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive);
+     void readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel);
+     void blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive);
+     void blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive);
+     void writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask);
+     void writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask);
+
+     void sRGBtoLinear16_12_16(Vector4s &c);   // In place: 16-bit lanes -> 12-bit index -> table lookup
+     void linearToSRGB16_12_16(Vector4s &c);   // In place: shifts to 12 bits, then linearToSRGB12_16()
+     Float4 sRGBtoLinear(const Float4 &x);     // Returns the approximation of x^2.2 clamped to [0, 1]; x is not modified
+
+     bool colorUsed();
+ };
+}
+
+#endif // sw_PixelRoutine_hpp
diff --git a/src/Pipeline/PixelShader.cpp b/src/Pipeline/PixelShader.cpp
new file mode 100644
index 0000000..d24e7c2
--- /dev/null
+++ b/src/Pipeline/PixelShader.cpp
@@ -0,0 +1,746 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "PixelShader.hpp"
+
+#include "Common/Debug.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+ PixelShader::PixelShader(const PixelShader *ps) : Shader()
+ {
+     shaderModel = 0x0300;   // Defaults; they are all that is set when ps is null.
+     vPosDeclared = false;
+     vFaceDeclared = false;
+     centroid = false;
+     if(!ps)
+     {
+         return;   // No source shader: nothing is copied and no analysis is run.
+     }
+
+     for(size_t index = 0; index < ps->getLength(); ++index)   // Clone the instructions in order
+     {
+         append(new sw::Shader::Instruction(*ps->getInstruction(index)));
+     }
+
+     memcpy(input, ps->input, sizeof(input));   // Raw copy of the whole input semantic table
+     vPosDeclared = ps->vPosDeclared;
+     vFaceDeclared = ps->vFaceDeclared;
+     usedSamplers = ps->usedSamplers;
+     optimize();
+     analyze();
+ }
+
+ PixelShader::PixelShader(const unsigned long *token) : Shader() // Builds the shader from a token stream.
+ {
+ parse(token); // Runs first; the three flags below are reset afterwards, overwriting any value parse() may have given them.
+
+ vPosDeclared = false;
+ vFaceDeclared = false;
+ centroid = false;
+
+ optimize();
+ analyze(); // Runs the analysis passes listed in analyze() (zOverride, kill, interpolants, ...).
+ }
+
+ PixelShader::~PixelShader() // Empty body: this class performs no cleanup of its own.
+ {
+ }
+
+ int PixelShader::validate(const unsigned long *const token)
+ {
+     if(token == nullptr)
+     {
+         return 0;
+     }
+
+     // token[0] is the version token: bits 0-15 version (major number in bits 8-15), bits 16-31 shader type.
+     const unsigned long header = token[0];
+     unsigned short version = static_cast<unsigned short>(header & 0x0000FFFF);
+     const unsigned char majorVersion = static_cast<unsigned char>((header & 0x0000FF00) >> 8);
+     const ShaderType shaderType = static_cast<ShaderType>((header & 0xFFFF0000) >> 16);
+
+     const bool supported = (shaderType == SHADER_PIXEL) && (majorVersion <= 3);
+     if(!supported)
+     {
+         return 0;
+     }
+
+     int instructionCount = 1;
+     int position = 0;   // The scan starts at index 0, i.e. at the version token itself.
+
+     while(token[position] != 0x0000FFFF)   // 0x0000FFFF terminates the stream
+     {
+         const unsigned long word = token[position];
+         if((word & 0x0000FFFF) == 0x0000FFFE)   // Comment token
+         {
+             const int length = (word & 0x7FFF0000) >> 16;   // Comment length sits in bits 16-30
+             position += length;
+         }
+         else
+         {
+             const Shader::Opcode opcode = static_cast<Shader::Opcode>(word & 0x0000FFFF);
+             if(opcode == Shader::OPCODE_RESERVED0 || opcode == Shader::OPCODE_MOVA)
+             {
+                 return 0;   // Unsupported operation
+             }
+
+             instructionCount++;
+             position += size(word, version);   // Skip this instruction's operand tokens
+         }
+
+         position++;
+     }
+
+     return instructionCount;   // Non-zero means the stream was accepted
+ }
+
+ bool PixelShader::depthOverride() const // Returns the flag computed by analyzeZOverride().
+ {
+ return zOverride; // True when an instruction is TEXM3X2DEPTH/TEXDEPTH or has a PARAMETER_DEPTHOUT destination.
+ }
+
+ bool PixelShader::containsKill() const // Returns the flag computed by analyzeKill().
+ {
+ return kill; // True when the shader contains a TEXKILL or DISCARD instruction.
+ }
+
+ bool PixelShader::containsCentroid() const // Returns the 'centroid' member, which both constructors initialise to false.
+ {
+ return centroid; // NOTE(review): presumably set during interpolant analysis when a centroid-qualified input is declared - confirm.
+ }
+
+ bool PixelShader::usesDiffuse(int component) const // Whether the given component of input register 0 is active.
+ {
+ return input[0][component].active(); // For shader models below 3.0, analyzeInterpolants() maps input[0] to USAGE_COLOR index 0.
+ }
+
+ bool PixelShader::usesSpecular(int component) const // Whether the given component of input register 1 is active.
+ {
+ return input[1][component].active(); // For shader models below 3.0, analyzeInterpolants() maps input[1] to USAGE_COLOR index 1.
+ }
+
+ bool PixelShader::usesTexture(int coordinate, int component) const // Whether the given component of texture coordinate set 'coordinate' is active.
+ {
+ return input[2 + coordinate][component].active(); // Texture coordinates follow the two color registers: set i lives in input[2 + i]. No range check is done here.
+ }
+
+ void PixelShader::setInput(int inputIdx, int nbComponents, const sw::Shader::Semantic& semantic)
+ {
+     // Tags components 0 .. nbComponents - 1 of input register inputIdx with the same semantic.
+     int component = 0;
+
+     while(component < nbComponents) { input[inputIdx][component++] = semantic; }
+ }
+
+ const sw::Shader::Semantic& PixelShader::getInput(int inputIdx, int component) const // Read-only access to one entry of the input semantic table.
+ {
+ return input[inputIdx][component]; // Returned by reference into the member array; no bounds checking is done here.
+ }
+
+ void PixelShader::analyze() // Runs every analysis pass once, in the fixed order below; called at the end of both constructors' build paths.
+ {
+ analyzeZOverride(); // Sets zOverride
+ analyzeKill(); // Sets kill
+ analyzeInterpolants(); // Fills in / prunes the input semantic table
+ analyzeDirtyConstants();
+ analyzeDynamicBranching();
+ analyzeSamplers();
+ analyzeCallSites();
+ analyzeIndirectAddressing();
+ }
+
+ void PixelShader::analyzeZOverride()
+ {
+     // zOverride ends up true iff some instruction is a depth opcode or targets the depth output.
+     zOverride = false;
+
+     for(const auto &inst : instruction)
+     {
+         const bool depthOpcode = inst->opcode == Shader::OPCODE_TEXM3X2DEPTH || inst->opcode == Shader::OPCODE_TEXDEPTH;
+         const bool depthDestination = inst->dst.type == Shader::PARAMETER_DEPTHOUT;
+         if(depthOpcode || depthDestination)
+         {
+             zOverride = true;
+             return;   // One match is enough
+         }
+     }
+ }
+
+ void PixelShader::analyzeKill()
+ {
+     // kill ends up true iff the shader contains at least one TEXKILL or DISCARD instruction.
+     kill = false;
+
+     for(const auto &inst : instruction)
+     {
+         const auto opcode = inst->opcode;
+         if(opcode == Shader::OPCODE_TEXKILL || opcode == Shader::OPCODE_DISCARD)
+         {
+             kill = true;
+             return;   // One match is enough
+         }
+     }
+ }
+
+ void PixelShader::analyzeInterpolants()
+ {
+ if(shaderModel < 0x0300)
+ {
+ // Set default mapping; disable unused interpolants below
+ input[0][0] = Semantic(Shader::USAGE_COLOR, 0);
+ input[0][1] = Semantic(Shader::USAGE_COLOR, 0);
+ input[0][2] = Semantic(Shader::USAGE_COLOR, 0);
+ input[0][3] = Semantic(Shader::USAGE_COLOR, 0);
+
+ input[1][0] = Semantic(Shader::USAGE_COLOR, 1);
+ input[1][1] = Semantic(Shader::USAGE_COLOR, 1);
+ input[1][2] = Semantic(Shader::USAGE_COLOR, 1);
+ input[1][3] = Semantic(Shader::USAGE_COLOR, 1);
+
+ for(int i = 0; i < 8; i++)
+ {
+ input[2 + i][0] = Semantic(Shader::USAGE_TEXCOORD, i);
+ input[2 + i][1] = Semantic(Shader::USAGE_TEXCOORD, i);
+ input[2 + i][2] = Semantic(Shader::USAGE_TEXCOORD, i);
+ input[2 + i][3] = Semantic(Shader::USAGE_TEXCOORD, i);
+ }
+
+ Shader::SamplerType samplerType[16];
+
+ for(int i = 0; i < 16; i++)
+ {
+ samplerType[i] = Shader::SAMPLER_UNKNOWN;
+ }
+
+ for(const auto &inst : instruction)
+ {
+ if(inst->dst.type == Shader::PARAMETER_SAMPLER)
+ {
+ int sampler = inst->dst.index;
+
+ samplerType[sampler] = inst->samplerType;
+ }
+ }
+
+ bool interpolant[MAX_FRAGMENT_INPUTS][4] = {{false}}; // Interpolants in use
+
+ for(const auto &inst : instruction)
+ {
+ if(inst->dst.type == Shader::PARAMETER_TEXTURE)
+ {
+ int index = inst->dst.index + 2;
+
+ switch(inst->opcode)
+ {
+ case Shader::OPCODE_TEX:
+ case Shader::OPCODE_TEXBEM:
+ case Shader::OPCODE_TEXBEML:
+ case Shader::OPCODE_TEXCOORD:
+ case Shader::OPCODE_TEXDP3:
+ case Shader::OPCODE_TEXDP3TEX:
+ case Shader::OPCODE_TEXM3X2DEPTH:
+ case Shader::OPCODE_TEXM3X2PAD:
+ case Shader::OPCODE_TEXM3X2TEX:
+ case Shader::OPCODE_TEXM3X3:
+ case Shader::OPCODE_TEXM3X3PAD:
+ case Shader::OPCODE_TEXM3X3TEX:
+ interpolant[index][0] = true;
+ interpolant[index][1] = true;
+ interpolant[index][2] = true;
+ break;
+ case Shader::OPCODE_TEXKILL:
+ if(majorVersion < 2)
+ {
+ interpolant[index][0] = true;
+ interpolant[index][1] = true;
+ interpolant[index][2] = true;
+ }
+ else
+ {
+ interpolant[index][0] = true;
+ interpolant[index][1] = true;
+ interpolant[index][2] = true;
+ interpolant[index][3] = true;
+ }
+ break;
+ case Shader::OPCODE_TEXM3X3VSPEC:
+ interpolant[index][0] = true;
+ interpolant[index][1] = true;
+ interpolant[index][2] = true;
+ interpolant[index - 2][3] = true;
+ interpolant[index - 1][3] = true;
+ interpolant[index - 0][3] = true;
+ break;
+ case Shader::OPCODE_DCL:
+ break; // Ignore
+ default: // Arithmetic instruction
+ if(shaderModel >= 0x0104)
+ {
+ ASSERT(false);
+ }
+ }
+ }
+
+ for(int argument = 0; argument < 4; argument++)
+ {
+ if(inst->src[argument].type == Shader::PARAMETER_INPUT ||
+ inst->src[argument].type == Shader::PARAMETER_TEXTURE)
+ {
+ int index = inst->src[argument].index;
+ int swizzle = inst->src[argument].swizzle;
+ int mask = inst->dst.mask;
+
+ if(inst->src[argument].type == Shader::PARAMETER_TEXTURE)
+ {
+ index += 2;
+ }
+
+ switch(inst->opcode)
+ {
+ case Shader::OPCODE_TEX:
+ case Shader::OPCODE_TEXLDD:
+ case Shader::OPCODE_TEXLDL:
+ case Shader::OPCODE_TEXLOD:
+ case Shader::OPCODE_TEXBIAS:
+ case Shader::OPCODE_TEXOFFSET:
+ case Shader::OPCODE_TEXOFFSETBIAS:
+ case Shader::OPCODE_TEXLODOFFSET:
+ case Shader::OPCODE_TEXELFETCH:
+ case Shader::OPCODE_TEXELFETCHOFFSET:
+ case Shader::OPCODE_TEXGRAD:
+ case Shader::OPCODE_TEXGRADOFFSET:
+ {
+ int sampler = inst->src[1].index;
+
+ switch(samplerType[sampler])
+ {
+ case Shader::SAMPLER_UNKNOWN:
+ if(shaderModel == 0x0104)
+ {
+ if((inst->src[0].swizzle & 0x30) == 0x20) // .xyz
+ {
+ interpolant[index][0] = true;
+ interpolant[index][1] = true;
+ interpolant[index][2] = true;
+ }
+ else // .xyw
+ {
+ interpolant[index][0] = true;
+ interpolant[index][1] = true;
+ interpolant[index][3] = true;
+ }
+ }
+ else
+ {
+ ASSERT(false);
+ }
+ break;
+ case Shader::SAMPLER_1D:
+ interpolant[index][0] = true;
+ break;
+ case Shader::SAMPLER_2D:
+ interpolant[index][0] = true;
+ interpolant[index][1] = true;
+ break;
+ case Shader::SAMPLER_CUBE:
+ interpolant[index][0] = true;
+ interpolant[index][1] = true;
+ interpolant[index][2] = true;
+ break;
+ case Shader::SAMPLER_VOLUME:
+ interpolant[index][0] = true;
+ interpolant[index][1] = true;
+ interpolant[index][2] = true;
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ if(inst->bias)
+ {
+ interpolant[index][3] = true;
+ }
+
+ if(inst->project)
+ {
+ interpolant[index][3] = true;
+ }
+
+ if(shaderModel == 0x0104 && inst->opcode == Shader::OPCODE_TEX)
+ {
+ if(inst->src[0].modifier == Shader::MODIFIER_DZ)
+ {
+ interpolant[index][2] = true;
+ }
+
+ if(inst->src[0].modifier == Shader::MODIFIER_DW)
+ {
+ interpolant[index][3] = true;
+ }
+ }
+ }
+ break;
+ case Shader::OPCODE_M3X2:
+ if(mask & 0x1)
+ {
+ interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+ interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+ interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+ interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+ }
+
+ if(argument == 1)
+ {
+ if(mask & 0x2)
+ {
+ interpolant[index + 1][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+ interpolant[index + 1][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+ interpolant[index + 1][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+ interpolant[index + 1][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+ }
+ }
+ break;
+ case Shader::OPCODE_M3X3:
+ if(mask & 0x1)
+ {
+ interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+ interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+ interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+ interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+ }
+
+ if(argument == 1)
+ {
+ if(mask & 0x2)
+ {
+ interpolant[index + 1][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+ interpolant[index + 1][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+ interpolant[index + 1][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+ interpolant[index + 1][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+ }
+
+ if(mask & 0x4)
+ {
+ interpolant[index + 2][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+ interpolant[index + 2][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+ interpolant[index + 2][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+ interpolant[index + 2][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+ }
+ }
+ break;
+ case Shader::OPCODE_M3X4:
+ if(mask & 0x1)
+ {
+ interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+ interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+ interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+ interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+ }
+
+ if(argument == 1)
+ {
+ if(mask & 0x2)
+ {
+ interpolant[index + 1][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+ interpolant[index + 1][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+ interpolant[index + 1][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+ interpolant[index + 1][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+ }
+
+ if(mask & 0x4)
+ {
+ interpolant[index + 2][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+ interpolant[index + 2][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+ interpolant[index + 2][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+ interpolant[index + 2][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+ }
+
+ if(mask & 0x8)
+ {
+ interpolant[index + 3][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+ interpolant[index + 3][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+ interpolant[index + 3][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+ interpolant[index + 3][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+ }
+ }
+ break;
+ case Shader::OPCODE_M4X3:
+ if(mask & 0x1)
+ {
+ interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
+ interpolant[index][1] |= swizzleContainsComponent(swizzle, 1);
+ interpolant[index][2] |= swizzleContainsComponent(swizzle, 2);
+ interpolant[index][3] |= swizzleContainsComponent(swizzle, 3);
+ }
+
+ if(argument == 1)
+ {
+ if(mask & 0x2)
+ {
+ interpolant[index + 1][0] |= swizzleContainsComponent(swizzle, 0);
+ interpolant[index + 1][1] |= swizzleContainsComponent(swizzle, 1);
+ interpolant[index + 1][2] |= swizzleContainsComponent(swizzle, 2);
+ interpolant[index + 1][3] |= swizzleContainsComponent(swizzle, 3);
+ }
+
+ if(mask & 0x4)
+ {
+ interpolant[index + 2][0] |= swizzleContainsComponent(swizzle, 0);
+ interpolant[index + 2][1] |= swizzleContainsComponent(swizzle, 1);
+ interpolant[index + 2][2] |= swizzleContainsComponent(swizzle, 2);
+ interpolant[index + 2][3] |= swizzleContainsComponent(swizzle, 3);
+ }
+ }
+ break;
+ case Shader::OPCODE_M4X4:
+ if(mask & 0x1)
+ {
+ interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
+ interpolant[index][1] |= swizzleContainsComponent(swizzle, 1);
+ interpolant[index][2] |= swizzleContainsComponent(swizzle, 2);
+ interpolant[index][3] |= swizzleContainsComponent(swizzle, 3);
+ }
+
+ if(argument == 1)
+ {
+ if(mask & 0x2)
+ {
+ interpolant[index + 1][0] |= swizzleContainsComponent(swizzle, 0);
+ interpolant[index + 1][1] |= swizzleContainsComponent(swizzle, 1);
+ interpolant[index + 1][2] |= swizzleContainsComponent(swizzle, 2);
+ interpolant[index + 1][3] |= swizzleContainsComponent(swizzle, 3);
+ }
+
+ if(mask & 0x4)
+ {
+ interpolant[index + 2][0] |= swizzleContainsComponent(swizzle, 0);
+ interpolant[index + 2][1] |= swizzleContainsComponent(swizzle, 1);
+ interpolant[index + 2][2] |= swizzleContainsComponent(swizzle, 2);
+ interpolant[index + 2][3] |= swizzleContainsComponent(swizzle, 3);
+ }
+
+ if(mask & 0x8)
+ {
+ interpolant[index + 3][0] |= swizzleContainsComponent(swizzle, 0);
+ interpolant[index + 3][1] |= swizzleContainsComponent(swizzle, 1);
+ interpolant[index + 3][2] |= swizzleContainsComponent(swizzle, 2);
+ interpolant[index + 3][3] |= swizzleContainsComponent(swizzle, 3);
+ }
+ }
+ break;
+ case Shader::OPCODE_CRS:
+ if(mask & 0x1)
+ {
+ interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x6);
+ interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x6);
+ interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x6);
+ interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x6);
+ }
+
+ if(mask & 0x2)
+ {
+ interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x5);
+ interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x5);
+ interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x5);
+ interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x5);
+ }
+
+ if(mask & 0x4)
+ {
+ interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x3);
+ interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x3);
+ interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x3);
+ interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x3);
+ }
+ break;
+ case Shader::OPCODE_DP2ADD:
+ if(argument == 0 || argument == 1)
+ {
+ interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x3);
+ interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x3);
+ interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x3);
+ interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x3);
+ }
+ else // argument == 2
+ {
+ interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
+ interpolant[index][1] |= swizzleContainsComponent(swizzle, 1);
+ interpolant[index][2] |= swizzleContainsComponent(swizzle, 2);
+ interpolant[index][3] |= swizzleContainsComponent(swizzle, 3);
+ }
+ break;
+ case Shader::OPCODE_DP3:
+ interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+ interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+ interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+ interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+ break;
+ case Shader::OPCODE_DP4:
+ interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
+ interpolant[index][1] |= swizzleContainsComponent(swizzle, 1);
+ interpolant[index][2] |= swizzleContainsComponent(swizzle, 2);
+ interpolant[index][3] |= swizzleContainsComponent(swizzle, 3);
+ break;
+ case Shader::OPCODE_SINCOS:
+ case Shader::OPCODE_EXP2X:
+ case Shader::OPCODE_LOG2X:
+ case Shader::OPCODE_POWX:
+ case Shader::OPCODE_RCPX:
+ case Shader::OPCODE_RSQX:
+ interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
+ interpolant[index][1] |= swizzleContainsComponent(swizzle, 1);
+ interpolant[index][2] |= swizzleContainsComponent(swizzle, 2);
+ interpolant[index][3] |= swizzleContainsComponent(swizzle, 3);
+ break;
+ case Shader::OPCODE_NRM3:
+ interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7 | mask);
+ interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7 | mask);
+ interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7 | mask);
+ interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7 | mask);
+ break;
+ case Shader::OPCODE_MOV:
+ case Shader::OPCODE_ADD:
+ case Shader::OPCODE_SUB:
+ case Shader::OPCODE_MUL:
+ case Shader::OPCODE_MAD:
+ case Shader::OPCODE_ABS:
+ case Shader::OPCODE_CMP0:
+ case Shader::OPCODE_CND:
+ case Shader::OPCODE_FRC:
+ case Shader::OPCODE_LRP:
+ case Shader::OPCODE_MAX:
+ case Shader::OPCODE_MIN:
+ case Shader::OPCODE_CMP:
+ case Shader::OPCODE_BREAKC:
+ case Shader::OPCODE_DFDX:
+ case Shader::OPCODE_DFDY:
+ interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, mask);
+ interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, mask);
+ interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, mask);
+ interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, mask);
+ break;
+ case Shader::OPCODE_TEXCOORD:
+ interpolant[index][0] = true;
+ interpolant[index][1] = true;
+ interpolant[index][2] = true;
+ interpolant[index][3] = true;
+ break;
+ case Shader::OPCODE_TEXDP3:
+ case Shader::OPCODE_TEXDP3TEX:
+ case Shader::OPCODE_TEXM3X2PAD:
+ case Shader::OPCODE_TEXM3X3PAD:
+ case Shader::OPCODE_TEXM3X2TEX:
+ case Shader::OPCODE_TEXM3X3SPEC:
+ case Shader::OPCODE_TEXM3X3VSPEC:
+ case Shader::OPCODE_TEXBEM:
+ case Shader::OPCODE_TEXBEML:
+ case Shader::OPCODE_TEXM3X2DEPTH:
+ case Shader::OPCODE_TEXM3X3:
+ case Shader::OPCODE_TEXM3X3TEX:
+ interpolant[index][0] = true;
+ interpolant[index][1] = true;
+ interpolant[index][2] = true;
+ break;
+ case Shader::OPCODE_TEXREG2AR:
+ case Shader::OPCODE_TEXREG2GB:
+ case Shader::OPCODE_TEXREG2RGB:
+ break;
+ default:
+ // ASSERT(false); // Refine component usage
+ interpolant[index][0] = true;
+ interpolant[index][1] = true;
+ interpolant[index][2] = true;
+ interpolant[index][3] = true;
+ }
+ }
+ }
+ }
+
+ for(int index = 0; index < MAX_FRAGMENT_INPUTS; index++)
+ {
+ for(int component = 0; component < 4; component++)
+ {
+ if(!interpolant[index][component])
+ {
+ input[index][component] = Semantic();
+ }
+ }
+ }
+ }
+ else // Shader Model 3.0 input declaration; v# indexable
+ {
+ for(const auto &inst : instruction)
+ {
+ if(inst->opcode == Shader::OPCODE_DCL)
+ {
+ if(inst->dst.type == Shader::PARAMETER_INPUT)
+ {
+ unsigned char usage = inst->usage;
+ unsigned char index = inst->usageIndex;
+ unsigned char mask = inst->dst.mask;
+ unsigned char reg = inst->dst.index;
+
+ if(mask & 0x01) input[reg][0] = Semantic(usage, index);
+ if(mask & 0x02) input[reg][1] = Semantic(usage, index);
+ if(mask & 0x04) input[reg][2] = Semantic(usage, index);
+ if(mask & 0x08) input[reg][3] = Semantic(usage, index);
+ }
+ else if(inst->dst.type == Shader::PARAMETER_MISCTYPE)
+ {
+ unsigned char index = inst->dst.index;
+
+ if(index == Shader::VPosIndex)
+ {
+ vPosDeclared = true;
+ }
+ else if(index == Shader::VFaceIndex)
+ {
+ vFaceDeclared = true;
+ }
+ else ASSERT(false);
+ }
+ }
+ }
+ }
+
+ if(shaderModel >= 0x0200)
+ {
+ for(const auto &inst : instruction)
+ {
+ if(inst->opcode == Shader::OPCODE_DCL)
+ {
+ bool centroid = inst->dst.centroid;
+ unsigned char reg = inst->dst.index;
+
+ switch(inst->dst.type)
+ {
+ case Shader::PARAMETER_INPUT:
+ input[reg][0].centroid = centroid;
+ break;
+ case Shader::PARAMETER_TEXTURE:
+ input[2 + reg][0].centroid = centroid;
+ break;
+ default:
+ break;
+ }
+
+ this->centroid = this->centroid || centroid;
+ }
+ }
+ }
+ }
+}
diff --git a/src/Pipeline/PixelShader.hpp b/src/Pipeline/PixelShader.hpp
new file mode 100644
index 0000000..a06aaaa
--- /dev/null
+++ b/src/Pipeline/PixelShader.hpp
@@ -0,0 +1,63 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_PixelShader_hpp
+#define sw_PixelShader_hpp
+
+#include "Shader.hpp"
+#include "Main/Config.hpp"
+
+namespace sw
+{
+ class PixelShader : public Shader // Shader specialization that also tracks per-input semantics and pixel-stage flags
+ {
+ public:
+ explicit PixelShader(const PixelShader *ps = 0); // ps is optional (defaults to null)
+ explicit PixelShader(const unsigned long *token); // NOTE(review): token presumably points at shader bytecode; confirm in the .cpp
+
+ virtual ~PixelShader();
+
+ static int validate(const unsigned long *const token); // Returns number of instructions if valid
+ bool depthOverride() const; // NOTE(review): presumably reports zOverride; definition not visible here
+ bool containsKill() const; // NOTE(review): presumably reports kill; definition not visible here
+ bool containsCentroid() const; // NOTE(review): presumably reports centroid; definition not visible here
+ bool usesDiffuse(int component) const;
+ bool usesSpecular(int component) const;
+ bool usesTexture(int coordinate, int component) const;
+
+ void setInput(int inputIdx, int nbComponents, const Semantic& semantic);
+ const Semantic& getInput(int inputIdx, int component) const;
+
+ void declareVPos() { vPosDeclared = true; } // Explicitly marks vPos as declared
+ void declareVFace() { vFaceDeclared = true; } // Explicitly marks vFace as declared
+ bool isVPosDeclared() const { return vPosDeclared; }
+ bool isVFaceDeclared() const { return vFaceDeclared; }
+
+ private:
+ void analyze();
+ void analyzeZOverride();
+ void analyzeKill();
+ void analyzeInterpolants();
+
+ Semantic input[MAX_FRAGMENT_INPUTS][4]; // One Semantic per input register and per component (4 each)
+
+ bool vPosDeclared; // Set by declareVPos()
+ bool vFaceDeclared; // Set by declareVFace()
+ bool zOverride;
+ bool kill;
+ bool centroid;
+ };
+}
+
+#endif // sw_PixelShader_hpp
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
new file mode 100644
index 0000000..8a2aa39
--- /dev/null
+++ b/src/Pipeline/SamplerCore.cpp
@@ -0,0 +1,3035 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "SamplerCore.hpp"
+
+#include "Constants.hpp"
+#include "Common/Debug.hpp"
+
+namespace
+{
+ void applySwizzle(sw::SwizzleType swizzle, sw::Short4& s, const sw::Vector4s& c)
+ { // Writes into s the channel of c selected by swizzle, or a constant for ZERO/ONE
+ switch(swizzle)
+ {
+ case sw::SWIZZLE_RED: s = c.x; break;
+ case sw::SWIZZLE_GREEN: s = c.y; break;
+ case sw::SWIZZLE_BLUE: s = c.z; break;
+ case sw::SWIZZLE_ALPHA: s = c.w; break;
+ case sw::SWIZZLE_ZERO: s = sw::Short4(0x0000); break;
+ case sw::SWIZZLE_ONE: s = sw::Short4(0x1000); break; // 0x1000: same constant sampleTexture uses for alpha when fixed12 is set
+ default: ASSERT(false); // Unknown swizzle: s is left unmodified
+ }
+ }
+
+ void applySwizzle(sw::SwizzleType swizzle, sw::Float4& f, const sw::Vector4f& c)
+ { // Floating-point counterpart: writes into f the channel of c selected by swizzle, or 0.0 / 1.0
+ switch(swizzle)
+ {
+ case sw::SWIZZLE_RED: f = c.x; break;
+ case sw::SWIZZLE_GREEN: f = c.y; break;
+ case sw::SWIZZLE_BLUE: f = c.z; break;
+ case sw::SWIZZLE_ALPHA: f = c.w; break;
+ case sw::SWIZZLE_ZERO: f = sw::Float4(0.0f, 0.0f, 0.0f, 0.0f); break;
+ case sw::SWIZZLE_ONE: f = sw::Float4(1.0f, 1.0f, 1.0f, 1.0f); break;
+ default: ASSERT(false); // Unknown swizzle: f is left unmodified
+ }
+ }
+}
+
+namespace sw
+{
+ extern bool colorsDefaultToZero;
+
+ SamplerCore::SamplerCore(Pointer<Byte> &constants, const Sampler::State &state) : constants(constants), state(state)
+ { // Nothing to do beyond initializing the constants and state members
+ }
+
+ Vector4s SamplerCore::sampleTexture(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &bias, Vector4f &dsx, Vector4f &dsy)
+ { // Convenience overload: forwards to the full overload with function = Implicit and fixed12 = true
+ return sampleTexture(texture, u, v, w, q, q, dsx, dsy, (dsx), Implicit, true); // NOTE(review): q is passed as the bias argument and dsx as the offset argument; the bias parameter is unused here - confirm this is intended
+ }
+
+ Vector4s SamplerCore::sampleTexture(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function, bool fixed12)
+ { // Short4 sampling path: computes the LOD, filters, and (for fixed12 callers only) rescales, fills missing channels and swizzles
+ Vector4s c;
+
+ #if PERF_PROFILE
+ AddAtomic(Pointer<Long>(&profiler.texOperations), 4);
+
+ if(state.compressedFormat)
+ {
+ AddAtomic(Pointer<Long>(&profiler.compressedTex), 4);
+ }
+ #endif
+
+ if(state.textureType == TEXTURE_NULL) // No texture: RGB = 0 and a constant alpha
+ {
+ c.x = Short4(0x0000);
+ c.y = Short4(0x0000);
+ c.z = Short4(0x0000);
+
+ if(fixed12) // FIXME: Convert to fixed12 at higher level, when required
+ {
+ c.w = Short4(0x1000);
+ }
+ else
+ {
+ c.w = Short4(0xFFFFu); // FIXME
+ }
+ }
+ else
+ {
+ Float4 uuuu = u; // Local copies of the coordinates; these are what the filter calls below receive
+ Float4 vvvv = v;
+ Float4 wwww = w;
+ Float4 qqqq = q;
+
+ Int face[4];
+ Float lod;
+ Float anisotropy;
+ Float4 uDelta;
+ Float4 vDelta;
+
+ if(state.textureType != TEXTURE_3D) // Pick the LOD computation matching the texture type
+ {
+ if(state.textureType != TEXTURE_CUBE)
+ {
+ computeLod(texture, lod, anisotropy, uDelta, vDelta, uuuu, vvvv, bias.x, dsx, dsy, function);
+ }
+ else
+ {
+ Float4 M;
+ cubeFace(face, uuuu, vvvv, u, v, w, M); // NOTE(review): presumably fills face/uuuu/vvvv/M from u, v, w; helper not visible here
+ computeLodCube(texture, lod, u, v, w, bias.x, dsx, dsy, M, function);
+ }
+ }
+ else
+ {
+ computeLod3D(texture, lod, uuuu, vvvv, wwww, bias.x, dsx, dsy, function);
+ }
+
+ if(!hasFloatTexture())
+ {
+ c = sampleFilter(texture, uuuu, vvvv, wwww, offset, lod, anisotropy, uDelta, vDelta, face, function);
+ }
+ else
+ {
+ Vector4f cf = sampleFloatFilter(texture, uuuu, vvvv, wwww, qqqq, offset, lod, anisotropy, uDelta, vDelta, face, function);
+
+ convertFixed12(c, cf); // Float texture: filtered in float, then converted into the Short4 result
+ }
+
+ if(fixed12) // Everything up to the end of this block (rescale, default channels, swizzle) runs only for fixed12 callers
+ {
+ if(!hasFloatTexture())
+ {
+ if(state.textureFormat == FORMAT_R5G6B5)
+ {
+ c.x = MulHigh(As<UShort4>(c.x), UShort4(0x10000000 / 0xF800)); // Assuming MulHigh keeps the upper 16 bits of the product, 0xF800 maps to 0x1000
+ c.y = MulHigh(As<UShort4>(c.y), UShort4(0x10000000 / 0xFC00)); // Likewise 0xFC00 maps to 0x1000 for the 6-bit channel
+ c.z = MulHigh(As<UShort4>(c.z), UShort4(0x10000000 / 0xF800));
+ }
+ else
+ {
+ for(int component = 0; component < textureComponentCount(); component++)
+ {
+ if(hasUnsignedTextureComponent(component))
+ {
+ c[component] = As<UShort4>(c[component]) >> 4; // Unsigned 16-bit range: 0xFFFF >> 4 = 0x0FFF
+ }
+ else
+ {
+ c[component] = c[component] >> 3; // Signed 15-bit range: 0x7FFF >> 3 = 0x0FFF
+ }
+ }
+ }
+ }
+
+ if(state.textureFilter != FILTER_GATHER) // Gather results are left as sampled
+ {
+ int componentCount = textureComponentCount();
+ short defaultColorValue = colorsDefaultToZero ? 0x0000 : 0x1000; // Value for missing green/blue channels; missing alpha is always 0x1000
+
+ switch(state.textureFormat)
+ {
+ case FORMAT_R8_SNORM:
+ case FORMAT_G8R8_SNORM:
+ case FORMAT_X8B8G8R8_SNORM:
+ case FORMAT_A8B8G8R8_SNORM:
+ case FORMAT_R8:
+ case FORMAT_R5G6B5:
+ case FORMAT_G8R8:
+ case FORMAT_R8I:
+ case FORMAT_R8UI:
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8UI:
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X8B8G8R8UI:
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8UI:
+ case FORMAT_R16I:
+ case FORMAT_R16UI:
+ case FORMAT_G16R16:
+ case FORMAT_G16R16I:
+ case FORMAT_G16R16UI:
+ case FORMAT_X16B16G16R16I:
+ case FORMAT_X16B16G16R16UI:
+ case FORMAT_A16B16G16R16:
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_R32I:
+ case FORMAT_R32UI:
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ case FORMAT_X32B32G32R32I:
+ case FORMAT_X32B32G32R32UI:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ case FORMAT_X8R8G8B8:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ case FORMAT_V8U8:
+ case FORMAT_Q8W8V8U8:
+ case FORMAT_X8L8V8U8:
+ case FORMAT_V16U16:
+ case FORMAT_A16W16V16U16:
+ case FORMAT_Q16W16V16U16:
+ case FORMAT_YV12_BT601:
+ case FORMAT_YV12_BT709:
+ case FORMAT_YV12_JFIF:
+ if(componentCount < 2) c.y = Short4(defaultColorValue); // Fill only the channels the format does not provide
+ if(componentCount < 3) c.z = Short4(defaultColorValue);
+ if(componentCount < 4) c.w = Short4(0x1000);
+ break;
+ case FORMAT_A8: // Alpha-only: sampled value moves to w, RGB = 0
+ c.w = c.x;
+ c.x = Short4(0x0000);
+ c.y = Short4(0x0000);
+ c.z = Short4(0x0000);
+ break;
+ case FORMAT_L8:
+ case FORMAT_L16: // Luminance: x replicated to y and z, constant alpha
+ c.y = c.x;
+ c.z = c.x;
+ c.w = Short4(0x1000);
+ break;
+ case FORMAT_A8L8: // Sampled x replicated to y/z, sampled y becomes alpha
+ c.w = c.y;
+ c.y = c.x;
+ c.z = c.x;
+ break;
+ case FORMAT_R32F:
+ c.y = Short4(defaultColorValue); // Fallthrough
+ case FORMAT_G32R32F:
+ c.z = Short4(defaultColorValue); // Fallthrough
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ c.w = Short4(0x1000); // Fallthrough
+ case FORMAT_A32B32G32R32F:
+ break;
+ case FORMAT_D32F_LOCKABLE:
+ case FORMAT_D32FS8_TEXTURE:
+ case FORMAT_D32F_SHADOW:
+ case FORMAT_D32FS8_SHADOW: // Depth formats: x replicated to all four channels (the Vector4f overload uses 0, 0, 1 instead)
+ c.y = c.x;
+ c.z = c.x;
+ c.w = c.x;
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ if((state.swizzleR != SWIZZLE_RED) ||
+ (state.swizzleG != SWIZZLE_GREEN) ||
+ (state.swizzleB != SWIZZLE_BLUE) ||
+ (state.swizzleA != SWIZZLE_ALPHA)) // Skipped entirely for the identity swizzle
+ {
+ const Vector4s col(c); // Snapshot, so every output channel reads the unswizzled values
+ applySwizzle(state.swizzleR, c.x, col);
+ applySwizzle(state.swizzleG, c.y, col);
+ applySwizzle(state.swizzleB, c.z, col);
+ applySwizzle(state.swizzleA, c.w, col);
+ }
+ }
+ }
+
+ return c;
+ }
+
+ Vector4f SamplerCore::sampleTexture(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function)
+ { // Float4 sampling entry point: filters in float or via the Short4 overload, then fills missing channels and swizzles
+ Vector4f c;
+
+ #if PERF_PROFILE
+ AddAtomic(Pointer<Long>(&profiler.texOperations), 4);
+
+ if(state.compressedFormat)
+ {
+ AddAtomic(Pointer<Long>(&profiler.compressedTex), 4);
+ }
+ #endif
+
+ if(state.textureType == TEXTURE_NULL) // No texture: returns (0, 0, 0, 1)
+ {
+ c.x = Float4(0.0f);
+ c.y = Float4(0.0f);
+ c.z = Float4(0.0f);
+ c.w = Float4(1.0f);
+ }
+ else
+ {
+ // FIXME: YUV is not supported by the floating point path
+ bool forceFloatFiltering = state.highPrecisionFiltering && !hasYuvFormat() && (state.textureFilter != FILTER_POINT);
+ bool seamlessCube = (state.addressingModeU == ADDRESSING_SEAMLESS);
+ bool rectangleTexture = (state.textureType == TEXTURE_RECTANGLE);
+ if(hasFloatTexture() || hasUnnormalizedIntegerTexture() || forceFloatFiltering || seamlessCube || rectangleTexture) // FIXME: Mostly identical to integer sampling
+ {
+ Float4 uuuu = u; // Local copies of the coordinates; these are what the filter call below receives
+ Float4 vvvv = v;
+ Float4 wwww = w;
+ Float4 qqqq = q;
+
+ Int face[4];
+ Float lod;
+ Float anisotropy;
+ Float4 uDelta;
+ Float4 vDelta;
+
+ if(state.textureType != TEXTURE_3D) // Pick the LOD computation matching the texture type
+ {
+ if(state.textureType != TEXTURE_CUBE)
+ {
+ computeLod(texture, lod, anisotropy, uDelta, vDelta, uuuu, vvvv, bias.x, dsx, dsy, function);
+ }
+ else
+ {
+ Float4 M;
+ cubeFace(face, uuuu, vvvv, u, v, w, M); // NOTE(review): presumably fills face/uuuu/vvvv/M from u, v, w; helper not visible here
+ computeLodCube(texture, lod, u, v, w, bias.x, dsx, dsy, M, function);
+ }
+ }
+ else
+ {
+ computeLod3D(texture, lod, uuuu, vvvv, wwww, bias.x, dsx, dsy, function);
+ }
+
+ c = sampleFloatFilter(texture, uuuu, vvvv, wwww, qqqq, offset, lod, anisotropy, uDelta, vDelta, face, function);
+
+ if(!hasFloatTexture() && !hasUnnormalizedIntegerTexture()) // Normalized integer data filtered in float still needs scaling
+ {
+ if(has16bitTextureFormat())
+ {
+ switch(state.textureFormat)
+ {
+ case FORMAT_R5G6B5:
+ c.x *= Float4(1.0f / 0xF800);
+ c.y *= Float4(1.0f / 0xFC00);
+ c.z *= Float4(1.0f / 0xF800);
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+ else
+ {
+ for(int component = 0; component < textureComponentCount(); component++)
+ {
+ c[component] *= Float4(hasUnsignedTextureComponent(component) ? 1.0f / 0xFFFF : 1.0f / 0x7FFF); // Divide by the unsigned 16-bit or signed 15-bit maximum
+ }
+ }
+ }
+ }
+ else
+ {
+ Vector4s cs = sampleTexture(texture, u, v, w, q, bias, dsx, dsy, offset, function, false); // fixed12 = false: the Short4 overload skips its rescale/default/swizzle stage
+
+ if(state.textureFormat == FORMAT_R5G6B5)
+ {
+ c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF800);
+ c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xFC00);
+ c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF800);
+ }
+ else
+ {
+ for(int component = 0; component < textureComponentCount(); component++)
+ {
+ if(hasUnsignedTextureComponent(component))
+ {
+ convertUnsigned16(c[component], cs[component]);
+ }
+ else
+ {
+ convertSigned15(c[component], cs[component]);
+ }
+ }
+ }
+ }
+
+ int componentCount = textureComponentCount();
+ float defaultColorValue = colorsDefaultToZero ? 0.0f : 1.0f; // Value for missing green/blue channels; missing alpha is always 1
+
+ if(state.textureFilter != FILTER_GATHER) // Gather results are left as sampled
+ {
+ switch(state.textureFormat)
+ {
+ case FORMAT_R8I:
+ case FORMAT_R8UI:
+ case FORMAT_R16I:
+ case FORMAT_R16UI:
+ case FORMAT_R32I:
+ case FORMAT_R32UI:
+ c.y = As<Float4>(UInt4(0)); // Integer formats: missing channels get integer bit patterns, not float values; fallthrough
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8UI:
+ case FORMAT_G16R16I:
+ case FORMAT_G16R16UI:
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ c.z = As<Float4>(UInt4(0)); // Fallthrough
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X8B8G8R8UI:
+ case FORMAT_X16B16G16R16I:
+ case FORMAT_X16B16G16R16UI:
+ case FORMAT_X32B32G32R32I:
+ case FORMAT_X32B32G32R32UI:
+ c.w = As<Float4>(UInt4(1)); // Integer 1 bit pattern for alpha; fallthrough
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8UI:
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ break;
+ case FORMAT_R8_SNORM:
+ case FORMAT_G8R8_SNORM:
+ case FORMAT_X8B8G8R8_SNORM:
+ case FORMAT_A8B8G8R8_SNORM:
+ case FORMAT_R8:
+ case FORMAT_R5G6B5:
+ case FORMAT_G8R8:
+ case FORMAT_G16R16:
+ case FORMAT_A16B16G16R16:
+ case FORMAT_X8R8G8B8:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ case FORMAT_V8U8:
+ case FORMAT_Q8W8V8U8:
+ case FORMAT_X8L8V8U8:
+ case FORMAT_V16U16:
+ case FORMAT_A16W16V16U16:
+ case FORMAT_Q16W16V16U16:
+ case FORMAT_YV12_BT601:
+ case FORMAT_YV12_BT709:
+ case FORMAT_YV12_JFIF:
+ if(componentCount < 2) c.y = Float4(defaultColorValue); // Fill only the channels the format does not provide
+ if(componentCount < 3) c.z = Float4(defaultColorValue);
+ if(componentCount < 4) c.w = Float4(1.0f);
+ break;
+ case FORMAT_A8: // Alpha-only: sampled value moves to w, RGB = 0
+ c.w = c.x;
+ c.x = Float4(0.0f);
+ c.y = Float4(0.0f);
+ c.z = Float4(0.0f);
+ break;
+ case FORMAT_L8:
+ case FORMAT_L16: // Luminance: x replicated to y and z, alpha = 1
+ c.y = c.x;
+ c.z = c.x;
+ c.w = Float4(1.0f);
+ break;
+ case FORMAT_A8L8: // Sampled x replicated to y/z, sampled y becomes alpha
+ c.w = c.y;
+ c.y = c.x;
+ c.z = c.x;
+ break;
+ case FORMAT_R32F:
+ c.y = Float4(defaultColorValue); // Fallthrough
+ case FORMAT_G32R32F:
+ c.z = Float4(defaultColorValue); // Fallthrough
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ c.w = Float4(1.0f); // Fallthrough
+ case FORMAT_A32B32G32R32F:
+ break;
+ case FORMAT_D32F_LOCKABLE:
+ case FORMAT_D32FS8_TEXTURE:
+ case FORMAT_D32F_SHADOW:
+ case FORMAT_D32FS8_SHADOW: // Depth formats: result is (x, 0, 0, 1)
+ c.y = Float4(0.0f);
+ c.z = Float4(0.0f);
+ c.w = Float4(1.0f);
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ if((state.swizzleR != SWIZZLE_RED) ||
+ (state.swizzleG != SWIZZLE_GREEN) ||
+ (state.swizzleB != SWIZZLE_BLUE) ||
+ (state.swizzleA != SWIZZLE_ALPHA)) // Skipped entirely for the identity swizzle
+ {
+ const Vector4f col(c); // Snapshot, so every output channel reads the unswizzled values
+ applySwizzle(state.swizzleR, c.x, col);
+ applySwizzle(state.swizzleG, c.y, col);
+ applySwizzle(state.swizzleB, c.z, col);
+ applySwizzle(state.swizzleA, c.w, col);
+ }
+ }
+
+ return c;
+ }
+
+ Vector4f SamplerCore::textureSize(Pointer<Byte> &texture, Float4 &lod)
+ { // Returns, per lane i, the width/height/depth of mip level (lod[i] reinterpreted as Int) + baseLevel
+ Vector4f size; // NOTE(review): size.w is never written by this function
+
+ Int baseLevel = *Pointer<Int>(texture + OFFSET(Texture, baseLevel)); // Identical for every lane, so it is read once before the loop
+ for(int i = 0; i < 4; ++i)
+ {
+ Pointer<Byte> mipmap = texture + OFFSET(Texture, mipmap) + (As<Int>(Extract(lod, i)) + baseLevel) * sizeof(Mipmap); // As<Int>: the lod lane is bit-cast, not converted
+ size.x = Insert(size.x, As<Float>(Int(*Pointer<Short>(mipmap + OFFSET(Mipmap, width)))), i); // Dimensions are stored as integer bit patterns in the float lanes
+ size.y = Insert(size.y, As<Float>(Int(*Pointer<Short>(mipmap + OFFSET(Mipmap, height)))), i);
+ size.z = Insert(size.z, As<Float>(Int(*Pointer<Short>(mipmap + OFFSET(Mipmap, depth)))), i);
+ }
+
+ return size;
+ }
+
+ void SamplerCore::border(Short4 &mask, Float4 &coordinates)
+ { // Builds a per-lane mask from the test |coordinate - 0.5| < 0.5, i.e. coordinate strictly inside (0, 1)
+ Int4 border = As<Int4>(CmpLT(Abs(coordinates - Float4(0.5f)), Float4(0.5f)));
+ mask = As<Short4>(Int2(As<Int4>(PackSigned(border, border)))); // Packs the 32-bit per-lane results down to 16-bit lanes
+ }
+
+ void SamplerCore::border(Int4 &mask, Float4 &coordinates)
+ { // 32-bit variant: per-lane result of |coordinate - 0.5| < 0.5, i.e. coordinate strictly inside (0, 1)
+ mask = As<Int4>(CmpLT(Abs(coordinates - Float4(0.5f)), Float4(0.5f)));
+ }
+
+ Short4 SamplerCore::offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod)
+ { // Returns uvw displaced by count (-1, 0, +1 or 2) times an offset read from the mipmap structure
+ Short4 offset = *Pointer<Short4>(mipmap + halfOffset); // halfOffset is a byte offset into the mipmap structure
+
+ if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
+ {
+ offset &= Short4(CmpNLE(Float4(lod), Float4(0.0f))); // Offset is masked by the comparison "lod not <= 0"
+ }
+ else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
+ {
+ offset &= Short4(CmpLE(Float4(lod), Float4(0.0f))); // Offset is masked by the comparison "lod <= 0"
+ }
+
+ if(wrap) // Plain add/subtract
+ {
+ switch(count)
+ {
+ case -1: return uvw - offset;
+ case 0: return uvw;
+ case +1: return uvw + offset;
+ case 2: return uvw + offset + offset;
+ }
+ }
+ else // Clamp or mirror
+ {
+ switch(count) // Saturating unsigned add/subtract
+ {
+ case -1: return SubSat(As<UShort4>(uvw), As<UShort4>(offset));
+ case 0: return uvw;
+ case +1: return AddSat(As<UShort4>(uvw), As<UShort4>(offset));
+ case 2: return AddSat(AddSat(As<UShort4>(uvw), As<UShort4>(offset)), As<UShort4>(offset));
+ }
+ }
+
+ return uvw; // Any other count leaves the coordinate unchanged
+ }
+
+ Vector4s SamplerCore::sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], SamplerFunction function)
+ { // Samples via sampleAniso, optionally blends a second mip level, then substitutes the border color where required
+ Vector4s c = sampleAniso(texture, u, v, w, offset, lod, anisotropy, uDelta, vDelta, face, false, function); // secondLOD = false
+
+ if(function == Fetch) // Fetch returns the sample as is: no mip blending, no border handling
+ {
+ return c;
+ }
+
+ if(state.mipmapFilter == MIPMAP_LINEAR)
+ {
+ Vector4s cc = sampleAniso(texture, u, v, w, offset, lod, anisotropy, uDelta, vDelta, face, true, function); // secondLOD = true
+
+ lod *= Float(1 << 16); // Scales lod in place; lod is a reference, so the caller's value changes too
+
+ UShort4 utri = UShort4(Float4(lod)); // FIXME: Optimize
+ Short4 stri = utri >> 1; // FIXME: Optimize
+
+ if(hasUnsignedTextureComponent(0)) cc.x = MulHigh(As<UShort4>(cc.x), utri); else cc.x = MulHigh(cc.x, stri); // Second level weighted by utri (unsigned) or stri (signed)
+ if(hasUnsignedTextureComponent(1)) cc.y = MulHigh(As<UShort4>(cc.y), utri); else cc.y = MulHigh(cc.y, stri);
+ if(hasUnsignedTextureComponent(2)) cc.z = MulHigh(As<UShort4>(cc.z), utri); else cc.z = MulHigh(cc.z, stri);
+ if(hasUnsignedTextureComponent(3)) cc.w = MulHigh(As<UShort4>(cc.w), utri); else cc.w = MulHigh(cc.w, stri);
+
+ utri = ~utri; // Complementary weights for the first level
+ stri = Short4(0x7FFF) - stri;
+
+ if(hasUnsignedTextureComponent(0)) c.x = MulHigh(As<UShort4>(c.x), utri); else c.x = MulHigh(c.x, stri);
+ if(hasUnsignedTextureComponent(1)) c.y = MulHigh(As<UShort4>(c.y), utri); else c.y = MulHigh(c.y, stri);
+ if(hasUnsignedTextureComponent(2)) c.z = MulHigh(As<UShort4>(c.z), utri); else c.z = MulHigh(c.z, stri);
+ if(hasUnsignedTextureComponent(3)) c.w = MulHigh(As<UShort4>(c.w), utri); else c.w = MulHigh(c.w, stri);
+
+ c.x += cc.x;
+ c.y += cc.y;
+ c.z += cc.z;
+ c.w += cc.w;
+
+ if(!hasUnsignedTextureComponent(0)) c.x += c.x; // Signed components used the halved weight stri, so the sum is doubled
+ if(!hasUnsignedTextureComponent(1)) c.y += c.y;
+ if(!hasUnsignedTextureComponent(2)) c.z += c.z;
+ if(!hasUnsignedTextureComponent(3)) c.w += c.w;
+ }
+
+ Short4 borderMask; // AND of the border() masks of every axis that uses ADDRESSING_BORDER
+
+ if(state.addressingModeU == ADDRESSING_BORDER)
+ {
+ Short4 u0;
+
+ border(u0, u);
+
+ borderMask = u0;
+ }
+
+ if(state.addressingModeV == ADDRESSING_BORDER)
+ {
+ Short4 v0;
+
+ border(v0, v);
+
+ if(state.addressingModeU == ADDRESSING_BORDER)
+ {
+ borderMask &= v0;
+ }
+ else
+ {
+ borderMask = v0;
+ }
+ }
+
+ if(state.addressingModeW == ADDRESSING_BORDER && state.textureType == TEXTURE_3D) // W only counts for 3D textures
+ {
+ Short4 s0;
+
+ border(s0, w);
+
+ if(state.addressingModeU == ADDRESSING_BORDER ||
+ state.addressingModeV == ADDRESSING_BORDER)
+ {
+ borderMask &= s0;
+ }
+ else
+ {
+ borderMask = s0;
+ }
+ }
+
+ if(state.addressingModeU == ADDRESSING_BORDER ||
+ state.addressingModeV == ADDRESSING_BORDER ||
+ (state.addressingModeW == ADDRESSING_BORDER && state.textureType == TEXTURE_3D))
+ {
+ // Keep the filtered texel where borderMask is set; elsewhere take the texture's border color (shifted right by 1 for signed components).
+
+ c.x = (borderMask & c.x) | (~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[0])) >> (hasUnsignedTextureComponent(0) ? 0 : 1)));
+ c.y = (borderMask & c.y) | (~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[1])) >> (hasUnsignedTextureComponent(1) ? 0 : 1)));
+ c.z = (borderMask & c.z) | (~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[2])) >> (hasUnsignedTextureComponent(2) ? 0 : 1)));
+ c.w = (borderMask & c.w) | (~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[3])) >> (hasUnsignedTextureComponent(3) ? 0 : 1)));
+ }
+
+ return c;
+ }
+
+ Vector4s SamplerCore::sampleAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, SamplerFunction function)
+ { // Fixed-point sampling of one LOD: a single sampleQuad() tap, or a weighted sum of several taps stepped along (uDelta, vDelta) when anisotropic filtering applies.
+ Vector4s c;
+
+ if(state.textureFilter != FILTER_ANISOTROPIC || function == Lod || function == Fetch) // Lod and Fetch lookups always take the single-tap path.
+ {
+ c = sampleQuad(texture, u, v, w, offset, lod, face, secondLOD, function);
+ }
+ else
+ {
+ Int a = RoundInt(anisotropy); // Tap count; also the index into the constant tables read below.
+
+ Vector4s cSum; // Running weighted sum of all taps.
+
+ cSum.x = Short4(0);
+ cSum.y = Short4(0);
+ cSum.z = Short4(0);
+ cSum.w = Short4(0);
+
+ Float4 A = *Pointer<Float4>(constants + OFFSET(Constants,uvWeight) + 16 * a); // 16-byte table entry for 'a' taps; scales the per-tap coordinate step.
+ Float4 B = *Pointer<Float4>(constants + OFFSET(Constants,uvStart) + 16 * a); // 16-byte table entry; positions the first tap as a multiple of the delta.
+ UShort4 cw = *Pointer<UShort4>(constants + OFFSET(Constants,cWeight) + 8 * a); // 8-byte table entry; unsigned 16-bit weight applied to every tap.
+ Short4 sw = Short4(cw >> 1); // Halved weight for the signed multiply; compensated by the doubling after the loop.
+
+ Float4 du = uDelta;
+ Float4 dv = vDelta;
+
+ Float4 u0 = u + B * du; // Coordinates of the first tap.
+ Float4 v0 = v + B * dv;
+
+ du *= A; // Step between consecutive taps.
+ dv *= A;
+
+ Int i = 0;
+
+ Do // Run-time loop over the taps; the exit condition is tested after the body.
+ {
+ c = sampleQuad(texture, u0, v0, w, offset, lod, face, secondLOD, function);
+
+ u0 += du;
+ v0 += dv;
+
+ if(hasUnsignedTextureComponent(0)) cSum.x += As<Short4>(MulHigh(As<UShort4>(c.x), cw)); else cSum.x += MulHigh(c.x, sw); // Unsigned components use the full weight, signed ones the halved weight.
+ if(hasUnsignedTextureComponent(1)) cSum.y += As<Short4>(MulHigh(As<UShort4>(c.y), cw)); else cSum.y += MulHigh(c.y, sw);
+ if(hasUnsignedTextureComponent(2)) cSum.z += As<Short4>(MulHigh(As<UShort4>(c.z), cw)); else cSum.z += MulHigh(c.z, sw);
+ if(hasUnsignedTextureComponent(3)) cSum.w += As<Short4>(MulHigh(As<UShort4>(c.w), cw)); else cSum.w += MulHigh(c.w, sw);
+
+ i++;
+ }
+ Until(i >= a)
+
+ if(hasUnsignedTextureComponent(0)) c.x = cSum.x; else c.x = AddSat(cSum.x, cSum.x); // Signed sums were built with half weights, so double them with saturation.
+ if(hasUnsignedTextureComponent(1)) c.y = cSum.y; else c.y = AddSat(cSum.y, cSum.y);
+ if(hasUnsignedTextureComponent(2)) c.z = cSum.z; else c.z = AddSat(cSum.z, cSum.z);
+ if(hasUnsignedTextureComponent(3)) c.w = cSum.w; else c.w = AddSat(cSum.w, cSum.w);
+ }
+
+ return c;
+ }
+
+ Vector4s SamplerCore::sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function)
+ {
+ // Only volume textures use the 3D sampler, which does not take the face indices.
+ if(state.textureType == TEXTURE_3D)
+ {
+ return sample3D(texture, u, v, w, offset, lod, secondLOD, function);
+ }
+
+ // All other texture types go through the 2D path.
+ return sampleQuad2D(texture, u, v, w, offset, lod, face, secondLOD, function);
+ }
+
+ Vector4s SamplerCore::sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function)
+ { // Fixed-point 2D sampling of one mip level: point/fetch (one texel), bilinear blend of four texels, or gather of four texels.
+ Vector4s c;
+
+ int componentCount = textureComponentCount();
+ bool gather = state.textureFilter == FILTER_GATHER;
+
+ Pointer<Byte> mipmap;
+ Pointer<Byte> buffer[4];
+
+ selectMipmap(texture, buffer, mipmap, lod, face, secondLOD);
+
+ bool texelFetch = (function == Fetch);
+
+ Short4 uuuu = texelFetch ? Short4(As<Int4>(u)) : address(u, state.addressingModeU, mipmap); // For Fetch the float registers are reinterpreted (not converted) as integer texel coordinates.
+ Short4 vvvv = texelFetch ? Short4(As<Int4>(v)) : address(v, state.addressingModeV, mipmap);
+ Short4 wwww = texelFetch ? Short4(As<Int4>(w)) : address(w, state.addressingModeW, mipmap);
+
+ if(state.textureFilter == FILTER_POINT || texelFetch) // Single texel, no blending.
+ {
+ c = sampleTexel(uuuu, vvvv, wwww, offset, mipmap, buffer, function);
+ }
+ else
+ {
+ Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, gather ? 0 : -1, lod); // First tap along u (count -1, or 0 for gather).
+ Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, gather ? 0 : -1, lod); // First tap along v.
+ Short4 uuuu1 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, gather ? 2 : +1, lod); // Second tap along u (count +1, or 2 for gather).
+ Short4 vvvv1 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, gather ? 2 : +1, lod); // Second tap along v.
+
+ Vector4s c0 = sampleTexel(uuuu0, vvvv0, wwww, offset, mipmap, buffer, function); // (u0, v0)
+ Vector4s c1 = sampleTexel(uuuu1, vvvv0, wwww, offset, mipmap, buffer, function); // (u1, v0)
+ Vector4s c2 = sampleTexel(uuuu0, vvvv1, wwww, offset, mipmap, buffer, function); // (u0, v1)
+ Vector4s c3 = sampleTexel(uuuu1, vvvv1, wwww, offset, mipmap, buffer, function); // (u1, v1)
+
+ if(!gather) // Blend the four texels bilinearly.
+ {
+ // Fractions: 16-bit weights derived from the first tap's coordinate scaled by the mip dimensions.
+ UShort4 f0u = As<UShort4>(uuuu0) * *Pointer<UShort4>(mipmap + OFFSET(Mipmap,width));
+ UShort4 f0v = As<UShort4>(vvvv0) * *Pointer<UShort4>(mipmap + OFFSET(Mipmap,height));
+
+ UShort4 f1u = ~f0u; // Complementary weight (0xFFFF - f0u).
+ UShort4 f1v = ~f0v;
+
+ UShort4 f0u0v = MulHigh(f0u, f0v); // Products of the per-axis weights, one per texel.
+ UShort4 f1u0v = MulHigh(f1u, f0v);
+ UShort4 f0u1v = MulHigh(f0u, f1v);
+ UShort4 f1u1v = MulHigh(f1u, f1v);
+
+ // Signed fractions: the same weights halved, for use with the signed MulHigh.
+ Short4 f1u1vs;
+ Short4 f0u1vs;
+ Short4 f1u0vs;
+ Short4 f0u0vs;
+
+ if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3)) // Only needed when at least one component is signed.
+ {
+ f1u1vs = f1u1v >> 1;
+ f0u1vs = f0u1v >> 1;
+ f1u0vs = f1u0v >> 1;
+ f0u0vs = f0u0v >> 1;
+ }
+
+ // Bilinear interpolation, one component at a time
+ if(componentCount >= 1)
+ {
+ if(has16bitTextureComponents() && hasUnsignedTextureComponent(0)) // Unsigned 16-bit data: three successive lerps of the form a - a*f + b*f.
+ {
+ c0.x = As<UShort4>(c0.x) - MulHigh(As<UShort4>(c0.x), f0u) + MulHigh(As<UShort4>(c1.x), f0u);
+ c2.x = As<UShort4>(c2.x) - MulHigh(As<UShort4>(c2.x), f0u) + MulHigh(As<UShort4>(c3.x), f0u);
+ c.x = As<UShort4>(c0.x) - MulHigh(As<UShort4>(c0.x), f0v) + MulHigh(As<UShort4>(c2.x), f0v);
+ }
+ else // Otherwise: weight each texel by its fraction product and sum the four.
+ {
+ if(hasUnsignedTextureComponent(0))
+ {
+ c0.x = MulHigh(As<UShort4>(c0.x), f1u1v);
+ c1.x = MulHigh(As<UShort4>(c1.x), f0u1v);
+ c2.x = MulHigh(As<UShort4>(c2.x), f1u0v);
+ c3.x = MulHigh(As<UShort4>(c3.x), f0u0v);
+ }
+ else
+ {
+ c0.x = MulHigh(c0.x, f1u1vs);
+ c1.x = MulHigh(c1.x, f0u1vs);
+ c2.x = MulHigh(c2.x, f1u0vs);
+ c3.x = MulHigh(c3.x, f0u0vs);
+ }
+
+ c.x = (c0.x + c1.x) + (c2.x + c3.x);
+ if(!hasUnsignedTextureComponent(0)) c.x = AddSat(c.x, c.x); // Correct for signed fractions (halved weights)
+ }
+ }
+
+ if(componentCount >= 2) // Same scheme as component x.
+ {
+ if(has16bitTextureComponents() && hasUnsignedTextureComponent(1))
+ {
+ c0.y = As<UShort4>(c0.y) - MulHigh(As<UShort4>(c0.y), f0u) + MulHigh(As<UShort4>(c1.y), f0u);
+ c2.y = As<UShort4>(c2.y) - MulHigh(As<UShort4>(c2.y), f0u) + MulHigh(As<UShort4>(c3.y), f0u);
+ c.y = As<UShort4>(c0.y) - MulHigh(As<UShort4>(c0.y), f0v) + MulHigh(As<UShort4>(c2.y), f0v);
+ }
+ else
+ {
+ if(hasUnsignedTextureComponent(1))
+ {
+ c0.y = MulHigh(As<UShort4>(c0.y), f1u1v);
+ c1.y = MulHigh(As<UShort4>(c1.y), f0u1v);
+ c2.y = MulHigh(As<UShort4>(c2.y), f1u0v);
+ c3.y = MulHigh(As<UShort4>(c3.y), f0u0v);
+ }
+ else
+ {
+ c0.y = MulHigh(c0.y, f1u1vs);
+ c1.y = MulHigh(c1.y, f0u1vs);
+ c2.y = MulHigh(c2.y, f1u0vs);
+ c3.y = MulHigh(c3.y, f0u0vs);
+ }
+
+ c.y = (c0.y + c1.y) + (c2.y + c3.y);
+ if(!hasUnsignedTextureComponent(1)) c.y = AddSat(c.y, c.y); // Correct for signed fractions
+ }
+ }
+
+ if(componentCount >= 3) // Same scheme as component x.
+ {
+ if(has16bitTextureComponents() && hasUnsignedTextureComponent(2))
+ {
+ c0.z = As<UShort4>(c0.z) - MulHigh(As<UShort4>(c0.z), f0u) + MulHigh(As<UShort4>(c1.z), f0u);
+ c2.z = As<UShort4>(c2.z) - MulHigh(As<UShort4>(c2.z), f0u) + MulHigh(As<UShort4>(c3.z), f0u);
+ c.z = As<UShort4>(c0.z) - MulHigh(As<UShort4>(c0.z), f0v) + MulHigh(As<UShort4>(c2.z), f0v);
+ }
+ else
+ {
+ if(hasUnsignedTextureComponent(2))
+ {
+ c0.z = MulHigh(As<UShort4>(c0.z), f1u1v);
+ c1.z = MulHigh(As<UShort4>(c1.z), f0u1v);
+ c2.z = MulHigh(As<UShort4>(c2.z), f1u0v);
+ c3.z = MulHigh(As<UShort4>(c3.z), f0u0v);
+ }
+ else
+ {
+ c0.z = MulHigh(c0.z, f1u1vs);
+ c1.z = MulHigh(c1.z, f0u1vs);
+ c2.z = MulHigh(c2.z, f1u0vs);
+ c3.z = MulHigh(c3.z, f0u0vs);
+ }
+
+ c.z = (c0.z + c1.z) + (c2.z + c3.z);
+ if(!hasUnsignedTextureComponent(2)) c.z = AddSat(c.z, c.z); // Correct for signed fractions
+ }
+ }
+
+ if(componentCount >= 4) // Same scheme as component x.
+ {
+ if(has16bitTextureComponents() && hasUnsignedTextureComponent(3))
+ {
+ c0.w = As<UShort4>(c0.w) - MulHigh(As<UShort4>(c0.w), f0u) + MulHigh(As<UShort4>(c1.w), f0u);
+ c2.w = As<UShort4>(c2.w) - MulHigh(As<UShort4>(c2.w), f0u) + MulHigh(As<UShort4>(c3.w), f0u);
+ c.w = As<UShort4>(c0.w) - MulHigh(As<UShort4>(c0.w), f0v) + MulHigh(As<UShort4>(c2.w), f0v);
+ }
+ else
+ {
+ if(hasUnsignedTextureComponent(3))
+ {
+ c0.w = MulHigh(As<UShort4>(c0.w), f1u1v);
+ c1.w = MulHigh(As<UShort4>(c1.w), f0u1v);
+ c2.w = MulHigh(As<UShort4>(c2.w), f1u0v);
+ c3.w = MulHigh(As<UShort4>(c3.w), f0u0v);
+ }
+ else
+ {
+ c0.w = MulHigh(c0.w, f1u1vs);
+ c1.w = MulHigh(c1.w, f0u1vs);
+ c2.w = MulHigh(c2.w, f1u0vs);
+ c3.w = MulHigh(c3.w, f0u0vs);
+ }
+
+ c.w = (c0.w + c1.w) + (c2.w + c3.w);
+ if(!hasUnsignedTextureComponent(3)) c.w = AddSat(c.w, c.w); // Correct for signed fractions
+ }
+ }
+ }
+ else // Gather: return the first component of each of the four texels, unblended.
+ {
+ c.x = c1.x;
+ c.y = c2.x;
+ c.z = c3.x;
+ c.w = c0.x;
+ }
+ }
+
+ return c;
+ }
+
+ Vector4s SamplerCore::sample3D(Pointer<Byte> &texture, Float4 &u_, Float4 &v_, Float4 &w_, Vector4f &offset, Float &lod, bool secondLOD, SamplerFunction function)
+ { // Fixed-point volume sampling of one mip level: a single texel for point/fetch, otherwise a weighted sum of the eight neighboring texels.
+ Vector4s c_;
+
+ int componentCount = textureComponentCount();
+
+ Pointer<Byte> mipmap;
+ Pointer<Byte> buffer[4];
+ Int face[4]; // Local placeholder; only passed to selectMipmap() here.
+
+ selectMipmap(texture, buffer, mipmap, lod, face, secondLOD);
+
+ bool texelFetch = (function == Fetch);
+
+ Short4 uuuu = texelFetch ? Short4(As<Int4>(u_)) : address(u_, state.addressingModeU, mipmap); // For Fetch the float registers are reinterpreted (not converted) as integer texel coordinates.
+ Short4 vvvv = texelFetch ? Short4(As<Int4>(v_)) : address(v_, state.addressingModeV, mipmap);
+ Short4 wwww = texelFetch ? Short4(As<Int4>(w_)) : address(w_, state.addressingModeW, mipmap);
+
+ if(state.textureFilter == FILTER_POINT || texelFetch) // Single texel, no blending.
+ {
+ c_ = sampleTexel(uuuu, vvvv, wwww, offset, mipmap, buffer, function);
+ }
+ else
+ {
+ Vector4s c[2][2][2]; // Texels of the 2x2x2 neighborhood, indexed [u][v][w].
+
+ Short4 u[2][2][2];
+ Short4 v[2][2][2];
+ Short4 s[2][2][2];
+
+ for(int i = 0; i < 2; i++) // Plain C++ loops: executed while the routine is built, so code for all eight taps is emitted.
+ {
+ for(int j = 0; j < 2; j++)
+ {
+ for(int k = 0; k < 2; k++)
+ {
+ u[i][j][k] = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, i * 2 - 1, lod); // Count is -1 for index 0 and +1 for index 1.
+ v[i][j][k] = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, j * 2 - 1, lod);
+ s[i][j][k] = offsetSample(wwww, mipmap, OFFSET(Mipmap,wHalf), state.addressingModeW == ADDRESSING_WRAP, k * 2 - 1, lod);
+ }
+ }
+ }
+
+ // Fractions: 16-bit weights derived from the first tap's coordinates scaled by the mip dimensions.
+ UShort4 f0u = As<UShort4>(u[0][0][0]) * *Pointer<UShort4>(mipmap + OFFSET(Mipmap,width));
+ UShort4 f0v = As<UShort4>(v[0][0][0]) * *Pointer<UShort4>(mipmap + OFFSET(Mipmap,height));
+ UShort4 f0s = As<UShort4>(s[0][0][0]) * *Pointer<UShort4>(mipmap + OFFSET(Mipmap,depth));
+
+ UShort4 f1u = ~f0u; // Complementary weights (0xFFFF - f0).
+ UShort4 f1v = ~f0v;
+ UShort4 f1s = ~f0s;
+
+ UShort4 f[2][2][2]; // Per-texel weights; texel c[i][j][k] is scaled by f[1 - i][1 - j][1 - k] below.
+ Short4 fs[2][2][2]; // The same weights halved, for signed components.
+
+ f[1][1][1] = MulHigh(f1u, f1v); // First pass: products of the u and v weights.
+ f[0][1][1] = MulHigh(f0u, f1v);
+ f[1][0][1] = MulHigh(f1u, f0v);
+ f[0][0][1] = MulHigh(f0u, f0v);
+ f[1][1][0] = MulHigh(f1u, f1v);
+ f[0][1][0] = MulHigh(f0u, f1v);
+ f[1][0][0] = MulHigh(f1u, f0v);
+ f[0][0][0] = MulHigh(f0u, f0v);
+
+ f[1][1][1] = MulHigh(f[1][1][1], f1s); // Second pass: multiply in the weight along the third axis.
+ f[0][1][1] = MulHigh(f[0][1][1], f1s);
+ f[1][0][1] = MulHigh(f[1][0][1], f1s);
+ f[0][0][1] = MulHigh(f[0][0][1], f1s);
+ f[1][1][0] = MulHigh(f[1][1][0], f0s);
+ f[0][1][0] = MulHigh(f[0][1][0], f0s);
+ f[1][0][0] = MulHigh(f[1][0][0], f0s);
+ f[0][0][0] = MulHigh(f[0][0][0], f0s);
+
+ // Signed fractions, only computed when at least one component is signed
+ if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
+ {
+ fs[0][0][0] = f[0][0][0] >> 1;
+ fs[0][0][1] = f[0][0][1] >> 1;
+ fs[0][1][0] = f[0][1][0] >> 1;
+ fs[0][1][1] = f[0][1][1] >> 1;
+ fs[1][0][0] = f[1][0][0] >> 1;
+ fs[1][0][1] = f[1][0][1] >> 1;
+ fs[1][1][0] = f[1][1][0] >> 1;
+ fs[1][1][1] = f[1][1][1] >> 1;
+ }
+
+ for(int i = 0; i < 2; i++) // Fetch, weight and accumulate the eight texels.
+ {
+ for(int j = 0; j < 2; j++)
+ {
+ for(int k = 0; k < 2; k++)
+ {
+ c[i][j][k] = sampleTexel(u[i][j][k], v[i][j][k], s[i][j][k], offset, mipmap, buffer, function);
+
+ if(componentCount >= 1) { if(hasUnsignedTextureComponent(0)) c[i][j][k].x = MulHigh(As<UShort4>(c[i][j][k].x), f[1 - i][1 - j][1 - k]); else c[i][j][k].x = MulHigh(c[i][j][k].x, fs[1 - i][1 - j][1 - k]); }
+ if(componentCount >= 2) { if(hasUnsignedTextureComponent(1)) c[i][j][k].y = MulHigh(As<UShort4>(c[i][j][k].y), f[1 - i][1 - j][1 - k]); else c[i][j][k].y = MulHigh(c[i][j][k].y, fs[1 - i][1 - j][1 - k]); }
+ if(componentCount >= 3) { if(hasUnsignedTextureComponent(2)) c[i][j][k].z = MulHigh(As<UShort4>(c[i][j][k].z), f[1 - i][1 - j][1 - k]); else c[i][j][k].z = MulHigh(c[i][j][k].z, fs[1 - i][1 - j][1 - k]); }
+ if(componentCount >= 4) { if(hasUnsignedTextureComponent(3)) c[i][j][k].w = MulHigh(As<UShort4>(c[i][j][k].w), f[1 - i][1 - j][1 - k]); else c[i][j][k].w = MulHigh(c[i][j][k].w, fs[1 - i][1 - j][1 - k]); }
+
+ if(i != 0 || j != 0 || k != 0) // c[0][0][0] doubles as the accumulator, so it is not added to itself.
+ {
+ if(componentCount >= 1) c[0][0][0].x += c[i][j][k].x;
+ if(componentCount >= 2) c[0][0][0].y += c[i][j][k].y;
+ if(componentCount >= 3) c[0][0][0].z += c[i][j][k].z;
+ if(componentCount >= 4) c[0][0][0].w += c[i][j][k].w;
+ }
+ }
+ }
+ }
+
+ if(componentCount >= 1) c_.x = c[0][0][0].x; // Components beyond componentCount are left unassigned here.
+ if(componentCount >= 2) c_.y = c[0][0][0].y;
+ if(componentCount >= 3) c_.z = c[0][0][0].z;
+ if(componentCount >= 4) c_.w = c[0][0][0].w;
+
+ // Correct for signed fractions: signed components used halved weights, so double them with saturation
+ if(componentCount >= 1) if(!hasUnsignedTextureComponent(0)) c_.x = AddSat(c_.x, c_.x);
+ if(componentCount >= 2) if(!hasUnsignedTextureComponent(1)) c_.y = AddSat(c_.y, c_.y);
+ if(componentCount >= 3) if(!hasUnsignedTextureComponent(2)) c_.z = AddSat(c_.z, c_.z);
+ if(componentCount >= 4) if(!hasUnsignedTextureComponent(3)) c_.w = AddSat(c_.w, c_.w);
+ }
+
+ return c_;
+ }
+
+ Vector4f SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], SamplerFunction function)
+ { // Floating-point filtering entry point: samples the first LOD, optionally blends in the second LOD, then substitutes the border color where required.
+ Vector4f c = sampleFloatAniso(texture, u, v, w, q, offset, lod, anisotropy, uDelta, vDelta, face, false, function); // secondLOD = false
+
+ if(function == Fetch) // Texel fetches skip both mipmap blending and border handling.
+ {
+ return c;
+ }
+
+ if(state.mipmapFilter == MIPMAP_LINEAR)
+ {
+ Vector4f cc = sampleFloatAniso(texture, u, v, w, q, offset, lod, anisotropy, uDelta, vDelta, face, true, function); // secondLOD = true
+
+ Float4 lod4 = Float4(Frac(lod)); // Fractional LOD replicated to all lanes; used as the blend factor.
+
+ c.x = (cc.x - c.x) * lod4 + c.x; // Linear interpolation from c towards cc.
+ c.y = (cc.y - c.y) * lod4 + c.y;
+ c.z = (cc.z - c.z) * lod4 + c.z;
+ c.w = (cc.w - c.w) * lod4 + c.w;
+ }
+
+ Int4 borderMask; // Combined (ANDed) result of border() for every axis that uses border addressing.
+
+ if(state.addressingModeU == ADDRESSING_BORDER)
+ {
+ Int4 u0;
+
+ border(u0, u);
+
+ borderMask = u0;
+ }
+
+ if(state.addressingModeV == ADDRESSING_BORDER)
+ {
+ Int4 v0;
+
+ border(v0, v);
+
+ if(state.addressingModeU == ADDRESSING_BORDER) // Merge with the u mask if there is one, otherwise start the mask here.
+ {
+ borderMask &= v0;
+ }
+ else
+ {
+ borderMask = v0;
+ }
+ }
+
+ if(state.addressingModeW == ADDRESSING_BORDER && state.textureType == TEXTURE_3D) // The third axis only participates for volume textures.
+ {
+ Int4 s0;
+
+ border(s0, w);
+
+ if(state.addressingModeU == ADDRESSING_BORDER ||
+ state.addressingModeV == ADDRESSING_BORDER)
+ {
+ borderMask &= s0;
+ }
+ else
+ {
+ borderMask = s0;
+ }
+ }
+
+ if(state.addressingModeU == ADDRESSING_BORDER ||
+ state.addressingModeV == ADDRESSING_BORDER ||
+ (state.addressingModeW == ADDRESSING_BORDER && state.textureType == TEXTURE_3D))
+ {
+ // Bitwise per-lane select: keep the sampled value where borderMask is set,
+ // otherwise substitute the texture's floating-point border color.
+ c.x = As<Float4>((borderMask & As<Int4>(c.x)) | (~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[0]))));
+ c.y = As<Float4>((borderMask & As<Int4>(c.y)) | (~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[1]))));
+ c.z = As<Float4>((borderMask & As<Int4>(c.z)) | (~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[2]))));
+ c.w = As<Float4>((borderMask & As<Int4>(c.w)) | (~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[3]))));
+ }
+
+ return c;
+ }
+
+ Vector4f SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, SamplerFunction function)
+ { // Floating-point sampling of one LOD: a single sampleFloat() tap, or a weighted sum of several taps stepped along (uDelta, vDelta) when anisotropic filtering applies.
+ Vector4f c;
+
+ if(state.textureFilter != FILTER_ANISOTROPIC || function == Lod || function == Fetch) // Lod and Fetch lookups always take the single-tap path.
+ {
+ c = sampleFloat(texture, u, v, w, q, offset, lod, face, secondLOD, function);
+ }
+ else
+ {
+ Int a = RoundInt(anisotropy); // Tap count; also the index into the constant tables read below.
+
+ Vector4f cSum; // Running weighted sum of all taps.
+
+ cSum.x = Float4(0.0f);
+ cSum.y = Float4(0.0f);
+ cSum.z = Float4(0.0f);
+ cSum.w = Float4(0.0f);
+
+ Float4 A = *Pointer<Float4>(constants + OFFSET(Constants,uvWeight) + 16 * a); // 16-byte table entry for 'a' taps; used both as coordinate step scale and as color weight.
+ Float4 B = *Pointer<Float4>(constants + OFFSET(Constants,uvStart) + 16 * a); // 16-byte table entry; positions the first tap as a multiple of the delta.
+
+ Float4 du = uDelta;
+ Float4 dv = vDelta;
+
+ Float4 u0 = u + B * du; // Coordinates of the first tap.
+ Float4 v0 = v + B * dv;
+
+ du *= A; // Step between consecutive taps.
+ dv *= A;
+
+ Int i = 0;
+
+ Do // Run-time loop over the taps; the exit condition is tested after the body.
+ {
+ c = sampleFloat(texture, u0, v0, w, q, offset, lod, face, secondLOD, function);
+
+ u0 += du;
+ v0 += dv;
+
+ cSum.x += c.x * A; // Each tap contributes with weight A.
+ cSum.y += c.y * A;
+ cSum.z += c.z * A;
+ cSum.w += c.w * A;
+
+ i++;
+ }
+ Until(i >= a)
+
+ c.x = cSum.x;
+ c.y = cSum.y;
+ c.z = cSum.z;
+ c.w = cSum.w;
+ }
+
+ return c;
+ }
+
+ Vector4f SamplerCore::sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function)
+ {
+ // Only volume textures use the 3D sampler, which takes neither q nor the face indices.
+ if(state.textureType == TEXTURE_3D)
+ {
+ return sampleFloat3D(texture, u, v, w, offset, lod, secondLOD, function);
+ }
+
+ // All other texture types go through the 2D path.
+ return sampleFloat2D(texture, u, v, w, q, offset, lod, face, secondLOD, function);
+ }
+
+ Vector4f SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function)
+ { // Floating-point 2D sampling of one mip level: point/fetch (one texel), bilinear blend of four texels, or gather of four texels.
+ Vector4f c;
+
+ int componentCount = textureComponentCount();
+ bool gather = state.textureFilter == FILTER_GATHER;
+
+ Pointer<Byte> mipmap;
+ Pointer<Byte> buffer[4];
+
+ selectMipmap(texture, buffer, mipmap, lod, face, secondLOD);
+
+ Int4 x0, x1, y0, y1, z0; // Integer texel coordinates of the two taps per axis (a single one for the third axis).
+ Float4 fu, fv; // Interpolation fractions along u and v.
+ Int4 filter = computeFilterOffset(lod);
+ address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU, function);
+ address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);
+ address(w, z0, z0, fv, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW, function); // NOTE(review): fv is passed again as the fraction output and z0 for both taps; confirm address() leaves fv untouched for this axis, otherwise the v fraction is overwritten.
+
+ Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
+ y0 *= pitchP; // Turn the row coordinate into a row offset.
+ if(hasThirdCoordinate())
+ {
+ Int4 sliceP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
+ z0 *= sliceP; // Turn the third coordinate into a slice offset.
+ }
+
+ if(state.textureFilter == FILTER_POINT || (function == Fetch)) // Single texel, no blending.
+ {
+ c = sampleTexel(x0, y0, z0, q, mipmap, buffer, function);
+ }
+ else
+ {
+ y1 *= pitchP;
+
+ Vector4f c0 = sampleTexel(x0, y0, z0, q, mipmap, buffer, function); // (x0, y0)
+ Vector4f c1 = sampleTexel(x1, y0, z0, q, mipmap, buffer, function); // (x1, y0)
+ Vector4f c2 = sampleTexel(x0, y1, z0, q, mipmap, buffer, function); // (x0, y1)
+ Vector4f c3 = sampleTexel(x1, y1, z0, q, mipmap, buffer, function); // (x1, y1)
+
+ if(!gather) // Blend the four texels bilinearly.
+ {
+ if(componentCount >= 1) c0.x = c0.x + fu * (c1.x - c0.x); // Lerp along u on row y0.
+ if(componentCount >= 2) c0.y = c0.y + fu * (c1.y - c0.y);
+ if(componentCount >= 3) c0.z = c0.z + fu * (c1.z - c0.z);
+ if(componentCount >= 4) c0.w = c0.w + fu * (c1.w - c0.w);
+
+ if(componentCount >= 1) c2.x = c2.x + fu * (c3.x - c2.x); // Lerp along u on row y1.
+ if(componentCount >= 2) c2.y = c2.y + fu * (c3.y - c2.y);
+ if(componentCount >= 3) c2.z = c2.z + fu * (c3.z - c2.z);
+ if(componentCount >= 4) c2.w = c2.w + fu * (c3.w - c2.w);
+
+ if(componentCount >= 1) c.x = c0.x + fv * (c2.x - c0.x); // Lerp between the two rows along v.
+ if(componentCount >= 2) c.y = c0.y + fv * (c2.y - c0.y);
+ if(componentCount >= 3) c.z = c0.z + fv * (c2.z - c0.z);
+ if(componentCount >= 4) c.w = c0.w + fv * (c2.w - c0.w);
+ }
+ else // Gather: return the first component of each of the four texels, unblended.
+ {
+ c.x = c1.x;
+ c.y = c2.x;
+ c.z = c3.x;
+ c.w = c0.x;
+ }
+ }
+
+ return c;
+ }
+
+ Vector4f SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, bool secondLOD, SamplerFunction function)
+ { // Floating-point volume sampling of one mip level: a single texel for point/fetch, otherwise a trilinear blend of eight texels.
+ Vector4f c;
+
+ int componentCount = textureComponentCount();
+
+ Pointer<Byte> mipmap;
+ Pointer<Byte> buffer[4];
+ Int face[4]; // Local placeholder; only passed to selectMipmap() here.
+
+ selectMipmap(texture, buffer, mipmap, lod, face, secondLOD);
+
+ Int4 x0, x1, y0, y1, z0, z1; // Integer texel coordinates of the two taps per axis.
+ Float4 fu, fv, fw; // Interpolation fractions per axis.
+ Int4 filter = computeFilterOffset(lod);
+ address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU, function);
+ address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);
+ address(w, z0, z1, fw, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW, function);
+
+ Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
+ Int4 sliceP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
+ y0 *= pitchP; // Turn row and slice coordinates into offsets.
+ z0 *= sliceP;
+
+ if(state.textureFilter == FILTER_POINT || (function == Fetch)) // Single texel, no blending.
+ {
+ c = sampleTexel(x0, y0, z0, w, mipmap, buffer, function); // NOTE(review): w is passed in the position where sampleFloat2D() passes q; confirm this is intended.
+ }
+ else
+ {
+ y1 *= pitchP;
+ z1 *= sliceP;
+
+ Vector4f c0 = sampleTexel(x0, y0, z0, w, mipmap, buffer, function); // Slice z0: (x0, y0)
+ Vector4f c1 = sampleTexel(x1, y0, z0, w, mipmap, buffer, function); // Slice z0: (x1, y0)
+ Vector4f c2 = sampleTexel(x0, y1, z0, w, mipmap, buffer, function); // Slice z0: (x0, y1)
+ Vector4f c3 = sampleTexel(x1, y1, z0, w, mipmap, buffer, function); // Slice z0: (x1, y1)
+ Vector4f c4 = sampleTexel(x0, y0, z1, w, mipmap, buffer, function); // Slice z1: (x0, y0)
+ Vector4f c5 = sampleTexel(x1, y0, z1, w, mipmap, buffer, function); // Slice z1: (x1, y0)
+ Vector4f c6 = sampleTexel(x0, y1, z1, w, mipmap, buffer, function); // Slice z1: (x0, y1)
+ Vector4f c7 = sampleTexel(x1, y1, z1, w, mipmap, buffer, function); // Slice z1: (x1, y1)
+
+ // Blend first slice (z0): along u on each row, then along v; the result ends up in c0
+ if(componentCount >= 1) c0.x = c0.x + fu * (c1.x - c0.x);
+ if(componentCount >= 2) c0.y = c0.y + fu * (c1.y - c0.y);
+ if(componentCount >= 3) c0.z = c0.z + fu * (c1.z - c0.z);
+ if(componentCount >= 4) c0.w = c0.w + fu * (c1.w - c0.w);
+
+ if(componentCount >= 1) c2.x = c2.x + fu * (c3.x - c2.x);
+ if(componentCount >= 2) c2.y = c2.y + fu * (c3.y - c2.y);
+ if(componentCount >= 3) c2.z = c2.z + fu * (c3.z - c2.z);
+ if(componentCount >= 4) c2.w = c2.w + fu * (c3.w - c2.w);
+
+ if(componentCount >= 1) c0.x = c0.x + fv * (c2.x - c0.x);
+ if(componentCount >= 2) c0.y = c0.y + fv * (c2.y - c0.y);
+ if(componentCount >= 3) c0.z = c0.z + fv * (c2.z - c0.z);
+ if(componentCount >= 4) c0.w = c0.w + fv * (c2.w - c0.w);
+
+ // Blend second slice (z1) the same way; the result ends up in c4
+ if(componentCount >= 1) c4.x = c4.x + fu * (c5.x - c4.x);
+ if(componentCount >= 2) c4.y = c4.y + fu * (c5.y - c4.y);
+ if(componentCount >= 3) c4.z = c4.z + fu * (c5.z - c4.z);
+ if(componentCount >= 4) c4.w = c4.w + fu * (c5.w - c4.w);
+
+ if(componentCount >= 1) c6.x = c6.x + fu * (c7.x - c6.x);
+ if(componentCount >= 2) c6.y = c6.y + fu * (c7.y - c6.y);
+ if(componentCount >= 3) c6.z = c6.z + fu * (c7.z - c6.z);
+ if(componentCount >= 4) c6.w = c6.w + fu * (c7.w - c6.w);
+
+ if(componentCount >= 1) c4.x = c4.x + fv * (c6.x - c4.x);
+ if(componentCount >= 2) c4.y = c4.y + fv * (c6.y - c4.y);
+ if(componentCount >= 3) c4.z = c4.z + fv * (c6.z - c4.z);
+ if(componentCount >= 4) c4.w = c4.w + fv * (c6.w - c4.w);
+
+ // Blend slices: interpolate between the two slice results along the third axis
+ if(componentCount >= 1) c.x = c0.x + fw * (c4.x - c0.x);
+ if(componentCount >= 2) c.y = c0.y + fw * (c4.y - c0.y);
+ if(componentCount >= 3) c.z = c0.z + fw * (c4.z - c0.z);
+ if(componentCount >= 4) c.w = c0.w + fw * (c4.w - c0.w);
+ }
+
+ return c;
+ }
+
+ Float SamplerCore::log2sqrt(Float lod)
+ {
+ // Approximates log2(sqrt(lod)), i.e. 0.25 * log2(lod^2), from the float's bit pattern.
+ Float squared = lod * lod; // Squaring doubles the exponent and produces an extra bit of precision.
+ Float bits = Float(As<Int>(squared)); // Interpret the bit pattern as an integer, then convert that integer to float.
+ Float unbiased = bits - Float(0x3F800000); // Subtract the exponent bias (the bit pattern of 1.0f).
+
+ return unbiased * As<Float>(Int(0x33000000)); // Scale by 0.25 * 2^-23 (mantissa length).
+ }
+
+ Float SamplerCore::log2(Float lod)
+ {
+ Float squared = lod * lod; // Squaring doubles the exponent and produces an extra bit of precision.
+ Float unbiased = Float(As<Int>(squared)) - Float(0x3F800000); // Interpret as integer and subtract the exponent bias.
+
+ // Scale by 0.5 * 2^-23 (mantissa length); the factor 0.5 compensates for the squaring.
+ return unbiased * As<Float>(Int(0x33800000));
+ }
+
+ void SamplerCore::computeLod(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, SamplerFunction function)
+ { // Computes the LOD for 2D lookups (plus anisotropy and tap direction when anisotropic filtering is on) and clamps it to the texture's [minLod, maxLod].
+ if(function != Lod && function != Fetch) // LOD derived from coordinate derivatives.
+ {
+ Float4 duvdxy; // Read below as (du/dx, du/dy, dv/dx, dv/dy).
+
+ if(function != Grad) // Implicit: differences of lanes 1 and 2 against lane 0
+ {
+ duvdxy = Float4(uuuu.yz, vvvv.yz) - Float4(uuuu.xx, vvvv.xx);
+ }
+ else // Explicit derivatives supplied in dsx/dsy.
+ {
+ Float4 dudxy = Float4(dsx.x.xx, dsy.x.xx);
+ Float4 dvdxy = Float4(dsx.y.xx, dsy.y.xx);
+
+ duvdxy = Float4(dudxy.xz, dvdxy.xz);
+ }
+
+ // Scale by texture dimensions and global LOD.
+ Float4 dUVdxy = duvdxy * *Pointer<Float4>(texture + OFFSET(Texture,widthHeightLOD));
+
+ Float4 dUV2dxy = dUVdxy * dUVdxy;
+ Float4 dUV2 = dUV2dxy.xy + dUV2dxy.zw; // Squared footprint lengths: x lane for the d/dx axis, y lane for the d/dy axis.
+
+ lod = Max(Float(dUV2.x), Float(dUV2.y)); // Square length of major axis
+
+ if(state.textureFilter == FILTER_ANISOTROPIC)
+ {
+ Float det = Abs(Float(dUVdxy.x) * Float(dUVdxy.w) - Float(dUVdxy.y) * Float(dUVdxy.z)); // |dU/dx * dV/dy - dU/dy * dV/dx|
+
+ Float4 dudx = duvdxy.xxxx;
+ Float4 dudy = duvdxy.yyyy;
+ Float4 dvdx = duvdxy.zzzz;
+ Float4 dvdy = duvdxy.wwww;
+
+ Int4 mask = As<Int4>(CmpNLT(dUV2.x, dUV2.y)); // Set when the d/dx axis is not shorter than the d/dy axis.
+ uDelta = As<Float4>((As<Int4>(dudx) & mask) | ((As<Int4>(dudy) & ~mask))); // Unscaled derivatives along the selected (major) axis.
+ vDelta = As<Float4>((As<Int4>(dvdx) & mask) | ((As<Int4>(dvdy) & ~mask)));
+
+ anisotropy = lod * Rcp_pp(det); // Squared major-axis length divided by the determinant.
+ anisotropy = Min(anisotropy, *Pointer<Float>(texture + OFFSET(Texture,maxAnisotropy)));
+
+ lod *= Rcp_pp(anisotropy * anisotropy); // lod is still a squared length here, hence the squared divisor.
+ }
+
+ lod = log2sqrt(lod); // log2(sqrt(lod))
+
+ if(function == Bias)
+ {
+ lod += lodBias;
+ }
+ }
+ else if(function == Lod) // Explicit LOD: lodBias carries the LOD itself.
+ {
+ lod = lodBias;
+ }
+ else if(function == Fetch)
+ {
+ // TODO: Eliminate int-float-int conversion.
+ lod = Float(As<Int>(lodBias)); // lodBias holds an integer bit pattern here; convert its value to float.
+ }
+ else if(function == Base) // NOTE(review): looks unreachable, since Base would already satisfy the first condition above - confirm against SamplerFunction's comparison semantics.
+ {
+ lod = Float(0);
+ }
+ else assert(false);
+
+ lod = Max(lod, *Pointer<Float>(texture + OFFSET(Texture, minLod))); // Clamp to the texture's LOD range.
+ lod = Min(lod, *Pointer<Float>(texture + OFFSET(Texture, maxLod)));
+ }
+
+ void SamplerCore::computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, Float4 &M, SamplerFunction function)
+ { // Computes the LOD for cube map lookups from the direction vector scaled by M, and clamps it to the texture's [minLod, maxLod].
+ if(function != Lod && function != Fetch) // LOD derived from coordinate derivatives.
+ {
+ Float4 dudxy, dvdxy, dsdxy;
+
+ if(function != Grad) // Implicit: absolute differences of every lane against lane 0
+ {
+ Float4 U = u * M;
+ Float4 V = v * M;
+ Float4 W = w * M;
+
+ dudxy = Abs(U - U.xxxx);
+ dvdxy = Abs(V - V.xxxx);
+ dsdxy = Abs(W - W.xxxx);
+ }
+ else // Explicit derivatives supplied in dsx/dsy, scaled by lane 0 of M.
+ {
+ dudxy = Float4(dsx.x.xx, dsy.x.xx);
+ dvdxy = Float4(dsx.y.xx, dsy.y.xx);
+ dsdxy = Float4(dsx.z.xx, dsy.z.xx);
+
+ dudxy = Abs(dudxy * Float4(M.x));
+ dvdxy = Abs(dvdxy * Float4(M.x));
+ dsdxy = Abs(dsdxy * Float4(M.x));
+ }
+
+ // Compute the largest Manhattan distance in two dimensions.
+ // This takes the footprint across adjacent faces into account.
+ Float4 duvdxy = dudxy + dvdxy;
+ Float4 dusdxy = dudxy + dsdxy;
+ Float4 dvsdxy = dvdxy + dsdxy;
+
+ dudxy = Max(Max(duvdxy, dusdxy), dvsdxy); // dudxy is reused to hold the per-lane maximum.
+
+ lod = Max(Float(dudxy.y), Float(dudxy.z)); // FIXME: Max(dudxy.y, dudxy.z);
+
+ // Scale by texture dimension and global LOD.
+ lod *= *Pointer<Float>(texture + OFFSET(Texture,widthLOD));
+
+ lod = log2(lod); // A length rather than a squared length, so log2() is used instead of log2sqrt().
+
+ if(function == Bias)
+ {
+ lod += lodBias;
+ }
+ }
+ else if(function == Lod) // Explicit LOD: lodBias carries the LOD itself.
+ {
+ lod = lodBias;
+ }
+ else if(function == Fetch)
+ {
+ // TODO: Eliminate int-float-int conversion.
+ lod = Float(As<Int>(lodBias)); // lodBias holds an integer bit pattern here; convert its value to float.
+ }
+ else if(function == Base) // NOTE(review): looks unreachable, since Base would already satisfy the first condition above - confirm against SamplerFunction's comparison semantics.
+ {
+ lod = Float(0);
+ }
+ else assert(false);
+
+ lod = Max(lod, *Pointer<Float>(texture + OFFSET(Texture, minLod))); // Clamp to the texture's LOD range.
+ lod = Min(lod, *Pointer<Float>(texture + OFFSET(Texture, maxLod)));
+ }
+
+ void SamplerCore::computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, SamplerFunction function)
+ { // Computes the LOD for volume texture lookups from three-dimensional coordinate derivatives, and clamps it to the texture's [minLod, maxLod].
+ if(function != Lod && function != Fetch) // LOD derived from coordinate derivatives.
+ {
+ Float4 dudxy, dvdxy, dsdxy;
+
+ if(function != Grad) // Implicit: differences of every lane against lane 0
+ {
+ dudxy = uuuu - uuuu.xxxx;
+ dvdxy = vvvv - vvvv.xxxx;
+ dsdxy = wwww - wwww.xxxx;
+ }
+ else // Explicit derivatives supplied in dsx/dsy.
+ {
+ dudxy = Float4(dsx.x.xx, dsy.x.xx);
+ dvdxy = Float4(dsx.y.xx, dsy.y.xx);
+ dsdxy = Float4(dsx.z.xx, dsy.z.xx);
+ }
+
+ // Scale by texture dimensions and global LOD.
+ dudxy *= *Pointer<Float4>(texture + OFFSET(Texture,widthLOD));
+ dvdxy *= *Pointer<Float4>(texture + OFFSET(Texture,heightLOD));
+ dsdxy *= *Pointer<Float4>(texture + OFFSET(Texture,depthLOD));
+
+ dudxy *= dudxy; // Square each derivative ...
+ dvdxy *= dvdxy;
+ dsdxy *= dsdxy;
+
+ dudxy += dvdxy; // ... and sum them into per-lane squared lengths, kept in dudxy.
+ dudxy += dsdxy;
+
+ lod = Max(Float(dudxy.y), Float(dudxy.z)); // FIXME: Max(dudxy.y, dudxy.z);
+
+ lod = log2sqrt(lod); // log2(sqrt(lod))
+
+ if(function == Bias)
+ {
+ lod += lodBias;
+ }
+ }
+ else if(function == Lod) // Explicit LOD: lodBias carries the LOD itself.
+ {
+ lod = lodBias;
+ }
+ else if(function == Fetch)
+ {
+ // TODO: Eliminate int-float-int conversion.
+ lod = Float(As<Int>(lodBias)); // lodBias holds an integer bit pattern here; convert its value to float.
+ }
+ else if(function == Base) // NOTE(review): looks unreachable, since Base would already satisfy the first condition above - confirm against SamplerFunction's comparison semantics.
+ {
+ lod = Float(0);
+ }
+ else assert(false);
+
+ lod = Max(lod, *Pointer<Float>(texture + OFFSET(Texture, minLod))); // Clamp to the texture's LOD range.
+ lod = Min(lod, *Pointer<Float>(texture + OFFSET(Texture, maxLod)));
+ }
+
+ void SamplerCore::cubeFace(Int face[4], Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M)
+ { // Selects the cube face per lane from direction (x, y, z), and outputs face-local coordinates U, V and the scale factor M.
+ Int4 xn = CmpLT(x, Float4(0.0f)); // x < 0
+ Int4 yn = CmpLT(y, Float4(0.0f)); // y < 0
+ Int4 zn = CmpLT(z, Float4(0.0f)); // z < 0
+
+ Float4 absX = Abs(x);
+ Float4 absY = Abs(y);
+ Float4 absZ = Abs(z);
+
+ Int4 xy = CmpNLE(absX, absY); // abs(x) > abs(y)
+ Int4 yz = CmpNLE(absY, absZ); // abs(y) > abs(z)
+ Int4 zx = CmpNLE(absZ, absX); // abs(z) > abs(x)
+ Int4 xMajor = xy & ~zx; // abs(x) > abs(y) && abs(x) > abs(z)
+ Int4 yMajor = yz & ~xy; // abs(y) > abs(z) && abs(y) > abs(x)
+ Int4 zMajor = zx & ~yz; // abs(z) > abs(x) && abs(z) > abs(y)
+
+ // FACE_POSITIVE_X = 000b
+ // FACE_NEGATIVE_X = 001b
+ // FACE_POSITIVE_Y = 010b
+ // FACE_NEGATIVE_Y = 011b
+ // FACE_POSITIVE_Z = 100b
+ // FACE_NEGATIVE_Z = 101b
+
+ Int yAxis = SignMask(yMajor); // One bit per lane, used as a table index below.
+ Int zAxis = SignMask(zMajor);
+
+ Int4 n = ((xn & xMajor) | (yn & yMajor) | (zn & zMajor)) & Int4(0x80000000); // Sign bit set in lanes whose major-axis component is negative.
+ Int negative = SignMask(n);
+
+ face[0] = *Pointer<Int>(constants + OFFSET(Constants,transposeBit0) + negative * 4); // Table lookups spread each lane's bit into that lane's 4-bit field.
+ face[0] |= *Pointer<Int>(constants + OFFSET(Constants,transposeBit1) + yAxis * 4);
+ face[0] |= *Pointer<Int>(constants + OFFSET(Constants,transposeBit2) + zAxis * 4);
+ face[1] = (face[0] >> 4) & 0x7; // Extract the 3-bit face index of lanes 1 to 3 ...
+ face[2] = (face[0] >> 8) & 0x7;
+ face[3] = (face[0] >> 12) & 0x7;
+ face[0] &= 0x7; // ... and finally of lane 0.
+
+ M = Max(Max(absX, absY), absZ); // Magnitude of the major-axis component.
+
+ // U = xMajor ? (neg ^ -z) : ((zMajor & neg) ^ x)
+ U = As<Float4>((xMajor & (n ^ As<Int4>(-z))) | (~xMajor & ((zMajor & n) ^ As<Int4>(x)))); // XOR with n flips the float sign bit.
+
+ // V = !yMajor ? -y : (n ^ z)
+ V = As<Float4>((~yMajor & As<Int4>(-y)) | (yMajor & (n ^ As<Int4>(z))));
+
+ M = reciprocal(M) * Float4(0.5f); // M becomes 0.5 / major magnitude and is returned to the caller in that form.
+ U = U * M + Float4(0.5f); // Remap from [-major, +major] to [0, 1].
+ V = V * M + Float4(0.5f);
+ }
+
+ Short4 SamplerCore::applyOffset(Short4 &uvw, Float4 &offset, const Int4 &whd, AddressingMode mode)
+ { // Adds an integer texel offset to 16-bit texel coordinates and wraps or clamps the result against the dimension whd according to the addressing mode.
+ Int4 tmp = Int4(As<UShort4>(uvw)); // Widen the coordinates as unsigned 16-bit values.
+ tmp = tmp + As<Int4>(offset); // The offset register is reinterpreted (not converted) as integers.
+
+ switch(mode)
+ {
+ case AddressingMode::ADDRESSING_WRAP:
+ tmp = (tmp + whd * Int4(-MIN_PROGRAM_TEXEL_OFFSET)) % whd; // Adds a multiple of whd before the modulo; presumably MIN_PROGRAM_TEXEL_OFFSET is negative so the dividend stays non-negative - TODO confirm.
+ break;
+ case AddressingMode::ADDRESSING_CLAMP:
+ case AddressingMode::ADDRESSING_MIRROR:
+ case AddressingMode::ADDRESSING_MIRRORONCE:
+ case AddressingMode::ADDRESSING_BORDER: // FIXME: Implement and test ADDRESSING_MIRROR, ADDRESSING_MIRRORONCE, ADDRESSING_BORDER
+ tmp = Min(Max(tmp, Int4(0)), whd - Int4(1)); // Clamp to [0, whd - 1].
+ break;
+ case ADDRESSING_TEXELFETCH: // Neither wrapped nor clamped here.
+ break;
+ case AddressingMode::ADDRESSING_SEAMLESS:
+ ASSERT(false); // Cube sampling doesn't support offset. Falls through to default.
+ default:
+ ASSERT(false);
+ }
+
+ return As<Short4>(UShort4(tmp)); // Narrow back to 16 bits per lane.
+ }
+
+ void SamplerCore::computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, Vector4f &offset, const Pointer<Byte> &mipmap, SamplerFunction function) // Converts four sets of 16-bit coordinates into four texel indices within the given mipmap level.
+ {
+ bool texelFetch = (function == Fetch);
+ bool hasOffset = (function.option == Offset);
+
+ if(!texelFetch) // Fetch coordinates skip this scaling and are used as passed in.
+ {
+ uuuu = MulHigh(As<UShort4>(uuuu), *Pointer<UShort4>(mipmap + OFFSET(Mipmap, width))); // Upper 16 bits of coordinate * width
+ vvvv = MulHigh(As<UShort4>(vvvv), *Pointer<UShort4>(mipmap + OFFSET(Mipmap, height))); // Upper 16 bits of coordinate * height
+ }
+
+ if(hasOffset)
+ {
+ UShort4 w = *Pointer<UShort4>(mipmap + OFFSET(Mipmap, width));
+ uuuu = applyOffset(uuuu, offset.x, Int4(w), texelFetch ? ADDRESSING_TEXELFETCH : state.addressingModeU);
+ UShort4 h = *Pointer<UShort4>(mipmap + OFFSET(Mipmap, height));
+ vvvv = applyOffset(vvvv, offset.y, Int4(h), texelFetch ? ADDRESSING_TEXELFETCH : state.addressingModeV);
+ }
+ // Interleave u with v, then multiply-add each pair with onePitchP (presumably {1, pitch}, giving u + v * pitch - TODO confirm).
+ Short4 uuu2 = uuuu;
+ uuuu = As<Short4>(UnpackLow(uuuu, vvvv));
+ uuu2 = As<Short4>(UnpackHigh(uuu2, vvvv));
+ uuuu = As<Short4>(MulAdd(uuuu, *Pointer<Short4>(mipmap + OFFSET(Mipmap,onePitchP))));
+ uuu2 = As<Short4>(MulAdd(uuu2, *Pointer<Short4>(mipmap + OFFSET(Mipmap,onePitchP))));
+
+ if(hasThirdCoordinate())
+ {
+ if(state.textureType != TEXTURE_2D_ARRAY) // For 2D arrays wwww is used unscaled and without offset.
+ {
+ if(!texelFetch)
+ {
+ wwww = MulHigh(As<UShort4>(wwww), *Pointer<UShort4>(mipmap + OFFSET(Mipmap, depth))); // Upper 16 bits of coordinate * depth
+ }
+
+ if(hasOffset)
+ {
+ UShort4 d = *Pointer<UShort4>(mipmap + OFFSET(Mipmap, depth));
+ wwww = applyOffset(wwww, offset.z, Int4(d), texelFetch ? ADDRESSING_TEXELFETCH : state.addressingModeW);
+ }
+ }
+
+ UInt4 uv(As<UInt2>(uuuu), As<UInt2>(uuu2)); // Gather the four 32-bit 2D indices into one vector.
+ uv += As<UInt4>(Int4(As<UShort4>(wwww))) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP)); // Add w * sliceP per lane.
+
+ index[0] = Extract(As<Int4>(uv), 0);
+ index[1] = Extract(As<Int4>(uv), 1);
+ index[2] = Extract(As<Int4>(uv), 2);
+ index[3] = Extract(As<Int4>(uv), 3);
+ }
+ else
+ {
+ index[0] = Extract(As<Int2>(uuuu), 0);
+ index[1] = Extract(As<Int2>(uuuu), 1);
+ index[2] = Extract(As<Int2>(uuu2), 0);
+ index[3] = Extract(As<Int2>(uuu2), 1);
+ }
+
+ if(texelFetch) // Clamp fetch indices to [0, size - 1].
+ {
+ Int size = Int(*Pointer<Int>(mipmap + OFFSET(Mipmap, sliceP)));
+ if(hasThirdCoordinate())
+ {
+ size *= Int(*Pointer<Short>(mipmap + OFFSET(Mipmap, depth))); // size becomes sliceP * depth.
+ }
+ UInt min = 0;
+ UInt max = size - 1;
+
+ for(int i = 0; i < 4; i++)
+ {
+ index[i] = Min(Max(index[i], min), max);
+ }
+ }
+ }
+
+ void SamplerCore::computeIndices(UInt index[4], Int4& uuuu, Int4& vvvv, Int4& wwww, const Pointer<Byte> &mipmap, SamplerFunction function) // Sums the per-axis integer terms into four texel indices; mipmap and function are not read here.
+ {
+ UInt4 texelIndex = uuuu + vvvv;
+
+ if(hasThirdCoordinate())
+ {
+ texelIndex += As<UInt4>(wwww); // Third-axis term, only for 3D and 2D array textures.
+ }
+
+ index[0] = Extract(As<Int4>(texelIndex), 0);
+ index[1] = Extract(As<Int4>(texelIndex), 1);
+ index[2] = Extract(As<Int4>(texelIndex), 2);
+ index[3] = Extract(As<Int4>(texelIndex), 3);
+ }
+
+ Vector4s SamplerCore::sampleTexel(UInt index[4], Pointer<Byte> buffer[4]) // Reads four texels at the given indices and returns them as per-component vectors of 16-bit lanes.
+ {
+ Vector4s c;
+ // Buffer used by each of the four lanes: cube textures read lane i from buffer[i], all other types read from buffer[0].
+ int f0 = state.textureType == TEXTURE_CUBE ? 0 : 0;
+ int f1 = state.textureType == TEXTURE_CUBE ? 1 : 0;
+ int f2 = state.textureType == TEXTURE_CUBE ? 2 : 0;
+ int f3 = state.textureType == TEXTURE_CUBE ? 3 : 0;
+
+ if(has16bitTextureFormat()) // Packed 16-bit texels: one Short per texel.
+ {
+ c.x = Insert(c.x, Pointer<Short>(buffer[f0])[index[0]], 0);
+ c.x = Insert(c.x, Pointer<Short>(buffer[f1])[index[1]], 1);
+ c.x = Insert(c.x, Pointer<Short>(buffer[f2])[index[2]], 2);
+ c.x = Insert(c.x, Pointer<Short>(buffer[f3])[index[3]], 3);
+
+ switch(state.textureFormat)
+ {
+ case FORMAT_R5G6B5: // Move each bit field to the top of its own 16-bit lane.
+ c.z = (c.x & Short4(0x001Fu)) << 11;
+ c.y = (c.x & Short4(0x07E0u)) << 5;
+ c.x = (c.x & Short4(0xF800u));
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+ else if(has8bitTextureComponents())
+ {
+ switch(textureComponentCount())
+ {
+ case 4: // Four bytes per texel, loaded as one Byte4 each.
+ {
+ Byte4 c0 = Pointer<Byte4>(buffer[f0])[index[0]];
+ Byte4 c1 = Pointer<Byte4>(buffer[f1])[index[1]];
+ Byte4 c2 = Pointer<Byte4>(buffer[f2])[index[2]];
+ Byte4 c3 = Pointer<Byte4>(buffer[f3])[index[3]];
+ c.x = Unpack(c0, c1);
+ c.y = Unpack(c2, c3);
+
+ switch(state.textureFormat)
+ {
+ case FORMAT_A8R8G8B8: // Same shuffle as the A8B8G8R8 case below, with the x and z outputs exchanged.
+ c.z = As<Short4>(UnpackLow(c.x, c.y));
+ c.x = As<Short4>(UnpackHigh(c.x, c.y));
+ c.y = c.z;
+ c.w = c.x;
+ c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
+ c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(c.y));
+ c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x));
+ c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(c.w));
+ break;
+ case FORMAT_A8B8G8R8:
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8_SNORM:
+ case FORMAT_Q8W8V8U8:
+ case FORMAT_SRGB8_A8:
+ c.z = As<Short4>(UnpackHigh(c.x, c.y));
+ c.x = As<Short4>(UnpackLow(c.x, c.y));
+ c.y = c.x;
+ c.w = c.z;
+ c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x)); // Each byte is unpacked with itself, i.e. duplicated into both halves of its 16-bit lane.
+ c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(c.y));
+ c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
+ c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(c.w));
+ // Propagate sign bit
+ if(state.textureFormat == FORMAT_A8B8G8R8I)
+ {
+ c.x >>= 8;
+ c.y >>= 8;
+ c.z >>= 8;
+ c.w >>= 8;
+ }
+ break;
+ case FORMAT_A8B8G8R8UI: // Same shuffle, but unpacked against zero instead of against itself.
+ c.z = As<Short4>(UnpackHigh(c.x, c.y));
+ c.x = As<Short4>(UnpackLow(c.x, c.y));
+ c.y = c.x;
+ c.w = c.z;
+ c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(Short4(0)));
+ c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(Short4(0)));
+ c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(Short4(0)));
+ c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(Short4(0)));
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+ break;
+ case 3: // Loaded as Byte4 like the 4-component case; c.w is not computed.
+ {
+ Byte4 c0 = Pointer<Byte4>(buffer[f0])[index[0]];
+ Byte4 c1 = Pointer<Byte4>(buffer[f1])[index[1]];
+ Byte4 c2 = Pointer<Byte4>(buffer[f2])[index[2]];
+ Byte4 c3 = Pointer<Byte4>(buffer[f3])[index[3]];
+ c.x = Unpack(c0, c1);
+ c.y = Unpack(c2, c3);
+
+ switch(state.textureFormat)
+ {
+ case FORMAT_X8R8G8B8:
+ c.z = As<Short4>(UnpackLow(c.x, c.y));
+ c.x = As<Short4>(UnpackHigh(c.x, c.y));
+ c.y = c.z;
+ c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
+ c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(c.y));
+ c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x));
+ break;
+ case FORMAT_X8B8G8R8_SNORM:
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_X8L8V8U8:
+ case FORMAT_SRGB8_X8:
+ c.z = As<Short4>(UnpackHigh(c.x, c.y));
+ c.x = As<Short4>(UnpackLow(c.x, c.y));
+ c.y = c.x;
+ c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x));
+ c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(c.y));
+ c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
+ // Propagate sign bit
+ if(state.textureFormat == FORMAT_X8B8G8R8I)
+ {
+ c.x >>= 8;
+ c.y >>= 8;
+ c.z >>= 8;
+ }
+ break;
+ case FORMAT_X8B8G8R8UI:
+ c.z = As<Short4>(UnpackHigh(c.x, c.y));
+ c.x = As<Short4>(UnpackLow(c.x, c.y));
+ c.y = c.x;
+ c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(Short4(0)));
+ c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(Short4(0)));
+ c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(Short4(0)));
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+ break;
+ case 2: // Two bytes per texel, loaded as one Short each.
+ c.x = Insert(c.x, Pointer<Short>(buffer[f0])[index[0]], 0);
+ c.x = Insert(c.x, Pointer<Short>(buffer[f1])[index[1]], 1);
+ c.x = Insert(c.x, Pointer<Short>(buffer[f2])[index[2]], 2);
+ c.x = Insert(c.x, Pointer<Short>(buffer[f3])[index[3]], 3);
+
+ switch(state.textureFormat)
+ {
+ case FORMAT_G8R8:
+ case FORMAT_G8R8_SNORM:
+ case FORMAT_V8U8:
+ case FORMAT_A8L8:
+ c.y = (c.x & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c.x) >> 8); // High byte copied into both halves of the lane
+ c.x = (c.x & Short4(0x00FFu)) | (c.x << 8); // Low byte copied into both halves of the lane
+ break;
+ case FORMAT_G8R8I:
+ c.y = c.x >> 8;
+ c.x = (c.x << 8) >> 8; // Propagate sign bit
+ break;
+ case FORMAT_G8R8UI:
+ c.y = As<Short4>(As<UShort4>(c.x) >> 8); // Unsigned shift leaves the upper byte zero.
+ c.x &= Short4(0x00FFu);
+ break;
+ default:
+ ASSERT(false);
+ }
+ break;
+ case 1: // One byte per texel; the four bytes are combined into a single Int first.
+ {
+ Int c0 = Int(*Pointer<Byte>(buffer[f0] + index[0]));
+ Int c1 = Int(*Pointer<Byte>(buffer[f1] + index[1]));
+ Int c2 = Int(*Pointer<Byte>(buffer[f2] + index[2]));
+ Int c3 = Int(*Pointer<Byte>(buffer[f3] + index[3]));
+ c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+
+ switch(state.textureFormat)
+ {
+ case FORMAT_R8I:
+ case FORMAT_R8UI:
+ {
+ Int zero(0);
+ c.x = Unpack(As<Byte4>(c0), As<Byte4>(zero));
+ // Propagate sign bit
+ if(state.textureFormat == FORMAT_R8I)
+ {
+ c.x = (c.x << 8) >> 8;
+ }
+ }
+ break;
+ default:
+ c.x = Unpack(As<Byte4>(c0));
+ break;
+ }
+ }
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+ else if(has16bitTextureComponents())
+ {
+ switch(textureComponentCount())
+ {
+ case 4: // Load one whole texel per vector, then transpose so each vector holds one component.
+ c.x = Pointer<Short4>(buffer[f0])[index[0]];
+ c.y = Pointer<Short4>(buffer[f1])[index[1]];
+ c.z = Pointer<Short4>(buffer[f2])[index[2]];
+ c.w = Pointer<Short4>(buffer[f3])[index[3]];
+ transpose4x4(c.x, c.y, c.z, c.w);
+ break;
+ case 3:
+ c.x = Pointer<Short4>(buffer[f0])[index[0]];
+ c.y = Pointer<Short4>(buffer[f1])[index[1]];
+ c.z = Pointer<Short4>(buffer[f2])[index[2]];
+ c.w = Pointer<Short4>(buffer[f3])[index[3]];
+ transpose4x3(c.x, c.y, c.z, c.w);
+ break;
+ case 2: // Texels are 4 bytes apart; each load reads a full Short4 starting at the texel.
+ c.x = *Pointer<Short4>(buffer[f0] + 4 * index[0]);
+ c.x = As<Short4>(UnpackLow(c.x, *Pointer<Short4>(buffer[f1] + 4 * index[1])));
+ c.z = *Pointer<Short4>(buffer[f2] + 4 * index[2]);
+ c.z = As<Short4>(UnpackLow(c.z, *Pointer<Short4>(buffer[f3] + 4 * index[3])));
+ c.y = c.x;
+ c.x = UnpackLow(As<Int2>(c.x), As<Int2>(c.z));
+ c.y = UnpackHigh(As<Int2>(c.y), As<Int2>(c.z));
+ break;
+ case 1:
+ c.x = Insert(c.x, Pointer<Short>(buffer[f0])[index[0]], 0);
+ c.x = Insert(c.x, Pointer<Short>(buffer[f1])[index[1]], 1);
+ c.x = Insert(c.x, Pointer<Short>(buffer[f2])[index[2]], 2);
+ c.x = Insert(c.x, Pointer<Short>(buffer[f3])[index[3]], 3);
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+ else ASSERT(false);
+
+ if(state.sRGB) // Convert to linear through lookup tables sized for the component's bit width.
+ {
+ if(state.textureFormat == FORMAT_R5G6B5)
+ {
+ sRGBtoLinear16_5_16(c.x);
+ sRGBtoLinear16_6_16(c.y);
+ sRGBtoLinear16_5_16(c.z);
+ }
+ else
+ {
+ for(int i = 0; i < textureComponentCount(); i++)
+ {
+ if(isRGBComponent(i)) // Components that are not RGB are left unconverted.
+ {
+ sRGBtoLinear16_8_16(c[i]);
+ }
+ }
+ }
+ }
+
+ return c;
+ }
+
+ Vector4s SamplerCore::sampleTexel(Short4 &uuuu, Short4 &vvvv, Short4 &wwww, Vector4f &offset, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function) // Computes texel indices from 16-bit coordinates and reads four texels; YUV formats are converted to RGB here.
+ {
+ Vector4s c;
+
+ UInt index[4];
+ computeIndices(index, uuuu, vvvv, wwww, offset, mipmap, function);
+
+ if(hasYuvFormat())
+ {
+ // Generic YPbPr to RGB transformation
+ // R = Y + 2 * (1 - Kr) * Pr
+ // G = Y - 2 * Kb * (1 - Kb) / Kg * Pb - 2 * Kr * (1 - Kr) / Kg * Pr
+ // B = Y + 2 * (1 - Kb) * Pb
+
+ float Kb = 0.114f;
+ float Kr = 0.299f;
+ int studioSwing = 1;
+
+ switch(state.textureFormat) // Pick the luma coefficients and range convention for the format.
+ {
+ case FORMAT_YV12_BT601:
+ Kb = 0.114f;
+ Kr = 0.299f;
+ studioSwing = 1;
+ break;
+ case FORMAT_YV12_BT709:
+ Kb = 0.0722f;
+ Kr = 0.2126f;
+ studioSwing = 1;
+ break;
+ case FORMAT_YV12_JFIF:
+ Kb = 0.114f;
+ Kr = 0.299f;
+ studioSwing = 0;
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ const float Kg = 1.0f - Kr - Kb;
+ // Chroma weights taken from the formulas above.
+ const float Rr = 2 * (1 - Kr);
+ const float Gb = -2 * Kb * (1 - Kb) / Kg;
+ const float Gr = -2 * Kr * (1 - Kr) / Kg;
+ const float Bb = 2 * (1 - Kb);
+
+ // Scaling and bias for studio-swing range: Y = [16 .. 235], U/V = [16 .. 240]
+ const float Yy = studioSwing ? 255.0f / (235 - 16) : 1.0f;
+ const float Uu = studioSwing ? 255.0f / (240 - 16) : 1.0f;
+ const float Vv = studioSwing ? 255.0f / (240 - 16) : 1.0f;
+ // Fold the range scaling into the chroma weights.
+ const float Rv = Vv * Rr;
+ const float Gu = Uu * Gb;
+ const float Gv = Vv * Gr;
+ const float Bu = Uu * Bb;
+ // Constant terms: the 16 luma offset (studio swing only) and the 128 chroma offset, normalized by 255.
+ const float R0 = (studioSwing * -16 * Yy - 128 * Rv) / 255;
+ const float G0 = (studioSwing * -16 * Yy - 128 * Gu - 128 * Gv) / 255;
+ const float B0 = (studioSwing * -16 * Yy - 128 * Bu) / 255;
+
+ Int c0 = Int(buffer[0][index[0]]); // Luma bytes come from buffer[0].
+ Int c1 = Int(buffer[0][index[1]]);
+ Int c2 = Int(buffer[0][index[2]]);
+ Int c3 = Int(buffer[0][index[3]]);
+ c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+ UShort4 Y = As<UShort4>(Unpack(As<Byte4>(c0)));
+ // Recompute the indices with the Mipmap structure that follows this one; they address both chroma buffers.
+ computeIndices(index, uuuu, vvvv, wwww, offset, mipmap + sizeof(Mipmap), function);
+ c0 = Int(buffer[1][index[0]]); // buffer[1] supplies V.
+ c1 = Int(buffer[1][index[1]]);
+ c2 = Int(buffer[1][index[2]]);
+ c3 = Int(buffer[1][index[3]]);
+ c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+ UShort4 V = As<UShort4>(Unpack(As<Byte4>(c0)));
+
+ c0 = Int(buffer[2][index[0]]); // buffer[2] supplies U.
+ c1 = Int(buffer[2][index[1]]);
+ c2 = Int(buffer[2][index[2]]);
+ c3 = Int(buffer[2][index[3]]);
+ c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+ UShort4 U = As<UShort4>(Unpack(As<Byte4>(c0)));
+ // Coefficients as fixed-point values scaled by 0x4000; signs are flipped where needed so each constant is non-negative.
+ const UShort4 yY = UShort4(iround(Yy * 0x4000));
+ const UShort4 rV = UShort4(iround(Rv * 0x4000));
+ const UShort4 gU = UShort4(iround(-Gu * 0x4000));
+ const UShort4 gV = UShort4(iround(-Gv * 0x4000));
+ const UShort4 bU = UShort4(iround(Bu * 0x4000));
+
+ const UShort4 r0 = UShort4(iround(-R0 * 0x4000));
+ const UShort4 g0 = UShort4(iround(G0 * 0x4000));
+ const UShort4 b0 = UShort4(iround(-B0 * 0x4000));
+ // Negative terms are applied with a saturating subtract, so results do not wrap below zero.
+ UShort4 y = MulHigh(Y, yY);
+ UShort4 r = SubSat(y + MulHigh(V, rV), r0);
+ UShort4 g = SubSat(y + g0, MulHigh(U, gU) + MulHigh(V, gV));
+ UShort4 b = SubSat(y + MulHigh(U, bU), b0);
+ // Clamp to 14 bits, then shift up to fill the 16-bit lane. c.w is not written on this path.
+ c.x = Min(r, UShort4(0x3FFF)) << 2;
+ c.y = Min(g, UShort4(0x3FFF)) << 2;
+ c.z = Min(b, UShort4(0x3FFF)) << 2;
+ }
+ else
+ {
+ return sampleTexel(index, buffer); // All other formats use the index-based overload.
+ }
+
+ return c;
+ }
+
+ Vector4f SamplerCore::sampleTexel(Int4 &uuuu, Int4 &vvvv, Int4 &wwww, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function) // Reads four texels into float vectors; z is the reference value when a compare mode is active.
+ {
+ Vector4f c;
+
+ UInt index[4];
+ computeIndices(index, uuuu, vvvv, wwww, mipmap, function);
+
+ if(hasFloatTexture() || has32bitIntegerTextureComponents()) // 32-bit components are loaded directly as Float/Float4.
+ {
+ int f0 = state.textureType == TEXTURE_CUBE ? 0 : 0; // Cube textures read lane i from buffer[i]; others from buffer[0].
+ int f1 = state.textureType == TEXTURE_CUBE ? 1 : 0;
+ int f2 = state.textureType == TEXTURE_CUBE ? 2 : 0;
+ int f3 = state.textureType == TEXTURE_CUBE ? 3 : 0;
+
+ // Read texels
+ switch(textureComponentCount())
+ {
+ case 4: // 16-byte texels: load one per vector, then transpose to one component per vector.
+ c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
+ c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
+ c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
+ c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
+ transpose4x4(c.x, c.y, c.z, c.w);
+ break;
+ case 3: // Also addressed with a 16-byte stride.
+ c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
+ c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
+ c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
+ c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
+ transpose4x3(c.x, c.y, c.z, c.w);
+ break;
+ case 2: // 8-byte texels; the zw loads start 8 bytes early so the texel falls in the zw half of the Float4.
+ // FIXME: Optimal shuffling?
+ c.x.xy = *Pointer<Float4>(buffer[f0] + index[0] * 8);
+ c.x.zw = *Pointer<Float4>(buffer[f1] + index[1] * 8 - 8);
+ c.z.xy = *Pointer<Float4>(buffer[f2] + index[2] * 8);
+ c.z.zw = *Pointer<Float4>(buffer[f3] + index[3] * 8 - 8);
+ c.y = c.x;
+ c.x = Float4(c.x.xz, c.z.xz);
+ c.y = Float4(c.y.yw, c.z.yw);
+ break;
+ case 1: // 4-byte texels, one per lane of c.x.
+ // FIXME: Optimal shuffling?
+ c.x.x = *Pointer<Float>(buffer[f0] + index[0] * 4);
+ c.x.y = *Pointer<Float>(buffer[f1] + index[1] * 4);
+ c.x.z = *Pointer<Float>(buffer[f2] + index[2] * 4);
+ c.x.w = *Pointer<Float>(buffer[f3] + index[3] * 4);
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ if(state.compare != COMPARE_BYPASS) // Compare mode: replace the texel with the result of comparing z against c.x.
+ {
+ Float4 ref = z;
+
+ if(!hasFloatTexture()) // The reference is clamped to [0, 1] only for non-float formats.
+ {
+ ref = Min(Max(ref, Float4(0.0f)), Float4(1.0f));
+ }
+
+ Int4 boolean;
+
+ switch(state.compare)
+ {
+ case COMPARE_LESSEQUAL: boolean = CmpLE(ref, c.x); break;
+ case COMPARE_GREATEREQUAL: boolean = CmpNLT(ref, c.x); break;
+ case COMPARE_LESS: boolean = CmpLT(ref, c.x); break;
+ case COMPARE_GREATER: boolean = CmpNLE(ref, c.x); break;
+ case COMPARE_EQUAL: boolean = CmpEQ(ref, c.x); break;
+ case COMPARE_NOTEQUAL: boolean = CmpNEQ(ref, c.x); break;
+ case COMPARE_ALWAYS: boolean = Int4(-1); break;
+ case COMPARE_NEVER: boolean = Int4(0); break;
+ default: ASSERT(false);
+ }
+
+ c.x = As<Float4>(boolean & As<Int4>(Float4(1.0f))); // 1.0f where the mask is all ones, 0.0f where it is zero.
+ c.y = Float4(0.0f);
+ c.z = Float4(0.0f);
+ c.w = Float4(1.0f);
+ }
+ }
+ else // Narrower formats: read as 16-bit lanes, then widen per component.
+ {
+ ASSERT(!hasYuvFormat());
+
+ Vector4s cs = sampleTexel(index, buffer);
+
+ bool isInteger = Surface::isNonNormalizedInteger(state.textureFormat);
+ int componentCount = textureComponentCount();
+ for(int n = 0; n < componentCount; n++)
+ {
+ if(hasUnsignedTextureComponent(n))
+ {
+ if(isInteger)
+ {
+ c[n] = As<Float4>(Int4(As<UShort4>(cs[n]))); // Integer bits are stored in the float vector without conversion.
+ }
+ else
+ {
+ c[n] = Float4(As<UShort4>(cs[n])); // Numeric conversion to float; no normalization is applied here.
+ }
+ }
+ else
+ {
+ if(isInteger)
+ {
+ c[n] = As<Float4>(Int4(cs[n]));
+ }
+ else
+ {
+ c[n] = Float4(cs[n]);
+ }
+ }
+ }
+ }
+
+ return c;
+ }
+
+ void SamplerCore::selectMipmap(Pointer<Byte> &texture, Pointer<Byte> buffer[4], Pointer<Byte> &mipmap, Float &lod, Int face[4], bool secondLOD) // Points 'mipmap' at the level selected by lod and loads that level's buffer pointer(s).
+ {
+ if(state.mipmapFilter != MIPMAP_NONE)
+ {
+ Int level;
+
+ if(state.mipmapFilter != MIPMAP_POINT) // MIPMAP_LINEAR
+ {
+ level = Int(lod);
+ }
+ else
+ {
+ level = RoundInt(lod);
+ }
+
+ mipmap = texture + OFFSET(Texture,mipmap) + level * sizeof(Mipmap) + secondLOD * sizeof(Mipmap); // secondLOD steps one level further.
+ }
+ else
+ {
+ mipmap = texture + OFFSET(Texture,mipmap[0]); // Without mipmapping, level 0 is always used.
+ }
+
+ if(state.textureType == TEXTURE_CUBE)
+ {
+ for(int lane = 0; lane < 4; lane++) // Each lane reads the buffer of its own cube face.
+ {
+ buffer[lane] = *Pointer<Pointer<Byte> >(mipmap + OFFSET(Mipmap,buffer) + face[lane] * sizeof(void*));
+ }
+ }
+ else
+ {
+ buffer[0] = *Pointer<Pointer<Byte> >(mipmap + OFFSET(Mipmap,buffer[0]));
+
+ if(hasYuvFormat()) // YUV formats also read buffer[1] and buffer[2].
+ {
+ buffer[1] = *Pointer<Pointer<Byte> >(mipmap + OFFSET(Mipmap,buffer[1]));
+ buffer[2] = *Pointer<Pointer<Byte> >(mipmap + OFFSET(Mipmap,buffer[2]));
+ }
+ }
+ }
+
+ Int4 SamplerCore::computeFilterOffset(Float &lod) // Returns a per-lane mask: 0 selects point sampling, -1 (all ones) selects linear filtering.
+ {
+ Int4 linearMask = -1; // Default for filters not listed below.
+
+ switch(state.textureFilter)
+ {
+ case FILTER_POINT:
+ linearMask = 0;
+ break;
+ case FILTER_MIN_LINEAR_MAG_POINT: // Linear only when lod > 0.
+ linearMask = CmpNLE(Float4(lod), Float4(0.0f));
+ break;
+ case FILTER_MIN_POINT_MAG_LINEAR: // Linear only when lod <= 0.
+ linearMask = CmpLE(Float4(lod), Float4(0.0f));
+ break;
+ default: break;
+ }
+ return linearMask;
+ }
+
+ Short4 SamplerCore::address(Float4 &uw, AddressingMode addressingMode, Pointer<Byte> &mipmap) // Converts a float coordinate to a 16-bit coordinate according to the addressing mode.
+ {
+ if(addressingMode == ADDRESSING_LAYER && state.textureType != TEXTURE_2D_ARRAY)
+ {
+ return Short4(); // Unused
+ }
+ if(addressingMode == ADDRESSING_LAYER && state.textureType == TEXTURE_2D_ARRAY)
+ {
+ return Min(Max(Short4(RoundInt(uw)), Short4(0)), *Pointer<Short4>(mipmap + OFFSET(Mipmap, depth)) - Short4(1)); // Rounded layer, clamped to [0, depth - 1]
+ }
+ if(addressingMode == ADDRESSING_CLAMP || addressingMode == ADDRESSING_BORDER)
+ {
+ Float4 saturated = Min(Max(uw, Float4(0.0f)), Float4(65535.0f / 65536.0f));
+ // Scale the clamped coordinate by 2^16.
+ return Short4(Int4(saturated * Float4(1 << 16)));
+ }
+ if(addressingMode == ADDRESSING_MIRROR)
+ {
+ Int4 fixedPoint = Int4(uw * Float4(1 << 16));
+ Int4 flipMask = (fixedPoint << 15) >> 31; // Bit 16 (the lowest integer bit) replicated across the lane.
+
+ fixedPoint ^= flipMask; // Inverts the fraction on every other repetition.
+
+ return Short4(fixedPoint);
+ }
+ if(addressingMode == ADDRESSING_MIRRORONCE)
+ {
+ // Absolute value
+ Int4 magnitude = Int4(Abs(uw * Float4(1 << 16)));
+
+ // Clamp
+ magnitude -= Int4(0x00008000, 0x00008000, 0x00008000, 0x00008000);
+ magnitude = As<Int4>(PackSigned(magnitude, magnitude));
+
+ return As<Short4>(Int2(magnitude)) + Short4(0x8000u); // Undo the 0x8000 bias applied before packing.
+ }
+
+ // Wrap: the coordinate is scaled by 2^16 and narrowed to 16 bits
+ // (presumably by truncation, which makes it repeat - TODO confirm).
+ return Short4(Int4(uw * Float4(1 << 16)));
+ }
+
+ void SamplerCore::address(Float4 &uvw, Int4 &xyz0, Int4 &xyz1, Float4 &f, Pointer<Byte> &mipmap, Float4 &texOffset, Int4 &filter, int whd, AddressingMode addressingMode, SamplerFunction function) // Computes integer texel coordinates xyz0/xyz1 and the filter fraction f for one axis; whd is the byte offset of that axis' size in Mipmap.
+ {
+ if(addressingMode == ADDRESSING_LAYER && state.textureType != TEXTURE_2D_ARRAY)
+ {
+ return; // Unused
+ }
+
+ Int4 dim = Int4(*Pointer<Short4>(mipmap + whd, 16));
+ Int4 maxXYZ = dim - Int4(1);
+
+ if(function == Fetch) // Fetch: uvw already holds integer bits; add the offset (never for layers) and clamp. xyz1 and f are not written.
+ {
+ xyz0 = Min(Max(((function.option == Offset) && (addressingMode != ADDRESSING_LAYER)) ? As<Int4>(uvw) + As<Int4>(texOffset) : As<Int4>(uvw), Int4(0)), maxXYZ);
+ }
+ else if(addressingMode == ADDRESSING_LAYER && state.textureType == TEXTURE_2D_ARRAY) // Note: Offset does not apply to array layers
+ {
+ xyz0 = Min(Max(RoundInt(uvw), Int4(0)), maxXYZ);
+ }
+ else
+ {
+ const int halfBits = 0x3EFFFFFF; // Value just under 0.5f
+ const int oneBits = 0x3F7FFFFF; // Value just under 1.0f
+ const int twoBits = 0x3FFFFFFF; // Value just under 2.0f
+
+ bool pointFilter = state.textureFilter == FILTER_POINT ||
+ state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR ||
+ state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT;
+
+ Float4 coord = uvw;
+
+ if(state.textureType == TEXTURE_RECTANGLE) // No scaling by dim in this branch; the clamp is applied in texel units.
+ {
+ // According to https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_rectangle.txt
+ // "CLAMP_TO_EDGE causes the s coordinate to be clamped to the range[0.5, wt - 0.5].
+ // CLAMP_TO_EDGE causes the t coordinate to be clamped to the range[0.5, ht - 0.5]."
+ // Unless SwiftShader implements support for ADDRESSING_BORDER, other modes should be equivalent
+ // to CLAMP_TO_EDGE. Rectangle textures have no support for any MIRROR or REPEAT modes.
+ coord = Min(Max(coord, Float4(0.5f)), Float4(dim) - Float4(0.5f));
+ }
+ else
+ {
+ switch(addressingMode)
+ {
+ case ADDRESSING_CLAMP:
+ case ADDRESSING_BORDER:
+ case ADDRESSING_SEAMLESS:
+ // Linear filtering of cube doesn't require clamping because the coordinates
+ // are already in [0, 1] range and numerical imprecision is tolerated.
+ if(addressingMode != ADDRESSING_SEAMLESS || pointFilter)
+ {
+ Float4 one = As<Float4>(Int4(oneBits));
+ coord = Min(Max(coord, Float4(0.0f)), one);
+ }
+ break;
+ case ADDRESSING_MIRROR: // Triangle wave with period 2, built from the just-under constants above.
+ {
+ Float4 half = As<Float4>(Int4(halfBits));
+ Float4 one = As<Float4>(Int4(oneBits));
+ Float4 two = As<Float4>(Int4(twoBits));
+ coord = one - Abs(two * Frac(coord * half) - one);
+ }
+ break;
+ case ADDRESSING_MIRRORONCE: // Same triangle wave, with the input first clamped to [-one, two].
+ {
+ Float4 half = As<Float4>(Int4(halfBits));
+ Float4 one = As<Float4>(Int4(oneBits));
+ Float4 two = As<Float4>(Int4(twoBits));
+ coord = one - Abs(two * Frac(Min(Max(coord, -one), two) * half) - one);
+ }
+ break;
+ default: // Wrap
+ coord = Frac(coord);
+ break;
+ }
+
+ coord = coord * Float4(dim); // Scale the normalized coordinate to texel units.
+ }
+
+ if(state.textureFilter == FILTER_POINT ||
+ state.textureFilter == FILTER_GATHER)
+ {
+ xyz0 = Int4(coord); // f is not written in this case.
+ }
+ else
+ {
+ if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR ||
+ state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
+ {
+ coord -= As<Float4>(As<Int4>(Float4(0.5f)) & filter); // Subtract 0.5 only in lanes where the filter mask is set.
+ }
+ else
+ {
+ coord -= Float4(0.5f);
+ }
+
+ Float4 floor = Floor(coord);
+ xyz0 = Int4(floor);
+ f = coord - floor; // Fractional part, returned as the filter weight.
+ }
+
+ if(function.option == Offset)
+ {
+ xyz0 += As<Int4>(texOffset);
+ }
+
+ if(addressingMode == ADDRESSING_SEAMLESS)
+ {
+ xyz0 += Int4(1); // NOTE(review): presumably skips a one-texel border around each cube face - confirm.
+ }
+
+ xyz1 = xyz0 - filter; // Increment
+
+ if(function.option == Offset) // With an offset, both coordinates can leave the range, so both are fully re-addressed.
+ {
+ switch(addressingMode)
+ {
+ case ADDRESSING_SEAMLESS:
+ ASSERT(false); // Cube sampling doesn't support offset.
+ case ADDRESSING_MIRROR:
+ case ADDRESSING_MIRRORONCE:
+ case ADDRESSING_BORDER:
+ // FIXME: Implement ADDRESSING_MIRROR, ADDRESSING_MIRRORONCE, and ADDRESSING_BORDER.
+ // Fall through to Clamp.
+ case ADDRESSING_CLAMP:
+ xyz0 = Min(Max(xyz0, Int4(0)), maxXYZ);
+ xyz1 = Min(Max(xyz1, Int4(0)), maxXYZ);
+ break;
+ default: // Wrap
+ xyz0 = (xyz0 + dim * Int4(-MIN_PROGRAM_TEXEL_OFFSET)) % dim;
+ xyz1 = (xyz1 + dim * Int4(-MIN_PROGRAM_TEXEL_OFFSET)) % dim;
+ break;
+ }
+ }
+ else if(state.textureFilter != FILTER_POINT) // Without an offset, only the lower bound of xyz0 and the upper bound of xyz1 are corrected.
+ {
+ switch(addressingMode)
+ {
+ case ADDRESSING_SEAMLESS:
+ break;
+ case ADDRESSING_MIRROR:
+ case ADDRESSING_MIRRORONCE:
+ case ADDRESSING_BORDER:
+ case ADDRESSING_CLAMP:
+ xyz0 = Max(xyz0, Int4(0));
+ xyz1 = Min(xyz1, maxXYZ);
+ break;
+ default: // Wrap
+ {
+ Int4 under = CmpLT(xyz0, Int4(0));
+ xyz0 = (under & maxXYZ) | (~under & xyz0); // xyz < 0 ? dim - 1 : xyz // FIXME: IfThenElse()
+
+ Int4 nover = CmpLT(xyz1, dim);
+ xyz1 = nover & xyz1; // xyz >= dim ? 0 : xyz
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ void SamplerCore::convertFixed12(Short4 &cs, Float4 &cf) // Scales a float vector by 0x1000 and rounds it to 16-bit lanes.
+ {
+ cs = RoundShort4(Float4(0x1000) * cf);
+ }
+
+ void SamplerCore::convertFixed12(Vector4s &cs, Vector4f &cf) // Applies the scalar convertFixed12 to each of the four components.
+ {
+ for(int component = 0; component < 4; component++)
+ {
+ convertFixed12(cs[component], cf[component]);
+ }
+ }
+
+ void SamplerCore::convertSigned12(Float4 &cf, Short4 &cs) // Converts signed 16-bit lanes to float, scaled by 1 / 0x0FFE.
+ {
+ cf = Float4(1.0f / 0x0FFE) * Float4(cs);
+ }
+
+// void SamplerCore::convertSigned12(Vector4f &cf, Vector4s &cs)
+// {
+// convertSigned12(cf.x, cs.x);
+// convertSigned12(cf.y, cs.y);
+// convertSigned12(cf.z, cs.z);
+// convertSigned12(cf.w, cs.w);
+// }
+
+ void SamplerCore::convertSigned15(Float4 &cf, Short4 &cs) // Converts signed 16-bit lanes to float, scaled by 1 / 0x7FFF.
+ {
+ cf = Float4(1.0f / 0x7FFF) * Float4(cs);
+ }
+
+ void SamplerCore::convertUnsigned16(Float4 &cf, Short4 &cs) // Converts the lanes, read as unsigned 16-bit, to float scaled by 1 / 0xFFFF.
+ {
+ cf = Float4(1.0f / 0xFFFF) * Float4(As<UShort4>(cs));
+ }
+
+ void SamplerCore::sRGBtoLinear16_8_16(Short4 &c) // Replaces each lane with the sRGBtoLinear8_16 table entry selected by its top 8 bits.
+ {
+ c = As<UShort4>(c) >> 8; // Unsigned shift leaves an 8-bit table index in each lane.
+
+ Pointer<Byte> table = Pointer<Byte>(constants + OFFSET(Constants,sRGBtoLinear8_16));
+
+ for(int lane = 0; lane < 4; lane++)
+ {
+ c = Insert(c, *Pointer<Short>(table + 2 * Int(Extract(c, lane))), lane); // Table entries are 2 bytes apart.
+ }
+ }
+
+ void SamplerCore::sRGBtoLinear16_6_16(Short4 &c) // Replaces each lane with the sRGBtoLinear6_16 table entry selected by its top 6 bits.
+ {
+ c = As<UShort4>(c) >> 10; // Unsigned shift leaves a 6-bit table index in each lane.
+
+ Pointer<Byte> table = Pointer<Byte>(constants + OFFSET(Constants,sRGBtoLinear6_16));
+
+ for(int lane = 0; lane < 4; lane++)
+ {
+ c = Insert(c, *Pointer<Short>(table + 2 * Int(Extract(c, lane))), lane); // Table entries are 2 bytes apart.
+ }
+ }
+
+ void SamplerCore::sRGBtoLinear16_5_16(Short4 &c) // Replaces each lane with the sRGBtoLinear5_16 table entry selected by its top 5 bits.
+ {
+ c = As<UShort4>(c) >> 11; // Unsigned shift leaves a 5-bit table index in each lane.
+
+ Pointer<Byte> table = Pointer<Byte>(constants + OFFSET(Constants,sRGBtoLinear5_16));
+
+ for(int lane = 0; lane < 4; lane++)
+ {
+ c = Insert(c, *Pointer<Short>(table + 2 * Int(Extract(c, lane))), lane); // Table entries are 2 bytes apart.
+ }
+ }
+
+ bool SamplerCore::hasFloatTexture() const // Forwards the sampler's texture format to Surface::isFloatFormat.
+ {
+ return Surface::isFloatFormat(this->state.textureFormat);
+ }
+
+ bool SamplerCore::hasUnnormalizedIntegerTexture() const // Forwards the sampler's texture format to Surface::isNonNormalizedInteger.
+ {
+ return Surface::isNonNormalizedInteger(this->state.textureFormat);
+ }
+
+ bool SamplerCore::hasUnsignedTextureComponent(int component) const // Asks Surface whether the given component of the texture format is unsigned.
+ {
+ return Surface::isUnsignedComponent(this->state.textureFormat, component);
+ }
+
+ int SamplerCore::textureComponentCount() const // Returns Surface::componentCount for the sampler's texture format.
+ {
+ return Surface::componentCount(this->state.textureFormat);
+ }
+
+ bool SamplerCore::hasThirdCoordinate() const // True for 3D and 2D array textures.
+ {
+ return (state.textureType == TEXTURE_2D_ARRAY) || (state.textureType == TEXTURE_3D);
+ }
+
+ bool SamplerCore::has16bitTextureFormat() const // True only for FORMAT_R5G6B5; formats missing from the list assert.
+ {
+ switch(state.textureFormat)
+ {
+ case FORMAT_A8: // 8-bit components
+ case FORMAT_R8:
+ case FORMAT_L8:
+ case FORMAT_A8L8:
+ case FORMAT_G8R8:
+ case FORMAT_X8R8G8B8:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ case FORMAT_V8U8:
+ case FORMAT_Q8W8V8U8:
+ case FORMAT_X8L8V8U8:
+ case FORMAT_R8_SNORM:
+ case FORMAT_G8R8_SNORM:
+ case FORMAT_X8B8G8R8_SNORM:
+ case FORMAT_A8B8G8R8_SNORM:
+ case FORMAT_R8I:
+ case FORMAT_R8UI:
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8UI:
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X8B8G8R8UI:
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8UI:
+ case FORMAT_L16: // 16-bit components
+ case FORMAT_G16R16:
+ case FORMAT_A16B16G16R16:
+ case FORMAT_V16U16:
+ case FORMAT_A16W16V16U16:
+ case FORMAT_Q16W16V16U16:
+ case FORMAT_R16I:
+ case FORMAT_R16UI:
+ case FORMAT_G16R16I:
+ case FORMAT_G16R16UI:
+ case FORMAT_X16B16G16R16I:
+ case FORMAT_X16B16G16R16UI:
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_R32I: // 32-bit integer components
+ case FORMAT_R32UI:
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ case FORMAT_X32B32G32R32I:
+ case FORMAT_X32B32G32R32UI:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ case FORMAT_R32F: // 32-bit float and depth
+ case FORMAT_G32R32F:
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_A32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_D32F_LOCKABLE:
+ case FORMAT_D32FS8_TEXTURE:
+ case FORMAT_D32F_SHADOW:
+ case FORMAT_D32FS8_SHADOW:
+ case FORMAT_YV12_BT601: // YUV
+ case FORMAT_YV12_BT709:
+ case FORMAT_YV12_JFIF:
+ return false;
+ case FORMAT_R5G6B5:
+ return true;
+ default:
+ ASSERT(false);
+ }
+
+ return false;
+ }
+
+ bool SamplerCore::has8bitTextureComponents() const // True for the listed formats with 8-bit components; formats missing from the list assert.
+ {
+ switch(state.textureFormat)
+ {
+ case FORMAT_R5G6B5: // Packed 16-bit
+ case FORMAT_L16: // 16-bit components
+ case FORMAT_G16R16:
+ case FORMAT_A16B16G16R16:
+ case FORMAT_V16U16:
+ case FORMAT_A16W16V16U16:
+ case FORMAT_Q16W16V16U16:
+ case FORMAT_R16I:
+ case FORMAT_R16UI:
+ case FORMAT_G16R16I:
+ case FORMAT_G16R16UI:
+ case FORMAT_X16B16G16R16I:
+ case FORMAT_X16B16G16R16UI:
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_R32I: // 32-bit integer components
+ case FORMAT_R32UI:
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ case FORMAT_X32B32G32R32I:
+ case FORMAT_X32B32G32R32UI:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ case FORMAT_R32F: // 32-bit float and depth
+ case FORMAT_G32R32F:
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_A32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_D32F_LOCKABLE:
+ case FORMAT_D32FS8_TEXTURE:
+ case FORMAT_D32F_SHADOW:
+ case FORMAT_D32FS8_SHADOW:
+ case FORMAT_YV12_BT601: // YUV
+ case FORMAT_YV12_BT709:
+ case FORMAT_YV12_JFIF:
+ return false;
+ case FORMAT_A8: // 8-bit components
+ case FORMAT_R8:
+ case FORMAT_L8:
+ case FORMAT_A8L8:
+ case FORMAT_G8R8:
+ case FORMAT_X8R8G8B8:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ case FORMAT_V8U8:
+ case FORMAT_Q8W8V8U8:
+ case FORMAT_X8L8V8U8:
+ case FORMAT_R8_SNORM:
+ case FORMAT_G8R8_SNORM:
+ case FORMAT_X8B8G8R8_SNORM:
+ case FORMAT_A8B8G8R8_SNORM:
+ case FORMAT_R8I:
+ case FORMAT_R8UI:
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8UI:
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X8B8G8R8UI:
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8UI:
+ return true;
+ default:
+ ASSERT(false);
+ }
+
+ return false;
+ }
+
+ bool SamplerCore::has16bitTextureComponents() const // True for the listed formats with 16-bit components; formats missing from the list assert.
+ {
+ switch(state.textureFormat)
+ {
+ case FORMAT_L16: // 16-bit components
+ case FORMAT_G16R16:
+ case FORMAT_A16B16G16R16:
+ case FORMAT_V16U16:
+ case FORMAT_A16W16V16U16:
+ case FORMAT_Q16W16V16U16:
+ case FORMAT_R16I:
+ case FORMAT_R16UI:
+ case FORMAT_G16R16I:
+ case FORMAT_G16R16UI:
+ case FORMAT_X16B16G16R16I:
+ case FORMAT_X16B16G16R16UI:
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_A16B16G16R16UI:
+ return true;
+ case FORMAT_R5G6B5: // Packed 16-bit
+ case FORMAT_A8: // 8-bit components
+ case FORMAT_R8:
+ case FORMAT_L8:
+ case FORMAT_A8L8:
+ case FORMAT_G8R8:
+ case FORMAT_X8R8G8B8:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ case FORMAT_V8U8:
+ case FORMAT_Q8W8V8U8:
+ case FORMAT_X8L8V8U8:
+ case FORMAT_R8_SNORM:
+ case FORMAT_G8R8_SNORM:
+ case FORMAT_X8B8G8R8_SNORM:
+ case FORMAT_A8B8G8R8_SNORM:
+ case FORMAT_R8I:
+ case FORMAT_R8UI:
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8UI:
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X8B8G8R8UI:
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8UI:
+ case FORMAT_R32I: // 32-bit integer components
+ case FORMAT_R32UI:
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ case FORMAT_X32B32G32R32I:
+ case FORMAT_X32B32G32R32UI:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ case FORMAT_R32F: // 32-bit float and depth
+ case FORMAT_G32R32F:
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_A32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_D32F_LOCKABLE:
+ case FORMAT_D32FS8_TEXTURE:
+ case FORMAT_D32F_SHADOW:
+ case FORMAT_D32FS8_SHADOW:
+ case FORMAT_YV12_BT601: // YUV
+ case FORMAT_YV12_BT709:
+ case FORMAT_YV12_JFIF:
+ return false;
+ default:
+ ASSERT(false);
+ }
+
+ return false;
+ }
+
+ bool SamplerCore::has32bitIntegerTextureComponents() const // True for the listed 32-bit integer formats; formats missing from the list assert.
+ {
+ switch(state.textureFormat)
+ {
+ case FORMAT_R32I: // 32-bit integer components
+ case FORMAT_R32UI:
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ case FORMAT_X32B32G32R32I:
+ case FORMAT_X32B32G32R32UI:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ return true;
+ case FORMAT_R5G6B5: // Packed 16-bit
+ case FORMAT_A8: // 8-bit components
+ case FORMAT_R8:
+ case FORMAT_L8:
+ case FORMAT_A8L8:
+ case FORMAT_G8R8:
+ case FORMAT_X8R8G8B8:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ case FORMAT_V8U8:
+ case FORMAT_Q8W8V8U8:
+ case FORMAT_X8L8V8U8:
+ case FORMAT_R8_SNORM:
+ case FORMAT_G8R8_SNORM:
+ case FORMAT_X8B8G8R8_SNORM:
+ case FORMAT_A8B8G8R8_SNORM:
+ case FORMAT_R8I:
+ case FORMAT_R8UI:
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8UI:
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X8B8G8R8UI:
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8UI:
+ case FORMAT_L16: // 16-bit components
+ case FORMAT_G16R16:
+ case FORMAT_A16B16G16R16:
+ case FORMAT_V16U16:
+ case FORMAT_A16W16V16U16:
+ case FORMAT_Q16W16V16U16:
+ case FORMAT_R16I:
+ case FORMAT_R16UI:
+ case FORMAT_G16R16I:
+ case FORMAT_G16R16UI:
+ case FORMAT_X16B16G16R16I:
+ case FORMAT_X16B16G16R16UI:
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_R32F: // 32-bit float and depth
+ case FORMAT_G32R32F:
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_A32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_D32F_LOCKABLE:
+ case FORMAT_D32FS8_TEXTURE:
+ case FORMAT_D32F_SHADOW:
+ case FORMAT_D32FS8_SHADOW:
+ case FORMAT_YV12_BT601: // YUV
+ case FORMAT_YV12_BT709:
+ case FORMAT_YV12_JFIF:
+ return false;
+ default:
+ ASSERT(false);
+ }
+
+ return false;
+ }
+
+ bool SamplerCore::hasYuvFormat() const
+ {
+ switch(state.textureFormat)
+ {
+ // Every known format other than the three YV12 variants.
+ case FORMAT_R5G6B5:
+ case FORMAT_R8_SNORM:
+ case FORMAT_G8R8_SNORM:
+ case FORMAT_X8B8G8R8_SNORM:
+ case FORMAT_A8B8G8R8_SNORM:
+ case FORMAT_R8I:
+ case FORMAT_R8UI:
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8UI:
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X8B8G8R8UI:
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8UI:
+ case FORMAT_G8R8:
+ case FORMAT_X8R8G8B8:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ case FORMAT_V8U8:
+ case FORMAT_Q8W8V8U8:
+ case FORMAT_X8L8V8U8:
+ case FORMAT_A8:
+ case FORMAT_R8:
+ case FORMAT_L8:
+ case FORMAT_A8L8:
+ case FORMAT_L16:
+ case FORMAT_G16R16:
+ case FORMAT_A16B16G16R16:
+ case FORMAT_R16I:
+ case FORMAT_R16UI:
+ case FORMAT_G16R16I:
+ case FORMAT_G16R16UI:
+ case FORMAT_X16B16G16R16I:
+ case FORMAT_X16B16G16R16UI:
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_V16U16:
+ case FORMAT_A16W16V16U16:
+ case FORMAT_Q16W16V16U16:
+ case FORMAT_R32I:
+ case FORMAT_R32UI:
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ case FORMAT_X32B32G32R32I:
+ case FORMAT_X32B32G32R32UI:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ case FORMAT_R32F:
+ case FORMAT_G32R32F:
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_A32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_D32F_LOCKABLE:
+ case FORMAT_D32FS8_TEXTURE:
+ case FORMAT_D32F_SHADOW:
+ case FORMAT_D32FS8_SHADOW:
+ return false;
+ // The YV12 variants are the only formats reported as YUV.
+ case FORMAT_YV12_BT601: case FORMAT_YV12_BT709: case FORMAT_YV12_JFIF:
+ return true;
+ default: // Format missing from both lists above.
+ ASSERT(false);
+ }
+
+ return false; // Only reached for unlisted formats.
+ }
+
+ bool SamplerCore::isRGBComponent(int component) const
+ {
+ switch(state.textureFormat)
+ {
+ // Formats for which only component 0 (or lower) counts as RGB.
+ case FORMAT_R8_SNORM:
+ case FORMAT_R8I:
+ case FORMAT_R8UI:
+ case FORMAT_R32I:
+ case FORMAT_R32UI:
+ case FORMAT_R32F:
+ case FORMAT_R8:
+ case FORMAT_L8:
+ case FORMAT_A8L8:
+ case FORMAT_L16:
+ case FORMAT_R16I:
+ case FORMAT_R16UI:
+ return component < 1;
+ // Formats for which components below 2 count as RGB.
+ case FORMAT_G8R8_SNORM:
+ case FORMAT_G8R8I:
+ case FORMAT_G8R8UI:
+ case FORMAT_G32R32I:
+ case FORMAT_G32R32UI:
+ case FORMAT_G8R8:
+ case FORMAT_G32R32F:
+ case FORMAT_G16R16:
+ case FORMAT_G16R16I:
+ case FORMAT_G16R16UI:
+ return component < 2;
+ // Formats for which components below 3 count as RGB.
+ case FORMAT_R5G6B5:
+ case FORMAT_X8B8G8R8_SNORM:
+ case FORMAT_A8B8G8R8_SNORM:
+ case FORMAT_X8B8G8R8I:
+ case FORMAT_X8B8G8R8UI:
+ case FORMAT_A8B8G8R8I:
+ case FORMAT_A8B8G8R8UI:
+ case FORMAT_X32B32G32R32I:
+ case FORMAT_X32B32G32R32UI:
+ case FORMAT_A32B32G32R32I:
+ case FORMAT_A32B32G32R32UI:
+ case FORMAT_X8R8G8B8:
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8R8G8B8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ case FORMAT_X32B32G32R32F:
+ case FORMAT_A32B32G32R32F:
+ case FORMAT_X32B32G32R32F_UNSIGNED:
+ case FORMAT_A16B16G16R16:
+ case FORMAT_X16B16G16R16I:
+ case FORMAT_X16B16G16R16UI:
+ case FORMAT_A16B16G16R16I:
+ case FORMAT_A16B16G16R16UI:
+ case FORMAT_YV12_BT601: case FORMAT_YV12_BT709: case FORMAT_YV12_JFIF:
+ return component < 3;
+ // Formats with no RGB component at all, whatever the index.
+ case FORMAT_V8U8: case FORMAT_Q8W8V8U8: case FORMAT_X8L8V8U8:
+ case FORMAT_A8:
+ case FORMAT_D32F_LOCKABLE: case FORMAT_D32FS8_TEXTURE:
+ case FORMAT_D32F_SHADOW: case FORMAT_D32FS8_SHADOW:
+ case FORMAT_V16U16: case FORMAT_A16W16V16U16: case FORMAT_Q16W16V16U16:
+ return false;
+ default: // Format missing from every list above.
+ ASSERT(false);
+ }
+
+ return false; // Only reached for unlisted formats.
+ }
+}
diff --git a/src/Pipeline/SamplerCore.hpp b/src/Pipeline/SamplerCore.hpp
new file mode 100644
index 0000000..684c1a7
--- /dev/null
+++ b/src/Pipeline/SamplerCore.hpp
@@ -0,0 +1,116 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_SamplerCore_hpp
+#define sw_SamplerCore_hpp
+
+#include "PixelRoutine.hpp"
+#include "Reactor/Reactor.hpp"
+
+namespace sw
+{
+ enum SamplerMethod // Selects where a sample's gradients, LOD or coordinates come from (see the per-value notes).
+ {
+ Implicit, // Compute gradients (pixel shader only).
+ Bias, // Compute gradients and add provided bias.
+ Lod, // Use provided LOD.
+ Grad, // Use provided gradients.
+ Fetch, // Use provided integer coordinates.
+ Base // Sample base level.
+ };
+
+ enum SamplerOption // Optional modifier stored next to a SamplerMethod in SamplerFunction.
+ {
+ None, // No modifier; this is the default argument of SamplerFunction's constructor.
+ Offset // Offset sample location by provided integer coordinates.
+ };
+
+ struct SamplerFunction // Immutable (method, option) pair describing how a sample is taken.
+ {
+ SamplerFunction(SamplerMethod method, SamplerOption option = None) : method(method), option(option) {} // Non-explicit, so a bare SamplerMethod converts implicitly.
+ operator SamplerMethod() const { return method; } // const-qualified so that const SamplerFunction objects convert as well.
+
+ const SamplerMethod method; // Both members are const: instances can be copy-constructed but not assigned.
+ const SamplerOption option;
+ };
+
+ class SamplerCore // Texture sampling helper bound to a constants pointer and a Sampler::State, both held by reference.
+ {
+ public:
+ SamplerCore(Pointer<Byte> &constants, const Sampler::State &state);
+
+ Vector4s sampleTexture(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &bias, Vector4f &dsx, Vector4f &dsy); // Overload returning Vector4s; takes no offset or SamplerFunction.
+ Vector4f sampleTexture(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function); // Overload returning Vector4f, with an explicit offset and SamplerFunction.
+ static Vector4f textureSize(Pointer<Byte> &mipmap, Float4 &lod); // Static: needs no SamplerCore instance.
+
+ private:
+ Vector4s sampleTexture(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function, bool fixed12); // Private overload with an additional fixed12 flag.
+
+ void border(Short4 &mask, Float4 &coordinates);
+ void border(Int4 &mask, Float4 &coordinates);
+ Short4 offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod);
+ Vector4s sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], SamplerFunction function);
+ Vector4s sampleAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, SamplerFunction function);
+ Vector4s sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function);
+ Vector4s sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function);
+ Vector4s sample3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, bool secondLOD, SamplerFunction function);
+ Vector4f sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], SamplerFunction function);
+ Vector4f sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, SamplerFunction function);
+ Vector4f sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function);
+ Vector4f sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function);
+ Vector4f sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, bool secondLOD, SamplerFunction function);
+ Float log2sqrt(Float lod);
+ Float log2(Float lod);
+ void computeLod(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &u, Float4 &v, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, SamplerFunction function);
+ void computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, Float4 &M, SamplerFunction function);
+ void computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, SamplerFunction function);
+ void cubeFace(Int face[4], Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M);
+ Short4 applyOffset(Short4 &uvw, Float4 &offset, const Int4 &whd, AddressingMode mode);
+ void computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, Vector4f &offset, const Pointer<Byte> &mipmap, SamplerFunction function);
+ void computeIndices(UInt index[4], Int4& uuuu, Int4& vvvv, Int4& wwww, const Pointer<Byte> &mipmap, SamplerFunction function);
+ Vector4s sampleTexel(Short4 &u, Short4 &v, Short4 &s, Vector4f &offset, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function);
+ Vector4s sampleTexel(UInt index[4], Pointer<Byte> buffer[4]);
+ Vector4f sampleTexel(Int4 &u, Int4 &v, Int4 &s, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function);
+ void selectMipmap(Pointer<Byte> &texture, Pointer<Byte> buffer[4], Pointer<Byte> &mipmap, Float &lod, Int face[4], bool secondLOD);
+ Short4 address(Float4 &uw, AddressingMode addressingMode, Pointer<Byte>& mipmap);
+ void address(Float4 &uw, Int4& xyz0, Int4& xyz1, Float4& f, Pointer<Byte>& mipmap, Float4 &texOffset, Int4 &filter, int whd, AddressingMode addressingMode, SamplerFunction function);
+ Int4 computeFilterOffset(Float &lod);
+
+ void convertFixed12(Short4 &ci, Float4 &cf);
+ void convertFixed12(Vector4s &cs, Vector4f &cf);
+ void convertSigned12(Float4 &cf, Short4 &ci);
+ void convertSigned15(Float4 &cf, Short4 &ci);
+ void convertUnsigned16(Float4 &cf, Short4 &ci);
+ void sRGBtoLinear16_8_16(Short4 &c);
+ void sRGBtoLinear16_6_16(Short4 &c);
+ void sRGBtoLinear16_5_16(Short4 &c);
+
+ bool hasFloatTexture() const; // First of a group of const queries taking no Reactor arguments.
+ bool hasUnnormalizedIntegerTexture() const;
+ bool hasUnsignedTextureComponent(int component) const;
+ int textureComponentCount() const;
+ bool hasThirdCoordinate() const;
+ bool has16bitTextureFormat() const;
+ bool has8bitTextureComponents() const;
+ bool has16bitTextureComponents() const;
+ bool has32bitIntegerTextureComponents() const; // Implemented as a switch over state.textureFormat.
+ bool hasYuvFormat() const; // Implemented as a switch over state.textureFormat.
+ bool isRGBComponent(int component) const; // Implemented as a switch over state.textureFormat.
+
+ Pointer<Byte> &constants; // Reference member: the referenced pointer must outlive this object.
+ const Sampler::State &state; // Reference member: the referenced state must outlive this object.
+ };
+}
+
+#endif // sw_SamplerCore_hpp
diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
new file mode 100644
index 0000000..d733c2d
--- /dev/null
+++ b/src/Pipeline/SetupRoutine.cpp
@@ -0,0 +1,669 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "SetupRoutine.hpp"
+
+#include "Constants.hpp"
+#include "Renderer/Primitive.hpp"
+#include "Renderer/Polygon.hpp"
+#include "Renderer/Renderer.hpp"
+#include "Reactor/Reactor.hpp"
+
+namespace sw
+{
+ extern bool complementaryDepthBuffer; // Declared here, defined in another translation unit; when set, generate() negates the slope depth bias.
+ extern TranscendentalPrecision logPrecision; // Declared here, defined elsewhere; generate() sorts vertices by minimum y only when this is >= WHQL.
+ extern bool leadingVertexFirst; // Declared here, defined elsewhere; setupGradient() takes flat attributes from v0 when true, from v2 otherwise.
+
+ SetupRoutine::SetupRoutine(const SetupProcessor::State &state)
+ : state(state), routine(0) // The state is kept by reference; no routine exists until generate() assigns one.
+ {
+ }
+
+ SetupRoutine::~SetupRoutine() // Empty: 'routine' is not deleted here. NOTE(review): presumably the caller of getRoutine() owns it - confirm.
+ {
+ }
+
+ void SetupRoutine::generate() // Builds the Reactor setup function and stores the result in 'routine'. Plain C++ if/for on 'state' are evaluated while building; Reactor If/For/Do become part of the built function.
+ {
+ Function<Bool(Pointer<Byte>, Pointer<Byte>, Pointer<Byte>, Pointer<Byte>)> function; // Arguments, in order: primitive, tri, polygon, data.
+ {
+ Pointer<Byte> primitive(function.Arg<0>());
+ Pointer<Byte> tri(function.Arg<1>());
+ Pointer<Byte> polygon(function.Arg<2>());
+ Pointer<Byte> data(function.Arg<3>());
+
+ Pointer<Byte> constants = *Pointer<Pointer<Byte> >(data + OFFSET(DrawData,constants));
+
+ const bool point = state.isDrawPoint;
+ const bool sprite = state.pointSprite;
+ const bool line = state.isDrawLine;
+ const bool triangle = state.isDrawSolidTriangle || sprite;
+ const bool solidTriangle = state.isDrawSolidTriangle;
+
+ const int V0 = OFFSET(Triangle,v0);
+ const int V1 = (triangle || line) ? OFFSET(Triangle,v1) : OFFSET(Triangle,v0);
+ const int V2 = triangle ? OFFSET(Triangle,v2) : (line ? OFFSET(Triangle,v1) : OFFSET(Triangle,v0)); // Lines reuse v1 and points reuse v0 for the vertices they lack.
+
+ int pos = state.positionRegister;
+
+ Pointer<Byte> v0 = tri + V0;
+ Pointer<Byte> v1 = tri + V1;
+ Pointer<Byte> v2 = tri + V2;
+
+ Array<Int> X(16);
+ Array<Int> Y(16);
+
+ X[0] = *Pointer<Int>(v0 + OFFSET(Vertex,X));
+ X[1] = *Pointer<Int>(v1 + OFFSET(Vertex,X));
+ X[2] = *Pointer<Int>(v2 + OFFSET(Vertex,X));
+
+ Y[0] = *Pointer<Int>(v0 + OFFSET(Vertex,Y));
+ Y[1] = *Pointer<Int>(v1 + OFFSET(Vertex,Y));
+ Y[2] = *Pointer<Int>(v2 + OFFSET(Vertex,Y));
+
+ Int d = 1; // Winding direction
+
+ // Culling
+ if(solidTriangle)
+ {
+ Float x0 = Float(X[0]);
+ Float x1 = Float(X[1]);
+ Float x2 = Float(X[2]);
+
+ Float y0 = Float(Y[0]);
+ Float y1 = Float(Y[1]);
+ Float y2 = Float(Y[2]);
+
+ Float A = (y2 - y0) * x1 + (y1 - y2) * x0 + (y0 - y1) * x2; // Area
+
+ If(A == 0.0f) // Zero area: reject the primitive.
+ {
+ Return(false);
+ }
+
+ Int w0w1w2 = *Pointer<Int>(v0 + pos * 16 + 12) ^ // XOR of the three w bit patterns (the same offset is read as Float w further down);
+ *Pointer<Int>(v1 + pos * 16 + 12) ^ // the result is negative when an odd number of them
+ *Pointer<Int>(v2 + pos * 16 + 12); // have their sign bit set.
+
+ A = IfThenElse(w0w1w2 < 0, -A, A); // Flip the sign of the area in that case.
+
+ if(state.cullMode == CULL_CLOCKWISE)
+ {
+ If(A >= 0.0f) Return(false);
+ }
+ else if(state.cullMode == CULL_COUNTERCLOCKWISE)
+ {
+ If(A <= 0.0f) Return(false);
+ }
+
+ d = IfThenElse(A < 0.0f, d, Int(0)); // d stays 1 when A < 0 and becomes 0 otherwise; it picks the edge direction in the rasterize loop.
+
+ if(state.twoSidedStencil)
+ {
+ If(A > 0.0f)
+ {
+ *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ }
+ Else
+ {
+ *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ }
+ }
+
+ if(state.vFace)
+ {
+ *Pointer<Float>(primitive + OFFSET(Primitive,area)) = 0.5f * A;
+ }
+ }
+ else
+ {
+ if(state.twoSidedStencil) // Anything that is not a solid triangle always gets the clockwise mask.
+ {
+ *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+ }
+ }
+
+ Int n = *Pointer<Int>(polygon + OFFSET(Polygon,n));
+ Int m = *Pointer<Int>(polygon + OFFSET(Polygon,i));
+
+ If(m != 0 || Bool(!solidTriangle)) // Clipped triangle; reproject
+ {
+ Pointer<Byte> V = polygon + OFFSET(Polygon,P) + m * sizeof(void*) * 16; // Entry m of Polygon::P, addressed as a block of 16 pointers.
+
+ Int i = 0;
+
+ Do
+ {
+ Pointer<Float4> p = *Pointer<Pointer<Float4> >(V + i * sizeof(void*));
+ Float4 v = *Pointer<Float4>(p, 16);
+
+ Float w = v.w;
+ Float rhw = IfThenElse(w != 0.0f, 1.0f / w, Float(1.0f)); // Avoids dividing by zero.
+
+ X[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float>(data + OFFSET(DrawData,Wx16)));
+ Y[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float>(data + OFFSET(DrawData,Hx16)));
+
+ i++;
+ }
+ Until(i >= n)
+ }
+
+ // Vertical range
+ Int yMin = Y[0];
+ Int yMax = Y[0];
+
+ Int i = 1;
+
+ Do
+ {
+ yMin = Min(Y[i], yMin);
+ yMax = Max(Y[i], yMax);
+
+ i++;
+ }
+ Until(i >= n)
+
+ if(state.multiSample > 1)
+ {
+ yMin = (yMin + 0x0A) >> 4;
+ yMax = (yMax + 0x14) >> 4;
+ }
+ else
+ {
+ yMin = (yMin + 0x0F) >> 4; // Round up to a whole row: the coordinates carry 4 fractional bits (cf. the 1/16 scaling below).
+ yMax = (yMax + 0x0F) >> 4;
+ }
+
+ If(yMin == yMax) // Empty row range.
+ {
+ Return(false);
+ }
+
+ yMin = Max(yMin, *Pointer<Int>(data + OFFSET(DrawData,scissorY0))); // Clip the row range to the scissor.
+ yMax = Min(yMax, *Pointer<Int>(data + OFFSET(DrawData,scissorY1)));
+
+ For(Int q = 0, q < state.multiSample, q++) // One pass per sample.
+ {
+ Array<Int> Xq(16);
+ Array<Int> Yq(16);
+
+ Int i = 0;
+
+ Do
+ {
+ Xq[i] = X[i];
+ Yq[i] = Y[i];
+
+ if(state.multiSample > 1)
+ {
+ Xq[i] = Xq[i] + *Pointer<Int>(constants + OFFSET(Constants,Xf) + q * sizeof(int)); // Per-sample offset from the Constants Xf/Yf tables.
+ Yq[i] = Yq[i] + *Pointer<Int>(constants + OFFSET(Constants,Yf) + q * sizeof(int));
+ }
+
+ i++;
+ }
+ Until(i >= n)
+
+ Pointer<Byte> leftEdge = Pointer<Byte>(primitive + OFFSET(Primitive,outline->left)) + q * sizeof(Primitive); // Sample q uses the outline of the q-th consecutive Primitive.
+ Pointer<Byte> rightEdge = Pointer<Byte>(primitive + OFFSET(Primitive,outline->right)) + q * sizeof(Primitive);
+
+ if(state.multiSample > 1)
+ {
+ Int xMin = *Pointer<Int>(data + OFFSET(DrawData, scissorX0));
+ Int xMax = *Pointer<Int>(data + OFFSET(DrawData, scissorX1));
+ Short x = Short(Clamp((X[0] + 0xF) >> 4, xMin, xMax));
+
+ For(Int y = yMin - 1, y < yMax + 1, y++) // Pre-fill every row, plus one guard row on each side, with left == right == x.
+ {
+ *Pointer<Short>(leftEdge + y * sizeof(Primitive::Span)) = x;
+ *Pointer<Short>(rightEdge + y * sizeof(Primitive::Span)) = x;
+ }
+ }
+
+ Xq[n] = Xq[0]; // Close the polygon so that edge i can reference vertex i + 1.
+ Yq[n] = Yq[0];
+
+ // Rasterize
+ {
+ Int i = 0;
+
+ Do
+ {
+ edge(primitive, data, Xq[i + 1 - d], Yq[i + 1 - d], Xq[i + d], Yq[i + d], q); // d == 1 passes (vertex i, vertex i + 1); d == 0 passes them in the opposite order.
+
+ i++;
+ }
+ Until(i >= n)
+ }
+
+ if(state.multiSample == 1) // Empty rows are trimmed only in the single-sample case.
+ {
+ For(, yMin < yMax && *Pointer<Short>(leftEdge + yMin * sizeof(Primitive::Span)) == *Pointer<Short>(rightEdge + yMin * sizeof(Primitive::Span)), yMin++)
+ {
+ // Increments yMin
+ }
+
+ For(, yMax > yMin && *Pointer<Short>(leftEdge + (yMax - 1) * sizeof(Primitive::Span)) == *Pointer<Short>(rightEdge + (yMax - 1) * sizeof(Primitive::Span)), yMax--)
+ {
+ // Decrements yMax
+ }
+
+ If(yMin == yMax)
+ {
+ Return(false);
+ }
+
+ *Pointer<Short>(leftEdge + (yMin - 1) * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + yMin * sizeof(Primitive::Span)); // Guard rows just outside [yMin, yMax): all four stores
+ *Pointer<Short>(rightEdge + (yMin - 1) * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + yMin * sizeof(Primitive::Span)); // read from leftEdge, so left == right in those rows.
+ *Pointer<Short>(leftEdge + yMax * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + (yMax - 1) * sizeof(Primitive::Span));
+ *Pointer<Short>(rightEdge + yMax * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + (yMax - 1) * sizeof(Primitive::Span));
+ }
+ }
+
+ *Pointer<Int>(primitive + OFFSET(Primitive,yMin)) = yMin;
+ *Pointer<Int>(primitive + OFFSET(Primitive,yMax)) = yMax;
+
+ // Sort by minimum y
+ if(solidTriangle && logPrecision >= WHQL)
+ {
+ Float y0 = *Pointer<Float>(v0 + pos * 16 + 4);
+ Float y1 = *Pointer<Float>(v1 + pos * 16 + 4);
+ Float y2 = *Pointer<Float>(v2 + pos * 16 + 4);
+
+ Float yMin = Min(Min(y0, y1), y2);
+
+ conditionalRotate1(yMin == y1, v0, v1, v2); // Rotates the vertex order when v1 holds the minimum.
+ conditionalRotate2(yMin == y2, v0, v1, v2); // Rotates the other way when v2 holds the minimum.
+ }
+
+ // Sort by maximum w
+ if(solidTriangle)
+ {
+ Float w0 = *Pointer<Float>(v0 + pos * 16 + 12);
+ Float w1 = *Pointer<Float>(v1 + pos * 16 + 12);
+ Float w2 = *Pointer<Float>(v2 + pos * 16 + 12);
+
+ Float wMax = Max(Max(w0, w1), w2);
+
+ conditionalRotate1(wMax == w1, v0, v1, v2);
+ conditionalRotate2(wMax == w2, v0, v1, v2);
+ }
+
+ Float w0 = *Pointer<Float>(v0 + pos * 16 + 12); // Re-read after the rotations above may have changed v0..v2.
+ Float w1 = *Pointer<Float>(v1 + pos * 16 + 12);
+ Float w2 = *Pointer<Float>(v2 + pos * 16 + 12);
+
+ Float4 w012;
+
+ w012.x = w0;
+ w012.y = w1;
+ w012.z = w2;
+ w012.w = 1;
+
+ Float rhw0 = *Pointer<Float>(v0 + OFFSET(Vertex,W));
+
+ Int X0 = *Pointer<Int>(v0 + OFFSET(Vertex,X));
+ Int X1 = *Pointer<Int>(v1 + OFFSET(Vertex,X));
+ Int X2 = *Pointer<Int>(v2 + OFFSET(Vertex,X));
+
+ Int Y0 = *Pointer<Int>(v0 + OFFSET(Vertex,Y));
+ Int Y1 = *Pointer<Int>(v1 + OFFSET(Vertex,Y));
+ Int Y2 = *Pointer<Int>(v2 + OFFSET(Vertex,Y));
+
+ if(line)
+ {
+ X2 = X1 + Y1 - Y0; // Lines: synthesize a third vertex by turning the segment (P1 - P0)
+ Y2 = Y1 + X0 - X1; // a quarter turn and adding it to P1.
+ }
+
+ Float dx = Float(X0) * (1.0f / 16.0f); // Fixed point (4 fractional bits) to float.
+ Float dy = Float(Y0) * (1.0f / 16.0f);
+
+ X1 -= X0; // Vertices 1 and 2 become relative to vertex 0.
+ Y1 -= Y0;
+
+ X2 -= X0;
+ Y2 -= Y0;
+
+ Float x1 = w1 * (1.0f / 16.0f) * Float(X1);
+ Float y1 = w1 * (1.0f / 16.0f) * Float(Y1);
+
+ Float x2 = w2 * (1.0f / 16.0f) * Float(X2);
+ Float y2 = w2 * (1.0f / 16.0f) * Float(Y2);
+
+ Float a = x1 * y2 - x2 * y1; // Determinant of the two w-scaled edge vectors.
+
+ Float4 xQuad = Float4(0, 1, 0, 1) - Float4(dx);
+ Float4 yQuad = Float4(0, 0, 1, 1) - Float4(dy);
+
+ *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16) = xQuad;
+ *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16) = yQuad;
+
+ Float4 M[3];
+
+ M[0] = Float4(0, 0, 0, 0);
+ M[1] = Float4(0, 0, 0, 0);
+ M[2] = Float4(0, 0, 0, 0);
+
+ M[0].z = rhw0;
+
+ If(a != 0.0f) // With a zero determinant M keeps the values assigned above.
+ {
+ Float A = 1.0f / a;
+ Float D = A * rhw0;
+
+ M[0].x = (y1 * w2 - y2 * w1) * D;
+ M[0].y = (x2 * w1 - x1 * w2) * D;
+ // M[0].z = rhw0;
+ // M[0].w = 0;
+
+ M[1].x = y2 * A;
+ M[1].y = -x2 * A;
+ // M[1].z = 0;
+ // M[1].w = 0;
+
+ M[2].x = -y1 * A;
+ M[2].y = x1 * A;
+ // M[2].z = 0;
+ // M[2].w = 0;
+ }
+
+ if(state.interpolateW)
+ {
+ Float4 ABC = M[0] + M[1] + M[2];
+
+ Float4 A = ABC.x;
+ Float4 B = ABC.y;
+ Float4 C = ABC.z;
+
+ *Pointer<Float4>(primitive + OFFSET(Primitive,w.A), 16) = A;
+ *Pointer<Float4>(primitive + OFFSET(Primitive,w.B), 16) = B;
+ *Pointer<Float4>(primitive + OFFSET(Primitive,w.C), 16) = C;
+ }
+
+ if(state.interpolateZ)
+ {
+ Float z0 = *Pointer<Float>(v0 + OFFSET(Vertex,Z));
+ Float z1 = *Pointer<Float>(v1 + OFFSET(Vertex,Z));
+ Float z2 = *Pointer<Float>(v2 + OFFSET(Vertex,Z));
+
+ z1 -= z0;
+ z2 -= z0;
+
+ Float4 A;
+ Float4 B;
+ Float4 C;
+
+ if(!point)
+ {
+ Float x1 = Float(X1) * (1.0f / 16.0f); // These shadow the w-scaled x1..y2 above; here the deltas are not multiplied by w.
+ Float y1 = Float(Y1) * (1.0f / 16.0f);
+ Float x2 = Float(X2) * (1.0f / 16.0f);
+ Float y2 = Float(Y2) * (1.0f / 16.0f);
+
+ Float D = *Pointer<Float>(data + OFFSET(DrawData,depthRange)) / (x1 * y2 - x2 * y1); // NOTE(review): no zero-denominator guard here, unlike the a != 0 check above - confirm degenerate primitives cannot reach this.
+
+ Float a = (y2 * z1 - y1 * z2) * D;
+ Float b = (x1 * z2 - x2 * z1) * D;
+
+ A = Float4(a);
+ B = Float4(b);
+ }
+ else
+ {
+ A = Float4(0, 0, 0, 0); // Points get zero depth gradients.
+ B = Float4(0, 0, 0, 0);
+ }
+
+ *Pointer<Float4>(primitive + OFFSET(Primitive,z.A), 16) = A;
+ *Pointer<Float4>(primitive + OFFSET(Primitive,z.B), 16) = B;
+
+ Float c = z0;
+
+ if(state.isDrawTriangle && state.slopeDepthBias)
+ {
+ Float bias = Max(Abs(Float(A.x)), Abs(Float(B.x))); // Larger of the two depth gradient magnitudes.
+ bias *= *Pointer<Float>(data + OFFSET(DrawData,slopeDepthBias));
+
+ if(complementaryDepthBuffer)
+ {
+ bias = -bias;
+ }
+
+ c += bias;
+ }
+
+ C = Float4(c * *Pointer<Float>(data + OFFSET(DrawData,depthRange)) + *Pointer<Float>(data + OFFSET(DrawData,depthNear)));
+
+ *Pointer<Float4>(primitive + OFFSET(Primitive,z.C), 16) = C;
+ }
+
+ for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++) // One plane equation per fragment input component that is in use.
+ {
+ for(int component = 0; component < 4; component++)
+ {
+ int attribute = state.gradient[interpolant][component].attribute;
+ bool flat = state.gradient[interpolant][component].flat;
+ bool wrap = state.gradient[interpolant][component].wrap;
+
+ if(attribute != Unused)
+ {
+ setupGradient(primitive, tri, w012, M, v0, v1, v2, OFFSET(Vertex,v[attribute][component]), OFFSET(Primitive,V[interpolant][component]), flat, sprite, state.perspective, wrap, component);
+ }
+ }
+ }
+
+ if(state.fog.attribute == Fog)
+ {
+ setupGradient(primitive, tri, w012, M, v0, v1, v2, OFFSET(Vertex,f), OFFSET(Primitive,f), state.fog.flat, false, state.perspective, false, 0);
+ }
+
+ Return(true);
+ }
+
+ routine = function(L"SetupRoutine"); // Finalize the function and keep the result for getRoutine().
+ }
+
+ void SetupRoutine::setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flat, bool sprite, bool perspective, bool wrap, int component) // Writes three Float4 plane coefficients (A, B, C) for one attribute component at primitive + planeEquation + 0/16/32.
+ { // 'attribute' is a byte offset into a vertex, 'planeEquation' a byte offset into the primitive; 'm' is the 3-row matrix prepared by generate().
+ Float4 i; // Per-vertex values of the component in x, y, z.
+
+ if(!flat)
+ {
+ if(!sprite)
+ {
+ i.x = *Pointer<Float>(v0 + attribute);
+ i.y = *Pointer<Float>(v1 + attribute);
+ i.z = *Pointer<Float>(v2 + attribute);
+ i.w = 0;
+ }
+ else
+ {
+ if(component == 0) i.x = 0.5f; // Sprites ignore the vertex data: fixed values are chosen per component.
+ if(component == 1) i.x = 0.5f;
+ if(component == 2) i.x = 0.0f;
+ if(component == 3) i.x = 1.0f;
+
+ if(component == 0) i.y = 1.0f;
+ if(component == 1) i.y = 0.5f;
+ if(component == 2) i.y = 0.0f;
+ if(component == 3) i.y = 1.0f;
+
+ if(component == 0) i.z = 0.5f;
+ if(component == 1) i.z = 1.0f;
+ if(component == 2) i.z = 0.0f;
+ if(component == 3) i.z = 1.0f;
+
+ i.w = 0;
+ }
+
+ if(wrap)
+ {
+ Float wrapThreshold; // Renamed from 'm', which shadowed the matrix parameter of the same name.
+
+ wrapThreshold = *Pointer<Float>(v0 + attribute);
+ wrapThreshold = Max(wrapThreshold, *Pointer<Float>(v1 + attribute));
+ wrapThreshold = Max(wrapThreshold, *Pointer<Float>(v2 + attribute));
+ wrapThreshold -= 0.5f; // Largest vertex value minus one half.
+
+ // FIXME: Vectorize
+ If(Float(i.x) < wrapThreshold) i.x = i.x + 1.0f; // Values more than 0.5 below the maximum are moved up by one.
+ If(Float(i.y) < wrapThreshold) i.y = i.y + 1.0f;
+ If(Float(i.z) < wrapThreshold) i.z = i.z + 1.0f;
+ }
+
+ if(!perspective)
+ {
+ i *= w012; // Without perspective correction the values are multiplied by (w0, w1, w2, 1).
+ }
+
+ Float4 A = i.xxxx * m[0];
+ Float4 B = i.yyyy * m[1];
+ Float4 C = i.zzzz * m[2];
+
+ C = A + B + C; // Weighted sum of the matrix rows; its x, y, z become the coefficients.
+
+ A = C.xxxx;
+ B = C.yyyy;
+ C = C.zzzz;
+
+ *Pointer<Float4>(primitive + planeEquation + 0, 16) = A;
+ *Pointer<Float4>(primitive + planeEquation + 16, 16) = B;
+ *Pointer<Float4>(primitive + planeEquation + 32, 16) = C;
+ }
+ else
+ {
+ int leadingVertex = leadingVertexFirst ? OFFSET(Triangle,v0) : OFFSET(Triangle,v2); // Flat: the value comes from one vertex of 'triangle', not from the possibly rotated v0..v2.
+ Float C = *Pointer<Float>(triangle + leadingVertex + attribute);
+
+ *Pointer<Float4>(primitive + planeEquation + 0, 16) = Float4(0, 0, 0, 0); // Zero gradients, constant term only.
+ *Pointer<Float4>(primitive + planeEquation + 16, 16) = Float4(0, 0, 0, 0);
+ *Pointer<Float4>(primitive + planeEquation + 32, 16) = Float4(C);
+ }
+ }
+
+ void SetupRoutine::edge(Pointer<Byte> &primitive, Pointer<Byte> &data, const Int &Xa, const Int &Ya, const Int &Xb, const Int &Yb, Int &q) // Steps along the edge (Xa,Ya)-(Xb,Yb) row by row and stores one clamped x per row into the left or right outline of sample q.
+ {
+ If(Ya != Yb) // Edges with equal end rows write nothing.
+ {
+ Bool swap = Yb < Ya;
+
+ Int X1 = IfThenElse(swap, Xb, Xa); // Order the end points so that Y1 <= Y2.
+ Int X2 = IfThenElse(swap, Xa, Xb);
+ Int Y1 = IfThenElse(swap, Yb, Ya);
+ Int Y2 = IfThenElse(swap, Ya, Yb);
+
+ Int y1 = Max((Y1 + 0x0000000F) >> 4, *Pointer<Int>(data + OFFSET(DrawData,scissorY0))); // First and last row: rounded up from 4 fractional bits,
+ Int y2 = Min((Y2 + 0x0000000F) >> 4, *Pointer<Int>(data + OFFSET(DrawData,scissorY1))); // then clipped to the scissor.
+
+ If(y1 < y2)
+ {
+ Int xMin = *Pointer<Int>(data + OFFSET(DrawData,scissorX0));
+ Int xMax = *Pointer<Int>(data + OFFSET(DrawData,scissorX1));
+
+ Pointer<Byte> leftEdge = primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left);
+ Pointer<Byte> rightEdge = primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right);
+ Pointer<Byte> edge = IfThenElse(swap, rightEdge, leftEdge); // Swapped edges write the right outline, the others the left one.
+
+ // Deltas
+ Int DX12 = X2 - X1;
+ Int DY12 = Y2 - Y1;
+
+ Int FDX12 = DX12 << 4;
+ Int FDY12 = DY12 << 4;
+
+ Int X = DX12 * ((y1 << 4) - Y1) + (X1 & 0x0000000F) * DY12;
+ Int x = (X1 >> 4) + X / FDY12; // Edge
+ Int d = X % FDY12; // Error-term
+ Int ceil = -d >> 31; // Ceiling division: remainder <= 0
+ x -= ceil;
+ d -= ceil & FDY12;
+
+ Int Q = FDX12 / FDY12; // Edge-step
+ Int R = FDX12 % FDY12; // Error-step
+ Int floor = R >> 31; // Flooring division: remainder >= 0
+ Q += floor;
+ R += floor & FDY12;
+
+ Int D = FDY12; // Error-overflow
+ Int y = y1;
+
+ Do
+ {
+ *Pointer<Short>(edge + y * sizeof(Primitive::Span)) = Short(Clamp(x, xMin, xMax)); // x for this row, clamped to the horizontal scissor.
+
+ x += Q;
+ d += R;
+
+ Int overflow = -d >> 31; // All ones when d > 0 and zero otherwise (assumes >> on Int is an arithmetic shift).
+
+ d -= D & overflow; // On overflow take D out of the error term
+ x -= overflow; // and advance x by one.
+
+ y++;
+ }
+ Until(y >= y2)
+ }
+ }
+ }
+
+ void SetupRoutine::conditionalRotate1(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2) // When condition holds: (v0, v1, v2) <- (v1, v2, v0); otherwise all three keep their values.
+ {
+ #if 0 // Rely on LLVM optimization
+ If(condition)
+ {
+ Pointer<Byte> vX;
+
+ vX = v0;
+ v0 = v1;
+ v1 = v2;
+ v2 = vX;
+ }
+ #else // Active variant: the same rotation written with IfThenElse selects instead of an If block.
+ Pointer<Byte> vX = v0; // Saved because v0 is overwritten before v2 needs its old value.
+ v0 = IfThenElse(condition, v1, v0);
+ v1 = IfThenElse(condition, v2, v1);
+ v2 = IfThenElse(condition, vX, v2);
+ #endif
+ }
+
+ void SetupRoutine::conditionalRotate2(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2) // When condition holds: (v0, v1, v2) <- (v2, v0, v1); otherwise all three keep their values.
+ {
+ #if 0 // Rely on LLVM optimization
+ If(condition)
+ {
+ Pointer<Byte> vX;
+
+ vX = v2;
+ v2 = v1;
+ v1 = v0;
+ v0 = vX;
+ }
+ #else // Active variant: the same rotation written with IfThenElse selects instead of an If block.
+ Pointer<Byte> vX = v2; // Saved because v2 is overwritten before v0 needs its old value.
+ v2 = IfThenElse(condition, v1, v2);
+ v1 = IfThenElse(condition, v0, v1);
+ v0 = IfThenElse(condition, vX, v0);
+ #endif
+ }
+
+ Routine *SetupRoutine::getRoutine() // Returns the pointer stored by generate(), or 0 if generate() has not been called.
+ {
+ return routine;
+ }
+}
diff --git a/src/Pipeline/SetupRoutine.hpp b/src/Pipeline/SetupRoutine.hpp
new file mode 100644
index 0000000..c1c3205
--- /dev/null
+++ b/src/Pipeline/SetupRoutine.hpp
@@ -0,0 +1,47 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_SetupRoutine_hpp
+#define sw_SetupRoutine_hpp
+
+#include "Renderer/SetupProcessor.hpp"
+#include "Reactor/Reactor.hpp"
+
+namespace sw
+{
+ class Context;
+
+ class SetupRoutine // Builds a setup Routine from a SetupProcessor::State: construct, call generate(), then fetch it with getRoutine().
+ {
+ public:
+ SetupRoutine(const SetupProcessor::State &state); // Keeps a reference to 'state'; it must outlive this object.
+
+ virtual ~SetupRoutine();
+
+ void generate(); // Builds the routine and stores it in 'routine'.
+ Routine *getRoutine(); // Returns the stored routine pointer (0 before generate()).
+
+ private:
+ void setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flatShading, bool sprite, bool perspective, bool wrap, int component); // The definition names the 'flatShading' parameter 'flat'.
+ void edge(Pointer<Byte> &primitive, Pointer<Byte> &data, const Int &Xa, const Int &Ya, const Int &Xb, const Int &Yb, Int &q);
+ void conditionalRotate1(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2); // (v0, v1, v2) <- (v1, v2, v0) when condition holds.
+ void conditionalRotate2(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2); // (v0, v1, v2) <- (v2, v0, v1) when condition holds.
+
+ const SetupProcessor::State &state;
+
+ Routine *routine; // Set to 0 by the constructor and assigned by generate(); not deleted by the destructor.
+ };
+}
+
+#endif // sw_SetupRoutine_hpp
diff --git a/src/Pipeline/Shader.cpp b/src/Pipeline/Shader.cpp
new file mode 100644
index 0000000..36192c9
--- /dev/null
+++ b/src/Pipeline/Shader.cpp
@@ -0,0 +1,1927 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Shader.hpp"
+
+#include "VertexShader.hpp"
+#include "PixelShader.hpp"
+#include "Common/Math.hpp"
+#include "Common/Debug.hpp"
+
+#include <set>
+#include <fstream>
+#include <sstream>
+#include <stdarg.h>
+#include <string.h>
+
+namespace sw
+{
+ volatile int Shader::serialCounter = 1; // Source of Shader::serialID values, starting at 1. NOTE(review): volatile does not make the serialCounter++ in Shader::Shader() atomic.
+
+ // Picks the dot product opcode for an i-component operand. Any i outside 2..4
+ // yields OPCODE_DP1, and values other than 1 trip the assert on the way.
+ Shader::Opcode Shader::OPCODE_DP(int i)
+ {
+ 	if(i == 2) return OPCODE_DP2;
+ 	if(i == 3) return OPCODE_DP3;
+ 	if(i == 4) return OPCODE_DP4;
+
+ 	if(i != 1)
+ 	{
+ 		ASSERT(false);   // Unsupported component count
+ 	}
+
+ 	return OPCODE_DP1;
+ }
+
+ // Picks the vector length opcode for an i-component operand. The one-component
+ // case maps to OPCODE_ABS, which is also what any unsupported i resolves to
+ // after tripping the assert.
+ Shader::Opcode Shader::OPCODE_LEN(int i)
+ {
+ 	if(i == 2) return OPCODE_LEN2;
+ 	if(i == 3) return OPCODE_LEN3;
+ 	if(i == 4) return OPCODE_LEN4;
+
+ 	if(i != 1)
+ 	{
+ 		ASSERT(false);   // Unsupported component count
+ 	}
+
+ 	return OPCODE_ABS;
+ }
+
+ // Picks the distance opcode for an i-component operand. Any i outside 2..4
+ // yields OPCODE_DIST1, and values other than 1 trip the assert on the way.
+ Shader::Opcode Shader::OPCODE_DIST(int i)
+ {
+ 	if(i == 2) return OPCODE_DIST2;
+ 	if(i == 3) return OPCODE_DIST3;
+ 	if(i == 4) return OPCODE_DIST4;
+
+ 	if(i != 1)
+ 	{
+ 		ASSERT(false);   // Unsupported component count
+ 	}
+
+ 	return OPCODE_DIST1;
+ }
+
+ // Picks the normalize opcode for an i-component operand. The one-component
+ // case maps to OPCODE_SGN, which is also what any unsupported i resolves to
+ // after tripping the assert.
+ Shader::Opcode Shader::OPCODE_NRM(int i)
+ {
+ 	if(i == 2) return OPCODE_NRM2;
+ 	if(i == 3) return OPCODE_NRM3;
+ 	if(i == 4) return OPCODE_NRM4;
+
+ 	if(i != 1)
+ 	{
+ 		ASSERT(false);   // Unsupported component count
+ 	}
+
+ 	return OPCODE_SGN;
+ }
+
+ // Picks the forward opcode for an i-component operand. Any i outside 2..4
+ // yields OPCODE_FORWARD1, and values other than 1 trip the assert on the way.
+ Shader::Opcode Shader::OPCODE_FORWARD(int i)
+ {
+ 	if(i == 2) return OPCODE_FORWARD2;
+ 	if(i == 3) return OPCODE_FORWARD3;
+ 	if(i == 4) return OPCODE_FORWARD4;
+
+ 	if(i != 1)
+ 	{
+ 		ASSERT(false);   // Unsupported component count
+ 	}
+
+ 	return OPCODE_FORWARD1;
+ }
+
+ // Picks the reflect opcode for an i-component operand. Any i outside 2..4
+ // yields OPCODE_REFLECT1, and values other than 1 trip the assert on the way.
+ Shader::Opcode Shader::OPCODE_REFLECT(int i)
+ {
+ 	if(i == 2) return OPCODE_REFLECT2;
+ 	if(i == 3) return OPCODE_REFLECT3;
+ 	if(i == 4) return OPCODE_REFLECT4;
+
+ 	if(i != 1)
+ 	{
+ 		ASSERT(false);   // Unsupported component count
+ 	}
+
+ 	return OPCODE_REFLECT1;
+ }
+
+ // Picks the refract opcode for an i-component operand. Any i outside 2..4
+ // yields OPCODE_REFRACT1, and values other than 1 trip the assert on the way.
+ Shader::Opcode Shader::OPCODE_REFRACT(int i)
+ {
+ 	if(i == 2) return OPCODE_REFRACT2;
+ 	if(i == 3) return OPCODE_REFRACT3;
+ 	if(i == 4) return OPCODE_REFRACT4;
+
+ 	if(i != 1)
+ 	{
+ 		ASSERT(false);   // Unsupported component count
+ 	}
+
+ 	return OPCODE_REFRACT1;
+ }
+
+ // Creates an instruction that carries only an opcode. Every other field set
+ // here receives a fixed default.
+ Shader::Instruction::Instruction(Opcode opcode)
+ 	: opcode(opcode)
+ 	, analysis(0)
+ {
+ 	control = CONTROL_RESERVED0;
+
+ 	// Predication is off; 0xE4 is the swizzle that swizzleString() prints as nothing (.xyzw).
+ 	predicate = false;
+ 	predicateNot = false;
+ 	predicateSwizzle = 0xE4;
+
+ 	coissue = false;
+
+ 	// Same three fields that parseDeclarationToken() fills in.
+ 	samplerType = SAMPLER_UNKNOWN;
+ 	usage = USAGE_POSITION;
+ 	usageIndex = 0;
+ }
+
+ Shader::Instruction::Instruction(const unsigned long *token, int size, unsigned char majorVersion) : analysis(0) // Decodes one instruction: an operation token followed by 'size' further tokens.
+ {
+ parseOperationToken(*token++, majorVersion); // Sets opcode, control, predicate and coissue; token now points at the first operand.
+
+ samplerType = SAMPLER_UNKNOWN;
+ usage = USAGE_POSITION;
+ usageIndex = 0;
+
+ if(opcode == OPCODE_IF ||
+ opcode == OPCODE_IFC ||
+ opcode == OPCODE_LOOP ||
+ opcode == OPCODE_REP ||
+ opcode == OPCODE_BREAKC ||
+ opcode == OPCODE_BREAKP) // No destination operand
+ {
+ if(size > 0) parseSourceToken(0, token++, majorVersion); // Up to three source tokens, one token each.
+ if(size > 1) parseSourceToken(1, token++, majorVersion);
+ if(size > 2) parseSourceToken(2, token++, majorVersion);
+ if(size > 3) ASSERT(false);
+ }
+ else if(opcode == OPCODE_DCL)
+ {
+ parseDeclarationToken(*token++); // 'size' is not consulted here: one declaration token, then one destination token.
+ parseDestinationToken(token++, majorVersion);
+ }
+ else
+ {
+ if(size > 0)
+ {
+ parseDestinationToken(token, majorVersion);
+
+ if(dst.rel.type != PARAMETER_VOID && majorVersion >= 3) // Relative destination: parseDestinationToken() also consumed token[1].
+ {
+ token++;
+ size--;
+ }
+
+ token++;
+ size--;
+ }
+
+ if(predicate)
+ {
+ ASSERT(size != 0);
+
+ predicateNot = (Modifier)((*token & 0x0F000000) >> 24) == MODIFIER_NOT; // Predicate token sits between the destination and the sources.
+ predicateSwizzle = (unsigned char)((*token & 0x00FF0000) >> 16);
+
+ token++;
+ size--;
+ }
+
+ for(int i = 0; size > 0; i++) // NOTE(review): i is bounded only by 'size'; assumes src[] is large enough for every accepted instruction — confirm.
+ {
+ parseSourceToken(i, token, majorVersion);
+
+ token++;
+ size--;
+
+ if(src[i].rel.type != PARAMETER_VOID && majorVersion >= 2) // Relative source: parseSourceToken() also read token[1], so skip it.
+ {
+ token++;
+ size--;
+ }
+ }
+ }
+ }
+
+ // Empty destructor body.
+ Shader::Instruction::~Instruction() {}
+
+ std::string Shader::Instruction::string(ShaderType shaderType, unsigned short version) const // Renders this instruction as one line of assembly-style text.
+ {
+ std::string instructionString;
+
+ if(opcode != OPCODE_DCL)
+ {
+ instructionString += coissue ? "+ " : "";
+
+ if(predicate)
+ {
+ instructionString += predicateNot ? "(!p0" : "(p0"; // Predicate prefix, e.g. "(!p0.x) ".
+ instructionString += swizzleString(PARAMETER_PREDICATE, predicateSwizzle);
+ instructionString += ") ";
+ }
+
+ instructionString += operationString(version) + controlString() + dst.shiftString() + dst.modifierString(); // Mnemonic followed by its suffixes.
+
+ if(dst.type != PARAMETER_VOID)
+ {
+ instructionString += " " + dst.string(shaderType, version) +
+ dst.relativeString() +
+ dst.maskString();
+ }
+
+ for(int i = 0; i < 4; i++) // Only the first four source slots are printed.
+ {
+ if(src[i].type != PARAMETER_VOID)
+ {
+ instructionString += (dst.type != PARAMETER_VOID || i > 0) ? ", " : " "; // Comma after any earlier operand, otherwise just a space.
+ instructionString += src[i].preModifierString() +
+ src[i].string(shaderType, version) +
+ src[i].relativeString() +
+ src[i].postModifierString() +
+ src[i].swizzleString();
+ }
+ }
+ }
+ else // DCL
+ {
+ instructionString += "dcl";
+
+ if(dst.type == PARAMETER_SAMPLER)
+ {
+ switch(samplerType) // Sampler declarations get a dimensionality suffix.
+ {
+ case SAMPLER_UNKNOWN: instructionString += " "; break;
+ case SAMPLER_1D: instructionString += "_1d "; break;
+ case SAMPLER_2D: instructionString += "_2d "; break;
+ case SAMPLER_CUBE: instructionString += "_cube "; break;
+ case SAMPLER_VOLUME: instructionString += "_volume "; break;
+ default:
+ ASSERT(false);
+ }
+
+ instructionString += dst.string(shaderType, version);
+ }
+ else if(dst.type == PARAMETER_INPUT ||
+ dst.type == PARAMETER_OUTPUT ||
+ dst.type == PARAMETER_TEXTURE)
+ {
+ if(version >= 0x0300) // The usage suffix is only emitted from version 0x0300 on.
+ {
+ switch(usage)
+ {
+ case USAGE_POSITION: instructionString += "_position"; break;
+ case USAGE_BLENDWEIGHT: instructionString += "_blendweight"; break;
+ case USAGE_BLENDINDICES: instructionString += "_blendindices"; break;
+ case USAGE_NORMAL: instructionString += "_normal"; break;
+ case USAGE_PSIZE: instructionString += "_psize"; break;
+ case USAGE_TEXCOORD: instructionString += "_texcoord"; break;
+ case USAGE_TANGENT: instructionString += "_tangent"; break;
+ case USAGE_BINORMAL: instructionString += "_binormal"; break;
+ case USAGE_TESSFACTOR: instructionString += "_tessfactor"; break;
+ case USAGE_POSITIONT: instructionString += "_positiont"; break;
+ case USAGE_COLOR: instructionString += "_color"; break;
+ case USAGE_FOG: instructionString += "_fog"; break;
+ case USAGE_DEPTH: instructionString += "_depth"; break;
+ case USAGE_SAMPLE: instructionString += "_sample"; break;
+ default:
+ ASSERT(false);
+ }
+
+ if(usageIndex > 0) // Index 0 is left implicit.
+ {
+ std::ostringstream buffer;
+
+ buffer << (int)usageIndex;
+
+ instructionString += buffer.str();
+ }
+ }
+ else ASSERT(dst.type != PARAMETER_OUTPUT);
+
+ instructionString += " ";
+
+ instructionString += dst.string(shaderType, version);
+ instructionString += dst.maskString();
+ }
+ else if(dst.type == PARAMETER_MISCTYPE) // vPos and vFace
+ {
+ instructionString += " ";
+
+ instructionString += dst.string(shaderType, version);
+ }
+ else ASSERT(false);
+ }
+
+ return instructionString;
+ }
+
+ // Concatenates the destination modifier suffixes in the fixed order
+ // "_sat", "_pp", "_centroid". Void and label parameters never get any.
+ std::string Shader::DestinationParameter::modifierString() const
+ {
+ 	std::string suffix;
+
+ 	if(type != PARAMETER_VOID && type != PARAMETER_LABEL)
+ 	{
+ 		if(saturate) suffix += "_sat";
+ 		if(partialPrecision) suffix += "_pp";
+ 		if(centroid) suffix += "_centroid";
+ 	}
+
+ 	return suffix;
+ }
+
+ // Formats the destination shift as a suffix: 1..3 give "_x2", "_x4", "_x8",
+ // and -1..-3 give "_d2", "_d4", "_d8". Zero, unknown values, and void or
+ // label parameters give an empty string.
+ std::string Shader::DestinationParameter::shiftString() const
+ {
+ 	const bool printable = (type != PARAMETER_VOID && type != PARAMETER_LABEL);
+
+ 	if(!printable) return "";
+
+ 	if(shift == 1) return "_x2";
+ 	if(shift == 2) return "_x4";
+ 	if(shift == 3) return "_x8";
+ 	if(shift == -1) return "_d2";
+ 	if(shift == -2) return "_d4";
+ 	if(shift == -3) return "_d8";
+
+ 	// Zero, or an unrecognized shift.
+ 	// ASSERT(false);   // FIXME
+ 	return "";
+ }
+
+ // Formats the 4-bit write mask (bit 0 = x ... bit 3 = w) as ".xyzw"-style text.
+ // An empty mask and a full mask both print nothing, as do void and label
+ // parameters. A mask with bits outside the low nibble trips the assert.
+ std::string Shader::DestinationParameter::maskString() const
+ {
+ 	if(type == PARAMETER_VOID || type == PARAMETER_LABEL) return "";
+
+ 	if((mask & ~0xF) != 0)
+ 	{
+ 		ASSERT(false);
+
+ 		return "";
+ 	}
+
+ 	if(mask == 0x0 || mask == 0xF) return "";
+
+ 	std::string components = ".";
+
+ 	if(mask & 0x1) components += "x";
+ 	if(mask & 0x2) components += "y";
+ 	if(mask & 0x4) components += "z";
+ 	if(mask & 0x8) components += "w";
+
+ 	return components;
+ }
+
+ // Returns the part of the source modifier that is printed in front of the
+ // register name: "-" for the negating variants, "1-" for complement, "!" for
+ // logical not, and nothing for the rest.
+ std::string Shader::SourceParameter::preModifierString() const
+ {
+ 	if(type != PARAMETER_VOID)
+ 	{
+ 		switch(modifier)
+ 		{
+ 		case MODIFIER_NEGATE:
+ 		case MODIFIER_BIAS_NEGATE:
+ 		case MODIFIER_SIGN_NEGATE:
+ 		case MODIFIER_X2_NEGATE:
+ 		case MODIFIER_ABS_NEGATE:
+ 			return "-";
+ 		case MODIFIER_COMPLEMENT:
+ 			return "1-";
+ 		case MODIFIER_NOT:
+ 			return "!";
+ 		case MODIFIER_NONE:
+ 		case MODIFIER_BIAS:
+ 		case MODIFIER_SIGN:
+ 		case MODIFIER_X2:
+ 		case MODIFIER_DZ:
+ 		case MODIFIER_DW:
+ 		case MODIFIER_ABS:
+ 			return "";
+ 		default:
+ 			ASSERT(false);
+ 		}
+ 	}
+
+ 	return "";
+ }
+
+ // Formats the relative-addressing part of a parameter, e.g. "[a0.x]",
+ // "[r3.y]", "[c1.w]" or "[aL]". Only const, input, output and temp
+ // parameters can print one; everything else gives an empty string.
+ std::string Shader::Parameter::relativeString() const
+ {
+ 	const bool addressable = type == PARAMETER_CONST ||
+ 	                         type == PARAMETER_INPUT ||
+ 	                         type == PARAMETER_OUTPUT ||
+ 	                         type == PARAMETER_TEMP;
+
+ 	if(!addressable) return "";
+
+ 	// Only the lowest swizzle lane selects the component of the index register.
+ 	const char component = "xyzw"[rel.swizzle & 0x03];
+
+ 	if(rel.type == PARAMETER_VOID)
+ 	{
+ 		return "";
+ 	}
+
+ 	if(rel.type == PARAMETER_ADDR)
+ 	{
+ 		return std::string("[a0.") + component + "]";
+ 	}
+
+ 	if(rel.type == PARAMETER_TEMP)
+ 	{
+ 		std::ostringstream number;
+ 		number << rel.index;
+
+ 		return "[r" + number.str() + "." + component + "]";
+ 	}
+
+ 	if(rel.type == PARAMETER_LOOP)
+ 	{
+ 		return "[aL]";
+ 	}
+
+ 	if(rel.type == PARAMETER_CONST)
+ 	{
+ 		std::ostringstream number;
+ 		number << rel.index;
+
+ 		return "[c" + number.str() + "." + component + "]";
+ 	}
+
+ 	ASSERT(false);
+
+ 	return "";
+ }
+
+ // Returns the part of the source modifier that is printed after the register
+ // name ("_bias", "_bx2", "_x2", "_dz", "_dw", "_abs"). The negating variants
+ // share the suffix of their plain counterpart.
+ std::string Shader::SourceParameter::postModifierString() const
+ {
+ 	if(type != PARAMETER_VOID)
+ 	{
+ 		switch(modifier)
+ 		{
+ 		case MODIFIER_BIAS:
+ 		case MODIFIER_BIAS_NEGATE:
+ 			return "_bias";
+ 		case MODIFIER_SIGN:
+ 		case MODIFIER_SIGN_NEGATE:
+ 			return "_bx2";
+ 		case MODIFIER_X2:
+ 		case MODIFIER_X2_NEGATE:
+ 			return "_x2";
+ 		case MODIFIER_DZ:
+ 			return "_dz";
+ 		case MODIFIER_DW:
+ 			return "_dw";
+ 		case MODIFIER_ABS:
+ 		case MODIFIER_ABS_NEGATE:
+ 			return "_abs";
+ 		case MODIFIER_NONE:
+ 		case MODIFIER_NEGATE:
+ 		case MODIFIER_COMPLEMENT:
+ 		case MODIFIER_NOT:
+ 			return "";
+ 		default:
+ 			ASSERT(false);
+ 		}
+ 	}
+
+ 	return "";
+ }
+
+ // Names the source register. Constants with a non-negative bufferIndex are
+ // printed as "cb<buffer>[<index>]"; everything else uses Parameter::string().
+ std::string Shader::SourceParameter::string(ShaderType shaderType, unsigned short version) const
+ {
+ 	const bool bufferConstant = (type == PARAMETER_CONST) && (bufferIndex >= 0);
+
+ 	if(!bufferConstant)
+ 	{
+ 		return Parameter::string(shaderType, version);
+ 	}
+
+ 	std::ostringstream text;
+ 	text << "cb" << bufferIndex << "[" << index << "]";
+
+ 	return text.str();
+ }
+
+ // Formats this parameter's own swizzle through Instruction::swizzleString().
+ std::string Shader::SourceParameter::swizzleString() const
+ {
+ 	std::string text = Instruction::swizzleString(type, swizzle);
+
+ 	return text;
+ }
+
+ // Decodes the leading token of an instruction into opcode, control, predicate
+ // and coissue, and asserts on bits that are reserved for the given version.
+ void Shader::Instruction::parseOperationToken(unsigned long token, unsigned char majorVersion)
+ {
+ 	const unsigned long signature = token & 0xFFFF0000;
+
+ 	if(signature == 0xFFFF0000 || signature == 0xFFFE0000)   // Version token
+ 	{
+ 		// The whole token is stored as the opcode; the other fields get defaults.
+ 		opcode = (Opcode)token;
+
+ 		control = CONTROL_RESERVED0;
+ 		predicate = false;
+ 		coissue = false;
+
+ 		return;
+ 	}
+
+ 	opcode = (Opcode)(token & 0x0000FFFF);
+ 	control = (Control)((token & 0x00FF0000) >> 16);
+
+ 	const int lengthField = (token & 0x0F000000) >> 24;
+
+ 	predicate = (token & 0x10000000) != 0x00000000;
+ 	coissue = (token & 0x40000000) != 0x00000000;
+
+ 	const bool beforeVersion2 = majorVersion < 2;
+
+ 	if(beforeVersion2 && lengthField != 0)
+ 	{
+ 		ASSERT(false);   // Reserved
+ 	}
+
+ 	if(beforeVersion2 && predicate)
+ 	{
+ 		ASSERT(false);
+ 	}
+
+ 	if((token & 0x20000000) != 0x00000000)
+ 	{
+ 		ASSERT(false);   // Reserved
+ 	}
+
+ 	if(!beforeVersion2 && coissue)
+ 	{
+ 		ASSERT(false);   // Reserved
+ 	}
+
+ 	if((token & 0x80000000) != 0x00000000)
+ 	{
+ 		ASSERT(false);
+ 	}
+ }
+
+ // Splits a declaration token into the sampler type (bits 27..30), the usage
+ // (bits 0..4) and the usage index (bits 16..19).
+ void Shader::Instruction::parseDeclarationToken(unsigned long token)
+ {
+ 	const unsigned long samplerBits = (token & 0x78000000) >> 27;
+ 	const unsigned long usageBits = token & 0x0000001F;
+ 	const unsigned long usageIndexBits = (token & 0x000F0000) >> 16;
+
+ 	samplerType = (SamplerType)samplerBits;
+ 	usage = (Usage)usageBits;
+ 	usageIndex = (unsigned char)usageIndexBits;
+ }
+
+ // Decodes a destination parameter token into dst. With relative addressing
+ // and majorVersion >= 3 the following token (token[1]) is read as well, and
+ // it describes the register used as the index.
+ void Shader::Instruction::parseDestinationToken(const unsigned long *token, unsigned char majorVersion)
+ {
+ 	const unsigned long bits = token[0];
+
+ 	dst.index = (unsigned short)(bits & 0x000007FF);
+ 	dst.type = (ParameterType)(((bits & 0x00001800) >> 8) | ((bits & 0x70000000) >> 28));
+
+ 	// TODO: Check type and index range
+
+ 	const bool relative = (bits & 0x00002000) != 0x00000000;
+
+ 	dst.rel.scale = 1;
+
+ 	if(!relative)
+ 	{
+ 		dst.rel.type = PARAMETER_VOID;
+ 		dst.rel.swizzle = 0x00;
+ 	}
+ 	else if(majorVersion >= 3)
+ 	{
+ 		const unsigned long relativeBits = token[1];
+
+ 		dst.rel.type = (ParameterType)(((relativeBits & 0x00001800) >> 8) | ((relativeBits & 0x70000000) >> 28));
+ 		dst.rel.swizzle = (unsigned char)((relativeBits & 0x00FF0000) >> 16);
+ 	}
+ 	else
+ 	{
+ 		dst.rel.type = PARAMETER_ADDR;
+ 		dst.rel.swizzle = 0x00;
+
+ 		ASSERT(false);   // Reserved
+ 	}
+
+ 	if((bits & 0x0000C000) != 0x00000000)
+ 	{
+ 		ASSERT(false);   // Reserved
+ 	}
+
+ 	dst.mask = (unsigned char)((bits & 0x000F0000) >> 16);
+ 	dst.saturate = (bits & 0x00100000) != 0;
+ 	dst.partialPrecision = (bits & 0x00200000) != 0;
+ 	dst.centroid = (bits & 0x00400000) != 0;
+
+ 	// The 4-bit shift field lands in the high nibble of a signed char, so the
+ 	// final >> 4 sign-extends it.
+ 	dst.shift = (signed char)((bits & 0x0F000000) >> 20) >> 4;
+
+ 	if(majorVersion >= 2 && dst.shift)
+ 	{
+ 		ASSERT(false);   // Reserved
+ 	}
+
+ 	if((bits & 0x80000000) != 0x80000000)
+ 	{
+ 		ASSERT(false);
+ 	}
+ }
+
+ // Decodes source operand i from the token stream.
+ //
+ // i            - operand slot; for def/defi it is also the literal component index.
+ // token        - points at the operand token; token[1] is read as well when the
+ //                operand uses relative addressing and majorVersion >= 2.
+ // majorVersion - major shader version, which decides how relative addressing is encoded.
+ //
+ // For def, defb and defi the token is not a register description but a raw
+ // literal, which is accumulated into src[0]. The literal bytes are copied
+ // with memcpy rather than read through a casted pointer, since reading an
+ // unsigned long object through a float or int lvalue is undefined behavior.
+ void Shader::Instruction::parseSourceToken(int i, const unsigned long *token, unsigned char majorVersion)
+ {
+ 	// Defaults
+ 	src[i].index = 0;
+ 	src[i].type = PARAMETER_VOID;
+ 	src[i].modifier = MODIFIER_NONE;
+ 	src[i].swizzle = 0xE4;
+ 	src[i].rel.type = PARAMETER_VOID;
+ 	src[i].rel.swizzle = 0x00;
+ 	src[i].rel.scale = 1;
+
+ 	switch(opcode)
+ 	{
+ 	case OPCODE_DEF:
+ 		{
+ 			float literal;
+ 			memcpy(&literal, token, sizeof(literal));
+
+ 			src[0].type = PARAMETER_FLOAT4LITERAL;
+ 			src[0].value[i] = literal;
+ 		}
+ 		break;
+ 	case OPCODE_DEFB:
+ 		{
+ 			int literal;
+ 			memcpy(&literal, token, sizeof(literal));
+
+ 			src[0].type = PARAMETER_BOOL1LITERAL;
+ 			src[0].boolean[0] = literal;
+ 		}
+ 		break;
+ 	case OPCODE_DEFI:
+ 		{
+ 			int literal;
+ 			memcpy(&literal, token, sizeof(literal));
+
+ 			src[0].type = PARAMETER_INT4LITERAL;
+ 			src[0].integer[i] = literal;
+ 		}
+ 		break;
+ 	default:
+ 		{
+ 			src[i].index = (unsigned short)(token[0] & 0x000007FF);
+ 			src[i].type = (ParameterType)(((token[0] & 0x00001800) >> 8) | ((token[0] & 0x70000000) >> 28));
+
+ 			// FIXME: Check type and index range
+
+ 			const bool relative = (token[0] & 0x00002000) != 0x00000000;
+ 			src[i].rel.type = relative ? PARAMETER_ADDR : PARAMETER_VOID;
+
+ 			// def/defi/defb never reach this branch, so the reserved-bit
+ 			// checks below apply unconditionally.
+ 			if((token[0] & 0x0000C000) != 0x00000000)
+ 			{
+ 				ASSERT(false);
+ 			}
+
+ 			src[i].swizzle = (unsigned char)((token[0] & 0x00FF0000) >> 16);
+ 			src[i].modifier = (Modifier)((token[0] & 0x0F000000) >> 24);
+
+ 			if((token[0] & 0x80000000) != 0x80000000)
+ 			{
+ 				ASSERT(false);
+ 			}
+
+ 			if(relative && majorVersion >= 2)
+ 			{
+ 				// The index register is described by the token that follows.
+ 				src[i].rel.type = (ParameterType)(((token[1] & 0x00001800) >> 8) | ((token[1] & 0x70000000) >> 28));
+ 				src[i].rel.swizzle = (unsigned char)((token[1] & 0x00FF0000) >> 16);
+ 			}
+ 		}
+ 		break;
+ 	}
+ }
+
+ // Formats an 8-bit swizzle (two bits per lane, first lane in the low bits)
+ // as ".xyzw"-style text. The identity swizzle 0xE4 and void or label
+ // parameters print nothing, and a trailing run of identical selectors is
+ // collapsed to a single letter (0x00 prints as ".x").
+ std::string Shader::Instruction::swizzleString(ParameterType type, unsigned char swizzle)
+ {
+ 	if(swizzle == 0xE4 || type == PARAMETER_VOID || type == PARAMETER_LABEL)
+ 	{
+ 		return "";
+ 	}
+
+ 	const char *const letter = "xyzw";
+
+ 	const int x = (swizzle & 0x03) >> 0;
+ 	const int y = (swizzle & 0x0C) >> 2;
+ 	const int z = (swizzle & 0x30) >> 4;
+ 	const int w = (swizzle & 0xC0) >> 6;
+
+ 	const bool zwEqual = (z == w);
+ 	const bool yzwEqual = (y == z) && zwEqual;
+ 	const bool allEqual = (x == y) && yzwEqual;
+
+ 	std::string text = ".";
+
+ 	text += letter[x];
+ 	if(allEqual) return text;
+
+ 	text += letter[y];
+ 	if(yzwEqual) return text;
+
+ 	text += letter[z];
+ 	if(zwEqual) return text;
+
+ 	text += letter[w];
+
+ 	return text;
+ }
+
+ std::string Shader::Instruction::operationString(unsigned short version) const // Maps this instruction's opcode to its mnemonic; 'version' only matters for OPCODE_TEX.
+ {
+ switch(opcode)
+ {
+ case OPCODE_NULL: return "null";
+ case OPCODE_NOP: return "nop";
+ case OPCODE_MOV: return "mov";
+ case OPCODE_ADD: return "add";
+ case OPCODE_IADD: return "iadd";
+ case OPCODE_SUB: return "sub";
+ case OPCODE_ISUB: return "isub";
+ case OPCODE_MAD: return "mad";
+ case OPCODE_IMAD: return "imad";
+ case OPCODE_MUL: return "mul";
+ case OPCODE_IMUL: return "imul";
+ case OPCODE_RCPX: return "rcpx";
+ case OPCODE_DIV: return "div";
+ case OPCODE_IDIV: return "idiv";
+ case OPCODE_UDIV: return "udiv";
+ case OPCODE_MOD: return "mod";
+ case OPCODE_IMOD: return "imod";
+ case OPCODE_UMOD: return "umod";
+ case OPCODE_SHL: return "shl";
+ case OPCODE_ISHR: return "ishr";
+ case OPCODE_USHR: return "ushr";
+ case OPCODE_RSQX: return "rsqx";
+ case OPCODE_SQRT: return "sqrt";
+ case OPCODE_RSQ: return "rsq";
+ case OPCODE_LEN2: return "len2";
+ case OPCODE_LEN3: return "len3";
+ case OPCODE_LEN4: return "len4";
+ case OPCODE_DIST1: return "dist1";
+ case OPCODE_DIST2: return "dist2";
+ case OPCODE_DIST3: return "dist3";
+ case OPCODE_DIST4: return "dist4";
+ case OPCODE_DP3: return "dp3";
+ case OPCODE_DP4: return "dp4";
+ case OPCODE_DET2: return "det2";
+ case OPCODE_DET3: return "det3";
+ case OPCODE_DET4: return "det4";
+ case OPCODE_MIN: return "min";
+ case OPCODE_IMIN: return "imin";
+ case OPCODE_UMIN: return "umin";
+ case OPCODE_MAX: return "max";
+ case OPCODE_IMAX: return "imax";
+ case OPCODE_UMAX: return "umax";
+ case OPCODE_SLT: return "slt";
+ case OPCODE_SGE: return "sge";
+ case OPCODE_EXP2X: return "exp2x";
+ case OPCODE_LOG2X: return "log2x";
+ case OPCODE_LIT: return "lit";
+ case OPCODE_ATT: return "att";
+ case OPCODE_LRP: return "lrp";
+ case OPCODE_STEP: return "step";
+ case OPCODE_SMOOTH: return "smooth";
+ case OPCODE_FLOATBITSTOINT: return "floatBitsToInt";
+ case OPCODE_FLOATBITSTOUINT: return "floatBitsToUInt";
+ case OPCODE_INTBITSTOFLOAT: return "intBitsToFloat";
+ case OPCODE_UINTBITSTOFLOAT: return "uintBitsToFloat";
+ case OPCODE_PACKSNORM2x16: return "packSnorm2x16";
+ case OPCODE_PACKUNORM2x16: return "packUnorm2x16";
+ case OPCODE_PACKHALF2x16: return "packHalf2x16";
+ case OPCODE_UNPACKSNORM2x16: return "unpackSnorm2x16";
+ case OPCODE_UNPACKUNORM2x16: return "unpackUnorm2x16";
+ case OPCODE_UNPACKHALF2x16: return "unpackHalf2x16";
+ case OPCODE_FRC: return "frc";
+ case OPCODE_M4X4: return "m4x4";
+ case OPCODE_M4X3: return "m4x3";
+ case OPCODE_M3X4: return "m3x4";
+ case OPCODE_M3X3: return "m3x3";
+ case OPCODE_M3X2: return "m3x2";
+ case OPCODE_CALL: return "call";
+ case OPCODE_CALLNZ: return "callnz";
+ case OPCODE_LOOP: return "loop";
+ case OPCODE_RET: return "ret";
+ case OPCODE_ENDLOOP: return "endloop";
+ case OPCODE_LABEL: return "label";
+ case OPCODE_DCL: return "dcl";
+ case OPCODE_POWX: return "powx";
+ case OPCODE_CRS: return "crs";
+ case OPCODE_SGN: return "sgn";
+ case OPCODE_ISGN: return "isgn";
+ case OPCODE_ABS: return "abs";
+ case OPCODE_IABS: return "iabs";
+ case OPCODE_NRM2: return "nrm2";
+ case OPCODE_NRM3: return "nrm3";
+ case OPCODE_NRM4: return "nrm4";
+ case OPCODE_SINCOS: return "sincos";
+ case OPCODE_REP: return "rep";
+ case OPCODE_ENDREP: return "endrep";
+ case OPCODE_IF: return "if";
+ case OPCODE_IFC: return "ifc";
+ case OPCODE_ELSE: return "else";
+ case OPCODE_ENDIF: return "endif";
+ case OPCODE_BREAK: return "break";
+ case OPCODE_BREAKC: return "breakc";
+ case OPCODE_MOVA: return "mova";
+ case OPCODE_DEFB: return "defb";
+ case OPCODE_DEFI: return "defi";
+ case OPCODE_TEXCOORD: return "texcoord";
+ case OPCODE_TEXKILL: return "texkill";
+ case OPCODE_DISCARD: return "discard";
+ case OPCODE_TEX: // The only mnemonic that depends on 'version'.
+ if(version < 0x0104) return "tex";
+ else return "texld";
+ case OPCODE_TEXBEM: return "texbem";
+ case OPCODE_TEXBEML: return "texbeml";
+ case OPCODE_TEXREG2AR: return "texreg2ar";
+ case OPCODE_TEXREG2GB: return "texreg2gb";
+ case OPCODE_TEXM3X2PAD: return "texm3x2pad";
+ case OPCODE_TEXM3X2TEX: return "texm3x2tex";
+ case OPCODE_TEXM3X3PAD: return "texm3x3pad";
+ case OPCODE_TEXM3X3TEX: return "texm3x3tex";
+ case OPCODE_RESERVED0: return "reserved0";
+ case OPCODE_TEXM3X3SPEC: return "texm3x3spec";
+ case OPCODE_TEXM3X3VSPEC: return "texm3x3vspec";
+ case OPCODE_EXPP: return "expp";
+ case OPCODE_LOGP: return "logp";
+ case OPCODE_CND: return "cnd";
+ case OPCODE_DEF: return "def";
+ case OPCODE_TEXREG2RGB: return "texreg2rgb";
+ case OPCODE_TEXDP3TEX: return "texdp3tex";
+ case OPCODE_TEXM3X2DEPTH: return "texm3x2depth";
+ case OPCODE_TEXDP3: return "texdp3";
+ case OPCODE_TEXM3X3: return "texm3x3";
+ case OPCODE_TEXDEPTH: return "texdepth";
+ case OPCODE_CMP0: return "cmp0";
+ case OPCODE_ICMP: return "icmp";
+ case OPCODE_UCMP: return "ucmp";
+ case OPCODE_SELECT: return "select";
+ case OPCODE_EXTRACT: return "extract";
+ case OPCODE_INSERT: return "insert";
+ case OPCODE_BEM: return "bem";
+ case OPCODE_DP2ADD: return "dp2add";
+ case OPCODE_DFDX: return "dFdx";
+ case OPCODE_DFDY: return "dFdy";
+ case OPCODE_FWIDTH: return "fwidth";
+ case OPCODE_TEXLDD: return "texldd";
+ case OPCODE_CMP: return "cmp";
+ case OPCODE_TEXLDL: return "texldl";
+ case OPCODE_TEXBIAS: return "texbias";
+ case OPCODE_TEXOFFSET: return "texoffset";
+ case OPCODE_TEXOFFSETBIAS: return "texoffsetbias";
+ case OPCODE_TEXLODOFFSET: return "texlodoffset";
+ case OPCODE_TEXELFETCH: return "texelfetch";
+ case OPCODE_TEXELFETCHOFFSET: return "texelfetchoffset";
+ case OPCODE_TEXGRAD: return "texgrad";
+ case OPCODE_TEXGRADOFFSET: return "texgradoffset";
+ case OPCODE_BREAKP: return "breakp";
+ case OPCODE_TEXSIZE: return "texsize";
+ case OPCODE_PHASE: return "phase";
+ case OPCODE_COMMENT: return "comment";
+ case OPCODE_END: return "end";
+ case OPCODE_PS_1_0: return "ps_1_0";
+ case OPCODE_PS_1_1: return "ps_1_1";
+ case OPCODE_PS_1_2: return "ps_1_2";
+ case OPCODE_PS_1_3: return "ps_1_3";
+ case OPCODE_PS_1_4: return "ps_1_4";
+ case OPCODE_PS_2_0: return "ps_2_0";
+ case OPCODE_PS_2_x: return "ps_2_x";
+ case OPCODE_PS_3_0: return "ps_3_0";
+ case OPCODE_VS_1_0: return "vs_1_0";
+ case OPCODE_VS_1_1: return "vs_1_1";
+ case OPCODE_VS_2_0: return "vs_2_0";
+ case OPCODE_VS_2_x: return "vs_2_x";
+ case OPCODE_VS_2_sw: return "vs_2_sw";
+ case OPCODE_VS_3_0: return "vs_3_0";
+ case OPCODE_VS_3_sw: return "vs_3_sw";
+ case OPCODE_WHILE: return "while";
+ case OPCODE_ENDWHILE: return "endwhile";
+ case OPCODE_COS: return "cos";
+ case OPCODE_SIN: return "sin";
+ case OPCODE_TAN: return "tan";
+ case OPCODE_ACOS: return "acos";
+ case OPCODE_ASIN: return "asin";
+ case OPCODE_ATAN: return "atan";
+ case OPCODE_ATAN2: return "atan2";
+ case OPCODE_COSH: return "cosh";
+ case OPCODE_SINH: return "sinh";
+ case OPCODE_TANH: return "tanh";
+ case OPCODE_ACOSH: return "acosh";
+ case OPCODE_ASINH: return "asinh";
+ case OPCODE_ATANH: return "atanh";
+ case OPCODE_DP1: return "dp1";
+ case OPCODE_DP2: return "dp2";
+ case OPCODE_TRUNC: return "trunc";
+ case OPCODE_FLOOR: return "floor";
+ case OPCODE_ROUND: return "round";
+ case OPCODE_ROUNDEVEN: return "roundEven";
+ case OPCODE_CEIL: return "ceil";
+ case OPCODE_EXP2: return "exp2";
+ case OPCODE_LOG2: return "log2";
+ case OPCODE_EXP: return "exp";
+ case OPCODE_LOG: return "log";
+ case OPCODE_POW: return "pow";
+ case OPCODE_F2B: return "f2b";
+ case OPCODE_B2F: return "b2f";
+ case OPCODE_F2I: return "f2i";
+ case OPCODE_I2F: return "i2f";
+ case OPCODE_F2U: return "f2u";
+ case OPCODE_U2F: return "u2f";
+ case OPCODE_B2I: return "b2i";
+ case OPCODE_I2B: return "i2b";
+ case OPCODE_ALL: return "all";
+ case OPCODE_ANY: return "any";
+ case OPCODE_NEG: return "neg";
+ case OPCODE_INEG: return "ineg";
+ case OPCODE_ISNAN: return "isnan";
+ case OPCODE_ISINF: return "isinf";
+ case OPCODE_NOT: return "not";
+ case OPCODE_OR: return "or";
+ case OPCODE_XOR: return "xor";
+ case OPCODE_AND: return "and";
+ case OPCODE_EQ: return "eq";
+ case OPCODE_NE: return "neq";
+ case OPCODE_FORWARD1: return "forward1";
+ case OPCODE_FORWARD2: return "forward2";
+ case OPCODE_FORWARD3: return "forward3";
+ case OPCODE_FORWARD4: return "forward4";
+ case OPCODE_REFLECT1: return "reflect1";
+ case OPCODE_REFLECT2: return "reflect2";
+ case OPCODE_REFLECT3: return "reflect3";
+ case OPCODE_REFLECT4: return "reflect4";
+ case OPCODE_REFRACT1: return "refract1";
+ case OPCODE_REFRACT2: return "refract2";
+ case OPCODE_REFRACT3: return "refract3";
+ case OPCODE_REFRACT4: return "refract4";
+ case OPCODE_LEAVE: return "leave";
+ case OPCODE_CONTINUE: return "continue";
+ case OPCODE_TEST: return "test";
+ case OPCODE_SWITCH: return "switch";
+ case OPCODE_ENDSWITCH: return "endswitch";
+ default: // Opcode without a mnemonic in this table.
+ ASSERT(false);
+ }
+
+ return "<unknown>"; // Reached from the default case if ASSERT returns.
+ }
+
+ // Returns the suffix appended to the mnemonic. For every opcode except loop,
+ // breakc, ifc and cmp the project ("p") and bias ("b") flags take precedence.
+ // Otherwise the control value 1..6 selects a comparison suffix, and any
+ // other value gives an empty string.
+ std::string Shader::Instruction::controlString() const
+ {
+ 	const bool controlOnly = opcode == OPCODE_LOOP ||
+ 	                         opcode == OPCODE_BREAKC ||
+ 	                         opcode == OPCODE_IFC ||
+ 	                         opcode == OPCODE_CMP;
+
+ 	if(!controlOnly)
+ 	{
+ 		if(project) return "p";
+
+ 		if(bias) return "b";
+
+ 		// FIXME: LOD
+ 	}
+
+ 	const char *const suffix[] = {"", "_gt", "_eq", "_ge", "_lt", "_ne", "_le"};
+
+ 	if(control >= 1 && control <= 6)
+ 	{
+ 		return suffix[control];
+ 	}
+
+ 	// ASSERT(false);   // FIXME
+ 	return "";
+ }
+
+ // Names the register: float4 literals print as "{a, b, c, d}", register
+ // kinds without a number print just their type string, and everything else
+ // prints the type string followed by the index.
+ std::string Shader::Parameter::string(ShaderType shaderType, unsigned short version) const
+ {
+ 	if(type == PARAMETER_FLOAT4LITERAL)
+ 	{
+ 		std::ostringstream literal;
+ 		literal << '{' << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << '}';
+
+ 		return literal.str();
+ 	}
+
+ 	const bool unnumbered = type == PARAMETER_RASTOUT ||
+ 	                        (type == PARAMETER_ADDR && shaderType == SHADER_VERTEX) ||
+ 	                        type == PARAMETER_LOOP ||
+ 	                        type == PARAMETER_PREDICATE ||
+ 	                        type == PARAMETER_MISCTYPE;
+
+ 	if(unnumbered)
+ 	{
+ 		return typeString(shaderType, version);
+ 	}
+
+ 	std::ostringstream number;
+ 	number << index;
+
+ 	return typeString(shaderType, version) + number.str();
+ }
+
+ std::string Shader::Parameter::typeString(ShaderType shaderType, unsigned short version) const // Returns the register-file prefix (or full name) for this parameter's type.
+ {
+ switch(type)
+ {
+ case PARAMETER_TEMP: return "r";
+ case PARAMETER_INPUT: return "v";
+ case PARAMETER_CONST: return "c";
+ case PARAMETER_TEXTURE:
+ // case PARAMETER_ADDR:
+ if(shaderType == SHADER_PIXEL) return "t"; // "t" in pixel shaders, "a0" otherwise. NOTE(review): the commented-out case suggests PARAMETER_ADDR shares this value — confirm in the header.
+ else return "a0";
+ case PARAMETER_RASTOUT:
+ if(index == 0) return "oPos";
+ else if(index == 1) return "oFog";
+ else if(index == 2) return "oPts";
+ else ASSERT(false); // No return here: if ASSERT returns, control falls through to PARAMETER_ATTROUT and yields "oD".
+ case PARAMETER_ATTROUT: return "oD";
+ case PARAMETER_TEXCRDOUT:
+ // case PARAMETER_OUTPUT: return "";
+ if(version < 0x0300) return "oT"; // "oT" before version 0x0300, "o" from then on.
+ else return "o";
+ case PARAMETER_CONSTINT: return "i";
+ case PARAMETER_COLOROUT: return "oC";
+ case PARAMETER_DEPTHOUT: return "oDepth";
+ case PARAMETER_SAMPLER: return "s";
+ // case PARAMETER_CONST2: return "";
+ // case PARAMETER_CONST3: return "";
+ // case PARAMETER_CONST4: return "";
+ case PARAMETER_CONSTBOOL: return "b";
+ case PARAMETER_LOOP: return "aL";
+ // case PARAMETER_TEMPFLOAT16: return "";
+ case PARAMETER_MISCTYPE:
+ switch(index) // Misc registers are named by index rather than numbered.
+ {
+ case VPosIndex: return "vPos";
+ case VFaceIndex: return "vFace";
+ case InstanceIDIndex: return "iID";
+ case VertexIDIndex: return "vID";
+ default: ASSERT(false); // No return here: if ASSERT returns, control falls through to PARAMETER_LABEL and yields "l".
+ }
+ case PARAMETER_LABEL: return "l";
+ case PARAMETER_PREDICATE: return "p0";
+ case PARAMETER_FLOAT4LITERAL: return "";
+ case PARAMETER_BOOL1LITERAL: return "";
+ case PARAMETER_INT4LITERAL: return "";
+ // case PARAMETER_VOID: return "";
+ default:
+ ASSERT(false);
+ }
+
+ return ""; // Reached from the outer default case if ASSERT returns.
+ }
+
+ // True for the conditional opcodes if and ifc.
+ bool Shader::Instruction::isBranch() const
+ {
+ 	switch(opcode)
+ 	{
+ 	case OPCODE_IF:
+ 	case OPCODE_IFC:
+ 		return true;
+ 	default:
+ 		return false;
+ 	}
+ }
+
+ // True for call and callnz.
+ bool Shader::Instruction::isCall() const
+ {
+ 	switch(opcode)
+ 	{
+ 	case OPCODE_CALL:
+ 	case OPCODE_CALLNZ:
+ 		return true;
+ 	default:
+ 		return false;
+ 	}
+ }
+
+ // True for break, breakc and breakp.
+ bool Shader::Instruction::isBreak() const
+ {
+ 	switch(opcode)
+ 	{
+ 	case OPCODE_BREAK:
+ 	case OPCODE_BREAKC:
+ 	case OPCODE_BREAKP:
+ 		return true;
+ 	default:
+ 		return false;
+ 	}
+ }
+
+ // True for the loop-opening opcodes loop, rep and while.
+ bool Shader::Instruction::isLoop() const
+ {
+ 	switch(opcode)
+ 	{
+ 	case OPCODE_LOOP:
+ 	case OPCODE_REP:
+ 	case OPCODE_WHILE:
+ 		return true;
+ 	default:
+ 		return false;
+ 	}
+ }
+
+ // True for the loop-closing opcodes endloop, endrep and endwhile.
+ bool Shader::Instruction::isEndLoop() const
+ {
+ 	switch(opcode)
+ 	{
+ 	case OPCODE_ENDLOOP:
+ 	case OPCODE_ENDREP:
+ 	case OPCODE_ENDWHILE:
+ 		return true;
+ 	default:
+ 		return false;
+ 	}
+ }
+
+ // True when the instruction has an explicit predicate or when any of the
+ // branch, break, continue or leave analysis flags is set.
+ bool Shader::Instruction::isPredicated() const
+ {
+ 	if(predicate) return true;
+ 	if(analysisBranch) return true;
+ 	if(analysisBreak) return true;
+ 	if(analysisContinue) return true;
+ 	if(analysisLeave) return true;
+
+ 	return false;
+ }
+
+ Shader::Shader() : serialID(serialCounter++) // Takes the current serialCounter value as this shader's ID and advances the counter. NOTE(review): the increment is a plain read-modify-write, not an atomic one.
+ {
+ usedSamplers = 0; // Starts at zero; NOTE(review): presumably a bit mask of sampler indices — confirm against its users.
+ }
+
+ // Deletes every instruction held by this shader and clears each slot.
+ Shader::~Shader()
+ {
+ 	for(auto slot = instruction.begin(); slot != instruction.end(); ++slot)
+ 	{
+ 		delete *slot;
+ 		*slot = nullptr;
+ 	}
+ }
+
+ // Decodes a token stream into the instruction list. token[0] is the version
+ // token, which supplies the minor version, major version and shader type.
+ // The instruction count comes from the matching validate() function, and
+ // comment tokens in the stream are skipped.
+ void Shader::parse(const unsigned long *token)
+ {
+ 	const unsigned long versionToken = token[0];
+
+ 	minorVersion = (unsigned char)(versionToken & 0x000000FF);
+ 	majorVersion = (unsigned char)((versionToken & 0x0000FF00) >> 8);
+ 	shaderType = (ShaderType)((versionToken & 0xFFFF0000) >> 16);
+
+ 	int length = 0;
+
+ 	if(shaderType == SHADER_VERTEX)
+ 	{
+ 		length = VertexShader::validate(token);
+ 	}
+ 	else if(shaderType == SHADER_PIXEL)
+ 	{
+ 		length = PixelShader::validate(token);
+ 	}
+ 	else ASSERT(false);
+
+ 	ASSERT(length != 0);
+ 	instruction.resize(length);
+
+ 	for(auto &slot : instruction)
+ 	{
+ 		while((*token & 0x0000FFFF) == 0x0000FFFE)   // Comment token
+ 		{
+ 			const int commentLength = (*token & 0x7FFF0000) >> 16;
+
+ 			token += commentLength + 1;
+ 		}
+
+ 		// Number of tokens that follow the operation token.
+ 		const int operandTokens = size(*token);
+
+ 		slot = new Instruction(token, operandTokens, majorVersion);
+
+ 		token += operandTokens + 1;
+ 	}
+ }
+
+ // Convenience overload: looks up the token count using this shader's own
+ // shaderModel.
+ int Shader::size(unsigned long opcode) const
+ {
+ 	const int tokenCount = size(opcode, shaderModel);
+
+ 	return tokenCount;
+ }
+
+ // Returns the number of tokens that follow the given operation token.
+ //
+ // opcode      - the complete operation token, not just its low 16 bits.
+ // shaderModel - version in 0xMMmm form; values above 0x0300 trip an assert.
+ //
+ // Comment tokens report the length stored in bits 16..30. Version tokens,
+ // phase and end have no operands. From shader model 2.0 on the length is read
+ // from bits 24..27 of the token; older models use the fixed table below, where
+ // -1 marks a reserved opcode. An opcode beyond the end of the table is treated
+ // like a reserved one instead of indexing past the array. A negative result
+ // trips an assert and is then returned as is. Shader model 1.4 adds one token
+ // to tex and texcoord.
+ int Shader::size(unsigned long opcode, unsigned short shaderModel)
+ {
+ 	if(shaderModel > 0x0300)
+ 	{
+ 		ASSERT(false);
+ 	}
+
+ 	static const signed char size[] =
+ 	{
+ 		0,   // NOP = 0
+ 		2,   // MOV
+ 		3,   // ADD
+ 		3,   // SUB
+ 		4,   // MAD
+ 		3,   // MUL
+ 		2,   // RCP
+ 		2,   // RSQ
+ 		3,   // DP3
+ 		3,   // DP4
+ 		3,   // MIN
+ 		3,   // MAX
+ 		3,   // SLT
+ 		3,   // SGE
+ 		2,   // EXP
+ 		2,   // LOG
+ 		2,   // LIT
+ 		3,   // DST
+ 		4,   // LRP
+ 		2,   // FRC
+ 		3,   // M4x4
+ 		3,   // M4x3
+ 		3,   // M3x4
+ 		3,   // M3x3
+ 		3,   // M3x2
+ 		1,   // CALL
+ 		2,   // CALLNZ
+ 		2,   // LOOP
+ 		0,   // RET
+ 		0,   // ENDLOOP
+ 		1,   // LABEL
+ 		2,   // DCL
+ 		3,   // POW
+ 		3,   // CRS
+ 		4,   // SGN
+ 		2,   // ABS
+ 		2,   // NRM
+ 		4,   // SINCOS
+ 		1,   // REP
+ 		0,   // ENDREP
+ 		1,   // IF
+ 		2,   // IFC
+ 		0,   // ELSE
+ 		0,   // ENDIF
+ 		0,   // BREAK
+ 		2,   // BREAKC
+ 		2,   // MOVA
+ 		2,   // DEFB
+ 		5,   // DEFI
+ 		-1,  // 49
+ 		-1,  // 50
+ 		-1,  // 51
+ 		-1,  // 52
+ 		-1,  // 53
+ 		-1,  // 54
+ 		-1,  // 55
+ 		-1,  // 56
+ 		-1,  // 57
+ 		-1,  // 58
+ 		-1,  // 59
+ 		-1,  // 60
+ 		-1,  // 61
+ 		-1,  // 62
+ 		-1,  // 63
+ 		1,   // TEXCOORD = 64
+ 		1,   // TEXKILL
+ 		1,   // TEX
+ 		2,   // TEXBEM
+ 		2,   // TEXBEML
+ 		2,   // TEXREG2AR
+ 		2,   // TEXREG2GB
+ 		2,   // TEXM3x2PAD
+ 		2,   // TEXM3x2TEX
+ 		2,   // TEXM3x3PAD
+ 		2,   // TEXM3x3TEX
+ 		-1,  // RESERVED0
+ 		3,   // TEXM3x3SPEC
+ 		2,   // TEXM3x3VSPEC
+ 		2,   // EXPP
+ 		2,   // LOGP
+ 		4,   // CND
+ 		5,   // DEF
+ 		2,   // TEXREG2RGB
+ 		2,   // TEXDP3TEX
+ 		2,   // TEXM3x2DEPTH
+ 		2,   // TEXDP3
+ 		2,   // TEXM3x3
+ 		1,   // TEXDEPTH
+ 		4,   // CMP
+ 		3,   // BEM
+ 		4,   // DP2ADD
+ 		2,   // DSX
+ 		2,   // DSY
+ 		5,   // TEXLDD
+ 		3,   // SETP
+ 		3,   // TEXLDL
+ 		2,   // BREAKP
+ 		-1,  // 97
+ 		-1,  // 98
+ 		-1,  // 99
+ 		-1,  // 100
+ 		-1,  // 101
+ 		-1,  // 102
+ 		-1,  // 103
+ 		-1,  // 104
+ 		-1,  // 105
+ 		-1,  // 106
+ 		-1,  // 107
+ 		-1,  // 108
+ 		-1,  // 109
+ 		-1,  // 110
+ 		-1,  // 111
+ 		-1,  // 112
+ 	};
+
+ 	int length = 0;
+
+ 	if((opcode & 0x0000FFFF) == OPCODE_COMMENT)
+ 	{
+ 		return (opcode & 0x7FFF0000) >> 16;
+ 	}
+
+ 	if(opcode != OPCODE_PS_1_0 &&
+ 	   opcode != OPCODE_PS_1_1 &&
+ 	   opcode != OPCODE_PS_1_2 &&
+ 	   opcode != OPCODE_PS_1_3 &&
+ 	   opcode != OPCODE_PS_1_4 &&
+ 	   opcode != OPCODE_PS_2_0 &&
+ 	   opcode != OPCODE_PS_2_x &&
+ 	   opcode != OPCODE_PS_3_0 &&
+ 	   opcode != OPCODE_VS_1_0 &&
+ 	   opcode != OPCODE_VS_1_1 &&
+ 	   opcode != OPCODE_VS_2_0 &&
+ 	   opcode != OPCODE_VS_2_x &&
+ 	   opcode != OPCODE_VS_2_sw &&
+ 	   opcode != OPCODE_VS_3_0 &&
+ 	   opcode != OPCODE_VS_3_sw &&
+ 	   opcode != OPCODE_PHASE &&
+ 	   opcode != OPCODE_END)
+ 	{
+ 		if(shaderModel >= 0x0200)
+ 		{
+ 			length = (opcode & 0x0F000000) >> 24;
+ 		}
+ 		else
+ 		{
+ 			const unsigned long tableIndex = opcode & 0x0000FFFF;
+ 			const unsigned long tableEntries = sizeof(size) / sizeof(size[0]);
+
+ 			// Opcodes beyond the table are handled like its reserved (-1)
+ 			// entries rather than read from outside the array.
+ 			length = (tableIndex < tableEntries) ? size[tableIndex] : -1;
+ 		}
+ 	}
+
+ 	if(length < 0)
+ 	{
+ 		ASSERT(false);
+ 	}
+
+ 	if(shaderModel == 0x0104)
+ 	{
+ 		switch(opcode & 0x0000FFFF)
+ 		{
+ 		case OPCODE_TEX:
+ 			length += 1;
+ 			break;
+ 		case OPCODE_TEXCOORD:
+ 			length += 1;
+ 			break;
+ 		default:
+ 			break;
+ 		}
+ 	}
+
+ 	return length;
+ }
+
+ // Tests whether bit 'component' is set in the write mask 'mask'.
+ bool Shader::maskContainsComponent(int mask, int component)
+ {
+     const int componentBit = 1 << component;
+
+     return (mask & componentBit) != 0;
+ }
+
+ // Tests whether any of the four 2-bit selectors packed in the low byte of
+ // 'swizzle' equals 'component'.
+ bool Shader::swizzleContainsComponent(int swizzle, int component)
+ {
+     for(int lane = 0; lane < 4; lane++)
+     {
+         const int selector = (swizzle >> (2 * lane)) & 0x03;
+
+         if(selector == component)
+         {
+             return true;
+         }
+     }
+
+     return false;
+ }
+
+ // Like swizzleContainsComponent(), but only inspects the selectors of lanes
+ // whose bit is set in the low four bits of 'mask'.
+ bool Shader::swizzleContainsComponentMasked(int swizzle, int component, int mask)
+ {
+     for(int lane = 0; lane < 4; lane++)
+     {
+         const bool laneEnabled = (mask & (1 << lane)) != 0;
+         const int selector = (swizzle >> (2 * lane)) & 0x03;
+
+         if(laneEnabled && selector == component)
+         {
+             return true;
+         }
+     }
+
+     return false;
+ }
+
+ // Accessors for the flags computed by analyzeDynamicBranching().
+
+ // True if a branching instruction was found whose first source is not a constant boolean.
+ bool Shader::containsDynamicBranching() const
+ {
+ return dynamicBranching;
+ }
+
+ // True if the shader contains an instruction for which isBreak() holds.
+ bool Shader::containsBreakInstruction() const
+ {
+ return containsBreak;
+ }
+
+ // True if the shader contains an OPCODE_CONTINUE instruction.
+ bool Shader::containsContinueInstruction() const
+ {
+ return containsContinue;
+ }
+
+ // True if the shader contains an OPCODE_LEAVE instruction.
+ bool Shader::containsLeaveInstruction() const
+ {
+ return containsLeave;
+ }
+
+ // Returns the containsDefine flag set by analyzeDynamicBranching().
+ bool Shader::containsDefineInstruction() const
+ {
+ return containsDefine;
+ }
+
+ // Returns true if sampler 'index' is flagged in the usedSamplers bit mask.
+ // The mask is 16 bits wide, so indices outside [0, 16) are never in use; they
+ // are rejected up front, which also keeps the shift below well defined.
+ bool Shader::usesSampler(int index) const
+ {
+     if(index < 0 || index >= 16)
+     {
+         return false;
+     }
+
+     return (usedSamplers & (1 << index)) != 0;
+ }
+
+ // Returns this shader's serialID member.
+ int Shader::getSerialID() const
+ {
+ return serialID;
+ }
+
+ // Returns the number of instructions currently stored in the shader.
+ size_t Shader::getLength() const
+ {
+ return instruction.size();
+ }
+
+ // Returns the shader type.
+ Shader::ShaderType Shader::getShaderType() const
+ {
+ return shaderType;
+ }
+
+ // Returns the shader model; it shares storage with majorVersion/minorVersion
+ // (e.g. 0x0300 and 0x0104 are compared against it in size()).
+ unsigned short Shader::getShaderModel() const
+ {
+ return shaderModel;
+ }
+
+ // Writes the textual form of every instruction, one per line, to a file.
+ // 'fileName' is a printf-style format for the file path; the variadic
+ // arguments are its parameters. The formatted path is limited to 1024 bytes.
+ void Shader::print(const char *fileName, ...) const
+ {
+     char path[1024 + 1];
+
+     va_list arguments;
+     va_start(arguments, fileName);
+     vsnprintf(path, 1024, fileName, arguments);
+     va_end(arguments);
+
+     std::ofstream output(path, std::ofstream::out);
+
+     for(size_t i = 0; i < instruction.size(); i++)
+     {
+         output << instruction[i]->string(shaderType, shaderModel) << std::endl;
+     }
+ }
+
+ // Appends the textual form of instruction 'index' to the file 'fileName'.
+ // An out-of-range index asserts and writes nothing, rather than indexing the
+ // instruction vector out of bounds.
+ void Shader::printInstruction(int index, const char *fileName) const
+ {
+     const bool inRange = index >= 0 && static_cast<size_t>(index) < instruction.size();
+     ASSERT(inRange);
+
+     if(!inRange)
+     {
+         return;
+     }
+
+     std::ofstream file(fileName, std::ofstream::out | std::ofstream::app);
+
+     file << instruction[index]->string(shaderType, shaderModel) << std::endl;
+ }
+
+ // Appends 'instruction' to the end of the instruction list. The pointer is stored
+ // as-is; removeNull() later deletes stored instructions that were marked OPCODE_NULL.
+ void Shader::append(Instruction *instruction)
+ {
+ this->instruction.push_back(instruction);
+ }
+
+ // Flags sampler 'i' as used. Indices outside [0, 16) are ignored.
+ void Shader::declareSampler(int i)
+ {
+     const bool inRange = (i >= 0) && (i < 16);
+
+     if(!inRange)
+     {
+         return;
+     }
+
+     usedSamplers |= 1 << i;
+ }
+
+ // Returns instruction 'i'. The index must be below getLength(); this is only
+ // checked by an assert.
+ const Shader::Instruction *Shader::getInstruction(size_t i) const
+ {
+ ASSERT(i < instruction.size());
+
+ return instruction[i];
+ }
+
+ // Runs the optimization passes. The first two passes only mark instructions as
+ // OPCODE_NULL; removeNull() must run last to actually drop them.
+ void Shader::optimize()
+ {
+ optimizeLeave();
+ optimizeCall();
+ removeNull();
+ }
+
+ // Marks redundant OPCODE_LEAVE instructions as OPCODE_NULL: a leave that is the
+ // very last instruction, or that is directly followed by OPCODE_RET, has no effect.
+ void Shader::optimizeLeave()
+ {
+     const size_t count = instruction.size();
+
+     for(size_t index = 0; index < count; index++)
+     {
+         Instruction *current = instruction[index];
+
+         if(current->opcode != OPCODE_LEAVE)
+         {
+             continue;
+         }
+
+         const bool isLast = (index + 1 == count);
+
+         if(isLast || instruction[index + 1]->opcode == OPCODE_RET)
+         {
+             current->opcode = OPCODE_NULL;
+         }
+     }
+ }
+
+ // Marks the instructions of functions that are never called as OPCODE_NULL, and
+ // flattens the "call; ret" entry sequence when exactly one function remains.
+ void Shader::optimizeCall()
+ {
+     std::set<int> liveLabels;
+     bool removedFunction;
+
+     // Removing a function may remove the only call to another one, so repeat
+     // until a full pass removes nothing.
+     do
+     {
+         liveLabels.clear();
+         removedFunction = false;
+
+         for(size_t i = 0; i < instruction.size(); i++)
+         {
+             if(instruction[i]->isCall())
+             {
+                 liveLabels.insert(instruction[i]->dst.label);
+             }
+         }
+
+         if(!liveLabels.empty())
+         {
+             for(unsigned int i = 0; i < instruction.size(); i++)
+             {
+                 if(instruction[i]->opcode != OPCODE_LABEL)
+                 {
+                     continue;
+                 }
+
+                 if(liveLabels.count(instruction[i]->dst.label) != 0)
+                 {
+                     continue;
+                 }
+
+                 // Uncalled function: kill everything from its label up to and including its RET.
+                 for( ; i < instruction.size(); i++)
+                 {
+                     const bool wasReturn = (instruction[i]->opcode == OPCODE_RET);
+                     instruction[i]->opcode = OPCODE_NULL;
+
+                     if(wasReturn)
+                     {
+                         removedFunction = true;
+                         break;
+                     }
+                 }
+             }
+         }
+     }
+     while(removedFunction);
+
+     // Optimize the entry call
+     const bool entryIsCallRet = instruction.size() >= 2 &&
+                                 instruction[0]->opcode == OPCODE_CALL &&
+                                 instruction[1]->opcode == OPCODE_RET;
+
+     if(entryIsCallRet && liveLabels.size() == 1)
+     {
+         instruction[0]->opcode = OPCODE_NULL;
+         instruction[1]->opcode = OPCODE_NULL;
+
+         // With the call gone, the remaining labels and returns are dead as well.
+         for(size_t i = 2; i < instruction.size(); i++)
+         {
+             const Opcode op = instruction[i]->opcode;
+
+             if(op == OPCODE_LABEL || op == OPCODE_RET)
+             {
+                 instruction[i]->opcode = OPCODE_NULL;
+             }
+         }
+     }
+ }
+
+ // Deletes every instruction marked OPCODE_NULL and compacts the remaining ones
+ // to the front of the list, preserving their order.
+ void Shader::removeNull()
+ {
+     size_t kept = 0;
+
+     for(Instruction *current : instruction)
+     {
+         if(current->opcode == OPCODE_NULL)
+         {
+             delete current;
+             continue;
+         }
+
+         // 'kept' never runs ahead of the element being visited.
+         instruction[kept] = current;
+         kept++;
+     }
+
+     instruction.resize(kept);
+ }
+
+ // Computes, for each constant register file (float, integer, boolean), one past
+ // the highest register index written by a DEF / DEFI / DEFB instruction.
+ void Shader::analyzeDirtyConstants()
+ {
+     dirtyConstantsF = 0;
+     dirtyConstantsI = 0;
+     dirtyConstantsB = 0;
+
+     // Raises 'count' so that it covers register 'index'.
+     const auto cover = [](unsigned int &count, unsigned int index)
+     {
+         if(index + 1 > count)
+         {
+             count = index + 1;
+         }
+     };
+
+     for(const auto &inst : instruction)
+     {
+         const Opcode op = inst->opcode;
+
+         if(op == OPCODE_DEF)
+         {
+             cover(dirtyConstantsF, inst->dst.index);
+         }
+         else if(op == OPCODE_DEFI)
+         {
+             cover(dirtyConstantsI, inst->dst.index);
+         }
+         else if(op == OPCODE_DEFB)
+         {
+             cover(dirtyConstantsB, inst->dst.index);
+         }
+     }
+ }
+
+ // Determines which control-flow features the shader uses and flags every
+ // instruction that can be affected by a branch, break, continue or leave, so
+ // that the matching execution mask can be applied to it.
+ //
+ // Pass 1 sets the shader-wide flags (dynamicBranching, containsLeave,
+ // containsBreak, containsContinue, containsDefine).
+ // Pass 2 walks the instructions in order and sets the per-instruction analysis
+ // flags, propagating them into called functions through markFunctionAnalysis().
+ void Shader::analyzeDynamicBranching()
+ {
+     dynamicBranching = false;
+     containsLeave = false;
+     containsBreak = false;
+     containsContinue = false;
+     containsDefine = false;
+
+     // Determine global presence of branching instructions
+     for(const auto &inst : instruction)
+     {
+         switch(inst->opcode)
+         {
+         case OPCODE_CALLNZ:
+         case OPCODE_IF:
+         case OPCODE_IFC:
+         case OPCODE_BREAK:
+         case OPCODE_BREAKC:
+         case OPCODE_CMP:
+         case OPCODE_BREAKP:
+         case OPCODE_LEAVE:
+         case OPCODE_CONTINUE:
+             if(inst->src[0].type != PARAMETER_CONSTBOOL)
+             {
+                 dynamicBranching = true;
+             }
+
+             if(inst->opcode == OPCODE_LEAVE)
+             {
+                 containsLeave = true;
+             }
+
+             if(inst->isBreak())
+             {
+                 containsBreak = true;
+             }
+
+             if(inst->opcode == OPCODE_CONTINUE)
+             {
+                 containsContinue = true;
+             }
+             break;   // Must not fall through: a branch is not a constant definition
+         case OPCODE_DEF:
+         case OPCODE_DEFB:
+         case OPCODE_DEFI:
+             containsDefine = true;
+             break;
+         default:
+             break;
+         }
+     }
+
+     // Conservatively determine which instructions are affected by dynamic branching
+     int branchDepth = 0;
+     int breakDepth = 0;
+     int continueDepth = 0;
+     bool leaveReturn = false;
+     unsigned int functionBegin = 0;
+
+     for(unsigned int i = 0; i < instruction.size(); i++)
+     {
+         // If statements and loops
+         if(instruction[i]->isBranch() || instruction[i]->isLoop())
+         {
+             branchDepth++;
+         }
+         else if(instruction[i]->opcode == OPCODE_ENDIF || instruction[i]->isEndLoop())
+         {
+             branchDepth--;
+         }
+
+         if(branchDepth > 0)
+         {
+             instruction[i]->analysisBranch = true;
+
+             if(instruction[i]->isCall())
+             {
+                 markFunctionAnalysis(instruction[i]->dst.label, ANALYSIS_BRANCH);
+             }
+         }
+
+         // Break statement
+         if(instruction[i]->isBreak())
+         {
+             breakDepth++;
+         }
+
+         if(breakDepth > 0)
+         {
+             if(instruction[i]->isLoop() || instruction[i]->opcode == OPCODE_SWITCH)   // Nested loop or switch, don't make the end of it disable the break execution mask
+             {
+                 breakDepth++;
+             }
+             else if(instruction[i]->isEndLoop() || instruction[i]->opcode == OPCODE_ENDSWITCH)
+             {
+                 breakDepth--;
+             }
+
+             instruction[i]->analysisBreak = true;
+
+             if(instruction[i]->isCall())
+             {
+                 // The callee must receive the same flag as the call itself, just
+                 // as the continue and leave sections below do.
+                 markFunctionAnalysis(instruction[i]->dst.label, ANALYSIS_BREAK);
+             }
+         }
+
+         // Continue statement
+         if(instruction[i]->opcode == OPCODE_CONTINUE)
+         {
+             continueDepth++;
+         }
+
+         if(continueDepth > 0)
+         {
+             if(instruction[i]->isLoop() || instruction[i]->opcode == OPCODE_SWITCH)   // Nested loop or switch, don't make the end of it disable the continue execution mask
+             {
+                 continueDepth++;
+             }
+             else if(instruction[i]->isEndLoop() || instruction[i]->opcode == OPCODE_ENDSWITCH)
+             {
+                 continueDepth--;
+             }
+
+             instruction[i]->analysisContinue = true;
+
+             if(instruction[i]->isCall())
+             {
+                 markFunctionAnalysis(instruction[i]->dst.label, ANALYSIS_CONTINUE);
+             }
+         }
+
+         // Return (leave) statement
+         if(instruction[i]->opcode == OPCODE_LEAVE)
+         {
+             leaveReturn = true;
+
+             // Mark loop body instructions prior to the return statement
+             for(unsigned int l = functionBegin; l < i; l++)
+             {
+                 if(instruction[l]->isLoop())
+                 {
+                     for(unsigned int r = l + 1; r < i; r++)
+                     {
+                         instruction[r]->analysisLeave = true;
+                     }
+
+                     break;
+                 }
+             }
+         }
+         else if(instruction[i]->opcode == OPCODE_RET)   // End of the function
+         {
+             leaveReturn = false;
+         }
+         else if(instruction[i]->opcode == OPCODE_LABEL)
+         {
+             functionBegin = i;
+         }
+
+         if(leaveReturn)
+         {
+             instruction[i]->analysisLeave = true;
+
+             if(instruction[i]->isCall())
+             {
+                 markFunctionAnalysis(instruction[i]->dst.label, ANALYSIS_LEAVE);
+             }
+         }
+     }
+ }
+
+ // ORs 'flag' into the analysis bits of every instruction in the body of the
+ // function labelled 'functionLabel' (after its LABEL, up to but excluding its
+ // RET), and recursively into every function that body calls.
+ void Shader::markFunctionAnalysis(unsigned int functionLabel, Analysis flag)
+ {
+     bool insideFunction = false;
+
+     for(auto &inst : instruction)
+     {
+         if(!insideFunction)
+         {
+             // Still searching for the function's label; the label itself is not marked.
+             insideFunction = (inst->opcode == OPCODE_LABEL && inst->dst.label == functionLabel);
+             continue;
+         }
+
+         if(inst->opcode == OPCODE_RET)
+         {
+             break;
+         }
+
+         if(inst->isCall())
+         {
+             markFunctionAnalysis(inst->dst.label, flag);
+         }
+
+         inst->analysis |= flag;
+     }
+ }
+
+ // Flags, in usedSamplers, every sampler referenced by a texture sampling
+ // instruction. From major version 2 on, the sampler is the second source
+ // operand (when it is of sampler type); in earlier versions the destination
+ // register index identifies the sampler.
+ //
+ // usedSamplers is a 16-bit mask, so register indices of 16 and above are
+ // ignored (as in declareSampler()) instead of being used as a shift count.
+ void Shader::analyzeSamplers()
+ {
+     const auto markUsed = [this](unsigned int index)
+     {
+         if(index < 16)
+         {
+             usedSamplers |= 1 << index;
+         }
+     };
+
+     for(const auto &inst : instruction)
+     {
+         switch(inst->opcode)
+         {
+         case OPCODE_TEX:
+         case OPCODE_TEXBEM:
+         case OPCODE_TEXBEML:
+         case OPCODE_TEXREG2AR:
+         case OPCODE_TEXREG2GB:
+         case OPCODE_TEXM3X2TEX:
+         case OPCODE_TEXM3X3TEX:
+         case OPCODE_TEXM3X3SPEC:
+         case OPCODE_TEXM3X3VSPEC:
+         case OPCODE_TEXREG2RGB:
+         case OPCODE_TEXDP3TEX:
+         case OPCODE_TEXM3X2DEPTH:
+         case OPCODE_TEXLDD:
+         case OPCODE_TEXLDL:
+         case OPCODE_TEXLOD:
+         case OPCODE_TEXOFFSET:
+         case OPCODE_TEXOFFSETBIAS:
+         case OPCODE_TEXLODOFFSET:
+         case OPCODE_TEXELFETCH:
+         case OPCODE_TEXELFETCHOFFSET:
+         case OPCODE_TEXGRAD:
+         case OPCODE_TEXGRADOFFSET:
+             {
+                 const Parameter &dst = inst->dst;
+                 const Parameter &src1 = inst->src[1];
+
+                 if(majorVersion >= 2)
+                 {
+                     if(src1.type == PARAMETER_SAMPLER)
+                     {
+                         markUsed(src1.index);
+                     }
+                 }
+                 else
+                 {
+                     markUsed(dst.index);
+                 }
+             }
+             break;
+         default:
+             break;
+         }
+     }
+ }
+
+ // Assigns a unique index to each call instruction, on a per label basis.
+ // This is used to know what basic block to return to.
+ //
+ // The per-label counters live in a fixed-size table. A call whose label does
+ // not fit in it asserts and is skipped, rather than writing outside the table.
+ void Shader::analyzeCallSites()
+ {
+     const unsigned int maxLabels = 2048;
+     int callSiteIndex[maxLabels] = {0};
+
+     for(auto &inst : instruction)
+     {
+         if(inst->opcode == OPCODE_CALL || inst->opcode == OPCODE_CALLNZ)
+         {
+             const unsigned int label = inst->dst.label;
+
+             if(label >= maxLabels)
+             {
+                 ASSERT(false);
+                 continue;
+             }
+
+             inst->dst.callSite = callSiteIndex[label]++;
+         }
+     }
+ }
+
+ // Determines whether any temporary, input or output register is accessed with
+ // relative addressing (an operand whose rel.type is not PARAMETER_VOID).
+ //
+ // The destination and all source operands of every instruction are inspected.
+ // Instruction::src holds five operands; iterating the array itself keeps this
+ // scan complete if that count changes.
+ void Shader::analyzeIndirectAddressing()
+ {
+     indirectAddressableTemporaries = false;
+     indirectAddressableInput = false;
+     indirectAddressableOutput = false;
+
+     // Records the register file of 'operand' if it is relatively addressed.
+     const auto inspect = [this](const Parameter &operand)
+     {
+         if(operand.rel.type == PARAMETER_VOID)
+         {
+             return;
+         }
+
+         switch(operand.type)
+         {
+         case PARAMETER_TEMP:   indirectAddressableTemporaries = true; break;
+         case PARAMETER_INPUT:  indirectAddressableInput = true;       break;
+         case PARAMETER_OUTPUT: indirectAddressableOutput = true;      break;
+         default: break;
+         }
+     };
+
+     for(const auto &inst : instruction)
+     {
+         inspect(inst->dst);
+
+         for(const auto &source : inst->src)
+         {
+             inspect(source);
+         }
+     }
+ }
+}
diff --git a/src/Pipeline/Shader.hpp b/src/Pipeline/Shader.hpp
new file mode 100644
index 0000000..9e4a810
--- /dev/null
+++ b/src/Pipeline/Shader.hpp
@@ -0,0 +1,662 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Shader_hpp
+#define sw_Shader_hpp
+
+#include "Common/Types.hpp"
+
+#include <string>
+#include <vector>
+
+namespace sw
+{
+ class Shader
+ {
+ public:
+ // Kind of shader. The pixel and vertex values equal the upper 16 bits of the
+ // OPCODE_PS_* and OPCODE_VS_* version tokens respectively.
+ enum ShaderType
+ {
+ SHADER_PIXEL = 0xFFFF,
+ SHADER_VERTEX = 0xFFFE,
+ SHADER_GEOMETRY = 0xFFFD
+ };
+
+ // Instruction opcodes.
+ //
+ // The enumerators from OPCODE_NOP through OPCODE_BREAKP follow the order of
+ // d3d9types.h, and Shader::size() indexes its operand-count table with these
+ // values, so their numeric values must not change. The OPCODE_PS_* / OPCODE_VS_*
+ // values are the shader version tokens. Everything from OPCODE_NULL upward is
+ // numbered from 0x10000000.
+ enum Opcode
+ {
+ // Matches order in d3d9types.h
+ OPCODE_NOP = 0,
+ OPCODE_MOV,
+ OPCODE_ADD,
+ OPCODE_SUB,
+ OPCODE_MAD,
+ OPCODE_MUL,
+ OPCODE_RCPX,
+ OPCODE_RSQX,
+ OPCODE_DP3,
+ OPCODE_DP4,
+ OPCODE_MIN,
+ OPCODE_MAX,
+ OPCODE_SLT,
+ OPCODE_SGE,
+ OPCODE_EXP2X, // D3DSIO_EXP
+ OPCODE_LOG2X, // D3DSIO_LOG
+ OPCODE_LIT,
+ OPCODE_ATT, // D3DSIO_DST
+ OPCODE_LRP,
+ OPCODE_FRC,
+ OPCODE_M4X4,
+ OPCODE_M4X3,
+ OPCODE_M3X4,
+ OPCODE_M3X3,
+ OPCODE_M3X2,
+ OPCODE_CALL,
+ OPCODE_CALLNZ,
+ OPCODE_LOOP,
+ OPCODE_RET,
+ OPCODE_ENDLOOP,
+ OPCODE_LABEL,
+ OPCODE_DCL,
+ OPCODE_POWX,
+ OPCODE_CRS,
+ OPCODE_SGN,
+ OPCODE_ABS,
+ OPCODE_NRM3, // D3DSIO_NRM
+ OPCODE_SINCOS,
+ OPCODE_REP,
+ OPCODE_ENDREP,
+ OPCODE_IF,
+ OPCODE_IFC,
+ OPCODE_ELSE,
+ OPCODE_ENDIF,
+ OPCODE_BREAK,
+ OPCODE_BREAKC,
+ OPCODE_MOVA,
+ OPCODE_DEFB,
+ OPCODE_DEFI,
+
+ OPCODE_TEXCOORD = 64,
+ OPCODE_TEXKILL,
+ OPCODE_TEX,
+ OPCODE_TEXBEM,
+ OPCODE_TEXBEML,
+ OPCODE_TEXREG2AR,
+ OPCODE_TEXREG2GB,
+ OPCODE_TEXM3X2PAD,
+ OPCODE_TEXM3X2TEX,
+ OPCODE_TEXM3X3PAD,
+ OPCODE_TEXM3X3TEX,
+ OPCODE_RESERVED0,
+ OPCODE_TEXM3X3SPEC,
+ OPCODE_TEXM3X3VSPEC,
+ OPCODE_EXPP,
+ OPCODE_LOGP,
+ OPCODE_CND,
+ OPCODE_DEF,
+ OPCODE_TEXREG2RGB,
+ OPCODE_TEXDP3TEX,
+ OPCODE_TEXM3X2DEPTH,
+ OPCODE_TEXDP3,
+ OPCODE_TEXM3X3,
+ OPCODE_TEXDEPTH,
+ OPCODE_CMP0, // D3DSIO_CMP
+ OPCODE_BEM,
+ OPCODE_DP2ADD,
+ OPCODE_DFDX, // D3DSIO_DSX
+ OPCODE_DFDY, // D3DSIO_DSY
+ OPCODE_TEXLDD,
+ OPCODE_CMP, // D3DSIO_SETP
+ OPCODE_TEXLDL,
+ OPCODE_BREAKP,
+
+ OPCODE_PHASE = 0xFFFD,
+ OPCODE_COMMENT = 0xFFFE,
+ OPCODE_END = 0xFFFF,
+
+ // Version tokens: shader type in the upper 16 bits, major/minor version in the lower 16
+ OPCODE_PS_1_0 = 0xFFFF0100,
+ OPCODE_PS_1_1 = 0xFFFF0101,
+ OPCODE_PS_1_2 = 0xFFFF0102,
+ OPCODE_PS_1_3 = 0xFFFF0103,
+ OPCODE_PS_1_4 = 0xFFFF0104,
+ OPCODE_PS_2_0 = 0xFFFF0200,
+ OPCODE_PS_2_x = 0xFFFF0201,
+ OPCODE_PS_3_0 = 0xFFFF0300,
+
+ OPCODE_VS_1_0 = 0xFFFE0100,
+ OPCODE_VS_1_1 = 0xFFFE0101,
+ OPCODE_VS_2_0 = 0xFFFE0200,
+ OPCODE_VS_2_x = 0xFFFE0201,
+ OPCODE_VS_2_sw = 0xFFFE02FF,
+ OPCODE_VS_3_0 = 0xFFFE0300,
+ OPCODE_VS_3_sw = 0xFFFE03FF,
+
+ OPCODE_NULL = 0x10000000, // Dead instruction, to be eliminated
+ OPCODE_WHILE,
+ OPCODE_ENDWHILE,
+ OPCODE_COS,
+ OPCODE_SIN,
+ OPCODE_TAN,
+ OPCODE_ACOS,
+ OPCODE_ASIN,
+ OPCODE_ATAN,
+ OPCODE_ATAN2,
+ OPCODE_COSH,
+ OPCODE_SINH,
+ OPCODE_TANH,
+ OPCODE_ACOSH,
+ OPCODE_ASINH,
+ OPCODE_ATANH,
+ OPCODE_DP1,
+ OPCODE_DP2,
+ OPCODE_TRUNC,
+ OPCODE_FLOOR,
+ OPCODE_ROUND,
+ OPCODE_ROUNDEVEN,
+ OPCODE_CEIL,
+ OPCODE_SQRT,
+ OPCODE_RSQ,
+ OPCODE_LEN2,
+ OPCODE_LEN3,
+ OPCODE_LEN4,
+ OPCODE_DIST1,
+ OPCODE_DIST2,
+ OPCODE_DIST3,
+ OPCODE_DIST4,
+ OPCODE_NRM2,
+ OPCODE_NRM4,
+ OPCODE_DIV,
+ OPCODE_MOD,
+ OPCODE_EXP2,
+ OPCODE_LOG2,
+ OPCODE_EXP,
+ OPCODE_LOG,
+ OPCODE_POW,
+ OPCODE_F2B, // Float to bool
+ OPCODE_B2F, // Bool to float
+ OPCODE_F2I, // Float to int
+ OPCODE_I2F, // Int to float
+ OPCODE_F2U, // Float to uint
+ OPCODE_U2F, // Uint to float
+ OPCODE_I2B, // Int to bool
+ OPCODE_B2I, // Bool to int
+ OPCODE_DET2,
+ OPCODE_DET3,
+ OPCODE_DET4,
+ OPCODE_ALL,
+ OPCODE_ANY,
+ OPCODE_NEG,
+ OPCODE_NOT,
+ OPCODE_OR,
+ OPCODE_XOR,
+ OPCODE_AND,
+ OPCODE_EQ,
+ OPCODE_NE,
+ OPCODE_STEP,
+ OPCODE_SMOOTH,
+ OPCODE_ISNAN,
+ OPCODE_ISINF,
+ OPCODE_TEXOFFSET,
+ OPCODE_TEXLODOFFSET,
+ OPCODE_TEXELFETCH,
+ OPCODE_TEXELFETCHOFFSET,
+ OPCODE_TEXGRAD,
+ OPCODE_TEXGRADOFFSET,
+ OPCODE_TEXBIAS,
+ OPCODE_TEXLOD,
+ OPCODE_TEXOFFSETBIAS,
+ OPCODE_TEXSIZE,
+ OPCODE_FLOATBITSTOINT,
+ OPCODE_FLOATBITSTOUINT,
+ OPCODE_INTBITSTOFLOAT,
+ OPCODE_UINTBITSTOFLOAT,
+ OPCODE_PACKSNORM2x16,
+ OPCODE_PACKUNORM2x16,
+ OPCODE_PACKHALF2x16,
+ OPCODE_UNPACKSNORM2x16,
+ OPCODE_UNPACKUNORM2x16,
+ OPCODE_UNPACKHALF2x16,
+ OPCODE_FORWARD1,
+ OPCODE_FORWARD2,
+ OPCODE_FORWARD3,
+ OPCODE_FORWARD4,
+ OPCODE_REFLECT1,
+ OPCODE_REFLECT2,
+ OPCODE_REFLECT3,
+ OPCODE_REFLECT4,
+ OPCODE_REFRACT1,
+ OPCODE_REFRACT2,
+ OPCODE_REFRACT3,
+ OPCODE_REFRACT4,
+ OPCODE_ICMP,
+ OPCODE_UCMP,
+ OPCODE_SELECT,
+ OPCODE_EXTRACT,
+ OPCODE_INSERT,
+ OPCODE_DISCARD,
+ OPCODE_FWIDTH,
+ OPCODE_LEAVE, // Return before the end of the function
+ OPCODE_CONTINUE,
+ OPCODE_TEST, // Marks the end of the code that can be skipped by 'continue'
+ OPCODE_SWITCH,
+ OPCODE_ENDSWITCH,
+
+ // Integer opcodes
+ OPCODE_INEG,
+ OPCODE_IABS,
+ OPCODE_ISGN,
+ OPCODE_IADD,
+ OPCODE_ISUB,
+ OPCODE_IMUL,
+ OPCODE_IDIV,
+ OPCODE_IMAD,
+ OPCODE_IMOD,
+ OPCODE_SHL,
+ OPCODE_ISHR,
+ OPCODE_IMIN,
+ OPCODE_IMAX,
+
+ // Unsigned integer opcodes
+ OPCODE_UDIV,
+ OPCODE_UMOD,
+ OPCODE_USHR,
+ OPCODE_UMIN,
+ OPCODE_UMAX,
+ };
+
+ static Opcode OPCODE_DP(int);
+ static Opcode OPCODE_LEN(int);
+ static Opcode OPCODE_DIST(int);
+ static Opcode OPCODE_NRM(int);
+ static Opcode OPCODE_FORWARD(int);
+ static Opcode OPCODE_REFLECT(int);
+ static Opcode OPCODE_REFRACT(int);
+
+ // Comparison selector stored in Instruction::control. The first and last
+ // values (0 and 7) are reserved.
+ enum Control
+ {
+ CONTROL_RESERVED0,
+ CONTROL_GT,
+ CONTROL_EQ,
+ CONTROL_GE,
+ CONTROL_LT,
+ CONTROL_NE,
+ CONTROL_LE,
+ CONTROL_RESERVED1
+ };
+
+ // Texture dimensionality of a sampler (see Instruction::samplerType).
+ enum SamplerType
+ {
+ SAMPLER_UNKNOWN,
+ SAMPLER_1D,
+ SAMPLER_2D,
+ SAMPLER_CUBE,
+ SAMPLER_VOLUME
+ };
+
+ // Register usage semantics (see Instruction::usage). The numeric values are
+ // spelled out explicitly and should be treated as fixed.
+ enum Usage // For vertex input/output declarations
+ {
+ USAGE_POSITION = 0,
+ USAGE_BLENDWEIGHT = 1,
+ USAGE_BLENDINDICES = 2,
+ USAGE_NORMAL = 3,
+ USAGE_PSIZE = 4,
+ USAGE_TEXCOORD = 5,
+ USAGE_TANGENT = 6,
+ USAGE_BINORMAL = 7,
+ USAGE_TESSFACTOR = 8,
+ USAGE_POSITIONT = 9,
+ USAGE_COLOR = 10,
+ USAGE_FOG = 11,
+ USAGE_DEPTH = 12,
+ USAGE_SAMPLE = 13
+ };
+
+ // Register file (or literal kind) that an instruction operand refers to.
+ //
+ // Values 0..19 are assigned explicitly; the literal types and PARAMETER_VOID
+ // continue the numbering from there. Two pairs deliberately share a value:
+ // PARAMETER_TEXTURE / PARAMETER_ADDR (3) and PARAMETER_TEXCRDOUT /
+ // PARAMETER_OUTPUT (6).
+ // NOTE(review): which name of a pair applies presumably depends on the shader
+ // type and version - confirm against the token parser.
+ enum ParameterType
+ {
+ PARAMETER_TEMP = 0,
+ PARAMETER_INPUT = 1,
+ PARAMETER_CONST = 2,
+ PARAMETER_TEXTURE = 3,
+ PARAMETER_ADDR = 3,
+ PARAMETER_RASTOUT = 4,
+ PARAMETER_ATTROUT = 5,
+ PARAMETER_TEXCRDOUT = 6,
+ PARAMETER_OUTPUT = 6,
+ PARAMETER_CONSTINT = 7,
+ PARAMETER_COLOROUT = 8,
+ PARAMETER_DEPTHOUT = 9,
+ PARAMETER_SAMPLER = 10,
+ PARAMETER_CONST2 = 11,
+ PARAMETER_CONST3 = 12,
+ PARAMETER_CONST4 = 13,
+ PARAMETER_CONSTBOOL = 14,
+ PARAMETER_LOOP = 15,
+ PARAMETER_TEMPFLOAT16 = 16,
+ PARAMETER_MISCTYPE = 17,
+ PARAMETER_LABEL = 18,
+ PARAMETER_PREDICATE = 19,
+
+ // PARAMETER_FLOAT1LITERAL,
+ // PARAMETER_FLOAT2LITERAL,
+ // PARAMETER_FLOAT3LITERAL,
+ PARAMETER_FLOAT4LITERAL,
+ PARAMETER_BOOL1LITERAL,
+ // PARAMETER_BOOL2LITERAL,
+ // PARAMETER_BOOL3LITERAL,
+ // PARAMETER_BOOL4LITERAL,
+ // PARAMETER_INT1LITERAL,
+ // PARAMETER_INT2LITERAL,
+ // PARAMETER_INT3LITERAL,
+ PARAMETER_INT4LITERAL,
+
+ // Marks an unused operand; also the default type of a Parameter
+ PARAMETER_VOID
+ };
+
+ // Register indices with a fixed meaning.
+ // NOTE(review): presumably these index the PARAMETER_MISCTYPE register file - confirm.
+ enum MiscParameterIndex
+ {
+ VPosIndex = 0,
+ VFaceIndex = 1,
+ InstanceIDIndex = 2,
+ VertexIDIndex = 3,
+ };
+
+ // Source operand modifier (see SourceParameter::modifier, which defaults to
+ // MODIFIER_NONE). The field holding it is 8 bits wide.
+ enum Modifier
+ {
+ MODIFIER_NONE,
+ MODIFIER_NEGATE,
+ MODIFIER_BIAS,
+ MODIFIER_BIAS_NEGATE,
+ MODIFIER_SIGN,
+ MODIFIER_SIGN_NEGATE,
+ MODIFIER_COMPLEMENT,
+ MODIFIER_X2,
+ MODIFIER_X2_NEGATE,
+ MODIFIER_DZ,
+ MODIFIER_DW,
+ MODIFIER_ABS,
+ MODIFIER_ABS_NEGATE,
+ MODIFIER_NOT
+ };
+
+ // Bit flags stored in Instruction::analysis. Each value must line up with the
+ // corresponding one-bit field of Instruction (analysisBranch, analysisBreak,
+ // analysisContinue, analysisLeave), which aliases the same storage.
+ enum Analysis
+ {
+ // Flags indicating whether an instruction is affected by an execution enable mask
+ ANALYSIS_BRANCH = 0x00000001,
+ ANALYSIS_BREAK = 0x00000002,
+ ANALYSIS_CONTINUE = 0x00000004,
+ ANALYSIS_LEAVE = 0x00000008,
+ };
+
+ // Relative (indirect) addressing descriptor of an operand. A 'type' of
+ // PARAMETER_VOID means the operand is not relatively addressed.
+ struct Relative
+ {
+ ParameterType type : 8; // Register file of the address register, or PARAMETER_VOID
+ unsigned int index;
+ unsigned int swizzle : 8;
+ unsigned int scale; // Initialized to 1 by Parameter's constructor
+ bool dynamic; // Varies between concurrent shader instances
+ };
+
+ // Common part of an instruction operand. 'type' selects which member of the
+ // anonymous union is meaningful: a register reference (index plus relative
+ // addressing), a literal constant, or a label with its call site.
+ // Note that the union members overlap, so e.g. 'rel' shares storage with
+ // value[1..3] and 'label' shares storage with 'index'.
+ struct Parameter
+ {
+ union
+ {
+ struct
+ {
+ unsigned int index; // For registers types
+
+ Relative rel;
+ };
+
+ float value[4]; // For float constants
+ int integer[4]; // For integer constants
+ int boolean[4]; // For boolean constants
+
+ struct
+ {
+ unsigned int label; // Label index
+ unsigned int callSite; // Call index (per label)
+ };
+ };
+
+ // Creates a void operand: register index 0 without relative addressing.
+ Parameter() : index(0), type(PARAMETER_VOID)
+ {
+ rel.type = PARAMETER_VOID;
+ rel.index = 0;
+ rel.swizzle = 0;
+ rel.scale = 1;
+ rel.dynamic = true;
+ }
+
+ std::string string(ShaderType shaderType, unsigned short version) const;
+ std::string typeString(ShaderType shaderType, unsigned short version) const;
+ std::string relativeString() const;
+
+ ParameterType type : 8;
+ };
+
+ // Destination operand: a Parameter plus a component write mask and result modifiers.
+ struct DestinationParameter : Parameter
+ {
+ // Write mask, accessible as a whole or as one flag per component
+ union
+ {
+ unsigned char mask;
+
+ struct
+ {
+ bool x : 1;
+ bool y : 1;
+ bool z : 1;
+ bool w : 1;
+ };
+ };
+
+ // By default all four components are written (mask 0xF) and no modifier is applied.
+ DestinationParameter() : mask(0xF), saturate(false), partialPrecision(false), centroid(false), shift(0)
+ {
+ }
+
+ std::string modifierString() const;
+ std::string shiftString() const;
+ std::string maskString() const;
+
+ bool saturate : 1;
+ bool partialPrecision : 1;
+ bool centroid : 1;
+ signed char shift : 4; // Signed 4-bit field
+ };
+
+ // Source operand: a Parameter plus a component swizzle and a modifier.
+ struct SourceParameter : Parameter
+ {
+ // The default swizzle 0xE4 selects components 0, 1, 2, 3 in order (two bits
+ // per lane, lane 0 in the lowest bits), i.e. the identity swizzle.
+ SourceParameter() : swizzle(0xE4), modifier(MODIFIER_NONE), bufferIndex(-1)
+ {
+ }
+
+ std::string string(ShaderType shaderType, unsigned short version) const;
+ std::string swizzleString() const;
+ std::string preModifierString() const;
+ std::string postModifierString() const;
+
+ unsigned int swizzle : 8;
+ Modifier modifier : 8;
+ int bufferIndex : 8; // Defaults to -1
+ };
+
+ // A single shader instruction: opcode, one destination operand, up to five
+ // source operands, and the per-instruction results of the shader analysis passes.
+ struct Instruction
+ {
+ explicit Instruction(Opcode opcode);
+ // Builds an instruction from a token stream; 'size' is the number of tokens
+ // following the instruction token (see Shader::size()).
+ Instruction(const unsigned long *token, int size, unsigned char majorVersion);
+
+ virtual ~Instruction();
+
+ void parseOperationToken(unsigned long token, unsigned char majorVersion);
+ void parseDeclarationToken(unsigned long token);
+ void parseDestinationToken(const unsigned long *token, unsigned char majorVersion);
+ void parseSourceToken(int i, const unsigned long *token, unsigned char majorVersion);
+
+ std::string string(ShaderType shaderType, unsigned short version) const;
+ static std::string swizzleString(ParameterType type, unsigned char swizzle);
+ std::string operationString(unsigned short version) const;
+ std::string controlString() const;
+
+ bool isBranch() const;
+ bool isCall() const;
+ bool isBreak() const;
+ bool isLoop() const;
+ bool isEndLoop() const;
+
+ bool isPredicated() const;
+
+ Opcode opcode;
+
+ // Instruction-specific control bits: either a comparison selector, or the
+ // texture load flags, sharing the same storage
+ union
+ {
+ Control control;
+
+ struct
+ {
+ unsigned char project : 1; // D3DSI_TEXLD_PROJECT
+ unsigned char bias : 1; // D3DSI_TEXLD_BIAS
+ };
+ };
+
+ bool predicate;
+ bool predicateNot; // Negative predicate
+ unsigned char predicateSwizzle;
+
+ bool coissue;
+ SamplerType samplerType;
+ Usage usage;
+ unsigned char usageIndex;
+
+ DestinationParameter dst;
+ SourceParameter src[5];
+
+ // Analysis results, accessible as a whole (to OR in Shader::Analysis flags)
+ // or as individual one-bit fields
+ union
+ {
+ unsigned int analysis;
+
+ struct
+ {
+ // Keep in sync with Shader::Analysis flags
+ unsigned int analysisBranch : 1;
+ unsigned int analysisBreak : 1;
+ unsigned int analysisContinue : 1;
+ unsigned int analysisLeave : 1;
+ };
+ };
+ };
+
+ Shader();
+
+ virtual ~Shader();
+
+ int getSerialID() const;
+ size_t getLength() const;
+ ShaderType getShaderType() const;
+ unsigned short getShaderModel() const;
+
+ void append(Instruction *instruction);
+ void declareSampler(int i);
+
+ const Instruction *getInstruction(size_t i) const;
+ int size(unsigned long opcode) const;
+ static int size(unsigned long opcode, unsigned short shaderModel);
+
+ void print(const char *fileName, ...) const;
+ void printInstruction(int index, const char *fileName) const;
+
+ static bool maskContainsComponent(int mask, int component);
+ static bool swizzleContainsComponent(int swizzle, int component);
+ static bool swizzleContainsComponentMasked(int swizzle, int component, int mask);
+
+ bool containsDynamicBranching() const;
+ bool containsBreakInstruction() const;
+ bool containsContinueInstruction() const;
+ bool containsLeaveInstruction() const;
+ bool containsDefineInstruction() const;
+ bool usesSampler(int i) const;
+
+ // Identifies a shader input/output by usage and usage index. A usage of 0xFF
+ // (the default) marks the semantic as inactive.
+ struct Semantic
+ {
+ // Note: 'centroid' always starts out false; it has no constructor parameter.
+ Semantic(unsigned char usage = 0xFF, unsigned char index = 0xFF, bool flat = false) : usage(usage), index(index), centroid(false), flat(flat)
+ {
+ }
+
+ // Two semantics are equal when usage and index match; the 'centroid' and
+ // 'flat' flags do not take part in the comparison.
+ bool operator==(const Semantic &semantic) const
+ {
+ return usage == semantic.usage && index == semantic.index;
+ }
+
+ // True unless the usage is the 0xFF "unused" marker.
+ bool active() const
+ {
+ return usage != 0xFF;
+ }
+
+ unsigned char usage;
+ unsigned char index;
+ bool centroid;
+ bool flat;
+ };
+
+ void optimize();
+
+ // FIXME: Private
+ unsigned int dirtyConstantsF;
+ unsigned int dirtyConstantsI;
+ unsigned int dirtyConstantsB;
+
+ bool indirectAddressableTemporaries;
+ bool indirectAddressableInput;
+ bool indirectAddressableOutput;
+
+ protected:
+ void parse(const unsigned long *token);
+
+ void optimizeLeave();
+ void optimizeCall();
+ void removeNull();
+
+ void analyzeDirtyConstants();
+ void analyzeDynamicBranching();
+ void analyzeSamplers();
+ void analyzeCallSites();
+ void analyzeIndirectAddressing();
+ void markFunctionAnalysis(unsigned int functionLabel, Analysis flag);
+
+ ShaderType shaderType;
+
+ union
+ {
+ unsigned short shaderModel;
+
+ struct
+ {
+ unsigned char minorVersion;
+ unsigned char majorVersion;
+ };
+ };
+
+ std::vector<Instruction*> instruction;
+
+ unsigned short usedSamplers; // Bit flags
+
+ private:
+ const int serialID;
+ static volatile int serialCounter;
+
+ bool dynamicBranching;
+ bool containsBreak;
+ bool containsContinue;
+ bool containsLeave;
+ bool containsDefine;
+ };
+}
+
+#endif // sw_Shader_hpp
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
new file mode 100644
index 0000000..4ea3260
--- /dev/null
+++ b/src/Pipeline/ShaderCore.cpp
@@ -0,0 +1,2006 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ShaderCore.hpp"
+
+#include "Renderer/Renderer.hpp"
+#include "Common/Debug.hpp"
+
+#include <limits.h>
+
+namespace sw
+{
+ extern TranscendentalPrecision logPrecision;
+ extern TranscendentalPrecision expPrecision;
+ extern TranscendentalPrecision rcpPrecision;
+ extern TranscendentalPrecision rsqPrecision;
+
+ // Vector4s: four Short4 components x, y, z and w.
+
+ // Leaves all components default-constructed.
+ Vector4s::Vector4s()
+ {
+ }
+
+ // Initializes each component from the Short4 constructed from the matching scalar.
+ Vector4s::Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
+ {
+ this->x = Short4(x);
+ this->y = Short4(y);
+ this->z = Short4(z);
+ this->w = Short4(w);
+ }
+
+ // Copies component by component, by assignment after default construction.
+ Vector4s::Vector4s(const Vector4s &rhs)
+ {
+ x = rhs.x;
+ y = rhs.y;
+ z = rhs.z;
+ w = rhs.w;
+ }
+
+ Vector4s &Vector4s::operator=(const Vector4s &rhs)
+ {
+ x = rhs.x;
+ y = rhs.y;
+ z = rhs.z;
+ w = rhs.w;
+
+ return *this;
+ }
+
+ // Returns component i (0 = x ... 3 = w). Any other index yields x.
+ Short4 &Vector4s::operator[](int i)
+ {
+ switch(i)
+ {
+ case 0: return x;
+ case 1: return y;
+ case 2: return z;
+ case 3: return w;
+ }
+
+ return x;
+ }
+
+ // Vector4f: four Float4 components x, y, z and w.
+
+ // Leaves all components default-constructed.
+ Vector4f::Vector4f()
+ {
+ }
+
+ // Initializes each component from the Float4 constructed from the matching scalar.
+ Vector4f::Vector4f(float x, float y, float z, float w)
+ {
+ this->x = Float4(x);
+ this->y = Float4(y);
+ this->z = Float4(z);
+ this->w = Float4(w);
+ }
+
+ // Copies component by component, by assignment after default construction.
+ Vector4f::Vector4f(const Vector4f &rhs)
+ {
+ x = rhs.x;
+ y = rhs.y;
+ z = rhs.z;
+ w = rhs.w;
+ }
+
+ Vector4f &Vector4f::operator=(const Vector4f &rhs)
+ {
+ x = rhs.x;
+ y = rhs.y;
+ z = rhs.z;
+ w = rhs.w;
+
+ return *this;
+ }
+
+ // Returns component i (0 = x ... 3 = w). Any other index yields x.
+ Float4 &Vector4f::operator[](int i)
+ {
+ switch(i)
+ {
+ case 0: return x;
+ case 1: return y;
+ case 2: return z;
+ case 3: return w;
+ }
+
+ return x;
+ }
+
+ // Approximates 2^x for each lane of x. The input is first clamped to
+ // [-126.99999, 129]. The 'pp' argument is not used by this implementation.
+ Float4 exponential2(RValue<Float4> x, bool pp)
+ {
+ // This implementation is based on 2^(i + f) = 2^i * 2^f,
+ // where i is the integer part of x and f is the fraction.
+
+ // For 2^i we can put the integer part directly in the exponent of
+ // the IEEE-754 floating-point number. Clamp to prevent overflow
+ // past the representation of infinity.
+ Float4 x0 = x;
+ x0 = Min(x0, As<Float4>(Int4(0x43010000))); // 129.00000e+0f
+ x0 = Max(x0, As<Float4>(Int4(0xC2FDFFFF))); // -126.99999e+0f
+
+ // Subtracting 0.5 before rounding yields the integer part i
+ Int4 i = RoundInt(x0 - Float4(0.5f));
+ Float4 ii = As<Float4>((i + Int4(127)) << 23); // Add single-precision bias, and shift into exponent.
+
+ // For the fractional part use a polynomial
+ // which approximates 2^f in the 0 to 1 range.
+ // The fifth-degree polynomial is evaluated with Horner's scheme.
+ Float4 f = x0 - Float4(i);
+ Float4 ff = As<Float4>(Int4(0x3AF61905)); // 1.8775767e-3f
+ ff = ff * f + As<Float4>(Int4(0x3C134806)); // 8.9893397e-3f
+ ff = ff * f + As<Float4>(Int4(0x3D64AA23)); // 5.5826318e-2f
+ ff = ff * f + As<Float4>(Int4(0x3E75EAD4)); // 2.4015361e-1f
+ ff = ff * f + As<Float4>(Int4(0x3F31727B)); // 6.9315308e-1f
+ ff = ff * f + Float4(1.0f);
+
+ return ii * ff;
+ }
+
+ // Approximates log2(x) for each lane of x, as the unbiased exponent of x plus
+ // a rational approximation of the logarithm of its mantissa. Lanes whose bit
+ // pattern is +infinity (0x7F800000) return x unchanged.
+ // The 'absolute' and 'pp' arguments are not used by this implementation; the
+ // sign bit of x is masked out of both the exponent and the mantissa.
+ Float4 logarithm2(RValue<Float4> x, bool absolute, bool pp)
+ {
+ Float4 x0;
+ Float4 x1;
+ Float4 x2;
+ Float4 x3;
+
+ x0 = x;
+
+ // Convert the biased exponent e to a float: moving it into the top mantissa
+ // bits of 1.0f gives 1 + e/256, and (1 + e/256 - (1 + 127/256)) * 256 = e - 127.
+ x1 = As<Float4>(As<Int4>(x0) & Int4(0x7F800000));
+ x1 = As<Float4>(As<UInt4>(x1) >> 8);
+ x1 = As<Float4>(As<Int4>(x1) | As<Int4>(Float4(1.0f)));
+ x1 = (x1 - Float4(1.4960938f)) * Float4(256.0f); // FIXME: (x1 - 1.4960938f) * 256.0f;
+ // Keep only the mantissa, with the exponent of 1.0f: x0 is now in [1, 2)
+ x0 = As<Float4>((As<Int4>(x0) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
+
+ // Quotient of two polynomials in the mantissa
+ x2 = (Float4(9.5428179e-2f) * x0 + Float4(4.7779095e-1f)) * x0 + Float4(1.9782813e-1f);
+ x3 = ((Float4(1.6618466e-2f) * x0 + Float4(2.0350508e-1f)) * x0 + Float4(2.7382900e-1f)) * x0 + Float4(4.0496687e-2f);
+ x2 /= x3;
+
+ x1 += (x0 - Float4(1.0f)) * x2;
+
+ // Select x itself for lanes that are +infinity, the approximation otherwise
+ Int4 pos_inf_x = CmpEQ(As<Int4>(x), Int4(0x7F800000));
+ return As<Float4>((pos_inf_x & As<Int4>(x)) | (~pos_inf_x & As<Int4>(x1)));
+ }
+
+ // Approximates e^x per lane as 2^(x / ln(2)), using exponential2().
+ Float4 exponential(RValue<Float4> x, bool pp)
+ {
+ // FIXME: Propagate the constant
+ return exponential2(Float4(1.44269504f) * x, pp); // 1/ln(2)
+ }
+
+ // Approximates the natural logarithm per lane as ln(2) * log2(x), using logarithm2().
+ Float4 logarithm(RValue<Float4> x, bool absolute, bool pp)
+ {
+ // FIXME: Propagate the constant
+ return Float4(6.93147181e-1f) * logarithm2(x, absolute, pp); // ln(2)
+ }
+
+ // Approximates x^y per lane as 2^(y * log2(x)).
+ Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp)
+ {
+ Float4 log = logarithm2(x, true, pp);
+ log *= y;
+ return exponential2(log, pp);
+ }
+
+ // Computes 1 / x per lane.
+ //  pp          - partial precision: use the Rcp_pp() approximation as is.
+ //  finite      - clamp the result to at most the largest finite float.
+ //  exactAtPow2 - forwarded to Rcp_pp().
+ // Without 'pp', a full division is used when rcpPrecision is at least WHQL;
+ // otherwise the approximation is refined once.
+ Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
+ {
+ Float4 rcp;
+
+ if(!pp && rcpPrecision >= WHQL)
+ {
+ rcp = Float4(1.0f) / x;
+ }
+ else
+ {
+ rcp = Rcp_pp(x, exactAtPow2);
+
+ if(!pp)
+ {
+ // One Newton-Raphson iteration: r' = 2r - x * r^2
+ rcp = (rcp + rcp) - (x * rcp * rcp);
+ }
+ }
+
+ if(finite)
+ {
+ // 0x7F7FFFFF is the bit pattern of the largest finite single-precision value
+ int big = 0x7F7FFFFF;
+ rcp = Min(rcp, Float4((float&)big));
+ }
+
+ return rcp;
+ }
+
+ // Computes 1 / sqrt(x) per lane.
+ //  absolute - take the absolute value of x first.
+ //  pp       - partial precision: use the RcpSqrt_pp() approximation and force
+ //             the result to zero for lanes whose input is +infinity.
+ // Without 'pp' the result is an exact division by Sqrt().
+ //
+ // The partial-precision path used to contain a refinement step guarded by
+ // if(!pp), which could never execute there; it has been removed without any
+ // change in behaviour.
+ Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
+ {
+     Float4 abs = x;
+
+     if(absolute)
+     {
+         abs = Abs(abs);
+     }
+
+     Float4 rsq;
+
+     if(!pp)
+     {
+         rsq = Float4(1.0f) / Sqrt(abs);
+     }
+     else
+     {
+         rsq = RcpSqrt_pp(abs);
+
+         // Mask the result to zero where the input is +infinity (0x7F800000)
+         rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4(0x7F800000)) & As<Int4>(rsq));
+     }
+
+     return rsq;
+ }
+
+ Float4 modulo(RValue<Float4> x, RValue<Float4> y)
+ { // Floored modulo: x - y * floor(x / y)
+ return x - y * Floor(x / y);
+ }
+
+ Float4 sine_pi(RValue<Float4> x, bool pp)
+ { // Parabolic sine approximation; pp is not used. NOTE(review): the name and cosine_pi's wrap suggest x is expected in [-pi, pi] - confirm.
+ const Float4 A = Float4(-4.05284734e-1f); // -4/pi^2
+ const Float4 B = Float4(1.27323954e+0f); // 4/pi
+ const Float4 C = Float4(7.75160950e-1f);
+ const Float4 D = Float4(2.24839049e-1f);
+
+ // Parabola approximating sine
+ Float4 sin = x * (Abs(x) * A + B);
+
+ // Improve precision from 0.06 to 0.001
+ if(true)
+ {
+ sin = sin * (Abs(sin) * D + C); // C * sin + D * sin * |sin|
+ }
+
+ return sin;
+ }
+
+ Float4 cosine_pi(RValue<Float4> x, bool pp)
+ { // Cosine built on sine_pi by shifting the argument by pi/2.
+ // cos(x) = sin(x + pi/2)
+ Float4 y = x + Float4(1.57079632e+0f);
+
+ // Wrap around
+ y -= As<Float4>(CmpNLT(y, Float4(3.14159265e+0f)) & As<Int4>(Float4(6.28318530e+0f))); // Subtract 2*pi in lanes where y is not less than pi
+
+ return sine_pi(y, pp);
+ }
+
+ Float4 sine(RValue<Float4> x, bool pp)
+ { // sin(x) after range reduction; !pp uses the higher-precision polynomial path, pp the parabolic one.
+ // Reduce to [-0.5, 0.5] range
+ Float4 y = x * Float4(1.59154943e-1f); // 1/2pi
+ y = y - Round(y);
+
+ if(!pp)
+ {
+ // From the paper: "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs"
+ // This implementation passes OpenGL ES 3.0 precision requirements, at the cost of more operations:
+ // !pp : 17 mul, 7 add, 1 sub, 1 reciprocal
+ // pp : 4 mul, 2 add, 2 abs
+
+ Float4 y2 = y * y;
+ Float4 c1 = y2 * (y2 * (y2 * Float4(-0.0204391631f) + Float4(0.2536086171f)) + Float4(-1.2336977925f)) + Float4(1.0f);
+ Float4 s1 = y * (y2 * (y2 * (y2 * Float4(-0.0046075748f) + Float4(0.0796819754f)) + Float4(-0.645963615f)) + Float4(1.5707963235f));
+ Float4 c2 = (c1 * c1) - (s1 * s1); // Double-angle: cos(2a) = cos^2(a) - sin^2(a)
+ Float4 s2 = Float4(2.0f) * s1 * c1; // Double-angle: sin(2a) = 2 sin(a) cos(a)
+ return Float4(2.0f) * s2 * c2 * reciprocal(s2 * s2 + c2 * c2, pp, true); // Second doubling, normalized by s2^2 + c2^2
+ }
+
+ const Float4 A = Float4(-16.0f);
+ const Float4 B = Float4(8.0f);
+ const Float4 C = Float4(7.75160950e-1f);
+ const Float4 D = Float4(2.24839049e-1f);
+
+ // Parabola approximating sine
+ Float4 sin = y * (Abs(y) * A + B);
+
+ // Improve precision from 0.06 to 0.001
+ if(true)
+ {
+ sin = sin * (Abs(sin) * D + C);
+ }
+
+ return sin;
+ }
+
+ Float4 cosine(RValue<Float4> x, bool pp)
+ {
+ // Shift the argument by a quarter period: cos(x) = sin(x + pi/2)
+ const Float4 half_pi(1.57079632e+0f);
+ return sine(x + half_pi, pp);
+ }
+
+ Float4 tangent(RValue<Float4> x, bool pp)
+ { // tan(x) = sin(x) / cos(x), both evaluated with the same pp hint
+ return sine(x, pp) / cosine(x, pp);
+ }
+
+ Float4 arccos(RValue<Float4> x, bool pp)
+ {
+ // pi/2 - arcsin(x); pp is forwarded so both functions receive the same precision hint
+ return Float4(1.57079632e+0f) - arcsin(x, pp);
+ }
+
+ Float4 arcsin(RValue<Float4> x, bool pp)
+ { // Polynomial arcsin approximation; pp is not used. The if(false) branch is kept only as a record of a rejected formula.
+ if(false) // Simpler implementation fails even lowp precision tests
+ {
+ // x*(pi/2-sqrt(1-x*x)*pi/5)
+ return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f));
+ }
+ else
+ {
+ // From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
+ const Float4 half_pi(1.57079632f);
+ const Float4 a0(1.5707288f);
+ const Float4 a1(-0.2121144f);
+ const Float4 a2(0.0742610f);
+ const Float4 a3(-0.0187293f);
+ Float4 absx = Abs(x);
+ return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^ // Evaluate on |x| ...
+ (As<Int4>(x) & Int4(0x80000000))); // ... then XOR in the sign bit of x
+ }
+ }
+
+ // Approximation of atan in [0..1]
+ Float4 arctan_01(Float4 x, bool pp)
+ { // pp selects a quadratic approximation; otherwise an 8-term odd polynomial is used.
+ if(pp)
+ {
+ return x * (Float4(-0.27f) * x + Float4(1.05539816f));
+ }
+ else
+ {
+ // From 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
+ const Float4 a2(-0.3333314528f);
+ const Float4 a4(0.1999355085f);
+ const Float4 a6(-0.1420889944f);
+ const Float4 a8(0.1065626393f);
+ const Float4 a10(-0.0752896400f);
+ const Float4 a12(0.0429096138f);
+ const Float4 a14(-0.0161657367f);
+ const Float4 a16(0.0028662257f);
+ Float4 x2 = x * x;
+ return (x + x * (x2 * (a2 + x2 * (a4 + x2 * (a6 + x2 * (a8 + x2 * (a10 + x2 * (a12 + x2 * (a14 + x2 * a16))))))))); // Horner form of x * (1 + a2 x^2 + ... + a16 x^16)
+ }
+ }
+
+ Float4 arctan(RValue<Float4> x, bool pp)
+ { // atan(x): reduces |x| to an argument for arctan_01, then restores the sign of x.
+ Float4 absx = Abs(x);
+ Int4 O = CmpNLT(absx, Float4(1.0f)); // Mask of lanes where |x| is not less than 1
+ Float4 y = As<Float4>((O & As<Int4>(Float4(1.0f) / absx)) | (~O & As<Int4>(absx))); // FIXME: Vector select
+
+ const Float4 half_pi(1.57079632f);
+ Float4 theta = arctan_01(y, pp);
+ return As<Float4>(((O & As<Int4>(half_pi - theta)) | (~O & As<Int4>(theta))) ^ // FIXME: Vector select
+ (As<Int4>(x) & Int4(0x80000000))); // XOR in the sign bit of x
+ }
+
+ Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp)
+ { // Two-argument arctangent: folds (x, y) into the first octant, calls arctan_01, and accumulates the rotation offsets in theta.
+ const Float4 pi(3.14159265f); // pi - NOTE(review): not referenced anywhere in this function
+ const Float4 minus_pi(-3.14159265f); // -pi
+ const Float4 half_pi(1.57079632f); // pi/2
+ const Float4 quarter_pi(7.85398163e-1f); // pi/4
+
+ // Rotate to upper semicircle when in lower semicircle
+ Int4 S = CmpLT(y, Float4(0.0f));
+ Float4 theta = As<Float4>(S & As<Int4>(minus_pi));
+ Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x));
+ Float4 y0 = Abs(y);
+
+ // Rotate to right quadrant when in left quadrant
+ Int4 Q = CmpLT(x0, Float4(0.0f));
+ theta += As<Float4>(Q & As<Int4>(half_pi));
+ Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0))); // FIXME: Vector select
+ Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0))); // FIXME: Vector select
+
+ // Mirror to first octant when in second octant
+ Int4 O = CmpNLT(y1, x1);
+ Float4 x2 = As<Float4>((O & As<Int4>(y1)) | (~O & As<Int4>(x1))); // FIXME: Vector select
+ Float4 y2 = As<Float4>((O & As<Int4>(x1)) | (~O & As<Int4>(y1))); // FIXME: Vector select
+
+ // Approximation of atan in [0..1]
+ Int4 zero_x = CmpEQ(x2, Float4(0.0f));
+ Int4 inf_y = IsInf(y2); // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4
+ Float4 atan2_theta = arctan_01(y2 / x2, pp);
+ theta += As<Float4>((~zero_x & ~inf_y & ((O & As<Int4>(half_pi - atan2_theta)) | (~O & (As<Int4>(atan2_theta))))) | // FIXME: Vector select
+ (inf_y & As<Int4>(quarter_pi)));
+
+ // Recover loss of precision for tiny theta angles
+ Int4 precision_loss = S & Q & O & ~inf_y; // This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta
+ return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) | (~precision_loss & As<Int4>(theta))); // FIXME: Vector select
+ }
+
+ Float4 sineh(RValue<Float4> x, bool pp)
+ { // sinh(x) = (e^x - e^-x) / 2
+ return (exponential(x, pp) - exponential(-x, pp)) * Float4(0.5f);
+ }
+
+ Float4 cosineh(RValue<Float4> x, bool pp)
+ { // cosh(x) = (e^x + e^-x) / 2
+ return (exponential(x, pp) + exponential(-x, pp)) * Float4(0.5f);
+ }
+
+ Float4 tangenth(RValue<Float4> x, bool pp)
+ { // tanh(x) = (e^x - e^-x) / (e^x + e^-x)
+ Float4 grow = exponential(x, pp);
+ Float4 decay = exponential(-x, pp);
+ return (grow - decay) / (grow + decay);
+ }
+
+ Float4 arccosh(RValue<Float4> x, bool pp)
+ { // acosh(x) = ln(x + sqrt(x + 1) * sqrt(x - 1)). logarithm()'s second parameter is 'absolute', so pp goes third.
+ return logarithm(x + Sqrt(x + Float4(1.0f)) * Sqrt(x - Float4(1.0f)), false, pp);
+ }
+
+ Float4 arcsinh(RValue<Float4> x, bool pp)
+ { // asinh(x) = ln(x + sqrt(x^2 + 1)). logarithm()'s second parameter is 'absolute', so pp goes third.
+ return logarithm(x + Sqrt(x * x + Float4(1.0f)), false, pp);
+ }
+
+ Float4 arctanh(RValue<Float4> x, bool pp)
+ { // atanh(x) = ln((1 + x) / (1 - x)) / 2. logarithm()'s second parameter is 'absolute', so pp goes third.
+ return logarithm((Float4(1.0f) + x) / (Float4(1.0f) - x), false, pp) * Float4(0.5f);
+ }
+
+ Float4 dot2(const Vector4f &v0, const Vector4f &v1)
+ { // Dot product over the x and y components
+ return v0.x * v1.x + v0.y * v1.y;
+ }
+
+ Float4 dot3(const Vector4f &v0, const Vector4f &v1)
+ { // Dot product over the x, y and z components
+ return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
+ }
+
+ Float4 dot4(const Vector4f &v0, const Vector4f &v1)
+ { // Dot product over all four components
+ return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w;
+ }
+
+ void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
+ { // In-place transpose of four Short4 rows using two rounds of unpack/interleave.
+ Int2 high01 = UnpackHigh(row0, row1);
+ Int2 high23 = UnpackHigh(row2, row3);
+ Int2 low01 = UnpackLow(row0, row1);
+ Int2 low23 = UnpackLow(row2, row3);
+
+ row0 = UnpackLow(low01, low23);
+ row1 = UnpackHigh(low01, low23);
+ row2 = UnpackLow(high01, high23);
+ row3 = UnpackHigh(high01, high23);
+ }
+
+ void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
+ { // Same interleave as the Short4 transpose4x4, but only row0..row2 are written; row3 is left as passed in.
+ Int2 high01 = UnpackHigh(row0, row1);
+ Int2 high23 = UnpackHigh(row2, row3);
+ Int2 low01 = UnpackLow(row0, row1);
+ Int2 low23 = UnpackLow(row2, row3);
+
+ row0 = UnpackLow(low01, low23);
+ row1 = UnpackHigh(low01, low23);
+ row2 = UnpackLow(high01, high23);
+ }
+
+ void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+ { // In-place transpose of four Float4 rows: interleave row pairs, then recombine their xy / zw halves.
+ Float4 low01 = UnpackLow(row0, row1);
+ Float4 low23 = UnpackLow(row2, row3);
+ Float4 high01 = UnpackHigh(row0, row1);
+ Float4 high23 = UnpackHigh(row2, row3);
+
+ row0 = Float4(low01.xy, low23.xy);
+ row1 = Float4(low01.zw, low23.zw);
+ row2 = Float4(high01.xy, high23.xy);
+ row3 = Float4(high01.zw, high23.zw);
+ }
+
+ void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+ { // Same as the Float4 transpose4x4, but only row0..row2 are written; row3 is left as passed in.
+ Float4 low01 = UnpackLow(row0, row1);
+ Float4 low23 = UnpackLow(row2, row3);
+ Float4 high01 = UnpackHigh(row0, row1);
+ Float4 high23 = UnpackHigh(row2, row3);
+
+ row0 = Float4(low01.xy, low23.xy);
+ row1 = Float4(low01.zw, low23.zw);
+ row2 = Float4(high01.xy, high23.xy);
+ }
+
+ void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+ { // Produces only the first two transposed rows; row2 and row3 are read but not written.
+ Float4 low01 = UnpackLow(row0, row1);
+ Float4 low23 = UnpackLow(row2, row3);
+
+ row0 = Float4(low01.xy, low23.xy);
+ row1 = Float4(low01.zw, low23.zw);
+ }
+
+ void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+ { // Produces only the first transposed row; row1..row3 are read but not written.
+ Float4 low01 = UnpackLow(row0, row1);
+ Float4 low23 = UnpackLow(row2, row3);
+
+ row0 = Float4(low01.xy, low23.xy);
+ }
+
+ void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+ { // Interleaves row0/row1 into four rows. Statement order matters: row1 and row3 keep their own zw halves.
+ Float4 lowPairs = UnpackLow(row0, row1);
+ Float4 highPairs = UnpackHigh(row0, row1);
+
+ row0 = lowPairs;
+ row1 = Float4(lowPairs.zw, row1.zw);
+ row2 = highPairs;
+ row3 = Float4(highPairs.zw, row3.zw);
+ }
+
+ void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N)
+ {
+ // Dispatches to the transpose4x<N> variant that writes the first N rows.
+ // N is an ordinary C++ int, so the choice is made when this code runs,
+ // not inside the generated routine. Any N outside 1..4 is a no-op.
+ if(N == 1) { transpose4x1(row0, row1, row2, row3); }
+ else if(N == 2) { transpose4x2(row0, row1, row2, row3); }
+ else if(N == 3) { transpose4x3(row0, row1, row2, row3); }
+ else if(N == 4) { transpose4x4(row0, row1, row2, row3); }
+ }
+
+ const Vector4f RegisterFile::operator[](RValue<Int4> index)
+ { // Gather: lane i of every component is read from the register selected by lane i of index.
+ ASSERT(indirectAddressable);
+
+ Int reg0 = Extract(index, 0);
+ Int reg1 = Extract(index, 1);
+ Int reg2 = Extract(index, 2);
+ Int reg3 = Extract(index, 3);
+
+ Vector4f gathered;
+
+ gathered.x.x = Extract(x[0][reg0], 0);
+ gathered.x.y = Extract(x[0][reg1], 1);
+ gathered.x.z = Extract(x[0][reg2], 2);
+ gathered.x.w = Extract(x[0][reg3], 3);
+
+ gathered.y.x = Extract(y[0][reg0], 0);
+ gathered.y.y = Extract(y[0][reg1], 1);
+ gathered.y.z = Extract(y[0][reg2], 2);
+ gathered.y.w = Extract(y[0][reg3], 3);
+
+ gathered.z.x = Extract(z[0][reg0], 0);
+ gathered.z.y = Extract(z[0][reg1], 1);
+ gathered.z.z = Extract(z[0][reg2], 2);
+ gathered.z.w = Extract(z[0][reg3], 3);
+
+ gathered.w.x = Extract(w[0][reg0], 0);
+ gathered.w.y = Extract(w[0][reg1], 1);
+ gathered.w.z = Extract(w[0][reg2], 2);
+ gathered.w.w = Extract(w[0][reg3], 3);
+
+ return gathered;
+ }
+
+ void RegisterFile::scatter_x(Int4 index, RValue<Float4> r)
+ { // Scatter: lane i of r is inserted into lane i of the x register selected by lane i of index.
+ ASSERT(indirectAddressable);
+
+ Int reg0 = Extract(index, 0);
+ Int reg1 = Extract(index, 1);
+ Int reg2 = Extract(index, 2);
+ Int reg3 = Extract(index, 3);
+
+ x[0][reg0] = Insert(x[0][reg0], Extract(r, 0), 0);
+ x[0][reg1] = Insert(x[0][reg1], Extract(r, 1), 1);
+ x[0][reg2] = Insert(x[0][reg2], Extract(r, 2), 2);
+ x[0][reg3] = Insert(x[0][reg3], Extract(r, 3), 3);
+ }
+
+ void RegisterFile::scatter_y(Int4 index, RValue<Float4> r)
+ { // Scatter: lane i of r is inserted into lane i of the y register selected by lane i of index.
+ ASSERT(indirectAddressable);
+
+ Int reg0 = Extract(index, 0);
+ Int reg1 = Extract(index, 1);
+ Int reg2 = Extract(index, 2);
+ Int reg3 = Extract(index, 3);
+
+ y[0][reg0] = Insert(y[0][reg0], Extract(r, 0), 0);
+ y[0][reg1] = Insert(y[0][reg1], Extract(r, 1), 1);
+ y[0][reg2] = Insert(y[0][reg2], Extract(r, 2), 2);
+ y[0][reg3] = Insert(y[0][reg3], Extract(r, 3), 3);
+ }
+
+ void RegisterFile::scatter_z(Int4 index, RValue<Float4> r)
+ { // Scatter: lane i of r is inserted into lane i of the z register selected by lane i of index.
+ ASSERT(indirectAddressable);
+
+ Int reg0 = Extract(index, 0);
+ Int reg1 = Extract(index, 1);
+ Int reg2 = Extract(index, 2);
+ Int reg3 = Extract(index, 3);
+
+ z[0][reg0] = Insert(z[0][reg0], Extract(r, 0), 0);
+ z[0][reg1] = Insert(z[0][reg1], Extract(r, 1), 1);
+ z[0][reg2] = Insert(z[0][reg2], Extract(r, 2), 2);
+ z[0][reg3] = Insert(z[0][reg3], Extract(r, 3), 3);
+ }
+
+ void RegisterFile::scatter_w(Int4 index, RValue<Float4> r)
+ { // Scatter: lane i of r is inserted into lane i of the w register selected by lane i of index.
+ ASSERT(indirectAddressable);
+
+ Int reg0 = Extract(index, 0);
+ Int reg1 = Extract(index, 1);
+ Int reg2 = Extract(index, 2);
+ Int reg3 = Extract(index, 3);
+
+ w[0][reg0] = Insert(w[0][reg0], Extract(r, 0), 0);
+ w[0][reg1] = Insert(w[0][reg1], Extract(r, 1), 1);
+ w[0][reg2] = Insert(w[0][reg2], Extract(r, 2), 2);
+ w[0][reg3] = Insert(w[0][reg3], Extract(r, 3), 3);
+ }
+
+ void ShaderCore::mov(Vector4f &dst, const Vector4f &src, bool integerDestination)
+ { // Plain copy, unless the destination is integer: then each component goes through RoundInt and is stored as raw bits.
+ if(!integerDestination)
+ {
+ dst = src;
+ }
+ else
+ {
+ dst.x = As<Float4>(RoundInt(src.x));
+ dst.y = As<Float4>(RoundInt(src.y));
+ dst.z = As<Float4>(RoundInt(src.z));
+ dst.w = As<Float4>(RoundInt(src.w));
+ }
+ }
+
+ void ShaderCore::neg(Vector4f &dst, const Vector4f &src)
+ { // dst = -src, floating-point negation per component
+ dst.x = -src.x;
+ dst.y = -src.y;
+ dst.z = -src.z;
+ dst.w = -src.w;
+ }
+
+ void ShaderCore::ineg(Vector4f &dst, const Vector4f &src)
+ { // Integer negation: the component bits are reinterpreted as Int4, negated, and stored back as raw bits
+ dst.x = As<Float4>(-As<Int4>(src.x));
+ dst.y = As<Float4>(-As<Int4>(src.y));
+ dst.z = As<Float4>(-As<Int4>(src.z));
+ dst.w = As<Float4>(-As<Int4>(src.w));
+ }
+
+ void ShaderCore::f2b(Vector4f &dst, const Vector4f &src)
+ { // Float to boolean: stores the (src != 0.0f) comparison mask per component
+ dst.x = As<Float4>(CmpNEQ(src.x, Float4(0.0f)));
+ dst.y = As<Float4>(CmpNEQ(src.y, Float4(0.0f)));
+ dst.z = As<Float4>(CmpNEQ(src.z, Float4(0.0f)));
+ dst.w = As<Float4>(CmpNEQ(src.w, Float4(0.0f)));
+ }
+
+ void ShaderCore::b2f(Vector4f &dst, const Vector4f &src)
+ { // Boolean to float: ANDs src with the bits of 1.0f, so an all-ones lane gives 1.0f and an all-zero lane gives 0.0f
+ dst.x = As<Float4>(As<Int4>(src.x) & As<Int4>(Float4(1.0f)));
+ dst.y = As<Float4>(As<Int4>(src.y) & As<Int4>(Float4(1.0f)));
+ dst.z = As<Float4>(As<Int4>(src.z) & As<Int4>(Float4(1.0f)));
+ dst.w = As<Float4>(As<Int4>(src.w) & As<Int4>(Float4(1.0f)));
+ }
+
+ void ShaderCore::f2i(Vector4f &dst, const Vector4f &src)
+ { // Float to signed int via the Int4(Float4) conversion; the integer result is stored as raw bits
+ dst.x = As<Float4>(Int4(src.x));
+ dst.y = As<Float4>(Int4(src.y));
+ dst.z = As<Float4>(Int4(src.z));
+ dst.w = As<Float4>(Int4(src.w));
+ }
+
+ void ShaderCore::i2f(Vector4f &dst, const Vector4f &src)
+ { // Signed int to float: the component bits are read as Int4 and converted by value to Float4
+ dst.x = Float4(As<Int4>(src.x));
+ dst.y = Float4(As<Int4>(src.y));
+ dst.z = Float4(As<Int4>(src.z));
+ dst.w = Float4(As<Int4>(src.w));
+ }
+
+ void ShaderCore::f2u(Vector4f &dst, const Vector4f &src)
+ { // Float to unsigned int via the UInt4(Float4) conversion; the integer result is stored as raw bits
+ dst.x = As<Float4>(UInt4(src.x));
+ dst.y = As<Float4>(UInt4(src.y));
+ dst.z = As<Float4>(UInt4(src.z));
+ dst.w = As<Float4>(UInt4(src.w));
+ }
+
+ void ShaderCore::u2f(Vector4f &dst, const Vector4f &src)
+ { // Unsigned int to float: the component bits are read as UInt4 and converted by value to Float4
+ dst.x = Float4(As<UInt4>(src.x));
+ dst.y = Float4(As<UInt4>(src.y));
+ dst.z = Float4(As<UInt4>(src.z));
+ dst.w = Float4(As<UInt4>(src.w));
+ }
+
+ void ShaderCore::i2b(Vector4f &dst, const Vector4f &src)
+ { // Int to boolean: stores the (src != 0) integer comparison mask per component
+ dst.x = As<Float4>(CmpNEQ(As<Int4>(src.x), Int4(0)));
+ dst.y = As<Float4>(CmpNEQ(As<Int4>(src.y), Int4(0)));
+ dst.z = As<Float4>(CmpNEQ(As<Int4>(src.z), Int4(0)));
+ dst.w = As<Float4>(CmpNEQ(As<Int4>(src.w), Int4(0)));
+ }
+
+ void ShaderCore::b2i(Vector4f &dst, const Vector4f &src)
+ { // Boolean to int: keeps only bit 0 of each lane, so an all-ones lane becomes integer 1
+ dst.x = As<Float4>(As<Int4>(src.x) & Int4(1));
+ dst.y = As<Float4>(As<Int4>(src.y) & Int4(1));
+ dst.z = As<Float4>(As<Int4>(src.z) & Int4(1));
+ dst.w = As<Float4>(As<Int4>(src.w) & Int4(1));
+ }
+
+ void ShaderCore::add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // dst = src0 + src1, floating-point, per component
+ dst.x = src0.x + src1.x;
+ dst.y = src0.y + src1.y;
+ dst.z = src0.z + src1.z;
+ dst.w = src0.w + src1.w;
+ }
+
+ void ShaderCore::iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // dst = src0 + src1 on the component bits reinterpreted as Int4
+ dst.x = As<Float4>(As<Int4>(src0.x) + As<Int4>(src1.x));
+ dst.y = As<Float4>(As<Int4>(src0.y) + As<Int4>(src1.y));
+ dst.z = As<Float4>(As<Int4>(src0.z) + As<Int4>(src1.z));
+ dst.w = As<Float4>(As<Int4>(src0.w) + As<Int4>(src1.w));
+ }
+
+ void ShaderCore::sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // dst = src0 - src1, floating-point, per component
+ dst.x = src0.x - src1.x;
+ dst.y = src0.y - src1.y;
+ dst.z = src0.z - src1.z;
+ dst.w = src0.w - src1.w;
+ }
+
+ void ShaderCore::isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // dst = src0 - src1 on the component bits reinterpreted as Int4
+ dst.x = As<Float4>(As<Int4>(src0.x) - As<Int4>(src1.x));
+ dst.y = As<Float4>(As<Int4>(src0.y) - As<Int4>(src1.y));
+ dst.z = As<Float4>(As<Int4>(src0.z) - As<Int4>(src1.z));
+ dst.w = As<Float4>(As<Int4>(src0.w) - As<Int4>(src1.w));
+ }
+
+ void ShaderCore::mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
+ { // Multiply-add: dst = src0 * src1 + src2, written as a separate multiply and add, per component
+ dst.x = src0.x * src1.x + src2.x;
+ dst.y = src0.y * src1.y + src2.y;
+ dst.z = src0.z * src1.z + src2.z;
+ dst.w = src0.w * src1.w + src2.w;
+ }
+
+ void ShaderCore::imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
+ { // Integer multiply-add: dst = src0 * src1 + src2 on the component bits reinterpreted as Int4
+ dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x) + As<Int4>(src2.x));
+ dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y) + As<Int4>(src2.y));
+ dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z) + As<Int4>(src2.z));
+ dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w) + As<Int4>(src2.w));
+ }
+
+ void ShaderCore::mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // dst = src0 * src1, floating-point, per component
+ dst.x = src0.x * src1.x;
+ dst.y = src0.y * src1.y;
+ dst.z = src0.z * src1.z;
+ dst.w = src0.w * src1.w;
+ }
+
+ void ShaderCore::imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // dst = src0 * src1 on the component bits reinterpreted as Int4
+ dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x));
+ dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y));
+ dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z));
+ dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w));
+ }
+
+ void ShaderCore::rcpx(Vector4f &dst, const Vector4f &src, bool pp)
+ { // Reciprocal of src.x only (finite and exact-at-power-of-two flags set), replicated to every component of dst.
+ Float4 inverse = reciprocal(src.x, pp, true, true);
+
+ dst.x = inverse;
+ dst.y = inverse;
+ dst.z = inverse;
+ dst.w = inverse;
+ }
+
+ void ShaderCore::div(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // dst = src0 / src1, floating-point, per component
+ dst.x = src0.x / src1.x;
+ dst.y = src0.y / src1.y;
+ dst.z = src0.z / src1.z;
+ dst.w = src0.w / src1.w;
+ }
+
+ void ShaderCore::idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Signed integer division per component. cmp0i is defined outside this view; presumably it substitutes intMax for a zero divisor - confirm.
+ Float4 intMax(As<Float4>(Int4(INT_MAX))); // INT_MAX as raw bits. NOTE(review): dst is used as scratch before src0 is read, so dst must not alias src0.
+ cmp0i(dst.x, src1.x, intMax, src1.x);
+ dst.x = As<Float4>(As<Int4>(src0.x) / As<Int4>(dst.x));
+ cmp0i(dst.y, src1.y, intMax, src1.y);
+ dst.y = As<Float4>(As<Int4>(src0.y) / As<Int4>(dst.y));
+ cmp0i(dst.z, src1.z, intMax, src1.z);
+ dst.z = As<Float4>(As<Int4>(src0.z) / As<Int4>(dst.z));
+ cmp0i(dst.w, src1.w, intMax, src1.w);
+ dst.w = As<Float4>(As<Int4>(src0.w) / As<Int4>(dst.w));
+ }
+
+ void ShaderCore::udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Unsigned integer division per component. cmp0i is defined outside this view; presumably it substitutes uintMax for a zero divisor - confirm.
+ Float4 uintMax(As<Float4>(UInt4(UINT_MAX))); // UINT_MAX as raw bits. NOTE(review): dst is used as scratch before src0 is read, so dst must not alias src0.
+ cmp0i(dst.x, src1.x, uintMax, src1.x);
+ dst.x = As<Float4>(As<UInt4>(src0.x) / As<UInt4>(dst.x));
+ cmp0i(dst.y, src1.y, uintMax, src1.y);
+ dst.y = As<Float4>(As<UInt4>(src0.y) / As<UInt4>(dst.y));
+ cmp0i(dst.z, src1.z, uintMax, src1.z);
+ dst.z = As<Float4>(As<UInt4>(src0.z) / As<UInt4>(dst.z));
+ cmp0i(dst.w, src1.w, uintMax, src1.w);
+ dst.w = As<Float4>(As<UInt4>(src0.w) / As<UInt4>(dst.w));
+ }
+
+ void ShaderCore::mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Floating-point modulo per component, delegated to modulo()
+ dst.x = modulo(src0.x, src1.x);
+ dst.y = modulo(src0.y, src1.y);
+ dst.z = modulo(src0.z, src1.z);
+ dst.w = modulo(src0.w, src1.w);
+ }
+
+ void ShaderCore::imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Signed integer remainder per component. cmp0i is defined outside this view; presumably it substitutes intMax for a zero divisor - confirm.
+ Float4 intMax(As<Float4>(Int4(INT_MAX))); // INT_MAX as raw bits. NOTE(review): dst is used as scratch before src0 is read, so dst must not alias src0.
+ cmp0i(dst.x, src1.x, intMax, src1.x);
+ dst.x = As<Float4>(As<Int4>(src0.x) % As<Int4>(dst.x));
+ cmp0i(dst.y, src1.y, intMax, src1.y);
+ dst.y = As<Float4>(As<Int4>(src0.y) % As<Int4>(dst.y));
+ cmp0i(dst.z, src1.z, intMax, src1.z);
+ dst.z = As<Float4>(As<Int4>(src0.z) % As<Int4>(dst.z));
+ cmp0i(dst.w, src1.w, intMax, src1.w);
+ dst.w = As<Float4>(As<Int4>(src0.w) % As<Int4>(dst.w));
+ }
+
+ void ShaderCore::umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Unsigned integer remainder per component. cmp0i is defined outside this view; presumably it substitutes uintMax for a zero divisor - confirm.
+ Float4 uintMax(As<Float4>(UInt4(UINT_MAX))); // UINT_MAX as raw bits. NOTE(review): dst is used as scratch before src0 is read, so dst must not alias src0.
+ cmp0i(dst.x, src1.x, uintMax, src1.x);
+ dst.x = As<Float4>(As<UInt4>(src0.x) % As<UInt4>(dst.x));
+ cmp0i(dst.y, src1.y, uintMax, src1.y);
+ dst.y = As<Float4>(As<UInt4>(src0.y) % As<UInt4>(dst.y));
+ cmp0i(dst.z, src1.z, uintMax, src1.z);
+ dst.z = As<Float4>(As<UInt4>(src0.z) % As<UInt4>(dst.z));
+ cmp0i(dst.w, src1.w, uintMax, src1.w);
+ dst.w = As<Float4>(As<UInt4>(src0.w) % As<UInt4>(dst.w));
+ }
+
+ void ShaderCore::shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Left shift of src0 by the per-lane amounts in src1, both treated as Int4
+ dst.x = As<Float4>(As<Int4>(src0.x) << As<Int4>(src1.x));
+ dst.y = As<Float4>(As<Int4>(src0.y) << As<Int4>(src1.y));
+ dst.z = As<Float4>(As<Int4>(src0.z) << As<Int4>(src1.z));
+ dst.w = As<Float4>(As<Int4>(src0.w) << As<Int4>(src1.w));
+ }
+
+ void ShaderCore::ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Right shift of src0 by the per-lane amounts in src1, on signed Int4 operands
+ dst.x = As<Float4>(As<Int4>(src0.x) >> As<Int4>(src1.x));
+ dst.y = As<Float4>(As<Int4>(src0.y) >> As<Int4>(src1.y));
+ dst.z = As<Float4>(As<Int4>(src0.z) >> As<Int4>(src1.z));
+ dst.w = As<Float4>(As<Int4>(src0.w) >> As<Int4>(src1.w));
+ }
+
+ void ShaderCore::ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Right shift of src0 by the per-lane amounts in src1, on unsigned UInt4 operands
+ dst.x = As<Float4>(As<UInt4>(src0.x) >> As<UInt4>(src1.x));
+ dst.y = As<Float4>(As<UInt4>(src0.y) >> As<UInt4>(src1.y));
+ dst.z = As<Float4>(As<UInt4>(src0.z) >> As<UInt4>(src1.z));
+ dst.w = As<Float4>(As<UInt4>(src0.w) >> As<UInt4>(src1.w));
+ }
+
+ void ShaderCore::rsqx(Vector4f &dst, const Vector4f &src, bool pp)
+ { // Reciprocal square root of |src.x| only (absolute flag set), replicated to every component of dst.
+ Float4 inverseRoot = reciprocalSquareRoot(src.x, true, pp);
+
+ dst.x = inverseRoot;
+ dst.y = inverseRoot;
+ dst.z = inverseRoot;
+ dst.w = inverseRoot;
+ }
+
+ void ShaderCore::sqrt(Vector4f &dst, const Vector4f &src, bool pp)
+ { // Square root per component; pp is accepted but not used
+ dst.x = Sqrt(src.x);
+ dst.y = Sqrt(src.y);
+ dst.z = Sqrt(src.z);
+ dst.w = Sqrt(src.w);
+ }
+
+ void ShaderCore::rsq(Vector4f &dst, const Vector4f &src, bool pp)
+ { // Reciprocal square root per component, without taking the absolute value first
+ dst.x = reciprocalSquareRoot(src.x, false, pp);
+ dst.y = reciprocalSquareRoot(src.y, false, pp);
+ dst.z = reciprocalSquareRoot(src.z, false, pp);
+ dst.w = reciprocalSquareRoot(src.w, false, pp);
+ }
+
+ void ShaderCore::len2(Float4 &dst, const Vector4f &src, bool pp)
+ { // Length of the xy part of src: sqrt(dot2(src, src)); pp is not used
+ dst = Sqrt(dot2(src, src));
+ }
+
+ void ShaderCore::len3(Float4 &dst, const Vector4f &src, bool pp)
+ { // Length of the xyz part of src: sqrt(dot3(src, src)); pp is not used
+ dst = Sqrt(dot3(src, src));
+ }
+
+ void ShaderCore::len4(Float4 &dst, const Vector4f &src, bool pp)
+ { // Length of the full xyzw vector: sqrt(dot4(src, src)); pp is not used
+ dst = Sqrt(dot4(src, src));
+ }
+
+ void ShaderCore::dist1(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
+ { // One-dimensional distance: |src0.x - src1.x|; pp is not used
+ dst = Abs(src0.x - src1.x);
+ }
+
+ void ShaderCore::dist2(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
+ { // Euclidean distance between the xy parts of src0 and src1; pp is not used.
+ Float4 deltaX = src0.x - src1.x;
+ Float4 deltaY = src0.y - src1.y;
+ Float4 squaredLength = deltaX * deltaX + deltaY * deltaY;
+ dst = Sqrt(squaredLength);
+ }
+
+ void ShaderCore::dist3(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
+ { // Euclidean distance between the xyz parts of src0 and src1; pp is not used.
+ Float4 deltaX = src0.x - src1.x;
+ Float4 deltaY = src0.y - src1.y;
+ Float4 deltaZ = src0.z - src1.z;
+ Float4 squaredLength = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+ dst = Sqrt(squaredLength);
+ }
+
+ void ShaderCore::dist4(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
+ { // Euclidean distance between the full xyzw vectors src0 and src1; pp is not used.
+ Float4 deltaX = src0.x - src1.x;
+ Float4 deltaY = src0.y - src1.y;
+ Float4 deltaZ = src0.z - src1.z;
+ Float4 deltaW = src0.w - src1.w;
+ Float4 squaredLength = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ + deltaW * deltaW;
+ dst = Sqrt(squaredLength);
+ }
+
+ void ShaderCore::dp1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // One-component dot product: src0.x * src1.x, replicated to every component of dst.
+ Float4 product = src0.x * src1.x;
+
+ dst.x = product;
+ dst.y = product;
+ dst.z = product;
+ dst.w = product;
+ }
+
+ void ShaderCore::dp2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Two-component dot product, replicated to every component of dst.
+ Float4 sum = dot2(src0, src1);
+
+ dst.x = sum;
+ dst.y = sum;
+ dst.z = sum;
+ dst.w = sum;
+ }
+
+ void ShaderCore::dp2add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
+ { // Two-component dot product plus src2.x, replicated to every component of dst.
+ Float4 sum = dot2(src0, src1) + src2.x;
+
+ dst.x = sum;
+ dst.y = sum;
+ dst.z = sum;
+ dst.w = sum;
+ }
+
+ void ShaderCore::dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Three-component dot product, computed before any write to dst and then replicated to every component.
+ Float4 sum = dot3(src0, src1);
+
+ dst.x = sum;
+ dst.y = sum;
+ dst.z = sum;
+ dst.w = sum;
+ }
+
+ void ShaderCore::dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Four-component dot product, replicated to every component of dst.
+ Float4 sum = dot4(src0, src1);
+
+ dst.x = sum;
+ dst.y = sum;
+ dst.z = sum;
+ dst.w = sum;
+ }
+
+ void ShaderCore::min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Floating-point minimum per component
+ dst.x = Min(src0.x, src1.x);
+ dst.y = Min(src0.y, src1.y);
+ dst.z = Min(src0.z, src1.z);
+ dst.w = Min(src0.w, src1.w);
+ }
+
+ void ShaderCore::imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Signed integer minimum per component (operands reinterpreted as Int4)
+ dst.x = As<Float4>(Min(As<Int4>(src0.x), As<Int4>(src1.x)));
+ dst.y = As<Float4>(Min(As<Int4>(src0.y), As<Int4>(src1.y)));
+ dst.z = As<Float4>(Min(As<Int4>(src0.z), As<Int4>(src1.z)));
+ dst.w = As<Float4>(Min(As<Int4>(src0.w), As<Int4>(src1.w)));
+ }
+
+ void ShaderCore::umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Unsigned integer minimum per component (operands reinterpreted as UInt4)
+ dst.x = As<Float4>(Min(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+ dst.y = As<Float4>(Min(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+ dst.z = As<Float4>(Min(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+ dst.w = As<Float4>(Min(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+ }
+
+ void ShaderCore::max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Floating-point maximum per component
+ dst.x = Max(src0.x, src1.x);
+ dst.y = Max(src0.y, src1.y);
+ dst.z = Max(src0.z, src1.z);
+ dst.w = Max(src0.w, src1.w);
+ }
+
+ void ShaderCore::imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Signed integer maximum per component (operands reinterpreted as Int4)
+ dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
+ dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
+ dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
+ dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
+ }
+
+ void ShaderCore::umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Unsigned integer maximum per component. Operands must be compared as UInt4 (as umin does); a signed compare misorders values with the top bit set.
+ dst.x = As<Float4>(Max(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+ dst.y = As<Float4>(Max(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+ dst.z = As<Float4>(Max(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+ dst.w = As<Float4>(Max(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+ }
+
+ void ShaderCore::slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ { // Set-on-less-than: 1.0f where src0 < src1, otherwise 0.0f (comparison mask ANDed with the bits of 1.0f)
+ dst.x = As<Float4>(As<Int4>(CmpLT(src0.x, src1.x)) & As<Int4>(Float4(1.0f)));
+ dst.y = As<Float4>(As<Int4>(CmpLT(src0.y, src1.y)) & As<Int4>(Float4(1.0f)));
+ dst.z = As<Float4>(As<Int4>(CmpLT(src0.z, src1.z)) & As<Int4>(Float4(1.0f)));
+ dst.w = As<Float4>(As<Int4>(CmpLT(src0.w, src1.w)) & As<Int4>(Float4(1.0f)));
+ }
+
+ void ShaderCore::step(Vector4f &dst, const Vector4f &edge, const Vector4f &x)
+ { // Step function: 1.0f where x is not less than edge, otherwise 0.0f
+ dst.x = As<Float4>(CmpNLT(x.x, edge.x) & As<Int4>(Float4(1.0f)));
+ dst.y = As<Float4>(CmpNLT(x.y, edge.y) & As<Int4>(Float4(1.0f)));
+ dst.z = As<Float4>(CmpNLT(x.z, edge.z) & As<Int4>(Float4(1.0f)));
+ dst.w = As<Float4>(CmpNLT(x.w, edge.w) & As<Int4>(Float4(1.0f)));
+ }
+
+ void ShaderCore::exp2x(Vector4f &dst, const Vector4f &src, bool pp)
+ { // 2^src.x only, replicated to every component of dst.
+ Float4 result = exponential2(src.x, pp);
+
+ dst.x = result;
+ dst.y = result;
+ dst.z = result;
+ dst.w = result;
+ }
+
+ void ShaderCore::exp2(Vector4f &dst, const Vector4f &src, bool pp)
+ { // Base-2 exponential per component, delegated to exponential2()
+ dst.x = exponential2(src.x, pp);
+ dst.y = exponential2(src.y, pp);
+ dst.z = exponential2(src.z, pp);
+ dst.w = exponential2(src.w, pp);
+ }
+
+ void ShaderCore::exp(Vector4f &dst, const Vector4f &src, bool pp)
+ { // Natural exponential per component, delegated to exponential()
+ dst.x = exponential(src.x, pp);
+ dst.y = exponential(src.y, pp);
+ dst.z = exponential(src.z, pp);
+ dst.w = exponential(src.w, pp);
+ }
+
+ void ShaderCore::log2x(Vector4f &dst, const Vector4f &src, bool pp)
+ { // logarithm2 of src.x only (second argument true), replicated to every component of dst.
+ Float4 result = logarithm2(src.x, true, pp);
+
+ dst.x = result;
+ dst.y = result;
+ dst.z = result;
+ dst.w = result;
+ }
+
+ void ShaderCore::log2(Vector4f &dst, const Vector4f &src, bool pp)
+ { // Base-2 logarithm per component, delegated to logarithm2() with its second argument false
+ dst.x = logarithm2(src.x, false, pp);
+ dst.y = logarithm2(src.y, false, pp);
+ dst.z = logarithm2(src.z, false, pp);
+ dst.w = logarithm2(src.w, false, pp);
+ }
+
+ void ShaderCore::log(Vector4f &dst, const Vector4f &src, bool pp)
+ { // Natural logarithm per component, delegated to logarithm() with absolute = false
+ dst.x = logarithm(src.x, false, pp);
+ dst.y = logarithm(src.y, false, pp);
+ dst.z = logarithm(src.z, false, pp);
+ dst.w = logarithm(src.w, false, pp);
+ }
+
+ void ShaderCore::lit(Vector4f &dst, const Vector4f &src)
+ { // Lighting coefficients: dst = (1, max(src.x, 0), src.y ^ clamp(src.w) zeroed when src.x or src.y is below 0, 1).
+ dst.x = Float4(1.0f);
+ dst.y = Max(src.x, Float4(0.0f));
+
+ Float4 exponent = src.w;
+
+ // Keep the exponent within [-127.9961, 127.9961]
+ exponent = Min(exponent, Float4(127.9961f));
+ exponent = Max(exponent, Float4(-127.9961f));
+
+ dst.z = power(src.y, exponent);
+ dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.x, Float4(0.0f)));
+ dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.y, Float4(0.0f)));
+
+ dst.w = Float4(1.0f);
+ }
+
+ void ShaderCore::att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ {
+ // Computes attenuation factors (1, d, d^2, 1/d) assuming src0 = d^2, src1 = 1/d
+ dst.x = 1;
+ dst.y = src0.y * src1.y; // d^2 * (1/d) = d
+ dst.z = src0.z; // d^2
+ dst.w = src1.w; // 1/d
+ }
+
+ void ShaderCore::lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
+ { // Linear interpolation with src0 as the weight: dst = src2 + src0 * (src1 - src2)
+ dst.x = src0.x * (src1.x - src2.x) + src2.x;
+ dst.y = src0.y * (src1.y - src2.y) + src2.y;
+ dst.z = src0.z * (src1.z - src2.z) + src2.z;
+ dst.w = src0.w * (src1.w - src2.w) + src2.w;
+ }
+
+ void ShaderCore::isinf(Vector4f &dst, const Vector4f &src)
+ { // Stores the IsInf() result for each component as raw bits
+ dst.x = As<Float4>(IsInf(src.x));
+ dst.y = As<Float4>(IsInf(src.y));
+ dst.z = As<Float4>(IsInf(src.z));
+ dst.w = As<Float4>(IsInf(src.w));
+ }
+
+ void ShaderCore::isnan(Vector4f &dst, const Vector4f &src)
+ { // Stores the IsNan() result for each component as raw bits
+ dst.x = As<Float4>(IsNan(src.x));
+ dst.y = As<Float4>(IsNan(src.y));
+ dst.z = As<Float4>(IsNan(src.z));
+ dst.w = As<Float4>(IsNan(src.w));
+ }
+
+ void ShaderCore::smooth(Vector4f &dst, const Vector4f &edge0, const Vector4f &edge1, const Vector4f &x)
+ { // Smoothstep per component: t = clamp((x - edge0) / (edge1 - edge0), 0, 1), then dst = t * t * (3 - 2 * t)
+ Float4 tx = Min(Max((x.x - edge0.x) / (edge1.x - edge0.x), Float4(0.0f)), Float4(1.0f)); dst.x = tx * tx * (Float4(3.0f) - Float4(2.0f) * tx);
+ Float4 ty = Min(Max((x.y - edge0.y) / (edge1.y - edge0.y), Float4(0.0f)), Float4(1.0f)); dst.y = ty * ty * (Float4(3.0f) - Float4(2.0f) * ty);
+ Float4 tz = Min(Max((x.z - edge0.z) / (edge1.z - edge0.z), Float4(0.0f)), Float4(1.0f)); dst.z = tz * tz * (Float4(3.0f) - Float4(2.0f) * tz);
+ Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4(0.0f)), Float4(1.0f)); dst.w = tw * tw * (Float4(3.0f) - Float4(2.0f) * tw);
+ }
+
+ void ShaderCore::floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits)
+ { // Converts each 32-bit float lane to 16-bit half bits, placed in the low or the high 16 bits of dst.
+ static const uint32_t mask_sign = 0x80000000u; // Float sign bit
+ static const uint32_t mask_round = ~0xfffu; // Clears the low 12 mantissa bits
+ static const uint32_t c_f32infty = 255 << 23; // 0x7F800000, bit pattern of float +inf
+ static const uint32_t c_magic = 15 << 23; // Float with exponent field 15, used as the scale factor below
+ static const uint32_t c_nanbit = 0x200;
+ static const uint32_t c_infty_as_fp16 = 0x7c00;
+ static const uint32_t c_clamp = (31 << 23) - 0x1000;
+
+ UInt4 justsign = UInt4(mask_sign) & As<UInt4>(floatBits);
+ UInt4 absf = As<UInt4>(floatBits) ^ justsign; // Input with the sign bit cleared
+ UInt4 b_isnormal = CmpNLE(UInt4(c_f32infty), absf); // Mask of lanes strictly below infinity (neither inf nor NaN)
+
+ // Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf
+ // instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation)
+ UInt4 joined = ((((As<UInt4>(Min(As<Float4>(absf & UInt4(mask_round)) * As<Float4>(UInt4(c_magic)),
+ As<Float4>(UInt4(c_clamp))))) - UInt4(mask_round)) >> 13) & b_isnormal) |
+ ((b_isnormal ^ UInt4(0xFFFFFFFF)) & ((CmpNLE(absf, UInt4(c_f32infty)) & UInt4(c_nanbit)) |
+ UInt4(c_infty_as_fp16)));
+
+ dst = As<Float4>(storeInUpperBits ? As<UInt4>(dst) | ((joined << 16) | justsign) : joined | (justsign >> 16)); // Upper: OR into the existing dst; lower: overwrite dst
+ }
+
+ void ShaderCore::halfToFloatBits(Float4& dst, const Float4& halfBits)
+ { // Expands 16-bit half bits held in the low half of each lane into 32-bit float bits.
+ static const uint32_t mask_nosign = 0x7FFF; // Half exponent and mantissa bits
+ static const uint32_t magic = (254 - 15) << 23; // Float scale factor applied after the mantissa is shifted into place
+ static const uint32_t was_infnan = 0x7BFF; // Largest finite half bit pattern
+ static const uint32_t exp_infnan = 255 << 23; // All-ones float exponent
+
+ UInt4 expmant = As<UInt4>(halfBits) & UInt4(mask_nosign);
+ dst = As<Float4>(As<UInt4>(As<Float4>(expmant << 13) * As<Float4>(UInt4(magic))) |
+ ((As<UInt4>(halfBits) ^ UInt4(expmant)) << 16) | // Remaining (sign) bits moved up by 16
+ (CmpNLE(As<UInt4>(expmant), UInt4(was_infnan)) & UInt4(exp_infnan))); // Force the inf/NaN exponent where the half was inf/NaN
+ }
+
+ void ShaderCore::packHalf2x16(Vector4f &d, const Vector4f &s0)
+ { // Packs half(s0.x) into the low 16 bits and half(s0.y) into the high 16 bits of d.x; only d.x is written.
+ // half2 | half1
+ floatToHalfBits(d.x, s0.x, false);
+ floatToHalfBits(d.x, s0.y, true);
+ }
+
+ void ShaderCore::unpackHalf2x16(Vector4f &dst, const Vector4f &s0)
+ { // Expands the low 16 bits of s0.x into dst.x and the high 16 bits into dst.y; dst.z and dst.w are not written.
+ // half2 | half1
+ halfToFloatBits(dst.x, As<Float4>(As<UInt4>(s0.x) & UInt4(0x0000FFFF)));
+ halfToFloatBits(dst.y, As<Float4>((As<UInt4>(s0.x) & UInt4(0xFFFF0000)) >> 16));
+ }
+
+ // Converts s0.x and s0.y to signed normalized 16-bit values and packs them into d.x
+ // (s0.x in the low 16 bits, s0.y in the high 16 bits).
+ void ShaderCore::packSnorm2x16(Vector4f &d, const Vector4f &s0)
+ {
+     // round(clamp(c, -1.0, 1.0) * 32767.0)
+     Float4 scaledX = Round(Min(Max(s0.x, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f));
+     Float4 scaledY = Round(Min(Max(s0.y, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f));
+
+     Int4 lowBits = Int4(scaledX) & Int4(0xFFFF);
+     Int4 highBits = (Int4(scaledY) & Int4(0xFFFF)) << 16;
+
+     d.x = As<Float4>(lowBits | highBits);
+ }
+
+ // Converts s0.x and s0.y to unsigned normalized 16-bit values and packs them into d.x
+ // (s0.x in the low 16 bits, s0.y in the high 16 bits).
+ void ShaderCore::packUnorm2x16(Vector4f &d, const Vector4f &s0)
+ {
+     // round(clamp(c, 0.0, 1.0) * 65535.0)
+     Float4 scaledX = Round(Min(Max(s0.x, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f));
+     Float4 scaledY = Round(Min(Max(s0.y, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f));
+
+     Int4 lowBits = Int4(scaledX) & Int4(0xFFFF);
+     Int4 highBits = (Int4(scaledY) & Int4(0xFFFF)) << 16;
+
+     d.x = As<Float4>(lowBits | highBits);
+ }
+
+ // Unpacks the two signed normalized 16-bit values in s0.x: low half to dst.x, high half to dst.y.
+ void ShaderCore::unpackSnorm2x16(Vector4f &dst, const Vector4f &s0)
+ {
+     // clamp(f / 32767.0, -1.0, 1.0)
+     // Each 16-bit field is positioned in the top half of a signed 32-bit lane before the
+     // int-to-float conversion, hence the 1 / 0x7FFF0000 scale factor.
+     const float scale = 1.0f / float(0x7FFF0000);
+
+     Float4 low = Float4(As<Int4>((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16)) * Float4(scale);
+     dst.x = Min(Max(low, Float4(-1.0f)), Float4(1.0f));
+
+     Float4 high = Float4(As<Int4>(As<UInt4>(s0.x) & UInt4(0xFFFF0000))) * Float4(scale);
+     dst.y = Min(Max(high, Float4(-1.0f)), Float4(1.0f));
+ }
+
+ // Unpacks the two unsigned normalized 16-bit values in s0.x: low half to dst.x, high half to dst.y.
+ void ShaderCore::unpackUnorm2x16(Vector4f &dst, const Vector4f &s0)
+ {
+     // f / 65535.0
+     // Each 16-bit field is positioned in the top half of an unsigned 32-bit lane before the
+     // conversion to float, hence the 1 / 0xFFFF0000 scale factor.
+     const float scale = 1.0f / float(0xFFFF0000);
+
+     dst.x = Float4((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16) * Float4(scale);
+     dst.y = Float4(As<UInt4>(s0.x) & UInt4(0xFFFF0000)) * Float4(scale);
+ }
+
+ // 2x2 determinant of the x/y components of src0 and src1, broadcast to all four components of dst.
+ void ShaderCore::det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ {
+     Float4 determinant = src0.x * src1.y - src0.y * src1.x;
+
+     dst.x = determinant;
+     dst.w = determinant;
+     dst.z = determinant;
+     dst.y = determinant;
+ }
+
+ // Combines three vectors in two steps: crs() stores the cross product of src1 and src2 in
+ // dst.x/y/z, then dp3() is applied to dst and src0 with the result written back through dst.
+ // NOTE(review): src0 is only read after dst.x/y/z have been overwritten, so dst presumably
+ // must not alias src0 - confirm against callers.
+ void ShaderCore::det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
+ {
+ crs(dst, src1, src2);
+ dp3(dst, dst, src0);
+ }
+
+ // Evaluates the 4x4 determinant expression over src0..src3 and broadcasts the result to
+ // all four components of dst.
+ // dst.x, dst.y and dst.z are used as scratch storage first: they hold three 2x2
+ // sub-determinants built from the z/w components of src1, src2 and src3, which the
+ // src0.x and src0.y terms below reuse.
+ // NOTE(review): because dst is written before all sources are read, dst presumably must
+ // not alias any of the sources - confirm against callers.
+ void ShaderCore::det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3)
+ {
+ // Scratch 2x2 sub-determinants.
+ dst.x = src2.z * src3.w - src2.w * src3.z;
+ dst.y = src1.w * src3.z - src1.z * src3.w;
+ dst.z = src1.z * src2.w - src1.w * src2.z;
+ // The right-hand side still reads the scratch values in dst.x/y/z before dst.x is overwritten.
+ dst.x = src0.x * (src1.y * dst.x + src2.y * dst.y + src3.y * dst.z) -
+ src0.y * (src1.x * dst.x + src2.x * dst.y + src3.x * dst.z) +
+ src0.z * (src1.x * (src2.y * src3.w - src2.w * src3.y) +
+ src2.x * (src1.w * src3.y - src1.y * src3.w) +
+ src3.x * (src1.y * src2.w - src1.w * src2.y)) +
+ src0.w * (src1.x * (src2.z * src3.y - src2.y * src3.z) +
+ src2.x * (src1.y * src3.z - src1.z * src3.y) +
+ src3.x * (src1.z * src2.y - src1.y * src2.z));
+ // Broadcast the result.
+ dst.y = dst.z = dst.w = dst.x;
+ }
+
+ // Component-wise Frac(): dst = Frac(src) for x, y, z and w.
+ void ShaderCore::frc(Vector4f &dst, const Vector4f &src)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = Frac(src.*component[i]);
+     }
+ }
+
+ // Component-wise Trunc(): dst = Trunc(src) for x, y, z and w.
+ void ShaderCore::trunc(Vector4f &dst, const Vector4f &src)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = Trunc(src.*component[i]);
+     }
+ }
+
+ // Component-wise Floor(): dst = Floor(src) for x, y, z and w.
+ void ShaderCore::floor(Vector4f &dst, const Vector4f &src)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = Floor(src.*component[i]);
+     }
+ }
+
+ // Component-wise Round(): dst = Round(src) for x, y, z and w.
+ void ShaderCore::round(Vector4f &dst, const Vector4f &src)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = Round(src.*component[i]);
+     }
+ }
+
+ // Rounds each component of src to the nearest integer, resolving exact .5 ties towards the
+ // even neighbour, by correcting the result of round().
+ void ShaderCore::roundEven(Vector4f &dst, const Vector4f &src)
+ {
+     // dst = round(src) + ((round(src) < src) * 2 - 1) * (fract(src) == 0.5) * isOdd(round(src));
+     // ex.: 1.5: 2 + (0 * 2 - 1) * 1 * 0 = 2
+     // 2.5: 3 + (0 * 2 - 1) * 1 * 1 = 2
+     // -1.5: -2 + (1 * 2 - 1) * 1 * 0 = -2
+     // -2.5: -3 + (1 * 2 - 1) * 1 * 1 = -2
+     // Even if the round implementation rounds the other way:
+     // 1.5: 1 + (1 * 2 - 1) * 1 * 1 = 2
+     // 2.5: 2 + (1 * 2 - 1) * 1 * 0 = 2
+     // -1.5: -1 + (0 * 2 - 1) * 1 * 1 = -2
+     // -2.5: -2 + (0 * 2 - 1) * 1 * 0 = -2
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     round(dst, src);
+
+     for(int i = 0; i < 4; i++)
+     {
+         Float4 &rounded = dst.*component[i];
+         const Float4 &value = src.*component[i];
+
+         // +1 where round() went below the input, -1 otherwise.
+         Float4 direction = (Float4(CmpLT(rounded, value) & Int4(1)) * Float4(2.0f)) - Float4(1.0f);
+         // 1 only for exact ties.
+         Float4 isTie = Float4(CmpEQ(Frac(value), Float4(0.5f)) & Int4(1));
+         // 1 only when round() landed on an odd integer.
+         Float4 isOdd = Float4(Int4(rounded) & Int4(1));
+
+         rounded += direction * isTie * isOdd;
+     }
+ }
+
+ // Component-wise Ceil(): dst = Ceil(src) for x, y, z and w.
+ void ShaderCore::ceil(Vector4f &dst, const Vector4f &src)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = Ceil(src.*component[i]);
+     }
+ }
+
+ // Scalar power: evaluates power(src0.x, src1.x, pp) once and replicates it to all components of dst.
+ void ShaderCore::powx(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     Float4 result = power(src0.x, src1.x, pp);
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = result;
+     }
+ }
+
+ // Component-wise power: dst = power(src0, src1, pp) for x, y, z and w.
+ void ShaderCore::pow(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = power(src0.*component[i], src1.*component[i], pp);
+     }
+ }
+
+ // Cross product of the x/y/z components of src0 and src1, written to dst.x/y/z.
+ // dst.w is not written.
+ // Each component is stored before the next one is computed, so if dst aliased src0 or
+ // src1 the later components would read already-overwritten values.
+ void ShaderCore::crs(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ {
+ dst.x = src0.y * src1.z - src0.z * src1.y;
+ dst.y = src0.z * src1.x - src0.x * src1.z;
+ dst.z = src0.x * src1.y - src0.y * src1.x;
+ }
+
+ // One-component variant: dst.x is N.x with its sign bit toggled in lanes where
+ // Nref.x * I.x is not less than zero, and N.x unchanged elsewhere.
+ void ShaderCore::forward1(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
+ {
+     // Sign-bit mask, set only in the lanes that need flipping.
+     Int4 signToggle = CmpNLT(Nref.x * I.x, Float4(0.0f)) & Int4(0x80000000);
+
+     dst.x = As<Float4>(signToggle ^ As<Int4>(N.x));
+ }
+
+ // Two-component variant: dst.xy is N.xy with the sign bits toggled in lanes where
+ // dot2(Nref, I) is not less than zero, and N.xy unchanged elsewhere.
+ void ShaderCore::forward2(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
+ {
+     Float4 Vector4f::*const component[2] = {&Vector4f::x, &Vector4f::y};
+
+     // Sign-bit mask, set only in the lanes that need flipping.
+     Int4 signToggle = CmpNLT(dot2(Nref, I), Float4(0.0f)) & Int4(0x80000000);
+
+     for(int i = 0; i < 2; i++)
+     {
+         dst.*component[i] = As<Float4>(signToggle ^ As<Int4>(N.*component[i]));
+     }
+ }
+
+ // Three-component variant: dst.xyz is N.xyz with the sign bits toggled in lanes where
+ // dot3(Nref, I) is not less than zero, and N.xyz unchanged elsewhere.
+ void ShaderCore::forward3(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
+ {
+     Float4 Vector4f::*const component[3] = {&Vector4f::x, &Vector4f::y, &Vector4f::z};
+
+     // Sign-bit mask, set only in the lanes that need flipping.
+     Int4 signToggle = CmpNLT(dot3(Nref, I), Float4(0.0f)) & Int4(0x80000000);
+
+     for(int i = 0; i < 3; i++)
+     {
+         dst.*component[i] = As<Float4>(signToggle ^ As<Int4>(N.*component[i]));
+     }
+ }
+
+ // Four-component variant: dst is N with the sign bits toggled in lanes where
+ // dot4(Nref, I) is not less than zero, and N unchanged elsewhere.
+ void ShaderCore::forward4(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     // Sign-bit mask, set only in the lanes that need flipping.
+     Int4 signToggle = CmpNLT(dot4(Nref, I), Float4(0.0f)) & Int4(0x80000000);
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = As<Float4>(signToggle ^ As<Int4>(N.*component[i]));
+     }
+ }
+
+ // One-component reflection: dst.x = I.x - 2 * (N.x * I.x) * N.x.
+ void ShaderCore::reflect1(Vector4f &dst, const Vector4f &I, const Vector4f &N)
+ {
+     Float4 projection = N.x * I.x;
+     Float4 twiceProjection = Float4(2.0f) * projection;
+
+     dst.x = I.x - twiceProjection * N.x;
+ }
+
+ // Two-component reflection: dst.xy = I.xy - 2 * dot2(N, I) * N.xy.
+ void ShaderCore::reflect2(Vector4f &dst, const Vector4f &I, const Vector4f &N)
+ {
+     Float4 Vector4f::*const component[2] = {&Vector4f::x, &Vector4f::y};
+
+     Float4 projection = dot2(N, I);
+     Float4 twiceProjection = Float4(2.0f) * projection;
+
+     for(int i = 0; i < 2; i++)
+     {
+         dst.*component[i] = I.*component[i] - twiceProjection * N.*component[i];
+     }
+ }
+
+ // Three-component reflection: dst.xyz = I.xyz - 2 * dot3(N, I) * N.xyz.
+ void ShaderCore::reflect3(Vector4f &dst, const Vector4f &I, const Vector4f &N)
+ {
+     Float4 Vector4f::*const component[3] = {&Vector4f::x, &Vector4f::y, &Vector4f::z};
+
+     Float4 projection = dot3(N, I);
+     Float4 twiceProjection = Float4(2.0f) * projection;
+
+     for(int i = 0; i < 3; i++)
+     {
+         dst.*component[i] = I.*component[i] - twiceProjection * N.*component[i];
+     }
+ }
+
+ // Four-component reflection: dst = I - 2 * dot4(N, I) * N.
+ void ShaderCore::reflect4(Vector4f &dst, const Vector4f &I, const Vector4f &N)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     Float4 projection = dot4(N, I);
+     Float4 twiceProjection = Float4(2.0f) * projection;
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = I.*component[i] - twiceProjection * N.*component[i];
+     }
+ }
+
+ // One-component refraction. With d = N.x * I.x and k = 1 - eta^2 * (1 - d^2):
+ // dst.x = eta * I.x - (eta * d + sqrt(k)) * N.x where k is not less than zero, all-zero bits otherwise.
+ void ShaderCore::refract1(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
+ {
+     Float4 cosine = N.x * I.x;
+     Float4 discriminant = Float4(1.0f) - eta * eta * (Float4(1.0f) - cosine * cosine);
+     Int4 keep = CmpNLT(discriminant, Float4(0.0f));
+     Float4 normalScale = (eta * cosine + Sqrt(discriminant));
+
+     dst.x = As<Float4>(keep & As<Int4>(eta * I.x - normalScale * N.x));
+ }
+
+ // Two-component refraction. With d = dot2(N, I) and k = 1 - eta^2 * (1 - d^2):
+ // dst.xy = eta * I.xy - (eta * d + sqrt(k)) * N.xy where k is not less than zero, all-zero bits otherwise.
+ void ShaderCore::refract2(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
+ {
+     Float4 Vector4f::*const component[2] = {&Vector4f::x, &Vector4f::y};
+
+     Float4 cosine = dot2(N, I);
+     Float4 discriminant = Float4(1.0f) - eta * eta * (Float4(1.0f) - cosine * cosine);
+     Int4 keep = CmpNLT(discriminant, Float4(0.0f));
+     Float4 normalScale = (eta * cosine + Sqrt(discriminant));
+
+     for(int i = 0; i < 2; i++)
+     {
+         dst.*component[i] = As<Float4>(keep & As<Int4>(eta * I.*component[i] - normalScale * N.*component[i]));
+     }
+ }
+
+ // Three-component refraction. With d = dot3(N, I) and k = 1 - eta^2 * (1 - d^2):
+ // dst.xyz = eta * I.xyz - (eta * d + sqrt(k)) * N.xyz where k is not less than zero, all-zero bits otherwise.
+ void ShaderCore::refract3(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
+ {
+     Float4 Vector4f::*const component[3] = {&Vector4f::x, &Vector4f::y, &Vector4f::z};
+
+     Float4 cosine = dot3(N, I);
+     Float4 discriminant = Float4(1.0f) - eta * eta * (Float4(1.0f) - cosine * cosine);
+     Int4 keep = CmpNLT(discriminant, Float4(0.0f));
+     Float4 normalScale = (eta * cosine + Sqrt(discriminant));
+
+     for(int i = 0; i < 3; i++)
+     {
+         dst.*component[i] = As<Float4>(keep & As<Int4>(eta * I.*component[i] - normalScale * N.*component[i]));
+     }
+ }
+
+ // Four-component refraction. With d = dot4(N, I) and k = 1 - eta^2 * (1 - d^2):
+ // dst = eta * I - (eta * d + sqrt(k)) * N where k is not less than zero, all-zero bits otherwise.
+ void ShaderCore::refract4(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     Float4 cosine = dot4(N, I);
+     Float4 discriminant = Float4(1.0f) - eta * eta * (Float4(1.0f) - cosine * cosine);
+     Int4 keep = CmpNLT(discriminant, Float4(0.0f));
+     Float4 normalScale = (eta * cosine + Sqrt(discriminant));
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = As<Float4>(keep & As<Int4>(eta * I.*component[i] - normalScale * N.*component[i]));
+     }
+ }
+
+ // Applies the single-register sgn() to each of the four components.
+ void ShaderCore::sgn(Vector4f &dst, const Vector4f &src)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         sgn(dst.*component[i], src.*component[i]);
+     }
+ }
+
+ // Applies the single-register isgn() to each of the four components.
+ void ShaderCore::isgn(Vector4f &dst, const Vector4f &src)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         isgn(dst.*component[i], src.*component[i]);
+     }
+ }
+
+ // Component-wise floating-point Abs(): dst = Abs(src) for x, y, z and w.
+ void ShaderCore::abs(Vector4f &dst, const Vector4f &src)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = Abs(src.*component[i]);
+     }
+ }
+
+ // Component-wise integer Abs(): each component's bits are treated as Int4, and the
+ // result is stored back as raw bits.
+ void ShaderCore::iabs(Vector4f &dst, const Vector4f &src)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = As<Float4>(Abs(As<Int4>(src.*component[i])));
+     }
+ }
+
+ // Scales src by the reciprocal square root of dot2(src, src).
+ // All four components are scaled, including z and w.
+ void ShaderCore::nrm2(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     Float4 squaredLength = dot2(src, src);
+     Float4 scale = reciprocalSquareRoot(squaredLength, false, pp);
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = src.*component[i] * scale;
+     }
+ }
+
+ // Scales src by the reciprocal square root of dot3(src, src).
+ // All four components are scaled, including w.
+ void ShaderCore::nrm3(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     Float4 squaredLength = dot3(src, src);
+     Float4 scale = reciprocalSquareRoot(squaredLength, false, pp);
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = src.*component[i] * scale;
+     }
+ }
+
+ // Scales all four components of src by the reciprocal square root of dot4(src, src).
+ void ShaderCore::nrm4(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     Float4 squaredLength = dot4(src, src);
+     Float4 scale = reciprocalSquareRoot(squaredLength, false, pp);
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = src.*component[i] * scale;
+     }
+ }
+
+ // Computes cosine and sine of src.x only: dst.x receives cosine_pi(src.x) and dst.y
+ // receives sine_pi(src.x). dst.z and dst.w are not written.
+ // The *_pi helpers are declared as limited to the [-pi, pi] range.
+ // src.x is read again after dst.x has been stored.
+ void ShaderCore::sincos(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+ dst.x = cosine_pi(src.x, pp);
+ dst.y = sine_pi(src.x, pp);
+ }
+
+ // Component-wise cosine(); pp is forwarded unchanged.
+ void ShaderCore::cos(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = cosine(src.*component[i], pp);
+     }
+ }
+
+ // Component-wise sine(); pp is forwarded unchanged.
+ void ShaderCore::sin(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = sine(src.*component[i], pp);
+     }
+ }
+
+ // Component-wise tangent(); pp is forwarded unchanged.
+ void ShaderCore::tan(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = tangent(src.*component[i], pp);
+     }
+ }
+
+ // Component-wise arccos(); pp is forwarded unchanged.
+ void ShaderCore::acos(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = arccos(src.*component[i], pp);
+     }
+ }
+
+ // Component-wise arcsin(); pp is forwarded unchanged.
+ void ShaderCore::asin(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = arcsin(src.*component[i], pp);
+     }
+ }
+
+ // Component-wise single-argument arctan(); pp is forwarded unchanged.
+ void ShaderCore::atan(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = arctan(src.*component[i], pp);
+     }
+ }
+
+ // Component-wise two-argument arctan(): src0 is passed as the first argument and src1
+ // as the second; pp is forwarded unchanged.
+ void ShaderCore::atan2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = arctan(src0.*component[i], src1.*component[i], pp);
+     }
+ }
+
+ // Component-wise cosineh(); pp is forwarded unchanged.
+ void ShaderCore::cosh(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = cosineh(src.*component[i], pp);
+     }
+ }
+
+ // Component-wise sineh(); pp is forwarded unchanged.
+ void ShaderCore::sinh(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = sineh(src.*component[i], pp);
+     }
+ }
+
+ // Component-wise tangenth(); pp is forwarded unchanged.
+ void ShaderCore::tanh(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = tangenth(src.*component[i], pp);
+     }
+ }
+
+ // Component-wise arccosh(); pp is forwarded unchanged.
+ void ShaderCore::acosh(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = arccosh(src.*component[i], pp);
+     }
+ }
+
+ // Component-wise arcsinh(); pp is forwarded unchanged.
+ void ShaderCore::asinh(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = arcsinh(src.*component[i], pp);
+     }
+ }
+
+ // Component-wise arctanh(); pp is forwarded unchanged.
+ void ShaderCore::atanh(Vector4f &dst, const Vector4f &src, bool pp)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = arctanh(src.*component[i], pp);
+     }
+ }
+
+ // Base-2 exponential with behavior selected by shaderModel.
+ // For shaderModel >= 0x0200 this forwards to exp2x(). For lower values only src.x is read:
+ //   dst.x = exponential2(src.x - Frac(src.x)), dst.y = Frac(src.x),
+ //   dst.z = exponential2(src.x), dst.w = 1.0.
+ void ShaderCore::expp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel)
+ {
+     if(shaderModel >= 0x0200) // Version >= 2.0
+     {
+         exp2x(dst, src, true); // FIXME: 10-bit precision suffices
+         return;
+     }
+
+     Float4 fraction = Frac(src.x);
+     Float4 whole = src.x - fraction;
+
+     dst.x = exponential2(whole, true);
+     dst.y = fraction;
+     dst.z = exponential2(src.x, true);
+     dst.w = Float4(1.0f);
+ }
+
+ // Base-2 logarithm with behavior selected by shaderModel.
+ // For shaderModel >= 0x0200 this forwards to log2x(). For lower values only src.x is read:
+ //   dst.x = exponent field of |src.x| with the bias of 127 removed, converted to float
+ //   dst.y = mantissa bits of |src.x| combined with the bit pattern of 1.0f
+ //   dst.z = logarithm2(src.x, true, true)
+ //   dst.w = 1.0
+ void ShaderCore::logp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel)
+ {
+     if(shaderModel < 0x0200)
+     {
+         // Copy taken up front, so the X and Y results do not depend on later writes to dst.
+         Float4 magnitude = Abs(src.x);
+
+         // X component
+         Int4 exponent = As<Int4>(As<UInt4>(magnitude) >> 23) - Int4(127);
+         dst.x = Float4(exponent);
+
+         // Y component
+         dst.y = As<Float4>((As<Int4>(magnitude) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
+
+         // Z component
+         dst.z = logarithm2(src.x, true, true);
+
+         // W component
+         dst.w = Float4(1.0f);
+     }
+     else
+     {
+         log2x(dst, src, true);
+     }
+ }
+
+ // Applies the single-register cmp0() to each of the four components.
+ void ShaderCore::cmp0(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         cmp0(dst.*component[i], src0.*component[i], src1.*component[i], src2.*component[i]);
+     }
+ }
+
+ // Component-wise select: the bits of src0 are used as the mask that chooses between
+ // src1 (mask bits set) and src2 (mask bits clear).
+ void ShaderCore::select(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         select(dst.*component[i], As<Int4>(src0.*component[i]), src1.*component[i], src2.*component[i]);
+     }
+ }
+
+ // Per-lane dynamic component read: the bits of src1 are interpreted as an integer index.
+ // Index 1, 2 or 3 picks src0.y, src0.z or src0.w; any other index yields src0.x.
+ void ShaderCore::extract(Float4 &dst, const Vector4f &src0, const Float4 &src1)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     // The first step also establishes src0.x as the fallback.
+     select(dst, CmpEQ(As<Int4>(src1), Int4(1)), src0.y, src0.x);
+
+     for(int i = 2; i < 4; i++)
+     {
+         select(dst, CmpEQ(As<Int4>(src1), Int4(i)), src0.*component[i], dst);
+     }
+ }
+
+ // Per-lane dynamic component write: dst is src with the component whose number equals
+ // the integer bits of index replaced by element. Other components are copied from src.
+ void ShaderCore::insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         select(dst.*component[i], CmpEQ(As<Int4>(index), Int4(i)), element, src.*component[i]);
+     }
+ }
+
+ // Floating-point sign: lanes where src compares less than zero get -1.0f, lanes where
+ // src compares not-less-or-equal to zero get 1.0f, and remaining lanes get all-zero bits.
+ void ShaderCore::sgn(Float4 &dst, const Float4 &src)
+ {
+     Int4 negative = As<Int4>(CmpLT(src, Float4(-0.0f)));
+     Int4 minusOne = As<Int4>(Float4(-1.0f));
+
+     Int4 positive = As<Int4>(CmpNLE(src, Float4(+0.0f)));
+     Int4 plusOne = As<Int4>(Float4(1.0f));
+
+     dst = As<Float4>((negative & minusOne) | (positive & plusOne));
+ }
+
+ // Integer sign: the bits of src are treated as Int4. Negative lanes produce the integer -1,
+ // positive lanes the integer 1, and zero lanes 0; the result is stored as raw bits.
+ void ShaderCore::isgn(Float4 &dst, const Float4 &src)
+ {
+     Int4 value = As<Int4>(src);
+
+     Int4 negative = CmpLT(value, Int4(0)) & Int4(-1);
+     Int4 positive = CmpNLE(value, Int4(0)) & Int4(1);
+
+     dst = As<Float4>(negative | positive);
+ }
+
+ // dst = src1 in lanes where 0.0 <= src0, and src2 in the other lanes.
+ void ShaderCore::cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)
+ {
+     select(dst, CmpLE(Float4(0.0f), src0), src1, src2);
+ }
+
+ // dst = src1 in lanes where the integer bits of src0 equal zero, and src2 in the other lanes.
+ void ShaderCore::cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)
+ {
+     select(dst, CmpEQ(Int4(0), As<Int4>(src0)), src1, src2);
+ }
+
+ // Bitwise blend: takes the bits of src1 where the mask src0 is set and the bits of src2
+ // where it is clear. Both sources are read before dst is written.
+ void ShaderCore::select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2)
+ {
+     // FIXME: LLVM vector select
+     Int4 whenSet = src0 & As<Int4>(src1);
+     Int4 whenClear = ~src0 & As<Int4>(src2);
+
+     dst = As<Float4>(whenSet | whenClear);
+ }
+
+ // Component-wise floating-point comparison of src0 against src1, selected by control.
+ // Each component of dst receives the raw comparison mask. An unrecognized control value
+ // triggers ASSERT(false) and leaves dst untouched.
+ void ShaderCore::cmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     switch(control)
+     {
+     case Shader::CONTROL_GT:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpNLE(src0.*component[i], src1.*component[i]));
+         }
+         break;
+     case Shader::CONTROL_EQ:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpEQ(src0.*component[i], src1.*component[i]));
+         }
+         break;
+     case Shader::CONTROL_GE:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpNLT(src0.*component[i], src1.*component[i]));
+         }
+         break;
+     case Shader::CONTROL_LT:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpLT(src0.*component[i], src1.*component[i]));
+         }
+         break;
+     case Shader::CONTROL_NE:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpNEQ(src0.*component[i], src1.*component[i]));
+         }
+         break;
+     case Shader::CONTROL_LE:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpLE(src0.*component[i], src1.*component[i]));
+         }
+         break;
+     default:
+         ASSERT(false);
+     }
+ }
+
+ // Component-wise signed integer comparison: the bits of src0 and src1 are compared as
+ // Int4 according to control, and each component of dst receives the raw comparison mask.
+ // An unrecognized control value triggers ASSERT(false) and leaves dst untouched.
+ void ShaderCore::icmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     switch(control)
+     {
+     case Shader::CONTROL_GT:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpNLE(As<Int4>(src0.*component[i]), As<Int4>(src1.*component[i])));
+         }
+         break;
+     case Shader::CONTROL_EQ:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpEQ(As<Int4>(src0.*component[i]), As<Int4>(src1.*component[i])));
+         }
+         break;
+     case Shader::CONTROL_GE:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpNLT(As<Int4>(src0.*component[i]), As<Int4>(src1.*component[i])));
+         }
+         break;
+     case Shader::CONTROL_LT:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpLT(As<Int4>(src0.*component[i]), As<Int4>(src1.*component[i])));
+         }
+         break;
+     case Shader::CONTROL_NE:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpNEQ(As<Int4>(src0.*component[i]), As<Int4>(src1.*component[i])));
+         }
+         break;
+     case Shader::CONTROL_LE:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpLE(As<Int4>(src0.*component[i]), As<Int4>(src1.*component[i])));
+         }
+         break;
+     default:
+         ASSERT(false);
+     }
+ }
+
+ // Component-wise unsigned integer comparison: the bits of src0 and src1 are compared as
+ // UInt4 according to control, and each component of dst receives the raw comparison mask.
+ // An unrecognized control value triggers ASSERT(false) and leaves dst untouched.
+ void ShaderCore::ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     switch(control)
+     {
+     case Shader::CONTROL_GT:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpNLE(As<UInt4>(src0.*component[i]), As<UInt4>(src1.*component[i])));
+         }
+         break;
+     case Shader::CONTROL_EQ:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpEQ(As<UInt4>(src0.*component[i]), As<UInt4>(src1.*component[i])));
+         }
+         break;
+     case Shader::CONTROL_GE:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpNLT(As<UInt4>(src0.*component[i]), As<UInt4>(src1.*component[i])));
+         }
+         break;
+     case Shader::CONTROL_LT:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpLT(As<UInt4>(src0.*component[i]), As<UInt4>(src1.*component[i])));
+         }
+         break;
+     case Shader::CONTROL_NE:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpNEQ(As<UInt4>(src0.*component[i]), As<UInt4>(src1.*component[i])));
+         }
+         break;
+     case Shader::CONTROL_LE:
+         for(int i = 0; i < 4; i++)
+         {
+             dst.*component[i] = As<Float4>(CmpLE(As<UInt4>(src0.*component[i]), As<UInt4>(src1.*component[i])));
+         }
+         break;
+     default:
+         ASSERT(false);
+     }
+ }
+
+ // Bitwise AND of the four components of src, treated as integer bits.
+ void ShaderCore::all(Float4 &dst, const Vector4f &src)
+ {
+     Int4 xy = As<Int4>(src.x) & As<Int4>(src.y);
+     Int4 zw = As<Int4>(src.z) & As<Int4>(src.w);
+
+     dst = As<Float4>(xy & zw);
+ }
+
+ // Bitwise OR of the four components of src, treated as integer bits.
+ void ShaderCore::any(Float4 &dst, const Vector4f &src)
+ {
+     Int4 xy = As<Int4>(src.x) | As<Int4>(src.y);
+     Int4 zw = As<Int4>(src.z) | As<Int4>(src.w);
+
+     dst = As<Float4>(xy | zw);
+ }
+
+ // Component-wise bit inversion, implemented as XOR with an all-ones mask.
+ void ShaderCore::bitwise_not(Vector4f &dst, const Vector4f &src)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = As<Float4>(As<Int4>(src.*component[i]) ^ Int4(0xFFFFFFFF));
+     }
+ }
+
+ // Component-wise bitwise OR of the integer bits of src0 and src1.
+ void ShaderCore::bitwise_or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = As<Float4>(As<Int4>(src0.*component[i]) | As<Int4>(src1.*component[i]));
+     }
+ }
+
+ // Component-wise bitwise XOR of the integer bits of src0 and src1.
+ void ShaderCore::bitwise_xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = As<Float4>(As<Int4>(src0.*component[i]) ^ As<Int4>(src1.*component[i]));
+     }
+ }
+
+ // Component-wise bitwise AND of the integer bits of src0 and src1.
+ void ShaderCore::bitwise_and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ {
+     Float4 Vector4f::*const component[4] = {&Vector4f::x, &Vector4f::y, &Vector4f::z, &Vector4f::w};
+
+     for(int i = 0; i < 4; i++)
+     {
+         dst.*component[i] = As<Float4>(As<Int4>(src0.*component[i]) & As<Int4>(src1.*component[i]));
+     }
+ }
+
+ // Whole-vector equality on raw bits: the mask is set only in lanes where all four
+ // components of src0 and src1 have identical bit patterns. It is replicated to every
+ // component of dst.
+ void ShaderCore::equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ {
+     UInt4 xyEqual = CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) &
+                     CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y));
+     UInt4 zwEqual = CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) &
+                     CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w));
+
+     dst.x = As<Float4>(xyEqual & zwEqual);
+     dst.y = dst.x;
+     dst.z = dst.x;
+     dst.w = dst.x;
+ }
+
+ // Whole-vector inequality on raw bits: the mask is set in lanes where at least one
+ // component of src0 and src1 differs in its bit pattern. It is replicated to every
+ // component of dst.
+ void ShaderCore::notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+ {
+     UInt4 xyDiffer = CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) |
+                      CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y));
+     UInt4 zwDiffer = CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) |
+                      CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w));
+
+     dst.x = As<Float4>(xyDiffer | zwDiffer);
+     dst.y = dst.x;
+     dst.z = dst.x;
+     dst.w = dst.x;
+ }
+}
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
new file mode 100644
index 0000000..4dc109f
--- /dev/null
+++ b/src/Pipeline/ShaderCore.hpp
@@ -0,0 +1,382 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_ShaderCore_hpp
+#define sw_ShaderCore_hpp
+
+#include "Shader.hpp"
+#include "Reactor/Reactor.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+ // Four-component vector whose components x, y, z and w are each a Reactor Short4.
+ // Member functions are only declared here; their definitions are outside this header.
+ class Vector4s
+ {
+ public:
+ Vector4s();
+ // NOTE(review): presumably initializes each component from the matching scalar - confirm in the definition.
+ Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
+ Vector4s(const Vector4s &rhs);
+
+ // NOTE(review): presumably maps i = 0..3 to x, y, z, w - confirm in the definition.
+ Short4 &operator[](int i);
+ Vector4s &operator=(const Vector4s &rhs);
+
+ Short4 x;
+ Short4 y;
+ Short4 z;
+ Short4 w;
+ };
+
+ // Four-component vector whose components x, y, z and w are each a Reactor Float4.
+ // This is the operand type of the ShaderCore operations.
+ // Member functions are only declared here; their definitions are outside this header.
+ class Vector4f
+ {
+ public:
+ Vector4f();
+ // NOTE(review): presumably initializes each component from the matching scalar - confirm in the definition.
+ Vector4f(float x, float y, float z, float w);
+ Vector4f(const Vector4f &rhs);
+
+ // Non-const only, so it cannot be used on a const Vector4f.
+ // NOTE(review): presumably maps i = 0..3 to x, y, z, w - confirm in the definition.
+ Float4 &operator[](int i);
+ Vector4f &operator=(const Vector4f &rhs);
+
+ Float4 x;
+ Float4 y;
+ Float4 z;
+ Float4 w;
+ };
+
+ // Free-standing math helpers on Float4 values, called by the ShaderCore operations.
+ // NOTE(review): 'pp' presumably requests a reduced ("partial") precision variant and 'abs'
+ // presumably makes the function operate on the absolute value of x - confirm against the
+ // definitions, which are not part of this header.
+ Float4 exponential2(RValue<Float4> x, bool pp = false);
+ Float4 logarithm2(RValue<Float4> x, bool abs, bool pp = false);
+ Float4 exponential(RValue<Float4> x, bool pp = false);
+ Float4 logarithm(RValue<Float4> x, bool abs, bool pp = false);
+ Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp = false);
+ Float4 reciprocal(RValue<Float4> x, bool pp = false, bool finite = false, bool exactAtPow2 = false);
+ Float4 reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);
+ Float4 modulo(RValue<Float4> x, RValue<Float4> y);
+ Float4 sine_pi(RValue<Float4> x, bool pp = false); // limited to [-pi, pi] range
+ Float4 cosine_pi(RValue<Float4> x, bool pp = false); // limited to [-pi, pi] range
+ Float4 sine(RValue<Float4> x, bool pp = false);
+ Float4 cosine(RValue<Float4> x, bool pp = false);
+ Float4 tangent(RValue<Float4> x, bool pp = false);
+ Float4 arccos(RValue<Float4> x, bool pp = false);
+ Float4 arcsin(RValue<Float4> x, bool pp = false);
+ Float4 arctan(RValue<Float4> x, bool pp = false);
+ // Two-argument form; note the (y, x) parameter order.
+ Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp = false);
+ Float4 sineh(RValue<Float4> x, bool pp = false);
+ Float4 cosineh(RValue<Float4> x, bool pp = false);
+ Float4 tangenth(RValue<Float4> x, bool pp = false);
+ Float4 arccosh(RValue<Float4> x, bool pp = false); // Limited to x >= 1
+ Float4 arcsinh(RValue<Float4> x, bool pp = false);
+ Float4 arctanh(RValue<Float4> x, bool pp = false); // Limited to ]-1, 1[ range
+
+ // Helpers that reduce two Vector4f operands to a single Float4.
+ // NOTE(review): presumably the dot product over the first 2, 3 or 4 components
+ // respectively - the definitions are not part of this header.
+ Float4 dot2(const Vector4f &v0, const Vector4f &v1);
+ Float4 dot3(const Vector4f &v0, const Vector4f &v1);
+ Float4 dot4(const Vector4f &v0, const Vector4f &v1);
+
+ // In-place transpose helpers operating on four row registers passed by reference.
+ // NOTE(review): the NxM suffixes presumably name how many rows/columns take part, and
+ // transpose4xN presumably dispatches on N - confirm against the definitions, which are
+ // not part of this header.
+ void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
+ void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
+ void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+ void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+ void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+ void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+ void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+ void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);
+
+ // A four-component register made of Reference<Float4> members. RegisterFile builds these
+ // from elements of its component arrays.
+ class Register
+ {
+ public:
+     Register(const Reference<Float4> &x, const Reference<Float4> &y, const Reference<Float4> &z, const Reference<Float4> &w) : x(x), y(y), z(z), w(w)
+     {
+     }
+
+     // Component access by index: 1, 2 and 3 select y, z and w; every other index selects x.
+     Reference<Float4> &operator[](int i)
+     {
+         if(i == 1) return y;
+         if(i == 2) return z;
+         if(i == 3) return w;
+
+         return x;
+     }
+
+     // Assigns component by component from another register.
+     Register &operator=(const Register &rhs)
+     {
+         x = rhs.x;
+         y = rhs.y;
+         z = rhs.z;
+         w = rhs.w;
+
+         return *this;
+     }
+
+     // Assigns component by component from a vector.
+     Register &operator=(const Vector4f &rhs)
+     {
+         x = rhs.x;
+         y = rhs.y;
+         z = rhs.z;
+         w = rhs.w;
+
+         return *this;
+     }
+
+     // Produces a Vector4f holding the current values of the four components.
+     operator Vector4f()
+     {
+         Vector4f result;
+
+         result.x = x;
+         result.y = y;
+         result.z = z;
+         result.w = w;
+
+         return result;
+     }
+
+     Reference<Float4> x;
+     Reference<Float4> y;
+     Reference<Float4> z;
+     Reference<Float4> w;
+ };
+
+ // Storage for 'size' four-component registers, kept as separate x, y, z and w arrays.
+ //
+ // Two layouts exist, chosen at construction:
+ //  - indirectAddressable: each component is a single Array<Float4> constructed with 'size',
+ //    and register i is element i of that array. This layout also accepts a run-time
+ //    RValue<Int> index.
+ //  - otherwise: each component is a C++ array of 'size' default-constructed Array<Float4>
+ //    objects, and register i is element 0 of the i-th object.
+ //
+ // The object owns its arrays through raw pointers and releases them in the destructor,
+ // so it must not be copied (see the private section).
+ class RegisterFile
+ {
+ public:
+     RegisterFile(int size, bool indirectAddressable) : size(size), indirectAddressable(indirectAddressable)
+     {
+         if(indirectAddressable)
+         {
+             x = new Array<Float4>(size);
+             y = new Array<Float4>(size);
+             z = new Array<Float4>(size);
+             w = new Array<Float4>(size);
+         }
+         else
+         {
+             x = new Array<Float4>[size];
+             y = new Array<Float4>[size];
+             z = new Array<Float4>[size];
+             w = new Array<Float4>[size];
+         }
+     }
+
+     // Must mirror the constructor: scalar delete for the single-array layout,
+     // array delete for the array-of-arrays layout.
+     ~RegisterFile()
+     {
+         if(indirectAddressable)
+         {
+             delete x;
+             delete y;
+             delete z;
+             delete w;
+         }
+         else
+         {
+             delete[] x;
+             delete[] y;
+             delete[] z;
+             delete[] w;
+         }
+     }
+
+     // Register i, addressed with a constant index; works for both layouts.
+     Register operator[](int i)
+     {
+         if(indirectAddressable)
+         {
+             return Register(x[0][i], y[0][i], z[0][i], w[0][i]);
+         }
+         else
+         {
+             return Register(x[i][0], y[i][0], z[i][0], w[i][0]);
+         }
+     }
+
+     // Register i, addressed with a run-time index; asserts the indirectly addressable layout.
+     Register operator[](RValue<Int> i)
+     {
+         ASSERT(indirectAddressable);
+
+         return Register(x[0][i], y[0][i], z[0][i], w[0][i]);
+     }
+
+     const Vector4f operator[](RValue<Int4> i); // Gather operation (read only).
+
+     void scatter_x(Int4 i, RValue<Float4> r);
+     void scatter_y(Int4 i, RValue<Float4> r);
+     void scatter_z(Int4 i, RValue<Float4> r);
+     void scatter_w(Int4 i, RValue<Float4> r);
+
+ protected:
+     const int size;
+     const bool indirectAddressable;
+     Array<Float4> *x;
+     Array<Float4> *y;
+     Array<Float4> *z;
+     Array<Float4> *w;
+
+ private:
+     // A member-wise copy would leave two objects deleting the same arrays, so copying is
+     // disabled: these are declared but intentionally never defined.
+     RegisterFile(const RegisterFile &);
+     RegisterFile &operator=(const RegisterFile &);
+ };
+
+ // Convenience RegisterFile whose size is the template argument S.
+ // The template argument I only supplies the default for the constructor's
+ // indirectAddressable parameter; a caller can still pass a different value.
+ template<int S, bool I = false>
+ class RegisterArray : public RegisterFile
+ {
+ public:
+ RegisterArray(bool indirectAddressable = I) : RegisterFile(S, indirectAddressable)
+ {
+ }
+ };
+
+ class ShaderCore // Declares per-instruction helpers; every one returns void and writes its result through the first (dst) parameter
+ {
+ typedef Shader::Control Control; // Local alias for the 'control' argument taken by cmp/icmp/ucmp
+
+ public:
+ void mov(Vector4f &dst, const Vector4f &src, bool integerDestination = false);
+ void neg(Vector4f &dst, const Vector4f &src);
+ void ineg(Vector4f &dst, const Vector4f &src);
+ void f2b(Vector4f &dst, const Vector4f &src);
+ void b2f(Vector4f &dst, const Vector4f &src);
+ void f2i(Vector4f &dst, const Vector4f &src);
+ void i2f(Vector4f &dst, const Vector4f &src);
+ void f2u(Vector4f &dst, const Vector4f &src);
+ void u2f(Vector4f &dst, const Vector4f &src);
+ void i2b(Vector4f &dst, const Vector4f &src);
+ void b2i(Vector4f &dst, const Vector4f &src);
+ void add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+ void imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+ void mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void rcpx(Vector4f &dst, const Vector4f &src, bool pp = false); // pp defaults to false wherever it appears; presumably "partial precision" - confirm against the definitions
+ void div(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void rsqx(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void sqrt(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void rsq(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void len2(Float4 &dst, const Vector4f &src, bool pp = false); // len*/dist* write a single Float4 instead of a Vector4f
+ void len3(Float4 &dst, const Vector4f &src, bool pp = false);
+ void len4(Float4 &dst, const Vector4f &src, bool pp = false);
+ void dist1(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+ void dist2(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+ void dist3(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+ void dist4(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+ void dp1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void dp2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void dp2add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+ void dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+ void det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3);
+ void min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void step(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void exp2x(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void exp2(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void exp(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void log2x(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void log2(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void log(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void lit(Vector4f &dst, const Vector4f &src);
+ void att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+ void isinf(Vector4f &dst, const Vector4f &src);
+ void isnan(Vector4f &dst, const Vector4f &src);
+ void smooth(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+ void packHalf2x16(Vector4f &dst, const Vector4f &src);
+ void unpackHalf2x16(Vector4f &dst, const Vector4f &src);
+ void packSnorm2x16(Vector4f &dst, const Vector4f &src);
+ void packUnorm2x16(Vector4f &dst, const Vector4f &src);
+ void unpackSnorm2x16(Vector4f &dst, const Vector4f &src);
+ void unpackUnorm2x16(Vector4f &dst, const Vector4f &src);
+ void frc(Vector4f &dst, const Vector4f &src);
+ void trunc(Vector4f &dst, const Vector4f &src);
+ void floor(Vector4f &dst, const Vector4f &src);
+ void round(Vector4f &dst, const Vector4f &src);
+ void roundEven(Vector4f &dst, const Vector4f &src);
+ void ceil(Vector4f &dst, const Vector4f &src);
+ void powx(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+ void pow(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+ void crs(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void forward1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+ void forward2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+ void forward3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+ void forward4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+ void reflect1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void reflect2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void reflect3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void reflect4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void refract1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Float4 &src2); // refract* take their third source as a single Float4
+ void refract2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Float4 &src2);
+ void refract3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Float4 &src2);
+ void refract4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Float4 &src2);
+ void sgn(Vector4f &dst, const Vector4f &src);
+ void isgn(Vector4f &dst, const Vector4f &src);
+ void abs(Vector4f &dst, const Vector4f &src);
+ void iabs(Vector4f &dst, const Vector4f &src);
+ void nrm2(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void nrm3(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void nrm4(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void sincos(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void cos(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void sin(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void tan(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void acos(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void asin(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void atan(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void atan2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+ void cosh(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void sinh(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void tanh(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void acosh(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void asinh(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void atanh(Vector4f &dst, const Vector4f &src, bool pp = false);
+ void expp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel); // expp/logp take a shaderModel value instead of a pp flag
+ void logp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel);
+ void cmp0(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+ void cmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control);
+ void icmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control);
+ void ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control);
+ void select(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+ void extract(Float4 &dst, const Vector4f &src0, const Float4 &src1);
+ void insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index);
+ void all(Float4 &dst, const Vector4f &src); // all/any take a Vector4f and write a single Float4
+ void any(Float4 &dst, const Vector4f &src);
+ void bitwise_not(Vector4f &dst, const Vector4f &src);
+ void bitwise_or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void bitwise_xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void bitwise_and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+ void notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+
+ private: // Float4-level overloads and helpers, not callable from outside the class
+ void sgn(Float4 &dst, const Float4 &src);
+ void isgn(Float4 &dst, const Float4 &src);
+ void cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2);
+ void cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2);
+ void select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2);
+ void floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits);
+ void halfToFloatBits(Float4& dst, const Float4& halfBits);
+ };
+}
+
+#endif // sw_ShaderCore_hpp
diff --git a/src/Pipeline/VertexPipeline.cpp b/src/Pipeline/VertexPipeline.cpp
new file mode 100644
index 0000000..129d8a8
--- /dev/null
+++ b/src/Pipeline/VertexPipeline.cpp
@@ -0,0 +1,953 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "VertexPipeline.hpp"
+
+#include "Renderer/Vertex.hpp"
+#include "Renderer/Renderer.hpp"
+#include "Common/Debug.hpp"
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#undef max
+#undef min
+
+namespace sw
+{
+ extern bool secondaryColor; // Defined outside this file; pipeline() adds specular light to o[C1] when set, to o[C0] otherwise
+
+ // Forwards the processor state to VertexRoutine, passing 0 as its second argument.
+ VertexPipeline::VertexPipeline(const VertexProcessor::State &state) : VertexRoutine(state, 0)
+ {}
+
+ // VertexPipeline adds no members of its own (see VertexPipeline.hpp), so there is
+ // nothing to release here; the out-of-line definition is kept for the virtual destructor.
+ VertexPipeline::~VertexPipeline() = default;
+
+ Vector4f VertexPipeline::transformBlend(const Register &src, const Pointer<Byte> &matrix, bool homogeneous) // Transforms src by 'matrix', or by a weighted blend of up to four matrices when state.vertexBlendMatrixCount > 0
+ {
+ Vector4f dst;
+
+ if(state.vertexBlendMatrixCount == 0) // No blending requested: plain single-matrix transform
+ {
+ dst = transform(src, matrix, homogeneous);
+ }
+ else
+ {
+ UInt index0[4]; // indexN[k]: byte offset added to 'matrix' to reach the N-th blend matrix for component k (x, y, z, w)
+ UInt index1[4];
+ UInt index2[4];
+ UInt index3[4];
+
+ if(state.indexedVertexBlendEnable)
+ {
+ for(int i = 0; i < 4; i++)
+ {
+ Float4 B = v[BlendIndices].x;
+ UInt indices;
+
+ switch(i) // Take component i of B and reinterpret its bits as an unsigned integer
+ {
+ case 0: indices = As<UInt>(Float(B.x)); break;
+ case 1: indices = As<UInt>(Float(B.y)); break;
+ case 2: indices = As<UInt>(Float(B.z)); break;
+ case 3: indices = As<UInt>(Float(B.w)); break;
+ }
+
+ index0[i] = (indices & 0x000000FF) << 6; // Bits 0-7 times 64
+ index1[i] = (indices & 0x0000FF00) >> 2; // Bits 8-15 times 64 (net effect of >> 8 then << 6)
+ index2[i] = (indices & 0x00FF0000) >> 10; // Bits 16-23 times 64
+ index3[i] = (indices & 0xFF000000) >> 18; // Bits 24-31 times 64
+ }
+ }
+ else
+ {
+ for(int i = 0; i < 4; i++) // Non-indexed blending: every component uses matrices 0..3 in order, 64 bytes apart
+ {
+ index0[i] = 0 * 64;
+ index1[i] = 1 * 64;
+ index2[i] = 2 * 64;
+ index3[i] = 3 * 64;
+ }
+ }
+
+ Float4 weight0;
+ Float4 weight1;
+ Float4 weight2;
+ Float4 weight3;
+
+ switch(state.vertexBlendMatrixCount) // Intentional fall-through: a count of N loads the first N - 1 weights; the last one is derived below
+ {
+ case 4: weight2 = v[BlendWeight].z;
+ case 3: weight1 = v[BlendWeight].y;
+ case 2: weight0 = v[BlendWeight].x;
+ case 1:
+ break;
+ }
+
+ if(state.vertexBlendMatrixCount == 1) // Single matrix selected through index0; no weights involved
+ {
+ dst = transform(src, matrix, index0, homogeneous);
+ }
+ else if(state.vertexBlendMatrixCount == 2)
+ {
+ weight1 = Float4(1.0f) - weight0; // Last weight is 1 minus the loaded one, so the weights sum to 1
+
+ Vector4f pos0;
+ Vector4f pos1;
+
+ pos0 = transform(src, matrix, index0, homogeneous);
+ pos1 = transform(src, matrix, index1, homogeneous);
+
+ dst.x = pos0.x * weight0 + pos1.x * weight1; // FIXME: Vector4f operators
+ dst.y = pos0.y * weight0 + pos1.y * weight1;
+ dst.z = pos0.z * weight0 + pos1.z * weight1;
+ dst.w = pos0.w * weight0 + pos1.w * weight1;
+ }
+ else if(state.vertexBlendMatrixCount == 3)
+ {
+ weight2 = Float4(1.0f) - (weight0 + weight1); // Last weight is 1 minus the sum of the loaded ones
+
+ Vector4f pos0;
+ Vector4f pos1;
+ Vector4f pos2;
+
+ pos0 = transform(src, matrix, index0, homogeneous);
+ pos1 = transform(src, matrix, index1, homogeneous);
+ pos2 = transform(src, matrix, index2, homogeneous);
+
+ dst.x = pos0.x * weight0 + pos1.x * weight1 + pos2.x * weight2;
+ dst.y = pos0.y * weight0 + pos1.y * weight1 + pos2.y * weight2;
+ dst.z = pos0.z * weight0 + pos1.z * weight1 + pos2.z * weight2;
+ dst.w = pos0.w * weight0 + pos1.w * weight1 + pos2.w * weight2;
+ }
+ else if(state.vertexBlendMatrixCount == 4)
+ {
+ weight3 = Float4(1.0f) - (weight0 + weight1 + weight2); // Last weight is 1 minus the sum of the loaded ones
+
+ Vector4f pos0;
+ Vector4f pos1;
+ Vector4f pos2;
+ Vector4f pos3;
+
+ pos0 = transform(src, matrix, index0, homogeneous);
+ pos1 = transform(src, matrix, index1, homogeneous);
+ pos2 = transform(src, matrix, index2, homogeneous);
+ pos3 = transform(src, matrix, index3, homogeneous);
+
+ dst.x = pos0.x * weight0 + pos1.x * weight1 + pos2.x * weight2 + pos3.x * weight3;
+ dst.y = pos0.y * weight0 + pos1.y * weight1 + pos2.y * weight2 + pos3.y * weight3;
+ dst.z = pos0.z * weight0 + pos1.z * weight1 + pos2.z * weight2 + pos3.z * weight3;
+ dst.w = pos0.w * weight0 + pos1.w * weight1 + pos2.w * weight2 + pos3.w * weight3;
+ }
+ }
+
+ return dst; // NOTE(review): dst is not assigned for counts above 4 - presumably excluded by the state setup; confirm
+ }
+
+ void VertexPipeline::pipeline(UInt &index) // Emits the fixed-function vertex code: position, colors (pass-through or lit), fog, texture coordinates, point size; 'index' is not referenced here
+ {
+ Vector4f position;
+ Vector4f normal;
+
+ if(!state.preTransformed)
+ {
+ position = transformBlend(v[Position], Pointer<Byte>(data + OFFSET(DrawData,ff.transformT)), true);
+ }
+ else
+ {
+ position = v[PositionT]; // Pre-transformed input is forwarded unchanged
+ }
+
+ o[Pos].x = position.x;
+ o[Pos].y = position.y;
+ o[Pos].z = position.z;
+ o[Pos].w = position.w;
+
+ Vector4f vertexPosition = transformBlend(v[Position], Pointer<Byte>(data + OFFSET(DrawData,ff.cameraTransformT)), true); // Position through the camera transform; used by lighting and fog below
+
+ if(state.vertexNormalActive)
+ {
+ normal = transformBlend(v[Normal], Pointer<Byte>(data + OFFSET(DrawData,ff.normalTransformT)), false);
+
+ if(state.normalizeNormals)
+ {
+ normal = normalize(normal);
+ }
+ }
+
+ if(!state.vertexLightingActive) // Lighting disabled: forward the input colors or fall back to constants
+ {
+ // FIXME: Don't process if not used at all
+ if(state.diffuseActive && state.input[Color0])
+ {
+ Vector4f diffuse = v[Color0];
+
+ o[C0].x = diffuse.x;
+ o[C0].y = diffuse.y;
+ o[C0].z = diffuse.z;
+ o[C0].w = diffuse.w;
+ }
+ else
+ {
+ o[C0].x = Float4(1.0f); // Default diffuse: (1, 1, 1, 1)
+ o[C0].y = Float4(1.0f);
+ o[C0].z = Float4(1.0f);
+ o[C0].w = Float4(1.0f);
+ }
+
+ // FIXME: Don't process if not used at all
+ if(state.specularActive && state.input[Color1])
+ {
+ Vector4f specular = v[Color1];
+
+ o[C1].x = specular.x;
+ o[C1].y = specular.y;
+ o[C1].z = specular.z;
+ o[C1].w = specular.w;
+ }
+ else
+ {
+ o[C1].x = Float4(0.0f); // Default specular: (0, 0, 0, 1)
+ o[C1].y = Float4(0.0f);
+ o[C1].z = Float4(0.0f);
+ o[C1].w = Float4(1.0f);
+ }
+ }
+ else
+ {
+ o[C0].x = Float4(0.0f); // Both colors start at zero and accumulate the light contributions
+ o[C0].y = Float4(0.0f);
+ o[C0].z = Float4(0.0f);
+ o[C0].w = Float4(0.0f);
+
+ o[C1].x = Float4(0.0f);
+ o[C1].y = Float4(0.0f);
+ o[C1].z = Float4(0.0f);
+ o[C1].w = Float4(0.0f);
+
+ Vector4f ambient;
+ Float4 globalAmbient = *Pointer<Float4>(data + OFFSET(DrawData,ff.globalAmbient)); // FIXME: Unpack
+
+ ambient.x = globalAmbient.x;
+ ambient.y = globalAmbient.y;
+ ambient.z = globalAmbient.z;
+
+ for(int i = 0; i < 8; i++) // Up to 8 lights; bit i of vertexLightActive enables light i
+ {
+ if(!(state.vertexLightActive & (1 << i)))
+ {
+ continue;
+ }
+
+ Vector4f L; // Light vector
+ Float4 att; // Attenuation
+
+ // Attenuation
+ {
+ Float4 d; // Distance
+
+ L.x = L.y = L.z = *Pointer<Float4>(data + OFFSET(DrawData,ff.lightPosition[i])); // FIXME: Unpack
+ L.x = L.x.xxxx;
+ L.y = L.y.yyyy;
+ L.z = L.z.zzzz;
+
+ L.x -= vertexPosition.x; // L = light position - vertex position
+ L.y -= vertexPosition.y;
+ L.z -= vertexPosition.z;
+ d = dot3(L, L);
+ d = RcpSqrt_pp(d); // FIXME: Sufficient precision?
+ L.x *= d;
+ L.y *= d;
+ L.z *= d;
+ d = Rcp_pp(d); // FIXME: Sufficient precision?
+
+ Float4 q = *Pointer<Float4>(data + OFFSET(DrawData,ff.attenuationQuadratic[i]));
+ Float4 l = *Pointer<Float4>(data + OFFSET(DrawData,ff.attenuationLinear[i]));
+ Float4 c = *Pointer<Float4>(data + OFFSET(DrawData,ff.attenuationConstant[i]));
+
+ att = Rcp_pp((q * d + l) * d + c); // Rcp_pp of (c + l * d + q * d * d)
+ }
+
+ // Ambient per light
+ {
+ Float4 lightAmbient = *Pointer<Float4>(data + OFFSET(DrawData,ff.lightAmbient[i])); // FIXME: Unpack
+
+ ambient.x = ambient.x + lightAmbient.x * att;
+ ambient.y = ambient.y + lightAmbient.y * att;
+ ambient.z = ambient.z + lightAmbient.z * att;
+ }
+
+ // Diffuse
+ if(state.vertexNormalActive)
+ {
+ Float4 dot;
+
+ dot = dot3(L, normal);
+ dot = Max(dot, Float4(0.0f)); // Negative values are clamped to zero
+ dot *= att;
+
+ Vector4f diff;
+
+ if(state.vertexDiffuseMaterialSourceActive == MATERIAL_MATERIAL)
+ {
+ diff.x = diff.y = diff.z = *Pointer<Float4>(data + OFFSET(DrawData,ff.materialDiffuse)); // FIXME: Unpack
+ diff.x = diff.x.xxxx;
+ diff.y = diff.y.yyyy;
+ diff.z = diff.z.zzzz;
+ }
+ else if(state.vertexDiffuseMaterialSourceActive == MATERIAL_COLOR1)
+ {
+ diff = v[Color0];
+ }
+ else if(state.vertexDiffuseMaterialSourceActive == MATERIAL_COLOR2)
+ {
+ diff = v[Color1];
+ }
+ else ASSERT(false);
+
+ Float4 lightDiffuse = *Pointer<Float4>(data + OFFSET(DrawData,ff.lightDiffuse[i]));
+
+ o[C0].x = o[C0].x + diff.x * dot * lightDiffuse.x; // FIXME: Clamp first?
+ o[C0].y = o[C0].y + diff.y * dot * lightDiffuse.y; // FIXME: Clamp first?
+ o[C0].z = o[C0].z + diff.z * dot * lightDiffuse.z; // FIXME: Clamp first?
+ }
+
+ // Specular
+ if(state.vertexSpecularActive)
+ {
+ Vector4f S;
+ Vector4f C; // Camera vector
+ Float4 pow;
+
+ pow = *Pointer<Float>(data + OFFSET(DrawData,ff.materialShininess));
+
+ S.x = Float4(0.0f) - vertexPosition.x; // S = -vertexPosition
+ S.y = Float4(0.0f) - vertexPosition.y;
+ S.z = Float4(0.0f) - vertexPosition.z;
+ C = normalize(S);
+
+ S.x = L.x + C.x; // S = L + C, normalized into C below
+ S.y = L.y + C.y;
+ S.z = L.z + C.z;
+ C = normalize(S);
+
+ Float4 dot = Max(dot3(C, normal), Float4(0.0f)); // FIXME: max(dot3(C, normal), 0)
+
+ Float4 P = power(dot, pow);
+ P *= att;
+
+ Vector4f spec;
+
+ if(state.vertexSpecularMaterialSourceActive == MATERIAL_MATERIAL)
+ {
+ Float4 materialSpecular = *Pointer<Float4>(data + OFFSET(DrawData,ff.materialSpecular)); // FIXME: Unpack
+
+ spec.x = materialSpecular.x;
+ spec.y = materialSpecular.y;
+ spec.z = materialSpecular.z;
+ }
+ else if(state.vertexSpecularMaterialSourceActive == MATERIAL_COLOR1)
+ {
+ spec = v[Color0];
+ }
+ else if(state.vertexSpecularMaterialSourceActive == MATERIAL_COLOR2)
+ {
+ spec = v[Color1];
+ }
+ else ASSERT(false);
+
+ Float4 lightSpecular = *Pointer<Float4>(data + OFFSET(DrawData,ff.lightSpecular[i]));
+
+ spec.x *= lightSpecular.x;
+ spec.y *= lightSpecular.y;
+ spec.z *= lightSpecular.z;
+
+ spec.x *= P;
+ spec.y *= P;
+ spec.z *= P;
+
+ spec.x = Max(spec.x, Float4(0.0f));
+ spec.y = Max(spec.y, Float4(0.0f));
+ spec.z = Max(spec.z, Float4(0.0f));
+
+ if(secondaryColor) // Global switch: keep specular in its own output or fold it into the primary color
+ {
+ o[C1].x = o[C1].x + spec.x;
+ o[C1].y = o[C1].y + spec.y;
+ o[C1].z = o[C1].z + spec.z;
+ }
+ else
+ {
+ o[C0].x = o[C0].x + spec.x;
+ o[C0].y = o[C0].y + spec.y;
+ o[C0].z = o[C0].z + spec.z;
+ }
+ }
+ }
+
+ if(state.vertexAmbientMaterialSourceActive == MATERIAL_MATERIAL) // Scale the accumulated ambient light by the selected ambient material color
+ {
+ Float4 materialAmbient = *Pointer<Float4>(data + OFFSET(DrawData,ff.materialAmbient)); // FIXME: Unpack
+
+ ambient.x = ambient.x * materialAmbient.x;
+ ambient.y = ambient.y * materialAmbient.y;
+ ambient.z = ambient.z * materialAmbient.z;
+ }
+ else if(state.vertexAmbientMaterialSourceActive == MATERIAL_COLOR1)
+ {
+ Vector4f materialDiffuse = v[Color0];
+
+ ambient.x = ambient.x * materialDiffuse.x;
+ ambient.y = ambient.y * materialDiffuse.y;
+ ambient.z = ambient.z * materialDiffuse.z;
+ }
+ else if(state.vertexAmbientMaterialSourceActive == MATERIAL_COLOR2)
+ {
+ Vector4f materialSpecular = v[Color1];
+
+ ambient.x = ambient.x * materialSpecular.x;
+ ambient.y = ambient.y * materialSpecular.y;
+ ambient.z = ambient.z * materialSpecular.z;
+ }
+ else ASSERT(false);
+
+ o[C0].x = o[C0].x + ambient.x;
+ o[C0].y = o[C0].y + ambient.y;
+ o[C0].z = o[C0].z + ambient.z;
+
+ // Emissive
+ if(state.vertexEmissiveMaterialSourceActive == MATERIAL_MATERIAL)
+ {
+ Float4 materialEmission = *Pointer<Float4>(data + OFFSET(DrawData,ff.materialEmission)); // FIXME: Unpack
+
+ o[C0].x = o[C0].x + materialEmission.x;
+ o[C0].y = o[C0].y + materialEmission.y;
+ o[C0].z = o[C0].z + materialEmission.z;
+ }
+ else if(state.vertexEmissiveMaterialSourceActive == MATERIAL_COLOR1)
+ {
+ Vector4f materialSpecular = v[Color0]; // Despite the local's name, this holds input Color0
+
+ o[C0].x = o[C0].x + materialSpecular.x;
+ o[C0].y = o[C0].y + materialSpecular.y;
+ o[C0].z = o[C0].z + materialSpecular.z;
+ }
+ else if(state.vertexEmissiveMaterialSourceActive == MATERIAL_COLOR2)
+ {
+ Vector4f materialSpecular = v[Color1];
+
+ o[C0].x = o[C0].x + materialSpecular.x;
+ o[C0].y = o[C0].y + materialSpecular.y;
+ o[C0].z = o[C0].z + materialSpecular.z;
+ }
+ else ASSERT(false);
+
+ // Diffuse alpha component
+ if(state.vertexDiffuseMaterialSourceActive == MATERIAL_MATERIAL)
+ {
+ o[C0].w = Float4(*Pointer<Float4>(data + OFFSET(DrawData,ff.materialDiffuse[0]))).wwww; // FIXME: Unpack
+ }
+ else if(state.vertexDiffuseMaterialSourceActive == MATERIAL_COLOR1)
+ {
+ Vector4f alpha = v[Color0];
+ o[C0].w = alpha.w;
+ }
+ else if(state.vertexDiffuseMaterialSourceActive == MATERIAL_COLOR2)
+ {
+ Vector4f alpha = v[Color1];
+ o[C0].w = alpha.w;
+ }
+ else ASSERT(false);
+
+ if(state.vertexSpecularActive)
+ {
+ // Specular alpha component
+ if(state.vertexSpecularMaterialSourceActive == MATERIAL_MATERIAL)
+ {
+ o[C1].w = Float4(*Pointer<Float4>(data + OFFSET(DrawData,ff.materialSpecular[0]))).wwww; // Load from element 0 so .wwww picks the material's own 4th component, as the diffuse path does // FIXME: Unpack
+ }
+ else if(state.vertexSpecularMaterialSourceActive == MATERIAL_COLOR1)
+ {
+ Vector4f alpha = v[Color0];
+ o[C1].w = alpha.w;
+ }
+ else if(state.vertexSpecularMaterialSourceActive == MATERIAL_COLOR2)
+ {
+ Vector4f alpha = v[Color1];
+ o[C1].w = alpha.w;
+ }
+ else ASSERT(false);
+ }
+ }
+
+ if(state.fogActive)
+ {
+ Float4 f; // Fog distance input: |z| of vertexPosition, or its 3-component length for range fog
+
+ if(!state.rangeFogActive)
+ {
+ f = Abs(vertexPosition.z);
+ }
+ else
+ {
+ f = Sqrt(dot3(vertexPosition, vertexPosition)); // FIXME: f = length(vertexPosition);
+ }
+
+ switch(state.vertexFogMode)
+ {
+ case FOG_NONE:
+ if(state.specularActive)
+ {
+ o[Fog].x = o[C1].w; // With no fog mode, the fog output carries the specular alpha
+ }
+ else
+ {
+ o[Fog].x = Float4(0.0f);
+ }
+ break;
+ case FOG_LINEAR:
+ o[Fog].x = f * *Pointer<Float4>(data + OFFSET(DrawData,fog.scale)) + *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
+ break;
+ case FOG_EXP:
+ o[Fog].x = exponential2(f * *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE)), true);
+ break;
+ case FOG_EXP2:
+ o[Fog].x = exponential2((f * f) * *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E)), true);
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+
+ for(int stage = 0; stage < 8; stage++) // All 8 stages are visited; processTextureCoordinate skips stages whose output is not written
+ {
+ processTextureCoordinate(stage, normal, position);
+ }
+
+ processPointSize();
+ }
+
+ void VertexPipeline::processTextureCoordinate(int stage, Vector4f &normal, Vector4f &position) // Generates o[T0 + stage] per the stage's texgen mode, then applies its texture transform; 'position' is not referenced here
+ {
+ if(state.output[T0 + stage].write) // Stages whose output is not written are skipped entirely
+ {
+ int i = state.textureState[stage].texCoordIndexActive; // Index of the input coordinate set feeding this stage
+
+ switch(state.textureState[stage].texGenActive)
+ {
+ case TEXGEN_NONE: // Copy the input coordinate unchanged
+ {
+ Vector4f &&varying = v[TexCoord0 + i];
+
+ o[T0 + stage].x = varying.x;
+ o[T0 + stage].y = varying.y;
+ o[T0 + stage].z = varying.z;
+ o[T0 + stage].w = varying.w;
+ }
+ break;
+ case TEXGEN_PASSTHRU: // Copy the input, then fill the components the input stream does not supply
+ {
+ Vector4f &&varying = v[TexCoord0 + i];
+
+ o[T0 + stage].x = varying.x;
+ o[T0 + stage].y = varying.y;
+ o[T0 + stage].z = varying.z;
+ o[T0 + stage].w = varying.w;
+
+ if(state.input[TexCoord0 + i])
+ {
+ switch(state.input[TexCoord0 + i].count) // The component after the last supplied one becomes 1, the remaining ones 0
+ {
+ case 1:
+ o[T0 + stage].y = Float4(1.0f);
+ o[T0 + stage].z = Float4(0.0f);
+ o[T0 + stage].w = Float4(0.0f);
+ break;
+ case 2:
+ o[T0 + stage].z = Float4(1.0f);
+ o[T0 + stage].w = Float4(0.0f);
+ break;
+ case 3:
+ o[T0 + stage].w = Float4(1.0f);
+ break;
+ case 4:
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+ }
+ break;
+ case TEXGEN_NORMAL: // Output the caller-supplied normal (or zero when normals are inactive) with w = 1
+ {
+ Vector4f Nc; // Normal vector in camera space
+
+ if(state.vertexNormalActive)
+ {
+ Nc = normal;
+ }
+ else
+ {
+ Nc.x = Float4(0.0f);
+ Nc.y = Float4(0.0f);
+ Nc.z = Float4(0.0f);
+ }
+
+ Nc.w = Float4(1.0f);
+
+ o[T0 + stage].x = Nc.x;
+ o[T0 + stage].y = Nc.y;
+ o[T0 + stage].z = Nc.z;
+ o[T0 + stage].w = Nc.w;
+ }
+ break;
+ case TEXGEN_POSITION: // Output the position through the camera transform with w = 1
+ {
+ Vector4f Pn = transformBlend(v[Position], Pointer<Byte>(data + OFFSET(DrawData,ff.cameraTransformT)), true); // Position in camera space
+
+ Pn.w = Float4(1.0f);
+
+ o[T0 + stage].x = Pn.x;
+ o[T0 + stage].y = Pn.y;
+ o[T0 + stage].z = Pn.z;
+ o[T0 + stage].w = Pn.w;
+ }
+ break;
+ case TEXGEN_REFLECTION: // Output the reflection vector with w = 1
+ {
+ Vector4f R; // Reflection vector
+
+ if(state.vertexNormalActive)
+ {
+ Vector4f Nc; // Normal vector in camera space
+
+ Nc = normal;
+
+ if(state.localViewerActive)
+ {
+ Vector4f Ec; // Eye vector in camera space
+ // Ec is the vertex position through the camera transform, normalized
+
+ Ec = transformBlend(v[Position], Pointer<Byte>(data + OFFSET(DrawData,ff.cameraTransformT)), true);
+ Ec = normalize(Ec);
+
+ // R = E - 2 * N * (E . N)
+ Float4 dot = Float4(2.0f) * dot3(Ec, Nc);
+
+ R.x = Ec.x - Nc.x * dot;
+ R.y = Ec.y - Nc.y * dot;
+ R.z = Ec.z - Nc.z * dot;
+ }
+ else
+ {
+ // u = -2 * Nz * Nx
+ // v = -2 * Nz * Ny
+ // w = 1 - 2 * Nz * Nz
+
+ R.x = -Float4(2.0f) * Nc.z * Nc.x;
+ R.y = -Float4(2.0f) * Nc.z * Nc.y;
+ R.z = Float4(1.0f) - Float4(2.0f) * Nc.z * Nc.z;
+ }
+ }
+ else
+ {
+ R.x = Float4(0.0f);
+ R.y = Float4(0.0f);
+ R.z = Float4(0.0f);
+ }
+
+ R.w = Float4(1.0f);
+
+ o[T0 + stage].x = R.x;
+ o[T0 + stage].y = R.y;
+ o[T0 + stage].z = R.z;
+ o[T0 + stage].w = R.w;
+ }
+ break;
+ case TEXGEN_SPHEREMAP: // Same reflection vector as above, then remapped to 2D coordinates
+ {
+ Vector4f R; // Reflection vector
+
+ if(state.vertexNormalActive)
+ {
+ Vector4f Nc; // Normal vector in camera space
+
+ Nc = normal;
+
+ if(state.localViewerActive)
+ {
+ Vector4f Ec; // Eye vector in camera space
+ // Ec is the vertex position through the camera transform, normalized
+
+ Ec = transformBlend(v[Position], Pointer<Byte>(data + OFFSET(DrawData,ff.cameraTransformT)), true);
+ Ec = normalize(Ec);
+
+ // R = E - 2 * N * (E . N)
+ Float4 dot = Float4(2.0f) * dot3(Ec, Nc);
+
+ R.x = Ec.x - Nc.x * dot;
+ R.y = Ec.y - Nc.y * dot;
+ R.z = Ec.z - Nc.z * dot;
+ }
+ else
+ {
+ // u = -2 * Nz * Nx
+ // v = -2 * Nz * Ny
+ // w = 1 - 2 * Nz * Nz
+
+ R.x = -Float4(2.0f) * Nc.z * Nc.x;
+ R.y = -Float4(2.0f) * Nc.z * Nc.y;
+ R.z = Float4(1.0f) - Float4(2.0f) * Nc.z * Nc.z;
+ }
+ }
+ else
+ {
+ R.x = Float4(0.0f);
+ R.y = Float4(0.0f);
+ R.z = Float4(0.0f);
+ }
+
+ R.z -= Float4(1.0f);
+ R = normalize(R);
+ R.x = Float4(0.5f) * R.x + Float4(0.5f); // Remap x and y with 0.5 * value + 0.5
+ R.y = Float4(0.5f) * R.y + Float4(0.5f);
+
+ R.z = Float4(1.0f);
+ R.w = Float4(0.0f);
+
+ o[T0 + stage].x = R.x;
+ o[T0 + stage].y = R.y;
+ o[T0 + stage].z = R.z;
+ o[T0 + stage].w = R.w;
+ }
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ Vector4f texTrans0;
+ Vector4f texTrans1;
+ Vector4f texTrans2;
+ Vector4f texTrans3;
+
+ Vector4f T; // Generated coordinate, read back from the output register
+ Vector4f t; // Transformed coordinate
+
+ T.x = o[T0 + stage].x;
+ T.y = o[T0 + stage].y;
+ T.z = o[T0 + stage].z;
+ T.w = o[T0 + stage].w;
+
+ switch(state.textureState[stage].textureTransformCountActive) // Intentional fall-through: a count of N computes t components N-1 down to 0
+ {
+ case 4:
+ texTrans3.x = texTrans3.y = texTrans3.z = texTrans3.w = *Pointer<Float4>(data + OFFSET(DrawData,ff.textureTransform[stage][3])); // FIXME: Unpack
+ texTrans3.x = texTrans3.x.xxxx;
+ texTrans3.y = texTrans3.y.yyyy;
+ texTrans3.z = texTrans3.z.zzzz;
+ texTrans3.w = texTrans3.w.wwww;
+ t.w = dot4(T, texTrans3);
+ case 3:
+ texTrans2.x = texTrans2.y = texTrans2.z = texTrans2.w = *Pointer<Float4>(data + OFFSET(DrawData,ff.textureTransform[stage][2])); // FIXME: Unpack
+ texTrans2.x = texTrans2.x.xxxx;
+ texTrans2.y = texTrans2.y.yyyy;
+ texTrans2.z = texTrans2.z.zzzz;
+ texTrans2.w = texTrans2.w.wwww;
+ t.z = dot4(T, texTrans2);
+ case 2:
+ texTrans1.x = texTrans1.y = texTrans1.z = texTrans1.w = *Pointer<Float4>(data + OFFSET(DrawData,ff.textureTransform[stage][1])); // FIXME: Unpack
+ texTrans1.x = texTrans1.x.xxxx;
+ texTrans1.y = texTrans1.y.yyyy;
+ texTrans1.z = texTrans1.z.zzzz;
+ texTrans1.w = texTrans1.w.wwww;
+ t.y = dot4(T, texTrans1);
+ case 1:
+ texTrans0.x = texTrans0.y = texTrans0.z = texTrans0.w = *Pointer<Float4>(data + OFFSET(DrawData,ff.textureTransform[stage][0])); // FIXME: Unpack
+ texTrans0.x = texTrans0.x.xxxx;
+ texTrans0.y = texTrans0.y.yyyy;
+ texTrans0.z = texTrans0.z.zzzz;
+ texTrans0.w = texTrans0.w.wwww;
+ t.x = dot4(T, texTrans0);
+
+ o[T0 + stage].x = t.x; // NOTE(review): for counts below 4 the t components not computed above are still copied - confirm they are ignored downstream
+ o[T0 + stage].y = t.y;
+ o[T0 + stage].z = t.z;
+ o[T0 + stage].w = t.w;
+ case 0: // No transform: the generated coordinate is left as written above
+ break;
+ default:
+ ASSERT(false);
+ }
+ }
+ }
+
+ void VertexPipeline::processPointSize()
+ {
+     // Writes the per-vertex point size to o[Pts].y and optionally scales it by a distance-based factor.
+     if(!state.pointSizeActive) return;   // Use global pointsize
+
+     if(!state.input[PointSize])
+     {
+         // No per-vertex size input: use the size stored in the draw data.
+         o[Pts].y = *Pointer<Float4>(data + OFFSET(DrawData,point.pointSize));
+     }
+     else
+     {
+         o[Pts].y = v[PointSize].x;
+     }
+
+     // Scaling is only applied when enabled and the vertex is not pre-transformed.
+     if(!state.pointScaleActive || state.preTransformed) return;
+
+     Vector4f eyePos = transformBlend(v[Position], Pointer<Byte>(data + OFFSET(DrawData,ff.cameraTransformT)), true);
+     Float4 dist = Sqrt(dot3(eyePos, eyePos));   // FIXME: length(p);
+
+     Float4 scaleA = *Pointer<Float>(data + OFFSET(DrawData,point.pointScaleA));   // FIXME: Unpack
+     Float4 scaleB = *Pointer<Float>(data + OFFSET(DrawData,point.pointScaleB));   // FIXME: Unpack
+     Float4 scaleC = *Pointer<Float>(data + OFFSET(DrawData,point.pointScaleC));   // FIXME: Unpack
+
+     // size = size * viewportHeight * RcpSqrt_pp(A + d * (B + d * C))
+     Float4 attenuation = RcpSqrt_pp(scaleA + dist * (scaleB + dist * scaleC));
+     Float4 height = Float4(*Pointer<Float>(data + OFFSET(DrawData,viewportHeight)));   // FIXME: Unpack
+
+     o[Pts].y = o[Pts].y * height * attenuation;
+ }
+
+ Vector4f VertexPipeline::transform(const Register &src, const Pointer<Byte> &matrix, bool homogeneous)
+ {
+     // Element (row, col) is the float at byte offset 16 * col + 4 * row from 'matrix', copied into all four components.
+     Vector4f result;
+     if(!homogeneous)
+     {
+         Float4 m[3][3];   // Only a 3x3 part is applied; result.w is not written on this path
+         for(int row = 0; row < 3; row++)
+         {
+             for(int col = 0; col < 3; col++)
+             {
+                 const int byteOffset = 16 * col + 4 * row;
+                 m[row][col].x = *Pointer<Float>(matrix + byteOffset);
+                 m[row][col].y = *Pointer<Float>(matrix + byteOffset);
+                 m[row][col].z = *Pointer<Float>(matrix + byteOffset);
+                 m[row][col].w = *Pointer<Float>(matrix + byteOffset);
+             }
+         }
+
+         result.x = src.x * m[0][0] + src.y * m[0][1] + src.z * m[0][2];
+         result.y = src.x * m[1][0] + src.y * m[1][1] + src.z * m[1][2];
+         result.z = src.x * m[2][0] + src.y * m[2][1] + src.z * m[2][2];
+     }
+     else
+     {
+         Float4 m[4][4];   // Full 4x4 product, including the src.w term
+         for(int row = 0; row < 4; row++)
+         {
+             for(int col = 0; col < 4; col++)
+             {
+                 const int byteOffset = 16 * col + 4 * row;
+                 m[row][col].x = *Pointer<Float>(matrix + byteOffset);
+                 m[row][col].y = *Pointer<Float>(matrix + byteOffset);
+                 m[row][col].z = *Pointer<Float>(matrix + byteOffset);
+                 m[row][col].w = *Pointer<Float>(matrix + byteOffset);
+             }
+         }
+
+         result.x = src.x * m[0][0] + src.y * m[0][1] + src.z * m[0][2] + src.w * m[0][3];
+         result.y = src.x * m[1][0] + src.y * m[1][1] + src.z * m[1][2] + src.w * m[1][3];
+         result.z = src.x * m[2][0] + src.y * m[2][1] + src.z * m[2][2] + src.w * m[2][3];
+         result.w = src.x * m[3][0] + src.y * m[3][1] + src.z * m[3][2] + src.w * m[3][3];
+     }
+
+     return result;
+ }
+
+ Vector4f VertexPipeline::transform(const Register &src, const Pointer<Byte> &matrix, UInt index[4], bool homogeneous)
+ {
+     // Like the non-indexed overload, but component k (x, y, z, w) reads its matrix at an extra byte offset index[k].
+     Vector4f result;
+     if(!homogeneous)
+     {
+         Float4 m[3][3];   // Only a 3x3 part is applied; result.w is not written on this path
+         for(int row = 0; row < 3; row++)
+         {
+             for(int col = 0; col < 3; col++)
+             {
+                 const int byteOffset = 16 * col + 4 * row;
+                 m[row][col].x = *Pointer<Float>(matrix + byteOffset + index[0]);
+                 m[row][col].y = *Pointer<Float>(matrix + byteOffset + index[1]);
+                 m[row][col].z = *Pointer<Float>(matrix + byteOffset + index[2]);
+                 m[row][col].w = *Pointer<Float>(matrix + byteOffset + index[3]);
+             }
+         }
+
+         result.x = src.x * m[0][0] + src.y * m[0][1] + src.z * m[0][2];
+         result.y = src.x * m[1][0] + src.y * m[1][1] + src.z * m[1][2];
+         result.z = src.x * m[2][0] + src.y * m[2][1] + src.z * m[2][2];
+     }
+     else
+     {
+         Float4 m[4][4];   // NOTE(review): column 3 is added without a src.w factor, unlike the non-indexed overload - confirm intended
+         for(int row = 0; row < 4; row++)
+         {
+             for(int col = 0; col < 4; col++)
+             {
+                 const int byteOffset = 16 * col + 4 * row;
+                 m[row][col].x = *Pointer<Float>(matrix + byteOffset + index[0]);
+                 m[row][col].y = *Pointer<Float>(matrix + byteOffset + index[1]);
+                 m[row][col].z = *Pointer<Float>(matrix + byteOffset + index[2]);
+                 m[row][col].w = *Pointer<Float>(matrix + byteOffset + index[3]);
+             }
+         }
+
+         result.x = src.x * m[0][0] + src.y * m[0][1] + src.z * m[0][2] + m[0][3];
+         result.y = src.x * m[1][0] + src.y * m[1][1] + src.z * m[1][2] + m[1][3];
+         result.z = src.x * m[2][0] + src.y * m[2][1] + src.z * m[2][2] + m[2][3];
+         result.w = src.x * m[3][0] + src.y * m[3][1] + src.z * m[3][2] + m[3][3];
+     }
+
+     return result;
+ }
+
+ Vector4f VertexPipeline::normalize(Vector4f &src)
+ {
+     // Scales x, y and z by RcpSqrt_pp(dot3(src, src)); the w component of the result is never written.
+     Float4 invLength = RcpSqrt_pp(dot3(src, src));
+
+     Vector4f unit;
+     unit.x = src.x * invLength;
+     unit.y = src.y * invLength;
+     unit.z = src.z * invLength;
+
+     return unit;
+ }
+
+ Float4 VertexPipeline::power(Float4 &src0, Float4 &src1)
+ {
+     // Approximates src0 raised to src1: the float bit pattern, taken relative to that of 1.0f,
+     // is scaled by src1 as a stand-in for multiplying a logarithm.
+     Float4 fourth = src0 * src0;
+     fourth = fourth * fourth;   // src0^4; undone by the two RcpSqrt_pp calls at the end
+
+     Float4 scaled = Float4(As<Int4>(fourth) - As<Int4>(Float4(1.0f)));   // Integer bit distance from 1.0f, converted to float
+     scaled *= src1;
+
+     Float4 result = As<Float4>(Int4(scaled) + As<Int4>(Float4(1.0f)));   // Back to a float bit pattern
+     result = RcpSqrt_pp(result);
+     result = RcpSqrt_pp(result);   // Two reciprocal square roots in a row give the fourth root
+
+     return result;
+ }
+}
diff --git a/src/Pipeline/VertexPipeline.hpp b/src/Pipeline/VertexPipeline.hpp
new file mode 100644
index 0000000..0736afb
--- /dev/null
+++ b/src/Pipeline/VertexPipeline.hpp
@@ -0,0 +1,45 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_VertexPipeline_hpp
+#define sw_VertexPipeline_hpp
+
+#include "VertexRoutine.hpp"
+
+#include "Renderer/Context.hpp"
+#include "Renderer/VertexProcessor.hpp"
+
+namespace sw
+{
+ class VertexPipeline : public VertexRoutine // Implements the pipeline() hook declared 'override' below.
+ {
+ public:
+ VertexPipeline(const VertexProcessor::State &state);
+
+ virtual ~VertexPipeline();
+
+ private:
+ void pipeline(UInt &index) override;
+ void processTextureCoordinate(int stage, Vector4f &normal, Vector4f &position);
+ void processPointSize();
+ // Math helpers. In the indexed transform(), 'homogeneous' selects a 4x4 matrix that also yields w; otherwise a 3x3 matrix is applied to xyz.
+ Vector4f transformBlend(const Register &src, const Pointer<Byte> &matrix, bool homogeneous);
+ Vector4f transform(const Register &src, const Pointer<Byte> &matrix, bool homogeneous);
+ Vector4f transform(const Register &src, const Pointer<Byte> &matrix, UInt index[4], bool homogeneous);
+ Vector4f normalize(Vector4f &src); // Scales x, y, z by RcpSqrt_pp(dot3(src, src)).
+ Float4 power(Float4 &src0, Float4 &src1); // Fast approximation of src0 raised to src1.
+ };
+}
+
+#endif // sw_VertexPipeline_hpp
diff --git a/src/Pipeline/VertexProgram.cpp b/src/Pipeline/VertexProgram.cpp
new file mode 100644
index 0000000..ad4e37b
--- /dev/null
+++ b/src/Pipeline/VertexProgram.cpp
@@ -0,0 +1,1650 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "VertexProgram.hpp"
+
+#include "VertexShader.hpp"
+#include "SamplerCore.hpp"
+#include "Renderer/Renderer.hpp"
+#include "Renderer/Vertex.hpp"
+#include "Common/Half.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+ VertexProgram::VertexProgram(const VertexProcessor::State &state, const VertexShader *shader)
+ : VertexRoutine(state, shader), shader(shader), r(shader->indirectAddressableTemporaries)
+ {
+ // Initial values for the depth counters, the current label and the while-test flag.
+ whileTest = false;
+ currentLabel = -1;
+ loopDepth = -1;
+ loopRepDepth = 0;
+ ifDepth = 0;
+
+ for(int label = 0; label < 2048; label++) labelBlock[label] = 0;
+
+ // Masks start with all bits set in all four components; break/continue masks only when used.
+ enableStack[0] = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+
+ if(shader->containsBreakInstruction())
+ {
+ enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+ }
+
+ if(shader->containsContinueInstruction())
+ {
+ enableContinue = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+ }
+
+ // The instance ID is read from the draw data only if the shader declares it.
+ if(shader->isInstanceIdDeclared())
+ {
+ instanceID = *Pointer<Int>(data + OFFSET(DrawData,instanceID));
+ }
+ }
+
+ VertexProgram::~VertexProgram()
+ { // Empty body: no explicit cleanup is performed here.
+ }
+
+ void VertexProgram::pipeline(UInt &index)
+ {
+ // Pre-transformed vertices skip the shader program and take the pass-through path.
+ if(state.preTransformed)
+ {
+ passThrough();
+ return;
+ }
+
+ program(index);
+ }
+
+ void VertexProgram::program(UInt &index)
+ { // Walks the shader's instructions in order, dispatching each opcode and writing its result to the destination register.
+ // shader->print("VertexShader-%0.8X.txt", state.shaderID);
+
+ unsigned short shaderModel = shader->getShaderModel();
+
+ enableIndex = 0;
+ stackIndex = 0;
+
+ if(shader->containsLeaveInstruction())
+ {
+ enableLeave = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+ }
+ // Vertex ID: with texture sampling every component gets 'index'; otherwise components get index .. index + 3.
+ if(shader->isVertexIdDeclared())
+ {
+ if(state.textureSampling)
+ {
+ vertexID = Int4(index);
+ }
+ else
+ {
+ vertexID = Insert(vertexID, As<Int>(index), 0);
+ vertexID = Insert(vertexID, As<Int>(index + 1), 1);
+ vertexID = Insert(vertexID, As<Int>(index + 2), 2);
+ vertexID = Insert(vertexID, As<Int>(index + 3), 3);
+ }
+ }
+
+ // Create all call site return blocks up front
+ for(size_t i = 0; i < shader->getLength(); i++)
+ {
+ const Shader::Instruction *instruction = shader->getInstruction(i);
+ Shader::Opcode opcode = instruction->opcode;
+
+ if(opcode == Shader::OPCODE_CALL || opcode == Shader::OPCODE_CALLNZ)
+ {
+ const Dst &dst = instruction->dst;
+
+ ASSERT(callRetBlock[dst.label].size() == dst.callSite);
+ callRetBlock[dst.label].push_back(Nucleus::createBasicBlock());
+ }
+ }
+ // Main loop: one iteration per shader instruction.
+ for(size_t i = 0; i < shader->getLength(); i++)
+ {
+ const Shader::Instruction *instruction = shader->getInstruction(i);
+ Shader::Opcode opcode = instruction->opcode;
+
+ if(opcode == Shader::OPCODE_DCL || opcode == Shader::OPCODE_DEF || opcode == Shader::OPCODE_DEFI || opcode == Shader::OPCODE_DEFB)
+ {
+ continue; // Declarations and constant definitions are skipped here.
+ }
+
+ Dst dst = instruction->dst;
+ Src src0 = instruction->src[0];
+ Src src1 = instruction->src[1];
+ Src src2 = instruction->src[2];
+ Src src3 = instruction->src[3];
+ Src src4 = instruction->src[4];
+
+ bool predicate = instruction->predicate;
+ Control control = instruction->control;
+ bool integer = dst.type == Shader::PARAMETER_ADDR;
+ bool pp = dst.partialPrecision;
+
+ Vector4f d;
+ Vector4f s0;
+ Vector4f s1;
+ Vector4f s2;
+ Vector4f s3;
+ Vector4f s4;
+ // Fetch only the source operands that are present (non-void).
+ if(src0.type != Shader::PARAMETER_VOID) s0 = fetchRegister(src0);
+ if(src1.type != Shader::PARAMETER_VOID) s1 = fetchRegister(src1);
+ if(src2.type != Shader::PARAMETER_VOID) s2 = fetchRegister(src2);
+ if(src3.type != Shader::PARAMETER_VOID) s3 = fetchRegister(src3);
+ if(src4.type != Shader::PARAMETER_VOID) s4 = fetchRegister(src4);
+
+ switch(opcode)
+ {
+ case Shader::OPCODE_VS_1_0: break;
+ case Shader::OPCODE_VS_1_1: break;
+ case Shader::OPCODE_VS_2_0: break;
+ case Shader::OPCODE_VS_2_x: break;
+ case Shader::OPCODE_VS_2_sw: break;
+ case Shader::OPCODE_VS_3_0: break;
+ case Shader::OPCODE_VS_3_sw: break;
+ case Shader::OPCODE_DCL: break;
+ case Shader::OPCODE_DEF: break;
+ case Shader::OPCODE_DEFI: break;
+ case Shader::OPCODE_DEFB: break;
+ case Shader::OPCODE_NOP: break;
+ case Shader::OPCODE_ABS: abs(d, s0); break;
+ case Shader::OPCODE_IABS: iabs(d, s0); break;
+ case Shader::OPCODE_ADD: add(d, s0, s1); break;
+ case Shader::OPCODE_IADD: iadd(d, s0, s1); break;
+ case Shader::OPCODE_CRS: crs(d, s0, s1); break;
+ case Shader::OPCODE_FORWARD1: forward1(d, s0, s1, s2); break;
+ case Shader::OPCODE_FORWARD2: forward2(d, s0, s1, s2); break;
+ case Shader::OPCODE_FORWARD3: forward3(d, s0, s1, s2); break;
+ case Shader::OPCODE_FORWARD4: forward4(d, s0, s1, s2); break;
+ case Shader::OPCODE_REFLECT1: reflect1(d, s0, s1); break;
+ case Shader::OPCODE_REFLECT2: reflect2(d, s0, s1); break;
+ case Shader::OPCODE_REFLECT3: reflect3(d, s0, s1); break;
+ case Shader::OPCODE_REFLECT4: reflect4(d, s0, s1); break;
+ case Shader::OPCODE_REFRACT1: refract1(d, s0, s1, s2.x); break;
+ case Shader::OPCODE_REFRACT2: refract2(d, s0, s1, s2.x); break;
+ case Shader::OPCODE_REFRACT3: refract3(d, s0, s1, s2.x); break;
+ case Shader::OPCODE_REFRACT4: refract4(d, s0, s1, s2.x); break;
+ case Shader::OPCODE_DP1: dp1(d, s0, s1); break;
+ case Shader::OPCODE_DP2: dp2(d, s0, s1); break;
+ case Shader::OPCODE_DP3: dp3(d, s0, s1); break;
+ case Shader::OPCODE_DP4: dp4(d, s0, s1); break;
+ case Shader::OPCODE_DET2: det2(d, s0, s1); break;
+ case Shader::OPCODE_DET3: det3(d, s0, s1, s2); break;
+ case Shader::OPCODE_DET4: det4(d, s0, s1, s2, s3); break;
+ case Shader::OPCODE_ATT: att(d, s0, s1); break;
+ case Shader::OPCODE_EXP2X: exp2x(d, s0, pp); break;
+ case Shader::OPCODE_EXP2: exp2(d, s0, pp); break;
+ case Shader::OPCODE_EXPP: expp(d, s0, shaderModel); break;
+ case Shader::OPCODE_EXP: exp(d, s0, pp); break;
+ case Shader::OPCODE_FRC: frc(d, s0); break;
+ case Shader::OPCODE_TRUNC: trunc(d, s0); break;
+ case Shader::OPCODE_FLOOR: floor(d, s0); break;
+ case Shader::OPCODE_ROUND: round(d, s0); break;
+ case Shader::OPCODE_ROUNDEVEN: roundEven(d, s0); break;
+ case Shader::OPCODE_CEIL: ceil(d, s0); break;
+ case Shader::OPCODE_LIT: lit(d, s0); break;
+ case Shader::OPCODE_LOG2X: log2x(d, s0, pp); break;
+ case Shader::OPCODE_LOG2: log2(d, s0, pp); break;
+ case Shader::OPCODE_LOGP: logp(d, s0, shaderModel); break;
+ case Shader::OPCODE_LOG: log(d, s0, pp); break;
+ case Shader::OPCODE_LRP: lrp(d, s0, s1, s2); break;
+ case Shader::OPCODE_STEP: step(d, s0, s1); break;
+ case Shader::OPCODE_SMOOTH: smooth(d, s0, s1, s2); break;
+ case Shader::OPCODE_ISINF: isinf(d, s0); break;
+ case Shader::OPCODE_ISNAN: isnan(d, s0); break;
+ case Shader::OPCODE_FLOATBITSTOINT:
+ case Shader::OPCODE_FLOATBITSTOUINT:
+ case Shader::OPCODE_INTBITSTOFLOAT:
+ case Shader::OPCODE_UINTBITSTOFLOAT: d = s0; break;
+ case Shader::OPCODE_PACKSNORM2x16: packSnorm2x16(d, s0); break;
+ case Shader::OPCODE_PACKUNORM2x16: packUnorm2x16(d, s0); break;
+ case Shader::OPCODE_PACKHALF2x16: packHalf2x16(d, s0); break;
+ case Shader::OPCODE_UNPACKSNORM2x16: unpackSnorm2x16(d, s0); break;
+ case Shader::OPCODE_UNPACKUNORM2x16: unpackUnorm2x16(d, s0); break;
+ case Shader::OPCODE_UNPACKHALF2x16: unpackHalf2x16(d, s0); break;
+ case Shader::OPCODE_M3X2: M3X2(d, s0, src1); break;
+ case Shader::OPCODE_M3X3: M3X3(d, s0, src1); break;
+ case Shader::OPCODE_M3X4: M3X4(d, s0, src1); break;
+ case Shader::OPCODE_M4X3: M4X3(d, s0, src1); break;
+ case Shader::OPCODE_M4X4: M4X4(d, s0, src1); break;
+ case Shader::OPCODE_MAD: mad(d, s0, s1, s2); break;
+ case Shader::OPCODE_IMAD: imad(d, s0, s1, s2); break;
+ case Shader::OPCODE_MAX: max(d, s0, s1); break;
+ case Shader::OPCODE_IMAX: imax(d, s0, s1); break;
+ case Shader::OPCODE_UMAX: umax(d, s0, s1); break;
+ case Shader::OPCODE_MIN: min(d, s0, s1); break;
+ case Shader::OPCODE_IMIN: imin(d, s0, s1); break;
+ case Shader::OPCODE_UMIN: umin(d, s0, s1); break;
+ case Shader::OPCODE_MOV: mov(d, s0, integer); break;
+ case Shader::OPCODE_MOVA: mov(d, s0, true); break;
+ case Shader::OPCODE_NEG: neg(d, s0); break;
+ case Shader::OPCODE_INEG: ineg(d, s0); break;
+ case Shader::OPCODE_F2B: f2b(d, s0); break;
+ case Shader::OPCODE_B2F: b2f(d, s0); break;
+ case Shader::OPCODE_F2I: f2i(d, s0); break;
+ case Shader::OPCODE_I2F: i2f(d, s0); break;
+ case Shader::OPCODE_F2U: f2u(d, s0); break;
+ case Shader::OPCODE_U2F: u2f(d, s0); break;
+ case Shader::OPCODE_I2B: i2b(d, s0); break;
+ case Shader::OPCODE_B2I: b2i(d, s0); break;
+ case Shader::OPCODE_MUL: mul(d, s0, s1); break;
+ case Shader::OPCODE_IMUL: imul(d, s0, s1); break;
+ case Shader::OPCODE_NRM2: nrm2(d, s0, pp); break;
+ case Shader::OPCODE_NRM3: nrm3(d, s0, pp); break;
+ case Shader::OPCODE_NRM4: nrm4(d, s0, pp); break;
+ case Shader::OPCODE_POWX: powx(d, s0, s1, pp); break;
+ case Shader::OPCODE_POW: pow(d, s0, s1, pp); break;
+ case Shader::OPCODE_RCPX: rcpx(d, s0, pp); break;
+ case Shader::OPCODE_DIV: div(d, s0, s1); break;
+ case Shader::OPCODE_IDIV: idiv(d, s0, s1); break;
+ case Shader::OPCODE_UDIV: udiv(d, s0, s1); break;
+ case Shader::OPCODE_MOD: mod(d, s0, s1); break;
+ case Shader::OPCODE_IMOD: imod(d, s0, s1); break;
+ case Shader::OPCODE_UMOD: umod(d, s0, s1); break;
+ case Shader::OPCODE_SHL: shl(d, s0, s1); break;
+ case Shader::OPCODE_ISHR: ishr(d, s0, s1); break;
+ case Shader::OPCODE_USHR: ushr(d, s0, s1); break;
+ case Shader::OPCODE_RSQX: rsqx(d, s0, pp); break;
+ case Shader::OPCODE_SQRT: sqrt(d, s0, pp); break;
+ case Shader::OPCODE_RSQ: rsq(d, s0, pp); break;
+ case Shader::OPCODE_LEN2: len2(d.x, s0, pp); break;
+ case Shader::OPCODE_LEN3: len3(d.x, s0, pp); break;
+ case Shader::OPCODE_LEN4: len4(d.x, s0, pp); break;
+ case Shader::OPCODE_DIST1: dist1(d.x, s0, s1, pp); break;
+ case Shader::OPCODE_DIST2: dist2(d.x, s0, s1, pp); break;
+ case Shader::OPCODE_DIST3: dist3(d.x, s0, s1, pp); break;
+ case Shader::OPCODE_DIST4: dist4(d.x, s0, s1, pp); break;
+ case Shader::OPCODE_SGE: step(d, s1, s0); break;
+ case Shader::OPCODE_SGN: sgn(d, s0); break;
+ case Shader::OPCODE_ISGN: isgn(d, s0); break;
+ case Shader::OPCODE_SINCOS: sincos(d, s0, pp); break;
+ case Shader::OPCODE_COS: cos(d, s0, pp); break;
+ case Shader::OPCODE_SIN: sin(d, s0, pp); break;
+ case Shader::OPCODE_TAN: tan(d, s0); break;
+ case Shader::OPCODE_ACOS: acos(d, s0); break;
+ case Shader::OPCODE_ASIN: asin(d, s0); break;
+ case Shader::OPCODE_ATAN: atan(d, s0); break;
+ case Shader::OPCODE_ATAN2: atan2(d, s0, s1); break;
+ case Shader::OPCODE_COSH: cosh(d, s0, pp); break;
+ case Shader::OPCODE_SINH: sinh(d, s0, pp); break;
+ case Shader::OPCODE_TANH: tanh(d, s0, pp); break;
+ case Shader::OPCODE_ACOSH: acosh(d, s0, pp); break;
+ case Shader::OPCODE_ASINH: asinh(d, s0, pp); break;
+ case Shader::OPCODE_ATANH: atanh(d, s0, pp); break;
+ case Shader::OPCODE_SLT: slt(d, s0, s1); break;
+ case Shader::OPCODE_SUB: sub(d, s0, s1); break;
+ case Shader::OPCODE_ISUB: isub(d, s0, s1); break;
+ case Shader::OPCODE_BREAK: BREAK(); break;
+ case Shader::OPCODE_BREAKC: BREAKC(s0, s1, control); break;
+ case Shader::OPCODE_BREAKP: BREAKP(src0); break;
+ case Shader::OPCODE_CONTINUE: CONTINUE(); break;
+ case Shader::OPCODE_TEST: TEST(); break;
+ case Shader::OPCODE_CALL: CALL(dst.label, dst.callSite); break;
+ case Shader::OPCODE_CALLNZ: CALLNZ(dst.label, dst.callSite, src0); break;
+ case Shader::OPCODE_ELSE: ELSE(); break;
+ case Shader::OPCODE_ENDIF: ENDIF(); break;
+ case Shader::OPCODE_ENDLOOP: ENDLOOP(); break;
+ case Shader::OPCODE_ENDREP: ENDREP(); break;
+ case Shader::OPCODE_ENDWHILE: ENDWHILE(); break;
+ case Shader::OPCODE_ENDSWITCH: ENDSWITCH(); break;
+ case Shader::OPCODE_IF: IF(src0); break;
+ case Shader::OPCODE_IFC: IFC(s0, s1, control); break;
+ case Shader::OPCODE_LABEL: LABEL(dst.index); break;
+ case Shader::OPCODE_LOOP: LOOP(src1); break;
+ case Shader::OPCODE_REP: REP(src0); break;
+ case Shader::OPCODE_WHILE: WHILE(src0); break;
+ case Shader::OPCODE_SWITCH: SWITCH(); break;
+ case Shader::OPCODE_RET: RET(); break;
+ case Shader::OPCODE_LEAVE: LEAVE(); break;
+ case Shader::OPCODE_CMP: cmp(d, s0, s1, control); break;
+ case Shader::OPCODE_ICMP: icmp(d, s0, s1, control); break;
+ case Shader::OPCODE_UCMP: ucmp(d, s0, s1, control); break;
+ case Shader::OPCODE_SELECT: select(d, s0, s1, s2); break;
+ case Shader::OPCODE_EXTRACT: extract(d.x, s0, s1.x); break;
+ case Shader::OPCODE_INSERT: insert(d, s0, s1.x, s2.x); break;
+ case Shader::OPCODE_ALL: all(d.x, s0); break;
+ case Shader::OPCODE_ANY: any(d.x, s0); break;
+ case Shader::OPCODE_NOT: bitwise_not(d, s0); break;
+ case Shader::OPCODE_OR: bitwise_or(d, s0, s1); break;
+ case Shader::OPCODE_XOR: bitwise_xor(d, s0, s1); break;
+ case Shader::OPCODE_AND: bitwise_and(d, s0, s1); break;
+ case Shader::OPCODE_EQ: equal(d, s0, s1); break;
+ case Shader::OPCODE_NE: notEqual(d, s0, s1); break;
+ case Shader::OPCODE_TEXLDL: TEXLOD(d, s0, src1, s0.w); break;
+ case Shader::OPCODE_TEXLOD: TEXLOD(d, s0, src1, s2.x); break;
+ case Shader::OPCODE_TEX: TEX(d, s0, src1); break;
+ case Shader::OPCODE_TEXOFFSET: TEXOFFSET(d, s0, src1, s2); break;
+ case Shader::OPCODE_TEXLODOFFSET: TEXLODOFFSET(d, s0, src1, s2, s3.x); break;
+ case Shader::OPCODE_TEXELFETCH: TEXELFETCH(d, s0, src1, s2.x); break;
+ case Shader::OPCODE_TEXELFETCHOFFSET: TEXELFETCHOFFSET(d, s0, src1, s2, s3.x); break;
+ case Shader::OPCODE_TEXGRAD: TEXGRAD(d, s0, src1, s2, s3); break;
+ case Shader::OPCODE_TEXGRADOFFSET: TEXGRADOFFSET(d, s0, src1, s2, s3, s4); break;
+ case Shader::OPCODE_TEXSIZE: TEXSIZE(d, s0.x, src1); break;
+ case Shader::OPCODE_END: break;
+ default:
+ ASSERT(false);
+ }
+ // Write-back: optional saturate, then predication/enable masking, then the store to the destination.
+ if(dst.type != Shader::PARAMETER_VOID && dst.type != Shader::PARAMETER_LABEL && opcode != Shader::OPCODE_NOP)
+ {
+ if(dst.saturate)
+ {
+ if(dst.x) d.x = Max(d.x, Float4(0.0f));
+ if(dst.y) d.y = Max(d.y, Float4(0.0f));
+ if(dst.z) d.z = Max(d.z, Float4(0.0f));
+ if(dst.w) d.w = Max(d.w, Float4(0.0f));
+
+ if(dst.x) d.x = Min(d.x, Float4(1.0f));
+ if(dst.y) d.y = Min(d.y, Float4(1.0f));
+ if(dst.z) d.z = Min(d.z, Float4(1.0f));
+ if(dst.w) d.w = Min(d.w, Float4(1.0f));
+ }
+
+ if(instruction->isPredicated())
+ {
+ Vector4f pDst; // FIXME: Rename
+ // pDst receives the current contents of the destination, used below for the disabled elements.
+ switch(dst.type)
+ {
+ case Shader::PARAMETER_VOID: break;
+ case Shader::PARAMETER_TEMP:
+ if(dst.rel.type == Shader::PARAMETER_VOID)
+ {
+ if(dst.x) pDst.x = r[dst.index].x;
+ if(dst.y) pDst.y = r[dst.index].y;
+ if(dst.z) pDst.z = r[dst.index].z;
+ if(dst.w) pDst.w = r[dst.index].w;
+ }
+ else if(!dst.rel.dynamic)
+ {
+ Int a = dst.index + relativeAddress(dst.rel);
+
+ if(dst.x) pDst.x = r[a].x;
+ if(dst.y) pDst.y = r[a].y;
+ if(dst.z) pDst.z = r[a].z;
+ if(dst.w) pDst.w = r[a].w;
+ }
+ else
+ {
+ Int4 a = dst.index + dynamicAddress(dst.rel);
+
+ if(dst.x) pDst.x = r[a].x;
+ if(dst.y) pDst.y = r[a].y;
+ if(dst.z) pDst.z = r[a].z;
+ if(dst.w) pDst.w = r[a].w;
+ }
+ break;
+ case Shader::PARAMETER_ADDR: pDst = a0; break;
+ case Shader::PARAMETER_RASTOUT:
+ switch(dst.index)
+ {
+ case 0:
+ if(dst.x) pDst.x = o[Pos].x;
+ if(dst.y) pDst.y = o[Pos].y;
+ if(dst.z) pDst.z = o[Pos].z;
+ if(dst.w) pDst.w = o[Pos].w;
+ break;
+ case 1:
+ pDst.x = o[Fog].x;
+ break;
+ case 2:
+ pDst.x = o[Pts].y;
+ break;
+ default:
+ ASSERT(false);
+ }
+ break;
+ case Shader::PARAMETER_ATTROUT:
+ if(dst.x) pDst.x = o[C0 + dst.index].x;
+ if(dst.y) pDst.y = o[C0 + dst.index].y;
+ if(dst.z) pDst.z = o[C0 + dst.index].z;
+ if(dst.w) pDst.w = o[C0 + dst.index].w;
+ break;
+ case Shader::PARAMETER_TEXCRDOUT:
+ // case Shader::PARAMETER_OUTPUT:
+ if(shaderModel < 0x0300)
+ {
+ if(dst.x) pDst.x = o[T0 + dst.index].x;
+ if(dst.y) pDst.y = o[T0 + dst.index].y;
+ if(dst.z) pDst.z = o[T0 + dst.index].z;
+ if(dst.w) pDst.w = o[T0 + dst.index].w;
+ }
+ else if(dst.rel.type == Shader::PARAMETER_VOID) // Not relative
+ {
+ if(dst.x) pDst.x = o[dst.index].x;
+ if(dst.y) pDst.y = o[dst.index].y;
+ if(dst.z) pDst.z = o[dst.index].z;
+ if(dst.w) pDst.w = o[dst.index].w;
+ }
+ else if(!dst.rel.dynamic)
+ {
+ Int a = dst.index + relativeAddress(dst.rel);
+
+ if(dst.x) pDst.x = o[a].x;
+ if(dst.y) pDst.y = o[a].y;
+ if(dst.z) pDst.z = o[a].z;
+ if(dst.w) pDst.w = o[a].w;
+ }
+ else
+ {
+ Int4 a = dst.index + dynamicAddress(dst.rel);
+
+ if(dst.x) pDst.x = o[a].x;
+ if(dst.y) pDst.y = o[a].y;
+ if(dst.z) pDst.z = o[a].z;
+ if(dst.w) pDst.w = o[a].w;
+ }
+ break;
+ case Shader::PARAMETER_LABEL: break;
+ case Shader::PARAMETER_PREDICATE: pDst = p0; break;
+ case Shader::PARAMETER_INPUT: break;
+ default:
+ ASSERT(false);
+ }
+ // Start from the instruction's enable mask, then narrow it per component with the (swizzled) predicate.
+ Int4 enable = enableMask(instruction);
+
+ Int4 xEnable = enable;
+ Int4 yEnable = enable;
+ Int4 zEnable = enable;
+ Int4 wEnable = enable;
+
+ if(predicate)
+ {
+ unsigned char pSwizzle = instruction->predicateSwizzle;
+
+ Float4 xPredicate = p0[(pSwizzle >> 0) & 0x03];
+ Float4 yPredicate = p0[(pSwizzle >> 2) & 0x03];
+ Float4 zPredicate = p0[(pSwizzle >> 4) & 0x03];
+ Float4 wPredicate = p0[(pSwizzle >> 6) & 0x03];
+
+ if(!instruction->predicateNot)
+ {
+ if(dst.x) xEnable = xEnable & As<Int4>(xPredicate);
+ if(dst.y) yEnable = yEnable & As<Int4>(yPredicate);
+ if(dst.z) zEnable = zEnable & As<Int4>(zPredicate);
+ if(dst.w) wEnable = wEnable & As<Int4>(wPredicate);
+ }
+ else
+ {
+ if(dst.x) xEnable = xEnable & ~As<Int4>(xPredicate);
+ if(dst.y) yEnable = yEnable & ~As<Int4>(yPredicate);
+ if(dst.z) zEnable = zEnable & ~As<Int4>(zPredicate);
+ if(dst.w) wEnable = wEnable & ~As<Int4>(wPredicate);
+ }
+ }
+ // Bitwise blend: keep the new result where the mask is set, the old destination value (pDst) elsewhere.
+ if(dst.x) d.x = As<Float4>(As<Int4>(d.x) & xEnable);
+ if(dst.y) d.y = As<Float4>(As<Int4>(d.y) & yEnable);
+ if(dst.z) d.z = As<Float4>(As<Int4>(d.z) & zEnable);
+ if(dst.w) d.w = As<Float4>(As<Int4>(d.w) & wEnable);
+
+ if(dst.x) d.x = As<Float4>(As<Int4>(d.x) | (As<Int4>(pDst.x) & ~xEnable));
+ if(dst.y) d.y = As<Float4>(As<Int4>(d.y) | (As<Int4>(pDst.y) & ~yEnable));
+ if(dst.z) d.z = As<Float4>(As<Int4>(d.z) | (As<Int4>(pDst.z) & ~zEnable));
+ if(dst.w) d.w = As<Float4>(As<Int4>(d.w) | (As<Int4>(pDst.w) & ~wEnable));
+ }
+ // Store the result to the destination selected by dst.type, honoring the write mask (dst.x/y/z/w).
+ switch(dst.type)
+ {
+ case Shader::PARAMETER_VOID:
+ break;
+ case Shader::PARAMETER_TEMP:
+ if(dst.rel.type == Shader::PARAMETER_VOID)
+ {
+ if(dst.x) r[dst.index].x = d.x;
+ if(dst.y) r[dst.index].y = d.y;
+ if(dst.z) r[dst.index].z = d.z;
+ if(dst.w) r[dst.index].w = d.w;
+ }
+ else if(!dst.rel.dynamic)
+ {
+ Int a = dst.index + relativeAddress(dst.rel);
+
+ if(dst.x) r[a].x = d.x;
+ if(dst.y) r[a].y = d.y;
+ if(dst.z) r[a].z = d.z;
+ if(dst.w) r[a].w = d.w;
+ }
+ else
+ {
+ Int4 a = dst.index + dynamicAddress(dst.rel);
+
+ if(dst.x) r.scatter_x(a, d.x);
+ if(dst.y) r.scatter_y(a, d.y);
+ if(dst.z) r.scatter_z(a, d.z);
+ if(dst.w) r.scatter_w(a, d.w);
+ }
+ break;
+ case Shader::PARAMETER_ADDR:
+ if(dst.x) a0.x = d.x;
+ if(dst.y) a0.y = d.y;
+ if(dst.z) a0.z = d.z;
+ if(dst.w) a0.w = d.w;
+ break;
+ case Shader::PARAMETER_RASTOUT:
+ switch(dst.index)
+ {
+ case 0:
+ if(dst.x) o[Pos].x = d.x;
+ if(dst.y) o[Pos].y = d.y;
+ if(dst.z) o[Pos].z = d.z;
+ if(dst.w) o[Pos].w = d.w;
+ break;
+ case 1:
+ o[Fog].x = d.x;
+ break;
+ case 2:
+ o[Pts].y = d.x;
+ break;
+ default: ASSERT(false);
+ }
+ break;
+ case Shader::PARAMETER_ATTROUT:
+ if(dst.x) o[C0 + dst.index].x = d.x;
+ if(dst.y) o[C0 + dst.index].y = d.y;
+ if(dst.z) o[C0 + dst.index].z = d.z;
+ if(dst.w) o[C0 + dst.index].w = d.w;
+ break;
+ case Shader::PARAMETER_TEXCRDOUT:
+ // case Shader::PARAMETER_OUTPUT:
+ if(shaderModel < 0x0300)
+ {
+ if(dst.x) o[T0 + dst.index].x = d.x;
+ if(dst.y) o[T0 + dst.index].y = d.y;
+ if(dst.z) o[T0 + dst.index].z = d.z;
+ if(dst.w) o[T0 + dst.index].w = d.w;
+ }
+ else if(dst.rel.type == Shader::PARAMETER_VOID) // Not relative
+ {
+ if(dst.x) o[dst.index].x = d.x;
+ if(dst.y) o[dst.index].y = d.y;
+ if(dst.z) o[dst.index].z = d.z;
+ if(dst.w) o[dst.index].w = d.w;
+ }
+ else if(!dst.rel.dynamic)
+ {
+ Int a = dst.index + relativeAddress(dst.rel);
+
+ if(dst.x) o[a].x = d.x;
+ if(dst.y) o[a].y = d.y;
+ if(dst.z) o[a].z = d.z;
+ if(dst.w) o[a].w = d.w;
+ }
+ else
+ {
+ Int4 a = dst.index + dynamicAddress(dst.rel);
+
+ if(dst.x) o.scatter_x(a, d.x);
+ if(dst.y) o.scatter_y(a, d.y);
+ if(dst.z) o.scatter_z(a, d.z);
+ if(dst.w) o.scatter_w(a, d.w);
+ }
+ break;
+ case Shader::PARAMETER_LABEL: break;
+ case Shader::PARAMETER_PREDICATE: p0 = d; break;
+ case Shader::PARAMETER_INPUT: break;
+ default:
+ ASSERT(false);
+ }
+ }
+ }
+ // If a label is still current after the last instruction, switch the insert point to returnBlock.
+ if(currentLabel != -1)
+ {
+ Nucleus::setInsertBlock(returnBlock);
+ }
+ }
+
+ void VertexProgram::passThrough()
+ {
+ // Copies vertex data from the inputs (v) to the outputs (o) without
+ // executing any shader instructions.
+
+ if(!shader)
+ {
+ // No shader: forward the PositionT input, two colors, eight texture
+ // coordinate sets and the point size to their fixed output slots.
+ o[Pos].x = v[PositionT].x;
+ o[Pos].y = v[PositionT].y;
+ o[Pos].z = v[PositionT].z;
+ o[Pos].w = v[PositionT].w;
+
+ for(int color = 0; color < 2; color++)
+ {
+ o[C0 + color].x = v[Color0 + color].x;
+ o[C0 + color].y = v[Color0 + color].y;
+ o[C0 + color].z = v[Color0 + color].z;
+ o[C0 + color].w = v[Color0 + color].w;
+ }
+
+ for(int stage = 0; stage < 8; stage++)
+ {
+ o[T0 + stage].x = v[TexCoord0 + stage].x;
+ o[T0 + stage].y = v[TexCoord0 + stage].y;
+ o[T0 + stage].z = v[TexCoord0 + stage].z;
+ o[T0 + stage].w = v[TexCoord0 + stage].w;
+ }
+
+ o[Pts].y = v[PointSize].x;
+
+ return;
+ }
+
+ // With a shader, output i is copied from input i according to the usage
+ // the shader reports for that output.
+ for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
+ {
+ const unsigned char usage = shader->getOutput(i, 0).usage;
+
+ switch(usage)
+ {
+ case 0xFF:
+ // Nothing is copied for this output.
+ continue;
+ case Shader::USAGE_PSIZE:
+ o[i].y = v[i].x; // The size is written to the y component.
+ break;
+ case Shader::USAGE_FOG:
+ o[i].x = v[i].x;
+ break;
+ case Shader::USAGE_TEXCOORD:
+ case Shader::USAGE_POSITION:
+ case Shader::USAGE_COLOR:
+ // All four components are copied.
+ o[i].x = v[i].x;
+ o[i].y = v[i].y;
+ o[i].z = v[i].z;
+ o[i].w = v[i].w;
+ break;
+ default:
+ ASSERT(false);
+ break;
+ }
+ }
+ }
+
+ Vector4f VertexProgram::fetchRegister(const Src &src, unsigned int offset)
+ { // Reads source operand 'src' (register index plus 'offset'), then applies its swizzle and modifier.
+ Vector4f reg;
+ unsigned int i = src.index + offset;
+ // Load the raw register by parameter type. Several types return early, bypassing swizzle and modifier.
+ switch(src.type)
+ {
+ case Shader::PARAMETER_TEMP:
+ if(src.rel.type == Shader::PARAMETER_VOID)
+ {
+ reg = r[i];
+ }
+ else if(!src.rel.dynamic)
+ {
+ reg = r[i + relativeAddress(src.rel, src.bufferIndex)];
+ }
+ else
+ {
+ reg = r[i + dynamicAddress(src.rel)];
+ }
+ break;
+ case Shader::PARAMETER_CONST:
+ reg = readConstant(src, offset);
+ break;
+ case Shader::PARAMETER_INPUT:
+ if(src.rel.type == Shader::PARAMETER_VOID)
+ {
+ reg = v[i];
+ }
+ else if(!src.rel.dynamic)
+ {
+ reg = v[i + relativeAddress(src.rel, src.bufferIndex)];
+ }
+ else
+ {
+ reg = v[i + dynamicAddress(src.rel)];
+ }
+ break;
+ case Shader::PARAMETER_VOID: return r[0]; // Dummy
+ case Shader::PARAMETER_FLOAT4LITERAL:
+ reg.x = Float4(src.value[0]);
+ reg.y = Float4(src.value[1]);
+ reg.z = Float4(src.value[2]);
+ reg.w = Float4(src.value[3]);
+ break;
+ case Shader::PARAMETER_ADDR: reg = a0; break;
+ case Shader::PARAMETER_CONSTBOOL: return r[0]; // Dummy
+ case Shader::PARAMETER_CONSTINT: return r[0]; // Dummy
+ case Shader::PARAMETER_LOOP: return r[0]; // Dummy
+ case Shader::PARAMETER_PREDICATE: return r[0]; // Dummy
+ case Shader::PARAMETER_SAMPLER: // Only reg.x is written: the sampler index as integer bits
+ if(src.rel.type == Shader::PARAMETER_VOID)
+ {
+ reg.x = As<Float4>(Int4(i));
+ }
+ else if(src.rel.type == Shader::PARAMETER_TEMP)
+ {
+ reg.x = As<Float4>(Int4(i) + As<Int4>(r[src.rel.index].x));
+ }
+ return reg;
+ case Shader::PARAMETER_OUTPUT:
+ if(src.rel.type == Shader::PARAMETER_VOID)
+ {
+ reg = o[i];
+ }
+ else if(!src.rel.dynamic)
+ {
+ reg = o[i + relativeAddress(src.rel, src.bufferIndex)];
+ }
+ else
+ {
+ reg = o[i + dynamicAddress(src.rel)];
+ }
+ break;
+ case Shader::PARAMETER_MISCTYPE: // Only reg.x is written: instance or vertex ID as integer bits
+ if(src.index == Shader::InstanceIDIndex)
+ {
+ reg.x = As<Float>(instanceID);
+ }
+ else if(src.index == Shader::VertexIDIndex)
+ {
+ reg.x = As<Float4>(vertexID);
+ }
+ else ASSERT(false);
+ return reg;
+ default:
+ ASSERT(false);
+ }
+ // Swizzle: each 2-bit field of src.swizzle picks the source component for x, y, z and w in turn.
+ const Float4 &x = reg[(src.swizzle >> 0) & 0x3];
+ const Float4 &y = reg[(src.swizzle >> 2) & 0x3];
+ const Float4 &z = reg[(src.swizzle >> 4) & 0x3];
+ const Float4 &w = reg[(src.swizzle >> 6) & 0x3];
+
+ Vector4f mod;
+ // Apply the source modifier to the swizzled components.
+ switch(src.modifier)
+ {
+ case Shader::MODIFIER_NONE:
+ mod.x = x;
+ mod.y = y;
+ mod.z = z;
+ mod.w = w;
+ break;
+ case Shader::MODIFIER_NEGATE:
+ mod.x = -x;
+ mod.y = -y;
+ mod.z = -z;
+ mod.w = -w;
+ break;
+ case Shader::MODIFIER_ABS:
+ mod.x = Abs(x);
+ mod.y = Abs(y);
+ mod.z = Abs(z);
+ mod.w = Abs(w);
+ break;
+ case Shader::MODIFIER_ABS_NEGATE:
+ mod.x = -Abs(x);
+ mod.y = -Abs(y);
+ mod.z = -Abs(z);
+ mod.w = -Abs(w);
+ break;
+ case Shader::MODIFIER_NOT: // Bitwise complement of the raw bits
+ mod.x = As<Float4>(As<Int4>(x) ^ Int4(0xFFFFFFFF));
+ mod.y = As<Float4>(As<Int4>(y) ^ Int4(0xFFFFFFFF));
+ mod.z = As<Float4>(As<Int4>(z) ^ Int4(0xFFFFFFFF));
+ mod.w = As<Float4>(As<Int4>(w) ^ Int4(0xFFFFFFFF));
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ return mod;
+ }
+
+ RValue<Pointer<Byte>> VertexProgram::uniformAddress(int bufferIndex, unsigned int index)
+ {
+ // A buffer index of -1 addresses the vs.c[] array inside the draw data; any other value
+ // loads the pointer stored in vs.u[bufferIndex] and adds 'index' to that byte pointer.
+ if(bufferIndex != -1)
+ {
+ return *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, vs.u[bufferIndex])) + index;
+ }
+
+ return data + OFFSET(DrawData, vs.c[index]);
+ }
+
+ RValue<Pointer<Byte>> VertexProgram::uniformAddress(int bufferIndex, unsigned int index, Int &offset)
+ { // Address from the two-argument overload, advanced by 'offset' elements of sizeof(float4) bytes each.
+ return uniformAddress(bufferIndex, index) + offset * sizeof(float4);
+ }
+
+ Vector4f VertexProgram::readConstant(const Src &src, unsigned int offset)
+ { // Reads constant register src.index + offset, with no, scalar-relative, or per-element dynamic addressing.
+ Vector4f c;
+ unsigned int i = src.index + offset;
+
+ if(src.rel.type == Shader::PARAMETER_VOID) // Not relative
+ {
+ c.x = c.y = c.z = c.w = *Pointer<Float4>(uniformAddress(src.bufferIndex, i));
+ // Replicate one component of the loaded float4 across each output vector.
+ c.x = c.x.xxxx;
+ c.y = c.y.yyyy;
+ c.z = c.z.zzzz;
+ c.w = c.w.wwww;
+ // A DEF instruction targeting this register overrides the loaded value with its literal.
+ if(shader->containsDefineInstruction()) // Constant may be known at compile time
+ {
+ for(size_t j = 0; j < shader->getLength(); j++)
+ {
+ const Shader::Instruction &instruction = *shader->getInstruction(j);
+
+ if(instruction.opcode == Shader::OPCODE_DEF)
+ {
+ if(instruction.dst.index == i)
+ {
+ c.x = Float4(instruction.src[0].value[0]);
+ c.y = Float4(instruction.src[0].value[1]);
+ c.z = Float4(instruction.src[0].value[2]);
+ c.w = Float4(instruction.src[0].value[3]);
+
+ break;
+ }
+ }
+ }
+ }
+ }
+ else if(!src.rel.dynamic || src.rel.type == Shader::PARAMETER_LOOP)
+ {
+ Int a = relativeAddress(src.rel, src.bufferIndex);
+
+ c.x = c.y = c.z = c.w = *Pointer<Float4>(uniformAddress(src.bufferIndex, i, a));
+ // Same per-component replication as in the non-relative case.
+ c.x = c.x.xxxx;
+ c.y = c.y.yyyy;
+ c.z = c.z.zzzz;
+ c.w = c.w.wwww;
+ }
+ else
+ {
+ int component = src.rel.swizzle & 0x03;
+ Float4 a;
+
+ switch(src.rel.type)
+ {
+ case Shader::PARAMETER_ADDR: a = a0[component]; break;
+ case Shader::PARAMETER_TEMP: a = r[src.rel.index][component]; break;
+ case Shader::PARAMETER_INPUT: a = v[src.rel.index][component]; break;
+ case Shader::PARAMETER_OUTPUT: a = o[src.rel.index][component]; break;
+ case Shader::PARAMETER_CONST: a = *Pointer<Float>(uniformAddress(src.bufferIndex, src.rel.index) + component * sizeof(float)); break;
+ case Shader::PARAMETER_MISCTYPE:
+ switch(src.rel.index)
+ {
+ case Shader::InstanceIDIndex: a = As<Float4>(Int4(instanceID)); break;
+ case Shader::VertexIDIndex: a = As<Float4>(vertexID); break;
+ default: ASSERT(false);
+ }
+ break;
+ default: ASSERT(false);
+ }
+ // Per-element register index: base index plus the address bits (as integers) times rel.scale.
+ Int4 index = Int4(i) + As<Int4>(a) * Int4(src.rel.scale);
+
+ index = Min(As<UInt4>(index), UInt4(VERTEX_UNIFORM_VECTORS)); // Clamp to constant register range, c[VERTEX_UNIFORM_VECTORS] = {0, 0, 0, 0}
+
+ Int index0 = Extract(index, 0);
+ Int index1 = Extract(index, 1);
+ Int index2 = Extract(index, 2);
+ Int index3 = Extract(index, 3);
+ // Load one float4 per element index; transpose4x4 then presumably regroups them per component (TODO confirm).
+ c.x = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index0), 16);
+ c.y = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index1), 16);
+ c.z = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index2), 16);
+ c.w = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index3), 16);
+
+ transpose4x4(c.x, c.y, c.z, c.w);
+ }
+
+ return c;
+ }
+
+ Int VertexProgram::relativeAddress(const Shader::Relative &rel, int bufferIndex)
+ {
+ // Scalar relative offset for non-dynamic addressing. For register sources this is
+ // element 0 of the register's x component reinterpreted as an integer; for constants
+ // it is the integer at the start of the constant. Both are multiplied by rel.scale.
+ ASSERT(!rel.dynamic);
+
+ switch(rel.type)
+ {
+ case Shader::PARAMETER_TEMP:
+ return As<Int>(Extract(r[rel.index].x, 0)) * rel.scale;
+ case Shader::PARAMETER_INPUT:
+ return As<Int>(Extract(v[rel.index].x, 0)) * rel.scale;
+ case Shader::PARAMETER_OUTPUT:
+ return As<Int>(Extract(o[rel.index].x, 0)) * rel.scale;
+ case Shader::PARAMETER_CONST:
+ return *Pointer<Int>(uniformAddress(bufferIndex, rel.index)) * rel.scale;
+ case Shader::PARAMETER_LOOP:
+ return aL[loopDepth]; // Returned as-is, without scaling.
+ default:
+ // Any other relative type is unexpected.
+ ASSERT(false);
+ break;
+ }
+
+ // Fallback value after the assertion above.
+ return 0;
+ }
+
+ Int4 VertexProgram::dynamicAddress(const Shader::Relative &rel)
+ {
+ // Per-element relative offset: the selected component of the referenced register,
+ // reinterpreted as integers and multiplied by rel.scale.
+ const int selected = rel.swizzle & 0x03;
+ Float4 address;
+
+ switch(rel.type)
+ {
+ case Shader::PARAMETER_ADDR: address = a0[selected]; break;
+ case Shader::PARAMETER_TEMP: address = r[rel.index][selected]; break;
+ case Shader::PARAMETER_INPUT: address = v[rel.index][selected]; break;
+ case Shader::PARAMETER_OUTPUT: address = o[rel.index][selected]; break;
+ case Shader::PARAMETER_MISCTYPE:
+ if(rel.index == Shader::InstanceIDIndex) address = As<Float>(instanceID);
+ else if(rel.index == Shader::VertexIDIndex) address = As<Float4>(vertexID);
+ else ASSERT(false);
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ return As<Int4>(address) * Int4(rel.scale);
+ }
+
+ Int4 VertexProgram::enableMask(const Shader::Instruction *instruction)
+ {
+ // Base mask: the current enable-stack entry for instructions flagged analysisBranch, all bits set otherwise.
+ Int4 mask = instruction->analysisBranch ? Int4(enableStack[enableIndex]) : Int4(0xFFFFFFFF);
+
+ // While whileTest is set, the break/continue/leave masks are not applied.
+ if(whileTest) return mask;
+
+ // Each mask applies only if the shader contains the instruction and the analysis flagged this one.
+ if(shader->containsBreakInstruction() && instruction->analysisBreak)
+ {
+ mask &= enableBreak;
+ }
+ if(shader->containsContinueInstruction() && instruction->analysisContinue)
+ {
+ mask &= enableContinue;
+ }
+ if(shader->containsLeaveInstruction() && instruction->analysisLeave)
+ {
+ mask &= enableLeave;
+ }
+
+ return mask;
+ }
+
+ void VertexProgram::M3X2(Vector4f &dst, Vector4f &src0, Src &src1)
+ {
+ Vector4f rowX = fetchRegister(src1, 0);
+ Vector4f rowY = fetchRegister(src1, 1);
+ // dst.xy: dot3 of src0 with two consecutive registers starting at src1.
+ dst.x = dot3(src0, rowX);
+ dst.y = dot3(src0, rowY);
+ }
+
+ void VertexProgram::M3X3(Vector4f &dst, Vector4f &src0, Src &src1)
+ {
+ // Three 3-component dot products of src0 with the rows fetched from src1 at offsets 0..2.
+ // All rows are fetched, in order, before any component of dst is written.
+ Vector4f m0 = fetchRegister(src1, 0), m1 = fetchRegister(src1, 1), m2 = fetchRegister(src1, 2);
+
+ dst.x = dot3(src0, m0);
+ dst.y = dot3(src0, m1);
+ dst.z = dot3(src0, m2); // dst.w is left untouched.
+ }
+
+ void VertexProgram::M3X4(Vector4f &dst, Vector4f &src0, Src &src1)
+ {
+ // Four 3-component dot products of src0 with the rows fetched from src1 at offsets 0..3.
+ // All rows are fetched, in order, before any component of dst is written.
+ Vector4f m0 = fetchRegister(src1, 0), m1 = fetchRegister(src1, 1);
+ Vector4f m2 = fetchRegister(src1, 2), m3 = fetchRegister(src1, 3);
+
+ dst.x = dot3(src0, m0);
+ dst.y = dot3(src0, m1);
+ dst.z = dot3(src0, m2);
+ dst.w = dot3(src0, m3);
+ }
+
+ void VertexProgram::M4X3(Vector4f &dst, Vector4f &src0, Src &src1)
+ {
+ // Three 4-component dot products of src0 with the rows fetched from src1 at offsets 0..2.
+ // All rows are fetched, in order, before any component of dst is written.
+ Vector4f m0 = fetchRegister(src1, 0), m1 = fetchRegister(src1, 1), m2 = fetchRegister(src1, 2);
+
+ dst.x = dot4(src0, m0);
+ dst.y = dot4(src0, m1);
+ dst.z = dot4(src0, m2); // dst.w is left untouched.
+ }
+
+ void VertexProgram::M4X4(Vector4f &dst, Vector4f &src0, Src &src1)
+ {
+ // Four 4-component dot products of src0 with the rows fetched from src1 at offsets 0..3.
+ // All rows are fetched, in order, before any component of dst is written.
+ Vector4f m0 = fetchRegister(src1, 0), m1 = fetchRegister(src1, 1);
+ Vector4f m2 = fetchRegister(src1, 2), m3 = fetchRegister(src1, 3);
+
+ dst.x = dot4(src0, m0);
+ dst.y = dot4(src0, m1);
+ dst.z = dot4(src0, m2);
+ dst.w = dot4(src0, m3);
+ }
+
+ void VertexProgram::BREAK() // Unconditional break: clears from enableBreak every lane set in the current enable mask.
+ {
+ enableBreak = enableBreak & ~enableStack[enableIndex];
+ }
+
+ void VertexProgram::BREAKC(Vector4f &src0, Vector4f &src1, Control control) // Conditional break: compares src0.x with src1.x as selected by control and breaks where the comparison holds.
+ {
+ Int4 condition; // Comparison result, passed on to BREAK(Int4&).
+
+ switch(control)
+ {
+ case Shader::CONTROL_GT: condition = CmpNLE(src0.x, src1.x); break; // "Not less-or-equal" is used for greater-than.
+ case Shader::CONTROL_EQ: condition = CmpEQ(src0.x, src1.x); break;
+ case Shader::CONTROL_GE: condition = CmpNLT(src0.x, src1.x); break; // "Not less-than" is used for greater-or-equal.
+ case Shader::CONTROL_LT: condition = CmpLT(src0.x, src1.x); break;
+ case Shader::CONTROL_NE: condition = CmpNEQ(src0.x, src1.x); break;
+ case Shader::CONTROL_LE: condition = CmpLE(src0.x, src1.x); break;
+ default:
+ ASSERT(false); // Unknown comparison; condition is left unassigned in that case.
+ }
+
+ BREAK(condition);
+ }
+
+ void VertexProgram::BREAKP(const Src &predicateRegister) // FIXME: Factor out parts common with BREAKC. Break on the predicate register p0.
+ {
+ Int4 condition = As<Int4>(p0[predicateRegister.swizzle & 0x3]); // Low two swizzle bits pick the p0 component used as mask.
+
+ if(predicateRegister.modifier == Shader::MODIFIER_NOT)
+ {
+ condition = ~condition; // Negated predicate.
+ }
+
+ BREAK(condition);
+ }
+
+ void VertexProgram::BREAK(Int4 &condition) // Masked break; note that condition is modified in place.
+ {
+ condition &= enableStack[enableIndex]; // Only lanes enabled in the current mask can break.
+
+ enableBreak = enableBreak & ~condition; // Clear those lanes from the break mask.
+ }
+
+ void VertexProgram::CONTINUE() // Clears from enableContinue every lane set in the current enable mask.
+ {
+ enableContinue = enableContinue & ~enableStack[enableIndex];
+ }
+
+ void VertexProgram::TEST() // Sets whileTest, which makes enableMask() skip the break/continue/leave masks until ENDWHILE() clears it.
+ {
+ whileTest = true;
+ }
+
+ void VertexProgram::CALL(int labelIndex, int callSiteIndex) // Emits an unconditional branch to label labelIndex and continues emission in this call site's return block.
+ {
+ if(!labelBlock[labelIndex]) // The label's block is created on first reference; LABEL() does the same check.
+ {
+ labelBlock[labelIndex] = Nucleus::createBasicBlock();
+ }
+
+ if(callRetBlock[labelIndex].size() > 1) // Several call sites: record which one this is so RET() can select the return block.
+ {
+ callStack[stackIndex++] = UInt(callSiteIndex);
+ }
+
+ Int4 restoreLeave = enableLeave; // Snapshot of the leave mask, written back in the return block below.
+
+ Nucleus::createBr(labelBlock[labelIndex]);
+ Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
+
+ enableLeave = restoreLeave;
+ }
+
+ void VertexProgram::CALLNZ(int labelIndex, int callSiteIndex, const Src &src)
+ {
+ // Conditional call: forward to the variant matching the type of the condition register.
+ switch(src.type)
+ {
+ case Shader::PARAMETER_CONSTBOOL: // Boolean constant register.
+ CALLNZb(labelIndex, callSiteIndex, src); break;
+ case Shader::PARAMETER_PREDICATE: // Predicate register.
+ CALLNZp(labelIndex, callSiteIndex, src); break;
+ default: ASSERT(false);
+ }
+ }
+
+ void VertexProgram::CALLNZb(int labelIndex, int callSiteIndex, const Src &boolRegister) // Calls label labelIndex when the boolean constant selected by boolRegister is set.
+ {
+ Bool condition = (*Pointer<Byte>(data + OFFSET(DrawData,vs.b[boolRegister.index])) != Byte(0)); // FIXME (unspecified in original). True when the vs.b[] byte is non-zero.
+
+ if(boolRegister.modifier == Shader::MODIFIER_NOT)
+ {
+ condition = !condition;
+ }
+
+ if(!labelBlock[labelIndex]) // The label's block is created on first reference.
+ {
+ labelBlock[labelIndex] = Nucleus::createBasicBlock();
+ }
+
+ if(callRetBlock[labelIndex].size() > 1) // Several call sites: record which one this is so RET() can select the return block.
+ {
+ callStack[stackIndex++] = UInt(callSiteIndex); // NOTE(review): pushed before the conditional branch; nothing visible here pops it when the call is not taken - confirm stackIndex stays balanced.
+ }
+
+ Int4 restoreLeave = enableLeave; // Snapshot of the leave mask, written back in the return block below.
+
+ branch(condition, labelBlock[labelIndex], callRetBlock[labelIndex][callSiteIndex]); // False goes straight to the return block.
+ Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
+
+ enableLeave = restoreLeave;
+ }
+
+ void VertexProgram::CALLNZp(int labelIndex, int callSiteIndex, const Src &predicateRegister) // Calls label labelIndex for the lanes selected by predicate register p0.
+ {
+ Int4 condition = As<Int4>(p0[predicateRegister.swizzle & 0x3]); // Low two swizzle bits pick the p0 component used as mask.
+
+ if(predicateRegister.modifier == Shader::MODIFIER_NOT)
+ {
+ condition = ~condition;
+ }
+
+ condition &= enableStack[enableIndex]; // Restrict to lanes enabled at the call site.
+
+ if(!labelBlock[labelIndex]) // The label's block is created on first reference.
+ {
+ labelBlock[labelIndex] = Nucleus::createBasicBlock();
+ }
+
+ if(callRetBlock[labelIndex].size() > 1) // Several call sites: record which one this is so RET() can select the return block.
+ {
+ callStack[stackIndex++] = UInt(callSiteIndex); // NOTE(review): pushed before the conditional branch; nothing visible here pops it when the call is not taken - confirm stackIndex stays balanced.
+ }
+
+ enableIndex++; // Push the call's mask for the duration of the callee.
+ enableStack[enableIndex] = condition;
+ Int4 restoreLeave = enableLeave; // Snapshot of the leave mask, written back in the return block below.
+
+ Bool notAllFalse = SignMask(condition) != 0; // Call is skipped only when no lane has its sign bit set.
+ branch(notAllFalse, labelBlock[labelIndex], callRetBlock[labelIndex][callSiteIndex]);
+ Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
+
+ enableIndex--; // Emitted in the return block: pop the call's mask.
+ enableLeave = restoreLeave;
+ }
+
+ void VertexProgram::ELSE() // Closes the true part of the innermost IF and opens its false part.
+ {
+ ifDepth--; // Step back to the innermost IF's slot; restored at the end of this function.
+
+ BasicBlock *falseBlock = ifFalseBlock[ifDepth]; // Block registered by IFb()/IF(Int4&) as the branch target for "false".
+ BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+ if(isConditionalIf[ifDepth]) // Lane-masked IF: both parts may execute, under complementary masks.
+ {
+ Int4 condition = ~enableStack[enableIndex] & enableStack[enableIndex - 1]; // Lanes enabled outside the IF that did not take the true part.
+ Bool notAllFalse = SignMask(condition) != 0;
+
+ branch(notAllFalse, falseBlock, endBlock); // Skip the false part entirely when no lane needs it.
+
+ enableStack[enableIndex] = ~enableStack[enableIndex] & enableStack[enableIndex - 1]; // Replace the IF's mask with the ELSE mask (same expression as 'condition').
+ }
+ else // Uniform IF (opened by IFb): plain branch structure, no mask changes.
+ {
+ Nucleus::createBr(endBlock);
+ Nucleus::setInsertBlock(falseBlock);
+ }
+
+ ifFalseBlock[ifDepth] = endBlock; // ENDIF() joins at whatever block is stored in this slot.
+
+ ifDepth++;
+ }
+
+ void VertexProgram::ENDIF() // Closes the innermost IF/ELSE construct.
+ {
+ ifDepth--; // Pop the IF nesting level.
+
+ BasicBlock *endBlock = ifFalseBlock[ifDepth]; // The false block if there was no ELSE, otherwise the end block stored by ELSE().
+
+ Nucleus::createBr(endBlock);
+ Nucleus::setInsertBlock(endBlock);
+
+ if(isConditionalIf[ifDepth])
+ {
+ enableIndex--; // Pop the mask pushed by IF(Int4&).
+ }
+ }
+
+ void VertexProgram::ENDLOOP() // Closes a loop opened by LOOP().
+ {
+ loopRepDepth--; // Pop the host-side loop/rep block stack.
+
+ aL[loopDepth] = aL[loopDepth] + increment[loopDepth]; // FIXME: += (advances the loop counter register by the loop's increment)
+
+ BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+ BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+ Nucleus::createBr(testBlock); // Jump back to the iteration test.
+ Nucleus::setInsertBlock(endBlock); // Code after the loop is emitted into the end block.
+
+ loopDepth--;
+ enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF); // Re-enable all lanes for break after leaving the loop.
+ }
+
+ void VertexProgram::ENDREP() // Closes a loop opened by REP(); same as ENDLOOP() except that aL is not advanced.
+ {
+ loopRepDepth--; // Pop the host-side loop/rep block stack.
+
+ BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+ BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+ Nucleus::createBr(testBlock); // Jump back to the iteration test.
+ Nucleus::setInsertBlock(endBlock); // Code after the loop is emitted into the end block.
+
+ loopDepth--;
+ enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF); // Re-enable all lanes for break after leaving the loop.
+ }
+
+ void VertexProgram::ENDWHILE() // Closes a loop opened by WHILE().
+ {
+ loopRepDepth--; // Pop the host-side loop/rep block stack.
+
+ BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+ BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+ Nucleus::createBr(testBlock); // Jump back to re-evaluate the loop condition.
+ Nucleus::setInsertBlock(endBlock); // Code after the loop is emitted into the end block.
+
+ enableIndex--; // Pop the mask slot reserved by WHILE().
+ whileTest = false; // Ends the state set by TEST().
+ }
+
+ void VertexProgram::ENDSWITCH() // Closes a construct opened by SWITCH().
+ {
+ loopRepDepth--; // SWITCH() shares the loop/rep block stack.
+
+ BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+ Nucleus::createBr(endBlock); // Fall into the end block, then continue emitting there.
+ Nucleus::setInsertBlock(endBlock);
+ }
+
+ void VertexProgram::IF(const Src &src)
+ {
+ if(src.type == Shader::PARAMETER_CONSTBOOL) // Boolean constant register.
+ {
+ IFb(src);
+ return;
+ }
+ if(src.type == Shader::PARAMETER_PREDICATE) // Predicate register.
+ {
+ IFp(src);
+ return;
+ }
+
+ Int4 mask = As<Int4>(fetchRegister(src).x); // Any other register: its x component is the mask.
+ IF(mask);
+ }
+
+ void VertexProgram::IFb(const Src &boolRegister) // Opens an IF on a boolean constant; emits a plain branch and leaves the enable stack untouched.
+ {
+ ASSERT(ifDepth < 24 + 4); // NOTE(review): ifFalseBlock/isConditionalIf are declared with 24 + 24 entries and IF(Int4&) has no such check - confirm the intended limit.
+
+ Bool condition = (*Pointer<Byte>(data + OFFSET(DrawData,vs.b[boolRegister.index])) != Byte(0)); // FIXME (unspecified in original). True when the vs.b[] byte is non-zero.
+
+ if(boolRegister.modifier == Shader::MODIFIER_NOT)
+ {
+ condition = !condition;
+ }
+
+ BasicBlock *trueBlock = Nucleus::createBasicBlock();
+ BasicBlock *falseBlock = Nucleus::createBasicBlock();
+
+ branch(condition, trueBlock, falseBlock); // Presumably leaves the insert point in trueBlock - confirm in Reactor's branch().
+
+ isConditionalIf[ifDepth] = false; // Tells ELSE()/ENDIF() that no mask was pushed.
+ ifFalseBlock[ifDepth] = falseBlock;
+
+ ifDepth++;
+ }
+
+ void VertexProgram::IFp(const Src &predicateRegister) // Opens a lane-masked IF on a component of predicate register p0.
+ {
+ Int4 condition = As<Int4>(p0[predicateRegister.swizzle & 0x3]); // Low two swizzle bits pick the p0 component used as mask.
+
+ if(predicateRegister.modifier == Shader::MODIFIER_NOT)
+ {
+ condition = ~condition; // Negated predicate.
+ }
+
+ IF(condition);
+ }
+
+ void VertexProgram::IFC(Vector4f &src0, Vector4f &src1, Control control) // Opens a lane-masked IF on a comparison of src0.x with src1.x, selected by control.
+ {
+ Int4 condition; // Comparison result, passed on to IF(Int4&).
+
+ switch(control)
+ {
+ case Shader::CONTROL_GT: condition = CmpNLE(src0.x, src1.x); break; // "Not less-or-equal" is used for greater-than.
+ case Shader::CONTROL_EQ: condition = CmpEQ(src0.x, src1.x); break;
+ case Shader::CONTROL_GE: condition = CmpNLT(src0.x, src1.x); break; // "Not less-than" is used for greater-or-equal.
+ case Shader::CONTROL_LT: condition = CmpLT(src0.x, src1.x); break;
+ case Shader::CONTROL_NE: condition = CmpNEQ(src0.x, src1.x); break;
+ case Shader::CONTROL_LE: condition = CmpLE(src0.x, src1.x); break;
+ default:
+ ASSERT(false); // Unknown comparison; condition is left unassigned in that case.
+ }
+
+ IF(condition);
+ }
+
+ void VertexProgram::IF(Int4 &condition) // Opens a lane-masked IF; note that condition is modified in place.
+ {
+ condition &= enableStack[enableIndex]; // Only lanes enabled in the enclosing scope can take the true part.
+
+ enableIndex++; // Push the IF's mask; popped by ENDIF().
+ enableStack[enableIndex] = condition;
+
+ BasicBlock *trueBlock = Nucleus::createBasicBlock();
+ BasicBlock *falseBlock = Nucleus::createBasicBlock();
+
+ Bool notAllFalse = SignMask(condition) != 0; // The true part is skipped only when no lane has its sign bit set.
+
+ branch(notAllFalse, trueBlock, falseBlock); // Presumably leaves the insert point in trueBlock - confirm in Reactor's branch().
+
+ isConditionalIf[ifDepth] = true; // Tells ELSE()/ENDIF() that a mask was pushed.
+ ifFalseBlock[ifDepth] = falseBlock;
+
+ ifDepth++;
+ }
+
+ void VertexProgram::LABEL(int labelIndex) // Starts emitting the body of label labelIndex.
+ {
+ if(!labelBlock[labelIndex]) // May already exist if a CALL/CALLNZ referencing this label was processed first.
+ {
+ labelBlock[labelIndex] = Nucleus::createBasicBlock();
+ }
+
+ Nucleus::setInsertBlock(labelBlock[labelIndex]);
+ currentLabel = labelIndex; // Read by RET() to find this label's return blocks.
+ }
+
+ void VertexProgram::LOOP(const Src &integerRegister) // Opens a counted loop whose parameters come from integer constant vs.i[integerRegister.index].
+ {
+ loopDepth++;
+
+ iteration[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData,vs.i[integerRegister.index][0])); // Component 0: iteration count.
+ aL[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData,vs.i[integerRegister.index][1])); // Component 1: initial loop counter value.
+ increment[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData,vs.i[integerRegister.index][2])); // Component 2: step added to aL by ENDLOOP().
+
+ // FIXME: Compiles to two instructions?
+ If(increment[loopDepth] == 0) // A zero step is replaced by 1.
+ {
+ increment[loopDepth] = 1;
+ }
+
+ BasicBlock *loopBlock = Nucleus::createBasicBlock();
+ BasicBlock *testBlock = Nucleus::createBasicBlock();
+ BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+ loopRepTestBlock[loopRepDepth] = testBlock; // Looked up again by ENDLOOP().
+ loopRepEndBlock[loopRepDepth] = endBlock;
+
+ // FIXME: jump(testBlock)
+ Nucleus::createBr(testBlock);
+ Nucleus::setInsertBlock(testBlock);
+
+ branch(iteration[loopDepth] > 0, loopBlock, endBlock); // Exit once the remaining count reaches zero.
+ Nucleus::setInsertBlock(loopBlock);
+
+ iteration[loopDepth] = iteration[loopDepth] - 1; // FIXME: -- (count down at the top of the body)
+
+ loopRepDepth++;
+ }
+
+ void VertexProgram::REP(const Src &integerRegister) // Opens a repeat loop whose iteration count comes from vs.i[integerRegister.index][0].
+ {
+ loopDepth++;
+
+ iteration[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData,vs.i[integerRegister.index][0])); // Component 0: iteration count.
+ aL[loopDepth] = aL[loopDepth - 1]; // The loop counter is carried over from the enclosing level; ENDREP() does not advance it.
+
+ BasicBlock *loopBlock = Nucleus::createBasicBlock();
+ BasicBlock *testBlock = Nucleus::createBasicBlock();
+ BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+ loopRepTestBlock[loopRepDepth] = testBlock; // Looked up again by ENDREP().
+ loopRepEndBlock[loopRepDepth] = endBlock;
+
+ // FIXME: jump(testBlock)
+ Nucleus::createBr(testBlock);
+ Nucleus::setInsertBlock(testBlock);
+
+ branch(iteration[loopDepth] > 0, loopBlock, endBlock); // Exit once the remaining count reaches zero.
+ Nucleus::setInsertBlock(loopBlock);
+
+ iteration[loopDepth] = iteration[loopDepth] - 1; // FIXME: -- (count down at the top of the body)
+
+ loopRepDepth++;
+ }
+
+ void VertexProgram::WHILE(const Src &temporaryRegister) // Opens a lane-masked while loop whose condition is the x component of temporaryRegister.
+ {
+ enableIndex++; // Reserve an enable-stack slot for the loop's mask; popped by ENDWHILE().
+
+ BasicBlock *loopBlock = Nucleus::createBasicBlock();
+ BasicBlock *testBlock = Nucleus::createBasicBlock();
+ BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+ loopRepTestBlock[loopRepDepth] = testBlock; // Looked up again by ENDWHILE().
+ loopRepEndBlock[loopRepDepth] = endBlock;
+
+ Int4 restoreBreak = enableBreak; // Snapshots taken before entering the loop; written back below.
+ Int4 restoreContinue = enableContinue;
+
+ // TODO: jump(testBlock)
+ Nucleus::createBr(testBlock);
+ Nucleus::setInsertBlock(testBlock);
+ enableContinue = restoreContinue; // Emitted in the test block, so the continue mask is reset each time the condition is re-tested.
+
+ const Vector4f &src = fetchRegister(temporaryRegister);
+ Int4 condition = As<Int4>(src.x);
+ condition &= enableStack[enableIndex - 1]; // Restrict to lanes enabled outside the loop.
+ if(shader->containsLeaveInstruction()) condition &= enableLeave; // Lanes that left or broke stop iterating.
+ if(shader->containsBreakInstruction()) condition &= enableBreak;
+ enableStack[enableIndex] = condition;
+
+ Bool notAllFalse = SignMask(condition) != 0; // Keep looping while any lane has its sign bit set.
+ branch(notAllFalse, loopBlock, endBlock);
+
+ Nucleus::setInsertBlock(endBlock);
+ enableBreak = restoreBreak; // Emitted in the end block: the break mask is restored on loop exit.
+
+ Nucleus::setInsertBlock(loopBlock); // Subsequent instructions form the loop body.
+
+ loopRepDepth++;
+ }
+
+ void VertexProgram::SWITCH() // Opens a switch construct; only an end block is registered, there is no test block.
+ {
+ BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+ loopRepTestBlock[loopRepDepth] = nullptr; // Shares the loop/rep block stack; closed by ENDSWITCH().
+ loopRepEndBlock[loopRepDepth] = endBlock;
+
+ Int4 restoreBreak = enableBreak; // Snapshot of the break mask on entry.
+
+ BasicBlock *currentBlock = Nucleus::getInsertBlock();
+
+ Nucleus::setInsertBlock(endBlock);
+ enableBreak = restoreBreak; // Emitted in the end block: the break mask is restored when the switch is left.
+
+ Nucleus::setInsertBlock(currentBlock); // Resume emission where we were.
+
+ loopRepDepth++;
+ }
+
+ void VertexProgram::RET() // Emits a return, either outside any label or from the label currently being emitted.
+ {
+ if(currentLabel == -1) // currentLabel is set by LABEL(); -1 presumably means no label has been entered yet - confirm the initial value in the constructor.
+ {
+ returnBlock = Nucleus::createBasicBlock();
+ Nucleus::createBr(returnBlock); // The insert block is not changed here.
+ }
+ else
+ {
+ BasicBlock *unreachableBlock = Nucleus::createBasicBlock(); // Default switch target; also where emission continues after the return.
+
+ if(callRetBlock[currentLabel].size() > 1) // Pop the return destination from the call stack
+ {
+ // FIXME: Encapsulate
+ UInt index = callStack[--stackIndex]; // Call-site index pushed by CALL()/CALLNZb()/CALLNZp().
+
+ Value *value = index.loadValue();
+ SwitchCases *switchCases = Nucleus::createSwitch(value, unreachableBlock, (int)callRetBlock[currentLabel].size());
+
+ for(unsigned int i = 0; i < callRetBlock[currentLabel].size(); i++) // One case per call site of this label.
+ {
+ Nucleus::addSwitchCase(switchCases, i, callRetBlock[currentLabel][i]);
+ }
+ }
+ else if(callRetBlock[currentLabel].size() == 1) // Jump directly to the unique return destination
+ {
+ Nucleus::createBr(callRetBlock[currentLabel][0]);
+ }
+ else // Function isn't called
+ {
+ Nucleus::createBr(unreachableBlock);
+ }
+
+ Nucleus::setInsertBlock(unreachableBlock);
+ Nucleus::createUnreachable();
+ }
+ }
+
+ void VertexProgram::LEAVE() // Clears from enableLeave every lane set in the current enable mask; no branch is emitted.
+ {
+ enableLeave = enableLeave & ~enableStack[enableIndex];
+
+ // FIXME: Return from function if all instances left
+ // FIXME: Use enableLeave in other control-flow constructs
+ }
+
+ void VertexProgram::TEX(Vector4f &dst, Vector4f &src0, const Src &src1) // Samples the sampler named by src1 at coordinates src0 using the Base function.
+ {
+ dst = sampleTexture(src1, src0, (src0.x), (src0), (src0), (src0), Base); // src0 fills the lod/dsx/dsy/offset slots; presumably ignored for Base - confirm in SamplerCore.
+ }
+
+ void VertexProgram::TEXOFFSET(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &offset) // Like TEX, with a texel offset ({Base, Offset}).
+ {
+ dst = sampleTexture(src1, src0, (src0.x), (src0), (src0), offset, {Base, Offset}); // src0 fills the lod/dsx/dsy slots; presumably ignored here - confirm in SamplerCore.
+ }
+
+ void VertexProgram::TEXLOD(Vector4f &dst, Vector4f &src0, const Src& src1, Float4 &lod) // Samples with a caller-supplied level of detail (Lod function).
+ {
+ dst = sampleTexture(src1, src0, lod, (src0), (src0), (src0), Lod); // src0 fills the dsx/dsy/offset slots; presumably ignored for Lod - confirm in SamplerCore.
+ }
+
+ void VertexProgram::TEXLODOFFSET(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &offset, Float4 &lod) // Like TEXLOD, with a texel offset ({Lod, Offset}).
+ {
+ dst = sampleTexture(src1, src0, lod, (src0), (src0), offset, {Lod, Offset}); // src0 fills the dsx/dsy slots; presumably ignored here - confirm in SamplerCore.
+ }
+
+ void VertexProgram::TEXELFETCH(Vector4f &dst, Vector4f &src0, const Src& src1, Float4 &lod) // Reads from the sampler named by src1 using the Fetch function and the given lod.
+ {
+ dst = sampleTexture(src1, src0, lod, (src0), (src0), (src0), Fetch); // src0 fills the dsx/dsy/offset slots; presumably ignored for Fetch - confirm in SamplerCore.
+ }
+
+ void VertexProgram::TEXELFETCHOFFSET(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &offset, Float4 &lod) // Like TEXELFETCH, with a texel offset ({Fetch, Offset}).
+ {
+ dst = sampleTexture(src1, src0, lod, (src0), (src0), offset, {Fetch, Offset}); // src0 fills the dsx/dsy slots; presumably ignored here - confirm in SamplerCore.
+ }
+
+ void VertexProgram::TEXGRAD(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &dsx, Vector4f &dsy) // Samples with caller-supplied gradients dsx/dsy (Grad function).
+ {
+ dst = sampleTexture(src1, src0, (src0.x), dsx, dsy, src0, Grad); // src0 fills the lod/offset slots; presumably ignored for Grad - confirm in SamplerCore.
+ }
+
+ void VertexProgram::TEXGRADOFFSET(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &dsx, Vector4f &dsy, Vector4f &offset) // Like TEXGRAD, with a texel offset ({Grad, Offset}).
+ {
+ dst = sampleTexture(src1, src0, (src0.x), dsx, dsy, offset, {Grad, Offset}); // src0.x fills the lod slot; presumably ignored here - confirm in SamplerCore.
+ }
+
+ void VertexProgram::TEXSIZE(Vector4f &dst, Float4 &lod, const Src &src1) // Writes SamplerCore::textureSize() for texture src1.index at the given lod into dst.
+ {
+ Pointer<Byte> texture = data + OFFSET(DrawData, mipmap[TEXTURE_IMAGE_UNITS]) + src1.index * sizeof(Texture); // Texture entries for this stage start at mipmap[TEXTURE_IMAGE_UNITS]; src1.index selects one.
+ dst = SamplerCore::textureSize(texture, lod);
+ }
+
+ Vector4f VertexProgram::sampleTexture(const Src &s, Vector4f &uvwq, Float4 &lod, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function) // Samples the sampler described by s and applies s.swizzle to the result.
+ {
+ Vector4f tmp; // Unswizzled sample result.
+
+ if(s.type == Shader::PARAMETER_SAMPLER && s.rel.type == Shader::PARAMETER_VOID) // Sampler index known while generating code.
+ {
+ tmp = sampleTexture(s.index, uvwq, lod, dsx, dsy, offset, function);
+ }
+ else // Sampler index only known when the routine runs.
+ {
+ Int index = As<Int>(Float(fetchRegister(s).x.x)); // First lane of the register's x component, bits taken as an integer.
+
+ for(int i = 0; i < VERTEX_TEXTURE_IMAGE_UNITS; i++) // Emit one guarded sampling sequence per sampler the shader uses.
+ {
+ if(shader->usesSampler(i))
+ {
+ If(index == i)
+ {
+ tmp = sampleTexture(i, uvwq, lod, dsx, dsy, offset, function);
+ // FIXME: When the sampler states are the same, we could use one sampler and just index the texture
+ }
+ }
+ }
+ }
+
+ Vector4f c; // Swizzled result: two bits of s.swizzle per destination component.
+ c.x = tmp[(s.swizzle >> 0) & 0x3];
+ c.y = tmp[(s.swizzle >> 2) & 0x3];
+ c.z = tmp[(s.swizzle >> 4) & 0x3];
+ c.w = tmp[(s.swizzle >> 6) & 0x3];
+
+ return c;
+ }
+
+ Vector4f VertexProgram::sampleTexture(int sampler, Vector4f &uvwq, Float4 &lod, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function) // Samples the given sampler index through a SamplerCore built from state.sampler[sampler].
+ {
+ Pointer<Byte> texture = data + OFFSET(DrawData, mipmap[TEXTURE_IMAGE_UNITS]) + sampler * sizeof(Texture); // Texture entries for this stage start at mipmap[TEXTURE_IMAGE_UNITS]; 'sampler' selects one.
+ return SamplerCore(constants, state.sampler[sampler]).sampleTexture(texture, uvwq.x, uvwq.y, uvwq.z, uvwq.w, lod, dsx, dsy, offset, function);
+ }
+}
diff --git a/src/Pipeline/VertexProgram.hpp b/src/Pipeline/VertexProgram.hpp
new file mode 100644
index 0000000..3c4199c
--- /dev/null
+++ b/src/Pipeline/VertexProgram.hpp
@@ -0,0 +1,139 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_VertexProgram_hpp
+#define sw_VertexProgram_hpp
+
+#include "VertexRoutine.hpp"
+#include "ShaderCore.hpp"
+
+#include "SamplerCore.hpp"
+#include "Renderer/Stream.hpp"
+#include "Common/Types.hpp"
+
+namespace sw
+{
+ struct Stream;
+ class VertexShader;
+
+ class VertexProgram : public VertexRoutine, public ShaderCore // Vertex routine that emits code for a VertexShader's instructions (control flow, matrix ops, texture sampling).
+ {
+ public:
+ VertexProgram(const VertexProcessor::State &state, const VertexShader *vertexShader);
+
+ virtual ~VertexProgram();
+
+ private:
+ const VertexShader *const shader; // Shader being translated; queried e.g. via containsBreakInstruction() and usesSampler().
+
+ RegisterArray<NUM_TEMPORARY_REGISTERS> r; // Temporary registers
+ Vector4f a0; // Address register, read for PARAMETER_ADDR relative addressing.
+ Array<Int, 4> aL; // Loop counter register, one entry per loop nesting level.
+ Vector4f p0; // Predicate register, read by BREAKP/CALLNZp/IFp.
+
+ Array<Int, 4> increment; // Per-level step added to aL by ENDLOOP().
+ Array<Int, 4> iteration; // Per-level remaining iteration count, loaded by LOOP()/REP().
+
+ Int loopDepth; // Index into aL/increment/iteration.
+ Int stackIndex; // FIXME: Inc/decrement callStack
+ Array<UInt, 16> callStack; // Call-site indices pushed by CALL*() and popped by RET().
+
+ Int enableIndex; // Index of the current top of enableStack.
+ Array<Int4, 1 + 24> enableStack; // Nested execution masks pushed by IF/WHILE/CALLNZp.
+ Int4 enableBreak; // Lanes not yet disabled by BREAK.
+ Int4 enableContinue; // Lanes not yet disabled by CONTINUE.
+ Int4 enableLeave; // Lanes not yet disabled by LEAVE.
+
+ Int instanceID; // Read for PARAMETER_MISCTYPE / InstanceIDIndex.
+ Int4 vertexID; // Read for PARAMETER_MISCTYPE / VertexIDIndex.
+
+ typedef Shader::DestinationParameter Dst;
+ typedef Shader::SourceParameter Src;
+ typedef Shader::Control Control;
+ typedef Shader::Usage Usage;
+
+ void pipeline(UInt &index) override;
+ void program(UInt &index);
+ void passThrough();
+
+ Vector4f fetchRegister(const Src &src, unsigned int offset = 0);
+ Vector4f readConstant(const Src &src, unsigned int offset = 0);
+ RValue<Pointer<Byte>> uniformAddress(int bufferIndex, unsigned int index);
+ RValue<Pointer<Byte>> uniformAddress(int bufferIndex, unsigned int index, Int &offset);
+ Int relativeAddress(const Shader::Relative &rel, int bufferIndex = -1);
+ Int4 dynamicAddress(const Shader::Relative &rel);
+ Int4 enableMask(const Shader::Instruction *instruction);
+
+ void M3X2(Vector4f &dst, Vector4f &src0, Src &src1); // Matrix helpers: one dot product per matrix row fetched from src1.
+ void M3X3(Vector4f &dst, Vector4f &src0, Src &src1);
+ void M3X4(Vector4f &dst, Vector4f &src0, Src &src1);
+ void M4X3(Vector4f &dst, Vector4f &src0, Src &src1);
+ void M4X4(Vector4f &dst, Vector4f &src0, Src &src1);
+ void BREAK(); // Control-flow instructions follow.
+ void BREAKC(Vector4f &src0, Vector4f &src1, Control);
+ void BREAKP(const Src &predicateRegister);
+ void BREAK(Int4 &condition); // Modifies condition in place.
+ void CONTINUE();
+ void TEST();
+ void CALL(int labelIndex, int callSiteIndex);
+ void CALLNZ(int labelIndex, int callSiteIndex, const Src &src);
+ void CALLNZb(int labelIndex, int callSiteIndex, const Src &boolRegister);
+ void CALLNZp(int labelIndex, int callSiteIndex, const Src &predicateRegister);
+ void ELSE();
+ void ENDIF();
+ void ENDLOOP();
+ void ENDREP();
+ void ENDWHILE();
+ void ENDSWITCH();
+ void IF(const Src &src);
+ void IFb(const Src &boolRegister);
+ void IFp(const Src &predicateRegister);
+ void IFC(Vector4f &src0, Vector4f &src1, Control);
+ void IF(Int4 &condition); // Modifies condition in place.
+ void LABEL(int labelIndex);
+ void LOOP(const Src &integerRegister);
+ void REP(const Src &integerRegister);
+ void WHILE(const Src &temporaryRegister);
+ void SWITCH();
+ void RET();
+ void LEAVE();
+ void TEX(Vector4f &dst, Vector4f &src, const Src&); // Texture instructions: thin wrappers around sampleTexture().
+ void TEXOFFSET(Vector4f &dst, Vector4f &src, const Src&, Vector4f &offset);
+ void TEXLOD(Vector4f &dst, Vector4f &src, const Src&, Float4 &lod);
+ void TEXLODOFFSET(Vector4f &dst, Vector4f &src, const Src&, Vector4f &offset, Float4 &lod);
+ void TEXELFETCH(Vector4f &dst, Vector4f &src, const Src&, Float4 &lod);
+ void TEXELFETCHOFFSET(Vector4f &dst, Vector4f &src, const Src&, Vector4f &offset, Float4 &lod);
+ void TEXGRAD(Vector4f &dst, Vector4f &src, const Src&, Vector4f &dsx, Vector4f &dsy);
+ void TEXGRADOFFSET(Vector4f &dst, Vector4f &src, const Src&, Vector4f &dsx, Vector4f &dsy, Vector4f &offset);
+ void TEXSIZE(Vector4f &dst, Float4 &lod, const Src&);
+
+ Vector4f sampleTexture(const Src &s, Vector4f &uvwq, Float4 &lod, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function); // Resolves the sampler from s, then applies s.swizzle.
+ Vector4f sampleTexture(int sampler, Vector4f &uvwq, Float4 &lod, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function); // Samples a fixed sampler index.
+
+ int ifDepth; // Current IF nesting level; index into ifFalseBlock/isConditionalIf.
+ int loopRepDepth; // Current LOOP/REP/WHILE/SWITCH nesting level; index into loopRep*Block.
+ int currentLabel; // Label set by the most recent LABEL(); RET() compares it against -1.
+ bool whileTest; // Set by TEST(), cleared by ENDWHILE(); read by enableMask().
+
+ BasicBlock *ifFalseBlock[24 + 24]; // Per IF level: the false block, replaced by the end block in ELSE().
+ BasicBlock *loopRepTestBlock[4]; // Per loop level: block holding the loop test (nullptr for SWITCH).
+ BasicBlock *loopRepEndBlock[4]; // Per loop level: block following the construct.
+ BasicBlock *labelBlock[2048]; // Entry block per label, created on first CALL*/LABEL reference.
+ std::vector<BasicBlock*> callRetBlock[2048]; // Per label: return block for each call site.
+ BasicBlock *returnBlock; // Created by RET() when currentLabel is -1.
+ bool isConditionalIf[24 + 24]; // Per IF level: true if opened by IF(Int4&) (mask pushed), false if by IFb().
+ };
+}
+
+#endif // sw_VertexProgram_hpp
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
new file mode 100644
index 0000000..9b8d336
--- /dev/null
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -0,0 +1,788 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "VertexRoutine.hpp"
+
+#include "VertexShader.hpp"
+#include "Constants.hpp"
+#include "Renderer/Vertex.hpp"
+#include "Renderer/Renderer.hpp"
+#include "Common/Half.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+ extern bool halfIntegerCoordinates; // Pixel centers are not at integer coordinates (declared here, defined in another file)
+ extern bool symmetricNormalizedDepth; // [-1, 1] instead of [0, 1]; selects the minZ test in computeClipFlags()
+
+ VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader) // Stores the processor state and constructs the input (v) and output (o) register arrays.
+ : v(shader && shader->indirectAddressableInput), // Flag is false when no shader is given.
+ o(shader && shader->indirectAddressableOutput), // Flag is false when no shader is given.
+ state(state)
+ {
+ }
+
+ VertexRoutine::~VertexRoutine() // Nothing to release explicitly.
+ {
+ }
+
+ void VertexRoutine::generate() // Emits the routine's main loop: for each index in the batch, process it on a tag-cache miss and copy the cached vertex to the output.
+ {
+ const bool textureSampling = state.textureSampling;
+
+ Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
+ Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex); // Cached Vertex entries.
+ Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag); // Tags recording which index each cache group currently holds.
+
+ UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
+ UInt primitiveNumber = *Pointer<UInt>(task + OFFSET(VertexTask, primitiveStart)); // Only used for transform feedback below.
+ UInt indexInPrimitive = 0;
+
+ constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
+
+ Do // One iteration per entry of the batch.
+ {
+ UInt index = *Pointer<UInt>(batch);
+ UInt tagIndex = index & 0x0000003C; // Bits 2..5 of the index: used as byte offset into the tags and as the first cache slot of a group of four.
+ UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index; // FIXME: TEXLDL hack to have independent LODs, hurts performance.
+
+ If(*Pointer<UInt>(tagCache + tagIndex) != indexQ) // Cache miss: process and store under this tag.
+ {
+ *Pointer<UInt>(tagCache + tagIndex) = indexQ;
+
+ readInput(indexQ);
+ pipeline(indexQ);
+ postTransform();
+ computeClipFlags();
+
+ Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
+ writeCache(cacheLine0);
+ }
+
+ UInt cacheIndex = index & 0x0000003F; // Low six bits select the cache entry for this particular index.
+ Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
+ writeVertex(vertex, cacheLine);
+
+ if(state.transformFeedbackEnabled != 0) // Decided while generating the routine, not per vertex.
+ {
+ transformFeedback(vertex, primitiveNumber, indexInPrimitive);
+
+ indexInPrimitive++;
+ If(indexInPrimitive == 3) // Every third index starts a new primitive.
+ {
+ primitiveNumber++;
+ indexInPrimitive = 0;
+ }
+ }
+
+ vertex += sizeof(Vertex); // Advance the output vertex and the batch entry.
+ batch += sizeof(unsigned int);
+ vertexCount--;
+ }
+ Until(vertexCount == 0)
+
+ Return();
+ }
+
+ void VertexRoutine::readInput(UInt &index) // Loads every vertex input register v[i] from its stream for the given index.
+ {
+ for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
+ {
+ Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,input) + sizeof(void*) * i); // i-th entry of DrawData::input (pointer-sized elements).
+ UInt stride = *Pointer<UInt>(data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i); // i-th entry of DrawData::stride.
+
+ v[i] = readStream(input, stride, state.input[i], index);
+ }
+ }
+
+ void VertexRoutine::computeClipFlags() // Computes clipFlags from the position output register o[state.positionRegister].
+ {
+ int pos = state.positionRegister;
+
+ Int4 maxX = CmpLT(o[pos].w, o[pos].x); // w < x
+ Int4 maxY = CmpLT(o[pos].w, o[pos].y); // w < y
+ Int4 maxZ = CmpLT(o[pos].w, o[pos].z); // w < z
+ Int4 minX = CmpNLE(-o[pos].w, o[pos].x); // not (-w <= x)
+ Int4 minY = CmpNLE(-o[pos].w, o[pos].y); // not (-w <= y)
+ Int4 minZ = symmetricNormalizedDepth ? CmpNLE(-o[pos].w, o[pos].z) : CmpNLE(Float4(0.0f), o[pos].z); // Lower z bound is -w or 0 depending on the depth convention.
+
+ clipFlags = *Pointer<Int>(constants + OFFSET(Constants,maxX) + SignMask(maxX) * 4); // FIXME: Array indexing
+ clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxY) + SignMask(maxY) * 4); // Each sign mask selects a 4-byte entry of the corresponding Constants table.
+ clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxZ) + SignMask(maxZ) * 4);
+ clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minX) + SignMask(minX) * 4);
+ clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minY) + SignMask(minY) * 4);
+ clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minZ) + SignMask(minZ) * 4);
+
+ Int4 finiteX = CmpLE(Abs(o[pos].x), *Pointer<Float4>(constants + OFFSET(Constants,maxPos))); // |x| <= maxPos
+ Int4 finiteY = CmpLE(Abs(o[pos].y), *Pointer<Float4>(constants + OFFSET(Constants,maxPos))); // |y| <= maxPos
+ Int4 finiteZ = CmpLE(Abs(o[pos].z), *Pointer<Float4>(constants + OFFSET(Constants,maxPos))); // |z| <= maxPos
+
+ Int4 finiteXYZ = finiteX & finiteY & finiteZ;
+ clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + SignMask(finiteXYZ) * 4); // Same table lookup scheme, using the 'fini' table.
+
+ if(state.preTransformed)
+ {
+ clipFlags &= 0xFBFBFBFB; // Don't clip against far clip plane
+ }
+ }
+
+ Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
+ {
+ const bool textureSampling = state.textureSampling;
+
+ Vector4f v;
+
+ Pointer<Byte> source0 = buffer + index * stride;
+ Pointer<Byte> source1 = source0 + (!textureSampling ? stride : 0);
+ Pointer<Byte> source2 = source1 + (!textureSampling ? stride : 0);
+ Pointer<Byte> source3 = source2 + (!textureSampling ? stride : 0);
+
+ bool isNativeFloatAttrib = (stream.attribType == VertexShader::ATTRIBTYPE_FLOAT) || stream.normalized;
+
+ switch(stream.type)
+ {
+ case STREAMTYPE_FLOAT:
+ {
+ if(stream.count == 0)
+ {
+ // Null stream, all default components
+ }
+ else
+ {
+ if(stream.count == 1)
+ {
+ v.x.x = *Pointer<Float>(source0);
+ v.x.y = *Pointer<Float>(source1);
+ v.x.z = *Pointer<Float>(source2);
+ v.x.w = *Pointer<Float>(source3);
+ }
+ else
+ {
+ v.x = *Pointer<Float4>(source0);
+ v.y = *Pointer<Float4>(source1);
+ v.z = *Pointer<Float4>(source2);
+ v.w = *Pointer<Float4>(source3);
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+ }
+
+ switch(stream.attribType)
+ {
+ case VertexShader::ATTRIBTYPE_INT:
+ if(stream.count >= 1) v.x = As<Float4>(Int4(v.x));
+ if(stream.count >= 2) v.x = As<Float4>(Int4(v.y));
+ if(stream.count >= 3) v.x = As<Float4>(Int4(v.z));
+ if(stream.count >= 4) v.x = As<Float4>(Int4(v.w));
+ break;
+ case VertexShader::ATTRIBTYPE_UINT:
+ if(stream.count >= 1) v.x = As<Float4>(UInt4(v.x));
+ if(stream.count >= 2) v.x = As<Float4>(UInt4(v.y));
+ if(stream.count >= 3) v.x = As<Float4>(UInt4(v.z));
+ if(stream.count >= 4) v.x = As<Float4>(UInt4(v.w));
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ break;
+ case STREAMTYPE_BYTE:
+ if(isNativeFloatAttrib) // Stream: UByte, Shader attrib: Float
+ {
+ v.x = Float4(*Pointer<Byte4>(source0));
+ v.y = Float4(*Pointer<Byte4>(source1));
+ v.z = Float4(*Pointer<Byte4>(source2));
+ v.w = Float4(*Pointer<Byte4>(source3));
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+ if(stream.normalized)
+ {
+ if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+ if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+ if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+ if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+ }
+ }
+ else // Stream: UByte, Shader attrib: Int / UInt
+ {
+ v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
+ v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
+ v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
+ v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+ }
+ break;
+ case STREAMTYPE_SBYTE:
+ if(isNativeFloatAttrib) // Stream: SByte, Shader attrib: Float
+ {
+ v.x = Float4(*Pointer<SByte4>(source0));
+ v.y = Float4(*Pointer<SByte4>(source1));
+ v.z = Float4(*Pointer<SByte4>(source2));
+ v.w = Float4(*Pointer<SByte4>(source3));
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+ if(stream.normalized)
+ {
+ if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+ if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+ if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+ if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+ }
+ }
+ else // Stream: SByte, Shader attrib: Int / UInt
+ {
+ v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
+ v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
+ v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
+ v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+ }
+ break;
+ case STREAMTYPE_COLOR:
+ {
+ v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+ v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+ v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+ v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+
+ transpose4x4(v.x, v.y, v.z, v.w);
+
+ // Swap red and blue
+ Float4 t = v.x;
+ v.x = v.z;
+ v.z = t;
+ }
+ break;
+ case STREAMTYPE_SHORT:
+ if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
+ {
+ v.x = Float4(*Pointer<Short4>(source0));
+ v.y = Float4(*Pointer<Short4>(source1));
+ v.z = Float4(*Pointer<Short4>(source2));
+ v.w = Float4(*Pointer<Short4>(source3));
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+ if(stream.normalized)
+ {
+ if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+ if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+ if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+ if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+ }
+ }
+ else // Stream: Short, Shader attrib: Int/UInt, no type conversion
+ {
+ v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
+ v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
+ v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
+ v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+ }
+ break;
+ case STREAMTYPE_USHORT:
+ if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
+ {
+ v.x = Float4(*Pointer<UShort4>(source0));
+ v.y = Float4(*Pointer<UShort4>(source1));
+ v.z = Float4(*Pointer<UShort4>(source2));
+ v.w = Float4(*Pointer<UShort4>(source3));
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+ if(stream.normalized)
+ {
+ if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+ if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+ if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+ if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+ }
+ }
+ else // Stream: UShort, Shader attrib: Int/UInt, no type conversion
+ {
+ v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
+ v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
+ v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
+ v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+ }
+ break;
+ case STREAMTYPE_INT:
+ if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
+ {
+ v.x = Float4(*Pointer<Int4>(source0));
+ v.y = Float4(*Pointer<Int4>(source1));
+ v.z = Float4(*Pointer<Int4>(source2));
+ v.w = Float4(*Pointer<Int4>(source3));
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+ if(stream.normalized)
+ {
+ if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+ if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+ if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+ if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+ }
+ }
+ else // Stream: Int, Shader attrib: Int/UInt, no type conversion
+ {
+ v.x = *Pointer<Float4>(source0);
+ v.y = *Pointer<Float4>(source1);
+ v.z = *Pointer<Float4>(source2);
+ v.w = *Pointer<Float4>(source3);
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+ }
+ break;
+ case STREAMTYPE_UINT:
+ if(isNativeFloatAttrib) // Stream: UInt, Shader attrib: Float
+ {
+ v.x = Float4(*Pointer<UInt4>(source0));
+ v.y = Float4(*Pointer<UInt4>(source1));
+ v.z = Float4(*Pointer<UInt4>(source2));
+ v.w = Float4(*Pointer<UInt4>(source3));
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+ if(stream.normalized)
+ {
+ if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+ if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+ if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+ if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+ }
+ }
+ else // Stream: UInt, Shader attrib: Int/UInt, no type conversion
+ {
+ v.x = *Pointer<Float4>(source0);
+ v.y = *Pointer<Float4>(source1);
+ v.z = *Pointer<Float4>(source2);
+ v.w = *Pointer<Float4>(source3);
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+ }
+ break;
+ case STREAMTYPE_UDEC3:
+ {
+ // FIXME: Vectorize
+ {
+ Int x, y, z;
+
+ x = y = z = *Pointer<Int>(source0);
+
+ v.x.x = Float(x & 0x000003FF);
+ v.x.y = Float(y & 0x000FFC00);
+ v.x.z = Float(z & 0x3FF00000);
+ }
+
+ {
+ Int x, y, z;
+
+ x = y = z = *Pointer<Int>(source1);
+
+ v.y.x = Float(x & 0x000003FF);
+ v.y.y = Float(y & 0x000FFC00);
+ v.y.z = Float(z & 0x3FF00000);
+ }
+
+ {
+ Int x, y, z;
+
+ x = y = z = *Pointer<Int>(source2);
+
+ v.z.x = Float(x & 0x000003FF);
+ v.z.y = Float(y & 0x000FFC00);
+ v.z.z = Float(z & 0x3FF00000);
+ }
+
+ {
+ Int x, y, z;
+
+ x = y = z = *Pointer<Int>(source3);
+
+ v.w.x = Float(x & 0x000003FF);
+ v.w.y = Float(y & 0x000FFC00);
+ v.w.z = Float(z & 0x3FF00000);
+ }
+
+ transpose4x3(v.x, v.y, v.z, v.w);
+
+ v.y *= Float4(1.0f / 0x00000400);
+ v.z *= Float4(1.0f / 0x00100000);
+ }
+ break;
+ case STREAMTYPE_DEC3N:
+ {
+ // FIXME: Vectorize
+ {
+ Int x, y, z;
+
+ x = y = z = *Pointer<Int>(source0);
+
+ v.x.x = Float((x << 22) & 0xFFC00000);
+ v.x.y = Float((y << 12) & 0xFFC00000);
+ v.x.z = Float((z << 2) & 0xFFC00000);
+ }
+
+ {
+ Int x, y, z;
+
+ x = y = z = *Pointer<Int>(source1);
+
+ v.y.x = Float((x << 22) & 0xFFC00000);
+ v.y.y = Float((y << 12) & 0xFFC00000);
+ v.y.z = Float((z << 2) & 0xFFC00000);
+ }
+
+ {
+ Int x, y, z;
+
+ x = y = z = *Pointer<Int>(source2);
+
+ v.z.x = Float((x << 22) & 0xFFC00000);
+ v.z.y = Float((y << 12) & 0xFFC00000);
+ v.z.z = Float((z << 2) & 0xFFC00000);
+ }
+
+ {
+ Int x, y, z;
+
+ x = y = z = *Pointer<Int>(source3);
+
+ v.w.x = Float((x << 22) & 0xFFC00000);
+ v.w.y = Float((y << 12) & 0xFFC00000);
+ v.w.z = Float((z << 2) & 0xFFC00000);
+ }
+
+ transpose4x3(v.x, v.y, v.z, v.w);
+
+ v.x *= Float4(1.0f / 0x00400000 / 511.0f);
+ v.y *= Float4(1.0f / 0x00400000 / 511.0f);
+ v.z *= Float4(1.0f / 0x00400000 / 511.0f);
+ }
+ break;
+ case STREAMTYPE_FIXED:
+ {
+ v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
+ v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
+ v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
+ v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
+
+ transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+ }
+ break;
+ case STREAMTYPE_HALF:
+ {
+ if(stream.count >= 1)
+ {
+ UShort x0 = *Pointer<UShort>(source0 + 0);
+ UShort x1 = *Pointer<UShort>(source1 + 0);
+ UShort x2 = *Pointer<UShort>(source2 + 0);
+ UShort x3 = *Pointer<UShort>(source3 + 0);
+
+ v.x.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x0) * 4);
+ v.x.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x1) * 4);
+ v.x.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x2) * 4);
+ v.x.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x3) * 4);
+ }
+
+ if(stream.count >= 2)
+ {
+ UShort y0 = *Pointer<UShort>(source0 + 2);
+ UShort y1 = *Pointer<UShort>(source1 + 2);
+ UShort y2 = *Pointer<UShort>(source2 + 2);
+ UShort y3 = *Pointer<UShort>(source3 + 2);
+
+ v.y.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y0) * 4);
+ v.y.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y1) * 4);
+ v.y.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y2) * 4);
+ v.y.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y3) * 4);
+ }
+
+ if(stream.count >= 3)
+ {
+ UShort z0 = *Pointer<UShort>(source0 + 4);
+ UShort z1 = *Pointer<UShort>(source1 + 4);
+ UShort z2 = *Pointer<UShort>(source2 + 4);
+ UShort z3 = *Pointer<UShort>(source3 + 4);
+
+ v.z.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z0) * 4);
+ v.z.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z1) * 4);
+ v.z.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z2) * 4);
+ v.z.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z3) * 4);
+ }
+
+ if(stream.count >= 4)
+ {
+ UShort w0 = *Pointer<UShort>(source0 + 6);
+ UShort w1 = *Pointer<UShort>(source1 + 6);
+ UShort w2 = *Pointer<UShort>(source2 + 6);
+ UShort w3 = *Pointer<UShort>(source3 + 6);
+
+ v.w.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w0) * 4);
+ v.w.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w1) * 4);
+ v.w.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w2) * 4);
+ v.w.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w3) * 4);
+ }
+ }
+ break;
+ case STREAMTYPE_INDICES:
+ {
+ v.x.x = *Pointer<Float>(source0);
+ v.x.y = *Pointer<Float>(source1);
+ v.x.z = *Pointer<Float>(source2);
+ v.x.w = *Pointer<Float>(source3);
+ }
+ break;
+ case STREAMTYPE_2_10_10_10_INT:
+ {
+ Int4 src;
+ src = Insert(src, *Pointer<Int>(source0), 0);
+ src = Insert(src, *Pointer<Int>(source1), 1);
+ src = Insert(src, *Pointer<Int>(source2), 2);
+ src = Insert(src, *Pointer<Int>(source3), 3);
+
+ v.x = Float4((src << 22) >> 22);
+ v.y = Float4((src << 12) >> 22);
+ v.z = Float4((src << 02) >> 22);
+ v.w = Float4(src >> 30);
+
+ if(stream.normalized)
+ {
+ v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
+ v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
+ v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
+ v.w = Max(v.w, Float4(-1.0f));
+ }
+ }
+ break;
+ case STREAMTYPE_2_10_10_10_UINT:
+ {
+ Int4 src;
+ src = Insert(src, *Pointer<Int>(source0), 0);
+ src = Insert(src, *Pointer<Int>(source1), 1);
+ src = Insert(src, *Pointer<Int>(source2), 2);
+ src = Insert(src, *Pointer<Int>(source3), 3);
+
+ v.x = Float4(src & Int4(0x3FF));
+ v.y = Float4((src >> 10) & Int4(0x3FF));
+ v.z = Float4((src >> 20) & Int4(0x3FF));
+ v.w = Float4((src >> 30) & Int4(0x3));
+
+ if(stream.normalized)
+ {
+ v.x *= Float4(1.0f / 0x3FF);
+ v.y *= Float4(1.0f / 0x3FF);
+ v.z *= Float4(1.0f / 0x3FF);
+ v.w *= Float4(1.0f / 0x3);
+ }
+ }
+ break;
+ default:
+ ASSERT(false);
+ }
+
+ if(stream.count < 1) v.x = Float4(0.0f);
+ if(stream.count < 2) v.y = Float4(0.0f);
+ if(stream.count < 3) v.z = Float4(0.0f);
+ if(stream.count < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(0));
+
+ return v;
+ }
+
+ void VertexRoutine::postTransform() // Adjusts the position output register o[state.positionRegister] in place
+ {
+ int pos = state.positionRegister;
+
+ // Backtransform: subtract the X0/Y0 offset, divide by W/H, and scale x, y and z by 1/w; w itself becomes 1/w
+ if(state.preTransformed)
+ {
+ Float4 rhw = Float4(1.0f) / o[pos].w;
+
+ Float4 W = *Pointer<Float4>(data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f); // The *x16 DrawData fields are scaled by 1/16 before use
+ Float4 H = *Pointer<Float4>(data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
+ Float4 L = *Pointer<Float4>(data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
+ Float4 T = *Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
+
+ o[pos].x = (o[pos].x - L) / W * rhw;
+ o[pos].y = (o[pos].y - T) / H * rhw;
+ o[pos].z = o[pos].z * rhw;
+ o[pos].w = rhw;
+ }
+
+ if(!halfIntegerCoordinates && !state.preTransformed) // Shift x/y by halfPixelX/halfPixelY, pre-multiplied by w
+ {
+ o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelX)) * o[pos].w;
+ o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelY)) * o[pos].w;
+ }
+
+ if(state.superSampling) // Shift x/y by the DrawData XXXX/YYYY values, pre-multiplied by w
+ {
+ o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,XXXX)) * o[pos].w;
+ o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,YYYY)) * o[pos].w;
+ }
+ }
+
+ void VertexRoutine::writeCache(Pointer<Byte> &cacheLine) // Stores the four vertices held in o[] into four consecutive Vertex slots at cacheLine
+ {
+ Vector4f v;
+
+ for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
+ {
+ if(state.output[i].write) // Outputs with an empty write mask are not stored
+ {
+ v.x = o[i].x;
+ v.y = o[i].y;
+ v.z = o[i].z;
+ v.w = o[i].w;
+
+ if(state.output[i].xClamp) // Clamp x to [0, 1]
+ {
+ v.x = Max(v.x, Float4(0.0f));
+ v.x = Min(v.x, Float4(1.0f));
+ }
+
+ if(state.output[i].yClamp) // Clamp y to [0, 1]
+ {
+ v.y = Max(v.y, Float4(0.0f));
+ v.y = Min(v.y, Float4(1.0f));
+ }
+
+ if(state.output[i].zClamp) // Clamp z to [0, 1]
+ {
+ v.z = Max(v.z, Float4(0.0f));
+ v.z = Min(v.z, Float4(1.0f));
+ }
+
+ if(state.output[i].wClamp) // Clamp w to [0, 1]
+ {
+ v.w = Max(v.w, Float4(0.0f));
+ v.w = Min(v.w, Float4(1.0f));
+ }
+
+ if(state.output[i].write == 0x01) // Only x is written: store a single float per vertex, no transpose needed
+ {
+ *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
+ *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
+ *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
+ *Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
+ }
+ else
+ {
+ if(state.output[i].write == 0x03) // Only x and y are written
+ {
+ transpose2x4(v.x, v.y, v.z, v.w);
+ }
+ else
+ {
+ transpose4x4(v.x, v.y, v.z, v.w);
+ }
+
+ *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x; // After the transpose, v.x/y/z/w each hold one vertex
+ *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
+ *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
+ *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
+ }
+ }
+ }
+
+ *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0) & 0x0000000FF; // clipFlags packs one byte per vertex
+ *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8) & 0x0000000FF;
+ *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
+ *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
+
+ // Viewport transform of the position register; the result is stored starting at Vertex::X
+ int pos = state.positionRegister;
+
+ v.x = o[pos].x;
+ v.y = o[pos].y;
+ v.z = o[pos].z;
+ v.w = o[pos].w;
+
+ if(symmetricNormalizedDepth)
+ {
+ v.z = (v.z + v.w) * Float4(0.5f); // [-1, 1] -> [0, 1]
+ }
+
+ Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f)))); // Lanes where w == 0 get the bits of 1.0f OR'ed in, so the reciprocal stays finite
+ Float4 rhw = Float4(1.0f) / w;
+
+ v.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16)))); // x and y are rounded to integers and stored as raw bits
+ v.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
+ v.z = v.z * rhw;
+ v.w = rhw;
+
+ transpose4x4(v.x, v.y, v.z, v.w);
+
+ *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
+ *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
+ *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
+ *Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
+ }
+
+ void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache) // Copies one Vertex from the cache entry 'cache' to 'vertex'
+ {
+ for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
+ {
+ if(state.output[i].write) // Only outputs with a non-empty write mask are copied
+ {
+ *Pointer<Int4>(vertex + OFFSET(Vertex,v[i]), 16) = *Pointer<Int4>(cache + OFFSET(Vertex,v[i]), 16); // Always copies a full 16-byte register
+ }
+ }
+
+ *Pointer<Int4>(vertex + OFFSET(Vertex,X)) = *Pointer<Int4>(cache + OFFSET(Vertex,X)); // 16 bytes starting at Vertex::X (written by writeCache)
+ *Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
+ }
+
+ void VertexRoutine::transformFeedback(const Pointer<Byte> &vertex, const UInt &primitiveNumber, const UInt &indexInPrimitive) // Copies enabled vertex outputs into the transform feedback buffers
+ {
+ If(indexInPrimitive < state.verticesPerPrimitive) // Nothing is written for indices at or beyond the per-primitive vertex count
+ {
+ UInt tOffset = primitiveNumber * state.verticesPerPrimitive + indexInPrimitive; // Index of this vertex in the output stream
+
+ for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
+ {
+ if(state.transformFeedbackEnabled & (1ULL << i)) // Bit i enables feedback entry i
+ {
+ UInt reg = *Pointer<UInt>(data + OFFSET(DrawData, vs.reg[i])); // Source offset within Vertex::v, in floats
+ UInt row = *Pointer<UInt>(data + OFFSET(DrawData, vs.row[i])); // Number of rows to copy
+ UInt col = *Pointer<UInt>(data + OFFSET(DrawData, vs.col[i])); // Number of floats per row
+ UInt str = *Pointer<UInt>(data + OFFSET(DrawData, vs.str[i])); // Destination stride per vertex, in floats
+
+ Pointer<Byte> t = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, vs.t[i])) + (tOffset * str * sizeof(float));
+ Pointer<Byte> v = vertex + OFFSET(Vertex, v) + reg * sizeof(float);
+
+ For(UInt r = 0, r < row, r++)
+ {
+ UInt rOffsetX = r * col * sizeof(float); // Destination rows are tightly packed (col floats each)
+ UInt rOffset4 = r * sizeof(float4); // Source rows are one float4 apart
+
+ For(UInt c = 0, c < col, c++)
+ {
+ UInt cOffset = c * sizeof(float);
+ *Pointer<Float>(t + rOffsetX + cOffset) = *Pointer<Float>(v + rOffset4 + cOffset);
+ }
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/src/Pipeline/VertexRoutine.hpp b/src/Pipeline/VertexRoutine.hpp
new file mode 100644
index 0000000..905118b
--- /dev/null
+++ b/src/Pipeline/VertexRoutine.hpp
@@ -0,0 +1,71 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_VertexRoutine_hpp
+#define sw_VertexRoutine_hpp
+
+#include "Renderer/Color.hpp"
+#include "Renderer/VertexProcessor.hpp"
+#include "ShaderCore.hpp"
+#include "VertexShader.hpp"
+
+namespace sw
+{
+ class VertexRoutinePrototype : public Function<Void(Pointer<Byte>, Pointer<Byte>, Pointer<Byte>, Pointer<Byte>)> // Binds the four routine arguments to named members
+ {
+ public:
+ VertexRoutinePrototype() : vertex(Arg<0>()), batch(Arg<1>()), task(Arg<2>()), data(Arg<3>()) {}
+ virtual ~VertexRoutinePrototype() {}
+
+ protected:
+ Pointer<Byte> vertex; // Argument 0
+ Pointer<Byte> batch; // Argument 1
+ Pointer<Byte> task; // Argument 2
+ Pointer<Byte> data; // Argument 3
+ };
+
+ class VertexRoutine : public VertexRoutinePrototype // Base for vertex routines; subclasses supply pipeline()
+ {
+ public:
+ VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader);
+ virtual ~VertexRoutine();
+
+ void generate();
+
+ protected:
+ Pointer<Byte> constants;
+
+ Int clipFlags; // One byte of clip flags per vertex, four vertices packed (see writeCache)
+
+ RegisterArray<MAX_VERTEX_INPUTS> v; // Input registers
+ RegisterArray<MAX_VERTEX_OUTPUTS> o; // Output registers
+
+ const VertexProcessor::State &state; // Held by reference: the State must outlive this object
+
+ private:
+ virtual void pipeline(UInt &index) = 0; // Implemented by derived classes
+
+ typedef VertexProcessor::State::Input Stream;
+
+ Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index);
+ void readInput(UInt &index);
+ void computeClipFlags();
+ void postTransform(); // Adjusts the position output register in place
+ void writeCache(Pointer<Byte> &cacheLine); // Stores four vertices into the cache line
+ void writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheLine); // Copies one cached vertex to 'vertex'
+ void transformFeedback(const Pointer<Byte> &vertex, const UInt &primitiveNumber, const UInt &indexInPrimitive);
+ };
+}
+
+#endif // sw_VertexRoutine_hpp
diff --git a/src/Pipeline/VertexShader.cpp b/src/Pipeline/VertexShader.cpp
new file mode 100644
index 0000000..8f1c4f8
--- /dev/null
+++ b/src/Pipeline/VertexShader.cpp
@@ -0,0 +1,330 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "VertexShader.hpp"
+
+#include "Renderer/Vertex.hpp"
+#include "Common/Debug.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+ VertexShader::VertexShader(const VertexShader *vs) : Shader() // Creates an empty shader, or a copy of *vs when vs is non-null
+ {
+ shaderModel = 0x0300; // NOTE(review): not taken from vs when copying - confirm this is intended
+ positionRegister = Pos;
+ pointSizeRegister = Unused;
+ instanceIdDeclared = false;
+ vertexIdDeclared = false;
+ textureSampling = false;
+
+ for(int i = 0; i < MAX_VERTEX_INPUTS; i++) // Defaults; overwritten below when copying
+ {
+ input[i] = Semantic();
+ attribType[i] = ATTRIBTYPE_FLOAT;
+ }
+
+ if(vs) // Make a copy
+ {
+ for(size_t i = 0; i < vs->getLength(); i++)
+ {
+ append(new sw::Shader::Instruction(*vs->getInstruction(i))); // Each instruction is cloned, not shared
+ }
+
+ memcpy(output, vs->output, sizeof(output));
+ memcpy(input, vs->input, sizeof(input));
+ memcpy(attribType, vs->attribType, sizeof(attribType));
+ positionRegister = vs->positionRegister;
+ pointSizeRegister = vs->pointSizeRegister;
+ instanceIdDeclared = vs->instanceIdDeclared;
+ vertexIdDeclared = vs->vertexIdDeclared;
+ usedSamplers = vs->usedSamplers;
+
+ optimize();
+ analyze(); // Recomputes textureSampling, among others, from the copied instructions
+ }
+ }
+
+ VertexShader::VertexShader(const unsigned long *token) : Shader() // Builds the shader from a token stream
+ {
+ parse(token); // Runs before the members below are reset - presumably parse() does not set them; TODO confirm
+
+ positionRegister = Pos;
+ pointSizeRegister = Unused;
+ instanceIdDeclared = false;
+ vertexIdDeclared = false;
+ textureSampling = false;
+
+ for(int i = 0; i < MAX_VERTEX_INPUTS; i++) // Defaults; analyzeInput() overrides the declared inputs
+ {
+ input[i] = Semantic();
+ attribType[i] = ATTRIBTYPE_FLOAT;
+ }
+
+ optimize();
+ analyze();
+ }
+
+ VertexShader::~VertexShader() // Intentionally empty
+ {
+ }
+
+ int VertexShader::validate(const unsigned long *const token) // Returns 0 when the stream is rejected, otherwise the instruction count (counting starts at 1)
+ {
+ if(!token)
+ {
+ return 0;
+ }
+
+ unsigned short version = (unsigned short)(token[0] & 0x0000FFFF); // Low 16 bits of the first token
+ unsigned char majorVersion = (unsigned char)((token[0] & 0x0000FF00) >> 8);
+ ShaderType shaderType = (ShaderType)((token[0] & 0xFFFF0000) >> 16); // High 16 bits of the first token
+
+ if(shaderType != SHADER_VERTEX || majorVersion > 3) // Not a vertex shader, or a newer major version than 3
+ {
+ return 0;
+ }
+
+ int instructionCount = 1;
+
+ for(int i = 0; token[i] != 0x0000FFFF; i++) // Stops at the 0x0000FFFF token; NOTE(review): the walk starts at token[0], the version token - confirm
+ {
+ if((token[i] & 0x0000FFFF) == 0x0000FFFE) // Comment token
+ {
+ int length = (token[i] & 0x7FFF0000) >> 16; // Number of tokens to skip
+
+ i += length;
+ }
+ else
+ {
+ Shader::Opcode opcode = (Shader::Opcode)(token[i] & 0x0000FFFF);
+
+ switch(opcode)
+ {
+ case Shader::OPCODE_TEXCOORD:
+ case Shader::OPCODE_TEXKILL:
+ case Shader::OPCODE_TEX:
+ case Shader::OPCODE_TEXBEM:
+ case Shader::OPCODE_TEXBEML:
+ case Shader::OPCODE_TEXREG2AR:
+ case Shader::OPCODE_TEXREG2GB:
+ case Shader::OPCODE_TEXM3X2PAD:
+ case Shader::OPCODE_TEXM3X2TEX:
+ case Shader::OPCODE_TEXM3X3PAD:
+ case Shader::OPCODE_TEXM3X3TEX:
+ case Shader::OPCODE_RESERVED0:
+ case Shader::OPCODE_TEXM3X3SPEC:
+ case Shader::OPCODE_TEXM3X3VSPEC:
+ case Shader::OPCODE_TEXREG2RGB:
+ case Shader::OPCODE_TEXDP3TEX:
+ case Shader::OPCODE_TEXM3X2DEPTH:
+ case Shader::OPCODE_TEXDP3:
+ case Shader::OPCODE_TEXM3X3:
+ case Shader::OPCODE_TEXDEPTH:
+ case Shader::OPCODE_CMP0:
+ case Shader::OPCODE_BEM:
+ case Shader::OPCODE_DP2ADD:
+ case Shader::OPCODE_DFDX:
+ case Shader::OPCODE_DFDY:
+ case Shader::OPCODE_TEXLDD:
+ return 0; // Unsupported operation
+ default:
+ instructionCount++;
+ break;
+ }
+
+ i += size(token[i], version); // Skip this instruction's remaining tokens; the loop's i++ steps past the opcode itself
+ }
+ }
+
+ return instructionCount;
+ }
+
+ bool VertexShader::containsTextureSampling() const // Reports the flag computed by analyzeTextureSampling()
+ {
+ return textureSampling;
+ }
+
+ void VertexShader::setInput(int inputIdx, const sw::Shader::Semantic& semantic, AttribType aType) // inputIdx is not range-checked; must be < MAX_VERTEX_INPUTS
+ {
+ input[inputIdx] = semantic;
+ attribType[inputIdx] = aType;
+ }
+
+ void VertexShader::setOutput(int outputIdx, int nbComponents, const sw::Shader::Semantic& semantic) // Assigns 'semantic' to the first nbComponents components of an output
+ {
+ for(int i = 0; i < nbComponents; ++i) // No range checks: output is [MAX_VERTEX_OUTPUTS][4]
+ {
+ output[outputIdx][i] = semantic;
+ }
+ }
+
+ void VertexShader::setPositionRegister(int posReg) // Marks all four components of output posReg as position and records it
+ {
+ setOutput(posReg, 4, sw::Shader::Semantic(sw::Shader::USAGE_POSITION, 0));
+ positionRegister = posReg;
+ }
+
+ void VertexShader::setPointSizeRegister(int ptSizeReg) // Marks all four components of output ptSizeReg as point size and records it
+ {
+ setOutput(ptSizeReg, 4, sw::Shader::Semantic(sw::Shader::USAGE_PSIZE, 0));
+ pointSizeRegister = ptSizeReg;
+ }
+
+ const sw::Shader::Semantic& VertexShader::getInput(int inputIdx) const // inputIdx is not range-checked
+ {
+ return input[inputIdx];
+ }
+
+ VertexShader::AttribType VertexShader::getAttribType(int inputIdx) const // inputIdx is not range-checked
+ {
+ return attribType[inputIdx];
+ }
+
+ const sw::Shader::Semantic& VertexShader::getOutput(int outputIdx, int component) const // Neither index is range-checked
+ {
+ return output[outputIdx][component];
+ }
+
+ void VertexShader::analyze() // Runs every analysis pass; called by the constructors after optimize()
+ {
+ analyzeInput();
+ analyzeOutput();
+ analyzeDirtyConstants();
+ analyzeTextureSampling();
+ analyzeDynamicBranching();
+ analyzeSamplers();
+ analyzeCallSites();
+ analyzeIndirectAddressing();
+ }
+
+ void VertexShader::analyzeInput() // Records the semantic of every declared input register
+ {
+ for(const auto &inst : instruction)
+ {
+ if(inst->opcode == Shader::OPCODE_DCL &&
+ inst->dst.type == Shader::PARAMETER_INPUT)
+ {
+ int index = inst->dst.index;
+
+ input[index] = Semantic(inst->usage, inst->usageIndex);
+ }
+ }
+ }
+
+ void VertexShader::analyzeOutput() // Fills output[][] (and the position/point size registers) from the instructions
+ {
+ if(shaderModel < 0x0300) // Before Shader Model 3.0: semantics are implied by the destination register type
+ {
+ output[Pos][0] = Semantic(Shader::USAGE_POSITION, 0);
+ output[Pos][1] = Semantic(Shader::USAGE_POSITION, 0);
+ output[Pos][2] = Semantic(Shader::USAGE_POSITION, 0);
+ output[Pos][3] = Semantic(Shader::USAGE_POSITION, 0);
+
+ for(const auto &inst : instruction)
+ {
+ const DestinationParameter &dst = inst->dst;
+
+ switch(dst.type)
+ {
+ case Shader::PARAMETER_RASTOUT:
+ switch(dst.index)
+ {
+ case 0:
+ // Position already assumed written
+ break;
+ case 1:
+ output[Fog][0] = Semantic(Shader::USAGE_FOG, 0);
+ break;
+ case 2:
+ output[Pts][1] = Semantic(Shader::USAGE_PSIZE, 0); // Point size uses component 1 of the Pts register
+ pointSizeRegister = Pts;
+ break;
+ default: ASSERT(false);
+ }
+ break;
+ case Shader::PARAMETER_ATTROUT: // Color outputs; only indices 0 and 1 are accepted
+ if(dst.index == 0)
+ {
+ if(dst.x) output[C0][0] = Semantic(Shader::USAGE_COLOR, 0);
+ if(dst.y) output[C0][1] = Semantic(Shader::USAGE_COLOR, 0);
+ if(dst.z) output[C0][2] = Semantic(Shader::USAGE_COLOR, 0);
+ if(dst.w) output[C0][3] = Semantic(Shader::USAGE_COLOR, 0);
+ }
+ else if(dst.index == 1)
+ {
+ if(dst.x) output[C1][0] = Semantic(Shader::USAGE_COLOR, 1);
+ if(dst.y) output[C1][1] = Semantic(Shader::USAGE_COLOR, 1);
+ if(dst.z) output[C1][2] = Semantic(Shader::USAGE_COLOR, 1);
+ if(dst.w) output[C1][3] = Semantic(Shader::USAGE_COLOR, 1);
+ }
+ else ASSERT(false);
+ break;
+ case Shader::PARAMETER_TEXCRDOUT: // Texture coordinate outputs, placed at T0 + index
+ if(dst.x) output[T0 + dst.index][0] = Semantic(Shader::USAGE_TEXCOORD, dst.index);
+ if(dst.y) output[T0 + dst.index][1] = Semantic(Shader::USAGE_TEXCOORD, dst.index);
+ if(dst.z) output[T0 + dst.index][2] = Semantic(Shader::USAGE_TEXCOORD, dst.index);
+ if(dst.w) output[T0 + dst.index][3] = Semantic(Shader::USAGE_TEXCOORD, dst.index);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ else // Shader Model 3.0 output declaration
+ {
+ for(const auto &inst : instruction)
+ {
+ if(inst->opcode == Shader::OPCODE_DCL &&
+ inst->dst.type == Shader::PARAMETER_OUTPUT)
+ {
+ unsigned char usage = inst->usage;
+ unsigned char usageIndex = inst->usageIndex;
+
+ const DestinationParameter &dst = inst->dst;
+
+ if(dst.x) output[dst.index][0] = Semantic(usage, usageIndex); // Only the components in the declaration's write mask are tagged
+ if(dst.y) output[dst.index][1] = Semantic(usage, usageIndex);
+ if(dst.z) output[dst.index][2] = Semantic(usage, usageIndex);
+ if(dst.w) output[dst.index][3] = Semantic(usage, usageIndex);
+
+ if(usage == Shader::USAGE_POSITION && usageIndex == 0)
+ {
+ positionRegister = dst.index;
+ }
+
+ if(usage == Shader::USAGE_PSIZE && usageIndex == 0)
+ {
+ pointSizeRegister = dst.index;
+ }
+ }
+ }
+ }
+ }
+
+ void VertexShader::analyzeTextureSampling() // Sets textureSampling when any instruction has a sampler as its second source
+ {
+ textureSampling = false;
+
+ for(const auto &inst : instruction)
+ {
+ if(inst->src[1].type == PARAMETER_SAMPLER) // Only src[1] is inspected
+ {
+ textureSampling = true;
+ break;
+ }
+ }
+ }
+}
diff --git a/src/Pipeline/VertexShader.hpp b/src/Pipeline/VertexShader.hpp
new file mode 100644
index 0000000..9a9a0a6
--- /dev/null
+++ b/src/Pipeline/VertexShader.hpp
@@ -0,0 +1,78 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_VertexShader_hpp
+#define sw_VertexShader_hpp
+
+#include "Shader.hpp"
+#include "Main/Config.hpp"
+
+namespace sw
+{
+ class VertexShader : public Shader // Vertex shader with per-register input/output semantics
+ {
+ public:
+ enum AttribType : unsigned char // Data type a shader input attribute is declared with
+ {
+ ATTRIBTYPE_FLOAT,
+ ATTRIBTYPE_INT,
+ ATTRIBTYPE_UINT,
+
+ ATTRIBTYPE_LAST = ATTRIBTYPE_UINT
+ };
+
+ explicit VertexShader(const VertexShader *vs = nullptr); // Empty shader, or a copy of *vs when non-null
+ explicit VertexShader(const unsigned long *token); // Built from a token stream
+
+ virtual ~VertexShader();
+
+ static int validate(const unsigned long *const token); // Returns number of instructions if valid
+ bool containsTextureSampling() const;
+
+ void setInput(int inputIdx, const Semantic& semantic, AttribType attribType = ATTRIBTYPE_FLOAT);
+ void setOutput(int outputIdx, int nbComponents, const Semantic& semantic);
+ void setPositionRegister(int posReg);
+ void setPointSizeRegister(int ptSizeReg);
+ void declareInstanceId() { instanceIdDeclared = true; }
+ void declareVertexId() { vertexIdDeclared = true; }
+
+ const Semantic& getInput(int inputIdx) const;
+ const Semantic& getOutput(int outputIdx, int component) const;
+ AttribType getAttribType(int inputIndex) const;
+ int getPositionRegister() const { return positionRegister; }
+ int getPointSizeRegister() const { return pointSizeRegister; }
+ bool isInstanceIdDeclared() const { return instanceIdDeclared; }
+ bool isVertexIdDeclared() const { return vertexIdDeclared; }
+
+ private:
+ void analyze();
+ void analyzeInput();
+ void analyzeOutput();
+ void analyzeTextureSampling();
+
+ Semantic input[MAX_VERTEX_INPUTS]; // Semantic of each input register
+ Semantic output[MAX_VERTEX_OUTPUTS][4]; // Semantic of each output register component
+
+ AttribType attribType[MAX_VERTEX_INPUTS]; // Declared type of each input register
+
+ int positionRegister; // Index of the output register holding the position
+ int pointSizeRegister; // Index of the output register holding the point size
+
+ bool instanceIdDeclared;
+ bool vertexIdDeclared;
+ bool textureSampling; // Result of analyzeTextureSampling()
+ };
+}
+
+#endif // sw_VertexShader_hpp
diff --git a/src/System/CPUID.cpp b/src/System/CPUID.cpp
new file mode 100644
index 0000000..c080034
--- /dev/null
+++ b/src/System/CPUID.cpp
@@ -0,0 +1,301 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "CPUID.hpp"
+
+#if defined(_WIN32)
+ #ifndef WIN32_LEAN_AND_MEAN
+ #define WIN32_LEAN_AND_MEAN
+ #endif
+ #include <windows.h>
+ #include <intrin.h>
+ #include <float.h>
+#else
+ #include <unistd.h>
+ #include <sched.h>
+ #include <sys/types.h>
+#endif
+
+namespace sw
+{
+ bool CPUID::MMX = detectMMX(); // Detected capabilities: each is initialized by its detect function
+ bool CPUID::CMOV = detectCMOV();
+ bool CPUID::SSE = detectSSE();
+ bool CPUID::SSE2 = detectSSE2();
+ bool CPUID::SSE3 = detectSSE3();
+ bool CPUID::SSSE3 = detectSSSE3();
+ bool CPUID::SSE4_1 = detectSSE4_1();
+ int CPUID::cores = detectCoreCount();
+ int CPUID::affinity = detectAffinity();
+
+ bool CPUID::enableMMX = true; // Enable switches: all on by default, changed through the setEnable* functions
+ bool CPUID::enableCMOV = true;
+ bool CPUID::enableSSE = true;
+ bool CPUID::enableSSE2 = true;
+ bool CPUID::enableSSE3 = true;
+ bool CPUID::enableSSSE3 = true;
+ bool CPUID::enableSSE4_1 = true;
+
+ void CPUID::setEnableMMX(bool enable) // Switches MMX use on or off
+ {
+ enableMMX = enable;
+
+ if(!enable) // Switching MMX off also switches off every SSE level
+ {
+ enableSSE4_1 = false;
+ enableSSSE3 = false;
+ enableSSE3 = false;
+ enableSSE2 = false;
+ enableSSE = false;
+ }
+ }
+
+ void CPUID::setEnableCMOV(bool enable) // Switches CMOV use on or off; switching it off also switches off every SSE level
+ {
+ enableCMOV = enable;
+
+ if(!enableCMOV) // Test the requested setting (as setEnableMMX does), not the detected CMOV capability
+ {
+ enableSSE = false;
+ enableSSE2 = false;
+ enableSSE3 = false;
+ enableSSSE3 = false;
+ enableSSE4_1 = false;
+ }
+ }
+
+ void CPUID::setEnableSSE(bool enable) // Switches SSE use on or off
+ {
+ enableSSE = enable;
+
+ if(!enable) // Off: every higher SSE level goes off too
+ {
+ enableSSE4_1 = false;
+ enableSSSE3 = false;
+ enableSSE3 = false;
+ enableSSE2 = false;
+ }
+ else // On: MMX and CMOV are switched on as well
+ {
+ enableCMOV = true;
+ enableMMX = true;
+ }
+ }
+
+ void CPUID::setEnableSSE2(bool enable) // Switches SSE2 use on or off
+ {
+ enableSSE2 = enable;
+
+ if(!enable) // Off: every higher SSE level goes off too
+ {
+ enableSSE4_1 = false;
+ enableSSSE3 = false;
+ enableSSE3 = false;
+ }
+ else // On: every lower level is switched on as well
+ {
+ enableSSE = true;
+ enableCMOV = true;
+ enableMMX = true;
+ }
+ }
+
+ void CPUID::setEnableSSE3(bool enable) // Switches SSE3 use on or off
+ {
+ enableSSE3 = enable;
+
+ if(!enable) // Off: every higher SSE level goes off too
+ {
+ enableSSE4_1 = false;
+ enableSSSE3 = false;
+ }
+ else // On: every lower level is switched on as well
+ {
+ enableSSE2 = true;
+ enableSSE = true;
+ enableCMOV = true;
+ enableMMX = true;
+ }
+ }
+
+ void CPUID::setEnableSSSE3(bool enable) // Switches SSSE3 use on or off
+ {
+ enableSSSE3 = enable;
+
+ if(!enable) // Off: SSE4.1 goes off too
+ {
+ enableSSE4_1 = false;
+ }
+ else // On: every lower level is switched on as well
+ {
+ enableSSE3 = true;
+ enableSSE2 = true;
+ enableSSE = true;
+ enableCMOV = true;
+ enableMMX = true;
+ }
+ }
+
+ void CPUID::setEnableSSE4_1(bool enable) // Switches SSE4.1 use on or off
+ {
+ enableSSE4_1 = enable;
+
+ if(enable) // On: every lower level is switched on as well; off has no further effect
+ {
+ enableSSSE3 = true;
+ enableSSE3 = true;
+ enableSSE2 = true;
+ enableSSE = true;
+ enableCMOV = true;
+ enableMMX = true;
+ }
+ }
+
+ static void cpuid(int registers[4], int info) // Executes CPUID for leaf 'info'; registers receives EAX, EBX, ECX, EDX in that order
+ {
+ #if defined(__i386__) || defined(__x86_64__)
+ #if defined(_WIN32)
+ __cpuid(registers, info); // Compiler intrinsic from <intrin.h>
+ #else
+ __asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
+ #endif
+ #else
+ registers[0] = 0; // Not x86: report all zeros, so every feature bit reads as absent
+ registers[1] = 0;
+ registers[2] = 0;
+ registers[3] = 0;
+ #endif
+ }
+
+ bool CPUID::detectMMX()
+ {
+ int info[4]; // EAX, EBX, ECX, EDX
+ cpuid(info, 1);
+ return MMX = (info[3] & (1 << 23)) != 0; // Leaf 1, EDX bit 23
+ }
+
+ bool CPUID::detectCMOV()
+ {
+ int info[4]; // EAX, EBX, ECX, EDX
+ cpuid(info, 1);
+ return CMOV = (info[3] & (1 << 15)) != 0; // Leaf 1, EDX bit 15
+ }
+
+ bool CPUID::detectSSE()
+ {
+ int info[4]; // EAX, EBX, ECX, EDX
+ cpuid(info, 1);
+ return SSE = (info[3] & (1 << 25)) != 0; // Leaf 1, EDX bit 25
+ }
+
+ bool CPUID::detectSSE2()
+ {
+ int info[4]; // EAX, EBX, ECX, EDX
+ cpuid(info, 1);
+ return SSE2 = (info[3] & (1 << 26)) != 0; // Leaf 1, EDX bit 26
+ }
+
+ bool CPUID::detectSSE3()
+ {
+ int info[4]; // EAX, EBX, ECX, EDX
+ cpuid(info, 1);
+ return SSE3 = (info[2] & (1 << 0)) != 0; // Leaf 1, ECX bit 0
+ }
+
+ bool CPUID::detectSSSE3()
+ {
+ int info[4]; // EAX, EBX, ECX, EDX
+ cpuid(info, 1);
+ return SSSE3 = (info[2] & (1 << 9)) != 0; // Leaf 1, ECX bit 9
+ }
+
+ bool CPUID::detectSSE4_1()
+ {
+ int info[4]; // EAX, EBX, ECX, EDX
+ cpuid(info, 1);
+ return SSE4_1 = (info[2] & (1 << 19)) != 0; // Leaf 1, ECX bit 19
+ }
+
+ int CPUID::detectCoreCount() // Returns the number of processors, clamped to [1, 16]
+ {
+ int cores = 0;
+
+ #if defined(_WIN32)
+ DWORD_PTR processAffinityMask = 1; // Both masks default to one processor; the call's return value is not checked
+ DWORD_PTR systemAffinityMask = 1;
+
+ GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask);
+
+ while(systemAffinityMask) // Count the set bits of the system mask
+ {
+ if(systemAffinityMask & 1)
+ {
+ cores++;
+ }
+
+ systemAffinityMask >>= 1;
+ }
+ #else
+ cores = sysconf(_SC_NPROCESSORS_ONLN); // A negative result is raised to 1 by the clamp below
+ #endif
+
+ if(cores < 1) cores = 1;
+ if(cores > 16) cores = 16;
+
+ return cores; // FIXME: Number of physical cores
+ }
+
+ int CPUID::detectAffinity() // Returns the number of processors this process may run on, clamped to [1, 16]
+ {
+ int cores = 0;
+
+ #if defined(_WIN32)
+ DWORD_PTR processAffinityMask = 1; // Both masks default to one processor; the call's return value is not checked
+ DWORD_PTR systemAffinityMask = 1;
+
+ GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask);
+
+ while(processAffinityMask) // Count the set bits of the process mask
+ {
+ if(processAffinityMask & 1)
+ {
+ cores++;
+ }
+
+ processAffinityMask >>= 1;
+ }
+ #else
+ cores = detectCoreCount(); // FIXME: Assumes no affinity limitation
+ #endif
+
+ if(cores < 1) cores = 1; // No-op for the detectCoreCount() path, which is already clamped
+ if(cores > 16) cores = 16;
+
+ return cores;
+ }
+
+ void CPUID::setFlushToZero(bool enable) // Only has an effect when built with MSVC
+ {
+ #if defined(_MSC_VER)
+ _controlfp(enable ? _DN_FLUSH : _DN_SAVE, _MCW_DN); // Sets the denormal control field of the floating-point control word
+ #else
+ // Unimplemented: 'enable' is ignored
+ #endif
+ }
+
+ void CPUID::setDenormalsAreZero(bool enable) // Currently a no-op on every platform
+ {
+ // Unimplemented: 'enable' is ignored
+ }
+}
diff --git a/src/System/CPUID.hpp b/src/System/CPUID.hpp
new file mode 100644
index 0000000..3c21cd7
--- /dev/null
+++ b/src/System/CPUID.hpp
@@ -0,0 +1,137 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_CPUID_hpp
+#define sw_CPUID_hpp
+
+namespace sw
+{
+ #if !defined(__i386__) && defined(_M_IX86)
+ #define __i386__ 1
+ #endif
+
+ #if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
+ #define __x86_64__ 1
+ #endif
+
+ class CPUID // All-static holder of CPU capability flags, core counts and floating-point mode switches
+ {
+ public:
+ static bool supportsMMX(); // Each supportsX() is true only when flag X and its enableX switch are both set
+ static bool supportsCMOV();
+ static bool supportsMMX2(); // MMX instructions added by SSE: pshufw, pmulhuw, pmovmskb, pavgw/b, pextrw, pinsrw, pmaxsw/ub, etc.
+ static bool supportsSSE();
+ static bool supportsSSE2();
+ static bool supportsSSE3();
+ static bool supportsSSSE3();
+ static bool supportsSSE4_1();
+ static int coreCount(); // Returns the static 'cores' value (presumably filled from detectCoreCount() — confirm in CPUID.cpp)
+ static int processAffinity(); // Returns the static 'affinity' value (presumably filled from detectAffinity() — confirm in CPUID.cpp)
+
+ static void setEnableMMX(bool enable); // setEnableX(): defined in CPUID.cpp; presumably updates the matching enableX switch — confirm there
+ static void setEnableCMOV(bool enable);
+ static void setEnableSSE(bool enable);
+ static void setEnableSSE2(bool enable);
+ static void setEnableSSE3(bool enable);
+ static void setEnableSSSE3(bool enable);
+ static void setEnableSSE4_1(bool enable);
+
+ static void setFlushToZero(bool enable); // Denormal results are written as zero (only implemented for MSVC builds)
+ static void setDenormalsAreZero(bool enable); // Denormal inputs are read as zero (currently unimplemented)
+
+ private:
+ static bool MMX; // Capability flags read by the supportsX() accessors
+ static bool CMOV;
+ static bool SSE;
+ static bool SSE2;
+ static bool SSE3;
+ static bool SSSE3;
+ static bool SSE4_1;
+ static int cores; // Value handed out by coreCount()
+ static int affinity; // Value handed out by processAffinity()
+
+ static bool enableMMX; // Switches ANDed with the capability flags in the supportsX() accessors
+ static bool enableCMOV;
+ static bool enableSSE;
+ static bool enableSSE2;
+ static bool enableSSE3;
+ static bool enableSSSE3;
+ static bool enableSSE4_1;
+
+ static bool detectMMX(); // detectX(): defined in CPUID.cpp
+ static bool detectCMOV();
+ static bool detectSSE();
+ static bool detectSSE2(); // The SSE2..SSE4_1 detectors store their result in the flag of the same name and return it
+ static bool detectSSE3();
+ static bool detectSSSE3();
+ static bool detectSSE4_1();
+ static int detectCoreCount(); // Processor count clamped to [1, 16]
+ static int detectAffinity(); // Affinity-limited processor count clamped to [1, 16]
+ };
+}
+
+namespace sw
+{
+ inline bool CPUID::supportsMMX()
+ {
+ return enableMMX && MMX; // Needs both the enable switch and the capability flag
+ }
+
+ inline bool CPUID::supportsCMOV()
+ {
+ return enableCMOV && CMOV; // Needs both the enable switch and the capability flag
+ }
+
+ inline bool CPUID::supportsMMX2()
+ {
+ return supportsSSE(); // The 64-bit integer vector instructions in question arrived together with SSE
+ }
+
+ inline bool CPUID::supportsSSE()
+ {
+ return enableSSE && SSE; // Needs both the enable switch and the capability flag
+ }
+
+ inline bool CPUID::supportsSSE2()
+ {
+ return enableSSE2 && SSE2; // Needs both the enable switch and the capability flag
+ }
+
+ inline bool CPUID::supportsSSE3()
+ {
+ return enableSSE3 && SSE3; // Needs both the enable switch and the capability flag
+ }
+
+ inline bool CPUID::supportsSSSE3()
+ {
+ return enableSSSE3 && SSSE3; // Needs both the enable switch and the capability flag
+ }
+
+ inline bool CPUID::supportsSSE4_1()
+ {
+ return enableSSE4_1 && SSE4_1; // Needs both the enable switch and the capability flag
+ }
+
+ inline int CPUID::coreCount()
+ {
+ return cores; // Plain read of the static member
+ }
+
+ inline int CPUID::processAffinity()
+ {
+ return affinity; // Plain read of the static member
+ }
+}
+
+#endif // sw_CPUID_hpp
diff --git a/src/System/Configurator.cpp b/src/System/Configurator.cpp
new file mode 100644
index 0000000..ead1d28
--- /dev/null
+++ b/src/System/Configurator.cpp
@@ -0,0 +1,255 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Configurator.hpp"
+
+#include <iostream>
+#include <fstream>
+
+using namespace std;
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <ctype.h>
+
+#if defined(__unix__)
+#include <unistd.h>
+#endif
+
+namespace sw
+{
+ Configurator::Configurator(string iniPath) : path(iniPath)
+ {
+ // Load whatever the file at 'path' currently contains.
+ // The success flag returned by readFile() is discarded here.
+ readFile();
+ }
+
+ Configurator::~Configurator()
+ { // Nothing to do explicitly: the string and vector members release their own storage
+ }
+
+ bool Configurator::readFile()
+ {
+ // Parses the INI file at 'path' into 'names'/'sections'. Returns true when at least one section name is known afterwards.
+ #if defined(__unix__)
+ if(access(path.c_str(), R_OK) != 0)
+ {
+ return false;
+ }
+ #endif
+
+ fstream file(path.c_str(), ios::in);
+ if(file.fail()) return false;
+
+ string line;
+ string keyName; // Section that subsequent "name=value" lines are filed under
+
+ while(getline(file, line))
+ {
+ // Drop the carriage return left behind by CRLF line endings.
+ if(!line.empty() && line[line.length() - 1] == '\r')
+ {
+ line.erase(line.length() - 1);
+ }
+
+ // Skip blank lines. This must come after the '\r' removal: a line that was only "\r" is
+ // empty now, and line[0] would be '\0', which is not printable and would reject the file.
+ if(line.empty()) continue;
+
+ // isprint() is only defined for values representable as unsigned char, so never pass a
+ // negative plain char. A non-printable first character rejects the whole file.
+ if(!isprint(static_cast<unsigned char>(line[0])))
+ {
+ file.close();
+ return false;
+ }
+
+ // The first of ';', '#', '[' or '=' on the line decides how the line is interpreted.
+ string::size_type pLeft = line.find_first_of(";#[=");
+
+ if(pLeft == string::npos) continue; // None of the markers present: ignore the line
+
+ switch(line[pLeft])
+ {
+ case '[':
+ {
+ string::size_type pRight = line.find_last_of("]");
+
+ if(pRight != string::npos && pRight > pLeft)
+ {
+ keyName = line.substr(pLeft + 1, pRight - pLeft - 1);
+ addKeyName(keyName);
+ }
+ }
+ break;
+ case '=':
+ {
+ string valueName = line.substr(0, pLeft);
+ string value = line.substr(pLeft + 1);
+ addValue(keyName, valueName, value);
+ }
+ break;
+ case ';':
+ case '#':
+ // Ignore comments
+ break;
+ }
+ }
+
+ file.close();
+
+ // Success means at least one section name is known.
+ return !names.empty();
+ }
+
+ void Configurator::writeFile(std::string title)
+ {
+ // Rewrites the file at 'path': a "; title" line, then each "[section]" header followed by its name=value lines.
+ #if defined(__unix__)
+ if(access(path.c_str(), W_OK) != 0) return; // Give up unless access() reports the path as writable
+ #endif
+
+ fstream out(path.c_str(), ios::out);
+ if(out.fail()) return;
+
+ // Title comment line followed by an empty line.
+ out << "; " << title << endl << endl;
+
+ for(size_t s = 0; s < sections.size(); s++)
+ {
+ const Section &section = sections[s];
+ out << "[" << names[s] << "]" << endl;
+
+ for(size_t v = 0; v < section.names.size(); v++)
+ {
+ out << section.names[v] << "=" << section.values[v] << endl;
+ }
+
+ out << endl;
+ }
+
+ out.close();
+ }
+
+ int Configurator::findKey(string keyName) const
+ {
+ // Linear search over the section names; yields the index of the first match, or -1 if there is none.
+ unsigned int keyID = 0;
+ while(keyID < names.size() && names[keyID] != keyName)
+ {
+ keyID++;
+ }
+
+ if(keyID == names.size()) return -1;
+ return keyID;
+ }
+
+ int Configurator::findValue(unsigned int keyID, string valueName) const
+ {
+ // Returns the index of 'valueName' within section 'keyID', or -1 when the section index is out of
+ // range or the name is absent. The range test also covers the case of no sections at all.
+ if(keyID >= sections.size())
+ {
+ return -1;
+ }
+
+ const std::vector<string> &entryNames = sections[keyID].names;
+ for(unsigned int valueID = 0; valueID != entryNames.size(); ++valueID)
+ {
+ if(entryNames[valueID] == valueName) return valueID;
+ }
+
+ return -1;
+ }
+
+ unsigned int Configurator::addKeyName(string keyName)
+ {
+ names.push_back(keyName); // A section's name and its (initially empty) body share one index
+ sections.push_back(Section());
+ return (unsigned int)(names.size() - 1); // Index of the section just appended
+ }
+
+ void Configurator::addValue(string const keyName, string const valueName, string const value)
+ {
+ // Stores 'value' under keyName/valueName, creating the section and/or the entry on demand
+ // and overwriting the value of an entry that already exists.
+ int keyID = findKey(keyName);
+ if(keyID == -1) keyID = addKeyName(keyName);
+
+ Section &section = sections[keyID];
+ const int valueID = findValue(keyID, valueName);
+
+ if(valueID != -1)
+ {
+ section.values[valueID] = value;
+ }
+ else
+ {
+ // Names and values are parallel vectors, so extend both together.
+ section.names.push_back(valueName);
+ section.values.push_back(value);
+ }
+ }
+
+ string Configurator::getValue(string keyName, string valueName, string defaultValue) const
+ {
+ // Falls back to 'defaultValue' when either the section or the entry is unknown.
+ const int keyID = findKey(keyName);
+ const int valueID = (keyID == -1) ? -1 : findValue((unsigned int)keyID, valueName);
+
+ if(valueID == -1) return defaultValue;
+ return sections[keyID].values[valueID];
+ }
+
+ int Configurator::getInteger(string keyName, string valueName, int defaultValue) const
+ {
+ // The default is handed to getValue() as text, so the found and the fallback case share the atoi() parse.
+ char defaultText[256];
+ sprintf(defaultText, "%d", defaultValue);
+ const string text = getValue(keyName, valueName, defaultText);
+ return atoi(text.c_str());
+ }
+
+ bool Configurator::getBoolean(string keyName, string valueName, bool defaultValue) const
+ {
+ return 0 != getInteger(keyName, valueName, defaultValue ? 1 : 0); // Any non-zero integer counts as true
+ }
+
+ double Configurator::getFloat(string keyName, string valueName, double defaultValue) const
+ {
+ // Returns the stored text parsed with atof(), or 'defaultValue' unchanged when the section or entry is missing.
+ // The entry is looked up directly: printing the default with "%f" would round it to six decimals and can overrun a fixed buffer.
+ int keyID = findKey(keyName);
+ int valueID = (keyID == -1) ? -1 : findValue((unsigned int)keyID, valueName);
+ return (valueID == -1) ? defaultValue : atof(sections[keyID].values[valueID].c_str());
+ }
+
+ unsigned int Configurator::getFormatted(string keyName, string valueName, char *format,
+ void *v1, void *v2, void *v3, void *v4,
+ void *v5, void *v6, void *v7, void *v8,
+ void *v9, void *v10, void *v11, void *v12,
+ void *v13, void *v14, void *v15, void *v16)
+ {
+ // Scans the stored value with sscanf() using 'format'; returns the number of fields converted, 0 if none or if the value is empty/missing.
+ string value = getValue(keyName, valueName);
+ if(!value.length()) return 0;
+
+ // sscanf() reports EOF (a negative value) when the input ends before the first conversion.
+ // Report that as zero instead of letting it wrap around to a huge unsigned count.
+ int converted = sscanf(value.c_str(), format, v1, v2, v3, v4, v5, v6, v7, v8,
+ v9, v10, v11, v12, v13, v14, v15, v16);
+ return converted > 0 ? (unsigned int)converted : 0;
+ }
+}
diff --git a/src/System/Configurator.hpp b/src/System/Configurator.hpp
new file mode 100644
index 0000000..6fd930c
--- /dev/null
+++ b/src/System/Configurator.hpp
@@ -0,0 +1,66 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Configurator_hpp
+#define sw_Configurator_hpp
+
+#include <string>
+#include <vector>
+
+#include <stdlib.h>
+
+namespace sw
+{
+ class Configurator // In-memory view of an INI-style file: named sections holding name=value entries
+ {
+ public:
+ Configurator(std::string iniPath = ""); // Remembers the path and immediately tries to read the file
+
+ ~Configurator();
+
+ std::string getValue(std::string sectionName, std::string valueName, std::string defaultValue = "") const; // defaultValue if the section or entry is missing
+ int getInteger(std::string sectionName, std::string valueName, int defaultValue = 0) const; // Parsed with atoi()
+ bool getBoolean(std::string sectionName, std::string valueName, bool defaultValue = false) const; // True for any non-zero integer
+ double getFloat(std::string sectionName, std::string valueName, double defaultValue = 0.0) const; // Parsed with atof()
+ unsigned int getFormatted(std::string sectionName, std::string valueName, char *format, // Runs sscanf() on the stored value with up to 16 output pointers
+ void *v1 = 0, void *v2 = 0, void *v3 = 0, void *v4 = 0,
+ void *v5 = 0, void *v6 = 0, void *v7 = 0, void *v8 = 0,
+ void *v9 = 0, void *v10 = 0, void *v11 = 0, void *v12 = 0,
+ void *v13 = 0, void *v14 = 0, void *v15 = 0, void *v16 = 0);
+
+ void addValue(std::string sectionName, std::string valueName, std::string value); // Creates the section/entry if needed, otherwise overwrites the value
+
+ void writeFile(std::string title = "Configuration File"); // Writes all sections back to 'path' with 'title' as the leading comment
+
+ private:
+ bool readFile(); // Parses 'path'; true when at least one section name is known afterwards
+
+ unsigned int addKeyName(std::string sectionName); // Appends a section and returns its index
+ int findKey(std::string sectionName) const; // Section index, or -1
+ int findValue(unsigned int sectionID, std::string valueName) const; // Entry index within the section, or -1
+
+ std::string path; // File read by readFile() and written by writeFile()
+
+ struct Section
+ {
+ std::vector<std::string> names; // Entry names; names[i] pairs with values[i]
+ std::vector<std::string> values;
+ };
+
+ std::vector<Section> sections; // sections[i] is the body of the section called names[i]
+ std::vector<std::string> names;
+ };
+}
+
+#endif // sw_Configurator_hpp
diff --git a/src/System/Debug.cpp b/src/System/Debug.cpp
new file mode 100644
index 0000000..acf469e
--- /dev/null
+++ b/src/System/Debug.cpp
@@ -0,0 +1,39 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Debug.hpp"
+
+#include <stdio.h>
+#include <stdarg.h>
+
+namespace sw
+{
+void trace(const char *format, ...)
+{
+ // File logging is switched off; set this to true to append traces to debug.txt.
+ const bool logToFile = false;
+ if(!logToFile) return;
+
+ FILE *log = fopen("debug.txt", "a");
+ if(!log) return;
+
+ // Forward the variadic arguments to vfprintf() unchanged.
+ va_list args;
+ va_start(args, format);
+ vfprintf(log, format, args);
+ va_end(args);
+
+ fclose(log);
+}
+}
diff --git a/src/System/Debug.hpp b/src/System/Debug.hpp
new file mode 100644
index 0000000..9758c3b
--- /dev/null
+++ b/src/System/Debug.hpp
@@ -0,0 +1,58 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Debug_hpp
+#define Debug_hpp
+
+#if defined(__ANDROID__) && !defined(ANDROID_HOST_BUILD)
+#include "DebugAndroid.hpp"
+#else
+
+#include <assert.h>
+#include <stdio.h>
+
+#undef min
+#undef max
+
+namespace sw
+{
+void trace(const char *format, ...); // printf-style trace function; defined in Debug.cpp
+inline void trace() {} // No-op overload, used when a macro expands to sw::trace() with an empty argument list
+}
+
+#if !defined(NDEBUG) || defined(DCHECK_ALWAYS_ON) // Tracing is active in debug builds or when DCHECK_ALWAYS_ON is defined
+ #define TRACE(format, ...) sw::trace("[%p]%s(" format ")\n", (const void*)this, __FUNCTION__, ##__VA_ARGS__) // Needs 'this' in scope; %p is the matching conversion for a pointer
+#else
+ #define TRACE(...) ((void)0)
+#endif
+
+#if !defined(NDEBUG) || defined(DCHECK_ALWAYS_ON) // Active variant: traces the location and an optional printf-style message, then asserts
+ #define UNIMPLEMENTED(...) do { \
+ sw::trace("\t! Unimplemented: %s(%d): ", __FUNCTION__, __LINE__); \
+ sw::trace(__VA_ARGS__); /* Resolves to the no-op sw::trace() overload when no message is given */ \
+ sw::trace("\n"); \
+ ASSERT(false); \
+ } while(0)
+#else // Otherwise the macro expands to nothing
+ #define UNIMPLEMENTED(...) ((void)0)
+#endif
+
+#if !defined(NDEBUG) || defined(DCHECK_ALWAYS_ON) // Active variant: traces a failed expression, then hands it to the standard assert()
+ #define ASSERT(expression) {if(!(expression)) sw::trace("\t! Assert failed in %s(%d): " #expression "\n", __FUNCTION__, __LINE__); assert(expression);} // NOTE(review): bare { } block (unsafe before 'else') and 'expression' is evaluated twice when assert() is active
+#else // Otherwise ASSERT is simply an alias for assert
+ #define ASSERT assert
+#endif
+
+#endif // !__ANDROID__
+#endif // Debug_hpp
diff --git a/src/System/DebugAndroid.cpp b/src/System/DebugAndroid.cpp
new file mode 100644
index 0000000..c511fc3
--- /dev/null
+++ b/src/System/DebugAndroid.cpp
@@ -0,0 +1,53 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "DebugAndroid.hpp"
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <cutils/properties.h>
+
+void AndroidEnterDebugger()
+{
+ ALOGE(__FUNCTION__);
+#ifndef NDEBUG
+ // Without NDEBUG: wait for a debugger when the debug.db.uid property is non-negative and above our effective uid.
+ char value[PROPERTY_VALUE_MAX];
+ property_get("debug.db.uid", value, "-1");
+ int debug_uid = atoi(value);
+ if((debug_uid >= 0) && (geteuid() < static_cast<uid_t>(debug_uid)))
+ {
+ ALOGE("Waiting for debugger: gdbserver :${PORT} --attach %u. Look for thread %u", getpid(), gettid());
+ volatile int waiting = 1; // Nothing in this function clears it; it has to be reset externally, e.g. from the attached debugger
+ while (waiting) {
+ sleep(1);
+ }
+ }
+ else
+ {
+ ALOGE("No debugger");
+ }
+#endif
+}
+
+void trace(const char *format, ...)
+{
+#ifndef NDEBUG
+ va_list args; // Forwarded unchanged to the Android log at verbose priority; with NDEBUG this function does nothing
+ va_start(args, format);
+ android_vprintLog(ANDROID_LOG_VERBOSE, NULL, LOG_TAG, format, args);
+ va_end(args);
+#endif
+}
diff --git a/src/System/DebugAndroid.hpp b/src/System/DebugAndroid.hpp
new file mode 100644
index 0000000..eced194
--- /dev/null
+++ b/src/System/DebugAndroid.hpp
@@ -0,0 +1,99 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef DebugAndroid_hpp
+#define DebugAndroid_hpp
+
+#if ANDROID_PLATFORM_SDK_VERSION < 27
+#include <cutils/log.h>
+#elif ANDROID_PLATFORM_SDK_VERSION >= 27
+#include <log/log.h>
+#else
+#error "ANDROID_PLATFORM_SDK_VERSION is not defined"
+#endif
+
+#include <cassert>
+
+// On Android Virtual Devices we heavily depend on logging, even in
+// production builds. We do this because AVDs are components of larger
+// systems, and may be configured in ways that are difficult to
+// reproduce locally. For example, some systems run tests against
+// third-party code that we cannot access. Aborting (cf. assert) on
+// unimplemented functionality creates two problems. First, it produces
+// a service failure where none is needed. Second, it puts the
+// customer on the critical path for notifying us of a problem.
+// The alternative, skipping unimplemented functionality silently, is
+// arguably worse: neither the service provider nor the customer will
+// learn that unimplemented functionality may have compromised the test
+// results.
+// Logging invocations of unimplemented functionality is useful to both
+// service provider and the customer. The service provider can learn
+// that the functionality is needed. The customer learns that the test
+// results may be compromised.
+
+/**
+ * Logs the call. In builds without NDEBUG it then waits for a debugger to attach when the
+ * debug.db.uid property exceeds the effective uid; otherwise it logs "No debugger" and returns.
+ */
+void AndroidEnterDebugger();
+
+#define ASSERT(E) do { /* On failure: log the expression and location, then call AndroidEnterDebugger() */ \
+ if (!(E)) { \
+ ALOGE("badness: assertion_failed %s in %s at %s:%d", #E, \
+ __FUNCTION__, __FILE__, __LINE__); \
+ AndroidEnterDebugger(); \
+ } \
+ } while(0)
+
+#undef assert
+#define assert(E) ASSERT(E) // Route the standard assert() through the logging ASSERT defined here
+
+#define ERR(format, ...) /* Logs an error with location and a printf-style message, then calls AndroidEnterDebugger() */ \
+ do { \
+ ALOGE("badness: err %s %s:%d (" format ")", __FUNCTION__, __FILE__, \
+ __LINE__, ##__VA_ARGS__); \
+ AndroidEnterDebugger(); \
+ } while(0)
+
+#define FIXME(format, ...) /* Same as ERR but tagged "fixme" in the log */ \
+ do { \
+ ALOGE("badness: fixme %s %s:%d (" format ")", __FUNCTION__, __FILE__, \
+ __LINE__, ##__VA_ARGS__); \
+ AndroidEnterDebugger(); \
+ } while(0)
+
+// TODO: Handle __VA_ARGS__ (can be empty); the arguments are currently discarded and only the location is logged
+#define UNIMPLEMENTED(...) do { \
+ ALOGE("badness: unimplemented: %s %s:%d", \
+ __FUNCTION__, __FILE__, __LINE__); \
+ AndroidEnterDebugger(); \
+ } while(0)
+
+#define UNREACHABLE(value) do { /* Logs the location plus the spelled-out expression and its value via %d */ \
+ ALOGE("badness: unreachable case reached: %s %s:%d. %s: %d", \
+ __FUNCTION__, __FILE__, __LINE__, #value, value); \
+ AndroidEnterDebugger(); \
+ } while(0)
+
+#ifndef NDEBUG // Verbose tracing only in builds without NDEBUG
+ #define TRACE(format, ...) \
+ ALOGV("%s %s:%d (" format ")", __FUNCTION__, __FILE__, \
+ __LINE__, ##__VA_ARGS__)
+#else
+ #define TRACE(...) ((void)0)
+#endif
+
+void trace(const char *format, ...);
+
+#endif // DebugAndroid_hpp
diff --git a/src/System/GrallocAndroid.cpp b/src/System/GrallocAndroid.cpp
new file mode 100644
index 0000000..c877e9933
--- /dev/null
+++ b/src/System/GrallocAndroid.cpp
@@ -0,0 +1,106 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "GrallocAndroid.hpp"
+#include "Debug.hpp"
+
+#ifdef HAVE_GRALLOC1
+#include <sync/sync.h>
+#endif
+
+GrallocModule *GrallocModule::getInstance()
+{
+ static GrallocModule singleton; // Constructed on the first call; every caller receives the same object
+ return &singleton;
+}
+
+GrallocModule::GrallocModule() : m_major_version(0xff), m_module(nullptr) // 0xff matches no case in lock()/unlock(), so they return -1
+{
+ const hw_module_t *module = nullptr;
+ if(hw_get_module(GRALLOC_HARDWARE_MODULE_ID, &module) != 0 || !module) return; // No gralloc module could be loaded; do not dereference it
+
+ m_major_version = (module->module_api_version >> 8) & 0xff; // Bits 8-15 of the reported API version
+ switch(m_major_version)
+ {
+ case 0:
+ m_module = reinterpret_cast<const gralloc_module_t*>(module);
+ break;
+ case 1:
+#ifdef HAVE_GRALLOC1
+ gralloc1_open(module, &m_gralloc1_device);
+ m_gralloc1_lock = (GRALLOC1_PFN_LOCK) m_gralloc1_device->getFunction(m_gralloc1_device, GRALLOC1_FUNCTION_LOCK);
+ m_gralloc1_unlock = (GRALLOC1_PFN_UNLOCK)m_gralloc1_device->getFunction(m_gralloc1_device, GRALLOC1_FUNCTION_UNLOCK);
+ break;
+#endif
+ default: // Also reached for version 1 when HAVE_GRALLOC1 is not defined
+ TRACE("unknown gralloc major version (%d)", m_major_version);
+ break;
+ }
+}
+
+int GrallocModule::lock(buffer_handle_t handle, int usage, int left, int top, int width, int height, void **vaddr)
+{
+ switch(m_major_version) // Dispatch on the major version recorded by the constructor
+ {
+ case 0:
+ {
+ return m_module->lock(m_module, handle, usage, left, top, width, height, vaddr); // Forward unchanged to the version-0 module
+ }
+ case 1: // Without HAVE_GRALLOC1 this label falls straight through to 'default'
+#ifdef HAVE_GRALLOC1
+ {
+ gralloc1_rect_t outRect{};
+ outRect.left = left;
+ outRect.top = top;
+ outRect.width = width;
+ outRect.height = height;
+ return m_gralloc1_lock(m_gralloc1_device, handle, usage, usage, &outRect, vaddr, -1); // 'usage' is passed twice (presumably producer and consumer usage) and -1 presumably means no acquire fence — confirm against gralloc1.h
+ }
+#endif
+ default:
+ {
+ TRACE("no gralloc module to lock");
+ return -1; // No handler for this version
+ }
+ }
+}
+
+int GrallocModule::unlock(buffer_handle_t handle)
+{
+ switch(m_major_version) // Dispatch on the major version recorded by the constructor
+ {
+ case 0:
+ {
+ return m_module->unlock(m_module, handle); // Forward unchanged to the version-0 module
+ }
+ case 1: // Without HAVE_GRALLOC1 this label falls straight through to 'default'
+#ifdef HAVE_GRALLOC1
+ {
+ int32_t fenceFd = -1;
+ int error = m_gralloc1_unlock(m_gralloc1_device, handle, &fenceFd);
+ if (!error && fenceFd >= 0) // Only wait on and close a descriptor that was actually handed back
+ {
+ sync_wait(fenceFd, -1);
+ close(fenceFd);
+ }
+ return error;
+ }
+#endif
+ default:
+ {
+ TRACE("no gralloc module to unlock");
+ return -1; // No handler for this version
+ }
+ }
+}
diff --git a/src/System/GrallocAndroid.hpp b/src/System/GrallocAndroid.hpp
new file mode 100644
index 0000000..fe0b15a
--- /dev/null
+++ b/src/System/GrallocAndroid.hpp
@@ -0,0 +1,44 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GRALLOC_ANDROID
+#define GRALLOC_ANDROID
+
+#include <hardware/gralloc.h>
+
+#ifdef HAVE_GRALLOC1
+#include <hardware/gralloc1.h>
+#endif
+
+#include <unistd.h> // for close()
+
+class GrallocModule // Thin dispatcher over the gralloc module, selecting the call path by its major version
+{
+public:
+ static GrallocModule *getInstance(); // Single shared instance, constructed on first use
+ int lock(buffer_handle_t handle, int usage, int left, int top, int width, int height, void **vaddr); // -1 when no handler exists for the detected version
+ int unlock(buffer_handle_t handle); // -1 when no handler exists for the detected version
+
+private:
+ GrallocModule();
+ uint8_t m_major_version = 0xff; // Matches no handled version until the constructor assigns the detected one
+ const gralloc_module_t *m_module = nullptr; // Only assigned for major version 0
+#ifdef HAVE_GRALLOC1
+ gralloc1_device_t *m_gralloc1_device = nullptr;
+ GRALLOC1_PFN_LOCK m_gralloc1_lock = nullptr;
+ GRALLOC1_PFN_UNLOCK m_gralloc1_unlock = nullptr;
+#endif
+};
+
+#endif // GRALLOC_ANDROID
diff --git a/src/System/Half.cpp b/src/System/Half.cpp
new file mode 100644
index 0000000..cde8190
--- /dev/null
+++ b/src/System/Half.cpp
@@ -0,0 +1,102 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Half.hpp"
+
+namespace sw
+{
+ half::half(float fp32) // Converts a 32-bit float into the 16-bit encoding stored in fp16i
+ {
+ unsigned int fp32i = *(unsigned int*)&fp32; // Raw bits of the float
+ unsigned int sign = (fp32i & 0x80000000) >> 16; // Sign moved from bit 31 down to bit 15
+ unsigned int abs = fp32i & 0x7FFFFFFF; // Exponent and mantissa bits without the sign
+
+ if(abs > 0x47FFEFFF) // Infinity (NOTE(review): 0x7FFF below sets every exponent and mantissa bit, which IEEE half reads as NaN rather than 0x7C00 — confirm intent)
+ {
+ fp16i = sign | 0x7FFF;
+ }
+ else if(abs < 0x38800000) // Denormal: float exponent field below 113, i.e. magnitude under 2^-14
+ {
+ unsigned int mantissa = (abs & 0x007FFFFF) | 0x00800000; // 23 stored bits plus the implicit leading one
+ int e = 113 - (abs >> 23); // How far the exponent field lies below 113; used as the right-shift amount
+
+ if(e < 24)
+ {
+ abs = mantissa >> e;
+ }
+ else
+ {
+ abs = 0; // All 24 significant bits would be shifted out
+ }
+
+ fp16i = sign | (abs + 0x00000FFF + ((abs >> 13) & 1)) >> 13; // Drop 13 bits, rounding to nearest with ties to even
+ }
+ else
+ {
+ fp16i = sign | (abs + 0xC8000000 + 0x00000FFF + ((abs >> 13) & 1)) >> 13; // Adding 0xC8000000 subtracts 112 << 23 modulo 2^32 (exponent rebias); same rounding as above
+ }
+ }
+
+ half::operator float() const // Expands the 16-bit encoding in fp16i to a 32-bit float
+ {
+ unsigned int fp32i;
+
+ int s = (fp16i >> 15) & 0x00000001; // Sign bit
+ int e = (fp16i >> 10) & 0x0000001F; // 5-bit exponent field
+ int m = fp16i & 0x000003FF; // 10-bit mantissa field
+
+ if(e == 0)
+ {
+ if(m == 0)
+ {
+ fp32i = s << 31; // Zero: only the sign bit is carried over
+
+ return (float&)fp32i;
+ }
+ else
+ {
+ while(!(m & 0x00000400)) // Denormal: shift left until bit 10 is set, lowering the exponent once per shift
+ {
+ m <<= 1;
+ e -= 1;
+ }
+
+ e += 1;
+ m &= ~0x00000400; // Clear bit 10 so only the 10 mantissa bits remain
+ }
+ }
+
+ e = e + (127 - 15); // Rebias the exponent by 112; NOTE(review): exponent field 31 is not special-cased and is rebiased like any other value — confirm intent
+ m = m << 13; // Widen the mantissa from 10 to 23 bits
+
+ fp32i = (s << 31) | (e << 23) | m;
+
+ return (float&)fp32i; // Reinterpret the assembled bits as a float
+ }
+
+ half &half::operator=(half h)
+ {
+ // Copy the raw 16-bit pattern; no conversion is involved.
+ this->fp16i = h.fp16i;
+ return *this;
+ }
+
+
+ half &half::operator=(float f)
+ {
+ // Convert through the float constructor, then assign the resulting half.
+ const half converted(f);
+ return *this = converted;
+ }
+}
diff --git a/src/System/Half.hpp b/src/System/Half.hpp
new file mode 100644
index 0000000..f2d378e
--- /dev/null
+++ b/src/System/Half.hpp
@@ -0,0 +1,93 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Half_hpp
+#define sw_Half_hpp
+
+namespace sw
+{
+ class half // 16-bit floating-point value stored as its raw bit pattern
+ {
+ public:
+ half() = default; // Defaulted: fp16i is not explicitly initialized
+ explicit half(float f); // Conversion from a 32-bit float (defined in Half.cpp)
+
+ operator float() const; // Implicit conversion back to a 32-bit float (defined in Half.cpp)
+
+ half &operator=(half h); // Copies the raw bits
+ half &operator=(float f); // Converts the float, then assigns
+
+ private:
+ unsigned short fp16i; // Raw 16-bit encoding
+ };
+
+ inline half shortAsHalf(short s)
+ {
+ // Reinterpret the 16 bits of 's' as a half by writing one union member and reading the other.
+ union
+ {
+ short bits;
+ half value;
+ } pun;
+
+ pun.bits = s;
+ return pun.value;
+ }
+
+ class RGB9E5
+ {
+ unsigned int R : 9; // Three 9-bit mantissas ...
+ unsigned int G : 9;
+ unsigned int B : 9;
+ unsigned int E : 5; // ... scaled by one shared 5-bit exponent
+
+ public:
+ void toRGB16F(half rgb[3]) const
+ {
+ // Exponent bias (15) + number of mantissa bits per component (9) = 24
+ constexpr int bias = 24;
+ const float scale = (1u << E) * (1.0f / (1 << bias)); // 2^(E - 24), common to all three channels
+ rgb[0] = half(scale * R);
+ rgb[1] = half(scale * G);
+ rgb[2] = half(scale * B);
+ }
+ };
+
+ class R11G11B10F
+ {
+ unsigned int R : 11; // Expanded by float11ToFloat16()
+ unsigned int G : 11; // Expanded by float11ToFloat16()
+ unsigned int B : 10; // Expanded by float10ToFloat16()
+
+ static inline half float11ToFloat16(unsigned short fp11)
+ {
+ return shortAsHalf(static_cast<short>(fp11 << 4)); // 11 bits land in bits 4..14; bit 15 (sign) stays 0
+ }
+
+ static inline half float10ToFloat16(unsigned short fp10)
+ {
+ return shortAsHalf(static_cast<short>(fp10 << 5)); // 10 bits land in bits 5..14; bit 15 (sign) stays 0
+ }
+
+ public:
+ void toRGB16F(half rgb[3]) const
+ {
+ rgb[0] = float11ToFloat16(R); // Red
+ rgb[1] = float11ToFloat16(G); // Green
+ rgb[2] = float10ToFloat16(B); // Blue has one bit less
+ }
+ };
+}
+
+#endif // sw_Half_hpp
diff --git a/src/System/Math.cpp b/src/System/Math.cpp
new file mode 100644
index 0000000..290d4ab
--- /dev/null
+++ b/src/System/Math.cpp
@@ -0,0 +1,49 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Math.hpp"
+
+namespace sw
+{
+ // One FNV-1a step: XOR the byte into the hash, then multiply by the 64-bit FNV prime.
+ inline uint64_t FNV_1a(uint64_t hash, unsigned char data)
+ {
+ return (hash ^ data) * 1099511628211ull;
+ }
+
+ // 64-bit FNV-1a hash of 'size' bytes at 'data'; returns the offset basis unchanged when size <= 0.
+ uint64_t FNV_1a(const unsigned char *data, int size)
+ {
+ uint64_t hash = 0xCBF29CE484222325ull; // Offset basis; kept unsigned because it does not fit a signed 64-bit value
+ for(int i = 0; i < size; i++)
+ {
+ hash = FNV_1a(hash, data[i]);
+ }
+ return hash;
+ }
+
+ unsigned char sRGB8toLinear8(unsigned char value)
+ {
+ // Maps an 8-bit sRGB value to 8-bit linear through a 256-entry table built on first use.
+ // A function-local static is initialized thread-safely (C++11); a sentinel kept in entry 0 would be overwritten before the other entries exist.
+ struct Table { unsigned char entry[256]; };
+ static const Table table = []() -> Table
+ {
+ Table t;
+ for(int i = 0; i < 256; i++) t.entry[i] = static_cast<unsigned char>(sw::sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
+ return t;
+ }();
+ return table.entry[value];
+ }
+}
diff --git a/src/System/Math.hpp b/src/System/Math.hpp
new file mode 100644
index 0000000..a35d2e0
--- /dev/null
+++ b/src/System/Math.hpp
@@ -0,0 +1,385 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Math_hpp
+#define sw_Math_hpp
+
+#include "Types.hpp"
+#include "Half.hpp"
+
+#include <cmath>
+#if defined(_MSC_VER)
+ #include <intrin.h>
+#endif
+
+namespace sw
+{
+ using std::abs;
+
+ #undef min
+ #undef max
+
+ // Returns the larger of two values; b is returned when a is not greater.
+ template<class T>
+ inline T max(T a, T b)
+ {
+     if(a > b)
+     {
+         return a;
+     }
+
+     return b;
+ }
+
+ // Returns the smaller of two values; b is returned when a is not smaller.
+ template<class T>
+ inline T min(T a, T b)
+ {
+     if(a < b)
+     {
+         return a;
+     }
+
+     return b;
+ }
+
+ // Largest of three values.
+ template<class T>
+ inline T max(T a, T b, T c)
+ {
+     T largestOfFirstTwo = max(a, b);
+
+     return max(largestOfFirstTwo, c);
+ }
+
+ // Smallest of three values.
+ template<class T>
+ inline T min(T a, T b, T c)
+ {
+     T smallestOfFirstTwo = min(a, b);
+
+     return min(smallestOfFirstTwo, c);
+ }
+
+ // Largest of four values, reduced pairwise.
+ template<class T>
+ inline T max(T a, T b, T c, T d)
+ {
+     T left = max(a, b);
+     T right = max(c, d);
+
+     return max(left, right);
+ }
+
+ // Smallest of four values, reduced pairwise.
+ template<class T>
+ inline T min(T a, T b, T c, T d)
+ {
+     T left = min(a, b);
+     T right = min(c, d);
+
+     return min(left, right);
+ }
+
+ // Exchanges the values of a and b through a temporary copy.
+ template<class T>
+ inline void swap(T &a, T &b)
+ {
+     T previousA = a;
+
+     a = b;
+     b = previousA;
+ }
+
+ // Reinterprets the object representation of `source` as a destType by
+ // writing one member of a union and reading the other.
+ template <typename destType, typename sourceType>
+ destType bitCast(const sourceType &source)
+ {
+     union Overlay
+     {
+         sourceType from;
+         destType to;
+     };
+
+     Overlay overlay;
+     overlay.from = source;
+
+     return overlay.to;
+ }
+
+ // Rounds to the nearest integer by flooring x + 0.5, so exact halves go
+ // toward positive infinity.
+ inline int iround(float x)
+ {
+     const float shifted = x + 0.5f;
+
+     return (int)floor(shifted);
+     // return _mm_cvtss_si32(_mm_load_ss(&x)); // FIXME: Demands SSE support
+ }
+
+ // Largest integer not greater than x, returned as an int.
+ inline int ifloor(float x)
+ {
+     const int result = (int)floor(x);
+
+     return result;
+ }
+
+ // Rounds x up to the next multiple of 16: adds 15, then clears the low four bits.
+ // NOTE(review): presumably x carries 4 fractional bits and the result stays in
+ // that fixed-point format — confirm against callers.
+ inline int ceilFix4(int x)
+ {
+     const unsigned int wholePartMask = 0xFFFFFFF0;
+
+     return (x + 0xF) & wholePartMask;
+ }
+
+ // Adds 15 and shifts right by 4, i.e. divides by 16 rounding up.
+ inline int ceilInt4(int x)
+ {
+     const int biased = x + 0xF;
+
+     return biased >> 4;
+ }
+
+ // Evaluates to the number of bits needed to represent the 32-bit value x, i.e.
+ // the index of its highest set bit plus one (0 for x == 0, 32 when bit 31 is set).
+ // Each term tests x against a mask covering the top k bits, for k = 1..32, so the
+ // sum counts how many of those masks reach the highest set bit.
+ // The argument is expanded 32 times; pass only side-effect-free expressions.
+ #define BITS(x) ( \
+ !!((x) & 0x80000000) + \
+ !!((x) & 0xC0000000) + \
+ !!((x) & 0xE0000000) + \
+ !!((x) & 0xF0000000) + \
+ !!((x) & 0xF8000000) + \
+ !!((x) & 0xFC000000) + \
+ !!((x) & 0xFE000000) + \
+ !!((x) & 0xFF000000) + \
+ !!((x) & 0xFF800000) + \
+ !!((x) & 0xFFC00000) + \
+ !!((x) & 0xFFE00000) + \
+ !!((x) & 0xFFF00000) + \
+ !!((x) & 0xFFF80000) + \
+ !!((x) & 0xFFFC0000) + \
+ !!((x) & 0xFFFE0000) + \
+ !!((x) & 0xFFFF0000) + \
+ !!((x) & 0xFFFF8000) + \
+ !!((x) & 0xFFFFC000) + \
+ !!((x) & 0xFFFFE000) + \
+ !!((x) & 0xFFFFF000) + \
+ !!((x) & 0xFFFFF800) + \
+ !!((x) & 0xFFFFFC00) + \
+ !!((x) & 0xFFFFFE00) + \
+ !!((x) & 0xFFFFFF00) + \
+ !!((x) & 0xFFFFFF80) + \
+ !!((x) & 0xFFFFFFC0) + \
+ !!((x) & 0xFFFFFFE0) + \
+ !!((x) & 0xFFFFFFF0) + \
+ !!((x) & 0xFFFFFFF8) + \
+ !!((x) & 0xFFFFFFFC) + \
+ !!((x) & 0xFFFFFFFE) + \
+ !!((x) & 0xFFFFFFFF))
+
+ // Macro forms of max/min. Each argument is expanded twice, so arguments with
+ // side effects are evaluated more than once.
+ #define MAX(x, y) ((x) > (y) ? (x) : (y))
+ #define MIN(x, y) ((x) < (y) ? (x) : (y))
+
+ // 2 raised to the power x, computed with exp2f().
+ inline float exp2(float x)
+ {
+     const float result = exp2f(x);
+
+     return result;
+ }
+
+ // 2 raised to the power x for integer x, computed as a left shift of 1.
+ inline int exp2(int x)
+ {
+     const int one = 1;
+
+     return one << x;
+ }
+
+ // Index of the highest set bit of x (floor of the base-2 logarithm).
+ // Uses _BitScanReverse on MSVC and __builtin_clz elsewhere; x == 0 is not
+ // handled specially on either path.
+ inline unsigned long log2(int x)
+ {
+     #if defined(_MSC_VER)
+         unsigned long highestBit;
+         _BitScanReverse(&highestBit, x);
+         return highestBit;
+     #else
+         const int leadingZeros = __builtin_clz(x);
+         return 31 - leadingZeros;
+     #endif
+ }
+
+ // Returns the unbiased exponent stored in bits 23..30 of x's 32-bit pattern
+ // (the field value minus 127). The sign bit is masked out. A zero exponent
+ // field (e.g. x == 0.0f) yields -127 and an all-ones field yields 128.
+ inline int ilog2(float x)
+ {
+     // Read the bits through bitCast instead of dereferencing a float* cast to
+     // unsigned int*, which violates the strict-aliasing rule.
+     const unsigned int bits = bitCast<unsigned int>(x);
+
+     // Convert to int before removing the bias so the subtraction is done in
+     // signed arithmetic rather than wrapping in unsigned.
+     const int biasedExponent = static_cast<int>((bits & 0x7F800000) >> 23);
+
+     return biasedExponent - 127;
+ }
+
+ // Base-2 logarithm, computed as the natural logarithm times 1 / ln(2).
+ inline float log2(float x)
+ {
+     const float reciprocalLn2 = 1.44269504f; // 1.0 / log[e](2)
+
+     return logf(x) * reciprocalLn2;
+ }
+
+ // True when x has at most one bit set: x & -x isolates the lowest set bit and
+ // is compared against x itself. Note that this also reports true for x == 0.
+ inline bool isPow2(int x)
+ {
+     const int lowestSetBit = x & -x;
+
+     return lowestSetBit == x;
+ }
+
+ // Limits x to the range [a, b]. The lower bound is applied first and the
+ // upper bound second, so when a > b the result is b.
+ template<class T>
+ inline T clamp(T x, T a, T b)
+ {
+     if(x < a)
+     {
+         x = a;
+     }
+
+     return (x > b) ? b : x;
+ }
+
+ // Limits x to the range [0, 1].
+ inline float clamp01(float x)
+ {
+     const float lowest = 0.0f;
+     const float highest = 1.0f;
+
+     return clamp(x, lowest, highest);
+ }
+
+ // Smallest power of two that is not less than x, found by doubling from 1.
+ // Returns 1 for any x <= 1.
+ inline int ceilPow2(int x)
+ {
+     int power = 1;
+
+     for(; power < x; power <<= 1)
+     {
+     }
+
+     return power;
+ }
+
+ // Division rounding toward negative infinity. The truncated quotient is
+ // lowered by one when the remainder is negative; shifting the remainder right
+ // by 31 yields -1 in that case and 0 otherwise.
+ // NOTE(review): the correction only looks at the remainder's sign, so this
+ // appears to assume b > 0 — confirm against callers.
+ inline int floorDiv(int a, int b)
+ {
+     const int quotient = a / b;
+     const int correction = (a % b) >> 31;
+
+     return quotient + correction;
+ }
+
+ // Remainder matching floorDiv: b is added to a negative remainder, giving a
+ // result in [0, b) for positive b.
+ inline int floorMod(int a, int b)
+ {
+     const int remainder = a % b;
+     const int signMask = remainder >> 31;
+
+     return remainder + (signMask & b);
+ }
+
+ // Division rounding toward positive infinity. The truncated quotient is
+ // raised by one when the remainder is positive; shifting the negated
+ // remainder right by 31 yields -1 in that case and 0 otherwise.
+ // NOTE(review): the correction only looks at the remainder's sign, so this
+ // appears to assume b > 0 — confirm against callers.
+ inline int ceilDiv(int a, int b)
+ {
+     const int quotient = a / b;
+     const int correction = -(a % b) >> 31;
+
+     return quotient - correction;
+ }
+
+ // Remainder matching ceilDiv: b is subtracted from a positive remainder,
+ // giving a result in (-b, 0] for positive b.
+ inline int ceilMod(int a, int b)
+ {
+     const int remainder = a % b;
+     const int signMask = -remainder >> 31;
+
+     return remainder - (signMask & b);
+ }
+
+ // Converts x in [0, 1] to an n-bit unsigned normalized integer, rounding to
+ // nearest. Inputs at or above 1 give the all-ones value, inputs at or below 0 give 0.
+ template<const int n>
+ inline unsigned int unorm(float x)
+ {
+     const unsigned int allOnes = 0xFFFFFFFF >> (32 - n);
+     const float scale = static_cast<float>(allOnes);
+
+     if(x >= 1.0f)
+     {
+         return allOnes;
+     }
+
+     if(x <= 0.0f)
+     {
+         return 0;
+     }
+
+     return static_cast<unsigned int>(scale * x + 0.5f);
+ }
+
+ // Converts x in [-1, 1] to an n-bit signed normalized value, rounding to
+ // nearest. Negative results are returned as their n-bit two's complement
+ // pattern (masked to the low n bits), not as a negative int.
+ template<const int n>
+ inline int snorm(float x)
+ {
+     const unsigned int mostNegative = 0x80000000 >> (32 - n);
+     const unsigned int mostPositive = 0xFFFFFFFF >> (32 - n + 1);
+     const float scale = static_cast<float>(mostPositive);
+     const unsigned int lowBits = 0xFFFFFFFF >> (32 - n);
+
+     if(x >= 1.0f)
+     {
+         return mostPositive;
+     }
+
+     if(x >= 0.0f)
+     {
+         return static_cast<int>(scale * x + 0.5f);
+     }
+
+     if(x <= -1.0f)
+     {
+         return mostNegative;
+     }
+
+     return static_cast<int>(scale * x - 0.5f) & lowBits;
+ }
+
+ // Rounds x to the nearest integer and saturates it to the n-bit unsigned
+ // range [0, 2^n - 1].
+ template<const int n>
+ inline unsigned int ucast(float x)
+ {
+     const unsigned int allOnes = 0xFFFFFFFF >> (32 - n);
+     const float upperLimit = static_cast<float>(allOnes);
+
+     if(x >= upperLimit)
+     {
+         return allOnes;
+     }
+
+     if(x <= 0.0f)
+     {
+         return 0;
+     }
+
+     return static_cast<unsigned int>(x + 0.5f);
+ }
+
+ // Rounds x to the nearest integer and saturates it to the n-bit signed range.
+ // Negative results are returned as their n-bit two's complement pattern
+ // (masked to the low n bits), not as a negative int.
+ template<const int n>
+ inline int scast(float x)
+ {
+     const unsigned int mostNegative = 0x80000000 >> (32 - n);
+     const unsigned int mostPositive = 0xFFFFFFFF >> (32 - n + 1);
+     const float upperLimit = static_cast<float>(mostPositive);
+     const float lowerMagnitude = static_cast<float>(mostNegative);
+     const unsigned int lowBits = 0xFFFFFFFF >> (32 - n);
+
+     if(x > 0.0f)
+     {
+         if(x >= upperLimit)
+         {
+             return mostPositive;
+         }
+
+         return static_cast<int>(x + 0.5f);
+     }
+
+     if(x <= -lowerMagnitude)
+     {
+         return mostNegative;
+     }
+
+     return static_cast<int>(x - 0.5f) & lowBits;
+ }
+
+ // Decodes an sRGB-encoded component to linear: a linear segment up to 0.04045
+ // and a power curve with exponent 2.4 above it.
+ inline float sRGBtoLinear(float c)
+ {
+     const bool onLinearSegment = (c <= 0.04045f);
+
+     if(onLinearSegment)
+     {
+         return c * 0.07739938f; // 1.0f / 12.92f;
+     }
+
+     const float base = (c + 0.055f) * 0.9478673f; // 1.0f / 1.055f
+
+     return powf(base, 2.4f);
+ }
+
+ // Encodes a linear component as sRGB: a linear segment up to 0.0031308 and a
+ // power curve with exponent 1 / 2.4 above it.
+ inline float linearToSRGB(float c)
+ {
+     const bool onLinearSegment = (c <= 0.0031308f);
+
+     if(onLinearSegment)
+     {
+         return c * 12.92f;
+     }
+
+     const float curved = powf(c, 0.4166667f); // 1.0f / 2.4f
+
+     return 1.055f * curved - 0.055f;
+ }
+
+ unsigned char sRGB8toLinear8(unsigned char value);
+
+ uint64_t FNV_1a(const unsigned char *data, int size); // Fowler-Noll-Vo hash function
+
+ // Round up to the next multiple of alignment
+ // Rounds value up to the next multiple of alignment (runtime alignment).
+ template<typename T>
+ inline T align(T value, unsigned int alignment)
+ {
+     const auto unitCount = (value + alignment - 1) / alignment;
+
+     return unitCount * alignment;
+ }
+
+ // Rounds value up to the next multiple of alignment (compile-time alignment).
+ template<unsigned int alignment, typename T>
+ inline T align(T value)
+ {
+     const auto unitCount = (value + alignment - 1) / alignment;
+
+     return unitCount * alignment;
+ }
+
+ // Converts an unsigned value to int, saturating at 0x7FFFFFFF instead of
+ // wrapping into the negative range.
+ inline int clampToSignedInt(unsigned int x)
+ {
+     const unsigned int largestInt = 0x7FFFFFFFu;
+     const unsigned int limited = (x < largestInt) ? x : largestInt;
+
+     return static_cast<int>(limited);
+ }
+}
+
+#endif // sw_Math_hpp
diff --git a/src/System/Memory.cpp b/src/System/Memory.cpp
new file mode 100644
index 0000000..45fef40
--- /dev/null
+++ b/src/System/Memory.cpp
@@ -0,0 +1,262 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Memory.hpp"
+
+#include "Types.hpp"
+#include "Debug.hpp"
+
+#if defined(_WIN32)
+ #ifndef WIN32_LEAN_AND_MEAN
+ #define WIN32_LEAN_AND_MEAN
+ #endif
+ #include <windows.h>
+ #include <intrin.h>
+#else
+ #include <errno.h>
+ #include <sys/mman.h>
+ #include <stdlib.h>
+ #include <unistd.h>
+#endif
+
+#include <memory.h>
+
+#undef allocate
+#undef deallocate
+
+#if (defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || defined (_M_X64)) && !defined(__x86__)
+#define __x86__
+#endif
+
+namespace sw
+{
+namespace
+{
+// Bookkeeping record that the new[]-based path of allocateRaw() stores directly
+// in front of each aligned pointer it returns. It remembers the pointer that
+// new[] produced so deallocate() can hand it back to delete[].
+struct Allocation
+{
+// size_t bytes;
+ unsigned char *block;
+};
+
+// Allocates `bytes` bytes aligned to `alignment` (which must be a power of two)
+// without initializing them.
+// With LINUX_ENABLE_NAMED_MMAP the memory comes from posix_memalign(); on
+// failure errno is set to its result and nullptr is returned.
+// Otherwise the block is over-allocated with new[] and an Allocation header is
+// placed immediately before the aligned address.
+void *allocateRaw(size_t bytes, size_t alignment)
+{
+ ASSERT((alignment & (alignment - 1)) == 0); // Power of 2 alignment.
+
+ #if defined(LINUX_ENABLE_NAMED_MMAP)
+ void *allocation;
+ int result = posix_memalign(&allocation, alignment, bytes);
+ if(result != 0)
+ {
+ errno = result;
+ allocation = nullptr;
+ }
+ return allocation;
+ #else
+ // Reserve room for the header plus worst-case alignment slack.
+ unsigned char *block = new unsigned char[bytes + sizeof(Allocation) + alignment];
+ unsigned char *aligned = nullptr;
+
+ // NOTE(review): a plain new[] expression reports failure by throwing rather
+ // than returning null, so this check looks like it is always true — confirm
+ // whether the build disables exceptions.
+ if(block)
+ {
+ // Skip past the header, then round up to the next multiple of alignment.
+ aligned = (unsigned char*)((uintptr_t)(block + sizeof(Allocation) + alignment - 1) & -(intptr_t)alignment);
+ Allocation *allocation = (Allocation*)(aligned - sizeof(Allocation));
+
+ // allocation->bytes = bytes;
+ allocation->block = block;
+ }
+
+ return aligned;
+ #endif
+}
+
+#if defined(LINUX_ENABLE_NAMED_MMAP)
+// Create a file descriptor for anonymous memory with the given
+// name. Returns -1 on failure.
+// TODO: remove once libc wrapper exists.
+int memfd_create(const char* name, unsigned int flags)
+{
+    // Supply the syscall number only when the system headers have not already
+    // defined it, so a header-provided value is never redefined.
+    #ifndef __NR_memfd_create
+        #if __aarch64__
+            #define __NR_memfd_create 279
+        #elif __arm__
+            // 32-bit ARM has its own syscall table; 279 is the arm64 number.
+            #define __NR_memfd_create 385
+        #elif __powerpc64__
+            #define __NR_memfd_create 360
+        #elif __i386__
+            #define __NR_memfd_create 356
+        #elif __x86_64__
+            #define __NR_memfd_create 319
+        #endif
+    #endif /* __NR_memfd_create */
+    #ifdef __NR_memfd_create
+        // In the event of no system call this returns -1 with errno set
+        // as ENOSYS.
+        return syscall(__NR_memfd_create, name, flags);
+    #else
+        // Unknown architecture: report failure so callers fall back.
+        return -1;
+    #endif
+}
+
+// Returns a file descriptor for use with an anonymous mmap, if
+// memfd_create fails, -1 is returned. Note, the mappings should be
+// MAP_PRIVATE so that underlying pages aren't shared.
+int anonymousFd()
+{
+    // Function-local static: memfd_create() runs on the first call only, and
+    // every later call gets the same value back (including -1 on failure).
+    static int sharedFd = memfd_create("SwiftShader JIT", 0);
+
+    return sharedFd;
+}
+
+// Ensure there is enough space in the "anonymous" fd for length.
+void ensureAnonFileSize(int anonFd, size_t length)
+{
+    // Largest size the file has been successfully grown to so far.
+    static size_t fileSize = 0;
+
+    if(length > fileSize)
+    {
+        // Record the new size only when the resize succeeded. Otherwise a failed
+        // ftruncate() would be remembered as done and never retried, leaving
+        // later mappings backed by a file that is too short.
+        if(ftruncate(anonFd, length) == 0)
+        {
+            fileSize = length;
+        }
+    }
+}
+#endif // defined(LINUX_ENABLE_NAMED_MMAP)
+
+} // anonymous namespace
+
+// Returns the system page size in bytes. The value is queried once
+// (GetSystemInfo on Windows, sysconf elsewhere) and cached in a static.
+size_t memoryPageSize()
+{
+    static int cachedPageSize = 0;
+
+    if(cachedPageSize != 0)
+    {
+        return cachedPageSize;
+    }
+
+    #if defined(_WIN32)
+        SYSTEM_INFO info;
+        GetSystemInfo(&info);
+        cachedPageSize = info.dwPageSize;
+    #else
+        cachedPageSize = sysconf(_SC_PAGESIZE);
+    #endif
+
+    return cachedPageSize;
+}
+
+// Allocates `bytes` bytes aligned to `alignment` and zero-fills them.
+// Returns whatever allocateRaw() returned, so a null result is passed through.
+void *allocate(size_t bytes, size_t alignment)
+{
+    void *block = allocateRaw(bytes, alignment);
+
+    if(!block)
+    {
+        return block;
+    }
+
+    memset(block, 0, bytes);
+
+    return block;
+}
+
+// Releases memory obtained from allocate(). A null pointer is accepted.
+void deallocate(void *memory)
+{
+    #if defined(LINUX_ENABLE_NAMED_MMAP)
+        free(memory);
+    #else
+        if(!memory)
+        {
+            return;
+        }
+
+        // The Allocation header sits directly in front of the aligned pointer
+        // and holds the address that new[] originally returned.
+        unsigned char *userPointer = (unsigned char*)memory;
+        Allocation *header = (Allocation*)(userPointer - sizeof(Allocation));
+
+        delete[] header->block;
+    #endif
+}
+
+// Allocates read/write memory, rounded up to a whole number of pages, that can
+// later be made executable with markExecutable(). Returns nullptr when the
+// mmap-based path fails.
+void *allocateExecutable(size_t bytes)
+{
+ size_t pageSize = memoryPageSize();
+ // Round up to a multiple of the page size.
+ size_t length = (bytes + pageSize - 1) & ~(pageSize - 1);
+ void *mapping;
+
+ #if defined(LINUX_ENABLE_NAMED_MMAP)
+ // Try to name the memory region for the executable code,
+ // to aid profilers.
+ int anonFd = anonymousFd();
+ if(anonFd == -1)
+ {
+ // No named file descriptor available: use a plain anonymous mapping.
+ mapping = mmap(nullptr, length, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ }
+ else
+ {
+ // The backing file must be at least as large as the mapping.
+ ensureAnonFileSize(anonFd, length);
+ mapping = mmap(nullptr, length, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE, anonFd, 0);
+ }
+
+ if(mapping == MAP_FAILED)
+ {
+ mapping = nullptr;
+ }
+ #else
+ // Page-aligned, zero-filled block from the regular allocator.
+ mapping = allocate(length, pageSize);
+ #endif
+
+ return mapping;
+}
+
+// Switches the protection of `bytes` bytes at `memory` to read + execute
+// (VirtualProtect on Windows, mprotect elsewhere). The result of the
+// protection call is not checked.
+void markExecutable(void *memory, size_t bytes)
+{
+ #if defined(_WIN32)
+ unsigned long oldProtection;
+ VirtualProtect(memory, bytes, PAGE_EXECUTE_READ, &oldProtection);
+ #else
+ mprotect(memory, bytes, PROT_READ | PROT_EXEC);
+ #endif
+}
+
+// Releases memory obtained from allocateExecutable(). `bytes` is the size
+// that was requested at allocation time.
+void deallocateExecutable(void *memory, size_t bytes)
+{
+ #if defined(_WIN32)
+ // Make the pages writable again before returning them to the allocator.
+ unsigned long oldProtection;
+ VirtualProtect(memory, bytes, PAGE_READWRITE, &oldProtection);
+ deallocate(memory);
+ #elif defined(LINUX_ENABLE_NAMED_MMAP)
+ // Unmap using the same page-rounded length as allocateExecutable().
+ size_t pageSize = memoryPageSize();
+ size_t length = (bytes + pageSize - 1) & ~(pageSize - 1);
+ munmap(memory, length);
+ #else
+ // Make the pages writable again before returning them to the allocator.
+ mprotect(memory, bytes, PROT_READ | PROT_WRITE);
+ deallocate(memory);
+ #endif
+}
+
+// Fills `count` consecutive 16-bit elements starting at `memory` with `element`.
+// A count of zero writes nothing.
+void clear(uint16_t *memory, uint16_t element, size_t count)
+{
+    #if defined(_MSC_VER) && defined(__x86__) && !defined(MEMORY_SANITIZER)
+        __stosw(memory, element, count);
+    #elif defined(__GNUC__) && defined(__x86__) && !defined(MEMORY_SANITIZER)
+        // "rep stosw" advances the destination register and counts the count
+        // register down while it stores, so both are read-write operands, and
+        // the "memory" clobber tells the compiler the buffer was written.
+        // Declaring them as plain inputs lets the optimizer assume the
+        // registers and the buffer are unchanged.
+        __asm__ __volatile__("rep stosw" : "+D"(memory), "+c"(count) : "a"(element) : "memory");
+    #else
+        for(size_t i = 0; i < count; i++)
+        {
+            memory[i] = element;
+        }
+    #endif
+}
+
+// Fills `count` consecutive 32-bit elements starting at `memory` with `element`.
+// A count of zero writes nothing.
+void clear(uint32_t *memory, uint32_t element, size_t count)
+{
+    #if defined(_MSC_VER) && defined(__x86__) && !defined(MEMORY_SANITIZER)
+        __stosd((unsigned long*)memory, element, count);
+    #elif defined(__GNUC__) && defined(__x86__) && !defined(MEMORY_SANITIZER)
+        // "rep stosl" advances the destination register and counts the count
+        // register down while it stores, so both are read-write operands, and
+        // the "memory" clobber tells the compiler the buffer was written.
+        // Declaring them as plain inputs lets the optimizer assume the
+        // registers and the buffer are unchanged.
+        __asm__ __volatile__("rep stosl" : "+D"(memory), "+c"(count) : "a"(element) : "memory");
+    #else
+        for(size_t i = 0; i < count; i++)
+        {
+            memory[i] = element;
+        }
+    #endif
+}
+}
diff --git a/src/System/Memory.hpp b/src/System/Memory.hpp
new file mode 100644
index 0000000..8d3a159
--- /dev/null
+++ b/src/System/Memory.hpp
@@ -0,0 +1,36 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Memory_hpp
+#define Memory_hpp
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace sw
+{
+// Size of a memory page in bytes, as reported by the operating system.
+size_t memoryPageSize();
+
+// Allocates zero-filled memory aligned to `alignment` (a power of two);
+// release it with deallocate().
+void *allocate(size_t bytes, size_t alignment = 16);
+// Releases memory returned by allocate(); a null pointer is accepted.
+void deallocate(void *memory);
+
+void *allocateExecutable(size_t bytes); // Allocates memory that can be made executable using markExecutable()
+// Changes the protection of the given range to read + execute.
+void markExecutable(void *memory, size_t bytes);
+// Releases memory returned by allocateExecutable(); pass the size that was requested.
+void deallocateExecutable(void *memory, size_t bytes);
+
+// Fill `count` elements starting at `memory` with `element`.
+void clear(uint16_t *memory, uint16_t element, size_t count);
+void clear(uint32_t *memory, uint32_t element, size_t count);
+}
+
+#endif // Memory_hpp
diff --git a/src/System/MutexLock.hpp b/src/System/MutexLock.hpp
new file mode 100644
index 0000000..65e9fa4
--- /dev/null
+++ b/src/System/MutexLock.hpp
@@ -0,0 +1,199 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_MutexLock_hpp
+#define sw_MutexLock_hpp
+
+#include "Thread.hpp"
+
+#if defined(__linux__)
+// Use a pthread mutex on Linux. Since many processes may use SwiftShader
+// at the same time it's best to just have the scheduler overhead.
+#include <pthread.h>
+
+namespace sw
+{
+ // Thin wrapper around a pthread mutex created with default attributes.
+ class MutexLock
+ {
+ public:
+     MutexLock()
+     {
+         pthread_mutex_init(&mutex, nullptr);
+     }
+
+     ~MutexLock()
+     {
+         pthread_mutex_destroy(&mutex);
+     }
+
+     // Tries to take the lock without blocking; true when it was acquired.
+     bool attemptLock()
+     {
+         const int status = pthread_mutex_trylock(&mutex);
+
+         return status == 0;
+     }
+
+     // Blocks until the lock has been acquired.
+     void lock()
+     {
+         pthread_mutex_lock(&mutex);
+     }
+
+     // Releases the lock.
+     void unlock()
+     {
+         pthread_mutex_unlock(&mutex);
+     }
+
+ private:
+     pthread_mutex_t mutex;
+ };
+}
+
+#else // !__linux__
+
+#include <atomic>
+
+namespace sw
+{
+ // Spin lock built on an atomic flag. A contended lock() busy-waits with an
+ // exponentially growing number of nop() batches and periodically yields
+ // the thread.
+ class BackoffLock
+ {
+ public:
+ BackoffLock()
+ {
+ mutex = 0;
+ }
+
+ // Tries to take the lock without waiting; true when it was acquired.
+ // The flag is first read and only exchanged when it looked free.
+ bool attemptLock()
+ {
+ if(!isLocked())
+ {
+ if(mutex.exchange(true) == false)
+ {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ // Spins until the lock is acquired. After each failed attempt it runs
+ // `backoff` batches of 35 nop() calls and doubles `backoff`; once
+ // `backoff` exceeds 64 it yields the thread and starts over at 1.
+ void lock()
+ {
+ int backoff = 1;
+
+ while(!attemptLock())
+ {
+ if(backoff <= 64)
+ {
+ for(int i = 0; i < backoff; i++)
+ {
+ nop();
+ nop();
+ nop();
+ nop();
+ nop();
+
+ nop();
+ nop();
+ nop();
+ nop();
+ nop();
+
+ nop();
+ nop();
+ nop();
+ nop();
+ nop();
+
+ nop();
+ nop();
+ nop();
+ nop();
+ nop();
+
+ nop();
+ nop();
+ nop();
+ nop();
+ nop();
+
+ nop();
+ nop();
+ nop();
+ nop();
+ nop();
+
+ nop();
+ nop();
+ nop();
+ nop();
+ nop();
+ }
+
+ backoff *= 2;
+ }
+ else
+ {
+ Thread::yield();
+
+ backoff = 1;
+ }
+ };
+ }
+
+ // Releases the lock with a release store.
+ void unlock()
+ {
+ mutex.store(false, std::memory_order_release);
+ }
+
+ // Reads the flag with an acquire load.
+ bool isLocked()
+ {
+ return mutex.load(std::memory_order_acquire);
+ }
+
+ private:
+ struct
+ {
+ // Ensure that the mutex variable is on its own 64-byte cache line to avoid false sharing
+ // Padding must be public to avoid compiler warnings
+ volatile int padding1[16];
+ std::atomic<bool> mutex;
+ volatile int padding2[15];
+ };
+ };
+
+ using MutexLock = BackoffLock;
+}
+
+#endif // !__linux__
+
+// Scope guard for sw::MutexLock: locks on construction and unlocks on
+// destruction. The pointer overload accepts null, in which case the guard
+// does nothing.
+class LockGuard
+{
+public:
+    explicit LockGuard(sw::MutexLock &mutex) : mutex(&mutex)
+    {
+        this->mutex->lock();
+    }
+
+    explicit LockGuard(sw::MutexLock *mutex) : mutex(mutex)
+    {
+        if(this->mutex)
+        {
+            this->mutex->lock();
+        }
+    }
+
+    ~LockGuard()
+    {
+        if(this->mutex)
+        {
+            this->mutex->unlock();
+        }
+    }
+
+protected:
+    sw::MutexLock *mutex;
+};
+
+#endif // sw_MutexLock_hpp
diff --git a/src/System/Resource.cpp b/src/System/Resource.cpp
new file mode 100644
index 0000000..3a63810
--- /dev/null
+++ b/src/System/Resource.cpp
@@ -0,0 +1,184 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Resource.hpp"
+
+#include "Memory.hpp"
+#include "Debug.hpp"
+
+namespace sw
+{
+ // Creates an unlocked, publicly accessible resource backed by `bytes` bytes
+ // obtained from allocate().
+ Resource::Resource(size_t bytes)
+     : size(bytes),
+       blocked(0),
+       accessor(PUBLIC),
+       count(0),
+       orphaned(false),
+       buffer(allocate(bytes))
+ {
+ }
+
+ // Frees the backing buffer. The destructor is private in the class
+ // declaration; within this file it is reached through `delete this`.
+ Resource::~Resource()
+ {
+ deallocate(buffer);
+ }
+
+ // Acquires the resource for `claimer` and returns the buffer pointer.
+ // Locks nest: if the resource is already held by the same accessor, the
+ // count is simply incremented. If it is held by a different accessor, the
+ // caller waits on `unblock` until the count has dropped to zero.
+ void *Resource::lock(Accessor claimer)
+ {
+ criticalSection.lock();
+
+ while(count > 0 && accessor != claimer)
+ {
+ // Register as a waiter, then wait outside the critical section.
+ blocked++;
+ criticalSection.unlock();
+
+ unblock.wait();
+
+ criticalSection.lock();
+ blocked--;
+ }
+
+ accessor = claimer;
+ count++;
+
+ criticalSection.unlock();
+
+ return buffer;
+ }
+
+ // Releases every lock currently held by `relinquisher`, then acquires the
+ // resource for `claimer` and returns the buffer pointer.
+ // Returns 0 if the release dropped the count to zero on an orphaned resource
+ // with no waiters: the object deletes itself in that case.
+ void *Resource::lock(Accessor relinquisher, Accessor claimer)
+ {
+ criticalSection.lock();
+
+ // Release
+ while(count > 0 && accessor == relinquisher)
+ {
+ count--;
+
+ if(count == 0)
+ {
+ if(blocked)
+ {
+ unblock.signal();
+ }
+ else if(orphaned)
+ {
+ criticalSection.unlock();
+
+ delete this;
+
+ return 0;
+ }
+ }
+ }
+
+ // Acquire
+ while(count > 0 && accessor != claimer)
+ {
+ // Register as a waiter, then wait outside the critical section.
+ blocked++;
+ criticalSection.unlock();
+
+ unblock.wait();
+
+ criticalSection.lock();
+ blocked--;
+ }
+
+ accessor = claimer;
+ count++;
+
+ criticalSection.unlock();
+
+ return buffer;
+ }
+
+ // Releases one level of locking. When the count reaches zero, a waiter is
+ // signalled if there is one; otherwise, if destruct() was called earlier,
+ // the object deletes itself.
+ void Resource::unlock()
+ {
+ criticalSection.lock();
+ ASSERT(count > 0);
+
+ count--;
+
+ if(count == 0)
+ {
+ if(blocked)
+ {
+ unblock.signal();
+ }
+ else if(orphaned)
+ {
+ // Leave the critical section before the object (and the lock in it) goes away.
+ criticalSection.unlock();
+
+ delete this;
+
+ return;
+ }
+ }
+
+ criticalSection.unlock();
+ }
+
+ // Releases every lock currently held by `relinquisher`. Nothing is released
+ // if a different accessor holds the resource. When the count reaches zero, a
+ // waiter is signalled if there is one; otherwise, if destruct() was called
+ // earlier, the object deletes itself.
+ void Resource::unlock(Accessor relinquisher)
+ {
+ criticalSection.lock();
+ ASSERT(count > 0);
+
+ while(count > 0 && accessor == relinquisher)
+ {
+ count--;
+
+ if(count == 0)
+ {
+ if(blocked)
+ {
+ unblock.signal();
+ }
+ else if(orphaned)
+ {
+ // Leave the critical section before the object (and the lock in it) goes away.
+ criticalSection.unlock();
+
+ delete this;
+
+ return;
+ }
+ }
+ }
+
+ criticalSection.unlock();
+ }
+
+ // Requests destruction. If the resource is neither locked nor waited on, it
+ // is deleted immediately. Otherwise it is marked orphaned and the unlock
+ // that drops the count to zero with no waiters deletes it.
+ void Resource::destruct()
+ {
+ criticalSection.lock();
+
+ if(count == 0 && !blocked)
+ {
+ criticalSection.unlock();
+
+ delete this;
+
+ return;
+ }
+
+ orphaned = true;
+
+ criticalSection.unlock();
+ }
+
+ // Returns the buffer pointer without taking any lock.
+ const void *Resource::data() const
+ {
+ return buffer;
+ }
+}
diff --git a/src/System/Resource.hpp b/src/System/Resource.hpp
new file mode 100644
index 0000000..0acfa48
--- /dev/null
+++ b/src/System/Resource.hpp
@@ -0,0 +1,60 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Resource_hpp
+#define sw_Resource_hpp
+
+#include "MutexLock.hpp"
+
+namespace sw
+{
+ // Identifies who holds (or wants to hold) a Resource. Resource::lock() lets
+ // a caller through only when the resource is free or already held by the
+ // same Accessor value.
+ enum Accessor
+ {
+ PUBLIC, // Application/API access
+ PRIVATE, // Renderer access, shared by multiple threads if read-only
+ MANAGED, // Renderer access, shared read/write access if partitioned
+ EXCLUSIVE // NOTE(review): no description in the original — presumably single-owner access; confirm
+ };
+
+ // A buffer whose access is arbitrated between Accessor classes. Locks nest
+ // per accessor. Instances delete themselves, so they must be heap-allocated
+ // and released through destruct().
+ class Resource
+ {
+ public:
+ // Allocates a buffer of `bytes` bytes.
+ Resource(size_t bytes);
+
+ void destruct(); // Asynchronous destructor
+
+ // Acquire for `claimer`; returns the buffer pointer.
+ void *lock(Accessor claimer);
+ // Release all locks held by `relinquisher`, then acquire for `claimer`.
+ // Returns 0 if the release caused an orphaned resource to delete itself.
+ void *lock(Accessor relinquisher, Accessor claimer);
+ // Release one level of locking.
+ void unlock();
+ // Release all locks held by `relinquisher`.
+ void unlock(Accessor relinquisher);
+
+ // Buffer pointer, returned without locking.
+ const void *data() const;
+ // Size in bytes passed to the constructor.
+ const size_t size;
+
+ private:
+ ~Resource(); // Always call destruct() instead
+
+ MutexLock criticalSection; // Guards the bookkeeping fields below
+ Event unblock; // Signalled when the count drops to zero while callers are waiting
+ volatile int blocked; // Number of callers currently waiting in lock()
+
+ volatile Accessor accessor; // Accessor that most recently acquired the lock
+ volatile int count; // Nesting depth of the current lock
+ bool orphaned; // Set by destruct() when deletion had to be deferred
+
+ void *buffer;
+ };
+}
+
+#endif // sw_Resource_hpp
diff --git a/src/System/SharedLibrary.hpp b/src/System/SharedLibrary.hpp
new file mode 100644
index 0000000..8a8c3a1
--- /dev/null
+++ b/src/System/SharedLibrary.hpp
@@ -0,0 +1,171 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SharedLibrary_hpp
+#define SharedLibrary_hpp
+
+#if defined(_WIN32)
+ #include <Windows.h>
+#else
+ #include <dlfcn.h>
+#endif
+
+#include <string>
+
+void *getLibraryHandle(const char *path);
+void *loadLibrary(const char *path);
+void freeLibrary(void *library);
+void *getProcAddress(void *library, const char *name);
+
+// Returns a handle to the first usable library among `names`, each looked up
+// as libraryDirectory + name. A library is usable when mustContainSymbol is
+// null or the library exports that symbol.
+// All names are first tried with getLibraryHandle(), then all names again
+// with loadLibrary(). Returns nullptr when nothing qualifies.
+template<int n>
+void *loadLibrary(const std::string &libraryDirectory, const char *(&names)[n], const char *mustContainSymbol = nullptr)
+{
+    // Pass 1: look the libraries up through getLibraryHandle().
+    for(const char *candidate : names)
+    {
+        const std::string fullPath = libraryDirectory + candidate;
+        void *handle = getLibraryHandle(fullPath.c_str());
+
+        if(!handle)
+        {
+            continue;
+        }
+
+        const bool usable = !mustContainSymbol || getProcAddress(handle, mustContainSymbol);
+
+        if(usable)
+        {
+            return handle;
+        }
+
+        freeLibrary(handle);
+    }
+
+    // Pass 2: load the libraries through loadLibrary().
+    for(const char *candidate : names)
+    {
+        const std::string fullPath = libraryDirectory + candidate;
+        void *handle = loadLibrary(fullPath.c_str());
+
+        if(!handle)
+        {
+            continue;
+        }
+
+        const bool usable = !mustContainSymbol || getProcAddress(handle, mustContainSymbol);
+
+        if(usable)
+        {
+            return handle;
+        }
+
+        freeLibrary(handle);
+    }
+
+    return nullptr;
+}
+
+#if defined(_WIN32)
+ // Win32: loads the library at `path` with LoadLibrary().
+ inline void *loadLibrary(const char *path)
+ {
+ return (void*)LoadLibrary(path);
+ }
+
+ // Win32: queries the module at `path` with GetModuleHandleEx(). `module`
+ // starts out NULL and is returned as-is if the call does not fill it in.
+ // NOTE(review): presumably flag 0 takes a reference that freeLibrary()
+ // later drops, mirroring the POSIX branch — confirm against the Win32 docs.
+ inline void *getLibraryHandle(const char *path)
+ {
+ HMODULE module = NULL;
+ GetModuleHandleEx(0, path, &module);
+ return (void*)module;
+ }
+
+ // Win32: releases a handle with FreeLibrary().
+ inline void freeLibrary(void *library)
+ {
+ FreeLibrary((HMODULE)library);
+ }
+
+ // Win32: looks up `name` in `library` with GetProcAddress().
+ inline void *getProcAddress(void *library, const char *name)
+ {
+ return (void*)GetProcAddress((HMODULE)library, name);
+ }
+
+ // Win32: returns the directory, including the trailing separator, of the
+ // module that contains this code, or "" when it cannot be determined.
+ inline std::string getModuleDirectory()
+ {
+ // Its address identifies the module this function was compiled into.
+ static int dummy_symbol = 0;
+
+ HMODULE module = NULL;
+ GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS, (LPCTSTR)&dummy_symbol, &module);
+
+ char filename[1024];
+ if(module && (GetModuleFileName(module, filename, sizeof(filename)) != 0))
+ {
+ std::string directory(filename);
+ // Keep everything up to and including the last path separator.
+ return directory.substr(0, directory.find_last_of("\\/") + 1).c_str();
+ }
+ else
+ {
+ return "";
+ }
+ }
+#else
+ // POSIX: loads the library at `path` with dlopen(), using lazy binding and
+ // local symbol visibility.
+ inline void *loadLibrary(const char *path)
+ {
+ return dlopen(path, RTLD_LAZY | RTLD_LOCAL);
+ }
+
+ // POSIX: on non-Android builds, returns a handle to `path` only when the
+ // RTLD_NOLOAD probe finds it, and nullptr otherwise. On Android the library
+ // is opened unconditionally.
+ inline void *getLibraryHandle(const char *path)
+ {
+ #ifdef __ANDROID__
+ // bionic doesn't support RTLD_NOLOAD before L
+ return dlopen(path, RTLD_NOW | RTLD_LOCAL);
+ #else
+ // Probe without loading.
+ void *resident = dlopen(path, RTLD_LAZY | RTLD_NOLOAD | RTLD_LOCAL);
+
+ if(resident)
+ {
+ return dlopen(path, RTLD_LAZY | RTLD_LOCAL); // Increment reference count
+ }
+
+ return nullptr;
+ #endif
+ }
+
+ // POSIX: closes a library handle with dlclose(); a null handle is ignored.
+ inline void freeLibrary(void *library)
+ {
+     if(!library)
+     {
+         return;
+     }
+
+     dlclose(library);
+ }
+
+ // POSIX: looks up `name` in `library` with dlsym(). Returns nullptr when
+ // the lookup yields no address.
+ inline void *getProcAddress(void *library, const char *name)
+ {
+     void *address = dlsym(library, name);
+
+     if(address)
+     {
+         return address;
+     }
+
+     (void)dlerror(); // Silence the error
+
+     return nullptr;
+ }
+
+ // POSIX: returns the directory, including the trailing separator, of the
+ // module that contains this code, or "" when dladdr() cannot resolve it.
+ inline std::string getModuleDirectory()
+ {
+     // Its address identifies the module this function was compiled into.
+     static int anchor = 0;
+
+     Dl_info moduleInfo;
+
+     if(dladdr(&anchor, &moduleInfo) == 0)
+     {
+         return "";
+     }
+
+     const std::string modulePath(moduleInfo.dli_fname);
+
+     // Keep everything up to and including the last path separator.
+     const std::string::size_type directoryLength = modulePath.find_last_of("\\/") + 1;
+
+     return modulePath.substr(0, directoryLength).c_str();
+ }
+#endif
+
+#endif // SharedLibrary_hpp
diff --git a/src/System/Socket.cpp b/src/System/Socket.cpp
new file mode 100644
index 0000000..b098031
--- /dev/null
+++ b/src/System/Socket.cpp
@@ -0,0 +1,110 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Socket.hpp"
+
+#if defined(_WIN32)
+ #include <ws2tcpip.h>
+#else
+ #include <unistd.h>
+ #include <netdb.h>
+ #include <netinet/in.h>
+ #include <sys/select.h>
+#endif
+
+namespace sw
+{
+ // Wraps an existing socket handle. The destructor closes whatever handle
+ // is stored.
+ Socket::Socket(SOCKET socket) : socket(socket)
+ {
+ }
+
+ // Creates an IPv4 TCP socket and binds it to address:port.
+ // If address resolution yields nothing, the socket keeps its invalid value
+ // (INVALID_SOCKET on Windows, -1 elsewhere).
+ Socket::Socket(const char *address, const char *port)
+ {
+     #if defined(_WIN32)
+         socket = INVALID_SOCKET;
+     #else
+         socket = -1;
+     #endif
+
+     addrinfo hints = {};
+     hints.ai_family = AF_INET;
+     hints.ai_socktype = SOCK_STREAM;
+     hints.ai_protocol = IPPROTO_TCP;
+     hints.ai_flags = AI_PASSIVE;
+
+     addrinfo *info = 0;
+     getaddrinfo(address, port, &hints, &info);
+
+     if(info)
+     {
+         socket = ::socket(info->ai_family, info->ai_socktype, info->ai_protocol);
+         bind(socket, info->ai_addr, (int)info->ai_addrlen);
+
+         // getaddrinfo() allocates the result list; release it once the
+         // address has been used, otherwise every Socket leaks it.
+         freeaddrinfo(info);
+     }
+ }
+
+ // Closes the socket handle (closesocket on Windows, close elsewhere). The
+ // handle is closed unconditionally, even if it is still the invalid value.
+ Socket::~Socket()
+ {
+ #if defined(_WIN32)
+ closesocket(socket);
+ #else
+ close(socket);
+ #endif
+ }
+
+ // Starts listening on the socket with the given backlog. The result of the
+ // system call is not reported.
+ void Socket::listen(int backlog)
+ {
+ ::listen(socket, backlog);
+ }
+
+ // Waits up to `us` microseconds for the socket to become readable.
+ // Returns true when ::select() reports at least one ready descriptor.
+ bool Socket::select(int us)
+ {
+     fd_set readable;
+     FD_ZERO(&readable);
+     FD_SET(socket, &readable);
+
+     // Split the timeout into whole seconds and leftover microseconds.
+     timeval timeout = {us / 1000000, us % 1000000};
+
+     const int readyCount = ::select(FD_SETSIZE, &readable, 0, 0, &timeout);
+
+     return readyCount >= 1;
+ }
+
+ // Accepts a pending connection and wraps it in a new heap-allocated Socket
+ // that the caller must delete. The result of ::accept() is not checked, so
+ // the returned object may wrap an invalid handle.
+ Socket *Socket::accept()
+ {
+ return new Socket(::accept(socket, 0, 0));
+ }
+
+ // Receives up to `length` bytes into `buffer` and returns the result of
+ // recv() unchanged.
+ int Socket::receive(char *buffer, int length)
+ {
+ return recv(socket, buffer, length, 0);
+ }
+
+ // Passes `length` bytes from `buffer` to ::send(). The result is discarded,
+ // so failures and partial sends are not reported.
+ void Socket::send(const char *buffer, int length)
+ {
+ ::send(socket, buffer, length, 0);
+ }
+
+ // Initializes Winsock 2.2 on Windows; does nothing on other platforms.
+ void Socket::startup()
+ {
+ #if defined(_WIN32)
+ WSADATA winsockData;
+ WSAStartup(MAKEWORD(2, 2), &winsockData);
+ #endif
+ }
+
+ // Calls WSACleanup() on Windows; does nothing on other platforms.
+ void Socket::cleanup()
+ {
+ #if defined(_WIN32)
+ WSACleanup();
+ #endif
+ }
+}
diff --git a/src/System/Socket.hpp b/src/System/Socket.hpp
new file mode 100644
index 0000000..b6b9abd
--- /dev/null
+++ b/src/System/Socket.hpp
@@ -0,0 +1,49 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Socket_hpp
+#define sw_Socket_hpp
+
+#if defined(_WIN32)
+ #include <winsock2.h>
+#else
+ #include <sys/socket.h>
+ typedef int SOCKET;
+#endif
+
+namespace sw
+{
+ // Minimal wrapper around a blocking IPv4 TCP socket. The handle is closed
+ // in the destructor.
+ class Socket
+ {
+ public:
+ // Wraps an existing socket handle.
+ Socket(SOCKET socket);
+ // Creates a socket and binds it to address:port.
+ Socket(const char *address, const char *port);
+ ~Socket();
+
+ void listen(int backlog = 1);
+ // Waits up to `us` microseconds for the socket to become readable.
+ bool select(int us);
+ // Returns a new heap-allocated Socket for the accepted connection.
+ Socket *accept();
+
+ // Returns the result of recv().
+ int receive(char *buffer, int length);
+ void send(const char *buffer, int length);
+
+ // Process-wide setup and teardown; these only do work on Windows.
+ static void startup();
+ static void cleanup();
+
+ private:
+ SOCKET socket;
+ };
+}
+
+#endif // sw_Socket_hpp
diff --git a/src/System/Thread.cpp b/src/System/Thread.cpp
new file mode 100644
index 0000000..df9a0b7
--- /dev/null
+++ b/src/System/Thread.cpp
@@ -0,0 +1,91 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Thread.hpp"
+
+namespace sw
+{
+ // Starts a new thread that runs threadFunction(parameters).
+ // The constructor waits on `init` before returning. startFunction() copies
+ // `entry` and signals `init` before it runs the thread function, so the
+ // stack-allocated `entry` stays alive until the new thread has copied it.
+ Thread::Thread(void (*threadFunction)(void *parameters), void *parameters)
+ {
+ Event init;
+ Entry entry = {threadFunction, parameters, &init};
+
+ #if defined(_WIN32)
+ handle = CreateThread(NULL, 1024 * 1024, startFunction, &entry, 0, NULL);
+ #else
+ pthread_create(&handle, NULL, startFunction, &entry);
+ #endif
+
+ init.wait();
+ }
+
+ // Joins the thread if that has not happened yet, so destruction blocks
+ // until the thread function has returned.
+ Thread::~Thread()
+ {
+ join(); // Make threads exit before deleting them to not block here
+ }
+
+ // Waits for the thread to finish. Only the first call does any work; later
+ // calls return immediately.
+ void Thread::join()
+ {
+     if(hasJoined)
+     {
+         return;
+     }
+
+     #if defined(_WIN32)
+         WaitForSingleObject(handle, INFINITE);
+         CloseHandle(handle);
+     #else
+         pthread_join(handle, NULL);
+     #endif
+
+     hasJoined = true;
+ }
+
+ // Entry point handed to the OS thread API; one variant per platform.
+ // It copies the Entry out of the creator's stack frame, signals `init` so
+ // the Thread constructor can return, then runs the user's thread function.
+ #if defined(_WIN32)
+ unsigned long __stdcall Thread::startFunction(void *parameters)
+ {
+ // Copy before signalling: `parameters` points at the constructor's local.
+ Entry entry = *(Entry*)parameters;
+ entry.init->signal();
+ entry.threadFunction(entry.threadParameters);
+ return 0;
+ }
+ #else
+ void *Thread::startFunction(void *parameters)
+ {
+ // Copy before signalling: `parameters` points at the constructor's local.
+ Entry entry = *(Entry*)parameters;
+ entry.init->signal();
+ entry.threadFunction(entry.threadParameters);
+ return nullptr;
+ }
+ #endif
+
+ // Creates an event that starts out unsignaled. On Windows this is an
+ // unnamed event object; elsewhere it is a condition variable and a mutex
+ // plus a `signaled` flag.
+ // NOTE(review): the second CreateEvent argument (FALSE) presumably selects
+ // auto-reset behavior — confirm against the Win32 documentation.
+ Event::Event()
+ {
+ #if defined(_WIN32)
+ handle = CreateEvent(NULL, FALSE, FALSE, NULL);
+ #else
+ pthread_cond_init(&handle, NULL);
+ pthread_mutex_init(&mutex, NULL);
+ signaled = false;
+ #endif
+ }
+
+ // Releases the OS objects created by the constructor.
+ Event::~Event()
+ {
+ #if defined(_WIN32)
+ CloseHandle(handle);
+ #else
+ pthread_cond_destroy(&handle);
+ pthread_mutex_destroy(&mutex);
+ #endif
+ }
+}
diff --git a/src/System/Thread.hpp b/src/System/Thread.hpp
new file mode 100644
index 0000000..b8280f1
--- /dev/null
+++ b/src/System/Thread.hpp
@@ -0,0 +1,338 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Thread_hpp
+#define sw_Thread_hpp
+
+#if defined(_WIN32)
+ #ifndef WIN32_LEAN_AND_MEAN
+ #define WIN32_LEAN_AND_MEAN
+ #endif
+ #include <windows.h>
+ #include <intrin.h>
+#else
+ #include <pthread.h>
+ #include <sched.h>
+ #include <unistd.h>
+ #define TLS_OUT_OF_INDEXES (pthread_key_t)(~0) // Sentinel for "no key" on pthreads, so the TLS helpers below can test the same name on every platform.
+#endif
+
+#include <stdlib.h>
+
+#if defined(__clang__)
+#if __has_include(<atomic>) // clang has an explicit check for the availability of atomic
+#define USE_STD_ATOMIC 1
+#endif
+// atomic is available in C++11 or newer, and in Visual Studio 2012 or newer
+#elif (defined(_MSC_VER) && (_MSC_VER >= 1700)) || (__cplusplus >= 201103L)
+#define USE_STD_ATOMIC 1
+#endif
+
+#if USE_STD_ATOMIC // Left undefined (evaluates to 0) when neither branch above matched.
+#include <atomic>
+#endif
+
+namespace sw
+{
+ class Event;
+
+ class Thread // Wrapper over a native thread (Win32 or pthreads) plus static thread-local-storage helpers.
+ {
+ public:
+ Thread(void (*threadFunction)(void *parameters), void *parameters); // Starts the thread right away.
+
+ ~Thread(); // Joins the thread if join() was not called before.
+
+ void join(); // Blocks until the thread has finished; only the first call waits.
+
+ static void yield(); // Offers the processor to other threads.
+ static void sleep(int milliseconds); // Suspends the calling thread for the given time.
+
+ #if defined(_WIN32)
+ typedef DWORD LocalStorageKey;
+ #else
+ typedef pthread_key_t LocalStorageKey;
+ #endif
+
+ static LocalStorageKey allocateLocalStorageKey(void (*destructor)(void *storage) = free); // 'destructor' is only forwarded to pthread_key_create; the Win32 path does not use it.
+ static void freeLocalStorageKey(LocalStorageKey key);
+ static void *allocateLocalStorage(LocalStorageKey key, size_t size); // malloc()s 'size' bytes for the calling thread, replacing any previous storage.
+ static void *getLocalStorage(LocalStorageKey key); // Returns the calling thread's storage for 'key'.
+ static void freeLocalStorage(LocalStorageKey key); // free()s the calling thread's storage and clears the slot.
+
+ private:
+ struct Entry // Start-up data handed from the constructor to startFunction.
+ {
+ void (*const threadFunction)(void *parameters);
+ void *threadParameters;
+ Event *init; // Signaled by the new thread once it has copied this struct.
+ };
+
+ #if defined(_WIN32)
+ static unsigned long __stdcall startFunction(void *parameters);
+ HANDLE handle;
+ #else
+ static void *startFunction(void *parameters);
+ pthread_t handle;
+ #endif
+
+ bool hasJoined = false; // Set by join() so that the native thread is waited for only once.
+ };
+
+ class Event // Auto-resetting signal: wait() blocks until signal() is called and then clears the signaled state.
+ {
+ friend class Thread;
+
+ public:
+ Event();
+
+ ~Event();
+
+ void signal(); // Marks the event signaled and wakes a waiter.
+ void wait(); // Blocks until signaled, then resets the event.
+
+ private:
+ #if defined(_WIN32)
+ HANDLE handle;
+ #else
+ pthread_cond_t handle;
+ pthread_mutex_t mutex; // Protects 'signaled'.
+ volatile bool signaled;
+ #endif
+ };
+
+ #if PERF_PROFILE // The exchange helpers are only declared in profiling builds.
+ int64_t atomicExchange(int64_t volatile *target, int64_t value);
+ int atomicExchange(int volatile *target, int value);
+ #endif
+
+ int atomicIncrement(int volatile *value); // Returns the incremented value.
+ int atomicDecrement(int volatile *value); // Returns the decremented value.
+ int atomicAdd(int volatile *target, int value); // Returns the sum after the addition.
+ void nop(); // Executes a single no-operation instruction.
+}
+
+namespace sw
+{
+ inline void Thread::yield() // Offers the processor to other runnable threads using the platform's native call.
+ {
+ #if defined(_WIN32)
+ Sleep(0);
+ #elif defined(__APPLE__)
+ pthread_yield_np();
+ #else
+ sched_yield();
+ #endif
+ }
+
+ inline void Thread::sleep(int milliseconds) // Suspends the calling thread for 'milliseconds'.
+ {
+ #if defined(_WIN32)
+ Sleep(milliseconds);
+ #else
+ usleep(1000 * milliseconds); // NOTE(review): the product is computed in int, and POSIX lets usleep reject arguments of 1000000 or more - confirm callers pass small values.
+ #endif
+ }
+
+ inline Thread::LocalStorageKey Thread::allocateLocalStorageKey(void (*destructor)(void *storage)) // Returns a new TLS key, or TLS_OUT_OF_INDEXES when none could be created.
+ {
+ #if defined(_WIN32)
+ return TlsAlloc(); // 'destructor' is not used on this path.
+ #else
+ LocalStorageKey key;
+ if(pthread_key_create(&key, destructor) != 0) key = TLS_OUT_OF_INDEXES; // Never hand back an indeterminate key; the other helpers test for this sentinel.
+ return key;
+ #endif
+ }
+
+ inline void Thread::freeLocalStorageKey(LocalStorageKey key) // Releases a key obtained from allocateLocalStorageKey(); per-thread storage is not freed here.
+ {
+ #if defined(_WIN32)
+ TlsFree(key);
+ #else
+ pthread_key_delete(key); // Using an invalid key is an error but not undefined behavior.
+ #endif
+ }
+
+ inline void *Thread::allocateLocalStorage(LocalStorageKey key, size_t size) // Gives the calling thread a fresh malloc()ed buffer of 'size' bytes under 'key'.
+ {
+ if(key == TLS_OUT_OF_INDEXES) // No valid key: nothing can be stored.
+ {
+ return nullptr;
+ }
+
+ freeLocalStorage(key); // Drop whatever this thread stored under the key before.
+
+ void *storage = malloc(size); // May be null; the result is stored and returned unchecked.
+
+ #if defined(_WIN32)
+ TlsSetValue(key, storage);
+ #else
+ pthread_setspecific(key, storage);
+ #endif
+
+ return storage;
+ }
+
+ inline void *Thread::getLocalStorage(LocalStorageKey key) // Returns the pointer the calling thread stored under 'key'.
+ {
+ #if defined(_WIN32)
+ return TlsGetValue(key);
+ #else
+ if(key == TLS_OUT_OF_INDEXES) // Avoid undefined behavior.
+ {
+ return nullptr;
+ }
+
+ return pthread_getspecific(key);
+ #endif
+ }
+
+ inline void Thread::freeLocalStorage(LocalStorageKey key) // free()s the calling thread's storage for 'key' and clears the slot.
+ {
+ if(key == TLS_OUT_OF_INDEXES) return; // Nothing is stored under the sentinel, and setting a value for an invalid pthread key is undefined.
+ free(getLocalStorage(key)); // free(nullptr) is harmless when nothing was stored.
+ #if defined(_WIN32)
+ TlsSetValue(key, nullptr);
+ #else
+ pthread_setspecific(key, nullptr);
+ #endif
+ }
+
+ inline void Event::signal() // Puts the event in the signaled state and wakes a waiting thread.
+ {
+ #if defined(_WIN32)
+ SetEvent(handle);
+ #else
+ pthread_mutex_lock(&mutex);
+ signaled = true; // Recorded under the mutex so that a wait() starting later still sees it.
+ pthread_cond_signal(&handle);
+ pthread_mutex_unlock(&mutex);
+ #endif
+ }
+
+ inline void Event::wait() // Blocks until the event is signaled, then resets it.
+ {
+ #if defined(_WIN32)
+ WaitForSingleObject(handle, INFINITE);
+ #else
+ pthread_mutex_lock(&mutex);
+ while(!signaled) pthread_cond_wait(&handle, &mutex); // Loop guards against wake-ups that arrive without 'signaled' being set.
+ signaled = false; // Reset so the next wait() blocks again.
+ pthread_mutex_unlock(&mutex);
+ #endif
+ }
+
+ #if PERF_PROFILE
+ inline int64_t atomicExchange(volatile int64_t *target, int64_t value) // Atomically stores 'value' in *target and returns the previous contents.
+ {
+ #if defined(_WIN32)
+ return InterlockedExchange64(target, value);
+ #else
+ // The compiler builtin returns the full 64-bit previous value and is not tied to
+ // x86 inline assembly; sequential consistency matches a locked exchange.
+ return __atomic_exchange_n(target, value, __ATOMIC_SEQ_CST);
+ #endif
+ }
+
+ inline int atomicExchange(volatile int *target, int value) // 32-bit counterpart: stores 'value' and returns the previous contents.
+ {
+ #if defined(_WIN32)
+ return InterlockedExchange((volatile long*)target, (long)value);
+ #else
+ // Same builtin as the 64-bit overload, so both behave alike on every
+ // architecture the compiler supports.
+ return __atomic_exchange_n(target, value, __ATOMIC_SEQ_CST);
+ #endif
+ }
+ #endif
+
+ inline int atomicIncrement(volatile int *value) // Atomically adds 1 to *value and returns the new value.
+ {
+ #if defined(_WIN32)
+ return InterlockedIncrement((volatile long*)value);
+ #else
+ return __sync_add_and_fetch(value, 1);
+ #endif
+ }
+
+ inline int atomicDecrement(volatile int *value) // Atomically subtracts 1 from *value and returns the new value.
+ {
+ #if defined(_WIN32)
+ return InterlockedDecrement((volatile long*)value);
+ #else
+ return __sync_sub_and_fetch(value, 1);
+ #endif
+ }
+
+ inline int atomicAdd(volatile int* target, int value) // Atomically adds 'value' to *target and returns the resulting sum.
+ {
+ #if defined(_WIN32)
+ return InterlockedExchangeAdd((volatile long*)target, value) + value; // The API yields the old value, so add 'value' to match the other branch.
+ #else
+ return __sync_add_and_fetch(target, value);
+ #endif
+ }
+
+ inline void nop() // Emits a single no-operation instruction.
+ {
+ #if defined(_WIN32)
+ __nop();
+ #else
+ __asm__ __volatile__ ("nop");
+ #endif
+ }
+
+ #if USE_STD_ATOMIC
+ class AtomicInt // Integer with atomic updates, backed by std::atomic<int>.
+ {
+ public:
+ AtomicInt() : ai() {}
+ AtomicInt(int i) : ai(i) {}
+
+ inline operator int() const { return ai.load(std::memory_order_acquire); }
+ inline void operator=(const AtomicInt& i) { ai.store(i.ai.load(std::memory_order_acquire), std::memory_order_release); }
+ inline void operator=(int i) { ai.store(i, std::memory_order_release); }
+ inline void operator--() { ai.fetch_sub(1, std::memory_order_acq_rel); }
+ inline void operator++() { ai.fetch_add(1, std::memory_order_acq_rel); }
+ inline int operator--(int) { return ai.fetch_sub(1, std::memory_order_acq_rel) - 1; } // Returns the value after the decrement.
+ inline int operator++(int) { return ai.fetch_add(1, std::memory_order_acq_rel) + 1; } // Returns the value after the increment.
+ inline void operator-=(int i) { ai.fetch_sub(i, std::memory_order_acq_rel); }
+ inline void operator+=(int i) { ai.fetch_add(i, std::memory_order_acq_rel); }
+ private:
+ std::atomic<int> ai;
+ };
+ #else
+ class AtomicInt // Fallback for compilers without <atomic>, built on the sw::atomic* helpers.
+ {
+ public:
+ AtomicInt() : vi(0) {} // Start from zero like the value-initialized std::atomic variant instead of leaving the counter indeterminate.
+ AtomicInt(int i) : vi(i) {}
+
+ inline operator int() const { return vi; } // Note: this isn't a guaranteed atomic operation
+ inline void operator=(const AtomicInt& i) { sw::atomicExchange(&vi, i.vi); } // NOTE(review): sw::atomicExchange is only declared when PERF_PROFILE is set.
+ inline void operator=(int i) { sw::atomicExchange(&vi, i); }
+ inline void operator--() { sw::atomicDecrement(&vi); }
+ inline void operator++() { sw::atomicIncrement(&vi); }
+ inline int operator--(int) { return sw::atomicDecrement(&vi); } // Returns the value after the decrement.
+ inline int operator++(int) { return sw::atomicIncrement(&vi); } // Returns the value after the increment.
+ inline void operator-=(int i) { sw::atomicAdd(&vi, -i); }
+ inline void operator+=(int i) { sw::atomicAdd(&vi, i); }
+ private:
+ volatile int vi;
+ };
+ #endif
+}
+
+#endif // sw_Thread_hpp
diff --git a/src/System/Timer.cpp b/src/System/Timer.cpp
new file mode 100644
index 0000000..8ff2cf3
--- /dev/null
+++ b/src/System/Timer.cpp
@@ -0,0 +1,95 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Timer.hpp"
+
+#if !defined(__i386__) && defined(_M_IX86) // Map the MSVC architecture macros onto the GCC-style names tested below.
+ #define __i386__ 1
+#endif
+
+#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
+ #define __x86_64__ 1
+#endif
+
+#if defined(_WIN32)
+ #ifndef WIN32_LEAN_AND_MEAN
+ #define WIN32_LEAN_AND_MEAN
+ #endif
+ #include <windows.h>
+ #include <intrin.h>
+#else
+ #include <sys/time.h>
+ #if defined(__i386__) || defined(__x86_64__) // x86 intrinsics are only pulled in on x86 targets.
+ #include <x86intrin.h>
+ #endif
+#endif
+
+namespace sw
+{
+ Timer::Timer() // Nothing to set up.
+ {
+ }
+
+ Timer::~Timer() // Nothing to release.
+ {
+ }
+
+ double Timer::seconds() // Time in seconds: counter()/frequency() on Windows, gettimeofday() elsewhere.
+ {
+ #if defined(_WIN32)
+ return static_cast<double>(counter()) / static_cast<double>(frequency());
+ #else
+ timeval now;
+ gettimeofday(&now, nullptr);
+ return static_cast<double>(now.tv_sec) + static_cast<double>(now.tv_usec) * 1.0e-6;
+ #endif
+ }
+
+ int64_t Timer::ticks() // Returns the processor's time-stamp counter, or 0 on targets without one.
+ {
+ #if defined(_WIN32)
+ return __rdtsc();
+ #elif defined(__i386__) || defined(__x86_64__)
+ // The "=A" asm constraint only denotes the edx:eax pair on 32-bit x86; on x86-64 it
+ // loses half of the counter. The intrinsic from <x86intrin.h> is correct on both.
+ return __rdtsc();
+ #else
+ return 0;
+ #endif
+ }
+
+ int64_t Timer::counter() // Returns a tick count whose rate is given by frequency().
+ {
+ #if defined(_WIN32)
+ int64_t counter;
+ QueryPerformanceCounter((LARGE_INTEGER*)&counter);
+ return counter;
+ #else
+ timeval t;
+ gettimeofday(&t, 0);
+ return (int64_t)t.tv_sec * 1000000 + t.tv_usec; // Widen first: with a 32-bit time_t the multiplication would overflow.
+ #endif
+ }
+
+ int64_t Timer::frequency() // Returns the number of counter() ticks per second.
+ {
+ #if defined(_WIN32)
+ int64_t frequency;
+ QueryPerformanceFrequency((LARGE_INTEGER*)&frequency);
+ return frequency;
+ #else
+ return 1000000; // gettimeofday uses microsecond resolution
+ #endif
+ }
+}
diff --git a/src/System/Timer.hpp b/src/System/Timer.hpp
new file mode 100644
index 0000000..977c877
--- /dev/null
+++ b/src/System/Timer.hpp
@@ -0,0 +1,37 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Timer_hpp
+#define sw_Timer_hpp
+
+#include "Types.hpp"
+
+namespace sw
+{
+ class Timer // Collection of static time queries; instances carry no state.
+ {
+ public:
+ Timer();
+
+ ~Timer();
+
+ static double seconds(); // Time in seconds as a floating-point value.
+ static int64_t ticks(); // Processor time-stamp counter, or 0 where unavailable.
+
+ static int64_t counter(); // Tick count at the rate reported by frequency().
+ static int64_t frequency(); // Ticks per second of counter().
+ };
+}
+
+#endif // sw_Timer_hpp
diff --git a/src/System/Types.hpp b/src/System/Types.hpp
new file mode 100644
index 0000000..cd08ed5
--- /dev/null
+++ b/src/System/Types.hpp
@@ -0,0 +1,157 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Types_hpp
+#define sw_Types_hpp
+
+#include <limits>
+#include <type_traits>
+
+// GCC warns against bitfields not fitting the entire range of an enum with a fixed underlying type of unsigned int, which gets promoted to an error with -Werror and cannot be suppressed.
+// However, GCC already defaults to using unsigned int as the underlying type of an unscoped enum without a fixed underlying type. So we can just omit it.
+#if defined(__GNUC__) && !defined(__clang__)
+namespace {enum E {}; static_assert(!std::numeric_limits<std::underlying_type<E>::type>::is_signed, "expected unscoped enum whose underlying type is not fixed to be unsigned");}
+#define ENUM_UNDERLYING_TYPE_UNSIGNED_INT // Expands to nothing on GCC; the static_assert above checks the default is already unsigned.
+#else
+#define ENUM_UNDERLYING_TYPE_UNSIGNED_INT : unsigned int
+#endif
+
+#if defined(_MSC_VER)
+ typedef signed __int8 int8_t;
+ typedef signed __int16 int16_t;
+ typedef signed __int32 int32_t;
+ typedef signed __int64 int64_t;
+ typedef unsigned __int8 uint8_t;
+ typedef unsigned __int16 uint16_t;
+ typedef unsigned __int32 uint32_t;
+ typedef unsigned __int64 uint64_t;
+ #define ALIGN(bytes, type) __declspec(align(bytes)) type // Alignment specifier goes before the type on MSVC.
+#else
+ #include <stdint.h>
+ #define ALIGN(bytes, type) type __attribute__((aligned(bytes))) // Alignment attribute goes after the type elsewhere.
+#endif
+
+namespace sw
+{
+ typedef ALIGN(1, uint8_t) byte; // Scalar and small-vector aliases; the first ALIGN argument is the alignment in bytes.
+ typedef ALIGN(2, uint16_t) word;
+ typedef ALIGN(4, uint32_t) dword;
+ typedef ALIGN(8, uint64_t) qword;
+ typedef ALIGN(16, uint64_t) qword2[2];
+ typedef ALIGN(4, uint8_t) byte4[4];
+ typedef ALIGN(8, uint8_t) byte8[8];
+ typedef ALIGN(16, uint8_t) byte16[16];
+ typedef ALIGN(8, uint16_t) word4[4];
+ typedef ALIGN(8, uint32_t) dword2[2];
+ typedef ALIGN(16, uint32_t) dword4[4];
+ typedef ALIGN(16, uint64_t) xword[2]; // Same layout as qword2.
+
+ typedef ALIGN(1, int8_t) sbyte; // Signed counterparts.
+ typedef ALIGN(4, int8_t) sbyte4[4];
+ typedef ALIGN(8, int8_t) sbyte8[8];
+ typedef ALIGN(16, int8_t) sbyte16[16];
+ typedef ALIGN(8, short) short4[4];
+ typedef ALIGN(8, unsigned short) ushort4[4];
+ typedef ALIGN(16, short) short8[8];
+ typedef ALIGN(16, unsigned short) ushort8[8];
+ typedef ALIGN(8, int) int2[2];
+ typedef ALIGN(8, unsigned int) uint2[2];
+ typedef ALIGN(16, unsigned int) uint4[4];
+
+ typedef ALIGN(8, float) float2[2];
+
+ ALIGN(16, struct int4 // Four ints with 16-byte alignment.
+ {
+ int x;
+ int y;
+ int z;
+ int w;
+
+ int &operator[](int i) // Component access by index: 0 = x ... 3 = w (relies on the members being laid out contiguously).
+ {
+ return (&x)[i];
+ }
+
+ const int &operator[](int i) const
+ {
+ return (&x)[i];
+ }
+
+ bool operator!=(const int4 &rhs) const // const-qualified so that const objects can be compared too.
+ {
+ return x != rhs.x || y != rhs.y || z != rhs.z || w != rhs.w;
+ }
+
+ bool operator==(const int4 &rhs) const // const-qualified so that const objects can be compared too.
+ {
+ return x == rhs.x && y == rhs.y && z == rhs.z && w == rhs.w;
+ }
+ });
+
+ ALIGN(16, struct float4 // Four floats with 16-byte alignment.
+ {
+ float x;
+ float y;
+ float z;
+ float w;
+
+ float &operator[](int i) // Component access by index: 0 = x ... 3 = w (relies on the members being laid out contiguously).
+ {
+ return (&x)[i];
+ }
+
+ const float &operator[](int i) const
+ {
+ return (&x)[i];
+ }
+
+ bool operator!=(const float4 &rhs) const // const-qualified so that const objects can be compared too.
+ {
+ return x != rhs.x || y != rhs.y || z != rhs.z || w != rhs.w;
+ }
+
+ bool operator==(const float4 &rhs) const // Exact component-wise comparison; const-qualified for const objects.
+ {
+ return x == rhs.x && y == rhs.y && z == rhs.z && w == rhs.w;
+ }
+ });
+
+ inline float4 vector(float x, float y, float z, float w)
+ {
+ // float4 declares no constructors, so it can be brace-initialized
+ // member by member in declaration order: x, y, z, w.
+ float4 packed =
+ {
+ x, y,
+ z, w
+ };
+ return packed;
+ }
+
+ inline float4 replicate(float f)
+ {
+ // Broadcast the scalar into all four components; float4 declares no
+ // constructors, so brace initialization fills x, y, z, w in order.
+ float4 splat =
+ {
+ f, f,
+ f, f
+ };
+ return splat;
+ }
+
+ #define OFFSET(s,m) (int)(size_t)&reinterpret_cast<const volatile char&>((((s*)0)->m)) // Byte offset of member m within struct s, as an int.
+}
+
+#endif // sw_Types_hpp
diff --git a/src/System/Version.h b/src/System/Version.h
new file mode 100644
index 0000000..72bd15d
--- /dev/null
+++ b/src/System/Version.h
@@ -0,0 +1,24 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#define MAJOR_VERSION 4 // The four components below are joined as major.minor.build.revision in VERSION_STRING.
+#define MINOR_VERSION 1
+#define BUILD_VERSION 0
+#define BUILD_REVISION 2
+
+#define STRINGIFY(x) #x // Turns the argument's spelling into a string literal.
+#define MACRO_STRINGIFY(x) STRINGIFY(x) // Extra level so that macro arguments are expanded before being stringified.
+
+#define REVISION_STRING MACRO_STRINGIFY(BUILD_REVISION)
+#define VERSION_STRING MACRO_STRINGIFY(MAJOR_VERSION) "." MACRO_STRINGIFY(MINOR_VERSION) "." MACRO_STRINGIFY(BUILD_VERSION) "." MACRO_STRINGIFY(BUILD_REVISION)
diff --git a/src/WSI/FrameBuffer.cpp b/src/WSI/FrameBuffer.cpp
new file mode 100644
index 0000000..7a8ddc1
--- /dev/null
+++ b/src/WSI/FrameBuffer.cpp
@@ -0,0 +1,638 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBuffer.hpp"
+
+#include "Renderer/Surface.hpp"
+#include "Reactor/Reactor.hpp"
+#include "Common/Timer.hpp"
+#include "Common/Debug.hpp"
+
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#define ASYNCHRONOUS_BLIT false // FIXME: Currently leads to rare race conditions
+
+namespace sw
+{
+ extern bool forceWindowed; // Defined outside this file; when true the constructor treats every frame buffer as windowed.
+
+ FrameBuffer::Cursor FrameBuffer::cursor = {}; // Static member: one cursor shared by all FrameBuffer instances.
+ bool FrameBuffer::topLeftOrigin = false; // Static member: overwritten by every FrameBuffer constructor call.
+
+ FrameBuffer::FrameBuffer(int width, int height, bool fullscreen, bool topLeftOrigin) // Records the target size and starts with no generated blit routine.
+ {
+ this->topLeftOrigin = topLeftOrigin; // Static member, so this applies to every FrameBuffer instance.
+
+ framebuffer = nullptr;
+
+ this->width = width;
+ this->height = height;
+ format = FORMAT_X8R8G8B8;
+ stride = 0;
+
+ windowed = !fullscreen || forceWindowed;
+
+ blitFunction = nullptr;
+ blitRoutine = nullptr;
+ blitState = {}; // All-zero state, so the first copyLocked() generates a routine for any real configuration.
+
+ if(ASYNCHRONOUS_BLIT)
+ {
+ terminate = false;
+ FrameBuffer *parameters = this;
+ blitThread = new Thread(threadFunction, &parameters); // NOTE(review): threadFunction dereferences this pointer to a local, which may be out of scope by the time it runs.
+ }
+ }
+
+ FrameBuffer::~FrameBuffer() // Stops the blit thread (if one was started) and frees the generated routine.
+ {
+ if(ASYNCHRONOUS_BLIT)
+ {
+ terminate = true; // Checked by threadFunction before and after it waits on blitEvent.
+ blitEvent.signal(); // Wake the thread so that it can observe 'terminate'.
+ blitThread->join();
+ delete blitThread;
+ }
+
+ delete blitRoutine;
+ }
+
+ void FrameBuffer::setCursorImage(sw::Surface *cursorImage) // Selects the image drawn as the cursor; nullptr disables cursor drawing.
+ {
+ if(cursorImage)
+ {
+ cursor.image = cursorImage->lockExternal(0, 0, 0, sw::LOCK_READONLY, sw::PUBLIC);
+ cursorImage->unlockExternal(); // NOTE(review): the pointer is kept after unlocking - presumably the surface's storage stays valid; confirm.
+
+ cursor.width = cursorImage->getWidth();
+ cursor.height = cursorImage->getHeight();
+ }
+ else
+ {
+ cursor.width = 0; // A zero-sized cursor makes copyRoutine() leave out the cursor pass; cursor.image is left untouched.
+ cursor.height = 0;
+ }
+ }
+
+ void FrameBuffer::setCursorOrigin(int x0, int y0) // Sets the hotspot, which copy() subtracts from the cursor position.
+ {
+ cursor.hotspotX = x0;
+ cursor.hotspotY = y0;
+ }
+
+ void FrameBuffer::setCursorPosition(int x, int y) // Sets the cursor position used by the next copy().
+ {
+ cursor.positionX = x;
+ cursor.positionY = y;
+ }
+
+ void FrameBuffer::copy(sw::Surface *source) // Converts and copies 'source' into the locked native window buffer, then draws the cursor.
+ {
+ if(!source)
+ {
+ return;
+ }
+
+ if(!lock()) // Nothing to write to if the native buffer cannot be locked.
+ {
+ return;
+ }
+
+ int sourceStride = source->getInternalPitchB();
+
+ updateState = {}; // Zero everything first: copyLocked() compares the whole struct with memcmp.
+ updateState.width = width;
+ updateState.height = height;
+ updateState.destFormat = format;
+ updateState.destStride = stride;
+ updateState.sourceFormat = source->getInternalFormat();
+ updateState.sourceStride = topLeftOrigin ? sourceStride : -sourceStride; // Negative stride walks the source bottom-up.
+ updateState.cursorWidth = cursor.width;
+ updateState.cursorHeight = cursor.height;
+
+ renderbuffer = source->lockInternal(0, 0, 0, sw::LOCK_READONLY, sw::PUBLIC);
+
+ if(!topLeftOrigin)
+ {
+ renderbuffer = (byte*)renderbuffer + (height - 1) * sourceStride; // Start at the last row to pair with the negative stride.
+ }
+
+ cursor.x = cursor.positionX - cursor.hotspotX; // Top-left corner of the cursor image.
+ cursor.y = cursor.positionY - cursor.hotspotY;
+
+ if(ASYNCHRONOUS_BLIT)
+ {
+ blitEvent.signal(); // Hand the work to threadFunction...
+ syncEvent.wait(); // ...and block until it reports completion.
+ }
+ else
+ {
+ copyLocked();
+ }
+
+ source->unlockInternal();
+ unlock();
+
+ profiler.nextFrame(); // Assumes every copy() is a full frame
+ }
+
+ void FrameBuffer::copyLocked() // Runs the blit routine, regenerating it first when the blit parameters have changed.
+ {
+ if(memcmp(&blitState, &updateState, sizeof(BlitState)) != 0) // Bytewise comparison of the cached and requested state.
+ {
+ blitState = updateState;
+ delete blitRoutine;
+
+ blitRoutine = copyRoutine(blitState);
+ blitFunction = (void(*)(void*, void*, Cursor*))blitRoutine->getEntry();
+ }
+
+ blitFunction(framebuffer, renderbuffer, &cursor);
+ }
+
+ Routine *FrameBuffer::copyRoutine(const BlitState &state) // Builds a routine that converts a source image to the destination format and overlays the cursor.
+ {
+ const int width = state.width;
+ const int height = state.height;
+ const int dBytes = Surface::bytes(state.destFormat);
+ const int dStride = state.destStride;
+ const int sBytes = Surface::bytes(state.sourceFormat);
+ const int sStride = state.sourceStride;
+
+ Function<Void(Pointer<Byte>, Pointer<Byte>, Pointer<Byte>)> function; // Generated signature: (destination, source, cursor).
+ {
+ Pointer<Byte> dst(function.Arg<0>());
+ Pointer<Byte> src(function.Arg<1>());
+ Pointer<Byte> cursor(function.Arg<2>());
+
+ For(Int y = 0, y < height, y++)
+ {
+ Pointer<Byte> d = dst + y * dStride;
+ Pointer<Byte> s = src + y * sStride;
+
+ Int x0 = 0;
+
+ switch(state.destFormat) // Resolved while generating: only the matching conversion is emitted.
+ {
+ case FORMAT_X8R8G8B8:
+ case FORMAT_A8R8G8B8:
+ {
+ Int x = x0;
+
+ switch(state.sourceFormat) // Wide loop first; leftover pixels are handled one at a time below.
+ {
+ case FORMAT_X8R8G8B8:
+ case FORMAT_A8R8G8B8:
+ For(, x < width - 3, x += 4)
+ {
+ *Pointer<Int4>(d, 1) = *Pointer<Int4>(s, sStride % 16 ? 1 : 16);
+
+ s += 4 * sBytes;
+ d += 4 * dBytes;
+ }
+ break;
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8B8G8R8:
+ For(, x < width - 3, x += 4)
+ {
+ Int4 bgra = *Pointer<Int4>(s, sStride % 16 ? 1 : 16);
+
+ *Pointer<Int4>(d, 1) = ((bgra & Int4(0x00FF0000)) >> 16) |
+ ((bgra & Int4(0x000000FF)) << 16) |
+ (bgra & Int4(0xFF00FF00));
+
+ s += 4 * sBytes;
+ d += 4 * dBytes;
+ }
+ break;
+ case FORMAT_A16B16G16R16:
+ For(, x < width - 1, x += 2)
+ {
+ Short4 c0 = As<UShort4>(Swizzle(*Pointer<Short4>(s + 0), 0xC6)) >> 8;
+ Short4 c1 = As<UShort4>(Swizzle(*Pointer<Short4>(s + 8), 0xC6)) >> 8;
+
+ *Pointer<Int2>(d) = As<Int2>(PackUnsigned(c0, c1));
+
+ s += 2 * sBytes;
+ d += 2 * dBytes;
+ }
+ break;
+ case FORMAT_R5G6B5:
+ For(, x < width - 3, x += 4)
+ {
+ Int4 rgb = Int4(*Pointer<Short4>(s));
+
+ *Pointer<Int4>(d) = (((rgb & Int4(0xF800)) << 8) | ((rgb & Int4(0xE01F)) << 3)) |
+ (((rgb & Int4(0x07E0)) << 5) | ((rgb & Int4(0x0600)) >> 1)) |
+ (((rgb & Int4(0x001C)) >> 2) | Int4(0xFF000000));
+
+ s += 4 * sBytes;
+ d += 4 * dBytes;
+ }
+ break;
+ default:
+ ASSERT(false);
+ break;
+ }
+
+ For(, x < width, x++)
+ {
+ switch(state.sourceFormat)
+ {
+ case FORMAT_X8R8G8B8:
+ case FORMAT_A8R8G8B8:
+ *Pointer<Int>(d) = *Pointer<Int>(s);
+ break;
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8B8G8R8:
+ {
+ Int rgba = *Pointer<Int>(s);
+
+ *Pointer<Int>(d) = ((rgba & Int(0x00FF0000)) >> 16) |
+ ((rgba & Int(0x000000FF)) << 16) |
+ (rgba & Int(0xFF00FF00));
+ }
+ break;
+ case FORMAT_A16B16G16R16:
+ {
+ Short4 c = As<UShort4>(Swizzle(*Pointer<Short4>(s), 0xC6)) >> 8;
+
+ *Pointer<Int>(d) = Int(As<Int2>(PackUnsigned(c, c)));
+ }
+ break;
+ case FORMAT_R5G6B5:
+ {
+ Int rgb = Int(*Pointer<Short>(s));
+
+ *Pointer<Int>(d) = 0xFF000000 |
+ ((rgb & 0xF800) << 8) | ((rgb & 0xE01F) << 3) |
+ ((rgb & 0x07E0) << 5) | ((rgb & 0x0600) >> 1) |
+ ((rgb & 0x001C) >> 2);
+ }
+ break;
+ default:
+ ASSERT(false);
+ break;
+ }
+
+ s += sBytes;
+ d += dBytes;
+ }
+ }
+ break;
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ {
+ Int x = x0;
+
+ switch(state.sourceFormat) // Wide loop first; leftover pixels are handled one at a time below.
+ {
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8B8G8R8:
+ For(, x < width - 3, x += 4)
+ {
+ *Pointer<Int4>(d, 1) = *Pointer<Int4>(s, sStride % 16 ? 1 : 16);
+
+ s += 4 * sBytes;
+ d += 4 * dBytes;
+ }
+ break;
+ case FORMAT_X8R8G8B8:
+ case FORMAT_A8R8G8B8:
+ For(, x < width - 3, x += 4)
+ {
+ Int4 bgra = *Pointer<Int4>(s, sStride % 16 ? 1 : 16);
+
+ *Pointer<Int4>(d, 1) = ((bgra & Int4(0x00FF0000)) >> 16) |
+ ((bgra & Int4(0x000000FF)) << 16) |
+ (bgra & Int4(0xFF00FF00));
+
+ s += 4 * sBytes;
+ d += 4 * dBytes;
+ }
+ break;
+ case FORMAT_A16B16G16R16:
+ For(, x < width - 1, x += 2)
+ {
+ Short4 c0 = *Pointer<UShort4>(s + 0) >> 8;
+ Short4 c1 = *Pointer<UShort4>(s + 8) >> 8;
+
+ *Pointer<Int2>(d) = As<Int2>(PackUnsigned(c0, c1));
+
+ s += 2 * sBytes;
+ d += 2 * dBytes;
+ }
+ break;
+ case FORMAT_R5G6B5:
+ For(, x < width - 3, x += 4)
+ {
+ Int4 rgb = Int4(*Pointer<Short4>(s));
+
+ *Pointer<Int4>(d) = Int4(0xFF000000) |
+ (((rgb & Int4(0x001F)) << 19) | ((rgb & Int4(0x001C)) << 14)) |
+ (((rgb & Int4(0x07E0)) << 5) | ((rgb & Int4(0x0600)) >> 1)) |
+ (((rgb & Int4(0xF800)) >> 8) | ((rgb & Int4(0xE000)) >> 13));
+
+ s += 4 * sBytes;
+ d += 4 * dBytes;
+ }
+ break;
+ default:
+ ASSERT(false);
+ break;
+ }
+
+ For(, x < width, x++)
+ {
+ switch(state.sourceFormat)
+ {
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8B8G8R8:
+ *Pointer<Int>(d) = *Pointer<Int>(s);
+ break;
+ case FORMAT_X8R8G8B8:
+ case FORMAT_A8R8G8B8:
+ {
+ Int bgra = *Pointer<Int>(s);
+ *Pointer<Int>(d) = ((bgra & Int(0x00FF0000)) >> 16) |
+ ((bgra & Int(0x000000FF)) << 16) |
+ (bgra & Int(0xFF00FF00));
+ }
+ break;
+ case FORMAT_A16B16G16R16:
+ {
+ Short4 c = *Pointer<UShort4>(s) >> 8;
+
+ *Pointer<Int>(d) = Int(As<Int2>(PackUnsigned(c, c)));
+ }
+ break;
+ case FORMAT_R5G6B5:
+ {
+ Int rgb = Int(*Pointer<Short>(s));
+
+ *Pointer<Int>(d) = 0xFF000000 |
+ ((rgb & 0x001F) << 19) | ((rgb & 0x001C) << 14) |
+ ((rgb & 0x07E0) << 5) | ((rgb & 0x0600) >> 1) |
+ ((rgb & 0xF800) >> 8) | ((rgb & 0xE000) >> 13);
+ }
+ break;
+ default:
+ ASSERT(false);
+ break;
+ }
+
+ s += sBytes;
+ d += dBytes;
+ }
+ }
+ break;
+ case FORMAT_R8G8B8:
+ {
+ For(Int x = x0, x < width, x++)
+ {
+ switch(state.sourceFormat) // Destination bytes are written in the order blue, green, red.
+ {
+ case FORMAT_X8R8G8B8:
+ case FORMAT_A8R8G8B8:
+ *Pointer<Byte>(d + 0) = *Pointer<Byte>(s + 0);
+ *Pointer<Byte>(d + 1) = *Pointer<Byte>(s + 1);
+ *Pointer<Byte>(d + 2) = *Pointer<Byte>(s + 2);
+ break;
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8B8G8R8:
+ *Pointer<Byte>(d + 0) = *Pointer<Byte>(s + 2);
+ *Pointer<Byte>(d + 1) = *Pointer<Byte>(s + 1);
+ *Pointer<Byte>(d + 2) = *Pointer<Byte>(s + 0);
+ break;
+ case FORMAT_A16B16G16R16:
+ *Pointer<Byte>(d + 0) = *Pointer<Byte>(s + 5);
+ *Pointer<Byte>(d + 1) = *Pointer<Byte>(s + 3);
+ *Pointer<Byte>(d + 2) = *Pointer<Byte>(s + 1);
+ break;
+ case FORMAT_R5G6B5:
+ {
+ Int rgb = Int(*Pointer<Short>(s));
+
+ *Pointer<Byte>(d + 0) = Byte(((rgb & 0x001F) << 3) | ((rgb & 0x001C) >> 2));
+ *Pointer<Byte>(d + 1) = Byte(((rgb & 0x07E0) >> 3) | ((rgb & 0x0600) >> 9)); // Byte() keeps only bits 0..7, so green must be shifted down into them.
+ *Pointer<Byte>(d + 2) = Byte(((rgb & 0xF800) >> 8) | ((rgb & 0xE000) >> 13)); // Likewise red: 5 bits into 7..3, its top 3 bits replicated into 2..0.
+ }
+ break;
+ default:
+ ASSERT(false);
+ break;
+ }
+
+ s += sBytes;
+ d += dBytes;
+ }
+ }
+ break;
+ case FORMAT_R5G6B5:
+ {
+ For(Int x = x0, x < width, x++)
+ {
+ switch(state.sourceFormat)
+ {
+ case FORMAT_X8R8G8B8:
+ case FORMAT_A8R8G8B8:
+ {
+ Int c = *Pointer<Int>(s);
+
+ *Pointer<Short>(d) = Short((c & 0x00F80000) >> 8 |
+ (c & 0x0000FC00) >> 5 |
+ (c & 0x000000F8) >> 3);
+ }
+ break;
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8B8G8R8:
+ {
+ Int c = *Pointer<Int>(s);
+
+ *Pointer<Short>(d) = Short((c & 0x00F80000) >> 19 |
+ (c & 0x0000FC00) >> 5 |
+ (c & 0x000000F8) << 8);
+ }
+ break;
+ case FORMAT_A16B16G16R16:
+ {
+ Short4 cc = *Pointer<UShort4>(s) >> 8;
+ Int c = Int(As<Int2>(PackUnsigned(cc, cc)));
+
+ *Pointer<Short>(d) = Short((c & 0x00F80000) >> 19 |
+ (c & 0x0000FC00) >> 5 |
+ (c & 0x000000F8) << 8);
+ }
+ break;
+ case FORMAT_R5G6B5:
+ *Pointer<Short>(d) = *Pointer<Short>(s);
+ break;
+ default:
+ ASSERT(false);
+ break;
+ }
+
+ s += sBytes;
+ d += dBytes;
+ }
+ }
+ break;
+ default:
+ ASSERT(false);
+ break;
+ }
+ }
+
+ if(state.cursorWidth > 0 && state.cursorHeight > 0) // The cursor pass is only generated when a cursor image is set.
+ {
+ Int x0 = *Pointer<Int>(cursor + OFFSET(Cursor,x));
+ Int y0 = *Pointer<Int>(cursor + OFFSET(Cursor,y));
+
+ For(Int y1 = 0, y1 < state.cursorHeight, y1++)
+ {
+ Int y = y0 + y1;
+
+ If(y >= 0 && y < height) // Skip cursor rows that fall outside the image.
+ {
+ Pointer<Byte> d = dst + y * dStride + x0 * dBytes;
+ Pointer<Byte> s = src + y * sStride + x0 * sBytes;
+ Pointer<Byte> c = *Pointer<Pointer<Byte>>(cursor + OFFSET(Cursor,image)) + y1 * state.cursorWidth * 4; // Cursor rows are tightly packed at 4 bytes per pixel.
+
+ For(Int x1 = 0, x1 < state.cursorWidth, x1++)
+ {
+ Int x = x0 + x1;
+
+ If(x >= 0 && x < width) // Skip cursor pixels that fall outside the image.
+ {
+ blend(state, d, s, c);
+ }
+
+ c += 4;
+ s += sBytes;
+ d += dBytes;
+ }
+ }
+ }
+ }
+ }
+
+ return function(L"FrameBuffer");
+ }
+
+ void FrameBuffer::blend(const BlitState &state, const Pointer<Byte> &d, const Pointer<Byte> &s, const Pointer<Byte> &c) // Emits code that blends cursor pixel c over source pixel s and writes the result to d.
+ {
+ Short4 c1; // Cursor color, later the blended result.
+ Short4 c2; // Source color.
+
+ c1 = Unpack(*Pointer<Byte4>(c));
+
+ switch(state.sourceFormat) // Bring the source pixel into the same channel order as the cursor pixel.
+ {
+ case FORMAT_X8R8G8B8:
+ case FORMAT_A8R8G8B8:
+ c2 = Unpack(*Pointer<Byte4>(s));
+ break;
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8B8G8R8:
+ c2 = Swizzle(Unpack(*Pointer<Byte4>(s)), 0xC6); // 0xC6 swaps components 0 and 2.
+ break;
+ case FORMAT_A16B16G16R16:
+ c2 = Swizzle(*Pointer<Short4>(s), 0xC6);
+ break;
+ case FORMAT_R5G6B5:
+ {
+ Int rgb(*Pointer<Short>(s));
+ rgb = 0xFF000000 |
+ ((rgb & 0xF800) << 8) | ((rgb & 0xE01F) << 3) |
+ ((rgb & 0x07E0) << 5) | ((rgb & 0x0600) >> 1) |
+ ((rgb & 0x001C) >> 2);
+ c2 = Unpack(As<Byte4>(rgb));
+ }
+ break;
+ default:
+ ASSERT(false);
+ break;
+ }
+
+ c1 = As<Short4>(As<UShort4>(c1) >> 9); // Reduce both colors to 7 bits per channel before the signed multiply.
+ c2 = As<Short4>(As<UShort4>(c2) >> 9);
+
+ Short4 alpha = Swizzle(c1, 0xFF) & Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0x0000); // Cursor alpha broadcast to the color channels; zero for the alpha channel itself.
+
+ c1 = (c1 - c2) * alpha; // source + (cursor - source) * alpha, in fixed point.
+ c1 = c1 >> 7;
+ c1 = c1 + c2;
+ c1 = c1 + c1; // Double to return from 7 to 8 bits per channel.
+
+ switch(state.destFormat)
+ {
+ case FORMAT_X8R8G8B8:
+ case FORMAT_A8R8G8B8:
+ *Pointer<Byte4>(d) = Byte4(PackUnsigned(c1, c1));
+ break;
+ case FORMAT_X8B8G8R8:
+ case FORMAT_A8B8G8R8:
+ case FORMAT_SRGB8_X8:
+ case FORMAT_SRGB8_A8:
+ {
+ c1 = Swizzle(c1, 0xC6);
+
+ *Pointer<Byte4>(d) = Byte4(PackUnsigned(c1, c1));
+ }
+ break;
+ case FORMAT_R8G8B8:
+ {
+ Int c = Int(As<Int2>(PackUnsigned(c1, c1)));
+
+ *Pointer<Byte>(d + 0) = Byte(c >> 0);
+ *Pointer<Byte>(d + 1) = Byte(c >> 8);
+ *Pointer<Byte>(d + 2) = Byte(c >> 16);
+ }
+ break;
+ case FORMAT_R5G6B5:
+ {
+ Int c = Int(As<Int2>(PackUnsigned(c1, c1)));
+
+ *Pointer<Short>(d) = Short((c & 0x00F80000) >> 8 |
+ (c & 0x0000FC00) >> 5 |
+ (c & 0x000000F8) >> 3);
+ }
+ break;
+ default:
+ ASSERT(false);
+ break;
+ }
+ }
+
+ void FrameBuffer::threadFunction(void *parameters) // Blit thread body: performs one copyLocked() per blitEvent until 'terminate' is set.
+ {
+ FrameBuffer *frameBuffer = *static_cast<FrameBuffer**>(parameters); // 'parameters' points at a FrameBuffer* (see the constructor).
+
+ while(!frameBuffer->terminate)
+ {
+ frameBuffer->blitEvent.wait();
+
+ if(!frameBuffer->terminate) // The destructor also signals blitEvent, purely to wake this loop up.
+ {
+ frameBuffer->copyLocked();
+
+ frameBuffer->syncEvent.signal(); // Releases copy(), which is blocked in syncEvent.wait().
+ }
+ }
+ }
+}
diff --git a/src/WSI/FrameBuffer.hpp b/src/WSI/FrameBuffer.hpp
new file mode 100644
index 0000000..dd539e1
--- /dev/null
+++ b/src/WSI/FrameBuffer.hpp
@@ -0,0 +1,106 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_FrameBuffer_hpp
+#define sw_FrameBuffer_hpp
+
+#include "Reactor/Reactor.hpp"
+#include "Renderer/Surface.hpp"
+#include "Common/Thread.hpp"
+
+namespace sw
+{
+ class Surface;
+
+ struct BlitState // Describes one copy routine variant; passed to FrameBuffer::copyRoutine().
+ {
+ int width;
+ int height;
+ Format destFormat; // Pixel format written; FrameBuffer::blend() switches on it.
+ Format sourceFormat;
+ int destStride;
+ int sourceStride;
+ int cursorWidth;
+ int cursorHeight;
+ };
+
+ class [[clang::lto_visibility_public]] FrameBuffer // Abstract base of the per-platform presentation targets.
+ {
+ public:
+ FrameBuffer(int width, int height, bool fullscreen, bool topLeftOrigin);
+
+ virtual ~FrameBuffer() = 0; // Pure virtual, so a definition must still be provided elsewhere.
+
+ virtual void flip(sw::Surface *source) = 0; // Presents 'source'; implemented by each platform back end.
+ virtual void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) = 0;
+
+ virtual void *lock() = 0; // Returns the writable native buffer; back ends return nullptr on failure.
+ virtual void unlock() = 0;
+
+ static void setCursorImage(sw::Surface *cursor);
+ static void setCursorOrigin(int x0, int y0);
+ static void setCursorPosition(int x, int y);
+
+ static Routine *copyRoutine(const BlitState &state);
+
+ protected:
+ void copy(sw::Surface *source); // Called by the back ends' flip()/blit() before presenting.
+
+ bool windowed;
+
+ void *framebuffer; // Native window buffer.
+ int width;
+ int height;
+ int stride;
+ Format format;
+
+ private:
+ void copyLocked();
+
+ static void threadFunction(void *parameters); // Blit thread body: waits on blitEvent, runs copyLocked(), signals syncEvent.
+
+ void *renderbuffer; // Render target buffer.
+
+ struct Cursor
+ {
+ void *image;
+ int x;
+ int y;
+ int width;
+ int height;
+ int hotspotX;
+ int hotspotY;
+ int positionX;
+ int positionY;
+ };
+
+ static Cursor cursor; // Shared by all frame buffers (static).
+
+ void (*blitFunction)(void *dst, void *src, Cursor *cursor);
+ Routine *blitRoutine;
+ BlitState blitState; // State of the current blitRoutine.
+ BlitState updateState; // State of the routine to be generated.
+
+ static void blend(const BlitState &state, const Pointer<Byte> &d, const Pointer<Byte> &s, const Pointer<Byte> &c);
+
+ Thread *blitThread;
+ Event syncEvent; // Signaled by the blit thread after each copyLocked().
+ Event blitEvent; // Waited on by the blit thread; presumably signaled to request a copy.
+ volatile bool terminate; // Polled by threadFunction(). NOTE(review): volatile is not a synchronization primitive; consider std::atomic<bool>.
+
+ static bool topLeftOrigin;
+ };
+}
+
+#endif // sw_FrameBuffer_hpp
diff --git a/src/WSI/FrameBufferAndroid.cpp b/src/WSI/FrameBufferAndroid.cpp
new file mode 100644
index 0000000..0ae5f09
--- /dev/null
+++ b/src/WSI/FrameBufferAndroid.cpp
@@ -0,0 +1,145 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferAndroid.hpp"
+
+#include "Common/GrallocAndroid.hpp"
+
+#include <system/window.h>
+
+namespace sw
+{
+ inline int dequeueBuffer(ANativeWindow* window, ANativeWindowBuffer** buffer)
+ {
+ #if ANDROID_PLATFORM_SDK_VERSION <= 16   // Older SDKs: call the window hook directly.
+     return window->dequeueBuffer(window, buffer);
+ #else   // Newer SDKs: use the dequeue-and-wait helper.
+     return native_window_dequeue_buffer_and_wait(window, buffer);
+ #endif
+ }
+
+ inline int queueBuffer(ANativeWindow* window, ANativeWindowBuffer* buffer, int fenceFd)
+ {
+ #if ANDROID_PLATFORM_SDK_VERSION <= 16   // Older SDKs: the hook has no fence parameter, so 'fenceFd' is dropped.
+     return window->queueBuffer(window, buffer);
+ #else
+     return window->queueBuffer(window, buffer, fenceFd);
+ #endif
+ }
+
+ inline int cancelBuffer(ANativeWindow* window, ANativeWindowBuffer* buffer, int fenceFd)
+ {
+ #if ANDROID_PLATFORM_SDK_VERSION <= 16   // Older SDKs: the hook has no fence parameter, so 'fenceFd' is dropped.
+     return window->cancelBuffer(window, buffer);
+ #else
+     return window->cancelBuffer(window, buffer, fenceFd);
+ #endif
+ }
+
+ FrameBufferAndroid::FrameBufferAndroid(ANativeWindow* window, int width, int height)
+     : FrameBuffer(width, height, false, false)   // fullscreen = false, topLeftOrigin = false.
+     , nativeWindow(window), buffer(nullptr)
+ {
+     nativeWindow->common.incRef(&nativeWindow->common);   // Balanced by decRef() in the destructor.
+     native_window_set_usage(nativeWindow, GRALLOC_USAGE_SW_READ_OFTEN | GRALLOC_USAGE_SW_WRITE_OFTEN);
+ }
+
+ FrameBufferAndroid::~FrameBufferAndroid()
+ {
+ nativeWindow->common.decRef(&nativeWindow->common); // Balances the incRef() taken in the constructor.
+ }
+
+ // Copies 'source' into a dequeued window buffer and queues it for display; the rectangles are ignored.
+ void FrameBufferAndroid::blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+ {
+     copy(source);
+     if(!buffer) return;   // Nothing was dequeued, so there is nothing to present.
+
+     if(framebuffer)   // Still mapped: lock() mapped the buffer but then reported failure.
+     {
+         framebuffer = nullptr;
+         unlock();
+     }
+
+     queueBuffer(nativeWindow, buffer, -1);
+     buffer = nullptr;   // The window owns it again; a later failed dequeue must not queue it twice.
+ }
+
+ // Dequeues and CPU-maps the next window buffer, updating 'format' and 'stride'.
+ // Returns the pixels, or nullptr on dequeue/map failure or an undersized buffer.
+ void *FrameBufferAndroid::lock()
+ {
+     if(dequeueBuffer(nativeWindow, &buffer) != 0) return nullptr;
+
+     const int usage = GRALLOC_USAGE_SW_READ_OFTEN | GRALLOC_USAGE_SW_WRITE_OFTEN;
+     const int status = GrallocModule::getInstance()->lock(buffer->handle, usage, 0, 0, buffer->width, buffer->height, &framebuffer);
+     if(status != 0)
+     {
+         TRACE("%s failed to lock buffer %p", __FUNCTION__, buffer);
+         return nullptr;
+     }
+
+     const bool fitsWindow = (buffer->width >= width) && (buffer->height >= height);
+     if(!fitsWindow)
+     {
+         TRACE("lock failed: buffer of %dx%d too small for window of %dx%d", buffer->width, buffer->height, width, height);
+         return nullptr;   // The buffer stays mapped; blit() unmaps it before queueing.
+     }
+
+     switch(buffer->format)
+     {
+     case HAL_PIXEL_FORMAT_RGB_565:   format = FORMAT_R5G6B5;   break;
+     case HAL_PIXEL_FORMAT_RGBA_8888: format = FORMAT_A8B8G8R8; break;
+     case HAL_PIXEL_FORMAT_BGRA_8888: format = FORMAT_A8R8G8B8; break;
+     case HAL_PIXEL_FORMAT_RGBX_8888:
+ #if ANDROID_PLATFORM_SDK_VERSION > 16
+     case HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED:
+ #endif
+         format = FORMAT_X8B8G8R8; break;
+     case HAL_PIXEL_FORMAT_RGB_888:
+         // Frame buffers are expected to have 16-bit or 32-bit colors, not 24-bit.
+         TRACE("Unsupported frame buffer format RGB_888"); ASSERT(false);
+         format = FORMAT_R8G8B8;   // Wrong component order.
+         break;
+     default:
+         TRACE("Unsupported frame buffer format %d", buffer->format); ASSERT(false);
+         format = FORMAT_NULL;
+         break;
+     }
+
+     stride = buffer->stride * Surface::bytes(format);
+     return framebuffer;
+ }
+
+ // Unmaps the current window buffer; only traces when no buffer is held.
+ void FrameBufferAndroid::unlock()
+ {
+     if(buffer == nullptr)
+     {
+         TRACE("%s: badness unlock with no active buffer", __FUNCTION__);
+         return;
+     }
+     framebuffer = nullptr;
+
+     const int status = GrallocModule::getInstance()->unlock(buffer->handle);
+     if(status != 0)
+     {
+         TRACE("%s: badness unlock failed", __FUNCTION__);
+     }
+ }
+}
+
+sw::FrameBuffer *createFrameBuffer(void *display, ANativeWindow* window, int width, int height)
+{
+ return new sw::FrameBufferAndroid(window, width, height); // 'display' is not used; the result is heap-allocated with new.
+}
diff --git a/src/WSI/FrameBufferAndroid.hpp b/src/WSI/FrameBufferAndroid.hpp
new file mode 100644
index 0000000..b71c32b
--- /dev/null
+++ b/src/WSI/FrameBufferAndroid.hpp
@@ -0,0 +1,47 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_FrameBufferAndroid_hpp
+#define sw_FrameBufferAndroid_hpp
+
+#include "Main/FrameBuffer.hpp"
+#include "Common/Debug.hpp"
+
+struct ANativeWindow;
+struct ANativeWindowBuffer;
+
+namespace sw
+{
+ class FrameBufferAndroid : public FrameBuffer // Presents frames through an ANativeWindow.
+ {
+ public:
+ FrameBufferAndroid(ANativeWindow *window, int width, int height);
+
+ ~FrameBufferAndroid() override;
+
+ void flip(sw::Surface *source) override {blit(source, nullptr, nullptr);}; // Same as blit() without rectangles.
+ void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+ void *lock() override; // Dequeues and maps a window buffer; nullptr on failure.
+ void unlock() override;
+
+ bool setSwapRectangle(int l, int t, int w, int h); // NOTE(review): no definition appears in FrameBufferAndroid.cpp; confirm it is still needed.
+
+ private:
+ ANativeWindow *nativeWindow; // Reference held from construction to destruction.
+ ANativeWindowBuffer *buffer; // Buffer most recently dequeued by lock().
+ };
+}
+
+#endif // sw_FrameBufferAndroid_hpp
diff --git a/src/WSI/FrameBufferDD.cpp b/src/WSI/FrameBufferDD.cpp
new file mode 100644
index 0000000..46ed89f
--- /dev/null
+++ b/src/WSI/FrameBufferDD.cpp
@@ -0,0 +1,510 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferDD.hpp"
+
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+ extern bool forceWindowed;
+
+ GUID secondaryDisplay = {0};   // GUID of the "\\.\DISPLAY2" device; set by enumDisplayCallback().
+
+ // Display enumeration callback: remembers the GUID reported for the driver
+ // named "\\.\DISPLAY2". 'driverDescription', 'context' and 'monitor' are unused.
+ int __stdcall enumDisplayCallback(GUID* guid, char *driverDescription, char *driverName, void *context, HMONITOR monitor)
+ {
+     const bool isSecondDisplay = (strcmp(driverName, "\\\\.\\DISPLAY2") == 0);
+     if(isSecondDisplay) secondaryDisplay = *guid;
+
+     return 1;   // Always 1 (presumably "continue enumerating").
+ }
+
+ // Loads ddraw.dll and initializes DirectDraw for windowed or fullscreen use.
+ FrameBufferDD::FrameBufferDD(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin) : FrameBufferWin(windowHandle, width, height, fullscreen, topLeftOrigin)
+ {
+     directDraw = nullptr;
+     frontBuffer = nullptr;
+     backBuffer = nullptr;
+     framebuffer = nullptr;
+     // NOTE(review): neither the module handle nor the entry points are checked.
+     ddraw = LoadLibrary("ddraw.dll");
+     DirectDrawCreate = reinterpret_cast<DIRECTDRAWCREATE>(GetProcAddress(ddraw, "DirectDrawCreate"));
+     DirectDrawEnumerateExA = reinterpret_cast<DIRECTDRAWENUMERATEEXA>(GetProcAddress(ddraw, "DirectDrawEnumerateExA"));
+
+     if(windowed)
+     {
+         initWindowed();
+     }
+     else
+     {
+         initFullscreen();
+     }
+ }
+
+ FrameBufferDD::~FrameBufferDD()
+ {
+ releaseAll(); // Release the surfaces and the DirectDraw object before unloading the DLL.
+
+ FreeLibrary(ddraw);
+ }
+
+ // (Re)creates 'frontBuffer' and 'backBuffer' for the current mode; either may be null afterwards on failure.
+ void FrameBufferDD::createSurfaces()
+ {
+     if(backBuffer)
+     {
+         backBuffer->Release();
+         backBuffer = 0;
+     }
+
+     if(frontBuffer)
+     {
+         frontBuffer->Release();
+         frontBuffer = 0;
+     }
+
+     if(!windowed)
+     {
+         DDSURFACEDESC surfaceDescription = {0};
+         surfaceDescription.dwSize = sizeof(surfaceDescription);
+         surfaceDescription.dwFlags = DDSD_CAPS | DDSD_BACKBUFFERCOUNT;
+         surfaceDescription.ddsCaps.dwCaps = DDSCAPS_PRIMARYSURFACE | DDSCAPS_FLIP | DDSCAPS_COMPLEX;
+         surfaceDescription.dwBackBufferCount = 1;
+         directDraw->CreateSurface(&surfaceDescription, &frontBuffer, 0);
+
+         if(frontBuffer)
+         {
+             DDSCAPS surfaceCapabilties = {0};
+             surfaceCapabilties.dwCaps = DDSCAPS_BACKBUFFER;
+             frontBuffer->GetAttachedSurface(&surfaceCapabilties, &backBuffer);
+             if(backBuffer) backBuffer->AddRef();   // Still null if no attached back buffer was returned.
+         }
+     }
+     else
+     {
+         IDirectDrawClipper *clipper = nullptr;
+
+         DDSURFACEDESC ddsd = {0};
+         ddsd.dwSize = sizeof(ddsd);
+         ddsd.dwFlags = DDSD_CAPS;
+         ddsd.ddsCaps.dwCaps = DDSCAPS_PRIMARYSURFACE;
+
+         long result = directDraw->CreateSurface(&ddsd, &frontBuffer, 0);
+         directDraw->GetDisplayMode(&ddsd);
+
+         switch(ddsd.ddpfPixelFormat.dwRGBBitCount)
+         {
+         case 32: format = FORMAT_X8R8G8B8; break;
+         case 24: format = FORMAT_R8G8B8; break;
+         case 16: format = FORMAT_R5G6B5; break;
+         default: format = FORMAT_NULL; break;
+         }
+
+         if((result != DD_OK && result != DDERR_PRIMARYSURFACEALREADYEXISTS) || (format == FORMAT_NULL))
+         {
+             assert(!"Failed to initialize graphics: Incompatible display mode.");
+         }
+         else
+         {
+             ddsd.dwFlags = DDSD_CAPS | DDSD_WIDTH | DDSD_HEIGHT;
+             ddsd.ddsCaps.dwCaps = DDSCAPS_OFFSCREENPLAIN;
+             ddsd.dwWidth = width;
+             ddsd.dwHeight = height;
+             directDraw->CreateSurface(&ddsd, &backBuffer, 0);
+
+             directDraw->CreateClipper(0, &clipper, 0);
+             if(clipper) clipper->SetHWnd(0, windowHandle);
+             if(clipper && frontBuffer) frontBuffer->SetClipper(clipper);   // frontBuffer is null if the primary surface was not created (e.g. it already exists).
+             if(clipper) clipper->Release();
+         }
+     }
+ }
+
+ // Ensures both surfaces exist and are not lost, creating or restoring them
+ // as needed. Returns true only if both are usable afterwards.
+ bool FrameBufferDD::readySurfaces()
+ {
+     if(!frontBuffer || !backBuffer)
+     {
+         createSurfaces();
+     }
+
+     // Creation can fail and leave a null pointer behind.
+     if(!frontBuffer || !backBuffer) return false;
+
+     if(frontBuffer->IsLost() || backBuffer->IsLost())
+     {
+         restoreSurfaces();
+     }
+
+     // restoreSurfaces() may have recreated the surfaces, so check them again.
+     if(!frontBuffer || !backBuffer)
+     {
+         return false;
+     }
+
+     return !frontBuffer->IsLost() && !backBuffer->IsLost();
+ }
+
+ // Re-targets the primary surface's clipper at 'windowOverride' (or at the
+ // frame buffer's own window when it is null). Windowed mode only.
+ void FrameBufferDD::updateClipper(HWND windowOverride)
+ {
+     if(!windowed || !frontBuffer) return;
+
+     IDirectDrawClipper *clipper = nullptr;
+     frontBuffer->GetClipper(&clipper);
+
+     if(clipper)   // Stays null when GetClipper() does not return a clipper.
+     {
+         clipper->SetHWnd(0, windowOverride ? windowOverride : windowHandle);
+         clipper->Release();
+     }
+ }
+
+ // Tries to restore both surfaces; if either Restore() call fails they are
+ // created again from scratch.
+ void FrameBufferDD::restoreSurfaces()
+ {
+     const long frontResult = frontBuffer->Restore();
+     const long backResult = backBuffer->Restore();
+     const bool restored = (frontResult == DD_OK) && (backResult == DD_OK);
+
+     if(!restored) createSurfaces();   // Surfaces could not be restored; recreate them
+ }
+
+ void FrameBufferDD::initFullscreen() // Creates DirectDraw in exclusive fullscreen mode and sets a width x height display mode.
+ {
+ releaseAll();
+
+ if(true) // Render to primary display
+ {
+ DirectDrawCreate(0, &directDraw, 0);
+ }
+ else // Render to secondary display (unreachable while the condition above is hard-coded)
+ {
+ DirectDrawEnumerateEx(&enumDisplayCallback, 0, DDENUM_ATTACHEDSECONDARYDEVICES);
+ DirectDrawCreate(&secondaryDisplay, &directDraw, 0);
+ }
+
+ directDraw->SetCooperativeLevel(windowHandle, DDSCL_EXCLUSIVE | DDSCL_FULLSCREEN);
+
+ long result;
+
+ do // Try 32, then 24, then 16 bits per pixel; repeat until SetDisplayMode() succeeds.
+ {
+ format = FORMAT_X8R8G8B8;
+ result = directDraw->SetDisplayMode(width, height, 32);
+
+ if(result == DDERR_INVALIDMODE)
+ {
+ format = FORMAT_R8G8B8;
+ result = directDraw->SetDisplayMode(width, height, 24);
+
+ if(result == DDERR_INVALIDMODE)
+ {
+ format = FORMAT_R5G6B5;
+ result = directDraw->SetDisplayMode(width, height, 16);
+
+ if(result == DDERR_INVALIDMODE)
+ {
+ assert(!"Failed to initialize graphics: Display mode not supported."); // NOTE(review): if the assert is compiled out, the loop keeps retrying the same unsupported modes.
+ }
+ }
+ }
+
+ if(result != DD_OK)
+ {
+ Sleep(1); // Wait briefly before the next attempt.
+ }
+ }
+ while(result != DD_OK);
+
+ createSurfaces();
+
+ updateBounds(windowHandle);
+ }
+
+ void FrameBufferDD::initWindowed() // Creates DirectDraw at the normal cooperative level for windowed presentation.
+ {
+ releaseAll(); // Drop any previously created DirectDraw objects first.
+
+ DirectDrawCreate(0, &directDraw, 0);
+ directDraw->SetCooperativeLevel(windowHandle, DDSCL_NORMAL);
+
+ createSurfaces();
+
+ updateBounds(windowHandle);
+ }
+
+ // Copies 'source' into the back buffer and presents it: a blit to the window
+ // bounds when windowed, a page flip otherwise. Does nothing further if the
+ // surfaces cannot be made ready.
+ void FrameBufferDD::flip(sw::Surface *source)
+ {
+     copy(source);
+
+     if(!readySurfaces())
+     {
+         return;
+     }
+
+     long result;
+
+     // Retry for as long as DirectDraw reports that it was still drawing.
+     do
+     {
+         if(windowed)
+         {
+             result = frontBuffer->Blt(&bounds, backBuffer, 0, DDBLT_WAIT, 0);
+         }
+         else
+         {
+             result = frontBuffer->Flip(0, DDFLIP_NOVSYNC);
+         }
+
+         if(result == DDERR_WASSTILLDRAWING) Sleep(0);
+     }
+     while(result == DDERR_WASSTILLDRAWING);
+ }
+
+ // Copies 'source' into the back buffer, then blits the 'sourceRect' portion
+ // of the back buffer to 'destRect', which is relative to the window bounds.
+ // A null 'destRect' selects the whole width x height area; a null
+ // 'sourceRect' is passed through to Blt() unchanged.
+ void FrameBufferDD::blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+ {
+     copy(source);
+
+     if(!readySurfaces())
+     {
+         return;
+     }
+
+     // Destination extent relative to the top-left corner of the bounds.
+     const int x0 = destRect ? destRect->x0 : 0;
+     const int y0 = destRect ? destRect->y0 : 0;
+     const int x1 = destRect ? destRect->x1 : width;
+     const int y1 = destRect ? destRect->y1 : height;
+
+     RECT dRect;
+     dRect.left = bounds.left + x0;
+     dRect.top = bounds.top + y0;
+     dRect.right = bounds.left + x1;
+     dRect.bottom = bounds.top + y1;
+
+     long result;
+
+     do
+     {
+         result = frontBuffer->Blt(&dRect, backBuffer, (LPRECT)sourceRect, DDBLT_WAIT, 0);
+
+         if(result == DDERR_WASSTILLDRAWING)
+         {
+             Sleep(0);
+         }
+     }
+     while(result == DDERR_WASSTILLDRAWING);
+ }
+
+ void FrameBufferDD::flip(HWND windowOverride, sw::Surface *source) // Flips after re-targeting the clipper and bounds at 'windowOverride'.
+ {
+ updateClipper(windowOverride); // Falls back to the frame buffer's own window when the override is null.
+ updateBounds(windowOverride);
+
+ flip(source);
+ }
+
+ void FrameBufferDD::blit(HWND windowOverride, sw::Surface *source, const Rect *sourceRect, const Rect *destRect) // Blits after re-targeting the clipper and bounds at 'windowOverride'.
+ {
+ updateClipper(windowOverride); // Falls back to the frame buffer's own window when the override is null.
+ updateBounds(windowOverride);
+
+ blit(source, sourceRect, destRect);
+ }
+
+ // Copies the front buffer row by row into 'destBuffer' as tightly packed rows
+ // of 4 * width bytes. Does nothing if the surfaces are unusable or the lock fails.
+ void FrameBufferDD::screenshot(void *destBuffer)
+ {
+     if(!readySurfaces()) return;
+
+     DDSURFACEDESC DDSD;
+     DDSD.dwSize = sizeof(DDSD);
+
+     if(frontBuffer->Lock(0, &DDSD, DDLOCK_WAIT, 0) != DD_OK)
+     {
+         return;
+     }
+
+     const int columns = DDSD.dwWidth;
+     const int rows = DDSD.dwHeight;
+     const int pitch = DDSD.lPitch;
+     const int rowBytes = columns * 4;   // FIXME: Assumes 32-bit buffer
+
+     const char *src = static_cast<const char*>(DDSD.lpSurface);
+     char *dst = static_cast<char*>(destBuffer);
+
+     for(int y = 0; y < rows; y++)
+     {
+         memcpy(dst, src, rowBytes);
+         src += pitch;
+         dst += rowBytes;
+     }
+
+     frontBuffer->Unlock(0);
+ }
+
+ // Applies 'gammaRamp' via the front buffer's gamma control; 'calibrate' selects DDSGR_CALIBRATE.
+ void FrameBufferDD::setGammaRamp(GammaRamp *gammaRamp, bool calibrate)
+ {
+     if(!frontBuffer) return;
+
+     IDirectDrawGammaControl *gammaControl = 0;
+     frontBuffer->QueryInterface(IID_IDirectDrawGammaControl, (void**)&gammaControl);
+
+     if(!gammaControl)
+     {
+         return;   // The surface did not provide a gamma control interface.
+     }
+
+     gammaControl->SetGammaRamp(calibrate ? DDSGR_CALIBRATE : 0, (DDGAMMARAMP*)gammaRamp);
+     gammaControl->Release();
+ }
+
+ // Reads the current gamma ramp into 'gammaRamp' via the front buffer's gamma control.
+ void FrameBufferDD::getGammaRamp(GammaRamp *gammaRamp)
+ {
+     if(!frontBuffer) return;
+
+     IDirectDrawGammaControl *gammaControl = 0;
+     frontBuffer->QueryInterface(IID_IDirectDrawGammaControl, (void**)&gammaControl);
+
+     if(!gammaControl)
+     {
+         return;   // The surface did not provide a gamma control interface.
+     }
+
+     gammaControl->GetGammaRamp(0, (DDGAMMARAMP*)gammaRamp);
+     gammaControl->Release();
+ }
+
+ // Locks the back buffer and returns its pixels, refreshing 'width', 'height'
+ // and 'stride' from the surface description. Returns the existing pointer if
+ // already locked, or nullptr if the surfaces are unusable or the lock fails.
+ void *FrameBufferDD::lock()
+ {
+     if(framebuffer)
+     {
+         return framebuffer;   // Already locked.
+     }
+
+     if(!readySurfaces())
+     {
+         return nullptr;
+     }
+
+     DDSURFACEDESC DDSD;
+     DDSD.dwSize = sizeof(DDSD);
+
+     if(backBuffer->Lock(0, &DDSD, DDLOCK_WAIT, 0) != DD_OK)
+     {
+         return nullptr;
+     }
+
+     width = DDSD.dwWidth;
+     height = DDSD.dwHeight;
+     stride = DDSD.lPitch;
+     framebuffer = DDSD.lpSurface;
+
+     return framebuffer;
+ }
+
+ void FrameBufferDD::unlock()
+ {
+     if(framebuffer && backBuffer)   // Only when lock() currently holds the back buffer.
+     {
+         backBuffer->Unlock(0);
+         framebuffer = nullptr;
+     }
+ }
+
+ // Draws printf-style formatted text (white on blue) onto the back buffer at
+ // (x, y). Output that does not fit the 256-byte buffer is truncated.
+ void FrameBufferDD::drawText(int x, int y, const char *string, ...)
+ {
+     char buffer[256];
+
+     va_list arglist;
+     va_start(arglist, string);
+     vsnprintf(buffer, sizeof(buffer), string, arglist);   // Bounded, unlike vsprintf().
+     va_end(arglist);
+
+     HDC hdc = 0;
+     if(!backBuffer || backBuffer->GetDC(&hdc) != DD_OK) return;   // No surface or no device context to draw on.
+
+     SetBkColor(hdc, RGB(0, 0, 255));
+     SetTextColor(hdc, RGB(255, 255, 255));
+     TextOut(hdc, x, y, buffer, lstrlen(buffer));
+
+     backBuffer->ReleaseDC(hdc);
+ }
+
+ // Queries the current scan line. Returns false when the query is reported
+ // as unsupported, true otherwise. 'inVerticalBlank' is set for the two
+ // expected result codes and left untouched for any other one.
+ bool FrameBufferDD::getScanline(bool &inVerticalBlank, unsigned int &scanline)
+ {
+     const HRESULT result = directDraw->GetScanLine((unsigned long*)&scanline);
+
+     if(result == DDERR_UNSUPPORTED)
+     {
+         return false;
+     }
+
+     if(result == DD_OK || result == DDERR_VERTICALBLANKINPROGRESS)
+     {
+         inVerticalBlank = (result == DDERR_VERTICALBLANKINPROGRESS);
+     }
+     else ASSERT(false);   // Unexpected result code.
+
+     return true;
+ }
+
+ // Unlocks the back buffer, then releases both surfaces and the DirectDraw
+ // object (after resetting its cooperative level to normal). All three
+ // pointers are null afterwards.
+ void FrameBufferDD::releaseAll()
+ {
+     unlock();
+
+     // Back buffer.
+     if(backBuffer) backBuffer->Release();
+     backBuffer = nullptr;
+
+     // Front buffer.
+     if(frontBuffer) frontBuffer->Release();
+     frontBuffer = nullptr;
+
+     // DirectDraw object itself.
+     if(directDraw)
+     {
+         directDraw->SetCooperativeLevel(0, DDSCL_NORMAL);
+         directDraw->Release();
+         directDraw = nullptr;
+     }
+ }
+}
diff --git a/src/WSI/FrameBufferDD.hpp b/src/WSI/FrameBufferDD.hpp
new file mode 100644
index 0000000..22d76c9
--- /dev/null
+++ b/src/WSI/FrameBufferDD.hpp
@@ -0,0 +1,69 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_FrameBufferDD_hpp
+#define sw_FrameBufferDD_hpp
+
+#include "FrameBufferWin.hpp"
+
+#include <ddraw.h>
+
+namespace sw
+{
+ class FrameBufferDD : public FrameBufferWin // Windows frame buffer presented through DirectDraw.
+ {
+ public:
+ FrameBufferDD(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin);
+
+ ~FrameBufferDD() override;
+
+ void flip(sw::Surface *source) override;
+ void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+ void flip(HWND windowOverride, sw::Surface *source) override; // Variants that present to another window.
+ void blit(HWND windowOverride, sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+ void *lock() override; // Locks the back buffer; nullptr on failure.
+ void unlock() override;
+
+ void setGammaRamp(GammaRamp *gammaRamp, bool calibrate) override;
+ void getGammaRamp(GammaRamp *gammaRamp) override;
+
+ void screenshot(void *destBuffer) override; // Copies the front buffer; assumes 32-bit pixels.
+ bool getScanline(bool &inVerticalBlank, unsigned int &scanline) override;
+
+ void drawText(int x, int y, const char *string, ...); // printf-style text drawn onto the back buffer.
+
+ private:
+ void initFullscreen();
+ void initWindowed();
+ void createSurfaces();
+ bool readySurfaces(); // True when both surfaces exist and are not lost.
+ void updateClipper(HWND windowOverride);
+ void restoreSurfaces();
+ void releaseAll();
+
+ HMODULE ddraw; // Handle of ddraw.dll, loaded in the constructor and freed in the destructor.
+ typedef HRESULT (WINAPI *DIRECTDRAWCREATE)( GUID FAR *lpGUID, LPDIRECTDRAW FAR *lplpDD, IUnknown FAR *pUnkOuter );
+ HRESULT (WINAPI *DirectDrawCreate)( GUID FAR *lpGUID, LPDIRECTDRAW FAR *lplpDD, IUnknown FAR *pUnkOuter ); // Resolved with GetProcAddress().
+ typedef HRESULT (WINAPI *DIRECTDRAWENUMERATEEXA)( LPDDENUMCALLBACKEXA lpCallback, LPVOID lpContext, DWORD dwFlags);
+ HRESULT (WINAPI *DirectDrawEnumerateExA)( LPDDENUMCALLBACKEXA lpCallback, LPVOID lpContext, DWORD dwFlags); // Resolved with GetProcAddress().
+
+ IDirectDraw *directDraw;
+ IDirectDrawSurface *frontBuffer; // Primary surface.
+ IDirectDrawSurface *backBuffer; // Surface locked by lock() and copied/flipped to the front buffer.
+ };
+}
+
+#endif // sw_FrameBufferDD_hpp
diff --git a/src/WSI/FrameBufferGDI.cpp b/src/WSI/FrameBufferGDI.cpp
new file mode 100644
index 0000000..90a469e
--- /dev/null
+++ b/src/WSI/FrameBufferGDI.cpp
@@ -0,0 +1,162 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferGDI.hpp"
+
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+ extern bool forceWindowed;
+
+ // Creates a GDI-backed frame buffer. In fullscreen mode the window is made
+ // topmost and the display is switched to width x height first.
+ FrameBufferGDI::FrameBufferGDI(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin) : FrameBufferWin(windowHandle, width, height, fullscreen, topLeftOrigin)
+ {
+     if(!windowed)
+     {
+         SetWindowPos(windowHandle, HWND_TOPMOST, 0, 0, width, height, SWP_SHOWWINDOW);
+
+         DEVMODE deviceMode = {};   // Zero-initialized so no indeterminate field (e.g. dmDriverExtra) reaches the API.
+         deviceMode.dmSize = sizeof(DEVMODE);
+         deviceMode.dmPelsWidth = width;
+         deviceMode.dmPelsHeight = height;
+         deviceMode.dmFields = DM_PELSWIDTH | DM_PELSHEIGHT;
+         ChangeDisplaySettings(&deviceMode, CDS_FULLSCREEN);
+     }
+
+     init(this->windowHandle);
+     format = FORMAT_X8R8G8B8;
+ }
+
+ // Releases the GDI objects; when fullscreen, also restores the display mode and centers the window on the desktop.
+ FrameBufferGDI::~FrameBufferGDI()
+ {
+     release();
+
+     if(windowed) return;
+
+     ChangeDisplaySettings(0, 0);
+
+     RECT clientRect;
+     RECT windowRect;
+     GetClientRect(windowHandle, &clientRect);
+     GetWindowRect(windowHandle, &windowRect);
+     const int windowWidth = width + (windowRect.right - windowRect.left) - (clientRect.right - clientRect.left);
+     const int windowHeight = height + (windowRect.bottom - windowRect.top) - (clientRect.bottom - clientRect.top);
+     const int left = GetSystemMetrics(SM_CXSCREEN) / 2 - windowWidth / 2;
+     const int top = GetSystemMetrics(SM_CYSCREEN) / 2 - windowHeight / 2;
+     SetWindowPos(windowHandle, HWND_TOP, left, top, windowWidth, windowHeight, SWP_SHOWWINDOW);
+ }
+
+ // Returns the DIB section pixels; rows are 4 * width bytes apart.
+ void *FrameBufferGDI::lock()
+ {
+     stride = 4 * width;
+     return framebuffer;
+ }
+
+ void FrameBufferGDI::unlock() // Intentionally empty: lock() acquires nothing that needs releasing.
+ {
+ }
+
+ void FrameBufferGDI::flip(sw::Surface *source) // Same as blit() without source/destination rectangles.
+ {
+ blit(source, nullptr, nullptr);
+ }
+
+ // Copies 'source' into the bitmap and stretch-blits it to the window. Null rectangles select the whole bitmap / whole bounds.
+ void FrameBufferGDI::blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+ {
+     copy(source);
+     const int srcX = sourceRect ? sourceRect->x0 : 0;
+     const int srcY = sourceRect ? sourceRect->y0 : 0;
+     const int srcW = sourceRect ? sourceRect->x1 - sourceRect->x0 : width;
+     const int srcH = sourceRect ? sourceRect->y1 - sourceRect->y0 : height;
+     const int dstX = destRect ? destRect->x0 : 0;
+     const int dstY = destRect ? destRect->y0 : 0;
+     const int dstW = destRect ? destRect->x1 - destRect->x0 : bounds.right - bounds.left;
+     const int dstH = destRect ? destRect->y1 - destRect->y0 : bounds.bottom - bounds.top;
+
+     StretchBlt(windowContext, dstX, dstY, dstW, dstH, bitmapContext, srcX, srcY, srcW, srcH, SRCCOPY);
+ }
+
+ void FrameBufferGDI::flip(HWND windowOverride, sw::Surface *source) // Same as the override blit() without rectangles.
+ {
+ blit(windowOverride, source, nullptr, nullptr);
+ }
+
+ void FrameBufferGDI::blit(HWND windowOverride, sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+ {
+     const bool keepTarget = !windowed || windowOverride == 0 || windowOverride == bitmapWindow;
+     if(!keepTarget)   // Windowed and a different window requested: rebuild the GDI objects for it.
+     {
+         release();
+         init(windowOverride);
+     }
+     blit(source, sourceRect, destRect);
+ }
+
+ void FrameBufferGDI::setGammaRamp(GammaRamp *gammaRamp, bool calibrate) // 'calibrate' is ignored by this back end.
+ {
+ SetDeviceGammaRamp(windowContext, gammaRamp);
+ }
+
+ void FrameBufferGDI::getGammaRamp(GammaRamp *gammaRamp) // Reads the ramp of the window's device context.
+ {
+ GetDeviceGammaRamp(windowContext, gammaRamp);
+ }
+
+ void FrameBufferGDI::screenshot(void *destBuffer) // Not implemented for GDI; 'destBuffer' is not written.
+ {
+ UNIMPLEMENTED();
+ }
+
+ bool FrameBufferGDI::getScanline(bool &inVerticalBlank, unsigned int &scanline) // Not implemented for GDI.
+ {
+ UNIMPLEMENTED();
+
+ return false; // Neither output parameter is written.
+ }
+
+ // Creates the GDI objects used to present to 'window': its device context, a
+ // memory context and a 32-bit width x height DIB section whose pixels become 'framebuffer'.
+ void FrameBufferGDI::init(HWND window)
+ {
+     bitmapWindow = window;
+     windowContext = GetDC(window);
+     bitmapContext = CreateCompatibleDC(windowContext);
+
+     BITMAPINFO bitmapInfo = {};
+     bitmapInfo.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);   // Size of the header alone, not of BITMAPINFO.
+     bitmapInfo.bmiHeader.biBitCount = 32;
+     bitmapInfo.bmiHeader.biPlanes = 1;
+     bitmapInfo.bmiHeader.biHeight = -height;   // Negative height requests a top-down bitmap.
+     bitmapInfo.bmiHeader.biWidth = width;
+     bitmapInfo.bmiHeader.biCompression = BI_RGB;
+
+     bitmap = CreateDIBSection(bitmapContext, &bitmapInfo, DIB_RGB_COLORS, &framebuffer, 0, 0);
+     SelectObject(bitmapContext, bitmap);
+
+     updateBounds(window);
+ }
+
+ void FrameBufferGDI::release() // Frees the GDI objects created by init().
+ {
+ SelectObject(bitmapContext, 0);
+ DeleteObject(bitmap);
+ ReleaseDC(bitmapWindow, windowContext); // Pairs with GetDC() in init().
+ DeleteDC(bitmapContext); // Pairs with CreateCompatibleDC() in init().
+ }
+}
diff --git a/src/WSI/FrameBufferGDI.hpp b/src/WSI/FrameBufferGDI.hpp
new file mode 100644
index 0000000..add2504
--- /dev/null
+++ b/src/WSI/FrameBufferGDI.hpp
@@ -0,0 +1,56 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_FrameBufferGDI_hpp
+#define sw_FrameBufferGDI_hpp
+
+#include "FrameBufferWin.hpp"
+
+namespace sw
+{
+ class FrameBufferGDI : public FrameBufferWin // Windows frame buffer presented through GDI (DIB section + StretchBlt).
+ {
+ public:
+ FrameBufferGDI(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin);
+
+ ~FrameBufferGDI() override;
+
+ void flip(sw::Surface *source) override;
+ void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+ void flip(HWND windowOverride, sw::Surface *source) override; // Variants that present to another window.
+ void blit(HWND windowOverride, sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+ void *lock() override; // Returns the DIB section pixels.
+ void unlock() override;
+
+ void setGammaRamp(GammaRamp *gammaRamp, bool calibrate) override;
+ void getGammaRamp(GammaRamp *gammaRamp) override;
+
+ void screenshot(void *destBuffer) override; // Unimplemented in this back end.
+ bool getScanline(bool &inVerticalBlank, unsigned int &scanline) override; // Unimplemented in this back end.
+
+ private:
+ void init(HWND bitmapWindow); // Creates the contexts and the DIB section for the given window.
+ void release(); // Destroys what init() created.
+
+ HDC windowContext; // Device context of 'bitmapWindow'.
+ HDC bitmapContext; // Memory context the DIB section is selected into.
+ HWND bitmapWindow; // Window the contexts were created for.
+
+ HBITMAP bitmap; // DIB section backing 'framebuffer'.
+ };
+}
+
+#endif // sw_FrameBufferGDI_hpp
diff --git a/src/WSI/FrameBufferOSX.hpp b/src/WSI/FrameBufferOSX.hpp
new file mode 100644
index 0000000..07f8d63
--- /dev/null
+++ b/src/WSI/FrameBufferOSX.hpp
@@ -0,0 +1,49 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_FrameBufferOSX_hpp
+#define sw_FrameBufferOSX_hpp
+
+#include "Main/FrameBuffer.hpp"
+
+#import <Cocoa/Cocoa.h>
+
+@class CALayer;
+
+namespace sw
+{
+ class FrameBufferOSX : public FrameBuffer // macOS frame buffer that shows frames as the contents of a CALayer.
+ {
+ public:
+ FrameBufferOSX(CALayer *layer, int width, int height);
+ ~FrameBufferOSX() override;
+
+ void flip(sw::Surface *source) override;
+ void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+ void *lock() override; // Returns 'buffer'.
+ void unlock() override;
+
+ private:
+ int width; // NOTE(review): shadows the protected FrameBuffer::width.
+ int height; // NOTE(review): shadows the protected FrameBuffer::height.
+ CALayer *layer;
+ uint8_t *buffer; // CPU-side pixels, 4 bytes per pixel; allocated in the constructor.
+ CGDataProviderRef provider; // Wraps 'buffer' for CGImageCreate().
+ CGColorSpaceRef colorspace;
+ CGImageRef currentImage; // Image most recently set as the layer contents.
+ };
+}
+
+#endif // sw_FrameBufferOSX_hpp
diff --git a/src/WSI/FrameBufferOSX.mm b/src/WSI/FrameBufferOSX.mm
new file mode 100644
index 0000000..6d58ae7
--- /dev/null
+++ b/src/WSI/FrameBufferOSX.mm
@@ -0,0 +1,103 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferOSX.hpp"
+
+#include "Common/Debug.hpp"
+
+#include <EGL/egl.h>
+#import <QuartzCore/QuartzCore.h>
+
+namespace sw {
+
+ FrameBufferOSX::FrameBufferOSX(CALayer* layer, int width, int height)
+     : FrameBuffer(width, height, false, false), width(width), height(height),
+       layer(layer), buffer(nullptr), provider(nullptr), currentImage(nullptr)
+ {
+     format = sw::FORMAT_X8B8G8R8;
+     const int byteCount = width * height * 4 * sizeof(uint8_t);   // 4 bytes per pixel.
+     buffer = new uint8_t[byteCount];
+     provider = CGDataProviderCreateWithData(nullptr, buffer, byteCount, nullptr);   // Views 'buffer'; no release callback.
+     colorspace = CGColorSpaceCreateDeviceRGB();
+ }
+
+ FrameBufferOSX::~FrameBufferOSX()
+ {
+ //[CATransaction begin];
+ //[layer setContents:nullptr];
+ //[CATransaction commit];
+
+ CGImageRelease(currentImage); // NOTE(review): the layer contents are not cleared (code above is disabled); confirm the layer no longer reads 'buffer'.
+ CGColorSpaceRelease(colorspace);
+ CGDataProviderRelease(provider);
+
+ delete[] buffer; // Freed last: 'provider' was created over this memory.
+ }
+
+ void FrameBufferOSX::flip(sw::Surface *source) // Same as blit() without rectangles.
+ {
+ blit(source, nullptr, nullptr);
+ }
+
+ // Copies 'source' into the pixel buffer, wraps the buffer in a new CGImage and
+ // installs it as the layer's contents. The rectangles are ignored.
+ void FrameBufferOSX::blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+ {
+     copy(source);
+
+     const int rowBytes = width * 4 * sizeof(uint8_t);
+     CGImageRef frame = CGImageCreate(width, height, 8, 32, rowBytes, colorspace, kCGBitmapByteOrder32Big, provider, nullptr, false, kCGRenderingIntentDefault);
+     [CATransaction begin];
+     [layer setContents:(id)frame];
+     [CATransaction commit];
+     [CATransaction flush];
+
+     // The previous frame's image is released only after the new one is installed.
+     CGImageRef previous = currentImage;
+     currentImage = frame;
+     if(previous) CGImageRelease(previous);
+ }
+
+ // Exposes the CPU-side pixel buffer; rows are width * 4 bytes apart.
+ void *FrameBufferOSX::lock()
+ {
+     stride = width * 4 * sizeof(uint8_t);
+     return framebuffer = buffer;
+ }
+
+ void FrameBufferOSX::unlock()   // Nothing to release; only forgets the pointer handed out by lock().
+ {
+     framebuffer = nullptr;
+ }
+}
+
+// Creates the macOS frame buffer for an NSView (its layer is used) or a CALayer; 'display' is unused.
+sw::FrameBuffer *createFrameBuffer(void *display, EGLNativeWindowType nativeWindow, int width, int height)
+{
+    NSObject *object = reinterpret_cast<NSObject*>(nativeWindow);
+    CALayer *targetLayer = nullptr;
+
+    if([object isKindOfClass:[NSView class]])
+    {
+        NSView *view = reinterpret_cast<NSView*>(object);
+        [view setWantsLayer:YES];   // Ask the view to use a layer before fetching it.
+        targetLayer = [view layer];
+    }
+    else if([object isKindOfClass:[CALayer class]])
+    {
+        targetLayer = reinterpret_cast<CALayer*>(object);
+    }
+    else ASSERT(0);   // Neither an NSView nor a CALayer.
+    return new sw::FrameBufferOSX(targetLayer, width, height);
+}
diff --git a/src/WSI/FrameBufferOzone.cpp b/src/WSI/FrameBufferOzone.cpp
new file mode 100644
index 0000000..95e0729
--- /dev/null
+++ b/src/WSI/FrameBufferOzone.cpp
@@ -0,0 +1,54 @@
+// Copyright 2017 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferOzone.hpp"
+
+namespace sw
+{
+	FrameBufferOzone::FrameBufferOzone(intptr_t display, intptr_t window, int width, int height) : FrameBuffer(width, height, false, false)
+	{
+		const int pitchBytes = sw::Surface::pitchB(width, 0, format, true);           // Passed as the surface's pitch.
+		const int sliceBytes = sw::Surface::sliceB(width, height, 0, format, true);   // Passed as the surface's slice size.
+		buffer = sw::Surface::create(width, height, 1, format, nullptr, pitchBytes, sliceBytes);   // 'display' and 'window' are not used.
+	}
+
+ FrameBufferOzone::~FrameBufferOzone()
+ {
+ delete buffer;   // Destroys the surface created in the constructor.
+ }
+
+ void *FrameBufferOzone::lock()
+ {
+ framebuffer = buffer->lockInternal(0, 0, 0, sw::LOCK_READWRITE, sw::PUBLIC);
+ // NOTE(review): 'stride' is not assigned here, unlike in the OSX and X11 lock() implementations - confirm this is intended.
+ return framebuffer;
+ }
+
+ void FrameBufferOzone::unlock()
+ {
+ buffer->unlockInternal();   // Balances the lockInternal() call made in lock().
+
+ framebuffer = nullptr;
+ }
+
+ void FrameBufferOzone::blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+ {
+ copy(source);   // Only calls copy(); sourceRect and destRect are ignored.
+ }
+}
+
+NO_SANITIZE_FUNCTION sw::FrameBuffer *createFrameBuffer(void* display, intptr_t window, int width, int height)
+{
+	return new sw::FrameBufferOzone(reinterpret_cast<intptr_t>(display), window, width, height);   // The display pointer is passed on as an integer handle.
+}
diff --git a/src/WSI/FrameBufferOzone.hpp b/src/WSI/FrameBufferOzone.hpp
new file mode 100644
index 0000000..0dc9f60
--- /dev/null
+++ b/src/WSI/FrameBufferOzone.hpp
@@ -0,0 +1,40 @@
+// Copyright 2017 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_FrameBufferOzone_hpp
+#define sw_FrameBufferOzone_hpp
+
+#include "Main/FrameBuffer.hpp"
+
+namespace sw
+{
+ class FrameBufferOzone : public FrameBuffer   // FrameBuffer backed by an sw::Surface that it owns.
+ {
+ public:
+ FrameBufferOzone(intptr_t display, intptr_t window, int width, int height);
+
+ ~FrameBufferOzone() override;
+ // flip() is shorthand for blit() without source/destination rectangles.
+ void flip(sw::Surface *source) override {blit(source, nullptr, nullptr);};
+ void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+ // lock() returns the pointer obtained by locking 'buffer'; unlock() releases that lock.
+ void *lock() override;
+ void unlock() override;
+
+ private:
+ sw::Surface* buffer;   // Created in the constructor, deleted in the destructor.
+ };
+}
+
+#endif // sw_FrameBufferOzone_hpp
diff --git a/src/WSI/FrameBufferWin.cpp b/src/WSI/FrameBufferWin.cpp
new file mode 100644
index 0000000..cad8954
--- /dev/null
+++ b/src/WSI/FrameBufferWin.cpp
@@ -0,0 +1,78 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferWin.hpp"
+
+namespace sw
+{
+	FrameBufferWin::FrameBufferWin(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin) : FrameBuffer(width, height, fullscreen, topLeftOrigin), windowHandle(windowHandle)
+	{
+		// Windowed mode leaves the window's style untouched.
+		if(windowed) return;
+
+		// Fullscreen: remember the current style for the destructor, then force a borderless popup.
+		originalWindowStyle = GetWindowLong(windowHandle, GWL_STYLE);
+		SetWindowLong(windowHandle, GWL_STYLE, WS_POPUP);
+	}
+
+	FrameBufferWin::~FrameBufferWin()
+	{
+		// Restore the saved style only when the constructor replaced it and the window
+		// still carries exactly the popup style that was set then.
+		const bool stylePending = !windowed && GetWindowLong(windowHandle, GWL_STYLE) == WS_POPUP;
+		if(stylePending) SetWindowLong(windowHandle, GWL_STYLE, originalWindowStyle);
+	}
+
+	void FrameBufferWin::updateBounds(HWND windowOverride)
+	{
+		if(!windowed)
+		{
+			// Fullscreen: the bounds span the screen size reported by GetSystemMetrics().
+			SetRect(&bounds, 0, 0, GetSystemMetrics(SM_CXSCREEN), GetSystemMetrics(SM_CYSCREEN));
+			return;
+		}
+		// Windowed: take the client rectangle of the override window (or our own) in screen coordinates.
+		HWND target = windowOverride ? windowOverride : windowHandle;
+		POINT *corners = (POINT*)&bounds;   // Treat the RECT as its two corner points.
+		GetClientRect(target, &bounds);
+		ClientToScreen(target, &corners[0]);
+		ClientToScreen(target, &corners[1]);
+	}
+}
+
+#include "FrameBufferDD.hpp"
+#include "FrameBufferGDI.hpp"
+#include "Common/Configurator.hpp"
+
+sw::FrameBufferWin *createFrameBufferWin(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin)
+{
+	// The backend can be selected through the "FrameBufferAPI" entry of the
+	// "Testing" section in SwiftShader.ini; the value defaults to 0.
+	sw::Configurator ini("SwiftShader.ini");
+	const int api = ini.getInteger("Testing", "FrameBufferAPI", 0);
+
+	// DirectDraw is used only for API 0 combined with a top-left origin...
+	if(api == 0 && topLeftOrigin)
+	{
+		return new sw::FrameBufferDD(windowHandle, width, height, fullscreen, topLeftOrigin);
+	}
+
+	// ...every other combination is presented through GDI.
+	return new sw::FrameBufferGDI(windowHandle, width, height, fullscreen, topLeftOrigin);
+}
+
+sw::FrameBuffer *createFrameBuffer(void *display, HWND window, int width, int height)
+{
+ return createFrameBufferWin(window, width, height, false, false);   // fullscreen = false, topLeftOrigin = false; 'display' is unused.
+}
diff --git a/src/WSI/FrameBufferWin.hpp b/src/WSI/FrameBufferWin.hpp
new file mode 100644
index 0000000..15c1e0e
--- /dev/null
+++ b/src/WSI/FrameBufferWin.hpp
@@ -0,0 +1,59 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_FrameBufferWin_hpp
+#define sw_FrameBufferWin_hpp
+
+#include "FrameBuffer.hpp"
+
+namespace sw
+{
+ struct GammaRamp   // One 256-entry table per color channel.
+ {
+ short red[256];
+ short green[256];
+ short blue[256];
+ };
+
+ class FrameBufferWin : public FrameBuffer   // Abstract base of the framebuffers returned by createFrameBufferWin().
+ {
+ public:
+ FrameBufferWin(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin);
+
+ ~FrameBufferWin() override;
+ // Presentation without an explicit window; presumably targets windowHandle - verify in the subclasses.
+ void flip(sw::Surface *source) override = 0;
+ void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override = 0;
+ // Same operations with an explicit window; presumably a null windowOverride means windowHandle, as in updateBounds().
+ virtual void flip(HWND windowOverride, sw::Surface *source) = 0;
+ virtual void blit(HWND windowOverride, sw::Surface *source, const Rect *sourceRect, const Rect *destRect) = 0;
+
+ virtual void setGammaRamp(GammaRamp *gammaRamp, bool calibrate) = 0;
+ virtual void getGammaRamp(GammaRamp *gammaRamp) = 0;
+
+ virtual void screenshot(void *destBuffer) = 0;
+ virtual bool getScanline(bool &inVerticalBlank, unsigned int &scanline) = 0;
+
+ protected:
+ void updateBounds(HWND windowOverride);   // Recomputes 'bounds' for the given window, or for windowHandle when null.
+
+ HWND windowHandle;
+ DWORD originalWindowStyle;   // Style saved by the constructor in fullscreen mode, restored by the destructor.
+ RECT bounds;                 // Written by updateBounds(), in screen coordinates.
+ };
+}
+
+sw::FrameBufferWin *createFrameBufferWin(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin);
+
+#endif // sw_FrameBufferWin_hpp
diff --git a/src/WSI/FrameBufferX11.cpp b/src/WSI/FrameBufferX11.cpp
new file mode 100644
index 0000000..b3ae3b4
--- /dev/null
+++ b/src/WSI/FrameBufferX11.cpp
@@ -0,0 +1,192 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferX11.hpp"
+
+#include "libX11.hpp"
+#include "Common/Timer.hpp"
+
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+
+namespace sw
+{
+	// Handler that was active before XShmErrorHandler was installed; other errors are forwarded to it.
+	static int (*PreviousXErrorHandler)(Display *display, XErrorEvent *event) = nullptr;
+	static bool shmBadAccess = false;   // Set when a BadAccess error has been intercepted.
+
+	// Catches BadAccess errors so we can fall back to not using MIT-SHM.
+	static int XShmErrorHandler(Display *display, XErrorEvent *event)
+	{
+		if(event->error_code != BadAccess)
+		{
+			return PreviousXErrorHandler(display, event);
+		}
+
+		// Record the failure for the constructor to inspect, and swallow the error.
+		shmBadAccess = true;
+		return 0;
+	}
+
+ FrameBufferX11::FrameBufferX11(Display *display, Window window, int width, int height) : FrameBuffer(width, height, false, false), ownX11(!display), x_display(display), x_window(window)
+ {
+ if(!x_display)
+ {
+ x_display = libX11->XOpenDisplay(0);   // No display supplied: open one ourselves (ownX11 is true in this case).
+ assert(x_display);
+ }
+ // Query the default graphics context and depth of the display's default screen.
+ int screen = DefaultScreen(x_display);
+ x_gc = libX11->XDefaultGC(x_display, screen);
+ int depth = libX11->XDefaultDepth(x_display, screen);
+ // Look for a 32-bit TrueColor visual with blue in the low byte; otherwise use the default visual.
+ XVisualInfo x_visual;
+ Status status = libX11->XMatchVisualInfo(x_display, screen, 32, TrueColor, &x_visual);
+ bool match = (status != 0 && x_visual.blue_mask == 0xFF); // Prefer X8R8G8B8
+ Visual *visual = match ? x_visual.visual : libX11->XDefaultVisual(x_display, screen);
+ // Use MIT-SHM only if the entry point was resolved and the query on this display returns True.
+ mit_shm = (libX11->XShmQueryExtension && libX11->XShmQueryExtension(x_display) == True);
+
+ if(mit_shm)
+ {
+ x_image = libX11->XShmCreateImage(x_display, visual, depth, ZPixmap, 0, &shminfo, width, height);
+ // NOTE(review): the results of XShmCreateImage, shmget and shmat are used below without being checked.
+ shminfo.shmid = shmget(IPC_PRIVATE, x_image->bytes_per_line * x_image->height, IPC_CREAT | SHM_R | SHM_W);
+ shminfo.shmaddr = x_image->data = (char*)shmat(shminfo.shmid, 0, 0);
+ shminfo.readOnly = False;
+ // Temporarily install XShmErrorHandler so that a BadAccess from the attach sets shmBadAccess.
+ PreviousXErrorHandler = libX11->XSetErrorHandler(XShmErrorHandler);
+ libX11->XShmAttach(x_display, &shminfo); // May produce a BadAccess error
+ libX11->XSync(x_display, False);
+ libX11->XSetErrorHandler(PreviousXErrorHandler);
+
+ if(shmBadAccess)
+ {
+ mit_shm = false;   // Makes the block below create a regular image instead.
+
+ XDestroyImage(x_image);
+ shmdt(shminfo.shmaddr);
+ shmctl(shminfo.shmid, IPC_RMID, 0);
+
+ shmBadAccess = false;
+ }
+ }
+ // Non-shared path: the image is backed by a zero-filled heap buffer of width * height * 4 bytes.
+ if(!mit_shm)
+ {
+ int bytes_per_line = width * 4;
+ int bytes_per_image = height * bytes_per_line;
+ char *buffer = (char*)malloc(bytes_per_image);
+ memset(buffer, 0, bytes_per_image);
+
+ x_image = libX11->XCreateImage(x_display, visual, depth, ZPixmap, 0, buffer, width, height, 32, bytes_per_line);
+ assert(x_image);
+ // Reached with a null image only when assert() is compiled out: free the buffer that no image took over.
+ if(!x_image)
+ {
+ free(buffer);
+ }
+ }
+ }
+
+	FrameBufferX11::~FrameBufferX11()
+	{
+		// Tears down the image and, when the constructor opened it, the display connection.
+		if(mit_shm)   // Shared-memory image: detach from the server, then destroy the image and the segment.
+		{
+			libX11->XShmDetach(x_display, &shminfo);
+			XDestroyImage(x_image);
+			shmdt(shminfo.shmaddr);
+			shmctl(shminfo.shmid, IPC_RMID, 0);
+		}
+		else if(x_image)   // XCreateImage() may have failed in the constructor, leaving x_image null.
+		{
+			XDestroyImage(x_image);
+		}
+		if(ownX11)   // Only close a display connection that this object opened itself.
+		{
+			libX11->XCloseDisplay(x_display);
+		}
+	}
+
+	void *FrameBufferX11::lock()
+	{
+		// Without an image there is nothing to expose; 'framebuffer' is returned unchanged.
+		if(!x_image) return framebuffer;
+
+		// Publish the XImage's pixel storage and its bytes-per-line value.
+		framebuffer = x_image->data;
+		stride = x_image->bytes_per_line;
+		return framebuffer;
+	}
+
+ void FrameBufferX11::unlock()
+ {
+ framebuffer = nullptr;   // Only clears the exposed pointer; the XImage is left untouched.
+ }
+
+	void FrameBufferX11::blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+	{
+		// Let the base class copy 'source' (presumably into the XImage data exposed by lock()); the rectangles are unused.
+		copy(source);
+
+		// Send the whole image to the window, through shared memory when it is in use.
+		if(mit_shm)
+		{
+			libX11->XShmPutImage(x_display, x_window, x_gc, x_image, 0, 0, 0, 0, width, height, False);
+		}
+		else
+		{
+			libX11->XPutImage(x_display, x_window, x_gc, x_image, 0, 0, 0, 0, width, height);
+		}
+
+		libX11->XSync(x_display, False);
+
+		if(false)   // Draw the framerate on screen
+		{
+			// Disabled debugging aid: frame counter state kept across calls.
+			static double windowStart = sw::Timer::seconds();
+			static int frameCount = -1;
+			static double FPS = 0.0;
+			static double maxFPS = 0.0;
+
+			const double now = sw::Timer::seconds();
+			const double elapsed = now - windowStart;
+			frameCount++;
+
+			// Recompute the rate once more than a second has elapsed and track the highest value seen.
+			if(elapsed > 1.0)
+			{
+				FPS = frameCount / elapsed;
+				maxFPS = (FPS > maxFPS) ? FPS : maxFPS;
+
+				windowStart = now;
+				frameCount = 0;
+			}
+
+			// Overlay the current and peak rate as text at (50, 50).
+			char text[256];
+			sprintf(text, "FPS: %.2f (max: %.2f)", FPS, maxFPS);
+			libX11->XDrawString(x_display, x_window, x_gc, 50, 50, text, strlen(text));
+		}
+	}
+}
+
+NO_SANITIZE_FUNCTION sw::FrameBuffer *createFrameBuffer(void *display, Window window, int width, int height)
+{
+	return new sw::FrameBufferX11(static_cast< ::Display*>(display), window, width, height);   // A null display makes the framebuffer open its own.
+}
diff --git a/src/WSI/FrameBufferX11.hpp b/src/WSI/FrameBufferX11.hpp
new file mode 100644
index 0000000..dc96331
--- /dev/null
+++ b/src/WSI/FrameBufferX11.hpp
@@ -0,0 +1,52 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_FrameBufferX11_hpp
+#define sw_FrameBufferX11_hpp
+
+#include "Main/FrameBuffer.hpp"
+#include "Common/Debug.hpp"
+
+#include <X11/Xlib.h>
+#include <X11/Xutil.h>
+#include <X11/extensions/XShm.h>
+
+namespace sw
+{
+ class FrameBufferX11 : public FrameBuffer   // FrameBuffer that presents through an XImage, optionally via MIT-SHM.
+ {
+ public:
+ FrameBufferX11(Display *display, Window window, int width, int height);
+
+ ~FrameBufferX11() override;
+ // flip() is shorthand for blit() without source/destination rectangles.
+ void flip(sw::Surface *source) override { blit(source, nullptr, nullptr); }
+ void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+ // lock() exposes the XImage's data and bytes-per-line; unlock() clears the exposed pointer.
+ void *lock() override;
+ void unlock() override;
+
+ private:
+ const bool ownX11;         // True when no display was passed in and the constructor opened one.
+ Display *x_display;
+ const Window x_window;
+ XImage *x_image = nullptr;
+ GC x_gc;                   // Default graphics context of the default screen, set in the constructor.
+
+ bool mit_shm;              // Whether x_image is a shared-memory image; set in the constructor.
+ XShmSegmentInfo shminfo;   // Segment description used on the MIT-SHM path.
+ };
+}
+
+#endif // sw_FrameBufferX11_hpp
diff --git a/src/WSI/libX11.cpp b/src/WSI/libX11.cpp
new file mode 100644
index 0000000..f3723ff
--- /dev/null
+++ b/src/WSI/libX11.cpp
@@ -0,0 +1,84 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "libX11.hpp"
+
+#include "Common/SharedLibrary.hpp"
+
+#define Bool int
+
+LibX11exports::LibX11exports(void *libX11, void *libXext)   // Looks up every entry point by name in the given library handles.
+{
+ XOpenDisplay = (Display *(*)(char*))getProcAddress(libX11, "XOpenDisplay");
+ XGetWindowAttributes = (Status (*)(Display*, Window, XWindowAttributes*))getProcAddress(libX11, "XGetWindowAttributes");
+ XDefaultScreenOfDisplay = (Screen *(*)(Display*))getProcAddress(libX11, "XDefaultScreenOfDisplay");
+ XWidthOfScreen = (int (*)(Screen*))getProcAddress(libX11, "XWidthOfScreen");
+ XHeightOfScreen = (int (*)(Screen*))getProcAddress(libX11, "XHeightOfScreen");
+ XPlanesOfScreen = (int (*)(Screen*))getProcAddress(libX11, "XPlanesOfScreen");
+ XDefaultGC = (GC (*)(Display*, int))getProcAddress(libX11, "XDefaultGC");
+ XDefaultDepth = (int (*)(Display*, int))getProcAddress(libX11, "XDefaultDepth");
+ XMatchVisualInfo = (Status (*)(Display*, int, int, int, XVisualInfo*))getProcAddress(libX11, "XMatchVisualInfo");
+ XDefaultVisual = (Visual *(*)(Display*, int screen_number))getProcAddress(libX11, "XDefaultVisual");
+ XSetErrorHandler = (int (*(*)(int (*)(Display*, XErrorEvent*)))(Display*, XErrorEvent*))getProcAddress(libX11, "XSetErrorHandler");
+ XSync = (int (*)(Display*, Bool))getProcAddress(libX11, "XSync");
+ XCreateImage = (XImage *(*)(Display*, Visual*, unsigned int, int, int, char*, unsigned int, unsigned int, int, int))getProcAddress(libX11, "XCreateImage");
+ XCloseDisplay = (int (*)(Display*))getProcAddress(libX11, "XCloseDisplay");
+ XPutImage = (int (*)(Display*, Drawable, GC, XImage*, int, int, int, int, unsigned int, unsigned int))getProcAddress(libX11, "XPutImage");
+ XDrawString = (int (*)(Display*, Drawable, GC, int, int, char*, int))getProcAddress(libX11, "XDrawString");
+ // The MIT-SHM entry points are looked up in libXext rather than libX11.
+ XShmQueryExtension = (Bool (*)(Display*))getProcAddress(libXext, "XShmQueryExtension");
+ XShmCreateImage = (XImage *(*)(Display*, Visual*, unsigned int, int, char*, XShmSegmentInfo*, unsigned int, unsigned int))getProcAddress(libXext, "XShmCreateImage");
+ XShmAttach = (Bool (*)(Display*, XShmSegmentInfo*))getProcAddress(libXext, "XShmAttach");
+ XShmDetach = (Bool (*)(Display*, XShmSegmentInfo*))getProcAddress(libXext, "XShmDetach");
+ XShmPutImage = (int (*)(Display*, Drawable, GC, XImage*, int, int, int, int, unsigned int, unsigned int, bool))getProcAddress(libXext, "XShmPutImage");   // NOTE(review): last parameter is 'bool' here while the other entry points use 'Bool' - confirm.
+}
+
+LibX11exports *LibX11::operator->()
+{
+ return loadExports();   // May be null when the entry points could not be loaded; callers dereference the result.
+}
+
+LibX11exports *LibX11::loadExports()
+{
+	// libX11 doubles as the "already attempted" marker: (void*)-1 means no library handle is held.
+	static void *libX11 = nullptr;
+	static void *libXext = nullptr;
+	static LibX11exports *libX11exports = nullptr;
+
+	if(libX11)
+	{
+		return libX11exports;   // Resolved (or failed) on an earlier call.
+	}
+
+	if(getProcAddress(RTLD_DEFAULT, "XOpenDisplay"))   // Search the global scope for pre-loaded X11 library.
+	{
+		libX11exports = new LibX11exports(RTLD_DEFAULT, RTLD_DEFAULT);
+		libX11 = (void*)-1;   // No need to load it.
+		return libX11exports;
+	}
+
+	libX11 = loadLibrary("libX11.so");
+
+	if(!libX11)
+	{
+		libX11 = (void*)-1;   // Don't attempt loading more than once.
+		return libX11exports;
+	}
+
+	libXext = loadLibrary("libXext.so");
+	libX11exports = new LibX11exports(libX11, libXext);
+	return libX11exports;
+}
+
+LibX11 libX11;
diff --git a/src/WSI/libX11.hpp b/src/WSI/libX11.hpp
new file mode 100644
index 0000000..c188386
--- /dev/null
+++ b/src/WSI/libX11.hpp
@@ -0,0 +1,69 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef libX11_hpp
+#define libX11_hpp
+
+#define Bool int
+#include <X11/Xlib.h>
+#include <X11/Xutil.h>
+#include <X11/extensions/XShm.h>
+
+struct LibX11exports   // Table of function pointers to the Xlib and MIT-SHM entry points used by this code.
+{
+ LibX11exports(void *libX11, void *libXext);
+ // Entry points looked up in libX11.
+ Display *(*XOpenDisplay)(char *display_name);
+ Status (*XGetWindowAttributes)(Display *display, Window w, XWindowAttributes *window_attributes_return);
+ Screen *(*XDefaultScreenOfDisplay)(Display *display);
+ int (*XWidthOfScreen)(Screen *screen);
+ int (*XHeightOfScreen)(Screen *screen);
+ int (*XPlanesOfScreen)(Screen *screen);
+ GC (*XDefaultGC)(Display *display, int screen_number);
+ int (*XDefaultDepth)(Display *display, int screen_number);
+ Status (*XMatchVisualInfo)(Display *display, int screen, int depth, int screen_class, XVisualInfo *vinfo_return);
+ Visual *(*XDefaultVisual)(Display *display, int screen_number);
+ int (*(*XSetErrorHandler)(int (*handler)(Display*, XErrorEvent*)))(Display*, XErrorEvent*);
+ int (*XSync)(Display *display, Bool discard);
+ XImage *(*XCreateImage)(Display *display, Visual *visual, unsigned int depth, int format, int offset, char *data, unsigned int width, unsigned int height, int bitmap_pad, int bytes_per_line);
+ int (*XCloseDisplay)(Display *display);
+ int (*XPutImage)(Display *display, Drawable d, GC gc, XImage *image, int src_x, int src_y, int dest_x, int dest_y, unsigned int width, unsigned int height);
+ int (*XDrawString)(Display *display, Drawable d, GC gc, int x, int y, char *string, int length);
+ // Entry points looked up in libXext (MIT-SHM).
+ Bool (*XShmQueryExtension)(Display *display);
+ XImage *(*XShmCreateImage)(Display *display, Visual *visual, unsigned int depth, int format, char *data, XShmSegmentInfo *shminfo, unsigned int width, unsigned int height);
+ Bool (*XShmAttach)(Display *display, XShmSegmentInfo *shminfo);
+ Bool (*XShmDetach)(Display *display, XShmSegmentInfo *shminfo);
+ int (*XShmPutImage)(Display *display, Drawable d, GC gc, XImage *image, int src_x, int src_y, int dest_x, int dest_y, unsigned int width, unsigned int height, bool send_event);   // NOTE(review): 'bool', not 'Bool', for send_event - confirm.
+};
+
+#undef Bool
+
+class LibX11
+{
+public:
+	// True when the table of X11 entry points is available.
+	operator bool() { return loadExports() != nullptr; }
+
+	// Gives access to the entry points, loading them on first use.
+	LibX11exports *operator->();
+
+private:
+	// Loads the entry points once and returns the cached table (null on failure).
+	LibX11exports *loadExports();
+};
+
+extern LibX11 libX11;
+
+#endif // libX11_hpp