glBlitFramebuffer support for depth/stencil formats

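Added support for depth and stencil formats for glBlitFramebuffer:
- Blitter now supports quad layout and stencil
- Device::stretchRect() now takes a flags bitfield (filter, color,
  depth, stencil), so that a caller can choose which buffer to copy
- Partial depth/stencil blits are now allowed through the ES3 and NV
  entry points; ANGLE_framebuffer_blit still requires whole-buffer
  depth/stencil blits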

Change-Id: Iae0898df11e0a1d3c006113486ed15a3fd2f90a9
Reviewed-on: https://swiftshader-review.googlesource.com/7510
Tested-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <capn@google.com>
diff --git a/src/OpenGL/libGLESv2/Context.cpp b/src/OpenGL/libGLESv2/Context.cpp
index 6033efe..0e32336 100644
--- a/src/OpenGL/libGLESv2/Context.cpp
+++ b/src/OpenGL/libGLESv2/Context.cpp
@@ -3873,7 +3873,7 @@
 
 void Context::blitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
                               GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
-                              GLbitfield mask, bool filter)
+                              GLbitfield mask, bool filter, bool allowPartialDepthStencilBlit)
 {
 	Framebuffer *readFramebuffer = getReadFramebuffer();
 	Framebuffer *drawFramebuffer = getDrawFramebuffer();
@@ -4048,7 +4048,8 @@
 	}
 
 	bool blitRenderTarget = false;
-	bool blitDepthStencil = false;
+	bool blitDepth = false;
+	bool blitStencil = false;
 
 	if(mask & GL_COLOR_BUFFER_BIT)
 	{
@@ -4083,7 +4084,7 @@
 					return error(GL_INVALID_OPERATION);
 				}
 
-				blitDepthStencil = true;
+				blitDepth = true;
 				readDSBuffer = readFramebuffer->getDepthbuffer();
 				drawDSBuffer = drawFramebuffer->getDepthbuffer();
 			}
@@ -4098,15 +4099,15 @@
 					return error(GL_INVALID_OPERATION);
 				}
 
-				blitDepthStencil = true;
+				blitStencil = true;
 				readDSBuffer = readFramebuffer->getStencilbuffer();
 				drawDSBuffer = drawFramebuffer->getStencilbuffer();
 			}
 		}
 
-		if(partialBufferCopy)
+		if(partialBufferCopy && !allowPartialDepthStencilBlit)
 		{
-			ERR("Only whole-buffer depth and stencil blits are supported by this implementation.");
+			ERR("Only whole-buffer depth and stencil blits are supported by ANGLE_framebuffer_blit.");
 			return error(GL_INVALID_OPERATION);   // Only whole-buffer copies are permitted
 		}
 
@@ -4117,7 +4118,7 @@
 		}
 	}
 
-	if(blitRenderTarget || blitDepthStencil)
+	if(blitRenderTarget || blitDepth || blitStencil)
 	{
 		if(blitRenderTarget)
 		{
@@ -4133,7 +4134,7 @@
 				swap(destRect.y0, destRect.y1);
 			}
 
-			bool success = device->stretchRect(readRenderTarget, &sourceRect, drawRenderTarget, &destRect, filter);
+			bool success = device->stretchRect(readRenderTarget, &sourceRect, drawRenderTarget, &destRect, (filter ? Device::USE_FILTER : 0) | Device::COLOR_BUFFER);
 
 			readRenderTarget->release();
 			drawRenderTarget->release();
@@ -4145,9 +4146,32 @@
 			}
 		}
 
-		if(blitDepthStencil)
+		if(blitDepth)
 		{
-			bool success = device->stretchRect(readFramebuffer->getDepthBuffer(), nullptr, drawFramebuffer->getDepthBuffer(), nullptr, false);
+			egl::Image *readRenderTarget = readFramebuffer->getDepthBuffer();
+			egl::Image *drawRenderTarget = drawFramebuffer->getDepthBuffer();
+
+			bool success = device->stretchRect(readRenderTarget, &sourceRect, drawRenderTarget, &destRect, (filter ? Device::USE_FILTER : 0) | Device::DEPTH_BUFFER);
+
+			readRenderTarget->release();
+			drawRenderTarget->release();
+
+			if(!success)
+			{
+				ERR("BlitFramebuffer failed.");
+				return;
+			}
+		}
+
+		if(blitStencil)
+		{
+			egl::Image *readRenderTarget = readFramebuffer->getStencilBuffer();
+			egl::Image *drawRenderTarget = drawFramebuffer->getStencilBuffer();
+
+			bool success = device->stretchRect(readRenderTarget, &sourceRect, drawRenderTarget, &destRect, (filter ? Device::USE_FILTER : 0) | Device::STENCIL_BUFFER);
+
+			readRenderTarget->release();
+			drawRenderTarget->release();
 
 			if(!success)
 			{
diff --git a/src/OpenGL/libGLESv2/Context.h b/src/OpenGL/libGLESv2/Context.h
index f638c05..cd8194f 100644
--- a/src/OpenGL/libGLESv2/Context.h
+++ b/src/OpenGL/libGLESv2/Context.h
@@ -684,7 +684,7 @@
 
 	void blitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
 	                     GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
-	                     GLbitfield mask, bool filter);
+	                     GLbitfield mask, bool filter, bool allowPartialDepthStencilBlit);
 
 	virtual void bindTexImage(egl::Surface *surface);
 	virtual EGLenum validateSharedImage(EGLenum target, GLuint name, GLuint textureLevel);
diff --git a/src/OpenGL/libGLESv2/Device.cpp b/src/OpenGL/libGLESv2/Device.cpp
index 225ceeb..2e7a2e3 100644
--- a/src/OpenGL/libGLESv2/Device.cpp
+++ b/src/OpenGL/libGLESv2/Device.cpp
@@ -510,7 +510,7 @@
 		}
 	}
 
-	bool Device::stretchRect(sw::Surface *source, const sw::SliceRect *sourceRect, sw::Surface *dest, const sw::SliceRect *destRect, bool filter)
+	bool Device::stretchRect(sw::Surface *source, const sw::SliceRect *sourceRect, sw::Surface *dest, const sw::SliceRect *destRect, unsigned char flags)
 	{
 		if(!source || !dest)
 		{
@@ -596,7 +596,12 @@
 
 		bool scaling = (sRect.x1 - sRect.x0 != dRect.x1 - dRect.x0) || (sRect.y1 - sRect.y0 != dRect.y1 - dRect.y0);
 		bool equalFormats = source->getInternalFormat() == dest->getInternalFormat();
-		bool depthStencil = egl::Image::isDepth(source->getInternalFormat()) || egl::Image::isStencil(source->getInternalFormat());
+		bool hasQuadLayout = Surface::hasQuadLayout(source->getInternalFormat()) || Surface::hasQuadLayout(dest->getInternalFormat());
+		bool fullCopy = (sRect.x0 == 0) && (sRect.y0 == 0) && (dRect.x0 == 0) && (dRect.y0 == 0) &&
+		                (sRect.x1 == sWidth) && (sRect.y1 == sHeight) && (dRect.x1 == dWidth) && (dRect.y1 == dHeight);
+		bool isDepth = (flags & Device::DEPTH_BUFFER) && egl::Image::isDepth(source->getInternalFormat());
+		bool isStencil = (flags & Device::STENCIL_BUFFER) && (egl::Image::isDepth(source->getInternalFormat()) || egl::Image::isStencil(source->getInternalFormat()));
+		bool isColor = (flags & Device::COLOR_BUFFER) == Device::COLOR_BUFFER;
 		bool alpha0xFF = false;
 
 		if((source->getInternalFormat() == FORMAT_A8R8G8B8 && dest->getInternalFormat() == FORMAT_X8R8G8B8) ||
@@ -606,31 +611,31 @@
 			alpha0xFF = true;
 		}
 
-		if(depthStencil)   // Copy entirely, internally   // FIXME: Check
+		if((isDepth || isStencil) && !scaling && equalFormats && (!hasQuadLayout || fullCopy))
 		{
-			if(source->hasDepth())
+			if(source->hasDepth() && isDepth)
 			{
-				sw::byte *sourceBuffer = (sw::byte*)source->lockInternal(0, 0, sourceRect->slice, LOCK_READONLY, PUBLIC);
-				sw::byte *destBuffer = (sw::byte*)dest->lockInternal(0, 0, destRect->slice, LOCK_DISCARD, PUBLIC);
+				sw::byte *sourceBuffer = (sw::byte*)source->lockInternal(sRect.x0, sRect.y0, 0, LOCK_READONLY, PUBLIC);
+				sw::byte *destBuffer = (sw::byte*)dest->lockInternal(dRect.x0, dRect.y0, 0, fullCopy ? LOCK_DISCARD : LOCK_WRITEONLY, PUBLIC);
 
-				copyBuffer(sourceBuffer, destBuffer, source->getWidth(), source->getHeight(), source->getInternalPitchB(), dest->getInternalPitchB(), egl::Image::bytes(source->getInternalFormat()), flipX, flipY);
+				copyBuffer(sourceBuffer, destBuffer, dRect.width(), dRect.height(), source->getInternalPitchB(), dest->getInternalPitchB(), egl::Image::bytes(source->getInternalFormat()), flipX, flipY);
 
 				source->unlockInternal();
 				dest->unlockInternal();
 			}
 
-			if(source->hasStencil())
+			if(source->hasStencil() && isStencil)
 			{
-				sw::byte *sourceBuffer = (sw::byte*)source->lockStencil(0, 0, 0, PUBLIC);
-				sw::byte *destBuffer = (sw::byte*)dest->lockStencil(0, 0, 0, PUBLIC);
+				sw::byte *sourceBuffer = (sw::byte*)source->lockStencil(sRect.x0, sRect.y0, 0, PUBLIC);
+				sw::byte *destBuffer = (sw::byte*)dest->lockStencil(dRect.x0, dRect.y0, 0, PUBLIC);
 
-				copyBuffer(sourceBuffer, destBuffer, source->getWidth(), source->getHeight(), source->getInternalPitchB(), dest->getInternalPitchB(), egl::Image::bytes(source->getInternalFormat()), flipX, flipY);
+				copyBuffer(sourceBuffer, destBuffer, dRect.width(), dRect.height(), source->getStencilPitchB(), dest->getStencilPitchB(), egl::Image::bytes(source->getStencilFormat()), flipX, flipY);
 
 				source->unlockStencil();
 				dest->unlockStencil();
 			}
 		}
-		else if(!scaling && equalFormats)
+		else if((flags & Device::COLOR_BUFFER) && !scaling && equalFormats && (!hasQuadLayout || fullCopy))
 		{
 			unsigned char *sourceBytes = (unsigned char*)source->lockInternal(sRect.x0, sRect.y0, sourceRect->slice, LOCK_READONLY, PUBLIC);
 			unsigned char *destBytes = (unsigned char*)dest->lockInternal(dRect.x0, dRect.y0, destRect->slice, LOCK_READWRITE, PUBLIC);
@@ -656,7 +661,7 @@
 			source->unlockInternal();
 			dest->unlockInternal();
 		}
-		else
+		else if(isColor || isDepth || isStencil)
 		{
 			if(flipX)
 			{
@@ -666,7 +671,11 @@
 			{
 				swap(dRect.y0, dRect.y1);
 			}
-			blit(source, sRect, dest, dRect, scaling && filter);
+			blit(source, sRect, dest, dRect, scaling && (flags & Device::USE_FILTER), isStencil);
+		}
+		else
+		{
+			UNREACHABLE(false);
 		}
 
 		return true;
diff --git a/src/OpenGL/libGLESv2/Device.hpp b/src/OpenGL/libGLESv2/Device.hpp
index 705597b..13a10de 100644
--- a/src/OpenGL/libGLESv2/Device.hpp
+++ b/src/OpenGL/libGLESv2/Device.hpp
@@ -39,6 +39,15 @@
 	class Device : public sw::Renderer
 	{
 	public:
+		enum : unsigned char
+		{
+			USE_FILTER = 0x01,
+			COLOR_BUFFER = 0x02,
+			DEPTH_BUFFER = 0x04,
+			STENCIL_BUFFER = 0x08,
+			ALL_BUFFERS = COLOR_BUFFER | DEPTH_BUFFER | STENCIL_BUFFER,
+		};
+
 		explicit Device(sw::Context *context);
 
 		virtual ~Device();
@@ -64,7 +73,7 @@
 		void setVertexShaderConstantF(unsigned int startRegister, const float *constantData, unsigned int count);
 		void setViewport(const Viewport &viewport);
 
-		bool stretchRect(sw::Surface *sourceSurface, const sw::SliceRect *sourceRect, sw::Surface *destSurface, const sw::SliceRect *destRect, bool filter);
+		bool stretchRect(sw::Surface *sourceSurface, const sw::SliceRect *sourceRect, sw::Surface *destSurface, const sw::SliceRect *destRect, unsigned char flags);
 		bool stretchCube(sw::Surface *sourceSurface, sw::Surface *destSurface);
 		void finish();
 
diff --git a/src/OpenGL/libGLESv2/Texture.cpp b/src/OpenGL/libGLESv2/Texture.cpp
index 50ab54b..deb8ed7 100644
--- a/src/OpenGL/libGLESv2/Texture.cpp
+++ b/src/OpenGL/libGLESv2/Texture.cpp
@@ -480,7 +480,7 @@
 	Device *device = getDevice();
 
 	sw::SliceRect destRect(xoffset, yoffset, xoffset + (sourceRect.x1 - sourceRect.x0), yoffset + (sourceRect.y1 - sourceRect.y0), zoffset);
-	bool success = device->stretchRect(source, &sourceRect, dest, &destRect, false);
+	bool success = device->stretchRect(source, &sourceRect, dest, &destRect, Device::ALL_BUFFERS);
 
 	if(!success)
 	{
@@ -926,7 +926,7 @@
 			return error(GL_OUT_OF_MEMORY);
 		}
 
-		getDevice()->stretchRect(image[i - 1], 0, image[i], 0, true);
+		getDevice()->stretchRect(image[i - 1], 0, image[i], 0, Device::ALL_BUFFERS | Device::USE_FILTER);
 	}
 }
 
@@ -1415,7 +1415,7 @@
 				return error(GL_OUT_OF_MEMORY);
 			}
 
-			getDevice()->stretchRect(image[f][i - 1], 0, image[f][i], 0, true);
+			getDevice()->stretchRect(image[f][i - 1], 0, image[f][i], 0, Device::ALL_BUFFERS | Device::USE_FILTER);
 		}
 	}
 }
@@ -2001,7 +2001,7 @@
 		{
 			sw::SliceRect srcRect(0, 0, srcw, srch, z);
 			sw::SliceRect dstRect(0, 0, w, h, z);
-			getDevice()->stretchRect(image[i - 1], &srcRect, image[i], &dstRect, true);
+			getDevice()->stretchRect(image[i - 1], &srcRect, image[i], &dstRect, Device::ALL_BUFFERS | Device::USE_FILTER);
 		}
 	}
 }
diff --git a/src/OpenGL/libGLESv2/libGLESv2.cpp b/src/OpenGL/libGLESv2/libGLESv2.cpp
index 2b7361d..447d800 100644
--- a/src/OpenGL/libGLESv2/libGLESv2.cpp
+++ b/src/OpenGL/libGLESv2/libGLESv2.cpp
@@ -6236,7 +6236,7 @@
 	}
 }
 
-void BlitFramebufferNV(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter)
+static void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter, bool allowPartialDepthStencilBlit)
 {
 	TRACE("(GLint srcX0 = %d, GLint srcY0 = %d, GLint srcX1 = %d, GLint srcY1 = %d, "
 	      "GLint dstX0 = %d, GLint dstY0 = %d, GLint dstX1 = %d, GLint dstY1 = %d, "
@@ -6266,10 +6266,15 @@
 			return error(GL_INVALID_OPERATION);
 		}
 
-		context->blitFramebuffer(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, mask, false);
+		context->blitFramebuffer(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, mask, false, allowPartialDepthStencilBlit);
 	}
 }
 
+void BlitFramebufferNV(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter)
+{
+	BlitFramebuffer(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, mask, filter, true);
+}
+
 void BlitFramebufferANGLE(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
                           GLbitfield mask, GLenum filter)
 {
@@ -6279,7 +6284,7 @@
 		return error(GL_INVALID_OPERATION);
 	}
 
-	glBlitFramebufferNV(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, mask, filter);
+	BlitFramebuffer(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, mask, filter, false);
 }
 
 void TexImage3DOES(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth,
diff --git a/src/OpenGL/libGLESv2/libGLESv3.cpp b/src/OpenGL/libGLESv2/libGLESv3.cpp
index f50a65c..4c86895 100644
--- a/src/OpenGL/libGLESv2/libGLESv3.cpp
+++ b/src/OpenGL/libGLESv2/libGLESv3.cpp
@@ -1447,7 +1447,7 @@
 			return error(GL_INVALID_OPERATION);
 		}
 
-		context->blitFramebuffer(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, mask, filter == GL_LINEAR);
+		context->blitFramebuffer(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, mask, filter == GL_LINEAR, true);
 	}
 }
 
diff --git a/src/Renderer/Blitter.cpp b/src/Renderer/Blitter.cpp
index 9ee53de..4608641 100644
--- a/src/Renderer/Blitter.cpp
+++ b/src/Renderer/Blitter.cpp
@@ -40,9 +40,17 @@
 		blit(&color, sRect, dest, dRect, clearOptions);
 	}
 
-	void Blitter::blit(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, bool filter)
+	void Blitter::blit(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil)
 	{
-		Blitter::Options options = filter ? static_cast<Blitter::Options>(WRITE_RGBA | FILTER_LINEAR) : WRITE_RGBA;
+		Blitter::Options options = WRITE_RGBA;
+		if(filter)
+		{
+			options = static_cast<Blitter::Options>(options | FILTER_LINEAR);
+		}
+		if(isStencil)
+		{
+			options = static_cast<Blitter::Options>(options | USE_STENCIL);
+		}
 		blit(source, sRect, dest, dRect, options);
 	}
 
@@ -321,6 +329,9 @@
 		case FORMAT_D32FS8_SHADOW:
 			c.x = *Pointer<Float>(element);
 			break;
+		case FORMAT_S8:
+			c.x = Float(Int(*Pointer<Byte>(element)));
+			break;
 		default:
 			return false;
 		}
@@ -689,6 +700,9 @@
 		case FORMAT_D32FS8_SHADOW:
 			*Pointer<Float>(element) = c.x;
 			break;
+		case FORMAT_S8:
+			*Pointer<Byte>(element) = Byte(RoundInt(Float(c.x)));
+			break;
 		default:
 			return false;
 		}
@@ -978,7 +992,8 @@
 		case FORMAT_D32F_LOCKABLE:
 		case FORMAT_D32FS8_TEXTURE:
 		case FORMAT_D32FS8_SHADOW:
-			scale = vector(1.0f, 0.0f, 0.0f, 0.0f);
+		case FORMAT_S8:
+			scale = vector(1.0f, 1.0f, 1.0f, 1.0f);
 			break;
 		default:
 			return false;
@@ -1036,6 +1051,12 @@
 		return true;
 	}
 
+	Int Blitter::ComputeOffset(Int& x, Int& y, Int& pitchB, int bytes, bool quadLayout)
+	{
+		return (quadLayout ? (y & Int(~1)) : y) * pitchB +
+		       (quadLayout ? ((y & Int(1)) << 1) + (x * 2) - (x & Int(1)) : x) * bytes;
+	}
+
 	Routine *Blitter::generate(BlitState &state)
 	{
 		Function<Void(Pointer<Byte>)> function;
@@ -1063,6 +1084,10 @@
 			bool intSrc = Surface::isNonNormalizedInteger(state.sourceFormat);
 			bool intDst = Surface::isNonNormalizedInteger(state.destFormat);
 			bool intBoth = intSrc && intDst;
+			bool srcQuadLayout = Surface::hasQuadLayout(state.sourceFormat);
+			bool dstQuadLayout = Surface::hasQuadLayout(state.destFormat);
+			int srcBytes = Surface::bytes(state.sourceFormat);
+			int dstBytes = Surface::bytes(state.destFormat);
 
 			bool hasConstantColorI = false;
 			Int4 constantColorI;
@@ -1098,11 +1123,11 @@
 			For(Int j = y0d, j < y1d, j++)
 			{
 				Float x = x0;
-				Pointer<Byte> destLine = dest + j * dPitchB;
+				Pointer<Byte> destLine = dest + (dstQuadLayout ? j & Int(~1) : j) * dPitchB;
 
 				For(Int i = x0d, i < x1d, i++)
 				{
-					Pointer<Byte> d = destLine + i * Surface::bytes(state.destFormat);
+					Pointer<Byte> d = destLine + (dstQuadLayout ? (((j & Int(1)) << 1) + (i * 2) - (i & Int(1))) : i) * dstBytes;
 					if(hasConstantColorI)
 					{
 						if(!write(constantColorI, d, state.destFormat, state.options))
@@ -1120,7 +1145,11 @@
 					else if(intBoth) // Integer types do not support filtering
 					{
 						Int4 color; // When both formats are true integer types, we don't go to float to avoid losing precision
-						Pointer<Byte> s = source + Int(y) * sPitchB + Int(x) * Surface::bytes(state.sourceFormat);
+						Int X = Int(x);
+						Int Y = Int(y);
+
+						Pointer<Byte> s = source + ComputeOffset(X, Y, sPitchB, srcBytes, srcQuadLayout);
+
 						if(!read(color, s, state.sourceFormat))
 						{
 							return nullptr;
@@ -1140,7 +1169,7 @@
 							Int X = Int(x);
 							Int Y = Int(y);
 
-							Pointer<Byte> s = source + Y * sPitchB + X * Surface::bytes(state.sourceFormat);
+							Pointer<Byte> s = source + ComputeOffset(X, Y, sPitchB, srcBytes, srcQuadLayout);
 
 							if(!read(color, s, state.sourceFormat))
 							{
@@ -1158,10 +1187,10 @@
 							Int X1 = IfThenElse(X0 + 1 >= sWidth, X0, X0 + 1);
 							Int Y1 = IfThenElse(Y0 + 1 >= sHeight, Y0, Y0 + 1);
 
-							Pointer<Byte> s00 = source + Y0 * sPitchB + X0 * Surface::bytes(state.sourceFormat);
-							Pointer<Byte> s01 = source + Y0 * sPitchB + X1 * Surface::bytes(state.sourceFormat);
-							Pointer<Byte> s10 = source + Y1 * sPitchB + X0 * Surface::bytes(state.sourceFormat);
-							Pointer<Byte> s11 = source + Y1 * sPitchB + X1 * Surface::bytes(state.sourceFormat);
+							Pointer<Byte> s00 = source + ComputeOffset(X0, Y0, sPitchB, srcBytes, srcQuadLayout);
+							Pointer<Byte> s01 = source + ComputeOffset(X1, Y0, sPitchB, srcBytes, srcQuadLayout);
+							Pointer<Byte> s10 = source + ComputeOffset(X0, Y1, sPitchB, srcBytes, srcQuadLayout);
+							Pointer<Byte> s11 = source + ComputeOffset(X1, Y1, sPitchB, srcBytes, srcQuadLayout);
 
 							Float4 c00; if(!read(c00, s00, state.sourceFormat)) return nullptr;
 							Float4 c01; if(!read(c01, s01, state.sourceFormat)) return nullptr;
@@ -1214,9 +1243,10 @@
 
 		bool useSourceInternal = !source->isExternalDirty();
 		bool useDestInternal = !dest->isExternalDirty();
+		bool isStencil = ((options & USE_STENCIL) == USE_STENCIL);
 
-		state.sourceFormat = source->getFormat(useSourceInternal);
-		state.destFormat = dest->getFormat(useDestInternal);
+		state.sourceFormat = isStencil ? source->getStencilFormat() : source->getFormat(useSourceInternal);
+		state.destFormat = isStencil ? dest->getStencilFormat() : dest->getFormat(useDestInternal);
 		state.options = options;
 
 		criticalSection.lock();
@@ -1244,10 +1274,12 @@
 		bool isRGBA = ((options & WRITE_RGBA) == WRITE_RGBA);
 		bool isEntireDest = dest->isEntire(destRect);
 
-		data.source = source->lock(0, 0, sourceRect.slice, sw::LOCK_READONLY, sw::PUBLIC, useSourceInternal);
-		data.dest = dest->lock(0, 0, destRect.slice, isRGBA ? (isEntireDest ? sw::LOCK_DISCARD : sw::LOCK_WRITEONLY) : sw::LOCK_READWRITE, sw::PUBLIC, useDestInternal);
-		data.sPitchB = source->getPitchB(useSourceInternal);
-		data.dPitchB = dest->getPitchB(useDestInternal);
+		data.source = isStencil ? source->lockStencil(0, 0, 0, sw::PUBLIC) :
+		                          source->lock(0, 0, sourceRect.slice, sw::LOCK_READONLY, sw::PUBLIC, useSourceInternal);
+		data.dest = isStencil ? dest->lockStencil(0, 0, 0, sw::PUBLIC) :
+		                        dest->lock(0, 0, destRect.slice, isRGBA ? (isEntireDest ? sw::LOCK_DISCARD : sw::LOCK_WRITEONLY) : sw::LOCK_READWRITE, sw::PUBLIC, useDestInternal);
+		data.sPitchB = isStencil ? source->getStencilPitchB() : source->getPitchB(useSourceInternal);
+		data.dPitchB = isStencil ? dest->getStencilPitchB() : dest->getPitchB(useDestInternal);
 
 		data.w = 1.0f / (dRect.x1 - dRect.x0) * (sRect.x1 - sRect.x0);
 		data.h = 1.0f / (dRect.y1 - dRect.y0) * (sRect.y1 - sRect.y0);
@@ -1264,8 +1296,16 @@
 
 		blitFunction(&data);
 
-		source->unlock(useSourceInternal);
-		dest->unlock(useDestInternal);
+		if(isStencil)
+		{
+			source->unlockStencil();
+			dest->unlockStencil();
+		}
+		else
+		{
+			source->unlock(useSourceInternal);
+			dest->unlock(useDestInternal);
+		}
 
 		return true;
 	}
diff --git a/src/Renderer/Blitter.hpp b/src/Renderer/Blitter.hpp
index 8353ab9..366d5c3 100644
--- a/src/Renderer/Blitter.hpp
+++ b/src/Renderer/Blitter.hpp
@@ -34,7 +34,8 @@
 			WRITE_ALPHA = 0x08,
 			WRITE_RGBA = WRITE_RED | WRITE_GREEN | WRITE_BLUE | WRITE_ALPHA,
 			FILTER_LINEAR = 0x10,
-			CLEAR_OPERATION = 0x20
+			CLEAR_OPERATION = 0x20,
+			USE_STENCIL = 0x40,
 		};
 
 		struct BlitState
@@ -76,7 +77,7 @@
 		virtual ~Blitter();
 
 		void clear(void* pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask);
-		void blit(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, bool filter);
+		void blit(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil = false);
 		void blit3D(Surface *source, Surface *dest);
 
 	private:
@@ -86,6 +87,7 @@
 		bool write(Int4 &color, Pointer<Byte> element, Format format, const Blitter::Options& options);
 		static bool GetScale(float4& scale, Format format);
 		static bool ApplyScaleAndClamp(Float4& value, const BlitState& state);
+		static Int ComputeOffset(Int& x, Int& y, Int& pitchB, int bytes, bool quadLayout);
 		void blit(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, const Blitter::Options& options);
 		bool blitReactor(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, const Blitter::Options& options);
 		Routine *generate(BlitState &state);
diff --git a/src/Renderer/PixelProcessor.cpp b/src/Renderer/PixelProcessor.cpp
index 76356e4..172e8ef 100644
--- a/src/Renderer/PixelProcessor.cpp
+++ b/src/Renderer/PixelProcessor.cpp
@@ -972,9 +972,7 @@
 		{
 			state.depthTestActive = true;
 			state.depthCompareMode = context->depthCompareMode;
-			state.quadLayoutDepthBuffer = context->depthBuffer->getInternalFormat() != FORMAT_D32F_LOCKABLE &&
-			                              context->depthBuffer->getInternalFormat() != FORMAT_D32FS8_TEXTURE &&
-			                              context->depthBuffer->getInternalFormat() != FORMAT_D32FS8_SHADOW;
+			state.quadLayoutDepthBuffer = Surface::hasQuadLayout(context->depthBuffer->getInternalFormat());
 		}
 
 		state.occlusionEnabled = context->occlusionEnabled;
diff --git a/src/Renderer/Renderer.cpp b/src/Renderer/Renderer.cpp
index 0700a88..b51cbd3 100644
--- a/src/Renderer/Renderer.cpp
+++ b/src/Renderer/Renderer.cpp
@@ -207,9 +207,9 @@
 		blitter.clear(pixel, format, dest, dRect, rgbaMask);
 	}
 
-	void Renderer::blit(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, bool filter)
+	void Renderer::blit(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil)
 	{
-		blitter.blit(source, sRect, dest, dRect, filter);
+		blitter.blit(source, sRect, dest, dRect, filter, isStencil);
 	}
 
 	void Renderer::blit3D(Surface *source, Surface *dest)
diff --git a/src/Renderer/Renderer.hpp b/src/Renderer/Renderer.hpp
index 334f814..e4a83e1 100644
--- a/src/Renderer/Renderer.hpp
+++ b/src/Renderer/Renderer.hpp
@@ -322,7 +322,7 @@
 		void operator delete(void * mem);
 
 		void clear(void* pixel, Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask);
-		void blit(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, bool filter);
+		void blit(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil = false);
 		void blit3D(Surface *source, Surface *dest);
 		void draw(DrawType drawType, unsigned int indexOffset, unsigned int count, bool update = true);
 
diff --git a/src/Renderer/Surface.cpp b/src/Renderer/Surface.cpp
index d972574..02ad43b 100644
--- a/src/Renderer/Surface.cpp
+++ b/src/Renderer/Surface.cpp
@@ -2629,6 +2629,34 @@
 		}
 	}
 
+	bool Surface::hasQuadLayout(Format format)
+	{
+		switch(format)
+		{
+		case FORMAT_D32:
+		case FORMAT_D16:
+		case FORMAT_D24X8:
+		case FORMAT_D24S8:
+		case FORMAT_D24FS8:
+		case FORMAT_D32F:
+		case FORMAT_D32F_COMPLEMENTARY:
+		case FORMAT_DF24S8:
+		case FORMAT_DF16S8:
+		case FORMAT_INTZ:
+		case FORMAT_S8:
+		case FORMAT_A8G8R8B8Q:
+		case FORMAT_X8G8R8B8Q:
+			return true;
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32FS8_SHADOW:
+		default:
+			break;
+		}
+
+		return false;
+	}
+
 	bool Surface::isPalette(Format format)
 	{
 		switch(format)
@@ -2686,6 +2714,7 @@
 		case FORMAT_A8:
 		case FORMAT_R8I:
 		case FORMAT_R8:
+		case FORMAT_S8:
 		case FORMAT_L8:
 		case FORMAT_L16:
 		case FORMAT_A8L8:
diff --git a/src/Renderer/Surface.hpp b/src/Renderer/Surface.hpp
index dd96ffd..5a5fe10 100644
--- a/src/Renderer/Surface.hpp
+++ b/src/Renderer/Surface.hpp
@@ -327,6 +327,7 @@
 
 		static bool isStencil(Format format);
 		static bool isDepth(Format format);
+		static bool hasQuadLayout(Format format);
 		static bool isPalette(Format format);
 
 		static bool isFloatFormat(Format format);