Separate image depth and sample count.

Previously, multisampled images used the 'depth' member of 3D images
or 2D arrays as the number of samples. This caused rendering to a
layer of a 2D array to be interpreted as rendering to a multisampled
render target. This change adds a 'samples' member which is orthogonal
to 'depth'.

Note that write operations put the same color into each of the samples,
while read operations (still) assume multisampled images have been
resolved into the first slice.

Change-Id: Ib33a0cf8194e19fcbb569b0c257ba1e1bd9c4821
Reviewed-on: https://swiftshader-review.googlesource.com/14808
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Renderer/Blitter.cpp b/src/Renderer/Blitter.cpp
index a6773fc..7e7ca5e 100644
--- a/src/Renderer/Blitter.cpp
+++ b/src/Renderer/Blitter.cpp
@@ -30,7 +30,7 @@
 		delete blitCache;
 	}
 
-	void Blitter::clear(void* pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask)
+	void Blitter::clear(void *pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask)
 	{
 		if(fastClear(pixel, format, dest, dRect, rgbaMask))
 		{
@@ -44,7 +44,7 @@
 		delete color;
 	}
 
-	bool Blitter::fastClear(void* pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask)
+	bool Blitter::fastClear(void *pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask)
 	{
 		if(format != FORMAT_A32B32G32R32F)
 		{
@@ -99,26 +99,33 @@
 			return false;
 		}
 
-		uint8_t *d = (uint8_t*)dest->lockInternal(dRect.x0, dRect.y0, dRect.slice, sw::LOCK_WRITEONLY, sw::PUBLIC);
+		uint8_t *slice = (uint8_t*)dest->lockInternal(dRect.x0, dRect.y0, dRect.slice, sw::LOCK_WRITEONLY, sw::PUBLIC);
 
-		switch(Surface::bytes(dest->getFormat()))
+		for(int j = 0; j < dest->getSamples(); j++)
 		{
-		case 2:
-			for(int i = dRect.y0; i < dRect.y1; i++)
+			uint8_t *d = slice;
+
+			switch(Surface::bytes(dest->getFormat()))
 			{
-				sw::clear((uint16_t*)d, packed, dRect.x1 - dRect.x0);
-				d += dest->getInternalPitchB();
+			case 2:
+				for(int i = dRect.y0; i < dRect.y1; i++)
+				{
+					sw::clear((uint16_t*)d, packed, dRect.x1 - dRect.x0);
+					d += dest->getInternalPitchB();
+				}
+				break;
+			case 4:
+				for(int i = dRect.y0; i < dRect.y1; i++)
+				{
+					sw::clear((uint32_t*)d, packed, dRect.x1 - dRect.x0);
+					d += dest->getInternalPitchB();
+				}
+				break;
+			default:
+				assert(false);
 			}
-			break;
-		case 4:
-			for(int i = dRect.y0; i < dRect.y1; i++)
-			{
-				sw::clear((uint32_t*)d, packed, dRect.x1 - dRect.x0);
-				d += dest->getInternalPitchB();
-			}
-			break;
-		default:
-			assert(false);
+
+			slice += dest->getInternalSliceB();
 		}
 
 		dest->unlockInternal();
@@ -1201,6 +1208,7 @@
 				For(Int i = x0d, i < x1d, i++)
 				{
 					Pointer<Byte> d = destLine + (dstQuadLayout ? (((j & Int(1)) << 1) + (i * 2) - (i & Int(1))) : RValue<Int>(i)) * dstBytes;
+
 					if(hasConstantColorI)
 					{
 						if(!write(constantColorI, d, state.destFormat, state.options))
@@ -1281,10 +1289,20 @@
 							        (c10 * ix + c11 * fx) * fy;
 						}
 
-						if(!ApplyScaleAndClamp(color, state) || !write(color, d, state.destFormat, state.options))
+						if(!ApplyScaleAndClamp(color, state))
 						{
 							return nullptr;
 						}
+
+						for(int s = 0; s < state.destSamples; s++)
+						{
+							if(!write(color, d, state.destFormat, state.options))
+							{
+								return nullptr;
+							}
+
+							d += *Pointer<Int>(blit + OFFSET(BlitData,dSliceB));
+						}
 					}
 
 					if(!hasConstantColorI && !hasConstantColorF) { x += w; }
@@ -1322,6 +1340,7 @@
 
 		state.sourceFormat = isStencil ? source->getStencilFormat() : source->getFormat(useSourceInternal);
 		state.destFormat = isStencil ? dest->getStencilFormat() : dest->getFormat(useDestInternal);
+		state.destSamples = dest->getSamples();
 		state.options = options;
 
 		criticalSection.lock();
@@ -1355,6 +1374,7 @@
 		                        dest->lock(0, 0, destRect.slice, isRGBA ? (isEntireDest ? sw::LOCK_DISCARD : sw::LOCK_WRITEONLY) : sw::LOCK_READWRITE, sw::PUBLIC, useDestInternal);
 		data.sPitchB = isStencil ? source->getStencilPitchB() : source->getPitchB(useSourceInternal);
 		data.dPitchB = isStencil ? dest->getStencilPitchB() : dest->getPitchB(useDestInternal);
+		data.dSliceB = isStencil ? dest->getStencilSliceB() : dest->getSliceB(useDestInternal);
 
 		data.w = sRect.width() / dRect.width();
 		data.h = sRect.height() / dRect.height();
diff --git a/src/Renderer/Blitter.hpp b/src/Renderer/Blitter.hpp
index ce7c789..6e87db8 100644
--- a/src/Renderer/Blitter.hpp
+++ b/src/Renderer/Blitter.hpp
@@ -47,6 +47,7 @@
 
 			Format sourceFormat;
 			Format destFormat;
+			int destSamples;
 			Blitter::Options options;
 		};
 
@@ -56,6 +57,7 @@
 			void *dest;
 			int sPitchB;
 			int dPitchB;
+			int dSliceB;
 
 			float x0;
 			float y0;
@@ -75,12 +77,12 @@
 		Blitter();
 		virtual ~Blitter();
 
-		void clear(void* pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask);
+		void clear(void *pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask);
 		void blit(Surface *source, const SliceRectF &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil = false);
 		void blit3D(Surface *source, Surface *dest);
 
 	private:
-		bool fastClear(void* pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask);
+		bool fastClear(void *pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask);
 
 		bool read(Float4 &color, Pointer<Byte> element, Format format);
 		bool write(Float4 &color, Pointer<Byte> element, Format format, const Blitter::Options& options);
diff --git a/src/Renderer/Renderer.cpp b/src/Renderer/Renderer.cpp
index fd0c340..0b95c72 100644
--- a/src/Renderer/Renderer.cpp
+++ b/src/Renderer/Renderer.cpp
@@ -606,7 +606,8 @@
 
 					if(draw->renderTarget[index])
 					{
-						data->colorBuffer[index] = (unsigned int*)context->renderTarget[index]->lockInternal(0, 0, q * ms, LOCK_READWRITE, MANAGED);
+						data->colorBuffer[index] = (unsigned int*)context->renderTarget[index]->lockInternal(0, 0, 0, LOCK_READWRITE, MANAGED);
+						data->colorBuffer[index] += q * ms * context->renderTarget[index]->getSliceB(true);
 						data->colorPitchB[index] = context->renderTarget[index]->getInternalPitchB();
 						data->colorSliceB[index] = context->renderTarget[index]->getInternalSliceB();
 					}
@@ -617,14 +618,16 @@
 
 				if(draw->depthBuffer)
 				{
-					data->depthBuffer = (float*)context->depthBuffer->lockInternal(0, 0, q * ms, LOCK_READWRITE, MANAGED);
+					data->depthBuffer = (float*)context->depthBuffer->lockInternal(0, 0, 0, LOCK_READWRITE, MANAGED);
+					data->depthBuffer += q * ms * context->depthBuffer->getSliceB(true);
 					data->depthPitchB = context->depthBuffer->getInternalPitchB();
 					data->depthSliceB = context->depthBuffer->getInternalSliceB();
 				}
 
 				if(draw->stencilBuffer)
 				{
-					data->stencilBuffer = (unsigned char*)context->stencilBuffer->lockStencil(0, 0, q * ms, MANAGED);
+					data->stencilBuffer = (unsigned char*)context->stencilBuffer->lockStencil(0, 0, 0, MANAGED);
+					data->stencilBuffer += q * ms * context->stencilBuffer->getSliceB(true);
 					data->stencilPitchB = context->stencilBuffer->getStencilPitchB();
 					data->stencilSliceB = context->stencilBuffer->getStencilSliceB();
 				}
@@ -673,13 +676,7 @@
 
 	void Renderer::clear(void *value, Format format, Surface *dest, const Rect &clearRect, unsigned int rgbaMask)
 	{
-		SliceRect rect = clearRect;
-		int samples = dest->getDepth();
-
-		for(rect.slice = 0; rect.slice < samples; rect.slice++)
-		{
-			blitter->clear(value, format, dest, rect, rgbaMask);
-		}
+		blitter->clear(value, format, dest, clearRect, rgbaMask);
 	}
 
 	void Renderer::blit(Surface *source, const SliceRectF &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil)
diff --git a/src/Renderer/Surface.cpp b/src/Renderer/Surface.cpp
index b15102d..b758bad 100644
--- a/src/Renderer/Surface.cpp
+++ b/src/Renderer/Surface.cpp
@@ -44,16 +44,24 @@
 
 	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
 	{
-		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB + z * sliceB;
+		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
 
-		write(element, color);
+		for(int i = 0; i < samples; i++)
+		{
+			write(element, color);
+			element += sliceB;
+		}
 	}
 
 	void Surface::Buffer::write(int x, int y, const Color<float> &color)
 	{
-		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB;
+		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB;
 
-		write(element, color);
+		for(int i = 0; i < samples; i++)
+		{
+			write(element, color);
+			element += sliceB;
+		}
 	}
 
 	inline void Surface::Buffer::write(void *element, const Color<float> &color)
@@ -365,7 +373,7 @@
 
 	Color<float> Surface::Buffer::read(int x, int y, int z) const
 	{
-		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB + z * sliceB;
+		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
 
 		return read(element);
 	}
@@ -1039,7 +1047,7 @@
 		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
 	}
 
-	Color<float> Surface::Buffer::sample(float x, float y) const
+	Color<float> Surface::Buffer::sample(float x, float y, int layer) const
 	{
 		x -= 0.5f;
 		y -= 0.5f;
@@ -1050,10 +1058,10 @@
 		int y0 = clamp((int)y, 0, height - 1);
 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
 
-		Color<float> c00 = read(x0, y0);
-		Color<float> c10 = read(x1, y0);
-		Color<float> c01 = read(x0, y1);
-		Color<float> c11 = read(x1, y1);
+		Color<float> c00 = read(x0, y0, layer);
+		Color<float> c10 = read(x1, y0, layer);
+		Color<float> c01 = read(x0, y1, layer);
+		Color<float> c11 = read(x1, y1, layer);
 
 		float fx = x - x0;
 		float fy = y - y0;
@@ -1156,11 +1164,11 @@
 			case FORMAT_ATI2:
 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
 			default:
-				return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
+				return (unsigned char*)buffer + x * bytes + y * pitchB + z * samples * sliceB;
 			}
 		}
 
-		return 0;
+		return nullptr;
 	}
 
 	void Surface::Buffer::unlockRect()
@@ -1173,8 +1181,8 @@
 	public:
 		SurfaceImplementation(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
 			: Surface(width, height, depth, format, pixels, pitch, slice) {}
-		SurfaceImplementation(Resource *texture, int width, int height, int depth, int border, Format format, bool lockable, bool renderTarget, int pitchP = 0)
-			: Surface(texture, width, height, depth, border, format, lockable, renderTarget, pitchP) {}
+		SurfaceImplementation(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0)
+			: Surface(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchP) {}
 		~SurfaceImplementation() override {};
 
 		void *lockInternal(int x, int y, int z, Lock lock, Accessor client) override
@@ -1193,9 +1201,9 @@
 		return new SurfaceImplementation(width, height, depth, format, pixels, pitch, slice);
 	}
 
-	Surface *Surface::create(Resource *texture, int width, int height, int depth, int border, Format format, bool lockable, bool renderTarget, int pitchPprovided)
+	Surface *Surface::create(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided)
 	{
-		return new SurfaceImplementation(texture, width, height, depth, border, format, lockable, renderTarget, pitchPprovided);
+		return new SurfaceImplementation(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchPprovided);
 	}
 
 	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
@@ -1209,6 +1217,7 @@
 		external.width = width;
 		external.height = height;
 		external.depth = depth;
+		external.samples = 1;
 		external.format = format;
 		external.bytes = bytes(external.format);
 		external.pitchB = pitch;
@@ -1223,6 +1232,7 @@
 		internal.width = width;
 		internal.height = height;
 		internal.depth = depth;
+		internal.samples = 1;
 		internal.format = selectInternalFormat(format);
 		internal.bytes = bytes(internal.format);
 		internal.pitchB = pitchB(internal.width, 0, internal.format, false);
@@ -1237,6 +1247,7 @@
 		stencil.width = width;
 		stencil.height = height;
 		stencil.depth = depth;
+		stencil.samples = 1;
 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
 		stencil.bytes = bytes(stencil.format);
 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, false);
@@ -1251,17 +1262,19 @@
 		paletteUsed = 0;
 	}
 
-	Surface::Surface(Resource *texture, int width, int height, int depth, int border, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
+	Surface::Surface(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
 	{
 		resource = texture ? texture : new Resource(0);
 		hasParent = texture != nullptr;
 		ownExternal = true;
 		depth = max(1, depth);
+		samples = max(1, samples);
 
 		external.buffer = nullptr;
 		external.width = width;
 		external.height = height;
 		external.depth = depth;
+		external.samples = (short)samples;
 		external.format = format;
 		external.bytes = bytes(external.format);
 		external.pitchB = pitchB(external.width, 0, external.format, renderTarget && !texture);
@@ -1276,13 +1289,14 @@
 		internal.width = width;
 		internal.height = height;
 		internal.depth = depth;
+		internal.samples = (short)samples;
 		internal.format = selectInternalFormat(format);
 		internal.bytes = bytes(internal.format);
 		internal.pitchB = !pitchPprovided ? pitchB(internal.width, border, internal.format, renderTarget) : pitchPprovided * internal.bytes;
 		internal.pitchP = !pitchPprovided ? pitchP(internal.width, border, internal.format, renderTarget) : pitchPprovided;
 		internal.sliceB = sliceB(internal.width, internal.height, border, internal.format, renderTarget);
 		internal.sliceP = sliceP(internal.width, internal.height, border, internal.format, renderTarget);
-		internal.border = border;
+		internal.border = (short)border;
 		internal.lock = LOCK_UNLOCKED;
 		internal.dirty = false;
 
@@ -1290,6 +1304,7 @@
 		stencil.width = width;
 		stencil.height = height;
 		stencil.depth = depth;
+		stencil.samples = (short)samples;
 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
 		stencil.bytes = bytes(stencil.format);
 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, renderTarget);
@@ -1344,7 +1359,7 @@
 			}
 			else
 			{
-				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.border, external.format);
+				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.border, external.samples, external.format);
 			}
 		}
 
@@ -1396,7 +1411,7 @@
 			}
 			else
 			{
-				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.border, internal.format);
+				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.border, internal.samples, internal.format);
 			}
 		}
 
@@ -1471,7 +1486,7 @@
 
 		if(!stencil.buffer)
 		{
-			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.border, stencil.format);
+			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.border, stencil.samples, stencil.format);
 		}
 
 		return stencil.lockRect(x, y, front, LOCK_READWRITE);   // FIXME
@@ -1931,17 +1946,21 @@
 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_WRITEONLY);
 
-		for(int z = 0; z < destination.depth && z < source.depth; z++)
+		int depth = min(destination.depth, source.depth);
+		int height = min(destination.height, source.height);
+		int width = min(destination.width, source.width);
+
+		for(int z = 0; z < depth; z++)
 		{
 			unsigned char *sourceRow = sourceSlice;
 			unsigned char *destinationRow = destinationSlice;
 
-			for(int y = 0; y < destination.height && y < source.height; y++)
+			for(int y = 0; y < height; y++)
 			{
 				unsigned char *sourceElement = sourceRow;
 				unsigned char *destinationElement = destinationRow;
 
-				for(int x = 0; x < destination.width && x < source.width; x++)
+				for(int x = 0; x < width; x++)
 				{
 					unsigned int b = sourceElement[0];
 					unsigned int g = sourceElement[1];
@@ -1970,17 +1989,21 @@
 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_WRITEONLY);
 
-		for(int z = 0; z < destination.depth && z < source.depth; z++)
+		int depth = min(destination.depth, source.depth);
+		int height = min(destination.height, source.height);
+		int width = min(destination.width, source.width);
+
+		for(int z = 0; z < depth; z++)
 		{
 			unsigned char *sourceRow = sourceSlice;
 			unsigned char *destinationRow = destinationSlice;
 
-			for(int y = 0; y < destination.height && y < source.height; y++)
+			for(int y = 0; y < height; y++)
 			{
 				unsigned char *sourceElement = sourceRow;
 				unsigned char *destinationElement = destinationRow;
 
-				for(int x = 0; x < destination.width && x < source.width; x++)
+				for(int x = 0; x < width; x++)
 				{
 					unsigned int xrgb = *(unsigned short*)sourceElement;
 
@@ -2011,17 +2034,21 @@
 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_WRITEONLY);
 
-		for(int z = 0; z < destination.depth && z < source.depth; z++)
+		int depth = min(destination.depth, source.depth);
+		int height = min(destination.height, source.height);
+		int width = min(destination.width, source.width);
+
+		for(int z = 0; z < depth; z++)
 		{
 			unsigned char *sourceRow = sourceSlice;
 			unsigned char *destinationRow = destinationSlice;
 
-			for(int y = 0; y < destination.height && y < source.height; y++)
+			for(int y = 0; y < height; y++)
 			{
 				unsigned char *sourceElement = sourceRow;
 				unsigned char *destinationElement = destinationRow;
 
-				for(int x = 0; x < destination.width && x < source.width; x++)
+				for(int x = 0; x < width; x++)
 				{
 					unsigned int argb = *(unsigned short*)sourceElement;
 
@@ -2053,17 +2080,21 @@
 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_WRITEONLY);
 
-		for(int z = 0; z < destination.depth && z < source.depth; z++)
+		int depth = min(destination.depth, source.depth);
+		int height = min(destination.height, source.height);
+		int width = min(destination.width, source.width);
+
+		for(int z = 0; z < depth; z++)
 		{
 			unsigned char *sourceRow = sourceSlice;
 			unsigned char *destinationRow = destinationSlice;
 
-			for(int y = 0; y < destination.height && y < source.height; y++)
+			for(int y = 0; y < height; y++)
 			{
 				unsigned char *sourceElement = sourceRow;
 				unsigned char *destinationElement = destinationRow;
 
-				for(int x = 0; x < destination.width && x < source.width; x++)
+				for(int x = 0; x < width; x++)
 				{
 					unsigned int xrgb = *(unsigned short*)sourceElement;
 
@@ -2094,17 +2125,21 @@
 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_WRITEONLY);
 
-		for(int z = 0; z < destination.depth && z < source.depth; z++)
+		int depth = min(destination.depth, source.depth);
+		int height = min(destination.height, source.height);
+		int width = min(destination.width, source.width);
+
+		for(int z = 0; z < depth; z++)
 		{
 			unsigned char *sourceRow = sourceSlice;
 			unsigned char *destinationRow = destinationSlice;
 
-			for(int y = 0; y < destination.height && y < source.height; y++)
+			for(int y = 0; y < height; y++)
 			{
 				unsigned char *sourceElement = sourceRow;
 				unsigned char *destinationElement = destinationRow;
 
-				for(int x = 0; x < destination.width && x < source.width; x++)
+				for(int x = 0; x < width; x++)
 				{
 					unsigned int argb = *(unsigned short*)sourceElement;
 
@@ -2136,17 +2171,21 @@
 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_WRITEONLY);
 
-		for(int z = 0; z < destination.depth && z < source.depth; z++)
+		int depth = min(destination.depth, source.depth);
+		int height = min(destination.height, source.height);
+		int width = min(destination.width, source.width);
+
+		for(int z = 0; z < depth; z++)
 		{
 			unsigned char *sourceRow = sourceSlice;
 			unsigned char *destinationRow = destinationSlice;
 
-			for(int y = 0; y < destination.height && y < source.height; y++)
+			for(int y = 0; y < height; y++)
 			{
 				unsigned char *sourceElement = sourceRow;
 				unsigned char *destinationElement = destinationRow;
 
-				for(int x = 0; x < destination.width && x < source.width; x++)
+				for(int x = 0; x < width; x++)
 				{
 					unsigned int abgr = palette[*(unsigned char*)sourceElement];
 
@@ -2581,7 +2620,7 @@
 	{
 	}
 
-	unsigned int Surface::size(int width, int height, int depth, int border, Format format)
+	unsigned int Surface::size(int width, int height, int depth, int border, int samples, Format format)
 	{
 		width += 2 * border;
 		height += 2 * border;
@@ -2667,7 +2706,7 @@
 				return YSize + 2 * CSize;
 			}
 		default:
-			return bytes(format) * width * height * depth;
+			return bytes(format) * width * height * depth * samples;
 		}
 	}
 
@@ -3184,7 +3223,7 @@
 		return 1;
 	}
 
-	void *Surface::allocateBuffer(int width, int height, int depth, int border, Format format)
+	void *Surface::allocateBuffer(int width, int height, int depth, int border, int samples, Format format)
 	{
 		// Render targets require 2x2 quads
 		int width2 = (width + 1) & ~1;
@@ -3193,7 +3232,7 @@
 		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
 		// and stencil operations also read 8 bytes per four 8-bit stencil values,
 		// so we have to allocate 4 extra bytes to avoid buffer overruns.
-		return allocate(size(width2, height2, depth, border, format) + 4);
+		return allocate(size(width2, height2, depth, border, samples, format) + 4);
 	}
 
 	void Surface::memfill4(void *buffer, int pattern, int bytes)
@@ -3308,7 +3347,7 @@
 		{
 			float *target = (float*)lockInternal(0, 0, 0, lock, PUBLIC) + x0 + width2 * y0;
 
-			for(int z = 0; z < internal.depth; z++)
+			for(int z = 0; z < internal.samples; z++)
 			{
 				for(int y = y0; y < y1; y++)
 				{
@@ -3333,7 +3372,7 @@
 			int evenX0 = ((x0 + 1) & ~1) * 2;
 			int evenBytes = (oddX1 - evenX0) * sizeof(float);
 
-			for(int z = 0; z < internal.depth; z++)
+			for(int z = 0; z < internal.samples; z++)
 			{
 				for(int y = y0; y < y1; y++)
 				{
@@ -3437,7 +3476,7 @@
 		char *buffer = (char*)lockStencil(0, 0, 0, PUBLIC);
 
 		// Stencil buffers are assumed to use quad layout
-		for(int z = 0; z < stencil.depth; z++)
+		for(int z = 0; z < stencil.samples; z++)
 		{
 			for(int y = y0; y < y1; y++)
 			{
@@ -3534,7 +3573,7 @@
 		}
 	}
 
-	void Surface::copyInternal(const Surface* source, int x, int y, float srcX, float srcY, bool filter)
+	void Surface::copyInternal(const Surface *source, int x, int y, float srcX, float srcY, bool filter)
 	{
 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
 
@@ -3542,17 +3581,17 @@
 
 		if(!filter)
 		{
-			color = source->internal.read((int)srcX, (int)srcY);
+			color = source->internal.read((int)srcX, (int)srcY, 0);
 		}
 		else   // Bilinear filtering
 		{
-			color = source->internal.sample(srcX, srcY);
+			color = source->internal.sample(srcX, srcY, 0);
 		}
 
 		internal.write(x, y, color);
 	}
 
-	void Surface::copyInternal(const Surface* source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
+	void Surface::copyInternal(const Surface *source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
 	{
 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
 
@@ -3688,7 +3727,8 @@
 		       external.depth  == internal.depth &&
 		       external.pitchB == internal.pitchB &&
 		       external.sliceB == internal.sliceB &&
-		       external.border == internal.border;
+		       external.border == internal.border &&
+		       external.samples == internal.samples;
 	}
 
 	Format Surface::selectInternalFormat(Format format) const
@@ -3947,11 +3987,13 @@
 
 	void Surface::resolve()
 	{
-		if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
+		if(internal.samples <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
 		{
 			return;
 		}
 
+		ASSERT(internal.depth == 1);  // Unimplemented
+
 		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
 
 		int width = internal.width;
@@ -3983,7 +4025,7 @@
 			#if defined(__i386__) || defined(__x86_64__)
 				if(CPUID::supportsSSE2() && (width % 4) == 0)
 				{
-					if(internal.depth == 2)
+					if(internal.samples == 2)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4001,7 +4043,7 @@
 							source1 += pitch;
 						}
 					}
-					else if(internal.depth == 4)
+					else if(internal.samples == 4)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4025,7 +4067,7 @@
 							source3 += pitch;
 						}
 					}
-					else if(internal.depth == 8)
+					else if(internal.samples == 8)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4061,7 +4103,7 @@
 							source7 += pitch;
 						}
 					}
-					else if(internal.depth == 16)
+					else if(internal.samples == 16)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4128,7 +4170,7 @@
 			{
 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
 
-				if(internal.depth == 2)
+				if(internal.samples == 2)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -4146,7 +4188,7 @@
 						source1 += pitch;
 					}
 				}
-				else if(internal.depth == 4)
+				else if(internal.samples == 4)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -4170,7 +4212,7 @@
 						source3 += pitch;
 					}
 				}
-				else if(internal.depth == 8)
+				else if(internal.samples == 8)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -4206,7 +4248,7 @@
 						source7 += pitch;
 					}
 				}
-				else if(internal.depth == 16)
+				else if(internal.samples == 16)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -4277,7 +4319,7 @@
 			#if defined(__i386__) || defined(__x86_64__)
 				if(CPUID::supportsSSE2() && (width % 4) == 0)
 				{
-					if(internal.depth == 2)
+					if(internal.samples == 2)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4295,7 +4337,7 @@
 							source1 += pitch;
 						}
 					}
-					else if(internal.depth == 4)
+					else if(internal.samples == 4)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4319,7 +4361,7 @@
 							source3 += pitch;
 						}
 					}
-					else if(internal.depth == 8)
+					else if(internal.samples == 8)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4355,7 +4397,7 @@
 							source7 += pitch;
 						}
 					}
-					else if(internal.depth == 16)
+					else if(internal.samples == 16)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4422,7 +4464,7 @@
 			{
 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
 
-				if(internal.depth == 2)
+				if(internal.samples == 2)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -4440,7 +4482,7 @@
 						source1 += pitch;
 					}
 				}
-				else if(internal.depth == 4)
+				else if(internal.samples == 4)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -4464,7 +4506,7 @@
 						source3 += pitch;
 					}
 				}
-				else if(internal.depth == 8)
+				else if(internal.samples == 8)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -4500,7 +4542,7 @@
 						source7 += pitch;
 					}
 				}
-				else if(internal.depth == 16)
+				else if(internal.samples == 16)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -4570,7 +4612,7 @@
 			#if defined(__i386__) || defined(__x86_64__)
 				if(CPUID::supportsSSE2() && (width % 2) == 0)
 				{
-					if(internal.depth == 2)
+					if(internal.samples == 2)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4588,7 +4630,7 @@
 							source1 += pitch;
 						}
 					}
-					else if(internal.depth == 4)
+					else if(internal.samples == 4)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4612,7 +4654,7 @@
 							source3 += pitch;
 						}
 					}
-					else if(internal.depth == 8)
+					else if(internal.samples == 8)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4648,7 +4690,7 @@
 							source7 += pitch;
 						}
 					}
-					else if(internal.depth == 16)
+					else if(internal.samples == 16)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4715,7 +4757,7 @@
 			{
 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
 
-				if(internal.depth == 2)
+				if(internal.samples == 2)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -4733,7 +4775,7 @@
 						source1 += pitch;
 					}
 				}
-				else if(internal.depth == 4)
+				else if(internal.samples == 4)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -4757,7 +4799,7 @@
 						source3 += pitch;
 					}
 				}
-				else if(internal.depth == 8)
+				else if(internal.samples == 8)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -4793,7 +4835,7 @@
 						source7 += pitch;
 					}
 				}
-				else if(internal.depth == 16)
+				else if(internal.samples == 16)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -4863,7 +4905,7 @@
 			#if defined(__i386__) || defined(__x86_64__)
 				if(CPUID::supportsSSE() && (width % 4) == 0)
 				{
-					if(internal.depth == 2)
+					if(internal.samples == 2)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4882,7 +4924,7 @@
 							source1 += pitch;
 						}
 					}
-					else if(internal.depth == 4)
+					else if(internal.samples == 4)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4907,7 +4949,7 @@
 							source3 += pitch;
 						}
 					}
-					else if(internal.depth == 8)
+					else if(internal.samples == 8)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -4944,7 +4986,7 @@
 							source7 += pitch;
 						}
 					}
-					else if(internal.depth == 16)
+					else if(internal.samples == 16)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -5010,7 +5052,7 @@
 				else
 			#endif
 			{
-				if(internal.depth == 2)
+				if(internal.samples == 2)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -5029,7 +5071,7 @@
 						source1 += pitch;
 					}
 				}
-				else if(internal.depth == 4)
+				else if(internal.samples == 4)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -5054,7 +5096,7 @@
 						source3 += pitch;
 					}
 				}
-				else if(internal.depth == 8)
+				else if(internal.samples == 8)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -5091,7 +5133,7 @@
 						source7 += pitch;
 					}
 				}
-				else if(internal.depth == 16)
+				else if(internal.samples == 16)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -5160,7 +5202,7 @@
 			#if defined(__i386__) || defined(__x86_64__)
 				if(CPUID::supportsSSE() && (width % 2) == 0)
 				{
-					if(internal.depth == 2)
+					if(internal.samples == 2)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -5179,7 +5221,7 @@
 							source1 += pitch;
 						}
 					}
-					else if(internal.depth == 4)
+					else if(internal.samples == 4)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -5204,7 +5246,7 @@
 							source3 += pitch;
 						}
 					}
-					else if(internal.depth == 8)
+					else if(internal.samples == 8)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -5241,7 +5283,7 @@
 							source7 += pitch;
 						}
 					}
-					else if(internal.depth == 16)
+					else if(internal.samples == 16)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -5307,7 +5349,7 @@
 				else
 			#endif
 			{
-				if(internal.depth == 2)
+				if(internal.samples == 2)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -5326,7 +5368,7 @@
 						source1 += pitch;
 					}
 				}
-				else if(internal.depth == 4)
+				else if(internal.samples == 4)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -5351,7 +5393,7 @@
 						source3 += pitch;
 					}
 				}
-				else if(internal.depth == 8)
+				else if(internal.samples == 8)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -5388,7 +5430,7 @@
 						source7 += pitch;
 					}
 				}
-				else if(internal.depth == 16)
+				else if(internal.samples == 16)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -5457,7 +5499,7 @@
 			#if defined(__i386__) || defined(__x86_64__)
 				if(CPUID::supportsSSE())
 				{
-					if(internal.depth == 2)
+					if(internal.samples == 2)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -5476,7 +5518,7 @@
 							source1 += pitch;
 						}
 					}
-					else if(internal.depth == 4)
+					else if(internal.samples == 4)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -5501,7 +5543,7 @@
 							source3 += pitch;
 						}
 					}
-					else if(internal.depth == 8)
+					else if(internal.samples == 8)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -5538,7 +5580,7 @@
 							source7 += pitch;
 						}
 					}
-					else if(internal.depth == 16)
+					else if(internal.samples == 16)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -5604,7 +5646,7 @@
 				else
 			#endif
 			{
-				if(internal.depth == 2)
+				if(internal.samples == 2)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -5623,7 +5665,7 @@
 						source1 += pitch;
 					}
 				}
-				else if(internal.depth == 4)
+				else if(internal.samples == 4)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -5648,7 +5690,7 @@
 						source3 += pitch;
 					}
 				}
-				else if(internal.depth == 8)
+				else if(internal.samples == 8)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -5685,7 +5727,7 @@
 						source7 += pitch;
 					}
 				}
-				else if(internal.depth == 16)
+				else if(internal.samples == 16)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -5754,7 +5796,7 @@
 			#if defined(__i386__) || defined(__x86_64__)
 				if(CPUID::supportsSSE2() && (width % 8) == 0)
 				{
-					if(internal.depth == 2)
+					if(internal.samples == 2)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -5783,7 +5825,7 @@
 							source1 += pitch;
 						}
 					}
-					else if(internal.depth == 4)
+					else if(internal.samples == 4)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -5824,7 +5866,7 @@
 							source3 += pitch;
 						}
 					}
-					else if(internal.depth == 8)
+					else if(internal.samples == 8)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -5889,7 +5931,7 @@
 							source7 += pitch;
 						}
 					}
-					else if(internal.depth == 16)
+					else if(internal.samples == 16)
 					{
 						for(int y = 0; y < height; y++)
 						{
@@ -6009,7 +6051,7 @@
 			{
 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
 
-				if(internal.depth == 2)
+				if(internal.samples == 2)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -6027,7 +6069,7 @@
 						source1 += pitch;
 					}
 				}
-				else if(internal.depth == 4)
+				else if(internal.samples == 4)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -6051,7 +6093,7 @@
 						source3 += pitch;
 					}
 				}
-				else if(internal.depth == 8)
+				else if(internal.samples == 8)
 				{
 					for(int y = 0; y < height; y++)
 					{
@@ -6087,7 +6129,7 @@
 						source7 += pitch;
 					}
 				}
-				else if(internal.depth == 16)
+				else if(internal.samples == 16)
 				{
 					for(int y = 0; y < height; y++)
 					{
diff --git a/src/Renderer/Surface.hpp b/src/Renderer/Surface.hpp
index 16aa24e..7ea7bee 100644
--- a/src/Renderer/Surface.hpp
+++ b/src/Renderer/Surface.hpp
@@ -48,7 +48,7 @@
 	typedef RectT<int> Rect;
 	typedef RectT<float> RectF;
 
-	template <typename T> struct SliceRectT : public RectT<T>
+	template<typename T> struct SliceRectT : public RectT<T>
 	{
 		SliceRectT() : slice(0) {}
 		SliceRectT(const RectT<T>& rect) : RectT<T>(rect), slice(0) {}
@@ -237,7 +237,9 @@
 	private:
 		struct Buffer
 		{
-		public:
+			friend Surface;
+
+		private:
 			void write(int x, int y, int z, const Color<float> &color);
 			void write(int x, int y, const Color<float> &color);
 			void write(void *element, const Color<float> &color);
@@ -245,7 +247,7 @@
 			Color<float> read(int x, int y) const;
 			Color<float> read(void *element) const;
 			Color<float> sample(float x, float y, float z) const;
-			Color<float> sample(float x, float y) const;
+			Color<float> sample(float x, float y, int layer) const;
 
 			void *lockRect(int x, int y, int z, Lock lock);
 			void unlockRect();
@@ -254,12 +256,15 @@
 			int width;
 			int height;
 			int depth;
+			short border;
+			short samples;
+
 			int bytes;
 			int pitchB;
 			int pitchP;
 			int sliceB;
 			int sliceP;
-			int border;
+
 			Format format;
 			AtomicInt lock;
 
@@ -268,11 +273,11 @@
 
 	protected:
 		Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice);
-		Surface(Resource *texture, int width, int height, int depth, int border, Format format, bool lockable, bool renderTarget, int pitchP = 0);
+		Surface(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0);
 
 	public:
 		static Surface *create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice);
-		static Surface *create(Resource *texture, int width, int height, int depth, int border, Format format, bool lockable, bool renderTarget, int pitchP = 0);
+		static Surface *create(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0);
 
 		virtual ~Surface() = 0;
 
@@ -313,6 +318,7 @@
 		void sync();                      // Wait for lock(s) to be released.
 		inline bool isUnlocked() const;   // Only reliable after sync().
 
+		inline int getSamples() const;
 		inline int getMultiSampleCount() const;
 		inline int getSuperSampleCount() const;
 
@@ -351,7 +357,7 @@
 		static int pitchP(int width, int border, Format format, bool target);
 		static int sliceB(int width, int height, int border, Format format, bool target);
 		static int sliceP(int width, int height, int border, Format format, bool target);
-		static unsigned int size(int width, int height, int depth, int border, Format format);   // FIXME: slice * depth
+		static unsigned int size(int width, int height, int depth, int border, int samples, Format format);   // FIXME: slice * depth
 
 		static bool isStencil(Format format);
 		static bool isDepth(Format format);
@@ -474,7 +480,7 @@
 
 		static void update(Buffer &destination, Buffer &source);
 		static void genericUpdate(Buffer &destination, Buffer &source);
-		static void *allocateBuffer(int width, int height, int depth, int border, Format format);
+		static void *allocateBuffer(int width, int height, int depth, int border, int samples, Format format);
 		static void memfill4(void *buffer, int pattern, int bytes);
 
 		bool identicalFormats() const;
@@ -625,14 +631,19 @@
 		return stencil.sliceB;
 	}
 
+	int Surface::getSamples() const
+	{
+		return internal.samples;
+	}
+
 	int Surface::getMultiSampleCount() const
 	{
-		return sw::min(internal.depth, 4);
+		return sw::min((int)internal.samples, 4);
 	}
 
 	int Surface::getSuperSampleCount() const
 	{
-		return internal.depth > 4 ? internal.depth / 4 : 1;
+		return internal.samples > 4 ? internal.samples / 4 : 1;
 	}
 
 	bool Surface::isUnlocked() const