// SwiftShader Software Renderer
//
// Copyright(c) 2005-2013 TransGaming Inc.
//
// All rights reserved. No part of this software may be copied, distributed, transmitted,
// transcribed, stored in a retrieval system, translated into any human or computer
// language by any means, or disclosed to third parties without the explicit written
// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
// or implied, including but not limited to any patent rights, are granted to you.
//

#include "Surface.hpp"

#include "Color.hpp"
#include "Context.hpp"
#include "ETC_Decoder.hpp"
#include "Renderer.hpp"
#include "Common/Half.hpp"
#include "Common/Memory.hpp"
#include "Common/CPUID.hpp"
#include "Common/Resource.hpp"
#include "Common/Debug.hpp"
#include "Reactor/Reactor.hpp"

#include <xmmintrin.h>
#include <emmintrin.h>

#undef min
#undef max

namespace sw
{
	// Settings declared here and defined in another translation unit.
	extern bool quadLayoutEnabled;
	extern bool complementaryDepthBuffer;
	extern TranscendentalPrecision logPrecision;   // Compared against WHQL in Surface::lockInternal()

	// Color table shared by all surfaces; indexed with the 8-bit element value when reading FORMAT_P8 / FORMAT_A8P8.
	unsigned int *Surface::palette = 0;
	// Compared with a surface's paletteUsed in lockInternal() to decide whether palettized data must be converted again.
	unsigned int Surface::paletteID = 0;

	// Restricts both corners of the rectangle to the range
	// [minX, maxX] horizontally and [minY, maxY] vertically.
	void Rect::clip(int minX, int minY, int maxX, int maxY)
	{
		// Horizontal extent
		x0 = clamp(x0, minX, maxX);
		x1 = clamp(x1, minX, maxX);

		// Vertical extent
		y0 = clamp(y0, minY, maxY);
		y1 = clamp(y1, minY, maxY);
	}

	// Stores 'color' into the element at column x, row y, slice z,
	// converting it to this buffer's format.
	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
	{
		unsigned char *address = (unsigned char*)buffer;

		address += x * bytes;    // Column offset
		address += y * pitchB;   // Row offset
		address += z * sliceB;   // Slice offset

		void *element = address;

		write(element, color);
	}

	// Stores 'color' into the element at column x, row y of the first slice,
	// converting it to this buffer's format.
	void Surface::Buffer::write(int x, int y, const Color<float> &color)
	{
		unsigned char *address = (unsigned char*)buffer;

		address += x * bytes;    // Column offset
		address += y * pitchB;   // Row offset

		void *element = address;

		write(element, color);
	}

	inline void Surface::Buffer::write(void *element, const Color<float> &color)
	{
		switch(format)
		{
		case FORMAT_A8:
			*(unsigned char*)element = unorm<8>(color.a);
			break;
		case FORMAT_R8:
			*(unsigned char*)element = unorm<8>(color.r);
			break;
		case FORMAT_R3G3B2:
			*(unsigned char*)element = (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
			break;
		case FORMAT_A8R3G3B2:
			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
			break;
		case FORMAT_X4R4G4B4:
			*(unsigned short*)element = 0xF000 | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
			break;
		case FORMAT_A4R4G4B4:
			*(unsigned short*)element = (unorm<4>(color.a) << 12) | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
			break;
		case FORMAT_R4G4B4A4:
			*(unsigned short*)element = (unorm<4>(color.r) << 12) | (unorm<4>(color.g) << 8) | (unorm<4>(color.b) << 4) | (unorm<4>(color.a) << 0);
			break;
		case FORMAT_R5G6B5:
			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<6>(color.g) << 5) | (unorm<5>(color.b) << 0);
			break;
		case FORMAT_A1R5G5B5:
			*(unsigned short*)element = (unorm<1>(color.a) << 15) | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
			break;
		case FORMAT_R5G5B5A1:
			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<5>(color.g) << 6) | (unorm<5>(color.b) << 1) | (unorm<5>(color.a) << 0);
			break;
		case FORMAT_X1R5G5B5:
			*(unsigned short*)element = 0x8000 | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
			break;
		case FORMAT_A8R8G8B8:
			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
			break;
		case FORMAT_X8R8G8B8:
			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
			break;
		case FORMAT_A8B8G8R8:
			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
			break;
		case FORMAT_X8B8G8R8:
			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
			break;
		case FORMAT_A2R10G10B10:
			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.r) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.b) << 0);
			break;
		case FORMAT_A2B10G10R10:
			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.b) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.r) << 0);
			break;
		case FORMAT_G8R8:
			*(unsigned int*)element = (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
			break;
		case FORMAT_G16R16:
			*(unsigned int*)element = (unorm<16>(color.g) << 16) | (unorm<16>(color.r) << 0);
			break;
		case FORMAT_A16B16G16R16:
			((unsigned short*)element)[0] = unorm<16>(color.r);
			((unsigned short*)element)[1] = unorm<16>(color.g);
			((unsigned short*)element)[2] = unorm<16>(color.b);
			((unsigned short*)element)[3] = unorm<16>(color.a);
			break;
		case FORMAT_V8U8:
			*(unsigned short*)element = (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
			break;
		case FORMAT_L6V5U5:
			*(unsigned short*)element = (unorm<6>(color.b) << 10) | (snorm<5>(color.g) << 5) | (snorm<5>(color.r) << 0);
			break;
		case FORMAT_Q8W8V8U8:
			*(unsigned int*)element = (snorm<8>(color.a) << 24) | (snorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
			break;
		case FORMAT_X8L8V8U8:
			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
			break;
		case FORMAT_V16U16:
			*(unsigned int*)element = (snorm<16>(color.g) << 16) | (snorm<16>(color.r) << 0);
			break;
		case FORMAT_A2W10V10U10:
			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (snorm<10>(color.b) << 20) | (snorm<10>(color.g) << 10) | (snorm<10>(color.r) << 0);
			break;
		case FORMAT_A16W16V16U16:
			((unsigned short*)element)[0] = snorm<16>(color.r);
			((unsigned short*)element)[1] = snorm<16>(color.g);
			((unsigned short*)element)[2] = snorm<16>(color.b);
			((unsigned short*)element)[3] = unorm<16>(color.a);
			break;
		case FORMAT_Q16W16V16U16:
			((unsigned short*)element)[0] = snorm<16>(color.r);
			((unsigned short*)element)[1] = snorm<16>(color.g);
			((unsigned short*)element)[2] = snorm<16>(color.b);
			((unsigned short*)element)[3] = snorm<16>(color.a);
			break;
		case FORMAT_R8G8B8:
			((unsigned char*)element)[0] = unorm<8>(color.b);
			((unsigned char*)element)[1] = unorm<8>(color.g);
			((unsigned char*)element)[2] = unorm<8>(color.r);
			break;
		case FORMAT_B8G8R8:
			((unsigned char*)element)[0] = unorm<8>(color.r);
			((unsigned char*)element)[1] = unorm<8>(color.g);
			((unsigned char*)element)[2] = unorm<8>(color.b);
			break;
		case FORMAT_R16F:
			*(half*)element = (half)color.r;
			break;
		case FORMAT_A16F:
			*(half*)element = (half)color.a;
			break;
		case FORMAT_G16R16F:
			((half*)element)[0] = (half)color.r;
			((half*)element)[1] = (half)color.g;
			break;
		case FORMAT_B16G16R16F:
			((half*)element)[0] = (half)color.r;
			((half*)element)[1] = (half)color.g;
			((half*)element)[2] = (half)color.b;
			break;
		case FORMAT_A16B16G16R16F:
			((half*)element)[0] = (half)color.r;
			((half*)element)[1] = (half)color.g;
			((half*)element)[2] = (half)color.b;
			((half*)element)[3] = (half)color.a;
			break;
		case FORMAT_A32F:
			*(float*)element = color.a;
			break;
		case FORMAT_R32F:
			*(float*)element = color.r;
			break;
		case FORMAT_G32R32F:
			((float*)element)[0] = color.r;
			((float*)element)[1] = color.g;
			break;
		case FORMAT_B32G32R32F:
			((float*)element)[0] = color.r;
			((float*)element)[1] = color.g;
			((float*)element)[2] = color.b;
			break;
		case FORMAT_A32B32G32R32F:
			((float*)element)[0] = color.r;
			((float*)element)[1] = color.g;
			((float*)element)[2] = color.b;
			((float*)element)[3] = color.a;
			break;
		case FORMAT_D32F:
		case FORMAT_D32F_LOCKABLE:
		case FORMAT_D32FS8_TEXTURE:
		case FORMAT_D32FS8_SHADOW:
			*((float*)element) = color.r;
			break;
		case FORMAT_D32F_COMPLEMENTARY:
			*((float*)element) = 1 - color.r;
			break;
		case FORMAT_S8:
			*((unsigned char*)element) = unorm<8>(color.r);
			break;
		case FORMAT_L8:
			*(unsigned char*)element = unorm<8>(color.r);
			break;
		case FORMAT_A4L4:
			*(unsigned char*)element = (unorm<4>(color.a) << 4) | (unorm<4>(color.r) << 0);
			break;
		case FORMAT_L16:
			*(unsigned short*)element = unorm<16>(color.r);
			break;
		case FORMAT_A8L8:
			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<8>(color.r) << 0);
			break;
		case FORMAT_L16F:
			*(half*)element = (half)color.r;
			break;
		case FORMAT_A16L16F:
			((half*)element)[0] = (half)color.r;
			((half*)element)[1] = (half)color.a;
			break;
		case FORMAT_L32F:
			*(float*)element = color.r;
			break;
		case FORMAT_A32L32F:
			((float*)element)[0] = color.r;
			((float*)element)[1] = color.a;
			break;
		default:
			ASSERT(false);
		}
	}

	// Returns the color of the element at column x, row y, slice z,
	// decoded from this buffer's format.
	Color<float> Surface::Buffer::read(int x, int y, int z) const
	{
		unsigned char *address = (unsigned char*)buffer;

		address += x * bytes;    // Column offset
		address += y * pitchB;   // Row offset
		address += z * sliceB;   // Slice offset

		void *element = address;

		return read(element);
	}

	// Returns the color of the element at column x, row y of the first slice,
	// decoded from this buffer's format.
	Color<float> Surface::Buffer::read(int x, int y) const
	{
		unsigned char *address = (unsigned char*)buffer;

		address += x * bytes;    // Column offset
		address += y * pitchB;   // Row offset

		void *element = address;

		return read(element);
	}

	// Decodes the element stored at 'element' according to this buffer's format
	// and returns it as a floating-point color.
	//
	// Channels the format does not provide keep the defaults r = g = b = 0, a = 1.
	// Unsigned normalized channels are masked in place and multiplied by the
	// reciprocal of the mask, which yields [0, 1] without shifting the field down.
	// Signed normalized channels are shifted so the field's top bit lands in bit 31
	// of an int, then scaled by the reciprocal of the largest positive value at
	// that position. Formats without a case here trigger an assert.
	inline Color<float> Surface::Buffer::read(void *element) const
	{
		float r = 0.0f;
		float g = 0.0f;
		float b = 0.0f;
		float a = 1.0f;

		switch(format)
		{
		case FORMAT_P8:
			{
				ASSERT(palette);

				// The element is an index into the shared palette
				unsigned int abgr = palette[*(unsigned char*)element];
				
				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
			}
			break;
		case FORMAT_A8P8:
			{
				ASSERT(palette);

				// First byte indexes the palette for color; second byte is the alpha value
				unsigned int bgr = palette[((unsigned char*)element)[0]];
				
				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
			}
			break;
		case FORMAT_A8:
			r = 0;
			g = 0;
			b = 0;
			a = *(unsigned char*)element * (1.0f / 0xFF);
			break;
		case FORMAT_R8:
			r = *(unsigned char*)element * (1.0f / 0xFF);
			break;
		case FORMAT_R3G3B2:
			{
				unsigned char rgb = *(unsigned char*)element;
				
				r = (rgb & 0xE0) * (1.0f / 0xE0);
				g = (rgb & 0x1C) * (1.0f / 0x1C);
				b = (rgb & 0x03) * (1.0f / 0x03);
			}
			break;
		case FORMAT_A8R3G3B2:
			{
				unsigned short argb = *(unsigned short*)element;
				
				a = (argb & 0xFF00) * (1.0f / 0xFF00);
				r = (argb & 0x00E0) * (1.0f / 0x00E0);
				g = (argb & 0x001C) * (1.0f / 0x001C);
				b = (argb & 0x0003) * (1.0f / 0x0003);
			}
			break;
		case FORMAT_X4R4G4B4:
			{
				unsigned short rgb = *(unsigned short*)element;
				
				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
				b = (rgb & 0x000F) * (1.0f / 0x000F);
			}
			break;
		case FORMAT_A4R4G4B4:
			{
				unsigned short argb = *(unsigned short*)element;
				
				a = (argb & 0xF000) * (1.0f / 0xF000);
				r = (argb & 0x0F00) * (1.0f / 0x0F00);
				g = (argb & 0x00F0) * (1.0f / 0x00F0);
				b = (argb & 0x000F) * (1.0f / 0x000F);
			}
			break;
		case FORMAT_R4G4B4A4:
			{
				unsigned short rgba = *(unsigned short*)element;
				
				r = (rgba & 0xF000) * (1.0f / 0xF000);
				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
				a = (rgba & 0x000F) * (1.0f / 0x000F);
			}
			break;
		case FORMAT_R5G6B5:
			{
				unsigned short rgb = *(unsigned short*)element;
				
				r = (rgb & 0xF800) * (1.0f / 0xF800);
				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
				b = (rgb & 0x001F) * (1.0f / 0x001F);
			}
			break;
		case FORMAT_A1R5G5B5:
			{
				unsigned short argb = *(unsigned short*)element;
				
				a = (argb & 0x8000) * (1.0f / 0x8000);
				r = (argb & 0x7C00) * (1.0f / 0x7C00);
				g = (argb & 0x03E0) * (1.0f / 0x03E0);
				b = (argb & 0x001F) * (1.0f / 0x001F);
			}
			break;
		case FORMAT_R5G5B5A1:
			{
				unsigned short rgba = *(unsigned short*)element;
				
				r = (rgba & 0xF800) * (1.0f / 0xF800);
				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
				b = (rgba & 0x003E) * (1.0f / 0x003E);
				a = (rgba & 0x0001) * (1.0f / 0x0001);
			}
			break;
		case FORMAT_X1R5G5B5:
			{
				unsigned short xrgb = *(unsigned short*)element;
				
				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
				b = (xrgb & 0x001F) * (1.0f / 0x001F);
			}
			break;
		case FORMAT_A8R8G8B8:
			{
				unsigned int argb = *(unsigned int*)element;
				
				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
			}
			break;
		case FORMAT_X8R8G8B8:
			{
				unsigned int xrgb = *(unsigned int*)element;
				
				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
			}
			break;
		case FORMAT_A8B8G8R8:
			{
				unsigned int abgr = *(unsigned int*)element;
				
				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
			}
			break;
		case FORMAT_X8B8G8R8:
			{
				unsigned int xbgr = *(unsigned int*)element;
				
				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
			}
			break;
		case FORMAT_G8R8:
			{
				unsigned short gr = *(unsigned short*)element;
				
				g = (gr & 0xFF00) * (1.0f / 0xFF00);
				r = (gr & 0x00FF) * (1.0f / 0x00FF);
			}
			break;
		case FORMAT_G16R16:
			{
				unsigned int gr = *(unsigned int*)element;
				
				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
			}
			break;
		case FORMAT_A2R10G10B10:
			{
				unsigned int argb = *(unsigned int*)element;
				
				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
			}
			break;
		case FORMAT_A2B10G10R10:
			{
				unsigned int abgr = *(unsigned int*)element;
				
				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
			}
			break;
		case FORMAT_A16B16G16R16:
			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
			break;
		case FORMAT_V8U8:
			{
				unsigned short vu = *(unsigned short*)element;

				// Signed channels: move each field to the top of a 32-bit int so its sign bit becomes bit 31
				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
			}
			break;
		case FORMAT_L6V5U5:
			{
				unsigned short lvu = *(unsigned short*)element;
				
				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
			}
			break;
		case FORMAT_Q8W8V8U8:
			{
				unsigned int qwvu = *(unsigned int*)element;
				
				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
			}
			break;
		case FORMAT_X8L8V8U8:
			{
				unsigned int xlvu = *(unsigned int*)element;
				
				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
			}
			break;
		case FORMAT_R8G8B8:
			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
			break;
		case FORMAT_B8G8R8:
			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
			break;
		case FORMAT_V16U16:
			{
				unsigned int vu = *(unsigned int*)element;
				
				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
			}
			break;
		case FORMAT_A2W10V10U10:
			{
				unsigned int awvu = *(unsigned int*)element;
				
				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
			}
			break;
		case FORMAT_A16W16V16U16:
			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
			break;
		case FORMAT_Q16W16V16U16:
			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
			break;
		// Luminance formats replicate the single luminance value into r, g and b
		case FORMAT_L8:
			r =
			g =
			b = *(unsigned char*)element * (1.0f / 0xFF);
			break;
		case FORMAT_A4L4:
			{
				unsigned char al = *(unsigned char*)element;
				
				r =
				g =
				b = (al & 0x0F) * (1.0f / 0x0F);
				a = (al & 0xF0) * (1.0f / 0xF0);
			}
			break;
		case FORMAT_L16:
			r =
			g =
			b = *(unsigned short*)element * (1.0f / 0xFFFF);
			break;
		case FORMAT_A8L8:
			r =
			g =
			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
			break;
		case FORMAT_L16F:
			r =
			g =
			b = *(half*)element;
			break;
		case FORMAT_A16L16F:
			r =
			g =
			b = ((half*)element)[0];
			a = ((half*)element)[1];
			break;
		case FORMAT_L32F:
			r =
			g =
			b = *(float*)element;
			break;
		case FORMAT_A32L32F:
			r =
			g =
			b = ((float*)element)[0];
			a = ((float*)element)[1];
			break;
		// Floating-point formats are read without scaling
		case FORMAT_A16F:
			a = *(half*)element;
			break;
		case FORMAT_R16F:
			r = *(half*)element;
			break;
		case FORMAT_G16R16F:
			r = ((half*)element)[0];
			g = ((half*)element)[1];
			break;
		case FORMAT_B16G16R16F:
			r = ((half*)element)[0];
			g = ((half*)element)[1];
			b = ((half*)element)[2];
			break;
		case FORMAT_A16B16G16R16F:
			r = ((half*)element)[0];
			g = ((half*)element)[1];
			b = ((half*)element)[2];
			a = ((half*)element)[3];
			break;
		case FORMAT_A32F:
			a = *(float*)element;
			break;
		case FORMAT_R32F:
			r = *(float*)element;
			break;
		case FORMAT_G32R32F:
			r = ((float*)element)[0];
			g = ((float*)element)[1];
			break;
		case FORMAT_B32G32R32F:
			r = ((float*)element)[0];
			g = ((float*)element)[1];
			b = ((float*)element)[2];
			break;
		case FORMAT_A32B32G32R32F:
			r = ((float*)element)[0];
			g = ((float*)element)[1];
			b = ((float*)element)[2];
			a = ((float*)element)[3];
			break;
		// Depth formats return the depth value in all four channels
		case FORMAT_D32F:
		case FORMAT_D32F_LOCKABLE:
		case FORMAT_D32FS8_TEXTURE:
		case FORMAT_D32FS8_SHADOW:
			r = *(float*)element;
			g = r;
			b = r;
			a = r;
			break;
		case FORMAT_D32F_COMPLEMENTARY:
			// Stored value is the complement of the depth; undo it here
			r = 1.0f - *(float*)element;
			g = r;
			b = r;
			a = r;
			break;
		case FORMAT_S8:
			r = *(unsigned char*)element * (1.0f / 0xFF);
			break;
		default:
			ASSERT(false);
		}

	//	if(sRGB)
	//	{
	//		r = sRGBtoLinear(r);
	//		g = sRGBtoLinear(g);
	//		b = sRGBtoLinear(b);
	//	}

		return Color<float>(r, g, b, a);
	}

	// Returns a trilinearly weighted blend of the eight elements surrounding
	// (x, y, z). Coordinates are offset by half an element before the integer
	// corner is taken, and corner indices are kept inside the buffer dimensions.
	//
	// NOTE(review): the fractional weights are computed from the unclamped
	// coordinate, so a coordinate below 0.5 yields a negative weight for the
	// second corner (extrapolation) - confirm this is intended.
	Color<float> Surface::Buffer::sample(float x, float y, float z) const
	{
		x -= 0.5f;
		y -= 0.5f;
		z -= 0.5f;

		// Lower corner, limited to valid indices
		const int left = clamp((int)x, 0, width - 1);
		const int top = clamp((int)y, 0, height - 1);
		const int front = clamp((int)z, 0, depth - 1);

		// Upper corner; collapses onto the lower corner at the far edge
		const int right = (left + 1 < width) ? left + 1 : left;
		const int bottom = (top + 1 < height) ? top + 1 : top;
		const int back = (front + 1 < depth) ? front + 1 : front;

		Color<float> nearTopLeft = read(left, top, front);
		Color<float> nearTopRight = read(right, top, front);
		Color<float> nearBottomLeft = read(left, bottom, front);
		Color<float> nearBottomRight = read(right, bottom, front);
		Color<float> farTopLeft = read(left, top, back);
		Color<float> farTopRight = read(right, top, back);
		Color<float> farBottomLeft = read(left, bottom, back);
		Color<float> farBottomRight = read(right, bottom, back);

		// Fractional position inside the cell
		float fx = x - left;
		float fy = y - top;
		float fz = z - front;

		nearTopLeft *= (1 - fx) * (1 - fy) * (1 - fz);
		nearTopRight *= fx * (1 - fy) * (1 - fz);
		nearBottomLeft *= (1 - fx) * fy * (1 - fz);
		nearBottomRight *= fx * fy * (1 - fz);
		farTopLeft *= (1 - fx) * (1 - fy) * fz;
		farTopRight *= fx * (1 - fy) * fz;
		farBottomLeft *= (1 - fx) * fy * fz;
		farBottomRight *= fx * fy * fz;

		return nearTopLeft + nearTopRight + nearBottomLeft + nearBottomRight + farTopLeft + farTopRight + farBottomLeft + farBottomRight;
	}

	// Returns a bilinearly weighted blend of the four elements surrounding
	// (x, y) in the first slice. Coordinates are offset by half an element
	// before the integer corner is taken, and corner indices are kept inside
	// the buffer dimensions.
	//
	// NOTE(review): the fractional weights are computed from the unclamped
	// coordinate, so a coordinate below 0.5 yields a negative weight for the
	// second corner (extrapolation) - confirm this is intended.
	Color<float> Surface::Buffer::sample(float x, float y) const
	{
		x -= 0.5f;
		y -= 0.5f;

		// Lower corner, limited to valid indices
		const int left = clamp((int)x, 0, width - 1);
		const int top = clamp((int)y, 0, height - 1);

		// Upper corner; collapses onto the lower corner at the far edge
		const int right = (left + 1 < width) ? left + 1 : left;
		const int bottom = (top + 1 < height) ? top + 1 : top;

		Color<float> topLeft = read(left, top);
		Color<float> topRight = read(right, top);
		Color<float> bottomLeft = read(left, bottom);
		Color<float> bottomRight = read(right, bottom);

		// Fractional position inside the cell
		float fx = x - left;
		float fy = y - top;

		topLeft *= (1 - fx) * (1 - fy);
		topRight *= fx * (1 - fy);
		bottomLeft *= (1 - fx) * fy;
		bottomRight *= fx * fy;

		return topLeft + topRight + bottomLeft + bottomRight;
	}

	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
	{
		this->lock = lock;

		switch(lock)
		{
		case LOCK_UNLOCKED:
		case LOCK_READONLY:
			break;
		case LOCK_WRITEONLY:
		case LOCK_READWRITE:
		case LOCK_DISCARD:
			dirty = true;
			break;
		default:
			ASSERT(false);
		}

		if(buffer)
		{
			switch(format)
			{
			#if S3TC_SUPPORT
			case FORMAT_DXT1:
			#endif
			case FORMAT_ATI1:
			case FORMAT_ETC1:
			case FORMAT_R11_EAC:
			case FORMAT_SIGNED_R11_EAC:
			case FORMAT_RGB8_ETC2:
			case FORMAT_SRGB8_ETC2:
			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
			case FORMAT_RG11_EAC:
			case FORMAT_SIGNED_RG11_EAC:
			case FORMAT_RGBA8_ETC2_EAC:
			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
			case FORMAT_RGBA_ASTC_4x4_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
			case FORMAT_RGBA_ASTC_5x4_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
			case FORMAT_RGBA_ASTC_5x5_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
			case FORMAT_RGBA_ASTC_6x5_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
			case FORMAT_RGBA_ASTC_6x6_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
			case FORMAT_RGBA_ASTC_8x5_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
			case FORMAT_RGBA_ASTC_8x6_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
			case FORMAT_RGBA_ASTC_8x8_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
			case FORMAT_RGBA_ASTC_10x5_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
			case FORMAT_RGBA_ASTC_10x6_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
			case FORMAT_RGBA_ASTC_10x8_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
			case FORMAT_RGBA_ASTC_10x10_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
			case FORMAT_RGBA_ASTC_12x10_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
			case FORMAT_RGBA_ASTC_12x12_KHR:
			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
			#if S3TC_SUPPORT
			case FORMAT_DXT3:
			case FORMAT_DXT5:
			#endif
			case FORMAT_ATI2:
				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
			default:
				return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
			}
		}

		return 0;
	}

	void Surface::Buffer::unlockRect()
	{
		lock = LOCK_UNLOCKED;
	}

	// Constructs a surface whose external buffer is caller-provided memory.
	//
	//   width, height, depth - dimensions in elements; depth is raised to at least 1
	//   format               - format of the data at 'pixels'
	//   pixels               - caller-provided storage; not released by the destructor (ownExternal is false)
	//   pitch, slice         - byte strides of 'pixels' between rows and between slices
	//
	// The external buffer starts dirty so its contents get converted into the
	// internal buffer the first time the latter is locked. The internal and
	// stencil buffers are described here but not allocated.
	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
	{
		resource = new Resource(0);
		hasParent = false;
		ownExternal = false;
		depth = max(1, depth);

		external.buffer = pixels;
		external.width = width;
		external.height = height;
		external.depth = depth;
		external.format = format;
		external.bytes = bytes(external.format);
		external.pitchB = pitch;
		// bytes() returns 0 for FORMAT_NULL; avoid dividing by zero when deriving the element strides
		external.pitchP = external.bytes ? pitch / external.bytes : 0;
		external.sliceB = slice;
		external.sliceP = external.bytes ? slice / external.bytes : 0;
		external.lock = LOCK_UNLOCKED;
		external.dirty = true;

		internal.buffer = 0;
		internal.width = width;
		internal.height = height;
		internal.depth = depth;
		internal.format = selectInternalFormat(format);
		internal.bytes = bytes(internal.format);
		internal.pitchB = pitchB(internal.width, internal.format, false);
		internal.pitchP = pitchP(internal.width, internal.format, false);
		internal.sliceB = sliceB(internal.width, internal.height, internal.format, false);
		internal.sliceP = sliceP(internal.width, internal.height, internal.format, false);
		internal.lock = LOCK_UNLOCKED;
		internal.dirty = false;

		stencil.buffer = 0;
		stencil.width = width;
		stencil.height = height;
		stencil.depth = depth;
		stencil.format = FORMAT_S8;
		stencil.bytes = bytes(stencil.format);
		stencil.pitchB = pitchB(stencil.width, stencil.format, false);
		stencil.pitchP = pitchP(stencil.width, stencil.format, false);
		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, false);
		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, false);
		stencil.lock = LOCK_UNLOCKED;
		stencil.dirty = false;

		dirtyMipmaps = true;
		paletteUsed = 0;
	}

	// Constructs a surface that owns its external storage (allocated on first lock).
	//
	//   texture              - resource of the owning texture, or 0 to create a private resource
	//   width, height, depth - dimensions in elements; depth is raised to at least 1
	//   format               - external format; the internal format is derived from it
	//   lockable             - stored in the 'lockable' member
	//   renderTarget         - stored in the 'renderTarget' member and passed to the pitch/slice helpers
	//
	// No buffer is allocated here; all three buffers start clean and unlocked.
	Surface::Surface(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget) : lockable(lockable), renderTarget(renderTarget)
	{
		if(texture)
		{
			resource = texture;
		}
		else
		{
			resource = new Resource(0);
		}

		hasParent = (texture != 0);
		ownExternal = true;
		depth = max(1, depth);

		// The external layout only takes the render target flag when there is no parent texture
		const bool externalTarget = renderTarget && !texture;

		external.buffer = 0;
		external.width = width;
		external.height = height;
		external.depth = depth;
		external.format = format;
		external.bytes = bytes(external.format);
		external.pitchB = pitchB(external.width, external.format, externalTarget);
		external.pitchP = pitchP(external.width, external.format, externalTarget);
		external.sliceB = sliceB(external.width, external.height, external.format, externalTarget);
		external.sliceP = sliceP(external.width, external.height, external.format, externalTarget);
		external.lock = LOCK_UNLOCKED;
		external.dirty = false;

		internal.buffer = 0;
		internal.width = width;
		internal.height = height;
		internal.depth = depth;
		internal.format = selectInternalFormat(format);
		internal.bytes = bytes(internal.format);
		internal.pitchB = pitchB(internal.width, internal.format, renderTarget);
		internal.pitchP = pitchP(internal.width, internal.format, renderTarget);
		internal.sliceB = sliceB(internal.width, internal.height, internal.format, renderTarget);
		internal.sliceP = sliceP(internal.width, internal.height, internal.format, renderTarget);
		internal.lock = LOCK_UNLOCKED;
		internal.dirty = false;

		stencil.buffer = 0;
		stencil.width = width;
		stencil.height = height;
		stencil.depth = depth;
		stencil.format = FORMAT_S8;
		stencil.bytes = bytes(stencil.format);
		stencil.pitchB = pitchB(stencil.width, stencil.format, renderTarget);
		stencil.pitchP = pitchP(stencil.width, stencil.format, renderTarget);
		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, renderTarget);
		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, renderTarget);
		stencil.lock = LOCK_UNLOCKED;
		stencil.dirty = false;

		dirtyMipmaps = true;
		paletteUsed = 0;
	}

	// Releases the buffers this surface owns and, when the resource is not
	// shared with a parent, the resource itself.
	Surface::~Surface()
	{
		// Acquire and release the resource once so that the buffers
		// can be deallocated safely afterwards
		resource->lock(DESTRUCT);
		resource->unlock();

		if(!hasParent)
		{
			resource->destruct();
		}

		// Caller-provided external storage is left alone
		if(ownExternal)
		{
			deallocate(external.buffer);
		}

		// The internal buffer may be the same allocation as the external one
		// (see lockExternal/lockInternal); only free it when it is distinct
		if(!(internal.buffer == external.buffer))
		{
			deallocate(internal.buffer);
		}

		deallocate(stencil.buffer);

		stencil.buffer = 0;
		internal.buffer = 0;
		external.buffer = 0;
	}

	// Acquires the resource for 'client' and returns the address of (x, y, z)
	// in the external buffer, creating that buffer on first use and bringing
	// it up to date with the internal buffer when the latter has been modified.
	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
	{
		resource->lock(client);

		if(external.buffer == 0)
		{
			// Reuse the internal storage when both sides use the same layout
			if(!internal.buffer || !identicalFormats())
			{
				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.format);
			}
			else
			{
				external.buffer = internal.buffer;
			}
		}

		if(internal.dirty)
		{
			// A discarding lock does not need the previous contents
			const bool keepContents = (lock != LOCK_DISCARD);

			if(keepContents)
			{
				update(external, internal);
			}

			internal.dirty = false;
		}

		// Any lock that may write invalidates derived mipmap levels
		if(lock == LOCK_WRITEONLY || lock == LOCK_READWRITE || lock == LOCK_DISCARD)
		{
			dirtyMipmaps = true;
		}
		else if(lock != LOCK_READONLY)
		{
			ASSERT(false);
		}

		return external.lockRect(x, y, z, lock);
	}

	// Ends an access started with lockExternal().
	//
	// The buffer's lock state is reset before the resource is released:
	// lockExternal() sets that state while holding the resource, so resetting
	// it after the release could overwrite the state recorded by an accessor
	// that acquired the resource in between.
	void Surface::unlockExternal()
	{
		external.unlockRect();

		resource->unlock();
	}

	// Returns the address of (x, y, z) in the internal buffer, creating that
	// buffer on first use and bringing it up to date with the external buffer
	// when the latter has been modified or the palette has changed.
	//
	// The resource is only acquired for 'client' when 'lock' is not
	// LOCK_UNLOCKED. A read-only lock by a PUBLIC client additionally calls
	// resolve() before the address is returned.
	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
	{
		if(lock != LOCK_UNLOCKED)
		{
			resource->lock(client);
		}

		if(!internal.buffer)
		{
			// Share the external storage when both sides use the same layout
			if(external.buffer && identicalFormats())
			{
				internal.buffer = external.buffer;
			}
			else
			{
				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.format);
			}
		}

		// FIXME: WHQL requires conversion to lower external precision and back
		if(logPrecision >= WHQL)
		{
			if(internal.dirty && renderTarget && internal.format != external.format)
			{
				if(lock != LOCK_DISCARD)
				{
					switch(external.format)
					{
					case FORMAT_R3G3B2:
					case FORMAT_A8R3G3B2:
					case FORMAT_A1R5G5B5:
					case FORMAT_A2R10G10B10:
					case FORMAT_A2B10G10R10:
						// Locking external read-write converts the dirty internal data to the
						// external format and marks external dirty, so the update below
						// converts it back into the internal buffer
						lockExternal(0, 0, 0, LOCK_READWRITE, client);
						unlockExternal();
						break;
					default:
						// Difference passes WHQL
						break;
					}
				}
			}
		}

		// Refresh the internal copy when the external data changed, or when a
		// palettized surface was last converted with a different palette
		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
		{
			// A discarding lock does not need the previous contents
			if(lock != LOCK_DISCARD)
			{
				update(internal, external);
			}

			external.dirty = false;
			paletteUsed = Surface::paletteID;
		}

		// Any lock that may write invalidates derived mipmap levels
		switch(lock)
		{
		case LOCK_UNLOCKED:
		case LOCK_READONLY:
			break;
		case LOCK_WRITEONLY:
		case LOCK_READWRITE:
		case LOCK_DISCARD:
			dirtyMipmaps = true;
			break;
		default:
			ASSERT(false);
		}

		if(lock == LOCK_READONLY && client == PUBLIC)
		{
			resolve();
		}

		return internal.lockRect(x, y, z, lock);
	}

	// Ends an access started with lockInternal().
	//
	// The buffer's lock state is reset before the resource is released:
	// lockInternal() sets that state after acquiring the resource, so
	// resetting it after the release could overwrite the state recorded by an
	// accessor that acquired the resource in between.
	void Surface::unlockInternal()
	{
		internal.unlockRect();

		resource->unlock();
	}

	// Acquires the resource for 'client' and returns the address of the first
	// element of slice 'front' in the stencil buffer, allocating the buffer
	// on first use. The buffer is always locked for reading and writing.
	void *Surface::lockStencil(int front, Accessor client)
	{
		resource->lock(client);

		if(stencil.buffer == 0)
		{
			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.format);
		}

		void *slice = stencil.lockRect(0, 0, front, LOCK_READWRITE);   // FIXME

		return slice;
	}

	// Ends an access started with lockStencil().
	//
	// The buffer's lock state is reset before the resource is released:
	// lockStencil() sets that state while holding the resource, so resetting
	// it after the release could overwrite the state recorded by an accessor
	// that acquired the resource in between.
	void Surface::unlockStencil()
	{
		stencil.unlockRect();

		resource->unlock();
	}

	// Returns the number of bytes used to store one element of 'format'.
	//
	// The cases are grouped by the size they report. For the block-compressed
	// DXT/ATI/ETC1 formats the value is the size of one column of four pixels,
	// for the YV12 formats it covers the Y plane only, and the ASTC formats
	// currently report 0 (FIXME). An unrecognized format trips ASSERT and
	// yields 0.
	int Surface::bytes(Format format)
	{
		switch(format)
		{
		// ---- 0 bytes ----
		case FORMAT_NULL:
		case FORMAT_RGBA_ASTC_4x4_KHR:   // FIXME: ASTC element size not defined
		case FORMAT_RGBA_ASTC_5x4_KHR:
		case FORMAT_RGBA_ASTC_5x5_KHR:
		case FORMAT_RGBA_ASTC_6x5_KHR:
		case FORMAT_RGBA_ASTC_6x6_KHR:
		case FORMAT_RGBA_ASTC_8x5_KHR:
		case FORMAT_RGBA_ASTC_8x6_KHR:
		case FORMAT_RGBA_ASTC_8x8_KHR:
		case FORMAT_RGBA_ASTC_10x5_KHR:
		case FORMAT_RGBA_ASTC_10x6_KHR:
		case FORMAT_RGBA_ASTC_10x8_KHR:
		case FORMAT_RGBA_ASTC_10x10_KHR:
		case FORMAT_RGBA_ASTC_12x10_KHR:
		case FORMAT_RGBA_ASTC_12x12_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
			return 0;

		// ---- 1 byte ----
		case FORMAT_P8:
		case FORMAT_A8:
		case FORMAT_R8:
		case FORMAT_R3G3B2:
		case FORMAT_L8:
		case FORMAT_A4L4:
		case FORMAT_S8:
		case FORMAT_YV12_BT601:   // Y plane only
		case FORMAT_YV12_BT709:   // Y plane only
		case FORMAT_YV12_JFIF:    // Y plane only
			return 1;

		// ---- 2 bytes ----
		case FORMAT_A8P8:
		case FORMAT_A8R3G3B2:
		case FORMAT_R5G6B5:
		case FORMAT_A1R5G5B5:
		case FORMAT_X1R5G5B5:
		case FORMAT_R5G5B5A1:
		case FORMAT_X4R4G4B4:
		case FORMAT_A4R4G4B4:
		case FORMAT_R4G4B4A4:
		case FORMAT_G8R8:
		#if S3TC_SUPPORT
		case FORMAT_DXT1:   // Column of four pixels
		#endif
		case FORMAT_ATI1:   // Column of four pixels
		case FORMAT_ETC1:   // Column of four pixels
		case FORMAT_R11_EAC:
		case FORMAT_SIGNED_R11_EAC:
		case FORMAT_RGB8_ETC2:
		case FORMAT_SRGB8_ETC2:
		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_V8U8:
		case FORMAT_L6V5U5:
		case FORMAT_L16:
		case FORMAT_A8L8:
		case FORMAT_L16F:
		case FORMAT_A16F:
		case FORMAT_R16F:
		case FORMAT_D16:
		case FORMAT_DF16S8:
			return 2;

		// ---- 3 bytes ----
		case FORMAT_R8G8B8:
		case FORMAT_B8G8R8:
			return 3;

		// ---- 4 bytes ----
		case FORMAT_X8R8G8B8:
	//	case FORMAT_X8G8R8B8Q:
		case FORMAT_A8R8G8B8:
	//	case FORMAT_A8G8R8B8Q:
		case FORMAT_X8B8G8R8:
		case FORMAT_A8B8G8R8:
		case FORMAT_A2R10G10B10:
		case FORMAT_A2B10G10R10:
		case FORMAT_G16R16:
		#if S3TC_SUPPORT
		case FORMAT_DXT3:   // Column of four pixels
		case FORMAT_DXT5:   // Column of four pixels
		#endif
		case FORMAT_ATI2:   // Column of four pixels
		case FORMAT_RG11_EAC:
		case FORMAT_SIGNED_RG11_EAC:
		case FORMAT_RGBA8_ETC2_EAC:
		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
		case FORMAT_Q8W8V8U8:
		case FORMAT_X8L8V8U8:
		case FORMAT_A2W10V10U10:
		case FORMAT_V16U16:
		case FORMAT_A16L16F:
		case FORMAT_L32F:
		case FORMAT_G16R16F:
		case FORMAT_A32F:
		case FORMAT_R32F:
		case FORMAT_D32:
		case FORMAT_D24X8:
		case FORMAT_D24S8:
		case FORMAT_D24FS8:
		case FORMAT_D32F:
		case FORMAT_D32F_COMPLEMENTARY:
		case FORMAT_D32F_LOCKABLE:
		case FORMAT_D32FS8_TEXTURE:
		case FORMAT_D32FS8_SHADOW:
		case FORMAT_DF24S8:
		case FORMAT_INTZ:
			return 4;

		// ---- 6 bytes ----
		case FORMAT_B16G16R16F:
			return 6;

		// ---- 8 bytes ----
		case FORMAT_A16B16G16R16:
		case FORMAT_A16W16V16U16:
		case FORMAT_Q16W16V16U16:
		case FORMAT_A32L32F:
		case FORMAT_A16B16G16R16F:
		case FORMAT_G32R32F:
			return 8;

		// ---- 12 bytes ----
		case FORMAT_B32G32R32F:
			return 12;

		// ---- 16 bytes ----
		case FORMAT_A32B32G32R32F:
			return 16;

		default:
			ASSERT(false);
		}

		return 0;
	}

	// Returns the row pitch in bytes for a surface 'width' elements wide.
	//
	// The width is first rounded up to a multiple of two for render targets and
	// depth/stencil formats. Block-based formats then round the width up to a
	// whole number of blocks and multiply by the bytes each block contributes
	// to the pitch; YV12 formats align the width to 16; everything else is
	// simply bytes(format) * width.
	int Surface::pitchB(int width, Format format, bool target)
	{
		if(target || isDepth(format) || isStencil(format))
		{
			width = align(width, 2);
		}

		int blockWidth = 0;   // Elements covered horizontally by one block
		int blockBytes = 0;   // Bytes added to the pitch per block

		switch(format)
		{
		// 64 bit per 4x4 block, pitch computed per 4 rows
		#if S3TC_SUPPORT
		case FORMAT_DXT1:
		#endif
		case FORMAT_ETC1:
		case FORMAT_R11_EAC:
		case FORMAT_SIGNED_R11_EAC:
		case FORMAT_RGB8_ETC2:
		case FORMAT_SRGB8_ETC2:
		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
			blockWidth = 4;
			blockBytes = 8;
			break;

		// 128 bit per 4x4 block, pitch computed per 4 rows
		#if S3TC_SUPPORT
		case FORMAT_DXT3:
		case FORMAT_DXT5:
		#endif
		case FORMAT_RG11_EAC:
		case FORMAT_SIGNED_RG11_EAC:
		case FORMAT_RGBA8_ETC2_EAC:
		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
		case FORMAT_RGBA_ASTC_4x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
			blockWidth = 4;
			blockBytes = 16;
			break;

		// ASTC blocks that are 5 wide
		case FORMAT_RGBA_ASTC_5x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
		case FORMAT_RGBA_ASTC_5x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
			blockWidth = 5;
			blockBytes = 16;
			break;

		// ASTC blocks that are 6 wide
		case FORMAT_RGBA_ASTC_6x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
		case FORMAT_RGBA_ASTC_6x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
			blockWidth = 6;
			blockBytes = 16;
			break;

		// ASTC blocks that are 8 wide
		case FORMAT_RGBA_ASTC_8x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
		case FORMAT_RGBA_ASTC_8x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
		case FORMAT_RGBA_ASTC_8x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
			blockWidth = 8;
			blockBytes = 16;
			break;

		// ASTC blocks that are 10 wide
		case FORMAT_RGBA_ASTC_10x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
		case FORMAT_RGBA_ASTC_10x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
		case FORMAT_RGBA_ASTC_10x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
		case FORMAT_RGBA_ASTC_10x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
			blockWidth = 10;
			blockBytes = 16;
			break;

		// ASTC blocks that are 12 wide
		case FORMAT_RGBA_ASTC_12x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
		case FORMAT_RGBA_ASTC_12x12_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
			blockWidth = 12;
			blockBytes = 16;
			break;

		// 64 bit per 4x4 block, pitch computed per row
		case FORMAT_ATI1:
			blockWidth = 4;
			blockBytes = 2;
			break;

		// 128 bit per 4x4 block, pitch computed per row
		case FORMAT_ATI2:
			blockWidth = 4;
			blockBytes = 4;
			break;

		case FORMAT_YV12_BT601:
		case FORMAT_YV12_BT709:
		case FORMAT_YV12_JFIF:
			return align(width, 16);

		default:
			return bytes(format) * width;
		}

		// Round the width up to whole blocks
		return blockBytes * ((width + (blockWidth - 1)) / blockWidth);
	}

	// Returns the row pitch expressed in elements: the byte pitch divided by
	// bytes(format). Formats whose element size is not positive yield 0.
	int Surface::pitchP(int width, Format format, bool target)
	{
		const int elementSize = bytes(format);

		if(elementSize <= 0)
		{
			return 0;
		}

		return pitchB(width, format, target) / elementSize;
	}

	int Surface::sliceB(int width, int height, Format format, bool target)
	{
		if(target || isDepth(format) || isStencil(format))
		{
			height = ((height + 1) & ~1);
		}

		switch(format)
		{
		#if S3TC_SUPPORT
		case FORMAT_DXT1:
		case FORMAT_DXT3:
		case FORMAT_DXT5:
		#endif
		case FORMAT_ETC1:
		case FORMAT_R11_EAC:
		case FORMAT_SIGNED_R11_EAC:
		case FORMAT_RG11_EAC:
		case FORMAT_SIGNED_RG11_EAC:
		case FORMAT_RGB8_ETC2:
		case FORMAT_SRGB8_ETC2:
		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_RGBA8_ETC2_EAC:
		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
		case FORMAT_RGBA_ASTC_4x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
		case FORMAT_RGBA_ASTC_5x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
			return pitchB(width, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
		case FORMAT_RGBA_ASTC_5x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
		case FORMAT_RGBA_ASTC_6x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
		case FORMAT_RGBA_ASTC_8x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
		case FORMAT_RGBA_ASTC_10x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
			return pitchB(width, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
		case FORMAT_RGBA_ASTC_6x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
		case FORMAT_RGBA_ASTC_8x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
		case FORMAT_RGBA_ASTC_10x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
			return pitchB(width, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
		case FORMAT_RGBA_ASTC_8x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
		case FORMAT_RGBA_ASTC_10x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
			return pitchB(width, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
		case FORMAT_RGBA_ASTC_10x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
		case FORMAT_RGBA_ASTC_12x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
			return pitchB(width, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
		case FORMAT_RGBA_ASTC_12x12_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
			return pitchB(width, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
		case FORMAT_ATI1:
		case FORMAT_ATI2:
		default:
			return pitchB(width, format, target) * height;   // Pitch computed per row
		}
	}

	// Returns the slice size expressed in elements: the byte size of a slice
	// divided by bytes(format). Formats whose element size is not positive
	// yield 0.
	int Surface::sliceP(int width, int height, Format format, bool target)
	{
		const int elementSize = bytes(format);

		if(elementSize <= 0)
		{
			return 0;
		}

		return sliceB(width, height, format, target) / elementSize;
	}

	// Refreshes 'destination' from 'source', converting between formats.
	//
	// Nothing happens when both buffers share the same storage. Otherwise the
	// source format selects a specialized decoder, with genericUpdate() as the
	// fallback for every format that has none.
	void Surface::update(Buffer &destination, Buffer &source)
	{
	//	ASSERT(source.lock != LOCK_UNLOCKED);
	//	ASSERT(destination.lock != LOCK_UNLOCKED);

		if(destination.buffer == source.buffer)
		{
			return;
		}

		ASSERT(source.dirty && !destination.dirty);

		// FIXME: None of the specialized decoders below check the destination format
		switch(source.format)
		{
		// Packed and palettized formats
		case FORMAT_R8G8B8:   decodeR8G8B8(destination, source);   break;
		case FORMAT_X1R5G5B5: decodeX1R5G5B5(destination, source); break;
		case FORMAT_A1R5G5B5: decodeA1R5G5B5(destination, source); break;
		case FORMAT_X4R4G4B4: decodeX4R4G4B4(destination, source); break;
		case FORMAT_A4R4G4B4: decodeA4R4G4B4(destination, source); break;
		case FORMAT_P8:       decodeP8(destination, source);       break;

		// S3TC / ATI block compression
		#if S3TC_SUPPORT
		case FORMAT_DXT1: decodeDXT1(destination, source); break;
		case FORMAT_DXT3: decodeDXT3(destination, source); break;
		case FORMAT_DXT5: decodeDXT5(destination, source); break;
		#endif
		case FORMAT_ATI1: decodeATI1(destination, source); break;
		case FORMAT_ATI2: decodeATI2(destination, source); break;

		// EAC
		case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break;
		case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break;
		case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break;
		case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break;

		// ETC1 / ETC2 (ETC1 shares the RGB8_ETC2 handler)
		case FORMAT_ETC1:
		case FORMAT_RGB8_ETC2:
			decodeETC2(destination, source, 0, false);
			break;
		case FORMAT_SRGB8_ETC2:
			decodeETC2(destination, source, 0, true);
			break;
		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
			decodeETC2(destination, source, 1, false);
			break;
		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
			decodeETC2(destination, source, 1, true);
			break;
		case FORMAT_RGBA8_ETC2_EAC:
			decodeETC2(destination, source, 8, false);
			break;
		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
			decodeETC2(destination, source, 8, true);
			break;

		// ASTC: the numeric arguments follow the block size in the format name
		case FORMAT_RGBA_ASTC_4x4_KHR:   decodeASTC(destination, source, 4, 4, 1, false);   break;
		case FORMAT_RGBA_ASTC_5x4_KHR:   decodeASTC(destination, source, 5, 4, 1, false);   break;
		case FORMAT_RGBA_ASTC_5x5_KHR:   decodeASTC(destination, source, 5, 5, 1, false);   break;
		case FORMAT_RGBA_ASTC_6x5_KHR:   decodeASTC(destination, source, 6, 5, 1, false);   break;
		case FORMAT_RGBA_ASTC_6x6_KHR:   decodeASTC(destination, source, 6, 6, 1, false);   break;
		case FORMAT_RGBA_ASTC_8x5_KHR:   decodeASTC(destination, source, 8, 5, 1, false);   break;
		case FORMAT_RGBA_ASTC_8x6_KHR:   decodeASTC(destination, source, 8, 6, 1, false);   break;
		case FORMAT_RGBA_ASTC_8x8_KHR:   decodeASTC(destination, source, 8, 8, 1, false);   break;
		case FORMAT_RGBA_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5, 1, false);  break;
		case FORMAT_RGBA_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6, 1, false);  break;
		case FORMAT_RGBA_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8, 1, false);  break;
		case FORMAT_RGBA_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, false); break;
		case FORMAT_RGBA_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, false); break;
		case FORMAT_RGBA_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, false); break;

		// ASTC, SRGB8_ALPHA8 variants: same block sizes with the last flag set
		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4, 4, 1, true);   break;
		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5, 4, 1, true);   break;
		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5, 5, 1, true);   break;
		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6, 5, 1, true);   break;
		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6, 6, 1, true);   break;
		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8, 5, 1, true);   break;
		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8, 6, 1, true);   break;
		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8, 8, 1, true);   break;
		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5, 1, true);  break;
		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6, 1, true);  break;
		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8, 1, true);  break;
		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true); break;
		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true); break;
		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true); break;

		default:
			genericUpdate(destination, source);
			break;
		}
	}

	// Copies the region common to both buffers from 'source' to 'destination'.
	//
	// Rows are copied with memcpy when the formats match; otherwise every
	// element is read from the source as a Color<float> and written back in
	// the destination's format.
	void Surface::genericUpdate(Buffer &destination, Buffer &source)
	{
		const int slices = min(destination.depth, source.depth);
		const int rows = min(destination.height, source.height);
		const int columns = min(destination.width, source.width);
		const int copyBytes = columns * source.bytes;
		const bool sameFormat = (source.format == destination.format);

		unsigned char *srcSlice = (unsigned char*)source.buffer;
		unsigned char *dstSlice = (unsigned char*)destination.buffer;

		for(int z = 0; z < slices; z++, srcSlice += source.sliceB, dstSlice += destination.sliceB)
		{
			unsigned char *srcRow = srcSlice;
			unsigned char *dstRow = dstSlice;

			for(int y = 0; y < rows; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
			{
				if(sameFormat)
				{
					memcpy(dstRow, srcRow, copyBytes);
					continue;
				}

				// Formats differ: convert one element at a time
				unsigned char *src = srcRow;
				unsigned char *dst = dstRow;

				for(int x = 0; x < columns; x++)
				{
					Color<float> texel = source.read(src);
					destination.write(dst, texel);

					src += source.bytes;
					dst += destination.bytes;
				}
			}
		}
	}

	// Expands three-byte source elements into 32-bit destination elements.
	//
	// The three source bytes are read as blue, green, red (in memory order) and
	// packed as 0xFF000000 | r << 16 | g << 8 | b. Only the region common to
	// both buffers is converted.
	void Surface::decodeR8G8B8(Buffer &destination, const Buffer &source)
	{
		const int depth = (source.depth < destination.depth) ? source.depth : destination.depth;
		const int height = (source.height < destination.height) ? source.height : destination.height;
		const int width = (source.width < destination.width) ? source.width : destination.width;

		const unsigned char *srcSlice = (const unsigned char*)source.buffer;
		unsigned char *dstSlice = (unsigned char*)destination.buffer;

		for(int z = 0; z < depth; z++)
		{
			const unsigned char *srcRow = srcSlice;
			unsigned char *dstRow = dstSlice;

			for(int y = 0; y < height; y++)
			{
				for(int x = 0; x < width; x++)
				{
					const unsigned char *texel = srcRow + x * source.bytes;
					unsigned int *pixel = (unsigned int*)(dstRow + x * destination.bytes);

					const unsigned int blue = texel[0];
					const unsigned int green = texel[1];
					const unsigned int red = texel[2];

					*pixel = 0xFF000000 | (red << 16) | (green << 8) | (blue << 0);
				}

				srcRow += source.pitchB;
				dstRow += destination.pitchB;
			}

			srcSlice += source.sliceB;
			dstSlice += destination.sliceB;
		}
	}

	// Expands 16-bit X1R5G5B5 source elements into 32-bit destination elements.
	//
	// Each 5-bit channel is rescaled to 8 bits with a fixed-point multiply and
	// rounding, and the top byte is forced to 0xFF. Only the region common to
	// both buffers is converted.
	void Surface::decodeX1R5G5B5(Buffer &destination, const Buffer &source)
	{
		const int depth = (source.depth < destination.depth) ? source.depth : destination.depth;
		const int height = (source.height < destination.height) ? source.height : destination.height;
		const int width = (source.width < destination.width) ? source.width : destination.width;

		const unsigned char *srcSlice = (const unsigned char*)source.buffer;
		unsigned char *dstSlice = (unsigned char*)destination.buffer;

		for(int z = 0; z < depth; z++)
		{
			const unsigned char *srcRow = srcSlice;
			unsigned char *dstRow = dstSlice;

			for(int y = 0; y < height; y++)
			{
				for(int x = 0; x < width; x++)
				{
					const unsigned int packed = *(const unsigned short*)(srcRow + x * source.bytes);
					unsigned int *pixel = (unsigned int*)(dstRow + x * destination.bytes);

					// The multipliers scale each field by roughly 255/31 in place
					const unsigned int red = (((packed & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
					const unsigned int green = (((packed & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
					const unsigned int blue = (((packed & 0x001F) * 2106 + 0x80) >> 8);

					*pixel = 0xFF000000 | red | green | blue;
				}

				srcRow += source.pitchB;
				dstRow += destination.pitchB;
			}

			srcSlice += source.sliceB;
			dstSlice += destination.sliceB;
		}
	}

	// Expands 16-bit A1R5G5B5 source elements into 32-bit destination elements.
	//
	// The single alpha bit becomes 0x00 or 0xFF in the top byte, and each 5-bit
	// channel is rescaled to 8 bits with a fixed-point multiply and rounding.
	// Only the region common to both buffers is converted.
	void Surface::decodeA1R5G5B5(Buffer &destination, const Buffer &source)
	{
		const int depth = (source.depth < destination.depth) ? source.depth : destination.depth;
		const int height = (source.height < destination.height) ? source.height : destination.height;
		const int width = (source.width < destination.width) ? source.width : destination.width;

		const unsigned char *srcSlice = (const unsigned char*)source.buffer;
		unsigned char *dstSlice = (unsigned char*)destination.buffer;

		for(int z = 0; z < depth; z++)
		{
			const unsigned char *srcRow = srcSlice;
			unsigned char *dstRow = dstSlice;

			for(int y = 0; y < height; y++)
			{
				for(int x = 0; x < width; x++)
				{
					const unsigned int packed = *(const unsigned short*)(srcRow + x * source.bytes);
					unsigned int *pixel = (unsigned int*)(dstRow + x * destination.bytes);

					// 0x8000 * 130560 == 0xFF000000, so a set alpha bit fills the top byte
					const unsigned int alpha = (packed & 0x8000) * 130560;
					const unsigned int red = (((packed & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
					const unsigned int green = (((packed & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
					const unsigned int blue = (((packed & 0x001F) * 2106 + 0x80) >> 8);

					*pixel = alpha | red | green | blue;
				}

				srcRow += source.pitchB;
				dstRow += destination.pitchB;
			}

			srcSlice += source.sliceB;
			dstSlice += destination.sliceB;
		}
	}

	// Expands 16-bit X4R4G4B4 source elements into 32-bit destination elements.
	//
	// Each 4-bit channel is widened to 8 bits by replicating the nibble, and
	// the top byte is forced to 0xFF. Only the region common to both buffers
	// is converted.
	void Surface::decodeX4R4G4B4(Buffer &destination, const Buffer &source)
	{
		const int depth = (source.depth < destination.depth) ? source.depth : destination.depth;
		const int height = (source.height < destination.height) ? source.height : destination.height;
		const int width = (source.width < destination.width) ? source.width : destination.width;

		const unsigned char *srcSlice = (const unsigned char*)source.buffer;
		unsigned char *dstSlice = (unsigned char*)destination.buffer;

		for(int z = 0; z < depth; z++)
		{
			const unsigned char *srcRow = srcSlice;
			unsigned char *dstRow = dstSlice;

			for(int y = 0; y < height; y++)
			{
				for(int x = 0; x < width; x++)
				{
					const unsigned int packed = *(const unsigned short*)(srcRow + x * source.bytes);
					unsigned int *pixel = (unsigned int*)(dstRow + x * destination.bytes);

					// Multiplying by 0x11 (shifted into place) duplicates the nibble
					const unsigned int red = ((packed & 0x0F00) * 0x00001100) & 0x00FF0000;
					const unsigned int green = ((packed & 0x00F0) * 0x00000110) & 0x0000FF00;
					const unsigned int blue = (packed & 0x000F) * 0x00000011;

					*pixel = 0xFF000000 | red | green | blue;
				}

				srcRow += source.pitchB;
				dstRow += destination.pitchB;
			}

			srcSlice += source.sliceB;
			dstSlice += destination.sliceB;
		}
	}

	// Expands 16-bit A4R4G4B4 source elements into 32-bit destination elements.
	//
	// Each 4-bit channel, alpha included, is widened to 8 bits by replicating
	// the nibble. Only the region common to both buffers is converted.
	void Surface::decodeA4R4G4B4(Buffer &destination, const Buffer &source)
	{
		const int depth = (source.depth < destination.depth) ? source.depth : destination.depth;
		const int height = (source.height < destination.height) ? source.height : destination.height;
		const int width = (source.width < destination.width) ? source.width : destination.width;

		const unsigned char *srcSlice = (const unsigned char*)source.buffer;
		unsigned char *dstSlice = (unsigned char*)destination.buffer;

		for(int z = 0; z < depth; z++)
		{
			const unsigned char *srcRow = srcSlice;
			unsigned char *dstRow = dstSlice;

			for(int y = 0; y < height; y++)
			{
				for(int x = 0; x < width; x++)
				{
					const unsigned int packed = *(const unsigned short*)(srcRow + x * source.bytes);
					unsigned int *pixel = (unsigned int*)(dstRow + x * destination.bytes);

					// Multiplying by 0x11 (shifted into place) duplicates the nibble
					const unsigned int alpha = ((packed & 0xF000) * 0x00011000) & 0xFF000000;
					const unsigned int red = ((packed & 0x0F00) * 0x00001100) & 0x00FF0000;
					const unsigned int green = ((packed & 0x00F0) * 0x00000110) & 0x0000FF00;
					const unsigned int blue = (packed & 0x000F) * 0x00000011;

					*pixel = alpha | red | green | blue;
				}

				srcRow += source.pitchB;
				dstRow += destination.pitchB;
			}

			srcSlice += source.sliceB;
			dstSlice += destination.sliceB;
		}
	}

	// Resolves 8-bit palette indices into 32-bit destination elements.
	//
	// Every source byte indexes Surface::palette. The looked-up entry has its
	// lowest and third bytes exchanged (bits 0-7 <-> bits 16-23) while the
	// other two bytes stay in place. Only the region common to both buffers is
	// converted.
	void Surface::decodeP8(Buffer &destination, const Buffer &source)
	{
		const int depth = (source.depth < destination.depth) ? source.depth : destination.depth;
		const int height = (source.height < destination.height) ? source.height : destination.height;
		const int width = (source.width < destination.width) ? source.width : destination.width;

		const unsigned char *srcSlice = (const unsigned char*)source.buffer;
		unsigned char *dstSlice = (unsigned char*)destination.buffer;

		for(int z = 0; z < depth; z++)
		{
			const unsigned char *srcRow = srcSlice;
			unsigned char *dstRow = dstSlice;

			for(int y = 0; y < height; y++)
			{
				for(int x = 0; x < width; x++)
				{
					const unsigned char index = *(srcRow + x * source.bytes);
					unsigned int *pixel = (unsigned int*)(dstRow + x * destination.bytes);

					const unsigned int entry = palette[index];

					// Keep bytes 1 and 3, swap bytes 0 and 2
					*pixel = (entry & 0xFF00FF00) | ((entry & 0x000000FF) << 16) | ((entry & 0x00FF0000) >> 16);
				}

				srcRow += source.pitchB;
				dstRow += destination.pitchB;
			}

			srcSlice += source.sliceB;
			dstSlice += destination.sliceB;
		}
	}

#if S3TC_SUPPORT
	// Decompresses DXT1 blocks from 'external' into 32-bit elements of 'internal'.
	//
	// Blocks are consumed sequentially, one per 4x4 tile, row by row and slice
	// by slice over the external dimensions. Each block carries two endpoint
	// colors (c0, c1) and a 2-bit-per-texel lookup table:
	//  - c0 > c1:  four opaque colors, c2 and c3 interpolated at 1/3 and 2/3.
	//  - otherwise: c2 is the midpoint and c3 is transparent black.
	// Texels that fall outside the internal width/height are skipped.
	//
	// Destination rows are addressed through internal.pitchB so that a padded
	// row pitch (see pitchB()) does not skew the output; for a tightly packed
	// buffer this is identical to indexing by width.
	void Surface::decodeDXT1(Buffer &internal, const Buffer &external)
	{
		byte *destSlice = (byte*)internal.buffer;
		const DXT1 *source = (const DXT1*)external.buffer;

		for(int z = 0; z < external.depth; z++)
		{
			for(int y = 0; y < external.height; y += 4)
			{
				for(int x = 0; x < external.width; x += 4)
				{
					Color<byte> c[4];

					c[0] = source->c0;
					c[1] = source->c1;

					if(source->c0 > source->c1)   // No transparency
					{
						// c2 = 2 / 3 * c0 + 1 / 3 * c1
						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
						c[2].a = 0xFF;

						// c3 = 1 / 3 * c0 + 2 / 3 * c1
						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
						c[3].a = 0xFF;
					}
					else   // c3 transparent
					{
						// c2 = 1 / 2 * c0 + 1 / 2 * c1
						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
						c[2].a = 0xFF;

						c[3].r = 0;
						c[3].g = 0;
						c[3].b = 0;
						c[3].a = 0;
					}

					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
					{
						// Step rows by the byte pitch, not by the width
						unsigned int *destRow = (unsigned int*)(destSlice + (y + j) * internal.pitchB);

						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
						{
							// Two lookup bits per texel select one of the four colors
							destRow[x + i] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
						}
					}

					source++;
				}
			}

			destSlice += internal.sliceB;
		}
	}

	// Decompresses DXT3 blocks from 'external' into 32-bit elements of 'internal'.
	//
	// Blocks are consumed sequentially, one per 4x4 tile, row by row and slice
	// by slice over the external dimensions. Color always uses the four-color
	// mode (c2 and c3 interpolated at 1/3 and 2/3 between c0 and c1). Alpha is
	// stored explicitly as 4 bits per texel and widened to 8 bits by
	// replicating the nibble into the top byte of the output element.
	// Texels that fall outside the internal width/height are skipped.
	//
	// Destination rows are addressed through internal.pitchB so that a padded
	// row pitch (see pitchB()) does not skew the output; for a tightly packed
	// buffer this is identical to indexing by width.
	void Surface::decodeDXT3(Buffer &internal, const Buffer &external)
	{
		byte *destSlice = (byte*)internal.buffer;
		const DXT3 *source = (const DXT3*)external.buffer;

		for(int z = 0; z < external.depth; z++)
		{
			for(int y = 0; y < external.height; y += 4)
			{
				for(int x = 0; x < external.width; x += 4)
				{
					Color<byte> c[4];

					c[0] = source->c0;
					c[1] = source->c1;

					// c2 = 2 / 3 * c0 + 1 / 3 * c1
					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);

					// c3 = 1 / 3 * c0 + 2 / 3 * c1
					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);

					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
					{
						// Step rows by the byte pitch, not by the width
						unsigned int *destRow = (unsigned int*)(destSlice + (y + j) * internal.pitchB);

						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
						{
							// 4-bit alpha, replicated to fill the top byte
							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));

							destRow[x + i] = color;
						}
					}

					source++;
				}
			}

			destSlice += internal.sliceB;
		}
	}

	// Decompresses DXT5 blocks from 'external' into 32-bit elements of 'internal'.
	//
	// Blocks are consumed sequentially, one per 4x4 tile, row by row and slice
	// by slice over the external dimensions. Color always uses the four-color
	// mode (c2 and c3 interpolated at 1/3 and 2/3 between c0 and c1). Alpha is
	// selected per texel by a 3-bit index into an eight-entry table built from
	// the two alpha endpoints:
	//  - a0 > a1:  six values interpolated between the endpoints.
	//  - otherwise: four interpolated values followed by 0 and 0xFF.
	// Texels that fall outside the internal width/height are skipped.
	//
	// Destination rows are addressed through internal.pitchB so that a padded
	// row pitch (see pitchB()) does not skew the output; for a tightly packed
	// buffer this is identical to indexing by width.
	void Surface::decodeDXT5(Buffer &internal, const Buffer &external)
	{
		byte *destSlice = (byte*)internal.buffer;
		const DXT5 *source = (const DXT5*)external.buffer;

		for(int z = 0; z < external.depth; z++)
		{
			for(int y = 0; y < external.height; y += 4)
			{
				for(int x = 0; x < external.width; x += 4)
				{
					Color<byte> c[4];

					c[0] = source->c0;
					c[1] = source->c1;

					// c2 = 2 / 3 * c0 + 1 / 3 * c1
					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);

					// c3 = 1 / 3 * c0 + 2 / 3 * c1
					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);

					byte a[8];

					a[0] = source->a0;
					a[1] = source->a1;

					if(a[0] > a[1])
					{
						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
					}
					else
					{
						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
						a[6] = 0;
						a[7] = 0xFF;
					}

					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
					{
						// Step rows by the byte pitch, not by the width
						unsigned int *destRow = (unsigned int*)(destSlice + (y + j) * internal.pitchB);

						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
						{
							// The alpha indices start 16 bits into alut
							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;

							destRow[x + i] = color;
						}
					}

					source++;
				}
			}

			destSlice += internal.sliceB;
		}
	}
#endif

	// Decodes ATI1 blocks from 'external' into one byte per texel in 'internal'.
	// Every source block covers a 4x4 texel tile and holds two endpoint values
	// (r0, r1) followed by a 3-bit palette index for each of the 16 texels.
	void Surface::decodeATI1(Buffer &internal, const Buffer &external)
	{
		const ATI1 *block = (const ATI1*)external.buffer;
		byte *sliceBase = (byte*)internal.buffer;

		for(int slice = 0; slice < external.depth; slice++)
		{
			for(int tileY = 0; tileY < external.height; tileY += 4)
			{
				for(int tileX = 0; tileX < external.width; tileX += 4)
				{
					// Build the eight-entry palette from the two endpoints
					byte palette[8];

					palette[0] = block->r0;
					palette[1] = block->r1;

					const int first = palette[0];
					const int second = palette[1];

					if(first > second)
					{
						// Six values interpolated between the endpoints, rounded to nearest
						for(int k = 1; k <= 6; k++)
						{
							palette[k + 1] = (byte)(((7 - k) * first + k * second + 3) / 7);
						}
					}
					else
					{
						// Four interpolated values, followed by explicit 0 and 255 entries
						for(int k = 1; k <= 4; k++)
						{
							palette[k + 1] = (byte)(((5 - k) * first + k * second + 2) / 5);
						}

						palette[6] = 0;
						palette[7] = 0xFF;
					}

					// Texels of the tile that fall outside the destination are skipped
					for(int row = 0; row < 4 && (tileY + row) < internal.height; row++)
					{
						for(int col = 0; col < 4 && (tileX + col) < internal.width; col++)
						{
							// The indices start above the 16 endpoint bits, 3 bits per texel
							const int shift = 16 + 3 * (col + row * 4);
							const unsigned int index = (unsigned int)(block->rlut >> shift) % 8;

							sliceBase[(tileX + col) + (tileY + row) * internal.width] = palette[index];
						}
					}

					block++;
				}
			}

			sliceBase += internal.sliceB;
		}
	}

	void Surface::decodeATI2(Buffer &internal, const Buffer &external)
	{
		word *destSlice = (word*)internal.buffer;
		const ATI2 *source = (const ATI2*)external.buffer;

		for(int z = 0; z < external.depth; z++)
		{
			word *dest = destSlice;

			for(int y = 0; y < external.height; y += 4)
			{
				for(int x = 0; x < external.width; x += 4)
				{
					byte X[8];

					X[0] = source->x0;
					X[1] = source->x1;
					
					if(X[0] > X[1])
					{
						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
					}
					else
					{
						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
						X[6] = 0;
						X[7] = 0xFF;
					}

					byte Y[8];

					Y[0] = source->y0;
					Y[1] = source->y1;
					
					if(Y[0] > Y[1])
					{
						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
					}
					else
					{
						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
						Y[6] = 0;
						Y[7] = 0xFF;
					}

					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
					{
						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
						{
							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];

							dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
						}
					}

					source++;
				}
			}

			(byte*&)destSlice += internal.sliceB;
		}
	}

	// Decodes ETC2 data from 'external' into 'internal' via ETC_Decoder.
	// nbAlphaBits selects the variant handed to the decoder (8 -> ETC_RGBA,
	// 1 -> ETC_RGB_PUNCHTHROUGH_ALPHA, anything else -> ETC_RGB). When isSRGB
	// is set, the first three bytes of every decoded texel are afterwards
	// converted from sRGB to linear in place.
	void Surface::decodeETC2(Buffer &internal, const Buffer &external, int nbAlphaBits, bool isSRGB)
	{
		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));

		if(!isSRGB)
		{
			return;
		}

		// 8-bit sRGB -> 8-bit linear lookup table, filled on first use.
		// NOTE(review): the lazy initialisation is not synchronised - confirm callers are serialised.
		static byte linearFromSRGB[256];
		static bool tableReady = false;

		if(!tableReady)
		{
			for(int value = 0; value < 256; value++)
			{
				linearFromSRGB[value] = static_cast<byte>(sRGBtoLinear(static_cast<float>(value) / 255.0f) * 255.0f + 0.5f);
			}

			tableReady = true;
		}

		// Convert the decoded image in place, row by row
		byte *row = (byte*)internal.buffer;

		for(int y = 0; y < internal.height; y++, row += internal.pitchB)
		{
			byte *texel = row;

			for(int x = 0; x < internal.width; x++, texel += internal.bytes)
			{
				// Only the first three bytes are remapped; any further bytes are left as decoded
				texel[0] = linearFromSRGB[texel[0]];
				texel[1] = linearFromSRGB[texel[1]];
				texel[2] = linearFromSRGB[texel[2]];
			}
		}
	}

	// Decodes EAC data (one or two channels) from 'external' into 'internal'
	// via ETC_Decoder. For signed data the decoded 8-bit values are then
	// rewritten in place as floats clamped to [-1, 1].
	//
	// nbChannels must be 1 or 2 (asserted); isSigned selects the signed decoder
	// variant and enables the float conversion pass.
	void Surface::decodeEAC(Buffer &internal, const Buffer &external, int nbChannels, bool isSigned)
	{
		ASSERT(nbChannels == 1 || nbChannels == 2);

		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));

		// FIXME: We convert signed data to float, until signed integer internal formats are supported
		//        This code can be removed if signed ETC2 images are decoded to internal 8 bit signed R/RG formats
		if(isSigned)
		{
			sbyte* src = (sbyte*)internal.buffer;

			for(int y = 0; y < internal.height; y++)
			{
				sbyte* srcRow = src + y * internal.pitchB;
				// Source and destination share the same row. Texels and channels are
				// visited from last to first - presumably so that the wider float writes
				// never overwrite signed bytes that have not been read yet (TODO confirm
				// against the decoder's output layout).
				for(int x = internal.width - 1; x >= 0; x--)
				{
					int dx = x & 0xFFFFFFFC;   // x rounded down to a multiple of 4
					int mx = x - dx;           // position of x within its group of 4 texels
					// Signed bytes are read at nbChannels-byte spacing within each group of 4 texels,
					// while the groups themselves start at the full internal.bytes texel stride
					sbyte* srcPix = srcRow + dx * internal.bytes + mx * nbChannels;
					float* dstPix = (float*)(srcRow + x * internal.bytes);
					for(int c = nbChannels - 1; c >= 0; c--)
					{
						// Scale the signed byte and clamp the result to [-1, 1]
						static const float normalization = 1.0f / 127.875f;
						dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
					}
				}
			}
		}
	}

	// ASTC decoding entry point. Currently a stub: the body is empty, so all
	// parameters are ignored and 'internal' is left untouched.
	void Surface::decodeASTC(Buffer &internal, const Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
	{
	}

	unsigned int Surface::size(int width, int height, int depth, Format format)
	{
		// Dimensions rounded up to multiples of 4, used for compressed formats
		int width4 = align(width, 4);
		int height4 = align(height, 4);

		switch(format)
		{
		#if S3TC_SUPPORT
		case FORMAT_DXT1:
		#endif
		case FORMAT_ATI1:
		case FORMAT_ETC1:
		case FORMAT_R11_EAC:
		case FORMAT_SIGNED_R11_EAC:
		case FORMAT_RGB8_ETC2:
		case FORMAT_SRGB8_ETC2:
		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
			return width4 * height4 * depth / 2;
		#if S3TC_SUPPORT
		case FORMAT_DXT3:
		case FORMAT_DXT5:
		#endif
		case FORMAT_ATI2:
		case FORMAT_RG11_EAC:
		case FORMAT_SIGNED_RG11_EAC:
		case FORMAT_RGBA8_ETC2_EAC:
		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
		case FORMAT_RGBA_ASTC_4x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
			return width4 * height4 * depth;
		case FORMAT_RGBA_ASTC_5x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
			return align(width, 5) * height4 * depth;
		case FORMAT_RGBA_ASTC_5x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
			return align(width, 5) * align(height, 5) * depth;
		case FORMAT_RGBA_ASTC_6x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
			return align(width, 6) * align(height, 5) * depth;
		case FORMAT_RGBA_ASTC_6x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
			return align(width, 6) * align(height, 6) * depth;
		case FORMAT_RGBA_ASTC_8x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
			return align(width, 8) * align(height, 5) * depth;
		case FORMAT_RGBA_ASTC_8x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
			return align(width, 8) * align(height, 6) * depth;
		case FORMAT_RGBA_ASTC_8x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
			return align(width, 8) * align(height, 8) * depth;
		case FORMAT_RGBA_ASTC_10x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
			return align(width, 10) * align(height, 5) * depth;
		case FORMAT_RGBA_ASTC_10x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
			return align(width, 10) * align(height, 6) * depth;
		case FORMAT_RGBA_ASTC_10x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
			return align(width, 10) * align(height, 8) * depth;
		case FORMAT_RGBA_ASTC_10x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
			return align(width, 10) * align(height, 10) * depth;
		case FORMAT_RGBA_ASTC_12x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
			return align(width, 12) * align(height, 10) * depth;
		case FORMAT_RGBA_ASTC_12x12_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
			return align(width, 12) * align(height, 12) * depth;
		case FORMAT_YV12_BT601:
		case FORMAT_YV12_BT709:
		case FORMAT_YV12_JFIF:
			{
				unsigned int YStride = align(width, 16);
				unsigned int YSize = YStride * height;
				unsigned int CStride = align(YStride / 2, 16);
 				unsigned int CSize = CStride * height / 2;

				return YSize + 2 * CSize;
			}
		default:
			return bytes(format) * width * height * depth;
		}

		return 0;
	}

	// Returns true when 'format' is one of the formats listed below as carrying
	// stencil data; depth-only and all unlisted formats yield false.
	bool Surface::isStencil(Format format)
	{
		bool hasStencil = false;

		switch(format)
		{
		case FORMAT_D24S8:
		case FORMAT_D24FS8:
		case FORMAT_S8:
		case FORMAT_DF24S8:
		case FORMAT_DF16S8:
		case FORMAT_D32FS8_TEXTURE:
		case FORMAT_D32FS8_SHADOW:
		case FORMAT_INTZ:
			hasStencil = true;
			break;
		// Depth-only formats
		case FORMAT_D32:
		case FORMAT_D16:
		case FORMAT_D24X8:
		case FORMAT_D32F:
		case FORMAT_D32F_COMPLEMENTARY:
		case FORMAT_D32F_LOCKABLE:
			break;
		default:
			break;
		}

		return hasStencil;
	}

	// Returns true when 'format' is one of the formats listed below as carrying
	// depth data; the stencil-only format and all unlisted formats yield false.
	bool Surface::isDepth(Format format)
	{
		bool hasDepth = false;

		switch(format)
		{
		case FORMAT_D32:
		case FORMAT_D16:
		case FORMAT_D24X8:
		case FORMAT_D24S8:
		case FORMAT_D24FS8:
		case FORMAT_D32F:
		case FORMAT_D32F_COMPLEMENTARY:
		case FORMAT_D32F_LOCKABLE:
		case FORMAT_DF24S8:
		case FORMAT_DF16S8:
		case FORMAT_D32FS8_TEXTURE:
		case FORMAT_D32FS8_SHADOW:
		case FORMAT_INTZ:
			hasDepth = true;
			break;
		// Stencil only
		case FORMAT_S8:
			break;
		default:
			break;
		}

		return hasDepth;
	}

	// Returns true for the two palettized formats, FORMAT_P8 and FORMAT_A8P8.
	bool Surface::isPalette(Format format)
	{
		return format == FORMAT_P8 || format == FORMAT_A8P8;
	}

	// Returns true when 'format' is one of the floating-point formats listed
	// below and false for the listed integer/normalized formats. Any format
	// that appears in neither list trips an assertion and yields false.
	bool Surface::isFloatFormat(Format format)
	{
		bool isFloat = false;

		switch(format)
		{
		// Floating-point formats
		case FORMAT_R32F:
		case FORMAT_G32R32F:
		case FORMAT_A32B32G32R32F:
		case FORMAT_D32F:
		case FORMAT_D32F_COMPLEMENTARY:
		case FORMAT_D32F_LOCKABLE:
		case FORMAT_D32FS8_TEXTURE:
		case FORMAT_D32FS8_SHADOW:
		case FORMAT_L16F:
		case FORMAT_A16L16F:
		case FORMAT_L32F:
		case FORMAT_A32L32F:
			isFloat = true;
			break;
		// Known non-float formats
		case FORMAT_R5G6B5:
		case FORMAT_X8R8G8B8:
		case FORMAT_X8B8G8R8I:
		case FORMAT_X8B8G8R8:
		case FORMAT_A8R8G8B8:
		case FORMAT_A8B8G8R8I:
		case FORMAT_R8UI:
		case FORMAT_G8R8UI:
		case FORMAT_X8B8G8R8UI:
		case FORMAT_A8B8G8R8UI:
		case FORMAT_A8B8G8R8:
		case FORMAT_G8R8I:
		case FORMAT_G8R8:
		case FORMAT_R8I_SNORM:
		case FORMAT_G8R8I_SNORM:
		case FORMAT_X8B8G8R8I_SNORM:
		case FORMAT_A8B8G8R8I_SNORM:
		case FORMAT_R16I:
		case FORMAT_R16UI:
		case FORMAT_G16R16I:
		case FORMAT_G16R16UI:
		case FORMAT_G16R16:
		case FORMAT_X16B16G16R16I:
		case FORMAT_X16B16G16R16UI:
		case FORMAT_A16B16G16R16I:
		case FORMAT_A16B16G16R16UI:
		case FORMAT_A16B16G16R16:
		case FORMAT_V8U8:
		case FORMAT_Q8W8V8U8:
		case FORMAT_X8L8V8U8:
		case FORMAT_V16U16:
		case FORMAT_A16W16V16U16:
		case FORMAT_Q16W16V16U16:
		case FORMAT_A8:
		case FORMAT_R8I:
		case FORMAT_R8:
		case FORMAT_L8:
		case FORMAT_L16:
		case FORMAT_A8L8:
		case FORMAT_YV12_BT601:
		case FORMAT_YV12_BT709:
		case FORMAT_YV12_JFIF:
		case FORMAT_R32I:
		case FORMAT_R32UI:
		case FORMAT_G32R32I:
		case FORMAT_G32R32UI:
		case FORMAT_X32B32G32R32I:
		case FORMAT_X32B32G32R32UI:
		case FORMAT_A32B32G32R32I:
		case FORMAT_A32B32G32R32UI:
			break;
		default:
			// Format missing from both lists
			ASSERT(false);
			break;
		}

		return isFloat;
	}

	// Reports whether component number 'component' of 'format' is treated as
	// unsigned. For formats with a mix of signed and unsigned components the
	// leading components are the signed ones. Formats not listed here trip an
	// assertion and yield false.
	bool Surface::isUnsignedComponent(Format format, int component)
	{
		switch(format)
		{
		// All components unsigned
		case FORMAT_NULL:
		case FORMAT_R5G6B5:
		case FORMAT_X8R8G8B8:
		case FORMAT_X8B8G8R8:
		case FORMAT_A8R8G8B8:
		case FORMAT_A8B8G8R8:
		case FORMAT_G8R8:
		case FORMAT_G16R16:
		case FORMAT_A16B16G16R16:
		case FORMAT_D32F:
		case FORMAT_D32F_COMPLEMENTARY:
		case FORMAT_D32F_LOCKABLE:
		case FORMAT_D32FS8_TEXTURE:
		case FORMAT_D32FS8_SHADOW:
		case FORMAT_A8:
		case FORMAT_R8:
		case FORMAT_L8:
		case FORMAT_L16:
		case FORMAT_A8L8:
		case FORMAT_YV12_BT601:
		case FORMAT_YV12_BT709:
		case FORMAT_YV12_JFIF:
			return true;
		// First two components signed
		case FORMAT_V8U8:
		case FORMAT_X8L8V8U8:
		case FORMAT_V16U16:
		case FORMAT_G32R32F:
			return component >= 2;
		// First three components signed
		case FORMAT_A16W16V16U16:
			return component >= 3;
		// First component signed
		case FORMAT_R32F:
			return component >= 1;
		// All components signed
		case FORMAT_Q8W8V8U8:
		case FORMAT_Q16W16V16U16:
		case FORMAT_A32B32G32R32F:
			return false;
		default:
			ASSERT(false);
			break;
		}

		return false;
	}

	// Returns true when 'format' is in the list of formats that can be read
	// with sRGB conversion.
	bool Surface::isSRGBreadable(Format format)
	{
		// Keep in sync with Capabilities::isSRGBreadable
		bool readable = false;

		switch(format)
		{
		case FORMAT_L8:
		case FORMAT_A8L8:
		case FORMAT_R8G8B8:
		case FORMAT_A8R8G8B8:
		case FORMAT_X8R8G8B8:
		case FORMAT_A8B8G8R8:
		case FORMAT_X8B8G8R8:
		case FORMAT_R5G6B5:
		case FORMAT_X1R5G5B5:
		case FORMAT_A1R5G5B5:
		case FORMAT_A4R4G4B4:
		#if S3TC_SUPPORT
		case FORMAT_DXT1:
		case FORMAT_DXT3:
		case FORMAT_DXT5:
		#endif
		case FORMAT_ATI1:
		case FORMAT_ATI2:
			readable = true;
			break;
		default:
			break;
		}

		return readable;
	}

	// Returns true when 'format' is in the list of formats that can be written
	// with sRGB conversion.
	bool Surface::isSRGBwritable(Format format)
	{
		// Keep in sync with Capabilities::isSRGBwritable
		bool writable = false;

		switch(format)
		{
		case FORMAT_NULL:
		case FORMAT_A8R8G8B8:
		case FORMAT_X8R8G8B8:
		case FORMAT_A8B8G8R8:
		case FORMAT_X8B8G8R8:
		case FORMAT_R5G6B5:
			writable = true;
			break;
		default:
			break;
		}

		return writable;
	}

	// Returns true when 'format' is one of the block-compressed formats
	// (S3TC when enabled, ATI1/ATI2, ETC1, ETC2/EAC and ASTC).
	bool Surface::isCompressed(Format format)
	{
		bool compressed = false;

		switch(format)
		{
		// S3TC
		#if S3TC_SUPPORT
		case FORMAT_DXT1:
		case FORMAT_DXT3:
		case FORMAT_DXT5:
		#endif
		// ATI1 / ATI2
		case FORMAT_ATI1:
		case FORMAT_ATI2:
		// ETC1, ETC2 and EAC
		case FORMAT_ETC1:
		case FORMAT_R11_EAC:
		case FORMAT_SIGNED_R11_EAC:
		case FORMAT_RG11_EAC:
		case FORMAT_SIGNED_RG11_EAC:
		case FORMAT_RGB8_ETC2:
		case FORMAT_SRGB8_ETC2:
		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_RGBA8_ETC2_EAC:
		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
		// ASTC
		case FORMAT_RGBA_ASTC_4x4_KHR:
		case FORMAT_RGBA_ASTC_5x4_KHR:
		case FORMAT_RGBA_ASTC_5x5_KHR:
		case FORMAT_RGBA_ASTC_6x5_KHR:
		case FORMAT_RGBA_ASTC_6x6_KHR:
		case FORMAT_RGBA_ASTC_8x5_KHR:
		case FORMAT_RGBA_ASTC_8x6_KHR:
		case FORMAT_RGBA_ASTC_8x8_KHR:
		case FORMAT_RGBA_ASTC_10x5_KHR:
		case FORMAT_RGBA_ASTC_10x6_KHR:
		case FORMAT_RGBA_ASTC_10x8_KHR:
		case FORMAT_RGBA_ASTC_10x10_KHR:
		case FORMAT_RGBA_ASTC_12x10_KHR:
		case FORMAT_RGBA_ASTC_12x12_KHR:
		// ASTC, sRGB variants
		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
			compressed = true;
			break;
		default:
			break;
		}

		return compressed;
	}

	// Returns true when 'format' is one of the pure integer formats
	// (the ...I and ...UI formats with 8, 16 or 32 bits per component).
	bool Surface::isNonNormalizedInteger(Format format)
	{
		bool pureInteger = false;

		switch(format)
		{
		// 8 bits per component
		case FORMAT_A8B8G8R8I:
		case FORMAT_X8B8G8R8I:
		case FORMAT_G8R8I:
		case FORMAT_R8I:
		case FORMAT_A8B8G8R8UI:
		case FORMAT_X8B8G8R8UI:
		case FORMAT_G8R8UI:
		case FORMAT_R8UI:
		// 16 bits per component
		case FORMAT_A16B16G16R16I:
		case FORMAT_X16B16G16R16I:
		case FORMAT_G16R16I:
		case FORMAT_R16I:
		case FORMAT_A16B16G16R16UI:
		case FORMAT_X16B16G16R16UI:
		case FORMAT_G16R16UI:
		case FORMAT_R16UI:
		// 32 bits per component
		case FORMAT_A32B32G32R32I:
		case FORMAT_X32B32G32R32I:
		case FORMAT_G32R32I:
		case FORMAT_R32I:
		case FORMAT_A32B32G32R32UI:
		case FORMAT_X32B32G32R32UI:
		case FORMAT_G32R32UI:
		case FORMAT_R32UI:
			pureInteger = true;
			break;
		default:
			break;
		}

		return pureInteger;
	}

	// Returns the number of components of 'format'. Formats not listed
	// here trip an assertion and report a single component.
	int Surface::componentCount(Format format)
	{
		switch(format)
		{
		case FORMAT_R32F:
		case FORMAT_D32F_LOCKABLE:
		case FORMAT_D32FS8_TEXTURE:
		case FORMAT_D32FS8_SHADOW:
		case FORMAT_A8:
		case FORMAT_R8:
		case FORMAT_L8:
		case FORMAT_L16:
			return 1;
		case FORMAT_G8R8:
		case FORMAT_G16R16:
		case FORMAT_V8U8:
		case FORMAT_V16U16:
		case FORMAT_G32R32F:
		case FORMAT_A8L8:
			return 2;
		case FORMAT_R5G6B5:
		case FORMAT_X8R8G8B8:
		case FORMAT_X8B8G8R8:
		case FORMAT_X8L8V8U8:
		case FORMAT_YV12_BT601:
		case FORMAT_YV12_BT709:
		case FORMAT_YV12_JFIF:
			return 3;
		case FORMAT_A8R8G8B8:
		case FORMAT_A8B8G8R8:
		case FORMAT_A16B16G16R16:
		case FORMAT_Q8W8V8U8:
		case FORMAT_A16W16V16U16:
		case FORMAT_Q16W16V16U16:
		case FORMAT_A32B32G32R32F:
			return 4;
		default:
			ASSERT(false);
			break;
		}

		return 1;
	}

	// Allocates zero-initialized storage for a surface of the given dimensions
	// and format, with both width and height rounded up to even values.
	void *Surface::allocateBuffer(int width, int height, int depth, Format format)
	{
		// Render targets require 2x2 quads, so round each dimension up to the next even value
		const int evenWidth = (width + 1) & ~1;
		const int evenHeight = (height + 1) & ~1;

		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
		// so we have to allocate 4 extra bytes to avoid buffer overruns.
		const unsigned int paddedSize = size(evenWidth, evenHeight, depth, format) + 4;

		return allocateZero(paddedSize);
	}

	// Fills 'bytes' bytes starting at 'buffer' with the 32-bit value 'pattern'.
	//
	// Unaligned leading bytes and leftover trailing bytes are written using the
	// low byte / low 16 bits of 'pattern', so callers filling with elements
	// narrower than 32 bits replicate the element across the whole pattern.
	// When SSE is available the bulk is written 64 bytes at a time with
	// streaming stores. A non-positive byte count is a no-op.
	void Surface::memfill4(void *buffer, int pattern, int bytes)
	{
		// A negative count would make the 64-byte block counter below negative,
		// and its post-decrement loop would then run far beyond the buffer.
		if(bytes <= 0)
		{
			return;
		}

		// Advance to 2-byte alignment
		while((size_t)buffer & 0x1 && bytes >= 1)
		{
			*(char*)buffer = (char)pattern;
			(char*&)buffer += 1;
			bytes -= 1;
		}

		// Advance to 4-byte alignment
		while((size_t)buffer & 0x3 && bytes >= 2)
		{
			*(short*)buffer = (short)pattern;
			(short*&)buffer += 1;
			bytes -= 2;
		}

		if(CPUID::supportsSSE())
		{
			// Advance to 16-byte alignment, as required by the streaming stores
			while((size_t)buffer & 0xF && bytes >= 4)
			{
				*(int*)buffer = pattern;
				(int*&)buffer += 1;
				bytes -= 4;
			}

			// Reinterpret the pattern bits as a float and broadcast them to all four lanes
			__m128 quad = _mm_set_ps1((float&)pattern);

			float *pointer = (float*)buffer;
			int qxwords = bytes / 64;   // Number of whole 64-byte blocks
			bytes -= qxwords * 64;

			while(qxwords--)
			{
				_mm_stream_ps(pointer + 0, quad);
				_mm_stream_ps(pointer + 4, quad);
				_mm_stream_ps(pointer + 8, quad);
				_mm_stream_ps(pointer + 12, quad);

				pointer += 16;
			}

			buffer = pointer;
		}

		// Remaining tail: whole ints first, then a short, then a single byte
		while(bytes >= 4)
		{
			*(int*)buffer = (int)pattern;
			(int*&)buffer += 1;
			bytes -= 4;
		}

		while(bytes >= 2)
		{
			*(short*)buffer = (short)pattern;
			(short*&)buffer += 1;
			bytes -= 2;
		}

		while(bytes >= 1)
		{
			*(char*)buffer = (char)pattern;
			(char*&)buffer += 1;
			bytes -= 1;
		}
	}

	void Surface::clearColorBuffer(float red, float green, float blue, float alpha, unsigned int rgbaMask, int x0, int y0, int width, int height)
	{
		// FIXME: Also clear buffers in other formats?

		// Not overlapping
		if(x0 > internal.width) return;
		if(y0 > internal.height) return;
		if(x0 + width < 0) return;
		if(y0 + height < 0) return;

		// Clip against dimensions
		if(x0 < 0) {width += x0; x0 = 0;}
		if(x0 + width > internal.width) width = internal.width - x0;
		if(y0 < 0) {height += y0; y0 = 0;}
		if(y0 + height > internal.height) height = internal.height - y0;

		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;

		int x1 = x0 + width;
		int y1 = y0 + height;

	//	if(lockable || !quadLayoutEnabled)
		{
			unsigned char *buffer = (unsigned char*)lockInternal(x0, y0, 0, lock, PUBLIC);

			for(int z = 0; z < internal.depth; z++)
			{
				unsigned char *target = buffer;

				for(int y = y0; y < y1; y++)
				{
					switch(internal.format)
					{
					case FORMAT_NULL:
						break;
					case FORMAT_X8R8G8B8:
					case FORMAT_A8R8G8B8:
				//	case FORMAT_X8G8R8B8Q:   // FIXME
				//	case FORMAT_A8G8R8B8Q:   // FIXME
						{
							unsigned char r8 = iround(red * 0xFF);
							unsigned char g8 = iround(green * 0xFF);
							unsigned char b8 = iround(blue * 0xFF);
							unsigned char a8 = iround(alpha * 0xFF);
							unsigned char a8r8g8b8[4] = {b8, g8, r8, a8};
							unsigned int colorARGB = (unsigned int&)a8r8g8b8;

							if(rgbaMask == 0xF || (internal.format == FORMAT_X8R8G8B8 && rgbaMask == 0x7))
							{
								memfill4(target, colorARGB, 4 * (x1 - x0));
							}
							else
							{
								unsigned int bgraMask = (rgbaMask & 0x1 ? 0x00FF0000 : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0) | (rgbaMask & 0x4 ? 0x000000FF : 0) | (rgbaMask & 0x8 ? 0xFF000000 : 0);
								unsigned int invMask = ~bgraMask;
								unsigned int maskedColor = colorARGB & bgraMask;
								unsigned int *target32 = (unsigned int*)target;

								for(int x = 0; x < width; x++)
								{
									target32[x] = maskedColor | (target32[x] & invMask);
								}
							}
						}
						break;
					case FORMAT_X8B8G8R8:
					case FORMAT_A8B8G8R8:
						{
							unsigned char r8 = iround(red * 0xFF);
							unsigned char g8 = iround(green * 0xFF);
							unsigned char b8 = iround(blue * 0xFF);
							unsigned char a8 = iround(alpha * 0xFF);
							unsigned char a8b8g8r8[4] = {r8, g8, b8, a8};
							unsigned int colorABGR = (unsigned int&)a8b8g8r8;

							if(rgbaMask == 0xF || (internal.format == FORMAT_X8B8G8R8 && rgbaMask == 0x7))
							{
								memfill4(target, colorABGR, 4 * (x1 - x0));
							}
							else
							{
								unsigned int rgbaMask32 = (rgbaMask & 0x1 ? 0x000000FF : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0) | (rgbaMask & 0x4 ? 0x00FF0000 : 0) | (rgbaMask & 0x8 ? 0xFF000000 : 0);
								unsigned int invMask = ~rgbaMask32;
								unsigned int maskedColor = colorABGR & rgbaMask32;
 								unsigned int *target32 = (unsigned int*)target;

								for(int x = 0; x < width; x++)
								{
									target32[x] = maskedColor | (target32[x] & invMask);
								}
							}
						}
						break;
					case FORMAT_G8R8:
						{
							unsigned char r8 = iround(red * 0xFF);
							unsigned char g8 = iround(green * 0xFF);
							unsigned char g8r8[4] = {r8, g8, r8, g8};

							if((rgbaMask & 0x3) == 0x3)
							{
								memfill4(target, (int&)g8r8, 2 * (x1 - x0));
							}
							else
							{
								unsigned short rgMask = (rgbaMask & 0x1 ? 0x000000FF : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0);
								unsigned short invMask = ~rgMask;
								unsigned short maskedColor = (unsigned short&)g8r8 & rgMask;
								unsigned short *target16 = (unsigned short*)target;

								for(int x = 0; x < width; x++)
								{
									target16[x] = maskedColor | (target16[x] & invMask);
								}
							}
						}
						break;
					case FORMAT_G16R16:
						{
							unsigned char r16 = iround(red * 0xFFFF);
							unsigned char g16 = iround(green * 0xFFFF);
							unsigned short g16r16[2] = {r16, g16};

							if((rgbaMask & 0x3) == 0x3)
							{
								memfill4(target, (int&)g16r16, 4 * (x1 - x0));
							}
							else
							{
								unsigned int rgMask = (rgbaMask & 0x1 ? 0x0000FFFF : 0) | (rgbaMask & 0x2 ? 0xFFFF0000 : 0);
								unsigned int invMask = ~rgMask;
								unsigned int maskedColor = (unsigned int&)g16r16 & rgMask;
								unsigned int *target32 = (unsigned int*)target;

								for(int x = 0; x < width; x++)
								{
									target32[x] = maskedColor | (target32[x] & invMask);
								}
							}
						}
						break;
					case FORMAT_A16B16G16R16:
						{
							unsigned char r16 = iround(red * 0xFFFF);
							unsigned char g16 = iround(green * 0xFFFF);
							unsigned char b16 = iround(blue * 0xFFFF);
							unsigned char a16 = iround(alpha * 0xFFFF);

							if(rgbaMask == 0xF)
							{
								for(int x = 0; x < width; x++)
								{
									((unsigned short*)target)[4 * x + 0] = r16;
									((unsigned short*)target)[4 * x + 1] = g16;
									((unsigned short*)target)[4 * x + 2] = b16;
									((unsigned short*)target)[4 * x + 3] = a16;
								}
							}
							else
							{
								if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 0] = r16;
								if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 1] = g16;
								if(rgbaMask & 0x4) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 2] = b16;
								if(rgbaMask & 0x8) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 3] = a16;
							}
						}
						break;
					case FORMAT_R32F:
						if(rgbaMask & 0x1)
						{
							for(int x = 0; x < width; x++)
							{
								((float*)target)[x] = red;
							}
						}
						break;
					case FORMAT_G32R32F:
						if((rgbaMask & 0x3) == 0x3)
						{
							for(int x = 0; x < width; x++)
							{
								((float*)target)[2 * x + 0] = red;
								((float*)target)[2 * x + 1] = green;
							}
						}
						else
						{
							if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((float*)target)[2 * x + 0] = red;
							if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((float*)target)[2 * x + 1] = green;
						}
						break;
					case FORMAT_A32B32G32R32F:
						if(rgbaMask == 0xF)
						{
							for(int x = 0; x < width; x++)
							{
								((float*)target)[4 * x + 0] = red;
								((float*)target)[4 * x + 1] = green;
								((float*)target)[4 * x + 2] = blue;
								((float*)target)[4 * x + 3] = alpha;
							}
						}
						else
						{
							if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 0] = red;
							if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 1] = green;
							if(rgbaMask & 0x4) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 2] = blue;
							if(rgbaMask & 0x8) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 3] = alpha;
						}
						break;
					case FORMAT_R5G6B5:
						{
							unsigned int r5 = iround(red * 0x1F);
							unsigned int g6 = iround(green * 0x3F);
							unsigned int b5 = iround(blue * 0x1F);
							unsigned int r5g6b5 = (r5 << 11) | (g6 << 5) | b5;

							if((rgbaMask & 0x7) == 0x7)
							{
								unsigned int r5g6b5r5g6b5 = r5g6b5 | (r5g6b5 << 16);
								memfill4(target, r5g6b5r5g6b5, 2 * (x1 - x0));
							}
							else
							{
								unsigned short rgbMask = (rgbaMask & 0x1 ? 0xF800 : 0) | (rgbaMask & 0x2 ? 0x07E0 : 0) | (rgbaMask & 0x3 ? 0x001F : 0);
								unsigned short invMask = ~rgbMask;
								unsigned short maskedColor = r5g6b5 & rgbMask;
								unsigned short *target16 = (unsigned short*)target;

								for(int x = 0; x < width; x++)
								{
									target16[x] = maskedColor | (target16[x] & invMask);
								}
							}
						}
						break;
					default:
						ASSERT(false);
					}

					target += internal.pitchB;
				}

				buffer += internal.sliceB;
			}

			unlockInternal();
		}
	/*	else
		{
			int width2 = (internal.width + 1) & ~1;

		//	unsigned char *target = (unsigned char*&)buffer;
		//
		//	for(int y = y0; y < y1; y++)
		//	{
		//		for(int x = x0; x < x1; x++)
		//		{
		//			target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 0] =  (color & 0x000000FF) >> 0;
		//			target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 4] =  (color & 0x00FF0000) >> 16;
		//			target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 8] =  (color & 0x0000FF00) >> 8;
		//			target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 12] = (color & 0xFF000000) >> 24;
		//		}
		//	}

			unsigned char colorQ[16];

			colorQ[0] =  (color & 0x000000FF) >> 0;
			colorQ[1] =  (color & 0x000000FF) >> 0;
			colorQ[2] =  (color & 0x000000FF) >> 0;
			colorQ[3] =  (color & 0x000000FF) >> 0;
			colorQ[4] =  (color & 0x00FF0000) >> 16;
			colorQ[5] =  (color & 0x00FF0000) >> 16;
			colorQ[6] =  (color & 0x00FF0000) >> 16;
			colorQ[7] =  (color & 0x00FF0000) >> 16;
			colorQ[8] =  (color & 0x0000FF00) >> 8;
			colorQ[9] =  (color & 0x0000FF00) >> 8;
			colorQ[10] = (color & 0x0000FF00) >> 8;
			colorQ[11] = (color & 0x0000FF00) >> 8;
			colorQ[12] = (color & 0xFF000000) >> 24;
			colorQ[13] = (color & 0xFF000000) >> 24;
			colorQ[14] = (color & 0xFF000000) >> 24;
			colorQ[15] = (color & 0xFF000000) >> 24;

			for(int y = y0; y < y1; y++)
			{
				unsigned char *target = (unsigned char*)lockInternal(0, 0, 0, lock) + width2 * 4 * (y & ~1) + 2 * (y & 1);   // FIXME: Unlock

				if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
				{
					if((x0 & 1) != 0)
					{
						target[8 * (x0 & ~1) + 1 + 0] =  (color & 0x000000FF) >> 0;
						target[8 * (x0 & ~1) + 1 + 4] =  (color & 0x00FF0000) >> 16;
						target[8 * (x0 & ~1) + 1 + 8] =  (color & 0x0000FF00) >> 8;
						target[8 * (x0 & ~1) + 1 + 12] = (color & 0xFF000000) >> 24;

						target[8 * (x0 & ~1) + 3 + 0] =  (color & 0x000000FF) >> 0;
						target[8 * (x0 & ~1) + 3 + 4] =  (color & 0x00FF0000) >> 16;
						target[8 * (x0 & ~1) + 3 + 8] =  (color & 0x0000FF00) >> 8;
						target[8 * (x0 & ~1) + 3 + 12] = (color & 0xFF000000) >> 24;
					}

					__asm
					{
						movq mm0, colorQ+0
						movq mm1, colorQ+8

						mov eax, x0
						add eax, 1
						and eax, 0xFFFFFFFE
						cmp eax, x1
						jge qEnd

						mov edi, target

					qLoop:
						movntq [edi+8*eax+0], mm0
						movntq [edi+8*eax+8], mm1

						add eax, 2
						cmp eax, x1
						jl qLoop
					qEnd:
						emms
					}

					if((x1 & 1) != 0)
					{
						target[8 * (x1 & ~1) + 0 + 0] =  (color & 0x000000FF) >> 0;
						target[8 * (x1 & ~1) + 0 + 4] =  (color & 0x00FF0000) >> 16;
						target[8 * (x1 & ~1) + 0 + 8] =  (color & 0x0000FF00) >> 8;
						target[8 * (x1 & ~1) + 0 + 12] = (color & 0xFF000000) >> 24;

						target[8 * (x1 & ~1) + 2 + 0] =  (color & 0x000000FF) >> 0;
						target[8 * (x1 & ~1) + 2 + 4] =  (color & 0x00FF0000) >> 16;
						target[8 * (x1 & ~1) + 2 + 8] =  (color & 0x0000FF00) >> 8;
						target[8 * (x1 & ~1) + 2 + 12] = (color & 0xFF000000) >> 24;
					}

					y++;
				}
				else
				{
					for(int x = x0; x < x1; x++)
					{
						target[8 * (x & ~1) + (x & 1) + 0] =  (color & 0x000000FF) >> 0;
						target[8 * (x & ~1) + (x & 1) + 4] =  (color & 0x00FF0000) >> 16;
						target[8 * (x & ~1) + (x & 1) + 8] =  (color & 0x0000FF00) >> 8;
						target[8 * (x & ~1) + (x & 1) + 12] = (color & 0xFF000000) >> 24;
					}
				}
			}
		}*/
	}

	// Clears a rectangle of the internal depth buffer to 'depth'.
	//
	// The rectangle (x0, y0, width, height) is clipped against the surface and
	// every slice is cleared. FORMAT_D32F_LOCKABLE, FORMAT_D32FS8_TEXTURE and
	// FORMAT_D32FS8_SHADOW are stored row by row; all other formats use the
	// 2x2 quad layout, where each pair of rows is interleaved and the value is
	// complemented (1 - depth) when complementaryDepthBuffer is set.
	void Surface::clearDepthBuffer(float depth, int x0, int y0, int width, int height)
	{
		// Not overlapping
		if(x0 > internal.width) return;
		if(y0 > internal.height) return;
		if(x0 + width < 0) return;
		if(y0 + height < 0) return;

		// Clip against dimensions
		if(x0 < 0) {width += x0; x0 = 0;}
		if(x0 + width > internal.width) width = internal.width - x0;
		if(y0 < 0) {height += y0; y0 = 0;}
		if(y0 + height > internal.height) height = internal.height - y0;

		// Nothing left to clear. Without this, the quad path below would still
		// write the partial column of an odd x0 even though the rectangle is empty.
		if(width <= 0 || height <= 0) return;

		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;

		// Row stride in texels: the width rounded up to an even value
		int width2 = (internal.width + 1) & ~1;

		int x1 = x0 + width;
		int y1 = y0 + height;

		if(internal.format == FORMAT_D32F_LOCKABLE ||
		   internal.format == FORMAT_D32FS8_TEXTURE ||
		   internal.format == FORMAT_D32FS8_SHADOW)
		{
			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);

			for(int z = 0; z < internal.depth; z++)
			{
				// Restart at (x0, y0) of the current slice
				float *target = buffer + x0 + width2 * y0;

				for(int y = y0; y < y1; y++)
				{
					memfill4(target, (int&)depth, 4 * width);
					target += width2;
				}

				buffer += internal.sliceP;
			}

			unlockInternal();
		}
		else   // Quad layout
		{
			if(complementaryDepthBuffer)
			{
				depth = 1 - depth;
			}

			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);

			for(int z = 0; z < internal.depth; z++)
			{
				for(int y = y0; y < y1; y++)
				{
					// Start of the quad row holding line y; odd lines sit two floats in
					float *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
			
					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
					{
						// Right half of the leading quad when x0 is odd (both lines)
						if((x0 & 1) != 0)
						{
							target[(x0 & ~1) * 2 + 1] = depth;
							target[(x0 & ~1) * 2 + 3] = depth;
						}

					//	for(int x2 = ((x0 + 1) & ~1) * 2; x2 < x1 * 2; x2 += 4)
					//	{
					//		target[x2 + 0] = depth;
					//		target[x2 + 1] = depth;
					//		target[x2 + 2] = depth;
					//		target[x2 + 3] = depth;
					//	}

					//	__asm
					//	{
					//		movss xmm0, depth
					//		shufps xmm0, xmm0, 0x00
					//
					//		mov eax, x0
					//		add eax, 1
					//		and eax, 0xFFFFFFFE
					//		cmp eax, x1
					//		jge qEnd
					//
					//		mov edi, target
					//
					//	qLoop:
					//		movntps [edi+8*eax], xmm0
					//
					//		add eax, 2
					//		cmp eax, x1
					//		jl qLoop
					//	qEnd:
					//	}

						// All whole quads in between: 4 floats (16 bytes) per quad
						memfill4(&target[((x0 + 1) & ~1) * 2], (int&)depth, 8 * ((x1 & ~1) - ((x0 + 1) & ~1)));

						// Left half of the trailing quad when x1 is odd (both lines)
						if((x1 & 1) != 0)
						{
							target[(x1 & ~1) * 2 + 0] = depth;
							target[(x1 & ~1) * 2 + 2] = depth;
						}

						// The odd line of this quad row has been filled as well
						y++;
					}
					else
					{
						// Single line of a quad row: write texel by texel
						for(int x = x0; x < x1; x++)
						{
							target[(x & ~1) * 2 + (x & 1)] = depth;
						}
					}
				}

				buffer += internal.sliceP;
			}

			unlockInternal();
		}
	}

	// Clears a rectangle of the stencil buffer.
	//
	// For every pixel of the rectangle the bits selected by 'mask' are replaced with
	// the corresponding bits of 's'; bits outside 'mask' keep their current value.
	// The rectangle (x0, y0, width, height) is clipped against the internal buffer's
	// dimensions; nothing is locked or written when the clipped rectangle is empty
	// or when 'mask' selects no bits. All stencil.depth slices are cleared.
	//
	// The buffer is addressed in quad layout: each pair of rows is interleaved so
	// that a 2x2 block of pixels occupies four consecutive bytes (element
	// (x & ~1) * 2 + (x & 1) of the even row, +2 for the odd row).
	void Surface::clearStencilBuffer(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
	{
		// Not overlapping
		if(x0 > internal.width) return;
		if(y0 > internal.height) return;
		if(x0 + width < 0) return;
		if(y0 + height < 0) return;

		// Clip against dimensions
		if(x0 < 0) {width += x0; x0 = 0;}
		if(x0 + width > internal.width) width = internal.width - x0;
		if(y0 < 0) {height += y0; y0 = 0;}
		if(y0 + height > internal.height) height = internal.height - y0;

		// Nothing to modify. This also keeps a negative byte count away from memfill4.
		if(mask == 0 || width <= 0 || height <= 0) return;

		const int width2 = (internal.width + 1) & ~1;   // Row length rounded up to an even number of pixels

		const int x1 = x0 + width;
		const int y1 = y0 + height;

		// Byte offsets within a row pair; they only depend on x0 and x1.
		const int oddX0 = (x0 & ~1) * 2 + 1;      // Right column of the quad containing an odd x0
		const int evenX0 = ((x0 + 1) & ~1) * 2;   // First quad lying entirely inside the rectangle
		const int oddX1 = (x1 & ~1) * 2;          // Left column of the quad containing an odd x1
		const int evenBytes = oddX1 - evenX0;     // Size in bytes of the run of whole quads

		const unsigned char maskedS = s & mask;
		const unsigned char invMask = ~mask;
		unsigned int fill = maskedS;
		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);   // Value replicated into all four bytes

		char *buffer = (char*)lockStencil(0, PUBLIC);

		for(int z = 0; z < stencil.depth; z++)
		{
			for(int y = y0; y < y1; y++)
			{
				char *target = buffer + (y & ~1) * width2 + (y & 1) * 2;

				// Whole row pairs can only be block-filled when every bit is overwritten
				if(mask == 0xFF && (y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
				{
					if((x0 & 1) != 0)
					{
						target[oddX0 + 0] = maskedS;
						target[oddX0 + 2] = maskedS;
					}

					memfill4(&target[evenX0], fill, evenBytes);

					if((x1 & 1) != 0)
					{
						target[oddX1 + 0] = maskedS;
						target[oddX1 + 2] = maskedS;
					}

					y++;   // The odd row of this pair has been written as well
				}
				else
				{
					for(int x = x0; x < x1; x++)
					{
						// Read and write the same quad-layout element so that the
						// bits outside the mask are preserved.
						const int i = (x & ~1) * 2 + (x & 1);

						target[i] = maskedS | (target[i] & invMask);
					}
				}
			}

			buffer += stencil.sliceP;
		}

		unlockStencil();
	}

	// Fills a rectangle with a constant color.
	//
	// The internal buffer is written when it is marked dirty, the external buffer
	// otherwise. The buffer is locked at (x0, y0) and 'height' rows of 'width'
	// texels are written starting from the returned address; the rectangle is not
	// clipped here.
	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
	{
		unsigned char *row;
		Buffer *buffer;

		if(internal.dirty)
		{
			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
			buffer = &internal;
		}
		else
		{
			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
			buffer = &external;
		}

		const int bytes = buffer->bytes;

		// Texel sizes that divide 4 can be expanded to a 32-bit pattern and block-filled.
		// 3-byte texels cannot be tiled with a 4-byte pattern, so they take the generic path.
		if(bytes <= 4 && bytes != 3)
		{
			// Zero-initialized: only the texel's own bytes are written by write(), and the
			// replication below ORs the value into the remaining bytes of the pattern.
			unsigned int c = 0;
			buffer->write(&c, color);

			if(bytes <= 1) c = (c << 8)  | c;
			if(bytes <= 2) c = (c << 16) | c;

			for(int y = 0; y < height; y++)
			{
				memfill4(row, c, width * bytes);

				row += buffer->pitchB;
			}
		}
		else   // Generic
		{
			for(int y = 0; y < height; y++)
			{
				unsigned char *element = row;

				for(int x = 0; x < width; x++)
				{
					buffer->write(element, color);

					element += bytes;
				}

				row += buffer->pitchB;
			}
		}

		if(buffer == &internal)
		{
			unlockInternal();
		}
		else
		{
			unlockExternal();
		}
	}

	// Returns the color read from the external buffer at (x, y, z).
	// The external buffer must be locked; this is checked with an ASSERT.
	Color<float> Surface::readExternal(int x, int y, int z) const
	{
		ASSERT(external.lock != LOCK_UNLOCKED);

		return external.read(x, y, z);
	}

	// Returns the color read from the external buffer at (x, y).
	// The external buffer must be locked; this is checked with an ASSERT.
	Color<float> Surface::readExternal(int x, int y) const
	{
		ASSERT(external.lock != LOCK_UNLOCKED);

		return external.read(x, y);
	}

	// Returns the result of sampling the external buffer at the floating-point
	// coordinates (x, y, z), as computed by Buffer::sample().
	// The external buffer must be locked; this is checked with an ASSERT.
	Color<float> Surface::sampleExternal(float x, float y, float z) const
	{
		ASSERT(external.lock != LOCK_UNLOCKED);

		return external.sample(x, y, z);
	}

	// Returns the result of sampling the external buffer at the floating-point
	// coordinates (x, y), as computed by Buffer::sample().
	// The external buffer must be locked; this is checked with an ASSERT.
	Color<float> Surface::sampleExternal(float x, float y) const
	{
		ASSERT(external.lock != LOCK_UNLOCKED);

		return external.sample(x, y);
	}

	// Writes 'color' to the external buffer at (x, y, z).
	// The external buffer must be locked; this is checked with an ASSERT.
	void Surface::writeExternal(int x, int y, int z, const Color<float> &color)
	{
		ASSERT(external.lock != LOCK_UNLOCKED);

		external.write(x, y, z, color);
	}

	// Writes 'color' to the external buffer at (x, y).
	// The external buffer must be locked; this is checked with an ASSERT.
	void Surface::writeExternal(int x, int y, const Color<float> &color)
	{
		ASSERT(external.lock != LOCK_UNLOCKED);

		external.write(x, y, color);
	}

	void Surface::copyInternal(const Surface* source, int x, int y, float srcX, float srcY, bool filter)
	{
		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);

		sw::Color<float> color;

		if(!filter)
		{
			color = source->internal.read((int)srcX, (int)srcY);
		}
		else   // Bilinear filtering
		{
			color = source->internal.sample(srcX, srcY);
		}

		internal.write(x, y, color);
	}

	void Surface::copyInternal(const Surface* source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
	{
		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);

		sw::Color<float> color;

		if(!filter)
		{
			color = source->internal.read((int)srcX, (int)srcY, int(srcZ));
		}
		else   // Bilinear filtering
		{
			color = source->internal.sample(srcX, srcY, srcZ);
		}

		internal.write(x, y, z, color);
	}

	// Returns whether isStencil() reports the external format as a stencil format.
	bool Surface::hasStencil() const
	{
		return isStencil(external.format);
	}
	
	// Returns whether isDepth() reports the external format as a depth format.
	bool Surface::hasDepth() const
	{
		return isDepth(external.format);
	}

	// Returns whether isPalette() reports the external format as a palettized format.
	bool Surface::hasPalette() const
	{
		return isPalette(external.format);
	}

	// Returns the value of the renderTarget flag.
	bool Surface::isRenderTarget() const
	{
		return renderTarget;
	}

	// Returns the value of the dirtyMipmaps flag (reset by cleanMipmaps()).
	bool Surface::hasDirtyMipmaps() const
	{
		return dirtyMipmaps;
	}

	// Resets the dirtyMipmaps flag; hasDirtyMipmaps() returns false afterwards.
	void Surface::cleanMipmaps()
	{
		dirtyMipmaps = false;
	}

	// Returns the Resource pointer held by this surface.
	Resource *Surface::getResource()
	{
		return resource;
	}

	// Returns true when the external and internal buffers agree on format,
	// width, height, depth, row pitch (pitchB) and slice pitch (sliceB).
	bool Surface::identicalFormats() const
	{
		if(external.format != internal.format) return false;
		if(external.width  != internal.width)  return false;
		if(external.height != internal.height) return false;
		if(external.depth  != internal.depth)  return false;
		if(external.pitchB != internal.pitchB) return false;

		return external.sliceB == internal.sliceB;
	}

	// Selects the internal format that corresponds to 'format'.
	//
	// Most formats map to a fixed internal format. A few depend on state:
	//  - FORMAT_A8R8G8B8 and the X8R8G8B8 group map to FORMAT_A8G8R8B8Q /
	//    FORMAT_X8G8R8B8Q only when the surface is not lockable and
	//    quadLayoutEnabled is set; otherwise the plain 8-bit formats are used.
	//  - The D16/D32/D24* group maps to FORMAT_D32FS8_SHADOW when hasParent is set,
	//    else to FORMAT_D32F_COMPLEMENTARY or FORMAT_D32F depending on
	//    complementaryDepthBuffer.
	//
	// A format without a case triggers ASSERT(false) and yields FORMAT_NULL.
	Format Surface::selectInternalFormat(Format format) const
	{
		switch(format)
		{
		case FORMAT_NULL:
			return FORMAT_NULL;
		case FORMAT_P8:
		case FORMAT_A8P8:
		case FORMAT_A4R4G4B4:
		case FORMAT_A1R5G5B5:
		case FORMAT_A8R3G3B2:
			return FORMAT_A8R8G8B8;
		case FORMAT_A8:
			return FORMAT_A8;
		case FORMAT_R8:
			return FORMAT_R8;
		case FORMAT_A2R10G10B10:
		case FORMAT_A2B10G10R10:
		case FORMAT_A16B16G16R16:
			return FORMAT_A16B16G16R16;
		case FORMAT_G8R8:
			return FORMAT_G8R8;
		case FORMAT_G16R16:
			return FORMAT_G16R16;
		case FORMAT_A8R8G8B8:
			if(lockable || !quadLayoutEnabled)
			{
				return FORMAT_A8R8G8B8;
			}
			else
			{
				return FORMAT_A8G8R8B8Q;
			}
		case FORMAT_R5G5B5A1:
		case FORMAT_R4G4B4A4:
		case FORMAT_A8B8G8R8:
			return FORMAT_A8B8G8R8;
		case FORMAT_R5G6B5:
			return FORMAT_R5G6B5;
		case FORMAT_R3G3B2:
		case FORMAT_R8G8B8:
		case FORMAT_X4R4G4B4:
		case FORMAT_X1R5G5B5:
		case FORMAT_X8R8G8B8:
			if(lockable || !quadLayoutEnabled)
			{
				return FORMAT_X8R8G8B8;
			}
			else
			{
				return FORMAT_X8G8R8B8Q;
			}
		case FORMAT_B8G8R8:
		case FORMAT_X8B8G8R8:
			return FORMAT_X8B8G8R8;
		// Compressed formats
		#if S3TC_SUPPORT
		case FORMAT_DXT1:
		case FORMAT_DXT3:
		case FORMAT_DXT5:
		#endif
		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_RGBA8_ETC2_EAC:
		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
			return FORMAT_A8R8G8B8;
		case FORMAT_RGBA_ASTC_4x4_KHR:
		case FORMAT_RGBA_ASTC_5x4_KHR:
		case FORMAT_RGBA_ASTC_5x5_KHR:
		case FORMAT_RGBA_ASTC_6x5_KHR:
		case FORMAT_RGBA_ASTC_6x6_KHR:
		case FORMAT_RGBA_ASTC_8x5_KHR:
		case FORMAT_RGBA_ASTC_8x6_KHR:
		case FORMAT_RGBA_ASTC_8x8_KHR:
		case FORMAT_RGBA_ASTC_10x5_KHR:
		case FORMAT_RGBA_ASTC_10x6_KHR:
		case FORMAT_RGBA_ASTC_10x8_KHR:
		case FORMAT_RGBA_ASTC_10x10_KHR:
		case FORMAT_RGBA_ASTC_12x10_KHR:
		case FORMAT_RGBA_ASTC_12x12_KHR:
			// ASTC supports HDR, so a floating point format is required to represent it properly
			return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
		case FORMAT_ATI1:
		case FORMAT_R11_EAC:
			return FORMAT_R8;
		case FORMAT_SIGNED_R11_EAC:
			return FORMAT_R32F; // FIXME: Signed 8bit format would be sufficient
		case FORMAT_ATI2:
		case FORMAT_RG11_EAC:
			return FORMAT_G8R8;
		case FORMAT_SIGNED_RG11_EAC:
			return FORMAT_G32R32F; // FIXME: Signed 8bit format would be sufficient
		case FORMAT_ETC1:
		case FORMAT_RGB8_ETC2:
		case FORMAT_SRGB8_ETC2:
			return FORMAT_X8R8G8B8;
		// Bumpmap formats
		case FORMAT_V8U8:			return FORMAT_V8U8;
		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
		case FORMAT_V16U16:			return FORMAT_V16U16;
		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
		// Floating-point formats
		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
		case FORMAT_R16F:			return FORMAT_R32F;
		case FORMAT_G16R16F:		return FORMAT_G32R32F;
		case FORMAT_B16G16R16F:     return FORMAT_A32B32G32R32F;
		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
		case FORMAT_R32F:			return FORMAT_R32F;
		case FORMAT_G32R32F:		return FORMAT_G32R32F;
		case FORMAT_B32G32R32F:     return FORMAT_A32B32G32R32F;
		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
		// Luminance formats
		case FORMAT_L8:				return FORMAT_L8;
		case FORMAT_A4L4:			return FORMAT_A8L8;
		case FORMAT_L16:			return FORMAT_L16;
		case FORMAT_A8L8:			return FORMAT_A8L8;
		case FORMAT_L16F:           return FORMAT_A32B32G32R32F;
		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
		case FORMAT_L32F:           return FORMAT_A32B32G32R32F;
		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
		// Depth/stencil formats
		case FORMAT_D16:
		case FORMAT_D32:
		case FORMAT_D24X8:
		case FORMAT_D24S8:
		case FORMAT_D24FS8:
			if(hasParent)   // Texture
			{
				return FORMAT_D32FS8_SHADOW;
			}
			else if(complementaryDepthBuffer)
			{
				return FORMAT_D32F_COMPLEMENTARY;
			}
			else
			{
				return FORMAT_D32F;
			}
		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
		// YV12 formats are kept as they are
		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
		default:
			ASSERT(false);
		}

		// Only reached for a format without a case above
		return FORMAT_NULL;
	}

	// Stores 'palette' in the static Surface::palette pointer and increments the
	// static Surface::paletteID counter. Only the pointer is stored; the palette
	// entries are not copied.
	// NOTE(review): paletteID presumably lets users of the palette detect that it
	// changed - confirm against the code that reads it.
	void Surface::setTexturePalette(unsigned int *palette)
	{
		Surface::palette = palette;
		Surface::paletteID++;
	}

	void Surface::resolve()
	{
		if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
		{
			return;
		}

		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);

		int quality = internal.depth;
		int width = internal.width;
		int height = internal.height;
		int pitch = internal.pitchB;
		int slice = internal.sliceB;

		unsigned char *source0 = (unsigned char*)source;
		unsigned char *source1 = source0 + slice;
		unsigned char *source2 = source1 + slice;
		unsigned char *source3 = source2 + slice;
		unsigned char *source4 = source3 + slice;
		unsigned char *source5 = source4 + slice;
		unsigned char *source6 = source5 + slice;
		unsigned char *source7 = source6 + slice;
		unsigned char *source8 = source7 + slice;
		unsigned char *source9 = source8 + slice;
		unsigned char *sourceA = source9 + slice;
		unsigned char *sourceB = sourceA + slice;
		unsigned char *sourceC = sourceB + slice;
		unsigned char *sourceD = sourceC + slice;
		unsigned char *sourceE = sourceD + slice;
		unsigned char *sourceF = sourceE + slice;

		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 || internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8)
		{
			if(CPUID::supportsSSE2() && (width % 4) == 0)
			{
				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 4)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
							
							c0 = _mm_avg_epu8(c0, c1);

							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 4)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
							
							c0 = _mm_avg_epu8(c0, c1);
							c2 = _mm_avg_epu8(c2, c3);
							c0 = _mm_avg_epu8(c0, c2);

							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 4)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
							
							c0 = _mm_avg_epu8(c0, c1);
							c2 = _mm_avg_epu8(c2, c3);
							c4 = _mm_avg_epu8(c4, c5);
							c6 = _mm_avg_epu8(c6, c7);
							c0 = _mm_avg_epu8(c0, c2);
							c4 = _mm_avg_epu8(c4, c6);
							c0 = _mm_avg_epu8(c0, c4);

							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 4)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));

							c0 = _mm_avg_epu8(c0, c1);
							c2 = _mm_avg_epu8(c2, c3);
							c4 = _mm_avg_epu8(c4, c5);
							c6 = _mm_avg_epu8(c6, c7);
							c8 = _mm_avg_epu8(c8, c9);
							cA = _mm_avg_epu8(cA, cB);
							cC = _mm_avg_epu8(cC, cD);
							cE = _mm_avg_epu8(cE, cF);
							c0 = _mm_avg_epu8(c0, c2);
							c4 = _mm_avg_epu8(c4, c6);
							c8 = _mm_avg_epu8(c8, cA);
							cC = _mm_avg_epu8(cC, cE);
							c0 = _mm_avg_epu8(c0, c4);
							c8 = _mm_avg_epu8(c8, cC);
							c0 = _mm_avg_epu8(c0, c8);

							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);
			}
			else
			{
				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))

				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);

							c0 = AVERAGE(c0, c1);

							*(unsigned int*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);

							c0 = AVERAGE(c0, c1);
							c2 = AVERAGE(c2, c3);
							c0 = AVERAGE(c0, c2);

							*(unsigned int*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);

							c0 = AVERAGE(c0, c1);
							c2 = AVERAGE(c2, c3);
							c4 = AVERAGE(c4, c5);
							c6 = AVERAGE(c6, c7);
							c0 = AVERAGE(c0, c2);
							c4 = AVERAGE(c4, c6);
							c0 = AVERAGE(c0, c4);

							*(unsigned int*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);

							c0 = AVERAGE(c0, c1);
							c2 = AVERAGE(c2, c3);
							c4 = AVERAGE(c4, c5);
							c6 = AVERAGE(c6, c7);
							c8 = AVERAGE(c8, c9);
							cA = AVERAGE(cA, cB);
							cC = AVERAGE(cC, cD);
							cE = AVERAGE(cE, cF);
							c0 = AVERAGE(c0, c2);
							c4 = AVERAGE(c4, c6);
							c8 = AVERAGE(c8, cA);
							cC = AVERAGE(cC, cE);
							c0 = AVERAGE(c0, c4);
							c8 = AVERAGE(c8, cC);
							c0 = AVERAGE(c0, c8);

							*(unsigned int*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);

				#undef AVERAGE
			}
		}
		else if(internal.format == FORMAT_G16R16)
		{
			if(CPUID::supportsSSE2() && (width % 4) == 0)
			{
				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 4)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
							
							c0 = _mm_avg_epu16(c0, c1);

							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 4)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
							
							c0 = _mm_avg_epu16(c0, c1);
							c2 = _mm_avg_epu16(c2, c3);
							c0 = _mm_avg_epu16(c0, c2);

							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 4)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
							
							c0 = _mm_avg_epu16(c0, c1);
							c2 = _mm_avg_epu16(c2, c3);
							c4 = _mm_avg_epu16(c4, c5);
							c6 = _mm_avg_epu16(c6, c7);
							c0 = _mm_avg_epu16(c0, c2);
							c4 = _mm_avg_epu16(c4, c6);
							c0 = _mm_avg_epu16(c0, c4);

							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 4)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));

							c0 = _mm_avg_epu16(c0, c1);
							c2 = _mm_avg_epu16(c2, c3);
							c4 = _mm_avg_epu16(c4, c5);
							c6 = _mm_avg_epu16(c6, c7);
							c8 = _mm_avg_epu16(c8, c9);
							cA = _mm_avg_epu16(cA, cB);
							cC = _mm_avg_epu16(cC, cD);
							cE = _mm_avg_epu16(cE, cF);
							c0 = _mm_avg_epu16(c0, c2);
							c4 = _mm_avg_epu16(c4, c6);
							c8 = _mm_avg_epu16(c8, cA);
							cC = _mm_avg_epu16(cC, cE);
							c0 = _mm_avg_epu16(c0, c4);
							c8 = _mm_avg_epu16(c8, cC);
							c0 = _mm_avg_epu16(c0, c8);

							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);
			}
			else
			{
				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))

				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);

							c0 = AVERAGE(c0, c1);

							*(unsigned int*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);

							c0 = AVERAGE(c0, c1);
							c2 = AVERAGE(c2, c3);
							c0 = AVERAGE(c0, c2);

							*(unsigned int*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);

							c0 = AVERAGE(c0, c1);
							c2 = AVERAGE(c2, c3);
							c4 = AVERAGE(c4, c5);
							c6 = AVERAGE(c6, c7);
							c0 = AVERAGE(c0, c2);
							c4 = AVERAGE(c4, c6);
							c0 = AVERAGE(c0, c4);

							*(unsigned int*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);

							c0 = AVERAGE(c0, c1);
							c2 = AVERAGE(c2, c3);
							c4 = AVERAGE(c4, c5);
							c6 = AVERAGE(c6, c7);
							c8 = AVERAGE(c8, c9);
							cA = AVERAGE(cA, cB);
							cC = AVERAGE(cC, cD);
							cE = AVERAGE(cE, cF);
							c0 = AVERAGE(c0, c2);
							c4 = AVERAGE(c4, c6);
							c8 = AVERAGE(c8, cA);
							cC = AVERAGE(cC, cE);
							c0 = AVERAGE(c0, c4);
							c8 = AVERAGE(c8, cC);
							c0 = AVERAGE(c0, c8);

							*(unsigned int*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);

				#undef AVERAGE
			}
		}
		else if(internal.format == FORMAT_A16B16G16R16)
		{
			if(CPUID::supportsSSE2() && (width % 2) == 0)
			{
				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 2)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
							
							c0 = _mm_avg_epu16(c0, c1);

							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 2)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
							
							c0 = _mm_avg_epu16(c0, c1);
							c2 = _mm_avg_epu16(c2, c3);
							c0 = _mm_avg_epu16(c0, c2);

							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 2)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
							
							c0 = _mm_avg_epu16(c0, c1);
							c2 = _mm_avg_epu16(c2, c3);
							c4 = _mm_avg_epu16(c4, c5);
							c6 = _mm_avg_epu16(c6, c7);
							c0 = _mm_avg_epu16(c0, c2);
							c4 = _mm_avg_epu16(c4, c6);
							c0 = _mm_avg_epu16(c0, c4);

							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 2)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));

							c0 = _mm_avg_epu16(c0, c1);
							c2 = _mm_avg_epu16(c2, c3);
							c4 = _mm_avg_epu16(c4, c5);
							c6 = _mm_avg_epu16(c6, c7);
							c8 = _mm_avg_epu16(c8, c9);
							cA = _mm_avg_epu16(cA, cB);
							cC = _mm_avg_epu16(cC, cD);
							cE = _mm_avg_epu16(cE, cF);
							c0 = _mm_avg_epu16(c0, c2);
							c4 = _mm_avg_epu16(c4, c6);
							c8 = _mm_avg_epu16(c8, cA);
							cC = _mm_avg_epu16(cC, cE);
							c0 = _mm_avg_epu16(c0, c4);
							c8 = _mm_avg_epu16(c8, cC);
							c0 = _mm_avg_epu16(c0, c8);

							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);
			}
			else
			{
				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))

				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 2 * width; x++)
						{
							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);

							c0 = AVERAGE(c0, c1);

							*(unsigned int*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 2 * width; x++)
						{
							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);

							c0 = AVERAGE(c0, c1);
							c2 = AVERAGE(c2, c3);
							c0 = AVERAGE(c0, c2);

							*(unsigned int*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 2 * width; x++)
						{
							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);

							c0 = AVERAGE(c0, c1);
							c2 = AVERAGE(c2, c3);
							c4 = AVERAGE(c4, c5);
							c6 = AVERAGE(c6, c7);
							c0 = AVERAGE(c0, c2);
							c4 = AVERAGE(c4, c6);
							c0 = AVERAGE(c0, c4);

							*(unsigned int*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 2 * width; x++)
						{
							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);

							c0 = AVERAGE(c0, c1);
							c2 = AVERAGE(c2, c3);
							c4 = AVERAGE(c4, c5);
							c6 = AVERAGE(c6, c7);
							c8 = AVERAGE(c8, c9);
							cA = AVERAGE(cA, cB);
							cC = AVERAGE(cC, cD);
							cE = AVERAGE(cE, cF);
							c0 = AVERAGE(c0, c2);
							c4 = AVERAGE(c4, c6);
							c8 = AVERAGE(c8, cA);
							cC = AVERAGE(cC, cE);
							c0 = AVERAGE(c0, c4);
							c8 = AVERAGE(c8, cC);
							c0 = AVERAGE(c0, c8);

							*(unsigned int*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);

				#undef AVERAGE
			}
		}
		else if(internal.format == FORMAT_R32F)
		{
			if(CPUID::supportsSSE() && (width % 4) == 0)
			{
				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 4)
						{
							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
							
							c0 = _mm_add_ps(c0, c1);
							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f)); 

							_mm_store_ps((float*)(source0 + 4 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 4)
						{
							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
							
							c0 = _mm_add_ps(c0, c1);
							c2 = _mm_add_ps(c2, c3);
							c0 = _mm_add_ps(c0, c2);
							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f)); 

							_mm_store_ps((float*)(source0 + 4 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 4)
						{
							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
							
							c0 = _mm_add_ps(c0, c1);
							c2 = _mm_add_ps(c2, c3);
							c4 = _mm_add_ps(c4, c5);
							c6 = _mm_add_ps(c6, c7);
							c0 = _mm_add_ps(c0, c2);
							c4 = _mm_add_ps(c4, c6);
							c0 = _mm_add_ps(c0, c4);
							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f)); 

							_mm_store_ps((float*)(source0 + 4 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 4)
						{
							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
							__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
							__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
							__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
							__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
							__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
							__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
							__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
							__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));

							c0 = _mm_add_ps(c0, c1);
							c2 = _mm_add_ps(c2, c3);
							c4 = _mm_add_ps(c4, c5);
							c6 = _mm_add_ps(c6, c7);
							c8 = _mm_add_ps(c8, c9);
							cA = _mm_add_ps(cA, cB);
							cC = _mm_add_ps(cC, cD);
							cE = _mm_add_ps(cE, cF);
							c0 = _mm_add_ps(c0, c2);
							c4 = _mm_add_ps(c4, c6);
							c8 = _mm_add_ps(c8, cA);
							cC = _mm_add_ps(cC, cE);
							c0 = _mm_add_ps(c0, c4);
							c8 = _mm_add_ps(c8, cC);
							c0 = _mm_add_ps(c0, c8);
							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f)); 

							_mm_store_ps((float*)(source0 + 4 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);
			}
			else
			{
				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);

							c0 = c0 + c1;
							c0 *= 1.0f / 2.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c0 = c0 + c2;
							c0 *= 1.0f / 4.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);
							float c4 = *(float*)(source4 + 4 * x);
							float c5 = *(float*)(source5 + 4 * x);
							float c6 = *(float*)(source6 + 4 * x);
							float c7 = *(float*)(source7 + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c4 = c4 + c5;
							c6 = c6 + c7;
							c0 = c0 + c2;
							c4 = c4 + c6;
							c0 = c0 + c4;
							c0 *= 1.0f / 8.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);
							float c4 = *(float*)(source4 + 4 * x);
							float c5 = *(float*)(source5 + 4 * x);
							float c6 = *(float*)(source6 + 4 * x);
							float c7 = *(float*)(source7 + 4 * x);
							float c8 = *(float*)(source8 + 4 * x);
							float c9 = *(float*)(source9 + 4 * x);
							float cA = *(float*)(sourceA + 4 * x);
							float cB = *(float*)(sourceB + 4 * x);
							float cC = *(float*)(sourceC + 4 * x);
							float cD = *(float*)(sourceD + 4 * x);
							float cE = *(float*)(sourceE + 4 * x);
							float cF = *(float*)(sourceF + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c4 = c4 + c5;
							c6 = c6 + c7;
							c8 = c8 + c9;
							cA = cA + cB;
							cC = cC + cD;
							cE = cE + cF;
							c0 = c0 + c2;
							c4 = c4 + c6;
							c8 = c8 + cA;
							cC = cC + cE;
							c0 = c0 + c4;
							c8 = c8 + cC;
							c0 = c0 + c8;
							c0 *= 1.0f / 16.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);
			}
		}
		else if(internal.format == FORMAT_G32R32F)
		{
			if(CPUID::supportsSSE() && (width % 2) == 0)
			{
				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 2)
						{
							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
							
							c0 = _mm_add_ps(c0, c1);
							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f)); 

							_mm_store_ps((float*)(source0 + 8 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 2)
						{
							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
							
							c0 = _mm_add_ps(c0, c1);
							c2 = _mm_add_ps(c2, c3);
							c0 = _mm_add_ps(c0, c2);
							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f)); 

							_mm_store_ps((float*)(source0 + 8 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 2)
						{
							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
							
							c0 = _mm_add_ps(c0, c1);
							c2 = _mm_add_ps(c2, c3);
							c4 = _mm_add_ps(c4, c5);
							c6 = _mm_add_ps(c6, c7);
							c0 = _mm_add_ps(c0, c2);
							c4 = _mm_add_ps(c4, c6);
							c0 = _mm_add_ps(c0, c4);
							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f)); 

							_mm_store_ps((float*)(source0 + 8 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 2)
						{
							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
							__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
							__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
							__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
							__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
							__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
							__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
							__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
							__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));

							c0 = _mm_add_ps(c0, c1);
							c2 = _mm_add_ps(c2, c3);
							c4 = _mm_add_ps(c4, c5);
							c6 = _mm_add_ps(c6, c7);
							c8 = _mm_add_ps(c8, c9);
							cA = _mm_add_ps(cA, cB);
							cC = _mm_add_ps(cC, cD);
							cE = _mm_add_ps(cE, cF);
							c0 = _mm_add_ps(c0, c2);
							c4 = _mm_add_ps(c4, c6);
							c8 = _mm_add_ps(c8, cA);
							cC = _mm_add_ps(cC, cE);
							c0 = _mm_add_ps(c0, c4);
							c8 = _mm_add_ps(c8, cC);
							c0 = _mm_add_ps(c0, c8);
							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f)); 

							_mm_store_ps((float*)(source0 + 8 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);
			}
			else
			{
				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 2 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);

							c0 = c0 + c1;
							c0 *= 1.0f / 2.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 2 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c0 = c0 + c2;
							c0 *= 1.0f / 4.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 2 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);
							float c4 = *(float*)(source4 + 4 * x);
							float c5 = *(float*)(source5 + 4 * x);
							float c6 = *(float*)(source6 + 4 * x);
							float c7 = *(float*)(source7 + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c4 = c4 + c5;
							c6 = c6 + c7;
							c0 = c0 + c2;
							c4 = c4 + c6;
							c0 = c0 + c4;
							c0 *= 1.0f / 8.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 2 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);
							float c4 = *(float*)(source4 + 4 * x);
							float c5 = *(float*)(source5 + 4 * x);
							float c6 = *(float*)(source6 + 4 * x);
							float c7 = *(float*)(source7 + 4 * x);
							float c8 = *(float*)(source8 + 4 * x);
							float c9 = *(float*)(source9 + 4 * x);
							float cA = *(float*)(sourceA + 4 * x);
							float cB = *(float*)(sourceB + 4 * x);
							float cC = *(float*)(sourceC + 4 * x);
							float cD = *(float*)(sourceD + 4 * x);
							float cE = *(float*)(sourceE + 4 * x);
							float cF = *(float*)(sourceF + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c4 = c4 + c5;
							c6 = c6 + c7;
							c8 = c8 + c9;
							cA = cA + cB;
							cC = cC + cD;
							cE = cE + cF;
							c0 = c0 + c2;
							c4 = c4 + c6;
							c8 = c8 + cA;
							cC = cC + cE;
							c0 = c0 + c4;
							c8 = c8 + cC;
							c0 = c0 + c8;
							c0 *= 1.0f / 16.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);
			}
		}
		else if(internal.format == FORMAT_A32B32G32R32F)
		{
			if(CPUID::supportsSSE())
			{
				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
							
							c0 = _mm_add_ps(c0, c1);
							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f)); 

							_mm_store_ps((float*)(source0 + 16 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
							
							c0 = _mm_add_ps(c0, c1);
							c2 = _mm_add_ps(c2, c3);
							c0 = _mm_add_ps(c0, c2);
							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f)); 

							_mm_store_ps((float*)(source0 + 16 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
							
							c0 = _mm_add_ps(c0, c1);
							c2 = _mm_add_ps(c2, c3);
							c4 = _mm_add_ps(c4, c5);
							c6 = _mm_add_ps(c6, c7);
							c0 = _mm_add_ps(c0, c2);
							c4 = _mm_add_ps(c4, c6);
							c0 = _mm_add_ps(c0, c4);
							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f)); 

							_mm_store_ps((float*)(source0 + 16 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
							__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
							__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
							__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
							__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
							__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
							__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
							__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
							__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));

							c0 = _mm_add_ps(c0, c1);
							c2 = _mm_add_ps(c2, c3);
							c4 = _mm_add_ps(c4, c5);
							c6 = _mm_add_ps(c6, c7);
							c8 = _mm_add_ps(c8, c9);
							cA = _mm_add_ps(cA, cB);
							cC = _mm_add_ps(cC, cD);
							cE = _mm_add_ps(cE, cF);
							c0 = _mm_add_ps(c0, c2);
							c4 = _mm_add_ps(c4, c6);
							c8 = _mm_add_ps(c8, cA);
							cC = _mm_add_ps(cC, cE);
							c0 = _mm_add_ps(c0, c4);
							c8 = _mm_add_ps(c8, cC);
							c0 = _mm_add_ps(c0, c8);
							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f)); 

							_mm_store_ps((float*)(source0 + 16 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);
			}
			else
			{
				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 4 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);

							c0 = c0 + c1;
							c0 *= 1.0f / 2.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 4 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c0 = c0 + c2;
							c0 *= 1.0f / 4.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 4 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);
							float c4 = *(float*)(source4 + 4 * x);
							float c5 = *(float*)(source5 + 4 * x);
							float c6 = *(float*)(source6 + 4 * x);
							float c7 = *(float*)(source7 + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c4 = c4 + c5;
							c6 = c6 + c7;
							c0 = c0 + c2;
							c4 = c4 + c6;
							c0 = c0 + c4;
							c0 *= 1.0f / 8.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 4 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);
							float c4 = *(float*)(source4 + 4 * x);
							float c5 = *(float*)(source5 + 4 * x);
							float c6 = *(float*)(source6 + 4 * x);
							float c7 = *(float*)(source7 + 4 * x);
							float c8 = *(float*)(source8 + 4 * x);
							float c9 = *(float*)(source9 + 4 * x);
							float cA = *(float*)(sourceA + 4 * x);
							float cB = *(float*)(sourceB + 4 * x);
							float cC = *(float*)(sourceC + 4 * x);
							float cD = *(float*)(sourceD + 4 * x);
							float cE = *(float*)(sourceE + 4 * x);
							float cF = *(float*)(sourceF + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c4 = c4 + c5;
							c6 = c6 + c7;
							c8 = c8 + c9;
							cA = cA + cB;
							cC = cC + cD;
							cE = cE + cF;
							c0 = c0 + c2;
							c4 = c4 + c6;
							c8 = c8 + cA;
							cC = cC + cE;
							c0 = c0 + c4;
							c8 = c8 + cC;
							c0 = c0 + c8;
							c0 *= 1.0f / 16.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);
			}
		}
		else if(internal.format == FORMAT_R5G6B5)
		{
			if(CPUID::supportsSSE2() && (width % 8) == 0)
			{
				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 8)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
						
							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));

							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
							c1 = _mm_avg_epu16(c0__g_, c1__g_);
							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
							c0 = _mm_or_si128(c0, c1);

							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 8)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
							
							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));

							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
							c0 = _mm_avg_epu8(c0, c2);
							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
							c1 = _mm_avg_epu16(c0__g_, c1__g_);
							c3 = _mm_avg_epu16(c2__g_, c3__g_);
							c1 = _mm_avg_epu16(c1, c3);
							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
							c0 = _mm_or_si128(c0, c1);

							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x += 8)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
							
							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));

							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
							c0 = _mm_avg_epu8(c0, c2);
							c4 = _mm_avg_epu8(c4, c6);
							c0 = _mm_avg_epu8(c0, c4);
							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
							c1 = _mm_avg_epu16(c0__g_, c1__g_);
							c3 = _mm_avg_epu16(c2__g_, c3__g_);
							c5 = _mm_avg_epu16(c4__g_, c5__g_);
							c7 = _mm_avg_epu16(c6__g_, c7__g_);
							c1 = _mm_avg_epu16(c1, c3);
							c5 = _mm_avg_epu16(c5, c7);
							c1 = _mm_avg_epu16(c1, c5);
							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
							c0 = _mm_or_si128(c0, c1);

							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						// Averages eight R5G6B5 pixels per iteration across the sixteen row
						// pointers source0..sourceF and stores the result back through source0.
						// Aligned 128-bit loads/stores are used, so every row pointer must be
						// 16-byte aligned at each x that is a multiple of 8.
						for(int x = 0; x < width; x += 8)
						{
							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));

							// Split each pixel into red+blue (one field per byte) and green
							// (which straddles the byte boundary of the 16-bit pixel).
							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
							__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
							__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
							__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
							__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
							__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
							__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
							__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
							__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
							__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
							__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
							__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
							__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
							__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
							__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
							__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
							__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));

							// Red and blue each live entirely within one byte, so a byte-wise
							// pairwise average tree (16 -> 8 -> 4 -> 2 -> 1) is correct for them.
							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
							c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
							cA = _mm_avg_epu8(cA_r_b, cB_r_b);
							cC = _mm_avg_epu8(cC_r_b, cD_r_b);
							cE = _mm_avg_epu8(cE_r_b, cF_r_b);
							c0 = _mm_avg_epu8(c0, c2);
							c4 = _mm_avg_epu8(c4, c6);
							c8 = _mm_avg_epu8(c8, cA);
							cC = _mm_avg_epu8(cC, cE);
							c0 = _mm_avg_epu8(c0, c4);
							c8 = _mm_avg_epu8(c8, cC);
							c0 = _mm_avg_epu8(c0, c8);
							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));

							// Green spans both bytes of the 16-bit pixel, so every level of its
							// average tree must operate on 16-bit lanes. Byte-wise averaging
							// would round the high and low halves independently and drop the
							// carry between them (e.g. avg(0x0100, 0x0000) would stay 0x0100).
							c1 = _mm_avg_epu16(c0__g_, c1__g_);
							c3 = _mm_avg_epu16(c2__g_, c3__g_);
							c5 = _mm_avg_epu16(c4__g_, c5__g_);
							c7 = _mm_avg_epu16(c6__g_, c7__g_);
							c9 = _mm_avg_epu16(c8__g_, c9__g_);
							cB = _mm_avg_epu16(cA__g_, cB__g_);
							cD = _mm_avg_epu16(cC__g_, cD__g_);
							cF = _mm_avg_epu16(cE__g_, cF__g_);
							c1 = _mm_avg_epu16(c1, c3);
							c5 = _mm_avg_epu16(c5, c7);
							c9 = _mm_avg_epu16(c9, cB);
							cD = _mm_avg_epu16(cD, cF);
							c1 = _mm_avg_epu16(c1, c5);
							c9 = _mm_avg_epu16(c9, cD);
							c1 = _mm_avg_epu16(c1, c9);
							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));

							// Recombine the channels and write the resolved pixels in place.
							c0 = _mm_or_si128(c0, c1);

							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);
			}
			else
			{
				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))

				if(internal.depth == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);

							c0 = AVERAGE(c0, c1);

							*(unsigned short*)(source0 + 2 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.depth == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);

							c0 = AVERAGE(c0, c1);
							c2 = AVERAGE(c2, c3);
							c0 = AVERAGE(c0, c2);

							*(unsigned short*)(source0 + 2 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.depth == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);

							c0 = AVERAGE(c0, c1);
							c2 = AVERAGE(c2, c3);
							c4 = AVERAGE(c4, c5);
							c6 = AVERAGE(c6, c7);
							c0 = AVERAGE(c0, c2);
							c4 = AVERAGE(c4, c6);
							c0 = AVERAGE(c0, c4);

							*(unsigned short*)(source0 + 2 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.depth == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);

							c0 = AVERAGE(c0, c1);
							c2 = AVERAGE(c2, c3);
							c4 = AVERAGE(c4, c5);
							c6 = AVERAGE(c6, c7);
							c8 = AVERAGE(c8, c9);
							cA = AVERAGE(cA, cB);
							cC = AVERAGE(cC, cD);
							cE = AVERAGE(cE, cF);
							c0 = AVERAGE(c0, c2);
							c4 = AVERAGE(c4, c6);
							c8 = AVERAGE(c8, cA);
							cC = AVERAGE(cC, cE);
							c0 = AVERAGE(c0, c4);
							c8 = AVERAGE(c8, cC);
							c0 = AVERAGE(c0, c8);

							*(unsigned short*)(source0 + 2 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				else ASSERT(false);

				#undef AVERAGE
			}
		}
		else
		{
		//	UNIMPLEMENTED();
		}
	}
}
