Fix ARM compilation.

This does not provide full support for ARM; it merely makes things
compile (statically), by guarding x86-specific intrinsics, inline
assembly, and CPUID feature detection behind architecture checks.

Bug b/37478805

Change-Id: I01d1d84e396c04c84e74d521946595014d2eafb5
Reviewed-on: https://swiftshader-review.googlesource.com/9430
Reviewed-by: Nicolas Capens <capn@google.com>
Tested-by: Nicolas Capens <capn@google.com>
diff --git a/src/Common/CPUID.cpp b/src/Common/CPUID.cpp
index 23d76d6..c080034 100644
--- a/src/Common/CPUID.cpp
+++ b/src/Common/CPUID.cpp
@@ -164,10 +164,17 @@
 
 	static void cpuid(int registers[4], int info)
 	{
-		#if defined(_WIN32)
-			__cpuid(registers, info);
+		#if defined(__i386__) || defined(__x86_64__)
+			#if defined(_WIN32)
+				__cpuid(registers, info);
+			#else
+				__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
+			#endif
 		#else
-			__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
+			registers[0] = 0;
+			registers[1] = 0;
+			registers[2] = 0;
+			registers[3] = 0;
 		#endif
 	}
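
For reference, a minimal standalone sketch (not part of this change) of why
zeroing the outputs is a safe fallback: every feature probe masks a single
bit out of the CPUID result, so all-zero registers make each probe report
"not supported" on ARM. GCC/Clang form shown; Windows uses __cpuid() from
<intrin.h> as in the hunk above.

    static void cpuid(int registers[4], int info)
    {
    #if defined(__i386__) || defined(__x86_64__)
        __asm volatile("cpuid"
                       : "=a"(registers[0]), "=b"(registers[1]),
                         "=c"(registers[2]), "=d"(registers[3])
                       : "a"(info));
    #else
        (void)info;   // silence unused-parameter warnings
        registers[0] = registers[1] = registers[2] = registers[3] = 0;
    #endif
    }

    static bool detectSSE2()
    {
        int registers[4];
        cpuid(registers, 1);
        return (registers[3] & 0x04000000) != 0;   // EDX bit 26; false on ARM
    }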
 
diff --git a/src/Common/CPUID.hpp b/src/Common/CPUID.hpp
index 73c8e94..3c21cd7 100644
--- a/src/Common/CPUID.hpp
+++ b/src/Common/CPUID.hpp
@@ -17,6 +17,10 @@
 
 namespace sw
 {
+	#if !defined(__i386__) && defined(_M_IX86)
+		#define __i386__ 1
+	#endif
+
 	#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
 		#define __x86_64__ 1
 	#endif
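
These blocks normalize the compiler-specific architecture macros: MSVC
predefines _M_IX86/_M_AMD64/_M_X64, while GCC and Clang predefine
__i386__/__x86_64__. Defining the GCC-style names when only the MSVC ones
exist lets every guard in this patch test a single spelling:

    #if !defined(__i386__) && defined(_M_IX86)
        #define __i386__ 1
    #endif
    #if !defined(__x86_64__) && (defined(_M_AMD64) || defined(_M_X64))
        #define __x86_64__ 1
    #endif

    #if defined(__i386__) || defined(__x86_64__)
        // x86-specific path, reachable from either compiler
    #endif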
diff --git a/src/Common/Timer.cpp b/src/Common/Timer.cpp
index bb01cda..8ff2cf3 100644
--- a/src/Common/Timer.cpp
+++ b/src/Common/Timer.cpp
@@ -14,6 +14,14 @@
 
 #include "Timer.hpp"
 
+#if !defined(__i386__) && defined(_M_IX86)
+	#define __i386__ 1
+#endif
+
+#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
+	#define __x86_64__ 1
+#endif
+
 #if defined(_WIN32)
 	#ifndef WIN32_LEAN_AND_MEAN
 		#define WIN32_LEAN_AND_MEAN
@@ -22,7 +30,9 @@
 	#include <intrin.h>
 #else
 	#include <sys/time.h>
-	#include <x86intrin.h>
+	#if defined(__i386__) || defined(__x86_64__)
+		#include <x86intrin.h>
+	#endif
 #endif
 
 namespace sw
@@ -50,10 +60,12 @@
 	{
 		#if defined(_WIN32)
 			return __rdtsc();
-		#else
+		#elif defined(__i386__) || defined(__x86_64__)
 			int64_t tsc;
 			__asm volatile("rdtsc": "=A" (tsc));
 			return tsc;
+		#else
+			return 0;
 		#endif
 	}
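
Returning 0 keeps the tick counter compiling on ARM; since counter() appears
to be used only for relative timing, that is survivable but not meaningful.
A hedged sketch of a portable alternative (an assumption, not what this
change does), using a monotonic clock where no time-stamp counter exists:

    #include <chrono>
    #include <cstdint>
    #if defined(__i386__) || defined(__x86_64__)
        #include <x86intrin.h>   // __rdtsc (GCC/Clang; MSVC: <intrin.h>)
    #endif

    static int64_t counter()
    {
    #if defined(__i386__) || defined(__x86_64__)
        return (int64_t)__rdtsc();   // works for 32- and 64-bit x86 alike
    #else
        using namespace std::chrono;
        return steady_clock::now().time_since_epoch().count();
    #endif
    }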
 
diff --git a/src/OpenGL/libEGL/Display.cpp b/src/OpenGL/libEGL/Display.cpp
index 7d895c3..ff30812 100644
--- a/src/OpenGL/libEGL/Display.cpp
+++ b/src/OpenGL/libEGL/Display.cpp
@@ -84,12 +84,27 @@
 	#endif
 }
 
+#if !defined(__i386__) && defined(_M_IX86)
+	#define __i386__ 1
+#endif
+
+#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
+	#define __x86_64__ 1
+#endif
+
 static void cpuid(int registers[4], int info)
 {
-	#if defined(_WIN32)
-		__cpuid(registers, info);
+	#if defined(__i386__) || defined(__x86_64__)
+		#if defined(_WIN32)
+			__cpuid(registers, info);
+		#else
+			__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
+		#endif
 	#else
-		__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
+		registers[0] = 0;
+		registers[1] = 0;
+		registers[2] = 0;
+		registers[3] = 0;
 	#endif
 }
 
@@ -107,10 +122,12 @@
 		return true;
 	}
 
-	if(!detectSSE())
-	{
-		return false;
-	}
+	#if defined(__i386__) || defined(__x86_64__)
+		if(!detectSSE())
+		{
+			return false;
+		}
+	#endif
 
 	mMinSwapInterval = 0;
 	mMaxSwapInterval = 4;
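
The probe being skipped is a single CPUID bit; given the cpuid() helper
above, such a probe looks like this (the bit assignment is from the x86 ISA;
the exact body of detectSSE() is not shown in this hunk):

    static bool detectSSE()
    {
        int registers[4];
        cpuid(registers, 1);
        return (registers[3] & 0x02000000) != 0;   // EDX bit 25 = SSE
    }

With the zeroed-register fallback, this would always return false on ARM,
so the #if keeps Display initialization from failing outright there.
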
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 1e8a75a..bfcf2ff 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -37,9 +37,12 @@
 #include "Memory.hpp"
 #include "MutexLock.hpp"
 
-#include <xmmintrin.h>
 #include <fstream>
 
+#if defined(__i386__) || defined(__x86_64__)
+#include <xmmintrin.h>
+#endif
+
 #if defined(__x86_64__) && defined(_WIN32)
 extern "C" void X86CompilationCallback()
 {
@@ -5734,16 +5737,16 @@
 
 	RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
 	{
-		if(exactAtPow2)
-		{
-			// rcpss uses a piecewise-linear approximation which minimizes the relative error
-			// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
-			return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
-		}
-		else
-		{
-			return x86::rcpss(x);
-		}
+		#if defined(__i386__) || defined(__x86_64__)
+			if(exactAtPow2)
+			{
+				// rcpss uses a piecewise-linear approximation which minimizes the relative error
+				// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+				return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+			}
+		#endif
+
+		return x86::rcpss(x);
 	}
 
 	RValue<Float> RcpSqrt_pp(RValue<Float> x)
@@ -6114,16 +6117,16 @@
 
 	RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
 	{
-		if(exactAtPow2)
-		{
-			// rcpps uses a piecewise-linear approximation which minimizes the relative error
-			// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
-			return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
-		}
-		else
-		{
-			return x86::rcpps(x);
-		}
+		#if defined(__i386__) || defined(__x86_64__)
+			if(exactAtPow2)
+			{
+				// rcpps uses a piecewise-linear approximation which minimizes the relative error
+				// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+				return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+			}
+		#endif
+
+		return x86::rcpps(x);
 	}
 
 	RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
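
The rectification trick in Rcp_pp is worth spelling out. rcpss/rcpps look up
an approximation of the mantissa's reciprocal, so for x = 2^k the result is
2^-k times the same slightly-off value r = rcp(1.0). Multiplying every
result by 1/r therefore makes powers of two exact while leaving the error
elsewhere essentially unchanged. A standalone sketch, assuming an x86 host:

    #include <xmmintrin.h>
    #include <cstdio>

    int main()
    {
        float r = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f)));   // ~1.0f
        float fixup = 1.0f / r;

        float approx = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(8.0f)));
        printf("rcpss(8) = %a, rectified = %a\n",
               (double)approx, (double)(approx * fixup));
        // on typical hardware the rectified value is exactly 0x1p-3 (0.125f)
        return 0;
    }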
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index d1464a5..ee3fedb 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -66,6 +66,14 @@
 
 namespace
 {
+	#if !defined(__i386__) && defined(_M_IX86)
+		#define __i386__ 1
+	#endif
+
+	#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
+		#define __x86_64__ 1
+	#endif
+
 	class CPUID
 	{
 	public:
@@ -74,18 +82,29 @@
 	private:
 		static void cpuid(int registers[4], int info)
 		{
-			#if defined(_WIN32)
-				__cpuid(registers, info);
+			#if defined(__i386__) || defined(__x86_64__)
+				#if defined(_WIN32)
+					__cpuid(registers, info);
+				#else
+					__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
+				#endif
 			#else
-				__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
+				registers[0] = 0;
+				registers[1] = 0;
+				registers[2] = 0;
+				registers[3] = 0;
 			#endif
 		}
 
 		static bool detectSSE4_1()
 		{
-			int registers[4];
-			cpuid(registers, 1);
-			return (registers[2] & 0x00080000) != 0;
+			#if defined(__i386__) || defined(__x86_64__)
+				int registers[4];
+				cpuid(registers, 1);
+				return (registers[2] & 0x00080000) != 0;
+			#else
+				return false;
+			#endif
 		}
 	};
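
The magic constant in detectSSE4_1() is bit 19 of ECX from CPUID leaf 1,
spelled out (for reference):

    static_assert(0x00080000 == (1 << 19),
                  "SSE4.1 is reported in CPUID.01H:ECX, bit 19");

On ARM the probe now short-circuits to false, so Subzero should never select
an SSE4.1-specific lowering there.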
 
diff --git a/src/Renderer/Surface.cpp b/src/Renderer/Surface.cpp
index 7d75217..4eb77ca 100644
--- a/src/Renderer/Surface.cpp
+++ b/src/Renderer/Surface.cpp
@@ -25,8 +25,10 @@
 #include "Common/Debug.hpp"
 #include "Reactor/Reactor.hpp"
 
-#include <xmmintrin.h>
-#include <emmintrin.h>
+#if defined(__i386__) || defined(__x86_64__)
+	#include <xmmintrin.h>
+	#include <emmintrin.h>
+#endif
 
 #undef min
 #undef max
@@ -3104,33 +3106,35 @@
 			bytes -= 2;
 		}
 
-		if(CPUID::supportsSSE())
-		{
-			while((size_t)buffer & 0xF && bytes >= 4)
+		#if defined(__i386__) || defined(__x86_64__)
+			if(CPUID::supportsSSE())
 			{
-				*(int*)buffer = pattern;
-				(int*&)buffer += 1;
-				bytes -= 4;
+				while((size_t)buffer & 0xF && bytes >= 4)
+				{
+					*(int*)buffer = pattern;
+					(int*&)buffer += 1;
+					bytes -= 4;
+				}
+
+				__m128 quad = _mm_set_ps1((float&)pattern);
+
+				float *pointer = (float*)buffer;
+				int qxwords = bytes / 64;
+				bytes -= qxwords * 64;
+
+				while(qxwords--)
+				{
+					_mm_stream_ps(pointer + 0, quad);
+					_mm_stream_ps(pointer + 4, quad);
+					_mm_stream_ps(pointer + 8, quad);
+					_mm_stream_ps(pointer + 12, quad);
+
+					pointer += 16;
+				}
+
+				buffer = pointer;
 			}
-
-			__m128 quad = _mm_set_ps1((float&)pattern);
-
-			float *pointer = (float*)buffer;
-			int qxwords = bytes / 64;
-			bytes -= qxwords * 64;
-
-			while(qxwords--)
-			{
-				_mm_stream_ps(pointer + 0, quad);
-				_mm_stream_ps(pointer + 4, quad);
-				_mm_stream_ps(pointer + 8, quad);
-				_mm_stream_ps(pointer + 12, quad);
-
-				pointer += 16;
-			}
-
-			buffer = pointer;
-		}
+		#endif
 
 		while(bytes >= 4)
 		{
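
The guarded block above is a classic non-temporal fill: align the pointer
with scalar stores, then write a whole 64-byte cache line per iteration with
streaming stores that bypass the cache. A standalone sketch, assuming SSE
and a destination with at least 4-byte alignment:

    #include <xmmintrin.h>
    #include <cstddef>
    #include <cstdint>

    static void fill32(void *buffer, uint32_t pattern, size_t bytes)
    {
        char *p = (char*)buffer;

        while(((uintptr_t)p & 0xF) && bytes >= 4)   // reach 16-byte alignment
        {
            *(uint32_t*)p = pattern;
            p += 4;
            bytes -= 4;
        }

        __m128 quad = _mm_set_ps1(*(float*)&pattern);   // type-pun, as above

        while(bytes >= 64)   // one cache line per iteration
        {
            _mm_stream_ps((float*)p + 0,  quad);
            _mm_stream_ps((float*)p + 4,  quad);
            _mm_stream_ps((float*)p + 8,  quad);
            _mm_stream_ps((float*)p + 12, quad);
            p += 64;
            bytes -= 64;
        }

        while(bytes >= 4)   // scalar tail
        {
            *(uint32_t*)p = pattern;
            p += 4;
            bytes -= 4;
        }

        _mm_sfence();   // order the streaming stores (the original omits this)
    }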
@@ -3771,149 +3775,151 @@
 		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
 		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
 		{
-			if(CPUID::supportsSSE2() && (width % 4) == 0)
-			{
-				if(internal.depth == 2)
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE2() && (width % 4) == 0)
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
 
-							c0 = _mm_avg_epu8(c0, c1);
+								c0 = _mm_avg_epu8(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
 
-							c0 = _mm_avg_epu8(c0, c1);
-							c2 = _mm_avg_epu8(c2, c3);
-							c0 = _mm_avg_epu8(c0, c2);
+								c0 = _mm_avg_epu8(c0, c1);
+								c2 = _mm_avg_epu8(c2, c3);
+								c0 = _mm_avg_epu8(c0, c2);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
 
-							c0 = _mm_avg_epu8(c0, c1);
-							c2 = _mm_avg_epu8(c2, c3);
-							c4 = _mm_avg_epu8(c4, c5);
-							c6 = _mm_avg_epu8(c6, c7);
-							c0 = _mm_avg_epu8(c0, c2);
-							c4 = _mm_avg_epu8(c4, c6);
-							c0 = _mm_avg_epu8(c0, c4);
+								c0 = _mm_avg_epu8(c0, c1);
+								c2 = _mm_avg_epu8(c2, c3);
+								c4 = _mm_avg_epu8(c4, c5);
+								c6 = _mm_avg_epu8(c6, c7);
+								c0 = _mm_avg_epu8(c0, c2);
+								c4 = _mm_avg_epu8(c4, c6);
+								c0 = _mm_avg_epu8(c0, c4);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
-							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
-							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
-							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
-							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
-							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
-							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
-							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
-							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
+								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
+								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
+								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
+								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
+								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
+								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
+								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
 
-							c0 = _mm_avg_epu8(c0, c1);
-							c2 = _mm_avg_epu8(c2, c3);
-							c4 = _mm_avg_epu8(c4, c5);
-							c6 = _mm_avg_epu8(c6, c7);
-							c8 = _mm_avg_epu8(c8, c9);
-							cA = _mm_avg_epu8(cA, cB);
-							cC = _mm_avg_epu8(cC, cD);
-							cE = _mm_avg_epu8(cE, cF);
-							c0 = _mm_avg_epu8(c0, c2);
-							c4 = _mm_avg_epu8(c4, c6);
-							c8 = _mm_avg_epu8(c8, cA);
-							cC = _mm_avg_epu8(cC, cE);
-							c0 = _mm_avg_epu8(c0, c4);
-							c8 = _mm_avg_epu8(c8, cC);
-							c0 = _mm_avg_epu8(c0, c8);
+								c0 = _mm_avg_epu8(c0, c1);
+								c2 = _mm_avg_epu8(c2, c3);
+								c4 = _mm_avg_epu8(c4, c5);
+								c6 = _mm_avg_epu8(c6, c7);
+								c8 = _mm_avg_epu8(c8, c9);
+								cA = _mm_avg_epu8(cA, cB);
+								cC = _mm_avg_epu8(cC, cD);
+								cE = _mm_avg_epu8(cE, cF);
+								c0 = _mm_avg_epu8(c0, c2);
+								c4 = _mm_avg_epu8(c4, c6);
+								c8 = _mm_avg_epu8(c8, cA);
+								cC = _mm_avg_epu8(cC, cE);
+								c0 = _mm_avg_epu8(c0, c4);
+								c8 = _mm_avg_epu8(c8, cC);
+								c0 = _mm_avg_epu8(c0, c8);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
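
The scalar AVERAGE macro reproduces _mm_avg_epu8 lane by lane. It relies on
x + y == 2*(x & y) + (x ^ y): the common bits are the carry-free half of the
sum, the differing bits are halved with a mask so nothing bleeds across byte
lanes, and the discarded low bit rounds the result up, exactly like the SSE2
instruction. A checkable sketch:

    #include <cassert>
    #include <cstdint>

    // per-byte (x + y + 1) >> 1 on four packed 8-bit lanes
    static uint32_t average(uint32_t x, uint32_t y)
    {
        return (x & y)
             + (((x ^ y) >> 1) & 0x7F7F7F7F)   // halved difference, per lane
             + ((x ^ y) & 0x01010101);         // round up on odd sums
    }

    int main()
    {
        assert(average(0x01030507, 0x02040608) == 0x02040608);
        return 0;
    }

With masks 0x7FFF7FFF and 0x00010001 the same identity averages two 16-bit
lanes, which is the form the later G16R16 and A16B16G16R16 fallbacks use.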
 
@@ -4062,149 +4068,151 @@
 		}
 		else if(internal.format == FORMAT_G16R16)
 		{
-			if(CPUID::supportsSSE2() && (width % 4) == 0)
-			{
-				if(internal.depth == 2)
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE2() && (width % 4) == 0)
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
+								c0 = _mm_avg_epu16(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
-							c2 = _mm_avg_epu16(c2, c3);
-							c0 = _mm_avg_epu16(c0, c2);
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c0 = _mm_avg_epu16(c0, c2);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
-							c2 = _mm_avg_epu16(c2, c3);
-							c4 = _mm_avg_epu16(c4, c5);
-							c6 = _mm_avg_epu16(c6, c7);
-							c0 = _mm_avg_epu16(c0, c2);
-							c4 = _mm_avg_epu16(c4, c6);
-							c0 = _mm_avg_epu16(c0, c4);
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c4 = _mm_avg_epu16(c4, c5);
+								c6 = _mm_avg_epu16(c6, c7);
+								c0 = _mm_avg_epu16(c0, c2);
+								c4 = _mm_avg_epu16(c4, c6);
+								c0 = _mm_avg_epu16(c0, c4);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
-							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
-							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
-							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
-							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
-							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
-							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
-							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
-							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
+								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
+								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
+								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
+								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
+								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
+								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
+								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
-							c2 = _mm_avg_epu16(c2, c3);
-							c4 = _mm_avg_epu16(c4, c5);
-							c6 = _mm_avg_epu16(c6, c7);
-							c8 = _mm_avg_epu16(c8, c9);
-							cA = _mm_avg_epu16(cA, cB);
-							cC = _mm_avg_epu16(cC, cD);
-							cE = _mm_avg_epu16(cE, cF);
-							c0 = _mm_avg_epu16(c0, c2);
-							c4 = _mm_avg_epu16(c4, c6);
-							c8 = _mm_avg_epu16(c8, cA);
-							cC = _mm_avg_epu16(cC, cE);
-							c0 = _mm_avg_epu16(c0, c4);
-							c8 = _mm_avg_epu16(c8, cC);
-							c0 = _mm_avg_epu16(c0, c8);
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c4 = _mm_avg_epu16(c4, c5);
+								c6 = _mm_avg_epu16(c6, c7);
+								c8 = _mm_avg_epu16(c8, c9);
+								cA = _mm_avg_epu16(cA, cB);
+								cC = _mm_avg_epu16(cC, cD);
+								cE = _mm_avg_epu16(cE, cF);
+								c0 = _mm_avg_epu16(c0, c2);
+								c4 = _mm_avg_epu16(c4, c6);
+								c8 = _mm_avg_epu16(c8, cA);
+								cC = _mm_avg_epu16(cC, cE);
+								c0 = _mm_avg_epu16(c0, c4);
+								c8 = _mm_avg_epu16(c8, cC);
+								c0 = _mm_avg_epu16(c0, c8);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
 
@@ -4353,149 +4362,151 @@
 		}
 		else if(internal.format == FORMAT_A16B16G16R16)
 		{
-			if(CPUID::supportsSSE2() && (width % 2) == 0)
-			{
-				if(internal.depth == 2)
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE2() && (width % 2) == 0)
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
+								c0 = _mm_avg_epu16(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
-							c2 = _mm_avg_epu16(c2, c3);
-							c0 = _mm_avg_epu16(c0, c2);
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c0 = _mm_avg_epu16(c0, c2);
 
-							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
-							c2 = _mm_avg_epu16(c2, c3);
-							c4 = _mm_avg_epu16(c4, c5);
-							c6 = _mm_avg_epu16(c6, c7);
-							c0 = _mm_avg_epu16(c0, c2);
-							c4 = _mm_avg_epu16(c4, c6);
-							c0 = _mm_avg_epu16(c0, c4);
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c4 = _mm_avg_epu16(c4, c5);
+								c6 = _mm_avg_epu16(c6, c7);
+								c0 = _mm_avg_epu16(c0, c2);
+								c4 = _mm_avg_epu16(c4, c6);
+								c0 = _mm_avg_epu16(c0, c4);
 
-							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
-							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
-							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
-							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
-							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
-							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
-							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
-							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
-							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
+								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
+								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
+								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
+								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
+								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
+								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
+								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
+								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
-							c2 = _mm_avg_epu16(c2, c3);
-							c4 = _mm_avg_epu16(c4, c5);
-							c6 = _mm_avg_epu16(c6, c7);
-							c8 = _mm_avg_epu16(c8, c9);
-							cA = _mm_avg_epu16(cA, cB);
-							cC = _mm_avg_epu16(cC, cD);
-							cE = _mm_avg_epu16(cE, cF);
-							c0 = _mm_avg_epu16(c0, c2);
-							c4 = _mm_avg_epu16(c4, c6);
-							c8 = _mm_avg_epu16(c8, cA);
-							cC = _mm_avg_epu16(cC, cE);
-							c0 = _mm_avg_epu16(c0, c4);
-							c8 = _mm_avg_epu16(c8, cC);
-							c0 = _mm_avg_epu16(c0, c8);
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c4 = _mm_avg_epu16(c4, c5);
+								c6 = _mm_avg_epu16(c6, c7);
+								c8 = _mm_avg_epu16(c8, c9);
+								cA = _mm_avg_epu16(cA, cB);
+								cC = _mm_avg_epu16(cC, cD);
+								cE = _mm_avg_epu16(cE, cF);
+								c0 = _mm_avg_epu16(c0, c2);
+								c4 = _mm_avg_epu16(c4, c6);
+								c8 = _mm_avg_epu16(c8, cA);
+								cC = _mm_avg_epu16(cC, cE);
+								c0 = _mm_avg_epu16(c0, c4);
+								c8 = _mm_avg_epu16(c8, cC);
+								c0 = _mm_avg_epu16(c0, c8);
 
-							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
 
@@ -4644,153 +4655,155 @@
 		}
 		else if(internal.format == FORMAT_R32F)
 		{
-			if(CPUID::supportsSSE() && (width % 4) == 0)
-			{
-				if(internal.depth == 2)
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE() && (width % 4) == 0)
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
 
-							_mm_store_ps((float*)(source0 + 4 * x), c0);
+								_mm_store_ps((float*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c0 = _mm_add_ps(c0, c2);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c0 = _mm_add_ps(c0, c2);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
 
-							_mm_store_ps((float*)(source0 + 4 * x), c0);
+								_mm_store_ps((float*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
-							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
-							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
-							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
-							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c4 = _mm_add_ps(c4, c5);
-							c6 = _mm_add_ps(c6, c7);
-							c0 = _mm_add_ps(c0, c2);
-							c4 = _mm_add_ps(c4, c6);
-							c0 = _mm_add_ps(c0, c4);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c0 = _mm_add_ps(c0, c4);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
 
-							_mm_store_ps((float*)(source0 + 4 * x), c0);
+								_mm_store_ps((float*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
-							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
-							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
-							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
-							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
-							__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
-							__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
-							__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
-							__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
-							__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
-							__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
-							__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
-							__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
+								__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
+								__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
+								__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
+								__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
+								__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
+								__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
+								__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
+								__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c4 = _mm_add_ps(c4, c5);
-							c6 = _mm_add_ps(c6, c7);
-							c8 = _mm_add_ps(c8, c9);
-							cA = _mm_add_ps(cA, cB);
-							cC = _mm_add_ps(cC, cD);
-							cE = _mm_add_ps(cE, cF);
-							c0 = _mm_add_ps(c0, c2);
-							c4 = _mm_add_ps(c4, c6);
-							c8 = _mm_add_ps(c8, cA);
-							cC = _mm_add_ps(cC, cE);
-							c0 = _mm_add_ps(c0, c4);
-							c8 = _mm_add_ps(c8, cC);
-							c0 = _mm_add_ps(c0, c8);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c8 = _mm_add_ps(c8, c9);
+								cA = _mm_add_ps(cA, cB);
+								cC = _mm_add_ps(cC, cD);
+								cE = _mm_add_ps(cE, cF);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c8 = _mm_add_ps(c8, cA);
+								cC = _mm_add_ps(cC, cE);
+								c0 = _mm_add_ps(c0, c4);
+								c8 = _mm_add_ps(c8, cC);
+								c0 = _mm_add_ps(c0, c8);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
 
-							_mm_store_ps((float*)(source0 + 4 * x), c0);
+								_mm_store_ps((float*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				if(internal.depth == 2)
 				{
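
For the float formats the resolve switches from rounding averages to a
balanced add tree scaled once at the end: summing pairwise keeps the
dependency chains short, and a single multiply by 1/N replaces N-1 dependent
averaging steps. A sketch of one 4-sample step, assuming SSE:

    #include <xmmintrin.h>

    static __m128 resolve4(__m128 s0, __m128 s1, __m128 s2, __m128 s3)
    {
        __m128 a = _mm_add_ps(s0, s1);     // tree level 1
        __m128 b = _mm_add_ps(s2, s3);
        __m128 sum = _mm_add_ps(a, b);     // tree level 2
        return _mm_mul_ps(sum, _mm_set1_ps(1.0f / 4.0f));
    }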
@@ -4939,153 +4952,155 @@
 		}
 		else if(internal.format == FORMAT_G32R32F)
 		{
-			if(CPUID::supportsSSE() && (width % 2) == 0)
-			{
-				if(internal.depth == 2)
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE() && (width % 2) == 0)
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
 
-							_mm_store_ps((float*)(source0 + 8 * x), c0);
+								_mm_store_ps((float*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c0 = _mm_add_ps(c0, c2);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c0 = _mm_add_ps(c0, c2);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
 
-							_mm_store_ps((float*)(source0 + 8 * x), c0);
+								_mm_store_ps((float*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
-							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
-							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
-							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
-							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c4 = _mm_add_ps(c4, c5);
-							c6 = _mm_add_ps(c6, c7);
-							c0 = _mm_add_ps(c0, c2);
-							c4 = _mm_add_ps(c4, c6);
-							c0 = _mm_add_ps(c0, c4);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c0 = _mm_add_ps(c0, c4);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
 
-							_mm_store_ps((float*)(source0 + 8 * x), c0);
+								_mm_store_ps((float*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
-							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
-							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
-							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
-							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
-							__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
-							__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
-							__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
-							__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
-							__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
-							__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
-							__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
-							__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
+								__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
+								__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
+								__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
+								__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
+								__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
+								__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
+								__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
+								__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c4 = _mm_add_ps(c4, c5);
-							c6 = _mm_add_ps(c6, c7);
-							c8 = _mm_add_ps(c8, c9);
-							cA = _mm_add_ps(cA, cB);
-							cC = _mm_add_ps(cC, cD);
-							cE = _mm_add_ps(cE, cF);
-							c0 = _mm_add_ps(c0, c2);
-							c4 = _mm_add_ps(c4, c6);
-							c8 = _mm_add_ps(c8, cA);
-							cC = _mm_add_ps(cC, cE);
-							c0 = _mm_add_ps(c0, c4);
-							c8 = _mm_add_ps(c8, cC);
-							c0 = _mm_add_ps(c0, c8);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c8 = _mm_add_ps(c8, c9);
+								cA = _mm_add_ps(cA, cB);
+								cC = _mm_add_ps(cC, cD);
+								cE = _mm_add_ps(cE, cF);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c8 = _mm_add_ps(c8, cA);
+								cC = _mm_add_ps(cC, cE);
+								c0 = _mm_add_ps(c0, c4);
+								c8 = _mm_add_ps(c8, cC);
+								c0 = _mm_add_ps(c0, c8);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
 
-							_mm_store_ps((float*)(source0 + 8 * x), c0);
+								_mm_store_ps((float*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				if(internal.depth == 2)
 				{
@@ -5234,153 +5249,155 @@
 		}
 		else if(internal.format == FORMAT_A32B32G32R32F || internal.format == FORMAT_X32B32G32R32F)
 		{
-			if(CPUID::supportsSSE())
-			{
-				if(internal.depth == 2)
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE())
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x++)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+							for(int x = 0; x < width; x++)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
 
-							_mm_store_ps((float*)(source0 + 16 * x), c0);
+								_mm_store_ps((float*)(source0 + 16 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x++)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
+							for(int x = 0; x < width; x++)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c0 = _mm_add_ps(c0, c2);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c0 = _mm_add_ps(c0, c2);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
 
-							_mm_store_ps((float*)(source0 + 16 * x), c0);
+								_mm_store_ps((float*)(source0 + 16 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x++)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
-							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
-							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
-							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
-							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
+							for(int x = 0; x < width; x++)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c4 = _mm_add_ps(c4, c5);
-							c6 = _mm_add_ps(c6, c7);
-							c0 = _mm_add_ps(c0, c2);
-							c4 = _mm_add_ps(c4, c6);
-							c0 = _mm_add_ps(c0, c4);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c0 = _mm_add_ps(c0, c4);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
 
-							_mm_store_ps((float*)(source0 + 16 * x), c0);
+								_mm_store_ps((float*)(source0 + 16 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x++)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
-							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
-							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
-							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
-							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
-							__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
-							__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
-							__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
-							__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
-							__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
-							__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
-							__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
-							__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
+							for(int x = 0; x < width; x++)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
+								__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
+								__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
+								__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
+								__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
+								__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
+								__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
+								__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
+								__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c4 = _mm_add_ps(c4, c5);
-							c6 = _mm_add_ps(c6, c7);
-							c8 = _mm_add_ps(c8, c9);
-							cA = _mm_add_ps(cA, cB);
-							cC = _mm_add_ps(cC, cD);
-							cE = _mm_add_ps(cE, cF);
-							c0 = _mm_add_ps(c0, c2);
-							c4 = _mm_add_ps(c4, c6);
-							c8 = _mm_add_ps(c8, cA);
-							cC = _mm_add_ps(cC, cE);
-							c0 = _mm_add_ps(c0, c4);
-							c8 = _mm_add_ps(c8, cC);
-							c0 = _mm_add_ps(c0, c8);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c8 = _mm_add_ps(c8, c9);
+								cA = _mm_add_ps(cA, cB);
+								cC = _mm_add_ps(cC, cD);
+								cE = _mm_add_ps(cE, cF);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c8 = _mm_add_ps(c8, cA);
+								cC = _mm_add_ps(cC, cE);
+								c0 = _mm_add_ps(c0, c4);
+								c8 = _mm_add_ps(c8, cC);
+								c0 = _mm_add_ps(c0, c8);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
 
-							_mm_store_ps((float*)(source0 + 16 * x), c0);
+								_mm_store_ps((float*)(source0 + 16 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				if(internal.depth == 2)
 				{
@@ -5529,259 +5546,261 @@
 		}
 		else if(internal.format == FORMAT_R5G6B5)
 		{
-			if(CPUID::supportsSSE2() && (width % 8) == 0)
-			{
-				if(internal.depth == 2)
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE2() && (width % 8) == 0)
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x += 8)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+							for(int x = 0; x < width; x += 8)
+							{
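+								// R5G6B5 resolve: average red/blue and green separately
+								// (masks 0xF81F and 0x07E0) so the packed byte/word
+								// averages cannot carry across channel boundaries.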
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
 
-							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
-							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
-							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
 
-							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
-							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							c1 = _mm_avg_epu16(c0__g_, c1__g_);
-							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							c0 = _mm_or_si128(c0, c1);
+								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								c1 = _mm_avg_epu16(c0__g_, c1__g_);
+								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								c0 = _mm_or_si128(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x += 8)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+							for(int x = 0; x < width; x += 8)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
 
-							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
-							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
-							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
 
-							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
-							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
-							c0 = _mm_avg_epu8(c0, c2);
-							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							c1 = _mm_avg_epu16(c0__g_, c1__g_);
-							c3 = _mm_avg_epu16(c2__g_, c3__g_);
-							c1 = _mm_avg_epu16(c1, c3);
-							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							c0 = _mm_or_si128(c0, c1);
+								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+								c0 = _mm_avg_epu8(c0, c2);
+								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								c1 = _mm_avg_epu16(c0__g_, c1__g_);
+								c3 = _mm_avg_epu16(c2__g_, c3__g_);
+								c1 = _mm_avg_epu16(c1, c3);
+								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								c0 = _mm_or_si128(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x += 8)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
+							for(int x = 0; x < width; x += 8)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
 
-							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
-							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
-							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
+								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
 
-							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
-							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
-							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
-							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
-							c0 = _mm_avg_epu8(c0, c2);
-							c4 = _mm_avg_epu8(c4, c6);
-							c0 = _mm_avg_epu8(c0, c4);
-							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							c1 = _mm_avg_epu16(c0__g_, c1__g_);
-							c3 = _mm_avg_epu16(c2__g_, c3__g_);
-							c5 = _mm_avg_epu16(c4__g_, c5__g_);
-							c7 = _mm_avg_epu16(c6__g_, c7__g_);
-							c1 = _mm_avg_epu16(c1, c3);
-							c5 = _mm_avg_epu16(c5, c7);
-							c1 = _mm_avg_epu16(c1, c5);
-							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							c0 = _mm_or_si128(c0, c1);
+								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
+								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
+								c0 = _mm_avg_epu8(c0, c2);
+								c4 = _mm_avg_epu8(c4, c6);
+								c0 = _mm_avg_epu8(c0, c4);
+								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								c1 = _mm_avg_epu16(c0__g_, c1__g_);
+								c3 = _mm_avg_epu16(c2__g_, c3__g_);
+								c5 = _mm_avg_epu16(c4__g_, c5__g_);
+								c7 = _mm_avg_epu16(c6__g_, c7__g_);
+								c1 = _mm_avg_epu16(c1, c3);
+								c5 = _mm_avg_epu16(c5, c7);
+								c1 = _mm_avg_epu16(c1, c5);
+								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								c0 = _mm_or_si128(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x += 8)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
-							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
-							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
-							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
-							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
-							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
-							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
-							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
-							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
+							for(int x = 0; x < width; x += 8)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
+								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
+								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
+								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
+								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
+								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
+								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
+								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
+								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
 
-							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
-							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
-							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
-							__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
-							__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
-							__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
-							__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
-							__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
-							__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
-							__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
-							__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
-							__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
-							__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
-							__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
-							__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
+								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
 
-							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
-							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
-							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
-							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
-							c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
-							cA = _mm_avg_epu8(cA_r_b, cB_r_b);
-							cC = _mm_avg_epu8(cC_r_b, cD_r_b);
-							cE = _mm_avg_epu8(cE_r_b, cF_r_b);
-							c0 = _mm_avg_epu8(c0, c2);
-							c4 = _mm_avg_epu8(c4, c6);
-							c8 = _mm_avg_epu8(c8, cA);
-							cC = _mm_avg_epu8(cC, cE);
-							c0 = _mm_avg_epu8(c0, c4);
-							c8 = _mm_avg_epu8(c8, cC);
-							c0 = _mm_avg_epu8(c0, c8);
-							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							c1 = _mm_avg_epu16(c0__g_, c1__g_);
-							c3 = _mm_avg_epu16(c2__g_, c3__g_);
-							c5 = _mm_avg_epu16(c4__g_, c5__g_);
-							c7 = _mm_avg_epu16(c6__g_, c7__g_);
-							c9 = _mm_avg_epu16(c8__g_, c9__g_);
-							cB = _mm_avg_epu16(cA__g_, cB__g_);
-							cD = _mm_avg_epu16(cC__g_, cD__g_);
-							cF = _mm_avg_epu16(cE__g_, cF__g_);
-							c1 = _mm_avg_epu8(c1, c3);
-							c5 = _mm_avg_epu8(c5, c7);
-							c9 = _mm_avg_epu8(c9, cB);
-							cD = _mm_avg_epu8(cD, cF);
-							c1 = _mm_avg_epu8(c1, c5);
-							c9 = _mm_avg_epu8(c9, cD);
-							c1 = _mm_avg_epu8(c1, c9);
-							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							c0 = _mm_or_si128(c0, c1);
+								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
+								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
+								c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
+								cA = _mm_avg_epu8(cA_r_b, cB_r_b);
+								cC = _mm_avg_epu8(cC_r_b, cD_r_b);
+								cE = _mm_avg_epu8(cE_r_b, cF_r_b);
+								c0 = _mm_avg_epu8(c0, c2);
+								c4 = _mm_avg_epu8(c4, c6);
+								c8 = _mm_avg_epu8(c8, cA);
+								cC = _mm_avg_epu8(cC, cE);
+								c0 = _mm_avg_epu8(c0, c4);
+								c8 = _mm_avg_epu8(c8, cC);
+								c0 = _mm_avg_epu8(c0, c8);
+								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								c1 = _mm_avg_epu16(c0__g_, c1__g_);
+								c3 = _mm_avg_epu16(c2__g_, c3__g_);
+								c5 = _mm_avg_epu16(c4__g_, c5__g_);
+								c7 = _mm_avg_epu16(c6__g_, c7__g_);
+								c9 = _mm_avg_epu16(c8__g_, c9__g_);
+								cB = _mm_avg_epu16(cA__g_, cB__g_);
+								cD = _mm_avg_epu16(cC__g_, cD__g_);
+								cF = _mm_avg_epu16(cE__g_, cF__g_);
+								c1 = _mm_avg_epu8(c1, c3);
+								c5 = _mm_avg_epu8(c5, c7);
+								c9 = _mm_avg_epu8(c9, cB);
+								cD = _mm_avg_epu8(cD, cF);
+								c1 = _mm_avg_epu8(c1, c5);
+								c9 = _mm_avg_epu8(c9, cD);
+								c1 = _mm_avg_epu8(c1, c9);
+								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								c0 = _mm_or_si128(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
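 				// Rounded per-channel average of two packed RGB565 pixels:
 				// (x & y) keeps the common bits, ((x ^ y) >> 1) & 0x7BEF adds
 				// half of the differing bits while masking each channel's top
 				// bit so the neighboring channel's LSB cannot leak in, and
 				// (x ^ y) & 0x0821 rounds a channel up when its low bits differ.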
 
diff --git a/third_party/LLVM/lib/Support/Memory.cpp b/third_party/LLVM/lib/Support/Memory.cpp
index 2a1642a..93ce4e6 100644
--- a/third_party/LLVM/lib/Support/Memory.cpp
+++ b/third_party/LLVM/lib/Support/Memory.cpp
@@ -69,7 +69,7 @@
   // FIXME: Can we safely always call this for __GNUC__ everywhere?
   char *Start = (char*) Addr;
   char *End = Start + Len;
-  __clear_cache(Start, End);
+  __builtin___clear_cache(Start, End);
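+  // The builtin needs no declaration and is recognized by both GCC and
+  // Clang; plain __clear_cache is a libgcc symbol not declared here.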
 #  elif defined(__mips__)
   cacheflush((char*)Addr, Len, BCACHE);
 #  endif
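
Note on the Memory.cpp hunk: the flush matters on ARM because the
instruction and data caches are not coherent there, so JIT-compiled code
written through the data cache must be synchronized before it can be
executed (on x86 this is effectively a no-op). A minimal sketch of the
pattern, assuming a POSIX system; run() is a hypothetical helper and
error handling is omitted:

    #include <cstddef>
    #include <cstring>
    #include <sys/mman.h>

    // Copies pre-assembled machine code into an executable page and runs it.
    // 'code' is assumed to contain valid instructions for the host CPU.
    void run(const void *code, std::size_t size)
    {
        void *page = mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        std::memcpy(page, code, size);                             // written through the D-cache
        __builtin___clear_cache((char*)page, (char*)page + size);  // make it visible to the I-cache
        ((void (*)())page)();                                      // now safe to execute on ARM too
        munmap(page, size);
    }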