Refactor vector packing.

x86 vector packing instructions always treat the input as having signed
integer components, but can perform signed or unsigned saturation on
the output. In Reactor the Pack() intrinsic had overloads which chose
the saturation mode based on the signedness of the input type, which is
confusing. Replace them with explicit PackSigned() and PackUnsigned()
intrinsics that both take signed inputs.

Also simplify emulation of saturating add/subtract.

Bug b/37496082

Change-Id: I0625fff429ffb40f42baf9600c7760d9858b5d89
Reviewed-on: https://swiftshader-review.googlesource.com/12548
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Main/FrameBuffer.cpp b/src/Main/FrameBuffer.cpp
index e95f766..d3e6383 100644
--- a/src/Main/FrameBuffer.cpp
+++ b/src/Main/FrameBuffer.cpp
@@ -253,10 +253,10 @@
 						case FORMAT_A16B16G16R16:
 							For(, x < width - 1, x += 2)
 							{
-								UShort4 c0 = As<UShort4>(Swizzle(*Pointer<Short4>(s + 0), 0xC6)) >> 8;
-								UShort4 c1 = As<UShort4>(Swizzle(*Pointer<Short4>(s + 8), 0xC6)) >> 8;
+								Short4 c0 = As<UShort4>(Swizzle(*Pointer<Short4>(s + 0), 0xC6)) >> 8;
+								Short4 c1 = As<UShort4>(Swizzle(*Pointer<Short4>(s + 8), 0xC6)) >> 8;
 
-								*Pointer<Int2>(d) = As<Int2>(Pack(c0, c1));
+								*Pointer<Int2>(d) = As<Int2>(PackUnsigned(c0, c1));
 
 								s += 2 * sBytes;
 								d += 2 * dBytes;
@@ -300,9 +300,9 @@
 								break;
 							case FORMAT_A16B16G16R16:
 								{
-									UShort4 c = As<UShort4>(Swizzle(*Pointer<Short4>(s), 0xC6)) >> 8;
+									Short4 c = As<UShort4>(Swizzle(*Pointer<Short4>(s), 0xC6)) >> 8;
 
-									*Pointer<Int>(d) = Int(As<Int2>(Pack(c, c)));
+									*Pointer<Int>(d) = Int(As<Int2>(PackUnsigned(c, c)));
 								}
 								break;
 							case FORMAT_R5G6B5:
@@ -361,10 +361,10 @@
 						case FORMAT_A16B16G16R16:
 							For(, x < width - 1, x += 2)
 							{
-								UShort4 c0 = *Pointer<UShort4>(s + 0) >> 8;
-								UShort4 c1 = *Pointer<UShort4>(s + 8) >> 8;
+								Short4 c0 = *Pointer<UShort4>(s + 0) >> 8;
+								Short4 c1 = *Pointer<UShort4>(s + 8) >> 8;
 
-								*Pointer<Int2>(d) = As<Int2>(Pack(c0, c1));
+								*Pointer<Int2>(d) = As<Int2>(PackUnsigned(c0, c1));
 
 								s += 2 * sBytes;
 								d += 2 * dBytes;
@@ -408,9 +408,9 @@
 								break;
 							case FORMAT_A16B16G16R16:
 								{
-									UShort4 c = *Pointer<UShort4>(s) >> 8;
+									Short4 c = *Pointer<UShort4>(s) >> 8;
 
-									*Pointer<Int>(d) = Int(As<Int2>(Pack(c, c)));
+									*Pointer<Int>(d) = Int(As<Int2>(PackUnsigned(c, c)));
 								}
 								break;
 							case FORMAT_R5G6B5:
@@ -503,8 +503,8 @@
 								break;
 							case FORMAT_A16B16G16R16:
 								{
-									UShort4 cc = *Pointer<UShort4>(s) >> 8;
-									Int c = Int(As<Int2>(Pack(cc, cc)));
+									Short4 cc = *Pointer<UShort4>(s) >> 8;
+									Int c = Int(As<Int2>(PackUnsigned(cc, cc)));
 
 									*Pointer<Short>(d) = Short((c & 0x00F80000) >> 19 |
 									                           (c & 0x0000FC00) >> 5 |
@@ -615,7 +615,7 @@
 		{
 		case FORMAT_X8R8G8B8:
 		case FORMAT_A8R8G8B8:
-			*Pointer<Byte4>(d) = Byte4(Pack(As<UShort4>(c1), As<UShort4>(c1)));
+			*Pointer<Byte4>(d) = Byte4(PackUnsigned(c1, c1));
 			break;
 		case FORMAT_X8B8G8R8:
 		case FORMAT_A8B8G8R8:
@@ -624,12 +624,12 @@
 			{
 				c1 = Swizzle(c1, 0xC6);
 
-				*Pointer<Byte4>(d) = Byte4(Pack(As<UShort4>(c1), As<UShort4>(c1)));
+				*Pointer<Byte4>(d) = Byte4(PackUnsigned(c1, c1));
 			}
 			break;
 		case FORMAT_R8G8B8:
 			{
-				Int c = Int(As<Int2>(Pack(As<UShort4>(c1), As<UShort4>(c1))));
+				Int c = Int(As<Int2>(PackUnsigned(c1, c1)));
 
 				*Pointer<Byte>(d + 0) = Byte(c >> 0);
 				*Pointer<Byte>(d + 1) = Byte(c >> 8);
@@ -638,7 +638,7 @@
 			break;
 		case FORMAT_R5G6B5:
 			{
-				Int c = Int(As<Int2>(Pack(As<UShort4>(c1), As<UShort4>(c1))));
+				Int c = Int(As<Int2>(PackUnsigned(c1, c1)));
 
 				*Pointer<Short>(d) = Short((c & 0x00F80000) >> 8 |
 				                           (c & 0x0000FC00) >> 5 |
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 8abb17a..59e7e09 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -2791,7 +2791,7 @@
 	RValue<Short4> RoundShort4(RValue<Float4> cast)
 	{
 		RValue<Int4> int4 = RoundInt(cast);
-		return As<Short4>(Pack(int4, int4));
+		return As<Short4>(PackSigned(int4, int4));
 	}
 
 	RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
@@ -2824,13 +2824,20 @@
 		return x86::pmaddwd(x, y);
 	}
 
-	RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y)
+	RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
 	{
 		auto result = x86::packsswb(x, y);
 
 		return As<SByte8>(Swizzle(As<Int4>(result), 0x88));
 	}
 
+	RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
+	{
+		auto result = x86::packuswb(x, y);
+
+		return As<Byte8>(Swizzle(As<Int4>(result), 0x88));
+	}
+
 	RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
 	{
 		int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
@@ -2899,7 +2906,7 @@
 			if(CPUID::supportsSSE4_1())
 			{
 				Int4 int4(Min(cast, Float4(0xFFFF)));   // packusdw takes care of 0x0000 saturation
-				*this = As<Short4>(Pack(As<UInt4>(int4), As<UInt4>(int4)));
+				*this = As<Short4>(PackUnsigned(int4, int4));
 			}
 			else
 			{
@@ -3093,13 +3100,6 @@
 		return x86::pavgw(x, y);
 	}
 
-	RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		auto result = x86::packuswb(x, y);
-
-		return As<Byte8>(Swizzle(As<Int4>(result), 0x88));
-	}
-
 	Type *UShort4::getType()
 	{
 		return T(Type_v4i16);
@@ -4846,11 +4846,16 @@
 		return x86::cvtps2dq(cast);
 	}
 
-	RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y)
+	RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
 	{
 		return x86::packssdw(x, y);
 	}
 
+	RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
+	{
+		return x86::packusdw(x, y);
+	}
+
 	RValue<Int> Extract(RValue<Int4> x, int i)
 	{
 		return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
@@ -5180,11 +5185,6 @@
 		}
 	}
 
-	RValue<UShort8> Pack(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		return x86::packusdw(As<Int4>(x), As<Int4>(y));
-	}
-
 	Type *UInt4::getType()
 	{
 		return T(llvm::VectorType::get(T(UInt::getType()), 4));
@@ -6205,7 +6205,7 @@
 			return As<SByte8>(V(::builder->CreateCall2(packsswb, x.value, y.value)));
 		}
 
-		RValue<Byte8> packuswb(RValue<UShort4> x, RValue<UShort4> y)
+		RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
 		{
 			llvm::Function *packuswb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packuswb_128);
 
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 3f9fb3d..bd2ce7f 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -797,7 +797,8 @@
 	RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y);
 	RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y);
 	RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y);
-	RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y);
+	RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y);
+	RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y);
 	RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y);
 	RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y);
 	RValue<Short4> Swizzle(RValue<Short4> x, unsigned char select);
@@ -866,7 +867,6 @@
 	RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y);
 	RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y);
 	RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y);
-	RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y);
 
 	class Short8 : public LValue<Short8>
 	{
@@ -1831,7 +1831,8 @@
 	RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y);
 	RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y);
 	RValue<Int4> RoundInt(RValue<Float4> cast);
-	RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y);
+	RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y);
+	RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y);
 	RValue<Int> Extract(RValue<Int4> val, int i);
 	RValue<Int4> Insert(RValue<Int4> val, RValue<Int> element, int i);
 	RValue<Int> SignMask(RValue<Int4> x);
@@ -1911,7 +1912,6 @@
 	RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y);
 	RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y);
 //	RValue<UInt4> RoundInt(RValue<Float4> cast);
-	RValue<UShort8> Pack(RValue<UInt4> x, RValue<UInt4> y);
 
 	class Float : public LValue<Float>
 	{
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index a770981..d67b182 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -2716,7 +2716,7 @@
 		return RValue<Byte8>(Nucleus::createInsertElement(val.value, element.value, i));
 	}
 
-	RValue<Byte> Saturate(RValue<UShort> x)
+	RValue<Byte> SaturateUnsigned(RValue<Short> x)
 	{
-		return Byte(IfThenElse(Int(x) > 0xFF, Int(0xFF), Int(x)));
+		return Byte(IfThenElse(Int(x) > 0xFF, Int(0xFF), IfThenElse(Int(x) < 0, Int(0), Int(x))));   // Input is signed now: clamp negative values to 0 as well as values above 0xFF
 	}
@@ -2726,14 +2726,14 @@
 		if(emulateIntrinsics)
 		{
 			Byte8 result;
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 0))) + UShort(Int(Extract(y, 0)))), 0);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 1))) + UShort(Int(Extract(y, 1)))), 1);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 2))) + UShort(Int(Extract(y, 2)))), 2);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 3))) + UShort(Int(Extract(y, 3)))), 3);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 4))) + UShort(Int(Extract(y, 4)))), 4);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 5))) + UShort(Int(Extract(y, 5)))), 5);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 6))) + UShort(Int(Extract(y, 6)))), 6);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 7))) + UShort(Int(Extract(y, 7)))), 7);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
 
 			return result;
 		}
@@ -2756,14 +2756,14 @@
 		if(emulateIntrinsics)
 		{
 			Byte8 result;
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 0))) - UShort(Int(Extract(y, 0)))), 0);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 1))) - UShort(Int(Extract(y, 1)))), 1);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 2))) - UShort(Int(Extract(y, 2)))), 2);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 3))) - UShort(Int(Extract(y, 3)))), 3);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 4))) - UShort(Int(Extract(y, 4)))), 4);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 5))) - UShort(Int(Extract(y, 5)))), 5);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 6))) - UShort(Int(Extract(y, 6)))), 6);
-			result = Insert(result, Saturate(UShort(Int(Extract(x, 7))) - UShort(Int(Extract(y, 7)))), 7);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
+			result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
 
 			return result;
 		}
@@ -3043,7 +3043,7 @@
 		return RValue<SByte8>(Nucleus::createNot(val.value));
 	}
 
-	RValue<SByte> Saturate(RValue<Short> x)
+	RValue<SByte> SaturateSigned(RValue<Short> x)
 	{
 		return SByte(IfThenElse(Int(x) > 0x7F, Int(0x7F), IfThenElse(Int(x) < -0x80, Int(0x80), Int(x))));
 	}
@@ -3053,14 +3053,14 @@
 		if(emulateIntrinsics)
 		{
 			SByte8 result;
-			result = Insert(result, Saturate(Short(Int(Extract(x, 0))) + Short(Int(Extract(y, 0)))), 0);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 1))) + Short(Int(Extract(y, 1)))), 1);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 2))) + Short(Int(Extract(y, 2)))), 2);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 3))) + Short(Int(Extract(y, 3)))), 3);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 4))) + Short(Int(Extract(y, 4)))), 4);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 5))) + Short(Int(Extract(y, 5)))), 5);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 6))) + Short(Int(Extract(y, 6)))), 6);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 7))) + Short(Int(Extract(y, 7)))), 7);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
 
 			return result;
 		}
@@ -3083,14 +3083,14 @@
 		if(emulateIntrinsics)
 		{
 			SByte8 result;
-			result = Insert(result, Saturate(Short(Int(Extract(x, 0))) - Short(Int(Extract(y, 0)))), 0);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 1))) - Short(Int(Extract(y, 1)))), 1);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 2))) - Short(Int(Extract(y, 2)))), 2);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 3))) - Short(Int(Extract(y, 3)))), 3);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 4))) - Short(Int(Extract(y, 4)))), 4);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 5))) - Short(Int(Extract(y, 5)))), 5);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 6))) - Short(Int(Extract(y, 6)))), 6);
-			result = Insert(result, Saturate(Short(Int(Extract(x, 7))) - Short(Int(Extract(y, 7)))), 7);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
+			result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
 
 			return result;
 		}
@@ -3491,7 +3491,7 @@
 	RValue<Short4> RoundShort4(RValue<Float4> cast)
 	{
 		RValue<Int4> int4 = RoundInt(cast);
-		return As<Short4>(Pack(int4, int4));
+		return As<Short4>(PackSigned(int4, int4));
 	}
 
 	RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
@@ -3520,7 +3520,7 @@
 		return RValue<Short4>(V(result));
 	}
 
-	RValue<Short> Saturate(RValue<Int> x)
+	RValue<Short> SaturateSigned(RValue<Int> x)
 	{
 		return Short(IfThenElse(x > 0x7FFF, Int(0x7FFF), IfThenElse(x < -0x8000, Int(0x8000), x)));
 	}
@@ -3530,10 +3530,10 @@
 		if(emulateIntrinsics)
 		{
 			Short4 result;
-			result = Insert(result, Saturate(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
-			result = Insert(result, Saturate(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
-			result = Insert(result, Saturate(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
-			result = Insert(result, Saturate(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
+			result = Insert(result, SaturateSigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
+			result = Insert(result, SaturateSigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
+			result = Insert(result, SaturateSigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
+			result = Insert(result, SaturateSigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
 
 			return result;
 		}
@@ -3556,10 +3556,10 @@
 		if(emulateIntrinsics)
 		{
 			Short4 result;
-			result = Insert(result, Saturate(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
-			result = Insert(result, Saturate(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
-			result = Insert(result, Saturate(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
-			result = Insert(result, Saturate(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
+			result = Insert(result, SaturateSigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
+			result = Insert(result, SaturateSigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
+			result = Insert(result, SaturateSigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
+			result = Insert(result, SaturateSigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
 
 			return result;
 		}
@@ -3627,19 +3627,19 @@
 		}
 	}
 
-	RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y)
+	RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
 	{
 		if(emulateIntrinsics)
 		{
 			SByte8 result;
-			result = Insert(result, Saturate(Extract(x, 0)), 0);
-			result = Insert(result, Saturate(Extract(x, 1)), 1);
-			result = Insert(result, Saturate(Extract(x, 2)), 2);
-			result = Insert(result, Saturate(Extract(x, 3)), 3);
-			result = Insert(result, Saturate(Extract(y, 0)), 4);
-			result = Insert(result, Saturate(Extract(y, 1)), 5);
-			result = Insert(result, Saturate(Extract(y, 2)), 6);
-			result = Insert(result, Saturate(Extract(y, 3)), 7);
+			result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
+			result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
+			result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
+			result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
+			result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
+			result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
+			result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
+			result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
 
 			return result;
 		}
@@ -3657,6 +3657,36 @@
 		}
 	}
 
+	RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
+	{
+		if(emulateIntrinsics)
+		{
+			Byte8 result;
+			result = Insert(result, SaturateUnsigned(Extract(x, 0)), 0);
+			result = Insert(result, SaturateUnsigned(Extract(x, 1)), 1);
+			result = Insert(result, SaturateUnsigned(Extract(x, 2)), 2);
+			result = Insert(result, SaturateUnsigned(Extract(x, 3)), 3);
+			result = Insert(result, SaturateUnsigned(Extract(y, 0)), 4);
+			result = Insert(result, SaturateUnsigned(Extract(y, 1)), 5);
+			result = Insert(result, SaturateUnsigned(Extract(y, 2)), 6);
+			result = Insert(result, SaturateUnsigned(Extract(y, 3)), 7);
+
+			return result;
+		}
+		else
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			pack->addArg(x.value);
+			pack->addArg(y.value);
+			::basicBlock->appendInst(pack);
+
+			return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x88));
+		}
+	}
+
 	RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
 	{
 		int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
@@ -3725,7 +3755,7 @@
 			if(CPUID::SSE4_1)
 			{
 				Int4 int4(Min(cast, Float4(0xFFFF)));   // packusdw takes care of 0x0000 saturation
-				*this = As<Short4>(Pack(As<UInt4>(int4), As<UInt4>(int4)));
+				*this = As<UShort4>(PackUnsigned(int4, int4));
 			}
 			else
 			{
@@ -4035,36 +4065,6 @@
 		assert(false && "UNIMPLEMENTED"); return RValue<UShort4>(V(nullptr));
 	}
 
-	RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		if(emulateIntrinsics)
-		{
-			Byte8 result;
-			result = Insert(result, Saturate(Extract(x, 0)), 0);
-			result = Insert(result, Saturate(Extract(x, 1)), 1);
-			result = Insert(result, Saturate(Extract(x, 2)), 2);
-			result = Insert(result, Saturate(Extract(x, 3)), 3);
-			result = Insert(result, Saturate(Extract(y, 0)), 4);
-			result = Insert(result, Saturate(Extract(y, 1)), 5);
-			result = Insert(result, Saturate(Extract(y, 2)), 6);
-			result = Insert(result, Saturate(Extract(y, 3)), 7);
-
-			return result;
-		}
-		else
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pack->addArg(x.value);
-			pack->addArg(y.value);
-			::basicBlock->appendInst(pack);
-
-			return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x88));
-		}
-	}
-
 	Type *UShort4::getType()
 	{
 		return T(Type_v4i16);
@@ -5931,19 +5931,19 @@
 		}
 	}
 
-	RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y)
+	RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
 	{
 		if(emulateIntrinsics)
 		{
 			Short8 result;
-			result = Insert(result, Saturate(Extract(x, 0)), 0);
-			result = Insert(result, Saturate(Extract(x, 1)), 1);
-			result = Insert(result, Saturate(Extract(x, 2)), 2);
-			result = Insert(result, Saturate(Extract(x, 3)), 3);
-			result = Insert(result, Saturate(Extract(y, 0)), 4);
-			result = Insert(result, Saturate(Extract(y, 1)), 5);
-			result = Insert(result, Saturate(Extract(y, 2)), 6);
-			result = Insert(result, Saturate(Extract(y, 3)), 7);
+			result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
+			result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
+			result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
+			result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
+			result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
+			result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
+			result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
+			result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
 
 			return result;
 		}
@@ -5961,6 +5961,32 @@
 		}
 	}
 
+	RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)   // Packs signed 32-bit lanes of x and y into 16-bit lanes with unsigned saturation
+	{
+		if(CPUID::SSE4_1)
+		{
+			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+			auto target = ::context->getConstantUndef(Ice::IceType_i32);
+			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+			pack->addArg(x.value);
+			pack->addArg(y.value);
+			::basicBlock->appendInst(pack);
+
+			return RValue<UShort8>(V(result));
+		}
+		else   // No SSE4.1: build the unsigned pack out of the signed one
+		{
+			RValue<Int4> sx = As<Int4>(x);
+			RValue<Int4> bx = (sx & ~(sx >> 31)) - Int4(0x8000);   // Zero out negative lanes, then bias so [0, 0xFFFF] maps onto the signed 16-bit range
+
+			RValue<Int4> sy = As<Int4>(y);
+			RValue<Int4> by = (sy & ~(sy >> 31)) - Int4(0x8000);
+
+			return As<UShort8>(PackSigned(bx, by) + Short8(0x8000u));   // Signed pack, then undo the bias; calling PackUnsigned() here would recurse endlessly
+		}
+	}
+
 	RValue<Int> Extract(RValue<Int4> x, int i)
 	{
 		return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
@@ -6329,32 +6355,6 @@
 		return RValue<UInt4>(V(result));
 	}
 
-	RValue<UShort8> Pack(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		if(CPUID::SSE4_1)
-		{
-			Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
-			const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
-			auto target = ::context->getConstantUndef(Ice::IceType_i32);
-			auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
-			pack->addArg(x.value);
-			pack->addArg(y.value);
-			::basicBlock->appendInst(pack);
-
-			return RValue<UShort8>(V(result));
-		}
-		else
-		{
-			RValue<Int4> sx = As<Int4>(x);
-			RValue<Int4> bx = (sx & ~(sx >> 31)) - Int4(0x8000);
-
-			RValue<Int4> sy = As<Int4>(y);
-			RValue<Int4> by = (sy & ~(sy >> 31)) - Int4(0x8000);
-
-			return As<UShort8>(Pack(bx, by) + Short8(0x8000u));
-		}
-	}
-
 	Type *UInt4::getType()
 	{
 		return T(Ice::IceType_v4i32);
diff --git a/src/Reactor/x86.hpp b/src/Reactor/x86.hpp
index 5e759b3..1b8786e 100644
--- a/src/Reactor/x86.hpp
+++ b/src/Reactor/x86.hpp
@@ -66,7 +66,7 @@
 		RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y);
 		RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y);
 		RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y);
-		RValue<Byte8> packuswb(RValue<UShort4> x, RValue<UShort4> y);
+		RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y);
 
 		RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y);
 
diff --git a/src/Renderer/Blitter.cpp b/src/Renderer/Blitter.cpp
index 0c4a160..b2486fb 100644
--- a/src/Renderer/Blitter.cpp
+++ b/src/Renderer/Blitter.cpp
@@ -445,8 +445,8 @@
 		case FORMAT_A8R8G8B8:
 			if(writeRGBA)
 			{
-				UShort4 c0 = As<UShort4>(RoundShort4(c.zyxw));
-				*Pointer<Byte4>(element) = Byte4(Pack(c0, c0));
+				Short4 c0 = RoundShort4(c.zyxw);
+				*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
 			}
 			else
 			{
@@ -460,8 +460,8 @@
 		case FORMAT_SRGB8_A8:
 			if(writeRGBA)
 			{
-				UShort4 c0 = As<UShort4>(RoundShort4(c));
-				*Pointer<Byte4>(element) = Byte4(Pack(c0, c0));
+				Short4 c0 = RoundShort4(c);
+				*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
 			}
 			else
 			{
@@ -474,8 +474,8 @@
 		case FORMAT_X8R8G8B8:
 			if(writeRGBA)
 			{
-				UShort4 c0 = As<UShort4>(RoundShort4(c.zyxw)) | UShort4(0x0000, 0x0000, 0x0000, 0xFFFFu);
-				*Pointer<Byte4>(element) = Byte4(Pack(c0, c0));
+				Short4 c0 = RoundShort4(c.zyxw) | Short4(0x0000, 0x0000, 0x0000, 0x00FF);   // 0xFFFF would be -1 as a signed lane, which PackUnsigned() saturates to 0 instead of 0xFF
+				*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
 			}
 			else
 			{
@@ -489,8 +489,8 @@
 		case FORMAT_SRGB8_X8:
 			if(writeRGBA)
 			{
-				UShort4 c0 = As<UShort4>(RoundShort4(c)) | UShort4(0x0000, 0x0000, 0x0000, 0xFFFFu);
-				*Pointer<Byte4>(element) = Byte4(Pack(c0, c0));
+				Short4 c0 = RoundShort4(c) | Short4(0x0000, 0x0000, 0x0000, 0x00FF);   // 0xFFFF would be -1 as a signed lane, which PackUnsigned() saturates to 0 instead of 0xFF
+				*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
 			}
 			else
 			{
diff --git a/src/Renderer/QuadRasterizer.cpp b/src/Renderer/QuadRasterizer.cpp
index 4721591..2855e16 100644
--- a/src/Renderer/QuadRasterizer.cpp
+++ b/src/Renderer/QuadRasterizer.cpp
@@ -284,7 +284,7 @@
 					for(unsigned int q = 0; q < state.multiSample; q++)
 					{
 						Short4 mask = CmpGT(xxxx, xLeft[q]) & CmpGT(xRight[q], xxxx);
-						cMask[q] = SignMask(Pack(mask, mask)) & 0x0000000F;
+						cMask[q] = SignMask(PackSigned(mask, mask)) & 0x0000000F;
 					}
 
 					quad(cBuffer, zBuffer, sBuffer, cMask, x, y);
diff --git a/src/Shader/PixelPipeline.cpp b/src/Shader/PixelPipeline.cpp
index 66d6a09..4ed1830 100644
--- a/src/Shader/PixelPipeline.cpp
+++ b/src/Shader/PixelPipeline.cpp
@@ -1665,7 +1665,7 @@
 	void PixelPipeline::TEXKILL(Int cMask[4], Vector4s &src)
 	{
 		Short4 test = src.x | src.y | src.z;
-		Int kill = SignMask(Pack(test, test)) ^ 0x0000000F;
+		Int kill = SignMask(PackSigned(test, test)) ^ 0x0000000F;
 
 		for(unsigned int q = 0; q < state.multiSample; q++)
 		{
diff --git a/src/Shader/PixelRoutine.cpp b/src/Shader/PixelRoutine.cpp
index 48a86a1..90e2073 100644
--- a/src/Shader/PixelRoutine.cpp
+++ b/src/Shader/PixelRoutine.cpp
@@ -549,29 +549,29 @@
 			break;
 		case ALPHA_EQUAL:
 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
-			aMask = SignMask(Pack(cmp, Short4(0x0000)));
+			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
 			break;
 		case ALPHA_NOTEQUAL:       // a != b ~ !(a == b)
 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
-			aMask = SignMask(Pack(cmp, Short4(0x0000)));
+			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
 			break;
 		case ALPHA_LESS:           // a < b ~ b > a
 			cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
-			aMask = SignMask(Pack(cmp, Short4(0x0000)));
+			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
 			break;
 		case ALPHA_GREATEREQUAL:   // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
 			equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
 			cmp |= equal;
-			aMask = SignMask(Pack(cmp, Short4(0x0000)));
+			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
 			break;
 		case ALPHA_LESSEQUAL:      // a <= b ~ !(a > b)
 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
-			aMask = SignMask(Pack(cmp, Short4(0x0000)));
+			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
 			break;
 		case ALPHA_GREATER:        // a > b
 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
-			aMask = SignMask(Pack(cmp, Short4(0x0000)));
+			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
 			break;
 		default:
 			ASSERT(false);
@@ -1452,8 +1452,8 @@
 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
 
-				current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
-				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
+				current.z = As<Short4>(PackUnsigned(current.z, current.x));
+				current.y = As<Short4>(PackUnsigned(current.y, current.y));
 
 				current.x = current.z;
 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
@@ -1469,8 +1469,8 @@
 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
 
-				current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
-				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
+				current.z = As<Short4>(PackUnsigned(current.z, current.x));
+				current.y = As<Short4>(PackUnsigned(current.y, current.w));
 
 				current.x = current.z;
 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
@@ -1490,8 +1490,8 @@
 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
 
-				current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
-				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
+				current.z = As<Short4>(PackUnsigned(current.x, current.z));
+				current.y = As<Short4>(PackUnsigned(current.y, current.y));
 
 				current.x = current.z;
 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
@@ -1507,8 +1507,8 @@
 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
 
-				current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
-				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
+				current.z = As<Short4>(PackUnsigned(current.x, current.z));
+				current.y = As<Short4>(PackUnsigned(current.y, current.w));
 
 				current.x = current.z;
 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
@@ -1521,17 +1521,17 @@
 		case FORMAT_G8R8:
 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
-			current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
-			current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
+			current.x = As<Short4>(PackUnsigned(current.x, current.x));
+			current.y = As<Short4>(PackUnsigned(current.y, current.y));
 			current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
 			break;
 		case FORMAT_R8:
 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
-			current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
+			current.x = As<Short4>(PackUnsigned(current.x, current.x));
 			break;
 		case FORMAT_A8:
 			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
-			current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
+			current.w = As<Short4>(PackUnsigned(current.w, current.w));
 			break;
 		case FORMAT_G16R16:
 			current.z = current.x;
@@ -2367,11 +2367,11 @@
 				Short4 tmpCol = Short4(As<Int4>(oC.x));
 				if(state.targetFormat[index] == FORMAT_R8I)
 				{
-					tmpCol = As<Short4>(Pack(tmpCol, tmpCol));
+					tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
 				}
 				else
 				{
-					tmpCol = As<Short4>(Pack(As<UShort4>(tmpCol), As<UShort4>(tmpCol)));
+					tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
 				}
 				packedCol = Extract(As<Int2>(tmpCol), 0);
 
@@ -2466,11 +2466,11 @@
 
 				if(state.targetFormat[index] == FORMAT_G8R8I)
 				{
-					packedCol = As<Int2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+					packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
 				}
 				else
 				{
-					packedCol = As<Int2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
+					packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
 				}
 
 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
@@ -2604,11 +2604,11 @@
 
 				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
 				{
-					packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
 				}
 				else
 				{
-					packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
+					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
 				}
 				value = *Pointer<UInt2>(buffer, 16);
 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
@@ -2622,11 +2622,11 @@
 
 				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
 				{
-					packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
+					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
 				}
 				else
 				{
-					packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w))));
+					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
 				}
 				value = *Pointer<UInt2>(buffer, 16);
 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
diff --git a/src/Shader/SamplerCore.cpp b/src/Shader/SamplerCore.cpp
index 62f76fa..290f402 100644
--- a/src/Shader/SamplerCore.cpp
+++ b/src/Shader/SamplerCore.cpp
@@ -570,7 +570,7 @@
 	void SamplerCore::border(Short4 &mask, Float4 &coordinates)
 	{
 		Int4 border = As<Int4>(CmpLT(Abs(coordinates - Float4(0.5f)), Float4(0.5f)));
-		mask = As<Short4>(Int2(As<Int4>(Pack(border, border))));
+		mask = As<Short4>(Int2(As<Int4>(PackSigned(border, border))));   // Narrow the per-lane compare result with signed saturation; presumably each lane is 0 or -1 so the mask survives - TODO confirm
 	}
 
 	void SamplerCore::border(Int4 &mask, Float4 &coordinates)
@@ -2271,7 +2271,7 @@
 
 			// Clamp
 			convert -= Int4(0x00008000, 0x00008000, 0x00008000, 0x00008000);
-			convert = As<Int4>(Pack(convert, convert));
+			convert = As<Int4>(PackSigned(convert, convert));
 
 			return As<Short4>(Int2(convert)) + Short4(0x8000u);
 		}