Refactor vector packing.

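x86 vector packing instructions always treat the input as having signed
integer components, but can perform signed or unsigned saturation on
the output. In Reactor the Pack() intrinsic has overloads which
select between the two based on the signedness of the input type, which
is confusing because the input is interpreted as signed either way.
Replace them with explicit PackSigned() and PackUnsigned() functions,
both taking signed input and named after the saturation they perform.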

Also simplify emulation of saturating add/subtract.

Bug b/37496082

Change-Id: I0625fff429ffb40f42baf9600c7760d9858b5d89
Reviewed-on: https://swiftshader-review.googlesource.com/12548
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 8abb17a..59e7e09 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -2791,7 +2791,7 @@
 	RValue<Short4> RoundShort4(RValue<Float4> cast)
 	{
 		RValue<Int4> int4 = RoundInt(cast);
-		return As<Short4>(Pack(int4, int4));
+		return As<Short4>(PackSigned(int4, int4));
 	}
 
 	RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
@@ -2824,13 +2824,20 @@
 		return x86::pmaddwd(x, y);
 	}
 
-	RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y)
+	RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
 	{
 		auto result = x86::packsswb(x, y);
 
 		return As<SByte8>(Swizzle(As<Int4>(result), 0x88));
 	}
 
+	RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
+	{
+		auto result = x86::packuswb(x, y);
+
+		return As<Byte8>(Swizzle(As<Int4>(result), 0x88));
+	}
+
 	RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
 	{
 		int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};   // Real type is v8i16
@@ -2899,7 +2906,7 @@
 			if(CPUID::supportsSSE4_1())
 			{
 				Int4 int4(Min(cast, Float4(0xFFFF)));   // packusdw takes care of 0x0000 saturation
-				*this = As<Short4>(Pack(As<UInt4>(int4), As<UInt4>(int4)));
+				*this = As<Short4>(PackUnsigned(int4, int4));
 			}
 			else
 			{
@@ -3093,13 +3100,6 @@
 		return x86::pavgw(x, y);
 	}
 
-	RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y)
-	{
-		auto result = x86::packuswb(x, y);
-
-		return As<Byte8>(Swizzle(As<Int4>(result), 0x88));
-	}
-
 	Type *UShort4::getType()
 	{
 		return T(Type_v4i16);
@@ -4846,11 +4846,16 @@
 		return x86::cvtps2dq(cast);
 	}
 
-	RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y)
+	RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
 	{
 		return x86::packssdw(x, y);
 	}
 
+	RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
+	{
+		return x86::packusdw(x, y);
+	}
+
 	RValue<Int> Extract(RValue<Int4> x, int i)
 	{
 		return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
@@ -5180,11 +5185,6 @@
 		}
 	}
 
-	RValue<UShort8> Pack(RValue<UInt4> x, RValue<UInt4> y)
-	{
-		return x86::packusdw(As<Int4>(x), As<Int4>(y));
-	}
-
 	Type *UInt4::getType()
 	{
 		return T(llvm::VectorType::get(T(UInt::getType()), 4));
@@ -6205,7 +6205,7 @@
 			return As<SByte8>(V(::builder->CreateCall2(packsswb, x.value, y.value)));
 		}
 
-		RValue<Byte8> packuswb(RValue<UShort4> x, RValue<UShort4> y)
+		RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
 		{
 			llvm::Function *packuswb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packuswb_128);