Implement byte swizzle operations

Add Swizzle() intrinsics for Byte16 and Byte8, replace the
eight-argument UShort8 Swizzle() with a packed-selector overload, and
add Byte4 constructors and assignment operators. Also move the
LLVM-specific UShort8 Swizzle() implementation to the generic
Reactor.cpp source file.

On x86 these all translate to a pshufb instruction, which is very
efficient.

Bug: b/148295813
Change-Id: Icf88fe1621623f8104c4a642d560643a01b9ef55
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/40549
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/docs/Reactor.md b/docs/Reactor.md
index df016d7..7824a14 100644
--- a/docs/Reactor.md
+++ b/docs/Reactor.md
@@ -141,6 +141,8 @@
 

 Note that this is a bitwise cast. Unlike C++'s ```reinterpret_cast<>```, it does not allow casting between different sized types. Think of it as storing the value in memory and then loading from that same address into the casted type.

 

+An important exception is that 16-, 8-, and 4-byte vectors can be cast to other vectors of one of these sizes. Casting to a longer vector leaves the upper contents undefined.

+

 ### Pointers

 

 Pointers also use a template class:

diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 47144df..2600112 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -2906,35 +2906,6 @@
 #endif
 }
 
-RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
-{
-	RR_DEBUG_INFO_UPDATE_LOC();
-	int pshufb[16] = {
-		select0 + 0,
-		select0 + 1,
-		select1 + 0,
-		select1 + 1,
-		select2 + 0,
-		select2 + 1,
-		select3 + 0,
-		select3 + 1,
-		select4 + 0,
-		select4 + 1,
-		select5 + 0,
-		select5 + 1,
-		select6 + 0,
-		select6 + 1,
-		select7 + 0,
-		select7 + 1,
-	};
-
-	Value *byte16 = Nucleus::createBitCast(x.value, Byte16::getType());
-	Value *shuffle = Nucleus::createShuffleVector(byte16, byte16, pshufb);
-	Value *short8 = Nucleus::createBitCast(shuffle, UShort8::getType());
-
-	return RValue<UShort8>(short8);
-}
-
 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index b3ce607..8801f2a 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -1250,12 +1250,62 @@
 	storeValue(Nucleus::createBitCast(cast.value, getType()));
 }
 
+Byte4::Byte4(RValue<UShort4> cast)
+{
+	// TODO(b/148379603): Optimize narrowing swizzle.
+	*this = As<Byte4>(Swizzle(As<Byte8>(cast), 0x0246'0246));
+}
+
+Byte4::Byte4(RValue<Short4> cast)
+{
+	// TODO(b/148379603): Optimize narrowing swizzle.
+	*this = As<Byte4>(Swizzle(As<Byte8>(cast), 0x0246'0246));
+}
+
+Byte4::Byte4(RValue<UInt4> cast)
+{
+	// TODO(b/148379603): Optimize narrowing swizzle.
+	*this = As<Byte4>(Swizzle(As<Byte16>(cast), 0x048C'048C'048C'048C));
+}
+
+Byte4::Byte4(RValue<Int4> cast)
+{
+	// TODO(b/148379603): Optimize narrowing swizzle.
+	*this = As<Byte4>(Swizzle(As<Byte16>(cast), 0x048C'048C'048C'048C));
+}
+
+Byte4::Byte4(RValue<Byte4> rhs)
+{
+	storeValue(rhs.value);
+}
+
+Byte4::Byte4(const Byte4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+}
+
 Byte4::Byte4(const Reference<Byte4> &rhs)
 {
 	Value *value = rhs.loadValue();
 	storeValue(value);
 }
 
+RValue<Byte4> Byte4::operator=(RValue<Byte4> rhs)
+{
+	storeValue(rhs.value);
+
+	return rhs;
+}
+
+RValue<Byte4> Byte4::operator=(const Byte4 &rhs)
+{
+	Value *value = rhs.loadValue();
+	storeValue(value);
+
+	return RValue<Byte4>(value);
+}
+
 Byte8::Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
 {
 	int64_t constantVector[8] = { x0, x1, x2, x3, x4, x5, x6, x7 };
@@ -1417,8 +1467,35 @@
 	return RValue<Byte8>(Nucleus::createNot(val.value));
 }
 
+RValue<Byte8> Swizzle(RValue<Byte8> x, uint32_t select)
+{
+	// Real type is v16i8
+	// TODO(b/148379603): Optimize narrowing swizzle.
+	int shuffle[16] = {
+		static_cast<int>((select >> 28) & 0x07),
+		static_cast<int>((select >> 24) & 0x07),
+		static_cast<int>((select >> 20) & 0x07),
+		static_cast<int>((select >> 16) & 0x07),
+		static_cast<int>((select >> 12) & 0x07),
+		static_cast<int>((select >> 8) & 0x07),
+		static_cast<int>((select >> 4) & 0x07),
+		static_cast<int>((select >> 0) & 0x07),
+		static_cast<int>((select >> 28) & 0x07),
+		static_cast<int>((select >> 24) & 0x07),
+		static_cast<int>((select >> 20) & 0x07),
+		static_cast<int>((select >> 16) & 0x07),
+		static_cast<int>((select >> 12) & 0x07),
+		static_cast<int>((select >> 8) & 0x07),
+		static_cast<int>((select >> 4) & 0x07),
+		static_cast<int>((select >> 0) & 0x07),
+	};
+
+	return As<Byte8>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
+}
+
 RValue<Short4> Unpack(RValue<Byte4> x)
 {
+	// TODO(b/148379603): Optimize narrowing swizzle.
 	int shuffle[16] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };  // Real type is v16i8
 	return As<Short4>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
 }
@@ -1430,12 +1507,14 @@
 
 RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y)
 {
+	// TODO(b/148379603): Optimize narrowing swizzle.
 	int shuffle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };  // Real type is v16i8
 	return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
 }
 
 RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y)
 {
+	// TODO(b/148379603): Optimize narrowing swizzle.
 	int shuffle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };  // Real type is v16i8
 	auto lowHigh = RValue<Byte16>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
 	return As<Short4>(Swizzle(As<Int4>(lowHigh), 0x2323));
@@ -1606,12 +1685,14 @@
 
 RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
 {
+	// TODO(b/148379603): Optimize narrowing swizzle.
 	int shuffle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };  // Real type is v16i8
 	return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
 }
 
 RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y)
 {
+	// TODO(b/148379603): Optimize narrowing swizzle.
 	int shuffle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };  // Real type is v16i8
 	auto lowHigh = RValue<Byte16>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
 	return As<Short4>(Swizzle(As<Int4>(lowHigh), 0x2323));
@@ -1657,6 +1738,30 @@
 	return RValue<Byte16>(value);
 }
 
+RValue<Byte16> Swizzle(RValue<Byte16> x, uint64_t select)
+{
+	int shuffle[16] = {
+		static_cast<int>((select >> 60) & 0x0F),
+		static_cast<int>((select >> 56) & 0x0F),
+		static_cast<int>((select >> 52) & 0x0F),
+		static_cast<int>((select >> 48) & 0x0F),
+		static_cast<int>((select >> 44) & 0x0F),
+		static_cast<int>((select >> 40) & 0x0F),
+		static_cast<int>((select >> 36) & 0x0F),
+		static_cast<int>((select >> 32) & 0x0F),
+		static_cast<int>((select >> 28) & 0x0F),
+		static_cast<int>((select >> 24) & 0x0F),
+		static_cast<int>((select >> 20) & 0x0F),
+		static_cast<int>((select >> 16) & 0x0F),
+		static_cast<int>((select >> 12) & 0x0F),
+		static_cast<int>((select >> 8) & 0x0F),
+		static_cast<int>((select >> 4) & 0x0F),
+		static_cast<int>((select >> 0) & 0x0F),
+	};
+
+	return As<Byte16>(Nucleus::createShuffleVector(x.value, x.value, shuffle));
+}
+
 Short2::Short2(RValue<Short4> cast)
 {
 	storeValue(Nucleus::createBitCast(cast.value, getType()));
@@ -1890,6 +1995,7 @@
 
 RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y)
 {
+	// TODO(b/148379603): Optimize narrowing swizzle.
 	int shuffle[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };  // Real type is v8i16
 	auto lowHigh = RValue<Short8>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
 	return As<Int2>(Swizzle(As<Int4>(lowHigh), 0x2323));
@@ -1898,6 +2004,7 @@
 RValue<Short4> Swizzle(RValue<Short4> x, uint16_t select)
 {
 	// Real type is v8i16
+	// TODO(b/148379603): Optimize narrowing swizzle.
 	int shuffle[8] = {
 		(select >> 12) & 0x03,
 		(select >> 8) & 0x03,
@@ -2214,6 +2321,22 @@
 	return RValue<UShort8>(Nucleus::createNot(val.value));
 }
 
+RValue<UShort8> Swizzle(RValue<UShort8> x, uint32_t select)
+{
+	int swizzle[16] = {
+		static_cast<int>((select >> 28) & 0x07),
+		static_cast<int>((select >> 24) & 0x07),
+		static_cast<int>((select >> 20) & 0x07),
+		static_cast<int>((select >> 16) & 0x07),
+		static_cast<int>((select >> 12) & 0x07),
+		static_cast<int>((select >> 8) & 0x07),
+		static_cast<int>((select >> 4) & 0x07),
+		static_cast<int>((select >> 0) & 0x07),
+	};
+
+	return RValue<UShort8>(Nucleus::createShuffleVector(x.value, x.value, swizzle));
+}
+
 Int::Int(Argument<Int> argument)
 {
 	storeValue(argument.value);
@@ -3073,12 +3196,14 @@
 
 RValue<Short4> UnpackLow(RValue<Int2> x, RValue<Int2> y)
 {
+	// TODO(b/148379603): Optimize narrowing swizzle.
 	int shuffle[4] = { 0, 4, 1, 5 };  // Real type is v4i32
 	return As<Short4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
 }
 
 RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y)
 {
+	// TODO(b/148379603): Optimize narrowing swizzle.
 	int shuffle[4] = { 0, 4, 1, 5 };  // Real type is v4i32
 	auto lowHigh = RValue<Int4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
 	return As<Short4>(Swizzle(lowHigh, 0x2323));
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 3ddbec9..488c0be 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -517,15 +517,19 @@
 {
 public:
 	explicit Byte4(RValue<Byte8> cast);
+	explicit Byte4(RValue<UShort4> cast);
+	explicit Byte4(RValue<Short4> cast);
+	explicit Byte4(RValue<UInt4> cast);
+	explicit Byte4(RValue<Int4> cast);
 
 	Byte4() = default;
 	//	Byte4(int x, int y, int z, int w);
-	//	Byte4(RValue<Byte4> rhs);
-	//	Byte4(const Byte4 &rhs);
+	Byte4(RValue<Byte4> rhs);
+	Byte4(const Byte4 &rhs);
 	Byte4(const Reference<Byte4> &rhs);
 
-	//	RValue<Byte4> operator=(RValue<Byte4> rhs);
-	//	RValue<Byte4> operator=(const Byte4 &rhs);
+	RValue<Byte4> operator=(RValue<Byte4> rhs);
+	RValue<Byte4> operator=(const Byte4 &rhs);
 	//	RValue<Byte4> operator=(const Reference<Byte4> &rhs);
 
 	static Type *getType();
@@ -656,6 +660,7 @@
 RValue<Int> SignMask(RValue<Byte8> x);
 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y);
 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y);
+RValue<Byte8> Swizzle(RValue<Byte8> x, uint32_t select);
 
 class SByte8 : public LValue<SByte8>
 {
@@ -713,7 +718,6 @@
 {
 public:
 	Byte16() = default;
-	//	Byte16(int x, int y, int z, int w);
 	Byte16(RValue<Byte16> rhs);
 	Byte16(const Byte16 &rhs);
 	Byte16(const Reference<Byte16> &rhs);
@@ -752,6 +756,7 @@
 //	const Byte16 &operator++(Byte16 &val);   // Pre-increment
 //	RValue<Byte16> operator--(Byte16 &val, int);   // Post-decrement
 //	const Byte16 &operator--(Byte16 &val);   // Pre-decrement
+RValue<Byte16> Swizzle(RValue<Byte16> x, uint64_t select);
 
 class SByte16 : public LValue<SByte16>
 {
@@ -1065,7 +1070,7 @@
 //	RValue<Bool> operator!=(RValue<UShort8> lhs, RValue<UShort8> rhs);
 //	RValue<Bool> operator==(RValue<UShort8> lhs, RValue<UShort8> rhs);
 
-RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7);
+RValue<UShort8> Swizzle(RValue<UShort8> x, uint32_t select);
 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y);
 
 class Int : public LValue<Int>
diff --git a/src/Reactor/ReactorUnitTests.cpp b/src/Reactor/ReactorUnitTests.cpp
index 103643e..2ffbcdb 100644
--- a/src/Reactor/ReactorUnitTests.cpp
+++ b/src/Reactor/ReactorUnitTests.cpp
@@ -345,133 +345,199 @@
 	}
 }
 
+TEST(ReactorUnitTests, Cast)
+{
+	FunctionT<void(void *)> function;
+	{
+		Pointer<Byte> out = function.Arg<0>();
+
+		Int4 c = Int4(0x01020304, 0x05060708, 0x09101112, 0x13141516);
+		*Pointer<Short4>(out + 16 * 0) = Short4(c);
+		*Pointer<Byte4>(out + 16 * 1 + 0) = Byte4(c);
+		*Pointer<Byte4>(out + 16 * 1 + 4) = Byte4(As<Byte8>(c));
+		*Pointer<Byte4>(out + 16 * 1 + 8) = Byte4(As<Short4>(c));
+	}
+
+	auto routine = function("one");
+
+	if(routine)
+	{
+		int out[2][4];
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		EXPECT_EQ(out[0][0], 0x07080304);
+		EXPECT_EQ(out[0][1], 0x15161112);
+
+		EXPECT_EQ(out[1][0], 0x16120804);
+		EXPECT_EQ(out[1][1], 0x01020304);
+		EXPECT_EQ(out[1][2], 0x06080204);
+	}
+}
+
+static uint16_t swizzleCode4(int i)
+{
+	auto x = (i >> 0) & 0x03;
+	auto y = (i >> 2) & 0x03;
+	auto z = (i >> 4) & 0x03;
+	auto w = (i >> 6) & 0x03;
+	return static_cast<uint16_t>((x << 12) | (y << 8) | (z << 4) | (w << 0));
+}
+
+TEST(ReactorUnitTests, Swizzle4)
+{
+	FunctionT<void(void *)> function;
+	{
+		Pointer<Byte> out = function.Arg<0>();
+
+		for(int i = 0; i < 256; i++)
+		{
+			*Pointer<Float4>(out + 16 * i) = Swizzle(Float4(1.0f, 2.0f, 3.0f, 4.0f), swizzleCode4(i));
+		}
+
+		for(int i = 0; i < 256; i++)
+		{
+			*Pointer<Float4>(out + 16 * (256 + i)) = ShuffleLowHigh(Float4(1.0f, 2.0f, 3.0f, 4.0f), Float4(5.0f, 6.0f, 7.0f, 8.0f), swizzleCode4(i));
+		}
+
+		*Pointer<Float4>(out + 16 * (512 + 0)) = UnpackLow(Float4(1.0f, 2.0f, 3.0f, 4.0f), Float4(5.0f, 6.0f, 7.0f, 8.0f));
+		*Pointer<Float4>(out + 16 * (512 + 1)) = UnpackHigh(Float4(1.0f, 2.0f, 3.0f, 4.0f), Float4(5.0f, 6.0f, 7.0f, 8.0f));
+		*Pointer<Int2>(out + 16 * (512 + 2)) = UnpackLow(Short4(1, 2, 3, 4), Short4(5, 6, 7, 8));
+		*Pointer<Int2>(out + 16 * (512 + 3)) = UnpackHigh(Short4(1, 2, 3, 4), Short4(5, 6, 7, 8));
+		*Pointer<Short4>(out + 16 * (512 + 4)) = UnpackLow(Byte8(1, 2, 3, 4, 5, 6, 7, 8), Byte8(9, 10, 11, 12, 13, 14, 15, 16));
+		*Pointer<Short4>(out + 16 * (512 + 5)) = UnpackHigh(Byte8(1, 2, 3, 4, 5, 6, 7, 8), Byte8(9, 10, 11, 12, 13, 14, 15, 16));
+
+		for(int i = 0; i < 256; i++)
+		{
+			*Pointer<Short4>(out + 16 * (512 + 6) + (8 * i)) =
+			    Swizzle(Short4(1, 2, 3, 4), swizzleCode4(i));
+		}
+
+		for(int i = 0; i < 256; i++)
+		{
+			*Pointer<Int4>(out + 16 * (512 + 6 + i) + (8 * 256)) =
+			    Swizzle(Int4(1, 2, 3, 4), swizzleCode4(i));
+		}
+	}
+
+	auto routine = function("one");
+
+	if(routine)
+	{
+		struct
+		{
+			float f[256 + 256 + 2][4];
+			int i[388][4];
+		} out;
+
+		memset(&out, 0, sizeof(out));
+
+		routine(&out);
+
+		for(int i = 0; i < 256; i++)
+		{
+			EXPECT_EQ(out.f[i][0], float((i >> 0) & 0x03) + 1.0f);
+			EXPECT_EQ(out.f[i][1], float((i >> 2) & 0x03) + 1.0f);
+			EXPECT_EQ(out.f[i][2], float((i >> 4) & 0x03) + 1.0f);
+			EXPECT_EQ(out.f[i][3], float((i >> 6) & 0x03) + 1.0f);
+		}
+
+		for(int i = 0; i < 256; i++)
+		{
+			EXPECT_EQ(out.f[256 + i][0], float((i >> 0) & 0x03) + 1.0f);
+			EXPECT_EQ(out.f[256 + i][1], float((i >> 2) & 0x03) + 1.0f);
+			EXPECT_EQ(out.f[256 + i][2], float((i >> 4) & 0x03) + 5.0f);
+			EXPECT_EQ(out.f[256 + i][3], float((i >> 6) & 0x03) + 5.0f);
+		}
+
+		EXPECT_EQ(out.f[512 + 0][0], 1.0f);
+		EXPECT_EQ(out.f[512 + 0][1], 5.0f);
+		EXPECT_EQ(out.f[512 + 0][2], 2.0f);
+		EXPECT_EQ(out.f[512 + 0][3], 6.0f);
+
+		EXPECT_EQ(out.f[512 + 1][0], 3.0f);
+		EXPECT_EQ(out.f[512 + 1][1], 7.0f);
+		EXPECT_EQ(out.f[512 + 1][2], 4.0f);
+		EXPECT_EQ(out.f[512 + 1][3], 8.0f);
+
+		EXPECT_EQ(out.i[0][0], 0x00050001);
+		EXPECT_EQ(out.i[0][1], 0x00060002);
+		EXPECT_EQ(out.i[0][2], 0x00000000);
+		EXPECT_EQ(out.i[0][3], 0x00000000);
+
+		EXPECT_EQ(out.i[1][0], 0x00070003);
+		EXPECT_EQ(out.i[1][1], 0x00080004);
+		EXPECT_EQ(out.i[1][2], 0x00000000);
+		EXPECT_EQ(out.i[1][3], 0x00000000);
+
+		EXPECT_EQ(out.i[2][0], 0x0A020901);
+		EXPECT_EQ(out.i[2][1], 0x0C040B03);
+		EXPECT_EQ(out.i[2][2], 0x00000000);
+		EXPECT_EQ(out.i[2][3], 0x00000000);
+
+		EXPECT_EQ(out.i[3][0], 0x0E060D05);
+		EXPECT_EQ(out.i[3][1], 0x10080F07);
+		EXPECT_EQ(out.i[3][2], 0x00000000);
+		EXPECT_EQ(out.i[3][3], 0x00000000);
+
+		for(int i = 0; i < 256; i++)
+		{
+			EXPECT_EQ(out.i[4 + i / 2][0 + (i % 2) * 2] & 0xFFFF,
+			          ((i >> 0) & 0x03) + 1);
+			EXPECT_EQ(out.i[4 + i / 2][0 + (i % 2) * 2] >> 16,
+			          ((i >> 2) & 0x03) + 1);
+			EXPECT_EQ(out.i[4 + i / 2][1 + (i % 2) * 2] & 0xFFFF,
+			          ((i >> 4) & 0x03) + 1);
+			EXPECT_EQ(out.i[4 + i / 2][1 + (i % 2) * 2] >> 16,
+			          ((i >> 6) & 0x03) + 1);
+		}
+
+		for(int i = 0; i < 256; i++)
+		{
+			EXPECT_EQ(out.i[132 + i][0], ((i >> 0) & 0x03) + 1);
+			EXPECT_EQ(out.i[132 + i][1], ((i >> 2) & 0x03) + 1);
+			EXPECT_EQ(out.i[132 + i][2], ((i >> 4) & 0x03) + 1);
+			EXPECT_EQ(out.i[132 + i][3], ((i >> 6) & 0x03) + 1);
+		}
+	}
+}
+
 TEST(ReactorUnitTests, Swizzle)
 {
-	auto swizzleCode = [](int i) -> uint16_t {
-		auto x = (i >> 0) & 0x03;
-		auto y = (i >> 2) & 0x03;
-		auto z = (i >> 4) & 0x03;
-		auto w = (i >> 6) & 0x03;
-		return (x << 12) | (y << 8) | (z << 4) | (w << 0);
-	};
-
+	FunctionT<void(void *)> function;
 	{
-		FunctionT<int(void *)> function;
-		{
-			Pointer<Byte> out = function.Arg<0>();
+		Pointer<Byte> out = function.Arg<0>();
 
-			for(int i = 0; i < 256; i++)
-			{
-				*Pointer<Float4>(out + 16 * i) = Swizzle(Float4(1.0f, 2.0f, 3.0f, 4.0f), swizzleCode(i));
-			}
+		Int4 c = Int4(0x01020304, 0x05060708, 0x09101112, 0x13141516);
+		*Pointer<Byte16>(out + 16 * 0) = Swizzle(As<Byte16>(c), 0xFEDCBA9876543210ull);
+		*Pointer<Byte8>(out + 16 * 1) = Swizzle(As<Byte8>(c), 0x76543210u);
+		*Pointer<UShort8>(out + 16 * 2) = Swizzle(As<UShort8>(c), 0x76543210u);
+	}
 
-			for(int i = 0; i < 256; i++)
-			{
-				*Pointer<Float4>(out + 16 * (256 + i)) = ShuffleLowHigh(Float4(1.0f, 2.0f, 3.0f, 4.0f), Float4(5.0f, 6.0f, 7.0f, 8.0f), swizzleCode(i));
-			}
+	auto routine = function("one");
 
-			*Pointer<Float4>(out + 16 * (512 + 0)) = UnpackLow(Float4(1.0f, 2.0f, 3.0f, 4.0f), Float4(5.0f, 6.0f, 7.0f, 8.0f));
-			*Pointer<Float4>(out + 16 * (512 + 1)) = UnpackHigh(Float4(1.0f, 2.0f, 3.0f, 4.0f), Float4(5.0f, 6.0f, 7.0f, 8.0f));
-			*Pointer<Int2>(out + 16 * (512 + 2)) = UnpackLow(Short4(1, 2, 3, 4), Short4(5, 6, 7, 8));
-			*Pointer<Int2>(out + 16 * (512 + 3)) = UnpackHigh(Short4(1, 2, 3, 4), Short4(5, 6, 7, 8));
-			*Pointer<Short4>(out + 16 * (512 + 4)) = UnpackLow(Byte8(1, 2, 3, 4, 5, 6, 7, 8), Byte8(9, 10, 11, 12, 13, 14, 15, 16));
-			*Pointer<Short4>(out + 16 * (512 + 5)) = UnpackHigh(Byte8(1, 2, 3, 4, 5, 6, 7, 8), Byte8(9, 10, 11, 12, 13, 14, 15, 16));
+	if(routine)
+	{
+		int out[3][4];
 
-			for(int i = 0; i < 256; i++)
-			{
-				*Pointer<Short4>(out + 16 * (512 + 6) + (8 * i)) =
-				    Swizzle(Short4(1, 2, 3, 4), swizzleCode(i));
-			}
+		memset(&out, 0, sizeof(out));
 
-			for(int i = 0; i < 256; i++)
-			{
-				*Pointer<Int4>(out + 16 * (512 + 6 + i) + (8 * 256)) =
-				    Swizzle(Int4(1, 2, 3, 4), swizzleCode(i));
-			}
+		routine(&out);
 
-			Return(0);
-		}
+		EXPECT_EQ(out[0][0], 0x16151413);
+		EXPECT_EQ(out[0][1], 0x12111009);
+		EXPECT_EQ(out[0][2], 0x08070605);
+		EXPECT_EQ(out[0][3], 0x04030201);
 
-		auto routine = function("one");
+		EXPECT_EQ(out[1][0], 0x08070605);
+		EXPECT_EQ(out[1][1], 0x04030201);
 
-		if(routine)
-		{
-			struct
-			{
-				float f[256 + 256 + 2][4];
-				int i[388][4];
-			} out;
-
-			memset(&out, 0, sizeof(out));
-
-			routine(&out);
-
-			for(int i = 0; i < 256; i++)
-			{
-				EXPECT_EQ(out.f[i][0], float((i >> 0) & 0x03) + 1.0f);
-				EXPECT_EQ(out.f[i][1], float((i >> 2) & 0x03) + 1.0f);
-				EXPECT_EQ(out.f[i][2], float((i >> 4) & 0x03) + 1.0f);
-				EXPECT_EQ(out.f[i][3], float((i >> 6) & 0x03) + 1.0f);
-			}
-
-			for(int i = 0; i < 256; i++)
-			{
-				EXPECT_EQ(out.f[256 + i][0], float((i >> 0) & 0x03) + 1.0f);
-				EXPECT_EQ(out.f[256 + i][1], float((i >> 2) & 0x03) + 1.0f);
-				EXPECT_EQ(out.f[256 + i][2], float((i >> 4) & 0x03) + 5.0f);
-				EXPECT_EQ(out.f[256 + i][3], float((i >> 6) & 0x03) + 5.0f);
-			}
-
-			EXPECT_EQ(out.f[512 + 0][0], 1.0f);
-			EXPECT_EQ(out.f[512 + 0][1], 5.0f);
-			EXPECT_EQ(out.f[512 + 0][2], 2.0f);
-			EXPECT_EQ(out.f[512 + 0][3], 6.0f);
-
-			EXPECT_EQ(out.f[512 + 1][0], 3.0f);
-			EXPECT_EQ(out.f[512 + 1][1], 7.0f);
-			EXPECT_EQ(out.f[512 + 1][2], 4.0f);
-			EXPECT_EQ(out.f[512 + 1][3], 8.0f);
-
-			EXPECT_EQ(out.i[0][0], 0x00050001);
-			EXPECT_EQ(out.i[0][1], 0x00060002);
-			EXPECT_EQ(out.i[0][2], 0x00000000);
-			EXPECT_EQ(out.i[0][3], 0x00000000);
-
-			EXPECT_EQ(out.i[1][0], 0x00070003);
-			EXPECT_EQ(out.i[1][1], 0x00080004);
-			EXPECT_EQ(out.i[1][2], 0x00000000);
-			EXPECT_EQ(out.i[1][3], 0x00000000);
-
-			EXPECT_EQ(out.i[2][0], 0x0A020901);
-			EXPECT_EQ(out.i[2][1], 0x0C040B03);
-			EXPECT_EQ(out.i[2][2], 0x00000000);
-			EXPECT_EQ(out.i[2][3], 0x00000000);
-
-			EXPECT_EQ(out.i[3][0], 0x0E060D05);
-			EXPECT_EQ(out.i[3][1], 0x10080F07);
-			EXPECT_EQ(out.i[3][2], 0x00000000);
-			EXPECT_EQ(out.i[3][3], 0x00000000);
-
-			for(int i = 0; i < 256; i++)
-			{
-				EXPECT_EQ(out.i[4 + i / 2][0 + (i % 2) * 2] & 0xFFFF,
-				          ((i >> 0) & 0x03) + 1);
-				EXPECT_EQ(out.i[4 + i / 2][0 + (i % 2) * 2] >> 16,
-				          ((i >> 2) & 0x03) + 1);
-				EXPECT_EQ(out.i[4 + i / 2][1 + (i % 2) * 2] & 0xFFFF,
-				          ((i >> 4) & 0x03) + 1);
-				EXPECT_EQ(out.i[4 + i / 2][1 + (i % 2) * 2] >> 16,
-				          ((i >> 6) & 0x03) + 1);
-			}
-
-			for(int i = 0; i < 256; i++)
-			{
-				EXPECT_EQ(out.i[132 + i][0], ((i >> 0) & 0x03) + 1);
-				EXPECT_EQ(out.i[132 + i][1], ((i >> 2) & 0x03) + 1);
-				EXPECT_EQ(out.i[132 + i][2], ((i >> 4) & 0x03) + 1);
-				EXPECT_EQ(out.i[132 + i][3], ((i >> 6) & 0x03) + 1);
-			}
-		}
+		EXPECT_EQ(out[2][0], 0x15161314);
+		EXPECT_EQ(out[2][1], 0x11120910);
+		EXPECT_EQ(out[2][2], 0x07080506);
+		EXPECT_EQ(out[2][3], 0x03040102);
 	}
 }
 
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index a9375c4..5d057b0 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -2682,24 +2682,12 @@
 	}
 }
 
-RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
-{
-	UNIMPLEMENTED("RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)");
-	return UShort8(0);
-}
-
 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
 {
 	UNIMPLEMENTED("RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)");
 	return UShort8(0);
 }
 
-// FIXME: Implement as Shuffle(x, y, Select(i0, ..., i16)) and Shuffle(x, y, SELECT_PACK_REPEAT(element))
-//	RValue<UShort8> PackRepeat(RValue<Byte16> x, RValue<Byte16> y, int element)
-//	{
-//		ASSERT(false && "UNIMPLEMENTED"); return RValue<UShort8>(V(nullptr));
-//	}
-
 Type *UShort8::getType()
 {
 	return T(Ice::IceType_v8i16);