Implement 128-bit insert/extract

These operations read or replace one 128-bit (4 x 32-bit) group of a SIMD
vector, which facilitates the transition from 4-wide to arbitrary-width SIMD.

Bug: b/214583550
Change-Id: I13b72c1ff1f758556fc544f8351bf10ede1f2d5b
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/66673
Reviewed-by: Alexis Hétu <sugoi@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index e652dfb..c5dbf12 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -4258,6 +4258,21 @@
 #endif
 }
 
+RValue<Int4> Extract128(RValue<SIMD::Int> val, int i)
+{
+	// Reinterpret val as SIMD::Width / 4 lanes of i128, then return lane i viewed as an Int4.
+	llvm::Type *i128Lanes = llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4);
+	return As<Int4>(V(jit->builder->CreateExtractElement(jit->builder->CreateBitCast(V(val.value()), i128Lanes), i)));
+}
+
+RValue<SIMD::Int> Insert128(RValue<SIMD::Int> val, RValue<Int4> element, int i)
+{
+	// Reinterpret val as SIMD::Width / 4 lanes of i128 and return it with lane i set to element's bits.
+	llvm::Type *i128 = llvm::IntegerType::get(*jit->context, 128);
+	llvm::Value *lanes = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(i128, SIMD::Width / 4));
+	return As<SIMD::Int>(V(jit->builder->CreateInsertElement(lanes, jit->builder->CreateBitCast(V(element.value()), i128), i)));
+}
+
 Type *SIMD::Int::type()
 {
 	return T(llvm::VectorType::get(T(scalar::Int::type()), SIMD::Width, false));
@@ -4344,6 +4359,21 @@
 	return (x & less) | (y & ~less);
 }
 
+RValue<UInt4> Extract128(RValue<SIMD::UInt> val, int i)
+{
+	// Reinterpret val as SIMD::Width / 4 lanes of i128, then return lane i viewed as a UInt4.
+	llvm::Type *i128Lanes = llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4);
+	return As<UInt4>(V(jit->builder->CreateExtractElement(jit->builder->CreateBitCast(V(val.value()), i128Lanes), i)));
+}
+
+RValue<SIMD::UInt> Insert128(RValue<SIMD::UInt> val, RValue<UInt4> element, int i)
+{
+	// Reinterpret val as SIMD::Width / 4 lanes of i128 and return it with lane i set to element's bits.
+	llvm::Type *i128 = llvm::IntegerType::get(*jit->context, 128);
+	llvm::Value *lanes = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(i128, SIMD::Width / 4));
+	return As<SIMD::UInt>(V(jit->builder->CreateInsertElement(lanes, jit->builder->CreateBitCast(V(element.value()), i128), i)));
+}
+
 Type *SIMD::UInt::type()
 {
 	return T(llvm::VectorType::get(T(scalar::UInt::type()), SIMD::Width, false));
@@ -4508,6 +4538,21 @@
 	return -Floor(-x);
 }
 
+RValue<Float4> Extract128(RValue<SIMD::Float> val, int i)
+{
+	// Reinterpret val as SIMD::Width / 4 lanes of i128, then return lane i viewed as a Float4.
+	llvm::Type *i128Lanes = llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4);
+	return As<Float4>(V(jit->builder->CreateExtractElement(jit->builder->CreateBitCast(V(val.value()), i128Lanes), i)));
+}
+
+RValue<SIMD::Float> Insert128(RValue<SIMD::Float> val, RValue<Float4> element, int i)
+{
+	// Reinterpret val as SIMD::Width / 4 lanes of i128 and return it with lane i set to element's bits.
+	llvm::Type *i128 = llvm::IntegerType::get(*jit->context, 128);
+	llvm::Value *lanes = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(i128, SIMD::Width / 4));
+	return As<SIMD::Float>(V(jit->builder->CreateInsertElement(lanes, jit->builder->CreateBitCast(V(element.value()), i128), i)));
+}
+
 Type *SIMD::Float::type()
 {
 	return T(llvm::VectorType::get(T(scalar::Float::type()), SIMD::Width, false));
diff --git a/src/Reactor/SIMD.hpp b/src/Reactor/SIMD.hpp
index cb15f60..de7c2c5 100644
--- a/src/Reactor/SIMD.hpp
+++ b/src/Reactor/SIMD.hpp
@@ -27,6 +27,12 @@
 using Float = rr::Float;
 }  // namespace scalar
 
+namespace packed {  // Aliases for the rr 4-element vector types used by the SIMD Extract128/Insert128 functions.
+using Int4 = rr::Int4;
+using UInt4 = rr::UInt4;
+using Float4 = rr::Float4;
+}  // namespace packed
+
 namespace SIMD {
 
 extern const int Width;
@@ -252,6 +258,8 @@
 RValue<SIMD::Int> RoundIntClamped(RValue<SIMD::Float> cast);
 RValue<scalar::Int> Extract(RValue<SIMD::Int> val, int i);
 RValue<SIMD::Int> Insert(RValue<SIMD::Int> val, RValue<scalar::Int> element, int i);
+RValue<packed::Int4> Extract128(RValue<SIMD::Int> val, int i);  // Returns the i-th 128-bit group of val as an Int4.
+RValue<SIMD::Int> Insert128(RValue<SIMD::Int> val, RValue<packed::Int4> element, int i);  // Returns val with its i-th 128-bit group replaced by element.
 
 RValue<SIMD::UInt> operator+(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
 RValue<SIMD::UInt> operator-(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
@@ -307,6 +315,8 @@
 RValue<SIMD::UInt> Min(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
 RValue<scalar::UInt> Extract(RValue<SIMD::UInt> val, int i);
 RValue<SIMD::UInt> Insert(RValue<SIMD::UInt> val, RValue<scalar::UInt> element, int i);
+RValue<packed::UInt4> Extract128(RValue<SIMD::UInt> val, int i);  // Returns the i-th 128-bit group of val as a UInt4.
+RValue<SIMD::UInt> Insert128(RValue<SIMD::UInt> val, RValue<packed::UInt4> element, int i);  // Returns val with its i-th 128-bit group replaced by element.
 //	RValue<SIMD::UInt> RoundInt(RValue<SIMD::Float> cast);
 
 RValue<SIMD::Float> operator+(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs);
@@ -336,6 +346,8 @@
 RValue<SIMD::Float> Sqrt(RValue<SIMD::Float> x);
 RValue<SIMD::Float> Insert(RValue<SIMD::Float> val, RValue<rr ::Float> element, int i);
 RValue<rr ::Float> Extract(RValue<SIMD::Float> x, int i);
+RValue<packed::Float4> Extract128(RValue<SIMD::Float> val, int i);  // Returns the i-th 128-bit group of val as a Float4.
+RValue<SIMD::Float> Insert128(RValue<SIMD::Float> val, RValue<packed::Float4> element, int i);  // Returns val with its i-th 128-bit group replaced by element.
 
 // Ordered comparison functions
 RValue<SIMD::Int> CmpEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index f0c0aac..3f9e802 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -4886,6 +4886,22 @@
 	}
 }
 
+RValue<Int4> Extract128(RValue<SIMD::Int> val, int i)
+{
+	ASSERT(SIMD::Width == 4);
+	ASSERT(i == 0);
+	// With a 4-wide vector, group 0 is the entire value, so val is simply reinterpreted as an Int4.
+	return As<Int4>(val);
+}
+
+RValue<SIMD::Int> Insert128(RValue<SIMD::Int> val, RValue<Int4> element, int i)
+{
+	ASSERT(SIMD::Width == 4);
+	ASSERT(i == 0);
+	// With a 4-wide vector, group 0 is the entire value: val is not read and element becomes the result.
+	return As<SIMD::Int>(element);
+}
+
 Type *SIMD::Int::type()
 {
 	return T(Ice::IceType_v4i32);
@@ -5009,6 +5025,22 @@
 	return RValue<SIMD::UInt>(V(result));
 }
 
+RValue<UInt4> Extract128(RValue<SIMD::UInt> val, int i)
+{
+	ASSERT(SIMD::Width == 4);
+	ASSERT(i == 0);
+	// With a 4-wide vector, group 0 is the entire value, so val is simply reinterpreted as a UInt4.
+	return As<UInt4>(val);
+}
+
+RValue<SIMD::UInt> Insert128(RValue<SIMD::UInt> val, RValue<UInt4> element, int i)
+{
+	ASSERT(SIMD::Width == 4);
+	ASSERT(i == 0);
+	// With a 4-wide vector, group 0 is the entire value: val is not read and element becomes the result.
+	return As<SIMD::UInt>(element);
+}
+
 Type *SIMD::UInt::type()
 {
 	return T(Ice::IceType_v4i32);
@@ -5277,6 +5309,22 @@
 	}
 }
 
+RValue<Float4> Extract128(RValue<SIMD::Float> val, int i)
+{
+	ASSERT(SIMD::Width == 4);
+	ASSERT(i == 0);
+	// With a 4-wide vector, group 0 is the entire value, so val is simply reinterpreted as a Float4.
+	return As<Float4>(val);
+}
+
+RValue<SIMD::Float> Insert128(RValue<SIMD::Float> val, RValue<Float4> element, int i)
+{
+	ASSERT(SIMD::Width == 4);
+	ASSERT(i == 0);
+	// With a 4-wide vector, group 0 is the entire value: val is not read and element becomes the result.
+	return As<SIMD::Float>(element);
+}
+
 Type *SIMD::Float::type()
 {
 	return T(Ice::IceType_v4f32);
diff --git a/tests/ReactorUnitTests/ReactorSIMD.cpp b/tests/ReactorUnitTests/ReactorSIMD.cpp
index fbc9071..de2deda 100644
--- a/tests/ReactorUnitTests/ReactorSIMD.cpp
+++ b/tests/ReactorUnitTests/ReactorSIMD.cpp
@@ -95,6 +95,45 @@
 	}
 }
 
+TEST(ReactorSIMD, InsertExtract128)
+{
+	// Exercises Extract128/Insert128 on SIMD::Int by shifting each 128-bit group by a different amount.
+	FunctionT<void(int *, int *)> function;
+	{
+		Pointer<Int> r = Pointer<Int>(function.Arg<0>());
+		Pointer<Int> a = Pointer<Int>(function.Arg<1>());
+		SIMD::Int x = *Pointer<SIMD::Int>(a);
+		SIMD::Int y = *Pointer<SIMD::Int>(r);
+		// The host code below passes r filled with zeros, so x keeps the values loaded from a.
+		x -= y;
+		// Shift the i-th 128-bit group of x left by (i + 1) and store it into the same group of y.
+		for(int i = 0; i < SIMD::Width / 4; i++)
+		{
+			y = Insert128(y, Extract128(x, i) << (i + 1), i);
+		}
+
+		*Pointer<SIMD::Int>(r) = y;
+	}
+
+	auto routine = function(testName().c_str());
+
+	std::vector<int> r(SIMD::Width);
+	std::vector<int> a(SIMD::Width);
+
+	for(int i = 0; i < SIMD::Width; i++)
+	{
+		r[i] = 0;
+		a[i] = 1 + i;
+	}
+
+	routine(r.data(), a.data());
+	// Element i lies in 128-bit group i / 4, so it is expected to be shifted left by (i / 4 + 1).
+	for(int i = 0; i < SIMD::Width; i++)
+	{
+		ASSERT_EQ(r[i], a[i] << (i / 4 + 1));
+	}
+}
+
 TEST(ReactorSIMD, Intrinsics_Scatter)
 {
 	Function<Void(Pointer<Float> base, Pointer<Float4> val, Pointer<Int4> offsets)> function;