Add the unimplemented non-x86 code generator

This commit fills in the unimplemented non-x86 (generic) code generator.
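
The generic paths replace calls to the x86 intrinsic wrappers with
plain LLVM IR built from widen/clamp/shuffle sequences. As a rough
scalar model of the widen-clamp-truncate scheme that lowerPSAT emits
per vector lane (illustrative only, not part of the change itself):

    #include <algorithm>
    #include <cstdint>

    // Hypothetical scalar model of lowerPSAT(x, y, true, false).
    uint8_t addSatU8(uint8_t x, uint8_t y)
    {
        int ext = int(x) + int(y);               // widen
        ext = std::min(std::max(ext, 0), 255);   // clamp to [0, 255]
        return uint8_t(ext);                     // truncate
    }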

Change-Id: I189b0ea523ecd9b18c29ad6ed6fa3f798382295b
Reviewed-on: https://swiftshader-review.googlesource.com/20476
Tested-by: Logan Chien <loganchien@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index ff290c2..ba4aeb9 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -119,7 +119,6 @@
 	sw::MutexLock codegenMutex;
 
 #if SWIFTSHADER_LLVM_VERSION >= 7
-#if defined(__i386__) || defined(__x86_64__)
 	llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
 	{
 		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
@@ -149,6 +148,7 @@
 		return ::builder->CreateSExt(::builder->CreateICmp(pred, x, y), dstTy, "");
 	}
 
+#if defined(__i386__) || defined(__x86_64__)
 	llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
 	{
 		llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
@@ -171,6 +171,217 @@
 		return ::builder->CreateSelect(cmp, v, neg);
 	}
 #endif  // defined(__i386__) || defined(__x86_64__)
+
+#if !defined(__i386__) && !defined(__x86_64__)
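+	// Lowers x86 minps/maxps: selects each lane of x or y based on an
+	// ordered floating-point compare.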
+	llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
+							   llvm::FCmpInst::Predicate pred)
+	{
+		return ::builder->CreateSelect(::builder->CreateFCmp(pred, x, y), x, y);
+	}
+
+	// Packed add/sub with saturation
+	llvm::Value *lowerPSAT(llvm::Value *x, llvm::Value *y, bool isAdd, bool isSigned)
+	{
+		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+		llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
+
+		uint64_t numBits = ty->getElementType()->getIntegerBitWidth();
+		assert(numBits < 64 && "shift 64 must be handled separately");
+
+		llvm::Value *max, *min, *extX, *extY;
+		if (isSigned)
+		{
+			max = llvm::ConstantInt::get(extTy, (1LL << (numBits - 1)) - 1, true);
+			min = llvm::ConstantInt::get(extTy, (-1LL << (numBits - 1)), true);
+			extX = ::builder->CreateSExt(x, extTy);
+			extY = ::builder->CreateSExt(y, extTy);
+		}
+		else
+		{
+			max = llvm::ConstantInt::get(extTy, (1ULL << numBits) - 1, false);
+			min = llvm::ConstantInt::get(extTy, 0, false);
+			extX = ::builder->CreateZExt(x, extTy);
+			extY = ::builder->CreateZExt(y, extTy);
+		}
+
+		// Compute in the wider type, clamp to the element range, and
+		// truncate back to the original element type.
+		llvm::Value *res = isAdd ? ::builder->CreateAdd(extX, extY)
+								 : ::builder->CreateSub(extX, extY);
+		res = lowerPMINMAX(res, min, llvm::ICmpInst::ICMP_SGT);
+		res = lowerPMINMAX(res, max, llvm::ICmpInst::ICMP_SLT);
+		return ::builder->CreateTrunc(res, ty);
+	}
+
+	llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
+	{
+		return lowerPSAT(x, y, true, false);
+	}
+
+	llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
+	{
+		return lowerPSAT(x, y, true, true);
+	}
+
+	llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
+	{
+		return lowerPSAT(x, y, false, false);
+	}
+
+	llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
+	{
+		return lowerPSAT(x, y, false, true);
+	}
+
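+	// Lowers sqrtss/sqrtps via the llvm.sqrt intrinsic.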
+	llvm::Value *lowerSQRT(llvm::Value *x)
+	{
+		llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
+			::module, llvm::Intrinsic::sqrt, {x->getType()});
+		return ::builder->CreateCall(sqrt, ARGS(x));
+	}
+
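+	// Lowers rcpss/rcpps. The x86 instructions compute a fast approximate
+	// reciprocal; this generic path computes an exact 1.0 / x instead.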
+	llvm::Value *lowerRCP(llvm::Value *x)
+	{
+		llvm::Type *ty = x->getType();
+		llvm::Constant *one;
+		if (llvm::VectorType *vectorTy = llvm::dyn_cast<llvm::VectorType>(ty))
+		{
+			one = llvm::ConstantVector::getSplat(
+				vectorTy->getNumElements(),
+				llvm::ConstantFP::get(vectorTy->getElementType(), 1));
+		}
+		else
+		{
+			one = llvm::ConstantFP::get(ty, 1);
+		}
+		return ::builder->CreateFDiv(one, x);
+	}
+
+	llvm::Value *lowerRSQRT(llvm::Value *x)
+	{
+		return lowerRCP(lowerSQRT(x));
+	}
+
+	// Rounds to the nearest integer in the current rounding mode (round to
+	// nearest even by default), like cvtss2si/cvtps2dq, then converts the
+	// result to a signed integer.
+	llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
+	{
+		llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
+			::module, llvm::Intrinsic::nearbyint, {x->getType()});
+		return ::builder->CreateFPToSI(::builder->CreateCall(nearbyint, ARGS(x)), ty);
+	}
+
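+	// Lowers the immediate-count vector shifts (psllw/pslld/psraw/psrld,
+	// etc.) by splatting the scalar shift count across a vector operand.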
+	llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
+	{
+		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+		llvm::Value *y = llvm::ConstantVector::getSplat(
+			ty->getNumElements(),
+			llvm::ConstantInt::get(ty->getElementType(), scalarY));
+		return ::builder->CreateShl(x, y);
+	}
+
+	llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
+	{
+		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+		llvm::Value *y = llvm::ConstantVector::getSplat(
+			ty->getNumElements(),
+			llvm::ConstantInt::get(ty->getElementType(), scalarY));
+		return ::builder->CreateAShr(x, y);
+	}
+
+	llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
+	{
+		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+		llvm::Value *y = llvm::ConstantVector::getSplat(
+			ty->getNumElements(),
+			llvm::ConstantInt::get(ty->getElementType(), scalarY));
+		return ::builder->CreateLShr(x, y);
+	}
+
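+	// Lowers x86 pmaddwd: multiplies corresponding elements in a widened
+	// type, then adds adjacent even/odd product pairs into single lanes.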
+	llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
+	{
+		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+		llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
+
+		llvm::Value *extX = ::builder->CreateSExt(x, extTy);
+		llvm::Value *extY = ::builder->CreateSExt(y, extTy);
+		llvm::Value *mult = ::builder->CreateMul(extX, extY);
+
+		llvm::Value *undef = llvm::UndefValue::get(extTy);
+
+		llvm::SmallVector<uint32_t, 16> evenIdx;
+		llvm::SmallVector<uint32_t, 16> oddIdx;
+		for (uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
+		{
+			evenIdx.push_back(i);
+			oddIdx.push_back(i + 1);
+		}
+
+		llvm::Value *lhs = ::builder->CreateShuffleVector(mult, undef, evenIdx);
+		llvm::Value *rhs = ::builder->CreateShuffleVector(mult, undef, oddIdx);
+		return ::builder->CreateAdd(lhs, rhs);
+	}
+
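+	// Lowers x86 pmulhw/pmulhuw: multiplies in a double-width type and
+	// keeps only the upper half of each product.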
+	llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
+	{
+		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+		llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
+
+		llvm::Value *extX, *extY;
+		if (sext)
+		{
+			extX = ::builder->CreateSExt(x, extTy);
+			extY = ::builder->CreateSExt(y, extTy);
+		}
+		else
+		{
+			extX = ::builder->CreateZExt(x, extTy);
+			extY = ::builder->CreateZExt(y, extTy);
+		}
+
+		llvm::Value *mult = ::builder->CreateMul(extX, extY);
+
+		llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
+		llvm::Value *mulh = ::builder->CreateAShr(mult, intTy->getIntegerBitWidth());
+		return ::builder->CreateTrunc(mulh, ty);
+	}
+
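+	// Lowers x86 packssdw/packsswb/packuswb/packusdw: clamps both sources
+	// to the destination element range, truncates, and concatenates them.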
+	llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
+	{
+		llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(x->getType());
+		llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
+
+		llvm::IntegerType *dstElemTy =
+			llvm::cast<llvm::IntegerType>(dstTy->getElementType());
+
+		uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
+		assert(truncNumBits < 64 && "shift 64 must be handled separately");
+		llvm::Constant *max, *min;
+		if (isSigned)
+		{
+			max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
+			min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
+		}
+		else
+		{
+			max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
+			min = llvm::ConstantInt::get(srcTy, 0, false);
+		}
+
+		x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
+		x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
+		y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
+		y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
+
+		x = ::builder->CreateTrunc(x, dstTy);
+		y = ::builder->CreateTrunc(y, dstTy);
+
+		llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
+		std::iota(index.begin(), index.end(), 0);
+
+		return ::builder->CreateShuffleVector(x, y, index);
+	}
+
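+	// Lowers x86 pmovmskb/movmskps-style mask extraction for integer
+	// vectors: gathers each lane's sign bit into bit i of a scalar mask.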
+	llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
+	{
+		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+		llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
+		llvm::Value *cmp = ::builder->CreateICmpSLT(x, zero);
+
+		llvm::Value *ret = ::builder->CreateZExt(
+			::builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
+		for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
+		{
+			llvm::Value *elem = ::builder->CreateZExt(
+				::builder->CreateExtractElement(cmp, i), retTy);
+			ret = ::builder->CreateOr(ret, ::builder->CreateShl(elem, i));
+		}
+		return ret;
+	}
+
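+	// Same as lowerSignMask, but for floating-point vectors, using an
+	// unordered less-than-zero compare per lane.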
+	llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
+	{
+		llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+		llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
+		llvm::Value *cmp = ::builder->CreateFCmpULT(x, zero);
+
+		llvm::Value *ret = ::builder->CreateZExt(
+			::builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
+		for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
+		{
+			llvm::Value *elem = ::builder->CreateZExt(
+				::builder->CreateExtractElement(cmp, i), retTy);
+			ret = ::builder->CreateOr(ret, ::builder->CreateShl(elem, i));
+		}
+		return ret;
+	}
+#endif  // !defined(__i386__) && !defined(__x86_64__)
 #endif  // SWIFTSHADER_LLVM_VERSION >= 7
 }
 
@@ -517,11 +728,18 @@
 
 		#if defined(__x86_64__)
 			static const char arch[] = "x86-64";
-		#else
+		#elif defined(__i386__)
 			static const char arch[] = "x86";
+		#elif defined(__aarch64__)
+			static const char arch[] = "arm64";
+		#elif defined(__arm__)
+			static const char arch[] = "arm";
+		#else
+		#error "unknown architecture"
 		#endif
 
 		llvm::SmallVector<std::string, 1> mattrs;
+#if defined(__i386__) || defined(__x86_64__)
 		mattrs.push_back(CPUID::supportsMMX()    ? "+mmx"    : "-mmx");
 		mattrs.push_back(CPUID::supportsCMOV()   ? "+cmov"   : "-cmov");
 		mattrs.push_back(CPUID::supportsSSE()    ? "+sse"    : "-sse");
@@ -533,6 +751,14 @@
 #else
 		mattrs.push_back(CPUID::supportsSSE4_1() ? "+sse4.1" : "-sse4.1");
 #endif
+#elif defined(__arm__)
+#if __ARM_ARCH >= 8
+		mattrs.push_back("+armv8-a");
+#else
+		// armv7-a requires compiler-rt routines; otherwise, the compiled
+		// kernel might fail to link.
+#endif
+#endif
 
 #if SWIFTSHADER_LLVM_VERSION < 7
 		llvm::JITEmitDebugInfo = false;
@@ -2556,12 +2782,20 @@
 
 	RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::paddusb(x, y);
+#else
+		return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
+#endif
 	}
 
 	RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::psubusb(x, y);
+#else
+		return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
+#endif
 	}
 
 	RValue<Short4> Unpack(RValue<Byte4> x)
@@ -2590,17 +2824,29 @@
 
 	RValue<Int> SignMask(RValue<Byte8> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pmovmskb(x);
+#else
+		return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
+#endif
 	}
 
 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
 //	{
+//#if defined(__i386__) || defined(__x86_64__)
 //		return x86::pcmpgtb(x, y);   // FIXME: Signedness
+//#else
+//		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
+//#endif
 //	}
 
 	RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pcmpeqb(x, y);
+#else
+		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
+#endif
 	}
 
 	Type *Byte8::getType()
@@ -2773,12 +3019,20 @@
 
 	RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::paddsb(x, y);
+#else
+		return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
+#endif
 	}
 
 	RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::psubsb(x, y);
+#else
+		return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
+#endif
 	}
 
 	RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
@@ -2796,17 +3050,29 @@
 
 	RValue<Int> SignMask(RValue<SByte8> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pmovmskb(As<Byte8>(x));
+#else
+		return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
+#endif
 	}
 
 	RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pcmpgtb(x, y);
+#else
+		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
+#endif
 	}
 
 	RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
+#else
+		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
+#endif
 	}
 
 	Type *SByte8::getType()
@@ -2912,7 +3178,12 @@
 	Short4::Short4(RValue<Float4> cast)
 	{
 		Int4 v4i32 = Int4(cast);
+#if defined(__i386__) || defined(__x86_64__)
 		v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
+#else
+		Value *v = v4i32.loadValue();
+		v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
+#endif
 
 		storeValue(As<Short4>(Int2(v4i32)).value);
 	}
@@ -3049,16 +3320,22 @@
 
 	RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 	//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
 
 		return x86::psllw(lhs, rhs);
+#else
+		return As<Short4>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
 	{
-	//	return RValue<Short4>(Nucleus::createAShr(lhs.value, rhs.value));
-
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::psraw(lhs, rhs);
+#else
+		return As<Short4>(V(lowerVectorAShr(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs)
@@ -3134,45 +3411,75 @@
 
 	RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pmaxsw(x, y);
+#else
+		return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
+#endif
 	}
 
 	RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pminsw(x, y);
+#else
+		return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
+#endif
 	}
 
 	RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::paddsw(x, y);
+#else
+		return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
+#endif
 	}
 
 	RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::psubsw(x, y);
+#else
+		return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
+#endif
 	}
 
 	RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pmulhw(x, y);
+#else
+		return As<Short4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
+#endif
 	}
 
 	RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pmaddwd(x, y);
+#else
+		return As<Int2>(V(lowerMulAdd(V(x.value), V(y.value))));
+#endif
 	}
 
 	RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		auto result = x86::packsswb(x, y);
-
+#else
+		auto result = V(lowerPack(V(x.value), V(y.value), true));
+#endif
 		return As<SByte8>(Swizzle(As<Int4>(result), 0x88));
 	}
 
 	RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		auto result = x86::packuswb(x, y);
-
+#else
+		auto result = V(lowerPack(V(x.value), V(y.value), false));
+#endif
 		return As<Byte8>(Swizzle(As<Int4>(result), 0x88));
 	}
 
@@ -3219,12 +3526,20 @@
 
 	RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pcmpgtw(x, y);
+#else
+		return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
+#endif
 	}
 
 	RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pcmpeqw(x, y);
+#else
+		return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
+#endif
 	}
 
 	Type *Short4::getType()
@@ -3381,16 +3696,24 @@
 
 	RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 	//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
 
 		return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
+#else
+		return As<UShort4>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 	//	return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
 
 		return x86::psrlw(lhs, rhs);
+#else
+		return As<UShort4>(V(lowerVectorLShr(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs)
@@ -3420,22 +3743,38 @@
 
 	RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::paddusw(x, y);
+#else
+		return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
+#endif
 	}
 
 	RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::psubusw(x, y);
+#else
+		return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
+#endif
 	}
 
 	RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pmulhuw(x, y);
+#else
+		return As<UShort4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
+#endif
 	}
 
 	RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pavgw(x, y);
+#else
+		return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
+#endif
 	}
 
 	Type *UShort4::getType()
@@ -3486,17 +3825,29 @@
 
 	RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
 	{
-		return x86::psllw(lhs, rhs);   // FIXME: Fallback required
+#if defined(__i386__) || defined(__x86_64__)
+		return x86::psllw(lhs, rhs);
+#else
+		return As<Short8>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
 	{
-		return x86::psraw(lhs, rhs);   // FIXME: Fallback required
+#if defined(__i386__) || defined(__x86_64__)
+		return x86::psraw(lhs, rhs);
+#else
+		return As<Short8>(V(lowerVectorAShr(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
 	{
-		return x86::pmaddwd(x, y);   // FIXME: Fallback required
+#if defined(__i386__) || defined(__x86_64__)
+		return x86::pmaddwd(x, y);
+#else
+		return As<Int4>(V(lowerMulAdd(V(x.value), V(y.value))));
+#endif
 	}
 
 	RValue<Int4> Abs(RValue<Int4> x)
@@ -3507,7 +3858,11 @@
 
 	RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
 	{
-		return x86::pmulhw(x, y);   // FIXME: Fallback required
+#if defined(__i386__) || defined(__x86_64__)
+		return x86::pmulhw(x, y);
+#else
+		return As<Short8>(V(lowerMulHigh(V(x.value), V(y.value), true)));
+#endif
 	}
 
 	Type *Short8::getType()
@@ -3576,12 +3931,20 @@
 
 	RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
 	{
-		return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));   // FIXME: Fallback required
+#if defined(__i386__) || defined(__x86_64__)
+		return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
+#else
+		return As<UShort8>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
 	{
+#if defined(__i386__) || defined(__x86_64__)
-		return x86::psrlw(lhs, rhs);   // FIXME: Fallback required
+		return x86::psrlw(lhs, rhs);
+#else
+		return As<UShort8>(V(lowerVectorLShr(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs)
@@ -3635,7 +3998,11 @@
 
 	RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
 	{
-		return x86::pmulhuw(x, y);   // FIXME: Fallback required
+#if defined(__i386__) || defined(__x86_64__)
+		return x86::pmulhuw(x, y);
+#else
+		return As<UShort8>(V(lowerMulHigh(V(x.value), V(y.value), false)));
+#endif
 	}
 
 	Type *UShort8::getType()
@@ -3983,9 +4350,11 @@
 
 	RValue<Int> RoundInt(RValue<Float> cast)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::cvtss2si(cast);
-
-	//	return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
+#else
+		return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
+#endif
 	}
 
 	Type *Int::getType()
@@ -4401,9 +4770,11 @@
 
 //	RValue<UInt> RoundUInt(RValue<Float> cast)
 //	{
+//#if defined(__i386__) || defined(__x86_64__)
 //		return x86::cvtss2si(val);   // FIXME: Unsigned
-//
-//	//	return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
+//#else
+//		return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
+//#endif
 //	}
 
 	Type *UInt::getType()
@@ -4523,16 +4894,24 @@
 
 	RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 	//	return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
 
 		return x86::pslld(lhs, rhs);
+#else
+		return As<Int2>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 	//	return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
 
 		return x86::psrad(lhs, rhs);
+#else
+		return As<Int2>(V(lowerVectorAShr(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs)
@@ -4716,16 +5095,24 @@
 
 	RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 	//	return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
 
 		return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
+#else
+		return As<UInt2>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 	//	return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
 
 		return x86::psrld(lhs, rhs);
+#else
+		return As<UInt2>(V(lowerVectorLShr(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs)
@@ -4804,11 +5191,13 @@
 
 	Int4::Int4(RValue<Byte4> cast) : XYZW(this)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			*this = x86::pmovzxbd(As<Byte16>(cast));
 		}
 		else
+#endif
 		{
 			int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
 			Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
@@ -4824,11 +5213,13 @@
 
 	Int4::Int4(RValue<SByte4> cast) : XYZW(this)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			*this = x86::pmovsxbd(As<SByte16>(cast));
 		}
 		else
+#endif
 		{
 			int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
 			Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
@@ -4851,11 +5242,13 @@
 
 	Int4::Int4(RValue<Short4> cast) : XYZW(this)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			*this = x86::pmovsxwd(As<Short8>(cast));
 		}
 		else
+#endif
 		{
 			int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
 			Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
@@ -4865,11 +5258,13 @@
 
 	Int4::Int4(RValue<UShort4> cast) : XYZW(this)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			*this = x86::pmovzxwd(As<UShort8>(cast));
 		}
 		else
+#endif
 		{
 			int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
 			Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
@@ -5031,12 +5426,20 @@
 
 	RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::pslld(lhs, rhs);
+#else
+		return As<Int4>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::psrad(lhs, rhs);
+#else
+		return As<Int4>(V(lowerVectorAShr(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs)
@@ -5164,11 +5567,13 @@
 
 	RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			return x86::pmaxsd(x, y);
 		}
 		else
+#endif
 		{
 			RValue<Int4> greater = CmpNLE(x, y);
 			return (x & greater) | (y & ~greater);
@@ -5177,11 +5582,13 @@
 
 	RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			return x86::pminsd(x, y);
 		}
 		else
+#endif
 		{
 			RValue<Int4> less = CmpLT(x, y);
 			return (x & less) | (y & ~less);
@@ -5190,17 +5597,29 @@
 
 	RValue<Int4> RoundInt(RValue<Float4> cast)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::cvtps2dq(cast);
+#else
+		return As<Int4>(V(lowerRoundInt(V(cast.value), T(Int4::getType()))));
+#endif
 	}
 
 	RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::packssdw(x, y);
+#else
+		return As<Short8>(V(lowerPack(V(x.value), V(y.value), true)));
+#endif
 	}
 
 	RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::packusdw(x, y);
+#else
+		return As<UShort8>(V(lowerPack(V(x.value), V(y.value), false)));
+#endif
 	}
 
 	RValue<Int> Extract(RValue<Int4> x, int i)
@@ -5215,7 +5634,11 @@
 
 	RValue<Int> SignMask(RValue<Int4> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::movmskps(As<Float4>(x));
+#else
+		return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
+#endif
 	}
 
 	RValue<Int4> Swizzle(RValue<Int4> x, unsigned char select)
@@ -5384,12 +5807,20 @@
 
 	RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
+#else
+		return As<UInt4>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::psrld(lhs, rhs);
+#else
+		return As<UInt4>(V(lowerVectorLShr(V(lhs.value), rhs)));
+#endif
 	}
 
 	RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs)
@@ -5508,11 +5939,13 @@
 
 	RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			return x86::pmaxud(x, y);
 		}
 		else
+#endif
 		{
 			RValue<UInt4> greater = CmpNLE(x, y);
 			return (x & greater) | (y & ~greater);
@@ -5521,11 +5954,13 @@
 
 	RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			return x86::pminud(x, y);
 		}
 		else
+#endif
 		{
 			RValue<UInt4> less = CmpLT(x, y);
 			return (x & less) | (y & ~less);
@@ -5694,35 +6129,46 @@
 
 	RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
 	{
-		#if defined(__i386__) || defined(__x86_64__)
-			if(exactAtPow2)
-			{
-				// rcpss uses a piecewise-linear approximation which minimizes the relative error
-				// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
-				return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
-			}
-		#endif
-
+#if defined(__i386__) || defined(__x86_64__)
+		if(exactAtPow2)
+		{
+			// rcpss uses a piecewise-linear approximation which minimizes the relative error
+			// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+			return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+		}
 		return x86::rcpss(x);
+#else
+		return As<Float>(V(lowerRCP(V(x.value))));
+#endif
 	}
 
 	RValue<Float> RcpSqrt_pp(RValue<Float> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::rsqrtss(x);
+#else
+		return As<Float>(V(lowerRSQRT(V(x.value))));
+#endif
 	}
 
 	RValue<Float> Sqrt(RValue<Float> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::sqrtss(x);
+#else
+		return As<Float>(V(lowerSQRT(V(x.value))));
+#endif
 	}
 
 	RValue<Float> Round(RValue<Float> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			return x86::roundss(x, 0);
 		}
 		else
+#endif
 		{
 			return Float4(Round(Float4(x))).x;
 		}
@@ -5730,11 +6176,13 @@
 
 	RValue<Float> Trunc(RValue<Float> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			return x86::roundss(x, 3);
 		}
 		else
+#endif
 		{
 			return Float(Int(x));   // Rounded toward zero
 		}
@@ -5742,11 +6190,13 @@
 
 	RValue<Float> Frac(RValue<Float> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			return x - x86::floorss(x);
 		}
 		else
+#endif
 		{
 			return Float4(Frac(Float4(x))).x;
 		}
@@ -5754,11 +6204,13 @@
 
 	RValue<Float> Floor(RValue<Float> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			return x86::floorss(x);
 		}
 		else
+#endif
 		{
 			return Float4(Floor(Float4(x))).x;
 		}
@@ -5766,11 +6218,13 @@
 
 	RValue<Float> Ceil(RValue<Float> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			return x86::ceilss(x);
 		}
 		else
+#endif
 		{
 			return Float4(Ceil(Float4(x))).x;
 		}
@@ -6016,36 +6470,53 @@
 
 	RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::maxps(x, y);
+#else
+		return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OGT)));
+#endif
 	}
 
 	RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::minps(x, y);
+#else
+		return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OLT)));
+#endif
 	}
 
 	RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
 	{
-		#if defined(__i386__) || defined(__x86_64__)
-			if(exactAtPow2)
-			{
-				// rcpps uses a piecewise-linear approximation which minimizes the relative error
-				// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
-				return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
-			}
-		#endif
-
+#if defined(__i386__) || defined(__x86_64__)
+		if(exactAtPow2)
+		{
+			// rcpps uses a piecewise-linear approximation which minimizes the relative error
+			// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+			return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+		}
 		return x86::rcpps(x);
+#else
+		return As<Float4>(V(lowerRCP(V(x.value))));
+#endif
 	}
 
 	RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::rsqrtps(x);
+#else
+		return As<Float4>(V(lowerRSQRT(V(x.value))));
+#endif
 	}
 
 	RValue<Float4> Sqrt(RValue<Float4> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::sqrtps(x);
+#else
+		return As<Float4>(V(lowerSQRT(V(x.value))));
+#endif
 	}
 
 	RValue<Float4> Insert(RValue<Float4> x, RValue<Float> element, int i)
@@ -6099,7 +6570,11 @@
 
 	RValue<Int> SignMask(RValue<Float4> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		return x86::movmskps(x);
+#else
+		return As<Int>(V(lowerFPSignMask(V(x.value), T(Int::getType()))));
+#endif
 	}
 
 	RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
@@ -6150,11 +6625,13 @@
 
 	RValue<Float4> Round(RValue<Float4> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			return x86::roundps(x, 0);
 		}
 		else
+#endif
 		{
 			return Float4(RoundInt(x));
 		}
@@ -6162,11 +6639,13 @@
 
 	RValue<Float4> Trunc(RValue<Float4> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			return x86::roundps(x, 3);
 		}
 		else
+#endif
 		{
 			return Float4(Int4(x));
 		}
@@ -6194,11 +6673,13 @@
 
 	RValue<Float4> Floor(RValue<Float4> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			return x86::floorps(x);
 		}
 		else
+#endif
 		{
 			return x - Frac(x);
 		}
@@ -6206,11 +6687,13 @@
 
 	RValue<Float4> Ceil(RValue<Float4> x)
 	{
+#if defined(__i386__) || defined(__x86_64__)
 		if(CPUID::supportsSSE4_1())
 		{
 			return x86::ceilps(x);
 		}
 		else
+#endif
 		{
 			return -Floor(-x);
 		}
@@ -6311,6 +6794,7 @@
 
 namespace sw
 {
+#if defined(__i386__) || defined(__x86_64__)
 	namespace x86
 	{
 		RValue<Int> cvtss2si(RValue<Float> val)
@@ -6854,4 +7338,5 @@
 #endif
 		}
 	}
+#endif  // defined(__i386__) || defined(__x86_64__)
 }