Implement floating-point rounding intrinsic.

BUG=swiftshader:15

Change-Id: I8e53f2fdb8208f8be0f4cdff3241b4a5efe9bc8a
Reviewed-on: https://chromium-review.googlesource.com/404352
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index 5cd4faf..96b1df4 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -511,7 +511,9 @@
     kRoundUp = 0x2,
     kRoundToZero = 0x3
   };
-  void roundsd(XmmRegister dst, XmmRegister src, RoundingMode mode);
+  void round(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mode);
+  void round(Type Ty, XmmRegister dst, const Address &src,
+             const Immediate &mode);
 
   //----------------------------------------------------------------------------
   //
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index 0abf587..655e5ad 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -2392,17 +2392,58 @@
 }
 
 template <typename TraitsType>
-void AssemblerX86Base<TraitsType>::roundsd(XmmRegister dst, XmmRegister src,
-                                           RoundingMode mode) {
+void AssemblerX86Base<TraitsType>::round(Type Ty, XmmRegister dst,
+                                         XmmRegister src,
+                                         const Immediate &mode) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
   emitRexRB(RexTypeIrrelevant, dst, src);
   emitUint8(0x0F);
   emitUint8(0x3A);
-  emitUint8(0x0B);
+  switch (Ty) {
+  case IceType_v4f32:
+    emitUint8(0x08);
+    break;
+  case IceType_f32:
+    emitUint8(0x0A);
+    break;
+  case IceType_f64:
+    emitUint8(0x0B);
+    break;
+  default:
+    assert(false && "Unsupported round operand type");
+  }
   emitXmmRegisterOperand(dst, src);
   // Mask precision exeption.
-  emitUint8(static_cast<uint8_t>(mode) | 0x8);
+  emitUint8(static_cast<uint8_t>(mode.value()) | 0x8);
+}
+
+template <typename TraitsType>
+void AssemblerX86Base<TraitsType>::round(Type Ty, XmmRegister dst,
+                                         const Address &src,
+                                         const Immediate &mode) {
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitUint8(0x66);
+  emitAddrSizeOverridePrefix();
+  emitRex(RexTypeIrrelevant, src, dst);
+  emitUint8(0x0F);
+  emitUint8(0x3A);
+  switch (Ty) {
+  case IceType_v4f32:
+    emitUint8(0x08);
+    break;
+  case IceType_f32:
+    emitUint8(0x0A);
+    break;
+  case IceType_f64:
+    emitUint8(0x0B);
+    break;
+  default:
+    assert(false && "Unsupported round operand type");
+  }
+  emitOperand(gprEncoding(dst), src);
+  // Mask precision exception.
+  emitUint8(static_cast<uint8_t>(mode.value()) | 0x8);
 }
 
 template <typename TraitsType>
diff --git a/src/IceInstX86Base.h b/src/IceInstX86Base.h
index 489ffea..62d6c61 100644
--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -164,6 +164,7 @@
       Pxor,
       Ret,
       Rol,
+      Round,
       Sar,
       Sbb,
       SbbRMW,
@@ -2564,6 +2565,25 @@
     InstX86Cvt(Cfg *Func, Variable *Dest, Operand *Source, CvtVariant Variant);
   };
 
+  /// Round instruction
+  class InstX86Round final
+      : public InstX86BaseThreeAddressop<InstX86Base::Round> {
+  public:
+    static InstX86Round *create(Cfg *Func, Variable *Dest, Operand *Source,
+                                Operand *Imm) {
+      return new (Func->allocate<InstX86Round>())
+          InstX86Round(Func, Dest, Source, Imm);
+    }
+
+    void emit(const Cfg *Func) const override;
+    void emitIAS(const Cfg *Func) const override;
+
+  private:
+    InstX86Round(Cfg *Func, Variable *Dest, Operand *Source, Operand *Imm)
+        : InstX86BaseThreeAddressop<InstX86Base::Round>(Func, Dest, Source,
+                                                        Imm) {}
+  };
+
   /// cmp - Integer compare instruction.
   class InstX86Icmp final : public InstX86Base {
     InstX86Icmp() = delete;
@@ -3229,6 +3249,7 @@
   using Cmpxchg = typename InstImpl<TraitsType>::InstX86Cmpxchg;
   using Cmpxchg8b = typename InstImpl<TraitsType>::InstX86Cmpxchg8b;
   using Cvt = typename InstImpl<TraitsType>::InstX86Cvt;
+  using Round = typename InstImpl<TraitsType>::InstX86Round;
   using Icmp = typename InstImpl<TraitsType>::InstX86Icmp;
   using Ucomiss = typename InstImpl<TraitsType>::InstX86Ucomiss;
   using UD2 = typename InstImpl<TraitsType>::InstX86UD2;
@@ -3494,6 +3515,9 @@
       "insertps";                                                              \
   template <>                                                                  \
   template <>                                                                  \
+  const char *InstImpl<TraitsType>::InstX86Round::Base::Opcode = "round";      \
+  template <>                                                                  \
+  template <>                                                                  \
   const char *InstImpl<TraitsType>::InstX86Shufps::Base::Opcode = "shufps";    \
   template <>                                                                  \
   template <>                                                                  \
diff --git a/src/IceInstX86BaseImpl.h b/src/IceInstX86BaseImpl.h
index 8eae1b3..b738a02 100644
--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -1767,6 +1767,35 @@
 }
 
 template <typename TraitsType>
+void InstImpl<TraitsType>::InstX86Round::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getSrcSize() == 2); // Srcs: value to round, rounding mode
+  Str << "\t" << this->Opcode
+      << Traits::TypeAttributes[this->getDest()->getType()].SpSdString
+      << "\t";
+  this->getSrc(1)->emit(Func);
+  Str << ", ";
+  this->getSrc(0)->emit(Func);
+  Str << ", ";
+  this->getDest()->emit(Func);
+}
+
+template <typename TraitsType>
+void InstImpl<TraitsType>::InstX86Round::emitIAS(const Cfg *Func) const {
+  assert(this->getSrcSize() == 2);
+  assert(InstX86Base::getTarget(Func)->getInstructionSet() >= Traits::SSE4_1);
+  const Variable *Dest = this->getDest();
+  Type Ty = Dest->getType();
+  static const ThreeOpImmEmitter<XmmRegister, XmmRegister> Emitter = {
+      &Assembler::round, &Assembler::round};
+  emitIASThreeOpImmOps<XmmRegister, XmmRegister, Traits::getEncodedXmm,
+                       Traits::getEncodedXmm>(Func, Ty, Dest, this->getSrc(0),
+                                              this->getSrc(1), Emitter);
+}
+
+template <typename TraitsType>
 void InstImpl<TraitsType>::InstX86Icmp::emit(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
diff --git a/src/IceIntrinsics.h b/src/IceIntrinsics.h
index 3e2a738..c696fca 100644
--- a/src/IceIntrinsics.h
+++ b/src/IceIntrinsics.h
@@ -68,6 +68,7 @@
     MultiplyAddPairs,
     MultiplyHighSigned,
     MultiplyHighUnsigned,
+    Round,
     SignMask,
     StoreSubVector,
     SubtractSaturateSigned,
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index e6276ce..8e0ce83 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -619,6 +619,10 @@
     AutoMemorySandboxer<> _(this, &Dest, &Src0);
     Context.insert<typename Traits::Insts::Cvt>(Dest, Src0, Variant);
   }
+  void _round(Variable *Dest, Operand *Src0, Operand *Imm) {
+    AutoMemorySandboxer<> _(this, &Dest, &Src0);
+    Context.insert<typename Traits::Insts::Round>(Dest, Src0, Imm);
+  }
   void _div(Variable *Dest, Operand *Src0, Operand *Src1) {
     AutoMemorySandboxer<> _(this, &Dest, &Src0, &Src1);
     Context.insert<typename Traits::Insts::Div>(Dest, Src0, Src1);
@@ -894,6 +898,10 @@
     AutoMemorySandboxer<> _(this, &Dest, &Src0);
     Context.insert<typename Traits::Insts::Rol>(Dest, Src0);
   }
+  void _round(Variable *Dest, Operand *Src, Constant *Imm) {
+    AutoMemorySandboxer<> _(this, &Dest, &Src);
+    Context.insert<typename Traits::Insts::Round>(Dest, Src, Imm);
+  }
   X86OperandMem *_sandbox_mem_reference(X86OperandMem *Mem) {
     return dispatchToConcrete(&Traits::ConcreteTarget::_sandbox_mem_reference,
                               std::move(Mem));
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index a5de2ae..3472b00 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -4557,6 +4557,21 @@
     _movp(Dest, T);
     return;
   }
+  case Intrinsics::Round: {
+    Variable *Dest = Instr->getDest();
+    Operand *Src = Instr->getArg(0);
+    Operand *Mode = Instr->getArg(1);
+    assert(llvm::isa<ConstantInteger32>(Mode) &&
+           "Round last argument must be a constant");
+    auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
+    int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue();
+    (void)Imm;
+    assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode");
+    auto *T = makeReg(Dest->getType());
+    _round(T, SrcRM, Mode);
+    _movp(Dest, T);
+    return;
+  }
   default: // UnknownIntrinsic
     Func->setError("Unexpected intrinsic");
     return;
diff --git a/unittest/AssemblerX8632/XmmArith.cpp b/unittest/AssemblerX8632/XmmArith.cpp
index d21f153..da2ad72 100644
--- a/unittest/AssemblerX8632/XmmArith.cpp
+++ b/unittest/AssemblerX8632/XmmArith.cpp
@@ -2069,8 +2069,9 @@
                                                                                \
     __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
     __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
-    __ roundsd(XmmRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src, \
-               AssemblerX8632::k##Mode);                                       \
+    __ round(IceType_f64, XmmRegister::Encoded_Reg_##Dst,                      \
+             XmmRegister::Encoded_Reg_##Src,                                   \
+             Immediate(AssemblerX8632::k##Mode));                              \
                                                                                \
     AssembledTest test = assemble();                                           \
     test.setDqwordTo(T0, V0);                                                  \
diff --git a/unittest/AssemblerX8664/XmmArith.cpp b/unittest/AssemblerX8664/XmmArith.cpp
index 72c6730..25dedfa 100644
--- a/unittest/AssemblerX8664/XmmArith.cpp
+++ b/unittest/AssemblerX8664/XmmArith.cpp
@@ -2194,8 +2194,8 @@
                                                                                \
     __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
     __ movups(Encoded_Xmm_##Src(), dwordAddress(T1));                          \
-    __ roundsd(Encoded_Xmm_##Dst(), Encoded_Xmm_##Src(),                       \
-               AssemblerX8664::k##Mode);                                       \
+    __ round(IceType_f64, Encoded_Xmm_##Dst(), Encoded_Xmm_##Src(),            \
+             Immediate(AssemblerX8664::k##Mode));                              \
                                                                                \
     AssembledTest test = assemble();                                           \
     test.setDqwordTo(T0, V0);                                                  \