Subzero: ARM32: lowering of vector insert and extract.

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1655313002 .
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index 6b9c5a0..e9b4f61 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -20,7 +20,6 @@
 #include "IceCfgNode.h"
 #include "IceInst.h"
 #include "IceOperand.h"
-#include "IceRegistersARM32.h"
 #include "IceTargetLoweringARM32.h"
 
 namespace Ice {
@@ -28,6 +27,8 @@
 
 namespace {
 
+using Register = RegARM32::AllRegisters;
+
 // maximum number of registers allowed in vpush/vpop.
 static constexpr SizeT VpushVpopMaxConsecRegs = 16;
 
@@ -1043,6 +1044,132 @@
   }
 }
 
+// These next two functions find the D register that holds the half of the Q
+// register being accessed (getDRegister) and the element's index in that half.
+Register getDRegister(const Variable *Src, uint32_t Index) {
+  assert(Src->hasReg());
+  const auto SrcReg = static_cast<Register>(Src->getRegNum());
+
+  const RegARM32::RegTableType &SrcEntry = RegARM32::RegTable[SrcReg];
+  assert(SrcEntry.IsVec128);
+
+  const uint32_t NumElements = typeNumElements(Src->getType());
+
+  // This code assumes the Aliases list goes Q_n, D_2n, D_2n+1. The asserts in
+  // the next two branches help to check that this is still true.
+  if (Index < NumElements / 2) {
+    // The element is in the first half of the vector, so use the lower-numbered
+    // of the Q register's two D registers. The assert checks that ordering.
+    //
+    // TODO(jpp): find a way to do this that doesn't rely on ordering of the
+    // alias list.
+    assert(RegARM32::RegTable[SrcEntry.Aliases[1]].Encoding + 1 ==
+           RegARM32::RegTable[SrcEntry.Aliases[2]].Encoding);
+    return static_cast<Register>(SrcEntry.Aliases[1]);
+  } else {
+    // The element is in the second half of the vector, so use the
+    // higher-numbered of the Q register's two D registers (checked below).
+    //
+    // TODO(jpp): find a way to do this that doesn't rely on ordering of the
+    // alias list.
+    assert(RegARM32::RegTable[SrcEntry.Aliases[2]].Encoding - 1 ==
+           RegARM32::RegTable[SrcEntry.Aliases[1]].Encoding);
+    return static_cast<Register>(SrcEntry.Aliases[2]);
+  }
+}
+
+constexpr uint32_t getDIndex(uint32_t NumElements, uint32_t Index) {
+  return (Index < NumElements / 2) ? Index : Index - (NumElements / 2);
+}
+
+// For floating point values, we can insertelement or extractelement by moving
+// directly to or from an S register. This function finds the right one.
+Register getSRegister(const Variable *Src, uint32_t Index) {
+  assert(Src->hasReg());
+  const auto SrcReg = static_cast<Register>(Src->getRegNum());
+
+  // For floating point values, we need to be allocated to Q0 - Q7, so we can
+  // directly access the value we want as one of the S registers.
+  assert(Src->getType() == IceType_v4f32);
+  assert(SrcReg < RegARM32::Reg_q8);
+
+  // The alias list is assumed to go q0, d0, d1, s0..s3, so S regs start at [3].
+  assert(Index < 4);
+
+  // TODO(jpp): find a way to do this that doesn't rely on ordering of the alias
+  // list.
+  return static_cast<Register>(RegARM32::RegTable[SrcReg].Aliases[Index + 3]);
+}
+
+void InstARM32Extract::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  const Type DestTy = getDest()->getType();
+
+  const auto *Src = llvm::cast<Variable>(getSrc(0));
+
+  if (isIntegerType(DestTy)) { // Integer element: vmov.<dt> Rd, Dn[x].
+    Str << "\t"
+        << "vmov" << getPredicate();
+    const uint32_t BitSize = typeWidthInBytes(DestTy) * CHAR_BIT;
+    if (BitSize < 32) {
+      Str << ".s" << BitSize; // Narrow elements get a signed type suffix.
+    } else {
+      Str << "." << BitSize;
+    }
+    Str << "\t";
+    getDest()->emit(Func);
+    Str << ", ";
+
+    const size_t VectorSize = typeNumElements(Src->getType());
+
+    const Register SrcReg = getDRegister(Src, Index); // D half with Index.
+
+    Str << RegARM32::RegTable[SrcReg].Name;
+    Str << "[" << getDIndex(VectorSize, Index) << "]";
+  } else if (isFloatingType(DestTy)) { // Float element: vmov.f32 from S reg.
+    const Register SrcReg = getSRegister(Src, Index);
+
+    Str << "\t"
+        << "vmov" << getPredicate() << ".f32"
+        << "\t";
+    getDest()->emit(Func);
+    Str << ", " << RegARM32::RegTable[SrcReg].Name;
+  } else {
+    assert(false && "Invalid extract type");
+  }
+}
+
+void InstARM32Insert::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  const Variable *Dest = getDest();
+  const Type DestTy = Dest->getType();
+
+  const auto *Src = llvm::cast<Variable>(getSrc(0));
+
+  if (isIntegerType(DestTy)) { // Integer element: vmov.<size> Dn[x], Rd.
+    Str << "\t"
+        << "vmov" << getPredicate();
+    const size_t BitSize = typeWidthInBytes(typeElementType(DestTy)) * CHAR_BIT;
+    Str << "." << BitSize << "\t";
+
+    const size_t VectorSize = typeNumElements(DestTy);
+    const Register DestReg = getDRegister(Dest, Index); // D half with Index.
+    const uint32_t DIndex = getDIndex(VectorSize, Index); // Index in that half.
+    Str << RegARM32::RegTable[DestReg].Name;
+    Str << "[" << DIndex << "], ";
+    Src->emit(Func);
+  } else if (isFloatingType(DestTy)) { // Float element: vmov.f32 to S reg.
+    Str << "\t"
+        << "vmov" << getPredicate() << ".f32"
+        << "\t";
+    const Register DestReg = getSRegister(Dest, Index);
+    Str << RegARM32::RegTable[DestReg].Name << ", ";
+    Src->emit(Func);
+  } else {
+    assert(false && "Invalid insert type");
+  }
+}
+
 template <InstARM32::InstKindARM32 K>
 void InstARM32CmpLike<K>::emitIAS(const Cfg *Func) const {
   emitUsingTextFixup(Func);
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index 2be562a..56ca8af 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -23,6 +23,7 @@
 #include "IceInst.h"
 #include "IceInstARM32.def"
 #include "IceOperand.h"
+#include "IceRegistersARM32.h"
 
 namespace Ice {
 namespace ARM32 {
@@ -389,6 +390,8 @@
     Cmp,
     Dmb,
     Eor,
+    Extract,
+    Insert,
     Label,
     Ldr,
     Ldrex,
@@ -1349,6 +1352,62 @@
   Variable *DestHi = nullptr;
 };
 
+/// Extracts one vector element: generates vmov Rd, Dn[x] for integer elements
+/// and a vmov.f32 from the aliasing S register for floating point elements.
+class InstARM32Extract final : public InstARM32Pred {
+  InstARM32Extract() = delete;
+  InstARM32Extract(const InstARM32Extract &) = delete;
+  InstARM32Extract &operator=(const InstARM32Extract &) = delete;
+
+public:
+  static InstARM32Extract *create(Cfg *Func, Variable *Dest, Variable *Src0,
+                                  uint32_t Index, CondARM32::Cond Predicate) {
+    return new (Func->allocate<InstARM32Extract>())
+        InstARM32Extract(Func, Dest, Src0, Index, Predicate);
+  }
+  void emit(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Extract); }
+
+private:
+  InstARM32Extract(Cfg *Func, Variable *Dest, Variable *Src0, uint32_t Index,
+                   CondARM32::Cond Predicate)
+      : InstARM32Pred(Func, InstARM32::Extract, 1, Dest, Predicate),
+        Index(Index) {
+    assert(Index < typeNumElements(Src0->getType())); // Must name a real lane.
+    addSource(Src0);
+  }
+
+  const uint32_t Index; // Which element of the source vector to extract.
+};
+
+/// Inserts one vector element: generates vmov Dn[x], Rd for integer elements
+/// and a vmov.f32 to the aliasing S register for floating point elements.
+class InstARM32Insert final : public InstARM32Pred {
+  InstARM32Insert() = delete;
+  InstARM32Insert(const InstARM32Insert &) = delete;
+  InstARM32Insert &operator=(const InstARM32Insert &) = delete;
+
+public:
+  static InstARM32Insert *create(Cfg *Func, Variable *Dest, Variable *Src0,
+                                 uint32_t Index, CondARM32::Cond Predicate) {
+    return new (Func->allocate<InstARM32Insert>())
+        InstARM32Insert(Func, Dest, Src0, Index, Predicate);
+  }
+  void emit(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Insert); }
+
+private:
+  InstARM32Insert(Cfg *Func, Variable *Dest, Variable *Src0, uint32_t Index,
+                  CondARM32::Cond Predicate)
+      : InstARM32Pred(Func, InstARM32::Insert, 1, Dest, Predicate),
+        Index(Index) {
+    assert(Index < typeNumElements(Dest->getType())); // Must name a real lane.
+    addSource(Src0);
+  }
+
+  const uint32_t Index; // Which element of the destination vector to write.
+};
+
 class InstARM32Vcmp final : public InstARM32Pred {
   InstARM32Vcmp() = delete;
   InstARM32Vcmp(const InstARM32Vcmp &) = delete;
diff --git a/src/IceRegistersARM32.h b/src/IceRegistersARM32.h
index 94f45c1..1ce05ab 100644
--- a/src/IceRegistersARM32.h
+++ b/src/IceRegistersARM32.h
@@ -219,8 +219,12 @@
   return RegTable[RegNum].Name;
 }
 
-// Extend enum RegClass with ARM32-specific register classes (if any).
-enum RegClassARM32 : uint8_t { RCARM32_NUM = RC_Target };
+// Extend enum RegClass with ARM32-specific register classes.
+enum RegClassARM32 : uint8_t {
+  RCARM32_QtoS = RC_Target, // Denotes Q registers that are aliased by S
+                            // registers.
+  RCARM32_NUM // One past the last register class; keep this entry last.
+};
 
 } // end of namespace RegARM32
 } // end of namespace ARM32
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 4b70dad..099ceb2 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -296,7 +296,9 @@
   llvm::SmallBitVector Float32Registers(RegARM32::Reg_NUM);
   llvm::SmallBitVector Float64Registers(RegARM32::Reg_NUM);
   llvm::SmallBitVector VectorRegisters(RegARM32::Reg_NUM);
+  llvm::SmallBitVector QtoSRegisters(RegARM32::Reg_NUM);
   llvm::SmallBitVector InvalidRegisters(RegARM32::Reg_NUM);
+  const unsigned EncodedReg_q8 = RegARM32::RegTable[RegARM32::Reg_q8].Encoding;
   for (int i = 0; i < RegARM32::Reg_NUM; ++i) {
     const auto &Entry = RegARM32::RegTable[i];
     IntegerRegisters[i] = Entry.IsInt;
@@ -305,6 +307,9 @@
     Float64Registers[i] = Entry.IsFP64;
     VectorRegisters[i] = Entry.IsVec128;
     RegisterAliases[i].resize(RegARM32::Reg_NUM);
+    // TODO(eholk): It would be better to store a QtoS flag in the
+    // IceRegistersARM32 table than to compare their encodings here.
+    QtoSRegisters[i] = Entry.IsVec128 && Entry.Encoding < EncodedReg_q8;
     for (int j = 0; j < Entry.NumAliases; ++j) {
       assert(i == j || !RegisterAliases[i][Entry.Aliases[j]]);
       RegisterAliases[i].set(Entry.Aliases[j]);
@@ -340,6 +345,7 @@
   TypeToRegisterSet[IceType_v8i16] = VectorRegisters;
   TypeToRegisterSet[IceType_v4i32] = VectorRegisters;
   TypeToRegisterSet[IceType_v4f32] = VectorRegisters;
+  TypeToRegisterSet[RegARM32::RCARM32_QtoS] = QtoSRegisters;
 
   for (size_t i = 0; i < llvm::array_lengthof(TypeToRegisterSet); ++i)
     TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
@@ -3834,7 +3840,28 @@
 }
 
 void TargetARM32::lowerExtractElement(const InstExtractElement *Instr) {
-  UnimplementedLoweringError(this, Instr);
+  Variable *Dest = Instr->getDest();
+  Type DestTy = Dest->getType();
+
+  Variable *Src0 = legalizeToReg(Instr->getSrc(0));
+  Operand *Src1 = Instr->getSrc(1); // Element index operand.
+
+  if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+    const uint32_t Index = Imm->getValue();
+    Variable *T = makeReg(DestTy);
+    Variable *TSrc0 = makeReg(Src0->getType());
+
+    if (isFloatingType(DestTy)) {
+      // Float elements are read via an S register, so restrict to Q0-Q7.
+      TSrc0->setRegClass(RegARM32::RCARM32_QtoS);
+    }
+
+    _mov(TSrc0, Src0); // Copy into a register of the required class.
+    _extractelement(T, TSrc0, Index);
+    _mov(Dest, T);
+    return;
+  }
+  assert(false && "extractelement requires a constant index");
 }
 
 namespace {
@@ -4229,7 +4256,28 @@
 }
 
 void TargetARM32::lowerInsertElement(const InstInsertElement *Instr) {
-  UnimplementedLoweringError(this, Instr);
+  Variable *Dest = Instr->getDest();
+  Type DestTy = Dest->getType();
+
+  Variable *Src0 = legalizeToReg(Instr->getSrc(0)); // Vector to insert into.
+  Variable *Src1 = legalizeToReg(Instr->getSrc(1)); // Value being inserted.
+  Operand *Src2 = Instr->getSrc(2); // Element index operand.
+
+  if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src2)) {
+    const uint32_t Index = Imm->getValue();
+    Variable *T = makeReg(DestTy);
+
+    if (isFloatingType(DestTy)) {
+      T->setRegClass(RegARM32::RCARM32_QtoS); // S-register access needs Q0-Q7.
+    }
+
+    _mov(T, Src0); // Start from a copy of the source vector.
+    _insertelement(T, Src1, Index); // Overwrites one element of T.
+    _set_dest_redefined();
+    _mov(Dest, T);
+    return;
+  }
+  assert(false && "insertelement requires a constant index");
 }
 
 namespace {
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 00f1483..b35649c 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -85,13 +85,18 @@
   const llvm::SmallBitVector &
   getRegistersForVariable(const Variable *Var) const override {
     RegClass RC = Var->getRegClass();
-    assert(RC < RC_Target);
-    return TypeToRegisterSet[RC];
+    switch (RC) {
+    default: // Target-independent register classes.
+      assert(RC < RC_Target);
+      return TypeToRegisterSet[RC];
+    case RegARM32::RCARM32_QtoS: // ARM32-specific class (Q regs aliased by S).
+      return TypeToRegisterSet[RC];
+    }
   }
   const llvm::SmallBitVector &
   getAllRegistersForVariable(const Variable *Var) const override {
     RegClass RC = Var->getRegClass();
-    assert(RC < RC_Target);
+    assert(static_cast<RegARM32::RegClassARM32>(RC) < RegARM32::RCARM32_NUM);
     return TypeToRegisterSetUnfiltered[RC];
   }
   const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
@@ -413,6 +418,20 @@
     }
   }
 
+  // Adds an InstARM32Extract (vmov) that reads element Index of the vector
+  // register Src0 into Dest, predicated on Pred.
+  void _extractelement(Variable *Dest, Variable *Src0, uint32_t Index,
+                       CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert<InstARM32Extract>(Dest, Src0, Index, Pred);
+  }
+
+  // Adds an InstARM32Insert (vmov) that writes Src0 into element Index of the
+  // vector register Dest, predicated on Pred.
+  void _insertelement(Variable *Dest, Variable *Src0, uint32_t Index,
+                      CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert<InstARM32Insert>(Dest, Src0, Index, Pred);
+  }
+
   // --------------------------------------------------------------------------
   // Begin bool folding machinery.
   //