Add insert/extract element to the integrated ARM assembler.

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4334
R=jpp@chromium.org

Review URL: https://codereview.chromium.org/1679023008 .
diff --git a/src/DartARM32/assembler_arm.cc b/src/DartARM32/assembler_arm.cc
index 106ecb4..8933b26 100644
--- a/src/DartARM32/assembler_arm.cc
+++ b/src/DartARM32/assembler_arm.cc
@@ -683,7 +683,8 @@
   Emit(encoding);
 }
 
-
+#if 0
+// Moved to ARM32::AssemblerARM32::vmovqir().
 void Assembler::vmovdr(DRegister dn, int i, Register rt, Condition cond) {
   ASSERT(TargetCPUFeatures::vfp_supported());
   ASSERT((i == 0) || (i == 1));
@@ -701,7 +702,6 @@
   Emit(encoding);
 }
 
-#if 0
 // Moved to ARM32::AssemblerARM32::vmovdrr().
 void Assembler::vmovdrr(DRegister dm, Register rt, Register rt2,
                         Condition cond) {
diff --git a/src/DartARM32/assembler_arm.h b/src/DartARM32/assembler_arm.h
index d483053..9c6d9b1 100644
--- a/src/DartARM32/assembler_arm.h
+++ b/src/DartARM32/assembler_arm.h
@@ -630,9 +630,8 @@
   void vmovdrr(DRegister dm, Register rt, Register rt2, Condition cond = AL);
   // Moved to ARM32::AssemblerARM32::vmovrrd().
   void vmovrrd(Register rt, Register rt2, DRegister dm, Condition cond = AL);
-#endif
+  // Moved to ARM32::AssemblerARM32::vmovqir().
   void vmovdr(DRegister dd, int i, Register rt, Condition cond = AL);
-#if 0
   // Moved to ARM32::AssemblerARM32::vmovss().
   void vmovs(SRegister sd, SRegister sm, Condition cond = AL);
   // Moved to ARM32::AssemblerARM32::vmovdd().
@@ -1409,6 +1408,7 @@
   // ARM32::AssemblerARM32::veord()
   // ARM32::AssemblerARM32::vld1qr()
   // ARM32::AssemblerARM32::vst1qr()
+  // ARM32::AssemblerARM32::vmovrqi()
 #endif
 
   DISALLOW_ALLOCATION();
diff --git a/src/IceAssemblerARM32.cpp b/src/IceAssemblerARM32.cpp
index ba328c8..a9ef88e 100644
--- a/src/IceAssemblerARM32.cpp
+++ b/src/IceAssemblerARM32.cpp
@@ -158,7 +158,7 @@
     return 3;
   default:
     llvm::report_fatal_error("SIMD op: Don't understand element type " +
-                             std::string(typeString(ElmtTy)));
+                             typeIceString(ElmtTy));
   }
 }
 
@@ -213,7 +213,17 @@
   return RegARM32::getEncodedQReg(Var->getRegNum());
 }
 
-IValueT mapQRegToDReg(IValueT EncodedQReg) { return EncodedQReg << 1; }
+IValueT mapQRegToDReg(IValueT EncodedQReg) {
+  const IValueT EncodedDReg = EncodedQReg << 1; // D number is twice Q number.
+  assert(EncodedDReg < RegARM32::getNumDRegs());
+  return EncodedDReg;
+}
+
+IValueT mapQRegToSReg(IValueT EncodedQReg) {
+  const IValueT EncodedSReg = EncodedQReg << 2; // S number is 4x the Q number.
+  assert(EncodedSReg < RegARM32::getNumSRegs());
+  return EncodedSReg;
+}
 
 IValueT getYInRegXXXXY(IValueT RegXXXXY) { return RegXXXXY & 0x1; }
 
@@ -1010,6 +1020,60 @@
   emitInst(Encoding);
 }
 
+void AssemblerARM32::emitInsertExtractInt(CondARM32::Cond Cond,
+                                          const Operand *OpQn, uint32_t Index,
+                                          const Operand *OpRt, bool IsExtract,
+                                          const char *InstName) {
+  const IValueT Rt = encodeGPRegister(OpRt, "Rt", InstName);
+  IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", InstName)); // 2*Qn
+  assert(Rt != RegARM32::Encoded_Reg_pc);
+  assert(Rt != RegARM32::Encoded_Reg_sp);
+  assert(CondARM32::isDefined(Cond));
+  const uint32_t BitSize = typeWidthInBytes(OpRt->getType()) * CHAR_BIT;
+  IValueT Opcode1 = 0; // Shifted to bit 21 of the instruction below.
+  IValueT Opcode2 = 0; // Shifted to bit 5 of the instruction below.
+  switch (BitSize) {
+  default: // Anything but 8, 16 or 32 bits has no encoding here.
+    llvm::report_fatal_error(std::string(InstName) +
+                             ": Unable to process type " +
+                             typeIceString(OpRt->getType()));
+  case 8: // Byte lanes (assumes mask(V, S, N) = N bits of V from bit S).
+    assert(Index < 16);
+    Dn = Dn | mask(Index, 3, 1); // Index bit 3 picks D(2n+1) over D(2n).
+    Opcode1 = B1 | mask(Index, 2, 1); // 1x: byte marker plus Index bit 2.
+    Opcode2 = mask(Index, 0, 2); // Index bits 1:0.
+    break;
+  case 16: // Halfword lanes.
+    assert(Index < 8);
+    Dn = Dn | mask(Index, 2, 1); // Index bit 2 picks D(2n+1) over D(2n).
+    Opcode1 = mask(Index, 1, 1); // 0x: Index bit 1.
+    Opcode2 = (mask(Index, 0, 1) << 1) | B0; // x1: Index bit 0, then marker.
+    break;
+  case 32: // Word lanes; Opcode2 stays 00.
+    assert(Index < 4);
+    Dn = Dn | mask(Index, 1, 1); // Index bit 1 picks D(2n+1) over D(2n).
+    Opcode1 = mask(Index, 0, 1); // 0x: Index bit 0.
+    break;
+  }
+  const IValueT Encoding = B27 | B26 | B25 | B11 | B9 | B8 | B4 |
+                           (encodeCondition(Cond) << kConditionShift) |
+                           (Opcode1 << 21) |
+                           (getXXXXInRegYXXXX(Dn) << kRnShift) | (Rt << 12) |
+                           (encodeBool(IsExtract) << 20) | // 1 = lane to Rt.
+                           (getYInRegYXXXX(Dn) << 7) | (Opcode2 << 5);
+  emitInst(Encoding);
+}
+
+void AssemblerARM32::emitMoveSS(CondARM32::Cond Cond, IValueT Sd, IValueT Sm) {
+  // Emits "vmov<c>.f32 <Sd>, <Sm>", i.e. VMOV (register), ARM section
+  // A8.8.340, encoding A2:
+  //
+  // cccc11101D110000dddd101001M0mmmm where cccc=Cond, ddddD=Sd, and mmmmM=Sm.
+  constexpr IValueT UnusedSn = 0; // Zero for the operand slot vmov lacks.
+  constexpr IValueT Opcode = B23 | B21 | B20 | B6;
+  emitVFPsss(Cond, Opcode, Sd, UnusedSn, Sm);
+}
+
 void AssemblerARM32::emitMulOp(CondARM32::Cond Cond, IValueT Opcode, IValueT Rd,
                                IValueT Rn, IValueT Rm, IValueT Rs,
                                bool SetFlags) {
@@ -2654,6 +2718,33 @@
   emitInst(Encoding);
 }
 
+void AssemblerARM32::vmovqir(const Operand *OpQn, uint32_t Index,
+                             const Operand *OpRt, CondARM32::Cond Cond) {
+  // VMOV (ARM core register to scalar) - ARM section A8.8.341, encoding A1:
+  //   vmov<c>.<size> <Dn[x]>, <Rt>
+  constexpr const char *Vmovqir = "vmovqir";
+  constexpr bool IsExtract = true;
+  emitInsertExtractInt(Cond, OpQn, Index, OpRt, !IsExtract, Vmovqir);
+}
+
+void AssemblerARM32::vmovqis(const Operand *OpQd, uint32_t Index,
+                             const Operand *OpSm, CondARM32::Cond Cond) {
+  constexpr const char *InstName = "vmovqis";
+  assert(Index < 4); // Lane Index of Qd is addressed as an S register.
+  const IValueT QdBase = mapQRegToSReg(encodeQRegister(OpQd, "Qd", InstName));
+  const IValueT Src = encodeSRegister(OpSm, "Sm", InstName);
+  emitMoveSS(Cond, QdBase + Index, Src);
+}
+
+void AssemblerARM32::vmovrqi(const Operand *OpRt, const Operand *OpQn,
+                             uint32_t Index, CondARM32::Cond Cond) {
+  // VMOV (scalar to ARM core register) - ARM section A8.8.342, encoding A1:
+  //   vmov<c>.<dt> <Rt>, <Dn[x]>
+  constexpr const char *Vmovrqi = "vmovrqi";
+  constexpr bool IsExtract = true;
+  emitInsertExtractInt(Cond, OpQn, Index, OpRt, IsExtract, Vmovrqi);
+}
+
 void AssemblerARM32::vmovrrd(const Operand *OpRt, const Operand *OpRt2,
                              const Operand *OpDm, CondARM32::Cond Cond) {
   // VMOV (between two ARM core registers and a doubleword extension register).
@@ -2716,16 +2807,20 @@
 
 void AssemblerARM32::vmovss(const Operand *OpSd, const Variable *OpSm,
                             CondARM32::Cond Cond) {
-  // VMOV (register) - ARM section A8.8.340, encoding A2:
-  //   vmov<c>.f32 <Sd>, <Sm>
-  //
-  // cccc11101D110000dddd101001M0mmmm where cccc=Cond, ddddD=Sd, and mmmmM=Sm.
   constexpr const char *Vmovss = "Vmovss";
   IValueT Sd = encodeSRegister(OpSd, "Sd", Vmovss);
   IValueT Sm = encodeSRegister(OpSm, "Sm", Vmovss);
-  constexpr IValueT VmovssOpcode = B23 | B21 | B20 | B6;
-  constexpr IValueT S0 = 0;
-  emitVFPsss(Cond, VmovssOpcode, Sd, S0, Sm);
+  emitMoveSS(Cond, Sd, Sm);
+}
+
+void AssemblerARM32::vmovsqi(const Operand *OpSd, const Operand *OpQm,
+                             uint32_t Index, CondARM32::Cond Cond) {
+  // Sd = Qm[Index]; lane Index of Qm is addressed as an S register.
+  constexpr const char *InstName = "vmovsqi";
+  const IValueT Dst = encodeSRegister(OpSd, "Sd", InstName);
+  assert(Index < 4);
+  const IValueT QmBase = mapQRegToSReg(encodeQRegister(OpQm, "Qm", InstName));
+  emitMoveSS(Cond, Dst, QmBase + Index);
 }
 
 void AssemblerARM32::vmovsr(const Operand *OpSn, const Operand *OpRt,
diff --git a/src/IceAssemblerARM32.h b/src/IceAssemblerARM32.h
index 3e6d5a3..3ae857b 100644
--- a/src/IceAssemblerARM32.h
+++ b/src/IceAssemblerARM32.h
@@ -417,24 +417,48 @@
     vld1qr(ElmtSize, OpQd, OpRn, TInfo);
   }
 
+  // Dn = FpImm
   void vmovd(const Operand *OpDn, const OperandARM32FlexFpImm *OpFpImm,
              CondARM32::Cond Cond);
 
+  // Dd = Dm
   void vmovdd(const Operand *OpDd, const Variable *OpDm, CondARM32::Cond Cond);
 
+  // Dm = Rt:Rt2
   void vmovdrr(const Operand *OpDm, const Operand *OpRt, const Operand *OpRt2,
                CondARM32::Cond Cond);
 
+  // Qn[Index] = Rt
+  void vmovqir(const Operand *OpQn, uint32_t Index, const Operand *OpRt,
+               CondARM32::Cond Cond);
+
+  // Qd[Index] = Sm
+  void vmovqis(const Operand *OpQd, uint32_t Index, const Operand *OpSm,
+               CondARM32::Cond Cond);
+
+  // Rt = Qn[Index]
+  void vmovrqi(const Operand *OpRt, const Operand *OpQn, uint32_t Index,
+               CondARM32::Cond Cond);
+
+  // Rt:Rt2 = Dm
   void vmovrrd(const Operand *OpRt, const Operand *OpRt2, const Operand *OpDm,
                CondARM32::Cond Cond);
 
+  // Rt = Sn
   void vmovrs(const Operand *OpRt, const Operand *OpSn, CondARM32::Cond Cond);
 
+  // Sn = FpImm
   void vmovs(const Operand *OpSn, const OperandARM32FlexFpImm *OpFpImm,
              CondARM32::Cond Cond);
 
-  void vmovss(const Operand *OpDd, const Variable *OpDm, CondARM32::Cond Cond);
+  // Sd = Sm
+  void vmovss(const Operand *OpSd, const Variable *OpSm, CondARM32::Cond Cond);
 
+  // Sd = Qm[Index]
+  void vmovsqi(const Operand *OpSd, const Operand *OpQm, uint32_t Index,
+               CondARM32::Cond Cond);
+
+  // Sn = Rt
   void vmovsr(const Operand *OpSn, const Operand *OpRt, CondARM32::Cond Cond);
 
   void vmlad(const Operand *OpDd, const Operand *OpDn, const Operand *OpDm,
@@ -641,6 +665,17 @@
   void emitDivOp(CondARM32::Cond Cond, IValueT Opcode, IValueT Rd, IValueT Rn,
                  IValueT Rm);
 
+  // cccc1110iiiennnntttt1011Njj10000 where cccc=Cond, tttt=Rt, Nnnnn=Dn (the
+  // half of Qn that holds element Index), iii=Opcode1, jj=Opcode2 (together
+  // they encode the element size and lane), and e=IsExtract.
+  void emitInsertExtractInt(CondARM32::Cond Cond, const Operand *OpQn,
+                            uint32_t Index, const Operand *OpRt, bool IsExtract,
+                            const char *InstName);
+
+  // cccc11101D110000dddd101001M0mmmm where cccc=Cond, ddddD=Sd, and mmmmM=Sm.
+  // Assigns Sd the value of Sm.
+  void emitMoveSS(CondARM32::Cond Cond, IValueT Sd, IValueT Sm);
+
   // Pattern ccccxxxxxxxfnnnnddddssss1001mmmm where cccc=Cond, dddd=Rd, nnnn=Rn,
   // mmmm=Rm, ssss=Rs, f=SetFlags and xxxxxxx=Opcode.
   void emitMulOp(CondARM32::Cond Cond, IValueT Opcode, IValueT Rd, IValueT Rn,
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index 4538aa9..4a0fc4c 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -1067,6 +1067,8 @@
   }
 }
 
+namespace {
+
 // These next two functions find the D register that maps to the half of the Q
 // register that this instruction is accessing.
 Register getDRegister(const Variable *Src, uint32_t Index) {
@@ -1124,6 +1126,8 @@
   return static_cast<Register>(RegARM32::RegTable[SrcReg].Aliases[Index + 3]);
 }
 
+} // end of anonymous namespace
+
 void InstARM32Extract::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   const Type DestTy = getDest()->getType();
@@ -1162,6 +1166,23 @@
   }
 }
 
+void InstARM32Extract::emitIAS(const Cfg *Func) const {
+  // Integer lanes go to a core register, float lanes to an S register.
+  const Operand *Vec = getSrc(0);
+  const Operand *Elmt = getDest();
+  const Type ElmtTy = Elmt->getType();
+  assert(isVectorType(Vec->getType()));
+  assert(ElmtTy == typeElementType(Vec->getType()));
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  if (isIntegerType(ElmtTy)) {
+    Asm->vmovrqi(Elmt, Vec, Index, getPredicate());
+  } else {
+    assert(isFloatingType(ElmtTy));
+    Asm->vmovsqi(Elmt, Vec, Index, getPredicate());
+  }
+  assert(!Asm->needsTextFixup());
+}
+
 void InstARM32Insert::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   const Variable *Dest = getDest();
@@ -1193,6 +1214,24 @@
   }
 }
 
+void InstARM32Insert::emitIAS(const Cfg *Func) const {
+  // Integer elements come from a core register, float elements from an S
+  // register; either way one vmov writes lane Index of Dest.
+  const Variable *Dest = getDest();
+  const Operand *Src = getSrc(0);
+  const Type SrcTy = Src->getType();
+  assert(isVectorType(Dest->getType()));
+  assert(typeElementType(Dest->getType()) == SrcTy);
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  if (isIntegerType(SrcTy)) {
+    Asm->vmovqir(Dest, Index, Src, getPredicate());
+  } else {
+    assert(isFloatingType(SrcTy));
+    Asm->vmovqis(Dest, Index, Src, getPredicate());
+  }
+  assert(!Asm->needsTextFixup());
+}
+
 template <InstARM32::InstKindARM32 K>
 void InstARM32CmpLike<K>::emitIAS(const Cfg *Func) const {
   emitUsingTextFixup(Func);
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index 40518c1..96a95c5 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -1368,6 +1368,7 @@
         InstARM32Extract(Func, Dest, Src0, Index, Predicate);
   }
   void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
   static bool classof(const Inst *Inst) { return isClassof(Inst, Extract); }
 
 private:
@@ -1396,6 +1397,7 @@
         InstARM32Insert(Func, Dest, Src0, Index, Predicate);
   }
   void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
   static bool classof(const Inst *Inst) { return isClassof(Inst, Insert); }
 
 private:
diff --git a/tests_lit/assembler/arm32/insert-extract.ll b/tests_lit/assembler/arm32/insert-extract.ll
index c96c995..5bbb18b 100644
--- a/tests_lit/assembler/arm32/insert-extract.ll
+++ b/tests_lit/assembler/arm32/insert-extract.ll
@@ -27,9 +27,9 @@
 
   %1 = extractelement <4 x i32> %src, i32 1
 
-; ASM: vmov.32	r0, d0[1]
+; ASM: vmov.32  r0, d0[1]
 ; DIS:   10:       ee300b10
-
+; IASM-NOT: vmov.32  r0, d0[1]
   ret i32 %1
 }
 
@@ -40,8 +40,9 @@
 
   %1 = extractelement <4 x i32> %src, i32 2
 
-; ASM: vmov.32	r0, d1[0]
+; ASM: vmov.32  r0, d1[0]
 ; DIS:   40:       ee110b10
+; IASM-NOT: vmov.32  r0, d1[0]
 
   ret i32 %1
 }
@@ -53,8 +54,10 @@
 
   %1 = extractelement <8 x i16> %src, i32 3
 
-; ASM: vmov.s16	r0, d0[3]
+; ASM: vmov.s16 r0, d0[3]
 ; DIS:   70:       ee300b70
+; IASM-NOT: vmov.s16 r0, d0[3]
+
   %2 = sext i16 %1 to i32
   ret i32 %2
 }
@@ -66,8 +69,9 @@
 
   %1 = extractelement <8 x i16> %src, i32 4
 
-; ASM: vmov.s16	r0, d1[0]
+; ASM: vmov.s16 r0, d1[0]
 ; DIS:   a0:       ee110b30
+; IASM-NOT: vmov.s16 r0, d1[0]
 
   %2 = sext i16 %1 to i32
   ret i32 %2
@@ -80,8 +84,9 @@
 
   %1 = extractelement <16 x i8> %src, i32 7
 
-; ASM: vmov.s8	r0, d0[7]
+; ASM: vmov.s8  r0, d0[7]
 ; DIS:   d0:       ee700b70
+; IASM-NOT: vmov.s8  r0, d0[7]
 
   %2 = sext i8 %1 to i32
   ret i32 %2
@@ -94,8 +99,9 @@
 
   %1 = extractelement <16 x i8> %src, i32 8
 
-; ASM: vmov.s8	r0, d1[0]
+; ASM: vmov.s8  r0, d1[0]
 ; DIS:   100:       ee510b10
+; IASM-NOT: vmov.s8  r0, d1[0]
 
   %2 = sext i8 %1 to i32
   ret i32 %2
@@ -108,8 +114,9 @@
 
   %1 = extractelement <4 x float> %src, i32 1
 
-; ASM: vmov.f32	s0, s1
+; ASM: vmov.f32 s0, s1
 ; DIS:   130:       eeb00a60
+; IASM-NOT: vmov.f32 s0, s1
 
   ret float %1
 }
@@ -121,8 +128,9 @@
 
   %1 = extractelement <4 x float> %src, i32 2
 
-; ASM: vmov.f32	s0, s2
+; ASM: vmov.f32 s0, s2
 ; DIS:   160:       eeb00a41
+; IASM-NOT: vmov.f32 s0, s2
 
   ret float %1
 }
@@ -134,8 +142,9 @@
 
   %1 = insertelement <4 x i32> %src, i32 %s, i32 1
 
-; ASM: vmov.32	d0[1], r0
+; ASM: vmov.32  d0[1], r0
 ; DIS:   198:       ee200b10
+; IASM-NOT: vmov.32  d0[1], r0
 
   ret <4 x i32> %1
 }
@@ -147,8 +156,9 @@
 
   %1 = insertelement <4 x i32> %src, i32 %s, i32 2
 
-; ASM: vmov.32	d1[0], r0
+; ASM: vmov.32  d1[0], r0
 ; DIS:   1c8:       ee010b10
+; IASM-NOT: vmov.32  d1[0], r0
 
   ret <4 x i32> %1
 }
@@ -161,8 +171,10 @@
   %s2 = trunc i32 %s to i16
   %1 = insertelement <8 x i16> %src, i16 %s2, i32 3
 
-; ASM: vmov.16	d0[3], r0
+; ASM: vmov.16  d0[3], r0
 ; DIS:   200:       ee200b70
+; IASM-NOT: vmov.16  d0[3], r0
+
   ret <8 x i16> %1
 }
 
@@ -174,8 +186,10 @@
   %s2 = trunc i32 %s to i16
   %1 = insertelement <8 x i16> %src, i16 %s2, i32 4
 
-; ASM: vmov.16	d1[0], r0
+; ASM: vmov.16  d1[0], r0
 ; DIS:   240:       ee010b30
+; IASM-NOT: vmov.16  d1[0], r0
+
   ret <8 x i16> %1
 }
 
@@ -187,8 +201,9 @@
   %s2 = trunc i32 %s to i8
   %1 = insertelement <16 x i8> %src, i8 %s2, i32 7
 
-; ASM: vmov.8	d0[7], r0
+; ASM: vmov.8   d0[7], r0
 ; DIS:   280:       ee600b70
+; IASM-NOT: vmov.8   d0[7], r0
 
   ret <16 x i8> %1
 }
@@ -201,8 +216,9 @@
   %s2 = trunc i32 %s to i8
   %1 = insertelement <16 x i8> %src, i8 %s2, i32 8
 
-; ASM: vmov.8	d1[0], r0
+; ASM: vmov.8   d1[0], r0
 ; DIS:   2c0:       ee410b10
+; IASM-NOT: vmov.8   d1[0], r0
 
   ret <16 x i8> %1
 }
@@ -214,8 +230,9 @@
 
   %1 = insertelement <4 x float> %src, float %s, i32 1
 
-; ASM: vmov.f32	s1, s4
+; ASM: vmov.f32 s1, s4
 ; DIS:   2f8:       eef00a42
+; IASM-NOT: vmov.f32 s1, s4
 
   ret <4 x float> %1
 }
@@ -227,8 +244,9 @@
 
   %1 = insertelement <4 x float> %src, float %s, i32 2
 
-; ASM: vmov.f32	s2, s4
+; ASM: vmov.f32 s2, s4
 ; DIS:   328:       eeb01a42
+; IASM-NOT: vmov.f32 s2, s4
 
   ret <4 x float> %1
 }