Inline memset when there is a constant value and count.

BUG=
R=jvoung@chromium.org, jvoung, stichnot

Review URL: https://codereview.chromium.org/1255053008.
diff --git a/src/IceInstX86BaseImpl.h b/src/IceInstX86BaseImpl.h
index 137e295..2956558 100644
--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -2134,6 +2134,7 @@
     return;
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(this->getSrcSize() == 2);
+  assert(isVectorType(this->getSrc(1)->getType()));
   Str << "\tmovups\t";
   this->getSrc(0)->emit(Func);
   Str << ", ";
@@ -2175,7 +2176,8 @@
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(this->getSrcSize() == 2);
   assert(this->getSrc(1)->getType() == IceType_i64 ||
-         this->getSrc(1)->getType() == IceType_f64);
+         this->getSrc(1)->getType() == IceType_f64 ||
+         isVectorType(this->getSrc(1)->getType()));
   Str << "\tmovq\t";
   this->getSrc(0)->emit(Func);
   Str << ", ";
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 64f921d..d89d747 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -158,6 +158,9 @@
                       Operand *Val);
   void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
                        Operand *SecondVal);
+  /// Replace a function call with inline instructions.
+  void lowerMemset(Operand *Dest, Operand *Val, Operand *Count);
+
   /// Lower an indirect jump adding sandboxing when needed.
   void lowerIndirectJump(Variable *Target);
 
@@ -214,7 +217,8 @@
 
   Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);
 
-  /// Returns a vector in a register with the given constant entries.
+  /// \name Returns a vector in a register with the given constant entries.
+  /// @{
   Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
   Variable *makeVectorOfOnes(Type Ty, int32_t RegNum = Variable::NoRegister);
   Variable *makeVectorOfMinusOnes(Type Ty,
@@ -223,6 +227,7 @@
                                       int32_t RegNum = Variable::NoRegister);
   Variable *makeVectorOfFabsMask(Type Ty,
                                  int32_t RegNum = Variable::NoRegister);
+  /// @}
 
   /// Return a memory operand corresponding to a stack allocated Variable.
   typename Traits::X86OperandMem *
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 85c8c71..2532217 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -3546,17 +3546,7 @@
     return;
   }
   case Intrinsics::Memset: {
-    // The value operand needs to be extended to a stack slot size because the
-    // PNaCl ABI requires arguments to be at least 32 bits wide.
-    Operand *ValOp = Instr->getArg(1);
-    assert(ValOp->getType() == IceType_i8);
-    Variable *ValExt = Func->makeVariable(stackSlotType());
-    lowerCast(InstCast::create(Func, InstCast::Zext, ValExt, ValOp));
-    InstCall *Call = makeHelperCall(H_call_memset, nullptr, 3);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(ValExt);
-    Call->addArg(Instr->getArg(2));
-    lowerCall(Call);
+    lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
     return;
   }
   case Intrinsics::NaClReadTP: {
@@ -3995,6 +3985,149 @@
 }
 
 template <class Machine>
+void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
+                                         Operand *Count) {
+  constexpr uint32_t UNROLL_LIMIT = 16;
+  assert(Val->getType() == IceType_i8);
+
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
+  const bool IsCountConst = CountConst != nullptr;
+  const bool IsValConst = ValConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+  const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
+
+  // Unlikely, but nothing to do if it does happen
+  if (IsCountConst && CountValue == 0)
+    return;
+
+  // TODO(ascull): if the count is constant but val is not, it would be
+  // possible to inline by spreading the value across 4 bytes and accessing
+  // subregs e.g. eax, ax and al.
+  if (IsCountConst && IsValConst) {
+    Variable *Base = legalizeToReg(Dest);
+
+    // Size 3 is the awkward case: it is too small for the vector or 32-bit
+    // operations, and lowerLeftOvers cannot handle it as there is no valid
+    // overlap.
+    if (CountValue == 3) {
+      Constant *Offset = nullptr;
+      auto *Mem =
+          Traits::X86OperandMem::create(Func, IceType_i16, Base, Offset);
+      _store(Ctx->getConstantInt16((ValValue << 8) | ValValue), Mem);
+
+      Offset = Ctx->getConstantInt8(2);
+      Mem = Traits::X86OperandMem::create(Func, IceType_i8, Base, Offset);
+      _store(Ctx->getConstantInt8(ValValue), Mem);
+      return;
+    }
+
+    // Lowers the assignment to the remaining bytes. Assumes the original size
+    // was large enough to allow for overlaps.
+    auto lowerLeftOvers = [this, Base, CountValue](
+        uint32_t SpreadValue, uint32_t Size, Variable *VecReg) {
+      auto lowerStoreSpreadValue =
+          [this, Base, CountValue, SpreadValue](Type Ty) {
+            Constant *Offset =
+                Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
+            auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
+            _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
+          };
+
+      if (Size > 8) {
+        assert(VecReg != nullptr);
+        Constant *Offset = Ctx->getConstantInt32(CountValue - 16);
+        auto *Mem = Traits::X86OperandMem::create(Func, VecReg->getType(), Base,
+                                                  Offset);
+        _storep(VecReg, Mem);
+      } else if (Size > 4) {
+        assert(VecReg != nullptr);
+        Constant *Offset = Ctx->getConstantInt32(CountValue - 8);
+        auto *Mem =
+            Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset);
+        _storeq(VecReg, Mem);
+      } else if (Size > 2) {
+        lowerStoreSpreadValue(IceType_i32);
+      } else if (Size > 1) {
+        lowerStoreSpreadValue(IceType_i16);
+      } else if (Size == 1) {
+        lowerStoreSpreadValue(IceType_i8);
+      }
+    };
+
+    // When the value is zero it can be loaded into a register cheaply using
+    // the xor trick.
+    constexpr uint32_t BytesPerStorep = 16;
+    if (ValValue == 0 && CountValue >= 8 &&
+        CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+      Variable *Zero = makeVectorOfZeros(IceType_v16i8);
+
+      // Too small to use large vector operations so use small ones instead
+      if (CountValue < 16) {
+        Constant *Offset = nullptr;
+        auto *Mem =
+            Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset);
+        _storeq(Zero, Mem);
+        lowerLeftOvers(0, CountValue - 8, Zero);
+        return;
+      }
+
+      assert(CountValue >= 16);
+      // Use large vector operations
+      for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
+        N -= 16;
+        Constant *Offset = Ctx->getConstantInt32(N);
+        auto *Mem =
+            Traits::X86OperandMem::create(Func, Zero->getType(), Base, Offset);
+        _storep(Zero, Mem);
+      }
+      uint32_t LeftOver = CountValue & 0xF;
+      lowerLeftOvers(0, LeftOver, Zero);
+      return;
+    }
+
+    // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al?
+    constexpr uint32_t BytesPerStore = 4;
+    if (CountValue <= BytesPerStore * UNROLL_LIMIT) {
+      // TODO(ascull): 64-bit can do better with 64-bit mov
+      uint32_t SpreadValue =
+          (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
+      if (CountValue >= 4) {
+        Constant *ValueConst = Ctx->getConstantInt32(SpreadValue);
+        for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
+          N -= 4;
+          Constant *Offset = Ctx->getConstantInt32(N);
+          auto *Mem =
+              Traits::X86OperandMem::create(Func, IceType_i32, Base, Offset);
+          _store(ValueConst, Mem);
+        }
+      }
+      uint32_t LeftOver = CountValue & 0x3;
+      lowerLeftOvers(SpreadValue, LeftOver, nullptr);
+      return;
+    }
+  }
+
+  // Fall back on calling the memset function. The value operand needs to be
+  // extended to a stack slot size because the PNaCl ABI requires arguments to
+  // be at least 32 bits wide.
+  Operand *ValExt;
+  if (IsValConst) {
+    ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
+  } else {
+    Variable *ValExtVar = Func->makeVariable(stackSlotType());
+    lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
+    ValExt = ValExtVar;
+  }
+  InstCall *Call = makeHelperCall(H_call_memset, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(ValExt);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
+
+template <class Machine>
 void TargetX86Base<Machine>::lowerIndirectJump(Variable *Target) {
   const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
   if (NeedSandboxing) {
diff --git a/tests_lit/llvm2ice_tests/nacl-mem-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-mem-intrinsics.ll
new file mode 100644
index 0000000..b262f92
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/nacl-mem-intrinsics.ll
@@ -0,0 +1,218 @@
+; This tests the NaCl intrinsics memset, memcpy and memmove.
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 -sandbox \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -Om1 -sandbox \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+; RUN: %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
+; RUN:   -i %s --args -O2 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command FileCheck --check-prefix ARM32 %s
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
+declare void @llvm.memmove.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
+declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1)
+
+define void @test_memcpy(i32 %iptr_dst, i32 %iptr_src, i32 %len) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 %len, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memcpy
+; CHECK: call {{.*}} R_{{.*}} memcpy
+; ARM32-LABEL: test_memcpy
+; ARM32: bl {{.*}} memcpy
+
+; TODO(jvoung) -- if we want to be clever, we can do this and the memmove,
+; memset without a function call.
+define void @test_memcpy_const_len_align(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 32, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memcpy_const_len_align
+; CHECK: call {{.*}} R_{{.*}} memcpy
+; ARM32-LABEL: test_memcpy_const_len_align
+; ARM32: bl {{.*}} memcpy
+
+define void @test_memmove(i32 %iptr_dst, i32 %iptr_src, i32 %len) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                        i32 %len, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memmove
+; CHECK: call {{.*}} R_{{.*}} memmove
+; ARM32-LABEL: test_memmove
+; ARM32: bl {{.*}} memmove
+
+define void @test_memmove_const_len_align(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                        i32 32, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memmove_const_len_align
+; CHECK: call {{.*}} R_{{.*}} memmove
+; ARM32-LABEL: test_memmove_const_len_align
+; ARM32: bl {{.*}} memmove
+
+define void @test_memset(i32 %iptr_dst, i32 %wide_val, i32 %len) {
+entry:
+  %val = trunc i32 %wide_val to i8
+  %dst = inttoptr i32 %iptr_dst to i8*
+  call void @llvm.memset.p0i8.i32(i8* %dst, i8 %val,
+                                  i32 %len, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memset
+; CHECK: movzx
+; CHECK: call {{.*}} R_{{.*}} memset
+; ARM32-LABEL: test_memset
+; ARM32: uxtb
+; ARM32: bl {{.*}} memset
+
+define void @test_memset_const_len_align(i32 %iptr_dst, i32 %wide_val) {
+entry:
+  %val = trunc i32 %wide_val to i8
+  %dst = inttoptr i32 %iptr_dst to i8*
+  call void @llvm.memset.p0i8.i32(i8* %dst, i8 %val,
+                                  i32 32, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memset_const_len_align
+; CHECK: movzx
+; CHECK: call {{.*}} R_{{.*}} memset
+; ARM32-LABEL: test_memset_const_len_align
+; ARM32: uxtb
+; ARM32: bl {{.*}} memset
+
+define void @test_memset_const_val(i32 %iptr_dst, i32 %len) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  call void @llvm.memset.p0i8.i32(i8* %dst, i8 0, i32 %len, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memset_const_val
+; CHECK-NOT: movzx
+; CHECK: call {{.*}} R_{{.*}} memset
+; ARM32-LABEL: test_memset_const_val
+; ARM32: uxtb
+; ARM32: bl {{.*}} memset
+
+define void @test_memset_const_val_len_very_small(i32 %iptr_dst) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  call void @llvm.memset.p0i8.i32(i8* %dst, i8 10, i32 2, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memset_const_val_len_very_small
+; CHECK: mov WORD PTR [{{.*}}],0xa0a
+; CHECK-NOT: mov
+; ARM32-LABEL: test_memset_const_val_len_very_small
+; ARM32: uxtb
+; ARM32: bl {{.*}} memset
+
+define void @test_memset_const_val_len_3(i32 %iptr_dst) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  call void @llvm.memset.p0i8.i32(i8* %dst, i8 16, i32 3, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memset_const_val_len_3
+; CHECK: mov WORD PTR [{{.*}}],0x1010
+; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x2],0x10
+; CHECK-NOT: mov
+; ARM32-LABEL: test_memset_const_val_len_3
+; ARM32: uxtb
+; ARM32: bl {{.*}} memset
+
+define void @test_memset_const_val_len_mid(i32 %iptr_dst) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  call void @llvm.memset.p0i8.i32(i8* %dst, i8 32, i32 9, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memset_const_val_len_mid
+; CHECK: mov DWORD PTR [{{.*}}+0x4],0x20202020
+; CHECK: mov DWORD PTR [{{.*}}],0x20202020
+; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x8],0x20
+; CHECK-NOT: mov
+; ARM32-LABEL: test_memset_const_val_len_mid
+; ARM32: uxtb
+; ARM32: bl {{.*}} memset
+
+define void @test_memset_zero_const_len_small(i32 %iptr_dst) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  call void @llvm.memset.p0i8.i32(i8* %dst, i8 0, i32 12, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memset_zero_const_len_small
+; CHECK: pxor [[ZERO:xmm[0-9]+]],[[ZERO]]
+; CHECK-NEXT: movq QWORD PTR [{{.*}}],[[ZERO]]
+; CHECK-NEXT: mov DWORD PTR [{{.*}}+0x8],0x0
+; CHECK-NOT: mov
+; ARM32-LABEL: test_memset_zero_const_len_small
+; ARM32: uxtb
+; ARM32: bl {{.*}} memset
+
+define void @test_memset_zero_const_len_small_overlap(i32 %iptr_dst) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  call void @llvm.memset.p0i8.i32(i8* %dst, i8 0, i32 15, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memset_zero_const_len_small_overlap
+; CHECK: pxor [[ZERO:xmm[0-9]+]],[[ZERO]]
+; CHECK-NEXT: movq QWORD PTR [{{.*}}],[[ZERO]]
+; CHECK-NEXT: movq QWORD PTR [{{.*}}+0x7],[[ZERO]]
+; CHECK-NOT: mov
+; ARM32-LABEL: test_memset_zero_const_len_small_overlap
+; ARM32: uxtb
+; ARM32: bl {{.*}} memset
+
+define void @test_memset_zero_const_len_large_overlap(i32 %iptr_dst) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  call void @llvm.memset.p0i8.i32(i8* %dst, i8 0, i32 30, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memset_zero_const_len_large_overlap
+; CHECK: pxor [[ZERO:xmm[0-9]+]],[[ZERO]]
+; CHECK-NEXT: movups XMMWORD PTR [{{.*}}],[[ZERO]]
+; CHECK-NEXT: movups XMMWORD PTR [{{.*}}+0xe],[[ZERO]]
+; CHECK-NOT: mov
+; ARM32-LABEL: test_memset_zero_const_len_large_overlap
+; ARM32: uxtb
+; ARM32: bl {{.*}} memset
+
+define void @test_memset_zero_const_len_large(i32 %iptr_dst) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  call void @llvm.memset.p0i8.i32(i8* %dst, i8 0, i32 33, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memset_zero_const_len_large
+; CHECK: pxor [[ZERO:xmm[0-9]+]],[[ZERO]]
+; CHECK-NEXT: movups XMMWORD PTR [{{.*}}+0x10],[[ZERO]]
+; CHECK-NEXT: movups XMMWORD PTR [{{.*}}],[[ZERO]]
+; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x20],0x0
+; CHECK-NOT: mov
+; ARM32-LABEL: test_memset_zero_const_len_large
+; ARM32: uxtb
+; ARM32: bl {{.*}} memset
diff --git a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
index ebc39c2..1257383 100644
--- a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
@@ -29,11 +29,7 @@
 ; RUN:   | %if --need=target_ARM32 --need=allow_dump \
 ; RUN:   --command FileCheck --check-prefix ARM32 %s
 
-
 declare i8* @llvm.nacl.read.tp()
-declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
-declare void @llvm.memmove.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
-declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1)
 declare void @llvm.nacl.longjmp(i8*, i32)
 declare i32 @llvm.nacl.setjmp(i8*)
 declare float @llvm.sqrt.f32(float)
@@ -107,106 +103,6 @@
 ; CHECKO2UNSANDBOXEDREM-LABEL: test_nacl_read_tp_dead
 ; CHECKO2UNSANDBOXEDREM-NOT: call {{.*}} R_{{.*}} __nacl_read_tp
 
-define void @test_memcpy(i32 %iptr_dst, i32 %iptr_src, i32 %len) {
-entry:
-  %dst = inttoptr i32 %iptr_dst to i8*
-  %src = inttoptr i32 %iptr_src to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
-                                       i32 %len, i32 1, i1 false)
-  ret void
-}
-; CHECK-LABEL: test_memcpy
-; CHECK: call {{.*}} R_{{.*}} memcpy
-; CHECKO2REM-LABEL: test_memcpy
-; CHECKO2UNSANDBOXEDREM-LABEL: test_memcpy
-; ARM32-LABEL: test_memcpy
-; ARM32: bl {{.*}} memcpy
-
-; TODO(jvoung) -- if we want to be clever, we can do this and the memmove,
-; memset without a function call.
-define void @test_memcpy_const_len_align(i32 %iptr_dst, i32 %iptr_src) {
-entry:
-  %dst = inttoptr i32 %iptr_dst to i8*
-  %src = inttoptr i32 %iptr_src to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
-                                       i32 32, i32 1, i1 false)
-  ret void
-}
-; CHECK-LABEL: test_memcpy_const_len_align
-; CHECK: call {{.*}} R_{{.*}} memcpy
-; ARM32-LABEL: test_memcpy_const_len_align
-; ARM32: bl {{.*}} memcpy
-
-define void @test_memmove(i32 %iptr_dst, i32 %iptr_src, i32 %len) {
-entry:
-  %dst = inttoptr i32 %iptr_dst to i8*
-  %src = inttoptr i32 %iptr_src to i8*
-  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
-                                        i32 %len, i32 1, i1 false)
-  ret void
-}
-; CHECK-LABEL: test_memmove
-; CHECK: call {{.*}} R_{{.*}} memmove
-; ARM32-LABEL: test_memmove
-; ARM32: bl {{.*}} memmove
-
-define void @test_memmove_const_len_align(i32 %iptr_dst, i32 %iptr_src) {
-entry:
-  %dst = inttoptr i32 %iptr_dst to i8*
-  %src = inttoptr i32 %iptr_src to i8*
-  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
-                                        i32 32, i32 1, i1 false)
-  ret void
-}
-; CHECK-LABEL: test_memmove_const_len_align
-; CHECK: call {{.*}} R_{{.*}} memmove
-; ARM32-LABEL: test_memmove_const_len_align
-; ARM32: bl {{.*}} memmove
-
-define void @test_memset(i32 %iptr_dst, i32 %wide_val, i32 %len) {
-entry:
-  %val = trunc i32 %wide_val to i8
-  %dst = inttoptr i32 %iptr_dst to i8*
-  call void @llvm.memset.p0i8.i32(i8* %dst, i8 %val,
-                                  i32 %len, i32 1, i1 false)
-  ret void
-}
-; CHECK-LABEL: test_memset
-; CHECK: movzx
-; CHECK: call {{.*}} R_{{.*}} memset
-; ARM32-LABEL: test_memset
-; ARM32: uxtb
-; ARM32: bl {{.*}} memset
-
-define void @test_memset_const_len_align(i32 %iptr_dst, i32 %wide_val) {
-entry:
-  %val = trunc i32 %wide_val to i8
-  %dst = inttoptr i32 %iptr_dst to i8*
-  call void @llvm.memset.p0i8.i32(i8* %dst, i8 %val,
-                                  i32 32, i32 1, i1 false)
-  ret void
-}
-; CHECK-LABEL: test_memset_const_len_align
-; CHECK: movzx
-; CHECK: call {{.*}} R_{{.*}} memset
-; ARM32-LABEL: test_memset_const_len_align
-; ARM32: uxtb
-; ARM32: bl {{.*}} memset
-
-define void @test_memset_const_val(i32 %iptr_dst, i32 %len) {
-entry:
-  %dst = inttoptr i32 %iptr_dst to i8*
-  call void @llvm.memset.p0i8.i32(i8* %dst, i8 0, i32 %len, i32 1, i1 false)
-  ret void
-}
-; CHECK-LABEL: test_memset_const_val
-; Make sure the argument is legalized (can't movzx reg, 0).
-; CHECK: movzx {{.*}},{{[^0]}}
-; CHECK: call {{.*}} R_{{.*}} memset
-; ARM32-LABEL: test_memset_const_val
-; ARM32: uxtb
-; ARM32: bl {{.*}} memset
-
 define i32 @test_setjmplongjmp(i32 %iptr_env) {
 entry:
   %env = inttoptr i32 %iptr_env to i8*