Increase Subzero x86-64 stack alignment to 16

The System V x86-64 ABI requires that "The stack is 16-byte aligned
just before the call instruction is called."

The Microsoft x64 ABI states that "The stack will always be maintained
16-byte aligned, except within the prolog (for example, after the return
address is pushed), and except where indicated in Function Types for a
certain class of frame functions." The latter refers to functions which
do not call other functions but are not leaf functions (i.e. they may be
called externally, or allocate additional stack space).

This change makes the Subzero x86-64 backend comply with both calling
conventions by fixing the required stack alignment at 16 bytes, and
improves performance for spilled 128-bit vectors.

Bug: b/193550986
Change-Id: If26600b701359da996b288a86ec92e07132913b2
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/55810
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Sean Risser <srisser@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/third_party/subzero/src/IceTargetLoweringX8632.cpp b/third_party/subzero/src/IceTargetLoweringX8632.cpp
index 15487a3..a6c89cb 100644
--- a/third_party/subzero/src/IceTargetLoweringX8632.cpp
+++ b/third_party/subzero/src/IceTargetLoweringX8632.cpp
@@ -2506,7 +2506,7 @@
 }
 
 void TargetX8632::lowerCall(const InstCall *Instr) {
-  // Common x86 calling convention lowering:
+  // System V x86-32 calling convention lowering:
   //
   // * At the point before the call, the stack must be aligned to 16 bytes.
   //
@@ -2517,6 +2517,10 @@
   // * Stack arguments of vector type are aligned to start at the next highest
   // multiple of 16 bytes. Other stack arguments are aligned to the next word
   // size boundary (4 or 8 bytes, respectively).
+  //
+  // This is compatible with the Microsoft x86-32 'cdecl' calling convention,
+  // which doesn't have a 16-byte stack alignment requirement.
+
   RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
                                             Traits::X86_STACK_ALIGNMENT_BYTES);
 
diff --git a/third_party/subzero/src/IceTargetLoweringX8664.cpp b/third_party/subzero/src/IceTargetLoweringX8664.cpp
index 03f6f93..4828a15 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664.cpp
+++ b/third_party/subzero/src/IceTargetLoweringX8664.cpp
@@ -1058,21 +1058,11 @@
   if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
     if (isScalarFloatingType(ReturnType)) {
       // Avoid misaligned double-precision load/store.
-      RequiredStackAlignment = std::max<size_t>(
-          RequiredStackAlignment, Traits::X86_STACK_ALIGNMENT_BYTES);
       SpillAreaSizeBytes =
           std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes);
     }
   }
 
-  RequiredStackAlignment =
-      std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes);
-
-  if (PrologEmitsFixedAllocas) {
-    RequiredStackAlignment =
-        std::max(RequiredStackAlignment, FixedAllocaAlignBytes);
-  }
-
   // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
   // fixed allocations in the prolog.
   if (PrologEmitsFixedAllocas)
@@ -1108,16 +1098,6 @@
 
   // StackPointer: points just past the spill area (end of stack frame)
 
-  // If the required alignment is greater than the stack pointer's guaranteed
-  // alignment, align the stack pointer accordingly.
-  if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
-    assert(IsEbpBasedFrame);
-    _and(getPhysicalRegister(getStackReg(), Traits::WordType),
-         Ctx->getConstantInt32(-RequiredStackAlignment));
-  }
-
-  // StackPointer: may have just been offset for alignment
-
   // Account for known-frame-offset alloca instructions that were not already
   // combined into the prolog.
   if (!PrologEmitsFixedAllocas)
@@ -1313,25 +1293,15 @@
 }
 
 void TargetX8664::lowerAlloca(const InstAlloca *Instr) {
-  // Conservatively require the stack to be aligned. Some stack adjustment
-  // operations implemented below assume that the stack is aligned before the
-  // alloca. All the alloca code ensures that the stack alignment is preserved
-  // after the alloca. The stack alignment restriction can be relaxed in some
-  // cases.
-  RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
-                                            Traits::X86_STACK_ALIGNMENT_BYTES);
-
   // For default align=0, set it to the real value 1, to avoid any
   // bit-manipulation problems below.
   const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());
 
   // LLVM enforces power of 2 alignment.
   assert(llvm::isPowerOf2_32(AlignmentParam));
-  assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES));
 
-  const uint32_t Alignment =
-      std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
-  const bool OverAligned = Alignment > Traits::X86_STACK_ALIGNMENT_BYTES;
+  const uint32_t Alignment = std::max(AlignmentParam, RequiredStackAlignment);
+  const bool OverAligned = Alignment > RequiredStackAlignment;
   const bool OptM1 = Func->getOptLevel() == Opt_m1;
   const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
   const bool UseFramePointer =
@@ -2336,7 +2306,7 @@
 }
 
 void TargetX8664::lowerCall(const InstCall *Instr) {
-  // Common x86 calling convention lowering:
+  // Common x86-64 calling convention lowering:
   //
   // * At the point before the call, the stack must be aligned to 16 bytes.
   //
@@ -2347,8 +2317,6 @@
   // * Stack arguments of vector type are aligned to start at the next highest
   // multiple of 16 bytes. Other stack arguments are aligned to the next word
   // size boundary (4 or 8 bytes, respectively).
-  RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
-                                            Traits::X86_STACK_ALIGNMENT_BYTES);
 
   constexpr SizeT MaxOperands =
       constexprMax(Traits::X86_MAX_XMM_ARGS, Traits::X86_MAX_GPR_ARGS);
diff --git a/third_party/subzero/src/IceTargetLoweringX8664.h b/third_party/subzero/src/IceTargetLoweringX8664.h
index 78074d2..1c98f5d 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664.h
+++ b/third_party/subzero/src/IceTargetLoweringX8664.h
@@ -868,7 +868,7 @@
   void findRMW();
 
   bool IsEbpBasedFrame = false;
-  size_t RequiredStackAlignment = sizeof(int64_t); // 8 bytes
+  static constexpr uint32_t RequiredStackAlignment = 16;
   size_t SpillAreaSizeBytes = 0;
   size_t FixedAllocaSizeBytes = 0;
   size_t FixedAllocaAlignBytes = 0;