Removes references to ah.

%ah is a thorn in the flesh for our X86-64 backend. The assembler was
designed to always encode the low 8-bit registers, so %ah would become
%spl. While it is true we **could** force %spl to always be encoded as
%ah, that would not work if the instruction has a REX prefix.

This CL removes references to %ah from TargetX86Base. There used to be
two uses of %ah in the target lowering:

1) To zero-extend %al before an unsigned div:
    mov <<src0>>, %al
    mov $0, %ah
    div <<src1>>

This pattern has been changed to
    xor %eax, %eax
    mov <<src0>>, %al
    div <<src1>>

2) To access the 8-bit remainder for 8-bit division:
    mov %ah, <<dest>>

This pattern has been changed to
    shr $8, %eax
    mov %al, <<dest>>

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4077
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1260163003.
diff --git a/src/IceInstX8632.def b/src/IceInstX8632.def
index f9b1e88..3129dad 100644
--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -44,16 +44,10 @@
 // all of the registers are considered and have distinct numberings.
 // This is in contrast to the above, where the "encode" is based on how
 // the register numbers will be encoded in binaries and values can overlap.
-// Note that the isI8 attributed of Reg_ah is not set.  In general we
-// don't want the register allocator choosing Reg_ah, in particular
-// for lowering insertelement to pinsrb where internally we use an
-// 8-bit operand but externally pinsrb uses a 32-bit register, in
-// which Reg_ah doesn't map to eax.
 #define REGX8632_TABLE                                                  \
   /* val, encode, name, name16, name8, scratch, preserved, stackptr,    \
      frameptr, isI8, isInt, isFP */                                     \
   REGX8632_GPR_TABLE                                                    \
-  X(Reg_ah, 4, "???", "" , "ah", 0, 0, 0, 0, 0, 0, 0)                   \
   REGX8632_XMM_TABLE
 //#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,
 //          frameptr, isI8, isInt, isFP)
@@ -73,8 +67,7 @@
   X(Reg_al, = 0) \
   X(Reg_cl, = 1) \
   X(Reg_dl, = 2) \
-  X(Reg_bl, = 3) \
-  X(Reg_ah, = 4)
+  X(Reg_bl, = 3)
 //#define X(val, encode)
 
 // X86 segment registers.
diff --git a/src/IceInstX8664.def b/src/IceInstX8664.def
index dd4b712..7ad1eaa 100644
--- a/src/IceInstX8664.def
+++ b/src/IceInstX8664.def
@@ -67,15 +67,10 @@
 // all of the registers are considered and have distinct numberings.
 // This is in contrast to the above, where the "encode" is based on how
 // the register numbers will be encoded in binaries and values can overlap.
-// We don't want the register allocator choosing Reg_ah, in particular
-// for lowering insertelement to pinsrb where internally we use an
-// 8-bit operand but externally pinsrb uses a 32-bit register, in
-// which Reg_ah doesn't map to eax.
 #define REGX8664_TABLE                                                         \
   /* val, encode, name64, name, name16, name8, scratch, preserved,             \
      stackptr, frameptr, isInt, isFP */                                        \
   REGX8664_GPR_TABLE                                                           \
-  X(Reg_ah, = Reg_rax + 4, "?ah", "?ah", "?ah", "ah", 0, 0, 0, 0, 0, 0)        \
   REGX8664_XMM_TABLE
 //#define X(val, encode, name, name32, name16, name8, scratch, preserved,
 //          stackptr, frameptr, isI8, isInt, isFP)
diff --git a/src/IceRegistersX8632.h b/src/IceRegistersX8632.h
index da920e2..b0d22bb 100644
--- a/src/IceRegistersX8632.h
+++ b/src/IceRegistersX8632.h
@@ -78,19 +78,20 @@
   };
 
   static inline GPRRegister getEncodedGPR(int32_t RegNum) {
-    assert(Reg_GPR_First <= RegNum && RegNum <= Reg_GPR_Last);
+    assert(Reg_GPR_First <= RegNum);
+    assert(RegNum <= Reg_GPR_Last);
     return GPRRegister(RegNum - Reg_GPR_First);
   }
 
   static inline XmmRegister getEncodedXmm(int32_t RegNum) {
-    assert(Reg_XMM_First <= RegNum && RegNum <= Reg_XMM_Last);
+    assert(Reg_XMM_First <= RegNum);
+    assert(RegNum <= Reg_XMM_Last);
     return XmmRegister(RegNum - Reg_XMM_First);
   }
 
   static inline ByteRegister getEncodedByteReg(int32_t RegNum) {
-    assert(RegNum == Reg_ah || (Reg_GPR_First <= RegNum && RegNum <= Reg_ebx));
-    if (RegNum == Reg_ah)
-      return Encoded_Reg_ah;
+    assert(Reg_GPR_First <= RegNum);
+    assert(RegNum <= Reg_ebx);
     return ByteRegister(RegNum - Reg_GPR_First);
   }
 
@@ -102,7 +103,8 @@
   }
 
   static inline X87STRegister getEncodedSTReg(int32_t RegNum) {
-    assert(Encoded_X87ST_First <= RegNum && RegNum <= Encoded_X87ST_Last);
+    assert(Encoded_X87ST_First <= RegNum);
+    assert(RegNum <= Encoded_X87ST_Last);
     return X87STRegister(RegNum);
   }
 };
diff --git a/src/IceRegistersX8664.h b/src/IceRegistersX8664.h
index f95fe9e..bc448b2 100644
--- a/src/IceRegistersX8664.h
+++ b/src/IceRegistersX8664.h
@@ -69,16 +69,20 @@
   };
 
   static inline GPRRegister getEncodedGPR(int32_t RegNum) {
-    assert(Reg_GPR_First <= RegNum && RegNum <= Reg_GPR_Last);
+    assert(Reg_GPR_First <= RegNum);
+    assert(RegNum <= Reg_GPR_Last);
     return GPRRegister(RegNum - Reg_GPR_First);
   }
 
   static inline XmmRegister getEncodedXmm(int32_t RegNum) {
-    assert(Reg_XMM_First <= RegNum && RegNum <= Reg_XMM_Last);
+    assert(Reg_XMM_First <= RegNum);
+    assert(RegNum <= Reg_XMM_Last);
     return XmmRegister(RegNum - Reg_XMM_First);
   }
 
   static inline ByteRegister getEncodedByteReg(int32_t RegNum) {
+    assert(Reg_GPR_First <= RegNum);
+    assert(RegNum <= Reg_GPR_Last);
     return ByteRegister(RegNum - Reg_GPR_First);
   }
 
diff --git a/src/IceTargetLoweringX8664.def b/src/IceTargetLoweringX8664.def
new file mode 100644
index 0000000..8ee61d0
--- /dev/null
+++ b/src/IceTargetLoweringX8664.def
@@ -0,0 +1,53 @@
+//===- subzero/src/IceTargetLoweringX8664.def - x86-64 X-macros -*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines certain patterns for lowering to x86-64 target
+// instructions, in the form of x-macros.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICETARGETLOWERINGX8664_DEF
+#define SUBZERO_SRC_ICETARGETLOWERINGX8664_DEF
+
+#define FCMPX8664_TABLE                                              \
+  /*       <---- scalar comparison ---->  <- vector comparison -> */ \
+  /* val,  dflt, swap, C1,      C2,       swap,  predicate        */ \
+  X(False, 0,    0,    Br_None, Br_None,  0,     Cmpps_Invalid)      \
+  X(Oeq,   0,    0,    Br_ne,   Br_p,     0,     Cmpps_eq)           \
+  X(Ogt,   1,    0,    Br_a,    Br_None,  1,     Cmpps_lt)           \
+  X(Oge,   1,    0,    Br_ae,   Br_None,  1,     Cmpps_le)           \
+  X(Olt,   1,    1,    Br_a,    Br_None,  0,     Cmpps_lt)           \
+  X(Ole,   1,    1,    Br_ae,   Br_None,  0,     Cmpps_le)           \
+  X(One,   1,    0,    Br_ne,   Br_None,  0,     Cmpps_Invalid)      \
+  X(Ord,   1,    0,    Br_np,   Br_None,  0,     Cmpps_ord)          \
+  X(Ueq,   1,    0,    Br_e,    Br_None,  0,     Cmpps_Invalid)      \
+  X(Ugt,   1,    1,    Br_b,    Br_None,  0,     Cmpps_nle)          \
+  X(Uge,   1,    1,    Br_be,   Br_None,  0,     Cmpps_nlt)          \
+  X(Ult,   1,    0,    Br_b,    Br_None,  1,     Cmpps_nle)          \
+  X(Ule,   1,    0,    Br_be,   Br_None,  1,     Cmpps_nlt)          \
+  X(Une,   1,    0,    Br_ne,   Br_p,     0,     Cmpps_neq)          \
+  X(Uno,   1,    0,    Br_p,    Br_None,  0,     Cmpps_unord)        \
+  X(True,  1,    0,    Br_None, Br_None,  0,     Cmpps_Invalid)      \
+//#define X(val, dflt, swapS, C1, C2, swapV, pred)
+
+#define ICMPX8664_TABLE                     \
+  /* val, C_32,  C1_64,   C2_64,   C3_64 */ \
+  X(Eq,   Br_e,  Br_None, Br_ne,   Br_e)    \
+  X(Ne,   Br_ne, Br_ne,   Br_None, Br_ne)   \
+  X(Ugt,  Br_a,  Br_a,    Br_b,    Br_a)    \
+  X(Uge,  Br_ae, Br_a,    Br_b,    Br_ae)   \
+  X(Ult,  Br_b,  Br_b,    Br_a,    Br_b)    \
+  X(Ule,  Br_be, Br_b,    Br_a,    Br_be)   \
+  X(Sgt,  Br_g,  Br_g,    Br_l,    Br_a)    \
+  X(Sge,  Br_ge, Br_g,    Br_l,    Br_ae)   \
+  X(Slt,  Br_l,  Br_l,    Br_g,    Br_b)    \
+  X(Sle,  Br_le, Br_l,    Br_g,    Br_be)   \
+//#define X(val, C_32, C1_64, C2_64, C3_64)
+
+#endif // SUBZERO_SRC_ICETARGETLOWERINGX8664_DEF
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 77048b0..d9cc5e4 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -1850,12 +1850,21 @@
     // immediates as the operand.
     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
     if (isByteSizedArithType(Dest->getType())) {
-      Variable *T_ah = nullptr;
-      Constant *Zero = Ctx->getConstantZero(IceType_i8);
+      // For 8-bit unsigned division we need to zero-extend al into ah. A mov
+      // $0, %ah (or xor %ah, %ah) would work just fine, except that the x86-64
+      // assembler refuses to encode %ah (encoding %spl with a REX prefix
+      // instead.) Accessing %ah in 64-bit is "tricky" as you can't encode %ah
+      // with any other 8-bit register except for %a[lh], %b[lh], %c[lh], and
+      // %d[lh], which means the X86 target lowering (and the register
+      // allocator) would have to be aware of this restriction. For now, we
+      // simply zero %eax completely, and move the dividend into %al.
+      Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+      Context.insert(InstFakeDef::create(Func, T_eax));
+      _xor(T_eax, T_eax);
       _mov(T, Src0, Traits::RegisterSet::Reg_eax);
-      _mov(T_ah, Zero, Traits::RegisterSet::Reg_ah);
-      _div(T, Src1, T_ah);
+      _div(T, Src1, T);
       _mov(Dest, T);
+      Context.insert(InstFakeUse::create(Func, T_eax));
     } else {
       Constant *Zero = Ctx->getConstantZero(IceType_i32);
       _mov(T, Src0, Traits::RegisterSet::Reg_eax);
@@ -1917,12 +1926,21 @@
   case InstArithmetic::Urem:
     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
     if (isByteSizedArithType(Dest->getType())) {
-      Variable *T_ah = nullptr;
-      Constant *Zero = Ctx->getConstantZero(IceType_i8);
+      Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+      Context.insert(InstFakeDef::create(Func, T_eax));
+      _xor(T_eax, T_eax);
       _mov(T, Src0, Traits::RegisterSet::Reg_eax);
-      _mov(T_ah, Zero, Traits::RegisterSet::Reg_ah);
-      _div(T_ah, Src1, T);
-      _mov(Dest, T_ah);
+      Variable *T_al = makeReg(IceType_i8, Traits::RegisterSet::Reg_eax);
+      _div(T_al, Src1, T);
+      // shr $8, %eax shifts ah (i.e., the 8 bit remainder) into al. We don't
+      // mov %ah, %al because it would make x86-64 codegen more complicated. If
+      // this ever becomes a problem we can introduce a pseudo rem instruction
+      // that returns the remainder in %al directly (and uses a mov for copying
+      // %ah to %al.)
+      static constexpr uint8_t AlSizeInBits = 8;
+      _shr(T_eax, Ctx->getConstantInt8(AlSizeInBits));
+      _mov(Dest, T_al);
+      Context.insert(InstFakeUse::create(Func, T_eax));
     } else {
       Constant *Zero = Ctx->getConstantZero(IceType_i32);
       _mov(T_edx, Zero, Traits::RegisterSet::Reg_edx);
@@ -1974,12 +1992,21 @@
     }
     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
     if (isByteSizedArithType(Dest->getType())) {
-      Variable *T_ah = makeReg(IceType_i8, Traits::RegisterSet::Reg_ah);
       _mov(T, Src0, Traits::RegisterSet::Reg_eax);
+      // T is %al.
       _cbwdq(T, T);
-      Context.insert(InstFakeDef::create(Func, T_ah));
-      _idiv(T_ah, Src1, T);
-      _mov(Dest, T_ah);
+      _idiv(T, Src1, T);
+      Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+      Context.insert(InstFakeDef::create(Func, T_eax));
+      // shr $8, %eax shifts ah (i.e., the 8 bit remainder) into al. We don't
+      // mov %ah, %al because it would make x86-64 codegen more complicated. If
+      // this ever becomes a problem we can introduce a pseudo rem instruction
+      // that returns the remainder in %al directly (and uses a mov for copying
+      // %ah to %al.)
+      static constexpr uint8_t AlSizeInBits = 8;
+      _shr(T_eax, Ctx->getConstantInt8(AlSizeInBits));
+      _mov(Dest, T);
+      Context.insert(InstFakeUse::create(Func, T_eax));
     } else {
       T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
       _mov(T, Src0, Traits::RegisterSet::Reg_eax);