Add SwiftShader source to repo

Oct 6 code drop from Transgaming
Review URL: https://chromereviews.googleplex.com/3846015
diff --git a/src/LLVM/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/src/LLVM/lib/CodeGen/AggressiveAntiDepBreaker.cpp
new file mode 100644
index 0000000..62e40e9
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -0,0 +1,977 @@
+//===----- AggressiveAntiDepBreaker.cpp - Anti-dep breaker ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AggressiveAntiDepBreaker class, which
+// implements register anti-dependence breaking during post-RA
+// scheduling. It attempts to break all anti-dependencies within a
+// block.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "post-RA-sched"
+#include "AggressiveAntiDepBreaker.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// If DebugDiv > 0 then only break antidep with (ID % DebugDiv) == DebugMod
+static cl::opt<int>
+DebugDiv("agg-antidep-debugdiv",
+         cl::desc("Debug control for aggressive anti-dep breaker"),
+         cl::init(0), cl::Hidden);
+static cl::opt<int>
+DebugMod("agg-antidep-debugmod",
+         cl::desc("Debug control for aggressive anti-dep breaker"),
+         cl::init(0), cl::Hidden);
+
+AggressiveAntiDepState::AggressiveAntiDepState(const unsigned TargetRegs,
+                                               MachineBasicBlock *BB) :
+  NumTargetRegs(TargetRegs), GroupNodes(TargetRegs, 0),
+  GroupNodeIndices(TargetRegs, 0),
+  KillIndices(TargetRegs, 0),
+  DefIndices(TargetRegs, 0)
+{
+  const unsigned BBSize = BB->size();
+  for (unsigned i = 0; i < NumTargetRegs; ++i) {
+    // Initialize all registers to be in their own group. Initially we
+    // assign the register to the same-indexed GroupNode.
+    GroupNodeIndices[i] = i;
+    // Initialize the indices to indicate that no registers are live.
+    KillIndices[i] = ~0u;
+    DefIndices[i] = BBSize;
+  }
+}
+
+unsigned AggressiveAntiDepState::GetGroup(unsigned Reg) {
+  unsigned Node = GroupNodeIndices[Reg];
+  while (GroupNodes[Node] != Node)
+    Node = GroupNodes[Node];
+
+  return Node;
+}
+
+void AggressiveAntiDepState::GetGroupRegs(
+  unsigned Group,
+  std::vector<unsigned> &Regs,
+  std::multimap<unsigned, AggressiveAntiDepState::RegisterReference> *RegRefs)
+{
+  for (unsigned Reg = 0; Reg != NumTargetRegs; ++Reg) {
+    if ((GetGroup(Reg) == Group) && (RegRefs->count(Reg) > 0))
+      Regs.push_back(Reg);
+  }
+}
+
+unsigned AggressiveAntiDepState::UnionGroups(unsigned Reg1, unsigned Reg2)
+{
+  assert(GroupNodes[0] == 0 && "GroupNode 0 not parent!");
+  assert(GroupNodeIndices[0] == 0 && "Reg 0 not in Group 0!");
+
+  // find group for each register
+  unsigned Group1 = GetGroup(Reg1);
+  unsigned Group2 = GetGroup(Reg2);
+
+  // if either group is 0, then that must become the parent
+  unsigned Parent = (Group1 == 0) ? Group1 : Group2;
+  unsigned Other = (Parent == Group1) ? Group2 : Group1;
+  GroupNodes.at(Other) = Parent;
+  return Parent;
+}
+
+unsigned AggressiveAntiDepState::LeaveGroup(unsigned Reg)
+{
+  // Create a new GroupNode for Reg. Reg's existing GroupNode must
+  // stay as is because there could be other GroupNodes referring to
+  // it.
+  unsigned idx = GroupNodes.size();
+  GroupNodes.push_back(idx);
+  GroupNodeIndices[Reg] = idx;
+  return idx;
+}
+
+bool AggressiveAntiDepState::IsLive(unsigned Reg)
+{
+  // KillIndex must be defined and DefIndex not defined for a register
+  // to be live.
+  return((KillIndices[Reg] != ~0u) && (DefIndices[Reg] == ~0u));
+}
+
+
+
+AggressiveAntiDepBreaker::
+AggressiveAntiDepBreaker(MachineFunction& MFi,
+                         TargetSubtarget::RegClassVector& CriticalPathRCs) :
+  AntiDepBreaker(), MF(MFi),
+  MRI(MF.getRegInfo()),
+  TII(MF.getTarget().getInstrInfo()),
+  TRI(MF.getTarget().getRegisterInfo()),
+  AllocatableSet(TRI->getAllocatableSet(MF)),
+  State(NULL) {
+  /* Collect a bitset of all registers that are only broken if they
+     are on the critical path. */
+  for (unsigned i = 0, e = CriticalPathRCs.size(); i < e; ++i) {
+    BitVector CPSet = TRI->getAllocatableSet(MF, CriticalPathRCs[i]);
+    if (CriticalPathSet.none())
+      CriticalPathSet = CPSet;
+    else
+      CriticalPathSet |= CPSet;
+  }
+
+  DEBUG(dbgs() << "AntiDep Critical-Path Registers:");
+  DEBUG(for (int r = CriticalPathSet.find_first(); r != -1;
+             r = CriticalPathSet.find_next(r))
+          dbgs() << " " << TRI->getName(r));
+  DEBUG(dbgs() << '\n');
+}
+
+AggressiveAntiDepBreaker::~AggressiveAntiDepBreaker() {
+  delete State;
+}
+
+void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
+  assert(State == NULL);
+  State = new AggressiveAntiDepState(TRI->getNumRegs(), BB);
+
+  bool IsReturnBlock = (!BB->empty() && BB->back().getDesc().isReturn());
+  std::vector<unsigned> &KillIndices = State->GetKillIndices();
+  std::vector<unsigned> &DefIndices = State->GetDefIndices();
+
+  // Determine the live-out physregs for this block.
+  if (IsReturnBlock) {
+    // In a return block, examine the function live-out regs.
+    for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(),
+         E = MRI.liveout_end(); I != E; ++I) {
+      unsigned Reg = *I;
+      State->UnionGroups(Reg, 0);
+      KillIndices[Reg] = BB->size();
+      DefIndices[Reg] = ~0u;
+      // Repeat, for all aliases.
+      for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+        unsigned AliasReg = *Alias;
+        State->UnionGroups(AliasReg, 0);
+        KillIndices[AliasReg] = BB->size();
+        DefIndices[AliasReg] = ~0u;
+      }
+    }
+  }
+
+  // In a non-return block, examine the live-in regs of all successors.
+  // Note a return block can have successors if the return instruction is
+  // predicated.
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+         SE = BB->succ_end(); SI != SE; ++SI)
+    for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
+           E = (*SI)->livein_end(); I != E; ++I) {
+      unsigned Reg = *I;
+      State->UnionGroups(Reg, 0);
+      KillIndices[Reg] = BB->size();
+      DefIndices[Reg] = ~0u;
+      // Repeat, for all aliases.
+      for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+        unsigned AliasReg = *Alias;
+        State->UnionGroups(AliasReg, 0);
+        KillIndices[AliasReg] = BB->size();
+        DefIndices[AliasReg] = ~0u;
+      }
+    }
+
+  // Mark live-out callee-saved registers. In a return block this is
+  // all callee-saved registers. In a non-return block this is any
+  // callee-saved register that is not saved in the prologue.
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  BitVector Pristine = MFI->getPristineRegs(BB);
+  for (const unsigned *I = TRI->getCalleeSavedRegs(); *I; ++I) {
+    unsigned Reg = *I;
+    if (!IsReturnBlock && !Pristine.test(Reg)) continue;
+    State->UnionGroups(Reg, 0);
+    KillIndices[Reg] = BB->size();
+    DefIndices[Reg] = ~0u;
+    // Repeat, for all aliases.
+    for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+      unsigned AliasReg = *Alias;
+      State->UnionGroups(AliasReg, 0);
+      KillIndices[AliasReg] = BB->size();
+      DefIndices[AliasReg] = ~0u;
+    }
+  }
+}
+
+void AggressiveAntiDepBreaker::FinishBlock() {
+  delete State;
+  State = NULL;
+}
+
+void AggressiveAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count,
+                                       unsigned InsertPosIndex) {
+  assert(Count < InsertPosIndex && "Instruction index out of expected range!");
+
+  std::set<unsigned> PassthruRegs;
+  GetPassthruRegs(MI, PassthruRegs);
+  PrescanInstruction(MI, Count, PassthruRegs);
+  ScanInstruction(MI, Count);
+
+  DEBUG(dbgs() << "Observe: ");
+  DEBUG(MI->dump());
+  DEBUG(dbgs() << "\tRegs:");
+
+  std::vector<unsigned> &DefIndices = State->GetDefIndices();
+  for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg) {
+    // If Reg is currently live, then mark that it can't be renamed as
+    // we don't know the extent of its live-range anymore (now that it
+    // has been scheduled). If it is not live but was defined in the
+    // previous schedule region, then set its def index to the most
+    // conservative location (i.e. the beginning of the previous
+    // schedule region).
+    if (State->IsLive(Reg)) {
+      DEBUG(if (State->GetGroup(Reg) != 0)
+              dbgs() << " " << TRI->getName(Reg) << "=g" <<
+                State->GetGroup(Reg) << "->g0(region live-out)");
+      State->UnionGroups(Reg, 0);
+    } else if ((DefIndices[Reg] < InsertPosIndex)
+               && (DefIndices[Reg] >= Count)) {
+      DefIndices[Reg] = Count;
+    }
+  }
+  DEBUG(dbgs() << '\n');
+}
+
+bool AggressiveAntiDepBreaker::IsImplicitDefUse(MachineInstr *MI,
+                                                MachineOperand& MO)
+{
+  if (!MO.isReg() || !MO.isImplicit())
+    return false;
+
+  unsigned Reg = MO.getReg();
+  if (Reg == 0)
+    return false;
+
+  MachineOperand *Op = NULL;
+  if (MO.isDef())
+    Op = MI->findRegisterUseOperand(Reg, true);
+  else
+    Op = MI->findRegisterDefOperand(Reg);
+
+  return((Op != NULL) && Op->isImplicit());
+}
+
+void AggressiveAntiDepBreaker::GetPassthruRegs(MachineInstr *MI,
+                                           std::set<unsigned>& PassthruRegs) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    if ((MO.isDef() && MI->isRegTiedToUseOperand(i)) ||
+        IsImplicitDefUse(MI, MO)) {
+      const unsigned Reg = MO.getReg();
+      PassthruRegs.insert(Reg);
+      for (const unsigned *Subreg = TRI->getSubRegisters(Reg);
+           *Subreg; ++Subreg) {
+        PassthruRegs.insert(*Subreg);
+      }
+    }
+  }
+}
+
+/// AntiDepEdges - Return in Edges the anti- and output-dependencies
+/// in SU that we want to consider for breaking.
+static void AntiDepEdges(const SUnit *SU, std::vector<const SDep*>& Edges) {
+  SmallSet<unsigned, 4> RegSet;
+  for (SUnit::const_pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end();
+       P != PE; ++P) {
+    if ((P->getKind() == SDep::Anti) || (P->getKind() == SDep::Output)) {
+      unsigned Reg = P->getReg();
+      if (RegSet.count(Reg) == 0) {
+        Edges.push_back(&*P);
+        RegSet.insert(Reg);
+      }
+    }
+  }
+}
+
+/// CriticalPathStep - Return the next SUnit after SU on the bottom-up
+/// critical path.
+static const SUnit *CriticalPathStep(const SUnit *SU) {
+  const SDep *Next = 0;
+  unsigned NextDepth = 0;
+  // Find the predecessor edge with the greatest depth.
+  if (SU != 0) {
+    for (SUnit::const_pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end();
+         P != PE; ++P) {
+      const SUnit *PredSU = P->getSUnit();
+      unsigned PredLatency = P->getLatency();
+      unsigned PredTotalLatency = PredSU->getDepth() + PredLatency;
+      // In the case of a latency tie, prefer an anti-dependency edge over
+      // other types of edges.
+      if (NextDepth < PredTotalLatency ||
+          (NextDepth == PredTotalLatency && P->getKind() == SDep::Anti)) {
+        NextDepth = PredTotalLatency;
+        Next = &*P;
+      }
+    }
+  }
+
+  return (Next) ? Next->getSUnit() : 0;
+}
+
+void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx,
+                                             const char *tag,
+                                             const char *header,
+                                             const char *footer) {
+  std::vector<unsigned> &KillIndices = State->GetKillIndices();
+  std::vector<unsigned> &DefIndices = State->GetDefIndices();
+  std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+    RegRefs = State->GetRegRefs();
+
+  if (!State->IsLive(Reg)) {
+    KillIndices[Reg] = KillIdx;
+    DefIndices[Reg] = ~0u;
+    RegRefs.erase(Reg);
+    State->LeaveGroup(Reg);
+    DEBUG(if (header != NULL) {
+        dbgs() << header << TRI->getName(Reg); header = NULL; });
+    DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << tag);
+  }
+  // Repeat for subregisters.
+  for (const unsigned *Subreg = TRI->getSubRegisters(Reg);
+       *Subreg; ++Subreg) {
+    unsigned SubregReg = *Subreg;
+    if (!State->IsLive(SubregReg)) {
+      KillIndices[SubregReg] = KillIdx;
+      DefIndices[SubregReg] = ~0u;
+      RegRefs.erase(SubregReg);
+      State->LeaveGroup(SubregReg);
+      DEBUG(if (header != NULL) {
+          dbgs() << header << TRI->getName(Reg); header = NULL; });
+      DEBUG(dbgs() << " " << TRI->getName(SubregReg) << "->g" <<
+            State->GetGroup(SubregReg) << tag);
+    }
+  }
+
+  DEBUG(if ((header == NULL) && (footer != NULL)) dbgs() << footer);
+}
+
+void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
+                                                  unsigned Count,
+                                             std::set<unsigned>& PassthruRegs) {
+  std::vector<unsigned> &DefIndices = State->GetDefIndices();
+  std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+    RegRefs = State->GetRegRefs();
+
+  // Handle dead defs by simulating a last-use of the register just
+  // after the def. A dead def can occur because the def is truly
+  // dead, or because only a subregister is live at the def. If we
+  // don't do this the dead def will be incorrectly merged into the
+  // previous def.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+
+    HandleLastUse(Reg, Count + 1, "", "\tDead Def: ", "\n");
+  }
+
+  DEBUG(dbgs() << "\tDef Groups:");
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+
+    DEBUG(dbgs() << " " << TRI->getName(Reg) << "=g" << State->GetGroup(Reg));
+
+    // If MI's defs have a special allocation requirement, don't allow
+    // any def registers to be changed. Also assume all registers
+    // defined in a call must not be changed (ABI).
+    if (MI->getDesc().isCall() || MI->getDesc().hasExtraDefRegAllocReq() ||
+        TII->isPredicated(MI)) {
+      DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
+      State->UnionGroups(Reg, 0);
+    }
+
+    // Any aliases that are live at this point are completely or
+    // partially defined here, so group those aliases with Reg.
+    for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+      unsigned AliasReg = *Alias;
+      if (State->IsLive(AliasReg)) {
+        State->UnionGroups(Reg, AliasReg);
+        DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << "(via " <<
+              TRI->getName(AliasReg) << ")");
+      }
+    }
+
+    // Note register reference...
+    const TargetRegisterClass *RC = NULL;
+    if (i < MI->getDesc().getNumOperands())
+      RC = MI->getDesc().OpInfo[i].getRegClass(TRI);
+    AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
+    RegRefs.insert(std::make_pair(Reg, RR));
+  }
+
+  DEBUG(dbgs() << '\n');
+
+  // Scan the register defs for this instruction and update
+  // live-ranges.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+    // Ignore KILLs and passthru registers for liveness...
+    if (MI->isKill() || (PassthruRegs.count(Reg) != 0))
+      continue;
+
+    // Update def for Reg and aliases.
+    DefIndices[Reg] = Count;
+    for (const unsigned *Alias = TRI->getAliasSet(Reg);
+         *Alias; ++Alias) {
+      unsigned AliasReg = *Alias;
+      DefIndices[AliasReg] = Count;
+    }
+  }
+}
+
+void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI,
+                                               unsigned Count) {
+  DEBUG(dbgs() << "\tUse Groups:");
+  std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+    RegRefs = State->GetRegRefs();
+
+  // If MI's uses have a special allocation requirement, don't allow
+  // any use registers to be changed. Also assume all registers
+  // used in a call must not be changed (ABI).
+  // FIXME: The issue with predicated instructions is more complex. We are
+  // being conservative here because the kill markers cannot be trusted after
+  // if-conversion:
+  // %R6<def> = LDR %SP, %reg0, 92, pred:14, pred:%reg0; mem:LD4[FixedStack14]
+  // ...
+  // STR %R0, %R6<kill>, %reg0, 0, pred:0, pred:%CPSR; mem:ST4[%395]
+  // %R6<def> = LDR %SP, %reg0, 100, pred:0, pred:%CPSR; mem:LD4[FixedStack12]
+  // STR %R0, %R6<kill>, %reg0, 0, pred:14, pred:%reg0; mem:ST4[%396](align=8)
+  //
+  // The first R6 kill is not really a kill since it's killed by a predicated
+  // instruction which may not be executed. The second R6 def may or may not
+  // re-define R6 so it's not safe to change it since the last R6 use cannot be
+  // changed.
+  bool Special = MI->getDesc().isCall() ||
+    MI->getDesc().hasExtraSrcRegAllocReq() ||
+    TII->isPredicated(MI);
+
+  // Scan the register uses for this instruction and update
+  // live-ranges, groups and RegRefs.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isUse()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+
+    DEBUG(dbgs() << " " << TRI->getName(Reg) << "=g" <<
+          State->GetGroup(Reg));
+
+    // If it wasn't previously live but now is, this is a kill. Forget
+    // the previous live-range information and start a new live-range
+    // for the register.
+    HandleLastUse(Reg, Count, "(last-use)");
+
+    if (Special) {
+      DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
+      State->UnionGroups(Reg, 0);
+    }
+
+    // Note register reference...
+    const TargetRegisterClass *RC = NULL;
+    if (i < MI->getDesc().getNumOperands())
+      RC = MI->getDesc().OpInfo[i].getRegClass(TRI);
+    AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
+    RegRefs.insert(std::make_pair(Reg, RR));
+  }
+
+  DEBUG(dbgs() << '\n');
+
+  // Form a group of all defs and uses of a KILL instruction to ensure
+  // that all registers are renamed as a group.
+  if (MI->isKill()) {
+    DEBUG(dbgs() << "\tKill Group:");
+
+    unsigned FirstReg = 0;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg()) continue;
+      unsigned Reg = MO.getReg();
+      if (Reg == 0) continue;
+
+      if (FirstReg != 0) {
+        DEBUG(dbgs() << "=" << TRI->getName(Reg));
+        State->UnionGroups(FirstReg, Reg);
+      } else {
+        DEBUG(dbgs() << " " << TRI->getName(Reg));
+        FirstReg = Reg;
+      }
+    }
+
+    DEBUG(dbgs() << "->g" << State->GetGroup(FirstReg) << '\n');
+  }
+}
+
+BitVector AggressiveAntiDepBreaker::GetRenameRegisters(unsigned Reg) {
+  BitVector BV(TRI->getNumRegs(), false);
+  bool first = true;
+
+  // Check all references that need rewriting for Reg. For each, use
+  // the corresponding register class to narrow the set of registers
+  // that are appropriate for renaming.
+  std::pair<std::multimap<unsigned,
+                     AggressiveAntiDepState::RegisterReference>::iterator,
+            std::multimap<unsigned,
+                     AggressiveAntiDepState::RegisterReference>::iterator>
+    Range = State->GetRegRefs().equal_range(Reg);
+  for (std::multimap<unsigned,
+       AggressiveAntiDepState::RegisterReference>::iterator Q = Range.first,
+       QE = Range.second; Q != QE; ++Q) {
+    const TargetRegisterClass *RC = Q->second.RC;
+    if (RC == NULL) continue;
+
+    BitVector RCBV = TRI->getAllocatableSet(MF, RC);
+    if (first) {
+      BV |= RCBV;
+      first = false;
+    } else {
+      BV &= RCBV;
+    }
+
+    DEBUG(dbgs() << " " << RC->getName());
+  }
+
+  return BV;
+}
+
+bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
+                                unsigned AntiDepGroupIndex,
+                                RenameOrderType& RenameOrder,
+                                std::map<unsigned, unsigned> &RenameMap) {
+  std::vector<unsigned> &KillIndices = State->GetKillIndices();
+  std::vector<unsigned> &DefIndices = State->GetDefIndices();
+  std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+    RegRefs = State->GetRegRefs();
+
+  // Collect all referenced registers in the same group as
+  // AntiDepReg. These all need to be renamed together if we are to
+  // break the anti-dependence.
+  std::vector<unsigned> Regs;
+  State->GetGroupRegs(AntiDepGroupIndex, Regs, &RegRefs);
+  assert(Regs.size() > 0 && "Empty register group!");
+  if (Regs.size() == 0)
+    return false;
+
+  // Find the "superest" register in the group. At the same time,
+  // collect the BitVector of registers that can be used to rename
+  // each register.
+  DEBUG(dbgs() << "\tRename Candidates for Group g" << AntiDepGroupIndex
+        << ":\n");
+  std::map<unsigned, BitVector> RenameRegisterMap;
+  unsigned SuperReg = 0;
+  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+    unsigned Reg = Regs[i];
+    if ((SuperReg == 0) || TRI->isSuperRegister(SuperReg, Reg))
+      SuperReg = Reg;
+
+    // If Reg has any references, then collect possible rename regs
+    if (RegRefs.count(Reg) > 0) {
+      DEBUG(dbgs() << "\t\t" << TRI->getName(Reg) << ":");
+
+      BitVector BV = GetRenameRegisters(Reg);
+      RenameRegisterMap.insert(std::pair<unsigned, BitVector>(Reg, BV));
+
+      DEBUG(dbgs() << " ::");
+      DEBUG(for (int r = BV.find_first(); r != -1; r = BV.find_next(r))
+              dbgs() << " " << TRI->getName(r));
+      DEBUG(dbgs() << "\n");
+    }
+  }
+
+  // All group registers should be a subreg of SuperReg.
+  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+    unsigned Reg = Regs[i];
+    if (Reg == SuperReg) continue;
+    bool IsSub = TRI->isSubRegister(SuperReg, Reg);
+    assert(IsSub && "Expecting group subregister");
+    if (!IsSub)
+      return false;
+  }
+
+#ifndef NDEBUG
+  // If DebugDiv > 0 then only rename (renamecnt % DebugDiv) == DebugMod
+  if (DebugDiv > 0) {
+    static int renamecnt = 0;
+    if (renamecnt++ % DebugDiv != DebugMod)
+      return false;
+
+    dbgs() << "*** Performing rename " << TRI->getName(SuperReg) <<
+      " for debug ***\n";
+  }
+#endif
+
+  // Check each possible rename register for SuperReg in round-robin
+  // order. If that register is available, and the corresponding
+  // registers are available for the other group subregisters, then we
+  // can use those registers to rename.
+
+  // FIXME: Using getMinimalPhysRegClass is very conservative. We should
+  // check every use of the register and find the largest register class
+  // that can be used in all of them.
+  const TargetRegisterClass *SuperRC =
+    TRI->getMinimalPhysRegClass(SuperReg, MVT::Other);
+
+  const TargetRegisterClass::iterator RB = SuperRC->allocation_order_begin(MF);
+  const TargetRegisterClass::iterator RE = SuperRC->allocation_order_end(MF);
+  if (RB == RE) {
+    DEBUG(dbgs() << "\tEmpty Super Regclass!!\n");
+    return false;
+  }
+
+  DEBUG(dbgs() << "\tFind Registers:");
+
+  if (RenameOrder.count(SuperRC) == 0)
+    RenameOrder.insert(RenameOrderType::value_type(SuperRC, RE));
+
+  const TargetRegisterClass::iterator OrigR = RenameOrder[SuperRC];
+  const TargetRegisterClass::iterator EndR = ((OrigR == RE) ? RB : OrigR);
+  TargetRegisterClass::iterator R = OrigR;
+  do {
+    if (R == RB) R = RE;
+    --R;
+    const unsigned NewSuperReg = *R;
+    // Don't replace a register with itself.
+    if (NewSuperReg == SuperReg) continue;
+
+    DEBUG(dbgs() << " [" << TRI->getName(NewSuperReg) << ':');
+    RenameMap.clear();
+
+    // For each referenced group register (which must be a SuperReg or
+    // a subregister of SuperReg), find the corresponding subregister
+    // of NewSuperReg and make sure it is free to be renamed.
+    for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+      unsigned Reg = Regs[i];
+      unsigned NewReg = 0;
+      if (Reg == SuperReg) {
+        NewReg = NewSuperReg;
+      } else {
+        unsigned NewSubRegIdx = TRI->getSubRegIndex(SuperReg, Reg);
+        if (NewSubRegIdx != 0)
+          NewReg = TRI->getSubReg(NewSuperReg, NewSubRegIdx);
+      }
+
+      DEBUG(dbgs() << " " << TRI->getName(NewReg));
+
+      // Check if Reg can be renamed to NewReg.
+      BitVector BV = RenameRegisterMap[Reg];
+      if (!BV.test(NewReg)) {
+        DEBUG(dbgs() << "(no rename)");
+        goto next_super_reg;
+      }
+
+      // If NewReg is dead and NewReg's most recent def is not before
+      // Reg's kill, it's safe to replace Reg with NewReg. We
+      // must also check all aliases of NewReg, because we can't define a
+      // register when any sub or super is already live.
+      if (State->IsLive(NewReg) || (KillIndices[Reg] > DefIndices[NewReg])) {
+        DEBUG(dbgs() << "(live)");
+        goto next_super_reg;
+      } else {
+        bool found = false;
+        for (const unsigned *Alias = TRI->getAliasSet(NewReg);
+             *Alias; ++Alias) {
+          unsigned AliasReg = *Alias;
+          if (State->IsLive(AliasReg) ||
+              (KillIndices[Reg] > DefIndices[AliasReg])) {
+            DEBUG(dbgs() << "(alias " << TRI->getName(AliasReg) << " live)");
+            found = true;
+            break;
+          }
+        }
+        if (found)
+          goto next_super_reg;
+      }
+
+      // Record that 'Reg' can be renamed to 'NewReg'.
+      RenameMap.insert(std::pair<unsigned, unsigned>(Reg, NewReg));
+    }
+
+    // If we fall out here, then every register in the group can be
+    // renamed, as recorded in RenameMap.
+    RenameOrder.erase(SuperRC);
+    RenameOrder.insert(RenameOrderType::value_type(SuperRC, R));
+    DEBUG(dbgs() << "]\n");
+    return true;
+
+  next_super_reg:
+    DEBUG(dbgs() << ']');
+  } while (R != EndR);
+
+  DEBUG(dbgs() << '\n');
+
+  // No registers are free and available!
+  return false;
+}
+
+/// BreakAntiDependencies - Identify anti-dependencies within the
+/// ScheduleDAG and break them by renaming registers.
+///
+unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
+                              const std::vector<SUnit>& SUnits,
+                              MachineBasicBlock::iterator Begin,
+                              MachineBasicBlock::iterator End,
+                              unsigned InsertPosIndex) {
+  std::vector<unsigned> &KillIndices = State->GetKillIndices();
+  std::vector<unsigned> &DefIndices = State->GetDefIndices();
+  std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+    RegRefs = State->GetRegRefs();
+
+  // The code below assumes that there is at least one instruction,
+  // so just duck out immediately if the block is empty.
+  if (SUnits.empty()) return 0;
+
+  // For each regclass the next register to use for renaming.
+  RenameOrderType RenameOrder;
+
+  // We also need a map from MI to SUnit.
+  std::map<MachineInstr *, const SUnit *> MISUnitMap;
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    const SUnit *SU = &SUnits[i];
+    MISUnitMap.insert(std::pair<MachineInstr *, const SUnit *>(SU->getInstr(),
+                                                               SU));
+  }
+
+  // Track progress along the critical path through the SUnit graph as
+  // we walk the instructions. This is needed for regclasses that only
+  // break critical-path anti-dependencies.
+  const SUnit *CriticalPathSU = 0;
+  MachineInstr *CriticalPathMI = 0;
+  if (CriticalPathSet.any()) {
+    for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+      const SUnit *SU = &SUnits[i];
+      if (!CriticalPathSU ||
+          ((SU->getDepth() + SU->Latency) >
+           (CriticalPathSU->getDepth() + CriticalPathSU->Latency))) {
+        CriticalPathSU = SU;
+      }
+    }
+
+    CriticalPathMI = CriticalPathSU->getInstr();
+  }
+
+#ifndef NDEBUG
+  DEBUG(dbgs() << "\n===== Aggressive anti-dependency breaking\n");
+  DEBUG(dbgs() << "Available regs:");
+  for (unsigned Reg = 0; Reg < TRI->getNumRegs(); ++Reg) {
+    if (!State->IsLive(Reg))
+      DEBUG(dbgs() << " " << TRI->getName(Reg));
+  }
+  DEBUG(dbgs() << '\n');
+#endif
+
+  // Attempt to break anti-dependence edges. Walk the instructions
+  // from the bottom up, tracking information about liveness as we go
+  // to help determine which registers are available.
+  unsigned Broken = 0;
+  unsigned Count = InsertPosIndex - 1;
+  for (MachineBasicBlock::iterator I = End, E = Begin;
+       I != E; --Count) {
+    MachineInstr *MI = --I;
+
+    DEBUG(dbgs() << "Anti: ");
+    DEBUG(MI->dump());
+
+    std::set<unsigned> PassthruRegs;
+    GetPassthruRegs(MI, PassthruRegs);
+
+    // Process the defs in MI...
+    PrescanInstruction(MI, Count, PassthruRegs);
+
+    // The dependence edges that represent anti- and output-
+    // dependencies that are candidates for breaking.
+    std::vector<const SDep *> Edges;
+    const SUnit *PathSU = MISUnitMap[MI];
+    AntiDepEdges(PathSU, Edges);
+
+    // If MI is not on the critical path, then we don't rename
+    // registers in the CriticalPathSet.
+    BitVector *ExcludeRegs = NULL;
+    if (MI == CriticalPathMI) {
+      CriticalPathSU = CriticalPathStep(CriticalPathSU);
+      CriticalPathMI = (CriticalPathSU) ? CriticalPathSU->getInstr() : 0;
+    } else {
+      ExcludeRegs = &CriticalPathSet;
+    }
+
+    // Ignore KILL instructions (they form a group in ScanInstruction
+    // but don't cause any anti-dependence breaking themselves)
+    if (!MI->isKill()) {
+      // Attempt to break each anti-dependency...
+      for (unsigned i = 0, e = Edges.size(); i != e; ++i) {
+        const SDep *Edge = Edges[i];
+        SUnit *NextSU = Edge->getSUnit();
+
+        if ((Edge->getKind() != SDep::Anti) &&
+            (Edge->getKind() != SDep::Output)) continue;
+
+        unsigned AntiDepReg = Edge->getReg();
+        DEBUG(dbgs() << "\tAntidep reg: " << TRI->getName(AntiDepReg));
+        assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
+
+        if (!AllocatableSet.test(AntiDepReg)) {
+          // Don't break anti-dependencies on non-allocatable registers.
+          DEBUG(dbgs() << " (non-allocatable)\n");
+          continue;
+        } else if ((ExcludeRegs != NULL) && ExcludeRegs->test(AntiDepReg)) {
+          // Don't break anti-dependencies for critical path registers
+          // if not on the critical path
+          DEBUG(dbgs() << " (not critical-path)\n");
+          continue;
+        } else if (PassthruRegs.count(AntiDepReg) != 0) {
+          // If the anti-dep register liveness "passes-thru", then
+          // don't try to change it. It will be changed along with
+          // the use if required to break an earlier antidep.
+          DEBUG(dbgs() << " (passthru)\n");
+          continue;
+        } else {
+          // No anti-dep breaking for implicit deps
+          MachineOperand *AntiDepOp = MI->findRegisterDefOperand(AntiDepReg);
+          assert(AntiDepOp != NULL &&
+                 "Can't find index for defined register operand");
+          if ((AntiDepOp == NULL) || AntiDepOp->isImplicit()) {
+            DEBUG(dbgs() << " (implicit)\n");
+            continue;
+          }
+
+          // If the SUnit has other dependencies on the SUnit that
+          // it anti-depends on, don't bother breaking the
+          // anti-dependency since those edges would prevent such
+          // units from being scheduled past each other
+          // regardless.
+          //
+          // Also, if there are dependencies on other SUnits with the
+          // same register as the anti-dependency, don't attempt to
+          // break it.
+          for (SUnit::const_pred_iterator P = PathSU->Preds.begin(),
+                 PE = PathSU->Preds.end(); P != PE; ++P) {
+            if (P->getSUnit() == NextSU ?
+                (P->getKind() != SDep::Anti || P->getReg() != AntiDepReg) :
+                (P->getKind() == SDep::Data && P->getReg() == AntiDepReg)) {
+              AntiDepReg = 0;
+              break;
+            }
+          }
+          for (SUnit::const_pred_iterator P = PathSU->Preds.begin(),
+                 PE = PathSU->Preds.end(); P != PE; ++P) {
+            if ((P->getSUnit() == NextSU) && (P->getKind() != SDep::Anti) &&
+                (P->getKind() != SDep::Output)) {
+              DEBUG(dbgs() << " (real dependency)\n");
+              AntiDepReg = 0;
+              break;
+            } else if ((P->getSUnit() != NextSU) &&
+                       (P->getKind() == SDep::Data) &&
+                       (P->getReg() == AntiDepReg)) {
+              DEBUG(dbgs() << " (other dependency)\n");
+              AntiDepReg = 0;
+              break;
+            }
+          }
+
+          if (AntiDepReg == 0) continue;
+        }
+
+        assert(AntiDepReg != 0);
+        if (AntiDepReg == 0) continue;
+
+        // Determine AntiDepReg's register group.
+        const unsigned GroupIndex = State->GetGroup(AntiDepReg);
+        if (GroupIndex == 0) {
+          DEBUG(dbgs() << " (zero group)\n");
+          continue;
+        }
+
+        DEBUG(dbgs() << '\n');
+
+        // Look for a suitable register to use to break the anti-dependence.
+        std::map<unsigned, unsigned> RenameMap;
+        if (FindSuitableFreeRegisters(GroupIndex, RenameOrder, RenameMap)) {
+          DEBUG(dbgs() << "\tBreaking anti-dependence edge on "
+                << TRI->getName(AntiDepReg) << ":");
+
+          // Handle each group register...
+          for (std::map<unsigned, unsigned>::iterator
+                 S = RenameMap.begin(), E = RenameMap.end(); S != E; ++S) {
+            unsigned CurrReg = S->first;
+            unsigned NewReg = S->second;
+
+            DEBUG(dbgs() << " " << TRI->getName(CurrReg) << "->" <<
+                  TRI->getName(NewReg) << "(" <<
+                  RegRefs.count(CurrReg) << " refs)");
+
+            // Update the references to the old register CurrReg to
+            // refer to the new register NewReg.
+            std::pair<std::multimap<unsigned,
+                           AggressiveAntiDepState::RegisterReference>::iterator,
+                      std::multimap<unsigned,
+                           AggressiveAntiDepState::RegisterReference>::iterator>
+              Range = RegRefs.equal_range(CurrReg);
+            for (std::multimap<unsigned,
+                 AggressiveAntiDepState::RegisterReference>::iterator
+                   Q = Range.first, QE = Range.second; Q != QE; ++Q) {
+              Q->second.Operand->setReg(NewReg);
+              // If the SU for the instruction being updated has debug
+              // information related to the anti-dependency register, make
+              // sure to update that as well.
+              const SUnit *SU = MISUnitMap[Q->second.Operand->getParent()];
+              if (!SU) continue;
+              for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i) {
+                MachineInstr *DI = SU->DbgInstrList[i];
+                assert (DI->getNumOperands()==3 && DI->getOperand(0).isReg() &&
+                        DI->getOperand(0).getReg()
+                        && "Non register dbg_value attached to SUnit!");
+                if (DI->getOperand(0).getReg() == AntiDepReg)
+                  DI->getOperand(0).setReg(NewReg);
+              }
+            }
+
+            // We just went back in time and modified history; the
+            // liveness information for CurrReg is now inconsistent. Set
+            // the state as if it were dead.
+            State->UnionGroups(NewReg, 0);
+            RegRefs.erase(NewReg);
+            DefIndices[NewReg] = DefIndices[CurrReg];
+            KillIndices[NewReg] = KillIndices[CurrReg];
+
+            State->UnionGroups(CurrReg, 0);
+            RegRefs.erase(CurrReg);
+            DefIndices[CurrReg] = KillIndices[CurrReg];
+            KillIndices[CurrReg] = ~0u;
+            assert(((KillIndices[CurrReg] == ~0u) !=
+                    (DefIndices[CurrReg] == ~0u)) &&
+                   "Kill and Def maps aren't consistent for AntiDepReg!");
+          }
+
+          ++Broken;
+          DEBUG(dbgs() << '\n');
+        }
+      }
+    }
+
+    ScanInstruction(MI, Count);
+  }
+
+  return Broken;
+}
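
Note on the file above: the grouping machinery (GetGroup, UnionGroups, LeaveGroup) is a small union-find variant with two twists: unions always keep group 0 (the non-renamable group) as the root, and leaving a group allocates a fresh node instead of rewriting stale parent links. A minimal standalone sketch of the same scheme, illustrative only and not part of the code drop:

    #include <cassert>
    #include <vector>

    // Illustrative mirror of AggressiveAntiDepState's grouping.
    struct RegGroups {
      std::vector<unsigned> GroupNodes;       // union-find forest
      std::vector<unsigned> GroupNodeIndices; // register -> current node

      explicit RegGroups(unsigned NumRegs)
          : GroupNodes(NumRegs), GroupNodeIndices(NumRegs) {
        for (unsigned i = 0; i < NumRegs; ++i)
          GroupNodes[i] = GroupNodeIndices[i] = i; // each reg alone
      }

      unsigned GetGroup(unsigned Reg) {
        unsigned Node = GroupNodeIndices[Reg];
        while (GroupNodes[Node] != Node) // chase parent links to the root
          Node = GroupNodes[Node];
        return Node;
      }

      // Group 0 always wins a union: it holds the non-renamable registers.
      unsigned UnionGroups(unsigned Reg1, unsigned Reg2) {
        unsigned G1 = GetGroup(Reg1), G2 = GetGroup(Reg2);
        unsigned Parent = (G1 == 0) ? G1 : G2;
        unsigned Other = (Parent == G1) ? G2 : G1;
        GroupNodes[Other] = Parent;
        return Parent;
      }

      // A fresh node detaches Reg without touching nodes that still
      // point at its old group.
      unsigned LeaveGroup(unsigned Reg) {
        unsigned idx = GroupNodes.size();
        GroupNodes.push_back(idx);
        GroupNodeIndices[Reg] = idx;
        return idx;
      }
    };

    int main() {
      RegGroups G(4);
      G.UnionGroups(1, 2);                  // 1 and 2 must rename together
      assert(G.GetGroup(1) == G.GetGroup(2));
      G.UnionGroups(3, 0);                  // pin 3: group 0 is never renamed
      assert(G.GetGroup(3) == 0);
      G.LeaveGroup(1);                      // 1 starts a fresh group of its own
      assert(G.GetGroup(1) != G.GetGroup(2));
      return 0;
    }
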
diff --git a/src/LLVM/lib/CodeGen/AggressiveAntiDepBreaker.h b/src/LLVM/lib/CodeGen/AggressiveAntiDepBreaker.h
new file mode 100644
index 0000000..9d715cc
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/AggressiveAntiDepBreaker.h
@@ -0,0 +1,184 @@
+//=- llvm/CodeGen/AggressiveAntiDepBreaker.h - Anti-Dep Support -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AggressiveAntiDepBreaker class, which
+// implements register anti-dependence breaking during post-RA
+// scheduling. It attempts to break all anti-dependencies within a
+// block.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
+#define LLVM_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
+
+#include "AntiDepBreaker.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Target/TargetSubtarget.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include <map>
+
+namespace llvm {
+  /// Class AggressiveAntiDepState
+  /// Contains all the state necessary for anti-dep breaking.
+  class AggressiveAntiDepState {
+  public:
+    /// RegisterReference - Information about a register reference
+    /// within a live range
+    typedef struct {
+      /// Operand - The register's operand
+      MachineOperand *Operand;
+      /// RC - The register class
+      const TargetRegisterClass *RC;
+    } RegisterReference;
+
+  private:
+    /// NumTargetRegs - Number of non-virtual target registers
+    /// (i.e. TRI->getNumRegs()).
+    const unsigned NumTargetRegs;
+
+    /// GroupNodes - Implements a disjoint-union data structure to
+    /// form register groups. A node is represented by an index into
+    /// the vector. A node can "point to" itself to indicate that it
+    /// is the parent of a group, or point to another node to indicate
+    /// that it is a member of the same group as that node.
+    std::vector<unsigned> GroupNodes;
+
+    /// GroupNodeIndices - For each register, the index of the GroupNode
+    /// currently representing the group that the register belongs to.
+    /// Register 0 is always represented by the 0 group, a group
+    /// composed of registers that are not eligible for anti-dep breaking.
+    std::vector<unsigned> GroupNodeIndices;
+
+    /// RegRefs - Map registers to all their references within a live range.
+    std::multimap<unsigned, RegisterReference> RegRefs;
+
+    /// KillIndices - The index of the most recent kill (proceeding
+    /// bottom-up), or ~0u if the register is not live.
+    std::vector<unsigned> KillIndices;
+
+    /// DefIndices - The index of the most recent complete def (proceeding
+    /// bottom-up), or ~0u if the register is live.
+    std::vector<unsigned> DefIndices;
+
+  public:
+    AggressiveAntiDepState(const unsigned TargetRegs, MachineBasicBlock *BB);
+
+    /// GetKillIndices - Return the kill indices.
+    std::vector<unsigned> &GetKillIndices() { return KillIndices; }
+
+    /// GetDefIndices - Return the define indices.
+    std::vector<unsigned> &GetDefIndices() { return DefIndices; }
+
+    /// GetRegRefs - Return the RegRefs map.
+    std::multimap<unsigned, RegisterReference>& GetRegRefs() { return RegRefs; }
+
+    // GetGroup - Get the group for a register. The returned value is
+    // the index of the GroupNode representing the group.
+    unsigned GetGroup(unsigned Reg);
+
+    // GetGroupRegs - Return a vector of the registers belonging to a
+    // group. If RegRefs is non-NULL then only include referenced registers.
+    void GetGroupRegs(
+       unsigned Group,
+       std::vector<unsigned> &Regs,
+       std::multimap<unsigned,
+         AggressiveAntiDepState::RegisterReference> *RegRefs);
+
+    // UnionGroups - Union Reg1's and Reg2's groups to form a new
+    // group. Return the index of the GroupNode representing the
+    // group.
+    unsigned UnionGroups(unsigned Reg1, unsigned Reg2);
+
+    // LeaveGroup - Remove a register from its current group and place
+    // it alone in its own group. Return the index of the GroupNode
+    // representing the register's new group.
+    unsigned LeaveGroup(unsigned Reg);
+
+    /// IsLive - Return true if Reg is live
+    bool IsLive(unsigned Reg);
+  };
+
+
+  /// Class AggressiveAntiDepBreaker
+  class AggressiveAntiDepBreaker : public AntiDepBreaker {
+    MachineFunction& MF;
+    MachineRegisterInfo &MRI;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+
+    /// AllocatableSet - The set of allocatable registers.
+    /// We'll be ignoring anti-dependencies on non-allocatable registers,
+    /// because they may not be safe to break.
+    const BitVector AllocatableSet;
+
+    /// CriticalPathSet - The set of registers that should only be
+    /// renamed if they are on the critical path.
+    BitVector CriticalPathSet;
+
+    /// State - The state used to identify and rename anti-dependence
+    /// registers.
+    AggressiveAntiDepState *State;
+
+  public:
+    AggressiveAntiDepBreaker(MachineFunction& MFi,
+                             TargetSubtarget::RegClassVector& CriticalPathRCs);
+    ~AggressiveAntiDepBreaker();
+
+    /// StartBlock - Initialize anti-dep breaking for a new basic block.
+    void StartBlock(MachineBasicBlock *BB);
+
+    /// BreakAntiDependencies - Identify anti-dependencies along the
+    /// critical path of the ScheduleDAG and break them by renaming
+    /// registers.
+    ///
+    unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
+                                   MachineBasicBlock::iterator Begin,
+                                   MachineBasicBlock::iterator End,
+                                   unsigned InsertPosIndex);
+
+    /// Observe - Update liveness information to account for the current
+    /// instruction, which will not be scheduled.
+    ///
+    void Observe(MachineInstr *MI, unsigned Count, unsigned InsertPosIndex);
+
+    /// FinishBlock - Finish anti-dep breaking for a basic block.
+    void FinishBlock();
+
+  private:
+    typedef std::map<const TargetRegisterClass *,
+                     TargetRegisterClass::const_iterator> RenameOrderType;
+
+    /// IsImplicitDefUse - Return true if MO represents a register
+    /// that is both implicitly used and defined in MI
+    bool IsImplicitDefUse(MachineInstr *MI, MachineOperand& MO);
+
+    /// GetPassthruRegs - If MI implicitly def/uses a register, then
+    /// return that register and all subregisters.
+    void GetPassthruRegs(MachineInstr *MI, std::set<unsigned>& PassthruRegs);
+
+    void HandleLastUse(unsigned Reg, unsigned KillIdx, const char *tag,
+                       const char *header =NULL, const char *footer =NULL);
+
+    void PrescanInstruction(MachineInstr *MI, unsigned Count,
+                            std::set<unsigned>& PassthruRegs);
+    void ScanInstruction(MachineInstr *MI, unsigned Count);
+    BitVector GetRenameRegisters(unsigned Reg);
+    bool FindSuitableFreeRegisters(unsigned AntiDepGroupIndex,
+                                   RenameOrderType& RenameOrder,
+                                   std::map<unsigned, unsigned> &RenameMap);
+  };
+}
+
+#endif
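
One detail in the header above that is easy to misread: KillIndices and DefIndices count instructions bottom-up, and their ~0u sentinels are deliberately asymmetric (a live register has a kill index recorded but no pending def index). A self-contained illustration of that convention, using the same initialization StartBlock performs:

    #include <cassert>
    #include <vector>

    int main() {
      const unsigned BBSize = 10;           // instructions in the block
      std::vector<unsigned> Kill(1, ~0u);   // ~0u: no kill recorded yet
      std::vector<unsigned> Def(1, BBSize); // initialized as in StartBlock

      // Mirrors AggressiveAntiDepState::IsLive.
      auto IsLive = [&](unsigned R) { return Kill[R] != ~0u && Def[R] == ~0u; };
      assert(!IsLive(0));                   // initial state: not live

      Kill[0] = BBSize;                     // live-out: "killed" at block end
      Def[0] = ~0u;                         // no def seen yet, walking bottom-up
      assert(IsLive(0));

      Def[0] = 4;                           // a def at index 4 closes the
      Kill[0] = ~0u;                        // live range on the way up
      assert(!IsLive(0));
      return 0;
    }
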
diff --git a/src/LLVM/lib/CodeGen/Analysis.cpp b/src/LLVM/lib/CodeGen/Analysis.cpp
new file mode 100644
index 0000000..2acfa77
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/Analysis.cpp
@@ -0,0 +1,285 @@
+//===-- Analysis.cpp - CodeGen LLVM IR Analysis Utilities ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines several CodeGen-specific LLVM IR analysis utilities.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+/// ComputeLinearIndex - Given an LLVM IR aggregate type and a sequence
+/// of insertvalue or extractvalue indices that identify a member, return
+/// the linearized index of the start of the member.
+///
+unsigned llvm::ComputeLinearIndex(const TargetLowering &TLI, const Type *Ty,
+                                  const unsigned *Indices,
+                                  const unsigned *IndicesEnd,
+                                  unsigned CurIndex) {
+  // Base case: We're done.
+  if (Indices && Indices == IndicesEnd)
+    return CurIndex;
+
+  // Given a struct type, recursively traverse the elements.
+  if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+    for (StructType::element_iterator EB = STy->element_begin(),
+                                      EI = EB,
+                                      EE = STy->element_end();
+        EI != EE; ++EI) {
+      if (Indices && *Indices == unsigned(EI - EB))
+        return ComputeLinearIndex(TLI, *EI, Indices+1, IndicesEnd, CurIndex);
+      CurIndex = ComputeLinearIndex(TLI, *EI, 0, 0, CurIndex);
+    }
+    return CurIndex;
+  }
+  // Given an array type, recursively traverse the elements.
+  else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    const Type *EltTy = ATy->getElementType();
+    for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i) {
+      if (Indices && *Indices == i)
+        return ComputeLinearIndex(TLI, EltTy, Indices+1, IndicesEnd, CurIndex);
+      CurIndex = ComputeLinearIndex(TLI, EltTy, 0, 0, CurIndex);
+    }
+    return CurIndex;
+  }
+  // We haven't found the type we're looking for, so keep searching.
+  return CurIndex + 1;
+}
+
+/// ComputeValueVTs - Given an LLVM IR type, compute a sequence of
+/// EVTs that represent all the individual underlying
+/// non-aggregate types that comprise it.
+///
+/// If Offsets is non-null, it points to a vector to be filled in
+/// with the in-memory offsets of each of the individual values.
+///
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const Type *Ty,
+                           SmallVectorImpl<EVT> &ValueVTs,
+                           SmallVectorImpl<uint64_t> *Offsets,
+                           uint64_t StartingOffset) {
+  // Given a struct type, recursively traverse the elements.
+  if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+    const StructLayout *SL = TLI.getTargetData()->getStructLayout(STy);
+    for (StructType::element_iterator EB = STy->element_begin(),
+                                      EI = EB,
+                                      EE = STy->element_end();
+         EI != EE; ++EI)
+      ComputeValueVTs(TLI, *EI, ValueVTs, Offsets,
+                      StartingOffset + SL->getElementOffset(EI - EB));
+    return;
+  }
+  // Given an array type, recursively traverse the elements.
+  if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    const Type *EltTy = ATy->getElementType();
+    uint64_t EltSize = TLI.getTargetData()->getTypeAllocSize(EltTy);
+    for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
+      ComputeValueVTs(TLI, EltTy, ValueVTs, Offsets,
+                      StartingOffset + i * EltSize);
+    return;
+  }
+  // Interpret void as zero return values.
+  if (Ty->isVoidTy())
+    return;
+  // Base case: we can get an EVT for this LLVM IR type.
+  ValueVTs.push_back(TLI.getValueType(Ty));
+  if (Offsets)
+    Offsets->push_back(StartingOffset);
+}
+
+/// ExtractTypeInfo - Returns the type info, possibly bitcast, encoded in V.
+GlobalVariable *llvm::ExtractTypeInfo(Value *V) {
+  V = V->stripPointerCasts();
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(V);
+
+  if (GV && GV->getName() == "llvm.eh.catch.all.value") {
+    assert(GV->hasInitializer() &&
+           "The EH catch-all value must have an initializer");
+    Value *Init = GV->getInitializer();
+    GV = dyn_cast<GlobalVariable>(Init);
+    if (!GV) V = cast<ConstantPointerNull>(Init);
+  }
+
+  assert((GV || isa<ConstantPointerNull>(V)) &&
+         "TypeInfo must be a global variable or NULL");
+  return GV;
+}
+
+/// hasInlineAsmMemConstraint - Return true if the inline asm instruction being
+/// processed uses a memory 'm' constraint.
+bool
+llvm::hasInlineAsmMemConstraint(std::vector<InlineAsm::ConstraintInfo> &CInfos,
+                                const TargetLowering &TLI) {
+  for (unsigned i = 0, e = CInfos.size(); i != e; ++i) {
+    InlineAsm::ConstraintInfo &CI = CInfos[i];
+    for (unsigned j = 0, ee = CI.Codes.size(); j != ee; ++j) {
+      TargetLowering::ConstraintType CType = TLI.getConstraintType(CI.Codes[j]);
+      if (CType == TargetLowering::C_Memory)
+        return true;
+    }
+
+    // Indirect operand accesses access memory.
+    if (CI.isIndirect)
+      return true;
+  }
+
+  return false;
+}
+
+/// getFCmpCondCode - Return the ISD condition code corresponding to
+/// the given LLVM IR floating-point condition code.  This includes
+/// consideration of global floating-point math flags.
+///
+ISD::CondCode llvm::getFCmpCondCode(FCmpInst::Predicate Pred) {
+  ISD::CondCode FPC, FOC;
+  switch (Pred) {
+  case FCmpInst::FCMP_FALSE: FOC = FPC = ISD::SETFALSE; break;
+  case FCmpInst::FCMP_OEQ:   FOC = ISD::SETEQ; FPC = ISD::SETOEQ; break;
+  case FCmpInst::FCMP_OGT:   FOC = ISD::SETGT; FPC = ISD::SETOGT; break;
+  case FCmpInst::FCMP_OGE:   FOC = ISD::SETGE; FPC = ISD::SETOGE; break;
+  case FCmpInst::FCMP_OLT:   FOC = ISD::SETLT; FPC = ISD::SETOLT; break;
+  case FCmpInst::FCMP_OLE:   FOC = ISD::SETLE; FPC = ISD::SETOLE; break;
+  case FCmpInst::FCMP_ONE:   FOC = ISD::SETNE; FPC = ISD::SETONE; break;
+  case FCmpInst::FCMP_ORD:   FOC = FPC = ISD::SETO;   break;
+  case FCmpInst::FCMP_UNO:   FOC = FPC = ISD::SETUO;  break;
+  case FCmpInst::FCMP_UEQ:   FOC = ISD::SETEQ; FPC = ISD::SETUEQ; break;
+  case FCmpInst::FCMP_UGT:   FOC = ISD::SETGT; FPC = ISD::SETUGT; break;
+  case FCmpInst::FCMP_UGE:   FOC = ISD::SETGE; FPC = ISD::SETUGE; break;
+  case FCmpInst::FCMP_ULT:   FOC = ISD::SETLT; FPC = ISD::SETULT; break;
+  case FCmpInst::FCMP_ULE:   FOC = ISD::SETLE; FPC = ISD::SETULE; break;
+  case FCmpInst::FCMP_UNE:   FOC = ISD::SETNE; FPC = ISD::SETUNE; break;
+  case FCmpInst::FCMP_TRUE:  FOC = FPC = ISD::SETTRUE; break;
+  default:
+    llvm_unreachable("Invalid FCmp predicate opcode!");
+    FOC = FPC = ISD::SETFALSE;
+    break;
+  }
+  if (NoNaNsFPMath)
+    return FOC;
+  else
+    return FPC;
+}
+
+/// getICmpCondCode - Return the ISD condition code corresponding to
+/// the given LLVM IR integer condition code.
+///
+ISD::CondCode llvm::getICmpCondCode(ICmpInst::Predicate Pred) {
+  switch (Pred) {
+  case ICmpInst::ICMP_EQ:  return ISD::SETEQ;
+  case ICmpInst::ICMP_NE:  return ISD::SETNE;
+  case ICmpInst::ICMP_SLE: return ISD::SETLE;
+  case ICmpInst::ICMP_ULE: return ISD::SETULE;
+  case ICmpInst::ICMP_SGE: return ISD::SETGE;
+  case ICmpInst::ICMP_UGE: return ISD::SETUGE;
+  case ICmpInst::ICMP_SLT: return ISD::SETLT;
+  case ICmpInst::ICMP_ULT: return ISD::SETULT;
+  case ICmpInst::ICMP_SGT: return ISD::SETGT;
+  case ICmpInst::ICMP_UGT: return ISD::SETUGT;
+  default:
+    llvm_unreachable("Invalid ICmp predicate opcode!");
+    return ISD::SETNE;
+  }
+}
+
+/// Test if the given instruction is in a position to be optimized
+/// with a tail-call. This roughly means that it's in a block with
+/// a return and there's nothing that needs to be scheduled
+/// between it and the return.
+///
+/// This function only tests target-independent requirements.
+bool llvm::isInTailCallPosition(ImmutableCallSite CS, Attributes CalleeRetAttr,
+                                const TargetLowering &TLI) {
+  const Instruction *I = CS.getInstruction();
+  const BasicBlock *ExitBB = I->getParent();
+  const TerminatorInst *Term = ExitBB->getTerminator();
+  const ReturnInst *Ret = dyn_cast<ReturnInst>(Term);
+  const Function *F = ExitBB->getParent();
+
+  // The block must end in a return statement or unreachable.
+  //
+  // FIXME: Decline tailcall if it's not guaranteed and if the block ends in
+  // an unreachable, for now. The way tailcall optimization is currently
+  // implemented means it will add an epilogue followed by a jump. That is
+  // not profitable. Also, if the callee is a special function (e.g.
+  // longjmp on x86), it can end up causing miscompilation that has not
+  // been fully understood.
+  if (!Ret &&
+      (!GuaranteedTailCallOpt || !isa<UnreachableInst>(Term))) return false;
+
+  // If I will have a chain, make sure no other instruction that will have a
+  // chain interposes between I and the return.
+  if (I->mayHaveSideEffects() || I->mayReadFromMemory() ||
+      !I->isSafeToSpeculativelyExecute())
+    for (BasicBlock::const_iterator BBI = prior(prior(ExitBB->end())); ;
+         --BBI) {
+      if (&*BBI == I)
+        break;
+      // Debug info intrinsics do not get in the way of tail call optimization.
+      if (isa<DbgInfoIntrinsic>(BBI))
+        continue;
+      if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() ||
+          !BBI->isSafeToSpeculativelyExecute())
+        return false;
+    }
+
+  // If the block ends with a void return or unreachable, it doesn't matter
+  // what the call's return type is.
+  if (!Ret || Ret->getNumOperands() == 0) return true;
+
+  // If the return value is undef, it doesn't matter what the call's
+  // return type is.
+  if (isa<UndefValue>(Ret->getOperand(0))) return true;
+
+  // Conservatively require the attributes of the call to match those of
+  // the return. Ignore noalias because it doesn't affect the call sequence.
+  unsigned CallerRetAttr = F->getAttributes().getRetAttributes();
+  if ((CalleeRetAttr ^ CallerRetAttr) & ~Attribute::NoAlias)
+    return false;
+
+  // It's not safe to eliminate the sign / zero extension of the return value.
+  if ((CallerRetAttr & Attribute::ZExt) || (CallerRetAttr & Attribute::SExt))
+    return false;
+
+  // Otherwise, make sure the unmodified return value of I is the return value.
+  for (const Instruction *U = dyn_cast<Instruction>(Ret->getOperand(0)); ;
+       U = dyn_cast<Instruction>(U->getOperand(0))) {
+    if (!U)
+      return false;
+    if (!U->hasOneUse())
+      return false;
+    if (U == I)
+      break;
+    // Check for a truly no-op truncate.
+    if (isa<TruncInst>(U) &&
+        TLI.isTruncateFree(U->getOperand(0)->getType(), U->getType()))
+      continue;
+    // Check for a truly no-op bitcast.
+    if (isa<BitCastInst>(U) &&
+        (U->getOperand(0)->getType() == U->getType() ||
+         (U->getOperand(0)->getType()->isPointerTy() &&
+          U->getType()->isPointerTy())))
+      continue;
+    // Otherwise it's not a true no-op.
+    return false;
+  }
+
+  return true;
+}
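+
+// Illustrative sketch (hypothetical, not part of this drop): a lowering pass
+// could use this check to demote a tail call that fails the rules above.
+// 'CI', 'CS', and 'TLI' are assumed to be in scope.
+//
+//   if (CI->isTailCall() &&
+//       !isInTailCallPosition(CS, CS.getAttributes().getRetAttributes(), TLI))
+//     CI->setTailCall(false);  // cannot safely be emitted as a tail call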
+
diff --git a/src/LLVM/lib/CodeGen/AntiDepBreaker.h b/src/LLVM/lib/CodeGen/AntiDepBreaker.h
new file mode 100644
index 0000000..086b757
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/AntiDepBreaker.h
@@ -0,0 +1,59 @@
+//=- llvm/CodeGen/AntiDepBreaker.h - Anti-Dependence Breaking -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AntiDepBreaker class, which implements
+// anti-dependence breaking heuristics for post-register-allocation scheduling.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_ANTIDEPBREAKER_H
+#define LLVM_CODEGEN_ANTIDEPBREAKER_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <vector>
+
+namespace llvm {
+
+/// AntiDepBreaker - This class works in conjunction with the
+/// post-RA scheduler to rename registers to break register
+/// anti-dependencies.
+class AntiDepBreaker {
+public:
+  virtual ~AntiDepBreaker();
+
+  /// Start - Initialize anti-dep breaking for a new basic block.
+  virtual void StartBlock(MachineBasicBlock *BB) =0;
+
+  /// BreakAntiDependencies - Identify anti-dependencies within a
+  /// basic-block region and break them by renaming registers. Return
+  /// the number of anti-dependencies broken.
+  ///
+  virtual unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
+                                MachineBasicBlock::iterator Begin,
+                                MachineBasicBlock::iterator End,
+                                unsigned InsertPosIndex) =0;
+  
+  /// Observe - Update liveness information to account for the current
+  /// instruction, which will not be scheduled.
+  ///
+  virtual void Observe(MachineInstr *MI, unsigned Count,
+                       unsigned InsertPosIndex) =0;
+  
+  /// Finish - Finish anti-dep breaking for a basic block.
+  virtual void FinishBlock() =0;
+};
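+
+// Illustrative sketch (hypothetical): a minimal subclass must override all
+// four pure virtual methods, e.g. a no-op breaker that renames nothing:
+//
+//   class NoopAntiDepBreaker : public AntiDepBreaker {
+//   public:
+//     void StartBlock(MachineBasicBlock *BB) {}
+//     unsigned BreakAntiDependencies(const std::vector<SUnit> &SUnits,
+//                                    MachineBasicBlock::iterator Begin,
+//                                    MachineBasicBlock::iterator End,
+//                                    unsigned InsertPosIndex) { return 0; }
+//     void Observe(MachineInstr *MI, unsigned Count,
+//                  unsigned InsertPosIndex) {}
+//     void FinishBlock() {}
+//   };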
+
+} // end namespace llvm
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/AsmPrinter/CMakeLists.txt b/src/LLVM/lib/CodeGen/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..ca8b843
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_llvm_library(LLVMAsmPrinter
+  AsmPrinter.cpp
+  AsmPrinterDwarf.cpp
+  AsmPrinterInlineAsm.cpp
+  DIE.cpp
+  DwarfDebug.cpp
+  DwarfException.cpp
+  OcamlGCPrinter.cpp
+  )
+
+target_link_libraries (LLVMAsmPrinter LLVMMCParser)
diff --git a/src/LLVM/lib/CodeGen/AsmPrinter/DIE.h b/src/LLVM/lib/CodeGen/AsmPrinter/DIE.h
new file mode 100644
index 0000000..d56c094
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/AsmPrinter/DIE.h
@@ -0,0 +1,434 @@
+//===- lib/CodeGen/AsmPrinter/DIE.h - DWARF Info Entries --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Data structures for DWARF info entries.
+// 
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DIE_H__
+#define CODEGEN_ASMPRINTER_DIE_H__
+
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Dwarf.h"
+#include <vector>
+
+namespace llvm {
+  class AsmPrinter;
+  class MCSymbol;
+  class raw_ostream;
+
+  //===--------------------------------------------------------------------===//
+  /// DIEAbbrevData - Dwarf abbreviation data, describes the one attribute of a
+  /// Dwarf abbreviation.
+  class DIEAbbrevData {
+    /// Attribute - Dwarf attribute code.
+    ///
+    unsigned Attribute;
+
+    /// Form - Dwarf form code.
+    ///
+    unsigned Form;
+  public:
+    DIEAbbrevData(unsigned A, unsigned F) : Attribute(A), Form(F) {}
+
+    // Accessors.
+    unsigned getAttribute() const { return Attribute; }
+    unsigned getForm()      const { return Form; }
+
+    /// Profile - Used to gather unique data for the abbreviation folding set.
+    ///
+    void Profile(FoldingSetNodeID &ID) const;
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIEAbbrev - Dwarf abbreviation, describes the organization of a debug
+  /// information object.
+  class DIEAbbrev : public FoldingSetNode {
+    /// Tag - Dwarf tag code.
+    ///
+    unsigned Tag;
+
+    /// Unique number for node.
+    ///
+    unsigned Number;
+
+    /// ChildrenFlag - Dwarf children flag.
+    ///
+    unsigned ChildrenFlag;
+
+    /// Data - Raw data bytes for abbreviation.
+    ///
+    SmallVector<DIEAbbrevData, 8> Data;
+
+  public:
+    DIEAbbrev(unsigned T, unsigned C) : Tag(T), ChildrenFlag(C), Data() {}
+    virtual ~DIEAbbrev() {}
+
+    // Accessors.
+    unsigned getTag() const { return Tag; }
+    unsigned getNumber() const { return Number; }
+    unsigned getChildrenFlag() const { return ChildrenFlag; }
+    const SmallVector<DIEAbbrevData, 8> &getData() const { return Data; }
+    void setTag(unsigned T) { Tag = T; }
+    void setChildrenFlag(unsigned CF) { ChildrenFlag = CF; }
+    void setNumber(unsigned N) { Number = N; }
+
+    /// AddAttribute - Adds another set of attribute information to the
+    /// abbreviation.
+    void AddAttribute(unsigned Attribute, unsigned Form) {
+      Data.push_back(DIEAbbrevData(Attribute, Form));
+    }
+
+    /// AddFirstAttribute - Adds a set of attribute information to the front
+    /// of the abbreviation.
+    void AddFirstAttribute(unsigned Attribute, unsigned Form) {
+      Data.insert(Data.begin(), DIEAbbrevData(Attribute, Form));
+    }
+
+    /// Profile - Used to gather unique data for the abbreviation folding set.
+    ///
+    void Profile(FoldingSetNodeID &ID) const;
+
+    /// Emit - Print the abbreviation using the specified asm printer.
+    ///
+    void Emit(AsmPrinter *AP) const;
+
+#ifndef NDEBUG
+    void print(raw_ostream &O);
+    void dump();
+#endif
+  };
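+
+  // Illustrative sketch (hypothetical): building the abbreviation for a
+  // variable DIE carrying a name and a type attribute:
+  //
+  //   DIEAbbrev Abbrev(dwarf::DW_TAG_variable, dwarf::DW_CHILDREN_no);
+  //   Abbrev.AddAttribute(dwarf::DW_AT_name, dwarf::DW_FORM_string);
+  //   Abbrev.AddAttribute(dwarf::DW_AT_type, dwarf::DW_FORM_ref4);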
+
+  class DIEValue;
+
+  //===--------------------------------------------------------------------===//
+  /// DIE - A structured debug information entry.  Has an abbreviation which
+  /// describes its organization.
+  class DIE {
+  protected:
+    /// Abbrev - Buffer for constructing abbreviation.
+    ///
+    DIEAbbrev Abbrev;
+
+    /// Offset - Offset in debug info section.
+    ///
+    unsigned Offset;
+
+    /// Size - Size of instance + children.
+    ///
+    unsigned Size;
+
+    /// Children DIEs.
+    ///
+    std::vector<DIE *> Children;
+
+    DIE *Parent;
+
+    /// Attribute values.
+    ///
+    SmallVector<DIEValue*, 32> Values;
+
+    // Private data for print()
+    mutable unsigned IndentCount;
+  public:
+    explicit DIE(unsigned Tag)
+      : Abbrev(Tag, dwarf::DW_CHILDREN_no), Offset(0),
+        Size(0), Parent (0), IndentCount(0) {}
+    virtual ~DIE();
+
+    // Accessors.
+    DIEAbbrev &getAbbrev() { return Abbrev; }
+    unsigned getAbbrevNumber() const { return Abbrev.getNumber(); }
+    unsigned getTag() const { return Abbrev.getTag(); }
+    unsigned getOffset() const { return Offset; }
+    unsigned getSize() const { return Size; }
+    const std::vector<DIE *> &getChildren() const { return Children; }
+    const SmallVector<DIEValue*, 32> &getValues() const { return Values; }
+    DIE *getParent() const { return Parent; }
+    void setTag(unsigned Tag) { Abbrev.setTag(Tag); }
+    void setOffset(unsigned O) { Offset = O; }
+    void setSize(unsigned S) { Size = S; }
+    
+    /// addValue - Add a value and attributes to a DIE.
+    ///
+    void addValue(unsigned Attribute, unsigned Form, DIEValue *Value) {
+      Abbrev.AddAttribute(Attribute, Form);
+      Values.push_back(Value);
+    }
+
+    /// SiblingOffset - Return the offset of the debug information entry's
+    /// sibling.
+    unsigned getSiblingOffset() const { return Offset + Size; }
+
+    /// addSiblingOffset - Add a sibling offset field to the front of the DIE.
+    /// The caller is responsible for deleting the return value at or after the
+    /// same time it destroys this DIE.
+    ///
+    DIEValue *addSiblingOffset(BumpPtrAllocator &A);
+
+    /// addChild - Add a child to the DIE.
+    ///
+    void addChild(DIE *Child) {
+      if (Child->getParent()) {
+        assert (Child->getParent() == this && "Unexpected DIE Parent!");
+        return;
+      }
+      Abbrev.setChildrenFlag(dwarf::DW_CHILDREN_yes);
+      Children.push_back(Child);
+      Child->Parent = this;
+    }
+
+#ifndef NDEBUG
+    void print(raw_ostream &O, unsigned IncIndent = 0);
+    void dump();
+#endif
+  };
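+
+  // Illustrative sketch (hypothetical; 'NameValue' stands in for a DIEValue
+  // allocated elsewhere): assembling a two-node DIE tree:
+  //
+  //   DIE *CU  = new DIE(dwarf::DW_TAG_compile_unit);
+  //   DIE *Var = new DIE(dwarf::DW_TAG_variable);
+  //   Var->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_string, NameValue);
+  //   CU->addChild(Var);  // flips CU to DW_CHILDREN_yes, sets Var's parent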
+
+  //===--------------------------------------------------------------------===//
+  /// DIEValue - A debug information entry value.
+  ///
+  class DIEValue {
+  public:
+    enum {
+      isInteger,
+      isString,
+      isLabel,
+      isSectionOffset,
+      isDelta,
+      isEntry,
+      isBlock
+    };
+  protected:
+    /// Type - Type of data stored in the value.
+    ///
+    unsigned Type;
+  public:
+    explicit DIEValue(unsigned T) : Type(T) {}
+    virtual ~DIEValue() {}
+
+    // Accessors
+    unsigned getType()  const { return Type; }
+
+    /// EmitValue - Emit value via the Dwarf writer.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const = 0;
+
+    /// SizeOf - Return the size of a value in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const = 0;
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIEValue *) { return true; }
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O) = 0;
+    void dump();
+#endif
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIEInteger - An integer value DIE.
+  ///
+  class DIEInteger : public DIEValue {
+    uint64_t Integer;
+  public:
+    explicit DIEInteger(uint64_t I) : DIEValue(isInteger), Integer(I) {}
+
+    /// BestForm - Choose the best form for integer.
+    ///
+    static unsigned BestForm(bool IsSigned, uint64_t Int) {
+      if (IsSigned) {
+        if ((char)Int == (signed)Int)   return dwarf::DW_FORM_data1;
+        if ((short)Int == (signed)Int)  return dwarf::DW_FORM_data2;
+        if ((int)Int == (signed)Int)    return dwarf::DW_FORM_data4;
+      } else {
+        if ((unsigned char)Int == Int)  return dwarf::DW_FORM_data1;
+        if ((unsigned short)Int == Int) return dwarf::DW_FORM_data2;
+        if ((unsigned int)Int == Int)   return dwarf::DW_FORM_data4;
+      }
+      return dwarf::DW_FORM_data8;
+    }
+
+    /// EmitValue - Emit integer of appropriate size.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+
+    uint64_t getValue() const { return Integer; }
+
+    /// SizeOf - Determine size of integer value in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIEInteger *) { return true; }
+    static bool classof(const DIEValue *I) { return I->getType() == isInteger; }
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O);
+#endif
+  };
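+
+  // Worked example for BestForm (derived from the code above, assuming plain
+  // char is signed): BestForm(false, 0x80) yields DW_FORM_data1, since 0x80
+  // fits an unsigned char, while BestForm(true, 0x80) yields DW_FORM_data2,
+  // since 0x80 does not round-trip through a signed char.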
+
+  //===--------------------------------------------------------------------===//
+  /// DIEString - A string value DIE. This DIE holds only a string reference.
+  ///
+  class DIEString : public DIEValue {
+    const StringRef Str;
+  public:
+    explicit DIEString(const StringRef S) : DIEValue(isString), Str(S) {}
+
+    /// EmitValue - Emit string value.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+
+    /// SizeOf - Determine size of string value in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, unsigned /*Form*/) const {
+      return Str.size() + sizeof(char); // sizeof('\0');
+    }
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIEString *) { return true; }
+    static bool classof(const DIEValue *S) { return S->getType() == isString; }
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O);
+#endif
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIELabel - A label expression DIE.
+  //
+  class DIELabel : public DIEValue {
+    const MCSymbol *Label;
+  public:
+    explicit DIELabel(const MCSymbol *L) : DIEValue(isLabel), Label(L) {}
+
+    /// EmitValue - Emit label value.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+
+    /// getValue - Get MCSymbol.
+    ///
+    const MCSymbol *getValue()       const { return Label; }
+
+    /// SizeOf - Determine size of label value in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIELabel *)  { return true; }
+    static bool classof(const DIEValue *L) { return L->getType() == isLabel; }
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O);
+#endif
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIEDelta - A simple label difference DIE.
+  ///
+  class DIEDelta : public DIEValue {
+    const MCSymbol *LabelHi;
+    const MCSymbol *LabelLo;
+  public:
+    DIEDelta(const MCSymbol *Hi, const MCSymbol *Lo)
+      : DIEValue(isDelta), LabelHi(Hi), LabelLo(Lo) {}
+
+    /// EmitValue - Emit delta value.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+
+    /// SizeOf - Determine size of delta value in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIEDelta *)  { return true; }
+    static bool classof(const DIEValue *D) { return D->getType() == isDelta; }
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O);
+#endif
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIEEntry - A pointer to another debug information entry.  An instance of
+  /// this class can also be used as a proxy for a debug information entry not
+  /// yet defined (i.e., types).
+  class DIEEntry : public DIEValue {
+    DIE *const Entry;
+  public:
+    explicit DIEEntry(DIE *E) : DIEValue(isEntry), Entry(E) {}
+
+    DIE *getEntry() const { return Entry; }
+
+    /// EmitValue - Emit debug information entry offset.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+
+    /// SizeOf - Determine size of debug information entry in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const {
+      return sizeof(int32_t);
+    }
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIEEntry *)  { return true; }
+    static bool classof(const DIEValue *E) { return E->getType() == isEntry; }
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O);
+#endif
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIEBlock - A block of values.  Primarily used for location expressions.
+  //
+  class DIEBlock : public DIEValue, public DIE {
+    unsigned Size;                // Size in bytes excluding size header.
+  public:
+    DIEBlock()
+      : DIEValue(isBlock), DIE(0), Size(0) {}
+    virtual ~DIEBlock() {}
+
+    /// ComputeSize - calculate the size of the block.
+    ///
+    unsigned ComputeSize(AsmPrinter *AP);
+
+    /// BestForm - Choose the best form for data.
+    ///
+    unsigned BestForm() const {
+      if ((unsigned char)Size == Size)  return dwarf::DW_FORM_block1;
+      if ((unsigned short)Size == Size) return dwarf::DW_FORM_block2;
+      if ((unsigned int)Size == Size)   return dwarf::DW_FORM_block4;
+      return dwarf::DW_FORM_block;
+    }
+
+    /// EmitValue - Emit block data.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+
+    /// SizeOf - Determine size of block data in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIEBlock *)  { return true; }
+    static bool classof(const DIEValue *E) { return E->getType() == isBlock; }
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O);
+#endif
+  };
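+
+  // Illustrative sketch (hypothetical; 'Allocator', 'Offset', and 'Asm' are
+  // stand-ins): packaging a DW_OP_fbreg location expression as a block:
+  //
+  //   DIEBlock *Block = new (Allocator) DIEBlock();
+  //   Block->addValue(0, dwarf::DW_FORM_data1,
+  //                   new (Allocator) DIEInteger(dwarf::DW_OP_fbreg));
+  //   Block->addValue(0, dwarf::DW_FORM_sdata,
+  //                   new (Allocator) DIEInteger(Offset));
+  //   Block->ComputeSize(Asm);  // must be sized before emission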
+
+} // end llvm namespace
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/AsmPrinter/DwarfDebug.h b/src/LLVM/lib/CodeGen/AsmPrinter/DwarfDebug.h
new file mode 100644
index 0000000..d5982a6
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -0,0 +1,642 @@
+//===-- llvm/CodeGen/DwarfDebug.h - Dwarf Debug Framework ------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf debug info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DWARFDEBUG_H__
+#define CODEGEN_ASMPRINTER_DWARFDEBUG_H__
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "DIE.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/UniqueVector.h"
+#include "llvm/Support/Allocator.h"
+
+namespace llvm {
+
+class CompileUnit;
+class DbgConcreteScope;
+class DbgScope;
+class DbgVariable;
+class MachineFrameInfo;
+class MachineModuleInfo;
+class MachineOperand;
+class MCAsmInfo;
+class DIEAbbrev;
+class DIE;
+class DIEBlock;
+class DIEEntry;
+
+class DIEnumerator;
+class DIDescriptor;
+class DIVariable;
+class DIGlobal;
+class DIGlobalVariable;
+class DISubprogram;
+class DIBasicType;
+class DIDerivedType;
+class DIType;
+class DINameSpace;
+class DISubrange;
+class DICompositeType;
+
+//===----------------------------------------------------------------------===//
+/// SrcLineInfo - This class is used to record source line correspondence.
+///
+class SrcLineInfo {
+  unsigned Line;                     // Source line number.
+  unsigned Column;                   // Source column.
+  unsigned SourceID;                 // Source ID number.
+  MCSymbol *Label;                   // Label in the code.
+public:
+  SrcLineInfo(unsigned L, unsigned C, unsigned S, MCSymbol *label)
+    : Line(L), Column(C), SourceID(S), Label(label) {}
+
+  // Accessors
+  unsigned getLine() const { return Line; }
+  unsigned getColumn() const { return Column; }
+  unsigned getSourceID() const { return SourceID; }
+  MCSymbol *getLabel() const { return Label; }
+};
+
+class DwarfDebug {
+  /// Asm - Target of Dwarf emission.
+  AsmPrinter *Asm;
+
+  /// MMI - Collected machine module information.
+  MachineModuleInfo *MMI;
+
+  //===--------------------------------------------------------------------===//
+  // Attributes used to construct specific Dwarf sections.
+  //
+
+  CompileUnit *FirstCU;
+  DenseMap<const MDNode *, CompileUnit *> CUMap;
+
+  /// AbbreviationsSet - Used to uniquely define abbreviations.
+  ///
+  FoldingSet<DIEAbbrev> AbbreviationsSet;
+
+  /// Abbreviations - A list of all the unique abbreviations in use.
+  ///
+  std::vector<DIEAbbrev *> Abbreviations;
+
+  /// DirectoryIdMap - Directory name to directory id map.
+  ///
+  StringMap<unsigned> DirectoryIdMap;
+
+  /// DirectoryNames - A list of directory names.
+  SmallVector<std::string, 8> DirectoryNames;
+
+  /// SourceFileIdMap - Source file name to source file id map.
+  ///
+  StringMap<unsigned> SourceFileIdMap;
+
+  /// SourceFileNames - A list of source file names.
+  SmallVector<std::string, 8> SourceFileNames;
+
+  /// SourceIdMap - Source id map, i.e. pair of directory id and source file
+  /// id mapped to a unique id.
+  DenseMap<std::pair<unsigned, unsigned>, unsigned> SourceIdMap;
+
+  /// SourceIds - Reverse map from source id to directory id + file id pair.
+  ///
+  SmallVector<std::pair<unsigned, unsigned>, 8> SourceIds;
+
+  /// Lines - List of source line correspondence.
+  std::vector<SrcLineInfo> Lines;
+
+  /// DIEBlocks - A list of all the DIEBlocks in use.
+  std::vector<DIEBlock *> DIEBlocks;
+
+  // DIEValueAllocator - All DIEValues are allocated through this allocator.
+  BumpPtrAllocator DIEValueAllocator;
+
+  /// StringPool - A String->Symbol mapping of strings used by indirect
+  /// references.
+  StringMap<std::pair<MCSymbol*, unsigned> > StringPool;
+  unsigned NextStringPoolNumber;
+  
+  MCSymbol *getStringPoolEntry(StringRef Str);
+
+  /// SectionMap - Provides a unique id per text section.
+  ///
+  UniqueVector<const MCSection*> SectionMap;
+
+  /// SectionSourceLines - Tracks line numbers per text section.
+  ///
+  std::vector<std::vector<SrcLineInfo> > SectionSourceLines;
+
+  // CurrentFnDbgScope - Top level scope for the current function.
+  //
+  DbgScope *CurrentFnDbgScope;
+  
+  /// DbgScopeMap - Tracks the scopes in the current function.  Owns the
+  /// contained DbgScope*s.
+  ///
+  DenseMap<const MDNode *, DbgScope *> DbgScopeMap;
+
+  /// ConcreteScopes - Tracks the concrete scopes in the current function.
+  /// These scopes are also included in DbgScopeMap.
+  DenseMap<const MDNode *, DbgScope *> ConcreteScopes;
+
+  /// AbstractScopes - Tracks the abstract scopes of a module. These scopes
+  /// are not included in DbgScopeMap.  AbstractScopes owns its DbgScope*s.
+  DenseMap<const MDNode *, DbgScope *> AbstractScopes;
+
+  /// AbstractSPDies - Collection of abstract subprogram DIEs.
+  DenseMap<const MDNode *, DIE *> AbstractSPDies;
+
+  /// AbstractScopesList - Tracks abstract scopes constructed while processing
+  /// a function. This list is cleared during endFunction().
+  SmallVector<DbgScope *, 4> AbstractScopesList;
+
+  /// AbstractVariables - Collection of abstract variables.  Owned by the
+  /// DbgScopes in AbstractScopes.
+  DenseMap<const MDNode *, DbgVariable *> AbstractVariables;
+
+  /// DbgVariableToFrameIndexMap - Tracks frame index used to find 
+  /// variable's value.
+  DenseMap<const DbgVariable *, int> DbgVariableToFrameIndexMap;
+
+  /// DbgVariableToDbgInstMap - Maps DbgVariable to corresponding DBG_VALUE
+  /// machine instruction.
+  DenseMap<const DbgVariable *, const MachineInstr *> DbgVariableToDbgInstMap;
+
+  /// DbgVariableLabelsMap - Maps DbgVariable to corresponding MCSymbol.
+  DenseMap<const DbgVariable *, const MCSymbol *> DbgVariableLabelsMap;
+
+  /// DotDebugLocEntry - This struct describes location entries emitted in
+  /// .debug_loc section.
+  typedef struct DotDebugLocEntry {
+    const MCSymbol *Begin;
+    const MCSymbol *End;
+    MachineLocation Loc;
+    DotDebugLocEntry() : Begin(0), End(0) {}
+    DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, 
+                  MachineLocation &L) : Begin(B), End(E), Loc(L) {}
+    /// Empty entries are also used as a trigger to emit a temporary label;
+    /// such labels are referenced when finding the debug_loc offset for a
+    /// given DIE.
+    bool isEmpty() { return Begin == 0 && End == 0; }
+  } DotDebugLocEntry;
+
+  /// DotDebugLocEntries - Collection of DotDebugLocEntry.
+  SmallVector<DotDebugLocEntry, 4> DotDebugLocEntries;
+
+  /// UseDotDebugLocEntry - For the DIEs in this set, the DW_AT_location
+  /// attribute identifies the corresponding .debug_loc entry offset.
+  SmallPtrSet<const DIE *, 4> UseDotDebugLocEntry;
+
+  /// VarToAbstractVarMap - Maps a DbgVariable to its corresponding abstract
+  /// DbgVariable, if any.
+  DenseMap<const DbgVariable *, const DbgVariable *> VarToAbstractVarMap;
+
+  /// InlinedSubprogramDIEs - Collection of subprogram DIEs that are marked
+  /// (at the end of the module) as DW_AT_inline.
+  SmallPtrSet<DIE *, 4> InlinedSubprogramDIEs;
+
+  /// ContainingTypeMap - This map is used to keep track of subprogram DIEs that
+  /// need DW_AT_containing_type attribute. This attribute points to a DIE that
+  /// corresponds to the MDNode mapped with the subprogram DIE.
+  DenseMap<DIE *, const MDNode *> ContainingTypeMap;
+
+  typedef SmallVector<DbgScope *, 2> ScopeVector;
+
+  SmallPtrSet<const MachineInstr *, 8> InsnsEndScopeSet;
+
+  /// InlineInfo - Keep track of inlined functions and their locations.  This
+  /// information is used to populate the debug_inlined section.
+  typedef std::pair<const MCSymbol *, DIE *> InlineInfoLabels;
+  DenseMap<const MDNode *, SmallVector<InlineInfoLabels, 4> > InlineInfo;
+  SmallVector<const MDNode *, 4> InlinedSPNodes;
+
+  // ProcessedSPNodes - This is a collection of subprogram MDNodes that
+  // are processed to create DIEs.
+  SmallPtrSet<const MDNode *, 16> ProcessedSPNodes;
+
+  /// LabelsBeforeInsn - Maps an instruction to the label emitted before
+  /// the instruction.
+  DenseMap<const MachineInstr *, MCSymbol *> LabelsBeforeInsn;
+
+  /// LabelsAfterInsn - Maps an instruction to the label emitted after
+  /// the instruction.
+  DenseMap<const MachineInstr *, MCSymbol *> LabelsAfterInsn;
+
+  /// InsnNeedsLabel - Collection of instructions that need a label to mark
+  /// a debugging information entity.
+  SmallPtrSet<const MachineInstr *, 8> InsnNeedsLabel;
+
+  SmallVector<const MCSymbol *, 8> DebugRangeSymbols;
+
+  /// Previous instruction's location information. This is used to determine
+  /// label locations that indicate scope boundaries in dwarf debug info.
+  DebugLoc PrevInstLoc;
+  MCSymbol *PrevLabel;
+
+  struct FunctionDebugFrameInfo {
+    unsigned Number;
+    std::vector<MachineMove> Moves;
+
+    FunctionDebugFrameInfo(unsigned Num, const std::vector<MachineMove> &M)
+      : Number(Num), Moves(M) {}
+  };
+
+  std::vector<FunctionDebugFrameInfo> DebugFrames;
+
+  // Section Symbols: these are assembler temporary labels that are emitted at
+  // the beginning of each supported dwarf section.  These are used to form
+  // section offsets and are created by EmitSectionLabels.
+  MCSymbol *DwarfFrameSectionSym, *DwarfInfoSectionSym, *DwarfAbbrevSectionSym;
+  MCSymbol *DwarfStrSectionSym, *TextSectionSym, *DwarfDebugRangeSectionSym;
+  MCSymbol *DwarfDebugLocSectionSym;
+  MCSymbol *DwarfDebugLineSectionSym, *CurrentLineSectionSym;
+  MCSymbol *FunctionBeginSym, *FunctionEndSym;
+
+  DIEInteger *DIEIntegerOne;
+private:
+  
+  /// getSourceDirectoryAndFileIds - Return the directory and file ids that
+  /// map to the given source id. Source ids start at 1.
+  std::pair<unsigned, unsigned>
+  getSourceDirectoryAndFileIds(unsigned SId) const {
+    return SourceIds[SId-1];
+  }
+
+  /// getNumSourceDirectories - Return the number of source directories in the
+  /// debug info.
+  unsigned getNumSourceDirectories() const {
+    return DirectoryNames.size();
+  }
+
+  /// getSourceDirectoryName - Return the name of the directory corresponding
+  /// to the id.
+  const std::string &getSourceDirectoryName(unsigned Id) const {
+    return DirectoryNames[Id - 1];
+  }
+
+  /// getSourceFileName - Return the name of the source file corresponding
+  /// to the id.
+  const std::string &getSourceFileName(unsigned Id) const {
+    return SourceFileNames[Id - 1];
+  }
+
+  /// getNumSourceIds - Return the number of unique source ids.
+  unsigned getNumSourceIds() const {
+    return SourceIds.size();
+  }
+
+  /// assignAbbrevNumber - Define a unique number for the abbreviation.
+  ///
+  void assignAbbrevNumber(DIEAbbrev &Abbrev);
+
+  /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
+  /// information entry.
+  DIEEntry *createDIEEntry(DIE *Entry);
+
+  /// addUInt - Add an unsigned integer attribute data and value.
+  ///
+  void addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer);
+
+  /// addSInt - Add a signed integer attribute data and value.
+  ///
+  void addSInt(DIE *Die, unsigned Attribute, unsigned Form, int64_t Integer);
+
+  /// addString - Add a string attribute data and value.
+  ///
+  void addString(DIE *Die, unsigned Attribute, unsigned Form,
+                 const StringRef Str);
+
+  /// addLabel - Add a Dwarf label attribute data and value.
+  ///
+  void addLabel(DIE *Die, unsigned Attribute, unsigned Form,
+                const MCSymbol *Label);
+
+  /// addDelta - Add a label delta attribute data and value.
+  ///
+  void addDelta(DIE *Die, unsigned Attribute, unsigned Form,
+                const MCSymbol *Hi, const MCSymbol *Lo);
+
+  /// addDIEEntry - Add a DIE attribute data and value.
+  ///
+  void addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form, DIE *Entry);
+  
+  /// addBlock - Add block data.
+  ///
+  void addBlock(DIE *Die, unsigned Attribute, unsigned Form, DIEBlock *Block);
+
+  /// addSourceLine - Add location information to specified debug information
+  /// entry.
+  void addSourceLine(DIE *Die, DIVariable V);
+  void addSourceLine(DIE *Die, DIGlobalVariable G);
+  void addSourceLine(DIE *Die, DISubprogram SP);
+  void addSourceLine(DIE *Die, DIType Ty);
+  void addSourceLine(DIE *Die, DINameSpace NS);
+
+  /// addAddress - Add an address attribute to a die based on the location
+  /// provided.
+  void addAddress(DIE *Die, unsigned Attribute,
+                  const MachineLocation &Location);
+
+  /// addRegisterAddress - Add register location entry in variable DIE.
+  bool addRegisterAddress(DIE *Die, const MCSymbol *VS, const MachineOperand &MO);
+
+  /// addConstantValue - Add constant value entry in variable DIE.
+  bool addConstantValue(DIE *Die, const MCSymbol *VS, const MachineOperand &MO);
+
+  /// addConstantFPValue - Add constant FP value entry in variable DIE.
+  bool addConstantFPValue(DIE *Die, const MCSymbol *VS, const MachineOperand &MO);
+
+  /// addComplexAddress - Start with the address based on the location provided,
+  /// and generate the DWARF information necessary to find the actual variable
+  /// (navigating the extra location information encoded in the type) based on
+  /// the starting location.  Add the DWARF information to the die.
+  ///
+  void addComplexAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
+                         const MachineLocation &Location);
+
+  // FIXME: Should be reformulated in terms of addComplexAddress.
+  /// addBlockByrefAddress - Start with the address based on the location
+  /// provided, and generate the DWARF information necessary to find the
+  /// actual Block variable (navigating the Block struct) based on the
+  /// starting location.  Add the DWARF information to the die.  Obsolete,
+  /// please use addComplexAddress instead.
+  ///
+  void addBlockByrefAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
+                            const MachineLocation &Location);
+
+  /// addVariableAddress - Add DW_AT_location attribute for a DbgVariable.
+  void addVariableAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
+                          const MachineLocation &Location);
+
+  /// addToContextOwner - Add Die into the list of its context owner's children.
+  void addToContextOwner(DIE *Die, DIDescriptor Context);
+
+  /// addType - Add a new type attribute to the specified entity.
+  void addType(DIE *Entity, DIType Ty);
+
+ 
+  /// getOrCreateNameSpace - Create a DIE for DINameSpace.
+  DIE *getOrCreateNameSpace(DINameSpace NS);
+
+  /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
+  /// given DIType.
+  DIE *getOrCreateTypeDIE(DIType Ty);
+
+  void addPubTypes(DISubprogram SP);
+
+  /// constructTypeDIE - Construct basic type die from DIBasicType.
+  void constructTypeDIE(DIE &Buffer,
+                        DIBasicType BTy);
+
+  /// constructTypeDIE - Construct derived type die from DIDerivedType.
+  void constructTypeDIE(DIE &Buffer,
+                        DIDerivedType DTy);
+
+  /// constructTypeDIE - Construct type DIE from DICompositeType.
+  void constructTypeDIE(DIE &Buffer,
+                        DICompositeType CTy);
+
+  /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
+  void constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy);
+
+  /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
+  void constructArrayTypeDIE(DIE &Buffer, 
+                             DICompositeType *CTy);
+
+  /// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
+  DIE *constructEnumTypeDIE(DIEnumerator ETy);
+
+  /// createMemberDIE - Create new member DIE.
+  DIE *createMemberDIE(DIDerivedType DT);
+
+  /// createSubprogramDIE - Create new DIE using SP.
+  DIE *createSubprogramDIE(DISubprogram SP, bool MakeDecl = false);
+
+  /// getOrCreateDbgScope - Create DbgScope for the scope.
+  DbgScope *getOrCreateDbgScope(const MDNode *Scope, const MDNode *InlinedAt);
+
+  DbgScope *getOrCreateAbstractScope(const MDNode *N);
+
+  /// findAbstractVariable - Find abstract variable associated with Var.
+  DbgVariable *findAbstractVariable(DIVariable &Var, DebugLoc Loc);
+
+  /// updateSubprogramScopeDIE - Find DIE for the given subprogram and 
+  /// attach appropriate DW_AT_low_pc and DW_AT_high_pc attributes.
+  /// If there are global variables in this scope then create and insert
+  /// DIEs for these variables.
+  DIE *updateSubprogramScopeDIE(const MDNode *SPNode);
+
+  /// constructLexicalScopeDIE - Construct a new DW_TAG_lexical_block DIE
+  /// for this scope and attach DW_AT_low_pc/DW_AT_high_pc labels.
+  DIE *constructLexicalScopeDIE(DbgScope *Scope);
+
+  /// constructInlinedScopeDIE - This scope represents inlined body of
+  /// a function. Construct DIE to represent this concrete inlined copy
+  /// of the function.
+  DIE *constructInlinedScopeDIE(DbgScope *Scope);
+
+  /// constructVariableDIE - Construct a DIE for the given DbgVariable.
+  DIE *constructVariableDIE(DbgVariable *DV, DbgScope *S);
+
+  /// constructScopeDIE - Construct a DIE for this scope.
+  DIE *constructScopeDIE(DbgScope *Scope);
+
+  /// EmitSectionLabels - Emit initial Dwarf sections with a label at
+  /// the start of each one.
+  void EmitSectionLabels();
+
+  /// emitDIE - Recursively emits a debug information entry.
+  ///
+  void emitDIE(DIE *Die);
+
+  /// computeSizeAndOffset - Compute the size and offset of a DIE.
+  ///
+  unsigned computeSizeAndOffset(DIE *Die, unsigned Offset, bool Last);
+
+  /// computeSizeAndOffsets - Compute the size and offset of all the DIEs.
+  ///
+  void computeSizeAndOffsets();
+
+  /// emitDebugInfo - Emit the debug info section.
+  ///
+  void emitDebugInfo();
+
+  /// emitAbbreviations - Emit the abbreviation section.
+  ///
+  void emitAbbreviations() const;
+
+  /// emitEndOfLineMatrix - Emit the last address of the section and the end of
+  /// the line matrix.
+  ///
+  void emitEndOfLineMatrix(unsigned SectionEnd);
+
+  /// emitDebugLines - Emit source line information.
+  ///
+  void emitDebugLines();
+
+  /// emitCommonDebugFrame - Emit common frame info into a debug frame section.
+  ///
+  void emitCommonDebugFrame();
+
+  /// emitFunctionDebugFrame - Emit per function frame info into a debug frame
+  /// section.
+  void emitFunctionDebugFrame(const FunctionDebugFrameInfo &DebugFrameInfo);
+
+  /// emitDebugPubNames - Emit visible names into a debug pubnames section.
+  ///
+  void emitDebugPubNames();
+
+  /// emitDebugPubTypes - Emit visible types into a debug pubtypes section.
+  ///
+  void emitDebugPubTypes();
+
+  /// emitDebugStr - Emit visible names into a debug str section.
+  ///
+  void emitDebugStr();
+
+  /// emitDebugLoc - Emit visible names into a debug loc section.
+  ///
+  void emitDebugLoc();
+
+  /// EmitDebugARanges - Emit visible names into a debug aranges section.
+  ///
+  void EmitDebugARanges();
+
+  /// emitDebugRanges - Emit visible names into a debug ranges section.
+  ///
+  void emitDebugRanges();
+
+  /// emitDebugMacInfo - Emit visible names into a debug macinfo section.
+  ///
+  void emitDebugMacInfo();
+
+  /// emitDebugInlineInfo - Emit inline info using the following format.
+  /// Section Header:
+  /// 1. length of section
+  /// 2. Dwarf version number
+  /// 3. address size.
+  ///
+  /// Entries (one "entry" for each function that was inlined):
+  ///
+  /// 1. offset into __debug_str section for MIPS linkage name, if it exists;
+  ///    otherwise offset into __debug_str for regular function name.
+  /// 2. offset into __debug_str section for regular function name.
+  /// 3. an unsigned LEB128 number indicating the number of distinct inlining
+  ///    instances for the function.
+  ///
+  /// The rest of the entry consists of a {die_offset, low_pc} pair for each
+  /// inlined instance; the die_offset points to the inlined_subroutine die in
+  /// the __debug_info section, and the low_pc is the starting address for the
+  /// inlining instance.
+  void emitDebugInlineInfo();
+
+  /// GetOrCreateSourceID - Look up the source id with the given directory and
+  /// source file names. If none currently exists, create a new id and insert it
+  /// in the SourceIds map. This can update DirectoryNames and SourceFileNames
+  /// maps as well.
+  unsigned GetOrCreateSourceID(StringRef DirName, StringRef FileName);
+
+  /// constructCompileUnit - Create new CompileUnit for the given 
+  /// metadata node with tag DW_TAG_compile_unit.
+  void constructCompileUnit(const MDNode *N);
+
+  /// getCompileUnit - Get CompileUnit DIE.
+  CompileUnit *getCompileUnit(const MDNode *N) const;
+
+  /// constructGlobalVariableDIE - Construct global variable DIE.
+  void constructGlobalVariableDIE(const MDNode *N);
+
+  /// constructSubprogramDIE - Construct subprogram DIE.
+  void constructSubprogramDIE(const MDNode *N);
+
+  /// recordSourceLine - Register a source line with debug info. Returns the
+  /// unique label that was emitted and which provides correspondence to
+  /// the source line list.
+  MCSymbol *recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope);
+  
+  /// getSourceLineCount - Return the number of source lines in the debug
+  /// info.
+  unsigned getSourceLineCount() const {
+    return Lines.size();
+  }
+  
+  /// recordVariableFrameIndex - Record a variable's index.
+  void recordVariableFrameIndex(const DbgVariable *V, int Index);
+
+  /// findVariableFrameIndex - Return true if frame index for the variable
+  /// is found. Update FI to hold value of the index.
+  bool findVariableFrameIndex(const DbgVariable *V, int *FI);
+
+  /// findVariableLabel - Find MCSymbol for the variable.
+  const MCSymbol *findVariableLabel(const DbgVariable *V);
+
+  /// findDbgScope - Find the DbgScope for the debug loc attached to an
+  /// instruction.
+  DbgScope *findDbgScope(const MachineInstr *MI);
+
+  /// identifyScopeMarkers() - Identify instructions that mark the beginning
+  /// or end of a scope.
+  void identifyScopeMarkers();
+
+  /// extractScopeInformation - Scan machine instructions in this function
+  /// and collect DbgScopes. Return true if at least one scope was found.
+  bool extractScopeInformation();
+  
+  /// collectVariableInfo - Populate DbgScope entries with variables' info.
+  void collectVariableInfo(const MachineFunction *,
+                           SmallPtrSet<const MDNode *, 16> &ProcessedVars);
+  
+  /// collectVariableInfoFromMMITable - Collect variable information from
+  /// the side table maintained by MMI.
+  void collectVariableInfoFromMMITable(const MachineFunction * MF,
+                                       SmallPtrSet<const MDNode *, 16> &P);
+public:
+  //===--------------------------------------------------------------------===//
+  // Main entry points.
+  //
+  DwarfDebug(AsmPrinter *A, Module *M);
+  ~DwarfDebug();
+
+  /// beginModule - Emit all Dwarf sections that should come prior to the
+  /// content.
+  void beginModule(Module *M);
+
+  /// endModule - Emit all Dwarf sections that should come after the content.
+  ///
+  void endModule();
+
+  /// beginFunction - Gather pre-function debug information.  Assumes it is
+  /// called immediately after the function entry point is emitted.
+  void beginFunction(const MachineFunction *MF);
+
+  /// endFunction - Gather and emit post-function debug information.
+  ///
+  void endFunction(const MachineFunction *MF);
+
+  /// getLabelBeforeInsn - Return Label preceding the instruction.
+  const MCSymbol *getLabelBeforeInsn(const MachineInstr *MI);
+
+  /// getLabelAfterInsn - Return Label immediately following the instruction.
+  const MCSymbol *getLabelAfterInsn(const MachineInstr *MI);
+
+  /// beginScope - Process beginning of a scope.
+  void beginScope(const MachineInstr *MI);
+
+  /// endScope - Process end of a scope.
+  void endScope(const MachineInstr *MI);
+};
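+
+// Illustrative call sequence (hypothetical; in practice AsmPrinter drives
+// these entry points, and 'ThisAsmPrinter' is a stand-in):
+//
+//   DwarfDebug DD(ThisAsmPrinter, &M);
+//   DD.beginModule(&M);
+//   // For each machine function:
+//   //   DD.beginFunction(MF);
+//   //   ... per-instruction beginScope(MI)/endScope(MI) calls ...
+//   //   DD.endFunction(MF);
+//   DD.endModule();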
+} // End of namespace llvm
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/AsmPrinter/DwarfException.h b/src/LLVM/lib/CodeGen/AsmPrinter/DwarfException.h
new file mode 100644
index 0000000..94a305a
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -0,0 +1,171 @@
+//===-- DwarfException.h - Dwarf Exception Framework -----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
+#define LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
+
+#include "llvm/ADT/DenseMap.h"
+#include <vector>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+struct LandingPadInfo;
+class MachineModuleInfo;
+class MachineMove;
+class MachineInstr;
+class MachineFunction;
+class MCAsmInfo;
+class MCExpr;
+class MCSymbol;
+class Function;
+class AsmPrinter;
+
+//===----------------------------------------------------------------------===//
+/// DwarfException - Emits Dwarf exception handling directives.
+///
+class DwarfException {
+  /// Asm - Target of Dwarf emission.
+  AsmPrinter *Asm;
+
+  /// MMI - Collected machine module information.
+  MachineModuleInfo *MMI;
+
+  struct FunctionEHFrameInfo {
+    MCSymbol *FunctionEHSym;  // L_foo.eh
+    unsigned Number;
+    unsigned PersonalityIndex;
+    bool adjustsStack;
+    bool hasLandingPads;
+    std::vector<MachineMove> Moves;
+    const Function *function;
+
+    FunctionEHFrameInfo(MCSymbol *EHSym, unsigned Num, unsigned P,
+                        bool hC, bool hL,
+                        const std::vector<MachineMove> &M,
+                        const Function *f):
+      FunctionEHSym(EHSym), Number(Num), PersonalityIndex(P),
+      adjustsStack(hC), hasLandingPads(hL), Moves(M), function (f) { }
+  };
+
+  std::vector<FunctionEHFrameInfo> EHFrames;
+
+  /// UsesLSDA - Indicates whether an FDE that uses the CIE at the given index
+  /// uses an LSDA. If so, then we need to encode that information in the CIE's
+  /// augmentation.
+  DenseMap<unsigned, bool> UsesLSDA;
+
+  /// shouldEmitTable - Per-function flag to indicate if EH tables should
+  /// be emitted.
+  bool shouldEmitTable;
+
+  /// shouldEmitMoves - Per-function flag to indicate if frame moves info
+  /// should be emitted.
+  bool shouldEmitMoves;
+
+  /// shouldEmitTableModule - Per-module flag to indicate if EH tables
+  /// should be emitted.
+  bool shouldEmitTableModule;
+
+  /// shouldEmitMovesModule - Per-module flag to indicate if frame moves
+  /// should be emitted.
+  bool shouldEmitMovesModule;
+
+  /// EmitCIE - Emit a Common Information Entry (CIE). This holds information
+  /// that is shared among many Frame Description Entries.  There is at least
+  /// one CIE in every non-empty .debug_frame section.
+  void EmitCIE(const Function *Personality, unsigned Index);
+
+  /// EmitFDE - Emit the Frame Description Entry (FDE) for the function.
+  void EmitFDE(const FunctionEHFrameInfo &EHFrameInfo);
+
+  /// SharedTypeIds - How many leading type ids two landing pads have in common.
+  static unsigned SharedTypeIds(const LandingPadInfo *L,
+                                const LandingPadInfo *R);
+
+  /// PadLT - Order landing pads lexicographically by type id.
+  static bool PadLT(const LandingPadInfo *L, const LandingPadInfo *R);
+
+  /// PadRange - Structure holding a try-range and the associated landing pad.
+  struct PadRange {
+    // The index of the landing pad.
+    unsigned PadIndex;
+    // The index of the begin and end labels in the landing pad's label lists.
+    unsigned RangeIndex;
+  };
+
+  typedef DenseMap<MCSymbol *, PadRange> RangeMapType;
+
+  /// ActionEntry - Structure describing an entry in the actions table.
+  struct ActionEntry {
+    int ValueForTypeID; // The value to write - may not be equal to the type id.
+    int NextAction;
+    unsigned Previous;
+  };
+
+  /// CallSiteEntry - Structure describing an entry in the call-site table.
+  struct CallSiteEntry {
+    // The 'try-range' is BeginLabel .. EndLabel.
+    MCSymbol *BeginLabel; // zero indicates the start of the function.
+    MCSymbol *EndLabel;   // zero indicates the end of the function.
+
+    // The landing pad starts at PadLabel.
+    MCSymbol *PadLabel;   // zero indicates that there is no landing pad.
+    unsigned Action;
+  };
+
+  /// ComputeActionsTable - Compute the actions table and gather the first
+  /// action index for each landing pad site.
+  unsigned ComputeActionsTable(const SmallVectorImpl<const LandingPadInfo*>&LPs,
+                               SmallVectorImpl<ActionEntry> &Actions,
+                               SmallVectorImpl<unsigned> &FirstActions);
+
+  /// CallToNoUnwindFunction - Return `true' if this is a call to a function
+  /// marked `nounwind'. Return `false' otherwise.
+  bool CallToNoUnwindFunction(const MachineInstr *MI);
+
+  /// ComputeCallSiteTable - Compute the call-site table.  The entry for an
+  /// invoke has a try-range containing the call, a non-zero landing pad and an
+  /// appropriate action.  The entry for an ordinary call has a try-range
+  /// containing the call and zero for the landing pad and the action.  Calls
+  /// marked 'nounwind' have no entry and must not be contained in the try-range
+  /// of any entry - they form gaps in the table.  Entries must be ordered by
+  /// try-range address.
+  void ComputeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
+                            const RangeMapType &PadMap,
+                            const SmallVectorImpl<const LandingPadInfo *> &LPs,
+                            const SmallVectorImpl<unsigned> &FirstActions);
+  void EmitExceptionTable();
+
+public:
+  //===--------------------------------------------------------------------===//
+  // Main entry points.
+  //
+  DwarfException(AsmPrinter *A);
+  ~DwarfException();
+
+  /// EndModule - Emit all exception information that should come after the
+  /// content.
+  void EndModule();
+
+  /// BeginFunction - Gather pre-function exception information.  Assumes it
+  /// is called immediately after the function entry point is emitted.
+  void BeginFunction(const MachineFunction *MF);
+
+  /// EndFunction - Gather and emit post-function exception information.
+  void EndFunction();
+};
+
+} // End of namespace llvm
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/AsmPrinter/Makefile b/src/LLVM/lib/CodeGen/AsmPrinter/Makefile
new file mode 100644
index 0000000..60aa6cb
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/AsmPrinter/Makefile
@@ -0,0 +1,13 @@
+##===- lib/CodeGen/AsmPrinter/Makefile ---------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMAsmPrinter
+
+include $(LEVEL)/Makefile.common
diff --git a/src/LLVM/lib/CodeGen/BranchFolding.cpp b/src/LLVM/lib/CodeGen/BranchFolding.cpp
new file mode 100644
index 0000000..4ff84c9
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/BranchFolding.cpp
@@ -0,0 +1,1310 @@
+//===-- BranchFolding.cpp - Fold machine code branch instructions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass forwards branches to unconditional branches to make them branch
+// directly to the target block.  This pass often results in dead MBBs, which
+// it then removes.
+//
+// Note that this pass must be run after register allocation; it cannot handle
+// SSA form.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "branchfolding"
+#include "BranchFolding.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
+STATISTIC(NumBranchOpts, "Number of branches optimized");
+STATISTIC(NumTailMerge , "Number of block tails merged");
+
+static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge",
+                              cl::init(cl::BOU_UNSET), cl::Hidden);
+
+// Throttle for huge numbers of predecessors (compile speed problems)
+static cl::opt<unsigned>
+TailMergeThreshold("tail-merge-threshold",
+          cl::desc("Max number of predecessors to consider tail merging"),
+          cl::init(150), cl::Hidden);
+
+// Heuristic for tail merging (and, inversely, tail duplication).
+// TODO: This should be replaced with a target query.
+static cl::opt<unsigned>
+TailMergeSize("tail-merge-size",
+          cl::desc("Min number of instructions to consider tail merging"),
+                              cl::init(3), cl::Hidden);
+
+namespace {
+  /// BranchFolderPass - Wrap branch folder in a machine function pass.
+  class BranchFolderPass : public MachineFunctionPass,
+                           public BranchFolder {
+  public:
+    static char ID;
+    explicit BranchFolderPass(bool defaultEnableTailMerge)
+      : MachineFunctionPass(ID), BranchFolder(defaultEnableTailMerge) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+    virtual const char *getPassName() const { return "Control Flow Optimizer"; }
+  };
+}
+
+char BranchFolderPass::ID = 0;
+
+FunctionPass *llvm::createBranchFoldingPass(bool DefaultEnableTailMerge) {
+  return new BranchFolderPass(DefaultEnableTailMerge);
+}
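+
+// Illustrative sketch (hypothetical; 'PM' is a stand-in pass manager): a code
+// generator pipeline would schedule the wrapper pass like so:
+//
+//   PM.add(llvm::createBranchFoldingPass(/*DefaultEnableTailMerge=*/true));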
+
+bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
+  return OptimizeFunction(MF,
+                          MF.getTarget().getInstrInfo(),
+                          MF.getTarget().getRegisterInfo(),
+                          getAnalysisIfAvailable<MachineModuleInfo>());
+}
+
+
+BranchFolder::BranchFolder(bool defaultEnableTailMerge) {
+  switch (FlagEnableTailMerge) {
+  case cl::BOU_UNSET: EnableTailMerge = defaultEnableTailMerge; break;
+  case cl::BOU_TRUE: EnableTailMerge = true; break;
+  case cl::BOU_FALSE: EnableTailMerge = false; break;
+  }
+}
+
+/// RemoveDeadBlock - Remove the specified dead machine basic block from the
+/// function, updating the CFG.
+void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
+  assert(MBB->pred_empty() && "MBB must be dead!");
+  DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
+
+  MachineFunction *MF = MBB->getParent();
+  // drop all successors.
+  while (!MBB->succ_empty())
+    MBB->removeSuccessor(MBB->succ_end()-1);
+
+  // Remove the block.
+  MF->erase(MBB);
+}
+
+/// OptimizeImpDefsBlock - If a basic block is just a bunch of implicit_def
+/// followed by terminators, and if the implicitly defined registers are not
+/// used by the terminators, remove those implicit_def's. e.g.
+/// BB1:
+///   r0 = implicit_def
+///   r1 = implicit_def
+///   br
+/// This block can be optimized away later if the implicit instructions are
+/// removed.
+bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) {
+  SmallSet<unsigned, 4> ImpDefRegs;
+  MachineBasicBlock::iterator I = MBB->begin();
+  while (I != MBB->end()) {
+    if (!I->isImplicitDef())
+      break;
+    unsigned Reg = I->getOperand(0).getReg();
+    ImpDefRegs.insert(Reg);
+    for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+         unsigned SubReg = *SubRegs; ++SubRegs)
+      ImpDefRegs.insert(SubReg);
+    ++I;
+  }
+  if (ImpDefRegs.empty())
+    return false;
+
+  MachineBasicBlock::iterator FirstTerm = I;
+  while (I != MBB->end()) {
+    if (!TII->isUnpredicatedTerminator(I))
+      return false;
+    // See if it uses any of the implicitly defined registers.
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = I->getOperand(i);
+      if (!MO.isReg() || !MO.isUse())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (ImpDefRegs.count(Reg))
+        return false;
+    }
+    ++I;
+  }
+
+  I = MBB->begin();
+  while (I != FirstTerm) {
+    MachineInstr *ImpDefMI = &*I;
+    ++I;
+    MBB->erase(ImpDefMI);
+  }
+
+  return true;
+}
+
+/// OptimizeFunction - Perform branch folding, tail merging, and other
+/// CFG optimizations on the given function.
+bool BranchFolder::OptimizeFunction(MachineFunction &MF,
+                                    const TargetInstrInfo *tii,
+                                    const TargetRegisterInfo *tri,
+                                    MachineModuleInfo *mmi) {
+  if (!tii) return false;
+
+  TII = tii;
+  TRI = tri;
+  MMI = mmi;
+
+  RS = TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : NULL;
+
+  // Fix CFG.  The later algorithms expect it to be right.
+  bool MadeChange = false;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; I++) {
+    MachineBasicBlock *MBB = I, *TBB = 0, *FBB = 0;
+    SmallVector<MachineOperand, 4> Cond;
+    if (!TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, true))
+      MadeChange |= MBB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
+    MadeChange |= OptimizeImpDefsBlock(MBB);
+  }
+
+  bool MadeChangeThisIteration = true;
+  while (MadeChangeThisIteration) {
+    MadeChangeThisIteration = false;
+    MadeChangeThisIteration |= TailMergeBlocks(MF);
+    MadeChangeThisIteration |= OptimizeBranches(MF);
+    MadeChange |= MadeChangeThisIteration;
+  }
+
+  // See if any jump tables have become dead as the code generator
+  // did its thing.
+  MachineJumpTableInfo *JTI = MF.getJumpTableInfo();
+  if (JTI == 0) {
+    delete RS;
+    return MadeChange;
+  }
+  
+  // Walk the function to find jump tables that are live.
+  BitVector JTIsLive(JTI->getJumpTables().size());
+  for (MachineFunction::iterator BB = MF.begin(), E = MF.end();
+       BB != E; ++BB) {
+    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
+         I != E; ++I)
+      for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) {
+        MachineOperand &Op = I->getOperand(op);
+        if (!Op.isJTI()) continue;
+
+        // Remember that this JT is live.
+        JTIsLive.set(Op.getIndex());
+      }
+  }
+
+  // Finally, remove dead jump tables.  This happens when the
+  // indirect jump was unreachable (and thus deleted).
+  for (unsigned i = 0, e = JTIsLive.size(); i != e; ++i)
+    if (!JTIsLive.test(i)) {
+      JTI->RemoveJumpTable(i);
+      MadeChange = true;
+    }
+
+  delete RS;
+  return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+//  Tail Merging of Blocks
+//===----------------------------------------------------------------------===//
+
+/// HashMachineInstr - Compute a hash value for MI and its operands.
+static unsigned HashMachineInstr(const MachineInstr *MI) {
+  unsigned Hash = MI->getOpcode();
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &Op = MI->getOperand(i);
+
+    // Merge in bits from the operand if easy.
+    unsigned OperandHash = 0;
+    switch (Op.getType()) {
+    case MachineOperand::MO_Register:          OperandHash = Op.getReg(); break;
+    case MachineOperand::MO_Immediate:         OperandHash = Op.getImm(); break;
+    case MachineOperand::MO_MachineBasicBlock:
+      OperandHash = Op.getMBB()->getNumber();
+      break;
+    case MachineOperand::MO_FrameIndex:
+    case MachineOperand::MO_ConstantPoolIndex:
+    case MachineOperand::MO_JumpTableIndex:
+      OperandHash = Op.getIndex();
+      break;
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_ExternalSymbol:
+      // Global address / external symbol are too hard, don't bother, but do
+      // pull in the offset.
+      OperandHash = Op.getOffset();
+      break;
+    default: break;
+    }
+
+    Hash += ((OperandHash << 3) | Op.getType()) << (i&31);
+  }
+  return Hash;
+}
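+
+// A worked example (illustrative, not from the original source): for an
+// instruction like "ADD r1, r2", Hash starts at the ADD opcode; each
+// operand's value (the register numbers here) is then tagged with its
+// operand type, shifted by the operand index, and added in.  Blocks whose
+// final instructions hash equally become neighbors after the stable_sort
+// in TryTailMergeBlocks, so candidate tails can be compared pairwise.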
+
+/// HashEndOfMBB - Hash the last non-debug instruction in the MBB.
+static unsigned HashEndOfMBB(const MachineBasicBlock *MBB) {
+  MachineBasicBlock::const_iterator I = MBB->end();
+  if (I == MBB->begin())
+    return 0;   // Empty MBB.
+
+  --I;
+  // Skip debug info so it will not affect codegen.
+  while (I->isDebugValue()) {
+    if (I==MBB->begin())
+      return 0;      // MBB empty except for debug info.
+    --I;
+  }
+
+  return HashMachineInstr(I);
+}
+
+/// ComputeCommonTailLength - Given two machine basic blocks, compute the
+/// number of instructions they actually have in common at their ends.  Return
+/// iterators to the first shared instruction in each block.
+static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
+                                        MachineBasicBlock *MBB2,
+                                        MachineBasicBlock::iterator &I1,
+                                        MachineBasicBlock::iterator &I2) {
+  I1 = MBB1->end();
+  I2 = MBB2->end();
+
+  unsigned TailLen = 0;
+  while (I1 != MBB1->begin() && I2 != MBB2->begin()) {
+    --I1; --I2;
+    // Skip debugging pseudos; necessary to avoid changing the code.
+    while (I1->isDebugValue()) {
+      if (I1==MBB1->begin()) {
+        while (I2->isDebugValue()) {
+          if (I2==MBB2->begin())
+            // I1==DBG at begin; I2==DBG at begin
+            return TailLen;
+          --I2;
+        }
+        ++I2;
+        // I1==DBG at begin; I2==non-DBG, or first of DBGs not at begin
+        return TailLen;
+      }
+      --I1;
+    }
+    // I1==first (untested) non-DBG preceding known match
+    while (I2->isDebugValue()) {
+      if (I2==MBB2->begin()) {
+        ++I1;
+        // I1==non-DBG, or first of DBGs not at begin; I2==DBG at begin
+        return TailLen;
+      }
+      --I2;
+    }
+    // I1, I2==first (untested) non-DBGs preceding known match
+    if (!I1->isIdenticalTo(I2)) {
+      ++I1; ++I2;
+      break;
+    }
+    ++TailLen;
+  }
+  // Back past possible debugging pseudos at beginning of block.  This matters
+  // when one block differs from the other only by whether debugging pseudos
+  // are present at the beginning.  (This way, the various checks later for
+  // I1==MBB1->begin() work as expected.)
+  if (I1 == MBB1->begin() && I2 != MBB2->begin()) {
+    --I2;
+    while (I2->isDebugValue()) {
+      if (I2 == MBB2->begin())
+        return TailLen;
+      --I2;
+    }
+    ++I2;
+  }
+  if (I2 == MBB2->begin() && I1 != MBB1->begin()) {
+    --I1;
+    while (I1->isDebugValue()) {
+      if (I1 == MBB1->begin())
+        return TailLen;
+      --I1;
+    }
+    ++I1;
+  }
+  return TailLen;
+}
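+
+// For example (illustrative): given
+//   MBB1: a; b; x; y; z        MBB2: c; x; y; z
+// the backward walk matches z, y, x and stops at b vs. c, returning
+// TailLen == 3 with I1 and I2 left pointing at the two x instructions.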
+
+/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything
+/// after it, replacing it with an unconditional branch to NewDest.
+void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
+                                           MachineBasicBlock *NewDest) {
+  TII->ReplaceTailWithBranchTo(OldInst, NewDest);
+  ++NumTailMerge;
+}
+
+/// SplitMBBAt - Given a machine basic block and an iterator into it, split
+/// the MBB so that the instructions before the iterator stay in CurMBB and
+/// those from the iterator onward move into a new fall-through block.  This
+/// returns the new MBB, or null if the target says splitting here is illegal.
+MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
+                                            MachineBasicBlock::iterator BBI1) {
+  if (!TII->isLegalToSplitMBBAt(CurMBB, BBI1))
+    return 0;
+
+  MachineFunction &MF = *CurMBB.getParent();
+
+  // Create the fall-through block.
+  MachineFunction::iterator MBBI = &CurMBB;
+  MachineBasicBlock *NewMBB =
+    MF.CreateMachineBasicBlock(CurMBB.getBasicBlock());
+  CurMBB.getParent()->insert(++MBBI, NewMBB);
+
+  // Move all the successors of this block to the specified block.
+  NewMBB->transferSuccessors(&CurMBB);
+
+  // Add an edge from CurMBB to NewMBB for the fall-through.
+  CurMBB.addSuccessor(NewMBB);
+
+  // Splice the code over.
+  NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end());
+
+  // For targets that use the register scavenger, we must maintain LiveIns.
+  if (RS) {
+    RS->enterBasicBlock(&CurMBB);
+    if (!CurMBB.empty())
+      RS->forward(prior(CurMBB.end()));
+    BitVector RegsLiveAtExit(TRI->getNumRegs());
+    RS->getRegsUsed(RegsLiveAtExit, false);
+    for (unsigned int i = 0, e = TRI->getNumRegs(); i != e; i++)
+      if (RegsLiveAtExit[i])
+        NewMBB->addLiveIn(i);
+  }
+
+  return NewMBB;
+}
+
+/// EstimateRuntime - Make a rough estimate for how long it will take to run
+/// the specified code.
+static unsigned EstimateRuntime(MachineBasicBlock::iterator I,
+                                MachineBasicBlock::iterator E) {
+  unsigned Time = 0;
+  for (; I != E; ++I) {
+    if (I->isDebugValue())
+      continue;
+    const TargetInstrDesc &TID = I->getDesc();
+    if (TID.isCall())
+      Time += 10;
+    else if (TID.mayLoad() || TID.mayStore())
+      Time += 2;
+    else
+      ++Time;
+  }
+  return Time;
+}
+
+// CurMBB needs to add an unconditional branch to SuccMBB (we removed these
+// branches temporarily for tail merging).  In the case where CurMBB ends
+// with a conditional branch to the next block, optimize by reversing the
+// test and conditionally branching to SuccMBB instead.
+static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB,
+                    const TargetInstrInfo *TII) {
+  MachineFunction *MF = CurMBB->getParent();
+  MachineFunction::iterator I = llvm::next(MachineFunction::iterator(CurMBB));
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  DebugLoc dl;  // FIXME: no real source location for inserted branches.
+  if (I != MF->end() &&
+      !TII->AnalyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {
+    MachineBasicBlock *NextBB = I;
+    if (TBB == NextBB && !Cond.empty() && !FBB) {
+      if (!TII->ReverseBranchCondition(Cond)) {
+        TII->RemoveBranch(*CurMBB);
+        TII->InsertBranch(*CurMBB, SuccBB, NULL, Cond, dl);
+        return;
+      }
+    }
+  }
+  TII->InsertBranch(*CurMBB, SuccBB, NULL,
+                    SmallVector<MachineOperand, 0>(), dl);
+}
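+
+// For example (illustrative): if CurMBB ends in "Bcc NextBB" where NextBB is
+// the layout successor, appending "B SuccBB" would cost two branches;
+// reversing to "Bncc SuccBB" branches conditionally to SuccBB and falls
+// through to NextBB instead.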
+
+bool
+BranchFolder::MergePotentialsElt::operator<(const MergePotentialsElt &o) const {
+  if (getHash() < o.getHash())
+    return true;
+  else if (getHash() > o.getHash())
+    return false;
+  else if (getBlock()->getNumber() < o.getBlock()->getNumber())
+    return true;
+  else if (getBlock()->getNumber() > o.getBlock()->getNumber())
+    return false;
+  else {
+    // _GLIBCXX_DEBUG checks strict weak ordering, which involves comparing
+    // an object with itself.
+#ifndef _GLIBCXX_DEBUG
+    llvm_unreachable("Predecessor appears twice");
+#endif
+    return false;
+  }
+}
+
+/// CountTerminators - Count the number of terminators in the given
+/// block and set I to the position of the first non-terminator, if there
+/// is one, or MBB->end() otherwise.
+static unsigned CountTerminators(MachineBasicBlock *MBB,
+                                 MachineBasicBlock::iterator &I) {
+  I = MBB->end();
+  unsigned NumTerms = 0;
+  for (;;) {
+    if (I == MBB->begin()) {
+      I = MBB->end();
+      break;
+    }
+    --I;
+    if (!I->getDesc().isTerminator()) break;
+    ++NumTerms;
+  }
+  return NumTerms;
+}
+
+/// ProfitableToMerge - Check if two machine basic blocks have a common tail
+/// and decide if it would be profitable to merge those tails.  Return the
+/// length of the common tail and iterators to the first common instruction
+/// in each block.
+static bool ProfitableToMerge(MachineBasicBlock *MBB1,
+                              MachineBasicBlock *MBB2,
+                              unsigned minCommonTailLength,
+                              unsigned &CommonTailLen,
+                              MachineBasicBlock::iterator &I1,
+                              MachineBasicBlock::iterator &I2,
+                              MachineBasicBlock *SuccBB,
+                              MachineBasicBlock *PredBB) {
+  CommonTailLen = ComputeCommonTailLength(MBB1, MBB2, I1, I2);
+  MachineFunction *MF = MBB1->getParent();
+
+  if (CommonTailLen == 0)
+    return false;
+
+  // It's almost always profitable to merge any number of non-terminator
+  // instructions with the block that falls through into the common successor.
+  if (MBB1 == PredBB || MBB2 == PredBB) {
+    MachineBasicBlock::iterator I;
+    unsigned NumTerms = CountTerminators(MBB1 == PredBB ? MBB2 : MBB1, I);
+    if (CommonTailLen > NumTerms)
+      return true;
+  }
+
+  // If one of the blocks can be completely merged and happens to be in
+  // a position where the other could fall through into it, merge any number
+  // of instructions, because it can be done without a branch.
+  // TODO: If the blocks are not adjacent, move one of them so that they are?
+  if (MBB1->isLayoutSuccessor(MBB2) && I2 == MBB2->begin())
+    return true;
+  if (MBB2->isLayoutSuccessor(MBB1) && I1 == MBB1->begin())
+    return true;
+
+  // If both blocks have an unconditional branch temporarily stripped out,
+  // count that as an additional common instruction for the following
+  // heuristics.
+  unsigned EffectiveTailLen = CommonTailLen;
+  if (SuccBB && MBB1 != PredBB && MBB2 != PredBB &&
+      !MBB1->back().getDesc().isBarrier() &&
+      !MBB2->back().getDesc().isBarrier())
+    ++EffectiveTailLen;
+
+  // Check if the common tail is long enough to be worthwhile.
+  if (EffectiveTailLen >= minCommonTailLength)
+    return true;
+
+  // If we are optimizing for code size, 2 instructions in common is enough if
+  // we don't have to split a block.  At worst we will be introducing 1 new
+  // branch instruction, which is likely to be smaller than the 2
+  // instructions that would be deleted in the merge.
+  if (EffectiveTailLen >= 2 &&
+      MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize) &&
+      (I1 == MBB1->begin() || I2 == MBB2->begin()))
+    return true;
+
+  return false;
+}
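+
+// For example (illustrative): with minCommonTailLength == 3, two blocks
+// sharing "mov; add; ret" merge, while two sharing only "ret" do not,
+// unless one of them is the fall-through predecessor PredBB, one would
+// become a pure fall-through, or we are optimizing for size.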
+
+/// ComputeSameTails - Look through all the blocks in MergePotentials that have
+/// hash CurHash (guaranteed to match the last element).  Build the vector
+/// SameTails of all those that have the (same) largest number of instructions
+/// in common of any pair of these blocks.  SameTails entries contain an
+/// iterator into MergePotentials (from which the MachineBasicBlock can be
+/// found) and a MachineBasicBlock::iterator into that MBB indicating the
+/// instruction where the matching code sequence begins.
+/// Order of elements in SameTails is the reverse of the order in which
+/// those blocks appear in MergePotentials (where they are not necessarily
+/// consecutive).
+unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
+                                        unsigned minCommonTailLength,
+                                        MachineBasicBlock *SuccBB,
+                                        MachineBasicBlock *PredBB) {
+  unsigned maxCommonTailLength = 0U;
+  SameTails.clear();
+  MachineBasicBlock::iterator TrialBBI1, TrialBBI2;
+  MPIterator HighestMPIter = prior(MergePotentials.end());
+  for (MPIterator CurMPIter = prior(MergePotentials.end()),
+                  B = MergePotentials.begin();
+       CurMPIter != B && CurMPIter->getHash() == CurHash;
+       --CurMPIter) {
+    for (MPIterator I = prior(CurMPIter); I->getHash() == CurHash; --I) {
+      unsigned CommonTailLen;
+      if (ProfitableToMerge(CurMPIter->getBlock(), I->getBlock(),
+                            minCommonTailLength,
+                            CommonTailLen, TrialBBI1, TrialBBI2,
+                            SuccBB, PredBB)) {
+        if (CommonTailLen > maxCommonTailLength) {
+          SameTails.clear();
+          maxCommonTailLength = CommonTailLen;
+          HighestMPIter = CurMPIter;
+          SameTails.push_back(SameTailElt(CurMPIter, TrialBBI1));
+        }
+        if (HighestMPIter == CurMPIter &&
+            CommonTailLen == maxCommonTailLength)
+          SameTails.push_back(SameTailElt(I, TrialBBI2));
+      }
+      if (I == B)
+        break;
+    }
+  }
+  return maxCommonTailLength;
+}
+
+/// RemoveBlocksWithHash - Remove all blocks with hash CurHash from
+/// MergePotentials, restoring branches at ends of blocks as appropriate.
+void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
+                                        MachineBasicBlock *SuccBB,
+                                        MachineBasicBlock *PredBB) {
+  MPIterator CurMPIter, B;
+  for (CurMPIter = prior(MergePotentials.end()), B = MergePotentials.begin();
+       CurMPIter->getHash() == CurHash;
+       --CurMPIter) {
+    // Put the unconditional branch back, if we need one.
+    MachineBasicBlock *CurMBB = CurMPIter->getBlock();
+    if (SuccBB && CurMBB != PredBB)
+      FixTail(CurMBB, SuccBB, TII);
+    if (CurMPIter == B)
+      break;
+  }
+  if (CurMPIter->getHash() != CurHash)
+    CurMPIter++;
+  MergePotentials.erase(CurMPIter, MergePotentials.end());
+}
+
+/// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consists
+/// only of the common tail.  Create a block that does by splitting one of
+/// them.
+bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
+                                             unsigned maxCommonTailLength,
+                                             unsigned &commonTailIndex) {
+  commonTailIndex = 0;
+  unsigned TimeEstimate = ~0U;
+  for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
+    // Use PredBB if possible; that doesn't require a new branch.
+    if (SameTails[i].getBlock() == PredBB) {
+      commonTailIndex = i;
+      break;
+    }
+    // Otherwise, make a (fairly bogus) choice based on estimate of
+    // how long it will take the various blocks to execute.
+    unsigned t = EstimateRuntime(SameTails[i].getBlock()->begin(),
+                                 SameTails[i].getTailStartPos());
+    if (t <= TimeEstimate) {
+      TimeEstimate = t;
+      commonTailIndex = i;
+    }
+  }
+
+  MachineBasicBlock::iterator BBI =
+    SameTails[commonTailIndex].getTailStartPos();
+  MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+
+  // If the common tail includes any debug info we will take it pretty
+  // randomly from one of the inputs.  Might be better to remove it?
+  DEBUG(dbgs() << "\nSplitting BB#" << MBB->getNumber() << ", size "
+               << maxCommonTailLength);
+
+  MachineBasicBlock *newMBB = SplitMBBAt(*MBB, BBI);
+  if (!newMBB) {
+    DEBUG(dbgs() << "... failed!");
+    return false;
+  }
+
+  SameTails[commonTailIndex].setBlock(newMBB);
+  SameTails[commonTailIndex].setTailStartPos(newMBB->begin());
+
+  // If we split PredBB, newMBB is the new predecessor.
+  if (PredBB == MBB)
+    PredBB = newMBB;
+
+  return true;
+}
+
+// See if any of the blocks in MergePotentials (which all have a common single
+// successor, or all have no successor) can be tail-merged.  If there is a
+// successor, any blocks in MergePotentials that are not tail-merged and
+// are not immediately before Succ must have an unconditional branch to
+// Succ added (but the predecessor/successor lists need no adjustment).
+// The lone predecessor of Succ that falls through into Succ,
+// if any, is given in PredBB.
+
+bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
+                                      MachineBasicBlock *PredBB) {
+  bool MadeChange = false;
+
+  // Except for the special cases below, tail-merge if there are at least
+  // this many instructions in common.
+  unsigned minCommonTailLength = TailMergeSize;
+
+  DEBUG(dbgs() << "\nTryTailMergeBlocks: ";
+        for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
+          dbgs() << "BB#" << MergePotentials[i].getBlock()->getNumber()
+                 << (i == e-1 ? "" : ", ");
+        dbgs() << "\n";
+        if (SuccBB) {
+          dbgs() << "  with successor BB#" << SuccBB->getNumber() << '\n';
+          if (PredBB)
+            dbgs() << "  which has fall-through from BB#"
+                   << PredBB->getNumber() << "\n";
+        }
+        dbgs() << "Looking for common tails of at least "
+               << minCommonTailLength << " instruction"
+               << (minCommonTailLength == 1 ? "" : "s") << '\n';
+       );
+
+  // Sort by hash value so that blocks with identical end sequences sort
+  // together.
+  std::stable_sort(MergePotentials.begin(), MergePotentials.end());
+
+  // Walk through equivalence sets looking for actual exact matches.
+  while (MergePotentials.size() > 1) {
+    unsigned CurHash = MergePotentials.back().getHash();
+
+    // Build SameTails, identifying the set of blocks with this hash code
+    // and with the maximum number of instructions in common.
+    unsigned maxCommonTailLength = ComputeSameTails(CurHash,
+                                                    minCommonTailLength,
+                                                    SuccBB, PredBB);
+
+    // If we didn't find any pair that has at least minCommonTailLength
+    // instructions in common, remove all blocks with this hash code and retry.
+    if (SameTails.empty()) {
+      RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
+      continue;
+    }
+
+    // If one of the blocks is the entire common tail (and not the entry
+    // block, which we can't jump to), we can treat all blocks with this same
+    // tail at once.  Use PredBB if that is one of the possibilities, as that
+    // will not introduce any extra branches.
+    MachineBasicBlock *EntryBB = MergePotentials.begin()->getBlock()->
+                                 getParent()->begin();
+    unsigned commonTailIndex = SameTails.size();
+    // If there are two blocks, check to see if one can be made to fall through
+    // into the other.
+    if (SameTails.size() == 2 &&
+        SameTails[0].getBlock()->isLayoutSuccessor(SameTails[1].getBlock()) &&
+        SameTails[1].tailIsWholeBlock())
+      commonTailIndex = 1;
+    else if (SameTails.size() == 2 &&
+             SameTails[1].getBlock()->isLayoutSuccessor(
+                                                     SameTails[0].getBlock()) &&
+             SameTails[0].tailIsWholeBlock())
+      commonTailIndex = 0;
+    else {
+      // Otherwise just pick one, favoring the fall-through predecessor if
+      // there is one.
+      for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
+        MachineBasicBlock *MBB = SameTails[i].getBlock();
+        if (MBB == EntryBB && SameTails[i].tailIsWholeBlock())
+          continue;
+        if (MBB == PredBB) {
+          commonTailIndex = i;
+          break;
+        }
+        if (SameTails[i].tailIsWholeBlock())
+          commonTailIndex = i;
+      }
+    }
+
+    if (commonTailIndex == SameTails.size() ||
+        (SameTails[commonTailIndex].getBlock() == PredBB &&
+         !SameTails[commonTailIndex].tailIsWholeBlock())) {
+      // None of the blocks consist entirely of the common tail.
+      // Split a block so that one does.
+      if (!CreateCommonTailOnlyBlock(PredBB,
+                                     maxCommonTailLength, commonTailIndex)) {
+        RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
+        continue;
+      }
+    }
+
+    MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+    // MBB is common tail.  Adjust all other BB's to jump to this one.
+    // Traversal must be forwards so erases work.
+    DEBUG(dbgs() << "\nUsing common tail in BB#" << MBB->getNumber()
+                 << " for ");
+    for (unsigned int i=0, e = SameTails.size(); i != e; ++i) {
+      if (commonTailIndex == i)
+        continue;
+      DEBUG(dbgs() << "BB#" << SameTails[i].getBlock()->getNumber()
+                   << (i == e-1 ? "" : ", "));
+      // Hack the end off BB i, making it jump to BB commonTailIndex instead.
+      ReplaceTailWithBranchTo(SameTails[i].getTailStartPos(), MBB);
+      // BB i is no longer a predecessor of SuccBB; remove it from the worklist.
+      MergePotentials.erase(SameTails[i].getMPIter());
+    }
+    DEBUG(dbgs() << "\n");
+    // We leave commonTailIndex in the worklist in case there are other blocks
+    // that match it with a smaller number of instructions.
+    MadeChange = true;
+  }
+  return MadeChange;
+}
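+
+// For example (illustrative): three return blocks all ending in the same
+// "pop; ret" sequence hash equally and sort together; ComputeSameTails
+// finds their maximal shared tail, one block is reused (or split) to hold
+// the canonical copy, and the others get their tails replaced by branches
+// to it via ReplaceTailWithBranchTo.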
+
+bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
+  if (!EnableTailMerge) return false;
+
+  bool MadeChange = false;
+
+  // First find blocks with no successors.
+  MergePotentials.clear();
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    if (I->succ_empty())
+      MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(I), I));
+  }
+
+  // See if we can do any tail merging on those.
+  if (MergePotentials.size() < TailMergeThreshold &&
+      MergePotentials.size() >= 2)
+    MadeChange |= TryTailMergeBlocks(NULL, NULL);
+
+  // Look at blocks (IBB) with multiple predecessors (PBB).
+  // We change each predecessor to a canonical form, by
+  // (1) temporarily removing any unconditional branch from the predecessor
+  // to IBB, and
+  // (2) alter conditional branches so they branch to the other block
+  // not IBB; this may require adding back an unconditional branch to IBB
+  // later, where there wasn't one coming in.  E.g.
+  //   Bcc IBB
+  //   fallthrough to QBB
+  // here becomes
+  //   Bncc QBB
+  // with a conceptual B to IBB after that, which never actually exists.
+  // With those changes, we see whether the predecessors' tails match,
+  // and merge them if so.  We change things out of canonical form and
+  // back to the way they were later in the process.  (OptimizeBranches
+  // would undo some of this, but we can't use it, because we'd get into
+  // a compile-time infinite loop repeatedly doing and undoing the same
+  // transformations.)
+
+  for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
+       I != E; ++I) {
+    if (I->pred_size() >= 2 && I->pred_size() < TailMergeThreshold) {
+      SmallPtrSet<MachineBasicBlock *, 8> UniquePreds;
+      MachineBasicBlock *IBB = I;
+      MachineBasicBlock *PredBB = prior(I);
+      MergePotentials.clear();
+      for (MachineBasicBlock::pred_iterator P = I->pred_begin(),
+                                            E2 = I->pred_end();
+           P != E2; ++P) {
+        MachineBasicBlock *PBB = *P;
+        // Skip blocks that loop to themselves; we can't tail-merge these.
+        if (PBB == IBB)
+          continue;
+        // Visit each predecessor only once.
+        if (!UniquePreds.insert(PBB))
+          continue;
+        MachineBasicBlock *TBB = 0, *FBB = 0;
+        SmallVector<MachineOperand, 4> Cond;
+        if (!TII->AnalyzeBranch(*PBB, TBB, FBB, Cond, true)) {
+          // Failing case:  IBB is the target of a cbr, and
+          // we cannot reverse the branch.
+          SmallVector<MachineOperand, 4> NewCond(Cond);
+          if (!Cond.empty() && TBB == IBB) {
+            if (TII->ReverseBranchCondition(NewCond))
+              continue;
+            // This is the QBB case described above
+            if (!FBB)
+              FBB = llvm::next(MachineFunction::iterator(PBB));
+          }
+          // Remove the unconditional branch at the end, if any.
+          if (TBB && (Cond.empty() || FBB)) {
+            DebugLoc dl;  // FIXME: no real source location here.
+            TII->RemoveBranch(*PBB);
+            if (!Cond.empty())
+              // reinsert conditional branch only, for now
+              TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, 0, NewCond, dl);
+          }
+          MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P));
+        }
+      }
+      if (MergePotentials.size() >= 2)
+        MadeChange |= TryTailMergeBlocks(IBB, PredBB);
+      // Reinsert an unconditional branch if needed.  The 1 below can occur
+      // as a result of removing blocks in TryTailMergeBlocks.
+      PredBB = prior(I);      // this may have been changed in TryTailMergeBlocks
+      if (MergePotentials.size() == 1 &&
+          MergePotentials.begin()->getBlock() != PredBB)
+        FixTail(MergePotentials.begin()->getBlock(), IBB, TII);
+    }
+  }
+  return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+//  Branch Optimization
+//===----------------------------------------------------------------------===//
+
+bool BranchFolder::OptimizeBranches(MachineFunction &MF) {
+  bool MadeChange = false;
+
+  // Make sure blocks are numbered in order
+  MF.RenumberBlocks();
+
+  for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ) {
+    MachineBasicBlock *MBB = I++;
+    MadeChange |= OptimizeBlock(MBB);
+
+    // If it is dead, remove it.
+    if (MBB->pred_empty()) {
+      RemoveDeadBlock(MBB);
+      MadeChange = true;
+      ++NumDeadBlocks;
+    }
+  }
+  return MadeChange;
+}
+
+// Blocks should be considered empty if they contain only debug info;
+// else the debug info would affect codegen.
+static bool IsEmptyBlock(MachineBasicBlock *MBB) {
+  if (MBB->empty())
+    return true;
+  for (MachineBasicBlock::iterator MBBI = MBB->begin(), MBBE = MBB->end();
+       MBBI!=MBBE; ++MBBI) {
+    if (!MBBI->isDebugValue())
+      return false;
+  }
+  return true;
+}
+
+// Blocks with only debug info and branches should be considered the same
+// as blocks with only branches.
+static bool IsBranchOnlyBlock(MachineBasicBlock *MBB) {
+  MachineBasicBlock::iterator MBBI, MBBE;
+  for (MBBI = MBB->begin(), MBBE = MBB->end(); MBBI!=MBBE; ++MBBI) {
+    if (!MBBI->isDebugValue())
+      break;
+  }
+  // Guard against a block containing only debug values.
+  if (MBBI == MBBE)
+    return false;
+  return MBBI->getDesc().isBranch();
+}
+
+/// IsBetterFallthrough - Return true if it would be clearly better to
+/// fall-through to MBB1 than to fall through into MBB2.  This has to return
+/// a strict ordering: returning true for both (MBB1,MBB2) and (MBB2,MBB1)
+/// would result in infinite loops.
+static bool IsBetterFallthrough(MachineBasicBlock *MBB1,
+                                MachineBasicBlock *MBB2) {
+  // Right now, we use a simple heuristic.  If MBB2 ends with a call, and
+  // MBB1 doesn't, we prefer to fall through into MBB1.  This allows us to
+  // optimize branches that branch to either a return block or an assert block
+  // into a fallthrough to the return.
+  if (IsEmptyBlock(MBB1) || IsEmptyBlock(MBB2)) return false;
+
+  // If there is a clear successor ordering we make sure that one block
+  // will fall through to the next.
+  if (MBB1->isSuccessor(MBB2)) return true;
+  if (MBB2->isSuccessor(MBB1)) return false;
+
+  // Neither block consists entirely of debug info (per IsEmptyBlock check),
+  // so we needn't test for falling off the beginning here.
+  MachineBasicBlock::iterator MBB1I = --MBB1->end();
+  while (MBB1I->isDebugValue())
+    --MBB1I;
+  MachineBasicBlock::iterator MBB2I = --MBB2->end();
+  while (MBB2I->isDebugValue())
+    --MBB2I;
+  return MBB2I->getDesc().isCall() && !MBB1I->getDesc().isCall();
+}
+
+/// OptimizeBlock - Analyze and optimize control flow related to the specified
+/// block.  This is never called on the entry block.
+bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
+  bool MadeChange = false;
+  MachineFunction &MF = *MBB->getParent();
+  DebugLoc dl;  // FIXME: no real source location for inserted branches.
+ReoptimizeBlock:
+
+  MachineFunction::iterator FallThrough = MBB;
+  ++FallThrough;
+
+  // If this block is empty, make everyone use its fall-through, not the block
+  // explicitly.  Landing pads should not do this since the landing-pad table
+  // points to this block.  Blocks with their addresses taken shouldn't be
+  // optimized away.
+  if (IsEmptyBlock(MBB) && !MBB->hasAddressTaken()) {
+    // Dead block?  Leave for cleanup later.
+    if (MBB->pred_empty()) return MadeChange;
+
+    if (FallThrough == MF.end()) {
+      // TODO: Simplify preds to not branch here if possible!
+    } else {
+      // Rewrite all predecessors of the old block to go to the fallthrough
+      // instead.
+      while (!MBB->pred_empty()) {
+        MachineBasicBlock *Pred = *(MBB->pred_end()-1);
+        Pred->ReplaceUsesOfBlockWith(MBB, FallThrough);
+      }
+      // If MBB was the target of a jump table, update jump tables to go to the
+      // fallthrough instead.
+      if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
+        MJTI->ReplaceMBBInJumpTables(MBB, FallThrough);
+      MadeChange = true;
+    }
+    return MadeChange;
+  }
+
+  // Check to see if we can simplify the terminator of the block before this
+  // one.
+  MachineBasicBlock &PrevBB = *prior(MachineFunction::iterator(MBB));
+
+  MachineBasicBlock *PriorTBB = 0, *PriorFBB = 0;
+  SmallVector<MachineOperand, 4> PriorCond;
+  bool PriorUnAnalyzable =
+    TII->AnalyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true);
+  if (!PriorUnAnalyzable) {
+    // If the CFG for the prior block has extra edges, remove them.
+    MadeChange |= PrevBB.CorrectExtraCFGEdges(PriorTBB, PriorFBB,
+                                              !PriorCond.empty());
+
+    // If the previous branch is conditional and both conditions go to the same
+    // destination, remove the branch, replacing it with an unconditional one or
+    // a fall-through.
+    if (PriorTBB && PriorTBB == PriorFBB) {
+      TII->RemoveBranch(PrevBB);
+      PriorCond.clear();
+      if (PriorTBB != MBB)
+        TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond, dl);
+      MadeChange = true;
+      ++NumBranchOpts;
+      goto ReoptimizeBlock;
+    }
+
+    // If the previous block unconditionally falls through to this block and
+    // this block has no other predecessors, move the contents of this block
+    // into the prior block. This doesn't usually happen when SimplifyCFG
+    // has been used, but it can happen if tail merging splits a fall-through
+    // predecessor of a block.
+    // This has to check PrevBB->succ_size() because EH edges are ignored by
+    // AnalyzeBranch.
+    if (PriorCond.empty() && !PriorTBB && MBB->pred_size() == 1 &&
+        PrevBB.succ_size() == 1 &&
+        !MBB->hasAddressTaken()) {
+      DEBUG(dbgs() << "\nMerging into block: " << PrevBB
+                   << "From MBB: " << *MBB);
+      PrevBB.splice(PrevBB.end(), MBB, MBB->begin(), MBB->end());
+      PrevBB.removeSuccessor(PrevBB.succ_begin());
+      assert(PrevBB.succ_empty());
+      PrevBB.transferSuccessors(MBB);
+      MadeChange = true;
+      return MadeChange;
+    }
+
+    // If the previous branch *only* branches to *this* block (conditional or
+    // not) remove the branch.
+    if (PriorTBB == MBB && PriorFBB == 0) {
+      TII->RemoveBranch(PrevBB);
+      MadeChange = true;
+      ++NumBranchOpts;
+      goto ReoptimizeBlock;
+    }
+
+    // If the prior block branches somewhere else on the condition and here if
+    // the condition is false, remove the uncond second branch.
+    if (PriorFBB == MBB) {
+      TII->RemoveBranch(PrevBB);
+      TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond, dl);
+      MadeChange = true;
+      ++NumBranchOpts;
+      goto ReoptimizeBlock;
+    }
+
+    // If the prior block branches here on true and somewhere else on false, and
+    // if the branch condition is reversible, reverse the branch to create a
+    // fall-through.
+    if (PriorTBB == MBB) {
+      SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
+      if (!TII->ReverseBranchCondition(NewPriorCond)) {
+        TII->RemoveBranch(PrevBB);
+        TII->InsertBranch(PrevBB, PriorFBB, 0, NewPriorCond, dl);
+        MadeChange = true;
+        ++NumBranchOpts;
+        goto ReoptimizeBlock;
+      }
+    }
+
+    // If this block has no successors (e.g. it is a return block or ends with
+    // a call to a no-return function like abort or __cxa_throw) and if the pred
+    // falls through into this block, and if it would otherwise fall through
+    // into the block after this, move this block to the end of the function.
+    //
+    // We consider it more likely that execution will stay in the function
+    // (e.g. due to loops) than it is to exit it.  This helps code that
+    // asserts in loops: the no-return assert block is moved out of the
+    // loop body.
+    if (MBB->succ_empty() && !PriorCond.empty() && PriorFBB == 0 &&
+        MachineFunction::iterator(PriorTBB) == FallThrough &&
+        !MBB->canFallThrough()) {
+      bool DoTransform = true;
+
+      // We have to be careful that the succs of PredBB aren't both no-successor
+      // blocks.  If neither have successors and if PredBB is the second from
+      // last block in the function, we'd just keep swapping the two blocks for
+      // last.  Only do the swap if one is clearly better to fall through than
+      // the other.
+      if (FallThrough == --MF.end() &&
+          !IsBetterFallthrough(PriorTBB, MBB))
+        DoTransform = false;
+
+      if (DoTransform) {
+        // Reverse the branch so we will fall through on the previous true cond.
+        SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
+        if (!TII->ReverseBranchCondition(NewPriorCond)) {
+          DEBUG(dbgs() << "\nMoving MBB: " << *MBB
+                       << "To make fallthrough to: " << *PriorTBB << "\n");
+
+          TII->RemoveBranch(PrevBB);
+          TII->InsertBranch(PrevBB, MBB, 0, NewPriorCond, dl);
+
+          // Move this block to the end of the function.
+          MBB->moveAfter(--MF.end());
+          MadeChange = true;
+          ++NumBranchOpts;
+          return MadeChange;
+        }
+      }
+    }
+  }
+
+  // Analyze the branch in the current block.
+  MachineBasicBlock *CurTBB = 0, *CurFBB = 0;
+  SmallVector<MachineOperand, 4> CurCond;
+  bool CurUnAnalyzable= TII->AnalyzeBranch(*MBB, CurTBB, CurFBB, CurCond, true);
+  if (!CurUnAnalyzable) {
+    // If the CFG for this block has extra edges, remove them.
+    MadeChange |= MBB->CorrectExtraCFGEdges(CurTBB, CurFBB, !CurCond.empty());
+
+    // If this is a two-way branch, and the FBB branches to this block, reverse
+    // the condition so the single-basic-block loop is faster.  Instead of:
+    //    Loop: xxx; jcc Out; jmp Loop
+    // we want:
+    //    Loop: xxx; jncc Loop; jmp Out
+    if (CurTBB && CurFBB && CurFBB == MBB && CurTBB != MBB) {
+      SmallVector<MachineOperand, 4> NewCond(CurCond);
+      if (!TII->ReverseBranchCondition(NewCond)) {
+        TII->RemoveBranch(*MBB);
+        TII->InsertBranch(*MBB, CurFBB, CurTBB, NewCond, dl);
+        MadeChange = true;
+        ++NumBranchOpts;
+        goto ReoptimizeBlock;
+      }
+    }
+
+    // If this branch is the only thing in its block, see if we can forward
+    // other blocks across it.
+    if (CurTBB && CurCond.empty() && CurFBB == 0 &&
+        IsBranchOnlyBlock(MBB) && CurTBB != MBB &&
+        !MBB->hasAddressTaken()) {
+      // This block may contain just an unconditional branch.  Because there can
+      // be 'non-branch terminators' in the block, try removing the branch and
+      // then seeing if the block is empty.
+      TII->RemoveBranch(*MBB);
+      // If the only things remaining in the block are debug info, remove these
+      // as well, so this will behave the same as an empty block in non-debug
+      // mode.
+      if (!MBB->empty()) {
+        bool NonDebugInfoFound = false;
+        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+             I != E; ++I) {
+          if (!I->isDebugValue()) {
+            NonDebugInfoFound = true;
+            break;
+          }
+        }
+        if (!NonDebugInfoFound)
+          // Make the block empty, losing the debug info (we could probably
+          // improve this in some cases.)
+          MBB->erase(MBB->begin(), MBB->end());
+      }
+      // If this block is just an unconditional branch to CurTBB, we can
+      // usually completely eliminate the block.  The only case we cannot
+      // completely eliminate the block is when the block before this one
+      // falls through into MBB and we can't understand the prior block's branch
+      // condition.
+      if (MBB->empty()) {
+        bool PredHasNoFallThrough = !PrevBB.canFallThrough();
+        if (PredHasNoFallThrough || !PriorUnAnalyzable ||
+            !PrevBB.isSuccessor(MBB)) {
+          // If the prior block falls through into us, turn it into an
+          // explicit branch to us to make updates simpler.
+          if (!PredHasNoFallThrough && PrevBB.isSuccessor(MBB) &&
+              PriorTBB != MBB && PriorFBB != MBB) {
+            if (PriorTBB == 0) {
+              assert(PriorCond.empty() && PriorFBB == 0 &&
+                     "Bad branch analysis");
+              PriorTBB = MBB;
+            } else {
+              assert(PriorFBB == 0 && "Machine CFG out of date!");
+              PriorFBB = MBB;
+            }
+            TII->RemoveBranch(PrevBB);
+            TII->InsertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, dl);
+          }
+
+          // Iterate through all the predecessors, revectoring each in turn.
+          size_t PI = 0;
+          bool DidChange = false;
+          bool HasBranchToSelf = false;
+          while (PI != MBB->pred_size()) {
+            MachineBasicBlock *PMBB = *(MBB->pred_begin() + PI);
+            if (PMBB == MBB) {
+              // If this block has an uncond branch to itself, leave it.
+              ++PI;
+              HasBranchToSelf = true;
+            } else {
+              DidChange = true;
+              PMBB->ReplaceUsesOfBlockWith(MBB, CurTBB);
+              // If this change resulted in PMBB ending in a conditional
+              // branch where both conditions go to the same destination,
+              // change this to an unconditional branch (and fix the CFG).
+              MachineBasicBlock *NewCurTBB = 0, *NewCurFBB = 0;
+              SmallVector<MachineOperand, 4> NewCurCond;
+              bool NewCurUnAnalyzable = TII->AnalyzeBranch(*PMBB, NewCurTBB,
+                      NewCurFBB, NewCurCond, true);
+              if (!NewCurUnAnalyzable && NewCurTBB && NewCurTBB == NewCurFBB) {
+                TII->RemoveBranch(*PMBB);
+                NewCurCond.clear();
+                TII->InsertBranch(*PMBB, NewCurTBB, 0, NewCurCond, dl);
+                MadeChange = true;
+                ++NumBranchOpts;
+                PMBB->CorrectExtraCFGEdges(NewCurTBB, 0, false);
+              }
+            }
+          }
+
+          // Change any jumptables to go to the new MBB.
+          if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
+            MJTI->ReplaceMBBInJumpTables(MBB, CurTBB);
+          if (DidChange) {
+            ++NumBranchOpts;
+            MadeChange = true;
+            if (!HasBranchToSelf) return MadeChange;
+          }
+        }
+      }
+
+      // Add the branch back if the block is more than just an uncond branch.
+      TII->InsertBranch(*MBB, CurTBB, 0, CurCond, dl);
+    }
+  }
+
+  // If the prior block doesn't fall through into this block, and if this
+  // block doesn't fall through into some other block, see if we can find a
+  // place to move this block where a fall-through will happen.
+  if (!PrevBB.canFallThrough()) {
+
+    // Now we know that there was no fall-through into this block, check to
+    // see if it has a fall-through into its successor.
+    bool CurFallsThru = MBB->canFallThrough();
+
+    {
+      // Check all the predecessors of this block.  If one of them has no fall
+      // throughs, move this block right after it.
+      for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+           E = MBB->pred_end(); PI != E; ++PI) {
+        // Analyze the branch at the end of the pred.
+        MachineBasicBlock *PredBB = *PI;
+        MachineFunction::iterator PredFallthrough = PredBB; ++PredFallthrough;
+        MachineBasicBlock *PredTBB = 0, *PredFBB = 0;
+        SmallVector<MachineOperand, 4> PredCond;
+        if (PredBB != MBB && !PredBB->canFallThrough() &&
+            !TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true)
+            && (!CurFallsThru || !CurTBB || !CurFBB)
+            && (!CurFallsThru || MBB->getNumber() >= PredBB->getNumber())) {
+          // If the current block doesn't fall through, just move it.
+          // If the current block can fall through and does not end with a
+          // conditional branch, we need to append an unconditional jump to
+          // the (current) next block.  To avoid a possible compile-time
+          // infinite loop, move blocks only backward in this case.
+          // Also, if there are already 2 branches here, we cannot add a third;
+          // this means we have the case
+          // Bcc next
+          // B elsewhere
+          // next:
+          if (CurFallsThru) {
+            MachineBasicBlock *NextBB =
+              llvm::next(MachineFunction::iterator(MBB));
+            CurCond.clear();
+            TII->InsertBranch(*MBB, NextBB, 0, CurCond, dl);
+          }
+          MBB->moveAfter(PredBB);
+          MadeChange = true;
+          goto ReoptimizeBlock;
+        }
+      }
+    }
+
+    if (!CurFallsThru) {
+      // Check all successors to see if we can move this block before it.
+      for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+           E = MBB->succ_end(); SI != E; ++SI) {
+        // Analyze the branch at the end of the block before the succ.
+        MachineBasicBlock *SuccBB = *SI;
+        MachineFunction::iterator SuccPrev = SuccBB; --SuccPrev;
+
+        // If this block doesn't already fall-through to that successor, and if
+        // the succ doesn't already have a block that can fall through into it,
+        // and if the successor isn't an EH destination, we can arrange for the
+        // fallthrough to happen.
+        if (SuccBB != MBB && &*SuccPrev != MBB &&
+            !SuccPrev->canFallThrough() && !CurUnAnalyzable) {
+          MBB->moveBefore(SuccBB);
+          MadeChange = true;
+          goto ReoptimizeBlock;
+        }
+      }
+
+      // Okay, there is no really great place to put this block.  If, however,
+      // the block before this one would be a fall-through if this block were
+      // removed, move this block to the end of the function.
+      MachineBasicBlock *PrevTBB = 0, *PrevFBB = 0;
+      SmallVector<MachineOperand, 4> PrevCond;
+      if (FallThrough != MF.end() &&
+          !TII->AnalyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) &&
+          PrevBB.isSuccessor(FallThrough)) {
+        MBB->moveAfter(--MF.end());
+        MadeChange = true;
+        return MadeChange;
+      }
+    }
+  }
+
+  return MadeChange;
+}
diff --git a/src/LLVM/lib/CodeGen/BranchFolding.h b/src/LLVM/lib/CodeGen/BranchFolding.h
new file mode 100644
index 0000000..15dfa7f
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/BranchFolding.h
@@ -0,0 +1,116 @@
+//===-- BranchFolding.h - Fold machine code branch instructions --*- C++ -*===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_BRANCHFOLDING_HPP
+#define LLVM_CODEGEN_BRANCHFOLDING_HPP
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include <vector>
+
+namespace llvm {
+  class MachineFunction;
+  class MachineModuleInfo;
+  class RegScavenger;
+  class TargetInstrInfo;
+  class TargetRegisterInfo;
+  template<typename T> class SmallVectorImpl;
+
+  class BranchFolder {
+  public:
+    explicit BranchFolder(bool defaultEnableTailMerge);
+
+    bool OptimizeFunction(MachineFunction &MF,
+                          const TargetInstrInfo *tii,
+                          const TargetRegisterInfo *tri,
+                          MachineModuleInfo *mmi);
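+
+    // Typical use from a machine function pass (a sketch; the surrounding
+    // pass boilerplate is assumed, not shown in this file):
+    //   BranchFolder Folder(/*defaultEnableTailMerge=*/true);
+    //   bool Changed = Folder.OptimizeFunction(
+    //       MF, MF.getTarget().getInstrInfo(),
+    //       MF.getTarget().getRegisterInfo(),
+    //       getAnalysisIfAvailable<MachineModuleInfo>());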
+  private:
+    class MergePotentialsElt {
+      unsigned Hash;
+      MachineBasicBlock *Block;
+    public:
+      MergePotentialsElt(unsigned h, MachineBasicBlock *b)
+        : Hash(h), Block(b) {}
+
+      unsigned getHash() const { return Hash; }
+      MachineBasicBlock *getBlock() const { return Block; }
+
+      void setBlock(MachineBasicBlock *MBB) {
+        Block = MBB;
+      }
+
+      bool operator<(const MergePotentialsElt &) const;
+    };
+    typedef std::vector<MergePotentialsElt>::iterator MPIterator;
+    std::vector<MergePotentialsElt> MergePotentials;
+
+    class SameTailElt {
+      MPIterator MPIter;
+      MachineBasicBlock::iterator TailStartPos;
+    public:
+      SameTailElt(MPIterator mp, MachineBasicBlock::iterator tsp)
+        : MPIter(mp), TailStartPos(tsp) {}
+
+      MPIterator getMPIter() const {
+        return MPIter;
+      }
+      MergePotentialsElt &getMergePotentialsElt() const {
+        return *getMPIter();
+      }
+      MachineBasicBlock::iterator getTailStartPos() const {
+        return TailStartPos;
+      }
+      unsigned getHash() const {
+        return getMergePotentialsElt().getHash();
+      }
+      MachineBasicBlock *getBlock() const {
+        return getMergePotentialsElt().getBlock();
+      }
+      bool tailIsWholeBlock() const {
+        return TailStartPos == getBlock()->begin();
+      }
+
+      void setBlock(MachineBasicBlock *MBB) {
+        getMergePotentialsElt().setBlock(MBB);
+      }
+      void setTailStartPos(MachineBasicBlock::iterator Pos) {
+        TailStartPos = Pos;
+      }
+    };
+    std::vector<SameTailElt> SameTails;
+
+    bool EnableTailMerge;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineModuleInfo *MMI;
+    RegScavenger *RS;
+
+    bool TailMergeBlocks(MachineFunction &MF);
+    bool TryTailMergeBlocks(MachineBasicBlock* SuccBB,
+                            MachineBasicBlock* PredBB);
+    void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
+                                 MachineBasicBlock *NewDest);
+    MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB,
+                                  MachineBasicBlock::iterator BBI1);
+    unsigned ComputeSameTails(unsigned CurHash, unsigned minCommonTailLength,
+                              MachineBasicBlock *SuccBB,
+                              MachineBasicBlock *PredBB);
+    void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB,
+                                                MachineBasicBlock* PredBB);
+    bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
+                                   unsigned maxCommonTailLength,
+                                   unsigned &commonTailIndex);
+
+    bool OptimizeBranches(MachineFunction &MF);
+    bool OptimizeBlock(MachineBasicBlock *MBB);
+    void RemoveDeadBlock(MachineBasicBlock *MBB);
+    bool OptimizeImpDefsBlock(MachineBasicBlock *MBB);
+  };
+}
+
+#endif /* LLVM_CODEGEN_BRANCHFOLDING_HPP */
diff --git a/src/LLVM/lib/CodeGen/CMakeLists.txt b/src/LLVM/lib/CodeGen/CMakeLists.txt
new file mode 100644
index 0000000..a1e270e
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/CMakeLists.txt
@@ -0,0 +1,85 @@
+add_llvm_library(LLVMCodeGen
+  AggressiveAntiDepBreaker.cpp
+  Analysis.cpp
+  BranchFolding.cpp
+  CalcSpillWeights.cpp
+  CallingConvLower.cpp
+  CodePlacementOpt.cpp
+  CriticalAntiDepBreaker.cpp
+  DeadMachineInstructionElim.cpp
+  DwarfEHPrepare.cpp
+  ELFCodeEmitter.cpp
+  ELFWriter.cpp
+  GCMetadata.cpp
+  GCMetadataPrinter.cpp
+  GCStrategy.cpp
+  IfConversion.cpp
+  InlineSpiller.cpp
+  IntrinsicLowering.cpp
+  LLVMTargetMachine.cpp
+  LatencyPriorityQueue.cpp
+  LiveInterval.cpp
+  LiveIntervalAnalysis.cpp
+  LiveStackAnalysis.cpp
+  LiveVariables.cpp
+  LowerSubregs.cpp
+  MachineBasicBlock.cpp
+  MachineCSE.cpp
+  MachineDominators.cpp
+  MachineFunction.cpp
+  MachineFunctionAnalysis.cpp
+  MachineFunctionPass.cpp
+  MachineFunctionPrinterPass.cpp
+  MachineInstr.cpp
+  MachineLICM.cpp
+  MachineLoopInfo.cpp
+  MachineModuleInfo.cpp
+  MachineModuleInfoImpls.cpp
+  MachinePassRegistry.cpp
+  MachineRegisterInfo.cpp
+  MachineSSAUpdater.cpp
+  MachineSink.cpp
+  MachineVerifier.cpp
+  ObjectCodeEmitter.cpp
+  OcamlGC.cpp
+  OptimizePHIs.cpp
+  PHIElimination.cpp
+  Passes.cpp
+  PeepholeOptimizer.cpp
+  PostRAHazardRecognizer.cpp
+  PostRASchedulerList.cpp
+  PreAllocSplitting.cpp
+  ProcessImplicitDefs.cpp
+  PrologEpilogInserter.cpp
+  PseudoSourceValue.cpp
+  RegAllocFast.cpp
+  RegAllocLinearScan.cpp
+  RegAllocPBQP.cpp
+  RegisterCoalescer.cpp
+  RegisterScavenging.cpp
+  RenderMachineFunction.cpp
+  ScheduleDAG.cpp
+  ScheduleDAGEmit.cpp
+  ScheduleDAGInstrs.cpp
+  ScheduleDAGPrinter.cpp
+  ShadowStackGC.cpp
+  ShrinkWrapping.cpp
+  SimpleRegisterCoalescing.cpp
+  SjLjEHPrepare.cpp
+  SlotIndexes.cpp
+  Spiller.cpp
+  SplitKit.cpp
+  Splitter.cpp
+  StackProtector.cpp
+  StackSlotColoring.cpp
+  StrongPHIElimination.cpp
+  TailDuplication.cpp
+  TargetInstrInfoImpl.cpp
+  TargetLoweringObjectFileImpl.cpp
+  TwoAddressInstructionPass.cpp
+  UnreachableBlockElim.cpp
+  VirtRegMap.cpp
+  VirtRegRewriter.cpp
+  )
+
+target_link_libraries (LLVMCodeGen LLVMCore LLVMScalarOpts)
diff --git a/src/LLVM/lib/CodeGen/CalcSpillWeights.cpp b/src/LLVM/lib/CodeGen/CalcSpillWeights.cpp
new file mode 100644
index 0000000..8ecce4a
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/CalcSpillWeights.cpp
@@ -0,0 +1,226 @@
+//===------------------------ CalcSpillWeights.cpp ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "calcspillweights"
+
+#include "llvm/Function.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+char CalculateSpillWeights::ID = 0;
+INITIALIZE_PASS(CalculateSpillWeights, "calcspillweights",
+                "Calculate spill weights", false, false);
+
+void CalculateSpillWeights::getAnalysisUsage(AnalysisUsage &au) const {
+  au.addRequired<LiveIntervals>();
+  au.addRequired<MachineLoopInfo>();
+  au.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(au);
+}
+
+bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &fn) {
+
+  DEBUG(dbgs() << "********** Compute Spill Weights **********\n"
+               << "********** Function: "
+               << fn.getFunction()->getName() << '\n');
+
+  LiveIntervals &lis = getAnalysis<LiveIntervals>();
+  VirtRegAuxInfo vrai(fn, lis, getAnalysis<MachineLoopInfo>());
+  for (LiveIntervals::iterator I = lis.begin(), E = lis.end(); I != E; ++I) {
+    LiveInterval &li = *I->second;
+    if (TargetRegisterInfo::isVirtualRegister(li.reg))
+      vrai.CalculateWeightAndHint(li);
+  }
+  return false;
+}
+
+// Return the preferred allocation register for reg, given a COPY instruction.
+static unsigned copyHint(const MachineInstr *mi, unsigned reg,
+                         const TargetRegisterInfo &tri,
+                         const MachineRegisterInfo &mri) {
+  unsigned sub, hreg, hsub;
+  if (mi->getOperand(0).getReg() == reg) {
+    sub = mi->getOperand(0).getSubReg();
+    hreg = mi->getOperand(1).getReg();
+    hsub = mi->getOperand(1).getSubReg();
+  } else {
+    sub = mi->getOperand(1).getSubReg();
+    hreg = mi->getOperand(0).getReg();
+    hsub = mi->getOperand(0).getSubReg();
+  }
+
+  if (!hreg)
+    return 0;
+
+  if (TargetRegisterInfo::isVirtualRegister(hreg))
+    return sub == hsub ? hreg : 0;
+
+  const TargetRegisterClass *rc = mri.getRegClass(reg);
+
+  // Only allow physreg hints in rc.
+  if (sub == 0)
+    return rc->contains(hreg) ? hreg : 0;
+
+  // reg:sub should match the physreg hreg.
+  return tri.getMatchingSuperReg(hreg, sub, rc);
+}
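+
+// For example (illustrative, x86): given "%vreg42 = COPY %EAX" with
+// reg == %vreg42, this returns EAX when EAX belongs to %vreg42's register
+// class, hinting the allocator to put %vreg42 in EAX so the copy can be
+// coalesced away.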
+
+void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
+  MachineRegisterInfo &mri = mf_.getRegInfo();
+  const TargetRegisterInfo &tri = *mf_.getTarget().getRegisterInfo();
+  MachineBasicBlock *mbb = 0;
+  MachineLoop *loop = 0;
+  unsigned loopDepth = 0;
+  bool isExiting = false;
+  float totalWeight = 0;
+  SmallPtrSet<MachineInstr*, 8> visited;
+
+  // Find the best physreg hint and the best virtreg hint.
+  float bestPhys = 0, bestVirt = 0;
+  unsigned hintPhys = 0, hintVirt = 0;
+
+  // Don't recompute a target specific hint.
+  bool noHint = mri.getRegAllocationHint(li.reg).first != 0;
+
+  for (MachineRegisterInfo::reg_iterator I = mri.reg_begin(li.reg);
+       MachineInstr *mi = I.skipInstruction();) {
+    if (mi->isIdentityCopy() || mi->isImplicitDef() || mi->isDebugValue())
+      continue;
+    if (!visited.insert(mi))
+      continue;
+
+    // Get loop info for mi.
+    if (mi->getParent() != mbb) {
+      mbb = mi->getParent();
+      loop = loops_.getLoopFor(mbb);
+      loopDepth = loop ? loop->getLoopDepth() : 0;
+      isExiting = loop ? loop->isLoopExiting(mbb) : false;
+    }
+
+    // Calculate instr weight.
+    bool reads, writes;
+    tie(reads, writes) = mi->readsWritesVirtualRegister(li.reg);
+    float weight = LiveIntervals::getSpillWeight(writes, reads, loopDepth);
+
+    // Give extra weight to what looks like a loop induction variable update.
+    if (writes && isExiting && lis_.isLiveOutOfMBB(li, mbb))
+      weight *= 3;
+
+    totalWeight += weight;
+
+    // Get allocation hints from copies.
+    if (noHint || !mi->isCopy())
+      continue;
+    unsigned hint = copyHint(mi, li.reg, tri, mri);
+    if (!hint)
+      continue;
+    float hweight = hint_[hint] += weight;
+    if (TargetRegisterInfo::isPhysicalRegister(hint)) {
+      if (hweight > bestPhys && lis_.isAllocatable(hint))
+        bestPhys = hweight, hintPhys = hint;
+    } else {
+      if (hweight > bestVirt)
+        bestVirt = hweight, hintVirt = hint;
+    }
+  }
+
+  hint_.clear();
+
+  // Always prefer the physreg hint.
+  if (unsigned hint = hintPhys ? hintPhys : hintVirt) {
+    mri.setRegAllocationHint(li.reg, 0, hint);
+    // Weakly boost the spill weight of hinted registers.
+    totalWeight *= 1.01F;
+  }
+
+  // Mark li as unspillable if all live ranges are tiny.
+  if (li.isZeroLength()) {
+    li.markNotSpillable();
+    return;
+  }
+
+  // If all of the definitions of the interval are re-materializable,
+  // it is a preferred candidate for spilling. If none of the defs are
+  // loads, then it's potentially very cheap to re-materialize.
+  // FIXME: this gets much more complicated once we support non-trivial
+  // re-materialization.
+  bool isLoad = false;
+  SmallVector<LiveInterval*, 4> spillIs;
+  if (lis_.isReMaterializable(li, spillIs, isLoad)) {
+    if (isLoad)
+      totalWeight *= 0.9F;
+    else
+      totalWeight *= 0.5F;
+  }
+
+  li.weight = totalWeight;
+  lis_.normalizeSpillWeight(li);
+}
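+
+// A rough worked example (assuming the common getSpillWeight formulation of
+// (reads + writes) * 10^loopDepth): a register read and written by one
+// instruction in a depth-1 loop accumulates (1 + 1) * 10 = 20, tripled to 60
+// if that write looks like an induction variable update live out of an
+// exiting block, then scaled by 1.01 when hinted and by 0.5 or 0.9 when the
+// defs are rematerializable.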
+
+void VirtRegAuxInfo::CalculateRegClass(unsigned reg) {
+  MachineRegisterInfo &mri = mf_.getRegInfo();
+  const TargetRegisterInfo *tri = mf_.getTarget().getRegisterInfo();
+  const TargetRegisterClass *orc = mri.getRegClass(reg);
+  SmallPtrSet<const TargetRegisterClass*,8> rcs;
+
+  for (MachineRegisterInfo::reg_nodbg_iterator I = mri.reg_nodbg_begin(reg),
+       E = mri.reg_nodbg_end(); I != E; ++I) {
+    // The targets don't have accurate enough regclass descriptions that we can
+    // handle subregs. We need something similar to
+    // TRI::getMatchingSuperRegClass, but returning a super class instead of a
+    // sub class.
+    if (I.getOperand().getSubReg()) {
+      DEBUG(dbgs() << "Cannot handle subregs: " << I.getOperand() << '\n');
+      return;
+    }
+    if (const TargetRegisterClass *rc =
+                                I->getDesc().getRegClass(I.getOperandNo(), tri))
+      rcs.insert(rc);
+  }
+
+  // If we found no regclass constraints, just leave reg as is.
+  // In theory, we could inflate to the largest superclass of reg's existing
+  // class, but that might not be legal for the current cpu setting.
+  // This could happen if reg is only used by COPY instructions, so we may need
+  // to improve on this.
+  if (rcs.empty()) {
+    DEBUG(dbgs() << "Not inflating unconstrained" << orc->getName() << ":%reg"
+                 << reg << ".\n");
+    return;
+  }
+
+  // Compute the intersection of all classes in rcs.
+  // This ought to be independent of iteration order, but if the target register
+  // classes don't form a proper algebra, it is possible to get different
+  // results. The solution is to make sure the intersection of any two register
+  // classes is also a register class or the null set.
+  const TargetRegisterClass *rc = 0;
+  for (SmallPtrSet<const TargetRegisterClass*,8>::iterator I = rcs.begin(),
+         E = rcs.end(); I != E; ++I) {
+    rc = rc ? getCommonSubClass(rc, *I) : *I;
+    assert(rc && "Incompatible regclass constraints found");
+  }
+
+  if (rc == orc)
+    return;
+  DEBUG(dbgs() << "Inflating " << orc->getName() << ":%reg" << reg << " to "
+               << rc->getName() << ".\n");
+  mri.setRegClass(reg, rc);
+}
diff --git a/src/LLVM/lib/CodeGen/CallingConvLower.cpp b/src/LLVM/lib/CodeGen/CallingConvLower.cpp
new file mode 100644
index 0000000..62ad817
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/CallingConvLower.cpp
@@ -0,0 +1,177 @@
+//===-- CallingConvLower.cpp - Calling Conventions ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CCState class, used for lowering and implementing
+// calling conventions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+CCState::CCState(CallingConv::ID CC, bool isVarArg, const TargetMachine &tm,
+                 SmallVector<CCValAssign, 16> &locs, LLVMContext &C)
+  : CallingConv(CC), IsVarArg(isVarArg), TM(tm),
+    TRI(*TM.getRegisterInfo()), Locs(locs), Context(C) {
+  // No stack is used.
+  StackOffset = 0;
+  
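+  // One bit per physical register, rounded up to whole 32-bit words.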
+  UsedRegs.resize((TRI.getNumRegs()+31)/32);
+}
+
+// HandleByVal - Allocate a stack slot large enough to pass an argument by
+// value. The size and alignment information of the argument is encoded in its
+// parameter attribute.
+void CCState::HandleByVal(unsigned ValNo, EVT ValVT,
+                          EVT LocVT, CCValAssign::LocInfo LocInfo,
+                          int MinSize, int MinAlign,
+                          ISD::ArgFlagsTy ArgFlags) {
+  unsigned Align = ArgFlags.getByValAlign();
+  unsigned Size  = ArgFlags.getByValSize();
+  if (MinSize > (int)Size)
+    Size = MinSize;
+  if (MinAlign > (int)Align)
+    Align = MinAlign;
+  unsigned Offset = AllocateStack(Size, Align);
+
+  addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+}
+
+/// MarkAllocated - Mark a register and all of its aliases as allocated.
+void CCState::MarkAllocated(unsigned Reg) {
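+  // Reg/32 selects the 32-bit word in UsedRegs; Reg&31 selects the bit
+  // within it.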
+  UsedRegs[Reg/32] |= 1 << (Reg&31);
+  
+  if (const unsigned *RegAliases = TRI.getAliasSet(Reg))
+    for (; (Reg = *RegAliases); ++RegAliases)
+      UsedRegs[Reg/32] |= 1 << (Reg&31);
+}
+
+/// AnalyzeFormalArguments - Analyze an array of argument values,
+/// incorporating info about the formals into this state.
+void
+CCState::AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                                CCAssignFn Fn) {
+  unsigned NumArgs = Ins.size();
+
+  for (unsigned i = 0; i != NumArgs; ++i) {
+    EVT ArgVT = Ins[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
+    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+      dbgs() << "Formal argument #" << i << " has unhandled type "
+             << ArgVT.getEVTString();
+#endif
+      llvm_unreachable(0);
+    }
+  }
+}
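+
+// Typical use (sketch; the assign function is target-specific, e.g. a
+// TableGen-generated one such as CC_X86_32_C):
+//   SmallVector<CCValAssign, 16> ArgLocs;
+//   CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
+//                  *DAG.getContext());
+//   CCInfo.AnalyzeFormalArguments(Ins, CC_X86_32_C);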
+
+/// CheckReturn - Analyze the return values of a function, returning true if
+/// the return can be performed without sret-demotion, and false otherwise.
+bool CCState::CheckReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                          CCAssignFn Fn) {
+  // Determine which register each value should be copied into.
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    EVT VT = Outs[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+    if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this))
+      return false;
+  }
+  return true;
+}
+
+/// AnalyzeReturn - Analyze the returned values of a return,
+/// incorporating info about the result values into this state.
+void CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                            CCAssignFn Fn) {
+  // Determine which register each value should be copied into.
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    EVT VT = Outs[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+    if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+      dbgs() << "Return operand #" << i << " has unhandled type "
+             << VT.getEVTString();
+#endif
+      llvm_unreachable(0);
+    }
+  }
+}
+
+/// AnalyzeCallOperands - Analyze the outgoing arguments to a call,
+/// incorporating info about the passed values into this state.
+void CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  CCAssignFn Fn) {
+  unsigned NumOps = Outs.size();
+  for (unsigned i = 0; i != NumOps; ++i) {
+    EVT ArgVT = Outs[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+      dbgs() << "Call operand #" << i << " has unhandled type "
+             << ArgVT.getEVTString();
+#endif
+      llvm_unreachable(0);
+    }
+  }
+}
+
+/// AnalyzeCallOperands - Same as above except it takes vectors of types
+/// and argument flags.
+void CCState::AnalyzeCallOperands(SmallVectorImpl<EVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+                                  CCAssignFn Fn) {
+  unsigned NumOps = ArgVTs.size();
+  for (unsigned i = 0; i != NumOps; ++i) {
+    EVT ArgVT = ArgVTs[i];
+    ISD::ArgFlagsTy ArgFlags = Flags[i];
+    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+      dbgs() << "Call operand #" << i << " has unhandled type "
+             << ArgVT.getEVTString();
+#endif
+      llvm_unreachable(0);
+    }
+  }
+}
+
+/// AnalyzeCallResult - Analyze the return values of a call,
+/// incorporating info about the passed values into this state.
+void CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+                                CCAssignFn Fn) {
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    EVT VT = Ins[i].VT;
+    ISD::ArgFlagsTy Flags = Ins[i].Flags;
+    if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this)) {
+#ifndef NDEBUG
+      dbgs() << "Call result #" << i << " has unhandled type "
+             << VT.getEVTString();
+#endif
+      llvm_unreachable(0);
+    }
+  }
+}
+
+/// AnalyzeCallResult - Same as above except it's specialized for calls which
+/// produce a single value.
+void CCState::AnalyzeCallResult(EVT VT, CCAssignFn Fn) {
+  if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this)) {
+#ifndef NDEBUG
+    dbgs() << "Call result has unhandled type "
+           << VT.getEVTString();
+#endif
+    llvm_unreachable(0);
+  }
+}
diff --git a/src/LLVM/lib/CodeGen/CodePlacementOpt.cpp b/src/LLVM/lib/CodeGen/CodePlacementOpt.cpp
new file mode 100644
index 0000000..a554745
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/CodePlacementOpt.cpp
@@ -0,0 +1,422 @@
+//===-- CodePlacementOpt.cpp - Code Placement pass. -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the pass that optimizes code placement and aligns loop
+// headers to target-specific alignment boundaries.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "code-placement"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumLoopsAligned,  "Number of loops aligned");
+STATISTIC(NumIntraElim,     "Number of intra loop branches eliminated");
+STATISTIC(NumIntraMoved,    "Number of intra loop branches moved");
+
+namespace {
+  class CodePlacementOpt : public MachineFunctionPass {
+    const MachineLoopInfo *MLI;
+    const TargetInstrInfo *TII;
+    const TargetLowering  *TLI;
+
+  public:
+    static char ID;
+    CodePlacementOpt() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+    virtual const char *getPassName() const {
+      return "Code Placement Optimizater";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreservedID(MachineDominatorsID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    bool HasFallthrough(MachineBasicBlock *MBB);
+    bool HasAnalyzableTerminator(MachineBasicBlock *MBB);
+    void Splice(MachineFunction &MF,
+                MachineFunction::iterator InsertPt,
+                MachineFunction::iterator Begin,
+                MachineFunction::iterator End);
+    bool EliminateUnconditionalJumpsToTop(MachineFunction &MF,
+                                          MachineLoop *L);
+    bool MoveDiscontiguousLoopBlocks(MachineFunction &MF,
+                                     MachineLoop *L);
+    bool OptimizeIntraLoopEdgesInLoopNest(MachineFunction &MF, MachineLoop *L);
+    bool OptimizeIntraLoopEdges(MachineFunction &MF);
+    bool AlignLoops(MachineFunction &MF);
+    bool AlignLoop(MachineFunction &MF, MachineLoop *L, unsigned Align);
+  };
+
+  char CodePlacementOpt::ID = 0;
+} // end anonymous namespace
+
+FunctionPass *llvm::createCodePlacementOptPass() {
+  return new CodePlacementOpt();
+}
+
+/// HasFallthrough - Test whether the given block has a fallthrough, either as
+/// a plain fallthrough or as a fallthrough case of a conditional branch.
+///
+bool CodePlacementOpt::HasFallthrough(MachineBasicBlock *MBB) {
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond))
+    return false;
+  // This conditional branch has no fallthrough.
+  if (FBB)
+    return false;
+  // An unconditional branch has no fallthrough.
+  if (Cond.empty() && TBB)
+    return false;
+  // It has a fallthrough.
+  return true;
+}
+
+/// HasAnalyzableTerminator - Test whether AnalyzeBranch will succeed on MBB.
+/// This is called before major changes are begun to test whether it will be
+/// possible to complete the changes.
+///
+/// Target-specific code is hereby encouraged to make AnalyzeBranch succeed
+/// whenever possible.
+///
+bool CodePlacementOpt::HasAnalyzableTerminator(MachineBasicBlock *MBB) {
+  // Aggressively handle return blocks and similar constructs.
+  if (MBB->succ_empty()) return true;
+
+  // Ask the target's AnalyzeBranch if it can handle this block.
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  // Make sure the terminator is understood.
+  if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond))
+    return false;
+  // Ignore blocks which look like they might have EH-related control flow.
+  // AnalyzeBranch thinks it knows how to analyze such things, but it doesn't
+  // recognize the possibility of a control transfer through an unwind.
+  // Such blocks contain EH_LABEL instructions, but they may be in the
+  // middle of the block. Instead of searching for them, just check to see
+  // if the CFG disagrees with AnalyzeBranch.
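+  // AnalyzeBranch implies one successor when Cond is empty and two when it is
+  // not; any other successor count means the CFG has edges (e.g. EH) that
+  // AnalyzeBranch cannot see.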
+  if (1u + !Cond.empty() != MBB->succ_size())
+    return false;
+  // Make sure we have the option of reversing the condition.
+  if (!Cond.empty() && TII->ReverseBranchCondition(Cond))
+    return false;
+  return true;
+}
+
+/// Splice - Move the sequence of instructions [Begin,End) to just before
+/// InsertPt. Update branch instructions as needed to account for broken
+/// fallthrough edges and to take advantage of newly exposed fallthrough
+/// opportunities.
+///
+void CodePlacementOpt::Splice(MachineFunction &MF,
+                              MachineFunction::iterator InsertPt,
+                              MachineFunction::iterator Begin,
+                              MachineFunction::iterator End) {
+  assert(Begin != MF.begin() && End != MF.begin() && InsertPt != MF.begin() &&
+         "Splice can't change the entry block!");
+  MachineFunction::iterator OldBeginPrior = prior(Begin);
+  MachineFunction::iterator OldEndPrior = prior(End);
+
+  MF.splice(InsertPt, Begin, End);
+
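+  // Three terminators may now be stale: the block that newly precedes the
+  // moved range, the block that used to precede it, and the last block moved.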
+  prior(Begin)->updateTerminator();
+  OldBeginPrior->updateTerminator();
+  OldEndPrior->updateTerminator();
+}
+
+/// EliminateUnconditionalJumpsToTop - Move blocks which unconditionally jump
+/// to the loop top to the top of the loop so that they have a fall through.
+/// This can introduce a branch on entry to the loop, but it can eliminate a
+/// branch within the loop. See the @simple case in
+/// test/CodeGen/X86/loop_blocks.ll for an example of this.
+bool CodePlacementOpt::EliminateUnconditionalJumpsToTop(MachineFunction &MF,
+                                                        MachineLoop *L) {
+  bool Changed = false;
+  MachineBasicBlock *TopMBB = L->getTopBlock();
+
+  bool BotHasFallthrough = HasFallthrough(L->getBottomBlock());
+
+  if (TopMBB == MF.begin() ||
+      HasAnalyzableTerminator(prior(MachineFunction::iterator(TopMBB)))) {
+  new_top:
+    for (MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin(),
+         PE = TopMBB->pred_end(); PI != PE; ++PI) {
+      MachineBasicBlock *Pred = *PI;
+      if (Pred == TopMBB) continue;
+      if (HasFallthrough(Pred)) continue;
+      if (!L->contains(Pred)) continue;
+
+      // Verify that we can analyze all the loop entry edges before beginning
+      // any changes which will require us to be able to analyze them.
+      if (Pred == MF.begin())
+        continue;
+      if (!HasAnalyzableTerminator(Pred))
+        continue;
+      if (!HasAnalyzableTerminator(prior(MachineFunction::iterator(Pred))))
+        continue;
+
+      // Move the block.
+      DEBUG(dbgs() << "CGP: Moving blocks starting at BB#" << Pred->getNumber()
+                   << " to top of loop.\n");
+      Changed = true;
+
+      // Move it and all the blocks that can reach it via fallthrough edges
+      // exclusively, to keep existing fallthrough edges intact.
+      MachineFunction::iterator Begin = Pred;
+      MachineFunction::iterator End = llvm::next(Begin);
+      while (Begin != MF.begin()) {
+        MachineFunction::iterator Prior = prior(Begin);
+        if (Prior == MF.begin())
+          break;
+        // Stop when a non-fallthrough edge is found.
+        if (!HasFallthrough(Prior))
+          break;
+        // Stop if a block which could fall-through out of the loop is found.
+        if (Prior->isSuccessor(End))
+          break;
+        // If we've reached the top, stop scanning.
+        if (Prior == MachineFunction::iterator(TopMBB)) {
+          // We know top currently has a fall through (because we just checked
+          // it) which would be lost if we do the transformation, so it isn't
+          // worthwhile to do the transformation unless it would expose a new
+          // fallthrough edge.
+          if (!Prior->isSuccessor(End))
+            goto next_pred;
+          // Otherwise we can stop scanning and proceed to move the blocks.
+          break;
+        }
+        // If we hit a switch or something complicated, don't move anything
+        // for this predecessor.
+        if (!HasAnalyzableTerminator(prior(MachineFunction::iterator(Prior))))
+          break;
+        // Ok, the block prior to Begin will be moved along with the rest.
+        // Extend the range to include it.
+        Begin = Prior;
+        ++NumIntraMoved;
+      }
+
+      // Move the blocks.
+      Splice(MF, TopMBB, Begin, End);
+
+      // Update TopMBB.
+      TopMBB = L->getTopBlock();
+
+      // We have a new loop top. Iterate on it. We shouldn't have to do this
+      // too many times if BranchFolding has done a reasonable job.
+      goto new_top;
+    next_pred:;
+    }
+  }
+
+  // If the loop previously didn't exit with a fall-through and it now does,
+  // we eliminated a branch.
+  if (Changed &&
+      !BotHasFallthrough &&
+      HasFallthrough(L->getBottomBlock())) {
+    ++NumIntraElim;
+  }
+
+  return Changed;
+}
+
+/// MoveDiscontiguousLoopBlocks - Move any loop blocks that are not in the
+/// portion of the loop contiguous with the header. This usually makes the loop
+/// contiguous, provided that AnalyzeBranch can handle all the relevant
+/// branching. See the @cfg_islands case in test/CodeGen/X86/loop_blocks.ll
+/// for an example of this.
+bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF,
+                                                   MachineLoop *L) {
+  bool Changed = false;
+  MachineBasicBlock *TopMBB = L->getTopBlock();
+  MachineBasicBlock *BotMBB = L->getBottomBlock();
+
+  // Determine a position to move orphaned loop blocks to. If TopMBB is not
+  // entered via fallthrough and BotMBB is exited via fallthrough, prepend them
+  // to the top of the loop to avoid losing that fallthrough. Otherwise append
+  // them to the bottom, even if it previously had a fallthrough, on the theory
+  // that it's worth an extra branch to keep the loop contiguous.
+  MachineFunction::iterator InsertPt =
+    llvm::next(MachineFunction::iterator(BotMBB));
+  bool InsertAtTop = false;
+  if (TopMBB != MF.begin() &&
+      !HasFallthrough(prior(MachineFunction::iterator(TopMBB))) &&
+      HasFallthrough(BotMBB)) {
+    InsertPt = TopMBB;
+    InsertAtTop = true;
+  }
+
+  // Keep a record of which blocks are in the portion of the loop contiguous
+  // with the loop header.
+  SmallPtrSet<MachineBasicBlock *, 8> ContiguousBlocks;
+  for (MachineFunction::iterator I = TopMBB,
+       E = llvm::next(MachineFunction::iterator(BotMBB)); I != E; ++I)
+    ContiguousBlocks.insert(I);
+
+  // Find non-contiguous blocks and fix them.
+  if (InsertPt != MF.begin() && HasAnalyzableTerminator(prior(InsertPt)))
+    for (MachineLoop::block_iterator BI = L->block_begin(), BE = L->block_end();
+         BI != BE; ++BI) {
+      MachineBasicBlock *BB = *BI;
+
+      // Verify that we can analyze all the loop entry edges before beginning
+      // any changes which will require us to be able to analyze them.
+      if (!HasAnalyzableTerminator(BB))
+        continue;
+      if (!HasAnalyzableTerminator(prior(MachineFunction::iterator(BB))))
+        continue;
+
+      // If the layout predecessor is part of the loop, this block will be
+      // processed along with it. This keeps them in their relative order.
+      if (BB != MF.begin() &&
+          L->contains(prior(MachineFunction::iterator(BB))))
+        continue;
+
+      // Check to see if this block is already contiguous with the main
+      // portion of the loop.
+      if (!ContiguousBlocks.insert(BB))
+        continue;
+
+      // Move the block.
+      DEBUG(dbgs() << "CGP: Moving blocks starting at BB#" << BB->getNumber()
+                   << " to be contiguous with loop.\n");
+      Changed = true;
+
+      // Process this block and all loop blocks contiguous with it, to keep
+      // them in their relative order.
+      MachineFunction::iterator Begin = BB;
+      MachineFunction::iterator End = llvm::next(MachineFunction::iterator(BB));
+      for (; End != MF.end(); ++End) {
+        if (!L->contains(End)) break;
+        if (!HasAnalyzableTerminator(End)) break;
+        ContiguousBlocks.insert(End);
+        ++NumIntraMoved;
+      }
+
+      // If we're inserting at the bottom of the loop, and the code we're
+      // moving originally had fall-through successors, bring the successors
+      // up with the loop blocks to preserve the fall-through edges.
+      if (!InsertAtTop)
+        for (; End != MF.end(); ++End) {
+          if (L->contains(End)) break;
+          if (!HasAnalyzableTerminator(End)) break;
+          if (!HasFallthrough(prior(End))) break;
+        }
+
+      // Move the blocks. This may invalidate TopMBB and/or BotMBB, but
+      // we don't need them anymore at this point.
+      Splice(MF, InsertPt, Begin, End);
+    }
+
+  return Changed;
+}
+
+/// OptimizeIntraLoopEdgesInLoopNest - Reposition loop blocks to minimize
+/// intra-loop branching and to form contiguous loops.
+///
+/// This code takes the approach of making minor changes to the existing
+/// layout to fix specific loop-oriented problems. Also, it depends on
+/// AnalyzeBranch, which can't understand complex control instructions.
+///
+bool CodePlacementOpt::OptimizeIntraLoopEdgesInLoopNest(MachineFunction &MF,
+                                                        MachineLoop *L) {
+  bool Changed = false;
+
+  // Do optimization for nested loops.
+  for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    Changed |= OptimizeIntraLoopEdgesInLoopNest(MF, *I);
+
+  // Do optimization for this loop.
+  Changed |= EliminateUnconditionalJumpsToTop(MF, L);
+  Changed |= MoveDiscontiguousLoopBlocks(MF, L);
+
+  return Changed;
+}
+
+/// OptimizeIntraLoopEdges - Reposition loop blocks to minimize
+/// intra-loop branching and to form contiguous loops.
+///
+bool CodePlacementOpt::OptimizeIntraLoopEdges(MachineFunction &MF) {
+  bool Changed = false;
+
+  if (!TLI->shouldOptimizeCodePlacement())
+    return Changed;
+
+  // Do optimization for each loop in the function.
+  for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end();
+       I != E; ++I)
+    if (!(*I)->getParentLoop())
+      Changed |= OptimizeIntraLoopEdgesInLoopNest(MF, *I);
+
+  return Changed;
+}
+
+/// AlignLoops - Align loop headers to target preferred alignments.
+///
+bool CodePlacementOpt::AlignLoops(MachineFunction &MF) {
+  const Function *F = MF.getFunction();
+  if (F->hasFnAttr(Attribute::OptimizeForSize))
+    return false;
+
+  unsigned Align = TLI->getPrefLoopAlignment();
+  if (!Align)
+    return false;  // Don't care about loop alignment.
+
+  bool Changed = false;
+
+  for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end();
+       I != E; ++I)
+    Changed |= AlignLoop(MF, *I, Align);
+
+  return Changed;
+}
+
+/// AlignLoop - Align loop headers to target preferred alignments.
+///
+bool CodePlacementOpt::AlignLoop(MachineFunction &MF, MachineLoop *L,
+                                 unsigned Align) {
+  bool Changed = false;
+
+  // Do alignment for nested loops.
+  for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    Changed |= AlignLoop(MF, *I, Align);
+
+  L->getTopBlock()->setAlignment(Align);
+  Changed = true;
+  ++NumLoopsAligned;
+
+  return Changed;
+}
+
+bool CodePlacementOpt::runOnMachineFunction(MachineFunction &MF) {
+  MLI = &getAnalysis<MachineLoopInfo>();
+  if (MLI->empty())
+    return false;  // No loops.
+
+  TLI = MF.getTarget().getTargetLowering();
+  TII = MF.getTarget().getInstrInfo();
+
+  bool Changed = OptimizeIntraLoopEdges(MF);
+
+  Changed |= AlignLoops(MF);
+
+  return Changed;
+}
diff --git a/src/LLVM/lib/CodeGen/CriticalAntiDepBreaker.cpp b/src/LLVM/lib/CodeGen/CriticalAntiDepBreaker.cpp
new file mode 100644
index 0000000..98d4d41
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -0,0 +1,607 @@
+//===----- CriticalAntiDepBreaker.cpp - Anti-dep breaker -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CriticalAntiDepBreaker class, which
+// implements register anti-dependence breaking along a block's
+// critical path during post-RA scheduling.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "post-RA-sched"
+#include "CriticalAntiDepBreaker.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+CriticalAntiDepBreaker::
+CriticalAntiDepBreaker(MachineFunction& MFi) :
+  AntiDepBreaker(), MF(MFi),
+  MRI(MF.getRegInfo()),
+  TII(MF.getTarget().getInstrInfo()),
+  TRI(MF.getTarget().getRegisterInfo()),
+  AllocatableSet(TRI->getAllocatableSet(MF)),
+  Classes(TRI->getNumRegs(), static_cast<const TargetRegisterClass *>(0)),
+  KillIndices(TRI->getNumRegs(), 0),
+  DefIndices(TRI->getNumRegs(), 0) {}
+
+CriticalAntiDepBreaker::~CriticalAntiDepBreaker() {
+}
+
+void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
+  const unsigned BBSize = BB->size();
+  for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i) {
+    // Clear out the register class data.
+    Classes[i] = static_cast<const TargetRegisterClass *>(0);
+
+    // Initialize the indices to indicate that no registers are live.
+    KillIndices[i] = ~0u;
+    DefIndices[i] = BBSize;
+  }
+
+  // Clear "do not change" set.
+  KeepRegs.clear();
+
+  bool IsReturnBlock = (!BB->empty() && BB->back().getDesc().isReturn());
+
+  // Determine the live-out physregs for this block.
+  if (IsReturnBlock) {
+    // In a return block, examine the function live-out regs.
+    for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(),
+         E = MRI.liveout_end(); I != E; ++I) {
+      unsigned Reg = *I;
+      Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+      KillIndices[Reg] = BB->size();
+      DefIndices[Reg] = ~0u;
+
+      // Repeat, for all aliases.
+      for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+        unsigned AliasReg = *Alias;
+        Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+        KillIndices[AliasReg] = BB->size();
+        DefIndices[AliasReg] = ~0u;
+      }
+    }
+  }
+
+  // In a non-return block, examine the live-in regs of all successors.
+  // Note a return block can have successors if the return instruction is
+  // predicated.
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+         SE = BB->succ_end(); SI != SE; ++SI)
+    for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
+           E = (*SI)->livein_end(); I != E; ++I) {
+      unsigned Reg = *I;
+      Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+      KillIndices[Reg] = BB->size();
+      DefIndices[Reg] = ~0u;
+
+      // Repeat, for all aliases.
+      for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+        unsigned AliasReg = *Alias;
+        Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+        KillIndices[AliasReg] = BB->size();
+        DefIndices[AliasReg] = ~0u;
+      }
+    }
+
+  // Mark live-out callee-saved registers. In a return block this is
+  // all callee-saved registers. In a non-return block this is any
+  // callee-saved register that is not saved in the prologue.
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  BitVector Pristine = MFI->getPristineRegs(BB);
+  for (const unsigned *I = TRI->getCalleeSavedRegs(); *I; ++I) {
+    unsigned Reg = *I;
+    if (!IsReturnBlock && !Pristine.test(Reg)) continue;
+    Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+    KillIndices[Reg] = BB->size();
+    DefIndices[Reg] = ~0u;
+
+    // Repeat, for all aliases.
+    for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+      unsigned AliasReg = *Alias;
+      Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+      KillIndices[AliasReg] = BB->size();
+      DefIndices[AliasReg] = ~0u;
+    }
+  }
+}
+
+void CriticalAntiDepBreaker::FinishBlock() {
+  RegRefs.clear();
+  KeepRegs.clear();
+}
+
+void CriticalAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count,
+                                     unsigned InsertPosIndex) {
+  if (MI->isDebugValue())
+    return;
+  assert(Count < InsertPosIndex && "Instruction index out of expected range!");
+
+  // Any register which was defined within the previous scheduling region
+  // may have been rescheduled and its lifetime may overlap with registers
+  // in ways not reflected in our current liveness state. For each such
+  // register, adjust the liveness state to be conservatively correct.
+  for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg)
+    if (DefIndices[Reg] < InsertPosIndex && DefIndices[Reg] >= Count) {
+      assert(KillIndices[Reg] == ~0u && "Clobbered register is live!");
+
+      // Mark this register to be non-renamable.
+      Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+      // Move the def index to the end of the previous region, to reflect
+      // that the def could theoretically have been scheduled at the end.
+      DefIndices[Reg] = InsertPosIndex;
+    }
+
+  PrescanInstruction(MI);
+  ScanInstruction(MI, Count);
+}
+
+/// CriticalPathStep - Return the next SUnit after SU on the bottom-up
+/// critical path.
+static const SDep *CriticalPathStep(const SUnit *SU) {
+  const SDep *Next = 0;
+  unsigned NextDepth = 0;
+  // Find the predecessor edge with the greatest depth.
+  for (SUnit::const_pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end();
+       P != PE; ++P) {
+    const SUnit *PredSU = P->getSUnit();
+    unsigned PredLatency = P->getLatency();
+    unsigned PredTotalLatency = PredSU->getDepth() + PredLatency;
+    // In the case of a latency tie, prefer an anti-dependency edge over
+    // other types of edges.
+    if (NextDepth < PredTotalLatency ||
+        (NextDepth == PredTotalLatency && P->getKind() == SDep::Anti)) {
+      NextDepth = PredTotalLatency;
+      Next = &*P;
+    }
+  }
+  return Next;
+}
+
+void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
+  // It's not safe to change register allocation for source operands of
+  // instructions that have special allocation requirements. Also assume all
+  // registers used in a call must not be changed (ABI).
+  // FIXME: The issue with predicated instructions is more complex. We are
+  // being conservative here because the kill markers cannot be trusted after
+  // if-conversion:
+  // %R6<def> = LDR %SP, %reg0, 92, pred:14, pred:%reg0; mem:LD4[FixedStack14]
+  // ...
+  // STR %R0, %R6<kill>, %reg0, 0, pred:0, pred:%CPSR; mem:ST4[%395]
+  // %R6<def> = LDR %SP, %reg0, 100, pred:0, pred:%CPSR; mem:LD4[FixedStack12]
+  // STR %R0, %R6<kill>, %reg0, 0, pred:14, pred:%reg0; mem:ST4[%396](align=8)
+  //
+  // The first R6 kill is not really a kill since it's killed by a predicated
+  // instruction which may not be executed. The second R6 def may or may not
+  // re-define R6 so it's not safe to change it since the last R6 use cannot be
+  // changed.
+  bool Special = MI->getDesc().isCall() ||
+    MI->getDesc().hasExtraSrcRegAllocReq() ||
+    TII->isPredicated(MI);
+
+  // Scan the register operands for this instruction and update
+  // Classes and RegRefs.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+    const TargetRegisterClass *NewRC = 0;
+
+    if (i < MI->getDesc().getNumOperands())
+      NewRC = MI->getDesc().OpInfo[i].getRegClass(TRI);
+
+    // For now, only allow the register to be changed if its register
+    // class is consistent across all uses.
+    if (!Classes[Reg] && NewRC)
+      Classes[Reg] = NewRC;
+    else if (!NewRC || Classes[Reg] != NewRC)
+      Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+    // Now check for aliases.
+    for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+      // If an alias of the reg is used during the live range, give up.
+      // Note that this allows us to skip checking if AntiDepReg
+      // overlaps with any of the aliases, among other things.
+      unsigned AliasReg = *Alias;
+      if (Classes[AliasReg]) {
+        Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+        Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+      }
+    }
+
+    // If we're still willing to consider this register, note the reference.
+    if (Classes[Reg] != reinterpret_cast<TargetRegisterClass *>(-1))
+      RegRefs.insert(std::make_pair(Reg, &MO));
+
+    if (MO.isUse() && Special) {
+      if (KeepRegs.insert(Reg)) {
+        for (const unsigned *Subreg = TRI->getSubRegisters(Reg);
+             *Subreg; ++Subreg)
+          KeepRegs.insert(*Subreg);
+      }
+    }
+  }
+}
+
+void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
+                                             unsigned Count) {
+  // Update liveness.
+  // Proceeding upwards, registers that are def'd but not used in this
+  // instruction are now dead.
+
+  if (!TII->isPredicated(MI)) {
+    // Predicated defs are modeled as read + write, i.e. similar to
+    // two-address updates.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg()) continue;
+      unsigned Reg = MO.getReg();
+      if (Reg == 0) continue;
+      if (!MO.isDef()) continue;
+      // Ignore two-addr defs.
+      if (MI->isRegTiedToUseOperand(i)) continue;
+
+      DefIndices[Reg] = Count;
+      KillIndices[Reg] = ~0u;
+      assert(((KillIndices[Reg] == ~0u) !=
+              (DefIndices[Reg] == ~0u)) &&
+             "Kill and Def maps aren't consistent for Reg!");
+      KeepRegs.erase(Reg);
+      Classes[Reg] = 0;
+      RegRefs.erase(Reg);
+      // Repeat, for all subregs.
+      for (const unsigned *Subreg = TRI->getSubRegisters(Reg);
+           *Subreg; ++Subreg) {
+        unsigned SubregReg = *Subreg;
+        DefIndices[SubregReg] = Count;
+        KillIndices[SubregReg] = ~0u;
+        KeepRegs.erase(SubregReg);
+        Classes[SubregReg] = 0;
+        RegRefs.erase(SubregReg);
+      }
+      // Conservatively mark super-registers as unusable.
+      for (const unsigned *Super = TRI->getSuperRegisters(Reg);
+           *Super; ++Super) {
+        unsigned SuperReg = *Super;
+        Classes[SuperReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+      }
+    }
+  }
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+    if (!MO.isUse()) continue;
+
+    const TargetRegisterClass *NewRC = 0;
+    if (i < MI->getDesc().getNumOperands())
+      NewRC = MI->getDesc().OpInfo[i].getRegClass(TRI);
+
+    // For now, only allow the register to be changed if its register
+    // class is consistent across all uses.
+    if (!Classes[Reg] && NewRC)
+      Classes[Reg] = NewRC;
+    else if (!NewRC || Classes[Reg] != NewRC)
+      Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+    RegRefs.insert(std::make_pair(Reg, &MO));
+
+    // It wasn't previously live, but now it is: this is a kill.
+    if (KillIndices[Reg] == ~0u) {
+      KillIndices[Reg] = Count;
+      DefIndices[Reg] = ~0u;
+      assert(((KillIndices[Reg] == ~0u) !=
+              (DefIndices[Reg] == ~0u)) &&
+             "Kill and Def maps aren't consistent for Reg!");
+    }
+    // Repeat, for all aliases.
+    for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+      unsigned AliasReg = *Alias;
+      if (KillIndices[AliasReg] == ~0u) {
+        KillIndices[AliasReg] = Count;
+        DefIndices[AliasReg] = ~0u;
+      }
+    }
+  }
+}
+
+unsigned
+CriticalAntiDepBreaker::findSuitableFreeRegister(MachineInstr *MI,
+                                                 unsigned AntiDepReg,
+                                                 unsigned LastNewReg,
+                                                 const TargetRegisterClass *RC)
+{
+  for (TargetRegisterClass::iterator R = RC->allocation_order_begin(MF),
+       RE = RC->allocation_order_end(MF); R != RE; ++R) {
+    unsigned NewReg = *R;
+    // Don't replace a register with itself.
+    if (NewReg == AntiDepReg) continue;
+    // Don't replace a register with one that was recently used to repair
+    // an anti-dependence with this AntiDepReg, because that would
+    // re-introduce that anti-dependence.
+    if (NewReg == LastNewReg) continue;
+    // If the instruction already has a def of the NewReg, it's not suitable.
+    // For example, instructions with multiple definitions can result in this
+    // condition.
+    if (MI->modifiesRegister(NewReg, TRI)) continue;
+    // If NewReg is dead and NewReg's most recent def is not before
+    // AntiDepReg's kill, it's safe to replace AntiDepReg with NewReg.
+    assert(((KillIndices[AntiDepReg] == ~0u) != (DefIndices[AntiDepReg] == ~0u))
+           && "Kill and Def maps aren't consistent for AntiDepReg!");
+    assert(((KillIndices[NewReg] == ~0u) != (DefIndices[NewReg] == ~0u))
+           && "Kill and Def maps aren't consistent for NewReg!");
+    if (KillIndices[NewReg] != ~0u ||
+        Classes[NewReg] == reinterpret_cast<TargetRegisterClass *>(-1) ||
+        KillIndices[AntiDepReg] > DefIndices[NewReg])
+      continue;
+    return NewReg;
+  }
+
+  // No registers are free and available!
+  return 0;
+}
+
+unsigned CriticalAntiDepBreaker::
+BreakAntiDependencies(const std::vector<SUnit>& SUnits,
+                      MachineBasicBlock::iterator Begin,
+                      MachineBasicBlock::iterator End,
+                      unsigned InsertPosIndex) {
+  // The code below assumes that there is at least one instruction,
+  // so just duck out immediately if the block is empty.
+  if (SUnits.empty()) return 0;
+
+  // Keep a map of the MachineInstr*'s back to the SUnit representing them.
+  // This is used for updating debug information.
+  DenseMap<MachineInstr*,const SUnit*> MISUnitMap;
+
+  // Find the node at the bottom of the critical path.
+  const SUnit *Max = 0;
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    const SUnit *SU = &SUnits[i];
+    MISUnitMap[SU->getInstr()] = SU;
+    if (!Max || SU->getDepth() + SU->Latency > Max->getDepth() + Max->Latency)
+      Max = SU;
+  }
+
+#ifndef NDEBUG
+  {
+    DEBUG(dbgs() << "Critical path has total latency "
+          << (Max->getDepth() + Max->Latency) << "\n");
+    DEBUG(dbgs() << "Available regs:");
+    for (unsigned Reg = 0; Reg < TRI->getNumRegs(); ++Reg) {
+      if (KillIndices[Reg] == ~0u)
+        DEBUG(dbgs() << " " << TRI->getName(Reg));
+    }
+    DEBUG(dbgs() << '\n');
+  }
+#endif
+
+  // Track progress along the critical path through the SUnit graph as we walk
+  // the instructions.
+  const SUnit *CriticalPathSU = Max;
+  MachineInstr *CriticalPathMI = CriticalPathSU->getInstr();
+
+  // Consider this pattern:
+  //   A = ...
+  //   ... = A
+  //   A = ...
+  //   ... = A
+  //   A = ...
+  //   ... = A
+  //   A = ...
+  //   ... = A
+  // There are three anti-dependencies here, and without special care,
+  // we'd break all of them using the same register:
+  //   A = ...
+  //   ... = A
+  //   B = ...
+  //   ... = B
+  //   B = ...
+  //   ... = B
+  //   B = ...
+  //   ... = B
+  // because at each anti-dependence, B is the first register that
+  // isn't A which is free.  This re-introduces anti-dependencies
+  // at all but one of the original anti-dependencies that we were
+  // trying to break.  To avoid this, keep track of the most recent
+  // register that each register was replaced with, avoid
+  // using it to repair an anti-dependence on the same register.
+  // This lets us produce this:
+  //   A = ...
+  //   ... = A
+  //   B = ...
+  //   ... = B
+  //   C = ...
+  //   ... = C
+  //   B = ...
+  //   ... = B
+  // This still has an anti-dependence on B, but at least it isn't on the
+  // original critical path.
+  //
+  // TODO: If we tracked more than one register here, we could potentially
+  // fix that remaining critical edge too. This is a little more involved,
+  // because unlike the most recent register, less recent registers should
+  // still be considered, though only if no other registers are available.
+  std::vector<unsigned> LastNewReg(TRI->getNumRegs(), 0);
+
+  // Attempt to break anti-dependence edges on the critical path. Walk the
+  // instructions from the bottom up, tracking information about liveness
+  // as we go to help determine which registers are available.
+  unsigned Broken = 0;
+  unsigned Count = InsertPosIndex - 1;
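+  // Count is the bottom-up instruction index fed to the Kill/Def bookkeeping;
+  // it starts just below InsertPosIndex and decreases as we walk up the
+  // region.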
+  for (MachineBasicBlock::iterator I = End, E = Begin;
+       I != E; --Count) {
+    MachineInstr *MI = --I;
+    if (MI->isDebugValue())
+      continue;
+
+    // Check if this instruction has a dependence on the critical path that
+    // is an anti-dependence that we may be able to break. If it is, set
+    // AntiDepReg to the non-zero register associated with the anti-dependence.
+    //
+    // We limit our attention to the critical path as a heuristic to avoid
+    // breaking anti-dependence edges that aren't going to significantly
+    // impact the overall schedule. There are a limited number of registers
+    // and we want to save them for the important edges.
+    //
+    // TODO: Instructions with multiple defs could have multiple
+    // anti-dependencies. The current code here only knows how to break one
+    // edge per instruction. Note that we'd have to be able to break all of
+    // the anti-dependencies in an instruction in order to be effective.
+    unsigned AntiDepReg = 0;
+    if (MI == CriticalPathMI) {
+      if (const SDep *Edge = CriticalPathStep(CriticalPathSU)) {
+        const SUnit *NextSU = Edge->getSUnit();
+
+        // Only consider anti-dependence edges.
+        if (Edge->getKind() == SDep::Anti) {
+          AntiDepReg = Edge->getReg();
+          assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
+          if (!AllocatableSet.test(AntiDepReg))
+            // Don't break anti-dependencies on non-allocatable registers.
+            AntiDepReg = 0;
+          else if (KeepRegs.count(AntiDepReg))
+            // Don't break anti-dependencies if a use further down requires
+            // this exact register.
+            AntiDepReg = 0;
+          else {
+            // If the SUnit has other dependencies on the SUnit that it
+            // anti-depends on, don't bother breaking the anti-dependency
+            // since those edges would prevent such units from being
+            // scheduled past each other regardless.
+            //
+            // Also, if there are dependencies on other SUnits with the
+            // same register as the anti-dependency, don't attempt to
+            // break it.
+            for (SUnit::const_pred_iterator P = CriticalPathSU->Preds.begin(),
+                 PE = CriticalPathSU->Preds.end(); P != PE; ++P)
+              if (P->getSUnit() == NextSU ?
+                    (P->getKind() != SDep::Anti || P->getReg() != AntiDepReg) :
+                    (P->getKind() == SDep::Data && P->getReg() == AntiDepReg)) {
+                AntiDepReg = 0;
+                break;
+              }
+          }
+        }
+        CriticalPathSU = NextSU;
+        CriticalPathMI = CriticalPathSU->getInstr();
+      } else {
+        // We've reached the end of the critical path.
+        CriticalPathSU = 0;
+        CriticalPathMI = 0;
+      }
+    }
+
+    PrescanInstruction(MI);
+
+    // If MI's defs have a special allocation requirement, don't allow
+    // any def registers to be changed. Also assume all registers
+    // defined in a call must not be changed (ABI).
+    if (MI->getDesc().isCall() || MI->getDesc().hasExtraDefRegAllocReq() ||
+        TII->isPredicated(MI))
+      // If this instruction's defs have special allocation requirement, don't
+      // break this anti-dependency.
+      AntiDepReg = 0;
+    else if (AntiDepReg) {
+      // If this instruction has a use of AntiDepReg, breaking it
+      // is invalid.
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (!MO.isReg()) continue;
+        unsigned Reg = MO.getReg();
+        if (Reg == 0) continue;
+        if (MO.isUse() && TRI->regsOverlap(AntiDepReg, Reg)) {
+          AntiDepReg = 0;
+          break;
+        }
+      }
+    }
+
+    // Determine AntiDepReg's register class, if it is live and is
+    // consistently used within a single class.
+    const TargetRegisterClass *RC = AntiDepReg != 0 ? Classes[AntiDepReg] : 0;
+    assert((AntiDepReg == 0 || RC != NULL) &&
+           "Register should be live if it's causing an anti-dependence!");
+    if (RC == reinterpret_cast<TargetRegisterClass *>(-1))
+      AntiDepReg = 0;
+
+    // Look for a suitable register to use to break the anti-dependence.
+    //
+    // TODO: Instead of picking the first free register, consider which might
+    // be the best.
+    if (AntiDepReg != 0) {
+      if (unsigned NewReg = findSuitableFreeRegister(MI, AntiDepReg,
+                                                     LastNewReg[AntiDepReg],
+                                                     RC)) {
+        DEBUG(dbgs() << "Breaking anti-dependence edge on "
+              << TRI->getName(AntiDepReg)
+              << " with " << RegRefs.count(AntiDepReg) << " references"
+              << " using " << TRI->getName(NewReg) << "!\n");
+
+        // Update the references to the old register to refer to the new
+        // register.
+        std::pair<std::multimap<unsigned, MachineOperand *>::iterator,
+                  std::multimap<unsigned, MachineOperand *>::iterator>
+           Range = RegRefs.equal_range(AntiDepReg);
+        for (std::multimap<unsigned, MachineOperand *>::iterator
+             Q = Range.first, QE = Range.second; Q != QE; ++Q) {
+          Q->second->setReg(NewReg);
+          // If the SU for the instruction being updated has debug information
+          // related to the anti-dependency register, make sure to update that
+          // as well.
+          const SUnit *SU = MISUnitMap[Q->second->getParent()];
+          if (!SU) continue;
+          for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i) {
+            MachineInstr *DI = SU->DbgInstrList[i];
+            assert(DI->getNumOperands() == 3 && DI->getOperand(0).isReg() &&
+                   DI->getOperand(0).getReg() &&
+                   "Non-register dbg_value attached to SUnit!");
+            if (DI->getOperand(0).getReg() == AntiDepReg)
+              DI->getOperand(0).setReg(NewReg);
+          }
+        }
+
+        // We just went back in time and modified history; the
+        // liveness information for the anti-dependence reg is now
+        // inconsistent. Set the state as if it were dead.
+        Classes[NewReg] = Classes[AntiDepReg];
+        DefIndices[NewReg] = DefIndices[AntiDepReg];
+        KillIndices[NewReg] = KillIndices[AntiDepReg];
+        assert(((KillIndices[NewReg] == ~0u) !=
+                (DefIndices[NewReg] == ~0u)) &&
+             "Kill and Def maps aren't consistent for NewReg!");
+
+        Classes[AntiDepReg] = 0;
+        DefIndices[AntiDepReg] = KillIndices[AntiDepReg];
+        KillIndices[AntiDepReg] = ~0u;
+        assert(((KillIndices[AntiDepReg] == ~0u) !=
+                (DefIndices[AntiDepReg] == ~0u)) &&
+             "Kill and Def maps aren't consistent for AntiDepReg!");
+
+        RegRefs.erase(AntiDepReg);
+        LastNewReg[AntiDepReg] = NewReg;
+        ++Broken;
+      }
+    }
+
+    ScanInstruction(MI, Count);
+  }
+
+  return Broken;
+}
diff --git a/src/LLVM/lib/CodeGen/CriticalAntiDepBreaker.h b/src/LLVM/lib/CodeGen/CriticalAntiDepBreaker.h
new file mode 100644
index 0000000..0ed7c35
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/CriticalAntiDepBreaker.h
@@ -0,0 +1,100 @@
+//=- llvm/CodeGen/CriticalAntiDepBreaker.h - Anti-Dep Support -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CriticalAntiDepBreaker class, which
+// implements register anti-dependence breaking along a block's
+// critical path during post-RA scheduling.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_CRITICALANTIDEPBREAKER_H
+#define LLVM_CODEGEN_CRITICALANTIDEPBREAKER_H
+
+#include "AntiDepBreaker.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include <map>
+
+namespace llvm {
+class TargetInstrInfo;
+class TargetRegisterInfo;
+
+  class CriticalAntiDepBreaker : public AntiDepBreaker {
+    MachineFunction& MF;
+    MachineRegisterInfo &MRI;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+
+    /// AllocatableSet - The set of allocatable registers.
+    /// We'll be ignoring anti-dependencies on non-allocatable registers,
+    /// because they may not be safe to break.
+    const BitVector AllocatableSet;
+
+    /// Classes - For live regs that are only used in one register class in a
+    /// live range, the register class. If the register is not live, the
+    /// corresponding value is null. If the register is live but used in
+    /// multiple register classes, the corresponding value is -1 casted to a
+    /// pointer.
+    std::vector<const TargetRegisterClass*> Classes;
+
+    /// RegRefs - Map registers to all their references within a live range.
+    std::multimap<unsigned, MachineOperand *> RegRefs;
+
+    /// KillIndices - The index of the most recent kill (proceeding
+    /// bottom-up), or ~0u if the register is not live.
+    std::vector<unsigned> KillIndices;
+
+    /// DefIndices - The index of the most recent complete def (proceeding
+    /// bottom-up), or ~0u if the register is live.
+    std::vector<unsigned> DefIndices;
+
+    /// KeepRegs - A set of registers which are live and cannot be changed to
+    /// break anti-dependencies.
+    SmallSet<unsigned, 4> KeepRegs;
+
+  public:
+    CriticalAntiDepBreaker(MachineFunction& MFi);
+    ~CriticalAntiDepBreaker();
+
+    /// Start - Initialize anti-dep breaking for a new basic block.
+    void StartBlock(MachineBasicBlock *BB);
+
+    /// BreakAntiDependencies - Identify anti-dependencies along the critical
+    /// path of the ScheduleDAG and break them by renaming registers.
+    ///
+    unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
+                                   MachineBasicBlock::iterator Begin,
+                                   MachineBasicBlock::iterator End,
+                                   unsigned InsertPosIndex);
+
+    /// Observe - Update liveness information to account for the current
+    /// instruction, which will not be scheduled.
+    ///
+    void Observe(MachineInstr *MI, unsigned Count, unsigned InsertPosIndex);
+
+    /// Finish - Finish anti-dep breaking for a basic block.
+    void FinishBlock();
+
+  private:
+    void PrescanInstruction(MachineInstr *MI);
+    void ScanInstruction(MachineInstr *MI, unsigned Count);
+    unsigned findSuitableFreeRegister(MachineInstr *MI,
+                                      unsigned AntiDepReg,
+                                      unsigned LastNewReg,
+                                      const TargetRegisterClass *);
+  };
+}
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/DeadMachineInstructionElim.cpp b/src/LLVM/lib/CodeGen/DeadMachineInstructionElim.cpp
new file mode 100644
index 0000000..138b83d
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -0,0 +1,186 @@
+//===- DeadMachineInstructionElim.cpp - Remove dead machine instructions --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an extremely simple MachineInstr-level dead-code-elimination pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "codegen-dce"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Pass.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumDeletes,          "Number of dead instructions deleted");
+
+namespace {
+  class DeadMachineInstructionElim : public MachineFunctionPass {
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+    
+    const TargetRegisterInfo *TRI;
+    const MachineRegisterInfo *MRI;
+    const TargetInstrInfo *TII;
+    BitVector LivePhysRegs;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    DeadMachineInstructionElim() : MachineFunctionPass(ID) {}
+
+  private:
+    bool isDead(const MachineInstr *MI) const;
+  };
+}
+char DeadMachineInstructionElim::ID = 0;
+
+INITIALIZE_PASS(DeadMachineInstructionElim, "dead-mi-elimination",
+                "Remove dead machine instructions", false, false);
+
+FunctionPass *llvm::createDeadMachineInstructionElimPass() {
+  return new DeadMachineInstructionElim();
+}
+
+bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const {
+  // Don't delete instructions with side effects.
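+  // isSafeToMove is a conservative side-effect check: stores, volatile loads,
+  // terminators, and instructions with unmodeled side effects all fail it.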
+  bool SawStore = false;
+  if (!MI->isSafeToMove(TII, 0, SawStore) && !MI->isPHI())
+    return false;
+
+  // Examine each operand.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isDef()) {
+      unsigned Reg = MO.getReg();
+      if (TargetRegisterInfo::isPhysicalRegister(Reg) ?
+          LivePhysRegs[Reg] : !MRI->use_nodbg_empty(Reg)) {
+        // This def has a non-debug use. Don't delete the instruction!
+        return false;
+      }
+    }
+  }
+
+  // If there are no defs with uses, the instruction is dead.
+  return true;
+}
+
+bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
+  bool AnyChanges = false;
+  MRI = &MF.getRegInfo();
+  TRI = MF.getTarget().getRegisterInfo();
+  TII = MF.getTarget().getInstrInfo();
+
+  // Compute a bitvector to represent all non-allocatable physregs.
+  BitVector NonAllocatableRegs = TRI->getAllocatableSet(MF);
+  NonAllocatableRegs.flip();
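+  // Non-allocatable registers (e.g. the stack pointer) are conservatively
+  // treated as live throughout the function.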
+
+  // Loop over all instructions in all blocks, from bottom to top, so that it's
+  // more likely that chains of dependent but ultimately dead instructions will
+  // be cleaned up.
+  for (MachineFunction::reverse_iterator I = MF.rbegin(), E = MF.rend();
+       I != E; ++I) {
+    MachineBasicBlock *MBB = &*I;
+
+    // Start out assuming that all non-allocatable registers are live
+    // out of this block.
+    LivePhysRegs = NonAllocatableRegs;
+
+    // Also add any explicit live-out physregs for this block.
+    if (!MBB->empty() && MBB->back().getDesc().isReturn())
+      for (MachineRegisterInfo::liveout_iterator LOI = MRI->liveout_begin(),
+           LOE = MRI->liveout_end(); LOI != LOE; ++LOI) {
+        unsigned Reg = *LOI;
+        if (TargetRegisterInfo::isPhysicalRegister(Reg))
+          LivePhysRegs.set(Reg);
+      }
+
+    // Now scan the instructions and delete dead ones, tracking physreg
+    // liveness as we go.
+    for (MachineBasicBlock::reverse_iterator MII = MBB->rbegin(),
+         MIE = MBB->rend(); MII != MIE; ) {
+      MachineInstr *MI = &*MII;
+
+      // If the instruction is dead, delete it!
+      if (isDead(MI)) {
+        DEBUG(dbgs() << "DeadMachineInstructionElim: DELETING: " << *MI);
+        // It is possible that some DBG_VALUE instructions refer to this
+        // instruction.  Examine each def operand for such references;
+        // if found, mark the DBG_VALUE as undef (but don't delete it).
+        for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+          const MachineOperand &MO = MI->getOperand(i);
+          if (!MO.isReg() || !MO.isDef())
+            continue;
+          unsigned Reg = MO.getReg();
+          if (!TargetRegisterInfo::isVirtualRegister(Reg))
+            continue;
+          MachineRegisterInfo::use_iterator nextI;
+          for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
+               E = MRI->use_end(); I!=E; I=nextI) {
+            nextI = llvm::next(I);  // I is invalidated by the setReg
+            MachineOperand& Use = I.getOperand();
+            MachineInstr *UseMI = Use.getParent();
+            if (UseMI==MI)
+              continue;
+            assert(Use.isDebug());
+            UseMI->getOperand(0).setReg(0U);
+          }
+        }
+        AnyChanges = true;
+        MI->eraseFromParent();
+        ++NumDeletes;
+        MIE = MBB->rend();
+        // MII is now pointing to the next instruction to process,
+        // so don't increment it.
+        continue;
+      }
+
+      // Record the physreg defs.
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        const MachineOperand &MO = MI->getOperand(i);
+        if (MO.isReg() && MO.isDef()) {
+          unsigned Reg = MO.getReg();
+          if (Reg != 0 && TargetRegisterInfo::isPhysicalRegister(Reg)) {
+            LivePhysRegs.reset(Reg);
+            // Check the subreg set, not the alias set, because a def
+            // of a super-register may still be partially live after
+            // this def.
+            for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+                 *SubRegs; ++SubRegs)
+              LivePhysRegs.reset(*SubRegs);
+          }
+        }
+      }
+      // Record the physreg uses, after the defs, in case a physreg is
+      // both defined and used in the same instruction.
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        const MachineOperand &MO = MI->getOperand(i);
+        if (MO.isReg() && MO.isUse()) {
+          unsigned Reg = MO.getReg();
+          if (Reg != 0 && TargetRegisterInfo::isPhysicalRegister(Reg)) {
+            LivePhysRegs.set(Reg);
+            for (const unsigned *AliasSet = TRI->getAliasSet(Reg);
+                 *AliasSet; ++AliasSet)
+              LivePhysRegs.set(*AliasSet);
+          }
+        }
+      }
+
+      // We didn't delete the current instruction, so increment MII to
+      // the next one.
+      ++MII;
+    }
+  }
+
+  LivePhysRegs.clear();
+  return AnyChanges;
+}
diff --git a/src/LLVM/lib/CodeGen/ELF.h b/src/LLVM/lib/CodeGen/ELF.h
new file mode 100644
index 0000000..fb884c9
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/ELF.h
@@ -0,0 +1,227 @@
+//===-- lib/CodeGen/ELF.h - ELF constants and data structures ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This header contains common, non-processor-specific data structures and
+// constants for the ELF file format.
+//
+// The details of the ELF32 bits in this file are largely based on the Tool
+// Interface Standard (TIS) Executable and Linking Format (ELF) Specification
+// Version 1.2, May 1995. The ELF64 details are based on the HP/Intel
+// definition of the ELF-64 object file format, Version 1.5, Draft 2,
+// May 27, 1998.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ELF_H
+#define CODEGEN_ELF_H
+
+#include "llvm/CodeGen/BinaryObject.h"
+#include "llvm/CodeGen/MachineRelocation.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/System/DataTypes.h"
+
+namespace llvm {
+  class GlobalValue;
+
+  /// ELFSym - This struct contains information about each symbol that is
+  /// added to the logical symbol table for the module.  This is eventually
+  /// turned into a real symbol table in the file.
+  struct ELFSym {
+
+    // An ELF symbol originates from one of the two LLVM types below; for
+    // the other kinds (section, file, function) a null pointer is assumed
+    // by default.
+    union {
+      const GlobalValue *GV;  // If this is a pointer to a GV
+      const char *Ext;        // If this is a pointer to a named symbol
+    } Source;
+
+    // Describes which source type this ELF symbol comes from; it can be
+    // a GlobalValue, an ExternalSymbol, or neither.
+    enum {
+      isGV,      // The Source.GV field is valid.
+      isExtSym,  // The Source.ExtSym field is valid.
+      isOther    // Not a GlobalValue or External Symbol
+    };
+    unsigned SourceType;
+
+    bool isGlobalValue() const { return SourceType == isGV; }
+    bool isExternalSym() const { return SourceType == isExtSym; }
+
+    // getGlobalValue - If this is a global value which originated the
+    // elf symbol, return a reference to it.
+    const GlobalValue *getGlobalValue() const {
+      assert(SourceType == isGV && "This is not a global value");
+      return Source.GV;
+    }
+
+    // getExternalSym - If this is an external symbol which originated the
+    // elf symbol, return a reference to it.
+    const char *getExternalSymbol() const {
+      assert(SourceType == isExtSym && "This is not an external symbol");
+      return Source.Ext;
+    }
+
+    // getGV - From a global value, return an ELF symbol to represent it.
+    static ELFSym *getGV(const GlobalValue *GV, unsigned Bind,
+                         unsigned Type, unsigned Visibility) {
+      ELFSym *Sym = new ELFSym();
+      Sym->Source.GV = GV;
+      Sym->setBind(Bind);
+      Sym->setType(Type);
+      Sym->setVisibility(Visibility);
+      Sym->SourceType = isGV;
+      return Sym;
+    }
+
+    // getExtSym - Create and return an ELF symbol to represent an
+    // external symbol.
+    static ELFSym *getExtSym(const char *Ext) {
+      ELFSym *Sym = new ELFSym();
+      Sym->Source.Ext = Ext;
+      Sym->setBind(ELF::STB_GLOBAL);
+      Sym->setType(ELF::STT_NOTYPE);
+      Sym->setVisibility(ELF::STV_DEFAULT);
+      Sym->SourceType = isExtSym;
+      return Sym;
+    }
+
+    // getSectionSym - Returns an ELF symbol to represent an ELF section.
+    static ELFSym *getSectionSym() {
+      ELFSym *Sym = new ELFSym();
+      Sym->setBind(ELF::STB_LOCAL);
+      Sym->setType(ELF::STT_SECTION);
+      Sym->setVisibility(ELF::STV_DEFAULT);
+      Sym->SourceType = isOther;
+      return Sym;
+    }
+
+    // getFileSym - Returns an ELF symbol to represent the module identifier.
+    static ELFSym *getFileSym() {
+      ELFSym *Sym = new ELFSym();
+      Sym->setBind(ELF::STB_LOCAL);
+      Sym->setType(ELF::STT_FILE);
+      Sym->setVisibility(ELF::STV_DEFAULT);
+      Sym->SectionIdx = 0xfff1;  // ELFSection::SHN_ABS;
+      Sym->SourceType = isOther;
+      return Sym;
+    }
+
+    // getUndefGV - Returns a STT_NOTYPE symbol
+    static ELFSym *getUndefGV(const GlobalValue *GV, unsigned Bind) {
+      ELFSym *Sym = new ELFSym();
+      Sym->Source.GV = GV;
+      Sym->setBind(Bind);
+      Sym->setType(ELF::STT_NOTYPE);
+      Sym->setVisibility(ELF::STV_DEFAULT);
+      Sym->SectionIdx = 0;  //ELFSection::SHN_UNDEF;
+      Sym->SourceType = isGV;
+      return Sym;
+    }
+
+    // ELF specific fields
+    unsigned NameIdx;         // Index in .strtab of name, once emitted.
+    uint64_t Value;
+    unsigned Size;
+    uint8_t Info;
+    uint8_t Other;
+    unsigned short SectionIdx;
+
+    // Symbol index into the Symbol table
+    unsigned SymTabIdx;
+
+    ELFSym() : SourceType(isOther), NameIdx(0), Value(0),
+               Size(0), Info(0), Other(ELF::STV_DEFAULT), SectionIdx(0),
+               SymTabIdx(0) {}
+
+    unsigned getBind() const { return (Info >> 4) & 0xf; }
+    unsigned getType() const { return Info & 0xf; }
+    bool isLocalBind() const { return getBind() == ELF::STB_LOCAL; }
+    bool isFileType() const { return getType() == ELF::STT_FILE; }
+
+    void setBind(unsigned X) {
+      assert(X == (X & 0xF) && "Bind value out of range!");
+      Info = (Info & 0x0F) | (X << 4);
+    }
+
+    void setType(unsigned X) {
+      assert(X == (X & 0xF) && "Type value out of range!");
+      Info = (Info & 0xF0) | X;
+    }
+
+    void setVisibility(unsigned V) {
+      assert(V == (V & 0x3) && "Visibility value out of range!");
+      Other = V;
+    }
+  };
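+
+  // Worked example of the Info packing above, using values fixed by the
+  // ELF specification (STB_GLOBAL == 1, STT_FUNC == 2, STV_DEFAULT == 0):
+  //
+  //   ELFSym *S = ELFSym::getGV(GV, ELF::STB_GLOBAL, ELF::STT_FUNC,
+  //                             ELF::STV_DEFAULT);
+  //
+  // leaves S->Info == (1 << 4) | 2 == 0x12, so S->getBind() == 1 and
+  // S->getType() == 2, matching the st_info byte layout of an ELF symbol
+  // table entry.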
+
+  /// ELFSection - This struct contains information about each section that is
+  /// emitted to the file.  This is eventually turned into the section header
+  /// table at the end of the file.
+  class ELFSection : public BinaryObject {
+    public:
+    // ELF specific fields
+    unsigned NameIdx;   // sh_name - .shstrtab idx of name, once emitted.
+    unsigned Type;      // sh_type - Section contents & semantics 
+    unsigned Flags;     // sh_flags - Section flags.
+    uint64_t Addr;      // sh_addr - The mem addr this section is in.
+    unsigned Offset;    // sh_offset - Offset from the file start
+    unsigned Size;      // sh_size - The section size.
+    unsigned Link;      // sh_link - Section header table index link.
+    unsigned Info;      // sh_info - Auxiliary information.
+    unsigned Align;     // sh_addralign - Alignment of section.
+    unsigned EntSize;   // sh_entsize - Size of each entry in the section.
+
+    /// SectionIdx - The number of the section in the Section Table.
+    unsigned short SectionIdx;
+
+    /// Sym - The symbol to represent this section if it has one.
+    ELFSym *Sym;
+
+    /// getSymbolTableIndex - Returns the symbol table index of the symbol
+    /// representing this section.
+    unsigned getSymbolTableIndex() const {
+      assert(Sym && "section not present in the symbol table");
+      return Sym->SymTabIdx;
+    }
+
+    ELFSection(const std::string &name, bool isLittleEndian, bool is64Bit)
+      : BinaryObject(name, isLittleEndian, is64Bit), Type(0), Flags(0), Addr(0),
+        Offset(0), Size(0), Link(0), Info(0), Align(0), EntSize(0), Sym(0) {}
+  };
+
+  /// ELFRelocation - This class contains all the information necessary
+  /// to generate any 32-bit or 64-bit ELF relocation entry.
+  class ELFRelocation {
+    uint64_t r_offset;    // offset in the section of the object this applies to
+    uint32_t r_symidx;    // symbol table index of the symbol to use
+    uint32_t r_type;      // machine specific relocation type
+    int64_t  r_add;       // explicit relocation addend
+    bool     r_rela;      // if true then the addend is part of the entry
+                          // otherwise the addend is at the location specified
+                          // by r_offset
+  public:
+    uint64_t getInfo(bool is64Bit) const {
+      if (is64Bit)
+        return ((uint64_t)r_symidx << 32) + ((uint64_t)r_type & 0xFFFFFFFFL);
+      else
+        return (r_symidx << 8)  + (r_type & 0xFFL);
+    }
+
+    uint64_t getOffset() const { return r_offset; }
+    int64_t getAddend() const { return r_add; }
+
+    ELFRelocation(uint64_t off, uint32_t sym, uint32_t type,
+                  bool rela = true, int64_t addend = 0) :
+      r_offset(off), r_symidx(sym), r_type(type),
+      r_add(addend), r_rela(rela) {}
+  };
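+
+  // Worked example of the r_info packing above: for r_symidx == 5 and
+  // r_type == 2,
+  //
+  //   ELFRelocation R(/*off=*/0x10, /*sym=*/5, /*type=*/2);
+  //
+  // R.getInfo(false) == (5 << 8) + 2 == 0x502 (the ELF32 r_info word),
+  // while R.getInfo(true) == (5ULL << 32) + 2 == 0x500000002 (ELF64).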
+
+} // end namespace llvm
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/InlineSpiller.cpp b/src/LLVM/lib/CodeGen/InlineSpiller.cpp
new file mode 100644
index 0000000..63a89aa
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/InlineSpiller.cpp
@@ -0,0 +1,468 @@
+//===-------- InlineSpiller.cpp - Insert spills and restores inline -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The inline spiller modifies the machine function directly instead of
+// inserting spills and restores in VirtRegMap.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "spiller"
+#include "Spiller.h"
+#include "SplitKit.h"
+#include "VirtRegMap.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class InlineSpiller : public Spiller {
+  MachineFunctionPass &pass_;
+  MachineFunction &mf_;
+  LiveIntervals &lis_;
+  MachineLoopInfo &loops_;
+  VirtRegMap &vrm_;
+  MachineFrameInfo &mfi_;
+  MachineRegisterInfo &mri_;
+  const TargetInstrInfo &tii_;
+  const TargetRegisterInfo &tri_;
+  const BitVector reserved_;
+
+  SplitAnalysis splitAnalysis_;
+
+  // Variables that are valid during spill(), but used by multiple methods.
+  LiveInterval *li_;
+  std::vector<LiveInterval*> *newIntervals_;
+  const TargetRegisterClass *rc_;
+  int stackSlot_;
+  const SmallVectorImpl<LiveInterval*> *spillIs_;
+
+  // Values of the current interval that can potentially remat.
+  SmallPtrSet<VNInfo*, 8> reMattable_;
+
+  // Values in reMattable_ that failed to remat at some point.
+  SmallPtrSet<VNInfo*, 8> usedValues_;
+
+  ~InlineSpiller() {}
+
+public:
+  InlineSpiller(MachineFunctionPass &pass,
+                MachineFunction &mf,
+                VirtRegMap &vrm)
+    : pass_(pass),
+      mf_(mf),
+      lis_(pass.getAnalysis<LiveIntervals>()),
+      loops_(pass.getAnalysis<MachineLoopInfo>()),
+      vrm_(vrm),
+      mfi_(*mf.getFrameInfo()),
+      mri_(mf.getRegInfo()),
+      tii_(*mf.getTarget().getInstrInfo()),
+      tri_(*mf.getTarget().getRegisterInfo()),
+      reserved_(tri_.getReservedRegs(mf_)),
+      splitAnalysis_(mf, lis_, loops_) {}
+
+  void spill(LiveInterval *li,
+             std::vector<LiveInterval*> &newIntervals,
+             SmallVectorImpl<LiveInterval*> &spillIs,
+             SlotIndex *earliestIndex);
+
+private:
+  bool split();
+
+  bool allUsesAvailableAt(const MachineInstr *OrigMI, SlotIndex OrigIdx,
+                          SlotIndex UseIdx);
+  bool reMaterializeFor(MachineBasicBlock::iterator MI);
+  void reMaterializeAll();
+
+  bool coalesceStackAccess(MachineInstr *MI);
+  bool foldMemoryOperand(MachineBasicBlock::iterator MI,
+                         const SmallVectorImpl<unsigned> &Ops);
+  void insertReload(LiveInterval &NewLI, MachineBasicBlock::iterator MI);
+  void insertSpill(LiveInterval &NewLI, MachineBasicBlock::iterator MI);
+};
+}
+
+namespace llvm {
+Spiller *createInlineSpiller(MachineFunctionPass &pass,
+                             MachineFunction &mf,
+                             VirtRegMap &vrm) {
+  return new InlineSpiller(pass, mf, vrm);
+}
+}
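+
+// A minimal usage sketch (the pass, machine function and VirtRegMap are
+// whatever the register allocator already has in scope; the names here
+// are illustrative):
+//
+//   Spiller *S = createInlineSpiller(Pass, MF, VRM);
+//   std::vector<LiveInterval*> NewIntervals;
+//   S->spill(&LI, NewIntervals, SpillIs, 0);
+//
+// NewIntervals receives the short, unspillable live ranges created around
+// each remaining use of the spilled register.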
+
+/// split - try splitting the current interval into pieces that may allocate
+/// separately. Return true if successful.
+bool InlineSpiller::split() {
+  // FIXME: Add intra-MBB splitting.
+  if (lis_.intervalIsInOneMBB(*li_))
+    return false;
+
+  splitAnalysis_.analyze(li_);
+
+  if (const MachineLoop *loop = splitAnalysis_.getBestSplitLoop()) {
+    // We can split, but li_ may be left intact with fewer uses.
+    if (SplitEditor(splitAnalysis_, lis_, vrm_, *newIntervals_)
+          .splitAroundLoop(loop))
+      return true;
+  }
+  return false;
+}
+
+/// allUsesAvailableAt - Return true if all registers used by OrigMI at
+/// OrigIdx are also available with the same value at UseIdx.
+bool InlineSpiller::allUsesAvailableAt(const MachineInstr *OrigMI,
+                                       SlotIndex OrigIdx,
+                                       SlotIndex UseIdx) {
+  OrigIdx = OrigIdx.getUseIndex();
+  UseIdx = UseIdx.getUseIndex();
+  for (unsigned i = 0, e = OrigMI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = OrigMI->getOperand(i);
+    if (!MO.isReg() || !MO.getReg() || MO.getReg() == li_->reg)
+      continue;
+    // Reserved registers are OK.
+    if (MO.isUndef() || !lis_.hasInterval(MO.getReg()))
+      continue;
+    // We don't want to move any defs.
+    if (MO.isDef())
+      return false;
+    // We cannot depend on virtual registers in spillIs_. They will be spilled.
+    for (unsigned si = 0, se = spillIs_->size(); si != se; ++si)
+      if ((*spillIs_)[si]->reg == MO.getReg())
+        return false;
+
+    LiveInterval &LI = lis_.getInterval(MO.getReg());
+    const VNInfo *OVNI = LI.getVNInfoAt(OrigIdx);
+    if (!OVNI)
+      continue;
+    if (OVNI != LI.getVNInfoAt(UseIdx))
+      return false;
+  }
+  return true;
+}
+
+/// reMaterializeFor - Attempt to rematerialize li_->reg before MI instead of
+/// reloading it.
+bool InlineSpiller::reMaterializeFor(MachineBasicBlock::iterator MI) {
+  SlotIndex UseIdx = lis_.getInstructionIndex(MI).getUseIndex();
+  VNInfo *OrigVNI = li_->getVNInfoAt(UseIdx);
+  if (!OrigVNI) {
+    DEBUG(dbgs() << "\tadding <undef> flags: ");
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.isUse() && MO.getReg() == li_->reg)
+        MO.setIsUndef();
+    }
+    DEBUG(dbgs() << UseIdx << '\t' << *MI);
+    return true;
+  }
+  if (!reMattable_.count(OrigVNI)) {
+    DEBUG(dbgs() << "\tusing non-remat valno " << OrigVNI->id << ": "
+                 << UseIdx << '\t' << *MI);
+    return false;
+  }
+  MachineInstr *OrigMI = lis_.getInstructionFromIndex(OrigVNI->def);
+  if (!allUsesAvailableAt(OrigMI, OrigVNI->def, UseIdx)) {
+    usedValues_.insert(OrigVNI);
+    DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << *MI);
+    return false;
+  }
+
+  // If the instruction also writes li_->reg, it had better not require the same
+  // register for uses and defs.
+  bool Reads, Writes;
+  SmallVector<unsigned, 8> Ops;
+  tie(Reads, Writes) = MI->readsWritesVirtualRegister(li_->reg, &Ops);
+  if (Writes) {
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(Ops[i]);
+      if (MO.isUse() ? MI->isRegTiedToDefOperand(Ops[i]) : MO.getSubReg()) {
+        usedValues_.insert(OrigVNI);
+        DEBUG(dbgs() << "\tcannot remat tied reg: " << UseIdx << '\t' << *MI);
+        return false;
+      }
+    }
+  }
+
+  // Allocate a new register for the remat.
+  unsigned NewVReg = mri_.createVirtualRegister(rc_);
+  vrm_.grow();
+  LiveInterval &NewLI = lis_.getOrCreateInterval(NewVReg);
+  NewLI.markNotSpillable();
+  newIntervals_->push_back(&NewLI);
+
+  // Finally we can rematerialize OrigMI before MI.
+  MachineBasicBlock &MBB = *MI->getParent();
+  tii_.reMaterialize(MBB, MI, NewLI.reg, 0, OrigMI, tri_);
+  MachineBasicBlock::iterator RematMI = MI;
+  SlotIndex DefIdx = lis_.InsertMachineInstrInMaps(--RematMI).getDefIndex();
+  DEBUG(dbgs() << "\tremat:  " << DefIdx << '\t' << *RematMI);
+
+  // Replace operands
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(Ops[i]);
+    if (MO.isReg() && MO.isUse() && MO.getReg() == li_->reg) {
+      MO.setReg(NewVReg);
+      MO.setIsKill();
+    }
+  }
+  DEBUG(dbgs() << "\t        " << UseIdx << '\t' << *MI);
+
+  VNInfo *DefVNI = NewLI.getNextValue(DefIdx, 0, true,
+                                       lis_.getVNInfoAllocator());
+  NewLI.addRange(LiveRange(DefIdx, UseIdx.getDefIndex(), DefVNI));
+  DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
+  return true;
+}
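+
+// Illustrative case (the opcode is x86-flavored, for exposition only):
+// if the value was defined by something like
+//
+//   %vreg1 = MOV32ri 42
+//
+// then instead of reloading %vreg1 from its stack slot before a use, the
+// move is re-emitted into a fresh vreg immediately before that use, and a
+// short live range covering just the remat-to-use span is recorded.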
+
+/// reMaterializeAll - Try to rematerialize as many uses of li_ as possible,
+/// and trim the live ranges after.
+void InlineSpiller::reMaterializeAll() {
+  // Do a quick scan of the interval values to find if any are remattable.
+  reMattable_.clear();
+  usedValues_.clear();
+  for (LiveInterval::const_vni_iterator I = li_->vni_begin(),
+       E = li_->vni_end(); I != E; ++I) {
+    VNInfo *VNI = *I;
+    if (VNI->isUnused() || !VNI->isDefAccurate())
+      continue;
+    MachineInstr *DefMI = lis_.getInstructionFromIndex(VNI->def);
+    if (!DefMI || !tii_.isTriviallyReMaterializable(DefMI))
+      continue;
+    reMattable_.insert(VNI);
+  }
+
+  // Often, no defs are remattable.
+  if (reMattable_.empty())
+    return;
+
+  // Try to remat before all uses of li_->reg.
+  bool anyRemat = false;
+  for (MachineRegisterInfo::use_nodbg_iterator
+       RI = mri_.use_nodbg_begin(li_->reg);
+       MachineInstr *MI = RI.skipInstruction();)
+     anyRemat |= reMaterializeFor(MI);
+
+  if (!anyRemat)
+    return;
+
+  // Remove any values that were completely rematted.
+  bool anyRemoved = false;
+  for (SmallPtrSet<VNInfo*, 8>::iterator I = reMattable_.begin(),
+       E = reMattable_.end(); I != E; ++I) {
+    VNInfo *VNI = *I;
+    if (VNI->hasPHIKill() || usedValues_.count(VNI))
+      continue;
+    MachineInstr *DefMI = lis_.getInstructionFromIndex(VNI->def);
+    DEBUG(dbgs() << "\tremoving dead def: " << VNI->def << '\t' << *DefMI);
+    lis_.RemoveMachineInstrFromMaps(DefMI);
+    vrm_.RemoveMachineInstrFromMaps(DefMI);
+    DefMI->eraseFromParent();
+    VNI->setIsDefAccurate(false);
+    anyRemoved = true;
+  }
+
+  if (!anyRemoved)
+    return;
+
+  // Removing values may cause debug uses where li_ is not live.
+  for (MachineRegisterInfo::use_iterator RI = mri_.use_begin(li_->reg);
+       MachineInstr *MI = RI.skipInstruction();) {
+    if (!MI->isDebugValue())
+      continue;
+    // Try to preserve the debug value if li_ is live immediately after it.
+    MachineBasicBlock::iterator NextMI = MI;
+    ++NextMI;
+    if (NextMI != MI->getParent()->end() && !lis_.isNotInMIMap(NextMI)) {
+      VNInfo *VNI = li_->getVNInfoAt(lis_.getInstructionIndex(NextMI));
+      if (VNI && (VNI->hasPHIKill() || usedValues_.count(VNI)))
+        continue;
+    }
+    DEBUG(dbgs() << "Removing debug info due to remat:" << "\t" << *MI);
+    MI->eraseFromParent();
+  }
+}
+
+/// If MI is a load or store of stackSlot_, it can be removed.
+bool InlineSpiller::coalesceStackAccess(MachineInstr *MI) {
+  int FI = 0;
+  unsigned reg;
+  if (!(reg = tii_.isLoadFromStackSlot(MI, FI)) &&
+      !(reg = tii_.isStoreToStackSlot(MI, FI)))
+    return false;
+
+  // We have a stack access. Is it the right register and slot?
+  if (reg != li_->reg || FI != stackSlot_)
+    return false;
+
+  DEBUG(dbgs() << "Coalescing stack access: " << *MI);
+  lis_.RemoveMachineInstrFromMaps(MI);
+  MI->eraseFromParent();
+  return true;
+}
+
+/// foldMemoryOperand - Try folding stack slot references in Ops into MI.
+/// Return true on success, and MI will be erased.
+bool InlineSpiller::foldMemoryOperand(MachineBasicBlock::iterator MI,
+                                      const SmallVectorImpl<unsigned> &Ops) {
+  // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
+  // operands.
+  SmallVector<unsigned, 8> FoldOps;
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    unsigned Idx = Ops[i];
+    MachineOperand &MO = MI->getOperand(Idx);
+    if (MO.isImplicit())
+      continue;
+    // FIXME: Teach targets to deal with subregs.
+    if (MO.getSubReg())
+      return false;
+    // Tied use operands should not be passed to foldMemoryOperand.
+    if (!MI->isRegTiedToDefOperand(Idx))
+      FoldOps.push_back(Idx);
+  }
+
+  MachineInstr *FoldMI = tii_.foldMemoryOperand(MI, FoldOps, stackSlot_);
+  if (!FoldMI)
+    return false;
+  lis_.ReplaceMachineInstrInMaps(MI, FoldMI);
+  vrm_.addSpillSlotUse(stackSlot_, FoldMI);
+  MI->eraseFromParent();
+  DEBUG(dbgs() << "\tfolded: " << *FoldMI);
+  return true;
+}
+
+/// insertReload - Insert a reload of NewLI.reg before MI.
+void InlineSpiller::insertReload(LiveInterval &NewLI,
+                                 MachineBasicBlock::iterator MI) {
+  MachineBasicBlock &MBB = *MI->getParent();
+  SlotIndex Idx = lis_.getInstructionIndex(MI).getDefIndex();
+  tii_.loadRegFromStackSlot(MBB, MI, NewLI.reg, stackSlot_, rc_, &tri_);
+  --MI; // Point to load instruction.
+  SlotIndex LoadIdx = lis_.InsertMachineInstrInMaps(MI).getDefIndex();
+  vrm_.addSpillSlotUse(stackSlot_, MI);
+  DEBUG(dbgs() << "\treload:  " << LoadIdx << '\t' << *MI);
+  VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, 0, true,
+                                       lis_.getVNInfoAllocator());
+  NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI));
+}
+
+/// insertSpill - Insert a spill of NewLI.reg after MI.
+void InlineSpiller::insertSpill(LiveInterval &NewLI,
+                                MachineBasicBlock::iterator MI) {
+  MachineBasicBlock &MBB = *MI->getParent();
+  SlotIndex Idx = lis_.getInstructionIndex(MI).getDefIndex();
+  tii_.storeRegToStackSlot(MBB, ++MI, NewLI.reg, true, stackSlot_, rc_, &tri_);
+  --MI; // Point to store instruction.
+  SlotIndex StoreIdx = lis_.InsertMachineInstrInMaps(MI).getDefIndex();
+  vrm_.addSpillSlotUse(stackSlot_, MI);
+  DEBUG(dbgs() << "\tspilled: " << StoreIdx << '\t' << *MI);
+  VNInfo *StoreVNI = NewLI.getNextValue(Idx, 0, true,
+                                        lis_.getVNInfoAllocator());
+  NewLI.addRange(LiveRange(Idx, StoreIdx, StoreVNI));
+}
+
+void InlineSpiller::spill(LiveInterval *li,
+                          std::vector<LiveInterval*> &newIntervals,
+                          SmallVectorImpl<LiveInterval*> &spillIs,
+                          SlotIndex *earliestIndex) {
+  DEBUG(dbgs() << "Inline spilling " << *li << "\n");
+  assert(li->isSpillable() && "Attempting to spill already spilled value.");
+  assert(!li->isStackSlot() && "Trying to spill a stack slot.");
+
+  li_ = li;
+  newIntervals_ = &newIntervals;
+  rc_ = mri_.getRegClass(li->reg);
+  spillIs_ = &spillIs;
+
+  if (split())
+    return;
+
+  reMaterializeAll();
+
+  // Remat may handle everything.
+  if (li_->empty())
+    return;
+
+  stackSlot_ = vrm_.getStackSlot(li->reg);
+  if (stackSlot_ == VirtRegMap::NO_STACK_SLOT)
+    stackSlot_ = vrm_.assignVirt2StackSlot(li->reg);
+
+  // Iterate over instructions using register.
+  for (MachineRegisterInfo::reg_iterator RI = mri_.reg_begin(li->reg);
+       MachineInstr *MI = RI.skipInstruction();) {
+
+    // Debug values are not allowed to affect codegen.
+    if (MI->isDebugValue()) {
+      // Modify DBG_VALUE now that the value is in a spill slot.
+      uint64_t Offset = MI->getOperand(1).getImm();
+      const MDNode *MDPtr = MI->getOperand(2).getMetadata();
+      DebugLoc DL = MI->getDebugLoc();
+      if (MachineInstr *NewDV = tii_.emitFrameIndexDebugValue(mf_, stackSlot_,
+                                                           Offset, MDPtr, DL)) {
+        DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI);
+        MachineBasicBlock *MBB = MI->getParent();
+        MBB->insert(MBB->erase(MI), NewDV);
+      } else {
+        DEBUG(dbgs() << "Removing debug info due to spill:" << "\t" << *MI);
+        MI->eraseFromParent();
+      }
+      continue;
+    }
+
+    // Stack slot accesses may coalesce away.
+    if (coalesceStackAccess(MI))
+      continue;
+
+    // Analyze instruction.
+    bool Reads, Writes;
+    SmallVector<unsigned, 8> Ops;
+    tie(Reads, Writes) = MI->readsWritesVirtualRegister(li->reg, &Ops);
+
+    // Attempt to fold memory ops.
+    if (foldMemoryOperand(MI, Ops))
+      continue;
+
+    // Allocate interval around instruction.
+    // FIXME: Infer regclass from instruction alone.
+    unsigned NewVReg = mri_.createVirtualRegister(rc_);
+    vrm_.grow();
+    LiveInterval &NewLI = lis_.getOrCreateInterval(NewVReg);
+    NewLI.markNotSpillable();
+
+    if (Reads)
+      insertReload(NewLI, MI);
+
+    // Rewrite instruction operands.
+    bool hasLiveDef = false;
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(Ops[i]);
+      MO.setReg(NewVReg);
+      if (MO.isUse()) {
+        if (!MI->isRegTiedToDefOperand(Ops[i]))
+          MO.setIsKill();
+      } else {
+        if (!MO.isDead())
+          hasLiveDef = true;
+      }
+    }
+
+    // FIXME: Use a second vreg if instruction has no tied ops.
+    if (Writes && hasLiveDef)
+      insertSpill(NewLI, MI);
+
+    DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
+    newIntervals.push_back(&NewLI);
+  }
+}
diff --git a/src/LLVM/lib/CodeGen/LLVMTargetMachine.cpp b/src/LLVM/lib/CodeGen/LLVMTargetMachine.cpp
new file mode 100644
index 0000000..2f9ed2a
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/LLVMTargetMachine.cpp
@@ -0,0 +1,261 @@
+//===-- LLVMTargetMachine.cpp - Implement the LLVMTargetMachine class -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVMTargetMachine class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/PassManager.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+static cl::opt<bool> DisablePostRA("disable-post-ra", cl::Hidden,
+    cl::desc("Disable Post Regalloc"));
+static cl::opt<bool> DisableBranchFold("disable-branch-fold", cl::Hidden,
+    cl::desc("Disable branch folding"));
+static cl::opt<bool> DisableTailDuplicate("disable-tail-duplicate", cl::Hidden,
+    cl::desc("Disable tail duplication"));
+static cl::opt<bool> DisableEarlyTailDup("disable-early-taildup", cl::Hidden,
+    cl::desc("Disable pre-register allocation tail duplication"));
+static cl::opt<bool> DisableCodePlace("disable-code-place", cl::Hidden,
+    cl::desc("Disable code placement"));
+static cl::opt<bool> DisableSSC("disable-ssc", cl::Hidden,
+    cl::desc("Disable Stack Slot Coloring"));
+static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden,
+    cl::desc("Disable Machine LICM"));
+static cl::opt<bool> DisablePostRAMachineLICM("disable-postra-machine-licm",
+    cl::Hidden,
+    cl::desc("Disable post-regalloc Machine LICM"));
+static cl::opt<bool> DisableMachineSink("disable-machine-sink", cl::Hidden,
+    cl::desc("Disable Machine Sinking"));
+static cl::opt<bool> DisableLSR("disable-lsr", cl::Hidden,
+    cl::desc("Disable Loop Strength Reduction Pass"));
+static cl::opt<bool> DisableCGP("disable-cgp", cl::Hidden,
+    cl::desc("Disable Codegen Prepare"));
+static cl::opt<bool> ShowMCEncoding("show-mc-encoding", cl::Hidden,
+    cl::desc("Show encoding in .s output"));
+static cl::opt<bool> ShowMCInst("show-mc-inst", cl::Hidden,
+    cl::desc("Show instruction structure in .s output"));
+static cl::opt<bool> EnableMCLogging("enable-mc-api-logging", cl::Hidden,
+    cl::desc("Enable MC API logging"));
+
+static cl::opt<cl::boolOrDefault>
+AsmVerbose("asm-verbose", cl::desc("Add comments to directives."),
+           cl::init(cl::BOU_UNSET));
+
+static bool getVerboseAsm() {
+  switch (AsmVerbose) {
+  default:
+  case cl::BOU_UNSET: return TargetMachine::getAsmVerbosityDefault();
+  case cl::BOU_TRUE:  return true;
+  case cl::BOU_FALSE: return false;
+  }      
+}
+
+// Enable or disable an experimental optimization to split GEPs
+// and run a special GVN pass which does not examine loads, in
+// an effort to factor out redundancy implicit in complex GEPs.
+static cl::opt<bool> EnableSplitGEPGVN("split-gep-gvn", cl::Hidden,
+    cl::desc("Split GEPs and run no-load GVN"));
+
+LLVMTargetMachine::LLVMTargetMachine(const Target &T,
+                                     const std::string &Triple)
+  : TargetMachine(T), TargetTriple(Triple) {
+  AsmInfo = T.createAsmInfo(TargetTriple);
+}
+
+// Set the default code model for the JIT for a generic target.
+// FIXME: Is small right here? or .is64Bit() ? Large : Small?
+void LLVMTargetMachine::setCodeModelForJIT() {
+  setCodeModel(CodeModel::Small);
+}
+
+// Set the default code model for static compilation for a generic target.
+void LLVMTargetMachine::setCodeModelForStatic() {
+  setCodeModel(CodeModel::Small);
+}
+
+/// addPassesToEmitMachineCode - Add passes to the specified pass manager to
+/// get machine code emitted.  This uses a JITCodeEmitter object to handle
+/// actually outputting the machine code and resolving things like the address
+/// of functions.  This method returns true if machine code emission is
+/// not supported.
+///
+bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM,
+                                                   JITCodeEmitter &JCE,
+                                                   CodeGenOpt::Level OptLevel) {
+  // Make sure the code model is set.
+  setCodeModelForJIT();
+  
+  // Add common CodeGen passes.
+  MCContext *Ctx = 0;
+  if (addCommonCodeGenPasses(PM, OptLevel, Ctx))
+    return true;
+
+  addCodeEmitter(PM, OptLevel, JCE);
+
+  return false; // success!
+}
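+
+// A minimal client sketch (the target machine, JITCodeEmitter and module
+// are assumed to exist; names are illustrative):
+//
+//   PassManager PM;
+//   if (TM->addPassesToEmitMachineCode(PM, JCE, CodeGenOpt::Default))
+//     report_fatal_error("machine code emission not supported");
+//   PM.run(*M);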
+
+/// addPassesToEmitMC - Add passes to the specified pass manager to get
+/// machine code emitted with the MCJIT. This method returns true if machine
+/// code is not supported. It fills the MCContext Ctx pointer which can be
+/// used to build custom MCStreamer.
+///
+bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM,
+                                          MCContext *&Ctx,
+                                          CodeGenOpt::Level OptLevel) {
+  // Add common CodeGen passes.
+  if (addCommonCodeGenPasses(PM, OptLevel, Ctx))
+    return true;
+  // Make sure the code model is set.
+  setCodeModelForJIT();
+
+  return false; // success!
+}
+
+/// addCommonCodeGenPasses - Add standard LLVM codegen passes used for both
+/// emitting to assembly files or machine code output.
+///
+bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
+                                               CodeGenOpt::Level OptLevel,
+                                               MCContext *&OutContext) {
+  // Standard LLVM-Level Passes.
+
+  // Optionally, run split-GEPs and no-load GVN.
+  if (EnableSplitGEPGVN) {
+    PM.add(createGEPSplitterPass());
+    PM.add(createGVNPass(/*NoLoads=*/true));
+  }
+
+  // Run loop strength reduction before anything else.
+  if (OptLevel != CodeGenOpt::None && !DisableLSR) {
+    PM.add(createLoopStrengthReducePass(getTargetLowering()));
+  }
+
+  // Make sure that no unreachable blocks are instruction selected.
+  PM.add(createUnreachableBlockEliminationPass());
+
+  if (OptLevel != CodeGenOpt::None && !DisableCGP)
+    PM.add(createCodeGenPreparePass(getTargetLowering()));
+
+  PM.add(createStackProtectorPass(getTargetLowering()));
+
+  addPreISel(PM, OptLevel);
+
+  // Standard Lower-Level Passes.
+  
+  // Install a MachineModuleInfo class, which is an immutable pass that holds
+  // all the per-module stuff we're generating, including MCContext.
+  MachineModuleInfo *MMI = new MachineModuleInfo(*getMCAsmInfo());
+  PM.add(MMI);
+  OutContext = &MMI->getContext(); // Return the MCContext specifically by-ref.
+  
+
+  // Set up a MachineFunction for the rest of CodeGen to work on.
+  PM.add(new MachineFunctionAnalysis(*this, OptLevel));
+
+  // Ask the target for an isel.
+  if (addInstSelector(PM, OptLevel))
+    return true;
+
+  // Optimize PHIs before DCE: removing dead PHI cycles may make more
+  // instructions dead.
+  if (OptLevel != CodeGenOpt::None)
+    PM.add(createOptimizePHIsPass());
+
+  if (OptLevel != CodeGenOpt::None) {
+    // With optimization, dead code should already be eliminated. However
+    // there is one known exception: lowered code for arguments that are only
+    // used by tail calls, where the tail calls reuse the incoming stack
+    // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
+    PM.add(createDeadMachineInstructionElimPass());
+
+    PM.add(createPeepholeOptimizerPass());
+    if (!DisableMachineLICM)
+      PM.add(createMachineLICMPass());
+    PM.add(createMachineCSEPass());
+    if (!DisableMachineSink)
+      PM.add(createMachineSinkingPass());
+  }
+
+  // Pre-ra tail duplication.
+  if (OptLevel != CodeGenOpt::None && !DisableEarlyTailDup) {
+    PM.add(createTailDuplicatePass(true));
+  }
+
+  // Run pre-ra passes.
+  addPreRegAlloc(PM, OptLevel);
+
+  // Perform register allocation.
+  PM.add(createRegisterAllocator(OptLevel));
+
+  // Perform stack slot coloring and post-ra machine LICM.
+  if (OptLevel != CodeGenOpt::None) {
+    // FIXME: Re-enable coloring with registers once it is capable of
+    // adding kill markers.
+    if (!DisableSSC)
+      PM.add(createStackSlotColoringPass(false));
+
+    // Run post-ra machine LICM to hoist reloads / remats.
+    if (!DisablePostRAMachineLICM)
+      PM.add(createMachineLICMPass(false));
+  }
+
+  // Run post-ra passes.
+  addPostRegAlloc(PM, OptLevel);
+
+  PM.add(createLowerSubregsPass());
+
+  // Insert prolog/epilog code.  Eliminate abstract frame index references...
+  PM.add(createPrologEpilogCodeInserter());
+
+  // Run pre-sched2 passes.
+  addPreSched2(PM, OptLevel);
+  
+  // Second pass scheduler.
+  if (OptLevel != CodeGenOpt::None && !DisablePostRA) {
+    PM.add(createPostRAScheduler(OptLevel));
+  }
+
+  // Branch folding must be run after regalloc and prolog/epilog insertion.
+  if (OptLevel != CodeGenOpt::None && !DisableBranchFold) {
+    PM.add(createBranchFoldingPass(getEnableTailMergeDefault()));
+  }
+
+  // Tail duplication.
+  if (OptLevel != CodeGenOpt::None && !DisableTailDuplicate) {
+    PM.add(createTailDuplicatePass(false));
+  }
+
+  if (OptLevel != CodeGenOpt::None && !DisableCodePlace) {
+    PM.add(createCodePlacementOptPass());
+  }
+
+  addPreEmitPass(PM, OptLevel);
+
+  return false;
+}
diff --git a/src/LLVM/lib/CodeGen/LatencyPriorityQueue.cpp b/src/LLVM/lib/CodeGen/LatencyPriorityQueue.cpp
new file mode 100644
index 0000000..b9527fa
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/LatencyPriorityQueue.cpp
@@ -0,0 +1,138 @@
+//===---- LatencyPriorityQueue.cpp - A latency-oriented priority queue ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LatencyPriorityQueue class, which is a
+// SchedulingPriorityQueue that schedules using latency information to
+// reduce the length of the critical path through the basic block.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "scheduler"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+bool latency_sort::operator()(const SUnit *LHS, const SUnit *RHS) const {
+  // The isScheduleHigh flag allows nodes with wraparound dependencies that
+  // cannot easily be modeled as edges with latencies to be scheduled as
+  // soon as possible in a top-down schedule.
+  if (LHS->isScheduleHigh && !RHS->isScheduleHigh)
+    return false;
+  if (!LHS->isScheduleHigh && RHS->isScheduleHigh)
+    return true;
+
+  unsigned LHSNum = LHS->NodeNum;
+  unsigned RHSNum = RHS->NodeNum;
+
+  // The most important heuristic is scheduling the critical path.
+  unsigned LHSLatency = PQ->getLatency(LHSNum);
+  unsigned RHSLatency = PQ->getLatency(RHSNum);
+  if (LHSLatency < RHSLatency) return true;
+  if (LHSLatency > RHSLatency) return false;
+  
+  // After that, if two nodes have identical latencies, look to see if one will
+  // unblock more other nodes than the other.
+  unsigned LHSBlocked = PQ->getNumSolelyBlockNodes(LHSNum);
+  unsigned RHSBlocked = PQ->getNumSolelyBlockNodes(RHSNum);
+  if (LHSBlocked < RHSBlocked) return true;
+  if (LHSBlocked > RHSBlocked) return false;
+  
+  // Finally, just to provide a stable ordering, use the node number as a
+  // deciding factor.
+  return LHSNum < RHSNum;
+}
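+
+// Example of the resulting order: given two otherwise-equal nodes with
+// latencies 3 and 7, the comparator reports the latency-3 node as "less",
+// so pop() below picks the latency-7 node first -- the node on the longer
+// critical path is scheduled earlier.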
+
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it, otherwise return null.
+SUnit *LatencyPriorityQueue::getSingleUnscheduledPred(SUnit *SU) {
+  SUnit *OnlyAvailablePred = 0;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    SUnit &Pred = *I->getSUnit();
+    if (!Pred.isScheduled) {
+      // We found an available, but not scheduled, predecessor.  If it's the
+      // only one we have found, keep track of it... otherwise give up.
+      if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+        return 0;
+      OnlyAvailablePred = &Pred;
+    }
+  }
+      
+  return OnlyAvailablePred;
+}
+
+void LatencyPriorityQueue::push(SUnit *SU) {
+  // Look at all of the successors of this node.  Count the number of nodes that
+  // this node is the sole unscheduled node for.
+  unsigned NumNodesBlocking = 0;
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (getSingleUnscheduledPred(I->getSUnit()) == SU)
+      ++NumNodesBlocking;
+  }
+  NumNodesSolelyBlocking[SU->NodeNum] = NumNodesBlocking;
+  
+  Queue.push_back(SU);
+}
+
+
+// ScheduledNode - As nodes are scheduled, we look to see if there are any
+// successor nodes that have a single unscheduled predecessor.  If so, that
+// single predecessor has a higher priority, since scheduling it will make
+// the node available.
+void LatencyPriorityQueue::ScheduledNode(SUnit *SU) {
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    AdjustPriorityOfUnscheduledPreds(I->getSUnit());
+  }
+}
+
+/// AdjustPriorityOfUnscheduledPreds - One of the predecessors of SU was just
+/// scheduled.  If SU is not itself available, then there is at least one
+/// predecessor node that has not been scheduled yet.  If SU has exactly ONE
+/// unscheduled predecessor, we want to increase its priority: it getting
+/// scheduled will make this node available, so it is better than some other
+/// node of the same priority that will not make a node available.
+void LatencyPriorityQueue::AdjustPriorityOfUnscheduledPreds(SUnit *SU) {
+  if (SU->isAvailable) return;  // All preds scheduled.
+  
+  SUnit *OnlyAvailablePred = getSingleUnscheduledPred(SU);
+  if (OnlyAvailablePred == 0 || !OnlyAvailablePred->isAvailable) return;
+  
+  // Okay, we found a single predecessor that is available, but not scheduled.
+  // Since it is available, it must be in the priority queue.  First remove it.
+  remove(OnlyAvailablePred);
+
+  // Reinsert the node into the priority queue, which recomputes its
+  // NumNodesSolelyBlocking value.
+  push(OnlyAvailablePred);
+}
+
+SUnit *LatencyPriorityQueue::pop() {
+  if (empty()) return NULL;
+  std::vector<SUnit *>::iterator Best = Queue.begin();
+  for (std::vector<SUnit *>::iterator I = llvm::next(Queue.begin()),
+       E = Queue.end(); I != E; ++I)
+    if (Picker(*Best, *I))
+      Best = I;
+  SUnit *V = *Best;
+  if (Best != prior(Queue.end()))
+    std::swap(*Best, Queue.back());
+  Queue.pop_back();
+  return V;
+}
+
+void LatencyPriorityQueue::remove(SUnit *SU) {
+  assert(!Queue.empty() && "Queue is empty!");
+  std::vector<SUnit *>::iterator I = std::find(Queue.begin(), Queue.end(), SU);
+  if (I != prior(Queue.end()))
+    std::swap(*I, Queue.back());
+  Queue.pop_back();
+}
diff --git a/src/LLVM/lib/CodeGen/LiveInterval.cpp b/src/LLVM/lib/CodeGen/LiveInterval.cpp
new file mode 100644
index 0000000..6820a61
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/LiveInterval.cpp
@@ -0,0 +1,865 @@
+//===-- LiveInterval.cpp - Live Interval Representation -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveRange and LiveInterval classes.  Given some
+// numbering of the machine instructions, an interval [i, j) is said to be a
+// live interval for register v if there is no instruction with number j' > j
+// such that v is live at j' and there is no instruction with number i' < i such
+// that v is live at i'. In this implementation intervals can have holes,
+// i.e. an interval might look like [1,20), [50,65), [1000,1001).  Each
+// individual range is represented as an instance of LiveRange, and the whole
+// interval is represented as an instance of LiveInterval.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+// An example for liveAt():
+//
+// this = [1,4), liveAt(0) will return false. The instruction defining this
+// spans slots [0,3]. The interval belongs to a spilled definition of the
+// variable it represents. This is because slot 1 is used (def slot) and spans
+// up to slot 3 (store slot).
+//
+bool LiveInterval::liveAt(SlotIndex I) const {
+  Ranges::const_iterator r = std::upper_bound(ranges.begin(), ranges.end(), I);
+
+  if (r == ranges.begin())
+    return false;
+
+  --r;
+  return r->contains(I);
+}
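+
+// Because ranges are half-open, for this = [1,4): liveAt(1) and liveAt(3)
+// return true, while liveAt(0) and liveAt(4) return false.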
+
+// liveBeforeAndAt - Check if the interval is live at the index and the index
+// just before it. If index is liveAt, check if it starts a new live range.
+// If it does, then check if the previous live range ends at index-1.
+bool LiveInterval::liveBeforeAndAt(SlotIndex I) const {
+  Ranges::const_iterator r = std::upper_bound(ranges.begin(), ranges.end(), I);
+
+  if (r == ranges.begin())
+    return false;
+
+  --r;
+  if (!r->contains(I))
+    return false;
+  if (I != r->start)
+    return true;
+  // I is the start of a live range. Check if the previous live range ends
+  // at I-1.
+  if (r == ranges.begin())
+    return false;
+  return r->end == I;
+}
+
+/// killedAt - Return true if a live range ends at index. Note that the kill
+/// point is not contained in the half-open live range. It is usually the
+/// getDefIndex() slot following its last use.
+bool LiveInterval::killedAt(SlotIndex I) const {
+  Ranges::const_iterator r = std::lower_bound(ranges.begin(), ranges.end(), I);
+
+  // Now r points to the first interval with start >= I, or ranges.end().
+  if (r == ranges.begin())
+    return false;
+
+  --r;
+  // Now r points to the last interval with end <= I.
+  // r->end is the kill point.
+  return r->end == I;
+}
+
+/// killedInRange - Return true if the interval has kills in [Start,End).
+bool LiveInterval::killedInRange(SlotIndex Start, SlotIndex End) const {
+  Ranges::const_iterator r =
+    std::lower_bound(ranges.begin(), ranges.end(), End);
+
+  // Now r points to the first interval with start >= End, or ranges.end().
+  if (r == ranges.begin())
+    return false;
+
+  --r;
+  // Now r points to the last interval with end <= End.
+  // r->end is the kill point.
+  return r->end >= Start && r->end < End;
+}
+
+// overlaps - Return true if the intersection of the two live intervals is
+// not empty.
+//
+// An example for overlaps():
+//
+// 0: A = ...
+// 4: B = ...
+// 8: C = A + B ;; last use of A
+//
+// The live intervals should look like:
+//
+// A = [3, 11)
+// B = [7, x)
+// C = [11, y)
+//
+// A->overlaps(C) should return false since we want to be able to join
+// A and C.
+//
+bool LiveInterval::overlapsFrom(const LiveInterval& other,
+                                const_iterator StartPos) const {
+  assert(!empty() && "empty interval");
+  const_iterator i = begin();
+  const_iterator ie = end();
+  const_iterator j = StartPos;
+  const_iterator je = other.end();
+
+  assert((StartPos->start <= i->start || StartPos == other.begin()) &&
+         StartPos != other.end() && "Bogus start position hint!");
+
+  if (i->start < j->start) {
+    i = std::upper_bound(i, ie, j->start);
+    if (i != ranges.begin()) --i;
+  } else if (j->start < i->start) {
+    ++StartPos;
+    if (StartPos != other.end() && StartPos->start <= i->start) {
+      assert(StartPos < other.end() && i < end());
+      j = std::upper_bound(j, je, i->start);
+      if (j != other.ranges.begin()) --j;
+    }
+  } else {
+    return true;
+  }
+
+  if (j == je) return false;
+
+  while (i != ie) {
+    if (i->start > j->start) {
+      std::swap(i, j);
+      std::swap(ie, je);
+    }
+
+    if (i->end > j->start)
+      return true;
+    ++i;
+  }
+
+  return false;
+}
+
+/// overlaps - Return true if the live interval overlaps a range specified
+/// by [Start, End).
+bool LiveInterval::overlaps(SlotIndex Start, SlotIndex End) const {
+  assert(Start < End && "Invalid range");
+  const_iterator I = std::lower_bound(begin(), end(), End);
+  return I != begin() && (--I)->end > Start;
+}
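+
+// Example with a single range [3,11): overlaps(10, 12) is true, since the
+// last range ending after 10 is [3,11); overlaps(11, 13) is false, because
+// 11 is the open end of the range and nothing is live there.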
+
+
+/// ValNo is dead, remove it.  If it is the largest value number, just nuke it
+/// (and any other deleted values neighboring it), otherwise mark it as ~1U so
+/// it can be nuked later.
+void LiveInterval::markValNoForDeletion(VNInfo *ValNo) {
+  if (ValNo->id == getNumValNums()-1) {
+    do {
+      valnos.pop_back();
+    } while (!valnos.empty() && valnos.back()->isUnused());
+  } else {
+    ValNo->setIsUnused(true);
+  }
+}
+
+/// RenumberValues - Renumber all values in order of appearance and delete the
+/// remaining unused values.
+void LiveInterval::RenumberValues() {
+  SmallPtrSet<VNInfo*, 8> Seen;
+  valnos.clear();
+  for (const_iterator I = begin(), E = end(); I != E; ++I) {
+    VNInfo *VNI = I->valno;
+    if (!Seen.insert(VNI))
+      continue;
+    assert(!VNI->isUnused() && "Unused valno used by live range");
+    VNI->id = (unsigned)valnos.size();
+    valnos.push_back(VNI);
+  }
+}
+
+/// extendIntervalEndTo - This method is used when we want to extend the range
+/// specified by I to end at the specified endpoint.  To do this, we should
+/// merge and eliminate all ranges that this will overlap with.  The iterator is
+/// not invalidated.
+void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) {
+  assert(I != ranges.end() && "Not a valid interval!");
+  VNInfo *ValNo = I->valno;
+
+  // Search for the first interval that we can't merge with.
+  Ranges::iterator MergeTo = llvm::next(I);
+  for (; MergeTo != ranges.end() && NewEnd >= MergeTo->end; ++MergeTo) {
+    assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+  }
+
+  // If NewEnd was in the middle of an interval, make sure to get its endpoint.
+  I->end = std::max(NewEnd, prior(MergeTo)->end);
+
+  // Erase any dead ranges.
+  ranges.erase(llvm::next(I), MergeTo);
+
+  // If the newly formed range now touches the range after it and if they have
+  // the same value number, merge the two ranges into one range.
+  Ranges::iterator Next = llvm::next(I);
+  if (Next != ranges.end() && Next->start <= I->end && Next->valno == ValNo) {
+    I->end = Next->end;
+    ranges.erase(Next);
+  }
+}
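+
+// Example: with ranges [4,8) and [10,12) carrying the same value number,
+// extendIntervalEndTo(I at [4,8), NewEnd == 11) first grows [4,8) to
+// [4,11), then notices that it now overlaps [10,12) and merges the two
+// into a single [4,12) range.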
+
+
+/// extendIntervalStartTo - This method is used when we want to extend the range
+/// specified by I to start at the specified endpoint.  To do this, we should
+/// merge and eliminate all ranges that this will overlap with.
+LiveInterval::Ranges::iterator
+LiveInterval::extendIntervalStartTo(Ranges::iterator I, SlotIndex NewStart) {
+  assert(I != ranges.end() && "Not a valid interval!");
+  VNInfo *ValNo = I->valno;
+
+  // Search for the first interval that we can't merge with.
+  Ranges::iterator MergeTo = I;
+  do {
+    if (MergeTo == ranges.begin()) {
+      I->start = NewStart;
+      ranges.erase(MergeTo, I);
+      return I;
+    }
+    assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+    --MergeTo;
+  } while (NewStart <= MergeTo->start);
+
+  // If we start in the middle of another interval, just delete a range and
+  // extend that interval.
+  if (MergeTo->end >= NewStart && MergeTo->valno == ValNo) {
+    MergeTo->end = I->end;
+  } else {
+    // Otherwise, extend the interval right after.
+    ++MergeTo;
+    MergeTo->start = NewStart;
+    MergeTo->end = I->end;
+  }
+
+  ranges.erase(llvm::next(MergeTo), llvm::next(I));
+  return MergeTo;
+}
+
+LiveInterval::iterator
+LiveInterval::addRangeFrom(LiveRange LR, iterator From) {
+  SlotIndex Start = LR.start, End = LR.end;
+  iterator it = std::upper_bound(From, ranges.end(), Start);
+
+  // If the inserted interval starts in the middle or right at the end of
+  // another interval, just extend that interval to contain the range of LR.
+  if (it != ranges.begin()) {
+    iterator B = prior(it);
+    if (LR.valno == B->valno) {
+      if (B->start <= Start && B->end >= Start) {
+        extendIntervalEndTo(B, End);
+        return B;
+      }
+    } else {
+      // Check to make sure that we are not overlapping two live ranges with
+      // different valno's.
+      assert(B->end <= Start &&
+             "Cannot overlap two LiveRanges with differing ValID's"
+             " (did you def the same reg twice in a MachineInstr?)");
+    }
+  }
+
+  // Otherwise, if this range ends in the middle of, or right next to, another
+  // interval, merge it into that interval.
+  if (it != ranges.end()) {
+    if (LR.valno == it->valno) {
+      if (it->start <= End) {
+        it = extendIntervalStartTo(it, Start);
+
+        // If LR is a complete superset of an interval, we may need to grow its
+        // endpoint as well.
+        if (End > it->end)
+          extendIntervalEndTo(it, End);
+        return it;
+      }
+    } else {
+      // Check to make sure that we are not overlapping two live ranges with
+      // different valno's.
+      assert(it->start >= End &&
+             "Cannot overlap two LiveRanges with differing ValID's");
+    }
+  }
+
+  // Otherwise, this is just a new range that doesn't interact with anything.
+  // Insert it.
+  return ranges.insert(it, LR);
+}
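+
+// Example: adding [5,10) to an interval holding [1,5) with the same value
+// number hits the first case above -- [1,5) ends exactly where the new
+// range starts, so extendIntervalEndTo() grows it into a single [1,10).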
+
+/// isInOneLiveRange - Return true if the range specified is entirely in 
+/// a single LiveRange of the live interval.
+bool LiveInterval::isInOneLiveRange(SlotIndex Start, SlotIndex End) {
+  Ranges::iterator I = std::upper_bound(ranges.begin(), ranges.end(), Start);
+  if (I == ranges.begin())
+    return false;
+  --I;
+  return I->containsRange(Start, End);
+}
+
+
+/// removeRange - Remove the specified range from this interval.  Note that
+/// the range must be in a single LiveRange in its entirety.
+void LiveInterval::removeRange(SlotIndex Start, SlotIndex End,
+                               bool RemoveDeadValNo) {
+  // Find the LiveRange containing this span.
+  Ranges::iterator I = std::upper_bound(ranges.begin(), ranges.end(), Start);
+  assert(I != ranges.begin() && "Range is not in interval!");
+  --I;
+  assert(I->containsRange(Start, End) && "Range is not entirely in interval!");
+
+  // If the span we are removing is at the start of the LiveRange, adjust it.
+  VNInfo *ValNo = I->valno;
+  if (I->start == Start) {
+    if (I->end == End) {
+      if (RemoveDeadValNo) {
+        // Check if val# is dead.
+        bool isDead = true;
+        for (const_iterator II = begin(), EE = end(); II != EE; ++II)
+          if (II != I && II->valno == ValNo) {
+            isDead = false;
+            break;
+          }
+        if (isDead) {
+          // Now that ValNo is dead, remove it.
+          markValNoForDeletion(ValNo);
+        }
+      }
+
+      ranges.erase(I);  // Removed the whole LiveRange.
+    } else
+      I->start = End;
+    return;
+  }
+
+  // Otherwise if the span we are removing is at the end of the LiveRange,
+  // adjust the other way.
+  if (I->end == End) {
+    I->end = Start;
+    return;
+  }
+
+  // Otherwise, we are splitting the LiveRange into two pieces.
+  SlotIndex OldEnd = I->end;
+  I->end = Start;   // Trim the old interval.
+
+  // Insert the new one.
+  ranges.insert(llvm::next(I), LiveRange(End, OldEnd, ValNo));
+}
+
+/// removeValNo - Remove all the ranges defined by the specified value#.
+/// Also remove the value# from the value# list.
+void LiveInterval::removeValNo(VNInfo *ValNo) {
+  if (empty()) return;
+  Ranges::iterator I = ranges.end();
+  Ranges::iterator E = ranges.begin();
+  do {
+    --I;
+    if (I->valno == ValNo)
+      ranges.erase(I);
+  } while (I != E);
+  // Now that ValNo is dead, remove it.
+  markValNoForDeletion(ValNo);
+}
+
+/// FindLiveRangeContaining - Return an iterator to the live range that
+/// contains the specified index, or end() if there is none.
+LiveInterval::const_iterator 
+LiveInterval::FindLiveRangeContaining(SlotIndex Idx) const {
+  const_iterator It = std::upper_bound(begin(), end(), Idx);
+  if (It != ranges.begin()) {
+    --It;
+    if (It->contains(Idx))
+      return It;
+  }
+
+  return end();
+}
+
+LiveInterval::iterator 
+LiveInterval::FindLiveRangeContaining(SlotIndex Idx) {
+  iterator It = std::upper_bound(begin(), end(), Idx);
+  if (It != begin()) {
+    --It;
+    if (It->contains(Idx))
+      return It;
+  }
+  
+  return end();
+}
+
+/// findDefinedVNInfoForRegInt - Find the VNInfo defined by the specified
+/// index (register interval).
+VNInfo *LiveInterval::findDefinedVNInfoForRegInt(SlotIndex Idx) const {
+  for (LiveInterval::const_vni_iterator i = vni_begin(), e = vni_end();
+       i != e; ++i) {
+    if ((*i)->def == Idx)
+      return *i;
+  }
+
+  return 0;
+}
+
+/// findDefinedVNInfoForStackInt - Find the VNInfo defined by the specified
+/// register (stack interval).
+VNInfo *LiveInterval::findDefinedVNInfoForStackInt(unsigned reg) const {
+  for (LiveInterval::const_vni_iterator i = vni_begin(), e = vni_end();
+       i != e; ++i) {
+    if ((*i)->getReg() == reg)
+      return *i;
+  }
+  return 0;
+}
+
+/// join - Join two live intervals (this, and other) together.  This applies
+/// mappings to the value numbers in the LHS/RHS intervals as specified.  If
+/// the intervals are not joinable, this aborts.
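+/// Illustrative example (not from the original source): if the coalescer
+/// decides that LHS val#1 and RHS val#0 denote the same value, it might pass
+/// LHSValNoAssignments = {0,1}, RHSValNoAssignments = {1}, with NewVNInfo
+/// holding the two surviving VNInfo objects.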
+void LiveInterval::join(LiveInterval &Other,
+                        const int *LHSValNoAssignments,
+                        const int *RHSValNoAssignments, 
+                        SmallVector<VNInfo*, 16> &NewVNInfo,
+                        MachineRegisterInfo *MRI) {
+  // Determine if any of our live range values are mapped.  This is uncommon, so
+  // we want to avoid the interval scan if not. 
+  bool MustMapCurValNos = false;
+  unsigned NumVals = getNumValNums();
+  unsigned NumNewVals = NewVNInfo.size();
+  for (unsigned i = 0; i != NumVals; ++i) {
+    unsigned LHSValID = LHSValNoAssignments[i];
+    if (i != LHSValID ||
+        (NewVNInfo[LHSValID] && NewVNInfo[LHSValID] != getValNumInfo(i)))
+      MustMapCurValNos = true;
+  }
+
+  // If we have to apply a mapping to our base interval assignment, rewrite it
+  // now.
+  if (MustMapCurValNos) {
+    // Map the first live range.
+    iterator OutIt = begin();
+    OutIt->valno = NewVNInfo[LHSValNoAssignments[OutIt->valno->id]];
+    ++OutIt;
+    for (iterator I = OutIt, E = end(); I != E; ++I) {
+      OutIt->valno = NewVNInfo[LHSValNoAssignments[I->valno->id]];
+      
+      // If this live range has the same value # as its immediate predecessor,
+      // and if they are neighbors, remove one LiveRange.  This happens when we
+      // have [0,3:0)[4,7:1) and map 0/1 onto the same value #.
+      if (OutIt->valno == (OutIt-1)->valno && (OutIt-1)->end == OutIt->start) {
+        (OutIt-1)->end = OutIt->end;
+      } else {
+        if (I != OutIt) {
+          OutIt->start = I->start;
+          OutIt->end = I->end;
+        }
+        
+        // Didn't merge, on to the next one.
+        ++OutIt;
+      }
+    }
+    
+    // If we merge some live ranges, chop off the end.
+    ranges.erase(OutIt, end());
+  }
+
+  // Remember assignments because val# ids are changing.
+  SmallVector<unsigned, 16> OtherAssignments;
+  for (iterator I = Other.begin(), E = Other.end(); I != E; ++I)
+    OtherAssignments.push_back(RHSValNoAssignments[I->valno->id]);
+
+  // Update val# info. Renumber them and make sure they all belong to this
+  // LiveInterval now. Also remove dead val#'s.
+  unsigned NumValNos = 0;
+  for (unsigned i = 0; i < NumNewVals; ++i) {
+    VNInfo *VNI = NewVNInfo[i];
+    if (VNI) {
+      if (NumValNos >= NumVals)
+        valnos.push_back(VNI);
+      else 
+        valnos[NumValNos] = VNI;
+      VNI->id = NumValNos++;  // Renumber val#.
+    }
+  }
+  if (NumNewVals < NumVals)
+    valnos.resize(NumNewVals);  // shrinkify
+
+  // Okay, now insert the RHS live ranges into the LHS.
+  iterator InsertPos = begin();
+  unsigned RangeNo = 0;
+  for (iterator I = Other.begin(), E = Other.end(); I != E; ++I, ++RangeNo) {
+    // Map the valno in the other live range to the current live range.
+    I->valno = NewVNInfo[OtherAssignments[RangeNo]];
+    assert(I->valno && "Adding a dead range?");
+    InsertPos = addRangeFrom(*I, InsertPos);
+  }
+
+  ComputeJoinedWeight(Other);
+}
+
+/// MergeRangesInAsValue - Merge all of the intervals in RHS into this live
+/// interval as the specified value number.  The LiveRanges in RHS are
+/// allowed to overlap with LiveRanges in the current interval, but only if
+/// the overlapping LiveRanges have the specified value number.
+void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS, 
+                                        VNInfo *LHSValNo) {
+  // TODO: Make this more efficient.
+  iterator InsertPos = begin();
+  for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) {
+    // Map the valno in the other live range to the current live range.
+    LiveRange Tmp = *I;
+    Tmp.valno = LHSValNo;
+    InsertPos = addRangeFrom(Tmp, InsertPos);
+  }
+}
+
+
+/// MergeValueInAsValue - Merge all of the live ranges of a specific val#
+/// in RHS into this live interval as the specified value number.
+/// The LiveRanges in RHS are allowed to overlap with LiveRanges in the
+/// current interval; the value numbers of the overlapped live ranges are
+/// replaced with the specified value number.
+void LiveInterval::MergeValueInAsValue(
+                                    const LiveInterval &RHS,
+                                    const VNInfo *RHSValNo, VNInfo *LHSValNo) {
+  SmallVector<VNInfo*, 4> ReplacedValNos;
+  iterator IP = begin();
+  for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) {
+    assert(I->valno == RHS.getValNumInfo(I->valno->id) && "Bad VNInfo");
+    if (I->valno != RHSValNo)
+      continue;
+    SlotIndex Start = I->start, End = I->end;
+    IP = std::upper_bound(IP, end(), Start);
+    // If the start of this range overlaps with an existing liverange, trim it.
+    if (IP != begin() && IP[-1].end > Start) {
+      if (IP[-1].valno != LHSValNo) {
+        ReplacedValNos.push_back(IP[-1].valno);
+        IP[-1].valno = LHSValNo; // Update val#.
+      }
+      Start = IP[-1].end;
+      // Trimmed away the whole range?
+      if (Start >= End) continue;
+    }
+    // If the end of this range overlaps with an existing liverange, trim it.
+    if (IP != end() && End > IP->start) {
+      if (IP->valno != LHSValNo) {
+        ReplacedValNos.push_back(IP->valno);
+        IP->valno = LHSValNo;  // Update val#.
+      }
+      End = IP->start;
+      // If this trimmed away the whole range, ignore it.
+      if (Start == End) continue;
+    }
+    
+    // Map the valno in the other live range to the current live range.
+    IP = addRangeFrom(LiveRange(Start, End, LHSValNo), IP);
+  }
+
+
+  SmallSet<VNInfo*, 4> Seen;
+  for (unsigned i = 0, e = ReplacedValNos.size(); i != e; ++i) {
+    VNInfo *V1 = ReplacedValNos[i];
+    if (Seen.insert(V1)) {
+      bool isDead = true;
+      for (const_iterator I = begin(), E = end(); I != E; ++I)
+        if (I->valno == V1) {
+          isDead = false;
+          break;
+        }          
+      if (isDead) {
+        // Now that V1 is dead, remove it.
+        markValNoForDeletion(V1);
+      }
+    }
+  }
+}
+
+
+/// MergeInClobberRanges - For any live ranges that are not defined in the
+/// current interval, but are defined in the Clobbers interval, mark them
+/// used with an unknown definition value.
+void LiveInterval::MergeInClobberRanges(LiveIntervals &li_,
+                                        const LiveInterval &Clobbers,
+                                        VNInfo::Allocator &VNInfoAllocator) {
+  if (Clobbers.empty()) return;
+  
+  DenseMap<VNInfo*, VNInfo*> ValNoMaps;
+  VNInfo *UnusedValNo = 0;
+  iterator IP = begin();
+  for (const_iterator I = Clobbers.begin(), E = Clobbers.end(); I != E; ++I) {
+    // For every val# in the Clobbers interval, create a new "unknown" val#.
+    VNInfo *ClobberValNo = 0;
+    DenseMap<VNInfo*, VNInfo*>::iterator VI = ValNoMaps.find(I->valno);
+    if (VI != ValNoMaps.end())
+      ClobberValNo = VI->second;
+    else if (UnusedValNo)
+      ClobberValNo = UnusedValNo;
+    else {
+      UnusedValNo = ClobberValNo =
+        getNextValue(li_.getInvalidIndex(), 0, false, VNInfoAllocator);
+      ValNoMaps.insert(std::make_pair(I->valno, ClobberValNo));
+    }
+
+    bool Done = false;
+    SlotIndex Start = I->start, End = I->end;
+    // If a clobber range starts before an existing range and ends after
+    // it, the clobber range will need to be split into multiple ranges.
+    // Loop until the entire clobber range is handled.
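+    // Illustrative example (not from the original source): with an existing
+    // range [4,6), a clobber range [0,10) is inserted as the two sub-ranges
+    // [0,4) and [6,10).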
+    while (!Done) {
+      Done = true;
+      IP = std::upper_bound(IP, end(), Start);
+      SlotIndex SubRangeStart = Start;
+      SlotIndex SubRangeEnd = End;
+
+      // If the start of this range overlaps with an existing liverange, trim it.
+      if (IP != begin() && IP[-1].end > SubRangeStart) {
+        SubRangeStart = IP[-1].end;
+        // Trimmed away the whole range?
+        if (SubRangeStart >= SubRangeEnd) continue;
+      }
+      // If the end of this range overlaps with an existing liverange, trim it.
+      if (IP != end() && SubRangeEnd > IP->start) {
+        // If the clobber live range extends beyond the existing live range,
+        // it'll need at least another live range, so set the flag to keep
+        // iterating.
+        if (SubRangeEnd > IP->end) {
+          Start = IP->end;
+          Done = false;
+        }
+        SubRangeEnd = IP->start;
+        // If this trimmed away the whole range, ignore it.
+        if (SubRangeStart == SubRangeEnd) continue;
+      }
+
+      // Insert the clobber interval.
+      IP = addRangeFrom(LiveRange(SubRangeStart, SubRangeEnd, ClobberValNo),
+                        IP);
+      UnusedValNo = 0;
+    }
+  }
+
+  if (UnusedValNo) {
+    // Delete the last unused val#.
+    valnos.pop_back();
+  }
+}
+
+void LiveInterval::MergeInClobberRange(LiveIntervals &li_,
+                                       SlotIndex Start,
+                                       SlotIndex End,
+                                       VNInfo::Allocator &VNInfoAllocator) {
+  // Find a value # to use for the clobber ranges.  If there is already a value#
+  // for unknown values, use it.
+  VNInfo *ClobberValNo =
+    getNextValue(li_.getInvalidIndex(), 0, false, VNInfoAllocator);
+  
+  iterator IP = begin();
+  IP = std::upper_bound(IP, end(), Start);
+    
+  // If the start of this range overlaps with an existing liverange, trim it.
+  if (IP != begin() && IP[-1].end > Start) {
+    Start = IP[-1].end;
+    // Trimmed away the whole range?
+    if (Start >= End) return;
+  }
+  // If the end of this range overlaps with an existing liverange, trim it.
+  if (IP != end() && End > IP->start) {
+    End = IP->start;
+    // If this trimmed away the whole range, ignore it.
+    if (Start == End) return;
+  }
+    
+  // Insert the clobber interval.
+  addRangeFrom(LiveRange(Start, End, ClobberValNo), IP);
+}
+
+/// MergeValueNumberInto - This method is called when two value numbers
+/// are found to be equivalent.  This eliminates V1, replacing all
+/// LiveRanges with the V1 value number with the V2 value number.  This can
+/// cause merging of V1/V2 value numbers and compaction of the value space.
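+/// Illustrative example (not from the original source): for ranges
+/// [0,4):V1 and [4,8):V2, merging V1 into V2 yields the single range
+/// [0,8):V2.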
+VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
+  assert(V1 != V2 && "Identical value#'s are always equivalent!");
+
+  // This code actually merges the (numerically) larger value number into the
+  // smaller value number, which is likely to allow us to compactify the value
+  // space.  The only thing we have to be careful of is to preserve the
+  // instruction that defines the result value.
+
+  // Make sure V2 is smaller than V1.
+  if (V1->id < V2->id) {
+    V1->copyFrom(*V2);
+    std::swap(V1, V2);
+  }
+
+  // Merge V1 live ranges into V2.
+  for (iterator I = begin(); I != end(); ) {
+    iterator LR = I++;
+    if (LR->valno != V1) continue;  // Not a V1 LiveRange.
+    
+    // Okay, we found a V1 live range.  If it had a previous, touching, V2 live
+    // range, extend it.
+    if (LR != begin()) {
+      iterator Prev = LR-1;
+      if (Prev->valno == V2 && Prev->end == LR->start) {
+        Prev->end = LR->end;
+
+        // Erase this live-range.
+        ranges.erase(LR);
+        I = Prev+1;
+        LR = Prev;
+      }
+    }
+    
+    // Okay, now we have a V1 or V2 live range that is maximally merged forward.
+    // Ensure that it is a V2 live-range.
+    LR->valno = V2;
+    
+    // If we can merge it into later V2 live ranges, do so now.  We ignore any
+    // following V1 live ranges, as they will be merged in subsequent iterations
+    // of the loop.
+    if (I != end()) {
+      if (I->start == LR->end && I->valno == V2) {
+        LR->end = I->end;
+        ranges.erase(I);
+        I = LR+1;
+      }
+    }
+  }
+  
+  // Now that V1 is dead, remove it.
+  markValNoForDeletion(V1);
+  
+  return V2;
+}
+
+void LiveInterval::Copy(const LiveInterval &RHS,
+                        MachineRegisterInfo *MRI,
+                        VNInfo::Allocator &VNInfoAllocator) {
+  ranges.clear();
+  valnos.clear();
+  std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(RHS.reg);
+  MRI->setRegAllocationHint(reg, Hint.first, Hint.second);
+
+  weight = RHS.weight;
+  for (unsigned i = 0, e = RHS.getNumValNums(); i != e; ++i) {
+    const VNInfo *VNI = RHS.getValNumInfo(i);
+    createValueCopy(VNI, VNInfoAllocator);
+  }
+  for (unsigned i = 0, e = RHS.ranges.size(); i != e; ++i) {
+    const LiveRange &LR = RHS.ranges[i];
+    addRange(LiveRange(LR.start, LR.end, getValNumInfo(LR.valno->id)));
+  }
+}
+
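+/// getSize - Return the total extent of the interval, computed as the sum of
+/// the lengths of its ranges. Illustrative example (not from the original
+/// source): ranges [0,16) and [32,48) give a size of 32 index units.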
+unsigned LiveInterval::getSize() const {
+  unsigned Sum = 0;
+  for (const_iterator I = begin(), E = end(); I != E; ++I)
+    Sum += I->start.distance(I->end);
+  return Sum;
+}
+
+/// ComputeJoinedWeight - Set the weight of a live interval Joined
+/// after Other has been merged into it.
+void LiveInterval::ComputeJoinedWeight(const LiveInterval &Other) {
+  // If either of these intervals was spilled, the weight is the
+  // weight of the non-spilled interval.  This can only happen with
+  // iterative coalescers.
+
+  if (Other.weight != HUGE_VALF) {
+    weight += Other.weight;
+  }
+  else if (weight == HUGE_VALF &&
+      !TargetRegisterInfo::isPhysicalRegister(reg)) {
+    // Remove this assert if you have an iterative coalescer
+    assert(0 && "Joining to spilled interval");
+    weight = Other.weight;
+  }
+  else {
+    // Otherwise the weight stays the same
+    // Remove this assert if you have an iterative coalescer
+    assert(0 && "Joining from spilled interval");
+  }
+}
+
+raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange &LR) {
+  return os << '[' << LR.start << ',' << LR.end << ':' << LR.valno->id << ")";
+}
+
+void LiveRange::dump() const {
+  dbgs() << *this << "\n";
+}
+
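+// Illustrative output sketch (not from the original source; the exact weight
+// formatting depends on raw_ostream): a virtual register interval might print
+// as "%reg1024,0.000000e+00 = [16,32:0)  0@16".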
+void LiveInterval::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const {
+  if (isStackSlot())
+    OS << "SS#" << getStackSlotIndex();
+  else if (TRI && TargetRegisterInfo::isPhysicalRegister(reg))
+    OS << TRI->getName(reg);
+  else
+    OS << "%reg" << reg;
+
+  OS << ',' << weight;
+
+  if (empty())
+    OS << " EMPTY";
+  else {
+    OS << " = ";
+    for (LiveInterval::Ranges::const_iterator I = ranges.begin(),
+           E = ranges.end(); I != E; ++I) {
+      OS << *I;
+      assert(I->valno == getValNumInfo(I->valno->id) && "Bad VNInfo");
+    }
+  }
+
+  // Print value number info.
+  if (getNumValNums()) {
+    OS << "  ";
+    unsigned vnum = 0;
+    for (const_vni_iterator i = vni_begin(), e = vni_end(); i != e;
+         ++i, ++vnum) {
+      const VNInfo *vni = *i;
+      if (vnum) OS << " ";
+      OS << vnum << "@";
+      if (vni->isUnused()) {
+        OS << "x";
+      } else {
+        if (!vni->isDefAccurate() && !vni->isPHIDef())
+          OS << "?";
+        else
+          OS << vni->def;
+        if (vni->hasPHIKill())
+          OS << "-phikill";
+        if (vni->hasRedefByEC())
+          OS << "-ec";
+      }
+    }
+  }
+}
+
+void LiveInterval::dump() const {
+  dbgs() << *this << "\n";
+}
+
+
+void LiveRange::print(raw_ostream &os) const {
+  os << *this;
+}
diff --git a/src/LLVM/lib/CodeGen/LiveIntervalAnalysis.cpp b/src/LLVM/lib/CodeGen/LiveIntervalAnalysis.cpp
new file mode 100644
index 0000000..587c55f
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -0,0 +1,2002 @@
+//===-- LiveIntervalAnalysis.cpp - Live Interval Analysis -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveInterval analysis pass which is used
+// by the Linear Scan Register allocator. This pass linearizes the
+// basic blocks of the function in DFS order and uses the
+// LiveVariables pass to conservatively compute live intervals for
+// each virtual and physical register.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "liveintervals"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "VirtRegMap.h"
+#include "llvm/Value.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ProcessImplicitDefs.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <limits>
+#include <cmath>
+using namespace llvm;
+
+// Hidden options for help debugging.
+static cl::opt<bool> DisableReMat("disable-rematerialization", 
+                                  cl::init(false), cl::Hidden);
+
+STATISTIC(numIntervals , "Number of original intervals");
+STATISTIC(numFolds     , "Number of loads/stores folded into instructions");
+STATISTIC(numSplits    , "Number of intervals split");
+
+char LiveIntervals::ID = 0;
+INITIALIZE_PASS(LiveIntervals, "liveintervals",
+                "Live Interval Analysis", false, false);
+
+void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<AliasAnalysis>();
+  AU.addPreserved<AliasAnalysis>();
+  AU.addPreserved<LiveVariables>();
+  AU.addRequired<LiveVariables>();
+  AU.addPreservedID(MachineLoopInfoID);
+  AU.addPreservedID(MachineDominatorsID);
+  
+  if (!StrongPHIElim) {
+    AU.addPreservedID(PHIEliminationID);
+    AU.addRequiredID(PHIEliminationID);
+  }
+  
+  AU.addRequiredID(TwoAddressInstructionPassID);
+  AU.addPreserved<ProcessImplicitDefs>();
+  AU.addRequired<ProcessImplicitDefs>();
+  AU.addPreserved<SlotIndexes>();
+  AU.addRequiredTransitive<SlotIndexes>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void LiveIntervals::releaseMemory() {
+  // Free the live intervals themselves.
+  for (DenseMap<unsigned, LiveInterval*>::iterator I = r2iMap_.begin(),
+       E = r2iMap_.end(); I != E; ++I)
+    delete I->second;
+  
+  r2iMap_.clear();
+
+  // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
+  VNInfoAllocator.Reset();
+  while (!CloneMIs.empty()) {
+    MachineInstr *MI = CloneMIs.back();
+    CloneMIs.pop_back();
+    mf_->DeleteMachineInstr(MI);
+  }
+}
+
+/// runOnMachineFunction - Compute live intervals for the whole function.
+///
+bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
+  mf_ = &fn;
+  mri_ = &mf_->getRegInfo();
+  tm_ = &fn.getTarget();
+  tri_ = tm_->getRegisterInfo();
+  tii_ = tm_->getInstrInfo();
+  aa_ = &getAnalysis<AliasAnalysis>();
+  lv_ = &getAnalysis<LiveVariables>();
+  indexes_ = &getAnalysis<SlotIndexes>();
+  allocatableRegs_ = tri_->getAllocatableSet(fn);
+
+  computeIntervals();
+
+  numIntervals += getNumIntervals();
+
+  DEBUG(dump());
+  return true;
+}
+
+/// print - Implement the dump method.
+void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
+  OS << "********** INTERVALS **********\n";
+  for (const_iterator I = begin(), E = end(); I != E; ++I) {
+    I->second->print(OS, tri_);
+    OS << "\n";
+  }
+
+  printInstrs(OS);
+}
+
+void LiveIntervals::printInstrs(raw_ostream &OS) const {
+  OS << "********** MACHINEINSTRS **********\n";
+
+  for (MachineFunction::iterator mbbi = mf_->begin(), mbbe = mf_->end();
+       mbbi != mbbe; ++mbbi) {
+    OS << "BB#" << mbbi->getNumber()
+       << ":\t\t# derived from " << mbbi->getName() << "\n";
+    for (MachineBasicBlock::iterator mii = mbbi->begin(),
+           mie = mbbi->end(); mii != mie; ++mii) {
+      if (mii->isDebugValue())
+        OS << "    \t" << *mii;
+      else
+        OS << getInstructionIndex(mii) << '\t' << *mii;
+    }
+  }
+}
+
+void LiveIntervals::dumpInstrs() const {
+  printInstrs(dbgs());
+}
+
+bool LiveIntervals::conflictsWithPhysReg(const LiveInterval &li,
+                                         VirtRegMap &vrm, unsigned reg) {
+  // We don't handle fancy stuff crossing basic block boundaries
+  if (li.ranges.size() != 1)
+    return true;
+  const LiveRange &range = li.ranges.front();
+  SlotIndex idx = range.start.getBaseIndex();
+  SlotIndex end = range.end.getPrevSlot().getBaseIndex().getNextIndex();
+
+  // Skip deleted instructions
+  MachineInstr *firstMI = getInstructionFromIndex(idx);
+  while (!firstMI && idx != end) {
+    idx = idx.getNextIndex();
+    firstMI = getInstructionFromIndex(idx);
+  }
+  if (!firstMI)
+    return false;
+
+  // Find last instruction in range
+  SlotIndex lastIdx = end.getPrevIndex();
+  MachineInstr *lastMI = getInstructionFromIndex(lastIdx);
+  while (!lastMI && lastIdx != idx) {
+    lastIdx = lastIdx.getPrevIndex();
+    lastMI = getInstructionFromIndex(lastIdx);
+  }
+  if (!lastMI)
+    return false;
+
+  // Range cannot cross basic block boundaries or terminators
+  MachineBasicBlock *MBB = firstMI->getParent();
+  if (MBB != lastMI->getParent() || lastMI->getDesc().isTerminator())
+    return true;
+
+  MachineBasicBlock::const_iterator E = lastMI;
+  ++E;
+  for (MachineBasicBlock::const_iterator I = firstMI; I != E; ++I) {
+    const MachineInstr &MI = *I;
+
+    // Allow copies to and from li.reg
+    if (MI.isCopy())
+      if (MI.getOperand(0).getReg() == li.reg ||
+          MI.getOperand(1).getReg() == li.reg)
+        continue;
+
+    // Check for operands using reg
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e;  ++i) {
+      const MachineOperand& mop = MI.getOperand(i);
+      if (!mop.isReg())
+        continue;
+      unsigned PhysReg = mop.getReg();
+      if (PhysReg == 0 || PhysReg == li.reg)
+        continue;
+      if (TargetRegisterInfo::isVirtualRegister(PhysReg)) {
+        if (!vrm.hasPhys(PhysReg))
+          continue;
+        PhysReg = vrm.getPhys(PhysReg);
+      }
+      if (PhysReg && tri_->regsOverlap(PhysReg, reg))
+        return true;
+    }
+  }
+
+  // No conflicts found.
+  return false;
+}
+
+bool LiveIntervals::conflictsWithAliasRef(LiveInterval &li, unsigned Reg,
+                                  SmallPtrSet<MachineInstr*,32> &JoinedCopies) {
+  for (LiveInterval::Ranges::const_iterator
+         I = li.ranges.begin(), E = li.ranges.end(); I != E; ++I) {
+    for (SlotIndex index = I->start.getBaseIndex(),
+           end = I->end.getPrevSlot().getBaseIndex().getNextIndex();
+           index != end;
+           index = index.getNextIndex()) {
+      MachineInstr *MI = getInstructionFromIndex(index);
+      if (!MI)
+        continue;               // skip deleted instructions
+
+      if (JoinedCopies.count(MI))
+        continue;
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        MachineOperand& MO = MI->getOperand(i);
+        if (!MO.isReg())
+          continue;
+        unsigned PhysReg = MO.getReg();
+        if (PhysReg == 0 || PhysReg == Reg ||
+            TargetRegisterInfo::isVirtualRegister(PhysReg))
+          continue;
+        if (tri_->regsOverlap(Reg, PhysReg))
+          return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+#ifndef NDEBUG
+static void printRegName(unsigned reg, const TargetRegisterInfo* tri_) {
+  if (TargetRegisterInfo::isPhysicalRegister(reg))
+    dbgs() << tri_->getName(reg);
+  else
+    dbgs() << "%reg" << reg;
+}
+#endif
+
+static
+bool MultipleDefsBySameMI(const MachineInstr &MI, unsigned MOIdx) {
+  unsigned Reg = MI.getOperand(MOIdx).getReg();
+  for (unsigned i = MOIdx+1, e = MI.getNumOperands(); i < e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg())
+      continue;
+    if (MO.getReg() == Reg && MO.isDef()) {
+      assert(MI.getOperand(MOIdx).getSubReg() != MO.getSubReg() &&
+             MI.getOperand(MOIdx).getSubReg() &&
+             (MO.getSubReg() || MO.isImplicit()));
+      return true;
+    }
+  }
+  return false;
+}
+
+/// isPartialRedef - Return true if the specified def at the specified index
+/// is partially re-defining the specified live interval. A common case of
+/// this is a definition of a sub-register.
+bool LiveIntervals::isPartialRedef(SlotIndex MIIdx, MachineOperand &MO,
+                                   LiveInterval &interval) {
+  if (!MO.getSubReg() || MO.isEarlyClobber())
+    return false;
+
+  SlotIndex RedefIndex = MIIdx.getDefIndex();
+  const LiveRange *OldLR =
+    interval.getLiveRangeContaining(RedefIndex.getUseIndex());
+  if (OldLR->valno->isDefAccurate()) {
+    MachineInstr *DefMI = getInstructionFromIndex(OldLR->valno->def);
+    return DefMI->findRegisterDefOperandIdx(interval.reg) != -1;
+  }
+  return false;
+}
+
+void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
+                                             MachineBasicBlock::iterator mi,
+                                             SlotIndex MIIdx,
+                                             MachineOperand& MO,
+                                             unsigned MOIdx,
+                                             LiveInterval &interval) {
+  DEBUG({
+      dbgs() << "\t\tregister: ";
+      printRegName(interval.reg, tri_);
+    });
+
+  // Virtual registers may be defined multiple times (due to phi
+  // elimination and 2-addr elimination).  Much of what we do only has to be
+  // done once for the vreg.  We use an empty interval to detect the first
+  // time we see a vreg.
+  LiveVariables::VarInfo& vi = lv_->getVarInfo(interval.reg);
+  if (interval.empty()) {
+    // Get the Idx of the defining instructions.
+    SlotIndex defIndex = MIIdx.getDefIndex();
+    // Earlyclobbers move back one, so that they overlap the live range
+    // of inputs.
+    if (MO.isEarlyClobber())
+      defIndex = MIIdx.getUseIndex();
+
+    // Make sure the first definition is not a partial redefinition. Add an
+    // <imp-def> of the full register.
+    if (MO.getSubReg())
+      mi->addRegisterDefined(interval.reg);
+
+    MachineInstr *CopyMI = NULL;
+    if (mi->isCopyLike()) {
+      CopyMI = mi;
+    }
+
+    VNInfo *ValNo = interval.getNextValue(defIndex, CopyMI, true,
+                                          VNInfoAllocator);
+    assert(ValNo->id == 0 && "First value in interval is not 0?");
+
+    // Loop over all of the blocks that the vreg is defined in.  There are
+    // two cases we have to handle here.  The most common case is a vreg
+    // whose lifetime is contained within a basic block.  In this case there
+    // will be a single kill, in MBB, which comes after the definition.
+    if (vi.Kills.size() == 1 && vi.Kills[0]->getParent() == mbb) {
+      // FIXME: what about dead vars?
+      SlotIndex killIdx;
+      if (vi.Kills[0] != mi)
+        killIdx = getInstructionIndex(vi.Kills[0]).getDefIndex();
+      else
+        killIdx = defIndex.getStoreIndex();
+
+      // If the kill happens after the definition, we have an intra-block
+      // live range.
+      if (killIdx > defIndex) {
+        assert(vi.AliveBlocks.empty() &&
+               "Shouldn't be alive across any blocks!");
+        LiveRange LR(defIndex, killIdx, ValNo);
+        interval.addRange(LR);
+        DEBUG(dbgs() << " +" << LR << "\n");
+        return;
+      }
+    }
+
+    // The other case we handle is when a virtual register lives to the end
+    // of the defining block, potentially live across some blocks, then is
+    // live into some number of blocks, but gets killed.  Start by adding a
+    // range that goes from this definition to the end of the defining block.
+    LiveRange NewLR(defIndex, getMBBEndIdx(mbb), ValNo);
+    DEBUG(dbgs() << " +" << NewLR);
+    interval.addRange(NewLR);
+
+    bool PHIJoin = lv_->isPHIJoin(interval.reg);
+
+    if (PHIJoin) {
+      // A phi join register is killed at the end of the MBB and revived as a new
+      // valno in the killing blocks.
+      assert(vi.AliveBlocks.empty() && "Phi join can't pass through blocks");
+      DEBUG(dbgs() << " phi-join");
+      ValNo->setHasPHIKill(true);
+    } else {
+      // Iterate over all of the blocks that the variable is completely
+      // live in, adding [instrIndex(begin), instrIndex(end)+4) to the
+      // live interval.
+      for (SparseBitVector<>::iterator I = vi.AliveBlocks.begin(),
+               E = vi.AliveBlocks.end(); I != E; ++I) {
+        MachineBasicBlock *aliveBlock = mf_->getBlockNumbered(*I);
+        LiveRange LR(getMBBStartIdx(aliveBlock), getMBBEndIdx(aliveBlock), ValNo);
+        interval.addRange(LR);
+        DEBUG(dbgs() << " +" << LR);
+      }
+    }
+
+    // Finally, this virtual register is live from the start of any killing
+    // block to the 'use' slot of the killing instruction.
+    for (unsigned i = 0, e = vi.Kills.size(); i != e; ++i) {
+      MachineInstr *Kill = vi.Kills[i];
+      SlotIndex Start = getMBBStartIdx(Kill->getParent());
+      SlotIndex killIdx = getInstructionIndex(Kill).getDefIndex();
+
+      // Create an interval with a NEW value number.  Note that this value
+      // number isn't actually defined by an instruction; it is a phi-def.
+      if (PHIJoin) {
+        ValNo = interval.getNextValue(SlotIndex(Start, true), 0, false,
+                                      VNInfoAllocator);
+        ValNo->setIsPHIDef(true);
+      }
+      LiveRange LR(Start, killIdx, ValNo);
+      interval.addRange(LR);
+      DEBUG(dbgs() << " +" << LR);
+    }
+
+  } else {
+    if (MultipleDefsBySameMI(*mi, MOIdx))
+      // Multiple defs of the same virtual register by the same instruction.
+      // e.g. %reg1031:5<def>, %reg1031:6<def> = VLD1q16 %reg1024<kill>, ...
+      // This is likely due to elimination of REG_SEQUENCE instructions. Return
+      // here since there is nothing to do.
+      return;
+
+    // If this is the second time we see a virtual register definition, it
+    // must be due to phi elimination or two addr elimination.  If this is
+    // the result of two address elimination, then the vreg is one of the
+    // def-and-use register operand.
+
+    // It may also be a partial redef, like this:
+    // 80	%reg1041:6<def> = VSHRNv4i16 %reg1034<kill>, 12, pred:14, pred:%reg0
+    // 120	%reg1041:5<def> = VSHRNv4i16 %reg1039<kill>, 12, pred:14, pred:%reg0
+    bool PartReDef = isPartialRedef(MIIdx, MO, interval);
+    if (PartReDef || mi->isRegTiedToUseOperand(MOIdx)) {
+      // If this is a two-address definition, then we have already processed
+      // the live range.  The only problem is that we didn't realize there
+      // are actually two values in the live interval.  Because of this we
+      // need to take the LiveRegion that defines this register and split it
+      // into two values.
+      SlotIndex RedefIndex = MIIdx.getDefIndex();
+      if (MO.isEarlyClobber())
+        RedefIndex = MIIdx.getUseIndex();
+
+      const LiveRange *OldLR =
+        interval.getLiveRangeContaining(RedefIndex.getUseIndex());
+      VNInfo *OldValNo = OldLR->valno;
+      SlotIndex DefIndex = OldValNo->def.getDefIndex();
+
+      // Delete the previous value, which should be short and continuous,
+      // because the 2-addr copy must be in the same MBB as the redef.
+      interval.removeRange(DefIndex, RedefIndex);
+
+      // The new value number (#1) is defined by the instruction we claimed
+      // defined value #0.
+      VNInfo *ValNo = interval.getNextValue(OldValNo->def, OldValNo->getCopy(),
+                                            false, // flags are copied below
+                                            VNInfoAllocator);
+      ValNo->setFlags(OldValNo->getFlags()); // Carry over the old value's flags.
+
+      // Value#0 is now defined by the 2-addr instruction.
+      OldValNo->def  = RedefIndex;
+      OldValNo->setCopy(0);
+
+      // A re-def may be a copy. e.g. %reg1030:6<def> = VMOVD %reg1026, ...
+      if (PartReDef && mi->isCopyLike())
+        OldValNo->setCopy(&*mi);
+      
+      // Add the new live interval which replaces the range for the input copy.
+      LiveRange LR(DefIndex, RedefIndex, ValNo);
+      DEBUG(dbgs() << " replace range with " << LR);
+      interval.addRange(LR);
+
+      // If this redefinition is dead, we need to add a dummy unit live
+      // range covering the def slot.
+      if (MO.isDead())
+        interval.addRange(LiveRange(RedefIndex, RedefIndex.getStoreIndex(),
+                                    OldValNo));
+
+      DEBUG({
+          dbgs() << " RESULT: ";
+          interval.print(dbgs(), tri_);
+        });
+    } else if (lv_->isPHIJoin(interval.reg)) {
+      // In the case of PHI elimination, each variable definition is only
+      // live until the end of the block.  We've already taken care of the
+      // rest of the live range.
+
+      SlotIndex defIndex = MIIdx.getDefIndex();
+      if (MO.isEarlyClobber())
+        defIndex = MIIdx.getUseIndex();
+
+      VNInfo *ValNo;
+      MachineInstr *CopyMI = NULL;
+      if (mi->isCopyLike())
+        CopyMI = mi;
+      ValNo = interval.getNextValue(defIndex, CopyMI, true, VNInfoAllocator);
+      
+      SlotIndex killIndex = getMBBEndIdx(mbb);
+      LiveRange LR(defIndex, killIndex, ValNo);
+      interval.addRange(LR);
+      ValNo->setHasPHIKill(true);
+      DEBUG(dbgs() << " phi-join +" << LR);
+    } else {
+      llvm_unreachable("Multiply defined register");
+    }
+  }
+
+  DEBUG(dbgs() << '\n');
+}
+
+void LiveIntervals::handlePhysicalRegisterDef(MachineBasicBlock *MBB,
+                                              MachineBasicBlock::iterator mi,
+                                              SlotIndex MIIdx,
+                                              MachineOperand& MO,
+                                              LiveInterval &interval,
+                                              MachineInstr *CopyMI) {
+  // A physical register cannot be live across basic block boundaries, so
+  // its lifetime must end somewhere in its defining basic block.
+  DEBUG({
+      dbgs() << "\t\tregister: ";
+      printRegName(interval.reg, tri_);
+    });
+
+  SlotIndex baseIndex = MIIdx;
+  SlotIndex start = baseIndex.getDefIndex();
+  // Earlyclobbers move back one.
+  if (MO.isEarlyClobber())
+    start = MIIdx.getUseIndex();
+  SlotIndex end = start;
+
+  // If it is not used after definition, it is considered dead at
+  // the instruction defining it. Hence its interval is:
+  // [defSlot(def), defSlot(def)+1)
+  // For earlyclobbers, the defSlot was pushed back one; the extra
+  // advance below compensates.
+  if (MO.isDead()) {
+    DEBUG(dbgs() << " dead");
+    end = start.getStoreIndex();
+    goto exit;
+  }
+
+  // If it is not dead on definition, it must be killed by a
+  // subsequent instruction. Hence its interval is:
+  // [defSlot(def), useSlot(kill)+1)
+  baseIndex = baseIndex.getNextIndex();
+  while (++mi != MBB->end()) {
+
+    if (mi->isDebugValue())
+      continue;
+    if (getInstructionFromIndex(baseIndex) == 0)
+      baseIndex = indexes_->getNextNonNullIndex(baseIndex);
+
+    if (mi->killsRegister(interval.reg, tri_)) {
+      DEBUG(dbgs() << " killed");
+      end = baseIndex.getDefIndex();
+      goto exit;
+    } else {
+      int DefIdx = mi->findRegisterDefOperandIdx(interval.reg,false,false,tri_);
+      if (DefIdx != -1) {
+        if (mi->isRegTiedToUseOperand(DefIdx)) {
+          // Two-address instruction.
+          end = baseIndex.getDefIndex();
+        } else {
+          // Another instruction redefines the register before it is ever read.
+          // Then the register is essentially dead at the instruction that
+          // defines it. Hence its interval is:
+          // [defSlot(def), defSlot(def)+1)
+          DEBUG(dbgs() << " dead");
+          end = start.getStoreIndex();
+        }
+        goto exit;
+      }
+    }
+    
+    baseIndex = baseIndex.getNextIndex();
+  }
+  
+  // The only case where we should have a dead physreg here without a killing
+  // instruction is if it is live-in to the function and never used.  Another
+  // possible case is that the implicit use of the physical register has been
+  // deleted by the two-address pass.
+  end = start.getStoreIndex();
+
+exit:
+  assert(start < end && "did not find end of interval?");
+
+  // Already exists? Extend old live interval.
+  LiveInterval::iterator OldLR = interval.FindLiveRangeContaining(start);
+  bool Extend = OldLR != interval.end();
+  VNInfo *ValNo = Extend
+    ? OldLR->valno : interval.getNextValue(start, CopyMI, true, VNInfoAllocator);
+  if (MO.isEarlyClobber() && Extend)
+    ValNo->setHasRedefByEC(true);
+  LiveRange LR(start, end, ValNo);
+  interval.addRange(LR);
+  DEBUG(dbgs() << " +" << LR << '\n');
+}
+
+void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB,
+                                      MachineBasicBlock::iterator MI,
+                                      SlotIndex MIIdx,
+                                      MachineOperand& MO,
+                                      unsigned MOIdx) {
+  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+    handleVirtualRegisterDef(MBB, MI, MIIdx, MO, MOIdx,
+                             getOrCreateInterval(MO.getReg()));
+  else if (allocatableRegs_[MO.getReg()]) {
+    MachineInstr *CopyMI = NULL;
+    if (MI->isCopyLike())
+      CopyMI = MI;
+    handlePhysicalRegisterDef(MBB, MI, MIIdx, MO,
+                              getOrCreateInterval(MO.getReg()), CopyMI);
+    // Def of a register also defines its sub-registers.
+    for (const unsigned* AS = tri_->getSubRegisters(MO.getReg()); *AS; ++AS)
+      // If MI also modifies the sub-register explicitly, avoid processing it
+      // more than once. Do not pass in TRI here so it checks for exact match.
+      if (!MI->definesRegister(*AS))
+        handlePhysicalRegisterDef(MBB, MI, MIIdx, MO,
+                                  getOrCreateInterval(*AS), 0);
+  }
+}
+
+void LiveIntervals::handleLiveInRegister(MachineBasicBlock *MBB,
+                                         SlotIndex MIIdx,
+                                         LiveInterval &interval, bool isAlias) {
+  DEBUG({
+      dbgs() << "\t\tlivein register: ";
+      printRegName(interval.reg, tri_);
+    });
+
+  // Look for kills, if it reaches a def before it's killed, then it shouldn't
+  // be considered a livein.
+  MachineBasicBlock::iterator mi = MBB->begin();
+  MachineBasicBlock::iterator E = MBB->end();
+  // Skip over DBG_VALUE at the start of the MBB.
+  if (mi != E && mi->isDebugValue()) {
+    while (++mi != E && mi->isDebugValue())
+      ;
+    if (mi == E)
+      // MBB is empty except for DBG_VALUE's.
+      return;
+  }
+
+  SlotIndex baseIndex = MIIdx;
+  SlotIndex start = baseIndex;
+  if (getInstructionFromIndex(baseIndex) == 0)
+    baseIndex = indexes_->getNextNonNullIndex(baseIndex);
+
+  SlotIndex end = baseIndex;
+  bool SeenDefUse = false;
+
+  while (mi != E) {
+    if (mi->killsRegister(interval.reg, tri_)) {
+      DEBUG(dbgs() << " killed");
+      end = baseIndex.getDefIndex();
+      SeenDefUse = true;
+      break;
+    } else if (mi->definesRegister(interval.reg, tri_)) {
+      // Another instruction redefines the register before it is ever read.
+      // Then the register is essentially dead at the instruction that defines
+      // it. Hence its interval is:
+      // [defSlot(def), defSlot(def)+1)
+      DEBUG(dbgs() << " dead");
+      end = start.getStoreIndex();
+      SeenDefUse = true;
+      break;
+    }
+
+    while (++mi != E && mi->isDebugValue())
+      // Skip over DBG_VALUE.
+      ;
+    if (mi != E)
+      baseIndex = indexes_->getNextNonNullIndex(baseIndex);
+  }
+
+  // Live-in register might not be used at all.
+  if (!SeenDefUse) {
+    if (isAlias) {
+      DEBUG(dbgs() << " dead");
+      end = MIIdx.getStoreIndex();
+    } else {
+      DEBUG(dbgs() << " live through");
+      end = baseIndex;
+    }
+  }
+
+  VNInfo *vni =
+    interval.getNextValue(SlotIndex(getMBBStartIdx(MBB), true),
+                          0, false, VNInfoAllocator);
+  vni->setIsPHIDef(true);
+  LiveRange LR(start, end, vni);
+
+  interval.addRange(LR);
+  DEBUG(dbgs() << " +" << LR << '\n');
+}
+
+/// computeIntervals - Compute the live intervals for virtual registers.  For
+/// some ordering of the machine instructions [1,N], a live interval is an
+/// interval [i, j), where 1 <= i <= j < N, over which a variable is live.
+void LiveIntervals::computeIntervals() { 
+  DEBUG(dbgs() << "********** COMPUTING LIVE INTERVALS **********\n"
+               << "********** Function: "
+               << ((Value*)mf_->getFunction())->getName() << '\n');
+
+  SmallVector<unsigned, 8> UndefUses;
+  for (MachineFunction::iterator MBBI = mf_->begin(), E = mf_->end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock *MBB = MBBI;
+    if (MBB->empty())
+      continue;
+
+    // Track the index of the current machine instr.
+    SlotIndex MIIndex = getMBBStartIdx(MBB);
+    DEBUG(dbgs() << "BB#" << MBB->getNumber()
+          << ":\t\t# derived from " << MBB->getName() << "\n");
+
+    // Create intervals for live-ins to this BB first.
+    for (MachineBasicBlock::livein_iterator LI = MBB->livein_begin(),
+           LE = MBB->livein_end(); LI != LE; ++LI) {
+      handleLiveInRegister(MBB, MIIndex, getOrCreateInterval(*LI));
+      // Multiple live-ins can alias the same register.
+      for (const unsigned* AS = tri_->getSubRegisters(*LI); *AS; ++AS)
+        if (!hasInterval(*AS))
+          handleLiveInRegister(MBB, MIIndex, getOrCreateInterval(*AS),
+                               true);
+    }
+    
+    // Skip over empty initial indices.
+    if (getInstructionFromIndex(MIIndex) == 0)
+      MIIndex = indexes_->getNextNonNullIndex(MIIndex);
+    
+    for (MachineBasicBlock::iterator MI = MBB->begin(), miEnd = MBB->end();
+         MI != miEnd; ++MI) {
+      DEBUG(dbgs() << MIIndex << "\t" << *MI);
+      if (MI->isDebugValue())
+        continue;
+
+      // Handle defs.
+      for (int i = MI->getNumOperands() - 1; i >= 0; --i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (!MO.isReg() || !MO.getReg())
+          continue;
+
+        // handle register defs - build intervals
+        if (MO.isDef())
+          handleRegisterDef(MBB, MI, MIIndex, MO, i);
+        else if (MO.isUndef())
+          UndefUses.push_back(MO.getReg());
+      }
+      
+      // Move to the next instr slot.
+      MIIndex = indexes_->getNextNonNullIndex(MIIndex);
+    }
+  }
+
+  // Create empty intervals for registers defined by implicit_def's (except
+  // for those implicit_defs that define values which are live out of their
+  // blocks).
+  for (unsigned i = 0, e = UndefUses.size(); i != e; ++i) {
+    unsigned UndefReg = UndefUses[i];
+    (void)getOrCreateInterval(UndefReg);
+  }
+}
+
+LiveInterval* LiveIntervals::createInterval(unsigned reg) {
+  float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? HUGE_VALF : 0.0F;
+  return new LiveInterval(reg, Weight);
+}
+
+/// dupInterval - Duplicate a live interval. The caller is responsible for
+/// managing the allocated memory.
+LiveInterval* LiveIntervals::dupInterval(LiveInterval *li) {
+  LiveInterval *NewLI = createInterval(li->reg);
+  NewLI->Copy(*li, mri_, getVNInfoAllocator());
+  return NewLI;
+}
+
+//===----------------------------------------------------------------------===//
+// Register allocator hooks.
+//
+
+/// getReMatImplicitUse - If the remat definition MI has one (for now, we only
+/// allow one) virtual register operand, then its uses are implicitly using
+/// the register. Returns the virtual register.
+unsigned LiveIntervals::getReMatImplicitUse(const LiveInterval &li,
+                                            MachineInstr *MI) const {
+  unsigned RegOp = 0;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || Reg == li.reg)
+      continue;
+    
+    if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
+        !allocatableRegs_[Reg])
+      continue;
+    // FIXME: For now, only remat MI with at most one register operand.
+    assert(!RegOp &&
+           "Can't rematerialize instruction with multiple register operands!");
+    RegOp = MO.getReg();
+#ifndef NDEBUG
+    break;
+#endif
+  }
+  return RegOp;
+}
+
+/// isValNoAvailableAt - Return true if the val# of the specified interval
+/// which reaches the given instruction also reaches the specified use index.
+bool LiveIntervals::isValNoAvailableAt(const LiveInterval &li, MachineInstr *MI,
+                                       SlotIndex UseIdx) const {
+  SlotIndex Index = getInstructionIndex(MI);  
+  VNInfo *ValNo = li.FindLiveRangeContaining(Index)->valno;
+  LiveInterval::const_iterator UI = li.FindLiveRangeContaining(UseIdx);
+  return UI != li.end() && UI->valno == ValNo;
+}
+
+/// isReMaterializable - Returns true if the definition MI of the specified
+/// val# of the specified interval is re-materializable.
+bool LiveIntervals::isReMaterializable(const LiveInterval &li,
+                                       const VNInfo *ValNo, MachineInstr *MI,
+                                       SmallVectorImpl<LiveInterval*> &SpillIs,
+                                       bool &isLoad) {
+  if (DisableReMat)
+    return false;
+
+  if (!tii_->isTriviallyReMaterializable(MI, aa_))
+    return false;
+
+  // Target-specific code can mark an instruction as being rematerializable
+  // if it has one virtual reg use, though it had better be something like
+  // a PIC base register which is likely to be live everywhere.
+  unsigned ImpUse = getReMatImplicitUse(li, MI);
+  if (ImpUse) {
+    const LiveInterval &ImpLi = getInterval(ImpUse);
+    for (MachineRegisterInfo::use_nodbg_iterator
+           ri = mri_->use_nodbg_begin(li.reg), re = mri_->use_nodbg_end();
+         ri != re; ++ri) {
+      MachineInstr *UseMI = &*ri;
+      SlotIndex UseIdx = getInstructionIndex(UseMI);
+      if (li.FindLiveRangeContaining(UseIdx)->valno != ValNo)
+        continue;
+      if (!isValNoAvailableAt(ImpLi, MI, UseIdx))
+        return false;
+    }
+
+    // If a register operand of the re-materialized instruction is going to
+    // be spilled next, then it's not legal to re-materialize this instruction.
+    for (unsigned i = 0, e = SpillIs.size(); i != e; ++i)
+      if (ImpUse == SpillIs[i]->reg)
+        return false;
+  }
+  return true;
+}
+
+/// isReMaterializable - Returns true if the definition MI of the specified
+/// val# of the specified interval is re-materializable.
+bool LiveIntervals::isReMaterializable(const LiveInterval &li,
+                                       const VNInfo *ValNo, MachineInstr *MI) {
+  SmallVector<LiveInterval*, 4> Dummy1;
+  bool Dummy2;
+  return isReMaterializable(li, ValNo, MI, Dummy1, Dummy2);
+}
+
+/// isReMaterializable - Returns true if the definition of every val# of the
+/// specified interval is re-materializable.
+bool LiveIntervals::isReMaterializable(const LiveInterval &li,
+                                       SmallVectorImpl<LiveInterval*> &SpillIs,
+                                       bool &isLoad) {
+  isLoad = false;
+  for (LiveInterval::const_vni_iterator i = li.vni_begin(), e = li.vni_end();
+       i != e; ++i) {
+    const VNInfo *VNI = *i;
+    if (VNI->isUnused())
+      continue; // Dead val#.
+    // Is the def for the val# rematerializable?
+    if (!VNI->isDefAccurate())
+      return false;
+    MachineInstr *ReMatDefMI = getInstructionFromIndex(VNI->def);
+    bool DefIsLoad = false;
+    if (!ReMatDefMI ||
+        !isReMaterializable(li, VNI, ReMatDefMI, SpillIs, DefIsLoad))
+      return false;
+    isLoad |= DefIsLoad;
+  }
+  return true;
+}
+
+/// FilterFoldedOps - Filter out two-address use operands. Return
+/// true if it finds any issue with the operands that ought to prevent
+/// folding.
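+/// For illustration (not from the original source): a use operand that is
+/// tied to a def (two-address) sets MRInfo to VirtRegMap::isModRef and is
+/// left out of FoldOps; a plain def OR's in isMod and a plain use OR's in
+/// isRef.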
+static bool FilterFoldedOps(MachineInstr *MI,
+                            SmallVector<unsigned, 2> &Ops,
+                            unsigned &MRInfo,
+                            SmallVector<unsigned, 2> &FoldOps) {
+  MRInfo = 0;
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    unsigned OpIdx = Ops[i];
+    MachineOperand &MO = MI->getOperand(OpIdx);
+    // FIXME: fold subreg use.
+    if (MO.getSubReg())
+      return true;
+    if (MO.isDef())
+      MRInfo |= (unsigned)VirtRegMap::isMod;
+    else {
+      // Filter out two-address use operand(s).
+      if (MI->isRegTiedToDefOperand(OpIdx)) {
+        MRInfo = VirtRegMap::isModRef;
+        continue;
+      }
+      MRInfo |= (unsigned)VirtRegMap::isRef;
+    }
+    FoldOps.push_back(OpIdx);
+  }
+  return false;
+}
+                           
+
+/// tryFoldMemoryOperand - Attempts to fold either a spill / restore from
+/// slot / to reg or any rematerialized load into the ith operand of the
+/// specified MI. If it is successful, MI is updated with the newly created
+/// MI and the function returns true.
+bool LiveIntervals::tryFoldMemoryOperand(MachineInstr* &MI,
+                                         VirtRegMap &vrm, MachineInstr *DefMI,
+                                         SlotIndex InstrIdx,
+                                         SmallVector<unsigned, 2> &Ops,
+                                         bool isSS, int Slot, unsigned Reg) {
+  // If it is an implicit def instruction, just delete it.
+  if (MI->isImplicitDef()) {
+    RemoveMachineInstrFromMaps(MI);
+    vrm.RemoveMachineInstrFromMaps(MI);
+    MI->eraseFromParent();
+    ++numFolds;
+    return true;
+  }
+
+  // Filter the list of operand indexes that are to be folded. Abort if
+  // any operand will prevent folding.
+  unsigned MRInfo = 0;
+  SmallVector<unsigned, 2> FoldOps;
+  if (FilterFoldedOps(MI, Ops, MRInfo, FoldOps))
+    return false;
+
+  // The only time it's safe to fold into a two address instruction is when
+  // it's folding reload and spill from / into a spill stack slot.
+  if (DefMI && (MRInfo & VirtRegMap::isMod))
+    return false;
+
+  MachineInstr *fmi = isSS ? tii_->foldMemoryOperand(MI, FoldOps, Slot)
+                           : tii_->foldMemoryOperand(MI, FoldOps, DefMI);
+  if (fmi) {
+    // Remember this instruction uses the spill slot.
+    if (isSS) vrm.addSpillSlotUse(Slot, fmi);
+
+    // Attempt to fold the memory reference into the instruction. If
+    // we can do this, we don't need to insert spill code.
+    if (isSS && !mf_->getFrameInfo()->isImmutableObjectIndex(Slot))
+      vrm.virtFolded(Reg, MI, fmi, (VirtRegMap::ModRef)MRInfo);
+    vrm.transferSpillPts(MI, fmi);
+    vrm.transferRestorePts(MI, fmi);
+    vrm.transferEmergencySpills(MI, fmi);
+    ReplaceMachineInstrInMaps(MI, fmi);
+    MI->eraseFromParent();
+    MI = fmi;
+    ++numFolds;
+    return true;
+  }
+  return false;
+}
+
+/// canFoldMemoryOperand - Returns true if the specified load / store
+/// folding is possible.
+bool LiveIntervals::canFoldMemoryOperand(MachineInstr *MI,
+                                         SmallVector<unsigned, 2> &Ops,
+                                         bool ReMat) const {
+  // Filter the list of operand indexes that are to be folded. Abort if
+  // any operand will prevent folding.
+  unsigned MRInfo = 0;
+  SmallVector<unsigned, 2> FoldOps;
+  if (FilterFoldedOps(MI, Ops, MRInfo, FoldOps))
+    return false;
+
+  // It's only legal to remat for a use, not a def.
+  if (ReMat && (MRInfo & VirtRegMap::isMod))
+    return false;
+
+  return tii_->canFoldMemoryOperand(MI, FoldOps);
+}
+
+bool LiveIntervals::intervalIsInOneMBB(const LiveInterval &li) const {
+  LiveInterval::Ranges::const_iterator itr = li.ranges.begin();
+
+  MachineBasicBlock *mbb =  indexes_->getMBBCoveringRange(itr->start, itr->end);
+
+  if (mbb == 0)
+    return false;
+
+  for (++itr; itr != li.ranges.end(); ++itr) {
+    MachineBasicBlock *mbb2 =
+      indexes_->getMBBCoveringRange(itr->start, itr->end);
+
+    if (mbb2 != mbb)
+      return false;
+  }
+
+  return true;
+}
+
+/// rewriteImplicitOps - Rewrite implicit use operands of MI (i.e. uses of the
+/// interval by to-be re-materialized operands of MI) to use the new register.
+void LiveIntervals::rewriteImplicitOps(const LiveInterval &li,
+                                       MachineInstr *MI, unsigned NewVReg,
+                                       VirtRegMap &vrm) {
+  // There is an implicit use. That means one of the other operands is
+  // being remat'ed and the remat'ed instruction has li.reg as a use
+  // operand. Make sure we rewrite that as well.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+    if (!vrm.isReMaterialized(Reg))
+      continue;
+    MachineInstr *ReMatMI = vrm.getReMaterializedMI(Reg);
+    MachineOperand *UseMO = ReMatMI->findRegisterUseOperand(li.reg);
+    if (UseMO)
+      UseMO->setReg(NewVReg);
+  }
+}
+
+/// rewriteInstructionForSpills, rewriteInstructionsForSpills - Helper functions
+/// for addIntervalsForSpills to rewrite uses / defs for the given live range.
+bool LiveIntervals::
+rewriteInstructionForSpills(const LiveInterval &li, const VNInfo *VNI,
+                 bool TrySplit, SlotIndex index, SlotIndex end, 
+                 MachineInstr *MI,
+                 MachineInstr *ReMatOrigDefMI, MachineInstr *ReMatDefMI,
+                 unsigned Slot, int LdSlot,
+                 bool isLoad, bool isLoadSS, bool DefIsReMat, bool CanDelete,
+                 VirtRegMap &vrm,
+                 const TargetRegisterClass* rc,
+                 SmallVector<int, 4> &ReMatIds,
+                 const MachineLoopInfo *loopInfo,
+                 unsigned &NewVReg, unsigned ImpUse, bool &HasDef, bool &HasUse,
+                 DenseMap<unsigned,unsigned> &MBBVRegsMap,
+                 std::vector<LiveInterval*> &NewLIs) {
+  bool CanFold = false;
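+  // Folding a memory operand below may rewrite MI in place; when that
+  // happens the operand list changes and the scan restarts at this label.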
+ RestartInstruction:
+  for (unsigned i = 0; i != MI->getNumOperands(); ++i) {
+    MachineOperand& mop = MI->getOperand(i);
+    if (!mop.isReg())
+      continue;
+    unsigned Reg = mop.getReg();
+    if (Reg == 0 || TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+    if (Reg != li.reg)
+      continue;
+
+    bool TryFold = !DefIsReMat;
+    bool FoldSS = true; // Default behavior unless it's a remat.
+    int FoldSlot = Slot;
+    if (DefIsReMat) {
+      // If this is the rematerializable definition MI itself and
+      // all of its uses are rematerialized, simply delete it.
+      if (MI == ReMatOrigDefMI && CanDelete) {
+        DEBUG(dbgs() << "\t\t\t\tErasing re-materializable def: "
+                     << *MI << '\n');
+        RemoveMachineInstrFromMaps(MI);
+        vrm.RemoveMachineInstrFromMaps(MI);
+        MI->eraseFromParent();
+        break;
+      }
+
+      // If def for this use can't be rematerialized, then try folding.
+      // If def is rematerializable and it's a load, also try folding.
+      TryFold = !ReMatDefMI || (ReMatDefMI && (MI == ReMatOrigDefMI || isLoad));
+      if (isLoad) {
+        // Try fold loads (from stack slot, constant pool, etc.) into uses.
+        FoldSS = isLoadSS;
+        FoldSlot = LdSlot;
+      }
+    }
+
+    // Scan all of the operands of this instruction rewriting operands
+    // to use NewVReg instead of li.reg as appropriate.  We do this for
+    // two reasons:
+    //
+    //   1. If the instr reads the same spilled vreg multiple times, we
+    //      want to reuse the NewVReg.
+    //   2. If the instr is a two-addr instruction, we are required to
+    //      keep the src/dst regs pinned.
+    //
+    // Keep track of whether we replace a use and/or def so that we can
+    // create the spill interval with the appropriate range. 
+    SmallVector<unsigned, 2> Ops;
+    tie(HasUse, HasDef) = MI->readsWritesVirtualRegister(Reg, &Ops);
+
+    // Create a new virtual register for the spill interval.
+    // Create the new register now so we can map the fold instruction
+    // to the new register so when it is unfolded we get the correct
+    // answer.
+    bool CreatedNewVReg = false;
+    if (NewVReg == 0) {
+      NewVReg = mri_->createVirtualRegister(rc);
+      vrm.grow();
+      CreatedNewVReg = true;
+
+      // The new virtual register should get the same allocation hints as the
+      // old one.
+      std::pair<unsigned, unsigned> Hint = mri_->getRegAllocationHint(Reg);
+      if (Hint.first || Hint.second)
+        mri_->setRegAllocationHint(NewVReg, Hint.first, Hint.second);
+    }
+
+    if (!TryFold)
+      CanFold = false;
+    else {
+      // Do not fold load / store here if we are splitting. We'll find an
+      // optimal point to insert a load / store later.
+      if (!TrySplit) {
+        if (tryFoldMemoryOperand(MI, vrm, ReMatDefMI, index,
+                                 Ops, FoldSS, FoldSlot, NewVReg)) {
+          // Folding the load/store can completely change the instruction in
+          // unpredictable ways, rescan it from the beginning.
+
+          if (FoldSS) {
+            // We need to give the new vreg the same stack slot as the
+            // spilled interval.
+            vrm.assignVirt2StackSlot(NewVReg, FoldSlot);
+          }
+
+          HasUse = false;
+          HasDef = false;
+          CanFold = false;
+          if (isNotInMIMap(MI))
+            break;
+          goto RestartInstruction;
+        }
+      } else {
+        // We'll try to fold it later if it's profitable.
+        CanFold = canFoldMemoryOperand(MI, Ops, DefIsReMat);
+      }
+    }
+
+    mop.setReg(NewVReg);
+    if (mop.isImplicit())
+      rewriteImplicitOps(li, MI, NewVReg, vrm);
+
+    // Reuse NewVReg for other reads.
+    for (unsigned j = 0, e = Ops.size(); j != e; ++j) {
+      MachineOperand &mopj = MI->getOperand(Ops[j]);
+      mopj.setReg(NewVReg);
+      if (mopj.isImplicit())
+        rewriteImplicitOps(li, MI, NewVReg, vrm);
+    }
+
+    if (CreatedNewVReg) {
+      if (DefIsReMat) {
+        vrm.setVirtIsReMaterialized(NewVReg, ReMatDefMI);
+        if (ReMatIds[VNI->id] == VirtRegMap::MAX_STACK_SLOT) {
+          // Each valnum may have its own remat id.
+          ReMatIds[VNI->id] = vrm.assignVirtReMatId(NewVReg);
+        } else {
+          vrm.assignVirtReMatId(NewVReg, ReMatIds[VNI->id]);
+        }
+        if (!CanDelete || (HasUse && HasDef)) {
+          // If this is a two-addr instruction then its use operands are
+          // rematerializable but its def is not. It should be assigned a
+          // stack slot.
+          vrm.assignVirt2StackSlot(NewVReg, Slot);
+        }
+      } else {
+        vrm.assignVirt2StackSlot(NewVReg, Slot);
+      }
+    } else if (HasUse && HasDef &&
+               vrm.getStackSlot(NewVReg) == VirtRegMap::NO_STACK_SLOT) {
+      // If this interval hasn't been assigned a stack slot (because earlier
+      // def is a deleted remat def), do it now.
+      assert(Slot != VirtRegMap::NO_STACK_SLOT);
+      vrm.assignVirt2StackSlot(NewVReg, Slot);
+    }
+
+    // Re-matting an instruction with virtual register use. Add the
+    // register as an implicit use on the use MI.
+    if (DefIsReMat && ImpUse)
+      MI->addOperand(MachineOperand::CreateReg(ImpUse, false, true));
+
+    // Create a new register interval for this spill / remat.
+    LiveInterval &nI = getOrCreateInterval(NewVReg);
+    if (CreatedNewVReg) {
+      NewLIs.push_back(&nI);
+      MBBVRegsMap.insert(std::make_pair(MI->getParent()->getNumber(), NewVReg));
+      if (TrySplit)
+        vrm.setIsSplitFromReg(NewVReg, li.reg);
+    }
+
+    if (HasUse) {
+      if (CreatedNewVReg) {
+        LiveRange LR(index.getLoadIndex(), index.getDefIndex(),
+                     nI.getNextValue(SlotIndex(), 0, false, VNInfoAllocator));
+        DEBUG(dbgs() << " +" << LR);
+        nI.addRange(LR);
+      } else {
+        // Extend the split live interval to this def / use.
+        SlotIndex End = index.getDefIndex();
+        LiveRange LR(nI.ranges[nI.ranges.size()-1].end, End,
+                     nI.getValNumInfo(nI.getNumValNums()-1));
+        DEBUG(dbgs() << " +" << LR);
+        nI.addRange(LR);
+      }
+    }
+    if (HasDef) {
+      LiveRange LR(index.getDefIndex(), index.getStoreIndex(),
+                   nI.getNextValue(SlotIndex(), 0, false, VNInfoAllocator));
+      DEBUG(dbgs() << " +" << LR);
+      nI.addRange(LR);
+    }
+
+    DEBUG({
+        dbgs() << "\t\t\t\tAdded new interval: ";
+        nI.print(dbgs(), tri_);
+        dbgs() << '\n';
+      });
+  }
+  return CanFold;
+}
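+
+/// anyKillInMBBAfterIdx - Return true if the live interval is killed anywhere
+/// in MBB after the given index.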
+bool LiveIntervals::anyKillInMBBAfterIdx(const LiveInterval &li,
+                                   const VNInfo *VNI,
+                                   MachineBasicBlock *MBB,
+                                   SlotIndex Idx) const {
+  return li.killedInRange(Idx.getNextSlot(), getMBBEndIdx(MBB));
+}
+
+/// RewriteInfo - Keep track of machine instrs that will be rewritten
+/// during spilling.
+namespace {
+  struct RewriteInfo {
+    SlotIndex Index;
+    MachineInstr *MI;
+    RewriteInfo(SlotIndex i, MachineInstr *mi) : Index(i), MI(mi) {}
+  };
+
+  struct RewriteInfoCompare {
+    bool operator()(const RewriteInfo &LHS, const RewriteInfo &RHS) const {
+      return LHS.Index < RHS.Index;
+    }
+  };
+}
+
+void LiveIntervals::
+rewriteInstructionsForSpills(const LiveInterval &li, bool TrySplit,
+                    LiveInterval::Ranges::const_iterator &I,
+                    MachineInstr *ReMatOrigDefMI, MachineInstr *ReMatDefMI,
+                    unsigned Slot, int LdSlot,
+                    bool isLoad, bool isLoadSS, bool DefIsReMat, bool CanDelete,
+                    VirtRegMap &vrm,
+                    const TargetRegisterClass* rc,
+                    SmallVector<int, 4> &ReMatIds,
+                    const MachineLoopInfo *loopInfo,
+                    BitVector &SpillMBBs,
+                    DenseMap<unsigned, std::vector<SRInfo> > &SpillIdxes,
+                    BitVector &RestoreMBBs,
+                    DenseMap<unsigned, std::vector<SRInfo> > &RestoreIdxes,
+                    DenseMap<unsigned,unsigned> &MBBVRegsMap,
+                    std::vector<LiveInterval*> &NewLIs) {
+  bool AllCanFold = true;
+  unsigned NewVReg = 0;
+  SlotIndex start = I->start.getBaseIndex();
+  SlotIndex end = I->end.getPrevSlot().getBaseIndex().getNextIndex();
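+  // Round the live range out to instruction boundaries: start is the base
+  // index of the first instruction in the range, end just past the last one.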
+
+  // First collect all the def / use in this live range that will be rewritten.
+  // Make sure they are sorted according to instruction index.
+  std::vector<RewriteInfo> RewriteMIs;
+  for (MachineRegisterInfo::reg_iterator ri = mri_->reg_begin(li.reg),
+         re = mri_->reg_end(); ri != re; ) {
+    MachineInstr *MI = &*ri;
+    MachineOperand &O = ri.getOperand();
+    ++ri;
+    if (MI->isDebugValue()) {
+      // Modify DBG_VALUE now that the value is in a spill slot.
+      if (Slot != VirtRegMap::MAX_STACK_SLOT || isLoadSS) {
+        uint64_t Offset = MI->getOperand(1).getImm();
+        const MDNode *MDPtr = MI->getOperand(2).getMetadata();
+        DebugLoc DL = MI->getDebugLoc();
+        int FI = isLoadSS ? LdSlot : (int)Slot;
+        if (MachineInstr *NewDV = tii_->emitFrameIndexDebugValue(*mf_, FI,
+                                                           Offset, MDPtr, DL)) {
+          DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI);
+          ReplaceMachineInstrInMaps(MI, NewDV);
+          MachineBasicBlock *MBB = MI->getParent();
+          MBB->insert(MBB->erase(MI), NewDV);
+          continue;
+        }
+      }
+
+      DEBUG(dbgs() << "Removing debug info due to spill:" << "\t" << *MI);
+      RemoveMachineInstrFromMaps(MI);
+      vrm.RemoveMachineInstrFromMaps(MI);
+      MI->eraseFromParent();
+      continue;
+    }
+    assert(!(O.isImplicit() && O.isUse()) &&
+           "Spilling register that's used as implicit use?");
+    SlotIndex index = getInstructionIndex(MI);
+    if (index < start || index >= end)
+      continue;
+
+    if (O.isUndef())
+      // Must be defined by an implicit def. It should not be spilled. Note,
+      // this is for correctness reasons. e.g.
+      // 8   %reg1024<def> = IMPLICIT_DEF
+      // 12  %reg1024<def> = INSERT_SUBREG %reg1024<kill>, %reg1025, 2
+      // The live range [12, 14) is not part of the r1024 live interval since
+      // it's defined by an implicit def. It will not conflict with the live
+      // interval of r1025. Now suppose both registers are spilled; you can
+      // easily see a situation where both registers are reloaded before
+      // the INSERT_SUBREG and both target registers would overlap.
+      continue;
+    RewriteMIs.push_back(RewriteInfo(index, MI));
+  }
+  std::sort(RewriteMIs.begin(), RewriteMIs.end(), RewriteInfoCompare());
+
+  unsigned ImpUse = DefIsReMat ? getReMatImplicitUse(li, ReMatDefMI) : 0;
+  // Now rewrite the defs and uses.
+  for (unsigned i = 0, e = RewriteMIs.size(); i != e; ) {
+    RewriteInfo &rwi = RewriteMIs[i];
+    ++i;
+    SlotIndex index = rwi.Index;
+    MachineInstr *MI = rwi.MI;
+    // If MI defs and/or uses the same register multiple times, then there
+    // are multiple entries.
+    while (i != e && RewriteMIs[i].MI == MI) {
+      assert(RewriteMIs[i].Index == index);
+      ++i;
+    }
+    MachineBasicBlock *MBB = MI->getParent();
+
+    if (ImpUse && MI != ReMatDefMI) {
+      // Re-matting an instruction with virtual register use. Prevent interval
+      // from being spilled.
+      getInterval(ImpUse).markNotSpillable();
+    }
+
+    unsigned MBBId = MBB->getNumber();
+    unsigned ThisVReg = 0;
+    if (TrySplit) {
+      DenseMap<unsigned,unsigned>::iterator NVI = MBBVRegsMap.find(MBBId);
+      if (NVI != MBBVRegsMap.end()) {
+        ThisVReg = NVI->second;
+        // One common case:
+        // x = use
+        // ...
+        // ...
+        // def = ...
+        //     = use
+        // It's better to start a new interval to avoid artificially
+        // extending the existing one.
+        if (MI->readsWritesVirtualRegister(li.reg) ==
+            std::make_pair(false,true)) {
+          MBBVRegsMap.erase(MBB->getNumber());
+          ThisVReg = 0;
+        }
+      }
+    }
+
+    bool IsNew = ThisVReg == 0;
+    if (IsNew) {
+      // This ends the previous live interval. If all of its def / use
+      // can be folded, give it a low spill weight.
+      if (NewVReg && TrySplit && AllCanFold) {
+        LiveInterval &nI = getOrCreateInterval(NewVReg);
+        nI.weight /= 10.0F;
+      }
+      AllCanFold = true;
+    }
+    NewVReg = ThisVReg;
+
+    bool HasDef = false;
+    bool HasUse = false;
+    bool CanFold = rewriteInstructionForSpills(li, I->valno, TrySplit,
+                         index, end, MI, ReMatOrigDefMI, ReMatDefMI,
+                         Slot, LdSlot, isLoad, isLoadSS, DefIsReMat,
+                         CanDelete, vrm, rc, ReMatIds, loopInfo, NewVReg,
+                         ImpUse, HasDef, HasUse, MBBVRegsMap, NewLIs);
+    if (!HasDef && !HasUse)
+      continue;
+
+    AllCanFold &= CanFold;
+
+    // Update weight of spill interval.
+    LiveInterval &nI = getOrCreateInterval(NewVReg);
+    if (!TrySplit) {
+      // The spill weight is now infinity as it cannot be spilled again.
+      nI.markNotSpillable();
+      continue;
+    }
+
+    // Keep track of the last def and first use in each MBB.
+    if (HasDef) {
+      if (MI != ReMatOrigDefMI || !CanDelete) {
+        bool HasKill = false;
+        if (!HasUse)
+          HasKill = anyKillInMBBAfterIdx(li, I->valno, MBB, index.getDefIndex());
+        else {
+          // If this is two-address code, then this index starts a new VNInfo.
+          const VNInfo *VNI = li.findDefinedVNInfoForRegInt(index.getDefIndex());
+          if (VNI)
+            HasKill = anyKillInMBBAfterIdx(li, VNI, MBB, index.getDefIndex());
+        }
+        DenseMap<unsigned, std::vector<SRInfo> >::iterator SII =
+          SpillIdxes.find(MBBId);
+        if (!HasKill) {
+          if (SII == SpillIdxes.end()) {
+            std::vector<SRInfo> S;
+            S.push_back(SRInfo(index, NewVReg, true));
+            SpillIdxes.insert(std::make_pair(MBBId, S));
+          } else if (SII->second.back().vreg != NewVReg) {
+            SII->second.push_back(SRInfo(index, NewVReg, true));
+          } else if (index > SII->second.back().index) {
+            // If there is an earlier def and this is a two-address
+            // instruction, then it's not possible to fold the store (which
+            // would also fold the load).
+            SRInfo &Info = SII->second.back();
+            Info.index = index;
+            Info.canFold = !HasUse;
+          }
+          SpillMBBs.set(MBBId);
+        } else if (SII != SpillIdxes.end() &&
+                   SII->second.back().vreg == NewVReg &&
+                   index > SII->second.back().index) {
+          // There is an earlier def that's not killed (must be two-address).
+          // The spill is no longer needed.
+          SII->second.pop_back();
+          if (SII->second.empty()) {
+            SpillIdxes.erase(MBBId);
+            SpillMBBs.reset(MBBId);
+          }
+        }
+      }
+    }
+
+    if (HasUse) {
+      DenseMap<unsigned, std::vector<SRInfo> >::iterator SII =
+        SpillIdxes.find(MBBId);
+      if (SII != SpillIdxes.end() &&
+          SII->second.back().vreg == NewVReg &&
+          index > SII->second.back().index)
+        // There are use(s) following the last def; it's not safe to fold
+        // the spill.
+        SII->second.back().canFold = false;
+      DenseMap<unsigned, std::vector<SRInfo> >::iterator RII =
+        RestoreIdxes.find(MBBId);
+      if (RII != RestoreIdxes.end() && RII->second.back().vreg == NewVReg)
+        // If we are splitting live intervals, only fold if it's the first
+        // use and there isn't another use later in the MBB.
+        RII->second.back().canFold = false;
+      else if (IsNew) {
+        // Only need a reload if there isn't an earlier def / use.
+        if (RII == RestoreIdxes.end()) {
+          std::vector<SRInfo> Infos;
+          Infos.push_back(SRInfo(index, NewVReg, true));
+          RestoreIdxes.insert(std::make_pair(MBBId, Infos));
+        } else {
+          RII->second.push_back(SRInfo(index, NewVReg, true));
+        }
+        RestoreMBBs.set(MBBId);
+      }
+    }
+
+    // Update spill weight.
+    unsigned loopDepth = loopInfo->getLoopDepth(MBB);
+    nI.weight += getSpillWeight(HasDef, HasUse, loopDepth);
+  }
+
+  if (NewVReg && TrySplit && AllCanFold) {
+    // If all of its def / use can be folded, give it a low spill weight.
+    LiveInterval &nI = getOrCreateInterval(NewVReg);
+    nI.weight /= 10.0F;
+  }
+}
+
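+/// alsoFoldARestore - Return true if the restore of vr at the given index in
+/// block Id has been marked as foldable.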
+bool LiveIntervals::alsoFoldARestore(int Id, SlotIndex index,
+                        unsigned vr, BitVector &RestoreMBBs,
+                        DenseMap<unsigned,std::vector<SRInfo> > &RestoreIdxes) {
+  if (!RestoreMBBs[Id])
+    return false;
+  std::vector<SRInfo> &Restores = RestoreIdxes[Id];
+  for (unsigned i = 0, e = Restores.size(); i != e; ++i)
+    if (Restores[i].index == index &&
+        Restores[i].vreg == vr &&
+        Restores[i].canFold)
+      return true;
+  return false;
+}
+
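+/// eraseRestoreInfo - Clear the recorded restore of vr at the given index in
+/// block Id so that no reload is issued for it.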
+void LiveIntervals::eraseRestoreInfo(int Id, SlotIndex index,
+                        unsigned vr, BitVector &RestoreMBBs,
+                        DenseMap<unsigned,std::vector<SRInfo> > &RestoreIdxes) {
+  if (!RestoreMBBs[Id])
+    return;
+  std::vector<SRInfo> &Restores = RestoreIdxes[Id];
+  for (unsigned i = 0, e = Restores.size(); i != e; ++i)
+    if (Restores[i].index == index && Restores[i].vreg == vr)
+      Restores[i].index = SlotIndex();
+}
+
+/// handleSpilledImpDefs - Remove IMPLICIT_DEF instructions which are being
+/// spilled and create empty intervals for their uses.
+void
+LiveIntervals::handleSpilledImpDefs(const LiveInterval &li, VirtRegMap &vrm,
+                                    const TargetRegisterClass* rc,
+                                    std::vector<LiveInterval*> &NewLIs) {
+  for (MachineRegisterInfo::reg_iterator ri = mri_->reg_begin(li.reg),
+         re = mri_->reg_end(); ri != re; ) {
+    MachineOperand &O = ri.getOperand();
+    MachineInstr *MI = &*ri;
+    ++ri;
+    if (MI->isDebugValue()) {
+      // Remove debug info for now.
+      O.setReg(0U);
+      DEBUG(dbgs() << "Removing debug info due to spill:" << "\t" << *MI);
+      continue;
+    }
+    if (O.isDef()) {
+      assert(MI->isImplicitDef() &&
+             "Register def was not rewritten?");
+      RemoveMachineInstrFromMaps(MI);
+      vrm.RemoveMachineInstrFromMaps(MI);
+      MI->eraseFromParent();
+    } else {
+      // This must be a use of an implicit_def so it's not part of the live
+      // interval. Create a new empty live interval for it.
+      // FIXME: Can we simply erase some of the instructions? e.g. Stores?
+      unsigned NewVReg = mri_->createVirtualRegister(rc);
+      vrm.grow();
+      vrm.setIsImplicitlyDefined(NewVReg);
+      NewLIs.push_back(&getOrCreateInterval(NewVReg));
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (MO.isReg() && MO.getReg() == li.reg) {
+          MO.setReg(NewVReg);
+          MO.setIsUndef();
+        }
+      }
+    }
+  }
+}
+
+float
+LiveIntervals::getSpillWeight(bool isDef, bool isUse, unsigned loopDepth) {
+  // Limit the loop depth ridiculousness.
+  if (loopDepth > 200)
+    loopDepth = 200;
+
+  // The loop depth is used to roughly estimate the number of times the
+  // instruction is executed. Something like 10^d is simple, but will quickly
+  // overflow a float. This expression behaves like 10^d for small d, but is
+  // more tempered for large d. At d=200 we get 6.7e33 which leaves a bit of
+  // headroom before overflow.
+  float lc = std::pow(1 + (100.0f / (loopDepth+10)), (float)loopDepth);
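+  // For example: lc is 1 at depth 0, ~10.1 at depth 1, and ~87 at depth 2,
+  // roughly tracking 10^d before tapering off.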
+
+  return (isDef + isUse) * lc;
+}
+
+void
+LiveIntervals::normalizeSpillWeights(std::vector<LiveInterval*> &NewLIs) {
+  for (unsigned i = 0, e = NewLIs.size(); i != e; ++i)
+    normalizeSpillWeight(*NewLIs[i]);
+}
+
+std::vector<LiveInterval*> LiveIntervals::
+addIntervalsForSpills(const LiveInterval &li,
+                      SmallVectorImpl<LiveInterval*> &SpillIs,
+                      const MachineLoopInfo *loopInfo, VirtRegMap &vrm) {
+  assert(li.isSpillable() && "attempt to spill already spilled interval!");
+
+  DEBUG({
+      dbgs() << "\t\t\t\tadding intervals for spills for interval: ";
+      li.print(dbgs(), tri_);
+      dbgs() << '\n';
+    });
+
+  // Each bit specifies whether a spill is required in the corresponding MBB.
+  BitVector SpillMBBs(mf_->getNumBlockIDs());
+  DenseMap<unsigned, std::vector<SRInfo> > SpillIdxes;
+  BitVector RestoreMBBs(mf_->getNumBlockIDs());
+  DenseMap<unsigned, std::vector<SRInfo> > RestoreIdxes;
+  DenseMap<unsigned,unsigned> MBBVRegsMap;
+  std::vector<LiveInterval*> NewLIs;
+  const TargetRegisterClass* rc = mri_->getRegClass(li.reg);
+
+  unsigned NumValNums = li.getNumValNums();
+  SmallVector<MachineInstr*, 4> ReMatDefs;
+  ReMatDefs.resize(NumValNums, NULL);
+  SmallVector<MachineInstr*, 4> ReMatOrigDefs;
+  ReMatOrigDefs.resize(NumValNums, NULL);
+  SmallVector<int, 4> ReMatIds;
+  ReMatIds.resize(NumValNums, VirtRegMap::MAX_STACK_SLOT);
+  BitVector ReMatDelete(NumValNums);
+  unsigned Slot = VirtRegMap::MAX_STACK_SLOT;
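+  // Slot remains MAX_STACK_SLOT until a real stack slot is assigned below.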
+
+  // Spilling a split live interval. It cannot be split any further. Also,
+  // it's guaranteed to be a single val# / range interval.
+  if (vrm.getPreSplitReg(li.reg)) {
+    vrm.setIsSplitFromReg(li.reg, 0);
+    // Unset the split kill marker on the last use.
+    SlotIndex KillIdx = vrm.getKillPoint(li.reg);
+    if (KillIdx != SlotIndex()) {
+      MachineInstr *KillMI = getInstructionFromIndex(KillIdx);
+      assert(KillMI && "Last use disappeared?");
+      int KillOp = KillMI->findRegisterUseOperandIdx(li.reg, true);
+      assert(KillOp != -1 && "Last use disappeared?");
+      KillMI->getOperand(KillOp).setIsKill(false);
+    }
+    vrm.removeKillPoint(li.reg);
+    bool DefIsReMat = vrm.isReMaterialized(li.reg);
+    Slot = vrm.getStackSlot(li.reg);
+    assert(Slot != VirtRegMap::MAX_STACK_SLOT);
+    MachineInstr *ReMatDefMI = DefIsReMat ?
+      vrm.getReMaterializedMI(li.reg) : NULL;
+    int LdSlot = 0;
+    bool isLoadSS = DefIsReMat && tii_->isLoadFromStackSlot(ReMatDefMI, LdSlot);
+    bool isLoad = isLoadSS ||
+      (DefIsReMat && (ReMatDefMI->getDesc().canFoldAsLoad()));
+    bool IsFirstRange = true;
+    for (LiveInterval::Ranges::const_iterator
+           I = li.ranges.begin(), E = li.ranges.end(); I != E; ++I) {
+      // If this is a split live interval with multiple ranges, it means there
+      // are two-address instructions that re-defined the value. Only the
+      // first def can be rematerialized!
+      if (IsFirstRange) {
+        // Note ReMatOrigDefMI has already been deleted.
+        rewriteInstructionsForSpills(li, false, I, NULL, ReMatDefMI,
+                             Slot, LdSlot, isLoad, isLoadSS, DefIsReMat,
+                             false, vrm, rc, ReMatIds, loopInfo,
+                             SpillMBBs, SpillIdxes, RestoreMBBs, RestoreIdxes,
+                             MBBVRegsMap, NewLIs);
+      } else {
+        rewriteInstructionsForSpills(li, false, I, NULL, 0,
+                             Slot, 0, false, false, false,
+                             false, vrm, rc, ReMatIds, loopInfo,
+                             SpillMBBs, SpillIdxes, RestoreMBBs, RestoreIdxes,
+                             MBBVRegsMap, NewLIs);
+      }
+      IsFirstRange = false;
+    }
+
+    handleSpilledImpDefs(li, vrm, rc, NewLIs);
+    normalizeSpillWeights(NewLIs);
+    return NewLIs;
+  }
+
+  bool TrySplit = !intervalIsInOneMBB(li);
+  if (TrySplit)
+    ++numSplits;
+  bool NeedStackSlot = false;
+  for (LiveInterval::const_vni_iterator i = li.vni_begin(), e = li.vni_end();
+       i != e; ++i) {
+    const VNInfo *VNI = *i;
+    unsigned VN = VNI->id;
+    if (VNI->isUnused())
+      continue; // Dead val#.
+    // Is the def for the val# rematerializable?
+    MachineInstr *ReMatDefMI = VNI->isDefAccurate()
+      ? getInstructionFromIndex(VNI->def) : 0;
+    bool dummy;
+    if (ReMatDefMI && isReMaterializable(li, VNI, ReMatDefMI, SpillIs, dummy)) {
+      // Remember how to remat the def of this val#.
+      ReMatOrigDefs[VN] = ReMatDefMI;
+      // Original def may be modified so we have to make a copy here.
+      MachineInstr *Clone = mf_->CloneMachineInstr(ReMatDefMI);
+      CloneMIs.push_back(Clone);
+      ReMatDefs[VN] = Clone;
+
+      bool CanDelete = true;
+      if (VNI->hasPHIKill()) {
+        // A kill is a phi node, so not all of its uses can be rematerialized.
+        // It must not be deleted.
+        CanDelete = false;
+        // Need a stack slot if there is any live range where uses cannot be
+        // rematerialized.
+        NeedStackSlot = true;
+      }
+      if (CanDelete)
+        ReMatDelete.set(VN);
+    } else {
+      // Need a stack slot if there is any live range where uses cannot be
+      // rematerialized.
+      NeedStackSlot = true;
+    }
+  }
+
+  // One stack slot per live interval.
+  if (NeedStackSlot && vrm.getPreSplitReg(li.reg) == 0) {
+    if (vrm.getStackSlot(li.reg) == VirtRegMap::NO_STACK_SLOT)
+      Slot = vrm.assignVirt2StackSlot(li.reg);
+
+    // This case only occurs when the prealloc splitter has already assigned
+    // a stack slot to this vreg.
+    else
+      Slot = vrm.getStackSlot(li.reg);
+  }
+
+  // Create new intervals and rewrite defs and uses.
+  for (LiveInterval::Ranges::const_iterator
+         I = li.ranges.begin(), E = li.ranges.end(); I != E; ++I) {
+    MachineInstr *ReMatDefMI = ReMatDefs[I->valno->id];
+    MachineInstr *ReMatOrigDefMI = ReMatOrigDefs[I->valno->id];
+    bool DefIsReMat = ReMatDefMI != NULL;
+    bool CanDelete = ReMatDelete[I->valno->id];
+    int LdSlot = 0;
+    bool isLoadSS = DefIsReMat && tii_->isLoadFromStackSlot(ReMatDefMI, LdSlot);
+    bool isLoad = isLoadSS ||
+      (DefIsReMat && ReMatDefMI->getDesc().canFoldAsLoad());
+    rewriteInstructionsForSpills(li, TrySplit, I, ReMatOrigDefMI, ReMatDefMI,
+                               Slot, LdSlot, isLoad, isLoadSS, DefIsReMat,
+                               CanDelete, vrm, rc, ReMatIds, loopInfo,
+                               SpillMBBs, SpillIdxes, RestoreMBBs, RestoreIdxes,
+                               MBBVRegsMap, NewLIs);
+  }
+
+  // Insert spills / restores if we are splitting.
+  if (!TrySplit) {
+    handleSpilledImpDefs(li, vrm, rc, NewLIs);
+    normalizeSpillWeights(NewLIs);
+    return NewLIs;
+  }
+
+  SmallPtrSet<LiveInterval*, 4> AddedKill;
+  SmallVector<unsigned, 2> Ops;
+  if (NeedStackSlot) {
+    int Id = SpillMBBs.find_first();
+    while (Id != -1) {
+      std::vector<SRInfo> &spills = SpillIdxes[Id];
+      for (unsigned i = 0, e = spills.size(); i != e; ++i) {
+        SlotIndex index = spills[i].index;
+        unsigned VReg = spills[i].vreg;
+        LiveInterval &nI = getOrCreateInterval(VReg);
+        bool isReMat = vrm.isReMaterialized(VReg);
+        MachineInstr *MI = getInstructionFromIndex(index);
+        bool CanFold = false;
+        bool FoundUse = false;
+        Ops.clear();
+        if (spills[i].canFold) {
+          CanFold = true;
+          for (unsigned j = 0, ee = MI->getNumOperands(); j != ee; ++j) {
+            MachineOperand &MO = MI->getOperand(j);
+            if (!MO.isReg() || MO.getReg() != VReg)
+              continue;
+
+            Ops.push_back(j);
+            if (MO.isDef())
+              continue;
+            if (isReMat || 
+                (!FoundUse && !alsoFoldARestore(Id, index, VReg,
+                                                RestoreMBBs, RestoreIdxes))) {
+              // MI has two-address uses of the same register. If the use
+              // isn't the first and only use in the BB, then we can't fold
+              // it. FIXME: Move this to rewriteInstructionsForSpills.
+              CanFold = false;
+              break;
+            }
+            FoundUse = true;
+          }
+        }
+        // Fold the store into the def if possible.
+        bool Folded = false;
+        if (CanFold && !Ops.empty()) {
+          if (tryFoldMemoryOperand(MI, vrm, NULL, index, Ops, true, Slot,VReg)){
+            Folded = true;
+            if (FoundUse) {
+              // Also folded uses, do not issue a load.
+              eraseRestoreInfo(Id, index, VReg, RestoreMBBs, RestoreIdxes);
+              nI.removeRange(index.getLoadIndex(), index.getDefIndex());
+            }
+            nI.removeRange(index.getDefIndex(), index.getStoreIndex());
+          }
+        }
+
+        // Otherwise tell the spiller to issue a spill.
+        if (!Folded) {
+          LiveRange *LR = &nI.ranges[nI.ranges.size()-1];
+          bool isKill = LR->end == index.getStoreIndex();
+          if (!MI->registerDefIsDead(nI.reg))
+            // No need to spill a dead def.
+            vrm.addSpillPoint(VReg, isKill, MI);
+          if (isKill)
+            AddedKill.insert(&nI);
+        }
+      }
+      Id = SpillMBBs.find_next(Id);
+    }
+  }
+
+  int Id = RestoreMBBs.find_first();
+  while (Id != -1) {
+    std::vector<SRInfo> &restores = RestoreIdxes[Id];
+    for (unsigned i = 0, e = restores.size(); i != e; ++i) {
+      SlotIndex index = restores[i].index;
+      if (index == SlotIndex())
+        continue;
+      unsigned VReg = restores[i].vreg;
+      LiveInterval &nI = getOrCreateInterval(VReg);
+      bool isReMat = vrm.isReMaterialized(VReg);
+      MachineInstr *MI = getInstructionFromIndex(index);
+      bool CanFold = false;
+      Ops.clear();
+      if (restores[i].canFold) {
+        CanFold = true;
+        for (unsigned j = 0, ee = MI->getNumOperands(); j != ee; ++j) {
+          MachineOperand &MO = MI->getOperand(j);
+          if (!MO.isReg() || MO.getReg() != VReg)
+            continue;
+
+          if (MO.isDef()) {
+            // If this restore were to be folded, it would have been folded
+            // already.
+            CanFold = false;
+            break;
+          }
+          Ops.push_back(j);
+        }
+      }
+
+      // Fold the load into the use if possible.
+      bool Folded = false;
+      if (CanFold && !Ops.empty()) {
+        if (!isReMat)
+          Folded = tryFoldMemoryOperand(MI, vrm, NULL,index,Ops,true,Slot,VReg);
+        else {
+          MachineInstr *ReMatDefMI = vrm.getReMaterializedMI(VReg);
+          int LdSlot = 0;
+          bool isLoadSS = tii_->isLoadFromStackSlot(ReMatDefMI, LdSlot);
+          // If the rematerializable def is a load, also try to fold it.
+          if (isLoadSS || ReMatDefMI->getDesc().canFoldAsLoad())
+            Folded = tryFoldMemoryOperand(MI, vrm, ReMatDefMI, index,
+                                          Ops, isLoadSS, LdSlot, VReg);
+          if (!Folded) {
+            unsigned ImpUse = getReMatImplicitUse(li, ReMatDefMI);
+            if (ImpUse) {
+              // Re-matting an instruction with virtual register use. Add the
+              // register as an implicit use on the use MI and mark the register
+              // interval as unspillable.
+              LiveInterval &ImpLi = getInterval(ImpUse);
+              ImpLi.markNotSpillable();
+              MI->addOperand(MachineOperand::CreateReg(ImpUse, false, true));
+            }
+          }
+        }
+      }
+      // If folding is not possible / failed, then tell the spiller to issue a
+      // load / rematerialization for us.
+      if (Folded)
+        nI.removeRange(index.getLoadIndex(), index.getDefIndex());
+      else
+        vrm.addRestorePoint(VReg, MI);
+    }
+    Id = RestoreMBBs.find_next(Id);
+  }
+
+  // Finalize intervals: add kills, finalize spill weights, and filter out
+  // dead intervals.
+  std::vector<LiveInterval*> RetNewLIs;
+  for (unsigned i = 0, e = NewLIs.size(); i != e; ++i) {
+    LiveInterval *LI = NewLIs[i];
+    if (!LI->empty()) {
+      if (!AddedKill.count(LI)) {
+        LiveRange *LR = &LI->ranges[LI->ranges.size()-1];
+        SlotIndex LastUseIdx = LR->end.getBaseIndex();
+        MachineInstr *LastUse = getInstructionFromIndex(LastUseIdx);
+        int UseIdx = LastUse->findRegisterUseOperandIdx(LI->reg, false);
+        assert(UseIdx != -1);
+        if (!LastUse->isRegTiedToDefOperand(UseIdx)) {
+          LastUse->getOperand(UseIdx).setIsKill();
+          vrm.addKillPoint(LI->reg, LastUseIdx);
+        }
+      }
+      RetNewLIs.push_back(LI);
+    }
+  }
+
+  handleSpilledImpDefs(li, vrm, rc, RetNewLIs);
+  normalizeSpillWeights(RetNewLIs);
+  return RetNewLIs;
+}
+
+/// hasAllocatableSuperReg - Return true if the specified physical register has
+/// any super register that's allocatable.
+bool LiveIntervals::hasAllocatableSuperReg(unsigned Reg) const {
+  for (const unsigned* AS = tri_->getSuperRegisters(Reg); *AS; ++AS)
+    if (allocatableRegs_[*AS] && hasInterval(*AS))
+      return true;
+  return false;
+}
+
+/// getRepresentativeReg - Find the largest super register of the specified
+/// physical register.
+unsigned LiveIntervals::getRepresentativeReg(unsigned Reg) const {
+  // Find the largest super-register that is allocatable. 
+  unsigned BestReg = Reg;
+  for (const unsigned* AS = tri_->getSuperRegisters(Reg); *AS; ++AS) {
+    unsigned SuperReg = *AS;
+    if (!hasAllocatableSuperReg(SuperReg) && hasInterval(SuperReg)) {
+      BestReg = SuperReg;
+      break;
+    }
+  }
+  return BestReg;
+}
+
+/// getNumConflictsWithPhysReg - Return the number of uses and defs of the
+/// specified interval that conflicts with the specified physical register.
+unsigned LiveIntervals::getNumConflictsWithPhysReg(const LiveInterval &li,
+                                                   unsigned PhysReg) const {
+  unsigned NumConflicts = 0;
+  const LiveInterval &pli = getInterval(getRepresentativeReg(PhysReg));
+  for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(li.reg),
+         E = mri_->reg_end(); I != E; ++I) {
+    MachineOperand &O = I.getOperand();
+    MachineInstr *MI = O.getParent();
+    if (MI->isDebugValue())
+      continue;
+    SlotIndex Index = getInstructionIndex(MI);
+    if (pli.liveAt(Index))
+      ++NumConflicts;
+  }
+  return NumConflicts;
+}
+
+/// spillPhysRegAroundRegDefsUses - Spill the specified physical register
+/// around all defs and uses of the specified interval. Return true if it
+/// was able to cut its interval.
+bool LiveIntervals::spillPhysRegAroundRegDefsUses(const LiveInterval &li,
+                                            unsigned PhysReg, VirtRegMap &vrm) {
+  unsigned SpillReg = getRepresentativeReg(PhysReg);
+
+  for (const unsigned *AS = tri_->getAliasSet(PhysReg); *AS; ++AS)
+    // If there are registers which alias PhysReg but are not sub-registers
+    // of the chosen representative super register, assert, since we can't
+    // handle them yet.
+    assert(*AS == SpillReg || !allocatableRegs_[*AS] || !hasInterval(*AS) ||
+           tri_->isSuperRegister(*AS, SpillReg));
+
+  bool Cut = false;
+  SmallVector<unsigned, 4> PRegs;
+  if (hasInterval(SpillReg))
+    PRegs.push_back(SpillReg);
+  else {
+    SmallSet<unsigned, 4> Added;
+    for (const unsigned* AS = tri_->getSubRegisters(SpillReg); *AS; ++AS)
+      if (Added.insert(*AS) && hasInterval(*AS)) {
+        PRegs.push_back(*AS);
+        for (const unsigned* ASS = tri_->getSubRegisters(*AS); *ASS; ++ASS)
+          Added.insert(*ASS);
+      }
+  }
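+  // PRegs now holds the representative register itself, or its live
+  // sub-registers when the representative has no interval of its own.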
+
+  SmallPtrSet<MachineInstr*, 8> SeenMIs;
+  for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(li.reg),
+         E = mri_->reg_end(); I != E; ++I) {
+    MachineOperand &O = I.getOperand();
+    MachineInstr *MI = O.getParent();
+    if (MI->isDebugValue() || SeenMIs.count(MI))
+      continue;
+    SeenMIs.insert(MI);
+    SlotIndex Index = getInstructionIndex(MI);
+    for (unsigned i = 0, e = PRegs.size(); i != e; ++i) {
+      unsigned PReg = PRegs[i];
+      LiveInterval &pli = getInterval(PReg);
+      if (!pli.liveAt(Index))
+        continue;
+      vrm.addEmergencySpill(PReg, MI);
+      SlotIndex StartIdx = Index.getLoadIndex();
+      SlotIndex EndIdx = Index.getNextIndex().getBaseIndex();
+      if (pli.isInOneLiveRange(StartIdx, EndIdx)) {
+        pli.removeRange(StartIdx, EndIdx);
+        Cut = true;
+      } else {
+        std::string msg;
+        raw_string_ostream Msg(msg);
+        Msg << "Ran out of registers during register allocation!";
+        report_fatal_error(Msg.str());
+      }
+      for (const unsigned* AS = tri_->getSubRegisters(PReg); *AS; ++AS) {
+        if (!hasInterval(*AS))
+          continue;
+        LiveInterval &spli = getInterval(*AS);
+        if (spli.liveAt(Index))
+          spli.removeRange(Index.getLoadIndex(),
+                           Index.getNextIndex().getBaseIndex());
+      }
+    }
+  }
+  return Cut;
+}
+
+LiveRange LiveIntervals::addLiveRangeToEndOfBlock(unsigned reg,
+                                                  MachineInstr* startInst) {
+  LiveInterval& Interval = getOrCreateInterval(reg);
+  VNInfo* VN = Interval.getNextValue(
+    SlotIndex(getInstructionIndex(startInst).getDefIndex()),
+    startInst, true, getVNInfoAllocator());
+  VN->setHasPHIKill(true);
+  LiveRange LR(
+     SlotIndex(getInstructionIndex(startInst).getDefIndex()),
+     getMBBEndIdx(startInst->getParent()), VN);
+  Interval.addRange(LR);
+  
+  return LR;
+}
+
diff --git a/src/LLVM/lib/CodeGen/LiveStackAnalysis.cpp b/src/LLVM/lib/CodeGen/LiveStackAnalysis.cpp
new file mode 100644
index 0000000..b5c385f
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/LiveStackAnalysis.cpp
@@ -0,0 +1,64 @@
+//===-- LiveStackAnalysis.cpp - Live Stack Slot Analysis ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the live stack slot analysis pass. It is analogous to
+// live interval analysis except it's analyzing liveness of stack slots rather
+// than registers.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "livestacks"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+#include <limits>
+using namespace llvm;
+
+char LiveStacks::ID = 0;
+INITIALIZE_PASS(LiveStacks, "livestacks",
+                "Live Stack Slot Analysis", false, false);
+
+void LiveStacks::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addPreserved<SlotIndexes>();
+  AU.addRequiredTransitive<SlotIndexes>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void LiveStacks::releaseMemory() {
+  // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
+  VNInfoAllocator.Reset();
+  S2IMap.clear();
+  S2RCMap.clear();
+}
+
+bool LiveStacks::runOnMachineFunction(MachineFunction &) {
+  // FIXME: No analysis is being done right now. We are relying on the
+  // register allocators to provide the information.
+  return false;
+}
+
+/// print - Implement the dump method.
+void LiveStacks::print(raw_ostream &OS, const Module*) const {
+
+  OS << "********** INTERVALS **********\n";
+  for (const_iterator I = begin(), E = end(); I != E; ++I) {
+    I->second.print(OS);
+    int Slot = I->first;
+    const TargetRegisterClass *RC = getIntervalRegClass(Slot);
+    if (RC)
+      OS << " [" << RC->getName() << "]\n";
+    else
+      OS << " [Unknown]\n";
+  }
+}
diff --git a/src/LLVM/lib/CodeGen/LiveVariables.cpp b/src/LLVM/lib/CodeGen/LiveVariables.cpp
new file mode 100644
index 0000000..1d935b2
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/LiveVariables.cpp
@@ -0,0 +1,799 @@
+//===-- LiveVariables.cpp - Live Variable Analysis for Machine Code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveVariable analysis pass.  For each machine
+// instruction in the function, this pass calculates the set of registers that
+// are immediately dead after the instruction (i.e., the instruction calculates
+// the value, but it is never used) and the set of registers that are used by
+// the instruction, but are never used after the instruction (i.e., they are
+// killed).
+//
+// This class computes live variables using a sparse implementation based on
+// the machine code SSA form.  This class computes live variable information for
+// each virtual and _register allocatable_ physical register in a function.  It
+// uses the dominance properties of SSA form to efficiently compute live
+// variables for virtual registers, and assumes that physical registers are only
+// live within a single basic block (allowing it to do a single local analysis
+// to resolve physical register lifetimes in each basic block).  If a physical
+// register is not register allocatable, it is not tracked.  This is useful for
+// things like the stack pointer and condition codes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+char LiveVariables::ID = 0;
+INITIALIZE_PASS(LiveVariables, "livevars",
+                "Live Variable Analysis", false, false);
+
+
+void LiveVariables::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequiredID(UnreachableMachineBlockElimID);
+  AU.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineInstr *
+LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const {
+  for (unsigned i = 0, e = Kills.size(); i != e; ++i)
+    if (Kills[i]->getParent() == MBB)
+      return Kills[i];
+  return NULL;
+}
+
+void LiveVariables::VarInfo::dump() const {
+  dbgs() << "  Alive in blocks: ";
+  for (SparseBitVector<>::iterator I = AliveBlocks.begin(),
+           E = AliveBlocks.end(); I != E; ++I)
+    dbgs() << *I << ", ";
+  dbgs() << "\n  Killed by:";
+  if (Kills.empty())
+    dbgs() << " No instructions.\n";
+  else {
+    for (unsigned i = 0, e = Kills.size(); i != e; ++i)
+      dbgs() << "\n    #" << i << ": " << *Kills[i];
+    dbgs() << "\n";
+  }
+}
+
+/// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg.
+LiveVariables::VarInfo &LiveVariables::getVarInfo(unsigned RegIdx) {
+  assert(TargetRegisterInfo::isVirtualRegister(RegIdx) &&
+         "getVarInfo: not a virtual register!");
+  RegIdx -= TargetRegisterInfo::FirstVirtualRegister;
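+  // Grow VirtRegInfo geometrically to amortize the cost of repeated resizing.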
+  if (RegIdx >= VirtRegInfo.size()) {
+    if (RegIdx >= 2*VirtRegInfo.size())
+      VirtRegInfo.resize(RegIdx*2);
+    else
+      VirtRegInfo.resize(2*VirtRegInfo.size());
+  }
+  return VirtRegInfo[RegIdx];
+}
+
+void LiveVariables::MarkVirtRegAliveInBlock(VarInfo& VRInfo,
+                                            MachineBasicBlock *DefBlock,
+                                            MachineBasicBlock *MBB,
+                                    std::vector<MachineBasicBlock*> &WorkList) {
+  unsigned BBNum = MBB->getNumber();
+  
+  // Check to see if this basic block is one of the killing blocks.  If so,
+  // remove it.
+  for (unsigned i = 0, e = VRInfo.Kills.size(); i != e; ++i)
+    if (VRInfo.Kills[i]->getParent() == MBB) {
+      VRInfo.Kills.erase(VRInfo.Kills.begin()+i);  // Erase entry
+      break;
+    }
+  
+  if (MBB == DefBlock) return;  // Terminate recursion
+
+  if (VRInfo.AliveBlocks.test(BBNum))
+    return;  // We already know the block is live
+
+  // Mark the variable known alive in this bb
+  VRInfo.AliveBlocks.set(BBNum);
+
+  for (MachineBasicBlock::const_pred_reverse_iterator PI = MBB->pred_rbegin(),
+         E = MBB->pred_rend(); PI != E; ++PI)
+    WorkList.push_back(*PI);
+}
+
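+/// MarkVirtRegAliveInBlock - Worklist-driven wrapper around the version
+/// above: marks the register alive in MBB and in all predecessors up to the
+/// defining block without unbounded recursion.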
+void LiveVariables::MarkVirtRegAliveInBlock(VarInfo &VRInfo,
+                                            MachineBasicBlock *DefBlock,
+                                            MachineBasicBlock *MBB) {
+  std::vector<MachineBasicBlock*> WorkList;
+  MarkVirtRegAliveInBlock(VRInfo, DefBlock, MBB, WorkList);
+
+  while (!WorkList.empty()) {
+    MachineBasicBlock *Pred = WorkList.back();
+    WorkList.pop_back();
+    MarkVirtRegAliveInBlock(VRInfo, DefBlock, Pred, WorkList);
+  }
+}
+
+void LiveVariables::HandleVirtRegUse(unsigned reg, MachineBasicBlock *MBB,
+                                     MachineInstr *MI) {
+  assert(MRI->getVRegDef(reg) && "Register use before def!");
+
+  unsigned BBNum = MBB->getNumber();
+
+  VarInfo& VRInfo = getVarInfo(reg);
+  VRInfo.NumUses++;
+
+  // Check to see if this basic block is already a kill block.
+  if (!VRInfo.Kills.empty() && VRInfo.Kills.back()->getParent() == MBB) {
+    // Yes, this register is killed in this basic block already. Increase the
+    // live range by updating the kill instruction.
+    VRInfo.Kills.back() = MI;
+    return;
+  }
+
+#ifndef NDEBUG
+  for (unsigned i = 0, e = VRInfo.Kills.size(); i != e; ++i)
+    assert(VRInfo.Kills[i]->getParent() != MBB && "entry should be at end!");
+#endif
+
+  // This situation can occur:
+  //
+  //     ,------.
+  //     |      |
+  //     |      v
+  //     |   t2 = phi ... t1 ...
+  //     |      |
+  //     |      v
+  //     |   t1 = ...
+  //     |  ... = ... t1 ...
+  //     |      |
+  //     `------'
+  //
+  // where there is a use in a PHI node that's a predecessor to the defining
+  // block. We don't want to mark all predecessors as having the value "alive"
+  // in this case.
+  if (MBB == MRI->getVRegDef(reg)->getParent()) return;
+
+  // Add a new kill entry for this basic block. If this virtual register is
+  // already marked as alive in this basic block, that means it is alive in at
+  // least one of the successor blocks, it's not a kill.
+  if (!VRInfo.AliveBlocks.test(BBNum))
+    VRInfo.Kills.push_back(MI);
+
+  // Update all dominating blocks to mark them as "known live".
+  for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
+         E = MBB->pred_end(); PI != E; ++PI)
+    MarkVirtRegAliveInBlock(VRInfo, MRI->getVRegDef(reg)->getParent(), *PI);
+}
+
+void LiveVariables::HandleVirtRegDef(unsigned Reg, MachineInstr *MI) {
+  VarInfo &VRInfo = getVarInfo(Reg);
+
+  if (VRInfo.AliveBlocks.empty())
+    // If vr is not alive in any block, then defaults to dead.
+    VRInfo.Kills.push_back(MI);
+}
+
+/// FindLastPartialDef - Return the last partial def of the specified register.
+/// Also returns the sub-registers that are defined by the instruction.
+MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg,
+                                            SmallSet<unsigned,4> &PartDefRegs) {
+  unsigned LastDefReg = 0;
+  unsigned LastDefDist = 0;
+  MachineInstr *LastDef = NULL;
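+  // Walk the null-terminated sub-register list; SubReg is declared in the
+  // loop condition, so the loop ends when *SubRegs is 0.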
+  for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+       unsigned SubReg = *SubRegs; ++SubRegs) {
+    MachineInstr *Def = PhysRegDef[SubReg];
+    if (!Def)
+      continue;
+    unsigned Dist = DistanceMap[Def];
+    if (Dist > LastDefDist) {
+      LastDefReg  = SubReg;
+      LastDef     = Def;
+      LastDefDist = Dist;
+    }
+  }
+
+  if (!LastDef)
+    return 0;
+
+  PartDefRegs.insert(LastDefReg);
+  for (unsigned i = 0, e = LastDef->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = LastDef->getOperand(i);
+    if (!MO.isReg() || !MO.isDef() || MO.getReg() == 0)
+      continue;
+    unsigned DefReg = MO.getReg();
+    if (TRI->isSubRegister(Reg, DefReg)) {
+      PartDefRegs.insert(DefReg);
+      for (const unsigned *SubRegs = TRI->getSubRegisters(DefReg);
+           unsigned SubReg = *SubRegs; ++SubRegs)
+        PartDefRegs.insert(SubReg);
+    }
+  }
+  return LastDef;
+}
+
+/// HandlePhysRegUse - Turn previous partial def's into read/mod/writes. Add
+/// implicit defs to a machine instruction if there was an earlier def of its
+/// super-register.
+void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) {
+  MachineInstr *LastDef = PhysRegDef[Reg];
+  // If there was a previous use or a "full" def all is well.
+  if (!LastDef && !PhysRegUse[Reg]) {
+    // Otherwise, the last sub-register def implicitly defines this register.
+    // e.g.
+    // AH =
+    // AL = ... <imp-def EAX>, <imp-kill AH>
+    //    = AH
+    // ...
+    //    = EAX
+    // All of the sub-registers must have been defined before the use of Reg!
+    SmallSet<unsigned, 4> PartDefRegs;
+    MachineInstr *LastPartialDef = FindLastPartialDef(Reg, PartDefRegs);
+    // If LastPartialDef is NULL, it must be using a livein register.
+    if (LastPartialDef) {
+      LastPartialDef->addOperand(MachineOperand::CreateReg(Reg, true/*IsDef*/,
+                                                           true/*IsImp*/));
+      PhysRegDef[Reg] = LastPartialDef;
+      SmallSet<unsigned, 8> Processed;
+      for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+           unsigned SubReg = *SubRegs; ++SubRegs) {
+        if (Processed.count(SubReg))
+          continue;
+        if (PartDefRegs.count(SubReg))
+          continue;
+        // This part of Reg was defined before the last partial def. It's killed
+        // here.
+        LastPartialDef->addOperand(MachineOperand::CreateReg(SubReg,
+                                                             false/*IsDef*/,
+                                                             true/*IsImp*/));
+        PhysRegDef[SubReg] = LastPartialDef;
+        for (const unsigned *SS = TRI->getSubRegisters(SubReg); *SS; ++SS)
+          Processed.insert(*SS);
+      }
+    }
+  }
+  else if (LastDef && !PhysRegUse[Reg] &&
+           !LastDef->findRegisterDefOperand(Reg))
+    // Last def defines the super register, add an implicit def of reg.
+    LastDef->addOperand(MachineOperand::CreateReg(Reg,
+                                                 true/*IsDef*/, true/*IsImp*/));
+
+  // Remember this use.
+  PhysRegUse[Reg]  = MI;
+  for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+       unsigned SubReg = *SubRegs; ++SubRegs)
+    PhysRegUse[SubReg] = MI;
+}
+
+/// FindLastRefOrPartRef - Return the last reference or partial reference of
+/// the specified register.
+MachineInstr *LiveVariables::FindLastRefOrPartRef(unsigned Reg) {
+  MachineInstr *LastDef = PhysRegDef[Reg];
+  MachineInstr *LastUse = PhysRegUse[Reg];
+  if (!LastDef && !LastUse)
+    return 0;
+
+  MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef;
+  unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef];
+  unsigned LastPartDefDist = 0;
+  for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+       unsigned SubReg = *SubRegs; ++SubRegs) {
+    MachineInstr *Def = PhysRegDef[SubReg];
+    if (Def && Def != LastDef) {
+      // There was a def of this sub-register in between. This is a partial
+      // def, keep track of the last one.
+      unsigned Dist = DistanceMap[Def];
+      if (Dist > LastPartDefDist)
+        LastPartDefDist = Dist;
+    } else if (MachineInstr *Use = PhysRegUse[SubReg]) {
+      unsigned Dist = DistanceMap[Use];
+      if (Dist > LastRefOrPartRefDist) {
+        LastRefOrPartRefDist = Dist;
+        LastRefOrPartRef = Use;
+      }
+    }
+  }
+
+  return LastRefOrPartRef;
+}
+
+bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) {
+  MachineInstr *LastDef = PhysRegDef[Reg];
+  MachineInstr *LastUse = PhysRegUse[Reg];
+  if (!LastDef && !LastUse)
+    return false;
+
+  MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef;
+  unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef];
+  // The whole register is used.
+  // AL =
+  // AH =
+  //
+  //    = AX
+  //    = AL, AX<imp-use, kill>
+  // AX =
+  //
+  // Or whole register is defined, but not used at all.
+  // AX<dead> =
+  // ...
+  // AX =
+  //
+  // Or whole register is defined, but only partly used.
+  // AX<dead> = AL<imp-def>
+  //    = AL<kill>
+  // AX = 
+  MachineInstr *LastPartDef = 0;
+  unsigned LastPartDefDist = 0;
+  SmallSet<unsigned, 8> PartUses;
+  for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+       unsigned SubReg = *SubRegs; ++SubRegs) {
+    MachineInstr *Def = PhysRegDef[SubReg];
+    if (Def && Def != LastDef) {
+      // There was a def of this sub-register in between. This is a partial
+      // def, keep track of the last one.
+      unsigned Dist = DistanceMap[Def];
+      if (Dist > LastPartDefDist) {
+        LastPartDefDist = Dist;
+        LastPartDef = Def;
+      }
+      continue;
+    }
+    if (MachineInstr *Use = PhysRegUse[SubReg]) {
+      PartUses.insert(SubReg);
+      for (const unsigned *SS = TRI->getSubRegisters(SubReg); *SS; ++SS)
+        PartUses.insert(*SS);
+      unsigned Dist = DistanceMap[Use];
+      if (Dist > LastRefOrPartRefDist) {
+        LastRefOrPartRefDist = Dist;
+        LastRefOrPartRef = Use;
+      }
+    }
+  }
+
+  if (!PhysRegUse[Reg]) {
+    // Partial uses. Mark register def dead and add implicit def of
+    // sub-registers which are used.
+    // EAX<dead>  = op  AL<imp-def>
+    // That is, the EAX def is dead but the AL def extends past it.
+    PhysRegDef[Reg]->addRegisterDead(Reg, TRI, true);
+    for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+         unsigned SubReg = *SubRegs; ++SubRegs) {
+      if (!PartUses.count(SubReg))
+        continue;
+      bool NeedDef = true;
+      if (PhysRegDef[Reg] == PhysRegDef[SubReg]) {
+        MachineOperand *MO = PhysRegDef[Reg]->findRegisterDefOperand(SubReg);
+        if (MO) {
+          NeedDef = false;
+          assert(!MO->isDead());
+        }
+      }
+      if (NeedDef)
+        PhysRegDef[Reg]->addOperand(MachineOperand::CreateReg(SubReg,
+                                                 true/*IsDef*/, true/*IsImp*/));
+      MachineInstr *LastSubRef = FindLastRefOrPartRef(SubReg);
+      if (LastSubRef)
+        LastSubRef->addRegisterKilled(SubReg, TRI, true);
+      else {
+        LastRefOrPartRef->addRegisterKilled(SubReg, TRI, true);
+        PhysRegUse[SubReg] = LastRefOrPartRef;
+        for (const unsigned *SSRegs = TRI->getSubRegisters(SubReg);
+             unsigned SSReg = *SSRegs; ++SSRegs)
+          PhysRegUse[SSReg] = LastRefOrPartRef;
+      }
+      for (const unsigned *SS = TRI->getSubRegisters(SubReg); *SS; ++SS)
+        PartUses.erase(*SS);
+    }
+  } else if (LastRefOrPartRef == PhysRegDef[Reg] && LastRefOrPartRef != MI) {
+    if (LastPartDef)
+      // The last partial def kills the register.
+      LastPartDef->addOperand(MachineOperand::CreateReg(Reg, false/*IsDef*/,
+                                                true/*IsImp*/, true/*IsKill*/));
+    else {
+      MachineOperand *MO =
+        LastRefOrPartRef->findRegisterDefOperand(Reg, false, TRI);
+      bool NeedEC = MO->isEarlyClobber() && MO->getReg() != Reg;
+      // If the last reference is the last def, then it's not used at all.
+      // That is, unless we are currently processing the last reference itself.
+      LastRefOrPartRef->addRegisterDead(Reg, TRI, true);
+      if (NeedEC) {
+        // If we are adding a subreg def and the superreg def is marked early
+        // clobber, add an early clobber marker to the subreg def.
+        MO = LastRefOrPartRef->findRegisterDefOperand(Reg);
+        if (MO)
+          MO->setIsEarlyClobber();
+      }
+    }
+  } else
+    LastRefOrPartRef->addRegisterKilled(Reg, TRI, true);
+  return true;
+}
+
+void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI,
+                                     SmallVector<unsigned, 4> &Defs) {
+  // What parts of the register are previously defined?
+  SmallSet<unsigned, 32> Live;
+  if (PhysRegDef[Reg] || PhysRegUse[Reg]) {
+    Live.insert(Reg);
+    for (const unsigned *SS = TRI->getSubRegisters(Reg); *SS; ++SS)
+      Live.insert(*SS);
+  } else {
+    for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+         unsigned SubReg = *SubRegs; ++SubRegs) {
+      // If a register isn't itself defined, but all of the parts that make
+      // it up are defined, then consider it defined as well.
+      // e.g.
+      // AL =
+      // AH =
+      //    = AX
+      if (Live.count(SubReg))
+        continue;
+      if (PhysRegDef[SubReg] || PhysRegUse[SubReg]) {
+        Live.insert(SubReg);
+        for (const unsigned *SS = TRI->getSubRegisters(SubReg); *SS; ++SS)
+          Live.insert(*SS);
+      }
+    }
+  }
+
+  // Start from the largest piece, find the last time any part of the register
+  // is referenced.
+  HandlePhysRegKill(Reg, MI);
+  // Only some of the sub-registers are used.
+  for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+       unsigned SubReg = *SubRegs; ++SubRegs) {
+    if (!Live.count(SubReg))
+      // Skip if this sub-register isn't defined.
+      continue;
+    HandlePhysRegKill(SubReg, MI);
+  }
+
+  if (MI)
+    Defs.push_back(Reg);  // Remember this def.
+}
+
+void LiveVariables::UpdatePhysRegDefs(MachineInstr *MI,
+                                      SmallVector<unsigned, 4> &Defs) {
+  while (!Defs.empty()) {
+    unsigned Reg = Defs.back();
+    Defs.pop_back();
+    PhysRegDef[Reg]  = MI;
+    PhysRegUse[Reg]  = NULL;
+    for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+         unsigned SubReg = *SubRegs; ++SubRegs) {
+      PhysRegDef[SubReg]  = MI;
+      PhysRegUse[SubReg]  = NULL;
+    }
+  }
+}
+
+namespace {
+  struct RegSorter {
+    const TargetRegisterInfo *TRI;
+
+    RegSorter(const TargetRegisterInfo *tri) : TRI(tri) { }
+    bool operator()(unsigned A, unsigned B) {
+      if (TRI->isSubRegister(A, B))
+        return true;
+      else if (TRI->isSubRegister(B, A))
+        return false;
+      return A < B;
+    }
+  };
+}
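+// Note: RegSorter is not referenced in the code shown here. As a sketch (an
+// assumption, not something this drop does), it could be used to order a
+// register list so a super-register sorts ahead of its sub-registers:
+//   std::sort(Regs.begin(), Regs.end(), RegSorter(TRI));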
+
+bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+  MRI = &mf.getRegInfo();
+  TRI = MF->getTarget().getRegisterInfo();
+
+  ReservedRegisters = TRI->getReservedRegs(mf);
+
+  unsigned NumRegs = TRI->getNumRegs();
+  PhysRegDef  = new MachineInstr*[NumRegs];
+  PhysRegUse  = new MachineInstr*[NumRegs];
+  PHIVarInfo = new SmallVector<unsigned, 4>[MF->getNumBlockIDs()];
+  std::fill(PhysRegDef,  PhysRegDef  + NumRegs, (MachineInstr*)0);
+  std::fill(PhysRegUse,  PhysRegUse  + NumRegs, (MachineInstr*)0);
+  PHIJoins.clear();
+
+  // Get some space for a respectable number of registers.
+  VirtRegInfo.resize(64);
+
+  analyzePHINodes(mf);
+
+  // Calculate live variable information in depth first order on the CFG of the
+  // function.  This guarantees that we will see the definition of a virtual
+  // register before its uses due to dominance properties of SSA (except for PHI
+  // nodes, which are treated as a special case).
+  MachineBasicBlock *Entry = MF->begin();
+  SmallPtrSet<MachineBasicBlock*,16> Visited;
+
+  for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*,16> >
+         DFI = df_ext_begin(Entry, Visited), E = df_ext_end(Entry, Visited);
+       DFI != E; ++DFI) {
+    MachineBasicBlock *MBB = *DFI;
+
+    // Mark live-in registers as live-in.
+    SmallVector<unsigned, 4> Defs;
+    for (MachineBasicBlock::livein_iterator II = MBB->livein_begin(),
+           EE = MBB->livein_end(); II != EE; ++II) {
+      assert(TargetRegisterInfo::isPhysicalRegister(*II) &&
+             "Cannot have a live-in virtual register!");
+      HandlePhysRegDef(*II, 0, Defs);
+    }
+
+    // Loop over all of the instructions, processing them.
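+    // DistanceMap assigns each instruction its ordinal position within the
+    // block; HandlePhysRegKill uses these distances to decide which of two
+    // references to a register comes last.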
+    DistanceMap.clear();
+    unsigned Dist = 0;
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+         I != E; ++I) {
+      MachineInstr *MI = I;
+      if (MI->isDebugValue())
+        continue;
+      DistanceMap.insert(std::make_pair(MI, Dist++));
+
+      // Process all of the operands of the instruction...
+      unsigned NumOperandsToProcess = MI->getNumOperands();
+
+      // Unless it is a PHI node.  In this case, ONLY process the DEF, not any
+      // of the uses.  They will be handled in other basic blocks.
+      if (MI->isPHI())
+        NumOperandsToProcess = 1;
+
+      // Clear kill and dead markers. LV will recompute them.
+      SmallVector<unsigned, 4> UseRegs;
+      SmallVector<unsigned, 4> DefRegs;
+      for (unsigned i = 0; i != NumOperandsToProcess; ++i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (!MO.isReg() || MO.getReg() == 0)
+          continue;
+        unsigned MOReg = MO.getReg();
+        if (MO.isUse()) {
+          MO.setIsKill(false);
+          UseRegs.push_back(MOReg);
+        } else /*MO.isDef()*/ {
+          MO.setIsDead(false);
+          DefRegs.push_back(MOReg);
+        }
+      }
+
+      // Process all uses.
+      for (unsigned i = 0, e = UseRegs.size(); i != e; ++i) {
+        unsigned MOReg = UseRegs[i];
+        if (TargetRegisterInfo::isVirtualRegister(MOReg))
+          HandleVirtRegUse(MOReg, MBB, MI);
+        else if (!ReservedRegisters[MOReg])
+          HandlePhysRegUse(MOReg, MI);
+      }
+
+      // Process all defs.
+      for (unsigned i = 0, e = DefRegs.size(); i != e; ++i) {
+        unsigned MOReg = DefRegs[i];
+        if (TargetRegisterInfo::isVirtualRegister(MOReg))
+          HandleVirtRegDef(MOReg, MI);
+        else if (!ReservedRegisters[MOReg])
+          HandlePhysRegDef(MOReg, MI, Defs);
+      }
+      UpdatePhysRegDefs(MI, Defs);
+    }
+
+    // Handle any virtual assignments from PHI nodes which might be at the
+    // bottom of this basic block.  We check all of our successor blocks to see
+    // if they have PHI nodes, and if so, we simulate an assignment at the end
+    // of the current block.
+    if (!PHIVarInfo[MBB->getNumber()].empty()) {
+      SmallVector<unsigned, 4>& VarInfoVec = PHIVarInfo[MBB->getNumber()];
+
+      for (SmallVector<unsigned, 4>::iterator I = VarInfoVec.begin(),
+             E = VarInfoVec.end(); I != E; ++I)
+        // Mark it alive only in the block we are representing.
+        MarkVirtRegAliveInBlock(getVarInfo(*I),MRI->getVRegDef(*I)->getParent(),
+                                MBB);
+    }
+
+    // Finally, if the last instruction in the block is a return, make sure to
+    // mark it as using all of the live-out values in the function.
+    // Things marked both call and return are tail calls; do not do this for
+    // them.  The tail callee need not take the same registers as input
+    // that it produces as output, and there are dependencies for its input
+    // registers elsewhere.
+    if (!MBB->empty() && MBB->back().getDesc().isReturn()
+        && !MBB->back().getDesc().isCall()) {
+      MachineInstr *Ret = &MBB->back();
+
+      for (MachineRegisterInfo::liveout_iterator
+           I = MF->getRegInfo().liveout_begin(),
+           E = MF->getRegInfo().liveout_end(); I != E; ++I) {
+        assert(TargetRegisterInfo::isPhysicalRegister(*I) &&
+               "Cannot have a live-out virtual register!");
+        HandlePhysRegUse(*I, Ret);
+
+        // Add live-out registers as implicit uses.
+        if (!Ret->readsRegister(*I))
+          Ret->addOperand(MachineOperand::CreateReg(*I, false, true));
+      }
+    }
+
+    // Loop over PhysRegDef / PhysRegUse, killing any registers that are
+    // available at the end of the basic block.
+    for (unsigned i = 0; i != NumRegs; ++i)
+      if (PhysRegDef[i] || PhysRegUse[i])
+        HandlePhysRegDef(i, 0, Defs);
+
+    std::fill(PhysRegDef,  PhysRegDef  + NumRegs, (MachineInstr*)0);
+    std::fill(PhysRegUse,  PhysRegUse  + NumRegs, (MachineInstr*)0);
+  }
+
+  // Convert and transfer the dead / killed information we have gathered into
+  // VirtRegInfo onto MI's.
+  for (unsigned i = 0, e1 = VirtRegInfo.size(); i != e1; ++i)
+    for (unsigned j = 0, e2 = VirtRegInfo[i].Kills.size(); j != e2; ++j)
+      if (VirtRegInfo[i].Kills[j] ==
+          MRI->getVRegDef(i + TargetRegisterInfo::FirstVirtualRegister))
+        VirtRegInfo[i]
+          .Kills[j]->addRegisterDead(i +
+                                     TargetRegisterInfo::FirstVirtualRegister,
+                                     TRI);
+      else
+        VirtRegInfo[i]
+          .Kills[j]->addRegisterKilled(i +
+                                       TargetRegisterInfo::FirstVirtualRegister,
+                                       TRI);
+
+  // Check to make sure there are no unreachable blocks in the MC CFG for the
+  // function.  If so, it is due to a bug in the instruction selector or some
+  // other part of the code generator if this happens.
+#ifndef NDEBUG
+  for(MachineFunction::iterator i = MF->begin(), e = MF->end(); i != e; ++i)
+    assert(Visited.count(&*i) != 0 && "unreachable basic block found");
+#endif
+
+  delete[] PhysRegDef;
+  delete[] PhysRegUse;
+  delete[] PHIVarInfo;
+
+  return false;
+}
+
+/// replaceKillInstruction - Update register kill info by replacing a kill
+/// instruction with a new one.
+void LiveVariables::replaceKillInstruction(unsigned Reg, MachineInstr *OldMI,
+                                           MachineInstr *NewMI) {
+  VarInfo &VI = getVarInfo(Reg);
+  std::replace(VI.Kills.begin(), VI.Kills.end(), OldMI, NewMI);
+}
+
+/// removeVirtualRegistersKilled - Remove all killed info for the specified
+/// instruction.
+void LiveVariables::removeVirtualRegistersKilled(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isKill()) {
+      MO.setIsKill(false);
+      unsigned Reg = MO.getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        bool removed = getVarInfo(Reg).removeKill(MI);
+        assert(removed && "kill not in register's VarInfo?");
+        (void)removed;  // Silence unused-variable warning in NDEBUG builds.
+      }
+    }
+  }
+}
+
+/// analyzePHINodes - Gather information about the PHI nodes in here. In
+/// particular, we want to map the variable information of a virtual register
+/// which is used in a PHI node. We map that to the BB the vreg is coming from.
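+/// PHI operands follow the initial def as (value register, predecessor MBB)
+/// pairs, which is why the loop below starts at operand 1 and steps by 2.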
+///
+void LiveVariables::analyzePHINodes(const MachineFunction& Fn) {
+  for (MachineFunction::const_iterator I = Fn.begin(), E = Fn.end();
+       I != E; ++I)
+    for (MachineBasicBlock::const_iterator BBI = I->begin(), BBE = I->end();
+         BBI != BBE && BBI->isPHI(); ++BBI)
+      for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
+        PHIVarInfo[BBI->getOperand(i + 1).getMBB()->getNumber()]
+          .push_back(BBI->getOperand(i).getReg());
+}
+
+bool LiveVariables::VarInfo::isLiveIn(const MachineBasicBlock &MBB,
+                                      unsigned Reg,
+                                      MachineRegisterInfo &MRI) {
+  unsigned Num = MBB.getNumber();
+
+  // Reg is live-through.
+  if (AliveBlocks.test(Num))
+    return true;
+
+  // Registers defined in MBB cannot be live in.
+  const MachineInstr *Def = MRI.getVRegDef(Reg);
+  if (Def && Def->getParent() == &MBB)
+    return false;
+
+  // Reg was not defined in MBB, was it killed here?
+  return findKill(&MBB);
+}
+
+bool LiveVariables::isLiveOut(unsigned Reg, const MachineBasicBlock &MBB) {
+  LiveVariables::VarInfo &VI = getVarInfo(Reg);
+
+  // Loop over all of the successors of the basic block, checking to see if
+  // the value is either live in the block, or if it is killed in the block.
+  std::vector<MachineBasicBlock*> OpSuccBlocks;
+  for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
+         E = MBB.succ_end(); SI != E; ++SI) {
+    MachineBasicBlock *SuccMBB = *SI;
+
+    // Is it alive in this successor?
+    unsigned SuccIdx = SuccMBB->getNumber();
+    if (VI.AliveBlocks.test(SuccIdx))
+      return true;
+    OpSuccBlocks.push_back(SuccMBB);
+  }
+
+  // Check to see if this value is live because there is a use in a successor
+  // that kills it.
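+  // The one- and two-successor cases are handled with direct comparisons;
+  // beyond that, sort the successor list once and binary-search each kill's
+  // parent block.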
+  switch (OpSuccBlocks.size()) {
+  case 1: {
+    MachineBasicBlock *SuccMBB = OpSuccBlocks[0];
+    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
+      if (VI.Kills[i]->getParent() == SuccMBB)
+        return true;
+    break;
+  }
+  case 2: {
+    MachineBasicBlock *SuccMBB1 = OpSuccBlocks[0], *SuccMBB2 = OpSuccBlocks[1];
+    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
+      if (VI.Kills[i]->getParent() == SuccMBB1 ||
+          VI.Kills[i]->getParent() == SuccMBB2)
+        return true;
+    break;
+  }
+  default:
+    std::sort(OpSuccBlocks.begin(), OpSuccBlocks.end());
+    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
+      if (std::binary_search(OpSuccBlocks.begin(), OpSuccBlocks.end(),
+                             VI.Kills[i]->getParent()))
+        return true;
+  }
+  return false;
+}
+
+/// addNewBlock - Add a new basic block BB as an empty successor to DomBB. All
+/// variables that are live out of DomBB will be marked as passing live through
+/// BB.
+void LiveVariables::addNewBlock(MachineBasicBlock *BB,
+                                MachineBasicBlock *DomBB,
+                                MachineBasicBlock *SuccBB) {
+  const unsigned NumNew = BB->getNumber();
+
+  // All registers used by PHI nodes in SuccBB must be live through BB.
+  for (MachineBasicBlock::const_iterator BBI = SuccBB->begin(),
+         BBE = SuccBB->end(); BBI != BBE && BBI->isPHI(); ++BBI)
+    for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
+      if (BBI->getOperand(i+1).getMBB() == BB)
+        getVarInfo(BBI->getOperand(i).getReg()).AliveBlocks.set(NumNew);
+
+  // Update info for all live variables
+  for (unsigned Reg = TargetRegisterInfo::FirstVirtualRegister,
+         E = MRI->getLastVirtReg()+1; Reg != E; ++Reg) {
+    VarInfo &VI = getVarInfo(Reg);
+    if (!VI.AliveBlocks.test(NumNew) && VI.isLiveIn(*SuccBB, Reg, *MRI))
+      VI.AliveBlocks.set(NumNew);
+  }
+}
diff --git a/src/LLVM/lib/CodeGen/LowerSubregs.cpp b/src/LLVM/lib/CodeGen/LowerSubregs.cpp
new file mode 100644
index 0000000..a1f1a07
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/LowerSubregs.cpp
@@ -0,0 +1,243 @@
+//===-- LowerSubregs.cpp - Subregister Lowering instruction pass ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a MachineFunction pass which runs after register
+// allocation that turns subreg insert/extract instructions into register
+// copies, as needed. This ensures correct codegen even if the coalescer
+// isn't able to remove all subreg instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lowersubregs"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+  struct LowerSubregsInstructionPass : public MachineFunctionPass {
+  private:
+    const TargetRegisterInfo *TRI;
+    const TargetInstrInfo *TII;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    LowerSubregsInstructionPass() : MachineFunctionPass(ID) {}
+    
+    const char *getPassName() const {
+      return "Subregister lowering instruction pass";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addPreservedID(MachineLoopInfoID);
+      AU.addPreservedID(MachineDominatorsID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    /// runOnMachineFunction - pass entry point
+    bool runOnMachineFunction(MachineFunction&);
+
+  private:
+    bool LowerSubregToReg(MachineInstr *MI);
+    bool LowerCopy(MachineInstr *MI);
+
+    void TransferDeadFlag(MachineInstr *MI, unsigned DstReg,
+                          const TargetRegisterInfo *TRI);
+    void TransferKillFlag(MachineInstr *MI, unsigned SrcReg,
+                          const TargetRegisterInfo *TRI,
+                          bool AddIfNotFound = false);
+    void TransferImplicitDefs(MachineInstr *MI);
+  };
+
+  char LowerSubregsInstructionPass::ID = 0;
+}
+
+FunctionPass *llvm::createLowerSubregsPass() { 
+  return new LowerSubregsInstructionPass(); 
+}
+
+/// TransferDeadFlag - MI is a pseudo-instruction with DstReg dead,
+/// and the lowered replacement instructions immediately precede it.
+/// Mark the replacement instructions with the dead flag.
+void
+LowerSubregsInstructionPass::TransferDeadFlag(MachineInstr *MI,
+                                              unsigned DstReg,
+                                              const TargetRegisterInfo *TRI) {
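+  // Walk backwards from the instruction just before MI until we reach the
+  // lowered replacement instruction that defines DstReg, and mark DstReg
+  // dead there.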
+  for (MachineBasicBlock::iterator MII =
+        prior(MachineBasicBlock::iterator(MI)); ; --MII) {
+    if (MII->addRegisterDead(DstReg, TRI))
+      break;
+    assert(MII != MI->getParent()->begin() &&
+           "copyPhysReg output doesn't reference destination register!");
+  }
+}
+
+/// TransferKillFlag - MI is a pseudo-instruction with SrcReg killed,
+/// and the lowered replacement instructions immediately precede it.
+/// Mark the replacement instructions with the kill flag.
+void
+LowerSubregsInstructionPass::TransferKillFlag(MachineInstr *MI,
+                                              unsigned SrcReg,
+                                              const TargetRegisterInfo *TRI,
+                                              bool AddIfNotFound) {
+  for (MachineBasicBlock::iterator MII =
+        prior(MachineBasicBlock::iterator(MI)); ; --MII) {
+    if (MII->addRegisterKilled(SrcReg, TRI, AddIfNotFound))
+      break;
+    assert(MII != MI->getParent()->begin() &&
+           "copyPhysReg output doesn't reference source register!");
+  }
+}
+
+/// TransferImplicitDefs - MI is a pseudo-instruction, and the lowered
+/// replacement instructions immediately precede it.  Copy any implicit-def
+/// operands from MI to the replacement instruction.
+void
+LowerSubregsInstructionPass::TransferImplicitDefs(MachineInstr *MI) {
+  MachineBasicBlock::iterator CopyMI = MI;
+  --CopyMI;
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isImplicit() || MO.isUse())
+      continue;
+    CopyMI->addOperand(MachineOperand::CreateReg(MO.getReg(), true, true));
+  }
+}
+
+bool LowerSubregsInstructionPass::LowerSubregToReg(MachineInstr *MI) {
+  MachineBasicBlock *MBB = MI->getParent();
+  assert((MI->getOperand(0).isReg() && MI->getOperand(0).isDef()) &&
+         MI->getOperand(1).isImm() &&
+         (MI->getOperand(2).isReg() && MI->getOperand(2).isUse()) &&
+          MI->getOperand(3).isImm() && "Invalid subreg_to_reg");
+
+  unsigned DstReg  = MI->getOperand(0).getReg();
+  unsigned InsReg  = MI->getOperand(2).getReg();
+  assert(!MI->getOperand(2).getSubReg() && "SubIdx on physreg?");
+  unsigned SubIdx  = MI->getOperand(3).getImm();
+
+  assert(SubIdx != 0 && "Invalid index for insert_subreg");
+  unsigned DstSubReg = TRI->getSubReg(DstReg, SubIdx);
+
+  assert(TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+         "Insert destination must be in a physical register");
+  assert(TargetRegisterInfo::isPhysicalRegister(InsReg) &&
+         "Inserted value must be in a physical register");
+
+  DEBUG(dbgs() << "subreg: CONVERTING: " << *MI);
+
+  if (DstSubReg == InsReg) {
+    // No need to insert an identity copy instruction.
+    // Watch out for case like this:
+    // %RAX<def> = SUBREG_TO_REG 0, %EAX<kill>, 3
+    // We must leave %RAX live.
+    if (DstReg != InsReg) {
+      MI->setDesc(TII->get(TargetOpcode::KILL));
+      MI->RemoveOperand(3);     // SubIdx
+      MI->RemoveOperand(1);     // Imm
+      DEBUG(dbgs() << "subreg: replace by: " << *MI);
+      return true;
+    }
+    DEBUG(dbgs() << "subreg: eliminated!");
+  } else {
+    TII->copyPhysReg(*MBB, MI, MI->getDebugLoc(), DstSubReg, InsReg,
+                     MI->getOperand(2).isKill());
+    // Transfer the kill/dead flags, if needed.
+    if (MI->getOperand(0).isDead())
+      TransferDeadFlag(MI, DstSubReg, TRI);
+    DEBUG({
+        MachineBasicBlock::iterator dMI = MI;
+        dbgs() << "subreg: " << *(--dMI);
+      });
+  }
+
+  DEBUG(dbgs() << '\n');
+  MBB->erase(MI);
+  return true;
+}
+
+bool LowerSubregsInstructionPass::LowerCopy(MachineInstr *MI) {
+  MachineOperand &DstMO = MI->getOperand(0);
+  MachineOperand &SrcMO = MI->getOperand(1);
+
+  if (SrcMO.getReg() == DstMO.getReg()) {
+    DEBUG(dbgs() << "identity copy: " << *MI);
+    // No need to insert an identity copy instruction, but replace with a KILL
+    // if liveness is changed.
+    if (DstMO.isDead() || SrcMO.isUndef() || MI->getNumOperands() > 2) {
+      // We must make sure the super-register gets killed. Replace the
+      // instruction with KILL.
+      MI->setDesc(TII->get(TargetOpcode::KILL));
+      DEBUG(dbgs() << "replaced by:   " << *MI);
+      return true;
+    }
+    // Vanilla identity copy.
+    MI->eraseFromParent();
+    return true;
+  }
+
+  DEBUG(dbgs() << "real copy:   " << *MI);
+  TII->copyPhysReg(*MI->getParent(), MI, MI->getDebugLoc(),
+                   DstMO.getReg(), SrcMO.getReg(), SrcMO.isKill());
+
+  if (DstMO.isDead())
+    TransferDeadFlag(MI, DstMO.getReg(), TRI);
+  if (MI->getNumOperands() > 2)
+    TransferImplicitDefs(MI);
+  DEBUG({
+    MachineBasicBlock::iterator dMI = MI;
+    dbgs() << "replaced by: " << *(--dMI);
+  });
+  MI->eraseFromParent();
+  return true;
+}
+
+/// runOnMachineFunction - Reduce subregister inserts and extracts to register
+/// copies.
+///
+bool LowerSubregsInstructionPass::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "Machine Function\n"  
+               << "********** LOWERING SUBREG INSTRS **********\n"
+               << "********** Function: " 
+               << MF.getFunction()->getName() << '\n');
+  TRI = MF.getTarget().getRegisterInfo();
+  TII = MF.getTarget().getInstrInfo();
+
+  bool MadeChange = false;
+
+  for (MachineFunction::iterator mbbi = MF.begin(), mbbe = MF.end();
+       mbbi != mbbe; ++mbbi) {
+    for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
+         mi != me;) {
+      MachineBasicBlock::iterator nmi = llvm::next(mi);
+      MachineInstr *MI = mi;
+      assert(!MI->isInsertSubreg() && "INSERT_SUBREG should no longer appear");
+      assert(MI->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+             "EXTRACT_SUBREG should no longer appear");
+      if (MI->isSubregToReg()) {
+        MadeChange |= LowerSubregToReg(MI);
+      } else if (MI->isCopy()) {
+        MadeChange |= LowerCopy(MI);
+      }
+      mi = nmi;
+    }
+  }
+
+  return MadeChange;
+}
diff --git a/src/LLVM/lib/CodeGen/MachineBasicBlock.cpp b/src/LLVM/lib/CodeGen/MachineBasicBlock.cpp
new file mode 100644
index 0000000..c6db0f7
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineBasicBlock.cpp
@@ -0,0 +1,628 @@
+//===-- llvm/CodeGen/MachineBasicBlock.cpp ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Collect the sequence of machine instructions for a basic block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetInstrDesc.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/LeakDetector.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+MachineBasicBlock::MachineBasicBlock(MachineFunction &mf, const BasicBlock *bb)
+  : BB(bb), Number(-1), xParent(&mf), Alignment(0),
+    AddressTaken(false) {
+  Insts.Parent = this;
+}
+
+MachineBasicBlock::~MachineBasicBlock() {
+  LeakDetector::removeGarbageObject(this);
+}
+
+/// getSymbol - Return the MCSymbol for this basic block.
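+/// The label is formed as <PrivateGlobalPrefix>BB<FunctionNumber>_<BlockNumber>;
+/// with a ".L" private prefix (common on ELF targets) this yields e.g. ".LBB0_4".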
+///
+MCSymbol *MachineBasicBlock::getSymbol() const {
+  const MachineFunction *MF = getParent();
+  MCContext &Ctx = MF->getContext();
+  const char *Prefix = Ctx.getAsmInfo().getPrivateGlobalPrefix();
+  return Ctx.GetOrCreateSymbol(Twine(Prefix) + "BB" +
+                               Twine(MF->getFunctionNumber()) + "_" +
+                               Twine(getNumber()));
+}
+
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineBasicBlock &MBB) {
+  MBB.print(OS);
+  return OS;
+}
+
+/// addNodeToList (MBB) - When an MBB is added to an MF, we need to update the 
+/// parent pointer of the MBB, the MBB numbering, and any instructions in the
+/// MBB to be on the right operand list for registers.
+///
+/// MBBs start out as #-1. When an MBB is added to a MachineFunction, it
+/// gets the next available unique MBB number. If it is removed from a
+/// MachineFunction, it goes back to being #-1.
+void ilist_traits<MachineBasicBlock>::addNodeToList(MachineBasicBlock *N) {
+  MachineFunction &MF = *N->getParent();
+  N->Number = MF.addToMBBNumbering(N);
+
+  // Make sure the instructions have their operands in the reginfo lists.
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  for (MachineBasicBlock::iterator I = N->begin(), E = N->end(); I != E; ++I)
+    I->AddRegOperandsToUseLists(RegInfo);
+
+  LeakDetector::removeGarbageObject(N);
+}
+
+void ilist_traits<MachineBasicBlock>::removeNodeFromList(MachineBasicBlock *N) {
+  N->getParent()->removeFromMBBNumbering(N->Number);
+  N->Number = -1;
+  LeakDetector::addGarbageObject(N);
+}
+
+
+/// addNodeToList (MI) - When we add an instruction to a basic block
+/// list, we update its parent pointer and add its operands from reg use/def
+/// lists if appropriate.
+void ilist_traits<MachineInstr>::addNodeToList(MachineInstr *N) {
+  assert(N->getParent() == 0 && "machine instruction already in a basic block");
+  N->setParent(Parent);
+  
+  // Add the instruction's register operands to their corresponding
+  // use/def lists.
+  MachineFunction *MF = Parent->getParent();
+  N->AddRegOperandsToUseLists(MF->getRegInfo());
+
+  LeakDetector::removeGarbageObject(N);
+}
+
+/// removeNodeFromList (MI) - When we remove an instruction from a basic block
+/// list, we update its parent pointer and remove its operands from reg use/def
+/// lists if appropriate.
+void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) {
+  assert(N->getParent() != 0 && "machine instruction not in a basic block");
+
+  // Remove from the use/def lists.
+  N->RemoveRegOperandsFromUseLists();
+  
+  N->setParent(0);
+
+  LeakDetector::addGarbageObject(N);
+}
+
+/// transferNodesFromList (MI) - When moving a range of instructions from one
+/// MBB list to another, we need to update the parent pointers and the use/def
+/// lists.
+void ilist_traits<MachineInstr>::
+transferNodesFromList(ilist_traits<MachineInstr> &fromList,
+                      MachineBasicBlock::iterator first,
+                      MachineBasicBlock::iterator last) {
+  assert(Parent->getParent() == fromList.Parent->getParent() &&
+        "MachineInstr parent mismatch!");
+
+  // Splice within the same MBB -> no change.
+  if (Parent == fromList.Parent) return;
+
+  // If splicing between two blocks within the same function, just update the
+  // parent pointers.
+  for (; first != last; ++first)
+    first->setParent(Parent);
+}
+
+void ilist_traits<MachineInstr>::deleteNode(MachineInstr* MI) {
+  assert(!MI->getParent() && "MI is still in a block!");
+  Parent->getParent()->DeleteMachineInstr(MI);
+}
+
+MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() {
+  iterator I = begin();
+  while (I != end() && I->isPHI())
+    ++I;
+  return I;
+}
+
+MachineBasicBlock::iterator MachineBasicBlock::getFirstTerminator() {
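+  // Walk backwards over the trailing run of terminators. The loop stops on
+  // the first non-terminator (or at begin()); if it stopped on a
+  // non-terminator, step forward so I points at the first terminator, or
+  // end() if the block has none.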
+  iterator I = end();
+  while (I != begin() && (--I)->getDesc().isTerminator())
+    ; /* noop */
+  if (I != end() && !I->getDesc().isTerminator()) ++I;
+  return I;
+}
+
+void MachineBasicBlock::dump() const {
+  print(dbgs());
+}
+
+static inline void OutputReg(raw_ostream &os, unsigned RegNo,
+                             const TargetRegisterInfo *TRI = 0) {
+  if (RegNo != 0 && TargetRegisterInfo::isPhysicalRegister(RegNo)) {
+    if (TRI)
+      os << " %" << TRI->get(RegNo).Name;
+    else
+      os << " %physreg" << RegNo;
+  } else
+    os << " %reg" << RegNo;
+}
+
+StringRef MachineBasicBlock::getName() const {
+  if (const BasicBlock *LBB = getBasicBlock())
+    return LBB->getName();
+  else
+    return "(null)";
+}
+
+void MachineBasicBlock::print(raw_ostream &OS) const {
+  const MachineFunction *MF = getParent();
+  if (!MF) {
+    OS << "Can't print out MachineBasicBlock because parent MachineFunction"
+       << " is null\n";
+    return;
+  }
+
+  if (Alignment) { OS << "Alignment " << Alignment << "\n"; }
+
+  OS << "BB#" << getNumber() << ": ";
+
+  const char *Comma = "";
+  if (const BasicBlock *LBB = getBasicBlock()) {
+    OS << Comma << "derived from LLVM BB ";
+    WriteAsOperand(OS, LBB, /*PrintType=*/false);
+    Comma = ", ";
+  }
+  if (hasAddressTaken()) { OS << Comma << "ADDRESS TAKEN"; Comma = ", "; }
+  OS << '\n';
+
+  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();  
+  if (!livein_empty()) {
+    OS << "    Live Ins:";
+    for (livein_iterator I = livein_begin(),E = livein_end(); I != E; ++I)
+      OutputReg(OS, *I, TRI);
+    OS << '\n';
+  }
+  // Print the preds of this block according to the CFG.
+  if (!pred_empty()) {
+    OS << "    Predecessors according to CFG:";
+    for (const_pred_iterator PI = pred_begin(), E = pred_end(); PI != E; ++PI)
+      OS << " BB#" << (*PI)->getNumber();
+    OS << '\n';
+  }
+  
+  for (const_iterator I = begin(); I != end(); ++I) {
+    OS << '\t';
+    I->print(OS, &getParent()->getTarget());
+  }
+
+  // Print the successors of this block according to the CFG.
+  if (!succ_empty()) {
+    OS << "    Successors according to CFG:";
+    for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI)
+      OS << " BB#" << (*SI)->getNumber();
+    OS << '\n';
+  }
+}
+
+void MachineBasicBlock::removeLiveIn(unsigned Reg) {
+  std::vector<unsigned>::iterator I =
+    std::find(LiveIns.begin(), LiveIns.end(), Reg);
+  assert(I != LiveIns.end() && "Not a live in!");
+  LiveIns.erase(I);
+}
+
+bool MachineBasicBlock::isLiveIn(unsigned Reg) const {
+  livein_iterator I = std::find(livein_begin(), livein_end(), Reg);
+  return I != livein_end();
+}
+
+void MachineBasicBlock::moveBefore(MachineBasicBlock *NewAfter) {
+  getParent()->splice(NewAfter, this);
+}
+
+void MachineBasicBlock::moveAfter(MachineBasicBlock *NewBefore) {
+  MachineFunction::iterator BBI = NewBefore;
+  getParent()->splice(++BBI, this);
+}
+
+void MachineBasicBlock::updateTerminator() {
+  const TargetInstrInfo *TII = getParent()->getTarget().getInstrInfo();
+  // A block with no successors has no concerns with fall-through edges.
+  if (this->succ_empty()) return;
+
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  DebugLoc dl;  // FIXME: this is nowhere
+  bool B = TII->AnalyzeBranch(*this, TBB, FBB, Cond);
+  (void) B;
+  assert(!B && "UpdateTerminators requires analyzable predecessors!");
+  if (Cond.empty()) {
+    if (TBB) {
+      // The block has an unconditional branch. If its successor is now
+      // its layout successor, delete the branch.
+      if (isLayoutSuccessor(TBB))
+        TII->RemoveBranch(*this);
+    } else {
+      // The block has an unconditional fallthrough. If its successor is not
+      // its layout successor, insert a branch.
+      TBB = *succ_begin();
+      if (!isLayoutSuccessor(TBB))
+        TII->InsertBranch(*this, TBB, 0, Cond, dl);
+    }
+  } else {
+    if (FBB) {
+      // The block has a non-fallthrough conditional branch. If one of its
+      // successors is its layout successor, rewrite it to a fallthrough
+      // conditional branch.
+      if (isLayoutSuccessor(TBB)) {
+        if (TII->ReverseBranchCondition(Cond))
+          return;
+        TII->RemoveBranch(*this);
+        TII->InsertBranch(*this, FBB, 0, Cond, dl);
+      } else if (isLayoutSuccessor(FBB)) {
+        TII->RemoveBranch(*this);
+        TII->InsertBranch(*this, TBB, 0, Cond, dl);
+      }
+    } else {
+      // The block has a fallthrough conditional branch.
+      MachineBasicBlock *MBBA = *succ_begin();
+      MachineBasicBlock *MBBB = *llvm::next(succ_begin());
+      if (MBBA == TBB) std::swap(MBBB, MBBA);
+      if (isLayoutSuccessor(TBB)) {
+        if (TII->ReverseBranchCondition(Cond)) {
+          // We can't reverse the condition, add an unconditional branch.
+          Cond.clear();
+          TII->InsertBranch(*this, MBBA, 0, Cond, dl);
+          return;
+        }
+        TII->RemoveBranch(*this);
+        TII->InsertBranch(*this, MBBA, 0, Cond, dl);
+      } else if (!isLayoutSuccessor(MBBA)) {
+        TII->RemoveBranch(*this);
+        TII->InsertBranch(*this, TBB, MBBA, Cond, dl);
+      }
+    }
+  }
+}
+
+void MachineBasicBlock::addSuccessor(MachineBasicBlock *succ) {
+  Successors.push_back(succ);
+  succ->addPredecessor(this);
+}
+
+void MachineBasicBlock::removeSuccessor(MachineBasicBlock *succ) {
+  succ->removePredecessor(this);
+  succ_iterator I = std::find(Successors.begin(), Successors.end(), succ);
+  assert(I != Successors.end() && "Not a current successor!");
+  Successors.erase(I);
+}
+
+MachineBasicBlock::succ_iterator 
+MachineBasicBlock::removeSuccessor(succ_iterator I) {
+  assert(I != Successors.end() && "Not a current successor!");
+  (*I)->removePredecessor(this);
+  return Successors.erase(I);
+}
+
+void MachineBasicBlock::addPredecessor(MachineBasicBlock *pred) {
+  Predecessors.push_back(pred);
+}
+
+void MachineBasicBlock::removePredecessor(MachineBasicBlock *pred) {
+  std::vector<MachineBasicBlock *>::iterator I =
+    std::find(Predecessors.begin(), Predecessors.end(), pred);
+  assert(I != Predecessors.end() && "Pred is not a predecessor of this block!");
+  Predecessors.erase(I);
+}
+
+void MachineBasicBlock::transferSuccessors(MachineBasicBlock *fromMBB) {
+  if (this == fromMBB)
+    return;
+  
+  while (!fromMBB->succ_empty()) {
+    MachineBasicBlock *Succ = *fromMBB->succ_begin();
+    addSuccessor(Succ);
+    fromMBB->removeSuccessor(Succ);
+  }
+}
+
+void
+MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) {
+  if (this == fromMBB)
+    return;
+  
+  while (!fromMBB->succ_empty()) {
+    MachineBasicBlock *Succ = *fromMBB->succ_begin();
+    addSuccessor(Succ);
+    fromMBB->removeSuccessor(Succ);
+
+    // Fix up any PHI nodes in the successor.
+    for (MachineBasicBlock::iterator MI = Succ->begin(), ME = Succ->end();
+         MI != ME && MI->isPHI(); ++MI)
+      for (unsigned i = 2, e = MI->getNumOperands()+1; i != e; i += 2) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (MO.getMBB() == fromMBB)
+          MO.setMBB(this);
+      }
+  }
+}
+
+bool MachineBasicBlock::isSuccessor(const MachineBasicBlock *MBB) const {
+  std::vector<MachineBasicBlock *>::const_iterator I =
+    std::find(Successors.begin(), Successors.end(), MBB);
+  return I != Successors.end();
+}
+
+bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const {
+  MachineFunction::const_iterator I(this);
+  return llvm::next(I) == MachineFunction::const_iterator(MBB);
+}
+
+bool MachineBasicBlock::canFallThrough() {
+  MachineFunction::iterator Fallthrough = this;
+  ++Fallthrough;
+  // If FallthroughBlock is off the end of the function, it can't fall through.
+  if (Fallthrough == getParent()->end())
+    return false;
+
+  // If FallthroughBlock isn't a successor, no fallthrough is possible.
+  if (!isSuccessor(Fallthrough))
+    return false;
+
+  // Analyze the branches, if any, at the end of the block.
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  const TargetInstrInfo *TII = getParent()->getTarget().getInstrInfo();
+  if (TII->AnalyzeBranch(*this, TBB, FBB, Cond)) {
+    // If we couldn't analyze the branch, examine the last instruction.
+    // If the block doesn't end in a known control barrier, assume fallthrough
+    // is possible. The isPredicable check is needed because this code can be
+    // called during IfConversion, where an instruction which is normally a
+    // Barrier is predicated and thus no longer an actual control barrier. This
+    // is over-conservative though, because if an instruction isn't actually
+    // predicated we could still treat it like a barrier.
+    return empty() || !back().getDesc().isBarrier() ||
+           back().getDesc().isPredicable();
+  }
+
+  // If there is no branch, control always falls through.
+  if (TBB == 0) return true;
+
+  // If there is some explicit branch to the fallthrough block, it can obviously
+  // reach, even though the branch should get folded to fall through implicitly.
+  if (MachineFunction::iterator(TBB) == Fallthrough ||
+      MachineFunction::iterator(FBB) == Fallthrough)
+    return true;
+
+  // If it's an unconditional branch to some block not the fall through, it
+  // doesn't fall through.
+  if (Cond.empty()) return false;
+
+  // Otherwise, if it is conditional and has no explicit false block, it falls
+  // through.
+  return FBB == 0;
+}
+
+MachineBasicBlock *
+MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
+  MachineFunction *MF = getParent();
+  DebugLoc dl;  // FIXME: this is nowhere
+
+  // We may need to update this block's terminator, but we can't do that if
+  // AnalyzeBranch fails. If this block uses a jump table, we won't touch it.
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  if (TII->AnalyzeBranch(*this, TBB, FBB, Cond))
+    return NULL;
+
+  MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock();
+  MF->insert(llvm::next(MachineFunction::iterator(this)), NMBB);
+  DEBUG(dbgs() << "PHIElimination splitting critical edge:"
+        " BB#" << getNumber()
+        << " -- BB#" << NMBB->getNumber()
+        << " -- BB#" << Succ->getNumber() << '\n');
+
+  ReplaceUsesOfBlockWith(Succ, NMBB);
+  updateTerminator();
+
+  // Insert unconditional "jump Succ" instruction in NMBB if necessary.
+  NMBB->addSuccessor(Succ);
+  if (!NMBB->isLayoutSuccessor(Succ)) {
+    Cond.clear();
+    MF->getTarget().getInstrInfo()->InsertBranch(*NMBB, Succ, NULL, Cond, dl);
+  }
+
+  // Fix PHI nodes in Succ so they refer to NMBB instead of this
+  for (MachineBasicBlock::iterator i = Succ->begin(), e = Succ->end();
+       i != e && i->isPHI(); ++i)
+    for (unsigned ni = 1, ne = i->getNumOperands(); ni != ne; ni += 2)
+      if (i->getOperand(ni+1).getMBB() == this)
+        i->getOperand(ni+1).setMBB(NMBB);
+
+  if (LiveVariables *LV =
+        P->getAnalysisIfAvailable<LiveVariables>())
+    LV->addNewBlock(NMBB, this, Succ);
+
+  if (MachineDominatorTree *MDT =
+        P->getAnalysisIfAvailable<MachineDominatorTree>())
+    MDT->addNewBlock(NMBB, this);
+
+  if (MachineLoopInfo *MLI =
+        P->getAnalysisIfAvailable<MachineLoopInfo>())
+    if (MachineLoop *TIL = MLI->getLoopFor(this)) {
+      // If one or the other blocks were not in a loop, the new block is not
+      // either, and thus LI doesn't need to be updated.
+      if (MachineLoop *DestLoop = MLI->getLoopFor(Succ)) {
+        if (TIL == DestLoop) {
+          // Both in the same loop, the NMBB joins loop.
+          DestLoop->addBasicBlockToLoop(NMBB, MLI->getBase());
+        } else if (TIL->contains(DestLoop)) {
+          // Edge from an outer loop to an inner loop.  Add to the outer loop.
+          TIL->addBasicBlockToLoop(NMBB, MLI->getBase());
+        } else if (DestLoop->contains(TIL)) {
+          // Edge from an inner loop to an outer loop.  Add to the outer loop.
+          DestLoop->addBasicBlockToLoop(NMBB, MLI->getBase());
+        } else {
+          // Edge from two loops with no containment relation.  Because these
+          // are natural loops, we know that the destination block must be the
+          // header of its loop (adding a branch into a loop elsewhere would
+          // create an irreducible loop).
+          assert(DestLoop->getHeader() == Succ &&
+                 "Should not create irreducible loops!");
+          if (MachineLoop *P = DestLoop->getParentLoop())
+            P->addBasicBlockToLoop(NMBB, MLI->getBase());
+        }
+      }
+    }
+
+  return NMBB;
+}
+
+/// removeFromParent - This method unlinks 'this' from the containing function,
+/// and returns it, but does not delete it.
+MachineBasicBlock *MachineBasicBlock::removeFromParent() {
+  assert(getParent() && "Not embedded in a function!");
+  getParent()->remove(this);
+  return this;
+}
+
+
+/// eraseFromParent - This method unlinks 'this' from the containing function,
+/// and deletes it.
+void MachineBasicBlock::eraseFromParent() {
+  assert(getParent() && "Not embedded in a function!");
+  getParent()->erase(this);
+}
+
+
+/// ReplaceUsesOfBlockWith - Given a machine basic block that branched to
+/// 'Old', change the code and CFG so that it branches to 'New' instead.
+void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old,
+                                               MachineBasicBlock *New) {
+  assert(Old != New && "Cannot replace self with self!");
+
+  MachineBasicBlock::iterator I = end();
+  while (I != begin()) {
+    --I;
+    if (!I->getDesc().isTerminator()) break;
+
+    // Scan the operands of this machine instruction, replacing any uses of Old
+    // with New.
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+      if (I->getOperand(i).isMBB() &&
+          I->getOperand(i).getMBB() == Old)
+        I->getOperand(i).setMBB(New);
+  }
+
+  // Update the successor information.
+  removeSuccessor(Old);
+  addSuccessor(New);
+}
+
+/// CorrectExtraCFGEdges - Various pieces of code can cause excess edges in the
+/// CFG to be inserted.  If we have proven that MBB can only branch to DestA and
+/// DestB, remove any other MBB successors from the CFG.  DestA and DestB can be
+/// null.
+/// 
+/// Besides DestA and DestB, retain other edges leading to LandingPads
+/// (currently there can be only one; we don't check or require that here).
+/// Note it is possible that DestA and/or DestB are LandingPads.
+bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA,
+                                             MachineBasicBlock *DestB,
+                                             bool isCond) {
+  // The values of DestA and DestB frequently come from a call to the
+  // 'TargetInstrInfo::AnalyzeBranch' method. We take our meaning of the initial
+  // values from there.
+  //
+  // 1. If both DestA and DestB are null, then the block ends with no branches
+  //    (it falls through to its successor).
+  // 2. If DestA is set, DestB is null, and isCond is false, then the block ends
+  //    with only an unconditional branch.
+  // 3. If DestA is set, DestB is null, and isCond is true, then the block ends
+  //    with a conditional branch that falls through to a successor (DestB).
+  // 4. If DestA and DestB is set and isCond is true, then the block ends with a
+  //    conditional branch followed by an unconditional branch. DestA is the
+  //    'true' destination and DestB is the 'false' destination.
+
+  bool Changed = false;
+
+  MachineFunction::iterator FallThru =
+    llvm::next(MachineFunction::iterator(this));
+
+  if (DestA == 0 && DestB == 0) {
+    // Block falls through to successor.
+    DestA = FallThru;
+    DestB = FallThru;
+  } else if (DestA != 0 && DestB == 0) {
+    if (isCond)
+      // Block ends in conditional jump that falls through to successor.
+      DestB = FallThru;
+  } else {
+    assert(DestA && DestB && isCond &&
+           "CFG in a bad state. Cannot correct CFG edges");
+  }
+
+  // Remove superfluous edges. I.e., those which aren't destinations of this
+  // basic block, duplicate edges, or landing pads.
+  SmallPtrSet<const MachineBasicBlock*, 8> SeenMBBs;
+  MachineBasicBlock::succ_iterator SI = succ_begin();
+  while (SI != succ_end()) {
+    const MachineBasicBlock *MBB = *SI;
+    if (!SeenMBBs.insert(MBB) ||
+        (MBB != DestA && MBB != DestB)) {
+      // This is a superfluous edge, remove it.
+      SI = removeSuccessor(SI);
+      Changed = true;
+    } else {
+      ++SI;
+    }
+  }
+
+  return Changed;
+}
+
+/// findDebugLoc - find the next valid DebugLoc starting at MBBI, skipping
+/// any DBG_VALUE instructions.  Return UnknownLoc if there is none.
+DebugLoc
+MachineBasicBlock::findDebugLoc(MachineBasicBlock::iterator &MBBI) {
+  DebugLoc DL;
+  MachineBasicBlock::iterator E = end();
+  if (MBBI != E) {
+    // Skip debug declarations; we don't want a DebugLoc from them.
+    MachineBasicBlock::iterator MBBI2 = MBBI;
+    while (MBBI2 != E && MBBI2->isDebugValue())
+      ++MBBI2;
+    if (MBBI2 != E)
+      DL = MBBI2->getDebugLoc();
+  }
+  return DL;
+}
+
+void llvm::WriteAsOperand(raw_ostream &OS, const MachineBasicBlock *MBB,
+                          bool t) {
+  OS << "BB#" << MBB->getNumber();
+}
+
diff --git a/src/LLVM/lib/CodeGen/MachineCSE.cpp b/src/LLVM/lib/CodeGen/MachineCSE.cpp
new file mode 100644
index 0000000..c2f41ef
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineCSE.cpp
@@ -0,0 +1,507 @@
+//===-- MachineCSE.cpp - Machine Common Subexpression Elimination Pass ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs global common subexpression elimination on machine
+// instructions using a scoped hash table based value numbering scheme. It
+// must be run while the machine function is still in SSA form.
+//
+//===----------------------------------------------------------------------===//
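+//
+// Illustrative sketch (not taken from this code): with value numbering, two
+// identical pure computations such as
+//   %vreg2 = ADD %vreg0, %vreg1
+//   %vreg5 = ADD %vreg0, %vreg1
+// receive the same value number, so uses of the second result can be
+// rewritten to use the first and the redundant instruction erased.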
+
+#define DEBUG_TYPE "machine-cse"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+STATISTIC(NumCoalesces, "Number of copies coalesced");
+STATISTIC(NumCSEs,      "Number of common subexpressions eliminated");
+STATISTIC(NumPhysCSEs,  "Number of physreg-defining common subexprs eliminated");
+
+namespace {
+  class MachineCSE : public MachineFunctionPass {
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    AliasAnalysis *AA;
+    MachineDominatorTree *DT;
+    MachineRegisterInfo *MRI;
+  public:
+    static char ID; // Pass identification
+    MachineCSE() : MachineFunctionPass(ID), LookAheadLimit(5), CurrVN(0) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+    
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+    }
+
+  private:
+    const unsigned LookAheadLimit;
+    typedef ScopedHashTableScope<MachineInstr*, unsigned,
+                                 MachineInstrExpressionTrait> ScopeType;
+    DenseMap<MachineBasicBlock*, ScopeType*> ScopeMap;
+    ScopedHashTable<MachineInstr*, unsigned, MachineInstrExpressionTrait> VNT;
+    SmallVector<MachineInstr*, 64> Exps;
+    unsigned CurrVN;
+
+    bool PerformTrivialCoalescing(MachineInstr *MI, MachineBasicBlock *MBB);
+    bool isPhysDefTriviallyDead(unsigned Reg,
+                                MachineBasicBlock::const_iterator I,
+                                MachineBasicBlock::const_iterator E) const;
+    bool hasLivePhysRegDefUse(const MachineInstr *MI,
+                              const MachineBasicBlock *MBB,
+                              unsigned &PhysDef) const;
+    bool PhysRegDefReaches(MachineInstr *CSMI, MachineInstr *MI,
+                           unsigned PhysDef) const;
+    bool isCSECandidate(MachineInstr *MI);
+    bool isProfitableToCSE(unsigned CSReg, unsigned Reg,
+                           MachineInstr *CSMI, MachineInstr *MI);
+    void EnterScope(MachineBasicBlock *MBB);
+    void ExitScope(MachineBasicBlock *MBB);
+    bool ProcessBlock(MachineBasicBlock *MBB);
+    void ExitScopeIfDone(MachineDomTreeNode *Node,
+                 DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren,
+                 DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap);
+    bool PerformCSE(MachineDomTreeNode *Node);
+  };
+} // end anonymous namespace
+
+char MachineCSE::ID = 0;
+INITIALIZE_PASS(MachineCSE, "machine-cse",
+                "Machine Common Subexpression Elimination", false, false);
+
+FunctionPass *llvm::createMachineCSEPass() { return new MachineCSE(); }
+
+bool MachineCSE::PerformTrivialCoalescing(MachineInstr *MI,
+                                          MachineBasicBlock *MBB) {
+  bool Changed = false;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+    if (!MRI->hasOneUse(Reg))
+      // Only coalesce single-use copies. This ensures the copy will be
+      // deleted.
+      continue;
+    MachineInstr *DefMI = MRI->getVRegDef(Reg);
+    if (DefMI->getParent() != MBB)
+      continue;
+    if (!DefMI->isCopy())
+      continue;
+    unsigned SrcReg = DefMI->getOperand(1).getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+      continue;
+    if (DefMI->getOperand(0).getSubReg() || DefMI->getOperand(1).getSubReg())
+      continue;
+    const TargetRegisterClass *SRC   = MRI->getRegClass(SrcReg);
+    const TargetRegisterClass *RC    = MRI->getRegClass(Reg);
+    const TargetRegisterClass *NewRC = getCommonSubClass(RC, SRC);
+    if (!NewRC)
+      continue;
+    DEBUG(dbgs() << "Coalescing: " << *DefMI);
+    DEBUG(dbgs() << "*** to: " << *MI);
+    MO.setReg(SrcReg);
+    MRI->clearKillFlags(SrcReg);
+    if (NewRC != SRC)
+      MRI->setRegClass(SrcReg, NewRC);
+    DefMI->eraseFromParent();
+    ++NumCoalesces;
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+bool
+MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
+                                   MachineBasicBlock::const_iterator I,
+                                   MachineBasicBlock::const_iterator E) const {
+  unsigned LookAheadLeft = LookAheadLimit;
+  while (LookAheadLeft) {
+    // Skip over dbg_value's.
+    while (I != E && I->isDebugValue())
+      ++I;
+
+    if (I == E)
+      // Reached end of block, register is obviously dead.
+      return true;
+
+    bool SeenDef = false;
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = I->getOperand(i);
+      if (!MO.isReg() || !MO.getReg())
+        continue;
+      if (!TRI->regsOverlap(MO.getReg(), Reg))
+        continue;
+      if (MO.isUse())
+        // Found a use!
+        return false;
+      SeenDef = true;
+    }
+    if (SeenDef)
+      // Saw a def of Reg (or an alias) before encountering any use; it's
+      // trivially dead.
+      return true;
+
+    --LookAheadLeft;
+    ++I;
+  }
+  return false;
+}
+
+/// hasLivePhysRegDefUse - Return true if the specified instruction reads or
+/// writes physical registers (except for dead defs of physical registers). It
+/// also returns the physical register def by reference if it's the only one
+/// and the instruction does not use any physical register.
+bool MachineCSE::hasLivePhysRegDefUse(const MachineInstr *MI,
+                                      const MachineBasicBlock *MBB,
+                                      unsigned &PhysDef) const {
+  PhysDef = 0;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    if (MO.isUse()) {
+      // The instruction reads a physical register; it is not safe to CSE.
+      PhysDef = 0;
+      return true;
+    }
+    if (MO.isDead())
+      // If the def is dead, it's ok.
+      continue;
+    // Ok, this is a physical register def that's not marked "dead". That's
+    // common since this pass is run before livevariables. We can scan
+    // forward a few instructions and check if it is obviously dead.
+    if (PhysDef) {
+      // Multiple physical register defs. These are rare, forget about it.
+      PhysDef = 0;
+      return true;
+    }
+    PhysDef = Reg;
+  }
+
+  if (PhysDef) {
+    MachineBasicBlock::const_iterator I = MI; I = llvm::next(I);
+    if (!isPhysDefTriviallyDead(PhysDef, I, MBB->end()))
+      return true;
+  }
+  return false;
+}
+
+bool MachineCSE::PhysRegDefReaches(MachineInstr *CSMI, MachineInstr *MI,
+                                  unsigned PhysDef) const {
+  // For now conservatively returns false if the common subexpression is
+  // not in the same basic block as the given instruction.
+  MachineBasicBlock *MBB = MI->getParent();
+  if (CSMI->getParent() != MBB)
+    return false;
+  MachineBasicBlock::const_iterator I = CSMI; I = llvm::next(I);
+  MachineBasicBlock::const_iterator E = MI;
+  unsigned LookAheadLeft = LookAheadLimit;
+  while (LookAheadLeft) {
+    // Skip over dbg_value's.
+    while (I != E && I->isDebugValue())
+      ++I;
+
+    if (I == E)
+      return true;
+    if (I->modifiesRegister(PhysDef, TRI))
+      return false;
+
+    --LookAheadLeft;
+    ++I;
+  }
+
+  return false;
+}
+
+bool MachineCSE::isCSECandidate(MachineInstr *MI) {
+  if (MI->isLabel() || MI->isPHI() || MI->isImplicitDef() ||
+      MI->isKill() || MI->isDebugValue())
+    return false;
+
+  // Ignore copies.
+  if (MI->isCopyLike())
+    return false;
+
+  // Ignore stuff that we obviously can't move.
+  const TargetInstrDesc &TID = MI->getDesc();  
+  if (TID.mayStore() || TID.isCall() || TID.isTerminator() ||
+      TID.hasUnmodeledSideEffects())
+    return false;
+
+  if (TID.mayLoad()) {
+    // Okay, this instruction does a load. As a refinement, we allow the target
+    // to decide whether the loaded value is actually a constant. If so, we can
+    // actually use it as a load.
+    if (!MI->isInvariantLoad(AA))
+      // FIXME: we should be able to hoist loads with no other side effects if
+      // there are no other instructions which can change memory in this loop.
+      // This is a trivial form of alias analysis.
+      return false;
+  }
+  return true;
+}
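+// Example filter results (illustrative): an ADD32rr of two virtual registers
+// is a candidate; a call, store, terminator, or instruction with unmodeled
+// side effects is not; a load qualifies only when the target reports it
+// invariant, e.g. a constant-pool load.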
+
+/// isProfitableToCSE - Return true if it's profitable to eliminate MI with a
+/// common expression that defines Reg.
+bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
+                                   MachineInstr *CSMI, MachineInstr *MI) {
+  // FIXME: Heuristics that work around the lack of live range splitting.
+
+  // Heuristic #1: Don't CSE "cheap" computations if the def is not local or
+  // in an immediate predecessor. We don't want to increase register pressure
+  // and end up causing other computations to be spilled.
+  if (MI->getDesc().isAsCheapAsAMove()) {
+    MachineBasicBlock *CSBB = CSMI->getParent();
+    MachineBasicBlock *BB = MI->getParent();
+    if (CSBB != BB && 
+        find(CSBB->succ_begin(), CSBB->succ_end(), BB) == CSBB->succ_end())
+      return false;
+  }
+
+  // Heuristic #2: If the expression doesn't use a virtual register and the
+  // only uses of the redundant computation are copies, do not CSE.
+  bool HasVRegUse = false;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isUse() && MO.getReg() &&
+        TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+      HasVRegUse = true;
+      break;
+    }
+  }
+  if (!HasVRegUse) {
+    bool HasNonCopyUse = false;
+    for (MachineRegisterInfo::use_nodbg_iterator I =  MRI->use_nodbg_begin(Reg),
+           E = MRI->use_nodbg_end(); I != E; ++I) {
+      MachineInstr *Use = &*I;
+      // Ignore copies.
+      if (!Use->isCopyLike()) {
+        HasNonCopyUse = true;
+        break;
+      }
+    }
+    if (!HasNonCopyUse)
+      return false;
+  }
+
+  // Heuristic #3: If the common subexpression is used by PHIs, do not reuse
+  // it unless the defined value is already used in the BB of the new use.
+  bool HasPHI = false;
+  SmallPtrSet<MachineBasicBlock*, 4> CSBBs;
+  for (MachineRegisterInfo::use_nodbg_iterator I =  MRI->use_nodbg_begin(CSReg),
+       E = MRI->use_nodbg_end(); I != E; ++I) {
+    MachineInstr *Use = &*I;
+    HasPHI |= Use->isPHI();
+    CSBBs.insert(Use->getParent());
+  }
+
+  if (!HasPHI)
+    return true;
+  return CSBBs.count(MI->getParent());
+}
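+// A worked example of heuristic #1 (illustrative): a MOV32ri, which is
+// marked isAsCheapAsAMove, defined in the entry block and redundantly
+// recomputed inside a distant loop is cheaper to rematerialize than to keep
+// live across the intervening blocks, so the CSE is rejected unless the two
+// blocks are the same or immediate predecessor/successor.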
+
+void MachineCSE::EnterScope(MachineBasicBlock *MBB) {
+  DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n');
+  ScopeType *Scope = new ScopeType(VNT);
+  ScopeMap[MBB] = Scope;
+}
+
+void MachineCSE::ExitScope(MachineBasicBlock *MBB) {
+  DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n');
+  DenseMap<MachineBasicBlock*, ScopeType*>::iterator SI = ScopeMap.find(MBB);
+  assert(SI != ScopeMap.end());
+  // Delete the scope before erasing the map entry; erasing the iterator
+  // first would invalidate it before SI->second is read.
+  delete SI->second;
+  ScopeMap.erase(SI);
+}
+
+bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
+  bool Changed = false;
+
+  SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs;
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) {
+    MachineInstr *MI = &*I;
+    ++I;
+
+    if (!isCSECandidate(MI))
+      continue;
+
+    bool DefPhys = false;
+    bool FoundCSE = VNT.count(MI);
+    if (!FoundCSE) {
+      // Look for trivial copy coalescing opportunities.
+      if (PerformTrivialCoalescing(MI, MBB)) {
+        // After coalescing MI itself may become a copy.
+        if (MI->isCopyLike())
+          continue;
+        FoundCSE = VNT.count(MI);
+      }
+    }
+    // FIXME: commute commutable instructions?
+
+    // If the instruction defines a physical register and the value *may* be
+    // used, then it's not safe to replace it with a common subexpression.
+    unsigned PhysDef = 0;
+    if (FoundCSE && hasLivePhysRegDefUse(MI, MBB, PhysDef)) {
+      FoundCSE = false;
+
+      // ... Unless the common subexpression is local, it also defines the
+      // physical register, and that register is not clobbered in between.
+      if (PhysDef) {
+        unsigned CSVN = VNT.lookup(MI);
+        MachineInstr *CSMI = Exps[CSVN];
+        if (PhysRegDefReaches(CSMI, MI, PhysDef)) {
+          FoundCSE = true;
+          DefPhys = true;
+        }
+      }
+    }
+
+    if (!FoundCSE) {
+      VNT.insert(MI, CurrVN++);
+      Exps.push_back(MI);
+      continue;
+    }
+
+    // Found a common subexpression, eliminate it.
+    unsigned CSVN = VNT.lookup(MI);
+    MachineInstr *CSMI = Exps[CSVN];
+    DEBUG(dbgs() << "Examining: " << *MI);
+    DEBUG(dbgs() << "*** Found a common subexpression: " << *CSMI);
+
+    // Check if it's profitable to perform this CSE.
+    bool DoCSE = true;
+    unsigned NumDefs = MI->getDesc().getNumDefs();
+    for (unsigned i = 0, e = MI->getNumOperands(); NumDefs && i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned OldReg = MO.getReg();
+      unsigned NewReg = CSMI->getOperand(i).getReg();
+      if (OldReg == NewReg)
+        continue;
+      assert(TargetRegisterInfo::isVirtualRegister(OldReg) &&
+             TargetRegisterInfo::isVirtualRegister(NewReg) &&
+             "Do not CSE physical register defs!");
+      if (!isProfitableToCSE(NewReg, OldReg, CSMI, MI)) {
+        DoCSE = false;
+        break;
+      }
+      CSEPairs.push_back(std::make_pair(OldReg, NewReg));
+      --NumDefs;
+    }
+
+    // Actually perform the elimination.
+    if (DoCSE) {
+      for (unsigned i = 0, e = CSEPairs.size(); i != e; ++i) {
+        MRI->replaceRegWith(CSEPairs[i].first, CSEPairs[i].second);
+        MRI->clearKillFlags(CSEPairs[i].second);
+      }
+      MI->eraseFromParent();
+      ++NumCSEs;
+      if (DefPhys)
+        ++NumPhysCSEs;
+      // An instruction was removed; the function has changed.
+      Changed = true;
+    } else {
+      DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
+      VNT.insert(MI, CurrVN++);
+      Exps.push_back(MI);
+    }
+    CSEPairs.clear();
+  }
+
+  return Changed;
+}
+
+/// ExitScopeIfDone - Destroy scope for the MBB that corresponds to the given
+/// dominator tree node if it's a leaf or all of its children are done. Walk
+/// up the dominator tree to destroy ancestors which are now done.
+void
+MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node,
+                DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren,
+                DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap) {
+  if (OpenChildren[Node])
+    return;
+
+  // Pop scope.
+  ExitScope(Node->getBlock());
+
+  // Now traverse upwards to pop ancestors whose children are all done.
+  while (MachineDomTreeNode *Parent = ParentMap[Node]) {
+    unsigned Left = --OpenChildren[Parent];
+    if (Left != 0)
+      break;
+    ExitScope(Parent->getBlock());
+    Node = Parent;
+  }
+}
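+// For instance (illustrative): with a dominator tree where A has children B
+// and C, OpenChildren[A] starts at 2. Finishing B pops B's scope and
+// decrements A's count to 1, so A stays open; finishing C decrements it to
+// 0, and the upward walk pops A's scope as well.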
+
+bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
+  SmallVector<MachineDomTreeNode*, 32> Scopes;
+  SmallVector<MachineDomTreeNode*, 8> WorkList;
+  DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> ParentMap;
+  DenseMap<MachineDomTreeNode*, unsigned> OpenChildren;
+
+  // Perform a DFS walk to determine the order of visit.
+  WorkList.push_back(Node);
+  do {
+    Node = WorkList.pop_back_val();
+    Scopes.push_back(Node);
+    const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
+    unsigned NumChildren = Children.size();
+    OpenChildren[Node] = NumChildren;
+    for (unsigned i = 0; i != NumChildren; ++i) {
+      MachineDomTreeNode *Child = Children[i];
+      ParentMap[Child] = Node;
+      WorkList.push_back(Child);
+    }
+  } while (!WorkList.empty());
+
+  // Now perform CSE.
+  bool Changed = false;
+  for (unsigned i = 0, e = Scopes.size(); i != e; ++i) {
+    MachineDomTreeNode *Node = Scopes[i];
+    MachineBasicBlock *MBB = Node->getBlock();
+    EnterScope(MBB);
+    Changed |= ProcessBlock(MBB);
+    // If it's a leaf node, it's done. Traverse upwards to pop ancestors.
+    ExitScopeIfDone(Node, OpenChildren, ParentMap);
+  }
+
+  return Changed;
+}
+
+bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getTarget().getInstrInfo();
+  TRI = MF.getTarget().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  AA = &getAnalysis<AliasAnalysis>();
+  DT = &getAnalysis<MachineDominatorTree>();
+  return PerformCSE(DT->getRootNode());
+}
diff --git a/src/LLVM/lib/CodeGen/MachineDominators.cpp b/src/LLVM/lib/CodeGen/MachineDominators.cpp
new file mode 100644
index 0000000..36c9d66
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineDominators.cpp
@@ -0,0 +1,58 @@
+//===- MachineDominators.cpp - Machine Dominator Calculation --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements simple dominator construction algorithms for finding
+// forward dominators on machine functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/Passes.h"
+
+using namespace llvm;
+
+namespace llvm {
+TEMPLATE_INSTANTIATION(class DomTreeNodeBase<MachineBasicBlock>);
+TEMPLATE_INSTANTIATION(class DominatorTreeBase<MachineBasicBlock>);
+}
+
+char MachineDominatorTree::ID = 0;
+
+static RegisterPass<MachineDominatorTree>
+E("machinedomtree", "MachineDominator Tree Construction", true);
+
+char &llvm::MachineDominatorsID = MachineDominatorTree::ID;
+
+void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) {
+  DT->recalculate(F);
+
+  return false;
+}
+
+MachineDominatorTree::MachineDominatorTree()
+    : MachineFunctionPass(ID) {
+  DT = new DominatorTreeBase<MachineBasicBlock>(false);
+}
+
+MachineDominatorTree::~MachineDominatorTree() {
+  delete DT;
+}
+
+void MachineDominatorTree::releaseMemory() {
+  DT->releaseMemory();
+}
+
+void MachineDominatorTree::print(raw_ostream &OS, const Module*) const {
+  DT->print(OS);
+}
diff --git a/src/LLVM/lib/CodeGen/MachineFunction.cpp b/src/LLVM/lib/CodeGen/MachineFunction.cpp
new file mode 100644
index 0000000..0171700
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineFunction.cpp
@@ -0,0 +1,739 @@
+//===-- MachineFunction.cpp -----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Collect native machine code information for a function.  This allows
+// target-specific information about the generated code to be stored with each
+// function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Config/config.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MachineFunction implementation
+//===----------------------------------------------------------------------===//
+
+// Out of line virtual method.
+MachineFunctionInfo::~MachineFunctionInfo() {}
+
+void ilist_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) {
+  MBB->getParent()->DeleteMachineBasicBlock(MBB);
+}
+
+MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
+                                 unsigned FunctionNum, MachineModuleInfo &mmi)
+  : Fn(F), Target(TM), Ctx(mmi.getContext()), MMI(mmi) {
+  if (TM.getRegisterInfo())
+    RegInfo = new (Allocator) MachineRegisterInfo(*TM.getRegisterInfo());
+  else
+    RegInfo = 0;
+  MFInfo = 0;
+  FrameInfo = new (Allocator) MachineFrameInfo(*TM.getFrameInfo());
+  if (Fn->hasFnAttr(Attribute::StackAlignment))
+    FrameInfo->setMaxAlignment(Attribute::getStackAlignmentFromAttrs(
+        Fn->getAttributes().getFnAttributes()));
+  ConstantPool = new (Allocator) MachineConstantPool(TM.getTargetData());
+  Alignment = TM.getTargetLowering()->getFunctionAlignment(F);
+  FunctionNumber = FunctionNum;
+  JumpTableInfo = 0;
+}
+
+MachineFunction::~MachineFunction() {
+  BasicBlocks.clear();
+  InstructionRecycler.clear(Allocator);
+  BasicBlockRecycler.clear(Allocator);
+  if (RegInfo) {
+    RegInfo->~MachineRegisterInfo();
+    Allocator.Deallocate(RegInfo);
+  }
+  if (MFInfo) {
+    MFInfo->~MachineFunctionInfo();
+    Allocator.Deallocate(MFInfo);
+  }
+  FrameInfo->~MachineFrameInfo();         Allocator.Deallocate(FrameInfo);
+  ConstantPool->~MachineConstantPool();   Allocator.Deallocate(ConstantPool);
+  
+  if (JumpTableInfo) {
+    JumpTableInfo->~MachineJumpTableInfo();
+    Allocator.Deallocate(JumpTableInfo);
+  }
+}
+
+/// getOrCreateJumpTableInfo - Get the JumpTableInfo for this function. If it
+/// does not already exist, allocate one.
+MachineJumpTableInfo *MachineFunction::
+getOrCreateJumpTableInfo(unsigned EntryKind) {
+  if (JumpTableInfo) return JumpTableInfo;
+  
+  JumpTableInfo = new (Allocator)
+    MachineJumpTableInfo((MachineJumpTableInfo::JTEntryKind)EntryKind);
+  return JumpTableInfo;
+}
+
+/// RenumberBlocks - This discards all of the MachineBasicBlock numbers and
+/// recomputes them.  This guarantees that the MBB numbers are sequential,
+/// dense, and match the ordering of the blocks within the function.  If a
+/// specific MachineBasicBlock is specified, only that block and those after
+/// it are renumbered.
+void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) {
+  if (empty()) { MBBNumbering.clear(); return; }
+  MachineFunction::iterator MBBI, E = end();
+  if (MBB == 0)
+    MBBI = begin();
+  else
+    MBBI = MBB;
+  
+  // Figure out the block number this should have.
+  unsigned BlockNo = 0;
+  if (MBBI != begin())
+    BlockNo = prior(MBBI)->getNumber()+1;
+  
+  for (; MBBI != E; ++MBBI, ++BlockNo) {
+    if (MBBI->getNumber() != (int)BlockNo) {
+      // Remove use of the old number.
+      if (MBBI->getNumber() != -1) {
+        assert(MBBNumbering[MBBI->getNumber()] == &*MBBI &&
+               "MBB number mismatch!");
+        MBBNumbering[MBBI->getNumber()] = 0;
+      }
+      
+      // If BlockNo is already taken, set that block's number to -1.
+      if (MBBNumbering[BlockNo])
+        MBBNumbering[BlockNo]->setNumber(-1);
+
+      MBBNumbering[BlockNo] = MBBI;
+      MBBI->setNumber(BlockNo);
+    }
+  }    
+
+  // Okay, all the blocks are renumbered.  If we have compactified the block
+  // numbering, shrink MBBNumbering now.
+  assert(BlockNo <= MBBNumbering.size() && "Mismatch!");
+  MBBNumbering.resize(BlockNo);
+}
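+// Illustrative example: if the blocks currently carry numbers [0, 1, 3, 4]
+// because block 2 was deleted, renumbering from the start reassigns them to
+// [0, 1, 2, 3], clears the stale MBBNumbering entries, and shrinks
+// MBBNumbering to four slots.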
+
+/// CreateMachineInstr - Allocate a new MachineInstr. Use this instead
+/// of `new MachineInstr'.
+///
+MachineInstr *
+MachineFunction::CreateMachineInstr(const TargetInstrDesc &TID,
+                                    DebugLoc DL, bool NoImp) {
+  return new (InstructionRecycler.Allocate<MachineInstr>(Allocator))
+    MachineInstr(TID, DL, NoImp);
+}
+
+/// CloneMachineInstr - Create a new MachineInstr which is a copy of the
+/// 'Orig' instruction, identical in all ways except the instruction
+/// has no parent, prev, or next.
+///
+MachineInstr *
+MachineFunction::CloneMachineInstr(const MachineInstr *Orig) {
+  return new (InstructionRecycler.Allocate<MachineInstr>(Allocator))
+             MachineInstr(*this, *Orig);
+}
+
+/// DeleteMachineInstr - Delete the given MachineInstr.
+///
+void
+MachineFunction::DeleteMachineInstr(MachineInstr *MI) {
+  MI->~MachineInstr();
+  InstructionRecycler.Deallocate(Allocator, MI);
+}
+
+/// CreateMachineBasicBlock - Allocate a new MachineBasicBlock. Use this
+/// instead of `new MachineBasicBlock'.
+///
+MachineBasicBlock *
+MachineFunction::CreateMachineBasicBlock(const BasicBlock *bb) {
+  return new (BasicBlockRecycler.Allocate<MachineBasicBlock>(Allocator))
+             MachineBasicBlock(*this, bb);
+}
+
+/// DeleteMachineBasicBlock - Delete the given MachineBasicBlock.
+///
+void
+MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) {
+  assert(MBB->getParent() == this && "MBB parent mismatch!");
+  MBB->~MachineBasicBlock();
+  BasicBlockRecycler.Deallocate(Allocator, MBB);
+}
+
+MachineMemOperand *
+MachineFunction::getMachineMemOperand(const Value *v, unsigned f,
+                                      int64_t o, uint64_t s,
+                                      unsigned base_alignment) {
+  return new (Allocator) MachineMemOperand(v, f, o, s, base_alignment);
+}
+
+MachineMemOperand *
+MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
+                                      int64_t Offset, uint64_t Size) {
+  return new (Allocator)
+             MachineMemOperand(MMO->getValue(), MMO->getFlags(),
+                               int64_t(uint64_t(MMO->getOffset()) +
+                                       uint64_t(Offset)),
+                               Size, MMO->getBaseAlignment());
+}
+
+MachineInstr::mmo_iterator
+MachineFunction::allocateMemRefsArray(unsigned long Num) {
+  return Allocator.Allocate<MachineMemOperand *>(Num);
+}
+
+std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator>
+MachineFunction::extractLoadMemRefs(MachineInstr::mmo_iterator Begin,
+                                    MachineInstr::mmo_iterator End) {
+  // Count the number of load mem refs.
+  unsigned Num = 0;
+  for (MachineInstr::mmo_iterator I = Begin; I != End; ++I)
+    if ((*I)->isLoad())
+      ++Num;
+
+  // Allocate a new array and populate it with the load information.
+  MachineInstr::mmo_iterator Result = allocateMemRefsArray(Num);
+  unsigned Index = 0;
+  for (MachineInstr::mmo_iterator I = Begin; I != End; ++I) {
+    if ((*I)->isLoad()) {
+      if (!(*I)->isStore())
+        // Reuse the MMO.
+        Result[Index] = *I;
+      else {
+        // Clone the MMO and unset the store flag.
+        MachineMemOperand *JustLoad =
+          getMachineMemOperand((*I)->getValue(),
+                               (*I)->getFlags() & ~MachineMemOperand::MOStore,
+                               (*I)->getOffset(), (*I)->getSize(),
+                               (*I)->getBaseAlignment());
+        Result[Index] = JustLoad;
+      }
+      ++Index;
+    }
+  }
+  return std::make_pair(Result, Result + Num);
+}
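+// A minimal usage sketch (MI and LoadMI are hypothetical): when splitting a
+// folded load-op instruction, the standalone load keeps only the load
+// memrefs:
+//
+//   std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> Refs =
+//     MF.extractLoadMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+//   LoadMI->setMemRefs(Refs.first, Refs.second);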
+
+std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator>
+MachineFunction::extractStoreMemRefs(MachineInstr::mmo_iterator Begin,
+                                     MachineInstr::mmo_iterator End) {
+  // Count the number of store mem refs.
+  unsigned Num = 0;
+  for (MachineInstr::mmo_iterator I = Begin; I != End; ++I)
+    if ((*I)->isStore())
+      ++Num;
+
+  // Allocate a new array and populate it with the store information.
+  MachineInstr::mmo_iterator Result = allocateMemRefsArray(Num);
+  unsigned Index = 0;
+  for (MachineInstr::mmo_iterator I = Begin; I != End; ++I) {
+    if ((*I)->isStore()) {
+      if (!(*I)->isLoad())
+        // Reuse the MMO.
+        Result[Index] = *I;
+      else {
+        // Clone the MMO and unset the load flag.
+        MachineMemOperand *JustStore =
+          getMachineMemOperand((*I)->getValue(),
+                               (*I)->getFlags() & ~MachineMemOperand::MOLoad,
+                               (*I)->getOffset(), (*I)->getSize(),
+                               (*I)->getBaseAlignment());
+        Result[Index] = JustStore;
+      }
+      ++Index;
+    }
+  }
+  return std::make_pair(Result, Result + Num);
+}
+
+void MachineFunction::dump() const {
+  print(dbgs());
+}
+
+void MachineFunction::print(raw_ostream &OS) const {
+  OS << "# Machine code for function " << Fn->getName() << ":\n";
+
+  // Print Frame Information
+  FrameInfo->print(*this, OS);
+  
+  // Print JumpTable Information
+  if (JumpTableInfo)
+    JumpTableInfo->print(OS);
+
+  // Print Constant Pool
+  ConstantPool->print(OS);
+  
+  const TargetRegisterInfo *TRI = getTarget().getRegisterInfo();
+  
+  if (RegInfo && !RegInfo->livein_empty()) {
+    OS << "Function Live Ins: ";
+    for (MachineRegisterInfo::livein_iterator
+         I = RegInfo->livein_begin(), E = RegInfo->livein_end(); I != E; ++I) {
+      if (TRI)
+        OS << "%" << TRI->getName(I->first);
+      else
+        OS << " %physreg" << I->first;
+      
+      if (I->second)
+        OS << " in reg%" << I->second;
+
+      if (llvm::next(I) != E)
+        OS << ", ";
+    }
+    OS << '\n';
+  }
+  if (RegInfo && !RegInfo->liveout_empty()) {
+    OS << "Function Live Outs: ";
+    for (MachineRegisterInfo::liveout_iterator
+         I = RegInfo->liveout_begin(), E = RegInfo->liveout_end(); I != E; ++I){
+      if (TRI)
+        OS << '%' << TRI->getName(*I);
+      else
+        OS << "%physreg" << *I;
+
+      if (llvm::next(I) != E)
+        OS << " ";
+    }
+    OS << '\n';
+  }
+  
+  for (const_iterator BB = begin(), E = end(); BB != E; ++BB) {
+    OS << '\n';
+    BB->print(OS);
+  }
+
+  OS << "\n# End machine code for function " << Fn->getName() << ".\n\n";
+}
+
+namespace llvm {
+  template<>
+  struct DOTGraphTraits<const MachineFunction*> : public DefaultDOTGraphTraits {
+
+  DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+
+    static std::string getGraphName(const MachineFunction *F) {
+      return "CFG for '" + F->getFunction()->getNameStr() + "' function";
+    }
+
+    std::string getNodeLabel(const MachineBasicBlock *Node,
+                             const MachineFunction *Graph) {
+      if (isSimple () && Node->getBasicBlock() &&
+          !Node->getBasicBlock()->getName().empty())
+        return Node->getBasicBlock()->getNameStr() + ":";
+
+      std::string OutStr;
+      {
+        raw_string_ostream OSS(OutStr);
+        
+        if (isSimple())
+          OSS << Node->getNumber() << ':';
+        else
+          Node->print(OSS);
+      }
+
+      if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());
+
+      // Process string output to make it nicer...
+      for (unsigned i = 0; i != OutStr.length(); ++i)
+        if (OutStr[i] == '\n') {                            // Left justify
+          OutStr[i] = '\\';
+          OutStr.insert(OutStr.begin()+i+1, 'l');
+        }
+      return OutStr;
+    }
+  };
+}
+
+void MachineFunction::viewCFG() const
+{
+#ifndef NDEBUG
+  ViewGraph(this, "mf" + getFunction()->getNameStr());
+#else
+  errs() << "MachineFunction::viewCFG is only available in debug builds on "
+         << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+void MachineFunction::viewCFGOnly() const
+{
+#ifndef NDEBUG
+  ViewGraph(this, "mf" + getFunction()->getNameStr(), true);
+#else
+  errs() << "MachineFunction::viewCFGOnly is only available in debug builds on "
+         << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+/// addLiveIn - Add the specified physical register as a live-in value and
+/// create a corresponding virtual register for it.
+unsigned MachineFunction::addLiveIn(unsigned PReg,
+                                    const TargetRegisterClass *RC) {
+  MachineRegisterInfo &MRI = getRegInfo();
+  unsigned VReg = MRI.getLiveInVirtReg(PReg);
+  if (VReg) {
+    assert(MRI.getRegClass(VReg) == RC && "Register class mismatch!");
+    return VReg;
+  }
+  VReg = MRI.createVirtualRegister(RC);
+  MRI.addLiveIn(PReg, VReg);
+  return VReg;
+}
+
+/// getJTISymbol - Return the MCSymbol for the specified non-empty jump table.
+/// If isLinkerPrivate is specified, an 'l' label is returned, otherwise a
+/// normal 'L' label is returned.
+MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx, 
+                                        bool isLinkerPrivate) const {
+  assert(JumpTableInfo && "No jump tables");
+  
+  assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!");
+  const MCAsmInfo &MAI = *getTarget().getMCAsmInfo();
+  
+  const char *Prefix = isLinkerPrivate ? MAI.getLinkerPrivateGlobalPrefix() :
+                                         MAI.getPrivateGlobalPrefix();
+  SmallString<60> Name;
+  raw_svector_ostream(Name)
+    << Prefix << "JTI" << getFunctionNumber() << '_' << JTI;
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
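+// Example (derived from the code above): with a private global prefix of
+// "L", function number 2 and jump table index 3, the returned symbol is
+// named "LJTI2_3".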
+
+
+//===----------------------------------------------------------------------===//
+//  MachineFrameInfo implementation
+//===----------------------------------------------------------------------===//
+
+/// CreateFixedObject - Create a new object at a fixed location on the stack.
+/// All fixed objects should be created before other objects are created for
+/// efficiency. By default, fixed objects are immutable. This returns an
+/// index with a negative value.
+///
+int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
+                                        bool Immutable) {
+  assert(Size != 0 && "Cannot allocate zero size fixed stack objects!");
+  // The alignment of the frame index can be determined from its offset from
+  // the incoming frame position.  If the frame object is at offset 32 and
+  // the stack is guaranteed to be 16-byte aligned, then we know that the
+  // object is 16-byte aligned.
+  unsigned StackAlign = TFI.getStackAlignment();
+  unsigned Align = MinAlign(SPOffset, StackAlign);
+  Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable,
+                                              /*isSS*/false, false));
+  return -++NumFixedObjects;
+}
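+// Worked example (illustrative): with a 16-byte aligned stack, the first
+// call to CreateFixedObject(8, /*SPOffset=*/4, /*Immutable=*/true) computes
+// Align = MinAlign(4, 16) = 4 and returns index -1; the next fixed object
+// gets index -2, and so on.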
+
+
+BitVector
+MachineFrameInfo::getPristineRegs(const MachineBasicBlock *MBB) const {
+  assert(MBB && "MBB must be valid");
+  const MachineFunction *MF = MBB->getParent();
+  assert(MF && "MBB must be part of a MachineFunction");
+  const TargetMachine &TM = MF->getTarget();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  BitVector BV(TRI->getNumRegs());
+
+  // Before CSI is calculated, no registers are considered pristine. They can be
+  // freely used and PEI will make sure they are saved.
+  if (!isCalleeSavedInfoValid())
+    return BV;
+
+  for (const unsigned *CSR = TRI->getCalleeSavedRegs(MF); CSR && *CSR; ++CSR)
+    BV.set(*CSR);
+
+  // The entry MBB always has all CSRs pristine.
+  if (MBB == &MF->front())
+    return BV;
+
+  // On other MBBs the saved CSRs are not pristine.
+  const std::vector<CalleeSavedInfo> &CSI = getCalleeSavedInfo();
+  for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+         E = CSI.end(); I != E; ++I)
+    BV.reset(I->getReg());
+
+  return BV;
+}
+
+
+void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
+  if (Objects.empty()) return;
+
+  const TargetFrameInfo *FI = MF.getTarget().getFrameInfo();
+  int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0);
+
+  OS << "Frame Objects:\n";
+
+  for (unsigned i = 0, e = Objects.size(); i != e; ++i) {
+    const StackObject &SO = Objects[i];
+    OS << "  fi#" << (int)(i-NumFixedObjects) << ": ";
+    if (SO.Size == ~0ULL) {
+      OS << "dead\n";
+      continue;
+    }
+    if (SO.Size == 0)
+      OS << "variable sized";
+    else
+      OS << "size=" << SO.Size;
+    OS << ", align=" << SO.Alignment;
+
+    if (i < NumFixedObjects)
+      OS << ", fixed";
+    if (i < NumFixedObjects || SO.SPOffset != -1) {
+      int64_t Off = SO.SPOffset - ValOffset;
+      OS << ", at location [SP";
+      if (Off > 0)
+        OS << "+" << Off;
+      else if (Off < 0)
+        OS << Off;
+      OS << "]";
+    }
+    OS << "\n";
+  }
+}
+
+void MachineFrameInfo::dump(const MachineFunction &MF) const {
+  print(MF, dbgs());
+}
+
+//===----------------------------------------------------------------------===//
+//  MachineJumpTableInfo implementation
+//===----------------------------------------------------------------------===//
+
+/// getEntrySize - Return the size of each entry in the jump table.
+unsigned MachineJumpTableInfo::getEntrySize(const TargetData &TD) const {
+  // The size of a jump table entry is 4 bytes unless the entry is just the
+  // address of a block, in which case it is the pointer size.
+  switch (getEntryKind()) {
+  case MachineJumpTableInfo::EK_BlockAddress:
+    return TD.getPointerSize();
+  case MachineJumpTableInfo::EK_GPRel32BlockAddress:
+  case MachineJumpTableInfo::EK_LabelDifference32:
+  case MachineJumpTableInfo::EK_Custom32:
+    return 4;
+  case MachineJumpTableInfo::EK_Inline:
+    return 0;
+  }
+  assert(0 && "Unknown jump table encoding!");
+  return ~0;
+}
+
+/// getEntryAlignment - Return the alignment of each entry in the jump table.
+unsigned MachineJumpTableInfo::getEntryAlignment(const TargetData &TD) const {
+  // The alignment of a jump table entry is the alignment of int32 unless the
+  // entry is just the address of a block, in which case it is the pointer
+  // alignment.
+  switch (getEntryKind()) {
+  case MachineJumpTableInfo::EK_BlockAddress:
+    return TD.getPointerABIAlignment();
+  case MachineJumpTableInfo::EK_GPRel32BlockAddress:
+  case MachineJumpTableInfo::EK_LabelDifference32:
+  case MachineJumpTableInfo::EK_Custom32:
+    return TD.getABIIntegerTypeAlignment(32);
+  case MachineJumpTableInfo::EK_Inline:
+    return 1;
+  }
+  assert(0 && "Unknown jump table encoding!");
+  return ~0;
+}
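+// For example (illustrative): on a 64-bit target, an EK_BlockAddress entry
+// is 8 bytes with pointer ABI alignment, while an EK_LabelDifference32
+// entry is 4 bytes with 32-bit integer ABI alignment.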
+
+/// createJumpTableIndex - Create a new jump table entry in the jump table info.
+///
+unsigned MachineJumpTableInfo::createJumpTableIndex(
+                               const std::vector<MachineBasicBlock*> &DestBBs) {
+  assert(!DestBBs.empty() && "Cannot create an empty jump table!");
+  JumpTables.push_back(MachineJumpTableEntry(DestBBs));
+  return JumpTables.size()-1;
+}
+
+/// ReplaceMBBInJumpTables - If Old is the target of any jump tables, update
+/// the jump tables to branch to New instead.
+bool MachineJumpTableInfo::ReplaceMBBInJumpTables(MachineBasicBlock *Old,
+                                                  MachineBasicBlock *New) {
+  assert(Old != New && "Not making a change?");
+  bool MadeChange = false;
+  for (size_t i = 0, e = JumpTables.size(); i != e; ++i)
+    MadeChange |= ReplaceMBBInJumpTable(i, Old, New);
+  return MadeChange;
+}
+
+/// ReplaceMBBInJumpTable - If Old is a target of the jump tables, update
+/// the jump table to branch to New instead.
+bool MachineJumpTableInfo::ReplaceMBBInJumpTable(unsigned Idx,
+                                                 MachineBasicBlock *Old,
+                                                 MachineBasicBlock *New) {
+  assert(Old != New && "Not making a change?");
+  bool MadeChange = false;
+  MachineJumpTableEntry &JTE = JumpTables[Idx];
+  for (size_t j = 0, e = JTE.MBBs.size(); j != e; ++j)
+    if (JTE.MBBs[j] == Old) {
+      JTE.MBBs[j] = New;
+      MadeChange = true;
+    }
+  return MadeChange;
+}
+
+void MachineJumpTableInfo::print(raw_ostream &OS) const {
+  if (JumpTables.empty()) return;
+
+  OS << "Jump Tables:\n";
+
+  for (unsigned i = 0, e = JumpTables.size(); i != e; ++i) {
+    OS << "  jt#" << i << ": ";
+    for (unsigned j = 0, f = JumpTables[i].MBBs.size(); j != f; ++j)
+      OS << " BB#" << JumpTables[i].MBBs[j]->getNumber();
+  }
+
+  OS << '\n';
+}
+
+void MachineJumpTableInfo::dump() const { print(dbgs()); }
+
+
+//===----------------------------------------------------------------------===//
+//  MachineConstantPool implementation
+//===----------------------------------------------------------------------===//
+
+const Type *MachineConstantPoolEntry::getType() const {
+  if (isMachineConstantPoolEntry())
+    return Val.MachineCPVal->getType();
+  return Val.ConstVal->getType();
+}
+
+
+unsigned MachineConstantPoolEntry::getRelocationInfo() const {
+  if (isMachineConstantPoolEntry())
+    return Val.MachineCPVal->getRelocationInfo();
+  return Val.ConstVal->getRelocationInfo();
+}
+
+MachineConstantPool::~MachineConstantPool() {
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i)
+    if (Constants[i].isMachineConstantPoolEntry())
+      delete Constants[i].Val.MachineCPVal;
+}
+
+/// CanShareConstantPoolEntry - Test whether the given two constants
+/// can be allocated the same constant pool entry.
+static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
+                                      const TargetData *TD) {
+  // Handle the trivial case quickly.
+  if (A == B) return true;
+
+  // If they have the same type but weren't the same constant, quickly
+  // reject them.
+  if (A->getType() == B->getType()) return false;
+
+  // For now, only support constants with the same size.
+  if (TD->getTypeStoreSize(A->getType()) != TD->getTypeStoreSize(B->getType()))
+    return false;
+
+  // If a floating-point value and an integer value have the same encoding,
+  // they can share a constant-pool entry.
+  if (const ConstantFP *AFP = dyn_cast<ConstantFP>(A))
+    if (const ConstantInt *BI = dyn_cast<ConstantInt>(B))
+      return AFP->getValueAPF().bitcastToAPInt() == BI->getValue();
+  if (const ConstantFP *BFP = dyn_cast<ConstantFP>(B))
+    if (const ConstantInt *AI = dyn_cast<ConstantInt>(A))
+      return BFP->getValueAPF().bitcastToAPInt() == AI->getValue();
+
+  // Two vectors can share an entry if each pair of corresponding
+  // elements could.
+  if (const ConstantVector *AV = dyn_cast<ConstantVector>(A))
+    if (const ConstantVector *BV = dyn_cast<ConstantVector>(B)) {
+      if (AV->getType()->getNumElements() != BV->getType()->getNumElements())
+        return false;
+      for (unsigned i = 0, e = AV->getType()->getNumElements(); i != e; ++i)
+        if (!CanShareConstantPoolEntry(AV->getOperand(i),
+                                       BV->getOperand(i), TD))
+          return false;
+      return true;
+    }
+
+  // TODO: Handle other cases.
+
+  return false;
+}
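+// Example (illustrative): float 1.0f and i32 0x3F800000 have the same
+// 4-byte encoding, so they may share one constant-pool entry. Likewise
+// <2 x float> <1.0, 2.0> and <2 x i32> <0x3F800000, 0x40000000> can share,
+// since each pair of corresponding elements can.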
+
+/// getConstantPoolIndex - Create a new entry in the constant pool or return
+/// an existing one.  User must specify the minimum required alignment, in
+/// bytes, for the object.
+///
+unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C, 
+                                                   unsigned Alignment) {
+  assert(Alignment && "Alignment must be specified!");
+  if (Alignment > PoolAlignment) PoolAlignment = Alignment;
+
+  // Check to see if we already have this constant.
+  //
+  // FIXME, this could be made much more efficient for large constant pools.
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i)
+    if (!Constants[i].isMachineConstantPoolEntry() &&
+        CanShareConstantPoolEntry(Constants[i].Val.ConstVal, C, TD)) {
+      if ((unsigned)Constants[i].getAlignment() < Alignment)
+        Constants[i].Alignment = Alignment;
+      return i;
+    }
+  
+  Constants.push_back(MachineConstantPoolEntry(C, Alignment));
+  return Constants.size()-1;
+}
+
+unsigned MachineConstantPool::getConstantPoolIndex(MachineConstantPoolValue *V,
+                                                   unsigned Alignment) {
+  assert(Alignment && "Alignment must be specified!");
+  if (Alignment > PoolAlignment) PoolAlignment = Alignment;
+  
+  // Check to see if we already have this constant.
+  //
+  // FIXME, this could be made much more efficient for large constant pools.
+  int Idx = V->getExistingMachineCPValue(this, Alignment);
+  if (Idx != -1)
+    return (unsigned)Idx;
+
+  Constants.push_back(MachineConstantPoolEntry(V, Alignment));
+  return Constants.size()-1;
+}
+
+void MachineConstantPool::print(raw_ostream &OS) const {
+  if (Constants.empty()) return;
+
+  OS << "Constant Pool:\n";
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+    OS << "  cp#" << i << ": ";
+    if (Constants[i].isMachineConstantPoolEntry())
+      Constants[i].Val.MachineCPVal->print(OS);
+    else
+      OS << *(Value*)Constants[i].Val.ConstVal;
+    OS << ", align=" << Constants[i].getAlignment();
+    OS << "\n";
+  }
+}
+
+void MachineConstantPool::dump() const { print(dbgs()); }
diff --git a/src/LLVM/lib/CodeGen/MachineFunctionAnalysis.cpp b/src/LLVM/lib/CodeGen/MachineFunctionAnalysis.cpp
new file mode 100644
index 0000000..4f84b95
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineFunctionAnalysis.cpp
@@ -0,0 +1,62 @@
+//===-- MachineFunctionAnalysis.cpp ---------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the MachineFunctionAnalysis members.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+using namespace llvm;
+
+// Register this pass with PassInfo directly to avoid having to define
+// a default constructor.
+static PassInfo
+X("Machine Function Analysis", "machine-function-analysis",
+   &MachineFunctionAnalysis::ID, 0,
+  /*CFGOnly=*/false, /*is_analysis=*/true);
+
+char MachineFunctionAnalysis::ID = 0;
+
+MachineFunctionAnalysis::MachineFunctionAnalysis(const TargetMachine &tm,
+                                                 CodeGenOpt::Level OL) :
+  FunctionPass(ID), TM(tm), OptLevel(OL), MF(0) {
+}
+
+MachineFunctionAnalysis::~MachineFunctionAnalysis() {
+  releaseMemory();
+  assert(!MF && "MachineFunctionAnalysis left initialized!");
+}
+
+void MachineFunctionAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<MachineModuleInfo>();
+}
+
+bool MachineFunctionAnalysis::doInitialization(Module &M) {
+  MachineModuleInfo *MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+  assert(MMI && "MMI not around yet??");
+  MMI->setModule(&M);
+  NextFnNum = 0;
+  return false;
+}
+
+
+bool MachineFunctionAnalysis::runOnFunction(Function &F) {
+  assert(!MF && "MachineFunctionAnalysis already initialized!");
+  MF = new MachineFunction(&F, TM, NextFnNum++,
+                           getAnalysis<MachineModuleInfo>());
+  return false;
+}
+
+void MachineFunctionAnalysis::releaseMemory() {
+  delete MF;
+  MF = 0;
+}
diff --git a/src/LLVM/lib/CodeGen/MachineFunctionPass.cpp b/src/LLVM/lib/CodeGen/MachineFunctionPass.cpp
new file mode 100644
index 0000000..e5a4912
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineFunctionPass.cpp
@@ -0,0 +1,56 @@
+//===-- MachineFunctionPass.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the MachineFunctionPass members.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Function.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+using namespace llvm;
+
+Pass *MachineFunctionPass::createPrinterPass(raw_ostream &O,
+                                             const std::string &Banner) const {
+  return createMachineFunctionPrinterPass(O, Banner);
+}
+
+bool MachineFunctionPass::runOnFunction(Function &F) {
+  // Do not codegen any 'available_externally' functions at all, they have
+  // definitions outside the translation unit.
+  if (F.hasAvailableExternallyLinkage())
+    return false;
+
+  MachineFunction &MF = getAnalysis<MachineFunctionAnalysis>().getMF();
+  return runOnMachineFunction(MF);
+}
+
+void MachineFunctionPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineFunctionAnalysis>();
+  AU.addPreserved<MachineFunctionAnalysis>();
+
+  // MachineFunctionPass preserves all LLVM IR passes, but there's no
+  // high-level way to express this. Instead, just list a bunch of
+  // passes explicitly. This does not include setPreservesCFG,
+  // because CodeGen overloads that to mean preserving the MachineBasicBlock
+  // CFG in addition to the LLVM IR CFG.
+  AU.addPreserved<AliasAnalysis>();
+  AU.addPreserved("scalar-evolution");
+  AU.addPreserved("iv-users");
+  AU.addPreserved("memdep");
+  AU.addPreserved("live-values");
+  AU.addPreserved("domtree");
+  AU.addPreserved("domfrontier");
+  AU.addPreserved("loops");
+  AU.addPreserved("lda");
+
+  FunctionPass::getAnalysisUsage(AU);
+}
diff --git a/src/LLVM/lib/CodeGen/MachineFunctionPrinterPass.cpp b/src/LLVM/lib/CodeGen/MachineFunctionPrinterPass.cpp
new file mode 100644
index 0000000..2aaa798
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -0,0 +1,60 @@
+//===-- MachineFunctionPrinterPass.cpp ------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineFunctionPrinterPass implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+/// MachineFunctionPrinterPass - This is a pass to dump the IR of a
+/// MachineFunction.
+///
+struct MachineFunctionPrinterPass : public MachineFunctionPass {
+  static char ID;
+
+  raw_ostream &OS;
+  const std::string Banner;
+
+  MachineFunctionPrinterPass(raw_ostream &os, const std::string &banner) 
+      : MachineFunctionPass(ID), OS(os), Banner(banner) {}
+
+  const char *getPassName() const { return "MachineFunction Printer"; }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) {
+    OS << "# " << Banner << ":\n";
+    MF.print(OS);
+    return false;
+  }
+};
+
+char MachineFunctionPrinterPass::ID = 0;
+}
+
+namespace llvm {
+/// Returns a newly-created MachineFunction Printer pass. The
+/// default banner is empty.
+///
+MachineFunctionPass *createMachineFunctionPrinterPass(raw_ostream &OS,
+                                                      const std::string &Banner){
+  return new MachineFunctionPrinterPass(OS, Banner);
+}
+
+}
diff --git a/src/LLVM/lib/CodeGen/MachineInstr.cpp b/src/LLVM/lib/CodeGen/MachineInstr.cpp
new file mode 100644
index 0000000..1602a7d
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineInstr.cpp
@@ -0,0 +1,1492 @@
+//===-- lib/CodeGen/MachineInstr.cpp --------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Methods common to all machine instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Metadata.h"
+#include "llvm/Type.h"
+#include "llvm/Value.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetInstrDesc.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LeakDetector.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/FoldingSet.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MachineOperand Implementation
+//===----------------------------------------------------------------------===//
+
+/// AddRegOperandToRegInfo - Add this register operand to the specified
+/// MachineRegisterInfo.  If it is null, then the next/prev fields should be
+/// explicitly nulled out.
+void MachineOperand::AddRegOperandToRegInfo(MachineRegisterInfo *RegInfo) {
+  assert(isReg() && "Can only add reg operand to use lists");
+  
+  // If the reginfo pointer is null, just explicitly null out our next/prev
+  // pointers, to ensure they are not garbage.
+  if (RegInfo == 0) {
+    Contents.Reg.Prev = 0;
+    Contents.Reg.Next = 0;
+    return;
+  }
+  
+  // Otherwise, add this operand to the head of the registers use/def list.
+  MachineOperand **Head = &RegInfo->getRegUseDefListHead(getReg());
+  
+  // For SSA values, we prefer to keep the definition at the start of the list.
+  // We do this by skipping over the definition if it is at the head of the
+  // list.
+  if (*Head && (*Head)->isDef())
+    Head = &(*Head)->Contents.Reg.Next;
+  
+  Contents.Reg.Next = *Head;
+  if (Contents.Reg.Next) {
+    assert(getReg() == Contents.Reg.Next->getReg() &&
+           "Different regs on the same list!");
+    Contents.Reg.Next->Contents.Reg.Prev = &Contents.Reg.Next;
+  }
+  
+  Contents.Reg.Prev = Head;
+  *Head = this;
+}
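+// Sketch of the resulting list shape (illustrative): for a register with a
+// def D and uses U1, U2 added in that order, the chain is kept as
+// D -> U2 -> U1. New operands are pushed at the head, except that a def
+// already at the head is skipped over, preserving the definition-first
+// invariant for SSA values.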
+
+/// RemoveRegOperandFromRegInfo - Remove this register operand from the
+/// MachineRegisterInfo it is linked with.
+void MachineOperand::RemoveRegOperandFromRegInfo() {
+  assert(isOnRegUseList() && "Reg operand is not on a use list");
+  // Unlink this from the doubly linked list of operands.
+  MachineOperand *NextOp = Contents.Reg.Next;
+  *Contents.Reg.Prev = NextOp; 
+  if (NextOp) {
+    assert(NextOp->getReg() == getReg() && "Corrupt reg use/def chain!");
+    NextOp->Contents.Reg.Prev = Contents.Reg.Prev;
+  }
+  Contents.Reg.Prev = 0;
+  Contents.Reg.Next = 0;
+}
+
+void MachineOperand::setReg(unsigned Reg) {
+  if (getReg() == Reg) return; // No change.
+  
+  // Otherwise, we have to change the register.  If this operand is embedded
+  // into a machine function, we need to update the old and new register's
+  // use/def lists.
+  if (MachineInstr *MI = getParent())
+    if (MachineBasicBlock *MBB = MI->getParent())
+      if (MachineFunction *MF = MBB->getParent()) {
+        RemoveRegOperandFromRegInfo();
+        Contents.Reg.RegNo = Reg;
+        AddRegOperandToRegInfo(&MF->getRegInfo());
+        return;
+      }
+        
+  // Otherwise, just change the register, no problem.  :)
+  Contents.Reg.RegNo = Reg;
+}
+
+void MachineOperand::substVirtReg(unsigned Reg, unsigned SubIdx,
+                                  const TargetRegisterInfo &TRI) {
+  assert(TargetRegisterInfo::isVirtualRegister(Reg));
+  if (SubIdx && getSubReg())
+    SubIdx = TRI.composeSubRegIndices(SubIdx, getSubReg());
+  setReg(Reg);
+  if (SubIdx)
+    setSubReg(SubIdx);
+}
+
+void MachineOperand::substPhysReg(unsigned Reg, const TargetRegisterInfo &TRI) {
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+  if (getSubReg()) {
+    Reg = TRI.getSubReg(Reg, getSubReg());
+    assert(Reg && "Invalid SubReg for physical register");
+    setSubReg(0);
+  }
+  setReg(Reg);
+}
+
+/// ChangeToImmediate - Replace this operand with a new immediate operand of
+/// the specified value.  If an operand is known to be an immediate already,
+/// the setImm method should be used.
+void MachineOperand::ChangeToImmediate(int64_t ImmVal) {
+  // If this operand is currently a register operand, and if this is in a
+  // function, deregister the operand from the register's use/def list.
+  if (isReg() && getParent() && getParent()->getParent() &&
+      getParent()->getParent()->getParent())
+    RemoveRegOperandFromRegInfo();
+  
+  OpKind = MO_Immediate;
+  Contents.ImmVal = ImmVal;
+}
+
+/// ChangeToRegister - Replace this operand with a new register operand of
+/// the specified value.  If an operand is known to be a register already,
+/// the setReg method should be used.
+void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp,
+                                      bool isKill, bool isDead, bool isUndef,
+                                      bool isDebug) {
+  // If this operand is already a register operand, use setReg to update the 
+  // register's use/def lists.
+  if (isReg()) {
+    assert(!isEarlyClobber());
+    setReg(Reg);
+  } else {
+    // Otherwise, change this to a register and set the reg#.
+    OpKind = MO_Register;
+    Contents.Reg.RegNo = Reg;
+
+    // If this operand is embedded in a function, add the operand to the
+    // register's use/def list.
+    if (MachineInstr *MI = getParent())
+      if (MachineBasicBlock *MBB = MI->getParent())
+        if (MachineFunction *MF = MBB->getParent())
+          AddRegOperandToRegInfo(&MF->getRegInfo());
+  }
+
+  IsDef = isDef;
+  IsImp = isImp;
+  IsKill = isKill;
+  IsDead = isDead;
+  IsUndef = isUndef;
+  IsEarlyClobber = false;
+  IsDebug = isDebug;
+  SubReg = 0;
+}
+
+/// isIdenticalTo - Return true if this operand is identical to the specified
+/// operand.
+bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
+  if (getType() != Other.getType() ||
+      getTargetFlags() != Other.getTargetFlags())
+    return false;
+  
+  switch (getType()) {
+  default: llvm_unreachable("Unrecognized operand type");
+  case MachineOperand::MO_Register:
+    return getReg() == Other.getReg() && isDef() == Other.isDef() &&
+           getSubReg() == Other.getSubReg();
+  case MachineOperand::MO_Immediate:
+    return getImm() == Other.getImm();
+  case MachineOperand::MO_FPImmediate:
+    return getFPImm() == Other.getFPImm();
+  case MachineOperand::MO_MachineBasicBlock:
+    return getMBB() == Other.getMBB();
+  case MachineOperand::MO_FrameIndex:
+    return getIndex() == Other.getIndex();
+  case MachineOperand::MO_ConstantPoolIndex:
+    return getIndex() == Other.getIndex() && getOffset() == Other.getOffset();
+  case MachineOperand::MO_JumpTableIndex:
+    return getIndex() == Other.getIndex();
+  case MachineOperand::MO_GlobalAddress:
+    return getGlobal() == Other.getGlobal() && getOffset() == Other.getOffset();
+  case MachineOperand::MO_ExternalSymbol:
+    return !strcmp(getSymbolName(), Other.getSymbolName()) &&
+           getOffset() == Other.getOffset();
+  case MachineOperand::MO_BlockAddress:
+    return getBlockAddress() == Other.getBlockAddress();
+  case MachineOperand::MO_MCSymbol:
+    return getMCSymbol() == Other.getMCSymbol();
+  case MachineOperand::MO_Metadata:
+    return getMetadata() == Other.getMetadata();
+  }
+}
+
+/// print - Print the specified machine operand.
+///
+void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
+  // If the instruction is embedded into a basic block, we can find the
+  // target info for the instruction.
+  if (!TM)
+    if (const MachineInstr *MI = getParent())
+      if (const MachineBasicBlock *MBB = MI->getParent())
+        if (const MachineFunction *MF = MBB->getParent())
+          TM = &MF->getTarget();
+
+  switch (getType()) {
+  case MachineOperand::MO_Register:
+    if (getReg() == 0 || TargetRegisterInfo::isVirtualRegister(getReg())) {
+      OS << "%reg" << getReg();
+    } else {
+      if (TM)
+        OS << "%" << TM->getRegisterInfo()->get(getReg()).Name;
+      else
+        OS << "%physreg" << getReg();
+    }
+
+    if (getSubReg() != 0) {
+      if (TM)
+        OS << ':' << TM->getRegisterInfo()->getSubRegIndexName(getSubReg());
+      else
+        OS << ':' << getSubReg();
+    }
+
+    if (isDef() || isKill() || isDead() || isImplicit() || isUndef() ||
+        isEarlyClobber()) {
+      OS << '<';
+      bool NeedComma = false;
+      if (isDef()) {
+        if (NeedComma) OS << ',';
+        if (isEarlyClobber())
+          OS << "earlyclobber,";
+        if (isImplicit())
+          OS << "imp-";
+        OS << "def";
+        NeedComma = true;
+      } else if (isImplicit()) {
+          OS << "imp-use";
+          NeedComma = true;
+      }
+
+      if (isKill() || isDead() || isUndef()) {
+        if (NeedComma) OS << ',';
+        if (isKill())  OS << "kill";
+        if (isDead())  OS << "dead";
+        if (isUndef()) {
+          if (isKill() || isDead())
+            OS << ',';
+          OS << "undef";
+        }
+      }
+      OS << '>';
+    }
+    break;
+  case MachineOperand::MO_Immediate:
+    OS << getImm();
+    break;
+  case MachineOperand::MO_FPImmediate:
+    if (getFPImm()->getType()->isFloatTy())
+      OS << getFPImm()->getValueAPF().convertToFloat();
+    else
+      OS << getFPImm()->getValueAPF().convertToDouble();
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    OS << "<BB#" << getMBB()->getNumber() << ">";
+    break;
+  case MachineOperand::MO_FrameIndex:
+    OS << "<fi#" << getIndex() << '>';
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    OS << "<cp#" << getIndex();
+    if (getOffset()) OS << "+" << getOffset();
+    OS << '>';
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    OS << "<jt#" << getIndex() << '>';
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    OS << "<ga:";
+    WriteAsOperand(OS, getGlobal(), /*PrintType=*/false);
+    if (getOffset()) OS << "+" << getOffset();
+    OS << '>';
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    OS << "<es:" << getSymbolName();
+    if (getOffset()) OS << "+" << getOffset();
+    OS << '>';
+    break;
+  case MachineOperand::MO_BlockAddress:
+    OS << '<';
+    WriteAsOperand(OS, getBlockAddress(), /*PrintType=*/false);
+    OS << '>';
+    break;
+  case MachineOperand::MO_Metadata:
+    OS << '<';
+    WriteAsOperand(OS, getMetadata(), /*PrintType=*/false);
+    OS << '>';
+    break;
+  case MachineOperand::MO_MCSymbol:
+    OS << "<MCSym=" << *getMCSymbol() << '>';
+    break;
+  default:
+    llvm_unreachable("Unrecognized operand type");
+  }
+  
+  if (unsigned TF = getTargetFlags())
+    OS << "[TF=" << TF << ']';
+}
+
+//===----------------------------------------------------------------------===//
+// MachineMemOperand Implementation
+//===----------------------------------------------------------------------===//
+
+MachineMemOperand::MachineMemOperand(const Value *v, unsigned int f,
+                                     int64_t o, uint64_t s, unsigned int a)
+  : Offset(o), Size(s), V(v),
+    Flags((f & ((1 << MOMaxBits) - 1)) | ((Log2_32(a) + 1) << MOMaxBits)) {
+  assert(getBaseAlignment() == a && "Alignment is not a power of 2!");
+  assert((isLoad() || isStore()) && "Not a load/store!");
+}
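+// Illustrative trace of the flag packing above: for a = 8, Log2_32(8) + 1
+// == 4 is stored in the bits above MOMaxBits, leaving the low MOMaxBits
+// bits for the load/store/volatile flags; getBaseAlignment() decodes that
+// field back to 8, as the assert checks.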
+
+/// Profile - Gather unique data for the object.
+///
+void MachineMemOperand::Profile(FoldingSetNodeID &ID) const {
+  ID.AddInteger(Offset);
+  ID.AddInteger(Size);
+  ID.AddPointer(V);
+  ID.AddInteger(Flags);
+}
+
+void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) {
+  // The Value and Offset may differ due to CSE. But the flags and size
+  // should be the same.
+  assert(MMO->getFlags() == getFlags() && "Flags mismatch!");
+  assert(MMO->getSize() == getSize() && "Size mismatch!");
+
+  if (MMO->getBaseAlignment() >= getBaseAlignment()) {
+    // Update the alignment value.
+    Flags = (Flags & ((1 << MOMaxBits) - 1)) |
+      ((Log2_32(MMO->getBaseAlignment()) + 1) << MOMaxBits);
+    // Also update the base and offset, because the new alignment may
+    // not be applicable with the old ones.
+    V = MMO->getValue();
+    Offset = MMO->getOffset();
+  }
+}
+
+/// getAlignment - Return the minimum known alignment in bytes of the
+/// actual memory reference.
+uint64_t MachineMemOperand::getAlignment() const {
+  return MinAlign(getBaseAlignment(), getOffset());
+}
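+
+// Editorial sketch (not part of the drop): the flags word stores the base
+// alignment as Log2(align)+1 above MOMaxBits, and getAlignment() folds in
+// the offset. Assuming an 8-byte-aligned base accessed at offset 4:
+//
+//   MachineMemOperand MMO(V, MachineMemOperand::MOLoad,
+//                         /*Offset=*/4, /*Size=*/4, /*Alignment=*/8);
+//   assert(MMO.getBaseAlignment() == 8);
+//   assert(MMO.getAlignment() == 4);   // MinAlign(8, 4)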
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineMemOperand &MMO) {
+  assert((MMO.isLoad() || MMO.isStore()) &&
+         "SV has to be a load, store or both.");
+  
+  if (MMO.isVolatile())
+    OS << "Volatile ";
+
+  if (MMO.isLoad())
+    OS << "LD";
+  if (MMO.isStore())
+    OS << "ST";
+  OS << MMO.getSize();
+  
+  // Print the address information.
+  OS << "[";
+  if (!MMO.getValue())
+    OS << "<unknown>";
+  else
+    WriteAsOperand(OS, MMO.getValue(), /*PrintType=*/false);
+
+  // If the alignment of the memory reference itself differs from the alignment
+  // of the base pointer, print the base alignment explicitly, next to the base
+  // pointer.
+  if (MMO.getBaseAlignment() != MMO.getAlignment())
+    OS << "(align=" << MMO.getBaseAlignment() << ")";
+
+  if (MMO.getOffset() != 0)
+    OS << "+" << MMO.getOffset();
+  OS << "]";
+
+  // Print the alignment of the reference.
+  if (MMO.getBaseAlignment() != MMO.getAlignment() ||
+      MMO.getBaseAlignment() != MMO.getSize())
+    OS << "(align=" << MMO.getAlignment() << ")";
+
+  return OS;
+}
+
+//===----------------------------------------------------------------------===//
+// MachineInstr Implementation
+//===----------------------------------------------------------------------===//
+
+/// MachineInstr ctor - This constructor creates a dummy MachineInstr with
+/// TID NULL and no operands.
+MachineInstr::MachineInstr()
+  : TID(0), NumImplicitOps(0), AsmPrinterFlags(0), MemRefs(0), MemRefsEnd(0),
+    Parent(0) {
+  // Make sure that we get added to a machine basicblock
+  LeakDetector::addGarbageObject(this);
+}
+
+void MachineInstr::addImplicitDefUseOperands() {
+  if (TID->ImplicitDefs)
+    for (const unsigned *ImpDefs = TID->ImplicitDefs; *ImpDefs; ++ImpDefs)
+      addOperand(MachineOperand::CreateReg(*ImpDefs, true, true));
+  if (TID->ImplicitUses)
+    for (const unsigned *ImpUses = TID->ImplicitUses; *ImpUses; ++ImpUses)
+      addOperand(MachineOperand::CreateReg(*ImpUses, false, true));
+}
+
+/// MachineInstr ctor - This constructor creates a MachineInstr and adds the
+/// implicit operands. It reserves space for the number of operands specified by
+/// the TargetInstrDesc.
+MachineInstr::MachineInstr(const TargetInstrDesc &tid, bool NoImp)
+  : TID(&tid), NumImplicitOps(0), AsmPrinterFlags(0),
+    MemRefs(0), MemRefsEnd(0), Parent(0) {
+  if (!NoImp)
+    NumImplicitOps = TID->getNumImplicitDefs() + TID->getNumImplicitUses();
+  Operands.reserve(NumImplicitOps + TID->getNumOperands());
+  if (!NoImp)
+    addImplicitDefUseOperands();
+  // Make sure that we get added to a machine basicblock
+  LeakDetector::addGarbageObject(this);
+}
+
+/// MachineInstr ctor - As above, but with a DebugLoc.
+MachineInstr::MachineInstr(const TargetInstrDesc &tid, const DebugLoc dl,
+                           bool NoImp)
+  : TID(&tid), NumImplicitOps(0), AsmPrinterFlags(0), MemRefs(0), MemRefsEnd(0),
+    Parent(0), debugLoc(dl) {
+  if (!NoImp)
+    NumImplicitOps = TID->getNumImplicitDefs() + TID->getNumImplicitUses();
+  Operands.reserve(NumImplicitOps + TID->getNumOperands());
+  if (!NoImp)
+    addImplicitDefUseOperands();
+  // Make sure that we get added to a machine basicblock
+  LeakDetector::addGarbageObject(this);
+}
+
+/// MachineInstr ctor - Works exactly the same as the two ctors above, except
+/// that the MachineInstr is created and added to the end of the specified
+/// basic block.
+MachineInstr::MachineInstr(MachineBasicBlock *MBB, const TargetInstrDesc &tid)
+  : TID(&tid), NumImplicitOps(0), AsmPrinterFlags(0),
+    MemRefs(0), MemRefsEnd(0), Parent(0) {
+  assert(MBB && "Cannot use inserting ctor with null basic block!");
+  NumImplicitOps = TID->getNumImplicitDefs() + TID->getNumImplicitUses();
+  Operands.reserve(NumImplicitOps + TID->getNumOperands());
+  addImplicitDefUseOperands();
+  // Make sure that we get added to a machine basicblock
+  LeakDetector::addGarbageObject(this);
+  MBB->push_back(this);  // Add instruction to end of basic block!
+}
+
+/// MachineInstr ctor - As above, but with a DebugLoc.
+///
+MachineInstr::MachineInstr(MachineBasicBlock *MBB, const DebugLoc dl,
+                           const TargetInstrDesc &tid)
+  : TID(&tid), NumImplicitOps(0), AsmPrinterFlags(0), MemRefs(0), MemRefsEnd(0),
+    Parent(0), debugLoc(dl) {
+  assert(MBB && "Cannot use inserting ctor with null basic block!");
+  NumImplicitOps = TID->getNumImplicitDefs() + TID->getNumImplicitUses();
+  Operands.reserve(NumImplicitOps + TID->getNumOperands());
+  addImplicitDefUseOperands();
+  // Make sure that we get added to a machine basicblock
+  LeakDetector::addGarbageObject(this);
+  MBB->push_back(this);  // Add instruction to end of basic block!
+}
+
+/// MachineInstr ctor - Copies MachineInstr arg exactly
+///
+MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
+  : TID(&MI.getDesc()), NumImplicitOps(0), AsmPrinterFlags(0),
+    MemRefs(MI.MemRefs), MemRefsEnd(MI.MemRefsEnd),
+    Parent(0), debugLoc(MI.getDebugLoc()) {
+  Operands.reserve(MI.getNumOperands());
+
+  // Add operands
+  for (unsigned i = 0; i != MI.getNumOperands(); ++i)
+    addOperand(MI.getOperand(i));
+  NumImplicitOps = MI.NumImplicitOps;
+
+  // Set parent to null.
+  Parent = 0;
+
+  LeakDetector::addGarbageObject(this);
+}
+
+MachineInstr::~MachineInstr() {
+  LeakDetector::removeGarbageObject(this);
+#ifndef NDEBUG
+  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+    assert(Operands[i].ParentMI == this && "ParentMI mismatch!");
+    assert((!Operands[i].isReg() || !Operands[i].isOnRegUseList()) &&
+           "Reg operand def/use list corrupted");
+  }
+#endif
+}
+
+/// getRegInfo - If this instruction is embedded into a MachineFunction,
+/// return the MachineRegisterInfo object for the current function, otherwise
+/// return null.
+MachineRegisterInfo *MachineInstr::getRegInfo() {
+  if (MachineBasicBlock *MBB = getParent())
+    return &MBB->getParent()->getRegInfo();
+  return 0;
+}
+
+/// RemoveRegOperandsFromUseLists - Unlink all of the register operands in
+/// this instruction from their respective use lists.  This requires that the
+/// operands already be on their use lists.
+void MachineInstr::RemoveRegOperandsFromUseLists() {
+  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+    if (Operands[i].isReg())
+      Operands[i].RemoveRegOperandFromRegInfo();
+  }
+}
+
+/// AddRegOperandsToUseLists - Add all of the register operands in
+/// this instruction to their respective use lists.  This requires that the
+/// operands not be on their use lists yet.
+void MachineInstr::AddRegOperandsToUseLists(MachineRegisterInfo &RegInfo) {
+  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+    if (Operands[i].isReg())
+      Operands[i].AddRegOperandToRegInfo(&RegInfo);
+  }
+}
+
+
+/// addOperand - Add the specified operand to the instruction.  If it is an
+/// implicit operand, it is added to the end of the operand list.  If it is
+/// an explicit operand it is added at the end of the explicit operand list
+/// (before the first implicit operand). 
+void MachineInstr::addOperand(const MachineOperand &Op) {
+  bool isImpReg = Op.isReg() && Op.isImplicit();
+  assert((isImpReg || !OperandsComplete()) &&
+         "Trying to add an operand to a machine instr that is already done!");
+
+  MachineRegisterInfo *RegInfo = getRegInfo();
+
+  // If we are adding the operand to the end of the list, our job is simpler.
+  // This is true most of the time, so this is a reasonable optimization.
+  if (isImpReg || NumImplicitOps == 0) {
+    // We can only do this optimization if we know that the operand list won't
+    // reallocate.
+    if (Operands.empty() || Operands.size()+1 <= Operands.capacity()) {
+      Operands.push_back(Op);
+    
+      // Set the parent of the operand.
+      Operands.back().ParentMI = this;
+  
+      // If the operand is a register, update the operand's use list.
+      if (Op.isReg()) {
+        Operands.back().AddRegOperandToRegInfo(RegInfo);
+        // If the register operand is flagged as early, mark the operand as such
+        unsigned OpNo = Operands.size() - 1;
+        if (TID->getOperandConstraint(OpNo, TOI::EARLY_CLOBBER) != -1)
+          Operands[OpNo].setIsEarlyClobber(true);
+      }
+      return;
+    }
+  }
+  
+  // Otherwise, we have to insert a real operand before any implicit ones.
+  unsigned OpNo = Operands.size()-NumImplicitOps;
+
+  // If this instruction isn't embedded into a function, then we don't need to
+  // update any operand lists.
+  if (RegInfo == 0) {
+    // Simple insertion, no reginfo update needed for other register operands.
+    Operands.insert(Operands.begin()+OpNo, Op);
+    Operands[OpNo].ParentMI = this;
+
+    // Do explicitly set the reginfo for this operand though, to ensure the
+    // next/prev fields are properly nulled out.
+    if (Operands[OpNo].isReg()) {
+      Operands[OpNo].AddRegOperandToRegInfo(0);
+      // If the register operand is flagged as early, mark the operand as such
+      if (TID->getOperandConstraint(OpNo, TOI::EARLY_CLOBBER) != -1)
+        Operands[OpNo].setIsEarlyClobber(true);
+    }
+
+  } else if (Operands.size()+1 <= Operands.capacity()) {
+    // Otherwise, we have to remove register operands from their register use
+    // list, add the operand, then add the register operands back to their use
+    // list.  This also must handle the case when the operand list reallocates
+    // to somewhere else.
+  
+    // If insertion of this operand won't cause reallocation of the operand
+    // list, just remove the implicit operands, add the operand, then re-add all
+    // the rest of the operands.
+    for (unsigned i = OpNo, e = Operands.size(); i != e; ++i) {
+      assert(Operands[i].isReg() && "Should only be an implicit reg!");
+      Operands[i].RemoveRegOperandFromRegInfo();
+    }
+    
+    // Add the operand.  If it is a register, add it to the reg list.
+    Operands.insert(Operands.begin()+OpNo, Op);
+    Operands[OpNo].ParentMI = this;
+
+    if (Operands[OpNo].isReg()) {
+      Operands[OpNo].AddRegOperandToRegInfo(RegInfo);
+      // If the register operand is flagged as early, mark the operand as such
+      if (TID->getOperandConstraint(OpNo, TOI::EARLY_CLOBBER) != -1)
+        Operands[OpNo].setIsEarlyClobber(true);
+    }
+    
+    // Re-add all the implicit ops.
+    for (unsigned i = OpNo+1, e = Operands.size(); i != e; ++i) {
+      assert(Operands[i].isReg() && "Should only be an implicit reg!");
+      Operands[i].AddRegOperandToRegInfo(RegInfo);
+    }
+  } else {
+    // Otherwise, we will be reallocating the operand list.  Remove all reg
+    // operands from their list, then readd them after the operand list is
+    // reallocated.
+    RemoveRegOperandsFromUseLists();
+    
+    Operands.insert(Operands.begin()+OpNo, Op);
+    Operands[OpNo].ParentMI = this;
+  
+    // Re-add all the operands.
+    AddRegOperandsToUseLists(*RegInfo);
+
+    // If the register operand is flagged as early, mark the operand as such
+    if (Operands[OpNo].isReg()
+        && TID->getOperandConstraint(OpNo, TOI::EARLY_CLOBBER) != -1)
+      Operands[OpNo].setIsEarlyClobber(true);
+  }
+}
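+
+// Usage sketch (assumed client code, not from this drop): addOperand keeps
+// explicit operands ahead of the implicit defs/uses added by the ctor, so
+// appending an immediate to an MI that already carries implicit operands
+// inserts it at index size()-NumImplicitOps rather than at the very end:
+//
+//   MI->addOperand(MachineOperand::CreateImm(42));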
+
+/// RemoveOperand - Erase an operand from an instruction, leaving it with one
+/// fewer operand than it started with.
+///
+void MachineInstr::RemoveOperand(unsigned OpNo) {
+  assert(OpNo < Operands.size() && "Invalid operand number");
+  
+  // Special case removing the last one.
+  if (OpNo == Operands.size()-1) {
+    // If needed, remove from the reg def/use list.
+    if (Operands.back().isReg() && Operands.back().isOnRegUseList())
+      Operands.back().RemoveRegOperandFromRegInfo();
+    
+    Operands.pop_back();
+    return;
+  }
+
+  // Otherwise, we are removing an interior operand.  If we have reginfo to
+  // update, remove all operands that will be shifted down from their reg lists,
+  // move everything down, then re-add them.
+  MachineRegisterInfo *RegInfo = getRegInfo();
+  if (RegInfo) {
+    for (unsigned i = OpNo, e = Operands.size(); i != e; ++i) {
+      if (Operands[i].isReg())
+        Operands[i].RemoveRegOperandFromRegInfo();
+    }
+  }
+  
+  Operands.erase(Operands.begin()+OpNo);
+
+  if (RegInfo) {
+    for (unsigned i = OpNo, e = Operands.size(); i != e; ++i) {
+      if (Operands[i].isReg())
+        Operands[i].AddRegOperandToRegInfo(RegInfo);
+    }
+  }
+}
+
+/// addMemOperand - Add a MachineMemOperand to the machine instruction.
+/// This function should be used only occasionally. The setMemRefs function
+/// is the primary method for setting up a MachineInstr's MemRefs list.
+void MachineInstr::addMemOperand(MachineFunction &MF,
+                                 MachineMemOperand *MO) {
+  mmo_iterator OldMemRefs = MemRefs;
+  mmo_iterator OldMemRefsEnd = MemRefsEnd;
+
+  size_t NewNum = (MemRefsEnd - MemRefs) + 1;
+  mmo_iterator NewMemRefs = MF.allocateMemRefsArray(NewNum);
+  mmo_iterator NewMemRefsEnd = NewMemRefs + NewNum;
+
+  std::copy(OldMemRefs, OldMemRefsEnd, NewMemRefs);
+  NewMemRefs[NewNum - 1] = MO;
+
+  MemRefs = NewMemRefs;
+  MemRefsEnd = NewMemRefsEnd;
+}
+
+bool MachineInstr::isIdenticalTo(const MachineInstr *Other,
+                                 MICheckType Check) const {
+  // If opcodes or number of operands are not the same then the two
+  // instructions are obviously not identical.
+  if (Other->getOpcode() != getOpcode() ||
+      Other->getNumOperands() != getNumOperands())
+    return false;
+
+  // Check operands to make sure they match.
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    const MachineOperand &OMO = Other->getOperand(i);
+    // Clients may or may not want to ignore defs when testing for equality.
+    // For example, the machine CSE pass only cares about finding common
+    // subexpressions, so it's safe to ignore virtual register defs.
+    if (Check != CheckDefs && MO.isReg() && MO.isDef()) {
+      if (Check == IgnoreDefs)
+        continue;
+      // Check == IgnoreVRegDefs
+      if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) ||
+          TargetRegisterInfo::isPhysicalRegister(OMO.getReg()))
+        if (MO.getReg() != OMO.getReg())
+          return false;
+    } else if (!MO.isIdenticalTo(OMO))
+      return false;
+  }
+  return true;
+}
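+
+// Hedged example of the MICheckType modes handled above (call sites assumed):
+//
+//   MI->isIdenticalTo(Other, MachineInstr::CheckDefs);      // compare all
+//   MI->isIdenticalTo(Other, MachineInstr::IgnoreVRegDefs); // CSE-style:
+//       // virtual register defs may differ; physical defs must still match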
+
+/// removeFromParent - This method unlinks 'this' from the containing basic
+/// block, and returns it, but does not delete it.
+MachineInstr *MachineInstr::removeFromParent() {
+  assert(getParent() && "Not embedded in a basic block!");
+  getParent()->remove(this);
+  return this;
+}
+
+
+/// eraseFromParent - This method unlinks 'this' from the containing basic
+/// block, and deletes it.
+void MachineInstr::eraseFromParent() {
+  assert(getParent() && "Not embedded in a basic block!");
+  getParent()->erase(this);
+}
+
+
+/// OperandsComplete - Return true if it's illegal to add a new operand
+///
+bool MachineInstr::OperandsComplete() const {
+  unsigned short NumOperands = TID->getNumOperands();
+  if (!TID->isVariadic() && getNumOperands()-NumImplicitOps >= NumOperands)
+    return true;  // Broken: we have all the operands of this instruction!
+  return false;
+}
+
+/// getNumExplicitOperands - Returns the number of non-implicit operands.
+///
+unsigned MachineInstr::getNumExplicitOperands() const {
+  unsigned NumOperands = TID->getNumOperands();
+  if (!TID->isVariadic())
+    return NumOperands;
+
+  for (unsigned i = NumOperands, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || !MO.isImplicit())
+      NumOperands++;
+  }
+  return NumOperands;
+}
+
+
+/// findRegisterUseOperandIdx() - Returns the index of the operand that is a
+/// use of the specified register, or -1 if it is not found. It further
+/// tightens the search criteria to a use that kills the register if isKill
+/// is true.
+int MachineInstr::findRegisterUseOperandIdx(unsigned Reg, bool isKill,
+                                          const TargetRegisterInfo *TRI) const {
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || !MO.isUse())
+      continue;
+    unsigned MOReg = MO.getReg();
+    if (!MOReg)
+      continue;
+    if (MOReg == Reg ||
+        (TRI &&
+         TargetRegisterInfo::isPhysicalRegister(MOReg) &&
+         TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         TRI->isSubRegister(MOReg, Reg)))
+      if (!isKill || MO.isKill())
+        return i;
+  }
+  return -1;
+}
+
+/// readsWritesVirtualRegister - Return a pair of bools (reads, writes)
+/// indicating if this instruction reads or writes Reg. This also considers
+/// partial defines.
+std::pair<bool,bool>
+MachineInstr::readsWritesVirtualRegister(unsigned Reg,
+                                         SmallVectorImpl<unsigned> *Ops) const {
+  bool PartDef = false; // Partial redefine.
+  bool FullDef = false; // Full define.
+  bool Use = false;
+
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || MO.getReg() != Reg)
+      continue;
+    if (Ops)
+      Ops->push_back(i);
+    if (MO.isUse())
+      Use |= !MO.isUndef();
+    else if (MO.getSubReg())
+      PartDef = true;
+    else
+      FullDef = true;
+  }
+  // A partial redefine uses Reg unless there is also a full define.
+  return std::make_pair(Use || (PartDef && !FullDef), PartDef || FullDef);
+}
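+
+// Reading the result pair (illustrative; %reg1024 is a made-up vreg):
+//
+//   %reg1024:sub_lo<def> = ...        ; partial redefine of %reg1024
+//   std::pair<bool,bool> RW = MI->readsWritesVirtualRegister(Reg, 0);
+//   // RW == (true, true): a partial def both reads and writes Reg,
+//   // since the lanes outside the subregister are carried through.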
+
+/// findRegisterDefOperandIdx() - Returns the operand index that is a def of
+/// the specified register or -1 if it is not found. If isDead is true, defs
+/// that are not dead are skipped. If TargetRegisterInfo is non-null, then it
+/// also checks if there is a def of a super-register.
+int
+MachineInstr::findRegisterDefOperandIdx(unsigned Reg, bool isDead, bool Overlap,
+                                        const TargetRegisterInfo *TRI) const {
+  bool isPhys = TargetRegisterInfo::isPhysicalRegister(Reg);
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    unsigned MOReg = MO.getReg();
+    bool Found = (MOReg == Reg);
+    if (!Found && TRI && isPhys &&
+        TargetRegisterInfo::isPhysicalRegister(MOReg)) {
+      if (Overlap)
+        Found = TRI->regsOverlap(MOReg, Reg);
+      else
+        Found = TRI->isSubRegister(MOReg, Reg);
+    }
+    if (Found && (!isDead || MO.isDead()))
+      return i;
+  }
+  return -1;
+}
+
+/// findFirstPredOperandIdx() - Find the index of the first operand in the
+/// operand list that is used to represent the predicate. It returns -1 if
+/// none is found.
+int MachineInstr::findFirstPredOperandIdx() const {
+  const TargetInstrDesc &TID = getDesc();
+  if (TID.isPredicable()) {
+    for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+      if (TID.OpInfo[i].isPredicate())
+        return i;
+  }
+
+  return -1;
+}
+  
+/// isRegTiedToUseOperand - Given the index of a register def operand,
+/// check if the register def is tied to a source operand, due to either
+/// two-address elimination or inline assembly constraints. Returns the
+/// first tied use operand index by reference if UseOpIdx is not null.
+bool MachineInstr::
+isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx) const {
+  assert(getOperand(DefOpIdx).isDef() && "DefOpIdx is not a def!");
+  const TargetInstrDesc &TID = getDesc();
+  for (unsigned i = 0, e = TID.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (MO.isReg() && MO.isUse() &&
+        TID.getOperandConstraint(i, TOI::TIED_TO) == (int)DefOpIdx) {
+      if (UseOpIdx)
+        *UseOpIdx = (unsigned)i;
+      return true;
+    }
+  }
+  return false;
+}
+
+/// isRegTiedToDefOperand - Return true if the operand of the specified index
+/// is a register use and it is tied to a def operand. It also returns the def
+/// operand index by reference.
+bool MachineInstr::
+isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx) const {
+  const TargetInstrDesc &TID = getDesc();
+  if (UseOpIdx >= TID.getNumOperands())
+    return false;
+  const MachineOperand &MO = getOperand(UseOpIdx);
+  if (!MO.isReg() || !MO.isUse())
+    return false;
+  int DefIdx = TID.getOperandConstraint(UseOpIdx, TOI::TIED_TO);
+  if (DefIdx == -1)
+    return false;
+  if (DefOpIdx)
+    *DefOpIdx = (unsigned)DefIdx;
+  return true;
+}
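+
+// Tied-operand sketch for a two-address instruction (x86 ADD32rr used as an
+// assumed example):
+//
+//   %reg1025<def> = ADD32rr %reg1025, %reg1026   ; def 0 is tied to use 1
+//   unsigned UseIdx;
+//   if (MI->isRegTiedToUseOperand(0, &UseIdx))
+//     assert(UseIdx == 1);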
+
+/// clearKillInfo - Clears kill flags on all operands.
+///
+void MachineInstr::clearKillInfo() {
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = getOperand(i);
+    if (MO.isReg() && MO.isUse())
+      MO.setIsKill(false);
+  }
+}
+
+/// copyKillDeadInfo - Copies kill / dead operand properties from MI.
+///
+void MachineInstr::copyKillDeadInfo(const MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || (!MO.isKill() && !MO.isDead()))
+      continue;
+    for (unsigned j = 0, ee = getNumOperands(); j != ee; ++j) {
+      MachineOperand &MOp = getOperand(j);
+      if (!MOp.isIdenticalTo(MO))
+        continue;
+      if (MO.isKill())
+        MOp.setIsKill();
+      else
+        MOp.setIsDead();
+      break;
+    }
+  }
+}
+
+/// copyPredicates - Copies predicate operand(s) from MI.
+void MachineInstr::copyPredicates(const MachineInstr *MI) {
+  const TargetInstrDesc &TID = MI->getDesc();
+  if (!TID.isPredicable())
+    return;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    if (TID.OpInfo[i].isPredicate()) {
+      // Predicated operands must be last operands.
+      addOperand(MI->getOperand(i));
+    }
+  }
+}
+
+void MachineInstr::substituteRegister(unsigned FromReg,
+                                      unsigned ToReg,
+                                      unsigned SubIdx,
+                                      const TargetRegisterInfo &RegInfo) {
+  if (TargetRegisterInfo::isPhysicalRegister(ToReg)) {
+    if (SubIdx)
+      ToReg = RegInfo.getSubReg(ToReg, SubIdx);
+    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = getOperand(i);
+      if (!MO.isReg() || MO.getReg() != FromReg)
+        continue;
+      MO.substPhysReg(ToReg, RegInfo);
+    }
+  } else {
+    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = getOperand(i);
+      if (!MO.isReg() || MO.getReg() != FromReg)
+        continue;
+      MO.substVirtReg(ToReg, SubIdx, RegInfo);
+    }
+  }
+}
+
+/// isSafeToMove - Return true if it is safe to move this instruction. If
+/// SawStore is set to true, it means that there is a store (or call) between
+/// the instruction's location and its intended destination.
+bool MachineInstr::isSafeToMove(const TargetInstrInfo *TII,
+                                AliasAnalysis *AA,
+                                bool &SawStore) const {
+  // Ignore stuff that we obviously can't move.
+  if (TID->mayStore() || TID->isCall()) {
+    SawStore = true;
+    return false;
+  }
+  if (TID->isTerminator() || TID->hasUnmodeledSideEffects())
+    return false;
+
+  // See if this instruction does a load.  If so, we have to guarantee that the
+  // loaded value doesn't change between the load and its intended
+  // destination. The check for isInvariantLoad gives the target the chance to
+  // classify the load as always returning a constant, e.g. a constant pool
+  // load.
+  if (TID->mayLoad() && !isInvariantLoad(AA))
+    // Otherwise, this is a real load.  If there is a store between the load and
+    // end of block, or if the load is volatile, we can't move it.
+    return !SawStore && !hasVolatileMemoryRef();
+
+  return true;
+}
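+
+// Usage sketch (hedged): callers thread SawStore through a scan so a load is
+// only moved when no store or call intervenes. The loop is hypothetical:
+//
+//   bool SawStore = false;
+//   for (/* each MI between source and destination */)
+//     if (!MI->isSafeToMove(TII, AA, SawStore))
+//       break;   // stop: moving past this point is unsafe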
+
+/// isSafeToReMat - Return true if it's safe to rematerialize the specified
+/// instruction which defined the specified register instead of copying it.
+bool MachineInstr::isSafeToReMat(const TargetInstrInfo *TII,
+                                 AliasAnalysis *AA,
+                                 unsigned DstReg) const {
+  bool SawStore = false;
+  if (!TII->isTriviallyReMaterializable(this, AA) ||
+      !isSafeToMove(TII, AA, SawStore))
+    return false;
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (!MO.isReg())
+      continue;
+    // FIXME: For now, do not remat any instruction with register operands.
+    // Later on, we can loosen the restriction if the register operands have
+    // not been modified between the def and use. Note, this is different from
+    // MachineSink because the code is no longer in two-address form (at least
+    // partially).
+    if (MO.isUse())
+      return false;
+    else if (!MO.isDead() && MO.getReg() != DstReg)
+      return false;
+  }
+  return true;
+}
+
+/// hasVolatileMemoryRef - Return true if this instruction may have a
+/// volatile memory reference, or if the information describing the
+/// memory reference is not available. Return false if it is known to
+/// have no volatile memory references.
+bool MachineInstr::hasVolatileMemoryRef() const {
+  // An instruction known never to access memory won't have a volatile access.
+  if (!TID->mayStore() &&
+      !TID->mayLoad() &&
+      !TID->isCall() &&
+      !TID->hasUnmodeledSideEffects())
+    return false;
+
+  // Otherwise, if the instruction has no memory reference information,
+  // conservatively assume it wasn't preserved.
+  if (memoperands_empty())
+    return true;
+  
+  // Check the memory reference information for volatile references.
+  for (mmo_iterator I = memoperands_begin(), E = memoperands_end(); I != E; ++I)
+    if ((*I)->isVolatile())
+      return true;
+
+  return false;
+}
+
+/// isInvariantLoad - Return true if this instruction is loading from a
+/// location whose value is invariant across the function.  For example,
+/// loading a value from the constant pool or from the argument area
+/// of a function if it does not change.  This should only return true if
+/// *all* loads the instruction does are invariant (if it does multiple loads).
+bool MachineInstr::isInvariantLoad(AliasAnalysis *AA) const {
+  // If the instruction doesn't load at all, it isn't an invariant load.
+  if (!TID->mayLoad())
+    return false;
+
+  // If the instruction has lost its memoperands, conservatively assume that
+  // it may not be an invariant load.
+  if (memoperands_empty())
+    return false;
+
+  const MachineFrameInfo *MFI = getParent()->getParent()->getFrameInfo();
+
+  for (mmo_iterator I = memoperands_begin(),
+       E = memoperands_end(); I != E; ++I) {
+    if ((*I)->isVolatile()) return false;
+    if ((*I)->isStore()) return false;
+
+    if (const Value *V = (*I)->getValue()) {
+      // A load from a constant PseudoSourceValue is invariant.
+      if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V))
+        if (PSV->isConstant(MFI))
+          continue;
+      // If we have an AliasAnalysis, ask it whether the memory is constant.
+      if (AA && AA->pointsToConstantMemory(V))
+        continue;
+    }
+
+    // Otherwise assume conservatively.
+    return false;
+  }
+
+  // Everything checks out.
+  return true;
+}
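+
+// Intent in brief (assumed client code): a constant-pool reload passes this
+// test and may be moved without memory dependence checks.
+//
+//   if (MI->getDesc().mayLoad() && MI->isInvariantLoad(AA))
+//     ...;   // e.g. hoist freely; see isSafeToMove above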
+
+/// isConstantValuePHI - If the specified instruction is a PHI that always
+/// merges together the same virtual register, return the register, otherwise
+/// return 0.
+unsigned MachineInstr::isConstantValuePHI() const {
+  if (!isPHI())
+    return 0;
+  assert(getNumOperands() >= 3 &&
+         "It's illegal to have a PHI without source operands");
+
+  unsigned Reg = getOperand(1).getReg();
+  for (unsigned i = 3, e = getNumOperands(); i < e; i += 2)
+    if (getOperand(i).getReg() != Reg)
+      return 0;
+  return Reg;
+}
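+
+// PHI operand layout assumed by the loop above: operand 0 is the def, then
+// (value, predecessor block) pairs. Illustrative instruction:
+//
+//   %reg1027<def> = PHI %reg1024, <BB#1>, %reg1024, <BB#2>
+//   // isConstantValuePHI() returns %reg1024: every incoming value matches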
+
+/// allDefsAreDead - Return true if all the defs of this instruction are dead.
+///
+bool MachineInstr::allDefsAreDead() const {
+  for (unsigned i = 0, e = getNumOperands(); i < e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || MO.isUse())
+      continue;
+    if (!MO.isDead())
+      return false;
+  }
+  return true;
+}
+
+void MachineInstr::dump() const {
+  dbgs() << "  " << *this;
+}
+
+static void printDebugLoc(DebugLoc DL, const MachineFunction *MF, 
+                         raw_ostream &CommentOS) {
+  const LLVMContext &Ctx = MF->getFunction()->getContext();
+  if (!DL.isUnknown()) {          // Print source line info.
+    DIScope Scope(DL.getScope(Ctx));
+    // Omit the directory, because it's likely to be long and uninteresting.
+    if (Scope.Verify())
+      CommentOS << Scope.getFilename();
+    else
+      CommentOS << "<unknown>";
+    CommentOS << ':' << DL.getLine();
+    if (DL.getCol() != 0)
+      CommentOS << ':' << DL.getCol();
+    DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx));
+    if (!InlinedAtDL.isUnknown()) {
+      CommentOS << " @[ ";
+      printDebugLoc(InlinedAtDL, MF, CommentOS);
+      CommentOS << " ]";
+    }
+  }
+}
+
+void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
+  // We can be a bit tidier if we know the TargetMachine and/or MachineFunction.
+  const MachineFunction *MF = 0;
+  const MachineRegisterInfo *MRI = 0;
+  if (const MachineBasicBlock *MBB = getParent()) {
+    MF = MBB->getParent();
+    if (!TM && MF)
+      TM = &MF->getTarget();
+    if (MF)
+      MRI = &MF->getRegInfo();
+  }
+
+  // Save a list of virtual registers.
+  SmallVector<unsigned, 8> VirtRegs;
+
+  // Print explicitly defined operands on the left of an assignment syntax.
+  unsigned StartOp = 0, e = getNumOperands();
+  for (; StartOp < e && getOperand(StartOp).isReg() &&
+         getOperand(StartOp).isDef() &&
+         !getOperand(StartOp).isImplicit();
+       ++StartOp) {
+    if (StartOp != 0) OS << ", ";
+    getOperand(StartOp).print(OS, TM);
+    unsigned Reg = getOperand(StartOp).getReg();
+    if (Reg && TargetRegisterInfo::isVirtualRegister(Reg))
+      VirtRegs.push_back(Reg);
+  }
+
+  if (StartOp != 0)
+    OS << " = ";
+
+  // Print the opcode name.
+  OS << getDesc().getName();
+
+  // Print the rest of the operands.
+  bool OmittedAnyCallClobbers = false;
+  bool FirstOp = true;
+  for (unsigned i = StartOp, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+
+    if (MO.isReg() && MO.getReg() &&
+        TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      VirtRegs.push_back(MO.getReg());
+
+    // Omit call-clobbered registers which aren't used anywhere. This makes
+    // call instructions much less noisy on targets where calls clobber lots
+    // of registers. Don't rely on MO.isDead() because we may be called before
+    // LiveVariables is run, or we may be looking at a non-allocatable reg.
+    if (MF && getDesc().isCall() &&
+        MO.isReg() && MO.isImplicit() && MO.isDef()) {
+      unsigned Reg = MO.getReg();
+      if (Reg != 0 && TargetRegisterInfo::isPhysicalRegister(Reg)) {
+        const MachineRegisterInfo &MRI = MF->getRegInfo();
+        if (MRI.use_empty(Reg) && !MRI.isLiveOut(Reg)) {
+          bool HasAliasLive = false;
+          for (const unsigned *Alias = TM->getRegisterInfo()->getAliasSet(Reg);
+               unsigned AliasReg = *Alias; ++Alias)
+            if (!MRI.use_empty(AliasReg) || MRI.isLiveOut(AliasReg)) {
+              HasAliasLive = true;
+              break;
+            }
+          if (!HasAliasLive) {
+            OmittedAnyCallClobbers = true;
+            continue;
+          }
+        }
+      }
+    }
+
+    if (FirstOp) FirstOp = false; else OS << ",";
+    OS << " ";
+    if (i < getDesc().NumOperands) {
+      const TargetOperandInfo &TOI = getDesc().OpInfo[i];
+      if (TOI.isPredicate())
+        OS << "pred:";
+      if (TOI.isOptionalDef())
+        OS << "opt:";
+    }
+    if (isDebugValue() && MO.isMetadata()) {
+      // Pretty print DBG_VALUE instructions.
+      const MDNode *MD = MO.getMetadata();
+      if (const MDString *MDS = dyn_cast<MDString>(MD->getOperand(2)))
+        OS << "!\"" << MDS->getString() << '\"';
+      else
+        MO.print(OS, TM);
+    } else if (TM && (isInsertSubreg() || isRegSequence()) && MO.isImm()) {
+      OS << TM->getRegisterInfo()->getSubRegIndexName(MO.getImm());
+    } else
+      MO.print(OS, TM);
+  }
+
+  // Briefly indicate whether any call clobbers were omitted.
+  if (OmittedAnyCallClobbers) {
+    if (!FirstOp) OS << ",";
+    OS << " ...";
+  }
+
+  bool HaveSemi = false;
+  if (!memoperands_empty()) {
+    if (!HaveSemi) OS << ";"; HaveSemi = true;
+
+    OS << " mem:";
+    for (mmo_iterator i = memoperands_begin(), e = memoperands_end();
+         i != e; ++i) {
+      OS << **i;
+      if (llvm::next(i) != e)
+        OS << " ";
+    }
+  }
+
+  // Print the regclass of any virtual registers encountered.
+  if (MRI && !VirtRegs.empty()) {
+    if (!HaveSemi) OS << ";"; HaveSemi = true;
+    for (unsigned i = 0; i != VirtRegs.size(); ++i) {
+      const TargetRegisterClass *RC = MRI->getRegClass(VirtRegs[i]);
+      OS << " " << RC->getName() << ":%reg" << VirtRegs[i];
+      for (unsigned j = i+1; j != VirtRegs.size();) {
+        if (MRI->getRegClass(VirtRegs[j]) != RC) {
+          ++j;
+          continue;
+        }
+        if (VirtRegs[i] != VirtRegs[j])
+          OS << "," << VirtRegs[j];
+        VirtRegs.erase(VirtRegs.begin()+j);
+      }
+    }
+  }
+
+  if (!debugLoc.isUnknown() && MF) {
+    if (!HaveSemi) OS << ";";
+    OS << " dbg:";
+    printDebugLoc(debugLoc, MF, OS);
+  }
+
+  OS << "\n";
+}
+
+bool MachineInstr::addRegisterKilled(unsigned IncomingReg,
+                                     const TargetRegisterInfo *RegInfo,
+                                     bool AddIfNotFound) {
+  bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
+  bool hasAliases = isPhysReg && RegInfo->getAliasSet(IncomingReg);
+  bool Found = false;
+  SmallVector<unsigned,4> DeadOps;
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || !MO.isUse() || MO.isUndef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+
+    if (Reg == IncomingReg) {
+      if (!Found) {
+        if (MO.isKill())
+          // The register is already marked kill.
+          return true;
+        if (isPhysReg && isRegTiedToDefOperand(i))
+          // Two-address uses of physregs must not be marked kill.
+          return true;
+        MO.setIsKill();
+        Found = true;
+      }
+    } else if (hasAliases && MO.isKill() &&
+               TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      // A super-register kill already exists.
+      if (RegInfo->isSuperRegister(IncomingReg, Reg))
+        return true;
+      if (RegInfo->isSubRegister(IncomingReg, Reg))
+        DeadOps.push_back(i);
+    }
+  }
+
+  // Trim unneeded kill operands.
+  while (!DeadOps.empty()) {
+    unsigned OpIdx = DeadOps.back();
+    if (getOperand(OpIdx).isImplicit())
+      RemoveOperand(OpIdx);
+    else
+      getOperand(OpIdx).setIsKill(false);
+    DeadOps.pop_back();
+  }
+
+  // If not found, this means an alias of one of the operands is killed. Add a
+  // new implicit operand if required.
+  if (!Found && AddIfNotFound) {
+    addOperand(MachineOperand::CreateReg(IncomingReg,
+                                         false /*IsDef*/,
+                                         true  /*IsImp*/,
+                                         true  /*IsKill*/));
+    return true;
+  }
+  return Found;
+}
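+
+// Call-site sketch (assumed): mark the last use of EAX in MI as a kill,
+// appending an implicit use if no explicit one exists:
+//
+//   MI->addRegisterKilled(X86::EAX, TRI, /*AddIfNotFound=*/true);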
+
+bool MachineInstr::addRegisterDead(unsigned IncomingReg,
+                                   const TargetRegisterInfo *RegInfo,
+                                   bool AddIfNotFound) {
+  bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
+  bool hasAliases = isPhysReg && RegInfo->getAliasSet(IncomingReg);
+  bool Found = false;
+  SmallVector<unsigned,4> DeadOps;
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+
+    if (Reg == IncomingReg) {
+      if (!Found) {
+        if (MO.isDead())
+          // The register is already marked dead.
+          return true;
+        MO.setIsDead();
+        Found = true;
+      }
+    } else if (hasAliases && MO.isDead() &&
+               TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      // There exists a super-register that's marked dead.
+      if (RegInfo->isSuperRegister(IncomingReg, Reg))
+        return true;
+      if (RegInfo->getSubRegisters(IncomingReg) &&
+          RegInfo->getSuperRegisters(Reg) &&
+          RegInfo->isSubRegister(IncomingReg, Reg))
+        DeadOps.push_back(i);
+    }
+  }
+
+  // Trim unneeded dead operands.
+  while (!DeadOps.empty()) {
+    unsigned OpIdx = DeadOps.back();
+    if (getOperand(OpIdx).isImplicit())
+      RemoveOperand(OpIdx);
+    else
+      getOperand(OpIdx).setIsDead(false);
+    DeadOps.pop_back();
+  }
+
+  // If not found, this means an alias of one of the operands is dead. Add a
+  // new implicit operand if required.
+  if (Found || !AddIfNotFound)
+    return Found;
+    
+  addOperand(MachineOperand::CreateReg(IncomingReg,
+                                       true  /*IsDef*/,
+                                       true  /*IsImp*/,
+                                       false /*IsKill*/,
+                                       true  /*IsDead*/));
+  return true;
+}
+
+void MachineInstr::addRegisterDefined(unsigned IncomingReg,
+                                      const TargetRegisterInfo *RegInfo) {
+  if (TargetRegisterInfo::isPhysicalRegister(IncomingReg)) {
+    MachineOperand *MO = findRegisterDefOperand(IncomingReg, false, RegInfo);
+    if (MO)
+      return;
+  } else {
+    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = getOperand(i);
+      if (MO.isReg() && MO.getReg() == IncomingReg && MO.isDef() &&
+          MO.getSubReg() == 0)
+        return;
+    }
+  }
+  addOperand(MachineOperand::CreateReg(IncomingReg,
+                                       true  /*IsDef*/,
+                                       true  /*IsImp*/));
+}
+
+void MachineInstr::setPhysRegsDeadExcept(const SmallVectorImpl<unsigned> &UsedRegs,
+                                         const TargetRegisterInfo &TRI) {
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || !MO.isDef()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+    bool Dead = true;
+    for (SmallVectorImpl<unsigned>::const_iterator I = UsedRegs.begin(),
+         E = UsedRegs.end(); I != E; ++I)
+      if (TRI.regsOverlap(*I, Reg)) {
+        Dead = false;
+        break;
+      }
+    // If there are no uses, including partial uses, the def is dead.
+    if (Dead) MO.setIsDead();
+  }
+}
+
+unsigned
+MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) {
+  unsigned Hash = MI->getOpcode() * 37;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    uint64_t Key = (uint64_t)MO.getType() << 32;
+    switch (MO.getType()) {
+    default: break;
+    case MachineOperand::MO_Register:
+      if (MO.isDef() && MO.getReg() &&
+          TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+        continue;  // Skip virtual register defs.
+      Key |= MO.getReg();
+      break;
+    case MachineOperand::MO_Immediate:
+      Key |= MO.getImm();
+      break;
+    case MachineOperand::MO_FrameIndex:
+    case MachineOperand::MO_ConstantPoolIndex:
+    case MachineOperand::MO_JumpTableIndex:
+      Key |= MO.getIndex();
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      Key |= DenseMapInfo<void*>::getHashValue(MO.getMBB());
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      Key |= DenseMapInfo<void*>::getHashValue(MO.getGlobal());
+      break;
+    case MachineOperand::MO_BlockAddress:
+      Key |= DenseMapInfo<void*>::getHashValue(MO.getBlockAddress());
+      break;
+    case MachineOperand::MO_MCSymbol:
+      Key |= DenseMapInfo<void*>::getHashValue(MO.getMCSymbol());
+      break;
+    }
+    Key += ~(Key << 32);
+    Key ^= (Key >> 22);
+    Key += ~(Key << 13);
+    Key ^= (Key >> 8);
+    Key += (Key << 3);
+    Key ^= (Key >> 15);
+    Key += ~(Key << 27);
+    Key ^= (Key >> 31);
+    Hash = (unsigned)Key + Hash * 37;
+  }
+  return Hash;
+}
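+
+// Editorial note: the add/shift/xor cascade above is a 64-to-32-bit integer
+// mix (it appears to follow Thomas Wang's 64-bit hash sequence), so both the
+// operand type in the high word and the payload in the low word perturb the
+// final 32-bit Hash. A hedged sketch of one Key it folds:
+//
+//   uint64_t Key = ((uint64_t)MachineOperand::MO_Immediate << 32) | 42;
+//   // ...cascade... -> (unsigned)Key mixes both halves into the hash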
diff --git a/src/LLVM/lib/CodeGen/MachineLICM.cpp b/src/LLVM/lib/CodeGen/MachineLICM.cpp
new file mode 100644
index 0000000..1a74b74
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineLICM.cpp
@@ -0,0 +1,842 @@
+//===-- MachineLICM.cpp - Machine Loop Invariant Code Motion Pass ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion on machine instructions. We
+// attempt to remove as much code from the body of a loop as possible.
+//
+// This pass does not attempt to throttle itself to limit register pressure.
+// The register allocation phases are expected to perform rematerialization
+// to recover when register pressure is high.
+//
+// This pass is not intended to be a replacement or a complete alternative
+// for the LLVM-IR-level LICM pass. It is only designed to hoist simple
+// constructs that are not exposed before lowering and instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "machine-licm"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+STATISTIC(NumHoisted, "Number of machine instructions hoisted out of loops");
+STATISTIC(NumCSEed,   "Number of hoisted machine instructions CSEed");
+STATISTIC(NumPostRAHoisted,
+          "Number of machine instructions hoisted out of loops post regalloc");
+
+namespace {
+  class MachineLICM : public MachineFunctionPass {
+    bool PreRegAlloc;
+
+    const TargetMachine   *TM;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const MachineFrameInfo *MFI;
+    MachineRegisterInfo *RegInfo;
+
+    // Various analyses that we use...
+    AliasAnalysis        *AA;      // Alias analysis info.
+    MachineLoopInfo      *MLI;     // Current MachineLoopInfo
+    MachineDominatorTree *DT;      // Machine dominator tree for the cur loop
+
+    // State that is updated as we process loops
+    bool         Changed;          // True if a loop is changed.
+    bool         FirstInLoop;      // True if it's the first LICM in the loop.
+    MachineLoop *CurLoop;          // The current loop we are working on.
+    MachineBasicBlock *CurPreheader; // The preheader for CurLoop.
+
+    BitVector AllocatableSet;
+
+    // For each opcode, keep a list of potential CSE instructions.
+    DenseMap<unsigned, std::vector<const MachineInstr*> > CSEMap;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    MachineLICM() :
+      MachineFunctionPass(ID), PreRegAlloc(true) {}
+
+    explicit MachineLICM(bool PreRA) :
+      MachineFunctionPass(ID), PreRegAlloc(PreRA) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const { return "Machine Instruction LICM"; }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<AliasAnalysis>();
+      AU.addPreserved<MachineLoopInfo>();
+      AU.addPreserved<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    virtual void releaseMemory() {
+      CSEMap.clear();
+    }
+
+  private:
+    /// CandidateInfo - Keep track of information about hoisting candidates.
+    struct CandidateInfo {
+      MachineInstr *MI;
+      unsigned      Def;
+      int           FI;
+      CandidateInfo(MachineInstr *mi, unsigned def, int fi)
+        : MI(mi), Def(def), FI(fi) {}
+    };
+
+    /// HoistRegionPostRA - Walk the specified region of the CFG and hoist loop
+    /// invariants out to the preheader.
+    void HoistRegionPostRA();
+
+    /// HoistPostRA - When an instruction is found to use only loop invariant
+    /// operands and is safe to hoist, this function is called to do the
+    /// dirty work.
+    void HoistPostRA(MachineInstr *MI, unsigned Def);
+
+    /// ProcessMI - Examine the instruction as a potential LICM candidate. Also
+    /// gather register def and frame object update information.
+    void ProcessMI(MachineInstr *MI, unsigned *PhysRegDefs,
+                   SmallSet<int, 32> &StoredFIs,
+                   SmallVector<CandidateInfo, 32> &Candidates);
+
+    /// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the
+    /// current loop.
+    void AddToLiveIns(unsigned Reg);
+
+    /// IsLICMCandidate - Returns true if the instruction may be a suitable
+    /// candidate for LICM; e.g. if the instruction is a call, then it's
+    /// obviously not safe to hoist it.
+    bool IsLICMCandidate(MachineInstr &I);
+
+    /// IsLoopInvariantInst - Returns true if the instruction is loop
+    /// invariant. I.e., all virtual register operands are defined outside of
+    /// the loop, physical registers aren't accessed (explicitly or implicitly),
+    /// and the instruction is hoistable.
+    /// 
+    bool IsLoopInvariantInst(MachineInstr &I);
+
+    /// IsProfitableToHoist - Return true if it is potentially profitable to
+    /// hoist the given loop invariant.
+    bool IsProfitableToHoist(MachineInstr &MI);
+
+    /// HoistRegion - Walk the specified region of the CFG (defined by all
+    /// blocks dominated by the specified block, and that are in the current
+    /// loop) in depth first order w.r.t the DominatorTree. This allows us to
+    /// visit definitions before uses, allowing us to hoist a loop body in one
+    /// pass without iteration.
+    ///
+    void HoistRegion(MachineDomTreeNode *N);
+
+    /// isLoadFromConstantMemory - Return true if the given instruction is a
+    /// load from constant memory.
+    bool isLoadFromConstantMemory(MachineInstr *MI);
+
+    /// ExtractHoistableLoad - Unfold a load from the given MachineInstr if
+    /// the load itself could be hoisted. Return the unfolded and hoistable
+    /// load, or null if the load couldn't be unfolded or if it wouldn't
+    /// be hoistable.
+    MachineInstr *ExtractHoistableLoad(MachineInstr *MI);
+
+    /// LookForDuplicate - Find an instruction among PrevMIs that is a
+    /// duplicate of MI. Return this instruction if it's found.
+    const MachineInstr *LookForDuplicate(const MachineInstr *MI,
+                                     std::vector<const MachineInstr*> &PrevMIs);
+
+    /// EliminateCSE - Given a LICM'ed instruction, look for an instruction in
+    /// the preheader that computes the same value. If one is found, replace
+    /// all uses of the hoisted instruction's def with the existing definition
+    /// rather than hoisting the instruction to the preheader.
+    bool EliminateCSE(MachineInstr *MI,
+           DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator &CI);
+
+    /// Hoist - When an instruction is found to use only loop invariant
+    /// operands and is safe to hoist, this function is called to do the
+    /// dirty work.
+    ///
+    void Hoist(MachineInstr *MI);
+
+    /// InitCSEMap - Initialize the CSE map with instructions that are in the
+    /// current loop preheader that may become duplicates of instructions that
+    /// are hoisted out of the loop.
+    void InitCSEMap(MachineBasicBlock *BB);
+
+    /// getCurPreheader - Get the preheader for the current loop, splitting
+    /// a critical edge if needed.
+    MachineBasicBlock *getCurPreheader();
+  };
+} // end anonymous namespace
+
+char MachineLICM::ID = 0;
+INITIALIZE_PASS(MachineLICM, "machinelicm",
+                "Machine Loop Invariant Code Motion", false, false);
+
+FunctionPass *llvm::createMachineLICMPass(bool PreRegAlloc) {
+  return new MachineLICM(PreRegAlloc);
+}
+
+/// LoopIsOuterMostWithPredecessor - Test if the given loop is the outer-most
+/// loop that has a unique predecessor.
+static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) {
+  // Check whether this loop even has a unique predecessor.
+  if (!CurLoop->getLoopPredecessor())
+    return false;
+  // Ok, now check to see if any of its outer loops do.
+  for (MachineLoop *L = CurLoop->getParentLoop(); L; L = L->getParentLoop())
+    if (L->getLoopPredecessor())
+      return false;
+  // None of them did, so this is the outermost with a unique predecessor.
+  return true;
+}
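+
+// Worked example (assumed CFG): for a nest L1 { L2 } where both loops have
+// unique predecessors, only L1 passes; L2 is reached via the worklist
+// append in runOnMachineFunction below.
+//
+//   entry -> P1 -> L1.header -> P2 -> L2.header
+//   LoopIsOuterMostWithPredecessor(L2) == false   // L1 above it qualifies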
+
+bool MachineLICM::runOnMachineFunction(MachineFunction &MF) {
+  if (PreRegAlloc)
+    DEBUG(dbgs() << "******** Pre-regalloc Machine LICM ********\n");
+  else
+    DEBUG(dbgs() << "******** Post-regalloc Machine LICM ********\n");
+
+  Changed = FirstInLoop = false;
+  TM = &MF.getTarget();
+  TII = TM->getInstrInfo();
+  TRI = TM->getRegisterInfo();
+  MFI = MF.getFrameInfo();
+  RegInfo = &MF.getRegInfo();
+  AllocatableSet = TRI->getAllocatableSet(MF);
+
+  // Get our Loop information...
+  MLI = &getAnalysis<MachineLoopInfo>();
+  DT  = &getAnalysis<MachineDominatorTree>();
+  AA  = &getAnalysis<AliasAnalysis>();
+
+  SmallVector<MachineLoop *, 8> Worklist(MLI->begin(), MLI->end());
+  while (!Worklist.empty()) {
+    CurLoop = Worklist.pop_back_val();
+    CurPreheader = 0;
+
+    // If this is done before regalloc, only visit outer-most preheader-sporting
+    // loops.
+    if (PreRegAlloc && !LoopIsOuterMostWithPredecessor(CurLoop)) {
+      Worklist.append(CurLoop->begin(), CurLoop->end());
+      continue;
+    }
+
+    if (!PreRegAlloc)
+      HoistRegionPostRA();
+    else {
+      // CSEMap is initialized for loop header when the first instruction is
+      // being hoisted.
+      MachineDomTreeNode *N = DT->getNode(CurLoop->getHeader());
+      FirstInLoop = true;
+      HoistRegion(N);
+      CSEMap.clear();
+    }
+  }
+
+  return Changed;
+}
+
+/// InstructionStoresToFI - Return true if instruction stores to the
+/// specified frame.
+static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
+  for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
+         oe = MI->memoperands_end(); o != oe; ++o) {
+    if (!(*o)->isStore() || !(*o)->getValue())
+      continue;
+    if (const FixedStackPseudoSourceValue *Value =
+        dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
+      if (Value->getFrameIndex() == FI)
+        return true;
+    }
+  }
+  return false;
+}
+
+/// ProcessMI - Examine the instruction as a potential LICM candidate. Also
+/// gather register def and frame object update information.
+void MachineLICM::ProcessMI(MachineInstr *MI,
+                            unsigned *PhysRegDefs,
+                            SmallSet<int, 32> &StoredFIs,
+                            SmallVector<CandidateInfo, 32> &Candidates) {
+  bool RuledOut = false;
+  bool HasNonInvariantUse = false;
+  unsigned Def = 0;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isFI()) {
+      // Remember if the instruction stores to the frame index.
+      int FI = MO.getIndex();
+      if (!StoredFIs.count(FI) &&
+          MFI->isSpillSlotObjectIndex(FI) &&
+          InstructionStoresToFI(MI, FI))
+        StoredFIs.insert(FI);
+      HasNonInvariantUse = true;
+      continue;
+    }
+
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+           "Not expecting virtual register!");
+
+    if (!MO.isDef()) {
+      if (Reg && PhysRegDefs[Reg])
+        // If it's using a non-loop-invariant register, then it's obviously not
+        // safe to hoist.
+        HasNonInvariantUse = true;
+      continue;
+    }
+
+    if (MO.isImplicit()) {
+      ++PhysRegDefs[Reg];
+      for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+        ++PhysRegDefs[*AS];
+      if (!MO.isDead())
+        // Non-dead implicit def? This cannot be hoisted.
+        RuledOut = true;
+      // No need to check if a dead implicit def is also defined by
+      // another instruction.
+      continue;
+    }
+
+    // FIXME: For now, avoid instructions with multiple defs, unless
+    // it's a dead implicit def.
+    if (Def)
+      RuledOut = true;
+    else
+      Def = Reg;
+
+    // If we have already seen another instruction that defines the same
+    // register, then this is not safe.
+    if (++PhysRegDefs[Reg] > 1)
+      // The register defined by MI is also defined by another instruction
+      // in the loop, so MI cannot be a LICM candidate.
+      RuledOut = true;
+    for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+      if (++PhysRegDefs[*AS] > 1)
+        RuledOut = true;
+  }
+
+  // Only consider reloads for now and remats which do not have register
+  // operands. FIXME: Consider unfold load folding instructions.
+  if (Def && !RuledOut) {
+    int FI = INT_MIN;
+    if ((!HasNonInvariantUse && IsLICMCandidate(*MI)) ||
+        (TII->isLoadFromStackSlot(MI, FI) && MFI->isSpillSlotObjectIndex(FI)))
+      Candidates.push_back(CandidateInfo(MI, Def, FI));
+  }
+}
+
+/// HoistRegionPostRA - Walk the specified region of the CFG and hoist loop
+/// invariants out to the preheader.
+void MachineLICM::HoistRegionPostRA() {
+  unsigned NumRegs = TRI->getNumRegs();
+  unsigned *PhysRegDefs = new unsigned[NumRegs];
+  std::fill(PhysRegDefs, PhysRegDefs + NumRegs, 0);
+
+  SmallVector<CandidateInfo, 32> Candidates;
+  SmallSet<int, 32> StoredFIs;
+
+  // Walk the entire region, count number of defs for each register, and
+  // collect potential LICM candidates.
+  const std::vector<MachineBasicBlock*> Blocks = CurLoop->getBlocks();
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    MachineBasicBlock *BB = Blocks[i];
+    // Conservatively treat live-ins as external defs.
+    // FIXME: That means a reload that's reused in successor block(s) will not
+    // be LICM'ed.
+    for (MachineBasicBlock::livein_iterator I = BB->livein_begin(),
+           E = BB->livein_end(); I != E; ++I) {
+      unsigned Reg = *I;
+      ++PhysRegDefs[Reg];
+      for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+        ++PhysRegDefs[*AS];
+    }
+
+    for (MachineBasicBlock::iterator
+           MII = BB->begin(), E = BB->end(); MII != E; ++MII) {
+      MachineInstr *MI = &*MII;
+      ProcessMI(MI, PhysRegDefs, StoredFIs, Candidates);
+    }
+  }
+
+  // Now evaluate whether the potential candidates qualify.
+  // 1. Check if the candidate defined register is defined by another
+  //    instruction in the loop.
+  // 2. If the candidate is a load from stack slot (always true for now),
+  //    check if the slot is stored anywhere in the loop.
+  for (unsigned i = 0, e = Candidates.size(); i != e; ++i) {
+    if (Candidates[i].FI != INT_MIN &&
+        StoredFIs.count(Candidates[i].FI))
+      continue;
+
+    if (PhysRegDefs[Candidates[i].Def] == 1) {
+      bool Safe = true;
+      MachineInstr *MI = Candidates[i].MI;
+      for (unsigned j = 0, ee = MI->getNumOperands(); j != ee; ++j) {
+        const MachineOperand &MO = MI->getOperand(j);
+        if (!MO.isReg() || MO.isDef() || !MO.getReg())
+          continue;
+        if (PhysRegDefs[MO.getReg()]) {
+          // If it's using a non-loop-invariant register, then it's obviously
+          // not safe to hoist.
+          Safe = false;
+          break;
+        }
+      }
+      if (Safe)
+        HoistPostRA(MI, Candidates[i].Def);
+    }
+  }
+
+  delete[] PhysRegDefs;
+}
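+
+// A post-RA hoist in miniature (illustrative sketch, not part of this drop;
+// x86-style machine IR and the frame index are assumed). With exactly one
+// def of %EAX in the region and no store to <fi#3> anywhere in the loop,
+// the reload qualifies and is moved:
+//
+//   loop:                               preheader:
+//     %EAX = MOV32rm <fi#3>       =>      %EAX = MOV32rm <fi#3>
+//     ... uses of %EAX ...              loop:
+//                                         ... uses of %EAX ...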
+
+/// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the current
+/// loop, and make sure it is not killed by any instructions in the loop.
+void MachineLICM::AddToLiveIns(unsigned Reg) {
+  const std::vector<MachineBasicBlock*> Blocks = CurLoop->getBlocks();
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    MachineBasicBlock *BB = Blocks[i];
+    if (!BB->isLiveIn(Reg))
+      BB->addLiveIn(Reg);
+    for (MachineBasicBlock::iterator
+           MII = BB->begin(), E = BB->end(); MII != E; ++MII) {
+      MachineInstr *MI = &*MII;
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (!MO.isReg() || !MO.getReg() || MO.isDef()) continue;
+        if (MO.getReg() == Reg || TRI->isSuperRegister(Reg, MO.getReg()))
+          MO.setIsKill(false);
+      }
+    }
+  }
+}
+
+/// HoistPostRA - When an instruction is found to use only loop-invariant
+/// operands that are safe to hoist, this function is called to do the
+/// dirty work.
+void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) {
+  MachineBasicBlock *Preheader = getCurPreheader();
+  if (!Preheader) return;
+
+  // Now move the instruction to the predecessor, inserting it before any
+  // terminator instructions.
+  DEBUG({
+      dbgs() << "Hoisting " << *MI;
+      if (Preheader->getBasicBlock())
+        dbgs() << " to MachineBasicBlock "
+               << Preheader->getName();
+      if (MI->getParent()->getBasicBlock())
+        dbgs() << " from MachineBasicBlock "
+               << MI->getParent()->getName();
+      dbgs() << "\n";
+    });
+
+  // Splice the instruction to the preheader.
+  MachineBasicBlock *MBB = MI->getParent();
+  Preheader->splice(Preheader->getFirstTerminator(), MBB, MI);
+
+  // Add the register to the live-in lists of all the BBs in the current loop
+  // since a loop invariant must be kept live throughout the whole loop. This
+  // is important to ensure later passes do not scavenge the def register.
+  AddToLiveIns(Def);
+
+  ++NumPostRAHoisted;
+  Changed = true;
+}
+
+/// HoistRegion - Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in
+/// depth-first order w.r.t. the DominatorTree. This lets us visit definitions
+/// before uses, so we can hoist a loop body in one pass without iteration.
+///
+void MachineLICM::HoistRegion(MachineDomTreeNode *N) {
+  assert(N != 0 && "Null dominator tree node?");
+  MachineBasicBlock *BB = N->getBlock();
+
+  // If this subregion is not in the top level loop at all, exit.
+  if (!CurLoop->contains(BB)) return;
+
+  for (MachineBasicBlock::iterator
+         MII = BB->begin(), E = BB->end(); MII != E; ) {
+    MachineBasicBlock::iterator NextMII = MII; ++NextMII;
+    Hoist(&*MII);
+    MII = NextMII;
+  }
+
+  // Don't hoist things out of a large switch statement.  This often causes
+  // code to be hoisted that wasn't going to be executed, and increases
+  // register pressure in a situation where it's likely to matter.
+  if (BB->succ_size() < 25) {
+    const std::vector<MachineDomTreeNode*> &Children = N->getChildren();
+    for (unsigned I = 0, E = Children.size(); I != E; ++I)
+      HoistRegion(Children[I]);
+  }
+}
+
+/// IsLICMCandidate - Returns true if the instruction may be a suitable
+/// candidate for LICM. E.g., if the instruction is a call, then it's
+/// obviously not safe to hoist it.
+bool MachineLICM::IsLICMCandidate(MachineInstr &I) {
+  // Check if it's safe to move the instruction.
+  bool DontMoveAcrossStore = true;
+  if (!I.isSafeToMove(TII, AA, DontMoveAcrossStore))
+    return false;
+  
+  return true;
+}
+
+/// IsLoopInvariantInst - Returns true if the instruction is loop
+/// invariant. I.e., all virtual register operands are defined outside of the
+/// loop, physical registers aren't accessed explicitly, and there are no side
+/// effects that aren't captured by the operands or other flags.
+/// 
+bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) {
+  if (!IsLICMCandidate(I))
+    return false;
+
+  // The instruction is loop invariant if all of its operands are.
+  for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = I.getOperand(i);
+
+    if (!MO.isReg())
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+
+    // Don't hoist an instruction that uses or defines a physical register.
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      if (MO.isUse()) {
+        // If the physreg has no defs anywhere, it's just an ambient register
+        // and we can freely move its uses. Alternatively, if it's allocatable,
+        // it could get allocated to something with a def during allocation.
+        if (!RegInfo->def_empty(Reg))
+          return false;
+        if (AllocatableSet.test(Reg))
+          return false;
+        // Check for a def among the register's aliases too.
+        for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+          unsigned AliasReg = *Alias;
+          if (!RegInfo->def_empty(AliasReg))
+            return false;
+          if (AllocatableSet.test(AliasReg))
+            return false;
+        }
+        // Otherwise it's safe to move.
+        continue;
+      } else if (!MO.isDead()) {
+        // A def that isn't dead. We can't move it.
+        return false;
+      } else if (CurLoop->getHeader()->isLiveIn(Reg)) {
+        // If the reg is live into the loop, we can't hoist an instruction
+        // which would clobber it.
+        return false;
+      }
+    }
+
+    if (!MO.isUse())
+      continue;
+
+    assert(RegInfo->getVRegDef(Reg) &&
+           "Machine instr not mapped for this vreg?!");
+
+    // If the loop contains the definition of an operand, then the instruction
+    // isn't loop invariant.
+    if (CurLoop->contains(RegInfo->getVRegDef(Reg)))
+      return false;
+  }
+
+  // If we got this far, the instruction is loop invariant!
+  return true;
+}
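+
+// For illustration (operand names assumed, not from this drop): an
+// instruction such as
+//   %vreg5<def> = ADD32ri %vreg4, 42
+// is loop invariant iff %vreg4's def lies outside the loop. A use of a
+// physical register with any defs (or an allocatable one), a non-dead
+// physreg def, or a dead def of a physreg live into the loop header all
+// rule the instruction out.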
+
+
+/// HasPHIUses - Return true if the specified register has any PHI use.
+static bool HasPHIUses(unsigned Reg, MachineRegisterInfo *RegInfo) {
+  for (MachineRegisterInfo::use_iterator UI = RegInfo->use_begin(Reg),
+         UE = RegInfo->use_end(); UI != UE; ++UI) {
+    MachineInstr *UseMI = &*UI;
+    if (UseMI->isPHI())
+      return true;
+  }
+  return false;
+}
+
+/// isLoadFromConstantMemory - Return true if the given instruction is a
+/// load from constant memory. Machine LICM will hoist these even if they are
+/// not re-materializable.
+bool MachineLICM::isLoadFromConstantMemory(MachineInstr *MI) {
+  if (!MI->getDesc().mayLoad()) return false;
+  if (!MI->hasOneMemOperand()) return false;
+  MachineMemOperand *MMO = *MI->memoperands_begin();
+  if (MMO->isVolatile()) return false;
+  if (!MMO->getValue()) return false;
+  const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(MMO->getValue());
+  if (PSV) {
+    MachineFunction &MF = *MI->getParent()->getParent();
+    return PSV->isConstant(MF.getFrameInfo());
+  } else {
+    return AA->pointsToConstantMemory(MMO->getValue());
+  }
+}
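+
+// Loads that typically qualify (illustrative, assumed machine IR): a load
+// through a constant-pool or GOT pseudo source value, e.g.
+//   %vreg7 = MOV32rm <cp#0>    ; one MMO, non-volatile, constant memory
+// A load whose memory operand is volatile, missing, or not provably
+// constant is rejected.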
+
+/// IsProfitableToHoist - Return true if it is potentially profitable to hoist
+/// the given loop invariant.
+bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
+  // FIXME: For now, only hoist re-materializable instructions. LICM will
+  // increase register pressure. We want to make sure it doesn't increase
+  // spilling.
+  // Also hoist loads from constant memory, e.g. loads from stubs or the GOT.
+  // Hoisting these tends to help performance in low-register-pressure
+  // situations. The trade-off is that it may cause spills in high-pressure
+  // situations, since it ends up adding a store in the loop preheader. But
+  // the reload is no more expensive, and a side benefit is that these loads
+  // are frequently CSE'ed.
+  if (!TII->isTriviallyReMaterializable(&MI, AA)) {
+    if (!isLoadFromConstantMemory(&MI))
+      return false;
+  }
+
+  // If the result(s) of this instruction are used by PHIs, then don't hoist
+  // it. The presence of joins makes it difficult for the current register
+  // allocator implementation to perform remat.
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    if (HasPHIUses(MO.getReg(), RegInfo))
+      return false;
+  }
+
+  return true;
+}
+
+MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) {
+  // MI itself wasn't hoistable, but we may be able to unfold a load from it
+  // and hoist that. First test whether the instruction is loading from an
+  // amenable memory location.
+  if (!isLoadFromConstantMemory(MI))
+    return 0;
+
+  // Next determine the register class for a temporary register.
+  unsigned LoadRegIndex;
+  unsigned NewOpc =
+    TII->getOpcodeAfterMemoryUnfold(MI->getOpcode(),
+                                    /*UnfoldLoad=*/true,
+                                    /*UnfoldStore=*/false,
+                                    &LoadRegIndex);
+  if (NewOpc == 0) return 0;
+  const TargetInstrDesc &TID = TII->get(NewOpc);
+  if (TID.getNumDefs() != 1) return 0;
+  const TargetRegisterClass *RC = TID.OpInfo[LoadRegIndex].getRegClass(TRI);
+  // Ok, we're unfolding. Create a temporary register and do the unfold.
+  unsigned Reg = RegInfo->createVirtualRegister(RC);
+
+  MachineFunction &MF = *MI->getParent()->getParent();
+  SmallVector<MachineInstr *, 2> NewMIs;
+  bool Success =
+    TII->unfoldMemoryOperand(MF, MI, Reg,
+                             /*UnfoldLoad=*/true, /*UnfoldStore=*/false,
+                             NewMIs);
+  (void)Success;
+  assert(Success &&
+         "unfoldMemoryOperand failed when getOpcodeAfterMemoryUnfold "
+         "succeeded!");
+  assert(NewMIs.size() == 2 &&
+         "Unfolded a load into multiple instructions!");
+  MachineBasicBlock *MBB = MI->getParent();
+  MBB->insert(MI, NewMIs[0]);
+  MBB->insert(MI, NewMIs[1]);
+  // If unfolding produced a load that wasn't loop-invariant or profitable to
+  // hoist, discard the new instructions and bail.
+  if (!IsLoopInvariantInst(*NewMIs[0]) || !IsProfitableToHoist(*NewMIs[0])) {
+    NewMIs[0]->eraseFromParent();
+    NewMIs[1]->eraseFromParent();
+    return 0;
+  }
+  // Otherwise we successfully unfolded a load that we can hoist.
+  MI->eraseFromParent();
+  return NewMIs[0];
+}
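+
+// Unfolding in miniature (illustrative, x86-style opcodes assumed): a
+// load-folding instruction such as
+//   %vreg2 = ADD32rm %vreg1, <ga:@const_table>
+// is split into
+//   %vreg9 = MOV32rm <ga:@const_table>   ; NewMIs[0], the hoistable load
+//   %vreg2 = ADD32rr %vreg1, %vreg9      ; NewMIs[1], stays in the loop
+// and the load half is returned for hoisting.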
+
+void MachineLICM::InitCSEMap(MachineBasicBlock *BB) {
+  for (MachineBasicBlock::iterator I = BB->begin(),E = BB->end(); I != E; ++I) {
+    const MachineInstr *MI = &*I;
+    // FIXME: For now, only hoist re-materializable instructions. LICM will
+    // increase register pressure. We want to make sure it doesn't increase
+    // spilling.
+    if (TII->isTriviallyReMaterializable(MI, AA)) {
+      unsigned Opcode = MI->getOpcode();
+      DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator
+        CI = CSEMap.find(Opcode);
+      if (CI != CSEMap.end())
+        CI->second.push_back(MI);
+      else {
+        std::vector<const MachineInstr*> CSEMIs;
+        CSEMIs.push_back(MI);
+        CSEMap.insert(std::make_pair(Opcode, CSEMIs));
+      }
+    }
+  }
+}
+
+const MachineInstr*
+MachineLICM::LookForDuplicate(const MachineInstr *MI,
+                              std::vector<const MachineInstr*> &PrevMIs) {
+  for (unsigned i = 0, e = PrevMIs.size(); i != e; ++i) {
+    const MachineInstr *PrevMI = PrevMIs[i];
+    if (TII->produceSameValue(MI, PrevMI))
+      return PrevMI;
+  }
+  return 0;
+}
+
+bool MachineLICM::EliminateCSE(MachineInstr *MI,
+          DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator &CI) {
+  // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate
+  // the undef property onto uses.
+  if (CI == CSEMap.end() || MI->isImplicitDef())
+    return false;
+
+  if (const MachineInstr *Dup = LookForDuplicate(MI, CI->second)) {
+    DEBUG(dbgs() << "CSEing " << *MI << " with " << *Dup);
+
+    // Replace virtual registers defined by MI by their counterparts defined
+    // by Dup.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+
+      // Physical registers may not differ here.
+      assert((!MO.isReg() || MO.getReg() == 0 ||
+              !TargetRegisterInfo::isPhysicalRegister(MO.getReg()) ||
+              MO.getReg() == Dup->getOperand(i).getReg()) &&
+             "Instructions with different phys regs are not identical!");
+
+      if (MO.isReg() && MO.isDef() &&
+          !TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+        RegInfo->replaceRegWith(MO.getReg(), Dup->getOperand(i).getReg());
+        RegInfo->clearKillFlags(Dup->getOperand(i).getReg());
+      }
+    }
+    MI->eraseFromParent();
+    ++NumCSEed;
+    return true;
+  }
+  return false;
+}
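+
+// CSE in miniature (illustrative vregs assumed): if the preheader already
+// holds
+//   %vreg3 = MOV32ri 40
+// then a hoisted duplicate
+//   %vreg8 = MOV32ri 40
+// is erased and every use of %vreg8 is rewritten to %vreg3, with %vreg3's
+// kill flags cleared since its live range just grew.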
+
+/// Hoist - When an instruction is found to use only loop invariant operands
+/// that are safe to hoist, this function is called to do the dirty work.
+///
+void MachineLICM::Hoist(MachineInstr *MI) {
+  MachineBasicBlock *Preheader = getCurPreheader();
+  if (!Preheader) return;
+
+  // First check whether we should hoist this instruction.
+  if (!IsLoopInvariantInst(*MI) || !IsProfitableToHoist(*MI)) {
+    // If not, try unfolding a hoistable load.
+    MI = ExtractHoistableLoad(MI);
+    if (!MI) return;
+  }
+
+  // Now move the instruction to the predecessor, inserting it before any
+  // terminator instructions.
+  DEBUG({
+      dbgs() << "Hoisting " << *MI;
+      if (Preheader->getBasicBlock())
+        dbgs() << " to MachineBasicBlock "
+               << Preheader->getName();
+      if (MI->getParent()->getBasicBlock())
+        dbgs() << " from MachineBasicBlock "
+               << MI->getParent()->getName();
+      dbgs() << "\n";
+    });
+
+  // If this is the first instruction being hoisted to the preheader,
+  // initialize the CSE map with potential common expressions.
+  if (FirstInLoop) {
+    InitCSEMap(Preheader);
+    FirstInLoop = false;
+  }
+
+  // Look for opportunity to CSE the hoisted instruction.
+  unsigned Opcode = MI->getOpcode();
+  DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator
+    CI = CSEMap.find(Opcode);
+  if (!EliminateCSE(MI, CI)) {
+    // Otherwise, splice the instruction to the preheader.
+    Preheader->splice(Preheader->getFirstTerminator(), MI->getParent(), MI);
+
+    // Clear the kill flags of any register this instruction defines,
+    // since they may need to be live throughout the entire loop
+    // rather than just live for part of it.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.isDef() && !MO.isDead())
+        RegInfo->clearKillFlags(MO.getReg());
+    }
+
+    // Add to the CSE map.
+    if (CI != CSEMap.end())
+      CI->second.push_back(MI);
+    else {
+      std::vector<const MachineInstr*> CSEMIs;
+      CSEMIs.push_back(MI);
+      CSEMap.insert(std::make_pair(Opcode, CSEMIs));
+    }
+  }
+
+  ++NumHoisted;
+  Changed = true;
+}
+
+MachineBasicBlock *MachineLICM::getCurPreheader() {
+  // Determine the block to which to hoist instructions. If we can't find a
+  // suitable loop predecessor, we can't do any hoisting.
+
+  // If we've tried to get a preheader and failed, don't try again.
+  if (CurPreheader == reinterpret_cast<MachineBasicBlock *>(-1))
+    return 0;
+
+  if (!CurPreheader) {
+    CurPreheader = CurLoop->getLoopPreheader();
+    if (!CurPreheader) {
+      MachineBasicBlock *Pred = CurLoop->getLoopPredecessor();
+      if (!Pred) {
+        CurPreheader = reinterpret_cast<MachineBasicBlock *>(-1);
+        return 0;
+      }
+
+      CurPreheader = Pred->SplitCriticalEdge(CurLoop->getHeader(), this);
+      if (!CurPreheader) {
+        CurPreheader = reinterpret_cast<MachineBasicBlock *>(-1);
+        return 0;
+      }
+    }
+  }
+  return CurPreheader;
+}
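+
+// The reinterpret_cast<MachineBasicBlock *>(-1) value is a sentinel caching
+// a failed lookup, so a known failure is never recomputed. The same idiom in
+// miniature (illustrative sketch; FAILED and compute() are assumed names):
+//
+//   if (Cached == FAILED) return 0;      // don't retry a known failure
+//   if (!Cached) {
+//     Cached = compute();                // may split a critical edge
+//     if (!Cached) { Cached = FAILED; return 0; }
+//   }
+//   return Cached;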
diff --git a/src/LLVM/lib/CodeGen/MachineLoopInfo.cpp b/src/LLVM/lib/CodeGen/MachineLoopInfo.cpp
new file mode 100644
index 0000000..039edf2
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineLoopInfo.cpp
@@ -0,0 +1,80 @@
+//===- MachineLoopInfo.cpp - Natural Loop Calculator ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MachineLoopInfo class that is used to identify natural
+// loops and determine the loop depth of various nodes of the CFG.  Note that
+// the loops identified may actually be several natural loops that share the 
+// same header node... not just a single natural loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+namespace llvm {
+#define MLB class LoopBase<MachineBasicBlock, MachineLoop>
+TEMPLATE_INSTANTIATION(MLB);
+#undef MLB
+#define MLIB class LoopInfoBase<MachineBasicBlock, MachineLoop>
+TEMPLATE_INSTANTIATION(MLIB);
+#undef MLIB
+}
+
+char MachineLoopInfo::ID = 0;
+static RegisterPass<MachineLoopInfo>
+X("machine-loops", "Machine Natural Loop Construction", true);
+
+char &llvm::MachineLoopInfoID = MachineLoopInfo::ID;
+
+bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) {
+  releaseMemory();
+  LI.Calculate(getAnalysis<MachineDominatorTree>().getBase());    // Update
+  return false;
+}
+
+void MachineLoopInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<MachineDominatorTree>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineBasicBlock *MachineLoop::getTopBlock() {
+  MachineBasicBlock *TopMBB = getHeader();
+  MachineFunction::iterator Begin = TopMBB->getParent()->begin();
+  if (TopMBB != Begin) {
+    MachineBasicBlock *PriorMBB = prior(MachineFunction::iterator(TopMBB));
+    while (contains(PriorMBB)) {
+      TopMBB = PriorMBB;
+      if (TopMBB == Begin) break;
+      PriorMBB = prior(MachineFunction::iterator(TopMBB));
+    }
+  }
+  return TopMBB;
+}
+
+MachineBasicBlock *MachineLoop::getBottomBlock() {
+  MachineBasicBlock *BotMBB = getHeader();
+  MachineFunction::iterator End = BotMBB->getParent()->end();
+  if (BotMBB != prior(End)) {
+    MachineBasicBlock *NextMBB = llvm::next(MachineFunction::iterator(BotMBB));
+    while (contains(NextMBB)) {
+      BotMBB = NextMBB;
+      if (BotMBB == llvm::next(MachineFunction::iterator(BotMBB))) break;
+      NextMBB = llvm::next(MachineFunction::iterator(BotMBB));
+    }
+  }
+  return BotMBB;
+}
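+
+// For illustration (block layout assumed): with function layout order
+// BB0 BB1 BB2 BB3 and a loop containing {BB1, BB2}, getTopBlock() scans
+// backward from the header to return BB1 and getBottomBlock() scans forward
+// to return BB2; both walk in layout order, not CFG order.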
+
+void MachineLoop::dump() const {
+  print(dbgs());
+}
diff --git a/src/LLVM/lib/CodeGen/MachineModuleInfo.cpp b/src/LLVM/lib/CodeGen/MachineModuleInfo.cpp
new file mode 100644
index 0000000..2d510b9
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineModuleInfo.cpp
@@ -0,0 +1,421 @@
+//===-- llvm/CodeGen/MachineModuleInfo.cpp ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineModuleInfo.h"
+
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Module.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+using namespace llvm::dwarf;
+
+// Handle the Pass registration stuff.
+INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo",
+                "Machine Module Information", false, false);
+char MachineModuleInfo::ID = 0;
+
+// Out of line virtual method.
+MachineModuleInfoImpl::~MachineModuleInfoImpl() {}
+
+namespace llvm {
+class MMIAddrLabelMapCallbackPtr : CallbackVH {
+  MMIAddrLabelMap *Map;
+public:
+  MMIAddrLabelMapCallbackPtr() : Map(0) {}
+  MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V), Map(0) {}
+  
+  void setPtr(BasicBlock *BB) {
+    ValueHandleBase::operator=(BB);
+  }
+    
+  void setMap(MMIAddrLabelMap *map) { Map = map; }
+  
+  virtual void deleted();
+  virtual void allUsesReplacedWith(Value *V2);
+};
+  
+class MMIAddrLabelMap {
+  MCContext &Context;
+  struct AddrLabelSymEntry {
+    /// Symbols - The symbols for the label.  This is a pointer union that is
+    /// either one symbol (the common case) or a list of symbols.
+    PointerUnion<MCSymbol *, std::vector<MCSymbol*>*> Symbols;
+    
+    Function *Fn;   // The containing function of the BasicBlock.
+    unsigned Index; // The index in BBCallbacks for the BasicBlock.
+  };
+  
+  DenseMap<AssertingVH<BasicBlock>, AddrLabelSymEntry> AddrLabelSymbols;
+  
+  /// BBCallbacks - Callbacks for the BasicBlock's that we have entries for.  We
+  /// use this so we get notified if a block is deleted or RAUWd.
+  std::vector<MMIAddrLabelMapCallbackPtr> BBCallbacks;
+
+  /// DeletedAddrLabelsNeedingEmission - This is a per-function list of symbols
+  /// whose corresponding BasicBlock got deleted.  These symbols need to be
+  /// emitted at some point in the file, so AsmPrinter emits them after the
+  /// function body.
+  DenseMap<AssertingVH<Function>, std::vector<MCSymbol*> >
+    DeletedAddrLabelsNeedingEmission;
+public:
+  
+  MMIAddrLabelMap(MCContext &context) : Context(context) {}
+  ~MMIAddrLabelMap() {
+    assert(DeletedAddrLabelsNeedingEmission.empty() &&
+           "Some labels for deleted blocks never got emitted");
+    
+    // Deallocate any of the 'list of symbols' case.
+    for (DenseMap<AssertingVH<BasicBlock>, AddrLabelSymEntry>::iterator
+         I = AddrLabelSymbols.begin(), E = AddrLabelSymbols.end(); I != E; ++I)
+      if (I->second.Symbols.is<std::vector<MCSymbol*>*>())
+        delete I->second.Symbols.get<std::vector<MCSymbol*>*>();
+  }
+  
+  MCSymbol *getAddrLabelSymbol(BasicBlock *BB);
+  std::vector<MCSymbol*> getAddrLabelSymbolToEmit(BasicBlock *BB);
+
+  void takeDeletedSymbolsForFunction(Function *F, 
+                                     std::vector<MCSymbol*> &Result);
+
+  void UpdateForDeletedBlock(BasicBlock *BB);
+  void UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New);
+};
+}
+
+MCSymbol *MMIAddrLabelMap::getAddrLabelSymbol(BasicBlock *BB) {
+  assert(BB->hasAddressTaken() &&
+         "Shouldn't get label for block without address taken");
+  AddrLabelSymEntry &Entry = AddrLabelSymbols[BB];
+  
+  // If we already had an entry for this block, just return it.
+  if (!Entry.Symbols.isNull()) {
+    assert(BB->getParent() == Entry.Fn && "Parent changed");
+    if (Entry.Symbols.is<MCSymbol*>())
+      return Entry.Symbols.get<MCSymbol*>();
+    return (*Entry.Symbols.get<std::vector<MCSymbol*>*>())[0];
+  }
+  
+  // Otherwise, this is a new entry, create a new symbol for it and add an
+  // entry to BBCallbacks so we can be notified if the BB is deleted or RAUWd.
+  BBCallbacks.push_back(BB);
+  BBCallbacks.back().setMap(this);
+  Entry.Index = BBCallbacks.size()-1;
+  Entry.Fn = BB->getParent();
+  MCSymbol *Result = Context.CreateTempSymbol();
+  Entry.Symbols = Result;
+  return Result;
+}
+
+std::vector<MCSymbol*>
+MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) {
+  assert(BB->hasAddressTaken() &&
+         "Shouldn't get label for block without address taken");
+  AddrLabelSymEntry &Entry = AddrLabelSymbols[BB];
+  
+  std::vector<MCSymbol*> Result;
+  
+  // If we already had an entry for this block, just return it.
+  if (Entry.Symbols.isNull())
+    Result.push_back(getAddrLabelSymbol(BB));
+  else if (MCSymbol *Sym = Entry.Symbols.dyn_cast<MCSymbol*>())
+    Result.push_back(Sym);
+  else
+    Result = *Entry.Symbols.get<std::vector<MCSymbol*>*>();
+  return Result;
+}
+
+
+/// takeDeletedSymbolsForFunction - If we have any deleted symbols for F, return
+/// them.
+void MMIAddrLabelMap::
+takeDeletedSymbolsForFunction(Function *F, std::vector<MCSymbol*> &Result) {
+  DenseMap<AssertingVH<Function>, std::vector<MCSymbol*> >::iterator I =
+    DeletedAddrLabelsNeedingEmission.find(F);
+
+  // If there are no entries for the function, just return.
+  if (I == DeletedAddrLabelsNeedingEmission.end()) return;
+  
+  // Otherwise, take the list.
+  std::swap(Result, I->second);
+  DeletedAddrLabelsNeedingEmission.erase(I);
+}
+
+
+void MMIAddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) {
+  // If the block got deleted, there is no need for the symbol.  If the symbol
+  // was already emitted, we can just forget about it; otherwise we need to
+  // queue it up for later emission when the function is output.
+  AddrLabelSymEntry Entry = AddrLabelSymbols[BB];
+  AddrLabelSymbols.erase(BB);
+  assert(!Entry.Symbols.isNull() && "Didn't have a symbol, why a callback?");
+  BBCallbacks[Entry.Index] = 0;  // Clear the callback.
+
+  assert((BB->getParent() == 0 || BB->getParent() == Entry.Fn) &&
+         "Block/parent mismatch");
+
+  // Handle both the single and the multiple symbols cases.
+  if (MCSymbol *Sym = Entry.Symbols.dyn_cast<MCSymbol*>()) {
+    if (Sym->isDefined())
+      return;
+  
+    // If the block is not yet defined, we need to emit it at the end of the
+    // function.  Add the symbol to the DeletedAddrLabelsNeedingEmission list
+    // for the containing Function.  Since the block is being deleted, its
+    // parent may already be removed, so we have to get the function from
+    // 'Entry'.
+    DeletedAddrLabelsNeedingEmission[Entry.Fn].push_back(Sym);
+  } else {
+    std::vector<MCSymbol*> *Syms = Entry.Symbols.get<std::vector<MCSymbol*>*>();
+
+    for (unsigned i = 0, e = Syms->size(); i != e; ++i) {
+      MCSymbol *Sym = (*Syms)[i];
+      if (Sym->isDefined()) continue;  // Ignore already emitted labels.
+      
+      // If the block is not yet defined, we need to emit it at the end of the
+      // function.  Add the symbol to the DeletedAddrLabelsNeedingEmission list
+      // for the containing Function.  Since the block is being deleted, its
+      // parent may already be removed, so we have to get the function from
+      // 'Entry'.
+      DeletedAddrLabelsNeedingEmission[Entry.Fn].push_back(Sym);
+    }
+    
+    // The entry is deleted, free the memory associated with the symbol list.
+    delete Syms;
+  }
+}
+
+void MMIAddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) {
+  // Get the entry for the RAUW'd block and remove it from our map.
+  AddrLabelSymEntry OldEntry = AddrLabelSymbols[Old];
+  AddrLabelSymbols.erase(Old);
+  assert(!OldEntry.Symbols.isNull() && "Didn't have a symbol, why a callback?");
+
+  AddrLabelSymEntry &NewEntry = AddrLabelSymbols[New];
+
+  // If New is not address taken, just move our symbol over to it.
+  if (NewEntry.Symbols.isNull()) {
+    BBCallbacks[OldEntry.Index].setPtr(New);    // Update the callback.
+    NewEntry = OldEntry;     // Set New's entry.
+    return;
+  }
+
+  BBCallbacks[OldEntry.Index] = 0;    // Update the callback.
+
+  // Otherwise, we need to add the old symbol to the new block's set.  If it is
+  // just a single entry, upgrade it to a symbol list.
+  if (MCSymbol *PrevSym = NewEntry.Symbols.dyn_cast<MCSymbol*>()) {
+    std::vector<MCSymbol*> *SymList = new std::vector<MCSymbol*>();
+    SymList->push_back(PrevSym);
+    NewEntry.Symbols = SymList;
+  }
+      
+  std::vector<MCSymbol*> *SymList =
+    NewEntry.Symbols.get<std::vector<MCSymbol*>*>();
+
+  // If the old entry was a single symbol, add it.
+  if (MCSymbol *Sym = OldEntry.Symbols.dyn_cast<MCSymbol*>()) {
+    SymList->push_back(Sym);
+    return;
+  }
+  
+  // Otherwise, concatenate the list.
+  std::vector<MCSymbol*> *Syms =
+    OldEntry.Symbols.get<std::vector<MCSymbol*>*>();
+  SymList->insert(SymList->end(), Syms->begin(), Syms->end());
+  delete Syms;
+}
+
+
+void MMIAddrLabelMapCallbackPtr::deleted() {
+  Map->UpdateForDeletedBlock(cast<BasicBlock>(getValPtr()));
+}
+
+void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) {
+  Map->UpdateForRAUWBlock(cast<BasicBlock>(getValPtr()), cast<BasicBlock>(V2));
+}
+
+
+//===----------------------------------------------------------------------===//
+
+MachineModuleInfo::MachineModuleInfo(const MCAsmInfo &MAI)
+: ImmutablePass(ID), Context(MAI),
+  ObjFileMMI(0),
+  CurCallSite(0), DbgInfoAvailable(false) {
+  AddrLabelSymbols = 0;
+  TheModule = 0;
+}
+
+MachineModuleInfo::MachineModuleInfo()
+: ImmutablePass(ID), Context(*(MCAsmInfo*)0) {
+  assert(0 && "This MachineModuleInfo constructor should never be called, MMI "
+         "should always be explicitly constructed by LLVMTargetMachine");
+  abort();
+}
+
+MachineModuleInfo::~MachineModuleInfo() {
+  delete ObjFileMMI;
+  
+  // FIXME: Why isn't doFinalization being called??
+  //assert(AddrLabelSymbols == 0 && "doFinalization not called");
+  delete AddrLabelSymbols;
+  AddrLabelSymbols = 0;
+}
+
+/// doInitialization - Initialize the state for a new module.
+///
+bool MachineModuleInfo::doInitialization() {
+  assert(AddrLabelSymbols == 0 && "Improperly initialized");
+  return false;
+}
+
+/// doFinalization - Tear down the state after completion of a module.
+///
+bool MachineModuleInfo::doFinalization() {
+  delete AddrLabelSymbols;
+  AddrLabelSymbols = 0;
+  return false;
+}
+
+/// EndFunction - Discard function meta information.
+///
+void MachineModuleInfo::EndFunction() {
+  // Clean up frame info.
+  FrameMoves.clear();
+
+  // Clean up exception info.
+  CallSiteMap.clear();
+  TypeInfos.clear();
+  FilterIds.clear();
+  FilterEnds.clear();
+  VariableDbgInfo.clear();
+}
+
+/// AnalyzeModule - Scan the module for global debug information.
+///
+void MachineModuleInfo::AnalyzeModule(const Module &M) {
+  // Insert functions in the llvm.used array (but not llvm.compiler.used) into
+  // UsedFunctions.
+  const GlobalVariable *GV = M.getGlobalVariable("llvm.used");
+  if (!GV || !GV->hasInitializer()) return;
+
+  // Should be an array of 'i8*'.
+  const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (InitList == 0) return;
+
+  for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
+    if (const Function *F =
+          dyn_cast<Function>(InitList->getOperand(i)->stripPointerCasts()))
+      UsedFunctions.insert(F);
+}
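+
+// For reference (illustrative IR, not from this drop): llvm.used entries
+// look like
+//   @llvm.used = appending global [1 x i8*]
+//     [i8* bitcast (void ()* @foo to i8*)], section "llvm.metadata"
+// and AnalyzeModule records @foo in UsedFunctions after stripping the cast.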
+
+//===- Address of Block Management ----------------------------------------===//
+
+
+/// getAddrLabelSymbol - Return the symbol to be used for the specified basic
+/// block when its address is taken.  This cannot be its normal LBB label
+/// because the block may be accessed outside its containing function.
+MCSymbol *MachineModuleInfo::getAddrLabelSymbol(const BasicBlock *BB) {
+  // Lazily create AddrLabelSymbols.
+  if (AddrLabelSymbols == 0)
+    AddrLabelSymbols = new MMIAddrLabelMap(Context);
+  return AddrLabelSymbols->getAddrLabelSymbol(const_cast<BasicBlock*>(BB));
+}
+
+/// getAddrLabelSymbolToEmit - Return the symbol to be used for the specified
+/// basic block when its address is taken.  If other blocks were RAUW'd to
+/// this one, we may have to emit them as well, so return the whole set.
+std::vector<MCSymbol*> MachineModuleInfo::
+getAddrLabelSymbolToEmit(const BasicBlock *BB) {
+  // Lazily create AddrLabelSymbols.
+  if (AddrLabelSymbols == 0)
+    AddrLabelSymbols = new MMIAddrLabelMap(Context);
+ return AddrLabelSymbols->getAddrLabelSymbolToEmit(const_cast<BasicBlock*>(BB));
+}
+
+
+/// takeDeletedSymbolsForFunction - If the specified function has had any
+/// references to address-taken blocks generated, but the block got deleted,
+/// return the symbol now so we can emit it.  This prevents emitting a
+/// reference to a symbol that has no definition.
+void MachineModuleInfo::
+takeDeletedSymbolsForFunction(const Function *F,
+                              std::vector<MCSymbol*> &Result) {
+  // If no blocks have had their addresses taken, we're done.
+  if (AddrLabelSymbols == 0) return;
+  return AddrLabelSymbols->
+     takeDeletedSymbolsForFunction(const_cast<Function*>(F), Result);
+}
+
+/// getTypeIDFor - Return the type id for the specified typeinfo.  This is
+/// function wide.
+unsigned MachineModuleInfo::getTypeIDFor(const GlobalVariable *TI) {
+  for (unsigned i = 0, N = TypeInfos.size(); i != N; ++i)
+    if (TypeInfos[i] == TI) return i + 1;
+
+  TypeInfos.push_back(TI);
+  return TypeInfos.size();
+}
+
+/// getFilterIDFor - Return the filter id for the specified typeinfos.  This is
+/// function wide.
+int MachineModuleInfo::getFilterIDFor(std::vector<unsigned> &TyIds) {
+  // If the new filter coincides with the tail of an existing filter, then
+  // re-use the existing filter.  Folding filters more than this requires
+  // re-ordering filters and/or their elements - probably not worth it.
+  for (std::vector<unsigned>::iterator I = FilterEnds.begin(),
+       E = FilterEnds.end(); I != E; ++I) {
+    unsigned i = *I, j = TyIds.size();
+
+    while (i && j)
+      if (FilterIds[--i] != TyIds[--j])
+        goto try_next;
+
+    if (!j)
+      // The new filter coincides with range [i, end) of the existing filter.
+      return -(1 + i);
+
+try_next:;
+  }
+
+  // Add the new filter.
+  int FilterID = -(1 + FilterIds.size());
+  FilterIds.reserve(FilterIds.size() + TyIds.size() + 1);
+  for (unsigned I = 0, N = TyIds.size(); I != N; ++I)
+    FilterIds.push_back(TyIds[I]);
+  FilterEnds.push_back(FilterIds.size());
+  FilterIds.push_back(0); // terminator
+  return FilterID;
+}
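+
+// Worked example (illustrative): starting from empty tables, TyIds = {1,2}
+// returns FilterID -1, leaving FilterIds = {1,2,0} and FilterEnds = {2}.
+// A later request for TyIds = {2} matches the tail of that filter at
+// index 1 and returns -(1 + 1) = -2 without adding any entries.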
+
+namespace {
+  /// VariableDebugSorter - Comparison to sort the VariableDbgInfo map
+  /// by source location, to avoid depending on the arbitrary order that
+  /// instruction selection visits variables in.
+  struct VariableDebugSorter {
+    bool operator()(const MachineModuleInfo::VariableDbgInfoMapTy::value_type &A,
+                    const MachineModuleInfo::VariableDbgInfoMapTy::value_type &B)
+                  const {
+       if (A.second.second.getLine() != B.second.second.getLine())
+         return A.second.second.getLine() < B.second.second.getLine();
+       if (A.second.second.getCol() != B.second.second.getCol())
+         return A.second.second.getCol() < B.second.second.getCol();
+       return false;
+    }
+  };
+}
diff --git a/src/LLVM/lib/CodeGen/MachineModuleInfoImpls.cpp b/src/LLVM/lib/CodeGen/MachineModuleInfoImpls.cpp
new file mode 100644
index 0000000..5ab56c0
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineModuleInfoImpls.cpp
@@ -0,0 +1,45 @@
+//===-- llvm/CodeGen/MachineModuleInfoImpls.cpp ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements object-file format specific implementations of
+// MachineModuleInfoImpl.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/MC/MCSymbol.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MachineModuleInfoMachO
+//===----------------------------------------------------------------------===//
+
+// Out of line virtual method.
+void MachineModuleInfoMachO::Anchor() {}
+void MachineModuleInfoELF::Anchor() {}
+
+static int SortSymbolPair(const void *LHS, const void *RHS) {
+  typedef std::pair<MCSymbol*, MachineModuleInfoImpl::StubValueTy> PairTy;
+  const MCSymbol *LHSS = ((const PairTy *)LHS)->first;
+  const MCSymbol *RHSS = ((const PairTy *)RHS)->first;
+  return LHSS->getName().compare(RHSS->getName());
+}
+
+/// GetSortedStubs - Return the entries from a DenseMap in a deterministic
+/// sorted order.
+MachineModuleInfoImpl::SymbolListTy
+MachineModuleInfoImpl::GetSortedStubs(const DenseMap<MCSymbol*,
+                                      MachineModuleInfoImpl::StubValueTy>&Map) {
+  MachineModuleInfoImpl::SymbolListTy List(Map.begin(), Map.end());
+
+  if (!List.empty())
+    qsort(&List[0], List.size(), sizeof(List[0]), SortSymbolPair);
+  return List;
+}
+
diff --git a/src/LLVM/lib/CodeGen/MachinePassRegistry.cpp b/src/LLVM/lib/CodeGen/MachinePassRegistry.cpp
new file mode 100644
index 0000000..9f4ef12
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachinePassRegistry.cpp
@@ -0,0 +1,41 @@
+//===-- CodeGen/MachinePassRegistry.cpp -----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the machine function pass registry for register allocators
+// and instruction schedulers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachinePassRegistry.h"
+
+using namespace llvm;
+
+
+/// Add - Adds a function pass to the registration list.
+///
+void MachinePassRegistry::Add(MachinePassRegistryNode *Node) {
+  Node->setNext(List);
+  List = Node;
+  if (Listener) Listener->NotifyAdd(Node->getName(),
+                                    Node->getCtor(),
+                                    Node->getDescription());
+}
+
+
+/// Remove - Removes a function pass from the registration list.
+///
+void MachinePassRegistry::Remove(MachinePassRegistryNode *Node) {
+  for (MachinePassRegistryNode **I = &List; *I; I = (*I)->getNextAddress()) {
+    if (*I == Node) {
+      if (Listener) Listener->NotifyRemove(Node->getName());
+      *I = (*I)->getNext();
+      break;
+    }
+  }
+}
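+
+// Remove unlinks through a pointer-to-pointer so the head of the list needs
+// no special case; the same idiom in miniature (illustrative sketch):
+//
+//   for (Node **I = &Head; *I; I = &(*I)->Next)
+//     if (*I == N) { *I = N->Next; break; }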
diff --git a/src/LLVM/lib/CodeGen/MachineRegisterInfo.cpp b/src/LLVM/lib/CodeGen/MachineRegisterInfo.cpp
new file mode 100644
index 0000000..5d852f2
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineRegisterInfo.cpp
@@ -0,0 +1,224 @@
+//===-- lib/CodeGen/MachineRegisterInfo.cpp -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the MachineRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI) {
+  VRegInfo.reserve(256);
+  RegAllocHints.reserve(256);
+  RegClass2VRegMap = new std::vector<unsigned>[TRI.getNumRegClasses()];
+  UsedPhysRegs.resize(TRI.getNumRegs());
+  
+  // Create the physreg use/def lists.
+  PhysRegUseDefLists = new MachineOperand*[TRI.getNumRegs()];
+  memset(PhysRegUseDefLists, 0, sizeof(MachineOperand*)*TRI.getNumRegs());
+}
+
+MachineRegisterInfo::~MachineRegisterInfo() {
+#ifndef NDEBUG
+  for (unsigned i = 0, e = VRegInfo.size(); i != e; ++i)
+    assert(VRegInfo[i].second == 0 && "Vreg use list non-empty still?");
+  for (unsigned i = 0, e = UsedPhysRegs.size(); i != e; ++i)
+    assert(!PhysRegUseDefLists[i] &&
+           "PhysRegUseDefLists has entries after all instructions are deleted");
+#endif
+  delete [] PhysRegUseDefLists;
+  delete [] RegClass2VRegMap;
+}
+
+/// setRegClass - Set the register class of the specified virtual register.
+///
+void
+MachineRegisterInfo::setRegClass(unsigned Reg, const TargetRegisterClass *RC) {
+  unsigned VR = Reg;
+  Reg -= TargetRegisterInfo::FirstVirtualRegister;
+  assert(Reg < VRegInfo.size() && "Invalid vreg!");
+  const TargetRegisterClass *OldRC = VRegInfo[Reg].first;
+  VRegInfo[Reg].first = RC;
+
+  // Remove from old register class's vregs list. This may be slow but
+  // fortunately this operation is rarely needed.
+  std::vector<unsigned> &VRegs = RegClass2VRegMap[OldRC->getID()];
+  std::vector<unsigned>::iterator I = std::find(VRegs.begin(), VRegs.end(), VR);
+  VRegs.erase(I);
+
+  // Add to new register class's vregs list.
+  RegClass2VRegMap[RC->getID()].push_back(VR);
+}
+
+/// createVirtualRegister - Create and return a new virtual register in the
+/// function with the specified register class.
+///
+unsigned
+MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){
+  assert(RegClass && "Cannot create register without RegClass!");
+  // Add a reg, but keep track of whether the vector reallocated or not.
+  void *ArrayBase = VRegInfo.empty() ? 0 : &VRegInfo[0];
+  VRegInfo.push_back(std::make_pair(RegClass, (MachineOperand*)0));
+  RegAllocHints.push_back(std::make_pair(0, 0));
+
+  if (!((&VRegInfo[0] == ArrayBase || VRegInfo.size() == 1)))
+    // The vector reallocated, handle this now.
+    HandleVRegListReallocation();
+  unsigned VR = getLastVirtReg();
+  RegClass2VRegMap[RegClass->getID()].push_back(VR);
+  return VR;
+}
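+
+// Typical use (illustrative sketch; the x86 register class, opcode, and
+// memory operands are assumed, not part of this drop):
+//   unsigned Tmp = MRI->createVirtualRegister(&X86::GR32RegClass);
+//   BuildMI(MBB, InsertPt, DL, TII->get(X86::MOV32rm), Tmp)
+//     .addFrameIndex(FI).addImm(1).addReg(0).addImm(0).addReg(0);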
+
+/// HandleVRegListReallocation - We just added a virtual register to the
+/// VRegInfo list and it reallocated.  Update the use/def list
+/// back-pointers.
+void MachineRegisterInfo::HandleVRegListReallocation() {
+  // The back pointers for the vreg lists point into the previous vector.
+  // Update them to point to their correct slots.
+  for (unsigned i = 0, e = VRegInfo.size(); i != e; ++i) {
+    MachineOperand *List = VRegInfo[i].second;
+    if (!List) continue;
+    // Update the back-pointer to be accurate once more.
+    List->Contents.Reg.Prev = &VRegInfo[i].second;
+  }
+}
+
+/// replaceRegWith - Replace all instances of FromReg with ToReg in the
+/// machine function.  This is like llvm-level X->replaceAllUsesWith(Y),
+/// except that it also changes any definitions of the register as well.
+void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) {
+  assert(FromReg != ToReg && "Cannot replace a reg with itself");
+
+  // TODO: This could be more efficient by bulk changing the operands.
+  for (reg_iterator I = reg_begin(FromReg), E = reg_end(); I != E; ) {
+    MachineOperand &O = I.getOperand();
+    ++I;
+    O.setReg(ToReg);
+  }
+}
+
+
+/// getVRegDef - Return the machine instr that defines the specified virtual
+/// register or null if none is found.  This assumes that the code is in SSA
+/// form, so there should only be one definition.
+MachineInstr *MachineRegisterInfo::getVRegDef(unsigned Reg) const {
+  assert(Reg-TargetRegisterInfo::FirstVirtualRegister < VRegInfo.size() &&
+         "Invalid vreg!");
+  // Since we are in SSA form, we can use the first definition.
+  if (!def_empty(Reg))
+    return &*def_begin(Reg);
+  return 0;
+}
+
+bool MachineRegisterInfo::hasOneUse(unsigned RegNo) const {
+  use_iterator UI = use_begin(RegNo);
+  if (UI == use_end())
+    return false;
+  return ++UI == use_end();
+}
+
+bool MachineRegisterInfo::hasOneNonDBGUse(unsigned RegNo) const {
+  use_nodbg_iterator UI = use_nodbg_begin(RegNo);
+  if (UI == use_nodbg_end())
+    return false;
+  return ++UI == use_nodbg_end();
+}
+
+/// clearKillFlags - Iterate over all the uses of the given register and
+/// clear the kill flag from the MachineOperand. This function is used by
+/// optimization passes which extend register lifetimes and need only
+/// preserve conservative kill flag information.
+void MachineRegisterInfo::clearKillFlags(unsigned Reg) const {
+  for (use_iterator UI = use_begin(Reg), UE = use_end(); UI != UE; ++UI)
+    UI.getOperand().setIsKill(false);
+}
+
+bool MachineRegisterInfo::isLiveIn(unsigned Reg) const {
+  for (livein_iterator I = livein_begin(), E = livein_end(); I != E; ++I)
+    if (I->first == Reg || I->second == Reg)
+      return true;
+  return false;
+}
+
+bool MachineRegisterInfo::isLiveOut(unsigned Reg) const {
+  for (liveout_iterator I = liveout_begin(), E = liveout_end(); I != E; ++I)
+    if (*I == Reg)
+      return true;
+  return false;
+}
+
+/// getLiveInPhysReg - If VReg is a live-in virtual register, return the
+/// corresponding live-in physical register.
+unsigned MachineRegisterInfo::getLiveInPhysReg(unsigned VReg) const {
+  for (livein_iterator I = livein_begin(), E = livein_end(); I != E; ++I)
+    if (I->second == VReg)
+      return I->first;
+  return 0;
+}
+
+/// getLiveInVirtReg - If PReg is a live-in physical register, return the
+/// corresponding live-in virtual register.
+unsigned MachineRegisterInfo::getLiveInVirtReg(unsigned PReg) const {
+  for (livein_iterator I = livein_begin(), E = livein_end(); I != E; ++I)
+    if (I->first == PReg)
+      return I->second;
+  return 0;
+}
+
+/// EmitLiveInCopies - Emit copies to initialize livein virtual registers
+/// into the given entry block.
+void
+MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB,
+                                      const TargetRegisterInfo &TRI,
+                                      const TargetInstrInfo &TII) {
+  // Emit the copies into the top of the block.
+  for (unsigned i = 0, e = LiveIns.size(); i != e; ++i)
+    if (LiveIns[i].second) {
+      if (use_empty(LiveIns[i].second)) {
+        // The livein has no uses. Drop it.
+        //
+        // It would be preferable to have isel avoid creating live-in
+        // records for unused arguments in the first place, but it's
+        // complicated by the debug info code for arguments.
+        LiveIns.erase(LiveIns.begin() + i);
+        --i; --e;
+      } else {
+        // Emit a copy.
+        BuildMI(*EntryMBB, EntryMBB->begin(), DebugLoc(),
+                TII.get(TargetOpcode::COPY), LiveIns[i].second)
+          .addReg(LiveIns[i].first);
+
+        // Add the register to the entry block live-in set.
+        EntryMBB->addLiveIn(LiveIns[i].first);
+      }
+    } else {
+      // Add the register to the entry block live-in set.
+      EntryMBB->addLiveIn(LiveIns[i].first);
+    }
+}
+
+void MachineRegisterInfo::closePhysRegsUsed(const TargetRegisterInfo &TRI) {
+  for (int i = UsedPhysRegs.find_first(); i >= 0;
+       i = UsedPhysRegs.find_next(i))
+    for (const unsigned *SS = TRI.getSubRegisters(i);
+         unsigned SubReg = *SS; ++SS)
+      if (SubReg > unsigned(i))
+        UsedPhysRegs.set(SubReg);
+}
+
+#ifndef NDEBUG
+void MachineRegisterInfo::dumpUses(unsigned Reg) const {
+  for (use_iterator I = use_begin(Reg), E = use_end(); I != E; ++I)
+    I.getOperand().getParent()->dump();
+}
+#endif
diff --git a/src/LLVM/lib/CodeGen/MachineSSAUpdater.cpp b/src/LLVM/lib/CodeGen/MachineSSAUpdater.cpp
new file mode 100644
index 0000000..84d6df2
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineSSAUpdater.cpp
@@ -0,0 +1,372 @@
+//===- MachineSSAUpdater.cpp - Unstructured SSA Update Tool ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MachineSSAUpdater class. It's based on the
+// SSAUpdater class in lib/Transforms/Utils.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
+using namespace llvm;
+
+typedef DenseMap<MachineBasicBlock*, unsigned> AvailableValsTy;
+static AvailableValsTy &getAvailableVals(void *AV) {
+  return *static_cast<AvailableValsTy*>(AV);
+}
+
+MachineSSAUpdater::MachineSSAUpdater(MachineFunction &MF,
+                                     SmallVectorImpl<MachineInstr*> *NewPHI)
+  : AV(0), InsertedPHIs(NewPHI) {
+  TII = MF.getTarget().getInstrInfo();
+  MRI = &MF.getRegInfo();
+}
+
+MachineSSAUpdater::~MachineSSAUpdater() {
+  delete &getAvailableVals(AV);
+}
+
+/// Initialize - Reset this object to get ready for a new set of SSA
+/// updates.  V is a prototype register: new PHI nodes are created with its
+/// register class.
+void MachineSSAUpdater::Initialize(unsigned V) {
+  if (AV == 0)
+    AV = new AvailableValsTy();
+  else
+    getAvailableVals(AV).clear();
+
+  VR = V;
+  VRC = MRI->getRegClass(VR);
+}
+
+/// HasValueForBlock - Return true if the MachineSSAUpdater already has a
+/// value for the specified block.
+bool MachineSSAUpdater::HasValueForBlock(MachineBasicBlock *BB) const {
+  return getAvailableVals(AV).count(BB);
+}
+
+/// AddAvailableValue - Indicate that a rewritten value is available in the
+/// specified block with the specified value.
+void MachineSSAUpdater::AddAvailableValue(MachineBasicBlock *BB, unsigned V) {
+  getAvailableVals(AV)[BB] = V;
+}
+
+/// GetValueAtEndOfBlock - Construct SSA form, materializing a value that is
+/// live at the end of the specified block.
+unsigned MachineSSAUpdater::GetValueAtEndOfBlock(MachineBasicBlock *BB) {
+  return GetValueAtEndOfBlockInternal(BB);
+}
+
+static
+unsigned LookForIdenticalPHI(MachineBasicBlock *BB,
+          SmallVector<std::pair<MachineBasicBlock*, unsigned>, 8> &PredValues) {
+  if (BB->empty())
+    return 0;
+
+  MachineBasicBlock::iterator I = BB->front();
+  if (!I->isPHI())
+    return 0;
+
+  AvailableValsTy AVals;
+  for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+    AVals[PredValues[i].first] = PredValues[i].second;
+  while (I != BB->end() && I->isPHI()) {
+    bool Same = true;
+    for (unsigned i = 1, e = I->getNumOperands(); i != e; i += 2) {
+      unsigned SrcReg = I->getOperand(i).getReg();
+      MachineBasicBlock *SrcBB = I->getOperand(i+1).getMBB();
+      if (AVals[SrcBB] != SrcReg) {
+        Same = false;
+        break;
+      }
+    }
+    if (Same)
+      return I->getOperand(0).getReg();
+    ++I;
+  }
+  return 0;
+}
+
+/// InsertNewDef - Insert an empty PHI or IMPLICIT_DEF instruction which
+/// defines a value of the given register class at the start of the specified
+/// basic block. It returns the virtual register defined by the instruction.
+static
+MachineInstr *InsertNewDef(unsigned Opcode,
+                           MachineBasicBlock *BB, MachineBasicBlock::iterator I,
+                           const TargetRegisterClass *RC,
+                           MachineRegisterInfo *MRI,
+                           const TargetInstrInfo *TII) {
+  unsigned NewVR = MRI->createVirtualRegister(RC);
+  return BuildMI(*BB, I, DebugLoc(), TII->get(Opcode), NewVR);
+}
+
+/// GetValueInMiddleOfBlock - Construct SSA form, materializing a value that
+/// is live in the middle of the specified block.
+///
+/// GetValueInMiddleOfBlock is the same as GetValueAtEndOfBlock except in one
+/// important case: if there is a definition of the rewritten value after the
+/// 'use' in BB.  Consider code like this:
+///
+///      X1 = ...
+///   SomeBB:
+///      use(X)
+///      X2 = ...
+///      br Cond, SomeBB, OutBB
+///
+/// In this case, there are two values (X1 and X2) added to the AvailableVals
+/// set by the client of the rewriter, and those values are both live out of
+/// their respective blocks.  However, the use of X happens in the *middle* of
+/// a block.  Because of this, we need to insert a new PHI node in SomeBB to
+/// merge the appropriate values, and this value isn't live out of the block.
+///
+unsigned MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) {
+  // If there is no definition of the renamed variable in this block, just use
+  // GetValueAtEndOfBlock to do our work.
+  if (!HasValueForBlock(BB))
+    return GetValueAtEndOfBlockInternal(BB);
+
+  // If there are no predecessors, just return undef.
+  if (BB->pred_empty()) {
+    // Insert an implicit_def to represent an undef value.
+    MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
+                                        BB, BB->getFirstTerminator(),
+                                        VRC, MRI, TII);
+    return NewDef->getOperand(0).getReg();
+  }
+
+  // Otherwise, we have the hard case.  Get the live-in values for each
+  // predecessor.
+  SmallVector<std::pair<MachineBasicBlock*, unsigned>, 8> PredValues;
+  unsigned SingularValue = 0;
+
+  bool isFirstPred = true;
+  for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+         E = BB->pred_end(); PI != E; ++PI) {
+    MachineBasicBlock *PredBB = *PI;
+    unsigned PredVal = GetValueAtEndOfBlockInternal(PredBB);
+    PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+    // Compute SingularValue.
+    if (isFirstPred) {
+      SingularValue = PredVal;
+      isFirstPred = false;
+    } else if (PredVal != SingularValue)
+      SingularValue = 0;
+  }
+
+  // Otherwise, if all the merged values are the same, just use it.
+  if (SingularValue != 0)
+    return SingularValue;
+
+  // If an identical PHI is already in BB, just reuse it.
+  unsigned DupPHI = LookForIdenticalPHI(BB, PredValues);
+  if (DupPHI)
+    return DupPHI;
+
+  // Otherwise, we do need a PHI: insert one now.
+  MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->front();
+  MachineInstr *InsertedPHI = InsertNewDef(TargetOpcode::PHI, BB,
+                                           Loc, VRC, MRI, TII);
+
+  // Fill in all the predecessors of the PHI.
+  MachineInstrBuilder MIB(InsertedPHI);
+  for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+    MIB.addReg(PredValues[i].second).addMBB(PredValues[i].first);
+
+  // See if the PHI node can be merged to a single value.  This can happen in
+  // loop cases when we get a PHI of itself and one other value.
+  if (unsigned ConstVal = InsertedPHI->isConstantValuePHI()) {
+    InsertedPHI->eraseFromParent();
+    return ConstVal;
+  }
+
+  // If the client wants to know about all new instructions, tell it.
+  if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
+
+  DEBUG(dbgs() << "  Inserted PHI: " << *InsertedPHI << "\n");
+  return InsertedPHI->getOperand(0).getReg();
+}
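+
+// Typical client usage (illustrative sketch; the block and register names
+// are assumed, not from this drop):
+//
+//   MachineSSAUpdater SSAUpdate(MF);
+//   SSAUpdate.Initialize(OrigReg);               // prototype vreg
+//   SSAUpdate.AddAvailableValue(DefBB, OrigReg);
+//   SSAUpdate.AddAvailableValue(CopyBB, NewReg);
+//   // then, for each use operand MO of OrigReg needing repair:
+//   SSAUpdate.RewriteUse(MO);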
+
+static
+MachineBasicBlock *findCorrespondingPred(const MachineInstr *MI,
+                                         MachineOperand *U) {
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+    if (&MI->getOperand(i) == U)
+      return MI->getOperand(i+1).getMBB();
+  }
+
+  llvm_unreachable("MachineOperand::getParent() failure?");
+  return 0;
+}
+
+/// RewriteUse - Rewrite a use of the symbolic value.  This handles PHI nodes,
+/// which use their value in the corresponding predecessor.
+void MachineSSAUpdater::RewriteUse(MachineOperand &U) {
+  MachineInstr *UseMI = U.getParent();
+  unsigned NewVR = 0;
+  if (UseMI->isPHI()) {
+    MachineBasicBlock *SourceBB = findCorrespondingPred(UseMI, &U);
+    NewVR = GetValueAtEndOfBlockInternal(SourceBB);
+  } else {
+    NewVR = GetValueInMiddleOfBlock(UseMI->getParent());
+  }
+
+  U.setReg(NewVR);
+}
+
+void MachineSSAUpdater::ReplaceRegWith(unsigned OldReg, unsigned NewReg) {
+  MRI->replaceRegWith(OldReg, NewReg);
+
+  AvailableValsTy &AvailableVals = getAvailableVals(AV);
+  for (DenseMap<MachineBasicBlock*, unsigned>::iterator
+         I = AvailableVals.begin(), E = AvailableVals.end(); I != E; ++I)
+    if (I->second == OldReg)
+      I->second = NewReg;
+}
+
+/// MachinePHIiter - Iterator for PHI operands.  This is used for the
+/// PHI_iterator in the SSAUpdaterImpl template.
+namespace {
+  class MachinePHIiter {
+  private:
+    MachineInstr *PHI;
+    unsigned idx;
+ 
+  public:
+    explicit MachinePHIiter(MachineInstr *P) // begin iterator
+      : PHI(P), idx(1) {}
+    MachinePHIiter(MachineInstr *P, bool) // end iterator
+      : PHI(P), idx(PHI->getNumOperands()) {}
+
+    MachinePHIiter &operator++() { idx += 2; return *this; } 
+    bool operator==(const MachinePHIiter& x) const { return idx == x.idx; }
+    bool operator!=(const MachinePHIiter& x) const { return !operator==(x); }
+    unsigned getIncomingValue() { return PHI->getOperand(idx).getReg(); }
+    MachineBasicBlock *getIncomingBlock() {
+      return PHI->getOperand(idx+1).getMBB();
+    }
+  };
+}
+
+/// SSAUpdaterTraits<MachineSSAUpdater> - Traits for the SSAUpdaterImpl
+/// template, specialized for MachineSSAUpdater.
+namespace llvm {
+template<>
+class SSAUpdaterTraits<MachineSSAUpdater> {
+public:
+  typedef MachineBasicBlock BlkT;
+  typedef unsigned ValT;
+  typedef MachineInstr PhiT;
+
+  typedef MachineBasicBlock::succ_iterator BlkSucc_iterator;
+  static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return BB->succ_begin(); }
+  static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return BB->succ_end(); }
+
+  typedef MachinePHIiter PHI_iterator;
+  static inline PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
+  static inline PHI_iterator PHI_end(PhiT *PHI) {
+    return PHI_iterator(PHI, true);
+  }
+
+  /// FindPredecessorBlocks - Put the predecessors of BB into the Preds
+  /// vector.
+  static void FindPredecessorBlocks(MachineBasicBlock *BB,
+                                    SmallVectorImpl<MachineBasicBlock*> *Preds){
+    for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+           E = BB->pred_end(); PI != E; ++PI)
+      Preds->push_back(*PI);
+  }
+
+  /// GetUndefVal - Create an IMPLICIT_DEF instruction with a new register.
+  /// Add it into the specified block and return the register.
+  static unsigned GetUndefVal(MachineBasicBlock *BB,
+                              MachineSSAUpdater *Updater) {
+    // Insert an implicit_def to represent an undef value.
+    MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
+                                        BB, BB->getFirstTerminator(),
+                                        Updater->VRC, Updater->MRI,
+                                        Updater->TII);
+    return NewDef->getOperand(0).getReg();
+  }
+
+  /// CreateEmptyPHI - Create a PHI instruction that defines a new register.
+  /// Add it into the specified block and return the register.
+  static unsigned CreateEmptyPHI(MachineBasicBlock *BB, unsigned NumPreds,
+                                 MachineSSAUpdater *Updater) {
+    MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->front();
+    MachineInstr *PHI = InsertNewDef(TargetOpcode::PHI, BB, Loc,
+                                     Updater->VRC, Updater->MRI,
+                                     Updater->TII);
+    return PHI->getOperand(0).getReg();
+  }
+
+  /// AddPHIOperand - Add the specified value as an operand of the PHI for
+  /// the specified predecessor block.
+  static void AddPHIOperand(MachineInstr *PHI, unsigned Val,
+                            MachineBasicBlock *Pred) {
+    PHI->addOperand(MachineOperand::CreateReg(Val, false));
+    PHI->addOperand(MachineOperand::CreateMBB(Pred));
+  }
+
+  /// InstrIsPHI - Check if an instruction is a PHI.
+  ///
+  static MachineInstr *InstrIsPHI(MachineInstr *I) {
+    if (I && I->isPHI())
+      return I;
+    return 0;
+  }
+
+  /// ValueIsPHI - Check if the instruction that defines the specified register
+  /// is a PHI instruction.
+  static MachineInstr *ValueIsPHI(unsigned Val, MachineSSAUpdater *Updater) {
+    return InstrIsPHI(Updater->MRI->getVRegDef(Val));
+  }
+
+  /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source
+  /// operands, i.e., it was just added.
+  static MachineInstr *ValueIsNewPHI(unsigned Val, MachineSSAUpdater *Updater) {
+    MachineInstr *PHI = ValueIsPHI(Val, Updater);
+    if (PHI && PHI->getNumOperands() <= 1)
+      return PHI;
+    return 0;
+  }
+
+  /// GetPHIValue - For the specified PHI instruction, return the register
+  /// that it defines.
+  static unsigned GetPHIValue(MachineInstr *PHI) {
+    return PHI->getOperand(0).getReg();
+  }
+};
+
+} // End llvm namespace
+
+/// GetValueAtEndOfBlockInternal - Check to see if AvailableVals has an entry
+/// for the specified BB and if so, return it.  If not, construct SSA form by
+/// first calculating the required placement of PHIs and then inserting new
+/// PHIs where needed.
+unsigned MachineSSAUpdater::GetValueAtEndOfBlockInternal(MachineBasicBlock *BB){
+  AvailableValsTy &AvailableVals = getAvailableVals(AV);
+  if (unsigned V = AvailableVals[BB])
+    return V;
+
+  SSAUpdaterImpl<MachineSSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
+  return Impl.GetValue(BB);
+}
diff --git a/src/LLVM/lib/CodeGen/MachineSink.cpp b/src/LLVM/lib/CodeGen/MachineSink.cpp
new file mode 100644
index 0000000..b6a9bc4
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/MachineSink.cpp
@@ -0,0 +1,338 @@
+//===-- MachineSink.cpp - Sinking for machine instructions ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass moves instructions into successor blocks when possible, so that
+// they aren't executed on paths where their results aren't needed.
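+//
+// For example, given (virtual-register pseudocode)
+//
+//   BB0:  %v = expensive computation
+//         conditional branch to BB2
+//   BB1:  use of %v
+//   BB2:  (no use of %v)
+//
+// the computation is sunk into BB1, so the path through BB2 never
+// executes it.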
+//
+// This pass is not intended to be a replacement or a complete alternative
+// for an LLVM-IR-level sinking pass. It is only designed to sink simple
+// constructs that are not exposed before lowering and instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "machine-sink"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(NumSunk, "Number of machine instructions sunk");
+
+namespace {
+  class MachineSinking : public MachineFunctionPass {
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineRegisterInfo  *RegInfo; // Machine register information
+    MachineDominatorTree *DT;   // Machine dominator tree
+    MachineLoopInfo *LI;
+    AliasAnalysis *AA;
+    BitVector AllocatableSet;   // Which physregs are allocatable?
+
+  public:
+    static char ID; // Pass identification
+    MachineSinking() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addPreserved<MachineLoopInfo>();
+    }
+  private:
+    bool ProcessBlock(MachineBasicBlock &MBB);
+    bool SinkInstruction(MachineInstr *MI, bool &SawStore);
+    bool AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *MBB) const;
+  };
+} // end anonymous namespace
+
+char MachineSinking::ID = 0;
+INITIALIZE_PASS(MachineSinking, "machine-sink",
+                "Machine code sinking", false, false);
+
+FunctionPass *llvm::createMachineSinkingPass() { return new MachineSinking(); }
+
+/// AllUsesDominatedByBlock - Return true if all uses of the specified register
+/// occur in blocks dominated by the specified block.
+bool MachineSinking::AllUsesDominatedByBlock(unsigned Reg,
+                                             MachineBasicBlock *MBB) const {
+  assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
+         "Only makes sense for vregs");
+  // Ignoring debug uses is necessary so debug info doesn't affect the code.
+  // This may leave a referencing dbg_value in the original block, before
+  // the definition of the vreg.  The DWARF generator handles this, although
+  // the user might not get the right info at runtime.
+  for (MachineRegisterInfo::use_nodbg_iterator
+         I = RegInfo->use_nodbg_begin(Reg), E = RegInfo->use_nodbg_end();
+       I != E; ++I) {
+    // Determine the block of the use.
+    MachineInstr *UseInst = &*I;
+    MachineBasicBlock *UseBlock = UseInst->getParent();
+
+    if (UseInst->isPHI()) {
+      // PHI nodes use the operand in the predecessor block, not the block with
+      // the PHI.
+      UseBlock = UseInst->getOperand(I.getOperandNo()+1).getMBB();
+    }
+
+    // Check that it dominates.
+    if (!DT->dominates(MBB, UseBlock))
+      return false;
+  }
+
+  return true;
+}
+
+bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "******** Machine Sinking ********\n");
+
+  const TargetMachine &TM = MF.getTarget();
+  TII = TM.getInstrInfo();
+  TRI = TM.getRegisterInfo();
+  RegInfo = &MF.getRegInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
+  LI = &getAnalysis<MachineLoopInfo>();
+  AA = &getAnalysis<AliasAnalysis>();
+  AllocatableSet = TRI->getAllocatableSet(MF);
+
+  bool EverMadeChange = false;
+
+  while (1) {
+    bool MadeChange = false;
+
+    // Process all basic blocks.
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end();
+         I != E; ++I)
+      MadeChange |= ProcessBlock(*I);
+
+    // If this iteration over the code changed anything, keep iterating.
+    if (!MadeChange) break;
+    EverMadeChange = true;
+  }
+  return EverMadeChange;
+}
+
+bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
+  // Can't sink anything out of a block that has fewer than two successors.
+  if (MBB.succ_size() <= 1 || MBB.empty()) return false;
+
+  // Don't bother sinking code out of unreachable blocks. In addition to being
+  // unprofitable, it can also lead to infinite looping, because in an
+  // unreachable loop there may be nowhere to stop.
+  if (!DT->isReachableFromEntry(&MBB)) return false;
+
+  bool MadeChange = false;
+
+  // Walk the basic block bottom-up.  Remember if we saw a store.
+  MachineBasicBlock::iterator I = MBB.end();
+  --I;
+  bool ProcessedBegin = false;
+  bool SawStore = false;
+  do {
+    MachineInstr *MI = I;  // The instruction to sink.
+
+    // Predecrement I (if it's not begin) so that it isn't invalidated by
+    // sinking.
+    ProcessedBegin = I == MBB.begin();
+    if (!ProcessedBegin)
+      --I;
+
+    if (MI->isDebugValue())
+      continue;
+
+    if (SinkInstruction(MI, SawStore)) {
+      ++NumSunk;
+      MadeChange = true;
+    }
+
+    // If we just processed the first instruction in the block, we're done.
+  } while (!ProcessedBegin);
+
+  return MadeChange;
+}
+
+/// SinkInstruction - Determine whether it is safe to sink the specified machine
+/// instruction out of its current block into a successor.
+bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) {
+  // Check if it's safe to move the instruction.
+  if (!MI->isSafeToMove(TII, AA, SawStore))
+    return false;
+
+  // FIXME: This should include support for sinking instructions within the
+  // block they are currently in to shorten the live ranges.  We often get
+  // instructions sunk into the top of a large block, but it would be better to
+  // also sink them down before their first use in the block.  This xform has to
+  // be careful not to *increase* register pressure though, e.g. sinking
+  // "x = y + z" down if it kills y and z would increase the live ranges of y
+  // and z and only shrink the live range of x.
+
+  // Loop over all the operands of the specified instruction.  If there is
+  // anything we can't handle, bail out.
+  MachineBasicBlock *ParentBlock = MI->getParent();
+
+  // SuccToSinkTo - This is the successor to sink this instruction to, once we
+  // decide.
+  MachineBasicBlock *SuccToSinkTo = 0;
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;  // Ignore non-register operands.
+
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      if (MO.isUse()) {
+        // If the physreg has no defs anywhere, it's just an ambient register
+        // and we can freely move its uses. Alternatively, if it's allocatable,
+        // it could get allocated to something with a def during allocation.
+        if (!RegInfo->def_empty(Reg))
+          return false;
+
+        if (AllocatableSet.test(Reg))
+          return false;
+
+        // Check for a def among the register's aliases too.
+        for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+          unsigned AliasReg = *Alias;
+          if (!RegInfo->def_empty(AliasReg))
+            return false;
+
+          if (AllocatableSet.test(AliasReg))
+            return false;
+        }
+      } else if (!MO.isDead()) {
+        // A def that isn't dead. We can't move it.
+        return false;
+      }
+    } else {
+      // Virtual register uses are always safe to sink.
+      if (MO.isUse()) continue;
+
+      // If it's not safe to move defs of the register class, then abort.
+      if (!TII->isSafeToMoveRegClassDefs(RegInfo->getRegClass(Reg)))
+        return false;
+
+      // FIXME: This picks a successor to sink into based on having one
+      // successor that dominates all the uses.  However, there are cases where
+      // sinking can happen but where the sink point isn't a successor.  For
+      // example:
+      //
+      //   x = computation
+      //   if () {} else {}
+      //   use x
+      //
+      // the instruction could be sunk over the whole diamond for the
+      // if/then/else (or loop, etc), allowing it to be sunk into other blocks
+      // after that.
+
+      // Virtual register defs can only be sunk if all their uses are in blocks
+      // dominated by one of the successors.
+      if (SuccToSinkTo) {
+        // If a previous operand picked a block to sink to, then this operand
+        // must be sinkable to the same block.
+        if (!AllUsesDominatedByBlock(Reg, SuccToSinkTo))
+          return false;
+
+        continue;
+      }
+
+      // Otherwise, we should look at all the successors and decide which one
+      // we should sink to.
+      for (MachineBasicBlock::succ_iterator SI = ParentBlock->succ_begin(),
+           E = ParentBlock->succ_end(); SI != E; ++SI) {
+        if (AllUsesDominatedByBlock(Reg, *SI)) {
+          SuccToSinkTo = *SI;
+          break;
+        }
+      }
+
+      // If we couldn't find a block to sink to, ignore this instruction.
+      if (SuccToSinkTo == 0)
+        return false;
+    }
+  }
+
+  // If there are no outputs, it must have side-effects.
+  if (SuccToSinkTo == 0)
+    return false;
+
+  // It is not possible to sink an instruction into its own block.  This can
+  // happen with loops.
+  if (MI->getParent() == SuccToSinkTo)
+    return false;
+
+  // If the instruction to move defines a dead physical register which is live
+  // when leaving the basic block, don't move it because it could turn into a
+  // "zombie" define of that preg. E.g., EFLAGS. (<rdar://problem/8030636>)
+  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+    if (SuccToSinkTo->isLiveIn(Reg))
+      return false;
+  }
+
+  DEBUG(dbgs() << "Sink instr " << *MI << "\tinto block " << *SuccToSinkTo);
+
+  // If the block has multiple predecessors, this would introduce computation
+  // on a path where it doesn't already exist.  We could split the critical
+  // edge, but for now we just punt.
+  // FIXME: Split critical edges if not backedges.
+  if (SuccToSinkTo->pred_size() > 1) {
+    // We cannot sink a load across a critical edge - there may be stores in
+    // other code paths.
+    bool store = true;
+    if (!MI->isSafeToMove(TII, AA, store)) {
+      DEBUG(dbgs() << " *** PUNTING: Won't sink load along critical edge.\n");
+      return false;
+    }
+
+    // We don't want to sink across a critical edge if we don't dominate the
+    // successor. We could be introducing calculations into new code paths.
+    if (!DT->dominates(ParentBlock, SuccToSinkTo)) {
+      DEBUG(dbgs() << " *** PUNTING: Critical edge found\n");
+      return false;
+    }
+
+    // Don't sink instructions into a loop.
+    if (LI->isLoopHeader(SuccToSinkTo)) {
+      DEBUG(dbgs() << " *** PUNTING: Loop header found\n");
+      return false;
+    }
+
+    // Otherwise we are OK with sinking along a critical edge.
+    DEBUG(dbgs() << "Sinking along critical edge.\n");
+  }
+
+  // Determine where to insert into. Skip phi nodes.
+  MachineBasicBlock::iterator InsertPos = SuccToSinkTo->begin();
+  while (InsertPos != SuccToSinkTo->end() && InsertPos->isPHI())
+    ++InsertPos;
+
+  // Move the instruction.
+  SuccToSinkTo->splice(InsertPos, ParentBlock, MI,
+                       ++MachineBasicBlock::iterator(MI));
+
+  // Conservatively, clear any kill flags, since it's possible that they are no
+  // longer correct.
+  MI->clearKillInfo();
+
+  return true;
+}
diff --git a/src/LLVM/lib/CodeGen/Makefile b/src/LLVM/lib/CodeGen/Makefile
new file mode 100644
index 0000000..4ab3e3c
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/Makefile
@@ -0,0 +1,22 @@
+##===- lib/CodeGen/Makefile --------------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMCodeGen
+PARALLEL_DIRS = SelectionDAG AsmPrinter
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
+# Xcode prior to 2.4 generates an error in -pedantic mode with use of HUGE_VAL
+# in this directory.  Disable -pedantic for this broken compiler.
+ifneq ($(HUGE_VAL_SANITY),yes)
+CompileCommonOpts := $(filter-out -pedantic, $(CompileCommonOpts))
+endif
+
diff --git a/src/LLVM/lib/CodeGen/ObjectCodeEmitter.cpp b/src/LLVM/lib/CodeGen/ObjectCodeEmitter.cpp
new file mode 100644
index 0000000..cf05275
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/ObjectCodeEmitter.cpp
@@ -0,0 +1,141 @@
+//===-- llvm/CodeGen/ObjectCodeEmitter.cpp -------------------- -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/BinaryObject.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineRelocation.h"
+#include "llvm/CodeGen/ObjectCodeEmitter.h"
+
+//===----------------------------------------------------------------------===//
+//                       ObjectCodeEmitter Implementation
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+ObjectCodeEmitter::ObjectCodeEmitter() : BO(0) {}
+ObjectCodeEmitter::ObjectCodeEmitter(BinaryObject *bo) : BO(bo) {}
+ObjectCodeEmitter::~ObjectCodeEmitter() {}
+
+/// setBinaryObject - set the BinaryObject we are writing to
+void ObjectCodeEmitter::setBinaryObject(BinaryObject *bo) { BO = bo; }
+
+/// emitByte - This callback is invoked when a byte needs to be
+/// written to the data stream, without buffer overflow testing.
+void ObjectCodeEmitter::emitByte(uint8_t B) {
+  BO->emitByte(B);
+}
+
+/// emitWordLE - This callback is invoked when a 32-bit word needs to be
+/// written to the data stream in little-endian format.
+void ObjectCodeEmitter::emitWordLE(uint32_t W) {
+  BO->emitWordLE(W);
+}
+
+/// emitWordBE - This callback is invoked when a 32-bit word needs to be
+/// written to the data stream in big-endian format.
+void ObjectCodeEmitter::emitWordBE(uint32_t W) {
+  BO->emitWordBE(W);
+}
+
+/// emitDWordLE - This callback is invoked when a 64-bit word needs to be
+/// written to the data stream in little-endian format.
+void ObjectCodeEmitter::emitDWordLE(uint64_t W) {
+  BO->emitDWordLE(W);
+}
+
+/// emitDWordBE - This callback is invoked when a 64-bit word needs to be
+/// written to the data stream in big-endian format.
+void ObjectCodeEmitter::emitDWordBE(uint64_t W) {
+  BO->emitDWordBE(W);
+}
+
+/// emitAlignment - Align 'BO' to the necessary alignment boundary.
+void ObjectCodeEmitter::emitAlignment(unsigned Alignment /* 0 */,
+                                      uint8_t fill /* 0 */) {
+  BO->emitAlignment(Alignment, fill);
+}
+
+/// emitULEB128Bytes - This callback is invoked when a ULEB128 needs to be
+/// written to the data stream.
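+/// For example, the value 624485 (0x98765) is emitted as the three bytes
+/// 0xe5 0x8e 0x26: seven value bits per byte, least-significant group first,
+/// with the high bit set on every byte except the last.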
+void ObjectCodeEmitter::emitULEB128Bytes(uint64_t Value) {
+  BO->emitULEB128Bytes(Value);
+}
+
+/// emitSLEB128Bytes - This callback is invoked when a SLEB128 needs to be
+/// written to the data stream.
+void ObjectCodeEmitter::emitSLEB128Bytes(uint64_t Value) {
+  BO->emitSLEB128Bytes(Value);
+}
+
+/// emitString - This callback is invoked when a String needs to be
+/// written to the data stream.
+void ObjectCodeEmitter::emitString(const std::string &String) {
+  BO->emitString(String);
+}
+
+/// getCurrentPCValue - This returns the address that the next emitted byte
+/// will be output to.
+uintptr_t ObjectCodeEmitter::getCurrentPCValue() const {
+  return BO->getCurrentPCOffset();
+}
+
+/// getCurrentPCOffset - Return the offset from the start of the emitted
+/// buffer that we are currently writing to.
+uintptr_t ObjectCodeEmitter::getCurrentPCOffset() const {
+  return BO->getCurrentPCOffset();
+}
+
+/// addRelocation - Whenever a relocatable address is needed, it should be
+/// noted with this interface.
+void ObjectCodeEmitter::addRelocation(const MachineRelocation& relocation) {
+  BO->addRelocation(relocation);
+}
+
+/// StartMachineBasicBlock - This should be called by the target when a new
+/// basic block is about to be emitted.  This way the MCE knows where the
+/// start of the block is, and can implement getMachineBasicBlockAddress.
+void ObjectCodeEmitter::StartMachineBasicBlock(MachineBasicBlock *MBB) {
+  if (MBBLocations.size() <= (unsigned)MBB->getNumber())
+    MBBLocations.resize((MBB->getNumber()+1)*2);
+  MBBLocations[MBB->getNumber()] = getCurrentPCOffset();
+}
+
+/// getMachineBasicBlockAddress - Return the address of the specified
+/// MachineBasicBlock, only usable after the label for the MBB has been
+/// emitted.
+uintptr_t
+ObjectCodeEmitter::getMachineBasicBlockAddress(MachineBasicBlock *MBB) const {
+  assert(MBBLocations.size() > (unsigned)MBB->getNumber() &&
+         MBBLocations[MBB->getNumber()] && "MBB not emitted!");
+  return MBBLocations[MBB->getNumber()];
+}
+
+/// getJumpTableEntryAddress - Return the address of the jump table with index
+/// 'Index' in the function that last called initJumpTableInfo.
+uintptr_t ObjectCodeEmitter::getJumpTableEntryAddress(unsigned Index) const {
+  assert(JTLocations.size() > Index && "JT not emitted!");
+  return JTLocations[Index];
+}
+
+/// getConstantPoolEntryAddress - Return the address of the 'Index' entry in
+/// the constant pool that was last emitted with the emitConstantPool method.
+uintptr_t ObjectCodeEmitter::getConstantPoolEntryAddress(unsigned Index) const {
+  assert(CPLocations.size() > Index && "CP not emitted!");
+  return CPLocations[Index];
+}
+
+/// getConstantPoolEntrySection - Return the section of the 'Index' entry in
+/// the constant pool that was last emitted with the emitConstantPool method.
+uintptr_t ObjectCodeEmitter::getConstantPoolEntrySection(unsigned Index) const {
+  assert(CPSections.size() > Index && "CP not emitted!");
+  return CPSections[Index];
+}
+
+} // end namespace llvm
+
diff --git a/src/LLVM/lib/CodeGen/OptimizePHIs.cpp b/src/LLVM/lib/CodeGen/OptimizePHIs.cpp
new file mode 100644
index 0000000..edb4eea
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/OptimizePHIs.cpp
@@ -0,0 +1,188 @@
+//===-- OptimizePHIs.cpp - Optimize machine instruction PHIs --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes machine instruction PHIs to take advantage of
+// opportunities created during DAG legalization.
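+//
+// For example, when an i64 value is split up for a 32-bit target, a loop can
+// be left with a PHI whose only non-PHI, non-copy input is a single register:
+//
+//   %v1 = PHI %v0, <BB0>, %v1, <BB1>
+//
+// This pass collapses such a cycle to %v0.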
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "phi-opt"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumPHICycles, "Number of PHI cycles replaced");
+STATISTIC(NumDeadPHICycles, "Number of dead PHI cycles");
+
+namespace {
+  class OptimizePHIs : public MachineFunctionPass {
+    MachineRegisterInfo *MRI;
+    const TargetInstrInfo *TII;
+
+  public:
+    static char ID; // Pass identification
+    OptimizePHIs() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    typedef SmallPtrSet<MachineInstr*, 16> InstrSet;
+    typedef SmallPtrSetIterator<MachineInstr*> InstrSetIterator;
+
+    bool IsSingleValuePHICycle(MachineInstr *MI, unsigned &SingleValReg,
+                               InstrSet &PHIsInCycle);
+    bool IsDeadPHICycle(MachineInstr *MI, InstrSet &PHIsInCycle);
+    bool OptimizeBB(MachineBasicBlock &MBB);
+  };
+}
+
+char OptimizePHIs::ID = 0;
+INITIALIZE_PASS(OptimizePHIs, "opt-phis",
+                "Optimize machine instruction PHIs", false, false);
+
+FunctionPass *llvm::createOptimizePHIsPass() { return new OptimizePHIs(); }
+
+bool OptimizePHIs::runOnMachineFunction(MachineFunction &Fn) {
+  MRI = &Fn.getRegInfo();
+  TII = Fn.getTarget().getInstrInfo();
+
+  // Find dead PHI cycles and PHI cycles that can be replaced by a single
+  // value.  InstCombine does these optimizations, but DAG legalization may
+  // introduce new opportunities, e.g., when i64 values are split up for
+  // 32-bit targets.
+  bool Changed = false;
+  for (MachineFunction::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I)
+    Changed |= OptimizeBB(*I);
+
+  return Changed;
+}
+
+/// IsSingleValuePHICycle - Check if MI is a PHI where all the source operands
+/// are copies of SingleValReg, possibly via copies through other PHIs.  If
+/// SingleValReg is zero on entry, it is set to the register with the single
+/// non-copy value.  PHIsInCycle is a set used to keep track of the PHIs that
+/// have been scanned.
+bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI,
+                                         unsigned &SingleValReg,
+                                         InstrSet &PHIsInCycle) {
+  assert(MI->isPHI() && "IsSingleValuePHICycle expects a PHI instruction");
+  unsigned DstReg = MI->getOperand(0).getReg();
+
+  // See if we already saw this register.
+  if (!PHIsInCycle.insert(MI))
+    return true;
+
+  // Don't scan crazily complex things.
+  if (PHIsInCycle.size() == 16)
+    return false;
+
+  // Scan the PHI operands.
+  for (unsigned i = 1; i != MI->getNumOperands(); i += 2) {
+    unsigned SrcReg = MI->getOperand(i).getReg();
+    if (SrcReg == DstReg)
+      continue;
+    MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+
+    // Skip over register-to-register moves.
+    if (SrcMI && SrcMI->isCopy() &&
+        !SrcMI->getOperand(0).getSubReg() &&
+        !SrcMI->getOperand(1).getSubReg() &&
+        TargetRegisterInfo::isVirtualRegister(SrcMI->getOperand(1).getReg()))
+      SrcMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg());
+    if (!SrcMI)
+      return false;
+
+    if (SrcMI->isPHI()) {
+      if (!IsSingleValuePHICycle(SrcMI, SingleValReg, PHIsInCycle))
+        return false;
+    } else {
+      // Fail if there is more than one non-phi/non-move register.
+      if (SingleValReg != 0)
+        return false;
+      SingleValReg = SrcReg;
+    }
+  }
+  return true;
+}
+
+/// IsDeadPHICycle - Check if the register defined by a PHI is only used by
+/// other PHIs in a cycle.
+bool OptimizePHIs::IsDeadPHICycle(MachineInstr *MI, InstrSet &PHIsInCycle) {
+  assert(MI->isPHI() && "IsDeadPHICycle expects a PHI instruction");
+  unsigned DstReg = MI->getOperand(0).getReg();
+  assert(TargetRegisterInfo::isVirtualRegister(DstReg) &&
+         "PHI destination is not a virtual register");
+
+  // See if we already saw this register.
+  if (!PHIsInCycle.insert(MI))
+    return true;
+
+  // Don't scan crazily complex things.
+  if (PHIsInCycle.size() == 16)
+    return false;
+
+  for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DstReg),
+         E = MRI->use_end(); I != E; ++I) {
+    MachineInstr *UseMI = &*I;
+    if (!UseMI->isPHI() || !IsDeadPHICycle(UseMI, PHIsInCycle))
+      return false;
+  }
+
+  return true;
+}
+
+/// OptimizeBB - Remove dead PHI cycles and PHI cycles that can be replaced by
+/// a single value.
+bool OptimizePHIs::OptimizeBB(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  for (MachineBasicBlock::iterator
+         MII = MBB.begin(), E = MBB.end(); MII != E; ) {
+    MachineInstr *MI = &*MII++;
+    if (!MI->isPHI())
+      break;
+
+    // Check for single-value PHI cycles.
+    unsigned SingleValReg = 0;
+    InstrSet PHIsInCycle;
+    if (IsSingleValuePHICycle(MI, SingleValReg, PHIsInCycle) &&
+        SingleValReg != 0) {
+      MRI->replaceRegWith(MI->getOperand(0).getReg(), SingleValReg);
+      MI->eraseFromParent();
+      ++NumPHICycles;
+      Changed = true;
+      continue;
+    }
+
+    // Check for dead PHI cycles.
+    PHIsInCycle.clear();
+    if (IsDeadPHICycle(MI, PHIsInCycle)) {
+      for (InstrSetIterator PI = PHIsInCycle.begin(), PE = PHIsInCycle.end();
+           PI != PE; ++PI) {
+        MachineInstr *PhiMI = *PI;
+        if (&*MII == PhiMI)
+          ++MII;
+        PhiMI->eraseFromParent();
+      }
+      ++NumDeadPHICycles;
+      Changed = true;
+    }
+  }
+  return Changed;
+}
diff --git a/src/LLVM/lib/CodeGen/PBQP/Graph.h b/src/LLVM/lib/CodeGen/PBQP/Graph.h
new file mode 100644
index 0000000..b2224cb
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PBQP/Graph.h
@@ -0,0 +1,425 @@
+//===-------------------- Graph.h - PBQP Graph ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// PBQP Graph class.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_CODEGEN_PBQP_GRAPH_H
+#define LLVM_CODEGEN_PBQP_GRAPH_H
+
+#include "Math.h"
+
+#include <list>
+#include <vector>
+#include <map>
+
+namespace PBQP {
+
+  /// PBQP Graph class.
+  /// Instances of this class describe PBQP problems. (A small construction
+  /// example follows the class definition below.)
+  class Graph {
+  private:
+
+    // ----- TYPEDEFS -----
+    class NodeEntry;
+    class EdgeEntry;
+
+    typedef std::list<NodeEntry> NodeList;
+    typedef std::list<EdgeEntry> EdgeList;
+
+  public:
+
+    typedef NodeList::iterator NodeItr;
+    typedef NodeList::const_iterator ConstNodeItr;
+
+    typedef EdgeList::iterator EdgeItr;
+    typedef EdgeList::const_iterator ConstEdgeItr;
+
+  private:
+
+    typedef std::list<EdgeItr> AdjEdgeList;
+  
+  public:
+
+    typedef AdjEdgeList::iterator AdjEdgeItr;
+
+  private:
+
+    class NodeEntry {
+    private:
+      Vector costs;      
+      AdjEdgeList adjEdges;
+      unsigned degree;
+      void *data;
+    public:
+      NodeEntry(const Vector &costs) : costs(costs), degree(0) {}
+      Vector& getCosts() { return costs; }
+      const Vector& getCosts() const { return costs; }
+      unsigned getDegree() const { return degree; }
+      AdjEdgeItr edgesBegin() { return adjEdges.begin(); }
+      AdjEdgeItr edgesEnd() { return adjEdges.end(); }
+      AdjEdgeItr addEdge(EdgeItr e) {
+        ++degree;
+        return adjEdges.insert(adjEdges.end(), e);
+      }
+      void removeEdge(AdjEdgeItr ae) {
+        --degree;
+        adjEdges.erase(ae);
+      }
+      void setData(void *data) { this->data = data; }
+      void* getData() { return data; }
+    };
+
+    class EdgeEntry {
+    private:
+      NodeItr node1, node2;
+      Matrix costs;
+      AdjEdgeItr node1AEItr, node2AEItr;
+      void *data;
+    public:
+      EdgeEntry(NodeItr node1, NodeItr node2, const Matrix &costs)
+        : node1(node1), node2(node2), costs(costs) {}
+      NodeItr getNode1() const { return node1; }
+      NodeItr getNode2() const { return node2; }
+      Matrix& getCosts() { return costs; }
+      const Matrix& getCosts() const { return costs; }
+      void setNode1AEItr(AdjEdgeItr ae) { node1AEItr = ae; }
+      AdjEdgeItr getNode1AEItr() { return node1AEItr; }
+      void setNode2AEItr(AdjEdgeItr ae) { node2AEItr = ae; }
+      AdjEdgeItr getNode2AEItr() { return node2AEItr; }
+      void setData(void *data) { this->data = data; }
+      void *getData() { return data; }
+    };
+
+    // ----- MEMBERS -----
+
+    NodeList nodes;
+    unsigned numNodes;
+
+    EdgeList edges;
+    unsigned numEdges;
+
+    // ----- INTERNAL METHODS -----
+
+    NodeEntry& getNode(NodeItr nItr) { return *nItr; }
+    const NodeEntry& getNode(ConstNodeItr nItr) const { return *nItr; }
+
+    EdgeEntry& getEdge(EdgeItr eItr) { return *eItr; }
+    const EdgeEntry& getEdge(ConstEdgeItr eItr) const { return *eItr; }
+
+    NodeItr addConstructedNode(const NodeEntry &n) {
+      ++numNodes;
+      return nodes.insert(nodes.end(), n);
+    }
+
+    EdgeItr addConstructedEdge(const EdgeEntry &e) {
+      assert(findEdge(e.getNode1(), e.getNode2()) == edges.end() &&
+             "Attempt to add duplicate edge.");
+      ++numEdges;
+      EdgeItr edgeItr = edges.insert(edges.end(), e);
+      EdgeEntry &ne = getEdge(edgeItr);
+      NodeEntry &n1 = getNode(ne.getNode1());
+      NodeEntry &n2 = getNode(ne.getNode2());
+      // Sanity check on matrix dimensions:
+      assert((n1.getCosts().getLength() == ne.getCosts().getRows()) &&
+             (n2.getCosts().getLength() == ne.getCosts().getCols()) &&
+             "Edge cost dimensions do not match node costs dimensions.");
+      ne.setNode1AEItr(n1.addEdge(edgeItr));
+      ne.setNode2AEItr(n2.addEdge(edgeItr));
+      return edgeItr;
+    }
+
+    inline void copyFrom(const Graph &other);
+  public:
+
+    /// \brief Construct an empty PBQP graph.
+    Graph() : numNodes(0), numEdges(0) {}
+
+    /// \brief Copy construct this graph from "other". Note: Does not copy node
+    ///        and edge data, only graph structure and costs.
+    /// @param other Source graph to copy from.
+    Graph(const Graph &other) : numNodes(0), numEdges(0) {
+      copyFrom(other);
+    }
+
+    /// \brief Make this graph a copy of "other". Note: Does not copy node and
+    ///        edge data, only graph structure and costs.
+    /// @param other The graph to copy from.
+    /// @return A reference to this graph.
+    ///
+    /// This will clear the current graph, erasing any nodes and edges added,
+    /// before copying from other.
+    Graph& operator=(const Graph &other) {
+      clear();      
+      copyFrom(other);
+      return *this;
+    }
+
+    /// \brief Add a node with the given costs.
+    /// @param costs Cost vector for the new node.
+    /// @return Node iterator for the added node.
+    NodeItr addNode(const Vector &costs) {
+      return addConstructedNode(NodeEntry(costs));
+    }
+
+    /// \brief Add an edge between the given nodes with the given costs.
+    /// @param n1Itr First node.
+    /// @param n2Itr Second node.
+    /// @return Edge iterator for the added edge.
+    EdgeItr addEdge(Graph::NodeItr n1Itr, Graph::NodeItr n2Itr,
+                    const Matrix &costs) {
+      assert(getNodeCosts(n1Itr).getLength() == costs.getRows() &&
+             getNodeCosts(n2Itr).getLength() == costs.getCols() &&
+             "Matrix dimensions mismatch.");
+      return addConstructedEdge(EdgeEntry(n1Itr, n2Itr, costs)); 
+    }
+
+    /// \brief Get the number of nodes in the graph.
+    /// @return Number of nodes in the graph.
+    unsigned getNumNodes() const { return numNodes; }
+
+    /// \brief Get the number of edges in the graph.
+    /// @return Number of edges in the graph.
+    unsigned getNumEdges() const { return numEdges; }
+
+    /// \brief Get a node's cost vector.
+    /// @param nItr Node iterator.
+    /// @return Node cost vector.
+    Vector& getNodeCosts(NodeItr nItr) { return getNode(nItr).getCosts(); }
+
+    /// \brief Get a node's cost vector (const version).
+    /// @param nItr Node iterator.
+    /// @return Node cost vector.
+    const Vector& getNodeCosts(ConstNodeItr nItr) const {
+      return getNode(nItr).getCosts();
+    }
+
+    /// \brief Set a node's data pointer.
+    /// @param nItr Node iterator.
+    /// @param data Pointer to node data.
+    ///
+    /// Typically used by a PBQP solver to attach data to aid in solution.
+    void setNodeData(NodeItr nItr, void *data) { getNode(nItr).setData(data); }
+
+    /// \brief Get the node's data pointer.
+    /// @param nItr Node iterator.
+    /// @return Pointer to node data.
+    void* getNodeData(NodeItr nItr) { return getNode(nItr).getData(); }
+    
+    /// \brief Get an edge's cost matrix.
+    /// @param eItr Edge iterator.
+    /// @return Edge cost matrix.
+    Matrix& getEdgeCosts(EdgeItr eItr) { return getEdge(eItr).getCosts(); }
+
+    /// \brief Get an edge's cost matrix (const version).
+    /// @param eItr Edge iterator.
+    /// @return Edge cost matrix.
+    const Matrix& getEdgeCosts(ConstEdgeItr eItr) const {
+      return getEdge(eItr).getCosts();
+    }
+
+    /// \brief Set an edge's data pointer.
+    /// @param eItr Edge iterator.
+    /// @param data Pointer to edge data.
+    ///
+    /// Typically used by a PBQP solver to attach data to aid in solution.
+    void setEdgeData(EdgeItr eItr, void *data) { getEdge(eItr).setData(data); }
+
+    /// \brief Get an edge's data pointer.
+    /// @param eItr Edge iterator.
+    /// @return Pointer to edge data. 
+    void* getEdgeData(EdgeItr eItr) { return getEdge(eItr).getData(); }
+
+    /// \brief Get a node's degree.
+    /// @param nItr Node iterator.
+    /// @return The degree of the node.
+    unsigned getNodeDegree(NodeItr nItr) const {
+      return getNode(nItr).getDegree();
+    }
+
+    /// \brief Begin iterator for node set.
+    NodeItr nodesBegin() { return nodes.begin(); }
+
+    /// \brief Begin const iterator for node set.
+    ConstNodeItr nodesBegin() const { return nodes.begin(); }
+
+    /// \brief End iterator for node set.
+    NodeItr nodesEnd() { return nodes.end(); }
+
+    /// \brief End const iterator for node set.
+    ConstNodeItr nodesEnd() const { return nodes.end(); }
+
+    /// \brief Begin iterator for edge set.
+    EdgeItr edgesBegin() { return edges.begin(); }
+
+    /// \brief End iterator for edge set.
+    EdgeItr edgesEnd() { return edges.end(); }
+
+    /// \brief Get begin iterator for adjacent edge set.
+    /// @param nItr Node iterator.
+    /// @return Begin iterator for the set of edges connected to the given node.
+    AdjEdgeItr adjEdgesBegin(NodeItr nItr) {
+      return getNode(nItr).edgesBegin();
+    }
+
+    /// \brief Get end iterator for adjacent edge set.
+    /// @param nItr Node iterator.
+    /// @return End iterator for the set of edges connected to the given node.
+    AdjEdgeItr adjEdgesEnd(NodeItr nItr) {
+      return getNode(nItr).edgesEnd();
+    }
+
+    /// \brief Get the first node connected to this edge.
+    /// @param eItr Edge iterator.
+    /// @return The first node connected to the given edge. 
+    NodeItr getEdgeNode1(EdgeItr eItr) {
+      return getEdge(eItr).getNode1();
+    }
+
+    /// \brief Get the second node connected to this edge.
+    /// @param eItr Edge iterator.
+    /// @return The second node connected to the given edge. 
+    NodeItr getEdgeNode2(EdgeItr eItr) {
+      return getEdge(eItr).getNode2();
+    } 
+
+    /// \brief Get the "other" node connected to this edge.
+    /// @param eItr Edge iterator.
+    /// @param nItr Node iterator for the "given" node.
+    /// @return The iterator for the "other" node connected to this edge. 
+    NodeItr getEdgeOtherNode(EdgeItr eItr, NodeItr nItr) {
+      EdgeEntry &e = getEdge(eItr);
+      if (e.getNode1() == nItr) {
+        return e.getNode2();
+      } // else
+      return e.getNode1();
+    }
+
+    /// \brief Get the edge connecting two nodes.
+    /// @param n1Itr First node iterator.
+    /// @param n2Itr Second node iterator.
+    /// @return An iterator for edge (n1Itr, n2Itr) if such an edge exists,
+    ///         otherwise returns edgesEnd(). 
+    EdgeItr findEdge(NodeItr n1Itr, NodeItr n2Itr) {
+      for (AdjEdgeItr aeItr = adjEdgesBegin(n1Itr), aeEnd = adjEdgesEnd(n1Itr);
+         aeItr != aeEnd; ++aeItr) {
+        if ((getEdgeNode1(*aeItr) == n2Itr) ||
+            (getEdgeNode2(*aeItr) == n2Itr)) {
+          return *aeItr;
+        }
+      }
+      return edges.end();
+    }
+
+    /// \brief Remove a node from the graph.
+    /// @param nItr Node iterator.
+    void removeNode(NodeItr nItr) {
+      NodeEntry &n = getNode(nItr);
+      for (AdjEdgeItr itr = n.edgesBegin(), end = n.edgesEnd(); itr != end;) {
+        EdgeItr eItr = *itr;
+        ++itr;
+        removeEdge(eItr); 
+      }
+      nodes.erase(nItr);
+      --numNodes;
+    }
+
+    /// \brief Remove an edge from the graph.
+    /// @param eItr Edge iterator.
+    void removeEdge(EdgeItr eItr) {
+      EdgeEntry &e = getEdge(eItr);
+      NodeEntry &n1 = getNode(e.getNode1());
+      NodeEntry &n2 = getNode(e.getNode2());
+      n1.removeEdge(e.getNode1AEItr());
+      n2.removeEdge(e.getNode2AEItr());
+      edges.erase(eItr);
+      --numEdges;
+    }
+
+    /// \brief Remove all nodes and edges from the graph.
+    void clear() {
+      nodes.clear();
+      edges.clear();
+      numNodes = numEdges = 0;
+    }
+
+    /// \brief Print a representation of this graph in DOT format.
+    /// @param os Output stream to print on.
+    template <typename OStream>
+    void printDot(OStream &os) {
+    
+      os << "graph {\n";
+
+      for (NodeItr nodeItr = nodesBegin(), nodeEnd = nodesEnd();
+           nodeItr != nodeEnd; ++nodeItr) {
+
+        os << "  node" << nodeItr << " [ label=\""
+           << nodeItr << ": " << getNodeCosts(nodeItr) << "\" ]\n";
+      }
+
+      os << "  edge [ len=" << getNumNodes() << " ]\n";
+
+      for (EdgeItr edgeItr = edgesBegin(), edgeEnd = edgesEnd();
+           edgeItr != edgeEnd; ++edgeItr) {
+
+        os << "  node" << getEdgeNode1(edgeItr)
+           << " -- node" << getEdgeNode2(edgeItr)
+           << " [ label=\"";
+
+        const Matrix &edgeCosts = getEdgeCosts(edgeItr);
+
+        for (unsigned i = 0; i < edgeCosts.getRows(); ++i) {
+          os << edgeCosts.getRowAsVector(i) << "\\n";
+        }
+        os << "\" ]\n";
+      }
+      os << "}\n";
+    }
+
+  };
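+
+  // Example: constructing a small problem (illustrative only; assumes the
+  // fill constructors and row access that Math.h provides for Vector and
+  // Matrix):
+  //
+  //   Graph g;
+  //   Graph::NodeItr n1 = g.addNode(Vector(2, 0)); // 2 options, zero cost
+  //   Graph::NodeItr n2 = g.addNode(Vector(3, 0)); // 3 options, zero cost
+  //   Matrix m(2, 3, 0);          // rows index n1's options, cols n2's
+  //   m[0][0] = 10;               // penalize picking option 0 for both
+  //   g.addEdge(n1, n2, m);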
+
+  class NodeItrComparator {
+  public:
+    bool operator()(Graph::NodeItr n1, Graph::NodeItr n2) const {
+      return &*n1 < &*n2;
+    }
+
+    bool operator()(Graph::ConstNodeItr n1, Graph::ConstNodeItr n2) const {
+      return &*n1 < &*n2;
+    }
+  };
+
+  class EdgeItrCompartor {
+  public:
+    bool operator()(Graph::EdgeItr e1, Graph::EdgeItr e2) const {
+      return &*e1 < &*e2;
+    }
+
+    bool operator()(Graph::ConstEdgeItr e1, Graph::ConstEdgeItr e2) const {
+      return &*e1 < &*e2;
+    }
+  };
+
+  void Graph::copyFrom(const Graph &other) {
+    std::map<Graph::ConstNodeItr, Graph::NodeItr,
+             NodeItrComparator> nodeMap;
+
+    for (Graph::ConstNodeItr nItr = other.nodesBegin(),
+                             nEnd = other.nodesEnd();
+         nItr != nEnd; ++nItr) {
+      nodeMap[nItr] = addNode(other.getNodeCosts(nItr));
+    }
+
+    // Copy the edges as well, remapping their endpoints through nodeMap.
+    // This Graph revision has no const edge iterators, so cast away
+    // constness for the read-only traversal.
+    Graph &otherMutable = const_cast<Graph&>(other);
+    for (Graph::EdgeItr eItr = otherMutable.edgesBegin(),
+                        eEnd = otherMutable.edgesEnd();
+         eItr != eEnd; ++eItr) {
+      addEdge(nodeMap[otherMutable.getEdgeNode1(eItr)],
+              nodeMap[otherMutable.getEdgeNode2(eItr)],
+              otherMutable.getEdgeCosts(eItr));
+    }
+  }
+
+}
+
+#endif // LLVM_CODEGEN_PBQP_GRAPH_H
diff --git a/src/LLVM/lib/CodeGen/PBQP/HeuristicBase.h b/src/LLVM/lib/CodeGen/PBQP/HeuristicBase.h
new file mode 100644
index 0000000..3bb24e1
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PBQP/HeuristicBase.h
@@ -0,0 +1,242 @@
+//===-- HeuristicBase.h -- Heuristic base class for PBQP --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_PBQP_HEURISTICBASE_H
+#define LLVM_CODEGEN_PBQP_HEURISTICBASE_H
+
+#include "HeuristicSolver.h"
+
+namespace PBQP {
+
+  /// \brief Abstract base class for heuristic implementations.
+  ///
+  /// This class provides a handy base for heuristic implementations with common
+  /// solver behaviour implemented for a number of methods.
+  ///
+  /// To implement your own heuristic using this class as a base you'll have to
+  /// implement, as a minimum, the following methods:
+  /// <ul>
+  ///   <li> void addToHeuristicList(Graph::NodeItr) : Add a node to the
+  ///        heuristic reduction list.
+  ///   <li> void heuristicReduce() : Perform a single heuristic reduction.
+  ///   <li> void preUpdateEdgeCosts(Graph::EdgeItr) : Handle the (imminent)
+  ///        change to the cost matrix on the given edge (by R2).
+  ///   <li> void postUpdateEdgeCostts(Graph::EdgeItr) : Handle the new 
+  ///        costs on the given edge.
+  ///   <li> void handleAddEdge(Graph::EdgeItr) : Handle the addition of a new
+  ///        edge into the PBQP graph (by R2).
+  ///   <li> void handleRemoveEdge(Graph::EdgeItr, Graph::NodeItr) : Handle the
+  ///        disconnection of the given edge from the given node.
+  ///   <li> A constructor for your derived class : to pass back a reference to
+  ///        the solver which is using this heuristic.
+  /// </ul>
+  ///
+  /// These methods are implemented in this class for documentation purposes,
+  /// but will assert if called. A skeletal example follows the class
+  /// definition below.
+  /// 
+  /// Note that this class uses the curiously recursive template idiom to
+  /// forward calls to the derived class. These methods need not be made
+  /// virtual, and indeed probably shouldn't for performance reasons.
+  ///
+  /// You'll also need to provide NodeData and EdgeData structs in your class.
+  /// These can be used to attach data relevant to your heuristic to each
+  /// node/edge in the PBQP graph.
+
+  template <typename HImpl>
+  class HeuristicBase {
+  private:
+
+    typedef std::list<Graph::NodeItr> OptimalList;
+
+    HeuristicSolverImpl<HImpl> &s;
+    Graph &g;
+    OptimalList optimalList;
+
+    // Return a reference to the derived heuristic.
+    HImpl& impl() { return static_cast<HImpl&>(*this); }
+
+    // Add the given node to the optimal reductions list.
+    void addToOptimalReductionList(Graph::NodeItr nItr) {
+      optimalList.insert(optimalList.end(), nItr);
+    }
+
+  public:
+
+    /// \brief Construct an instance with a reference to the given solver.
+    /// @param solver The solver which is using this heuristic instance.
+    HeuristicBase(HeuristicSolverImpl<HImpl> &solver)
+      : s(solver), g(s.getGraph()) { }
+
+    /// \brief Get the solver which is using this heuristic instance.
+    /// @return The solver which is using this heuristic instance.
+    ///
+    /// You can use this method to get access to the solver in your derived
+    /// heuristic implementation.
+    HeuristicSolverImpl<HImpl>& getSolver() { return s; }
+
+    /// \brief Get the graph representing the problem to be solved.
+    /// @return The graph representing the problem to be solved.
+    Graph& getGraph() { return g; }
+
+    /// \brief Tell the solver to simplify the graph before the reduction phase.
+    /// @return Whether or not the solver should run a simplification phase
+    ///         prior to the main setup and reduction.
+    ///
+    /// HeuristicBase returns true from this method as it's a sensible default,
+    /// however you can over-ride it in your derived class if you want different
+    /// behaviour.
+    bool solverRunSimplify() const { return true; }
+
+    /// \brief Decide whether a node should be optimally or heuristically 
+    ///        reduced.
+    /// @return Whether or not the given node should be listed for optimal
+    ///         reduction (via R0, R1 or R2).
+    ///
+    /// HeuristicBase returns true for any node with degree less than 3. This is
+    /// sane and sensible for many situations, but not all. You can over-ride
+    /// this method in your derived class if you want a different selection
+    /// criterion. Note however that your criterion for selecting optimal nodes
+    /// should be <i>at least</i> as strong as this, i.e. nodes of degree 3 or
+    /// higher should not be selected under any circumstances.
+    bool shouldOptimallyReduce(Graph::NodeItr nItr) {
+      if (g.getNodeDegree(nItr) < 3)
+        return true;
+      // else
+      return false;
+    }
+
+    /// \brief Add the given node to the list of nodes to be optimally reduced.
+    /// @param nItr Node iterator to be added.
+    ///
+    /// You probably don't want to over-ride this, except perhaps to record
+    /// statistics before calling this implementation. HeuristicBase relies on
+    /// its behaviour.
+    void addToOptimalReduceList(Graph::NodeItr nItr) {
+      optimalList.push_back(nItr);
+    }
+
+    /// \brief Initialise the heuristic.
+    ///
+    /// HeuristicBase iterates over all nodes in the problem and adds them to
+    /// the appropriate list using addToOptimalReduceList or
+    /// addToHeuristicReduceList based on the result of shouldOptimallyReduce.
+    ///
+    /// This behaviour should be fine for most situations.
+    void setup() {
+      for (Graph::NodeItr nItr = g.nodesBegin(), nEnd = g.nodesEnd();
+           nItr != nEnd; ++nItr) {
+        if (impl().shouldOptimallyReduce(nItr)) {
+          addToOptimalReduceList(nItr);
+        } else {
+          impl().addToHeuristicReduceList(nItr);
+        }
+      }
+    }
+
+    /// \brief Optimally reduce one of the nodes in the optimal reduce list.
+    /// @return True if a reduction takes place, false if the optimal reduce
+    ///         list is empty.
+    ///
+    /// Selects a node from the optimal reduce list and removes it, applying
+    /// R0, R1 or R2 as appropriate based on the selected node's degree.
+    bool optimalReduce() {
+      if (optimalList.empty())
+        return false;
+
+      Graph::NodeItr nItr = optimalList.front();
+      optimalList.pop_front();
+
+      switch (s.getSolverDegree(nItr)) {
+        case 0: s.applyR0(nItr); break;
+        case 1: s.applyR1(nItr); break;
+        case 2: s.applyR2(nItr); break;
+        default: assert(false &&
+                        "Optimal reduction of nodes with degree > 2 is invalid.");
+      }
+
+      return true;
+    }
+
+    /// \brief Perform the PBQP reduction process.
+    ///
+    /// Reduces the problem to the empty graph by repeated application of the
+    /// reduction rules R0, R1, R2 and RN.
+    /// R0, R1 or R2 are always applied if possible before RN is used.
+    void reduce() {
+      bool finished = false;
+
+      while (!finished) {
+        if (!optimalReduce())
+          if (!impl().heuristicReduce())
+            finished = true;
+      }
+    }
+
+    /// \brief Add a node to the heuristic reduce list.
+    /// @param nItr Node iterator to add to the heuristic reduce list.
+    void addToHeuristicList(Graph::NodeItr nItr) {
+      assert(false && "Must be implemented in derived class.");
+    }
+
+    /// \brief Heuristically reduce one of the nodes in the heuristic
+    ///        reduce list.
+    /// @return True if a reduction takes place, false if the heuristic reduce
+    ///         list is empty.
+    bool heuristicReduce() {
+      assert(false && "Must be implemented in derived class.");
+      return false;
+    }
+
+    /// \brief Prepare a change in the costs on the given edge.
+    /// @param eItr Edge iterator.    
+    void preUpdateEdgeCosts(Graph::EdgeItr eItr) {
+      assert(false && "Must be implemented in derived class.");
+    }
+
+    /// \brief Handle the change in the costs on the given edge.
+    /// @param eItr Edge iterator.
+    void postUpdateEdgeCostts(Graph::EdgeItr eItr) {
+      assert(false && "Must be implemented in derived class.");
+    }
+
+    /// \brief Handle the addition of a new edge into the PBQP graph.
+    /// @param eItr Edge iterator for the added edge.
+    void handleAddEdge(Graph::EdgeItr eItr) {
+      assert(false && "Must be implemented in derived class.");
+    }
+
+    /// \brief Handle disconnection of an edge from a node.
+    /// @param eItr Edge iterator for edge being disconnected.
+    /// @param nItr Node iterator for the node being disconnected from.
+    ///
+    /// Edges are frequently removed due to the removal of a node. This
+    /// method allows for the effect to be computed only for the remaining
+    /// node in the graph.
+    void handleRemoveEdge(Graph::EdgeItr eItr, Graph::NodeItr nItr) {
+      assert(false && "Must be implemented in derived class.");
+    }
+
+    /// \brief Clean up any structures used by HeuristicBase.
+    ///
+    /// At present this just performs a sanity check: that the optimal reduce
+    /// list is empty now that reduction has completed.
+    ///
+    /// If your derived class has more complex structures which need tearing
+    /// down you should over-ride this method but include a call back to this
+    /// implementation.
+    void cleanup() {
+      assert(optimalList.empty() && "Nodes left over in optimal reduce list?");
+    }
+
+  };
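+
+  // A skeletal derived heuristic, for illustration only (every real
+  // heuristic used with this solver does considerably more):
+  //
+  //   class NullHeuristic : public HeuristicBase<NullHeuristic> {
+  //   public:
+  //     struct NodeData {};
+  //     struct EdgeData {};
+  //     NullHeuristic(HeuristicSolverImpl<NullHeuristic> &s)
+  //       : HeuristicBase<NullHeuristic>(s) {}
+  //     void addToHeuristicList(Graph::NodeItr) {}
+  //     bool heuristicReduce() { return false; } // never reduces; only valid
+  //                                              // if every node is optimally
+  //                                              // reducible
+  //     void preUpdateEdgeCosts(Graph::EdgeItr) {}
+  //     void postUpdateEdgeCostts(Graph::EdgeItr) {}
+  //     void handleAddEdge(Graph::EdgeItr) {}
+  //     void handleRemoveEdge(Graph::EdgeItr, Graph::NodeItr) {}
+  //   };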
+
+}
+
+
+#endif // LLVM_CODEGEN_PBQP_HEURISTICBASE_H
diff --git a/src/LLVM/lib/CodeGen/PBQP/HeuristicSolver.h b/src/LLVM/lib/CodeGen/PBQP/HeuristicSolver.h
new file mode 100644
index 0000000..02938df
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PBQP/HeuristicSolver.h
@@ -0,0 +1,607 @@
+//===-- HeuristicSolver.h - Heuristic PBQP Solver --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Heuristic PBQP solver. This solver is able to perform optimal reductions for
+// nodes of degree 0, 1 or 2. For nodes of degree >2 a pluggable heuristic is
+// used to select a node for reduction. 
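+//
+// Typical use, e.g. with the Briggs heuristic shipped alongside this solver
+// (see Heuristics/Briggs.h):
+//
+//   Solution s = HeuristicSolver<Heuristics::Briggs>::solve(g);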
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_PBQP_HEURISTICSOLVER_H
+#define LLVM_CODEGEN_PBQP_HEURISTICSOLVER_H
+
+#include "Graph.h"
+#include "Solution.h"
+#include <vector>
+#include <limits>
+
+namespace PBQP {
+
+  /// \brief Heuristic PBQP solver implementation.
+  ///
+  /// This class should usually be created (and destroyed) indirectly via a call
+  /// to HeuristicSolver<HImpl>::solve(Graph&).
+  /// See the comments for HeuristicSolver.
+  ///
+  /// HeuristicSolverImpl provides the R0, R1 and R2 reduction rules,
+  /// backpropagation phase, and maintains the internal copy of the graph on
+  /// which the reduction is carried out (the original being kept to facilitate
+  /// backpropagation).
+  template <typename HImpl>
+  class HeuristicSolverImpl {
+  private:
+
+    typedef typename HImpl::NodeData HeuristicNodeData;
+    typedef typename HImpl::EdgeData HeuristicEdgeData;
+
+    typedef std::list<Graph::EdgeItr> SolverEdges;
+
+  public:
+  
+    /// \brief Iterator type for edges in the solver graph.
+    typedef SolverEdges::iterator SolverEdgeItr;
+
+  private:
+
+    class NodeData {
+    public:
+      NodeData() : solverDegree(0) {}
+
+      HeuristicNodeData& getHeuristicData() { return hData; }
+
+      SolverEdgeItr addSolverEdge(Graph::EdgeItr eItr) {
+        ++solverDegree;
+        return solverEdges.insert(solverEdges.end(), eItr);
+      }
+
+      void removeSolverEdge(SolverEdgeItr seItr) {
+        --solverDegree;
+        solverEdges.erase(seItr);
+      }
+
+      SolverEdgeItr solverEdgesBegin() { return solverEdges.begin(); }
+      SolverEdgeItr solverEdgesEnd() { return solverEdges.end(); }
+      unsigned getSolverDegree() const { return solverDegree; }
+      void clearSolverEdges() {
+        solverDegree = 0;
+        solverEdges.clear(); 
+      }
+      
+    private:
+      HeuristicNodeData hData;
+      unsigned solverDegree;
+      SolverEdges solverEdges;
+    };
+ 
+    class EdgeData {
+    public:
+      HeuristicEdgeData& getHeuristicData() { return hData; }
+
+      void setN1SolverEdgeItr(SolverEdgeItr n1SolverEdgeItr) {
+        this->n1SolverEdgeItr = n1SolverEdgeItr;
+      }
+
+      SolverEdgeItr getN1SolverEdgeItr() { return n1SolverEdgeItr; }
+
+      void setN2SolverEdgeItr(SolverEdgeItr n2SolverEdgeItr){
+        this->n2SolverEdgeItr = n2SolverEdgeItr;
+      }
+
+      SolverEdgeItr getN2SolverEdgeItr() { return n2SolverEdgeItr; }
+
+    private:
+
+      HeuristicEdgeData hData;
+      SolverEdgeItr n1SolverEdgeItr, n2SolverEdgeItr;
+    };
+
+    Graph &g;
+    HImpl h;
+    Solution s;
+    std::vector<Graph::NodeItr> stack;
+
+    typedef std::list<NodeData> NodeDataList;
+    NodeDataList nodeDataList;
+
+    typedef std::list<EdgeData> EdgeDataList;
+    EdgeDataList edgeDataList;
+
+  public:
+
+    /// \brief Construct a heuristic solver implementation to solve the given
+    ///        graph.
+    /// @param g The graph representing the problem instance to be solved.
+    HeuristicSolverImpl(Graph &g) : g(g), h(*this) {}  
+
+    /// \brief Get the graph being solved by this solver.
+    /// @return The graph representing the problem instance being solved by this
+    ///         solver.
+    Graph& getGraph() { return g; }
+
+    /// \brief Get the heuristic data attached to the given node.
+    /// @param nItr Node iterator.
+    /// @return The heuristic data attached to the given node.
+    HeuristicNodeData& getHeuristicNodeData(Graph::NodeItr nItr) {
+      return getSolverNodeData(nItr).getHeuristicData();
+    }
+
+    /// \brief Get the heuristic data attached to the given edge.
+    /// @param eItr Edge iterator.
+    /// @return The heuristic data attached to the given edge.
+    HeuristicEdgeData& getHeuristicEdgeData(Graph::EdgeItr eItr) {
+      return getSolverEdgeData(eItr).getHeuristicData();
+    }
+
+    /// \brief Begin iterator for the set of edges adjacent to the given node in
+    ///        the solver graph.
+    /// @param nItr Node iterator.
+    /// @return Begin iterator for the set of edges adjacent to the given node
+    ///         in the solver graph. 
+    SolverEdgeItr solverEdgesBegin(Graph::NodeItr nItr) {
+      return getSolverNodeData(nItr).solverEdgesBegin();
+    }
+
+    /// \brief End iterator for the set of edges adjacent to the given node in
+    ///        the solver graph.
+    /// @param nItr Node iterator.
+    /// @return End iterator for the set of edges adjacent to the given node in
+    ///         the solver graph. 
+    SolverEdgeItr solverEdgesEnd(Graph::NodeItr nItr) {
+      return getSolverNodeData(nItr).solverEdgesEnd();
+    }
+
+    /// \brief Remove an edge from the solver graph.
+    /// @param eItr Edge iterator for the edge to be removed.
+    ///
+    /// Does <i>not</i> notify the heuristic of the removal. That should be
+    /// done manually if necessary.
+    void removeSolverEdge(Graph::EdgeItr eItr) {
+      EdgeData &eData = getSolverEdgeData(eItr);
+      NodeData &n1Data = getSolverNodeData(g.getEdgeNode1(eItr)),
+               &n2Data = getSolverNodeData(g.getEdgeNode2(eItr));
+
+      n1Data.removeSolverEdge(eData.getN1SolverEdgeItr());
+      n2Data.removeSolverEdge(eData.getN2SolverEdgeItr());
+    }
+
+    /// \brief Compute a solution to the PBQP problem instance with which this
+    ///        heuristic solver was constructed.
+    /// @return A solution to the PBQP problem.
+    ///
+    /// Performs the full PBQP heuristic solver algorithm, including setup,
+    /// calls to the heuristic (which will call back to the reduction rules in
+    /// this class), and cleanup.
+    Solution computeSolution() {
+      setup();
+      h.setup();
+      h.reduce();
+      backpropagate();
+      h.cleanup();
+      cleanup();
+      return s;
+    }
+
+    /// \brief Add to the end of the stack.
+    /// @param nItr Node iterator to add to the reduction stack.
+    void pushToStack(Graph::NodeItr nItr) {
+      getSolverNodeData(nItr).clearSolverEdges();
+      stack.push_back(nItr);
+    }
+
+    /// \brief Returns the solver degree of the given node.
+    /// @param nItr Node iterator for which degree is requested.
+    /// @return Node degree in the <i>solver</i> graph (not the original graph).
+    unsigned getSolverDegree(Graph::NodeItr nItr) {
+      return getSolverNodeData(nItr).getSolverDegree();
+    }
+
+    /// \brief Set the solution of the given node.
+    /// @param nItr Node iterator to set solution for.
+    /// @param selection Selection for node.
+    void setSolution(const Graph::NodeItr &nItr, unsigned selection) {
+      s.setSelection(nItr, selection);
+
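+      // Re-attach this node's edges to its neighbours' solver-edge lists so
+      // that nodes popped later during backpropagation can see this (now
+      // solved) node when computing their own selections.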
+      for (Graph::AdjEdgeItr aeItr = g.adjEdgesBegin(nItr),
+                             aeEnd = g.adjEdgesEnd(nItr);
+           aeItr != aeEnd; ++aeItr) {
+        Graph::EdgeItr eItr(*aeItr);
+        Graph::NodeItr anItr(g.getEdgeOtherNode(eItr, nItr));
+        getSolverNodeData(anItr).addSolverEdge(eItr);
+      }
+    }
+
+    /// \brief Apply rule R0.
+    /// @param nItr Node iterator for node to apply R0 to.
+    ///
+    /// Node will be automatically pushed to the solver stack.
+    void applyR0(Graph::NodeItr nItr) {
+      assert(getSolverNodeData(nItr).getSolverDegree() == 0 &&
+             "R0 applied to node with degree != 0.");
+
+      // Nothing to do. Just push the node onto the reduction stack.
+      pushToStack(nItr);
+    }
+
+    /// \brief Apply rule R1.
+    /// @param xnItr Node iterator for node to apply R1 to.
+    ///
+    /// Node will be automatically pushed to the solver stack.
+    void applyR1(Graph::NodeItr xnItr) {
+      NodeData &nd = getSolverNodeData(xnItr);
+      assert(nd.getSolverDegree() == 1 &&
+             "R1 applied to node with degree != 1.");
+
+      Graph::EdgeItr eItr = *nd.solverEdgesBegin();
+
+      const Matrix &eCosts = g.getEdgeCosts(eItr);
+      const Vector &xCosts = g.getNodeCosts(xnItr);
+      
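+      // R1 folds x into its single neighbour y: for each choice j available
+      // to y, y's cost absorbs the cheapest combined choice of x, i.e.
+      //   yCosts[j] += min over i of (eCosts[i][j] + xCosts[i])
+      // when x indexes the rows of the edge matrix (the branches below
+      // handle both orientations).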
+      // Duplicate a little to avoid transposing matrices.
+      if (xnItr == g.getEdgeNode1(eItr)) {
+        Graph::NodeItr ynItr = g.getEdgeNode2(eItr);
+        Vector &yCosts = g.getNodeCosts(ynItr);
+        for (unsigned j = 0; j < yCosts.getLength(); ++j) {
+          PBQPNum min = eCosts[0][j] + xCosts[0];
+          for (unsigned i = 1; i < xCosts.getLength(); ++i) {
+            PBQPNum c = eCosts[i][j] + xCosts[i];
+            if (c < min)
+              min = c;
+          }
+          yCosts[j] += min;
+        }
+        h.handleRemoveEdge(eItr, ynItr);
+      } else {
+        Graph::NodeItr ynItr = g.getEdgeNode1(eItr);
+        Vector &yCosts = g.getNodeCosts(ynItr);
+        for (unsigned i = 0; i < yCosts.getLength(); ++i) {
+          PBQPNum min = eCosts[i][0] + xCosts[0];
+          for (unsigned j = 1; j < xCosts.getLength(); ++j) {
+            PBQPNum c = eCosts[i][j] + xCosts[j];
+            if (c < min)
+              min = c;
+          }
+          yCosts[i] += min;
+        }
+        h.handleRemoveEdge(eItr, ynItr);
+      }
+      removeSolverEdge(eItr);
+      assert(nd.getSolverDegree() == 0 &&
+             "Degree 1 with edge removed should be 0.");
+      pushToStack(xnItr);
+    }
+
+    /// \brief Apply rule R2.
+    /// @param xnItr Node iterator for node to apply R2 to.
+    ///
+    /// Node will be automatically pushed to the solver stack.
+    void applyR2(Graph::NodeItr xnItr) {
+      assert(getSolverNodeData(xnItr).getSolverDegree() == 2 &&
+             "R2 applied to node with degree != 2.");
+
+      NodeData &nd = getSolverNodeData(xnItr);
+      const Vector &xCosts = g.getNodeCosts(xnItr);
+
+      SolverEdgeItr aeItr = nd.solverEdgesBegin();
+      Graph::EdgeItr yxeItr = *aeItr,
+                     zxeItr = *(++aeItr);
+
+      Graph::NodeItr ynItr = g.getEdgeOtherNode(yxeItr, xnItr),
+                     znItr = g.getEdgeOtherNode(zxeItr, xnItr);
+
+      bool flipEdge1 = (g.getEdgeNode1(yxeItr) == xnItr),
+           flipEdge2 = (g.getEdgeNode1(zxeItr) == xnItr);
+
+      const Matrix *yxeCosts = flipEdge1 ?
+        new Matrix(g.getEdgeCosts(yxeItr).transpose()) :
+        &g.getEdgeCosts(yxeItr);
+
+      const Matrix *zxeCosts = flipEdge2 ?
+        new Matrix(g.getEdgeCosts(zxeItr).transpose()) :
+        &g.getEdgeCosts(zxeItr);
+
+      unsigned xLen = xCosts.getLength(),
+               yLen = yxeCosts->getRows(),
+               zLen = zxeCosts->getRows();
+               
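+      // R2 replaces x with a single edge between its neighbours y and z.
+      // Each entry of the replacement matrix records the cheapest choice of
+      // x given fixed choices i for y and j for z:
+      //   delta[i][j] = min over k of (yxeCosts[i][k] + zxeCosts[j][k]
+      //                                + xCosts[k])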
+      Matrix delta(yLen, zLen);
+
+      for (unsigned i = 0; i < yLen; ++i) {
+        for (unsigned j = 0; j < zLen; ++j) {
+          PBQPNum min = (*yxeCosts)[i][0] + (*zxeCosts)[j][0] + xCosts[0];
+          for (unsigned k = 1; k < xLen; ++k) {
+            PBQPNum c = (*yxeCosts)[i][k] + (*zxeCosts)[j][k] + xCosts[k];
+            if (c < min) {
+              min = c;
+            }
+          }
+          delta[i][j] = min;
+        }
+      }
+
+      if (flipEdge1)
+        delete yxeCosts;
+
+      if (flipEdge2)
+        delete zxeCosts;
+
+      Graph::EdgeItr yzeItr = g.findEdge(ynItr, znItr);
+      bool addedEdge = false;
+
+      if (yzeItr == g.edgesEnd()) {
+        yzeItr = g.addEdge(ynItr, znItr, delta);
+        addedEdge = true;
+      } else {
+        Matrix &yzeCosts = g.getEdgeCosts(yzeItr);
+        h.preUpdateEdgeCosts(yzeItr);
+        if (ynItr == g.getEdgeNode1(yzeItr)) {
+          yzeCosts += delta;
+        } else {
+          yzeCosts += delta.transpose();
+        }
+      }
+
+      bool nullCostEdge = tryNormaliseEdgeMatrix(yzeItr);
+
+      if (!addedEdge) {
+        // If we modified the edge costs let the heuristic know.
+        h.postUpdateEdgeCosts(yzeItr);
+      }
+ 
+      if (nullCostEdge) {
+        // If this edge ended up null remove it.
+        if (!addedEdge) {
+          // We didn't just add it, so we need to notify the heuristic
+          // and remove it from the solver.
+          h.handleRemoveEdge(yzeItr, ynItr);
+          h.handleRemoveEdge(yzeItr, znItr);
+          removeSolverEdge(yzeItr);
+        }
+        g.removeEdge(yzeItr);
+      } else if (addedEdge) {
+        // If the edge was added, and non-null, finish setting it up, add it to
+        // the solver & notify heuristic.
+        edgeDataList.push_back(EdgeData());
+        g.setEdgeData(yzeItr, &edgeDataList.back());
+        addSolverEdge(yzeItr);
+        h.handleAddEdge(yzeItr);
+      }
+
+      h.handleRemoveEdge(yxeItr, ynItr);
+      removeSolverEdge(yxeItr);
+      h.handleRemoveEdge(zxeItr, znItr);
+      removeSolverEdge(zxeItr);
+
+      pushToStack(xnItr);
+    }
+
+  private:
+
+    NodeData& getSolverNodeData(Graph::NodeItr nItr) {
+      return *static_cast<NodeData*>(g.getNodeData(nItr));
+    }
+
+    EdgeData& getSolverEdgeData(Graph::EdgeItr eItr) {
+      return *static_cast<EdgeData*>(g.getEdgeData(eItr));
+    }
+
+    void addSolverEdge(Graph::EdgeItr eItr) {
+      EdgeData &eData = getSolverEdgeData(eItr);
+      NodeData &n1Data = getSolverNodeData(g.getEdgeNode1(eItr)),
+               &n2Data = getSolverNodeData(g.getEdgeNode2(eItr));
+
+      eData.setN1SolverEdgeItr(n1Data.addSolverEdge(eItr));
+      eData.setN2SolverEdgeItr(n2Data.addSolverEdge(eItr));
+    }
+
+    void setup() {
+      if (h.solverRunSimplify()) {
+        simplify();
+      }
+
+      // Create node data objects.
+      for (Graph::NodeItr nItr = g.nodesBegin(), nEnd = g.nodesEnd();
+           nItr != nEnd; ++nItr) {
+        nodeDataList.push_back(NodeData());
+        g.setNodeData(nItr, &nodeDataList.back());
+      }
+
+      // Create edge data objects.
+      for (Graph::EdgeItr eItr = g.edgesBegin(), eEnd = g.edgesEnd();
+           eItr != eEnd; ++eItr) {
+        edgeDataList.push_back(EdgeData());
+        g.setEdgeData(eItr, &edgeDataList.back());
+        addSolverEdge(eItr);
+      }
+    }
+
+    void simplify() {
+      disconnectTrivialNodes();
+      eliminateIndependentEdges();
+    }
+
+    // Disconnect trivial nodes: a node whose cost vector has length one has
+    // no real choice, so each incident edge can be folded into the
+    // neighbouring node's cost vector and then removed.
+    void disconnectTrivialNodes() {
+      unsigned numDisconnected = 0;
+
+      for (Graph::NodeItr nItr = g.nodesBegin(), nEnd = g.nodesEnd();
+           nItr != nEnd; ++nItr) {
+
+        if (g.getNodeCosts(nItr).getLength() == 1) {
+
+          std::vector<Graph::EdgeItr> edgesToRemove;
+
+          for (Graph::AdjEdgeItr aeItr = g.adjEdgesBegin(nItr),
+                                 aeEnd = g.adjEdgesEnd(nItr);
+               aeItr != aeEnd; ++aeItr) {
+
+            Graph::EdgeItr eItr = *aeItr;
+
+            if (g.getEdgeNode1(eItr) == nItr) {
+              Graph::NodeItr otherNodeItr = g.getEdgeNode2(eItr);
+              g.getNodeCosts(otherNodeItr) +=
+                g.getEdgeCosts(eItr).getRowAsVector(0);
+            }
+            else {
+              Graph::NodeItr otherNodeItr = g.getEdgeNode1(eItr);
+              g.getNodeCosts(otherNodeItr) +=
+                g.getEdgeCosts(eItr).getColAsVector(0);
+            }
+
+            edgesToRemove.push_back(eItr);
+          }
+
+          if (!edgesToRemove.empty())
+            ++numDisconnected;
+
+          while (!edgesToRemove.empty()) {
+            g.removeEdge(edgesToRemove.back());
+            edgesToRemove.pop_back();
+          }
+        }
+      }
+    }
+
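+    // An edge is "independent" if normalising it (pulling row and column
+    // minima into the adjacent node cost vectors) leaves an all-zero cost
+    // matrix; such edges constrain nothing and are removed up front.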
+    void eliminateIndependentEdges() {
+      std::vector<Graph::EdgeItr> edgesToProcess;
+      unsigned numEliminated = 0;
+
+      for (Graph::EdgeItr eItr = g.edgesBegin(), eEnd = g.edgesEnd();
+           eItr != eEnd; ++eItr) {
+        edgesToProcess.push_back(eItr);
+      }
+
+      while (!edgesToProcess.empty()) {
+        if (tryToEliminateEdge(edgesToProcess.back()))
+          ++numEliminated;
+        edgesToProcess.pop_back();
+      }
+    }
+
+    bool tryToEliminateEdge(Graph::EdgeItr eItr) {
+      if (tryNormaliseEdgeMatrix(eItr)) {
+        g.removeEdge(eItr);
+        return true; 
+      }
+      return false;
+    }
+
+    bool tryNormaliseEdgeMatrix(Graph::EdgeItr &eItr) {
+
+      const PBQPNum infinity = std::numeric_limits<PBQPNum>::infinity();
+
+      Matrix &edgeCosts = g.getEdgeCosts(eItr);
+      Vector &uCosts = g.getNodeCosts(g.getEdgeNode1(eItr)),
+             &vCosts = g.getNodeCosts(g.getEdgeNode2(eItr));
+
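+      // Pull the minimum of each row of the edge matrix into node u's cost
+      // vector, then the minimum of each column into node v's. A row (or
+      // column) that is infinite in every feasible position makes the
+      // corresponding node option infinite and is zeroed out. If the whole
+      // matrix ends up zero the edge is redundant and the caller removes it.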
+      for (unsigned r = 0; r < edgeCosts.getRows(); ++r) {
+        PBQPNum rowMin = infinity;
+
+        for (unsigned c = 0; c < edgeCosts.getCols(); ++c) {
+          if (vCosts[c] != infinity && edgeCosts[r][c] < rowMin)
+            rowMin = edgeCosts[r][c];
+        }
+
+        uCosts[r] += rowMin;
+
+        if (rowMin != infinity) {
+          edgeCosts.subFromRow(r, rowMin);
+        }
+        else {
+          edgeCosts.setRow(r, 0);
+        }
+      }
+
+      for (unsigned c = 0; c < edgeCosts.getCols(); ++c) {
+        PBQPNum colMin = infinity;
+
+        for (unsigned r = 0; r < edgeCosts.getRows(); ++r) {
+          if (uCosts[r] != infinity && edgeCosts[r][c] < colMin)
+            colMin = edgeCosts[r][c];
+        }
+
+        vCosts[c] += colMin;
+
+        if (colMin != infinity) {
+          edgeCosts.subFromCol(c, colMin);
+        }
+        else {
+          edgeCosts.setCol(c, 0);
+        }
+      }
+
+      return edgeCosts.isZero();
+    }
+
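+    // Pop nodes off the reduction stack in reverse reduction order. Each
+    // popped node is assigned the selection that is minimal given the
+    // selections already made for its previously popped neighbours.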
+    void backpropagate() {
+      while (!stack.empty()) {
+        computeSolution(stack.back());
+        stack.pop_back();
+      }
+    }
+
+    void computeSolution(Graph::NodeItr nItr) {
+
+      NodeData &nodeData = getSolverNodeData(nItr);
+
+      Vector v(g.getNodeCosts(nItr));
+
+      // Solve based on existing solved edges.
+      for (SolverEdgeItr solvedEdgeItr = nodeData.solverEdgesBegin(),
+                         solvedEdgeEnd = nodeData.solverEdgesEnd();
+           solvedEdgeItr != solvedEdgeEnd; ++solvedEdgeItr) {
+
+        Graph::EdgeItr eItr(*solvedEdgeItr);
+        Matrix &edgeCosts = g.getEdgeCosts(eItr);
+
+        if (nItr == g.getEdgeNode1(eItr)) {
+          Graph::NodeItr adjNode(g.getEdgeNode2(eItr));
+          unsigned adjSolution = s.getSelection(adjNode);
+          v += edgeCosts.getColAsVector(adjSolution);
+        }
+        else {
+          Graph::NodeItr adjNode(g.getEdgeNode1(eItr));
+          unsigned adjSolution = s.getSelection(adjNode);
+          v += edgeCosts.getRowAsVector(adjSolution);
+        }
+
+      }
+
+      setSolution(nItr, v.minIndex());
+    }
+
+    void cleanup() {
+      h.cleanup();
+      nodeDataList.clear();
+      edgeDataList.clear();
+    }
+  };
+
+  /// \brief PBQP heuristic solver class.
+  ///
+  /// Given a PBQP Graph g representing a PBQP problem, you can find a solution
+  /// by calling
+  /// <tt>Solution s = HeuristicSolver<H>::solve(g);</tt>
+  ///
+  /// The choice of heuristic for the H parameter will affect both the solver
+  /// speed and solution quality. The heuristic should be chosen based on the
+  /// nature of the problem being solved.
+  /// Currently the only heuristic included with LLVM is the Briggs heuristic
+  /// for register allocation.
+  template <typename HImpl>
+  class HeuristicSolver {
+  public:
+    static Solution solve(Graph &g) {
+      HeuristicSolverImpl<HImpl> hs(g);
+      return hs.computeSolution();
+    }
+  };
+
+}
+
+#endif // LLVM_CODEGEN_PBQP_HEURISTICSOLVER_H
diff --git a/src/LLVM/lib/CodeGen/PBQP/Heuristics/Briggs.h b/src/LLVM/lib/CodeGen/PBQP/Heuristics/Briggs.h
new file mode 100644
index 0000000..18eaf7c
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PBQP/Heuristics/Briggs.h
@@ -0,0 +1,460 @@
+//===-- Briggs.h --- Briggs Heuristic for PBQP ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements the Briggs test for "allocability" of nodes in a
+// PBQP graph representing a register allocation problem. Nodes which can be
+// proven allocable (by a safe and relatively accurate test) are removed from
+// the PBQP graph first. If no provably allocable node is present in the graph
+// then the node with the minimal spill-cost to degree ratio is removed.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_PBQP_HEURISTICS_BRIGGS_H
+#define LLVM_CODEGEN_PBQP_HEURISTICS_BRIGGS_H
+
+#include "../HeuristicSolver.h"
+#include "../HeuristicBase.h"
+
+#include <limits>
+#include <list>
+#include <set>
+#include <vector>
+
+namespace PBQP {
+  namespace Heuristics {
+
+    /// \brief PBQP Heuristic which applies an allocability test based on
+    ///        Briggs.
+    /// 
+    /// This heuristic assumes that the elements of cost vectors in the PBQP
+    /// problem represent storage options, with the first being the spill
+    /// option and subsequent elements representing legal registers for the
+    /// corresponding node. Edge cost matrices are likewise assumed to represent
+    /// register constraints.
+    /// If one or more nodes can be proven allocable by this heuristic (by
+    /// inspection of their constraint matrices) then the allocable node of
+    /// highest degree is selected for the next reduction and pushed to the
+    /// solver stack. If no nodes can be proven allocable then the node with
+    /// the lowest estimated spill cost is selected and pushed to the solver
+    /// stack instead.
+    /// 
+    /// This implementation is built on top of HeuristicBase.       
+    class Briggs : public HeuristicBase<Briggs> {
+    private:
+
+      class LinkDegreeComparator {
+      public:
+        LinkDegreeComparator(HeuristicSolverImpl<Briggs> &s) : s(&s) {}
+        bool operator()(Graph::NodeItr n1Itr, Graph::NodeItr n2Itr) const {
+          if (s->getSolverDegree(n1Itr) > s->getSolverDegree(n2Itr))
+            return true;
+          return false;
+        }
+      private:
+        HeuristicSolverImpl<Briggs> *s;
+      };
+
+      class SpillCostComparator {
+      public:
+        SpillCostComparator(HeuristicSolverImpl<Briggs> &s)
+          : s(&s), g(&s.getGraph()) {}
+        bool operator()(Graph::NodeItr n1Itr, Graph::NodeItr n2Itr) const {
+          PBQPNum cost1 = g->getNodeCosts(n1Itr)[0] / s->getSolverDegree(n1Itr),
+                  cost2 = g->getNodeCosts(n2Itr)[0] / s->getSolverDegree(n2Itr);
+          if (cost1 < cost2)
+            return true;
+          return false;
+        }
+
+      private:
+        HeuristicSolverImpl<Briggs> *s;
+        Graph *g;
+      };
+
+      typedef std::list<Graph::NodeItr> RNAllocableList;
+      typedef RNAllocableList::iterator RNAllocableListItr;
+
+      typedef std::list<Graph::NodeItr> RNUnallocableList;  
+      typedef RNUnallocableList::iterator RNUnallocableListItr;
+
+    public:
+
+      struct NodeData {
+        typedef std::vector<unsigned> UnsafeDegreesArray;
+        bool isHeuristic, isAllocable, isInitialized;
+        unsigned numDenied, numSafe;
+        UnsafeDegreesArray unsafeDegrees;
+        RNAllocableListItr rnaItr;
+        RNUnallocableListItr rnuItr;
+
+        NodeData()
+          : isHeuristic(false), isAllocable(false), isInitialized(false),
+            numDenied(0), numSafe(0) { }
+      };
+
+      struct EdgeData {
+        typedef std::vector<unsigned> UnsafeArray;
+        unsigned worst, reverseWorst;
+        UnsafeArray unsafe, reverseUnsafe;
+        bool isUpToDate;
+
+        EdgeData() : worst(0), reverseWorst(0), isUpToDate(false) {}
+      };
+
+      /// \brief Construct an instance of the Briggs heuristic.
+      /// @param solver A reference to the solver which is using this heuristic.
+      Briggs(HeuristicSolverImpl<Briggs> &solver) :
+        HeuristicBase<Briggs>(solver) {}
+
+      /// \brief Determine whether a node should be reduced using optimal
+      ///        reduction.
+      /// @param nItr Node iterator to be considered.
+      /// @return True if the given node should be optimally reduced, false
+      ///         otherwise.
+      ///
+      /// Selects nodes of degree 0, 1 or 2 for optimal reduction; all other
+      /// nodes are left to the heuristic reduce lists (the implementation
+      /// below checks only the solver degree).
+      bool shouldOptimallyReduce(Graph::NodeItr nItr) {
+        if (getSolver().getSolverDegree(nItr) < 3) {
+          return true;
+        }
+        // else
+        return false;
+      }
+
+      /// \brief Add a node to the heuristic reduce list.
+      /// @param nItr Node iterator to add to the heuristic reduce list.
+      void addToHeuristicReduceList(Graph::NodeItr nItr) {
+        NodeData &nd = getHeuristicNodeData(nItr);
+        initializeNode(nItr);
+        nd.isHeuristic = true;
+        if (nd.isAllocable) {
+          nd.rnaItr = rnAllocableList.insert(rnAllocableList.end(), nItr);
+        } else {
+          nd.rnuItr = rnUnallocableList.insert(rnUnallocableList.end(), nItr);
+        }
+      }
+
+      /// \brief Heuristically reduce one of the nodes in the heuristic
+      ///        reduce list.
+      /// @return True if a reduction takes place, false if the heuristic reduce
+      ///         list is empty.
+      ///
+      /// If the list of allocable nodes is non-empty a node is selected
+      /// from it and pushed to the stack. Otherwise if the non-allocable list
+      /// is non-empty a node is selected from it and pushed to the stack.
+      /// If both lists are empty the method simply returns false with no action
+      /// taken.
+      bool heuristicReduce() {
+        if (!rnAllocableList.empty()) {
+          RNAllocableListItr rnaItr =
+            std::min_element(rnAllocableList.begin(), rnAllocableList.end(),
+                             LinkDegreeComparator(getSolver()));
+          Graph::NodeItr nItr = *rnaItr;
+          rnAllocableList.erase(rnaItr);
+          handleRemoveNode(nItr);
+          getSolver().pushToStack(nItr);
+          return true;
+        } else if (!rnUnallocableList.empty()) {
+          RNUnallocableListItr rnuItr =
+            std::min_element(rnUnallocableList.begin(),
+                             rnUnallocableList.end(),
+                             SpillCostComparator(getSolver()));
+          Graph::NodeItr nItr = *rnuItr;
+          rnUnallocableList.erase(rnuItr);
+          handleRemoveNode(nItr);
+          getSolver().pushToStack(nItr);
+          return true;
+        }
+        // else
+        return false;
+      }
+
+      /// \brief Prepare a change in the costs on the given edge.
+      /// @param eItr Edge iterator.    
+      void preUpdateEdgeCosts(Graph::EdgeItr eItr) {
+        Graph &g = getGraph();
+        Graph::NodeItr n1Itr = g.getEdgeNode1(eItr),
+                       n2Itr = g.getEdgeNode2(eItr);
+        NodeData &n1 = getHeuristicNodeData(n1Itr),
+                 &n2 = getHeuristicNodeData(n2Itr);
+
+        if (n1.isHeuristic)
+          subtractEdgeContributions(eItr, getGraph().getEdgeNode1(eItr));
+        if (n2.isHeuristic)
+          subtractEdgeContributions(eItr, getGraph().getEdgeNode2(eItr));
+
+        EdgeData &ed = getHeuristicEdgeData(eItr);
+        ed.isUpToDate = false;
+      }
+
+      /// \brief Handle the change in the costs on the given edge.
+      /// @param eItr Edge iterator.
+      void postUpdateEdgeCosts(Graph::EdgeItr eItr) {
+        // This is effectively the same as adding a new edge now, since
+        // we've factored out the costs of the old one.
+        handleAddEdge(eItr);
+      }
+
+      /// \brief Handle the addition of a new edge into the PBQP graph.
+      /// @param eItr Edge iterator for the added edge.
+      ///
+      /// Updates allocability of any nodes connected by this edge which are
+      /// being managed by the heuristic. If allocability changes they are
+      /// moved to the appropriate list.
+      void handleAddEdge(Graph::EdgeItr eItr) {
+        Graph &g = getGraph();
+        Graph::NodeItr n1Itr = g.getEdgeNode1(eItr),
+                       n2Itr = g.getEdgeNode2(eItr);
+        NodeData &n1 = getHeuristicNodeData(n1Itr),
+                 &n2 = getHeuristicNodeData(n2Itr);
+
+        // If neither node is managed by the heuristic there's nothing to be
+        // done.
+        if (!n1.isHeuristic && !n2.isHeuristic)
+          return;
+
+        // Ok - we need to update at least one node.
+        computeEdgeContributions(eItr);
+
+        // Update node 1 if it's managed by the heuristic.
+        if (n1.isHeuristic) {
+          bool n1WasAllocable = n1.isAllocable;
+          addEdgeContributions(eItr, n1Itr);
+          updateAllocability(n1Itr);
+          if (n1WasAllocable && !n1.isAllocable) {
+            rnAllocableList.erase(n1.rnaItr);
+            n1.rnuItr =
+              rnUnallocableList.insert(rnUnallocableList.end(), n1Itr);
+          }
+        }
+
+        // Likewise for node 2.
+        if (n2.isHeuristic) {
+          bool n2WasAllocable = n2.isAllocable;
+          addEdgeContributions(eItr, n2Itr);
+          updateAllocability(n2Itr);
+          if (n2WasAllocable && !n2.isAllocable) {
+            rnAllocableList.erase(n2.rnaItr);
+            n2.rnuItr =
+              rnUnallocableList.insert(rnUnallocableList.end(), n2Itr);
+          }
+        }
+      }
+
+      /// \brief Handle disconnection of an edge from a node.
+      /// @param eItr Edge iterator for edge being disconnected.
+      /// @param nItr Node iterator for the node being disconnected from.
+      ///
+      /// Updates allocability of the given node and, if appropriate, moves the
+      /// node to a new list.
+      void handleRemoveEdge(Graph::EdgeItr eItr, Graph::NodeItr nItr) {
+        NodeData &nd = getHeuristicNodeData(nItr);
+
+        // If the node is not managed by the heuristic there's nothing to be
+        // done.
+        if (!nd.isHeuristic)
+          return;
+
+        EdgeData &ed = getHeuristicEdgeData(eItr);
+        (void)ed;
+        assert(ed.isUpToDate && "Edge data is not up to date.");
+
+        // Update node.
+        bool ndWasAllocable = nd.isAllocable;
+        subtractEdgeContributions(eItr, nItr);
+        updateAllocability(nItr);
+
+        // If the node has gone optimal...
+        if (shouldOptimallyReduce(nItr)) {
+          nd.isHeuristic = false;
+          addToOptimalReduceList(nItr);
+          if (ndWasAllocable) {
+            rnAllocableList.erase(nd.rnaItr);
+          } else {
+            rnUnallocableList.erase(nd.rnuItr);
+          }
+        } else {
+          // Node didn't go optimal, but we might have to move it
+          // from "unallocable" to "allocable".
+          if (!ndWasAllocable && nd.isAllocable) {
+            rnUnallocableList.erase(nd.rnuItr);
+            nd.rnaItr = rnAllocableList.insert(rnAllocableList.end(), nItr);
+          }
+        }
+      }
+
+    private:
+
+      NodeData& getHeuristicNodeData(Graph::NodeItr nItr) {
+        return getSolver().getHeuristicNodeData(nItr);
+      }
+
+      EdgeData& getHeuristicEdgeData(Graph::EdgeItr eItr) {
+        return getSolver().getHeuristicEdgeData(eItr);
+      }
+
+      // Work out what this edge will contribute to the allocability of the
+      // nodes connected to it.
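+      // In the edge cost matrix, row/column 0 is the spill option, so an
+      // infinite entry at eCosts[i+1][j+1] means register option i of node 1
+      // cannot coexist with option j of node 2. 'worst' records the largest
+      // number of node-1 options any single node-2 choice can deny
+      // ('reverseWorst' the converse), and 'unsafe' flags the options that
+      // can be denied at all.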
+      void computeEdgeContributions(Graph::EdgeItr eItr) {
+        EdgeData &ed = getHeuristicEdgeData(eItr);
+
+        if (ed.isUpToDate)
+          return; // Edge data is already up to date.
+
+        Matrix &eCosts = getGraph().getEdgeCosts(eItr);
+
+        unsigned numRegs = eCosts.getRows() - 1,
+                 numReverseRegs = eCosts.getCols() - 1;
+
+        std::vector<unsigned> rowInfCounts(numRegs, 0),
+                              colInfCounts(numReverseRegs, 0);        
+
+        ed.worst = 0;
+        ed.reverseWorst = 0;
+        ed.unsafe.clear();
+        ed.unsafe.resize(numRegs, 0);
+        ed.reverseUnsafe.clear();
+        ed.reverseUnsafe.resize(numReverseRegs, 0);
+
+        for (unsigned i = 0; i < numRegs; ++i) {
+          for (unsigned j = 0; j < numReverseRegs; ++j) {
+            if (eCosts[i + 1][j + 1] ==
+                  std::numeric_limits<PBQPNum>::infinity()) {
+              ed.unsafe[i] = 1;
+              ed.reverseUnsafe[j] = 1;
+              ++rowInfCounts[i];
+              ++colInfCounts[j];
+
+              if (colInfCounts[j] > ed.worst) {
+                ed.worst = colInfCounts[j];
+              }
+
+              if (rowInfCounts[i] > ed.reverseWorst) {
+                ed.reverseWorst = rowInfCounts[i];
+              }
+            }
+          }
+        }
+
+        ed.isUpToDate = true;
+      }
+
+      // Add the contributions of the given edge to the given node's
+      // numDenied and numSafe members. No action is taken other than to update
+      // these member values. Once updated these numbers can be used by clients
+      // to update the node's allocability.
+      void addEdgeContributions(Graph::EdgeItr eItr, Graph::NodeItr nItr) {
+        EdgeData &ed = getHeuristicEdgeData(eItr);
+
+        assert(ed.isUpToDate && "Using out-of-date edge numbers.");
+
+        NodeData &nd = getHeuristicNodeData(nItr);
+        unsigned numRegs = getGraph().getNodeCosts(nItr).getLength() - 1;
+        
+        bool nIsNode1 = nItr == getGraph().getEdgeNode1(eItr);
+        EdgeData::UnsafeArray &unsafe =
+          nIsNode1 ? ed.unsafe : ed.reverseUnsafe;
+        nd.numDenied += nIsNode1 ? ed.worst : ed.reverseWorst;
+
+        for (unsigned r = 0; r < numRegs; ++r) {
+          if (unsafe[r]) {
+            if (nd.unsafeDegrees[r]==0) {
+              --nd.numSafe;
+            }
+            ++nd.unsafeDegrees[r];
+          }
+        }
+      }
+
+      // Subtract the contributions of the given edge from the given node's
+      // numDenied and numSafe members. No action is taken other than to update
+      // these member values. Once updated these numbers can be used by clients
+      // to update the node's allocability.
+      void subtractEdgeContributions(Graph::EdgeItr eItr, Graph::NodeItr nItr) {
+        EdgeData &ed = getHeuristicEdgeData(eItr);
+
+        assert(ed.isUpToDate && "Using out-of-date edge numbers.");
+
+        NodeData &nd = getHeuristicNodeData(nItr);
+        unsigned numRegs = getGraph().getNodeCosts(nItr).getLength() - 1;
+        
+        bool nIsNode1 = nItr == getGraph().getEdgeNode1(eItr);
+        EdgeData::UnsafeArray &unsafe =
+          nIsNode1 ? ed.unsafe : ed.reverseUnsafe;
+        nd.numDenied -= nIsNode1 ? ed.worst : ed.reverseWorst;
+
+        for (unsigned r = 0; r < numRegs; ++r) {
+          if (unsafe[r]) { 
+            if (nd.unsafeDegrees[r] == 1) {
+              ++nd.numSafe;
+            }
+            --nd.unsafeDegrees[r];
+          }
+        }
+      }
+
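+      // The Briggs allocability test: a node is provably allocable if even
+      // the worst-case choices of its neighbours deny fewer options than it
+      // has registers (numDenied < numRegs), or if it has at least one
+      // register that no neighbour can ever deny (numSafe > 0).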
+      void updateAllocability(Graph::NodeItr nItr) {
+        NodeData &nd = getHeuristicNodeData(nItr);
+        unsigned numRegs = getGraph().getNodeCosts(nItr).getLength() - 1;
+        nd.isAllocable = nd.numDenied < numRegs || nd.numSafe > 0;
+      }
+
+      void initializeNode(Graph::NodeItr nItr) {
+        NodeData &nd = getHeuristicNodeData(nItr);
+
+        if (nd.isInitialized)
+          return; // Node data is already up to date.
+
+        unsigned numRegs = getGraph().getNodeCosts(nItr).getLength() - 1;
+
+        nd.numDenied = 0;
+        nd.numSafe = numRegs;
+        nd.unsafeDegrees.resize(numRegs, 0);
+
+        typedef HeuristicSolverImpl<Briggs>::SolverEdgeItr SolverEdgeItr;
+
+        for (SolverEdgeItr aeItr = getSolver().solverEdgesBegin(nItr),
+                           aeEnd = getSolver().solverEdgesEnd(nItr);
+             aeItr != aeEnd; ++aeItr) {
+          
+          Graph::EdgeItr eItr = *aeItr;
+          computeEdgeContributions(eItr);
+          addEdgeContributions(eItr, nItr);
+        }
+
+        updateAllocability(nItr);
+        nd.isInitialized = true;
+      }
+
+      void handleRemoveNode(Graph::NodeItr xnItr) {
+        typedef HeuristicSolverImpl<Briggs>::SolverEdgeItr SolverEdgeItr;
+        std::vector<Graph::EdgeItr> edgesToRemove;
+        for (SolverEdgeItr aeItr = getSolver().solverEdgesBegin(xnItr),
+                           aeEnd = getSolver().solverEdgesEnd(xnItr);
+             aeItr != aeEnd; ++aeItr) {
+          Graph::NodeItr ynItr = getGraph().getEdgeOtherNode(*aeItr, xnItr);
+          handleRemoveEdge(*aeItr, ynItr);
+          edgesToRemove.push_back(*aeItr);
+        }
+        while (!edgesToRemove.empty()) {
+          getSolver().removeSolverEdge(edgesToRemove.back());
+          edgesToRemove.pop_back();
+        }
+      }
+
+      RNAllocableList rnAllocableList;
+      RNUnallocableList rnUnallocableList;
+    };
+
+  }
+}
+
+
+#endif // LLVM_CODEGEN_PBQP_HEURISTICS_BRIGGS_H
diff --git a/src/LLVM/lib/CodeGen/PBQP/Math.h b/src/LLVM/lib/CodeGen/PBQP/Math.h
new file mode 100644
index 0000000..e7598bf
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PBQP/Math.h
@@ -0,0 +1,288 @@
+//===------ Math.h - PBQP Vector and Matrix classes -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_PBQP_MATH_H 
+#define LLVM_CODEGEN_PBQP_MATH_H
+
+#include <cassert>
+#include <algorithm>
+#include <functional>
+
+namespace PBQP {
+
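+// PBQP costs are plain floats; std::numeric_limits<PBQPNum>::infinity() is
+// used throughout the solver to mark denied (illegal) options.
+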
+typedef float PBQPNum;
+
+/// \brief PBQP Vector class.
+class Vector {
+  public:
+
+    /// \brief Construct a PBQP vector of the given size.
+    explicit Vector(unsigned length) :
+      length(length), data(new PBQPNum[length]) {
+      }
+
+    /// \brief Construct a PBQP vector with initializer.
+    Vector(unsigned length, PBQPNum initVal) :
+      length(length), data(new PBQPNum[length]) {
+        std::fill(data, data + length, initVal);
+      }
+
+    /// \brief Copy construct a PBQP vector.
+    Vector(const Vector &v) :
+      length(v.length), data(new PBQPNum[length]) {
+        std::copy(v.data, v.data + length, data);
+      }
+
+    /// \brief Destroy this vector, return its memory.
+    ~Vector() { delete[] data; }
+
+    /// \brief Assignment operator.
+    Vector& operator=(const Vector &v) {
+      if (this == &v)
+        return *this; // Guard against self-assignment freeing live data.
+      delete[] data;
+      length = v.length;
+      data = new PBQPNum[length];
+      std::copy(v.data, v.data + length, data);
+      return *this;
+    }
+
+    /// \brief Return the length of the vector
+    unsigned getLength() const {
+      return length;
+    }
+
+    /// \brief Element access.
+    PBQPNum& operator[](unsigned index) {
+      assert(index < length && "Vector element access out of bounds.");
+      return data[index];
+    }
+
+    /// \brief Const element access.
+    const PBQPNum& operator[](unsigned index) const {
+      assert(index < length && "Vector element access out of bounds.");
+      return data[index];
+    }
+
+    /// \brief Add another vector to this one.
+    Vector& operator+=(const Vector &v) {
+      assert(length == v.length && "Vector length mismatch.");
+      std::transform(data, data + length, v.data, data, std::plus<PBQPNum>()); 
+      return *this;
+    }
+
+    /// \brief Subtract another vector from this one.
+    Vector& operator-=(const Vector &v) {
+      assert(length == v.length && "Vector length mismatch.");
+      std::transform(data, data + length, v.data, data, std::minus<PBQPNum>()); 
+      return *this;
+    }
+
+    /// \brief Returns the index of the minimum value in this vector
+    unsigned minIndex() const {
+      return std::min_element(data, data + length) - data;
+    }
+
+  private:
+    unsigned length;
+    PBQPNum *data;
+};
+
+/// \brief Output a textual representation of the given vector on the given
+///        output stream.
+template <typename OStream>
+OStream& operator<<(OStream &os, const Vector &v) {
+  assert((v.getLength() != 0) && "Zero-length vector badness.");
+
+  os << "[ " << v[0];
+  for (unsigned i = 1; i < v.getLength(); ++i) {
+    os << ", " << v[i];
+  }
+  os << " ]";
+
+  return os;
+} 
+
+
+/// \brief PBQP Matrix class
+class Matrix {
+  public:
+
+    /// \brief Construct a PBQP Matrix with the given dimensions.
+    Matrix(unsigned rows, unsigned cols) :
+      rows(rows), cols(cols), data(new PBQPNum[rows * cols]) {
+    }
+
+    /// \brief Construct a PBQP Matrix with the given dimensions and initial
+    /// value.
+    Matrix(unsigned rows, unsigned cols, PBQPNum initVal) :
+      rows(rows), cols(cols), data(new PBQPNum[rows * cols]) {
+        std::fill(data, data + (rows * cols), initVal);
+    }
+
+    /// \brief Copy construct a PBQP matrix.
+    Matrix(const Matrix &m) :
+      rows(m.rows), cols(m.cols), data(new PBQPNum[rows * cols]) {
+        std::copy(m.data, m.data + (rows * cols), data);  
+    }
+
+    /// \brief Destroy this matrix, return its memory.
+    ~Matrix() { delete[] data; }
+
+    /// \brief Assignment operator.
+    Matrix& operator=(const Matrix &m) {
+      if (this == &m)
+        return *this; // Guard against self-assignment freeing live data.
+      delete[] data;
+      rows = m.rows; cols = m.cols;
+      data = new PBQPNum[rows * cols];
+      std::copy(m.data, m.data + (rows * cols), data);
+      return *this;
+    }
+
+    /// \brief Return the number of rows in this matrix.
+    unsigned getRows() const { return rows; }
+
+    /// \brief Return the number of cols in this matrix.
+    unsigned getCols() const { return cols; }
+
+    /// \brief Matrix element access.
+    PBQPNum* operator[](unsigned r) {
+      assert(r < rows && "Row out of bounds.");
+      return data + (r * cols);
+    }
+
+    /// \brief Matrix element access.
+    const PBQPNum* operator[](unsigned r) const {
+      assert(r < rows && "Row out of bounds.");
+      return data + (r * cols);
+    }
+
+    /// \brief Returns the given row as a vector.
+    Vector getRowAsVector(unsigned r) const {
+      Vector v(cols);
+      for (unsigned c = 0; c < cols; ++c)
+        v[c] = (*this)[r][c];
+      return v; 
+    }
+
+    /// \brief Returns the given column as a vector.
+    Vector getColAsVector(unsigned c) const {
+      Vector v(rows);
+      for (unsigned r = 0; r < rows; ++r)
+        v[r] = (*this)[r][c];
+      return v;
+    }
+
+    /// \brief Reset the matrix to the given value.
+    Matrix& reset(PBQPNum val = 0) {
+      std::fill(data, data + (rows * cols), val);
+      return *this;
+    }
+
+    /// \brief Set a single row of this matrix to the given value.
+    Matrix& setRow(unsigned r, PBQPNum val) {
+      assert(r < rows && "Row out of bounds.");
+      std::fill(data + (r * cols), data + ((r + 1) * cols), val);
+      return *this;
+    }
+
+    /// \brief Set a single column of this matrix to the given value.
+    Matrix& setCol(unsigned c, PBQPNum val) {
+      assert(c < cols && "Column out of bounds.");
+      for (unsigned r = 0; r < rows; ++r)
+        (*this)[r][c] = val;
+      return *this;
+    }
+
+    /// \brief Matrix transpose.
+    Matrix transpose() const {
+      Matrix m(cols, rows);
+      for (unsigned r = 0; r < rows; ++r)
+        for (unsigned c = 0; c < cols; ++c)
+          m[c][r] = (*this)[r][c];
+      return m;
+    }
+
+    /// \brief Returns the diagonal of the matrix as a vector.
+    ///
+    /// Matrix must be square.
+    Vector diagonalize() const {
+      assert(rows == cols && "Attempt to diagonalize non-square matrix.");
+
+      Vector v(rows);
+      for (unsigned r = 0; r < rows; ++r)
+        v[r] = (*this)[r][r];
+      return v;
+    } 
+
+    /// \brief Add the given matrix to this one.
+    Matrix& operator+=(const Matrix &m) {
+      assert(rows == m.rows && cols == m.cols &&
+          "Matrix dimensions mismatch.");
+      std::transform(data, data + (rows * cols), m.data, data,
+          std::plus<PBQPNum>());
+      return *this;
+    }
+
+    /// \brief Returns the minimum of the given row
+    PBQPNum getRowMin(unsigned r) const {
+      assert(r < rows && "Row out of bounds");
+      return *std::min_element(data + (r * cols), data + ((r + 1) * cols));
+    }
+
+    /// \brief Returns the minimum of the given column
+    PBQPNum getColMin(unsigned c) const {
+      PBQPNum minElem = (*this)[0][c];
+      for (unsigned r = 1; r < rows; ++r)
+        if ((*this)[r][c] < minElem) minElem = (*this)[r][c];
+      return minElem;
+    }
+
+    /// \brief Subtracts the given scalar from the elements of the given row.
+    Matrix& subFromRow(unsigned r, PBQPNum val) {
+      assert(r < rows && "Row out of bounds");
+      std::transform(data + (r * cols), data + ((r + 1) * cols),
+          data + (r * cols),
+          std::bind2nd(std::minus<PBQPNum>(), val));
+      return *this;
+    }
+
+    /// \brief Subtracts the given scalar from the elements of the given column.
+    Matrix& subFromCol(unsigned c, PBQPNum val) {
+      for (unsigned r = 0; r < rows; ++r)
+        (*this)[r][c] -= val;
+      return *this;
+    }
+
+    /// \brief Returns true if this is a zero matrix.
+    bool isZero() const {
+      return std::find_if(data, data + (rows * cols),
+          std::bind2nd(std::not_equal_to<PBQPNum>(), 0)) ==
+        data + (rows * cols);
+    }
+
+  private:
+    unsigned rows, cols;
+    PBQPNum *data;
+};
+
+/// \brief Output a textual representation of the given matrix on the given
+///        output stream.
+template <typename OStream>
+OStream& operator<<(OStream &os, const Matrix &m) {
+
+  assert((m.getRows() != 0) && "Zero-row matrix badness.");
+
+  for (unsigned i = 0; i < m.getRows(); ++i) {
+    os << m.getRowAsVector(i);
+  }
+
+  return os;
+}
+
+}
+
+#endif // LLVM_CODEGEN_PBQP_MATH_H
diff --git a/src/LLVM/lib/CodeGen/PBQP/Solution.h b/src/LLVM/lib/CodeGen/PBQP/Solution.h
new file mode 100644
index 0000000..294b537
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PBQP/Solution.h
@@ -0,0 +1,58 @@
+//===-- Solution.h ------- PBQP Solution ------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// PBQP Solution class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_PBQP_SOLUTION_H
+#define LLVM_CODEGEN_PBQP_SOLUTION_H
+
+#include "Math.h"
+#include "Graph.h"
+
+#include <map>
+
+namespace PBQP {
+
+  /// \brief Represents a solution to a PBQP problem.
+  ///
+  /// To get the selection for each node in the problem, use the getSelection
+  /// method.
+  class Solution {
+  private:
+    typedef std::map<Graph::NodeItr, unsigned, NodeItrComparator> SelectionsMap;
+    SelectionsMap selections;
+
+  public:
+
+    /// \brief Number of nodes for which selections have been made.
+    /// @return Number of nodes for which selections have been made.
+    unsigned numNodes() const { return selections.size(); }
+
+    /// \brief Set the selection for a given node.
+    /// @param nItr Node iterator.
+    /// @param selection Selection for nItr.
+    void setSelection(Graph::NodeItr nItr, unsigned selection) {
+      selections[nItr] = selection;
+    }
+
+    /// \brief Get a node's selection.
+    /// @param nItr Node iterator.
+    /// @return The selection for nItr.
+    unsigned getSelection(Graph::NodeItr nItr) const {
+      SelectionsMap::const_iterator sItr = selections.find(nItr);
+      assert(sItr != selections.end() && "No selection for node.");
+      return sItr->second;
+    }
+
+  };
+
+}
+
+#endif // LLVM_CODEGEN_PBQP_SOLUTION_H
diff --git a/src/LLVM/lib/CodeGen/PHIElimination.cpp b/src/LLVM/lib/CodeGen/PHIElimination.cpp
new file mode 100644
index 0000000..147135e
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PHIElimination.cpp
@@ -0,0 +1,367 @@
+//===-- PhiElimination.cpp - Eliminate PHI nodes by inserting copies ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates machine instruction PHI nodes by inserting copy
+// instructions.  This destroys SSA information, but is the desired input for
+// some register allocators.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "phielim"
+#include "PHIElimination.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Function.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumAtomic, "Number of atomic phis lowered");
+STATISTIC(NumReused, "Number of reused lowered phis");
+
+char PHIElimination::ID = 0;
+static RegisterPass<PHIElimination>
+X("phi-node-elimination", "Eliminate PHI nodes for register allocation");
+
+char &llvm::PHIEliminationID = PHIElimination::ID;
+
+void llvm::PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<LiveVariables>();
+  AU.addPreserved<MachineDominatorTree>();
+  // rdar://7401784 This would be nice:
+  // AU.addPreservedID(MachineLoopInfoID);
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool llvm::PHIElimination::runOnMachineFunction(MachineFunction &MF) {
+  MRI = &MF.getRegInfo();
+
+  bool Changed = false;
+
+  // Split critical edges to help the coalescer
+  if (LiveVariables *LV = getAnalysisIfAvailable<LiveVariables>())
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+      Changed |= SplitPHIEdges(MF, *I, *LV);
+
+  // Populate VRegPHIUseCount
+  analyzePHINodes(MF);
+
+  // Eliminate PHI instructions by inserting copies into predecessor blocks.
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+    Changed |= EliminatePHINodes(MF, *I);
+
+  // Remove dead IMPLICIT_DEF instructions.
+  for (SmallPtrSet<MachineInstr*, 4>::iterator I = ImpDefs.begin(),
+         E = ImpDefs.end(); I != E; ++I) {
+    MachineInstr *DefMI = *I;
+    unsigned DefReg = DefMI->getOperand(0).getReg();
+    if (MRI->use_nodbg_empty(DefReg))
+      DefMI->eraseFromParent();
+  }
+
+  // Clean up the lowered PHI instructions.
+  for (LoweredPHIMap::iterator I = LoweredPHIs.begin(), E = LoweredPHIs.end();
+       I != E; ++I)
+    MF.DeleteMachineInstr(I->first);
+
+  LoweredPHIs.clear();
+  ImpDefs.clear();
+  VRegPHIUseCount.clear();
+
+  return Changed;
+}
+
+/// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions in
+/// predecessor basic blocks.
+///
+bool llvm::PHIElimination::EliminatePHINodes(MachineFunction &MF,
+                                             MachineBasicBlock &MBB) {
+  if (MBB.empty() || !MBB.front().isPHI())
+    return false;   // Quick exit for basic blocks without PHIs.
+
+  // Get an iterator to the first instruction after the last PHI node (this may
+  // also be the end of the basic block).
+  MachineBasicBlock::iterator AfterPHIsIt = SkipPHIsAndLabels(MBB, MBB.begin());
+
+  while (MBB.front().isPHI())
+    LowerAtomicPHINode(MBB, AfterPHIsIt);
+
+  return true;
+}
+
+/// isSourceDefinedByImplicitDef - Return true if all sources of the phi node
+/// are implicit_def's.
+static bool isSourceDefinedByImplicitDef(const MachineInstr *MPhi,
+                                         const MachineRegisterInfo *MRI) {
+  for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) {
+    unsigned SrcReg = MPhi->getOperand(i).getReg();
+    const MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
+    if (!DefMI || !DefMI->isImplicitDef())
+      return false;
+  }
+  return true;
+}
+
+// FindCopyInsertPoint - Find a safe place in MBB to insert a copy from SrcReg
+// when following the CFG edge to SuccMBB. This needs to be after any def of
+// SrcReg, but before any subsequent point where control flow might jump out of
+// the basic block.
+MachineBasicBlock::iterator
+llvm::PHIElimination::FindCopyInsertPoint(MachineBasicBlock &MBB,
+                                          MachineBasicBlock &SuccMBB,
+                                          unsigned SrcReg) {
+  // Handle the trivial case trivially.
+  if (MBB.empty())
+    return MBB.begin();
+
+  return MBB.getFirstTerminator();
+}
+
+/// LowerAtomicPHINode - Lower the PHI node at the top of the specified block,
+/// under the assumption that it needs to be lowered in a way that supports
+/// atomic execution of PHIs.  This lowering method is always correct.
+///
+void llvm::PHIElimination::LowerAtomicPHINode(
+                                      MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator AfterPHIsIt) {
+  ++NumAtomic;
+  // Unlink the PHI node from the basic block, but don't delete the PHI yet.
+  MachineInstr *MPhi = MBB.remove(MBB.begin());
+
+  unsigned NumSrcs = (MPhi->getNumOperands() - 1) / 2;
+  unsigned DestReg = MPhi->getOperand(0).getReg();
+  bool isDead = MPhi->getOperand(0).isDead();
+
+  // Create a new register for the incoming PHI arguments.
+  MachineFunction &MF = *MBB.getParent();
+  unsigned IncomingReg = 0;
+  bool reusedIncoming = false;  // Is IncomingReg reused from an earlier PHI?
+
+  // Insert a register to register copy at the top of the current block (but
+  // after any remaining phi nodes) which copies the new incoming register
+  // into the phi node destination.
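+  // Conceptually, for
+  //   %dst = PHI %a, <bb1>, %b, <bb2>
+  // we emit
+  //   bb1:        %incoming = COPY %a
+  //   bb2:        %incoming = COPY %b
+  //   this block: %dst = COPY %incoming
+  // (or reuse %incoming from an identical, already-lowered PHI).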
+  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+  if (isSourceDefinedByImplicitDef(MPhi, MRI))
+    // If all sources of a PHI node are implicit_def, just emit an
+    // implicit_def instead of a copy.
+    BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
+            TII->get(TargetOpcode::IMPLICIT_DEF), DestReg);
+  else {
+    // Can we reuse an earlier PHI node? This only happens for critical edges,
+    // typically those created by tail duplication.
+    unsigned &entry = LoweredPHIs[MPhi];
+    if (entry) {
+      // An identical PHI node was already lowered. Reuse the incoming register.
+      IncomingReg = entry;
+      reusedIncoming = true;
+      ++NumReused;
+      DEBUG(dbgs() << "Reusing %reg" << IncomingReg << " for " << *MPhi);
+    } else {
+      const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg);
+      entry = IncomingReg = MF.getRegInfo().createVirtualRegister(RC);
+    }
+    BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
+            TII->get(TargetOpcode::COPY), DestReg)
+      .addReg(IncomingReg);
+  }
+
+  // Update live variable information if there is any.
+  LiveVariables *LV = getAnalysisIfAvailable<LiveVariables>();
+  if (LV) {
+    MachineInstr *PHICopy = prior(AfterPHIsIt);
+
+    if (IncomingReg) {
+      LiveVariables::VarInfo &VI = LV->getVarInfo(IncomingReg);
+
+      // Increment use count of the newly created virtual register.
+      VI.NumUses++;
+      LV->setPHIJoin(IncomingReg);
+
+      // When we are reusing the incoming register, it may already have been
+      // killed in this block. The old kill will also have been inserted at
+      // AfterPHIsIt, so it appears before the current PHICopy.
+      if (reusedIncoming)
+        if (MachineInstr *OldKill = VI.findKill(&MBB)) {
+          DEBUG(dbgs() << "Remove old kill from " << *OldKill);
+          LV->removeVirtualRegisterKilled(IncomingReg, OldKill);
+          DEBUG(MBB.dump());
+        }
+
+      // Add information to LiveVariables to know that the incoming value is
+      // killed.  Note that because the value is defined in several places (once
+      // each for each incoming block), the "def" block and instruction fields
+      // for the VarInfo is not filled in.
+      LV->addVirtualRegisterKilled(IncomingReg, PHICopy);
+    }
+
+    // Since we are going to be deleting the PHI node, if it is the last use of
+    // any registers, or if the value itself is dead, we need to move this
+    // information over to the new copy we just inserted.
+    LV->removeVirtualRegistersKilled(MPhi);
+
+    // If the result is dead, update LV.
+    if (isDead) {
+      LV->addVirtualRegisterDead(DestReg, PHICopy);
+      LV->removeVirtualRegisterDead(DestReg, MPhi);
+    }
+  }
+
+  // Adjust the VRegPHIUseCount map to account for the removal of this PHI node.
+  for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2)
+    --VRegPHIUseCount[BBVRegPair(MPhi->getOperand(i+1).getMBB()->getNumber(),
+                                 MPhi->getOperand(i).getReg())];
+
+  // Now loop over all of the incoming arguments, changing them to copy into the
+  // IncomingReg register in the corresponding predecessor basic block.
+  SmallPtrSet<MachineBasicBlock*, 8> MBBsInsertedInto;
+  for (int i = NumSrcs - 1; i >= 0; --i) {
+    unsigned SrcReg = MPhi->getOperand(i*2+1).getReg();
+    assert(TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+           "Machine PHI Operands must all be virtual registers!");
+
+    // Get the MachineBasicBlock equivalent of the BasicBlock that is the
+    // source of this PHI entry.
+    MachineBasicBlock &opBlock = *MPhi->getOperand(i*2+2).getMBB();
+
+    // If source is defined by an implicit def, there is no need to insert a
+    // copy.
+    MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
+    if (DefMI->isImplicitDef()) {
+      ImpDefs.insert(DefMI);
+      continue;
+    }
+
+    // Check to make sure we haven't already emitted the copy for this block.
+    // This can happen because PHI nodes may have multiple entries for the same
+    // basic block.
+    if (!MBBsInsertedInto.insert(&opBlock))
+      continue;  // If the copy has already been emitted, we're done.
+
+    // Find a safe location to insert the copy; this may be the first terminator
+    // in the block (or end()).
+    MachineBasicBlock::iterator InsertPos =
+      FindCopyInsertPoint(opBlock, MBB, SrcReg);
+
+    // Insert the copy.
+    if (!reusedIncoming && IncomingReg)
+      BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
+              TII->get(TargetOpcode::COPY), IncomingReg).addReg(SrcReg);
+
+    // Now update live variable information if we have it.  Otherwise we're done.
+    if (!LV) continue;
+
+    // We want to be able to insert a kill of the register if this PHI (aka, the
+    // copy we just inserted) is the last use of the source value.  Live
+    // variable analysis conservatively handles this by saying that the value is
+    // live until the end of the block the PHI entry lives in.  If the value
+    // really is dead at the PHI copy, there will be no successor blocks which
+    // have the value live-in.
+
+    // Also check to see if this register is in use by another PHI node which
+    // has not yet been eliminated.  If so, it will be killed at an appropriate
+    // point later.
+
+    // Is it used by any PHI instructions in this block?
+    bool ValueIsUsed = VRegPHIUseCount[BBVRegPair(opBlock.getNumber(), SrcReg)];
+
+    // Okay, if we now know that the value is not live out of the block, we can
+    // add a kill marker in this block saying that it kills the incoming value!
+    if (!ValueIsUsed && !LV->isLiveOut(SrcReg, opBlock)) {
+      // In our final twist, we have to decide which instruction kills the
+      // register.  In most cases this is the copy; however, the first
+      // terminator instruction at the end of the block may also use the value.
+      // In this case, we should mark *it* as the killing instruction, not the
+      // copy.
+      MachineBasicBlock::iterator KillInst;
+      MachineBasicBlock::iterator Term = opBlock.getFirstTerminator();
+      if (Term != opBlock.end() && Term->readsRegister(SrcReg)) {
+        KillInst = Term;
+
+        // Check that no other terminators use the value.
+#ifndef NDEBUG
+        for (MachineBasicBlock::iterator TI = llvm::next(Term);
+             TI != opBlock.end(); ++TI) {
+          assert(!TI->readsRegister(SrcReg) &&
+                 "Terminator instructions cannot use virtual registers unless "
+                 "they are the first terminator in a block!");
+        }
+#endif
+      } else if (reusedIncoming || !IncomingReg) {
+        // We may have to rewind a bit if we didn't insert a copy this time.
+        KillInst = Term;
+        while (KillInst != opBlock.begin())
+          if ((--KillInst)->readsRegister(SrcReg))
+            break;
+      } else {
+        // We just inserted this copy.
+        KillInst = prior(InsertPos);
+      }
+      assert(KillInst->readsRegister(SrcReg) && "Cannot find kill instruction");
+
+      // Finally, mark it killed.
+      LV->addVirtualRegisterKilled(SrcReg, KillInst);
+
+      // This vreg no longer lives all of the way through opBlock.
+      unsigned opBlockNum = opBlock.getNumber();
+      LV->getVarInfo(SrcReg).AliveBlocks.reset(opBlockNum);
+    }
+  }
+
+  // Really delete the PHI instruction now, if it is not in the LoweredPHIs map.
+  if (reusedIncoming || !IncomingReg)
+    MF.DeleteMachineInstr(MPhi);
+}
+
+/// analyzePHINodes - Gather information about the PHI nodes here. In
+/// particular, we count, for each (predecessor BB, vreg) pair, how many PHI
+/// operands use that vreg coming from that BB. This is used later to
+/// determine when the vreg is killed in the BB.
+///
+void llvm::PHIElimination::analyzePHINodes(const MachineFunction& MF) {
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I)
+    for (MachineBasicBlock::const_iterator BBI = I->begin(), BBE = I->end();
+         BBI != BBE && BBI->isPHI(); ++BBI)
+      for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
+        ++VRegPHIUseCount[BBVRegPair(BBI->getOperand(i+1).getMBB()->getNumber(),
+                                     BBI->getOperand(i).getReg())];
+}
+
+bool llvm::PHIElimination::SplitPHIEdges(MachineFunction &MF,
+                                         MachineBasicBlock &MBB,
+                                         LiveVariables &LV) {
+  if (MBB.empty() || !MBB.front().isPHI())
+    return false;   // Quick exit for basic blocks without PHIs.
+
+  for (MachineBasicBlock::const_iterator BBI = MBB.begin(), BBE = MBB.end();
+       BBI != BBE && BBI->isPHI(); ++BBI) {
+    for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) {
+      unsigned Reg = BBI->getOperand(i).getReg();
+      MachineBasicBlock *PreMBB = BBI->getOperand(i+1).getMBB();
+      // We break edges when registers are live out from the predecessor block
+      // (not considering PHI nodes). If the register is live in to this block
+      // anyway, we would gain nothing from splitting.
+      if (!LV.isLiveIn(Reg, MBB) && LV.isLiveOut(Reg, *PreMBB))
+        PreMBB->SplitCriticalEdge(&MBB, this);
+    }
+  }
+  return true;
+}
diff --git a/src/LLVM/lib/CodeGen/PHIElimination.h b/src/LLVM/lib/CodeGen/PHIElimination.h
new file mode 100644
index 0000000..f183de6
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PHIElimination.h
@@ -0,0 +1,113 @@
+//===-- lib/CodeGen/PHIElimination.h ----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_PHIELIMINATION_HPP
+#define LLVM_CODEGEN_PHIELIMINATION_HPP
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+  class LiveVariables;
+  
+  /// Lower PHI instructions to copies.  
+  class PHIElimination : public MachineFunctionPass {
+    MachineRegisterInfo  *MRI; // Machine register information
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    PHIElimination() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+    
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+  private:
+    /// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions
+    /// in predecessor basic blocks.
+    ///
+    bool EliminatePHINodes(MachineFunction &MF, MachineBasicBlock &MBB);
+    void LowerAtomicPHINode(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator AfterPHIsIt);
+
+    /// analyzePHINodes - Gather information about the PHI nodes here. In
+    /// particular, we count, for each (predecessor BB, vreg) pair, how
+    /// many PHI operands use that vreg coming from that BB. This is used
+    /// later to determine when the vreg is killed in the BB.
+    ///
+    void analyzePHINodes(const MachineFunction& Fn);
+
+    /// Split critical edges where necessary for good coalescer performance.
+    bool SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB,
+                       LiveVariables &LV);
+
+    /// SplitCriticalEdge - Split a critical edge from A to B by
+    /// inserting a new MBB. Update branches in A and PHI instructions
+    /// in B. Return the new block.
+    MachineBasicBlock *SplitCriticalEdge(MachineBasicBlock *A,
+                                         MachineBasicBlock *B);
+
+    /// FindCopyInsertPoint - Find a safe place in MBB to insert a copy from
+    /// SrcReg when following the CFG edge to SuccMBB. This needs to be after
+    /// any def of SrcReg, but before any subsequent point where control flow
+    /// might jump out of the basic block.
+    MachineBasicBlock::iterator FindCopyInsertPoint(MachineBasicBlock &MBB,
+                                                    MachineBasicBlock &SuccMBB,
+                                                    unsigned SrcReg);
+
+    // SkipPHIsAndLabels - Copies need to be inserted after phi nodes and
+    // also after any exception handling labels: in landing pads execution
+    // starts at the label, so any copies placed before it won't be executed!
+    // We also deal with DBG_VALUEs, which are a bit tricky:
+    //  PHI
+    //  DBG_VALUE
+    //  LABEL
+    // Here the DBG_VALUE needs to be skipped, and if it refers to a PHI it
+    // needs to be annulled or, better, moved to follow the label, as well.
+    //  PHI
+    //  DBG_VALUE
+    //  no label
+    // Here it is not a good idea to skip the DBG_VALUE.
+    // FIXME: For now we skip and annul all DBG_VALUEs, maximally simple and
+    // maximally stupid.
+    MachineBasicBlock::iterator SkipPHIsAndLabels(MachineBasicBlock &MBB,
+                                                MachineBasicBlock::iterator I) {
+      // Rather than assuming that EH labels come before other kinds of labels,
+      // just skip all labels.
+      while (I != MBB.end() && 
+             (I->isPHI() || I->isLabel() || I->isDebugValue())) {
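+        // Annul register-referencing DBG_VALUEs by clearing the register
+        // operand, as described in the FIXME above.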
+        if (I->isDebugValue() && I->getNumOperands()==3 && 
+            I->getOperand(0).isReg())
+          I->getOperand(0).setReg(0U);
+        ++I;
+      }
+      return I;
+    }
+
+    typedef std::pair<unsigned, unsigned> BBVRegPair;
+    typedef DenseMap<BBVRegPair, unsigned> VRegPHIUse;
+
+    VRegPHIUse VRegPHIUseCount;
+
+    // Defs of PHI sources which are implicit_def.
+    SmallPtrSet<MachineInstr*, 4> ImpDefs;
+
+    // Map reusable lowered PHI node -> incoming join register.
+    typedef DenseMap<MachineInstr*, unsigned,
+                     MachineInstrExpressionTrait> LoweredPHIMap;
+    LoweredPHIMap LoweredPHIs;
+  };
+
+}
+
+#endif /* LLVM_CODEGEN_PHIELIMINATION_HPP */
diff --git a/src/LLVM/lib/CodeGen/Passes.cpp b/src/LLVM/lib/CodeGen/Passes.cpp
new file mode 100644
index 0000000..3489db2
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/Passes.cpp
@@ -0,0 +1,68 @@
+//===-- Passes.cpp - Target independent code generation passes ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines interfaces to access the target independent code
+// generation passes provided by the LLVM backend.
+//
+//===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/Passes.h"
+
+using namespace llvm;
+
+//===---------------------------------------------------------------------===//
+///
+/// RegisterRegAlloc class - Track the registration of register allocators.
+///
+//===---------------------------------------------------------------------===//
+MachinePassRegistry RegisterRegAlloc::Registry;
+
+static FunctionPass *createDefaultRegisterAllocator() { return 0; }
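+// Returning null marks the "default" choice; createRegisterAllocator() below
+// substitutes a real allocator based on the optimization level.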
+static RegisterRegAlloc
+defaultRegAlloc("default",
+                "pick register allocator based on -O option",
+                createDefaultRegisterAllocator);
+
+//===---------------------------------------------------------------------===//
+///
+/// RegAlloc command line options.
+///
+//===---------------------------------------------------------------------===//
+static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
+               RegisterPassParser<RegisterRegAlloc> >
+RegAlloc("regalloc",
+         cl::init(&createDefaultRegisterAllocator),
+         cl::desc("Register allocator to use"));
+
+
+//===---------------------------------------------------------------------===//
+///
+/// createRegisterAllocator - choose the appropriate register allocator.
+///
+//===---------------------------------------------------------------------===//
+FunctionPass *llvm::createRegisterAllocator(CodeGenOpt::Level OptLevel) {
+  RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault();
+
+  if (!Ctor) {
+    Ctor = RegAlloc;
+    RegisterRegAlloc::setDefault(RegAlloc);
+  }
+
+  if (Ctor != createDefaultRegisterAllocator)
+    return Ctor();
+
+  // When the 'default' allocator is requested, pick one based on OptLevel.
+  switch (OptLevel) {
+  case CodeGenOpt::None:
+    return createFastRegisterAllocator();
+  default:
+    return createLinearScanRegisterAllocator();
+  }
+}
diff --git a/src/LLVM/lib/CodeGen/PeepholeOptimizer.cpp b/src/LLVM/lib/CodeGen/PeepholeOptimizer.cpp
new file mode 100644
index 0000000..96a2b6e
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PeepholeOptimizer.cpp
@@ -0,0 +1,293 @@
+//===-- PeepholeOptimizer.cpp - Peephole Optimizations --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Perform peephole optimizations on the machine code:
+//
+// - Optimize Extensions
+//
+//     Optimization of sign / zero extension instructions. It may be extended to
+//     handle other instructions with similar properties.
+//
+//     On some targets, some instructions, e.g. X86 sign / zero extension, may
+//     leave the source value in the lower part of the result. This optimization
+//     will replace some uses of the pre-extension value with uses of the
+//     sub-register of the result.
+//
+// - Optimize Comparisons
+//
+//     Optimization of comparison instructions. For instance, in this code:
+//
+//       sub r1, 1
+//       cmp r1, 0
+//       bz  L1
+//
+//     If the "sub" instruction all ready sets (or could be modified to set) the
+//     same flag that the "cmp" instruction sets and that "bz" uses, then we can
+//     eliminate the "cmp" instruction.
+// 
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "peephole-opt"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+// Optimize Extensions
+static cl::opt<bool>
+Aggressive("aggressive-ext-opt", cl::Hidden,
+           cl::desc("Aggressive extension optimization"));
+
+STATISTIC(NumReuse, "Number of extension results reused");
+
+// Optimize Comparisons
+static cl::opt<bool>
+EnableOptCmps("enable-optimize-cmps", cl::init(true), cl::Hidden);
+
+STATISTIC(NumEliminated, "Number of compares eliminated");
+
+namespace {
+  class PeepholeOptimizer : public MachineFunctionPass {
+    const TargetMachine   *TM;
+    const TargetInstrInfo *TII;
+    MachineRegisterInfo   *MRI;
+    MachineDominatorTree  *DT;  // Machine dominator tree
+
+  public:
+    static char ID; // Pass identification
+    PeepholeOptimizer() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+      if (Aggressive) {
+        AU.addRequired<MachineDominatorTree>();
+        AU.addPreserved<MachineDominatorTree>();
+      }
+    }
+
+  private:
+    bool OptimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
+    bool OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+                          SmallPtrSet<MachineInstr*, 8> &LocalMIs);
+  };
+}
+
+char PeepholeOptimizer::ID = 0;
+INITIALIZE_PASS(PeepholeOptimizer, "peephole-opts",
+                "Peephole Optimizations", false, false);
+
+FunctionPass *llvm::createPeepholeOptimizerPass() {
+  return new PeepholeOptimizer();
+}
+
+/// OptimizeExtInstr - If instruction is a copy-like instruction, i.e. it reads
+/// a single register and writes a single register and it does not modify the
+/// source, and if the source value is preserved as a sub-register of the
+/// result, then replace all reachable uses of the source with the subreg of the
+/// result.
+/// 
+/// Do not generate an EXTRACT that is used only in a debug use, as this changes
+/// the code. Since this code does not currently share EXTRACTs, just ignore all
+/// debug uses.
+bool PeepholeOptimizer::
+OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+                 SmallPtrSet<MachineInstr*, 8> &LocalMIs) {
+  LocalMIs.insert(MI);
+
+  unsigned SrcReg, DstReg, SubIdx;
+  if (!TII->isCoalescableExtInstr(*MI, SrcReg, DstReg, SubIdx))
+    return false;
+
+  if (TargetRegisterInfo::isPhysicalRegister(DstReg) ||
+      TargetRegisterInfo::isPhysicalRegister(SrcReg))
+    return false;
+
+  MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(SrcReg);
+  if (++UI == MRI->use_nodbg_end())
+    // No other uses.
+    return false;
+
+  // The source has other uses. See if we can replace the other uses with use of
+  // the result of the extension.
+  SmallPtrSet<MachineBasicBlock*, 4> ReachedBBs;
+  UI = MRI->use_nodbg_begin(DstReg);
+  for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end();
+       UI != UE; ++UI)
+    ReachedBBs.insert(UI->getParent());
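+  // ReachedBBs now holds every block that already uses the extension result;
+  // source uses in those blocks can be rewritten without extending liveness.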
+
+  // Uses that are in the same BB as uses of the result of the instruction.
+  SmallVector<MachineOperand*, 8> Uses;
+
+  // Uses that the result of the instruction can reach.
+  SmallVector<MachineOperand*, 8> ExtendedUses;
+
+  bool ExtendLife = true;
+  UI = MRI->use_nodbg_begin(SrcReg);
+  for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end();
+       UI != UE; ++UI) {
+    MachineOperand &UseMO = UI.getOperand();
+    MachineInstr *UseMI = &*UI;
+    if (UseMI == MI)
+      continue;
+
+    if (UseMI->isPHI()) {
+      ExtendLife = false;
+      continue;
+    }
+
+    // It's an error to translate this:
+    //
+    //    %reg1025 = <sext> %reg1024
+    //     ...
+    //    %reg1026 = SUBREG_TO_REG 0, %reg1024, 4
+    //
+    // into this:
+    //
+    //    %reg1025 = <sext> %reg1024
+    //     ...
+    //    %reg1027 = COPY %reg1025:4
+    //    %reg1026 = SUBREG_TO_REG 0, %reg1027, 4
+    //
+    // The problem here is that SUBREG_TO_REG is there to assert that an
+    // implicit zext occurs. It doesn't insert a zext instruction. If we allow
+    // the COPY here, it will give us the value after the <sext>, not the
+    // original value of %reg1024 before <sext>.
+    if (UseMI->getOpcode() == TargetOpcode::SUBREG_TO_REG)
+      continue;
+
+    MachineBasicBlock *UseMBB = UseMI->getParent();
+    if (UseMBB == MBB) {
+      // Local uses that come after the extension.
+      if (!LocalMIs.count(UseMI))
+        Uses.push_back(&UseMO);
+    } else if (ReachedBBs.count(UseMBB)) {
+      // Non-local uses where the result of the extension is used. Always
+      // replace these unless it's a PHI.
+      Uses.push_back(&UseMO);
+    } else if (Aggressive && DT->dominates(MBB, UseMBB)) {
+      // We may want to extend the live range of the extension result in order
+      // to replace these uses.
+      ExtendedUses.push_back(&UseMO);
+    } else {
+      // Both will be live out of the def MBB anyway. Don't extend live range of
+      // the extension result.
+      ExtendLife = false;
+      break;
+    }
+  }
+
+  if (ExtendLife && !ExtendedUses.empty())
+    // Extend the liveness of the extension result.
+    std::copy(ExtendedUses.begin(), ExtendedUses.end(),
+              std::back_inserter(Uses));
+
+  // Now replace all uses.
+  bool Changed = false;
+  if (!Uses.empty()) {
+    SmallPtrSet<MachineBasicBlock*, 4> PHIBBs;
+
+    // Look for PHI uses of the extended result; we don't want to extend the
+    // liveness of a PHI input. It breaks all kinds of assumptions downstream.
+    // A PHI use is expected to be the kill of its source values.
+    UI = MRI->use_nodbg_begin(DstReg);
+    for (MachineRegisterInfo::use_nodbg_iterator
+           UE = MRI->use_nodbg_end(); UI != UE; ++UI)
+      if (UI->isPHI())
+        PHIBBs.insert(UI->getParent());
+
+    const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
+    for (unsigned i = 0, e = Uses.size(); i != e; ++i) {
+      MachineOperand *UseMO = Uses[i];
+      MachineInstr *UseMI = UseMO->getParent();
+      MachineBasicBlock *UseMBB = UseMI->getParent();
+      if (PHIBBs.count(UseMBB))
+        continue;
+
+      unsigned NewVR = MRI->createVirtualRegister(RC);
+      BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(),
+              TII->get(TargetOpcode::COPY), NewVR)
+        .addReg(DstReg, 0, SubIdx);
+
+      UseMO->setReg(NewVR);
+      ++NumReuse;
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+/// OptimizeCmpInstr - If the instruction is a compare and the previous
+/// instruction it's comparing against already sets (or could be modified to
+/// set) the same flag as the compare, then we can remove the comparison and use
+/// the flag from the previous instruction.
+bool PeepholeOptimizer::OptimizeCmpInstr(MachineInstr *MI,
+                                         MachineBasicBlock *MBB) {
+  if (!EnableOptCmps) return false;
+
+  // If this instruction is a comparison against zero and isn't comparing a
+  // physical register, we can try to optimize it.
+  unsigned SrcReg;
+  int CmpValue;
+  if (!TII->AnalyzeCompare(MI, SrcReg, CmpValue) ||
+      TargetRegisterInfo::isPhysicalRegister(SrcReg) || CmpValue != 0)
+    return false;
+
+  MachineRegisterInfo::def_iterator DI = MRI->def_begin(SrcReg);
+  if (llvm::next(DI) != MRI->def_end())
+    // Only support one definition.
+    return false;
+
+  // Attempt to convert the defining instruction to set the "zero" flag.
+  if (TII->ConvertToSetZeroFlag(&*DI, MI)) {
+    ++NumEliminated;
+    return true;
+  }
+
+  return false;
+}
+
+bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
+  TM  = &MF.getTarget();
+  TII = TM->getInstrInfo();
+  MRI = &MF.getRegInfo();
+  DT  = Aggressive ? &getAnalysis<MachineDominatorTree>() : 0;
+
+  bool Changed = false;
+
+  SmallPtrSet<MachineInstr*, 8> LocalMIs;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = &*I;
+    LocalMIs.clear();
+
+    for (MachineBasicBlock::iterator
+           MII = I->begin(), ME = I->end(); MII != ME; ) {
+      MachineInstr *MI = &*MII;
+
+      if (MI->getDesc().isCompare()) {
+        ++MII; // The iterator may become invalid if the compare is deleted.
+        Changed |= OptimizeCmpInstr(MI, MBB);
+      } else {
+        Changed |= OptimizeExtInstr(MI, MBB, LocalMIs);
+        ++MII;
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/src/LLVM/lib/CodeGen/PostRAHazardRecognizer.cpp b/src/LLVM/lib/CodeGen/PostRAHazardRecognizer.cpp
new file mode 100644
index 0000000..cbde2b0
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PostRAHazardRecognizer.cpp
@@ -0,0 +1,180 @@
+//===----- PostRAHazardRecognizer.cpp - hazard recognizer -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a hazard recognizer using the instruction itineraries
+// defined for the current target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "post-RA-sched"
+#include "llvm/CodeGen/PostRAHazardRecognizer.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrItineraries.h"
+
+using namespace llvm;
+
+PostRAHazardRecognizer::
+PostRAHazardRecognizer(const InstrItineraryData &LItinData) :
+  ScheduleHazardRecognizer(), ItinData(LItinData) {
+  // Determine the maximum depth of any itinerary. This determines the
+  // depth of the scoreboard. We always make the scoreboard at least 1
+  // cycle deep to avoid dealing with the boundary condition.
+  unsigned ScoreboardDepth = 1;
+  if (!ItinData.isEmpty()) {
+    for (unsigned idx = 0; ; ++idx) {
+      if (ItinData.isEndMarker(idx))
+        break;
+
+      const InstrStage *IS = ItinData.beginStage(idx);
+      const InstrStage *E = ItinData.endStage(idx);
+      unsigned ItinDepth = 0;
+      for (; IS != E; ++IS)
+        ItinDepth += IS->getCycles();
+
+      ScoreboardDepth = std::max(ScoreboardDepth, ItinDepth);
+    }
+  }
+
+  ReservedScoreboard.reset(ScoreboardDepth);
+  RequiredScoreboard.reset(ScoreboardDepth);
+
+  DEBUG(dbgs() << "Using post-ra hazard recognizer: ScoreboardDepth = " 
+               << ScoreboardDepth << '\n');
+}
+
+void PostRAHazardRecognizer::Reset() {
+  RequiredScoreboard.reset();
+  ReservedScoreboard.reset();
+}
+
+void PostRAHazardRecognizer::ScoreBoard::dump() const {
+  dbgs() << "Scoreboard:\n";
+
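+  // Trim trailing all-zero entries so only occupied cycles are printed.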
+  unsigned last = Depth - 1;
+  while ((last > 0) && ((*this)[last] == 0))
+    last--;
+
+  for (unsigned i = 0; i <= last; i++) {
+    unsigned FUs = (*this)[i];
+    dbgs() << "\t";
+    for (int j = 31; j >= 0; j--)
+      dbgs() << ((FUs & (1 << j)) ? '1' : '0');
+    dbgs() << '\n';
+  }
+}
+
+ScheduleHazardRecognizer::HazardType
+PostRAHazardRecognizer::getHazardType(SUnit *SU) {
+  if (ItinData.isEmpty())
+    return NoHazard;
+
+  unsigned cycle = 0;
+
+  // Use the itinerary for the underlying instruction to check for
+  // free FU's in the scoreboard at the appropriate future cycles.
+  unsigned idx = SU->getInstr()->getDesc().getSchedClass();
+  for (const InstrStage *IS = ItinData.beginStage(idx),
+         *E = ItinData.endStage(idx); IS != E; ++IS) {
+    // We must find one of the stage's units free for every cycle the
+    // stage is occupied. FIXME: it would be more accurate to find the
+    // same unit free in all the cycles.
+    for (unsigned int i = 0; i < IS->getCycles(); ++i) {
+      assert(((cycle + i) < RequiredScoreboard.getDepth()) &&
+             "Scoreboard depth exceeded!");
+
+      unsigned freeUnits = IS->getUnits();
+      switch (IS->getReservationKind()) {
+      default:
+       assert(0 && "Invalid FU reservation");
+      case InstrStage::Required:
+        // Required FUs conflict with both reserved and required ones
+        freeUnits &= ~ReservedScoreboard[cycle + i];
+        // FALLTHROUGH
+      case InstrStage::Reserved:
+        // Reserved FUs can conflict only with required ones.
+        freeUnits &= ~RequiredScoreboard[cycle + i];
+        break;
+      }
+
+      if (!freeUnits) {
+        DEBUG(dbgs() << "*** Hazard in cycle " << (cycle + i) << ", ");
+        DEBUG(dbgs() << "SU(" << SU->NodeNum << "): ");
+        DEBUG(SU->getInstr()->dump());
+        return Hazard;
+      }
+    }
+
+    // Advance the cycle to the next stage.
+    cycle += IS->getNextCycles();
+  }
+
+  return NoHazard;
+}
+
+void PostRAHazardRecognizer::EmitInstruction(SUnit *SU) {
+  if (ItinData.isEmpty())
+    return;
+
+  unsigned cycle = 0;
+
+  // Use the itinerary for the underlying instruction to reserve FU's
+  // in the scoreboard at the appropriate future cycles.
+  unsigned idx = SU->getInstr()->getDesc().getSchedClass();
+  for (const InstrStage *IS = ItinData.beginStage(idx),
+         *E = ItinData.endStage(idx); IS != E; ++IS) {
+    // We must reserve one of the stage's units for every cycle the
+    // stage is occupied. FIXME: it would be more accurate to reserve
+    // the same unit in all the cycles.
+    for (unsigned int i = 0; i < IS->getCycles(); ++i) {
+      assert(((cycle + i) < RequiredScoreboard.getDepth()) &&
+             "Scoreboard depth exceeded!");
+
+      unsigned freeUnits = IS->getUnits();
+      switch (IS->getReservationKind()) {
+      default:
+       assert(0 && "Invalid FU reservation");
+      case InstrStage::Required:
+        // Required FUs conflict with both reserved and required ones
+        freeUnits &= ~ReservedScoreboard[cycle + i];
+        // FALLTHROUGH
+      case InstrStage::Reserved:
+        // Reserved FUs can conflict only with required ones.
+        freeUnits &= ~RequiredScoreboard[cycle + i];
+        break;
+      }
+
+      // reduce to a single unit
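+      // x & (x - 1) clears the lowest set bit; looping until no bits remain
+      // leaves the highest set bit in freeUnit, selecting a single FU.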
+      unsigned freeUnit = 0;
+      do {
+        freeUnit = freeUnits;
+        freeUnits = freeUnit & (freeUnit - 1);
+      } while (freeUnits);
+
+      assert(freeUnit && "No function unit available!");
+      if (IS->getReservationKind() == InstrStage::Required)
+        RequiredScoreboard[cycle + i] |= freeUnit;
+      else
+        ReservedScoreboard[cycle + i] |= freeUnit;
+    }
+
+    // Advance the cycle to the next stage.
+    cycle += IS->getNextCycles();
+  }
+
+  DEBUG(ReservedScoreboard.dump());
+  DEBUG(RequiredScoreboard.dump());
+}
+
+void PostRAHazardRecognizer::AdvanceCycle() {
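+  // Retire the current cycle: clear its scoreboard slot and rotate the
+  // circular window forward by one.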
+  ReservedScoreboard[0] = 0; ReservedScoreboard.advance();
+  RequiredScoreboard[0] = 0; RequiredScoreboard.advance();
+}
diff --git a/src/LLVM/lib/CodeGen/PostRASchedulerList.cpp b/src/LLVM/lib/CodeGen/PostRASchedulerList.cpp
new file mode 100644
index 0000000..f0bd6d1
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PostRASchedulerList.cpp
@@ -0,0 +1,694 @@
+//===----- PostRASchedulerList.cpp - list scheduler -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a top-down list scheduler, using standard algorithms.
+// The basic approach uses a priority queue of available nodes to schedule.
+// One at a time, nodes are taken from the priority queue (thus in priority
+// order), checked for legality to schedule, and emitted if legal.
+//
+// Nodes may not be legal to schedule either due to structural hazards (e.g.
+// pipeline or resource constraints) or because an input to the instruction has
+// not completed execution.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "post-RA-sched"
+#include "AntiDepBreaker.h"
+#include "AggressiveAntiDepBreaker.h"
+#include "CriticalAntiDepBreaker.h"
+#include "ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtarget.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumNoops, "Number of noops inserted");
+STATISTIC(NumStalls, "Number of pipeline stalls");
+STATISTIC(NumFixedAnti, "Number of fixed anti-dependencies");
+
+// Post-RA scheduling is enabled with
+// TargetSubtarget.enablePostRAScheduler(). This flag can be used to
+// override the target.
+static cl::opt<bool>
+EnablePostRAScheduler("post-RA-scheduler",
+                       cl::desc("Enable scheduling after register allocation"),
+                       cl::init(false), cl::Hidden);
+static cl::opt<std::string>
+EnableAntiDepBreaking("break-anti-dependencies",
+                      cl::desc("Break post-RA scheduling anti-dependencies: "
+                               "\"critical\", \"all\", or \"none\""),
+                      cl::init("none"), cl::Hidden);
+
+// If DebugDiv > 0 then only schedule MBB with (ID % DebugDiv) == DebugMod
+static cl::opt<int>
+DebugDiv("postra-sched-debugdiv",
+                      cl::desc("Debug control MBBs that are scheduled"),
+                      cl::init(0), cl::Hidden);
+static cl::opt<int>
+DebugMod("postra-sched-debugmod",
+                      cl::desc("Debug control MBBs that are scheduled"),
+                      cl::init(0), cl::Hidden);
+
+AntiDepBreaker::~AntiDepBreaker() { }
+
+namespace {
+  class PostRAScheduler : public MachineFunctionPass {
+    AliasAnalysis *AA;
+    const TargetInstrInfo *TII;
+    CodeGenOpt::Level OptLevel;
+
+  public:
+    static char ID;
+    PostRAScheduler(CodeGenOpt::Level ol) :
+      MachineFunctionPass(ID), OptLevel(ol) {}
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    const char *getPassName() const {
+      return "Post RA top-down list latency scheduler";
+    }
+
+    bool runOnMachineFunction(MachineFunction &Fn);
+  };
+  char PostRAScheduler::ID = 0;
+
+  class SchedulePostRATDList : public ScheduleDAGInstrs {
+    /// AvailableQueue - The priority queue to use for the available SUnits.
+    ///
+    LatencyPriorityQueue AvailableQueue;
+
+    /// PendingQueue - This contains all of the instructions whose operands have
+    /// been issued, but their results are not ready yet (due to the latency of
+    /// the operation).  Once the operands become available, the instruction is
+    /// added to the AvailableQueue.
+    std::vector<SUnit*> PendingQueue;
+
+    /// Topo - A topological ordering for SUnits.
+    ScheduleDAGTopologicalSort Topo;
+
+    /// HazardRec - The hazard recognizer to use.
+    ScheduleHazardRecognizer *HazardRec;
+
+    /// AntiDepBreak - Anti-dependence breaking object, or NULL if none
+    AntiDepBreaker *AntiDepBreak;
+
+    /// AA - AliasAnalysis for making memory reference queries.
+    AliasAnalysis *AA;
+
+    /// KillIndices - The index of the most recent kill (proceeding bottom-up),
+    /// or ~0u if the register is not live.
+    std::vector<unsigned> KillIndices;
+
+  public:
+    SchedulePostRATDList(MachineFunction &MF,
+                         const MachineLoopInfo &MLI,
+                         const MachineDominatorTree &MDT,
+                         ScheduleHazardRecognizer *HR,
+                         AntiDepBreaker *ADB,
+                         AliasAnalysis *aa)
+      : ScheduleDAGInstrs(MF, MLI, MDT), Topo(SUnits),
+        HazardRec(HR), AntiDepBreak(ADB), AA(aa),
+        KillIndices(TRI->getNumRegs()) {}
+
+    ~SchedulePostRATDList() {
+    }
+
+    /// StartBlock - Initialize register live-range state for scheduling in
+    /// this block.
+    ///
+    void StartBlock(MachineBasicBlock *BB);
+
+    /// Schedule - Schedule the instruction range using list scheduling.
+    ///
+    void Schedule();
+
+    /// Observe - Update liveness information to account for the current
+    /// instruction, which will not be scheduled.
+    ///
+    void Observe(MachineInstr *MI, unsigned Count);
+
+    /// FinishBlock - Clean up register live-range state.
+    ///
+    void FinishBlock();
+
+    /// FixupKills - Fix register kill flags that have been made
+    /// invalid due to scheduling
+    ///
+    void FixupKills(MachineBasicBlock *MBB);
+
+  private:
+    void ReleaseSucc(SUnit *SU, SDep *SuccEdge);
+    void ReleaseSuccessors(SUnit *SU);
+    void ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle);
+    void ListScheduleTopDown();
+    void StartBlockForKills(MachineBasicBlock *BB);
+
+    // ToggleKillFlag - Toggle a register operand kill flag. Other
+    // adjustments may be made to the instruction if necessary. Return
+    // true if the operand has been deleted, false if not.
+    bool ToggleKillFlag(MachineInstr *MI, MachineOperand &MO);
+  };
+}
+
+bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
+  AA = &getAnalysis<AliasAnalysis>();
+  TII = Fn.getTarget().getInstrInfo();
+
+  // Check for explicit enable/disable of post-ra scheduling.
+  TargetSubtarget::AntiDepBreakMode AntiDepMode = TargetSubtarget::ANTIDEP_NONE;
+  SmallVector<TargetRegisterClass*, 4> CriticalPathRCs;
+  if (EnablePostRAScheduler.getPosition() > 0) {
+    if (!EnablePostRAScheduler)
+      return false;
+  } else {
+    // Check that post-RA scheduling is enabled for this target.
+    const TargetSubtarget &ST = Fn.getTarget().getSubtarget<TargetSubtarget>();
+    if (!ST.enablePostRAScheduler(OptLevel, AntiDepMode, CriticalPathRCs))
+      return false;
+  }
+
+  // Check for antidep breaking override...
+  if (EnableAntiDepBreaking.getPosition() > 0) {
+    AntiDepMode = (EnableAntiDepBreaking == "all") ?
+      TargetSubtarget::ANTIDEP_ALL :
+        (EnableAntiDepBreaking == "critical")
+           ? TargetSubtarget::ANTIDEP_CRITICAL : TargetSubtarget::ANTIDEP_NONE;
+  }
+
+  DEBUG(dbgs() << "PostRAScheduler\n");
+
+  const MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+  const MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+  const TargetMachine &TM = Fn.getTarget();
+  const InstrItineraryData &InstrItins = TM.getInstrItineraryData();
+  ScheduleHazardRecognizer *HR =
+    TM.getInstrInfo()->CreateTargetPostRAHazardRecognizer(InstrItins);
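+  // Pick the anti-dependence breaker: ANTIDEP_ALL -> aggressive breaker,
+  // ANTIDEP_CRITICAL -> critical-path-only breaker, otherwise none (null).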
+  AntiDepBreaker *ADB =
+    ((AntiDepMode == TargetSubtarget::ANTIDEP_ALL) ?
+     (AntiDepBreaker *)new AggressiveAntiDepBreaker(Fn, CriticalPathRCs) :
+     ((AntiDepMode == TargetSubtarget::ANTIDEP_CRITICAL) ?
+      (AntiDepBreaker *)new CriticalAntiDepBreaker(Fn) : NULL));
+
+  SchedulePostRATDList Scheduler(Fn, MLI, MDT, HR, ADB, AA);
+
+  // Loop over all of the basic blocks
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+#ifndef NDEBUG
+    // If DebugDiv > 0 then only schedule MBB with (ID % DebugDiv) == DebugMod
+    if (DebugDiv > 0) {
+      static int bbcnt = 0;
+      if (bbcnt++ % DebugDiv != DebugMod)
+        continue;
+      dbgs() << "*** DEBUG scheduling " << Fn.getFunction()->getNameStr() <<
+        ":BB#" << MBB->getNumber() << " ***\n";
+    }
+#endif
+
+    // Initialize register live-range state for scheduling in this block.
+    Scheduler.StartBlock(MBB);
+
+    // Schedule each sequence of instructions not interrupted by a label
+    // or anything else that effectively needs to shut down scheduling.
+    MachineBasicBlock::iterator Current = MBB->end();
+    unsigned Count = MBB->size(), CurrentCount = Count;
+    for (MachineBasicBlock::iterator I = Current; I != MBB->begin(); ) {
+      MachineInstr *MI = llvm::prior(I);
+      if (TII->isSchedulingBoundary(MI, MBB, Fn)) {
+        Scheduler.Run(MBB, I, Current, CurrentCount);
+        Scheduler.EmitSchedule();
+        Current = MI;
+        CurrentCount = Count - 1;
+        Scheduler.Observe(MI, CurrentCount);
+      }
+      I = MI;
+      --Count;
+    }
+    assert(Count == 0 && "Instruction count mismatch!");
+    assert((MBB->begin() == Current || CurrentCount != 0) &&
+           "Instruction count mismatch!");
+    Scheduler.Run(MBB, MBB->begin(), Current, CurrentCount);
+    Scheduler.EmitSchedule();
+
+    // Clean up register live-range state.
+    Scheduler.FinishBlock();
+
+    // Update register kills
+    Scheduler.FixupKills(MBB);
+  }
+
+  delete HR;
+  delete ADB;
+
+  return true;
+}
+
+/// StartBlock - Initialize register live-range state for scheduling in
+/// this block.
+///
+void SchedulePostRATDList::StartBlock(MachineBasicBlock *BB) {
+  // Call the superclass.
+  ScheduleDAGInstrs::StartBlock(BB);
+
+  // Reset the hazard recognizer and anti-dep breaker.
+  HazardRec->Reset();
+  if (AntiDepBreak != NULL)
+    AntiDepBreak->StartBlock(BB);
+}
+
+/// Schedule - Schedule the instruction range using list scheduling.
+///
+void SchedulePostRATDList::Schedule() {
+  // Build the scheduling graph.
+  BuildSchedGraph(AA);
+
+  if (AntiDepBreak != NULL) {
+    unsigned Broken =
+      AntiDepBreak->BreakAntiDependencies(SUnits, Begin, InsertPos,
+                                          InsertPosIndex);
+
+    if (Broken != 0) {
+      // We made changes. Update the dependency graph.
+      // Theoretically we could update the graph in place:
+      // When a live range is changed to use a different register, remove
+      // the def's anti-dependence *and* output-dependence edges due to
+      // that register, and add new anti-dependence and output-dependence
+      // edges based on the next live range of the register.
+      SUnits.clear();
+      Sequence.clear();
+      EntrySU = SUnit();
+      ExitSU = SUnit();
+      BuildSchedGraph(AA);
+
+      NumFixedAnti += Broken;
+    }
+  }
+
+  DEBUG(dbgs() << "********** List Scheduling **********\n");
+  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          SUnits[su].dumpAll(this));
+
+  AvailableQueue.initNodes(SUnits);
+  ListScheduleTopDown();
+  AvailableQueue.releaseState();
+}
+
+/// Observe - Update liveness information to account for the current
+/// instruction, which will not be scheduled.
+///
+void SchedulePostRATDList::Observe(MachineInstr *MI, unsigned Count) {
+  if (AntiDepBreak != NULL)
+    AntiDepBreak->Observe(MI, Count, InsertPosIndex);
+}
+
+/// FinishBlock - Clean up register live-range state.
+///
+void SchedulePostRATDList::FinishBlock() {
+  if (AntiDepBreak != NULL)
+    AntiDepBreak->FinishBlock();
+
+  // Call the superclass.
+  ScheduleDAGInstrs::FinishBlock();
+}
+
+/// StartBlockForKills - Initialize register live-range state for updating kills
+///
+void SchedulePostRATDList::StartBlockForKills(MachineBasicBlock *BB) {
+  // Initialize the indices to indicate that no registers are live.
+  for (unsigned i = 0; i < TRI->getNumRegs(); ++i)
+    KillIndices[i] = ~0u;
+
+  // Determine the live-out physregs for this block.
+  if (!BB->empty() && BB->back().getDesc().isReturn()) {
+    // In a return block, examine the function live-out regs.
+    for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(),
+           E = MRI.liveout_end(); I != E; ++I) {
+      unsigned Reg = *I;
+      KillIndices[Reg] = BB->size();
+      // Repeat, for all subregs.
+      for (const unsigned *Subreg = TRI->getSubRegisters(Reg);
+           *Subreg; ++Subreg) {
+        KillIndices[*Subreg] = BB->size();
+      }
+    }
+  }
+  else {
+    // In a non-return block, examine the live-in regs of all successors.
+    for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+           SE = BB->succ_end(); SI != SE; ++SI) {
+      for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
+             E = (*SI)->livein_end(); I != E; ++I) {
+        unsigned Reg = *I;
+        KillIndices[Reg] = BB->size();
+        // Repeat, for all subregs.
+        for (const unsigned *Subreg = TRI->getSubRegisters(Reg);
+             *Subreg; ++Subreg) {
+          KillIndices[*Subreg] = BB->size();
+        }
+      }
+    }
+  }
+}
+
+bool SchedulePostRATDList::ToggleKillFlag(MachineInstr *MI,
+                                          MachineOperand &MO) {
+  // Setting kill flag...
+  if (!MO.isKill()) {
+    MO.setIsKill(true);
+    return false;
+  }
+
+  // If MO itself is live, clear the kill flag...
+  if (KillIndices[MO.getReg()] != ~0u) {
+    MO.setIsKill(false);
+    return false;
+  }
+
+  // If any subreg of MO is live, then create an imp-def for that
+  // subreg and keep MO marked as killed.
+  MO.setIsKill(false);
+  bool AllDead = true;
+  const unsigned SuperReg = MO.getReg();
+  for (const unsigned *Subreg = TRI->getSubRegisters(SuperReg);
+       *Subreg; ++Subreg) {
+    if (KillIndices[*Subreg] != ~0u) {
+      MI->addOperand(MachineOperand::CreateReg(*Subreg,
+                                               true  /*IsDef*/,
+                                               true  /*IsImp*/,
+                                               false /*IsKill*/,
+                                               false /*IsDead*/));
+      AllDead = false;
+    }
+  }
+
+  if (AllDead)
+    MO.setIsKill(true);
+  return false;
+}
+
+/// FixupKills - Fix the register kill flags, they may have been made
+/// incorrect by instruction reordering.
+///
+void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
+  DEBUG(dbgs() << "Fixup kills for BB#" << MBB->getNumber() << '\n');
+
+  std::set<unsigned> killedRegs;
+  BitVector ReservedRegs = TRI->getReservedRegs(MF);
+
+  StartBlockForKills(MBB);
+
+  // Examine block from end to start...
+  unsigned Count = MBB->size();
+  for (MachineBasicBlock::iterator I = MBB->end(), E = MBB->begin();
+       I != E; --Count) {
+    MachineInstr *MI = --I;
+    if (MI->isDebugValue())
+      continue;
+
+    // Update liveness.  Registers that are defed but not used in this
+    // instruction are now dead. Mark the register and all its subregs,
+    // since they are completely defined.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg()) continue;
+      unsigned Reg = MO.getReg();
+      if (Reg == 0) continue;
+      if (!MO.isDef()) continue;
+      // Ignore two-addr defs.
+      if (MI->isRegTiedToUseOperand(i)) continue;
+
+      KillIndices[Reg] = ~0u;
+
+      // Repeat for all subregs.
+      for (const unsigned *Subreg = TRI->getSubRegisters(Reg);
+           *Subreg; ++Subreg) {
+        KillIndices[*Subreg] = ~0u;
+      }
+    }
+
+    // Examine all used registers and set/clear kill flag. When a
+    // register is used multiple times we only set the kill flag on
+    // the first use.
+    killedRegs.clear();
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isUse()) continue;
+      unsigned Reg = MO.getReg();
+      if ((Reg == 0) || ReservedRegs.test(Reg)) continue;
+
+      bool kill = false;
+      if (killedRegs.find(Reg) == killedRegs.end()) {
+        kill = true;
+        // A register is not killed if any subregs are live...
+        for (const unsigned *Subreg = TRI->getSubRegisters(Reg);
+             *Subreg; ++Subreg) {
+          if (KillIndices[*Subreg] != ~0u) {
+            kill = false;
+            break;
+          }
+        }
+
+        // If no subreg is live, then the register is killed if it became
+        // live in this instruction.
+        if (kill)
+          kill = (KillIndices[Reg] == ~0u);
+      }
+
+      if (MO.isKill() != kill) {
+        DEBUG(dbgs() << "Fixing " << MO << " in ");
+        // Warning: ToggleKillFlag may invalidate MO.
+        ToggleKillFlag(MI, MO);
+        DEBUG(MI->dump());
+      }
+
+      killedRegs.insert(Reg);
+    }
+
+    // Mark any used register (that is not using undef) and subregs as
+    // now live...
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue;
+      unsigned Reg = MO.getReg();
+      if ((Reg == 0) || ReservedRegs.test(Reg)) continue;
+
+      KillIndices[Reg] = Count;
+
+      for (const unsigned *Subreg = TRI->getSubRegisters(Reg);
+           *Subreg; ++Subreg) {
+        KillIndices[*Subreg] = Count;
+      }
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Top-Down Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. Add it to
+/// the PendingQueue if the count reaches zero. Also update its cycle bound.
+void SchedulePostRATDList::ReleaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+#ifndef NDEBUG
+  if (SuccSU->NumPredsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    SuccSU->dump(this);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(0);
+  }
+#endif
+  --SuccSU->NumPredsLeft;
+
+  // Compute how many cycles it will be before this actually becomes
+  // available.  This is the max of the start time of all predecessors plus
+  // their latencies.
+  SuccSU->setDepthToAtLeast(SU->getDepth() + SuccEdge->getLatency());
+
+  // If all the node's predecessors are scheduled, this node is ready
+  // to be scheduled. Ignore the special ExitSU node.
+  if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
+    PendingQueue.push_back(SuccSU);
+}
+
+/// ReleaseSuccessors - Call ReleaseSucc on each of SU's successors.
+void SchedulePostRATDList::ReleaseSuccessors(SUnit *SU) {
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    ReleaseSucc(SU, &*I);
+  }
+}
+
+/// ScheduleNodeTopDown - Add the node to the schedule. Decrement the pending
+/// count of its successors. If a successor pending count is zero, add it to
+/// the Available queue.
+void SchedulePostRATDList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
+  DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+  DEBUG(SU->dump(this));
+
+  Sequence.push_back(SU);
+  assert(CurCycle >= SU->getDepth() &&
+         "Node scheduled above its depth!");
+  SU->setDepthToAtLeast(CurCycle);
+
+  ReleaseSuccessors(SU);
+  SU->isScheduled = true;
+  AvailableQueue.ScheduledNode(SU);
+}
+
+/// ListScheduleTopDown - The main loop of list scheduling for top-down
+/// schedulers.
+void SchedulePostRATDList::ListScheduleTopDown() {
+  unsigned CurCycle = 0;
+
+  // We're scheduling top-down but we're visiting the regions in
+  // bottom-up order, so we don't know the hazards at the start of a
+  // region. Assume no hazards (this should usually be OK, as most
+  // blocks are a single region).
+  HazardRec->Reset();
+
+  // Release any successors of the special Entry node.
+  ReleaseSuccessors(&EntrySU);
+
+  // Add all leaves to Available queue.
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    // It is available if it has no predecessors.
+    bool available = SUnits[i].Preds.empty();
+    if (available) {
+      AvailableQueue.push(&SUnits[i]);
+      SUnits[i].isAvailable = true;
+    }
+  }
+
+  // In any cycle where we can't schedule any instructions, we must
+  // stall or emit a noop, depending on the target.
+  bool CycleHasInsts = false;
+
+  // While the Available queue is not empty, grab the node with the highest
+  // priority. If it is not ready, put it back. Schedule the node.
+  std::vector<SUnit*> NotReady;
+  Sequence.reserve(SUnits.size());
+  while (!AvailableQueue.empty() || !PendingQueue.empty()) {
+    // Check to see if any of the pending instructions are ready to issue.  If
+    // so, add them to the available queue.
+    unsigned MinDepth = ~0u;
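+    // Ready entries are removed with the swap-with-back idiom: overwrite the
+    // released slot with the last element, shrink, and re-examine the same
+    // index ("--i; --e;"), so order in PendingQueue is not preserved.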
+    for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
+      if (PendingQueue[i]->getDepth() <= CurCycle) {
+        AvailableQueue.push(PendingQueue[i]);
+        PendingQueue[i]->isAvailable = true;
+        PendingQueue[i] = PendingQueue.back();
+        PendingQueue.pop_back();
+        --i; --e;
+      } else if (PendingQueue[i]->getDepth() < MinDepth)
+        MinDepth = PendingQueue[i]->getDepth();
+    }
+
+    DEBUG(dbgs() << "\n*** Examining Available\n";
+          LatencyPriorityQueue q = AvailableQueue;
+          while (!q.empty()) {
+            SUnit *su = q.pop();
+            dbgs() << "Height " << su->getHeight() << ": ";
+            su->dump(this);
+          });
+
+    SUnit *FoundSUnit = 0;
+    bool HasNoopHazards = false;
+    while (!AvailableQueue.empty()) {
+      SUnit *CurSUnit = AvailableQueue.pop();
+
+      ScheduleHazardRecognizer::HazardType HT =
+        HazardRec->getHazardType(CurSUnit);
+      if (HT == ScheduleHazardRecognizer::NoHazard) {
+        FoundSUnit = CurSUnit;
+        break;
+      }
+
+      // Remember if this is a noop hazard.
+      HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard;
+
+      NotReady.push_back(CurSUnit);
+    }
+
+    // Add the nodes that aren't ready back onto the available list.
+    if (!NotReady.empty()) {
+      AvailableQueue.push_all(NotReady);
+      NotReady.clear();
+    }
+
+    // If we found a node to schedule...
+    if (FoundSUnit) {
+      // ... schedule the node...
+      ScheduleNodeTopDown(FoundSUnit, CurCycle);
+      HazardRec->EmitInstruction(FoundSUnit);
+      CycleHasInsts = true;
+    } else {
+      if (CycleHasInsts) {
+        DEBUG(dbgs() << "*** Finished cycle " << CurCycle << '\n');
+        HazardRec->AdvanceCycle();
+      } else if (!HasNoopHazards) {
+        // Otherwise, we have a pipeline stall, but no other problem,
+        // just advance the current cycle and try again.
+        DEBUG(dbgs() << "*** Stall in cycle " << CurCycle << '\n');
+        HazardRec->AdvanceCycle();
+        ++NumStalls;
+      } else {
+        // Otherwise, we have no instructions to issue and we have instructions
+        // that will fault if we don't do this right.  This is the case for
+        // processors without pipeline interlocks and other cases.
+        DEBUG(dbgs() << "*** Emitting noop in cycle " << CurCycle << '\n');
+        HazardRec->EmitNoop();
+        Sequence.push_back(0);   // NULL here means noop
+        ++NumNoops;
+      }
+
+      ++CurCycle;
+      CycleHasInsts = false;
+    }
+  }
+
+#ifndef NDEBUG
+  VerifySchedule(/*isBottomUp=*/false);
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createPostRAScheduler(CodeGenOpt::Level OptLevel) {
+  return new PostRAScheduler(OptLevel);
+}
diff --git a/src/LLVM/lib/CodeGen/PreAllocSplitting.cpp b/src/LLVM/lib/CodeGen/PreAllocSplitting.cpp
new file mode 100644
index 0000000..82df6c2
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PreAllocSplitting.cpp
@@ -0,0 +1,1422 @@
+//===-- PreAllocSplitting.cpp - Pre-allocation Interval Splitting Pass. ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine instruction level pre-register allocation
+// live interval splitting pass. It finds live interval barriers, i.e.
+// instructions which will kill all physical registers in certain register
+// classes, and splits all live intervals which cross the barrier.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-alloc-split"
+#include "VirtRegMap.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterCoalescer.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+static cl::opt<int> PreSplitLimit("pre-split-limit", cl::init(-1), cl::Hidden);
+static cl::opt<int> DeadSplitLimit("dead-split-limit", cl::init(-1),
+                                   cl::Hidden);
+static cl::opt<int> RestoreFoldLimit("restore-fold-limit", cl::init(-1),
+                                     cl::Hidden);
+
+STATISTIC(NumSplits, "Number of intervals split");
+STATISTIC(NumRemats, "Number of intervals split by rematerialization");
+STATISTIC(NumFolds, "Number of intervals split with spill folding");
+STATISTIC(NumRestoreFolds, "Number of intervals split with restore folding");
+STATISTIC(NumRenumbers, "Number of intervals renumbered into new registers");
+STATISTIC(NumDeadSpills, "Number of dead spills removed");
+
+namespace {
+  class PreAllocSplitting : public MachineFunctionPass {
+    MachineFunction       *CurrMF;
+    const TargetMachine   *TM;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo* TRI;
+    MachineFrameInfo      *MFI;
+    MachineRegisterInfo   *MRI;
+    SlotIndexes           *SIs;
+    LiveIntervals         *LIs;
+    LiveStacks            *LSs;
+    VirtRegMap            *VRM;
+
+    // Barrier - Current barrier being processed.
+    MachineInstr          *Barrier;
+
+    // BarrierMBB - Basic block where the barrier resides.
+    MachineBasicBlock     *BarrierMBB;
+
+    // BarrierIdx - Current barrier index.
+    SlotIndex     BarrierIdx;
+
+    // CurrLI - Current live interval being split.
+    LiveInterval          *CurrLI;
+
+    // CurrSLI - Current stack slot live interval.
+    LiveInterval          *CurrSLI;
+
+    // CurrSValNo - Current val# for the stack slot live interval.
+    VNInfo                *CurrSValNo;
+
+    // IntervalSSMap - A map from live interval to spill slots.
+    DenseMap<unsigned, int> IntervalSSMap;
+
+    // Def2SpillMap - A map from a def instruction index to spill index.
+    DenseMap<SlotIndex, SlotIndex> Def2SpillMap;
+
+  public:
+    static char ID;
+    PreAllocSplitting()
+      : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<SlotIndexes>();
+      AU.addPreserved<SlotIndexes>();
+      AU.addRequired<LiveIntervals>();
+      AU.addPreserved<LiveIntervals>();
+      AU.addRequired<LiveStacks>();
+      AU.addPreserved<LiveStacks>();
+      AU.addPreserved<RegisterCoalescer>();
+      AU.addPreserved<CalculateSpillWeights>();
+      if (StrongPHIElim)
+        AU.addPreservedID(StrongPHIEliminationID);
+      else
+        AU.addPreservedID(PHIEliminationID);
+      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addRequired<VirtRegMap>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addPreserved<MachineLoopInfo>();
+      AU.addPreserved<VirtRegMap>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+    
+    virtual void releaseMemory() {
+      IntervalSSMap.clear();
+      Def2SpillMap.clear();
+    }
+
+    virtual const char *getPassName() const {
+      return "Pre-Register Allocation Live Interval Splitting";
+    }
+
+    /// print - Implement the dump method.
+    virtual void print(raw_ostream &O, const Module* M = 0) const {
+      LIs->print(O, M);
+    }
+
+
+  private:
+
+    MachineBasicBlock::iterator
+      findSpillPoint(MachineBasicBlock*, MachineInstr*, MachineInstr*,
+                     SmallPtrSet<MachineInstr*, 4>&);
+
+    MachineBasicBlock::iterator
+      findRestorePoint(MachineBasicBlock*, MachineInstr*, SlotIndex,
+                     SmallPtrSet<MachineInstr*, 4>&);
+
+    int CreateSpillStackSlot(unsigned, const TargetRegisterClass *);
+
+    bool IsAvailableInStack(MachineBasicBlock*, unsigned,
+                            SlotIndex, SlotIndex,
+                            SlotIndex&, int&) const;
+
+    void UpdateSpillSlotInterval(VNInfo*, SlotIndex, SlotIndex);
+
+    bool SplitRegLiveInterval(LiveInterval*);
+
+    bool SplitRegLiveIntervals(const TargetRegisterClass **,
+                               SmallPtrSet<LiveInterval*, 8>&);
+    
+    bool createsNewJoin(LiveRange* LR, MachineBasicBlock* DefMBB,
+                        MachineBasicBlock* BarrierMBB);
+    bool Rematerialize(unsigned vreg, VNInfo* ValNo,
+                       MachineInstr* DefMI,
+                       MachineBasicBlock::iterator RestorePt,
+                       SmallPtrSet<MachineInstr*, 4>& RefsInMBB);
+    MachineInstr* FoldSpill(unsigned vreg, const TargetRegisterClass* RC,
+                            MachineInstr* DefMI,
+                            MachineInstr* Barrier,
+                            MachineBasicBlock* MBB,
+                            int& SS,
+                            SmallPtrSet<MachineInstr*, 4>& RefsInMBB);
+    MachineInstr* FoldRestore(unsigned vreg, 
+                              const TargetRegisterClass* RC,
+                              MachineInstr* Barrier,
+                              MachineBasicBlock* MBB,
+                              int SS,
+                              SmallPtrSet<MachineInstr*, 4>& RefsInMBB);
+    void RenumberValno(VNInfo* VN);
+    void ReconstructLiveInterval(LiveInterval* LI);
+    bool removeDeadSpills(SmallPtrSet<LiveInterval*, 8>& split);
+    unsigned getNumberOfNonSpills(SmallPtrSet<MachineInstr*, 4>& MIs,
+                                  unsigned Reg, int FrameIndex,
+                                  bool& FeedsTwoAddr);
+    VNInfo* PerformPHIConstruction(MachineBasicBlock::iterator Use,
+                                   MachineBasicBlock* MBB, LiveInterval* LI,
+                                   SmallPtrSet<MachineInstr*, 4>& Visited,
+            DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Defs,
+            DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Uses,
+                                      DenseMap<MachineInstr*, VNInfo*>& NewVNs,
+                                DenseMap<MachineBasicBlock*, VNInfo*>& LiveOut,
+                                DenseMap<MachineBasicBlock*, VNInfo*>& Phis,
+                                        bool IsTopLevel, bool IsIntraBlock);
+    VNInfo* PerformPHIConstructionFallBack(MachineBasicBlock::iterator Use,
+                                   MachineBasicBlock* MBB, LiveInterval* LI,
+                                   SmallPtrSet<MachineInstr*, 4>& Visited,
+            DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Defs,
+            DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Uses,
+                                      DenseMap<MachineInstr*, VNInfo*>& NewVNs,
+                                DenseMap<MachineBasicBlock*, VNInfo*>& LiveOut,
+                                DenseMap<MachineBasicBlock*, VNInfo*>& Phis,
+                                        bool IsTopLevel, bool IsIntraBlock);
+};
+} // end anonymous namespace
+
+char PreAllocSplitting::ID = 0;
+
+static RegisterPass<PreAllocSplitting>
+X("pre-alloc-splitting", "Pre-Register Allocation Live Interval Splitting");
+
+char &llvm::PreAllocSplittingID = PreAllocSplitting::ID;
+
+/// findSpillPoint - Find a gap, as far away from the given MI as possible,
+/// that's suitable for spilling the current live interval. The index must be
+/// before any defs and uses of the live interval register in the mbb. Return
+/// begin() if none is found.
+MachineBasicBlock::iterator
+PreAllocSplitting::findSpillPoint(MachineBasicBlock *MBB, MachineInstr *MI,
+                                  MachineInstr *DefMI,
+                                  SmallPtrSet<MachineInstr*, 4> &RefsInMBB) {
+  MachineBasicBlock::iterator Pt = MBB->begin();
+
+  MachineBasicBlock::iterator MII = MI;
+  MachineBasicBlock::iterator EndPt = DefMI
+    ? MachineBasicBlock::iterator(DefMI) : MBB->begin();
+    
+  while (MII != EndPt && !RefsInMBB.count(MII) &&
+         MII->getOpcode() != TRI->getCallFrameSetupOpcode())
+    --MII;
+  if (MII == EndPt || RefsInMBB.count(MII)) return Pt;
+    
+  while (MII != EndPt && !RefsInMBB.count(MII)) {
+    // We can't insert the spill between the barrier (a call), and its
+    // corresponding call frame setup.
+    if (MII->getOpcode() == TRI->getCallFrameDestroyOpcode()) {
+      while (MII->getOpcode() != TRI->getCallFrameSetupOpcode()) {
+        --MII;
+        if (MII == EndPt) {
+          return Pt;
+        }
+      }
+      continue;
+    } else {
+      Pt = MII;
+    }
+    
+    if (RefsInMBB.count(MII))
+      return Pt;
+    
+    --MII;
+  }
+
+  return Pt;
+}
+
+/// findRestorePoint - Find a gap in the instruction index map that's suitable
+/// for restoring the current live interval value. The index must be before any
+/// uses of the live interval register in the mbb. Return end() if none is
+/// found.
+MachineBasicBlock::iterator
+PreAllocSplitting::findRestorePoint(MachineBasicBlock *MBB, MachineInstr *MI,
+                                    SlotIndex LastIdx,
+                                    SmallPtrSet<MachineInstr*, 4> &RefsInMBB) {
+  // FIXME: Allow the spill to be inserted at the beginning of the mbb. Update
+  // the mbb begin index accordingly.
+  MachineBasicBlock::iterator Pt = MBB->end();
+  MachineBasicBlock::iterator EndPt = MBB->getFirstTerminator();
+
+  // We start at the call, so walk forward until we find the call frame teardown
+  // since we can't insert restores before that.  Bail if we encounter a use
+  // during this time.
+  MachineBasicBlock::iterator MII = MI;
+  if (MII == EndPt) return Pt;
+  
+  while (MII != EndPt && !RefsInMBB.count(MII) &&
+         MII->getOpcode() != TRI->getCallFrameDestroyOpcode())
+    ++MII;
+  if (MII == EndPt || RefsInMBB.count(MII)) return Pt;
+  ++MII;
+  
+  // FIXME: Limit the number of instructions to examine to reduce
+  // compile time?
+  while (MII != EndPt) {
+    SlotIndex Index = LIs->getInstructionIndex(MII);
+    if (Index > LastIdx)
+      break;
+      
+    // We can't insert a restore between the barrier (a call) and its 
+    // corresponding call frame teardown.
+    if (MII->getOpcode() == TRI->getCallFrameSetupOpcode()) {
+      do {
+        if (MII == EndPt || RefsInMBB.count(MII)) return Pt;
+        ++MII;
+      } while (MII->getOpcode() != TRI->getCallFrameDestroyOpcode());
+    } else {
+      Pt = MII;
+    }
+    
+    if (RefsInMBB.count(MII))
+      return Pt;
+    
+    ++MII;
+  }
+
+  return Pt;
+}
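+
+// findSpillPoint and findRestorePoint are mirror images: one walks backward
+// from the barrier toward the def, the other forward toward the first
+// terminator, and both refuse to place code inside a call frame
+// setup/teardown region. A sketch of the shared scan over a simplified
+// opcode stream (Op, isRef and lastSafePoint are stand-ins):
+//
+//   enum Op { Plain, FrameSetup, FrameDestroy };
+//   template <class It, class RefPred>
+//   It lastSafePoint(It I, It End, RefPred isRef) {
+//     It Pt = End;                        // sentinel: no gap found
+//     while (I != End && !isRef(*I)) {
+//       if (*I == FrameSetup) {           // skip the whole call sequence,
+//         do {                            // bailing if the reg is touched
+//           ++I;
+//           if (I == End || isRef(*I)) return Pt;
+//         } while (*I != FrameDestroy);
+//       } else {
+//         Pt = I;                         // a usable gap
+//       }
+//       ++I;
+//     }
+//     return Pt;
+//   }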
+
+/// CreateSpillStackSlot - Create a stack slot for the live interval being
+/// split. If the live interval was previously split, just reuse the same
+/// slot.
+int PreAllocSplitting::CreateSpillStackSlot(unsigned Reg,
+                                            const TargetRegisterClass *RC) {
+  int SS;
+  DenseMap<unsigned, int>::iterator I = IntervalSSMap.find(Reg);
+  if (I != IntervalSSMap.end()) {
+    SS = I->second;
+  } else {
+    SS = MFI->CreateSpillStackObject(RC->getSize(), RC->getAlignment());
+    IntervalSSMap[Reg] = SS;
+  }
+
+  // Create live interval for stack slot.
+  CurrSLI = &LSs->getOrCreateInterval(SS, RC);
+  if (CurrSLI->hasAtLeastOneValue())
+    CurrSValNo = CurrSLI->getValNumInfo(0);
+  else
+    CurrSValNo = CurrSLI->getNextValue(SlotIndex(), 0, false,
+                                       LSs->getVNInfoAllocator());
+  return SS;
+}
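+
+// Reusing a single slot per original vreg keeps every value split out of it
+// spill-compatible with the others. The memoization pattern in isolation
+// (makeSlot() stands in for MFI->CreateSpillStackObject):
+//
+//   int getOrCreateSlot(DenseMap<unsigned, int> &SlotMap, unsigned Reg) {
+//     std::pair<DenseMap<unsigned, int>::iterator, bool> P =
+//       SlotMap.insert(std::make_pair(Reg, -1));
+//     if (P.second)                  // first spill of this vreg
+//       P.first->second = makeSlot();
+//     return P.first->second;
+//   }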
+
+/// IsAvailableInStack - Return true if register is available in a split stack
+/// slot at the specified index.
+bool
+PreAllocSplitting::IsAvailableInStack(MachineBasicBlock *DefMBB,
+                                    unsigned Reg, SlotIndex DefIndex,
+                                    SlotIndex RestoreIndex,
+                                    SlotIndex &SpillIndex,
+                                    int& SS) const {
+  if (!DefMBB)
+    return false;
+
+  DenseMap<unsigned, int>::const_iterator I = IntervalSSMap.find(Reg);
+  if (I == IntervalSSMap.end())
+    return false;
+  DenseMap<SlotIndex, SlotIndex>::const_iterator
+    II = Def2SpillMap.find(DefIndex);
+  if (II == Def2SpillMap.end())
+    return false;
+
+  // If last spill of def is in the same mbb as barrier mbb (where restore will
+  // be), make sure it's not below the intended restore index.
+  // FIXME: Undo the previous spill?
+  assert(LIs->getMBBFromIndex(II->second) == DefMBB);
+  if (DefMBB == BarrierMBB && II->second >= RestoreIndex)
+    return false;
+
+  SS = I->second;
+  SpillIndex = II->second;
+  return true;
+}
+
+/// UpdateSpillSlotInterval - Given the specified val# of the register live
+/// interval being split, and the spill and restore indices, update the live
+/// interval of the spill stack slot.
+void
+PreAllocSplitting::UpdateSpillSlotInterval(VNInfo *ValNo, SlotIndex SpillIndex,
+                                           SlotIndex RestoreIndex) {
+  assert(LIs->getMBBFromIndex(RestoreIndex) == BarrierMBB &&
+         "Expect restore in the barrier mbb");
+
+  MachineBasicBlock *MBB = LIs->getMBBFromIndex(SpillIndex);
+  if (MBB == BarrierMBB) {
+    // Intra-block spill + restore. We are done.
+    LiveRange SLR(SpillIndex, RestoreIndex, CurrSValNo);
+    CurrSLI->addRange(SLR);
+    return;
+  }
+
+  SmallPtrSet<MachineBasicBlock*, 4> Processed;
+  SlotIndex EndIdx = LIs->getMBBEndIdx(MBB);
+  LiveRange SLR(SpillIndex, EndIdx, CurrSValNo);
+  CurrSLI->addRange(SLR);
+  Processed.insert(MBB);
+
+  // Start from the spill mbb and figure out the extent of the spill slot's
+  // live interval.
+  SmallVector<MachineBasicBlock*, 4> WorkList;
+  const LiveRange *LR = CurrLI->getLiveRangeContaining(SpillIndex);
+  if (LR->end > EndIdx)
+    // If the live range extends beyond the end of the mbb, add successors
+    // to the work list.
+    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+           SE = MBB->succ_end(); SI != SE; ++SI)
+      WorkList.push_back(*SI);
+
+  while (!WorkList.empty()) {
+    MachineBasicBlock *MBB = WorkList.back();
+    WorkList.pop_back();
+    if (Processed.count(MBB))
+      continue;
+    SlotIndex Idx = LIs->getMBBStartIdx(MBB);
+    LR = CurrLI->getLiveRangeContaining(Idx);
+    if (LR && LR->valno == ValNo) {
+      EndIdx = LIs->getMBBEndIdx(MBB);
+      if (Idx <= RestoreIndex && RestoreIndex < EndIdx) {
+        // Spill slot live interval stops at the restore.
+        LiveRange SLR(Idx, RestoreIndex, CurrSValNo);
+        CurrSLI->addRange(SLR);
+      } else if (LR->end > EndIdx) {
+        // Live range extends beyond end of mbb, process successors.
+        LiveRange SLR(Idx, EndIdx.getNextIndex(), CurrSValNo);
+        CurrSLI->addRange(SLR);
+        for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+               SE = MBB->succ_end(); SI != SE; ++SI)
+          WorkList.push_back(*SI);
+      } else {
+        LiveRange SLR(Idx, LR->end, CurrSValNo);
+        CurrSLI->addRange(SLR);
+      }
+      Processed.insert(MBB);
+    }
+  }
+}
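+
+// The loop above is a standard worklist walk: seed with the spill block's
+// successors and keep extending the stack slot's interval through any block
+// the original value is still live into. The skeleton, with liveInto() and
+// extendThrough() standing in for the valno checks and addRange calls:
+//
+//   SmallVector<MachineBasicBlock*, 4> Work(MBB->succ_begin(),
+//                                           MBB->succ_end());
+//   SmallPtrSet<MachineBasicBlock*, 4> Seen;
+//   while (!Work.empty()) {
+//     MachineBasicBlock *B = Work.pop_back_val();
+//     if (!Seen.insert(B) || !liveInto(B))
+//       continue;
+//     extendThrough(B);                   // grow the slot's interval
+//     Work.append(B->succ_begin(), B->succ_end());
+//   }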
+
+/// PerformPHIConstruction - From properly set up use and def lists, use a PHI
+/// construction algorithm to compute the ranges and valnos for an interval.
+VNInfo*
+PreAllocSplitting::PerformPHIConstruction(MachineBasicBlock::iterator UseI,
+                                       MachineBasicBlock* MBB, LiveInterval* LI,
+                                       SmallPtrSet<MachineInstr*, 4>& Visited,
+             DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Defs,
+             DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Uses,
+                                       DenseMap<MachineInstr*, VNInfo*>& NewVNs,
+                                 DenseMap<MachineBasicBlock*, VNInfo*>& LiveOut,
+                                 DenseMap<MachineBasicBlock*, VNInfo*>& Phis,
+                                           bool IsTopLevel, bool IsIntraBlock) {
+  // Return memoized result if it's available.
+  if (IsTopLevel && Visited.count(UseI) && NewVNs.count(UseI))
+    return NewVNs[UseI];
+  else if (!IsTopLevel && IsIntraBlock && NewVNs.count(UseI))
+    return NewVNs[UseI];
+  else if (!IsIntraBlock && LiveOut.count(MBB))
+    return LiveOut[MBB];
+  
+  // Check if our block contains any uses or defs.
+  bool ContainsDefs = Defs.count(MBB);
+  bool ContainsUses = Uses.count(MBB);
+  
+  VNInfo* RetVNI = 0;
+  
+  // Enumerate the cases of use/def containing blocks.
+  if (!ContainsDefs && !ContainsUses) {
+    return PerformPHIConstructionFallBack(UseI, MBB, LI, Visited, Defs, Uses,
+                                          NewVNs, LiveOut, Phis,
+                                          IsTopLevel, IsIntraBlock);
+  } else if (ContainsDefs && !ContainsUses) {
+    SmallPtrSet<MachineInstr*, 2>& BlockDefs = Defs[MBB];
+
+    // Search for the def in this block.  If we don't find it before the
+    // instruction we care about, go to the fallback case.  Note that this
+    // should never happen: this cannot be intrablock, so the use should
+    // always be an end() iterator.
+    assert(UseI == MBB->end() && "No use marked in intrablock");
+    
+    MachineBasicBlock::iterator Walker = UseI;
+    --Walker;
+    while (Walker != MBB->begin()) {
+      if (BlockDefs.count(Walker))
+        break;
+      --Walker;
+    }
+    
+    // Once we've found it, extend its VNInfo to our instruction.
+    SlotIndex DefIndex = LIs->getInstructionIndex(Walker);
+    DefIndex = DefIndex.getDefIndex();
+    SlotIndex EndIndex = LIs->getMBBEndIdx(MBB);
+    
+    RetVNI = NewVNs[Walker];
+    LI->addRange(LiveRange(DefIndex, EndIndex, RetVNI));
+  } else if (!ContainsDefs && ContainsUses) {
+    SmallPtrSet<MachineInstr*, 2>& BlockUses = Uses[MBB];
+    
+    // Search for the use in this block that precedes the instruction we care 
+    // about, going to the fallback case if we don't find it.    
+    MachineBasicBlock::iterator Walker = UseI;
+    bool found = false;
+    while (Walker != MBB->begin()) {
+      --Walker;
+      if (BlockUses.count(Walker)) {
+        found = true;
+        break;
+      }
+    }
+
+    if (!found)
+      return PerformPHIConstructionFallBack(UseI, MBB, LI, Visited, Defs,
+                                            Uses, NewVNs, LiveOut, Phis,
+                                            IsTopLevel, IsIntraBlock);
+
+    SlotIndex UseIndex = LIs->getInstructionIndex(Walker);
+    UseIndex = UseIndex.getUseIndex();
+    SlotIndex EndIndex;
+    if (IsIntraBlock) {
+      EndIndex = LIs->getInstructionIndex(UseI).getDefIndex();
+    } else
+      EndIndex = LIs->getMBBEndIdx(MBB);
+
+    // Now, recursively phi construct the VNInfo for the use we found,
+    // and then extend it to include the instruction we care about
+    RetVNI = PerformPHIConstruction(Walker, MBB, LI, Visited, Defs, Uses,
+                                    NewVNs, LiveOut, Phis, false, true);
+    
+    LI->addRange(LiveRange(UseIndex, EndIndex, RetVNI));
+    
+    // FIXME: Need to set kills properly for inter-block stuff.
+  } else if (ContainsDefs && ContainsUses) {
+    SmallPtrSet<MachineInstr*, 2>& BlockDefs = Defs[MBB];
+    SmallPtrSet<MachineInstr*, 2>& BlockUses = Uses[MBB];
+    
+    // This case is basically a merge of the two preceding cases, with the
+    // special note that checking for defs must take precedence over checking
+    // for uses, because of two-address instructions.
+    MachineBasicBlock::iterator Walker = UseI;
+    bool foundDef = false;
+    bool foundUse = false;
+    while (Walker != MBB->begin()) {
+      --Walker;
+      if (BlockDefs.count(Walker)) {
+        foundDef = true;
+        break;
+      } else if (BlockUses.count(Walker)) {
+        foundUse = true;
+        break;
+      }
+    }
+
+    if (!foundDef && !foundUse)
+      return PerformPHIConstructionFallBack(UseI, MBB, LI, Visited, Defs,
+                                            Uses, NewVNs, LiveOut, Phis,
+                                            IsTopLevel, IsIntraBlock);
+
+    SlotIndex StartIndex = LIs->getInstructionIndex(Walker);
+    StartIndex = foundDef ? StartIndex.getDefIndex() : StartIndex.getUseIndex();
+    SlotIndex EndIndex;
+    if (IsIntraBlock) {
+      EndIndex = LIs->getInstructionIndex(UseI).getDefIndex();
+    } else
+      EndIndex = LIs->getMBBEndIdx(MBB);
+
+    if (foundDef)
+      RetVNI = NewVNs[Walker];
+    else
+      RetVNI = PerformPHIConstruction(Walker, MBB, LI, Visited, Defs, Uses,
+                                      NewVNs, LiveOut, Phis, false, true);
+
+    LI->addRange(LiveRange(StartIndex, EndIndex, RetVNI));
+  }
+  
+  // Memoize results so we don't have to recompute them.
+  if (!IsIntraBlock) LiveOut[MBB] = RetVNI;
+  else {
+    if (!NewVNs.count(UseI))
+      NewVNs[UseI] = RetVNI;
+    Visited.insert(UseI);
+  }
+
+  return RetVNI;
+}
+
+/// PerformPHIConstructionFallBack - PerformPHIConstruction fall back path.
+///
+VNInfo*
+PreAllocSplitting::PerformPHIConstructionFallBack(MachineBasicBlock::iterator UseI,
+                                       MachineBasicBlock* MBB, LiveInterval* LI,
+                                       SmallPtrSet<MachineInstr*, 4>& Visited,
+             DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Defs,
+             DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Uses,
+                                       DenseMap<MachineInstr*, VNInfo*>& NewVNs,
+                                 DenseMap<MachineBasicBlock*, VNInfo*>& LiveOut,
+                                 DenseMap<MachineBasicBlock*, VNInfo*>& Phis,
+                                           bool IsTopLevel, bool IsIntraBlock) {
+  // NOTE: Because this is the fallback path for the other cases, we cannot
+  // assume that we are not intrablock here.
+  if (Phis.count(MBB)) return Phis[MBB]; 
+
+  SlotIndex StartIndex = LIs->getMBBStartIdx(MBB);
+  VNInfo *RetVNI = Phis[MBB] =
+    LI->getNextValue(SlotIndex(), /*FIXME*/ 0, false,
+                     LIs->getVNInfoAllocator());
+
+  if (!IsIntraBlock) LiveOut[MBB] = RetVNI;
+    
+  // If there are no uses or defs between our starting point and the
+  // beginning of the block, then recursively perform phi construction
+  // on our predecessors.
+  DenseMap<MachineBasicBlock*, VNInfo*> IncomingVNs;
+  for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+         PE = MBB->pred_end(); PI != PE; ++PI) {
+    VNInfo* Incoming = PerformPHIConstruction((*PI)->end(), *PI, LI, 
+                                              Visited, Defs, Uses, NewVNs,
+                                              LiveOut, Phis, false, false);
+    if (Incoming != 0)
+      IncomingVNs[*PI] = Incoming;
+  }
+    
+  if (MBB->pred_size() == 1 && !RetVNI->hasPHIKill()) {
+    VNInfo* OldVN = RetVNI;
+    VNInfo* NewVN = IncomingVNs.begin()->second;
+    VNInfo* MergedVN = LI->MergeValueNumberInto(OldVN, NewVN);
+    if (MergedVN == OldVN) std::swap(OldVN, NewVN);
+    
+    for (DenseMap<MachineBasicBlock*, VNInfo*>::iterator LOI = LiveOut.begin(),
+         LOE = LiveOut.end(); LOI != LOE; ++LOI)
+      if (LOI->second == OldVN)
+        LOI->second = MergedVN;
+    for (DenseMap<MachineInstr*, VNInfo*>::iterator NVI = NewVNs.begin(),
+         NVE = NewVNs.end(); NVI != NVE; ++NVI)
+      if (NVI->second == OldVN)
+        NVI->second = MergedVN;
+    for (DenseMap<MachineBasicBlock*, VNInfo*>::iterator PI = Phis.begin(),
+         PE = Phis.end(); PI != PE; ++PI)
+      if (PI->second == OldVN)
+        PI->second = MergedVN;
+    RetVNI = MergedVN;
+  } else {
+    // Otherwise, the incoming VNInfos are joined by the phi value created
+    // above; mark each of them as feeding a PHI kill.
+    for (DenseMap<MachineBasicBlock*, VNInfo*>::iterator I =
+           IncomingVNs.begin(), E = IncomingVNs.end(); I != E; ++I) {
+      I->second->setHasPHIKill(true);
+    }
+  }
+      
+  SlotIndex EndIndex;
+  if (IsIntraBlock) {
+    EndIndex = LIs->getInstructionIndex(UseI).getDefIndex();
+  } else
+    EndIndex = LIs->getMBBEndIdx(MBB);
+  LI->addRange(LiveRange(StartIndex, EndIndex, RetVNI));
+
+  // Memoize results so we don't have to recompute them.
+  if (!IsIntraBlock)
+    LiveOut[MBB] = RetVNI;
+  else {
+    if (!NewVNs.count(UseI))
+      NewVNs[UseI] = RetVNI;
+    Visited.insert(UseI);
+  }
+
+  return RetVNI;
+}
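+
+// Both routines amount to on-demand SSA construction: a use asks its block
+// for a reaching value; a block with no local def or use records a phi
+// placeholder first (the memoization doubles as the cycle breaker for CFG
+// loops) and then recurses into its predecessors. Distilled, with Value,
+// localDef, newPhi and addIncoming as stand-ins:
+//
+//   Value *readValue(MachineBasicBlock *B,
+//                    DenseMap<MachineBasicBlock*, Value*> &LiveOut) {
+//     if (Value *V = LiveOut.lookup(B)) return V;    // memoized result
+//     if (Value *D = localDef(B)) return LiveOut[B] = D;
+//     Value *Phi = newPhi(B);
+//     LiveOut[B] = Phi;                              // break cycles first
+//     for (MachineBasicBlock::pred_iterator PI = B->pred_begin(),
+//          PE = B->pred_end(); PI != PE; ++PI)
+//       addIncoming(Phi, readValue(*PI, LiveOut));
+//     return Phi;
+//   }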
+
+/// ReconstructLiveInterval - Recompute a live interval from scratch.
+void PreAllocSplitting::ReconstructLiveInterval(LiveInterval* LI) {
+  VNInfo::Allocator& Alloc = LIs->getVNInfoAllocator();
+  
+  // Clear the old ranges and valnos.
+  LI->clear();
+  
+  // Cache the uses and defs of the register
+  typedef DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> > RegMap;
+  RegMap Defs, Uses;
+  
+  // Keep track of the new VNs we're creating.
+  DenseMap<MachineInstr*, VNInfo*> NewVNs;
+  SmallPtrSet<VNInfo*, 2> PhiVNs;
+  
+  // Cache defs, and create a new VNInfo for each def.
+  for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(LI->reg),
+       DE = MRI->def_end(); DI != DE; ++DI) {
+    Defs[(*DI).getParent()].insert(&*DI);
+    
+    SlotIndex DefIdx = LIs->getInstructionIndex(&*DI);
+    DefIdx = DefIdx.getDefIndex();
+    
+    assert(!DI->isPHI() && "PHI instr in code during pre-alloc splitting.");
+    VNInfo* NewVN = LI->getNextValue(DefIdx, 0, true, Alloc);
+    
+    // If the def is a move, set the copy field.
+    if (DI->isCopyLike() && DI->getOperand(0).getReg() == LI->reg)
+      NewVN->setCopy(&*DI);
+
+    NewVNs[&*DI] = NewVN;
+  }
+  
+  // Cache uses as a separate pass from actually processing them.
+  for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(LI->reg),
+       UE = MRI->use_end(); UI != UE; ++UI)
+    Uses[(*UI).getParent()].insert(&*UI);
+    
+  // Now, actually process every use and use a phi construction algorithm
+  // to walk from it to its reaching definitions, building VNInfos along
+  // the way.
+  DenseMap<MachineBasicBlock*, VNInfo*> LiveOut;
+  DenseMap<MachineBasicBlock*, VNInfo*> Phis;
+  SmallPtrSet<MachineInstr*, 4> Visited;
+  for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(LI->reg),
+       UE = MRI->use_end(); UI != UE; ++UI) {
+    PerformPHIConstruction(&*UI, UI->getParent(), LI, Visited, Defs,
+                           Uses, NewVNs, LiveOut, Phis, true, true); 
+  }
+  
+  // Add ranges for dead defs
+  for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(LI->reg),
+       DE = MRI->def_end(); DI != DE; ++DI) {
+    SlotIndex DefIdx = LIs->getInstructionIndex(&*DI);
+    DefIdx = DefIdx.getDefIndex();
+    
+    if (LI->liveAt(DefIdx)) continue;
+    
+    VNInfo* DeadVN = NewVNs[&*DI];
+    LI->addRange(LiveRange(DefIdx, DefIdx.getNextSlot(), DeadVN));
+  }
+}
+
+/// RenumberValno - Split the given valno out into a new vreg, allowing it to
+/// be allocated to a different register.  This function creates a new vreg,
+/// copies the valno and its live ranges over to the new vreg's interval,
+/// removes them from the old interval, and rewrites all uses and defs of
+/// the original reg to the new vreg within those ranges.
+void PreAllocSplitting::RenumberValno(VNInfo* VN) {
+  SmallVector<VNInfo*, 4> Stack;
+  SmallVector<VNInfo*, 4> VNsToCopy;
+  Stack.push_back(VN);
+
+  // Walk through and copy the valno we care about, and any other valnos
+  // that are two-address redefinitions of the one we care about.  These
+  // will need to be rewritten as well.  We also check the safety of the
+  // renumbering here, by making sure that none of the valnos involved has
+  // phi kills.
+  while (!Stack.empty()) {
+    VNInfo* OldVN = Stack.back();
+    Stack.pop_back();
+    
+    // Bail out if we ever encounter a valno that has a PHI kill.  We can't
+    // renumber these.
+    if (OldVN->hasPHIKill()) return;
+    
+    VNsToCopy.push_back(OldVN);
+    
+    // Locate two-address redefinitions
+    for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(CurrLI->reg),
+         DE = MRI->def_end(); DI != DE; ++DI) {
+      if (!DI->isRegTiedToUseOperand(DI.getOperandNo())) continue;
+      SlotIndex DefIdx = LIs->getInstructionIndex(&*DI).getDefIndex();
+      VNInfo* NextVN = CurrLI->findDefinedVNInfoForRegInt(DefIdx);
+      // Only queue valnos we haven't already copied, or the walk would
+      // revisit them forever.
+      if (std::find(VNsToCopy.begin(), VNsToCopy.end(), NextVN) ==
+          VNsToCopy.end())
+        Stack.push_back(NextVN);
+    }
+  }
+  
+  // Create the new vreg
+  unsigned NewVReg = MRI->createVirtualRegister(MRI->getRegClass(CurrLI->reg));
+  
+  // Create the new live interval
+  LiveInterval& NewLI = LIs->getOrCreateInterval(NewVReg);
+  
+  for (SmallVector<VNInfo*, 4>::iterator OI = VNsToCopy.begin(), OE = 
+       VNsToCopy.end(); OI != OE; ++OI) {
+    VNInfo* OldVN = *OI;
+    
+    // Copy the valno over
+    VNInfo* NewVN = NewLI.createValueCopy(OldVN, LIs->getVNInfoAllocator());
+    NewLI.MergeValueInAsValue(*CurrLI, OldVN, NewVN);
+
+    // Remove the valno from the old interval
+    CurrLI->removeValNo(OldVN);
+  }
+  
+  // Rewrite defs and uses.  This is done in two stages to avoid invalidating
+  // the reg_iterator.
+  SmallVector<std::pair<MachineInstr*, unsigned>, 8> OpsToChange;
+  
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(CurrLI->reg),
+         E = MRI->reg_end(); I != E; ++I) {
+    MachineOperand& MO = I.getOperand();
+    SlotIndex InstrIdx = LIs->getInstructionIndex(&*I);
+    
+    if ((MO.isUse() && NewLI.liveAt(InstrIdx.getUseIndex())) ||
+        (MO.isDef() && NewLI.liveAt(InstrIdx.getDefIndex())))
+      OpsToChange.push_back(std::make_pair(&*I, I.getOperandNo()));
+  }
+  
+  for (SmallVector<std::pair<MachineInstr*, unsigned>, 8>::iterator I =
+       OpsToChange.begin(), E = OpsToChange.end(); I != E; ++I) {
+    MachineInstr* Inst = I->first;
+    unsigned OpIdx = I->second;
+    MachineOperand& MO = Inst->getOperand(OpIdx);
+    MO.setReg(NewVReg);
+  }
+  
+  // Grow the VirtRegMap, since we've created a new vreg.
+  VRM->grow();
+  
+  // The renumbered vreg shares a stack slot with the old register.
+  if (IntervalSSMap.count(CurrLI->reg))
+    IntervalSSMap[NewVReg] = IntervalSSMap[CurrLI->reg];
+  
+  ++NumRenumbers;
+}
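+
+// The operand rewrite above deliberately runs in two passes: setReg()
+// re-links the operand into a different use/def chain, which would
+// invalidate a reg_iterator still walking the old chain. The same pattern
+// in miniature:
+//
+//   SmallVector<std::pair<MachineInstr*, unsigned>, 8> Worklist;
+//   for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(OldReg),
+//        E = MRI->reg_end(); I != E; ++I)
+//     Worklist.push_back(std::make_pair(&*I, I.getOperandNo())); // collect
+//   for (unsigned i = 0, e = Worklist.size(); i != e; ++i)       // mutate
+//     Worklist[i].first->getOperand(Worklist[i].second).setReg(NewReg);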
+
+bool PreAllocSplitting::Rematerialize(unsigned VReg, VNInfo* ValNo,
+                                      MachineInstr* DefMI,
+                                      MachineBasicBlock::iterator RestorePt,
+                                    SmallPtrSet<MachineInstr*, 4>& RefsInMBB) {
+  MachineBasicBlock& MBB = *RestorePt->getParent();
+  
+  MachineBasicBlock::iterator KillPt = BarrierMBB->end();
+  if (!ValNo->isDefAccurate() || DefMI->getParent() == BarrierMBB)
+    KillPt = findSpillPoint(BarrierMBB, Barrier, NULL, RefsInMBB);
+  else
+    KillPt = llvm::next(MachineBasicBlock::iterator(DefMI));
+  
+  if (KillPt == DefMI->getParent()->end())
+    return false;
+  
+  TII->reMaterialize(MBB, RestorePt, VReg, 0, DefMI, *TRI);
+  SlotIndex RematIdx = LIs->InsertMachineInstrInMaps(prior(RestorePt));
+  
+  ReconstructLiveInterval(CurrLI);
+  RematIdx = RematIdx.getDefIndex();
+  RenumberValno(CurrLI->findDefinedVNInfoForRegInt(RematIdx));
+  
+  ++NumSplits;
+  ++NumRemats;
+  return true;  
+}
+
+MachineInstr* PreAllocSplitting::FoldSpill(unsigned vreg, 
+                                           const TargetRegisterClass* RC,
+                                           MachineInstr* DefMI,
+                                           MachineInstr* Barrier,
+                                           MachineBasicBlock* MBB,
+                                           int& SS,
+                                    SmallPtrSet<MachineInstr*, 4>& RefsInMBB) {
+  // Bail if there are no references in the mbb to anchor the fold search.
+  if (RefsInMBB.empty())
+    return 0;
+  
+  MachineBasicBlock::iterator FoldPt = Barrier;
+  while (&*FoldPt != DefMI && FoldPt != MBB->begin() &&
+         !RefsInMBB.count(FoldPt))
+    --FoldPt;
+  
+  int OpIdx = FoldPt->findRegisterDefOperandIdx(vreg);
+  if (OpIdx == -1)
+    return 0;
+  
+  SmallVector<unsigned, 1> Ops;
+  Ops.push_back(OpIdx);
+  
+  if (!TII->canFoldMemoryOperand(FoldPt, Ops))
+    return 0;
+  
+  DenseMap<unsigned, int>::iterator I = IntervalSSMap.find(vreg);
+  if (I != IntervalSSMap.end()) {
+    SS = I->second;
+  } else {
+    SS = MFI->CreateSpillStackObject(RC->getSize(), RC->getAlignment());
+  }
+  
+  MachineInstr* FMI = TII->foldMemoryOperand(FoldPt, Ops, SS);
+  
+  if (FMI) {
+    LIs->ReplaceMachineInstrInMaps(FoldPt, FMI);
+    FoldPt->eraseFromParent();
+    ++NumFolds;
+    
+    IntervalSSMap[vreg] = SS;
+    CurrSLI = &LSs->getOrCreateInterval(SS, RC);
+    if (CurrSLI->hasAtLeastOneValue())
+      CurrSValNo = CurrSLI->getValNumInfo(0);
+    else
+      CurrSValNo = CurrSLI->getNextValue(SlotIndex(), 0, false,
+                                         LSs->getVNInfoAllocator());
+  }
+  
+  return FMI;
+}
+
+MachineInstr* PreAllocSplitting::FoldRestore(unsigned vreg, 
+                                             const TargetRegisterClass* RC,
+                                             MachineInstr* Barrier,
+                                             MachineBasicBlock* MBB,
+                                             int SS,
+                                     SmallPtrSet<MachineInstr*, 4>& RefsInMBB) {
+  if ((int)RestoreFoldLimit != -1 && RestoreFoldLimit == (int)NumRestoreFolds)
+    return 0;
+                                       
+  // Bail if there are no references in the mbb to anchor the fold search.
+  if (RefsInMBB.empty())
+    return 0;
+  
+  // Can't fold a restore between a call frame setup and teardown.
+  MachineBasicBlock::iterator FoldPt = Barrier;
+  
+  // Advance from barrier to call frame teardown.
+  while (FoldPt != MBB->getFirstTerminator() &&
+         FoldPt->getOpcode() != TRI->getCallFrameDestroyOpcode()) {
+    if (RefsInMBB.count(FoldPt))
+      return 0;
+    
+    ++FoldPt;
+  }
+  
+  if (FoldPt == MBB->getFirstTerminator())
+    return 0;
+  else
+    ++FoldPt;
+  
+  // Now find the restore point.
+  while (FoldPt != MBB->getFirstTerminator() && !RefsInMBB.count(FoldPt)) {
+    if (FoldPt->getOpcode() == TRI->getCallFrameSetupOpcode()) {
+      while (FoldPt != MBB->getFirstTerminator() &&
+             FoldPt->getOpcode() != TRI->getCallFrameDestroyOpcode()) {
+        if (RefsInMBB.count(FoldPt))
+          return 0;
+        
+        ++FoldPt;
+      }
+      
+      if (FoldPt == MBB->getFirstTerminator())
+        return 0;
+    } 
+    
+    ++FoldPt;
+  }
+  
+  if (FoldPt == MBB->getFirstTerminator())
+    return 0;
+  
+  int OpIdx = FoldPt->findRegisterUseOperandIdx(vreg, true);
+  if (OpIdx == -1)
+    return 0;
+  
+  SmallVector<unsigned, 1> Ops;
+  Ops.push_back(OpIdx);
+  
+  if (!TII->canFoldMemoryOperand(FoldPt, Ops))
+    return 0;
+  
+  MachineInstr* FMI = TII->foldMemoryOperand(FoldPt, Ops, SS);
+  
+  if (FMI) {
+    LIs->ReplaceMachineInstrInMaps(FoldPt, FMI);
+    FoldPt->eraseFromParent();
+    ++NumRestoreFolds;
+  }
+  
+  return FMI;
+}
+
+/// SplitRegLiveInterval - Split (spill and restore) the given live interval
+/// so it does not cross the barrier that's being processed. Shrink-wrap
+/// (minimize) the live interval down to its last uses.
+bool PreAllocSplitting::SplitRegLiveInterval(LiveInterval *LI) {
+  DEBUG(dbgs() << "Pre-alloc splitting " << LI->reg << " for " << *Barrier
+               << "  result: ");
+
+  CurrLI = LI;
+
+  // Find the live range where the current interval crosses the barrier.
+  LiveInterval::iterator LR =
+    CurrLI->FindLiveRangeContaining(BarrierIdx.getUseIndex());
+  VNInfo *ValNo = LR->valno;
+
+  assert(!ValNo->isUnused() && "Val# is defined by a dead def?");
+
+  MachineInstr *DefMI = ValNo->isDefAccurate()
+    ? LIs->getInstructionFromIndex(ValNo->def) : NULL;
+
+  // If this would create a new join point, do not split.
+  if (DefMI && createsNewJoin(LR, DefMI->getParent(), Barrier->getParent())) {
+    DEBUG(dbgs() << "FAILED (would create a new join point).\n");
+    return false;
+  }
+
+  // Find all references in the barrier mbb.
+  SmallPtrSet<MachineInstr*, 4> RefsInMBB;
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(CurrLI->reg),
+         E = MRI->reg_end(); I != E; ++I) {
+    MachineInstr *RefMI = &*I;
+    if (RefMI->getParent() == BarrierMBB)
+      RefsInMBB.insert(RefMI);
+  }
+
+  // Find a point to restore the value after the barrier.
+  MachineBasicBlock::iterator RestorePt =
+    findRestorePoint(BarrierMBB, Barrier, LR->end, RefsInMBB);
+  if (RestorePt == BarrierMBB->end()) {
+    DEBUG(dbgs() << "FAILED (could not find a suitable restore point).\n");
+    return false;
+  }
+
+  if (DefMI && LIs->isReMaterializable(*LI, ValNo, DefMI))
+    if (Rematerialize(LI->reg, ValNo, DefMI, RestorePt, RefsInMBB)) {
+      DEBUG(dbgs() << "success (remat).\n");
+      return true;
+    }
+
+  // Add a spill either before the barrier or after the definition.
+  MachineBasicBlock *DefMBB = DefMI ? DefMI->getParent() : NULL;
+  const TargetRegisterClass *RC = MRI->getRegClass(CurrLI->reg);
+  SlotIndex SpillIndex;
+  MachineInstr *SpillMI = NULL;
+  int SS = -1;
+  if (!ValNo->isDefAccurate()) {
+    // If we don't know where the def is we must split just before the barrier.
+    if ((SpillMI = FoldSpill(LI->reg, RC, 0, Barrier,
+                            BarrierMBB, SS, RefsInMBB))) {
+      SpillIndex = LIs->getInstructionIndex(SpillMI);
+    } else {
+      MachineBasicBlock::iterator SpillPt = 
+        findSpillPoint(BarrierMBB, Barrier, NULL, RefsInMBB);
+      if (SpillPt == BarrierMBB->begin()) {
+        DEBUG(dbgs() << "FAILED (could not find a suitable spill point).\n");
+        return false; // No gap to insert spill.
+      }
+      // Add spill.
+    
+      SS = CreateSpillStackSlot(CurrLI->reg, RC);
+      TII->storeRegToStackSlot(*BarrierMBB, SpillPt, CurrLI->reg, true, SS, RC,
+                               TRI);
+      SpillMI = prior(SpillPt);
+      SpillIndex = LIs->InsertMachineInstrInMaps(SpillMI);
+    }
+  } else if (!IsAvailableInStack(DefMBB, CurrLI->reg, ValNo->def,
+                                 LIs->getZeroIndex(), SpillIndex, SS)) {
+    // If it's already split, just restore the value. There is no need to spill
+    // the def again.
+    if (!DefMI) {
+      DEBUG(dbgs() << "FAILED (def is dead).\n");
+      return false; // Def is dead. Do nothing.
+    }
+    
+    if ((SpillMI = FoldSpill(LI->reg, RC, DefMI, Barrier,
+                             BarrierMBB, SS, RefsInMBB))) {
+      SpillIndex = LIs->getInstructionIndex(SpillMI);
+    } else {
+      // Check if it's possible to insert a spill after the def MI.
+      MachineBasicBlock::iterator SpillPt;
+      if (DefMBB == BarrierMBB) {
+        // Add spill after the def and the last use before the barrier.
+        SpillPt = findSpillPoint(BarrierMBB, Barrier, DefMI,
+                                 RefsInMBB);
+        if (SpillPt == DefMBB->begin()) {
+          DEBUG(dbgs() << "FAILED (could not find a suitable spill point).\n");
+          return false; // No gap to insert spill.
+        }
+      } else {
+        SpillPt = llvm::next(MachineBasicBlock::iterator(DefMI));
+        if (SpillPt == DefMBB->end()) {
+          DEBUG(dbgs() << "FAILED (could not find a suitable spill point).\n");
+          return false; // No gap to insert spill.
+        }
+      }
+      // Add spill. 
+      SS = CreateSpillStackSlot(CurrLI->reg, RC);
+      TII->storeRegToStackSlot(*DefMBB, SpillPt, CurrLI->reg, false, SS, RC,
+                               TRI);
+      SpillMI = prior(SpillPt);
+      SpillIndex = LIs->InsertMachineInstrInMaps(SpillMI);
+    }
+  }
+
+  // Remember def instruction index to spill index mapping.
+  if (DefMI && SpillMI)
+    Def2SpillMap[ValNo->def] = SpillIndex;
+
+  // Add restore.
+  bool FoldedRestore = false;
+  SlotIndex RestoreIndex;
+  if (MachineInstr* LMI = FoldRestore(CurrLI->reg, RC, Barrier,
+                                      BarrierMBB, SS, RefsInMBB)) {
+    RestorePt = LMI;
+    RestoreIndex = LIs->getInstructionIndex(RestorePt);
+    FoldedRestore = true;
+  } else {
+    TII->loadRegFromStackSlot(*BarrierMBB, RestorePt, CurrLI->reg, SS, RC, TRI);
+    MachineInstr *LoadMI = prior(RestorePt);
+    RestoreIndex = LIs->InsertMachineInstrInMaps(LoadMI);
+  }
+
+  // Update spill stack slot live interval.
+  UpdateSpillSlotInterval(ValNo, SpillIndex.getUseIndex().getNextSlot(),
+                          RestoreIndex.getDefIndex());
+
+  ReconstructLiveInterval(CurrLI);
+
+  if (!FoldedRestore) {
+    SlotIndex RestoreIdx = LIs->getInstructionIndex(prior(RestorePt));
+    RestoreIdx = RestoreIdx.getDefIndex();
+    RenumberValno(CurrLI->findDefinedVNInfoForRegInt(RestoreIdx));
+  }
+  
+  ++NumSplits;
+  DEBUG(dbgs() << "success.\n");
+  return true;
+}
+
+/// SplitRegLiveIntervals - Split all register live intervals that cross the
+/// barrier that's being processed.
+bool
+PreAllocSplitting::SplitRegLiveIntervals(const TargetRegisterClass **RCs,
+                                         SmallPtrSet<LiveInterval*, 8>& Split) {
+  // First find all the virtual registers whose live intervals are intercepted
+  // by the current barrier.
+  SmallVector<LiveInterval*, 8> Intervals;
+  for (const TargetRegisterClass **RC = RCs; *RC; ++RC) {
+    // FIXME: If it's not safe to move any instruction that defines the barrier
+    // register class, then it means there are some special dependencies which
+    // codegen is not modelling. Ignore these barriers for now.
+    if (!TII->isSafeToMoveRegClassDefs(*RC))
+      continue;
+    const std::vector<unsigned> &VRs = MRI->getRegClassVirtRegs(*RC);
+    for (unsigned i = 0, e = VRs.size(); i != e; ++i) {
+      unsigned Reg = VRs[i];
+      if (!LIs->hasInterval(Reg))
+        continue;
+      LiveInterval *LI = &LIs->getInterval(Reg);
+      if (LI->liveAt(BarrierIdx) && !Barrier->readsRegister(Reg))
+        // Virtual register live interval is intercepted by the barrier. We
+        // should split and shrink wrap its interval if possible.
+        Intervals.push_back(LI);
+    }
+  }
+
+  // Process the affected live intervals.
+  bool Change = false;
+  while (!Intervals.empty()) {
+    if (PreSplitLimit != -1 && (int)NumSplits == PreSplitLimit)
+      break;
+    LiveInterval *LI = Intervals.back();
+    Intervals.pop_back();
+    bool result = SplitRegLiveInterval(LI);
+    if (result) Split.insert(LI);
+    Change |= result;
+  }
+
+  return Change;
+}
+
+unsigned PreAllocSplitting::getNumberOfNonSpills(
+                                  SmallPtrSet<MachineInstr*, 4>& MIs,
+                                  unsigned Reg, int FrameIndex,
+                                  bool& FeedsTwoAddr) {
+  unsigned NonSpills = 0;
+  for (SmallPtrSet<MachineInstr*, 4>::iterator UI = MIs.begin(), UE = MIs.end();
+       UI != UE; ++UI) {
+    int StoreFrameIndex;
+    unsigned StoreVReg = TII->isStoreToStackSlot(*UI, StoreFrameIndex);
+    if (StoreVReg != Reg || StoreFrameIndex != FrameIndex)
+      ++NonSpills;
+    
+    int DefIdx = (*UI)->findRegisterDefOperandIdx(Reg);
+    if (DefIdx != -1 && (*UI)->isRegTiedToUseOperand(DefIdx))
+      FeedsTwoAddr = true;
+  }
+  
+  return NonSpills;
+}
+
+/// removeDeadSpills - After splitting, filter through all intervals we've
+/// split, and see if any of the spills are unnecessary.  If so, remove them.
+bool PreAllocSplitting::removeDeadSpills(SmallPtrSet<LiveInterval*, 8>& split) {
+  bool changed = false;
+  
+  // Walk over all of the live intervals that were touched by the splitter,
+  // and see if we can do any DCE and/or folding.
+  for (SmallPtrSet<LiveInterval*, 8>::iterator LI = split.begin(),
+       LE = split.end(); LI != LE; ++LI) {
+    DenseMap<VNInfo*, SmallPtrSet<MachineInstr*, 4> > VNUseCount;
+    
+    // First, collect all the uses of the vreg, and sort them by their
+    // reaching definition (VNInfo).
+    for (MachineRegisterInfo::use_iterator UI = MRI->use_begin((*LI)->reg),
+         UE = MRI->use_end(); UI != UE; ++UI) {
+      SlotIndex index = LIs->getInstructionIndex(&*UI);
+      index = index.getUseIndex();
+      
+      const LiveRange* LR = (*LI)->getLiveRangeContaining(index);
+      VNUseCount[LR->valno].insert(&*UI);
+    }
+    
+    // Now, take the definitions (VNInfo's) one at a time and try to DCE 
+    // and/or fold them away.
+    for (LiveInterval::vni_iterator VI = (*LI)->vni_begin(),
+         VE = (*LI)->vni_end(); VI != VE; ++VI) {
+      
+      if (DeadSplitLimit != -1 && (int)NumDeadSpills == DeadSplitLimit) 
+        return changed;
+      
+      VNInfo* CurrVN = *VI;
+      
+      // We don't currently try to handle definitions with PHI kills, because
+      // it would involve processing more than one VNInfo at once.
+      if (CurrVN->hasPHIKill()) continue;
+      
+      // We also don't try to handle the results of PHI joins, since there's
+      // no defining instruction to analyze.
+      if (!CurrVN->isDefAccurate() || CurrVN->isUnused()) continue;
+    
+      // We're only interested in eliminating cruft introduced by the splitter,
+      // which is of the form load-use or load-use-store.  First, check that
+      // the
+      // definition is a load, and remember what stack slot we loaded it from.
+      MachineInstr* DefMI = LIs->getInstructionFromIndex(CurrVN->def);
+      int FrameIndex;
+      if (!TII->isLoadFromStackSlot(DefMI, FrameIndex)) continue;
+      
+      // If the definition has no uses at all, just DCE it.
+      if (VNUseCount[CurrVN].size() == 0) {
+        LIs->RemoveMachineInstrFromMaps(DefMI);
+        (*LI)->removeValNo(CurrVN);
+        DefMI->eraseFromParent();
+        VNUseCount.erase(CurrVN);
+        ++NumDeadSpills;
+        changed = true;
+        continue;
+      }
+      
+      // Second, get the number of non-store uses of the definition, as well as
+      // a flag indicating whether it feeds into a later two-address definition.
+      bool FeedsTwoAddr = false;
+      unsigned NonSpillCount = getNumberOfNonSpills(VNUseCount[CurrVN],
+                                                    (*LI)->reg, FrameIndex,
+                                                    FeedsTwoAddr);
+      
+      // If there's one non-store use and it doesn't feed a two-addr, then
+      // this is a load-use-store case that we can try to fold.
+      if (NonSpillCount == 1 && !FeedsTwoAddr) {
+        // Start by finding the non-store use MachineInstr.
+        SmallPtrSet<MachineInstr*, 4>::iterator UI = VNUseCount[CurrVN].begin();
+        int StoreFrameIndex;
+        unsigned StoreVReg = TII->isStoreToStackSlot(*UI, StoreFrameIndex);
+        while (UI != VNUseCount[CurrVN].end() &&
+               (StoreVReg == (*LI)->reg && StoreFrameIndex == FrameIndex)) {
+          ++UI;
+          if (UI != VNUseCount[CurrVN].end())
+            StoreVReg = TII->isStoreToStackSlot(*UI, StoreFrameIndex);
+        }
+        if (UI == VNUseCount[CurrVN].end()) continue;
+        
+        MachineInstr* use = *UI;
+        
+        // Attempt to fold it away!
+        int OpIdx = use->findRegisterUseOperandIdx((*LI)->reg, false);
+        if (OpIdx == -1) continue;
+        SmallVector<unsigned, 1> Ops;
+        Ops.push_back(OpIdx);
+        if (!TII->canFoldMemoryOperand(use, Ops)) continue;
+
+        MachineInstr* NewMI = TII->foldMemoryOperand(use, Ops, FrameIndex);
+
+        if (!NewMI) continue;
+
+        // Update relevant analyses.
+        LIs->RemoveMachineInstrFromMaps(DefMI);
+        LIs->ReplaceMachineInstrInMaps(use, NewMI);
+        (*LI)->removeValNo(CurrVN);
+
+        DefMI->eraseFromParent();
+        use->eraseFromParent();
+        VNUseCount[CurrVN].erase(use);
+
+        // Remove deleted instructions.  Note that we need to remove them from 
+        // the VNInfo->use map as well, just to be safe.
+        for (SmallPtrSet<MachineInstr*, 4>::iterator II = 
+             VNUseCount[CurrVN].begin(), IE = VNUseCount[CurrVN].end();
+             II != IE; ++II) {
+          for (DenseMap<VNInfo*, SmallPtrSet<MachineInstr*, 4> >::iterator
+               VNI = VNUseCount.begin(), VNE = VNUseCount.end(); VNI != VNE; 
+               ++VNI)
+            if (VNI->first != CurrVN)
+              VNI->second.erase(*II);
+          LIs->RemoveMachineInstrFromMaps(*II);
+          (*II)->eraseFromParent();
+        }
+        
+        VNUseCount.erase(CurrVN);
+
+        for (DenseMap<VNInfo*, SmallPtrSet<MachineInstr*, 4> >::iterator
+             VI = VNUseCount.begin(), VE = VNUseCount.end(); VI != VE; ++VI)
+          if (VI->second.erase(use))
+            VI->second.insert(NewMI);
+
+        ++NumDeadSpills;
+        changed = true;
+        continue;
+      }
+      
+      // If there's more than one non-store instruction, we can't profitably
+      // fold it, so bail.
+      if (NonSpillCount) continue;
+        
+      // Otherwise, this is a load-store case, so DCE them.
+      for (SmallPtrSet<MachineInstr*, 4>::iterator UI = 
+           VNUseCount[CurrVN].begin(), UE = VNUseCount[CurrVN].end();
+           UI != UE; ++UI) {
+        LIs->RemoveMachineInstrFromMaps(*UI);
+        (*UI)->eraseFromParent();
+      }
+        
+      VNUseCount.erase(CurrVN);
+        
+      LIs->RemoveMachineInstrFromMaps(DefMI);
+      (*LI)->removeValNo(CurrVN);
+      DefMI->eraseFromParent();
+      ++NumDeadSpills;
+      changed = true;
+    }
+  }
+  
+  return changed;
+}
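+
+// The per-value decision above boils down to a small classification on the
+// number of non-store uses of a reloaded value (a sketch of the policy,
+// not a function that exists in this file):
+//
+//   enum SpillAction { DCEAll, TryFold, Keep };
+//   SpillAction classify(unsigned NonStoreUses, bool FeedsTwoAddr) {
+//     if (NonStoreUses == 0)
+//       return DCEAll;  // pure load(-store) chain: delete it all
+//     if (NonStoreUses == 1 && !FeedsTwoAddr)
+//       return TryFold; // load-use(-store): try folding the reload
+//     return Keep;      // anything else is not profitable to touch
+//   }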
+
+bool PreAllocSplitting::createsNewJoin(LiveRange* LR,
+                                       MachineBasicBlock* DefMBB,
+                                       MachineBasicBlock* BarrierMBB) {
+  if (DefMBB == BarrierMBB)
+    return false;
+  
+  if (LR->valno->hasPHIKill())
+    return false;
+  
+  SlotIndex MBBEnd = LIs->getMBBEndIdx(BarrierMBB);
+  if (LR->end < MBBEnd)
+    return false;
+  
+  MachineLoopInfo& MLI = getAnalysis<MachineLoopInfo>();
+  if (MLI.getLoopFor(DefMBB) != MLI.getLoopFor(BarrierMBB))
+    return true;
+  
+  MachineDominatorTree& MDT = getAnalysis<MachineDominatorTree>();
+  SmallPtrSet<MachineBasicBlock*, 4> Visited;
+  typedef std::pair<MachineBasicBlock*,
+                    MachineBasicBlock::succ_iterator> ItPair;
+  SmallVector<ItPair, 4> Stack;
+  Stack.push_back(std::make_pair(BarrierMBB, BarrierMBB->succ_begin()));
+  
+  while (!Stack.empty()) {
+    ItPair P = Stack.back();
+    Stack.pop_back();
+    
+    MachineBasicBlock* PredMBB = P.first;
+    MachineBasicBlock::succ_iterator S = P.second;
+    
+    if (S == PredMBB->succ_end())
+      continue;
+    else if (Visited.count(*S)) {
+      Stack.push_back(std::make_pair(PredMBB, ++S));
+      continue;
+    } else
+      Stack.push_back(std::make_pair(PredMBB, S+1));
+    
+    MachineBasicBlock* MBB = *S;
+    Visited.insert(MBB);
+    
+    if (MBB == BarrierMBB)
+      return true;
+    
+    MachineDomTreeNode* DefMDTN = MDT.getNode(DefMBB);
+    MachineDomTreeNode* BarrierMDTN = MDT.getNode(BarrierMBB);
+    MachineDomTreeNode* MDTN = MDT.getNode(MBB)->getIDom();
+    while (MDTN) {
+      if (MDTN == DefMDTN)
+        return true;
+      else if (MDTN == BarrierMDTN)
+        break;
+      MDTN = MDTN->getIDom();
+    }
+    
+    MBBEnd = LIs->getMBBEndIdx(MBB);
+    if (LR->end > MBBEnd)
+      Stack.push_back(std::make_pair(MBB, MBB->succ_begin()));
+  }
+  
+  return false;
+}
+
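+// The traversal above is an iterative DFS that keeps a resumable successor
+// iterator per block, so a path can be picked back up exactly where it was
+// left without recursion. The skeleton (Start, Visited and visit() are
+// stand-ins):
+//
+//   SmallVector<std::pair<MachineBasicBlock*,
+//                         MachineBasicBlock::succ_iterator>, 4> Stack;
+//   Stack.push_back(std::make_pair(Start, Start->succ_begin()));
+//   while (!Stack.empty()) {
+//     MachineBasicBlock *B = Stack.back().first;
+//     MachineBasicBlock::succ_iterator S = Stack.back().second;
+//     if (S == B->succ_end()) { Stack.pop_back(); continue; }
+//     Stack.back().second = S + 1;      // resume after this successor
+//     if (Visited.insert(*S)) {         // first visit: descend
+//       visit(*S);
+//       Stack.push_back(std::make_pair(*S, (*S)->succ_begin()));
+//     }
+//   }
+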
+bool PreAllocSplitting::runOnMachineFunction(MachineFunction &MF) {
+  CurrMF = &MF;
+  TM     = &MF.getTarget();
+  TRI    = TM->getRegisterInfo();
+  TII    = TM->getInstrInfo();
+  MFI    = MF.getFrameInfo();
+  MRI    = &MF.getRegInfo();
+  SIs    = &getAnalysis<SlotIndexes>();
+  LIs    = &getAnalysis<LiveIntervals>();
+  LSs    = &getAnalysis<LiveStacks>();
+  VRM    = &getAnalysis<VirtRegMap>();
+
+  bool MadeChange = false;
+
+  // Make sure blocks are numbered in order.
+  MF.RenumberBlocks();
+
+  MachineBasicBlock *Entry = MF.begin();
+  SmallPtrSet<MachineBasicBlock*,16> Visited;
+
+  SmallPtrSet<LiveInterval*, 8> Split;
+
+  for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*,16> >
+         DFI = df_ext_begin(Entry, Visited), E = df_ext_end(Entry, Visited);
+       DFI != E; ++DFI) {
+    BarrierMBB = *DFI;
+    for (MachineBasicBlock::iterator I = BarrierMBB->begin(),
+           E = BarrierMBB->end(); I != E; ++I) {
+      Barrier = &*I;
+      const TargetRegisterClass **BarrierRCs =
+        Barrier->getDesc().getRegClassBarriers();
+      if (!BarrierRCs)
+        continue;
+      BarrierIdx = LIs->getInstructionIndex(Barrier);
+      MadeChange |= SplitRegLiveIntervals(BarrierRCs, Split);
+    }
+  }
+
+  MadeChange |= removeDeadSpills(Split);
+
+  return MadeChange;
+}
diff --git a/src/LLVM/lib/CodeGen/ProcessImplicitDefs.cpp b/src/LLVM/lib/CodeGen/ProcessImplicitDefs.cpp
new file mode 100644
index 0000000..b8831db
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -0,0 +1,295 @@
+//===---------------------- ProcessImplicitDefs.cpp -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "processimplicitdefs"
+
+#include "llvm/CodeGen/ProcessImplicitDefs.h"
+
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+
+using namespace llvm;
+
+char ProcessImplicitDefs::ID = 0;
+INITIALIZE_PASS(ProcessImplicitDefs, "processimpdefs",
+                "Process Implicit Definitions.", false, false);
+
+void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addPreserved<AliasAnalysis>();
+  AU.addPreserved<LiveVariables>();
+  AU.addRequired<LiveVariables>();
+  AU.addPreservedID(MachineLoopInfoID);
+  AU.addPreservedID(MachineDominatorsID);
+  AU.addPreservedID(TwoAddressInstructionPassID);
+  AU.addPreservedID(PHIEliminationID);
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool
+ProcessImplicitDefs::CanTurnIntoImplicitDef(MachineInstr *MI,
+                                            unsigned Reg, unsigned OpIdx,
+                                            const TargetInstrInfo *tii_,
+                                            SmallSet<unsigned, 8> &ImpDefRegs) {
+  switch(OpIdx) {
+  case 1:
+    return MI->isCopy() && (MI->getOperand(0).getSubReg() == 0 ||
+                            ImpDefRegs.count(MI->getOperand(0).getReg()));
+  case 2:
+    return MI->isSubregToReg() && (MI->getOperand(0).getSubReg() == 0 ||
+                                  ImpDefRegs.count(MI->getOperand(0).getReg()));
+  default: return false;
+  }
+}
+
+static bool isUndefCopy(MachineInstr *MI, unsigned Reg,
+                        const TargetInstrInfo *tii_,
+                        SmallSet<unsigned, 8> &ImpDefRegs) {
+  if (MI->isCopy()) {
+    MachineOperand &MO0 = MI->getOperand(0);
+    MachineOperand &MO1 = MI->getOperand(1);
+    if (MO1.getReg() != Reg)
+      return false;
+    if (!MO0.getSubReg() || ImpDefRegs.count(MO0.getReg()))
+      return true;
+    return false;
+  }
+  return false;
+}
+
+/// processImplicitDefs - Process IMPLICIT_DEF instructions and make sure
+/// there is one implicit_def for each use. Add isUndef marker to
+/// implicit_def defs and their uses.
+bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &fn) {
+
+  DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n"
+               << "********** Function: "
+               << ((Value*)fn.getFunction())->getName() << '\n');
+
+  bool Changed = false;
+
+  const TargetInstrInfo *tii_ = fn.getTarget().getInstrInfo();
+  const TargetRegisterInfo *tri_ = fn.getTarget().getRegisterInfo();
+  MachineRegisterInfo *mri_ = &fn.getRegInfo();
+
+  LiveVariables *lv_ = &getAnalysis<LiveVariables>();
+
+  SmallSet<unsigned, 8> ImpDefRegs;
+  SmallVector<MachineInstr*, 8> ImpDefMIs;
+  SmallVector<MachineInstr*, 4> RUses;
+  SmallPtrSet<MachineBasicBlock*,16> Visited;
+  SmallPtrSet<MachineInstr*, 8> ModInsts;
+
+  MachineBasicBlock *Entry = fn.begin();
+  for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*,16> >
+         DFI = df_ext_begin(Entry, Visited), E = df_ext_end(Entry, Visited);
+       DFI != E; ++DFI) {
+    MachineBasicBlock *MBB = *DFI;
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+         I != E; ) {
+      MachineInstr *MI = &*I;
+      ++I;
+      if (MI->isImplicitDef()) {
+        if (MI->getOperand(0).getSubReg())
+          continue;
+        unsigned Reg = MI->getOperand(0).getReg();
+        ImpDefRegs.insert(Reg);
+        if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+          for (const unsigned *SS = tri_->getSubRegisters(Reg); *SS; ++SS)
+            ImpDefRegs.insert(*SS);
+        }
+        ImpDefMIs.push_back(MI);
+        continue;
+      }
+
+      // Eliminate %reg1032:sub<def> = COPY undef.
+      if (MI->isCopy() && MI->getOperand(0).getSubReg()) {
+        MachineOperand &MO = MI->getOperand(1);
+        if (MO.isUndef() || ImpDefRegs.count(MO.getReg())) {
+          if (MO.isKill()) {
+            LiveVariables::VarInfo& vi = lv_->getVarInfo(MO.getReg());
+            vi.removeKill(MI);
+          }
+          MI->eraseFromParent();
+          Changed = true;
+          continue;
+        }
+      }
+
+      bool ChangedToImpDef = false;
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        MachineOperand& MO = MI->getOperand(i);
+        if (!MO.isReg() || (MO.isDef() && !MO.getSubReg()) || MO.isUndef())
+          continue;
+        unsigned Reg = MO.getReg();
+        if (!Reg)
+          continue;
+        if (!ImpDefRegs.count(Reg))
+          continue;
+        // Use is a copy, just turn it into an implicit_def.
+        if (CanTurnIntoImplicitDef(MI, Reg, i, tii_, ImpDefRegs)) {
+          bool isKill = MO.isKill();
+          MI->setDesc(tii_->get(TargetOpcode::IMPLICIT_DEF));
+          for (int j = MI->getNumOperands() - 1, ee = 0; j > ee; --j)
+            MI->RemoveOperand(j);
+          if (isKill) {
+            ImpDefRegs.erase(Reg);
+            LiveVariables::VarInfo& vi = lv_->getVarInfo(Reg);
+            vi.removeKill(MI);
+          }
+          ChangedToImpDef = true;
+          Changed = true;
+          break;
+        }
+
+        Changed = true;
+        MO.setIsUndef();
+        // This is a partial register redef of an implicit def.
+        // Make sure the whole register is defined by the instruction.
+        if (MO.isDef()) {
+          MI->addRegisterDefined(Reg);
+          continue;
+        }
+        if (MO.isKill() || MI->isRegTiedToDefOperand(i)) {
+          // Make sure other uses of the same register are also marked undef.
+          for (unsigned j = i+1; j != e; ++j) {
+            MachineOperand &MOJ = MI->getOperand(j);
+            if (MOJ.isReg() && MOJ.isUse() && MOJ.getReg() == Reg)
+              MOJ.setIsUndef();
+          }
+          ImpDefRegs.erase(Reg);
+        }
+      }
+
+      if (ChangedToImpDef) {
+        // Backtrack to process this new implicit_def.
+        --I;
+      } else {
+        for (unsigned i = 0; i != MI->getNumOperands(); ++i) {
+          MachineOperand& MO = MI->getOperand(i);
+          if (!MO.isReg() || !MO.isDef())
+            continue;
+          ImpDefRegs.erase(MO.getReg());
+        }
+      }
+    }
+
+    // Any outstanding liveout implicit_def's?
+    for (unsigned i = 0, e = ImpDefMIs.size(); i != e; ++i) {
+      MachineInstr *MI = ImpDefMIs[i];
+      unsigned Reg = MI->getOperand(0).getReg();
+      if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+          !ImpDefRegs.count(Reg)) {
+        // Delete all "local" implicit_def's. That includes those which define
+        // physical registers, since they cannot be live out.
+        MI->eraseFromParent();
+        Changed = true;
+        continue;
+      }
+
+      // If there are multiple defs of the same register and at least one
+      // is not an implicit_def, do not insert implicit_def's before the
+      // uses.
+      bool Skip = false;
+      SmallVector<MachineInstr*, 4> DeadImpDefs;
+      for (MachineRegisterInfo::def_iterator DI = mri_->def_begin(Reg),
+             DE = mri_->def_end(); DI != DE; ++DI) {
+        MachineInstr *DeadImpDef = &*DI;
+        if (!DeadImpDef->isImplicitDef()) {
+          Skip = true;
+          break;
+        }
+        DeadImpDefs.push_back(DeadImpDef);
+      }
+      if (Skip)
+        continue;
+
+      // The only implicit_defs we want to keep are those that are live
+      // out of their block.
+      for (unsigned j = 0, ee = DeadImpDefs.size(); j != ee; ++j)
+        DeadImpDefs[j]->eraseFromParent();
+      Changed = true;
+
+      // Process each use instruction once.
+      for (MachineRegisterInfo::use_iterator UI = mri_->use_begin(Reg),
+             UE = mri_->use_end(); UI != UE; ++UI) {
+        if (UI.getOperand().isUndef())
+          continue;
+        MachineInstr *RMI = &*UI;
+        if (ModInsts.insert(RMI))
+          RUses.push_back(RMI);
+      }
+
+      for (unsigned i = 0, e = RUses.size(); i != e; ++i) {
+        MachineInstr *RMI = RUses[i];
+
+        // Turn a copy use into an implicit_def.
+        if (isUndefCopy(RMI, Reg, tii_, ImpDefRegs)) {
+          RMI->setDesc(tii_->get(TargetOpcode::IMPLICIT_DEF));
+
+          bool isKill = false;
+          SmallVector<unsigned, 4> Ops;
+          for (unsigned j = 0, ee = RMI->getNumOperands(); j != ee; ++j) {
+            MachineOperand &RRMO = RMI->getOperand(j);
+            if (RRMO.isReg() && RRMO.getReg() == Reg) {
+              Ops.push_back(j);
+              if (RRMO.isKill())
+                isKill = true;
+            }
+          }
+          // Leave the other operands alone.
+          for (unsigned j = 0, ee = Ops.size(); j != ee; ++j) {
+            unsigned OpIdx = Ops[j];
+            RMI->RemoveOperand(OpIdx-j);
+          }
+
+          // Update LiveVariables varinfo if the instruction is a kill.
+          if (isKill) {
+            LiveVariables::VarInfo& vi = lv_->getVarInfo(Reg);
+            vi.removeKill(RMI);
+          }
+          continue;
+        }
+
+        // Replace Reg with a new vreg that's marked implicit.
+        const TargetRegisterClass* RC = mri_->getRegClass(Reg);
+        unsigned NewVReg = mri_->createVirtualRegister(RC);
+        bool isKill = true;
+        for (unsigned j = 0, ee = RMI->getNumOperands(); j != ee; ++j) {
+          MachineOperand &RRMO = RMI->getOperand(j);
+          if (RRMO.isReg() && RRMO.getReg() == Reg) {
+            RRMO.setReg(NewVReg);
+            RRMO.setIsUndef();
+            if (isKill) {
+              // Only the first operand of NewVReg is marked kill.
+              RRMO.setIsKill();
+              isKill = false;
+            }
+          }
+        }
+      }
+      RUses.clear();
+      ModInsts.clear();
+    }
+    ImpDefRegs.clear();
+    ImpDefMIs.clear();
+  }
+
+  return Changed;
+}
+
diff --git a/src/LLVM/lib/CodeGen/PrologEpilogInserter.cpp b/src/LLVM/lib/CodeGen/PrologEpilogInserter.cpp
new file mode 100644
index 0000000..166cf96
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PrologEpilogInserter.cpp
@@ -0,0 +1,942 @@
+//===-- PrologEpilogInserter.cpp - Insert Prolog/Epilog code in function --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is responsible for finalizing the function's frame layout, saving
+// callee saved registers, and for emitting prolog & epilog code for the
+// function.
+//
+// This pass must be run after register allocation.  After this pass is
+// executed, it is illegal to construct MO_FrameIndex operands.
+//
+// This pass provides an optional shrink wrapping variant of prolog/epilog
+// insertion, enabled via --shrink-wrap. See ShrinkWrapping.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PrologEpilogInserter.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include <climits>
+
+using namespace llvm;
+
+char PEI::ID = 0;
+
+INITIALIZE_PASS(PEI, "prologepilog",
+                "Prologue/Epilogue Insertion", false, false);
+
+/// createPrologEpilogCodeInserter - This function returns a pass that inserts
+/// prolog and epilog code, and eliminates abstract frame references.
+///
+FunctionPass *llvm::createPrologEpilogCodeInserter() { return new PEI(); }
+
+/// runOnMachineFunction - Insert prolog/epilog code and replace abstract
+/// frame indexes with appropriate references.
+///
+bool PEI::runOnMachineFunction(MachineFunction &Fn) {
+  const Function* F = Fn.getFunction();
+  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
+  RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : NULL;
+  FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn);
+  FrameConstantRegMap.clear();
+
+  // Calculate the MaxCallFrameSize and AdjustsStack variables for the
+  // function's frame information. Also eliminates call frame pseudo
+  // instructions.
+  calculateCallsInformation(Fn);
+
+  // Allow the target machine to make some adjustments to the function
+  // e.g. UsedPhysRegs before calculateCalleeSavedRegisters.
+  TRI->processFunctionBeforeCalleeSavedScan(Fn, RS);
+
+  // Scan the function for modified callee saved registers and insert spill code
+  // for any callee saved registers that are modified.
+  calculateCalleeSavedRegisters(Fn);
+
+  // Determine placement of CSR spill/restore code:
+  //  - With shrink wrapping, place spills and restores to tightly
+  //    enclose regions in the Machine CFG of the function where
+  //    they are used.
+  //  - Without shrink wrapping (default), place all spills in the
+  //    entry block, all restores in return blocks.
+  placeCSRSpillsAndRestores(Fn);
+
+  // Add the code to save and restore the callee saved registers
+  if (!F->hasFnAttr(Attribute::Naked))
+    insertCSRSpillsAndRestores(Fn);
+
+  // Allow the target machine to make final modifications to the function
+  // before the frame layout is finalized.
+  TRI->processFunctionBeforeFrameFinalized(Fn);
+
+  // Calculate actual frame offsets for all abstract stack objects...
+  calculateFrameObjectOffsets(Fn);
+
+  // Add prolog and epilog code to the function.  This function is required
+  // to align the stack frame as necessary for any stack variables or
+  // called functions.  Because of this, calculateCalleeSavedRegisters()
+  // must be called before this function in order to set the AdjustsStack
+  // and MaxCallFrameSize variables.
+  if (!F->hasFnAttr(Attribute::Naked))
+    insertPrologEpilogCode(Fn);
+
+  // Replace all MO_FrameIndex operands with physical register references
+  // and actual offsets.
+  //
+  replaceFrameIndices(Fn);
+
+  // If register scavenging is needed and we've enabled doing it as a
+  // post-pass, scavenge the virtual registers that frame index elimination
+  // inserted.
+  if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging)
+    scavengeFrameVirtualRegs(Fn);
+
+  delete RS;
+  clearAllSets();
+  return true;
+}
+
+#if 0
+void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  if (ShrinkWrapping || ShrinkWrapFunc != "") {
+    AU.addRequired<MachineLoopInfo>();
+    AU.addRequired<MachineDominatorTree>();
+  }
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addPreserved<MachineDominatorTree>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+#endif
+
+/// calculateCallsInformation - Calculate the MaxCallFrameSize and AdjustsStack
+/// variables for the function's frame information and eliminate call frame
+/// pseudo instructions.
+void PEI::calculateCallsInformation(MachineFunction &Fn) {
+  const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+  unsigned MaxCallFrameSize = 0;
+  bool AdjustsStack = MFI->adjustsStack();
+
+  // Get the function call frame set-up and tear-down instruction opcode
+  int FrameSetupOpcode   = RegInfo->getCallFrameSetupOpcode();
+  int FrameDestroyOpcode = RegInfo->getCallFrameDestroyOpcode();
+
+  // Early exit for targets which have no call frame setup/destroy pseudo
+  // instructions.
+  if (FrameSetupOpcode == -1 && FrameDestroyOpcode == -1)
+    return;
+
+  std::vector<MachineBasicBlock::iterator> FrameSDOps;
+  for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+      if (I->getOpcode() == FrameSetupOpcode ||
+          I->getOpcode() == FrameDestroyOpcode) {
+        assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo"
+               " instructions should have a single immediate argument!");
+        unsigned Size = I->getOperand(0).getImm();
+        if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
+        AdjustsStack = true;
+        FrameSDOps.push_back(I);
+      }
+
+  MFI->setAdjustsStack(AdjustsStack);
+  MFI->setMaxCallFrameSize(MaxCallFrameSize);
+
+  for (std::vector<MachineBasicBlock::iterator>::iterator
+         i = FrameSDOps.begin(), e = FrameSDOps.end(); i != e; ++i) {
+    MachineBasicBlock::iterator I = *i;
+
+    // If call frames are not being included as part of the stack frame, and
+    // the target doesn't indicate otherwise, remove the call frame pseudos
+    // here. The sub/add sp instruction pairs are still inserted, but we don't
+    // need to track the SP adjustment for frame index elimination.
+    if (RegInfo->canSimplifyCallFramePseudos(Fn))
+      RegInfo->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I);
+  }
+}
+
+
+/// calculateCalleeSavedRegisters - Scan the function for modified callee saved
+/// registers.
+void PEI::calculateCalleeSavedRegisters(MachineFunction &Fn) {
+  const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+  const TargetFrameInfo *TFI = Fn.getTarget().getFrameInfo();
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+  // Get the callee saved register list...
+  const unsigned *CSRegs = RegInfo->getCalleeSavedRegs(&Fn);
+
+  // These are used to keep track of the callee-save area. Initialize them.
+  MinCSFrameIndex = INT_MAX;
+  MaxCSFrameIndex = 0;
+
+  // Early exit for targets which have no callee saved registers.
+  if (CSRegs == 0 || CSRegs[0] == 0)
+    return;
+
+  // In Naked functions we aren't going to save any registers.
+  if (Fn.getFunction()->hasFnAttr(Attribute::Naked))
+    return;
+
+  std::vector<CalleeSavedInfo> CSI;
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    unsigned Reg = CSRegs[i];
+    if (Fn.getRegInfo().isPhysRegUsed(Reg)) {
+      // If the reg is modified, save it!
+      CSI.push_back(CalleeSavedInfo(Reg));
+    } else {
+      for (const unsigned *AliasSet = RegInfo->getAliasSet(Reg);
+           *AliasSet; ++AliasSet) {  // Check alias registers too.
+        if (Fn.getRegInfo().isPhysRegUsed(*AliasSet)) {
+          CSI.push_back(CalleeSavedInfo(Reg));
+          break;
+        }
+      }
+    }
+  }
+
+  if (CSI.empty())
+    return;   // Early exit if no callee saved registers are modified!
+
+  unsigned NumFixedSpillSlots;
+  const TargetFrameInfo::SpillSlot *FixedSpillSlots =
+    TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots);
+
+  // Now that we know which registers need to be saved and restored, allocate
+  // stack slots for them.
+  for (std::vector<CalleeSavedInfo>::iterator
+         I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+    unsigned Reg = I->getReg();
+    const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
+
+    int FrameIdx;
+    if (RegInfo->hasReservedSpillSlot(Fn, Reg, FrameIdx)) {
+      I->setFrameIdx(FrameIdx);
+      continue;
+    }
+
+    // Check to see if this physreg must be spilled to a particular stack slot
+    // on this target.
+    const TargetFrameInfo::SpillSlot *FixedSlot = FixedSpillSlots;
+    while (FixedSlot != FixedSpillSlots+NumFixedSpillSlots &&
+           FixedSlot->Reg != Reg)
+      ++FixedSlot;
+
+    if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
+      // Nope, just spill it anywhere convenient.
+      unsigned Align = RC->getAlignment();
+      unsigned StackAlign = TFI->getStackAlignment();
+
+      // We may not be able to satisfy the desired alignment specification of
+      // the TargetRegisterClass if the stack alignment is smaller. Use the
+      // min.
+      Align = std::min(Align, StackAlign);
+      FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true);
+      if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
+      if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
+    } else {
+      // Spill it to the stack where we must.
+      FrameIdx = MFI->CreateFixedObject(RC->getSize(), FixedSlot->Offset, true);
+    }
+
+    I->setFrameIdx(FrameIdx);
+  }
+
+  MFI->setCalleeSavedInfo(CSI);
+}
+
+/// insertCSRSpillsAndRestores - Insert spill and restore code for
+/// callee saved registers used in the function, handling shrink wrapping.
+///
+void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
+  // Get callee saved register information.
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+  MFI->setCalleeSavedInfoValid(true);
+
+  // Early exit if no callee saved registers are modified!
+  if (CSI.empty())
+    return;
+
+  const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo();
+  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
+  MachineBasicBlock::iterator I;
+
+  if (! ShrinkWrapThisFunction) {
+    // Spill using target interface.
+    I = EntryBlock->begin();
+    if (!TII.spillCalleeSavedRegisters(*EntryBlock, I, CSI, TRI)) {
+      for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+        // Add the callee-saved register as live-in.
+        // It's killed at the spill.
+        EntryBlock->addLiveIn(CSI[i].getReg());
+
+        // Insert the spill to the stack frame.
+        unsigned Reg = CSI[i].getReg();
+        const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+        TII.storeRegToStackSlot(*EntryBlock, I, Reg, true,
+                                CSI[i].getFrameIdx(), RC, TRI);
+      }
+    }
+
+    // Restore using target interface.
+    for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri) {
+      MachineBasicBlock* MBB = ReturnBlocks[ri];
+      I = MBB->end(); --I;
+
+      // Skip over all terminator instructions, which are part of the return
+      // sequence.
+      MachineBasicBlock::iterator I2 = I;
+      while (I2 != MBB->begin() && (--I2)->getDesc().isTerminator())
+        I = I2;
+
+      bool AtStart = I == MBB->begin();
+      MachineBasicBlock::iterator BeforeI = I;
+      if (!AtStart)
+        --BeforeI;
+
+      // Restore all registers immediately before the return and any
+      // terminators that precede it.
+      if (!TII.restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
+        for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+          unsigned Reg = CSI[i].getReg();
+          const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+          TII.loadRegFromStackSlot(*MBB, I, Reg,
+                                   CSI[i].getFrameIdx(),
+                                   RC, TRI);
+          assert(I != MBB->begin() &&
+                 "loadRegFromStackSlot didn't insert any code!");
+          // Insert in reverse order.  loadRegFromStackSlot can insert
+          // multiple instructions.
+          if (AtStart)
+            I = MBB->begin();
+          else {
+            I = BeforeI;
+            ++I;
+          }
+        }
+      }
+    }
+    return;
+  }
+
+  // Insert spills.
+  std::vector<CalleeSavedInfo> blockCSI;
+  for (CSRegBlockMap::iterator BI = CSRSave.begin(),
+         BE = CSRSave.end(); BI != BE; ++BI) {
+    MachineBasicBlock* MBB = BI->first;
+    CSRegSet save = BI->second;
+
+    if (save.empty())
+      continue;
+
+    blockCSI.clear();
+    for (CSRegSet::iterator RI = save.begin(),
+           RE = save.end(); RI != RE; ++RI) {
+      blockCSI.push_back(CSI[*RI]);
+    }
+    assert(blockCSI.size() > 0 &&
+           "Could not collect callee saved register info");
+
+    I = MBB->begin();
+
+    // When shrink wrapping, use stack slot stores/loads.
+    for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) {
+      // Add the callee-saved register as live-in.
+      // It's killed at the spill.
+      MBB->addLiveIn(blockCSI[i].getReg());
+
+      // Insert the spill to the stack frame.
+      unsigned Reg = blockCSI[i].getReg();
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.storeRegToStackSlot(*MBB, I, Reg,
+                              true,
+                              blockCSI[i].getFrameIdx(),
+                              RC, TRI);
+    }
+  }
+
+  for (CSRegBlockMap::iterator BI = CSRRestore.begin(),
+         BE = CSRRestore.end(); BI != BE; ++BI) {
+    MachineBasicBlock* MBB = BI->first;
+    CSRegSet restore = BI->second;
+
+    if (restore.empty())
+      continue;
+
+    blockCSI.clear();
+    for (CSRegSet::iterator RI = restore.begin(),
+           RE = restore.end(); RI != RE; ++RI) {
+      blockCSI.push_back(CSI[*RI]);
+    }
+    assert(blockCSI.size() > 0 &&
+           "Could not find callee saved register info");
+
+    // If MBB is empty and needs restores, insert at the _beginning_.
+    if (MBB->empty()) {
+      I = MBB->begin();
+    } else {
+      I = MBB->end();
+      --I;
+
+      // Skip over all terminator instructions, which are part of the
+      // return sequence.
+      if (! I->getDesc().isTerminator()) {
+        ++I;
+      } else {
+        MachineBasicBlock::iterator I2 = I;
+        while (I2 != MBB->begin() && (--I2)->getDesc().isTerminator())
+          I = I2;
+      }
+    }
+
+    bool AtStart = I == MBB->begin();
+    MachineBasicBlock::iterator BeforeI = I;
+    if (!AtStart)
+      --BeforeI;
+
+    // Restore all registers immediately before the return and any
+    // terminators that precede it.
+    for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) {
+      unsigned Reg = blockCSI[i].getReg();
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.loadRegFromStackSlot(*MBB, I, Reg,
+                               blockCSI[i].getFrameIdx(),
+                               RC, TRI);
+      assert(I != MBB->begin() &&
+             "loadRegFromStackSlot didn't insert any code!");
+      // Insert in reverse order.  loadRegFromStackSlot can insert
+      // multiple instructions.
+      if (AtStart)
+        I = MBB->begin();
+      else {
+        I = BeforeI;
+        ++I;
+      }
+    }
+  }
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void
+AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
+                  bool StackGrowsDown, int64_t &Offset,
+                  unsigned &MaxAlign) {
+  // If the stack grows down, add the object size to find the lowest address.
+  if (StackGrowsDown)
+    Offset += MFI->getObjectSize(FrameIdx);
+
+  unsigned Align = MFI->getObjectAlignment(FrameIdx);
+
+  // If the alignment of this object is greater than that of the stack, then
+  // increase the stack alignment to match.
+  MaxAlign = std::max(MaxAlign, Align);
+
+  // Adjust to alignment boundary.
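+  // (e.g. an Offset of 13 with Align 8 is rounded up to 16.)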
+  Offset = (Offset + Align - 1) / Align * Align;
+
+  if (StackGrowsDown) {
+    MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+  } else {
+    MFI->setObjectOffset(FrameIdx, Offset);
+    Offset += MFI->getObjectSize(FrameIdx);
+  }
+}
+
+/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
+/// abstract stack objects.
+///
+void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
+  const TargetFrameInfo &TFI = *Fn.getTarget().getFrameInfo();
+
+  bool StackGrowsDown =
+    TFI.getStackGrowthDirection() == TargetFrameInfo::StackGrowsDown;
+
+  // Loop over all of the stack objects, assigning sequential addresses...
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+  // Start at the beginning of the local area.
+  // The Offset is the distance from the stack top in the direction
+  // of stack growth -- so it's always nonnegative.
+  int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+  if (StackGrowsDown)
+    LocalAreaOffset = -LocalAreaOffset;
+  assert(LocalAreaOffset >= 0
+         && "Local area offset should be in direction of stack growth");
+  int64_t Offset = LocalAreaOffset;
+
+  // If there are fixed sized objects that are preallocated in the local area,
+  // non-fixed objects can't be allocated right at the start of the local
+  // area. We currently don't support filling in holes in between fixed sized
+  // objects, so we adjust 'Offset' to point to the end of the last fixed
+  // sized preallocated object.
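+  // (Fixed objects carry negative frame indices, so iterating from
+  // getObjectIndexBegin() up to zero visits exactly the fixed objects.)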
+  for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
+    int64_t FixedOff;
+    if (StackGrowsDown) {
+      // The maximum distance from the stack pointer is at the lower address
+      // of the object -- which is given by its offset. For a down-growing
+      // stack the offset is negative, so we negate it to get the distance.
+      FixedOff = -MFI->getObjectOffset(i);
+    } else {
+      // The maximum distance from the start pointer is at the upper
+      // address of the object.
+      FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i);
+    }
+    if (FixedOff > Offset) Offset = FixedOff;
+  }
+
+  // First assign frame offsets to stack objects that are used to spill
+  // callee saved registers.
+  if (StackGrowsDown) {
+    for (unsigned i = MinCSFrameIndex; i <= MaxCSFrameIndex; ++i) {
+      // If the stack grows down, we need to add the size to find the lowest
+      // address of the object.
+      Offset += MFI->getObjectSize(i);
+
+      unsigned Align = MFI->getObjectAlignment(i);
+      // Adjust to alignment boundary
+      Offset = (Offset+Align-1)/Align*Align;
+
+      MFI->setObjectOffset(i, -Offset);        // Set the computed offset
+    }
+  } else {
+    int MaxCSFI = MaxCSFrameIndex, MinCSFI = MinCSFrameIndex;
+    for (int i = MaxCSFI; i >= MinCSFI ; --i) {
+      unsigned Align = MFI->getObjectAlignment(i);
+      // Adjust to alignment boundary
+      Offset = (Offset+Align-1)/Align*Align;
+
+      MFI->setObjectOffset(i, Offset);
+      Offset += MFI->getObjectSize(i);
+    }
+  }
+
+  unsigned MaxAlign = MFI->getMaxAlignment();
+
+  // Make sure the special register scavenging spill slot is closest to the
+  // frame pointer if a frame pointer is required.
+  const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+  if (RS && RegInfo->hasFP(Fn) && !RegInfo->needsStackRealignment(Fn)) {
+    int SFI = RS->getScavengingFrameIndex();
+    if (SFI >= 0)
+      AdjustStackOffset(MFI, SFI, StackGrowsDown, Offset, MaxAlign);
+  }
+
+  // Make sure that the stack protector comes before the local variables on the
+  // stack.
+  SmallSet<int, 16> LargeStackObjs;
+  if (MFI->getStackProtectorIndex() >= 0) {
+    AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown,
+                      Offset, MaxAlign);
+
+    // Assign large stack objects first.
+    for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+      if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
+        continue;
+      if (RS && (int)i == RS->getScavengingFrameIndex())
+        continue;
+      if (MFI->isDeadObjectIndex(i))
+        continue;
+      if (MFI->getStackProtectorIndex() == (int)i)
+        continue;
+      if (!MFI->MayNeedStackProtector(i))
+        continue;
+
+      AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+      LargeStackObjs.insert(i);
+    }
+  }
+
+  // Then assign frame offsets to stack objects that are not used to spill
+  // callee saved registers.
+  for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+    if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
+      continue;
+    if (RS && (int)i == RS->getScavengingFrameIndex())
+      continue;
+    if (MFI->isDeadObjectIndex(i))
+      continue;
+    if (MFI->getStackProtectorIndex() == (int)i)
+      continue;
+    if (LargeStackObjs.count(i))
+      continue;
+
+    AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+  }
+
+  // Make sure the special register scavenging spill slot is closest to the
+  // stack pointer.
+  if (RS && (!RegInfo->hasFP(Fn) || RegInfo->needsStackRealignment(Fn))) {
+    int SFI = RS->getScavengingFrameIndex();
+    if (SFI >= 0)
+      AdjustStackOffset(MFI, SFI, StackGrowsDown, Offset, MaxAlign);
+  }
+
+  if (!RegInfo->targetHandlesStackFrameRounding()) {
+    // If we have reserved argument space for call sites in the function
+    // immediately on entry to the current function, count it as part of the
+    // overall stack size.
+    if (MFI->adjustsStack() && RegInfo->hasReservedCallFrame(Fn))
+      Offset += MFI->getMaxCallFrameSize();
+
+    // Round up the size to a multiple of the alignment.  If the function has
+    // any calls or alloca's, align to the target's StackAlignment value to
+    // ensure that the callee's frame or the alloca data is suitably aligned;
+    // otherwise, for leaf functions, align to the TransientStackAlignment
+    // value.
+    unsigned StackAlign;
+    if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
+        (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0))
+      StackAlign = TFI.getStackAlignment();
+    else
+      StackAlign = TFI.getTransientStackAlignment();
+
+    // If the frame pointer is eliminated, all frame offsets will be relative to
+    // SP not FP. Align to MaxAlign so this works.
+    StackAlign = std::max(StackAlign, MaxAlign);
+    unsigned AlignMask = StackAlign - 1;
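+    // (e.g. a StackAlign of 16 gives AlignMask 0xF; adding the mask and then
+    // clearing the low bits rounds Offset up to the next multiple of 16.)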
+    Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+  }
+
+  // Update frame info to pretend that this is part of the stack...
+  MFI->setStackSize(Offset - LocalAreaOffset);
+}
+
+/// insertPrologEpilogCode - Scan the function for modified callee saved
+/// registers, insert spill code for these callee saved registers, then add
+/// prolog and epilog code to the function.
+///
+void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
+  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
+
+  // Add prologue to the function...
+  TRI->emitPrologue(Fn);
+
+  // Add epilogue to restore the callee-save registers in each exiting block
+  for (MachineFunction::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) {
+    // If last instruction is a return instruction, add an epilogue
+    if (!I->empty() && I->back().getDesc().isReturn())
+      TRI->emitEpilogue(Fn, *I);
+  }
+}
+
+/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
+/// register references and actual offsets.
+///
+void PEI::replaceFrameIndices(MachineFunction &Fn) {
+  if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
+
+  const TargetMachine &TM = Fn.getTarget();
+  assert(TM.getRegisterInfo() && "TM::getRegisterInfo() must be implemented!");
+  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+  const TargetFrameInfo *TFI = TM.getFrameInfo();
+  bool StackGrowsDown =
+    TFI->getStackGrowthDirection() == TargetFrameInfo::StackGrowsDown;
+  int FrameSetupOpcode   = TRI.getCallFrameSetupOpcode();
+  int FrameDestroyOpcode = TRI.getCallFrameDestroyOpcode();
+
+  for (MachineFunction::iterator BB = Fn.begin(),
+         E = Fn.end(); BB != E; ++BB) {
+#ifndef NDEBUG
+    int SPAdjCount = 0; // frame setup / destroy count.
+#endif
+    int SPAdj = 0;  // SP offset due to call frame setup / destroy.
+    if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB);
+
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
+
+      if (I->getOpcode() == FrameSetupOpcode ||
+          I->getOpcode() == FrameDestroyOpcode) {
+#ifndef NDEBUG
+        // Track whether we see even pairs of them
+        SPAdjCount += I->getOpcode() == FrameSetupOpcode ? 1 : -1;
+#endif
+        // Remember how much SP has been adjusted to create the call
+        // frame.
+        int Size = I->getOperand(0).getImm();
+
+        if ((!StackGrowsDown && I->getOpcode() == FrameSetupOpcode) ||
+            (StackGrowsDown && I->getOpcode() == FrameDestroyOpcode))
+          Size = -Size;
+
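+        // With a down-growing stack, a setup pseudo grows the adjustment and
+        // a destroy pseudo shrinks it; the signs above flip when the stack
+        // grows up.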
+        SPAdj += Size;
+
+        MachineBasicBlock::iterator PrevI = BB->end();
+        if (I != BB->begin()) PrevI = prior(I);
+        TRI.eliminateCallFramePseudoInstr(Fn, *BB, I);
+
+        // Visit the instructions created by eliminateCallFramePseudoInstr().
+        if (PrevI == BB->end())
+          I = BB->begin();     // The replaced instr was the first in the block.
+        else
+          I = llvm::next(PrevI);
+        continue;
+      }
+
+      MachineInstr *MI = I;
+      bool DoIncr = true;
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
+        if (MI->getOperand(i).isFI()) {
+          // Some instructions (e.g. inline asm instructions) can have
+          // multiple frame indices and/or cause eliminateFrameIndex
+          // to insert more than one instruction. We need the register
+          // scavenger to go through all of these instructions so that
+          // it can update its register information. We keep the
+          // iterator at the point before insertion so that we can
+          // revisit them in full.
+          bool AtBeginning = (I == BB->begin());
+          if (!AtBeginning) --I;
+
+          // If this instruction has a FrameIndex operand, we need to
+          // use that target machine register info object to eliminate
+          // it.
+          TargetRegisterInfo::FrameIndexValue Value;
+          unsigned VReg =
+            TRI.eliminateFrameIndex(MI, SPAdj, &Value,
+                                    FrameIndexVirtualScavenging ?  NULL : RS);
+          if (VReg) {
+            assert (FrameIndexVirtualScavenging &&
+                    "Not scavenging, but virtual returned from "
+                    "eliminateFrameIndex()!");
+            FrameConstantRegMap[VReg] = FrameConstantEntry(Value, SPAdj);
+          }
+
+          // Reset the iterator if we were at the beginning of the BB.
+          if (AtBeginning) {
+            I = BB->begin();
+            DoIncr = false;
+          }
+
+          MI = 0;
+          break;
+        }
+
+      if (DoIncr && I != BB->end()) ++I;
+
+      // Update register states.
+      if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI);
+    }
+
+    // If we have evenly matched pairs of frame setup / destroy instructions,
+    // make sure the adjustments come out to zero. If we don't have matched
+    // pairs, we can't be sure the missing bit isn't in another basic block
+    // due to a custom inserter playing tricks, so just asserting SPAdj==0
+    // isn't sufficient. See tMOVCC on Thumb1, for example.
+    assert((SPAdjCount || SPAdj == 0) &&
+           "Unbalanced call frame setup / destroy pairs?");
+  }
+}
+
+/// findLastUseReg - find the killing use of the specified register within
+/// the instruction range.
+static MachineBasicBlock::iterator
+findLastUseReg(MachineBasicBlock::iterator I, MachineBasicBlock::iterator ME,
+               unsigned Reg) {
+  // Scan forward to find the last use of this virtual register
+  for (++I; I != ME; ++I) {
+    MachineInstr *MI = I;
+    bool isDefInsn = false;
+    bool isKillInsn = false;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
+      if (MI->getOperand(i).isReg()) {
+        unsigned OpReg = MI->getOperand(i).getReg();
+        if (OpReg == 0 || !TargetRegisterInfo::isVirtualRegister(OpReg))
+          continue;
+        assert (OpReg == Reg
+                && "overlapping use of scavenged index register!");
+        // If this is the killing use, we have a candidate.
+        if (MI->getOperand(i).isKill())
+          isKillInsn = true;
+        else if (MI->getOperand(i).isDef())
+          isDefInsn = true;
+      }
+    if (isKillInsn && !isDefInsn)
+      return I;
+  }
+  // If we hit the end of the basic block, there was no kill of
+  // the virtual register, which is wrong.
+  assert (0 && "scavenged index register never killed!");
+  return ME;
+}
+
+/// scavengeFrameVirtualRegs - Replace all frame index virtual registers
+/// with physical registers. Use the register scavenger to find an
+/// appropriate register to use.
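+///
+/// A sketch of the approach: each virtual register that eliminateFrameIndex
+/// created to materialize a frame address is rewritten to a physical scratch
+/// register found by the scavenger, reusing the previous scratch register
+/// when it is known to still hold the same value.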
+void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
+  // Run through the instructions and find any virtual registers.
+  for (MachineFunction::iterator BB = Fn.begin(),
+       E = Fn.end(); BB != E; ++BB) {
+    RS->enterBasicBlock(BB);
+
+    // FIXME: The logic flow in this function is still too convoluted.
+    // It needs a cleanup refactoring. Do that in preparation for tracking
+    // more than one scratch register value and using ranges to find
+    // available scratch registers.
+    unsigned CurrentVirtReg = 0;
+    unsigned CurrentScratchReg = 0;
+    bool havePrevValue = false;
+    TargetRegisterInfo::FrameIndexValue PrevValue(0,0);
+    TargetRegisterInfo::FrameIndexValue Value(0,0);
+    MachineInstr *PrevLastUseMI = NULL;
+    unsigned PrevLastUseOp = 0;
+    bool trackingCurrentValue = false;
+    int SPAdj = 0;
+
+    // The instruction stream may change in the loop, so check BB->end()
+    // directly.
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
+      MachineInstr *MI = I;
+      bool isDefInsn = false;
+      bool isKillInsn = false;
+      bool clobbersScratchReg = false;
+      bool DoIncr = true;
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        if (MI->getOperand(i).isReg()) {
+          MachineOperand &MO = MI->getOperand(i);
+          unsigned Reg = MO.getReg();
+          if (Reg == 0)
+            continue;
+          if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+            // If we have a previous scratch reg, check and see if anything
+            // here kills whatever value is in there.
+            if (Reg == CurrentScratchReg) {
+              if (MO.isUse()) {
+                // Two-address operands implicitly kill
+                if (MO.isKill() || MI->isRegTiedToDefOperand(i))
+                  clobbersScratchReg = true;
+              } else {
+                assert (MO.isDef());
+                clobbersScratchReg = true;
+              }
+            }
+            continue;
+          }
+          // If this is a def, remember that this insn defines the value.
+          // This lets us properly consider insns which re-use the scratch
+          // register, such as r2 = sub r2, #imm, in the middle of the
+          // scratch range.
+          if (MO.isDef())
+            isDefInsn = true;
+
+          // Have we already allocated a scratch register for this virtual?
+          if (Reg != CurrentVirtReg) {
+            // When we first encounter a new virtual register, it
+            // must be a definition.
+            assert(MI->getOperand(i).isDef() &&
+                   "frame index virtual missing def!");
+            // We can't have nested virtual register live ranges because
+            // there's only a guarantee of one scavenged register at a time.
+            assert (CurrentVirtReg == 0 &&
+                    "overlapping frame index virtual registers!");
+
+            // If the target gave us information about what's in the register,
+            // we can use that to re-use scratch regs.
+            DenseMap<unsigned, FrameConstantEntry>::iterator Entry =
+              FrameConstantRegMap.find(Reg);
+            trackingCurrentValue = Entry != FrameConstantRegMap.end();
+            if (trackingCurrentValue) {
+              SPAdj = (*Entry).second.second;
+              Value = (*Entry).second.first;
+            } else {
+              SPAdj = 0;
+              Value.first = 0;
+              Value.second = 0;
+            }
+
+            // If the scratch register from the last allocation is still
+            // available, see if the value matches. If it does, just re-use it.
+            if (trackingCurrentValue && havePrevValue && PrevValue == Value) {
+              // FIXME: This assumes that the instructions in the live range
+              // for the virtual register are exclusively for the purpose
+              // of populating the value in the register. That's reasonable
+              // for these frame index registers, but it's still a very, very
+              // strong assumption. rdar://7322732. Better would be to
+              // explicitly check each instruction in the range for references
+              // to the virtual register. Only delete those insns that
+              // touch the virtual register.
+
+              // Find the last use of the new virtual register. Remove all
+              // instructions between here and there, and update the current
+              // instruction to reference the last use insn instead.
+              MachineBasicBlock::iterator LastUseMI =
+                findLastUseReg(I, BB->end(), Reg);
+
+              // Remove all instructions up 'til the last use, since they're
+              // just calculating the value we already have.
+              BB->erase(I, LastUseMI);
+              I = LastUseMI;
+
+              // Extend the live range of the scratch register
+              PrevLastUseMI->getOperand(PrevLastUseOp).setIsKill(false);
+              RS->setUsed(CurrentScratchReg);
+              CurrentVirtReg = Reg;
+
+              // We deleted the instruction we were scanning the operands of.
+              // Jump back to the instruction iterator loop. Don't increment
+              // past this instruction since we updated the iterator already.
+              DoIncr = false;
+              break;
+            }
+
+            // Scavenge a new scratch register
+            CurrentVirtReg = Reg;
+            const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg);
+            CurrentScratchReg = RS->scavengeRegister(RC, I, SPAdj);
+            PrevValue = Value;
+          }
+          // replace this reference to the virtual register with the
+          // scratch register.
+          assert (CurrentScratchReg && "Missing scratch register!");
+          MI->getOperand(i).setReg(CurrentScratchReg);
+
+          if (MI->getOperand(i).isKill()) {
+            isKillInsn = true;
+            PrevLastUseOp = i;
+            PrevLastUseMI = MI;
+          }
+        }
+      }
+      // If this is the last use of the scratch, stop tracking it. The
+      // last use will be a kill operand in an instruction that does
+      // not also define the scratch register.
+      if (isKillInsn && !isDefInsn) {
+        CurrentVirtReg = 0;
+        havePrevValue = trackingCurrentValue;
+      }
+      // Similarly, notice if instruction clobbered the value in the
+      // register we're tracking for possible later reuse. This is noted
+      // above, but enforced here since the value is still live while we
+      // process the rest of the operands of the instruction.
+      if (clobbersScratchReg) {
+        havePrevValue = false;
+        CurrentScratchReg = 0;
+      }
+      if (DoIncr) {
+        RS->forward(I);
+        ++I;
+      }
+    }
+  }
+}
diff --git a/src/LLVM/lib/CodeGen/PrologEpilogInserter.h b/src/LLVM/lib/CodeGen/PrologEpilogInserter.h
new file mode 100644
index 0000000..e56c36f
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PrologEpilogInserter.h
@@ -0,0 +1,182 @@
+//===-- PrologEpilogInserter.h - Prolog/Epilog code insertion -*- C++ -* --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is responsible for finalizing the function's frame layout, saving
+// callee saved registers, and for emitting prolog & epilog code for the
+// function.
+//
+// This pass must be run after register allocation.  After this pass is
+// executed, it is illegal to construct MO_FrameIndex operands.
+//
+// This pass also implements a shrink wrapping variant of prolog/epilog
+// insertion.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_PEI_H
+#define LLVM_CODEGEN_PEI_H
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+  class RegScavenger;
+  class MachineBasicBlock;
+
+  class PEI : public MachineFunctionPass {
+  public:
+    static char ID;
+    PEI() : MachineFunctionPass(ID) {}
+
+    const char *getPassName() const {
+      return "Prolog/Epilog Insertion & Frame Finalization";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+    /// runOnMachineFunction - Insert prolog/epilog code and replace abstract
+    /// frame indexes with appropriate references.
+    ///
+    bool runOnMachineFunction(MachineFunction &Fn);
+
+  private:
+    RegScavenger *RS;
+
+    // MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved
+    // stack frame indexes.
+    unsigned MinCSFrameIndex, MaxCSFrameIndex;
+
+    // Analysis info for spill/restore placement.
+    // "CSR": "callee saved register".
+
+    // CSRegSet contains indices into the Callee Saved Register Info
+    // vector built by calculateCalleeSavedRegisters() and accessed
+    // via MF.getFrameInfo()->getCalleeSavedInfo().
+    typedef SparseBitVector<> CSRegSet;
+
+    // CSRegBlockMap maps MachineBasicBlocks to sets of callee
+    // saved register indices.
+    typedef DenseMap<MachineBasicBlock*, CSRegSet> CSRegBlockMap;
+
+    // Set and maps for computing CSR spill/restore placement:
+    //  used in function (UsedCSRegs)
+    //  used in a basic block (CSRUsed)
+    //  anticipatable in a basic block (Antic{In,Out})
+    //  available in a basic block (Avail{In,Out})
+    //  to be spilled at the entry to a basic block (CSRSave)
+    //  to be restored at the end of a basic block (CSRRestore)
+    CSRegSet UsedCSRegs;
+    CSRegBlockMap CSRUsed;
+    CSRegBlockMap AnticIn, AnticOut;
+    CSRegBlockMap AvailIn, AvailOut;
+    CSRegBlockMap CSRSave;
+    CSRegBlockMap CSRRestore;
+
+    // Entry and return blocks of the current function.
+    MachineBasicBlock* EntryBlock;
+    SmallVector<MachineBasicBlock*, 4> ReturnBlocks;
+
+    // Map of MBBs to top level MachineLoops.
+    DenseMap<MachineBasicBlock*, MachineLoop*> TLLoops;
+
+    // Flag to control shrink wrapping per-function:
+    // may choose to skip shrink wrapping for certain
+    // functions.
+    bool ShrinkWrapThisFunction;
+
+    // Flag to control whether to use the register scavenger to resolve
+    // frame index materialization registers. Set according to
+    // TRI->requiresFrameIndexScavenging() for the current function.
+    bool FrameIndexVirtualScavenging;
+
+    // When using the scavenger post-pass to resolve frame reference
+    // materialization registers, maintain a map of the registers to
+    // the constant value and SP adjustment associated with it.
+    typedef std::pair<TargetRegisterInfo::FrameIndexValue, int>
+      FrameConstantEntry;
+    DenseMap<unsigned, FrameConstantEntry> FrameConstantRegMap;
+
+#ifndef NDEBUG
+    // Machine function handle.
+    MachineFunction* MF;
+
+    // Flag indicating that the current function
+    // has at least one "short" path in the machine
+    // CFG from the entry block to an exit block.
+    bool HasFastExitPath;
+#endif
+
+    bool calculateSets(MachineFunction &Fn);
+    bool calcAnticInOut(MachineBasicBlock* MBB);
+    bool calcAvailInOut(MachineBasicBlock* MBB);
+    void calculateAnticAvail(MachineFunction &Fn);
+    bool addUsesForMEMERegion(MachineBasicBlock* MBB,
+                              SmallVector<MachineBasicBlock*, 4>& blks);
+    bool addUsesForTopLevelLoops(SmallVector<MachineBasicBlock*, 4>& blks);
+    bool calcSpillPlacements(MachineBasicBlock* MBB,
+                             SmallVector<MachineBasicBlock*, 4> &blks,
+                             CSRegBlockMap &prevSpills);
+    bool calcRestorePlacements(MachineBasicBlock* MBB,
+                               SmallVector<MachineBasicBlock*, 4> &blks,
+                               CSRegBlockMap &prevRestores);
+    void placeSpillsAndRestores(MachineFunction &Fn);
+    void placeCSRSpillsAndRestores(MachineFunction &Fn);
+    void calculateCallsInformation(MachineFunction &Fn);
+    void calculateCalleeSavedRegisters(MachineFunction &Fn);
+    void insertCSRSpillsAndRestores(MachineFunction &Fn);
+    void calculateFrameObjectOffsets(MachineFunction &Fn);
+    void replaceFrameIndices(MachineFunction &Fn);
+    void scavengeFrameVirtualRegs(MachineFunction &Fn);
+    void insertPrologEpilogCode(MachineFunction &Fn);
+
+    // Initialize DFA sets, called before iterations.
+    void clearAnticAvailSets();
+    // Clear all sets constructed by shrink wrapping.
+    void clearAllSets();
+
+    // Initialize all shrink wrapping data.
+    void initShrinkWrappingInfo();
+
+    // Conveniences for dealing with machine loops.
+    MachineBasicBlock* getTopLevelLoopPreheader(MachineLoop* LP);
+    MachineLoop* getTopLevelLoopParent(MachineLoop *LP);
+
+    // Propagate CSRs used in MBB to all MBBs of loop LP.
+    void propagateUsesAroundLoop(MachineBasicBlock* MBB, MachineLoop* LP);
+
+    // Convenience for recognizing return blocks.
+    bool isReturnBlock(MachineBasicBlock* MBB);
+
+#ifndef NDEBUG
+    // Debugging methods.
+
+    // Mark this function as having fast exit paths.
+    void findFastExitPath();
+
+    // Verify placement of spills/restores.
+    void verifySpillRestorePlacement();
+
+    std::string getBasicBlockName(const MachineBasicBlock* MBB);
+    std::string stringifyCSRegSet(const CSRegSet& s);
+    void dumpSet(const CSRegSet& s);
+    void dumpUsed(MachineBasicBlock* MBB);
+    void dumpAllUsed();
+    void dumpSets(MachineBasicBlock* MBB);
+    void dumpSets1(MachineBasicBlock* MBB);
+    void dumpAllSets();
+    void dumpSRSets();
+#endif
+
+  };
+} // End llvm namespace
+#endif
diff --git a/src/LLVM/lib/CodeGen/PseudoSourceValue.cpp b/src/LLVM/lib/CodeGen/PseudoSourceValue.cpp
new file mode 100644
index 0000000..5e86e5a
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/PseudoSourceValue.cpp
@@ -0,0 +1,134 @@
+//===-- llvm/CodeGen/PseudoSourceValue.cpp ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PseudoSourceValue class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/System/Mutex.h"
+#include <map>
+using namespace llvm;
+
+namespace {
+struct PSVGlobalsTy {
+  // PseudoSourceValues are immutable so don't need locking.
+  const PseudoSourceValue PSVs[4];
+  sys::Mutex Lock;  // Guards FSValues, but not the values inside it.
+  std::map<int, const PseudoSourceValue *> FSValues;
+
+  PSVGlobalsTy() : PSVs() {}
+  ~PSVGlobalsTy() {
+    for (std::map<int, const PseudoSourceValue *>::iterator
+           I = FSValues.begin(), E = FSValues.end(); I != E; ++I) {
+      delete I->second;
+    }
+  }
+};
+
+static ManagedStatic<PSVGlobalsTy> PSVGlobals;
+
+}  // anonymous namespace
+
+const PseudoSourceValue *PseudoSourceValue::getStack()
+{ return &PSVGlobals->PSVs[0]; }
+const PseudoSourceValue *PseudoSourceValue::getGOT()
+{ return &PSVGlobals->PSVs[1]; }
+const PseudoSourceValue *PseudoSourceValue::getJumpTable()
+{ return &PSVGlobals->PSVs[2]; }
+const PseudoSourceValue *PseudoSourceValue::getConstantPool()
+{ return &PSVGlobals->PSVs[3]; }
+
+static const char *const PSVNames[] = {
+  "Stack",
+  "GOT",
+  "JumpTable",
+  "ConstantPool"
+};
+
+// FIXME: THIS IS A HACK!!!!
+// Eventually these should be uniqued on LLVMContext rather than in a managed
+// static.  For now, we can safely use the global context for the time being to
+// squeak by.
+PseudoSourceValue::PseudoSourceValue(enum ValueTy Subclass) :
+  Value(Type::getInt8PtrTy(getGlobalContext()),
+        Subclass) {}
+
+void PseudoSourceValue::printCustom(raw_ostream &O) const {
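+  // 'this' is one of the four statically allocated PSVs above, so the
+  // pointer difference is its index into PSVNames.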
+  O << PSVNames[this - PSVGlobals->PSVs];
+}
+
+const PseudoSourceValue *PseudoSourceValue::getFixedStack(int FI) {
+  PSVGlobalsTy &PG = *PSVGlobals;
+  sys::ScopedLock locked(PG.Lock);
+  const PseudoSourceValue *&V = PG.FSValues[FI];
+  if (!V)
+    V = new FixedStackPseudoSourceValue(FI);
+  return V;
+}
+
+bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const {
+  if (this == getStack())
+    return false;
+  if (this == getGOT() ||
+      this == getConstantPool() ||
+      this == getJumpTable())
+    return true;
+  llvm_unreachable("Unknown PseudoSourceValue!");
+  return false;
+}
+
+bool PseudoSourceValue::isAliased(const MachineFrameInfo *MFI) const {
+  if (this == getStack() ||
+      this == getGOT() ||
+      this == getConstantPool() ||
+      this == getJumpTable())
+    return false;
+  llvm_unreachable("Unknown PseudoSourceValue!");
+  return true;
+}
+
+bool PseudoSourceValue::mayAlias(const MachineFrameInfo *MFI) const {
+  if (this == getGOT() ||
+      this == getConstantPool() ||
+      this == getJumpTable())
+    return false;
+  return true;
+}
+
+bool FixedStackPseudoSourceValue::isConstant(const MachineFrameInfo *MFI) const{
+  return MFI && MFI->isImmutableObjectIndex(FI);
+}
+
+bool FixedStackPseudoSourceValue::isAliased(const MachineFrameInfo *MFI) const {
+  // Negative frame indices are used for special things that don't
+  // appear in LLVM IR. Non-negative indices may be used for things
+  // like static allocas.
+  if (!MFI)
+    return FI >= 0;
+  // Spill slots should not alias others.
+  return !MFI->isFixedObjectIndex(FI) && !MFI->isSpillSlotObjectIndex(FI);
+}
+
+bool FixedStackPseudoSourceValue::mayAlias(const MachineFrameInfo *MFI) const {
+  if (!MFI)
+    return true;
+  // Spill slots will not alias any LLVM IR value.
+  return !MFI->isSpillSlotObjectIndex(FI);
+}
+
+void FixedStackPseudoSourceValue::printCustom(raw_ostream &OS) const {
+  OS << "FixedStack" << FI;
+}
diff --git a/src/LLVM/lib/CodeGen/README.txt b/src/LLVM/lib/CodeGen/README.txt
new file mode 100644
index 0000000..b655dda
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/README.txt
@@ -0,0 +1,199 @@
+//===---------------------------------------------------------------------===//
+
+Common register allocation / spilling problem:
+
+        mul lr, r4, lr
+        str lr, [sp, #+52]
+        ldr lr, [r1, #+32]
+        sxth r3, r3
+        ldr r4, [sp, #+52]
+        mla r4, r3, lr, r4
+
+can be:
+
+        mul lr, r4, lr
+        mov r4, lr
+        str lr, [sp, #+52]
+        ldr lr, [r1, #+32]
+        sxth r3, r3
+        mla r4, r3, lr, r4
+
+and then "merge" mul and mov:
+
+        mul r4, r4, lr
+        str lr, [sp, #+52]
+        ldr lr, [r1, #+32]
+        sxth r3, r3
+        mla r4, r3, lr, r4
+
+It also increases the likelihood that the store may become dead.
+
+//===---------------------------------------------------------------------===//
+
+bb27 ...
+        ...
+        %reg1037 = ADDri %reg1039, 1
+        %reg1038 = ADDrs %reg1032, %reg1039, %NOREG, 10
+    Successors according to CFG: 0x8b03bf0 (#5)
+
+bb76 (0x8b03bf0, LLVM BB @0x8b032d0, ID#5):
+    Predecessors according to CFG: 0x8b0c5f0 (#3) 0x8b0a7c0 (#4)
+        %reg1039 = PHI %reg1070, mbb<bb76.outer,0x8b0c5f0>, %reg1037, mbb<bb27,0x8b0a7c0>
+
+Note ADDri is not a two-address instruction. However, its result %reg1037 is an
+operand of the PHI node in bb76 and its operand %reg1039 is the result of the
+PHI node. We should treat it as two-address code and make sure the ADDri is
+scheduled after any node that reads %reg1039.
+
+//===---------------------------------------------------------------------===//
+
+Use local info (i.e. the register scavenger) to assign a free register to allow
+reuse:
+        ldr r3, [sp, #+4]
+        add r3, r3, #3
+        ldr r2, [sp, #+8]
+        add r2, r2, #2
+        ldr r1, [sp, #+4]  <==
+        add r1, r1, #1
+        ldr r0, [sp, #+4]
+        add r0, r0, #2
+
+//===---------------------------------------------------------------------===//
+
+LLVM aggressively lifts CSE out of loops. Sometimes this can have negative
+side effects:
+
+R1 = X + 4
+R2 = X + 7
+R3 = X + 15
+
+loop:
+load [i + R1]
+...
+load [i + R2]
+...
+load [i + R3]
+
+Suppose there is high register pressure; R1, R2, and R3 can be spilled. We need
+to implement proper re-materialization to handle this:
+
+R1 = X + 4
+R2 = X + 7
+R3 = X + 15
+
+loop:
+R1 = X + 4  @ re-materialized
+load [i + R1]
+...
+R2 = X + 7 @ re-materialized
+load [i + R2]
+...
+R3 = X + 15 @ re-materialized
+load [i + R3]
+
+Furthermore, with re-association, we can enable sharing:
+
+R1 = X + 4
+R2 = X + 7
+R3 = X + 15
+
+loop:
+T = i + X
+load [T + 4]
+...
+load [T + 7]
+...
+load [T + 15]
+
+//===---------------------------------------------------------------------===//
+
+It's not always a good idea to choose rematerialization over spilling. If all
+the load / store instructions would be folded then spilling is cheaper because
+it won't require new live intervals / registers. See 2003-05-31-LongShifts for
+an example.
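+
+For example (an x86 sketch), a use that folds the spill slot directly into a
+memory operand needs no reload instruction and no new interval:
+
+        addl 12(%esp), %eax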
+
+//===---------------------------------------------------------------------===//
+
+With a copying garbage collector, derived pointers must not be retained across
+collector safe points; the collector could move the objects and invalidate the
+derived pointer. This is bad enough in the first place, but safe points can
+crop up unpredictably. Consider:
+
+        %array = load { i32, [0 x %obj] }** %array_addr
+        %nth_el = getelementptr { i32, [0 x %obj] }* %array, i32 0, i32 %n
+        %old = load %obj** %nth_el
+        %z = div i64 %x, %y
+        store %obj* %new, %obj** %nth_el
+
+If the i64 division is lowered to a libcall, then a safe point will (must)
+appear for the call site. If a collection occurs, %array and %nth_el no longer
+point into the correct object.
+
+The fix for this is to copy address calculations so that dependent pointers
+are never live across safe point boundaries. But the loads cannot be copied
+like this if there is an intervening store, so this may be hard to get right.
+
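+A sketch of the described fix (the %array2/%nth_el2 names are hypothetical;
+this is valid here because no store intervenes before the safe point, so the
+base pointer can simply be re-loaded and the address recomputed):
+
+        %array2 = load { i32, [0 x %obj] }** %array_addr
+        %nth_el2 = getelementptr { i32, [0 x %obj] }* %array2, i32 0, i32 %n
+        store %obj* %new, %obj** %nth_el2
+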
+Only a concurrent mutator can trigger a collection at the libcall safe point.
+So single-threaded programs do not have this requirement, even with a copying
+collector. Still, LLVM optimizations would probably undo a front-end's careful
+work.
+
+//===---------------------------------------------------------------------===//
+
+The ocaml frametable structure supports liveness information. It would be good
+to support it.
+
+//===---------------------------------------------------------------------===//
+
+The FIXME in ComputeCommonTailLength in BranchFolding.cpp needs to be
+revisited. The check is there to work around a misuse of directives in inline
+assembly.
+
+//===---------------------------------------------------------------------===//
+
+It would be good to detect collector/target compatibility instead of silently
+doing the wrong thing.
+
+//===---------------------------------------------------------------------===//
+
+It would be really nice to be able to write patterns in .td files for copies,
+which would eliminate a bunch of explicit predicates on them (e.g. no side 
+effects).  Once this is in place, it would be even better to have tblgen 
+synthesize the various copy insertion/inspection methods in TargetInstrInfo.
+
+//===---------------------------------------------------------------------===//
+
+Stack coloring improvements:
+
+1. Do proper LiveStackAnalysis on all stack objects including those which are
+   not spill slots.
+2. Reorder objects to fill in gaps between objects.
+   e.g. 4, 1, <gap>, 4, 1, 1, 1, <gap>, 4 => 4, 1, 1, 1, 1, 4, 4
+
+//===---------------------------------------------------------------------===//
+
+The scheduler should be able to sort nearby instructions by their address. For
+example, in an expanded memset sequence it's not uncommon to see code like this:
+
+  movl $0, 4(%rdi)
+  movl $0, 8(%rdi)
+  movl $0, 12(%rdi)
+  movl $0, 0(%rdi)
+
+Each of the stores is independent, and the scheduler is currently making an
+arbitrary decision about the order.
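+
+Sorted by address, the sequence would simply become:
+
+  movl $0, 0(%rdi)
+  movl $0, 4(%rdi)
+  movl $0, 8(%rdi)
+  movl $0, 12(%rdi)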
+
+//===---------------------------------------------------------------------===//
+
+Another opportunity in this code is that the $0 could be moved to a register:
+
+  movl $0, 4(%rdi)
+  movl $0, 8(%rdi)
+  movl $0, 12(%rdi)
+  movl $0, 0(%rdi)
+
+This would save substantial code size, especially for longer sequences like
+this. It would be easy to have a rule telling isel to avoid matching MOV32mi
+if the immediate has more than some fixed number of uses. It's more involved
+to teach the register allocator how to do late folding to recover from
+excessive register pressure.
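+
+For illustration (the register choice here is hypothetical), the rewritten
+sequence could look like:
+
+  xorl %eax, %eax
+  movl %eax, 4(%rdi)
+  movl %eax, 8(%rdi)
+  movl %eax, 12(%rdi)
+  movl %eax, 0(%rdi)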
+
diff --git a/src/LLVM/lib/CodeGen/RegAllocFast.cpp b/src/LLVM/lib/CodeGen/RegAllocFast.cpp
new file mode 100644
index 0000000..5a6187b
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/RegAllocFast.cpp
@@ -0,0 +1,1037 @@
+//===-- RegAllocFast.cpp - A fast register allocator for debug code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This register allocator allocates registers to a basic block at a time,
+// attempting to keep values in registers and reusing registers as appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "llvm/BasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumStores, "Number of stores added");
+STATISTIC(NumLoads , "Number of loads added");
+STATISTIC(NumCopies, "Number of copies coalesced");
+
+static RegisterRegAlloc
+  fastRegAlloc("fast", "fast register allocator", createFastRegisterAllocator);
+
+namespace {
+  class RAFast : public MachineFunctionPass {
+  public:
+    static char ID;
+    RAFast() : MachineFunctionPass(ID), StackSlotForVirtReg(-1),
+               isBulkSpilling(false) {}
+  private:
+    const TargetMachine *TM;
+    MachineFunction *MF;
+    MachineRegisterInfo *MRI;
+    const TargetRegisterInfo *TRI;
+    const TargetInstrInfo *TII;
+
+    // Basic block currently being allocated.
+    MachineBasicBlock *MBB;
+
+    // StackSlotForVirtReg - Maps virtual regs to the frame index where these
+    // values are spilled.
+    IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg;
+
+    // Everything we know about a live virtual register.
+    struct LiveReg {
+      MachineInstr *LastUse;    // Last instr to use reg.
+      unsigned PhysReg;         // Currently held here.
+      unsigned short LastOpNum; // OpNum on LastUse.
+      bool Dirty;               // Register needs spill.
+
+      LiveReg(unsigned p=0) : LastUse(0), PhysReg(p), LastOpNum(0),
+                              Dirty(false) {}
+    };
+
+    typedef DenseMap<unsigned, LiveReg> LiveRegMap;
+    typedef LiveRegMap::value_type LiveRegEntry;
+
+    // LiveVirtRegs - This map contains entries for each virtual register
+    // that is currently available in a physical register.
+    LiveRegMap LiveVirtRegs;
+
+    DenseMap<unsigned, MachineInstr *> LiveDbgValueMap;
+
+    // RegState - Track the state of a physical register.
+    enum RegState {
+      // A disabled register is not available for allocation, but an alias may
+      // be in use. A register can only be moved out of the disabled state if
+      // all aliases are disabled.
+      regDisabled,
+
+      // A free register is not currently in use and can be allocated
+      // immediately without checking aliases.
+      regFree,
+
+      // A reserved register has been assigned explicitly (e.g., setting up a
+      // call parameter), and it remains reserved until it is used.
+      regReserved
+
+      // A register state may also be a virtual register number, indicating that
+      // the physical register is currently allocated to a virtual register. In
+      // that case, LiveVirtRegs contains the inverse mapping.
+    };
+
+    // PhysRegState - One of the RegState enums, or a virtreg.
+    std::vector<unsigned> PhysRegState;
+
+    // UsedInInstr - BitVector of physregs that are used in the current
+    // instruction, and so cannot be allocated.
+    BitVector UsedInInstr;
+
+    // Allocatable - vector of allocatable physical registers.
+    BitVector Allocatable;
+
+    // SkippedInstrs - Descriptors of instructions whose clobber list was ignored
+    // because all registers were spilled. It is still necessary to mark all the
+    // clobbered registers as used by the function.
+    SmallPtrSet<const TargetInstrDesc*, 4> SkippedInstrs;
+
+    // isBulkSpilling - This flag is set when LiveRegMap will be cleared
+    // completely after spilling all live registers. LiveRegMap entries should
+    // not be erased.
+    bool isBulkSpilling;
+
+    enum {
+      spillClean = 1,
+      spillDirty = 100,
+      spillImpossible = ~0u
+    };
+  public:
+    virtual const char *getPassName() const {
+      return "Fast Register Allocator";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequiredID(PHIEliminationID);
+      AU.addRequiredID(TwoAddressInstructionPassID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    bool runOnMachineFunction(MachineFunction &Fn);
+    void AllocateBasicBlock();
+    void handleThroughOperands(MachineInstr *MI,
+                               SmallVectorImpl<unsigned> &VirtDead);
+    int getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC);
+    bool isLastUseOfLocalReg(MachineOperand&);
+
+    void addKillFlag(const LiveReg&);
+    void killVirtReg(LiveRegMap::iterator);
+    void killVirtReg(unsigned VirtReg);
+    void spillVirtReg(MachineBasicBlock::iterator MI, LiveRegMap::iterator);
+    void spillVirtReg(MachineBasicBlock::iterator MI, unsigned VirtReg);
+
+    void usePhysReg(MachineOperand&);
+    void definePhysReg(MachineInstr *MI, unsigned PhysReg, RegState NewState);
+    unsigned calcSpillCost(unsigned PhysReg) const;
+    void assignVirtToPhysReg(LiveRegEntry &LRE, unsigned PhysReg);
+    void allocVirtReg(MachineInstr *MI, LiveRegEntry &LRE, unsigned Hint);
+    LiveRegMap::iterator defineVirtReg(MachineInstr *MI, unsigned OpNum,
+                                       unsigned VirtReg, unsigned Hint);
+    LiveRegMap::iterator reloadVirtReg(MachineInstr *MI, unsigned OpNum,
+                                       unsigned VirtReg, unsigned Hint);
+    void spillAll(MachineInstr *MI);
+    bool setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg);
+  };
+  char RAFast::ID = 0;
+}
+
+/// getStackSpaceFor - This allocates space for the specified virtual register
+/// to be held on the stack.
+int RAFast::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) {
+  // Find the location where Reg would belong...
+  int SS = StackSlotForVirtReg[VirtReg];
+  if (SS != -1)
+    return SS;          // Already has space allocated?
+
+  // Allocate a new stack object for this spill location...
+  int FrameIdx = MF->getFrameInfo()->CreateSpillStackObject(RC->getSize(),
+                                                            RC->getAlignment());
+
+  // Assign the slot.
+  StackSlotForVirtReg[VirtReg] = FrameIdx;
+  return FrameIdx;
+}
+
+/// isLastUseOfLocalReg - Return true if MO is the only remaining reference to
+/// its virtual register, and it is guaranteed to be a block-local register.
+///
+bool RAFast::isLastUseOfLocalReg(MachineOperand &MO) {
+  // Check for non-debug uses or defs following MO.
+  // This is the most likely way to fail - fast path it.
+  MachineOperand *Next = &MO;
+  while ((Next = Next->getNextOperandForReg()))
+    if (!Next->isDebug())
+      return false;
+
+  // If the register has ever been spilled or reloaded, we conservatively assume
+  // it is a global register used in multiple blocks.
+  if (StackSlotForVirtReg[MO.getReg()] != -1)
+    return false;
+
+  // Check that the use/def chain has exactly one operand - MO.
+  return &MRI->reg_nodbg_begin(MO.getReg()).getOperand() == &MO;
+}
+
+/// addKillFlag - Set kill flags on last use of a virtual register.
+void RAFast::addKillFlag(const LiveReg &LR) {
+  if (!LR.LastUse) return;
+  MachineOperand &MO = LR.LastUse->getOperand(LR.LastOpNum);
+  if (MO.isUse() && !LR.LastUse->isRegTiedToDefOperand(LR.LastOpNum)) {
+    if (MO.getReg() == LR.PhysReg)
+      MO.setIsKill();
+    else
+      LR.LastUse->addRegisterKilled(LR.PhysReg, TRI, true);
+  }
+}
+
+/// killVirtReg - Mark virtreg as no longer available.
+void RAFast::killVirtReg(LiveRegMap::iterator LRI) {
+  addKillFlag(LRI->second);
+  const LiveReg &LR = LRI->second;
+  assert(PhysRegState[LR.PhysReg] == LRI->first && "Broken RegState mapping");
+  PhysRegState[LR.PhysReg] = regFree;
+  // Erase from LiveVirtRegs unless we're spilling in bulk.
+  if (!isBulkSpilling)
+    LiveVirtRegs.erase(LRI);
+}
+
+/// killVirtReg - Mark virtreg as no longer available.
+void RAFast::killVirtReg(unsigned VirtReg) {
+  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+         "killVirtReg needs a virtual register");
+  LiveRegMap::iterator LRI = LiveVirtRegs.find(VirtReg);
+  if (LRI != LiveVirtRegs.end())
+    killVirtReg(LRI);
+}
+
+/// spillVirtReg - This method spills the value specified by VirtReg into the
+/// corresponding stack slot if needed, and then kills the virtual register.
+void RAFast::spillVirtReg(MachineBasicBlock::iterator MI, unsigned VirtReg) {
+  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+         "Spilling a physical register is illegal!");
+  LiveRegMap::iterator LRI = LiveVirtRegs.find(VirtReg);
+  assert(LRI != LiveVirtRegs.end() && "Spilling unmapped virtual register");
+  spillVirtReg(MI, LRI);
+}
+
+/// spillVirtReg - Do the actual work of spilling.
+void RAFast::spillVirtReg(MachineBasicBlock::iterator MI,
+                          LiveRegMap::iterator LRI) {
+  LiveReg &LR = LRI->second;
+  assert(PhysRegState[LR.PhysReg] == LRI->first && "Broken RegState mapping");
+
+  if (LR.Dirty) {
+    // If this physreg is used by the instruction, we want to kill it on the
+    // instruction, not on the spill.
+    bool SpillKill = LR.LastUse != MI;
+    LR.Dirty = false;
+    DEBUG(dbgs() << "Spilling %reg" << LRI->first
+                 << " in " << TRI->getName(LR.PhysReg));
+    const TargetRegisterClass *RC = MRI->getRegClass(LRI->first);
+    int FI = getStackSpaceFor(LRI->first, RC);
+    DEBUG(dbgs() << " to stack slot #" << FI << "\n");
+    TII->storeRegToStackSlot(*MBB, MI, LR.PhysReg, SpillKill, FI, RC, TRI);
+    ++NumStores;   // Update statistics
+
+    // If this register is used by DBG_VALUE then insert new DBG_VALUE to 
+    // identify spilled location as the place to find corresponding variable's
+    // value.
+    if (MachineInstr *DBG = LiveDbgValueMap.lookup(LRI->first)) {
+      const MDNode *MDPtr = 
+        DBG->getOperand(DBG->getNumOperands()-1).getMetadata();
+      int64_t Offset = 0;
+      if (DBG->getOperand(1).isImm())
+        Offset = DBG->getOperand(1).getImm();
+      DebugLoc DL;
+      if (MI == MBB->end()) {
+        // If MI is at basic block end then use last instruction's location.
+        MachineBasicBlock::iterator EI = MI;
+        DL = (--EI)->getDebugLoc();
+      }
+      else
+        DL = MI->getDebugLoc();
+      if (MachineInstr *NewDV = 
+          TII->emitFrameIndexDebugValue(*MF, FI, Offset, MDPtr, DL)) {
+        MachineBasicBlock *MBB = DBG->getParent();
+        MBB->insert(MI, NewDV);
+        DEBUG(dbgs() << "Inserting debug info due to spill:" << "\n" << *NewDV);
+        LiveDbgValueMap[LRI->first] = NewDV;
+      }
+    }
+    if (SpillKill)
+      LR.LastUse = 0; // Don't kill register again
+  }
+  killVirtReg(LRI);
+}
+
+/// spillAll - Spill all dirty virtregs without killing them.
+void RAFast::spillAll(MachineInstr *MI) {
+  if (LiveVirtRegs.empty()) return;
+  isBulkSpilling = true;
+  // The LiveRegMap is keyed by an unsigned (the virtreg number), so the order
+  // of spilling here is deterministic, if arbitrary.
+  for (LiveRegMap::iterator i = LiveVirtRegs.begin(), e = LiveVirtRegs.end();
+       i != e; ++i)
+    spillVirtReg(MI, i);
+  LiveVirtRegs.clear();
+  isBulkSpilling = false;
+}
+
+/// usePhysReg - Handle the direct use of a physical register.
+/// Check that the register is not used by a virtreg.
+/// Kill the physreg, marking it free.
+/// This may add implicit kills to MO->getParent() and invalidate MO.
+void RAFast::usePhysReg(MachineOperand &MO) {
+  unsigned PhysReg = MO.getReg();
+  assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) &&
+         "Bad usePhysReg operand");
+
+  switch (PhysRegState[PhysReg]) {
+  case regDisabled:
+    break;
+  case regReserved:
+    PhysRegState[PhysReg] = regFree;
+    // Fall through
+  case regFree:
+    UsedInInstr.set(PhysReg);
+    MO.setIsKill();
+    return;
+  default:
+    // The physreg was allocated to a virtual register. That means the value we
+    // wanted has been clobbered.
+    llvm_unreachable("Instruction uses an allocated register");
+  }
+
+  // Maybe a superregister is reserved?
+  for (const unsigned *AS = TRI->getAliasSet(PhysReg);
+       unsigned Alias = *AS; ++AS) {
+    switch (PhysRegState[Alias]) {
+    case regDisabled:
+      break;
+    case regReserved:
+      assert(TRI->isSuperRegister(PhysReg, Alias) &&
+             "Instruction is not using a subregister of a reserved register");
+      // Leave the superregister in the working set.
+      PhysRegState[Alias] = regFree;
+      UsedInInstr.set(Alias);
+      MO.getParent()->addRegisterKilled(Alias, TRI, true);
+      return;
+    case regFree:
+      if (TRI->isSuperRegister(PhysReg, Alias)) {
+        // Leave the superregister in the working set.
+        UsedInInstr.set(Alias);
+        MO.getParent()->addRegisterKilled(Alias, TRI, true);
+        return;
+      }
+      // Some other alias was in the working set - clear it.
+      PhysRegState[Alias] = regDisabled;
+      break;
+    default:
+      llvm_unreachable("Instruction uses an alias of an allocated register");
+    }
+  }
+
+  // All aliases are disabled, bring register into working set.
+  PhysRegState[PhysReg] = regFree;
+  UsedInInstr.set(PhysReg);
+  MO.setIsKill();
+}
+
+/// definePhysReg - Mark PhysReg as reserved or free after spilling any
+/// virtregs. This is very similar to defineVirtReg except the physreg is
+/// reserved instead of allocated.
+void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg,
+                           RegState NewState) {
+  UsedInInstr.set(PhysReg);
+  switch (unsigned VirtReg = PhysRegState[PhysReg]) {
+  case regDisabled:
+    break;
+  default:
+    spillVirtReg(MI, VirtReg);
+    // Fall through.
+  case regFree:
+  case regReserved:
+    PhysRegState[PhysReg] = NewState;
+    return;
+  }
+
+  // This is a disabled register, disable all aliases.
+  PhysRegState[PhysReg] = NewState;
+  for (const unsigned *AS = TRI->getAliasSet(PhysReg);
+       unsigned Alias = *AS; ++AS) {
+    UsedInInstr.set(Alias);
+    switch (unsigned VirtReg = PhysRegState[Alias]) {
+    case regDisabled:
+      break;
+    default:
+      spillVirtReg(MI, VirtReg);
+      // Fall through.
+    case regFree:
+    case regReserved:
+      PhysRegState[Alias] = regDisabled;
+      if (TRI->isSuperRegister(PhysReg, Alias))
+        return;
+      break;
+    }
+  }
+}
+
+
+// calcSpillCost - Return the cost of spilling or clearing out PhysReg and its
+// aliases so it is free for allocation.
+// Returns 0 when PhysReg is free or disabled with all aliases disabled - it
+// can be allocated directly.
+// Returns spillImpossible when PhysReg or an alias can't be spilled.
+unsigned RAFast::calcSpillCost(unsigned PhysReg) const {
+  if (UsedInInstr.test(PhysReg))
+    return spillImpossible;
+  switch (unsigned VirtReg = PhysRegState[PhysReg]) {
+  case regDisabled:
+    break;
+  case regFree:
+    return 0;
+  case regReserved:
+    return spillImpossible;
+  default:
+    return LiveVirtRegs.lookup(VirtReg).Dirty ? spillDirty : spillClean;
+  }
+
+  // This is a disabled register, add up the cost of its aliases.
+  unsigned Cost = 0;
+  for (const unsigned *AS = TRI->getAliasSet(PhysReg);
+       unsigned Alias = *AS; ++AS) {
+    if (UsedInInstr.test(Alias))
+      return spillImpossible;
+    switch (unsigned VirtReg = PhysRegState[Alias]) {
+    case regDisabled:
+      break;
+    case regFree:
+      ++Cost;
+      break;
+    case regReserved:
+      return spillImpossible;
+    default:
+      Cost += LiveVirtRegs.lookup(VirtReg).Dirty ? spillDirty : spillClean;
+      break;
+    }
+  }
+  return Cost;
+}
+
+
+/// assignVirtToPhysReg - This method updates local state so that we know
+/// that PhysReg is the proper container for VirtReg now.  The physical
+/// register must not be used for anything else when this is called.
+///
+void RAFast::assignVirtToPhysReg(LiveRegEntry &LRE, unsigned PhysReg) {
+  DEBUG(dbgs() << "Assigning %reg" << LRE.first << " to "
+               << TRI->getName(PhysReg) << "\n");
+  PhysRegState[PhysReg] = LRE.first;
+  assert(!LRE.second.PhysReg && "Already assigned a physreg");
+  LRE.second.PhysReg = PhysReg;
+}
+
+/// allocVirtReg - Allocate a physical register for VirtReg.
+void RAFast::allocVirtReg(MachineInstr *MI, LiveRegEntry &LRE, unsigned Hint) {
+  const unsigned VirtReg = LRE.first;
+
+  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+         "Can only allocate virtual registers");
+
+  const TargetRegisterClass *RC = MRI->getRegClass(VirtReg);
+
+  // Ignore invalid hints.
+  if (Hint && (!TargetRegisterInfo::isPhysicalRegister(Hint) ||
+               !RC->contains(Hint) || !Allocatable.test(Hint)))
+    Hint = 0;
+
+  // Take hint when possible.
+  if (Hint) {
+    switch(calcSpillCost(Hint)) {
+    default:
+      definePhysReg(MI, Hint, regFree);
+      // Fall through.
+    case 0:
+      return assignVirtToPhysReg(LRE, Hint);
+    case spillImpossible:
+      break;
+    }
+  }
+
+  TargetRegisterClass::iterator AOB = RC->allocation_order_begin(*MF);
+  TargetRegisterClass::iterator AOE = RC->allocation_order_end(*MF);
+
+  // First try to find a completely free register.
+  for (TargetRegisterClass::iterator I = AOB; I != AOE; ++I) {
+    unsigned PhysReg = *I;
+    if (PhysRegState[PhysReg] == regFree && !UsedInInstr.test(PhysReg))
+      return assignVirtToPhysReg(LRE, PhysReg);
+  }
+
+  DEBUG(dbgs() << "Allocating %reg" << VirtReg << " from " << RC->getName()
+               << "\n");
+
+  unsigned BestReg = 0, BestCost = spillImpossible;
+  for (TargetRegisterClass::iterator I = AOB; I != AOE; ++I) {
+    unsigned Cost = calcSpillCost(*I);
+    // Cost is 0 when all aliases are already disabled.
+    if (Cost == 0)
+      return assignVirtToPhysReg(LRE, *I);
+    if (Cost < BestCost)
+      BestReg = *I, BestCost = Cost;
+  }
+
+  if (BestReg) {
+    definePhysReg(MI, BestReg, regFree);
+    return assignVirtToPhysReg(LRE, BestReg);
+  }
+
+  // Nothing we can do.
+  std::string msg;
+  raw_string_ostream Msg(msg);
+  Msg << "Ran out of registers during register allocation!";
+  report_fatal_error(Msg.str());
+}
+
+/// defineVirtReg - Allocate a register for VirtReg and mark it as dirty.
+RAFast::LiveRegMap::iterator
+RAFast::defineVirtReg(MachineInstr *MI, unsigned OpNum,
+                      unsigned VirtReg, unsigned Hint) {
+  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+         "Not a virtual register");
+  LiveRegMap::iterator LRI;
+  bool New;
+  tie(LRI, New) = LiveVirtRegs.insert(std::make_pair(VirtReg, LiveReg()));
+  LiveReg &LR = LRI->second;
+  if (New) {
+    // If there is no hint, peek at the only use of this register.
+    if ((!Hint || !TargetRegisterInfo::isPhysicalRegister(Hint)) &&
+        MRI->hasOneNonDBGUse(VirtReg)) {
+      const MachineInstr &UseMI = *MRI->use_nodbg_begin(VirtReg);
+      // It's a copy, use the destination register as a hint.
+      if (UseMI.isCopyLike())
+        Hint = UseMI.getOperand(0).getReg();
+    }
+    allocVirtReg(MI, *LRI, Hint);
+  } else if (LR.LastUse) {
+    // Redefining a live register - kill at the last use, unless it is this
+    // instruction defining VirtReg multiple times.
+    if (LR.LastUse != MI || LR.LastUse->getOperand(LR.LastOpNum).isUse())
+      addKillFlag(LR);
+  }
+  assert(LR.PhysReg && "Register not assigned");
+  LR.LastUse = MI;
+  LR.LastOpNum = OpNum;
+  LR.Dirty = true;
+  UsedInInstr.set(LR.PhysReg);
+  return LRI;
+}
+
+/// reloadVirtReg - Make sure VirtReg is available in a physreg and return it.
+RAFast::LiveRegMap::iterator
+RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum,
+                      unsigned VirtReg, unsigned Hint) {
+  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+         "Not a virtual register");
+  LiveRegMap::iterator LRI;
+  bool New;
+  tie(LRI, New) = LiveVirtRegs.insert(std::make_pair(VirtReg, LiveReg()));
+  LiveReg &LR = LRI->second;
+  MachineOperand &MO = MI->getOperand(OpNum);
+  if (New) {
+    allocVirtReg(MI, *LRI, Hint);
+    const TargetRegisterClass *RC = MRI->getRegClass(VirtReg);
+    int FrameIndex = getStackSpaceFor(VirtReg, RC);
+    DEBUG(dbgs() << "Reloading %reg" << VirtReg << " into "
+                 << TRI->getName(LR.PhysReg) << "\n");
+    TII->loadRegFromStackSlot(*MBB, MI, LR.PhysReg, FrameIndex, RC, TRI);
+    ++NumLoads;
+  } else if (LR.Dirty) {
+    if (isLastUseOfLocalReg(MO)) {
+      DEBUG(dbgs() << "Killing last use: " << MO << "\n");
+      if (MO.isUse())
+        MO.setIsKill();
+      else
+        MO.setIsDead();
+    } else if (MO.isKill()) {
+      DEBUG(dbgs() << "Clearing dubious kill: " << MO << "\n");
+      MO.setIsKill(false);
+    } else if (MO.isDead()) {
+      DEBUG(dbgs() << "Clearing dubious dead: " << MO << "\n");
+      MO.setIsDead(false);
+    }
+  } else if (MO.isKill()) {
+    // We must remove kill flags from uses of reloaded registers because the
+    // register would be killed immediately, and there might be a second use:
+    //   %foo = OR %x<kill>, %x
+    // This would cause a second reload of %x into a different register.
+    DEBUG(dbgs() << "Clearing clean kill: " << MO << "\n");
+    MO.setIsKill(false);
+  } else if (MO.isDead()) {
+    DEBUG(dbgs() << "Clearing clean dead: " << MO << "\n");
+    MO.setIsDead(false);
+  }
+  assert(LR.PhysReg && "Register not assigned");
+  LR.LastUse = MI;
+  LR.LastOpNum = OpNum;
+  UsedInInstr.set(LR.PhysReg);
+  return LRI;
+}
+
+// setPhysReg - Change operand OpNum in MI to refer to PhysReg, considering
+// subregs. This may invalidate any operand pointers.
+// Return true if the operand kills its register.
+bool RAFast::setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg) {
+  MachineOperand &MO = MI->getOperand(OpNum);
+  if (!MO.getSubReg()) {
+    MO.setReg(PhysReg);
+    return MO.isKill() || MO.isDead();
+  }
+
+  // Handle subregister index.
+  MO.setReg(PhysReg ? TRI->getSubReg(PhysReg, MO.getSubReg()) : 0);
+  MO.setSubReg(0);
+
+  // A kill flag implies killing the full register. Add corresponding super
+  // register kill.
+  if (MO.isKill()) {
+    MI->addRegisterKilled(PhysReg, TRI, true);
+    return true;
+  }
+  return MO.isDead();
+}
+
+// Handle special instruction operands like early clobbers and tied ops when
+// there are additional physreg defines.
+void RAFast::handleThroughOperands(MachineInstr *MI,
+                                   SmallVectorImpl<unsigned> &VirtDead) {
+  DEBUG(dbgs() << "Scanning for through registers:");
+  SmallSet<unsigned, 8> ThroughRegs;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+    if (MO.isEarlyClobber() || MI->isRegTiedToDefOperand(i) ||
+        (MO.getSubReg() && MI->readsVirtualRegister(Reg))) {
+      if (ThroughRegs.insert(Reg))
+        DEBUG(dbgs() << " %reg" << Reg);
+    }
+  }
+
+  // If any physreg defines collide with preallocated through registers,
+  // we must spill and reallocate.
+  DEBUG(dbgs() << "\nChecking for physdef collisions.\n");
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef()) continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+    UsedInInstr.set(Reg);
+    if (ThroughRegs.count(PhysRegState[Reg]))
+      definePhysReg(MI, Reg, regFree);
+    for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) {
+      UsedInInstr.set(*AS);
+      if (ThroughRegs.count(PhysRegState[*AS]))
+        definePhysReg(MI, *AS, regFree);
+    }
+  }
+
+  SmallVector<unsigned, 8> PartialDefs;
+  DEBUG(dbgs() << "Allocating tied uses and early clobbers.\n");
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+    if (MO.isUse()) {
+      unsigned DefIdx = 0;
+      if (!MI->isRegTiedToDefOperand(i, &DefIdx)) continue;
+      DEBUG(dbgs() << "Operand " << i << "("<< MO << ") is tied to operand "
+        << DefIdx << ".\n");
+      LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, 0);
+      unsigned PhysReg = LRI->second.PhysReg;
+      setPhysReg(MI, i, PhysReg);
+      // Note: we don't update the def operand yet. That would cause the normal
+      // def-scan to attempt spilling.
+    } else if (MO.getSubReg() && MI->readsVirtualRegister(Reg)) {
+      DEBUG(dbgs() << "Partial redefine: " << MO << "\n");
+      // Reload the register, but don't assign to the operand just yet.
+      // That would confuse the later phys-def processing pass.
+      LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, 0);
+      PartialDefs.push_back(LRI->second.PhysReg);
+    } else if (MO.isEarlyClobber()) {
+      // Note: defineVirtReg may invalidate MO.
+      LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, 0);
+      unsigned PhysReg = LRI->second.PhysReg;
+      if (setPhysReg(MI, i, PhysReg))
+        VirtDead.push_back(Reg);
+    }
+  }
+
+  // Restore UsedInInstr to a state usable for allocating normal virtual uses.
+  UsedInInstr.reset();
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+    UsedInInstr.set(Reg);
+    for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+      UsedInInstr.set(*AS);
+  }
+
+  // Also mark PartialDefs as used to avoid reallocation.
+  for (unsigned i = 0, e = PartialDefs.size(); i != e; ++i)
+    UsedInInstr.set(PartialDefs[i]);
+}
+
+void RAFast::AllocateBasicBlock() {
+  DEBUG(dbgs() << "\nAllocating " << *MBB);
+
+  PhysRegState.assign(TRI->getNumRegs(), regDisabled);
+  assert(LiveVirtRegs.empty() && "Mapping not cleared form last block?");
+
+  MachineBasicBlock::iterator MII = MBB->begin();
+
+  // Add live-in registers as live.
+  for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
+         E = MBB->livein_end(); I != E; ++I)
+    definePhysReg(MII, *I, regReserved);
+
+  SmallVector<unsigned, 8> VirtDead;
+  SmallVector<MachineInstr*, 32> Coalesced;
+
+  // Otherwise, sequentially allocate each instruction in the MBB.
+  while (MII != MBB->end()) {
+    MachineInstr *MI = MII++;
+    const TargetInstrDesc &TID = MI->getDesc();
+    DEBUG({
+        dbgs() << "\n>> " << *MI << "Regs:";
+        for (unsigned Reg = 1, E = TRI->getNumRegs(); Reg != E; ++Reg) {
+          if (PhysRegState[Reg] == regDisabled) continue;
+          dbgs() << " " << TRI->getName(Reg);
+          switch(PhysRegState[Reg]) {
+          case regFree:
+            break;
+          case regReserved:
+            dbgs() << "*";
+            break;
+          default:
+            dbgs() << "=%reg" << PhysRegState[Reg];
+            if (LiveVirtRegs[PhysRegState[Reg]].Dirty)
+              dbgs() << "*";
+            assert(LiveVirtRegs[PhysRegState[Reg]].PhysReg == Reg &&
+                   "Bad inverse map");
+            break;
+          }
+        }
+        dbgs() << '\n';
+        // Check that LiveVirtRegs is the inverse.
+        for (LiveRegMap::iterator i = LiveVirtRegs.begin(),
+             e = LiveVirtRegs.end(); i != e; ++i) {
+           assert(TargetRegisterInfo::isVirtualRegister(i->first) &&
+                  "Bad map key");
+           assert(TargetRegisterInfo::isPhysicalRegister(i->second.PhysReg) &&
+                  "Bad map value");
+           assert(PhysRegState[i->second.PhysReg] == i->first &&
+                  "Bad inverse map");
+        }
+      });
+
+    // Debug values are not allowed to change codegen in any way.
+    if (MI->isDebugValue()) {
+      bool ScanDbgValue = true;
+      while (ScanDbgValue) {
+        ScanDbgValue = false;
+        for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+          MachineOperand &MO = MI->getOperand(i);
+          if (!MO.isReg()) continue;
+          unsigned Reg = MO.getReg();
+          if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+          LiveDbgValueMap[Reg] = MI;
+          LiveRegMap::iterator LRI = LiveVirtRegs.find(Reg);
+          if (LRI != LiveVirtRegs.end())
+            setPhysReg(MI, i, LRI->second.PhysReg);
+          else {
+            int SS = StackSlotForVirtReg[Reg];
+            if (SS == -1)
+              MO.setReg(0); // We can't allocate a physreg for a DebugValue, sorry!
+            else {
+              // Modify DBG_VALUE now that the value is in a spill slot.
+              int64_t Offset = MI->getOperand(1).getImm();
+              const MDNode *MDPtr = 
+                MI->getOperand(MI->getNumOperands()-1).getMetadata();
+              DebugLoc DL = MI->getDebugLoc();
+              if (MachineInstr *NewDV = 
+                  TII->emitFrameIndexDebugValue(*MF, SS, Offset, MDPtr, DL)) {
+                DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI);
+                MachineBasicBlock *MBB = MI->getParent();
+                MBB->insert(MBB->erase(MI), NewDV);
+                // Scan NewDV operands from the beginning.
+                MI = NewDV;
+                ScanDbgValue = true;
+                break;
+              } else
+                MO.setReg(0); // We can't allocate a physreg for a DebugValue, sorry!
+            }
+          }
+        }
+      }
+      // Next instruction.
+      continue;
+    }
+
+    // If this is a copy, we may be able to coalesce.
+    unsigned CopySrc = 0, CopyDst = 0, CopySrcSub = 0, CopyDstSub = 0;
+    if (MI->isCopy()) {
+      CopyDst = MI->getOperand(0).getReg();
+      CopySrc = MI->getOperand(1).getReg();
+      CopyDstSub = MI->getOperand(0).getSubReg();
+      CopySrcSub = MI->getOperand(1).getSubReg();
+    }
+
+    // Track registers used by instruction.
+    UsedInInstr.reset();
+
+    // First scan.
+    // Mark physreg uses and early clobbers as used.
+    // Find the end of the virtreg operands
+    unsigned VirtOpEnd = 0;
+    bool hasTiedOps = false;
+    bool hasEarlyClobbers = false;
+    bool hasPartialRedefs = false;
+    bool hasPhysDefs = false;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg()) continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg) continue;
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        VirtOpEnd = i+1;
+        if (MO.isUse()) {
+          hasTiedOps = hasTiedOps ||
+                                TID.getOperandConstraint(i, TOI::TIED_TO) != -1;
+        } else {
+          if (MO.isEarlyClobber())
+            hasEarlyClobbers = true;
+          if (MO.getSubReg() && MI->readsVirtualRegister(Reg))
+            hasPartialRedefs = true;
+        }
+        continue;
+      }
+      if (!Allocatable.test(Reg)) continue;
+      if (MO.isUse()) {
+        usePhysReg(MO);
+      } else if (MO.isEarlyClobber()) {
+        definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ?
+                               regFree : regReserved);
+        hasEarlyClobbers = true;
+      } else
+        hasPhysDefs = true;
+    }
+
+    // The instruction may have virtual register operands that must be allocated
+    // the same register at use-time and def-time: early clobbers and tied
+    // operands. If there are also physical defs, these registers must avoid
+    // both physical defs and uses, making them more constrained than normal
+    // operands.
+    // Similarly, if there are multiple defs and tied operands, we must make sure
+    // the same register is allocated to uses and defs.
+    // We didn't detect inline asm tied operands above, so just make this extra
+    // pass for all inline asm.
+    if (hasEarlyClobbers || hasPartialRedefs ||
+        (hasTiedOps && (hasPhysDefs || TID.getNumDefs() > 1))) {
+      handleThroughOperands(MI, VirtDead);
+      // Don't attempt coalescing when we have funny stuff going on.
+      CopyDst = 0;
+      // Pretend we have early clobbers so the use operands get marked below.
+      // This is not necessary for the common case of a single tied use.
+      hasEarlyClobbers = true;
+    }
+
+    // Second scan.
+    // Allocate virtreg uses.
+    for (unsigned i = 0; i != VirtOpEnd; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg()) continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+      if (MO.isUse()) {
+        LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, CopyDst);
+        unsigned PhysReg = LRI->second.PhysReg;
+        CopySrc = (CopySrc == Reg || CopySrc == PhysReg) ? PhysReg : 0;
+        if (setPhysReg(MI, i, PhysReg))
+          killVirtReg(LRI);
+      }
+    }
+
+    MRI->addPhysRegsUsed(UsedInInstr);
+
+    // Track registers defined by instruction - early clobbers and tied uses at
+    // this point.
+    UsedInInstr.reset();
+    if (hasEarlyClobbers) {
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (!MO.isReg()) continue;
+        unsigned Reg = MO.getReg();
+        if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+        // Look for physreg defs and tied uses.
+        if (!MO.isDef() && !MI->isRegTiedToDefOperand(i)) continue;
+        UsedInInstr.set(Reg);
+        for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+          UsedInInstr.set(*AS);
+      }
+    }
+
+    unsigned DefOpEnd = MI->getNumOperands();
+    if (TID.isCall()) {
+      // Spill all virtregs before a call. This serves two purposes: 1. If an
+      // exception is thrown, the landing pad is going to expect to find registers
+      // in their spill slots, and 2. we don't have to wade through all the
+      // <imp-def> operands on the call instruction.
+      DefOpEnd = VirtOpEnd;
+      DEBUG(dbgs() << "  Spilling remaining registers before call.\n");
+      spillAll(MI);
+
+      // The imp-defs are skipped below, but we still need to mark those
+      // registers as used by the function.
+      SkippedInstrs.insert(&TID);
+    }
+
+    // Third scan.
+    // Allocate defs and collect dead defs.
+    for (unsigned i = 0; i != DefOpEnd; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber())
+        continue;
+      unsigned Reg = MO.getReg();
+
+      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+        if (!Allocatable.test(Reg)) continue;
+        definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ?
+                               regFree : regReserved);
+        continue;
+      }
+      LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, CopySrc);
+      unsigned PhysReg = LRI->second.PhysReg;
+      if (setPhysReg(MI, i, PhysReg)) {
+        VirtDead.push_back(Reg);
+        CopyDst = 0; // cancel coalescing;
+      } else
+        CopyDst = (CopyDst == Reg || CopyDst == PhysReg) ? PhysReg : 0;
+    }
+
+    // Kill dead defs after the scan to ensure that multiple defs of the same
+    // register are allocated identically. We didn't need to do this for uses
+    // because we are creating our own kill flags, and they are always at the
+    // last use.
+    for (unsigned i = 0, e = VirtDead.size(); i != e; ++i)
+      killVirtReg(VirtDead[i]);
+    VirtDead.clear();
+
+    MRI->addPhysRegsUsed(UsedInInstr);
+
+    if (CopyDst && CopyDst == CopySrc && CopyDstSub == CopySrcSub) {
+      DEBUG(dbgs() << "-- coalescing: " << *MI);
+      Coalesced.push_back(MI);
+    } else {
+      DEBUG(dbgs() << "<< " << *MI);
+    }
+  }
+
+  // Spill all physical registers holding virtual registers now.
+  DEBUG(dbgs() << "Spilling live registers at end of block.\n");
+  spillAll(MBB->getFirstTerminator());
+
+  // Erase all the coalesced copies. We are delaying it until now because
+  // LiveVirtRegs might refer to the instrs.
+  for (unsigned i = 0, e = Coalesced.size(); i != e; ++i)
+    MBB->erase(Coalesced[i]);
+  NumCopies += Coalesced.size();
+
+  DEBUG(MBB->dump());
+}
+
+/// runOnMachineFunction - Register allocate the whole function
+///
+bool RAFast::runOnMachineFunction(MachineFunction &Fn) {
+  DEBUG(dbgs() << "********** FAST REGISTER ALLOCATION **********\n"
+               << "********** Function: "
+               << ((Value*)Fn.getFunction())->getName() << '\n');
+  MF = &Fn;
+  MRI = &MF->getRegInfo();
+  TM = &Fn.getTarget();
+  TRI = TM->getRegisterInfo();
+  TII = TM->getInstrInfo();
+
+  UsedInInstr.resize(TRI->getNumRegs());
+  Allocatable = TRI->getAllocatableSet(*MF);
+
+  // initialize the virtual->physical register map to have a 'null'
+  // mapping for all virtual registers
+  unsigned LastVirtReg = MRI->getLastVirtReg();
+  StackSlotForVirtReg.grow(LastVirtReg);
+
+  // Loop over all of the basic blocks, eliminating virtual register references
+  for (MachineFunction::iterator MBBi = Fn.begin(), MBBe = Fn.end();
+       MBBi != MBBe; ++MBBi) {
+    MBB = &*MBBi;
+    AllocateBasicBlock();
+  }
+
+  // Make sure the set of used physregs is closed under subreg operations.
+  MRI->closePhysRegsUsed(*TRI);
+
+  // Add the clobber lists for all the instructions we skipped earlier.
+  for (SmallPtrSet<const TargetInstrDesc*, 4>::const_iterator
+       I = SkippedInstrs.begin(), E = SkippedInstrs.end(); I != E; ++I)
+    if (const unsigned *Defs = (*I)->getImplicitDefs())
+      while (*Defs)
+        MRI->setPhysRegUsed(*Defs++);
+
+  SkippedInstrs.clear();
+  StackSlotForVirtReg.clear();
+  LiveDbgValueMap.clear();
+  return true;
+}
+
+FunctionPass *llvm::createFastRegisterAllocator() {
+  return new RAFast();
+}
diff --git a/src/LLVM/lib/CodeGen/RegAllocLinearScan.cpp b/src/LLVM/lib/CodeGen/RegAllocLinearScan.cpp
new file mode 100644
index 0000000..c9c51b3
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/RegAllocLinearScan.cpp
@@ -0,0 +1,1503 @@
+//===-- RegAllocLinearScan.cpp - Linear Scan register allocator -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a linear scan register allocator.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "VirtRegMap.h"
+#include "VirtRegRewriter.h"
+#include "Spiller.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/RegisterCoalescer.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <set>
+#include <queue>
+#include <memory>
+#include <cmath>
+
+using namespace llvm;
+
+STATISTIC(NumIters     , "Number of iterations performed");
+STATISTIC(NumBacktracks, "Number of times we had to backtrack");
+STATISTIC(NumCoalesce,   "Number of copies coalesced");
+STATISTIC(NumDowngrade,  "Number of registers downgraded");
+
+static cl::opt<bool>
+NewHeuristic("new-spilling-heuristic",
+             cl::desc("Use new spilling heuristic"),
+             cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+PreSplitIntervals("pre-alloc-split",
+                  cl::desc("Pre-register allocation live interval splitting"),
+                  cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+TrivCoalesceEnds("trivial-coalesce-ends",
+                  cl::desc("Attempt trivial coalescing of interval ends"),
+                  cl::init(false), cl::Hidden);
+
+static RegisterRegAlloc
+linearscanRegAlloc("linearscan", "linear scan register allocator",
+                   createLinearScanRegisterAllocator);
+
+namespace {
+  // When we allocate a register, add it to a fixed-size queue of
+  // registers to skip in subsequent allocations. This trades a small
+  // amount of register pressure and increased spills for flexibility in
+  // the post-pass scheduler.
+  //
+  // Note that the number of registers used for reloading spills
+  // will be one greater than the value of this option.
+  //
+  // One big limitation of this is that it doesn't differentiate between
+  // different register classes. So on x86-64, if there is xmm register
+  // pressure, it can cause fewer GPRs to be held in the queue.
+  static cl::opt<unsigned>
+  NumRecentlyUsedRegs("linearscan-skip-count",
+                      cl::desc("Number of registers for linearscan to remember"
+                               "to skip."),
+                      cl::init(0),
+                      cl::Hidden);
+ 
+  struct RALinScan : public MachineFunctionPass {
+    static char ID;
+    RALinScan() : MachineFunctionPass(ID) {
+      // Initialize the queue to record recently-used registers.
+      if (NumRecentlyUsedRegs > 0)
+        RecentRegs.resize(NumRecentlyUsedRegs, 0);
+      RecentNext = RecentRegs.begin();
+    }
+
+    typedef std::pair<LiveInterval*, LiveInterval::iterator> IntervalPtr;
+    typedef SmallVector<IntervalPtr, 32> IntervalPtrs;
+  private:
+    /// RelatedRegClasses - This structure is built the first time a function is
+    /// compiled, and keeps track of which register classes have registers that
+    /// belong to multiple classes or have aliases that are in other classes.
+    EquivalenceClasses<const TargetRegisterClass*> RelatedRegClasses;
+    DenseMap<unsigned, const TargetRegisterClass*> OneClassForEachPhysReg;
+
+    // NextReloadMap - For each register in the map, it maps to another
+    // register which is defined by a reload from the same stack slot and
+    // both reloads are in the same basic block.
+    DenseMap<unsigned, unsigned> NextReloadMap;
+
+    // DowngradedRegs - A set of registers which are being "downgraded", i.e.
+    // un-favored for allocation.
+    SmallSet<unsigned, 8> DowngradedRegs;
+
+    // DowngradeMap - A map from virtual registers to physical registers being
+    // downgraded for the virtual registers.
+    DenseMap<unsigned, unsigned> DowngradeMap;
+
+    MachineFunction* mf_;
+    MachineRegisterInfo* mri_;
+    const TargetMachine* tm_;
+    const TargetRegisterInfo* tri_;
+    const TargetInstrInfo* tii_;
+    BitVector allocatableRegs_;
+    LiveIntervals* li_;
+    LiveStacks* ls_;
+    MachineLoopInfo *loopInfo;
+
+    /// handled_ - Intervals are added to the handled_ set in the order of their
+    /// start value.  This is used for backtracking.
+    std::vector<LiveInterval*> handled_;
+
+    /// fixed_ - Intervals that correspond to machine registers.
+    ///
+    IntervalPtrs fixed_;
+
+    /// active_ - Intervals that are currently being processed, and which have a
+    /// live range active for the current point.
+    IntervalPtrs active_;
+
+    /// inactive_ - Intervals that are currently being processed, but which have
+    /// a hole at the current point.
+    IntervalPtrs inactive_;
+
+    typedef std::priority_queue<LiveInterval*,
+                                SmallVector<LiveInterval*, 64>,
+                                greater_ptr<LiveInterval> > IntervalHeap;
+    IntervalHeap unhandled_;
+
+    /// regUse_ - Tracks register usage.
+    SmallVector<unsigned, 32> regUse_;
+    SmallVector<unsigned, 32> regUseBackUp_;
+
+    /// vrm_ - Tracks register assignments.
+    VirtRegMap* vrm_;
+
+    std::auto_ptr<VirtRegRewriter> rewriter_;
+
+    std::auto_ptr<Spiller> spiller_;
+
+    // The queue of recently-used registers.
+    SmallVector<unsigned, 4> RecentRegs;
+    SmallVector<unsigned, 4>::iterator RecentNext;
+
+    // Record that we just picked this register.
+    void recordRecentlyUsed(unsigned reg) {
+      assert(reg != 0 && "Recently used register is NOREG!");
+      if (!RecentRegs.empty()) {
+        *RecentNext++ = reg;
+        if (RecentNext == RecentRegs.end())
+          RecentNext = RecentRegs.begin();
+      }
+    }
+
+  public:
+    virtual const char* getPassName() const {
+      return "Linear Scan Register Allocator";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<LiveIntervals>();
+      AU.addPreserved<SlotIndexes>();
+      if (StrongPHIElim)
+        AU.addRequiredID(StrongPHIEliminationID);
+      // Make sure PassManager knows which analyses to make available
+      // to coalescing and which analyses coalescing invalidates.
+      AU.addRequiredTransitive<RegisterCoalescer>();
+      AU.addRequired<CalculateSpillWeights>();
+      if (PreSplitIntervals)
+        AU.addRequiredID(PreAllocSplittingID);
+      AU.addRequired<LiveStacks>();
+      AU.addPreserved<LiveStacks>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
+      AU.addRequired<VirtRegMap>();
+      AU.addPreserved<VirtRegMap>();
+      AU.addPreservedID(MachineDominatorsID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    /// runOnMachineFunction - register allocate the whole function
+    bool runOnMachineFunction(MachineFunction&);
+
+    // Determine if we skip this register due to its being recently used.
+    bool isRecentlyUsed(unsigned reg) const {
+      return std::find(RecentRegs.begin(), RecentRegs.end(), reg) !=
+             RecentRegs.end();
+    }
+
+  private:
+    /// linearScan - the linear scan algorithm
+    void linearScan();
+
+    /// initIntervalSets - initialize the interval sets.
+    ///
+    void initIntervalSets();
+
+    /// processActiveIntervals - expire old intervals and move non-overlapping
+    /// ones to the inactive list.
+    void processActiveIntervals(SlotIndex CurPoint);
+
+    /// processInactiveIntervals - expire old intervals and move overlapping
+    /// ones to the active list.
+    void processInactiveIntervals(SlotIndex CurPoint);
+
+    /// hasNextReloadInterval - Return the next liveinterval that's being
+    /// defined by a reload from the same SS as the specified one.
+    LiveInterval *hasNextReloadInterval(LiveInterval *cur);
+
+    /// DowngradeRegister - Downgrade a register for allocation.
+    void DowngradeRegister(LiveInterval *li, unsigned Reg);
+
+    /// UpgradeRegister - Upgrade a register for allocation.
+    void UpgradeRegister(unsigned Reg);
+
+    /// assignRegOrStackSlotAtInterval - assign a register if one
+    /// is available, or spill.
+    void assignRegOrStackSlotAtInterval(LiveInterval* cur);
+
+    void updateSpillWeights(std::vector<float> &Weights,
+                            unsigned reg, float weight,
+                            const TargetRegisterClass *RC);
+
+    /// findIntervalsToSpill - Determine the intervals to spill for the
+    /// specified interval. It's passed the physical registers whose spill
+    /// weight is the lowest among all the registers whose live intervals
+    /// conflict with the interval.
+    void findIntervalsToSpill(LiveInterval *cur,
+                            std::vector<std::pair<unsigned,float> > &Candidates,
+                            unsigned NumCands,
+                            SmallVector<LiveInterval*, 8> &SpillIntervals);
+
+    /// attemptTrivialCoalescing - If a simple interval is defined by a copy,
+    /// try to allocate the definition to the same register as the source,
+    /// if the register is not defined during the lifetime of the interval.
+    /// This eliminates a copy, and is used to coalesce copies which were not
+    /// coalesced away before allocation either due to dest and src being in
+    /// different register classes or because the coalescer was overly
+    /// conservative.
+    unsigned attemptTrivialCoalescing(LiveInterval &cur, unsigned Reg);
+
+    ///
+    /// Register usage / availability tracking helpers.
+    ///
+
+    void initRegUses() {
+      regUse_.resize(tri_->getNumRegs(), 0);
+      regUseBackUp_.resize(tri_->getNumRegs(), 0);
+    }
+
+    void finalizeRegUses() {
+#ifndef NDEBUG
+      // Verify all the registers are "freed".
+      bool Error = false;
+      for (unsigned i = 0, e = tri_->getNumRegs(); i != e; ++i) {
+        if (regUse_[i] != 0) {
+          dbgs() << tri_->getName(i) << " is still in use!\n";
+          Error = true;
+        }
+      }
+      if (Error)
+        llvm_unreachable(0);
+#endif
+      regUse_.clear();
+      regUseBackUp_.clear();
+    }
+
+    void addRegUse(unsigned physReg) {
+      assert(TargetRegisterInfo::isPhysicalRegister(physReg) &&
+             "should be physical register!");
+      ++regUse_[physReg];
+      for (const unsigned* as = tri_->getAliasSet(physReg); *as; ++as)
+        ++regUse_[*as];
+    }
+
+    void delRegUse(unsigned physReg) {
+      assert(TargetRegisterInfo::isPhysicalRegister(physReg) &&
+             "should be physical register!");
+      assert(regUse_[physReg] != 0);
+      --regUse_[physReg];
+      for (const unsigned* as = tri_->getAliasSet(physReg); *as; ++as) {
+        assert(regUse_[*as] != 0);
+        --regUse_[*as];
+      }
+    }
+
+    bool isRegAvail(unsigned physReg) const {
+      assert(TargetRegisterInfo::isPhysicalRegister(physReg) &&
+             "should be physical register!");
+      return regUse_[physReg] == 0;
+    }
+
+    void backUpRegUses() {
+      regUseBackUp_ = regUse_;
+    }
+
+    void restoreRegUses() {
+      regUse_ = regUseBackUp_;
+    }
+
+    ///
+    /// Register handling helpers.
+    ///
+
+    /// getFreePhysReg - return a free physical register for this virtual
+    /// register interval if we have one, otherwise return 0.
+    unsigned getFreePhysReg(LiveInterval* cur);
+    unsigned getFreePhysReg(LiveInterval* cur,
+                            const TargetRegisterClass *RC,
+                            unsigned MaxInactiveCount,
+                            SmallVector<unsigned, 256> &inactiveCounts,
+                            bool SkipDGRegs);
+
+    void ComputeRelatedRegClasses();
+
+    template <typename ItTy>
+    void printIntervals(const char* const str, ItTy i, ItTy e) const {
+      DEBUG({
+          if (str)
+            dbgs() << str << " intervals:\n";
+
+          for (; i != e; ++i) {
+            dbgs() << "\t" << *i->first << " -> ";
+
+            unsigned reg = i->first->reg;
+            if (TargetRegisterInfo::isVirtualRegister(reg))
+              reg = vrm_->getPhys(reg);
+
+            dbgs() << tri_->getName(reg) << '\n';
+          }
+        });
+    }
+  };
+  char RALinScan::ID = 0;
+}
+
+INITIALIZE_PASS(RALinScan, "linearscan-regalloc",
+                "Linear Scan Register Allocator", false, false);
+
+void RALinScan::ComputeRelatedRegClasses() {
+  // First pass, add all reg classes to the union, and determine at least one
+  // reg class that each register is in.
+  bool HasAliases = false;
+  for (TargetRegisterInfo::regclass_iterator RCI = tri_->regclass_begin(),
+       E = tri_->regclass_end(); RCI != E; ++RCI) {
+    RelatedRegClasses.insert(*RCI);
+    for (TargetRegisterClass::iterator I = (*RCI)->begin(), E = (*RCI)->end();
+         I != E; ++I) {
+      HasAliases = HasAliases || *tri_->getAliasSet(*I) != 0;
+      
+      const TargetRegisterClass *&PRC = OneClassForEachPhysReg[*I];
+      if (PRC) {
+        // Already processed this register.  Just make sure we know that
+        // multiple register classes share a register.
+        RelatedRegClasses.unionSets(PRC, *RCI);
+      } else {
+        PRC = *RCI;
+      }
+    }
+  }
+  
+  // Second pass, now that we know conservatively what register classes each reg
+  // belongs to, add info about aliases.  We don't need to do this for targets
+  // without register aliases.
+  if (HasAliases)
+    for (DenseMap<unsigned, const TargetRegisterClass*>::iterator
+         I = OneClassForEachPhysReg.begin(), E = OneClassForEachPhysReg.end();
+         I != E; ++I)
+      for (const unsigned *AS = tri_->getAliasSet(I->first); *AS; ++AS)
+        RelatedRegClasses.unionSets(I->second, OneClassForEachPhysReg[*AS]);
+}
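+
+// For illustration (X86): GR32 contains EAX, which aliases AX in GR16, so the
+// second pass unions GR32 and GR16 into one equivalence class. getFreePhysReg
+// later uses these classes to decide which inactive intervals can possibly
+// compete with the interval being allocated.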
+
+/// attemptTrivialCoalescing - If a simple interval is defined by a copy, try
+/// to allocate the definition the same register as the source register if the
+/// register is not defined during the lifetime of the interval. If the
+/// interval is killed by a copy, try to use the destination register. This
+/// eliminates a copy. This is used to coalesce copies which were not coalesced
+/// away before allocation either due to dest and src being in different
+/// register classes or because the coalescer was overly conservative.
+unsigned RALinScan::attemptTrivialCoalescing(LiveInterval &cur, unsigned Reg) {
+  unsigned Preference = vrm_->getRegAllocPref(cur.reg);
+  if ((Preference && Preference == Reg) || !cur.containsOneValue())
+    return Reg;
+
+  // We cannot handle complicated live ranges. Simple linear stuff only.
+  if (cur.ranges.size() != 1)
+    return Reg;
+
+  const LiveRange &range = cur.ranges.front();
+
+  VNInfo *vni = range.valno;
+  if (vni->isUnused())
+    return Reg;
+
+  unsigned CandReg;
+  {
+    MachineInstr *CopyMI;
+    if (vni->def != SlotIndex() && vni->isDefAccurate() &&
+        (CopyMI = li_->getInstructionFromIndex(vni->def)) && CopyMI->isCopy())
+      // Defined by a copy, try to extend SrcReg forward
+      CandReg = CopyMI->getOperand(1).getReg();
+    else if (TrivCoalesceEnds &&
+            (CopyMI = li_->getInstructionFromIndex(range.end.getBaseIndex())) &&
+             CopyMI->isCopy() && cur.reg == CopyMI->getOperand(1).getReg())
+      // Only used by a copy, try to extend DstReg backwards
+      CandReg = CopyMI->getOperand(0).getReg();
+    else
+      return Reg;
+  }
+
+  if (TargetRegisterInfo::isVirtualRegister(CandReg)) {
+    if (!vrm_->isAssignedReg(CandReg))
+      return Reg;
+    CandReg = vrm_->getPhys(CandReg);
+  }
+  if (Reg == CandReg)
+    return Reg;
+
+  const TargetRegisterClass *RC = mri_->getRegClass(cur.reg);
+  if (!RC->contains(CandReg))
+    return Reg;
+
+  if (li_->conflictsWithPhysReg(cur, *vrm_, CandReg))
+    return Reg;
+
+  // Try to coalesce.
+  DEBUG(dbgs() << "Coalescing: " << cur << " -> " << tri_->getName(CandReg)
+        << '\n');
+  vrm_->clearVirt(cur.reg);
+  vrm_->assignVirt2Phys(cur.reg, CandReg);
+
+  ++NumCoalesce;
+  return CandReg;
+}
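+
+// A minimal example of the forward case above (virtual register number
+// illustrative):
+//
+//   %reg1024 = COPY %EAX    ; cur is defined by a copy from a physreg
+//   ...                     ; EAX is not redefined while cur is live
+//           = use %reg1024
+//
+// If EAX is in cur's register class and does not conflict with cur, cur is
+// simply reassigned to EAX, turning the COPY into an identity copy that the
+// rewriter can remove.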
+
+bool RALinScan::runOnMachineFunction(MachineFunction &fn) {
+  mf_ = &fn;
+  mri_ = &fn.getRegInfo();
+  tm_ = &fn.getTarget();
+  tri_ = tm_->getRegisterInfo();
+  tii_ = tm_->getInstrInfo();
+  allocatableRegs_ = tri_->getAllocatableSet(fn);
+  li_ = &getAnalysis<LiveIntervals>();
+  ls_ = &getAnalysis<LiveStacks>();
+  loopInfo = &getAnalysis<MachineLoopInfo>();
+
+  // We don't run the coalescer here because we have no reason to
+  // interact with it.  If the coalescer requires interaction, it
+  // won't do anything.  If it doesn't require interaction, we assume
+  // it was run as a separate pass.
+
+  // If this is the first function compiled, compute the related reg classes.
+  if (RelatedRegClasses.empty())
+    ComputeRelatedRegClasses();
+
+  // Also resize register usage trackers.
+  initRegUses();
+
+  vrm_ = &getAnalysis<VirtRegMap>();
+  if (!rewriter_.get()) rewriter_.reset(createVirtRegRewriter());
+  
+  spiller_.reset(createSpiller(*this, *mf_, *vrm_));
+  
+  initIntervalSets();
+
+  linearScan();
+
+  // Rewrite spill code and update the PhysRegsUsed set.
+  rewriter_->runOnMachineFunction(*mf_, *vrm_, li_);
+
+  assert(unhandled_.empty() && "Unhandled live intervals remain!");
+
+  finalizeRegUses();
+
+  fixed_.clear();
+  active_.clear();
+  inactive_.clear();
+  handled_.clear();
+  NextReloadMap.clear();
+  DowngradedRegs.clear();
+  DowngradeMap.clear();
+  spiller_.reset(0);
+
+  return true;
+}
+
+/// initIntervalSets - initialize the interval sets.
+///
+void RALinScan::initIntervalSets()
+{
+  assert(unhandled_.empty() && fixed_.empty() &&
+         active_.empty() && inactive_.empty() &&
+         "interval sets should be empty on initialization");
+
+  handled_.reserve(li_->getNumIntervals());
+
+  for (LiveIntervals::iterator i = li_->begin(), e = li_->end(); i != e; ++i) {
+    if (TargetRegisterInfo::isPhysicalRegister(i->second->reg)) {
+      if (!i->second->empty()) {
+        mri_->setPhysRegUsed(i->second->reg);
+        fixed_.push_back(std::make_pair(i->second, i->second->begin()));
+      }
+    } else {
+      if (i->second->empty())
+        assignRegOrStackSlotAtInterval(i->second);
+      else
+        unhandled_.push(i->second);
+    }
+  }
+}
+
+void RALinScan::linearScan() {
+  // linear scan algorithm
+  DEBUG({
+      dbgs() << "********** LINEAR SCAN **********\n"
+             << "********** Function: " 
+             << mf_->getFunction()->getName() << '\n';
+      printIntervals("fixed", fixed_.begin(), fixed_.end());
+    });
+
+  while (!unhandled_.empty()) {
+    // pick the interval with the earliest start point
+    LiveInterval* cur = unhandled_.top();
+    unhandled_.pop();
+    ++NumIters;
+    DEBUG(dbgs() << "\n*** CURRENT ***: " << *cur << '\n');
+
+    assert(!cur->empty() && "Empty interval in unhandled set.");
+
+    processActiveIntervals(cur->beginIndex());
+    processInactiveIntervals(cur->beginIndex());
+
+    assert(TargetRegisterInfo::isVirtualRegister(cur->reg) &&
+           "Can only allocate virtual registers!");
+
+    // Allocating a virtual register: try to find a free physical register or
+    // spill an interval (possibly this one) in order to assign it one.
+    assignRegOrStackSlotAtInterval(cur);
+
+    DEBUG({
+        printIntervals("active", active_.begin(), active_.end());
+        printIntervals("inactive", inactive_.begin(), inactive_.end());
+      });
+  }
+
+  // Expire any remaining active intervals
+  while (!active_.empty()) {
+    IntervalPtr &IP = active_.back();
+    unsigned reg = IP.first->reg;
+    DEBUG(dbgs() << "\tinterval " << *IP.first << " expired\n");
+    assert(TargetRegisterInfo::isVirtualRegister(reg) &&
+           "Can only allocate virtual registers!");
+    reg = vrm_->getPhys(reg);
+    delRegUse(reg);
+    active_.pop_back();
+  }
+
+  // Expire any remaining inactive intervals
+  DEBUG({
+      for (IntervalPtrs::reverse_iterator
+             i = inactive_.rbegin(); i != inactive_.rend(); ++i)
+        dbgs() << "\tinterval " << *i->first << " expired\n";
+    });
+  inactive_.clear();
+
+  // Add live-ins to every BB except for entry. Also perform trivial coalescing.
+  MachineFunction::iterator EntryMBB = mf_->begin();
+  SmallVector<MachineBasicBlock*, 8> LiveInMBBs;
+  for (LiveIntervals::iterator i = li_->begin(), e = li_->end(); i != e; ++i) {
+    LiveInterval &cur = *i->second;
+    unsigned Reg = 0;
+    bool isPhys = TargetRegisterInfo::isPhysicalRegister(cur.reg);
+    if (isPhys)
+      Reg = cur.reg;
+    else if (vrm_->isAssignedReg(cur.reg))
+      Reg = attemptTrivialCoalescing(cur, vrm_->getPhys(cur.reg));
+    if (!Reg)
+      continue;
+    // Ignore split live intervals.
+    if (!isPhys && vrm_->getPreSplitReg(cur.reg))
+      continue;
+
+    for (LiveInterval::Ranges::const_iterator I = cur.begin(), E = cur.end();
+         I != E; ++I) {
+      const LiveRange &LR = *I;
+      if (li_->findLiveInMBBs(LR.start, LR.end, LiveInMBBs)) {
+        for (unsigned i = 0, e = LiveInMBBs.size(); i != e; ++i)
+          if (LiveInMBBs[i] != EntryMBB) {
+            assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+                   "Adding a virtual register to livein set?");
+            LiveInMBBs[i]->addLiveIn(Reg);
+          }
+        LiveInMBBs.clear();
+      }
+    }
+  }
+
+  DEBUG(dbgs() << *vrm_);
+
+  // Look for physical registers that end up not being allocated even though
+  // the register allocator had to spill other registers in its register class.
+  if (ls_->getNumIntervals() == 0)
+    return;
+  if (!vrm_->FindUnusedRegisters(li_))
+    return;
+}
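+
+// For reference, the loop above moves intervals between the sets as follows:
+//
+//   unhandled -> active     assigned a physreg when the scan reaches its start
+//   active    -> inactive   entered a hole in its live range (keeps its reg)
+//   inactive  -> active     became live again at the current point
+//   active/inactive -> expired   its live range ended before the current point
+//
+// unhandled_ is ordered by start point, so the scan position only moves
+// forward, except when assignRegOrStackSlotAtInterval backtracks and pushes
+// intervals back onto unhandled_.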
+
+/// processActiveIntervals - expire old intervals and move non-overlapping ones
+/// to the inactive list.
+void RALinScan::processActiveIntervals(SlotIndex CurPoint)
+{
+  DEBUG(dbgs() << "\tprocessing active intervals:\n");
+
+  for (unsigned i = 0, e = active_.size(); i != e; ++i) {
+    LiveInterval *Interval = active_[i].first;
+    LiveInterval::iterator IntervalPos = active_[i].second;
+    unsigned reg = Interval->reg;
+
+    IntervalPos = Interval->advanceTo(IntervalPos, CurPoint);
+
+    if (IntervalPos == Interval->end()) {     // Remove expired intervals.
+      DEBUG(dbgs() << "\t\tinterval " << *Interval << " expired\n");
+      assert(TargetRegisterInfo::isVirtualRegister(reg) &&
+             "Can only allocate virtual registers!");
+      reg = vrm_->getPhys(reg);
+      delRegUse(reg);
+
+      // Pop off the end of the list.
+      active_[i] = active_.back();
+      active_.pop_back();
+      --i; --e;
+
+    } else if (IntervalPos->start > CurPoint) {
+      // Move inactive intervals to inactive list.
+      DEBUG(dbgs() << "\t\tinterval " << *Interval << " inactive\n");
+      assert(TargetRegisterInfo::isVirtualRegister(reg) &&
+             "Can only allocate virtual registers!");
+      reg = vrm_->getPhys(reg);
+      delRegUse(reg);
+      // add to inactive.
+      inactive_.push_back(std::make_pair(Interval, IntervalPos));
+
+      // Pop off the end of the list.
+      active_[i] = active_.back();
+      active_.pop_back();
+      --i; --e;
+    } else {
+      // Otherwise, just update the iterator position.
+      active_[i].second = IntervalPos;
+    }
+  }
+}
+
+/// processInactiveIntervals - expire old intervals and move overlapping
+/// ones to the active list.
+void RALinScan::processInactiveIntervals(SlotIndex CurPoint)
+{
+  DEBUG(dbgs() << "\tprocessing inactive intervals:\n");
+
+  for (unsigned i = 0, e = inactive_.size(); i != e; ++i) {
+    LiveInterval *Interval = inactive_[i].first;
+    LiveInterval::iterator IntervalPos = inactive_[i].second;
+    unsigned reg = Interval->reg;
+
+    IntervalPos = Interval->advanceTo(IntervalPos, CurPoint);
+
+    if (IntervalPos == Interval->end()) {       // remove expired intervals.
+      DEBUG(dbgs() << "\t\tinterval " << *Interval << " expired\n");
+
+      // Pop off the end of the list.
+      inactive_[i] = inactive_.back();
+      inactive_.pop_back();
+      --i; --e;
+    } else if (IntervalPos->start <= CurPoint) {
+      // Move re-activated intervals to the active list.
+      DEBUG(dbgs() << "\t\tinterval " << *Interval << " active\n");
+      assert(TargetRegisterInfo::isVirtualRegister(reg) &&
+             "Can only allocate virtual registers!");
+      reg = vrm_->getPhys(reg);
+      addRegUse(reg);
+      // add to active
+      active_.push_back(std::make_pair(Interval, IntervalPos));
+
+      // Pop off the end of the list.
+      inactive_[i] = inactive_.back();
+      inactive_.pop_back();
+      --i; --e;
+    } else {
+      // Otherwise, just update the iterator position.
+      inactive_[i].second = IntervalPos;
+    }
+  }
+}
+
+/// updateSpillWeights - updates the spill weights of the specified physical
+/// register and its aliases by the given weight.
+void RALinScan::updateSpillWeights(std::vector<float> &Weights,
+                                   unsigned reg, float weight,
+                                   const TargetRegisterClass *RC) {
+  SmallSet<unsigned, 4> Processed;
+  SmallSet<unsigned, 4> SuperAdded;
+  SmallVector<unsigned, 4> Supers;
+  Weights[reg] += weight;
+  Processed.insert(reg);
+  for (const unsigned* as = tri_->getAliasSet(reg); *as; ++as) {
+    Weights[*as] += weight;
+    Processed.insert(*as);
+    if (tri_->isSubRegister(*as, reg) &&
+        SuperAdded.insert(*as) &&
+        RC->contains(*as)) {
+      Supers.push_back(*as);
+    }
+  }
+
+  // If the alias is a super-register and the super-register is in the
+  // register class we are trying to allocate, then add the weight to all
+  // sub-registers of the super-register even if they are not aliases.
+  // e.g. When allocating for GR32 and bh is not used, bl's spill weight
+  //      still needs updating: bl should get the same spill weight as bh,
+  //      otherwise it will be chosen as a spill candidate even though
+  //      spilling bh doesn't make ebx available.
+  for (unsigned i = 0, e = Supers.size(); i != e; ++i) {
+    for (const unsigned *sr = tri_->getSubRegisters(Supers[i]); *sr; ++sr)
+      if (!Processed.count(*sr))
+        Weights[*sr] += weight;
+  }
+}
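+
+// Worked example for the bh/bl case in the comment above (X86, allocating for
+// GR32; RBX ignored for brevity):
+//
+//   updateSpillWeights(W, BH, w, GR32)
+//     W[BH]  += w   // the register itself
+//     W[BX]  += w   // alias of BH
+//     W[EBX] += w   // alias of BH; also a super-register contained in GR32
+//     W[BL]  += w   // sub-register of EBX, added by the final loop above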
+
+static
+RALinScan::IntervalPtrs::iterator
+FindIntervalInVector(RALinScan::IntervalPtrs &IP, LiveInterval *LI) {
+  for (RALinScan::IntervalPtrs::iterator I = IP.begin(), E = IP.end();
+       I != E; ++I)
+    if (I->first == LI) return I;
+  return IP.end();
+}
+
+static void RevertVectorIteratorsTo(RALinScan::IntervalPtrs &V, SlotIndex Point){
+  for (unsigned i = 0, e = V.size(); i != e; ++i) {
+    RALinScan::IntervalPtr &IP = V[i];
+    LiveInterval::iterator I = std::upper_bound(IP.first->begin(),
+                                                IP.second, Point);
+    if (I != IP.first->begin()) --I;
+    IP.second = I;
+  }
+}
+
+/// addStackInterval - Create a LiveInterval for the stack slot if the
+/// specified live interval has been spilled.
+static void addStackInterval(LiveInterval *cur, LiveStacks *ls_,
+                             LiveIntervals *li_,
+                             MachineRegisterInfo* mri_, VirtRegMap &vrm_) {
+  int SS = vrm_.getStackSlot(cur->reg);
+  if (SS == VirtRegMap::NO_STACK_SLOT)
+    return;
+
+  const TargetRegisterClass *RC = mri_->getRegClass(cur->reg);
+  LiveInterval &SI = ls_->getOrCreateInterval(SS, RC);
+
+  VNInfo *VNI;
+  if (SI.hasAtLeastOneValue())
+    VNI = SI.getValNumInfo(0);
+  else
+    VNI = SI.getNextValue(SlotIndex(), 0, false,
+                          ls_->getVNInfoAllocator());
+
+  LiveInterval &RI = li_->getInterval(cur->reg);
+  // FIXME: This may be overly conservative.
+  SI.MergeRangesInAsValue(RI, VNI);
+}
+
+/// getConflictWeight - Return the number of conflicts between the cur
+/// live interval and the defs and uses of Reg, weighted by loop depth.
+static
+float getConflictWeight(LiveInterval *cur, unsigned Reg, LiveIntervals *li_,
+                        MachineRegisterInfo *mri_,
+                        MachineLoopInfo *loopInfo) {
+  float Conflicts = 0;
+  for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(Reg),
+         E = mri_->reg_end(); I != E; ++I) {
+    MachineInstr *MI = &*I;
+    if (cur->liveAt(li_->getInstructionIndex(MI))) {
+      unsigned loopDepth = loopInfo->getLoopDepth(MI->getParent());
+      Conflicts += std::pow(10.0f, (float)loopDepth);
+    }
+  }
+  return Conflicts;
+}
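+
+// With the 10^depth weighting above, a conflicting use at loop depth 0
+// contributes 1.0, depth 1 contributes 10.0, and depth 2 contributes 100.0,
+// so conflicts inside nested loops dominate the candidate comparison done by
+// findIntervalsToSpill below.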
+
+/// findIntervalsToSpill - Determine the intervals to spill for the
+/// specified interval. It's passed the physical registers whose spill
+/// weight is the lowest among all the registers whose live intervals
+/// conflict with the interval.
+void RALinScan::findIntervalsToSpill(LiveInterval *cur,
+                            std::vector<std::pair<unsigned,float> > &Candidates,
+                            unsigned NumCands,
+                            SmallVector<LiveInterval*, 8> &SpillIntervals) {
+  // We have figured out the *best* register to spill. But there are other
+  // registers that are pretty good as well (spill weight close, as judged by
+  // weightsAreClose). Spill the one that has the fewest defs and uses that
+  // conflict with cur.
+  float Conflicts[3] = { 0.0f, 0.0f, 0.0f };
+  SmallVector<LiveInterval*, 8> SLIs[3];
+
+  DEBUG({
+      dbgs() << "\tConsidering " << NumCands << " candidates: ";
+      for (unsigned i = 0; i != NumCands; ++i)
+        dbgs() << tri_->getName(Candidates[i].first) << " ";
+      dbgs() << "\n";
+    });
+  
+  // Calculate the number of conflicts of each candidate.
+  for (IntervalPtrs::iterator i = active_.begin(); i != active_.end(); ++i) {
+    unsigned Reg = i->first->reg;
+    unsigned PhysReg = vrm_->getPhys(Reg);
+    if (!cur->overlapsFrom(*i->first, i->second))
+      continue;
+    for (unsigned j = 0; j < NumCands; ++j) {
+      unsigned Candidate = Candidates[j].first;
+      if (tri_->regsOverlap(PhysReg, Candidate)) {
+        if (NumCands > 1)
+          Conflicts[j] += getConflictWeight(cur, Reg, li_, mri_, loopInfo);
+        SLIs[j].push_back(i->first);
+      }
+    }
+  }
+
+  for (IntervalPtrs::iterator i = inactive_.begin(); i != inactive_.end(); ++i){
+    unsigned Reg = i->first->reg;
+    unsigned PhysReg = vrm_->getPhys(Reg);
+    if (!cur->overlapsFrom(*i->first, i->second-1))
+      continue;
+    for (unsigned j = 0; j < NumCands; ++j) {
+      unsigned Candidate = Candidates[j].first;
+      if (tri_->regsOverlap(PhysReg, Candidate)) {
+        if (NumCands > 1)
+          Conflicts[j] += getConflictWeight(cur, Reg, li_, mri_, loopInfo);
+        SLIs[j].push_back(i->first);
+      }
+    }
+  }
+
+  // Which is the best candidate?
+  unsigned BestCandidate = 0;
+  float MinConflicts = Conflicts[0];
+  for (unsigned i = 1; i != NumCands; ++i) {
+    if (Conflicts[i] < MinConflicts) {
+      BestCandidate = i;
+      MinConflicts = Conflicts[i];
+    }
+  }
+
+  std::copy(SLIs[BestCandidate].begin(), SLIs[BestCandidate].end(),
+            std::back_inserter(SpillIntervals));
+}
+
+namespace {
+  struct WeightCompare {
+  private:
+    const RALinScan &Allocator;
+
+  public:
+    WeightCompare(const RALinScan &Alloc) : Allocator(Alloc) {}
+
+    typedef std::pair<unsigned, float> RegWeightPair;
+    bool operator()(const RegWeightPair &LHS, const RegWeightPair &RHS) const {
+      return LHS.second < RHS.second && !Allocator.isRecentlyUsed(LHS.first);
+    }
+  };
+}
+
+static bool weightsAreClose(float w1, float w2) {
+  if (!NewHeuristic)
+    return false;
+
+  float diff = w1 - w2;
+  if (diff <= 0.02f)  // Within 0.02f
+    return true;
+  return (diff / w2) <= 0.05f;  // Within 5%.
+}
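+
+// For example, w1 = 1.04 and w2 = 1.00 give diff = 0.04: the absolute test
+// fails (0.04 > 0.02) but the relative test passes (0.04 / 1.00 = 4% <= 5%),
+// so the weights count as close -- provided NewHeuristic is set; otherwise
+// this always answers false.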
+
+LiveInterval *RALinScan::hasNextReloadInterval(LiveInterval *cur) {
+  DenseMap<unsigned, unsigned>::iterator I = NextReloadMap.find(cur->reg);
+  if (I == NextReloadMap.end())
+    return 0;
+  return &li_->getInterval(I->second);
+}
+
+void RALinScan::DowngradeRegister(LiveInterval *li, unsigned Reg) {
+  bool isNew = DowngradedRegs.insert(Reg);
+  isNew = isNew; // Silence compiler warning.
+  assert(isNew && "Multiple reloads holding the same register?");
+  DowngradeMap.insert(std::make_pair(li->reg, Reg));
+  for (const unsigned *AS = tri_->getAliasSet(Reg); *AS; ++AS) {
+    isNew = DowngradedRegs.insert(*AS);
+    isNew = isNew; // Silence compiler warning.
+    assert(isNew && "Multiple reloads holding the same register?");
+    DowngradeMap.insert(std::make_pair(li->reg, *AS));
+  }
+  ++NumDowngrade;
+}
+
+void RALinScan::UpgradeRegister(unsigned Reg) {
+  if (Reg) {
+    DowngradedRegs.erase(Reg);
+    for (const unsigned *AS = tri_->getAliasSet(Reg); *AS; ++AS)
+      DowngradedRegs.erase(*AS);
+  }
+}
+
+namespace {
+  struct LISorter {
+    bool operator()(LiveInterval* A, LiveInterval* B) {
+      return A->beginIndex() < B->beginIndex();
+    }
+  };
+}
+
+/// assignRegOrStackSlotAtInterval - assign a register if one is available, or
+/// spill.
+void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) {
+  DEBUG(dbgs() << "\tallocating current interval: ");
+
+  // This is an implicitly defined live interval, just assign any register.
+  const TargetRegisterClass *RC = mri_->getRegClass(cur->reg);
+  if (cur->empty()) {
+    unsigned physReg = vrm_->getRegAllocPref(cur->reg);
+    if (!physReg)
+      physReg = *RC->allocation_order_begin(*mf_);
+    DEBUG(dbgs() <<  tri_->getName(physReg) << '\n');
+    // Note the register is not really in use.
+    vrm_->assignVirt2Phys(cur->reg, physReg);
+    return;
+  }
+
+  backUpRegUses();
+
+  std::vector<std::pair<unsigned, float> > SpillWeightsToAdd;
+  SlotIndex StartPosition = cur->beginIndex();
+  const TargetRegisterClass *RCLeader = RelatedRegClasses.getLeaderValue(RC);
+
+  // If the start of this live interval is defined by a move instruction and
+  // its source is assigned a physical register that is compatible with the
+  // target register class, then we should try to assign it the same register.
+  // This can happen when the move is from a larger register class to a smaller
+  // one, e.g. X86::mov32to32_. These move instructions are not coalescable.
+  if (!vrm_->getRegAllocPref(cur->reg) && cur->hasAtLeastOneValue()) {
+    VNInfo *vni = cur->begin()->valno;
+    if ((vni->def != SlotIndex()) && !vni->isUnused() &&
+         vni->isDefAccurate()) {
+      MachineInstr *CopyMI = li_->getInstructionFromIndex(vni->def);
+      if (CopyMI && CopyMI->isCopy()) {
+        unsigned DstSubReg = CopyMI->getOperand(0).getSubReg();
+        unsigned SrcReg = CopyMI->getOperand(1).getReg();
+        unsigned SrcSubReg = CopyMI->getOperand(1).getSubReg();
+        unsigned Reg = 0;
+        if (TargetRegisterInfo::isPhysicalRegister(SrcReg))
+          Reg = SrcReg;
+        else if (vrm_->isAssignedReg(SrcReg))
+          Reg = vrm_->getPhys(SrcReg);
+        if (Reg) {
+          if (SrcSubReg)
+            Reg = tri_->getSubReg(Reg, SrcSubReg);
+          if (DstSubReg)
+            Reg = tri_->getMatchingSuperReg(Reg, DstSubReg, RC);
+          if (Reg && allocatableRegs_[Reg] && RC->contains(Reg))
+            mri_->setRegAllocationHint(cur->reg, 0, Reg);
+        }
+      }
+    }
+  }
+
+  // For every interval in inactive we overlap with, mark the
+  // register as not free and update spill weights.
+  for (IntervalPtrs::const_iterator i = inactive_.begin(),
+         e = inactive_.end(); i != e; ++i) {
+    unsigned Reg = i->first->reg;
+    assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
+           "Can only allocate virtual registers!");
+    const TargetRegisterClass *RegRC = mri_->getRegClass(Reg);
+    // If this is not in a related reg class to the register we're allocating, 
+    // don't check it.
+    if (RelatedRegClasses.getLeaderValue(RegRC) == RCLeader &&
+        cur->overlapsFrom(*i->first, i->second-1)) {
+      Reg = vrm_->getPhys(Reg);
+      addRegUse(Reg);
+      SpillWeightsToAdd.push_back(std::make_pair(Reg, i->first->weight));
+    }
+  }
+  
+  // Speculatively check to see if we can get a register right now.  If not,
+  // we know we won't be able to get one by adding more constraints.  If so,
+  // we can check to see if it is valid.  Doing an exhaustive search of the
+  // fixed_ list is very bad (it contains all callee clobbered registers for
+  // any function with a call), so we want to avoid doing that if possible.
+  unsigned physReg = getFreePhysReg(cur);
+  unsigned BestPhysReg = physReg;
+  if (physReg) {
+    // We got a register.  However, if it's in the fixed_ list, we might
+    // conflict with it.  Check to see if we conflict with it or any of its
+    // aliases.
+    SmallSet<unsigned, 8> RegAliases;
+    for (const unsigned *AS = tri_->getAliasSet(physReg); *AS; ++AS)
+      RegAliases.insert(*AS);
+    
+    bool ConflictsWithFixed = false;
+    for (unsigned i = 0, e = fixed_.size(); i != e; ++i) {
+      IntervalPtr &IP = fixed_[i];
+      if (physReg == IP.first->reg || RegAliases.count(IP.first->reg)) {
+        // Okay, this reg is on the fixed list.  Check to see if we actually
+        // conflict.
+        LiveInterval *I = IP.first;
+        if (I->endIndex() > StartPosition) {
+          LiveInterval::iterator II = I->advanceTo(IP.second, StartPosition);
+          IP.second = II;
+          if (II != I->begin() && II->start > StartPosition)
+            --II;
+          if (cur->overlapsFrom(*I, II)) {
+            ConflictsWithFixed = true;
+            break;
+          }
+        }
+      }
+    }
+    
+    // Okay, the register picked by our speculative getFreePhysReg call turned
+    // out to be in use.  Actually add all of the conflicting fixed registers to
+    // regUse_ so we can do an accurate query.
+    if (ConflictsWithFixed) {
+      // For every interval in fixed we overlap with, mark the register as not
+      // free and update spill weights.
+      for (unsigned i = 0, e = fixed_.size(); i != e; ++i) {
+        IntervalPtr &IP = fixed_[i];
+        LiveInterval *I = IP.first;
+
+        const TargetRegisterClass *RegRC = OneClassForEachPhysReg[I->reg];
+        if (RelatedRegClasses.getLeaderValue(RegRC) == RCLeader &&       
+            I->endIndex() > StartPosition) {
+          LiveInterval::iterator II = I->advanceTo(IP.second, StartPosition);
+          IP.second = II;
+          if (II != I->begin() && II->start > StartPosition)
+            --II;
+          if (cur->overlapsFrom(*I, II)) {
+            unsigned reg = I->reg;
+            addRegUse(reg);
+            SpillWeightsToAdd.push_back(std::make_pair(reg, I->weight));
+          }
+        }
+      }
+
+      // Using the newly updated regUse_ object, which includes conflicts in the
+      // future, see if there are any registers available.
+      physReg = getFreePhysReg(cur);
+    }
+  }
+    
+  // Restore the physical register tracker, removing information about the
+  // future.
+  restoreRegUses();
+  
+  // If we find a free register, we are done: assign this virtual to
+  // the free physical register and add this interval to the active
+  // list.
+  if (physReg) {
+    DEBUG(dbgs() <<  tri_->getName(physReg) << '\n');
+    vrm_->assignVirt2Phys(cur->reg, physReg);
+    addRegUse(physReg);
+    active_.push_back(std::make_pair(cur, cur->begin()));
+    handled_.push_back(cur);
+
+    // "Upgrade" the physical register since it has been allocated.
+    UpgradeRegister(physReg);
+    if (LiveInterval *NextReloadLI = hasNextReloadInterval(cur)) {
+      // "Downgrade" physReg to try to keep physReg from being allocated until
+      // the next reload from the same SS is allocated. 
+      mri_->setRegAllocationHint(NextReloadLI->reg, 0, physReg);
+      DowngradeRegister(cur, physReg);
+    }
+    return;
+  }
+  DEBUG(dbgs() << "no free registers\n");
+
+  // Compile the spill weights into an array that is better for scanning.
+  std::vector<float> SpillWeights(tri_->getNumRegs(), 0.0f);
+  for (std::vector<std::pair<unsigned, float> >::iterator
+       I = SpillWeightsToAdd.begin(), E = SpillWeightsToAdd.end(); I != E; ++I)
+    updateSpillWeights(SpillWeights, I->first, I->second, RC);
+  
+  // for each interval in active, update spill weights.
+  for (IntervalPtrs::const_iterator i = active_.begin(), e = active_.end();
+       i != e; ++i) {
+    unsigned reg = i->first->reg;
+    assert(TargetRegisterInfo::isVirtualRegister(reg) &&
+           "Can only allocate virtual registers!");
+    reg = vrm_->getPhys(reg);
+    updateSpillWeights(SpillWeights, reg, i->first->weight, RC);
+  }
+ 
+  DEBUG(dbgs() << "\tassigning stack slot at interval "<< *cur << ":\n");
+
+  // Find a register to spill.
+  float minWeight = HUGE_VALF;
+  unsigned minReg = 0;
+
+  bool Found = false;
+  std::vector<std::pair<unsigned,float> > RegsWeights;
+  if (!minReg || SpillWeights[minReg] == HUGE_VALF)
+    for (TargetRegisterClass::iterator i = RC->allocation_order_begin(*mf_),
+           e = RC->allocation_order_end(*mf_); i != e; ++i) {
+      unsigned reg = *i;
+      float regWeight = SpillWeights[reg];
+      // Skip recently allocated registers.
+      if (minWeight > regWeight && !isRecentlyUsed(reg))
+        Found = true;
+      RegsWeights.push_back(std::make_pair(reg, regWeight));
+    }
+  
+  // If we didn't find a spillable register, try aliases.
+  if (!Found) {
+    for (TargetRegisterClass::iterator i = RC->allocation_order_begin(*mf_),
+           e = RC->allocation_order_end(*mf_); i != e; ++i) {
+      unsigned reg = *i;
+      // No need to worry if the alias register's size is smaller than the
+      // register size of RC; we are going to spill all registers that alias
+      // it anyway.
+      for (const unsigned* as = tri_->getAliasSet(reg); *as; ++as)
+        RegsWeights.push_back(std::make_pair(*as, SpillWeights[*as]));
+    }
+  }
+
+  // Sort all potential spill candidates by weight.
+  std::sort(RegsWeights.begin(), RegsWeights.end(), WeightCompare(*this));
+  minReg = RegsWeights[0].first;
+  minWeight = RegsWeights[0].second;
+  if (minWeight == HUGE_VALF) {
+    // All registers must have inf weight. Just grab one!
+    minReg = BestPhysReg ? BestPhysReg : *RC->allocation_order_begin(*mf_);
+    if (cur->weight == HUGE_VALF ||
+        li_->getApproximateInstructionCount(*cur) == 0) {
+      // Spill a physical register around defs and uses.
+      if (li_->spillPhysRegAroundRegDefsUses(*cur, minReg, *vrm_)) {
+        // spillPhysRegAroundRegDefsUses may have invalidated iterator stored
+        // in fixed_. Reset them.
+        for (unsigned i = 0, e = fixed_.size(); i != e; ++i) {
+          IntervalPtr &IP = fixed_[i];
+          LiveInterval *I = IP.first;
+          if (I->reg == minReg || tri_->isSubRegister(minReg, I->reg))
+            IP.second = I->advanceTo(I->begin(), StartPosition);
+        }
+
+        DowngradedRegs.clear();
+        assignRegOrStackSlotAtInterval(cur);
+      } else {
+        assert(false && "Ran out of registers during register allocation!");
+        report_fatal_error("Ran out of registers during register allocation!");
+      }
+      return;
+    }
+  }
+
+  // Find up to 3 registers to consider as spill candidates.
+  unsigned LastCandidate = RegsWeights.size() >= 3 ? 3 : 1;
+  while (LastCandidate > 1) {
+    if (weightsAreClose(RegsWeights[LastCandidate-1].second, minWeight))
+      break;
+    --LastCandidate;
+  }
+
+  DEBUG({
+      dbgs() << "\t\tregister(s) with min weight(s): ";
+
+      for (unsigned i = 0; i != LastCandidate; ++i)
+        dbgs() << tri_->getName(RegsWeights[i].first)
+               << " (" << RegsWeights[i].second << ")\n";
+    });
+
+  // If the current interval has the minimum weight, we need to spill it,
+  // add any added intervals back to unhandled, and restart linear scan.
+  if (cur->weight != HUGE_VALF && cur->weight <= minWeight) {
+    DEBUG(dbgs() << "\t\t\tspilling(c): " << *cur << '\n');
+    SmallVector<LiveInterval*, 8> spillIs;
+    std::vector<LiveInterval*> added;
+    spiller_->spill(cur, added, spillIs);
+
+    std::sort(added.begin(), added.end(), LISorter());
+    addStackInterval(cur, ls_, li_, mri_, *vrm_);
+    if (added.empty())
+      return;  // Early exit if all spills were folded.
+
+    // Merge added with unhandled.  Note that we have already sorted the
+    // intervals returned by the spiller by their starting point.
+    // This also updates the NextReloadMap; that is, it adds a mapping from a
+    // register defined by a reload from SS to the next reload from SS in the
+    // same basic block.
+    MachineBasicBlock *LastReloadMBB = 0;
+    LiveInterval *LastReload = 0;
+    int LastReloadSS = VirtRegMap::NO_STACK_SLOT;
+    for (unsigned i = 0, e = added.size(); i != e; ++i) {
+      LiveInterval *ReloadLi = added[i];
+      if (ReloadLi->weight == HUGE_VALF &&
+          li_->getApproximateInstructionCount(*ReloadLi) == 0) {
+        SlotIndex ReloadIdx = ReloadLi->beginIndex();
+        MachineBasicBlock *ReloadMBB = li_->getMBBFromIndex(ReloadIdx);
+        int ReloadSS = vrm_->getStackSlot(ReloadLi->reg);
+        if (LastReloadMBB == ReloadMBB && LastReloadSS == ReloadSS) {
+          // Last reload of same SS is in the same MBB. We want to try to
+          // allocate both reloads the same register and make sure the reg
+          // isn't clobbered in between if at all possible.
+          assert(LastReload->beginIndex() < ReloadIdx);
+          NextReloadMap.insert(std::make_pair(LastReload->reg, ReloadLi->reg));
+        }
+        LastReloadMBB = ReloadMBB;
+        LastReload = ReloadLi;
+        LastReloadSS = ReloadSS;
+      }
+      unhandled_.push(ReloadLi);
+    }
+    return;
+  }
+
+  ++NumBacktracks;
+
+  // Push the current interval back to unhandled since we are going to re-run
+  // at least this iteration. Since we didn't modify it, it should go back
+  // right at the front of the list.
+  unhandled_.push(cur);
+
+  assert(TargetRegisterInfo::isPhysicalRegister(minReg) &&
+         "did not choose a register to spill?");
+
+  // We spill all intervals aliasing the register with
+  // minimum weight, rollback to the interval with the earliest
+  // start point and let the linear scan algorithm run again
+  SmallVector<LiveInterval*, 8> spillIs;
+
+  // Determine which intervals have to be spilled.
+  findIntervalsToSpill(cur, RegsWeights, LastCandidate, spillIs);
+
+  // Set of spilled vregs (used later to rollback properly)
+  SmallSet<unsigned, 8> spilled;
+
+  // The earliest start of a spilled interval indicates up to where in
+  // handled_ we need to roll back.
+  assert(!spillIs.empty() && "No spill intervals?"); 
+  SlotIndex earliestStart = spillIs[0]->beginIndex();
+  
+  // Spill live intervals of virtual regs mapped to the physical register we
+  // want to clear (and its aliases).  We only spill those that overlap with
+  // the current interval as the rest do not affect its allocation. We also
+  // keep track of the earliest start of all spilled live intervals since this
+  // will mark our rollback point.
+  std::vector<LiveInterval*> added;
+  while (!spillIs.empty()) {
+    LiveInterval *sli = spillIs.back();
+    spillIs.pop_back();
+    DEBUG(dbgs() << "\t\t\tspilling(a): " << *sli << '\n');
+    if (sli->beginIndex() < earliestStart)
+      earliestStart = sli->beginIndex();
+       
+    spiller_->spill(sli, added, spillIs, &earliestStart);
+    addStackInterval(sli, ls_, li_, mri_, *vrm_);
+    spilled.insert(sli->reg);
+  }
+
+  DEBUG(dbgs() << "\t\trolling back to: " << earliestStart << '\n');
+
+  // Scan handled in reverse order up to the earliest start of a
+  // spilled live interval and undo each one, restoring the state of
+  // unhandled.
+  while (!handled_.empty()) {
+    LiveInterval* i = handled_.back();
+    // If this interval starts before earliestStart, we are done.
+    if (!i->empty() && i->beginIndex() < earliestStart)
+      break;
+    DEBUG(dbgs() << "\t\t\tundo changes for: " << *i << '\n');
+    handled_.pop_back();
+
+    // When undoing a live interval allocation we must know if it is active or
+    // inactive to properly update regUse_ and the VirtRegMap.
+    IntervalPtrs::iterator it;
+    if ((it = FindIntervalInVector(active_, i)) != active_.end()) {
+      active_.erase(it);
+      assert(!TargetRegisterInfo::isPhysicalRegister(i->reg));
+      if (!spilled.count(i->reg))
+        unhandled_.push(i);
+      delRegUse(vrm_->getPhys(i->reg));
+      vrm_->clearVirt(i->reg);
+    } else if ((it = FindIntervalInVector(inactive_, i)) != inactive_.end()) {
+      inactive_.erase(it);
+      assert(!TargetRegisterInfo::isPhysicalRegister(i->reg));
+      if (!spilled.count(i->reg))
+        unhandled_.push(i);
+      vrm_->clearVirt(i->reg);
+    } else {
+      assert(TargetRegisterInfo::isVirtualRegister(i->reg) &&
+             "Can only allocate virtual registers!");
+      vrm_->clearVirt(i->reg);
+      unhandled_.push(i);
+    }
+
+    DenseMap<unsigned, unsigned>::iterator ii = DowngradeMap.find(i->reg);
+    if (ii == DowngradeMap.end())
+      // If the interval has a preference, it must be defined by a copy. Clear
+      // the preference now since the source interval allocation may have been
+      // undone as well.
+      mri_->setRegAllocationHint(i->reg, 0, 0);
+    else {
+      UpgradeRegister(ii->second);
+    }
+  }
+
+  // Rewind the iterators in the active, inactive, and fixed lists back to the
+  // point we reverted to.
+  RevertVectorIteratorsTo(active_, earliestStart);
+  RevertVectorIteratorsTo(inactive_, earliestStart);
+  RevertVectorIteratorsTo(fixed_, earliestStart);
+
+  // Scan the rest and undo each interval that expired after earliestStart and
+  // insert it in active (the next iteration of the algorithm will put it in
+  // inactive if required).
+  for (unsigned i = 0, e = handled_.size(); i != e; ++i) {
+    LiveInterval *HI = handled_[i];
+    if (!HI->expiredAt(earliestStart) &&
+        HI->expiredAt(cur->beginIndex())) {
+      DEBUG(dbgs() << "\t\t\tundo changes for: " << *HI << '\n');
+      active_.push_back(std::make_pair(HI, HI->begin()));
+      assert(!TargetRegisterInfo::isPhysicalRegister(HI->reg));
+      addRegUse(vrm_->getPhys(HI->reg));
+    }
+  }
+
+  // Merge added with unhandled.
+  // This also updates the NextReloadMap; that is, it adds a mapping from a
+  // register defined by a reload from SS to the next reload from SS in the
+  // same basic block.
+  MachineBasicBlock *LastReloadMBB = 0;
+  LiveInterval *LastReload = 0;
+  int LastReloadSS = VirtRegMap::NO_STACK_SLOT;
+  std::sort(added.begin(), added.end(), LISorter());
+  for (unsigned i = 0, e = added.size(); i != e; ++i) {
+    LiveInterval *ReloadLi = added[i];
+    if (ReloadLi->weight == HUGE_VALF &&
+        li_->getApproximateInstructionCount(*ReloadLi) == 0) {
+      SlotIndex ReloadIdx = ReloadLi->beginIndex();
+      MachineBasicBlock *ReloadMBB = li_->getMBBFromIndex(ReloadIdx);
+      int ReloadSS = vrm_->getStackSlot(ReloadLi->reg);
+      if (LastReloadMBB == ReloadMBB && LastReloadSS == ReloadSS) {
+        // Last reload of same SS is in the same MBB. We want to try to
+        // allocate both reloads the same register and make sure the reg
+        // isn't clobbered in between if at all possible.
+        assert(LastReload->beginIndex() < ReloadIdx);
+        NextReloadMap.insert(std::make_pair(LastReload->reg, ReloadLi->reg));
+      }
+      LastReloadMBB = ReloadMBB;
+      LastReload = ReloadLi;
+      LastReloadSS = ReloadSS;
+    }
+    unhandled_.push(ReloadLi);
+  }
+}
+
+unsigned RALinScan::getFreePhysReg(LiveInterval* cur,
+                                   const TargetRegisterClass *RC,
+                                   unsigned MaxInactiveCount,
+                                   SmallVector<unsigned, 256> &inactiveCounts,
+                                   bool SkipDGRegs) {
+  unsigned FreeReg = 0;
+  unsigned FreeRegInactiveCount = 0;
+
+  std::pair<unsigned, unsigned> Hint = mri_->getRegAllocationHint(cur->reg);
+  // Resolve second part of the hint (if possible) given the current allocation.
+  unsigned physReg = Hint.second;
+  if (physReg &&
+      TargetRegisterInfo::isVirtualRegister(physReg) && vrm_->hasPhys(physReg))
+    physReg = vrm_->getPhys(physReg);
+
+  TargetRegisterClass::iterator I, E;
+  tie(I, E) = tri_->getAllocationOrder(RC, Hint.first, physReg, *mf_);
+  assert(I != E && "No allocatable register in this register class!");
+
+  // Scan for the first available register.
+  for (; I != E; ++I) {
+    unsigned Reg = *I;
+    // Ignore "downgraded" registers.
+    if (SkipDGRegs && DowngradedRegs.count(Reg))
+      continue;
+    // Skip recently allocated registers.
+    if (isRegAvail(Reg) && !isRecentlyUsed(Reg)) {
+      FreeReg = Reg;
+      if (FreeReg < inactiveCounts.size())
+        FreeRegInactiveCount = inactiveCounts[FreeReg];
+      else
+        FreeRegInactiveCount = 0;
+      break;
+    }
+  }
+
+  // If there are no free regs, or if this reg has the max inactive count,
+  // return this register.
+  if (FreeReg == 0 || FreeRegInactiveCount == MaxInactiveCount) {
+    // Remember what register we picked so we can skip it next time.
+    if (FreeReg != 0) recordRecentlyUsed(FreeReg);
+    return FreeReg;
+  }
+
+  // Continue scanning the registers, looking for the one with the highest
+  // inactive count.  Alkis found that this reduced register pressure very
+  // slightly on X86 (in rev 1.94 of this file), though this should probably be
+  // reevaluated now.
+  for (; I != E; ++I) {
+    unsigned Reg = *I;
+    // Ignore "downgraded" registers.
+    if (SkipDGRegs && DowngradedRegs.count(Reg))
+      continue;
+    if (isRegAvail(Reg) && Reg < inactiveCounts.size() &&
+        FreeRegInactiveCount < inactiveCounts[Reg] && !isRecentlyUsed(Reg)) {
+      FreeReg = Reg;
+      FreeRegInactiveCount = inactiveCounts[Reg];
+      if (FreeRegInactiveCount == MaxInactiveCount)
+        break;    // We found the one with the max inactive count.
+    }
+  }
+
+  // Remember what register we picked so we can skip it next time.
+  recordRecentlyUsed(FreeReg);
+
+  return FreeReg;
+}
+
+/// getFreePhysReg - return a free physical register for this virtual register
+/// interval if we have one, otherwise return 0.
+unsigned RALinScan::getFreePhysReg(LiveInterval *cur) {
+  SmallVector<unsigned, 256> inactiveCounts;
+  unsigned MaxInactiveCount = 0;
+  
+  const TargetRegisterClass *RC = mri_->getRegClass(cur->reg);
+  const TargetRegisterClass *RCLeader = RelatedRegClasses.getLeaderValue(RC);
+ 
+  for (IntervalPtrs::iterator i = inactive_.begin(), e = inactive_.end();
+       i != e; ++i) {
+    unsigned reg = i->first->reg;
+    assert(TargetRegisterInfo::isVirtualRegister(reg) &&
+           "Can only allocate virtual registers!");
+
+    // If this is not in a related reg class to the register we're allocating, 
+    // don't check it.
+    const TargetRegisterClass *RegRC = mri_->getRegClass(reg);
+    if (RelatedRegClasses.getLeaderValue(RegRC) == RCLeader) {
+      reg = vrm_->getPhys(reg);
+      if (inactiveCounts.size() <= reg)
+        inactiveCounts.resize(reg+1);
+      ++inactiveCounts[reg];
+      MaxInactiveCount = std::max(MaxInactiveCount, inactiveCounts[reg]);
+    }
+  }
+
+  // If copy coalescer has assigned a "preferred" register, check if it's
+  // available first.
+  unsigned Preference = vrm_->getRegAllocPref(cur->reg);
+  if (Preference) {
+    DEBUG(dbgs() << "(preferred: " << tri_->getName(Preference) << ") ");
+    if (isRegAvail(Preference) && 
+        RC->contains(Preference))
+      return Preference;
+  }
+
+  if (!DowngradedRegs.empty()) {
+    unsigned FreeReg = getFreePhysReg(cur, RC, MaxInactiveCount, inactiveCounts,
+                                      true);
+    if (FreeReg)
+      return FreeReg;
+  }
+  return getFreePhysReg(cur, RC, MaxInactiveCount, inactiveCounts, false);
+}
+
+FunctionPass* llvm::createLinearScanRegisterAllocator() {
+  return new RALinScan();
+}
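+
+// Usage sketch (pass manager setup elided; PM stands for an existing
+// PassManager):
+//
+//   PM.add(llvm::createLinearScanRegisterAllocator());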
diff --git a/src/LLVM/lib/CodeGen/RegisterCoalescer.cpp b/src/LLVM/lib/CodeGen/RegisterCoalescer.cpp
new file mode 100644
index 0000000..02b5539
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/RegisterCoalescer.cpp
@@ -0,0 +1,196 @@
+//===- RegisterCoalescer.cpp - Generic Register Coalescing Interface -------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the generic RegisterCoalescer interface which
+// is used as the common interface used by all clients and
+// implementations of register coalescing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterCoalescer.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+// Register the RegisterCoalescer interface, providing a nice name to refer to.
+static RegisterAnalysisGroup<RegisterCoalescer> Z("Register Coalescer");
+char RegisterCoalescer::ID = 0;
+
+// RegisterCoalescer destructor: DO NOT move this to the header file
+// for RegisterCoalescer or else clients of the RegisterCoalescer
+// class may not depend on the RegisterCoalescer.o file in the current
+// .a file, causing alias analysis support to not be included in the
+// tool correctly!
+//
+RegisterCoalescer::~RegisterCoalescer() {}
+
+unsigned CoalescerPair::compose(unsigned a, unsigned b) const {
+  if (!a) return b;
+  if (!b) return a;
+  return tri_.composeSubRegIndices(a, b);
+}
+
+bool CoalescerPair::isMoveInstr(const MachineInstr *MI,
+                                unsigned &Src, unsigned &Dst,
+                                unsigned &SrcSub, unsigned &DstSub) const {
+  if (MI->isCopy()) {
+    Dst = MI->getOperand(0).getReg();
+    DstSub = MI->getOperand(0).getSubReg();
+    Src = MI->getOperand(1).getReg();
+    SrcSub = MI->getOperand(1).getSubReg();
+  } else if (MI->isSubregToReg()) {
+    Dst = MI->getOperand(0).getReg();
+    DstSub = compose(MI->getOperand(0).getSubReg(), MI->getOperand(3).getImm());
+    Src = MI->getOperand(2).getReg();
+    SrcSub = MI->getOperand(2).getSubReg();
+  } else
+    return false;
+  return true;
+}
+
+bool CoalescerPair::setRegisters(const MachineInstr *MI) {
+  srcReg_ = dstReg_ = subIdx_ = 0;
+  newRC_ = 0;
+  flipped_ = crossClass_ = false;
+
+  unsigned Src, Dst, SrcSub, DstSub;
+  if (!isMoveInstr(MI, Src, Dst, SrcSub, DstSub))
+    return false;
+  partial_ = SrcSub || DstSub;
+
+  // If one register is a physreg, it must be Dst.
+  if (TargetRegisterInfo::isPhysicalRegister(Src)) {
+    if (TargetRegisterInfo::isPhysicalRegister(Dst))
+      return false;
+    std::swap(Src, Dst);
+    std::swap(SrcSub, DstSub);
+    flipped_ = true;
+  }
+
+  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+  if (TargetRegisterInfo::isPhysicalRegister(Dst)) {
+    // Eliminate DstSub on a physreg.
+    if (DstSub) {
+      Dst = tri_.getSubReg(Dst, DstSub);
+      if (!Dst) return false;
+      DstSub = 0;
+    }
+
+    // Eliminate SrcSub by picking a corresponding Dst superregister.
+    if (SrcSub) {
+      Dst = tri_.getMatchingSuperReg(Dst, SrcSub, MRI.getRegClass(Src));
+      if (!Dst) return false;
+      SrcSub = 0;
+    } else if (!MRI.getRegClass(Src)->contains(Dst)) {
+      return false;
+    }
+  } else {
+    // Both registers are virtual.
+
+    // Both registers have subreg indices.
+    if (SrcSub && DstSub) {
+      // For now we only handle the case of identical indices in commensurate
+      // registers: Dreg:ssub_1 + Dreg:ssub_1 -> Dreg
+      // FIXME: Handle Qreg:ssub_3 + Dreg:ssub_1 as QReg:dsub_1 + Dreg.
+      if (SrcSub != DstSub)
+        return false;
+      const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
+      const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+      if (!getCommonSubClass(DstRC, SrcRC))
+        return false;
+      SrcSub = DstSub = 0;
+    }
+
+    // There can be no SrcSub.
+    if (SrcSub) {
+      std::swap(Src, Dst);
+      DstSub = SrcSub;
+      SrcSub = 0;
+      assert(!flipped_ && "Unexpected flip");
+      flipped_ = true;
+    }
+
+    // Find the new register class.
+    const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
+    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+    if (DstSub)
+      newRC_ = tri_.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+    else
+      newRC_ = getCommonSubClass(DstRC, SrcRC);
+    if (!newRC_)
+      return false;
+    crossClass_ = newRC_ != DstRC || newRC_ != SrcRC;
+  }
+  // Check our invariants
+  assert(TargetRegisterInfo::isVirtualRegister(Src) && "Src must be virtual");
+  assert(!(TargetRegisterInfo::isPhysicalRegister(Dst) && DstSub) &&
+         "Cannot have a physical SubIdx");
+  srcReg_ = Src;
+  dstReg_ = Dst;
+  subIdx_ = DstSub;
+  return true;
+}
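+
+// Example of the normalization above (virtual register number illustrative):
+// for the instruction
+//
+//   %reg1025 = COPY %EAX
+//
+// isMoveInstr yields Dst = %reg1025 and Src = EAX; since Src is physical the
+// pair is swapped, leaving srcReg_ = %reg1025, dstReg_ = EAX and
+// flipped_ = true. A physreg can only ever end up as dstReg_.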
+
+bool CoalescerPair::flip() {
+  if (subIdx_ || TargetRegisterInfo::isPhysicalRegister(dstReg_))
+    return false;
+  std::swap(srcReg_, dstReg_);
+  flipped_ = !flipped_;
+  return true;
+}
+
+bool CoalescerPair::isCoalescable(const MachineInstr *MI) const {
+  if (!MI)
+    return false;
+  unsigned Src, Dst, SrcSub, DstSub;
+  if (!isMoveInstr(MI, Src, Dst, SrcSub, DstSub))
+    return false;
+
+  // Find the virtual register that is srcReg_.
+  if (Dst == srcReg_) {
+    std::swap(Src, Dst);
+    std::swap(SrcSub, DstSub);
+  } else if (Src != srcReg_) {
+    return false;
+  }
+
+  // Now check that Dst matches dstReg_.
+  if (TargetRegisterInfo::isPhysicalRegister(dstReg_)) {
+    if (!TargetRegisterInfo::isPhysicalRegister(Dst))
+      return false;
+    assert(!subIdx_ && "Inconsistent CoalescerPair state.");
+    // DstSub could be set for a physreg from INSERT_SUBREG.
+    if (DstSub)
+      Dst = tri_.getSubReg(Dst, DstSub);
+    // Full copy of Src.
+    if (!SrcSub)
+      return dstReg_ == Dst;
+    // This is a partial register copy. Check that the parts match.
+    return tri_.getSubReg(dstReg_, SrcSub) == Dst;
+  } else {
+    // dstReg_ is virtual.
+    if (dstReg_ != Dst)
+      return false;
+    // Registers match, do the subregisters line up?
+    return compose(subIdx_, SrcSub) == DstSub;
+  }
+}
+
+// Because of the way .a files work, we must force the SimpleRC
+// implementation to be pulled in if the RegisterCoalescer classes are
+// pulled in.  Otherwise we run the risk of RegisterCoalescer being
+// used, but the default implementation not being linked into the tool
+// that uses it.
+DEFINING_FILE_FOR(RegisterCoalescer)
diff --git a/src/LLVM/lib/CodeGen/RegisterScavenging.cpp b/src/LLVM/lib/CodeGen/RegisterScavenging.cpp
new file mode 100644
index 0000000..43b3fb6
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/RegisterScavenging.cpp
@@ -0,0 +1,384 @@
+//===-- RegisterScavenging.cpp - Machine register scavenging --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine register scavenger. It can provide
+// information, such as unused registers, at any point in a machine basic block.
+// It also provides a mechanism to make registers available by evicting them to
+// spill slots.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reg-scavenging"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+/// setUsed - Set the register and its sub-registers as being used.
+void RegScavenger::setUsed(unsigned Reg) {
+  RegsAvailable.reset(Reg);
+
+  for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+       unsigned SubReg = *SubRegs; ++SubRegs)
+    RegsAvailable.reset(SubReg);
+}
+
+bool RegScavenger::isAliasUsed(unsigned Reg) const {
+  if (isUsed(Reg))
+    return true;
+  for (const unsigned *R = TRI->getAliasSet(Reg); *R; ++R)
+    if (isUsed(*R))
+      return true;
+  return false;
+}
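+
+// For instance, on X86 isAliasUsed(AX) reports true if AX itself or any of
+// EAX, AH, or AL is in use, since getAliasSet covers both super- and
+// sub-registers (illustrative; alias sets are target-defined).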
+
+void RegScavenger::initRegState() {
+  ScavengedReg = 0;
+  ScavengedRC = NULL;
+  ScavengeRestore = NULL;
+
+  // All registers started out unused.
+  RegsAvailable.set();
+
+  // Reserved registers are always used.
+  RegsAvailable ^= ReservedRegs;
+
+  if (!MBB)
+    return;
+
+  // Live-in registers are in use.
+  for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
+         E = MBB->livein_end(); I != E; ++I)
+    setUsed(*I);
+
+  // Pristine CSRs are also unavailable.
+  BitVector PR = MBB->getParent()->getFrameInfo()->getPristineRegs(MBB);
+  for (int I = PR.find_first(); I>0; I = PR.find_next(I))
+    setUsed(I);
+}
+
+void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) {
+  MachineFunction &MF = *mbb->getParent();
+  const TargetMachine &TM = MF.getTarget();
+  TII = TM.getInstrInfo();
+  TRI = TM.getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  assert((NumPhysRegs == 0 || NumPhysRegs == TRI->getNumRegs()) &&
+         "Target changed?");
+
+  // Self-initialize.
+  if (!MBB) {
+    NumPhysRegs = TRI->getNumRegs();
+    RegsAvailable.resize(NumPhysRegs);
+
+    // Create reserved registers bitvector.
+    ReservedRegs = TRI->getReservedRegs(MF);
+
+    // Create callee-saved registers bitvector.
+    CalleeSavedRegs.resize(NumPhysRegs);
+    const unsigned *CSRegs = TRI->getCalleeSavedRegs();
+    if (CSRegs != NULL)
+      for (unsigned i = 0; CSRegs[i]; ++i)
+        CalleeSavedRegs.set(CSRegs[i]);
+  }
+
+  MBB = mbb;
+  initRegState();
+
+  Tracking = false;
+}
+
+void RegScavenger::addRegWithSubRegs(BitVector &BV, unsigned Reg) {
+  BV.set(Reg);
+  for (const unsigned *R = TRI->getSubRegisters(Reg); *R; R++)
+    BV.set(*R);
+}
+
+void RegScavenger::addRegWithAliases(BitVector &BV, unsigned Reg) {
+  BV.set(Reg);
+  for (const unsigned *R = TRI->getAliasSet(Reg); *R; R++)
+    BV.set(*R);
+}
+
+void RegScavenger::forward() {
+  // Move ptr forward.
+  if (!Tracking) {
+    MBBI = MBB->begin();
+    Tracking = true;
+  } else {
+    assert(MBBI != MBB->end() && "Already at the end of the basic block!");
+    MBBI = llvm::next(MBBI);
+  }
+
+  MachineInstr *MI = MBBI;
+
+  if (MI == ScavengeRestore) {
+    ScavengedReg = 0;
+    ScavengedRC = NULL;
+    ScavengeRestore = NULL;
+  }
+
+  if (MI->isDebugValue())
+    return;
+
+  // Find out which registers are early clobbered, killed, defined, and marked
+  // def-dead in this instruction.
+  // FIXME: The scavenger is not predication aware. If the instruction is
+  // predicated, conservatively assume "kill" markers do not actually kill the
+  // register. Similarly, ignore "dead" markers.
+  bool isPred = TII->isPredicated(MI);
+  BitVector EarlyClobberRegs(NumPhysRegs);
+  BitVector KillRegs(NumPhysRegs);
+  BitVector DefRegs(NumPhysRegs);
+  BitVector DeadRegs(NumPhysRegs);
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.isUndef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || isReserved(Reg))
+      continue;
+
+    if (MO.isUse()) {
+      // Two-address operands implicitly kill.
+      if (!isPred && (MO.isKill() || MI->isRegTiedToDefOperand(i)))
+        addRegWithSubRegs(KillRegs, Reg);
+    } else {
+      assert(MO.isDef());
+      if (!isPred && MO.isDead())
+        addRegWithSubRegs(DeadRegs, Reg);
+      else
+        addRegWithSubRegs(DefRegs, Reg);
+      if (MO.isEarlyClobber())
+        addRegWithAliases(EarlyClobberRegs, Reg);
+    }
+  }
+
+  // Verify uses and defs.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.isUndef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || isReserved(Reg))
+      continue;
+    if (MO.isUse()) {
+      if (!isUsed(Reg)) {
+        // Check if it's partial live: e.g.
+        // D0 = insert_subreg D0<undef>, S0
+        // ... D0
+        // The problem is the insert_subreg could be eliminated. The use of
+        // D0 is using a partially undef value. This is not *incorrect* since
+        // S1 can be freely clobbered.
+        // Ideally we would like a way to model this, but leaving the
+        // insert_subreg around causes both correctness and performance issues.
+        bool SubUsed = false;
+        for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+             unsigned SubReg = *SubRegs; ++SubRegs)
+          if (isUsed(SubReg)) {
+            SubUsed = true;
+            break;
+          }
+        assert(SubUsed && "Using an undefined register!");
+      }
+      assert((!EarlyClobberRegs.test(Reg) || MI->isRegTiedToDefOperand(i)) &&
+             "Using an early clobbered register!");
+    } else {
+      assert(MO.isDef());
+#if 0
+      // FIXME: Enable this once we've figured out how to correctly transfer
+      // implicit kills during codegen passes like the coalescer.
+      assert((KillRegs.test(Reg) || isUnused(Reg) ||
+              isLiveInButUnusedBefore(Reg, MI, MBB, TRI, MRI)) &&
+             "Re-defining a live register!");
+#endif
+    }
+  }
+
+  // Commit the changes.
+  setUnused(KillRegs);
+  setUnused(DeadRegs);
+  setUsed(DefRegs);
+}
+
+void RegScavenger::getRegsUsed(BitVector &used, bool includeReserved) {
+  if (includeReserved)
+    used = ~RegsAvailable;
+  else
+    used = ~RegsAvailable & ~ReservedRegs;
+}
+
+/// CreateRegClassMask - Set the bits that represent the registers in the
+/// TargetRegisterClass.
+static void CreateRegClassMask(const TargetRegisterClass *RC, BitVector &Mask) {
+  for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I != E;
+       ++I)
+    Mask.set(*I);
+}
+
+unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
+  for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
+       I != E; ++I)
+    if (!isAliasUsed(*I))
+      return *I;
+  return 0;
+}
+
+/// getRegsAvailable - Return all available registers in the register class
+/// in Mask.
+void RegScavenger::getRegsAvailable(const TargetRegisterClass *RC,
+                                    BitVector &Mask) {
+  for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
+       I != E; ++I)
+    if (!isAliasUsed(*I))
+      Mask.set(*I);
+}
+
+/// findSurvivorReg - Return the candidate register that is unused for the
+/// longest after StartMI. UseMI is set to the instruction where the search
+/// stopped.
+///
+/// No more than InstrLimit instructions are inspected.
+///
+unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
+                                       BitVector &Candidates,
+                                       unsigned InstrLimit,
+                                       MachineBasicBlock::iterator &UseMI) {
+  int Survivor = Candidates.find_first();
+  assert(Survivor > 0 && "No candidates for scavenging");
+
+  MachineBasicBlock::iterator ME = MBB->getFirstTerminator();
+  assert(StartMI != ME && "MI already at terminator");
+  MachineBasicBlock::iterator RestorePointMI = StartMI;
+  MachineBasicBlock::iterator MI = StartMI;
+
+  bool inVirtLiveRange = false;
+  for (++MI; InstrLimit > 0 && MI != ME; ++MI, --InstrLimit) {
+    if (MI->isDebugValue()) {
+      ++InstrLimit; // Don't count debug instructions
+      continue;
+    }
+    bool isVirtKillInsn = false;
+    bool isVirtDefInsn = false;
+    // Remove any candidates touched by instruction.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || MO.isUndef() || !MO.getReg())
+        continue;
+      if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+        if (MO.isDef())
+          isVirtDefInsn = true;
+        else if (MO.isKill())
+          isVirtKillInsn = true;
+        continue;
+      }
+      Candidates.reset(MO.getReg());
+      for (const unsigned *R = TRI->getAliasSet(MO.getReg()); *R; R++)
+        Candidates.reset(*R);
+    }
+    // If we're not in a virtual reg's live range, this is a valid
+    // restore point.
+    if (!inVirtLiveRange) RestorePointMI = MI;
+
+    // Update whether we're in the live range of a virtual register
+    if (isVirtKillInsn) inVirtLiveRange = false;
+    if (isVirtDefInsn) inVirtLiveRange = true;
+
+    // Was our survivor untouched by this instruction?
+    if (Candidates.test(Survivor))
+      continue;
+
+    // All candidates gone?
+    if (Candidates.none())
+      break;
+
+    Survivor = Candidates.find_first();
+  }
+  // If we ran off the end, that's where we want to restore.
+  if (MI == ME) RestorePointMI = ME;
+  assert(RestorePointMI != StartMI &&
+          "No available scavenger restore location!");
+
+  // We ran out of candidates, so stop the search.
+  UseMI = RestorePointMI;
+  return Survivor;
+}
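+// Illustrative walk-through (hypothetical registers): if Candidates starts
+// as {R1, R2} and the next instruction clobbers R1, then R1 and its aliases
+// drop out and the survivor becomes R2. The restore point trails the scan,
+// staying at the last position outside any virtual register's live range.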
+
+unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
+                                        MachineBasicBlock::iterator I,
+                                        int SPAdj) {
+  // Mask off the registers which are not in the TargetRegisterClass.
+  BitVector Candidates(NumPhysRegs, false);
+  CreateRegClassMask(RC, Candidates);
+  // Do not include reserved registers.
+  Candidates ^= ReservedRegs & Candidates;
+
+  // Exclude all the registers being used by the instruction.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = I->getOperand(i);
+    if (MO.isReg() && MO.getReg() != 0 &&
+        !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      Candidates.reset(MO.getReg());
+  }
+
+  // Try to find a register that's unused if there is one, as then we won't
+  // have to spill.
+  if ((Candidates & RegsAvailable).any())
+     Candidates &= RegsAvailable;
+
+  // Find the register whose use is furthest away.
+  MachineBasicBlock::iterator UseMI;
+  unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI);
+
+  // If we found an unused register, there is no reason to spill it.
+  if (!isAliasUsed(SReg))
+    return SReg;
+
+  assert(ScavengedReg == 0 &&
+         "Scavenger slot is live, unable to scavenge another register!");
+
+  // Avoid infinite regress
+  ScavengedReg = SReg;
+
+  // If the target knows how to save/restore the register, let it do so;
+  // otherwise, use the emergency stack spill slot.
+  if (!TRI->saveScavengerRegister(*MBB, I, UseMI, RC, SReg)) {
+    // Spill the scavenged register before I.
+    assert(ScavengingFrameIndex >= 0 &&
+           "Cannot scavenge register without an emergency spill slot!");
+    TII->storeRegToStackSlot(*MBB, I, SReg, true, ScavengingFrameIndex,
+                             RC, TRI);
+    MachineBasicBlock::iterator II = prior(I);
+    TRI->eliminateFrameIndex(II, SPAdj, NULL, this);
+
+    // Restore the scavenged register before its use (or first terminator).
+    TII->loadRegFromStackSlot(*MBB, UseMI, SReg, ScavengingFrameIndex, RC, TRI);
+    II = prior(UseMI);
+    TRI->eliminateFrameIndex(II, SPAdj, NULL, this);
+  }
+
+  ScavengeRestore = prior(UseMI);
+
+  // Doing this here leads to infinite regress.
+  // ScavengedReg = SReg;
+  ScavengedRC = RC;
+
+  return SReg;
+}
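+// A minimal usage sketch (illustrative only; the real callers are the
+// targets' frame-index elimination paths, and MBB, RC, and SPAdj are
+// assumed to be in scope):
+//
+//   RegScavenger RS;
+//   RS.enterBasicBlock(MBB);
+//   for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+//        I != E; ++I) {
+//     RS.forward();
+//     unsigned Reg = RS.scavengeRegister(RC, I, SPAdj);
+//     // ... use Reg as a scratch register for this instruction ...
+//   }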
diff --git a/src/LLVM/lib/CodeGen/RenderMachineFunction.h b/src/LLVM/lib/CodeGen/RenderMachineFunction.h
new file mode 100644
index 0000000..71a613b
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/RenderMachineFunction.h
@@ -0,0 +1,314 @@
+//===-- llvm/CodeGen/RenderMachineFunction.h - MF->HTML -*- C++ -*---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_RENDERMACHINEFUNCTION_H
+#define LLVM_CODEGEN_RENDERMACHINEFUNCTION_H
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include <algorithm>
+#include <map>
+#include <set>
+#include <string>
+
+namespace llvm {
+
+  class LiveInterval;
+  class LiveIntervals;
+  class MachineInstr;
+  class MachineRegisterInfo;
+  class TargetRegisterClass;
+  class TargetRegisterInfo;
+  class VirtRegMap;
+  class raw_ostream;
+
+  /// \brief Provide extra information about the physical and virtual registers
+  ///        in the function being compiled.
+  class TargetRegisterExtraInfo {
+  public:
+    TargetRegisterExtraInfo();
+
+    /// \brief Set up TargetRegisterExtraInfo with pointers to necessary
+    ///        sources of information.
+    void setup(MachineFunction *mf, MachineRegisterInfo *mri,
+               const TargetRegisterInfo *tri, LiveIntervals *lis);
+
+    /// \brief Recompute tables for changed function.
+    void reset(); 
+
+    /// \brief Free all tables in TargetRegisterExtraInfo.
+    void clear();
+
+    /// \brief Maximum number of registers from trc which alias reg.
+    unsigned getWorst(unsigned reg, const TargetRegisterClass *trc) const;
+
+    /// \brief Returns the number of allocable registers in trc.
+    unsigned getCapacity(const TargetRegisterClass *trc) const;
+
+    /// \brief Return the number of registers of class trc that may be
+    ///        needed at slot i.
+    unsigned getPressureAtSlot(const TargetRegisterClass *trc,
+                               SlotIndex i) const;
+
+    /// \brief Return true if the number of registers of type trc that may be
+    ///        needed at slot i is greater than the capacity of trc.
+    bool classOverCapacityAtSlot(const TargetRegisterClass *trc,
+                                 SlotIndex i) const;
+
+  private:
+
+    MachineFunction *mf;
+    MachineRegisterInfo *mri;
+    const TargetRegisterInfo *tri;
+    LiveIntervals *lis;
+
+    typedef std::map<const TargetRegisterClass*, unsigned> WorstMapLine;
+    typedef std::map<const TargetRegisterClass*, WorstMapLine> VRWorstMap;
+    VRWorstMap vrWorst;
+
+    typedef std::map<unsigned, WorstMapLine> PRWorstMap;
+    PRWorstMap prWorst;
+
+    typedef std::map<const TargetRegisterClass*, unsigned> CapacityMap;
+    CapacityMap capacityMap;
+
+    typedef std::map<const TargetRegisterClass*, unsigned> PressureMapLine;
+    typedef std::map<SlotIndex, PressureMapLine> PressureMap;
+    PressureMap pressureMap;
+
+    bool mapsPopulated;
+
+    /// \brief Initialise the 'worst' table.
+    void initWorst();
+ 
+    /// \brief Initialise the 'capacity' table.
+    void initCapacity();
+
+    /// \brief Initialise/Reset the 'pressure' and live states tables.
+    void resetPressureAndLiveStates();
+  };
+
+  /// \brief Helper class to process rendering options. Tries to be as lazy as
+  ///        possible.
+  class MFRenderingOptions {
+  public:
+
+    struct RegClassComp {
+      bool operator()(const TargetRegisterClass *trc1,
+                      const TargetRegisterClass *trc2) const {
+        std::string trc1Name(trc1->getName()), trc2Name(trc2->getName());
+        return std::lexicographical_compare(trc1Name.begin(), trc1Name.end(),
+                                            trc2Name.begin(), trc2Name.end());
+      }
+    };
+
+    typedef std::set<const TargetRegisterClass*, RegClassComp> RegClassSet;
+
+    struct IntervalComp {
+      bool operator()(const LiveInterval *li1, const LiveInterval *li2) const {
+        return li1->reg < li2->reg;
+      }
+    };
+
+    typedef std::set<const LiveInterval*, IntervalComp> IntervalSet;
+
+    /// Initialise the rendering options.
+    void setup(MachineFunction *mf, const TargetRegisterInfo *tri,
+               LiveIntervals *lis);
+
+    /// Clear translations of options to the current function.
+    void clear();
+
+    /// Reset any options computed for this specific rendering.
+    void resetRenderSpecificOptions();
+
+    /// Should we render the current function.
+    bool shouldRenderCurrentMachineFunction() const;
+
+    /// Return the set of register classes to render pressure for.
+    const RegClassSet& regClasses() const;
+
+    /// Return the set of live intervals to render liveness for.
+    const IntervalSet& intervals() const;
+
+    /// Render indexes which are not associated with instructions / MBB starts.
+    bool renderEmptyIndexes() const;
+
+    /// Return whether or not to render using SVG for fancy vertical text.
+    bool fancyVerticals() const;
+
+  private:
+
+    static bool renderingOptionsProcessed;
+    static std::set<std::string> mfNamesToRender;
+    static bool renderAllMFs;
+
+    static std::set<std::string> classNamesToRender;
+    static bool renderAllClasses;
+
+
+    static std::set<std::pair<unsigned, unsigned> > intervalNumsToRender;
+    typedef enum { ExplicitOnly     = 0,
+                   VirtPlusExplicit = 1,
+                   PhysPlusExplicit = 2,
+                   All              = 3 }
+      IntervalTypesToRender;
+    static unsigned intervalTypesToRender;
+
+    template <typename OutputItr>
+    static void splitComaSeperatedList(const std::string &s, OutputItr outItr);
+
+    static void processOptions();
+
+    static void processFuncNames();
+    static void processRegClassNames();
+    static void processIntervalNumbers();
+
+    static void processIntervalRange(const std::string &intervalRangeStr);
+
+    MachineFunction *mf;
+    const TargetRegisterInfo *tri;
+    LiveIntervals *lis;
+
+    mutable bool regClassesTranslatedToCurrentFunction;
+    mutable RegClassSet regClassSet;
+
+    mutable bool intervalsTranslatedToCurrentFunction;
+    mutable IntervalSet intervalSet;
+
+    void translateRegClassNamesToCurrentFunction() const;
+
+    void translateIntervalNumbersToCurrentFunction() const;
+  };
+
+  /// \brief Render MachineFunction objects and related information to an
+  ///        HTML page.
+  class RenderMachineFunction : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    RenderMachineFunction() : MachineFunctionPass(ID) {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &au) const;
+
+    virtual bool runOnMachineFunction(MachineFunction &fn);
+
+    virtual void releaseMemory();
+
+    /// \brief Render this machine function to HTML.
+    /// 
+    /// @param renderContextStr This parameter will be included in the top of
+    ///                         the html file to explain where (in the
+    ///                         codegen pipeline) this function was rendered
+    ///                         from. Set it to something like
+    ///                         "Pre-register-allocation".
+    /// @param vrm              If non-null the VRM will be queried to determine
+    ///                         whether a virtual register was allocated to a
+    ///                         physical register or spilled.
+    /// @param renderSuffix     This string will be appended to the function
+    ///                         name (before the output file suffix) to enable
+    ///                         multiple renderings from the same function.
+    void renderMachineFunction(const char *renderContextStr,
+                               const VirtRegMap *vrm = 0,
+                               const char *renderSuffix = 0);
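+    // Illustrative call site (hypothetical; the actual calls are made from
+    // the register allocation passes):
+    //   RMF->renderMachineFunction("Pre-register-allocation", vrm);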
+
+  private:
+    class Spacer;
+
+    friend raw_ostream& operator<<(raw_ostream &os, const Spacer &s);
+
+
+    std::string fqn;
+
+    MachineFunction *mf;
+    MachineRegisterInfo *mri;
+    const TargetRegisterInfo *tri;
+    LiveIntervals *lis;
+    SlotIndexes *sis;
+    const VirtRegMap *vrm;
+
+    TargetRegisterExtraInfo trei;
+    MFRenderingOptions ro;
+
+    // Utilities.
+    typedef enum { Dead, Defined, Used, AliveReg, AliveStack } LiveState;
+    LiveState getLiveStateAt(const LiveInterval *li, SlotIndex i) const;
+
+    typedef enum { Zero, Low, High } PressureState;
+    PressureState getPressureStateAt(const TargetRegisterClass *trc,
+                                     SlotIndex i) const;
+
+    // ---------- Rendering methods ----------
+
+    /// For inserting spaces when pretty printing.
+    class Spacer {
+    public:
+      explicit Spacer(unsigned numSpaces) : ns(numSpaces) {}
+      Spacer operator+(const Spacer &o) const { return Spacer(ns + o.ns); }
+      void print(raw_ostream &os) const;
+    private:
+      unsigned ns;
+    };
+
+    Spacer s(unsigned ns) const;
+
+    template <typename Iterator>
+    std::string escapeChars(Iterator sBegin, Iterator sEnd) const;
+
+    /// \brief Render a machine instruction.
+    void renderMachineInstr(raw_ostream &os,
+                            const MachineInstr *mi) const;
+
+    /// \brief Render vertical text.
+    template <typename T>
+    void renderVertical(const Spacer &indent,
+                        raw_ostream &os,
+                        const T &t) const;
+
+    /// \brief Insert CSS layout info.
+    void insertCSS(const Spacer &indent,
+                   raw_ostream &os) const;
+
+    /// \brief Render a brief summary of the function (including rendering
+    ///        context).
+    void renderFunctionSummary(const Spacer &indent,
+                               raw_ostream &os,
+                               const char * const renderContextStr) const;
+
+    /// \brief Render a legend for the pressure table.
+    void renderPressureTableLegend(const Spacer &indent,
+                                   raw_ostream &os) const;
+
+    /// \brief Render a consecutive set of HTML cells of the same class using
+    /// the colspan attribute for run-length encoding.
+    template <typename CellType>
+    void renderCellsWithRLE(
+                     const Spacer &indent, raw_ostream &os,
+                     const std::pair<CellType, unsigned> &rleAccumulator,
+                     const std::map<CellType, std::string> &cellTypeStrs) const;
+
+    /// \brief Render code listing, potentially with register pressure
+    ///        and live intervals shown alongside.
+    void renderCodeTablePlusPI(const Spacer &indent,
+                               raw_ostream &os) const;
+
+    /// \brief Render the HTML page representing the MachineFunction.
+    void renderFunctionPage(raw_ostream &os,
+                            const char * const renderContextStr) const;
+
+    std::string escapeChars(const std::string &s) const;
+  };
+}
+
+#endif /* LLVM_CODEGEN_RENDERMACHINEFUNCTION_H */
diff --git a/src/LLVM/lib/CodeGen/ScheduleDAG.cpp b/src/LLVM/lib/CodeGen/ScheduleDAG.cpp
new file mode 100644
index 0000000..7d39dc4
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/ScheduleDAG.cpp
@@ -0,0 +1,586 @@
+//===---- ScheduleDAG.cpp - Implement the ScheduleDAG class ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAG class, which is a base class used by
+// scheduling implementation classes.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <climits>
+using namespace llvm;
+
+ScheduleDAG::ScheduleDAG(MachineFunction &mf)
+  : TM(mf.getTarget()),
+    TII(TM.getInstrInfo()),
+    TRI(TM.getRegisterInfo()),
+    MF(mf), MRI(mf.getRegInfo()),
+    EntrySU(), ExitSU() {
+}
+
+ScheduleDAG::~ScheduleDAG() {}
+
+/// dump - dump the schedule.
+void ScheduleDAG::dumpSchedule() const {
+  for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
+    if (SUnit *SU = Sequence[i])
+      SU->dump(this);
+    else
+      dbgs() << "**** NOOP ****\n";
+  }
+}
+
+
+/// Run - perform scheduling.
+///
+void ScheduleDAG::Run(MachineBasicBlock *bb,
+                      MachineBasicBlock::iterator insertPos) {
+  BB = bb;
+  InsertPos = insertPos;
+
+  SUnits.clear();
+  Sequence.clear();
+  EntrySU = SUnit();
+  ExitSU = SUnit();
+
+  Schedule();
+
+  DEBUG({
+      dbgs() << "*** Final schedule ***\n";
+      dumpSchedule();
+      dbgs() << '\n';
+    });
+}
+
+/// addPred - This adds the specified edge as a pred of the current node if
+/// not already present.  It also adds the current node as a successor of the
+/// specified node.
+void SUnit::addPred(const SDep &D) {
+  // If this node already has this dependence, don't add a redundant one.
+  for (SmallVector<SDep, 4>::const_iterator I = Preds.begin(), E = Preds.end();
+       I != E; ++I)
+    if (*I == D)
+      return;
+  // Now add a corresponding succ to N.
+  SDep P = D;
+  P.setSUnit(this);
+  SUnit *N = D.getSUnit();
+  // Update the bookkeeping.
+  if (D.getKind() == SDep::Data) {
+    assert(NumPreds < UINT_MAX && "NumPreds will overflow!");
+    assert(N->NumSuccs < UINT_MAX && "NumSuccs will overflow!");
+    ++NumPreds;
+    ++N->NumSuccs;
+  }
+  if (!N->isScheduled) {
+    assert(NumPredsLeft < UINT_MAX && "NumPredsLeft will overflow!");
+    ++NumPredsLeft;
+  }
+  if (!isScheduled) {
+    assert(N->NumSuccsLeft < UINT_MAX && "NumSuccsLeft will overflow!");
+    ++N->NumSuccsLeft;
+  }
+  Preds.push_back(D);
+  N->Succs.push_back(P);
+  if (P.getLatency() != 0) {
+    this->setDepthDirty();
+    N->setHeightDirty();
+  }
+}
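+// For example (illustrative), recording that LoadSU feeds AddSU through
+// register Reg with a latency of 2 cycles:
+//   AddSU->addPred(SDep(LoadSU, SDep::Data, /*Latency=*/2, /*Reg=*/Reg));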
+
+/// removePred - This removes the specified edge as a pred of the current
+/// node if it exists.  It also removes the current node as a successor of
+/// the specified node.
+void SUnit::removePred(const SDep &D) {
+  // Find the matching predecessor.
+  for (SmallVector<SDep, 4>::iterator I = Preds.begin(), E = Preds.end();
+       I != E; ++I)
+    if (*I == D) {
+      bool FoundSucc = false;
+      // Find the corresponding successor in N.
+      SDep P = D;
+      P.setSUnit(this);
+      SUnit *N = D.getSUnit();
+      for (SmallVector<SDep, 4>::iterator II = N->Succs.begin(),
+             EE = N->Succs.end(); II != EE; ++II)
+        if (*II == P) {
+          FoundSucc = true;
+          N->Succs.erase(II);
+          break;
+        }
+      assert(FoundSucc && "Mismatching preds / succs lists!");
+      Preds.erase(I);
+      // Update the bookkeeping.
+      if (P.getKind() == SDep::Data) {
+        assert(NumPreds > 0 && "NumPreds will underflow!");
+        assert(N->NumSuccs > 0 && "NumSuccs will underflow!");
+        --NumPreds;
+        --N->NumSuccs;
+      }
+      if (!N->isScheduled) {
+        assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!");
+        --NumPredsLeft;
+      }
+      if (!isScheduled) {
+        assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!");
+        --N->NumSuccsLeft;
+      }
+      if (P.getLatency() != 0) {
+        this->setDepthDirty();
+        N->setHeightDirty();
+      }
+      return;
+    }
+}
+
+void SUnit::setDepthDirty() {
+  if (!isDepthCurrent) return;
+  SmallVector<SUnit*, 8> WorkList;
+  WorkList.push_back(this);
+  do {
+    SUnit *SU = WorkList.pop_back_val();
+    SU->isDepthCurrent = false;
+    for (SUnit::const_succ_iterator I = SU->Succs.begin(),
+         E = SU->Succs.end(); I != E; ++I) {
+      SUnit *SuccSU = I->getSUnit();
+      if (SuccSU->isDepthCurrent)
+        WorkList.push_back(SuccSU);
+    }
+  } while (!WorkList.empty());
+}
+
+void SUnit::setHeightDirty() {
+  if (!isHeightCurrent) return;
+  SmallVector<SUnit*, 8> WorkList;
+  WorkList.push_back(this);
+  do {
+    SUnit *SU = WorkList.pop_back_val();
+    SU->isHeightCurrent = false;
+    for (SUnit::const_pred_iterator I = SU->Preds.begin(),
+         E = SU->Preds.end(); I != E; ++I) {
+      SUnit *PredSU = I->getSUnit();
+      if (PredSU->isHeightCurrent)
+        WorkList.push_back(PredSU);
+    }
+  } while (!WorkList.empty());
+}
+
+/// setDepthToAtLeast - Update this node's successors to reflect the
+/// fact that this node's depth just increased.
+///
+void SUnit::setDepthToAtLeast(unsigned NewDepth) {
+  if (NewDepth <= getDepth())
+    return;
+  setDepthDirty();
+  Depth = NewDepth;
+  isDepthCurrent = true;
+}
+
+/// setHeightToAtLeast - Update this node's predecessors to reflect the
+/// fact that this node's height just increased.
+///
+void SUnit::setHeightToAtLeast(unsigned NewHeight) {
+  if (NewHeight <= getHeight())
+    return;
+  setHeightDirty();
+  Height = NewHeight;
+  isHeightCurrent = true;
+}
+
+/// ComputeDepth - Calculate the maximal path from the entry to the node.
+///
+void SUnit::ComputeDepth() {
+  SmallVector<SUnit*, 8> WorkList;
+  WorkList.push_back(this);
+  do {
+    SUnit *Cur = WorkList.back();
+
+    bool Done = true;
+    unsigned MaxPredDepth = 0;
+    for (SUnit::const_pred_iterator I = Cur->Preds.begin(),
+         E = Cur->Preds.end(); I != E; ++I) {
+      SUnit *PredSU = I->getSUnit();
+      if (PredSU->isDepthCurrent)
+        MaxPredDepth = std::max(MaxPredDepth,
+                                PredSU->Depth + I->getLatency());
+      else {
+        Done = false;
+        WorkList.push_back(PredSU);
+      }
+    }
+
+    if (Done) {
+      WorkList.pop_back();
+      if (MaxPredDepth != Cur->Depth) {
+        Cur->setDepthDirty();
+        Cur->Depth = MaxPredDepth;
+      }
+      Cur->isDepthCurrent = true;
+    }
+  } while (!WorkList.empty());
+}
+
+/// ComputeHeight - Calculate the maximal path from the node to the exit.
+///
+void SUnit::ComputeHeight() {
+  SmallVector<SUnit*, 8> WorkList;
+  WorkList.push_back(this);
+  do {
+    SUnit *Cur = WorkList.back();
+
+    bool Done = true;
+    unsigned MaxSuccHeight = 0;
+    for (SUnit::const_succ_iterator I = Cur->Succs.begin(),
+         E = Cur->Succs.end(); I != E; ++I) {
+      SUnit *SuccSU = I->getSUnit();
+      if (SuccSU->isHeightCurrent)
+        MaxSuccHeight = std::max(MaxSuccHeight,
+                                 SuccSU->Height + I->getLatency());
+      else {
+        Done = false;
+        WorkList.push_back(SuccSU);
+      }
+    }
+
+    if (Done) {
+      WorkList.pop_back();
+      if (MaxSuccHeight != Cur->Height) {
+        Cur->setHeightDirty();
+        Cur->Height = MaxSuccHeight;
+      }
+      Cur->isHeightCurrent = true;
+    }
+  } while (!WorkList.empty());
+}
+
+/// SUnit - Scheduling unit. It's a wrapper around either a single SDNode or
+/// a group of nodes flagged together.
+void SUnit::dump(const ScheduleDAG *G) const {
+  dbgs() << "SU(" << NodeNum << "): ";
+  G->dumpNode(this);
+}
+
+void SUnit::dumpAll(const ScheduleDAG *G) const {
+  dump(G);
+
+  dbgs() << "  # preds left       : " << NumPredsLeft << "\n";
+  dbgs() << "  # succs left       : " << NumSuccsLeft << "\n";
+  dbgs() << "  Latency            : " << Latency << "\n";
+  dbgs() << "  Depth              : " << Depth << "\n";
+  dbgs() << "  Height             : " << Height << "\n";
+
+  if (Preds.size() != 0) {
+    dbgs() << "  Predecessors:\n";
+    for (SUnit::const_pred_iterator I = Preds.begin(), E = Preds.end();
+         I != E; ++I) {
+      dbgs() << "   ";
+      switch (I->getKind()) {
+      case SDep::Data:        dbgs() << "val "; break;
+      case SDep::Anti:        dbgs() << "anti"; break;
+      case SDep::Output:      dbgs() << "out "; break;
+      case SDep::Order:       dbgs() << "ch  "; break;
+      }
+      dbgs() << "#";
+      dbgs() << I->getSUnit() << " - SU(" << I->getSUnit()->NodeNum << ")";
+      if (I->isArtificial())
+        dbgs() << " *";
+      dbgs() << ": Latency=" << I->getLatency();
+      dbgs() << "\n";
+    }
+  }
+  if (Succs.size() != 0) {
+    dbgs() << "  Successors:\n";
+    for (SUnit::const_succ_iterator I = Succs.begin(), E = Succs.end();
+         I != E; ++I) {
+      dbgs() << "   ";
+      switch (I->getKind()) {
+      case SDep::Data:        dbgs() << "val "; break;
+      case SDep::Anti:        dbgs() << "anti"; break;
+      case SDep::Output:      dbgs() << "out "; break;
+      case SDep::Order:       dbgs() << "ch  "; break;
+      }
+      dbgs() << "#";
+      dbgs() << I->getSUnit() << " - SU(" << I->getSUnit()->NodeNum << ")";
+      if (I->isArtificial())
+        dbgs() << " *";
+      dbgs() << ": Latency=" << I->getLatency();
+      dbgs() << "\n";
+    }
+  }
+  dbgs() << "\n";
+}
+
+#ifndef NDEBUG
+/// VerifySchedule - Verify that all SUnits were scheduled and that
+/// their state is consistent.
+///
+void ScheduleDAG::VerifySchedule(bool isBottomUp) {
+  bool AnyNotSched = false;
+  unsigned DeadNodes = 0;
+  unsigned Noops = 0;
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    if (!SUnits[i].isScheduled) {
+      if (SUnits[i].NumPreds == 0 && SUnits[i].NumSuccs == 0) {
+        ++DeadNodes;
+        continue;
+      }
+      if (!AnyNotSched)
+        dbgs() << "*** Scheduling failed! ***\n";
+      SUnits[i].dump(this);
+      dbgs() << "has not been scheduled!\n";
+      AnyNotSched = true;
+    }
+    if (SUnits[i].isScheduled &&
+        (isBottomUp ? SUnits[i].getHeight() : SUnits[i].getDepth()) >
+          unsigned(INT_MAX)) {
+      if (!AnyNotSched)
+        dbgs() << "*** Scheduling failed! ***\n";
+      SUnits[i].dump(this);
+      dbgs() << "has an unexpected "
+           << (isBottomUp ? "Height" : "Depth") << " value!\n";
+      AnyNotSched = true;
+    }
+    if (isBottomUp) {
+      if (SUnits[i].NumSuccsLeft != 0) {
+        if (!AnyNotSched)
+          dbgs() << "*** Scheduling failed! ***\n";
+        SUnits[i].dump(this);
+        dbgs() << "has successors left!\n";
+        AnyNotSched = true;
+      }
+    } else {
+      if (SUnits[i].NumPredsLeft != 0) {
+        if (!AnyNotSched)
+          dbgs() << "*** Scheduling failed! ***\n";
+        SUnits[i].dump(this);
+        dbgs() << "has predecessors left!\n";
+        AnyNotSched = true;
+      }
+    }
+  }
+  for (unsigned i = 0, e = Sequence.size(); i != e; ++i)
+    if (!Sequence[i])
+      ++Noops;
+  assert(!AnyNotSched);
+  assert(Sequence.size() + DeadNodes - Noops == SUnits.size() &&
+         "The number of nodes scheduled doesn't match the expected number!");
+}
+#endif
+
+/// InitDAGTopologicalSorting - create the initial topological
+/// ordering from the DAG to be scheduled.
+///
+/// The idea of the algorithm is taken from
+/// "Online algorithms for managing the topological order of
+/// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly
+/// This is the MNR algorithm, which was first introduced by
+/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in
+/// "Maintaining a topological order under edge insertions".
+///
+/// Short description of the algorithm:
+///
+/// Topological ordering, ord, of a DAG maps each node to a topological
+/// index so that for all edges X->Y it is the case that ord(X) < ord(Y).
+///
+/// This means that if there is a path from the node X to the node Z,
+/// then ord(X) < ord(Z).
+///
+/// This property can be used to check for reachability of nodes:
+/// if Z is reachable from X, then an insertion of the edge Z->X would
+/// create a cycle.
+///
+/// The algorithm first computes a topological ordering for the DAG by
+/// initializing the Index2Node and Node2Index arrays and then tries to keep
+/// the ordering up-to-date after edge insertions by reordering the DAG.
+///
+/// On insertion of the edge X->Y, the algorithm first marks by calling DFS
+/// the nodes reachable from Y, and then shifts them using Shift to lie
+/// immediately after X in Index2Node.
+void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
+  unsigned DAGSize = SUnits.size();
+  std::vector<SUnit*> WorkList;
+  WorkList.reserve(DAGSize);
+
+  Index2Node.resize(DAGSize);
+  Node2Index.resize(DAGSize);
+
+  // Initialize the data structures.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &SUnits[i];
+    int NodeNum = SU->NodeNum;
+    unsigned Degree = SU->Succs.size();
+    // Temporarily use the Node2Index array as scratch space for degree counts.
+    Node2Index[NodeNum] = Degree;
+
+    // Is it a node without dependencies?
+    if (Degree == 0) {
+      assert(SU->Succs.empty() && "SUnit should have no successors");
+      // Collect leaf nodes.
+      WorkList.push_back(SU);
+    }
+  }
+
+  int Id = DAGSize;
+  while (!WorkList.empty()) {
+    SUnit *SU = WorkList.back();
+    WorkList.pop_back();
+    Allocate(SU->NodeNum, --Id);
+    for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+         I != E; ++I) {
+      SUnit *PredSU = I->getSUnit();
+      if (!--Node2Index[PredSU->NodeNum])
+        // If all dependencies of the node are processed already,
+        // then the node can be computed now.
+        WorkList.push_back(PredSU);
+    }
+  }
+
+  Visited.resize(DAGSize);
+
+#ifndef NDEBUG
+  // Check correctness of the ordering
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &SUnits[i];
+    for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+         I != E; ++I) {
+      assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] &&
+      "Wrong topological sorting");
+    }
+  }
+#endif
+}
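+// Illustrative example: for a DAG with edges SU0->SU1 and SU0->SU2, the two
+// leaves SU1 and SU2 (no successors) receive the highest indices and SU0 the
+// lowest, e.g. Node2Index = {SU0:0, SU1:1, SU2:2}, which satisfies
+// ord(X) < ord(Y) for every edge X->Y.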
+
+/// AddPred - Updates the topological ordering to accommodate an edge
+/// to be added from SUnit X to SUnit Y.
+void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
+  int UpperBound, LowerBound;
+  LowerBound = Node2Index[Y->NodeNum];
+  UpperBound = Node2Index[X->NodeNum];
+  bool HasLoop = false;
+  // Is Ord(X) < Ord(Y) ?
+  if (LowerBound < UpperBound) {
+    // Update the topological order.
+    Visited.reset();
+    DFS(Y, UpperBound, HasLoop);
+    assert(!HasLoop && "Inserted edge creates a loop!");
+    // Recompute topological indexes.
+    Shift(Visited, LowerBound, UpperBound);
+  }
+}
+
+/// RemovePred - Updates the topological ordering to accommodate the
+/// removal of the edge from the specified node N to the current node M,
+/// i.e. N is removed from the predecessors of M.
+void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {
+  // InitDAGTopologicalSorting();
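+  // Note that removing an edge cannot invalidate an existing topological
+  // ordering, so no reordering is required here.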
+}
+
+/// DFS - Make a DFS traversal to mark all nodes reachable from SU and mark
+/// all nodes affected by the edge insertion. These nodes will later get new
+/// topological indexes by means of the Shift method.
+void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
+                                     bool& HasLoop) {
+  std::vector<const SUnit*> WorkList;
+  WorkList.reserve(SUnits.size());
+
+  WorkList.push_back(SU);
+  do {
+    SU = WorkList.back();
+    WorkList.pop_back();
+    Visited.set(SU->NodeNum);
+    for (int I = SU->Succs.size()-1; I >= 0; --I) {
+      int s = SU->Succs[I].getSUnit()->NodeNum;
+      if (Node2Index[s] == UpperBound) {
+        HasLoop = true;
+        return;
+      }
+      // Visit successors not already visited that lie in the affected region.
+      if (!Visited.test(s) && Node2Index[s] < UpperBound) {
+        WorkList.push_back(SU->Succs[I].getSUnit());
+      }
+    }
+  } while (!WorkList.empty());
+}
+
+/// Shift - Renumber the nodes so that the topological ordering is
+/// preserved.
+void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound,
+                                       int UpperBound) {
+  std::vector<int> L;
+  int shift = 0;
+  int i;
+
+  for (i = LowerBound; i <= UpperBound; ++i) {
+    // w is the node at topological index i.
+    int w = Index2Node[i];
+    if (Visited.test(w)) {
+      // Unmark.
+      Visited.reset(w);
+      L.push_back(w);
+      shift = shift + 1;
+    } else {
+      Allocate(w, i - shift);
+    }
+  }
+
+  for (unsigned j = 0; j < L.size(); ++j) {
+    Allocate(L[j], i - shift);
+    i = i + 1;
+  }
+}
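+// Illustrative example: if indices LowerBound..UpperBound hold [A, B, C, D]
+// and B and D were visited by the DFS, the unvisited nodes are compacted to
+// the low end and the visited ones reappended, yielding [A, C, B, D].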
+
+
+/// WillCreateCycle - Returns true if adding an edge from SU to TargetSU will
+/// create a cycle.
+bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *SU, SUnit *TargetSU) {
+  if (IsReachable(TargetSU, SU))
+    return true;
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I)
+    if (I->isAssignedRegDep() &&
+        IsReachable(TargetSU, I->getSUnit()))
+      return true;
+  return false;
+}
+
+/// IsReachable - Checks if SU is reachable from TargetSU.
+bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
+                                             const SUnit *TargetSU) {
+  // If insertion of the edge SU->TargetSU would create a cycle
+  // then there is a path from TargetSU to SU.
+  int UpperBound, LowerBound;
+  LowerBound = Node2Index[TargetSU->NodeNum];
+  UpperBound = Node2Index[SU->NodeNum];
+  bool HasLoop = false;
+  // Is Ord(TargetSU) < Ord(SU) ?
+  if (LowerBound < UpperBound) {
+    Visited.reset();
+    // There may be a path from TargetSU to SU. Check for it.
+    DFS(TargetSU, UpperBound, HasLoop);
+  }
+  return HasLoop;
+}
+
+/// Allocate - assign the topological index to the node n.
+void ScheduleDAGTopologicalSort::Allocate(int n, int index) {
+  Node2Index[n] = index;
+  Index2Node[index] = n;
+}
+
+ScheduleDAGTopologicalSort::
+ScheduleDAGTopologicalSort(std::vector<SUnit> &sunits) : SUnits(sunits) {}
+
+ScheduleHazardRecognizer::~ScheduleHazardRecognizer() {}
diff --git a/src/LLVM/lib/CodeGen/ScheduleDAGEmit.cpp b/src/LLVM/lib/CodeGen/ScheduleDAGEmit.cpp
new file mode 100644
index 0000000..0a2fb37
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/ScheduleDAGEmit.cpp
@@ -0,0 +1,67 @@
+//===---- ScheduleDAGEmit.cpp - Emit routines for the ScheduleDAG class ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the Emit routines for the ScheduleDAG class, which creates
+// MachineInstrs according to the computed schedule.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+void ScheduleDAG::EmitNoop() {
+  TII->insertNoop(*BB, InsertPos);
+}
+
+void ScheduleDAG::EmitPhysRegCopy(SUnit *SU,
+                                  DenseMap<SUnit*, unsigned> &VRBaseMap) {
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;  // ignore chain preds
+    if (I->getSUnit()->CopyDstRC) {
+      // Copy to physical register.
+      DenseMap<SUnit*, unsigned>::iterator VRI = VRBaseMap.find(I->getSUnit());
+      assert(VRI != VRBaseMap.end() && "Node emitted out of order - late");
+      // Find the destination physical register.
+      unsigned Reg = 0;
+      for (SUnit::const_succ_iterator II = SU->Succs.begin(),
+             EE = SU->Succs.end(); II != EE; ++II) {
+        if (II->getReg()) {
+          Reg = II->getReg();
+          break;
+        }
+      }
+      BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), Reg)
+        .addReg(VRI->second);
+    } else {
+      // Copy from physical register.
+      assert(I->getReg() && "Unknown physical register!");
+      unsigned VRBase = MRI.createVirtualRegister(SU->CopyDstRC);
+      bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second;
+      isNew = isNew; // Silence compiler warning.
+      assert(isNew && "Node emitted out of order - early");
+      BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), VRBase)
+        .addReg(I->getReg());
+    }
+    break;
+  }
+}
diff --git a/src/LLVM/lib/CodeGen/ScheduleDAGInstrs.cpp b/src/LLVM/lib/CodeGen/ScheduleDAGInstrs.cpp
new file mode 100644
index 0000000..ea93dd5
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -0,0 +1,620 @@
+//===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAGInstrs class, which implements re-scheduling
+// of MachineInstrs.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sched-instrs"
+#include "ScheduleDAGInstrs.h"
+#include "llvm/Operator.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtarget.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallSet.h"
+using namespace llvm;
+
+ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
+                                     const MachineLoopInfo &mli,
+                                     const MachineDominatorTree &mdt)
+  : ScheduleDAG(mf), MLI(mli), MDT(mdt), Defs(TRI->getNumRegs()),
+    Uses(TRI->getNumRegs()), LoopRegs(MLI, MDT) {
+  MFI = mf.getFrameInfo();
+  DbgValueVec.clear();
+}
+
+/// Run - perform scheduling.
+///
+void ScheduleDAGInstrs::Run(MachineBasicBlock *bb,
+                            MachineBasicBlock::iterator begin,
+                            MachineBasicBlock::iterator end,
+                            unsigned endcount) {
+  BB = bb;
+  Begin = begin;
+  InsertPosIndex = endcount;
+
+  ScheduleDAG::Run(bb, end);
+}
+
+/// getUnderlyingObjectFromInt - This is the function that does the work of
+/// looking through basic ptrtoint+arithmetic+inttoptr sequences.
+static const Value *getUnderlyingObjectFromInt(const Value *V) {
+  do {
+    if (const Operator *U = dyn_cast<Operator>(V)) {
+      // If we find a ptrtoint, we can transfer control back to the
+      // regular getUnderlyingObjectFromInt.
+      if (U->getOpcode() == Instruction::PtrToInt)
+        return U->getOperand(0);
+      // If we find an add of a constant or a multiplied value, it's
+      // likely that the other operand will lead us to the base
+      // object. We don't have to worry about the case where the
+      // object address is somehow being computed by the multiply,
+      // because our callers only care when the result is an
+      // identifiable object.
+      if (U->getOpcode() != Instruction::Add ||
+          (!isa<ConstantInt>(U->getOperand(1)) &&
+           Operator::getOpcode(U->getOperand(1)) != Instruction::Mul))
+        return V;
+      V = U->getOperand(0);
+    } else {
+      return V;
+    }
+    assert(V->getType()->isIntegerTy() && "Unexpected operand type!");
+  } while (1);
+}
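+// An IR sequence this looks through (illustrative sketch):
+//   %pi = ptrtoint i8* %p to i64
+//   %ai = add i64 %pi, 16
+//   %q  = inttoptr i64 %ai to i8*
+// Starting from %ai, the constant operand of the add is skipped and the
+// ptrtoint hands the search back to %p.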
+
+/// getUnderlyingObject - This is a wrapper around Value::getUnderlyingObject
+/// and adds support for basic ptrtoint+arithmetic+inttoptr sequences.
+static const Value *getUnderlyingObject(const Value *V) {
+  // First just call Value::getUnderlyingObject to let it do what it does.
+  do {
+    V = V->getUnderlyingObject();
+    // If it found an inttoptr, use special code to continue climbing.
+    if (Operator::getOpcode(V) != Instruction::IntToPtr)
+      break;
+    const Value *O = getUnderlyingObjectFromInt(cast<User>(V)->getOperand(0));
+    // If that succeeded in finding a pointer, continue the search.
+    if (!O->getType()->isPointerTy())
+      break;
+    V = O;
+  } while (1);
+  return V;
+}
+
+/// getUnderlyingObjectForInstr - If this machine instr has memory reference
+/// information and it can be tracked to a normal reference to a known
+/// object, return the Value for that object. Otherwise return null.
+static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI,
+                                                const MachineFrameInfo *MFI,
+                                                bool &MayAlias) {
+  MayAlias = true;
+  if (!MI->hasOneMemOperand() ||
+      !(*MI->memoperands_begin())->getValue() ||
+      (*MI->memoperands_begin())->isVolatile())
+    return 0;
+
+  const Value *V = (*MI->memoperands_begin())->getValue();
+  if (!V)
+    return 0;
+
+  V = getUnderlyingObject(V);
+  if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V)) {
+    // For now, ignore PseudoSourceValues which may alias LLVM IR values
+    // because the code that uses this function has no way to cope with
+    // such aliases.
+    if (PSV->isAliased(MFI))
+      return 0;
+    
+    MayAlias = PSV->mayAlias(MFI);
+    return V;
+  }
+
+  if (isIdentifiedObject(V))
+    return V;
+
+  return 0;
+}
+
+void ScheduleDAGInstrs::StartBlock(MachineBasicBlock *BB) {
+  if (MachineLoop *ML = MLI.getLoopFor(BB))
+    if (BB == ML->getLoopLatch()) {
+      MachineBasicBlock *Header = ML->getHeader();
+      for (MachineBasicBlock::livein_iterator I = Header->livein_begin(),
+           E = Header->livein_end(); I != E; ++I)
+        LoopLiveInRegs.insert(*I);
+      LoopRegs.VisitLoop(ML);
+    }
+}
+
+void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
+  // We'll be allocating one SUnit for each instruction, plus one for
+  // the region exit node.
+  SUnits.reserve(BB->size());
+
+  // We build scheduling units by walking a block's instruction list from bottom
+  // to top.
+
+  // Remember where a generic side-effecting instruction is as we proceed.
+  SUnit *BarrierChain = 0, *AliasChain = 0;
+
+  // Memory references to specific known memory locations are tracked
+  // so that they can be given more precise dependencies. We track
+  // separately the known memory locations that may alias and those
+  // that are known not to alias.
+  std::map<const Value *, SUnit *> AliasMemDefs, NonAliasMemDefs;
+  std::map<const Value *, std::vector<SUnit *> > AliasMemUses, NonAliasMemUses;
+
+  // Keep track of dangling debug references to registers.
+  std::vector<std::pair<MachineInstr*, unsigned> >
+    DanglingDebugValue(TRI->getNumRegs(),
+    std::make_pair(static_cast<MachineInstr*>(0), 0));
+
+  // Check to see if the scheduler cares about latencies.
+  bool UnitLatencies = ForceUnitLatencies();
+
+  // Ask the target if address-backscheduling is desirable, and if so how much.
+  const TargetSubtarget &ST = TM.getSubtarget<TargetSubtarget>();
+  unsigned SpecialAddressLatency = ST.getSpecialAddressLatency();
+
+  // Remove any stale debug info; sometimes BuildSchedGraph is called again
+  // without emitting the info from the previous call.
+  DbgValueVec.clear();
+
+  // Walk the list of instructions, from bottom moving up.
+  for (MachineBasicBlock::iterator MII = InsertPos, MIE = Begin;
+       MII != MIE; --MII) {
+    MachineInstr *MI = prior(MII);
+    // DBG_VALUE instructions do not get SUnits built; just remember them for
+    // later reinsertion.
+    if (MI->isDebugValue()) {
+      if (MI->getNumOperands()==3 && MI->getOperand(0).isReg() &&
+          MI->getOperand(0).getReg())
+        DanglingDebugValue[MI->getOperand(0).getReg()] =
+             std::make_pair(MI, DbgValueVec.size());
+      DbgValueVec.push_back(MI);
+      continue;
+    }
+    const TargetInstrDesc &TID = MI->getDesc();
+    assert(!TID.isTerminator() && !MI->isLabel() &&
+           "Cannot schedule terminators or labels!");
+    // Create the SUnit for this MI.
+    SUnit *SU = NewSUnit(MI);
+
+    // Assign the Latency field of SU using target-provided information.
+    if (UnitLatencies)
+      SU->Latency = 1;
+    else
+      ComputeLatency(SU);
+
+    // Add register-based dependencies (data, anti, and output).
+    for (unsigned j = 0, n = MI->getNumOperands(); j != n; ++j) {
+      const MachineOperand &MO = MI->getOperand(j);
+      if (!MO.isReg()) continue;
+      unsigned Reg = MO.getReg();
+      if (Reg == 0) continue;
+
+      assert(TRI->isPhysicalRegister(Reg) && "Virtual register encountered!");
+
+      if (MO.isDef() && DanglingDebugValue[Reg].first!=0) {
+        SU->DbgInstrList.push_back(DanglingDebugValue[Reg].first);
+        DbgValueVec[DanglingDebugValue[Reg].second] = 0;
+        DanglingDebugValue[Reg] = std::make_pair((MachineInstr*)0, 0);
+      }
+
+      std::vector<SUnit *> &UseList = Uses[Reg];
+      std::vector<SUnit *> &DefList = Defs[Reg];
+      // Optionally add output and anti dependencies. For anti
+      // dependencies we use a latency of 0 because for a multi-issue
+      // target we want to allow the defining instruction to issue
+      // in the same cycle as the using instruction.
+      // TODO: Using a latency of 1 here for output dependencies assumes
+      //       there's no cost for reusing registers.
+      SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
+      unsigned AOLatency = (Kind == SDep::Anti) ? 0 : 1;
+      for (unsigned i = 0, e = DefList.size(); i != e; ++i) {
+        SUnit *DefSU = DefList[i];
+        if (DefSU != SU &&
+            (Kind != SDep::Output || !MO.isDead() ||
+             !DefSU->getInstr()->registerDefIsDead(Reg)))
+          DefSU->addPred(SDep(SU, Kind, AOLatency, /*Reg=*/Reg));
+      }
+      for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+        std::vector<SUnit *> &DefList = Defs[*Alias];
+        for (unsigned i = 0, e = DefList.size(); i != e; ++i) {
+          SUnit *DefSU = DefList[i];
+          if (DefSU != SU &&
+              (Kind != SDep::Output || !MO.isDead() ||
+               !DefSU->getInstr()->registerDefIsDead(*Alias)))
+            DefSU->addPred(SDep(SU, Kind, AOLatency, /*Reg=*/ *Alias));
+        }
+      }
+
+      if (MO.isDef()) {
+        // Add any data dependencies.
+        unsigned DataLatency = SU->Latency;
+        for (unsigned i = 0, e = UseList.size(); i != e; ++i) {
+          SUnit *UseSU = UseList[i];
+          if (UseSU == SU)
+            continue;
+          unsigned LDataLatency = DataLatency;
+          // Optionally add in a special extra latency for nodes that
+          // feed addresses.
+          // TODO: Do this for register aliases too.
+          // TODO: Perhaps we should get rid of
+          // SpecialAddressLatency and just move this into
+          // adjustSchedDependency for the targets that care about it.
+          if (SpecialAddressLatency != 0 && !UnitLatencies) {
+            MachineInstr *UseMI = UseSU->getInstr();
+            const TargetInstrDesc &UseTID = UseMI->getDesc();
+            int RegUseIndex = UseMI->findRegisterUseOperandIdx(Reg);
+            assert(RegUseIndex >= 0 && "UseMI doesn's use register!");
+            if ((UseTID.mayLoad() || UseTID.mayStore()) &&
+                (unsigned)RegUseIndex < UseTID.getNumOperands() &&
+                UseTID.OpInfo[RegUseIndex].isLookupPtrRegClass())
+              LDataLatency += SpecialAddressLatency;
+          }
+          // Adjust the dependence latency using operand def/use
+          // information (if any), and then allow the target to
+          // perform its own adjustments.
+          const SDep& dep = SDep(SU, SDep::Data, LDataLatency, Reg);
+          if (!UnitLatencies) {
+            ComputeOperandLatency(SU, UseSU, const_cast<SDep &>(dep));
+            ST.adjustSchedDependency(SU, UseSU, const_cast<SDep &>(dep));
+          }
+          UseSU->addPred(dep);
+        }
+        for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+          std::vector<SUnit *> &UseList = Uses[*Alias];
+          for (unsigned i = 0, e = UseList.size(); i != e; ++i) {
+            SUnit *UseSU = UseList[i];
+            if (UseSU == SU)
+              continue;
+            const SDep& dep = SDep(SU, SDep::Data, DataLatency, *Alias);
+            if (!UnitLatencies) {
+              ComputeOperandLatency(SU, UseSU, const_cast<SDep &>(dep));
+              ST.adjustSchedDependency(SU, UseSU, const_cast<SDep &>(dep));
+            }
+            UseSU->addPred(dep);
+          }
+        }
+
+        // If a def is going to wrap back around to the top of the loop,
+        // backschedule it.
+        if (!UnitLatencies && DefList.empty()) {
+          LoopDependencies::LoopDeps::iterator I = LoopRegs.Deps.find(Reg);
+          if (I != LoopRegs.Deps.end()) {
+            const MachineOperand *UseMO = I->second.first;
+            unsigned Count = I->second.second;
+            const MachineInstr *UseMI = UseMO->getParent();
+            unsigned UseMOIdx = UseMO - &UseMI->getOperand(0);
+            const TargetInstrDesc &UseTID = UseMI->getDesc();
+            // TODO: If we knew the total depth of the region here, we could
+            // handle the case where the whole loop is inside the region but
+            // is large enough that the isScheduleHigh trick isn't needed.
+            if (UseMOIdx < UseTID.getNumOperands()) {
+              // Currently, we only support scheduling regions consisting of
+              // single basic blocks. Check to see if the instruction is in
+              // the same region by checking to see if it has the same parent.
+              if (UseMI->getParent() != MI->getParent()) {
+                unsigned Latency = SU->Latency;
+                if (UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass())
+                  Latency += SpecialAddressLatency;
+                // This is a wild guess as to the portion of the latency which
+                // will be overlapped by work done outside the current
+                // scheduling region.
+                Latency -= std::min(Latency, Count);
+                // Add the artificial edge.
+                ExitSU.addPred(SDep(SU, SDep::Order, Latency,
+                                    /*Reg=*/0, /*isNormalMemory=*/false,
+                                    /*isMustAlias=*/false,
+                                    /*isArtificial=*/true));
+              } else if (SpecialAddressLatency > 0 &&
+                         UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass()) {
+                // The entire loop body is within the current scheduling region
+                // and the latency of this operation is assumed to be greater
+                // than the latency of the loop.
+                // TODO: Recursively mark data-edge predecessors as
+                //       isScheduleHigh too.
+                SU->isScheduleHigh = true;
+              }
+            }
+            LoopRegs.Deps.erase(I);
+          }
+        }
+
+        UseList.clear();
+        if (!MO.isDead())
+          DefList.clear();
+        DefList.push_back(SU);
+      } else {
+        UseList.push_back(SU);
+      }
+    }
+
+    // Add chain dependencies.
+    // Chain dependencies used to enforce memory order should have
+    // latency of 0 (except for true dependency of Store followed by
+    // aliased Load... we estimate that with a single cycle of latency
+    // assuming the hardware will bypass)
+    // Note that isStoreToStackSlot and isLoadFromStackSlot are not usable
+    // after stack slots are lowered to actual addresses.
+    // TODO: Use an AliasAnalysis and do real alias-analysis queries, and
+    // produce more precise dependence information.
+#define STORE_LOAD_LATENCY 1
+    unsigned TrueMemOrderLatency = 0;
+    if (TID.isCall() || TID.hasUnmodeledSideEffects() ||
+        (MI->hasVolatileMemoryRef() && 
+         (!TID.mayLoad() || !MI->isInvariantLoad(AA)))) {
+      // Be conservative with these and add dependencies on all memory
+      // references, even those that are known to not alias.
+      for (std::map<const Value *, SUnit *>::iterator I = 
+             NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) {
+        I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+      }
+      for (std::map<const Value *, std::vector<SUnit *> >::iterator I =
+             NonAliasMemUses.begin(), E = NonAliasMemUses.end(); I != E; ++I) {
+        for (unsigned i = 0, e = I->second.size(); i != e; ++i)
+          I->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+      }
+      NonAliasMemDefs.clear();
+      NonAliasMemUses.clear();
+      // Add SU to the barrier chain.
+      if (BarrierChain)
+        BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+      BarrierChain = SU;
+
+      // fall-through
+    new_alias_chain:
+      // Chain all possibly aliasing memory references though SU.
+      if (AliasChain)
+        AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+      AliasChain = SU;
+      for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
+        PendingLoads[k]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+      for (std::map<const Value *, SUnit *>::iterator I = AliasMemDefs.begin(),
+           E = AliasMemDefs.end(); I != E; ++I) {
+        I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+      }
+      for (std::map<const Value *, std::vector<SUnit *> >::iterator I =
+           AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) {
+        for (unsigned i = 0, e = I->second.size(); i != e; ++i)
+          I->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+      }
+      PendingLoads.clear();
+      AliasMemDefs.clear();
+      AliasMemUses.clear();
+    } else if (TID.mayStore()) {
+      bool MayAlias = true;
+      TrueMemOrderLatency = STORE_LOAD_LATENCY;
+      if (const Value *V = getUnderlyingObjectForInstr(MI, MFI, MayAlias)) {
+        // A store to a specific PseudoSourceValue. Add precise dependencies.
+        // Record the def in MemDefs, first adding a dep if there is
+        // an existing def.
+        std::map<const Value *, SUnit *>::iterator I = 
+          ((MayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
+        std::map<const Value *, SUnit *>::iterator IE = 
+          ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
+        if (I != IE) {
+          I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
+                                  /*isNormalMemory=*/true));
+          I->second = SU;
+        } else {
+          if (MayAlias)
+            AliasMemDefs[V] = SU;
+          else
+            NonAliasMemDefs[V] = SU;
+        }
+        // Handle the uses in MemUses, if there are any.
+        std::map<const Value *, std::vector<SUnit *> >::iterator J =
+          ((MayAlias) ? AliasMemUses.find(V) : NonAliasMemUses.find(V));
+        std::map<const Value *, std::vector<SUnit *> >::iterator JE =
+          ((MayAlias) ? AliasMemUses.end() : NonAliasMemUses.end());
+        if (J != JE) {
+          for (unsigned i = 0, e = J->second.size(); i != e; ++i)
+            J->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency,
+                                       /*Reg=*/0, /*isNormalMemory=*/true));
+          J->second.clear();
+        }
+        if (MayAlias) {
+          // Add dependencies from all the PendingLoads, i.e. loads
+          // with no underlying object.
+          for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
+            PendingLoads[k]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+          // Add dependence on alias chain, if needed.
+          if (AliasChain)
+            AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+        }
+        // Add dependence on barrier chain, if needed.
+        if (BarrierChain)
+          BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+      } else {
+        // Treat all other stores conservatively.
+        goto new_alias_chain;
+      }
+    } else if (TID.mayLoad()) {
+      bool MayAlias = true;
+      TrueMemOrderLatency = 0;
+      if (MI->isInvariantLoad(AA)) {
+        // Invariant load, no chain dependencies needed!
+      } else {
+        if (const Value *V = 
+            getUnderlyingObjectForInstr(MI, MFI, MayAlias)) {
+          // A load from a specific PseudoSourceValue. Add precise dependencies.
+          std::map<const Value *, SUnit *>::iterator I = 
+            ((MayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
+          std::map<const Value *, SUnit *>::iterator IE = 
+            ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
+          if (I != IE)
+            I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
+                                    /*isNormalMemory=*/true));
+          if (MayAlias)
+            AliasMemUses[V].push_back(SU);
+          else 
+            NonAliasMemUses[V].push_back(SU);
+        } else {
+          // A load with no underlying object. Depend on all
+          // potentially aliasing stores.
+          for (std::map<const Value *, SUnit *>::iterator I = 
+                 AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
+            I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+          
+          PendingLoads.push_back(SU);
+          MayAlias = true;
+        }
+        
+        // Add dependencies on alias and barrier chains, if needed.
+        if (MayAlias && AliasChain)
+          AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+        if (BarrierChain)
+          BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+      } 
+    }
+  }
+
+  for (int i = 0, e = TRI->getNumRegs(); i != e; ++i) {
+    Defs[i].clear();
+    Uses[i].clear();
+  }
+  PendingLoads.clear();
+}
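+
+// A rough illustration of the chaining scheme above (the walk over the
+// block is bottom-up). For a region like
+//   store [A]   ; store to a known object A
+//   call foo    ; barrier: calls have unmodeled side effects here
+//   load [A]
+// the load is recorded first as a use of A; the call then clears the maps,
+// becomes both BarrierChain and AliasChain, and gains the load as a
+// successor; finally the store adds an edge to the AliasChain. The net
+// order edges store -> call -> load keep the memory operations in place.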
+
+void ScheduleDAGInstrs::FinishBlock() {
+  // Nothing to do.
+}
+
+void ScheduleDAGInstrs::ComputeLatency(SUnit *SU) {
+  const InstrItineraryData &InstrItins = TM.getInstrItineraryData();
+
+  // Compute the latency for the node.
+  SU->Latency =
+    InstrItins.getStageLatency(SU->getInstr()->getDesc().getSchedClass());
+
+  // Simplistic target-independent heuristic: assume that loads take
+  // extra time.
+  if (InstrItins.isEmpty())
+    if (SU->getInstr()->getDesc().mayLoad())
+      SU->Latency += 2;
+}
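+
+// Example of the heuristic above: with an empty itinerary, every node keeps
+// whatever getStageLatency reports for its scheduling class, and loads get
+// two extra cycles as a rough stand-in for memory access time.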
+
+void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use, 
+                                              SDep& dep) const {
+  const InstrItineraryData &InstrItins = TM.getInstrItineraryData();
+  if (InstrItins.isEmpty())
+    return;
+  
+  // For a data dependency with a known register...
+  if ((dep.getKind() != SDep::Data) || (dep.getReg() == 0))
+    return;
+
+  const unsigned Reg = dep.getReg();
+
+  // ... find the definition of the register in the defining
+  // instruction
+  MachineInstr *DefMI = Def->getInstr();
+  int DefIdx = DefMI->findRegisterDefOperandIdx(Reg);
+  if (DefIdx != -1) {
+    int DefCycle = InstrItins.getOperandCycle(DefMI->getDesc().getSchedClass(),
+                                              DefIdx);
+    if (DefCycle >= 0) {
+      MachineInstr *UseMI = Use->getInstr();
+      const unsigned UseClass = UseMI->getDesc().getSchedClass();
+
+      // For all uses of the register, calculate the maximum latency
+      int Latency = -1;
+      for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) {
+        const MachineOperand &MO = UseMI->getOperand(i);
+        if (!MO.isReg() || !MO.isUse())
+          continue;
+        unsigned MOReg = MO.getReg();
+        if (MOReg != Reg)
+          continue;
+
+        int UseCycle = InstrItins.getOperandCycle(UseClass, i);
+        if (UseCycle >= 0)
+          Latency = std::max(Latency, DefCycle - UseCycle + 1);
+      }
+
+      // If we found a latency, then replace the existing dependence latency.
+      if (Latency >= 0)
+        dep.setLatency(Latency);
+    }
+  }
+}
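+
+// Worked example for the itinerary math above: if the defining instruction
+// writes Reg at cycle 3 (DefCycle == 3) and the use reads it at cycle 1
+// (UseCycle == 1), the edge latency becomes DefCycle - UseCycle + 1 == 3,
+// taking the maximum over all operands of the use that read Reg.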
+
+void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
+  SU->getInstr()->dump();
+}
+
+std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
+  std::string s;
+  raw_string_ostream oss(s);
+  if (SU == &EntrySU)
+    oss << "<entry>";
+  else if (SU == &ExitSU)
+    oss << "<exit>";
+  else
+    SU->getInstr()->print(oss);
+  return oss.str();
+}
+
+// EmitSchedule - Emit the machine code in scheduled order.
+MachineBasicBlock *ScheduleDAGInstrs::EmitSchedule() {
+  // For MachineInstr-based scheduling, we're rescheduling the instructions in
+  // the block, so start by removing them from the block.
+  while (Begin != InsertPos) {
+    MachineBasicBlock::iterator I = Begin;
+    ++Begin;
+    BB->remove(I);
+  }
+
+  // First reinsert any remaining debug_values; these are either constants,
+  // or refer to live-in registers.  The beginning of the block is the right
+  // place for the latter.  The former might reasonably be placed elsewhere
+  // using some kind of ordering algorithm, but right now it doesn't matter.
+  for (int i = DbgValueVec.size()-1; i>=0; --i)
+    if (DbgValueVec[i])
+      BB->insert(InsertPos, DbgValueVec[i]);
+
+  // Then re-insert them according to the given schedule.
+  for (unsigned i = 0, e = Sequence.size(); i != e; ++i) {
+    SUnit *SU = Sequence[i];
+    if (!SU) {
+      // Null SUnit* is a noop.
+      EmitNoop();
+      continue;
+    }
+
+    BB->insert(InsertPos, SU->getInstr());
+    for (unsigned j = 0, je = SU->DbgInstrList.size(); j != je; ++j)
+      BB->insert(InsertPos, SU->DbgInstrList[j]);
+  }
+
+  // Update the Begin iterator, as the first instruction in the block
+  // may have been scheduled later.
+  if (!DbgValueVec.empty()) {
+    for (int i = DbgValueVec.size()-1; i >= 0; --i)
+      if (DbgValueVec[i] != 0) {
+        // Begin must point at the first reinserted DBG_VALUE, which is the
+        // highest-index non-null entry, not necessarily the last slot.
+        Begin = DbgValueVec[i];
+        break;
+      }
+  } else if (!Sequence.empty())
+    Begin = Sequence[0]->getInstr();
+
+  DbgValueVec.clear();
+  return BB;
+}
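+
+// Note on the emission loop above: a null SUnit* in Sequence denotes a
+// scheduler-inserted noop, so a schedule of {SU0, NULL, SU1} re-emits SU0's
+// instruction (plus its DbgInstrList entries), calls EmitNoop(), and then
+// emits SU1's instruction.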
diff --git a/src/LLVM/lib/CodeGen/ScheduleDAGInstrs.h b/src/LLVM/lib/CodeGen/ScheduleDAGInstrs.h
new file mode 100644
index 0000000..c8f543f
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/ScheduleDAGInstrs.h
@@ -0,0 +1,197 @@
+//==- ScheduleDAGInstrs.h - MachineInstr Scheduling --------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ScheduleDAGInstrs class, which implements
+// scheduling for a MachineInstr-based dependency graph.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCHEDULEDAGINSTRS_H
+#define SCHEDULEDAGINSTRS_H
+
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include <map>
+
+namespace llvm {
+  class MachineLoopInfo;
+  class MachineDominatorTree;
+
+  /// LoopDependencies - This class analyzes loop-oriented register
+  /// dependencies, which are used to guide scheduling decisions.
+  /// For example, loop induction variable increments should be
+  /// scheduled as soon as possible after the variable's last use.
+  ///
+  class LLVM_LIBRARY_VISIBILITY LoopDependencies {
+    const MachineLoopInfo &MLI;
+    const MachineDominatorTree &MDT;
+
+  public:
+    typedef std::map<unsigned, std::pair<const MachineOperand *, unsigned> >
+      LoopDeps;
+    LoopDeps Deps;
+
+    LoopDependencies(const MachineLoopInfo &mli,
+                     const MachineDominatorTree &mdt) :
+      MLI(mli), MDT(mdt) {}
+
+    /// VisitLoop - Clear out any previous state and analyze the given loop.
+    ///
+    void VisitLoop(const MachineLoop *Loop) {
+      Deps.clear();
+      MachineBasicBlock *Header = Loop->getHeader();
+      SmallSet<unsigned, 8> LoopLiveIns;
+      for (MachineBasicBlock::livein_iterator LI = Header->livein_begin(),
+           LE = Header->livein_end(); LI != LE; ++LI)
+        LoopLiveIns.insert(*LI);
+
+      const MachineDomTreeNode *Node = MDT.getNode(Header);
+      const MachineBasicBlock *MBB = Node->getBlock();
+      assert(Loop->contains(MBB) &&
+             "Loop does not contain header!");
+      VisitRegion(Node, MBB, Loop, LoopLiveIns);
+    }
+
+  private:
+    void VisitRegion(const MachineDomTreeNode *Node,
+                     const MachineBasicBlock *MBB,
+                     const MachineLoop *Loop,
+                     const SmallSet<unsigned, 8> &LoopLiveIns) {
+      unsigned Count = 0;
+      for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+           I != E; ++I) {
+        const MachineInstr *MI = I;
+        if (MI->isDebugValue())
+          continue;
+        for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+          const MachineOperand &MO = MI->getOperand(i);
+          if (!MO.isReg() || !MO.isUse())
+            continue;
+          unsigned MOReg = MO.getReg();
+          if (LoopLiveIns.count(MOReg))
+            Deps.insert(std::make_pair(MOReg, std::make_pair(&MO, Count)));
+        }
+        ++Count; // Not every iteration due to dbg_value above.
+      }
+
+      const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
+      for (std::vector<MachineDomTreeNode*>::const_iterator I =
+           Children.begin(), E = Children.end(); I != E; ++I) {
+        const MachineDomTreeNode *ChildNode = *I;
+        MachineBasicBlock *ChildBlock = ChildNode->getBlock();
+        if (Loop->contains(ChildBlock))
+          VisitRegion(ChildNode, ChildBlock, Loop, LoopLiveIns);
+      }
+    }
+  };
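+
+  // Example of the Deps map built above: for a loop whose header has a
+  // live-in register R that is read by the instruction at body index 2,
+  // Deps maps R to {&thatOperand, 2} (thatOperand being the reading
+  // MachineOperand). BuildSchedGraph in ScheduleDAGInstrs.cpp later consults
+  // and erases these entries to bias loop-carried values toward the bottom
+  // of the region.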
+
+  /// ScheduleDAGInstrs - A ScheduleDAG subclass for scheduling lists of
+  /// MachineInstrs.
+  class LLVM_LIBRARY_VISIBILITY ScheduleDAGInstrs : public ScheduleDAG {
+    const MachineLoopInfo &MLI;
+    const MachineDominatorTree &MDT;
+    const MachineFrameInfo *MFI;
+
+    /// Defs, Uses - Remember where defs and uses of each physical register
+    /// are as we iterate upward through the instructions. This is allocated
+    /// here instead of inside BuildSchedGraph to avoid the need for it to be
+    /// initialized and destructed for each block.
+    std::vector<std::vector<SUnit *> > Defs;
+    std::vector<std::vector<SUnit *> > Uses;
+ 
+    /// DbgValueVec - Remember DBG_VALUEs that refer to a particular
+    /// register.
+    std::vector<MachineInstr *> DbgValueVec;
+
+    /// PendingLoads - Remember where unknown loads are after the most recent
+    /// unknown store, as we iterate. As with Defs and Uses, this is here
+    /// to minimize construction/destruction.
+    std::vector<SUnit *> PendingLoads;
+
+    /// LoopRegs - Track which registers are used for loop-carried dependencies.
+    ///
+    LoopDependencies LoopRegs;
+
+    /// LoopLiveInRegs - Track which regs are live into a loop, to help guide
+    /// back-edge-aware scheduling.
+    ///
+    SmallSet<unsigned, 8> LoopLiveInRegs;
+
+  public:
+    MachineBasicBlock::iterator Begin;    // The beginning of the range to
+                                          // be scheduled. The range extends
+                                          // to InsertPos.
+    unsigned InsertPosIndex;              // The index in BB of InsertPos.
+
+    explicit ScheduleDAGInstrs(MachineFunction &mf,
+                               const MachineLoopInfo &mli,
+                               const MachineDominatorTree &mdt);
+
+    virtual ~ScheduleDAGInstrs() {}
+
+    /// NewSUnit - Creates a new SUnit and returns a pointer to it.
+    ///
+    SUnit *NewSUnit(MachineInstr *MI) {
+#ifndef NDEBUG
+      const SUnit *Addr = SUnits.empty() ? 0 : &SUnits[0];
+#endif
+      SUnits.push_back(SUnit(MI, (unsigned)SUnits.size()));
+      assert((Addr == 0 || Addr == &SUnits[0]) &&
+             "SUnits std::vector reallocated on the fly!");
+      SUnits.back().OrigNode = &SUnits.back();
+      return &SUnits.back();
+    }
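+
+    // The NDEBUG check above guards a subtle invariant: callers hold raw
+    // SUnit* pointers into SUnits while the graph is built, e.g.
+    //   SUnit *A = NewSUnit(MI0);
+    //   SUnit *B = NewSUnit(MI1);  // A must still be valid here
+    // so the assert fires if push_back ever reallocates the vector storage.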
+
+    /// Run - perform scheduling.
+    ///
+    void Run(MachineBasicBlock *bb,
+             MachineBasicBlock::iterator begin,
+             MachineBasicBlock::iterator end,
+             unsigned endindex);
+
+    /// BuildSchedGraph - Build SUnits from the MachineBasicBlock that we are
+    /// scheduling.
+    virtual void BuildSchedGraph(AliasAnalysis *AA);
+
+    /// ComputeLatency - Compute node latency.
+    ///
+    virtual void ComputeLatency(SUnit *SU);
+
+    /// ComputeOperandLatency - Override dependence edge latency using
+    /// operand use/def information
+    ///
+    virtual void ComputeOperandLatency(SUnit *Def, SUnit *Use,
+                                       SDep& dep) const;
+
+    virtual MachineBasicBlock *EmitSchedule();
+
+    /// StartBlock - Prepare to perform scheduling in the given block.
+    ///
+    virtual void StartBlock(MachineBasicBlock *BB);
+
+    /// Schedule - Order nodes according to selected style, filling
+    /// in the Sequence member.
+    ///
+    virtual void Schedule() = 0;
+
+    /// FinishBlock - Clean up after scheduling in the given block.
+    ///
+    virtual void FinishBlock();
+
+    virtual void dumpNode(const SUnit *SU) const;
+
+    virtual std::string getGraphNodeLabel(const SUnit *SU) const;
+  };
+}
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/ScheduleDAGPrinter.cpp b/src/LLVM/lib/CodeGen/ScheduleDAGPrinter.cpp
new file mode 100644
index 0000000..027f615
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -0,0 +1,98 @@
+//===-- ScheduleDAGPrinter.cpp - Implement ScheduleDAG::viewGraph() -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAG::viewGraph method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Config/config.h"
+#include <fstream>
+using namespace llvm;
+
+namespace llvm {
+  template<>
+  struct DOTGraphTraits<ScheduleDAG*> : public DefaultDOTGraphTraits {
+
+    DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+    static std::string getGraphName(const ScheduleDAG *G) {
+      return G->MF.getFunction()->getName();
+    }
+
+    static bool renderGraphFromBottomUp() {
+      return true;
+    }
+    
+    static bool hasNodeAddressLabel(const SUnit *Node,
+                                    const ScheduleDAG *Graph) {
+      return true;
+    }
+    
+    /// If you want to override the dot attributes printed for a particular
+    /// edge, override this method.
+    static std::string getEdgeAttributes(const SUnit *Node,
+                                         SUnitIterator EI) {
+      if (EI.isArtificialDep())
+        return "color=cyan,style=dashed";
+      if (EI.isCtrlDep())
+        return "color=blue,style=dashed";
+      return "";
+    }
+    
+
+    std::string getNodeLabel(const SUnit *Node, const ScheduleDAG *Graph);
+    static std::string getNodeAttributes(const SUnit *N,
+                                         const ScheduleDAG *Graph) {
+      return "shape=Mrecord";
+    }
+
+    static void addCustomGraphFeatures(ScheduleDAG *G,
+                                       GraphWriter<ScheduleDAG*> &GW) {
+      return G->addCustomGraphFeatures(GW);
+    }
+  };
+}
+
+std::string DOTGraphTraits<ScheduleDAG*>::getNodeLabel(const SUnit *SU,
+                                                       const ScheduleDAG *G) {
+  return G->getGraphNodeLabel(SU);
+}
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void ScheduleDAG::viewGraph() {
+// This code is only for debugging!
+#ifndef NDEBUG
+  if (BB->getBasicBlock())
+    ViewGraph(this, "dag." + MF.getFunction()->getNameStr(), false,
+              "Scheduling-Units Graph for " + MF.getFunction()->getNameStr() + 
+              ":" + BB->getBasicBlock()->getNameStr());
+  else
+    ViewGraph(this, "dag." + MF.getFunction()->getNameStr(), false,
+              "Scheduling-Units Graph for " + MF.getFunction()->getNameStr());
+#else
+  errs() << "ScheduleDAG::viewGraph is only available in debug builds on "
+         << "systems with Graphviz or gv!\n";
+#endif  // NDEBUG
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/CMakeLists.txt b/src/LLVM/lib/CodeGen/SelectionDAG/CMakeLists.txt
new file mode 100644
index 0000000..799988a
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/CMakeLists.txt
@@ -0,0 +1,25 @@
+add_llvm_library(LLVMSelectionDAG
+  DAGCombiner.cpp
+  FastISel.cpp
+  FunctionLoweringInfo.cpp
+  InstrEmitter.cpp
+  LegalizeDAG.cpp
+  LegalizeFloatTypes.cpp
+  LegalizeIntegerTypes.cpp
+  LegalizeTypes.cpp
+  LegalizeTypesGeneric.cpp
+  LegalizeVectorOps.cpp
+  LegalizeVectorTypes.cpp
+  ScheduleDAGFast.cpp
+  ScheduleDAGList.cpp
+  ScheduleDAGRRList.cpp
+  ScheduleDAGSDNodes.cpp
+  SelectionDAG.cpp
+  SelectionDAGBuilder.cpp
+  SelectionDAGISel.cpp
+  SelectionDAGPrinter.cpp
+  TargetLowering.cpp
+  TargetSelectionDAGInfo.cpp
+  )
+
+target_link_libraries (LLVMSelectionDAG LLVMAnalysis LLVMAsmPrinter LLVMCodeGen)
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
new file mode 100644
index 0000000..7ffbf8d
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -0,0 +1,7221 @@
+//===-- DAGCombiner.cpp - Implement a DAG node combiner -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass combines dag nodes to form fewer, simpler DAG nodes.  It can be run
+// both before and after the DAG is legalized.
+//
+// This pass is not a substitute for the LLVM IR instcombine pass. This pass is
+// primarily intended to handle simplification opportunities that are implicit
+// in the LLVM IR and exposed by the various codegen lowering phases.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dagcombine"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NodesCombined   , "Number of dag nodes combined");
+STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
+STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
+STATISTIC(OpsNarrowed     , "Number of load/op/store narrowed");
+
+namespace {
+  static cl::opt<bool>
+    CombinerAA("combiner-alias-analysis", cl::Hidden,
+               cl::desc("Turn on alias analysis during testing"));
+
+  static cl::opt<bool>
+    CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
+               cl::desc("Include global information in alias analysis"));
+
+//------------------------------ DAGCombiner ---------------------------------//
+
+  class DAGCombiner {
+    SelectionDAG &DAG;
+    const TargetLowering &TLI;
+    CombineLevel Level;
+    CodeGenOpt::Level OptLevel;
+    bool LegalOperations;
+    bool LegalTypes;
+
+    // Worklist of all of the nodes that need to be simplified.
+    std::vector<SDNode*> WorkList;
+
+    // AA - Used for DAG load/store alias analysis.
+    AliasAnalysis &AA;
+
+    /// AddUsersToWorkList - When an instruction is simplified, add all users of
+    /// the instruction to the work lists because they might get more simplified
+    /// now.
+    ///
+    void AddUsersToWorkList(SDNode *N) {
+      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+           UI != UE; ++UI)
+        AddToWorkList(*UI);
+    }
+
+    /// visit - call the node-specific routine that knows how to fold each
+    /// particular type of node.
+    SDValue visit(SDNode *N);
+
+  public:
+    /// AddToWorkList - Add to the work list, making sure its instance is at
+    /// the back (next to be processed).
+    void AddToWorkList(SDNode *N) {
+      removeFromWorkList(N);
+      WorkList.push_back(N);
+    }
+
+    /// removeFromWorkList - remove all instances of N from the worklist.
+    ///
+    void removeFromWorkList(SDNode *N) {
+      WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), N),
+                     WorkList.end());
+    }
+
+    SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
+                      bool AddTo = true);
+
+    SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) {
+      return CombineTo(N, &Res, 1, AddTo);
+    }
+
+    SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1,
+                      bool AddTo = true) {
+      SDValue To[] = { Res0, Res1 };
+      return CombineTo(N, To, 2, AddTo);
+    }
+
+    void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
+
+  private:
+
+    /// SimplifyDemandedBits - Check the specified integer node value to see if
+    /// it can be simplified or if things it uses can be simplified by bit
+    /// propagation.  If so, return true.
+    bool SimplifyDemandedBits(SDValue Op) {
+      unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+      APInt Demanded = APInt::getAllOnesValue(BitWidth);
+      return SimplifyDemandedBits(Op, Demanded);
+    }
+
+    bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded);
+
+    bool CombineToPreIndexedLoadStore(SDNode *N);
+    bool CombineToPostIndexedLoadStore(SDNode *N);
+
+    void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
+    SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
+    SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
+    SDValue ZExtPromoteOperand(SDValue Op, EVT PVT);
+    SDValue PromoteIntBinOp(SDValue Op);
+    SDValue PromoteIntShiftOp(SDValue Op);
+    SDValue PromoteExtend(SDValue Op);
+    bool PromoteLoad(SDValue Op);
+
+    /// combine - call the node-specific routine that knows how to fold each
+    /// particular type of node. If that doesn't do anything, try the
+    /// target-specific DAG combines.
+    SDValue combine(SDNode *N);
+
+    // Visitation implementation - Implement dag node combining for different
+    // node types.  The semantics are as follows:
+    // Return Value:
+    //   SDValue.getNode() == 0 - No change was made
+    //   SDValue.getNode() == N - N was replaced, is dead and has been handled.
+    //   otherwise              - N should be replaced by the returned Operand.
+    //
+    SDValue visitTokenFactor(SDNode *N);
+    SDValue visitMERGE_VALUES(SDNode *N);
+    SDValue visitADD(SDNode *N);
+    SDValue visitSUB(SDNode *N);
+    SDValue visitADDC(SDNode *N);
+    SDValue visitADDE(SDNode *N);
+    SDValue visitMUL(SDNode *N);
+    SDValue visitSDIV(SDNode *N);
+    SDValue visitUDIV(SDNode *N);
+    SDValue visitSREM(SDNode *N);
+    SDValue visitUREM(SDNode *N);
+    SDValue visitMULHU(SDNode *N);
+    SDValue visitMULHS(SDNode *N);
+    SDValue visitSMUL_LOHI(SDNode *N);
+    SDValue visitUMUL_LOHI(SDNode *N);
+    SDValue visitSDIVREM(SDNode *N);
+    SDValue visitUDIVREM(SDNode *N);
+    SDValue visitAND(SDNode *N);
+    SDValue visitOR(SDNode *N);
+    SDValue visitXOR(SDNode *N);
+    SDValue SimplifyVBinOp(SDNode *N);
+    SDValue visitSHL(SDNode *N);
+    SDValue visitSRA(SDNode *N);
+    SDValue visitSRL(SDNode *N);
+    SDValue visitCTLZ(SDNode *N);
+    SDValue visitCTTZ(SDNode *N);
+    SDValue visitCTPOP(SDNode *N);
+    SDValue visitSELECT(SDNode *N);
+    SDValue visitSELECT_CC(SDNode *N);
+    SDValue visitSETCC(SDNode *N);
+    SDValue visitSIGN_EXTEND(SDNode *N);
+    SDValue visitZERO_EXTEND(SDNode *N);
+    SDValue visitANY_EXTEND(SDNode *N);
+    SDValue visitSIGN_EXTEND_INREG(SDNode *N);
+    SDValue visitTRUNCATE(SDNode *N);
+    SDValue visitBIT_CONVERT(SDNode *N);
+    SDValue visitBUILD_PAIR(SDNode *N);
+    SDValue visitFADD(SDNode *N);
+    SDValue visitFSUB(SDNode *N);
+    SDValue visitFMUL(SDNode *N);
+    SDValue visitFDIV(SDNode *N);
+    SDValue visitFREM(SDNode *N);
+    SDValue visitFCOPYSIGN(SDNode *N);
+    SDValue visitSINT_TO_FP(SDNode *N);
+    SDValue visitUINT_TO_FP(SDNode *N);
+    SDValue visitFP_TO_SINT(SDNode *N);
+    SDValue visitFP_TO_UINT(SDNode *N);
+    SDValue visitFP_ROUND(SDNode *N);
+    SDValue visitFP_ROUND_INREG(SDNode *N);
+    SDValue visitFP_EXTEND(SDNode *N);
+    SDValue visitFNEG(SDNode *N);
+    SDValue visitFABS(SDNode *N);
+    SDValue visitBRCOND(SDNode *N);
+    SDValue visitBR_CC(SDNode *N);
+    SDValue visitLOAD(SDNode *N);
+    SDValue visitSTORE(SDNode *N);
+    SDValue visitINSERT_VECTOR_ELT(SDNode *N);
+    SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
+    SDValue visitBUILD_VECTOR(SDNode *N);
+    SDValue visitCONCAT_VECTORS(SDNode *N);
+    SDValue visitVECTOR_SHUFFLE(SDNode *N);
+    SDValue visitMEMBARRIER(SDNode *N);
+
+    SDValue XformToShuffleWithZero(SDNode *N);
+    SDValue ReassociateOps(unsigned Opc, DebugLoc DL, SDValue LHS, SDValue RHS);
+
+    SDValue visitShiftByConstant(SDNode *N, unsigned Amt);
+
+    bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
+    SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
+    SDValue SimplifySelect(DebugLoc DL, SDValue N0, SDValue N1, SDValue N2);
+    SDValue SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1, SDValue N2,
+                             SDValue N3, ISD::CondCode CC,
+                             bool NotExtCompare = false);
+    SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
+                          DebugLoc DL, bool foldBooleans = true);
+    SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
+                                         unsigned HiOp);
+    SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
+    SDValue ConstantFoldBIT_CONVERTofBUILD_VECTOR(SDNode *, EVT);
+    SDValue BuildSDIV(SDNode *N);
+    SDValue BuildUDIV(SDNode *N);
+    SDNode *MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL);
+    SDValue ReduceLoadWidth(SDNode *N);
+    SDValue ReduceLoadOpStoreWidth(SDNode *N);
+
+    SDValue GetDemandedBits(SDValue V, const APInt &Mask);
+
+    /// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
+    /// looking for aliasing nodes and adding them to the Aliases vector.
+    void GatherAllAliases(SDNode *N, SDValue OriginalChain,
+                          SmallVector<SDValue, 8> &Aliases);
+
+    /// isAlias - Return true if there is any possibility that the two addresses
+    /// overlap.
+    bool isAlias(SDValue Ptr1, int64_t Size1,
+                 const Value *SrcValue1, int SrcValueOffset1,
+                 unsigned SrcValueAlign1,
+                 SDValue Ptr2, int64_t Size2,
+                 const Value *SrcValue2, int SrcValueOffset2,
+                 unsigned SrcValueAlign2) const;
+
+    /// FindAliasInfo - Extracts the relevant alias information from the memory
+    /// node.  Returns true if the operand was a load.
+    bool FindAliasInfo(SDNode *N,
+                       SDValue &Ptr, int64_t &Size,
+                       const Value *&SrcValue, int &SrcValueOffset,
+                       unsigned &SrcValueAlignment) const;
+
+    /// FindBetterChain - Walk up chain skipping non-aliasing memory nodes,
+    /// looking for a better chain (aliasing node).
+    SDValue FindBetterChain(SDNode *N, SDValue Chain);
+
+  public:
+    DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
+      : DAG(D), TLI(D.getTargetLoweringInfo()), Level(Unrestricted),
+        OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {}
+
+    /// Run - runs the dag combiner on all nodes in the work list
+    void Run(CombineLevel AtLevel);
+    
+    SelectionDAG &getDAG() const { return DAG; }
+    
+    /// getShiftAmountTy - Returns a type large enough to hold any valid
+    /// shift amount - before type legalization these can be huge.
+    EVT getShiftAmountTy() {
+      return LegalTypes ? TLI.getShiftAmountTy() : TLI.getPointerTy();
+    }
+    
+    /// isTypeLegal - This method returns true if we are running before type
+    /// legalization or if the specified VT is legal.
+    bool isTypeLegal(const EVT &VT) {
+      if (!LegalTypes) return true;
+      return TLI.isTypeLegal(VT);
+    }
+  };
+}
+
+
+namespace {
+/// WorkListRemover - This class is a DAGUpdateListener that removes any deleted
+/// nodes from the worklist.
+class WorkListRemover : public SelectionDAG::DAGUpdateListener {
+  DAGCombiner &DC;
+public:
+  explicit WorkListRemover(DAGCombiner &dc) : DC(dc) {}
+
+  virtual void NodeDeleted(SDNode *N, SDNode *E) {
+    DC.removeFromWorkList(N);
+  }
+
+  virtual void NodeUpdated(SDNode *N) {
+    // Ignore updates.
+  }
+};
+}
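+
+// Usage sketch for the listener above: construct it on the stack around any
+// bulk replacement, e.g.
+//   WorkListRemover DeadNodes(*this);
+//   DAG.ReplaceAllUsesWith(N, To, &DeadNodes);
+// so that nodes deleted during the replacement are also dropped from the
+// combiner's worklist instead of being left dangling.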
+
+//===----------------------------------------------------------------------===//
+//  TargetLowering::DAGCombinerInfo implementation
+//===----------------------------------------------------------------------===//
+
+void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
+  ((DAGCombiner*)DC)->AddToWorkList(N);
+}
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, const std::vector<SDValue> &To, bool AddTo) {
+  return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo);
+}
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, SDValue Res, bool AddTo) {
+  return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo);
+}
+
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
+  return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo);
+}
+
+void TargetLowering::DAGCombinerInfo::
+CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
+  return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO);
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// isNegatibleForFree - Return 1 if we can compute the negated form of the
+/// specified expression for the same cost as the expression itself, 2 if we
+/// can compute the negated form more cheaply than the expression itself, or
+/// 0 if negation is not free.
+static char isNegatibleForFree(SDValue Op, bool LegalOperations,
+                               unsigned Depth = 0) {
+  // No compile time optimizations on this type.
+  if (Op.getValueType() == MVT::ppcf128)
+    return 0;
+
+  // fneg is removable even if it has multiple uses.
+  if (Op.getOpcode() == ISD::FNEG) return 2;
+
+  // Don't allow anything with multiple uses.
+  if (!Op.hasOneUse()) return 0;
+
+  // Don't recurse exponentially.
+  if (Depth > 6) return 0;
+
+  switch (Op.getOpcode()) {
+  default: return 0;
+  case ISD::ConstantFP:
+    // Don't invert constant FP values after legalize.  The negated constant
+    // isn't necessarily legal.
+    return LegalOperations ? 0 : 1;
+  case ISD::FADD:
+    // FIXME: determine better conditions for this xform.
+    if (!UnsafeFPMath) return 0;
+
+    // fold (fsub (fadd A, B)) -> (fsub (fneg A), B)
+    if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, Depth+1))
+      return V;
+    // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
+    return isNegatibleForFree(Op.getOperand(1), LegalOperations, Depth+1);
+  case ISD::FSUB:
+    // We can't turn -(A-B) into B-A when we honor signed zeros.
+    if (!UnsafeFPMath) return 0;
+
+    // fold (fneg (fsub A, B)) -> (fsub B, A)
+    return 1;
+
+  case ISD::FMUL:
+  case ISD::FDIV:
+    if (HonorSignDependentRoundingFPMath()) return 0;
+
+    // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
+    if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, Depth+1))
+      return V;
+
+    return isNegatibleForFree(Op.getOperand(1), LegalOperations, Depth+1);
+
+  case ISD::FP_EXTEND:
+  case ISD::FP_ROUND:
+  case ISD::FSIN:
+    return isNegatibleForFree(Op.getOperand(0), LegalOperations, Depth+1);
+  }
+}
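+
+// Concrete cases for the routine above: it returns 2 for (fneg X), since
+// negating just strips the fneg; 1 for a ConstantFP before legalization,
+// since the sign bit can be flipped in place; and 0 for (fadd A, B) when
+// UnsafeFPMath is off, since the rewrite to fsub is not safe when signed
+// zeros must be honored.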
+
+/// GetNegatedExpression - If isNegatibleForFree returns true, this function
+/// returns the newly negated expression.
+static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
+                                    bool LegalOperations, unsigned Depth = 0) {
+  // fneg is removable even if it has multiple uses.
+  if (Op.getOpcode() == ISD::FNEG) return Op.getOperand(0);
+
+  // Don't allow anything with multiple uses.
+  assert(Op.hasOneUse() && "Unknown reuse!");
+
+  assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree");
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Unknown code");
+  case ISD::ConstantFP: {
+    APFloat V = cast<ConstantFPSDNode>(Op)->getValueAPF();
+    V.changeSign();
+    return DAG.getConstantFP(V, Op.getValueType());
+  }
+  case ISD::FADD:
+    // FIXME: determine better conditions for this xform.
+    assert(UnsafeFPMath);
+
+    // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
+    if (isNegatibleForFree(Op.getOperand(0), LegalOperations, Depth+1))
+      return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+                         GetNegatedExpression(Op.getOperand(0), DAG,
+                                              LegalOperations, Depth+1),
+                         Op.getOperand(1));
+    // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
+    return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+                       GetNegatedExpression(Op.getOperand(1), DAG,
+                                            LegalOperations, Depth+1),
+                       Op.getOperand(0));
+  case ISD::FSUB:
+    // We can't turn -(A-B) into B-A when we honor signed zeros.
+    assert(UnsafeFPMath);
+
+    // fold (fneg (fsub 0, B)) -> B
+    if (ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Op.getOperand(0)))
+      if (N0CFP->getValueAPF().isZero())
+        return Op.getOperand(1);
+
+    // fold (fneg (fsub A, B)) -> (fsub B, A)
+    return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(0));
+
+  case ISD::FMUL:
+  case ISD::FDIV:
+    assert(!HonorSignDependentRoundingFPMath());
+
+    // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
+    if (isNegatibleForFree(Op.getOperand(0), LegalOperations, Depth+1))
+      return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), Op.getValueType(),
+                         GetNegatedExpression(Op.getOperand(0), DAG,
+                                              LegalOperations, Depth+1),
+                         Op.getOperand(1));
+
+    // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
+    return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), Op.getValueType(),
+                       Op.getOperand(0),
+                       GetNegatedExpression(Op.getOperand(1), DAG,
+                                            LegalOperations, Depth+1));
+
+  case ISD::FP_EXTEND:
+  case ISD::FSIN:
+    return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), Op.getValueType(),
+                       GetNegatedExpression(Op.getOperand(0), DAG,
+                                            LegalOperations, Depth+1));
+  case ISD::FP_ROUND:
+      return DAG.getNode(ISD::FP_ROUND, Op.getDebugLoc(), Op.getValueType(),
+                         GetNegatedExpression(Op.getOperand(0), DAG,
+                                              LegalOperations, Depth+1),
+                         Op.getOperand(1));
+  }
+}
+
+
+// isSetCCEquivalent - Return true if this node is a setcc, or is a select_cc
+// that selects between the values 1 and 0, making it equivalent to a setcc.
+// Also, set the incoming LHS, RHS, and CC references to the appropriate
+// nodes based on the type of node we are checking.  This simplifies life a
+// bit for the callers.
+static bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
+                              SDValue &CC) {
+  if (N.getOpcode() == ISD::SETCC) {
+    LHS = N.getOperand(0);
+    RHS = N.getOperand(1);
+    CC  = N.getOperand(2);
+    return true;
+  }
+  if (N.getOpcode() == ISD::SELECT_CC &&
+      N.getOperand(2).getOpcode() == ISD::Constant &&
+      N.getOperand(3).getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(N.getOperand(2))->getAPIntValue() == 1 &&
+      cast<ConstantSDNode>(N.getOperand(3))->isNullValue()) {
+    LHS = N.getOperand(0);
+    RHS = N.getOperand(1);
+    CC  = N.getOperand(4);
+    return true;
+  }
+  return false;
+}
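+
+// Example for the predicate above: both of the following match, binding
+// LHS = a, RHS = b, CC = setlt:
+//   (setcc a, b, setlt)
+//   (select_cc a, b, 1, 0, setlt)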
+
+// isOneUseSetCC - Return true if this is a SetCC-equivalent operation with only
+// one use.  If this is true, it allows the users to invert the operation for
+// free when it is profitable to do so.
+static bool isOneUseSetCC(SDValue N) {
+  SDValue N0, N1, N2;
+  if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse())
+    return true;
+  return false;
+}
+
+SDValue DAGCombiner::ReassociateOps(unsigned Opc, DebugLoc DL,
+                                    SDValue N0, SDValue N1) {
+  EVT VT = N0.getValueType();
+  if (N0.getOpcode() == Opc && isa<ConstantSDNode>(N0.getOperand(1))) {
+    if (isa<ConstantSDNode>(N1)) {
+      // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
+      SDValue OpNode =
+        DAG.FoldConstantArithmetic(Opc, VT,
+                                   cast<ConstantSDNode>(N0.getOperand(1)),
+                                   cast<ConstantSDNode>(N1));
+      return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
+    } else if (N0.hasOneUse()) {
+      // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one use
+      SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT,
+                                   N0.getOperand(0), N1);
+      AddToWorkList(OpNode.getNode());
+      return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
+    }
+  }
+
+  if (N1.getOpcode() == Opc && isa<ConstantSDNode>(N1.getOperand(1))) {
+    if (isa<ConstantSDNode>(N0)) {
+      // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
+      SDValue OpNode =
+        DAG.FoldConstantArithmetic(Opc, VT,
+                                   cast<ConstantSDNode>(N1.getOperand(1)),
+                                   cast<ConstantSDNode>(N0));
+      return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
+    } else if (N1.hasOneUse()) {
+      // reassoc. (op y, (op x, c1)) -> (op (op x, y), c1) iff x+c1 has one use
+      SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT,
+                                   N1.getOperand(0), N0);
+      AddToWorkList(OpNode.getNode());
+      return DAG.getNode(Opc, DL, VT, OpNode, N1.getOperand(1));
+    }
+  }
+
+  return SDValue();
+}
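+
+// Worked example of the reassociations above, using add:
+//   (add (add x, 1), 2) --> (add x, 3)          ; both constants fold
+//   (add (add x, 1), y) --> (add (add x, y), 1) ; iff (add x, 1) has one use
+// Pulling the constant outward exposes it to further constant folding.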
+
+SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
+                               bool AddTo) {
+  assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
+  ++NodesCombined;
+  DEBUG(dbgs() << "\nReplacing.1 ";
+        N->dump(&DAG);
+        dbgs() << "\nWith: ";
+        To[0].getNode()->dump(&DAG);
+        dbgs() << " and " << NumTo-1 << " other values\n";
+        for (unsigned i = 0, e = NumTo; i != e; ++i)
+          assert((!To[i].getNode() ||
+                  N->getValueType(i) == To[i].getValueType()) &&
+                 "Cannot combine value to value of different type!"));
+  WorkListRemover DeadNodes(*this);
+  DAG.ReplaceAllUsesWith(N, To, &DeadNodes);
+
+  if (AddTo) {
+    // Push the new nodes and any users onto the worklist
+    for (unsigned i = 0, e = NumTo; i != e; ++i) {
+      if (To[i].getNode()) {
+        AddToWorkList(To[i].getNode());
+        AddUsersToWorkList(To[i].getNode());
+      }
+    }
+  }
+
+  // Finally, if the node is now dead, remove it from the graph.  The node
+  // may not be dead if the replacement process recursively simplified to
+  // something else needing this node.
+  if (N->use_empty()) {
+    // Nodes can be reintroduced into the worklist.  Make sure we do not
+    // process a node that has been replaced.
+    removeFromWorkList(N);
+
+    // Finally, since the node is now dead, remove it from the graph.
+    DAG.DeleteNode(N);
+  }
+  return SDValue(N, 0);
+}
+
+void DAGCombiner::
+CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
+  // Replace all uses.  If any nodes become isomorphic to other nodes and
+  // are deleted, make sure to remove them from our worklist.
+  WorkListRemover DeadNodes(*this);
+  DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New, &DeadNodes);
+
+  // Push the new node and any (possibly new) users onto the worklist.
+  AddToWorkList(TLO.New.getNode());
+  AddUsersToWorkList(TLO.New.getNode());
+
+  // Finally, if the node is now dead, remove it from the graph.  The node
+  // may not be dead if the replacement process recursively simplified to
+  // something else needing this node.
+  if (TLO.Old.getNode()->use_empty()) {
+    removeFromWorkList(TLO.Old.getNode());
+
+    // If the operands of this node are only used by the node, they will now
+    // be dead.  Make sure to visit them first to delete dead nodes early.
+    for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); i != e; ++i)
+      if (TLO.Old.getNode()->getOperand(i).getNode()->hasOneUse())
+        AddToWorkList(TLO.Old.getNode()->getOperand(i).getNode());
+
+    DAG.DeleteNode(TLO.Old.getNode());
+  }
+}
+
+/// SimplifyDemandedBits - Check the specified integer node value to see if
+/// it can be simplified or if things it uses can be simplified by bit
+/// propagation.  If so, return true.
+bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
+  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
+  APInt KnownZero, KnownOne;
+  if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
+    return false;
+
+  // Revisit the node.
+  AddToWorkList(Op.getNode());
+
+  // Replace the old value with the new one.
+  ++NodesCombined;
+  DEBUG(dbgs() << "\nReplacing.2 "; 
+        TLO.Old.getNode()->dump(&DAG);
+        dbgs() << "\nWith: ";
+        TLO.New.getNode()->dump(&DAG);
+        dbgs() << '\n');
+
+  CommitTargetLoweringOpt(TLO);
+  return true;
+}
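+
+// Example of what the demanded-bits query above enables: if only the low 8
+// bits of (and X, 0xFFFF) are demanded, the mask covers every demanded bit,
+// so TLI.SimplifyDemandedBits can replace the and with X outright, and
+// CommitTargetLoweringOpt splices the replacement in while keeping the
+// worklist consistent.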
+
+void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
+  DebugLoc dl = Load->getDebugLoc();
+  EVT VT = Load->getValueType(0);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, SDValue(ExtLoad, 0));
+
+  DEBUG(dbgs() << "\nReplacing.9 ";
+        Load->dump(&DAG);
+        dbgs() << "\nWith: ";
+        Trunc.getNode()->dump(&DAG);
+        dbgs() << '\n');
+  WorkListRemover DeadNodes(*this);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc, &DeadNodes);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1),
+                                &DeadNodes);
+  removeFromWorkList(Load);
+  DAG.DeleteNode(Load);
+  AddToWorkList(Trunc.getNode());
+}
+
+SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
+  Replace = false;
+  DebugLoc dl = Op.getDebugLoc();
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
+    EVT MemVT = LD->getMemoryVT();
+    ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
+      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD : ISD::EXTLOAD)
+      : LD->getExtensionType();
+    Replace = true;
+    return DAG.getExtLoad(ExtType, PVT, dl,
+                          LD->getChain(), LD->getBasePtr(),
+                          LD->getSrcValue(), LD->getSrcValueOffset(),
+                          MemVT, LD->isVolatile(),
+                          LD->isNonTemporal(), LD->getAlignment());
+  }
+
+  unsigned Opc = Op.getOpcode();
+  switch (Opc) {
+  default: break;
+  case ISD::AssertSext:
+    return DAG.getNode(ISD::AssertSext, dl, PVT,
+                       SExtPromoteOperand(Op.getOperand(0), PVT),
+                       Op.getOperand(1));
+  case ISD::AssertZext:
+    return DAG.getNode(ISD::AssertZext, dl, PVT,
+                       ZExtPromoteOperand(Op.getOperand(0), PVT),
+                       Op.getOperand(1));
+  case ISD::Constant: {
+    unsigned ExtOpc =
+      Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    return DAG.getNode(ExtOpc, dl, PVT, Op);
+  }    
+  }
+
+  if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT))
+    return SDValue();
+  return DAG.getNode(ISD::ANY_EXTEND, dl, PVT, Op);
+}
+
+SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
+  if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT))
+    return SDValue();
+  EVT OldVT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  bool Replace = false;
+  SDValue NewOp = PromoteOperand(Op, PVT, Replace);
+  if (NewOp.getNode() == 0)
+    return SDValue();
+  AddToWorkList(NewOp.getNode());
+
+  if (Replace)
+    ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NewOp.getValueType(), NewOp,
+                     DAG.getValueType(OldVT));
+}
+
+SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) {
+  EVT OldVT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  bool Replace = false;
+  SDValue NewOp = PromoteOperand(Op, PVT, Replace);
+  if (NewOp.getNode() == 0)
+    return SDValue();
+  AddToWorkList(NewOp.getNode());
+
+  if (Replace)
+    ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
+  return DAG.getZeroExtendInReg(NewOp, dl, OldVT);
+}
+
+/// PromoteIntBinOp - Promote the specified integer binary operation if the
+/// target indicates it is beneficial. e.g. On x86, it's usually better to
+/// promote i16 operations to i32 since i16 instructions are longer.
+SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
+  if (!LegalOperations)
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  if (VT.isVector() || !VT.isInteger())
+    return SDValue();
+
+  // If operation type is 'undesirable', e.g. i16 on x86, consider
+  // promoting it.
+  unsigned Opc = Op.getOpcode();
+  if (TLI.isTypeDesirableForOp(Opc, VT))
+    return SDValue();
+
+  EVT PVT = VT;
+  // Consult target whether it is a good idea to promote this operation and
+  // what's the right type to promote it to.
+  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+    assert(PVT != VT && "Don't know what type to promote to!");
+
+    bool Replace0 = false;
+    SDValue N0 = Op.getOperand(0);
+    SDValue NN0 = PromoteOperand(N0, PVT, Replace0);
+    if (NN0.getNode() == 0)
+      return SDValue();
+
+    bool Replace1 = false;
+    SDValue N1 = Op.getOperand(1);
+    SDValue NN1;
+    if (N0 == N1)
+      NN1 = NN0;
+    else {
+      NN1 = PromoteOperand(N1, PVT, Replace1);
+      if (NN1.getNode() == 0)
+        return SDValue();
+    }
+
+    AddToWorkList(NN0.getNode());
+    if (NN1.getNode())
+      AddToWorkList(NN1.getNode());
+
+    if (Replace0)
+      ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
+    if (Replace1)
+      ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
+
+    DEBUG(dbgs() << "\nPromoting ";
+          Op.getNode()->dump(&DAG));
+    DebugLoc dl = Op.getDebugLoc();
+    return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                       DAG.getNode(Opc, dl, PVT, NN0, NN1));
+  }
+  return SDValue();
+}
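+
+// Example of the promotion above, using the x86 case from the comment: an
+// i16 add whose type the target reports as undesirable is rewritten as
+//   (truncate (add (any_extend a), (any_extend b)))
+// typically via any_extend (loads and asserts use sext/zext as appropriate),
+// at the width chosen by IsDesirableToPromoteOp (i32 on x86).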
+
+/// PromoteIntShiftOp - Promote the specified integer shift operation if the
+/// target indicates it is beneficial. e.g. On x86, it's usually better to
+/// promote i16 operations to i32 since i16 instructions are longer.
+SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
+  if (!LegalOperations)
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  if (VT.isVector() || !VT.isInteger())
+    return SDValue();
+
+  // If operation type is 'undesirable', e.g. i16 on x86, consider
+  // promoting it.
+  unsigned Opc = Op.getOpcode();
+  if (TLI.isTypeDesirableForOp(Opc, VT))
+    return SDValue();
+
+  EVT PVT = VT;
+  // Consult target whether it is a good idea to promote this operation and
+  // what's the right type to promote it to.
+  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+    assert(PVT != VT && "Don't know what type to promote to!");
+
+    bool Replace = false;
+    SDValue N0 = Op.getOperand(0);
+    if (Opc == ISD::SRA)
+      N0 = SExtPromoteOperand(Op.getOperand(0), PVT);
+    else if (Opc == ISD::SRL)
+      N0 = ZExtPromoteOperand(Op.getOperand(0), PVT);
+    else
+      N0 = PromoteOperand(N0, PVT, Replace);
+    if (N0.getNode() == 0)
+      return SDValue();
+
+    AddToWorkList(N0.getNode());
+    if (Replace)
+      ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());
+
+    DEBUG(dbgs() << "\nPromoting ";
+          Op.getNode()->dump(&DAG));
+    DebugLoc dl = Op.getDebugLoc();
+    return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                       DAG.getNode(Opc, dl, PVT, N0, Op.getOperand(1)));
+  }
+  return SDValue();
+}
+
+SDValue DAGCombiner::PromoteExtend(SDValue Op) {
+  if (!LegalOperations)
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  if (VT.isVector() || !VT.isInteger())
+    return SDValue();
+
+  // If operation type is 'undesirable', e.g. i16 on x86, consider
+  // promoting it.
+  unsigned Opc = Op.getOpcode();
+  if (TLI.isTypeDesirableForOp(Opc, VT))
+    return SDValue();
+
+  EVT PVT = VT;
+  // Consult target whether it is a good idea to promote this operation and
+  // what's the right type to promote it to.
+  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+    assert(PVT != VT && "Don't know what type to promote to!");
+    // fold (aext (aext x)) -> (aext x)
+    // fold (aext (zext x)) -> (zext x)
+    // fold (aext (sext x)) -> (sext x)
+    DEBUG(dbgs() << "\nPromoting ";
+          Op.getNode()->dump(&DAG));
+    return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), VT, Op.getOperand(0));
+  }
+  return SDValue();
+}
+
+bool DAGCombiner::PromoteLoad(SDValue Op) {
+  if (!LegalOperations)
+    return false;
+
+  EVT VT = Op.getValueType();
+  if (VT.isVector() || !VT.isInteger())
+    return false;
+
+  // If operation type is 'undesirable', e.g. i16 on x86, consider
+  // promoting it.
+  unsigned Opc = Op.getOpcode();
+  if (TLI.isTypeDesirableForOp(Opc, VT))
+    return false;
+
+  EVT PVT = VT;
+  // Consult target whether it is a good idea to promote this operation and
+  // what's the right type to promote it to.
+  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+    assert(PVT != VT && "Don't know what type to promote to!");
+
+    DebugLoc dl = Op.getDebugLoc();
+    SDNode *N = Op.getNode();
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    EVT MemVT = LD->getMemoryVT();
+    ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
+      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD : ISD::EXTLOAD)
+      : LD->getExtensionType();
+    SDValue NewLD = DAG.getExtLoad(ExtType, PVT, dl,
+                                   LD->getChain(), LD->getBasePtr(),
+                                   LD->getSrcValue(), LD->getSrcValueOffset(),
+                                   MemVT, LD->isVolatile(),
+                                   LD->isNonTemporal(), LD->getAlignment());
+    SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, VT, NewLD);
+
+    DEBUG(dbgs() << "\nPromoting ";
+          N->dump(&DAG);
+          dbgs() << "\nTo: ";
+          Result.getNode()->dump(&DAG);
+          dbgs() << '\n');
+    WorkListRemover DeadNodes(*this);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result, &DeadNodes);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1), &DeadNodes);
+    removeFromWorkList(N);
+    DAG.DeleteNode(N);
+    AddToWorkList(Result.getNode());
+    return true;
+  }
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Main DAG Combiner implementation
+//===----------------------------------------------------------------------===//
+
+void DAGCombiner::Run(CombineLevel AtLevel) {
+  // set the instance variables, so that the various visit routines may
+  // use them.
+  Level = AtLevel;
+  LegalOperations = Level >= NoIllegalOperations;
+  LegalTypes = Level >= NoIllegalTypes;
+
+  // Add all the dag nodes to the worklist.
+  WorkList.reserve(DAG.allnodes_size());
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = DAG.allnodes_end(); I != E; ++I)
+    WorkList.push_back(I);
+
+  // Create a dummy node (which is not added to allnodes), that adds a reference
+  // to the root node, preventing it from being deleted, and tracking any
+  // changes of the root.
+  HandleSDNode Dummy(DAG.getRoot());
+
+  // The root of the DAG may temporarily refer to deleted nodes until the DAG
+  // combiner is done.  Set it to null to avoid confusion.
+  DAG.setRoot(SDValue());
+
+  // while the worklist isn't empty, inspect the node at the end of it and
+  // try to combine it.
+  while (!WorkList.empty()) {
+    SDNode *N = WorkList.back();
+    WorkList.pop_back();
+
+    // If N has no uses, it is dead.  Make sure to revisit all N's operands once
+    // N is deleted from the DAG, since they too may now be dead or may have a
+    // reduced number of uses, allowing other xforms.
+    if (N->use_empty() && N != &Dummy) {
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+        AddToWorkList(N->getOperand(i).getNode());
+
+      DAG.DeleteNode(N);
+      continue;
+    }
+
+    SDValue RV = combine(N);
+
+    if (RV.getNode() == 0)
+      continue;
+
+    ++NodesCombined;
+
+    // If we get back the same node we passed in, rather than a new node or
+    // zero, we know that the node must have defined multiple values and
+    // CombineTo was used.  Since CombineTo takes care of the worklist
+    // mechanics for us, we have no work to do in this case.
+    if (RV.getNode() == N)
+      continue;
+
+    assert(N->getOpcode() != ISD::DELETED_NODE &&
+           RV.getNode()->getOpcode() != ISD::DELETED_NODE &&
+           "Node was deleted but visit returned new node!");
+
+    DEBUG(dbgs() << "\nReplacing.3 "; 
+          N->dump(&DAG);
+          dbgs() << "\nWith: ";
+          RV.getNode()->dump(&DAG);
+          dbgs() << '\n');
+    WorkListRemover DeadNodes(*this);
+    if (N->getNumValues() == RV.getNode()->getNumValues())
+      DAG.ReplaceAllUsesWith(N, RV.getNode(), &DeadNodes);
+    else {
+      assert(N->getValueType(0) == RV.getValueType() &&
+             N->getNumValues() == 1 && "Type mismatch");
+      SDValue OpV = RV;
+      DAG.ReplaceAllUsesWith(N, &OpV, &DeadNodes);
+    }
+
+    // Push the new node and any users onto the worklist
+    AddToWorkList(RV.getNode());
+    AddUsersToWorkList(RV.getNode());
+
+    // Add any uses of the old node to the worklist in case this node is the
+    // last one that uses them.  They may become dead after this node is
+    // deleted.
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+      AddToWorkList(N->getOperand(i).getNode());
+
+    // Finally, if the node is now dead, remove it from the graph.  The node
+    // may not be dead if the replacement process recursively simplified to
+    // something else needing this node.
+    if (N->use_empty()) {
+      // Nodes can be reintroduced into the worklist.  Make sure we do not
+      // process a node that has been replaced.
+      removeFromWorkList(N);
+
+      // Finally, since the node is now dead, remove it from the graph.
+      DAG.DeleteNode(N);
+    }
+  }
+
+  // If the root changed (e.g. it was a dead load), update the root.
+  DAG.setRoot(Dummy.getValue());
+}
+
+SDValue DAGCombiner::visit(SDNode *N) {
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::TokenFactor:        return visitTokenFactor(N);
+  case ISD::MERGE_VALUES:       return visitMERGE_VALUES(N);
+  case ISD::ADD:                return visitADD(N);
+  case ISD::SUB:                return visitSUB(N);
+  case ISD::ADDC:               return visitADDC(N);
+  case ISD::ADDE:               return visitADDE(N);
+  case ISD::MUL:                return visitMUL(N);
+  case ISD::SDIV:               return visitSDIV(N);
+  case ISD::UDIV:               return visitUDIV(N);
+  case ISD::SREM:               return visitSREM(N);
+  case ISD::UREM:               return visitUREM(N);
+  case ISD::MULHU:              return visitMULHU(N);
+  case ISD::MULHS:              return visitMULHS(N);
+  case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
+  case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
+  case ISD::SDIVREM:            return visitSDIVREM(N);
+  case ISD::UDIVREM:            return visitUDIVREM(N);
+  case ISD::AND:                return visitAND(N);
+  case ISD::OR:                 return visitOR(N);
+  case ISD::XOR:                return visitXOR(N);
+  case ISD::SHL:                return visitSHL(N);
+  case ISD::SRA:                return visitSRA(N);
+  case ISD::SRL:                return visitSRL(N);
+  case ISD::CTLZ:               return visitCTLZ(N);
+  case ISD::CTTZ:               return visitCTTZ(N);
+  case ISD::CTPOP:              return visitCTPOP(N);
+  case ISD::SELECT:             return visitSELECT(N);
+  case ISD::SELECT_CC:          return visitSELECT_CC(N);
+  case ISD::SETCC:              return visitSETCC(N);
+  case ISD::SIGN_EXTEND:        return visitSIGN_EXTEND(N);
+  case ISD::ZERO_EXTEND:        return visitZERO_EXTEND(N);
+  case ISD::ANY_EXTEND:         return visitANY_EXTEND(N);
+  case ISD::SIGN_EXTEND_INREG:  return visitSIGN_EXTEND_INREG(N);
+  case ISD::TRUNCATE:           return visitTRUNCATE(N);
+  case ISD::BIT_CONVERT:        return visitBIT_CONVERT(N);
+  case ISD::BUILD_PAIR:         return visitBUILD_PAIR(N);
+  case ISD::FADD:               return visitFADD(N);
+  case ISD::FSUB:               return visitFSUB(N);
+  case ISD::FMUL:               return visitFMUL(N);
+  case ISD::FDIV:               return visitFDIV(N);
+  case ISD::FREM:               return visitFREM(N);
+  case ISD::FCOPYSIGN:          return visitFCOPYSIGN(N);
+  case ISD::SINT_TO_FP:         return visitSINT_TO_FP(N);
+  case ISD::UINT_TO_FP:         return visitUINT_TO_FP(N);
+  case ISD::FP_TO_SINT:         return visitFP_TO_SINT(N);
+  case ISD::FP_TO_UINT:         return visitFP_TO_UINT(N);
+  case ISD::FP_ROUND:           return visitFP_ROUND(N);
+  case ISD::FP_ROUND_INREG:     return visitFP_ROUND_INREG(N);
+  case ISD::FP_EXTEND:          return visitFP_EXTEND(N);
+  case ISD::FNEG:               return visitFNEG(N);
+  case ISD::FABS:               return visitFABS(N);
+  case ISD::BRCOND:             return visitBRCOND(N);
+  case ISD::BR_CC:              return visitBR_CC(N);
+  case ISD::LOAD:               return visitLOAD(N);
+  case ISD::STORE:              return visitSTORE(N);
+  case ISD::INSERT_VECTOR_ELT:  return visitINSERT_VECTOR_ELT(N);
+  case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
+  case ISD::BUILD_VECTOR:       return visitBUILD_VECTOR(N);
+  case ISD::CONCAT_VECTORS:     return visitCONCAT_VECTORS(N);
+  case ISD::VECTOR_SHUFFLE:     return visitVECTOR_SHUFFLE(N);
+  case ISD::MEMBARRIER:         return visitMEMBARRIER(N);
+  }
+  return SDValue();
+}
+
+SDValue DAGCombiner::combine(SDNode *N) {
+  SDValue RV = visit(N);
+
+  // If nothing happened, try a target-specific DAG combine.
+  if (RV.getNode() == 0) {
+    assert(N->getOpcode() != ISD::DELETED_NODE &&
+           "Node was deleted but visit returned NULL!");
+
+    if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
+        TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {
+
+      // Expose the DAG combiner to the target combiner impls.
+      TargetLowering::DAGCombinerInfo
+        DagCombineInfo(DAG, !LegalTypes, !LegalOperations, false, this);
+
+      RV = TLI.PerformDAGCombine(N, DagCombineInfo);
+    }
+  }
+
+  // If still nothing happened, try promoting the operation.
+  if (RV.getNode() == 0) {
+    switch (N->getOpcode()) {
+    default: break;
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::MUL:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+      RV = PromoteIntBinOp(SDValue(N, 0));
+      break;
+    case ISD::SHL:
+    case ISD::SRA:
+    case ISD::SRL:
+      RV = PromoteIntShiftOp(SDValue(N, 0));
+      break;
+    case ISD::SIGN_EXTEND:
+    case ISD::ZERO_EXTEND:
+    case ISD::ANY_EXTEND:
+      RV = PromoteExtend(SDValue(N, 0));
+      break;
+    case ISD::LOAD:
+      if (PromoteLoad(SDValue(N, 0)))
+        RV = SDValue(N, 0);
+      break;
+    }
+  }
+
+  // If N is a commutative binary node, try commuting it to enable more
+  // sdisel CSE.
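+  // For example, if (add y, x) already exists in the DAG, this (add x, y)
+  // can be replaced with it rather than being selected separately.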
+  if (RV.getNode() == 0 &&
+      SelectionDAG::isCommutativeBinOp(N->getOpcode()) &&
+      N->getNumValues() == 1) {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+
+    // Constant operands are canonicalized to RHS.
+    if (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1)) {
+      SDValue Ops[] = { N1, N0 };
+      SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(),
+                                            Ops, 2);
+      if (CSENode)
+        return SDValue(CSENode, 0);
+    }
+  }
+
+  return RV;
+}
+
+/// getInputChainForNode - Given a node, return its input chain if it has one,
+/// otherwise return a null sd operand.
+static SDValue getInputChainForNode(SDNode *N) {
+  if (unsigned NumOps = N->getNumOperands()) {
+    if (N->getOperand(0).getValueType() == MVT::Other)
+      return N->getOperand(0);
+    else if (N->getOperand(NumOps-1).getValueType() == MVT::Other)
+      return N->getOperand(NumOps-1);
+    for (unsigned i = 1; i < NumOps-1; ++i)
+      if (N->getOperand(i).getValueType() == MVT::Other)
+        return N->getOperand(i);
+  }
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
+  // If N has two operands, where one has an input chain equal to the other,
+  // the 'other' chain is redundant.
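+  // For example, in (TokenFactor (load ch, p), ch) the load already depends
+  // on ch, so the token factor reduces to the load's output chain.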
+  if (N->getNumOperands() == 2) {
+    if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
+      return N->getOperand(0);
+    if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
+      return N->getOperand(1);
+  }
+
+  SmallVector<SDNode *, 8> TFs;     // List of token factors to visit.
+  SmallVector<SDValue, 8> Ops;    // Ops for replacing token factor.
+  SmallPtrSet<SDNode*, 16> SeenOps;
+  bool Changed = false;             // If we should replace this token factor.
+
+  // Start out with this token factor.
+  TFs.push_back(N);
+
+  // Iterate through token factors.  The TFs list grows as new token factors
+  // are encountered.
+  for (unsigned i = 0; i < TFs.size(); ++i) {
+    SDNode *TF = TFs[i];
+
+    // Check each of the operands.
+    for (unsigned i = 0, ie = TF->getNumOperands(); i != ie; ++i) {
+      SDValue Op = TF->getOperand(i);
+
+      switch (Op.getOpcode()) {
+      case ISD::EntryToken:
+        // Entry tokens don't need to be added to the list. They are
+        // redundant.
+        Changed = true;
+        break;
+
+      case ISD::TokenFactor:
+        if (Op.hasOneUse() &&
+            std::find(TFs.begin(), TFs.end(), Op.getNode()) == TFs.end()) {
+          // Queue up for processing.
+          TFs.push_back(Op.getNode());
+          // Clean up in case the token factor is removed.
+          AddToWorkList(Op.getNode());
+          Changed = true;
+          break;
+        }
+        // Fall through.
+
+      default:
+        // Only add if it isn't already in the list.
+        if (SeenOps.insert(Op.getNode()))
+          Ops.push_back(Op);
+        else
+          Changed = true;
+        break;
+      }
+    }
+  }
+  
+  SDValue Result;
+
+  // If we've changed things around then replace the token factor.
+  if (Changed) {
+    if (Ops.empty()) {
+      // The entry token is the only possible outcome.
+      Result = DAG.getEntryNode();
+    } else {
+      // New and improved token factor.
+      Result = DAG.getNode(ISD::TokenFactor, N->getDebugLoc(),
+                           MVT::Other, &Ops[0], Ops.size());
+    }
+
+    // Don't add users to work list.
+    return CombineTo(N, Result, false);
+  }
+
+  return Result;
+}
+
+/// MERGE_VALUES can always be eliminated.
+SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
+  WorkListRemover DeadNodes(*this);
+  // Replacing results may cause a different MERGE_VALUES to suddenly
+  // be CSE'd with N, and carry its uses with it. Iterate until no
+  // uses remain, to ensure that the node can be safely deleted.
+  do {
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+      DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i),
+                                    &DeadNodes);
+  } while (!N->use_empty());
+  removeFromWorkList(N);
+  DAG.DeleteNode(N);
+  return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+}
+
+static
+SDValue combineShlAddConstant(DebugLoc DL, SDValue N0, SDValue N1,
+                              SelectionDAG &DAG) {
+  EVT VT = N0.getValueType();
+  SDValue N00 = N0.getOperand(0);
+  SDValue N01 = N0.getOperand(1);
+  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N01);
+
+  if (N01C && N00.getOpcode() == ISD::ADD && N00.getNode()->hasOneUse() &&
+      isa<ConstantSDNode>(N00.getOperand(1))) {
+    // fold (add (shl (add x, c1), c2), y) -> (add (add (shl x, c2), c1<<c2), y)
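+    // For example, ((x + 3) << 2) + y is rewritten as ((x << 2) + 12) + y.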
+    N0 = DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT,
+                     DAG.getNode(ISD::SHL, N00.getDebugLoc(), VT,
+                                 N00.getOperand(0), N01),
+                     DAG.getNode(ISD::SHL, N01.getDebugLoc(), VT,
+                                 N00.getOperand(1), N01));
+    return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitADD(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (add x, undef) -> undef
+  if (N0.getOpcode() == ISD::UNDEF)
+    return N0;
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+  // fold (add c1, c2) -> c1+c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::ADD, VT, N0C, N1C);
+  // canonicalize constant to RHS
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N1, N0);
+  // fold (add x, 0) -> x
+  if (N1C && N1C->isNullValue())
+    return N0;
+  // fold (add Sym, c) -> Sym+c
+  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
+    if (!LegalOperations && TLI.isOffsetFoldingLegal(GA) && N1C &&
+        GA->getOpcode() == ISD::GlobalAddress)
+      return DAG.getGlobalAddress(GA->getGlobal(), N1C->getDebugLoc(), VT,
+                                  GA->getOffset() +
+                                    (uint64_t)N1C->getSExtValue());
+  // fold ((c1-A)+c2) -> (c1+c2)-A
+  if (N1C && N0.getOpcode() == ISD::SUB)
+    if (ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getOperand(0)))
+      return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                         DAG.getConstant(N1C->getAPIntValue()+
+                                         N0C->getAPIntValue(), VT),
+                         N0.getOperand(1));
+  // reassociate add
+  SDValue RADD = ReassociateOps(ISD::ADD, N->getDebugLoc(), N0, N1);
+  if (RADD.getNode() != 0)
+    return RADD;
+  // fold ((0-A) + B) -> B-A
+  if (N0.getOpcode() == ISD::SUB && isa<ConstantSDNode>(N0.getOperand(0)) &&
+      cast<ConstantSDNode>(N0.getOperand(0))->isNullValue())
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1, N0.getOperand(1));
+  // fold (A + (0-B)) -> A-B
+  if (N1.getOpcode() == ISD::SUB && isa<ConstantSDNode>(N1.getOperand(0)) &&
+      cast<ConstantSDNode>(N1.getOperand(0))->isNullValue())
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, N1.getOperand(1));
+  // fold (A+(B-A)) -> B
+  if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
+    return N1.getOperand(0);
+  // fold ((B-A)+A) -> B
+  if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
+    return N0.getOperand(0);
+  // fold (A+(B-(A+C))) to (B-C)
+  if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
+      N0 == N1.getOperand(1).getOperand(0))
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1.getOperand(0),
+                       N1.getOperand(1).getOperand(1));
+  // fold (A+(B-(C+A))) to (B-C)
+  if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
+      N0 == N1.getOperand(1).getOperand(1))
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1.getOperand(0),
+                       N1.getOperand(1).getOperand(0));
+  // fold (A+((B-A)+or-C)) to (B+or-C)
+  if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) &&
+      N1.getOperand(0).getOpcode() == ISD::SUB &&
+      N0 == N1.getOperand(0).getOperand(1))
+    return DAG.getNode(N1.getOpcode(), N->getDebugLoc(), VT,
+                       N1.getOperand(0).getOperand(0), N1.getOperand(1));
+
+  // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
+  if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) {
+    SDValue N00 = N0.getOperand(0);
+    SDValue N01 = N0.getOperand(1);
+    SDValue N10 = N1.getOperand(0);
+    SDValue N11 = N1.getOperand(1);
+
+    if (isa<ConstantSDNode>(N00) || isa<ConstantSDNode>(N10))
+      return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                         DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT, N00, N10),
+                         DAG.getNode(ISD::ADD, N1.getDebugLoc(), VT, N01, N11));
+  }
+
+  if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  // fold (a+b) -> (a|b) iff a and b share no bits.
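+  // For example, (x & 0xF0) + (y & 0x0F) can become (x & 0xF0) | (y & 0x0F),
+  // since the known-zero bits of the two operands do not overlap.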
+  if (VT.isInteger() && !VT.isVector()) {
+    APInt LHSZero, LHSOne;
+    APInt RHSZero, RHSOne;
+    APInt Mask = APInt::getAllOnesValue(VT.getScalarType().getSizeInBits());
+    DAG.ComputeMaskedBits(N0, Mask, LHSZero, LHSOne);
+
+    if (LHSZero.getBoolValue()) {
+      DAG.ComputeMaskedBits(N1, Mask, RHSZero, RHSOne);
+
+      // If all possibly-set bits on the LHS are clear on the RHS, return an OR.
+      // If all possibly-set bits on the RHS are clear on the LHS, return an OR.
+      if ((RHSZero & (~LHSZero & Mask)) == (~LHSZero & Mask) ||
+          (LHSZero & (~RHSZero & Mask)) == (~RHSZero & Mask))
+        return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N1);
+    }
+  }
+
+  // fold (add (shl (add x, c1), c2), y) -> (add (add (shl x, c2), c1<<c2), y)
+  if (N0.getOpcode() == ISD::SHL && N0.getNode()->hasOneUse()) {
+    SDValue Result = combineShlAddConstant(N->getDebugLoc(), N0, N1, DAG);
+    if (Result.getNode()) return Result;
+  }
+  if (N1.getOpcode() == ISD::SHL && N1.getNode()->hasOneUse()) {
+    SDValue Result = combineShlAddConstant(N->getDebugLoc(), N1, N0, DAG);
+    if (Result.getNode()) return Result;
+  }
+
+  // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
+  if (N1.getOpcode() == ISD::SHL &&
+      N1.getOperand(0).getOpcode() == ISD::SUB)
+    if (ConstantSDNode *C =
+          dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(0)))
+      if (C->getAPIntValue() == 0)
+        return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0,
+                           DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+                                       N1.getOperand(0).getOperand(1),
+                                       N1.getOperand(1)));
+  if (N0.getOpcode() == ISD::SHL &&
+      N0.getOperand(0).getOpcode() == ISD::SUB)
+    if (ConstantSDNode *C =
+          dyn_cast<ConstantSDNode>(N0.getOperand(0).getOperand(0)))
+      if (C->getAPIntValue() == 0)
+        return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1,
+                           DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+                                       N0.getOperand(0).getOperand(1),
+                                       N0.getOperand(1)));
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitADDC(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // If the flag result is dead, turn this into an ADD.
+  if (N->hasNUsesOfValue(0, 1))
+    return CombineTo(N, DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N1, N0),
+                     DAG.getNode(ISD::CARRY_FALSE,
+                                 N->getDebugLoc(), MVT::Flag));
+
+  // canonicalize constant to RHS.
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::ADDC, N->getDebugLoc(), N->getVTList(), N1, N0);
+
+  // fold (addc x, 0) -> x + no carry out
+  if (N1C && N1C->isNullValue())
+    return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
+                                        N->getDebugLoc(), MVT::Flag));
+
+  // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits.
+  APInt LHSZero, LHSOne;
+  APInt RHSZero, RHSOne;
+  APInt Mask = APInt::getAllOnesValue(VT.getScalarType().getSizeInBits());
+  DAG.ComputeMaskedBits(N0, Mask, LHSZero, LHSOne);
+
+  if (LHSZero.getBoolValue()) {
+    DAG.ComputeMaskedBits(N1, Mask, RHSZero, RHSOne);
+
+    // If all possibly-set bits on the LHS are clear on the RHS, return an OR.
+    // If all possibly-set bits on the RHS are clear on the LHS, return an OR.
+    if ((RHSZero & (~LHSZero & Mask)) == (~LHSZero & Mask) ||
+        (LHSZero & (~RHSZero & Mask)) == (~RHSZero & Mask))
+      return CombineTo(N, DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N1),
+                       DAG.getNode(ISD::CARRY_FALSE,
+                                   N->getDebugLoc(), MVT::Flag));
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitADDE(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue CarryIn = N->getOperand(2);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+
+  // canonicalize constant to RHS
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::ADDE, N->getDebugLoc(), N->getVTList(),
+                       N1, N0, CarryIn);
+
+  // fold (adde x, y, false) -> (addc x, y)
+  if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
+    return DAG.getNode(ISD::ADDC, N->getDebugLoc(), N->getVTList(), N1, N0);
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSUB(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getNode());
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+  EVT VT = N0.getValueType();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (sub x, x) -> 0
+  if (N0 == N1)
+    return DAG.getConstant(0, N->getValueType(0));
+  // fold (sub c1, c2) -> c1-c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::SUB, VT, N0C, N1C);
+  // fold (sub x, c) -> (add x, -c)
+  if (N1C)
+    return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0,
+                       DAG.getConstant(-N1C->getAPIntValue(), VT));
+  // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
+  if (N0C && N0C->isAllOnesValue())
+    return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N1, N0);
+  // fold (A+B)-A -> B
+  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1)
+    return N0.getOperand(1);
+  // fold (A+B)-B -> A
+  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
+    return N0.getOperand(0);
+  // fold ((A+(B+or-C))-B) -> A+or-C
+  if (N0.getOpcode() == ISD::ADD &&
+      (N0.getOperand(1).getOpcode() == ISD::SUB ||
+       N0.getOperand(1).getOpcode() == ISD::ADD) &&
+      N0.getOperand(1).getOperand(0) == N1)
+    return DAG.getNode(N0.getOperand(1).getOpcode(), N->getDebugLoc(), VT,
+                       N0.getOperand(0), N0.getOperand(1).getOperand(1));
+  // fold ((A+(C+B))-B) -> A+C
+  if (N0.getOpcode() == ISD::ADD &&
+      N0.getOperand(1).getOpcode() == ISD::ADD &&
+      N0.getOperand(1).getOperand(1) == N1)
+    return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT,
+                       N0.getOperand(0), N0.getOperand(1).getOperand(0));
+  // fold ((A-(B-C))-C) -> A-B
+  if (N0.getOpcode() == ISD::SUB &&
+      N0.getOperand(1).getOpcode() == ISD::SUB &&
+      N0.getOperand(1).getOperand(1) == N1)
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                       N0.getOperand(0), N0.getOperand(1).getOperand(0));
+
+  // If either operand of a sub is undef, the result is undef
+  if (N0.getOpcode() == ISD::UNDEF)
+    return N0;
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+
+  // If the relocation model supports it, consider symbol offsets.
+  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
+    if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
+      // fold (sub Sym, c) -> Sym-c
+      if (N1C && GA->getOpcode() == ISD::GlobalAddress)
+        return DAG.getGlobalAddress(GA->getGlobal(), N1C->getDebugLoc(), VT,
+                                    GA->getOffset() -
+                                      (uint64_t)N1C->getSExtValue());
+      // fold (sub Sym+c1, Sym+c2) -> c1-c2
+      if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
+        if (GA->getGlobal() == GB->getGlobal())
+          return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(),
+                                 VT);
+    }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitMUL(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (mul x, undef) -> 0
+  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // fold (mul c1, c2) -> c1*c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::MUL, VT, N0C, N1C);
+  // canonicalize constant to RHS
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::MUL, N->getDebugLoc(), VT, N1, N0);
+  // fold (mul x, 0) -> 0
+  if (N1C && N1C->isNullValue())
+    return N1;
+  // fold (mul x, -1) -> 0-x
+  if (N1C && N1C->isAllOnesValue())
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                       DAG.getConstant(0, VT), N0);
+  // fold (mul x, (1 << c)) -> x << c
+  if (N1C && N1C->getAPIntValue().isPowerOf2())
+    return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0,
+                       DAG.getConstant(N1C->getAPIntValue().logBase2(),
+                                       getShiftAmountTy()));
+  // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
+  if (N1C && (-N1C->getAPIntValue()).isPowerOf2()) {
+    unsigned Log2Val = (-N1C->getAPIntValue()).logBase2();
+    // FIXME: If the input is something that is easily negated (e.g. a
+    // single-use add), we should put the negate there.
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                       DAG.getConstant(0, VT),
+                       DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0,
+                            DAG.getConstant(Log2Val, getShiftAmountTy())));
+  }
+  // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
+  if (N1C && N0.getOpcode() == ISD::SHL &&
+      isa<ConstantSDNode>(N0.getOperand(1))) {
+    SDValue C3 = DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+                             N1, N0.getOperand(1));
+    AddToWorkList(C3.getNode());
+    return DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+                       N0.getOperand(0), C3);
+  }
+
+  // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
+  // use.
+  {
+    SDValue Sh(0,0), Y(0,0);
+    // Check for both (mul (shl X, C), Y)  and  (mul Y, (shl X, C)).
+    if (N0.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N0.getOperand(1)) &&
+        N0.getNode()->hasOneUse()) {
+      Sh = N0; Y = N1;
+    } else if (N1.getOpcode() == ISD::SHL &&
+               isa<ConstantSDNode>(N1.getOperand(1)) &&
+               N1.getNode()->hasOneUse()) {
+      Sh = N1; Y = N0;
+    }
+
+    if (Sh.getNode()) {
+      SDValue Mul = DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+                                Sh.getOperand(0), Y);
+      return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+                         Mul, Sh.getOperand(1));
+    }
+  }
+
+  // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
+  if (N1C && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
+      isa<ConstantSDNode>(N0.getOperand(1)))
+    return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT,
+                       DAG.getNode(ISD::MUL, N0.getDebugLoc(), VT,
+                                   N0.getOperand(0), N1),
+                       DAG.getNode(ISD::MUL, N1.getDebugLoc(), VT,
+                                   N0.getOperand(1), N1));
+
+  // reassociate mul
+  SDValue RMUL = ReassociateOps(ISD::MUL, N->getDebugLoc(), N0, N1);
+  if (RMUL.getNode() != 0)
+    return RMUL;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSDIV(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getNode());
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+  EVT VT = N->getValueType(0);
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (sdiv c1, c2) -> c1/c2
+  if (N0C && N1C && !N1C->isNullValue())
+    return DAG.FoldConstantArithmetic(ISD::SDIV, VT, N0C, N1C);
+  // fold (sdiv X, 1) -> X
+  if (N1C && N1C->getSExtValue() == 1LL)
+    return N0;
+  // fold (sdiv X, -1) -> 0-X
+  if (N1C && N1C->isAllOnesValue())
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                       DAG.getConstant(0, VT), N0);
+  // If we know the sign bits of both operands are zero, strength reduce to a
+  // udiv instead.  Handles (X&15) /s 4 -> (X&15) >> 2
+  if (!VT.isVector()) {
+    if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
+      return DAG.getNode(ISD::UDIV, N->getDebugLoc(), N1.getValueType(),
+                         N0, N1);
+  }
+  // fold (sdiv X, pow2) -> simple ops after legalize
+  if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap() &&
+      (isPowerOf2_64(N1C->getSExtValue()) ||
+       isPowerOf2_64(-N1C->getSExtValue()))) {
+    // If dividing by powers of two is cheap, then don't perform the following
+    // fold.
+    if (TLI.isPow2DivCheap())
+      return SDValue();
+
+    int64_t pow2 = N1C->getSExtValue();
+    int64_t abs2 = pow2 > 0 ? pow2 : -pow2;
+    unsigned lg2 = Log2_64(abs2);
+
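+    // A sketch of the sequence for i32 and pow2 == 4 (lg2 == 2):
+    //   SGN = X >>s 31        // 0 or -1
+    //   SRL = SGN >>u 30      // 0 or 3, i.e. abs2 - 1
+    //   ADD = X + SRL         // bias negative values toward zero
+    //   SRA = ADD >>s 2       // the quotient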
+    // Splat the sign bit into the register
+    SDValue SGN = DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0,
+                              DAG.getConstant(VT.getSizeInBits()-1,
+                                              getShiftAmountTy()));
+    AddToWorkList(SGN.getNode());
+
+    // Add (N0 < 0) ? abs2 - 1 : 0;
+    SDValue SRL = DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, SGN,
+                              DAG.getConstant(VT.getSizeInBits() - lg2,
+                                              getShiftAmountTy()));
+    SDValue ADD = DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, SRL);
+    AddToWorkList(SRL.getNode());
+    AddToWorkList(ADD.getNode());    // Divide by pow2
+    SDValue SRA = DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, ADD,
+                              DAG.getConstant(lg2, getShiftAmountTy()));
+
+    // If we're dividing by a positive value, we're done.  Otherwise, we must
+    // negate the result.
+    if (pow2 > 0)
+      return SRA;
+
+    AddToWorkList(SRA.getNode());
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                       DAG.getConstant(0, VT), SRA);
+  }
+
+  // if integer divide is expensive and we satisfy the requirements, emit an
+  // alternate sequence.
+  if (N1C && (N1C->getSExtValue() < -1 || N1C->getSExtValue() > 1) &&
+      !TLI.isIntDivCheap()) {
+    SDValue Op = BuildSDIV(N);
+    if (Op.getNode()) return Op;
+  }
+
+  // undef / X -> 0
+  if (N0.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // X / undef -> undef
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitUDIV(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getNode());
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+  EVT VT = N->getValueType(0);
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (udiv c1, c2) -> c1/c2
+  if (N0C && N1C && !N1C->isNullValue())
+    return DAG.FoldConstantArithmetic(ISD::UDIV, VT, N0C, N1C);
+  // fold (udiv x, (1 << c)) -> x >>u c
+  if (N1C && N1C->getAPIntValue().isPowerOf2())
+    return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0,
+                       DAG.getConstant(N1C->getAPIntValue().logBase2(),
+                                       getShiftAmountTy()));
+  // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
+  if (N1.getOpcode() == ISD::SHL) {
+    if (ConstantSDNode *SHC = dyn_cast<ConstantSDNode>(N1.getOperand(0))) {
+      if (SHC->getAPIntValue().isPowerOf2()) {
+        EVT ADDVT = N1.getOperand(1).getValueType();
+        SDValue Add = DAG.getNode(ISD::ADD, N->getDebugLoc(), ADDVT,
+                                  N1.getOperand(1),
+                                  DAG.getConstant(SHC->getAPIntValue()
+                                                                  .logBase2(),
+                                                  ADDVT));
+        AddToWorkList(Add.getNode());
+        return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0, Add);
+      }
+    }
+  }
+  // fold (udiv x, c) -> alternate
+  if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap()) {
+    SDValue Op = BuildUDIV(N);
+    if (Op.getNode()) return Op;
+  }
+
+  // undef / X -> 0
+  if (N0.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // X / undef -> undef
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSREM(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  // fold (srem c1, c2) -> c1%c2
+  if (N0C && N1C && !N1C->isNullValue())
+    return DAG.FoldConstantArithmetic(ISD::SREM, VT, N0C, N1C);
+  // If we know the sign bits of both operands are zero, strength reduce to a
+  // urem instead.  Handles (X & 0x0FFFFFFF) %s 16 -> X&15
+  if (!VT.isVector()) {
+    if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
+      return DAG.getNode(ISD::UREM, N->getDebugLoc(), VT, N0, N1);
+  }
+
+  // If X/C can be simplified by the division-by-constant logic, lower
+  // X%C to the equivalent of X-X/C*C.
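+  // For example, X %s 7 becomes X - (X /s 7)*7 whenever the sdiv can itself
+  // be combined into something cheaper.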
+  if (N1C && !N1C->isNullValue()) {
+    SDValue Div = DAG.getNode(ISD::SDIV, N->getDebugLoc(), VT, N0, N1);
+    AddToWorkList(Div.getNode());
+    SDValue OptimizedDiv = combine(Div.getNode());
+    if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) {
+      SDValue Mul = DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+                                OptimizedDiv, N1);
+      SDValue Sub = DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, Mul);
+      AddToWorkList(Mul.getNode());
+      return Sub;
+    }
+  }
+
+  // undef % X -> 0
+  if (N0.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // X % undef -> undef
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitUREM(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  // fold (urem c1, c2) -> c1%c2
+  if (N0C && N1C && !N1C->isNullValue())
+    return DAG.FoldConstantArithmetic(ISD::UREM, VT, N0C, N1C);
+  // fold (urem x, pow2) -> (and x, pow2-1)
+  if (N1C && !N1C->isNullValue() && N1C->getAPIntValue().isPowerOf2())
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0,
+                       DAG.getConstant(N1C->getAPIntValue()-1,VT));
+  // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
+  if (N1.getOpcode() == ISD::SHL) {
+    if (ConstantSDNode *SHC = dyn_cast<ConstantSDNode>(N1.getOperand(0))) {
+      if (SHC->getAPIntValue().isPowerOf2()) {
+        SDValue Add =
+          DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N1,
+                 DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()),
+                                 VT));
+        AddToWorkList(Add.getNode());
+        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0, Add);
+      }
+    }
+  }
+
+  // If X/C can be simplified by the division-by-constant logic, lower
+  // X%C to the equivalent of X-X/C*C.
+  if (N1C && !N1C->isNullValue()) {
+    SDValue Div = DAG.getNode(ISD::UDIV, N->getDebugLoc(), VT, N0, N1);
+    AddToWorkList(Div.getNode());
+    SDValue OptimizedDiv = combine(Div.getNode());
+    if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) {
+      SDValue Mul = DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+                                OptimizedDiv, N1);
+      SDValue Sub = DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, Mul);
+      AddToWorkList(Mul.getNode());
+      return Sub;
+    }
+  }
+
+  // undef % X -> 0
+  if (N0.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // X % undef -> undef
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitMULHS(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  // fold (mulhs x, 0) -> 0
+  if (N1C && N1C->isNullValue())
+    return N1;
+  // fold (mulhs x, 1) -> (sra x, size(x)-1)
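+  // (the high half of x*1 is just x's sign bit replicated)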
+  if (N1C && N1C->getAPIntValue() == 1)
+    return DAG.getNode(ISD::SRA, N->getDebugLoc(), N0.getValueType(), N0,
+                       DAG.getConstant(N0.getValueType().getSizeInBits() - 1,
+                                       getShiftAmountTy()));
+  // fold (mulhs x, undef) -> 0
+  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitMULHU(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  // fold (mulhu x, 0) -> 0
+  if (N1C && N1C->isNullValue())
+    return N1;
+  // fold (mulhu x, 1) -> 0
+  if (N1C && N1C->getAPIntValue() == 1)
+    return DAG.getConstant(0, N0.getValueType());
+  // fold (mulhu x, undef) -> 0
+  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+
+  return SDValue();
+}
+
+/// SimplifyNodeWithTwoResults - Perform optimizations common to nodes that
+/// compute two values. LoOp and HiOp give the opcodes for the two computations
+/// that are being performed. Returns the simplified value if a simplification
+/// was made, or a null SDValue otherwise.
+///
+SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
+                                                unsigned HiOp) {
+  // If the high half is not needed, just compute the low half.
+  bool HiExists = N->hasAnyUseOfValue(1);
+  if (!HiExists &&
+      (!LegalOperations ||
+       TLI.isOperationLegal(LoOp, N->getValueType(0)))) {
+    SDValue Res = DAG.getNode(LoOp, N->getDebugLoc(), N->getValueType(0),
+                              N->op_begin(), N->getNumOperands());
+    return CombineTo(N, Res, Res);
+  }
+
+  // If the low half is not needed, just compute the high half.
+  bool LoExists = N->hasAnyUseOfValue(0);
+  if (!LoExists &&
+      (!LegalOperations ||
+       TLI.isOperationLegal(HiOp, N->getValueType(1)))) {
+    SDValue Res = DAG.getNode(HiOp, N->getDebugLoc(), N->getValueType(1),
+                              N->op_begin(), N->getNumOperands());
+    return CombineTo(N, Res, Res);
+  }
+
+  // If both halves are used, there is nothing to simplify.
+  if (LoExists && HiExists)
+    return SDValue();
+
+  // If the two computed results can be simplified separately, separate them.
+  if (LoExists) {
+    SDValue Lo = DAG.getNode(LoOp, N->getDebugLoc(), N->getValueType(0),
+                             N->op_begin(), N->getNumOperands());
+    AddToWorkList(Lo.getNode());
+    SDValue LoOpt = combine(Lo.getNode());
+    if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
+        (!LegalOperations ||
+         TLI.isOperationLegal(LoOpt.getOpcode(), LoOpt.getValueType())))
+      return CombineTo(N, LoOpt, LoOpt);
+  }
+
+  if (HiExists) {
+    SDValue Hi = DAG.getNode(HiOp, N->getDebugLoc(), N->getValueType(1),
+                             N->op_begin(), N->getNumOperands());
+    AddToWorkList(Hi.getNode());
+    SDValue HiOpt = combine(Hi.getNode());
+    if (HiOpt.getNode() && HiOpt != Hi &&
+        (!LegalOperations ||
+         TLI.isOperationLegal(HiOpt.getOpcode(), HiOpt.getValueType())))
+      return CombineTo(N, HiOpt, HiOpt);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
+  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS);
+  if (Res.getNode()) return Res;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
+  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU);
+  if (Res.getNode()) return Res;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSDIVREM(SDNode *N) {
+  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::SDIV, ISD::SREM);
+  if (Res.getNode()) return Res;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitUDIVREM(SDNode *N) {
+  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::UDIV, ISD::UREM);
+  if (Res.getNode()) return Res;
+
+  return SDValue();
+}
+
+/// SimplifyBinOpWithSameOpcodeHands - If this is a binary operator with
+/// two operands of the same opcode, try to simplify it.
+SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
+  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+  assert(N0.getOpcode() == N1.getOpcode() && "Bad input!");
+
+  // Bail early if none of these transforms apply.
+  if (N0.getNode()->getNumOperands() == 0) return SDValue();
+
+  // For each of OP in AND/OR/XOR:
+  // fold (OP (zext x), (zext y)) -> (zext (OP x, y))
+  // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
+  // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
+  // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
+  //
+  // do not sink logical op inside of a vector extend, since it may combine
+  // into a vsetcc.
+  EVT Op0VT = N0.getOperand(0).getValueType();
+  if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
+       N0.getOpcode() == ISD::SIGN_EXTEND ||
+       // Avoid infinite looping with PromoteIntBinOp.
+       (N0.getOpcode() == ISD::ANY_EXTEND &&
+        (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) ||
+       (N0.getOpcode() == ISD::TRUNCATE &&
+        (!TLI.isZExtFree(VT, Op0VT) ||
+         !TLI.isTruncateFree(Op0VT, VT)) &&
+        TLI.isTypeLegal(Op0VT))) &&
+      !VT.isVector() &&
+      Op0VT == N1.getOperand(0).getValueType() &&
+      (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) {
+    SDValue ORNode = DAG.getNode(N->getOpcode(), N0.getDebugLoc(),
+                                 N0.getOperand(0).getValueType(),
+                                 N0.getOperand(0), N1.getOperand(0));
+    AddToWorkList(ORNode.getNode());
+    return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, ORNode);
+  }
+
+  // For each of OP in SHL/SRL/SRA/AND...
+  //   fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z)
+  //   fold (or  (OP x, z), (OP y, z)) -> (OP (or  x, y), z)
+  //   fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z)
+  if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL ||
+       N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::AND) &&
+      N0.getOperand(1) == N1.getOperand(1)) {
+    SDValue ORNode = DAG.getNode(N->getOpcode(), N0.getDebugLoc(),
+                                 N0.getOperand(0).getValueType(),
+                                 N0.getOperand(0), N1.getOperand(0));
+    AddToWorkList(ORNode.getNode());
+    return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+                       ORNode, N0.getOperand(1));
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitAND(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue LL, LR, RL, RR, CC0, CC1;
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N1.getValueType();
+  unsigned BitWidth = VT.getScalarType().getSizeInBits();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (and x, undef) -> 0
+  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // fold (and c1, c2) -> c1&c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::AND, VT, N0C, N1C);
+  // canonicalize constant to RHS
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N1, N0);
+  // fold (and x, -1) -> x
+  if (N1C && N1C->isAllOnesValue())
+    return N0;
+  // if (and x, c) is known to be zero, return 0
+  if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
+                                   APInt::getAllOnesValue(BitWidth)))
+    return DAG.getConstant(0, VT);
+  // reassociate and
+  SDValue RAND = ReassociateOps(ISD::AND, N->getDebugLoc(), N0, N1);
+  if (RAND.getNode() != 0)
+    return RAND;
+  // fold (and (or x, C), D) -> D if (C & D) == D
+  if (N1C && N0.getOpcode() == ISD::OR)
+    if (ConstantSDNode *ORI = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
+      if ((ORI->getAPIntValue() & N1C->getAPIntValue()) == N1C->getAPIntValue())
+        return N1;
+  // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
+  if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
+    SDValue N0Op0 = N0.getOperand(0);
+    APInt Mask = ~N1C->getAPIntValue();
+    Mask.trunc(N0Op0.getValueSizeInBits());
+    if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
+      SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(),
+                                 N0.getValueType(), N0Op0);
+
+      // Replace uses of the AND with uses of the Zero extend node.
+      CombineTo(N, Zext);
+
+      // We actually want to replace all uses of the any_extend with the
+      // zero_extend, to avoid duplicating things.  This will later cause this
+      // AND to be folded.
+      CombineTo(N0.getNode(), Zext);
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+  // fold (and (setcc x), (setcc y)) -> (setcc (and x, y))
+  if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
+    ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
+    ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
+
+    if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 &&
+        LL.getValueType().isInteger()) {
+      // fold (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0)
+      if (cast<ConstantSDNode>(LR)->isNullValue() && Op1 == ISD::SETEQ) {
+        SDValue ORNode = DAG.getNode(ISD::OR, N0.getDebugLoc(),
+                                     LR.getValueType(), LL, RL);
+        AddToWorkList(ORNode.getNode());
+        return DAG.getSetCC(N->getDebugLoc(), VT, ORNode, LR, Op1);
+      }
+      // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1)
+      if (cast<ConstantSDNode>(LR)->isAllOnesValue() && Op1 == ISD::SETEQ) {
+        SDValue ANDNode = DAG.getNode(ISD::AND, N0.getDebugLoc(),
+                                      LR.getValueType(), LL, RL);
+        AddToWorkList(ANDNode.getNode());
+        return DAG.getSetCC(N->getDebugLoc(), VT, ANDNode, LR, Op1);
+      }
+      // fold (and (setgt X,  -1), (setgt Y,  -1)) -> (setgt (or X, Y), -1)
+      if (cast<ConstantSDNode>(LR)->isAllOnesValue() && Op1 == ISD::SETGT) {
+        SDValue ORNode = DAG.getNode(ISD::OR, N0.getDebugLoc(),
+                                     LR.getValueType(), LL, RL);
+        AddToWorkList(ORNode.getNode());
+        return DAG.getSetCC(N->getDebugLoc(), VT, ORNode, LR, Op1);
+      }
+    }
+    // canonicalize equivalent to ll == rl
+    if (LL == RR && LR == RL) {
+      Op1 = ISD::getSetCCSwappedOperands(Op1);
+      std::swap(RL, RR);
+    }
+    if (LL == RL && LR == RR) {
+      bool isInteger = LL.getValueType().isInteger();
+      ISD::CondCode Result = ISD::getSetCCAndOperation(Op0, Op1, isInteger);
+      if (Result != ISD::SETCC_INVALID &&
+          (!LegalOperations || TLI.isCondCodeLegal(Result, LL.getValueType())))
+        return DAG.getSetCC(N->getDebugLoc(), N0.getValueType(),
+                            LL, LR, Result);
+    }
+  }
+
+  // Simplify: (and (op x...), (op y...))  -> (op (and x, y))
+  if (N0.getOpcode() == N1.getOpcode()) {
+    SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
+    if (Tmp.getNode()) return Tmp;
+  }
+
+  // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
+  // fold (and (sra)) -> (and (srl)) when possible.
+  if (!VT.isVector() &&
+      SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  // fold (zext_inreg (extload x)) -> (zextload x)
+  if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode())) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    EVT MemVT = LN0->getMemoryVT();
+    // If we zero all the possible extended bits, then we can turn this into
+    // a zextload if we are running before legalize or the operation is legal.
+    unsigned BitWidth = N1.getValueType().getScalarType().getSizeInBits();
+    if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
+                           BitWidth - MemVT.getScalarType().getSizeInBits())) &&
+        ((!LegalOperations && !LN0->isVolatile()) ||
+         TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
+      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, VT, N0.getDebugLoc(),
+                                       LN0->getChain(), LN0->getBasePtr(),
+                                       LN0->getSrcValue(),
+                                       LN0->getSrcValueOffset(), MemVT,
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      AddToWorkList(N);
+      CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+  // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use
+  if (ISD::isSEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      N0.hasOneUse()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    EVT MemVT = LN0->getMemoryVT();
+    // If we zero all the possible extended bits, then we can turn this into
+    // a zextload if we are running before legalize or the operation is legal.
+    unsigned BitWidth = N1.getValueType().getScalarType().getSizeInBits();
+    if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
+                           BitWidth - MemVT.getScalarType().getSizeInBits())) &&
+        ((!LegalOperations && !LN0->isVolatile()) ||
+         TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
+      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, VT, N0.getDebugLoc(),
+                                       LN0->getChain(),
+                                       LN0->getBasePtr(), LN0->getSrcValue(),
+                                       LN0->getSrcValueOffset(), MemVT,
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      AddToWorkList(N);
+      CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+
+  // fold (and (load x), 255) -> (zextload x, i8)
+  // fold (and (extload x, i16), 255) -> (zextload x, i8)
+  // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
+  if (N1C && (N0.getOpcode() == ISD::LOAD ||
+              (N0.getOpcode() == ISD::ANY_EXTEND &&
+               N0.getOperand(0).getOpcode() == ISD::LOAD))) {
+    bool HasAnyExt = N0.getOpcode() == ISD::ANY_EXTEND;
+    LoadSDNode *LN0 = HasAnyExt
+      ? cast<LoadSDNode>(N0.getOperand(0))
+      : cast<LoadSDNode>(N0);
+    if (LN0->getExtensionType() != ISD::SEXTLOAD &&
+        LN0->isUnindexed() && N0.hasOneUse() && LN0->hasOneUse()) {
+      uint32_t ActiveBits = N1C->getAPIntValue().getActiveBits();
+      if (ActiveBits > 0 && APIntOps::isMask(ActiveBits, N1C->getAPIntValue())){
+        EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+        EVT LoadedVT = LN0->getMemoryVT();
+
+        if (ExtVT == LoadedVT &&
+            (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) {
+          EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
+          
+          SDValue NewLoad = 
+            DAG.getExtLoad(ISD::ZEXTLOAD, LoadResultTy, LN0->getDebugLoc(),
+                           LN0->getChain(), LN0->getBasePtr(),
+                           LN0->getSrcValue(), LN0->getSrcValueOffset(),
+                           ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
+                           LN0->getAlignment());
+          AddToWorkList(N);
+          CombineTo(LN0, NewLoad, NewLoad.getValue(1));
+          return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+        }
+        
+        // Do not change the width of a volatile load.
+        // Do not generate loads of non-round integer types since these can
+        // be expensive (and would be wrong if the type is not byte sized).
+        if (!LN0->isVolatile() && LoadedVT.bitsGT(ExtVT) && ExtVT.isRound() &&
+            (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) {
+          EVT PtrType = LN0->getOperand(1).getValueType();
+
+          unsigned Alignment = LN0->getAlignment();
+          SDValue NewPtr = LN0->getBasePtr();
+
+          // For big endian targets, we need to add an offset to the pointer
+          // to load the correct bytes.  For little endian systems, we merely
+          // need to read fewer bytes from the same pointer.
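+          // For example, narrowing a big-endian i32 load to i8 requires
+          // advancing the pointer by 4 - 1 = 3 bytes.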
+          if (TLI.isBigEndian()) {
+            unsigned LVTStoreBytes = LoadedVT.getStoreSize();
+            unsigned EVTStoreBytes = ExtVT.getStoreSize();
+            unsigned PtrOff = LVTStoreBytes - EVTStoreBytes;
+            NewPtr = DAG.getNode(ISD::ADD, LN0->getDebugLoc(), PtrType,
+                                 NewPtr, DAG.getConstant(PtrOff, PtrType));
+            Alignment = MinAlign(Alignment, PtrOff);
+          }
+
+          AddToWorkList(NewPtr.getNode());
+          
+          EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
+          SDValue Load =
+            DAG.getExtLoad(ISD::ZEXTLOAD, LoadResultTy, LN0->getDebugLoc(),
+                           LN0->getChain(), NewPtr,
+                           LN0->getSrcValue(), LN0->getSrcValueOffset(),
+                           ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
+                           Alignment);
+          AddToWorkList(N);
+          CombineTo(LN0, Load, Load.getValue(1));
+          return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitOR(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue LL, LR, RL, RR, CC0, CC1;
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N1.getValueType();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (or x, undef) -> -1
+  if (!LegalOperations &&
+      (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)) {
+    EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
+    return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT);
+  }
+  // fold (or c1, c2) -> c1|c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::OR, VT, N0C, N1C);
+  // canonicalize constant to RHS
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N1, N0);
+  // fold (or x, 0) -> x
+  if (N1C && N1C->isNullValue())
+    return N0;
+  // fold (or x, -1) -> -1
+  if (N1C && N1C->isAllOnesValue())
+    return N1;
+  // fold (or x, c) -> c iff (x & ~c) == 0
+  if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
+    return N1;
+  // reassociate or
+  SDValue ROR = ReassociateOps(ISD::OR, N->getDebugLoc(), N0, N1);
+  if (ROR.getNode() != 0)
+    return ROR;
+  // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
+  // iff (c1 & c2) != 0.
+  if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
+             isa<ConstantSDNode>(N0.getOperand(1))) {
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1));
+    if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0)
+      return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                         DAG.getNode(ISD::OR, N0.getDebugLoc(), VT,
+                                     N0.getOperand(0), N1),
+                         DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1));
+  }
+  // fold (or (setcc x), (setcc y)) -> (setcc (or x, y))
+  if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
+    ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
+    ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
+
+    if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 &&
+        LL.getValueType().isInteger()) {
+      // fold (or (setne X, 0), (setne Y, 0)) -> (setne (or X, Y), 0)
+      // fold (or (setlt X, 0), (setlt Y, 0)) -> (setne (or X, Y), 0)
+      if (cast<ConstantSDNode>(LR)->isNullValue() &&
+          (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) {
+        SDValue ORNode = DAG.getNode(ISD::OR, LR.getDebugLoc(),
+                                     LR.getValueType(), LL, RL);
+        AddToWorkList(ORNode.getNode());
+        return DAG.getSetCC(N->getDebugLoc(), VT, ORNode, LR, Op1);
+      }
+      // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1)
+      // fold (or (setgt X, -1), (setgt Y, -1)) -> (setgt (and X, Y), -1)
+      if (cast<ConstantSDNode>(LR)->isAllOnesValue() &&
+          (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) {
+        SDValue ANDNode = DAG.getNode(ISD::AND, LR.getDebugLoc(),
+                                      LR.getValueType(), LL, RL);
+        AddToWorkList(ANDNode.getNode());
+        return DAG.getSetCC(N->getDebugLoc(), VT, ANDNode, LR, Op1);
+      }
+    }
+    // canonicalize equivalent to ll == rl
+    if (LL == RR && LR == RL) {
+      Op1 = ISD::getSetCCSwappedOperands(Op1);
+      std::swap(RL, RR);
+    }
+    if (LL == RL && LR == RR) {
+      bool isInteger = LL.getValueType().isInteger();
+      ISD::CondCode Result = ISD::getSetCCOrOperation(Op0, Op1, isInteger);
+      if (Result != ISD::SETCC_INVALID &&
+          (!LegalOperations || TLI.isCondCodeLegal(Result, LL.getValueType())))
+        return DAG.getSetCC(N->getDebugLoc(), N0.getValueType(),
+                            LL, LR, Result);
+    }
+  }
+
+  // Simplify: (or (op x...), (op y...))  -> (op (or x, y))
+  if (N0.getOpcode() == N1.getOpcode()) {
+    SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
+    if (Tmp.getNode()) return Tmp;
+  }
+
+  // (or (and X, C1), (and Y, C2))  -> (and (or X, Y), C3) if possible.
+  if (N0.getOpcode() == ISD::AND &&
+      N1.getOpcode() == ISD::AND &&
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      N1.getOperand(1).getOpcode() == ISD::Constant &&
+      // Don't increase # computations.
+      (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
+    // We can only do this xform if we know that bits from X that are set in C2
+    // but not in C1 are already zero.  Likewise for Y.
+    const APInt &LHSMask =
+      cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+    const APInt &RHSMask =
+      cast<ConstantSDNode>(N1.getOperand(1))->getAPIntValue();
+
+    if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) &&
+        DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
+      SDValue X = DAG.getNode(ISD::OR, N0.getDebugLoc(), VT,
+                              N0.getOperand(0), N1.getOperand(0));
+      return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, X,
+                         DAG.getConstant(LHSMask | RHSMask, VT));
+    }
+  }
+
+  // See if this is some rotate idiom.
+  if (SDNode *Rot = MatchRotate(N0, N1, N->getDebugLoc()))
+    return SDValue(Rot, 0);
+
+  // Simplify the operands using demanded-bits information.
+  if (!VT.isVector() &&
+      SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
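+// Worked example for the (or (and X, C1), (and Y, C2)) fold above
+// (illustrative i32 operands): with C1 = 0xFF00 and C2 = 0x00FF,
+//
+//   (or (and (shl a, 8), 0xFF00), (and (srl b, 24), 0x00FF))
+//     -> (and (or (shl a, 8), (srl b, 24)), 0xFFFF)
+//
+// MaskedValueIsZero proves (shl a, 8) has no bits set in 0x00FF & ~0xFF00 and
+// (srl b, 24) has no bits set in 0xFF00 & ~0x00FF, so merging the masks into
+// C3 = C1|C2 after a single OR is safe.
+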
+/// MatchRotateHalf - Match "(X shl/srl V1) & V2" where V2 may not be present.
+static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
+  if (Op.getOpcode() == ISD::AND) {
+    if (isa<ConstantSDNode>(Op.getOperand(1))) {
+      Mask = Op.getOperand(1);
+      Op = Op.getOperand(0);
+    } else {
+      return false;
+    }
+  }
+
+  if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) {
+    Shift = Op;
+    return true;
+  }
+
+  return false;
+}
+
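+// For example (illustrative), MatchRotateHalf returns Shift = (srl x, 24) and
+// Mask = 0x00FF for (and (srl x, 24), 0x00FF), and matches a bare (shl x, 8)
+// with Shift = (shl x, 8) and Mask left unset.
+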
+// MatchRotate - Handle an 'or' of two operands.  If this is one of the many
+// idioms for rotate, and if the target supports rotation instructions, generate
+// a rot[lr].
+SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL) {
+  // Must be a legal type.  Expanded and promoted types won't work with rotates.
+  EVT VT = LHS.getValueType();
+  if (!TLI.isTypeLegal(VT)) return 0;
+
+  // The target must have at least one rotate flavor.
+  bool HasROTL = TLI.isOperationLegalOrCustom(ISD::ROTL, VT);
+  bool HasROTR = TLI.isOperationLegalOrCustom(ISD::ROTR, VT);
+  if (!HasROTL && !HasROTR) return 0;
+
+  // Match "(X shl/srl V1) & V2" where V2 may not be present.
+  SDValue LHSShift;   // The shift.
+  SDValue LHSMask;    // AND value if any.
+  if (!MatchRotateHalf(LHS, LHSShift, LHSMask))
+    return 0; // Not part of a rotate.
+
+  SDValue RHSShift;   // The shift.
+  SDValue RHSMask;    // AND value if any.
+  if (!MatchRotateHalf(RHS, RHSShift, RHSMask))
+    return 0; // Not part of a rotate.
+
+  if (LHSShift.getOperand(0) != RHSShift.getOperand(0))
+    return 0;   // Not shifting the same value.
+
+  if (LHSShift.getOpcode() == RHSShift.getOpcode())
+    return 0;   // Shifts must disagree.
+
+  // Canonicalize shl to left side in a shl/srl pair.
+  if (RHSShift.getOpcode() == ISD::SHL) {
+    std::swap(LHS, RHS);
+    std::swap(LHSShift, RHSShift);
+    std::swap(LHSMask , RHSMask );
+  }
+
+  unsigned OpSizeInBits = VT.getSizeInBits();
+  SDValue LHSShiftArg = LHSShift.getOperand(0);
+  SDValue LHSShiftAmt = LHSShift.getOperand(1);
+  SDValue RHSShiftAmt = RHSShift.getOperand(1);
+
+  // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
+  // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
+  if (LHSShiftAmt.getOpcode() == ISD::Constant &&
+      RHSShiftAmt.getOpcode() == ISD::Constant) {
+    uint64_t LShVal = cast<ConstantSDNode>(LHSShiftAmt)->getZExtValue();
+    uint64_t RShVal = cast<ConstantSDNode>(RHSShiftAmt)->getZExtValue();
+    if ((LShVal + RShVal) != OpSizeInBits)
+      return 0;
+
+    SDValue Rot;
+    if (HasROTL)
+      Rot = DAG.getNode(ISD::ROTL, DL, VT, LHSShiftArg, LHSShiftAmt);
+    else
+      Rot = DAG.getNode(ISD::ROTR, DL, VT, LHSShiftArg, RHSShiftAmt);
+
+    // If there is an AND of either shifted operand, apply it to the result.
+    if (LHSMask.getNode() || RHSMask.getNode()) {
+      APInt Mask = APInt::getAllOnesValue(OpSizeInBits);
+
+      if (LHSMask.getNode()) {
+        APInt RHSBits = APInt::getLowBitsSet(OpSizeInBits, LShVal);
+        Mask &= cast<ConstantSDNode>(LHSMask)->getAPIntValue() | RHSBits;
+      }
+      if (RHSMask.getNode()) {
+        APInt LHSBits = APInt::getHighBitsSet(OpSizeInBits, RShVal);
+        Mask &= cast<ConstantSDNode>(RHSMask)->getAPIntValue() | LHSBits;
+      }
+
+      Rot = DAG.getNode(ISD::AND, DL, VT, Rot, DAG.getConstant(Mask, VT));
+    }
+
+    return Rot.getNode();
+  }
+
+  // If there is a mask here, and we have a variable shift, we can't be sure
+  // that we're masking out the right stuff.
+  if (LHSMask.getNode() || RHSMask.getNode())
+    return 0;
+
+  // fold (or (shl x, y), (srl x, (sub 32, y))) -> (rotl x, y)
+  // fold (or (shl x, y), (srl x, (sub 32, y))) -> (rotr x, (sub 32, y))
+  if (RHSShiftAmt.getOpcode() == ISD::SUB &&
+      LHSShiftAmt == RHSShiftAmt.getOperand(1)) {
+    if (ConstantSDNode *SUBC =
+          dyn_cast<ConstantSDNode>(RHSShiftAmt.getOperand(0))) {
+      if (SUBC->getAPIntValue() == OpSizeInBits) {
+        if (HasROTL)
+          return DAG.getNode(ISD::ROTL, DL, VT,
+                             LHSShiftArg, LHSShiftAmt).getNode();
+        else
+          return DAG.getNode(ISD::ROTR, DL, VT,
+                             LHSShiftArg, RHSShiftAmt).getNode();
+      }
+    }
+  }
+
+  // fold (or (shl x, (sub 32, y)), (srl x, y)) -> (rotr x, y)
+  // fold (or (shl x, (sub 32, y)), (srl x, y)) -> (rotl x, (sub 32, y))
+  if (LHSShiftAmt.getOpcode() == ISD::SUB &&
+      RHSShiftAmt == LHSShiftAmt.getOperand(1)) {
+    if (ConstantSDNode *SUBC =
+          dyn_cast<ConstantSDNode>(LHSShiftAmt.getOperand(0))) {
+      if (SUBC->getAPIntValue() == OpSizeInBits) {
+        if (HasROTR)
+          return DAG.getNode(ISD::ROTR, DL, VT,
+                             LHSShiftArg, RHSShiftAmt).getNode();
+        else
+          return DAG.getNode(ISD::ROTL, DL, VT,
+                             LHSShiftArg, LHSShiftAmt).getNode();
+      }
+    }
+  }
+
+  // Look for sign/zext/any-extended or truncate cases:
+  if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND
+       || LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND
+       || LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND
+       || LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
+      (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND
+       || RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND
+       || RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND
+       || RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
+    SDValue LExtOp0 = LHSShiftAmt.getOperand(0);
+    SDValue RExtOp0 = RHSShiftAmt.getOperand(0);
+    if (RExtOp0.getOpcode() == ISD::SUB &&
+        RExtOp0.getOperand(1) == LExtOp0) {
+      // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
+      //   (rotl x, y)
+      // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
+      //   (rotr x, (sub 32, y))
+      if (ConstantSDNode *SUBC =
+            dyn_cast<ConstantSDNode>(RExtOp0.getOperand(0))) {
+        if (SUBC->getAPIntValue() == OpSizeInBits) {
+          return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
+                             LHSShiftArg,
+                             HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
+        }
+      }
+    } else if (LExtOp0.getOpcode() == ISD::SUB &&
+               RExtOp0 == LExtOp0.getOperand(1)) {
+      // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
+      //   (rotr x, y)
+      // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
+      //   (rotl x, (sub 32, y))
+      if (ConstantSDNode *SUBC =
+            dyn_cast<ConstantSDNode>(LExtOp0.getOperand(0))) {
+        if (SUBC->getAPIntValue() == OpSizeInBits) {
+          return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT,
+                             LHSShiftArg,
+                             HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+
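+// Concrete instances of the rotate matching above (illustrative, i32):
+//
+//   (or (shl x, 8), (srl x, 24))           -> (rotl x, 8)  [or (rotr x, 24)]
+//   (or (shl x, y), (srl x, (sub 32, y)))  -> (rotl x, y)
+//
+// In the constant case the two shift amounts must sum to exactly 32; in the
+// variable case the (sub 32, y) operand must be matched structurally, since
+// nothing here proves y is in range.
+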
+SDValue DAGCombiner::visitXOR(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue LHS, RHS, CC;
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (xor undef, undef) -> 0. This is a common idiom (misuse).
+  if (N0.getOpcode() == ISD::UNDEF && N1.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // fold (xor x, undef) -> undef
+  if (N0.getOpcode() == ISD::UNDEF)
+    return N0;
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+  // fold (xor c1, c2) -> c1^c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::XOR, VT, N0C, N1C);
+  // canonicalize constant to RHS
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N1, N0);
+  // fold (xor x, 0) -> x
+  if (N1C && N1C->isNullValue())
+    return N0;
+  // reassociate xor
+  SDValue RXOR = ReassociateOps(ISD::XOR, N->getDebugLoc(), N0, N1);
+  if (RXOR.getNode() != 0)
+    return RXOR;
+
+  // fold !(x cc y) -> (x !cc y)
+  if (N1C && N1C->getAPIntValue() == 1 && isSetCCEquivalent(N0, LHS, RHS, CC)) {
+    bool isInt = LHS.getValueType().isInteger();
+    ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+                                               isInt);
+
+    if (!LegalOperations || TLI.isCondCodeLegal(NotCC, LHS.getValueType())) {
+      switch (N0.getOpcode()) {
+      default:
+        llvm_unreachable("Unhandled SetCC Equivalent!");
+      case ISD::SETCC:
+        return DAG.getSetCC(N->getDebugLoc(), VT, LHS, RHS, NotCC);
+      case ISD::SELECT_CC:
+        return DAG.getSelectCC(N->getDebugLoc(), LHS, RHS, N0.getOperand(2),
+                               N0.getOperand(3), NotCC);
+      }
+    }
+  }
+
+  // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
+  if (N1C && N1C->getAPIntValue() == 1 && N0.getOpcode() == ISD::ZERO_EXTEND &&
+      N0.getNode()->hasOneUse() &&
+      isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){
+    SDValue V = N0.getOperand(0);
+    V = DAG.getNode(ISD::XOR, N0.getDebugLoc(), V.getValueType(), V,
+                    DAG.getConstant(1, V.getValueType()));
+    AddToWorkList(V.getNode());
+    return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, V);
+  }
+
+  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
+  if (N1C && N1C->getAPIntValue() == 1 && VT == MVT::i1 &&
+      (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
+    SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
+    if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) {
+      unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
+      LHS = DAG.getNode(ISD::XOR, LHS.getDebugLoc(), VT, LHS, N1); // LHS = ~LHS
+      RHS = DAG.getNode(ISD::XOR, RHS.getDebugLoc(), VT, RHS, N1); // RHS = ~RHS
+      AddToWorkList(LHS.getNode()); AddToWorkList(RHS.getNode());
+      return DAG.getNode(NewOpcode, N->getDebugLoc(), VT, LHS, RHS);
+    }
+  }
+  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
+  if (N1C && N1C->isAllOnesValue() &&
+      (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
+    SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
+    if (isa<ConstantSDNode>(RHS) || isa<ConstantSDNode>(LHS)) {
+      unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
+      LHS = DAG.getNode(ISD::XOR, LHS.getDebugLoc(), VT, LHS, N1); // LHS = ~LHS
+      RHS = DAG.getNode(ISD::XOR, RHS.getDebugLoc(), VT, RHS, N1); // RHS = ~RHS
+      AddToWorkList(LHS.getNode()); AddToWorkList(RHS.getNode());
+      return DAG.getNode(NewOpcode, N->getDebugLoc(), VT, LHS, RHS);
+    }
+  }
+  // fold (xor (xor x, c1), c2) -> (xor x, (xor c1, c2))
+  if (N1C && N0.getOpcode() == ISD::XOR) {
+    ConstantSDNode *N00C = dyn_cast<ConstantSDNode>(N0.getOperand(0));
+    ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+    if (N00C)
+      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N0.getOperand(1),
+                         DAG.getConstant(N1C->getAPIntValue() ^
+                                         N00C->getAPIntValue(), VT));
+    if (N01C)
+      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N0.getOperand(0),
+                         DAG.getConstant(N1C->getAPIntValue() ^
+                                         N01C->getAPIntValue(), VT));
+  }
+  // fold (xor x, x) -> 0
+  if (N0 == N1) {
+    if (!VT.isVector()) {
+      return DAG.getConstant(0, VT);
+    } else if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)){
+      // Produce a vector of zeros.
+      SDValue El = DAG.getConstant(0, VT.getVectorElementType());
+      std::vector<SDValue> Ops(VT.getVectorNumElements(), El);
+      return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
+                         &Ops[0], Ops.size());
+    }
+  }
+
+  // Simplify: xor (op x...), (op y...)  -> (op (xor x, y))
+  if (N0.getOpcode() == N1.getOpcode()) {
+    SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
+    if (Tmp.getNode()) return Tmp;
+  }
+
+  // Simplify the expression using non-local knowledge.
+  if (!VT.isVector() &&
+      SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
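+// Worked example of the De Morgan folds above (illustrative, i1): for
+// a = (setcc ...) with one use,
+//
+//   (xor (or a, b), 1)  ->  (and (xor a, 1), (xor b, 1))
+//
+// i.e. !(a || b) becomes !a && !b, after which the inverted setcc can be
+// simplified further by the !(x cc y) -> (x !cc y) fold.
+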
+/// visitShiftByConstant - Handle transforms common to the three shifts, when
+/// the shift amount is a constant.
+SDValue DAGCombiner::visitShiftByConstant(SDNode *N, unsigned Amt) {
+  SDNode *LHS = N->getOperand(0).getNode();
+  if (!LHS->hasOneUse()) return SDValue();
+
+  // We want to pull some binops through shifts, so that we have (and (shift))
+  // instead of (shift (and)), likewise for add, or, xor, etc.  This sort of
+  // thing happens with address calculations, so it's important to canonicalize
+  // it.
+  bool HighBitSet = false;  // Can we transform this if the high bit is set?
+
+  switch (LHS->getOpcode()) {
+  default: return SDValue();
+  case ISD::OR:
+  case ISD::XOR:
+    HighBitSet = false; // We can only transform sra if the high bit is clear.
+    break;
+  case ISD::AND:
+    HighBitSet = true;  // We can only transform sra if the high bit is set.
+    break;
+  case ISD::ADD:
+    if (N->getOpcode() != ISD::SHL)
+      return SDValue(); // only shl(add) not sr[al](add).
+    HighBitSet = false; // We can only transform sra if the high bit is clear.
+    break;
+  }
+
+  // We require the RHS of the binop to be a constant as well.
+  ConstantSDNode *BinOpCst = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+  if (!BinOpCst) return SDValue();
+
+  // FIXME: disable this unless the input to the binop is a shift by a constant.
+  // If it is not a shift, it pessimizes some common cases like:
+  //
+  //    void foo(int *X, int i) { X[i & 1235] = 1; }
+  //    int bar(int *X, int i) { return X[i & 255]; }
+  SDNode *BinOpLHSVal = LHS->getOperand(0).getNode();
+  if ((BinOpLHSVal->getOpcode() != ISD::SHL &&
+       BinOpLHSVal->getOpcode() != ISD::SRA &&
+       BinOpLHSVal->getOpcode() != ISD::SRL) ||
+      !isa<ConstantSDNode>(BinOpLHSVal->getOperand(1)))
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  // If this is a signed shift right, and the high bit is modified by the
+  // logical operation, do not perform the transformation. The HighBitSet
+  // boolean indicates the value of the high bit of the constant which would
+  // cause it to be modified for this operation.
+  if (N->getOpcode() == ISD::SRA) {
+    bool BinOpRHSSignSet = BinOpCst->getAPIntValue().isNegative();
+    if (BinOpRHSSignSet != HighBitSet)
+      return SDValue();
+  }
+
+  // Fold the constants, shifting the binop RHS by the shift amount.
+  SDValue NewRHS = DAG.getNode(N->getOpcode(), LHS->getOperand(1).getDebugLoc(),
+                               N->getValueType(0),
+                               LHS->getOperand(1), N->getOperand(1));
+
+  // Create the new shift.
+  SDValue NewShift = DAG.getNode(N->getOpcode(), LHS->getOperand(0).getDebugLoc(),
+                                 VT, LHS->getOperand(0), N->getOperand(1));
+
+  // Create the new binop.
+  return DAG.getNode(LHS->getOpcode(), N->getDebugLoc(), VT, NewShift, NewRHS);
+}
+
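+// Worked example for visitShiftByConstant (illustrative): with the binop fed
+// by a shift-by-constant, as required above,
+//
+//   (shl (or (srl x, 4), 7), 2)  ->  (or (shl (srl x, 4), 2), 28)
+//
+// The binop constant 7 is shifted left by 2 to give 28, and the OR is pulled
+// out of the shift, producing the (op (shift)) form addressing modes prefer.
+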
+SDValue DAGCombiner::visitSHL(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+  unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
+
+  // fold (shl c1, c2) -> c1<<c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::SHL, VT, N0C, N1C);
+  // fold (shl 0, x) -> 0
+  if (N0C && N0C->isNullValue())
+    return N0;
+  // fold (shl x, c >= size(x)) -> undef
+  if (N1C && N1C->getZExtValue() >= OpSizeInBits)
+    return DAG.getUNDEF(VT);
+  // fold (shl x, 0) -> x
+  if (N1C && N1C->isNullValue())
+    return N0;
+  // if (shl x, c) is known to be zero, return 0
+  if (DAG.MaskedValueIsZero(SDValue(N, 0),
+                            APInt::getAllOnesValue(OpSizeInBits)))
+    return DAG.getConstant(0, VT);
+  // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
+  if (N1.getOpcode() == ISD::TRUNCATE &&
+      N1.getOperand(0).getOpcode() == ISD::AND &&
+      N1.hasOneUse() && N1.getOperand(0).hasOneUse()) {
+    SDValue N101 = N1.getOperand(0).getOperand(1);
+    if (ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N101)) {
+      EVT TruncVT = N1.getValueType();
+      SDValue N100 = N1.getOperand(0).getOperand(0);
+      APInt TruncC = N101C->getAPIntValue();
+      TruncC.trunc(TruncVT.getSizeInBits());
+      return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0,
+                         DAG.getNode(ISD::AND, N->getDebugLoc(), TruncVT,
+                                     DAG.getNode(ISD::TRUNCATE,
+                                                 N->getDebugLoc(),
+                                                 TruncVT, N100),
+                                     DAG.getConstant(TruncC, TruncVT)));
+    }
+  }
+
+  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
+  if (N1C && N0.getOpcode() == ISD::SHL &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    uint64_t c1 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+    uint64_t c2 = N1C->getZExtValue();
+    if (c1 + c2 >= OpSizeInBits)
+      return DAG.getConstant(0, VT);
+    return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0.getOperand(0),
+                       DAG.getConstant(c1 + c2, N1.getValueType()));
+  }
+  // fold (shl (srl x, c1), c2) -> (shl (and x, (shl -1, c1)), (sub c2, c1)) or
+  //                               (srl (and x, (shl -1, c1)), (sub c1, c2))
+  if (N1C && N0.getOpcode() == ISD::SRL &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    uint64_t c1 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+    if (c1 < VT.getSizeInBits()) {
+      uint64_t c2 = N1C->getZExtValue();
+      SDValue HiBitsMask =
+        DAG.getConstant(APInt::getHighBitsSet(VT.getSizeInBits(),
+                                              VT.getSizeInBits() - c1),
+                        VT);
+      SDValue Mask = DAG.getNode(ISD::AND, N0.getDebugLoc(), VT,
+                                 N0.getOperand(0),
+                                 HiBitsMask);
+      if (c2 > c1)
+        return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, Mask,
+                           DAG.getConstant(c2-c1, N1.getValueType()));
+      else
+        return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, Mask,
+                           DAG.getConstant(c1-c2, N1.getValueType()));
+    }
+  }
+  // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
+  if (N1C && N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1)) {
+    SDValue HiBitsMask =
+      DAG.getConstant(APInt::getHighBitsSet(VT.getSizeInBits(),
+                                            VT.getSizeInBits() -
+                                              N1C->getZExtValue()),
+                      VT);
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0.getOperand(0),
+                       HiBitsMask);
+  }
+
+  if (N1C) {
+    SDValue NewSHL = visitShiftByConstant(N, N1C->getZExtValue());
+    if (NewSHL.getNode())
+      return NewSHL;
+  }
+
+  return SDValue();
+}
+
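+// Worked examples for the visitSHL folds above (illustrative, i32):
+//
+//   (shl (shl x, 10), 3)  ->  (shl x, 13)
+//   (shl (srl x, 8), 4)   ->  (srl (and x, 0xFFFFFF00), 4)
+//   (shl (srl x, 4), 8)   ->  (shl (and x, 0xFFFFFFF0), 4)
+//   (shl (sra x, 5), 5)   ->  (and x, 0xFFFFFFE0)
+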
+SDValue DAGCombiner::visitSRA(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+  unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
+
+  // fold (sra c1, c2) -> c1 >>s c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::SRA, VT, N0C, N1C);
+  // fold (sra 0, x) -> 0
+  if (N0C && N0C->isNullValue())
+    return N0;
+  // fold (sra -1, x) -> -1
+  if (N0C && N0C->isAllOnesValue())
+    return N0;
+  // fold (sra x, c >= size(x)) -> undef
+  if (N1C && N1C->getZExtValue() >= OpSizeInBits)
+    return DAG.getUNDEF(VT);
+  // fold (sra x, 0) -> x
+  if (N1C && N1C->isNullValue())
+    return N0;
+  // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
+  // sext_inreg.
+  if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
+    unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
+    EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
+    if (VT.isVector())
+      ExtVT = EVT::getVectorVT(*DAG.getContext(),
+                               ExtVT, VT.getVectorNumElements());
+    if ((!LegalOperations ||
+         TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT)))
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT,
+                         N0.getOperand(0), DAG.getValueType(ExtVT));
+  }
+
+  // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
+  if (N1C && N0.getOpcode() == ISD::SRA) {
+    if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+      unsigned Sum = N1C->getZExtValue() + C1->getZExtValue();
+      if (Sum >= OpSizeInBits) Sum = OpSizeInBits-1;
+      return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0.getOperand(0),
+                         DAG.getConstant(Sum, N1C->getValueType(0)));
+    }
+  }
+
+  // fold (sra (shl X, m), (sub result_size, n))
+  // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for
+  // result_size - n != m.
+  // If truncate is free for the target, sext(shl) is likely to result in
+  // better code.
+  if (N0.getOpcode() == ISD::SHL) {
+    // Get the two constants of the shifts: N01C = m and N1C = n.
+    const ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+    if (N01C && N1C) {
+      // Determine what the truncate's result bitsize and type would be.
+      EVT TruncVT =
+        EVT::getIntegerVT(*DAG.getContext(), OpSizeInBits - N1C->getZExtValue());
+      // Determine the residual right-shift amount.
+      signed ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
+
+      // If the shift is not a no-op (in which case this should be just a sign
+      // extend already), the truncate-to type is legal, sign_extend is legal
+      // on that type, and the truncate to that type is both legal and free,
+      // perform the transform.
+      if ((ShiftAmt > 0) &&
+          TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
+          TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
+          TLI.isTruncateFree(VT, TruncVT)) {
+
+          SDValue Amt = DAG.getConstant(ShiftAmt, getShiftAmountTy());
+          SDValue Shift = DAG.getNode(ISD::SRL, N0.getDebugLoc(), VT,
+                                      N0.getOperand(0), Amt);
+          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), TruncVT,
+                                      Shift);
+          return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(),
+                             N->getValueType(0), Trunc);
+      }
+    }
+  }
+
+  // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
+  if (N1.getOpcode() == ISD::TRUNCATE &&
+      N1.getOperand(0).getOpcode() == ISD::AND &&
+      N1.hasOneUse() && N1.getOperand(0).hasOneUse()) {
+    SDValue N101 = N1.getOperand(0).getOperand(1);
+    if (ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N101)) {
+      EVT TruncVT = N1.getValueType();
+      SDValue N100 = N1.getOperand(0).getOperand(0);
+      APInt TruncC = N101C->getAPIntValue();
+      TruncC.trunc(TruncVT.getScalarType().getSizeInBits());
+      return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0,
+                         DAG.getNode(ISD::AND, N->getDebugLoc(),
+                                     TruncVT,
+                                     DAG.getNode(ISD::TRUNCATE,
+                                                 N->getDebugLoc(),
+                                                 TruncVT, N100),
+                                     DAG.getConstant(TruncC, TruncVT)));
+    }
+  }
+
+  // Simplify, based on bits shifted out of the LHS.
+  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  // If the sign bit is known to be zero, switch this to a SRL.
+  if (DAG.SignBitIsZero(N0))
+    return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0, N1);
+
+  if (N1C) {
+    SDValue NewSRA = visitShiftByConstant(N, N1C->getZExtValue());
+    if (NewSRA.getNode())
+      return NewSRA;
+  }
+
+  return SDValue();
+}
+
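+// Worked examples for the visitSRA folds above (illustrative, i32):
+//
+//   (sra (shl x, 24), 24)  ->  (sext_inreg x, i8)   when sext_inreg is legal
+//   (sra (sra x, 20), 20)  ->  (sra x, 31)          (sum clamped to size-1)
+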
+SDValue DAGCombiner::visitSRL(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+  unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
+
+  // fold (srl c1, c2) -> c1 >>u c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::SRL, VT, N0C, N1C);
+  // fold (srl 0, x) -> 0
+  if (N0C && N0C->isNullValue())
+    return N0;
+  // fold (srl x, c >= size(x)) -> undef
+  if (N1C && N1C->getZExtValue() >= OpSizeInBits)
+    return DAG.getUNDEF(VT);
+  // fold (srl x, 0) -> x
+  if (N1C && N1C->isNullValue())
+    return N0;
+  // if (srl x, c) is known to be zero, return 0
+  if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
+                                   APInt::getAllOnesValue(OpSizeInBits)))
+    return DAG.getConstant(0, VT);
+
+  // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
+  if (N1C && N0.getOpcode() == ISD::SRL &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    uint64_t c1 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+    uint64_t c2 = N1C->getZExtValue();
+    if (c1 + c2 >= OpSizeInBits)
+      return DAG.getConstant(0, VT);
+    return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0.getOperand(0),
+                       DAG.getConstant(c1 + c2, N1.getValueType()));
+  }
+  
+  // fold (srl (shl x, c), c) -> (and x, cst2)
+  if (N1C && N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
+      N0.getValueSizeInBits() <= 64) {
+    uint64_t ShAmt = N1C->getZExtValue()+64-N0.getValueSizeInBits();
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0.getOperand(0),
+                       DAG.getConstant(~0ULL >> ShAmt, VT));
+  }
+
+  // fold (srl (anyextend x), c) -> (anyextend (srl x, c))
+  if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
+    // Shifting in all undef bits?
+    EVT SmallVT = N0.getOperand(0).getValueType();
+    if (N1C->getZExtValue() >= SmallVT.getSizeInBits())
+      return DAG.getUNDEF(VT);
+
+    if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
+      SDValue SmallShift = DAG.getNode(ISD::SRL, N0.getDebugLoc(), SmallVT,
+                                       N0.getOperand(0), N1);
+      AddToWorkList(SmallShift.getNode());
+      return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, SmallShift);
+    }
+  }
+
+  // fold (srl (sra X, Y), 31) -> (srl X, 31).  This srl only looks at the sign
+  // bit, which is unmodified by sra.
+  if (N1C && N1C->getZExtValue() + 1 == VT.getSizeInBits()) {
+    if (N0.getOpcode() == ISD::SRA)
+      return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0.getOperand(0), N1);
+  }
+
+  // fold (srl (ctlz x), "5") -> x  iff x has one bit set (the low bit).
+  if (N1C && N0.getOpcode() == ISD::CTLZ &&
+      N1C->getAPIntValue() == Log2_32(VT.getSizeInBits())) {
+    APInt KnownZero, KnownOne;
+    APInt Mask = APInt::getAllOnesValue(VT.getScalarType().getSizeInBits());
+    DAG.ComputeMaskedBits(N0.getOperand(0), Mask, KnownZero, KnownOne);
+
+    // If any of the input bits are KnownOne, then the input couldn't be all
+    // zeros, thus the result of the srl will always be zero.
+    if (KnownOne.getBoolValue()) return DAG.getConstant(0, VT);
+
+    // If all of the bits input to the ctlz node are known to be zero, then
+    // the result of the ctlz is "32" and the result of the shift is one.
+    APInt UnknownBits = ~KnownZero & Mask;
+    if (UnknownBits == 0) return DAG.getConstant(1, VT);
+
+    // Otherwise, check to see if there is exactly one bit input to the ctlz.
+    if ((UnknownBits & (UnknownBits - 1)) == 0) {
+      // Okay, we know that only the single bit specified by UnknownBits
+      // could be set on input to the CTLZ node. If this bit is set, the SRL
+      // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair
+      // to an SRL/XOR pair, which is likely to simplify more.
+      unsigned ShAmt = UnknownBits.countTrailingZeros();
+      SDValue Op = N0.getOperand(0);
+
+      if (ShAmt) {
+        Op = DAG.getNode(ISD::SRL, N0.getDebugLoc(), VT, Op,
+                         DAG.getConstant(ShAmt, getShiftAmountTy()));
+        AddToWorkList(Op.getNode());
+      }
+
+      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT,
+                         Op, DAG.getConstant(1, VT));
+    }
+  }
+
+  // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
+  if (N1.getOpcode() == ISD::TRUNCATE &&
+      N1.getOperand(0).getOpcode() == ISD::AND &&
+      N1.hasOneUse() && N1.getOperand(0).hasOneUse()) {
+    SDValue N101 = N1.getOperand(0).getOperand(1);
+    if (ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N101)) {
+      EVT TruncVT = N1.getValueType();
+      SDValue N100 = N1.getOperand(0).getOperand(0);
+      APInt TruncC = N101C->getAPIntValue();
+      TruncC.trunc(TruncVT.getSizeInBits());
+      return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0,
+                         DAG.getNode(ISD::AND, N->getDebugLoc(),
+                                     TruncVT,
+                                     DAG.getNode(ISD::TRUNCATE,
+                                                 N->getDebugLoc(),
+                                                 TruncVT, N100),
+                                     DAG.getConstant(TruncC, TruncVT)));
+    }
+  }
+
+  // fold operands of srl based on knowledge that the low bits are not
+  // demanded.
+  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  if (N1C) {
+    SDValue NewSRL = visitShiftByConstant(N, N1C->getZExtValue());
+    if (NewSRL.getNode())
+      return NewSRL;
+  }
+
+  // Attempt to convert a srl of a load into a narrower zero-extending load.
+  SDValue NarrowLoad = ReduceLoadWidth(N);
+  if (NarrowLoad.getNode())
+    return NarrowLoad;
+
+  // Here is a common situation. We want to optimize:
+  //
+  //   %a = ...
+  //   %b = and i32 %a, 2
+  //   %c = srl i32 %b, 1
+  //   brcond i32 %c ...
+  //
+  // into
+  // 
+  //   %a = ...
+  //   %b = and %a, 2
+  //   %c = setcc eq %b, 0
+  //   brcond %c ...
+  //
+  // However, after the source operand of the SRL is optimized into an AND, the
+  // SRL itself may not be optimized further. Look for the BRCOND use and add it
+  // to the worklist.
+  if (N->hasOneUse()) {
+    SDNode *Use = *N->use_begin();
+    if (Use->getOpcode() == ISD::BRCOND)
+      AddToWorkList(Use);
+    else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
+      // Also look past the truncate.
+      Use = *Use->use_begin();
+      if (Use->getOpcode() == ISD::BRCOND)
+        AddToWorkList(Use);
+    }
+  }
+
+  return SDValue();
+}
+
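+// Worked example of the CTLZ fold above (illustrative, i32): if the known
+// bits of x show that only bit 4 can be set (UnknownBits == 0x10), then
+//
+//   (srl (ctlz x), 5)  ->  (xor (srl x, 4), 1)
+//
+// since ctlz(x) is 32 (srl result 1) exactly when bit 4 is clear, and 27
+// (srl result 0) when it is set.
+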
+SDValue DAGCombiner::visitCTLZ(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (ctlz c1) -> c2
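+  // (DAG.getNode constant-folds ctlz/cttz/ctpop of a constant operand, so
+  // re-emitting the node here is expected to return the folded constant.)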
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::CTLZ, N->getDebugLoc(), VT, N0);
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitCTTZ(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (cttz c1) -> c2
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::CTTZ, N->getDebugLoc(), VT, N0);
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitCTPOP(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (ctpop c1) -> c2
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::CTPOP, N->getDebugLoc(), VT, N0);
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSELECT(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+  EVT VT = N->getValueType(0);
+  EVT VT0 = N0.getValueType();
+
+  // fold (select C, X, X) -> X
+  if (N1 == N2)
+    return N1;
+  // fold (select true, X, Y) -> X
+  if (N0C && !N0C->isNullValue())
+    return N1;
+  // fold (select false, X, Y) -> Y
+  if (N0C && N0C->isNullValue())
+    return N2;
+  // fold (select C, 1, X) -> (or C, X)
+  if (VT == MVT::i1 && N1C && N1C->getAPIntValue() == 1)
+    return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N2);
+  // fold (select C, 0, 1) -> (xor C, 1)
+  if (VT.isInteger() &&
+      (VT0 == MVT::i1 ||
+       (VT0.isInteger() &&
+        TLI.getBooleanContents() == TargetLowering::ZeroOrOneBooleanContent)) &&
+      N1C && N2C && N1C->isNullValue() && N2C->getAPIntValue() == 1) {
+    SDValue XORNode;
+    if (VT == VT0)
+      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT0,
+                         N0, DAG.getConstant(1, VT0));
+    XORNode = DAG.getNode(ISD::XOR, N0.getDebugLoc(), VT0,
+                          N0, DAG.getConstant(1, VT0));
+    AddToWorkList(XORNode.getNode());
+    if (VT.bitsGT(VT0))
+      return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, XORNode);
+    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, XORNode);
+  }
+  // fold (select C, 0, X) -> (and (not C), X)
+  if (VT == VT0 && VT == MVT::i1 && N1C && N1C->isNullValue()) {
+    SDValue NOTNode = DAG.getNOT(N0.getDebugLoc(), N0, VT);
+    AddToWorkList(NOTNode.getNode());
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, NOTNode, N2);
+  }
+  // fold (select C, X, 1) -> (or (not C), X)
+  if (VT == VT0 && VT == MVT::i1 && N2C && N2C->getAPIntValue() == 1) {
+    SDValue NOTNode = DAG.getNOT(N0.getDebugLoc(), N0, VT);
+    AddToWorkList(NOTNode.getNode());
+    return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, NOTNode, N1);
+  }
+  // fold (select C, X, 0) -> (and C, X)
+  if (VT == MVT::i1 && N2C && N2C->isNullValue())
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0, N1);
+  // fold (select X, X, Y) -> (or X, Y)
+  // fold (select X, 1, Y) -> (or X, Y)
+  if (VT == MVT::i1 && (N0 == N1 || (N1C && N1C->getAPIntValue() == 1)))
+    return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N2);
+  // fold (select X, Y, X) -> (and X, Y)
+  // fold (select X, Y, 0) -> (and X, Y)
+  if (VT == MVT::i1 && (N0 == N2 || (N2C && N2C->getAPIntValue() == 0)))
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0, N1);
+
+  // If we can fold this based on the true/false value, do so.
+  if (SimplifySelectOps(N, N1, N2))
+    return SDValue(N, 0);  // Don't revisit N.
+
+  // fold selects based on a setcc into other things, such as min/max/abs
+  if (N0.getOpcode() == ISD::SETCC) {
+    // FIXME:
+    // Check against MVT::Other for SELECT_CC, which is a workaround for targets
+    // having to say they don't support SELECT_CC on every type the DAG knows
+    // about, since there is no way to mark an opcode illegal at all value types
+    if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other) &&
+        TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))
+      return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT,
+                         N0.getOperand(0), N0.getOperand(1),
+                         N1, N2, N0.getOperand(2));
+    return SimplifySelect(N->getDebugLoc(), N0, N1, N2);
+  }
+
+  return SDValue();
+}
+
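+// Worked example of the (select C, 0, 1) fold above when the result is wider
+// than the condition (illustrative): for an i1 condition C and an i32 result,
+//
+//   (select C, 0, 1)  ->  (zero_extend (xor C, 1))
+//
+// i.e. the boolean is inverted in its own type first and then widened.
+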
+SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  SDValue N3 = N->getOperand(3);
+  SDValue N4 = N->getOperand(4);
+  ISD::CondCode CC = cast<CondCodeSDNode>(N4)->get();
+
+  // fold select_cc lhs, rhs, x, x, cc -> x
+  if (N2 == N3)
+    return N2;
+
+  // Determine if the condition we're dealing with is constant
+  SDValue SCC = SimplifySetCC(TLI.getSetCCResultType(N0.getValueType()),
+                              N0, N1, CC, N->getDebugLoc(), false);
+  if (SCC.getNode()) AddToWorkList(SCC.getNode());
+
+  if (ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode())) {
+    if (!SCCC->isNullValue())
+      return N2;    // cond always true -> true val
+    else
+      return N3;    // cond always false -> false val
+  }
+
+  // Fold to a simpler select_cc
+  if (SCC.getNode() && SCC.getOpcode() == ISD::SETCC)
+    return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N2.getValueType(),
+                       SCC.getOperand(0), SCC.getOperand(1), N2, N3,
+                       SCC.getOperand(2));
+
+  // If we can fold this based on the true/false value, do so.
+  if (SimplifySelectOps(N, N2, N3))
+    return SDValue(N, 0);  // Don't revisit N.
+
+  // fold select_cc into other things, such as min/max/abs
+  return SimplifySelectCC(N->getDebugLoc(), N0, N1, N2, N3, CC);
+}
+
+SDValue DAGCombiner::visitSETCC(SDNode *N) {
+  return SimplifySetCC(N->getValueType(0), N->getOperand(0), N->getOperand(1),
+                       cast<CondCodeSDNode>(N->getOperand(2))->get(),
+                       N->getDebugLoc());
+}
+
+// ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this:
+// "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
+// transformation. Returns true if the extensions are possible and the
+// above-mentioned transformation is profitable.
+static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0,
+                                    unsigned ExtOpc,
+                                    SmallVector<SDNode*, 4> &ExtendNodes,
+                                    const TargetLowering &TLI) {
+  bool HasCopyToRegUses = false;
+  bool isTruncFree = TLI.isTruncateFree(N->getValueType(0), N0.getValueType());
+  for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
+                            UE = N0.getNode()->use_end();
+       UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User == N)
+      continue;
+    if (UI.getUse().getResNo() != N0.getResNo())
+      continue;
+    // FIXME: Only extend SETCC N, N and SETCC N, c for now.
+    if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) {
+      ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
+      if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC))
+        // Sign bits will be lost after a zext.
+        return false;
+      bool Add = false;
+      for (unsigned i = 0; i != 2; ++i) {
+        SDValue UseOp = User->getOperand(i);
+        if (UseOp == N0)
+          continue;
+        if (!isa<ConstantSDNode>(UseOp))
+          return false;
+        Add = true;
+      }
+      if (Add)
+        ExtendNodes.push_back(User);
+      continue;
+    }
+    // If truncates aren't free and there are users we can't
+    // extend, it isn't worthwhile.
+    if (!isTruncFree)
+      return false;
+    // Remember if this value is live-out.
+    if (User->getOpcode() == ISD::CopyToReg)
+      HasCopyToRegUses = true;
+  }
+
+  if (HasCopyToRegUses) {
+    bool BothLiveOut = false;
+    for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+         UI != UE; ++UI) {
+      SDUse &Use = UI.getUse();
+      if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) {
+        BothLiveOut = true;
+        break;
+      }
+    }
+    if (BothLiveOut)
+      // Both unextended and extended values are live out. There had better be
+      // a good reason for the transformation.
+      return ExtendNodes.size();
+  }
+  return true;
+}
+
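+// Illustrative use of ExtendUsesToFormExtLoad: given
+//
+//   %x = load i8
+//   %s = sext %x to i32
+//   %c = setcc eq %x, 0
+//
+// the load has a non-extending user (%c), but because the setcc's other
+// operand is a constant it can be rewritten against the extended value, so
+// the helper returns true and records the setcc in ExtendNodes for the
+// caller to patch up once the load becomes an extending load.
+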
+SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (sext c1) -> c1
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, N0);
+
+  // fold (sext (sext x)) -> (sext x)
+  // fold (sext (aext x)) -> (sext x)
+  if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
+    return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT,
+                       N0.getOperand(0));
+
+  if (N0.getOpcode() == ISD::TRUNCATE) {
+    // fold (sext (truncate (load x))) -> (sext (smaller load x))
+    // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
+    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
+    if (NarrowLoad.getNode()) {
+      SDNode* oye = N0.getNode()->getOperand(0).getNode();
+      if (NarrowLoad.getNode() != N0.getNode()) {
+        CombineTo(N0.getNode(), NarrowLoad);
+        // CombineTo deleted the truncate, if needed, but not what's under it.
+        AddToWorkList(oye);
+      }
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+
+    // See if the value being truncated is already sign extended.  If so, just
+    // eliminate the trunc/sext pair.
+    SDValue Op = N0.getOperand(0);
+    unsigned OpBits   = Op.getValueType().getScalarType().getSizeInBits();
+    unsigned MidBits  = N0.getValueType().getScalarType().getSizeInBits();
+    unsigned DestBits = VT.getScalarType().getSizeInBits();
+    unsigned NumSignBits = DAG.ComputeNumSignBits(Op);
+
+    if (OpBits == DestBits) {
+      // Op is i32, Mid is i8, and Dest is i32.  If Op has more than 24 sign
+      // bits, it is already ready.
+      if (NumSignBits > DestBits-MidBits)
+        return Op;
+    } else if (OpBits < DestBits) {
+      // Op is i32, Mid is i8, and Dest is i64.  If Op has more than 24 sign
+      // bits, just sext from i32.
+      if (NumSignBits > OpBits-MidBits)
+        return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, Op);
+    } else {
+      // Op is i64, Mid is i8, and Dest is i32.  If Op has more than 56 sign
+      // bits, just truncate to i32.
+      if (NumSignBits > OpBits-MidBits)
+        return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
+    }
+
+    // fold (sext (truncate x)) -> (sextinreg x).
+    if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
+                                                 N0.getValueType())) {
+      if (OpBits < DestBits)
+        Op = DAG.getNode(ISD::ANY_EXTEND, N0.getDebugLoc(), VT, Op);
+      else if (OpBits > DestBits)
+        Op = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), VT, Op);
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT, Op,
+                         DAG.getValueType(N0.getValueType()));
+    }
+  }
+
+  // fold (sext (load x)) -> (sext (truncate (sextload x)))
+  if (ISD::isNON_EXTLoad(N0.getNode()) &&
+      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()))) {
+    bool DoXform = true;
+    SmallVector<SDNode*, 4> SetCCs;
+    if (!N0.hasOneUse())
+      DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
+    if (DoXform) {
+      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, VT, N->getDebugLoc(),
+                                       LN0->getChain(),
+                                       LN0->getBasePtr(), LN0->getSrcValue(),
+                                       LN0->getSrcValueOffset(),
+                                       N0.getValueType(),
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      CombineTo(N, ExtLoad);
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+                                  N0.getValueType(), ExtLoad);
+      CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
+
+      // Extend SetCC uses if necessary.
+      for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
+        SDNode *SetCC = SetCCs[i];
+        SmallVector<SDValue, 4> Ops;
+
+        for (unsigned j = 0; j != 2; ++j) {
+          SDValue SOp = SetCC->getOperand(j);
+          if (SOp == Trunc)
+            Ops.push_back(ExtLoad);
+          else
+            Ops.push_back(DAG.getNode(ISD::SIGN_EXTEND,
+                                      N->getDebugLoc(), VT, SOp));
+        }
+
+        Ops.push_back(SetCC->getOperand(2));
+        CombineTo(SetCC, DAG.getNode(ISD::SETCC, N->getDebugLoc(),
+                                     SetCC->getValueType(0),
+                                     &Ops[0], Ops.size()));
+      }
+
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+
+  // fold (sext (sextload x)) -> (sext (truncate (sextload x)))
+  // fold (sext ( extload x)) -> (sext (truncate (sextload x)))
+  if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
+      ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    EVT MemVT = LN0->getMemoryVT();
+    if ((!LegalOperations && !LN0->isVolatile()) ||
+        TLI.isLoadExtLegal(ISD::SEXTLOAD, MemVT)) {
+      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, VT, N->getDebugLoc(),
+                                       LN0->getChain(),
+                                       LN0->getBasePtr(), LN0->getSrcValue(),
+                                       LN0->getSrcValueOffset(), MemVT,
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      CombineTo(N, ExtLoad);
+      CombineTo(N0.getNode(),
+                DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+                            N0.getValueType(), ExtLoad),
+                ExtLoad.getValue(1));
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+
+  if (N0.getOpcode() == ISD::SETCC) {
+    // sext(setcc) -> sext_in_reg(vsetcc) for vectors.
+    // Only do this before legalize for now.
+    if (VT.isVector() && !LegalOperations) {
+      EVT N0VT = N0.getOperand(0).getValueType();
+      // We know that the # elements of the results is the same as the
+      // # elements of the compare (and the # elements of the compare result
+      // for that matter).  Check to see that they are the same size.  If so,
+      // we know that the element size of the sext'd result matches the
+      // element size of the compare operands.
+      if (VT.getSizeInBits() == N0VT.getSizeInBits())
+        return DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
+                             N0.getOperand(1),
+                             cast<CondCodeSDNode>(N0.getOperand(2))->get());
+      // If the desired elements are smaller or larger than the source
+      // elements we can use a matching integer vector type and then
+      // truncate/sign extend
+      else {
+        EVT MatchingElementType =
+          EVT::getIntegerVT(*DAG.getContext(),
+                            N0VT.getScalarType().getSizeInBits());
+        EVT MatchingVectorType =
+          EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
+                           N0VT.getVectorNumElements());
+        SDValue VsetCC =
+          DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
+                        N0.getOperand(1),
+                        cast<CondCodeSDNode>(N0.getOperand(2))->get());
+        return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT);
+      }
+    }
+
+    // sext(setcc x, y, cc) -> (select_cc x, y, -1, 0, cc)
+    unsigned ElementWidth = VT.getScalarType().getSizeInBits();
+    SDValue NegOne =
+      DAG.getConstant(APInt::getAllOnesValue(ElementWidth), VT);
+    SDValue SCC =
+      SimplifySelectCC(N->getDebugLoc(), N0.getOperand(0), N0.getOperand(1),
+                       NegOne, DAG.getConstant(0, VT),
+                       cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
+    if (SCC.getNode()) return SCC;
+    if (!LegalOperations ||
+        TLI.isOperationLegal(ISD::SETCC, TLI.getSetCCResultType(VT)))
+      return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
+                         DAG.getSetCC(N->getDebugLoc(),
+                                      TLI.getSetCCResultType(VT),
+                                      N0.getOperand(0), N0.getOperand(1),
+                                 cast<CondCodeSDNode>(N0.getOperand(2))->get()),
+                         NegOne, DAG.getConstant(0, VT));
+  }  
+
+  // fold (sext x) -> (zext x) if the sign bit is known zero.
+  if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
+      DAG.SignBitIsZero(N0))
+    return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, N0);
+
+  return SDValue();
+}
+
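+// Worked example of the sext(trunc) logic above (illustrative): for
+// %t = trunc i32 %a to i8 followed by %r = sext i8 %t to i32, if
+// ComputeNumSignBits shows %a already has more than 24 sign bits then %r is
+// just %a; otherwise the pair becomes (sext_inreg %a, i8) when available.
+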
+SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (zext c1) -> c1
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, N0);
+  // fold (zext (zext x)) -> (zext x)
+  // fold (zext (aext x)) -> (zext x)
+  if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
+    return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT,
+                       N0.getOperand(0));
+
+  // fold (zext (truncate (load x))) -> (zext (smaller load x))
+  // fold (zext (truncate (srl (load x), c))) -> (zext (small load (x+c/n)))
+  if (N0.getOpcode() == ISD::TRUNCATE) {
+    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
+    if (NarrowLoad.getNode()) {
+      SDNode* oye = N0.getNode()->getOperand(0).getNode();
+      if (NarrowLoad.getNode() != N0.getNode()) {
+        CombineTo(N0.getNode(), NarrowLoad);
+        // CombineTo deleted the truncate, if needed, but not what's under it.
+        AddToWorkList(oye);
+      }
+      return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, NarrowLoad);
+    }
+  }
+
+  // fold (zext (truncate x)) -> (and x, mask)
+  if (N0.getOpcode() == ISD::TRUNCATE &&
+      (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT))) {
+    SDValue Op = N0.getOperand(0);
+    if (Op.getValueType().bitsLT(VT)) {
+      Op = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, Op);
+    } else if (Op.getValueType().bitsGT(VT)) {
+      Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
+    }
+    return DAG.getZeroExtendInReg(Op, N->getDebugLoc(),
+                                  N0.getValueType().getScalarType());
+  }
+
+  // Fold (zext (and (trunc x), cst)) -> (and x, cst),
+  // if either of the casts is not free.
+  if (N0.getOpcode() == ISD::AND &&
+      N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
+                           N0.getValueType()) ||
+       !TLI.isZExtFree(N0.getValueType(), VT))) {
+    SDValue X = N0.getOperand(0).getOperand(0);
+    if (X.getValueType().bitsLT(VT)) {
+      X = DAG.getNode(ISD::ANY_EXTEND, X.getDebugLoc(), VT, X);
+    } else if (X.getValueType().bitsGT(VT)) {
+      X = DAG.getNode(ISD::TRUNCATE, X.getDebugLoc(), VT, X);
+    }
+    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+    Mask.zext(VT.getSizeInBits());
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                       X, DAG.getConstant(Mask, VT));
+  }
+
+  // fold (zext (load x)) -> (zext (truncate (zextload x)))
+  if (ISD::isNON_EXTLoad(N0.getNode()) &&
+      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()))) {
+    bool DoXform = true;
+    SmallVector<SDNode*, 4> SetCCs;
+    if (!N0.hasOneUse())
+      DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
+    if (DoXform) {
+      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, VT, N->getDebugLoc(),
+                                       LN0->getChain(),
+                                       LN0->getBasePtr(), LN0->getSrcValue(),
+                                       LN0->getSrcValueOffset(),
+                                       N0.getValueType(),
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      CombineTo(N, ExtLoad);
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+                                  N0.getValueType(), ExtLoad);
+      CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
+
+      // Extend SetCC uses if necessary.
+      for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
+        SDNode *SetCC = SetCCs[i];
+        SmallVector<SDValue, 4> Ops;
+
+        for (unsigned j = 0; j != 2; ++j) {
+          SDValue SOp = SetCC->getOperand(j);
+          if (SOp == Trunc)
+            Ops.push_back(ExtLoad);
+          else
+            Ops.push_back(DAG.getNode(ISD::ZERO_EXTEND,
+                                      N->getDebugLoc(), VT, SOp));
+        }
+
+        Ops.push_back(SetCC->getOperand(2));
+        CombineTo(SetCC, DAG.getNode(ISD::SETCC, N->getDebugLoc(),
+                                     SetCC->getValueType(0),
+                                     &Ops[0], Ops.size()));
+      }
+
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
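+  // [Editor's note] The SetCC rewriting above is what lets this fold fire
+  // for multi-use loads: if t1 = load i8 [p] feeds both a zext and a
+  // (setcc t1, 0, seteq), the compare is rebuilt on zero-extended
+  // operands, leaving the truncate of the new zextload likely dead.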
+
+  // fold (zext (zextload x)) -> (zext (truncate (zextload x)))
+  // fold (zext ( extload x)) -> (zext (truncate (zextload x)))
+  if ((ISD::isZEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
+      ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    EVT MemVT = LN0->getMemoryVT();
+    if ((!LegalOperations && !LN0->isVolatile()) ||
+        TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT)) {
+      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, VT, N->getDebugLoc(),
+                                       LN0->getChain(),
+                                       LN0->getBasePtr(), LN0->getSrcValue(),
+                                       LN0->getSrcValueOffset(), MemVT,
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      CombineTo(N, ExtLoad);
+      CombineTo(N0.getNode(),
+                DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), N0.getValueType(),
+                            ExtLoad),
+                ExtLoad.getValue(1));
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+
+  if (N0.getOpcode() == ISD::SETCC) {
+    if (!LegalOperations && VT.isVector()) {
+      // zext(setcc) -> (and (vsetcc), (1, 1, ...)) for vectors.
+      // Only do this before legalize for now.
+      EVT N0VT = N0.getOperand(0).getValueType();
+      EVT EltVT = VT.getVectorElementType();
+      SmallVector<SDValue,8> OneOps(VT.getVectorNumElements(),
+                                    DAG.getConstant(1, EltVT));
+      if (VT.getSizeInBits() == N0VT.getSizeInBits()) {
+        // We know that the number of elements of the result is the same
+        // as the number of elements of the compare (and likewise for the
+        // compare result).  Check that they are also the same size; if
+        // so, the element size of the extended result matches the element
+        // size of the compare operands.
+        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                           DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
+                                         N0.getOperand(1),
+                                 cast<CondCodeSDNode>(N0.getOperand(2))->get()),
+                           DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
+                                       &OneOps[0], OneOps.size()));
+      } else {
+        // If the desired elements are smaller or larger than the source
+        // elements we can use a matching integer vector type and then
+        // truncate/sign extend
+        EVT MatchingElementType =
+          EVT::getIntegerVT(*DAG.getContext(),
+                            N0VT.getScalarType().getSizeInBits());
+        EVT MatchingVectorType =
+          EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
+                           N0VT.getVectorNumElements());
+        SDValue VsetCC =
+          DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
+                        N0.getOperand(1),
+                        cast<CondCodeSDNode>(N0.getOperand(2))->get());
+        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                           DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT),
+                           DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
+                                       &OneOps[0], OneOps.size()));
+      }
+    }
+
+    // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
+    SDValue SCC =
+      SimplifySelectCC(N->getDebugLoc(), N0.getOperand(0), N0.getOperand(1),
+                       DAG.getConstant(1, VT), DAG.getConstant(0, VT),
+                       cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
+    if (SCC.getNode()) return SCC;
+  }
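+  // [Editor's sketch] Scalar case of the fold above:
+  //   (zext (setcc a, b, setlt) to i32) -> (select_cc a, b, 1, 0, setlt)
+  // i.e. the boolean is materialized directly as 0/1 in the wide type.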
+
+  // fold (zext (shl/srl (zext x), cst)) -> (shl/srl (zext x), cst)
+  if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
+      isa<ConstantSDNode>(N0.getOperand(1)) &&
+      N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
+      N0.hasOneUse()) {
+    if (N0.getOpcode() == ISD::SHL) {
+      // If the original shl may be shifting out bits, do not perform this
+      // transformation.
+      unsigned ShAmt = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+      unsigned KnownZeroBits = N0.getOperand(0).getValueType().getSizeInBits() -
+        N0.getOperand(0).getOperand(0).getValueType().getSizeInBits();
+      if (ShAmt > KnownZeroBits)
+        return SDValue();
+    }
+    DebugLoc dl = N->getDebugLoc();
+    return DAG.getNode(N0.getOpcode(), dl, VT,
+                       DAG.getNode(ISD::ZERO_EXTEND, dl, VT, N0.getOperand(0)),
+                       DAG.getNode(ISD::ZERO_EXTEND, dl,
+                                   N0.getOperand(1).getValueType(),
+                                   N0.getOperand(1)));
+  }
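+  // [Editor's note] On the guard above: if x:i8 was zero-extended to i16,
+  // the top 8 bits are known zero, so a shl by at most 8 cannot move any
+  // set bit out of the i16; a larger shift could, which would make
+  // hoisting the zero_extend unsound, hence the early bail-out.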
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (aext c1) -> c1
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, N0);
+  // fold (aext (aext x)) -> (aext x)
+  // fold (aext (zext x)) -> (zext x)
+  // fold (aext (sext x)) -> (sext x)
+  if (N0.getOpcode() == ISD::ANY_EXTEND  ||
+      N0.getOpcode() == ISD::ZERO_EXTEND ||
+      N0.getOpcode() == ISD::SIGN_EXTEND)
+    return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, N0.getOperand(0));
+
+  // fold (aext (truncate (load x))) -> (aext (smaller load x))
+  // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
+  if (N0.getOpcode() == ISD::TRUNCATE) {
+    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
+    if (NarrowLoad.getNode()) {
+      SDNode* oye = N0.getNode()->getOperand(0).getNode();
+      if (NarrowLoad.getNode() != N0.getNode()) {
+        CombineTo(N0.getNode(), NarrowLoad);
+        // CombineTo deleted the truncate, if needed, but not what's under it.
+        AddToWorkList(oye);
+      }
+      return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, NarrowLoad);
+    }
+  }
+
+  // fold (aext (truncate x))
+  if (N0.getOpcode() == ISD::TRUNCATE) {
+    SDValue TruncOp = N0.getOperand(0);
+    if (TruncOp.getValueType() == VT)
+      return TruncOp; // x iff x size == aext size.
+    if (TruncOp.getValueType().bitsGT(VT))
+      return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, TruncOp);
+    return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, TruncOp);
+  }
+
+  // Fold (aext (and (trunc x), cst)) -> (and x, cst)
+  // if the trunc is not free.
+  if (N0.getOpcode() == ISD::AND &&
+      N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
+                          N0.getValueType())) {
+    SDValue X = N0.getOperand(0).getOperand(0);
+    if (X.getValueType().bitsLT(VT)) {
+      X = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, X);
+    } else if (X.getValueType().bitsGT(VT)) {
+      X = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, X);
+    }
+    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+    Mask.zext(VT.getSizeInBits());
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                       X, DAG.getConstant(Mask, VT));
+  }
+
+  // fold (aext (load x)) -> (aext (truncate (extload x)))
+  if (ISD::isNON_EXTLoad(N0.getNode()) &&
+      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) {
+    bool DoXform = true;
+    SmallVector<SDNode*, 4> SetCCs;
+    if (!N0.hasOneUse())
+      DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
+    if (DoXform) {
+      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+      SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, VT, N->getDebugLoc(),
+                                       LN0->getChain(),
+                                       LN0->getBasePtr(), LN0->getSrcValue(),
+                                       LN0->getSrcValueOffset(),
+                                       N0.getValueType(),
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      CombineTo(N, ExtLoad);
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+                                  N0.getValueType(), ExtLoad);
+      CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
+
+      // Extend SetCC uses if necessary.
+      for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
+        SDNode *SetCC = SetCCs[i];
+        SmallVector<SDValue, 4> Ops;
+
+        for (unsigned j = 0; j != 2; ++j) {
+          SDValue SOp = SetCC->getOperand(j);
+          if (SOp == Trunc)
+            Ops.push_back(ExtLoad);
+          else
+            Ops.push_back(DAG.getNode(ISD::ANY_EXTEND,
+                                      N->getDebugLoc(), VT, SOp));
+        }
+
+        Ops.push_back(SetCC->getOperand(2));
+        CombineTo(SetCC, DAG.getNode(ISD::SETCC, N->getDebugLoc(),
+                                     SetCC->getValueType(0),
+                                     &Ops[0], Ops.size()));
+      }
+
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+
+  // fold (aext (zextload x)) -> (aext (truncate (zextload x)))
+  // fold (aext (sextload x)) -> (aext (truncate (sextload x)))
+  // fold (aext ( extload x)) -> (aext (truncate (extload  x)))
+  if (N0.getOpcode() == ISD::LOAD &&
+      !ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      N0.hasOneUse()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    EVT MemVT = LN0->getMemoryVT();
+    SDValue ExtLoad = DAG.getExtLoad(LN0->getExtensionType(), VT,
+                                     N->getDebugLoc(),
+                                     LN0->getChain(), LN0->getBasePtr(),
+                                     LN0->getSrcValue(),
+                                     LN0->getSrcValueOffset(), MemVT,
+                                     LN0->isVolatile(), LN0->isNonTemporal(),
+                                     LN0->getAlignment());
+    CombineTo(N, ExtLoad);
+    CombineTo(N0.getNode(),
+              DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+                          N0.getValueType(), ExtLoad),
+              ExtLoad.getValue(1));
+    return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+  }
+
+  if (N0.getOpcode() == ISD::SETCC) {
+    // aext(setcc) -> sext_in_reg(vsetcc) for vectors.
+    // Only do this before legalize for now.
+    if (VT.isVector() && !LegalOperations) {
+      EVT N0VT = N0.getOperand(0).getValueType();
+      // We know that the number of elements of the result is the same as
+      // the number of elements of the compare (and likewise for the
+      // compare result).  Check that they are also the same size; if so,
+      // the element size of the sign-extended result matches the element
+      // size of the compare operands.
+      if (VT.getSizeInBits() == N0VT.getSizeInBits())
+        return DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
+                             N0.getOperand(1),
+                             cast<CondCodeSDNode>(N0.getOperand(2))->get());
+      // If the desired elements are smaller or larger than the source
+      // elements we can use a matching integer vector type and then
+      // truncate/sign extend
+      else {
+        EVT MatchingElementType =
+          EVT::getIntegerVT(*DAG.getContext(),
+                            N0VT.getScalarType().getSizeInBits());
+        EVT MatchingVectorType =
+          EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
+                           N0VT.getVectorNumElements());
+        SDValue VsetCC =
+          DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
+                        N0.getOperand(1),
+                        cast<CondCodeSDNode>(N0.getOperand(2))->get());
+        return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT);
+      }
+    }
+
+    // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
+    SDValue SCC =
+      SimplifySelectCC(N->getDebugLoc(), N0.getOperand(0), N0.getOperand(1),
+                       DAG.getConstant(1, VT), DAG.getConstant(0, VT),
+                       cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
+    if (SCC.getNode())
+      return SCC;
+  }
+
+  return SDValue();
+}
+
+/// GetDemandedBits - See if the specified operand can be simplified with the
+/// knowledge that only the bits specified by Mask are used.  If so, return the
+/// simpler operand, otherwise return a null SDValue.
+SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
+  switch (V.getOpcode()) {
+  default: break;
+  case ISD::OR:
+  case ISD::XOR:
+    // If the LHS or RHS don't contribute bits to the or, drop them.
+    if (DAG.MaskedValueIsZero(V.getOperand(0), Mask))
+      return V.getOperand(1);
+    if (DAG.MaskedValueIsZero(V.getOperand(1), Mask))
+      return V.getOperand(0);
+    break;
+  case ISD::SRL:
+    // Only look at single-use SRLs.
+    if (!V.getNode()->hasOneUse())
+      break;
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
+      // See if we can recursively simplify the LHS.
+      unsigned Amt = RHSC->getZExtValue();
+
+      // Watch out for shift count overflow though.
+      if (Amt >= Mask.getBitWidth()) break;
+      APInt NewMask = Mask << Amt;
+      SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask);
+      if (SimplifyLHS.getNode())
+        return DAG.getNode(ISD::SRL, V.getDebugLoc(), V.getValueType(),
+                           SimplifyLHS, V.getOperand(1));
+    }
+  }
+  return SDValue();
+}
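+
+// [Editor's sketch] Two illustrative instances of GetDemandedBits:
+//   GetDemandedBits(or(x, shl(y, 8)), 0xFF)  ->  x
+// since the shl contributes nothing below bit 8; and for SRL the mask is
+// shifted back up before recursing, so demanding the low 8 bits of
+// (srl z, 16) demands bits 16..23 of z.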
+
+/// ReduceLoadWidth - If the result of a wider load is shifted right by N
+/// bits and then truncated to a narrower type, where N is a multiple of
+/// the number of bits of the narrower type, transform it to a narrower
+/// load from address + N / (number of bits of the new type). If the
+/// result is to be extended, also fold the extension to form an
+/// extending load.
+SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
+  unsigned Opc = N->getOpcode();
+
+  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT ExtVT = VT;
+
+  // This transformation isn't valid for vector loads.
+  if (VT.isVector())
+    return SDValue();
+
+  // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
+  // extended to VT.
+  if (Opc == ISD::SIGN_EXTEND_INREG) {
+    ExtType = ISD::SEXTLOAD;
+    ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+    if (LegalOperations && !TLI.isLoadExtLegal(ISD::SEXTLOAD, ExtVT))
+      return SDValue();
+  } else if (Opc == ISD::SRL) {
+    // Another special case: SRL is basically zero-extending a narrower
+    // value.
+    ExtType = ISD::ZEXTLOAD;
+    N0 = SDValue(N, 0);
+    ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+    if (!N01) return SDValue();
+    ExtVT = EVT::getIntegerVT(*DAG.getContext(),
+                              VT.getSizeInBits() - N01->getZExtValue());
+  }
+
+  unsigned EVTBits = ExtVT.getSizeInBits();
+  unsigned ShAmt = 0;
+  if (N0.getOpcode() == ISD::SRL && N0.hasOneUse() && ExtVT.isRound()) {
+    if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+      ShAmt = N01->getZExtValue();
+      // Is the shift amount a multiple of size of VT?
+      if ((ShAmt & (EVTBits-1)) == 0) {
+        N0 = N0.getOperand(0);
+        // Is the load width a multiple of size of VT?
+        if ((N0.getValueType().getSizeInBits() & (EVTBits-1)) != 0)
+          return SDValue();
+      }
+    }
+  }
+
+  // Do not generate loads of non-round integer types since these can
+  // be expensive (and would be wrong if the type is not byte sized).
+  if (isa<LoadSDNode>(N0) && N0.hasOneUse() && ExtVT.isRound() &&
+      cast<LoadSDNode>(N0)->getMemoryVT().getSizeInBits() >= EVTBits &&
+      // Do not change the width of a volatile load.
+      !cast<LoadSDNode>(N0)->isVolatile()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    EVT PtrType = N0.getOperand(1).getValueType();
+
+    // For big endian targets, we need to adjust the offset to the pointer to
+    // load the correct bytes.
+    if (TLI.isBigEndian()) {
+      unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits();
+      unsigned EVTStoreBits = ExtVT.getStoreSizeInBits();
+      ShAmt = LVTStoreBits - EVTStoreBits - ShAmt;
+    }
+
+    uint64_t PtrOff = ShAmt / 8;
+    unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
+    SDValue NewPtr = DAG.getNode(ISD::ADD, LN0->getDebugLoc(),
+                                 PtrType, LN0->getBasePtr(),
+                                 DAG.getConstant(PtrOff, PtrType));
+    AddToWorkList(NewPtr.getNode());
+
+    SDValue Load = (ExtType == ISD::NON_EXTLOAD)
+      ? DAG.getLoad(VT, N0.getDebugLoc(), LN0->getChain(), NewPtr,
+                    LN0->getSrcValue(), LN0->getSrcValueOffset() + PtrOff,
+                    LN0->isVolatile(), LN0->isNonTemporal(), NewAlign)
+      : DAG.getExtLoad(ExtType, VT, N0.getDebugLoc(), LN0->getChain(), NewPtr,
+                       LN0->getSrcValue(), LN0->getSrcValueOffset() + PtrOff,
+                       ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
+                       NewAlign);
+
+    // Replace the old load's chain with the new load's chain.
+    WorkListRemover DeadNodes(*this);
+    DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1),
+                                  &DeadNodes);
+
+    // Return the new loaded value.
+    return Load;
+  }
+
+  return SDValue();
+}
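+
+// [Editor's sketch] A worked instance of ReduceLoadWidth on a
+// little-endian, byte-addressed target:
+//   t0: i32 = load [p]
+//   t1: i32 = srl t0, 16
+//   t2: i16 = truncate t1
+// Here ShAmt = 16, so PtrOff = 16/8 = 2 and t2 becomes "load i16 [p+2]";
+// on big-endian targets the offset is recomputed from the store sizes.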
+
+SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  EVT EVT = cast<VTSDNode>(N1)->getVT();
+  unsigned VTBits = VT.getScalarType().getSizeInBits();
+  unsigned EVTBits = EVT.getScalarType().getSizeInBits();
+
+  // fold (sext_in_reg c1) -> c1
+  if (isa<ConstantSDNode>(N0) || N0.getOpcode() == ISD::UNDEF)
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT, N0, N1);
+
+  // If the input is already sign extended, just drop the extension.
+  if (DAG.ComputeNumSignBits(N0) >= VTBits-EVTBits+1)
+    return N0;
+
+  // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
+  if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+      EVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT())) {
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT,
+                       N0.getOperand(0), N1);
+  }
+
+  // fold (sext_in_reg (sext x)) -> (sext x)
+  // fold (sext_in_reg (aext x)) -> (sext x)
+  // if x is small enough.
+  if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getValueType().getScalarType().getSizeInBits() <= EVTBits &&
+        (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
+      return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, N00, N1);
+  }
+
+  // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
+  if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits)))
+    return DAG.getZeroExtendInReg(N0, N->getDebugLoc(), EVT);
+
+  // fold operands of sext_in_reg based on knowledge that the top bits are not
+  // demanded.
+  if (SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  // fold (sext_in_reg (load x)) -> (smaller sextload x)
+  // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
+  SDValue NarrowLoad = ReduceLoadWidth(N);
+  if (NarrowLoad.getNode())
+    return NarrowLoad;
+
+  // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
+  // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
+  // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
+  if (N0.getOpcode() == ISD::SRL) {
+    if (ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
+      if (ShAmt->getZExtValue()+EVTBits <= VTBits) {
+        // We can turn this into an SRA iff the input to the SRL is already sign
+        // extended enough.
+        unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
+        if (VTBits-(ShAmt->getZExtValue()+EVTBits) < InSignBits)
+          return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT,
+                             N0.getOperand(0), N0.getOperand(1));
+      }
+  }
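+  // [Editor's note] For (sext_in_reg (srl X:i32, 24), i8) the test above
+  // is 32-(24+8) = 0 < InSignBits, which always holds, so the srl becomes
+  // (sra X, 24); for a shift of 23 it instead requires X to already have
+  // at least two sign bits.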
+
+  // fold (sext_inreg (extload x)) -> (sextload x)
+  if (ISD::isEXTLoad(N0.getNode()) &&
+      ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, VT, N->getDebugLoc(),
+                                     LN0->getChain(),
+                                     LN0->getBasePtr(), LN0->getSrcValue(),
+                                     LN0->getSrcValueOffset(), EVT,
+                                     LN0->isVolatile(), LN0->isNonTemporal(),
+                                     LN0->getAlignment());
+    CombineTo(N, ExtLoad);
+    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+    return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+  }
+  // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
+  if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      N0.hasOneUse() &&
+      EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, VT, N->getDebugLoc(),
+                                     LN0->getChain(),
+                                     LN0->getBasePtr(), LN0->getSrcValue(),
+                                     LN0->getSrcValueOffset(), EVT,
+                                     LN0->isVolatile(), LN0->isNonTemporal(),
+                                     LN0->getAlignment());
+    CombineTo(N, ExtLoad);
+    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+    return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+  }
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // noop truncate
+  if (N0.getValueType() == N->getValueType(0))
+    return N0;
+  // fold (truncate c1) -> c1
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0);
+  // fold (truncate (truncate x)) -> (truncate x)
+  if (N0.getOpcode() == ISD::TRUNCATE)
+    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0.getOperand(0));
+  // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
+  if (N0.getOpcode() == ISD::ZERO_EXTEND ||
+      N0.getOpcode() == ISD::SIGN_EXTEND ||
+      N0.getOpcode() == ISD::ANY_EXTEND) {
+    if (N0.getOperand(0).getValueType().bitsLT(VT))
+      // if the source is smaller than the dest, we still need an extend
+      return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+                         N0.getOperand(0));
+    else if (N0.getOperand(0).getValueType().bitsGT(VT))
+      // if the source is larger than the dest, then we just need the truncate
+      return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0.getOperand(0));
+    else
+      // if the source and dest are the same type, we can drop both the extend
+      // and the truncate.
+      return N0.getOperand(0);
+  }
+
+  // See if we can simplify the input to this truncate through knowledge that
+  // only the low bits are being used.  For example "trunc (or (shl x, 8), y)"
+  // -> trunc y
+  SDValue Shorter =
+    GetDemandedBits(N0, APInt::getLowBitsSet(N0.getValueSizeInBits(),
+                                             VT.getSizeInBits()));
+  if (Shorter.getNode())
+    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Shorter);
+
+  // fold (truncate (load x)) -> (smaller load x)
+  // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
+  if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
+    SDValue Reduced = ReduceLoadWidth(N);
+    if (Reduced.getNode())
+      return Reduced;
+  }
+
+  // Simplify the operands using demanded-bits information.
+  if (!VT.isVector() &&
+      SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
+  SDValue Elt = N->getOperand(i);
+  if (Elt.getOpcode() != ISD::MERGE_VALUES)
+    return Elt.getNode();
+  return Elt.getOperand(Elt.getResNo()).getNode();
+}
+
+/// CombineConsecutiveLoads - build_pair (load, load) -> load
+/// if load locations are consecutive.
+SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
+  assert(N->getOpcode() == ISD::BUILD_PAIR && "Not a BUILD_PAIR node!");
+
+  LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
+  LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
+  if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse())
+    return SDValue();
+  EVT LD1VT = LD1->getValueType(0);
+
+  if (ISD::isNON_EXTLoad(LD2) &&
+      LD2->hasOneUse() &&
+      // If both are volatile this would reduce the number of volatile loads.
+      // If one is volatile it might be ok, but play conservative and bail out.
+      !LD1->isVolatile() &&
+      !LD2->isVolatile() &&
+      DAG.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1)) {
+    unsigned Align = LD1->getAlignment();
+    unsigned NewAlign = TLI.getTargetData()->
+      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+
+    if (NewAlign <= Align &&
+        (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
+      return DAG.getLoad(VT, N->getDebugLoc(), LD1->getChain(),
+                         LD1->getBasePtr(), LD1->getSrcValue(),
+                         LD1->getSrcValueOffset(), false, false, Align);
+  }
+
+  return SDValue();
+}
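+
+// [Editor's sketch] On a little-endian target,
+//   build_pair (t0: i32 = load [p]), (t1: i32 = load [p+4])
+// can become a single "load i64 [p]", provided the wide type's ABI
+// alignment is no stricter than the first load's alignment.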
+
+SDValue DAGCombiner::visitBIT_CONVERT(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // If the input is a BUILD_VECTOR with all constant elements, fold this now.
+  // Only do this before legalize, since afterward the target may be depending
+  // on the bitconvert.
+  // First check to see if this is all constant.
+  if (!LegalTypes &&
+      N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() &&
+      VT.isVector()) {
+    bool isSimple = true;
+    for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i)
+      if (N0.getOperand(i).getOpcode() != ISD::UNDEF &&
+          N0.getOperand(i).getOpcode() != ISD::Constant &&
+          N0.getOperand(i).getOpcode() != ISD::ConstantFP) {
+        isSimple = false;
+        break;
+      }
+
+    EVT DestEltVT = N->getValueType(0).getVectorElementType();
+    assert(!DestEltVT.isVector() &&
+           "Element type of vector ValueType must not be vector!");
+    if (isSimple)
+      return ConstantFoldBIT_CONVERTofBUILD_VECTOR(N0.getNode(), DestEltVT);
+  }
+
+  // If the input is a constant, let getNode fold it.
+  if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) {
+    SDValue Res = DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, N0);
+    if (Res.getNode() != N) {
+      if (!LegalOperations ||
+          TLI.isOperationLegal(Res.getNode()->getOpcode(), VT))
+        return Res;
+
+      // Folding it resulted in an illegal node, and it's too late to
+      // do that. Clean up the old node and forego the transformation.
+      // Ideally this won't happen very often, because instcombine
+      // and the earlier dagcombine runs (where illegal nodes are
+      // permitted) should have folded most of them already.
+      DAG.DeleteNode(Res.getNode());
+    }
+  }
+
+  // (conv (conv x, t1), t2) -> (conv x, t2)
+  if (N0.getOpcode() == ISD::BIT_CONVERT)
+    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT,
+                       N0.getOperand(0));
+
+  // fold (conv (load x)) -> (load (conv*)x)
+  // If the resultant load doesn't need a higher alignment than the original!
+  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+      // Do not change the width of a volatile load.
+      !cast<LoadSDNode>(N0)->isVolatile() &&
+      (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    unsigned Align = TLI.getTargetData()->
+      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+    unsigned OrigAlign = LN0->getAlignment();
+
+    if (Align <= OrigAlign) {
+      SDValue Load = DAG.getLoad(VT, N->getDebugLoc(), LN0->getChain(),
+                                 LN0->getBasePtr(),
+                                 LN0->getSrcValue(), LN0->getSrcValueOffset(),
+                                 LN0->isVolatile(), LN0->isNonTemporal(),
+                                 OrigAlign);
+      AddToWorkList(N);
+      CombineTo(N0.getNode(),
+                DAG.getNode(ISD::BIT_CONVERT, N0.getDebugLoc(),
+                            N0.getValueType(), Load),
+                Load.getValue(1));
+      return Load;
+    }
+  }
+
+  // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
+  // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
+  // This often reduces constant pool loads.
+  if ((N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FABS) &&
+      N0.getNode()->hasOneUse() && VT.isInteger() && !VT.isVector()) {
+    SDValue NewConv = DAG.getNode(ISD::BIT_CONVERT, N0.getDebugLoc(), VT,
+                                  N0.getOperand(0));
+    AddToWorkList(NewConv.getNode());
+
+    APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
+    if (N0.getOpcode() == ISD::FNEG)
+      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT,
+                         NewConv, DAG.getConstant(SignBit, VT));
+    assert(N0.getOpcode() == ISD::FABS);
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                       NewConv, DAG.getConstant(~SignBit, VT));
+  }
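+  // [Editor's note] The folds above rely on the IEEE sign-bit layout: for
+  // f32, fneg is exactly xor with 0x80000000 on the bit pattern and fabs
+  // is exactly and with 0x7FFFFFFF, so no FP arithmetic is needed.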
+
+  // fold (bitconvert (fcopysign cst, x)) ->
+  //         (or (and (bitconvert x), sign), (and cst, (not sign)))
+  // Note that we don't handle (copysign x, cst) because this can always be
+  // folded to an fneg or fabs.
+  if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() &&
+      isa<ConstantFPSDNode>(N0.getOperand(0)) &&
+      VT.isInteger() && !VT.isVector()) {
+    unsigned OrigXWidth = N0.getOperand(1).getValueType().getSizeInBits();
+    EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
+    if (isTypeLegal(IntXVT)) {
+      SDValue X = DAG.getNode(ISD::BIT_CONVERT, N0.getDebugLoc(),
+                              IntXVT, N0.getOperand(1));
+      AddToWorkList(X.getNode());
+
+      // If X has a different width than the result/lhs, sext it or truncate it.
+      unsigned VTWidth = VT.getSizeInBits();
+      if (OrigXWidth < VTWidth) {
+        X = DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, X);
+        AddToWorkList(X.getNode());
+      } else if (OrigXWidth > VTWidth) {
+        // To get the sign bit in the right place, we have to shift it right
+        // before truncating.
+        X = DAG.getNode(ISD::SRL, X.getDebugLoc(),
+                        X.getValueType(), X,
+                        DAG.getConstant(OrigXWidth-VTWidth, X.getValueType()));
+        AddToWorkList(X.getNode());
+        X = DAG.getNode(ISD::TRUNCATE, X.getDebugLoc(), VT, X);
+        AddToWorkList(X.getNode());
+      }
+
+      APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
+      X = DAG.getNode(ISD::AND, X.getDebugLoc(), VT,
+                      X, DAG.getConstant(SignBit, VT));
+      AddToWorkList(X.getNode());
+
+      SDValue Cst = DAG.getNode(ISD::BIT_CONVERT, N0.getDebugLoc(),
+                                VT, N0.getOperand(0));
+      Cst = DAG.getNode(ISD::AND, Cst.getDebugLoc(), VT,
+                        Cst, DAG.getConstant(~SignBit, VT));
+      AddToWorkList(Cst.getNode());
+
+      return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, X, Cst);
+    }
+  }
+
+  // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
+  if (N0.getOpcode() == ISD::BUILD_PAIR) {
+    SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT);
+    if (CombineLD.getNode())
+      return CombineLD;
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  return CombineConsecutiveLoads(N, VT);
+}
+
+/// ConstantFoldBIT_CONVERTofBUILD_VECTOR - We know that BV is a build_vector
+/// node with Constant, ConstantFP or Undef operands.  DstEltVT indicates the
+/// destination element value type.
+SDValue DAGCombiner::
+ConstantFoldBIT_CONVERTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
+  EVT SrcEltVT = BV->getValueType(0).getVectorElementType();
+
+  // If this is already the right type, we're done.
+  if (SrcEltVT == DstEltVT) return SDValue(BV, 0);
+
+  unsigned SrcBitSize = SrcEltVT.getSizeInBits();
+  unsigned DstBitSize = DstEltVT.getSizeInBits();
+
+  // If this is a conversion of N elements of one type to N elements of another
+  // type, convert each element.  This handles FP<->INT cases.
+  if (SrcBitSize == DstBitSize) {
+    EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
+                              BV->getValueType(0).getVectorNumElements());
+
+    // Due to the FP element handling below calling this routine recursively,
+    // we can end up with a scalar-to-vector node here.
+    if (BV->getOpcode() == ISD::SCALAR_TO_VECTOR)
+      return DAG.getNode(ISD::SCALAR_TO_VECTOR, BV->getDebugLoc(), VT,
+                         DAG.getNode(ISD::BIT_CONVERT, BV->getDebugLoc(),
+                                     DstEltVT, BV->getOperand(0)));
+
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
+      SDValue Op = BV->getOperand(i);
+      // If the vector element type is not legal, the BUILD_VECTOR operands
+      // are promoted and implicitly truncated.  Make that explicit here.
+      if (Op.getValueType() != SrcEltVT)
+        Op = DAG.getNode(ISD::TRUNCATE, BV->getDebugLoc(), SrcEltVT, Op);
+      Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, BV->getDebugLoc(),
+                                DstEltVT, Op));
+      AddToWorkList(Ops.back().getNode());
+    }
+    return DAG.getNode(ISD::BUILD_VECTOR, BV->getDebugLoc(), VT,
+                       &Ops[0], Ops.size());
+  }
+
+  // Otherwise, we're growing or shrinking the elements.  To avoid having to
+  // handle annoying details of growing/shrinking FP values, we convert them to
+  // int first.
+  if (SrcEltVT.isFloatingPoint()) {
+    // Convert the input float vector to an integer vector whose elements
+    // are the same size.
+    assert((SrcEltVT == MVT::f32 || SrcEltVT == MVT::f64) && "Unknown FP VT!");
+    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
+    BV = ConstantFoldBIT_CONVERTofBUILD_VECTOR(BV, IntVT).getNode();
+    SrcEltVT = IntVT;
+  }
+
+  // Now we know the input is an integer vector.  If the output is a FP type,
+  // convert to integer first, then to FP of the right size.
+  if (DstEltVT.isFloatingPoint()) {
+    assert((DstEltVT == MVT::f32 || DstEltVT == MVT::f64) && "Unknown FP VT!");
+    EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
+    SDNode *Tmp = ConstantFoldBIT_CONVERTofBUILD_VECTOR(BV, TmpVT).getNode();
+
+    // Next, convert to FP elements of the same size.
+    return ConstantFoldBIT_CONVERTofBUILD_VECTOR(Tmp, DstEltVT);
+  }
+
+  // Okay, we know the src/dst types are both integers of differing types.
+  // Handling growing first.
+  assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
+  if (SrcBitSize < DstBitSize) {
+    unsigned NumInputsPerOutput = DstBitSize/SrcBitSize;
+
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0, e = BV->getNumOperands(); i != e;
+         i += NumInputsPerOutput) {
+      bool isLE = TLI.isLittleEndian();
+      APInt NewBits = APInt(DstBitSize, 0);
+      bool EltIsUndef = true;
+      for (unsigned j = 0; j != NumInputsPerOutput; ++j) {
+        // Shift the previously computed bits over.
+        NewBits <<= SrcBitSize;
+        SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j));
+        if (Op.getOpcode() == ISD::UNDEF) continue;
+        EltIsUndef = false;
+
+        NewBits |= APInt(cast<ConstantSDNode>(Op)->getAPIntValue()).
+                   zextOrTrunc(SrcBitSize).zext(DstBitSize);
+      }
+
+      if (EltIsUndef)
+        Ops.push_back(DAG.getUNDEF(DstEltVT));
+      else
+        Ops.push_back(DAG.getConstant(NewBits, DstEltVT));
+    }
+
+    EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
+    return DAG.getNode(ISD::BUILD_VECTOR, BV->getDebugLoc(), VT,
+                       &Ops[0], Ops.size());
+  }
+
+  // Finally, this must be the case where we are shrinking elements: each input
+  // turns into multiple outputs.
+  bool isS2V = ISD::isScalarToVector(BV);
+  unsigned NumOutputsPerInput = SrcBitSize/DstBitSize;
+  EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
+                            NumOutputsPerInput*BV->getNumOperands());
+  SmallVector<SDValue, 8> Ops;
+
+  for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
+    if (BV->getOperand(i).getOpcode() == ISD::UNDEF) {
+      for (unsigned j = 0; j != NumOutputsPerInput; ++j)
+        Ops.push_back(DAG.getUNDEF(DstEltVT));
+      continue;
+    }
+
+    APInt OpVal = APInt(cast<ConstantSDNode>(BV->getOperand(i))->
+                        getAPIntValue()).zextOrTrunc(SrcBitSize);
+
+    for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
+      APInt ThisVal = APInt(OpVal).trunc(DstBitSize);
+      Ops.push_back(DAG.getConstant(ThisVal, DstEltVT));
+      if (isS2V && i == 0 && j == 0 && APInt(ThisVal).zext(SrcBitSize) == OpVal)
+        // Simply turn this into a SCALAR_TO_VECTOR of the new type.
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, BV->getDebugLoc(), VT,
+                           Ops[0]);
+      OpVal = OpVal.lshr(DstBitSize);
+    }
+
+    // For big endian targets, swap the order of the pieces of each element.
+    if (TLI.isBigEndian())
+      std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, BV->getDebugLoc(), VT,
+                     &Ops[0], Ops.size());
+}
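+
+// [Editor's sketch] Shrinking path, little-endian, illustrative only:
+//   bitconvert of build_vector <2 x i32> <0x00010002, 0x00030004>
+// to <4 x i16> yields <0x0002, 0x0001, 0x0004, 0x0003>; on big-endian
+// the pieces of each element are reversed, giving <0x0001, 0x0002, ...>.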
+
+SDValue DAGCombiner::visitFADD(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (fadd c1, c2) -> c1+c2
+  if (N0CFP && N1CFP && VT != MVT::ppcf128)
+    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N1);
+  // canonicalize constant to RHS
+  if (N0CFP && !N1CFP)
+    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N1, N0);
+  // fold (fadd A, 0) -> A
+  if (UnsafeFPMath && N1CFP && N1CFP->getValueAPF().isZero())
+    return N0;
+  // fold (fadd A, (fneg B)) -> (fsub A, B)
+  if (isNegatibleForFree(N1, LegalOperations) == 2)
+    return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N0,
+                       GetNegatedExpression(N1, DAG, LegalOperations));
+  // fold (fadd (fneg A), B) -> (fsub B, A)
+  if (isNegatibleForFree(N0, LegalOperations) == 2)
+    return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N1,
+                       GetNegatedExpression(N0, DAG, LegalOperations));
+
+  // If allowed, fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2))
+  if (UnsafeFPMath && N1CFP && N0.getOpcode() == ISD::FADD &&
+      N0.getNode()->hasOneUse() && isa<ConstantFPSDNode>(N0.getOperand(1)))
+    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0.getOperand(0),
+                       DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+                                   N0.getOperand(1), N1));
+
+  return SDValue();
+}
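+
+// [Editor's note] The reassociation at the end of visitFADD is gated on
+// UnsafeFPMath because FP addition is not associative; e.g. in double,
+// (1e20 + -1e20) + 1.0 == 1.0 but 1e20 + (-1e20 + 1.0) == 0.0.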
+
+SDValue DAGCombiner::visitFSUB(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (fsub c1, c2) -> c1-c2
+  if (N0CFP && N1CFP && VT != MVT::ppcf128)
+    return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N0, N1);
+  // fold (fsub A, 0) -> A
+  if (UnsafeFPMath && N1CFP && N1CFP->getValueAPF().isZero())
+    return N0;
+  // fold (fsub 0, B) -> -B
+  if (UnsafeFPMath && N0CFP && N0CFP->getValueAPF().isZero()) {
+    if (isNegatibleForFree(N1, LegalOperations))
+      return GetNegatedExpression(N1, DAG, LegalOperations);
+    if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+      return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT, N1);
+  }
+  // fold (fsub A, (fneg B)) -> (fadd A, B)
+  if (isNegatibleForFree(N1, LegalOperations))
+    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0,
+                       GetNegatedExpression(N1, DAG, LegalOperations));
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFMUL(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (fmul c1, c2) -> c1*c2
+  if (N0CFP && N1CFP && VT != MVT::ppcf128)
+    return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N0, N1);
+  // canonicalize constant to RHS
+  if (N0CFP && !N1CFP)
+    return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N1, N0);
+  // fold (fmul A, 0) -> 0
+  if (UnsafeFPMath && N1CFP && N1CFP->getValueAPF().isZero())
+    return N1;
+  // fold (fmul A, 0) -> 0, vector edition.
+  if (UnsafeFPMath && ISD::isBuildVectorAllZeros(N1.getNode()))
+    return N1;
+  // fold (fmul X, 2.0) -> (fadd X, X)
+  if (N1CFP && N1CFP->isExactlyValue(+2.0))
+    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N0);
+  // fold (fmul X, -1.0) -> (fneg X)
+  if (N1CFP && N1CFP->isExactlyValue(-1.0))
+    if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+      return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT, N0);
+
+  // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y)
+  if (char LHSNeg = isNegatibleForFree(N0, LegalOperations)) {
+    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations)) {
+      // Both can be negated for free, check to see if at least one is cheaper
+      // negated.
+      if (LHSNeg == 2 || RHSNeg == 2)
+        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                           GetNegatedExpression(N0, DAG, LegalOperations),
+                           GetNegatedExpression(N1, DAG, LegalOperations));
+    }
+  }
+
+  // If allowed, fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2))
+  if (UnsafeFPMath && N1CFP && N0.getOpcode() == ISD::FMUL &&
+      N0.getNode()->hasOneUse() && isa<ConstantFPSDNode>(N0.getOperand(1)))
+    return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N0.getOperand(0),
+                       DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                                   N0.getOperand(1), N1));
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFDIV(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (fdiv c1, c2) -> c1/c2
+  if (N0CFP && N1CFP && VT != MVT::ppcf128)
+    return DAG.getNode(ISD::FDIV, N->getDebugLoc(), VT, N0, N1);
+
+  // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
+  if (char LHSNeg = isNegatibleForFree(N0, LegalOperations)) {
+    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations)) {
+      // Both can be negated for free, check to see if at least one is cheaper
+      // negated.
+      if (LHSNeg == 2 || RHSNeg == 2)
+        return DAG.getNode(ISD::FDIV, N->getDebugLoc(), VT,
+                           GetNegatedExpression(N0, DAG, LegalOperations),
+                           GetNegatedExpression(N1, DAG, LegalOperations));
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFREM(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  // fold (frem c1, c2) -> fmod(c1,c2)
+  if (N0CFP && N1CFP && VT != MVT::ppcf128)
+    return DAG.getNode(ISD::FREM, N->getDebugLoc(), VT, N0, N1);
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  if (N0CFP && N1CFP && VT != MVT::ppcf128)  // Constant fold
+    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT, N0, N1);
+
+  if (N1CFP) {
+    const APFloat& V = N1CFP->getValueAPF();
+    // copysign(x, c1) -> fabs(x)       iff ispos(c1)
+    // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
+    if (!V.isNegative()) {
+      if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
+        return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0);
+    } else {
+      if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+        return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT,
+                           DAG.getNode(ISD::FABS, N0.getDebugLoc(), VT, N0));
+    }
+  }
+
+  // copysign(fabs(x), y) -> copysign(x, y)
+  // copysign(fneg(x), y) -> copysign(x, y)
+  // copysign(copysign(x,z), y) -> copysign(x, y)
+  if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
+      N0.getOpcode() == ISD::FCOPYSIGN)
+    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+                       N0.getOperand(0), N1);
+
+  // copysign(x, abs(y)) -> abs(x)
+  if (N1.getOpcode() == ISD::FABS)
+    return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0);
+
+  // copysign(x, copysign(y,z)) -> copysign(x, z)
+  if (N1.getOpcode() == ISD::FCOPYSIGN)
+    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+                       N0, N1.getOperand(1));
+
+  // copysign(x, fp_extend(y)) -> copysign(x, y)
+  // copysign(x, fp_round(y)) -> copysign(x, y)
+  if (N1.getOpcode() == ISD::FP_EXTEND || N1.getOpcode() == ISD::FP_ROUND)
+    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+                       N0, N1.getOperand(0));
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  EVT VT = N->getValueType(0);
+  EVT OpVT = N0.getValueType();
+
+  // fold (sint_to_fp c1) -> c1fp
+  if (N0C && OpVT != MVT::ppcf128)
+    return DAG.getNode(ISD::SINT_TO_FP, N->getDebugLoc(), VT, N0);
+
+  // If the input is a legal type, and SINT_TO_FP is not legal on this target,
+  // but UINT_TO_FP is legal on this target, try to convert.
+  if (!TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT) &&
+      TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT)) {
+    // If the sign bit is known to be zero, we can change this to UINT_TO_FP.
+    if (DAG.SignBitIsZero(N0))
+      return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), VT, N0);
+  }
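+  // [Editor's note] This is sound because a value whose sign bit is clear
+  // denotes the same integer signed or unsigned; e.g. any i32 in
+  // [0, 0x7FFFFFFF] converts to the same FP value either way.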
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  EVT VT = N->getValueType(0);
+  EVT OpVT = N0.getValueType();
+
+  // fold (uint_to_fp c1) -> c1fp
+  if (N0C && OpVT != MVT::ppcf128)
+    return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), VT, N0);
+
+  // If the input is a legal type, and UINT_TO_FP is not legal on this target,
+  // but SINT_TO_FP is legal on this target, try to convert.
+  if (!TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT) &&
+      TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT)) {
+    // If the sign bit is known to be zero, we can change this to SINT_TO_FP.
+    if (DAG.SignBitIsZero(N0))
+      return DAG.getNode(ISD::SINT_TO_FP, N->getDebugLoc(), VT, N0);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  EVT VT = N->getValueType(0);
+
+  // fold (fp_to_sint c1fp) -> c1
+  if (N0CFP)
+    return DAG.getNode(ISD::FP_TO_SINT, N->getDebugLoc(), VT, N0);
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  EVT VT = N->getValueType(0);
+
+  // fold (fp_to_uint c1fp) -> c1
+  if (N0CFP && VT != MVT::ppcf128)
+    return DAG.getNode(ISD::FP_TO_UINT, N->getDebugLoc(), VT, N0);
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  EVT VT = N->getValueType(0);
+
+  // fold (fp_round c1fp) -> c1fp
+  if (N0CFP && N0.getValueType() != MVT::ppcf128)
+    return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(), VT, N0, N1);
+
+  // fold (fp_round (fp_extend x)) -> x
+  if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType())
+    return N0.getOperand(0);
+
+  // fold (fp_round (fp_round x)) -> (fp_round x)
+  if (N0.getOpcode() == ISD::FP_ROUND) {
+    // This is a value-preserving truncation if both rounds are.
+    bool IsTrunc = N->getConstantOperandVal(1) == 1 &&
+                   N0.getNode()->getConstantOperandVal(1) == 1;
+    return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(), VT, N0.getOperand(0),
+                       DAG.getIntPtrConstant(IsTrunc));
+  }
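+  // [Editor's note] FP_ROUND's second operand is a flag: 1 means the
+  // round is known not to change the value. Two stacked rounds are only
+  // exact when both are, hence the conjunction computed above.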
+
+  // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
+  if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) {
+    SDValue Tmp = DAG.getNode(ISD::FP_ROUND, N0.getDebugLoc(), VT,
+                              N0.getOperand(0), N1);
+    AddToWorkList(Tmp.getNode());
+    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+                       Tmp, N0.getOperand(1));
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+
+  // fold (fp_round_inreg c1fp) -> c1fp
+  if (N0CFP && isTypeLegal(EVT)) {
+    SDValue Round = DAG.getConstantFP(*N0CFP->getConstantFPValue(), EVT);
+    return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, Round);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  EVT VT = N->getValueType(0);
+
+  // If this fp_extend's only use is an fp_round, don't fold it; let the
+  // fp_round(fp_extend x) combine collapse the pair instead.
+  if (N->hasOneUse() &&
+      N->use_begin()->getOpcode() == ISD::FP_ROUND)
+    return SDValue();
+
+  // fold (fp_extend c1fp) -> c1fp
+  if (N0CFP && VT != MVT::ppcf128)
+    return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, N0);
+
+  // Turn fp_extend(fp_round(X, 1)) -> X, since the truncating fp_round
+  // doesn't affect the value of X.
+  if (N0.getOpcode() == ISD::FP_ROUND
+      && N0.getNode()->getConstantOperandVal(1) == 1) {
+    SDValue In = N0.getOperand(0);
+    if (In.getValueType() == VT) return In;
+    if (VT.bitsLT(In.getValueType()))
+      return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(), VT,
+                         In, N0.getOperand(1));
+    return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, In);
+  }
+
+  // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
+  if (ISD::isNON_EXTLoad(N0.getNode()) && N0.hasOneUse() &&
+      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, VT, N->getDebugLoc(),
+                                     LN0->getChain(),
+                                     LN0->getBasePtr(), LN0->getSrcValue(),
+                                     LN0->getSrcValueOffset(),
+                                     N0.getValueType(),
+                                     LN0->isVolatile(), LN0->isNonTemporal(),
+                                     LN0->getAlignment());
+    CombineTo(N, ExtLoad);
+    CombineTo(N0.getNode(),
+              DAG.getNode(ISD::FP_ROUND, N0.getDebugLoc(),
+                          N0.getValueType(), ExtLoad, DAG.getIntPtrConstant(1)),
+              ExtLoad.getValue(1));
+    return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFNEG(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  if (isNegatibleForFree(N0, LegalOperations))
+    return GetNegatedExpression(N0, DAG, LegalOperations);
+
+  // Transform fneg(bitconvert(x)) -> bitconvert(x^sign) to avoid loading
+  // constant pool values.
+  if (N0.getOpcode() == ISD::BIT_CONVERT && 
+      !VT.isVector() &&
+      N0.getNode()->hasOneUse() &&
+      N0.getOperand(0).getValueType().isInteger()) {
+    SDValue Int = N0.getOperand(0);
+    EVT IntVT = Int.getValueType();
+    if (IntVT.isInteger() && !IntVT.isVector()) {
+      Int = DAG.getNode(ISD::XOR, N0.getDebugLoc(), IntVT, Int,
+              DAG.getConstant(APInt::getSignBit(IntVT.getSizeInBits()), IntVT));
+      AddToWorkList(Int.getNode());
+      return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(),
+                         VT, Int);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFABS(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  EVT VT = N->getValueType(0);
+
+  // fold (fabs c1) -> fabs(c1)
+  if (N0CFP && VT != MVT::ppcf128)
+    return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0);
+  // fold (fabs (fabs x)) -> (fabs x)
+  if (N0.getOpcode() == ISD::FABS)
+    return N->getOperand(0);
+  // fold (fabs (fneg x)) -> (fabs x)
+  // fold (fabs (fcopysign x, y)) -> (fabs x)
+  if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
+    return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0.getOperand(0));
+
+  // Transform fabs(bitconvert(x)) -> bitconvert(x&~sign) to avoid loading
+  // constant pool values.
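+  // For example, for f32 the mask is ~0x80000000 = 0x7FFFFFFF, so the fabs
+  // becomes (bitconvert (and x, 0x7FFFFFFF)).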
+  if (N0.getOpcode() == ISD::BIT_CONVERT && N0.getNode()->hasOneUse() &&
+      N0.getOperand(0).getValueType().isInteger() &&
+      !N0.getOperand(0).getValueType().isVector()) {
+    SDValue Int = N0.getOperand(0);
+    EVT IntVT = Int.getValueType();
+    if (IntVT.isInteger() && !IntVT.isVector()) {
+      Int = DAG.getNode(ISD::AND, N0.getDebugLoc(), IntVT, Int,
+             DAG.getConstant(~APInt::getSignBit(IntVT.getSizeInBits()), IntVT));
+      AddToWorkList(Int.getNode());
+      return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(),
+                         N->getValueType(0), Int);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitBRCOND(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+
+  // If N is a constant we could fold this into a fallthrough or unconditional
+  // branch. However that doesn't happen very often in normal code, because
+  // Instcombine/SimplifyCFG should have handled the available opportunities.
+  // If we did this folding here, it would be necessary to update the
+  // MachineBasicBlock CFG, which is awkward.
+
+  // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
+  // on the target.
+  if (N1.getOpcode() == ISD::SETCC &&
+      TLI.isOperationLegalOrCustom(ISD::BR_CC, MVT::Other)) {
+    return DAG.getNode(ISD::BR_CC, N->getDebugLoc(), MVT::Other,
+                       Chain, N1.getOperand(2),
+                       N1.getOperand(0), N1.getOperand(1), N2);
+  }
+
+  SDNode *Trunc = 0;
+  if (N1.getOpcode() == ISD::TRUNCATE && N1.hasOneUse()) {
+    // Look past truncate.
+    Trunc = N1.getNode();
+    N1 = N1.getOperand(0);
+  }
+
+  if (N1.hasOneUse() && N1.getOpcode() == ISD::SRL) {
+    // Match this pattern so that we can generate simpler code:
+    //
+    //   %a = ...
+    //   %b = and i32 %a, 2
+    //   %c = srl i32 %b, 1
+    //   brcond i32 %c ...
+    //
+    // into
+    // 
+    //   %a = ...
+    //   %b = and i32 %a, 2
+    //   %c = setcc eq %b, 0
+    //   brcond %c ...
+    //
+    // This applies only when the AND constant value has one bit set and the
+    // SRL constant is equal to the log2 of the AND constant. The back-end is
+    // smart enough to convert the result into a TEST/JMP sequence.
+    SDValue Op0 = N1.getOperand(0);
+    SDValue Op1 = N1.getOperand(1);
+
+    if (Op0.getOpcode() == ISD::AND &&
+        Op1.getOpcode() == ISD::Constant) {
+      SDValue AndOp1 = Op0.getOperand(1);
+
+      if (AndOp1.getOpcode() == ISD::Constant) {
+        const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();
+
+        if (AndConst.isPowerOf2() &&
+            cast<ConstantSDNode>(Op1)->getAPIntValue()==AndConst.logBase2()) {
+          SDValue SetCC =
+            DAG.getSetCC(N->getDebugLoc(),
+                         TLI.getSetCCResultType(Op0.getValueType()),
+                         Op0, DAG.getConstant(0, Op0.getValueType()),
+                         ISD::SETNE);
+
+          SDValue NewBRCond = DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
+                                          MVT::Other, Chain, SetCC, N2);
+          // Don't add the new BRCond into the worklist or else SimplifySelectCC
+          // will convert it back to (X & C1) >> C2.
+          CombineTo(N, NewBRCond, false);
+          // Truncate is dead.
+          if (Trunc) {
+            removeFromWorkList(Trunc);
+            DAG.DeleteNode(Trunc);
+          }
+          // Replace the uses of SRL with SETCC
+          WorkListRemover DeadNodes(*this);
+          DAG.ReplaceAllUsesOfValueWith(N1, SetCC, &DeadNodes);
+          removeFromWorkList(N1.getNode());
+          DAG.DeleteNode(N1.getNode());
+          return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+        }
+      }
+    }
+  }
+  
+  // Transform br(xor(x, y)) -> br(x != y)
+  // Transform br(xor(xor(x,y), 1)) -> br (x == y)
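+  // These hold because brcond branches when its condition is non-zero, and
+  // (xor x, y) is non-zero exactly when x != y; for an i1 condition,
+  // xor'ing with 1 negates it.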
+  if (N1.hasOneUse() && N1.getOpcode() == ISD::XOR) {
+    SDNode *TheXor = N1.getNode();
+    SDValue Op0 = TheXor->getOperand(0);
+    SDValue Op1 = TheXor->getOperand(1);
+    if (Op0.getOpcode() == Op1.getOpcode()) {
+      // Avoid missing important xor optimizations.
+      SDValue Tmp = visitXOR(TheXor);
+      if (Tmp.getNode() && Tmp.getNode() != TheXor) {
+        DEBUG(dbgs() << "\nReplacing.8 ";
+              TheXor->dump(&DAG);
+              dbgs() << "\nWith: ";
+              Tmp.getNode()->dump(&DAG);
+              dbgs() << '\n');
+        WorkListRemover DeadNodes(*this);
+        DAG.ReplaceAllUsesOfValueWith(N1, Tmp, &DeadNodes);
+        removeFromWorkList(TheXor);
+        DAG.DeleteNode(TheXor);
+        return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
+                           MVT::Other, Chain, Tmp, N2);
+      }
+    }
+
+    if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
+      bool Equal = false;
+      // Match (xor (xor x, y), 1): the constant 1 must be the RHS operand,
+      // and the inner xor supplies the values compared for equality.
+      if (ConstantSDNode *RHSCI = dyn_cast<ConstantSDNode>(Op1))
+        if (RHSCI->getAPIntValue() == 1 && Op0.hasOneUse() &&
+            Op0.getOpcode() == ISD::XOR) {
+          TheXor = Op0.getNode();
+          Equal = true;
+        }
+
+      SDValue NodeToReplace = Trunc ? SDValue(Trunc, 0) : N1;
+      
+      EVT SetCCVT = NodeToReplace.getValueType();
+      if (LegalTypes)
+        SetCCVT = TLI.getSetCCResultType(SetCCVT);
+      SDValue SetCC = DAG.getSetCC(TheXor->getDebugLoc(), SetCCVT,
+                                   TheXor->getOperand(0),
+                                   TheXor->getOperand(1),
+                                   Equal ? ISD::SETEQ : ISD::SETNE);
+      // Replace the uses of XOR with SETCC
+      WorkListRemover DeadNodes(*this);
+      DAG.ReplaceAllUsesOfValueWith(NodeToReplace, SetCC, &DeadNodes);
+      removeFromWorkList(NodeToReplace.getNode());
+      DAG.DeleteNode(NodeToReplace.getNode());
+      return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
+                         MVT::Other, Chain, SetCC, N2);
+    }
+  }
+
+  return SDValue();
+}
+
+// Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB.
+//
+SDValue DAGCombiner::visitBR_CC(SDNode *N) {
+  CondCodeSDNode *CC = cast<CondCodeSDNode>(N->getOperand(1));
+  SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3);
+
+  // If N is a constant we could fold this into a fallthrough or unconditional
+  // branch. However that doesn't happen very often in normal code, because
+  // Instcombine/SimplifyCFG should have handled the available opportunities.
+  // If we did this folding here, it would be necessary to update the
+  // MachineBasicBlock CFG, which is awkward.
+
+  // Use SimplifySetCC to simplify SETCC's.
+  SDValue Simp = SimplifySetCC(TLI.getSetCCResultType(CondLHS.getValueType()),
+                               CondLHS, CondRHS, CC->get(), N->getDebugLoc(),
+                               false);
+  if (Simp.getNode()) AddToWorkList(Simp.getNode());
+
+  // fold to a simpler setcc
+  if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC)
+    return DAG.getNode(ISD::BR_CC, N->getDebugLoc(), MVT::Other,
+                       N->getOperand(0), Simp.getOperand(2),
+                       Simp.getOperand(0), Simp.getOperand(1),
+                       N->getOperand(4));
+
+  return SDValue();
+}
+
+/// CombineToPreIndexedLoadStore - Try turning a load / store into a
+/// pre-indexed load / store when the base pointer is an add or subtract
+/// and it has other uses besides the load / store. After the
+/// transformation, the new indexed load / store has effectively folded
+/// the add / subtract in and all of its other uses are redirected to the
+/// new load / store.
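+/// For example, on a target with pre-increment addressing, a load whose
+/// address is (add x, 4), where the add also has other uses, can become a
+/// pre-inc load that produces both the loaded value and the updated
+/// pointer x + 4.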
+bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
+  if (!LegalOperations)
+    return false;
+
+  bool isLoad = true;
+  SDValue Ptr;
+  EVT VT;
+  if (LoadSDNode *LD  = dyn_cast<LoadSDNode>(N)) {
+    if (LD->isIndexed())
+      return false;
+    VT = LD->getMemoryVT();
+    if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) &&
+        !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT))
+      return false;
+    Ptr = LD->getBasePtr();
+  } else if (StoreSDNode *ST  = dyn_cast<StoreSDNode>(N)) {
+    if (ST->isIndexed())
+      return false;
+    VT = ST->getMemoryVT();
+    if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) &&
+        !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT))
+      return false;
+    Ptr = ST->getBasePtr();
+    isLoad = false;
+  } else {
+    return false;
+  }
+
+  // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
+  // out.  There is no reason to make this a preinc/predec.
+  if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
+      Ptr.getNode()->hasOneUse())
+    return false;
+
+  // Ask the target to do addressing mode selection.
+  SDValue BasePtr;
+  SDValue Offset;
+  ISD::MemIndexedMode AM = ISD::UNINDEXED;
+  if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
+    return false;
+  // Don't create an indexed load / store with zero offset.
+  if (isa<ConstantSDNode>(Offset) &&
+      cast<ConstantSDNode>(Offset)->isNullValue())
+    return false;
+
+  // Try turning it into a pre-indexed load / store except when:
+  // 1) The new base ptr is a frame index.
+  // 2) If N is a store and the new base ptr is either the same as or is a
+  //    predecessor of the value being stored.
+  // 3) Another use of old base ptr is a predecessor of N. If ptr is folded
+  //    that would create a cycle.
+  // 4) All uses are load / store ops that use it as old base ptr.
+
+  // Check #1.  Preinc'ing a frame index would require copying the stack pointer
+  // (plus the implicit offset) to a register to preinc anyway.
+  if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
+    return false;
+
+  // Check #2.
+  if (!isLoad) {
+    SDValue Val = cast<StoreSDNode>(N)->getValue();
+    if (Val == BasePtr || BasePtr.getNode()->isPredecessorOf(Val.getNode()))
+      return false;
+  }
+
+  // Now check for #3 and #4.
+  bool RealUse = false;
+  for (SDNode::use_iterator I = Ptr.getNode()->use_begin(),
+         E = Ptr.getNode()->use_end(); I != E; ++I) {
+    SDNode *Use = *I;
+    if (Use == N)
+      continue;
+    if (Use->isPredecessorOf(N))
+      return false;
+
+    if (!((Use->getOpcode() == ISD::LOAD &&
+           cast<LoadSDNode>(Use)->getBasePtr() == Ptr) ||
+          (Use->getOpcode() == ISD::STORE &&
+           cast<StoreSDNode>(Use)->getBasePtr() == Ptr)))
+      RealUse = true;
+  }
+
+  if (!RealUse)
+    return false;
+
+  SDValue Result;
+  if (isLoad)
+    Result = DAG.getIndexedLoad(SDValue(N,0), N->getDebugLoc(),
+                                BasePtr, Offset, AM);
+  else
+    Result = DAG.getIndexedStore(SDValue(N,0), N->getDebugLoc(),
+                                 BasePtr, Offset, AM);
+  ++PreIndexedNodes;
+  ++NodesCombined;
+  DEBUG(dbgs() << "\nReplacing.4 ";
+        N->dump(&DAG);
+        dbgs() << "\nWith: ";
+        Result.getNode()->dump(&DAG);
+        dbgs() << '\n');
+  WorkListRemover DeadNodes(*this);
+  if (isLoad) {
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0),
+                                  &DeadNodes);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2),
+                                  &DeadNodes);
+  } else {
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1),
+                                  &DeadNodes);
+  }
+
+  // Finally, since the node is now dead, remove it from the graph.
+  DAG.DeleteNode(N);
+
+  // Replace the uses of Ptr with uses of the updated base value.
+  DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0),
+                                &DeadNodes);
+  removeFromWorkList(Ptr.getNode());
+  DAG.DeleteNode(Ptr.getNode());
+
+  return true;
+}
+
+/// CombineToPostIndexedLoadStore - Try to combine a load / store with an
+/// add / sub of the base pointer node into a post-indexed load / store.
+/// The transformation effectively folds the add / subtract into the new
+/// indexed load / store, and all uses of the add / subtract are redirected
+/// to the new load / store's updated-pointer result.
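+/// For example, a load of x followed by (add x, 4) can become a post-inc
+/// load that yields both the loaded value and the updated pointer x + 4.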
+bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
+  if (!LegalOperations)
+    return false;
+
+  bool isLoad = true;
+  SDValue Ptr;
+  EVT VT;
+  if (LoadSDNode *LD  = dyn_cast<LoadSDNode>(N)) {
+    if (LD->isIndexed())
+      return false;
+    VT = LD->getMemoryVT();
+    if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) &&
+        !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT))
+      return false;
+    Ptr = LD->getBasePtr();
+  } else if (StoreSDNode *ST  = dyn_cast<StoreSDNode>(N)) {
+    if (ST->isIndexed())
+      return false;
+    VT = ST->getMemoryVT();
+    if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) &&
+        !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT))
+      return false;
+    Ptr = ST->getBasePtr();
+    isLoad = false;
+  } else {
+    return false;
+  }
+
+  if (Ptr.getNode()->hasOneUse())
+    return false;
+
+  for (SDNode::use_iterator I = Ptr.getNode()->use_begin(),
+         E = Ptr.getNode()->use_end(); I != E; ++I) {
+    SDNode *Op = *I;
+    if (Op == N ||
+        (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB))
+      continue;
+
+    SDValue BasePtr;
+    SDValue Offset;
+    ISD::MemIndexedMode AM = ISD::UNINDEXED;
+    if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) {
+      // Don't create an indexed load / store with zero offset.
+      if (isa<ConstantSDNode>(Offset) &&
+          cast<ConstantSDNode>(Offset)->isNullValue())
+        continue;
+
+      // Try turning it into a post-indexed load / store except when
+      // 1) All uses are load / store ops that use it as base ptr.
+      // 2) Op must be independent of N, i.e. Op is neither a predecessor
+      //    nor a successor of N. Otherwise, if Op is folded that would
+      //    create a cycle.
+
+      if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
+        continue;
+
+      // Check for #1.
+      bool TryNext = false;
+      for (SDNode::use_iterator II = BasePtr.getNode()->use_begin(),
+             EE = BasePtr.getNode()->use_end(); II != EE; ++II) {
+        SDNode *Use = *II;
+        if (Use == Ptr.getNode())
+          continue;
+
+        // If all the uses are load / store addresses, then don't do the
+        // transformation.
+        if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){
+          bool RealUse = false;
+          for (SDNode::use_iterator III = Use->use_begin(),
+                 EEE = Use->use_end(); III != EEE; ++III) {
+            SDNode *UseUse = *III;
+            if (!((UseUse->getOpcode() == ISD::LOAD &&
+                   cast<LoadSDNode>(UseUse)->getBasePtr().getNode() == Use) ||
+                  (UseUse->getOpcode() == ISD::STORE &&
+                   cast<StoreSDNode>(UseUse)->getBasePtr().getNode() == Use)))
+              RealUse = true;
+          }
+
+          if (!RealUse) {
+            TryNext = true;
+            break;
+          }
+        }
+      }
+
+      if (TryNext)
+        continue;
+
+      // Check for #2
+      if (!Op->isPredecessorOf(N) && !N->isPredecessorOf(Op)) {
+        SDValue Result = isLoad
+          ? DAG.getIndexedLoad(SDValue(N,0), N->getDebugLoc(),
+                               BasePtr, Offset, AM)
+          : DAG.getIndexedStore(SDValue(N,0), N->getDebugLoc(),
+                                BasePtr, Offset, AM);
+        ++PostIndexedNodes;
+        ++NodesCombined;
+        DEBUG(dbgs() << "\nReplacing.5 ";
+              N->dump(&DAG);
+              dbgs() << "\nWith: ";
+              Result.getNode()->dump(&DAG);
+              dbgs() << '\n');
+        WorkListRemover DeadNodes(*this);
+        if (isLoad) {
+          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0),
+                                        &DeadNodes);
+          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2),
+                                        &DeadNodes);
+        } else {
+          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1),
+                                        &DeadNodes);
+        }
+
+        // Finally, since the node is now dead, remove it from the graph.
+        DAG.DeleteNode(N);
+
+        // Replace the uses of Use with uses of the updated base value.
+        DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
+                                      Result.getValue(isLoad ? 1 : 0),
+                                      &DeadNodes);
+        removeFromWorkList(Op);
+        DAG.DeleteNode(Op);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+SDValue DAGCombiner::visitLOAD(SDNode *N) {
+  LoadSDNode *LD  = cast<LoadSDNode>(N);
+  SDValue Chain = LD->getChain();
+  SDValue Ptr   = LD->getBasePtr();
+
+  // If load is not volatile and there are no uses of the loaded value (and
+  // the updated indexed value in case of indexed loads), change uses of the
+  // chain value into uses of the chain input (i.e. delete the dead load).
+  if (!LD->isVolatile()) {
+    if (N->getValueType(1) == MVT::Other) {
+      // Unindexed loads.
+      if (N->hasNUsesOfValue(0, 0)) {
+        // It's not safe to use the two value CombineTo variant here. e.g.
+        // v1, chain2 = load chain1, loc
+        // v2, chain3 = load chain2, loc
+        // v3         = add v2, c
+        // Now we replace use of chain2 with chain1.  This makes the second load
+        // isomorphic to the one we are deleting, and thus makes this load live.
+        DEBUG(dbgs() << "\nReplacing.6 ";
+              N->dump(&DAG);
+              dbgs() << "\nWith chain: ";
+              Chain.getNode()->dump(&DAG);
+              dbgs() << "\n");
+        WorkListRemover DeadNodes(*this);
+        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain, &DeadNodes);
+
+        if (N->use_empty()) {
+          removeFromWorkList(N);
+          DAG.DeleteNode(N);
+        }
+
+        return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+      }
+    } else {
+      // Indexed loads.
+      assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");
+      if (N->hasNUsesOfValue(0, 0) && N->hasNUsesOfValue(0, 1)) {
+        SDValue Undef = DAG.getUNDEF(N->getValueType(0));
+        DEBUG(dbgs() << "\nReplacing.7 ";
+              N->dump(&DAG);
+              dbgs() << "\nWith: ";
+              Undef.getNode()->dump(&DAG);
+              dbgs() << " and 2 other values\n");
+        WorkListRemover DeadNodes(*this);
+        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef, &DeadNodes);
+        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1),
+                                      DAG.getUNDEF(N->getValueType(1)),
+                                      &DeadNodes);
+        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain, &DeadNodes);
+        removeFromWorkList(N);
+        DAG.DeleteNode(N);
+        return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+      }
+    }
+  }
+
+  // If this load is directly stored, replace the load value with the stored
+  // value.
+  // TODO: Handle store large -> read small portion.
+  // TODO: Handle TRUNCSTORE/LOADEXT
+  if (LD->getExtensionType() == ISD::NON_EXTLOAD &&
+      !LD->isVolatile()) {
+    if (ISD::isNON_TRUNCStore(Chain.getNode())) {
+      StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
+      if (PrevST->getBasePtr() == Ptr &&
+          PrevST->getValue().getValueType() == N->getValueType(0))
+        return CombineTo(N, Chain.getOperand(1), Chain);
+    }
+  }
+
+  // Try to infer better alignment information than the load already has.
+  if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
+    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
+      if (Align > LD->getAlignment())
+        return DAG.getExtLoad(LD->getExtensionType(), LD->getValueType(0),
+                              N->getDebugLoc(),
+                              Chain, Ptr, LD->getSrcValue(),
+                              LD->getSrcValueOffset(), LD->getMemoryVT(),
+                              LD->isVolatile(), LD->isNonTemporal(), Align);
+    }
+  }
+
+  if (CombinerAA) {
+    // Walk up chain skipping non-aliasing memory nodes.
+    SDValue BetterChain = FindBetterChain(N, Chain);
+
+    // If there is a better chain.
+    if (Chain != BetterChain) {
+      SDValue ReplLoad;
+
+      // Replace the chain to avoid dependency.
+      if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
+        ReplLoad = DAG.getLoad(N->getValueType(0), LD->getDebugLoc(),
+                               BetterChain, Ptr,
+                               LD->getSrcValue(), LD->getSrcValueOffset(),
+                               LD->isVolatile(), LD->isNonTemporal(),
+                               LD->getAlignment());
+      } else {
+        ReplLoad = DAG.getExtLoad(LD->getExtensionType(), LD->getValueType(0),
+                                  LD->getDebugLoc(),
+                                  BetterChain, Ptr, LD->getSrcValue(),
+                                  LD->getSrcValueOffset(),
+                                  LD->getMemoryVT(),
+                                  LD->isVolatile(),
+                                  LD->isNonTemporal(),
+                                  LD->getAlignment());
+      }
+
+      // Create token factor to keep old chain connected.
+      SDValue Token = DAG.getNode(ISD::TokenFactor, N->getDebugLoc(),
+                                  MVT::Other, Chain, ReplLoad.getValue(1));
+      
+      // Make sure the new and old chains are cleaned up.
+      AddToWorkList(Token.getNode());
+      
+      // Replace uses with load result and token factor. Don't add users
+      // to work list.
+      return CombineTo(N, ReplLoad.getValue(0), Token, false);
+    }
+  }
+
+  // Try transforming N to an indexed load.
+  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+/// CheckForMaskedLoad - Check to see if V is (and load (ptr), imm), where the
+/// load is having specific bytes cleared out.  If so, return the byte size
+/// being masked out and the shift amount.
+static std::pair<unsigned, unsigned>
+CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
+  std::pair<unsigned, unsigned> Result(0, 0);
+  
+  // Check for the structure we're looking for.
+  if (V->getOpcode() != ISD::AND ||
+      !isa<ConstantSDNode>(V->getOperand(1)) ||
+      !ISD::isNormalLoad(V->getOperand(0).getNode()))
+    return Result;
+  
+  // Check the chain and pointer.
+  LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
+  if (LD->getBasePtr() != Ptr) return Result;  // Not from same pointer.
+  
+  // The store should be chained directly to the load or be an operand of a
+  // tokenfactor.
+  if (LD == Chain.getNode())
+    ; // ok.
+  else if (Chain->getOpcode() != ISD::TokenFactor)
+    return Result; // Fail.
+  else {
+    bool isOk = false;
+    for (unsigned i = 0, e = Chain->getNumOperands(); i != e; ++i)
+      if (Chain->getOperand(i).getNode() == LD) {
+        isOk = true;
+        break;
+      }
+    if (!isOk) return Result;
+  }
+  
+  // This only handles simple types.
+  if (V.getValueType() != MVT::i16 &&
+      V.getValueType() != MVT::i32 &&
+      V.getValueType() != MVT::i64)
+    return Result;
+
+  // Check the constant mask.  Invert it so that the bits being masked out are
+  // 0 and the bits being kept are 1.  Use getSExtValue so that leading bits
+  // follow the sign bit for uniformity.
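+  // Worked example for an i32 value: (and (load P), 0xFFFF00FF) sign-extends
+  // to 0xFFFFFFFFFFFF00FF, so NotMask = 0xFF00, NotMaskLZ = 48 (adjusted to
+  // 16 below), NotMaskTZ = 8, and the result is (MaskedBytes = 1,
+  // ByteShift = 1): byte 1 of the value is being cleared.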
+  uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
+  unsigned NotMaskLZ = CountLeadingZeros_64(NotMask);
+  if (NotMaskLZ & 7) return Result;  // Must be multiple of a byte.
+  unsigned NotMaskTZ = CountTrailingZeros_64(NotMask);
+  if (NotMaskTZ & 7) return Result;  // Must be multiple of a byte.
+  if (NotMaskLZ == 64) return Result;  // Mask is all ones; nothing cleared.
+  
+  // See if we have a continuous run of bits.  If so, we have 0*1+0*
+  if (CountTrailingOnes_64(NotMask >> NotMaskTZ)+NotMaskTZ+NotMaskLZ != 64)
+    return Result;
+
+  // Adjust NotMaskLZ down to be from the actual size of the int instead of i64.
+  if (V.getValueType() != MVT::i64 && NotMaskLZ)
+    NotMaskLZ -= 64-V.getValueSizeInBits();
+  
+  unsigned MaskedBytes = (V.getValueSizeInBits()-NotMaskLZ-NotMaskTZ)/8;
+  switch (MaskedBytes) {
+  case 1: 
+  case 2: 
+  case 4: break;
+  default: return Result; // All one mask, or 5-byte mask.
+  }
+  
+  // Verify that the run of masked bytes starts at a multiple of the access
+  // width, so the narrowed access stays aligned to its own width.
+  if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result;
+  
+  Result.first = MaskedBytes;
+  Result.second = NotMaskTZ/8;
+  return Result;
+}
+
+
+/// ShrinkLoadReplaceStoreWithStore - Check to see if IVal is something that
+/// provides a value as specified by MaskInfo.  If so, replace the specified
+/// store with a narrower store of truncated IVal.
+static SDNode *
+ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
+                                SDValue IVal, StoreSDNode *St,
+                                DAGCombiner *DC) {
+  unsigned NumBytes = MaskInfo.first;
+  unsigned ByteShift = MaskInfo.second;
+  SelectionDAG &DAG = DC->getDAG();
+  
+  // Check to see if IVal is all zeros in the part being masked in by the 'or'
+  // that uses this.  If not, this is not a replacement.
+  APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
+                                  ByteShift*8, (ByteShift+NumBytes)*8);
+  if (!DAG.MaskedValueIsZero(IVal, Mask)) return 0;
+  
+  // Check that it is legal on the target to do this.  It is legal if the new
+  // VT we're shrinking to (i8/i16/i32) is legal or we're still before type
+  // legalization.
+  MVT VT = MVT::getIntegerVT(NumBytes*8);
+  if (!DC->isTypeLegal(VT))
+    return 0;
+  
+  // Okay, we can do this!  Replace the 'St' store with a store of IVal that is
+  // shifted by ByteShift and truncated down to NumBytes.
+  if (ByteShift)
+    IVal = DAG.getNode(ISD::SRL, IVal->getDebugLoc(), IVal.getValueType(), IVal,
+                       DAG.getConstant(ByteShift*8, DC->getShiftAmountTy()));
+
+  // Figure out the offset for the store and the alignment of the access.
+  unsigned StOffset;
+  unsigned NewAlign = St->getAlignment();
+
+  if (DAG.getTargetLoweringInfo().isLittleEndian())
+    StOffset = ByteShift;
+  else
+    StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;
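+  // For example, for a 4-byte IVal with NumBytes = 1 and ByteShift = 1, the
+  // interesting byte is at offset 1 on little-endian targets and at offset
+  // 4 - 1 - 1 = 2 on big-endian targets.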
+  
+  SDValue Ptr = St->getBasePtr();
+  if (StOffset) {
+    Ptr = DAG.getNode(ISD::ADD, IVal->getDebugLoc(), Ptr.getValueType(),
+                      Ptr, DAG.getConstant(StOffset, Ptr.getValueType()));
+    NewAlign = MinAlign(NewAlign, StOffset);
+  }
+  
+  // Truncate down to the new size.
+  IVal = DAG.getNode(ISD::TRUNCATE, IVal->getDebugLoc(), VT, IVal);
+  
+  ++OpsNarrowed;
+  return DAG.getStore(St->getChain(), St->getDebugLoc(), IVal, Ptr, 
+                      St->getSrcValue(), St->getSrcValueOffset()+StOffset,
+                      false, false, NewAlign).getNode();
+}
+
+
+/// ReduceLoadOpStoreWidth - Look for sequence of load / op / store where op is
+/// one of 'or', 'xor', and 'and' of immediates. If 'op' is only touching some
+/// of the loaded bits, try narrowing the load and store if it would end up
+/// being a win for performance or code size.
+SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
+  StoreSDNode *ST  = cast<StoreSDNode>(N);
+  if (ST->isVolatile())
+    return SDValue();
+
+  SDValue Chain = ST->getChain();
+  SDValue Value = ST->getValue();
+  SDValue Ptr   = ST->getBasePtr();
+  EVT VT = Value.getValueType();
+
+  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
+    return SDValue();
+
+  unsigned Opc = Value.getOpcode();
+  
+  // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
+  // is a byte mask indicating a consecutive number of bytes, check to see if
+  // Y is known to provide just those bytes.  If so, we try to replace the
+  // load + replace + store sequence with a single (narrower) store, which makes
+  // the load dead.
+  if (Opc == ISD::OR) {
+    std::pair<unsigned, unsigned> MaskedLoad;
+    MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
+    if (MaskedLoad.first)
+      if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
+                                                  Value.getOperand(1), ST,this))
+        return SDValue(NewST, 0);
+                                           
+    // Or is commutative, so try swapping X and Y.
+    MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
+    if (MaskedLoad.first)
+      if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
+                                                  Value.getOperand(0), ST,this))
+        return SDValue(NewST, 0);
+  }
+  
+  if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
+      Value.getOperand(1).getOpcode() != ISD::Constant)
+    return SDValue();
+
+  SDValue N0 = Value.getOperand(0);
+  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
+    LoadSDNode *LD = cast<LoadSDNode>(N0);
+    if (LD->getBasePtr() != Ptr)
+      return SDValue();
+
+    // Find the type to which to narrow the load / op / store.
+    SDValue N1 = Value.getOperand(1);
+    unsigned BitWidth = N1.getValueSizeInBits();
+    APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
+    if (Opc == ISD::AND)
+      Imm ^= APInt::getAllOnesValue(BitWidth);
+    if (Imm == 0 || Imm.isAllOnesValue())
+      return SDValue();
+    unsigned ShAmt = Imm.countTrailingZeros();
+    unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
+    unsigned NewBW = NextPowerOf2(MSB - ShAmt);
+    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
+    while (NewBW < BitWidth &&
+           !(TLI.isOperationLegalOrCustom(Opc, NewVT) &&
+             TLI.isNarrowingProfitable(VT, NewVT))) {
+      NewBW = NextPowerOf2(NewBW);
+      NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
+    }
+    if (NewBW >= BitWidth)
+      return SDValue();
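+    // For example, (or (load P), 0x00FF0000) on an i32 gives ShAmt = 16 and
+    // MSB = 23, so NewBW starts at 8; if the target finds i8 legal and
+    // narrowing profitable, the sequence becomes an i8 or of 0xFF on the
+    // byte holding bits [16,24).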
+
+    // If the lowest changed bit does not fall on a boundary of the new
+    // type's bitwidth, start at the previous boundary.
+    if (ShAmt % NewBW)
+      ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
+    APInt Mask = APInt::getBitsSet(BitWidth, ShAmt, ShAmt + NewBW);
+    if ((Imm & Mask) == Imm) {
+      APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
+      if (Opc == ISD::AND)
+        NewImm ^= APInt::getAllOnesValue(NewBW);
+      uint64_t PtrOff = ShAmt / 8;
+      // For big endian targets, we need to adjust the offset to the pointer to
+      // load the correct bytes.
+      if (TLI.isBigEndian())
+        PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
+
+      unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
+      const Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
+      if (NewAlign < TLI.getTargetData()->getABITypeAlignment(NewVTTy))
+        return SDValue();
+
+      SDValue NewPtr = DAG.getNode(ISD::ADD, LD->getDebugLoc(),
+                                   Ptr.getValueType(), Ptr,
+                                   DAG.getConstant(PtrOff, Ptr.getValueType()));
+      SDValue NewLD = DAG.getLoad(NewVT, N0.getDebugLoc(),
+                                  LD->getChain(), NewPtr,
+                                  LD->getSrcValue(), LD->getSrcValueOffset(),
+                                  LD->isVolatile(), LD->isNonTemporal(),
+                                  NewAlign);
+      SDValue NewVal = DAG.getNode(Opc, Value.getDebugLoc(), NewVT, NewLD,
+                                   DAG.getConstant(NewImm, NewVT));
+      SDValue NewST = DAG.getStore(Chain, N->getDebugLoc(),
+                                   NewVal, NewPtr,
+                                   ST->getSrcValue(), ST->getSrcValueOffset(),
+                                   false, false, NewAlign);
+
+      AddToWorkList(NewPtr.getNode());
+      AddToWorkList(NewLD.getNode());
+      AddToWorkList(NewVal.getNode());
+      WorkListRemover DeadNodes(*this);
+      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1),
+                                    &DeadNodes);
+      ++OpsNarrowed;
+      return NewST;
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSTORE(SDNode *N) {
+  StoreSDNode *ST  = cast<StoreSDNode>(N);
+  SDValue Chain = ST->getChain();
+  SDValue Value = ST->getValue();
+  SDValue Ptr   = ST->getBasePtr();
+
+  // If this is a store of a bit convert, store the input value if the
+  // resultant store does not need a higher alignment than the original.
+  if (Value.getOpcode() == ISD::BIT_CONVERT && !ST->isTruncatingStore() &&
+      ST->isUnindexed()) {
+    unsigned OrigAlign = ST->getAlignment();
+    EVT SVT = Value.getOperand(0).getValueType();
+    unsigned Align = TLI.getTargetData()->
+      getABITypeAlignment(SVT.getTypeForEVT(*DAG.getContext()));
+    if (Align <= OrigAlign &&
+        ((!LegalOperations && !ST->isVolatile()) ||
+         TLI.isOperationLegalOrCustom(ISD::STORE, SVT)))
+      return DAG.getStore(Chain, N->getDebugLoc(), Value.getOperand(0),
+                          Ptr, ST->getSrcValue(),
+                          ST->getSrcValueOffset(), ST->isVolatile(),
+                          ST->isNonTemporal(), OrigAlign);
+  }
+
+  // Turn 'store float 1.0, Ptr' -> 'store int 0x3F800000, Ptr'
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Value)) {
+    // NOTE: If the original store is volatile, this transform must not increase
+    // the number of stores.  For example, on x86-32 an f64 can be stored in one
+    // processor operation but an i64 (which is not legal) requires two.  So the
+    // transform should not be done in this case.
+    if (Value.getOpcode() != ISD::TargetConstantFP) {
+      SDValue Tmp;
+      switch (CFP->getValueType(0).getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unknown FP type");
+      case MVT::f80:    // We don't do this for these yet.
+      case MVT::f128:
+      case MVT::ppcf128:
+        break;
+      case MVT::f32:
+        if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) ||
+            TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
+          Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
+                              bitcastToAPInt().getZExtValue(), MVT::i32);
+          return DAG.getStore(Chain, N->getDebugLoc(), Tmp,
+                              Ptr, ST->getSrcValue(),
+                              ST->getSrcValueOffset(), ST->isVolatile(),
+                              ST->isNonTemporal(), ST->getAlignment());
+        }
+        break;
+      case MVT::f64:
+        if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
+             !ST->isVolatile()) ||
+            TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) {
+          Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
+                                getZExtValue(), MVT::i64);
+          return DAG.getStore(Chain, N->getDebugLoc(), Tmp,
+                              Ptr, ST->getSrcValue(),
+                              ST->getSrcValueOffset(), ST->isVolatile(),
+                              ST->isNonTemporal(), ST->getAlignment());
+        } else if (!ST->isVolatile() &&
+                   TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
+          // Many FP stores are not made apparent until after legalize, e.g. for
+          // argument passing.  Since this is so common, custom legalize the
+          // 64-bit integer store into two 32-bit stores.
+          uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+          SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, MVT::i32);
+          SDValue Hi = DAG.getConstant(Val >> 32, MVT::i32);
+          if (TLI.isBigEndian()) std::swap(Lo, Hi);
+
+          int SVOffset = ST->getSrcValueOffset();
+          unsigned Alignment = ST->getAlignment();
+          bool isVolatile = ST->isVolatile();
+          bool isNonTemporal = ST->isNonTemporal();
+
+          SDValue St0 = DAG.getStore(Chain, ST->getDebugLoc(), Lo,
+                                     Ptr, ST->getSrcValue(),
+                                     ST->getSrcValueOffset(),
+                                     isVolatile, isNonTemporal,
+                                     ST->getAlignment());
+          Ptr = DAG.getNode(ISD::ADD, N->getDebugLoc(), Ptr.getValueType(), Ptr,
+                            DAG.getConstant(4, Ptr.getValueType()));
+          SVOffset += 4;
+          Alignment = MinAlign(Alignment, 4U);
+          SDValue St1 = DAG.getStore(Chain, ST->getDebugLoc(), Hi,
+                                     Ptr, ST->getSrcValue(),
+                                     SVOffset, isVolatile, isNonTemporal,
+                                     Alignment);
+          return DAG.getNode(ISD::TokenFactor, N->getDebugLoc(), MVT::Other,
+                             St0, St1);
+        }
+
+        break;
+      }
+    }
+  }
+
+  // Try to infer better alignment information than the store already has.
+  if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
+    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
+      if (Align > ST->getAlignment())
+        return DAG.getTruncStore(Chain, N->getDebugLoc(), Value,
+                                 Ptr, ST->getSrcValue(),
+                                 ST->getSrcValueOffset(), ST->getMemoryVT(),
+                                 ST->isVolatile(), ST->isNonTemporal(), Align);
+    }
+  }
+
+  if (CombinerAA) {
+    // Walk up chain skipping non-aliasing memory nodes.
+    SDValue BetterChain = FindBetterChain(N, Chain);
+
+    // If there is a better chain.
+    if (Chain != BetterChain) {
+      SDValue ReplStore;
+
+      // Replace the chain to avoid dependency.
+      if (ST->isTruncatingStore()) {
+        ReplStore = DAG.getTruncStore(BetterChain, N->getDebugLoc(), Value, Ptr,
+                                      ST->getSrcValue(),ST->getSrcValueOffset(),
+                                      ST->getMemoryVT(), ST->isVolatile(),
+                                      ST->isNonTemporal(), ST->getAlignment());
+      } else {
+        ReplStore = DAG.getStore(BetterChain, N->getDebugLoc(), Value, Ptr,
+                                 ST->getSrcValue(), ST->getSrcValueOffset(),
+                                 ST->isVolatile(), ST->isNonTemporal(),
+                                 ST->getAlignment());
+      }
+
+      // Create token to keep both nodes around.
+      SDValue Token = DAG.getNode(ISD::TokenFactor, N->getDebugLoc(),
+                                  MVT::Other, Chain, ReplStore);
+
+      // Make sure the new and old chains are cleaned up.
+      AddToWorkList(Token.getNode());
+
+      // Don't add users to work list.
+      return CombineTo(N, Token, false);
+    }
+  }
+
+  // Try transforming N to an indexed store.
+  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+    return SDValue(N, 0);
+
+  // FIXME: is there such a thing as a truncating indexed store?
+  if (ST->isTruncatingStore() && ST->isUnindexed() &&
+      Value.getValueType().isInteger()) {
+    // See if we can simplify the input to this truncstore with knowledge that
+    // only the low bits are being used.  For example:
+    // "truncstore (or (shl x, 8), y), i8"  -> "truncstore y, i8"
+    SDValue Shorter =
+      GetDemandedBits(Value,
+                      APInt::getLowBitsSet(Value.getValueSizeInBits(),
+                                           ST->getMemoryVT().getSizeInBits()));
+    AddToWorkList(Value.getNode());
+    if (Shorter.getNode())
+      return DAG.getTruncStore(Chain, N->getDebugLoc(), Shorter,
+                               Ptr, ST->getSrcValue(),
+                               ST->getSrcValueOffset(), ST->getMemoryVT(),
+                               ST->isVolatile(), ST->isNonTemporal(),
+                               ST->getAlignment());
+
+    // Otherwise, see if we can simplify the operation with
+    // SimplifyDemandedBits, which only works if the value has a single use.
+    if (SimplifyDemandedBits(Value,
+                             APInt::getLowBitsSet(
+                               Value.getValueType().getScalarType().getSizeInBits(),
+                               ST->getMemoryVT().getScalarType().getSizeInBits())))
+      return SDValue(N, 0);
+  }
+
+  // If this is a load followed by a store to the same location, then the store
+  // is dead/noop.
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
+    if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
+        ST->isUnindexed() && !ST->isVolatile() &&
+        // There can't be any side effects between the load and store, such as
+        // a call or store.
+        Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
+      // The store is dead, remove it.
+      return Chain;
+    }
+  }
+
+  // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
+  // truncating store.  We can do this even if this is already a truncstore.
+  if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
+      && Value.getNode()->hasOneUse() && ST->isUnindexed() &&
+      TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
+                            ST->getMemoryVT())) {
+    return DAG.getTruncStore(Chain, N->getDebugLoc(), Value.getOperand(0),
+                             Ptr, ST->getSrcValue(),
+                             ST->getSrcValueOffset(), ST->getMemoryVT(),
+                             ST->isVolatile(), ST->isNonTemporal(),
+                             ST->getAlignment());
+  }
+
+  return ReduceLoadOpStoreWidth(N);
+}
+
+SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
+  SDValue InVec = N->getOperand(0);
+  SDValue InVal = N->getOperand(1);
+  SDValue EltNo = N->getOperand(2);
+
+  // If the inserted element is an UNDEF, just use the input vector.
+  if (InVal.getOpcode() == ISD::UNDEF)
+    return InVec;
+
+  // If the invec is a BUILD_VECTOR and if EltNo is a constant, build a new
+  // vector with the inserted element.
+  if (InVec.getOpcode() == ISD::BUILD_VECTOR && isa<ConstantSDNode>(EltNo)) {
+    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+    SmallVector<SDValue, 8> Ops(InVec.getNode()->op_begin(),
+                                InVec.getNode()->op_end());
+    if (Elt < Ops.size())
+      Ops[Elt] = InVal;
+    return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+                       InVec.getValueType(), &Ops[0], Ops.size());
+  }
+  // If the invec is an UNDEF and if EltNo is a constant, create a new 
+  // BUILD_VECTOR with undef elements and the inserted element.
+  if (!LegalOperations && InVec.getOpcode() == ISD::UNDEF && 
+      isa<ConstantSDNode>(EltNo)) {
+    EVT VT = InVec.getValueType();
+    EVT EltVT = VT.getVectorElementType();
+    unsigned NElts = VT.getVectorNumElements();
+    SmallVector<SDValue, 8> Ops(NElts, DAG.getUNDEF(EltVT));
+
+    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+    if (Elt < Ops.size())
+      Ops[Elt] = InVal;
+    return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+                       InVec.getValueType(), &Ops[0], Ops.size());
+  }
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
+  // (vextract (scalar_to_vector val, 0) -> val
+  SDValue InVec = N->getOperand(0);
+
+  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    // Check if the result type doesn't match the inserted element type. A
+    // SCALAR_TO_VECTOR may truncate the inserted element and the
+    // EXTRACT_VECTOR_ELT may widen the extracted vector.
+    SDValue InOp = InVec.getOperand(0);
+    EVT NVT = N->getValueType(0);
+    if (InOp.getValueType() != NVT) {
+      assert(InOp.getValueType().isInteger() && NVT.isInteger());
+      return DAG.getSExtOrTrunc(InOp, InVec.getDebugLoc(), NVT);
+    }
+    return InOp;
+  }
+
+  // Perform only after legalization to ensure build_vector / vector_shuffle
+  // optimizations have already been done.
+  if (!LegalOperations) return SDValue();
+
+  // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
+  // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
+  // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)
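+  // e.g. (vextract (v4f32 load $addr), 2) -> (f32 load $addr+8) on a
+  // little-endian target.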
+  SDValue EltNo = N->getOperand(1);
+
+  if (isa<ConstantSDNode>(EltNo)) {
+    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+    bool NewLoad = false;
+    bool BCNumEltsChanged = false;
+    EVT VT = InVec.getValueType();
+    EVT ExtVT = VT.getVectorElementType();
+    EVT LVT = ExtVT;
+
+    if (InVec.getOpcode() == ISD::BIT_CONVERT) {
+      EVT BCVT = InVec.getOperand(0).getValueType();
+      if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
+        return SDValue();
+      if (VT.getVectorNumElements() != BCVT.getVectorNumElements())
+        BCNumEltsChanged = true;
+      InVec = InVec.getOperand(0);
+      ExtVT = BCVT.getVectorElementType();
+      NewLoad = true;
+    }
+
+    LoadSDNode *LN0 = NULL;
+    const ShuffleVectorSDNode *SVN = NULL;
+    if (ISD::isNormalLoad(InVec.getNode())) {
+      LN0 = cast<LoadSDNode>(InVec);
+    } else if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+               InVec.getOperand(0).getValueType() == ExtVT &&
+               ISD::isNormalLoad(InVec.getOperand(0).getNode())) {
+      LN0 = cast<LoadSDNode>(InVec.getOperand(0));
+    } else if ((SVN = dyn_cast<ShuffleVectorSDNode>(InVec))) {
+      // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
+      // =>
+      // (load $addr+1*size)
+
+      // If the bit convert changed the number of elements, it is unsafe
+      // to examine the mask.
+      if (BCNumEltsChanged)
+        return SDValue();
+
+      // Select the input vector, guarding against an out-of-range index.
+      unsigned NumElems = VT.getVectorNumElements();
+      int Idx = (Elt > NumElems) ? -1 : SVN->getMaskElt(Elt);
+      InVec = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1);
+
+      if (InVec.getOpcode() == ISD::BIT_CONVERT)
+        InVec = InVec.getOperand(0);
+      if (ISD::isNormalLoad(InVec.getNode())) {
+        LN0 = cast<LoadSDNode>(InVec);
+        Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems;
+      }
+    }
+
+    if (!LN0 || !LN0->hasOneUse() || LN0->isVolatile())
+      return SDValue();
+
+    unsigned Align = LN0->getAlignment();
+    if (NewLoad) {
+      // Check the resultant load doesn't need a higher alignment than the
+      // original load.
+      unsigned NewAlign = TLI.getTargetData()->
+        getABITypeAlignment(LVT.getTypeForEVT(*DAG.getContext()));
+
+      if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, LVT))
+        return SDValue();
+
+      Align = NewAlign;
+    }
+
+    SDValue NewPtr = LN0->getBasePtr();
+    if (Elt) {
+      unsigned PtrOff = LVT.getSizeInBits() * Elt / 8;
+      EVT PtrType = NewPtr.getValueType();
+      if (TLI.isBigEndian())
+        PtrOff = VT.getSizeInBits() / 8 - PtrOff;
+      NewPtr = DAG.getNode(ISD::ADD, N->getDebugLoc(), PtrType, NewPtr,
+                           DAG.getConstant(PtrOff, PtrType));
+    }
+
+    return DAG.getLoad(LVT, N->getDebugLoc(), LN0->getChain(), NewPtr,
+                       LN0->getSrcValue(), LN0->getSrcValueOffset(),
+                       LN0->isVolatile(), LN0->isNonTemporal(), Align);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
+  unsigned NumInScalars = N->getNumOperands();
+  EVT VT = N->getValueType(0);
+
+  // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
+  // operations.  If so, and if the EXTRACT_VECTOR_ELT vector inputs come from
+  // at most two distinct vectors, turn this into a shuffle node.
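+  // For example, (build_vector (extract_elt A, 0), (extract_elt A, 1),
+  //                            (extract_elt B, 0), (extract_elt B, 1))
+  // becomes (vector_shuffle A, B, <0, 1, 4, 5>).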
+  SDValue VecIn1, VecIn2;
+  for (unsigned i = 0; i != NumInScalars; ++i) {
+    // Ignore undef inputs.
+    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+
+    // If this input is something other than a EXTRACT_VECTOR_ELT with a
+    // constant index, bail out.
+    if (N->getOperand(i).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(N->getOperand(i).getOperand(1))) {
+      VecIn1 = VecIn2 = SDValue(0, 0);
+      break;
+    }
+
+    // If the input vector type disagrees with the result of the build_vector,
+    // we can't make a shuffle.
+    SDValue ExtractedFromVec = N->getOperand(i).getOperand(0);
+    if (ExtractedFromVec.getValueType() != VT) {
+      VecIn1 = VecIn2 = SDValue(0, 0);
+      break;
+    }
+
+    // Otherwise, remember this.  We allow up to two distinct input vectors.
+    if (ExtractedFromVec == VecIn1 || ExtractedFromVec == VecIn2)
+      continue;
+
+    if (VecIn1.getNode() == 0) {
+      VecIn1 = ExtractedFromVec;
+    } else if (VecIn2.getNode() == 0) {
+      VecIn2 = ExtractedFromVec;
+    } else {
+      // Too many inputs.
+      VecIn1 = VecIn2 = SDValue(0, 0);
+      break;
+    }
+  }
+
+  // If everything is good, we can make a shuffle operation.
+  if (VecIn1.getNode()) {
+    SmallVector<int, 8> Mask;
+    for (unsigned i = 0; i != NumInScalars; ++i) {
+      if (N->getOperand(i).getOpcode() == ISD::UNDEF) {
+        Mask.push_back(-1);
+        continue;
+      }
+
+      // If extracting from the first vector, just use the index directly.
+      SDValue Extract = N->getOperand(i);
+      SDValue ExtVal = Extract.getOperand(1);
+      if (Extract.getOperand(0) == VecIn1) {
+        unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue();
+        if (ExtIndex > VT.getVectorNumElements())
+          return SDValue();
+        
+        Mask.push_back(ExtIndex);
+        continue;
+      }
+
+      // Otherwise, use InIdx + VecSize
+      unsigned Idx = cast<ConstantSDNode>(ExtVal)->getZExtValue();
+      Mask.push_back(Idx+NumInScalars);
+    }
+
+    // We can only generate the shuffle if the vector type is legal.
+    if (!isTypeLegal(VT))
+      return SDValue();
+
+    // Return the new VECTOR_SHUFFLE node.
+    SDValue Ops[2];
+    Ops[0] = VecIn1;
+    Ops[1] = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+    return DAG.getVectorShuffle(VT, N->getDebugLoc(), Ops[0], Ops[1], &Mask[0]);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
+  // TODO: Check to see if this is a CONCAT_VECTORS of a bunch of
+  // EXTRACT_SUBVECTOR operations.  If so, and if the EXTRACT_SUBVECTOR vector
+  // inputs come from at most two distinct vectors, turn this into a shuffle
+  // node.
+
+  // If we only have one input vector, we don't need to do any concatenation.
+  if (N->getNumOperands() == 1)
+    return N->getOperand(0);
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SDValue N0 = N->getOperand(0);
+
+  assert(N0.getValueType().getVectorNumElements() == NumElts &&
+        "Vector shuffle must be normalized in DAG");
+
+  // FIXME: implement canonicalizations from DAG.getVectorShuffle()
+
+  // If it is a splat, check if the argument vector is a build_vector with
+  // all scalar elements the same.
+  if (cast<ShuffleVectorSDNode>(N)->isSplat()) {
+    SDNode *V = N0.getNode();
+
+    // If this is a bit convert that changes the element type of the vector but
+    // not the number of vector elements, look through it.  Be careful not to
+    // look through conversions that change things like v4f32 to v2f64.
+    if (V->getOpcode() == ISD::BIT_CONVERT) {
+      SDValue ConvInput = V->getOperand(0);
+      if (ConvInput.getValueType().isVector() &&
+          ConvInput.getValueType().getVectorNumElements() == NumElts)
+        V = ConvInput.getNode();
+    }
+
+    if (V->getOpcode() == ISD::BUILD_VECTOR) {
+      unsigned NumElems = V->getNumOperands();
+      unsigned BaseIdx = cast<ShuffleVectorSDNode>(N)->getSplatIndex();
+      if (NumElems > BaseIdx) {
+        SDValue Base;
+        bool AllSame = true;
+        for (unsigned i = 0; i != NumElems; ++i) {
+          if (V->getOperand(i).getOpcode() != ISD::UNDEF) {
+            Base = V->getOperand(i);
+            break;
+          }
+        }
+        // Splat of <u, u, u, u>, return <u, u, u, u>
+        if (!Base.getNode())
+          return N0;
+        for (unsigned i = 0; i != NumElems; ++i) {
+          if (V->getOperand(i) != Base) {
+            AllSame = false;
+            break;
+          }
+        }
+        // Splat of <x, x, x, x>, return <x, x, x, x>
+        if (AllSame)
+          return N0;
+      }
+    }
+  }
+  return SDValue();
+}
+
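+// Fold a fence / atomic-op / fence sequence down to the bare atomic
+// operation when the target folds fences into its atomic instructions:
+// rewire the atomic past the leading MEMBARRIER and replace this trailing
+// MEMBARRIER with the atomic's chain result.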
+SDValue DAGCombiner::visitMEMBARRIER(SDNode *N) {
+  if (!TLI.getShouldFoldAtomicFences())
+    return SDValue();
+
+  SDValue atomic = N->getOperand(0);
+  switch (atomic.getOpcode()) {
+    case ISD::ATOMIC_CMP_SWAP:
+    case ISD::ATOMIC_SWAP:
+    case ISD::ATOMIC_LOAD_ADD:
+    case ISD::ATOMIC_LOAD_SUB:
+    case ISD::ATOMIC_LOAD_AND:
+    case ISD::ATOMIC_LOAD_OR:
+    case ISD::ATOMIC_LOAD_XOR:
+    case ISD::ATOMIC_LOAD_NAND:
+    case ISD::ATOMIC_LOAD_MIN:
+    case ISD::ATOMIC_LOAD_MAX:
+    case ISD::ATOMIC_LOAD_UMIN:
+    case ISD::ATOMIC_LOAD_UMAX:
+      break;
+    default:
+      return SDValue();
+  }
+
+  SDValue fence = atomic.getOperand(0);
+  if (fence.getOpcode() != ISD::MEMBARRIER)
+    return SDValue();
+
+  switch (atomic.getOpcode()) {
+    case ISD::ATOMIC_CMP_SWAP:
+      return SDValue(DAG.UpdateNodeOperands(atomic.getNode(),
+                                    fence.getOperand(0),
+                                    atomic.getOperand(1), atomic.getOperand(2),
+                                    atomic.getOperand(3)), atomic.getResNo());
+    case ISD::ATOMIC_SWAP:
+    case ISD::ATOMIC_LOAD_ADD:
+    case ISD::ATOMIC_LOAD_SUB:
+    case ISD::ATOMIC_LOAD_AND:
+    case ISD::ATOMIC_LOAD_OR:
+    case ISD::ATOMIC_LOAD_XOR:
+    case ISD::ATOMIC_LOAD_NAND:
+    case ISD::ATOMIC_LOAD_MIN:
+    case ISD::ATOMIC_LOAD_MAX:
+    case ISD::ATOMIC_LOAD_UMIN:
+    case ISD::ATOMIC_LOAD_UMAX:
+      return SDValue(DAG.UpdateNodeOperands(atomic.getNode(),
+                                    fence.getOperand(0),
+                                    atomic.getOperand(1), atomic.getOperand(2)),
+                     atomic.getResNo());
+    default:
+      return SDValue();
+  }
+}
+
+/// XformToShuffleWithZero - Returns a vector_shuffle if it is able to
+/// transform an AND to a vector_shuffle with the destination vector and a
+/// zero vector.
+/// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
+///      vector_shuffle V, Zero, <0, 4, 2, 4>
+SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if (N->getOpcode() == ISD::AND) {
+    if (RHS.getOpcode() == ISD::BIT_CONVERT)
+      RHS = RHS.getOperand(0);
+    if (RHS.getOpcode() == ISD::BUILD_VECTOR) {
+      SmallVector<int, 8> Indices;
+      unsigned NumElts = RHS.getNumOperands();
+      for (unsigned i = 0; i != NumElts; ++i) {
+        SDValue Elt = RHS.getOperand(i);
+        if (!isa<ConstantSDNode>(Elt))
+          return SDValue();
+        else if (cast<ConstantSDNode>(Elt)->isAllOnesValue())
+          Indices.push_back(i);
+        else if (cast<ConstantSDNode>(Elt)->isNullValue())
+          Indices.push_back(NumElts);
+        else
+          return SDValue();
+      }
+
+      // Let's see if the target supports this vector_shuffle.
+      EVT RVT = RHS.getValueType();
+      if (!TLI.isVectorClearMaskLegal(Indices, RVT))
+        return SDValue();
+
+      // Return the new VECTOR_SHUFFLE node.
+      EVT EltVT = RVT.getVectorElementType();
+      SmallVector<SDValue,8> ZeroOps(RVT.getVectorNumElements(),
+                                     DAG.getConstant(0, EltVT));
+      SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+                                 RVT, &ZeroOps[0], ZeroOps.size());
+      LHS = DAG.getNode(ISD::BIT_CONVERT, dl, RVT, LHS);
+      SDValue Shuf = DAG.getVectorShuffle(RVT, dl, LHS, Zero, &Indices[0]);
+      return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Shuf);
+    }
+  }
+
+  return SDValue();
+}
+
+/// SimplifyVBinOp - Visit a binary vector operation, like ADD.
+SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
+  // After legalize, the target may be depending on adds and other
+  // binary ops to provide legal ways to construct constants or other
+  // things. Simplifying them may result in a loss of legality.
+  if (LegalOperations) return SDValue();
+
+  EVT VT = N->getValueType(0);
+  assert(VT.isVector() && "SimplifyVBinOp only works on vectors!");
+
+  EVT EltType = VT.getVectorElementType();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDValue Shuffle = XformToShuffleWithZero(N);
+  if (Shuffle.getNode()) return Shuffle;
+
+  // If the LHS and RHS are BUILD_VECTOR nodes, see if we can constant fold
+  // this operation.
+  if (LHS.getOpcode() == ISD::BUILD_VECTOR &&
+      RHS.getOpcode() == ISD::BUILD_VECTOR) {
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) {
+      SDValue LHSOp = LHS.getOperand(i);
+      SDValue RHSOp = RHS.getOperand(i);
+      // If these two elements can't be folded, bail out.
+      if ((LHSOp.getOpcode() != ISD::UNDEF &&
+           LHSOp.getOpcode() != ISD::Constant &&
+           LHSOp.getOpcode() != ISD::ConstantFP) ||
+          (RHSOp.getOpcode() != ISD::UNDEF &&
+           RHSOp.getOpcode() != ISD::Constant &&
+           RHSOp.getOpcode() != ISD::ConstantFP))
+        break;
+
+      // Can't fold divide by zero.
+      if (N->getOpcode() == ISD::SDIV || N->getOpcode() == ISD::UDIV ||
+          N->getOpcode() == ISD::FDIV) {
+        if ((RHSOp.getOpcode() == ISD::Constant &&
+             cast<ConstantSDNode>(RHSOp.getNode())->isNullValue()) ||
+            (RHSOp.getOpcode() == ISD::ConstantFP &&
+             cast<ConstantFPSDNode>(RHSOp.getNode())->getValueAPF().isZero()))
+          break;
+      }
+
+      // If the vector element type is not legal, the BUILD_VECTOR operands
+      // are promoted and implicitly truncated.  Make that explicit here.
+      if (LHSOp.getValueType() != EltType)
+        LHSOp = DAG.getNode(ISD::TRUNCATE, LHS.getDebugLoc(), EltType, LHSOp);
+      if (RHSOp.getValueType() != EltType)
+        RHSOp = DAG.getNode(ISD::TRUNCATE, RHS.getDebugLoc(), EltType, RHSOp);
+
+      SDValue FoldOp = DAG.getNode(N->getOpcode(), LHS.getDebugLoc(), EltType,
+                                   LHSOp, RHSOp);
+      if (FoldOp.getOpcode() != ISD::UNDEF &&
+          FoldOp.getOpcode() != ISD::Constant &&
+          FoldOp.getOpcode() != ISD::ConstantFP)
+        break;
+      Ops.push_back(FoldOp);
+      AddToWorkList(FoldOp.getNode());
+    }
+
+    if (Ops.size() == LHS.getNumOperands()) {
+      EVT VT = LHS.getValueType();
+      return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
+                         &Ops[0], Ops.size());
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::SimplifySelect(DebugLoc DL, SDValue N0,
+                                    SDValue N1, SDValue N2){
+  assert(N0.getOpcode() == ISD::SETCC && "First argument must be a SetCC node!");
+
+  SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2,
+                                 cast<CondCodeSDNode>(N0.getOperand(2))->get());
+
+  // If we got a simplified select_cc node back from SimplifySelectCC, then
+  // break it down into a new SETCC node, and a new SELECT node, and then return
+  // the SELECT node, since we were called with a SELECT node.
+  if (SCC.getNode()) {
+    // Check to see if we got a select_cc back (to turn into setcc/select).
+    // Otherwise, just return whatever node we got back, like fabs.
+    if (SCC.getOpcode() == ISD::SELECT_CC) {
+      SDValue SETCC = DAG.getNode(ISD::SETCC, N0.getDebugLoc(),
+                                  N0.getValueType(),
+                                  SCC.getOperand(0), SCC.getOperand(1),
+                                  SCC.getOperand(4));
+      AddToWorkList(SETCC.getNode());
+      return DAG.getNode(ISD::SELECT, SCC.getDebugLoc(), SCC.getValueType(),
+                         SCC.getOperand(2), SCC.getOperand(3), SETCC);
+    }
+
+    return SCC;
+  }
+  return SDValue();
+}
+
+/// SimplifySelectOps - Given a SELECT or a SELECT_CC node, where LHS and RHS
+/// are the two values being selected between, see if we can simplify the
+/// select.  Callers of this should assume that TheSelect is deleted if this
+/// returns true.  As such, they should return the appropriate thing (e.g. the
+/// node) back to the top-level of the DAG combiner loop to avoid it being
+/// looked at.
+bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
+                                    SDValue RHS) {
+
+  // If this is a select from two identical things, try to pull the operation
+  // through the select.
+  if (LHS.getOpcode() == RHS.getOpcode() && LHS.hasOneUse() && RHS.hasOneUse()){
+    // If this is a load and the token chain is identical, replace the select
+    // of two loads with a load through a select of the address to load from.
+    // This triggers in things like "select bool X, 10.0, 123.0" after the FP
+    // constants have been dropped into the constant pool.
+    if (LHS.getOpcode() == ISD::LOAD &&
+        // Do not let this transformation reduce the number of volatile loads.
+        !cast<LoadSDNode>(LHS)->isVolatile() &&
+        !cast<LoadSDNode>(RHS)->isVolatile() &&
+        // Token chains must be identical.
+        LHS.getOperand(0) == RHS.getOperand(0)) {
+      LoadSDNode *LLD = cast<LoadSDNode>(LHS);
+      LoadSDNode *RLD = cast<LoadSDNode>(RHS);
+
+      // If this is an EXTLOAD, the VTs must match.
+      if (LLD->getMemoryVT() == RLD->getMemoryVT()) {
+        // FIXME: this discards src value information.  This is
+        // over-conservative. It would be beneficial to be able to remember
+        // both potential memory locations.  Since we are discarding
+        // src value info, don't do the transformation if the memory
+        // locations are not in the default address space.
+        unsigned LLDAddrSpace = 0, RLDAddrSpace = 0;
+        if (const Value *LLDVal = LLD->getMemOperand()->getValue()) {
+          if (const PointerType *PT = dyn_cast<PointerType>(LLDVal->getType()))
+            LLDAddrSpace = PT->getAddressSpace();
+        }
+        if (const Value *RLDVal = RLD->getMemOperand()->getValue()) {
+          if (const PointerType *PT = dyn_cast<PointerType>(RLDVal->getType()))
+            RLDAddrSpace = PT->getAddressSpace();
+        }
+        SDValue Addr;
+        if (LLDAddrSpace == 0 && RLDAddrSpace == 0) {
+          if (TheSelect->getOpcode() == ISD::SELECT) {
+            // Check that the condition doesn't reach either load.  If it does,
+            // folding this will induce a cycle into the DAG.
+            if ((!LLD->hasAnyUseOfValue(1) ||
+                 !LLD->isPredecessorOf(TheSelect->getOperand(0).getNode())) &&
+                (!RLD->hasAnyUseOfValue(1) ||
+                 !RLD->isPredecessorOf(TheSelect->getOperand(0).getNode()))) {
+              Addr = DAG.getNode(ISD::SELECT, TheSelect->getDebugLoc(),
+                                 LLD->getBasePtr().getValueType(),
+                                 TheSelect->getOperand(0), LLD->getBasePtr(),
+                                 RLD->getBasePtr());
+            }
+          } else {
+            // Check that the condition doesn't reach either load.  If it does,
+            // folding this will induce a cycle into the DAG.
+            if ((!LLD->hasAnyUseOfValue(1) ||
+                 (!LLD->isPredecessorOf(TheSelect->getOperand(0).getNode()) &&
+                  !LLD->isPredecessorOf(TheSelect->getOperand(1).getNode()))) &&
+                (!RLD->hasAnyUseOfValue(1) ||
+                 (!RLD->isPredecessorOf(TheSelect->getOperand(0).getNode()) &&
+                  !RLD->isPredecessorOf(TheSelect->getOperand(1).getNode())))) {
+              Addr = DAG.getNode(ISD::SELECT_CC, TheSelect->getDebugLoc(),
+                                 LLD->getBasePtr().getValueType(),
+                                 TheSelect->getOperand(0),
+                                 TheSelect->getOperand(1),
+                                 LLD->getBasePtr(), RLD->getBasePtr(),
+                                 TheSelect->getOperand(4));
+            }
+          }
+        }
+
+        if (Addr.getNode()) {
+          SDValue Load;
+          if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
+            Load = DAG.getLoad(TheSelect->getValueType(0),
+                               TheSelect->getDebugLoc(),
+                               LLD->getChain(),
+                               Addr, 0, 0,
+                               LLD->isVolatile(),
+                               LLD->isNonTemporal(),
+                               LLD->getAlignment());
+          } else {
+            Load = DAG.getExtLoad(LLD->getExtensionType(),
+                                  TheSelect->getValueType(0),
+                                  TheSelect->getDebugLoc(),
+                                  LLD->getChain(), Addr, 0, 0,
+                                  LLD->getMemoryVT(),
+                                  LLD->isVolatile(),
+                                  LLD->isNonTemporal(),
+                                  LLD->getAlignment());
+          }
+
+          // Users of the select now use the result of the load.
+          CombineTo(TheSelect, Load);
+
+          // Users of the old loads now use the new load's chain.  We know the
+          // old-load value is dead now.
+          CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
+          CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+/// SimplifySelectCC - Simplify an expression of the form (N0 cond N1) ? N2 : N3
+/// where 'cond' is the comparison specified by CC.
+SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
+                                      SDValue N2, SDValue N3,
+                                      ISD::CondCode CC, bool NotExtCompare) {
+  // (x ? y : y) -> y.
+  if (N2 == N3) return N2;
+  
+  EVT VT = N2.getValueType();
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
+  ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3.getNode());
+
+  // Determine if the condition we're dealing with is constant
+  SDValue SCC = SimplifySetCC(TLI.getSetCCResultType(N0.getValueType()),
+                              N0, N1, CC, DL, false);
+  if (SCC.getNode()) AddToWorkList(SCC.getNode());
+  ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode());
+
+  // fold select_cc true, x, y -> x
+  if (SCCC && !SCCC->isNullValue())
+    return N2;
+  // fold select_cc false, x, y -> y
+  if (SCCC && SCCC->isNullValue())
+    return N3;
+
+  // Check to see if we can simplify the select into an fabs node
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1)) {
+    // Allow either -0.0 or 0.0
+    if (CFP->getValueAPF().isZero()) {
+      // select (setg[te] X, +/-0.0), X, fneg(X) -> fabs
+      if ((CC == ISD::SETGE || CC == ISD::SETGT) &&
+          N0 == N2 && N3.getOpcode() == ISD::FNEG &&
+          N2 == N3.getOperand(0))
+        return DAG.getNode(ISD::FABS, DL, VT, N0);
+
+      // select (setl[te] X, +/-0.0), fneg(X), X -> fabs
+      if ((CC == ISD::SETLT || CC == ISD::SETLE) &&
+          N0 == N3 && N2.getOpcode() == ISD::FNEG &&
+          N2.getOperand(0) == N3)
+        return DAG.getNode(ISD::FABS, DL, VT, N3);
+    }
+  }
+  
+  // Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 4 : 0))"
+  // where "tmp" is a constant pool entry containing an array with 2.0 and 1.0
+  // in it.  This is a win when the constant is not otherwise available because
+  // it replaces two constant pool loads with one.  We only do this if the FP
+  // type is known to be legal, because if it isn't, then we are before legalize
+  // types and we want the other legalization to happen first (e.g. to avoid
+  // messing with soft float), and only if the ConstantFP is not legal, because
+  // if it is legal, we may not need to store the FP constant in a constant pool.
+  if (ConstantFPSDNode *TV = dyn_cast<ConstantFPSDNode>(N2))
+    if (ConstantFPSDNode *FV = dyn_cast<ConstantFPSDNode>(N3)) {
+      if (TLI.isTypeLegal(N2.getValueType()) &&
+          (TLI.getOperationAction(ISD::ConstantFP, N2.getValueType()) !=
+           TargetLowering::Legal) &&
+          // If both constants have multiple uses, then we won't need to do an
+          // extra load, they are likely around in registers for other users.
+          (TV->hasOneUse() || FV->hasOneUse())) {
+        Constant *Elts[] = {
+          const_cast<ConstantFP*>(FV->getConstantFPValue()),
+          const_cast<ConstantFP*>(TV->getConstantFPValue())
+        };
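+        // Note the ordering: the false value is element 0 and the true value
+        // is element 1, so selecting offset EltSize below fetches the true
+        // value.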
+        const Type *FPTy = Elts[0]->getType();
+        const TargetData &TD = *TLI.getTargetData();
+        
+        // Create a ConstantArray of the two constants.
+        Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts, 2);
+        SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(),
+                                            TD.getPrefTypeAlignment(FPTy));
+        unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+
+        // Get the offsets to the 0 and 1 element of the array so that we can
+        // select between them.
+        SDValue Zero = DAG.getIntPtrConstant(0);
+        unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
+        SDValue One = DAG.getIntPtrConstant(EltSize);
+        
+        SDValue Cond = DAG.getSetCC(DL,
+                                    TLI.getSetCCResultType(N0.getValueType()),
+                                    N0, N1, CC);
+        SDValue CstOffset = DAG.getNode(ISD::SELECT, DL, Zero.getValueType(),
+                                        Cond, One, Zero);
+        CPIdx = DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(), CPIdx,
+                            CstOffset);
+        return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
+                           PseudoSourceValue::getConstantPool(), 0, false,
+                           false, Alignment);
+
+      }
+    }  
+
+  // Check to see if we can perform the "gzip trick", transforming
+  // (select_cc setlt X, 0, A, 0) -> (and (sra X, size(X)-1), A)
+  if (N1C && N3C && N3C->isNullValue() && CC == ISD::SETLT &&
+      N0.getValueType().isInteger() &&
+      N2.getValueType().isInteger() &&
+      (N1C->isNullValue() ||                         // (a < 0) ? b : 0
+       (N1C->getAPIntValue() == 1 && N0 == N2))) {   // (a < 1) ? a : 0
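+    // e.g. for i32 this yields (and (sra X, 31), A): the arithmetic shift
+    // produces all ones when X is negative and zero otherwise, so the AND
+    // selects A or 0 without a branch.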
+    EVT XType = N0.getValueType();
+    EVT AType = N2.getValueType();
+    if (XType.bitsGE(AType)) {
+      // (and (sra X, size(X)-1), A) -> (and (srl X, C2), A) iff A is a
+      // single-bit constant.
+      if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue()-1)) == 0)) {
+        unsigned ShCtV = N2C->getAPIntValue().logBase2();
+        ShCtV = XType.getSizeInBits()-ShCtV-1;
+        SDValue ShCt = DAG.getConstant(ShCtV, getShiftAmountTy());
+        SDValue Shift = DAG.getNode(ISD::SRL, N0.getDebugLoc(),
+                                    XType, N0, ShCt);
+        AddToWorkList(Shift.getNode());
+
+        if (XType.bitsGT(AType)) {
+          Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
+          AddToWorkList(Shift.getNode());
+        }
+
+        return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
+      }
+
+      SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(),
+                                  XType, N0,
+                                  DAG.getConstant(XType.getSizeInBits()-1,
+                                                  getShiftAmountTy()));
+      AddToWorkList(Shift.getNode());
+
+      if (XType.bitsGT(AType)) {
+        Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
+        AddToWorkList(Shift.getNode());
+      }
+
+      return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
+    }
+  }
+
+  // fold select C, 16, 0 -> shl C, 4
+  if (N2C && N3C && N3C->isNullValue() && N2C->getAPIntValue().isPowerOf2() &&
+      TLI.getBooleanContents() == TargetLowering::ZeroOrOneBooleanContent) {
+
+    // If the caller doesn't want us to simplify this into a zext of a compare,
+    // don't do it.
+    if (NotExtCompare && N2C->getAPIntValue() == 1)
+      return SDValue();
+
+    // Get a SetCC of the condition
+    // FIXME: Should probably make sure that setcc is legal if we ever have a
+    // target where it isn't.
+    SDValue Temp, SCC;
+    // cast from setcc result type to select result type
+    if (LegalTypes) {
+      SCC  = DAG.getSetCC(DL, TLI.getSetCCResultType(N0.getValueType()),
+                          N0, N1, CC);
+      if (N2.getValueType().bitsLT(SCC.getValueType()))
+        Temp = DAG.getZeroExtendInReg(SCC, N2.getDebugLoc(), N2.getValueType());
+      else
+        Temp = DAG.getNode(ISD::ZERO_EXTEND, N2.getDebugLoc(),
+                           N2.getValueType(), SCC);
+    } else {
+      SCC  = DAG.getSetCC(N0.getDebugLoc(), MVT::i1, N0, N1, CC);
+      Temp = DAG.getNode(ISD::ZERO_EXTEND, N2.getDebugLoc(),
+                         N2.getValueType(), SCC);
+    }
+
+    AddToWorkList(SCC.getNode());
+    AddToWorkList(Temp.getNode());
+
+    if (N2C->getAPIntValue() == 1)
+      return Temp;
+
+    // shl setcc result by log2 n2c
+    return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
+                       DAG.getConstant(N2C->getAPIntValue().logBase2(),
+                                       getShiftAmountTy()));
+  }
+
+  // Check to see if this is the equivalent of setcc
+  // FIXME: Turn all of these into setcc if setcc is legal;
+  // otherwise, go ahead with the folds.
+  if (0 && N3C && N3C->isNullValue() && N2C && (N2C->getAPIntValue() == 1ULL)) {
+    EVT XType = N0.getValueType();
+    if (!LegalOperations ||
+        TLI.isOperationLegal(ISD::SETCC, TLI.getSetCCResultType(XType))) {
+      SDValue Res = DAG.getSetCC(DL, TLI.getSetCCResultType(XType), N0, N1, CC);
+      if (Res.getValueType() != VT)
+        Res = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Res);
+      return Res;
+    }
+
+    // fold (seteq X, 0) -> (srl (ctlz X, log2(size(X))))
+    if (N1C && N1C->isNullValue() && CC == ISD::SETEQ &&
+        (!LegalOperations ||
+         TLI.isOperationLegal(ISD::CTLZ, XType))) {
+      SDValue Ctlz = DAG.getNode(ISD::CTLZ, N0.getDebugLoc(), XType, N0);
+      return DAG.getNode(ISD::SRL, DL, XType, Ctlz,
+                         DAG.getConstant(Log2_32(XType.getSizeInBits()),
+                                         getShiftAmountTy()));
+    }
+    // fold (setgt X, 0) -> (srl (and (-X, ~X), size(X)-1))
+    if (N1C && N1C->isNullValue() && CC == ISD::SETGT) {
+      SDValue NegN0 = DAG.getNode(ISD::SUB, N0.getDebugLoc(),
+                                  XType, DAG.getConstant(0, XType), N0);
+      SDValue NotN0 = DAG.getNOT(N0.getDebugLoc(), N0, XType);
+      return DAG.getNode(ISD::SRL, DL, XType,
+                         DAG.getNode(ISD::AND, DL, XType, NegN0, NotN0),
+                         DAG.getConstant(XType.getSizeInBits()-1,
+                                         getShiftAmountTy()));
+    }
+    // fold (setgt X, -1) -> (xor (srl (X, size(X)-1), 1))
+    if (N1C && N1C->isAllOnesValue() && CC == ISD::SETGT) {
+      SDValue Sign = DAG.getNode(ISD::SRL, N0.getDebugLoc(), XType, N0,
+                                 DAG.getConstant(XType.getSizeInBits()-1,
+                                                 getShiftAmountTy()));
+      return DAG.getNode(ISD::XOR, DL, XType, Sign, DAG.getConstant(1, XType));
+    }
+  }
+
+  // Check to see if this is an integer abs.
+  // select_cc setg[te] X,  0,  X, -X ->
+  // select_cc setgt    X, -1,  X, -X ->
+  // select_cc setl[te] X,  0, -X,  X ->
+  // select_cc setlt    X,  1, -X,  X ->
+  // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
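+  //
+  // For non-negative X, Y is 0 and the add/xor leave X unchanged; for
+  // negative X, Y is all ones, so the add gives X-1 and the xor gives
+  // ~(X-1) == -X.  This is the classic branchless integer abs.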
+  if (N1C) {
+    ConstantSDNode *SubC = NULL;
+    if (((N1C->isNullValue() && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
+         (N1C->isAllOnesValue() && CC == ISD::SETGT)) &&
+        N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1))
+      SubC = dyn_cast<ConstantSDNode>(N3.getOperand(0));
+    else if (((N1C->isNullValue() && (CC == ISD::SETLT || CC == ISD::SETLE)) ||
+              (N1C->isOne() && CC == ISD::SETLT)) &&
+             N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1))
+      SubC = dyn_cast<ConstantSDNode>(N2.getOperand(0));
+
+    EVT XType = N0.getValueType();
+    if (SubC && SubC->isNullValue() && XType.isInteger()) {
+      SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(), XType,
+                                  N0,
+                                  DAG.getConstant(XType.getSizeInBits()-1,
+                                                  getShiftAmountTy()));
+      SDValue Add = DAG.getNode(ISD::ADD, N0.getDebugLoc(),
+                                XType, N0, Shift);
+      AddToWorkList(Shift.getNode());
+      AddToWorkList(Add.getNode());
+      return DAG.getNode(ISD::XOR, DL, XType, Add, Shift);
+    }
+  }
+
+  return SDValue();
+}
+
+/// SimplifySetCC - This is a stub for TargetLowering::SimplifySetCC.
+SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0,
+                                   SDValue N1, ISD::CondCode Cond,
+                                   DebugLoc DL, bool foldBooleans) {
+  TargetLowering::DAGCombinerInfo
+    DagCombineInfo(DAG, !LegalTypes, !LegalOperations, false, this);
+  return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
+}
+
+/// BuildSDIVSequence - Given an ISD::SDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.  See:
+/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
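+///
+/// For example, a signed i32 divide by 3 is typically lowered to a MULHS by
+/// the magic constant 0x55555556 plus a small shift/add fixup, replacing the
+/// divide with a multiply.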
+SDValue DAGCombiner::BuildSDIV(SDNode *N) {
+  std::vector<SDNode*> Built;
+  SDValue S = TLI.BuildSDIV(N, DAG, &Built);
+
+  for (std::vector<SDNode*>::iterator ii = Built.begin(), ee = Built.end();
+       ii != ee; ++ii)
+    AddToWorkList(*ii);
+  return S;
+}
+
+/// BuildUDIVSequence - Given an ISD::UDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.  See:
+/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
+SDValue DAGCombiner::BuildUDIV(SDNode *N) {
+  std::vector<SDNode*> Built;
+  SDValue S = TLI.BuildUDIV(N, DAG, &Built);
+
+  for (std::vector<SDNode*>::iterator ii = Built.begin(), ee = Built.end();
+       ii != ee; ++ii)
+    AddToWorkList(*ii);
+  return S;
+}
+
+/// FindBaseOffset - Return true if Base is a frame index, which is known not
+/// to alias with anything but itself.  Provides base object and offset as
+/// results.
+static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
+                           const GlobalValue *&GV, void *&CV) {
+  // Assume it is a primitive operation.
+  Base = Ptr; Offset = 0; GV = 0; CV = 0;
+
+  // If it's adding a simple constant, then integrate the offset.
+  if (Base.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) {
+      Base = Base.getOperand(0);
+      Offset += C->getZExtValue();
+    }
+  }
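+  // e.g. (add FrameIndex#1, 8) decomposes into Base = FrameIndex#1 and
+  // Offset = 8 before the checks below.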
+  
+  // Return the underlying GlobalValue, and update the Offset.  Return false
+  // for GlobalAddressSDNode since the same GlobalAddress may be represented
+  // by multiple nodes with different offsets.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Base)) {
+    GV = G->getGlobal();
+    Offset += G->getOffset();
+    return false;
+  }
+
+  // Return the underlying Constant value, and update the Offset.  Return false
+  // for ConstantSDNodes since the same constant pool entry may be represented
+  // by multiple nodes with different offsets.
+  if (ConstantPoolSDNode *C = dyn_cast<ConstantPoolSDNode>(Base)) {
+    CV = C->isMachineConstantPoolEntry() ? (void *)C->getMachineCPVal()
+                                         : (void *)C->getConstVal();
+    Offset += C->getOffset();
+    return false;
+  }
+  // If it's any of the following then it can't alias with anything but itself.
+  return isa<FrameIndexSDNode>(Base);
+}
+
+/// isAlias - Return true if there is any possibility that the two addresses
+/// overlap.
+bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1,
+                          const Value *SrcValue1, int SrcValueOffset1,
+                          unsigned SrcValueAlign1,
+                          SDValue Ptr2, int64_t Size2,
+                          const Value *SrcValue2, int SrcValueOffset2,
+                          unsigned SrcValueAlign2) const {
+  // If they are the same then they must be aliases.
+  if (Ptr1 == Ptr2) return true;
+
+  // Gather base node and offset information.
+  SDValue Base1, Base2;
+  int64_t Offset1, Offset2;
+  const GlobalValue *GV1, *GV2;
+  void *CV1, *CV2;
+  bool isFrameIndex1 = FindBaseOffset(Ptr1, Base1, Offset1, GV1, CV1);
+  bool isFrameIndex2 = FindBaseOffset(Ptr2, Base2, Offset2, GV2, CV2);
+
+  // If they have the same base address, then check to see if they overlap.
+  if (Base1 == Base2 || (GV1 && (GV1 == GV2)) || (CV1 && (CV1 == CV2)))
+    return !((Offset1 + Size1) <= Offset2 || (Offset2 + Size2) <= Offset1);
+
+  // If we know what the bases are, and they aren't identical, then we know they
+  // cannot alias.
+  if ((isFrameIndex1 || CV1 || GV1) && (isFrameIndex2 || CV2 || GV2))
+    return false;
+
+  // If we know that SrcValue1 and SrcValue2 have relatively large alignment
+  // compared to the size and offset of the accesses, we may be able to prove
+  // they do not alias.  This check is conservative for now to catch cases
+  // created by splitting vector types.
+  if ((SrcValueAlign1 == SrcValueAlign2) &&
+      (SrcValueOffset1 != SrcValueOffset2) &&
+      (Size1 == Size2) && (SrcValueAlign1 > Size1)) {
+    int64_t OffAlign1 = SrcValueOffset1 % SrcValueAlign1;
+    int64_t OffAlign2 = SrcValueOffset2 % SrcValueAlign1;
+    
+    // There is no overlap between these relatively aligned accesses of
+    // similar size, so return NoAlias.
+    if ((OffAlign1 + Size1) <= OffAlign2 || (OffAlign2 + Size2) <= OffAlign1)
+      return false;
+  }
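+  // e.g. two 4-byte accesses whose source values are 8-byte aligned, at
+  // offsets 0 and 4, land in disjoint halves of every aligned 8-byte window
+  // and therefore cannot overlap.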
+  
+  if (CombinerGlobalAA) {
+    // Use alias analysis information.
+    int64_t MinOffset = std::min(SrcValueOffset1, SrcValueOffset2);
+    int64_t Overlap1 = Size1 + SrcValueOffset1 - MinOffset;
+    int64_t Overlap2 = Size2 + SrcValueOffset2 - MinOffset;
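+    // The overlap sizes extend each access back to the smaller of the two
+    // source-value offsets, so a single query relative to that common base
+    // covers both accesses.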
+    AliasAnalysis::AliasResult AAResult =
+                             AA.alias(SrcValue1, Overlap1, SrcValue2, Overlap2);
+    if (AAResult == AliasAnalysis::NoAlias)
+      return false;
+  }
+
+  // Otherwise we have to assume they alias.
+  return true;
+}
+
+/// FindAliasInfo - Extracts the relevant alias information from the memory
+/// node.  Returns true if the operand was a load.
+bool DAGCombiner::FindAliasInfo(SDNode *N,
+                        SDValue &Ptr, int64_t &Size,
+                        const Value *&SrcValue, 
+                        int &SrcValueOffset,
+                        unsigned &SrcValueAlign) const {
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    Ptr = LD->getBasePtr();
+    Size = LD->getMemoryVT().getSizeInBits() >> 3;
+    SrcValue = LD->getSrcValue();
+    SrcValueOffset = LD->getSrcValueOffset();
+    SrcValueAlign = LD->getOriginalAlignment();
+    return true;
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    Ptr = ST->getBasePtr();
+    Size = ST->getMemoryVT().getSizeInBits() >> 3;
+    SrcValue = ST->getSrcValue();
+    SrcValueOffset = ST->getSrcValueOffset();
+    SrcValueAlign = ST->getOriginalAlignment();
+  } else {
+    llvm_unreachable("FindAliasInfo expected a memory operand");
+  }
+
+  return false;
+}
+
+/// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
+/// looking for aliasing nodes and adding them to the Aliases vector.
+void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
+                                   SmallVector<SDValue, 8> &Aliases) {
+  SmallVector<SDValue, 8> Chains;     // List of chains to visit.
+  SmallPtrSet<SDNode *, 16> Visited;  // Visited node set.
+
+  // Get alias information for node.
+  SDValue Ptr;
+  int64_t Size;
+  const Value *SrcValue;
+  int SrcValueOffset;
+  unsigned SrcValueAlign;
+  bool IsLoad = FindAliasInfo(N, Ptr, Size, SrcValue, SrcValueOffset, 
+                              SrcValueAlign);
+
+  // Starting off.
+  Chains.push_back(OriginalChain);
+  unsigned Depth = 0;
+  
+  // Look at each chain and determine if it is an alias.  If so, add it to the
+  // aliases list.  If not, then continue up the chain looking for the next
+  // candidate.
+  while (!Chains.empty()) {
+    SDValue Chain = Chains.back();
+    Chains.pop_back();
+    
+    // For TokenFactor nodes, look at each operand and only continue up the 
+    // chain until we find two aliases.  If we've seen two aliases, assume we'll 
+    // find more and revert to original chain since the xform is unlikely to be
+    // profitable.
+    // 
+    // FIXME: The depth check could be made to return the last non-aliasing 
+    // chain we found before we hit a tokenfactor rather than the original
+    // chain.
+    if (Depth > 6 || Aliases.size() == 2) {
+      Aliases.clear();
+      Aliases.push_back(OriginalChain);
+      break;
+    }
+
+    // Don't bother if we've been before.
+    if (!Visited.insert(Chain.getNode()))
+      continue;
+
+    switch (Chain.getOpcode()) {
+    case ISD::EntryToken:
+      // Entry token is ideal chain operand, but handled in FindBetterChain.
+      break;
+
+    case ISD::LOAD:
+    case ISD::STORE: {
+      // Get alias information for Chain.
+      SDValue OpPtr;
+      int64_t OpSize;
+      const Value *OpSrcValue;
+      int OpSrcValueOffset;
+      unsigned OpSrcValueAlign;
+      bool IsOpLoad = FindAliasInfo(Chain.getNode(), OpPtr, OpSize,
+                                    OpSrcValue, OpSrcValueOffset,
+                                    OpSrcValueAlign);
+
+      // If chain is alias then stop here.
+      if (!(IsLoad && IsOpLoad) &&
+          isAlias(Ptr, Size, SrcValue, SrcValueOffset, SrcValueAlign,
+                  OpPtr, OpSize, OpSrcValue, OpSrcValueOffset,
+                  OpSrcValueAlign)) {
+        Aliases.push_back(Chain);
+      } else {
+        // Look further up the chain.
+        Chains.push_back(Chain.getOperand(0));
+        ++Depth;
+      }
+      break;
+    }
+
+    case ISD::TokenFactor:
+      // We have to check each of the operands of the token factor for "small"
+      // token factors, so we queue them up.  Adding the operands to the queue
+      // (stack) in reverse order maintains the original order and increases the
+      // likelihood that getNode will find a matching token factor (CSE).
+      if (Chain.getNumOperands() > 16) {
+        Aliases.push_back(Chain);
+        break;
+      }
+      for (unsigned n = Chain.getNumOperands(); n;)
+        Chains.push_back(Chain.getOperand(--n));
+      ++Depth;
+      break;
+
+    default:
+      // For all other instructions we will just have to take what we can get.
+      Aliases.push_back(Chain);
+      break;
+    }
+  }
+}
+
+/// FindBetterChain - Walk up chain skipping non-aliasing memory nodes, looking
+/// for a better chain (aliasing node.)
+SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
+  SmallVector<SDValue, 8> Aliases;  // Ops for replacing token factor.
+
+  // Accumulate all the aliases to this node.
+  GatherAllAliases(N, OldChain, Aliases);
+
+  if (Aliases.size() == 0) {
+    // If no operands then chain to entry token.
+    return DAG.getEntryNode();
+  } else if (Aliases.size() == 1) {
+    // If a single operand then chain to it.  We don't need to revisit it.
+    return Aliases[0];
+  }
+  
+  // Construct a custom tailored token factor.
+  return DAG.getNode(ISD::TokenFactor, N->getDebugLoc(), MVT::Other, 
+                     &Aliases[0], Aliases.size());
+}
+
+// SelectionDAG::Combine - This is the entry point for the file.
+//
+void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA,
+                           CodeGenOpt::Level OptLevel) {
+  // Construct a DAGCombiner and run it over the whole DAG.
+  DAGCombiner(*this, AA, OptLevel).Run(Level);
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
new file mode 100644
index 0000000..c097f11
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -0,0 +1,218 @@
+//===-- FunctionLoweringInfo.cpp ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating functions from LLVM IR into
+// Machine IR.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "function-lowering-info"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+/// isUsedOutsideOfDefiningBlock - Return true if this instruction is used by
+/// PHI nodes or outside of the basic block that defines it, or used by a
+/// switch or atomic instruction, which may expand to multiple basic blocks.
+static bool isUsedOutsideOfDefiningBlock(const Instruction *I) {
+  if (I->use_empty()) return false;
+  if (isa<PHINode>(I)) return true;
+  const BasicBlock *BB = I->getParent();
+  for (Value::const_use_iterator UI = I->use_begin(), E = I->use_end();
+        UI != E; ++UI) {
+    const User *U = *UI;
+    if (cast<Instruction>(U)->getParent() != BB || isa<PHINode>(U))
+      return true;
+  }
+  return false;
+}
+
+/// isOnlyUsedInEntryBlock - If the specified argument is only used in the
+/// entry block, return true.  This includes arguments used by switches, since
+/// the switch may expand into multiple basic blocks.
+static bool isOnlyUsedInEntryBlock(const Argument *A) {
+  const BasicBlock *Entry = A->getParent()->begin();
+  for (Value::const_use_iterator UI = A->use_begin(), E = A->use_end();
+       UI != E; ++UI) {
+    const User *U = *UI;
+    if (cast<Instruction>(U)->getParent() != Entry || isa<SwitchInst>(U))
+      return false;  // Use not in entry block.
+  }
+  return true;
+}
+
+FunctionLoweringInfo::FunctionLoweringInfo(const TargetLowering &tli)
+  : TLI(tli) {
+}
+
+void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) {
+  Fn = &fn;
+  MF = &mf;
+  RegInfo = &MF->getRegInfo();
+
+  // Check whether the function can return without sret-demotion.
+  SmallVector<ISD::OutputArg, 4> Outs;
+  GetReturnInfo(Fn->getReturnType(),
+                Fn->getAttributes().getRetAttributes(), Outs, TLI);
+  CanLowerReturn = TLI.CanLowerReturn(Fn->getCallingConv(), Fn->isVarArg(),
+                                      Outs, Fn->getContext());
+
+  // Create a vreg for each argument register that is not dead and is used
+  // outside of the entry block for the function.
+  for (Function::const_arg_iterator AI = Fn->arg_begin(), E = Fn->arg_end();
+       AI != E; ++AI)
+    if (!isOnlyUsedInEntryBlock(AI))
+      InitializeRegForValue(AI);
+
+  // Initialize the mapping of values to registers.  This is only set up for
+  // instruction values that are used outside of the block that defines
+  // them.
+  Function::const_iterator BB = Fn->begin(), EB = Fn->end();
+  for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+    if (const AllocaInst *AI = dyn_cast<AllocaInst>(I))
+      if (const ConstantInt *CUI = dyn_cast<ConstantInt>(AI->getArraySize())) {
+        const Type *Ty = AI->getAllocatedType();
+        uint64_t TySize = TLI.getTargetData()->getTypeAllocSize(Ty);
+        unsigned Align =
+          std::max((unsigned)TLI.getTargetData()->getPrefTypeAlignment(Ty),
+                   AI->getAlignment());
+
+        TySize *= CUI->getZExtValue();   // Get total allocated size.
+        if (TySize == 0) TySize = 1; // Don't create zero-sized stack objects.
+
+        // The object may need to be placed onto the stack near the stack
+        // protector if one exists. Determine here if this object is a suitable
+        // candidate. I.e., it would trigger the creation of a stack protector.
+        bool MayNeedSP =
+          (AI->isArrayAllocation() ||
+           (TySize > 8 && isa<ArrayType>(Ty) &&
+            cast<ArrayType>(Ty)->getElementType()->isIntegerTy(8)));
+        StaticAllocaMap[AI] =
+          MF->getFrameInfo()->CreateStackObject(TySize, Align, false, MayNeedSP);
+      }
+
+  for (; BB != EB; ++BB)
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      // Mark values used outside their block as exported, by allocating
+      // a virtual register for them.
+      if (isUsedOutsideOfDefiningBlock(I))
+        if (!isa<AllocaInst>(I) ||
+            !StaticAllocaMap.count(cast<AllocaInst>(I)))
+          InitializeRegForValue(I);
+    }
+
+  // Create an initial MachineBasicBlock for each LLVM BasicBlock in F.  This
+  // also creates the initial PHI MachineInstrs, though none of the input
+  // operands are populated.
+  for (BB = Fn->begin(); BB != EB; ++BB) {
+    MachineBasicBlock *MBB = mf.CreateMachineBasicBlock(BB);
+    MBBMap[BB] = MBB;
+    MF->push_back(MBB);
+
+    // Transfer the address-taken flag. This is necessary because there could
+    // be multiple MachineBasicBlocks corresponding to one BasicBlock, and only
+    // the first one should be marked.
+    if (BB->hasAddressTaken())
+      MBB->setHasAddressTaken();
+
+    // Create Machine PHI nodes for LLVM PHI nodes, lowering them as
+    // appropriate.
+    for (BasicBlock::const_iterator I = BB->begin();
+         const PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+      if (PN->use_empty()) continue;
+
+      DebugLoc DL = PN->getDebugLoc();
+      unsigned PHIReg = ValueMap[PN];
+      assert(PHIReg && "PHI node does not have an assigned virtual register!");
+
+      SmallVector<EVT, 4> ValueVTs;
+      ComputeValueVTs(TLI, PN->getType(), ValueVTs);
+      for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
+        EVT VT = ValueVTs[vti];
+        unsigned NumRegisters = TLI.getNumRegisters(Fn->getContext(), VT);
+        const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+        for (unsigned i = 0; i != NumRegisters; ++i)
+          BuildMI(MBB, DL, TII->get(TargetOpcode::PHI), PHIReg + i);
+        PHIReg += NumRegisters;
+      }
+    }
+  }
+}
+
+/// clear - Clear out all the function-specific state. This returns this
+/// FunctionLoweringInfo to an empty state, ready to be used for a
+/// different function.
+void FunctionLoweringInfo::clear() {
+  assert(CatchInfoFound.size() == CatchInfoLost.size() &&
+         "Not all catch info was assigned to a landing pad!");
+
+  MBBMap.clear();
+  ValueMap.clear();
+  StaticAllocaMap.clear();
+#ifndef NDEBUG
+  CatchInfoLost.clear();
+  CatchInfoFound.clear();
+#endif
+  LiveOutRegInfo.clear();
+  ArgDbgValues.clear();
+  RegFixups.clear();
+}
+
+/// CreateReg - Allocate a single virtual register for the given type.
+unsigned FunctionLoweringInfo::CreateReg(EVT VT) {
+  return RegInfo->createVirtualRegister(TLI.getRegClassFor(VT));
+}
+
+/// CreateRegs - Allocate the appropriate number of virtual registers of
+/// the correctly promoted or expanded types.  Assign these registers
+/// consecutive vreg numbers and return the first assigned number.
+///
+/// In the case that the given value has struct or array type, this function
+/// will assign registers for each member or element.
+///
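+/// For example, on a target with 32-bit registers a value of type {i64, i32}
+/// gets two registers for the expanded i64 and one for the i32, and the
+/// first of the three consecutive vregs is returned.
+///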
+unsigned FunctionLoweringInfo::CreateRegs(const Type *Ty) {
+  SmallVector<EVT, 4> ValueVTs;
+  ComputeValueVTs(TLI, Ty, ValueVTs);
+
+  unsigned FirstReg = 0;
+  for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
+    EVT ValueVT = ValueVTs[Value];
+    EVT RegisterVT = TLI.getRegisterType(Ty->getContext(), ValueVT);
+
+    unsigned NumRegs = TLI.getNumRegisters(Ty->getContext(), ValueVT);
+    for (unsigned i = 0; i != NumRegs; ++i) {
+      unsigned R = CreateReg(RegisterVT);
+      if (!FirstReg) FirstReg = R;
+    }
+  }
+  return FirstReg;
+}
+
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
new file mode 100644
index 0000000..6246f8e
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -0,0 +1,816 @@
+//==--- InstrEmitter.cpp - Emit MachineInstrs for the SelectionDAG class ---==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the Emit routines for the SelectionDAG class, which creates
+// MachineInstrs based on the decisions of the SelectionDAG instruction
+// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "instr-emitter"
+#include "InstrEmitter.h"
+#include "SDNodeDbgValue.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+/// CountResults - Count the number of real results produced by a target node.
+/// Register or immediate results come first, followed by an optional chain
+/// and optional flag results (which do not go into the resulting MachineInstr).
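+///
+/// For example, a node producing (i32, i32, ch, flag) has two results; the
+/// trailing chain and flag values are skipped.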
+unsigned InstrEmitter::CountResults(SDNode *Node) {
+  unsigned N = Node->getNumValues();
+  while (N && Node->getValueType(N - 1) == MVT::Flag)
+    --N;
+  if (N && Node->getValueType(N - 1) == MVT::Other)
+    --N;    // Skip over chain result.
+  return N;
+}
+
+/// CountOperands - The inputs to target nodes have any actual inputs first,
+/// followed by an optional chain operand, then an optional flag operand.
+/// Compute the number of actual operands that will go into the resulting
+/// MachineInstr.
+unsigned InstrEmitter::CountOperands(SDNode *Node) {
+  unsigned N = Node->getNumOperands();
+  while (N && Node->getOperand(N - 1).getValueType() == MVT::Flag)
+    --N;
+  if (N && Node->getOperand(N - 1).getValueType() == MVT::Other)
+    --N; // Ignore chain if it exists.
+  return N;
+}
+
+/// EmitCopyFromReg - Generate machine code for a CopyFromReg node or an
+/// implicit physical register output.
+void InstrEmitter::
+EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
+                unsigned SrcReg, DenseMap<SDValue, unsigned> &VRBaseMap) {
+  unsigned VRBase = 0;
+  if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+    // Just use the input register directly!
+    SDValue Op(Node, ResNo);
+    if (IsClone)
+      VRBaseMap.erase(Op);
+    bool isNew = VRBaseMap.insert(std::make_pair(Op, SrcReg)).second;
+    isNew = isNew; // Silence compiler warning.
+    assert(isNew && "Node emitted out of order - early");
+    return;
+  }
+
+  // If the node is only used by a CopyToReg and the dest reg is a vreg, use
+  // the CopyToReg'd destination register instead of creating a new vreg.
+  bool MatchReg = true;
+  const TargetRegisterClass *UseRC = NULL;
+  if (!IsClone && !IsCloned)
+    for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
+         UI != E; ++UI) {
+      SDNode *User = *UI;
+      bool Match = true;
+      if (User->getOpcode() == ISD::CopyToReg && 
+          User->getOperand(2).getNode() == Node &&
+          User->getOperand(2).getResNo() == ResNo) {
+        unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+        if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+          VRBase = DestReg;
+          Match = false;
+        } else if (DestReg != SrcReg)
+          Match = false;
+      } else {
+        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+          SDValue Op = User->getOperand(i);
+          if (Op.getNode() != Node || Op.getResNo() != ResNo)
+            continue;
+          EVT VT = Node->getValueType(Op.getResNo());
+          if (VT == MVT::Other || VT == MVT::Flag)
+            continue;
+          Match = false;
+          if (User->isMachineOpcode()) {
+            const TargetInstrDesc &II = TII->get(User->getMachineOpcode());
+            const TargetRegisterClass *RC = 0;
+            if (i+II.getNumDefs() < II.getNumOperands())
+              RC = II.OpInfo[i+II.getNumDefs()].getRegClass(TRI);
+            if (!UseRC)
+              UseRC = RC;
+            else if (RC) {
+              const TargetRegisterClass *ComRC = getCommonSubClass(UseRC, RC);
+              // If multiple uses expect disjoint register classes, we emit
+              // copies in AddRegisterOperand.
+              if (ComRC)
+                UseRC = ComRC;
+            }
+          }
+        }
+      }
+      MatchReg &= Match;
+      if (VRBase)
+        break;
+    }
+
+  EVT VT = Node->getValueType(ResNo);
+  const TargetRegisterClass *SrcRC = 0, *DstRC = 0;
+  SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
+  
+  // Figure out the register class to create for the destreg.
+  if (VRBase) {
+    DstRC = MRI->getRegClass(VRBase);
+  } else if (UseRC) {
+    assert(UseRC->hasType(VT) && "Incompatible phys register def and uses!");
+    DstRC = UseRC;
+  } else {
+    DstRC = TLI->getRegClassFor(VT);
+  }
+    
+  // If all uses are reading from the src physical register and copying the
+  // register is either impossible or very expensive, then don't create a copy.
+  if (MatchReg && SrcRC->getCopyCost() < 0) {
+    VRBase = SrcReg;
+  } else {
+    // Create the reg, emit the copy.
+    VRBase = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+            VRBase).addReg(SrcReg);
+  }
+
+  SDValue Op(Node, ResNo);
+  if (IsClone)
+    VRBaseMap.erase(Op);
+  bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+  isNew = isNew; // Silence compiler warning.
+  assert(isNew && "Node emitted out of order - early");
+}
+
+/// getDstOfOnlyCopyToRegUse - If the only use of the specified result number
+/// of the node is a CopyToReg, return its destination register. Return 0
+/// otherwise.
+unsigned InstrEmitter::getDstOfOnlyCopyToRegUse(SDNode *Node,
+                                                unsigned ResNo) const {
+  if (!Node->hasOneUse())
+    return 0;
+
+  SDNode *User = *Node->use_begin();
+  if (User->getOpcode() == ISD::CopyToReg && 
+      User->getOperand(2).getNode() == Node &&
+      User->getOperand(2).getResNo() == ResNo) {
+    unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      return Reg;
+  }
+  return 0;
+}
+
+void InstrEmitter::CreateVirtualRegisters(SDNode *Node, MachineInstr *MI,
+                                       const TargetInstrDesc &II,
+                                       bool IsClone, bool IsCloned,
+                                       DenseMap<SDValue, unsigned> &VRBaseMap) {
+  assert(Node->getMachineOpcode() != TargetOpcode::IMPLICIT_DEF &&
+         "IMPLICIT_DEF should have been handled as a special case elsewhere!");
+
+  for (unsigned i = 0; i < II.getNumDefs(); ++i) {
+    // If the specific node value is only used by a CopyToReg and the dest reg
+    // is a vreg in the same register class, use the CopyToReg'd destination
+    // register instead of creating a new vreg.
+    unsigned VRBase = 0;
+    const TargetRegisterClass *RC = II.OpInfo[i].getRegClass(TRI);
+    if (II.OpInfo[i].isOptionalDef()) {
+      // Optional def must be a physical register.
+      unsigned NumResults = CountResults(Node);
+      VRBase = cast<RegisterSDNode>(Node->getOperand(i-NumResults))->getReg();
+      assert(TargetRegisterInfo::isPhysicalRegister(VRBase));
+      MI->addOperand(MachineOperand::CreateReg(VRBase, true));
+    }
+
+    if (!VRBase && !IsClone && !IsCloned)
+      for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
+           UI != E; ++UI) {
+        SDNode *User = *UI;
+        if (User->getOpcode() == ISD::CopyToReg && 
+            User->getOperand(2).getNode() == Node &&
+            User->getOperand(2).getResNo() == i) {
+          unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+          if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+            const TargetRegisterClass *RegRC = MRI->getRegClass(Reg);
+            if (RegRC == RC) {
+              VRBase = Reg;
+              MI->addOperand(MachineOperand::CreateReg(Reg, true));
+              break;
+            }
+          }
+        }
+      }
+
+    // Create the result registers for this node and add the result regs to
+    // the machine instruction.
+    if (VRBase == 0) {
+      assert(RC && "Isn't a register operand!");
+      VRBase = MRI->createVirtualRegister(RC);
+      MI->addOperand(MachineOperand::CreateReg(VRBase, true));
+    }
+
+    SDValue Op(Node, i);
+    if (IsClone)
+      VRBaseMap.erase(Op);
+    bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+    isNew = isNew; // Silence compiler warning.
+    assert(isNew && "Node emitted out of order - early");
+  }
+}
+
+/// getVR - Return the virtual register corresponding to the specified result
+/// of the specified node.
+unsigned InstrEmitter::getVR(SDValue Op,
+                             DenseMap<SDValue, unsigned> &VRBaseMap) {
+  if (Op.isMachineOpcode() &&
+      Op.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) {
+    // Add an IMPLICIT_DEF instruction before every use.
+    unsigned VReg = getDstOfOnlyCopyToRegUse(Op.getNode(), Op.getResNo());
+    // IMPLICIT_DEF can produce any type of result so its TargetInstrDesc
+    // does not include operand register class info.
+    if (!VReg) {
+      const TargetRegisterClass *RC = TLI->getRegClassFor(Op.getValueType());
+      VReg = MRI->createVirtualRegister(RC);
+    }
+    BuildMI(*MBB, InsertPos, Op.getDebugLoc(),
+            TII->get(TargetOpcode::IMPLICIT_DEF), VReg);
+    return VReg;
+  }
+
+  DenseMap<SDValue, unsigned>::iterator I = VRBaseMap.find(Op);
+  assert(I != VRBaseMap.end() && "Node emitted out of order - late");
+  return I->second;
+}
+
+
+/// AddRegisterOperand - Add the specified register as an operand to the
+/// specified machine instr. Insert register copies if the register is
+/// not in the required register class.
+void
+InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op,
+                                 unsigned IIOpNum,
+                                 const TargetInstrDesc *II,
+                                 DenseMap<SDValue, unsigned> &VRBaseMap,
+                                 bool IsDebug, bool IsClone, bool IsCloned) {
+  assert(Op.getValueType() != MVT::Other &&
+         Op.getValueType() != MVT::Flag &&
+         "Chain and flag operands should occur at end of operand list!");
+  // Get/emit the operand.
+  unsigned VReg = getVR(Op, VRBaseMap);
+  assert(TargetRegisterInfo::isVirtualRegister(VReg) && "Not a vreg?");
+
+  const TargetInstrDesc &TID = MI->getDesc();
+  bool isOptDef = IIOpNum < TID.getNumOperands() &&
+    TID.OpInfo[IIOpNum].isOptionalDef();
+
+  // If the instruction requires a register in a different class, create
+  // a new virtual register and copy the value into it.
+  if (II) {
+    const TargetRegisterClass *SrcRC = MRI->getRegClass(VReg);
+    const TargetRegisterClass *DstRC = 0;
+    if (IIOpNum < II->getNumOperands())
+      DstRC = II->OpInfo[IIOpNum].getRegClass(TRI);
+    assert((DstRC || (TID.isVariadic() && IIOpNum >= TID.getNumOperands())) &&
+           "Don't have operand info for this instruction!");
+    if (DstRC && SrcRC != DstRC && !SrcRC->hasSuperClass(DstRC)) {
+      unsigned NewVReg = MRI->createVirtualRegister(DstRC);
+      BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
+              TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg);
+      VReg = NewVReg;
+    }
+  }
+
+  // If this value has only one use, that use is a kill. This is a
+  // conservative approximation. InstrEmitter does trivial coalescing
+  // with CopyFromReg nodes, so don't emit kill flags for them.
+  // Avoid kill flags on Schedule cloned nodes, since there will be
+  // multiple uses.
+  // Tied operands are never killed, so we need to check that. And that
+  // means we need to determine the index of the operand.
+  bool isKill = Op.hasOneUse() &&
+                Op.getNode()->getOpcode() != ISD::CopyFromReg &&
+                !IsDebug &&
+                !(IsClone || IsCloned);
+  if (isKill) {
+    unsigned Idx = MI->getNumOperands();
+    while (Idx > 0 &&
+           MI->getOperand(Idx-1).isReg() && MI->getOperand(Idx-1).isImplicit())
+      --Idx;
+    bool isTied = MI->getDesc().getOperandConstraint(Idx, TOI::TIED_TO) != -1;
+    if (isTied)
+      isKill = false;
+  }
+
+  MI->addOperand(MachineOperand::CreateReg(VReg, isOptDef,
+                                           false/*isImp*/, isKill,
+                                           false/*isDead*/, false/*isUndef*/,
+                                           false/*isEarlyClobber*/,
+                                           0/*SubReg*/, IsDebug));
+}
+
+/// AddOperand - Add the specified operand to the specified machine instr.  II
+/// specifies the instruction information for the node, and IIOpNum is the
+/// operand number (in the II) that we are adding. IIOpNum and II are used for 
+/// assertions only.
+void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op,
+                              unsigned IIOpNum,
+                              const TargetInstrDesc *II,
+                              DenseMap<SDValue, unsigned> &VRBaseMap,
+                              bool IsDebug, bool IsClone, bool IsCloned) {
+  if (Op.isMachineOpcode()) {
+    AddRegisterOperand(MI, Op, IIOpNum, II, VRBaseMap,
+                       IsDebug, IsClone, IsCloned);
+  } else if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+    MI->addOperand(MachineOperand::CreateImm(C->getSExtValue()));
+  } else if (ConstantFPSDNode *F = dyn_cast<ConstantFPSDNode>(Op)) {
+    const ConstantFP *CFP = F->getConstantFPValue();
+    MI->addOperand(MachineOperand::CreateFPImm(CFP));
+  } else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) {
+    MI->addOperand(MachineOperand::CreateReg(R->getReg(), false));
+  } else if (GlobalAddressSDNode *TGA = dyn_cast<GlobalAddressSDNode>(Op)) {
+    MI->addOperand(MachineOperand::CreateGA(TGA->getGlobal(), TGA->getOffset(),
+                                            TGA->getTargetFlags()));
+  } else if (BasicBlockSDNode *BBNode = dyn_cast<BasicBlockSDNode>(Op)) {
+    MI->addOperand(MachineOperand::CreateMBB(BBNode->getBasicBlock()));
+  } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op)) {
+    MI->addOperand(MachineOperand::CreateFI(FI->getIndex()));
+  } else if (JumpTableSDNode *JT = dyn_cast<JumpTableSDNode>(Op)) {
+    MI->addOperand(MachineOperand::CreateJTI(JT->getIndex(),
+                                             JT->getTargetFlags()));
+  } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op)) {
+    int Offset = CP->getOffset();
+    unsigned Align = CP->getAlignment();
+    const Type *Type = CP->getType();
+    // MachineConstantPool wants an explicit alignment.
+    if (Align == 0) {
+      Align = TM->getTargetData()->getPrefTypeAlignment(Type);
+      if (Align == 0) {
+        // Alignment of vector types.  FIXME!
+        Align = TM->getTargetData()->getTypeAllocSize(Type);
+      }
+    }
+    
+    unsigned Idx;
+    MachineConstantPool *MCP = MF->getConstantPool();
+    if (CP->isMachineConstantPoolEntry())
+      Idx = MCP->getConstantPoolIndex(CP->getMachineCPVal(), Align);
+    else
+      Idx = MCP->getConstantPoolIndex(CP->getConstVal(), Align);
+    MI->addOperand(MachineOperand::CreateCPI(Idx, Offset,
+                                             CP->getTargetFlags()));
+  } else if (ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op)) {
+    MI->addOperand(MachineOperand::CreateES(ES->getSymbol(),
+                                            ES->getTargetFlags()));
+  } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op)) {
+    MI->addOperand(MachineOperand::CreateBA(BA->getBlockAddress(),
+                                            BA->getTargetFlags()));
+  } else {
+    assert(Op.getValueType() != MVT::Other &&
+           Op.getValueType() != MVT::Flag &&
+           "Chain and flag operands should occur at end of operand list!");
+    AddRegisterOperand(MI, Op, IIOpNum, II, VRBaseMap,
+                       IsDebug, IsClone, IsCloned);
+  }
+}
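+
+// Dispatch sketch (illustrative, not part of the original drop): the chain
+// above maps SDNode operand kinds onto MachineOperands roughly as follows:
+//
+//   ConstantSDNode      -> CreateImm     FrameIndexSDNode     -> CreateFI
+//   ConstantFPSDNode    -> CreateFPImm   JumpTableSDNode      -> CreateJTI
+//   RegisterSDNode      -> CreateReg     ConstantPoolSDNode   -> CreateCPI
+//   GlobalAddressSDNode -> CreateGA      ExternalSymbolSDNode -> CreateES
+//   BasicBlockSDNode    -> CreateMBB     BlockAddressSDNode   -> CreateBA
+//
+// Every other (register-valued) operand is routed through
+// AddRegisterOperand, which also handles cross-class copies.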
+
+/// getSuperRegisterRegClass - Returns the register class of a superreg A whose
+/// "SubIdx"'th sub-register class is the specified register class and whose
+/// type matches the specified type.
+static const TargetRegisterClass*
+getSuperRegisterRegClass(const TargetRegisterClass *TRC,
+                         unsigned SubIdx, EVT VT) {
+  // Pick the register class of the superregister for this type
+  for (TargetRegisterInfo::regclass_iterator I = TRC->superregclasses_begin(),
+         E = TRC->superregclasses_end(); I != E; ++I)
+    if ((*I)->hasType(VT) && (*I)->getSubRegisterRegClass(SubIdx) == TRC)
+      return *I;
+  assert(false && "Couldn't find the register class");
+  return 0;
+}
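+
+// Usage sketch (illustrative only; BigRC, SmallRC and SubLo are hypothetical
+// names, not from this drop): if BigRC->getSubRegisterRegClass(SubLo) is
+// SmallRC and BigRC registers can hold i32, then
+//
+//   const TargetRegisterClass *RC =
+//     getSuperRegisterRegClass(SmallRC, SubLo, MVT::i32);  // RC == BigRC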
+
+/// EmitSubregNode - Generate machine code for subreg nodes.
+///
+void InstrEmitter::EmitSubregNode(SDNode *Node, 
+                                  DenseMap<SDValue, unsigned> &VRBaseMap,
+                                  bool IsClone, bool IsCloned) {
+  unsigned VRBase = 0;
+  unsigned Opc = Node->getMachineOpcode();
+  
+  // If the node is only used by a CopyToReg and the dest reg is a vreg, use
+  // the CopyToReg'd destination register instead of creating a new vreg.
+  for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
+       UI != E; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() == ISD::CopyToReg && 
+        User->getOperand(2).getNode() == Node) {
+      unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+        VRBase = DestReg;
+        break;
+      }
+    }
+  }
+  
+  if (Opc == TargetOpcode::EXTRACT_SUBREG) {
+    // EXTRACT_SUBREG is lowered as %dst = COPY %src:sub
+    unsigned SubIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+
+    // Figure out the register class to create for the destreg.
+    unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
+    const TargetRegisterClass *TRC = MRI->getRegClass(VReg);
+    const TargetRegisterClass *SRC = TRC->getSubRegisterRegClass(SubIdx);
+    assert(SRC && "Invalid subregister index in EXTRACT_SUBREG");
+
+    // Note that if we're going to directly use an existing register,
+    // it must be precisely the required class, and not a subclass
+    // thereof.
+    if (VRBase == 0 || SRC != MRI->getRegClass(VRBase)) {
+      // Create the reg
+      assert(SRC && "Couldn't find source register class");
+      VRBase = MRI->createVirtualRegister(SRC);
+    }
+
+    // Create the copy instruction implementing the extract_subreg.
+    MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(),
+                               TII->get(TargetOpcode::COPY), VRBase);
+
+    // Add source, and subreg index
+    AddOperand(MI, Node->getOperand(0), 0, 0, VRBaseMap, /*IsDebug=*/false,
+               IsClone, IsCloned);
+    assert(TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()) &&
+           "Cannot yet extract from physregs");
+    MI->getOperand(1).setSubReg(SubIdx);
+    MBB->insert(InsertPos, MI);
+  } else if (Opc == TargetOpcode::INSERT_SUBREG ||
+             Opc == TargetOpcode::SUBREG_TO_REG) {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+    SDValue N2 = Node->getOperand(2);
+    unsigned SubReg = getVR(N1, VRBaseMap);
+    unsigned SubIdx = cast<ConstantSDNode>(N2)->getZExtValue();
+    const TargetRegisterClass *TRC = MRI->getRegClass(SubReg);
+    const TargetRegisterClass *SRC =
+      getSuperRegisterRegClass(TRC, SubIdx, Node->getValueType(0));
+
+    // Figure out the register class to create for the destreg.
+    // Note that if we're going to directly use an existing register,
+    // it must be precisely the required class, and not a subclass
+    // thereof.
+    if (VRBase == 0 || SRC != MRI->getRegClass(VRBase)) {
+      // Create the reg
+      assert(SRC && "Couldn't find source register class");
+      VRBase = MRI->createVirtualRegister(SRC);
+    }
+
+    // Create the insert_subreg or subreg_to_reg machine instruction.
+    MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), TII->get(Opc));
+    MI->addOperand(MachineOperand::CreateReg(VRBase, true));
+    
+    // If creating a subreg_to_reg, then the first input operand
+    // is an implicit value immediate, otherwise it's a register
+    if (Opc == TargetOpcode::SUBREG_TO_REG) {
+      const ConstantSDNode *SD = cast<ConstantSDNode>(N0);
+      MI->addOperand(MachineOperand::CreateImm(SD->getZExtValue()));
+    } else
+      AddOperand(MI, N0, 0, 0, VRBaseMap, /*IsDebug=*/false,
+                 IsClone, IsCloned);
+    // Add the subregister being inserted.
+    AddOperand(MI, N1, 0, 0, VRBaseMap, /*IsDebug=*/false,
+               IsClone, IsCloned);
+    MI->addOperand(MachineOperand::CreateImm(SubIdx));
+    MBB->insert(InsertPos, MI);
+  } else
+    llvm_unreachable("Node is not insert_subreg, extract_subreg, or subreg_to_reg");
+     
+  SDValue Op(Node, 0);
+  bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+  (void)isNew; // Silence unused-variable warning in release builds.
+  assert(isNew && "Node emitted out of order - early");
+}
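+
+// Lowering sketch (illustrative, not from the original drop). The node
+//
+//   %dst = EXTRACT_SUBREG %src, subidx
+//
+// is emitted above as a plain COPY whose source operand carries the
+// sub-register index:
+//
+//   %dst = COPY %src:subidx
+//
+// INSERT_SUBREG and SUBREG_TO_REG keep their opcodes and take the forms
+//
+//   %dst = INSERT_SUBREG %supersrc, %subsrc, subidx
+//   %dst = SUBREG_TO_REG imm, %subsrc, subidx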
+
+/// EmitCopyToRegClassNode - Generate machine code for COPY_TO_REGCLASS nodes.
+/// COPY_TO_REGCLASS is just a normal copy, except that the destination
+/// register is constrained to be in a particular register class.
+///
+void
+InstrEmitter::EmitCopyToRegClassNode(SDNode *Node,
+                                     DenseMap<SDValue, unsigned> &VRBaseMap) {
+  unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
+
+  // Create the new VReg in the destination class and emit a copy.
+  unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+  const TargetRegisterClass *DstRC = TRI->getRegClass(DstRCIdx);
+  unsigned NewVReg = MRI->createVirtualRegister(DstRC);
+  BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+    NewVReg).addReg(VReg);
+
+  SDValue Op(Node, 0);
+  bool isNew = VRBaseMap.insert(std::make_pair(Op, NewVReg)).second;
+  (void)isNew; // Silence unused-variable warning in release builds.
+  assert(isNew && "Node emitted out of order - early");
+}
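+
+// Sketch (illustrative): COPY_TO_REGCLASS(%v, RCIdx) becomes an ordinary
+//
+//   %new = COPY %v          ; %new created in TRI->getRegClass(RCIdx)
+//
+// so later passes see a normal copy whose destination class carries the
+// register-class constraint.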
+
+/// EmitRegSequence - Generate machine code for REG_SEQUENCE nodes.
+///
+void InstrEmitter::EmitRegSequence(SDNode *Node,
+                                  DenseMap<SDValue, unsigned> &VRBaseMap,
+                                  bool IsClone, bool IsCloned) {
+  const TargetRegisterClass *RC = TLI->getRegClassFor(Node->getValueType(0));
+  unsigned NewVReg = MRI->createVirtualRegister(RC);
+  MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(),
+                             TII->get(TargetOpcode::REG_SEQUENCE), NewVReg);
+  unsigned NumOps = Node->getNumOperands();
+  assert((NumOps & 1) == 0 &&
+         "REG_SEQUENCE must have an even number of operands!");
+  const TargetInstrDesc &II = TII->get(TargetOpcode::REG_SEQUENCE);
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue Op = Node->getOperand(i);
+    if (i & 1) {
+      unsigned SubIdx = cast<ConstantSDNode>(Op)->getZExtValue();
+      unsigned SubReg = getVR(Node->getOperand(i-1), VRBaseMap);
+      const TargetRegisterClass *TRC = MRI->getRegClass(SubReg);
+      const TargetRegisterClass *SRC =
+        TRI->getMatchingSuperRegClass(RC, TRC, SubIdx);
+      if (!SRC)
+        llvm_unreachable("Invalid subregister index in REG_SEQUENCE");
+      if (SRC != RC) {
+        MRI->setRegClass(NewVReg, SRC);
+        RC = SRC;
+      }
+    }
+    AddOperand(MI, Op, i+1, &II, VRBaseMap, /*IsDebug=*/false,
+               IsClone, IsCloned);
+  }
+
+  MBB->insert(InsertPos, MI);
+  SDValue Op(Node, 0);
+  bool isNew = VRBaseMap.insert(std::make_pair(Op, NewVReg)).second;
+  (void)isNew; // Silence unused-variable warning in release builds.
+  assert(isNew && "Node emitted out of order - early");
+}
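+
+// Operand-layout sketch (illustrative): the node's operands alternate
+// (register, sub-register index) pairs, so a hypothetical two-register
+// sequence is emitted as
+//
+//   %dst = REG_SEQUENCE %lo, subidx0, %hi, subidx1
+//
+// The loop above reads the ConstantSDNode indices at odd positions and may
+// widen %dst's class via getMatchingSuperRegClass as the pairs are
+// processed.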
+
+/// EmitDbgValue - Generate machine instruction for a dbg_value node.
+///
+MachineInstr *
+InstrEmitter::EmitDbgValue(SDDbgValue *SD,
+                           DenseMap<SDValue, unsigned> &VRBaseMap) {
+  uint64_t Offset = SD->getOffset();
+  MDNode* MDPtr = SD->getMDPtr();
+  DebugLoc DL = SD->getDebugLoc();
+
+  if (SD->getKind() == SDDbgValue::FRAMEIX) {
+    // Stack address; this needs to be lowered in a target-dependent fashion.
+    // EmitTargetCodeForFrameDebugValue is responsible for allocation.
+    unsigned FrameIx = SD->getFrameIx();
+    return TII->emitFrameIndexDebugValue(*MF, FrameIx, Offset, MDPtr, DL);
+  }
+  // Otherwise, we're going to create an instruction here.
+  const TargetInstrDesc &II = TII->get(TargetOpcode::DBG_VALUE);
+  MachineInstrBuilder MIB = BuildMI(*MF, DL, II);
+  if (SD->getKind() == SDDbgValue::SDNODE) {
+    SDNode *Node = SD->getSDNode();
+    SDValue Op = SDValue(Node, SD->getResNo());
+    // It's possible we replaced this SDNode with other(s) and therefore
+    // didn't generate code for it.  It's better to catch these cases where
+    // they happen and transfer the debug info, but trying to guarantee that
+    // in all cases would be very fragile; this is a safeguard for any
+    // that were missed.
+    DenseMap<SDValue, unsigned>::iterator I = VRBaseMap.find(Op);
+    if (I==VRBaseMap.end())
+      MIB.addReg(0U);       // undef
+    else
+      AddOperand(&*MIB, Op, (*MIB).getNumOperands(), &II, VRBaseMap,
+                 /*IsDebug=*/true, /*IsClone=*/false, /*IsCloned=*/false);
+  } else if (SD->getKind() == SDDbgValue::CONST) {
+    const Value *V = SD->getConst();
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+      // FIXME: SDDbgValue constants aren't updated with legalization, so it's
+      // possible to have i128 constants in them at this point. The DWARF
+      // writer does not handle i128 constants at the moment, so as a crude
+      // workaround we just drop the debug info if this happens.
+      if (!CI->getValue().isSignedIntN(64))
+        MIB.addReg(0U);
+      else
+        MIB.addImm(CI->getSExtValue());
+    } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
+      MIB.addFPImm(CF);
+    } else {
+      // Could be an Undef.  In any case insert an Undef so we can see what we
+      // dropped.
+      MIB.addReg(0U);
+    }
+  } else {
+    // Insert an Undef so we can see what we dropped.
+    MIB.addReg(0U);
+  }
+
+  MIB.addImm(Offset).addMetadata(MDPtr);
+  return &*MIB;
+}
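+
+// Emission sketch (illustrative): depending on the SDDbgValue kind, the
+// DBG_VALUE built above takes one of these forms:
+//
+//   DBG_VALUE %vreg,  offset, !mdnode   ; SDNODE with a known vreg
+//   DBG_VALUE imm,    offset, !mdnode   ; CONST, ConstantInt fitting i64
+//   DBG_VALUE fpimm,  offset, !mdnode   ; CONST, ConstantFP
+//   DBG_VALUE %noreg, offset, !mdnode   ; anything dropped (undef)
+//
+// FRAMEIX values never reach this path; they are handed to
+// TII->emitFrameIndexDebugValue above.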
+
+/// EmitMachineNode - Generate machine code for a target-specific node and
+/// needed dependencies.
+///
+void InstrEmitter::
+EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
+                DenseMap<SDValue, unsigned> &VRBaseMap) {
+  unsigned Opc = Node->getMachineOpcode();
+  
+  // Handle subreg insert/extract specially
+  if (Opc == TargetOpcode::EXTRACT_SUBREG || 
+      Opc == TargetOpcode::INSERT_SUBREG ||
+      Opc == TargetOpcode::SUBREG_TO_REG) {
+    EmitSubregNode(Node, VRBaseMap, IsClone, IsCloned);
+    return;
+  }
+
+  // Handle COPY_TO_REGCLASS specially.
+  if (Opc == TargetOpcode::COPY_TO_REGCLASS) {
+    EmitCopyToRegClassNode(Node, VRBaseMap);
+    return;
+  }
+
+  // Handle REG_SEQUENCE specially.
+  if (Opc == TargetOpcode::REG_SEQUENCE) {
+    EmitRegSequence(Node, VRBaseMap, IsClone, IsCloned);
+    return;
+  }
+
+  if (Opc == TargetOpcode::IMPLICIT_DEF)
+    // We want a unique VR for each IMPLICIT_DEF use.
+    return;
+  
+  const TargetInstrDesc &II = TII->get(Opc);
+  unsigned NumResults = CountResults(Node);
+  unsigned NodeOperands = CountOperands(Node);
+  bool HasPhysRegOuts = NumResults > II.getNumDefs() && II.getImplicitDefs()!=0;
+#ifndef NDEBUG
+  unsigned NumMIOperands = NodeOperands + NumResults;
+  if (II.isVariadic())
+    assert(NumMIOperands >= II.getNumOperands() &&
+           "Too few operands for a variadic node!");
+  else
+    assert(NumMIOperands >= II.getNumOperands() &&
+           NumMIOperands <= II.getNumOperands()+II.getNumImplicitDefs() &&
+           "#operands for dag node doesn't match .td file!");
+#endif
+
+  // Create the new machine instruction.
+  MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), II);
+
+  // The MachineInstr constructor adds implicit-def operands. Scan through
+  // these to determine which are dead.
+  if (MI->getNumOperands() != 0 &&
+      Node->getValueType(Node->getNumValues()-1) == MVT::Flag) {
+    // First, collect all used registers.
+    SmallVector<unsigned, 8> UsedRegs;
+    for (SDNode *F = Node->getFlaggedUser(); F; F = F->getFlaggedUser())
+      if (F->getOpcode() == ISD::CopyFromReg)
+        UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg());
+      else {
+        // Collect declared implicit uses.
+        const TargetInstrDesc &TID = TII->get(F->getMachineOpcode());
+        UsedRegs.append(TID.getImplicitUses(),
+                        TID.getImplicitUses() + TID.getNumImplicitUses());
+        // In addition to declared implicit uses, we must also check for
+        // direct RegisterSDNode operands.
+        for (unsigned i = 0, e = F->getNumOperands(); i != e; ++i)
+          if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(F->getOperand(i))) {
+            unsigned Reg = R->getReg();
+            if (Reg != 0 && TargetRegisterInfo::isPhysicalRegister(Reg))
+              UsedRegs.push_back(Reg);
+          }
+      }
+    // Then mark unused registers as dead.
+    MI->setPhysRegsDeadExcept(UsedRegs, *TRI);
+  }
+  
+  // Add result register values for things that are defined by this
+  // instruction.
+  if (NumResults)
+    CreateVirtualRegisters(Node, MI, II, IsClone, IsCloned, VRBaseMap);
+  
+  // Emit all of the actual operands of this instruction, adding them to the
+  // instruction as appropriate.
+  bool HasOptPRefs = II.getNumDefs() > NumResults;
+  assert((!HasOptPRefs || !HasPhysRegOuts) &&
+         "Unable to cope with optional defs and phys regs defs!");
+  unsigned NumSkip = HasOptPRefs ? II.getNumDefs() - NumResults : 0;
+  for (unsigned i = NumSkip; i != NodeOperands; ++i)
+    AddOperand(MI, Node->getOperand(i), i-NumSkip+II.getNumDefs(), &II,
+               VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned);
+
+  // Transfer all of the memory reference descriptions of this instruction.
+  MI->setMemRefs(cast<MachineSDNode>(Node)->memoperands_begin(),
+                 cast<MachineSDNode>(Node)->memoperands_end());
+
+  // Insert the instruction into position in the block. This needs to
+  // happen before any custom inserter hook is called so that the
+  // hook knows where in the block to insert the replacement code.
+  MBB->insert(InsertPos, MI);
+
+  if (II.usesCustomInsertionHook()) {
+    // Insert this instruction into the basic block using a target-specific
+    // inserter which may return a new basic block.
+    bool AtEnd = InsertPos == MBB->end();
+    MachineBasicBlock *NewMBB = TLI->EmitInstrWithCustomInserter(MI, MBB);
+    if (NewMBB != MBB) {
+      if (AtEnd)
+        InsertPos = NewMBB->end();
+      MBB = NewMBB;
+    }
+    return;
+  }
+  
+  // Additional results must be physical register defs.
+  if (HasPhysRegOuts) {
+    for (unsigned i = II.getNumDefs(); i < NumResults; ++i) {
+      unsigned Reg = II.getImplicitDefs()[i - II.getNumDefs()];
+      if (Node->hasAnyUseOfValue(i))
+        EmitCopyFromReg(Node, i, IsClone, IsCloned, Reg, VRBaseMap);
+      // If there are no uses, mark the register as dead now, so that
+      // MachineLICM/Sink can see that it's dead. Don't do this if the
+      // node has a Flag value, for the benefit of targets still using
+      // Flag for values in physregs.
+      else if (Node->getValueType(Node->getNumValues()-1) != MVT::Flag)
+        MI->addRegisterDead(Reg, TRI);
+    }
+  }
+  
+  // If the instruction has implicit defs and the node doesn't, mark the
+  // implicit def as dead.  If the node has any flag outputs, we don't do this
+  // because we don't know what implicit defs are being used by flagged nodes.
+  if (Node->getValueType(Node->getNumValues()-1) != MVT::Flag)
+    if (const unsigned *IDList = II.getImplicitDefs()) {
+      for (unsigned i = NumResults, e = II.getNumDefs()+II.getNumImplicitDefs();
+           i != e; ++i)
+        MI->addRegisterDead(IDList[i-II.getNumDefs()], TRI);
+    }
+}
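+
+// Physical-register-result sketch (illustrative, with a hypothetical
+// instruction): if II declares implicit defs beyond its explicit defs, say
+// a multiply that implicitly defines HI and LO, then for each extra result:
+//
+//   value used   -> EmitCopyFromReg copies the physreg into a fresh vreg
+//   value unused -> the physreg def is marked dead (unless the node
+//                   produces a Flag, in which case it is left alone)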
+
+/// EmitSpecialNode - Generate machine code for a target-independent node and
+/// needed dependencies.
+void InstrEmitter::
+EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
+                DenseMap<SDValue, unsigned> &VRBaseMap) {
+  switch (Node->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    Node->dump();
+#endif
+    llvm_unreachable("This target-independent node should have been selected!");
+    break;
+  case ISD::EntryToken:
+    llvm_unreachable("EntryToken should have been excluded from the schedule!");
+    break;
+  case ISD::MERGE_VALUES:
+  case ISD::TokenFactor: // fall thru
+    break;
+  case ISD::CopyToReg: {
+    unsigned SrcReg;
+    SDValue SrcVal = Node->getOperand(2);
+    if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(SrcVal))
+      SrcReg = R->getReg();
+    else
+      SrcReg = getVR(SrcVal, VRBaseMap);
+      
+    unsigned DestReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+    if (SrcReg == DestReg) // Coalesced away the copy? Ignore.
+      break;
+
+    BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+            DestReg).addReg(SrcReg);
+    break;
+  }
+  case ISD::CopyFromReg: {
+    unsigned SrcReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+    EmitCopyFromReg(Node, 0, IsClone, IsCloned, SrcReg, VRBaseMap);
+    break;
+  }
+  }
+}
+
+/// InstrEmitter - Construct an InstrEmitter and set it to start inserting
+/// at the given position in the given block.
+InstrEmitter::InstrEmitter(MachineBasicBlock *mbb,
+                           MachineBasicBlock::iterator insertpos)
+  : MF(mbb->getParent()),
+    MRI(&MF->getRegInfo()),
+    TM(&MF->getTarget()),
+    TII(TM->getInstrInfo()),
+    TRI(TM->getRegisterInfo()),
+    TLI(TM->getTargetLowering()),
+    MBB(mbb), InsertPos(insertpos) {
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/InstrEmitter.h b/src/LLVM/lib/CodeGen/SelectionDAG/InstrEmitter.h
new file mode 100644
index 0000000..02c044c
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -0,0 +1,142 @@
+//===---- InstrEmitter.h - Emit MachineInstrs for the SelectionDAG class ---==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This declares the Emit routines for the SelectionDAG class, which creates
+// MachineInstrs based on the decisions of the SelectionDAG instruction
+// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef INSTREMITTER_H
+#define INSTREMITTER_H
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+class TargetInstrDesc;
+class SDDbgValue;
+
+class InstrEmitter {
+  MachineFunction *MF;
+  MachineRegisterInfo *MRI;
+  const TargetMachine *TM;
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  const TargetLowering *TLI;
+
+  MachineBasicBlock *MBB;
+  MachineBasicBlock::iterator InsertPos;
+
+  /// EmitCopyFromReg - Generate machine code for a CopyFromReg node or an
+  /// implicit physical register output.
+  void EmitCopyFromReg(SDNode *Node, unsigned ResNo,
+                       bool IsClone, bool IsCloned,
+                       unsigned SrcReg,
+                       DenseMap<SDValue, unsigned> &VRBaseMap);
+
+  /// getDstOfOnlyCopyToRegUse - If the only use of the specified result number of
+  /// node is a CopyToReg, return its destination register. Return 0 otherwise.
+  unsigned getDstOfOnlyCopyToRegUse(SDNode *Node,
+                                    unsigned ResNo) const;
+
+  void CreateVirtualRegisters(SDNode *Node, MachineInstr *MI,
+                              const TargetInstrDesc &II,
+                              bool IsClone, bool IsCloned,
+                              DenseMap<SDValue, unsigned> &VRBaseMap);
+
+  /// getVR - Return the virtual register corresponding to the specified result
+  /// of the specified node.
+  unsigned getVR(SDValue Op,
+                 DenseMap<SDValue, unsigned> &VRBaseMap);
+
+  /// AddRegisterOperand - Add the specified register as an operand to the
+  /// specified machine instr. Insert register copies if the register is
+  /// not in the required register class.
+  void AddRegisterOperand(MachineInstr *MI, SDValue Op,
+                          unsigned IIOpNum,
+                          const TargetInstrDesc *II,
+                          DenseMap<SDValue, unsigned> &VRBaseMap,
+                          bool IsDebug, bool IsClone, bool IsCloned);
+
+  /// AddOperand - Add the specified operand to the specified machine instr.  II
+  /// specifies the instruction information for the node, and IIOpNum is the
+  /// operand number (in the II) that we are adding. IIOpNum and II are used for
+  /// assertions only.
+  void AddOperand(MachineInstr *MI, SDValue Op,
+                  unsigned IIOpNum,
+                  const TargetInstrDesc *II,
+                  DenseMap<SDValue, unsigned> &VRBaseMap,
+                  bool IsDebug, bool IsClone, bool IsCloned);
+
+  /// EmitSubregNode - Generate machine code for subreg nodes.
+  ///
+  void EmitSubregNode(SDNode *Node, DenseMap<SDValue, unsigned> &VRBaseMap,
+                      bool IsClone, bool IsCloned);
+
+  /// EmitCopyToRegClassNode - Generate machine code for COPY_TO_REGCLASS nodes.
+  /// COPY_TO_REGCLASS is just a normal copy, except that the destination
+  /// register is constrained to be in a particular register class.
+  ///
+  void EmitCopyToRegClassNode(SDNode *Node,
+                              DenseMap<SDValue, unsigned> &VRBaseMap);
+
+  /// EmitRegSequence - Generate machine code for REG_SEQUENCE nodes.
+  ///
+  void EmitRegSequence(SDNode *Node, DenseMap<SDValue, unsigned> &VRBaseMap,
+                       bool IsClone, bool IsCloned);
+public:
+  /// CountResults - The results of target nodes have register or immediate
+  /// operands first, then an optional chain, and optional flag operands
+  /// (which do not go into the machine instrs).
+  static unsigned CountResults(SDNode *Node);
+
+  /// CountOperands - The inputs to target nodes have any actual inputs first,
+  /// followed by an optional chain operand, then flag operands.  Compute
+  /// the number of actual operands that will go into the resulting
+  /// MachineInstr.
+  static unsigned CountOperands(SDNode *Node);
+
+  /// EmitDbgValue - Generate machine instruction for a dbg_value node.
+  ///
+  MachineInstr *EmitDbgValue(SDDbgValue *SD,
+                             DenseMap<SDValue, unsigned> &VRBaseMap);
+
+  /// EmitNode - Generate machine code for a node and needed dependencies.
+  ///
+  void EmitNode(SDNode *Node, bool IsClone, bool IsCloned,
+                DenseMap<SDValue, unsigned> &VRBaseMap) {
+    if (Node->isMachineOpcode())
+      EmitMachineNode(Node, IsClone, IsCloned, VRBaseMap);
+    else
+      EmitSpecialNode(Node, IsClone, IsCloned, VRBaseMap);
+  }
+
+  /// getBlock - Return the current basic block.
+  MachineBasicBlock *getBlock() { return MBB; }
+
+  /// getInsertPos - Return the current insertion position.
+  MachineBasicBlock::iterator getInsertPos() { return InsertPos; }
+
+  /// InstrEmitter - Construct an InstrEmitter and set it to start inserting
+  /// at the given position in the given block.
+  InstrEmitter(MachineBasicBlock *mbb, MachineBasicBlock::iterator insertpos);
+  
+private:
+  void EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
+                       DenseMap<SDValue, unsigned> &VRBaseMap);
+  void EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
+                       DenseMap<SDValue, unsigned> &VRBaseMap);
+};
+
+}
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
new file mode 100644
index 0000000..4628561
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -0,0 +1,3376 @@
+//===-- LegalizeDAG.cpp - Implement SelectionDAG::Legalize ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::Legalize method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+/// SelectionDAGLegalize - This takes an arbitrary SelectionDAG as input and
+/// hacks on it until the target machine can handle it.  This involves
+/// eliminating value sizes the machine cannot handle (promoting small sizes to
+/// large sizes or splitting up large values into small values) as well as
+/// eliminating operations the machine cannot handle.
+///
+/// This code also does a small amount of optimization and recognition of idioms
+/// as part of its processing.  For example, if a target does not support a
+/// 'setcc' instruction efficiently, but does support the 'brcc' instruction,
+/// this will attempt to merge the setcc and branch instructions into brcc's.
+///
+namespace {
+class SelectionDAGLegalize {
+  const TargetMachine &TM;
+  const TargetLowering &TLI;
+  SelectionDAG &DAG;
+  CodeGenOpt::Level OptLevel;
+
+  // Libcall insertion helpers.
+
+  /// LastCALLSEQ_END - This keeps track of the CALLSEQ_END node that has been
+  /// legalized.  We use this to ensure that calls are properly serialized
+  /// against each other, including inserted libcalls.
+  SDValue LastCALLSEQ_END;
+
+  /// IsLegalizingCall - This member is used *only* for purposes of providing
+  /// helpful assertions that a libcall isn't created while another call is
+  /// being legalized (which could lead to non-serialized call sequences).
+  bool IsLegalizingCall;
+
+  enum LegalizeAction {
+    Legal,      // The target natively supports this operation.
+    Promote,    // This operation should be executed in a larger type.
+    Expand      // Try to expand this to other ops, otherwise use a libcall.
+  };
+
+  /// ValueTypeActions - This is a bitvector that contains two bits for each
+  /// value type, where the two bits correspond to the LegalizeAction enum.
+  /// This can be queried with "getTypeAction(VT)".
+  TargetLowering::ValueTypeActionImpl ValueTypeActions;
+
+  /// LegalizedNodes - For nodes that are of legal width, and that have more
+  /// than one use, this map indicates what legalized operand to use.  This
+  /// allows us to avoid legalizing the same thing more than once.
+  DenseMap<SDValue, SDValue> LegalizedNodes;
+
+  void AddLegalizedOperand(SDValue From, SDValue To) {
+    LegalizedNodes.insert(std::make_pair(From, To));
+    // If someone requests legalization of the new node, return it unchanged.
+    if (From != To)
+      LegalizedNodes.insert(std::make_pair(To, To));
+  }
+
+public:
+  SelectionDAGLegalize(SelectionDAG &DAG, CodeGenOpt::Level ol);
+
+  /// getTypeAction - Return how we should legalize values of this type, either
+  /// it is already legal or we need to expand it into multiple registers of
+  /// smaller integer type, or we need to promote it to a larger type.
+  LegalizeAction getTypeAction(EVT VT) const {
+    return
+        (LegalizeAction)ValueTypeActions.getTypeAction(*DAG.getContext(), VT);
+  }
+
+  /// isTypeLegal - Return true if this type is legal on this target.
+  ///
+  bool isTypeLegal(EVT VT) const {
+    return getTypeAction(VT) == Legal;
+  }
+
+  void LegalizeDAG();
+
+private:
+  /// LegalizeOp - We know that the specified value has a legal type.
+  /// Recursively ensure that the operands have legal types, then return the
+  /// result.
+  SDValue LegalizeOp(SDValue O);
+
+  SDValue OptimizeFloatStore(StoreSDNode *ST);
+
+  /// PerformInsertVectorEltInMemory - Some targets cannot handle a variable
+  /// insertion index for the INSERT_VECTOR_ELT instruction.  In this case, it
+  /// is necessary to spill the vector being inserted into to memory, perform
+  /// the insert there, and then read the result back.
+  SDValue PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val,
+                                         SDValue Idx, DebugLoc dl);
+  SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
+                                  SDValue Idx, DebugLoc dl);
+
+  /// ShuffleWithNarrowerEltType - Return a vector shuffle operation which
+  /// performs the same shuffle in terms of order of result bytes, but on a type
+  /// whose vector element type is narrower than the original shuffle type.
+  /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
+  SDValue ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl,
+                                     SDValue N1, SDValue N2,
+                                     SmallVectorImpl<int> &Mask) const;
+
+  bool LegalizeAllNodesNotLeadingTo(SDNode *N, SDNode *Dest,
+                                    SmallPtrSet<SDNode*, 32> &NodesLeadingTo);
+
+  void LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
+                             DebugLoc dl);
+
+  SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
+  std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
+                                                 SDNode *Node, bool isSigned);
+  SDValue ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32,
+                          RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80,
+                          RTLIB::Libcall Call_PPCF128);
+  SDValue ExpandIntLibCall(SDNode *Node, bool isSigned,
+                           RTLIB::Libcall Call_I8,
+                           RTLIB::Libcall Call_I16,
+                           RTLIB::Libcall Call_I32,
+                           RTLIB::Libcall Call_I64,
+                           RTLIB::Libcall Call_I128);
+
+  SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, DebugLoc dl);
+  SDValue ExpandBUILD_VECTOR(SDNode *Node);
+  SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node);
+  void ExpandDYNAMIC_STACKALLOC(SDNode *Node,
+                                SmallVectorImpl<SDValue> &Results);
+  SDValue ExpandFCOPYSIGN(SDNode *Node);
+  SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue LegalOp, EVT DestVT,
+                               DebugLoc dl);
+  SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned,
+                                DebugLoc dl);
+  SDValue PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT, bool isSigned,
+                                DebugLoc dl);
+
+  SDValue ExpandBSWAP(SDValue Op, DebugLoc dl);
+  SDValue ExpandBitCount(unsigned Opc, SDValue Op, DebugLoc dl);
+
+  SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
+  SDValue ExpandVectorBuildThroughStack(SDNode* Node);
+
+  std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node);
+
+  void ExpandNode(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+  void PromoteNode(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+};
+}
+
+/// ShuffleWithNarrowerEltType - Return a vector shuffle operation which
+/// performs the same shuffle in terms of order of result bytes, but on a type
+/// whose vector element type is narrower than the original shuffle type.
+/// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
+SDValue
+SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT,  DebugLoc dl,
+                                                 SDValue N1, SDValue N2,
+                                             SmallVectorImpl<int> &Mask) const {
+  unsigned NumMaskElts = VT.getVectorNumElements();
+  unsigned NumDestElts = NVT.getVectorNumElements();
+  unsigned NumEltsGrowth = NumDestElts / NumMaskElts;
+
+  assert(NumEltsGrowth && "Cannot promote to vector type with fewer elts!");
+
+  if (NumEltsGrowth == 1)
+    return DAG.getVectorShuffle(NVT, dl, N1, N2, &Mask[0]);
+
+  SmallVector<int, 8> NewMask;
+  for (unsigned i = 0; i != NumMaskElts; ++i) {
+    int Idx = Mask[i];
+    for (unsigned j = 0; j != NumEltsGrowth; ++j) {
+      if (Idx < 0)
+        NewMask.push_back(-1);
+      else
+        NewMask.push_back(Idx * NumEltsGrowth + j);
+    }
+  }
+  assert(NewMask.size() == NumDestElts && "Non-integer NumEltsGrowth?");
+  assert(TLI.isShuffleMaskLegal(NewMask, NVT) && "Shuffle not legal?");
+  return DAG.getVectorShuffle(NVT, dl, N1, N2, &NewMask[0]);
+}
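+
+// Mask-widening sketch (illustrative): with NumEltsGrowth == 2, the mask
+// <0, -1> becomes <0, 1, -1, -1>; source lane Idx expands to positions
+// Idx * NumEltsGrowth + j for each j, and undef lanes (-1) stay undef in
+// every grown position.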
+
+SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag,
+                                           CodeGenOpt::Level ol)
+  : TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()),
+    DAG(dag), OptLevel(ol),
+    ValueTypeActions(TLI.getValueTypeActions()) {
+  assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE &&
+         "Too many value types for ValueTypeActions to hold!");
+}
+
+void SelectionDAGLegalize::LegalizeDAG() {
+  LastCALLSEQ_END = DAG.getEntryNode();
+  IsLegalizingCall = false;
+
+  // The legalize process is inherently a bottom-up recursive process (users
+  // legalize their uses before themselves).  Given infinite stack space, we
+  // could just start legalizing on the root and traverse the whole graph.  In
+  // practice however, this causes us to run out of stack space on large basic
+  // blocks.  To avoid this problem, compute an ordering of the nodes where each
+  // node is only legalized after all of its operands are legalized.
+  DAG.AssignTopologicalOrder();
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = prior(DAG.allnodes_end()); I != llvm::next(E); ++I)
+    LegalizeOp(SDValue(I, 0));
+
+  // Finally, it's possible the root changed.  Get the new root.
+  SDValue OldRoot = DAG.getRoot();
+  assert(LegalizedNodes.count(OldRoot) && "Root didn't get legalized?");
+  DAG.setRoot(LegalizedNodes[OldRoot]);
+
+  LegalizedNodes.clear();
+
+  // Remove dead nodes now.
+  DAG.RemoveDeadNodes();
+}
+
+
+/// FindCallEndFromCallStart - Given a chained node that is part of a call
+/// sequence, find the CALLSEQ_END node that terminates the call sequence.
+static SDNode *FindCallEndFromCallStart(SDNode *Node) {
+  if (Node->getOpcode() == ISD::CALLSEQ_END)
+    return Node;
+  if (Node->use_empty())
+    return 0;   // No CallSeqEnd
+
+  // The chain is usually at the end.
+  SDValue TheChain(Node, Node->getNumValues()-1);
+  if (TheChain.getValueType() != MVT::Other) {
+    // Sometimes it's at the beginning.
+    TheChain = SDValue(Node, 0);
+    if (TheChain.getValueType() != MVT::Other) {
+      // Otherwise, hunt for it.
+      for (unsigned i = 1, e = Node->getNumValues(); i != e; ++i)
+        if (Node->getValueType(i) == MVT::Other) {
+          TheChain = SDValue(Node, i);
+          break;
+        }
+
+      // Otherwise, we walked into a node without a chain.
+      if (TheChain.getValueType() != MVT::Other)
+        return 0;
+    }
+  }
+
+  for (SDNode::use_iterator UI = Node->use_begin(),
+       E = Node->use_end(); UI != E; ++UI) {
+
+    // Make sure to only follow users of our token chain.
+    SDNode *User = *UI;
+    for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i)
+      if (User->getOperand(i) == TheChain)
+        if (SDNode *Result = FindCallEndFromCallStart(User))
+          return Result;
+  }
+  return 0;
+}
+
+/// FindCallStartFromCallEnd - Given a chained node that is part of a call
+/// sequence, find the CALLSEQ_START node that initiates the call sequence.
+static SDNode *FindCallStartFromCallEnd(SDNode *Node) {
+  assert(Node && "Didn't find callseq_start for a call??");
+  if (Node->getOpcode() == ISD::CALLSEQ_START) return Node;
+
+  assert(Node->getOperand(0).getValueType() == MVT::Other &&
+         "Node doesn't have a token chain argument!");
+  return FindCallStartFromCallEnd(Node->getOperand(0).getNode());
+}
+
+/// LegalizeAllNodesNotLeadingTo - Recursively walk the uses of N, looking to
+/// see if any uses can reach Dest.  If no operands can get to Dest, legalize
+/// them, legalize ourselves, and return false; otherwise, return true.
+///
+/// Keep track of the nodes we find that actually do lead to Dest in
+/// NodesLeadingTo.  This avoids retraversing them an exponential number of
+/// times.
+///
+bool SelectionDAGLegalize::LegalizeAllNodesNotLeadingTo(SDNode *N, SDNode *Dest,
+                                     SmallPtrSet<SDNode*, 32> &NodesLeadingTo) {
+  if (N == Dest) return true;  // N certainly leads to Dest :)
+
+  // If we've already processed this node and it does lead to Dest, there is no
+  // need to reprocess it.
+  if (NodesLeadingTo.count(N)) return true;
+
+  // If the first result of this node has been already legalized, then it cannot
+  // reach N.
+  if (LegalizedNodes.count(SDValue(N, 0))) return false;
+
+  // Okay, this node has not already been legalized.  Check and legalize all
+  // operands.  If none lead to Dest, then we can legalize this node.
+  bool OperandsLeadToDest = false;
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    OperandsLeadToDest |=     // If an operand leads to Dest, so do we.
+      LegalizeAllNodesNotLeadingTo(N->getOperand(i).getNode(), Dest,
+                                   NodesLeadingTo);
+
+  if (OperandsLeadToDest) {
+    NodesLeadingTo.insert(N);
+    return true;
+  }
+
+  // Okay, this node looks safe, legalize it and return false.
+  LegalizeOp(SDValue(N, 0));
+  return false;
+}
+
+/// ExpandConstantFP - Expands the ConstantFP node to an integer constant or
+/// a load from the constant pool.
+static SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP,
+                                SelectionDAG &DAG, const TargetLowering &TLI) {
+  bool Extend = false;
+  DebugLoc dl = CFP->getDebugLoc();
+
+  // If an FP immediate is precise when represented as a float and if the
+  // target can do an extending load from float to double, we put it into
+  // the constant pool as a float, even if it is statically typed as a
+  // double.  This shrinks FP constants and canonicalizes them for targets where
+  // an FP extending load is the same cost as a normal load (such as on the x87
+  // fp stack or PPC FP unit).
+  EVT VT = CFP->getValueType(0);
+  ConstantFP *LLVMC = const_cast<ConstantFP*>(CFP->getConstantFPValue());
+  if (!UseCP) {
+    assert((VT == MVT::f64 || VT == MVT::f32) && "Invalid type expansion");
+    return DAG.getConstant(LLVMC->getValueAPF().bitcastToAPInt(),
+                           (VT == MVT::f64) ? MVT::i64 : MVT::i32);
+  }
+
+  EVT OrigVT = VT;
+  EVT SVT = VT;
+  while (SVT != MVT::f32) {
+    SVT = (MVT::SimpleValueType)(SVT.getSimpleVT().SimpleTy - 1);
+    if (ConstantFPSDNode::isValueValidForType(SVT, CFP->getValueAPF()) &&
+        // Only do this if the target has a native EXTLOAD instruction from
+        // smaller type.
+        TLI.isLoadExtLegal(ISD::EXTLOAD, SVT) &&
+        TLI.ShouldShrinkFPConstant(OrigVT)) {
+      const Type *SType = SVT.getTypeForEVT(*DAG.getContext());
+      LLVMC = cast<ConstantFP>(ConstantExpr::getFPTrunc(LLVMC, SType));
+      VT = SVT;
+      Extend = true;
+    }
+  }
+
+  SDValue CPIdx = DAG.getConstantPool(LLVMC, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  if (Extend)
+    return DAG.getExtLoad(ISD::EXTLOAD, OrigVT, dl,
+                          DAG.getEntryNode(),
+                          CPIdx, PseudoSourceValue::getConstantPool(),
+                          0, VT, false, false, Alignment);
+  return DAG.getLoad(OrigVT, dl, DAG.getEntryNode(), CPIdx,
+                     PseudoSourceValue::getConstantPool(), 0, false, false,
+                     Alignment);
+}
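+
+// Shrinking sketch (illustrative): a ConstantFP such as 'double 2.0' is
+// exactly representable as a float, so if the target has a legal
+// f32 -> f64 EXTLOAD and ShouldShrinkFPConstant(f64) holds, the constant
+// is placed in the pool as an f32 and materialized by the Extend path:
+//
+//   %d = extload f64 from the f32 constant-pool slot
+//
+// Otherwise it is loaded at its original width.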
+
+/// ExpandUnalignedStore - Expands an unaligned store to 2 half-size stores.
+static
+SDValue ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
+                             const TargetLowering &TLI) {
+  SDValue Chain = ST->getChain();
+  SDValue Ptr = ST->getBasePtr();
+  SDValue Val = ST->getValue();
+  EVT VT = Val.getValueType();
+  int Alignment = ST->getAlignment();
+  int SVOffset = ST->getSrcValueOffset();
+  DebugLoc dl = ST->getDebugLoc();
+  if (ST->getMemoryVT().isFloatingPoint() ||
+      ST->getMemoryVT().isVector()) {
+    EVT intVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+    if (TLI.isTypeLegal(intVT)) {
+      // Expand to a bitconvert of the value to the integer type of the
+      // same size, then a (misaligned) int store.
+      // FIXME: Does not handle truncating floating point stores!
+      SDValue Result = DAG.getNode(ISD::BIT_CONVERT, dl, intVT, Val);
+      return DAG.getStore(Chain, dl, Result, Ptr, ST->getSrcValue(),
+                          SVOffset, ST->isVolatile(), ST->isNonTemporal(),
+                          Alignment);
+    } else {
+      // Do an (aligned) store to a stack slot, then copy from the stack slot
+      // to the final destination using (unaligned) integer loads and stores.
+      EVT StoredVT = ST->getMemoryVT();
+      EVT RegVT =
+        TLI.getRegisterType(*DAG.getContext(),
+                            EVT::getIntegerVT(*DAG.getContext(),
+                                              StoredVT.getSizeInBits()));
+      unsigned StoredBytes = StoredVT.getSizeInBits() / 8;
+      unsigned RegBytes = RegVT.getSizeInBits() / 8;
+      unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes;
+
+      // Make sure the stack slot is also aligned for the register type.
+      SDValue StackPtr = DAG.CreateStackTemporary(StoredVT, RegVT);
+
+      // Perform the original store, only redirected to the stack slot.
+      SDValue Store = DAG.getTruncStore(Chain, dl,
+                                        Val, StackPtr, NULL, 0, StoredVT,
+                                        false, false, 0);
+      SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy());
+      SmallVector<SDValue, 8> Stores;
+      unsigned Offset = 0;
+
+      // Do all but one of the copies using the full register width.
+      for (unsigned i = 1; i < NumRegs; i++) {
+        // Load one integer register's worth from the stack slot.
+        SDValue Load = DAG.getLoad(RegVT, dl, Store, StackPtr, NULL, 0,
+                                   false, false, 0);
+        // Store it to the final location.  Remember the store.
+        Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
+                                      ST->getSrcValue(), SVOffset + Offset,
+                                      ST->isVolatile(), ST->isNonTemporal(),
+                                      MinAlign(ST->getAlignment(), Offset)));
+        // Increment the pointers.
+        Offset += RegBytes;
+        StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                               Increment);
+        Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+      }
+
+      // The last store may be partial.  Do a truncating store.  On big-endian
+      // machines this requires an extending load from the stack slot to ensure
+      // that the bits are in the right place.
+      EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
+                                    8 * (StoredBytes - Offset));
+
+      // Load from the stack slot.
+      SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, RegVT, dl, Store, StackPtr,
+                                    NULL, 0, MemVT, false, false, 0);
+
+      Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
+                                         ST->getSrcValue(), SVOffset + Offset,
+                                         MemVT, ST->isVolatile(),
+                                         ST->isNonTemporal(),
+                                         MinAlign(ST->getAlignment(), Offset)));
+      // The order of the stores doesn't matter - say it with a TokenFactor.
+      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0],
+                         Stores.size());
+    }
+  }
+  assert(ST->getMemoryVT().isInteger() &&
+         !ST->getMemoryVT().isVector() &&
+         "Unaligned store of unknown type.");
+  // Get the half-size VT
+  EVT NewStoredVT = ST->getMemoryVT().getHalfSizedIntegerVT(*DAG.getContext());
+  int NumBits = NewStoredVT.getSizeInBits();
+  int IncrementSize = NumBits / 8;
+
+  // Divide the stored value in two parts.
+  SDValue ShiftAmount = DAG.getConstant(NumBits, TLI.getShiftAmountTy());
+  SDValue Lo = Val;
+  SDValue Hi = DAG.getNode(ISD::SRL, dl, VT, Val, ShiftAmount);
+
+  // Store the two parts
+  SDValue Store1, Store2;
+  Store1 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Lo:Hi, Ptr,
+                             ST->getSrcValue(), SVOffset, NewStoredVT,
+                             ST->isVolatile(), ST->isNonTemporal(), Alignment);
+  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                    DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+  Alignment = MinAlign(Alignment, IncrementSize);
+  Store2 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Hi:Lo, Ptr,
+                             ST->getSrcValue(), SVOffset + IncrementSize,
+                             NewStoredVT, ST->isVolatile(), ST->isNonTemporal(),
+                             Alignment);
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+}
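+
+// Splitting sketch (illustrative): for the integer path above, an unaligned
+// i32 store on a little-endian target becomes two half-width stores:
+//
+//   Lo = Val                ; truncstore i16 at Ptr
+//   Hi = SRL Val, 16        ; truncstore i16 at Ptr + 2
+//
+// On big-endian targets the Hi half is stored first, at Ptr.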
+
+/// ExpandUnalignedLoad - Expands an unaligned load to 2 half-size loads.
+static
+SDValue ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
+                            const TargetLowering &TLI) {
+  int SVOffset = LD->getSrcValueOffset();
+  SDValue Chain = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+  EVT VT = LD->getValueType(0);
+  EVT LoadedVT = LD->getMemoryVT();
+  DebugLoc dl = LD->getDebugLoc();
+  if (VT.isFloatingPoint() || VT.isVector()) {
+    EVT intVT = EVT::getIntegerVT(*DAG.getContext(), LoadedVT.getSizeInBits());
+    if (TLI.isTypeLegal(intVT)) {
+      // Expand to a (misaligned) integer load of the same size,
+      // then bitconvert to floating point or vector.
+      SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr, LD->getSrcValue(),
+                                    SVOffset, LD->isVolatile(),
+                                    LD->isNonTemporal(), LD->getAlignment());
+      SDValue Result = DAG.getNode(ISD::BIT_CONVERT, dl, LoadedVT, newLoad);
+      if (VT.isFloatingPoint() && LoadedVT != VT)
+        Result = DAG.getNode(ISD::FP_EXTEND, dl, VT, Result);
+
+      SDValue Ops[] = { Result, Chain };
+      return DAG.getMergeValues(Ops, 2, dl);
+    } else {
+      // Copy the value to an (aligned) stack slot using (unaligned) integer
+      // loads and stores, then do an (aligned) load from the stack slot.
+      EVT RegVT = TLI.getRegisterType(*DAG.getContext(), intVT);
+      unsigned LoadedBytes = LoadedVT.getSizeInBits() / 8;
+      unsigned RegBytes = RegVT.getSizeInBits() / 8;
+      unsigned NumRegs = (LoadedBytes + RegBytes - 1) / RegBytes;
+
+      // Make sure the stack slot is also aligned for the register type.
+      SDValue StackBase = DAG.CreateStackTemporary(LoadedVT, RegVT);
+
+      SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy());
+      SmallVector<SDValue, 8> Stores;
+      SDValue StackPtr = StackBase;
+      unsigned Offset = 0;
+
+      // Do all but one of the copies using the full register width.
+      for (unsigned i = 1; i < NumRegs; i++) {
+        // Load one integer register's worth from the original location.
+        SDValue Load = DAG.getLoad(RegVT, dl, Chain, Ptr, LD->getSrcValue(),
+                                   SVOffset + Offset, LD->isVolatile(),
+                                   LD->isNonTemporal(),
+                                   MinAlign(LD->getAlignment(), Offset));
+        // Follow the load with a store to the stack slot.  Remember the store.
+        Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, StackPtr,
+                                      NULL, 0, false, false, 0));
+        // Increment the pointers.
+        Offset += RegBytes;
+        Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+        StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                               Increment);
+      }
+
+      // The last copy may be partial.  Do an extending load.
+      EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
+                                    8 * (LoadedBytes - Offset));
+      SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, RegVT, dl, Chain, Ptr,
+                                    LD->getSrcValue(), SVOffset + Offset,
+                                    MemVT, LD->isVolatile(),
+                                    LD->isNonTemporal(),
+                                    MinAlign(LD->getAlignment(), Offset));
+      // Follow the load with a store to the stack slot.  Remember the store.
+      // On big-endian machines this requires a truncating store to ensure
+      // that the bits end up in the right place.
+      Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, StackPtr,
+                                         NULL, 0, MemVT, false, false, 0));
+
+      // The order of the stores doesn't matter - say it with a TokenFactor.
+      SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0],
+                               Stores.size());
+
+      // Finally, perform the original load only redirected to the stack slot.
+      Load = DAG.getExtLoad(LD->getExtensionType(), VT, dl, TF, StackBase,
+                            NULL, 0, LoadedVT, false, false, 0);
+
+      // Callers expect a MERGE_VALUES node.
+      SDValue Ops[] = { Load, TF };
+      return DAG.getMergeValues(Ops, 2, dl);
+    }
+  }
+  assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
+         "Unaligned load of unsupported type.");
+
+  // Compute the new VT that is half the size of the old one.  This is an
+  // integer MVT.
+  unsigned NumBits = LoadedVT.getSizeInBits();
+  EVT NewLoadedVT;
+  NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2);
+  NumBits >>= 1;
+
+  unsigned Alignment = LD->getAlignment();
+  unsigned IncrementSize = NumBits / 8;
+  ISD::LoadExtType HiExtType = LD->getExtensionType();
+
+  // If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
+  if (HiExtType == ISD::NON_EXTLOAD)
+    HiExtType = ISD::ZEXTLOAD;
+
+  // Load the value in two parts
+  SDValue Lo, Hi;
+  if (TLI.isLittleEndian()) {
+    Lo = DAG.getExtLoad(ISD::ZEXTLOAD, VT, dl, Chain, Ptr, LD->getSrcValue(),
+                        SVOffset, NewLoadedVT, LD->isVolatile(),
+                        LD->isNonTemporal(), Alignment);
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                      DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+    Hi = DAG.getExtLoad(HiExtType, VT, dl, Chain, Ptr, LD->getSrcValue(),
+                        SVOffset + IncrementSize, NewLoadedVT, LD->isVolatile(),
+                        LD->isNonTemporal(), MinAlign(Alignment,IncrementSize));
+  } else {
+    Hi = DAG.getExtLoad(HiExtType, VT, dl, Chain, Ptr, LD->getSrcValue(),
+                        SVOffset, NewLoadedVT, LD->isVolatile(),
+                        LD->isNonTemporal(), Alignment);
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                      DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+    Lo = DAG.getExtLoad(ISD::ZEXTLOAD, VT, dl, Chain, Ptr, LD->getSrcValue(),
+                        SVOffset + IncrementSize, NewLoadedVT, LD->isVolatile(),
+                        LD->isNonTemporal(), MinAlign(Alignment,IncrementSize));
+  }
+
+  // Aggregate the two parts.
+  SDValue ShiftAmount = DAG.getConstant(NumBits, TLI.getShiftAmountTy());
+  SDValue Result = DAG.getNode(ISD::SHL, dl, VT, Hi, ShiftAmount);
+  Result = DAG.getNode(ISD::OR, dl, VT, Result, Lo);
+
+  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                             Hi.getValue(1));
+
+  SDValue Ops[] = { Result, TF };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
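+
+// Reassembly sketch (illustrative): for the integer path above, an unaligned
+// i32 load on a little-endian target becomes
+//
+//   Lo = zextload i16, Ptr
+//   Hi = extload  i16, Ptr + 2   ; ZEXTLOAD if the original was NON_EXTLOAD
+//   Result = (Hi << 16) | Lo
+//
+// with a TokenFactor joining the two load chains.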
+
+/// PerformInsertVectorEltInMemory - Some targets cannot handle a variable
+/// insertion index for the INSERT_VECTOR_ELT instruction.  In this case, it
+/// is necessary to spill the vector being inserted into to memory, perform
+/// the insert there, and then read the result back.
+SDValue SelectionDAGLegalize::
+PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
+                               DebugLoc dl) {
+  SDValue Tmp1 = Vec;
+  SDValue Tmp2 = Val;
+  SDValue Tmp3 = Idx;
+
+  // If the target doesn't support this, we have to spill the input vector
+  // to a temporary stack slot, update the element, then reload it.  This is
+  // badness.  We could also load the value into a vector register (either
+  // with a "move to register" or "extload into register" instruction, then
+  // permute it into place, if the idx is a constant and if the idx is
+  // supported by the target.
+  EVT VT    = Tmp1.getValueType();
+  EVT EltVT = VT.getVectorElementType();
+  EVT IdxVT = Tmp3.getValueType();
+  EVT PtrVT = TLI.getPointerTy();
+  SDValue StackPtr = DAG.CreateStackTemporary(VT);
+
+  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+
+  // Store the vector.
+  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Tmp1, StackPtr,
+                            PseudoSourceValue::getFixedStack(SPFI), 0,
+                            false, false, 0);
+
+  // Truncate or zero extend offset to target pointer type.
+  unsigned CastOpc = IdxVT.bitsGT(PtrVT) ? ISD::TRUNCATE : ISD::ZERO_EXTEND;
+  Tmp3 = DAG.getNode(CastOpc, dl, PtrVT, Tmp3);
+  // Add the offset to the index.
+  unsigned EltSize = EltVT.getSizeInBits()/8;
+  Tmp3 = DAG.getNode(ISD::MUL, dl, IdxVT, Tmp3,
+                     DAG.getConstant(EltSize, IdxVT));
+  SDValue StackPtr2 = DAG.getNode(ISD::ADD, dl, IdxVT, Tmp3, StackPtr);
+  // Store the scalar value.
+  Ch = DAG.getTruncStore(Ch, dl, Tmp2, StackPtr2,
+                         PseudoSourceValue::getFixedStack(SPFI), 0, EltVT,
+                         false, false, 0);
+  // Load the updated vector.
+  return DAG.getLoad(VT, dl, Ch, StackPtr,
+                     PseudoSourceValue::getFixedStack(SPFI), 0,
+                     false, false, 0);
+}
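+
+// Illustrative sketch: inserting a scalar at a variable index of a v4f32
+// spills the vector to a fresh stack slot, scales the index by the 4-byte
+// element size, truncstores the scalar at FrameIndex + Idx*4, and reloads
+// the whole vector.  Hypothetical DAG shape:
+//   t1: ch = store t0, Vec, FrameIndex
+//   t2: ch = truncstore t1, Val, (add FrameIndex, (mul Idx, 4))
+//   t3: v4f32,ch = load t2, FrameIndex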
+
+
+SDValue SelectionDAGLegalize::
+ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx, DebugLoc dl) {
+  if (ConstantSDNode *InsertPos = dyn_cast<ConstantSDNode>(Idx)) {
+    // SCALAR_TO_VECTOR requires that the type of the value being inserted
+    // match the element type of the vector being created, except for
+    // integers, in which case the inserted value may be wider than the
+    // element type.
+    EVT EltVT = Vec.getValueType().getVectorElementType();
+    if (Val.getValueType() == EltVT ||
+        (EltVT.isInteger() && Val.getValueType().bitsGE(EltVT))) {
+      SDValue ScVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                  Vec.getValueType(), Val);
+
+      unsigned NumElts = Vec.getValueType().getVectorNumElements();
+      // We generate a shuffle of InVec and ScVec, so the shuffle mask
+      // should be 0,1,2,3,4,5... with the appropriate element replaced with
+      // elt 0 of the RHS.
+      SmallVector<int, 8> ShufOps;
+      for (unsigned i = 0; i != NumElts; ++i)
+        ShufOps.push_back(i != InsertPos->getZExtValue() ? i : NumElts);
+
+      return DAG.getVectorShuffle(Vec.getValueType(), dl, Vec, ScVec,
+                                  &ShufOps[0]);
+    }
+  }
+  return PerformInsertVectorEltInMemory(Vec, Val, Idx, dl);
+}
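+
+// Illustrative sketch: with a constant index, e.g. inserting Val at element
+// 2 of a v4i32 Vec, the shuffle path above produces
+//   ScVec = scalar_to_vector Val
+//   vector_shuffle Vec, ScVec, <0, 1, 4, 3>
+// where mask entry 4 (== NumElts) selects element 0 of ScVec; only a
+// variable index falls back to the stack-based path.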
+
+SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
+  // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
+  // FIXME: We shouldn't do this for TargetConstantFP's.
+  // FIXME: move this to the DAG Combiner!  Note that we can't regress due
+  // to phase ordering between legalized code and the dag combiner.  This
+  // probably means that we need to integrate dag combiner and legalizer
+  // together.
+  // We generally can't do this one for long doubles.
+  SDValue Tmp1 = ST->getChain();
+  SDValue Tmp2 = ST->getBasePtr();
+  SDValue Tmp3;
+  int SVOffset = ST->getSrcValueOffset();
+  unsigned Alignment = ST->getAlignment();
+  bool isVolatile = ST->isVolatile();
+  bool isNonTemporal = ST->isNonTemporal();
+  DebugLoc dl = ST->getDebugLoc();
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) {
+    if (CFP->getValueType(0) == MVT::f32 &&
+        getTypeAction(MVT::i32) == Legal) {
+      Tmp3 = DAG.getConstant(CFP->getValueAPF().
+                                      bitcastToAPInt().zextOrTrunc(32),
+                              MVT::i32);
+      return DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(),
+                          SVOffset, isVolatile, isNonTemporal, Alignment);
+    } else if (CFP->getValueType(0) == MVT::f64) {
+      // If this target supports 64-bit registers, do a single 64-bit store.
+      if (getTypeAction(MVT::i64) == Legal) {
+        Tmp3 = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
+                                  zextOrTrunc(64), MVT::i64);
+        return DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(),
+                            SVOffset, isVolatile, isNonTemporal, Alignment);
+      } else if (getTypeAction(MVT::i32) == Legal && !ST->isVolatile()) {
+        // Otherwise, if the target supports 32-bit registers, use 2 32-bit
+        // stores.  If the target supports neither 32- nor 64-bits, this
+        // xform is certainly not worth it.
+        const APInt &IntVal = CFP->getValueAPF().bitcastToAPInt();
+        SDValue Lo = DAG.getConstant(APInt(IntVal).trunc(32), MVT::i32);
+        SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), MVT::i32);
+        if (TLI.isBigEndian()) std::swap(Lo, Hi);
+
+        Lo = DAG.getStore(Tmp1, dl, Lo, Tmp2, ST->getSrcValue(),
+                          SVOffset, isVolatile, isNonTemporal, Alignment);
+        Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+                            DAG.getIntPtrConstant(4));
+        Hi = DAG.getStore(Tmp1, dl, Hi, Tmp2, ST->getSrcValue(), SVOffset+4,
+                          isVolatile, isNonTemporal, MinAlign(Alignment, 4U));
+
+        return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+      }
+    }
+  }
+  return SDValue();
+}
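+
+// Illustrative sketch: 'store float 1.0, Ptr' becomes
+// 'store i32 0x3F800000, Ptr' (the IEEE-754 bits of 1.0f); an f64 constant
+// on a target with only legal i32 becomes two i32 stores at Ptr and Ptr+4,
+// with the halves swapped on big-endian targets.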
+
+/// LegalizeOp - We know that the specified value has a legal type, and
+/// that its operands are legal.  Now ensure that the operation itself
+/// is legal, recursively ensuring that the operands' operations remain
+/// legal.
+SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
+  if (Op.getOpcode() == ISD::TargetConstant) // Allow illegal target nodes.
+    return Op;
+
+  SDNode *Node = Op.getNode();
+  DebugLoc dl = Node->getDebugLoc();
+
+  for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+    assert(getTypeAction(Node->getValueType(i)) == Legal &&
+           "Unexpected illegal type!");
+
+  for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i)
+    assert((isTypeLegal(Node->getOperand(i).getValueType()) ||
+            Node->getOperand(i).getOpcode() == ISD::TargetConstant) &&
+           "Unexpected illegal type!");
+
+  // Note that LegalizeOp may be reentered even from single-use nodes, which
+  // means that we always must cache transformed nodes.
+  DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
+  if (I != LegalizedNodes.end()) return I->second;
+
+  SDValue Tmp1, Tmp2, Tmp3, Tmp4;
+  SDValue Result = Op;
+  bool isCustom = false;
+
+  // Figure out the correct action; the way to query this varies by opcode.
+  TargetLowering::LegalizeAction Action;
+  bool SimpleFinishLegalizing = true;
+  switch (Node->getOpcode()) {
+  case ISD::INTRINSIC_W_CHAIN:
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_VOID:
+  case ISD::VAARG:
+  case ISD::STACKSAVE:
+    Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
+    break;
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+  case ISD::EXTRACT_VECTOR_ELT:
+    Action = TLI.getOperationAction(Node->getOpcode(),
+                                    Node->getOperand(0).getValueType());
+    break;
+  case ISD::FP_ROUND_INREG:
+  case ISD::SIGN_EXTEND_INREG: {
+    EVT InnerType = cast<VTSDNode>(Node->getOperand(1))->getVT();
+    Action = TLI.getOperationAction(Node->getOpcode(), InnerType);
+    break;
+  }
+  case ISD::SELECT_CC:
+  case ISD::SETCC:
+  case ISD::BR_CC: {
+    unsigned CCOperand = Node->getOpcode() == ISD::SELECT_CC ? 4 :
+                         Node->getOpcode() == ISD::SETCC ? 2 : 1;
+    unsigned CompareOperand = Node->getOpcode() == ISD::BR_CC ? 2 : 0;
+    EVT OpVT = Node->getOperand(CompareOperand).getValueType();
+    ISD::CondCode CCCode =
+        cast<CondCodeSDNode>(Node->getOperand(CCOperand))->get();
+    Action = TLI.getCondCodeAction(CCCode, OpVT);
+    if (Action == TargetLowering::Legal) {
+      if (Node->getOpcode() == ISD::SELECT_CC)
+        Action = TLI.getOperationAction(Node->getOpcode(),
+                                        Node->getValueType(0));
+      else
+        Action = TLI.getOperationAction(Node->getOpcode(), OpVT);
+    }
+    break;
+  }
+  case ISD::LOAD:
+  case ISD::STORE:
+    // FIXME: Model these properly.  LOAD and STORE are complicated, and
+    // STORE expects the unlegalized operand in some cases.
+    SimpleFinishLegalizing = false;
+    break;
+  case ISD::CALLSEQ_START:
+  case ISD::CALLSEQ_END:
+    // FIXME: This shouldn't be necessary.  These nodes have special properties
+    // dealing with the recursive nature of legalization.  Removing this
+    // special case should be done as part of making LegalizeDAG non-recursive.
+    SimpleFinishLegalizing = false;
+    break;
+  case ISD::EXTRACT_ELEMENT:
+  case ISD::FLT_ROUNDS_:
+  case ISD::SADDO:
+  case ISD::SSUBO:
+  case ISD::UADDO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:
+  case ISD::FPOWI:
+  case ISD::MERGE_VALUES:
+  case ISD::FRAME_TO_ARGS_OFFSET:
+    // These operations lie about being legal: when they claim to be legal,
+    // they should actually be expanded.
+    Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+    if (Action == TargetLowering::Legal)
+      Action = TargetLowering::Expand;
+    break;
+  case ISD::TRAMPOLINE:
+  case ISD::FRAMEADDR:
+  case ISD::RETURNADDR:
+    // These operations lie about being legal: when they claim to be legal,
+    // they should actually be custom-lowered.
+    Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+    if (Action == TargetLowering::Legal)
+      Action = TargetLowering::Custom;
+    break;
+  case ISD::BUILD_VECTOR:
+    // A weird case: legalization for BUILD_VECTOR never legalizes the
+    // operands!
+    // FIXME: This really sucks... changing it isn't semantically incorrect,
+    // but it massively pessimizes the code for floating-point BUILD_VECTORs
+    // because ConstantFP operands get legalized into constant pool loads
+    // before the BUILD_VECTOR code can see them.  It doesn't usually bite,
+    // though, because BUILD_VECTORs usually get lowered into other nodes
+    // which get legalized properly.
+    SimpleFinishLegalizing = false;
+    break;
+  default:
+    if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
+      Action = TargetLowering::Legal;
+    } else {
+      Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+    }
+    break;
+  }
+
+  if (SimpleFinishLegalizing) {
+    SmallVector<SDValue, 8> Ops, ResultVals;
+    for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i)
+      Ops.push_back(LegalizeOp(Node->getOperand(i)));
+    switch (Node->getOpcode()) {
+    default: break;
+    case ISD::BR:
+    case ISD::BRIND:
+    case ISD::BR_JT:
+    case ISD::BR_CC:
+    case ISD::BRCOND:
+      // Branches tweak the chain to include LastCALLSEQ_END
+      Ops[0] = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ops[0],
+                            LastCALLSEQ_END);
+      Ops[0] = LegalizeOp(Ops[0]);
+      LastCALLSEQ_END = DAG.getEntryNode();
+      break;
+    case ISD::SHL:
+    case ISD::SRL:
+    case ISD::SRA:
+    case ISD::ROTL:
+    case ISD::ROTR:
+      // Legalizing shifts/rotates requires adjusting the shift amount
+      // to the appropriate width.
+      if (!Ops[1].getValueType().isVector())
+        Ops[1] = LegalizeOp(DAG.getShiftAmountOperand(Ops[1]));
+      break;
+    case ISD::SRL_PARTS:
+    case ISD::SRA_PARTS:
+    case ISD::SHL_PARTS:
+      // Legalizing shifts/rotates requires adjusting the shift amount
+      // to the appropriate width.
+      if (!Ops[2].getValueType().isVector())
+        Ops[2] = LegalizeOp(DAG.getShiftAmountOperand(Ops[2]));
+      break;
+    }
+
+    Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), Ops.data(),
+                                            Ops.size()), 0);
+    switch (Action) {
+    case TargetLowering::Legal:
+      for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+        ResultVals.push_back(Result.getValue(i));
+      break;
+    case TargetLowering::Custom:
+      // FIXME: The handling for custom lowering with multiple results is
+      // a complete mess.
+      Tmp1 = TLI.LowerOperation(Result, DAG);
+      if (Tmp1.getNode()) {
+        for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) {
+          if (e == 1)
+            ResultVals.push_back(Tmp1);
+          else
+            ResultVals.push_back(Tmp1.getValue(i));
+        }
+        break;
+      }
+
+      // FALL THROUGH
+    case TargetLowering::Expand:
+      ExpandNode(Result.getNode(), ResultVals);
+      break;
+    case TargetLowering::Promote:
+      PromoteNode(Result.getNode(), ResultVals);
+      break;
+    }
+    if (!ResultVals.empty()) {
+      for (unsigned i = 0, e = ResultVals.size(); i != e; ++i) {
+        if (ResultVals[i] != SDValue(Node, i))
+          ResultVals[i] = LegalizeOp(ResultVals[i]);
+        AddLegalizedOperand(SDValue(Node, i), ResultVals[i]);
+      }
+      return ResultVals[Op.getResNo()];
+    }
+  }
+
+  switch (Node->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "NODE: ";
+    Node->dump(&DAG);
+    dbgs() << "\n";
+#endif
+    assert(0 && "Do not know how to legalize this operator!");
+
+  case ISD::BUILD_VECTOR:
+    switch (TLI.getOperationAction(ISD::BUILD_VECTOR, Node->getValueType(0))) {
+    default: assert(0 && "This action is not supported yet!");
+    case TargetLowering::Custom:
+      Tmp3 = TLI.LowerOperation(Result, DAG);
+      if (Tmp3.getNode()) {
+        Result = Tmp3;
+        break;
+      }
+      // FALLTHROUGH
+    case TargetLowering::Expand:
+      Result = ExpandBUILD_VECTOR(Result.getNode());
+      break;
+    }
+    break;
+  case ISD::CALLSEQ_START: {
+    SDNode *CallEnd = FindCallEndFromCallStart(Node);
+
+    // Recursively legalize all of the inputs of the call end that do not lead
+    // to this call start.  This ensures that any libcalls that need to be
+    // inserted are inserted *before* the CALLSEQ_START.
+    {
+      SmallPtrSet<SDNode*, 32> NodesLeadingTo;
+      for (unsigned i = 0, e = CallEnd->getNumOperands(); i != e; ++i)
+        LegalizeAllNodesNotLeadingTo(CallEnd->getOperand(i).getNode(), Node,
+                                     NodesLeadingTo);
+    }
+
+    // Now that we have legalized all of the inputs (which may have inserted
+    // libcalls), create the new CALLSEQ_START node.
+    Tmp1 = LegalizeOp(Node->getOperand(0));  // Legalize the chain.
+
+    // Merge in the last call to ensure that this call starts after the last
+    // call ended.
+    if (LastCALLSEQ_END.getOpcode() != ISD::EntryToken) {
+      Tmp1 = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                         Tmp1, LastCALLSEQ_END);
+      Tmp1 = LegalizeOp(Tmp1);
+    }
+
+    // Do not try to legalize the target-specific arguments (#1+).
+    if (Tmp1 != Node->getOperand(0)) {
+      SmallVector<SDValue, 8> Ops(Node->op_begin(), Node->op_end());
+      Ops[0] = Tmp1;
+      Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), &Ops[0],
+                                              Ops.size()), Result.getResNo());
+    }
+
+    // Remember that the CALLSEQ_START is legalized.
+    AddLegalizedOperand(Op.getValue(0), Result);
+    if (Node->getNumValues() == 2)    // If this has a flag result, remember it.
+      AddLegalizedOperand(Op.getValue(1), Result.getValue(1));
+
+    // Now that the callseq_start and all of the non-call nodes above this call
+    // sequence have been legalized, legalize the call itself.  During this
+    // process, no libcalls can/will be inserted, guaranteeing that no calls
+    // can overlap.
+    assert(!IsLegalizingCall && "Inconsistent sequentialization of calls!");
+    // Note that we are selecting this call!
+    LastCALLSEQ_END = SDValue(CallEnd, 0);
+    IsLegalizingCall = true;
+
+    // Legalize the call, starting from the CALLSEQ_END.
+    LegalizeOp(LastCALLSEQ_END);
+    assert(!IsLegalizingCall && "CALLSEQ_END should have cleared this!");
+    return Result;
+  }
+  case ISD::CALLSEQ_END:
+    // If the CALLSEQ_START node hasn't been legalized first, legalize it.  This
+    // will cause this node to be legalized as well as handling libcalls right.
+    if (LastCALLSEQ_END.getNode() != Node) {
+      LegalizeOp(SDValue(FindCallStartFromCallEnd(Node), 0));
+      DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
+      assert(I != LegalizedNodes.end() &&
+             "Legalizing the call start should have legalized this node!");
+      return I->second;
+    }
+
+    // Otherwise, the call start has been legalized and everything is going
+    // according to plan.  Just legalize ourselves normally here.
+    Tmp1 = LegalizeOp(Node->getOperand(0));  // Legalize the chain.
+    // Do not try to legalize the target-specific arguments (#1+), except for
+    // an optional flag input.
+    if (Node->getOperand(Node->getNumOperands()-1).getValueType() != MVT::Flag){
+      if (Tmp1 != Node->getOperand(0)) {
+        SmallVector<SDValue, 8> Ops(Node->op_begin(), Node->op_end());
+        Ops[0] = Tmp1;
+        Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(),
+                                                &Ops[0], Ops.size()),
+                         Result.getResNo());
+      }
+    } else {
+      Tmp2 = LegalizeOp(Node->getOperand(Node->getNumOperands()-1));
+      if (Tmp1 != Node->getOperand(0) ||
+          Tmp2 != Node->getOperand(Node->getNumOperands()-1)) {
+        SmallVector<SDValue, 8> Ops(Node->op_begin(), Node->op_end());
+        Ops[0] = Tmp1;
+        Ops.back() = Tmp2;
+        Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(),
+                                                &Ops[0], Ops.size()),
+                         Result.getResNo());
+      }
+    }
+    assert(IsLegalizingCall && "Call sequence imbalance between start/end?");
+    // This finishes up call legalization.
+    IsLegalizingCall = false;
+
+    // If the CALLSEQ_END node has a flag, remember that we legalized it.
+    AddLegalizedOperand(SDValue(Node, 0), Result.getValue(0));
+    if (Node->getNumValues() == 2)
+      AddLegalizedOperand(SDValue(Node, 1), Result.getValue(1));
+    return Result.getValue(Op.getResNo());
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(Node);
+    Tmp1 = LegalizeOp(LD->getChain());   // Legalize the chain.
+    Tmp2 = LegalizeOp(LD->getBasePtr()); // Legalize the base pointer.
+
+    ISD::LoadExtType ExtType = LD->getExtensionType();
+    if (ExtType == ISD::NON_EXTLOAD) {
+      EVT VT = Node->getValueType(0);
+      Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(),
+                                              Tmp1, Tmp2, LD->getOffset()),
+                       Result.getResNo());
+      Tmp3 = Result.getValue(0);
+      Tmp4 = Result.getValue(1);
+
+      switch (TLI.getOperationAction(Node->getOpcode(), VT)) {
+      default: assert(0 && "This action is not supported yet!");
+      case TargetLowering::Legal:
+        // If this is an unaligned load and the target doesn't support it,
+        // expand it.
+        if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
+          const Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+          unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty);
+          if (LD->getAlignment() < ABIAlignment){
+            Result = ExpandUnalignedLoad(cast<LoadSDNode>(Result.getNode()),
+                                         DAG, TLI);
+            Tmp3 = Result.getOperand(0);
+            Tmp4 = Result.getOperand(1);
+            Tmp3 = LegalizeOp(Tmp3);
+            Tmp4 = LegalizeOp(Tmp4);
+          }
+        }
+        break;
+      case TargetLowering::Custom:
+        Tmp1 = TLI.LowerOperation(Tmp3, DAG);
+        if (Tmp1.getNode()) {
+          Tmp3 = LegalizeOp(Tmp1);
+          Tmp4 = LegalizeOp(Tmp1.getValue(1));
+        }
+        break;
+      case TargetLowering::Promote: {
+        // Only promote a load of vector type to another vector type.
+        assert(VT.isVector() && "Cannot promote this load!");
+        // Change base type to a different vector type.
+        EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
+
+        Tmp1 = DAG.getLoad(NVT, dl, Tmp1, Tmp2, LD->getSrcValue(),
+                           LD->getSrcValueOffset(),
+                           LD->isVolatile(), LD->isNonTemporal(),
+                           LD->getAlignment());
+        Tmp3 = LegalizeOp(DAG.getNode(ISD::BIT_CONVERT, dl, VT, Tmp1));
+        Tmp4 = LegalizeOp(Tmp1.getValue(1));
+        break;
+      }
+      }
+      // Since loads produce two values, make sure to remember that we
+      // legalized both of them.
+      AddLegalizedOperand(SDValue(Node, 0), Tmp3);
+      AddLegalizedOperand(SDValue(Node, 1), Tmp4);
+      return Op.getResNo() ? Tmp4 : Tmp3;
+    } else {
+      EVT SrcVT = LD->getMemoryVT();
+      unsigned SrcWidth = SrcVT.getSizeInBits();
+      int SVOffset = LD->getSrcValueOffset();
+      unsigned Alignment = LD->getAlignment();
+      bool isVolatile = LD->isVolatile();
+      bool isNonTemporal = LD->isNonTemporal();
+
+      if (SrcWidth != SrcVT.getStoreSizeInBits() &&
+          // Some targets pretend to have an i1 loading operation, and actually
+          // load an i8.  This trick is correct for ZEXTLOAD because the top 7
+          // bits are guaranteed to be zero; it helps the optimizers understand
+          // that these bits are zero.  It is also useful for EXTLOAD, since it
+          // tells the optimizers that those bits are undefined.  It would be
+          // nice to have an effective generic way of getting these benefits...
+          // Until such a way is found, don't insist on promoting i1 here.
+          (SrcVT != MVT::i1 ||
+           TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) {
+        // Promote to a byte-sized load if not loading an integral number of
+        // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
+        unsigned NewWidth = SrcVT.getStoreSizeInBits();
+        EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth);
+        SDValue Ch;
+
+        // The extra bits are guaranteed to be zero, since we stored them that
+        // way.  A zext load from NVT thus automatically gives zext from SrcVT.
+
+        ISD::LoadExtType NewExtType =
+          ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD;
+
+        Result = DAG.getExtLoad(NewExtType, Node->getValueType(0), dl,
+                                Tmp1, Tmp2, LD->getSrcValue(), SVOffset,
+                                NVT, isVolatile, isNonTemporal, Alignment);
+
+        Ch = Result.getValue(1); // The chain.
+
+        if (ExtType == ISD::SEXTLOAD)
+          // Having the top bits zero doesn't help when sign extending.
+          Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
+                               Result.getValueType(),
+                               Result, DAG.getValueType(SrcVT));
+        else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType())
+          // All the top bits are guaranteed to be zero - inform the optimizers.
+          Result = DAG.getNode(ISD::AssertZext, dl,
+                               Result.getValueType(), Result,
+                               DAG.getValueType(SrcVT));
+
+        Tmp1 = LegalizeOp(Result);
+        Tmp2 = LegalizeOp(Ch);
+      } else if (SrcWidth & (SrcWidth - 1)) {
+        // If not loading a power-of-2 number of bits, expand as two loads.
+        assert(!SrcVT.isVector() && "Unsupported extload!");
+        unsigned RoundWidth = 1 << Log2_32(SrcWidth);
+        assert(RoundWidth < SrcWidth);
+        unsigned ExtraWidth = SrcWidth - RoundWidth;
+        assert(ExtraWidth < RoundWidth);
+        assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
+               "Load size not an integral number of bytes!");
+        EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
+        EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
+        SDValue Lo, Hi, Ch;
+        unsigned IncrementSize;
+
+        if (TLI.isLittleEndian()) {
+          // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)
+          // Load the bottom RoundWidth bits.
+          Lo = DAG.getExtLoad(ISD::ZEXTLOAD, Node->getValueType(0), dl,
+                              Tmp1, Tmp2,
+                              LD->getSrcValue(), SVOffset, RoundVT, isVolatile,
+                              isNonTemporal, Alignment);
+
+          // Load the remaining ExtraWidth bits.
+          IncrementSize = RoundWidth / 8;
+          Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+                             DAG.getIntPtrConstant(IncrementSize));
+          Hi = DAG.getExtLoad(ExtType, Node->getValueType(0), dl, Tmp1, Tmp2,
+                              LD->getSrcValue(), SVOffset + IncrementSize,
+                              ExtraVT, isVolatile, isNonTemporal,
+                              MinAlign(Alignment, IncrementSize));
+
+          // Build a factor node to remember that this load is independent of
+          // the other one.
+          Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                           Hi.getValue(1));
+
+          // Move the top bits to the right place.
+          Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi,
+                           DAG.getConstant(RoundWidth, TLI.getShiftAmountTy()));
+
+          // Join the hi and lo parts.
+          Result = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
+        } else {
+          // Big endian - avoid unaligned loads.
+          // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8
+          // Load the top RoundWidth bits.
+          Hi = DAG.getExtLoad(ExtType, Node->getValueType(0), dl, Tmp1, Tmp2,
+                              LD->getSrcValue(), SVOffset, RoundVT, isVolatile,
+                              isNonTemporal, Alignment);
+
+          // Load the remaining ExtraWidth bits.
+          IncrementSize = RoundWidth / 8;
+          Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+                             DAG.getIntPtrConstant(IncrementSize));
+          Lo = DAG.getExtLoad(ISD::ZEXTLOAD,
+                              Node->getValueType(0), dl, Tmp1, Tmp2,
+                              LD->getSrcValue(), SVOffset + IncrementSize,
+                              ExtraVT, isVolatile, isNonTemporal,
+                              MinAlign(Alignment, IncrementSize));
+
+          // Build a factor node to remember that this load is independent of
+          // the other one.
+          Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                           Hi.getValue(1));
+
+          // Move the top bits to the right place.
+          Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi,
+                           DAG.getConstant(ExtraWidth, TLI.getShiftAmountTy()));
+
+          // Join the hi and lo parts.
+          Result = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
+        }
+
+        Tmp1 = LegalizeOp(Result);
+        Tmp2 = LegalizeOp(Ch);
+      } else {
+        switch (TLI.getLoadExtAction(ExtType, SrcVT)) {
+        default: assert(0 && "This action is not supported yet!");
+        case TargetLowering::Custom:
+          isCustom = true;
+          // FALLTHROUGH
+        case TargetLowering::Legal:
+          Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(),
+                                                  Tmp1, Tmp2, LD->getOffset()),
+                           Result.getResNo());
+          Tmp1 = Result.getValue(0);
+          Tmp2 = Result.getValue(1);
+
+          if (isCustom) {
+            Tmp3 = TLI.LowerOperation(Result, DAG);
+            if (Tmp3.getNode()) {
+              Tmp1 = LegalizeOp(Tmp3);
+              Tmp2 = LegalizeOp(Tmp3.getValue(1));
+            }
+          } else {
+            // If this is an unaligned load and the target doesn't support it,
+            // expand it.
+            if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
+              const Type *Ty =
+                LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+              unsigned ABIAlignment =
+                TLI.getTargetData()->getABITypeAlignment(Ty);
+              if (LD->getAlignment() < ABIAlignment){
+                Result = ExpandUnalignedLoad(cast<LoadSDNode>(Result.getNode()),
+                                             DAG, TLI);
+                Tmp1 = Result.getOperand(0);
+                Tmp2 = Result.getOperand(1);
+                Tmp1 = LegalizeOp(Tmp1);
+                Tmp2 = LegalizeOp(Tmp2);
+              }
+            }
+          }
+          break;
+        case TargetLowering::Expand:
+          // f64 = EXTLOAD f32 should expand to LOAD, FP_EXTEND
+          // f128 = EXTLOAD {f32,f64} too
+          if ((SrcVT == MVT::f32 && (Node->getValueType(0) == MVT::f64 ||
+                                     Node->getValueType(0) == MVT::f128)) ||
+              (SrcVT == MVT::f64 && Node->getValueType(0) == MVT::f128)) {
+            SDValue Load = DAG.getLoad(SrcVT, dl, Tmp1, Tmp2, LD->getSrcValue(),
+                                       LD->getSrcValueOffset(),
+                                       LD->isVolatile(), LD->isNonTemporal(),
+                                       LD->getAlignment());
+            Result = DAG.getNode(ISD::FP_EXTEND, dl,
+                                 Node->getValueType(0), Load);
+            Tmp1 = LegalizeOp(Result);  // Relegalize new nodes.
+            Tmp2 = LegalizeOp(Load.getValue(1));
+            break;
+          }
+          assert(ExtType != ISD::EXTLOAD &&
+                 "EXTLOAD should always be supported!");
+          // Turn the unsupported load into an EXTLOAD followed by an explicit
+          // zero/sign extend inreg.
+          Result = DAG.getExtLoad(ISD::EXTLOAD, Node->getValueType(0), dl,
+                                  Tmp1, Tmp2, LD->getSrcValue(),
+                                  LD->getSrcValueOffset(), SrcVT,
+                                  LD->isVolatile(), LD->isNonTemporal(),
+                                  LD->getAlignment());
+          SDValue ValRes;
+          if (ExtType == ISD::SEXTLOAD)
+            ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
+                                 Result.getValueType(),
+                                 Result, DAG.getValueType(SrcVT));
+          else
+            ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT);
+          Tmp1 = LegalizeOp(ValRes);  // Relegalize new nodes.
+          Tmp2 = LegalizeOp(Result.getValue(1));  // Relegalize new nodes.
+          break;
+        }
+      }
+
+      // Since loads produce two values, make sure to remember that we legalized
+      // both of them.
+      AddLegalizedOperand(SDValue(Node, 0), Tmp1);
+      AddLegalizedOperand(SDValue(Node, 1), Tmp2);
+      return Op.getResNo() ? Tmp2 : Tmp1;
+    }
+  }
+  case ISD::STORE: {
+    StoreSDNode *ST = cast<StoreSDNode>(Node);
+    Tmp1 = LegalizeOp(ST->getChain());    // Legalize the chain.
+    Tmp2 = LegalizeOp(ST->getBasePtr());  // Legalize the pointer.
+    int SVOffset = ST->getSrcValueOffset();
+    unsigned Alignment = ST->getAlignment();
+    bool isVolatile = ST->isVolatile();
+    bool isNonTemporal = ST->isNonTemporal();
+
+    if (!ST->isTruncatingStore()) {
+      if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) {
+        Result = SDValue(OptStore, 0);
+        break;
+      }
+
+      {
+        Tmp3 = LegalizeOp(ST->getValue());
+        Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(),
+                                                Tmp1, Tmp3, Tmp2,
+                                                ST->getOffset()),
+                         Result.getResNo());
+
+        EVT VT = Tmp3.getValueType();
+        switch (TLI.getOperationAction(ISD::STORE, VT)) {
+        default: assert(0 && "This action is not supported yet!");
+        case TargetLowering::Legal:
+          // If this is an unaligned store and the target doesn't support it,
+          // expand it.
+          if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
+            const Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
+            unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty);
+            if (ST->getAlignment() < ABIAlignment)
+              Result = ExpandUnalignedStore(cast<StoreSDNode>(Result.getNode()),
+                                            DAG, TLI);
+          }
+          break;
+        case TargetLowering::Custom:
+          Tmp1 = TLI.LowerOperation(Result, DAG);
+          if (Tmp1.getNode()) Result = Tmp1;
+          break;
+        case TargetLowering::Promote:
+          assert(VT.isVector() && "Unknown legal promote case!");
+          Tmp3 = DAG.getNode(ISD::BIT_CONVERT, dl,
+                             TLI.getTypeToPromoteTo(ISD::STORE, VT), Tmp3);
+          Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2,
+                                ST->getSrcValue(), SVOffset, isVolatile,
+                                isNonTemporal, Alignment);
+          break;
+        }
+        break;
+      }
+    } else {
+      Tmp3 = LegalizeOp(ST->getValue());
+
+      EVT StVT = ST->getMemoryVT();
+      unsigned StWidth = StVT.getSizeInBits();
+
+      if (StWidth != StVT.getStoreSizeInBits()) {
+        // Promote to a byte-sized store with upper bits zero if not
+        // storing an integral number of bytes.  For example, promote
+        // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
+        EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
+                                    StVT.getStoreSizeInBits());
+        Tmp3 = DAG.getZeroExtendInReg(Tmp3, dl, StVT);
+        Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(),
+                                   SVOffset, NVT, isVolatile, isNonTemporal,
+                                   Alignment);
+      } else if (StWidth & (StWidth - 1)) {
+        // If not storing a power-of-2 number of bits, expand as two stores.
+        assert(!StVT.isVector() && "Unsupported truncstore!");
+        unsigned RoundWidth = 1 << Log2_32(StWidth);
+        assert(RoundWidth < StWidth);
+        unsigned ExtraWidth = StWidth - RoundWidth;
+        assert(ExtraWidth < RoundWidth);
+        assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
+               "Store size not an integral number of bytes!");
+        EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
+        EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
+        SDValue Lo, Hi;
+        unsigned IncrementSize;
+
+        if (TLI.isLittleEndian()) {
+          // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16)
+          // Store the bottom RoundWidth bits.
+          Lo = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(),
+                                 SVOffset, RoundVT,
+                                 isVolatile, isNonTemporal, Alignment);
+
+          // Store the remaining ExtraWidth bits.
+          IncrementSize = RoundWidth / 8;
+          Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+                             DAG.getIntPtrConstant(IncrementSize));
+          Hi = DAG.getNode(ISD::SRL, dl, Tmp3.getValueType(), Tmp3,
+                           DAG.getConstant(RoundWidth, TLI.getShiftAmountTy()));
+          Hi = DAG.getTruncStore(Tmp1, dl, Hi, Tmp2, ST->getSrcValue(),
+                                 SVOffset + IncrementSize, ExtraVT, isVolatile,
+                                 isNonTemporal,
+                                 MinAlign(Alignment, IncrementSize));
+        } else {
+          // Big endian - avoid unaligned stores.
+          // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
+          // Store the top RoundWidth bits.
+          Hi = DAG.getNode(ISD::SRL, dl, Tmp3.getValueType(), Tmp3,
+                           DAG.getConstant(ExtraWidth, TLI.getShiftAmountTy()));
+          Hi = DAG.getTruncStore(Tmp1, dl, Hi, Tmp2, ST->getSrcValue(),
+                                 SVOffset, RoundVT, isVolatile, isNonTemporal,
+                                 Alignment);
+
+          // Store the remaining ExtraWidth bits.
+          IncrementSize = RoundWidth / 8;
+          Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+                             DAG.getIntPtrConstant(IncrementSize));
+          Lo = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(),
+                                 SVOffset + IncrementSize, ExtraVT, isVolatile,
+                                 isNonTemporal,
+                                 MinAlign(Alignment, IncrementSize));
+        }
+
+        // The order of the stores doesn't matter.
+        Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+      } else {
+        if (Tmp1 != ST->getChain() || Tmp3 != ST->getValue() ||
+            Tmp2 != ST->getBasePtr())
+          Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(),
+                                                  Tmp1, Tmp3, Tmp2,
+                                                  ST->getOffset()),
+                           Result.getResNo());
+
+        switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) {
+        default: assert(0 && "This action is not supported yet!");
+        case TargetLowering::Legal:
+          // If this is an unaligned store and the target doesn't support it,
+          // expand it.
+          if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
+            const Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
+            unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty);
+            if (ST->getAlignment() < ABIAlignment)
+              Result = ExpandUnalignedStore(cast<StoreSDNode>(Result.getNode()),
+                                            DAG, TLI);
+          }
+          break;
+        case TargetLowering::Custom:
+          Result = TLI.LowerOperation(Result, DAG);
+          break;
+        case TargetLowering::Expand:
+          // TRUNCSTORE:i16 i32 -> STORE i16
+          assert(isTypeLegal(StVT) && "Do not know how to expand this store!");
+          Tmp3 = DAG.getNode(ISD::TRUNCATE, dl, StVT, Tmp3);
+          Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(),
+                                SVOffset, isVolatile, isNonTemporal,
+                                Alignment);
+          break;
+        }
+      }
+    }
+    break;
+  }
+  }
+  assert(Result.getValueType() == Op.getValueType() &&
+         "Bad legalization!");
+
+  // Make sure that the generated code is itself legal.
+  if (Result != Op)
+    Result = LegalizeOp(Result);
+
+  // Note that LegalizeOp may be reentered even from single-use nodes, which
+  // means that we always must cache transformed nodes.
+  AddLegalizedOperand(Op, Result);
+  return Result;
+}
+
+SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
+  SDValue Vec = Op.getOperand(0);
+  SDValue Idx = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+  // Store the value to a temporary stack slot, then LOAD the returned part.
+  SDValue StackPtr = DAG.CreateStackTemporary(Vec.getValueType());
+  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, NULL, 0,
+                            false, false, 0);
+
+  // Add the offset to the index.
+  unsigned EltSize =
+      Vec.getValueType().getVectorElementType().getSizeInBits()/8;
+  Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
+                    DAG.getConstant(EltSize, Idx.getValueType()));
+
+  if (Idx.getValueType().bitsGT(TLI.getPointerTy()))
+    Idx = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Idx);
+  else
+    Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
+
+  StackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, StackPtr);
+
+  if (Op.getValueType().isVector())
+    return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, NULL, 0,
+                       false, false, 0);
+  else
+    return DAG.getExtLoad(ISD::EXTLOAD, Op.getValueType(), dl, Ch, StackPtr,
+                          NULL, 0, Vec.getValueType().getVectorElementType(),
+                          false, false, 0);
+}
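+
+// Illustrative sketch: extracting element Idx of a v8i16 stores the vector
+// to a stack temporary and reads one i16 back via an EXTLOAD from
+// FrameIndex + Idx*2; the scaled index is truncated or zero-extended to the
+// pointer type before being added to the slot address.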
+
+SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
+  // We can't handle this case efficiently.  Allocate a sufficiently
+  // aligned object on the stack, store each element into it, then load
+  // the result as a vector.
+  // Create the stack frame object.
+  EVT VT = Node->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+  DebugLoc dl = Node->getDebugLoc();
+  SDValue FIPtr = DAG.CreateStackTemporary(VT);
+  int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
+  const Value *SV = PseudoSourceValue::getFixedStack(FI);
+
+  // Emit a store of each element to the stack slot.
+  SmallVector<SDValue, 8> Stores;
+  unsigned TypeByteSize = EltVT.getSizeInBits() / 8;
+  // Store (in the right endianness) the elements to memory.
+  for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
+    // Ignore undef elements.
+    if (Node->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+
+    unsigned Offset = TypeByteSize*i;
+
+    SDValue Idx = DAG.getConstant(Offset, FIPtr.getValueType());
+    Idx = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, Idx);
+
+    // If the destination vector element type is narrower than the source
+    // element type, only store the bits necessary.
+    if (EltVT.bitsLT(Node->getOperand(i).getValueType().getScalarType())) {
+      Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl,
+                                         Node->getOperand(i), Idx, SV, Offset,
+                                         EltVT, false, false, 0));
+    } else
+      Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl,
+                                    Node->getOperand(i), Idx, SV, Offset,
+                                    false, false, 0));
+  }
+
+  SDValue StoreChain;
+  if (!Stores.empty())    // Not all undef elements?
+    StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                             &Stores[0], Stores.size());
+  else
+    StoreChain = DAG.getEntryNode();
+
+  // Result is a load from the stack slot.
+  return DAG.getLoad(VT, dl, StoreChain, FIPtr, SV, 0, false, false, 0);
+}
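+
+// Illustrative sketch: for a v4i32 BUILD_VECTOR of {a, undef, c, d}, the
+// loop above stores a, c and d at byte offsets 0, 8 and 12 of the slot
+// (undef elements are skipped), token-factors the three store chains, and
+// reloads the slot as one v4i32.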
+
+SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode* Node) {
+  DebugLoc dl = Node->getDebugLoc();
+  SDValue Tmp1 = Node->getOperand(0);
+  SDValue Tmp2 = Node->getOperand(1);
+
+  // Get the sign bit of the RHS.  First obtain a value that has the same
+  // sign as the sign bit, i.e. negative if and only if the sign bit is 1.
+  SDValue SignBit;
+  EVT FloatVT = Tmp2.getValueType();
+  EVT IVT = EVT::getIntegerVT(*DAG.getContext(), FloatVT.getSizeInBits());
+  if (isTypeLegal(IVT)) {
+    // Convert to an integer with the same sign bit.
+    SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, IVT, Tmp2);
+  } else {
+    // Store the float to memory, then load the sign part out as an integer.
+    MVT LoadTy = TLI.getPointerTy();
+    // First create a temporary that is aligned for both the load and store.
+    SDValue StackPtr = DAG.CreateStackTemporary(FloatVT, LoadTy);
+    // Then store the float to it.
+    SDValue Ch =
+      DAG.getStore(DAG.getEntryNode(), dl, Tmp2, StackPtr, NULL, 0,
+                   false, false, 0);
+    if (TLI.isBigEndian()) {
+      assert(FloatVT.isByteSized() && "Unsupported floating point type!");
+      // Load out a legal integer with the same sign bit as the float.
+      SignBit = DAG.getLoad(LoadTy, dl, Ch, StackPtr, NULL, 0, false, false, 0);
+    } else { // Little endian
+      SDValue LoadPtr = StackPtr;
+      // The float may be wider than the integer we are going to load.  Advance
+      // the pointer so that the loaded integer will contain the sign bit.
+      unsigned Strides = (FloatVT.getSizeInBits()-1)/LoadTy.getSizeInBits();
+      unsigned ByteOffset = (Strides * LoadTy.getSizeInBits()) / 8;
+      LoadPtr = DAG.getNode(ISD::ADD, dl, LoadPtr.getValueType(),
+                            LoadPtr, DAG.getIntPtrConstant(ByteOffset));
+      // Load a legal integer containing the sign bit.
+      SignBit = DAG.getLoad(LoadTy, dl, Ch, LoadPtr, NULL, 0, false, false, 0);
+      // Move the sign bit to the top bit of the loaded integer.
+      unsigned BitShift = LoadTy.getSizeInBits() -
+        (FloatVT.getSizeInBits() - 8 * ByteOffset);
+      assert(BitShift < LoadTy.getSizeInBits() && "Pointer advanced wrong?");
+      if (BitShift)
+        SignBit = DAG.getNode(ISD::SHL, dl, LoadTy, SignBit,
+                              DAG.getConstant(BitShift,TLI.getShiftAmountTy()));
+    }
+  }
+  // Now get the sign bit proper, by seeing whether the value is negative.
+  SignBit = DAG.getSetCC(dl, TLI.getSetCCResultType(SignBit.getValueType()),
+                         SignBit, DAG.getConstant(0, SignBit.getValueType()),
+                         ISD::SETLT);
+  // Get the absolute value of the result.
+  SDValue AbsVal = DAG.getNode(ISD::FABS, dl, Tmp1.getValueType(), Tmp1);
+  // Select between the nabs and abs value based on the sign bit of
+  // the input.
+  return DAG.getNode(ISD::SELECT, dl, AbsVal.getValueType(), SignBit,
+                     DAG.getNode(ISD::FNEG, dl, AbsVal.getValueType(), AbsVal),
+                     AbsVal);
+}
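+
+// Illustrative sketch: the expansion above computes
+//   copysign(x, y) = (y < 0) ? -fabs(x) : fabs(x)
+// where the 'y < 0' test is performed on an integer carrying y's sign bit,
+// obtained either by a direct bitcast or by storing y and loading back the
+// word that contains the sign bit.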
+
+void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
+                                           SmallVectorImpl<SDValue> &Results) {
+  unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+  assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+          " not tell us which reg is the stack pointer!");
+  DebugLoc dl = Node->getDebugLoc();
+  EVT VT = Node->getValueType(0);
+  SDValue Tmp1 = SDValue(Node, 0);
+  SDValue Tmp2 = SDValue(Node, 1);
+  SDValue Tmp3 = Node->getOperand(2);
+  SDValue Chain = Tmp1.getOperand(0);
+
+  // Chain the dynamic stack allocation so that it doesn't modify the stack
+  // pointer when other instructions are using the stack.
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
+
+  SDValue Size  = Tmp2.getOperand(1);
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+  Chain = SP.getValue(1);
+  unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
+  unsigned StackAlign = TM.getFrameInfo()->getStackAlignment();
+  if (Align > StackAlign)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP,
+                      DAG.getConstant(-(uint64_t)Align, VT));
+  Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size);       // Value
+  Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1);     // Output chain
+
+  Tmp2 = DAG.getCALLSEQ_END(Chain,  DAG.getIntPtrConstant(0, true),
+                            DAG.getIntPtrConstant(0, true), SDValue());
+
+  Results.push_back(Tmp1);
+  Results.push_back(Tmp2);
+}
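+
+// Illustrative sketch: a dynamic alloca of Size bytes with 32-byte alignment
+// on a target whose stack is only 16-byte aligned reads SP, rounds it down
+// with (and SP, -32), subtracts Size, and copies the result back to SP, all
+// inside a CALLSEQ_START/CALLSEQ_END pair so no call can move the stack
+// pointer mid-sequence.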
+
+/// LegalizeSetCCCondCode - Legalize a SETCC with given LHS and RHS and
+/// condition code CC on the current target. This routine expands SETCC with
+/// illegal condition code into AND / OR of multiple SETCC values.
+void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
+                                                 SDValue &LHS, SDValue &RHS,
+                                                 SDValue &CC,
+                                                 DebugLoc dl) {
+  EVT OpVT = LHS.getValueType();
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
+  switch (TLI.getCondCodeAction(CCCode, OpVT)) {
+  default: assert(0 && "Unknown condition code action!");
+  case TargetLowering::Legal:
+    // Nothing to do.
+    break;
+  case TargetLowering::Expand: {
+    ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID;
+    unsigned Opc = 0;
+    switch (CCCode) {
+    default: assert(0 && "Don't know how to expand this condition!");
+    case ISD::SETOEQ: CC1 = ISD::SETEQ; CC2 = ISD::SETO;  Opc = ISD::AND; break;
+    case ISD::SETOGT: CC1 = ISD::SETGT; CC2 = ISD::SETO;  Opc = ISD::AND; break;
+    case ISD::SETOGE: CC1 = ISD::SETGE; CC2 = ISD::SETO;  Opc = ISD::AND; break;
+    case ISD::SETOLT: CC1 = ISD::SETLT; CC2 = ISD::SETO;  Opc = ISD::AND; break;
+    case ISD::SETOLE: CC1 = ISD::SETLE; CC2 = ISD::SETO;  Opc = ISD::AND; break;
+    case ISD::SETONE: CC1 = ISD::SETNE; CC2 = ISD::SETO;  Opc = ISD::AND; break;
+    case ISD::SETUEQ: CC1 = ISD::SETEQ; CC2 = ISD::SETUO; Opc = ISD::OR;  break;
+    case ISD::SETUGT: CC1 = ISD::SETGT; CC2 = ISD::SETUO; Opc = ISD::OR;  break;
+    case ISD::SETUGE: CC1 = ISD::SETGE; CC2 = ISD::SETUO; Opc = ISD::OR;  break;
+    case ISD::SETULT: CC1 = ISD::SETLT; CC2 = ISD::SETUO; Opc = ISD::OR;  break;
+    case ISD::SETULE: CC1 = ISD::SETLE; CC2 = ISD::SETUO; Opc = ISD::OR;  break;
+    case ISD::SETUNE: CC1 = ISD::SETNE; CC2 = ISD::SETUO; Opc = ISD::OR;  break;
+    // FIXME: Implement more expansions.
+    }
+
+    SDValue SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1);
+    SDValue SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2);
+    LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2);
+    RHS = SDValue();
+    CC  = SDValue();
+    break;
+  }
+  }
+}
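+
+// Illustrative sketch: on a target where SETONE is illegal for the operand
+// type, the table above rewrites
+//   setcc x, y, setone  ->  and (setcc x, y, setne), (setcc x, y, seto)
+// while an unordered predicate such as SETUEQ becomes an OR with SETUO.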
+
+/// EmitStackConvert - Emit a store/load combination to the stack.  This stores
+/// SrcOp to a stack slot of type SlotVT, truncating it if needed.  It then does
+/// a load from the stack slot to DestVT, extending it if needed.
+/// The resultant code need not be legal.
+SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp,
+                                               EVT SlotVT,
+                                               EVT DestVT,
+                                               DebugLoc dl) {
+  // Create the stack frame object.
+  unsigned SrcAlign =
+    TLI.getTargetData()->getPrefTypeAlignment(SrcOp.getValueType().
+                                              getTypeForEVT(*DAG.getContext()));
+  SDValue FIPtr = DAG.CreateStackTemporary(SlotVT, SrcAlign);
+
+  FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(FIPtr);
+  int SPFI = StackPtrFI->getIndex();
+  const Value *SV = PseudoSourceValue::getFixedStack(SPFI);
+
+  unsigned SrcSize = SrcOp.getValueType().getSizeInBits();
+  unsigned SlotSize = SlotVT.getSizeInBits();
+  unsigned DestSize = DestVT.getSizeInBits();
+  const Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
+  unsigned DestAlign = TLI.getTargetData()->getPrefTypeAlignment(DestType);
+
+  // Emit a store to the stack slot.  Use a truncstore if the input value is
+  // wider than SlotVT.
+  SDValue Store;
+
+  if (SrcSize > SlotSize)
+    Store = DAG.getTruncStore(DAG.getEntryNode(), dl, SrcOp, FIPtr,
+                              SV, 0, SlotVT, false, false, SrcAlign);
+  else {
+    assert(SrcSize == SlotSize && "Invalid store");
+    Store = DAG.getStore(DAG.getEntryNode(), dl, SrcOp, FIPtr,
+                         SV, 0, false, false, SrcAlign);
+  }
+
+  // Result is a load from the stack slot.
+  if (SlotSize == DestSize)
+    return DAG.getLoad(DestVT, dl, Store, FIPtr, SV, 0, false, false,
+                       DestAlign);
+
+  assert(SlotSize < DestSize && "Unknown extension!");
+  return DAG.getExtLoad(ISD::EXTLOAD, DestVT, dl, Store, FIPtr, SV, 0, SlotVT,
+                        false, false, DestAlign);
+}
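+
+// Illustrative sketch: called with SrcOp : f64 and SlotVT = DestVT = i64,
+// this emits a plain store of the f64 to an 8-byte slot and a plain i64
+// load from it -- one way to move bits between register classes when a
+// direct BIT_CONVERT is not legal.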
+
+SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
+  DebugLoc dl = Node->getDebugLoc();
+  // Create a vector sized/aligned stack slot, store the value to element #0,
+  // then load the whole vector back out.
+  SDValue StackPtr = DAG.CreateStackTemporary(Node->getValueType(0));
+
+  FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(StackPtr);
+  int SPFI = StackPtrFI->getIndex();
+
+  SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), dl, Node->getOperand(0),
+                                 StackPtr,
+                                 PseudoSourceValue::getFixedStack(SPFI), 0,
+                                 Node->getValueType(0).getVectorElementType(),
+                                 false, false, 0);
+  return DAG.getLoad(Node->getValueType(0), dl, Ch, StackPtr,
+                     PseudoSourceValue::getFixedStack(SPFI), 0,
+                     false, false, 0);
+}
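+
+// Illustrative sketch: a SCALAR_TO_VECTOR of an i32 into v4i32 truncstores
+// the scalar into element #0 of a 16-byte slot and reloads the full vector;
+// the other three lanes hold whatever was in the slot, which is acceptable
+// because they are undefined in the result.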
+
+
+/// ExpandBUILD_VECTOR - Expand a BUILD_VECTOR node on targets that don't
+/// support the operation, but do support the resultant vector type.
+SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
+  unsigned NumElems = Node->getNumOperands();
+  SDValue Value1, Value2;
+  DebugLoc dl = Node->getDebugLoc();
+  EVT VT = Node->getValueType(0);
+  EVT OpVT = Node->getOperand(0).getValueType();
+  EVT EltVT = VT.getVectorElementType();
+
+  // If the only non-undef value is the low element, turn this into a
+  // SCALAR_TO_VECTOR node.  If this is { X, X, X, X }, determine X.
+  bool isOnlyLowElement = true;
+  bool MoreThanTwoValues = false;
+  bool isConstant = true;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue V = Node->getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    if (!Value1.getNode()) {
+      Value1 = V;
+    } else if (!Value2.getNode()) {
+      if (V != Value1)
+        Value2 = V;
+    } else if (V != Value1 && V != Value2) {
+      MoreThanTwoValues = true;
+    }
+  }
+
+  if (!Value1.getNode())
+    return DAG.getUNDEF(VT);
+
+  if (isOnlyLowElement)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Node->getOperand(0));
+
+  // If all elements are constants, create a load from the constant pool.
+  if (isConstant) {
+    std::vector<Constant*> CV;
+    for (unsigned i = 0, e = NumElems; i != e; ++i) {
+      if (ConstantFPSDNode *V =
+          dyn_cast<ConstantFPSDNode>(Node->getOperand(i))) {
+        CV.push_back(const_cast<ConstantFP *>(V->getConstantFPValue()));
+      } else if (ConstantSDNode *V =
+                 dyn_cast<ConstantSDNode>(Node->getOperand(i))) {
+        if (OpVT == EltVT)
+          CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue()));
+        else {
+          // If OpVT and EltVT don't match, EltVT is not legal and the
+          // element values have been promoted/truncated earlier.  Undo this;
+          // we don't want a v16i8 to become a v16i32 for example.
+          const ConstantInt *CI = V->getConstantIntValue();
+          CV.push_back(ConstantInt::get(EltVT.getTypeForEVT(*DAG.getContext()),
+                                        CI->getZExtValue()));
+        }
+      } else {
+        assert(Node->getOperand(i).getOpcode() == ISD::UNDEF);
+        const Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext());
+        CV.push_back(UndefValue::get(OpNTy));
+      }
+    }
+    Constant *CP = ConstantVector::get(CV);
+    SDValue CPIdx = DAG.getConstantPool(CP, TLI.getPointerTy());
+    unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+    return DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                       PseudoSourceValue::getConstantPool(), 0,
+                       false, false, Alignment);
+  }
+
+  if (!MoreThanTwoValues) {
+    SmallVector<int, 8> ShuffleVec(NumElems, -1);
+    for (unsigned i = 0; i < NumElems; ++i) {
+      SDValue V = Node->getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      ShuffleVec[i] = V == Value1 ? 0 : NumElems;
+    }
+    if (TLI.isShuffleMaskLegal(ShuffleVec, Node->getValueType(0))) {
+      // Get the splatted value into the low element of a vector register.
+      SDValue Vec1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value1);
+      SDValue Vec2;
+      if (Value2.getNode())
+        Vec2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value2);
+      else
+        Vec2 = DAG.getUNDEF(VT);
+
+      // Return shuffle(LowValVec, undef, <0,0,0,0>)
+      return DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec.data());
+    }
+  }
+
+  // Otherwise, we can't handle this case efficiently.
+  return ExpandVectorBuildThroughStack(Node);
+}
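+
+// Illustrative sketch: build_vector <1.0, 2.0, 3.0, 4.0> becomes a
+// constant-pool load; a splat build_vector <x, x, x, x> becomes
+// scalar_to_vector plus a <0,0,0,0> shuffle when that mask is legal; three
+// or more distinct non-constant values fall through to the stack path.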
+
+// ExpandLibCall - Expand a node into a call to a libcall.  The node's
+// operands are passed to the libcall in order, and the libcall's return
+// value replaces the node's result.
+SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
+                                            bool isSigned) {
+  assert(!IsLegalizingCall && "Cannot overlap legalization of calls!");
+  // The input chain to this libcall is the entry node of the function.
+  // Legalizing the call will automatically add the previous call to the
+  // dependence.
+  SDValue InChain = DAG.getEntryNode();
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Node->getOperand(i).getValueType();
+    const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Node = Node->getOperand(i); Entry.Ty = ArgTy;
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                         TLI.getPointerTy());
+
+  // Splice the libcall in wherever FindInputOutputChains tells us to.
+  const Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
+  std::pair<SDValue, SDValue> CallInfo =
+    TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
+                    0, TLI.getLibcallCallingConv(LC), false,
+                    /*isReturnValueUsed=*/true,
+                    Callee, Args, DAG, Node->getDebugLoc());
+
+  // Legalize the call sequence, starting with the chain.  This will advance
+  // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that
+  // was added by LowerCallTo (guaranteeing proper serialization of calls).
+  LegalizeOp(CallInfo.second);
+  return CallInfo.first;
+}
+
+// ExpandChainLibCall - Expand a node into a call to a libcall. Similar to
+// ExpandLibCall except that the first operand is the in-chain.
+std::pair<SDValue, SDValue>
+SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
+                                         SDNode *Node,
+                                         bool isSigned) {
+  assert(!IsLegalizingCall && "Cannot overlap legalization of calls!");
+  SDValue InChain = Node->getOperand(0);
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Node->getOperand(i).getValueType();
+    const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Node = Node->getOperand(i);
+    Entry.Ty = ArgTy;
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                         TLI.getPointerTy());
+
+  // Splice the libcall in wherever FindInputOutputChains tells us to.
+  const Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
+  std::pair<SDValue, SDValue> CallInfo =
+    TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
+                    0, TLI.getLibcallCallingConv(LC), false,
+                    /*isReturnValueUsed=*/true,
+                    Callee, Args, DAG, Node->getDebugLoc());
+
+  // Legalize the call sequence, starting with the chain.  This will advance
+  // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that
+  // was added by LowerCallTo (guaranteeing proper serialization of calls).
+  LegalizeOp(CallInfo.second);
+  return CallInfo;
+}
+
+SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
+                                              RTLIB::Libcall Call_F32,
+                                              RTLIB::Libcall Call_F64,
+                                              RTLIB::Libcall Call_F80,
+                                              RTLIB::Libcall Call_PPCF128) {
+  RTLIB::Libcall LC;
+  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  default: assert(0 && "Unexpected request for libcall!");
+  case MVT::f32: LC = Call_F32; break;
+  case MVT::f64: LC = Call_F64; break;
+  case MVT::f80: LC = Call_F80; break;
+  case MVT::ppcf128: LC = Call_PPCF128; break;
+  }
+  return ExpandLibCall(LC, Node, false);
+}
+
+SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
+                                               RTLIB::Libcall Call_I8,
+                                               RTLIB::Libcall Call_I16,
+                                               RTLIB::Libcall Call_I32,
+                                               RTLIB::Libcall Call_I64,
+                                               RTLIB::Libcall Call_I128) {
+  RTLIB::Libcall LC;
+  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  default: assert(0 && "Unexpected request for libcall!");
+  case MVT::i8:   LC = Call_I8; break;
+  case MVT::i16:  LC = Call_I16; break;
+  case MVT::i32:  LC = Call_I32; break;
+  case MVT::i64:  LC = Call_I64; break;
+  case MVT::i128: LC = Call_I128; break;
+  }
+  return ExpandLibCall(LC, Node, isSigned);
+}
+
+/// ExpandLegalINT_TO_FP - This function is responsible for legalizing a
+/// INT_TO_FP operation of the specified operand when the target requests that
+/// we expand it.  At this point, we know that the result and operand types are
+/// legal for the target.
+SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
+                                                   SDValue Op0,
+                                                   EVT DestVT,
+                                                   DebugLoc dl) {
+  if (Op0.getValueType() == MVT::i32) {
+    // simple 32-bit [signed|unsigned] integer to float/double expansion
+
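+    // Sketch of the trick: the two stores below assemble the raw bits of the
+    // double 2^52 + (unsigned)x, since 0x43300000 is the high word of 2^52.
+    // Subtracting the bias then recovers x exactly.  For the signed case the
+    // input is first XOR'ed with 0x80000000, and the bias 2^52 + 2^31 undoes
+    // both the exponent term and the sign-bit flip in one FSUB.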
+    // Get the stack frame index of an 8-byte buffer.
+    SDValue StackSlot = DAG.CreateStackTemporary(MVT::f64);
+
+    // word offset constant for Hi/Lo address computation
+    SDValue WordOff = DAG.getConstant(sizeof(int), TLI.getPointerTy());
+    // set up Hi and Lo (into buffer) address based on endian
+    SDValue Hi = StackSlot;
+    SDValue Lo = DAG.getNode(ISD::ADD, dl,
+                             TLI.getPointerTy(), StackSlot, WordOff);
+    if (TLI.isLittleEndian())
+      std::swap(Hi, Lo);
+
+    // if signed map to unsigned space
+    SDValue Op0Mapped;
+    if (isSigned) {
+      // constant used to invert sign bit (signed to unsigned mapping)
+      SDValue SignBit = DAG.getConstant(0x80000000u, MVT::i32);
+      Op0Mapped = DAG.getNode(ISD::XOR, dl, MVT::i32, Op0, SignBit);
+    } else {
+      Op0Mapped = Op0;
+    }
+    // store the lo of the constructed double - based on integer input
+    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl,
+                                  Op0Mapped, Lo, NULL, 0,
+                                  false, false, 0);
+    // initial hi portion of constructed double
+    SDValue InitialHi = DAG.getConstant(0x43300000u, MVT::i32);
+    // store the hi of the constructed double - biased exponent
+    SDValue Store2 = DAG.getStore(Store1, dl, InitialHi, Hi, NULL, 0,
+                                  false, false, 0);
+    // load the constructed double
+    SDValue Load = DAG.getLoad(MVT::f64, dl, Store2, StackSlot, NULL, 0,
+                               false, false, 0);
+    // FP constant to bias correct the final result
+    SDValue Bias = DAG.getConstantFP(isSigned ?
+                                     BitsToDouble(0x4330000080000000ULL) :
+                                     BitsToDouble(0x4330000000000000ULL),
+                                     MVT::f64);
+    // subtract the bias
+    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias);
+    // final result
+    SDValue Result;
+    // handle final rounding
+    if (DestVT == MVT::f64) {
+      // do nothing
+      Result = Sub;
+    } else if (DestVT.bitsLT(MVT::f64)) {
+      Result = DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+                           DAG.getIntPtrConstant(0));
+    } else if (DestVT.bitsGT(MVT::f64)) {
+      Result = DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+    }
+    return Result;
+  }
+  assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
+  // Code below here assumes !isSigned without checking again.
+
+  // Implementation of unsigned i64 to f64 following the algorithm in
+  // __floatundidf in compiler_rt. This implementation has the advantage
+  // of performing rounding correctly, both in the default rounding mode
+  // and in all alternate rounding modes.
+  // TODO: Generalize this for use with other types.
+  if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f64) {
+    SDValue TwoP52 =
+      DAG.getConstant(UINT64_C(0x4330000000000000), MVT::i64);
+    SDValue TwoP84PlusTwoP52 =
+      DAG.getConstantFP(BitsToDouble(UINT64_C(0x4530000000100000)), MVT::f64);
+    SDValue TwoP84 =
+      DAG.getConstant(UINT64_C(0x4530000000000000), MVT::i64);
+
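+    // TwoP52 and TwoP84 hold the bit patterns of the doubles 2^52 and 2^84;
+    // TwoP84PlusTwoP52 is the double 2^84 + 2^52.  OR-ing the low 32 bits
+    // into 2^52 and the high 32 bits into 2^84 forms the exact doubles
+    // 2^52 + lo and 2^84 + hi*2^32; subtracting 2^84 + 2^52 cancels the
+    // exponent terms, and the final FADD performs the single rounding step.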
+    SDValue Lo = DAG.getZeroExtendInReg(Op0, dl, MVT::i32);
+    SDValue Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0,
+                             DAG.getConstant(32, MVT::i64));
+    SDValue LoOr = DAG.getNode(ISD::OR, dl, MVT::i64, Lo, TwoP52);
+    SDValue HiOr = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, TwoP84);
+    SDValue LoFlt = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, LoOr);
+    SDValue HiFlt = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, HiOr);
+    SDValue HiSub = DAG.getNode(ISD::FSUB, dl, MVT::f64, HiFlt,
+                                TwoP84PlusTwoP52);
+    return DAG.getNode(ISD::FADD, dl, MVT::f64, LoFlt, HiSub);
+  }
+
+  // Implementation of unsigned i64 to f32.  This implementation has the
+  // advantage of performing rounding correctly.
+  // TODO: Generalize this for use with other types.
+  if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f32) {
+    EVT SHVT = TLI.getShiftAmountTy();
+
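+    // Converting i64->f32 through f64 would round twice.  Values below 2^53
+    // convert to f64 exactly, so they are left untouched.  For larger values
+    // the low 11 bits are collapsed into a single "sticky" bit (bit 11)
+    // whenever any of them is set, after which rounding twice (to f64, then
+    // to f32) gives the same result as rounding the original value once.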
+    SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
+         DAG.getConstant(UINT64_C(0xfffffffffffff800), MVT::i64));
+    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And,
+         DAG.getConstant(UINT64_C(0x800), MVT::i64));
+    SDValue And2 = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
+         DAG.getConstant(UINT64_C(0x7ff), MVT::i64));
+    SDValue Ne = DAG.getSetCC(dl, TLI.getSetCCResultType(MVT::i64),
+                   And2, DAG.getConstant(UINT64_C(0), MVT::i64), ISD::SETNE);
+    SDValue Sel = DAG.getNode(ISD::SELECT, dl, MVT::i64, Ne, Or, Op0);
+    SDValue Ge = DAG.getSetCC(dl, TLI.getSetCCResultType(MVT::i64),
+                   Op0, DAG.getConstant(UINT64_C(0x0020000000000000), MVT::i64),
+                    ISD::SETUGE);
+    SDValue Sel2 = DAG.getNode(ISD::SELECT, dl, MVT::i64, Ge, Sel, Op0);
+
+    SDValue Sh = DAG.getNode(ISD::SRL, dl, MVT::i64, Sel2,
+                             DAG.getConstant(32, SHVT));
+    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sh);
+    SDValue Fcvt = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Trunc);
+    SDValue TwoP32 =
+      DAG.getConstantFP(BitsToDouble(UINT64_C(0x41f0000000000000)), MVT::f64);
+    SDValue Fmul = DAG.getNode(ISD::FMUL, dl, MVT::f64, TwoP32, Fcvt);
+    SDValue Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sel2);
+    SDValue Fcvt2 = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Lo);
+    SDValue Fadd = DAG.getNode(ISD::FADD, dl, MVT::f64, Fmul, Fcvt2);
+    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Fadd,
+                       DAG.getIntPtrConstant(0));
+  }
+
+  SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
+
+  SDValue SignSet = DAG.getSetCC(dl, TLI.getSetCCResultType(Op0.getValueType()),
+                                 Op0, DAG.getConstant(0, Op0.getValueType()),
+                                 ISD::SETLT);
+  SDValue Zero = DAG.getIntPtrConstant(0), Four = DAG.getIntPtrConstant(4);
+  SDValue CstOffset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(),
+                                    SignSet, Four, Zero);
+
+  // If the sign bit of the integer is set, the large number will be treated
+  // as a negative number.  To counteract this, the dynamic code adds an
+  // offset depending on the data type.
+  uint64_t FF;
+  switch (Op0.getValueType().getSimpleVT().SimpleTy) {
+  default: assert(0 && "Unsupported integer type!");
+  case MVT::i8 : FF = 0x43800000ULL; break;  // 2^8  (as a float)
+  case MVT::i16: FF = 0x47800000ULL; break;  // 2^16 (as a float)
+  case MVT::i32: FF = 0x4F800000ULL; break;  // 2^32 (as a float)
+  case MVT::i64: FF = 0x5F800000ULL; break;  // 2^64 (as a float)
+  }
+  if (TLI.isLittleEndian()) FF <<= 32;
+  Constant *FudgeFactor = ConstantInt::get(
+                                       Type::getInt64Ty(*DAG.getContext()), FF);
+
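+  // The i64 constant packs two f32 bit patterns side by side: 0.0f at byte
+  // offset 0 and the 2^N correction at byte offset 4 (the shift above keeps
+  // that layout on little-endian targets).  CstOffset (0 or 4) selects which
+  // float the load below picks up, so only negative inputs get the fudge.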
+  SDValue CPIdx = DAG.getConstantPool(FudgeFactor, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  CPIdx = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), CPIdx, CstOffset);
+  Alignment = std::min(Alignment, 4u);
+  SDValue FudgeInReg;
+  if (DestVT == MVT::f32)
+    FudgeInReg = DAG.getLoad(MVT::f32, dl, DAG.getEntryNode(), CPIdx,
+                             PseudoSourceValue::getConstantPool(), 0,
+                             false, false, Alignment);
+  else {
+    FudgeInReg =
+      LegalizeOp(DAG.getExtLoad(ISD::EXTLOAD, DestVT, dl,
+                                DAG.getEntryNode(), CPIdx,
+                                PseudoSourceValue::getConstantPool(), 0,
+                                MVT::f32, false, false, Alignment));
+  }
+
+  return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg);
+}
+
+/// PromoteLegalINT_TO_FP - This function is responsible for legalizing a
+/// *INT_TO_FP operation of the specified operand when the target requests that
+/// we promote it.  At this point, we know that the result and operand types are
+/// legal for the target, and that there is a legal UINT_TO_FP or SINT_TO_FP
+/// operation that takes a larger input.
+SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp,
+                                                    EVT DestVT,
+                                                    bool isSigned,
+                                                    DebugLoc dl) {
+  // First step, figure out the appropriate *INT_TO_FP operation to use.
+  EVT NewInTy = LegalOp.getValueType();
+
+  unsigned OpToUse = 0;
+
+  // Scan for the appropriate larger type to use.
+  while (1) {
+    NewInTy = (MVT::SimpleValueType)(NewInTy.getSimpleVT().SimpleTy+1);
+    assert(NewInTy.isInteger() && "Ran out of possibilities!");
+
+    // If the target supports SINT_TO_FP of this type, use it.
+    if (TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, NewInTy)) {
+      OpToUse = ISD::SINT_TO_FP;
+      break;
+    }
+    if (isSigned) continue;
+
+    // If the target supports UINT_TO_FP of this type, use it.
+    if (TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, NewInTy)) {
+      OpToUse = ISD::UINT_TO_FP;
+      break;
+    }
+
+    // Otherwise, try a larger type.
+  }
+
+  // Okay, we found the operation and type to use.  Zero extend our input to the
+  // desired type then run the operation on it.
+  return DAG.getNode(OpToUse, dl, DestVT,
+                     DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+                                 dl, NewInTy, LegalOp));
+}
+
+/// PromoteLegalFP_TO_INT - This function is responsible for legalizing a
+/// FP_TO_*INT operation of the specified operand when the target requests that
+/// we promote it.  At this point, we know that the result and operand types are
+/// legal for the target, and that there is a legal FP_TO_UINT or FP_TO_SINT
+/// operation that returns a larger result.
+SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp,
+                                                    EVT DestVT,
+                                                    bool isSigned,
+                                                    DebugLoc dl) {
+  // First step, figure out the appropriate FP_TO*INT operation to use.
+  EVT NewOutTy = DestVT;
+
+  unsigned OpToUse = 0;
+
+  // Scan for the appropriate larger type to use.
+  while (1) {
+    NewOutTy = (MVT::SimpleValueType)(NewOutTy.getSimpleVT().SimpleTy+1);
+    assert(NewOutTy.isInteger() && "Ran out of possibilities!");
+
+    if (TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NewOutTy)) {
+      OpToUse = ISD::FP_TO_SINT;
+      break;
+    }
+
+    if (TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, NewOutTy)) {
+      OpToUse = ISD::FP_TO_UINT;
+      break;
+    }
+
+    // Otherwise, try a larger type.
+  }
+
+  // Okay, we found the operation and type to use.
+  SDValue Operation = DAG.getNode(OpToUse, dl, NewOutTy, LegalOp);
+
+  // Truncate the result of the extended FP_TO_*INT operation to the desired
+  // size.
+  return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation);
+}
+
+/// ExpandBSWAP - Open code the operations for BSWAP of the specified operation.
+///
+SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, DebugLoc dl) {
+  EVT VT = Op.getValueType();
+  EVT SHVT = TLI.getShiftAmountTy();
+  SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5, Tmp6, Tmp7, Tmp8;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: assert(0 && "Unhandled Expand type in BSWAP!");
+  case MVT::i16:
+    Tmp2 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, SHVT));
+    Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, SHVT));
+    return DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  case MVT::i32:
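+    // Assemble ddccbbaa from aabbccdd: shift each byte toward its mirrored
+    // position, mask the two middle bytes into place, and OR it all together.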
+    Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, SHVT));
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, SHVT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, SHVT));
+    Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, SHVT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3, DAG.getConstant(0xFF0000, VT));
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(0xFF00, VT));
+    Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);
+    Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1);
+    return DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2);
+  case MVT::i64:
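+    // Same idea with eight bytes: shift each byte to its mirrored position,
+    // mask out everything else, and OR the partial results in a tree.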
+    Tmp8 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(56, SHVT));
+    Tmp7 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(40, SHVT));
+    Tmp6 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, SHVT));
+    Tmp5 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, SHVT));
+    Tmp4 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, SHVT));
+    Tmp3 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, SHVT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(40, SHVT));
+    Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(56, SHVT));
+    Tmp7 = DAG.getNode(ISD::AND, dl, VT, Tmp7, DAG.getConstant(255ULL<<48, VT));
+    Tmp6 = DAG.getNode(ISD::AND, dl, VT, Tmp6, DAG.getConstant(255ULL<<40, VT));
+    Tmp5 = DAG.getNode(ISD::AND, dl, VT, Tmp5, DAG.getConstant(255ULL<<32, VT));
+    Tmp4 = DAG.getNode(ISD::AND, dl, VT, Tmp4, DAG.getConstant(255ULL<<24, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3, DAG.getConstant(255ULL<<16, VT));
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(255ULL<<8 , VT));
+    Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp7);
+    Tmp6 = DAG.getNode(ISD::OR, dl, VT, Tmp6, Tmp5);
+    Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);
+    Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1);
+    Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp6);
+    Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2);
+    return DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp4);
+  }
+}
+
+/// ExpandBitCount - Expand the specified bitcount instruction into operations.
+///
+SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
+                                             DebugLoc dl) {
+  switch (Opc) {
+  default: assert(0 && "Cannot expand this yet!");
+  case ISD::CTPOP: {
+    static const uint64_t mask[6] = {
+      0x5555555555555555ULL, 0x3333333333333333ULL,
+      0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
+      0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL
+    };
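+    // Parallel bit count (Hacker's Delight).  For i32 the loop emits nodes
+    // computing, in effect:
+    //   x = (x & 0x55555555) + ((x >> 1)  & 0x55555555);
+    //   x = (x & 0x33333333) + ((x >> 2)  & 0x33333333);
+    //   x = (x & 0x0F0F0F0F) + ((x >> 4)  & 0x0F0F0F0F);
+    //   x = (x & 0x00FF00FF) + ((x >> 8)  & 0x00FF00FF);
+    //   x = (x & 0x0000FFFF) + ((x >> 16) & 0x0000FFFF);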
+    EVT VT = Op.getValueType();
+    EVT ShVT = TLI.getShiftAmountTy();
+    unsigned len = VT.getSizeInBits();
+    for (unsigned i = 0; (1U << i) <= (len / 2); ++i) {
+      // x = (x & mask[i]) + ((x >> (1 << i)) & mask[i])
+      unsigned EltSize = VT.isVector() ?
+        VT.getVectorElementType().getSizeInBits() : len;
+      SDValue Tmp2 = DAG.getConstant(APInt(EltSize, mask[i]), VT);
+      SDValue Tmp3 = DAG.getConstant(1ULL << i, ShVT);
+      Op = DAG.getNode(ISD::ADD, dl, VT,
+                       DAG.getNode(ISD::AND, dl, VT, Op, Tmp2),
+                       DAG.getNode(ISD::AND, dl, VT,
+                                   DAG.getNode(ISD::SRL, dl, VT, Op, Tmp3),
+                                   Tmp2));
+    }
+    return Op;
+  }
+  case ISD::CTLZ: {
+    // for now, we do this:
+    // x = x | (x >> 1);
+    // x = x | (x >> 2);
+    // ...
+    // x = x | (x >>16);
+    // x = x | (x >>32); // for 64-bit input
+    // return popcount(~x);
+    //
+    // but see also: http://www.hackersdelight.org/HDcode/nlz.cc
+    EVT VT = Op.getValueType();
+    EVT ShVT = TLI.getShiftAmountTy();
+    unsigned len = VT.getSizeInBits();
+    for (unsigned i = 0; (1U << i) <= (len / 2); ++i) {
+      SDValue Tmp3 = DAG.getConstant(1ULL << i, ShVT);
+      Op = DAG.getNode(ISD::OR, dl, VT, Op,
+                       DAG.getNode(ISD::SRL, dl, VT, Op, Tmp3));
+    }
+    Op = DAG.getNOT(dl, Op, VT);
+    return DAG.getNode(ISD::CTPOP, dl, VT, Op);
+  }
+  case ISD::CTTZ: {
+    // for now, we use: { return popcount(~x & (x - 1)); }
+    // unless the target has ctlz but not ctpop, in which case we use:
+    // { return 32 - nlz(~x & (x-1)); }
+    // see also http://www.hackersdelight.org/HDcode/ntz.cc
+    EVT VT = Op.getValueType();
+    SDValue Tmp3 = DAG.getNode(ISD::AND, dl, VT,
+                               DAG.getNOT(dl, Op, VT),
+                               DAG.getNode(ISD::SUB, dl, VT, Op,
+                                           DAG.getConstant(1, VT)));
+    // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
+    if (!TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) &&
+        TLI.isOperationLegalOrCustom(ISD::CTLZ, VT))
+      return DAG.getNode(ISD::SUB, dl, VT,
+                         DAG.getConstant(VT.getSizeInBits(), VT),
+                         DAG.getNode(ISD::CTLZ, dl, VT, Tmp3));
+    return DAG.getNode(ISD::CTPOP, dl, VT, Tmp3);
+  }
+  }
+}
+
+std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
+  unsigned Opc = Node->getOpcode();
+  MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT();
+  RTLIB::Libcall LC;
+
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unhandled atomic intrinsic Expand!");
+    break;
+  case ISD::ATOMIC_SWAP:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_LOCK_TEST_AND_SET_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_8; break;
+    }
+    break;
+  case ISD::ATOMIC_CMP_SWAP:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_ADD:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_ADD_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_ADD_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_ADD_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_ADD_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_SUB:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_SUB_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_SUB_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_SUB_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_SUB_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_AND:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_AND_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_AND_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_AND_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_AND_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_OR:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_OR_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_OR_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_OR_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_OR_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_XOR:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_XOR_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_XOR_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_XOR_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_XOR_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_NAND:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_NAND_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_NAND_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_NAND_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_NAND_8; break;
+    }
+    break;
+  }
+
+  return ExpandChainLibCall(LC, Node, false);
+}
+
+void SelectionDAGLegalize::ExpandNode(SDNode *Node,
+                                      SmallVectorImpl<SDValue> &Results) {
+  DebugLoc dl = Node->getDebugLoc();
+  SDValue Tmp1, Tmp2, Tmp3, Tmp4;
+  switch (Node->getOpcode()) {
+  case ISD::CTPOP:
+  case ISD::CTLZ:
+  case ISD::CTTZ:
+    Tmp1 = ExpandBitCount(Node->getOpcode(), Node->getOperand(0), dl);
+    Results.push_back(Tmp1);
+    break;
+  case ISD::BSWAP:
+    Results.push_back(ExpandBSWAP(Node->getOperand(0), dl));
+    break;
+  case ISD::FRAMEADDR:
+  case ISD::RETURNADDR:
+  case ISD::FRAME_TO_ARGS_OFFSET:
+    Results.push_back(DAG.getConstant(0, Node->getValueType(0)));
+    break;
+  case ISD::FLT_ROUNDS_:
+    Results.push_back(DAG.getConstant(1, Node->getValueType(0)));
+    break;
+  case ISD::PREFETCH:
+  case ISD::VAEND:
+    Results.push_back(Node->getOperand(0));
+    break;
+  case ISD::MEMBARRIER: {
+    // If the target didn't lower this, lower it to a '__sync_synchronize()'
+    // call.
+    TargetLowering::ArgListTy Args;
+    std::pair<SDValue, SDValue> CallResult =
+      TLI.LowerCallTo(Node->getOperand(0), Type::getVoidTy(*DAG.getContext()),
+                      false, false, false, false, 0, CallingConv::C, false,
+                      /*isReturnValueUsed=*/true,
+                      DAG.getExternalSymbol("__sync_synchronize",
+                                            TLI.getPointerTy()),
+                      Args, DAG, dl);
+    Results.push_back(CallResult.second);
+    break;
+  }
+  // By default, atomic intrinsics are marked Legal and lowered. Targets
+  // which don't support them directly, however, may want libcalls, in which
+  // case they mark them Expand, and we get here.
+  // FIXME: Unimplemented for now. Add libcalls.
+  case ISD::ATOMIC_SWAP:
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_CMP_SWAP: {
+    std::pair<SDValue, SDValue> Tmp = ExpandAtomic(Node);
+    Results.push_back(Tmp.first);
+    Results.push_back(Tmp.second);
+    break;
+  }
+  case ISD::DYNAMIC_STACKALLOC:
+    ExpandDYNAMIC_STACKALLOC(Node, Results);
+    break;
+  case ISD::MERGE_VALUES:
+    for (unsigned i = 0; i < Node->getNumValues(); i++)
+      Results.push_back(Node->getOperand(i));
+    break;
+  case ISD::UNDEF: {
+    EVT VT = Node->getValueType(0);
+    if (VT.isInteger())
+      Results.push_back(DAG.getConstant(0, VT));
+    else {
+      assert(VT.isFloatingPoint() && "Unknown value type!");
+      Results.push_back(DAG.getConstantFP(0, VT));
+    }
+    break;
+  }
+  case ISD::TRAP: {
+    // If this operation is not supported, lower it to an 'abort()' call.
+    TargetLowering::ArgListTy Args;
+    std::pair<SDValue, SDValue> CallResult =
+      TLI.LowerCallTo(Node->getOperand(0), Type::getVoidTy(*DAG.getContext()),
+                      false, false, false, false, 0, CallingConv::C, false,
+                      /*isReturnValueUsed=*/true,
+                      DAG.getExternalSymbol("abort", TLI.getPointerTy()),
+                      Args, DAG, dl);
+    Results.push_back(CallResult.second);
+    break;
+  }
+  case ISD::FP_ROUND:
+  case ISD::BIT_CONVERT:
+    Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0),
+                            Node->getValueType(0), dl);
+    Results.push_back(Tmp1);
+    break;
+  case ISD::FP_EXTEND:
+    Tmp1 = EmitStackConvert(Node->getOperand(0),
+                            Node->getOperand(0).getValueType(),
+                            Node->getValueType(0), dl);
+    Results.push_back(Tmp1);
+    break;
+  case ISD::SIGN_EXTEND_INREG: {
+    // NOTE: we could fall back on load/store here too for targets without
+    // SAR.  However, it is doubtful that any exist.
+    EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+    EVT VT = Node->getValueType(0);
+    EVT ShiftAmountTy = TLI.getShiftAmountTy();
+    if (VT.isVector())
+      ShiftAmountTy = VT;
+    unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
+                        ExtraVT.getScalarType().getSizeInBits();
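+    // Shift left so the sign bit of the ExtraVT-wide field becomes the top
+    // bit, then arithmetic-shift back down to replicate it across the rest.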
+    SDValue ShiftCst = DAG.getConstant(BitsDiff, ShiftAmountTy);
+    Tmp1 = DAG.getNode(ISD::SHL, dl, Node->getValueType(0),
+                       Node->getOperand(0), ShiftCst);
+    Tmp1 = DAG.getNode(ISD::SRA, dl, Node->getValueType(0), Tmp1, ShiftCst);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::FP_ROUND_INREG: {
+    // The only way we can lower this is to turn it into a TRUNCSTORE,
+    // EXTLOAD pair, targeting a temporary location (a stack slot).
+
+    // NOTE: there is a choice here between constantly creating new stack
+    // slots and always reusing the same one.  We currently always create
+    // new ones, as reuse may inhibit scheduling.
+    EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+    Tmp1 = EmitStackConvert(Node->getOperand(0), ExtraVT,
+                            Node->getValueType(0), dl);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP,
+                                Node->getOperand(0), Node->getValueType(0), dl);
+    Results.push_back(Tmp1);
+    break;
+  case ISD::FP_TO_UINT: {
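+    // Expand FP_TO_UINT as a select of two FP_TO_SINTs: when x < 2^(N-1) the
+    // signed conversion is already correct; otherwise convert x - 2^(N-1)
+    // and put the top bit back with an XOR.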
+    SDValue True, False;
+    EVT VT = Node->getOperand(0).getValueType();
+    EVT NVT = Node->getValueType(0);
+    const uint64_t zero[] = {0, 0};
+    APFloat apf = APFloat(APInt(VT.getSizeInBits(), 2, zero));
+    APInt x = APInt::getSignBit(NVT.getSizeInBits());
+    (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
+    Tmp1 = DAG.getConstantFP(apf, VT);
+    Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(VT),
+                        Node->getOperand(0),
+                        Tmp1, ISD::SETLT);
+    True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0));
+    False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT,
+                        DAG.getNode(ISD::FSUB, dl, VT,
+                                    Node->getOperand(0), Tmp1));
+    False = DAG.getNode(ISD::XOR, dl, NVT, False,
+                        DAG.getConstant(x, NVT));
+    Tmp1 = DAG.getNode(ISD::SELECT, dl, NVT, Tmp2, True, False);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::VAARG: {
+    const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+    EVT VT = Node->getValueType(0);
+    Tmp1 = Node->getOperand(0);
+    Tmp2 = Node->getOperand(1);
+    unsigned Align = Node->getConstantOperandVal(3);
+
+    SDValue VAListLoad = DAG.getLoad(TLI.getPointerTy(), dl, Tmp1, Tmp2, V, 0,
+                                     false, false, 0);
+    SDValue VAList = VAListLoad;
+
+    if (Align > TLI.getMinStackArgumentAlignment()) {
+      assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
+
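+      // Round the va_list pointer up to the required alignment:
+      //   p = (p + Align - 1) & -Align,  where -Align == ~(Align - 1).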
+      VAList = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList,
+                           DAG.getConstant(Align - 1,
+                                           TLI.getPointerTy()));
+
+      VAList = DAG.getNode(ISD::AND, dl, TLI.getPointerTy(), VAList,
+                           DAG.getConstant(-Align,
+                                           TLI.getPointerTy()));
+    }
+
+    // Increment the pointer, VAList, to the next vaarg
+    Tmp3 = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList,
+                       DAG.getConstant(TLI.getTargetData()->
+                          getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext())),
+                                       TLI.getPointerTy()));
+    // Store the incremented VAList to the legalized pointer
+    Tmp3 = DAG.getStore(VAListLoad.getValue(1), dl, Tmp3, Tmp2, V, 0,
+                        false, false, 0);
+    // Load the actual argument out of the pointer VAList
+    Results.push_back(DAG.getLoad(VT, dl, Tmp3, VAList, NULL, 0,
+                                  false, false, 0));
+    Results.push_back(Results[0].getValue(1));
+    break;
+  }
+  case ISD::VACOPY: {
+    // This defaults to loading a pointer from the input and storing it to the
+    // output, returning the chain.
+    const Value *VD = cast<SrcValueSDNode>(Node->getOperand(3))->getValue();
+    const Value *VS = cast<SrcValueSDNode>(Node->getOperand(4))->getValue();
+    Tmp1 = DAG.getLoad(TLI.getPointerTy(), dl, Node->getOperand(0),
+                       Node->getOperand(2), VS, 0, false, false, 0);
+    Tmp1 = DAG.getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1), VD, 0,
+                        false, false, 0);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::EXTRACT_VECTOR_ELT:
+    if (Node->getOperand(0).getValueType().getVectorNumElements() == 1)
+      // This must be an access of the only element.  Return it.
+      Tmp1 = DAG.getNode(ISD::BIT_CONVERT, dl, Node->getValueType(0),
+                         Node->getOperand(0));
+    else
+      Tmp1 = ExpandExtractFromVectorThroughStack(SDValue(Node, 0));
+    Results.push_back(Tmp1);
+    break;
+  case ISD::EXTRACT_SUBVECTOR:
+    Results.push_back(ExpandExtractFromVectorThroughStack(SDValue(Node, 0)));
+    break;
+  case ISD::CONCAT_VECTORS: {
+    Results.push_back(ExpandVectorBuildThroughStack(Node));
+    break;
+  }
+  case ISD::SCALAR_TO_VECTOR:
+    Results.push_back(ExpandSCALAR_TO_VECTOR(Node));
+    break;
+  case ISD::INSERT_VECTOR_ELT:
+    Results.push_back(ExpandINSERT_VECTOR_ELT(Node->getOperand(0),
+                                              Node->getOperand(1),
+                                              Node->getOperand(2), dl));
+    break;
+  case ISD::VECTOR_SHUFFLE: {
+    SmallVector<int, 8> Mask;
+    cast<ShuffleVectorSDNode>(Node)->getMask(Mask);
+
+    EVT VT = Node->getValueType(0);
+    EVT EltVT = VT.getVectorElementType();
+    if (getTypeAction(EltVT) == Promote)
+      EltVT = TLI.getTypeToTransformTo(*DAG.getContext(), EltVT);
+    unsigned NumElems = VT.getVectorNumElements();
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0; i != NumElems; ++i) {
+      if (Mask[i] < 0) {
+        Ops.push_back(DAG.getUNDEF(EltVT));
+        continue;
+      }
+      unsigned Idx = Mask[i];
+      if (Idx < NumElems)
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                  Node->getOperand(0),
+                                  DAG.getIntPtrConstant(Idx)));
+      else
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                  Node->getOperand(1),
+                                  DAG.getIntPtrConstant(Idx - NumElems)));
+    }
+    Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::EXTRACT_ELEMENT: {
+    EVT OpTy = Node->getOperand(0).getValueType();
+    if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+      // 1 -> Hi
+      Tmp1 = DAG.getNode(ISD::SRL, dl, OpTy, Node->getOperand(0),
+                         DAG.getConstant(OpTy.getSizeInBits()/2,
+                                         TLI.getShiftAmountTy()));
+      Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Tmp1);
+    } else {
+      // 0 -> Lo
+      Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0),
+                         Node->getOperand(0));
+    }
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::STACKSAVE:
+    // Expand to CopyFromReg if the target set
+    // StackPointerRegisterToSaveRestore.
+    if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) {
+      Results.push_back(DAG.getCopyFromReg(Node->getOperand(0), dl, SP,
+                                           Node->getValueType(0)));
+      Results.push_back(Results[0].getValue(1));
+    } else {
+      Results.push_back(DAG.getUNDEF(Node->getValueType(0)));
+      Results.push_back(Node->getOperand(0));
+    }
+    break;
+  case ISD::STACKRESTORE:
+    // Expand to CopyToReg if the target set
+    // StackPointerRegisterToSaveRestore.
+    if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) {
+      Results.push_back(DAG.getCopyToReg(Node->getOperand(0), dl, SP,
+                                         Node->getOperand(1)));
+    } else {
+      Results.push_back(Node->getOperand(0));
+    }
+    break;
+  case ISD::FCOPYSIGN:
+    Results.push_back(ExpandFCOPYSIGN(Node));
+    break;
+  case ISD::FNEG:
+    // Expand Y = FNEG(X) -> Y = SUB -0.0, X
+    Tmp1 = DAG.getConstantFP(-0.0, Node->getValueType(0));
+    Tmp1 = DAG.getNode(ISD::FSUB, dl, Node->getValueType(0), Tmp1,
+                       Node->getOperand(0));
+    Results.push_back(Tmp1);
+    break;
+  case ISD::FABS: {
+    // Expand Y = FABS(X) -> Y = (X >u 0.0) ? X : fneg(X).
+    EVT VT = Node->getValueType(0);
+    Tmp1 = Node->getOperand(0);
+    Tmp2 = DAG.getConstantFP(0.0, VT);
+    Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(Tmp1.getValueType()),
+                        Tmp1, Tmp2, ISD::SETUGT);
+    Tmp3 = DAG.getNode(ISD::FNEG, dl, VT, Tmp1);
+    Tmp1 = DAG.getNode(ISD::SELECT, dl, VT, Tmp2, Tmp1, Tmp3);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::FSQRT:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64,
+                                      RTLIB::SQRT_F80, RTLIB::SQRT_PPCF128));
+    break;
+  case ISD::FSIN:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64,
+                                      RTLIB::SIN_F80, RTLIB::SIN_PPCF128));
+    break;
+  case ISD::FCOS:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64,
+                                      RTLIB::COS_F80, RTLIB::COS_PPCF128));
+    break;
+  case ISD::FLOG:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64,
+                                      RTLIB::LOG_F80, RTLIB::LOG_PPCF128));
+    break;
+  case ISD::FLOG2:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64,
+                                      RTLIB::LOG2_F80, RTLIB::LOG2_PPCF128));
+    break;
+  case ISD::FLOG10:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64,
+                                      RTLIB::LOG10_F80, RTLIB::LOG10_PPCF128));
+    break;
+  case ISD::FEXP:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64,
+                                      RTLIB::EXP_F80, RTLIB::EXP_PPCF128));
+    break;
+  case ISD::FEXP2:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64,
+                                      RTLIB::EXP2_F80, RTLIB::EXP2_PPCF128));
+    break;
+  case ISD::FTRUNC:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::TRUNC_F32, RTLIB::TRUNC_F64,
+                                      RTLIB::TRUNC_F80, RTLIB::TRUNC_PPCF128));
+    break;
+  case ISD::FFLOOR:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::FLOOR_F32, RTLIB::FLOOR_F64,
+                                      RTLIB::FLOOR_F80, RTLIB::FLOOR_PPCF128));
+    break;
+  case ISD::FCEIL:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::CEIL_F32, RTLIB::CEIL_F64,
+                                      RTLIB::CEIL_F80, RTLIB::CEIL_PPCF128));
+    break;
+  case ISD::FRINT:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64,
+                                      RTLIB::RINT_F80, RTLIB::RINT_PPCF128));
+    break;
+  case ISD::FNEARBYINT:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32,
+                                      RTLIB::NEARBYINT_F64,
+                                      RTLIB::NEARBYINT_F80,
+                                      RTLIB::NEARBYINT_PPCF128));
+    break;
+  case ISD::FPOWI:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64,
+                                      RTLIB::POWI_F80, RTLIB::POWI_PPCF128));
+    break;
+  case ISD::FPOW:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64,
+                                      RTLIB::POW_F80, RTLIB::POW_PPCF128));
+    break;
+  case ISD::FDIV:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64,
+                                      RTLIB::DIV_F80, RTLIB::DIV_PPCF128));
+    break;
+  case ISD::FREM:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::REM_F32, RTLIB::REM_F64,
+                                      RTLIB::REM_F80, RTLIB::REM_PPCF128));
+    break;
+  case ISD::FP16_TO_FP32:
+    Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false));
+    break;
+  case ISD::FP32_TO_FP16:
+    Results.push_back(ExpandLibCall(RTLIB::FPROUND_F32_F16, Node, false));
+    break;
+  case ISD::ConstantFP: {
+    ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Node);
+    // Check to see if this FP immediate is already legal.
+    // If this is a legal constant, turn it into a TargetConstantFP node.
+    if (TLI.isFPImmLegal(CFP->getValueAPF(), Node->getValueType(0)))
+      Results.push_back(SDValue(Node, 0));
+    else
+      Results.push_back(ExpandConstantFP(CFP, true, DAG, TLI));
+    break;
+  }
+  case ISD::SUB: {
+    EVT VT = Node->getValueType(0);
+    assert(TLI.isOperationLegalOrCustom(ISD::ADD, VT) &&
+           TLI.isOperationLegalOrCustom(ISD::XOR, VT) &&
+           "Don't know how to expand this subtraction!");
+    Tmp1 = DAG.getNode(ISD::XOR, dl, VT, Node->getOperand(1),
+               DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT));
+    Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp1, DAG.getConstant(1, VT));
+    Results.push_back(DAG.getNode(ISD::ADD, dl, VT, Node->getOperand(0), Tmp1));
+    break;
+  }
+  case ISD::UREM:
+  case ISD::SREM: {
+    EVT VT = Node->getValueType(0);
+    SDVTList VTs = DAG.getVTList(VT, VT);
+    bool isSigned = Node->getOpcode() == ISD::SREM;
+    unsigned DivOpc = isSigned ? ISD::SDIV : ISD::UDIV;
+    unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+    Tmp2 = Node->getOperand(0);
+    Tmp3 = Node->getOperand(1);
+    if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) {
+      Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1);
+    } else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) {
+      // X % Y -> X - (X/Y)*Y
+      Tmp1 = DAG.getNode(DivOpc, dl, VT, Tmp2, Tmp3);
+      Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Tmp3);
+      Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp1);
+    } else if (isSigned) {
+      Tmp1 = ExpandIntLibCall(Node, true,
+                              RTLIB::SREM_I8,
+                              RTLIB::SREM_I16, RTLIB::SREM_I32,
+                              RTLIB::SREM_I64, RTLIB::SREM_I128);
+    } else {
+      Tmp1 = ExpandIntLibCall(Node, false,
+                              RTLIB::UREM_I8,
+                              RTLIB::UREM_I16, RTLIB::UREM_I32,
+                              RTLIB::UREM_I64, RTLIB::UREM_I128);
+    }
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::UDIV:
+  case ISD::SDIV: {
+    bool isSigned = Node->getOpcode() == ISD::SDIV;
+    unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+    EVT VT = Node->getValueType(0);
+    SDVTList VTs = DAG.getVTList(VT, VT);
+    if (TLI.isOperationLegalOrCustom(DivRemOpc, VT))
+      Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0),
+                         Node->getOperand(1));
+    else if (isSigned)
+      Tmp1 = ExpandIntLibCall(Node, true,
+                              RTLIB::SDIV_I8,
+                              RTLIB::SDIV_I16, RTLIB::SDIV_I32,
+                              RTLIB::SDIV_I64, RTLIB::SDIV_I128);
+    else
+      Tmp1 = ExpandIntLibCall(Node, false,
+                              RTLIB::UDIV_I8,
+                              RTLIB::UDIV_I16, RTLIB::UDIV_I32,
+                              RTLIB::UDIV_I64, RTLIB::UDIV_I128);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::MULHU:
+  case ISD::MULHS: {
+    unsigned ExpandOpcode = Node->getOpcode() == ISD::MULHU ? ISD::UMUL_LOHI :
+                                                              ISD::SMUL_LOHI;
+    EVT VT = Node->getValueType(0);
+    SDVTList VTs = DAG.getVTList(VT, VT);
+    assert(TLI.isOperationLegalOrCustom(ExpandOpcode, VT) &&
+           "If this wasn't legal, it shouldn't have been created!");
+    Tmp1 = DAG.getNode(ExpandOpcode, dl, VTs, Node->getOperand(0),
+                       Node->getOperand(1));
+    Results.push_back(Tmp1.getValue(1));
+    break;
+  }
+  case ISD::MUL: {
+    EVT VT = Node->getValueType(0);
+    SDVTList VTs = DAG.getVTList(VT, VT);
+    // See if multiply or divide can be lowered using two-result operations.
+    // We just need the low half of the multiply; try both the signed
+    // and unsigned forms. If the target supports both SMUL_LOHI and
+    // UMUL_LOHI, form a preference by checking which forms of plain
+    // MULH it supports.
+    bool HasSMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::SMUL_LOHI, VT);
+    bool HasUMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, VT);
+    bool HasMULHS = TLI.isOperationLegalOrCustom(ISD::MULHS, VT);
+    bool HasMULHU = TLI.isOperationLegalOrCustom(ISD::MULHU, VT);
+    unsigned OpToUse = 0;
+    if (HasSMUL_LOHI && !HasMULHS) {
+      OpToUse = ISD::SMUL_LOHI;
+    } else if (HasUMUL_LOHI && !HasMULHU) {
+      OpToUse = ISD::UMUL_LOHI;
+    } else if (HasSMUL_LOHI) {
+      OpToUse = ISD::SMUL_LOHI;
+    } else if (HasUMUL_LOHI) {
+      OpToUse = ISD::UMUL_LOHI;
+    }
+    if (OpToUse) {
+      Results.push_back(DAG.getNode(OpToUse, dl, VTs, Node->getOperand(0),
+                                    Node->getOperand(1)));
+      break;
+    }
+    Tmp1 = ExpandIntLibCall(Node, false,
+                            RTLIB::MUL_I8,
+                            RTLIB::MUL_I16, RTLIB::MUL_I32,
+                            RTLIB::MUL_I64, RTLIB::MUL_I128);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::SADDO:
+  case ISD::SSUBO: {
+    SDValue LHS = Node->getOperand(0);
+    SDValue RHS = Node->getOperand(1);
+    SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ?
+                              ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+                              LHS, RHS);
+    Results.push_back(Sum);
+    EVT OType = Node->getValueType(1);
+
+    SDValue Zero = DAG.getConstant(0, LHS.getValueType());
+
+    //   LHSSign -> LHS >= 0
+    //   RHSSign -> RHS >= 0
+    //   SumSign -> Sum >= 0
+    //
+    //   Add:
+    //   Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
+    //   Sub:
+    //   Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
+    //
+    SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE);
+    SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE);
+    SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign,
+                                      Node->getOpcode() == ISD::SADDO ?
+                                      ISD::SETEQ : ISD::SETNE);
+
+    SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE);
+    SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);
+
+    SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
+    Results.push_back(Cmp);
+    break;
+  }
+  case ISD::UADDO:
+  case ISD::USUBO: {
+    SDValue LHS = Node->getOperand(0);
+    SDValue RHS = Node->getOperand(1);
+    SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::UADDO ?
+                              ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+                              LHS, RHS);
+    Results.push_back(Sum);
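+    // Unsigned overflow check: an add overflowed iff the sum wrapped below
+    // LHS; a subtract borrowed iff the difference is greater than LHS.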
+    Results.push_back(DAG.getSetCC(dl, Node->getValueType(1), Sum, LHS,
+                                   Node->getOpcode() == ISD::UADDO ?
+                                   ISD::SETULT : ISD::SETUGT));
+    break;
+  }
+  case ISD::UMULO:
+  case ISD::SMULO: {
+    EVT VT = Node->getValueType(0);
+    SDValue LHS = Node->getOperand(0);
+    SDValue RHS = Node->getOperand(1);
+    SDValue BottomHalf;
+    SDValue TopHalf;
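+    // Compute the full product and inspect the top half: unsigned multiply
+    // overflows iff the top half is nonzero; signed multiply overflows iff
+    // the top half differs from the sign-extension of the bottom half.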
+    static const unsigned Ops[2][3] =
+        { { ISD::MULHU, ISD::UMUL_LOHI, ISD::ZERO_EXTEND },
+          { ISD::MULHS, ISD::SMUL_LOHI, ISD::SIGN_EXTEND }};
+    bool isSigned = Node->getOpcode() == ISD::SMULO;
+    if (TLI.isOperationLegalOrCustom(Ops[isSigned][0], VT)) {
+      BottomHalf = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+      TopHalf = DAG.getNode(Ops[isSigned][0], dl, VT, LHS, RHS);
+    } else if (TLI.isOperationLegalOrCustom(Ops[isSigned][1], VT)) {
+      BottomHalf = DAG.getNode(Ops[isSigned][1], dl, DAG.getVTList(VT, VT), LHS,
+                               RHS);
+      TopHalf = BottomHalf.getValue(1);
+    } else {
+      // FIXME: We should be able to fall back to a libcall with an illegal
+      // type in some cases.
+      // Also, we can fall back to a division in some cases, but that's a big
+      // performance hit in the general case.
+      assert(TLI.isTypeLegal(EVT::getIntegerVT(*DAG.getContext(),
+                                               VT.getSizeInBits() * 2)) &&
+             "Don't know how to expand this operation yet!");
+      EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
+      LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS);
+      RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS);
+      Tmp1 = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS);
+      BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Tmp1,
+                               DAG.getIntPtrConstant(0));
+      TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Tmp1,
+                            DAG.getIntPtrConstant(1));
+    }
+    if (isSigned) {
+      Tmp1 = DAG.getConstant(VT.getSizeInBits() - 1, TLI.getShiftAmountTy());
+      Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, Tmp1);
+      TopHalf = DAG.getSetCC(dl, TLI.getSetCCResultType(VT), TopHalf, Tmp1,
+                             ISD::SETNE);
+    } else {
+      TopHalf = DAG.getSetCC(dl, TLI.getSetCCResultType(VT), TopHalf,
+                             DAG.getConstant(0, VT), ISD::SETNE);
+    }
+    Results.push_back(BottomHalf);
+    Results.push_back(TopHalf);
+    break;
+  }
+  case ISD::BUILD_PAIR: {
+    EVT PairTy = Node->getValueType(0);
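+    // Build the pair as (anyext(Hi) << N/2) | zext(Lo).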
+    Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, PairTy, Node->getOperand(0));
+    Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, PairTy, Node->getOperand(1));
+    Tmp2 = DAG.getNode(ISD::SHL, dl, PairTy, Tmp2,
+                       DAG.getConstant(PairTy.getSizeInBits()/2,
+                                       TLI.getShiftAmountTy()));
+    Results.push_back(DAG.getNode(ISD::OR, dl, PairTy, Tmp1, Tmp2));
+    break;
+  }
+  case ISD::SELECT:
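+    // Lower SELECT into SELECT_CC: fold an incoming SETCC condition straight
+    // into the node, otherwise compare the condition against zero (SETNE).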
+    Tmp1 = Node->getOperand(0);
+    Tmp2 = Node->getOperand(1);
+    Tmp3 = Node->getOperand(2);
+    if (Tmp1.getOpcode() == ISD::SETCC) {
+      Tmp1 = DAG.getSelectCC(dl, Tmp1.getOperand(0), Tmp1.getOperand(1),
+                             Tmp2, Tmp3,
+                             cast<CondCodeSDNode>(Tmp1.getOperand(2))->get());
+    } else {
+      Tmp1 = DAG.getSelectCC(dl, Tmp1,
+                             DAG.getConstant(0, Tmp1.getValueType()),
+                             Tmp2, Tmp3, ISD::SETNE);
+    }
+    Results.push_back(Tmp1);
+    break;
+  case ISD::BR_JT: {
+    SDValue Chain = Node->getOperand(0);
+    SDValue Table = Node->getOperand(1);
+    SDValue Index = Node->getOperand(2);
+
+    EVT PTy = TLI.getPointerTy();
+
+    const TargetData &TD = *TLI.getTargetData();
+    unsigned EntrySize =
+      DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD);
+
+    Index = DAG.getNode(ISD::MUL, dl, PTy,
+                        Index, DAG.getConstant(EntrySize, PTy));
+    SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
+
+    EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
+    SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, PTy, dl, Chain, Addr,
+                                PseudoSourceValue::getJumpTable(), 0, MemVT,
+                                false, false, 0);
+    Addr = LD;
+    if (TM.getRelocationModel() == Reloc::PIC_) {
+      // For PIC, the sequence is:
+      // BRIND(load(Jumptable + index) + RelocBase)
+      // RelocBase can be JumpTable, GOT or some sort of global base.
+      Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr,
+                          TLI.getPICJumpTableRelocBase(Table, DAG));
+    }
+    Tmp1 = DAG.getNode(ISD::BRIND, dl, MVT::Other, LD.getValue(1), Addr);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::BRCOND:
+    // Expand brcond's setcc into its constituent parts and create a BR_CC
+    // Node.
+    Tmp1 = Node->getOperand(0);
+    Tmp2 = Node->getOperand(1);
+    if (Tmp2.getOpcode() == ISD::SETCC) {
+      Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other,
+                         Tmp1, Tmp2.getOperand(2),
+                         Tmp2.getOperand(0), Tmp2.getOperand(1),
+                         Node->getOperand(2));
+    } else {
+      Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1,
+                         DAG.getCondCode(ISD::SETNE), Tmp2,
+                         DAG.getConstant(0, Tmp2.getValueType()),
+                         Node->getOperand(2));
+    }
+    Results.push_back(Tmp1);
+    break;
+  case ISD::SETCC: {
+    Tmp1 = Node->getOperand(0);
+    Tmp2 = Node->getOperand(1);
+    Tmp3 = Node->getOperand(2);
+    LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2, Tmp3, dl);
+
+    // If we expanded the SETCC into an AND/OR, return the new node
+    if (Tmp2.getNode() == 0) {
+      Results.push_back(Tmp1);
+      break;
+    }
+
+    // Otherwise, SETCC for the given comparison type must be completely
+    // illegal; expand it into a SELECT_CC.
+    EVT VT = Node->getValueType(0);
+    Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, VT, Tmp1, Tmp2,
+                       DAG.getConstant(1, VT), DAG.getConstant(0, VT), Tmp3);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::SELECT_CC: {
+    Tmp1 = Node->getOperand(0);   // LHS
+    Tmp2 = Node->getOperand(1);   // RHS
+    Tmp3 = Node->getOperand(2);   // True
+    Tmp4 = Node->getOperand(3);   // False
+    SDValue CC = Node->getOperand(4);
+
+    LegalizeSetCCCondCode(TLI.getSetCCResultType(Tmp1.getValueType()),
+                          Tmp1, Tmp2, CC, dl);
+
+    assert(!Tmp2.getNode() && "Can't legalize SELECT_CC with legal condition!");
+    Tmp2 = DAG.getConstant(0, Tmp1.getValueType());
+    CC = DAG.getCondCode(ISD::SETNE);
+    Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1, Tmp2,
+                       Tmp3, Tmp4, CC);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::BR_CC: {
+    Tmp1 = Node->getOperand(0);              // Chain
+    Tmp2 = Node->getOperand(2);              // LHS
+    Tmp3 = Node->getOperand(3);              // RHS
+    Tmp4 = Node->getOperand(1);              // CC
+
+    LegalizeSetCCCondCode(TLI.getSetCCResultType(Tmp2.getValueType()),
+                          Tmp2, Tmp3, Tmp4, dl);
+    LastCALLSEQ_END = DAG.getEntryNode();
+
+    assert(!Tmp3.getNode() && "Can't legalize BR_CC with legal condition!");
+    Tmp3 = DAG.getConstant(0, Tmp2.getValueType());
+    Tmp4 = DAG.getCondCode(ISD::SETNE);
+    Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4, Tmp2,
+                       Tmp3, Node->getOperand(4));
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::GLOBAL_OFFSET_TABLE:
+  case ISD::GlobalAddress:
+  case ISD::GlobalTLSAddress:
+  case ISD::ExternalSymbol:
+  case ISD::ConstantPool:
+  case ISD::JumpTable:
+  case ISD::INTRINSIC_W_CHAIN:
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_VOID:
+    // FIXME: Custom lowering for these operations shouldn't return null!
+    for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+      Results.push_back(SDValue(Node, i));
+    break;
+  }
+}
+
+void SelectionDAGLegalize::PromoteNode(SDNode *Node,
+                                       SmallVectorImpl<SDValue> &Results) {
+  EVT OVT = Node->getValueType(0);
+  if (Node->getOpcode() == ISD::UINT_TO_FP ||
+      Node->getOpcode() == ISD::SINT_TO_FP ||
+      Node->getOpcode() == ISD::SETCC) {
+    OVT = Node->getOperand(0).getValueType();
+  }
+  EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), OVT);
+  DebugLoc dl = Node->getDebugLoc();
+  SDValue Tmp1, Tmp2, Tmp3;
+  switch (Node->getOpcode()) {
+  case ISD::CTTZ:
+  case ISD::CTLZ:
+  case ISD::CTPOP:
+    // Zero extend the argument.
+    Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+    // Perform the larger operation.
+    Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
+    if (Node->getOpcode() == ISD::CTTZ) {
+      // If Tmp1 == sizeinbits(NVT) then Tmp1 = sizeinbits(Old VT)
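+      // (e.g. promoting an i8 CTTZ to i32: a zero input makes the wide CTTZ
+      // return 32, which must be mapped back to 8.)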
+      Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT),
+                          Tmp1, DAG.getConstant(NVT.getSizeInBits(), NVT),
+                          ISD::SETEQ);
+      Tmp1 = DAG.getNode(ISD::SELECT, dl, NVT, Tmp2,
+                          DAG.getConstant(OVT.getSizeInBits(), NVT), Tmp1);
+    } else if (Node->getOpcode() == ISD::CTLZ) {
+      // Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
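+      // (e.g. promoting an i8 CTLZ to i32 subtracts 32 - 8 = 24 from the
+      // wide result.)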
+      Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
+                          DAG.getConstant(NVT.getSizeInBits() -
+                                          OVT.getSizeInBits(), NVT));
+    }
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
+    break;
+  case ISD::BSWAP: {
+    unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
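+    // The byte-swapped value lands in the high bits of NVT, so shift it back
+    // down by the size difference (e.g. an i16 promoted to i32: SRL by 16).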
+    Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+    Tmp1 = DAG.getNode(ISD::BSWAP, dl, NVT, Tmp1);
+    Tmp1 = DAG.getNode(ISD::SRL, dl, NVT, Tmp1,
+                          DAG.getConstant(DiffBits, TLI.getShiftAmountTy()));
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::FP_TO_UINT:
+  case ISD::FP_TO_SINT:
+    Tmp1 = PromoteLegalFP_TO_INT(Node->getOperand(0), Node->getValueType(0),
+                                 Node->getOpcode() == ISD::FP_TO_SINT, dl);
+    Results.push_back(Tmp1);
+    break;
+  case ISD::UINT_TO_FP:
+  case ISD::SINT_TO_FP:
+    Tmp1 = PromoteLegalINT_TO_FP(Node->getOperand(0), Node->getValueType(0),
+                                 Node->getOpcode() == ISD::SINT_TO_FP, dl);
+    Results.push_back(Tmp1);
+    break;
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    unsigned ExtOp, TruncOp;
+    if (OVT.isVector()) {
+      ExtOp   = ISD::BIT_CONVERT;
+      TruncOp = ISD::BIT_CONVERT;
+    } else {
+      assert(OVT.isInteger() && "Cannot promote logic operation");
+      ExtOp   = ISD::ANY_EXTEND;
+      TruncOp = ISD::TRUNCATE;
+    }
+    // Promote each of the values to the new type.
+    Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
+    Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+    // Perform the larger operation, then convert back
+    Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2);
+    Results.push_back(DAG.getNode(TruncOp, dl, OVT, Tmp1));
+    break;
+  }
+  case ISD::SELECT: {
+    unsigned ExtOp, TruncOp;
+    if (Node->getValueType(0).isVector()) {
+      ExtOp   = ISD::BIT_CONVERT;
+      TruncOp = ISD::BIT_CONVERT;
+    } else if (Node->getValueType(0).isInteger()) {
+      ExtOp   = ISD::ANY_EXTEND;
+      TruncOp = ISD::TRUNCATE;
+    } else {
+      ExtOp   = ISD::FP_EXTEND;
+      TruncOp = ISD::FP_ROUND;
+    }
+    Tmp1 = Node->getOperand(0);
+    // Promote each of the values to the new type.
+    Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+    Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
+    // Perform the larger operation, then round down.
+    Tmp1 = DAG.getNode(ISD::SELECT, dl, NVT, Tmp1, Tmp2, Tmp3);
+    if (TruncOp != ISD::FP_ROUND)
+      Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1);
+    else
+      Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1,
+                         DAG.getIntPtrConstant(0));
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::VECTOR_SHUFFLE: {
+    SmallVector<int, 8> Mask;
+    cast<ShuffleVectorSDNode>(Node)->getMask(Mask);
+
+    // Cast the two input vectors.
+    Tmp1 = DAG.getNode(ISD::BIT_CONVERT, dl, NVT, Node->getOperand(0));
+    Tmp2 = DAG.getNode(ISD::BIT_CONVERT, dl, NVT, Node->getOperand(1));
+
+    // Convert the shuffle mask to the right # elements.
+    Tmp1 = ShuffleWithNarrowerEltType(NVT, OVT, dl, Tmp1, Tmp2, Mask);
+    Tmp1 = DAG.getNode(ISD::BIT_CONVERT, dl, OVT, Tmp1);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::SETCC: {
+    unsigned ExtOp = ISD::FP_EXTEND;
+    if (NVT.isInteger()) {
+      ISD::CondCode CCCode =
+        cast<CondCodeSDNode>(Node->getOperand(2))->get();
+      ExtOp = isSignedIntSetCC(CCCode) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    }
+    Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
+    Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+    Results.push_back(DAG.getNode(ISD::SETCC, dl, Node->getValueType(0),
+                                  Tmp1, Tmp2, Node->getOperand(2)));
+    break;
+  }
+  }
+}
+
+// SelectionDAG::Legalize - This is the entry point for the file.
+//
+void SelectionDAG::Legalize(CodeGenOpt::Level OptLevel) {
+  /// Construct a SelectionDAGLegalize and use it to legalize this DAG.
+  SelectionDAGLegalize(*this, OptLevel).LegalizeDAG();
+}
+
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
new file mode 100644
index 0000000..650ee5a
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -0,0 +1,1427 @@
+//===-------- LegalizeFloatTypes.cpp - Legalization of float types --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements float type expansion and softening for LegalizeTypes.
+// Softening is the act of turning a computation in an illegal floating point
+// type into a computation in an integer type of the same size; also known as
+// "soft float".  For example, turning f32 arithmetic into operations using i32.
+// The resulting integer value is the same as what you would get by performing
+// the floating point operation and bitcasting the result to the integer type.
+// Expansion is the act of changing a computation in an illegal type to be a
+// computation in two identical registers of a smaller type.  For example,
+// implementing ppcf128 arithmetic in two f64 registers.
+//
+//===----------------------------------------------------------------------===//
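+//
+// As an informal illustration (not part of the implementation): softening an
+// f32 addition such as
+//
+//   %r = fadd float %a, %b
+//
+// conceptually becomes a libcall on the i32 bit patterns,
+//
+//   %ai = bitcast float %a to i32
+//   %bi = bitcast float %b to i32
+//   %ri = call i32 @__addsf3(i32 %ai, i32 %bi)   ; RTLIB::ADD_F32
+//
+// where __addsf3 is the conventional libgcc/compiler-rt name; the actual
+// routine is whatever the target's RTLIB tables specify.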
+
+#include "LegalizeTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// GetFPLibCall - Return the right libcall for the given floating point type.
+static RTLIB::Libcall GetFPLibCall(EVT VT,
+                                   RTLIB::Libcall Call_F32,
+                                   RTLIB::Libcall Call_F64,
+                                   RTLIB::Libcall Call_F80,
+                                   RTLIB::Libcall Call_PPCF128) {
+  return
+    VT == MVT::f32 ? Call_F32 :
+    VT == MVT::f64 ? Call_F64 :
+    VT == MVT::f80 ? Call_F80 :
+    VT == MVT::ppcf128 ? Call_PPCF128 :
+    RTLIB::UNKNOWN_LIBCALL;
+}
+
+//===----------------------------------------------------------------------===//
+//  Result Float to Integer Conversion.
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue R = SDValue();
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "SoftenFloatResult #" << ResNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to soften the result of this operator!");
+
+    case ISD::BIT_CONVERT: R = SoftenFloatRes_BIT_CONVERT(N); break;
+    case ISD::BUILD_PAIR:  R = SoftenFloatRes_BUILD_PAIR(N); break;
+    case ISD::ConstantFP:
+      R = SoftenFloatRes_ConstantFP(cast<ConstantFPSDNode>(N));
+      break;
+    case ISD::EXTRACT_VECTOR_ELT:
+      R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break;
+    case ISD::FABS:        R = SoftenFloatRes_FABS(N); break;
+    case ISD::FADD:        R = SoftenFloatRes_FADD(N); break;
+    case ISD::FCEIL:       R = SoftenFloatRes_FCEIL(N); break;
+    case ISD::FCOPYSIGN:   R = SoftenFloatRes_FCOPYSIGN(N); break;
+    case ISD::FCOS:        R = SoftenFloatRes_FCOS(N); break;
+    case ISD::FDIV:        R = SoftenFloatRes_FDIV(N); break;
+    case ISD::FEXP:        R = SoftenFloatRes_FEXP(N); break;
+    case ISD::FEXP2:       R = SoftenFloatRes_FEXP2(N); break;
+    case ISD::FFLOOR:      R = SoftenFloatRes_FFLOOR(N); break;
+    case ISD::FLOG:        R = SoftenFloatRes_FLOG(N); break;
+    case ISD::FLOG2:       R = SoftenFloatRes_FLOG2(N); break;
+    case ISD::FLOG10:      R = SoftenFloatRes_FLOG10(N); break;
+    case ISD::FMUL:        R = SoftenFloatRes_FMUL(N); break;
+    case ISD::FNEARBYINT:  R = SoftenFloatRes_FNEARBYINT(N); break;
+    case ISD::FNEG:        R = SoftenFloatRes_FNEG(N); break;
+    case ISD::FP_EXTEND:   R = SoftenFloatRes_FP_EXTEND(N); break;
+    case ISD::FP_ROUND:    R = SoftenFloatRes_FP_ROUND(N); break;
+    case ISD::FP16_TO_FP32:R = SoftenFloatRes_FP16_TO_FP32(N); break;
+    case ISD::FPOW:        R = SoftenFloatRes_FPOW(N); break;
+    case ISD::FPOWI:       R = SoftenFloatRes_FPOWI(N); break;
+    case ISD::FREM:        R = SoftenFloatRes_FREM(N); break;
+    case ISD::FRINT:       R = SoftenFloatRes_FRINT(N); break;
+    case ISD::FSIN:        R = SoftenFloatRes_FSIN(N); break;
+    case ISD::FSQRT:       R = SoftenFloatRes_FSQRT(N); break;
+    case ISD::FSUB:        R = SoftenFloatRes_FSUB(N); break;
+    case ISD::FTRUNC:      R = SoftenFloatRes_FTRUNC(N); break;
+    case ISD::LOAD:        R = SoftenFloatRes_LOAD(N); break;
+    case ISD::SELECT:      R = SoftenFloatRes_SELECT(N); break;
+    case ISD::SELECT_CC:   R = SoftenFloatRes_SELECT_CC(N); break;
+    case ISD::SINT_TO_FP:
+    case ISD::UINT_TO_FP:  R = SoftenFloatRes_XINT_TO_FP(N); break;
+    case ISD::UNDEF:       R = SoftenFloatRes_UNDEF(N); break;
+    case ISD::VAARG:       R = SoftenFloatRes_VAARG(N); break;
+  }
+
+  // If R is null, the sub-method took care of registering the result.
+  if (R.getNode())
+    SetSoftenedFloat(SDValue(N, ResNo), R);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_BIT_CONVERT(SDNode *N) {
+  return BitConvertToInteger(N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) {
+  // Convert the inputs to integers, and build a new pair out of them.
+  return DAG.getNode(ISD::BUILD_PAIR, N->getDebugLoc(),
+                     TLI.getTypeToTransformTo(*DAG.getContext(),
+                                              N->getValueType(0)),
+                     BitConvertToInteger(N->getOperand(0)),
+                     BitConvertToInteger(N->getOperand(1)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(ConstantFPSDNode *N) {
+  return DAG.getConstant(N->getValueAPF().bitcastToAPInt(),
+                         TLI.getTypeToTransformTo(*DAG.getContext(),
+                                                  N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+  SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(),
+                     NewOp.getValueType().getVectorElementType(),
+                     NewOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned Size = NVT.getSizeInBits();
+
+  // Mask = ~(1 << (Size-1))
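+  // e.g. for a 32-bit value the mask is 0x7FFFFFFF, so the AND below clears
+  // only the sign bit.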
+  SDValue Mask = DAG.getConstant(APInt::getAllOnesValue(Size).clear(Size-1),
+                                 NVT);
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return DAG.getNode(ISD::AND, N->getDebugLoc(), NVT, Op, Mask);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::ADD_F32,
+                                  RTLIB::ADD_F64,
+                                  RTLIB::ADD_F80,
+                                  RTLIB::ADD_PPCF128),
+                     NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::CEIL_F32,
+                                  RTLIB::CEIL_F64,
+                                  RTLIB::CEIL_F80,
+                                  RTLIB::CEIL_PPCF128),
+                     NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) {
+  SDValue LHS = GetSoftenedFloat(N->getOperand(0));
+  SDValue RHS = BitConvertToInteger(N->getOperand(1));
+  DebugLoc dl = N->getDebugLoc();
+
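+  // Integer-level sketch of what follows, assuming for simplicity that the
+  // operands have the same size:
+  //   copysign(x, y) = (x & (signmask - 1)) | (y & signmask),
+  // where signmask = 1 << (size - 1).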
+  EVT LVT = LHS.getValueType();
+  EVT RVT = RHS.getValueType();
+
+  unsigned LSize = LVT.getSizeInBits();
+  unsigned RSize = RVT.getSizeInBits();
+
+  // First get the sign bit of second operand.
+  SDValue SignBit = DAG.getNode(ISD::SHL, dl, RVT, DAG.getConstant(1, RVT),
+                                  DAG.getConstant(RSize - 1,
+                                                  TLI.getShiftAmountTy()));
+  SignBit = DAG.getNode(ISD::AND, dl, RVT, RHS, SignBit);
+
+  // Adjust the position of the sign bit if the two operands have different
+  // sizes: shift right and truncate, or any-extend and shift left.
+  int SizeDiff = RVT.getSizeInBits() - LVT.getSizeInBits();
+  if (SizeDiff > 0) {
+    SignBit = DAG.getNode(ISD::SRL, dl, RVT, SignBit,
+                          DAG.getConstant(SizeDiff, TLI.getShiftAmountTy()));
+    SignBit = DAG.getNode(ISD::TRUNCATE, dl, LVT, SignBit);
+  } else if (SizeDiff < 0) {
+    SignBit = DAG.getNode(ISD::ANY_EXTEND, dl, LVT, SignBit);
+    SignBit = DAG.getNode(ISD::SHL, dl, LVT, SignBit,
+                          DAG.getConstant(-SizeDiff, TLI.getShiftAmountTy()));
+  }
+
+  // Clear the sign bit of the first operand.
+  SDValue Mask = DAG.getNode(ISD::SHL, dl, LVT, DAG.getConstant(1, LVT),
+                               DAG.getConstant(LSize - 1,
+                                               TLI.getShiftAmountTy()));
+  Mask = DAG.getNode(ISD::SUB, dl, LVT, Mask, DAG.getConstant(1, LVT));
+  LHS = DAG.getNode(ISD::AND, dl, LVT, LHS, Mask);
+
+  // Or the value with the sign bit.
+  return DAG.getNode(ISD::OR, dl, LVT, LHS, SignBit);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::COS_F32,
+                                  RTLIB::COS_F64,
+                                  RTLIB::COS_F80,
+                                  RTLIB::COS_PPCF128),
+                     NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::DIV_F32,
+                                  RTLIB::DIV_F64,
+                                  RTLIB::DIV_F80,
+                                  RTLIB::DIV_PPCF128),
+                     NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::EXP_F32,
+                                  RTLIB::EXP_F64,
+                                  RTLIB::EXP_F80,
+                                  RTLIB::EXP_PPCF128),
+                     NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::EXP2_F32,
+                                  RTLIB::EXP2_F64,
+                                  RTLIB::EXP2_F80,
+                                  RTLIB::EXP2_PPCF128),
+                     NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::FLOOR_F32,
+                                  RTLIB::FLOOR_F64,
+                                  RTLIB::FLOOR_F80,
+                                  RTLIB::FLOOR_PPCF128),
+                     NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::LOG_F32,
+                                  RTLIB::LOG_F64,
+                                  RTLIB::LOG_F80,
+                                  RTLIB::LOG_PPCF128),
+                     NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::LOG2_F32,
+                                  RTLIB::LOG2_F64,
+                                  RTLIB::LOG2_F80,
+                                  RTLIB::LOG2_PPCF128),
+                     NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::LOG10_F32,
+                                  RTLIB::LOG10_F64,
+                                  RTLIB::LOG10_F80,
+                                  RTLIB::LOG10_PPCF128),
+                     NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::MUL_F32,
+                                  RTLIB::MUL_F64,
+                                  RTLIB::MUL_F80,
+                                  RTLIB::MUL_PPCF128),
+                     NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::NEARBYINT_F32,
+                                  RTLIB::NEARBYINT_F64,
+                                  RTLIB::NEARBYINT_F80,
+                                  RTLIB::NEARBYINT_PPCF128),
+                     NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  // Expand Y = FNEG(X) -> Y = SUB -0.0, X
+  SDValue Ops[2] = { DAG.getConstantFP(-0.0, N->getValueType(0)),
+                     GetSoftenedFloat(N->getOperand(0)) };
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::SUB_F32,
+                                  RTLIB::SUB_F64,
+                                  RTLIB::SUB_F80,
+                                  RTLIB::SUB_PPCF128),
+                     NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = N->getOperand(0);
+  RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
+  return MakeLibCall(LC, NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+// FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special
+// nodes?
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP32(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = N->getOperand(0);
+  return MakeLibCall(RTLIB::FPEXT_F16_F32, NVT, &Op, 1, false,
+                     N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = N->getOperand(0);
+  RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0));
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
+  return MakeLibCall(LC, NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::POW_F32,
+                                  RTLIB::POW_F64,
+                                  RTLIB::POW_F80,
+                                  RTLIB::POW_PPCF128),
+                     NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
+  assert(N->getOperand(1).getValueType() == MVT::i32 &&
+         "Unsupported power type!");
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), N->getOperand(1) };
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::POWI_F32,
+                                  RTLIB::POWI_F64,
+                                  RTLIB::POWI_F80,
+                                  RTLIB::POWI_PPCF128),
+                     NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::REM_F32,
+                                  RTLIB::REM_F64,
+                                  RTLIB::REM_F80,
+                                  RTLIB::REM_PPCF128),
+                     NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::RINT_F32,
+                                  RTLIB::RINT_F64,
+                                  RTLIB::RINT_F80,
+                                  RTLIB::RINT_PPCF128),
+                     NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::SIN_F32,
+                                  RTLIB::SIN_F64,
+                                  RTLIB::SIN_F80,
+                                  RTLIB::SIN_PPCF128),
+                     NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::SQRT_F32,
+                                  RTLIB::SQRT_F64,
+                                  RTLIB::SQRT_F80,
+                                  RTLIB::SQRT_PPCF128),
+                     NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::SUB_F32,
+                                  RTLIB::SUB_F64,
+                                  RTLIB::SUB_F80,
+                                  RTLIB::SUB_PPCF128),
+                     NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::TRUNC_F32,
+                                  RTLIB::TRUNC_F64,
+                                  RTLIB::TRUNC_F80,
+                                  RTLIB::TRUNC_PPCF128),
+                     NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
+  LoadSDNode *L = cast<LoadSDNode>(N);
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue NewL;
+  if (L->getExtensionType() == ISD::NON_EXTLOAD) {
+    NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(),
+                       NVT, dl, L->getChain(), L->getBasePtr(), L->getOffset(),
+                       L->getSrcValue(), L->getSrcValueOffset(), NVT,
+                       L->isVolatile(), L->isNonTemporal(), L->getAlignment());
+    // Legalized the chain result - switch anything that used the old chain to
+    // use the new one.
+    ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+    return NewL;
+  }
+
+  // Do a non-extending load followed by FP_EXTEND.
+  NewL = DAG.getLoad(L->getAddressingMode(), ISD::NON_EXTLOAD,
+                     L->getMemoryVT(), dl, L->getChain(),
+                     L->getBasePtr(), L->getOffset(),
+                     L->getSrcValue(), L->getSrcValueOffset(),
+                     L->getMemoryVT(), L->isVolatile(),
+                     L->isNonTemporal(), L->getAlignment());
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+  return BitConvertToInteger(DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) {
+  SDValue LHS = GetSoftenedFloat(N->getOperand(1));
+  SDValue RHS = GetSoftenedFloat(N->getOperand(2));
+  return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
+                     LHS.getValueType(), N->getOperand(0), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) {
+  SDValue LHS = GetSoftenedFloat(N->getOperand(2));
+  SDValue RHS = GetSoftenedFloat(N->getOperand(3));
+  return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(),
+                     LHS.getValueType(), N->getOperand(0),
+                     N->getOperand(1), LHS, RHS, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_UNDEF(SDNode *N) {
+  return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(),
+                                               N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) {
+  SDValue Chain = N->getOperand(0); // Get the chain.
+  SDValue Ptr = N->getOperand(1); // Get the pointer.
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue NewVAARG;
+  NewVAARG = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2),
+                          N->getConstantOperandVal(3));
+
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1));
+  return NewVAARG;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
+  bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
+  EVT SVT = N->getOperand(0).getValueType();
+  EVT RVT = N->getValueType(0);
+  EVT NVT = EVT();
+  DebugLoc dl = N->getDebugLoc();
+
+  // If the input is not legal, e.g. i1 -> fp, then it needs to be promoted to
+  // a larger type, e.g. i8 -> fp.  Even if it is legal, no libcall may exactly
+  // match.  Look for an appropriate libcall.
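+  // (For instance, an i8 source is first extended to i32 and converted with
+  // the i32 -> f32 libcall, conventionally __floatsisf in the signed case;
+  // the exact entry point is target-dependent.)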
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  for (unsigned t = MVT::FIRST_INTEGER_VALUETYPE;
+       t <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL; ++t) {
+    NVT = (MVT::SimpleValueType)t;
+    // The source type needs to be big enough to hold the operand.
+    if (NVT.bitsGE(SVT))
+      LC = Signed ? RTLIB::getSINTTOFP(NVT, RVT)
+                  : RTLIB::getUINTTOFP(NVT, RVT);
+  }
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
+
+  // Sign/zero extend the argument if the libcall takes a larger type.
+  SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+                           NVT, N->getOperand(0));
+  return MakeLibCall(LC, TLI.getTypeToTransformTo(*DAG.getContext(), RVT),
+                     &Op, 1, false, dl);
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Operand Float to Integer Conversion.
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(dbgs() << "Soften float operand " << OpNo << ": "; N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "SoftenFloatOperand Op #" << OpNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to soften this operator's operand!");
+
+  case ISD::BIT_CONVERT: Res = SoftenFloatOp_BIT_CONVERT(N); break;
+  case ISD::BR_CC:       Res = SoftenFloatOp_BR_CC(N); break;
+  case ISD::FP_ROUND:    Res = SoftenFloatOp_FP_ROUND(N); break;
+  case ISD::FP_TO_SINT:  Res = SoftenFloatOp_FP_TO_SINT(N); break;
+  case ISD::FP_TO_UINT:  Res = SoftenFloatOp_FP_TO_UINT(N); break;
+  case ISD::FP32_TO_FP16:Res = SoftenFloatOp_FP32_TO_FP16(N); break;
+  case ISD::SELECT_CC:   Res = SoftenFloatOp_SELECT_CC(N); break;
+  case ISD::SETCC:       Res = SoftenFloatOp_SETCC(N); break;
+  case ISD::STORE:       Res = SoftenFloatOp_STORE(N, OpNo); break;
+  }
+
+  // If the result is null, the sub-method took care of registering results etc.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+/// SoftenSetCCOperands - Soften the operands of a comparison.  This code is
+/// shared among BR_CC, SELECT_CC, and SETCC handlers.
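+///
+/// For example, a SETUEQ of two f32 values typically expands to two libcalls
+/// whose results are OR'd together, conceptually
+///   (__unordsf2(a, b) != 0) | (__eqsf2(a, b) == 0)
+/// where the helper names follow the libgcc/compiler-rt convention; the
+/// actual calls and result condition codes come from the RTLIB tables.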
+void DAGTypeLegalizer::SoftenSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+                                           ISD::CondCode &CCCode, DebugLoc dl) {
+  SDValue LHSInt = GetSoftenedFloat(NewLHS);
+  SDValue RHSInt = GetSoftenedFloat(NewRHS);
+  EVT VT = NewLHS.getValueType();
+
+  assert((VT == MVT::f32 || VT == MVT::f64) && "Unsupported setcc type!");
+
+  // Expand into one or more soft-fp libcall(s).
+  RTLIB::Libcall LC1 = RTLIB::UNKNOWN_LIBCALL, LC2 = RTLIB::UNKNOWN_LIBCALL;
+  switch (CCCode) {
+  case ISD::SETEQ:
+  case ISD::SETOEQ:
+    LC1 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : RTLIB::OEQ_F64;
+    break;
+  case ISD::SETNE:
+  case ISD::SETUNE:
+    LC1 = (VT == MVT::f32) ? RTLIB::UNE_F32 : RTLIB::UNE_F64;
+    break;
+  case ISD::SETGE:
+  case ISD::SETOGE:
+    LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 : RTLIB::OGE_F64;
+    break;
+  case ISD::SETLT:
+  case ISD::SETOLT:
+    LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : RTLIB::OLT_F64;
+    break;
+  case ISD::SETLE:
+  case ISD::SETOLE:
+    LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 : RTLIB::OLE_F64;
+    break;
+  case ISD::SETGT:
+  case ISD::SETOGT:
+    LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 : RTLIB::OGT_F64;
+    break;
+  case ISD::SETUO:
+    LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 : RTLIB::UO_F64;
+    break;
+  case ISD::SETO:
+    LC1 = (VT == MVT::f32) ? RTLIB::O_F32 : RTLIB::O_F64;
+    break;
+  default:
+    LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 : RTLIB::UO_F64;
+    switch (CCCode) {
+    case ISD::SETONE:
+      // SETONE = SETOLT | SETOGT
+      LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : RTLIB::OLT_F64;
+      // Fallthrough
+    case ISD::SETUGT:
+      LC2 = (VT == MVT::f32) ? RTLIB::OGT_F32 : RTLIB::OGT_F64;
+      break;
+    case ISD::SETUGE:
+      LC2 = (VT == MVT::f32) ? RTLIB::OGE_F32 : RTLIB::OGE_F64;
+      break;
+    case ISD::SETULT:
+      LC2 = (VT == MVT::f32) ? RTLIB::OLT_F32 : RTLIB::OLT_F64;
+      break;
+    case ISD::SETULE:
+      LC2 = (VT == MVT::f32) ? RTLIB::OLE_F32 : RTLIB::OLE_F64;
+      break;
+    case ISD::SETUEQ:
+      LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : RTLIB::OEQ_F64;
+      break;
+    default: assert(false && "Do not know how to soften this setcc!");
+    }
+  }
+
+  // Use the target-specific return value for comparison libcalls.
+  EVT RetVT = TLI.getCmpLibcallReturnType();
+  SDValue Ops[2] = { LHSInt, RHSInt };
+  NewLHS = MakeLibCall(LC1, RetVT, Ops, 2, false/*sign irrelevant*/, dl);
+  NewRHS = DAG.getConstant(0, RetVT);
+  CCCode = TLI.getCmpLibcallCC(LC1);
+  if (LC2 != RTLIB::UNKNOWN_LIBCALL) {
+    SDValue Tmp = DAG.getNode(ISD::SETCC, dl, TLI.getSetCCResultType(RetVT),
+                                NewLHS, NewRHS, DAG.getCondCode(CCCode));
+    NewLHS = MakeLibCall(LC2, RetVT, Ops, 2, false/*sign irrelevant*/, dl);
+    NewLHS = DAG.getNode(ISD::SETCC, dl, TLI.getSetCCResultType(RetVT), NewLHS,
+                         NewRHS, DAG.getCondCode(TLI.getCmpLibcallCC(LC2)));
+    NewLHS = DAG.getNode(ISD::OR, dl, Tmp.getValueType(), Tmp, NewLHS);
+    NewRHS = SDValue();
+  }
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_BIT_CONVERT(SDNode *N) {
+  return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), N->getValueType(0),
+                     GetSoftenedFloat(N->getOperand(0)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
+  EVT SVT = N->getOperand(0).getValueType();
+  EVT RVT = N->getValueType(0);
+
+  RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, RVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall");
+
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(LC, RVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+  SoftenSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If SoftenSetCCOperands returned a scalar, we need to compare the result
+  // against zero to select between true and false values.
+  if (NewRHS.getNode() == 0) {
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    CCCode = ISD::SETNE;
+  }
+
+  // Update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                DAG.getCondCode(CCCode), NewLHS, NewRHS,
+                                N->getOperand(4)),
+                 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_SINT(SDNode *N) {
+  EVT RVT = N->getValueType(0);
+  RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(LC, RVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) {
+  EVT RVT = N->getValueType(0);
+  RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(LC, RVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP32_TO_FP16(SDNode *N) {
+  EVT RVT = N->getValueType(0);
+  RTLIB::Libcall LC = RTLIB::FPROUND_F32_F16;
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return MakeLibCall(LC, RVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+  SoftenSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If SoftenSetCCOperands returned a scalar, we need to compare the result
+  // against zero to select between true and false values.
+  if (NewRHS.getNode() == 0) {
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    CCCode = ISD::SETNE;
+  }
+
+  // Update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+                                N->getOperand(2), N->getOperand(3),
+                                DAG.getCondCode(CCCode)),
+                 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  SoftenSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If SoftenSetCCOperands returned a scalar, use it.
+  if (NewRHS.getNode() == 0) {
+    assert(NewLHS.getValueType() == N->getValueType(0) &&
+           "Unexpected setcc expansion!");
+    return NewLHS;
+  }
+
+  // Otherwise, update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+                                DAG.getCondCode(CCCode)),
+                 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) {
+  assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+  assert(OpNo == 1 && "Can only soften the stored value!");
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+  SDValue Val = ST->getValue();
+  DebugLoc dl = N->getDebugLoc();
+
+  if (ST->isTruncatingStore())
+    // Do an FP_ROUND followed by a non-truncating store.
+    Val = BitConvertToInteger(DAG.getNode(ISD::FP_ROUND, dl, ST->getMemoryVT(),
+                                          Val, DAG.getIntPtrConstant(0)));
+  else
+    Val = GetSoftenedFloat(Val);
+
+  return DAG.getStore(ST->getChain(), dl, Val, ST->getBasePtr(),
+                      ST->getSrcValue(), ST->getSrcValueOffset(),
+                      ST->isVolatile(), ST->isNonTemporal(),
+                      ST->getAlignment());
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Float Result Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandFloatResult - This method is called when the specified result of the
+/// specified node is found to need expansion.  At this point, the node may also
+/// have invalid operands or may have other results that need promotion; we just
+/// know that (at least) one result needs expansion.
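+///
+/// For example, expanding an FADD of two ppcf128 values emits the PPCF128 add
+/// libcall below and splits the returned value into a Lo/Hi pair of f64s.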
+void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Expand float result: "; N->dump(&DAG); dbgs() << "\n");
+  SDValue Lo, Hi;
+  Lo = Hi = SDValue();
+
+  // See if the target wants to custom expand this node.
+  if (CustomLowerNode(N, N->getValueType(ResNo), true))
+    return;
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "ExpandFloatResult #" << ResNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to expand the result of this operator!");
+
+  case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, Lo, Hi); break;
+  case ISD::UNDEF:        SplitRes_UNDEF(N, Lo, Hi); break;
+  case ISD::SELECT:       SplitRes_SELECT(N, Lo, Hi); break;
+  case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
+
+  case ISD::BIT_CONVERT:        ExpandRes_BIT_CONVERT(N, Lo, Hi); break;
+  case ISD::BUILD_PAIR:         ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
+  case ISD::EXTRACT_ELEMENT:    ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
+  case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break;
+  case ISD::VAARG:              ExpandRes_VAARG(N, Lo, Hi); break;
+
+  case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break;
+  case ISD::FABS:       ExpandFloatRes_FABS(N, Lo, Hi); break;
+  case ISD::FADD:       ExpandFloatRes_FADD(N, Lo, Hi); break;
+  case ISD::FCEIL:      ExpandFloatRes_FCEIL(N, Lo, Hi); break;
+  case ISD::FCOPYSIGN:  ExpandFloatRes_FCOPYSIGN(N, Lo, Hi); break;
+  case ISD::FCOS:       ExpandFloatRes_FCOS(N, Lo, Hi); break;
+  case ISD::FDIV:       ExpandFloatRes_FDIV(N, Lo, Hi); break;
+  case ISD::FEXP:       ExpandFloatRes_FEXP(N, Lo, Hi); break;
+  case ISD::FEXP2:      ExpandFloatRes_FEXP2(N, Lo, Hi); break;
+  case ISD::FFLOOR:     ExpandFloatRes_FFLOOR(N, Lo, Hi); break;
+  case ISD::FLOG:       ExpandFloatRes_FLOG(N, Lo, Hi); break;
+  case ISD::FLOG2:      ExpandFloatRes_FLOG2(N, Lo, Hi); break;
+  case ISD::FLOG10:     ExpandFloatRes_FLOG10(N, Lo, Hi); break;
+  case ISD::FMUL:       ExpandFloatRes_FMUL(N, Lo, Hi); break;
+  case ISD::FNEARBYINT: ExpandFloatRes_FNEARBYINT(N, Lo, Hi); break;
+  case ISD::FNEG:       ExpandFloatRes_FNEG(N, Lo, Hi); break;
+  case ISD::FP_EXTEND:  ExpandFloatRes_FP_EXTEND(N, Lo, Hi); break;
+  case ISD::FPOW:       ExpandFloatRes_FPOW(N, Lo, Hi); break;
+  case ISD::FPOWI:      ExpandFloatRes_FPOWI(N, Lo, Hi); break;
+  case ISD::FRINT:      ExpandFloatRes_FRINT(N, Lo, Hi); break;
+  case ISD::FSIN:       ExpandFloatRes_FSIN(N, Lo, Hi); break;
+  case ISD::FSQRT:      ExpandFloatRes_FSQRT(N, Lo, Hi); break;
+  case ISD::FSUB:       ExpandFloatRes_FSUB(N, Lo, Hi); break;
+  case ISD::FTRUNC:     ExpandFloatRes_FTRUNC(N, Lo, Hi); break;
+  case ISD::LOAD:       ExpandFloatRes_LOAD(N, Lo, Hi); break;
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP: ExpandFloatRes_XINT_TO_FP(N, Lo, Hi); break;
+  }
+
+  // If Lo/Hi is null, the sub-method took care of registering results etc.
+  if (Lo.getNode())
+    SetExpandedFloat(SDValue(N, ResNo), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo,
+                                                 SDValue &Hi) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  assert(NVT.getSizeInBits() == integerPartWidth &&
+         "Do not know how to expand this float constant!");
+  APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt();
+  Lo = DAG.getConstantFP(APFloat(APInt(integerPartWidth, 1,
+                                       &C.getRawData()[1])), NVT);
+  Hi = DAG.getConstantFP(APFloat(APInt(integerPartWidth, 1,
+                                       &C.getRawData()[0])), NVT);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FABS(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  assert(N->getValueType(0) == MVT::ppcf128 &&
+         "Logic only correct for ppcf128!");
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Tmp;
+  GetExpandedFloat(N->getOperand(0), Lo, Tmp);
+  Hi = DAG.getNode(ISD::FABS, dl, Tmp.getValueType(), Tmp);
+  // Lo = Hi==fabs(Hi) ? Lo : -Lo;
+  Lo = DAG.getNode(ISD::SELECT_CC, dl, Lo.getValueType(), Tmp, Hi, Lo,
+                   DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo),
+                   DAG.getCondCode(ISD::SETEQ));
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FADD(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::ADD_F32, RTLIB::ADD_F64,
+                                         RTLIB::ADD_F80, RTLIB::ADD_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCEIL(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::CEIL_F32, RTLIB::CEIL_F64,
+                                         RTLIB::CEIL_F80, RTLIB::CEIL_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCOPYSIGN(SDNode *N,
+                                                SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::COPYSIGN_F32,
+                                         RTLIB::COPYSIGN_F64,
+                                         RTLIB::COPYSIGN_F80,
+                                         RTLIB::COPYSIGN_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCOS(SDNode *N,
+                                           SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::COS_F32, RTLIB::COS_F64,
+                                         RTLIB::COS_F80, RTLIB::COS_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SDValue Call = MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                          RTLIB::DIV_F32,
+                                          RTLIB::DIV_F64,
+                                          RTLIB::DIV_F80,
+                                          RTLIB::DIV_PPCF128),
+                             N->getValueType(0), Ops, 2, false,
+                             N->getDebugLoc());
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FEXP(SDNode *N,
+                                           SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::EXP_F32, RTLIB::EXP_F64,
+                                         RTLIB::EXP_F80, RTLIB::EXP_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FEXP2(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::EXP2_F32, RTLIB::EXP2_F64,
+                                         RTLIB::EXP2_F80, RTLIB::EXP2_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FFLOOR(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::FLOOR_F32, RTLIB::FLOOR_F64,
+                                         RTLIB::FLOOR_F80, RTLIB::FLOOR_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG(SDNode *N,
+                                           SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::LOG_F32, RTLIB::LOG_F64,
+                                         RTLIB::LOG_F80, RTLIB::LOG_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG2(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::LOG2_F32, RTLIB::LOG2_F64,
+                                         RTLIB::LOG2_F80, RTLIB::LOG2_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG10(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::LOG10_F32, RTLIB::LOG10_F64,
+                                         RTLIB::LOG10_F80, RTLIB::LOG10_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SDValue Call = MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                          RTLIB::MUL_F32,
+                                          RTLIB::MUL_F64,
+                                          RTLIB::MUL_F80,
+                                          RTLIB::MUL_PPCF128),
+                             N->getValueType(0), Ops, 2, false,
+                             N->getDebugLoc());
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FNEARBYINT(SDNode *N,
+                                                 SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::NEARBYINT_F32,
+                                         RTLIB::NEARBYINT_F64,
+                                         RTLIB::NEARBYINT_F80,
+                                         RTLIB::NEARBYINT_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  GetExpandedFloat(N->getOperand(0), Lo, Hi);
+  Lo = DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo);
+  Hi = DAG.getNode(ISD::FNEG, dl, Hi.getValueType(), Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo,
+                                                SDValue &Hi) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
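+  // e.g. extending f64 to ppcf128: the high double represents the value
+  // exactly and the low double is zero.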
+  Hi = DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), NVT, N->getOperand(0));
+  Lo = DAG.getConstantFP(APFloat(APInt(NVT.getSizeInBits(), 0)), NVT);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FPOW(SDNode *N,
+                                           SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::POW_F32, RTLIB::POW_F64,
+                                         RTLIB::POW_F80, RTLIB::POW_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FPOWI(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::POWI_F32, RTLIB::POWI_F64,
+                                         RTLIB::POWI_F80, RTLIB::POWI_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FRINT(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::RINT_F32, RTLIB::RINT_F64,
+                                         RTLIB::RINT_F80, RTLIB::RINT_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSIN(SDNode *N,
+                                           SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::SIN_F32, RTLIB::SIN_F64,
+                                         RTLIB::SIN_F80, RTLIB::SIN_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSQRT(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::SQRT_F32, RTLIB::SQRT_F64,
+                                         RTLIB::SQRT_F80, RTLIB::SQRT_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SDValue Call = MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                          RTLIB::SUB_F32,
+                                          RTLIB::SUB_F64,
+                                          RTLIB::SUB_F80,
+                                          RTLIB::SUB_PPCF128),
+                             N->getValueType(0), Ops, 2, false,
+                             N->getDebugLoc());
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FTRUNC(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::TRUNC_F32, RTLIB::TRUNC_F64,
+                                         RTLIB::TRUNC_F80, RTLIB::TRUNC_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_LOAD(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  if (ISD::isNormalLoad(N)) {
+    ExpandRes_NormalLoad(N, Lo, Hi);
+    return;
+  }
+
+  assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  SDValue Chain = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0));
+  assert(NVT.isByteSized() && "Expanded type not byte sized!");
+  assert(LD->getMemoryVT().bitsLE(NVT) && "Float type not round?");
+
+  Hi = DAG.getExtLoad(LD->getExtensionType(), NVT, dl, Chain, Ptr,
+                      LD->getSrcValue(), LD->getSrcValueOffset(),
+                      LD->getMemoryVT(), LD->isVolatile(),
+                      LD->isNonTemporal(), LD->getAlignment());
+
+  // Remember the chain.
+  Chain = Hi.getValue(1);
+
+  // The low part is zero.
+  Lo = DAG.getConstantFP(APFloat(APInt(NVT.getSizeInBits(), 0)), NVT);
+
+  // Modified the chain - switch anything that used the old chain to use the
+  // new one.
+  ReplaceValueWith(SDValue(LD, 1), Chain);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
+                                                 SDValue &Hi) {
+  assert(N->getValueType(0) == MVT::ppcf128 && "Unsupported XINT_TO_FP!");
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  bool isSigned = N->getOpcode() == ISD::SINT_TO_FP;
+  DebugLoc dl = N->getDebugLoc();
+
+  // First do an SINT_TO_FP, whether the original was signed or unsigned.
+  // When promoting partial word types to i32 we must honor the signedness,
+  // though.
+  if (SrcVT.bitsLE(MVT::i32)) {
+    // The integer can be represented exactly in an f64.
+    Src = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+                      MVT::i32, Src);
+    Lo = DAG.getConstantFP(APFloat(APInt(NVT.getSizeInBits(), 0)), NVT);
+    Hi = DAG.getNode(ISD::SINT_TO_FP, dl, NVT, Src);
+  } else {
+    RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+    if (SrcVT.bitsLE(MVT::i64)) {
+      Src = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+                        MVT::i64, Src);
+      LC = RTLIB::SINTTOFP_I64_PPCF128;
+    } else if (SrcVT.bitsLE(MVT::i128)) {
+      Src = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i128, Src);
+      LC = RTLIB::SINTTOFP_I128_PPCF128;
+    }
+    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
+
+    Hi = MakeLibCall(LC, VT, &Src, 1, true, dl);
+    GetPairElements(Hi, Lo, Hi);
+  }
+
+  if (isSigned)
+    return;
+
+  // Unsigned - fix up the SINT_TO_FP value just calculated.
+  Hi = DAG.getNode(ISD::BUILD_PAIR, dl, VT, Lo, Hi);
+  SrcVT = Src.getValueType();
+
+  // x>=0 ? (ppcf128)(iN)x : (ppcf128)(iN)x + 2^N; N=32,64,128.
+  static const uint64_t TwoE32[]  = { 0x41f0000000000000LL, 0 };
+  static const uint64_t TwoE64[]  = { 0x43f0000000000000LL, 0 };
+  static const uint64_t TwoE128[] = { 0x47f0000000000000LL, 0 };
+  const uint64_t *Parts = 0;
+
+  switch (SrcVT.getSimpleVT().SimpleTy) {
+  default:
+    assert(false && "Unsupported UINT_TO_FP!");
+  case MVT::i32:
+    Parts = TwoE32;
+    break;
+  case MVT::i64:
+    Parts = TwoE64;
+    break;
+  case MVT::i128:
+    Parts = TwoE128;
+    break;
+  }
+
+  Lo = DAG.getNode(ISD::FADD, dl, VT, Hi,
+                   DAG.getConstantFP(APFloat(APInt(128, 2, Parts)),
+                                     MVT::ppcf128));
+  Lo = DAG.getNode(ISD::SELECT_CC, dl, VT, Src, DAG.getConstant(0, SrcVT),
+                   Lo, Hi, DAG.getCondCode(ISD::SETLT));
+  GetPairElements(Lo, Lo, Hi);
+}
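+
+// Editorial note, not part of the drop: the unsigned fixup above is the
+// standard "convert as signed, then correct" trick.  A scalar C++ analogue
+// for u32 -> double (helper name is ours; assumes two's complement and
+// <cstdint>):
+//
+//   double U32ToDouble(uint32_t X) {
+//     double D = (double)(int32_t)X;  // SINT_TO_FP on the raw bits
+//     if ((int32_t)X < 0)             // the SETLT test against zero above
+//       D += 4294967296.0;            // add 2^32 to undo the sign wrap
+//     return D;
+//   }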
+
+
+//===----------------------------------------------------------------------===//
+//  Float Operand Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandFloatOperand - This method is called when the specified operand of the
+/// specified node is found to need expansion.  At this point, all of the result
+/// types of the node are known to be legal, but other operands of the node may
+/// need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(dbgs() << "Expand float operand: "; N->dump(&DAG); dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  if (TLI.getOperationAction(N->getOpcode(), N->getOperand(OpNo).getValueType())
+      == TargetLowering::Custom)
+    Res = TLI.LowerOperation(SDValue(N, 0), DAG);
+
+  if (Res.getNode() == 0) {
+    switch (N->getOpcode()) {
+    default:
+  #ifndef NDEBUG
+      dbgs() << "ExpandFloatOperand Op #" << OpNo << ": ";
+      N->dump(&DAG); dbgs() << "\n";
+  #endif
+      llvm_unreachable("Do not know how to expand this operator's operand!");
+
+    case ISD::BIT_CONVERT:     Res = ExpandOp_BIT_CONVERT(N); break;
+    case ISD::BUILD_VECTOR:    Res = ExpandOp_BUILD_VECTOR(N); break;
+    case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break;
+
+    case ISD::BR_CC:      Res = ExpandFloatOp_BR_CC(N); break;
+    case ISD::FP_ROUND:   Res = ExpandFloatOp_FP_ROUND(N); break;
+    case ISD::FP_TO_SINT: Res = ExpandFloatOp_FP_TO_SINT(N); break;
+    case ISD::FP_TO_UINT: Res = ExpandFloatOp_FP_TO_UINT(N); break;
+    case ISD::SELECT_CC:  Res = ExpandFloatOp_SELECT_CC(N); break;
+    case ISD::SETCC:      Res = ExpandFloatOp_SETCC(N); break;
+    case ISD::STORE:      Res = ExpandFloatOp_STORE(cast<StoreSDNode>(N),
+                                                    OpNo); break;
+    }
+  }
+
+  // If the result is null, the sub-method took care of registering results etc.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+/// FloatExpandSetCCOperands - Expand the operands of a comparison.  This code
+/// is shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::FloatExpandSetCCOperands(SDValue &NewLHS,
+                                                SDValue &NewRHS,
+                                                ISD::CondCode &CCCode,
+                                                DebugLoc dl) {
+  SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+  GetExpandedFloat(NewLHS, LHSLo, LHSHi);
+  GetExpandedFloat(NewRHS, RHSLo, RHSHi);
+
+  EVT VT = NewLHS.getValueType();
+  assert(VT == MVT::ppcf128 && "Unsupported setcc type!");
+
+  // FIXME:  This generated code sucks.  We want to generate
+  //         FCMPU crN, hi1, hi2
+  //         BNE crN, L:
+  //         FCMPU crN, lo1, lo2
+  // The following can be improved, but not that much.
+  SDValue Tmp1, Tmp2, Tmp3;
+  Tmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+                      LHSHi, RHSHi, ISD::SETOEQ);
+  Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSLo.getValueType()),
+                      LHSLo, RHSLo, CCCode);
+  Tmp3 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2);
+  Tmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+                      LHSHi, RHSHi, ISD::SETUNE);
+  Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+                      LHSHi, RHSHi, CCCode);
+  Tmp1 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2);
+  NewLHS = DAG.getNode(ISD::OR, dl, Tmp1.getValueType(), Tmp1, Tmp3);
+  NewRHS = SDValue();   // LHS is the result, not a compare.
+}
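+
+// Editorial sketch, not from the drop: the dag built above is the usual
+// two-word comparison, i.e. "high parts decide unless they are equal".  A
+// scalar analogue (names ours):
+//
+//   bool TwoPartCmp(double LHi, double LLo, double RHi, double RLo,
+//                   bool (*CC)(double, double)) {
+//     return (LHi == RHi) ? CC(LLo, RLo)  // equal high parts: low decides
+//                         : CC(LHi, RHi); // otherwise high parts decide
+//   }
+//
+// The equality test is ordered (SETOEQ) and the inequality unordered
+// (SETUNE), so a NaN in the high parts routes the result through the
+// high-part comparison.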
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_BR_CC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+  FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If FloatExpandSetCCOperands returned a scalar, we need to compare the
+  // result against zero to select between true and false values.
+  if (NewRHS.getNode() == 0) {
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    CCCode = ISD::SETNE;
+  }
+
+  // Update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                DAG.getCondCode(CCCode), NewLHS, NewRHS,
+                                N->getOperand(4)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_ROUND(SDNode *N) {
+  assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
+         "Logic only correct for ppcf128!");
+  SDValue Lo, Hi;
+  GetExpandedFloat(N->getOperand(0), Lo, Hi);
+  // Round it the rest of the way (e.g. to f32) if needed.
+  return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(),
+                     N->getValueType(0), Hi, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) {
+  EVT RVT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
+  // PPC (the libcall is not available).  FIXME: Do this in a less hacky way.
+  if (RVT == MVT::i32) {
+    assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
+           "Logic only correct for ppcf128!");
+    SDValue Res = DAG.getNode(ISD::FP_ROUND_INREG, dl, MVT::ppcf128,
+                              N->getOperand(0), DAG.getValueType(MVT::f64));
+    Res = DAG.getNode(ISD::FP_ROUND, dl, MVT::f64, Res,
+                      DAG.getIntPtrConstant(1));
+    return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
+  }
+
+  RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
+  return MakeLibCall(LC, RVT, &N->getOperand(0), 1, false, dl);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
+  EVT RVT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
+  // PPC (the libcall is not available).  FIXME: Do this in a less hacky way.
+  if (RVT == MVT::i32) {
+    assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
+           "Logic only correct for ppcf128!");
+    const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
+    APFloat APF = APFloat(APInt(128, 2, TwoE31));
+    SDValue Tmp = DAG.getConstantFP(APF, MVT::ppcf128);
+    //  X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
+    // FIXME: generated code sucks.
+    return DAG.getNode(ISD::SELECT_CC, dl, MVT::i32, N->getOperand(0), Tmp,
+                       DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                   DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
+                                               DAG.getNode(ISD::FSUB, dl,
+                                                           MVT::ppcf128,
+                                                           N->getOperand(0),
+                                                           Tmp)),
+                                   DAG.getConstant(0x80000000, MVT::i32)),
+                       DAG.getNode(ISD::FP_TO_SINT, dl,
+                                   MVT::i32, N->getOperand(0)),
+                       DAG.getCondCode(ISD::SETGE));
+  }
+
+  RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
+  return MakeLibCall(LC, N->getValueType(0), &N->getOperand(0), 1, false, dl);
+}
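+
+// Editorial scalar analogue of the SELECT_CC above (name ours; assumes
+// two's complement and <cstdint>): double -> u32 when only a signed
+// conversion is available.
+//
+//   uint32_t DoubleToU32(double X) {
+//     const double TwoE31 = 2147483648.0;  // 2^31
+//     if (X >= TwoE31)                     // too big for a signed convert
+//       return (uint32_t)(int32_t)(X - TwoE31) + 0x80000000u;
+//     return (uint32_t)(int32_t)X;
+//   }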
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+  FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If FloatExpandSetCCOperands returned a scalar, we need to compare the
+  // result against zero to select between true and false values.
+  if (NewRHS.getNode() == 0) {
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    CCCode = ISD::SETNE;
+  }
+
+  // Update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+                                N->getOperand(2), N->getOperand(3),
+                                DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_SETCC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If FloatExpandSetCCOperands returned a scalar, use it.
+  if (NewRHS.getNode() == 0) {
+    assert(NewLHS.getValueType() == N->getValueType(0) &&
+           "Unexpected setcc expansion!");
+    return NewLHS;
+  }
+
+  // Otherwise, update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+                                DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) {
+  if (ISD::isNormalStore(N))
+    return ExpandOp_NormalStore(N, OpNo);
+
+  assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+  assert(OpNo == 1 && "Can only expand the stored value so far");
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+
+  SDValue Chain = ST->getChain();
+  SDValue Ptr = ST->getBasePtr();
+
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(),
+                                     ST->getValue().getValueType());
+  assert(NVT.isByteSized() && "Expanded type not byte sized!");
+  assert(ST->getMemoryVT().bitsLE(NVT) && "Float type not round?");
+
+  SDValue Lo, Hi;
+  GetExpandedOp(ST->getValue(), Lo, Hi);
+
+  return DAG.getTruncStore(Chain, N->getDebugLoc(), Hi, Ptr,
+                           ST->getSrcValue(), ST->getSrcValueOffset(),
+                           ST->getMemoryVT(), ST->isVolatile(),
+                           ST->isNonTemporal(), ST->getAlignment());
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
new file mode 100644
index 0000000..f8c5890
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -0,0 +1,2474 @@
+//===----- LegalizeIntegerTypes.cpp - Legalization of integer types -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements integer type expansion and promotion for LegalizeTypes.
+// Promotion is the act of changing a computation in an illegal type into a
+// computation in a larger type.  For example, implementing i8 arithmetic in an
+// i32 register (often needed on powerpc).
+// Expansion is the act of changing a computation in an illegal type into a
+// computation in two identical registers of a smaller type.  For example,
+// implementing i64 arithmetic in two i32 registers (often needed on 32-bit
+// targets).
+//
+//===----------------------------------------------------------------------===//
+
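+// Editorial illustration, not part of the drop: the two strategies above in
+// plain C++ (names ours, assuming <cstdint>):
+//
+//   // Promotion: i8 arithmetic carried out in an i32 register.
+//   int8_t add8(int8_t a, int8_t b) {
+//     int32_t wide = (int32_t)a + (int32_t)b;  // compute in the wide type
+//     return (int8_t)wide;                     // truncate on the way out
+//   }
+//
+//   // Expansion: i64 addition carried out in two i32 halves.
+//   void add64(uint32_t alo, uint32_t ahi, uint32_t blo, uint32_t bhi,
+//              uint32_t &lo, uint32_t &hi) {
+//     lo = alo + blo;
+//     hi = ahi + bhi + (lo < alo ? 1 : 0);     // propagate the carry
+//   }
+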
+#include "LegalizeTypes.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//  Integer Result Promotion
+//===----------------------------------------------------------------------===//
+
+/// PromoteIntegerResult - This method is called when a result of a node is
+/// found to be in need of promotion to a larger type.  At this point, the node
+/// may also have invalid operands or may have other results that need
+/// expansion; we just know that (at least) one result needs promotion.
+void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Promote integer result: "; N->dump(&DAG); dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  // See if the target wants to custom expand this node.
+  if (CustomLowerNode(N, N->getValueType(ResNo), true))
+    return;
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "PromoteIntegerResult #" << ResNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to promote this operator!");
+  case ISD::AssertSext:  Res = PromoteIntRes_AssertSext(N); break;
+  case ISD::AssertZext:  Res = PromoteIntRes_AssertZext(N); break;
+  case ISD::BIT_CONVERT: Res = PromoteIntRes_BIT_CONVERT(N); break;
+  case ISD::BSWAP:       Res = PromoteIntRes_BSWAP(N); break;
+  case ISD::BUILD_PAIR:  Res = PromoteIntRes_BUILD_PAIR(N); break;
+  case ISD::Constant:    Res = PromoteIntRes_Constant(N); break;
+  case ISD::CONVERT_RNDSAT:
+                         Res = PromoteIntRes_CONVERT_RNDSAT(N); break;
+  case ISD::CTLZ:        Res = PromoteIntRes_CTLZ(N); break;
+  case ISD::CTPOP:       Res = PromoteIntRes_CTPOP(N); break;
+  case ISD::CTTZ:        Res = PromoteIntRes_CTTZ(N); break;
+  case ISD::EXTRACT_VECTOR_ELT:
+                         Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
+  case ISD::LOAD:        Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N));break;
+  case ISD::SELECT:      Res = PromoteIntRes_SELECT(N); break;
+  case ISD::SELECT_CC:   Res = PromoteIntRes_SELECT_CC(N); break;
+  case ISD::SETCC:       Res = PromoteIntRes_SETCC(N); break;
+  case ISD::SHL:         Res = PromoteIntRes_SHL(N); break;
+  case ISD::SIGN_EXTEND_INREG:
+                         Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break;
+  case ISD::SRA:         Res = PromoteIntRes_SRA(N); break;
+  case ISD::SRL:         Res = PromoteIntRes_SRL(N); break;
+  case ISD::TRUNCATE:    Res = PromoteIntRes_TRUNCATE(N); break;
+  case ISD::UNDEF:       Res = PromoteIntRes_UNDEF(N); break;
+  case ISD::VAARG:       Res = PromoteIntRes_VAARG(N); break;
+
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:  Res = PromoteIntRes_INT_EXTEND(N); break;
+
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:  Res = PromoteIntRes_FP_TO_XINT(N); break;
+
+  case ISD::FP32_TO_FP16:Res = PromoteIntRes_FP32_TO_FP16(N); break;
+
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::MUL:         Res = PromoteIntRes_SimpleIntBinOp(N); break;
+
+  case ISD::SDIV:
+  case ISD::SREM:        Res = PromoteIntRes_SDIV(N); break;
+
+  case ISD::UDIV:
+  case ISD::UREM:        Res = PromoteIntRes_UDIV(N); break;
+
+  case ISD::SADDO:
+  case ISD::SSUBO:       Res = PromoteIntRes_SADDSUBO(N, ResNo); break;
+  case ISD::UADDO:
+  case ISD::USUBO:       Res = PromoteIntRes_UADDSUBO(N, ResNo); break;
+  case ISD::SMULO:
+  case ISD::UMULO:       Res = PromoteIntRes_XMULO(N, ResNo); break;
+
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_SWAP:
+    Res = PromoteIntRes_Atomic1(cast<AtomicSDNode>(N)); break;
+
+  case ISD::ATOMIC_CMP_SWAP:
+    Res = PromoteIntRes_Atomic2(cast<AtomicSDNode>(N)); break;
+  }
+
+  // If the result is null then the sub-method took care of registering it.
+  if (Res.getNode())
+    SetPromotedInteger(SDValue(N, ResNo), Res);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) {
+  // Sign-extend the new bits, and continue the assertion.
+  SDValue Op = SExtPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::AssertSext, N->getDebugLoc(),
+                     Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_AssertZext(SDNode *N) {
+  // Zero the new bits, and continue the assertion.
+  SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::AssertZext, N->getDebugLoc(),
+                     Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) {
+  SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+  SDValue Res = DAG.getAtomic(N->getOpcode(), N->getDebugLoc(),
+                              N->getMemoryVT(),
+                              N->getChain(), N->getBasePtr(),
+                              Op2, N->getSrcValue(), N->getAlignment());
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Atomic2(AtomicSDNode *N) {
+  SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+  SDValue Op3 = GetPromotedInteger(N->getOperand(3));
+  SDValue Res = DAG.getAtomic(N->getOpcode(), N->getDebugLoc(),
+                              N->getMemoryVT(), N->getChain(), N->getBasePtr(),
+                              Op2, Op3, N->getSrcValue(), N->getAlignment());
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BIT_CONVERT(SDNode *N) {
+  SDValue InOp = N->getOperand(0);
+  EVT InVT = InOp.getValueType();
+  EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  DebugLoc dl = N->getDebugLoc();
+
+  switch (getTypeAction(InVT)) {
+  default:
+    assert(false && "Unknown type action!");
+    break;
+  case Legal:
+    break;
+  case PromoteInteger:
+    if (NOutVT.bitsEq(NInVT))
+      // The input promotes to the same size.  Convert the promoted value.
+      return DAG.getNode(ISD::BIT_CONVERT, dl,
+                         NOutVT, GetPromotedInteger(InOp));
+    break;
+  case SoftenFloat:
+    // Promote the integer operand by hand.
+    return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp));
+  case ExpandInteger:
+  case ExpandFloat:
+    break;
+  case ScalarizeVector:
+    // Convert the element to an integer and promote it by hand.
+    return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
+                       BitConvertToInteger(GetScalarizedVector(InOp)));
+  case SplitVector: {
+    // For example, i32 = BIT_CONVERT v2i16 on alpha.  Convert the split
+    // pieces of the input into integers and reassemble in the final type.
+    SDValue Lo, Hi;
+    GetSplitVector(N->getOperand(0), Lo, Hi);
+    Lo = BitConvertToInteger(Lo);
+    Hi = BitConvertToInteger(Hi);
+
+    if (TLI.isBigEndian())
+      std::swap(Lo, Hi);
+
+    InOp = DAG.getNode(ISD::ANY_EXTEND, dl,
+                       EVT::getIntegerVT(*DAG.getContext(),
+                                         NOutVT.getSizeInBits()),
+                       JoinIntegers(Lo, Hi));
+    return DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, InOp);
+  }
+  case WidenVector:
+    if (OutVT.bitsEq(NInVT))
+      // The input is widened to the same size.  Convert to the widened value.
+      return DAG.getNode(ISD::BIT_CONVERT, dl, OutVT, GetWidenedVector(InOp));
+  }
+
+  return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
+                     CreateStackStoreLoad(InOp, OutVT));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  EVT OVT = N->getValueType(0);
+  EVT NVT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
+  return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op),
+                     DAG.getConstant(DiffBits, TLI.getPointerTy()));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) {
+  // The pair element type may be legal, or may not promote to the same type as
+  // the result, for example i14 = BUILD_PAIR (i7, i7).  Handle all cases.
+  return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(),
+                     TLI.getTypeToTransformTo(*DAG.getContext(),
+                     N->getValueType(0)), JoinIntegers(N->getOperand(0),
+                     N->getOperand(1)));
+}
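+
+// Editorial sketch (name ours): for i14 = BUILD_PAIR (i7 Lo, i7 Hi),
+// JoinIntegers forms Lo | (Hi << 7) before the ANY_EXTEND, roughly:
+//
+//   uint32_t JoinI7Pair(uint32_t Lo7, uint32_t Hi7) {
+//     return (Lo7 & 0x7F) | ((Hi7 & 0x7F) << 7);  // low bits, then high
+//   }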
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  // FIXME there is no actual debug info here
+  DebugLoc dl = N->getDebugLoc();
+  // Zero extend things like i1, sign extend everything else.  In theory it
+  // shouldn't matter which one we pick, but in practice this tends to give
+  // better code.
+  unsigned Opc = VT.isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+  SDValue Result = DAG.getNode(Opc, dl,
+                               TLI.getTypeToTransformTo(*DAG.getContext(), VT),
+                               SDValue(N, 0));
+  assert(isa<ConstantSDNode>(Result) && "Didn't constant fold ext?");
+  return Result;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CONVERT_RNDSAT(SDNode *N) {
+  ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+  assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU ||
+           CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU ||
+           CvtCode == ISD::CVT_SF || CvtCode == ISD::CVT_UF) &&
+          "can only promote integers");
+  EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  return DAG.getConvertRndSat(OutVT, N->getDebugLoc(), N->getOperand(0),
+                              N->getOperand(1), N->getOperand(2),
+                              N->getOperand(3), N->getOperand(4), CvtCode);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
+  // Zero extend to the promoted type and do the count there.
+  SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+  DebugLoc dl = N->getDebugLoc();
+  EVT OVT = N->getValueType(0);
+  EVT NVT = Op.getValueType();
+  Op = DAG.getNode(ISD::CTLZ, dl, NVT, Op);
+  // Subtract off the extra leading bits in the bigger type.
+  return DAG.getNode(ISD::SUB, dl, NVT, Op,
+                     DAG.getConstant(NVT.getSizeInBits() -
+                                     OVT.getSizeInBits(), NVT));
+}
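+
+// Editorial example (name ours): ctlz of an i8 value promoted to i32 sees
+// 24 extra leading zeros, which the SUB above removes:
+//
+//   uint8_t Ctlz8Via32(uint8_t X) {
+//     // __builtin_clz is undefined for 0, as is the generic CTLZ here,
+//     // so assume X != 0 in this sketch.
+//     return (uint8_t)(__builtin_clz((uint32_t)X) - 24);
+//   }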
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP(SDNode *N) {
+  // Zero extend to the promoted type and do the count there.
+  SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::CTPOP, N->getDebugLoc(), Op.getValueType(), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  EVT OVT = N->getValueType(0);
+  EVT NVT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+  // The count is the same in the promoted type except if the original
+  // value was zero.  This can be handled by setting the bit just off
+  // the top of the original type.
+  APInt TopBit(NVT.getSizeInBits(), 0);
+  TopBit.set(OVT.getSizeInBits());
+  Op = DAG.getNode(ISD::OR, dl, NVT, Op, DAG.getConstant(TopBit, NVT));
+  return DAG.getNode(ISD::CTTZ, dl, NVT, Op);
+}
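+
+// Editorial example of the top-bit trick (name ours), i8 cttz done in i32:
+//
+//   uint8_t Cttz8Via32(uint8_t X) {
+//     uint32_t Wide = (uint32_t)X | 0x100u;  // set bit 8, just above i8
+//     return (uint8_t)__builtin_ctz(Wide);   // X == 0 now yields 8
+//   }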
+
+SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NVT, N->getOperand(0),
+                     N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned NewOpc = N->getOpcode();
+  DebugLoc dl = N->getDebugLoc();
+
+  // If we're promoting a UINT to a larger size and the larger FP_TO_UINT is
+  // not Legal, check to see if we can use FP_TO_SINT instead.  (If both UINT
+  // and SINT conversions are Custom, there is no way to tell which is
+  // preferable. We choose SINT because that's the right thing on PPC.)
+  if (N->getOpcode() == ISD::FP_TO_UINT &&
+      !TLI.isOperationLegal(ISD::FP_TO_UINT, NVT) &&
+      TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT))
+    NewOpc = ISD::FP_TO_SINT;
+
+  SDValue Res = DAG.getNode(NewOpc, dl, NVT, N->getOperand(0));
+
+  // Assert that the converted value fits in the original type.  If it doesn't
+  // (eg: because the value being converted is too big), then the result of the
+  // original operation was undefined anyway, so the assert is still correct.
+  return DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ?
+                     ISD::AssertZext : ISD::AssertSext, dl,
+                     NVT, Res, DAG.getValueType(N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FP32_TO_FP16(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+
+  return DAG.getNode(ISD::AssertZext, dl,
+                     NVT, Res, DAG.getValueType(N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
+
+  if (getTypeAction(N->getOperand(0).getValueType()) == PromoteInteger) {
+    SDValue Res = GetPromotedInteger(N->getOperand(0));
+    assert(Res.getValueType().bitsLE(NVT) && "Extension doesn't make sense!");
+
+    // If the result and operand types are the same after promotion, simplify
+    // to an in-register extension.
+    if (NVT == Res.getValueType()) {
+      // The high bits are not guaranteed to be anything.  Insert an extend.
+      if (N->getOpcode() == ISD::SIGN_EXTEND)
+        return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res,
+                           DAG.getValueType(N->getOperand(0).getValueType()));
+      if (N->getOpcode() == ISD::ZERO_EXTEND)
+        return DAG.getZeroExtendInReg(Res, dl, N->getOperand(0).getValueType());
+      assert(N->getOpcode() == ISD::ANY_EXTEND && "Unknown integer extension!");
+      return Res;
+    }
+  }
+
+  // Otherwise, just extend the original operand all the way to the larger type.
+  return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
+  assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  ISD::LoadExtType ExtType =
+    ISD::isNON_EXTLoad(N) ? ISD::EXTLOAD : N->getExtensionType();
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Res = DAG.getExtLoad(ExtType, NVT, dl, N->getChain(), N->getBasePtr(),
+                               N->getSrcValue(), N->getSrcValueOffset(),
+                               N->getMemoryVT(), N->isVolatile(),
+                               N->isNonTemporal(), N->getAlignment());
+
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
+/// Promote the overflow flag of an overflowing arithmetic node.
+SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
+  // Simply change the return type of the boolean result.
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1));
+  EVT ValueVTs[] = { N->getValueType(0), NVT };
+  SDValue Ops[] = { N->getOperand(0), N->getOperand(1) };
+  SDValue Res = DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                            DAG.getVTList(ValueVTs, 2), Ops, 2);
+
+  // Modified the sum result - switch anything that used the old sum to use
+  // the new one.
+  ReplaceValueWith(SDValue(N, 0), Res);
+
+  return SDValue(Res.getNode(), 1);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
+  if (ResNo == 1)
+    return PromoteIntRes_Overflow(N);
+
+  // The operation overflowed iff the result in the larger type is not the
+  // sign extension of its truncation to the original type.
+  SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+  SDValue RHS = SExtPromotedInteger(N->getOperand(1));
+  EVT OVT = N->getOperand(0).getValueType();
+  EVT NVT = LHS.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  // Do the arithmetic in the larger type.
+  unsigned Opcode = N->getOpcode() == ISD::SADDO ? ISD::ADD : ISD::SUB;
+  SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS);
+
+  // Calculate the overflow flag: sign extend the arithmetic result from
+  // the original type.
+  SDValue Ofl = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res,
+                            DAG.getValueType(OVT));
+  // Overflowed if and only if this is not equal to Res.
+  Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
+
+  // Use the calculated overflow everywhere.
+  ReplaceValueWith(SDValue(N, 1), Ofl);
+
+  return Res;
+}
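+
+// Editorial sketch of the overflow rule used above (name ours): an i8
+// signed add done in i32 overflowed iff the wide result is not its own
+// sign extension from i8.
+//
+//   bool SAddO8(int8_t A, int8_t B, int8_t &Out) {
+//     int32_t Res = (int32_t)A + (int32_t)B;  // widened arithmetic
+//     Out = (int8_t)Res;
+//     return Res != (int32_t)(int8_t)Res;     // the SETNE above
+//   }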
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SDIV(SDNode *N) {
+  // Sign extend the input.
+  SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+  SDValue RHS = SExtPromotedInteger(N->getOperand(1));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                     LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SELECT(SDNode *N) {
+  SDValue LHS = GetPromotedInteger(N->getOperand(1));
+  SDValue RHS = GetPromotedInteger(N->getOperand(2));
+  return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
+                     LHS.getValueType(), N->getOperand(0),LHS,RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) {
+  SDValue LHS = GetPromotedInteger(N->getOperand(2));
+  SDValue RHS = GetPromotedInteger(N->getOperand(3));
+  return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(),
+                     LHS.getValueType(), N->getOperand(0),
+                     N->getOperand(1), LHS, RHS, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) {
+  EVT SVT = TLI.getSetCCResultType(N->getOperand(0).getValueType());
+  assert(isTypeLegal(SVT) && "Illegal SetCC type!");
+  DebugLoc dl = N->getDebugLoc();
+
+  // Get the SETCC result using the canonical SETCC type.
+  SDValue SetCC = DAG.getNode(ISD::SETCC, dl, SVT, N->getOperand(0),
+                              N->getOperand(1), N->getOperand(2));
+
+  // Convert to the expected type.
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  assert(NVT.bitsLE(SVT) && "Integer type overpromoted?");
+  return DAG.getNode(ISD::TRUNCATE, dl, NVT, SetCC);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) {
+  return DAG.getNode(ISD::SHL, N->getDebugLoc(),
+                TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)),
+                     GetPromotedInteger(N->getOperand(0)), N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(),
+                     Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) {
+  // The input may have strange things in the top bits of the registers, but
+  // these operations don't care.  They may have weird bits going out, but
+  // that too is okay if they are integer operations.
+  SDValue LHS = GetPromotedInteger(N->getOperand(0));
+  SDValue RHS = GetPromotedInteger(N->getOperand(1));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                    LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) {
+  // The input value must be properly sign extended.
+  SDValue Res = SExtPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::SRA, N->getDebugLoc(),
+                     Res.getValueType(), Res, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
+  // The input value must be properly zero extended.
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  SDValue Res = ZExtPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::SRL, N->getDebugLoc(), NVT, Res, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Res;
+
+  switch (getTypeAction(N->getOperand(0).getValueType())) {
+  default: llvm_unreachable("Unknown type action!");
+  case Legal:
+  case ExpandInteger:
+    Res = N->getOperand(0);
+    break;
+  case PromoteInteger:
+    Res = GetPromotedInteger(N->getOperand(0));
+    break;
+  }
+
+  // Truncate to NVT instead of VT.
+  return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), NVT, Res);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) {
+  if (ResNo == 1)
+    return PromoteIntRes_Overflow(N);
+
+  // The operation overflowed iff the result in the larger type is not the
+  // zero extension of its truncation to the original type.
+  SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+  SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
+  EVT OVT = N->getOperand(0).getValueType();
+  EVT NVT = LHS.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  // Do the arithmetic in the larger type.
+  unsigned Opcode = N->getOpcode() == ISD::UADDO ? ISD::ADD : ISD::SUB;
+  SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS);
+
+  // Calculate the overflow flag: zero extend the arithmetic result from
+  // the original type.
+  SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT);
+  // Overflowed if and only if this is not equal to Res.
+  Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
+
+  // Use the calculated overflow everywhere.
+  ReplaceValueWith(SDValue(N, 1), Ofl);
+
+  return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UDIV(SDNode *N) {
+  // Zero extend the input.
+  SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+  SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                     LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UNDEF(SDNode *N) {
+  return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(),
+                                               N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) {
+  SDValue Chain = N->getOperand(0); // Get the chain.
+  SDValue Ptr = N->getOperand(1); // Get the pointer.
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT RegVT = TLI.getRegisterType(*DAG.getContext(), VT);
+  unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), VT);
+  // The argument is passed as NumRegs registers of type RegVT.
+
+  SmallVector<SDValue, 8> Parts(NumRegs);
+  for (unsigned i = 0; i < NumRegs; ++i) {
+    Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2),
+                            N->getConstantOperandVal(3));
+    Chain = Parts[i].getValue(1);
+  }
+
+  // Handle endianness of the load.
+  if (TLI.isBigEndian())
+    std::reverse(Parts.begin(), Parts.end());
+
+  // Assemble the parts in the promoted type.
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Res = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[0]);
+  for (unsigned i = 1; i < NumRegs; ++i) {
+    SDValue Part = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[i]);
+    // Shift it to the right position and "or" it in.
+    Part = DAG.getNode(ISD::SHL, dl, NVT, Part,
+                       DAG.getConstant(i * RegVT.getSizeInBits(),
+                                       TLI.getPointerTy()));
+    Res = DAG.getNode(ISD::OR, dl, NVT, Res, Part);
+  }
+
+  // Modified the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Chain);
+
+  return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
+  assert(ResNo == 1 && "Only boolean result promotion currently supported!");
+  return PromoteIntRes_Overflow(N);
+}
+
+//===----------------------------------------------------------------------===//
+//  Integer Operand Promotion
+//===----------------------------------------------------------------------===//
+
+/// PromoteIntegerOperand - This method is called when the specified operand of
+/// the specified node is found to need promotion.  At this point, all of the
+/// result types of the node are known to be legal, but other operands of the
+/// node may need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(dbgs() << "Promote integer operand: "; N->dump(&DAG); dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+    return false;
+
+  switch (N->getOpcode()) {
+    default:
+  #ifndef NDEBUG
+    dbgs() << "PromoteIntegerOperand Op #" << OpNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+  #endif
+    llvm_unreachable("Do not know how to promote this operator's operand!");
+
+  case ISD::ANY_EXTEND:   Res = PromoteIntOp_ANY_EXTEND(N); break;
+  case ISD::BIT_CONVERT:  Res = PromoteIntOp_BIT_CONVERT(N); break;
+  case ISD::BR_CC:        Res = PromoteIntOp_BR_CC(N, OpNo); break;
+  case ISD::BRCOND:       Res = PromoteIntOp_BRCOND(N, OpNo); break;
+  case ISD::BUILD_PAIR:   Res = PromoteIntOp_BUILD_PAIR(N); break;
+  case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break;
+  case ISD::CONVERT_RNDSAT:
+                          Res = PromoteIntOp_CONVERT_RNDSAT(N); break;
+  case ISD::INSERT_VECTOR_ELT:
+                          Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo);break;
+  case ISD::MEMBARRIER:   Res = PromoteIntOp_MEMBARRIER(N); break;
+  case ISD::SCALAR_TO_VECTOR:
+                          Res = PromoteIntOp_SCALAR_TO_VECTOR(N); break;
+  case ISD::SELECT:       Res = PromoteIntOp_SELECT(N, OpNo); break;
+  case ISD::SELECT_CC:    Res = PromoteIntOp_SELECT_CC(N, OpNo); break;
+  case ISD::SETCC:        Res = PromoteIntOp_SETCC(N, OpNo); break;
+  case ISD::SIGN_EXTEND:  Res = PromoteIntOp_SIGN_EXTEND(N); break;
+  case ISD::SINT_TO_FP:   Res = PromoteIntOp_SINT_TO_FP(N); break;
+  case ISD::STORE:        Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
+                                                   OpNo); break;
+  case ISD::TRUNCATE:     Res = PromoteIntOp_TRUNCATE(N); break;
+  case ISD::FP16_TO_FP32:
+  case ISD::UINT_TO_FP:   Res = PromoteIntOp_UINT_TO_FP(N); break;
+  case ISD::ZERO_EXTEND:  Res = PromoteIntOp_ZERO_EXTEND(N); break;
+
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::ROTL:
+  case ISD::ROTR: Res = PromoteIntOp_Shift(N); break;
+  }
+
+  // If the result is null, the sub-method took care of registering results etc.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+/// PromoteSetCCOperands - Promote the operands of a comparison.  This code is
+/// shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS,
+                                            ISD::CondCode CCCode) {
+  // We have to insert explicit sign or zero extends.  Note that we could
+  // insert sign extends for ALL conditions, but zero extend is cheaper on
+  // many machines (an AND instead of two shifts), so prefer it.
+  switch (CCCode) {
+  default: llvm_unreachable("Unknown integer comparison!");
+  case ISD::SETEQ:
+  case ISD::SETNE:
+  case ISD::SETUGE:
+  case ISD::SETUGT:
+  case ISD::SETULE:
+  case ISD::SETULT:
+    // ALL of these operations will work if we either sign or zero extend
+    // the operands (including the unsigned comparisons!).  Zero extend is
+    // usually a simpler/cheaper operation, so prefer it.
+    NewLHS = ZExtPromotedInteger(NewLHS);
+    NewRHS = ZExtPromotedInteger(NewRHS);
+    break;
+  case ISD::SETGE:
+  case ISD::SETGT:
+  case ISD::SETLT:
+  case ISD::SETLE:
+    NewLHS = SExtPromotedInteger(NewLHS);
+    NewRHS = SExtPromotedInteger(NewRHS);
+    break;
+  }
+}
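+
+// Editorial example (name ours): zero extension preserves equality and the
+// unsigned orders, so an i8 SETULT can be evaluated in i32:
+//
+//   bool Ult8Via32(uint8_t A, uint8_t B) {
+//     return (uint32_t)A < (uint32_t)B;  // same verdict as the i8 compare
+//   }
+//
+// The signed orders need the sign bit replicated, hence the sign extension
+// in the second group of cases above.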
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BIT_CONVERT(SDNode *N) {
+  // This should only occur in unusual situations like bitcasting to an
+  // x86_fp80, so just turn it into a store+load.
+  return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 2 && "Don't know how to promote this operand!");
+
+  SDValue LHS = N->getOperand(2);
+  SDValue RHS = N->getOperand(3);
+  PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(1))->get());
+
+  // The chain (Op#0), CC (#1) and basic block destination (Op#4) always
+  // have legal types.
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                N->getOperand(1), LHS, RHS, N->getOperand(4)),
+                 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 1 && "only know how to promote condition");
+
+  // Promote all the way up to the canonical SetCC type.
+  EVT SVT = TLI.getSetCCResultType(MVT::Other);
+  SDValue Cond = PromoteTargetBoolean(N->getOperand(1), SVT);
+
+  // The chain (Op#0) and basic block destination (Op#2) always have legal
+  // types.
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Cond,
+                                        N->getOperand(2)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) {
+  // Since the result type is legal, the operands must promote to it.
+  EVT OVT = N->getOperand(0).getValueType();
+  SDValue Lo = ZExtPromotedInteger(N->getOperand(0));
+  SDValue Hi = GetPromotedInteger(N->getOperand(1));
+  assert(Lo.getValueType() == N->getValueType(0) && "Operand over promoted?");
+  DebugLoc dl = N->getDebugLoc();
+
+  Hi = DAG.getNode(ISD::SHL, dl, N->getValueType(0), Hi,
+                   DAG.getConstant(OVT.getSizeInBits(), TLI.getPointerTy()));
+  return DAG.getNode(ISD::OR, dl, N->getValueType(0), Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) {
+  // The vector type is legal but the element type is not.  This implies
+  // that the vector is a power-of-two in length and that the element
+  // type does not have a strange size (e.g. it is not i1).
+  EVT VecVT = N->getValueType(0);
+  unsigned NumElts = VecVT.getVectorNumElements();
+  assert(!(NumElts & 1) && "Legal vector of one illegal element?");
+
+  // Promote the element values.  Their type does not need to match the
+  // vector element type.  Check that any extra bits introduced will be
+  // truncated away.
+  assert(N->getOperand(0).getValueType().getSizeInBits() >=
+         N->getValueType(0).getVectorElementType().getSizeInBits() &&
+         "Type of inserted value narrower than vector element type!");
+
+  SmallVector<SDValue, 16> NewOps;
+  for (unsigned i = 0; i < NumElts; ++i)
+    NewOps.push_back(GetPromotedInteger(N->getOperand(i)));
+
+  return SDValue(DAG.UpdateNodeOperands(N, &NewOps[0], NumElts), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_CONVERT_RNDSAT(SDNode *N) {
+  ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+  assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU ||
+           CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU ||
+           CvtCode == ISD::CVT_FS || CvtCode == ISD::CVT_FU) &&
+           "can only promote integer arguments");
+  SDValue InOp = GetPromotedInteger(N->getOperand(0));
+  return DAG.getConvertRndSat(N->getValueType(0), N->getDebugLoc(), InOp,
+                              N->getOperand(1), N->getOperand(2),
+                              N->getOperand(3), N->getOperand(4), CvtCode);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N,
+                                                         unsigned OpNo) {
+  if (OpNo == 1) {
+    // Promote the inserted value.  This is valid because the type does not
+    // have to match the vector element type.
+
+    // Check that any extra bits introduced will be truncated away.
+    assert(N->getOperand(1).getValueType().getSizeInBits() >=
+           N->getValueType(0).getVectorElementType().getSizeInBits() &&
+           "Type of inserted value narrower than vector element type!");
+    return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                  GetPromotedInteger(N->getOperand(1)),
+                                  N->getOperand(2)),
+                   0);
+  }
+
+  assert(OpNo == 2 && "Different operand and result vector types?");
+
+  // Promote the index.
+  SDValue Idx = ZExtPromotedInteger(N->getOperand(2));
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                N->getOperand(1), Idx), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MEMBARRIER(SDNode *N) {
+  SDValue NewOps[6];
+  DebugLoc dl = N->getDebugLoc();
+  NewOps[0] = N->getOperand(0);
+  for (unsigned i = 1; i < array_lengthof(NewOps); ++i) {
+    SDValue Flag = GetPromotedInteger(N->getOperand(i));
+    NewOps[i] = DAG.getZeroExtendInReg(Flag, dl, MVT::i1);
+  }
+  return SDValue(DAG.UpdateNodeOperands(N, NewOps, array_lengthof(NewOps)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N) {
+  // Integer SCALAR_TO_VECTOR operands are implicitly truncated, so just promote
+  // the operand in place.
+  return SDValue(DAG.UpdateNodeOperands(N,
+                                GetPromotedInteger(N->getOperand(0))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 0 && "Only know how to promote condition");
+
+  // Promote all the way up to the canonical SetCC type.
+  EVT SVT = TLI.getSetCCResultType(N->getOperand(1).getValueType());
+  SDValue Cond = PromoteTargetBoolean(N->getOperand(0), SVT);
+
+  return SDValue(DAG.UpdateNodeOperands(N, Cond,
+                                N->getOperand(1), N->getOperand(2)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 0 && "Don't know how to promote this operand!");
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(4))->get());
+
+  // The CC (#4) and the possible return values (#2 and #3) have legal types.
+  return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2),
+                                N->getOperand(3), N->getOperand(4)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SETCC(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 0 && "Don't know how to promote this operand!");
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(2))->get());
+
+  // The CC (#2) is always legal.
+  return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) {
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                ZExtPromotedInteger(N->getOperand(1))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  DebugLoc dl = N->getDebugLoc();
+  Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(),
+                     Op, DAG.getValueType(N->getOperand(0).getValueType()));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) {
+  return SDValue(DAG.UpdateNodeOperands(N,
+                                SExtPromotedInteger(N->getOperand(0))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
+  assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+  SDValue Ch = N->getChain(), Ptr = N->getBasePtr();
+  int SVOffset = N->getSrcValueOffset();
+  unsigned Alignment = N->getAlignment();
+  bool isVolatile = N->isVolatile();
+  bool isNonTemporal = N->isNonTemporal();
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue Val = GetPromotedInteger(N->getValue());  // Get promoted value.
+
+  // Truncate the value and store the result.
+  return DAG.getTruncStore(Ch, dl, Val, Ptr, N->getSrcValue(),
+                           SVOffset, N->getMemoryVT(),
+                           isVolatile, isNonTemporal, Alignment);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_UINT_TO_FP(SDNode *N) {
+  return SDValue(DAG.UpdateNodeOperands(N,
+                                ZExtPromotedInteger(N->getOperand(0))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
+  return DAG.getZeroExtendInReg(Op, dl, N->getOperand(0).getValueType());
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Integer Result Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandIntegerResult - This method is called when the specified result of the
+/// specified node is found to need expansion.  At this point, the node may also
+/// have invalid operands or may have other results that need promotion; we
+/// just know that (at least) one result needs expansion.
+void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Expand integer result: "; N->dump(&DAG); dbgs() << "\n");
+  SDValue Lo, Hi;
+  Lo = Hi = SDValue();
+
+  // See if the target wants to custom expand this node.
+  if (CustomLowerNode(N, N->getValueType(ResNo), true))
+    return;
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "ExpandIntegerResult #" << ResNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to expand the result of this operator!");
+
+  case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, Lo, Hi); break;
+  case ISD::SELECT:       SplitRes_SELECT(N, Lo, Hi); break;
+  case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
+  case ISD::UNDEF:        SplitRes_UNDEF(N, Lo, Hi); break;
+
+  case ISD::BIT_CONVERT:        ExpandRes_BIT_CONVERT(N, Lo, Hi); break;
+  case ISD::BUILD_PAIR:         ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
+  case ISD::EXTRACT_ELEMENT:    ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
+  case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break;
+  case ISD::VAARG:              ExpandRes_VAARG(N, Lo, Hi); break;
+
+  case ISD::ANY_EXTEND:  ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break;
+  case ISD::AssertSext:  ExpandIntRes_AssertSext(N, Lo, Hi); break;
+  case ISD::AssertZext:  ExpandIntRes_AssertZext(N, Lo, Hi); break;
+  case ISD::BSWAP:       ExpandIntRes_BSWAP(N, Lo, Hi); break;
+  case ISD::Constant:    ExpandIntRes_Constant(N, Lo, Hi); break;
+  case ISD::CTLZ:        ExpandIntRes_CTLZ(N, Lo, Hi); break;
+  case ISD::CTPOP:       ExpandIntRes_CTPOP(N, Lo, Hi); break;
+  case ISD::CTTZ:        ExpandIntRes_CTTZ(N, Lo, Hi); break;
+  case ISD::FP_TO_SINT:  ExpandIntRes_FP_TO_SINT(N, Lo, Hi); break;
+  case ISD::FP_TO_UINT:  ExpandIntRes_FP_TO_UINT(N, Lo, Hi); break;
+  case ISD::LOAD:        ExpandIntRes_LOAD(cast<LoadSDNode>(N), Lo, Hi); break;
+  case ISD::MUL:         ExpandIntRes_MUL(N, Lo, Hi); break;
+  case ISD::SDIV:        ExpandIntRes_SDIV(N, Lo, Hi); break;
+  case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break;
+  case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break;
+  case ISD::SREM:        ExpandIntRes_SREM(N, Lo, Hi); break;
+  case ISD::TRUNCATE:    ExpandIntRes_TRUNCATE(N, Lo, Hi); break;
+  case ISD::UDIV:        ExpandIntRes_UDIV(N, Lo, Hi); break;
+  case ISD::UREM:        ExpandIntRes_UREM(N, Lo, Hi); break;
+  case ISD::ZERO_EXTEND: ExpandIntRes_ZERO_EXTEND(N, Lo, Hi); break;
+
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: ExpandIntRes_Logical(N, Lo, Hi); break;
+
+  case ISD::ADD:
+  case ISD::SUB: ExpandIntRes_ADDSUB(N, Lo, Hi); break;
+
+  case ISD::ADDC:
+  case ISD::SUBC: ExpandIntRes_ADDSUBC(N, Lo, Hi); break;
+
+  case ISD::ADDE:
+  case ISD::SUBE: ExpandIntRes_ADDSUBE(N, Lo, Hi); break;
+
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break;
+
+  case ISD::SADDO:
+  case ISD::SSUBO: ExpandIntRes_SADDSUBO(N, Lo, Hi); break;
+  case ISD::UADDO:
+  case ISD::USUBO: ExpandIntRes_UADDSUBO(N, Lo, Hi); break;
+  }
+
+  // If Lo/Hi is null, the sub-method took care of registering results etc.
+  if (Lo.getNode())
+    SetExpandedInteger(SDValue(N, ResNo), Lo, Hi);
+}
+
+/// ExpandShiftByConstant - N is a shift by a value that needs to be expanded,
+/// and the shift amount is a constant 'Amt'.  Expand the operation.
+void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, unsigned Amt,
+                                             SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  // Expand the incoming operand to be shifted, so that we have its parts.
+  SDValue InL, InH;
+  GetExpandedInteger(N->getOperand(0), InL, InH);
+
+  EVT NVT = InL.getValueType();
+  unsigned VTBits = N->getValueType(0).getSizeInBits();
+  unsigned NVTBits = NVT.getSizeInBits();
+  EVT ShTy = N->getOperand(1).getValueType();
+
+  if (N->getOpcode() == ISD::SHL) {
+    if (Amt > VTBits) {
+      Lo = Hi = DAG.getConstant(0, NVT);
+    } else if (Amt > NVTBits) {
+      Lo = DAG.getConstant(0, NVT);
+      Hi = DAG.getNode(ISD::SHL, dl,
+                       NVT, InL, DAG.getConstant(Amt-NVTBits,ShTy));
+    } else if (Amt == NVTBits) {
+      Lo = DAG.getConstant(0, NVT);
+      Hi = InL;
+    } else if (Amt == 1 &&
+               TLI.isOperationLegalOrCustom(ISD::ADDC,
+                              TLI.getTypeToExpandTo(*DAG.getContext(), NVT))) {
+      // Emit this X << 1 as X+X.
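+      // The carry out of the low-half add is exactly the bit shifted across
+      // the half boundary, so ADDC/ADDE reproduce the one-bit shift.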
+      SDVTList VTList = DAG.getVTList(NVT, MVT::Flag);
+      SDValue LoOps[2] = { InL, InL };
+      Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps, 2);
+      SDValue HiOps[3] = { InH, InH, Lo.getValue(1) };
+      Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps, 3);
+    } else {
+      Lo = DAG.getNode(ISD::SHL, dl, NVT, InL, DAG.getConstant(Amt, ShTy));
+      Hi = DAG.getNode(ISD::OR, dl, NVT,
+                       DAG.getNode(ISD::SHL, dl, NVT, InH,
+                                   DAG.getConstant(Amt, ShTy)),
+                       DAG.getNode(ISD::SRL, dl, NVT, InL,
+                                   DAG.getConstant(NVTBits-Amt, ShTy)));
+    }
+    return;
+  }
+
+  if (N->getOpcode() == ISD::SRL) {
+    if (Amt > VTBits) {
+      Lo = DAG.getConstant(0, NVT);
+      Hi = DAG.getConstant(0, NVT);
+    } else if (Amt > NVTBits) {
+      Lo = DAG.getNode(ISD::SRL, dl,
+                       NVT, InH, DAG.getConstant(Amt-NVTBits,ShTy));
+      Hi = DAG.getConstant(0, NVT);
+    } else if (Amt == NVTBits) {
+      Lo = InH;
+      Hi = DAG.getConstant(0, NVT);
+    } else {
+      Lo = DAG.getNode(ISD::OR, dl, NVT,
+                       DAG.getNode(ISD::SRL, dl, NVT, InL,
+                                   DAG.getConstant(Amt, ShTy)),
+                       DAG.getNode(ISD::SHL, dl, NVT, InH,
+                                   DAG.getConstant(NVTBits-Amt, ShTy)));
+      Hi = DAG.getNode(ISD::SRL, dl, NVT, InH, DAG.getConstant(Amt, ShTy));
+    }
+    return;
+  }
+
+  assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+  if (Amt > VTBits) {
+    Hi = Lo = DAG.getNode(ISD::SRA, dl, NVT, InH,
+                          DAG.getConstant(NVTBits-1, ShTy));
+  } else if (Amt > NVTBits) {
+    Lo = DAG.getNode(ISD::SRA, dl, NVT, InH,
+                     DAG.getConstant(Amt-NVTBits, ShTy));
+    Hi = DAG.getNode(ISD::SRA, dl, NVT, InH,
+                     DAG.getConstant(NVTBits-1, ShTy));
+  } else if (Amt == NVTBits) {
+    Lo = InH;
+    Hi = DAG.getNode(ISD::SRA, dl, NVT, InH,
+                     DAG.getConstant(NVTBits-1, ShTy));
+  } else {
+    Lo = DAG.getNode(ISD::OR, dl, NVT,
+                     DAG.getNode(ISD::SRL, dl, NVT, InL,
+                                 DAG.getConstant(Amt, ShTy)),
+                     DAG.getNode(ISD::SHL, dl, NVT, InH,
+                                 DAG.getConstant(NVTBits-Amt, ShTy)));
+    Hi = DAG.getNode(ISD::SRA, dl, NVT, InH, DAG.getConstant(Amt, ShTy));
+  }
+}
+
+/// ExpandShiftWithKnownAmountBit - Try to determine whether we can simplify
+/// this shift based on knowledge of the high bit of the shift amount.  If we
+/// can tell this, we know that the amount is >= NVTBits or < NVTBits, without
+/// knowing the actual shift amount.
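+/// For example, with 32-bit halves, an amount of the form (x | 32) is known
+/// to be >= 32, so only the cross-half component of the shift survives.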
+bool DAGTypeLegalizer::
+ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  SDValue Amt = N->getOperand(1);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  EVT ShTy = Amt.getValueType();
+  unsigned ShBits = ShTy.getScalarType().getSizeInBits();
+  unsigned NVTBits = NVT.getScalarType().getSizeInBits();
+  assert(isPowerOf2_32(NVTBits) &&
+         "Expanded integer type size not a power of two!");
+  DebugLoc dl = N->getDebugLoc();
+
+  APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits));
+  APInt KnownZero, KnownOne;
+  DAG.ComputeMaskedBits(N->getOperand(1), HighBitMask, KnownZero, KnownOne);
+
+  // If we don't know anything about the high bits, exit.
+  if (((KnownZero|KnownOne) & HighBitMask) == 0)
+    return false;
+
+  // Get the incoming operand to be shifted.
+  SDValue InL, InH;
+  GetExpandedInteger(N->getOperand(0), InL, InH);
+
+  // If we know that any of the high bits of the shift amount are one, then we
+  // can do this as a couple of simple shifts.
+  if (KnownOne.intersects(HighBitMask)) {
+    // Mask out the high bit, which we know is set.
+    Amt = DAG.getNode(ISD::AND, dl, ShTy, Amt,
+                      DAG.getConstant(~HighBitMask, ShTy));
+
+    switch (N->getOpcode()) {
+    default: llvm_unreachable("Unknown shift");
+    case ISD::SHL:
+      Lo = DAG.getConstant(0, NVT);              // Low part is zero.
+      Hi = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); // High part from Lo part.
+      return true;
+    case ISD::SRL:
+      Hi = DAG.getConstant(0, NVT);              // Hi part is zero.
+      Lo = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); // Lo part from Hi part.
+      return true;
+    case ISD::SRA:
+      Hi = DAG.getNode(ISD::SRA, dl, NVT, InH,       // Sign extend high part.
+                       DAG.getConstant(NVTBits-1, ShTy));
+      Lo = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); // Lo part from Hi part.
+      return true;
+    }
+  }
+
+#if 0
+  // FIXME: This code is broken for shifts with a zero amount!
+  // If we know that all of the high bits of the shift amount are zero, then we
+  // can do this as a couple of simple shifts.
+  if ((KnownZero & HighBitMask) == HighBitMask) {
+    // Compute 32-amt.
+    SDValue Amt2 = DAG.getNode(ISD::SUB, ShTy,
+                                 DAG.getConstant(NVTBits, ShTy),
+                                 Amt);
+    unsigned Op1, Op2;
+    switch (N->getOpcode()) {
+    default: llvm_unreachable("Unknown shift");
+    case ISD::SHL:  Op1 = ISD::SHL; Op2 = ISD::SRL; break;
+    case ISD::SRL:
+    case ISD::SRA:  Op1 = ISD::SRL; Op2 = ISD::SHL; break;
+    }
+
+    Lo = DAG.getNode(N->getOpcode(), NVT, InL, Amt);
+    Hi = DAG.getNode(ISD::OR, NVT,
+                     DAG.getNode(Op1, NVT, InH, Amt),
+                     DAG.getNode(Op2, NVT, InL, Amt2));
+    return true;
+  }
+#endif
+
+  return false;
+}
+
+/// ExpandShiftWithUnknownAmountBit - Fully general expansion of integer shift
+/// of any size.
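+/// Both the short-shift (Amt < NVTBits) and long-shift (Amt >= NVTBits)
+/// results are computed, and a runtime SELECT on Amt picks the correct one.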
+bool DAGTypeLegalizer::
+ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  SDValue Amt = N->getOperand(1);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  EVT ShTy = Amt.getValueType();
+  unsigned NVTBits = NVT.getSizeInBits();
+  assert(isPowerOf2_32(NVTBits) &&
+         "Expanded integer type size not a power of two!");
+  DebugLoc dl = N->getDebugLoc();
+
+  // Get the incoming operand to be shifted.
+  SDValue InL, InH;
+  GetExpandedInteger(N->getOperand(0), InL, InH);
+
+  SDValue NVBitsNode = DAG.getConstant(NVTBits, ShTy);
+  SDValue AmtExcess = DAG.getNode(ISD::SUB, dl, ShTy, Amt, NVBitsNode);
+  SDValue AmtLack = DAG.getNode(ISD::SUB, dl, ShTy, NVBitsNode, Amt);
+  SDValue isShort = DAG.getSetCC(dl, TLI.getSetCCResultType(ShTy),
+                                 Amt, NVBitsNode, ISD::SETULT);
+
+  SDValue LoS, HiS, LoL, HiL;
+  switch (N->getOpcode()) {
+  default: llvm_unreachable("Unknown shift");
+  case ISD::SHL:
+    // Short: ShAmt < NVTBits
+    LoS = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt);
+    HiS = DAG.getNode(ISD::OR, dl, NVT,
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, Amt),
+    // FIXME: If Amt is zero, the following shift generates an undefined result
+    // on some architectures.
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, AmtLack));
+
+    // Long: ShAmt >= NVTBits
+    LoL = DAG.getConstant(0, NVT);                        // Lo part is zero.
+    HiL = DAG.getNode(ISD::SHL, dl, NVT, InL, AmtExcess); // Hi from Lo part.
+
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
+    return true;
+  case ISD::SRL:
+    // Short: ShAmt < NVTBits
+    HiS = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt);
+    LoS = DAG.getNode(ISD::OR, dl, NVT,
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
+    // FIXME: If Amt is zero, the following shift generates an undefined result
+    // on some architectures.
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack));
+
+    // Long: ShAmt >= NVTBits
+    HiL = DAG.getConstant(0, NVT);                        // Hi part is zero.
+    LoL = DAG.getNode(ISD::SRL, dl, NVT, InH, AmtExcess); // Lo from Hi part.
+
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
+    return true;
+  case ISD::SRA:
+    // Short: ShAmt < NVTBits
+    HiS = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt);
+    LoS = DAG.getNode(ISD::OR, dl, NVT,
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
+    // FIXME: If Amt is zero, the following shift generates an undefined result
+    // on some architectures.
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack));
+
+    // Long: ShAmt >= NVTBits
+    HiL = DAG.getNode(ISD::SRA, dl, NVT, InH,             // Sign of Hi part.
+                      DAG.getConstant(NVTBits-1, ShTy));
+    LoL = DAG.getNode(ISD::SRA, dl, NVT, InH, AmtExcess); // Lo from Hi part.
+
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
+    return true;
+  }
+
+  return false;
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
+                                           SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  // Expand the subcomponents.
+  SDValue LHSL, LHSH, RHSL, RHSH;
+  GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+  GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+
+  EVT NVT = LHSL.getValueType();
+  SDValue LoOps[2] = { LHSL, RHSL };
+  SDValue HiOps[3] = { LHSH, RHSH };
+
+  // Do not generate ADDC/ADDE or SUBC/SUBE if the target does not support
+  // them.  TODO: Teach operation legalization how to expand unsupported
+  // ADDC/ADDE/SUBC/SUBE.  The problem is that these operations generate
+  // a carry of type MVT::Flag, but there doesn't seem to be any way to
+  // generate a value of this type in the expanded code sequence.
+  bool hasCarry =
+    TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ?
+                                   ISD::ADDC : ISD::SUBC,
+                                 TLI.getTypeToExpandTo(*DAG.getContext(), NVT));
+
+  if (hasCarry) {
+    SDVTList VTList = DAG.getVTList(NVT, MVT::Flag);
+    if (N->getOpcode() == ISD::ADD) {
+      Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps, 2);
+      HiOps[2] = Lo.getValue(1);
+      Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps, 3);
+    } else {
+      Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps, 2);
+      HiOps[2] = Lo.getValue(1);
+      Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps, 3);
+    }
+  } else {
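+    // No carry-setting ops are available: materialize the carry by hand.  An
+    // unsigned add wrapped iff the truncated sum is less than either operand;
+    // a subtract borrowed iff LHS < RHS (compared on the low halves).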
+    if (N->getOpcode() == ISD::ADD) {
+      Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps, 2);
+      Hi = DAG.getNode(ISD::ADD, dl, NVT, HiOps, 2);
+      SDValue Cmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Lo, LoOps[0],
+                                  ISD::SETULT);
+      SDValue Carry1 = DAG.getNode(ISD::SELECT, dl, NVT, Cmp1,
+                                   DAG.getConstant(1, NVT),
+                                   DAG.getConstant(0, NVT));
+      SDValue Cmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Lo, LoOps[1],
+                                  ISD::SETULT);
+      SDValue Carry2 = DAG.getNode(ISD::SELECT, dl, NVT, Cmp2,
+                                   DAG.getConstant(1, NVT), Carry1);
+      Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry2);
+    } else {
+      Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps, 2);
+      Hi = DAG.getNode(ISD::SUB, dl, NVT, HiOps, 2);
+      SDValue Cmp =
+        DAG.getSetCC(dl, TLI.getSetCCResultType(LoOps[0].getValueType()),
+                     LoOps[0], LoOps[1], ISD::SETULT);
+      SDValue Borrow = DAG.getNode(ISD::SELECT, dl, NVT, Cmp,
+                                   DAG.getConstant(1, NVT),
+                                   DAG.getConstant(0, NVT));
+      Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow);
+    }
+  }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBC(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  // Expand the subcomponents.
+  SDValue LHSL, LHSH, RHSL, RHSH;
+  DebugLoc dl = N->getDebugLoc();
+  GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+  GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+  SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Flag);
+  SDValue LoOps[2] = { LHSL, RHSL };
+  SDValue HiOps[3] = { LHSH, RHSH };
+
+  if (N->getOpcode() == ISD::ADDC) {
+    Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps, 2);
+    HiOps[2] = Lo.getValue(1);
+    Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps, 3);
+  } else {
+    Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps, 2);
+    HiOps[2] = Lo.getValue(1);
+    Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps, 3);
+  }
+
+  // Legalized the flag result - switch anything that used the old flag to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  // Expand the subcomponents.
+  SDValue LHSL, LHSH, RHSL, RHSH;
+  DebugLoc dl = N->getDebugLoc();
+  GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+  GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+  SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Flag);
+  SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) };
+  SDValue HiOps[3] = { LHSH, RHSH };
+
+  Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps, 3);
+  HiOps[2] = Lo.getValue(1);
+  Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps, 3);
+
+  // Legalized the flag result - switch anything that used the old flag to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N,
+                                               SDValue &Lo, SDValue &Hi) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Op = N->getOperand(0);
+  if (Op.getValueType().bitsLE(NVT)) {
+    // The low part is any extension of the input (which degenerates to a copy).
+    Lo = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Op);
+    Hi = DAG.getUNDEF(NVT);   // The high part is undefined.
+  } else {
+    // For example, extension of an i48 to an i64.  The operand type necessarily
+    // promotes to the result type, so will end up being expanded too.
+    assert(getTypeAction(Op.getValueType()) == PromoteInteger &&
+           "Only know how to promote this result!");
+    SDValue Res = GetPromotedInteger(Op);
+    assert(Res.getValueType() == N->getValueType(0) &&
+           "Operand over promoted?");
+    // Split the promoted operand.  This will simplify when it is expanded.
+    SplitInteger(Res, Lo, Hi);
+  }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_AssertSext(SDNode *N,
+                                               SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT NVT = Lo.getValueType();
+  EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+  unsigned NVTBits = NVT.getSizeInBits();
+  unsigned EVTBits = EVT.getSizeInBits();
+
+  if (NVTBits < EVTBits) {
+    Hi = DAG.getNode(ISD::AssertSext, dl, NVT, Hi,
+                     DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+                                                        EVTBits - NVTBits)));
+  } else {
+    Lo = DAG.getNode(ISD::AssertSext, dl, NVT, Lo, DAG.getValueType(EVT));
+    // The high part replicates the sign bit of Lo, make it explicit.
+    Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+                     DAG.getConstant(NVTBits-1, TLI.getPointerTy()));
+  }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_AssertZext(SDNode *N,
+                                               SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT NVT = Lo.getValueType();
+  EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+  unsigned NVTBits = NVT.getSizeInBits();
+  unsigned EVTBits = EVT.getSizeInBits();
+
+  if (NVTBits < EVTBits) {
+    Hi = DAG.getNode(ISD::AssertZext, dl, NVT, Hi,
+                     DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+                                                        EVTBits - NVTBits)));
+  } else {
+    Lo = DAG.getNode(ISD::AssertZext, dl, NVT, Lo, DAG.getValueType(EVT));
+    // The high part must be zero, make it explicit.
+    Hi = DAG.getConstant(0, NVT);
+  }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N,
+                                          SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
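+  // bswap(Hi:Lo) == bswap(Lo):bswap(Hi): the halves trade places and each is
+  // byte-swapped, so fetch the halves pre-swapped and byte-swap each in place.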
+  GetExpandedInteger(N->getOperand(0), Hi, Lo);  // Note swapped operands.
+  Lo = DAG.getNode(ISD::BSWAP, dl, Lo.getValueType(), Lo);
+  Hi = DAG.getNode(ISD::BSWAP, dl, Hi.getValueType(), Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
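+  // e.g. with 32-bit halves, the i64 constant 0x1122334455667788 expands to
+  // Lo = 0x55667788 and Hi = 0x11223344.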
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned NBitWidth = NVT.getSizeInBits();
+  const APInt &Cst = cast<ConstantSDNode>(N)->getAPIntValue();
+  Lo = DAG.getConstant(APInt(Cst).trunc(NBitWidth), NVT);
+  Hi = DAG.getConstant(Cst.lshr(NBitWidth).trunc(NBitWidth), NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  // ctlz (HiLo) -> Hi != 0 ? ctlz(Hi) : (ctlz(Lo)+32)
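+  // e.g. with 32-bit halves, ctlz(1) = 31 + 32 = 63 since Hi == 0, while
+  // ctlz(1 << 32) = ctlz(Hi) = 31.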
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT NVT = Lo.getValueType();
+
+  SDValue HiNotZero = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Hi,
+                                   DAG.getConstant(0, NVT), ISD::SETNE);
+
+  SDValue LoLZ = DAG.getNode(ISD::CTLZ, dl, NVT, Lo);
+  SDValue HiLZ = DAG.getNode(ISD::CTLZ, dl, NVT, Hi);
+
+  Lo = DAG.getNode(ISD::SELECT, dl, NVT, HiNotZero, HiLZ,
+                   DAG.getNode(ISD::ADD, dl, NVT, LoLZ,
+                               DAG.getConstant(NVT.getSizeInBits(), NVT)));
+  Hi = DAG.getConstant(0, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N,
+                                          SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  // ctpop(HiLo) -> ctpop(Hi)+ctpop(Lo)
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT NVT = Lo.getValueType();
+  Lo = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::CTPOP, dl, NVT, Lo),
+                   DAG.getNode(ISD::CTPOP, dl, NVT, Hi));
+  Hi = DAG.getConstant(0, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTTZ(SDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  // cttz (HiLo) -> Lo != 0 ? cttz(Lo) : (cttz(Hi)+32)
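+  // e.g. with 32-bit halves, cttz(0x0001000000000000) = cttz(Hi) + 32 =
+  // 16 + 32 = 48, since Lo == 0.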
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT NVT = Lo.getValueType();
+
+  SDValue LoNotZero = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Lo,
+                                   DAG.getConstant(0, NVT), ISD::SETNE);
+
+  SDValue LoTZ = DAG.getNode(ISD::CTTZ, dl, NVT, Lo);
+  SDValue HiTZ = DAG.getNode(ISD::CTTZ, dl, NVT, Hi);
+
+  Lo = DAG.getNode(ISD::SELECT, dl, NVT, LoNotZero, LoTZ,
+                   DAG.getNode(ISD::ADD, dl, NVT, HiTZ,
+                               DAG.getConstant(NVT.getSizeInBits(), NVT)));
+  Hi = DAG.getConstant(0, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo,
+                                               SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!");
+  SplitInteger(MakeLibCall(LC, VT, &Op, 1, true/*irrelevant*/, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo,
+                                               SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!");
+  SplitInteger(MakeLibCall(LC, VT, &Op, 1, false/*irrelevant*/, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  if (ISD::isNormalLoad(N)) {
+    ExpandRes_NormalLoad(N, Lo, Hi);
+    return;
+  }
+
+  assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  SDValue Ch  = N->getChain();
+  SDValue Ptr = N->getBasePtr();
+  ISD::LoadExtType ExtType = N->getExtensionType();
+  int SVOffset = N->getSrcValueOffset();
+  unsigned Alignment = N->getAlignment();
+  bool isVolatile = N->isVolatile();
+  bool isNonTemporal = N->isNonTemporal();
+  DebugLoc dl = N->getDebugLoc();
+
+  assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+  if (N->getMemoryVT().bitsLE(NVT)) {
+    EVT MemVT = N->getMemoryVT();
+
+    Lo = DAG.getExtLoad(ExtType, NVT, dl, Ch, Ptr, N->getSrcValue(), SVOffset,
+                        MemVT, isVolatile, isNonTemporal, Alignment);
+
+    // Remember the chain.
+    Ch = Lo.getValue(1);
+
+    if (ExtType == ISD::SEXTLOAD) {
+      // The high part is obtained by SRA'ing all but one of the bits of the
+      // lo part.
+      unsigned LoSize = Lo.getValueType().getSizeInBits();
+      Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+                       DAG.getConstant(LoSize-1, TLI.getPointerTy()));
+    } else if (ExtType == ISD::ZEXTLOAD) {
+      // The high part is just a zero.
+      Hi = DAG.getConstant(0, NVT);
+    } else {
+      assert(ExtType == ISD::EXTLOAD && "Unknown extload!");
+      // The high part is undefined.
+      Hi = DAG.getUNDEF(NVT);
+    }
+  } else if (TLI.isLittleEndian()) {
+    // Little-endian - low bits are at low addresses.
+    Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getSrcValue(), SVOffset,
+                     isVolatile, isNonTemporal, Alignment);
+
+    unsigned ExcessBits =
+      N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
+    EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits);
+
+    // Increment the pointer to the other half.
+    unsigned IncrementSize = NVT.getSizeInBits()/8;
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                      DAG.getIntPtrConstant(IncrementSize));
+    Hi = DAG.getExtLoad(ExtType, NVT, dl, Ch, Ptr, N->getSrcValue(),
+                        SVOffset+IncrementSize, NEVT,
+                        isVolatile, isNonTemporal,
+                        MinAlign(Alignment, IncrementSize));
+
+    // Build a factor node to remember that this load is independent of the
+    // other one.
+    Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                     Hi.getValue(1));
+  } else {
+    // Big-endian - high bits are at low addresses.  Favor aligned loads at
+    // the cost of some bit-fiddling.
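+    // e.g. an extending i48 load with 32-bit parts: the aligned load below
+    // fetches value bits [47:16], the second load fetches bits [15:0]; the
+    // shifts at the end move them into Lo = bits [31:0] and Hi = bits [47:32].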
+    EVT MemVT = N->getMemoryVT();
+    unsigned EBytes = MemVT.getStoreSize();
+    unsigned IncrementSize = NVT.getSizeInBits()/8;
+    unsigned ExcessBits = (EBytes - IncrementSize)*8;
+
+    // Load both the high bits and maybe some of the low bits.
+    Hi = DAG.getExtLoad(ExtType, NVT, dl, Ch, Ptr, N->getSrcValue(), SVOffset,
+                        EVT::getIntegerVT(*DAG.getContext(),
+                                          MemVT.getSizeInBits() - ExcessBits),
+                        isVolatile, isNonTemporal, Alignment);
+
+    // Increment the pointer to the other half.
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                      DAG.getIntPtrConstant(IncrementSize));
+    // Load the rest of the low bits.
+    Lo = DAG.getExtLoad(ISD::ZEXTLOAD, NVT, dl, Ch, Ptr, N->getSrcValue(),
+                        SVOffset+IncrementSize,
+                        EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
+                        isVolatile, isNonTemporal,
+                        MinAlign(Alignment, IncrementSize));
+
+    // Build a factor node to remember that this load is independent of the
+    // other one.
+    Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                     Hi.getValue(1));
+
+    if (ExcessBits < NVT.getSizeInBits()) {
+      // Transfer low bits from the bottom of Hi to the top of Lo.
+      Lo = DAG.getNode(ISD::OR, dl, NVT, Lo,
+                       DAG.getNode(ISD::SHL, dl, NVT, Hi,
+                                   DAG.getConstant(ExcessBits,
+                                                   TLI.getPointerTy())));
+      // Move high bits to the right position in Hi.
+      Hi = DAG.getNode(ExtType == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL, dl,
+                       NVT, Hi,
+                       DAG.getConstant(NVT.getSizeInBits() - ExcessBits,
+                                       TLI.getPointerTy()));
+    }
+  }
+
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Ch);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Logical(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue LL, LH, RL, RH;
+  GetExpandedInteger(N->getOperand(0), LL, LH);
+  GetExpandedInteger(N->getOperand(1), RL, RH);
+  Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LL, RL);
+  Hi = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LH, RH);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
+                                        SDValue &Lo, SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  DebugLoc dl = N->getDebugLoc();
+
+  bool HasMULHS = TLI.isOperationLegalOrCustom(ISD::MULHS, NVT);
+  bool HasMULHU = TLI.isOperationLegalOrCustom(ISD::MULHU, NVT);
+  bool HasSMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::SMUL_LOHI, NVT);
+  bool HasUMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, NVT);
+  if (HasMULHU || HasMULHS || HasUMUL_LOHI || HasSMUL_LOHI) {
+    SDValue LL, LH, RL, RH;
+    GetExpandedInteger(N->getOperand(0), LL, LH);
+    GetExpandedInteger(N->getOperand(1), RL, RH);
+    unsigned OuterBitSize = VT.getSizeInBits();
+    unsigned InnerBitSize = NVT.getSizeInBits();
+    unsigned LHSSB = DAG.ComputeNumSignBits(N->getOperand(0));
+    unsigned RHSSB = DAG.ComputeNumSignBits(N->getOperand(1));
+
+    APInt HighMask = APInt::getHighBitsSet(OuterBitSize, InnerBitSize);
+    if (DAG.MaskedValueIsZero(N->getOperand(0), HighMask) &&
+        DAG.MaskedValueIsZero(N->getOperand(1), HighMask)) {
+      // The inputs are both zero-extended.
+      if (HasUMUL_LOHI) {
+        // We can emit a umul_lohi.
+        Lo = DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(NVT, NVT), LL, RL);
+        Hi = SDValue(Lo.getNode(), 1);
+        return;
+      }
+      if (HasMULHU) {
+        // We can emit a mulhu+mul.
+        Lo = DAG.getNode(ISD::MUL, dl, NVT, LL, RL);
+        Hi = DAG.getNode(ISD::MULHU, dl, NVT, LL, RL);
+        return;
+      }
+    }
+    if (LHSSB > InnerBitSize && RHSSB > InnerBitSize) {
+      // The input values are both sign-extended.
+      if (HasSMUL_LOHI) {
+        // We can emit a smul_lohi.
+        Lo = DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(NVT, NVT), LL, RL);
+        Hi = SDValue(Lo.getNode(), 1);
+        return;
+      }
+      if (HasMULHS) {
+        // We can emit a mulhs+mul.
+        Lo = DAG.getNode(ISD::MUL, dl, NVT, LL, RL);
+        Hi = DAG.getNode(ISD::MULHS, dl, NVT, LL, RL);
+        return;
+      }
+    }
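+    // General case: writing A = LH:LL and B = RH:RL, then modulo 2^(2N)
+    //   A*B = LL*RL + ((LL*RH + LH*RL) << N),
+    // so Hi = hi(LL*RL) + LL*RH + LH*RL; the LH*RH term lies entirely above
+    // the result width and is dropped.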
+    if (HasUMUL_LOHI) {
+      // Lo,Hi = umul LHS, RHS.
+      SDValue UMulLOHI = DAG.getNode(ISD::UMUL_LOHI, dl,
+                                       DAG.getVTList(NVT, NVT), LL, RL);
+      Lo = UMulLOHI;
+      Hi = UMulLOHI.getValue(1);
+      RH = DAG.getNode(ISD::MUL, dl, NVT, LL, RH);
+      LH = DAG.getNode(ISD::MUL, dl, NVT, LH, RL);
+      Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, RH);
+      Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, LH);
+      return;
+    }
+    if (HasMULHU) {
+      Lo = DAG.getNode(ISD::MUL, dl, NVT, LL, RL);
+      Hi = DAG.getNode(ISD::MULHU, dl, NVT, LL, RL);
+      RH = DAG.getNode(ISD::MUL, dl, NVT, LL, RH);
+      LH = DAG.getNode(ISD::MUL, dl, NVT, LH, RL);
+      Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, RH);
+      Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, LH);
+      return;
+    }
+  }
+
+  // If nothing else, we can make a libcall.
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i16)
+    LC = RTLIB::MUL_I16;
+  else if (VT == MVT::i32)
+    LC = RTLIB::MUL_I32;
+  else if (VT == MVT::i64)
+    LC = RTLIB::MUL_I64;
+  else if (VT == MVT::i128)
+    LC = RTLIB::MUL_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported MUL!");
+
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SplitInteger(MakeLibCall(LC, VT, Ops, 2, true/*irrelevant*/, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
+                                             SDValue &Lo, SDValue &Hi) {
+  SDValue LHS = Node->getOperand(0);
+  SDValue RHS = Node->getOperand(1);
+  DebugLoc dl = Node->getDebugLoc();
+
+  // Expand the result by simply replacing it with the equivalent
+  // non-overflow-checking operation.
+  SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ?
+                            ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+                            LHS, RHS);
+  SplitInteger(Sum, Lo, Hi);
+
+  // Compute the overflow.
+  //
+  //   LHSSign -> LHS >= 0
+  //   RHSSign -> RHS >= 0
+  //   SumSign -> Sum >= 0
+  //
+  //   Add:
+  //   Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
+  //   Sub:
+  //   Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
+  //
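+  // e.g. for an i8 add: 100 + 100 gives -56; the operand signs match but
+  // differ from the sign of the sum, so the addition overflowed.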
+  EVT OType = Node->getValueType(1);
+  SDValue Zero = DAG.getConstant(0, LHS.getValueType());
+
+  SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE);
+  SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE);
+  SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign,
+                                    Node->getOpcode() == ISD::SADDO ?
+                                    ISD::SETEQ : ISD::SETNE);
+
+  SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE);
+  SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);
+
+  SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
+
+  // Use the calculated overflow everywhere.
+  ReplaceValueWith(SDValue(Node, 1), Cmp);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i16)
+    LC = RTLIB::SDIV_I16;
+  else if (VT == MVT::i32)
+    LC = RTLIB::SDIV_I32;
+  else if (VT == MVT::i64)
+    LC = RTLIB::SDIV_I64;
+  else if (VT == MVT::i128)
+    LC = RTLIB::SDIV_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
+
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SplitInteger(MakeLibCall(LC, VT, Ops, 2, true, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
+                                          SDValue &Lo, SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
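+  // Strategy, in order of preference: constant-amount expansion, expansion
+  // using known bits of the amount, the target's *_PARTS nodes, a runtime
+  // library call, and finally the fully general select-based expansion.
+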
+  // If we can emit an efficient shift operation, do so now.  Check to see if
+  // the RHS is a constant.
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    return ExpandShiftByConstant(N, CN->getZExtValue(), Lo, Hi);
+
+  // If we can determine that the high bit of the shift is zero or one, even if
+  // the low bits are variable, emit this shift in an optimized form.
+  if (ExpandShiftWithKnownAmountBit(N, Lo, Hi))
+    return;
+
+  // If this target supports shift_PARTS, use it.  First, map to the _PARTS opc.
+  unsigned PartsOpc;
+  if (N->getOpcode() == ISD::SHL) {
+    PartsOpc = ISD::SHL_PARTS;
+  } else if (N->getOpcode() == ISD::SRL) {
+    PartsOpc = ISD::SRL_PARTS;
+  } else {
+    assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+    PartsOpc = ISD::SRA_PARTS;
+  }
+
+  // Next check to see if the target supports this shift-parts operation or if
+  // it will custom expand it.
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  TargetLowering::LegalizeAction Action = TLI.getOperationAction(PartsOpc, NVT);
+  if ((Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) ||
+      Action == TargetLowering::Custom) {
+    // Expand the subcomponents.
+    SDValue LHSL, LHSH;
+    GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+
+    SDValue Ops[] = { LHSL, LHSH, N->getOperand(1) };
+    EVT VT = LHSL.getValueType();
+    Lo = DAG.getNode(PartsOpc, dl, DAG.getVTList(VT, VT), Ops, 3);
+    Hi = Lo.getValue(1);
+    return;
+  }
+
+  // Otherwise, emit a libcall.
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  bool isSigned;
+  if (N->getOpcode() == ISD::SHL) {
+    isSigned = false; /*sign irrelevant*/
+    if (VT == MVT::i16)
+      LC = RTLIB::SHL_I16;
+    else if (VT == MVT::i32)
+      LC = RTLIB::SHL_I32;
+    else if (VT == MVT::i64)
+      LC = RTLIB::SHL_I64;
+    else if (VT == MVT::i128)
+      LC = RTLIB::SHL_I128;
+  } else if (N->getOpcode() == ISD::SRL) {
+    isSigned = false;
+    if (VT == MVT::i16)
+      LC = RTLIB::SRL_I16;
+    else if (VT == MVT::i32)
+      LC = RTLIB::SRL_I32;
+    else if (VT == MVT::i64)
+      LC = RTLIB::SRL_I64;
+    else if (VT == MVT::i128)
+      LC = RTLIB::SRL_I128;
+  } else {
+    assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+    isSigned = true;
+    if (VT == MVT::i16)
+      LC = RTLIB::SRA_I16;
+    else if (VT == MVT::i32)
+      LC = RTLIB::SRA_I32;
+    else if (VT == MVT::i64)
+      LC = RTLIB::SRA_I64;
+    else if (VT == MVT::i128)
+      LC = RTLIB::SRA_I128;
+  }
+
+  if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) {
+    SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+    SplitInteger(MakeLibCall(LC, VT, Ops, 2, isSigned, dl), Lo, Hi);
+    return;
+  }
+
+  if (!ExpandShiftWithUnknownAmountBit(N, Lo, Hi))
+    llvm_unreachable("Unsupported shift!");
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N,
+                                                SDValue &Lo, SDValue &Hi) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Op = N->getOperand(0);
+  if (Op.getValueType().bitsLE(NVT)) {
+    // The low part is sign extension of the input (degenerates to a copy).
+    Lo = DAG.getNode(ISD::SIGN_EXTEND, dl, NVT, N->getOperand(0));
+    // The high part is obtained by SRA'ing all but one of the bits of the low
+    // part.
+    unsigned LoSize = NVT.getSizeInBits();
+    Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+                     DAG.getConstant(LoSize-1, TLI.getPointerTy()));
+  } else {
+    // For example, extension of an i48 to an i64.  The operand type necessarily
+    // promotes to the result type, so will end up being expanded too.
+    assert(getTypeAction(Op.getValueType()) == PromoteInteger &&
+           "Only know how to promote this result!");
+    SDValue Res = GetPromotedInteger(Op);
+    assert(Res.getValueType() == N->getValueType(0) &&
+           "Operand over promoted?");
+    // Split the promoted operand.  This will simplify when it is expanded.
+    SplitInteger(Res, Lo, Hi);
+    unsigned ExcessBits =
+      Op.getValueType().getSizeInBits() - NVT.getSizeInBits();
+    Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi,
+                     DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+                                                        ExcessBits)));
+  }
+}
+
+void DAGTypeLegalizer::
+ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+
+  if (EVT.bitsLE(Lo.getValueType())) {
+    // sext_inreg the low part if needed.
+    Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Lo.getValueType(), Lo,
+                     N->getOperand(1));
+
+    // The high part gets the sign extension from the lo-part.  This handles
+    // things like sextinreg V:i64 from i8.
+    Hi = DAG.getNode(ISD::SRA, dl, Hi.getValueType(), Lo,
+                     DAG.getConstant(Hi.getValueType().getSizeInBits()-1,
+                                     TLI.getPointerTy()));
+  } else {
+    // For example, extension of an i48 to an i64.  Leave the low part alone,
+    // sext_inreg the high part.
+    unsigned ExcessBits =
+      EVT.getSizeInBits() - Lo.getValueType().getSizeInBits();
+    Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi,
+                     DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+                                                        ExcessBits)));
+  }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i16)
+    LC = RTLIB::SREM_I16;
+  else if (VT == MVT::i32)
+    LC = RTLIB::SREM_I32;
+  else if (VT == MVT::i64)
+    LC = RTLIB::SREM_I64;
+  else if (VT == MVT::i128)
+    LC = RTLIB::SREM_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
+
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SplitInteger(MakeLibCall(LC, VT, Ops, 2, true, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
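+  // e.g. truncating an i128 to an expanded i64 with 32-bit parts: Lo is the
+  // low 32 bits of the source and Hi is source bits [63:32].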
+  Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, N->getOperand(0));
+  Hi = DAG.getNode(ISD::SRL, dl,
+                   N->getOperand(0).getValueType(), N->getOperand(0),
+                   DAG.getConstant(NVT.getSizeInBits(), TLI.getPointerTy()));
+  Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  DebugLoc dl = N->getDebugLoc();
+
+  // Expand the result by simply replacing it with the equivalent
+  // non-overflow-checking operation.
+  SDValue Sum = DAG.getNode(N->getOpcode() == ISD::UADDO ?
+                            ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+                            LHS, RHS);
+  SplitInteger(Sum, Lo, Hi);
+
+  // Calculate the overflow: addition overflows iff a + b < a, and subtraction
+  // overflows iff a - b > a.
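+  // e.g. in 8 bits: 200 + 100 wraps to 44, and 44 < 200 flags the overflow.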
+  SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS,
+                             N->getOpcode () == ISD::UADDO ?
+                             ISD::SETULT : ISD::SETUGT);
+
+  // Use the calculated overflow everywhere.
+  ReplaceValueWith(SDValue(N, 1), Ofl);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i16)
+    LC = RTLIB::UDIV_I16;
+  else if (VT == MVT::i32)
+    LC = RTLIB::UDIV_I32;
+  else if (VT == MVT::i64)
+    LC = RTLIB::UDIV_I64;
+  else if (VT == MVT::i128)
+    LC = RTLIB::UDIV_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!");
+
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SplitInteger(MakeLibCall(LC, VT, Ops, 2, false, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i16)
+    LC = RTLIB::UREM_I16;
+  else if (VT == MVT::i32)
+    LC = RTLIB::UREM_I32;
+  else if (VT == MVT::i64)
+    LC = RTLIB::UREM_I64;
+  else if (VT == MVT::i128)
+    LC = RTLIB::UREM_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!");
+
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SplitInteger(MakeLibCall(LC, VT, Ops, 2, false, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N,
+                                                SDValue &Lo, SDValue &Hi) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Op = N->getOperand(0);
+  if (Op.getValueType().bitsLE(NVT)) {
+    // The low part is zero extension of the input (degenerates to a copy).
+    Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N->getOperand(0));
+    Hi = DAG.getConstant(0, NVT);   // The high part is just a zero.
+  } else {
+    // For example, extension of an i48 to an i64.  The operand type necessarily
+    // promotes to the result type, so will end up being expanded too.
+    assert(getTypeAction(Op.getValueType()) == PromoteInteger &&
+           "Only know how to promote this result!");
+    SDValue Res = GetPromotedInteger(Op);
+    assert(Res.getValueType() == N->getValueType(0) &&
+           "Operand over promoted?");
+    // Split the promoted operand.  This will simplify when it is expanded.
+    SplitInteger(Res, Lo, Hi);
+    unsigned ExcessBits =
+      Op.getValueType().getSizeInBits() - NVT.getSizeInBits();
+    Hi = DAG.getZeroExtendInReg(Hi, dl,
+                                EVT::getIntegerVT(*DAG.getContext(),
+                                                  ExcessBits));
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Integer Operand Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandIntegerOperand - This method is called when the specified operand of
+/// the specified node is found to need expansion.  At this point, all of the
+/// result types of the node are known to be legal, but other operands of the
+/// node may need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(dbgs() << "Expand integer operand: "; N->dump(&DAG); dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+    return false;
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "ExpandIntegerOperand Op #" << OpNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to expand this operator's operand!");
+
+  case ISD::BIT_CONVERT:       Res = ExpandOp_BIT_CONVERT(N); break;
+  case ISD::BR_CC:             Res = ExpandIntOp_BR_CC(N); break;
+  case ISD::BUILD_VECTOR:      Res = ExpandOp_BUILD_VECTOR(N); break;
+  case ISD::EXTRACT_ELEMENT:   Res = ExpandOp_EXTRACT_ELEMENT(N); break;
+  case ISD::INSERT_VECTOR_ELT: Res = ExpandOp_INSERT_VECTOR_ELT(N); break;
+  case ISD::SCALAR_TO_VECTOR:  Res = ExpandOp_SCALAR_TO_VECTOR(N); break;
+  case ISD::SELECT_CC:         Res = ExpandIntOp_SELECT_CC(N); break;
+  case ISD::SETCC:             Res = ExpandIntOp_SETCC(N); break;
+  case ISD::SINT_TO_FP:        Res = ExpandIntOp_SINT_TO_FP(N); break;
+  case ISD::STORE:   Res = ExpandIntOp_STORE(cast<StoreSDNode>(N), OpNo); break;
+  case ISD::TRUNCATE:          Res = ExpandIntOp_TRUNCATE(N); break;
+  case ISD::UINT_TO_FP:        Res = ExpandIntOp_UINT_TO_FP(N); break;
+
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::ROTL:
+  case ISD::ROTR:              Res = ExpandIntOp_Shift(N); break;
+  case ISD::RETURNADDR:
+  case ISD::FRAMEADDR:         Res = ExpandIntOp_RETURNADDR(N); break;
+  }
+
+  // If the result is null, the sub-method took care of registering results etc.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+/// IntegerExpandSetCCOperands - Expand the operands of a comparison.  This code
+/// is shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
+                                                  SDValue &NewRHS,
+                                                  ISD::CondCode &CCCode,
+                                                  DebugLoc dl) {
+  SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+  GetExpandedInteger(NewLHS, LHSLo, LHSHi);
+  GetExpandedInteger(NewRHS, RHSLo, RHSHi);
+
+  if (CCCode == ISD::SETEQ || CCCode == ISD::SETNE) {
+    if (RHSLo == RHSHi) {
+      if (ConstantSDNode *RHSCST = dyn_cast<ConstantSDNode>(RHSLo)) {
+        if (RHSCST->isAllOnesValue()) {
+          // Equality comparison to -1.
+          NewLHS = DAG.getNode(ISD::AND, dl,
+                               LHSLo.getValueType(), LHSLo, LHSHi);
+          NewRHS = RHSLo;
+          return;
+        }
+      }
+    }
+
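+    // In general, (LHSHi:LHSLo) == (RHSHi:RHSLo) iff
+    // ((LHSLo ^ RHSLo) | (LHSHi ^ RHSHi)) == 0, which needs no carry chain.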
+    NewLHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSLo, RHSLo);
+    NewRHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSHi, RHSHi);
+    NewLHS = DAG.getNode(ISD::OR, dl, NewLHS.getValueType(), NewLHS, NewRHS);
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    return;
+  }
+
+  // If this is a comparison of the sign bit, just look at the top part.
+  // X > -1,  x < 0
+  if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(NewRHS))
+    if ((CCCode == ISD::SETLT && CST->isNullValue()) ||     // X < 0
+        (CCCode == ISD::SETGT && CST->isAllOnesValue())) {  // X > -1
+      NewLHS = LHSHi;
+      NewRHS = RHSHi;
+      return;
+    }
+
+  // FIXME: The code generated here is poor.
+  ISD::CondCode LowCC;
+  switch (CCCode) {
+  default: llvm_unreachable("Unknown integer setcc!");
+  case ISD::SETLT:
+  case ISD::SETULT: LowCC = ISD::SETULT; break;
+  case ISD::SETGT:
+  case ISD::SETUGT: LowCC = ISD::SETUGT; break;
+  case ISD::SETLE:
+  case ISD::SETULE: LowCC = ISD::SETULE; break;
+  case ISD::SETGE:
+  case ISD::SETUGE: LowCC = ISD::SETUGE; break;
+  }
+
+  // Tmp1 = lo(op1) < lo(op2)   // Always unsigned comparison
+  // Tmp2 = hi(op1) < hi(op2)   // Signedness depends on operands
+  // dest = hi(op1) == hi(op2) ? Tmp1 : Tmp2;
+
+  // NOTE: on targets without efficient SELECT of bools, we can always use
+  // this identity: (B1 ? B2 : B3) --> (B1 & B2)|(!B1&B3)
+  TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, false, true, true, NULL);
+  SDValue Tmp1, Tmp2;
+  Tmp1 = TLI.SimplifySetCC(TLI.getSetCCResultType(LHSLo.getValueType()),
+                           LHSLo, RHSLo, LowCC, false, DagCombineInfo, dl);
+  if (!Tmp1.getNode())
+    Tmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSLo.getValueType()),
+                        LHSLo, RHSLo, LowCC);
+  Tmp2 = TLI.SimplifySetCC(TLI.getSetCCResultType(LHSHi.getValueType()),
+                           LHSHi, RHSHi, CCCode, false, DagCombineInfo, dl);
+  if (!Tmp2.getNode())
+    Tmp2 = DAG.getNode(ISD::SETCC, dl,
+                       TLI.getSetCCResultType(LHSHi.getValueType()),
+                       LHSHi, RHSHi, DAG.getCondCode(CCCode));
+
+  ConstantSDNode *Tmp1C = dyn_cast<ConstantSDNode>(Tmp1.getNode());
+  ConstantSDNode *Tmp2C = dyn_cast<ConstantSDNode>(Tmp2.getNode());
+  if ((Tmp1C && Tmp1C->isNullValue()) ||
+      (Tmp2C && Tmp2C->isNullValue() &&
+       (CCCode == ISD::SETLE || CCCode == ISD::SETGE ||
+        CCCode == ISD::SETUGE || CCCode == ISD::SETULE)) ||
+      (Tmp2C && Tmp2C->getAPIntValue() == 1 &&
+       (CCCode == ISD::SETLT || CCCode == ISD::SETGT ||
+        CCCode == ISD::SETUGT || CCCode == ISD::SETULT))) {
+    // The low part is known false; return the high part.
+    // For LE / GE, if the high part is known false, ignore the low part.
+    // For LT / GT, if the high part is known true, ignore the low part.
+    NewLHS = Tmp2;
+    NewRHS = SDValue();
+    return;
+  }
+
+  NewLHS = TLI.SimplifySetCC(TLI.getSetCCResultType(LHSHi.getValueType()),
+                             LHSHi, RHSHi, ISD::SETEQ, false,
+                             DagCombineInfo, dl);
+  if (!NewLHS.getNode())
+    NewLHS = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+                          LHSHi, RHSHi, ISD::SETEQ);
+  NewLHS = DAG.getNode(ISD::SELECT, dl, Tmp1.getValueType(),
+                       NewLHS, Tmp1, Tmp2);
+  NewRHS = SDValue();
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_BR_CC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+  IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If IntegerExpandSetCCOperands returned a scalar, we need to compare the
+  // result against zero to select between true and false values.
+  if (NewRHS.getNode() == 0) {
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    CCCode = ISD::SETNE;
+  }
+
+  // Update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                DAG.getCondCode(CCCode), NewLHS, NewRHS,
+                                N->getOperand(4)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+  IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If IntegerExpandSetCCOperands returned a scalar, we need to compare the
+  // result against zero to select between true and false values.
+  if (NewRHS.getNode() == 0) {
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    CCCode = ISD::SETNE;
+  }
+
+  // Update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+                                N->getOperand(2), N->getOperand(3),
+                                DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If IntegerExpandSetCCOperands returned a scalar, use it.
+  if (NewRHS.getNode() == 0) {
+    assert(NewLHS.getValueType() == N->getValueType(0) &&
+           "Unexpected setcc expansion!");
+    return NewLHS;
+  }
+
+  // Otherwise, update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+                                DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) {
+  // The value being shifted is legal, but the shift amount is too big.
+  // It follows that either the result of the shift is undefined, or the
+  // upper half of the shift amount is zero.  Just use the lower half.
+  SDValue Lo, Hi;
+  GetExpandedInteger(N->getOperand(1), Lo, Hi);
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Lo), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_RETURNADDR(SDNode *N) {
+  // The argument of the RETURNADDR / FRAMEADDR builtin is a 32-bit constant,
+  // which causes problems on 8/16-bit targets.  Just truncate the constant to
+  // a valid type.
+  SDValue Lo, Hi;
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  return SDValue(DAG.UpdateNodeOperands(N, Lo), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) {
+  SDValue Op = N->getOperand(0);
+  EVT DstVT = N->getValueType(0);
+  RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+         "Don't know how to expand this SINT_TO_FP!");
+  return MakeLibCall(LC, DstVT, &Op, 1, true, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
+  if (ISD::isNormalStore(N))
+    return ExpandOp_NormalStore(N, OpNo);
+
+  assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+  assert(OpNo == 1 && "Can only expand the stored value so far");
+
+  EVT VT = N->getOperand(1).getValueType();
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  SDValue Ch  = N->getChain();
+  SDValue Ptr = N->getBasePtr();
+  int SVOffset = N->getSrcValueOffset();
+  unsigned Alignment = N->getAlignment();
+  bool isVolatile = N->isVolatile();
+  bool isNonTemporal = N->isNonTemporal();
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Lo, Hi;
+
+  assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+  if (N->getMemoryVT().bitsLE(NVT)) {
+    GetExpandedInteger(N->getValue(), Lo, Hi);
+    return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getSrcValue(), SVOffset,
+                             N->getMemoryVT(), isVolatile, isNonTemporal,
+                             Alignment);
+  } else if (TLI.isLittleEndian()) {
+    // Little-endian - low bits are at low addresses.
+    GetExpandedInteger(N->getValue(), Lo, Hi);
+
+    Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getSrcValue(), SVOffset,
+                      isVolatile, isNonTemporal, Alignment);
+
+    unsigned ExcessBits =
+      N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
+    EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits);
+
+    // Increment the pointer to the other half.
+    unsigned IncrementSize = NVT.getSizeInBits()/8;
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                      DAG.getIntPtrConstant(IncrementSize));
+    Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getSrcValue(),
+                           SVOffset+IncrementSize, NEVT,
+                           isVolatile, isNonTemporal,
+                           MinAlign(Alignment, IncrementSize));
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+  } else {
+    // Big-endian - high bits are at low addresses.  Favor aligned stores at
+    // the cost of some bit-fiddling.
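+    // Mirror image of the expanded big-endian load above: Hi is rebuilt below
+    // so that one aligned HiVT store plus one narrow ExcessBits store recreate
+    // the original memory image.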
+    GetExpandedInteger(N->getValue(), Lo, Hi);
+
+    EVT ExtVT = N->getMemoryVT();
+    unsigned EBytes = ExtVT.getStoreSize();
+    unsigned IncrementSize = NVT.getSizeInBits()/8;
+    unsigned ExcessBits = (EBytes - IncrementSize)*8;
+    EVT HiVT = EVT::getIntegerVT(*DAG.getContext(),
+                                 ExtVT.getSizeInBits() - ExcessBits);
+
+    if (ExcessBits < NVT.getSizeInBits()) {
+      // Transfer high bits from the top of Lo to the bottom of Hi.
+      Hi = DAG.getNode(ISD::SHL, dl, NVT, Hi,
+                       DAG.getConstant(NVT.getSizeInBits() - ExcessBits,
+                                       TLI.getPointerTy()));
+      Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
+                       DAG.getNode(ISD::SRL, dl, NVT, Lo,
+                                   DAG.getConstant(ExcessBits,
+                                                   TLI.getPointerTy())));
+    }
+
+    // Store both the high bits and maybe some of the low bits.
+    Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getSrcValue(),
+                           SVOffset, HiVT, isVolatile, isNonTemporal,
+                           Alignment);
+
+    // Increment the pointer to the other half.
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                      DAG.getIntPtrConstant(IncrementSize));
+    // Store the lowest ExcessBits bits in the second half.
+    Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getSrcValue(),
+                           SVOffset+IncrementSize,
+                           EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
+                           isVolatile, isNonTemporal,
+                           MinAlign(Alignment, IncrementSize));
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+  }
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_TRUNCATE(SDNode *N) {
+  SDValue InL, InH;
+  GetExpandedInteger(N->getOperand(0), InL, InH);
+  // Just truncate the low part of the source.
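+  // For example, truncating an expanded i128 to i32 only needs the low half,
+  // assuming i64 is the type i128 was expanded to.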
+  return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), N->getValueType(0), InL);
+}
+
+static const fltSemantics *EVTToAPFloatSemantics(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unknown FP format");
+  case MVT::f32:     return &APFloat::IEEEsingle;
+  case MVT::f64:     return &APFloat::IEEEdouble;
+  case MVT::f80:     return &APFloat::x87DoubleExtended;
+  case MVT::f128:    return &APFloat::IEEEquad;
+  case MVT::ppcf128: return &APFloat::PPCDoubleDouble;
+  }
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
+  SDValue Op = N->getOperand(0);
+  EVT SrcVT = Op.getValueType();
+  EVT DstVT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  // The following optimization is valid only if every value in SrcVT (when
+  // treated as signed) is representable in DstVT.  Check that the mantissa
+  // size of DstVT is at least the number of bits in SrcVT minus one.
+  const fltSemantics *sem = EVTToAPFloatSemantics(DstVT);
+  if (APFloat::semanticsPrecision(*sem) >= SrcVT.getSizeInBits()-1 &&
+      TLI.getOperationAction(ISD::SINT_TO_FP, SrcVT) == TargetLowering::Custom){
+    // Do a signed conversion then adjust the result.
+    SDValue SignedConv = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Op);
+    SignedConv = TLI.LowerOperation(SignedConv, DAG);
+
+    // The result of the signed conversion needs adjusting if the 'sign bit' of
+    // the incoming integer was set.  To handle this, we dynamically test to see
+    // if it is set, and, if so, add a fudge factor.
+
+    const uint64_t F32TwoE32  = 0x4F800000ULL;
+    const uint64_t F32TwoE64  = 0x5F800000ULL;
+    const uint64_t F32TwoE128 = 0x7F800000ULL;
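+    // These are the IEEE single precision bit patterns for 2.0^32 and
+    // 2.0^64.  Note that 2.0^128 is not representable in f32, so F32TwoE128
+    // is in fact the bit pattern of +infinity.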
+
+    APInt FF(32, 0);
+    if (SrcVT == MVT::i32)
+      FF = APInt(32, F32TwoE32);
+    else if (SrcVT == MVT::i64)
+      FF = APInt(32, F32TwoE64);
+    else if (SrcVT == MVT::i128)
+      FF = APInt(32, F32TwoE128);
+    else
+      assert(false && "Unsupported UINT_TO_FP!");
+
+    // Check whether the sign bit is set.
+    SDValue Lo, Hi;
+    GetExpandedInteger(Op, Lo, Hi);
+    SDValue SignSet = DAG.getSetCC(dl,
+                                   TLI.getSetCCResultType(Hi.getValueType()),
+                                   Hi, DAG.getConstant(0, Hi.getValueType()),
+                                   ISD::SETLT);
+
+    // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+    SDValue FudgePtr = DAG.getConstantPool(
+                               ConstantInt::get(*DAG.getContext(), FF.zext(64)),
+                                           TLI.getPointerTy());
+
+    // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
+    SDValue Zero = DAG.getIntPtrConstant(0);
+    SDValue Four = DAG.getIntPtrConstant(4);
+    if (TLI.isBigEndian()) std::swap(Zero, Four);
+    SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
+                                 Zero, Four);
+    unsigned Alignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlignment();
+    FudgePtr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), FudgePtr, Offset);
+    Alignment = std::min(Alignment, 4u);
+
+    // Load the value out, extending it from f32 to the destination float type.
+    // FIXME: Avoid the extend by constructing the right constant pool?
+    SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, DstVT, dl, DAG.getEntryNode(),
+                                   FudgePtr, NULL, 0, MVT::f32,
+                                   false, false, Alignment);
+    return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge);
+  }
+
+  // Otherwise, use a libcall.
+  RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+         "Don't know how to expand this UINT_TO_FP!");
+  return MakeLibCall(LC, DstVT, &Op, 1, true, dl);
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
new file mode 100644
index 0000000..6e56c98
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -0,0 +1,1114 @@
+//===-- LegalizeTypes.cpp - Common code for DAG type legalizer ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::LegalizeTypes method.  It transforms
+// an arbitrary well-formed SelectionDAG to only consist of legal types.  This
+// is common code shared among the LegalizeTypes*.cpp files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static cl::opt<bool>
+EnableExpensiveChecks("enable-legalize-types-checking", cl::Hidden);
+
+/// PerformExpensiveChecks - Do extensive, expensive, sanity checking.
+void DAGTypeLegalizer::PerformExpensiveChecks() {
+  // If a node is not processed, then none of its values should be mapped by any
+  // of PromotedIntegers, ExpandedIntegers, ..., ReplacedValues.
+
+  // If a node is processed, then each value with an illegal type must be mapped
+  // by exactly one of PromotedIntegers, ExpandedIntegers, ..., ReplacedValues.
+  // Values with a legal type may be mapped by ReplacedValues, but not by any of
+  // the other maps.
+
+  // Note that these invariants may not hold momentarily when processing a node:
+  // the node being processed may be put in a map before being marked Processed.
+
+  // Note that it is possible to have nodes marked NewNode in the DAG.  This can
+  // occur in two ways.  Firstly, a node may be created during legalization but
+  // never passed to the legalization core.  This is usually due to the implicit
+  // folding that occurs when using the DAG.getNode operators.  Secondly, a new
+  // node may be passed to the legalization core, but when analyzed may morph
+  // into a different node, leaving the original node as a NewNode in the DAG.
+  // A node may morph if one of its operands changes during analysis.  Whether
+  // it actually morphs or not depends on whether, after updating its operands,
+  // it is equivalent to an existing node: if so, it morphs into that existing
+  // node (CSE).  An operand can change during analysis if the operand is a new
+  // node that morphs, or it is a processed value that was mapped to some other
+  // value (as recorded in ReplacedValues) in which case the operand is turned
+  // into that other value.  If a node morphs then the node it morphed into will
+  // be used instead of it for legalization, however the original node continues
+  // to live on in the DAG.
+  // The conclusion is that though there may be nodes marked NewNode in the DAG,
+  // all uses of such nodes are also marked NewNode: the result is a fungus of
+  // NewNodes growing on top of the useful nodes, and perhaps using them, but
+  // not used by them.
+
+  // If a value is mapped by ReplacedValues, then it must have no uses, except
+  // by nodes marked NewNode (see above).
+
+  // The final node obtained by mapping by ReplacedValues is not marked NewNode.
+  // Note that ReplacedValues should be applied iteratively.
+
+  // Note that the ReplacedValues map may also map deleted nodes (by iterating
+  // over the DAG we never dereference deleted nodes).  This means that it may
+  // also map nodes marked NewNode if the deallocated memory was reallocated as
+  // another node, and that new node was not seen by the LegalizeTypes machinery
+  // (for example because it was created but not used).  In general, we cannot
+  // distinguish between new nodes and deleted nodes.
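+  // To summarize: an unprocessed node should appear in no map, a processed
+  // value with an illegal type should appear in exactly one map, and a
+  // processed value with a legal type should appear at most in ReplacedValues.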
+  SmallVector<SDNode*, 16> NewNodes;
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = DAG.allnodes_end(); I != E; ++I) {
+    // Remember nodes marked NewNode - they are subject to extra checking below.
+    if (I->getNodeId() == NewNode)
+      NewNodes.push_back(I);
+
+    for (unsigned i = 0, e = I->getNumValues(); i != e; ++i) {
+      SDValue Res(I, i);
+      bool Failed = false;
+
+      unsigned Mapped = 0;
+      if (ReplacedValues.find(Res) != ReplacedValues.end()) {
+        Mapped |= 1;
+        // Check that remapped values are only used by nodes marked NewNode.
+        for (SDNode::use_iterator UI = I->use_begin(), UE = I->use_end();
+             UI != UE; ++UI)
+          if (UI.getUse().getResNo() == i)
+            assert(UI->getNodeId() == NewNode &&
+                   "Remapped value has non-trivial use!");
+
+        // Check that the final result of applying ReplacedValues is not
+        // marked NewNode.
+        SDValue NewVal = ReplacedValues[Res];
+        DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.find(NewVal);
+        while (I != ReplacedValues.end()) {
+          NewVal = I->second;
+          I = ReplacedValues.find(NewVal);
+        }
+        assert(NewVal.getNode()->getNodeId() != NewNode &&
+               "ReplacedValues maps to a new node!");
+      }
+      if (PromotedIntegers.find(Res) != PromotedIntegers.end())
+        Mapped |= 2;
+      if (SoftenedFloats.find(Res) != SoftenedFloats.end())
+        Mapped |= 4;
+      if (ScalarizedVectors.find(Res) != ScalarizedVectors.end())
+        Mapped |= 8;
+      if (ExpandedIntegers.find(Res) != ExpandedIntegers.end())
+        Mapped |= 16;
+      if (ExpandedFloats.find(Res) != ExpandedFloats.end())
+        Mapped |= 32;
+      if (SplitVectors.find(Res) != SplitVectors.end())
+        Mapped |= 64;
+      if (WidenedVectors.find(Res) != WidenedVectors.end())
+        Mapped |= 128;
+
+      if (I->getNodeId() != Processed) {
+        // Since we allow ReplacedValues to map deleted nodes, it may map nodes
+        // marked NewNode too, since a deleted node may have been reallocated as
+        // another node that has not been seen by the LegalizeTypes machinery.
+        if ((I->getNodeId() == NewNode && Mapped > 1) ||
+            (I->getNodeId() != NewNode && Mapped != 0)) {
+          dbgs() << "Unprocessed value in a map!";
+          Failed = true;
+        }
+      } else if (isTypeLegal(Res.getValueType()) || IgnoreNodeResults(I)) {
+        if (Mapped > 1) {
+          dbgs() << "Value with legal type was transformed!";
+          Failed = true;
+        }
+      } else {
+        if (Mapped == 0) {
+          dbgs() << "Processed value not in any map!";
+          Failed = true;
+        } else if (Mapped & (Mapped - 1)) {
+          dbgs() << "Value in multiple maps!";
+          Failed = true;
+        }
+      }
+
+      if (Failed) {
+        if (Mapped & 1)
+          dbgs() << " ReplacedValues";
+        if (Mapped & 2)
+          dbgs() << " PromotedIntegers";
+        if (Mapped & 4)
+          dbgs() << " SoftenedFloats";
+        if (Mapped & 8)
+          dbgs() << " ScalarizedVectors";
+        if (Mapped & 16)
+          dbgs() << " ExpandedIntegers";
+        if (Mapped & 32)
+          dbgs() << " ExpandedFloats";
+        if (Mapped & 64)
+          dbgs() << " SplitVectors";
+        if (Mapped & 128)
+          dbgs() << " WidenedVectors";
+        dbgs() << "\n";
+        llvm_unreachable(0);
+      }
+    }
+  }
+
+  // Check that NewNodes are only used by other NewNodes.
+  for (unsigned i = 0, e = NewNodes.size(); i != e; ++i) {
+    SDNode *N = NewNodes[i];
+    for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+         UI != UE; ++UI)
+      assert(UI->getNodeId() == NewNode && "NewNode used by non-NewNode!");
+  }
+}
+
+/// run - This is the main entry point for the type legalizer.  This does a
+/// top-down traversal of the dag, legalizing types as it goes.  Returns "true"
+/// if it made any changes.
+bool DAGTypeLegalizer::run() {
+  bool Changed = false;
+
+  // Create a dummy node (which is not added to allnodes), that adds a reference
+  // to the root node, preventing it from being deleted, and tracking any
+  // changes of the root.
+  HandleSDNode Dummy(DAG.getRoot());
+  Dummy.setNodeId(Unanalyzed);
+
+  // The root of the dag may dangle to deleted nodes until the type legalizer is
+  // done.  Set it to null to avoid confusion.
+  DAG.setRoot(SDValue());
+
+  // Walk all nodes in the graph, assigning them a NodeId of 'ReadyToProcess'
+  // (and remembering them) if they are leaves and assigning 'Unanalyzed' if
+  // non-leaves.
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = DAG.allnodes_end(); I != E; ++I) {
+    if (I->getNumOperands() == 0) {
+      I->setNodeId(ReadyToProcess);
+      Worklist.push_back(I);
+    } else {
+      I->setNodeId(Unanalyzed);
+    }
+  }
+
+  // Now that we have a set of nodes to process, handle them all.
+  while (!Worklist.empty()) {
+#ifndef XDEBUG
+    if (EnableExpensiveChecks)
+#endif
+      PerformExpensiveChecks();
+
+    SDNode *N = Worklist.back();
+    Worklist.pop_back();
+    assert(N->getNodeId() == ReadyToProcess &&
+           "Node should be ready if on worklist!");
+
+    if (IgnoreNodeResults(N))
+      goto ScanOperands;
+
+    // Scan the values produced by the node, checking to see if any result
+    // types are illegal.
+    for (unsigned i = 0, NumResults = N->getNumValues(); i < NumResults; ++i) {
+      EVT ResultVT = N->getValueType(i);
+      switch (getTypeAction(ResultVT)) {
+      default:
+        assert(false && "Unknown action!");
+      case Legal:
+        break;
+      // The following calls must take care of *all* of the node's results,
+      // not just the illegal result they were passed (this includes results
+      // with a legal type).  Results can be remapped using ReplaceValueWith,
+      // or their promoted/expanded/etc values registered in PromotedIntegers,
+      // ExpandedIntegers etc.
+      case PromoteInteger:
+        PromoteIntegerResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      case ExpandInteger:
+        ExpandIntegerResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      case SoftenFloat:
+        SoftenFloatResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      case ExpandFloat:
+        ExpandFloatResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      case ScalarizeVector:
+        ScalarizeVectorResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      case SplitVector:
+        SplitVectorResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      case WidenVector:
+        WidenVectorResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      }
+    }
+
+ScanOperands:
+    // Scan the operand list for the node, handling any nodes with operands that
+    // are illegal.
+    {
+    unsigned NumOperands = N->getNumOperands();
+    bool NeedsReanalyzing = false;
+    unsigned i;
+    for (i = 0; i != NumOperands; ++i) {
+      if (IgnoreNodeResults(N->getOperand(i).getNode()))
+        continue;
+
+      EVT OpVT = N->getOperand(i).getValueType();
+      switch (getTypeAction(OpVT)) {
+      default:
+        assert(false && "Unknown action!");
+      case Legal:
+        continue;
+      // The following calls must either replace all of the node's results
+      // using ReplaceValueWith, and return "false"; or update the node's
+      // operands in place, and return "true".
+      case PromoteInteger:
+        NeedsReanalyzing = PromoteIntegerOperand(N, i);
+        Changed = true;
+        break;
+      case ExpandInteger:
+        NeedsReanalyzing = ExpandIntegerOperand(N, i);
+        Changed = true;
+        break;
+      case SoftenFloat:
+        NeedsReanalyzing = SoftenFloatOperand(N, i);
+        Changed = true;
+        break;
+      case ExpandFloat:
+        NeedsReanalyzing = ExpandFloatOperand(N, i);
+        Changed = true;
+        break;
+      case ScalarizeVector:
+        NeedsReanalyzing = ScalarizeVectorOperand(N, i);
+        Changed = true;
+        break;
+      case SplitVector:
+        NeedsReanalyzing = SplitVectorOperand(N, i);
+        Changed = true;
+        break;
+      case WidenVector:
+        NeedsReanalyzing = WidenVectorOperand(N, i);
+        Changed = true;
+        break;
+      }
+      break;
+    }
+
+    // The sub-method updated N in place.  Check to see if any operands are new,
+    // and if so, mark them.  If the node needs revisiting, don't mark it
+    // processed or add its users to the worklist yet.
+    if (NeedsReanalyzing) {
+      assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
+      N->setNodeId(NewNode);
+      // Recompute the NodeId and correct processed operands, adding the node to
+      // the worklist if ready.
+      SDNode *M = AnalyzeNewNode(N);
+      if (M == N)
+        // The node didn't morph - nothing special to do, it will be revisited.
+        continue;
+
+      // The node morphed - this is equivalent to legalizing by replacing every
+      // value of N with the corresponding value of M.  So do that now.
+      assert(N->getNumValues() == M->getNumValues() &&
+             "Node morphing changed the number of results!");
+      for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
+        // Replacing the value takes care of remapping the new value.
+        ReplaceValueWith(SDValue(N, i), SDValue(M, i));
+      assert(N->getNodeId() == NewNode && "Unexpected node state!");
+      // The node continues to live on as part of the NewNode fungus that
+      // grows on top of the useful nodes.  Nothing more needs to be done
+      // with it - move on to the next node.
+      continue;
+    }
+
+    if (i == NumOperands) {
+      DEBUG(dbgs() << "Legally typed node: "; N->dump(&DAG); dbgs() << "\n");
+    }
+    }
+NodeDone:
+
+    // If we reach here, the node was processed, potentially creating new nodes.
+    // Mark it as processed and add its users to the worklist as appropriate.
+    assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
+    N->setNodeId(Processed);
+
+    for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
+         UI != E; ++UI) {
+      SDNode *User = *UI;
+      int NodeId = User->getNodeId();
+
+      // The user is in one of three states: a new node, an unanalyzed node,
+      // or a node whose NodeId counts how many of its operands have not yet
+      // been processed.
+      if (NodeId > 0) {
+        User->setNodeId(NodeId-1);
+
+        // If this was the last use it was waiting on, add it to the ready list.
+        if (NodeId-1 == ReadyToProcess)
+          Worklist.push_back(User);
+        continue;
+      }
+
+      // If this is an unreachable new node, then ignore it.  If it ever becomes
+      // reachable by being used by a newly created node then it will be handled
+      // by AnalyzeNewNode.
+      if (NodeId == NewNode)
+        continue;
+
+      // Otherwise, this node is new: this is the first operand of it that
+      // became ready.  Its new NodeId is the number of operands it has minus 1
+      // (as this node is now processed).
+      assert(NodeId == Unanalyzed && "Unknown node ID!");
+      User->setNodeId(User->getNumOperands() - 1);
+
+      // If the node only has a single operand, it is now ready.
+      if (User->getNumOperands() == 1)
+        Worklist.push_back(User);
+    }
+  }
+
+#ifndef XDEBUG
+  if (EnableExpensiveChecks)
+#endif
+    PerformExpensiveChecks();
+
+  // If the root changed (e.g. it was a dead load) update the root.
+  DAG.setRoot(Dummy.getValue());
+
+  // Remove dead nodes.  This is important to do for cleanliness but also before
+  // the checking loop below.  Implicit folding by the DAG.getNode operators and
+  // node morphing can cause unreachable nodes to be around with their flags set
+  // to new.
+  DAG.RemoveDeadNodes();
+
+  // In a debug build, scan all the nodes to make sure we found them all.  This
+  // ensures that there are no cycles and that everything got processed.
+#ifndef NDEBUG
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = DAG.allnodes_end(); I != E; ++I) {
+    bool Failed = false;
+
+    // Check that all result types are legal.
+    if (!IgnoreNodeResults(I))
+      for (unsigned i = 0, NumVals = I->getNumValues(); i < NumVals; ++i)
+        if (!isTypeLegal(I->getValueType(i))) {
+          dbgs() << "Result type " << i << " illegal!\n";
+          Failed = true;
+        }
+
+    // Check that all operand types are legal.
+    for (unsigned i = 0, NumOps = I->getNumOperands(); i < NumOps; ++i)
+      if (!IgnoreNodeResults(I->getOperand(i).getNode()) &&
+          !isTypeLegal(I->getOperand(i).getValueType())) {
+        dbgs() << "Operand type " << i << " illegal!\n";
+        Failed = true;
+      }
+
+    if (I->getNodeId() != Processed) {
+       if (I->getNodeId() == NewNode)
+         dbgs() << "New node not analyzed?\n";
+       else if (I->getNodeId() == Unanalyzed)
+         dbgs() << "Unanalyzed node not noticed?\n";
+       else if (I->getNodeId() > 0)
+         dbgs() << "Operand not processed?\n";
+       else if (I->getNodeId() == ReadyToProcess)
+         dbgs() << "Not added to worklist?\n";
+       Failed = true;
+    }
+
+    if (Failed) {
+      I->dump(&DAG); dbgs() << "\n";
+      llvm_unreachable(0);
+    }
+  }
+#endif
+
+  return Changed;
+}
+
+/// AnalyzeNewNode - The specified node is the root of a subtree of potentially
+/// new nodes.  Correct any processed operands (this may change the node) and
+/// calculate the NodeId.  If the node itself changes to a processed node, it
+/// is not remapped - the caller needs to take care of this.
+/// Returns the potentially changed node.
+SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) {
+  // If this was an existing node that is already done, we're done.
+  if (N->getNodeId() != NewNode && N->getNodeId() != Unanalyzed)
+    return N;
+
+  // Remove any stale map entries.
+  ExpungeNode(N);
+
+  // Okay, we know that this node is new.  Recursively walk all of its operands
+  // to see if they are new also.  The depth of this walk is bounded by the size
+  // of the new tree that was constructed (usually 2-3 nodes), so we don't worry
+  // about revisiting of nodes.
+  //
+  // As we walk the operands, keep track of how many of them are processed.
+  // The node's new NodeId is its operand count minus that number, i.e. the
+  // number of operands still waiting to be processed.
+  // Operands may morph when they are analyzed.  If so, the node will be
+  // updated after all operands have been analyzed.  Since this is rare,
+  // the code tries to minimize overhead in the non-morphing case.
+
+  SmallVector<SDValue, 8> NewOps;
+  unsigned NumProcessed = 0;
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDValue OrigOp = N->getOperand(i);
+    SDValue Op = OrigOp;
+
+    AnalyzeNewValue(Op); // Op may morph.
+
+    if (Op.getNode()->getNodeId() == Processed)
+      ++NumProcessed;
+
+    if (!NewOps.empty()) {
+      // Some previous operand changed.  Add this one to the list.
+      NewOps.push_back(Op);
+    } else if (Op != OrigOp) {
+      // This is the first operand to change - add all operands so far.
+      NewOps.append(N->op_begin(), N->op_begin() + i);
+      NewOps.push_back(Op);
+    }
+  }
+
+  // Some operands changed - update the node.
+  if (!NewOps.empty()) {
+    SDNode *M = DAG.UpdateNodeOperands(N, &NewOps[0], NewOps.size());
+    if (M != N) {
+      // The node morphed into a different node.  Normally for this to happen
+      // the original node would have to be marked NewNode.  However this can
+      // in theory momentarily not be the case while ReplaceValueWith is doing
+      // its stuff.  Mark the original node NewNode to help sanity checking.
+      N->setNodeId(NewNode);
+      if (M->getNodeId() != NewNode && M->getNodeId() != Unanalyzed)
+        // It morphed into a previously analyzed node - nothing more to do.
+        return M;
+
+      // It morphed into a different new node.  Do the equivalent of passing
+      // it to AnalyzeNewNode: expunge it and calculate the NodeId.  No need
+      // to remap the operands, since they are the same as the operands we
+      // remapped above.
+      N = M;
+      ExpungeNode(N);
+    }
+  }
+
+  // Calculate the NodeId.
+  N->setNodeId(N->getNumOperands() - NumProcessed);
+  if (N->getNodeId() == ReadyToProcess)
+    Worklist.push_back(N);
+
+  return N;
+}
+
+/// AnalyzeNewValue - Call AnalyzeNewNode, updating the node in Val if needed.
+/// If the node changes to a processed node, then remap it.
+void DAGTypeLegalizer::AnalyzeNewValue(SDValue &Val) {
+  Val.setNode(AnalyzeNewNode(Val.getNode()));
+  if (Val.getNode()->getNodeId() == Processed)
+    // We were passed a processed node, or it morphed into one - remap it.
+    RemapValue(Val);
+}
+
+/// ExpungeNode - If N has a bogus mapping in ReplacedValues, eliminate it.
+/// This can occur when a node is deleted then reallocated as a new node -
+/// the mapping in ReplacedValues applies to the deleted node, not the new
+/// one.
+/// The only map that can have a deleted node as a source is ReplacedValues.
+/// Other maps can have deleted nodes as targets, but since their looked-up
+/// values are always immediately remapped using RemapValue, resulting in a
+/// not-deleted node, this is harmless as long as ReplacedValues/RemapValue
+/// always performs correct mappings.  In order to keep the mapping correct,
+/// ExpungeNode should be called on any new nodes *before* adding them as
+/// either source or target to ReplacedValues (which typically means calling
+/// ExpungeNode when a new node is first seen, since it may no longer be marked
+/// NewNode by the time it is added to ReplacedValues).
+void DAGTypeLegalizer::ExpungeNode(SDNode *N) {
+  if (N->getNodeId() != NewNode)
+    return;
+
+  // If N is not remapped by ReplacedValues then there is nothing to do.
+  unsigned i, e;
+  for (i = 0, e = N->getNumValues(); i != e; ++i)
+    if (ReplacedValues.find(SDValue(N, i)) != ReplacedValues.end())
+      break;
+
+  if (i == e)
+    return;
+
+  // Remove N from all maps - this is expensive but rare.
+
+  for (DenseMap<SDValue, SDValue>::iterator I = PromotedIntegers.begin(),
+       E = PromotedIntegers.end(); I != E; ++I) {
+    assert(I->first.getNode() != N);
+    RemapValue(I->second);
+  }
+
+  for (DenseMap<SDValue, SDValue>::iterator I = SoftenedFloats.begin(),
+       E = SoftenedFloats.end(); I != E; ++I) {
+    assert(I->first.getNode() != N);
+    RemapValue(I->second);
+  }
+
+  for (DenseMap<SDValue, SDValue>::iterator I = ScalarizedVectors.begin(),
+       E = ScalarizedVectors.end(); I != E; ++I) {
+    assert(I->first.getNode() != N);
+    RemapValue(I->second);
+  }
+
+  for (DenseMap<SDValue, SDValue>::iterator I = WidenedVectors.begin(),
+       E = WidenedVectors.end(); I != E; ++I) {
+    assert(I->first.getNode() != N);
+    RemapValue(I->second);
+  }
+
+  for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
+       I = ExpandedIntegers.begin(), E = ExpandedIntegers.end(); I != E; ++I){
+    assert(I->first.getNode() != N);
+    RemapValue(I->second.first);
+    RemapValue(I->second.second);
+  }
+
+  for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
+       I = ExpandedFloats.begin(), E = ExpandedFloats.end(); I != E; ++I) {
+    assert(I->first.getNode() != N);
+    RemapValue(I->second.first);
+    RemapValue(I->second.second);
+  }
+
+  for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
+       I = SplitVectors.begin(), E = SplitVectors.end(); I != E; ++I) {
+    assert(I->first.getNode() != N);
+    RemapValue(I->second.first);
+    RemapValue(I->second.second);
+  }
+
+  for (DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.begin(),
+       E = ReplacedValues.end(); I != E; ++I)
+    RemapValue(I->second);
+
+  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
+    ReplacedValues.erase(SDValue(N, i));
+}
+
+/// RemapValue - If the specified value was already legalized to another value,
+/// replace it by that value.
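+/// For example, if ReplacedValues contains the chain A -> B and B -> C, then
+/// remapping A yields C, and the A entry is rewritten to map directly to C so
+/// that later lookups take a single hop (path compression, as in union-find).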
+void DAGTypeLegalizer::RemapValue(SDValue &N) {
+  DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.find(N);
+  if (I != ReplacedValues.end()) {
+    // Use path compression to speed up future lookups if values get multiply
+    // replaced with other values.
+    RemapValue(I->second);
+    N = I->second;
+    assert(N.getNode()->getNodeId() != NewNode && "Mapped to new node!");
+  }
+}
+
+namespace {
+  /// NodeUpdateListener - This class is a DAGUpdateListener that listens for
+  /// updates to nodes and recomputes their ready state.
+  class NodeUpdateListener : public SelectionDAG::DAGUpdateListener {
+    DAGTypeLegalizer &DTL;
+    SmallSetVector<SDNode*, 16> &NodesToAnalyze;
+  public:
+    explicit NodeUpdateListener(DAGTypeLegalizer &dtl,
+                                SmallSetVector<SDNode*, 16> &nta)
+      : DTL(dtl), NodesToAnalyze(nta) {}
+
+    virtual void NodeDeleted(SDNode *N, SDNode *E) {
+      assert(N->getNodeId() != DAGTypeLegalizer::ReadyToProcess &&
+             N->getNodeId() != DAGTypeLegalizer::Processed &&
+             "Invalid node ID for RAUW deletion!");
+      // It is possible, though rare, for the deleted node N to occur as a
+      // target in a map, so note the replacement N -> E in ReplacedValues.
+      assert(E && "Node not replaced?");
+      DTL.NoteDeletion(N, E);
+
+      // In theory the deleted node could also have been scheduled for analysis.
+      // So remove it from the set of nodes which will be analyzed.
+      NodesToAnalyze.remove(N);
+
+      // In general nothing needs to be done for E, since it didn't change but
+      // only gained new uses.  However N -> E was just added to ReplacedValues,
+      // and the result of a ReplacedValues mapping is not allowed to be marked
+      // NewNode.  So if E is marked NewNode, then it needs to be analyzed.
+      if (E->getNodeId() == DAGTypeLegalizer::NewNode)
+        NodesToAnalyze.insert(E);
+    }
+
+    virtual void NodeUpdated(SDNode *N) {
+      // Node updates can mean pretty much anything.  For example, an operand
+      // may have been set to something already processed, in which case this
+      // node could become ready.  Recompute its flags.
+      assert(N->getNodeId() != DAGTypeLegalizer::ReadyToProcess &&
+             N->getNodeId() != DAGTypeLegalizer::Processed &&
+             "Invalid node ID for RAUW deletion!");
+      N->setNodeId(DAGTypeLegalizer::NewNode);
+      NodesToAnalyze.insert(N);
+    }
+  };
+}
+
+
+/// ReplaceValueWith - The specified value was legalized to the specified other
+/// value.  Update the DAG and NodeIds replacing any uses of From to use To
+/// instead.
+void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
+  assert(From.getNode() != To.getNode() && "Potential legalization loop!");
+
+  // If expansion produced new nodes, make sure they are properly marked.
+  ExpungeNode(From.getNode());
+  AnalyzeNewValue(To); // Expunges To.
+
+  // Anything that used the old node should now use the new one.  Note that this
+  // can potentially cause recursive merging.
+  SmallSetVector<SDNode*, 16> NodesToAnalyze;
+  NodeUpdateListener NUL(*this, NodesToAnalyze);
+  do {
+    DAG.ReplaceAllUsesOfValueWith(From, To, &NUL);
+
+    // The old node may still be present in a map like ExpandedIntegers or
+    // PromotedIntegers.  Inform maps about the replacement.
+    ReplacedValues[From] = To;
+
+    // Process the list of nodes that need to be reanalyzed.
+    while (!NodesToAnalyze.empty()) {
+      SDNode *N = NodesToAnalyze.back();
+      NodesToAnalyze.pop_back();
+      if (N->getNodeId() != DAGTypeLegalizer::NewNode)
+        // The node was analyzed while reanalyzing an earlier node - it is safe
+        // to skip.  Note that this is not a morphing node - otherwise it would
+        // still be marked NewNode.
+        continue;
+
+      // Analyze the node's operands and recalculate the node ID.
+      SDNode *M = AnalyzeNewNode(N);
+      if (M != N) {
+        // The node morphed into a different node.  Make everyone use the new
+        // node instead.
+        assert(M->getNodeId() != NewNode && "Analysis resulted in NewNode!");
+        assert(N->getNumValues() == M->getNumValues() &&
+               "Node morphing changed the number of results!");
+        for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+          SDValue OldVal(N, i);
+          SDValue NewVal(M, i);
+          if (M->getNodeId() == Processed)
+            RemapValue(NewVal);
+          DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal, &NUL);
+        }
+        // The original node continues to exist in the DAG, marked NewNode.
+      }
+    }
+    // When recursively updating nodes with new nodes, it is possible for CSE
+    // to create new uses of From.  If this happens, replace those new uses
+    // of From with To as well.
+  } while (!From.use_empty());
+}
+
+void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) {
+  assert(Result.getValueType() ==
+         TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+         "Invalid type for promoted integer");
+  AnalyzeNewValue(Result);
+
+  SDValue &OpEntry = PromotedIntegers[Op];
+  assert(OpEntry.getNode() == 0 && "Node is already promoted!");
+  OpEntry = Result;
+}
+
+void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) {
+  assert(Result.getValueType() ==
+         TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+         "Invalid type for softened float");
+  AnalyzeNewValue(Result);
+
+  SDValue &OpEntry = SoftenedFloats[Op];
+  assert(OpEntry.getNode() == 0 && "Node is already converted to integer!");
+  OpEntry = Result;
+}
+
+void DAGTypeLegalizer::SetScalarizedVector(SDValue Op, SDValue Result) {
+  assert(Result.getValueType() == Op.getValueType().getVectorElementType() &&
+         "Invalid type for scalarized vector");
+  AnalyzeNewValue(Result);
+
+  SDValue &OpEntry = ScalarizedVectors[Op];
+  assert(OpEntry.getNode() == 0 && "Node is already scalarized!");
+  OpEntry = Result;
+}
+
+void DAGTypeLegalizer::GetExpandedInteger(SDValue Op, SDValue &Lo,
+                                          SDValue &Hi) {
+  std::pair<SDValue, SDValue> &Entry = ExpandedIntegers[Op];
+  RemapValue(Entry.first);
+  RemapValue(Entry.second);
+  assert(Entry.first.getNode() && "Operand isn't expanded");
+  Lo = Entry.first;
+  Hi = Entry.second;
+}
+
+void DAGTypeLegalizer::SetExpandedInteger(SDValue Op, SDValue Lo,
+                                          SDValue Hi) {
+  assert(Lo.getValueType() ==
+         TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+         Hi.getValueType() == Lo.getValueType() &&
+         "Invalid type for expanded integer");
+  // Lo/Hi may have been newly allocated, if so, add nodeid's as relevant.
+  AnalyzeNewValue(Lo);
+  AnalyzeNewValue(Hi);
+
+  // Remember that this is the result of the node.
+  std::pair<SDValue, SDValue> &Entry = ExpandedIntegers[Op];
+  assert(Entry.first.getNode() == 0 && "Node already expanded");
+  Entry.first = Lo;
+  Entry.second = Hi;
+}
+
+void DAGTypeLegalizer::GetExpandedFloat(SDValue Op, SDValue &Lo,
+                                        SDValue &Hi) {
+  std::pair<SDValue, SDValue> &Entry = ExpandedFloats[Op];
+  RemapValue(Entry.first);
+  RemapValue(Entry.second);
+  assert(Entry.first.getNode() && "Operand isn't expanded");
+  Lo = Entry.first;
+  Hi = Entry.second;
+}
+
+void DAGTypeLegalizer::SetExpandedFloat(SDValue Op, SDValue Lo,
+                                        SDValue Hi) {
+  assert(Lo.getValueType() ==
+         TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+         Hi.getValueType() == Lo.getValueType() &&
+         "Invalid type for expanded float");
+  // Lo/Hi may have been newly allocated, if so, add nodeid's as relevant.
+  AnalyzeNewValue(Lo);
+  AnalyzeNewValue(Hi);
+
+  // Remember that this is the result of the node.
+  std::pair<SDValue, SDValue> &Entry = ExpandedFloats[Op];
+  assert(Entry.first.getNode() == 0 && "Node already expanded");
+  Entry.first = Lo;
+  Entry.second = Hi;
+}
+
+void DAGTypeLegalizer::GetSplitVector(SDValue Op, SDValue &Lo,
+                                      SDValue &Hi) {
+  std::pair<SDValue, SDValue> &Entry = SplitVectors[Op];
+  RemapValue(Entry.first);
+  RemapValue(Entry.second);
+  assert(Entry.first.getNode() && "Operand isn't split");
+  Lo = Entry.first;
+  Hi = Entry.second;
+}
+
+void DAGTypeLegalizer::SetSplitVector(SDValue Op, SDValue Lo,
+                                      SDValue Hi) {
+  assert(Lo.getValueType().getVectorElementType() ==
+         Op.getValueType().getVectorElementType() &&
+         2*Lo.getValueType().getVectorNumElements() ==
+         Op.getValueType().getVectorNumElements() &&
+         Hi.getValueType() == Lo.getValueType() &&
+         "Invalid type for split vector");
+  // Lo/Hi may have been newly allocated, if so, add nodeid's as relevant.
+  AnalyzeNewValue(Lo);
+  AnalyzeNewValue(Hi);
+
+  // Remember that this is the result of the node.
+  std::pair<SDValue, SDValue> &Entry = SplitVectors[Op];
+  assert(Entry.first.getNode() == 0 && "Node already split");
+  Entry.first = Lo;
+  Entry.second = Hi;
+}
+
+void DAGTypeLegalizer::SetWidenedVector(SDValue Op, SDValue Result) {
+  assert(Result.getValueType() ==
+         TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+         "Invalid type for widened vector");
+  AnalyzeNewValue(Result);
+
+  SDValue &OpEntry = WidenedVectors[Op];
+  assert(OpEntry.getNode() == 0 && "Node already widened!");
+  OpEntry = Result;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Utilities.
+//===----------------------------------------------------------------------===//
+
+/// BitConvertToInteger - Convert to an integer of the same size.
+SDValue DAGTypeLegalizer::BitConvertToInteger(SDValue Op) {
+  unsigned BitWidth = Op.getValueType().getSizeInBits();
+  return DAG.getNode(ISD::BIT_CONVERT, Op.getDebugLoc(),
+                     EVT::getIntegerVT(*DAG.getContext(), BitWidth), Op);
+}
+
+/// BitConvertVectorToIntegerVector - Convert to a vector of integers of the
+/// same size.
+SDValue DAGTypeLegalizer::BitConvertVectorToIntegerVector(SDValue Op) {
+  assert(Op.getValueType().isVector() && "Only applies to vectors!");
+  unsigned EltWidth = Op.getValueType().getVectorElementType().getSizeInBits();
+  EVT EltNVT = EVT::getIntegerVT(*DAG.getContext(), EltWidth);
+  unsigned NumElts = Op.getValueType().getVectorNumElements();
+  return DAG.getNode(ISD::BIT_CONVERT, Op.getDebugLoc(),
+                     EVT::getVectorVT(*DAG.getContext(), EltNVT, NumElts), Op);
+}
+
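+/// CreateStackStoreLoad - Emit a store/load combination through an aligned
+/// stack slot, in effect reinterpreting the bits of Op as type DestVT.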
+SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
+                                               EVT DestVT) {
+  DebugLoc dl = Op.getDebugLoc();
+  // Create the stack frame object.  Make sure it is aligned for both
+  // the source and destination types.
+  SDValue StackPtr = DAG.CreateStackTemporary(Op.getValueType(), DestVT);
+  // Emit a store to the stack slot.
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr, NULL, 0,
+                               false, false, 0);
+  // Result is a load from the stack slot.
+  return DAG.getLoad(DestVT, dl, Store, StackPtr, NULL, 0, false, false, 0);
+}
+
+/// CustomLowerNode - Replace the node's results with custom code provided
+/// by the target and return "true", or do nothing and return "false".
+/// If LegalizeResult is true, the node has illegal result types and VT is the
+/// type of the illegal result being legalized.  If it is false, the node has
+/// legal result types but an illegal operand, and VT is the type of that
+/// operand.
+bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) {
+  // See if the target wants to custom lower this node.
+  if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
+    return false;
+
+  SmallVector<SDValue, 8> Results;
+  if (LegalizeResult)
+    TLI.ReplaceNodeResults(N, Results, DAG);
+  else
+    TLI.LowerOperationWrapper(N, Results, DAG);
+
+  if (Results.empty())
+    // The target didn't want to custom lower it after all.
+    return false;
+
+  // Make everything that once used N's values now use those in Results instead.
+  assert(Results.size() == N->getNumValues() &&
+         "Custom lowering returned the wrong number of results!");
+  for (unsigned i = 0, e = Results.size(); i != e; ++i)
+    ReplaceValueWith(SDValue(N, i), Results[i]);
+  return true;
+}
+
+
+/// CustomWidenLowerNode - Widen the node's results with custom code provided
+/// by the target and return "true", or do nothing and return "false".
+bool DAGTypeLegalizer::CustomWidenLowerNode(SDNode *N, EVT VT) {
+  // See if the target wants to custom lower this node.
+  if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
+    return false;
+
+  SmallVector<SDValue, 8> Results;
+  TLI.ReplaceNodeResults(N, Results, DAG);
+
+  if (Results.empty())
+    // The target didn't want to custom widen-lower its result after all.
+    return false;
+
+  // Update the widening map.
+  assert(Results.size() == N->getNumValues() &&
+         "Custom lowering returned the wrong number of results!");
+  for (unsigned i = 0, e = Results.size(); i != e; ++i)
+    SetWidenedVector(SDValue(N, i), Results[i]);
+  return true;
+}
+
+/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
+/// which is split into two not necessarily identical pieces.
+void DAGTypeLegalizer::GetSplitDestVTs(EVT InVT, EVT &LoVT, EVT &HiVT) {
+  // Currently all types are split in half.
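+  // For example, v8i32 splits into two v4i32 halves, while i128 splits into
+  // two copies of whatever type i128 transforms to (two i64 on a typical
+  // 64 bit target).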
+  if (!InVT.isVector()) {
+    LoVT = HiVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+  } else {
+    unsigned NumElements = InVT.getVectorNumElements();
+    assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+    LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
+                                   InVT.getVectorElementType(), NumElements/2);
+  }
+}
+
+/// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
+/// high parts of the given value.
+void DAGTypeLegalizer::GetPairElements(SDValue Pair,
+                                       SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = Pair.getDebugLoc();
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Pair.getValueType());
+  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NVT, Pair,
+                   DAG.getIntPtrConstant(0));
+  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NVT, Pair,
+                   DAG.getIntPtrConstant(1));
+}
+
+SDValue DAGTypeLegalizer::GetVectorElementPointer(SDValue VecPtr, EVT EltVT,
+                                                  SDValue Index) {
+  DebugLoc dl = Index.getDebugLoc();
+  // Make sure the index type is big enough to compute in.
+  if (Index.getValueType().bitsGT(TLI.getPointerTy()))
+    Index = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Index);
+  else
+    Index = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Index);
+
+  // Calculate the element offset and add it to the pointer.
+  unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size.
+
+  Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index,
+                      DAG.getConstant(EltSize, Index.getValueType()));
+  return DAG.getNode(ISD::ADD, dl, Index.getValueType(), Index, VecPtr);
+}
+
+/// JoinIntegers - Build an integer with low bits Lo and high bits Hi.
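+/// For example, joining Lo = i32 0x89ABCDEF and Hi = i32 0x01234567 produces
+/// the i64 value 0x0123456789ABCDEF.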
+SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
+  // Arbitrarily use dlHi for result DebugLoc
+  DebugLoc dlHi = Hi.getDebugLoc();
+  DebugLoc dlLo = Lo.getDebugLoc();
+  EVT LVT = Lo.getValueType();
+  EVT HVT = Hi.getValueType();
+  EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
+                              LVT.getSizeInBits() + HVT.getSizeInBits());
+
+  Lo = DAG.getNode(ISD::ZERO_EXTEND, dlLo, NVT, Lo);
+  Hi = DAG.getNode(ISD::ANY_EXTEND, dlHi, NVT, Hi);
+  Hi = DAG.getNode(ISD::SHL, dlHi, NVT, Hi,
+                   DAG.getConstant(LVT.getSizeInBits(), TLI.getPointerTy()));
+  return DAG.getNode(ISD::OR, dlHi, NVT, Lo, Hi);
+}
+
+/// LibCallify - Convert the node into a libcall with the same prototype.
+SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N,
+                                     bool isSigned) {
+  unsigned NumOps = N->getNumOperands();
+  DebugLoc dl = N->getDebugLoc();
+  if (NumOps == 0) {
+    return MakeLibCall(LC, N->getValueType(0), 0, 0, isSigned, dl);
+  } else if (NumOps == 1) {
+    SDValue Op = N->getOperand(0);
+    return MakeLibCall(LC, N->getValueType(0), &Op, 1, isSigned, dl);
+  } else if (NumOps == 2) {
+    SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+    return MakeLibCall(LC, N->getValueType(0), Ops, 2, isSigned, dl);
+  }
+  SmallVector<SDValue, 8> Ops(NumOps);
+  for (unsigned i = 0; i < NumOps; ++i)
+    Ops[i] = N->getOperand(i);
+
+  return MakeLibCall(LC, N->getValueType(0), &Ops[0], NumOps, isSigned, dl);
+}
+
+/// MakeLibCall - Generate a libcall taking the given operands as arguments and
+/// returning a result of type RetVT.
+SDValue DAGTypeLegalizer::MakeLibCall(RTLIB::Libcall LC, EVT RetVT,
+                                      const SDValue *Ops, unsigned NumOps,
+                                      bool isSigned, DebugLoc dl) {
+  TargetLowering::ArgListTy Args;
+  Args.reserve(NumOps);
+
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    Entry.Node = Ops[i];
+    Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                         TLI.getPointerTy());
+
+  const Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+  std::pair<SDValue,SDValue> CallInfo =
+    TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
+                    false, 0, TLI.getLibcallCallingConv(LC), false,
+                    /*isReturnValueUsed=*/true,
+                    Callee, Args, DAG, dl);
+  return CallInfo.first;
+}
+
+/// PromoteTargetBoolean - Promote the given target boolean to a target boolean
+/// of the given type.  A target boolean is an integer value, not necessarily of
+/// type i1, the bits of which conform to getBooleanContents.
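+/// For example, under ZeroOrNegativeOneBooleanContent a true value promoted
+/// to i32 must become 0xFFFFFFFF, which is why SIGN_EXTEND is used below.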
+SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT VT) {
+  DebugLoc dl = Bool.getDebugLoc();
+  ISD::NodeType ExtendCode;
+  switch (TLI.getBooleanContents()) {
+  default:
+    assert(false && "Unknown BooleanContent!");
+  case TargetLowering::UndefinedBooleanContent:
+    // Extend to VT by adding rubbish bits.
+    ExtendCode = ISD::ANY_EXTEND;
+    break;
+  case TargetLowering::ZeroOrOneBooleanContent:
+    // Extend to VT by adding zero bits.
+    ExtendCode = ISD::ZERO_EXTEND;
+    break;
+  case TargetLowering::ZeroOrNegativeOneBooleanContent: {
+    // Extend to VT by copying the sign bit.
+    ExtendCode = ISD::SIGN_EXTEND;
+    break;
+  }
+  }
+  return DAG.getNode(ExtendCode, dl, VT, Bool);
+}
+
+/// SplitInteger - Return the lower LoVT bits of Op in Lo and the upper HiVT
+/// bits in Hi.
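+/// For example, splitting the i64 value 0x0123456789ABCDEF with
+/// LoVT = HiVT = i32 gives Lo = 0x89ABCDEF and Hi = 0x01234567.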
+void DAGTypeLegalizer::SplitInteger(SDValue Op,
+                                    EVT LoVT, EVT HiVT,
+                                    SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = Op.getDebugLoc();
+  assert(LoVT.getSizeInBits() + HiVT.getSizeInBits() ==
+         Op.getValueType().getSizeInBits() && "Invalid integer splitting!");
+  Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Op);
+  Hi = DAG.getNode(ISD::SRL, dl, Op.getValueType(), Op,
+                   DAG.getConstant(LoVT.getSizeInBits(), TLI.getPointerTy()));
+  Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
+}
+
+/// SplitInteger - Return the lower and upper halves of Op's bits in a value
+/// type half the size of Op's.
+void DAGTypeLegalizer::SplitInteger(SDValue Op,
+                                    SDValue &Lo, SDValue &Hi) {
+  EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(),
+                                 Op.getValueType().getSizeInBits()/2);
+  SplitInteger(Op, HalfVT, HalfVT, Lo, Hi);
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Entry Point
+//===----------------------------------------------------------------------===//
+
+/// LegalizeTypes - This transforms the SelectionDAG into a SelectionDAG that
+/// only uses types natively supported by the target.  Returns "true" if it made
+/// any changes.
+///
+/// Note that this is an involved process that may invalidate pointers into
+/// the graph.
+bool SelectionDAG::LegalizeTypes() {
+  return DAGTypeLegalizer(*this).run();
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeTypes.h
new file mode 100644
index 0000000..bd86694
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -0,0 +1,744 @@
+//===-- LegalizeTypes.h - Definition of the DAG Type Legalizer class ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DAGTypeLegalizer class.  This is a private interface
+// shared among the LegalizeTypes*.cpp files that together implement the
+// SelectionDAG::LegalizeTypes method.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SELECTIONDAG_LEGALIZETYPES_H
+#define SELECTIONDAG_LEGALIZETYPES_H
+
+#define DEBUG_TYPE "legalize-types"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+/// DAGTypeLegalizer - This takes an arbitrary SelectionDAG as input and hacks
+/// on it until only value types the target machine can handle are left.  This
+/// involves promoting small sizes to large sizes or splitting up large values
+/// into small values.
+///
+class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
+  const TargetLowering &TLI;
+  SelectionDAG &DAG;
+public:
+  // NodeIdFlags - This pass uses the NodeId on the SDNodes to hold information
+  // about the state of the node.  The enum has all the values.
+  enum NodeIdFlags {
+    /// ReadyToProcess - All operands have been processed, so this node is ready
+    /// to be handled.
+    ReadyToProcess = 0,
+
+    /// NewNode - This is a new node, not before seen, that was created in the
+    /// process of legalizing some other node.
+    NewNode = -1,
+
+    /// Unanalyzed - This node's ID needs to be set to the number of its
+    /// unprocessed operands.
+    Unanalyzed = -2,
+
+    /// Processed - This is a node that has already been processed.
+    Processed = -3
+
+    // 1+ - This is a node which has this many unprocessed operands.
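+    // For example, a node with three unprocessed operands starts with
+    // NodeId 3; the count drops as each operand is processed, and on
+    // reaching 0 (ReadyToProcess) the node is added to the worklist.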
+  };
+private:
+  enum LegalizeAction {
+    Legal,           // The target natively supports this type.
+    PromoteInteger,  // Replace this integer type with a larger one.
+    ExpandInteger,   // Split this integer type into two of half the size.
+    SoftenFloat,     // Convert this float type to a same size integer type.
+    ExpandFloat,     // Split this float type into two of half the size.
+    ScalarizeVector, // Replace this one-element vector with its element type.
+    SplitVector,     // Split this vector type into two of half the size.
+    WidenVector      // This vector type should be widened into a larger vector.
+  };
+
+  /// ValueTypeActions - This is a bitvector that contains two bits for each
+  /// simple value type, where the two bits correspond to the LegalizeAction
+  /// enum from TargetLowering.  This can be queried with "getTypeAction(VT)".
+  TargetLowering::ValueTypeActionImpl ValueTypeActions;
+
+  /// getTypeAction - Return how we should legalize values of this type.
+  LegalizeAction getTypeAction(EVT VT) const {
+    switch (ValueTypeActions.getTypeAction(*DAG.getContext(), VT)) {
+    default:
+      assert(false && "Unknown legalize action!");
+    case TargetLowering::Legal:
+      return Legal;
+    case TargetLowering::Promote:
+      // Promote can mean
+      //   1) For integers, use a larger integer type (e.g. i8 -> i32).
+      //   2) For vectors, use a wider vector type (e.g. v3i32 -> v4i32).
+      if (!VT.isVector())
+        return PromoteInteger;
+      else
+        return WidenVector;
+    case TargetLowering::Expand:
+      // Expand can mean
+      // 1) split scalar in half, 2) convert a float to an integer,
+      // 3) scalarize a single-element vector, 4) split a vector in two.
+      if (!VT.isVector()) {
+        if (VT.isInteger())
+          return ExpandInteger;
+        else if (VT.getSizeInBits() ==
+                 TLI.getTypeToTransformTo(*DAG.getContext(), VT).getSizeInBits())
+          return SoftenFloat;
+        else
+          return ExpandFloat;
+      } else if (VT.getVectorNumElements() == 1) {
+        return ScalarizeVector;
+      } else {
+        return SplitVector;
+      }
+    }
+  }
+
+  /// isTypeLegal - Return true if this type is legal on this target.
+  bool isTypeLegal(EVT VT) const {
+    return (ValueTypeActions.getTypeAction(*DAG.getContext(), VT) == 
+            TargetLowering::Legal);
+  }
+
+  /// IgnoreNodeResults - Pretend all of this node's results are legal.
+  bool IgnoreNodeResults(SDNode *N) const {
+    return N->getOpcode() == ISD::TargetConstant;
+  }
+
+  /// PromotedIntegers - For integer nodes that are below legal width, this map
+  /// indicates what promoted value to use.
+  DenseMap<SDValue, SDValue> PromotedIntegers;
+
+  /// ExpandedIntegers - For integer nodes that need to be expanded this map
+  /// indicates which operands are the expanded version of the input.
+  DenseMap<SDValue, std::pair<SDValue, SDValue> > ExpandedIntegers;
+
+  /// SoftenedFloats - For floating point nodes converted to integers of
+  /// the same size, this map indicates the converted value to use.
+  DenseMap<SDValue, SDValue> SoftenedFloats;
+
+  /// ExpandedFloats - For float nodes that need to be expanded this map
+  /// indicates which operands are the expanded version of the input.
+  DenseMap<SDValue, std::pair<SDValue, SDValue> > ExpandedFloats;
+
+  /// ScalarizedVectors - For nodes that are <1 x ty>, this map indicates the
+  /// scalar value of type 'ty' to use.
+  DenseMap<SDValue, SDValue> ScalarizedVectors;
+
+  /// SplitVectors - For nodes that need to be split this map indicates
+  /// which operands are the expanded version of the input.
+  DenseMap<SDValue, std::pair<SDValue, SDValue> > SplitVectors;
+
+  /// WidenedVectors - For vector nodes that need to be widened, indicates
+  /// the widened value to use.
+  DenseMap<SDValue, SDValue> WidenedVectors;
+
+  /// ReplacedValues - For values that have been replaced with another,
+  /// indicates the replacement value to use.
+  DenseMap<SDValue, SDValue> ReplacedValues;
+
+  /// Worklist - This defines a worklist of nodes to process.  In order to be
+  /// pushed onto this worklist, all operands of a node must have already been
+  /// processed.
+  SmallVector<SDNode*, 128> Worklist;
+
+public:
+  explicit DAGTypeLegalizer(SelectionDAG &dag)
+    : TLI(dag.getTargetLoweringInfo()), DAG(dag),
+    ValueTypeActions(TLI.getValueTypeActions()) {
+    assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE &&
+           "Too many value types for ValueTypeActions to hold!");
+  }
+
+  /// run - This is the main entry point for the type legalizer.  This does a
+  /// top-down traversal of the dag, legalizing types as it goes.  Returns
+  /// "true" if it made any changes.
+  bool run();
+
+  void NoteDeletion(SDNode *Old, SDNode *New) {
+    ExpungeNode(Old);
+    ExpungeNode(New);
+    for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i)
+      ReplacedValues[SDValue(Old, i)] = SDValue(New, i);
+  }
+
+private:
+  SDNode *AnalyzeNewNode(SDNode *N);
+  void AnalyzeNewValue(SDValue &Val);
+  void ExpungeNode(SDNode *N);
+  void PerformExpensiveChecks();
+  void RemapValue(SDValue &N);
+
+  // Common routines.
+  SDValue BitConvertToInteger(SDValue Op);
+  SDValue BitConvertVectorToIntegerVector(SDValue Op);
+  SDValue CreateStackStoreLoad(SDValue Op, EVT DestVT);
+  bool CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult);
+  bool CustomWidenLowerNode(SDNode *N, EVT VT);
+  SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index);
+  SDValue JoinIntegers(SDValue Lo, SDValue Hi);
+  SDValue LibCallify(RTLIB::Libcall LC, SDNode *N, bool isSigned);
+  SDValue MakeLibCall(RTLIB::Libcall LC, EVT RetVT,
+                      const SDValue *Ops, unsigned NumOps, bool isSigned,
+                      DebugLoc dl);
+  SDValue PromoteTargetBoolean(SDValue Bool, EVT VT);
+  void ReplaceValueWith(SDValue From, SDValue To);
+  void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
+  void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,
+                    SDValue &Lo, SDValue &Hi);
+
+  //===--------------------------------------------------------------------===//
+  // Integer Promotion Support: LegalizeIntegerTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetPromotedInteger - Given a processed operand Op which was promoted to a
+  /// larger integer type, this returns the promoted value.  The low bits of the
+  /// promoted value corresponding to the original type are exactly equal to Op.
+  /// The extra bits contain rubbish, so the promoted value may need to be zero-
+  /// or sign-extended from the original type before it is usable (the helpers
+  /// SExtPromotedInteger and ZExtPromotedInteger can do this for you).
+  /// For example, if Op is an i16 and was promoted to an i32, then this method
+  /// returns an i32, the lower 16 bits of which coincide with Op, and the upper
+  /// 16 bits of which contain rubbish.
+  SDValue GetPromotedInteger(SDValue Op) {
+    SDValue &PromotedOp = PromotedIntegers[Op];
+    RemapValue(PromotedOp);
+    assert(PromotedOp.getNode() && "Operand wasn't promoted?");
+    return PromotedOp;
+  }
+  void SetPromotedInteger(SDValue Op, SDValue Result);
+
+  /// SExtPromotedInteger - Get a promoted operand and sign extend it to the
+  /// final size.
+  SDValue SExtPromotedInteger(SDValue Op) {
+    EVT OldVT = Op.getValueType();
+    DebugLoc dl = Op.getDebugLoc();
+    Op = GetPromotedInteger(Op);
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(), Op,
+                       DAG.getValueType(OldVT));
+  }
+
+  /// ZExtPromotedInteger - Get a promoted operand and zero extend it to the
+  /// final size.
+  SDValue ZExtPromotedInteger(SDValue Op) {
+    EVT OldVT = Op.getValueType();
+    DebugLoc dl = Op.getDebugLoc();
+    Op = GetPromotedInteger(Op);
+    return DAG.getZeroExtendInReg(Op, dl, OldVT);
+  }
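+
+  // Illustrative sketch: callers that need well-defined upper bits re-extend
+  // the promoted value first, e.g. before a signed comparison:
+  //   SDValue L = SExtPromotedInteger(LHS), R = SExtPromotedInteger(RHS);
+  //   Res = DAG.getSetCC(dl, NVT, L, R, ISD::SETLT);
+  // (roughly what PromoteSetCCOperands below does); GetPromotedInteger alone
+  // leaves rubbish in the extra bits.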
+
+  // Integer Result Promotion.
+  void PromoteIntegerResult(SDNode *N, unsigned ResNo);
+  SDValue PromoteIntRes_AssertSext(SDNode *N);
+  SDValue PromoteIntRes_AssertZext(SDNode *N);
+  SDValue PromoteIntRes_Atomic1(AtomicSDNode *N);
+  SDValue PromoteIntRes_Atomic2(AtomicSDNode *N);
+  SDValue PromoteIntRes_BIT_CONVERT(SDNode *N);
+  SDValue PromoteIntRes_BSWAP(SDNode *N);
+  SDValue PromoteIntRes_BUILD_PAIR(SDNode *N);
+  SDValue PromoteIntRes_Constant(SDNode *N);
+  SDValue PromoteIntRes_CONVERT_RNDSAT(SDNode *N);
+  SDValue PromoteIntRes_CTLZ(SDNode *N);
+  SDValue PromoteIntRes_CTPOP(SDNode *N);
+  SDValue PromoteIntRes_CTTZ(SDNode *N);
+  SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
+  SDValue PromoteIntRes_FP32_TO_FP16(SDNode *N);
+  SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
+  SDValue PromoteIntRes_LOAD(LoadSDNode *N);
+  SDValue PromoteIntRes_Overflow(SDNode *N);
+  SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
+  SDValue PromoteIntRes_SDIV(SDNode *N);
+  SDValue PromoteIntRes_SELECT(SDNode *N);
+  SDValue PromoteIntRes_SELECT_CC(SDNode *N);
+  SDValue PromoteIntRes_SETCC(SDNode *N);
+  SDValue PromoteIntRes_SHL(SDNode *N);
+  SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N);
+  SDValue PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N);
+  SDValue PromoteIntRes_SRA(SDNode *N);
+  SDValue PromoteIntRes_SRL(SDNode *N);
+  SDValue PromoteIntRes_TRUNCATE(SDNode *N);
+  SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo);
+  SDValue PromoteIntRes_UDIV(SDNode *N);
+  SDValue PromoteIntRes_UNDEF(SDNode *N);
+  SDValue PromoteIntRes_VAARG(SDNode *N);
+  SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
+
+  // Integer Operand Promotion.
+  bool PromoteIntegerOperand(SDNode *N, unsigned OperandNo);
+  SDValue PromoteIntOp_ANY_EXTEND(SDNode *N);
+  SDValue PromoteIntOp_BIT_CONVERT(SDNode *N);
+  SDValue PromoteIntOp_BUILD_PAIR(SDNode *N);
+  SDValue PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_BUILD_VECTOR(SDNode *N);
+  SDValue PromoteIntOp_CONVERT_RNDSAT(SDNode *N);
+  SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_MEMBARRIER(SDNode *N);
+  SDValue PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N);
+  SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_Shift(SDNode *N);
+  SDValue PromoteIntOp_SIGN_EXTEND(SDNode *N);
+  SDValue PromoteIntOp_SINT_TO_FP(SDNode *N);
+  SDValue PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_TRUNCATE(SDNode *N);
+  SDValue PromoteIntOp_UINT_TO_FP(SDNode *N);
+  SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
+
+  void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
+
+  //===--------------------------------------------------------------------===//
+  // Integer Expansion Support: LegalizeIntegerTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetExpandedInteger - Given a processed operand Op which was expanded into
+  /// two integers of half the size, this returns the two halves.  The low bits
+  /// of Op are exactly equal to the bits of Lo; the high bits exactly equal Hi.
+  /// For example, if Op is an i64 which was expanded into two i32's, then this
+  /// method returns the two i32's, with Lo being equal to the lower 32 bits of
+  /// Op, and Hi being equal to the upper 32 bits.
+  void GetExpandedInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
+  void SetExpandedInteger(SDValue Op, SDValue Lo, SDValue Hi);
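+
+  // Worked example: expanding the i64 constant 0x1122334455667788 on a
+  // 32-bit target yields Lo = i32 0x55667788 and Hi = i32 0x11223344,
+  // regardless of endianness (endianness only matters once the pair is
+  // stored to memory).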
+
+  // Integer Result Expansion.
+  void ExpandIntegerResult(SDNode *N, unsigned ResNo);
+  void ExpandIntRes_ANY_EXTEND        (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_AssertSext        (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_AssertZext        (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_Constant          (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_CTLZ              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_CTPOP             (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_CTTZ              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_LOAD          (LoadSDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_SIGN_EXTEND       (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_SIGN_EXTEND_INREG (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_TRUNCATE          (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_ZERO_EXTEND       (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_FP_TO_SINT        (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_FP_TO_UINT        (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  void ExpandIntRes_Logical           (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_ADDSUB            (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_ADDSUBC           (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_ADDSUBE           (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_BSWAP             (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_MUL               (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_SDIV              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_SREM              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_UDIV              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_UREM              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_Shift             (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  void ExpandIntRes_SADDSUBO          (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_UADDSUBO          (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  void ExpandShiftByConstant(SDNode *N, unsigned Amt,
+                             SDValue &Lo, SDValue &Hi);
+  bool ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
+  bool ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  // Integer Operand Expansion.
+  bool ExpandIntegerOperand(SDNode *N, unsigned OperandNo);
+  SDValue ExpandIntOp_BIT_CONVERT(SDNode *N);
+  SDValue ExpandIntOp_BR_CC(SDNode *N);
+  SDValue ExpandIntOp_BUILD_VECTOR(SDNode *N);
+  SDValue ExpandIntOp_EXTRACT_ELEMENT(SDNode *N);
+  SDValue ExpandIntOp_SELECT_CC(SDNode *N);
+  SDValue ExpandIntOp_SETCC(SDNode *N);
+  SDValue ExpandIntOp_Shift(SDNode *N);
+  SDValue ExpandIntOp_SINT_TO_FP(SDNode *N);
+  SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo);
+  SDValue ExpandIntOp_TRUNCATE(SDNode *N);
+  SDValue ExpandIntOp_UINT_TO_FP(SDNode *N);
+  SDValue ExpandIntOp_RETURNADDR(SDNode *N);
+
+  void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+                                  ISD::CondCode &CCCode, DebugLoc dl);
+
+  //===--------------------------------------------------------------------===//
+  // Float to Integer Conversion Support: LegalizeFloatTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetSoftenedFloat - Given a processed operand Op which was converted to an
+  /// integer of the same size, this returns the integer.  The integer contains
+  /// exactly the same bits as Op - only the type changed.  For example, if Op
+  /// is an f32 which was softened to an i32, then this method returns an i32,
+  /// the bits of which coincide with those of Op.
+  SDValue GetSoftenedFloat(SDValue Op) {
+    SDValue &SoftenedOp = SoftenedFloats[Op];
+    RemapValue(SoftenedOp);
+    assert(SoftenedOp.getNode() && "Operand wasn't converted to integer?");
+    return SoftenedOp;
+  }
+  void SetSoftenedFloat(SDValue Op, SDValue Result);
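+
+  // Worked example: the f32 constant 1.0f softens to the i32 constant
+  // 0x3F800000, i.e. the same 32 bits reinterpreted as an integer.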
+
+  // Result Float to Integer Conversion.
+  void SoftenFloatResult(SDNode *N, unsigned OpNo);
+  SDValue SoftenFloatRes_BIT_CONVERT(SDNode *N);
+  SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N);
+  SDValue SoftenFloatRes_ConstantFP(ConstantFPSDNode *N);
+  SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue SoftenFloatRes_FABS(SDNode *N);
+  SDValue SoftenFloatRes_FADD(SDNode *N);
+  SDValue SoftenFloatRes_FCEIL(SDNode *N);
+  SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N);
+  SDValue SoftenFloatRes_FCOS(SDNode *N);
+  SDValue SoftenFloatRes_FDIV(SDNode *N);
+  SDValue SoftenFloatRes_FEXP(SDNode *N);
+  SDValue SoftenFloatRes_FEXP2(SDNode *N);
+  SDValue SoftenFloatRes_FFLOOR(SDNode *N);
+  SDValue SoftenFloatRes_FLOG(SDNode *N);
+  SDValue SoftenFloatRes_FLOG2(SDNode *N);
+  SDValue SoftenFloatRes_FLOG10(SDNode *N);
+  SDValue SoftenFloatRes_FMUL(SDNode *N);
+  SDValue SoftenFloatRes_FNEARBYINT(SDNode *N);
+  SDValue SoftenFloatRes_FNEG(SDNode *N);
+  SDValue SoftenFloatRes_FP_EXTEND(SDNode *N);
+  SDValue SoftenFloatRes_FP16_TO_FP32(SDNode *N);
+  SDValue SoftenFloatRes_FP_ROUND(SDNode *N);
+  SDValue SoftenFloatRes_FPOW(SDNode *N);
+  SDValue SoftenFloatRes_FPOWI(SDNode *N);
+  SDValue SoftenFloatRes_FREM(SDNode *N);
+  SDValue SoftenFloatRes_FRINT(SDNode *N);
+  SDValue SoftenFloatRes_FSIN(SDNode *N);
+  SDValue SoftenFloatRes_FSQRT(SDNode *N);
+  SDValue SoftenFloatRes_FSUB(SDNode *N);
+  SDValue SoftenFloatRes_FTRUNC(SDNode *N);
+  SDValue SoftenFloatRes_LOAD(SDNode *N);
+  SDValue SoftenFloatRes_SELECT(SDNode *N);
+  SDValue SoftenFloatRes_SELECT_CC(SDNode *N);
+  SDValue SoftenFloatRes_UNDEF(SDNode *N);
+  SDValue SoftenFloatRes_VAARG(SDNode *N);
+  SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N);
+
+  // Operand Float to Integer Conversion.
+  bool SoftenFloatOperand(SDNode *N, unsigned OpNo);
+  SDValue SoftenFloatOp_BIT_CONVERT(SDNode *N);
+  SDValue SoftenFloatOp_BR_CC(SDNode *N);
+  SDValue SoftenFloatOp_FP_ROUND(SDNode *N);
+  SDValue SoftenFloatOp_FP_TO_SINT(SDNode *N);
+  SDValue SoftenFloatOp_FP_TO_UINT(SDNode *N);
+  SDValue SoftenFloatOp_FP32_TO_FP16(SDNode *N);
+  SDValue SoftenFloatOp_SELECT_CC(SDNode *N);
+  SDValue SoftenFloatOp_SETCC(SDNode *N);
+  SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo);
+
+  void SoftenSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+                           ISD::CondCode &CCCode, DebugLoc dl);
+
+  //===--------------------------------------------------------------------===//
+  // Float Expansion Support: LegalizeFloatTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetExpandedFloat - Given a processed operand Op which was expanded into
+  /// two floating point values of half the size, this returns the two halves.
+  /// The low bits of Op are exactly equal to the bits of Lo; the high bits
+  /// exactly equal Hi.  For example, if Op is a ppcf128 which was expanded
+  /// into two f64's, then this method returns the two f64's, with Lo being
+  /// equal to the lower 64 bits of Op, and Hi to the upper 64 bits.
+  void GetExpandedFloat(SDValue Op, SDValue &Lo, SDValue &Hi);
+  void SetExpandedFloat(SDValue Op, SDValue Lo, SDValue Hi);
+
+  // Float Result Expansion.
+  void ExpandFloatResult(SDNode *N, unsigned ResNo);
+  void ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FABS      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FADD      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FCEIL     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FCOPYSIGN (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FCOS      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FDIV      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FEXP      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FEXP2     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FFLOOR    (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FLOG      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FLOG2     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FLOG10    (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FMUL      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FNEARBYINT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FNEG      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FP_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FPOW      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FPOWI     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FRINT     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FSIN      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FSQRT     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FSUB      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FTRUNC    (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_LOAD      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  // Float Operand Expansion.
+  bool ExpandFloatOperand(SDNode *N, unsigned OperandNo);
+  SDValue ExpandFloatOp_BR_CC(SDNode *N);
+  SDValue ExpandFloatOp_FP_ROUND(SDNode *N);
+  SDValue ExpandFloatOp_FP_TO_SINT(SDNode *N);
+  SDValue ExpandFloatOp_FP_TO_UINT(SDNode *N);
+  SDValue ExpandFloatOp_SELECT_CC(SDNode *N);
+  SDValue ExpandFloatOp_SETCC(SDNode *N);
+  SDValue ExpandFloatOp_STORE(SDNode *N, unsigned OpNo);
+
+  void FloatExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+                                ISD::CondCode &CCCode, DebugLoc dl);
+
+  //===--------------------------------------------------------------------===//
+  // Scalarization Support: LegalizeVectorTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetScalarizedVector - Given a processed one-element vector Op which was
+  /// scalarized to its element type, this returns the element.  For example,
+  /// if Op is a v1i32, Op = < i32 val >, this method returns val, an i32.
+  SDValue GetScalarizedVector(SDValue Op) {
+    SDValue &ScalarizedOp = ScalarizedVectors[Op];
+    RemapValue(ScalarizedOp);
+    assert(ScalarizedOp.getNode() && "Operand wasn't scalarized?");
+    return ScalarizedOp;
+  }
+  void SetScalarizedVector(SDValue Op, SDValue Result);
+
+  // Vector Result Scalarization: <1 x ty> -> ty.
+  void ScalarizeVectorResult(SDNode *N, unsigned OpNo);
+  SDValue ScalarizeVecRes_BinOp(SDNode *N);
+  SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
+  SDValue ScalarizeVecRes_InregOp(SDNode *N);
+
+  SDValue ScalarizeVecRes_BIT_CONVERT(SDNode *N);
+  SDValue ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N);
+  SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N);
+  SDValue ScalarizeVecRes_FPOWI(SDNode *N);
+  SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N);
+  SDValue ScalarizeVecRes_LOAD(LoadSDNode *N);
+  SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N);
+  SDValue ScalarizeVecRes_SIGN_EXTEND_INREG(SDNode *N);
+  SDValue ScalarizeVecRes_SELECT(SDNode *N);
+  SDValue ScalarizeVecRes_SELECT_CC(SDNode *N);
+  SDValue ScalarizeVecRes_SETCC(SDNode *N);
+  SDValue ScalarizeVecRes_UNDEF(SDNode *N);
+  SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N);
+  SDValue ScalarizeVecRes_VSETCC(SDNode *N);
+
+  // Vector Operand Scalarization: <1 x ty> -> ty.
+  bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
+  SDValue ScalarizeVecOp_BIT_CONVERT(SDNode *N);
+  SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N);
+  SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+
+  //===--------------------------------------------------------------------===//
+  // Vector Splitting Support: LegalizeVectorTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetSplitVector - Given a processed vector Op which was split into vectors
+  /// of half the size, this method returns the halves.  The first elements of
+  /// Op coincide with the elements of Lo; the remaining elements of Op coincide
+  /// with the elements of Hi: Op is what you would get by concatenating Lo and
+  /// Hi.  For example, if Op is a v8i32 that was split into two v4i32's, then
+  /// this method returns the two v4i32's, with Lo corresponding to the first 4
+  /// elements of Op, and Hi to the last 4 elements.
+  void GetSplitVector(SDValue Op, SDValue &Lo, SDValue &Hi);
+  void SetSplitVector(SDValue Op, SDValue Lo, SDValue Hi);
+
+  // Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>.
+  void SplitVectorResult(SDNode *N, unsigned OpNo);
+  void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  void SplitVecRes_BIT_CONVERT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_BUILD_PAIR(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_CONVERT_RNDSAT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
+                                  SDValue &Hi);
+
+  // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>.
+  bool SplitVectorOperand(SDNode *N, unsigned OpNo);
+  SDValue SplitVecOp_UnaryOp(SDNode *N);
+
+  SDValue SplitVecOp_BIT_CONVERT(SDNode *N);
+  SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
+  SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+
+  //===--------------------------------------------------------------------===//
+  // Vector Widening Support: LegalizeVectorTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetWidenedVector - Given a processed vector Op which was widened into a
+  /// larger vector, this method returns the larger vector.  The elements of
+  /// the returned vector consist of the elements of Op followed by elements
+  /// containing rubbish.  For example, if Op is a v2i32 that was widened to a
+  /// v4i32, then this method returns a v4i32 for which the first two elements
+  /// are the same as those of Op, while the last two elements contain rubbish.
+  SDValue GetWidenedVector(SDValue Op) {
+    SDValue &WidenedOp = WidenedVectors[Op];
+    RemapValue(WidenedOp);
+    assert(WidenedOp.getNode() && "Operand wasn't widened?");
+    return WidenedOp;
+  }
+  void SetWidenedVector(SDValue Op, SDValue Result);
+
+  // Widen Vector Result Promotion.
+  void WidenVectorResult(SDNode *N, unsigned ResNo);
+  SDValue WidenVecRes_BIT_CONVERT(SDNode* N);
+  SDValue WidenVecRes_BUILD_VECTOR(SDNode* N);
+  SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
+  SDValue WidenVecRes_CONVERT_RNDSAT(SDNode* N);
+  SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
+  SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
+  SDValue WidenVecRes_LOAD(SDNode* N);
+  SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
+  SDValue WidenVecRes_SIGN_EXTEND_INREG(SDNode* N);
+  SDValue WidenVecRes_SELECT(SDNode* N);
+  SDValue WidenVecRes_SELECT_CC(SDNode* N);
+  SDValue WidenVecRes_SETCC(SDNode* N);
+  SDValue WidenVecRes_UNDEF(SDNode *N);
+  SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N);
+  SDValue WidenVecRes_VSETCC(SDNode* N);
+
+  SDValue WidenVecRes_Binary(SDNode *N);
+  SDValue WidenVecRes_Convert(SDNode *N);
+  SDValue WidenVecRes_POWI(SDNode *N);
+  SDValue WidenVecRes_Shift(SDNode *N);
+  SDValue WidenVecRes_Unary(SDNode *N);
+  SDValue WidenVecRes_InregOp(SDNode *N);
+
+  // Widen Vector Operand.
+  bool WidenVectorOperand(SDNode *N, unsigned ResNo);
+  SDValue WidenVecOp_BIT_CONVERT(SDNode *N);
+  SDValue WidenVecOp_CONCAT_VECTORS(SDNode *N);
+  SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
+  SDValue WidenVecOp_STORE(SDNode* N);
+
+  SDValue WidenVecOp_Convert(SDNode *N);
+
+  //===--------------------------------------------------------------------===//
+  // Vector Widening Utilities Support: LegalizeVectorTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GenWidenVectorLoads - Helper function to generate a set of
+  /// loads to load a vector with a resulting wider type.  It takes
+  ///   LdChain: list of chains for the loads to be generated.
+  ///   Ld:      load to widen
+  SDValue GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain,
+                              LoadSDNode *LD);
+
+  /// GenWidenVectorExtLoads - Helper function to generate a set of extension
+  /// loads to load a vector with a resulting wider type.  It takes
+  ///   LdChain: list of chains for the loads to be generated.
+  ///   Ld:      load to widen
+  ///   ExtType: extension element type
+  SDValue GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain,
+                                 LoadSDNode *LD, ISD::LoadExtType ExtType);
+
+  /// GenWidenVectorStores - Helper function to generate a set of
+  /// stores to store a widened vector into non-widened memory.  It takes
+  ///   StChain: list of chains for the stores we have generated
+  ///   ST:      store of a widened value
+  void GenWidenVectorStores(SmallVector<SDValue, 16>& StChain, StoreSDNode *ST);
+
+  /// GenWidenVectorTruncStores - Helper function to generate a set of
+  /// truncating stores to store a widened vector into non-widened memory.
+  /// It takes
+  ///   StChain: list of chains for the stores we have generated
+  ///   ST:      store of a widened value
+  void GenWidenVectorTruncStores(SmallVector<SDValue, 16>& StChain,
+                                 StoreSDNode *ST);
+
+  /// ModifyToType - Modifies a vector input (widens or narrows) to a vector
+  /// of type WidenVT.  The input vector must have the same element type as
+  /// WidenVT.
+  SDValue ModifyToType(SDValue InOp, EVT WidenVT);
+
+
+  //===--------------------------------------------------------------------===//
+  // Generic Splitting: LegalizeTypesGeneric.cpp
+  //===--------------------------------------------------------------------===//
+
+  // Legalization methods which only use that the illegal type is split into two
+  // not necessarily identical types.  As such they can be used for splitting
+  // vectors and expanding integers and floats.
+
+  void GetSplitOp(SDValue Op, SDValue &Lo, SDValue &Hi) {
+    if (Op.getValueType().isVector())
+      GetSplitVector(Op, Lo, Hi);
+    else if (Op.getValueType().isInteger())
+      GetExpandedInteger(Op, Lo, Hi);
+    else
+      GetExpandedFloat(Op, Lo, Hi);
+  }
+
+  /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
+  /// which is split (or expanded) into two not necessarily identical pieces.
+  void GetSplitDestVTs(EVT InVT, EVT &LoVT, EVT &HiVT);
+
+  /// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
+  /// high parts of the given value.
+  void GetPairElements(SDValue Pair, SDValue &Lo, SDValue &Hi);
+
+  // Generic Result Splitting.
+  void SplitRes_MERGE_VALUES(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitRes_SELECT      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitRes_SELECT_CC   (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitRes_UNDEF       (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  //===--------------------------------------------------------------------===//
+  // Generic Expansion: LegalizeTypesGeneric.cpp
+  //===--------------------------------------------------------------------===//
+
+  // Legalization methods which only use that the illegal type is split into two
+  // identical types of half the size, and that the Lo/Hi part is stored first
+  // in memory on little/big-endian machines, followed by the Hi/Lo part.  As
+  // such they can be used for expanding integers and floats.
+
+  void GetExpandedOp(SDValue Op, SDValue &Lo, SDValue &Hi) {
+    if (Op.getValueType().isInteger())
+      GetExpandedInteger(Op, Lo, Hi);
+    else
+      GetExpandedFloat(Op, Lo, Hi);
+  }
+
+  // Generic Result Expansion.
+  void ExpandRes_BIT_CONVERT       (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandRes_BUILD_PAIR        (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandRes_EXTRACT_ELEMENT   (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandRes_NormalLoad        (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandRes_VAARG             (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  // Generic Operand Expansion.
+  SDValue ExpandOp_BIT_CONVERT      (SDNode *N);
+  SDValue ExpandOp_BUILD_VECTOR     (SDNode *N);
+  SDValue ExpandOp_EXTRACT_ELEMENT  (SDNode *N);
+  SDValue ExpandOp_INSERT_VECTOR_ELT(SDNode *N);
+  SDValue ExpandOp_SCALAR_TO_VECTOR (SDNode *N);
+  SDValue ExpandOp_NormalStore      (SDNode *N, unsigned OpNo);
+};
+
+} // end namespace llvm.
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
new file mode 100644
index 0000000..9c2b1d9
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -0,0 +1,482 @@
+//===-------- LegalizeTypesGeneric.cpp - Generic type legalization --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements generic type expansion and splitting for LegalizeTypes.
+// The routines here perform legalization when the details of the type (such as
+// whether it is an integer or a float) do not matter.
+// Expansion is the act of changing a computation in an illegal type to be a
+// computation in two identical registers of a smaller type.  The Lo/Hi part
+// is required to be stored first in memory on little/big-endian machines.
+// Splitting is the act of changing a computation in an illegal type to be a
+// computation in two not necessarily identical registers of a smaller type.
+// There are no requirements on how the type is represented in memory.
+//
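+// For example (illustrative, on a 32-bit little-endian target): an i64 add
+// is expanded into two i32 values, and when stored the Lo half goes first;
+// a v8i32 add is split into two v4i32 adds, with no memory-layout
+// requirement beyond Lo holding the first four elements.
+//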
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Generic Result Expansion.
+//===----------------------------------------------------------------------===//
+
+// These routines assume that the Lo/Hi part is stored first in memory on
+// little/big-endian machines, followed by the Hi/Lo part.  This means that
+// they cannot be used as is on vectors, for which Lo is always stored first.
+
+void DAGTypeLegalizer::ExpandRes_BIT_CONVERT(SDNode *N, SDValue &Lo,
+                                             SDValue &Hi) {
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  SDValue InOp = N->getOperand(0);
+  EVT InVT = InOp.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  // Handle some special cases efficiently.
+  switch (getTypeAction(InVT)) {
+    default:
+      assert(false && "Unknown type action!");
+    case Legal:
+    case PromoteInteger:
+      break;
+    case SoftenFloat:
+      // Convert the integer operand instead.
+      SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
+      Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
+      Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
+      return;
+    case ExpandInteger:
+    case ExpandFloat:
+      // Convert the expanded pieces of the input.
+      GetExpandedOp(InOp, Lo, Hi);
+      Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
+      Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
+      return;
+    case SplitVector:
+      GetSplitVector(InOp, Lo, Hi);
+      if (TLI.isBigEndian())
+        std::swap(Lo, Hi);
+      Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
+      Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
+      return;
+    case ScalarizeVector:
+      // Convert the element instead.
+      SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi);
+      Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
+      Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
+      return;
+    case WidenVector: {
+      assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BIT_CONVERT");
+      InOp = GetWidenedVector(InOp);
+      EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
+                                   InVT.getVectorNumElements()/2);
+      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+                       DAG.getIntPtrConstant(0));
+      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+                       DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+      if (TLI.isBigEndian())
+        std::swap(Lo, Hi);
+      Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
+      Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
+      return;
+    }
+  }
+
+  if (InVT.isVector() && OutVT.isInteger()) {
+    // Handle cases like i64 = BIT_CONVERT v1i64 on x86, where the operand
+    // is legal but the result is not.
+    EVT NVT = EVT::getVectorVT(*DAG.getContext(), NOutVT, 2);
+
+    if (isTypeLegal(NVT)) {
+      SDValue CastInOp = DAG.getNode(ISD::BIT_CONVERT, dl, NVT, InOp);
+      Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NOutVT, CastInOp,
+                       DAG.getIntPtrConstant(0));
+      Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NOutVT, CastInOp,
+                       DAG.getIntPtrConstant(1));
+
+      if (TLI.isBigEndian())
+        std::swap(Lo, Hi);
+
+      return;
+    }
+  }
+
+  // Lower the bit-convert to a store/load from the stack.
+  assert(NOutVT.isByteSized() && "Expanded type not byte sized!");
+
+  // Create the stack frame object.  Make sure it is aligned for both
+  // the source and expanded destination types.
+  unsigned Alignment =
+    TLI.getTargetData()->getPrefTypeAlignment(NOutVT.
+                                              getTypeForEVT(*DAG.getContext()));
+  SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
+  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  const Value *SV = PseudoSourceValue::getFixedStack(SPFI);
+
+  // Emit a store to the stack slot.
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, SV, 0,
+                               false, false, 0);
+
+  // Load the first half from the stack slot.
+  Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, SV, 0, false, false, 0);
+
+  // Increment the pointer to the other half.
+  unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
+  StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                         DAG.getIntPtrConstant(IncrementSize));
+
+  // Load the second half from the stack slot.
+  Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr, SV, IncrementSize, false,
+                   false, MinAlign(Alignment, IncrementSize));
+
+  // Handle endianness of the load.
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_BUILD_PAIR(SDNode *N, SDValue &Lo,
+                                            SDValue &Hi) {
+  // Return the operands.
+  Lo = N->getOperand(0);
+  Hi = N->getOperand(1);
+}
+
+void DAGTypeLegalizer::ExpandRes_EXTRACT_ELEMENT(SDNode *N, SDValue &Lo,
+                                                 SDValue &Hi) {
+  GetExpandedOp(N->getOperand(0), Lo, Hi);
+  SDValue Part = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() ?
+                   Hi : Lo;
+
+  assert(Part.getValueType() == N->getValueType(0) &&
+         "Type twice as big as expanded type not itself expanded!");
+
+  GetPairElements(Part, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
+                                                    SDValue &Hi) {
+  SDValue OldVec = N->getOperand(0);
+  unsigned OldElts = OldVec.getValueType().getVectorNumElements();
+  DebugLoc dl = N->getDebugLoc();
+
+  // Convert to a vector of the expanded element type, for example
+  // <3 x i64> -> <6 x i32>.
+  EVT OldVT = N->getValueType(0);
+  EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
+
+  SDValue NewVec = DAG.getNode(ISD::BIT_CONVERT, dl,
+                               EVT::getVectorVT(*DAG.getContext(),
+                                                NewVT, 2*OldElts),
+                               OldVec);
+
+  // Extract the elements at 2 * Idx and 2 * Idx + 1 from the new vector.
+  SDValue Idx = N->getOperand(1);
+
+  // Make sure the type of Idx is big enough to hold the new values.
+  if (Idx.getValueType().bitsLT(TLI.getPointerTy()))
+    Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
+
+  Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
+  Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, Idx);
+
+  Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
+                    DAG.getConstant(1, Idx.getValueType()));
+  Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, Idx);
+
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
+                                            SDValue &Hi) {
+  assert(ISD::isNormalLoad(N) && "This routine only for normal loads!");
+  DebugLoc dl = N->getDebugLoc();
+
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0));
+  SDValue Chain = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+  int SVOffset = LD->getSrcValueOffset();
+  unsigned Alignment = LD->getAlignment();
+  bool isVolatile = LD->isVolatile();
+  bool isNonTemporal = LD->isNonTemporal();
+
+  assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+  Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getSrcValue(), SVOffset,
+                   isVolatile, isNonTemporal, Alignment);
+
+  // Increment the pointer to the other half.
+  unsigned IncrementSize = NVT.getSizeInBits() / 8;
+  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                    DAG.getIntPtrConstant(IncrementSize));
+  Hi = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getSrcValue(),
+                   SVOffset+IncrementSize,
+                   isVolatile, isNonTemporal,
+                   MinAlign(Alignment, IncrementSize));
+
+  // Build a factor node to remember that this load is independent of the
+  // other one.
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                      Hi.getValue(1));
+
+  // Handle endianness of the load.
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+
+  // Modified the chain - switch anything that used the old chain to use
+  // the new one.
+  ReplaceValueWith(SDValue(N, 1), Chain);
+}
+
+void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  EVT OVT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+  SDValue Chain = N->getOperand(0);
+  SDValue Ptr = N->getOperand(1);
+  DebugLoc dl = N->getDebugLoc();
+  const unsigned Align = N->getConstantOperandVal(3);
+
+  Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2), Align);
+  Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2), 0);
+
+  // Handle endianness of the load.
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+
+  // Modified the chain - switch anything that used the old chain to use
+  // the new one.
+  ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+
+//===--------------------------------------------------------------------===//
+// Generic Operand Expansion.
+//===--------------------------------------------------------------------===//
+
+SDValue DAGTypeLegalizer::ExpandOp_BIT_CONVERT(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  if (N->getValueType(0).isVector()) {
+    // An illegal type that needs expansion is being converted to a legal
+    // vector type.
+    // Make a two element vector out of the expanded parts and convert that
+    // instead, but only if the new vector type is legal (otherwise there
+    // is no point, and it might create expansion loops).  For example, on
+    // x86 this turns v1i64 = BIT_CONVERT i64 into v1i64 = BIT_CONVERT v2i32.
+    EVT OVT = N->getOperand(0).getValueType();
+    EVT NVT = EVT::getVectorVT(*DAG.getContext(),
+                               TLI.getTypeToTransformTo(*DAG.getContext(), OVT),
+                               2);
+
+    if (isTypeLegal(NVT)) {
+      SDValue Parts[2];
+      GetExpandedOp(N->getOperand(0), Parts[0], Parts[1]);
+
+      if (TLI.isBigEndian())
+        std::swap(Parts[0], Parts[1]);
+
+      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Parts, 2);
+      return DAG.getNode(ISD::BIT_CONVERT, dl, N->getValueType(0), Vec);
+    }
+  }
+
+  // Otherwise, store to a temporary and load out again as the new type.
+  return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) {
+  // The vector type is legal but the element type needs expansion.
+  EVT VecVT = N->getValueType(0);
+  unsigned NumElts = VecVT.getVectorNumElements();
+  EVT OldVT = N->getOperand(0).getValueType();
+  EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
+  DebugLoc dl = N->getDebugLoc();
+
+  assert(OldVT == VecVT.getVectorElementType() &&
+         "BUILD_VECTOR operand type doesn't match vector element type!");
+
+  // Build a vector of twice the length out of the expanded elements.
+  // For example <3 x i64> -> <6 x i32>.
+  std::vector<SDValue> NewElts;
+  NewElts.reserve(NumElts*2);
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Lo, Hi;
+    GetExpandedOp(N->getOperand(i), Lo, Hi);
+    if (TLI.isBigEndian())
+      std::swap(Lo, Hi);
+    NewElts.push_back(Lo);
+    NewElts.push_back(Hi);
+  }
+
+  SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl,
+                               EVT::getVectorVT(*DAG.getContext(),
+                                                NewVT, NewElts.size()),
+                               &NewElts[0], NewElts.size());
+
+  // Convert the new vector to the old vector type.
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, NewVec);
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_EXTRACT_ELEMENT(SDNode *N) {
+  SDValue Lo, Hi;
+  GetExpandedOp(N->getOperand(0), Lo, Hi);
+  return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() ? Hi : Lo;
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) {
+  // The vector type is legal but the element type needs expansion.
+  EVT VecVT = N->getValueType(0);
+  unsigned NumElts = VecVT.getVectorNumElements();
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue Val = N->getOperand(1);
+  EVT OldEVT = Val.getValueType();
+  EVT NewEVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldEVT);
+
+  assert(OldEVT == VecVT.getVectorElementType() &&
+         "Inserted element type doesn't match vector element type!");
+
+  // Bitconvert to a vector of twice the length with elements of the expanded
+  // type, insert the expanded vector elements, and then convert back.
+  EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewEVT, NumElts*2);
+  SDValue NewVec = DAG.getNode(ISD::BIT_CONVERT, dl,
+                               NewVecVT, N->getOperand(0));
+
+  SDValue Lo, Hi;
+  GetExpandedOp(Val, Lo, Hi);
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+
+  SDValue Idx = N->getOperand(2);
+  Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
+  NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Lo, Idx);
+  Idx = DAG.getNode(ISD::ADD, dl,
+                    Idx.getValueType(), Idx, DAG.getIntPtrConstant(1));
+  NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Hi, Idx);
+
+  // Convert the new vector to the old vector type.
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, NewVec);
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_SCALAR_TO_VECTOR(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  assert(VT.getVectorElementType() == N->getOperand(0).getValueType() &&
+         "SCALAR_TO_VECTOR operand type doesn't match vector element type!");
+  unsigned NumElts = VT.getVectorNumElements();
+  SmallVector<SDValue, 16> Ops(NumElts);
+  Ops[0] = N->getOperand(0);
+  SDValue UndefVal = DAG.getUNDEF(Ops[0].getValueType());
+  for (unsigned i = 1; i < NumElts; ++i)
+    Ops[i] = UndefVal;
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts);
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
+  assert(ISD::isNormalStore(N) && "This routine only for normal stores!");
+  assert(OpNo == 1 && "Can only expand the stored value so far");
+  DebugLoc dl = N->getDebugLoc();
+
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(),
+                                     St->getValue().getValueType());
+  SDValue Chain = St->getChain();
+  SDValue Ptr = St->getBasePtr();
+  int SVOffset = St->getSrcValueOffset();
+  unsigned Alignment = St->getAlignment();
+  bool isVolatile = St->isVolatile();
+  bool isNonTemporal = St->isNonTemporal();
+
+  assert(NVT.isByteSized() && "Expanded type not byte sized!");
+  unsigned IncrementSize = NVT.getSizeInBits() / 8;
+
+  SDValue Lo, Hi;
+  GetExpandedOp(St->getValue(), Lo, Hi);
+
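+  // On big-endian targets the Hi half belongs at the lower address, so swap
+  // the pair before emitting the lower-address store first.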
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+
+  Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getSrcValue(), SVOffset,
+                    isVolatile, isNonTemporal, Alignment);
+
+  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                    DAG.getIntPtrConstant(IncrementSize));
+  assert(isTypeLegal(Ptr.getValueType()) && "Pointers must be legal!");
+  Hi = DAG.getStore(Chain, dl, Hi, Ptr, St->getSrcValue(),
+                    SVOffset + IncrementSize,
+                    isVolatile, isNonTemporal,
+                    MinAlign(Alignment, IncrementSize));
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+}
+
+
+//===--------------------------------------------------------------------===//
+// Generic Result Splitting.
+//===--------------------------------------------------------------------===//
+
+// Be careful to make no assumptions about which of Lo/Hi is stored first in
+// memory (for vectors it is always Lo first followed by Hi in the following
+// bytes; for integers and floats it is Lo first if and only if the machine is
+// little-endian).
+
+void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  // A MERGE_VALUES node can produce any number of values.  We know that the
+  // first illegal one needs to be expanded into Lo/Hi.
+  unsigned i;
+
+  // The string of legal results gets turned into input operands, which have
+  // the same type.
+  for (i = 0; isTypeLegal(N->getValueType(i)); ++i)
+    ReplaceValueWith(SDValue(N, i), SDValue(N->getOperand(i)));
+
+  // The first illegal result must be the one that needs to be expanded.
+  GetSplitOp(N->getOperand(i), Lo, Hi);
+
+  // Legalize the rest of the results into the input operands whether they are
+  // legal or not.
+  unsigned e = N->getNumValues();
+  for (++i; i != e; ++i)
+    ReplaceValueWith(SDValue(N, i), SDValue(N->getOperand(i)));
+}
+
+void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
+                                       SDValue &Hi) {
+  SDValue LL, LH, RL, RH;
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitOp(N->getOperand(1), LL, LH);
+  GetSplitOp(N->getOperand(2), RL, RH);
+
+  SDValue Cond = N->getOperand(0);
+  Lo = DAG.getNode(ISD::SELECT, dl, LL.getValueType(), Cond, LL, RL);
+  Hi = DAG.getNode(ISD::SELECT, dl, LH.getValueType(), Cond, LH, RH);
+}
+
+void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
+                                          SDValue &Hi) {
+  SDValue LL, LH, RL, RH;
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitOp(N->getOperand(2), LL, LH);
+  GetSplitOp(N->getOperand(3), RL, RH);
+
+  Lo = DAG.getNode(ISD::SELECT_CC, dl, LL.getValueType(), N->getOperand(0),
+                   N->getOperand(1), LL, RL, N->getOperand(4));
+  Hi = DAG.getNode(ISD::SELECT_CC, dl, LH.getValueType(), N->getOperand(0),
+                   N->getOperand(1), LH, RH, N->getOperand(4));
+}
+
+void DAGTypeLegalizer::SplitRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  EVT LoVT, HiVT;
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  Lo = DAG.getUNDEF(LoVT);
+  Hi = DAG.getUNDEF(HiVT);
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
new file mode 100644
index 0000000..621c087
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -0,0 +1,290 @@
+//===-- LegalizeVectorOps.cpp - Implement SelectionDAG::LegalizeVectors ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::LegalizeVectors method.
+//
+// The vector legalizer looks for vector operations which might need to be
+// scalarized and legalizes them. This is a separate step from Legalize because
+// scalarizing can introduce illegal types.  For example, suppose we have an
+// ISD::SDIV of type v2i64 on x86-32.  The type is legal (for example, addition
+// on a v2i64 is legal), but ISD::SDIV isn't legal, so we have to unroll the
+// operation, which introduces nodes with the illegal type i64 which must be
+// expanded.  Similarly, suppose we have an ISD::SRA of type v16i8 on PowerPC;
+// the operation must be unrolled, which introduces nodes with the illegal
+// type i8 which must be promoted.
+//
+// This does not legalize vector manipulations like ISD::BUILD_VECTOR,
+// or operations that happen to take a vector which are custom-lowered;
+// the legalization for such operations never produces nodes
+// with illegal types, so it's okay to put off legalizing them until
+// SelectionDAG::Legalize runs.
+//
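+// As an illustrative sketch of unrolling, the v2i64 SDIV above becomes
+// per-element EXTRACT_VECTOR_ELTs, two scalar i64 SDIVs, and a BUILD_VECTOR
+// of the results; the scalar i64 operations are then expanded when the
+// types are legalized.
+//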
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+namespace {
+class VectorLegalizer {
+  SelectionDAG& DAG;
+  const TargetLowering &TLI;
+  bool Changed; // Keep track of whether anything changed
+
+  /// LegalizedNodes - For nodes that are of legal width, and that have more
+  /// than one use, this map indicates what regularized operand to use.  This
+  /// allows us to avoid legalizing the same thing more than once.
+  DenseMap<SDValue, SDValue> LegalizedNodes;
+
+  // Adds a node to the translation cache
+  void AddLegalizedOperand(SDValue From, SDValue To) {
+    LegalizedNodes.insert(std::make_pair(From, To));
+    // If someone requests legalization of the new node, return it unchanged.
+    if (From != To)
+      LegalizedNodes.insert(std::make_pair(To, To));
+  }
+
+  // Legalizes the given node
+  SDValue LegalizeOp(SDValue Op);
+  // Assuming the node is legal, "legalize" the results
+  SDValue TranslateLegalizeResults(SDValue Op, SDValue Result);
+  // Implements unrolling a VSETCC.
+  SDValue UnrollVSETCC(SDValue Op);
+  // Implements expansion for FNEG; falls back to UnrollVectorOp if FSUB
+  // isn't legal.
+  SDValue ExpandFNEG(SDValue Op);
+  // Implements vector promotion; this is essentially just bitcasting the
+  // operands to a different type and bitcasting the result back to the
+  // original type.
+  SDValue PromoteVectorOp(SDValue Op);
+
+public:
+  bool Run();
+  VectorLegalizer(SelectionDAG& dag) :
+      DAG(dag), TLI(dag.getTargetLoweringInfo()), Changed(false) {}
+};
+
+bool VectorLegalizer::Run() {
+  // The legalize process is inherently a bottom-up recursive process (users
+  // legalize their uses before themselves).  Given infinite stack space, we
+  // could just start legalizing on the root and traverse the whole graph.  In
+  // practice however, this causes us to run out of stack space on large basic
+  // blocks.  To avoid this problem, compute an ordering of the nodes where each
+  // node is only legalized after all of its operands are legalized.
+  DAG.AssignTopologicalOrder();
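+  // AssignTopologicalOrder sorts the AllNodes list itself, so a single
+  // forward pass over it visits every node after all of its operands.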
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = prior(DAG.allnodes_end()); I != llvm::next(E); ++I)
+    LegalizeOp(SDValue(I, 0));
+
+  // Finally, it's possible the root changed.  Get the new root.
+  SDValue OldRoot = DAG.getRoot();
+  assert(LegalizedNodes.count(OldRoot) && "Root didn't get legalized?");
+  DAG.setRoot(LegalizedNodes[OldRoot]);
+
+  LegalizedNodes.clear();
+
+  // Remove dead nodes now.
+  DAG.RemoveDeadNodes();
+
+  return Changed;
+}
+
+SDValue VectorLegalizer::TranslateLegalizeResults(SDValue Op, SDValue Result) {
+  // Generic legalization: just pass the operand through.
+  for (unsigned i = 0, e = Op.getNode()->getNumValues(); i != e; ++i)
+    AddLegalizedOperand(Op.getValue(i), Result.getValue(i));
+  return Result.getValue(Op.getResNo());
+}
+
+SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
+  // Note that LegalizeOp may be reentered even from single-use nodes, which
+  // means that we always must cache transformed nodes.
+  DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
+  if (I != LegalizedNodes.end()) return I->second;
+
+  SDNode* Node = Op.getNode();
+
+  // Legalize the operands
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i)
+    Ops.push_back(LegalizeOp(Node->getOperand(i)));
+
+  SDValue Result =
+    SDValue(DAG.UpdateNodeOperands(Op.getNode(), Ops.data(), Ops.size()), 0);
+
+  bool HasVectorValue = false;
+  for (SDNode::value_iterator J = Node->value_begin(), E = Node->value_end();
+       J != E;
+       ++J)
+    HasVectorValue |= J->isVector();
+  if (!HasVectorValue)
+    return TranslateLegalizeResults(Op, Result);
+
+  EVT QueryType;
+  switch (Op.getOpcode()) {
+  default:
+    return TranslateLegalizeResults(Op, Result);
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::MUL:
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM:
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FDIV:
+  case ISD::FREM:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::ROTL:
+  case ISD::ROTR:
+  case ISD::CTTZ:
+  case ISD::CTLZ:
+  case ISD::CTPOP:
+  case ISD::SELECT:
+  case ISD::SELECT_CC:
+  case ISD::VSETCC:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+  case ISD::TRUNCATE:
+  case ISD::SIGN_EXTEND:
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+  case ISD::FNEG:
+  case ISD::FABS:
+  case ISD::FSQRT:
+  case ISD::FSIN:
+  case ISD::FCOS:
+  case ISD::FPOWI:
+  case ISD::FPOW:
+  case ISD::FLOG:
+  case ISD::FLOG2:
+  case ISD::FLOG10:
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FCEIL:
+  case ISD::FTRUNC:
+  case ISD::FRINT:
+  case ISD::FNEARBYINT:
+  case ISD::FFLOOR:
+    QueryType = Node->getValueType(0);
+    break;
+  case ISD::SIGN_EXTEND_INREG:
+  case ISD::FP_ROUND_INREG:
+    QueryType = cast<VTSDNode>(Node->getOperand(1))->getVT();
+    break;
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    QueryType = Node->getOperand(0).getValueType();
+    break;
+  }
+
+  switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) {
+  case TargetLowering::Promote:
+    // "Promote" the operation by bitcasting
+    Result = PromoteVectorOp(Op);
+    Changed = true;
+    break;
+  case TargetLowering::Legal: break;
+  case TargetLowering::Custom: {
+    SDValue Tmp1 = TLI.LowerOperation(Op, DAG);
+    if (Tmp1.getNode()) {
+      Result = Tmp1;
+      break;
+    }
+    // FALL THROUGH
+  }
+  case TargetLowering::Expand:
+    if (Node->getOpcode() == ISD::FNEG)
+      Result = ExpandFNEG(Op);
+    else if (Node->getOpcode() == ISD::VSETCC)
+      Result = UnrollVSETCC(Op);
+    else
+      Result = DAG.UnrollVectorOp(Op.getNode());
+    break;
+  }
+
+  // Make sure that the generated code is itself legal.
+  if (Result != Op) {
+    Result = LegalizeOp(Result);
+    Changed = true;
+  }
+
+  // Note that LegalizeOp may be reentered even from single-use nodes, which
+  // means that we always must cache transformed nodes.
+  AddLegalizedOperand(Op, Result);
+  return Result;
+}
+
+SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) {
+  // Vector "promotion" is basically just bitcasting and doing the operation
+  // in a different type.  For example, x86 promotes ISD::AND on v2i32 to
+  // v1i64.
+  EVT VT = Op.getValueType();
+  assert(Op.getNode()->getNumValues() == 1 &&
+         "Can't promote a vector with multiple results!");
+  EVT NVT = TLI.getTypeToPromoteTo(Op.getOpcode(), VT);
+  DebugLoc dl = Op.getDebugLoc();
+  SmallVector<SDValue, 4> Operands(Op.getNumOperands());
+
+  for (unsigned j = 0; j != Op.getNumOperands(); ++j) {
+    if (Op.getOperand(j).getValueType().isVector())
+      Operands[j] = DAG.getNode(ISD::BIT_CONVERT, dl, NVT, Op.getOperand(j));
+    else
+      Operands[j] = Op.getOperand(j);
+  }
+
+  Op = DAG.getNode(Op.getOpcode(), dl, NVT, &Operands[0], Operands.size());
+
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Op);
+}
+
+SDValue VectorLegalizer::ExpandFNEG(SDValue Op) {
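+  // FNEG(X) can be expanded as (-0.0 - X): subtracting from negative zero
+  // flips the sign bit even when X is zero, which a (0.0 - X) expansion
+  // would get wrong for X == +0.0.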
+  if (TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType())) {
+    SDValue Zero = DAG.getConstantFP(-0.0, Op.getValueType());
+    return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+                       Zero, Op.getOperand(0));
+  }
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
+SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) {
+  EVT VT = Op.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1), CC = Op.getOperand(2);
+  EVT TmpEltVT = LHS.getValueType().getVectorElementType();
+  DebugLoc dl = Op.getDebugLoc();
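+  // Emulate the vector compare one element at a time: a scalar SETCC computes
+  // each truth value, and the SELECT turns it into the all-ones / all-zeros
+  // element mask that VSETCC is defined to produce.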
+  SmallVector<SDValue, 8> Ops(NumElems);
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
+                                  DAG.getIntPtrConstant(i));
+    SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
+                                  DAG.getIntPtrConstant(i));
+    Ops[i] = DAG.getNode(ISD::SETCC, dl, TLI.getSetCCResultType(TmpEltVT),
+                         LHSElem, RHSElem, CC);
+    Ops[i] = DAG.getNode(ISD::SELECT, dl, EltVT, Ops[i],
+                         DAG.getConstant(APInt::getAllOnesValue
+                                         (EltVT.getSizeInBits()), EltVT),
+                         DAG.getConstant(0, EltVT));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElems);
+}
+
+} // end anonymous namespace
+
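+// The public entry point: construct a VectorLegalizer over this DAG and run
+// it, returning true if any node was changed.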
+bool SelectionDAG::LegalizeVectors() {
+  return VectorLegalizer(*this).Run();
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
new file mode 100644
index 0000000..93aeff5
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -0,0 +1,2538 @@
+//===------- LegalizeVectorTypes.cpp - Legalization of vector types -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file performs vector type splitting and scalarization for LegalizeTypes.
+// Scalarization is the act of changing a computation in an illegal one-element
+// vector type to be a computation in its scalar element type.  For example,
+// implementing <1 x f32> arithmetic in a scalar f32 register.  This is needed
+// as a base case when scalarizing vector arithmetic like <4 x f32>, which
+// eventually decomposes to scalars if the target doesn't support v4f32 or v2f32
+// types.
+// Splitting is the act of changing a computation in an invalid vector type to
+// be a computation in two vectors of half the size.  For example, implementing
+// <128 x f32> operations in terms of two <64 x f32> operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//  Result Vector Scalarization: <1 x ty> -> ty.
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Scalarize node result " << ResNo << ": ";
+        N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue R = SDValue();
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "ScalarizeVectorResult #" << ResNo << ": ";
+    N->dump(&DAG);
+    dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to scalarize the result of this operator!");
+
+  case ISD::BIT_CONVERT:       R = ScalarizeVecRes_BIT_CONVERT(N); break;
+  case ISD::BUILD_VECTOR:      R = N->getOperand(0); break;
+  case ISD::CONVERT_RNDSAT:    R = ScalarizeVecRes_CONVERT_RNDSAT(N); break;
+  case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break;
+  case ISD::FP_ROUND_INREG:    R = ScalarizeVecRes_InregOp(N); break;
+  case ISD::FPOWI:             R = ScalarizeVecRes_FPOWI(N); break;
+  case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
+  case ISD::LOAD:           R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
+  case ISD::SCALAR_TO_VECTOR:  R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break;
+  case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break;
+  case ISD::SELECT:            R = ScalarizeVecRes_SELECT(N); break;
+  case ISD::SELECT_CC:         R = ScalarizeVecRes_SELECT_CC(N); break;
+  case ISD::SETCC:             R = ScalarizeVecRes_SETCC(N); break;
+  case ISD::UNDEF:             R = ScalarizeVecRes_UNDEF(N); break;
+  case ISD::VECTOR_SHUFFLE:    R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break;
+  case ISD::VSETCC:            R = ScalarizeVecRes_VSETCC(N); break;
+
+  case ISD::CTLZ:
+  case ISD::CTPOP:
+  case ISD::CTTZ:
+  case ISD::FABS:
+  case ISD::FCOS:
+  case ISD::FNEG:
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+  case ISD::FSIN:
+  case ISD::FSQRT:
+  case ISD::FTRUNC:
+  case ISD::FFLOOR:
+  case ISD::FCEIL:
+  case ISD::FRINT:
+  case ISD::FNEARBYINT:
+  case ISD::UINT_TO_FP:
+  case ISD::SINT_TO_FP:
+  case ISD::TRUNCATE:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+    R = ScalarizeVecRes_UnaryOp(N);
+    break;
+
+  case ISD::ADD:
+  case ISD::AND:
+  case ISD::FADD:
+  case ISD::FDIV:
+  case ISD::FMUL:
+  case ISD::FPOW:
+  case ISD::FREM:
+  case ISD::FSUB:
+  case ISD::MUL:
+  case ISD::OR:
+  case ISD::SDIV:
+  case ISD::SREM:
+  case ISD::SUB:
+  case ISD::UDIV:
+  case ISD::UREM:
+  case ISD::XOR:
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+    R = ScalarizeVecRes_BinOp(N);
+    break;
+  }
+
+  // If R is null, the sub-method took care of registering the result.
+  if (R.getNode())
+    SetScalarizedVector(SDValue(N, ResNo), R);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) {
+  SDValue LHS = GetScalarizedVector(N->getOperand(0));
+  SDValue RHS = GetScalarizedVector(N->getOperand(1));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                     LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BIT_CONVERT(SDNode *N) {
+  EVT NewVT = N->getValueType(0).getVectorElementType();
+  return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(),
+                     NewVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N) {
+  EVT NewVT = N->getValueType(0).getVectorElementType();
+  SDValue Op0 = GetScalarizedVector(N->getOperand(0));
+  return DAG.getConvertRndSat(NewVT, N->getDebugLoc(),
+                              Op0, DAG.getValueType(NewVT),
+                              DAG.getValueType(Op0.getValueType()),
+                              N->getOperand(3),
+                              N->getOperand(4),
+                              cast<CvtRndSatSDNode>(N)->getCvtCode());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(),
+                     N->getValueType(0).getVectorElementType(),
+                     N->getOperand(0), N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_FPOWI(SDNode *N) {
+  SDValue Op = GetScalarizedVector(N->getOperand(0));
+  return DAG.getNode(ISD::FPOWI, N->getDebugLoc(),
+                     Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
+  // The value to insert may have a wider type than the vector element type,
+  // so be sure to truncate it to the element type if necessary.
+  SDValue Op = N->getOperand(1);
+  EVT EltVT = N->getValueType(0).getVectorElementType();
+  if (Op.getValueType() != EltVT)
+    // FIXME: Can this happen for floating point types?
+    Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, Op);
+  return Op;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
+  assert(N->isUnindexed() && "Indexed vector load?");
+
+  SDValue Result = DAG.getLoad(ISD::UNINDEXED,
+                               N->getExtensionType(),
+                               N->getValueType(0).getVectorElementType(),
+                               N->getDebugLoc(),
+                               N->getChain(), N->getBasePtr(),
+                               DAG.getUNDEF(N->getBasePtr().getValueType()),
+                               N->getSrcValue(), N->getSrcValueOffset(),
+                               N->getMemoryVT().getVectorElementType(),
+                               N->isVolatile(), N->isNonTemporal(),
+                               N->getOriginalAlignment());
+
+  // The chain result has been legalized - switch anything that used the old
+  // chain to use the new one.
+  ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
+  return Result;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) {
+  // Get the dest type - it doesn't always match the input type, e.g. int_to_fp.
+  EVT DestVT = N->getValueType(0).getVectorElementType();
+  SDValue Op = GetScalarizedVector(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), DestVT, Op);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) {
+  EVT EltVT = N->getValueType(0).getVectorElementType();
+  EVT ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT().getVectorElementType();
+  SDValue LHS = GetScalarizedVector(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), EltVT,
+                     LHS, DAG.getValueType(ExtVT));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
+  // If the operand is wider than the vector element type then it is implicitly
+  // truncated.  Make that explicit here.
+  EVT EltVT = N->getValueType(0).getVectorElementType();
+  SDValue InOp = N->getOperand(0);
+  if (InOp.getValueType() != EltVT)
+    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, InOp);
+  return InOp;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) {
+  SDValue LHS = GetScalarizedVector(N->getOperand(1));
+  return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
+                     LHS.getValueType(), N->getOperand(0), LHS,
+                     GetScalarizedVector(N->getOperand(2)));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) {
+  SDValue LHS = GetScalarizedVector(N->getOperand(2));
+  return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), LHS.getValueType(),
+                     N->getOperand(0), N->getOperand(1),
+                     LHS, GetScalarizedVector(N->getOperand(3)),
+                     N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) {
+  SDValue LHS = GetScalarizedVector(N->getOperand(0));
+  SDValue RHS = GetScalarizedVector(N->getOperand(1));
+  DebugLoc DL = N->getDebugLoc();
+
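+  // Note that unlike VSETCC (see ScalarizeVecRes_VSETCC below), a plain SETCC
+  // yields an i1 truth value rather than a sign-extended element mask.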
+  // Turn it into a scalar SETCC.
+  return DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_UNDEF(SDNode *N) {
+  return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N) {
+  // Figure out if the scalar is the LHS or RHS and return it.
+  SDValue Arg = N->getOperand(2).getOperand(0);
+  if (Arg.getOpcode() == ISD::UNDEF)
+    return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
+  unsigned Op = !cast<ConstantSDNode>(Arg)->isNullValue();
+  return GetScalarizedVector(N->getOperand(Op));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VSETCC(SDNode *N) {
+  SDValue LHS = GetScalarizedVector(N->getOperand(0));
+  SDValue RHS = GetScalarizedVector(N->getOperand(1));
+  EVT NVT = N->getValueType(0).getVectorElementType();
+  EVT SVT = TLI.getSetCCResultType(LHS.getValueType());
+  DebugLoc DL = N->getDebugLoc();
+
+  // Turn it into a scalar SETCC.
+  SDValue Res = DAG.getNode(ISD::SETCC, DL, SVT, LHS, RHS, N->getOperand(2));
+
+  // VSETCC always returns a sign-extended value, while SETCC may not.  The
+  // SETCC result type may not match the vector element type.  Correct these.
+  if (NVT.bitsLE(SVT)) {
+    // The SETCC result type is bigger than the vector element type.
+    // Ensure the SETCC result is sign-extended.
+    if (TLI.getBooleanContents() !=
+        TargetLowering::ZeroOrNegativeOneBooleanContent)
+      Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SVT, Res,
+                        DAG.getValueType(MVT::i1));
+    // Truncate to the final type.
+    return DAG.getNode(ISD::TRUNCATE, DL, NVT, Res);
+  }
+
+  // The SETCC result type is smaller than the vector element type.
+  // If the SetCC result is not sign-extended, chop it down to MVT::i1.
+  if (TLI.getBooleanContents() !=
+        TargetLowering::ZeroOrNegativeOneBooleanContent)
+    Res = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Res);
+  // Sign extend to the final type.
+  return DAG.getNode(ISD::SIGN_EXTEND, DL, NVT, Res);
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Operand Vector Scalarization <1 x ty> -> ty.
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(dbgs() << "Scalarize node operand " << OpNo << ": ";
+        N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  if (Res.getNode() == 0) {
+    switch (N->getOpcode()) {
+    default:
+#ifndef NDEBUG
+      dbgs() << "ScalarizeVectorOperand Op #" << OpNo << ": ";
+      N->dump(&DAG);
+      dbgs() << "\n";
+#endif
+      llvm_unreachable("Do not know how to scalarize this operator's operand!");
+    case ISD::BIT_CONVERT:
+      Res = ScalarizeVecOp_BIT_CONVERT(N);
+      break;
+    case ISD::CONCAT_VECTORS:
+      Res = ScalarizeVecOp_CONCAT_VECTORS(N);
+      break;
+    case ISD::EXTRACT_VECTOR_ELT:
+      Res = ScalarizeVecOp_EXTRACT_VECTOR_ELT(N);
+      break;
+    case ISD::STORE:
+      Res = ScalarizeVecOp_STORE(cast<StoreSDNode>(N), OpNo);
+      break;
+    }
+  }
+
+  // If the result is null, the sub-method took care of registering results etc.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+/// ScalarizeVecOp_BIT_CONVERT - If the value to convert is a vector that needs
+/// to be scalarized, it must be <1 x ty>.  Convert the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_BIT_CONVERT(SDNode *N) {
+  SDValue Elt = GetScalarizedVector(N->getOperand(0));
+  return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(),
+                     N->getValueType(0), Elt);
+}
+
+/// ScalarizeVecOp_CONCAT_VECTORS - The vectors to concatenate have length one -
+/// use a BUILD_VECTOR instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
+  SmallVector<SDValue, 8> Ops(N->getNumOperands());
+  for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
+    Ops[i] = GetScalarizedVector(N->getOperand(i));
+  return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), N->getValueType(0),
+                     &Ops[0], Ops.size());
+}
+
+/// ScalarizeVecOp_EXTRACT_VECTOR_ELT - If the input is a vector that needs to
+/// be scalarized, it must be <1 x ty>, so just return the element, ignoring the
+/// index.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+  SDValue Res = GetScalarizedVector(N->getOperand(0));
+  if (Res.getValueType() != N->getValueType(0))
+    Res = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), N->getValueType(0),
+                      Res);
+  return Res;
+}
+
+/// ScalarizeVecOp_STORE - If the value to store is a vector that needs to be
+/// scalarized, it must be <1 x ty>.  Just store the element.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){
+  assert(N->isUnindexed() && "Indexed store of one-element vector?");
+  assert(OpNo == 1 && "Do not know how to scalarize this operand!");
+  DebugLoc dl = N->getDebugLoc();
+
+  if (N->isTruncatingStore())
+    return DAG.getTruncStore(N->getChain(), dl,
+                             GetScalarizedVector(N->getOperand(1)),
+                             N->getBasePtr(),
+                             N->getSrcValue(), N->getSrcValueOffset(),
+                             N->getMemoryVT().getVectorElementType(),
+                             N->isVolatile(), N->isNonTemporal(),
+                             N->getAlignment());
+
+  return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
+                      N->getBasePtr(), N->getSrcValue(), N->getSrcValueOffset(),
+                      N->isVolatile(), N->isNonTemporal(),
+                      N->getOriginalAlignment());
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Result Vector Splitting
+//===----------------------------------------------------------------------===//
+
+/// SplitVectorResult - This method is called when the specified result of the
+/// specified node is found to need vector splitting.  At this point, the node
+/// may also have invalid operands or may have other results that need
+/// legalization, we just know that (at least) one result needs vector
+/// splitting.
+void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Split node result: ";
+        N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue Lo, Hi;
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "SplitVectorResult #" << ResNo << ": ";
+    N->dump(&DAG);
+    dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to split the result of this operator!");
+
+  case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, Lo, Hi); break;
+  case ISD::SELECT:       SplitRes_SELECT(N, Lo, Hi); break;
+  case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
+  case ISD::UNDEF:        SplitRes_UNDEF(N, Lo, Hi); break;
+
+  case ISD::BIT_CONVERT:       SplitVecRes_BIT_CONVERT(N, Lo, Hi); break;
+  case ISD::BUILD_VECTOR:      SplitVecRes_BUILD_VECTOR(N, Lo, Hi); break;
+  case ISD::CONCAT_VECTORS:    SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break;
+  case ISD::CONVERT_RNDSAT:    SplitVecRes_CONVERT_RNDSAT(N, Lo, Hi); break;
+  case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break;
+  case ISD::FP_ROUND_INREG:    SplitVecRes_InregOp(N, Lo, Hi); break;
+  case ISD::FPOWI:             SplitVecRes_FPOWI(N, Lo, Hi); break;
+  case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break;
+  case ISD::SCALAR_TO_VECTOR:  SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break;
+  case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+  case ISD::LOAD:
+    SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
+    break;
+  case ISD::SETCC:
+  case ISD::VSETCC:
+    SplitVecRes_SETCC(N, Lo, Hi);
+    break;
+  case ISD::VECTOR_SHUFFLE:
+    SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
+    break;
+
+  case ISD::CTTZ:
+  case ISD::CTLZ:
+  case ISD::CTPOP:
+  case ISD::FNEG:
+  case ISD::FABS:
+  case ISD::FSQRT:
+  case ISD::FSIN:
+  case ISD::FCOS:
+  case ISD::FTRUNC:
+  case ISD::FFLOOR:
+  case ISD::FCEIL:
+  case ISD::FRINT:
+  case ISD::FNEARBYINT:
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+  case ISD::TRUNCATE:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FLOG:
+  case ISD::FLOG2:
+  case ISD::FLOG10:
+    SplitVecRes_UnaryOp(N, Lo, Hi);
+    break;
+
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::MUL:
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::FDIV:
+  case ISD::FPOW:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::UREM:
+  case ISD::SREM:
+  case ISD::FREM:
+    SplitVecRes_BinOp(N, Lo, Hi);
+    break;
+  }
+
+  // If Lo/Hi is null, the sub-method took care of registering results etc.
+  if (Lo.getNode())
+    SetSplitVector(SDValue(N, ResNo), Lo, Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
+                                         SDValue &Hi) {
+  SDValue LHSLo, LHSHi;
+  GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+  SDValue RHSLo, RHSHi;
+  GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
+  DebugLoc dl = N->getDebugLoc();
+
+  Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo, RHSLo);
+  Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi, RHSHi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BIT_CONVERT(SDNode *N, SDValue &Lo,
+                                               SDValue &Hi) {
+  // We know the result is a vector.  The input may be either a vector or a
+  // scalar value.
+  EVT LoVT, HiVT;
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue InOp = N->getOperand(0);
+  EVT InVT = InOp.getValueType();
+
+  // Handle some special cases efficiently.
+  switch (getTypeAction(InVT)) {
+  default:
+    assert(false && "Unknown type action!");
+  case Legal:
+  case PromoteInteger:
+  case SoftenFloat:
+  case ScalarizeVector:
+    break;
+  case ExpandInteger:
+  case ExpandFloat:
+    // A scalar to vector conversion, where the scalar needs expansion.
+    // If the vector is being split in two then we can just convert the
+    // expanded pieces.
+    if (LoVT == HiVT) {
+      GetExpandedOp(InOp, Lo, Hi);
+      if (TLI.isBigEndian())
+        std::swap(Lo, Hi);
+      Lo = DAG.getNode(ISD::BIT_CONVERT, dl, LoVT, Lo);
+      Hi = DAG.getNode(ISD::BIT_CONVERT, dl, HiVT, Hi);
+      return;
+    }
+    break;
+  case SplitVector:
+    // If the input is a vector that needs to be split, convert each split
+    // piece of the input now.
+    GetSplitVector(InOp, Lo, Hi);
+    Lo = DAG.getNode(ISD::BIT_CONVERT, dl, LoVT, Lo);
+    Hi = DAG.getNode(ISD::BIT_CONVERT, dl, HiVT, Hi);
+    return;
+  }
+
+  // In the general case, convert the input to an integer and split it by hand.
+  EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+  EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+  if (TLI.isBigEndian())
+    std::swap(LoIntVT, HiIntVT);
+
+  SplitInteger(BitConvertToInteger(InOp), LoIntVT, HiIntVT, Lo, Hi);
+
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+  Lo = DAG.getNode(ISD::BIT_CONVERT, dl, LoVT, Lo);
+  Hi = DAG.getNode(ISD::BIT_CONVERT, dl, HiVT, Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
+                                                SDValue &Hi) {
+  EVT LoVT, HiVT;
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  unsigned LoNumElts = LoVT.getVectorNumElements();
+  SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts);
+  Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, &LoOps[0], LoOps.size());
+
+  SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end());
+  Hi = DAG.getNode(ISD::BUILD_VECTOR, dl, HiVT, &HiOps[0], HiOps.size());
+}
+
+void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
+                                                  SDValue &Hi) {
+  assert(!(N->getNumOperands() & 1) && "Unsupported CONCAT_VECTORS");
+  DebugLoc dl = N->getDebugLoc();
+  unsigned NumSubvectors = N->getNumOperands() / 2;
+  if (NumSubvectors == 1) {
+    Lo = N->getOperand(0);
+    Hi = N->getOperand(1);
+    return;
+  }
+
+  EVT LoVT, HiVT;
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+
+  SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+NumSubvectors);
+  Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, &LoOps[0], LoOps.size());
+
+  SmallVector<SDValue, 8> HiOps(N->op_begin()+NumSubvectors, N->op_end());
+  Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HiVT, &HiOps[0], HiOps.size());
+}
+
+void DAGTypeLegalizer::SplitVecRes_CONVERT_RNDSAT(SDNode *N, SDValue &Lo,
+                                                  SDValue &Hi) {
+  EVT LoVT, HiVT;
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+
+  SDValue DTyOpLo =  DAG.getValueType(LoVT);
+  SDValue DTyOpHi =  DAG.getValueType(HiVT);
+
+  SDValue RndOp = N->getOperand(3);
+  SDValue SatOp = N->getOperand(4);
+  ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+
+  // Split the input.
+  SDValue VLo, VHi;
+  EVT InVT = N->getOperand(0).getValueType();
+  switch (getTypeAction(InVT)) {
+  default: llvm_unreachable("Unexpected type action!");
+  case Legal: {
+    EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
+                                 LoVT.getVectorNumElements());
+    VLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
+                      DAG.getIntPtrConstant(0));
+    VHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
+                      DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+    break;
+  }
+  case SplitVector:
+    GetSplitVector(N->getOperand(0), VLo, VHi);
+    break;
+  case WidenVector: {
+    // If the result needs to be split and the input needs to be widened,
+    // the two types must have different lengths. Use the widened result
+    // and extract from it to do the split.
+    SDValue InOp = GetWidenedVector(N->getOperand(0));
+    EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
+                                 LoVT.getVectorNumElements());
+    VLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+                     DAG.getIntPtrConstant(0));
+    VHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+                     DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+    break;
+  }
+  }
+
+  SDValue STyOpLo =  DAG.getValueType(VLo.getValueType());
+  SDValue STyOpHi =  DAG.getValueType(VHi.getValueType());
+
+  Lo = DAG.getConvertRndSat(LoVT, dl, VLo, DTyOpLo, STyOpLo, RndOp, SatOp,
+                            CvtCode);
+  Hi = DAG.getConvertRndSat(HiVT, dl, VHi, DTyOpHi, STyOpHi, RndOp, SatOp,
+                            CvtCode);
+}
+
+void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
+                                                     SDValue &Hi) {
+  SDValue Vec = N->getOperand(0);
+  SDValue Idx = N->getOperand(1);
+  EVT IdxVT = Idx.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT LoVT, HiVT;
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+
+  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx);
+  Idx = DAG.getNode(ISD::ADD, dl, IdxVT, Idx,
+                    DAG.getConstant(LoVT.getVectorNumElements(), IdxVT));
+  Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec, Idx);
+}
+
+void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
+                                         SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitVector(N->getOperand(0), Lo, Hi);
+  Lo = DAG.getNode(ISD::FPOWI, dl, Lo.getValueType(), Lo, N->getOperand(1));
+  Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, N->getOperand(1));
+}
+
+void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDValue LHSLo, LHSHi;
+  GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT LoVT, HiVT;
+  GetSplitDestVTs(cast<VTSDNode>(N->getOperand(1))->getVT(), LoVT, HiVT);
+
+  Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo,
+                   DAG.getValueType(LoVT));
+  Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi,
+                   DAG.getValueType(HiVT));
+}
+
+void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
+                                                     SDValue &Hi) {
+  SDValue Vec = N->getOperand(0);
+  SDValue Elt = N->getOperand(1);
+  SDValue Idx = N->getOperand(2);
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitVector(Vec, Lo, Hi);
+
+  if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
+    unsigned IdxVal = CIdx->getZExtValue();
+    unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
+    if (IdxVal < LoNumElts)
+      Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+                       Lo.getValueType(), Lo, Elt, Idx);
+    else
+      Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
+                       DAG.getIntPtrConstant(IdxVal - LoNumElts));
+    return;
+  }
+
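+  // The index is not a constant, so the element may land in either half.
+  // Materialize the vector in a stack slot, store the element into it, and
+  // reload the two halves.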
+  // Spill the vector to the stack.
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, NULL, 0,
+                               false, false, 0);
+
+  // Store the new element.  This may be larger than the vector element type,
+  // so use a truncating store.
+  SDValue EltPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
+  const Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
+  unsigned Alignment =
+    TLI.getTargetData()->getPrefTypeAlignment(VecType);
+  Store = DAG.getTruncStore(Store, dl, Elt, EltPtr, NULL, 0, EltVT,
+                            false, false, 0);
+
+  // Load the Lo part from the stack slot.
+  Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, NULL, 0,
+                   false, false, 0);
+
+  // Increment the pointer to the other part.
+  unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8;
+  StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                         DAG.getIntPtrConstant(IncrementSize));
+
+  // Load the Hi part from the stack slot.
+  Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, NULL, 0, false,
+                   false, MinAlign(Alignment, IncrementSize));
+}
+
+void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
+                                                    SDValue &Hi) {
+  EVT LoVT, HiVT;
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  Lo = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoVT, N->getOperand(0));
+  Hi = DAG.getUNDEF(HiVT);
+}
+
+void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
+                                        SDValue &Hi) {
+  assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
+  EVT LoVT, HiVT;
+  DebugLoc dl = LD->getDebugLoc();
+  GetSplitDestVTs(LD->getValueType(0), LoVT, HiVT);
+
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  SDValue Ch = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+  SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
+  const Value *SV = LD->getSrcValue();
+  int SVOffset = LD->getSrcValueOffset();
+  EVT MemoryVT = LD->getMemoryVT();
+  unsigned Alignment = LD->getOriginalAlignment();
+  bool isVolatile = LD->isVolatile();
+  bool isNonTemporal = LD->isNonTemporal();
+
+  EVT LoMemVT, HiMemVT;
+  GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT);
+
+  Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset,
+                   SV, SVOffset, LoMemVT, isVolatile, isNonTemporal, Alignment);
+
+  unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                    DAG.getIntPtrConstant(IncrementSize));
+  SVOffset += IncrementSize;
+  Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset,
+                   SV, SVOffset, HiMemVT, isVolatile, isNonTemporal, Alignment);
+
+  // Build a factor node to remember that this load is independent of the
+  // other one.
+  Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                   Hi.getValue(1));
+
+  // The chain result has been legalized - switch anything that used the old
+  // chain to use the new one.
+  ReplaceValueWith(SDValue(LD, 1), Ch);
+}
+
+void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  EVT LoVT, HiVT;
+  DebugLoc DL = N->getDebugLoc();
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+
+  // Split the input.
+  EVT InVT = N->getOperand(0).getValueType();
+  SDValue LL, LH, RL, RH;
+  EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
+                               LoVT.getVectorNumElements());
+  LL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(0),
+                   DAG.getIntPtrConstant(0));
+  LH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(0),
+                   DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+
+  RL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(1),
+                   DAG.getIntPtrConstant(0));
+  RH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(1),
+                   DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+
+  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
+  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
+}
+
+void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  // Get the dest types - they may not match the input types, e.g. int_to_fp.
+  EVT LoVT, HiVT;
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+
+  // Split the input.
+  EVT InVT = N->getOperand(0).getValueType();
+  switch (getTypeAction(InVT)) {
+  default: llvm_unreachable("Unexpected type action!");
+  case Legal: {
+    EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
+                                 LoVT.getVectorNumElements());
+    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
+                     DAG.getIntPtrConstant(0));
+    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
+                     DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+    break;
+  }
+  case SplitVector:
+    GetSplitVector(N->getOperand(0), Lo, Hi);
+    break;
+  case WidenVector: {
+    // If the result needs to be split and the input needs to be widened,
+    // the two types must have different lengths. Use the widened result
+    // and extract from it to do the split.
+    SDValue InOp = GetWidenedVector(N->getOperand(0));
+    EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
+                                 LoVT.getVectorNumElements());
+    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+                     DAG.getIntPtrConstant(0));
+    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+                     DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+    break;
+  }
+  }
+
+  Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
+  Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
+                                                  SDValue &Lo, SDValue &Hi) {
+  // The low and high parts of the original input give four input vectors.
+  SDValue Inputs[4];
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitVector(N->getOperand(0), Inputs[0], Inputs[1]);
+  GetSplitVector(N->getOperand(1), Inputs[2], Inputs[3]);
+  EVT NewVT = Inputs[0].getValueType();
+  unsigned NewElts = NewVT.getVectorNumElements();
+
+  // If Lo or Hi uses elements from at most two of the four input vectors, then
+  // express it as a vector shuffle of those two inputs.  Otherwise extract the
+  // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
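+  //
+  // For example, when splitting a v8i32 shuffle into v4i32 halves, a Lo mask
+  // of <0,1,8,9> uses only Inputs[0] (lo half of LHS) and Inputs[2] (lo half
+  // of RHS), so it becomes a two-operand v4i32 shuffle with mask <0,1,4,5>.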
+  SmallVector<int, 16> Ops;
+  for (unsigned High = 0; High < 2; ++High) {
+    SDValue &Output = High ? Hi : Lo;
+
+    // Build a shuffle mask for the output, discovering on the fly which
+    // input vectors to use as shuffle operands (recorded in InputUsed).
+    // If building a suitable shuffle vector proves too hard, then bail
+    // out with useBuildVector set.
+    unsigned InputUsed[2] = { -1U, -1U }; // Not yet discovered.
+    unsigned FirstMaskIdx = High * NewElts;
+    bool useBuildVector = false;
+    for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
+      // The mask element.  This indexes into the input.
+      int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
+
+      // The input vector this mask element indexes into.
+      unsigned Input = (unsigned)Idx / NewElts;
+
+      if (Input >= array_lengthof(Inputs)) {
+        // The mask element does not index into any input vector.
+        Ops.push_back(-1);
+        continue;
+      }
+
+      // Turn the index into an offset from the start of the input vector.
+      Idx -= Input * NewElts;
+
+      // Find or create a shuffle vector operand to hold this input.
+      unsigned OpNo;
+      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
+        if (InputUsed[OpNo] == Input) {
+          // This input vector is already an operand.
+          break;
+        } else if (InputUsed[OpNo] == -1U) {
+          // Create a new operand for this input vector.
+          InputUsed[OpNo] = Input;
+          break;
+        }
+      }
+
+      if (OpNo >= array_lengthof(InputUsed)) {
+        // More than two input vectors used!  Give up on trying to create a
+        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
+        useBuildVector = true;
+        break;
+      }
+
+      // Add the mask index for the new shuffle vector.
+      Ops.push_back(Idx + OpNo * NewElts);
+    }
+
+    if (useBuildVector) {
+      EVT EltVT = NewVT.getVectorElementType();
+      SmallVector<SDValue, 16> SVOps;
+
+      // Extract the input elements by hand.
+      for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
+        // The mask element.  This indexes into the input.
+        int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
+
+        // The input vector this mask element indexes into.
+        unsigned Input = (unsigned)Idx / NewElts;
+
+        if (Input >= array_lengthof(Inputs)) {
+          // The mask element is "undef" or indexes off the end of the input.
+          SVOps.push_back(DAG.getUNDEF(EltVT));
+          continue;
+        }
+
+        // Turn the index into an offset from the start of the input vector.
+        Idx -= Input * NewElts;
+
+        // Extract the vector element by hand.
+        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                    Inputs[Input], DAG.getIntPtrConstant(Idx)));
+      }
+
+      // Construct the Lo/Hi output using a BUILD_VECTOR.
+      Output = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+                           &SVOps[0], SVOps.size());
+    } else if (InputUsed[0] == -1U) {
+      // No input vectors were used!  The result is undefined.
+      Output = DAG.getUNDEF(NewVT);
+    } else {
+      SDValue Op0 = Inputs[InputUsed[0]];
+      // If only one input was used, use an undefined vector for the other.
+      SDValue Op1 = InputUsed[1] == -1U ?
+        DAG.getUNDEF(NewVT) : Inputs[InputUsed[1]];
+      // At least one input vector was used.  Create a new shuffle vector.
+      Output =  DAG.getVectorShuffle(NewVT, dl, Op0, Op1, &Ops[0]);
+    }
+
+    Ops.clear();
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Operand Vector Splitting
+//===----------------------------------------------------------------------===//
+
+/// SplitVectorOperand - This method is called when the specified operand of the
+/// specified node is found to need vector splitting.  At this point, all of the
+/// result types of the node are known to be legal, but other operands of the
+/// node may need legalization as well as the specified one.
+bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(dbgs() << "Split node operand: ";
+        N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  if (Res.getNode() == 0) {
+    switch (N->getOpcode()) {
+    default:
+#ifndef NDEBUG
+      dbgs() << "SplitVectorOperand Op #" << OpNo << ": ";
+      N->dump(&DAG);
+      dbgs() << "\n";
+#endif
+      llvm_unreachable("Do not know how to split this operator's operand!");
+
+    case ISD::BIT_CONVERT:       Res = SplitVecOp_BIT_CONVERT(N); break;
+    case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
+    case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
+    case ISD::STORE:
+      Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
+      break;
+
+    case ISD::CTTZ:
+    case ISD::CTLZ:
+    case ISD::CTPOP:
+    case ISD::FP_TO_SINT:
+    case ISD::FP_TO_UINT:
+    case ISD::SINT_TO_FP:
+    case ISD::UINT_TO_FP:
+    case ISD::TRUNCATE:
+    case ISD::SIGN_EXTEND:
+    case ISD::ZERO_EXTEND:
+    case ISD::ANY_EXTEND:
+      Res = SplitVecOp_UnaryOp(N);
+      break;
+    }
+  }
+
+  // If the result is null, the sub-method took care of registering results etc.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
+  // The result has a legal vector type, but the input needs splitting.
+  EVT ResVT = N->getValueType(0);
+  SDValue Lo, Hi;
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitVector(N->getOperand(0), Lo, Hi);
+  EVT InVT = Lo.getValueType();
+
+  EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
+                               InVT.getVectorNumElements());
+
+  Lo = DAG.getNode(N->getOpcode(), dl, OutVT, Lo);
+  Hi = DAG.getNode(N->getOpcode(), dl, OutVT, Hi);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_BIT_CONVERT(SDNode *N) {
+  // For example, i64 = BIT_CONVERT v4i16 on Alpha.  Typically the vector will
+  // end up being split all the way down to individual components.  Convert the
+  // split pieces into integers and reassemble.
+  SDValue Lo, Hi;
+  GetSplitVector(N->getOperand(0), Lo, Hi);
+  Lo = BitConvertToInteger(Lo);
+  Hi = BitConvertToInteger(Hi);
+
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+
+  return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), N->getValueType(0),
+                     JoinIntegers(Lo, Hi));
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
+  // We know that the extracted result type is legal.  For now, assume the index
+  // is a constant.
+  EVT SubVT = N->getValueType(0);
+  SDValue Idx = N->getOperand(1);
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Lo, Hi;
+  GetSplitVector(N->getOperand(0), Lo, Hi);
+
+  uint64_t LoElts = Lo.getValueType().getVectorNumElements();
+  uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+  if (IdxVal < LoElts) {
+    assert(IdxVal + SubVT.getVectorNumElements() <= LoElts &&
+           "Extracted subvector crosses vector split!");
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
+  } else {
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Hi,
+                       DAG.getConstant(IdxVal - LoElts, Idx.getValueType()));
+  }
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+  SDValue Vec = N->getOperand(0);
+  SDValue Idx = N->getOperand(1);
+  EVT VecVT = Vec.getValueType();
+
+  if (isa<ConstantSDNode>(Idx)) {
+    uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+    assert(IdxVal < VecVT.getVectorNumElements() && "Invalid vector index!");
+
+    SDValue Lo, Hi;
+    GetSplitVector(Vec, Lo, Hi);
+
+    uint64_t LoElts = Lo.getValueType().getVectorNumElements();
+
+    if (IdxVal < LoElts)
+      return SDValue(DAG.UpdateNodeOperands(N, Lo, Idx), 0);
+    return SDValue(DAG.UpdateNodeOperands(N, Hi,
+                                  DAG.getConstant(IdxVal - LoElts,
+                                                  Idx.getValueType())),
+                   0);
+  }
+
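+  // The index is not a constant, so spill the whole vector and load the
+  // requested element back out of the stack slot.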
+  // Store the vector to the stack.
+  EVT EltVT = VecVT.getVectorElementType();
+  DebugLoc dl = N->getDebugLoc();
+  SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  const Value *SV = PseudoSourceValue::getFixedStack(SPFI);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, SV, 0,
+                               false, false, 0);
+
+  // Load back the required element.
+  StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
+  return DAG.getExtLoad(ISD::EXTLOAD, N->getValueType(0), dl, Store, StackPtr,
+                        SV, 0, EltVT, false, false, 0);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
+  assert(N->isUnindexed() && "Indexed store of vector?");
+  assert(OpNo == 1 && "Can only split the stored value");
+  DebugLoc dl = N->getDebugLoc();
+
+  bool isTruncating = N->isTruncatingStore();
+  SDValue Ch  = N->getChain();
+  SDValue Ptr = N->getBasePtr();
+  int SVOffset = N->getSrcValueOffset();
+  EVT MemoryVT = N->getMemoryVT();
+  unsigned Alignment = N->getOriginalAlignment();
+  bool isVol = N->isVolatile();
+  bool isNT = N->isNonTemporal();
+  SDValue Lo, Hi;
+  GetSplitVector(N->getOperand(1), Lo, Hi);
+
+  EVT LoMemVT, HiMemVT;
+  GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT);
+
+  unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+
+  if (isTruncating)
+    Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getSrcValue(), SVOffset,
+                           LoMemVT, isVol, isNT, Alignment);
+  else
+    Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getSrcValue(), SVOffset,
+                      isVol, isNT, Alignment);
+
+  // Increment the pointer to the other half.
+  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                    DAG.getIntPtrConstant(IncrementSize));
+  SVOffset += IncrementSize;
+
+  if (isTruncating)
+    Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getSrcValue(), SVOffset,
+                           HiMemVT, isVol, isNT, Alignment);
+  else
+    Hi = DAG.getStore(Ch, dl, Hi, Ptr, N->getSrcValue(), SVOffset,
+                      isVol, isNT, Alignment);
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Result Vector Widening
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Widen node result " << ResNo << ": ";
+        N->dump(&DAG);
+        dbgs() << "\n");
+
+  // See if the target wants to custom widen this node.
+  if (CustomWidenLowerNode(N, N->getValueType(ResNo)))
+    return;
+
+  SDValue Res = SDValue();
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "WidenVectorResult #" << ResNo << ": ";
+    N->dump(&DAG);
+    dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to widen the result of this operator!");
+
+  case ISD::BIT_CONVERT:       Res = WidenVecRes_BIT_CONVERT(N); break;
+  case ISD::BUILD_VECTOR:      Res = WidenVecRes_BUILD_VECTOR(N); break;
+  case ISD::CONCAT_VECTORS:    Res = WidenVecRes_CONCAT_VECTORS(N); break;
+  case ISD::CONVERT_RNDSAT:    Res = WidenVecRes_CONVERT_RNDSAT(N); break;
+  case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
+  case ISD::FP_ROUND_INREG:    Res = WidenVecRes_InregOp(N); break;
+  case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
+  case ISD::LOAD:              Res = WidenVecRes_LOAD(N); break;
+  case ISD::SCALAR_TO_VECTOR:  Res = WidenVecRes_SCALAR_TO_VECTOR(N); break;
+  case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break;
+  case ISD::SELECT:            Res = WidenVecRes_SELECT(N); break;
+  case ISD::SELECT_CC:         Res = WidenVecRes_SELECT_CC(N); break;
+  case ISD::SETCC:             Res = WidenVecRes_SETCC(N); break;
+  case ISD::UNDEF:             Res = WidenVecRes_UNDEF(N); break;
+  case ISD::VECTOR_SHUFFLE:
+    Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
+    break;
+  case ISD::VSETCC:
+    Res = WidenVecRes_VSETCC(N);
+    break;
+
+  case ISD::ADD:
+  case ISD::AND:
+  case ISD::BSWAP:
+  case ISD::FADD:
+  case ISD::FCOPYSIGN:
+  case ISD::FDIV:
+  case ISD::FMUL:
+  case ISD::FPOW:
+  case ISD::FREM:
+  case ISD::FSUB:
+  case ISD::MUL:
+  case ISD::MULHS:
+  case ISD::MULHU:
+  case ISD::OR:
+  case ISD::SDIV:
+  case ISD::SREM:
+  case ISD::UDIV:
+  case ISD::UREM:
+  case ISD::SUB:
+  case ISD::XOR:
+    Res = WidenVecRes_Binary(N);
+    break;
+
+  case ISD::FPOWI:
+    Res = WidenVecRes_POWI(N);
+    break;
+
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+    Res = WidenVecRes_Shift(N);
+    break;
+
+  case ISD::FP_ROUND:
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+  case ISD::TRUNCATE:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+    Res = WidenVecRes_Convert(N);
+    break;
+
+  case ISD::CTLZ:
+  case ISD::CTPOP:
+  case ISD::CTTZ:
+  case ISD::FABS:
+  case ISD::FCOS:
+  case ISD::FNEG:
+  case ISD::FSIN:
+  case ISD::FSQRT:
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FLOG:
+  case ISD::FLOG2:
+  case ISD::FLOG10:
+    Res = WidenVecRes_Unary(N);
+    break;
+  }
+
+  // If Res is null, the sub-method took care of registering the result.
+  if (Res.getNode())
+    SetWidenedVector(SDValue(N, ResNo), Res);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
+  // Binary op widening.
+  unsigned Opcode = N->getOpcode();
+  DebugLoc dl = N->getDebugLoc();
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  EVT WidenEltVT = WidenVT.getVectorElementType();
+  EVT VT = WidenVT;
+  unsigned NumElts =  VT.getVectorNumElements();
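+  // Halve NumElts until VT is a vector type the target can synthesize (or we
+  // are down to a single element); this is the widest legal chunk size.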
+  while (!TLI.isTypeSynthesizable(VT) && NumElts != 1) {
+     NumElts = NumElts / 2;
+     VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
+  }
+
+  if (NumElts != 1 && !TLI.canOpTrap(N->getOpcode(), VT)) {
+    // The operation cannot trap, so just widen as normal.
+    SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+    SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+    return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2);
+  } else if (NumElts == 1) {
+    // No legal vector version so unroll the vector operation and then widen.
+    return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
+  } else {
+    // Since the operation can trap, apply operation on the original vector.
+    EVT MaxVT = VT;
+    SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+    SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+    unsigned CurNumElts = N->getValueType(0).getVectorNumElements();
+
+    SmallVector<SDValue, 16> ConcatOps(CurNumElts);
+    unsigned ConcatEnd = 0;  // Current ConcatOps index.
+    int Idx = 0;        // Current Idx into input vectors.
+
+    // NumElts := greatest synthesizable vector size (at most WidenVT)
+    // while (orig. vector has unhandled elements) {
+    //   take munches of size NumElts from the beginning and add to ConcatOps
+    //   NumElts := next smaller supported vector size or 1
+    // }
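+    //
+    // Illustrative example (assuming v4i32 is the widen type and v2i32 is
+    // synthesizable): a v3i32 udiv must not execute a fourth, undefined lane,
+    // since dividing by an undefined value could trap.  The three real lanes
+    // are computed as one v2i32 udiv plus one scalar udiv, and the pieces are
+    // then recombined and padded with undef back up to v4i32.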
+    while (CurNumElts != 0) {
+      while (CurNumElts >= NumElts) {
+        SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1,
+                                   DAG.getIntPtrConstant(Idx));
+        SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2,
+                                   DAG.getIntPtrConstant(Idx));
+        ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2);
+        Idx += NumElts;
+        CurNumElts -= NumElts;
+      }
+      do {
+        NumElts = NumElts / 2;
+        VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
+      } while (!TLI.isTypeSynthesizable(VT) && NumElts != 1);
+
+      if (NumElts == 1) {
+        for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) {
+          SDValue EOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, 
+                                     InOp1, DAG.getIntPtrConstant(Idx));
+          SDValue EOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, 
+                                     InOp2, DAG.getIntPtrConstant(Idx));
+          ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT,
+                                               EOp1, EOp2);
+        }
+        CurNumElts = 0;
+      }
+    }
+
+    // Check to see if we have a single operation with the widen type.
+    if (ConcatEnd == 1) {
+      VT = ConcatOps[0].getValueType();
+      if (VT == WidenVT)
+        return ConcatOps[0];
+    }
+
+    // while (Some element of ConcatOps is not of type MaxVT) {
+    //   From the end of ConcatOps, collect elements of the same type and put
+    //   them into an op of the next larger supported type
+    // }
+    while (ConcatOps[ConcatEnd-1].getValueType() != MaxVT) {
+      Idx = ConcatEnd - 1;
+      VT = ConcatOps[Idx--].getValueType();
+      while (Idx >= 0 && ConcatOps[Idx].getValueType() == VT)
+        Idx--;
+
+      int NextSize = VT.isVector() ? VT.getVectorNumElements() : 1;
+      EVT NextVT;
+      do {
+        NextSize *= 2;
+        NextVT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NextSize);
+      } while (!TLI.isTypeSynthesizable(NextVT));
+
+      if (!VT.isVector()) {
+        // Scalar type, create an INSERT_VECTOR_ELT of type NextVT
+        SDValue VecOp = DAG.getUNDEF(NextVT);
+        unsigned NumToInsert = ConcatEnd - Idx - 1;
+        for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) {
+          VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp,
+                              ConcatOps[OpIdx], DAG.getIntPtrConstant(i));
+        }
+        ConcatOps[Idx+1] = VecOp;
+        ConcatEnd = Idx + 2;
+      } else {
+        // Vector type, create a CONCAT_VECTORS of type NextVT
+        SDValue undefVec = DAG.getUNDEF(VT);
+        unsigned OpsToConcat = NextSize/VT.getVectorNumElements();
+        SmallVector<SDValue, 16> SubConcatOps(OpsToConcat);
+        unsigned RealVals = ConcatEnd - Idx - 1;
+        unsigned SubConcatEnd = 0;
+        unsigned SubConcatIdx = Idx + 1;
+        while (SubConcatEnd < RealVals)
+          SubConcatOps[SubConcatEnd++] = ConcatOps[++Idx];
+        while (SubConcatEnd < OpsToConcat)
+          SubConcatOps[SubConcatEnd++] = undefVec;
+        ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl,
+                                              NextVT, &SubConcatOps[0],
+                                              OpsToConcat);
+        ConcatEnd = SubConcatIdx + 1;
+      }
+    }
+
+    // Check to see if we have a single operation with the widen type.
+    if (ConcatEnd == 1) {
+      VT = ConcatOps[0].getValueType();
+      if (VT == WidenVT)
+        return ConcatOps[0];
+    }
+    
+    // add undefs of size MaxVT until ConcatOps grows to length of WidenVT
+    unsigned NumOps = 
+        WidenVT.getVectorNumElements()/MaxVT.getVectorNumElements();
+    if (NumOps != ConcatEnd) {
+      SDValue UndefVal = DAG.getUNDEF(MaxVT);
+      for (unsigned j = ConcatEnd; j < NumOps; ++j)
+        ConcatOps[j] = UndefVal;
+    }
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &ConcatOps[0], NumOps);
+  }
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
+  SDValue InOp = N->getOperand(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  EVT InVT = InOp.getValueType();
+  EVT InEltVT = InVT.getVectorElementType();
+  EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts);
+
+  unsigned Opcode = N->getOpcode();
+  unsigned InVTNumElts = InVT.getVectorNumElements();
+
+  if (getTypeAction(InVT) == WidenVector) {
+    InOp = GetWidenedVector(N->getOperand(0));
+    InVT = InOp.getValueType();
+    InVTNumElts = InVT.getVectorNumElements();
+    if (InVTNumElts == WidenNumElts)
+      return DAG.getNode(Opcode, dl, WidenVT, InOp);
+  }
+
+  if (TLI.isTypeSynthesizable(InWidenVT)) {
+    // Because the result and the input are different vector types, widening
+    // the result could create a legal type but widening the input might make
+    // it an illegal type that might lead to repeatedly splitting the input
+    // and then widening it. To avoid this, we widen the input only if
+    // it results in a legal type.
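+    //
+    // For example, converting v2i32 to v2f32 where the result widens to
+    // v4f32: concatenate the input with an undef v2i32 to form a v4i32
+    // (legal here by the guard above) and emit one v4i32 -> v4f32 convert.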
+    if (WidenNumElts % InVTNumElts == 0) {
+      // Widen the input and call convert on the widened input vector.
+      unsigned NumConcat = WidenNumElts/InVTNumElts;
+      SmallVector<SDValue, 16> Ops(NumConcat);
+      Ops[0] = InOp;
+      SDValue UndefVal = DAG.getUNDEF(InVT);
+      for (unsigned i = 1; i != NumConcat; ++i)
+        Ops[i] = UndefVal;
+      return DAG.getNode(Opcode, dl, WidenVT,
+                         DAG.getNode(ISD::CONCAT_VECTORS, dl, InWidenVT,
+                         &Ops[0], NumConcat));
+    }
+
+    if (InVTNumElts % WidenNumElts == 0) {
+      // Extract the input and convert the shortened input vector.
+      return DAG.getNode(Opcode, dl, WidenVT,
+                         DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InWidenVT,
+                                     InOp, DAG.getIntPtrConstant(0)));
+    }
+  }
+
+  // Otherwise unroll into some nasty scalar code and rebuild the vector.
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  EVT EltVT = WidenVT.getVectorElementType();
+  unsigned MinElts = std::min(InVTNumElts, WidenNumElts);
+  unsigned i;
+  for (i=0; i < MinElts; ++i)
+    Ops[i] = DAG.getNode(Opcode, dl, EltVT,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+                                     DAG.getIntPtrConstant(i)));
+
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for (; i < WidenNumElts; ++i)
+    Ops[i] = UndefVal;
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  SDValue ShOp = N->getOperand(1);
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp, ShOp);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  SDValue ShOp = N->getOperand(1);
+
+  EVT ShVT = ShOp.getValueType();
+  if (getTypeAction(ShVT) == WidenVector) {
+    ShOp = GetWidenedVector(ShOp);
+    ShVT = ShOp.getValueType();
+  }
+  EVT ShWidenVT = EVT::getVectorVT(*DAG.getContext(),
+                                   ShVT.getVectorElementType(),
+                                   WidenVT.getVectorNumElements());
+  if (ShVT != ShWidenVT)
+    ShOp = ModifyToType(ShOp, ShWidenVT);
+
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp, ShOp);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) {
+  // Unary op widening.
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_InregOp(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
+                               cast<VTSDNode>(N->getOperand(1))->getVT()
+                                 .getVectorElementType(),
+                               WidenVT.getVectorNumElements());
+  SDValue WidenLHS = GetWidenedVector(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                     WidenVT, WidenLHS, DAG.getValueType(ExtVT));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BIT_CONVERT(SDNode *N) {
+  SDValue InOp = N->getOperand(0);
+  EVT InVT = InOp.getValueType();
+  EVT VT = N->getValueType(0);
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  DebugLoc dl = N->getDebugLoc();
+
+  switch (getTypeAction(InVT)) {
+  default:
+    assert(false && "Unknown type action!");
+    break;
+  case Legal:
+    break;
+  case PromoteInteger:
+    // If the InOp is promoted to the same size, convert it.  Otherwise,
+    // fall out of the switch and widen the promoted input.
+    InOp = GetPromotedInteger(InOp);
+    InVT = InOp.getValueType();
+    if (WidenVT.bitsEq(InVT))
+      return DAG.getNode(ISD::BIT_CONVERT, dl, WidenVT, InOp);
+    break;
+  case SoftenFloat:
+  case ExpandInteger:
+  case ExpandFloat:
+  case ScalarizeVector:
+  case SplitVector:
+    break;
+  case WidenVector:
+    // If the InOp is widened to the same size, convert it.  Otherwise, fall
+    // out of the switch and widen the widened input.
+    InOp = GetWidenedVector(InOp);
+    InVT = InOp.getValueType();
+    if (WidenVT.bitsEq(InVT))
+      // The input widens to the same size. Convert to the widened value.
+      return DAG.getNode(ISD::BIT_CONVERT, dl, WidenVT, InOp);
+    break;
+  }
+
+  unsigned WidenSize = WidenVT.getSizeInBits();
+  unsigned InSize = InVT.getSizeInBits();
+  if (WidenSize % InSize == 0) {
+    // Determine the new input vector type.  The new input vector type will
+    // use the same element type (if the input is a vector) or the input type
+    // itself as the element type.  It is the same size as the type to widen to.
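+    //
+    // For example, bitcasting an i64 to a v2f32 result that widens to v4f32
+    // (128 bits): NewInVT is v2i64, so (when v2i64 is legal) the i64 is put
+    // in a BUILD_VECTOR with an undef i64 and the v2i64 is bitcast to v4f32.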
+    EVT NewInVT;
+    unsigned NewNumElts = WidenSize / InSize;
+    if (InVT.isVector()) {
+      EVT InEltVT = InVT.getVectorElementType();
+      NewInVT= EVT::getVectorVT(*DAG.getContext(), InEltVT,
+                                WidenSize / InEltVT.getSizeInBits());
+    } else {
+      NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts);
+    }
+
+    if (TLI.isTypeSynthesizable(NewInVT)) {
+      // Because the result and the input are different vector types, widening
+      // the result could create a legal type but widening the input might make
+      // it an illegal type that might lead to repeatedly splitting the input
+      // and then widening it. To avoid this, we widen the input only if
+      // it results in a legal type.
+      SmallVector<SDValue, 16> Ops(NewNumElts);
+      SDValue UndefVal = DAG.getUNDEF(InVT);
+      Ops[0] = InOp;
+      for (unsigned i = 1; i < NewNumElts; ++i)
+        Ops[i] = UndefVal;
+
+      SDValue NewVec;
+      if (InVT.isVector())
+        NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl,
+                             NewInVT, &Ops[0], NewNumElts);
+      else
+        NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl,
+                             NewInVT, &Ops[0], NewNumElts);
+      return DAG.getNode(ISD::BIT_CONVERT, dl, WidenVT, NewVec);
+    }
+  }
+
+  return CreateStackStoreLoad(InOp, WidenVT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  // Build a vector with undefined for the new nodes.
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  SmallVector<SDValue, 16> NewOps(N->op_begin(), N->op_end());
+  NewOps.reserve(WidenNumElts);
+  for (unsigned i = NumElts; i < WidenNumElts; ++i)
+    NewOps.push_back(DAG.getUNDEF(EltVT));
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &NewOps[0], NewOps.size());
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
+  EVT InVT = N->getOperand(0).getValueType();
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+  unsigned NumOperands = N->getNumOperands();
+
+  bool InputWidened = false; // Indicates we need to widen the input.
+  if (getTypeAction(InVT) != WidenVector) {
+    if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) {
+      // Add undef vectors to widen to correct length.
+      unsigned NumConcat = WidenVT.getVectorNumElements() /
+                           InVT.getVectorNumElements();
+      SDValue UndefVal = DAG.getUNDEF(InVT);
+      SmallVector<SDValue, 16> Ops(NumConcat);
+      for (unsigned i=0; i < NumOperands; ++i)
+        Ops[i] = N->getOperand(i);
+      for (unsigned i = NumOperands; i != NumConcat; ++i)
+        Ops[i] = UndefVal;
+      return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &Ops[0], NumConcat);
+    }
+  } else {
+    InputWidened = true;
+    if (WidenVT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) {
+      // The inputs and the result are widened to the same type.
+      unsigned i;
+      for (i=1; i < NumOperands; ++i)
+        if (N->getOperand(i).getOpcode() != ISD::UNDEF)
+          break;
+
+      if (i == NumOperands)
+        // Everything but the first operand is an UNDEF so just return the
+        // widened first operand.
+        return GetWidenedVector(N->getOperand(0));
+
+      if (NumOperands == 2) {
+        // Replace concat of two operands with a shuffle.
+        SmallVector<int, 16> MaskOps(WidenNumElts);
+        for (unsigned i=0; i < WidenNumElts/2; ++i) {
+          MaskOps[i] = i;
+          MaskOps[i+WidenNumElts/2] = i+WidenNumElts;
+        }
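+        // E.g. for WidenNumElts == 8 the mask is <0,1,2,3,8,9,10,11>,
+        // selecting the low half of each widened operand.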
+        return DAG.getVectorShuffle(WidenVT, dl,
+                                    GetWidenedVector(N->getOperand(0)),
+                                    GetWidenedVector(N->getOperand(1)),
+                                    &MaskOps[0]);
+      }
+    }
+  }
+
+  // Fall back to extracting the elements and building a vector.
+  EVT EltVT = WidenVT.getVectorElementType();
+  unsigned NumInElts = InVT.getVectorNumElements();
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  unsigned Idx = 0;
+  for (unsigned i=0; i < NumOperands; ++i) {
+    SDValue InOp = N->getOperand(i);
+    if (InputWidened)
+      InOp = GetWidenedVector(InOp);
+    for (unsigned j=0; j < NumInElts; ++j)
+        Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+                                 DAG.getIntPtrConstant(j));
+  }
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for (; Idx < WidenNumElts; ++Idx)
+    Ops[Idx] = UndefVal;
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue InOp  = N->getOperand(0);
+  SDValue RndOp = N->getOperand(3);
+  SDValue SatOp = N->getOperand(4);
+
+  EVT      WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),
+                                              N->getValueType(0));
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  EVT InVT = InOp.getValueType();
+  EVT InEltVT = InVT.getVectorElementType();
+  EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts);
+
+  SDValue DTyOp = DAG.getValueType(WidenVT);
+  SDValue STyOp = DAG.getValueType(InWidenVT);
+  ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+
+  unsigned InVTNumElts = InVT.getVectorNumElements();
+  if (getTypeAction(InVT) == WidenVector) {
+    InOp = GetWidenedVector(InOp);
+    InVT = InOp.getValueType();
+    InVTNumElts = InVT.getVectorNumElements();
+    if (InVTNumElts == WidenNumElts)
+      return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
+                                  SatOp, CvtCode);
+  }
+
+  if (TLI.isTypeSynthesizable(InWidenVT)) {
+    // Because the result and the input are different vector types, widening
+    // the result could create a legal type but widening the input might make
+    // it an illegal type that might lead to repeatedly splitting the input
+    // and then widening it. To avoid this, we widen the input only if
+    // it results in a legal type.
+    if (WidenNumElts % InVTNumElts == 0) {
+      // Widen the input and call convert on the widened input vector.
+      unsigned NumConcat = WidenNumElts/InVTNumElts;
+      SmallVector<SDValue, 16> Ops(NumConcat);
+      Ops[0] = InOp;
+      SDValue UndefVal = DAG.getUNDEF(InVT);
+      for (unsigned i = 1; i != NumConcat; ++i) {
+        Ops[i] = UndefVal;
+      }
+      InOp = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWidenVT, &Ops[0],NumConcat);
+      return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
+                                  SatOp, CvtCode);
+    }
+
+    if (InVTNumElts % WidenNumElts == 0) {
+      // Extract the input and convert the shortened input vector.
+      InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp,
+                         DAG.getIntPtrConstant(0));
+      return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
+                                SatOp, CvtCode);
+    }
+  }
+
+  // Otherwise unroll into some nasty scalar code and rebuild the vector.
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  EVT EltVT = WidenVT.getVectorElementType();
+  DTyOp = DAG.getValueType(EltVT);
+  STyOp = DAG.getValueType(InEltVT);
+
+  unsigned MinElts = std::min(InVTNumElts, WidenNumElts);
+  unsigned i;
+  for (i=0; i < MinElts; ++i) {
+    SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+                                 DAG.getIntPtrConstant(i));
+    Ops[i] = DAG.getConvertRndSat(WidenVT, dl, ExtVal, DTyOp, STyOp, RndOp,
+                                        SatOp, CvtCode);
+  }
+
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for (; i < WidenNumElts; ++i)
+    Ops[i] = UndefVal;
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
+  EVT      VT = N->getValueType(0);
+  EVT      WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+  SDValue  InOp = N->getOperand(0);
+  SDValue  Idx  = N->getOperand(1);
+  DebugLoc dl = N->getDebugLoc();
+
+  if (getTypeAction(InOp.getValueType()) == WidenVector)
+    InOp = GetWidenedVector(InOp);
+
+  EVT InVT = InOp.getValueType();
+
+  ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx);
+  if (CIdx) {
+    unsigned IdxVal = CIdx->getZExtValue();
+    // Check if we can just return the input vector after widening.
+    if (IdxVal == 0 && InVT == WidenVT)
+      return InOp;
+
+    // Check if we can extract from the vector.
+    unsigned InNumElts = InVT.getVectorNumElements();
+    if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts <= InNumElts)
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, WidenVT, InOp, Idx);
+  }
+
+  // We could try widening the input to the right length but for now, extract
+  // the original elements, fill the rest with undefs and build a vector.
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  EVT EltVT = VT.getVectorElementType();
+  EVT IdxVT = Idx.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned i;
+  if (CIdx) {
+    unsigned IdxVal = CIdx->getZExtValue();
+    for (i=0; i < NumElts; ++i)
+      Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+                           DAG.getConstant(IdxVal+i, IdxVT));
+  } else {
+    Ops[0] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, Idx);
+    for (i=1; i < NumElts; ++i) {
+      SDValue NewIdx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
+                                   DAG.getConstant(i, IdxVT));
+      Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, NewIdx);
+    }
+  }
+
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for (; i < WidenNumElts; ++i)
+    Ops[i] = UndefVal;
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, N->getDebugLoc(),
+                     InOp.getValueType(), InOp,
+                     N->getOperand(1), N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+
+  SDValue Result;
+  SmallVector<SDValue, 16> LdChain;  // Chain for the series of loads.
+  if (ExtType != ISD::NON_EXTLOAD)
+    Result = GenWidenVectorExtLoads(LdChain, LD, ExtType);
+  else
+    Result = GenWidenVectorLoads(LdChain, LD);
+
+  // If we generate a single load, we can use that for the chain.  Otherwise,
+  // build a factor node to remember the multiple loads are independent and
+  // chain to that.
+  SDValue NewChain;
+  if (LdChain.size() == 1)
+    NewChain = LdChain[0];
+  else
+    NewChain = DAG.getNode(ISD::TokenFactor, LD->getDebugLoc(), MVT::Other,
+                           &LdChain[0], LdChain.size());
+
+  // Modified the chain - switch anything that used the old chain to use
+  // the new one.
+  ReplaceValueWith(SDValue(N, 1), NewChain);
+
+  return Result;
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  return DAG.getNode(ISD::SCALAR_TO_VECTOR, N->getDebugLoc(),
+                     WidenVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  SDValue Cond1 = N->getOperand(0);
+  EVT CondVT = Cond1.getValueType();
+  if (CondVT.isVector()) {
+    EVT CondEltVT = CondVT.getVectorElementType();
+    EVT CondWidenVT =  EVT::getVectorVT(*DAG.getContext(),
+                                        CondEltVT, WidenNumElts);
+    if (getTypeAction(CondVT) == WidenVector)
+      Cond1 = GetWidenedVector(Cond1);
+
+    if (Cond1.getValueType() != CondWidenVT)
+       Cond1 = ModifyToType(Cond1, CondWidenVT);
+  }
+
+  SDValue InOp1 = GetWidenedVector(N->getOperand(1));
+  SDValue InOp2 = GetWidenedVector(N->getOperand(2));
+  assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT);
+  return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
+                     WidenVT, Cond1, InOp1, InOp2);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) {
+  SDValue InOp1 = GetWidenedVector(N->getOperand(2));
+  SDValue InOp2 = GetWidenedVector(N->getOperand(3));
+  return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(),
+                     InOp1.getValueType(), N->getOperand(0),
+                     N->getOperand(1), InOp1, InOp2, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+  SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+  return DAG.getNode(ISD::SETCC, N->getDebugLoc(), WidenVT,
+                     InOp1, InOp2, N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_UNDEF(SDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.getUNDEF(WidenVT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+  SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+
+  // Adjust mask based on new input vector length.
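+  // For example, a v2f32 shuffle <1,3> widened to v4f32 becomes
+  // <1,5,-1,-1>: indices into the second operand are shifted by the widened
+  // length, and the new trailing elements are undef (-1).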
+  SmallVector<int, 16> NewMask;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    int Idx = N->getMaskElt(i);
+    if (Idx < (int)NumElts)
+      NewMask.push_back(Idx);
+    else
+      NewMask.push_back(Idx - NumElts + WidenNumElts);
+  }
+  for (unsigned i = NumElts; i != WidenNumElts; ++i)
+    NewMask.push_back(-1);
+  return DAG.getVectorShuffle(WidenVT, dl, InOp1, InOp2, &NewMask[0]);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_VSETCC(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  SDValue InOp1 = N->getOperand(0);
+  EVT InVT = InOp1.getValueType();
+  assert(InVT.isVector() && "cannot widen non-vector type");
+  EVT WidenInVT = EVT::getVectorVT(*DAG.getContext(),
+                                   InVT.getVectorElementType(), WidenNumElts);
+  InOp1 = GetWidenedVector(InOp1);
+  SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+
+  // Assume that the input and output will be widened appropriately.  If not,
+  // we will have to unroll it at some point.
+  assert(InOp1.getValueType() == WidenInVT &&
+         InOp2.getValueType() == WidenInVT &&
+         "Input not widened to expected type!");
+  return DAG.getNode(ISD::VSETCC, N->getDebugLoc(),
+                     WidenVT, InOp1, InOp2, N->getOperand(2));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Widen Vector Operand
+//===----------------------------------------------------------------------===//
+bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Widen node operand " << ResNo << ": ";
+        N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "WidenVectorOperand op #" << ResNo << ": ";
+    N->dump(&DAG);
+    dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to widen this operator's operand!");
+
+  case ISD::BIT_CONVERT:        Res = WidenVecOp_BIT_CONVERT(N); break;
+  case ISD::CONCAT_VECTORS:     Res = WidenVecOp_CONCAT_VECTORS(N); break;
+  case ISD::EXTRACT_SUBVECTOR:  Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break;
+  case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
+  case ISD::STORE:              Res = WidenVecOp_STORE(N); break;
+
+  case ISD::FP_ROUND:
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+  case ISD::TRUNCATE:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+    Res = WidenVecOp_Convert(N);
+    break;
+  }
+
+  // If Res is null, the sub-method took care of registering the result.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
+  // Since the result is legal and the input is illegal, it is unlikely
+  // that we can fix the input to a legal type so unroll the convert
+  // into some scalar code and create a nasty build vector.
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+  DebugLoc dl = N->getDebugLoc();
+  unsigned NumElts = VT.getVectorNumElements();
+  SDValue InOp = N->getOperand(0);
+  if (getTypeAction(InOp.getValueType()) == WidenVector)
+    InOp = GetWidenedVector(InOp);
+  EVT InVT = InOp.getValueType();
+  EVT InEltVT = InVT.getVectorElementType();
+
+  unsigned Opcode = N->getOpcode();
+  SmallVector<SDValue, 16> Ops(NumElts);
+  for (unsigned i=0; i < NumElts; ++i)
+    Ops[i] = DAG.getNode(Opcode, dl, EltVT,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+                                     DAG.getIntPtrConstant(i)));
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_BIT_CONVERT(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  EVT InWidenVT = InOp.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  // Check if we can convert between two legal vector types and extract.
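+  // For example, a v2i32 operand widened to v4i32 feeding a bitcast to
+  // i64: bitcast the v4i32 to v2i64 (when that type is legal) and extract
+  // element 0.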
+  unsigned InWidenSize = InWidenVT.getSizeInBits();
+  unsigned Size = VT.getSizeInBits();
+  if (InWidenSize % Size == 0 && !VT.isVector()) {
+    unsigned NewNumElts = InWidenSize / Size;
+    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts);
+    if (TLI.isTypeSynthesizable(NewVT)) {
+      SDValue BitOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, InOp);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp,
+                         DAG.getIntPtrConstant(0));
+    }
+  }
+
+  return CreateStackStoreLoad(InOp, VT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
+  // If the input vector is not legal, it is likely that we will not find a
+  // legal vector of the same size. Replace the concatenate vector with a
+  // nasty build vector.
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+  DebugLoc dl = N->getDebugLoc();
+  unsigned NumElts = VT.getVectorNumElements();
+  SmallVector<SDValue, 16> Ops(NumElts);
+
+  EVT InVT = N->getOperand(0).getValueType();
+  unsigned NumInElts = InVT.getVectorNumElements();
+
+  unsigned Idx = 0;
+  unsigned NumOperands = N->getNumOperands();
+  for (unsigned i=0; i < NumOperands; ++i) {
+    SDValue InOp = N->getOperand(i);
+    if (getTypeAction(InOp.getValueType()) == WidenVector)
+      InOp = GetWidenedVector(InOp);
+    for (unsigned j=0; j < NumInElts; ++j)
+      Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+                               DAG.getIntPtrConstant(j));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, N->getDebugLoc(),
+                     N->getValueType(0), InOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(),
+                     N->getValueType(0), InOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
+  // We have to widen the value, but we only want to store the original
+  // vector type.
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+
+  SmallVector<SDValue, 16> StChain;
+  if (ST->isTruncatingStore())
+    GenWidenVectorTruncStores(StChain, ST);
+  else
+    GenWidenVectorStores(StChain, ST);
+
+  if (StChain.size() == 1)
+    return StChain[0];
+  else
+    return DAG.getNode(ISD::TokenFactor, ST->getDebugLoc(),
+                       MVT::Other,&StChain[0],StChain.size());
+}
+
+//===----------------------------------------------------------------------===//
+// Vector Widening Utilities
+//===----------------------------------------------------------------------===//
+
+// Utility function to find the type to chop up a widened vector for
+// load/store.
+//  TLI:       Target lowering used to determine legal types.
+//  Width:     Width left to load/store.
+//  WidenVT:   The widened vector type to load to/store from.
+//  Align:     If 0, don't allow use of a wider type.
+//  WidenEx:   If Align is not 0, the additional width we may load/store
+//             beyond Width (the slack left by widening).
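+//
+// Illustrative example: with 96 bits left of a v4i32 widen type, on a target
+// where i64 is legal this returns i64 (a vector type is preferred only when
+// it is strictly wider than the best integer type, or equal to WidenVT);
+// the caller then calls again for the remaining 32 bits.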
+
+static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
+                       unsigned Width, EVT WidenVT,
+                       unsigned Align = 0, unsigned WidenEx = 0) {
+  EVT WidenEltVT = WidenVT.getVectorElementType();
+  unsigned WidenWidth = WidenVT.getSizeInBits();
+  unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
+  unsigned AlignInBits = Align*8;
+
+  // If we have one element to load/store, return it.
+  EVT RetVT = WidenEltVT;
+  if (Width == WidenEltWidth)
+    return RetVT;
+
+  // See if there is larger legal integer than the element type to load/store 
+  unsigned VT;
+  for (VT = (unsigned)MVT::LAST_INTEGER_VALUETYPE;
+       VT >= (unsigned)MVT::FIRST_INTEGER_VALUETYPE; --VT) {
+    EVT MemVT((MVT::SimpleValueType) VT);
+    unsigned MemVTWidth = MemVT.getSizeInBits();
+    if (MemVT.getSizeInBits() <= WidenEltWidth)
+      break;
+    if (TLI.isTypeSynthesizable(MemVT) && (WidenWidth % MemVTWidth) == 0 &&
+        (MemVTWidth <= Width ||
+         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+      RetVT = MemVT;
+      break;
+    }
+  }
+
+  // See if there is a larger vector type to load/store that has the same
+  // vector element type and whose width evenly divides the width of WidenVT.
+  for (VT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
+       VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) {
+    EVT MemVT = (MVT::SimpleValueType) VT;
+    unsigned MemVTWidth = MemVT.getSizeInBits();
+    if (TLI.isTypeSynthesizable(MemVT) && WidenEltVT == MemVT.getVectorElementType() &&
+        (WidenWidth % MemVTWidth) == 0 &&
+        (MemVTWidth <= Width ||
+         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+      if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT)
+        return MemVT;
+    }
+  }
+
+  return RetVT;
+}
+
+// Builds a vector type from scalar loads.
+//  VecTy: Resulting vector type.
+//  LdOps: Load operators to build a vector type from.
+//  [Start,End): The range of loads to use.
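+//
+// For example, an i64 load followed by an i32 load destined for a 128-bit
+// type: SCALAR_TO_VECTOR puts the i64 in a v2i64, the v2i64 is bitcast to
+// v4i32 when the narrower load is reached, the i32 is inserted at the
+// adjusted index, and the result is bitcast to VecTy.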
+static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
+                                     SmallVector<SDValue, 16>& LdOps,
+                                     unsigned Start, unsigned End) {
+  DebugLoc dl = LdOps[Start].getDebugLoc();
+  EVT LdTy = LdOps[Start].getValueType();
+  unsigned Width = VecTy.getSizeInBits();
+  unsigned NumElts = Width / LdTy.getSizeInBits();
+  EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), LdTy, NumElts);
+
+  unsigned Idx = 1;
+  SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT,LdOps[Start]);
+
+  for (unsigned i = Start + 1; i != End; ++i) {
+    EVT NewLdTy = LdOps[i].getValueType();
+    if (NewLdTy != LdTy) {
+      NumElts = Width / NewLdTy.getSizeInBits();
+      NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewLdTy, NumElts);
+      VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, VecOp);
+      // Readjust the insert position based on the new load type.
+      Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits();
+      LdTy = NewLdTy;
+    }
+    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
+                        DAG.getIntPtrConstant(Idx++));
+  }
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VecTy, VecOp);
+}
+
+SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain,
+                                              LoadSDNode * LD) {
+  // The strategy assumes that we can efficiently load powers of two widths.
+  // The routine chops the vector into the largest vector loads with the same
+  // element type, or scalar loads, and then recombines them into the widened
+  // vector type.
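+  //
+  // For example, a v3i32 load widened to v4i32 might become an i64 load
+  // plus an i32 load (target permitting), recombined with SCALAR_TO_VECTOR/
+  // INSERT_VECTOR_ELT and a bitcast -- or a single 128-bit load when the
+  // alignment allows reading the extra bytes.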
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
+  unsigned WidenWidth = WidenVT.getSizeInBits();
+  EVT LdVT    = LD->getMemoryVT();
+  DebugLoc dl = LD->getDebugLoc();
+  assert(LdVT.isVector() && WidenVT.isVector());
+  assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
+
+  // Load information
+  SDValue   Chain = LD->getChain();
+  SDValue   BasePtr = LD->getBasePtr();
+  int       SVOffset = LD->getSrcValueOffset();
+  unsigned  Align    = LD->getAlignment();
+  bool      isVolatile = LD->isVolatile();
+  bool      isNonTemporal = LD->isNonTemporal();
+  const Value *SV = LD->getSrcValue();
+
+  int LdWidth = LdVT.getSizeInBits();
+  int WidthDiff = WidenWidth - LdWidth;          // Difference
+  unsigned LdAlign = (isVolatile) ? 0 : Align; // Allow wider loads if not volatile.
+
+  // Find the vector type that we can load from.
+  EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
+  int NewVTWidth = NewVT.getSizeInBits();
+  SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, SV, SVOffset,
+                             isVolatile, isNonTemporal, Align);
+  LdChain.push_back(LdOp.getValue(1));
+
+  // Check if we can load the element with one instruction
+  if (LdWidth <= NewVTWidth) {
+    if (NewVT.isVector()) {
+      if (NewVT != WidenVT) {
+        assert(WidenWidth % NewVTWidth == 0);
+        unsigned NumConcat = WidenWidth / NewVTWidth;
+        SmallVector<SDValue, 16> ConcatOps(NumConcat);
+        SDValue UndefVal = DAG.getUNDEF(NewVT);
+        ConcatOps[0] = LdOp;
+        for (unsigned i = 1; i != NumConcat; ++i)
+          ConcatOps[i] = UndefVal;
+        return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &ConcatOps[0],
+                           NumConcat);
+      } else
+        return LdOp;
+    } else {
+      unsigned NumElts = WidenWidth / NewVTWidth;
+      EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
+      SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
+      return DAG.getNode(ISD::BIT_CONVERT, dl, WidenVT, VecOp);
+    }
+  }
+
+  // Load vector by using multiple loads from largest vector to scalar
+  SmallVector<SDValue, 16> LdOps;
+  LdOps.push_back(LdOp);
+
+  LdWidth -= NewVTWidth;
+  unsigned Offset = 0;
+
+  while (LdWidth > 0) {
+    unsigned Increment = NewVTWidth / 8;
+    Offset += Increment;
+    BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                          DAG.getIntPtrConstant(Increment));
+
+    if (LdWidth < NewVTWidth) {
+      // Our current type we are using is too large, find a better size
+      NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
+      NewVTWidth = NewVT.getSizeInBits();
+    }
+
+    SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, SV,
+                               SVOffset+Offset, isVolatile,
+                               isNonTemporal, MinAlign(Align, Increment));
+    LdChain.push_back(LdOp.getValue(1));
+    LdOps.push_back(LdOp);
+
+    LdWidth -= NewVTWidth;
+  }
+
+  // Build the vector from the load operations.
+  unsigned End = LdOps.size();
+  if (LdOps[0].getValueType().isVector()) {
+    // If the loads contain vectors, build the result using CONCAT_VECTORS.
+    // All of the vector loads are power-of-2 sized, and the scalar loads
+    // can be combined to make a power-of-2 vector.
+    SmallVector<SDValue, 16> ConcatOps(End);
+    int i = End - 1;
+    int Idx = End;
+    EVT LdTy = LdOps[i].getValueType();
+    // First combine the scalar loads to a vector
+    if (!LdTy.isVector())  {
+      for (--i; i >= 0; --i) {
+        LdTy = LdOps[i].getValueType();
+        if (LdTy.isVector())
+          break;
+      }
+      ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i+1, End);
+    }
+    ConcatOps[--Idx] = LdOps[i];
+    for (--i; i >= 0; --i) {
+      EVT NewLdTy = LdOps[i].getValueType();
+      if (NewLdTy != LdTy) {
+        // Create a larger vector
+        ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy,
+                                       &ConcatOps[Idx], End - Idx);
+        Idx = End - 1;
+        LdTy = NewLdTy;
+      }
+      ConcatOps[--Idx] = LdOps[i];
+    }
+
+    if (WidenWidth != LdTy.getSizeInBits()*(End - Idx)) {
+      // We need to fill the rest with undefs to build the vector
+      unsigned NumOps = WidenWidth / LdTy.getSizeInBits();
+      SmallVector<SDValue, 16> WidenOps(NumOps);
+      SDValue UndefVal = DAG.getUNDEF(LdTy);
+      unsigned i = 0;
+      for (; i != End-Idx; ++i)
+        WidenOps[i] = ConcatOps[Idx+i];
+      for (; i != NumOps; ++i)
+        WidenOps[i] = UndefVal;
+      return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &WidenOps[0],NumOps);
+    } else
+      return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
+                         &ConcatOps[Idx], End - Idx);
+  } else // All the loads are scalar loads.
+    return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End);
+}
+
+SDValue
+DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain,
+                                         LoadSDNode * LD,
+                                         ISD::LoadExtType ExtType) {
+  // For extension loads, it may not be more efficient to chop up the vector
+  // and then extend it.  Instead, we unroll the load and build a new vector.
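+  //
+  // For example, an extending load of v3i8 whose v3i32 result widens to
+  // v4i32 becomes three one-byte extending loads, each producing an i32,
+  // followed by a BUILD_VECTOR padded with undef to four elements.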
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
+  EVT LdVT    = LD->getMemoryVT();
+  DebugLoc dl = LD->getDebugLoc();
+  assert(LdVT.isVector() && WidenVT.isVector());
+
+  // Load information
+  SDValue   Chain = LD->getChain();
+  SDValue   BasePtr = LD->getBasePtr();
+  int       SVOffset = LD->getSrcValueOffset();
+  unsigned  Align    = LD->getAlignment();
+  bool      isVolatile = LD->isVolatile();
+  bool      isNonTemporal = LD->isNonTemporal();
+  const Value *SV = LD->getSrcValue();
+
+  EVT EltVT = WidenVT.getVectorElementType();
+  EVT LdEltVT = LdVT.getVectorElementType();
+  unsigned NumElts = LdVT.getVectorNumElements();
+
+  // Load each element and widen
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  unsigned Increment = LdEltVT.getSizeInBits() / 8;
+  Ops[0] = DAG.getExtLoad(ExtType, EltVT, dl, Chain, BasePtr, SV, SVOffset,
+                          LdEltVT, isVolatile, isNonTemporal, Align);
+  LdChain.push_back(Ops[0].getValue(1));
+  unsigned i = 0, Offset = Increment;
+  for (i=1; i < NumElts; ++i, Offset += Increment) {
+    SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+                                     BasePtr, DAG.getIntPtrConstant(Offset));
+    Ops[i] = DAG.getExtLoad(ExtType, EltVT, dl, Chain, NewBasePtr, SV,
+                            SVOffset + Offset, LdEltVT, isVolatile,
+                            isNonTemporal, Align);
+    LdChain.push_back(Ops[i].getValue(1));
+  }
+
+  // Fill the rest with undefs
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for (; i != WidenNumElts; ++i)
+    Ops[i] = UndefVal;
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], Ops.size());
+}
+
+
+void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
+                                            StoreSDNode *ST) {
+  // The strategy assumes that we can efficiently store powers of two widths.
+  // The routine chops the vector into the largest vector stores with the
+  // same element type, or scalar stores.
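+  //
+  // For example (assuming i64 is legal), storing v3i32 from a widened
+  // v4i32 value emits an i64 store of the first two elements followed by
+  // an i32 store of the third.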
+  SDValue  Chain = ST->getChain();
+  SDValue  BasePtr = ST->getBasePtr();
+  const    Value *SV = ST->getSrcValue();
+  int      SVOffset = ST->getSrcValueOffset();
+  unsigned Align = ST->getAlignment();
+  bool     isVolatile = ST->isVolatile();
+  bool     isNonTemporal = ST->isNonTemporal();
+  SDValue  ValOp = GetWidenedVector(ST->getValue());
+  DebugLoc dl = ST->getDebugLoc();
+
+  EVT StVT = ST->getMemoryVT();
+  unsigned StWidth = StVT.getSizeInBits();
+  EVT ValVT = ValOp.getValueType();
+  unsigned ValWidth = ValVT.getSizeInBits();
+  EVT ValEltVT = ValVT.getVectorElementType();
+  unsigned ValEltWidth = ValEltVT.getSizeInBits();
+  assert(StVT.getVectorElementType() == ValEltVT);
+
+  int Idx = 0;          // current index to store
+  unsigned Offset = 0;  // offset from base to store
+  while (StWidth != 0) {
+    // Find the largest vector type we can store with
+    EVT NewVT = FindMemType(DAG, TLI, StWidth, ValVT);
+    unsigned NewVTWidth = NewVT.getSizeInBits();
+    unsigned Increment = NewVTWidth / 8;
+    if (NewVT.isVector()) {
+      unsigned NumVTElts = NewVT.getVectorNumElements();
+      do {
+        SDValue EOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
+                                   DAG.getIntPtrConstant(Idx));
+        StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr, SV,
+                                       SVOffset + Offset, isVolatile,
+                                       isNonTemporal,
+                                       MinAlign(Align, Offset)));
+        StWidth -= NewVTWidth;
+        Offset += Increment;
+        Idx += NumVTElts;
+        BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                              DAG.getIntPtrConstant(Increment));
+      } while (StWidth != 0 && StWidth >= NewVTWidth);
+    } else {
+      // Cast the vector to the scalar type we can store
+      unsigned NumElts = ValWidth / NewVTWidth;
+      EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
+      SDValue VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, ValOp);
+      // Readjust index position based on new vector type
+      Idx = Idx * ValEltWidth / NewVTWidth;
+      do {
+        SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
+                      DAG.getIntPtrConstant(Idx++));
+        StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr, SV,
+                                       SVOffset + Offset, isVolatile,
+                                       isNonTemporal, MinAlign(Align, Offset)));
+        StWidth -= NewVTWidth;
+        Offset += Increment;
+        BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                              DAG.getIntPtrConstant(Increment));
+      } while (StWidth != 0  && StWidth >= NewVTWidth);
+      // Restore index back to be relative to the original widened element type.
+      Idx = Idx * NewVTWidth / ValEltWidth;
+    }
+  }
+}
+
+void
+DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVector<SDValue, 16>& StChain,
+                                            StoreSDNode *ST) {
+  // For truncating stores, it may not be more efficient to truncate the
+  // vector and then store it.  Instead, we extract each element and then
+  // store it.
+  SDValue  Chain = ST->getChain();
+  SDValue  BasePtr = ST->getBasePtr();
+  const    Value *SV = ST->getSrcValue();
+  int      SVOffset = ST->getSrcValueOffset();
+  unsigned Align = ST->getAlignment();
+  bool     isVolatile = ST->isVolatile();
+  bool     isNonTemporal = ST->isNonTemporal();
+  SDValue  ValOp = GetWidenedVector(ST->getValue());
+  DebugLoc dl = ST->getDebugLoc();
+  
+  EVT StVT = ST->getMemoryVT();
+  EVT ValVT = ValOp.getValueType();
+
+  // It must be the case that the widened vector type is bigger than the
+  // type we need to store.
+  assert(StVT.isVector() && ValOp.getValueType().isVector());
+  assert(StVT.bitsLT(ValOp.getValueType()));
+
+  // For truncating stores, we cannot play the trick of chopping legal
+  // vector types and bitcasting to the right type.  Instead, we unroll
+  // the store.
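+  //
+  // For example, a truncating store of a widened v4i32 value to memory type
+  // v4i8 becomes four element extracts, each stored with a one-byte
+  // truncating store at consecutive offsets.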
+  EVT StEltVT  = StVT.getVectorElementType();
+  EVT ValEltVT = ValVT.getVectorElementType();
+  // Elements are laid out in memory with the stride of the *stored* element
+  // type; the value element type is only the in-register width.
+  unsigned Increment = StEltVT.getSizeInBits() / 8;
+  unsigned NumElts = StVT.getVectorNumElements();
+  SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+                            DAG.getIntPtrConstant(0));
+  StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr, SV,
+                                      SVOffset, StEltVT,
+                                      isVolatile, isNonTemporal, Align));
+  unsigned Offset = Increment;
+  for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
+    SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+                                     BasePtr, DAG.getIntPtrConstant(Offset));
+    SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+                              DAG.getIntPtrConstant(i));
+    StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr, SV,
+                                        SVOffset + Offset, StEltVT,
+                                        isVolatile, isNonTemporal,
+                                        MinAlign(Align, Offset)));
+  }
+}
+
+/// Modifies a vector input (widens or narrows) to a vector of NVT.  The
+/// input vector must have the same element type as NVT.
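+/// For example, a v2f32 input with NVT v4f32 is concatenated with an undef
+/// v2f32; an input wider than NVT is narrowed, either with EXTRACT_SUBVECTOR
+/// or by extracting elements and rebuilding.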
+SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) {
+  // Note that InOp might have been widened so it might already have
+  // the right width or it might need to be narrowed.
+  EVT InVT = InOp.getValueType();
+  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
+         "input and widen element type must match");
+  DebugLoc dl = InOp.getDebugLoc();
+
+  // Check if InOp already has the right width.
+  if (InVT == NVT)
+    return InOp;
+
+  unsigned InNumElts = InVT.getVectorNumElements();
+  unsigned WidenNumElts = NVT.getVectorNumElements();
+  if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) {
+    unsigned NumConcat = WidenNumElts / InNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue UndefVal = DAG.getUNDEF(InVT);
+    Ops[0] = InOp;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = UndefVal;
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, &Ops[0], NumConcat);
+  }
+
+  if (WidenNumElts < InNumElts && InNumElts % WidenNumElts)
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
+                       DAG.getIntPtrConstant(0));
+
+  // Fall back to extract and build.
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  EVT EltVT = NVT.getVectorElementType();
+  unsigned MinNumElts = std::min(WidenNumElts, InNumElts);
+  unsigned Idx;
+  for (Idx = 0; Idx < MinNumElts; ++Idx)
+    Ops[Idx] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+                           DAG.getIntPtrConstant(Idx));
+
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for ( ; Idx < WidenNumElts; ++Idx)
+    Ops[Idx] = UndefVal;
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &Ops[0], WidenNumElts);
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/Makefile b/src/LLVM/lib/CodeGen/SelectionDAG/Makefile
new file mode 100644
index 0000000..ea716fd
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/Makefile
@@ -0,0 +1,13 @@
+##===- lib/CodeGen/SelectionDAG/Makefile -------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMSelectionDAG
+
+include $(LEVEL)/Makefile.common
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/src/LLVM/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
new file mode 100644
index 0000000..ac2d338
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
@@ -0,0 +1,114 @@
+//===-- llvm/CodeGen/SDNodeDbgValue.h - SelectionDAG dbg_value --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SDDbgValue class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SDNODEDBGVALUE_H
+#define LLVM_CODEGEN_SDNODEDBGVALUE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/System/DataTypes.h"
+
+namespace llvm {
+
+class MDNode;
+class SDNode;
+class Value;
+
+/// SDDbgValue - Holds the information from a dbg_value node through SDISel.
+/// We do not use SDValue here to avoid including its header.
+
+class SDDbgValue {
+public:
+  enum DbgValueKind {
+    SDNODE = 0,             // value is the result of an expression
+    CONST = 1,              // value is a constant
+    FRAMEIX = 2             // value is contents of a stack location
+  };
+private:
+  enum DbgValueKind kind;
+  union {
+    struct {
+      SDNode *Node;         // valid for expressions
+      unsigned ResNo;       // valid for expressions
+    } s;
+    const Value *Const;     // valid for constants
+    unsigned FrameIx;       // valid for stack objects
+  } u;
+  MDNode *mdPtr;
+  uint64_t Offset;
+  DebugLoc DL;
+  unsigned Order;
+  bool Invalid;
+public:
+  // Constructor for non-constants.
+  SDDbgValue(MDNode *mdP, SDNode *N, unsigned R, uint64_t off, DebugLoc dl,
+             unsigned O) : mdPtr(mdP), Offset(off), DL(dl), Order(O),
+                           Invalid(false) {
+    kind = SDNODE;
+    u.s.Node = N;
+    u.s.ResNo = R;
+  }
+
+  // Constructor for constants.
+  SDDbgValue(MDNode *mdP, const Value *C, uint64_t off, DebugLoc dl,
+             unsigned O) : 
+    mdPtr(mdP), Offset(off), DL(dl), Order(O), Invalid(false) {
+    kind = CONST;
+    u.Const = C;
+  }
+
+  // Constructor for frame indices.
+  SDDbgValue(MDNode *mdP, unsigned FI, uint64_t off, DebugLoc dl, unsigned O) : 
+    mdPtr(mdP), Offset(off), DL(dl), Order(O), Invalid(false) {
+    kind = FRAMEIX;
+    u.FrameIx = FI;
+  }
+
+  // Returns the kind.
+  DbgValueKind getKind() { return kind; }
+
+  // Returns the MDNode pointer.
+  MDNode *getMDPtr() { return mdPtr; }
+
+  // Returns the SDNode* for a register ref
+  SDNode *getSDNode() { assert (kind==SDNODE); return u.s.Node; }
+
+  // Returns the ResNo for a register ref
+  unsigned getResNo() { assert (kind==SDNODE); return u.s.ResNo; }
+
+  // Returns the Value* for a constant
+  const Value *getConst() { assert (kind==CONST); return u.Const; }
+
+  // Returns the FrameIx for a stack object
+  unsigned getFrameIx() { assert (kind==FRAMEIX); return u.FrameIx; }
+
+  // Returns the offset.
+  uint64_t getOffset() { return Offset; }
+
+  // Returns the DebugLoc.
+  DebugLoc getDebugLoc() { return DL; }
+
+  // Returns the SDNodeOrder.  This is the order of the preceding node in the
+  // input.
+  unsigned getOrder() { return Order; }
+
+  // setIsInvalidated / isInvalidated - Setter / getter of the "Invalidated"
+  // property. A SDDbgValue is invalid if the SDNode that produces the value is
+  // deleted.
+  void setIsInvalidated() { Invalid = true; }
+  bool isInvalidated() { return Invalid; }
+};
+
+} // end llvm namespace
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/SDNodeOrdering.h b/src/LLVM/lib/CodeGen/SelectionDAG/SDNodeOrdering.h
new file mode 100644
index 0000000..f88b26d
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/SDNodeOrdering.h
@@ -0,0 +1,54 @@
+//===-- llvm/CodeGen/SDNodeOrdering.h - SDNode Ordering ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SDNodeOrdering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SDNODEORDERING_H
+#define LLVM_CODEGEN_SDNODEORDERING_H
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+class SDNode;
+
+/// SDNodeOrdering - Maps a unique (monotonically increasing) value to each
+/// SDNode that roughly corresponds to the ordering of the original LLVM
+/// instruction. This is used for turning off scheduling, because we'll forgo
+/// the normal scheduling algorithms and output the instructions according to
+/// this ordering.
+class SDNodeOrdering {
+  DenseMap<const SDNode*, unsigned> OrderMap;
+
+  void operator=(const SDNodeOrdering&);   // Do not implement.
+  SDNodeOrdering(const SDNodeOrdering&);   // Do not implement.
+public:
+  SDNodeOrdering() {}
+
+  void add(const SDNode *Node, unsigned O) {
+    OrderMap[Node] = O;
+  }
+  void remove(const SDNode *Node) {
+    DenseMap<const SDNode*, unsigned>::iterator Itr = OrderMap.find(Node);
+    if (Itr != OrderMap.end())
+      OrderMap.erase(Itr);
+  }
+  void clear() {
+    OrderMap.clear();
+  }
+  unsigned getOrder(const SDNode *Node) {
+    return OrderMap[Node];
+  }
+};
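+
+// Typical use (illustrative): record an order as each node is created, e.g.
+// Ordering.add(Node, Order), and read it back with Ordering.getOrder(Node)
+// when emitting instructions in source order.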
+
+} // end llvm namespace
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
new file mode 100644
index 0000000..3b86c32
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -0,0 +1,604 @@
+//===----- ScheduleDAGFast.cpp - Fast poor list scheduler -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a fast scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(NumUnfolds,    "Number of nodes unfolded");
+STATISTIC(NumDups,       "Number of duplicated nodes");
+STATISTIC(NumPRCopies,   "Number of physical copies");
+
+static RegisterScheduler
+  fastDAGScheduler("fast", "Fast suboptimal list scheduling",
+                   createFastDAGScheduler);
+
+namespace {
+  /// FastPriorityQueue - A degenerate priority queue that considers
+  /// all nodes to have the same priority.
+  ///
+  struct FastPriorityQueue {
+    SmallVector<SUnit *, 16> Queue;
+
+    bool empty() const { return Queue.empty(); }
+    
+    void push(SUnit *U) {
+      Queue.push_back(U);
+    }
+
+    SUnit *pop() {
+      if (empty()) return NULL;
+      SUnit *V = Queue.back();
+      Queue.pop_back();
+      return V;
+    }
+  };
+
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGFast - The actual "fast" list scheduler implementation.
+///
+class ScheduleDAGFast : public ScheduleDAGSDNodes {
+private:
+  /// AvailableQueue - The priority queue to use for the available SUnits.
+  FastPriorityQueue AvailableQueue;
+
+  /// LiveRegDefs - A set of physical registers and their definition
+  /// that are "live". These nodes must be scheduled before any other nodes
+  /// that modify the registers can be scheduled.
+  unsigned NumLiveRegs;
+  std::vector<SUnit*> LiveRegDefs;
+  std::vector<unsigned> LiveRegCycles;
+
+public:
+  ScheduleDAGFast(MachineFunction &mf)
+    : ScheduleDAGSDNodes(mf) {}
+
+  void Schedule();
+
+  /// AddPred - adds a predecessor edge to SUnit SU.
+  void AddPred(SUnit *SU, const SDep &D) {
+    SU->addPred(D);
+  }
+
+  /// RemovePred - removes a predecessor edge from SUnit SU.
+  void RemovePred(SUnit *SU, const SDep &D) {
+    SU->removePred(D);
+  }
+
+private:
+  void ReleasePred(SUnit *SU, SDep *PredEdge);
+  void ReleasePredecessors(SUnit *SU, unsigned CurCycle);
+  void ScheduleNodeBottomUp(SUnit*, unsigned);
+  SUnit *CopyAndMoveSuccessors(SUnit*);
+  void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
+                                const TargetRegisterClass*,
+                                const TargetRegisterClass*,
+                                SmallVector<SUnit*, 2>&);
+  bool DelayForLiveRegsBottomUp(SUnit*, SmallVector<unsigned, 4>&);
+  void ListScheduleBottomUp();
+
+  /// ForceUnitLatencies - The fast scheduler doesn't care about real latencies.
+  bool ForceUnitLatencies() const { return true; }
+};
+}  // end anonymous namespace
+
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGFast::Schedule() {
+  DEBUG(dbgs() << "********** List Scheduling **********\n");
+
+  NumLiveRegs = 0;
+  LiveRegDefs.resize(TRI->getNumRegs(), NULL);  
+  LiveRegCycles.resize(TRI->getNumRegs(), 0);
+
+  // Build the scheduling graph.
+  BuildSchedGraph(NULL);
+
+  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          SUnits[su].dumpAll(this));
+
+  // Execute the actual scheduling loop.
+  ListScheduleBottomUp();
+}
+
+//===----------------------------------------------------------------------===//
+//  Bottom-Up Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. Add it to
+/// the AvailableQueue if the count reaches zero.
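+/// For example (a sketch): if PredSU has three successors in the graph, the
+/// third call to ReleasePred for it drops NumSuccsLeft from 1 to 0, and only
+/// then is PredSU pushed onto the AvailableQueue.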
+void ScheduleDAGFast::ReleasePred(SUnit *SU, SDep *PredEdge) {
+  SUnit *PredSU = PredEdge->getSUnit();
+
+#ifndef NDEBUG
+  if (PredSU->NumSuccsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    PredSU->dump(this);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(0);
+  }
+#endif
+  --PredSU->NumSuccsLeft;
+
+  // If all the node's successors are scheduled, this node is ready
+  // to be scheduled. Ignore the special EntrySU node.
+  if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) {
+    PredSU->isAvailable = true;
+    AvailableQueue.push(PredSU);
+  }
+}
+
+void ScheduleDAGFast::ReleasePredecessors(SUnit *SU, unsigned CurCycle) {
+  // Bottom up: release predecessors
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    ReleasePred(SU, &*I);
+    if (I->isAssignedRegDep()) {
+      // This is a physical register dependency and it's impossible or
+      // expensive to copy the register. Make sure nothing that can 
+      // clobber the register is scheduled between the predecessor and
+      // this node.
+      if (!LiveRegDefs[I->getReg()]) {
+        ++NumLiveRegs;
+        LiveRegDefs[I->getReg()] = I->getSUnit();
+        LiveRegCycles[I->getReg()] = CurCycle;
+      }
+    }
+  }
+}
+
+/// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending
+/// count of its predecessors. If a predecessor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGFast::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) {
+  DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+  DEBUG(SU->dump(this));
+
+  assert(CurCycle >= SU->getHeight() && "Node scheduled below its height!");
+  SU->setHeightToAtLeast(CurCycle);
+  Sequence.push_back(SU);
+
+  ReleasePredecessors(SU, CurCycle);
+
+  // Release all the implicit physical register defs that are live.
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isAssignedRegDep()) {
+      if (LiveRegCycles[I->getReg()] == I->getSUnit()->getHeight()) {
+        assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+        assert(LiveRegDefs[I->getReg()] == SU &&
+               "Physical register dependency violated?");
+        --NumLiveRegs;
+        LiveRegDefs[I->getReg()] = NULL;
+        LiveRegCycles[I->getReg()] = 0;
+      }
+    }
+  }
+
+  SU->isScheduled = true;
+}
+
+/// CopyAndMoveSuccessors - Clone the specified node and move its scheduled
+/// successors to the newly created node.
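+/// Informally: if the node folds a load (one of its results is a chain,
+/// MVT::Other), the load is first unfolded into a separate load node plus
+/// the remaining operation; otherwise, or if the unfolded node still has
+/// unscheduled successors, the node is cloned and its already-scheduled
+/// successors are rewired to the clone.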
+SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) {
+  SDNode *N = SU->getNode();
+  if (!N)
+    return NULL;
+
+  if (N->getFlaggedNode())
+    return NULL;
+
+  SUnit *NewSU;
+  bool TryUnfold = false;
+  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT == MVT::Flag)
+      return NULL;
+    else if (VT == MVT::Other)
+      TryUnfold = true;
+  }
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    const SDValue &Op = N->getOperand(i);
+    EVT VT = Op.getNode()->getValueType(Op.getResNo());
+    if (VT == MVT::Flag)
+      return NULL;
+  }
+
+  if (TryUnfold) {
+    SmallVector<SDNode*, 2> NewNodes;
+    if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
+      return NULL;
+
+    DEBUG(dbgs() << "Unfolding SU # " << SU->NodeNum << "\n");
+    assert(NewNodes.size() == 2 && "Expected a load folding node!");
+
+    N = NewNodes[1];
+    SDNode *LoadNode = NewNodes[0];
+    unsigned NumVals = N->getNumValues();
+    unsigned OldNumVals = SU->getNode()->getNumValues();
+    for (unsigned i = 0; i != NumVals; ++i)
+      DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
+    DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1),
+                                   SDValue(LoadNode, 1));
+
+    NewSU = NewSUnit(N);
+    assert(N->getNodeId() == -1 && "Node already inserted!");
+    N->setNodeId(NewSU->NodeNum);
+      
+    const TargetInstrDesc &TID = TII->get(N->getMachineOpcode());
+    for (unsigned i = 0; i != TID.getNumOperands(); ++i) {
+      if (TID.getOperandConstraint(i, TOI::TIED_TO) != -1) {
+        NewSU->isTwoAddress = true;
+        break;
+      }
+    }
+    if (TID.isCommutable())
+      NewSU->isCommutable = true;
+
+    // LoadNode may already exist. This can happen when there is another
+    // load from the same location producing the same type of value
+    // but with different alignment or volatility.
+    bool isNewLoad = true;
+    SUnit *LoadSU;
+    if (LoadNode->getNodeId() != -1) {
+      LoadSU = &SUnits[LoadNode->getNodeId()];
+      isNewLoad = false;
+    } else {
+      LoadSU = NewSUnit(LoadNode);
+      LoadNode->setNodeId(LoadSU->NodeNum);
+    }
+
+    SDep ChainPred;
+    SmallVector<SDep, 4> ChainSuccs;
+    SmallVector<SDep, 4> LoadPreds;
+    SmallVector<SDep, 4> NodePreds;
+    SmallVector<SDep, 4> NodeSuccs;
+    for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+         I != E; ++I) {
+      if (I->isCtrl())
+        ChainPred = *I;
+      else if (I->getSUnit()->getNode() &&
+               I->getSUnit()->getNode()->isOperandOf(LoadNode))
+        LoadPreds.push_back(*I);
+      else
+        NodePreds.push_back(*I);
+    }
+    for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+         I != E; ++I) {
+      if (I->isCtrl())
+        ChainSuccs.push_back(*I);
+      else
+        NodeSuccs.push_back(*I);
+    }
+
+    if (ChainPred.getSUnit()) {
+      RemovePred(SU, ChainPred);
+      if (isNewLoad)
+        AddPred(LoadSU, ChainPred);
+    }
+    for (unsigned i = 0, e = LoadPreds.size(); i != e; ++i) {
+      const SDep &Pred = LoadPreds[i];
+      RemovePred(SU, Pred);
+      if (isNewLoad) {
+        AddPred(LoadSU, Pred);
+      }
+    }
+    for (unsigned i = 0, e = NodePreds.size(); i != e; ++i) {
+      const SDep &Pred = NodePreds[i];
+      RemovePred(SU, Pred);
+      AddPred(NewSU, Pred);
+    }
+    for (unsigned i = 0, e = NodeSuccs.size(); i != e; ++i) {
+      SDep D = NodeSuccs[i];
+      SUnit *SuccDep = D.getSUnit();
+      D.setSUnit(SU);
+      RemovePred(SuccDep, D);
+      D.setSUnit(NewSU);
+      AddPred(SuccDep, D);
+    }
+    for (unsigned i = 0, e = ChainSuccs.size(); i != e; ++i) {
+      SDep D = ChainSuccs[i];
+      SUnit *SuccDep = D.getSUnit();
+      D.setSUnit(SU);
+      RemovePred(SuccDep, D);
+      if (isNewLoad) {
+        D.setSUnit(LoadSU);
+        AddPred(SuccDep, D);
+      }
+    } 
+    if (isNewLoad) {
+      AddPred(NewSU, SDep(LoadSU, SDep::Order, LoadSU->Latency));
+    }
+
+    ++NumUnfolds;
+
+    if (NewSU->NumSuccsLeft == 0) {
+      NewSU->isAvailable = true;
+      return NewSU;
+    }
+    SU = NewSU;
+  }
+
+  DEBUG(dbgs() << "Duplicating SU # " << SU->NodeNum << "\n");
+  NewSU = Clone(SU);
+
+  // New SUnit has the exact same predecessors.
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I)
+    if (!I->isArtificial())
+      AddPred(NewSU, *I);
+
+  // Only copy scheduled successors. Cut them from old node's successor
+  // list and move them over.
+  SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isArtificial())
+      continue;
+    SUnit *SuccSU = I->getSUnit();
+    if (SuccSU->isScheduled) {
+      SDep D = *I;
+      D.setSUnit(NewSU);
+      AddPred(SuccSU, D);
+      D.setSUnit(SU);
+      DelDeps.push_back(std::make_pair(SuccSU, D));
+    }
+  }
+  for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
+    RemovePred(DelDeps[i].first, DelDeps[i].second);
+
+  ++NumDups;
+  return NewSU;
+}
+
+/// InsertCopiesAndMoveSuccs - Insert register copies and move all
+/// scheduled successors of the given SUnit to the last copy.
+void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
+                                              const TargetRegisterClass *DestRC,
+                                              const TargetRegisterClass *SrcRC,
+                                               SmallVector<SUnit*, 2> &Copies) {
+  SUnit *CopyFromSU = NewSUnit(static_cast<SDNode *>(NULL));
+  CopyFromSU->CopySrcRC = SrcRC;
+  CopyFromSU->CopyDstRC = DestRC;
+
+  SUnit *CopyToSU = NewSUnit(static_cast<SDNode *>(NULL));
+  CopyToSU->CopySrcRC = DestRC;
+  CopyToSU->CopyDstRC = SrcRC;
+
+  // Only copy scheduled successors. Cut them from old node's successor
+  // list and move them over.
+  SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isArtificial())
+      continue;
+    SUnit *SuccSU = I->getSUnit();
+    if (SuccSU->isScheduled) {
+      SDep D = *I;
+      D.setSUnit(CopyToSU);
+      AddPred(SuccSU, D);
+      DelDeps.push_back(std::make_pair(SuccSU, *I));
+    }
+  }
+  for (unsigned i = 0, e = DelDeps.size(); i != e; ++i) {
+    RemovePred(DelDeps[i].first, DelDeps[i].second);
+  }
+
+  AddPred(CopyFromSU, SDep(SU, SDep::Data, SU->Latency, Reg));
+  AddPred(CopyToSU, SDep(CopyFromSU, SDep::Data, CopyFromSU->Latency, 0));
+
+  Copies.push_back(CopyFromSU);
+  Copies.push_back(CopyToSU);
+
+  ++NumPRCopies;
+}
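+// The net effect (a sketch): the value live in physical register Reg is
+// copied out through CopyFromSU into DestRC and back through CopyToSU into
+// SrcRC, and every already-scheduled consumer is rewired to read CopyToSU's
+// result, so the physical register no longer pins those consumers.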
+
+/// getPhysicalRegisterVT - Returns the ValueType of the physical register
+/// definition of the specified node.
+/// FIXME: Move to SelectionDAG?
+static EVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
+                                 const TargetInstrInfo *TII) {
+  const TargetInstrDesc &TID = TII->get(N->getMachineOpcode());
+  assert(TID.ImplicitDefs && "Physical reg def must be in implicit def list!");
+  unsigned NumRes = TID.getNumDefs();
+  for (const unsigned *ImpDef = TID.getImplicitDefs(); *ImpDef; ++ImpDef) {
+    if (Reg == *ImpDef)
+      break;
+    ++NumRes;
+  }
+  return N->getValueType(NumRes);
+}
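+// Worked example (hypothetical instruction): for a machine opcode with one
+// explicit result (NumRes starts at 1) and implicit defs { R1, R2 }, asking
+// for R2 walks past R1, leaving NumRes == 2, so the function returns
+// N->getValueType(2).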
+
+/// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay
+/// scheduling of the given node to satisfy live physical register dependencies.
+/// If the specified node is the last one available to schedule, the caller
+/// does whatever is necessary (i.e. cloning or inserting copies) to make
+/// scheduling it possible.
+bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU,
+                                               SmallVector<unsigned, 4> &LRegs){
+  if (NumLiveRegs == 0)
+    return false;
+
+  SmallSet<unsigned, 4> RegAdded;
+  // If this node would clobber any "live" register, then it's not ready.
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isAssignedRegDep()) {
+      unsigned Reg = I->getReg();
+      if (LiveRegDefs[Reg] && LiveRegDefs[Reg] != I->getSUnit()) {
+        if (RegAdded.insert(Reg))
+          LRegs.push_back(Reg);
+      }
+      for (const unsigned *Alias = TRI->getAliasSet(Reg);
+           *Alias; ++Alias)
+        if (LiveRegDefs[*Alias] && LiveRegDefs[*Alias] != I->getSUnit()) {
+          if (RegAdded.insert(*Alias))
+            LRegs.push_back(*Alias);
+        }
+    }
+  }
+
+  for (SDNode *Node = SU->getNode(); Node; Node = Node->getFlaggedNode()) {
+    if (!Node->isMachineOpcode())
+      continue;
+    const TargetInstrDesc &TID = TII->get(Node->getMachineOpcode());
+    if (!TID.ImplicitDefs)
+      continue;
+    for (const unsigned *Reg = TID.ImplicitDefs; *Reg; ++Reg) {
+      if (LiveRegDefs[*Reg] && LiveRegDefs[*Reg] != SU) {
+        if (RegAdded.insert(*Reg))
+          LRegs.push_back(*Reg);
+      }
+      for (const unsigned *Alias = TRI->getAliasSet(*Reg);
+           *Alias; ++Alias)
+        if (LiveRegDefs[*Alias] && LiveRegDefs[*Alias] != SU) {
+          if (RegAdded.insert(*Alias))
+            LRegs.push_back(*Alias);
+        }
+    }
+  }
+  return !LRegs.empty();
+}
+
+
+/// ListScheduleBottomUp - The main loop of list scheduling for bottom-up
+/// schedulers.
+void ScheduleDAGFast::ListScheduleBottomUp() {
+  unsigned CurCycle = 0;
+
+  // Release any predecessors of the special Exit node.
+  ReleasePredecessors(&ExitSU, CurCycle);
+
+  // Add root to Available queue.
+  if (!SUnits.empty()) {
+    SUnit *RootSU = &SUnits[DAG->getRoot().getNode()->getNodeId()];
+    assert(RootSU->Succs.empty() && "Graph root shouldn't have successors!");
+    RootSU->isAvailable = true;
+    AvailableQueue.push(RootSU);
+  }
+
+  // While Available queue is not empty, grab the node with the highest
+  // priority. If it is not ready put it back.  Schedule the node.
+  SmallVector<SUnit*, 4> NotReady;
+  DenseMap<SUnit*, SmallVector<unsigned, 4> > LRegsMap;
+  Sequence.reserve(SUnits.size());
+  while (!AvailableQueue.empty()) {
+    bool Delayed = false;
+    LRegsMap.clear();
+    SUnit *CurSU = AvailableQueue.pop();
+    while (CurSU) {
+      SmallVector<unsigned, 4> LRegs;
+      if (!DelayForLiveRegsBottomUp(CurSU, LRegs))
+        break;
+      Delayed = true;
+      LRegsMap.insert(std::make_pair(CurSU, LRegs));
+
+      CurSU->isPending = true;  // This SU is not in AvailableQueue right now.
+      NotReady.push_back(CurSU);
+      CurSU = AvailableQueue.pop();
+    }
+
+    // All candidates are delayed due to live physical reg dependencies.
+    // Try code duplication or inserting cross class copies
+    // to resolve it.
+    if (Delayed && !CurSU) {
+      // Try duplicating the nodes that produce these
+      // "expensive to copy" values to break the dependency. In case even
+      // that doesn't work, insert cross class copies.
+      SUnit *TrySU = NotReady[0];
+      SmallVector<unsigned, 4> &LRegs = LRegsMap[TrySU];
+      assert(LRegs.size() == 1 && "Can't handle this yet!");
+      unsigned Reg = LRegs[0];
+      SUnit *LRDef = LiveRegDefs[Reg];
+      EVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
+      const TargetRegisterClass *RC =
+        TRI->getMinimalPhysRegClass(Reg, VT);
+      const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC);
+
+      // If the cross copy register class is null, then it must be possible
+      // to copy the value directly. Do not try to duplicate the def.
+      SUnit *NewDef = 0;
+      if (DestRC)
+        NewDef = CopyAndMoveSuccessors(LRDef);
+      else
+        DestRC = RC;
+      if (!NewDef) {
+        // Issue copies, these can be expensive cross register class copies.
+        SmallVector<SUnit*, 2> Copies;
+        InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
+        DEBUG(dbgs() << "Adding an edge from SU # " << TrySU->NodeNum
+                     << " to SU #" << Copies.front()->NodeNum << "\n");
+        AddPred(TrySU, SDep(Copies.front(), SDep::Order, /*Latency=*/1,
+                            /*Reg=*/0, /*isNormalMemory=*/false,
+                            /*isMustAlias=*/false, /*isArtificial=*/true));
+        NewDef = Copies.back();
+      }
+
+      DEBUG(dbgs() << "Adding an edge from SU # " << NewDef->NodeNum
+                   << " to SU #" << TrySU->NodeNum << "\n");
+      LiveRegDefs[Reg] = NewDef;
+      AddPred(NewDef, SDep(TrySU, SDep::Order, /*Latency=*/1,
+                           /*Reg=*/0, /*isNormalMemory=*/false,
+                           /*isMustAlias=*/false, /*isArtificial=*/true));
+      TrySU->isAvailable = false;
+      CurSU = NewDef;
+
+      if (!CurSU) {
+        llvm_unreachable("Unable to resolve live physical register dependencies!");
+      }
+    }
+
+    // Add the nodes that aren't ready back onto the available list.
+    for (unsigned i = 0, e = NotReady.size(); i != e; ++i) {
+      NotReady[i]->isPending = false;
+      // May no longer be available due to backtracking.
+      if (NotReady[i]->isAvailable)
+        AvailableQueue.push(NotReady[i]);
+    }
+    NotReady.clear();
+
+    if (CurSU)
+      ScheduleNodeBottomUp(CurSU, CurCycle);
+    ++CurCycle;
+  }
+
+  // Reverse the order since it is bottom up.
+  std::reverse(Sequence.begin(), Sequence.end());
+
+#ifndef NDEBUG
+  VerifySchedule(/*isBottomUp=*/true);
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+llvm::ScheduleDAGSDNodes *
+llvm::createFastDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+  return new ScheduleDAGFast(*IS->MF);
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGList.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGList.cpp
new file mode 100644
index 0000000..56f5ded
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGList.cpp
@@ -0,0 +1,268 @@
+//===---- ScheduleDAGList.cpp - Implement a list scheduler for isel DAG ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a top-down list scheduler, using standard algorithms.
+// The basic approach uses a priority queue of available nodes to schedule.
+// One at a time, nodes are taken from the priority queue (thus in priority
+// order), checked for legality to schedule, and emitted if legal.
+//
+// Nodes may not be legal to schedule either due to structural hazards (e.g.
+// pipeline or resource constraints) or because an input to the instruction has
+// not completed execution.
+//
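+// In outline (an informal sketch of the ListScheduleTopDown loop below):
+//
+//   while (Available or Pending is nonempty) {
+//     move Pending nodes whose depth equals CurCycle to Available;
+//     pop Available until a node with no hazard is found; schedule it;
+//     if none exists, advance a cycle (a stall), or emit a noop when the
+//       hazard recognizer reports a noop hazard;
+//   }
+//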
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+#include <climits>
+using namespace llvm;
+
+STATISTIC(NumNoops , "Number of noops inserted");
+STATISTIC(NumStalls, "Number of pipeline stalls");
+
+static RegisterScheduler
+  tdListDAGScheduler("list-td", "Top-down list scheduler",
+                     createTDListDAGScheduler);
+   
+namespace {
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGList - The actual list scheduler implementation.  This supports
+/// top-down scheduling.
+///
+class ScheduleDAGList : public ScheduleDAGSDNodes {
+private:
+  /// AvailableQueue - The priority queue to use for the available SUnits.
+  ///
+  SchedulingPriorityQueue *AvailableQueue;
+  
+  /// PendingQueue - This contains all of the instructions whose operands have
+  /// been issued, but their results are not ready yet (due to the latency of
+  /// the operation).  Once the operands become available, the instruction is
+  /// added to the AvailableQueue.
+  std::vector<SUnit*> PendingQueue;
+
+  /// HazardRec - The hazard recognizer to use.
+  ScheduleHazardRecognizer *HazardRec;
+
+public:
+  ScheduleDAGList(MachineFunction &mf,
+                  SchedulingPriorityQueue *availqueue,
+                  ScheduleHazardRecognizer *HR)
+    : ScheduleDAGSDNodes(mf),
+      AvailableQueue(availqueue), HazardRec(HR) {
+    }
+
+  ~ScheduleDAGList() {
+    delete HazardRec;
+    delete AvailableQueue;
+  }
+
+  void Schedule();
+
+private:
+  void ReleaseSucc(SUnit *SU, const SDep &D);
+  void ReleaseSuccessors(SUnit *SU);
+  void ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle);
+  void ListScheduleTopDown();
+};
+}  // end anonymous namespace
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGList::Schedule() {
+  DEBUG(dbgs() << "********** List Scheduling **********\n");
+  
+  // Build the scheduling graph.
+  BuildSchedGraph(NULL);
+
+  AvailableQueue->initNodes(SUnits);
+  
+  ListScheduleTopDown();
+  
+  AvailableQueue->releaseState();
+}
+
+//===----------------------------------------------------------------------===//
+//  Top-Down Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. Add it to
+/// the PendingQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGList::ReleaseSucc(SUnit *SU, const SDep &D) {
+  SUnit *SuccSU = D.getSUnit();
+
+#ifndef NDEBUG
+  if (SuccSU->NumPredsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    SuccSU->dump(this);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(0);
+  }
+#endif
+  --SuccSU->NumPredsLeft;
+
+  SuccSU->setDepthToAtLeast(SU->getDepth() + D.getLatency());
+  
+  // If all the node's predecessors are scheduled, this node is ready
+  // to be scheduled. Ignore the special ExitSU node.
+  if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
+    PendingQueue.push_back(SuccSU);
+}
+
+void ScheduleDAGList::ReleaseSuccessors(SUnit *SU) {
+  // Top down: release successors.
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    assert(!I->isAssignedRegDep() &&
+           "The list-td scheduler doesn't yet support physreg dependencies!");
+
+    ReleaseSucc(SU, *I);
+  }
+}
+
+/// ScheduleNodeTopDown - Add the node to the schedule. Decrement the pending
+/// count of its successors. If a successor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
+  DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+  DEBUG(SU->dump(this));
+  
+  Sequence.push_back(SU);
+  assert(CurCycle >= SU->getDepth() && "Node scheduled above its depth!");
+  SU->setDepthToAtLeast(CurCycle);
+
+  ReleaseSuccessors(SU);
+  SU->isScheduled = true;
+  AvailableQueue->ScheduledNode(SU);
+}
+
+/// ListScheduleTopDown - The main loop of list scheduling for top-down
+/// schedulers.
+void ScheduleDAGList::ListScheduleTopDown() {
+  unsigned CurCycle = 0;
+
+  // Release any successors of the special Entry node.
+  ReleaseSuccessors(&EntrySU);
+
+  // Add all leaves to the Available queue.
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    // It is available if it has no predecessors.
+    if (SUnits[i].Preds.empty()) {
+      AvailableQueue->push(&SUnits[i]);
+      SUnits[i].isAvailable = true;
+    }
+  }
+  
+  // While Available queue is not empty, grab the node with the highest
+  // priority. If it is not ready put it back.  Schedule the node.
+  std::vector<SUnit*> NotReady;
+  Sequence.reserve(SUnits.size());
+  while (!AvailableQueue->empty() || !PendingQueue.empty()) {
+    // Check to see if any of the pending instructions are ready to issue.  If
+    // so, add them to the available queue.
+    for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
+      if (PendingQueue[i]->getDepth() == CurCycle) {
+        AvailableQueue->push(PendingQueue[i]);
+        PendingQueue[i]->isAvailable = true;
+        PendingQueue[i] = PendingQueue.back();
+        PendingQueue.pop_back();
+        --i; --e;
+      } else {
+        assert(PendingQueue[i]->getDepth() > CurCycle && "Negative latency?");
+      }
+    }
+    
+    // If there are no instructions available, don't try to issue anything, and
+    // don't advance the hazard recognizer.
+    if (AvailableQueue->empty()) {
+      ++CurCycle;
+      continue;
+    }
+
+    SUnit *FoundSUnit = 0;
+    
+    bool HasNoopHazards = false;
+    while (!AvailableQueue->empty()) {
+      SUnit *CurSUnit = AvailableQueue->pop();
+      
+      ScheduleHazardRecognizer::HazardType HT =
+        HazardRec->getHazardType(CurSUnit);
+      if (HT == ScheduleHazardRecognizer::NoHazard) {
+        FoundSUnit = CurSUnit;
+        break;
+      }
+    
+      // Remember if this is a noop hazard.
+      HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard;
+      
+      NotReady.push_back(CurSUnit);
+    }
+    
+    // Add the nodes that aren't ready back onto the available list.
+    if (!NotReady.empty()) {
+      AvailableQueue->push_all(NotReady);
+      NotReady.clear();
+    }
+
+    // If we found a node to schedule, do it now.
+    if (FoundSUnit) {
+      ScheduleNodeTopDown(FoundSUnit, CurCycle);
+      HazardRec->EmitInstruction(FoundSUnit);
+
+      // If this is a pseudo-op node, we don't want to increment the current
+      // cycle.
+      if (FoundSUnit->Latency)  // Don't increment CurCycle for pseudo-ops!
+        ++CurCycle;        
+    } else if (!HasNoopHazards) {
+      // Otherwise, we have a pipeline stall, but no other problem, just advance
+      // the current cycle and try again.
+      DEBUG(dbgs() << "*** Advancing cycle, no work to do\n");
+      HazardRec->AdvanceCycle();
+      ++NumStalls;
+      ++CurCycle;
+    } else {
+      // Otherwise, we have no instructions to issue and we have instructions
+      // that will fault if we don't do this right.  This is the case for
+      // processors without pipeline interlocks and other cases.
+      DEBUG(dbgs() << "*** Emitting noop\n");
+      HazardRec->EmitNoop();
+      Sequence.push_back(0);   // NULL here means noop
+      ++NumNoops;
+      ++CurCycle;
+    }
+  }
+
+#ifndef NDEBUG
+  VerifySchedule(/*isBottomUp=*/false);
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+/// createTDListDAGScheduler - This creates a top-down list scheduler with a
+/// new hazard recognizer. This scheduler takes ownership of the hazard
+/// recognizer and deletes it when done.
+ScheduleDAGSDNodes *
+llvm::createTDListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+  return new ScheduleDAGList(*IS->MF,
+                             new LatencyPriorityQueue(),
+                             IS->CreateTargetHazardRecognizer());
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
new file mode 100644
index 0000000..3a5c616
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -0,0 +1,1993 @@
+//===----- ScheduleDAGRRList.cpp - Reg pressure reduction list scheduler --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements bottom-up and top-down register pressure reduction list
+// schedulers, using standard algorithms.  The basic approach uses a priority
+// queue of available nodes to schedule.  One at a time, nodes are taken from
+// the priority queue (thus in priority order), checked for legality to
+// schedule, and emitted if legal.
+//
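+// The bottom-up variant below additionally tracks live physical registers;
+// when every available node would clobber one, it backtracks (unscheduling
+// already-scheduled nodes), duplicates the defining node, or inserts
+// cross-class copies to break the dependence.
+//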
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <climits>
+using namespace llvm;
+
+STATISTIC(NumBacktracks, "Number of times scheduler backtracked");
+STATISTIC(NumUnfolds,    "Number of nodes unfolded");
+STATISTIC(NumDups,       "Number of duplicated nodes");
+STATISTIC(NumPRCopies,   "Number of physical register copies");
+
+static RegisterScheduler
+  burrListDAGScheduler("list-burr",
+                       "Bottom-up register reduction list scheduling",
+                       createBURRListDAGScheduler);
+static RegisterScheduler
+  tdrListrDAGScheduler("list-tdrr",
+                       "Top-down register reduction list scheduling",
+                       createTDRRListDAGScheduler);
+static RegisterScheduler
+  sourceListDAGScheduler("source",
+                         "Similar to list-burr but schedules in source "
+                         "order when possible",
+                         createSourceListDAGScheduler);
+
+static RegisterScheduler
+  hybridListDAGScheduler("list-hybrid",
+                         "Bottom-up register pressure aware list scheduling "
+                         "which tries to balance latency and register pressure",
+                         createHybridListDAGScheduler);
+
+static RegisterScheduler
+  ILPListDAGScheduler("list-ilp",
+                      "Bottom-up register pressure aware list scheduling "
+                      "which tries to balance ILP and register pressure",
+                      createILPListDAGScheduler);
+
+namespace {
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGRRList - The actual register reduction list scheduler
+/// implementation.  This supports both top-down and bottom-up scheduling.
+///
+class ScheduleDAGRRList : public ScheduleDAGSDNodes {
+private:
+  /// isBottomUp - This is true if the scheduling problem is bottom-up, false if
+  /// it is top-down.
+  bool isBottomUp;
+
+  /// NeedLatency - True if the scheduler will make use of latency information.
+  ///
+  bool NeedLatency;
+
+  /// AvailableQueue - The priority queue to use for the available SUnits.
+  SchedulingPriorityQueue *AvailableQueue;
+
+  /// LiveRegDefs - A map from physical registers to the SUnits that define
+  /// them and are currently "live". A defining node must be scheduled before
+  /// any other node that modifies the register can be scheduled.
+  unsigned NumLiveRegs;
+  std::vector<SUnit*> LiveRegDefs;
+  std::vector<unsigned> LiveRegCycles;
+
+  /// Topo - A topological ordering for SUnits which permits fast IsReachable
+  /// and similar queries.
+  ScheduleDAGTopologicalSort Topo;
+
+public:
+  ScheduleDAGRRList(MachineFunction &mf,
+                    bool isbottomup, bool needlatency,
+                    SchedulingPriorityQueue *availqueue)
+    : ScheduleDAGSDNodes(mf), isBottomUp(isbottomup), NeedLatency(needlatency),
+      AvailableQueue(availqueue), Topo(SUnits) {
+    }
+
+  ~ScheduleDAGRRList() {
+    delete AvailableQueue;
+  }
+
+  void Schedule();
+
+  /// IsReachable - Checks if SU is reachable from TargetSU.
+  bool IsReachable(const SUnit *SU, const SUnit *TargetSU) {
+    return Topo.IsReachable(SU, TargetSU);
+  }
+
+  /// WillCreateCycle - Returns true if adding an edge from SU to TargetSU will
+  /// create a cycle.
+  bool WillCreateCycle(SUnit *SU, SUnit *TargetSU) {
+    return Topo.WillCreateCycle(SU, TargetSU);
+  }
+
+  /// AddPred - Adds a predecessor edge to SUnit SU.
+  /// Updates the topological ordering if required.
+  void AddPred(SUnit *SU, const SDep &D) {
+    Topo.AddPred(SU, D.getSUnit());
+    SU->addPred(D);
+  }
+
+  /// RemovePred - Removes a predecessor edge from SUnit SU.
+  /// Updates the topological ordering if required.
+  void RemovePred(SUnit *SU, const SDep &D) {
+    Topo.RemovePred(SU, D.getSUnit());
+    SU->removePred(D);
+  }
+
+private:
+  void ReleasePred(SUnit *SU, const SDep *PredEdge);
+  void ReleasePredecessors(SUnit *SU, unsigned CurCycle);
+  void ReleaseSucc(SUnit *SU, const SDep *SuccEdge);
+  void ReleaseSuccessors(SUnit *SU);
+  void CapturePred(SDep *PredEdge);
+  void ScheduleNodeBottomUp(SUnit*, unsigned);
+  void ScheduleNodeTopDown(SUnit*, unsigned);
+  void UnscheduleNodeBottomUp(SUnit*);
+  void BacktrackBottomUp(SUnit*, unsigned, unsigned&);
+  SUnit *CopyAndMoveSuccessors(SUnit*);
+  void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
+                                const TargetRegisterClass*,
+                                const TargetRegisterClass*,
+                                SmallVector<SUnit*, 2>&);
+  bool DelayForLiveRegsBottomUp(SUnit*, SmallVector<unsigned, 4>&);
+  void ListScheduleTopDown();
+  void ListScheduleBottomUp();
+
+
+  /// CreateNewSUnit - Creates a new SUnit and returns a pointer to it.
+  /// Updates the topological ordering if required.
+  SUnit *CreateNewSUnit(SDNode *N) {
+    unsigned NumSUnits = SUnits.size();
+    SUnit *NewNode = NewSUnit(N);
+    // Update the topological ordering.
+    if (NewNode->NodeNum >= NumSUnits)
+      Topo.InitDAGTopologicalSorting();
+    return NewNode;
+  }
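+  // Note (informal): NewSUnit appends to the SUnits vector, so a genuinely
+  // new node always lands outside the ordering computed so far and the
+  // topological sort is rebuilt from scratch; this is correct, though
+  // potentially costly when clones are created frequently.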
+
+  /// CreateClone - Creates a new SUnit from an existing one.
+  /// Updates the topological ordering if required.
+  SUnit *CreateClone(SUnit *N) {
+    unsigned NumSUnits = SUnits.size();
+    SUnit *NewNode = Clone(N);
+    // Update the topological ordering.
+    if (NewNode->NodeNum >= NumSUnits)
+      Topo.InitDAGTopologicalSorting();
+    return NewNode;
+  }
+
+  /// ForceUnitLatencies - Register-pressure-reducing scheduling doesn't
+  /// need actual latency information but the hybrid scheduler does.
+  bool ForceUnitLatencies() const {
+    return !NeedLatency;
+  }
+};
+}  // end anonymous namespace
+
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGRRList::Schedule() {
+  DEBUG(dbgs()
+        << "********** List Scheduling BB#" << BB->getNumber()
+        << " **********\n");
+
+  NumLiveRegs = 0;
+  LiveRegDefs.resize(TRI->getNumRegs(), NULL);  
+  LiveRegCycles.resize(TRI->getNumRegs(), 0);
+
+  // Build the scheduling graph.
+  BuildSchedGraph(NULL);
+
+  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          SUnits[su].dumpAll(this));
+  Topo.InitDAGTopologicalSorting();
+
+  AvailableQueue->initNodes(SUnits);
+  
+  // Execute the actual scheduling loop Top-Down or Bottom-Up as appropriate.
+  if (isBottomUp)
+    ListScheduleBottomUp();
+  else
+    ListScheduleTopDown();
+  
+  AvailableQueue->releaseState();
+}
+
+//===----------------------------------------------------------------------===//
+//  Bottom-Up Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. Add it to
+/// the AvailableQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGRRList::ReleasePred(SUnit *SU, const SDep *PredEdge) {
+  SUnit *PredSU = PredEdge->getSUnit();
+
+#ifndef NDEBUG
+  if (PredSU->NumSuccsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    PredSU->dump(this);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(0);
+  }
+#endif
+  --PredSU->NumSuccsLeft;
+
+  if (!ForceUnitLatencies()) {
+    // Update the predecessor's height. This is now the cycle when the
+    // predecessor can be scheduled without causing a pipeline stall.
+    PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge->getLatency());
+  }
+
+  // If all the node's successors are scheduled, this node is ready
+  // to be scheduled. Ignore the special EntrySU node.
+  if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) {
+    PredSU->isAvailable = true;
+    AvailableQueue->push(PredSU);
+  }
+}
+
+void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU, unsigned CurCycle) {
+  // Bottom up: release predecessors
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    ReleasePred(SU, &*I);
+    if (I->isAssignedRegDep()) {
+      // This is a physical register dependency and it's impossible or
+      // expensive to copy the register. Make sure nothing that can 
+      // clobber the register is scheduled between the predecessor and
+      // this node.
+      if (!LiveRegDefs[I->getReg()]) {
+        ++NumLiveRegs;
+        LiveRegDefs[I->getReg()] = I->getSUnit();
+        LiveRegCycles[I->getReg()] = CurCycle;
+      }
+    }
+  }
+}
+
+/// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending
+/// count of its predecessors. If a predecessor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) {
+  DEBUG(dbgs() << "\n*** Scheduling [" << CurCycle << "]: ");
+  DEBUG(SU->dump(this));
+
+#ifndef NDEBUG
+  if (CurCycle < SU->getHeight())
+    DEBUG(dbgs() << "   Height [" << SU->getHeight() << "] pipeline stall!\n");
+#endif
+
+  // FIXME: Handle noop hazard.
+  SU->setHeightToAtLeast(CurCycle);
+  Sequence.push_back(SU);
+
+  AvailableQueue->ScheduledNode(SU);
+
+  ReleasePredecessors(SU, CurCycle);
+
+  // Release all the implicit physical register defs that are live.
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isAssignedRegDep()) {
+      if (LiveRegCycles[I->getReg()] == I->getSUnit()->getHeight()) {
+        assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+        assert(LiveRegDefs[I->getReg()] == SU &&
+               "Physical register dependency violated?");
+        --NumLiveRegs;
+        LiveRegDefs[I->getReg()] = NULL;
+        LiveRegCycles[I->getReg()] = 0;
+      }
+    }
+  }
+
+  SU->isScheduled = true;
+}
+
+/// CapturePred - This does the opposite of ReleasePred. Since SU is being
+/// unscheduled, increase the NumSuccsLeft count of its predecessors. Remove
+/// them from the AvailableQueue if necessary.
+void ScheduleDAGRRList::CapturePred(SDep *PredEdge) {  
+  SUnit *PredSU = PredEdge->getSUnit();
+  if (PredSU->isAvailable) {
+    PredSU->isAvailable = false;
+    if (!PredSU->isPending)
+      AvailableQueue->remove(PredSU);
+  }
+
+  assert(PredSU->NumSuccsLeft < UINT_MAX && "NumSuccsLeft will overflow!");
+  ++PredSU->NumSuccsLeft;
+}
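+// For example (a sketch): unscheduling a node with two data predecessors
+// calls CapturePred on each edge, bumping each predecessor's NumSuccsLeft
+// back up and removing the predecessor from the AvailableQueue if it had
+// already become available.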
+
+/// UnscheduleNodeBottomUp - Remove the node from the schedule, and update
+/// its state and its predecessors' states to reflect the change.
+void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
+  DEBUG(dbgs() << "*** Unscheduling [" << SU->getHeight() << "]: ");
+  DEBUG(SU->dump(this));
+
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    CapturePred(&*I);
+    if (I->isAssignedRegDep() && SU->getHeight() == LiveRegCycles[I->getReg()]){
+      assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+      assert(LiveRegDefs[I->getReg()] == I->getSUnit() &&
+             "Physical register dependency violated?");
+      --NumLiveRegs;
+      LiveRegDefs[I->getReg()] = NULL;
+      LiveRegCycles[I->getReg()] = 0;
+    }
+  }
+
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isAssignedRegDep()) {
+      if (!LiveRegDefs[I->getReg()]) {
+        LiveRegDefs[I->getReg()] = SU;
+        ++NumLiveRegs;
+      }
+      if (I->getSUnit()->getHeight() < LiveRegCycles[I->getReg()])
+        LiveRegCycles[I->getReg()] = I->getSUnit()->getHeight();
+    }
+  }
+
+  SU->setHeightDirty();
+  SU->isScheduled = false;
+  SU->isAvailable = true;
+  AvailableQueue->push(SU);
+  AvailableQueue->UnscheduledNode(SU);
+}
+
+/// BacktrackBottomUp - Backtrack scheduling to a previous cycle specified in
+/// BtCycle in order to schedule a specific node.
+void ScheduleDAGRRList::BacktrackBottomUp(SUnit *SU, unsigned BtCycle,
+                                          unsigned &CurCycle) {
+  SUnit *OldSU = NULL;
+  while (CurCycle > BtCycle) {
+    OldSU = Sequence.back();
+    Sequence.pop_back();
+    if (SU->isSucc(OldSU))
+      // Don't try to remove SU from AvailableQueue.
+      SU->isAvailable = false;
+    UnscheduleNodeBottomUp(OldSU);
+    --CurCycle;
+    AvailableQueue->setCurCycle(CurCycle);
+  }
+
+  assert(!SU->isSucc(OldSU) && "Something is wrong!");
+
+  ++NumBacktracks;
+}
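+// Informally: if CurCycle is 7 and BtCycle is 4, the last three entries of
+// Sequence are popped and unscheduled, leaving CurCycle at 4; SU can then
+// be scheduled ahead of the node that clobbers the live physical register
+// it needs.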
+
+static bool isOperandOf(const SUnit *SU, SDNode *N) {
+  for (const SDNode *SUNode = SU->getNode(); SUNode;
+       SUNode = SUNode->getFlaggedNode()) {
+    if (SUNode->isOperandOf(N))
+      return true;
+  }
+  return false;
+}
+
+/// CopyAndMoveSuccessors - Clone the specified node and move its scheduled
+/// successors to the newly created node.
+SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
+  SDNode *N = SU->getNode();
+  if (!N)
+    return NULL;
+
+  if (N->getFlaggedNode())
+    return NULL;
+
+  SUnit *NewSU;
+  bool TryUnfold = false;
+  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT == MVT::Flag)
+      return NULL;
+    else if (VT == MVT::Other)
+      TryUnfold = true;
+  }
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    const SDValue &Op = N->getOperand(i);
+    EVT VT = Op.getNode()->getValueType(Op.getResNo());
+    if (VT == MVT::Flag)
+      return NULL;
+  }
+
+  if (TryUnfold) {
+    SmallVector<SDNode*, 2> NewNodes;
+    if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
+      return NULL;
+
+    DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n");
+    assert(NewNodes.size() == 2 && "Expected a load folding node!");
+
+    N = NewNodes[1];
+    SDNode *LoadNode = NewNodes[0];
+    unsigned NumVals = N->getNumValues();
+    unsigned OldNumVals = SU->getNode()->getNumValues();
+    for (unsigned i = 0; i != NumVals; ++i)
+      DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
+    DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1),
+                                   SDValue(LoadNode, 1));
+
+    // LoadNode may already exist. This can happen when there is another
+    // load from the same location producing the same type of value
+    // but with different alignment or volatility.
+    bool isNewLoad = true;
+    SUnit *LoadSU;
+    if (LoadNode->getNodeId() != -1) {
+      LoadSU = &SUnits[LoadNode->getNodeId()];
+      isNewLoad = false;
+    } else {
+      LoadSU = CreateNewSUnit(LoadNode);
+      LoadNode->setNodeId(LoadSU->NodeNum);
+      ComputeLatency(LoadSU);
+    }
+
+    NewSU = CreateNewSUnit(N);
+    assert(N->getNodeId() == -1 && "Node already inserted!");
+    N->setNodeId(NewSU->NodeNum);
+      
+    const TargetInstrDesc &TID = TII->get(N->getMachineOpcode());
+    for (unsigned i = 0; i != TID.getNumOperands(); ++i) {
+      if (TID.getOperandConstraint(i, TOI::TIED_TO) != -1) {
+        NewSU->isTwoAddress = true;
+        break;
+      }
+    }
+    if (TID.isCommutable())
+      NewSU->isCommutable = true;
+    ComputeLatency(NewSU);
+
+    // Record all the edges to and from the old SU, by category.
+    SmallVector<SDep, 4> ChainPreds;
+    SmallVector<SDep, 4> ChainSuccs;
+    SmallVector<SDep, 4> LoadPreds;
+    SmallVector<SDep, 4> NodePreds;
+    SmallVector<SDep, 4> NodeSuccs;
+    for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+         I != E; ++I) {
+      if (I->isCtrl())
+        ChainPreds.push_back(*I);
+      else if (isOperandOf(I->getSUnit(), LoadNode))
+        LoadPreds.push_back(*I);
+      else
+        NodePreds.push_back(*I);
+    }
+    for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+         I != E; ++I) {
+      if (I->isCtrl())
+        ChainSuccs.push_back(*I);
+      else
+        NodeSuccs.push_back(*I);
+    }
+
+    // Now assign edges to the newly-created nodes.
+    for (unsigned i = 0, e = ChainPreds.size(); i != e; ++i) {
+      const SDep &Pred = ChainPreds[i];
+      RemovePred(SU, Pred);
+      if (isNewLoad)
+        AddPred(LoadSU, Pred);
+    }
+    for (unsigned i = 0, e = LoadPreds.size(); i != e; ++i) {
+      const SDep &Pred = LoadPreds[i];
+      RemovePred(SU, Pred);
+      if (isNewLoad)
+        AddPred(LoadSU, Pred);
+    }
+    for (unsigned i = 0, e = NodePreds.size(); i != e; ++i) {
+      const SDep &Pred = NodePreds[i];
+      RemovePred(SU, Pred);
+      AddPred(NewSU, Pred);
+    }
+    for (unsigned i = 0, e = NodeSuccs.size(); i != e; ++i) {
+      SDep D = NodeSuccs[i];
+      SUnit *SuccDep = D.getSUnit();
+      D.setSUnit(SU);
+      RemovePred(SuccDep, D);
+      D.setSUnit(NewSU);
+      AddPred(SuccDep, D);
+    }
+    for (unsigned i = 0, e = ChainSuccs.size(); i != e; ++i) {
+      SDep D = ChainSuccs[i];
+      SUnit *SuccDep = D.getSUnit();
+      D.setSUnit(SU);
+      RemovePred(SuccDep, D);
+      if (isNewLoad) {
+        D.setSUnit(LoadSU);
+        AddPred(SuccDep, D);
+      }
+    } 
+
+    // Add a data dependency to reflect that NewSU reads the value defined
+    // by LoadSU.
+    AddPred(NewSU, SDep(LoadSU, SDep::Data, LoadSU->Latency));
+
+    if (isNewLoad)
+      AvailableQueue->addNode(LoadSU);
+    AvailableQueue->addNode(NewSU);
+
+    ++NumUnfolds;
+
+    if (NewSU->NumSuccsLeft == 0) {
+      NewSU->isAvailable = true;
+      return NewSU;
+    }
+    SU = NewSU;
+  }
+
+  DEBUG(dbgs() << "    Duplicating SU #" << SU->NodeNum << "\n");
+  NewSU = CreateClone(SU);
+
+  // New SUnit has the exact same predecessors.
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I)
+    if (!I->isArtificial())
+      AddPred(NewSU, *I);
+
+  // Only copy scheduled successors. Cut them from old node's successor
+  // list and move them over.
+  SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isArtificial())
+      continue;
+    SUnit *SuccSU = I->getSUnit();
+    if (SuccSU->isScheduled) {
+      SDep D = *I;
+      D.setSUnit(NewSU);
+      AddPred(SuccSU, D);
+      D.setSUnit(SU);
+      DelDeps.push_back(std::make_pair(SuccSU, D));
+    }
+  }
+  for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
+    RemovePred(DelDeps[i].first, DelDeps[i].second);
+
+  AvailableQueue->updateNode(SU);
+  AvailableQueue->addNode(NewSU);
+
+  ++NumDups;
+  return NewSU;
+}
+
+/// InsertCopiesAndMoveSuccs - Insert register copies and move all
+/// scheduled successors of the given SUnit to the last copy.
+void ScheduleDAGRRList::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
+                                               const TargetRegisterClass *DestRC,
+                                               const TargetRegisterClass *SrcRC,
+                                               SmallVector<SUnit*, 2> &Copies) {
+  SUnit *CopyFromSU = CreateNewSUnit(NULL);
+  CopyFromSU->CopySrcRC = SrcRC;
+  CopyFromSU->CopyDstRC = DestRC;
+
+  SUnit *CopyToSU = CreateNewSUnit(NULL);
+  CopyToSU->CopySrcRC = DestRC;
+  CopyToSU->CopyDstRC = SrcRC;
+
+  // Only copy scheduled successors. Cut them from old node's successor
+  // list and move them over.
+  SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isArtificial())
+      continue;
+    SUnit *SuccSU = I->getSUnit();
+    if (SuccSU->isScheduled) {
+      SDep D = *I;
+      D.setSUnit(CopyToSU);
+      AddPred(SuccSU, D);
+      DelDeps.push_back(std::make_pair(SuccSU, *I));
+    }
+  }
+  for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
+    RemovePred(DelDeps[i].first, DelDeps[i].second);
+
+  AddPred(CopyFromSU, SDep(SU, SDep::Data, SU->Latency, Reg));
+  AddPred(CopyToSU, SDep(CopyFromSU, SDep::Data, CopyFromSU->Latency, 0));
+
+  AvailableQueue->updateNode(SU);
+  AvailableQueue->addNode(CopyFromSU);
+  AvailableQueue->addNode(CopyToSU);
+  Copies.push_back(CopyFromSU);
+  Copies.push_back(CopyToSU);
+
+  ++NumPRCopies;
+}
+
+/// getPhysicalRegisterVT - Returns the ValueType of the physical register
+/// definition of the specified node.
+/// FIXME: Move to SelectionDAG?
+static EVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
+                                 const TargetInstrInfo *TII) {
+  const TargetInstrDesc &TID = TII->get(N->getMachineOpcode());
+  assert(TID.ImplicitDefs && "Physical reg def must be in implicit def list!");
+  unsigned NumRes = TID.getNumDefs();
+  for (const unsigned *ImpDef = TID.getImplicitDefs(); *ImpDef; ++ImpDef) {
+    if (Reg == *ImpDef)
+      break;
+    ++NumRes;
+  }
+  return N->getValueType(NumRes);
+}
+
+/// CheckForLiveRegDef - Return true and update live register vector if the
+/// specified register def of the specified SUnit clobbers any "live" registers.
+static bool CheckForLiveRegDef(SUnit *SU, unsigned Reg,
+                               std::vector<SUnit*> &LiveRegDefs,
+                               SmallSet<unsigned, 4> &RegAdded,
+                               SmallVector<unsigned, 4> &LRegs,
+                               const TargetRegisterInfo *TRI) {
+  bool Added = false;
+  if (LiveRegDefs[Reg] && LiveRegDefs[Reg] != SU) {
+    if (RegAdded.insert(Reg)) {
+      LRegs.push_back(Reg);
+      Added = true;
+    }
+  }
+  for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias)
+    if (LiveRegDefs[*Alias] && LiveRegDefs[*Alias] != SU) {
+      if (RegAdded.insert(*Alias)) {
+        LRegs.push_back(*Alias);
+        Added = true;
+      }
+    }
+  return Added;
+}
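+// Example (hypothetical x86-style registers): if Reg is AX and its alias
+// EAX is already in LiveRegDefs with a different defining SUnit, the alias
+// walk above adds EAX to LRegs, so the node is delayed even though AX
+// itself has no recorded live def.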
+
+/// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay
+/// scheduling of the given node to satisfy live physical register dependencies.
+/// If the specified node is the last one available to schedule, the caller
+/// does whatever is necessary (i.e. backtracking or cloning) to make
+/// scheduling it possible.
+bool ScheduleDAGRRList::DelayForLiveRegsBottomUp(SUnit *SU,
+                                                 SmallVector<unsigned, 4> &LRegs){
+  if (NumLiveRegs == 0)
+    return false;
+
+  SmallSet<unsigned, 4> RegAdded;
+  // If this node would clobber any "live" register, then it's not ready.
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isAssignedRegDep())
+      CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs,
+                         RegAdded, LRegs, TRI);
+  }
+
+  for (SDNode *Node = SU->getNode(); Node; Node = Node->getFlaggedNode()) {
+    if (!Node->isMachineOpcode())
+      continue;
+    const TargetInstrDesc &TID = TII->get(Node->getMachineOpcode());
+    if (!TID.ImplicitDefs)
+      continue;
+    for (const unsigned *Reg = TID.ImplicitDefs; *Reg; ++Reg)
+      CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI);
+  }
+  return !LRegs.empty();
+}
+
+
+/// ListScheduleBottomUp - The main loop of list scheduling for bottom-up
+/// schedulers.
+void ScheduleDAGRRList::ListScheduleBottomUp() {
+  unsigned CurCycle = 0;
+
+  // Release any predecessors of the special Exit node.
+  ReleasePredecessors(&ExitSU, CurCycle);
+
+  // Add root to Available queue.
+  if (!SUnits.empty()) {
+    SUnit *RootSU = &SUnits[DAG->getRoot().getNode()->getNodeId()];
+    assert(RootSU->Succs.empty() && "Graph root shouldn't have successors!");
+    RootSU->isAvailable = true;
+    AvailableQueue->push(RootSU);
+  }
+
+  // While Available queue is not empty, grab the node with the highest
+  // priority. If it is not ready put it back.  Schedule the node.
+  SmallVector<SUnit*, 4> NotReady;
+  DenseMap<SUnit*, SmallVector<unsigned, 4> > LRegsMap;
+  Sequence.reserve(SUnits.size());
+  while (!AvailableQueue->empty()) {
+    bool Delayed = false;
+    LRegsMap.clear();
+    SUnit *CurSU = AvailableQueue->pop();
+    while (CurSU) {
+      SmallVector<unsigned, 4> LRegs;
+      if (!DelayForLiveRegsBottomUp(CurSU, LRegs))
+        break;
+      Delayed = true;
+      LRegsMap.insert(std::make_pair(CurSU, LRegs));
+
+      CurSU->isPending = true;  // This SU is not in AvailableQueue right now.
+      NotReady.push_back(CurSU);
+      CurSU = AvailableQueue->pop();
+    }
+
+    // All candidates are delayed due to live physical reg dependencies.
+    // Try backtracking, code duplication, or inserting cross class copies
+    // to resolve it.
+    if (Delayed && !CurSU) {
+      for (unsigned i = 0, e = NotReady.size(); i != e; ++i) {
+        SUnit *TrySU = NotReady[i];
+        SmallVector<unsigned, 4> &LRegs = LRegsMap[TrySU];
+
+        // Try unscheduling up to the point where it's safe to schedule
+        // this node.
+        unsigned LiveCycle = CurCycle;
+        for (unsigned j = 0, ee = LRegs.size(); j != ee; ++j) {
+          unsigned Reg = LRegs[j];
+          unsigned LCycle = LiveRegCycles[Reg];
+          LiveCycle = std::min(LiveCycle, LCycle);
+        }
+        SUnit *OldSU = Sequence[LiveCycle];
+        if (!WillCreateCycle(TrySU, OldSU))  {
+          BacktrackBottomUp(TrySU, LiveCycle, CurCycle);
+          // Force the current node to be scheduled before the node that
+          // requires the physical reg dep.
+          if (OldSU->isAvailable) {
+            OldSU->isAvailable = false;
+            AvailableQueue->remove(OldSU);
+          }
+          AddPred(TrySU, SDep(OldSU, SDep::Order, /*Latency=*/1,
+                              /*Reg=*/0, /*isNormalMemory=*/false,
+                              /*isMustAlias=*/false, /*isArtificial=*/true));
+          // If one or more successors have been unscheduled, then the current
+          // node is no longer available. Schedule a successor that's now
+          // available instead.
+          if (!TrySU->isAvailable)
+            CurSU = AvailableQueue->pop();
+          else {
+            CurSU = TrySU;
+            TrySU->isPending = false;
+            NotReady.erase(NotReady.begin()+i);
+          }
+          break;
+        }
+      }
+
+      if (!CurSU) {
+        // Can't backtrack. If it's too expensive to copy the value, then try
+        // duplicating the nodes that produce these "too expensive to copy"
+        // values to break the dependency. In case even that doesn't work,
+        // insert cross class copies.
+        // If it's not too expensive, i.e. cost != -1, issue copies.
+        SUnit *TrySU = NotReady[0];
+        SmallVector<unsigned, 4> &LRegs = LRegsMap[TrySU];
+        assert(LRegs.size() == 1 && "Can't handle this yet!");
+        unsigned Reg = LRegs[0];
+        SUnit *LRDef = LiveRegDefs[Reg];
+        EVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
+        const TargetRegisterClass *RC =
+          TRI->getMinimalPhysRegClass(Reg, VT);
+        const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC);
+
+        // If the cross copy register class is null, then it must be possible
+        // to copy the value directly. Do not try to duplicate the def.
+        SUnit *NewDef = 0;
+        if (DestRC)
+          NewDef = CopyAndMoveSuccessors(LRDef);
+        else
+          DestRC = RC;
+        if (!NewDef) {
+          // Issue copies, these can be expensive cross register class copies.
+          SmallVector<SUnit*, 2> Copies;
+          InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
+          DEBUG(dbgs() << "    Adding an edge from SU #" << TrySU->NodeNum
+                       << " to SU #" << Copies.front()->NodeNum << "\n");
+          AddPred(TrySU, SDep(Copies.front(), SDep::Order, /*Latency=*/1,
+                              /*Reg=*/0, /*isNormalMemory=*/false,
+                              /*isMustAlias=*/false,
+                              /*isArtificial=*/true));
+          NewDef = Copies.back();
+        }
+
+        DEBUG(dbgs() << "    Adding an edge from SU #" << NewDef->NodeNum
+                     << " to SU #" << TrySU->NodeNum << "\n");
+        LiveRegDefs[Reg] = NewDef;
+        AddPred(NewDef, SDep(TrySU, SDep::Order, /*Latency=*/1,
+                             /*Reg=*/0, /*isNormalMemory=*/false,
+                             /*isMustAlias=*/false,
+                             /*isArtificial=*/true));
+        TrySU->isAvailable = false;
+        CurSU = NewDef;
+      }
+
+      assert(CurSU && "Unable to resolve live physical register dependencies!");
+    }
+
+    // Add the nodes that aren't ready back onto the available list.
+    for (unsigned i = 0, e = NotReady.size(); i != e; ++i) {
+      NotReady[i]->isPending = false;
+      // May no longer be available due to backtracking.
+      if (NotReady[i]->isAvailable)
+        AvailableQueue->push(NotReady[i]);
+    }
+    NotReady.clear();
+
+    if (CurSU)
+      ScheduleNodeBottomUp(CurSU, CurCycle);
+    ++CurCycle;
+    AvailableQueue->setCurCycle(CurCycle);
+  }
+
+  // Reverse the order if it is bottom up.
+  std::reverse(Sequence.begin(), Sequence.end());
+  
+#ifndef NDEBUG
+  VerifySchedule(isBottomUp);
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+//  Top-Down Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. Add it to
+/// the AvailableQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGRRList::ReleaseSucc(SUnit *SU, const SDep *SuccEdge) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+#ifndef NDEBUG
+  if (SuccSU->NumPredsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    SuccSU->dump(this);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(0);
+  }
+#endif
+  --SuccSU->NumPredsLeft;
+
+  // If all the node's predecessors are scheduled, this node is ready
+  // to be scheduled. Ignore the special ExitSU node.
+  if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU) {
+    SuccSU->isAvailable = true;
+    AvailableQueue->push(SuccSU);
+  }
+}
+
+void ScheduleDAGRRList::ReleaseSuccessors(SUnit *SU) {
+  // Top down: release successors
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    assert(!I->isAssignedRegDep() &&
+           "The list-tdrr scheduler doesn't yet support physreg dependencies!");
+
+    ReleaseSucc(SU, &*I);
+  }
+}
+
+/// ScheduleNodeTopDown - Add the node to the schedule. Decrement the pending
+/// count of its successors. If a successor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGRRList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
+  DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+  DEBUG(SU->dump(this));
+
+  assert(CurCycle >= SU->getDepth() && "Node scheduled above its depth!");
+  SU->setDepthToAtLeast(CurCycle);
+  Sequence.push_back(SU);
+
+  ReleaseSuccessors(SU);
+  SU->isScheduled = true;
+  AvailableQueue->ScheduledNode(SU);
+}
+
+/// ListScheduleTopDown - The main loop of list scheduling for top-down
+/// schedulers.
+void ScheduleDAGRRList::ListScheduleTopDown() {
+  unsigned CurCycle = 0;
+  AvailableQueue->setCurCycle(CurCycle);
+
+  // Release any successors of the special Entry node.
+  ReleaseSuccessors(&EntrySU);
+
+  // Add all leaves to the Available queue.
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    // It is available if it has no predecessors.
+    if (SUnits[i].Preds.empty()) {
+      AvailableQueue->push(&SUnits[i]);
+      SUnits[i].isAvailable = true;
+    }
+  }
+  
+  // While the Available queue is not empty, grab the node with the highest
+  // priority and schedule it.
+  Sequence.reserve(SUnits.size());
+  while (!AvailableQueue->empty()) {
+    SUnit *CurSU = AvailableQueue->pop();
+    
+    if (CurSU)
+      ScheduleNodeTopDown(CurSU, CurCycle);
+    ++CurCycle;
+    AvailableQueue->setCurCycle(CurCycle);
+  }
+  
+#ifndef NDEBUG
+  VerifySchedule(isBottomUp);
+#endif
+}
+
+
+//===----------------------------------------------------------------------===//
+//                RegReductionPriorityQueue Implementation
+//===----------------------------------------------------------------------===//
+//
+// This is a SchedulingPriorityQueue that schedules using Sethi Ullman numbers
+// to reduce register pressure.
+// 
+namespace {
+  template<class SF>
+  class RegReductionPriorityQueue;
+  
+  // bu_ls_rr_sort - Priority function for bottom up register pressure
+  // reduction scheduler.
+  struct bu_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> {
+    RegReductionPriorityQueue<bu_ls_rr_sort> *SPQ;
+    bu_ls_rr_sort(RegReductionPriorityQueue<bu_ls_rr_sort> *spq) : SPQ(spq) {}
+    bu_ls_rr_sort(const bu_ls_rr_sort &RHS) : SPQ(RHS.SPQ) {}
+    
+    bool operator()(const SUnit* left, const SUnit* right) const;
+  };
+
+  // td_ls_rr_sort - Priority function for top down register pressure reduction
+  // scheduler.
+  struct td_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> {
+    RegReductionPriorityQueue<td_ls_rr_sort> *SPQ;
+    td_ls_rr_sort(RegReductionPriorityQueue<td_ls_rr_sort> *spq) : SPQ(spq) {}
+    td_ls_rr_sort(const td_ls_rr_sort &RHS) : SPQ(RHS.SPQ) {}
+    
+    bool operator()(const SUnit* left, const SUnit* right) const;
+  };
+
+  // src_ls_rr_sort - Priority function for source order scheduler.
+  struct src_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> {
+    RegReductionPriorityQueue<src_ls_rr_sort> *SPQ;
+    src_ls_rr_sort(RegReductionPriorityQueue<src_ls_rr_sort> *spq)
+      : SPQ(spq) {}
+    src_ls_rr_sort(const src_ls_rr_sort &RHS)
+      : SPQ(RHS.SPQ) {}
+    
+    bool operator()(const SUnit* left, const SUnit* right) const;
+  };
+
+  // hybrid_ls_rr_sort - Priority function for hybrid scheduler.
+  struct hybrid_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> {
+    RegReductionPriorityQueue<hybrid_ls_rr_sort> *SPQ;
+    hybrid_ls_rr_sort(RegReductionPriorityQueue<hybrid_ls_rr_sort> *spq)
+      : SPQ(spq) {}
+    hybrid_ls_rr_sort(const hybrid_ls_rr_sort &RHS)
+      : SPQ(RHS.SPQ) {}
+
+    bool operator()(const SUnit* left, const SUnit* right) const;
+  };
+
+  // ilp_ls_rr_sort - Priority function for ILP (instruction level parallelism)
+  // scheduler.
+  struct ilp_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> {
+    RegReductionPriorityQueue<ilp_ls_rr_sort> *SPQ;
+    ilp_ls_rr_sort(RegReductionPriorityQueue<ilp_ls_rr_sort> *spq)
+      : SPQ(spq) {}
+    ilp_ls_rr_sort(const ilp_ls_rr_sort &RHS)
+      : SPQ(RHS.SPQ) {}
+
+    bool operator()(const SUnit* left, const SUnit* right) const;
+  };
+}  // end anonymous namespace
+
+/// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number.
+/// Smaller number is the higher priority.
+static unsigned
+CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
+  unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum];
+  if (SethiUllmanNumber != 0)
+    return SethiUllmanNumber;
+
+  unsigned Extra = 0;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;  // ignore chain preds
+    SUnit *PredSU = I->getSUnit();
+    unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers);
+    if (PredSethiUllman > SethiUllmanNumber) {
+      SethiUllmanNumber = PredSethiUllman;
+      Extra = 0;
+    } else if (PredSethiUllman == SethiUllmanNumber)
+      ++Extra;
+  }
+
+  SethiUllmanNumber += Extra;
+
+  if (SethiUllmanNumber == 0)
+    SethiUllmanNumber = 1;
+  
+  return SethiUllmanNumber;
+}
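+
+// Worked illustration of the numbering above (an informal sketch, not extra
+// logic): a leaf SUnit has no data preds, so its number defaults to 1.  A
+// node whose two operand subtrees are both numbered 1 sees one tie
+// (Extra == 1) and gets 1 + 1 = 2: whichever operand is evaluated first,
+// its value stays live while the other is computed.  A node with subtrees
+// numbered 2 and 1 keeps the maximum, 2, because the larger subtree can be
+// evaluated first without holding any extra value live.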
+
+namespace {
+  template<class SF>
+  class RegReductionPriorityQueue : public SchedulingPriorityQueue {
+    std::vector<SUnit*> Queue;
+    SF Picker;
+    unsigned CurQueueId;
+    bool TracksRegPressure;
+
+  protected:
+    // SUnits - The SUnits for the current graph.
+    std::vector<SUnit> *SUnits;
+
+    MachineFunction &MF;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const TargetLowering *TLI;
+    ScheduleDAGRRList *scheduleDAG;
+
+    // SethiUllmanNumbers - The SethiUllman number for each node.
+    std::vector<unsigned> SethiUllmanNumbers;
+
+    /// RegPressure - Tracking current reg pressure per register class.
+    ///
+    std::vector<unsigned> RegPressure;
+
+    /// RegLimit - Tracking the number of allocatable registers per register
+    /// class.
+    std::vector<unsigned> RegLimit;
+
+  public:
+    RegReductionPriorityQueue(MachineFunction &mf,
+                              bool tracksrp,
+                              const TargetInstrInfo *tii,
+                              const TargetRegisterInfo *tri,
+                              const TargetLowering *tli)
+      : Picker(this), CurQueueId(0), TracksRegPressure(tracksrp),
+        MF(mf), TII(tii), TRI(tri), TLI(tli), scheduleDAG(NULL) {
+      if (TracksRegPressure) {
+        unsigned NumRC = TRI->getNumRegClasses();
+        RegLimit.resize(NumRC);
+        RegPressure.resize(NumRC);
+        std::fill(RegLimit.begin(), RegLimit.end(), 0);
+        std::fill(RegPressure.begin(), RegPressure.end(), 0);
+        for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+               E = TRI->regclass_end(); I != E; ++I)
+          RegLimit[(*I)->getID()] = tli->getRegPressureLimit(*I, MF);
+      }
+    }
+    
+    void initNodes(std::vector<SUnit> &sunits) {
+      SUnits = &sunits;
+      // Add pseudo dependency edges for two-address nodes.
+      AddPseudoTwoAddrDeps();
+      // Reroute edges to nodes with multiple uses.
+      PrescheduleNodesWithMultipleUses();
+      // Calculate node priorities.
+      CalculateSethiUllmanNumbers();
+    }
+
+    void addNode(const SUnit *SU) {
+      unsigned SUSize = SethiUllmanNumbers.size();
+      if (SUnits->size() > SUSize)
+        SethiUllmanNumbers.resize(SUSize*2, 0);
+      CalcNodeSethiUllmanNumber(SU, SethiUllmanNumbers);
+    }
+
+    void updateNode(const SUnit *SU) {
+      SethiUllmanNumbers[SU->NodeNum] = 0;
+      CalcNodeSethiUllmanNumber(SU, SethiUllmanNumbers);
+    }
+
+    void releaseState() {
+      SUnits = 0;
+      SethiUllmanNumbers.clear();
+      std::fill(RegPressure.begin(), RegPressure.end(), 0);
+    }
+
+    unsigned getNodePriority(const SUnit *SU) const {
+      assert(SU->NodeNum < SethiUllmanNumbers.size());
+      unsigned Opc = SU->getNode() ? SU->getNode()->getOpcode() : 0;
+      if (Opc == ISD::TokenFactor || Opc == ISD::CopyToReg)
+        // CopyToReg should be close to its uses to facilitate coalescing and
+        // avoid spilling.
+        return 0;
+      if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+          Opc == TargetOpcode::SUBREG_TO_REG ||
+          Opc == TargetOpcode::INSERT_SUBREG)
+        // EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG nodes should be
+        // close to their uses to facilitate coalescing.
+        return 0;
+      if (SU->NumSuccs == 0 && SU->NumPreds != 0)
+        // If SU does not have a register use, i.e. it doesn't produce a value
+        // that would be consumed (e.g. store), then it terminates a chain of
+        // computation.  Give it a large SethiUllman number so it will be
+        // scheduled right before its predecessors, so that it doesn't
+        // lengthen their live ranges.
+        return 0xffff;
+      if (SU->NumPreds == 0 && SU->NumSuccs != 0)
+        // If SU does not have a register def, schedule it close to its uses
+        // because it does not lengthen any live ranges.
+        return 0;
+      return SethiUllmanNumbers[SU->NodeNum];
+    }
+
+    unsigned getNodeOrdering(const SUnit *SU) const {
+      return scheduleDAG->DAG->GetOrdering(SU->getNode());
+    }
+
+    bool empty() const { return Queue.empty(); }
+    
+    void push(SUnit *U) {
+      assert(!U->NodeQueueId && "Node in the queue already");
+      U->NodeQueueId = ++CurQueueId;
+      Queue.push_back(U);
+    }
+
+    SUnit *pop() {
+      if (empty()) return NULL;
+      std::vector<SUnit *>::iterator Best = Queue.begin();
+      for (std::vector<SUnit *>::iterator I = llvm::next(Queue.begin()),
+           E = Queue.end(); I != E; ++I)
+        if (Picker(*Best, *I))
+          Best = I;
+      SUnit *V = *Best;
+      if (Best != prior(Queue.end()))
+        std::swap(*Best, Queue.back());
+      Queue.pop_back();
+      V->NodeQueueId = 0;
+      return V;
+    }
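+
+    // Note: pop() rescans the whole queue with Picker on every call (O(n))
+    // rather than keeping a heap.  A plausible reason: the comparison reads
+    // mutable state (register pressure, current cycle) that changes between
+    // pops, which would silently invalidate a cached heap ordering.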
+
+    void remove(SUnit *SU) {
+      assert(!Queue.empty() && "Queue is empty!");
+      assert(SU->NodeQueueId != 0 && "Not in queue!");
+      std::vector<SUnit *>::iterator I = std::find(Queue.begin(), Queue.end(),
+                                                   SU);
+      if (I != prior(Queue.end()))
+        std::swap(*I, Queue.back());
+      Queue.pop_back();
+      SU->NodeQueueId = 0;
+    }
+
+    bool HighRegPressure(const SUnit *SU) const {
+      if (!TLI)
+        return false;
+
+      for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
+           I != E; ++I) {
+        if (I->isCtrl())
+          continue;
+        SUnit *PredSU = I->getSUnit();
+        const SDNode *PN = PredSU->getNode();
+        if (!PN->isMachineOpcode()) {
+          if (PN->getOpcode() == ISD::CopyFromReg) {
+            EVT VT = PN->getValueType(0);
+            unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+            unsigned Cost = TLI->getRepRegClassCostFor(VT);
+            if ((RegPressure[RCId] + Cost) >= RegLimit[RCId])
+              return true;
+          }
+          continue;
+        }
+        unsigned POpc = PN->getMachineOpcode();
+        if (POpc == TargetOpcode::IMPLICIT_DEF)
+          continue;
+        if (POpc == TargetOpcode::EXTRACT_SUBREG) {
+          EVT VT = PN->getOperand(0).getValueType();
+          unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+          unsigned Cost = TLI->getRepRegClassCostFor(VT);
+          // Check if this increases register pressure of the specific register
+          // class to the point where it would cause spills.
+          if ((RegPressure[RCId] + Cost) >= RegLimit[RCId])
+            return true;
+          continue;            
+        } else if (POpc == TargetOpcode::INSERT_SUBREG ||
+                   POpc == TargetOpcode::SUBREG_TO_REG) {
+          EVT VT = PN->getValueType(0);
+          unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+          unsigned Cost = TLI->getRepRegClassCostFor(VT);
+          // Check if this increases register pressure of the specific register
+          // class to the point where it would cause spills.
+          if ((RegPressure[RCId] + Cost) >= RegLimit[RCId])
+            return true;
+          continue;
+        }
+        unsigned NumDefs = TII->get(PN->getMachineOpcode()).getNumDefs();
+        for (unsigned i = 0; i != NumDefs; ++i) {
+          EVT VT = PN->getValueType(i);
+          unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+          if (RegPressure[RCId] >= RegLimit[RCId])
+            return true; // Reg pressure already high.
+          unsigned Cost = TLI->getRepRegClassCostFor(VT);
+          if (!PN->hasAnyUseOfValue(i))
+            continue;
+          // Check if this increases register pressure of the specific register
+          // class to the point where it would cause spills.
+          if ((RegPressure[RCId] + Cost) >= RegLimit[RCId])
+            return true;
+        }
+      }
+
+      return false;
+    }
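+
+    // Hypothetical numbers for illustration: with RegLimit[RC] == 8 and
+    // RegPressure[RC] == 7 for some class RC, a predecessor def whose class
+    // cost is 1 makes 7 + 1 >= 8, so the node is reported as high-pressure
+    // and the hybrid and ILP pickers deprioritize it to avoid spills.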
+
+    void ScheduledNode(SUnit *SU) {
+      if (!TracksRegPressure)
+        return;
+
+      const SDNode *N = SU->getNode();
+      if (!N->isMachineOpcode()) {
+        if (N->getOpcode() != ISD::CopyToReg)
+          return;
+      } else {
+        unsigned Opc = N->getMachineOpcode();
+        if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+            Opc == TargetOpcode::INSERT_SUBREG ||
+            Opc == TargetOpcode::SUBREG_TO_REG ||
+            Opc == TargetOpcode::REG_SEQUENCE ||
+            Opc == TargetOpcode::IMPLICIT_DEF)
+          return;
+      }
+
+      for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+           I != E; ++I) {
+        if (I->isCtrl())
+          continue;
+        SUnit *PredSU = I->getSUnit();
+        if (PredSU->NumSuccsLeft != PredSU->NumSuccs)
+          continue;
+        const SDNode *PN = PredSU->getNode();
+        if (!PN->isMachineOpcode()) {
+          if (PN->getOpcode() == ISD::CopyFromReg) {
+            EVT VT = PN->getValueType(0);
+            unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+            RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+          }
+          continue;
+        }
+        unsigned POpc = PN->getMachineOpcode();
+        if (POpc == TargetOpcode::IMPLICIT_DEF)
+          continue;
+        if (POpc == TargetOpcode::EXTRACT_SUBREG) {
+          EVT VT = PN->getOperand(0).getValueType();
+          unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+          RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+          continue;            
+        } else if (POpc == TargetOpcode::INSERT_SUBREG ||
+                   POpc == TargetOpcode::SUBREG_TO_REG) {
+          EVT VT = PN->getValueType(0);
+          unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+          RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+          continue;
+        }
+        unsigned NumDefs = TII->get(PN->getMachineOpcode()).getNumDefs();
+        for (unsigned i = 0; i != NumDefs; ++i) {
+          EVT VT = PN->getValueType(i);
+          if (!PN->hasAnyUseOfValue(i))
+            continue;
+          unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+          RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+        }
+      }
+
+      // Check for isMachineOpcode() as PrescheduleNodesWithMultipleUses()
+      // may transfer data dependencies to CopyToReg.
+      if (SU->NumSuccs && N->isMachineOpcode()) {
+        unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+        for (unsigned i = 0; i != NumDefs; ++i) {
+          EVT VT = N->getValueType(i);
+          if (!N->hasAnyUseOfValue(i))
+            continue;
+          unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+          if (RegPressure[RCId] < TLI->getRepRegClassCostFor(VT))
+            // Register pressure tracking is imprecise. This can happen.
+            RegPressure[RCId] = 0;
+          else
+            RegPressure[RCId] -= TLI->getRepRegClassCostFor(VT);
+        }
+      }
+
+      dumpRegPressure();
+    }
+
+    void UnscheduledNode(SUnit *SU) {
+      if (!TracksRegPressure)
+        return;
+
+      const SDNode *N = SU->getNode();
+      if (!N->isMachineOpcode()) {
+        if (N->getOpcode() != ISD::CopyToReg)
+          return;
+      } else {
+        unsigned Opc = N->getMachineOpcode();
+        if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+            Opc == TargetOpcode::INSERT_SUBREG ||
+            Opc == TargetOpcode::SUBREG_TO_REG ||
+            Opc == TargetOpcode::REG_SEQUENCE ||
+            Opc == TargetOpcode::IMPLICIT_DEF)
+          return;
+      }
+
+      for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+           I != E; ++I) {
+        if (I->isCtrl())
+          continue;
+        SUnit *PredSU = I->getSUnit();
+        if (PredSU->NumSuccsLeft != PredSU->NumSuccs)
+          continue;
+        const SDNode *PN = PredSU->getNode();
+        if (!PN->isMachineOpcode()) {
+          if (PN->getOpcode() == ISD::CopyFromReg) {
+            EVT VT = PN->getValueType(0);
+            unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+            RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+          }
+          continue;
+        }
+        unsigned POpc = PN->getMachineOpcode();
+        if (POpc == TargetOpcode::IMPLICIT_DEF)
+          continue;
+        if (POpc == TargetOpcode::EXTRACT_SUBREG) {
+          EVT VT = PN->getOperand(0).getValueType();
+          unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+          RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+          continue;            
+        } else if (POpc == TargetOpcode::INSERT_SUBREG ||
+                   POpc == TargetOpcode::SUBREG_TO_REG) {
+          EVT VT = PN->getValueType(0);
+          unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+          RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+          continue;
+        }
+        unsigned NumDefs = TII->get(PN->getMachineOpcode()).getNumDefs();
+        for (unsigned i = 0; i != NumDefs; ++i) {
+          EVT VT = PN->getValueType(i);
+          if (!PN->hasAnyUseOfValue(i))
+            continue;
+          unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+          if (RegPressure[RCId] < TLI->getRepRegClassCostFor(VT))
+            // Register pressure tracking is imprecise. This can happen.
+            RegPressure[RCId] = 0;
+          else
+            RegPressure[RCId] -= TLI->getRepRegClassCostFor(VT);
+        }
+      }
+
+      // Check for isMachineOpcode() as PrescheduleNodesWithMultipleUses()
+      // may transfer data dependencies to CopyToReg.
+      if (SU->NumSuccs && N->isMachineOpcode()) {
+        unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+        for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) {
+          EVT VT = N->getValueType(i);
+          if (VT == MVT::Flag || VT == MVT::Other)
+            continue;
+          if (!N->hasAnyUseOfValue(i))
+            continue;
+          unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+          RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+        }
+      }
+
+      dumpRegPressure();
+    }
+
+    void setScheduleDAG(ScheduleDAGRRList *scheduleDag) { 
+      scheduleDAG = scheduleDag; 
+    }
+
+    void dumpRegPressure() const {
+      for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+             E = TRI->regclass_end(); I != E; ++I) {
+        const TargetRegisterClass *RC = *I;
+        unsigned Id = RC->getID();
+        unsigned RP = RegPressure[Id];
+        if (!RP) continue;
+        DEBUG(dbgs() << RC->getName() << ": " << RP << " / " << RegLimit[Id]
+              << '\n');
+      }
+    }
+
+  protected:
+    bool canClobber(const SUnit *SU, const SUnit *Op);
+    void AddPseudoTwoAddrDeps();
+    void PrescheduleNodesWithMultipleUses();
+    void CalculateSethiUllmanNumbers();
+  };
+
+  typedef RegReductionPriorityQueue<bu_ls_rr_sort>
+    BURegReductionPriorityQueue;
+
+  typedef RegReductionPriorityQueue<td_ls_rr_sort>
+    TDRegReductionPriorityQueue;
+
+  typedef RegReductionPriorityQueue<src_ls_rr_sort>
+    SrcRegReductionPriorityQueue;
+
+  typedef RegReductionPriorityQueue<hybrid_ls_rr_sort>
+    HybridBURRPriorityQueue;
+
+  typedef RegReductionPriorityQueue<ilp_ls_rr_sort>
+    ILPBURRPriorityQueue;
+}
+
+/// closestSucc - Returns the scheduled cycle of the successor which is
+/// closest to the current cycle.
+static unsigned closestSucc(const SUnit *SU) {
+  unsigned MaxHeight = 0;
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;  // ignore chain succs
+    unsigned Height = I->getSUnit()->getHeight();
+    // If there are a bunch of CopyToRegs stacked up, they should be
+    // considered to be at the same position.
+    if (I->getSUnit()->getNode() &&
+        I->getSUnit()->getNode()->getOpcode() == ISD::CopyToReg)
+      Height = closestSucc(I->getSUnit())+1;
+    if (Height > MaxHeight)
+      MaxHeight = Height;
+  }
+  return MaxHeight;
+}
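+
+// Illustration (informal): if SU's only data successor is a CopyToReg at
+// height 3 whose own closest successor sits at height 5, the recursive
+// CopyToReg walk above reports 5 + 1 = 6 for SU instead of 3, so stacked
+// copies are treated as being at their ultimate consumer's position.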
+
+/// calcMaxScratches - Returns a cost estimate of the worst-case requirement
+/// for scratch registers, i.e. the number of data dependencies.
+static unsigned calcMaxScratches(const SUnit *SU) {
+  unsigned Scratches = 0;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;  // ignore chain preds
+    Scratches++;
+  }
+  return Scratches;
+}
+
+template <typename RRSort>
+static bool BURRSort(const SUnit *left, const SUnit *right,
+                     const RegReductionPriorityQueue<RRSort> *SPQ) {
+  unsigned LPriority = SPQ->getNodePriority(left);
+  unsigned RPriority = SPQ->getNodePriority(right);
+  if (LPriority != RPriority)
+    return LPriority > RPriority;
+
+  // Try to schedule def and use closer together when Sethi-Ullman numbers
+  // are the same.
+  // e.g.
+  // t1 = op t2, c1
+  // t3 = op t4, c2
+  //
+  // and the following instructions are both ready.
+  // t2 = op c3
+  // t4 = op c4
+  //
+  // Then schedule t2 = op first.
+  // i.e.
+  // t4 = op c4
+  // t2 = op c3
+  // t1 = op t2, c1
+  // t3 = op t4, c2
+  //
+  // This creates more short live intervals.
+  unsigned LDist = closestSucc(left);
+  unsigned RDist = closestSucc(right);
+  if (LDist != RDist)
+    return LDist < RDist;
+
+  // How many registers become live when the node is scheduled.
+  unsigned LScratch = calcMaxScratches(left);
+  unsigned RScratch = calcMaxScratches(right);
+  if (LScratch != RScratch)
+    return LScratch > RScratch;
+
+  if (left->getHeight() != right->getHeight())
+    return left->getHeight() > right->getHeight();
+  
+  if (left->getDepth() != right->getDepth())
+    return left->getDepth() < right->getDepth();
+
+  assert(left->NodeQueueId && right->NodeQueueId && 
+         "NodeQueueId cannot be zero");
+  return (left->NodeQueueId > right->NodeQueueId);
+}
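+
+// Summary of the tie-breakers above, in the order they are consulted:
+// Sethi-Ullman priority, distance to the closest successor, the scratch
+// register estimate, height, depth, and finally NodeQueueId so that older
+// queue entries win when everything else is equal.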
+
+// Bottom up
+bool bu_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
+  return BURRSort(left, right, SPQ);
+}
+
+// Source order, otherwise bottom up.
+bool src_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
+  unsigned LOrder = SPQ->getNodeOrdering(left);
+  unsigned ROrder = SPQ->getNodeOrdering(right);
+
+  // Prefer the node with the lower non-zero source order number; an order
+  // of zero means the node carries no source ordering information.
+  if ((LOrder || ROrder) && LOrder != ROrder)
+    return LOrder != 0 && (LOrder < ROrder || ROrder == 0);
+
+  return BURRSort(left, right, SPQ);
+}
+
+bool hybrid_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{
+  bool LHigh = SPQ->HighRegPressure(left);
+  bool RHigh = SPQ->HighRegPressure(right);
+  // Avoid causing spills. If register pressure is high, schedule for
+  // register pressure reduction.
+  if (LHigh && !RHigh)
+    return true;
+  else if (!LHigh && RHigh)
+    return false;
+  else if (!LHigh && !RHigh) {
+    // Low register pressure situation, schedule for latency if possible.
+    bool LStall = left->SchedulingPref == Sched::Latency &&
+      SPQ->getCurCycle() < left->getHeight();
+    bool RStall = right->SchedulingPref == Sched::Latency &&
+      SPQ->getCurCycle() < right->getHeight();
+    // If scheduling one of the nodes will cause a pipeline stall, delay it.
+    // If scheduling either one will cause a pipeline stall, sort them
+    // according to their height.
+    // If neither will cause a pipeline stall, try to reduce register pressure.
+    if (LStall) {
+      if (!RStall)
+        return true;
+      if (left->getHeight() != right->getHeight())
+        return left->getHeight() > right->getHeight();
+    } else if (RStall)
+      return false;
+
+    // If either node prefers latency scheduling, sort them by height and
+    // then by latency.
+    if (left->SchedulingPref == Sched::Latency ||
+        right->SchedulingPref == Sched::Latency) {
+      if (left->getHeight() != right->getHeight())
+        return left->getHeight() > right->getHeight();
+      if (left->Latency != right->Latency)
+        return left->Latency > right->Latency;
+    }
+  }
+
+  return BURRSort(left, right, SPQ);
+}
+
+bool ilp_ls_rr_sort::operator()(const SUnit *left,
+                                const SUnit *right) const {
+  bool LHigh = SPQ->HighRegPressure(left);
+  bool RHigh = SPQ->HighRegPressure(right);
+  // Avoid causing spills. If register pressure is high, schedule for
+  // register pressure reduction.
+  if (LHigh && !RHigh)
+    return true;
+  else if (!LHigh && RHigh)
+    return false;
+  else if (!LHigh && !RHigh) {
+    // Low register pressure situation, schedule to maximize instruction level
+    // parallelism.
+    // Prefer the node with more predecessors.
+    if (left->NumPreds > right->NumPreds)
+      return false;
+    else if (left->NumPreds < right->NumPreds)
+      return true;
+  }
+
+  return BURRSort(left, right, SPQ);
+}
+
+template<class SF>
+bool
+RegReductionPriorityQueue<SF>::canClobber(const SUnit *SU, const SUnit *Op) {
+  if (SU->isTwoAddress) {
+    unsigned Opc = SU->getNode()->getMachineOpcode();
+    const TargetInstrDesc &TID = TII->get(Opc);
+    unsigned NumRes = TID.getNumDefs();
+    unsigned NumOps = TID.getNumOperands() - NumRes;
+    for (unsigned i = 0; i != NumOps; ++i) {
+      if (TID.getOperandConstraint(i+NumRes, TOI::TIED_TO) != -1) {
+        SDNode *DU = SU->getNode()->getOperand(i).getNode();
+        if (DU->getNodeId() != -1 &&
+            Op->OrigNode == &(*SUnits)[DU->getNodeId()])
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// hasCopyToRegUse - Return true if SU has a value successor that is a
+/// CopyToReg node.
+static bool hasCopyToRegUse(const SUnit *SU) {
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;
+    const SUnit *SuccSU = I->getSUnit();
+    if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg)
+      return true;
+  }
+  return false;
+}
+
+/// canClobberPhysRegDefs - True if SU would clobber one of SuccSU's
+/// physical register defs.
+static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU,
+                                  const TargetInstrInfo *TII,
+                                  const TargetRegisterInfo *TRI) {
+  SDNode *N = SuccSU->getNode();
+  unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+  const unsigned *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs();
+  assert(ImpDefs && "Caller should check hasPhysRegDefs");
+  for (const SDNode *SUNode = SU->getNode(); SUNode;
+       SUNode = SUNode->getFlaggedNode()) {
+    if (!SUNode->isMachineOpcode())
+      continue;
+    const unsigned *SUImpDefs =
+      TII->get(SUNode->getMachineOpcode()).getImplicitDefs();
+    if (!SUImpDefs)
+      return false;
+    for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) {
+      EVT VT = N->getValueType(i);
+      if (VT == MVT::Flag || VT == MVT::Other)
+        continue;
+      if (!N->hasAnyUseOfValue(i))
+        continue;
+      unsigned Reg = ImpDefs[i - NumDefs];
+      for (;*SUImpDefs; ++SUImpDefs) {
+        unsigned SUReg = *SUImpDefs;
+        if (TRI->regsOverlap(Reg, SUReg))
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// PrescheduleNodesWithMultipleUses - Nodes with multiple uses
+/// are not handled well by the general register pressure reduction
+/// heuristics. When presented with code like this:
+///
+///      N
+///    / |
+///   /  |
+///  U  store
+///  |
+/// ...
+///
+/// the heuristics tend to push the store up, but since the
+/// operand of the store has another use (U), this would increase
+/// the length of that other use (the U->N edge).
+///
+/// This function transforms code like the above to route U's
+/// dependence through the store when possible, like this:
+///
+///      N
+///      ||
+///      ||
+///     store
+///       |
+///       U
+///       |
+///      ...
+///
+/// This results in the store being scheduled immediately
+/// after N, which shortens the U->N live range, reducing
+/// register pressure.
+///
+template<class SF>
+void RegReductionPriorityQueue<SF>::PrescheduleNodesWithMultipleUses() {
+  // Visit all the nodes in topological order, working top-down.
+  for (unsigned i = 0, e = SUnits->size(); i != e; ++i) {
+    SUnit *SU = &(*SUnits)[i];
+    // For now, only look at nodes with no data successors, such as stores.
+    // These are especially important, due to the heuristics in
+    // getNodePriority for nodes with no data successors.
+    if (SU->NumSuccs != 0)
+      continue;
+    // For now, only look at nodes with exactly one data predecessor.
+    if (SU->NumPreds != 1)
+      continue;
+    // Avoid prescheduling copies to virtual registers, which don't behave
+    // like other nodes from the perspective of scheduling heuristics.
+    if (SDNode *N = SU->getNode())
+      if (N->getOpcode() == ISD::CopyToReg &&
+          TargetRegisterInfo::isVirtualRegister
+            (cast<RegisterSDNode>(N->getOperand(1))->getReg()))
+        continue;
+
+    // Locate the single data predecessor.
+    SUnit *PredSU = 0;
+    for (SUnit::const_pred_iterator II = SU->Preds.begin(),
+         EE = SU->Preds.end(); II != EE; ++II)
+      if (!II->isCtrl()) {
+        PredSU = II->getSUnit();
+        break;
+      }
+    assert(PredSU);
+
+    // Don't rewrite edges that carry physregs, because that requires additional
+    // support infrastructure.
+    if (PredSU->hasPhysRegDefs)
+      continue;
+    // Short-circuit the case where SU is PredSU's only data successor.
+    if (PredSU->NumSuccs == 1)
+      continue;
+    // Avoid prescheduling to copies from virtual registers, which don't
+    // behave like other nodes from the perspective of scheduling heuristics.
+    if (SDNode *N = SU->getNode())
+      if (N->getOpcode() == ISD::CopyFromReg &&
+          TargetRegisterInfo::isVirtualRegister
+            (cast<RegisterSDNode>(N->getOperand(1))->getReg()))
+        continue;
+
+    // Perform checks on the successors of PredSU.
+    for (SUnit::const_succ_iterator II = PredSU->Succs.begin(),
+         EE = PredSU->Succs.end(); II != EE; ++II) {
+      SUnit *PredSuccSU = II->getSUnit();
+      if (PredSuccSU == SU) continue;
+      // If PredSU has another successor with no data successors, for
+      // now don't attempt to choose either over the other.
+      if (PredSuccSU->NumSuccs == 0)
+        goto outer_loop_continue;
+      // Don't break physical register dependencies.
+      if (SU->hasPhysRegClobbers && PredSuccSU->hasPhysRegDefs)
+        if (canClobberPhysRegDefs(PredSuccSU, SU, TII, TRI))
+          goto outer_loop_continue;
+      // Don't introduce graph cycles.
+      if (scheduleDAG->IsReachable(SU, PredSuccSU))
+        goto outer_loop_continue;
+    }
+
+    // Ok, the transformation is safe and the heuristics suggest it is
+    // profitable. Update the graph.
+    DEBUG(dbgs() << "    Prescheduling SU #" << SU->NodeNum
+                 << " next to PredSU #" << PredSU->NodeNum
+                 << " to guide scheduling in the presence of multiple uses\n");
+    for (unsigned i = 0; i != PredSU->Succs.size(); ++i) {
+      SDep Edge = PredSU->Succs[i];
+      assert(!Edge.isAssignedRegDep());
+      SUnit *SuccSU = Edge.getSUnit();
+      if (SuccSU != SU) {
+        Edge.setSUnit(PredSU);
+        scheduleDAG->RemovePred(SuccSU, Edge);
+        scheduleDAG->AddPred(SU, Edge);
+        Edge.setSUnit(SU);
+        scheduleDAG->AddPred(SuccSU, Edge);
+        --i;
+      }
+    }
+  outer_loop_continue:;
+  }
+}
+
+/// AddPseudoTwoAddrDeps - If two nodes share an operand and one of them uses
+/// it as a def&use operand, add a pseudo control edge from it to the other
+/// node (if it won't create a cycle) so the two-address one will be scheduled
+/// first (lower in the schedule). If both nodes are two-address, favor the
+/// one that has a CopyToReg use (more likely to be a loop induction update).
+/// If both are two-address, but one is commutable while the other is not
+/// commutable, favor the one that's not commutable.
+template<class SF>
+void RegReductionPriorityQueue<SF>::AddPseudoTwoAddrDeps() {
+  for (unsigned i = 0, e = SUnits->size(); i != e; ++i) {
+    SUnit *SU = &(*SUnits)[i];
+    if (!SU->isTwoAddress)
+      continue;
+
+    SDNode *Node = SU->getNode();
+    if (!Node || !Node->isMachineOpcode() || SU->getNode()->getFlaggedNode())
+      continue;
+
+    unsigned Opc = Node->getMachineOpcode();
+    const TargetInstrDesc &TID = TII->get(Opc);
+    unsigned NumRes = TID.getNumDefs();
+    unsigned NumOps = TID.getNumOperands() - NumRes;
+    for (unsigned j = 0; j != NumOps; ++j) {
+      if (TID.getOperandConstraint(j+NumRes, TOI::TIED_TO) == -1)
+        continue;
+      SDNode *DU = SU->getNode()->getOperand(j).getNode();
+      if (DU->getNodeId() == -1)
+        continue;
+      const SUnit *DUSU = &(*SUnits)[DU->getNodeId()];
+      if (!DUSU) continue;
+      for (SUnit::const_succ_iterator I = DUSU->Succs.begin(),
+           E = DUSU->Succs.end(); I != E; ++I) {
+        if (I->isCtrl()) continue;
+        SUnit *SuccSU = I->getSUnit();
+        if (SuccSU == SU)
+          continue;
+        // Be conservative. Ignore if nodes aren't at roughly the same
+        // depth and height.
+        if (SuccSU->getHeight() < SU->getHeight() &&
+            (SU->getHeight() - SuccSU->getHeight()) > 1)
+          continue;
+        // Skip past COPY_TO_REGCLASS nodes, so that the pseudo edge
+        // constrains whatever is using the copy, instead of the copy
+        // itself. In the case that the copy is coalesced, this
+        // preserves the intent of the pseudo two-address heuristics.
+        while (SuccSU->Succs.size() == 1 &&
+               SuccSU->getNode()->isMachineOpcode() &&
+               SuccSU->getNode()->getMachineOpcode() ==
+                 TargetOpcode::COPY_TO_REGCLASS)
+          SuccSU = SuccSU->Succs.front().getSUnit();
+        // Don't constrain non-instruction nodes.
+        if (!SuccSU->getNode() || !SuccSU->getNode()->isMachineOpcode())
+          continue;
+        // Don't constrain nodes with physical register defs if the
+        // predecessor can clobber them.
+        if (SuccSU->hasPhysRegDefs && SU->hasPhysRegClobbers) {
+          if (canClobberPhysRegDefs(SuccSU, SU, TII, TRI))
+            continue;
+        }
+        // Don't constrain EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG;
+        // these may be coalesced away. We want them close to their uses.
+        unsigned SuccOpc = SuccSU->getNode()->getMachineOpcode();
+        if (SuccOpc == TargetOpcode::EXTRACT_SUBREG ||
+            SuccOpc == TargetOpcode::INSERT_SUBREG ||
+            SuccOpc == TargetOpcode::SUBREG_TO_REG)
+          continue;
+        if ((!canClobber(SuccSU, DUSU) ||
+             (hasCopyToRegUse(SU) && !hasCopyToRegUse(SuccSU)) ||
+             (!SU->isCommutable && SuccSU->isCommutable)) &&
+            !scheduleDAG->IsReachable(SuccSU, SU)) {
+          DEBUG(dbgs() << "    Adding a pseudo-two-addr edge from SU #"
+                       << SU->NodeNum << " to SU #" << SuccSU->NodeNum << "\n");
+          scheduleDAG->AddPred(SU, SDep(SuccSU, SDep::Order, /*Latency=*/0,
+                                        /*Reg=*/0, /*isNormalMemory=*/false,
+                                        /*isMustAlias=*/false,
+                                        /*isArtificial=*/true));
+        }
+      }
+    }
+  }
+}
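+
+// Illustration (target-specific, e.g. x86, where ADD32rr ties its first
+// source to its destination): a two-address node destructively updates its
+// tied operand, so if another reader of that operand were scheduled in a
+// way that overlaps the update, the register allocator would have to insert
+// a copy to preserve the old value.  The artificial ordering edges added
+// above bias the schedule to avoid that copy.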
+
+/// CalculateSethiUllmanNumbers - Calculate Sethi-Ullman numbers of all
+/// scheduling units.
+template<class SF>
+void RegReductionPriorityQueue<SF>::CalculateSethiUllmanNumbers() {
+  SethiUllmanNumbers.assign(SUnits->size(), 0);
+  
+  for (unsigned i = 0, e = SUnits->size(); i != e; ++i)
+    CalcNodeSethiUllmanNumber(&(*SUnits)[i], SethiUllmanNumbers);
+}
+
+/// LimitedSumOfUnscheduledPredsOfSuccs - Compute the sum of the unscheduled
+/// predecessors of the successors of the SUnit SU. Stop when the provided
+/// limit is exceeded.
+static unsigned LimitedSumOfUnscheduledPredsOfSuccs(const SUnit *SU, 
+                                                    unsigned Limit) {
+  unsigned Sum = 0;
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    const SUnit *SuccSU = I->getSUnit();
+    for (SUnit::const_pred_iterator II = SuccSU->Preds.begin(),
+         EE = SuccSU->Preds.end(); II != EE; ++II) {
+      SUnit *PredSU = II->getSUnit();
+      if (!PredSU->isScheduled)
+        if (++Sum > Limit)
+          return Sum;
+    }
+  }
+  return Sum;
+}
+
+
+// Top down
+bool td_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
+  unsigned LPriority = SPQ->getNodePriority(left);
+  unsigned RPriority = SPQ->getNodePriority(right);
+  bool LIsTarget = left->getNode() && left->getNode()->isMachineOpcode();
+  bool RIsTarget = right->getNode() && right->getNode()->isMachineOpcode();
+  bool LIsFloater = LIsTarget && left->NumPreds == 0;
+  bool RIsFloater = RIsTarget && right->NumPreds == 0;
+  unsigned LBonus = (LimitedSumOfUnscheduledPredsOfSuccs(left,1) == 1) ? 2 : 0;
+  unsigned RBonus = (LimitedSumOfUnscheduledPredsOfSuccs(right,1) == 1) ? 2 : 0;
+
+  if (left->NumSuccs == 0 && right->NumSuccs != 0)
+    return false;
+  else if (left->NumSuccs != 0 && right->NumSuccs == 0)
+    return true;
+
+  if (LIsFloater)
+    LBonus -= 2;
+  if (RIsFloater)
+    RBonus -= 2;
+  if (left->NumSuccs == 1)
+    LBonus += 2;
+  if (right->NumSuccs == 1)
+    RBonus += 2;
+
+  if (LPriority+LBonus != RPriority+RBonus)
+    return LPriority+LBonus < RPriority+RBonus;
+
+  if (left->getDepth() != right->getDepth())
+    return left->getDepth() < right->getDepth();
+
+  if (left->NumSuccsLeft != right->NumSuccsLeft)
+    return left->NumSuccsLeft > right->NumSuccsLeft;
+
+  assert(left->NodeQueueId && right->NodeQueueId && 
+         "NodeQueueId cannot be zero");
+  return (left->NodeQueueId > right->NodeQueueId);
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+llvm::ScheduleDAGSDNodes *
+llvm::createBURRListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+  const TargetMachine &TM = IS->TM;
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  
+  BURegReductionPriorityQueue *PQ =
+    new BURegReductionPriorityQueue(*IS->MF, false, TII, TRI, 0);
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, false, PQ);
+  PQ->setScheduleDAG(SD);
+  return SD;  
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createTDRRListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+  const TargetMachine &TM = IS->TM;
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  
+  TDRegReductionPriorityQueue *PQ =
+    new TDRegReductionPriorityQueue(*IS->MF, false, TII, TRI, 0);
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, false, PQ);
+  PQ->setScheduleDAG(SD);
+  return SD;
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createSourceListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+  const TargetMachine &TM = IS->TM;
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  
+  SrcRegReductionPriorityQueue *PQ =
+    new SrcRegReductionPriorityQueue(*IS->MF, false, TII, TRI, 0);
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, false, PQ);
+  PQ->setScheduleDAG(SD);
+  return SD;  
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createHybridListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+  const TargetMachine &TM = IS->TM;
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetLowering *TLI = &IS->getTargetLowering();
+  
+  HybridBURRPriorityQueue *PQ =
+    new HybridBURRPriorityQueue(*IS->MF, true, TII, TRI, TLI);
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, true, PQ);
+  PQ->setScheduleDAG(SD);
+  return SD;  
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createILPListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+  const TargetMachine &TM = IS->TM;
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetLowering *TLI = &IS->getTargetLowering();
+  
+  ILPBURRPriorityQueue *PQ =
+    new ILPBURRPriorityQueue(*IS->MF, true, TII, TRI, TLI);
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, true, PQ);
+  PQ->setScheduleDAG(SD);
+  return SD;  
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
new file mode 100644
index 0000000..f1bf82a
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -0,0 +1,672 @@
+//===--- ScheduleDAGSDNodes.cpp - Implement the ScheduleDAGSDNodes class --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAG class, which is a base class used by
+// scheduling implementation classes.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "SDNodeDbgValue.h"
+#include "ScheduleDAGSDNodes.h"
+#include "InstrEmitter.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(LoadsClustered, "Number of loads clustered together");
+
+ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
+  : ScheduleDAG(mf) {
+}
+
+/// Run - perform scheduling.
+///
+void ScheduleDAGSDNodes::Run(SelectionDAG *dag, MachineBasicBlock *bb,
+                             MachineBasicBlock::iterator insertPos) {
+  DAG = dag;
+  ScheduleDAG::Run(bb, insertPos);
+}
+
+/// NewSUnit - Creates a new SUnit and return a ptr to it.
+///
+SUnit *ScheduleDAGSDNodes::NewSUnit(SDNode *N) {
+#ifndef NDEBUG
+  const SUnit *Addr = 0;
+  if (!SUnits.empty())
+    Addr = &SUnits[0];
+#endif
+  SUnits.push_back(SUnit(N, (unsigned)SUnits.size()));
+  assert((Addr == 0 || Addr == &SUnits[0]) &&
+         "SUnits std::vector reallocated on the fly!");
+  SUnits.back().OrigNode = &SUnits.back();
+  SUnit *SU = &SUnits.back();
+  const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+  if (!N ||
+      (N->isMachineOpcode() &&
+       N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF))
+    SU->SchedulingPref = Sched::None;
+  else
+    SU->SchedulingPref = TLI.getSchedulingPreference(N);
+  return SU;
+}
+
+SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
+  SUnit *SU = NewSUnit(Old->getNode());
+  SU->OrigNode = Old->OrigNode;
+  SU->Latency = Old->Latency;
+  SU->isTwoAddress = Old->isTwoAddress;
+  SU->isCommutable = Old->isCommutable;
+  SU->hasPhysRegDefs = Old->hasPhysRegDefs;
+  SU->hasPhysRegClobbers = Old->hasPhysRegClobbers;
+  SU->SchedulingPref = Old->SchedulingPref;
+  Old->isCloned = true;
+  return SU;
+}
+
+/// CheckForPhysRegDependency - Check if the dependency between def and use of
+/// a specified operand is a physical register dependency. If so, returns the
+/// register and the cost of copying the register.
+static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
+                                      const TargetRegisterInfo *TRI, 
+                                      const TargetInstrInfo *TII,
+                                      unsigned &PhysReg, int &Cost) {
+  if (Op != 2 || User->getOpcode() != ISD::CopyToReg)
+    return;
+
+  unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return;
+
+  unsigned ResNo = User->getOperand(2).getResNo();
+  if (Def->isMachineOpcode()) {
+    const TargetInstrDesc &II = TII->get(Def->getMachineOpcode());
+    if (ResNo >= II.getNumDefs() &&
+        II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg) {
+      PhysReg = Reg;
+      const TargetRegisterClass *RC =
+        TRI->getMinimalPhysRegClass(Reg, Def->getValueType(ResNo));
+      Cost = RC->getCopyCost();
+    }
+  }
+}
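+
+// Informal example: on x86 a divide implicitly defines EAX and EDX; a
+// CopyToReg that moves one of those results into the matching physical
+// register is exactly the pattern recognized above, and the copy cost
+// comes from the register class.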
+
+static void AddFlags(SDNode *N, SDValue Flag, bool AddFlag,
+                     SelectionDAG *DAG) {
+  SmallVector<EVT, 4> VTs;
+  SDNode *FlagDestNode = Flag.getNode();
+
+  // Don't add a flag from a node to itself.
+  if (FlagDestNode == N) return;
+
+  // Don't add a flag to something which already has a flag.
+  if (N->getValueType(N->getNumValues() - 1) == MVT::Flag) return;
+
+  for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
+    VTs.push_back(N->getValueType(I));
+
+  if (AddFlag)
+    VTs.push_back(MVT::Flag);
+
+  SmallVector<SDValue, 4> Ops;
+  for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I)
+    Ops.push_back(N->getOperand(I));
+
+  if (FlagDestNode)
+    Ops.push_back(Flag);
+
+  SDVTList VTList = DAG->getVTList(&VTs[0], VTs.size());
+  MachineSDNode::mmo_iterator Begin = 0, End = 0;
+  MachineSDNode *MN = dyn_cast<MachineSDNode>(N);
+
+  // Store memory references.
+  if (MN) {
+    Begin = MN->memoperands_begin();
+    End = MN->memoperands_end();
+  }
+
+  DAG->MorphNodeTo(N, N->getOpcode(), VTList, &Ops[0], Ops.size());
+
+  // Reset the memory references
+  if (MN)
+    MN->setMemRefs(Begin, End);
+}
+
+/// ClusterNeighboringLoads - Force nearby loads together by "flagging" them.
+/// This function finds loads of the same base and different offsets. If the
+/// offsets are not far apart (target specific), it adds MVT::Flag inputs and
+/// outputs to ensure they are scheduled together and in order. This
+/// optimization may benefit some targets by improving cache locality.
+void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
+  SDNode *Chain = 0;
+  unsigned NumOps = Node->getNumOperands();
+  if (Node->getOperand(NumOps-1).getValueType() == MVT::Other)
+    Chain = Node->getOperand(NumOps-1).getNode();
+  if (!Chain)
+    return;
+
+  // Look for other loads of the same chain. Find loads that are loading from
+  // the same base pointer and different offsets.
+  SmallPtrSet<SDNode*, 16> Visited;
+  SmallVector<int64_t, 4> Offsets;
+  DenseMap<long long, SDNode*> O2SMap;  // Map from offset to SDNode.
+  bool Cluster = false;
+  SDNode *Base = Node;
+  for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end();
+       I != E; ++I) {
+    SDNode *User = *I;
+    if (User == Node || !Visited.insert(User))
+      continue;
+    int64_t Offset1, Offset2;
+    if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) ||
+        Offset1 == Offset2)
+      // FIXME: Should be ok if their addresses are identical. But earlier
+      // optimizations really should have eliminated one of the loads.
+      continue;
+    if (O2SMap.insert(std::make_pair(Offset1, Base)).second)
+      Offsets.push_back(Offset1);
+    O2SMap.insert(std::make_pair(Offset2, User));
+    Offsets.push_back(Offset2);
+    if (Offset2 < Offset1)
+      Base = User;
+    Cluster = true;
+  }
+
+  if (!Cluster)
+    return;
+
+  // Sort them in increasing order.
+  std::sort(Offsets.begin(), Offsets.end());
+
+  // Check if the loads are close enough.
+  SmallVector<SDNode*, 4> Loads;
+  unsigned NumLoads = 0;
+  int64_t BaseOff = Offsets[0];
+  SDNode *BaseLoad = O2SMap[BaseOff];
+  Loads.push_back(BaseLoad);
+  for (unsigned i = 1, e = Offsets.size(); i != e; ++i) {
+    int64_t Offset = Offsets[i];
+    SDNode *Load = O2SMap[Offset];
+    if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset,NumLoads))
+      break; // Stop right here. Ignore loads that are further away.
+    Loads.push_back(Load);
+    ++NumLoads;
+  }
+
+  if (NumLoads == 0)
+    return;
+
+  // Cluster loads by adding MVT::Flag outputs and inputs. This also
+  // ensures they are scheduled in order of increasing addresses.
+  SDNode *Lead = Loads[0];
+  AddFlags(Lead, SDValue(0, 0), true, DAG);
+
+  SDValue InFlag = SDValue(Lead, Lead->getNumValues() - 1);
+  for (unsigned I = 1, E = Loads.size(); I != E; ++I) {
+    bool OutFlag = I < E - 1;
+    SDNode *Load = Loads[I];
+
+    AddFlags(Load, InFlag, OutFlag, DAG);
+
+    if (OutFlag)
+      InFlag = SDValue(Load, Load->getNumValues() - 1);
+
+    ++LoadsClustered;
+  }
+}
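+
+// Resulting shape, sketched on hypothetical offsets 0, 8 and 16: the three
+// loads become  L0 --flag--> L8 --flag--> L16,  with the last load carrying
+// no output flag.  The flag chain glues them into adjacent schedule slots
+// in increasing address order.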
+
+/// ClusterNodes - Cluster certain nodes which should be scheduled together.
+///
+void ScheduleDAGSDNodes::ClusterNodes() {
+  for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(),
+       E = DAG->allnodes_end(); NI != E; ++NI) {
+    SDNode *Node = &*NI;
+    if (!Node || !Node->isMachineOpcode())
+      continue;
+
+    unsigned Opc = Node->getMachineOpcode();
+    const TargetInstrDesc &TID = TII->get(Opc);
+    if (TID.mayLoad())
+      // Cluster loads from "near" addresses into combined SUnits.
+      ClusterNeighboringLoads(Node);
+  }
+}
+
+void ScheduleDAGSDNodes::BuildSchedUnits() {
+  // During scheduling, the NodeId field of SDNode is used to map SDNodes
+  // to their associated SUnits by holding SUnits table indices. A value
+  // of -1 means the SDNode does not yet have an associated SUnit.
+  unsigned NumNodes = 0;
+  for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(),
+       E = DAG->allnodes_end(); NI != E; ++NI) {
+    NI->setNodeId(-1);
+    ++NumNodes;
+  }
+
+  // Reserve entries in the vector for each of the SUnits we are creating.
+  // This ensures that reallocation of the vector won't happen, so SUnit
+  // pointers won't get invalidated.
+  // FIXME: Multiply by 2 because we may clone nodes during scheduling.
+  // This is a temporary workaround.
+  SUnits.reserve(NumNodes * 2);
+  
+  // Add all nodes in depth first order.
+  SmallVector<SDNode*, 64> Worklist;
+  SmallPtrSet<SDNode*, 64> Visited;
+  Worklist.push_back(DAG->getRoot().getNode());
+  Visited.insert(DAG->getRoot().getNode());
+  
+  while (!Worklist.empty()) {
+    SDNode *NI = Worklist.pop_back_val();
+    
+    // Add all operands to the worklist unless they've already been added.
+    for (unsigned i = 0, e = NI->getNumOperands(); i != e; ++i)
+      if (Visited.insert(NI->getOperand(i).getNode()))
+        Worklist.push_back(NI->getOperand(i).getNode());
+  
+    if (isPassiveNode(NI))  // Leaf node, e.g. a TargetImmediate.
+      continue;
+    
+    // If this node has already been processed, stop now.
+    if (NI->getNodeId() != -1) continue;
+    
+    SUnit *NodeSUnit = NewSUnit(NI);
+    
+    // See if anything is flagged to this node; if so, add it to the group
+    // of flagged nodes.  Nodes can have at most one flag input and one flag
+    // output, and flags are required to be the last operand and result of a
+    // node.
+    
+    // Scan up to find flagged preds.
+    SDNode *N = NI;
+    while (N->getNumOperands() &&
+           N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Flag) {
+      N = N->getOperand(N->getNumOperands()-1).getNode();
+      assert(N->getNodeId() == -1 && "Node already inserted!");
+      N->setNodeId(NodeSUnit->NodeNum);
+    }
+    
+    // Scan down to find any flagged succs.
+    N = NI;
+    while (N->getValueType(N->getNumValues()-1) == MVT::Flag) {
+      SDValue FlagVal(N, N->getNumValues()-1);
+      
+      // There are either zero or one users of the Flag result.
+      bool HasFlagUse = false;
+      for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); 
+           UI != E; ++UI)
+        if (FlagVal.isOperandOf(*UI)) {
+          HasFlagUse = true;
+          assert(N->getNodeId() == -1 && "Node already inserted!");
+          N->setNodeId(NodeSUnit->NodeNum);
+          N = *UI;
+          break;
+        }
+      if (!HasFlagUse) break;
+    }
+    
+    // If there are flag operands involved, N is now the bottom-most node
+    // of the sequence of nodes that are flagged together.
+    // Update the SUnit.
+    NodeSUnit->setNode(N);
+    assert(N->getNodeId() == -1 && "Node already inserted!");
+    N->setNodeId(NodeSUnit->NodeNum);
+
+    // Assign the Latency field of NodeSUnit using target-provided information.
+    ComputeLatency(NodeSUnit);
+  }
+}
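+
+// Example of the grouping above (informal): in a flagged sequence such as a
+// call followed by a CopyFromReg consuming its flag, every node in the
+// chain maps to one SUnit, and the SUnit's representative node is the
+// bottom-most member of the chain.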
+
+void ScheduleDAGSDNodes::AddSchedEdges() {
+  const TargetSubtarget &ST = TM.getSubtarget<TargetSubtarget>();
+
+  // Check to see if the scheduler cares about latencies.
+  bool UnitLatencies = ForceUnitLatencies();
+
+  // Pass 2: add the preds, succs, etc.
+  for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
+    SUnit *SU = &SUnits[su];
+    SDNode *MainNode = SU->getNode();
+    
+    if (MainNode->isMachineOpcode()) {
+      unsigned Opc = MainNode->getMachineOpcode();
+      const TargetInstrDesc &TID = TII->get(Opc);
+      for (unsigned i = 0; i != TID.getNumOperands(); ++i) {
+        if (TID.getOperandConstraint(i, TOI::TIED_TO) != -1) {
+          SU->isTwoAddress = true;
+          break;
+        }
+      }
+      if (TID.isCommutable())
+        SU->isCommutable = true;
+    }
+    
+    // Find all predecessors and successors of the group.
+    for (SDNode *N = SU->getNode(); N; N = N->getFlaggedNode()) {
+      if (N->isMachineOpcode() &&
+          TII->get(N->getMachineOpcode()).getImplicitDefs()) {
+        SU->hasPhysRegClobbers = true;
+        unsigned NumUsed = InstrEmitter::CountResults(N);
+        while (NumUsed != 0 && !N->hasAnyUseOfValue(NumUsed - 1))
+          --NumUsed;    // Skip over unused values at the end.
+        if (NumUsed > TII->get(N->getMachineOpcode()).getNumDefs())
+          SU->hasPhysRegDefs = true;
+      }
+      
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+        SDNode *OpN = N->getOperand(i).getNode();
+        if (isPassiveNode(OpN)) continue;   // Not scheduled.
+        SUnit *OpSU = &SUnits[OpN->getNodeId()];
+        assert(OpSU && "Node has no SUnit!");
+        if (OpSU == SU) continue;           // In the same group.
+
+        EVT OpVT = N->getOperand(i).getValueType();
+        assert(OpVT != MVT::Flag && "Flagged nodes should be in same sunit!");
+        bool isChain = OpVT == MVT::Other;
+
+        unsigned PhysReg = 0;
+        int Cost = 1;
+        // Determine if this is a physical register dependency.
+        CheckForPhysRegDependency(OpN, N, i, TRI, TII, PhysReg, Cost);
+        assert((PhysReg == 0 || !isChain) &&
+               "Chain dependence via physreg data?");
+        // FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler
+        // emits a copy from the physical register to a virtual register unless
+        // it requires a cross class copy (cost < 0). That means we are only
+        // treating "expensive to copy" register dependency as physical register
+        // dependency. This may change in the future though.
+        if (Cost >= 0)
+          PhysReg = 0;
+
+        // If this is a control dependency (chain edge), the latency is 1.
+        unsigned OpLatency = isChain ? 1 : OpSU->Latency;
+        SDep dep(OpSU, isChain ? SDep::Order : SDep::Data,
+                 OpLatency, PhysReg);
+        if (!isChain && !UnitLatencies) {
+          ComputeOperandLatency(OpN, N, i, dep);
+          ST.adjustSchedDependency(OpSU, SU, dep);
+        }
+
+        SU->addPred(dep);
+      }
+    }
+  }
+}
+
+/// BuildSchedGraph - Build the SUnit graph from the selection dag we are
+/// given as input.  This SUnit graph is similar to the SelectionDAG, but
+/// excludes nodes that aren't interesting to scheduling, and represents
+/// flagged-together nodes with a single SUnit.
+void ScheduleDAGSDNodes::BuildSchedGraph(AliasAnalysis *AA) {
+  // Cluster certain nodes which should be scheduled together.
+  ClusterNodes();
+  // Populate the SUnits array.
+  BuildSchedUnits();
+  // Compute all the scheduling dependencies between nodes.
+  AddSchedEdges();
+}
+
+void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
+  // Check to see if the scheduler cares about latencies.
+  if (ForceUnitLatencies()) {
+    SU->Latency = 1;
+    return;
+  }
+
+  const InstrItineraryData &InstrItins = TM.getInstrItineraryData();
+  if (InstrItins.isEmpty()) {
+    SU->Latency = 1;
+    return;
+  }
+  
+  // Compute the latency for the node.  We use the sum of the latencies for
+  // all nodes flagged together into this SUnit.
+  SU->Latency = 0;
+  for (SDNode *N = SU->getNode(); N; N = N->getFlaggedNode())
+    if (N->isMachineOpcode()) {
+      SU->Latency += InstrItins.
+        getStageLatency(TII->get(N->getMachineOpcode()).getSchedClass());
+    }
+}
+
+void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use,
+                                               unsigned OpIdx, SDep& dep) const {
+  // Check to see if the scheduler cares about latencies.
+  if (ForceUnitLatencies())
+    return;
+
+  const InstrItineraryData &InstrItins = TM.getInstrItineraryData();
+  if (InstrItins.isEmpty())
+    return;
+  
+  if (dep.getKind() != SDep::Data)
+    return;
+
+  unsigned DefIdx = Use->getOperand(OpIdx).getResNo();
+  if (Def->isMachineOpcode()) {
+    const TargetInstrDesc &II = TII->get(Def->getMachineOpcode());
+    if (DefIdx >= II.getNumDefs())
+      return;
+    int DefCycle = InstrItins.getOperandCycle(II.getSchedClass(), DefIdx);
+    if (DefCycle < 0)
+      return;
+    int UseCycle = 1;
+    if (Use->isMachineOpcode()) {
+      const unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass();
+      UseCycle = InstrItins.getOperandCycle(UseClass, OpIdx);
+    }
+    if (UseCycle >= 0) {
+      int Latency = DefCycle - UseCycle + 1;
+      if (Latency >= 0)
+        dep.setLatency(Latency);
+    }
+  }
+}
+
+void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
+  if (!SU->getNode()) {
+    dbgs() << "PHYS REG COPY\n";
+    return;
+  }
+
+  SU->getNode()->dump(DAG);
+  dbgs() << "\n";
+  SmallVector<SDNode *, 4> FlaggedNodes;
+  for (SDNode *N = SU->getNode()->getFlaggedNode(); N; N = N->getFlaggedNode())
+    FlaggedNodes.push_back(N);
+  while (!FlaggedNodes.empty()) {
+    dbgs() << "    ";
+    FlaggedNodes.back()->dump(DAG);
+    dbgs() << "\n";
+    FlaggedNodes.pop_back();
+  }
+}
+
+namespace {
+  struct OrderSorter {
+    bool operator()(const std::pair<unsigned, MachineInstr*> &A,
+                    const std::pair<unsigned, MachineInstr*> &B) {
+      return A.first < B.first;
+    }
+  };
+}
+
+// ProcessSourceNode - Process nodes with source order numbers. These are added
+// to a vector which EmitSchedule uses to determine how to insert dbg_value
+// instructions in the right order.
+static void ProcessSourceNode(SDNode *N, SelectionDAG *DAG,
+                           InstrEmitter &Emitter,
+                           DenseMap<SDValue, unsigned> &VRBaseMap,
+                    SmallVector<std::pair<unsigned, MachineInstr*>, 32> &Orders,
+                           SmallSet<unsigned, 8> &Seen) {
+  unsigned Order = DAG->GetOrdering(N);
+  if (!Order || !Seen.insert(Order))
+    return;
+
+  MachineBasicBlock *BB = Emitter.getBlock();
+  if (Emitter.getInsertPos() == BB->begin() || BB->back().isPHI()) {
+    // Did not insert any instruction.
+    Orders.push_back(std::make_pair(Order, (MachineInstr*)0));
+    return;
+  }
+
+  Orders.push_back(std::make_pair(Order, prior(Emitter.getInsertPos())));
+  if (!N->getHasDebugValue())
+    return;
+  // Opportunistically insert immediate dbg_value uses, i.e. those whose
+  // source order numbers come right after N's.
+  MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();
+  SmallVector<SDDbgValue*,2> &DVs = DAG->GetDbgValues(N);
+  for (unsigned i = 0, e = DVs.size(); i != e; ++i) {
+    if (DVs[i]->isInvalidated())
+      continue;
+    unsigned DVOrder = DVs[i]->getOrder();
+    if (DVOrder == ++Order) {
+      MachineInstr *DbgMI = Emitter.EmitDbgValue(DVs[i], VRBaseMap);
+      if (DbgMI) {
+        Orders.push_back(std::make_pair(DVOrder, DbgMI));
+        BB->insert(InsertPos, DbgMI);
+      }
+      DVs[i]->setIsInvalidated();
+    }
+  }
+}
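+// For example, given a node emitted with source order 10, the loop above
+// emits its dbg_values whose orders immediately follow it (11, 12, ...)
+// right after the node's instruction; dbg_values with other orders are
+// deferred to EmitSchedule.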
+
+
+/// EmitSchedule - Emit the machine code in scheduled order.
+MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() {
+  InstrEmitter Emitter(BB, InsertPos);
+  DenseMap<SDValue, unsigned> VRBaseMap;
+  DenseMap<SUnit*, unsigned> CopyVRBaseMap;
+  SmallVector<std::pair<unsigned, MachineInstr*>, 32> Orders;
+  SmallSet<unsigned, 8> Seen;
+  bool HasDbg = DAG->hasDebugValues();
+
+  // If this is the first BB, emit byval parameter dbg_values.
+  if (HasDbg && BB->getParent()->begin() == MachineFunction::iterator(BB)) {
+    SDDbgInfo::DbgIterator PDI = DAG->ByvalParmDbgBegin();
+    SDDbgInfo::DbgIterator PDE = DAG->ByvalParmDbgEnd();
+    for (; PDI != PDE; ++PDI) {
+      MachineInstr *DbgMI = Emitter.EmitDbgValue(*PDI, VRBaseMap);
+      if (DbgMI)
+        BB->insert(InsertPos, DbgMI);
+    }
+  }
+
+  for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
+    SUnit *SU = Sequence[i];
+    if (!SU) {
+      // Null SUnit* is a noop.
+      EmitNoop();
+      continue;
+    }
+
+    // For pre-regalloc scheduling, create instructions corresponding to the
+    // SDNode and any flagged SDNodes and append them to the block.
+    if (!SU->getNode()) {
+      // Emit a copy.
+      EmitPhysRegCopy(SU, CopyVRBaseMap);
+      continue;
+    }
+
+    SmallVector<SDNode *, 4> FlaggedNodes;
+    for (SDNode *N = SU->getNode()->getFlaggedNode(); N;
+         N = N->getFlaggedNode())
+      FlaggedNodes.push_back(N);
+    while (!FlaggedNodes.empty()) {
+      SDNode *N = FlaggedNodes.back();
+      Emitter.EmitNode(FlaggedNodes.back(), SU->OrigNode != SU, SU->isCloned,
+                       VRBaseMap);
+      // Remember the source order of the inserted instruction.
+      if (HasDbg)
+        ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen);
+      FlaggedNodes.pop_back();
+    }
+    Emitter.EmitNode(SU->getNode(), SU->OrigNode != SU, SU->isCloned,
+                     VRBaseMap);
+    // Remember the source order of the inserted instruction.
+    if (HasDbg)
+      ProcessSourceNode(SU->getNode(), DAG, Emitter, VRBaseMap, Orders,
+                        Seen);
+  }
+
+  // Insert, in source-order sequence, all the dbg_values that have not
+  // already been inserted.
+  if (HasDbg) {
+    MachineBasicBlock::iterator BBBegin = BB->getFirstNonPHI();
+
+    // Sort the source order instructions and use the order to insert debug
+    // values.
+    std::sort(Orders.begin(), Orders.end(), OrderSorter());
+
+    SDDbgInfo::DbgIterator DI = DAG->DbgBegin();
+    SDDbgInfo::DbgIterator DE = DAG->DbgEnd();
+    // Now emit the rest according to source order.
+    unsigned LastOrder = 0;
+    for (unsigned i = 0, e = Orders.size(); i != e && DI != DE; ++i) {
+      unsigned Order = Orders[i].first;
+      MachineInstr *MI = Orders[i].second;
+      // Insert all SDDbgValue's whose order(s) are before "Order".
+      if (!MI)
+        continue;
+#ifndef NDEBUG
+      unsigned LastDIOrder = 0;
+#endif
+      for (; DI != DE &&
+             (*DI)->getOrder() >= LastOrder && (*DI)->getOrder() < Order; ++DI) {
+#ifndef NDEBUG
+        assert((*DI)->getOrder() >= LastDIOrder &&
+               "SDDbgValue nodes must be in source order!");
+        LastDIOrder = (*DI)->getOrder();
+#endif
+        if ((*DI)->isInvalidated())
+          continue;
+        MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap);
+        if (DbgMI) {
+          if (!LastOrder)
+            // Insert at the start of the BB (after PHIs).
+            BB->insert(BBBegin, DbgMI);
+          else {
+            // Insert at the instruction, which may be in a different
+            // block, if the block was split by a custom inserter.
+            MachineBasicBlock::iterator Pos = MI;
+            MI->getParent()->insert(llvm::next(Pos), DbgMI);
+          }
+        }
+      }
+      LastOrder = Order;
+    }
+    // Add trailing DbgValue's before the terminator. FIXME: May want to add
+    // some of them before one or more conditional branches?
+    while (DI != DE) {
+      MachineBasicBlock *InsertBB = Emitter.getBlock();
+      MachineBasicBlock::iterator Pos =
+        Emitter.getBlock()->getFirstTerminator();
+      if (!(*DI)->isInvalidated()) {
+        MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap);
+        if (DbgMI)
+          InsertBB->insert(Pos, DbgMI);
+      }
+      ++DI;
+    }
+  }
+
+  BB = Emitter.getBlock();
+  InsertPos = Emitter.getInsertPos();
+  return BB;
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
new file mode 100644
index 0000000..842fc8c
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
@@ -0,0 +1,122 @@
+//===---- ScheduleDAGSDNodes.h - SDNode Scheduling --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ScheduleDAGSDNodes class, which implements
+// scheduling for an SDNode-based dependency graph.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCHEDULEDAGSDNODES_H
+#define SCHEDULEDAGSDNODES_H
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+
+namespace llvm {
+  /// ScheduleDAGSDNodes - A ScheduleDAG for scheduling SDNode-based DAGs.
+  /// 
+  /// Edges between SUnits are initially based on edges in the SelectionDAG,
+  /// and additional edges can be added by the schedulers as heuristics.
+  /// SDNodes such as Constants, Registers, and a few others that are not
+  /// interesting to schedulers are not allocated SUnits.
+  ///
+  /// SDNodes with MVT::Flag operands are grouped along with the flagged
+  /// nodes into a single SUnit so that they are scheduled together.
+  ///
+  /// SDNode-based scheduling graphs do not use SDep::Anti or SDep::Output
+  /// edges.  Physical register dependence information is not carried in
+  /// the DAG and must be handled explicitly by schedulers.
+  ///
+  class ScheduleDAGSDNodes : public ScheduleDAG {
+  public:
+    SelectionDAG *DAG;                    // DAG of the current basic block
+
+    explicit ScheduleDAGSDNodes(MachineFunction &mf);
+
+    virtual ~ScheduleDAGSDNodes() {}
+
+    /// Run - perform scheduling.
+    ///
+    void Run(SelectionDAG *dag, MachineBasicBlock *bb,
+             MachineBasicBlock::iterator insertPos);
+
+    /// isPassiveNode - Return true if the node is a non-scheduled leaf.
+    ///
+    static bool isPassiveNode(SDNode *Node) {
+      if (isa<ConstantSDNode>(Node))       return true;
+      if (isa<ConstantFPSDNode>(Node))     return true;
+      if (isa<RegisterSDNode>(Node))       return true;
+      if (isa<GlobalAddressSDNode>(Node))  return true;
+      if (isa<BasicBlockSDNode>(Node))     return true;
+      if (isa<FrameIndexSDNode>(Node))     return true;
+      if (isa<ConstantPoolSDNode>(Node))   return true;
+      if (isa<JumpTableSDNode>(Node))      return true;
+      if (isa<ExternalSymbolSDNode>(Node)) return true;
+      if (isa<BlockAddressSDNode>(Node))   return true;
+      if (Node->getOpcode() == ISD::EntryToken ||
+          isa<MDNodeSDNode>(Node)) return true;
+      return false;
+    }
+
+    /// NewSUnit - Creates a new SUnit and returns a pointer to it.
+    ///
+    SUnit *NewSUnit(SDNode *N);
+
+    /// Clone - Creates a clone of the specified SUnit. It does not copy the
+    /// predecessors / successors info nor the temporary scheduling states.
+    ///
+    SUnit *Clone(SUnit *N);
+    
+    /// BuildSchedGraph - Build the SUnit graph from the selection dag we are
+    /// given as input.  This SUnit graph is similar to the SelectionDAG, but
+    /// excludes nodes that aren't interesting to scheduling, and represents
+    /// flagged-together nodes with a single SUnit.
+    virtual void BuildSchedGraph(AliasAnalysis *AA);
+
+    /// ComputeLatency - Compute node latency.
+    ///
+    virtual void ComputeLatency(SUnit *SU);
+
+    /// ComputeOperandLatency - Override dependence edge latency using
+    /// operand use/def information
+    ///
+    virtual void ComputeOperandLatency(SUnit *Def, SUnit *Use,
+                                       SDep& dep) const { }
+
+    virtual void ComputeOperandLatency(SDNode *Def, SDNode *Use,
+                                       unsigned OpIdx, SDep& dep) const;
+
+    virtual MachineBasicBlock *EmitSchedule();
+
+    /// Schedule - Order nodes according to selected style, filling
+    /// in the Sequence member.
+    ///
+    virtual void Schedule() = 0;
+
+    virtual void dumpNode(const SUnit *SU) const;
+
+    virtual std::string getGraphNodeLabel(const SUnit *SU) const;
+
+    virtual void getCustomGraphFeatures(GraphWriter<ScheduleDAG*> &GW) const;
+
+  private:
+    /// ClusterNeighboringLoads - Cluster loads from "near" addresses into
+    /// combined SUnits.
+    void ClusterNeighboringLoads(SDNode *Node);
+    /// ClusterNodes - Cluster certain nodes which should be scheduled together.
+    ///
+    void ClusterNodes();
+
+    /// BuildSchedUnits, AddSchedEdges - Helper functions for BuildSchedGraph.
+    void BuildSchedUnits();
+    void AddSchedEdges();
+  };
+}
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
new file mode 100644
index 0000000..8b77d7b
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -0,0 +1,6449 @@
+//===-- SelectionDAG.cpp - Implement the SelectionDAG data structures -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAG class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "SDNodeOrdering.h"
+#include "SDNodeDbgValue.h"
+#include "llvm/Constants.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CallingConv.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/System/Mutex.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include <algorithm>
+#include <cmath>
+using namespace llvm;
+
+/// makeVTList - Return an instance of the SDVTList struct initialized with the
+/// specified members.
+static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) {
+  SDVTList Res = {VTs, NumVTs};
+  return Res;
+}
+
+static const fltSemantics *EVTToAPFloatSemantics(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unknown FP format");
+  case MVT::f32:     return &APFloat::IEEEsingle;
+  case MVT::f64:     return &APFloat::IEEEdouble;
+  case MVT::f80:     return &APFloat::x87DoubleExtended;
+  case MVT::f128:    return &APFloat::IEEEquad;
+  case MVT::ppcf128: return &APFloat::PPCDoubleDouble;
+  }
+}
+
+SelectionDAG::DAGUpdateListener::~DAGUpdateListener() {}
+
+//===----------------------------------------------------------------------===//
+//                              ConstantFPSDNode Class
+//===----------------------------------------------------------------------===//
+
+/// isExactlyValue - We don't rely on operator== working on double values, as
+/// it returns true for things that are clearly not equal, like -0.0 and 0.0.
+/// As such, this method can be used to do an exact bit-for-bit comparison of
+/// two floating point values.
+bool ConstantFPSDNode::isExactlyValue(const APFloat& V) const {
+  return getValueAPF().bitwiseIsEqual(V);
+}
+
+bool ConstantFPSDNode::isValueValidForType(EVT VT,
+                                           const APFloat& Val) {
+  assert(VT.isFloatingPoint() && "Can only convert between FP types");
+
+  // PPC long double cannot be converted to any other type.
+  if (VT == MVT::ppcf128 ||
+      &Val.getSemantics() == &APFloat::PPCDoubleDouble)
+    return false;
+
+  // convert modifies in place, so make a copy.
+  APFloat Val2 = APFloat(Val);
+  bool losesInfo;
+  (void) Val2.convert(*EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven,
+                      &losesInfo);
+  return !losesInfo;
+}
+
+//===----------------------------------------------------------------------===//
+//                              ISD Namespace
+//===----------------------------------------------------------------------===//
+
+/// isBuildVectorAllOnes - Return true if the specified node is a
+/// BUILD_VECTOR where all of the elements are ~0 or undef.
+bool ISD::isBuildVectorAllOnes(const SDNode *N) {
+  // Look through a bit convert.
+  if (N->getOpcode() == ISD::BIT_CONVERT)
+    N = N->getOperand(0).getNode();
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
+
+  unsigned i = 0, e = N->getNumOperands();
+
+  // Skip over all of the undef values.
+  while (i != e && N->getOperand(i).getOpcode() == ISD::UNDEF)
+    ++i;
+
+  // Do not accept an all-undef vector.
+  if (i == e) return false;
+
+  // Do not accept build_vectors that aren't all constants or which have non-~0
+  // elements.
+  SDValue NotZero = N->getOperand(i);
+  if (isa<ConstantSDNode>(NotZero)) {
+    if (!cast<ConstantSDNode>(NotZero)->isAllOnesValue())
+      return false;
+  } else if (isa<ConstantFPSDNode>(NotZero)) {
+    if (!cast<ConstantFPSDNode>(NotZero)->getValueAPF().
+                bitcastToAPInt().isAllOnesValue())
+      return false;
+  } else
+    return false;
+
+  // Okay, we have at least one ~0 value, check to see if the rest match or are
+  // undefs.
+  for (++i; i != e; ++i)
+    if (N->getOperand(i) != NotZero &&
+        N->getOperand(i).getOpcode() != ISD::UNDEF)
+      return false;
+  return true;
+}
+
+
+/// isBuildVectorAllZeros - Return true if the specified node is a
+/// BUILD_VECTOR where all of the elements are 0 or undef.
+bool ISD::isBuildVectorAllZeros(const SDNode *N) {
+  // Look through a bit convert.
+  if (N->getOpcode() == ISD::BIT_CONVERT)
+    N = N->getOperand(0).getNode();
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
+
+  unsigned i = 0, e = N->getNumOperands();
+
+  // Skip over all of the undef values.
+  while (i != e && N->getOperand(i).getOpcode() == ISD::UNDEF)
+    ++i;
+
+  // Do not accept an all-undef vector.
+  if (i == e) return false;
+
+  // Do not accept build_vectors that aren't all constants or which have non-0
+  // elements.
+  SDValue Zero = N->getOperand(i);
+  if (isa<ConstantSDNode>(Zero)) {
+    if (!cast<ConstantSDNode>(Zero)->isNullValue())
+      return false;
+  } else if (isa<ConstantFPSDNode>(Zero)) {
+    if (!cast<ConstantFPSDNode>(Zero)->getValueAPF().isPosZero())
+      return false;
+  } else
+    return false;
+
+  // Okay, we have at least one 0 value, check to see if the rest match or are
+  // undefs.
+  for (++i; i != e; ++i)
+    if (N->getOperand(i) != Zero &&
+        N->getOperand(i).getOpcode() != ISD::UNDEF)
+      return false;
+  return true;
+}
+
+/// isScalarToVector - Return true if the specified node is an
+/// ISD::SCALAR_TO_VECTOR node or a BUILD_VECTOR node where only the low
+/// element is not an undef.
+bool ISD::isScalarToVector(const SDNode *N) {
+  if (N->getOpcode() == ISD::SCALAR_TO_VECTOR)
+    return true;
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+  if (N->getOperand(0).getOpcode() == ISD::UNDEF)
+    return false;
+  unsigned NumElems = N->getNumOperands();
+  for (unsigned i = 1; i < NumElems; ++i) {
+    SDValue V = N->getOperand(i);
+    if (V.getOpcode() != ISD::UNDEF)
+      return false;
+  }
+  return true;
+}
+
+/// getSetCCSwappedOperands - Return the operation corresponding to (Y op X)
+/// when given the operation for (X op Y).
+ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
+  // To perform this operation, we just need to swap the L and G bits of the
+  // operation.
+  unsigned OldL = (Operation >> 2) & 1;
+  unsigned OldG = (Operation >> 1) & 1;
+  return ISD::CondCode((Operation & ~6) |  // Keep the N, U, E bits
+                       (OldL << 1) |       // New G bit
+                       (OldG << 2));       // New L bit.
+}
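+// Worked example of the bit swap above: SETLT has the L bit set and the G
+// bit clear, so exchanging the two bits yields SETGT, i.e. (X < Y) is the
+// same test as (Y > X).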
+
+/// getSetCCInverse - Return the operation corresponding to !(X op Y), where
+/// 'op' is a valid SetCC operation.
+ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
+  unsigned Operation = Op;
+  if (isInteger)
+    Operation ^= 7;   // Flip L, G, E bits, but not U.
+  else
+    Operation ^= 15;  // Flip all of the condition bits.
+
+  if (Operation > ISD::SETTRUE2)
+    Operation &= ~8;  // Don't let N and U bits get set.
+
+  return ISD::CondCode(Operation);
+}
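+// For instance, inverting the integer condition SETLT (flip L, G, and E)
+// yields SETGE, while inverting the FP condition SETOLT (flip all four
+// bits) yields SETUGE: !(ordered and less than) == (unordered, greater
+// than, or equal).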
+
+
+/// isSignedOp - For an integer comparison, return 1 if the comparison is a
+/// signed operation and 2 if it is an unsigned comparison.  Return zero if
+/// the operation does not depend on the sign of the input (setne and seteq).
+static int isSignedOp(ISD::CondCode Opcode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Illegal integer setcc operation!");
+  case ISD::SETEQ:
+  case ISD::SETNE: return 0;
+  case ISD::SETLT:
+  case ISD::SETLE:
+  case ISD::SETGT:
+  case ISD::SETGE: return 1;
+  case ISD::SETULT:
+  case ISD::SETULE:
+  case ISD::SETUGT:
+  case ISD::SETUGE: return 2;
+  }
+}
+
+/// getSetCCOrOperation - Return the result of a logical OR between different
+/// comparisons of identical values: ((X op1 Y) | (X op2 Y)).  This function
+/// returns SETCC_INVALID if it is not possible to represent the resultant
+/// comparison.
+ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
+                                       bool isInteger) {
+  if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+    // Cannot fold a signed integer setcc with an unsigned integer setcc.
+    return ISD::SETCC_INVALID;
+
+  unsigned Op = Op1 | Op2;  // Combine all of the condition bits.
+
+  // If the N and U bits get set then the resultant comparison DOES suddenly
+  // care about orderedness, and is true when ordered.
+  if (Op > ISD::SETTRUE2)
+    Op &= ~16;     // Clear the U bit if the N bit is set.
+
+  // Canonicalize illegal integer setcc's.
+  if (isInteger && Op == ISD::SETUNE)  // e.g. SETUGT | SETULT
+    Op = ISD::SETNE;
+
+  return ISD::CondCode(Op);
+}
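+// For example, ORing the integer conditions SETLT (10100) and SETGT (10010)
+// gives 10110 == SETNE: (X < Y) | (X > Y) is exactly X != Y.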
+
+/// getSetCCAndOperation - Return the result of a logical AND between different
+/// comparisons of identical values: ((X op1 Y) & (X op2 Y)).  This function
+/// returns SETCC_INVALID if it is not possible to represent the resultant
+/// comparison.
+ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
+                                        bool isInteger) {
+  if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+    // Cannot fold a signed setcc with an unsigned setcc.
+    return ISD::SETCC_INVALID;
+
+  // Combine all of the condition bits.
+  ISD::CondCode Result = ISD::CondCode(Op1 & Op2);
+
+  // Canonicalize illegal integer setcc's.
+  if (isInteger) {
+    switch (Result) {
+    default: break;
+    case ISD::SETUO : Result = ISD::SETFALSE; break;  // SETUGT & SETULT
+    case ISD::SETOEQ:                                 // SETEQ  & SETU[LG]E
+    case ISD::SETUEQ: Result = ISD::SETEQ   ; break;  // SETUGE & SETULE
+    case ISD::SETOLT: Result = ISD::SETULT  ; break;  // SETULT & SETNE
+    case ISD::SETOGT: Result = ISD::SETUGT  ; break;  // SETUGT & SETNE
+    }
+  }
+
+  return Result;
+}
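+// For example, ANDing SETLE (10101) with SETGE (10011) leaves 10001 ==
+// SETEQ: (X <= Y) & (X >= Y) is exactly X == Y.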
+
+//===----------------------------------------------------------------------===//
+//                           SDNode Profile Support
+//===----------------------------------------------------------------------===//
+
+/// AddNodeIDOpcode - Add the node opcode to the NodeID data.
+///
+static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC)  {
+  ID.AddInteger(OpC);
+}
+
+/// AddNodeIDValueTypes - Value type lists are intern'd so we can represent them
+/// solely with their pointer.
+static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
+  ID.AddPointer(VTList.VTs);
+}
+
+/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
+///
+static void AddNodeIDOperands(FoldingSetNodeID &ID,
+                              const SDValue *Ops, unsigned NumOps) {
+  for (; NumOps; --NumOps, ++Ops) {
+    ID.AddPointer(Ops->getNode());
+    ID.AddInteger(Ops->getResNo());
+  }
+}
+
+/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
+///
+static void AddNodeIDOperands(FoldingSetNodeID &ID,
+                              const SDUse *Ops, unsigned NumOps) {
+  for (; NumOps; --NumOps, ++Ops) {
+    ID.AddPointer(Ops->getNode());
+    ID.AddInteger(Ops->getResNo());
+  }
+}
+
+static void AddNodeIDNode(FoldingSetNodeID &ID,
+                          unsigned short OpC, SDVTList VTList,
+                          const SDValue *OpList, unsigned N) {
+  AddNodeIDOpcode(ID, OpC);
+  AddNodeIDValueTypes(ID, VTList);
+  AddNodeIDOperands(ID, OpList, N);
+}
+
+/// AddNodeIDCustom - If this is an SDNode with special info, add this info to
+/// the NodeID data.
+static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
+  switch (N->getOpcode()) {
+  case ISD::TargetExternalSymbol:
+  case ISD::ExternalSymbol:
+    llvm_unreachable("Should only be used on nodes with operands");
+  default: break;  // Normal nodes don't need extra info.
+  case ISD::TargetConstant:
+  case ISD::Constant:
+    ID.AddPointer(cast<ConstantSDNode>(N)->getConstantIntValue());
+    break;
+  case ISD::TargetConstantFP:
+  case ISD::ConstantFP: {
+    ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue());
+    break;
+  }
+  case ISD::TargetGlobalAddress:
+  case ISD::GlobalAddress:
+  case ISD::TargetGlobalTLSAddress:
+  case ISD::GlobalTLSAddress: {
+    const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+    ID.AddPointer(GA->getGlobal());
+    ID.AddInteger(GA->getOffset());
+    ID.AddInteger(GA->getTargetFlags());
+    break;
+  }
+  case ISD::BasicBlock:
+    ID.AddPointer(cast<BasicBlockSDNode>(N)->getBasicBlock());
+    break;
+  case ISD::Register:
+    ID.AddInteger(cast<RegisterSDNode>(N)->getReg());
+    break;
+
+  case ISD::SRCVALUE:
+    ID.AddPointer(cast<SrcValueSDNode>(N)->getValue());
+    break;
+  case ISD::FrameIndex:
+  case ISD::TargetFrameIndex:
+    ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
+    break;
+  case ISD::JumpTable:
+  case ISD::TargetJumpTable:
+    ID.AddInteger(cast<JumpTableSDNode>(N)->getIndex());
+    ID.AddInteger(cast<JumpTableSDNode>(N)->getTargetFlags());
+    break;
+  case ISD::ConstantPool:
+  case ISD::TargetConstantPool: {
+    const ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
+    ID.AddInteger(CP->getAlignment());
+    ID.AddInteger(CP->getOffset());
+    if (CP->isMachineConstantPoolEntry())
+      CP->getMachineCPVal()->AddSelectionDAGCSEId(ID);
+    else
+      ID.AddPointer(CP->getConstVal());
+    ID.AddInteger(CP->getTargetFlags());
+    break;
+  }
+  case ISD::LOAD: {
+    const LoadSDNode *LD = cast<LoadSDNode>(N);
+    ID.AddInteger(LD->getMemoryVT().getRawBits());
+    ID.AddInteger(LD->getRawSubclassData());
+    break;
+  }
+  case ISD::STORE: {
+    const StoreSDNode *ST = cast<StoreSDNode>(N);
+    ID.AddInteger(ST->getMemoryVT().getRawBits());
+    ID.AddInteger(ST->getRawSubclassData());
+    break;
+  }
+  case ISD::ATOMIC_CMP_SWAP:
+  case ISD::ATOMIC_SWAP:
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX: {
+    const AtomicSDNode *AT = cast<AtomicSDNode>(N);
+    ID.AddInteger(AT->getMemoryVT().getRawBits());
+    ID.AddInteger(AT->getRawSubclassData());
+    break;
+  }
+  case ISD::VECTOR_SHUFFLE: {
+    const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+    for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements();
+         i != e; ++i)
+      ID.AddInteger(SVN->getMaskElt(i));
+    break;
+  }
+  case ISD::TargetBlockAddress:
+  case ISD::BlockAddress: {
+    ID.AddPointer(cast<BlockAddressSDNode>(N)->getBlockAddress());
+    ID.AddInteger(cast<BlockAddressSDNode>(N)->getTargetFlags());
+    break;
+  }
+  } // end switch (N->getOpcode())
+}
+
+/// AddNodeIDNode - Generic routine for adding a node's info to the NodeID
+/// data.
+static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) {
+  AddNodeIDOpcode(ID, N->getOpcode());
+  // Add the return value info.
+  AddNodeIDValueTypes(ID, N->getVTList());
+  // Add the operand info.
+  AddNodeIDOperands(ID, N->op_begin(), N->getNumOperands());
+
+  // Handle SDNode leaves with special info.
+  AddNodeIDCustom(ID, N);
+}
+
+/// encodeMemSDNodeFlags - Generic routine for computing a value for use in
+/// the CSE map that carries volatility, the non-temporal hint, the indexing
+/// mode, and extension/truncation information.
+///
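+/// The packed layout is: bits 1:0 hold the extension/truncation type, bits
+/// 4:2 the indexed addressing mode, bit 5 the volatile flag, and bit 6 the
+/// non-temporal flag.
+///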
+static inline unsigned
+encodeMemSDNodeFlags(int ConvType, ISD::MemIndexedMode AM, bool isVolatile,
+                     bool isNonTemporal) {
+  assert((ConvType & 3) == ConvType &&
+         "ConvType may not require more than 2 bits!");
+  assert((AM & 7) == AM &&
+         "AM may not require more than 3 bits!");
+  return ConvType |
+         (AM << 2) |
+         (isVolatile << 5) |
+         (isNonTemporal << 6);
+}
+
+//===----------------------------------------------------------------------===//
+//                              SelectionDAG Class
+//===----------------------------------------------------------------------===//
+
+/// doNotCSE - Return true if CSE should not be performed for this node.
+static bool doNotCSE(SDNode *N) {
+  if (N->getValueType(0) == MVT::Flag)
+    return true; // Never CSE anything that produces a flag.
+
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::HANDLENODE:
+    return true;   // Never CSE these nodes.
+  }
+
+  // Check that remaining values produced are not flags.
+  for (unsigned i = 1, e = N->getNumValues(); i != e; ++i)
+    if (N->getValueType(i) == MVT::Flag)
+      return true; // Never CSE anything that produces a flag.
+
+  return false;
+}
+
+/// RemoveDeadNodes - This method deletes all unreachable nodes in the
+/// SelectionDAG.
+void SelectionDAG::RemoveDeadNodes() {
+  // Create a dummy node (which is not added to allnodes) that adds a
+  // reference to the root node, preventing it from being deleted.
+  HandleSDNode Dummy(getRoot());
+
+  SmallVector<SDNode*, 128> DeadNodes;
+
+  // Add all obviously-dead nodes to the DeadNodes worklist.
+  for (allnodes_iterator I = allnodes_begin(), E = allnodes_end(); I != E; ++I)
+    if (I->use_empty())
+      DeadNodes.push_back(I);
+
+  RemoveDeadNodes(DeadNodes);
+
+  // If the root changed (e.g. it was a dead load), update the root.
+  setRoot(Dummy.getValue());
+}
+
+/// RemoveDeadNodes - This method deletes the unreachable nodes in the
+/// given list, and any nodes that become unreachable as a result.
+void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes,
+                                   DAGUpdateListener *UpdateListener) {
+
+  // Process the worklist, deleting the nodes and adding their uses to the
+  // worklist.
+  while (!DeadNodes.empty()) {
+    SDNode *N = DeadNodes.pop_back_val();
+
+    if (UpdateListener)
+      UpdateListener->NodeDeleted(N, 0);
+
+    // Take the node out of the appropriate CSE map.
+    RemoveNodeFromCSEMaps(N);
+
+    // Next, brutally remove the operand list.  This is safe to do, as there are
+    // no cycles in the graph.
+    for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
+      SDUse &Use = *I++;
+      SDNode *Operand = Use.getNode();
+      Use.set(SDValue());
+
+      // Now that we removed this operand, see if there are no uses of it left.
+      if (Operand->use_empty())
+        DeadNodes.push_back(Operand);
+    }
+
+    DeallocateNode(N);
+  }
+}
+
+void SelectionDAG::RemoveDeadNode(SDNode *N,
+                                  DAGUpdateListener *UpdateListener) {
+  SmallVector<SDNode*, 16> DeadNodes(1, N);
+  RemoveDeadNodes(DeadNodes, UpdateListener);
+}
+
+void SelectionDAG::DeleteNode(SDNode *N) {
+  // First take this out of the appropriate CSE map.
+  RemoveNodeFromCSEMaps(N);
+
+  // Finally, remove uses due to operands of this node, remove from the
+  // AllNodes list, and delete the node.
+  DeleteNodeNotInCSEMaps(N);
+}
+
+void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) {
+  assert(N != AllNodes.begin() && "Cannot delete the entry node!");
+  assert(N->use_empty() && "Cannot delete a node that is not dead!");
+
+  // Drop all of the operands and decrement used node's use counts.
+  N->DropOperands();
+
+  DeallocateNode(N);
+}
+
+void SelectionDAG::DeallocateNode(SDNode *N) {
+  if (N->OperandsNeedDelete)
+    delete[] N->OperandList;
+
+  // Set the opcode to DELETED_NODE to help catch bugs when node
+  // memory is reallocated.
+  N->NodeType = ISD::DELETED_NODE;
+
+  NodeAllocator.Deallocate(AllNodes.remove(N));
+
+  // Remove the ordering of this node.
+  Ordering->remove(N);
+
+  // If any of the SDDbgValue nodes refer to this SDNode, invalidate them.
+  SmallVector<SDDbgValue*, 2> &DbgVals = DbgInfo->getSDDbgValues(N);
+  for (unsigned i = 0, e = DbgVals.size(); i != e; ++i)
+    DbgVals[i]->setIsInvalidated();
+}
+
+/// RemoveNodeFromCSEMaps - Take the specified node out of the CSE maps that
+/// correspond to it.  This is useful when we're about to delete or repurpose
+/// the node.  We don't want future requests for structurally identical nodes
+/// to return N anymore.
+bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
+  bool Erased = false;
+  switch (N->getOpcode()) {
+  case ISD::EntryToken:
+    llvm_unreachable("EntryToken should not be in CSEMaps!");
+    return false;
+  case ISD::HANDLENODE: return false;  // noop.
+  case ISD::CONDCODE:
+    assert(CondCodeNodes[cast<CondCodeSDNode>(N)->get()] &&
+           "Cond code doesn't exist!");
+    Erased = CondCodeNodes[cast<CondCodeSDNode>(N)->get()] != 0;
+    CondCodeNodes[cast<CondCodeSDNode>(N)->get()] = 0;
+    break;
+  case ISD::ExternalSymbol:
+    Erased = ExternalSymbols.erase(cast<ExternalSymbolSDNode>(N)->getSymbol());
+    break;
+  case ISD::TargetExternalSymbol: {
+    ExternalSymbolSDNode *ESN = cast<ExternalSymbolSDNode>(N);
+    Erased = TargetExternalSymbols.erase(
+               std::pair<std::string,unsigned char>(ESN->getSymbol(),
+                                                    ESN->getTargetFlags()));
+    break;
+  }
+  case ISD::VALUETYPE: {
+    EVT VT = cast<VTSDNode>(N)->getVT();
+    if (VT.isExtended()) {
+      Erased = ExtendedValueTypeNodes.erase(VT);
+    } else {
+      Erased = ValueTypeNodes[VT.getSimpleVT().SimpleTy] != 0;
+      ValueTypeNodes[VT.getSimpleVT().SimpleTy] = 0;
+    }
+    break;
+  }
+  default:
+    // Remove it from the CSE Map.
+    Erased = CSEMap.RemoveNode(N);
+    break;
+  }
+#ifndef NDEBUG
+  // Verify that the node was actually in one of the CSE maps, unless it has a
+  // flag result (which cannot be CSE'd) or is one of the special cases that are
+  // not subject to CSE.
+  if (!Erased && N->getValueType(N->getNumValues()-1) != MVT::Flag &&
+      !N->isMachineOpcode() && !doNotCSE(N)) {
+    N->dump(this);
+    dbgs() << "\n";
+    llvm_unreachable("Node is not in map!");
+  }
+#endif
+  return Erased;
+}
+
+/// AddModifiedNodeToCSEMaps - The specified node has been removed from the CSE
+/// maps and modified in place. Add it back to the CSE maps, unless an identical
+/// node already exists, in which case transfer all its users to the existing
+/// node. This transfer can potentially trigger recursive merging.
+///
+void
+SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N,
+                                       DAGUpdateListener *UpdateListener) {
+  // For node types that aren't CSE'd, just act as if no identical node
+  // already exists.
+  if (!doNotCSE(N)) {
+    SDNode *Existing = CSEMap.GetOrInsertNode(N);
+    if (Existing != N) {
+      // If there was already an existing matching node, use ReplaceAllUsesWith
+      // to replace the dead one with the existing one.  This can cause
+      // recursive merging of other unrelated nodes down the line.
+      ReplaceAllUsesWith(N, Existing, UpdateListener);
+
+      // N is now dead.  Inform the listener if it exists and delete it.
+      if (UpdateListener)
+        UpdateListener->NodeDeleted(N, Existing);
+      DeleteNodeNotInCSEMaps(N);
+      return;
+    }
+  }
+
+  // If the node doesn't already exist, we updated it.  Inform the listener,
+  // if one exists.
+  if (UpdateListener)
+    UpdateListener->NodeUpdated(N);
+}
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified.  If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take.  If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
+                                           void *&InsertPos) {
+  if (doNotCSE(N))
+    return 0;
+
+  SDValue Ops[] = { Op };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, 1);
+  AddNodeIDCustom(ID, N);
+  SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  return Node;
+}
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified.  If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take.  If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
+                                           SDValue Op1, SDValue Op2,
+                                           void *&InsertPos) {
+  if (doNotCSE(N))
+    return 0;
+
+  SDValue Ops[] = { Op1, Op2 };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, 2);
+  AddNodeIDCustom(ID, N);
+  SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  return Node;
+}
+
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified.  If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take.  If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
+                                           const SDValue *Ops,unsigned NumOps,
+                                           void *&InsertPos) {
+  if (doNotCSE(N))
+    return 0;
+
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, NumOps);
+  AddNodeIDCustom(ID, N);
+  SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  return Node;
+}
+
+/// VerifyNode - Sanity check the given node.  Aborts if it is invalid.
+void SelectionDAG::VerifyNode(SDNode *N) {
+  switch (N->getOpcode()) {
+  default:
+    break;
+  case ISD::BUILD_PAIR: {
+    EVT VT = N->getValueType(0);
+    assert(N->getNumValues() == 1 && "Too many results!");
+    assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) &&
+           "Wrong return type!");
+    assert(N->getNumOperands() == 2 && "Wrong number of operands!");
+    assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
+           "Mismatched operand types!");
+    assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() &&
+           "Wrong operand type!");
+    assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
+           "Wrong return type size");
+    break;
+  }
+  case ISD::BUILD_VECTOR: {
+    assert(N->getNumValues() == 1 && "Too many results!");
+    assert(N->getValueType(0).isVector() && "Wrong return type!");
+    assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
+           "Wrong number of operands!");
+    EVT EltVT = N->getValueType(0).getVectorElementType();
+    for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I)
+      assert((I->getValueType() == EltVT ||
+             (EltVT.isInteger() && I->getValueType().isInteger() &&
+              EltVT.bitsLE(I->getValueType()))) &&
+            "Wrong operand type!");
+    break;
+  }
+  }
+}
+
+/// getEVTAlignment - Compute the default alignment value for the
+/// given type.
+///
+unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
+  const Type *Ty = VT == MVT::iPTR ?
+                   PointerType::get(Type::getInt8Ty(*getContext()), 0) :
+                   VT.getTypeForEVT(*getContext());
+
+  return TLI.getTargetData()->getABITypeAlignment(Ty);
+}
+
+// EntryNode could meaningfully have debug info if we can find it...
+SelectionDAG::SelectionDAG(const TargetMachine &tm)
+  : TM(tm), TLI(*tm.getTargetLowering()), TSI(*tm.getSelectionDAGInfo()),
+    EntryNode(ISD::EntryToken, DebugLoc(), getVTList(MVT::Other)),
+    Root(getEntryNode()), Ordering(0) {
+  AllNodes.push_back(&EntryNode);
+  Ordering = new SDNodeOrdering();
+  DbgInfo = new SDDbgInfo();
+}
+
+void SelectionDAG::init(MachineFunction &mf) {
+  MF = &mf;
+  Context = &mf.getFunction()->getContext();
+}
+
+SelectionDAG::~SelectionDAG() {
+  allnodes_clear();
+  delete Ordering;
+  delete DbgInfo;
+}
+
+void SelectionDAG::allnodes_clear() {
+  assert(&*AllNodes.begin() == &EntryNode);
+  AllNodes.remove(AllNodes.begin());
+  while (!AllNodes.empty())
+    DeallocateNode(AllNodes.begin());
+}
+
+void SelectionDAG::clear() {
+  allnodes_clear();
+  OperandAllocator.Reset();
+  CSEMap.clear();
+
+  ExtendedValueTypeNodes.clear();
+  ExternalSymbols.clear();
+  TargetExternalSymbols.clear();
+  std::fill(CondCodeNodes.begin(), CondCodeNodes.end(),
+            static_cast<CondCodeSDNode*>(0));
+  std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(),
+            static_cast<SDNode*>(0));
+
+  EntryNode.UseList = 0;
+  AllNodes.push_back(&EntryNode);
+  Root = getEntryNode();
+  Ordering->clear();
+  DbgInfo->clear();
+}
+
+SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
+  return VT.bitsGT(Op.getValueType()) ?
+    getNode(ISD::SIGN_EXTEND, DL, VT, Op) :
+    getNode(ISD::TRUNCATE, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
+  return VT.bitsGT(Op.getValueType()) ?
+    getNode(ISD::ZERO_EXTEND, DL, VT, Op) :
+    getNode(ISD::TRUNCATE, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, DebugLoc DL, EVT VT) {
+  assert(!VT.isVector() &&
+         "getZeroExtendInReg should use the vector element type instead of "
+         "the vector type!");
+  if (Op.getValueType() == VT) return Op;
+  unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+  APInt Imm = APInt::getLowBitsSet(BitWidth,
+                                   VT.getSizeInBits());
+  return getNode(ISD::AND, DL, Op.getValueType(), Op,
+                 getConstant(Imm, Op.getValueType()));
+}
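+// For example, zero-extending from i16 inside an i32 operand ANDs the
+// operand with the low-bits mask 0xFFFF computed above.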
+
+/// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
+///
+SDValue SelectionDAG::getNOT(DebugLoc DL, SDValue Val, EVT VT) {
+  EVT EltVT = VT.getScalarType();
+  SDValue NegOne =
+    getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT);
+  return getNode(ISD::XOR, DL, VT, Val, NegOne);
+}
+
+SDValue SelectionDAG::getConstant(uint64_t Val, EVT VT, bool isT) {
+  EVT EltVT = VT.getScalarType();
+  assert((EltVT.getSizeInBits() >= 64 ||
+         (uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) &&
+         "getConstant with a uint64_t value that doesn't fit in the type!");
+  return getConstant(APInt(EltVT.getSizeInBits(), Val), VT, isT);
+}
+
+SDValue SelectionDAG::getConstant(const APInt &Val, EVT VT, bool isT) {
+  return getConstant(*ConstantInt::get(*Context, Val), VT, isT);
+}
+
+SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT) {
+  assert(VT.isInteger() && "Cannot create FP integer constant!");
+
+  EVT EltVT = VT.getScalarType();
+  assert(Val.getBitWidth() == EltVT.getSizeInBits() &&
+         "APInt size does not match type size!");
+
+  unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant;
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(EltVT), 0, 0);
+  ID.AddPointer(&Val);
+  void *IP = 0;
+  SDNode *N = NULL;
+  if ((N = CSEMap.FindNodeOrInsertPos(ID, IP)))
+    if (!VT.isVector())
+      return SDValue(N, 0);
+
+  if (!N) {
+    N = new (NodeAllocator) ConstantSDNode(isT, &Val, EltVT);
+    CSEMap.InsertNode(N, IP);
+    AllNodes.push_back(N);
+  }
+
+  SDValue Result(N, 0);
+  if (VT.isVector()) {
+    SmallVector<SDValue, 8> Ops;
+    Ops.assign(VT.getVectorNumElements(), Result);
+    Result = getNode(ISD::BUILD_VECTOR, DebugLoc(), VT, &Ops[0], Ops.size());
+  }
+  return Result;
+}
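+// For vector types the scalar constant is splatted: e.g. requesting the
+// constant 1 with a v4i32 value type yields a BUILD_VECTOR of four
+// identical i32 1 nodes.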
+
+SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, bool isTarget) {
+  return getConstant(Val, TLI.getPointerTy(), isTarget);
+}
+
+
+SDValue SelectionDAG::getConstantFP(const APFloat& V, EVT VT, bool isTarget) {
+  return getConstantFP(*ConstantFP::get(*getContext(), V), VT, isTarget);
+}
+
+SDValue SelectionDAG::getConstantFP(const ConstantFP& V, EVT VT,
+                                    bool isTarget) {
+  assert(VT.isFloatingPoint() && "Cannot create integer FP constant!");
+
+  EVT EltVT = VT.getScalarType();
+
+  // Do the map lookup using the actual bit pattern for the floating point
+  // value, so that we don't have problems with 0.0 comparing equal to -0.0, and
+  // we don't have issues with SNANs.
+  unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP;
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(EltVT), 0, 0);
+  ID.AddPointer(&V);
+  void *IP = 0;
+  SDNode *N = NULL;
+  if ((N = CSEMap.FindNodeOrInsertPos(ID, IP)))
+    if (!VT.isVector())
+      return SDValue(N, 0);
+
+  if (!N) {
+    N = new (NodeAllocator) ConstantFPSDNode(isTarget, &V, EltVT);
+    CSEMap.InsertNode(N, IP);
+    AllNodes.push_back(N);
+  }
+
+  SDValue Result(N, 0);
+  if (VT.isVector()) {
+    SmallVector<SDValue, 8> Ops;
+    Ops.assign(VT.getVectorNumElements(), Result);
+    // FIXME DebugLoc info might be appropriate here
+    Result = getNode(ISD::BUILD_VECTOR, DebugLoc(), VT, &Ops[0], Ops.size());
+  }
+  return Result;
+}
+
+SDValue SelectionDAG::getConstantFP(double Val, EVT VT, bool isTarget) {
+  EVT EltVT = VT.getScalarType();
+  if (EltVT == MVT::f32)
+    return getConstantFP(APFloat((float)Val), VT, isTarget);
+  else if (EltVT == MVT::f64)
+    return getConstantFP(APFloat(Val), VT, isTarget);
+  else if (EltVT == MVT::f80 || EltVT == MVT::f128) {
+    bool ignored;
+    APFloat apf = APFloat(Val);
+    apf.convert(*EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven,
+                &ignored);
+    return getConstantFP(apf, VT, isTarget);
+  } else {
+    assert(0 && "Unsupported type in getConstantFP");
+    return SDValue();
+  }
+}
+
+SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, DebugLoc DL,
+                                       EVT VT, int64_t Offset,
+                                       bool isTargetGA,
+                                       unsigned char TargetFlags) {
+  assert((TargetFlags == 0 || isTargetGA) &&
+         "Cannot set target flags on target-independent globals");
+
+  // Truncate (with sign-extension) the offset value to the pointer size.
+  EVT PTy = TLI.getPointerTy();
+  unsigned BitWidth = PTy.getSizeInBits();
+  if (BitWidth < 64)
+    Offset = (Offset << (64 - BitWidth) >> (64 - BitWidth));
+
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  if (!GVar) {
+    // If GV is an alias then use the aliasee for determining thread-localness.
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false));
+  }
+
+  unsigned Opc;
+  if (GVar && GVar->isThreadLocal())
+    Opc = isTargetGA ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress;
+  else
+    Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress;
+
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+  ID.AddPointer(GV);
+  ID.AddInteger(Offset);
+  ID.AddInteger(TargetFlags);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) GlobalAddressSDNode(Opc, DL, GV, VT,
+                                                      Offset, TargetFlags);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
+  unsigned Opc = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex;
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+  ID.AddInteger(FI);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) FrameIndexSDNode(FI, VT, isTarget);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
+                                   unsigned char TargetFlags) {
+  assert((TargetFlags == 0 || isTarget) &&
+         "Cannot set target flags on target-independent jump tables");
+  unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable;
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+  ID.AddInteger(JTI);
+  ID.AddInteger(TargetFlags);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) JumpTableSDNode(JTI, VT, isTarget,
+                                                  TargetFlags);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
+                                      unsigned Alignment, int Offset,
+                                      bool isTarget,
+                                      unsigned char TargetFlags) {
+  assert((TargetFlags == 0 || isTarget) &&
+         "Cannot set target flags on target-independent globals");
+  if (Alignment == 0)
+    Alignment = TLI.getTargetData()->getPrefTypeAlignment(C->getType());
+  unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+  ID.AddInteger(Alignment);
+  ID.AddInteger(Offset);
+  ID.AddPointer(C);
+  ID.AddInteger(TargetFlags);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset,
+                                                     Alignment, TargetFlags);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+
+SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
+                                      unsigned Alignment, int Offset,
+                                      bool isTarget,
+                                      unsigned char TargetFlags) {
+  assert((TargetFlags == 0 || isTarget) &&
+         "Cannot set target flags on target-independent globals");
+  if (Alignment == 0)
+    Alignment = TLI.getTargetData()->getPrefTypeAlignment(C->getType());
+  unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+  ID.AddInteger(Alignment);
+  ID.AddInteger(Offset);
+  C->AddSelectionDAGCSEId(ID);
+  ID.AddInteger(TargetFlags);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset,
+                                                     Alignment, TargetFlags);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), 0, 0);
+  ID.AddPointer(MBB);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) BasicBlockSDNode(MBB);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getValueType(EVT VT) {
+  if (VT.isSimple() && (unsigned)VT.getSimpleVT().SimpleTy >=
+      ValueTypeNodes.size())
+    ValueTypeNodes.resize(VT.getSimpleVT().SimpleTy+1);
+
+  SDNode *&N = VT.isExtended() ?
+    ExtendedValueTypeNodes[VT] : ValueTypeNodes[VT.getSimpleVT().SimpleTy];
+
+  if (N) return SDValue(N, 0);
+  N = new (NodeAllocator) VTSDNode(VT);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) {
+  SDNode *&N = ExternalSymbols[Sym];
+  if (N) return SDValue(N, 0);
+  N = new (NodeAllocator) ExternalSymbolSDNode(false, Sym, 0, VT);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
+                                              unsigned char TargetFlags) {
+  SDNode *&N =
+    TargetExternalSymbols[std::pair<std::string,unsigned char>(Sym,
+                                                               TargetFlags)];
+  if (N) return SDValue(N, 0);
+  N = new (NodeAllocator) ExternalSymbolSDNode(true, Sym, TargetFlags, VT);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
+  if ((unsigned)Cond >= CondCodeNodes.size())
+    CondCodeNodes.resize(Cond+1);
+
+  if (CondCodeNodes[Cond] == 0) {
+    CondCodeSDNode *N = new (NodeAllocator) CondCodeSDNode(Cond);
+    CondCodeNodes[Cond] = N;
+    AllNodes.push_back(N);
+  }
+
+  return SDValue(CondCodeNodes[Cond], 0);
+}
+
+// commuteShuffle - swaps the values of N1 and N2, and swaps all indices in
+// the shuffle mask M that point at N1 to point at N2, and indices that point
+// at N2 to point at N1.
+static void commuteShuffle(SDValue &N1, SDValue &N2, SmallVectorImpl<int> &M) {
+  std::swap(N1, N2);
+  int NElts = M.size();
+  for (int i = 0; i != NElts; ++i) {
+    if (M[i] >= NElts)
+      M[i] -= NElts;
+    else if (M[i] >= 0)
+      M[i] += NElts;
+  }
+}
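+// For example (illustrative): with NElts == 4, a mask of {0, 5, 2, 7} becomes
+// {4, 1, 6, 3} after commuting -- lanes that selected N1 now select the
+// swapped N2 and vice versa, while negative (undef) entries are untouched.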
+
+SDValue SelectionDAG::getVectorShuffle(EVT VT, DebugLoc dl, SDValue N1,
+                                       SDValue N2, const int *Mask) {
+  assert(N1.getValueType() == N2.getValueType() && "Invalid VECTOR_SHUFFLE");
+  assert(VT.isVector() && N1.getValueType().isVector() &&
+         "Vector Shuffle VTs must be vectors");
+  assert(VT.getVectorElementType() == N1.getValueType().getVectorElementType()
+         && "Vector Shuffle VTs must have same element type");
+
+  // Canonicalize shuffle undef, undef -> undef
+  if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() == ISD::UNDEF)
+    return getUNDEF(VT);
+
+  // Validate that all indices in Mask are within the range of the elements
+  // input to the shuffle.
+  unsigned NElts = VT.getVectorNumElements();
+  SmallVector<int, 8> MaskVec;
+  for (unsigned i = 0; i != NElts; ++i) {
+    assert(Mask[i] < (int)(NElts * 2) && "Index out of range");
+    MaskVec.push_back(Mask[i]);
+  }
+
+  // Canonicalize shuffle v, v -> v, undef
+  if (N1 == N2) {
+    N2 = getUNDEF(VT);
+    for (unsigned i = 0; i != NElts; ++i)
+      if (MaskVec[i] >= (int)NElts) MaskVec[i] -= NElts;
+  }
+
+  // Canonicalize shuffle undef, v -> v, undef.  Commute the shuffle mask.
+  if (N1.getOpcode() == ISD::UNDEF)
+    commuteShuffle(N1, N2, MaskVec);
+
+  // Canonicalize: if all indices select the lhs -> shuffle lhs, undef.
+  // Canonicalize: if all indices select the rhs -> shuffle rhs, undef.
+  bool AllLHS = true, AllRHS = true;
+  bool N2Undef = N2.getOpcode() == ISD::UNDEF;
+  for (unsigned i = 0; i != NElts; ++i) {
+    if (MaskVec[i] >= (int)NElts) {
+      if (N2Undef)
+        MaskVec[i] = -1;
+      else
+        AllLHS = false;
+    } else if (MaskVec[i] >= 0) {
+      AllRHS = false;
+    }
+  }
+  if (AllLHS && AllRHS)
+    return getUNDEF(VT);
+  if (AllLHS && !N2Undef)
+    N2 = getUNDEF(VT);
+  if (AllRHS) {
+    N1 = getUNDEF(VT);
+    commuteShuffle(N1, N2, MaskVec);
+  }
+
+  // If this is an identity shuffle, or if every element shuffles to undef,
+  // return the corresponding input or an undef node directly.
+  bool AllUndef = true;
+  bool Identity = true;
+  for (unsigned i = 0; i != NElts; ++i) {
+    if (MaskVec[i] >= 0 && MaskVec[i] != (int)i) Identity = false;
+    if (MaskVec[i] >= 0) AllUndef = false;
+  }
+  if (Identity && NElts == N1.getValueType().getVectorNumElements())
+    return N1;
+  if (AllUndef)
+    return getUNDEF(VT);
+
+  FoldingSetNodeID ID;
+  SDValue Ops[2] = { N1, N2 };
+  AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops, 2);
+  for (unsigned i = 0; i != NElts; ++i)
+    ID.AddInteger(MaskVec[i]);
+
+  void* IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  // Allocate the mask array for the node out of the BumpPtrAllocator, since
+  // SDNode doesn't have access to it.  This memory will be "leaked" when
+  // the node is deallocated, but recovered when the OperandAllocator is
+  // released.
+  int *MaskAlloc = OperandAllocator.Allocate<int>(NElts);
+  memcpy(MaskAlloc, &MaskVec[0], NElts * sizeof(int));
+
+  ShuffleVectorSDNode *N =
+    new (NodeAllocator) ShuffleVectorSDNode(VT, dl, N1, N2, MaskAlloc);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getConvertRndSat(EVT VT, DebugLoc dl,
+                                       SDValue Val, SDValue DTy,
+                                       SDValue STy, SDValue Rnd, SDValue Sat,
+                                       ISD::CvtCode Code) {
+  // If the src and dest types are the same and the conversion is between
+  // integer types of the same sign or two floats, no conversion is necessary.
+  if (DTy == STy &&
+      (Code == ISD::CVT_UU || Code == ISD::CVT_SS || Code == ISD::CVT_FF))
+    return Val;
+
+  FoldingSetNodeID ID;
+  SDValue Ops[] = { Val, DTy, STy, Rnd, Sat };
+  AddNodeIDNode(ID, ISD::CONVERT_RNDSAT, getVTList(VT), &Ops[0], 5);
+  void* IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  CvtRndSatSDNode *N = new (NodeAllocator) CvtRndSatSDNode(VT, dl, Ops, 5,
+                                                           Code);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::Register, getVTList(VT), 0, 0);
+  ID.AddInteger(RegNo);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) RegisterSDNode(RegNo, VT);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
+                                      bool isTarget,
+                                      unsigned char TargetFlags) {
+  unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress;
+
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+  ID.AddPointer(BA);
+  ID.AddInteger(TargetFlags);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) BlockAddressSDNode(Opc, VT, BA, TargetFlags);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getSrcValue(const Value *V) {
+  assert((!V || V->getType()->isPointerTy()) &&
+         "SrcValue is not a pointer?");
+
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), 0, 0);
+  ID.AddPointer(V);
+
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) SrcValueSDNode(V);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+/// getMDNode - Return an MDNodeSDNode which holds an MDNode.
+SDValue SelectionDAG::getMDNode(const MDNode *MD) {
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::MDNODE_SDNODE, getVTList(MVT::Other), 0, 0);
+  ID.AddPointer(MD);
+  
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+  
+  SDNode *N = new (NodeAllocator) MDNodeSDNode(MD);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+
+/// getShiftAmountOperand - Return the specified value casted to
+/// the target's desired shift amount type.
+SDValue SelectionDAG::getShiftAmountOperand(SDValue Op) {
+  EVT OpTy = Op.getValueType();
+  MVT ShTy = TLI.getShiftAmountTy();
+  if (OpTy == ShTy || OpTy.isVector()) return Op;
+
+  ISD::NodeType Opcode = OpTy.bitsGT(ShTy) ?  ISD::TRUNCATE : ISD::ZERO_EXTEND;
+  return getNode(Opcode, Op.getDebugLoc(), ShTy, Op);
+}
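+// For example (illustrative): with a target shift amount type of i8, an i32
+// shift amount is truncated to i8, while an i1 amount is zero-extended;
+// vector shift amounts are passed through unchanged.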
+
+/// CreateStackTemporary - Create a stack temporary, suitable for holding the
+/// specified value type.
+SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
+  MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
+  unsigned ByteSize = VT.getStoreSize();
+  const Type *Ty = VT.getTypeForEVT(*getContext());
+  unsigned StackAlign =
+      std::max((unsigned)TLI.getTargetData()->getPrefTypeAlignment(Ty),
+               minAlign);
+
+  int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
+  return getFrameIndex(FrameIdx, TLI.getPointerTy());
+}
+
+/// CreateStackTemporary - Create a stack temporary suitable for holding
+/// either of the specified value types.
+SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
+  unsigned Bytes = std::max(VT1.getStoreSizeInBits(),
+                            VT2.getStoreSizeInBits())/8;
+  const Type *Ty1 = VT1.getTypeForEVT(*getContext());
+  const Type *Ty2 = VT2.getTypeForEVT(*getContext());
+  const TargetData *TD = TLI.getTargetData();
+  unsigned Align = std::max(TD->getPrefTypeAlignment(Ty1),
+                            TD->getPrefTypeAlignment(Ty2));
+
+  MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
+  int FrameIdx = FrameInfo->CreateStackObject(Bytes, Align, false);
+  return getFrameIndex(FrameIdx, TLI.getPointerTy());
+}
+
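+// FoldSetCC - For example (illustrative): comparing the integer constants 3
+// and 5 with ISD::SETULT folds to the constant 1, while a comparison that
+// cannot be decided at compile time returns a null SDValue.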
+SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
+                                SDValue N2, ISD::CondCode Cond, DebugLoc dl) {
+  // These setcc operations always fold.
+  switch (Cond) {
+  default: break;
+  case ISD::SETFALSE:
+  case ISD::SETFALSE2: return getConstant(0, VT);
+  case ISD::SETTRUE:
+  case ISD::SETTRUE2:  return getConstant(1, VT);
+
+  case ISD::SETOEQ:
+  case ISD::SETOGT:
+  case ISD::SETOGE:
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETONE:
+  case ISD::SETO:
+  case ISD::SETUO:
+  case ISD::SETUEQ:
+  case ISD::SETUNE:
+    assert(!N1.getValueType().isInteger() && "Illegal setcc for integer!");
+    break;
+  }
+
+  if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode())) {
+    const APInt &C2 = N2C->getAPIntValue();
+    if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
+      const APInt &C1 = N1C->getAPIntValue();
+
+      switch (Cond) {
+      default: llvm_unreachable("Unknown integer setcc!");
+      case ISD::SETEQ:  return getConstant(C1 == C2, VT);
+      case ISD::SETNE:  return getConstant(C1 != C2, VT);
+      case ISD::SETULT: return getConstant(C1.ult(C2), VT);
+      case ISD::SETUGT: return getConstant(C1.ugt(C2), VT);
+      case ISD::SETULE: return getConstant(C1.ule(C2), VT);
+      case ISD::SETUGE: return getConstant(C1.uge(C2), VT);
+      case ISD::SETLT:  return getConstant(C1.slt(C2), VT);
+      case ISD::SETGT:  return getConstant(C1.sgt(C2), VT);
+      case ISD::SETLE:  return getConstant(C1.sle(C2), VT);
+      case ISD::SETGE:  return getConstant(C1.sge(C2), VT);
+      }
+    }
+  }
+  if (ConstantFPSDNode *N1C = dyn_cast<ConstantFPSDNode>(N1.getNode())) {
+    if (ConstantFPSDNode *N2C = dyn_cast<ConstantFPSDNode>(N2.getNode())) {
+      // No compile time operations on this type yet.
+      if (N1C->getValueType(0) == MVT::ppcf128)
+        return SDValue();
+
+      APFloat::cmpResult R = N1C->getValueAPF().compare(N2C->getValueAPF());
+      switch (Cond) {
+      default: break;
+      case ISD::SETEQ:  if (R==APFloat::cmpUnordered)
+                          return getUNDEF(VT);
+                        // fall through
+      case ISD::SETOEQ: return getConstant(R==APFloat::cmpEqual, VT);
+      case ISD::SETNE:  if (R==APFloat::cmpUnordered)
+                          return getUNDEF(VT);
+                        // fall through
+      case ISD::SETONE: return getConstant(R==APFloat::cmpGreaterThan ||
+                                           R==APFloat::cmpLessThan, VT);
+      case ISD::SETLT:  if (R==APFloat::cmpUnordered)
+                          return getUNDEF(VT);
+                        // fall through
+      case ISD::SETOLT: return getConstant(R==APFloat::cmpLessThan, VT);
+      case ISD::SETGT:  if (R==APFloat::cmpUnordered)
+                          return getUNDEF(VT);
+                        // fall through
+      case ISD::SETOGT: return getConstant(R==APFloat::cmpGreaterThan, VT);
+      case ISD::SETLE:  if (R==APFloat::cmpUnordered)
+                          return getUNDEF(VT);
+                        // fall through
+      case ISD::SETOLE: return getConstant(R==APFloat::cmpLessThan ||
+                                           R==APFloat::cmpEqual, VT);
+      case ISD::SETGE:  if (R==APFloat::cmpUnordered)
+                          return getUNDEF(VT);
+                        // fall through
+      case ISD::SETOGE: return getConstant(R==APFloat::cmpGreaterThan ||
+                                           R==APFloat::cmpEqual, VT);
+      case ISD::SETO:   return getConstant(R!=APFloat::cmpUnordered, VT);
+      case ISD::SETUO:  return getConstant(R==APFloat::cmpUnordered, VT);
+      case ISD::SETUEQ: return getConstant(R==APFloat::cmpUnordered ||
+                                           R==APFloat::cmpEqual, VT);
+      case ISD::SETUNE: return getConstant(R!=APFloat::cmpEqual, VT);
+      case ISD::SETULT: return getConstant(R==APFloat::cmpUnordered ||
+                                           R==APFloat::cmpLessThan, VT);
+      case ISD::SETUGT: return getConstant(R==APFloat::cmpGreaterThan ||
+                                           R==APFloat::cmpUnordered, VT);
+      case ISD::SETULE: return getConstant(R!=APFloat::cmpGreaterThan, VT);
+      case ISD::SETUGE: return getConstant(R!=APFloat::cmpLessThan, VT);
+      }
+    } else {
+      // Ensure that the constant occurs on the RHS.
+      return getSetCC(dl, VT, N2, N1, ISD::getSetCCSwappedOperands(Cond));
+    }
+  }
+
+  // Could not fold it.
+  return SDValue();
+}
+
+/// SignBitIsZero - Return true if the sign bit of Op is known to be zero.  We
+/// use this predicate to simplify operations downstream.
+bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
+  // This predicate is not safe for vector operations.
+  if (Op.getValueType().isVector())
+    return false;
+
+  unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+  return MaskedValueIsZero(Op, APInt::getSignBit(BitWidth), Depth);
+}
+
+/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero.  We use
+/// this predicate to simplify operations downstream.  Mask is known to be zero
+/// for bits that V cannot have.
+bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask,
+                                     unsigned Depth) const {
+  APInt KnownZero, KnownOne;
+  ComputeMaskedBits(Op, Mask, KnownZero, KnownOne, Depth);
+  assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+  return (KnownZero & Mask) == Mask;
+}
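+// For example (illustrative): for Op = (and X, 0xFF00) and Mask = 0x00FF the
+// low byte of Op is known zero, so MaskedValueIsZero returns true.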
+
+/// ComputeMaskedBits - Determine which of the bits specified in Mask are
+/// known to be either zero or one and return them in the KnownZero/KnownOne
+/// bitsets.  This code only analyzes bits in Mask, in order to short-circuit
+/// processing.
+void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
+                                     APInt &KnownZero, APInt &KnownOne,
+                                     unsigned Depth) const {
+  unsigned BitWidth = Mask.getBitWidth();
+  assert(BitWidth == Op.getValueType().getScalarType().getSizeInBits() &&
+         "Mask size mismatches value type size!");
+
+  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
+  if (Depth == 6 || Mask == 0)
+    return;  // Limit search depth.
+
+  APInt KnownZero2, KnownOne2;
+
+  switch (Op.getOpcode()) {
+  case ISD::Constant:
+    // We know all of the bits for a constant!
+    KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue() & Mask;
+    KnownZero = ~KnownOne & Mask;
+    return;
+  case ISD::AND:
+    // If either the LHS or the RHS are Zero, the result is zero.
+    ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
+    ComputeMaskedBits(Op.getOperand(0), Mask & ~KnownZero,
+                      KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Output known-1 bits are only known if set in both the LHS & RHS.
+    KnownOne &= KnownOne2;
+    // Output known-0 are known to be clear if zero in either the LHS | RHS.
+    KnownZero |= KnownZero2;
+    return;
+  case ISD::OR:
+    ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
+    ComputeMaskedBits(Op.getOperand(0), Mask & ~KnownOne,
+                      KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Output known-0 bits are only known if clear in both the LHS & RHS.
+    KnownZero &= KnownZero2;
+    // Output known-1 are known to be set if set in either the LHS | RHS.
+    KnownOne |= KnownOne2;
+    return;
+  case ISD::XOR: {
+    ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
+    ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Output known-0 bits are known if clear or set in both the LHS & RHS.
+    APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+    // Output known-1 are known to be set if set in only one of the LHS, RHS.
+    KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
+    KnownZero = KnownZeroOut;
+    return;
+  }
+  case ISD::MUL: {
+    APInt Mask2 = APInt::getAllOnesValue(BitWidth);
+    ComputeMaskedBits(Op.getOperand(1), Mask2, KnownZero, KnownOne, Depth+1);
+    ComputeMaskedBits(Op.getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // If low bits are zero in either operand, output low known-0 bits.
+    // Also compute a conservative estimate for high known-0 bits.
+    // More trickiness is possible, but this is sufficient for the
+    // interesting case of alignment computation.
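+    // For example (illustrative): if the operands have 2 and 3 trailing
+    // zero bits respectively, the product has at least 5 trailing zeros.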
+    KnownOne.clear();
+    unsigned TrailZ = KnownZero.countTrailingOnes() +
+                      KnownZero2.countTrailingOnes();
+    unsigned LeadZ =  std::max(KnownZero.countLeadingOnes() +
+                               KnownZero2.countLeadingOnes(),
+                               BitWidth) - BitWidth;
+
+    TrailZ = std::min(TrailZ, BitWidth);
+    LeadZ = std::min(LeadZ, BitWidth);
+    KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) |
+                APInt::getHighBitsSet(BitWidth, LeadZ);
+    KnownZero &= Mask;
+    return;
+  }
+  case ISD::UDIV: {
+    // For the purposes of computing leading zeros we can conservatively
+    // treat a udiv as a logical right shift by the power of 2 known to
+    // be less than the denominator.
+    APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+    ComputeMaskedBits(Op.getOperand(0),
+                      AllOnes, KnownZero2, KnownOne2, Depth+1);
+    unsigned LeadZ = KnownZero2.countLeadingOnes();
+
+    KnownOne2.clear();
+    KnownZero2.clear();
+    ComputeMaskedBits(Op.getOperand(1),
+                      AllOnes, KnownZero2, KnownOne2, Depth+1);
+    unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros();
+    if (RHSUnknownLeadingOnes != BitWidth)
+      LeadZ = std::min(BitWidth,
+                       LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+
+    KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ) & Mask;
+    return;
+  }
+  case ISD::SELECT:
+    ComputeMaskedBits(Op.getOperand(2), Mask, KnownZero, KnownOne, Depth+1);
+    ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    return;
+  case ISD::SELECT_CC:
+    ComputeMaskedBits(Op.getOperand(3), Mask, KnownZero, KnownOne, Depth+1);
+    ComputeMaskedBits(Op.getOperand(2), Mask, KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    return;
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:
+    if (Op.getResNo() != 1)
+      return;
+    // The boolean result conforms to getBooleanContents.  Fall through.
+  case ISD::SETCC:
+    // If we know the result of a setcc has the top bits zero, use this info.
+    if (TLI.getBooleanContents() == TargetLowering::ZeroOrOneBooleanContent &&
+        BitWidth > 1)
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    return;
+  case ISD::SHL:
+    // (shl X, C1) & C2 == 0   iff   (X & C2 >>u C1) == 0
+    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      unsigned ShAmt = SA->getZExtValue();
+
+      // If the shift count is an invalid immediate, don't do anything.
+      if (ShAmt >= BitWidth)
+        return;
+
+      ComputeMaskedBits(Op.getOperand(0), Mask.lshr(ShAmt),
+                        KnownZero, KnownOne, Depth+1);
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+      KnownZero <<= ShAmt;
+      KnownOne  <<= ShAmt;
+      // low bits known zero.
+      KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt);
+    }
+    return;
+  case ISD::SRL:
+    // (ushr X, C1) & C2 == 0   iff  (-1 >> C1) & C2 == 0
+    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      unsigned ShAmt = SA->getZExtValue();
+
+      // If the shift count is an invalid immediate, don't do anything.
+      if (ShAmt >= BitWidth)
+        return;
+
+      ComputeMaskedBits(Op.getOperand(0), (Mask << ShAmt),
+                        KnownZero, KnownOne, Depth+1);
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+      KnownZero = KnownZero.lshr(ShAmt);
+      KnownOne  = KnownOne.lshr(ShAmt);
+
+      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt) & Mask;
+      KnownZero |= HighBits;  // High bits known zero.
+    }
+    return;
+  case ISD::SRA:
+    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      unsigned ShAmt = SA->getZExtValue();
+
+      // If the shift count is an invalid immediate, don't do anything.
+      if (ShAmt >= BitWidth)
+        return;
+
+      APInt InDemandedMask = (Mask << ShAmt);
+      // If any of the demanded bits are produced by the sign extension, we also
+      // demand the input sign bit.
+      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt) & Mask;
+      if (HighBits.getBoolValue())
+        InDemandedMask |= APInt::getSignBit(BitWidth);
+
+      ComputeMaskedBits(Op.getOperand(0), InDemandedMask, KnownZero, KnownOne,
+                        Depth+1);
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+      KnownZero = KnownZero.lshr(ShAmt);
+      KnownOne  = KnownOne.lshr(ShAmt);
+
+      // Handle the sign bits.
+      APInt SignBit = APInt::getSignBit(BitWidth);
+      SignBit = SignBit.lshr(ShAmt);  // Adjust to where it is now in the mask.
+
+      if (KnownZero.intersects(SignBit)) {
+        KnownZero |= HighBits;  // New bits are known zero.
+      } else if (KnownOne.intersects(SignBit)) {
+        KnownOne  |= HighBits;  // New bits are known one.
+      }
+    }
+    return;
+  case ISD::SIGN_EXTEND_INREG: {
+    EVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+    unsigned EBits = EVT.getScalarType().getSizeInBits();
+
+    // Sign extension.  Compute the demanded bits in the result that are not
+    // present in the input.
+    APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits) & Mask;
+
+    APInt InSignBit = APInt::getSignBit(EBits);
+    APInt InputDemandedBits = Mask & APInt::getLowBitsSet(BitWidth, EBits);
+
+    // If the sign extended bits are demanded, we know that the sign
+    // bit is demanded.
+    InSignBit.zext(BitWidth);
+    if (NewBits.getBoolValue())
+      InputDemandedBits |= InSignBit;
+
+    ComputeMaskedBits(Op.getOperand(0), InputDemandedBits,
+                      KnownZero, KnownOne, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+
+    // If the sign bit of the input is known set or clear, then we know the
+    // top bits of the result.
+    if (KnownZero.intersects(InSignBit)) {         // Input sign bit known clear
+      KnownZero |= NewBits;
+      KnownOne  &= ~NewBits;
+    } else if (KnownOne.intersects(InSignBit)) {   // Input sign bit known set
+      KnownOne  |= NewBits;
+      KnownZero &= ~NewBits;
+    } else {                              // Input sign bit unknown
+      KnownZero &= ~NewBits;
+      KnownOne  &= ~NewBits;
+    }
+    return;
+  }
+  case ISD::CTTZ:
+  case ISD::CTLZ:
+  case ISD::CTPOP: {
+    unsigned LowBits = Log2_32(BitWidth)+1;
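+    // For example (illustrative): with BitWidth == 32 the count is at most
+    // 32, which fits in LowBits == 6 bits, so the top 26 bits are known zero.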
+    KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
+    KnownOne.clear();
+    return;
+  }
+  case ISD::LOAD: {
+    if (ISD::isZEXTLoad(Op.getNode())) {
+      LoadSDNode *LD = cast<LoadSDNode>(Op);
+      EVT VT = LD->getMemoryVT();
+      unsigned MemBits = VT.getScalarType().getSizeInBits();
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits) & Mask;
+    }
+    return;
+  }
+  case ISD::ZERO_EXTEND: {
+    EVT InVT = Op.getOperand(0).getValueType();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
+    APInt NewBits   = APInt::getHighBitsSet(BitWidth, BitWidth - InBits) & Mask;
+    APInt InMask    = Mask;
+    InMask.trunc(InBits);
+    KnownZero.trunc(InBits);
+    KnownOne.trunc(InBits);
+    ComputeMaskedBits(Op.getOperand(0), InMask, KnownZero, KnownOne, Depth+1);
+    KnownZero.zext(BitWidth);
+    KnownOne.zext(BitWidth);
+    KnownZero |= NewBits;
+    return;
+  }
+  case ISD::SIGN_EXTEND: {
+    EVT InVT = Op.getOperand(0).getValueType();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
+    APInt InSignBit = APInt::getSignBit(InBits);
+    APInt NewBits   = APInt::getHighBitsSet(BitWidth, BitWidth - InBits) & Mask;
+    APInt InMask = Mask;
+    InMask.trunc(InBits);
+
+    // If any of the sign extended bits are demanded, we know that the sign
+    // bit is demanded. Temporarily set this bit in the mask for our callee.
+    if (NewBits.getBoolValue())
+      InMask |= InSignBit;
+
+    KnownZero.trunc(InBits);
+    KnownOne.trunc(InBits);
+    ComputeMaskedBits(Op.getOperand(0), InMask, KnownZero, KnownOne, Depth+1);
+
+    // Note if the sign bit is known to be zero or one.
+    bool SignBitKnownZero = KnownZero.isNegative();
+    bool SignBitKnownOne  = KnownOne.isNegative();
+    assert(!(SignBitKnownZero && SignBitKnownOne) &&
+           "Sign bit can't be known to be both zero and one!");
+
+    // If the sign bit wasn't actually demanded by our caller, we don't
+    // want it set in the KnownZero and KnownOne result values. Reset the
+    // mask and reapply it to the result values.
+    InMask = Mask;
+    InMask.trunc(InBits);
+    KnownZero &= InMask;
+    KnownOne  &= InMask;
+
+    KnownZero.zext(BitWidth);
+    KnownOne.zext(BitWidth);
+
+    // If the sign bit is known zero or one, the top bits match.
+    if (SignBitKnownZero)
+      KnownZero |= NewBits;
+    else if (SignBitKnownOne)
+      KnownOne  |= NewBits;
+    return;
+  }
+  case ISD::ANY_EXTEND: {
+    EVT InVT = Op.getOperand(0).getValueType();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
+    APInt InMask = Mask;
+    InMask.trunc(InBits);
+    KnownZero.trunc(InBits);
+    KnownOne.trunc(InBits);
+    ComputeMaskedBits(Op.getOperand(0), InMask, KnownZero, KnownOne, Depth+1);
+    KnownZero.zext(BitWidth);
+    KnownOne.zext(BitWidth);
+    return;
+  }
+  case ISD::TRUNCATE: {
+    EVT InVT = Op.getOperand(0).getValueType();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
+    APInt InMask = Mask;
+    InMask.zext(InBits);
+    KnownZero.zext(InBits);
+    KnownOne.zext(InBits);
+    ComputeMaskedBits(Op.getOperand(0), InMask, KnownZero, KnownOne, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    KnownZero.trunc(BitWidth);
+    KnownOne.trunc(BitWidth);
+    break;
+  }
+  case ISD::AssertZext: {
+    EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+    APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
+    ComputeMaskedBits(Op.getOperand(0), Mask & InMask, KnownZero,
+                      KnownOne, Depth+1);
+    KnownZero |= (~InMask) & Mask;
+    return;
+  }
+  case ISD::FGETSIGN:
+    // All bits are zero except the low bit.
+    KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    return;
+
+  case ISD::SUB: {
+    if (ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Op.getOperand(0))) {
+      // We know that the top bits of C-X are clear if X contains fewer bits
+      // than C (i.e. no wrap-around can happen).  For example, 20-X is
+      // positive if we can prove that X is >= 0 and < 16.
+      if (CLHS->getAPIntValue().isNonNegative()) {
+        unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros();
+        // NLZ can't be BitWidth: since C is non-negative, C+1 is nonzero,
+        // so NLZ <= BitWidth-1 and NLZ+1 is a valid bit count.
+        APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
+        ComputeMaskedBits(Op.getOperand(1), MaskV, KnownZero2, KnownOne2,
+                          Depth+1);
+
+        // If all of the MaskV bits are known to be zero, then we know the
+        // output top bits are zero, because we now know that the output is
+        // from [0-C].
+        if ((KnownZero2 & MaskV) == MaskV) {
+          unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros();
+          // Top bits known zero.
+          KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2) & Mask;
+        }
+      }
+    }
+  }
+  // fall through
+  case ISD::ADD: {
+    // The number of low known-0 bits in the output is the minimum of the
+    // trailing known-0 bits of the two operands, since no carry can be
+    // generated below either.  For example, 8+(X<<3) is known to have the
+    // low 3 bits clear.
+    APInt Mask2 = APInt::getLowBitsSet(BitWidth,
+                                       BitWidth - Mask.countLeadingZeros());
+    ComputeMaskedBits(Op.getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+    unsigned KnownZeroOut = KnownZero2.countTrailingOnes();
+
+    ComputeMaskedBits(Op.getOperand(1), Mask2, KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+    KnownZeroOut = std::min(KnownZeroOut,
+                            KnownZero2.countTrailingOnes());
+
+    KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroOut);
+    return;
+  }
+  case ISD::SREM:
+    if (ConstantSDNode *Rem = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      const APInt &RA = Rem->getAPIntValue().abs();
+      if (RA.isPowerOf2()) {
+        APInt LowBits = RA - 1;
+        APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
+        ComputeMaskedBits(Op.getOperand(0), Mask2,KnownZero2,KnownOne2,Depth+1);
+
+        // The low bits of the first operand are unchanged by the srem.
+        KnownZero = KnownZero2 & LowBits;
+        KnownOne = KnownOne2 & LowBits;
+
+        // If the first operand is non-negative or has all low bits zero, then
+        // the upper bits are all zero.
+        if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits))
+          KnownZero |= ~LowBits;
+
+        // If the first operand is negative and not all low bits are zero, then
+        // the upper bits are all one.
+        if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0))
+          KnownOne |= ~LowBits;
+
+        KnownZero &= Mask;
+        KnownOne &= Mask;
+
+        assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?");
+      }
+    }
+    return;
+  case ISD::UREM: {
+    if (ConstantSDNode *Rem = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      const APInt &RA = Rem->getAPIntValue();
+      if (RA.isPowerOf2()) {
+        APInt LowBits = (RA - 1);
+        APInt Mask2 = LowBits & Mask;
+        KnownZero |= ~LowBits & Mask;
+        ComputeMaskedBits(Op.getOperand(0), Mask2, KnownZero, KnownOne,Depth+1);
+        assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?");
+        break;
+      }
+    }
+
+    // Since the result is less than or equal to either operand, any leading
+    // zero bits in either operand must also exist in the result.
+    APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+    ComputeMaskedBits(Op.getOperand(0), AllOnes, KnownZero, KnownOne,
+                      Depth+1);
+    ComputeMaskedBits(Op.getOperand(1), AllOnes, KnownZero2, KnownOne2,
+                      Depth+1);
+
+    uint32_t Leaders = std::max(KnownZero.countLeadingOnes(),
+                                KnownZero2.countLeadingOnes());
+    KnownOne.clear();
+    KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & Mask;
+    return;
+  }
+  default:
+    // Allow the target to implement this method for its nodes.
+    if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
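+      // Note: the intrinsic case labels below jump into this block, so the
+      // target hook is also consulted for intrinsics without the range check.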
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_W_CHAIN:
+  case ISD::INTRINSIC_VOID:
+      TLI.computeMaskedBitsForTargetNode(Op, Mask, KnownZero, KnownOne, *this,
+                                         Depth);
+    }
+    return;
+  }
+}
+
+/// ComputeNumSignBits - Return the number of times the sign bit of the
+/// register is replicated into the other bits.  We know that at least 1 bit
+/// is always equal to the sign bit (itself), but other cases can give us
+/// information.  For example, immediately after an "SRA X, 2", we know that
+/// the top 3 bits are all equal to each other, so we return 3.
+unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
+  EVT VT = Op.getValueType();
+  assert(VT.isInteger() && "Invalid VT!");
+  unsigned VTBits = VT.getScalarType().getSizeInBits();
+  unsigned Tmp, Tmp2;
+  unsigned FirstAnswer = 1;
+
+  if (Depth == 6)
+    return 1;  // Limit search depth.
+
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::AssertSext:
+    Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
+    return VTBits-Tmp+1;
+  case ISD::AssertZext:
+    Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
+    return VTBits-Tmp;
+
+  case ISD::Constant: {
+    const APInt &Val = cast<ConstantSDNode>(Op)->getAPIntValue();
+    // If negative, return # leading ones.
+    if (Val.isNegative())
+      return Val.countLeadingOnes();
+
+    // Return # leading zeros.
+    return Val.countLeadingZeros();
+  }
+
+  case ISD::SIGN_EXTEND:
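+    // For example (illustrative): sign-extending from i8 to i32 contributes
+    // 24 additional sign bits on top of those already present in the input.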
+    Tmp = VTBits-Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+    return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp;
+
+  case ISD::SIGN_EXTEND_INREG:
+    // Max of the input and what this extends.
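+    // For example (illustrative): sign_extend_inreg from i8 in an i32 value
+    // guarantees at least 32-8+1 == 25 sign bits whatever the input holds.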
+    Tmp =
+      cast<VTSDNode>(Op.getOperand(1))->getVT().getScalarType().getSizeInBits();
+    Tmp = VTBits-Tmp+1;
+
+    Tmp2 = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+    return std::max(Tmp, Tmp2);
+
+  case ISD::SRA:
+    Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+    // SRA X, C   -> adds C sign bits.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      Tmp += C->getZExtValue();
+      if (Tmp > VTBits) Tmp = VTBits;
+    }
+    return Tmp;
+  case ISD::SHL:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      // shl destroys sign bits.
+      Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+      if (C->getZExtValue() >= VTBits ||      // Bad shift.
+          C->getZExtValue() >= Tmp) break;    // Shifted all sign bits out.
+      return Tmp - C->getZExtValue();
+    }
+    break;
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:    // NOT is handled here.
+    // Logical binary ops preserve the number of sign bits at the worst.
+    Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+    if (Tmp != 1) {
+      Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+      FirstAnswer = std::min(Tmp, Tmp2);
+      // We computed what we know about the sign bits as our first
+      // answer. Now proceed to the generic code that uses
+      // ComputeMaskedBits, and pick whichever answer is better.
+    }
+    break;
+
+  case ISD::SELECT:
+    Tmp = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+    if (Tmp == 1) return 1;  // Early out.
+    Tmp2 = ComputeNumSignBits(Op.getOperand(2), Depth+1);
+    return std::min(Tmp, Tmp2);
+
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:
+    if (Op.getResNo() != 1)
+      break;
+    // The boolean result conforms to getBooleanContents.  Fall through.
+  case ISD::SETCC:
+    // If setcc returns 0/-1, all bits are sign bits.
+    if (TLI.getBooleanContents() ==
+        TargetLowering::ZeroOrNegativeOneBooleanContent)
+      return VTBits;
+    break;
+  case ISD::ROTL:
+  case ISD::ROTR:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      unsigned RotAmt = C->getZExtValue() & (VTBits-1);
+
+      // Handle rotate right by N like a rotate left by 32-N.
+      if (Op.getOpcode() == ISD::ROTR)
+        RotAmt = (VTBits-RotAmt) & (VTBits-1);
+
+      // If we aren't rotating out all of the known-in sign bits, return the
+      // number that are left.  This handles rotl(sext(x), 1) for example.
+      Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+      if (Tmp > RotAmt+1) return Tmp-RotAmt;
+    }
+    break;
+  case ISD::ADD:
+    // Add can have at most one carry bit.  Thus we know that the output
+    // is, at worst, one more bit than the inputs.
+    Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+    if (Tmp == 1) return 1;  // Early out.
+
+    // Special case decrementing a value (ADD X, -1):
+    if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+      if (CRHS->isAllOnesValue()) {
+        APInt KnownZero, KnownOne;
+        APInt Mask = APInt::getAllOnesValue(VTBits);
+        ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
+
+        // If the input is known to be 0 or 1, the output is 0/-1, which is all
+        // sign bits set.
+        if ((KnownZero | APInt(VTBits, 1)) == Mask)
+          return VTBits;
+
+        // If we are subtracting one from a positive number, there is no carry
+        // out of the result.
+        if (KnownZero.isNegative())
+          return Tmp;
+      }
+
+    Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+    if (Tmp2 == 1) return 1;
+    return std::min(Tmp, Tmp2)-1;
+
+  case ISD::SUB:
+    Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+    if (Tmp2 == 1) return 1;
+
+    // Handle NEG.
+    if (ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Op.getOperand(0)))
+      if (CLHS->isNullValue()) {
+        APInt KnownZero, KnownOne;
+        APInt Mask = APInt::getAllOnesValue(VTBits);
+        ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
+        // If the input is known to be 0 or 1, the output is 0/-1, which is all
+        // sign bits set.
+        if ((KnownZero | APInt(VTBits, 1)) == Mask)
+          return VTBits;
+
+        // If the input is known to be positive (the sign bit is known clear),
+        // the output of the NEG has the same number of sign bits as the input.
+        if (KnownZero.isNegative())
+          return Tmp2;
+
+        // Otherwise, we treat this like a SUB.
+      }
+
+    // Sub can have at most one carry bit.  Thus we know that the output
+    // is, at worst, one more bit than the inputs.
+    Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+    if (Tmp == 1) return 1;  // Early out.
+    return std::min(Tmp, Tmp2)-1;
+  case ISD::TRUNCATE:
+    // FIXME: it's tricky to do anything useful for this, but it is an important
+    // case for targets like X86.
+    break;
+  }
+
+  // Handle LOADX separately here. EXTLOAD case will fallthrough.
+  if (Op.getOpcode() == ISD::LOAD) {
+    LoadSDNode *LD = cast<LoadSDNode>(Op);
+    unsigned ExtType = LD->getExtensionType();
+    switch (ExtType) {
+    default: break;
+    case ISD::SEXTLOAD:    // '17' bits known
+      Tmp = LD->getMemoryVT().getScalarType().getSizeInBits();
+      return VTBits-Tmp+1;
+    case ISD::ZEXTLOAD:    // '16' bits known
+      Tmp = LD->getMemoryVT().getScalarType().getSizeInBits();
+      return VTBits-Tmp;
+    }
+  }
+
+  // Allow the target to implement this method for its nodes.
+  if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+      Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+      Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+      Op.getOpcode() == ISD::INTRINSIC_VOID) {
+    unsigned NumBits = TLI.ComputeNumSignBitsForTargetNode(Op, Depth);
+    if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits);
+  }
+
+  // Finally, if we can prove that the top bits of the result are 0's or 1's,
+  // use this information.
+  APInt KnownZero, KnownOne;
+  APInt Mask = APInt::getAllOnesValue(VTBits);
+  ComputeMaskedBits(Op, Mask, KnownZero, KnownOne, Depth);
+
+  if (KnownZero.isNegative()) {        // sign bit is 0
+    Mask = KnownZero;
+  } else if (KnownOne.isNegative()) {  // sign bit is 1;
+    Mask = KnownOne;
+  } else {
+    // Nothing known.
+    return FirstAnswer;
+  }
+
+  // Okay, we know that the sign bit in Mask is set.  Use CLZ to determine
+  // the number of identical bits in the top of the input value.
+  Mask = ~Mask;
+  Mask <<= Mask.getBitWidth()-VTBits;
+  // Return # leading zeros.  We use 'min' here in case Val was zero before
+  // shifting.  We don't want to return '64' as for an i32 "0".
+  return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros()));
+}
+
+bool SelectionDAG::isKnownNeverNaN(SDValue Op) const {
+  // If we're told that NaNs won't happen, assume they won't.
+  if (NoNaNsFPMath)
+    return true;
+
+  // If the value is a constant, we can obviously see if it is a NaN or not.
+  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
+    return !C->getValueAPF().isNaN();
+
+  // TODO: Recognize more cases here.
+
+  return false;
+}
+
+bool SelectionDAG::isKnownNeverZero(SDValue Op) const {
+  // If the value is a constant, we can obviously see if it is a zero or not.
+  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
+    return !C->isZero();
+
+  // TODO: Recognize more cases here.
+
+  return false;
+}
+
+bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
+  // Check the obvious case.
+  if (A == B) return true;
+
+  // Check for negative and positive zero, which compare equal here.
+  if (const ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A))
+    if (const ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B))
+      if (CA->isZero() && CB->isZero()) return true;
+
+  // Otherwise they may not be equal.
+  return false;
+}
+
+bool SelectionDAG::isVerifiedDebugInfoDesc(SDValue Op) const {
+  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
+  if (!GA) return false;
+  if (GA->getOffset() != 0) return false;
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(GA->getGlobal());
+  if (!GV) return false;
+  return MF->getMMI().hasDebugInfo();
+}
+
+
+/// getShuffleScalarElt - Returns the scalar element that will make up the ith
+/// element of the result of the vector shuffle.
+SDValue SelectionDAG::getShuffleScalarElt(const ShuffleVectorSDNode *N,
+                                          unsigned i) {
+  EVT VT = N->getValueType(0);
+  if (N->getMaskElt(i) < 0)
+    return getUNDEF(VT.getVectorElementType());
+  unsigned Index = N->getMaskElt(i);
+  unsigned NumElems = VT.getVectorNumElements();
+  SDValue V = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
+  Index %= NumElems;
+
+  if (V.getOpcode() == ISD::BIT_CONVERT) {
+    V = V.getOperand(0);
+    EVT VVT = V.getValueType();
+    if (!VVT.isVector() || VVT.getVectorNumElements() != (unsigned)NumElems)
+      return SDValue();
+  }
+  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    return (Index == 0) ? V.getOperand(0)
+                        : getUNDEF(VT.getVectorElementType());
+  if (V.getOpcode() == ISD::BUILD_VECTOR)
+    return V.getOperand(Index);
+  if (const ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(V))
+    return getShuffleScalarElt(SVN, Index);
+  return SDValue();
+}
+
+
+/// getNode - Gets or creates the specified node.
+///
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT) {
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opcode, getVTList(VT), 0, 0);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) SDNode(Opcode, DL, getVTList(VT));
+  CSEMap.InsertNode(N, IP);
+
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifyNode(N);
+#endif
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
+                              EVT VT, SDValue Operand) {
+  // Constant fold unary operations with an integer constant operand.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand.getNode())) {
+    const APInt &Val = C->getAPIntValue();
+    switch (Opcode) {
+    default: break;
+    case ISD::SIGN_EXTEND:
+      return getConstant(APInt(Val).sextOrTrunc(VT.getSizeInBits()), VT);
+    case ISD::ANY_EXTEND:
+    case ISD::ZERO_EXTEND:
+    case ISD::TRUNCATE:
+      return getConstant(APInt(Val).zextOrTrunc(VT.getSizeInBits()), VT);
+    case ISD::UINT_TO_FP:
+    case ISD::SINT_TO_FP: {
+      const uint64_t zero[] = {0, 0};
+      // No compile time operations on ppcf128.
+      if (VT == MVT::ppcf128) break;
+      APFloat apf = APFloat(APInt(VT.getSizeInBits(), 2, zero));
+      (void)apf.convertFromAPInt(Val,
+                                 Opcode==ISD::SINT_TO_FP,
+                                 APFloat::rmNearestTiesToEven);
+      return getConstantFP(apf, VT);
+    }
+    case ISD::BIT_CONVERT:
+      if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
+        return getConstantFP(Val.bitsToFloat(), VT);
+      else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
+        return getConstantFP(Val.bitsToDouble(), VT);
+      break;
+    case ISD::BSWAP:
+      return getConstant(Val.byteSwap(), VT);
+    case ISD::CTPOP:
+      return getConstant(Val.countPopulation(), VT);
+    case ISD::CTLZ:
+      return getConstant(Val.countLeadingZeros(), VT);
+    case ISD::CTTZ:
+      return getConstant(Val.countTrailingZeros(), VT);
+    }
+  }
+
+  // Constant fold unary operations with a floating point constant operand.
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Operand.getNode())) {
+    APFloat V = C->getValueAPF();    // make copy
+    if (VT != MVT::ppcf128 && Operand.getValueType() != MVT::ppcf128) {
+      switch (Opcode) {
+      case ISD::FNEG:
+        V.changeSign();
+        return getConstantFP(V, VT);
+      case ISD::FABS:
+        V.clearSign();
+        return getConstantFP(V, VT);
+      case ISD::FP_ROUND:
+      case ISD::FP_EXTEND: {
+        bool ignored;
+        // This can return overflow, underflow, or inexact; we don't care.
+        // FIXME need to be more flexible about rounding mode.
+        (void)V.convert(*EVTToAPFloatSemantics(VT),
+                        APFloat::rmNearestTiesToEven, &ignored);
+        return getConstantFP(V, VT);
+      }
+      case ISD::FP_TO_SINT:
+      case ISD::FP_TO_UINT: {
+        integerPart x[2];
+        bool ignored;
+        assert(integerPartWidth >= 64);
+        // FIXME need to be more flexible about rounding mode.
+        APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(),
+                              Opcode==ISD::FP_TO_SINT,
+                              APFloat::rmTowardZero, &ignored);
+        if (s==APFloat::opInvalidOp)     // inexact is OK, in fact usual
+          break;
+        APInt api(VT.getSizeInBits(), 2, x);
+        return getConstant(api, VT);
+      }
+      case ISD::BIT_CONVERT:
+        if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
+          return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), VT);
+        else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
+          return getConstant(V.bitcastToAPInt().getZExtValue(), VT);
+        break;
+      }
+    }
+  }
+
+  unsigned OpOpcode = Operand.getNode()->getOpcode();
+  switch (Opcode) {
+  case ISD::TokenFactor:
+  case ISD::MERGE_VALUES:
+  case ISD::CONCAT_VECTORS:
+    return Operand;         // Factor, merge or concat of one node?  No need.
+  case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node");
+  case ISD::FP_EXTEND:
+    assert(VT.isFloatingPoint() &&
+           Operand.getValueType().isFloatingPoint() && "Invalid FP cast!");
+    if (Operand.getValueType() == VT) return Operand;  // noop conversion.
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
+    if (Operand.getOpcode() == ISD::UNDEF)
+      return getUNDEF(VT);
+    break;
+  case ISD::SIGN_EXTEND:
+    assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+           "Invalid SIGN_EXTEND!");
+    if (Operand.getValueType() == VT) return Operand;   // noop extension
+    assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) &&
+           "Invalid sext node, dst < src!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
+    if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
+      return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+    break;
+  case ISD::ZERO_EXTEND:
+    assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+           "Invalid ZERO_EXTEND!");
+    if (Operand.getValueType() == VT) return Operand;   // noop extension
+    assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) &&
+           "Invalid zext node, dst < src!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
+    if (OpOpcode == ISD::ZERO_EXTEND)   // (zext (zext x)) -> (zext x)
+      return getNode(ISD::ZERO_EXTEND, DL, VT,
+                     Operand.getNode()->getOperand(0));
+    break;
+  case ISD::ANY_EXTEND:
+    assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+           "Invalid ANY_EXTEND!");
+    if (Operand.getValueType() == VT) return Operand;   // noop extension
+    assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) &&
+           "Invalid anyext node, dst < src!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
+
+    if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
+        OpOpcode == ISD::ANY_EXTEND)
+      // (ext (zext x)) -> (zext x)  and  (ext (sext x)) -> (sext x)
+      return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+
+    // (ext (trunc x)) -> x
+    if (OpOpcode == ISD::TRUNCATE) {
+      SDValue OpOp = Operand.getNode()->getOperand(0);
+      if (OpOp.getValueType() == VT)
+        return OpOp;
+    }
+    break;
+  case ISD::TRUNCATE:
+    assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+           "Invalid TRUNCATE!");
+    if (Operand.getValueType() == VT) return Operand;   // noop truncate
+    assert(Operand.getValueType().getScalarType().bitsGT(VT.getScalarType()) &&
+           "Invalid truncate node, src < dst!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
+    if (OpOpcode == ISD::TRUNCATE)
+      return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
+    else if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
+             OpOpcode == ISD::ANY_EXTEND) {
+      // If the source is smaller than the dest, we still need an extend.
+      if (Operand.getNode()->getOperand(0).getValueType().getScalarType()
+            .bitsLT(VT.getScalarType()))
+        return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+      else if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT))
+        return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
+      else
+        return Operand.getNode()->getOperand(0);
+    }
+    break;
+  case ISD::BIT_CONVERT:
+    // Basic sanity checking.
+    assert(VT.getSizeInBits() == Operand.getValueType().getSizeInBits()
+           && "Cannot BIT_CONVERT between types of different sizes!");
+    if (VT == Operand.getValueType()) return Operand;  // noop conversion.
+    if (OpOpcode == ISD::BIT_CONVERT)  // bitconv(bitconv(x)) -> bitconv(x)
+      return getNode(ISD::BIT_CONVERT, DL, VT, Operand.getOperand(0));
+    if (OpOpcode == ISD::UNDEF)
+      return getUNDEF(VT);
+    break;
+  case ISD::SCALAR_TO_VECTOR:
+    assert(VT.isVector() && !Operand.getValueType().isVector() &&
+           (VT.getVectorElementType() == Operand.getValueType() ||
+            (VT.getVectorElementType().isInteger() &&
+             Operand.getValueType().isInteger() &&
+             VT.getVectorElementType().bitsLE(Operand.getValueType()))) &&
+           "Illegal SCALAR_TO_VECTOR node!");
+    if (OpOpcode == ISD::UNDEF)
+      return getUNDEF(VT);
+    // scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined.
+    if (OpOpcode == ISD::EXTRACT_VECTOR_ELT &&
+        isa<ConstantSDNode>(Operand.getOperand(1)) &&
+        Operand.getConstantOperandVal(1) == 0 &&
+        Operand.getOperand(0).getValueType() == VT)
+      return Operand.getOperand(0);
+    break;
+  case ISD::FNEG:
+    // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
+    if (UnsafeFPMath && OpOpcode == ISD::FSUB)
+      return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1),
+                     Operand.getNode()->getOperand(0));
+    if (OpOpcode == ISD::FNEG)  // --X -> X
+      return Operand.getNode()->getOperand(0);
+    break;
+  case ISD::FABS:
+    if (OpOpcode == ISD::FNEG)  // abs(-X) -> abs(X)
+      return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0));
+    break;
+  }
+
+  SDNode *N;
+  SDVTList VTs = getVTList(VT);
+  if (VT != MVT::Flag) { // Don't CSE flag producing nodes
+    FoldingSetNodeID ID;
+    SDValue Ops[1] = { Operand };
+    AddNodeIDNode(ID, Opcode, VTs, Ops, 1);
+    void *IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return SDValue(E, 0);
+
+    N = new (NodeAllocator) UnarySDNode(Opcode, DL, VTs, Operand);
+    CSEMap.InsertNode(N, IP);
+  } else {
+    N = new (NodeAllocator) UnarySDNode(Opcode, DL, VTs, Operand);
+  }
+
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifyNode(N);
+#endif
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode,
+                                             EVT VT,
+                                             ConstantSDNode *Cst1,
+                                             ConstantSDNode *Cst2) {
+  const APInt &C1 = Cst1->getAPIntValue(), &C2 = Cst2->getAPIntValue();
+
+  switch (Opcode) {
+  case ISD::ADD:  return getConstant(C1 + C2, VT);
+  case ISD::SUB:  return getConstant(C1 - C2, VT);
+  case ISD::MUL:  return getConstant(C1 * C2, VT);
+  case ISD::UDIV:
+    if (C2.getBoolValue()) return getConstant(C1.udiv(C2), VT);
+    break;
+  case ISD::UREM:
+    if (C2.getBoolValue()) return getConstant(C1.urem(C2), VT);
+    break;
+  case ISD::SDIV:
+    if (C2.getBoolValue()) return getConstant(C1.sdiv(C2), VT);
+    break;
+  case ISD::SREM:
+    if (C2.getBoolValue()) return getConstant(C1.srem(C2), VT);
+    break;
+  case ISD::AND:  return getConstant(C1 & C2, VT);
+  case ISD::OR:   return getConstant(C1 | C2, VT);
+  case ISD::XOR:  return getConstant(C1 ^ C2, VT);
+  case ISD::SHL:  return getConstant(C1 << C2, VT);
+  case ISD::SRL:  return getConstant(C1.lshr(C2), VT);
+  case ISD::SRA:  return getConstant(C1.ashr(C2), VT);
+  case ISD::ROTL: return getConstant(C1.rotl(C2), VT);
+  case ISD::ROTR: return getConstant(C1.rotr(C2), VT);
+  default: break;
+  }
+
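+  // Division and remainder by zero fall through to here; returning a null
+  // SDValue makes the caller emit the node unfolded.  All other constant
+  // pairs fold above, e.g. ADD of i32 constants 2 and 3 becomes
+  // getConstant(5, MVT::i32).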
+  return SDValue();
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+                              SDValue N1, SDValue N2) {
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
+  switch (Opcode) {
+  default: break;
+  case ISD::TokenFactor:
+    assert(VT == MVT::Other && N1.getValueType() == MVT::Other &&
+           N2.getValueType() == MVT::Other && "Invalid token factor!");
+    // Fold trivial token factors.
+    if (N1.getOpcode() == ISD::EntryToken) return N2;
+    if (N2.getOpcode() == ISD::EntryToken) return N1;
+    if (N1 == N2) return N1;
+    break;
+  case ISD::CONCAT_VECTORS:
+    // A CONCAT_VECTORS whose operands are all BUILD_VECTORs can be
+    // simplified to one big BUILD_VECTOR.
+    if (N1.getOpcode() == ISD::BUILD_VECTOR &&
+        N2.getOpcode() == ISD::BUILD_VECTOR) {
+      SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(),
+                                    N1.getNode()->op_end());
+      Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end());
+      return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size());
+    }
+    break;
+  case ISD::AND:
+    assert(VT.isInteger() && "This operator does not apply to FP types!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           N1.getValueType() == VT && "Binary operator types must match!");
+    // (X & 0) -> 0.  This commonly occurs when legalizing i64 values, so it's
+    // worth handling here.
+    if (N2C && N2C->isNullValue())
+      return N2;
+    if (N2C && N2C->isAllOnesValue())  // X & -1 -> X
+      return N1;
+    break;
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::ADD:
+  case ISD::SUB:
+    assert(VT.isInteger() && "This operator does not apply to FP types!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           N1.getValueType() == VT && "Binary operator types must match!");
+    // (X ^ 0), (X | 0), (X + 0) and (X - 0) all fold to X.  This commonly
+    // occurs when legalizing i64 values, so it's worth handling here.
+    if (N2C && N2C->isNullValue())
+      return N1;
+    break;
+  case ISD::UDIV:
+  case ISD::UREM:
+  case ISD::MULHU:
+  case ISD::MULHS:
+  case ISD::MUL:
+  case ISD::SDIV:
+  case ISD::SREM:
+    assert(VT.isInteger() && "This operator does not apply to FP types!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           N1.getValueType() == VT && "Binary operator types must match!");
+    break;
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FDIV:
+  case ISD::FREM:
+    if (UnsafeFPMath) {
+      if (Opcode == ISD::FADD) {
+        // 0+x --> x
+        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1))
+          if (CFP->getValueAPF().isZero())
+            return N2;
+        // x+0 --> x
+        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N2))
+          if (CFP->getValueAPF().isZero())
+            return N1;
+      } else if (Opcode == ISD::FSUB) {
+        // x-0 --> x
+        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N2))
+          if (CFP->getValueAPF().isZero())
+            return N1;
+      }
+    }
+    assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           N1.getValueType() == VT && "Binary operator types must match!");
+    break;
+  case ISD::FCOPYSIGN:   // N1 and result must match.  N1/N2 need not match.
+    assert(N1.getValueType() == VT &&
+           N1.getValueType().isFloatingPoint() &&
+           N2.getValueType().isFloatingPoint() &&
+           "Invalid FCOPYSIGN!");
+    break;
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::ROTL:
+  case ISD::ROTR:
+    assert(VT == N1.getValueType() &&
+           "Shift operators return type must be the same as their first arg");
+    assert(VT.isInteger() && N2.getValueType().isInteger() &&
+           "Shifts only work on integers");
+
+    // Always fold shifts of i1 values so the code generator doesn't need to
+    // handle them.  Since we know the size of the shift has to be less than the
+    // size of the value, the shift/rotate count is guaranteed to be zero.
+    if (VT == MVT::i1)
+      return N1;
+    if (N2C && N2C->isNullValue())
+      return N1;
+    break;
+  case ISD::FP_ROUND_INREG: {
+    EVT EVT = cast<VTSDNode>(N2)->getVT();
+    assert(VT == N1.getValueType() && "Not an inreg round!");
+    assert(VT.isFloatingPoint() && EVT.isFloatingPoint() &&
+           "Cannot FP_ROUND_INREG integer types");
+    assert(EVT.isVector() == VT.isVector() &&
+           "FP_ROUND_INREG type should be vector iff the operand "
+           "type is vector!");
+    assert((!EVT.isVector() ||
+            EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
+           "Vector element counts must match in FP_ROUND_INREG");
+    assert(EVT.bitsLE(VT) && "Not rounding down!");
+    if (cast<VTSDNode>(N2)->getVT() == VT) return N1;  // Not actually rounding.
+    break;
+  }
+  case ISD::FP_ROUND:
+    assert(VT.isFloatingPoint() &&
+           N1.getValueType().isFloatingPoint() &&
+           VT.bitsLE(N1.getValueType()) &&
+           isa<ConstantSDNode>(N2) && "Invalid FP_ROUND!");
+    if (N1.getValueType() == VT) return N1;  // noop conversion.
+    break;
+  case ISD::AssertSext:
+  case ISD::AssertZext: {
+    EVT EVT = cast<VTSDNode>(N2)->getVT();
+    assert(VT == N1.getValueType() && "Not an inreg extend!");
+    assert(VT.isInteger() && EVT.isInteger() &&
+           "Cannot *_EXTEND_INREG FP types");
+    assert(!EVT.isVector() &&
+           "AssertSExt/AssertZExt type should be the vector element type "
+           "rather than the vector type!");
+    assert(EVT.bitsLE(VT) && "Not extending!");
+    if (VT == EVT) return N1; // noop assertion.
+    break;
+  }
+  case ISD::SIGN_EXTEND_INREG: {
+    EVT EVT = cast<VTSDNode>(N2)->getVT();
+    assert(VT == N1.getValueType() && "Not an inreg extend!");
+    assert(VT.isInteger() && EVT.isInteger() &&
+           "Cannot *_EXTEND_INREG FP types");
+    assert(EVT.isVector() == VT.isVector() &&
+           "SIGN_EXTEND_INREG type should be vector iff the operand "
+           "type is vector!");
+    assert((!EVT.isVector() ||
+            EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
+           "Vector element counts must match in SIGN_EXTEND_INREG");
+    assert(EVT.bitsLE(VT) && "Not extending!");
+    if (EVT == VT) return N1;  // Not actually extending
+
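+    // Constant-fold by shifting the low FromBits up to the top and
+    // arithmetic-shifting back down: e.g. sign_extend_inreg from i8 of the
+    // i16 constant 0x00F0 yields 0xFFF0.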
+    if (N1C) {
+      APInt Val = N1C->getAPIntValue();
+      unsigned FromBits = EVT.getScalarType().getSizeInBits();
+      Val <<= Val.getBitWidth()-FromBits;
+      Val = Val.ashr(Val.getBitWidth()-FromBits);
+      return getConstant(Val, VT);
+    }
+    break;
+  }
+  case ISD::EXTRACT_VECTOR_ELT:
+    // EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF.
+    if (N1.getOpcode() == ISD::UNDEF)
+      return getUNDEF(VT);
+
+    // EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
+    // expanding copies of large vectors from registers.
+    if (N2C &&
+        N1.getOpcode() == ISD::CONCAT_VECTORS &&
+        N1.getNumOperands() > 0) {
+      unsigned Factor =
+        N1.getOperand(0).getValueType().getVectorNumElements();
+      return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+                     N1.getOperand(N2C->getZExtValue() / Factor),
+                     getConstant(N2C->getZExtValue() % Factor,
+                                 N2.getValueType()));
+    }
+
+    // EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is
+    // expanding large vector constants.
+    if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
+      SDValue Elt = N1.getOperand(N2C->getZExtValue());
+      EVT VEltTy = N1.getValueType().getVectorElementType();
+      if (Elt.getValueType() != VEltTy) {
+        // If the vector element type is not legal, the BUILD_VECTOR operands
+        // are promoted and implicitly truncated.  Make that explicit here.
+        Elt = getNode(ISD::TRUNCATE, DL, VEltTy, Elt);
+      }
+      if (VT != VEltTy) {
+        // If the vector element type is not legal, the EXTRACT_VECTOR_ELT
+        // result is implicitly extended.
+        Elt = getNode(ISD::ANY_EXTEND, DL, VT, Elt);
+      }
+      return Elt;
+    }
+
+    // EXTRACT_VECTOR_ELT of INSERT_VECTOR_ELT is often formed when vector
+    // operations are lowered to scalars.
+    if (N1.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+      // If the indices are the same, return the inserted element; otherwise,
+      // if the indices are known to be different, extract the element from
+      // the original vector.
+      SDValue N1Op2 = N1.getOperand(2);
+      ConstantSDNode *N1Op2C = dyn_cast<ConstantSDNode>(N1Op2.getNode());
+
+      if (N1Op2C && N2C) {
+        if (N1Op2C->getZExtValue() == N2C->getZExtValue()) {
+          if (VT == N1.getOperand(1).getValueType())
+            return N1.getOperand(1);
+          else
+            return getSExtOrTrunc(N1.getOperand(1), DL, VT);
+        }
+
+        return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2);
+      }
+    }
+    break;
+  case ISD::EXTRACT_ELEMENT:
+    assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!");
+    assert(!N1.getValueType().isVector() && !VT.isVector() &&
+           (N1.getValueType().isInteger() == VT.isInteger()) &&
+           "Wrong types for EXTRACT_ELEMENT!");
+
+    // EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding
+    // 64-bit integers into 32-bit parts.  Instead of building the extract of
+    // the BUILD_PAIR, only to have legalize rip it apart, just do it now.
+    if (N1.getOpcode() == ISD::BUILD_PAIR)
+      return N1.getOperand(N2C->getZExtValue());
+
+    // EXTRACT_ELEMENT of a constant int is also very common.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+      unsigned ElementSize = VT.getSizeInBits();
+      unsigned Shift = ElementSize * N2C->getZExtValue();
+      APInt ShiftedVal = C->getAPIntValue().lshr(Shift);
+      return getConstant(ShiftedVal.trunc(ElementSize), VT);
+    }
+    break;
+  case ISD::EXTRACT_SUBVECTOR:
+    if (N1.getValueType() == VT) // Trivial extraction.
+      return N1;
+    break;
+  }
+
+  if (N1C) {
+    if (N2C) {
+      SDValue SV = FoldConstantArithmetic(Opcode, VT, N1C, N2C);
+      if (SV.getNode()) return SV;
+    } else {      // Canonicalize constant to RHS if commutative
+      if (isCommutativeBinOp(Opcode)) {
+        std::swap(N1C, N2C);
+        std::swap(N1, N2);
+      }
+    }
+  }
+
+  // Constant fold FP operations.
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1.getNode());
+  ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2.getNode());
+  if (N1CFP) {
+    if (!N2CFP && isCommutativeBinOp(Opcode)) {
+      // Canonicalize constant to RHS if commutative
+      std::swap(N1CFP, N2CFP);
+      std::swap(N1, N2);
+    } else if (N2CFP && VT != MVT::ppcf128) {
+      APFloat V1 = N1CFP->getValueAPF(), V2 = N2CFP->getValueAPF();
+      APFloat::opStatus s;
+      switch (Opcode) {
+      case ISD::FADD:
+        s = V1.add(V2, APFloat::rmNearestTiesToEven);
+        if (s != APFloat::opInvalidOp)
+          return getConstantFP(V1, VT);
+        break;
+      case ISD::FSUB:
+        s = V1.subtract(V2, APFloat::rmNearestTiesToEven);
+        if (s != APFloat::opInvalidOp)
+          return getConstantFP(V1, VT);
+        break;
+      case ISD::FMUL:
+        s = V1.multiply(V2, APFloat::rmNearestTiesToEven);
+        if (s != APFloat::opInvalidOp)
+          return getConstantFP(V1, VT);
+        break;
+      case ISD::FDIV:
+        s = V1.divide(V2, APFloat::rmNearestTiesToEven);
+        if (s != APFloat::opInvalidOp && s != APFloat::opDivByZero)
+          return getConstantFP(V1, VT);
+        break;
+      case ISD::FREM:
+        s = V1.mod(V2, APFloat::rmNearestTiesToEven);
+        if (s != APFloat::opInvalidOp && s != APFloat::opDivByZero)
+          return getConstantFP(V1, VT);
+        break;
+      case ISD::FCOPYSIGN:
+        V1.copySign(V2);
+        return getConstantFP(V1, VT);
+      default: break;
+      }
+    }
+  }
+
+  // Canonicalize an UNDEF to the RHS, even over a constant.
+  if (N1.getOpcode() == ISD::UNDEF) {
+    if (isCommutativeBinOp(Opcode)) {
+      std::swap(N1, N2);
+    } else {
+      switch (Opcode) {
+      case ISD::FP_ROUND_INREG:
+      case ISD::SIGN_EXTEND_INREG:
+      case ISD::SUB:
+      case ISD::FSUB:
+      case ISD::FDIV:
+      case ISD::FREM:
+      case ISD::SRA:
+        return N1;     // fold op(undef, arg2) -> undef
+      case ISD::UDIV:
+      case ISD::SDIV:
+      case ISD::UREM:
+      case ISD::SREM:
+      case ISD::SRL:
+      case ISD::SHL:
+        if (!VT.isVector())
+          return getConstant(0, VT);    // fold op(undef, arg2) -> 0
+        // For vectors, we can't easily build an all zero vector, just return
+        // the LHS.
+        return N2;
+      }
+    }
+  }
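+  // e.g. (sub undef, X) folds to undef above, while (udiv undef, X) folds to
+  // the scalar constant 0 (for vectors, X is returned instead).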
+
+  // Fold a bunch of operators when the RHS is undef.
+  if (N2.getOpcode() == ISD::UNDEF) {
+    switch (Opcode) {
+    case ISD::XOR:
+      if (N1.getOpcode() == ISD::UNDEF)
+        // Handle undef ^ undef -> 0 special case. This is a common
+        // idiom (misuse).
+        return getConstant(0, VT);
+      // fallthrough
+    case ISD::ADD:
+    case ISD::ADDC:
+    case ISD::ADDE:
+    case ISD::SUB:
+    case ISD::UDIV:
+    case ISD::SDIV:
+    case ISD::UREM:
+    case ISD::SREM:
+      return N2;       // fold op(arg1, undef) -> undef
+    case ISD::FADD:
+    case ISD::FSUB:
+    case ISD::FMUL:
+    case ISD::FDIV:
+    case ISD::FREM:
+      if (UnsafeFPMath)
+        return N2;
+      break;
+    case ISD::MUL:
+    case ISD::AND:
+    case ISD::SRL:
+    case ISD::SHL:
+      if (!VT.isVector())
+        return getConstant(0, VT);  // fold op(arg1, undef) -> 0
+      // For vectors, we can't easily build an all zero vector, just return
+      // the LHS.
+      return N1;
+    case ISD::OR:
+      if (!VT.isVector())
+        return getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
+      // For vectors, we can't easily build an all one vector, just return
+      // the LHS.
+      return N1;
+    case ISD::SRA:
+      return N1;
+    }
+  }
+
+  // Memoize this node if possible.
+  SDNode *N;
+  SDVTList VTs = getVTList(VT);
+  if (VT != MVT::Flag) {
+    SDValue Ops[] = { N1, N2 };
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opcode, VTs, Ops, 2);
+    void *IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return SDValue(E, 0);
+
+    N = new (NodeAllocator) BinarySDNode(Opcode, DL, VTs, N1, N2);
+    CSEMap.InsertNode(N, IP);
+  } else {
+    N = new (NodeAllocator) BinarySDNode(Opcode, DL, VTs, N1, N2);
+  }
+
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifyNode(N);
+#endif
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+                              SDValue N1, SDValue N2, SDValue N3) {
+  // Perform various simplifications.
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+  switch (Opcode) {
+  case ISD::CONCAT_VECTORS:
+    // A CONCAT_VECTORS whose operands are all BUILD_VECTORs can be
+    // simplified to one big BUILD_VECTOR.
+    if (N1.getOpcode() == ISD::BUILD_VECTOR &&
+        N2.getOpcode() == ISD::BUILD_VECTOR &&
+        N3.getOpcode() == ISD::BUILD_VECTOR) {
+      SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(),
+                                    N1.getNode()->op_end());
+      Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end());
+      Elts.append(N3.getNode()->op_begin(), N3.getNode()->op_end());
+      return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size());
+    }
+    break;
+  case ISD::SETCC: {
+    // Use FoldSetCC to simplify SETCC's.
+    SDValue Simp = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL);
+    if (Simp.getNode()) return Simp;
+    break;
+  }
+  case ISD::SELECT:
+    if (N1C) {
+      if (N1C->getZExtValue())
+        return N2;             // select true, X, Y -> X
+      else
+        return N3;             // select false, X, Y -> Y
+    }
+
+    if (N2 == N3) return N2;   // select C, X, X -> X
+    break;
+  case ISD::VECTOR_SHUFFLE:
+    llvm_unreachable("should use getVectorShuffle constructor!");
+    break;
+  case ISD::BIT_CONVERT:
+    // Fold bit_convert nodes from a type to themselves.
+    if (N1.getValueType() == VT)
+      return N1;
+    break;
+  }
+
+  // Memoize node if it doesn't produce a flag.
+  SDNode *N;
+  SDVTList VTs = getVTList(VT);
+  if (VT != MVT::Flag) {
+    SDValue Ops[] = { N1, N2, N3 };
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opcode, VTs, Ops, 3);
+    void *IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return SDValue(E, 0);
+
+    N = new (NodeAllocator) TernarySDNode(Opcode, DL, VTs, N1, N2, N3);
+    CSEMap.InsertNode(N, IP);
+  } else {
+    N = new (NodeAllocator) TernarySDNode(Opcode, DL, VTs, N1, N2, N3);
+  }
+
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifyNode(N);
+#endif
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+                              SDValue N1, SDValue N2, SDValue N3,
+                              SDValue N4) {
+  SDValue Ops[] = { N1, N2, N3, N4 };
+  return getNode(Opcode, DL, VT, Ops, 4);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+                              SDValue N1, SDValue N2, SDValue N3,
+                              SDValue N4, SDValue N5) {
+  SDValue Ops[] = { N1, N2, N3, N4, N5 };
+  return getNode(Opcode, DL, VT, Ops, 5);
+}
+
+/// getStackArgumentTokenFactor - Compute a TokenFactor to force all
+/// the incoming stack arguments to be loaded from the stack.
+SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) {
+  SmallVector<SDValue, 8> ArgChains;
+
+  // Include the original chain at the beginning of the list. When this is
+  // used by target LowerCall hooks, this helps legalize find the
+  // CALLSEQ_BEGIN node.
+  ArgChains.push_back(Chain);
+
+  // Add a chain value for each stack argument.
+  for (SDNode::use_iterator U = getEntryNode().getNode()->use_begin(),
+       UE = getEntryNode().getNode()->use_end(); U != UE; ++U)
+    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
+        if (FI->getIndex() < 0)
+          ArgChains.push_back(SDValue(L, 1));
+
+  // Build a tokenfactor for all the chains.
+  return getNode(ISD::TokenFactor, Chain.getDebugLoc(), MVT::Other,
+                 &ArgChains[0], ArgChains.size());
+}
+
+/// getMemsetValue - Build the memset value operand replicated into every
+/// byte of a value of type VT.
+static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
+                              DebugLoc dl) {
+  assert(Value.getOpcode() != ISD::UNDEF);
+
+  unsigned NumBits = VT.getScalarType().getSizeInBits();
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
+    APInt Val = APInt(NumBits, C->getZExtValue() & 255);
+    unsigned Shift = 8;
+    for (unsigned i = NumBits; i > 8; i >>= 1) {
+      Val = (Val << Shift) | Val;
+      Shift <<= 1;
+    }
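+    // e.g. for i32 and byte value 0xAB: 0xAB -> 0xABAB -> 0xABABABAB.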
+    if (VT.isInteger())
+      return DAG.getConstant(Val, VT);
+    return DAG.getConstantFP(APFloat(Val), VT);
+  }
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  Value = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Value);
+  unsigned Shift = 8;
+  for (unsigned i = NumBits; i > 8; i >>= 1) {
+    Value = DAG.getNode(ISD::OR, dl, VT,
+                        DAG.getNode(ISD::SHL, dl, VT, Value,
+                                    DAG.getConstant(Shift,
+                                                    TLI.getShiftAmountTy())),
+                        Value);
+    Shift <<= 1;
+  }
+
+  return Value;
+}
+
+/// getMemsetStringVal - Similar to getMemsetValue, except this is only used
+/// when a memcpy is turned into a memset because the source is a constant
+/// string pointer.
+static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG,
+                                  const TargetLowering &TLI,
+                                  std::string &Str, unsigned Offset) {
+  // Handle vector with all elements zero.
+  if (Str.empty()) {
+    if (VT.isInteger())
+      return DAG.getConstant(0, VT);
+    else if (VT.getSimpleVT().SimpleTy == MVT::f32 ||
+             VT.getSimpleVT().SimpleTy == MVT::f64)
+      return DAG.getConstantFP(0.0, VT);
+    else if (VT.isVector()) {
+      unsigned NumElts = VT.getVectorNumElements();
+      MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
+      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+                         DAG.getConstant(0, EVT::getVectorVT(*DAG.getContext(),
+                                                             EltVT, NumElts)));
+    } else
+      llvm_unreachable("Expected type!");
+  }
+
+  assert(!VT.isVector() && "Can't handle vector type here!");
+  unsigned NumBits = VT.getSizeInBits();
+  unsigned MSB = NumBits / 8;
+  uint64_t Val = 0;
+  if (TLI.isLittleEndian())
+    Offset = Offset + MSB - 1;
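+  // Pack bytes most-significant first; on little-endian targets we therefore
+  // walk the string backwards, e.g. an i32 from "abcd" becomes 0x64636261.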
+  for (unsigned i = 0; i != MSB; ++i) {
+    Val = (Val << 8) | (unsigned char)Str[Offset];
+    Offset += TLI.isLittleEndian() ? -1 : 1;
+  }
+  return DAG.getConstant(Val, VT);
+}
+
+/// getMemBasePlusOffset - Returns a node computing Base plus the given byte
+/// Offset, in Base's value type.
+static SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset,
+                                      SelectionDAG &DAG) {
+  EVT VT = Base.getValueType();
+  return DAG.getNode(ISD::ADD, Base.getDebugLoc(),
+                     VT, Base, DAG.getConstant(Offset, VT));
+}
+
+/// isMemSrcFromString - Returns true if memcpy source is a string constant.
+///
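+/// Matches either a plain GlobalAddress or (add GlobalAddress, Constant),
+/// e.g. the address of str[4] appears as (add @str, 4) and sets SrcDelta = 4.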
+static bool isMemSrcFromString(SDValue Src, std::string &Str) {
+  unsigned SrcDelta = 0;
+  GlobalAddressSDNode *G = NULL;
+  if (Src.getOpcode() == ISD::GlobalAddress)
+    G = cast<GlobalAddressSDNode>(Src);
+  else if (Src.getOpcode() == ISD::ADD &&
+           Src.getOperand(0).getOpcode() == ISD::GlobalAddress &&
+           Src.getOperand(1).getOpcode() == ISD::Constant) {
+    G = cast<GlobalAddressSDNode>(Src.getOperand(0));
+    SrcDelta = cast<ConstantSDNode>(Src.getOperand(1))->getZExtValue();
+  }
+  if (!G)
+    return false;
+
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(G->getGlobal());
+  if (GV && GetConstantStringInfo(GV, Str, SrcDelta, false))
+    return true;
+
+  return false;
+}
+
+/// FindOptimalMemOpLowering - Determines the optimal series of memory ops
+/// to replace the memset / memcpy. Returns true if the number of memory ops
+/// is below the threshold, and returns the types of the sequence of memory
+/// ops to perform the memset / memcpy in MemOps, by reference.
+static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
+                                     unsigned Limit, uint64_t Size,
+                                     unsigned DstAlign, unsigned SrcAlign,
+                                     bool NonScalarIntSafe,
+                                     bool MemcpyStrSrc,
+                                     SelectionDAG &DAG,
+                                     const TargetLowering &TLI) {
+  assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
+         "Expecting memcpy / memset source to meet alignment requirement!");
+  // If 'SrcAlign' is zero, the memory operation does not need to load the
+  // value, i.e. it is a memset or a memcpy from a constant string; otherwise
+  // it is the inferred alignment of the source. 'DstAlign', on the other
+  // hand, is the specified alignment of the memory operation. If it is zero,
+  // it's possible to change the alignment of the destination. 'MemcpyStrSrc'
+  // indicates whether the memcpy source is constant so it does not need to be
+  // loaded.
+  EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign,
+                                   NonScalarIntSafe, MemcpyStrSrc,
+                                   DAG.getMachineFunction());
+
+  if (VT == MVT::Other) {
+    if (DstAlign >= TLI.getTargetData()->getPointerPrefAlignment() ||
+        TLI.allowsUnalignedMemoryAccesses(VT)) {
+      VT = TLI.getPointerTy();
+    } else {
+      switch (DstAlign & 7) {
+      case 0:  VT = MVT::i64; break;
+      case 4:  VT = MVT::i32; break;
+      case 2:  VT = MVT::i16; break;
+      default: VT = MVT::i8;  break;
+      }
+    }
+
+    MVT LVT = MVT::i64;
+    while (!TLI.isTypeLegal(LVT))
+      LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);
+    assert(LVT.isInteger());
+
+    if (VT.bitsGT(LVT))
+      VT = LVT;
+  }
+  
+  // If we're optimizing for size and there is a limit, lower the maximum
+  // number of operations inserted to 4.  This is a wild guess that
+  // approximates the size of a call to memcpy or memset (3 arguments + call).
+  if (Limit != ~0U) {
+    const Function *F = DAG.getMachineFunction().getFunction();
+    if (F->hasFnAttr(Attribute::OptimizeForSize))
+      Limit = 4;
+  }
+
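+  // Greedily cover Size with the chosen type, stepping down to narrower
+  // types for the tail: e.g. Size == 7 with VT == i32 produces the sequence
+  // [i32, i16, i8].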
+  unsigned NumMemOps = 0;
+  while (Size != 0) {
+    unsigned VTSize = VT.getSizeInBits() / 8;
+    while (VTSize > Size) {
+      // For now, only use non-vector loads / stores for the left-over pieces.
+      if (VT.isVector() || VT.isFloatingPoint()) {
+        VT = MVT::i64;
+        while (!TLI.isTypeLegal(VT))
+          VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
+        VTSize = VT.getSizeInBits() / 8;
+      } else {
+        // This can result in a type that is not legal on the target, e.g.
+        // 1 or 2 bytes on PPC.
+        VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
+        VTSize >>= 1;
+      }
+    }
+
+    if (++NumMemOps > Limit)
+      return false;
+    MemOps.push_back(VT);
+    Size -= VTSize;
+  }
+
+  return true;
+}
+
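+/// getMemcpyLoadsAndStores - Expand a small constant-size memcpy into an
+/// inline sequence of loads and stores, one per type chosen by
+/// FindOptimalMemOpLowering.  Returns a null SDValue when the expansion
+/// would exceed the target's limit, so the caller can fall back to a call.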
+static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
+                                       SDValue Chain, SDValue Dst,
+                                       SDValue Src, uint64_t Size,
+                                       unsigned Align, bool isVol,
+                                       bool AlwaysInline,
+                                       const Value *DstSV, uint64_t DstSVOff,
+                                       const Value *SrcSV, uint64_t SrcSVOff) {
+  // Turn a memcpy of undef to nop.
+  if (Src.getOpcode() == ISD::UNDEF)
+    return Chain;
+
+  // Expand memcpy to a series of load and store ops if the size operand falls
+  // below a certain threshold.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  std::vector<EVT> MemOps;
+  bool DstAlignCanChange = false;
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+  if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
+    DstAlignCanChange = true;
+  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
+  if (Align > SrcAlign)
+    SrcAlign = Align;
+  std::string Str;
+  bool CopyFromStr = isMemSrcFromString(Src, Str);
+  bool isZeroStr = CopyFromStr && Str.empty();
+  unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy();
+  
+  if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
+                                (DstAlignCanChange ? 0 : Align),
+                                (isZeroStr ? 0 : SrcAlign),
+                                true, CopyFromStr, DAG, TLI))
+    return SDValue();
+
+  if (DstAlignCanChange) {
+    const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+    unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
+    if (NewAlign > Align) {
+      // Give the stack frame object a larger alignment if needed.
+      if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
+        MFI->setObjectAlignment(FI->getIndex(), NewAlign);
+      Align = NewAlign;
+    }
+  }
+
+  SmallVector<SDValue, 8> OutChains;
+  unsigned NumMemOps = MemOps.size();
+  uint64_t SrcOff = 0, DstOff = 0;
+  for (unsigned i = 0; i != NumMemOps; ++i) {
+    EVT VT = MemOps[i];
+    unsigned VTSize = VT.getSizeInBits() / 8;
+    SDValue Value, Store;
+
+    if (CopyFromStr &&
+        (isZeroStr || (VT.isInteger() && !VT.isVector()))) {
+      // It's unlikely a store of a vector immediate can be done in a single
+      // instruction. It would require a load from a constantpool first.
+      // We only handle zero vectors here.
+      // FIXME: Handle other cases where store of vector immediate is done in
+      // a single instruction.
+      Value = getMemsetStringVal(VT, dl, DAG, TLI, Str, SrcOff);
+      Store = DAG.getStore(Chain, dl, Value,
+                           getMemBasePlusOffset(Dst, DstOff, DAG),
+                           DstSV, DstSVOff + DstOff, isVol, false, Align);
+    } else {
+      // The type might not be legal for the target.  This should only happen
+      // if the type is smaller than a legal type, as on PPC, so the right
+      // thing to do is generate a LoadExt/StoreTrunc pair.  These simplify
+      // to Load/Store if NVT==VT.
+      // FIXME: does the case above also need this?
+      EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+      assert(NVT.bitsGE(VT));
+      Value = DAG.getExtLoad(ISD::EXTLOAD, NVT, dl, Chain,
+                             getMemBasePlusOffset(Src, SrcOff, DAG),
+                             SrcSV, SrcSVOff + SrcOff, VT, isVol, false,
+                             MinAlign(SrcAlign, SrcOff));
+      Store = DAG.getTruncStore(Chain, dl, Value,
+                                getMemBasePlusOffset(Dst, DstOff, DAG),
+                                DstSV, DstSVOff + DstOff, VT, isVol, false,
+                                Align);
+    }
+    OutChains.push_back(Store);
+    SrcOff += VTSize;
+    DstOff += VTSize;
+  }
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     &OutChains[0], OutChains.size());
+}
+
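+/// getMemmoveLoadsAndStores - Like getMemcpyLoadsAndStores, but emits all
+/// the loads before any of the stores so that overlapping source and
+/// destination regions are copied correctly.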
+static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
+                                        SDValue Chain, SDValue Dst,
+                                        SDValue Src, uint64_t Size,
+                                        unsigned Align,  bool isVol,
+                                        bool AlwaysInline,
+                                        const Value *DstSV, uint64_t DstSVOff,
+                                        const Value *SrcSV, uint64_t SrcSVOff) {
+  // Turn a memmove of undef to nop.
+  if (Src.getOpcode() == ISD::UNDEF)
+    return Chain;
+
+  // Expand memmove to a series of load and store ops if the size operand falls
+  // below a certain threshold.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  std::vector<EVT> MemOps;
+  bool DstAlignCanChange = false;
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+  if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
+    DstAlignCanChange = true;
+  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
+  if (Align > SrcAlign)
+    SrcAlign = Align;
+  unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove();
+
+  if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
+                                (DstAlignCanChange ? 0 : Align),
+                                SrcAlign, true, false, DAG, TLI))
+    return SDValue();
+
+  if (DstAlignCanChange) {
+    const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+    unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
+    if (NewAlign > Align) {
+      // Give the stack frame object a larger alignment if needed.
+      if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
+        MFI->setObjectAlignment(FI->getIndex(), NewAlign);
+      Align = NewAlign;
+    }
+  }
+
+  uint64_t SrcOff = 0, DstOff = 0;
+  SmallVector<SDValue, 8> LoadValues;
+  SmallVector<SDValue, 8> LoadChains;
+  SmallVector<SDValue, 8> OutChains;
+  unsigned NumMemOps = MemOps.size();
+  for (unsigned i = 0; i < NumMemOps; i++) {
+    EVT VT = MemOps[i];
+    unsigned VTSize = VT.getSizeInBits() / 8;
+    SDValue Value, Store;
+
+    Value = DAG.getLoad(VT, dl, Chain,
+                        getMemBasePlusOffset(Src, SrcOff, DAG),
+                        SrcSV, SrcSVOff + SrcOff, isVol, false, SrcAlign);
+    LoadValues.push_back(Value);
+    LoadChains.push_back(Value.getValue(1));
+    SrcOff += VTSize;
+  }
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                      &LoadChains[0], LoadChains.size());
+  OutChains.clear();
+  for (unsigned i = 0; i < NumMemOps; i++) {
+    EVT VT = MemOps[i];
+    unsigned VTSize = VT.getSizeInBits() / 8;
+    SDValue Value, Store;
+
+    Store = DAG.getStore(Chain, dl, LoadValues[i],
+                         getMemBasePlusOffset(Dst, DstOff, DAG),
+                         DstSV, DstSVOff + DstOff, isVol, false, Align);
+    OutChains.push_back(Store);
+    DstOff += VTSize;
+  }
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     &OutChains[0], OutChains.size());
+}
+
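+/// getMemsetStores - Expand a small constant-size memset into an inline
+/// sequence of stores of the replicated value; returns a null SDValue when
+/// the expansion would exceed the target's store limit.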
+static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
+                               SDValue Chain, SDValue Dst,
+                               SDValue Src, uint64_t Size,
+                               unsigned Align, bool isVol,
+                               const Value *DstSV, uint64_t DstSVOff) {
+  // Turn a memset of undef to nop.
+  if (Src.getOpcode() == ISD::UNDEF)
+    return Chain;
+
+  // Expand memset to a series of load/store ops if the size operand
+  // falls below a certain threshold.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  std::vector<EVT> MemOps;
+  bool DstAlignCanChange = false;
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+  if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
+    DstAlignCanChange = true;
+  bool NonScalarIntSafe =
+    isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
+  if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(),
+                                Size, (DstAlignCanChange ? 0 : Align), 0,
+                                NonScalarIntSafe, false, DAG, TLI))
+    return SDValue();
+
+  if (DstAlignCanChange) {
+    const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+    unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
+    if (NewAlign > Align) {
+      // Give the stack frame object a larger alignment if needed.
+      if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
+        MFI->setObjectAlignment(FI->getIndex(), NewAlign);
+      Align = NewAlign;
+    }
+  }
+
+  SmallVector<SDValue, 8> OutChains;
+  uint64_t DstOff = 0;
+  unsigned NumMemOps = MemOps.size();
+  for (unsigned i = 0; i < NumMemOps; i++) {
+    EVT VT = MemOps[i];
+    unsigned VTSize = VT.getSizeInBits() / 8;
+    SDValue Value = getMemsetValue(Src, VT, DAG, dl);
+    SDValue Store = DAG.getStore(Chain, dl, Value,
+                                 getMemBasePlusOffset(Dst, DstOff, DAG),
+                                 DstSV, DstSVOff + DstOff, isVol, false, 0);
+    OutChains.push_back(Store);
+    DstOff += VTSize;
+  }
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     &OutChains[0], OutChains.size());
+}
+
+SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst,
+                                SDValue Src, SDValue Size,
+                                unsigned Align, bool isVol, bool AlwaysInline,
+                                const Value *DstSV, uint64_t DstSVOff,
+                                const Value *SrcSV, uint64_t SrcSVOff) {
+
+  // Check to see if we should lower the memcpy to loads and stores first.
+  // For cases within the target-specified limits, this is the best choice.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (ConstantSize) {
+    // Memcpy with size zero? Just return the original chain.
+    if (ConstantSize->isNullValue())
+      return Chain;
+
+    SDValue Result = getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
+                                             ConstantSize->getZExtValue(),Align,
+                                isVol, false, DstSV, DstSVOff, SrcSV, SrcSVOff);
+    if (Result.getNode())
+      return Result;
+  }
+
+  // Then check to see if we should lower the memcpy with target-specific
+  // code. If the target chooses to do this, this is the next best.
+  SDValue Result =
+    TSI.EmitTargetCodeForMemcpy(*this, dl, Chain, Dst, Src, Size, Align,
+                                isVol, AlwaysInline,
+                                DstSV, DstSVOff, SrcSV, SrcSVOff);
+  if (Result.getNode())
+    return Result;
+
+  // If we really need inline code and the target declined to provide it,
+  // use a (potentially long) sequence of loads and stores.
+  if (AlwaysInline) {
+    assert(ConstantSize && "AlwaysInline requires a constant size!");
+    return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
+                                   ConstantSize->getZExtValue(), Align, isVol,
+                                   true, DstSV, DstSVOff, SrcSV, SrcSVOff);
+  }
+
+  // FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc
+  // memcpy is not guaranteed to be safe. libc memcpys aren't required to
+  // respect volatile, so they may do things like read or write memory
+  // beyond the given memory regions. But fixing this isn't easy, and most
+  // people don't care.
+
+  // Emit a library call.
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Ty = TLI.getTargetData()->getIntPtrType(*getContext());
+  Entry.Node = Dst; Args.push_back(Entry);
+  Entry.Node = Src; Args.push_back(Entry);
+  Entry.Node = Size; Args.push_back(Entry);
+  // FIXME: pass in DebugLoc
+  std::pair<SDValue,SDValue> CallResult =
+    TLI.LowerCallTo(Chain, Type::getVoidTy(*getContext()),
+                    false, false, false, false, 0,
+                    TLI.getLibcallCallingConv(RTLIB::MEMCPY), false,
+                    /*isReturnValueUsed=*/false,
+                    getExternalSymbol(TLI.getLibcallName(RTLIB::MEMCPY),
+                                      TLI.getPointerTy()),
+                    Args, *this, dl);
+  return CallResult.second;
+}
+
+SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst,
+                                 SDValue Src, SDValue Size,
+                                 unsigned Align, bool isVol,
+                                 const Value *DstSV, uint64_t DstSVOff,
+                                 const Value *SrcSV, uint64_t SrcSVOff) {
+
+  // Check to see if we should lower the memmove to loads and stores first.
+  // For cases within the target-specified limits, this is the best choice.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (ConstantSize) {
+    // Memmove with size zero? Just return the original chain.
+    if (ConstantSize->isNullValue())
+      return Chain;
+
+    SDValue Result =
+      getMemmoveLoadsAndStores(*this, dl, Chain, Dst, Src,
+                               ConstantSize->getZExtValue(), Align, isVol,
+                               false, DstSV, DstSVOff, SrcSV, SrcSVOff);
+    if (Result.getNode())
+      return Result;
+  }
+
+  // Then check to see if we should lower the memmove with target-specific
+  // code. If the target chooses to do this, this is the next best.
+  SDValue Result =
+    TSI.EmitTargetCodeForMemmove(*this, dl, Chain, Dst, Src, Size, Align, isVol,
+                                 DstSV, DstSVOff, SrcSV, SrcSVOff);
+  if (Result.getNode())
+    return Result;
+
+  // FIXME: If the memmove is volatile, lowering it to plain libc memmove may
+  // not be safe.  See memcpy above for more details.
+
+  // Emit a library call.
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Ty = TLI.getTargetData()->getIntPtrType(*getContext());
+  Entry.Node = Dst; Args.push_back(Entry);
+  Entry.Node = Src; Args.push_back(Entry);
+  Entry.Node = Size; Args.push_back(Entry);
+  // FIXME:  pass in DebugLoc
+  std::pair<SDValue,SDValue> CallResult =
+    TLI.LowerCallTo(Chain, Type::getVoidTy(*getContext()),
+                    false, false, false, false, 0,
+                    TLI.getLibcallCallingConv(RTLIB::MEMMOVE), false,
+                    /*isReturnValueUsed=*/false,
+                    getExternalSymbol(TLI.getLibcallName(RTLIB::MEMMOVE),
+                                      TLI.getPointerTy()),
+                    Args, *this, dl);
+  return CallResult.second;
+}
+
+SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst,
+                                SDValue Src, SDValue Size,
+                                unsigned Align, bool isVol,
+                                const Value *DstSV, uint64_t DstSVOff) {
+
+  // Check to see if we should lower the memset to stores first.
+  // For cases within the target-specified limits, this is the best choice.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (ConstantSize) {
+    // Memset with size zero? Just return the original chain.
+    if (ConstantSize->isNullValue())
+      return Chain;
+
+    SDValue Result =
+      getMemsetStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),
+                      Align, isVol, DstSV, DstSVOff);
+
+    if (Result.getNode())
+      return Result;
+  }
+
+  // Then check to see if we should lower the memset with target-specific
+  // code. If the target chooses to do this, this is the next best.
+  SDValue Result =
+    TSI.EmitTargetCodeForMemset(*this, dl, Chain, Dst, Src, Size, Align, isVol,
+                                DstSV, DstSVOff);
+  if (Result.getNode())
+    return Result;
+
+  // Emit a library call.  
+  const Type *IntPtrTy = TLI.getTargetData()->getIntPtrType(*getContext());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Node = Dst; Entry.Ty = IntPtrTy;
+  Args.push_back(Entry);
+  // Extend or truncate the argument to be an i32 value for the call.
+  if (Src.getValueType().bitsGT(MVT::i32))
+    Src = getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+  else
+    Src = getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
+  Entry.Node = Src;
+  Entry.Ty = Type::getInt32Ty(*getContext());
+  Entry.isSExt = true;
+  Args.push_back(Entry);
+  Entry.Node = Size;
+  Entry.Ty = IntPtrTy;
+  Entry.isSExt = false;
+  Args.push_back(Entry);
+  // FIXME: pass in DebugLoc
+  std::pair<SDValue,SDValue> CallResult =
+    TLI.LowerCallTo(Chain, Type::getVoidTy(*getContext()),
+                    false, false, false, false, 0,
+                    TLI.getLibcallCallingConv(RTLIB::MEMSET), false,
+                    /*isReturnValueUsed=*/false,
+                    getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET),
+                                      TLI.getPointerTy()),
+                    Args, *this, dl);
+  return CallResult.second;
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+                                SDValue Chain,
+                                SDValue Ptr, SDValue Cmp,
+                                SDValue Swp, const Value* PtrVal,
+                                unsigned Alignment) {
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = getEVTAlignment(MemVT);
+
+  // Check if the memory reference references a frame index
+  if (!PtrVal)
+    if (const FrameIndexSDNode *FI =
+          dyn_cast<const FrameIndexSDNode>(Ptr.getNode()))
+      PtrVal = PseudoSourceValue::getFixedStack(FI->getIndex());
+
+  MachineFunction &MF = getMachineFunction();
+  unsigned Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+  // For now, atomics are considered to be volatile always.
+  Flags |= MachineMemOperand::MOVolatile;
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PtrVal, Flags, 0,
+                            MemVT.getStoreSize(), Alignment);
+
+  return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Cmp, Swp, MMO);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+                                SDValue Chain,
+                                SDValue Ptr, SDValue Cmp,
+                                SDValue Swp, MachineMemOperand *MMO) {
+  assert(Opcode == ISD::ATOMIC_CMP_SWAP && "Invalid Atomic Op");
+  assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
+
+  EVT VT = Cmp.getValueType();
+
+  SDVTList VTs = getVTList(VT, MVT::Other);
+  FoldingSetNodeID ID;
+  ID.AddInteger(MemVT.getRawBits());
+  SDValue Ops[] = {Chain, Ptr, Cmp, Swp};
+  AddNodeIDNode(ID, Opcode, VTs, Ops, 4);
+  void* IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<AtomicSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain,
+                                               Ptr, Cmp, Swp, MMO);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+                                SDValue Chain,
+                                SDValue Ptr, SDValue Val,
+                                const Value* PtrVal,
+                                unsigned Alignment) {
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = getEVTAlignment(MemVT);
+
+  // Check if the memory reference references a frame index
+  if (!PtrVal)
+    if (const FrameIndexSDNode *FI =
+          dyn_cast<const FrameIndexSDNode>(Ptr.getNode()))
+      PtrVal = PseudoSourceValue::getFixedStack(FI->getIndex());
+
+  MachineFunction &MF = getMachineFunction();
+  unsigned Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+  // For now, atomics are considered to be volatile always.
+  Flags |= MachineMemOperand::MOVolatile;
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PtrVal, Flags, 0,
+                            MemVT.getStoreSize(), Alignment);
+
+  return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Val, MMO);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+                                SDValue Chain,
+                                SDValue Ptr, SDValue Val,
+                                MachineMemOperand *MMO) {
+  assert((Opcode == ISD::ATOMIC_LOAD_ADD ||
+          Opcode == ISD::ATOMIC_LOAD_SUB ||
+          Opcode == ISD::ATOMIC_LOAD_AND ||
+          Opcode == ISD::ATOMIC_LOAD_OR ||
+          Opcode == ISD::ATOMIC_LOAD_XOR ||
+          Opcode == ISD::ATOMIC_LOAD_NAND ||
+          Opcode == ISD::ATOMIC_LOAD_MIN ||
+          Opcode == ISD::ATOMIC_LOAD_MAX ||
+          Opcode == ISD::ATOMIC_LOAD_UMIN ||
+          Opcode == ISD::ATOMIC_LOAD_UMAX ||
+          Opcode == ISD::ATOMIC_SWAP) &&
+         "Invalid Atomic Op");
+
+  EVT VT = Val.getValueType();
+
+  SDVTList VTs = getVTList(VT, MVT::Other);
+  FoldingSetNodeID ID;
+  ID.AddInteger(MemVT.getRawBits());
+  SDValue Ops[] = {Chain, Ptr, Val};
+  AddNodeIDNode(ID, Opcode, VTs, Ops, 3);
+  void* IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<AtomicSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain,
+                                               Ptr, Val, MMO);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+/// getMergeValues - Create a MERGE_VALUES node from the given operands.
+/// Allowed to return something different (and simpler); in particular, a
+/// single operand is returned as-is rather than wrapped in a node.
+SDValue SelectionDAG::getMergeValues(const SDValue *Ops, unsigned NumOps,
+                                     DebugLoc dl) {
+  if (NumOps == 1)
+    return Ops[0];
+
+  SmallVector<EVT, 4> VTs;
+  VTs.reserve(NumOps);
+  for (unsigned i = 0; i < NumOps; ++i)
+    VTs.push_back(Ops[i].getValueType());
+  return getNode(ISD::MERGE_VALUES, dl, getVTList(&VTs[0], NumOps),
+                 Ops, NumOps);
+}
+
+SDValue
+SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl,
+                                  const EVT *VTs, unsigned NumVTs,
+                                  const SDValue *Ops, unsigned NumOps,
+                                  EVT MemVT, const Value *srcValue, int SVOff,
+                                  unsigned Align, bool Vol,
+                                  bool ReadMem, bool WriteMem) {
+  return getMemIntrinsicNode(Opcode, dl, makeVTList(VTs, NumVTs), Ops, NumOps,
+                             MemVT, srcValue, SVOff, Align, Vol,
+                             ReadMem, WriteMem);
+}
+
+SDValue
+SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
+                                  const SDValue *Ops, unsigned NumOps,
+                                  EVT MemVT, const Value *srcValue, int SVOff,
+                                  unsigned Align, bool Vol,
+                                  bool ReadMem, bool WriteMem) {
+  if (Align == 0)  // Ensure that codegen never sees alignment 0
+    Align = getEVTAlignment(MemVT);
+
+  MachineFunction &MF = getMachineFunction();
+  unsigned Flags = 0;
+  if (WriteMem)
+    Flags |= MachineMemOperand::MOStore;
+  if (ReadMem)
+    Flags |= MachineMemOperand::MOLoad;
+  if (Vol)
+    Flags |= MachineMemOperand::MOVolatile;
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(srcValue, Flags, SVOff,
+                            MemVT.getStoreSize(), Align);
+
+  return getMemIntrinsicNode(Opcode, dl, VTList, Ops, NumOps, MemVT, MMO);
+}
+
+SDValue
+SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
+                                  const SDValue *Ops, unsigned NumOps,
+                                  EVT MemVT, MachineMemOperand *MMO) {
+  assert((Opcode == ISD::INTRINSIC_VOID ||
+          Opcode == ISD::INTRINSIC_W_CHAIN ||
+          (Opcode <= INT_MAX &&
+           (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
+         "Opcode is not a memory-accessing opcode!");
+
+  // Memoize the node unless it returns a flag.
+  MemIntrinsicSDNode *N;
+  if (VTList.VTs[VTList.NumVTs-1] != MVT::Flag) {
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
+    void *IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO);
+      return SDValue(E, 0);
+    }
+
+    N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl, VTList, Ops, NumOps,
+                                               MemVT, MMO);
+    CSEMap.InsertNode(N, IP);
+  } else {
+    N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl, VTList, Ops, NumOps,
+                                               MemVT, MMO);
+  }
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue
+SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
+                      EVT VT, DebugLoc dl, SDValue Chain,
+                      SDValue Ptr, SDValue Offset,
+                      const Value *SV, int SVOffset, EVT MemVT,
+                      bool isVolatile, bool isNonTemporal,
+                      unsigned Alignment) {
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = getEVTAlignment(VT);
+
+  // Check if the memory reference references a frame index
+  if (!SV)
+    if (const FrameIndexSDNode *FI =
+          dyn_cast<const FrameIndexSDNode>(Ptr.getNode()))
+      SV = PseudoSourceValue::getFixedStack(FI->getIndex());
+
+  MachineFunction &MF = getMachineFunction();
+  unsigned Flags = MachineMemOperand::MOLoad;
+  if (isVolatile)
+    Flags |= MachineMemOperand::MOVolatile;
+  if (isNonTemporal)
+    Flags |= MachineMemOperand::MONonTemporal;
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(SV, Flags, SVOffset,
+                            MemVT.getStoreSize(), Alignment);
+  return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO);
+}
+
+SDValue
+SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, 
+                      EVT VT, DebugLoc dl, SDValue Chain,
+                      SDValue Ptr, SDValue Offset, EVT MemVT,
+                      MachineMemOperand *MMO) {
+  if (VT == MemVT) {
+    ExtType = ISD::NON_EXTLOAD;
+  } else if (ExtType == ISD::NON_EXTLOAD) {
+    assert(VT == MemVT && "Non-extending load from different memory type!");
+  } else {
+    // Extending load.
+    assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
+           "Should only be an extending load, not truncating!");
+    assert(VT.isInteger() == MemVT.isInteger() &&
+           "Cannot convert from FP to Int or Int -> FP!");
+    assert(VT.isVector() == MemVT.isVector() &&
+           "Cannot use an extending load to convert to or from a vector!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() == MemVT.getVectorNumElements()) &&
+           "Cannot use an extending load to change the number of elements!");
+  }
+
+  bool Indexed = AM != ISD::UNINDEXED;
+  assert((Indexed || Offset.getOpcode() == ISD::UNDEF) &&
+         "Unindexed load with an offset!");
+
+  SDVTList VTs = Indexed ?
+    getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other);
+  SDValue Ops[] = { Chain, Ptr, Offset };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::LOAD, VTs, Ops, 3);
+  ID.AddInteger(MemVT.getRawBits());
+  ID.AddInteger(encodeMemSDNodeFlags(ExtType, AM, MMO->isVolatile(),
+                                     MMO->isNonTemporal()));
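+  // Alignment is deliberately not part of the FoldingSet ID, so two loads
+  // differing only in alignment CSE to a single node; refineAlignment below
+  // then updates the cached node's memory operand where that improves it.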
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<LoadSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) LoadSDNode(Ops, dl, VTs, AM, ExtType,
+                                             MemVT, MMO);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getLoad(EVT VT, DebugLoc dl,
+                              SDValue Chain, SDValue Ptr,
+                              const Value *SV, int SVOffset,
+                              bool isVolatile, bool isNonTemporal,
+                              unsigned Alignment) {
+  SDValue Undef = getUNDEF(Ptr.getValueType());
+  return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+                 SV, SVOffset, VT, isVolatile, isNonTemporal, Alignment);
+}
+
+SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, EVT VT, DebugLoc dl,
+                                 SDValue Chain, SDValue Ptr,
+                                 const Value *SV,
+                                 int SVOffset, EVT MemVT,
+                                 bool isVolatile, bool isNonTemporal,
+                                 unsigned Alignment) {
+  SDValue Undef = getUNDEF(Ptr.getValueType());
+  return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef,
+                 SV, SVOffset, MemVT, isVolatile, isNonTemporal, Alignment);
+}
+
+SDValue
+SelectionDAG::getIndexedLoad(SDValue OrigLoad, DebugLoc dl, SDValue Base,
+                             SDValue Offset, ISD::MemIndexedMode AM) {
+  LoadSDNode *LD = cast<LoadSDNode>(OrigLoad);
+  assert(LD->getOffset().getOpcode() == ISD::UNDEF &&
+         "Load is already a indexed load!");
+  return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
+                 LD->getChain(), Base, Offset, LD->getSrcValue(),
+                 LD->getSrcValueOffset(), LD->getMemoryVT(),
+                 LD->isVolatile(), LD->isNonTemporal(), LD->getAlignment());
+}
+
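+// A minimal usage sketch of the load builders above.  The DAG, dl, Chain,
+// Ptr, Inc and SV values are assumed to come from the surrounding lowering
+// code, not from this file:
+//
+//   // Plain i32 load; result 0 is the value, result 1 the output chain.
+//   SDValue L = DAG.getLoad(MVT::i32, dl, Chain, Ptr, SV, 0, false, false, 0);
+//
+//   // Zero-extending i8 -> i32 load through getExtLoad.
+//   SDValue Z = DAG.getExtLoad(ISD::ZEXTLOAD, MVT::i32, dl, Chain, Ptr,
+//                              SV, 0, MVT::i8, false, false, 0);
+//
+//   // Rewriting L as a pre-incremented load (when the target supports the
+//   // mode) adds a second result carrying the updated pointer.
+//   SDValue P = DAG.getIndexedLoad(L, dl, Ptr, Inc, ISD::PRE_INC);
+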
+SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
+                               SDValue Ptr, const Value *SV, int SVOffset,
+                               bool isVolatile, bool isNonTemporal,
+                               unsigned Alignment) {
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = getEVTAlignment(Val.getValueType());
+
+  // Check if the memory reference refers to a frame index
+  if (!SV)
+    if (const FrameIndexSDNode *FI =
+          dyn_cast<const FrameIndexSDNode>(Ptr.getNode()))
+      SV = PseudoSourceValue::getFixedStack(FI->getIndex());
+
+  MachineFunction &MF = getMachineFunction();
+  unsigned Flags = MachineMemOperand::MOStore;
+  if (isVolatile)
+    Flags |= MachineMemOperand::MOVolatile;
+  if (isNonTemporal)
+    Flags |= MachineMemOperand::MONonTemporal;
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(SV, Flags, SVOffset,
+                            Val.getValueType().getStoreSize(), Alignment);
+
+  return getStore(Chain, dl, Val, Ptr, MMO);
+}
+
+SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
+                               SDValue Ptr, MachineMemOperand *MMO) {
+  EVT VT = Val.getValueType();
+  SDVTList VTs = getVTList(MVT::Other);
+  SDValue Undef = getUNDEF(Ptr.getValueType());
+  SDValue Ops[] = { Chain, Val, Ptr, Undef };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4);
+  ID.AddInteger(VT.getRawBits());
+  ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(),
+                                     MMO->isNonTemporal()));
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<StoreSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl, VTs, ISD::UNINDEXED,
+                                              false, VT, MMO);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
+                                    SDValue Ptr, const Value *SV,
+                                    int SVOffset, EVT SVT,
+                                    bool isVolatile, bool isNonTemporal,
+                                    unsigned Alignment) {
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = getEVTAlignment(SVT);
+
+  // Check if the memory reference refers to a frame index
+  if (!SV)
+    if (const FrameIndexSDNode *FI =
+          dyn_cast<const FrameIndexSDNode>(Ptr.getNode()))
+      SV = PseudoSourceValue::getFixedStack(FI->getIndex());
+
+  MachineFunction &MF = getMachineFunction();
+  unsigned Flags = MachineMemOperand::MOStore;
+  if (isVolatile)
+    Flags |= MachineMemOperand::MOVolatile;
+  if (isNonTemporal)
+    Flags |= MachineMemOperand::MONonTemporal;
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(SV, Flags, SVOffset, SVT.getStoreSize(), Alignment);
+
+  return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO);
+}
+
+SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
+                                    SDValue Ptr, EVT SVT,
+                                    MachineMemOperand *MMO) {
+  EVT VT = Val.getValueType();
+
+  if (VT == SVT)
+    return getStore(Chain, dl, Val, Ptr, MMO);
+
+  assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
+         "Should only be a truncating store, not extending!");
+  assert(VT.isInteger() == SVT.isInteger() &&
+         "Can't do FP-INT conversion!");
+  assert(VT.isVector() == SVT.isVector() &&
+         "Cannot use trunc store to convert to or from a vector!");
+  assert((!VT.isVector() ||
+          VT.getVectorNumElements() == SVT.getVectorNumElements()) &&
+         "Cannot use trunc store to change the number of vector elements!");
+
+  SDVTList VTs = getVTList(MVT::Other);
+  SDValue Undef = getUNDEF(Ptr.getValueType());
+  SDValue Ops[] = { Chain, Val, Ptr, Undef };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4);
+  ID.AddInteger(SVT.getRawBits());
+  ID.AddInteger(encodeMemSDNodeFlags(true, ISD::UNINDEXED, MMO->isVolatile(),
+                                     MMO->isNonTemporal()));
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<StoreSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl, VTs, ISD::UNINDEXED,
+                                              true, SVT, MMO);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
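+// A small sketch of the store builders above, with Chain, dl, Val, Ptr and
+// SV assumed to come from the calling code:
+//
+//   // Full-width store of Val; the result is the new chain.
+//   SDValue St = DAG.getStore(Chain, dl, Val, Ptr, SV, 0, false, false, 0);
+//
+//   // Truncating store of only the low 8 bits of an i32 Val.
+//   SDValue Tr = DAG.getTruncStore(Chain, dl, Val, Ptr, SV, 0, MVT::i8,
+//                                  false, false, 0);
+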
+SDValue
+SelectionDAG::getIndexedStore(SDValue OrigStore, DebugLoc dl, SDValue Base,
+                              SDValue Offset, ISD::MemIndexedMode AM) {
+  StoreSDNode *ST = cast<StoreSDNode>(OrigStore);
+  assert(ST->getOffset().getOpcode() == ISD::UNDEF &&
+         "Store is already an indexed store!");
+  SDVTList VTs = getVTList(Base.getValueType(), MVT::Other);
+  SDValue Ops[] = { ST->getChain(), ST->getValue(), Base, Offset };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4);
+  ID.AddInteger(ST->getMemoryVT().getRawBits());
+  ID.AddInteger(ST->getRawSubclassData());
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl, VTs, AM,
+                                              ST->isTruncatingStore(),
+                                              ST->getMemoryVT(),
+                                              ST->getMemOperand());
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getVAArg(EVT VT, DebugLoc dl,
+                               SDValue Chain, SDValue Ptr,
+                               SDValue SV,
+                               unsigned Align) {
+  SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, MVT::i32) };
+  return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops, 4);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+                              const SDUse *Ops, unsigned NumOps) {
+  switch (NumOps) {
+  case 0: return getNode(Opcode, DL, VT);
+  case 1: return getNode(Opcode, DL, VT, Ops[0]);
+  case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
+  case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
+  default: break;
+  }
+
+  // Copy from an SDUse array into an SDValue array for use with
+  // the regular getNode logic.
+  SmallVector<SDValue, 8> NewOps(Ops, Ops + NumOps);
+  return getNode(Opcode, DL, VT, &NewOps[0], NumOps);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+                              const SDValue *Ops, unsigned NumOps) {
+  switch (NumOps) {
+  case 0: return getNode(Opcode, DL, VT);
+  case 1: return getNode(Opcode, DL, VT, Ops[0]);
+  case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
+  case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
+  default: break;
+  }
+
+  switch (Opcode) {
+  default: break;
+  case ISD::SELECT_CC: {
+    assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
+    assert(Ops[0].getValueType() == Ops[1].getValueType() &&
+           "LHS and RHS of condition must have same type!");
+    assert(Ops[2].getValueType() == Ops[3].getValueType() &&
+           "True and False arms of SelectCC must have same type!");
+    assert(Ops[2].getValueType() == VT &&
+           "select_cc node must be of same type as true and false value!");
+    break;
+  }
+  case ISD::BR_CC: {
+    assert(NumOps == 5 && "BR_CC takes 5 operands!");
+    assert(Ops[2].getValueType() == Ops[3].getValueType() &&
+           "LHS/RHS of comparison should match types!");
+    break;
+  }
+  }
+
+  // Memoize nodes.
+  SDNode *N;
+  SDVTList VTs = getVTList(VT);
+
+  if (VT != MVT::Flag) {
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opcode, VTs, Ops, NumOps);
+    void *IP = 0;
+
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return SDValue(E, 0);
+
+    N = new (NodeAllocator) SDNode(Opcode, DL, VTs, Ops, NumOps);
+    CSEMap.InsertNode(N, IP);
+  } else {
+    N = new (NodeAllocator) SDNode(Opcode, DL, VTs, Ops, NumOps);
+  }
+
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifyNode(N);
+#endif
+  return SDValue(N, 0);
+}
+
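+// The array form above is what the 5-operand nodes it checks are built
+// through; a sketch, assuming the getCondCode helper defined earlier in this
+// file and LHS/RHS/TVal/FVal from the caller:
+//
+//   SDValue Ops[] = { LHS, RHS, TVal, FVal, DAG.getCondCode(ISD::SETLT) };
+//   SDValue Sel = DAG.getNode(ISD::SELECT_CC, dl, VT, Ops, 5);
+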
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
+                              const std::vector<EVT> &ResultTys,
+                              const SDValue *Ops, unsigned NumOps) {
+  return getNode(Opcode, DL, getVTList(&ResultTys[0], ResultTys.size()),
+                 Ops, NumOps);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
+                              const EVT *VTs, unsigned NumVTs,
+                              const SDValue *Ops, unsigned NumOps) {
+  if (NumVTs == 1)
+    return getNode(Opcode, DL, VTs[0], Ops, NumOps);
+  return getNode(Opcode, DL, makeVTList(VTs, NumVTs), Ops, NumOps);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+                              const SDValue *Ops, unsigned NumOps) {
+  if (VTList.NumVTs == 1)
+    return getNode(Opcode, DL, VTList.VTs[0], Ops, NumOps);
+
+#if 0
+  switch (Opcode) {
+  // FIXME: figure out how to safely handle things like
+  // int foo(int x) { return 1 << (x & 255); }
+  // int bar() { return foo(256); }
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS:
+  case ISD::SHL_PARTS:
+    if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+        cast<VTSDNode>(N3.getOperand(1))->getVT() != MVT::i1)
+      return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
+    else if (N3.getOpcode() == ISD::AND)
+      if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N3.getOperand(1))) {
+        // If the and is only masking out bits that cannot affect the shift,
+        // eliminate the and.
+        unsigned NumBits = VT.getScalarType().getSizeInBits()*2;
+        if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1)
+          return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
+      }
+    break;
+  }
+#endif
+
+  // Memoize the node unless it returns a flag.
+  SDNode *N;
+  if (VTList.VTs[VTList.NumVTs-1] != MVT::Flag) {
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
+    void *IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return SDValue(E, 0);
+
+    if (NumOps == 1) {
+      N = new (NodeAllocator) UnarySDNode(Opcode, DL, VTList, Ops[0]);
+    } else if (NumOps == 2) {
+      N = new (NodeAllocator) BinarySDNode(Opcode, DL, VTList, Ops[0], Ops[1]);
+    } else if (NumOps == 3) {
+      N = new (NodeAllocator) TernarySDNode(Opcode, DL, VTList, Ops[0], Ops[1],
+                                            Ops[2]);
+    } else {
+      N = new (NodeAllocator) SDNode(Opcode, DL, VTList, Ops, NumOps);
+    }
+    CSEMap.InsertNode(N, IP);
+  } else {
+    if (NumOps == 1) {
+      N = new (NodeAllocator) UnarySDNode(Opcode, DL, VTList, Ops[0]);
+    } else if (NumOps == 2) {
+      N = new (NodeAllocator) BinarySDNode(Opcode, DL, VTList, Ops[0], Ops[1]);
+    } else if (NumOps == 3) {
+      N = new (NodeAllocator) TernarySDNode(Opcode, DL, VTList, Ops[0], Ops[1],
+                                            Ops[2]);
+    } else {
+      N = new (NodeAllocator) SDNode(Opcode, DL, VTList, Ops, NumOps);
+    }
+  }
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifyNode(N);
+#endif
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList) {
+  return getNode(Opcode, DL, VTList, 0, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+                              SDValue N1) {
+  SDValue Ops[] = { N1 };
+  return getNode(Opcode, DL, VTList, Ops, 1);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+                              SDValue N1, SDValue N2) {
+  SDValue Ops[] = { N1, N2 };
+  return getNode(Opcode, DL, VTList, Ops, 2);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+                              SDValue N1, SDValue N2, SDValue N3) {
+  SDValue Ops[] = { N1, N2, N3 };
+  return getNode(Opcode, DL, VTList, Ops, 3);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+                              SDValue N1, SDValue N2, SDValue N3,
+                              SDValue N4) {
+  SDValue Ops[] = { N1, N2, N3, N4 };
+  return getNode(Opcode, DL, VTList, Ops, 4);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+                              SDValue N1, SDValue N2, SDValue N3,
+                              SDValue N4, SDValue N5) {
+  SDValue Ops[] = { N1, N2, N3, N4, N5 };
+  return getNode(Opcode, DL, VTList, Ops, 5);
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT) {
+  return makeVTList(SDNode::getValueTypeList(VT), 1);
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) {
+  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
+       E = VTList.rend(); I != E; ++I)
+    if (I->NumVTs == 2 && I->VTs[0] == VT1 && I->VTs[1] == VT2)
+      return *I;
+
+  EVT *Array = Allocator.Allocate<EVT>(2);
+  Array[0] = VT1;
+  Array[1] = VT2;
+  SDVTList Result = makeVTList(Array, 2);
+  VTList.push_back(Result);
+  return Result;
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) {
+  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
+       E = VTList.rend(); I != E; ++I)
+    if (I->NumVTs == 3 && I->VTs[0] == VT1 && I->VTs[1] == VT2 &&
+                          I->VTs[2] == VT3)
+      return *I;
+
+  EVT *Array = Allocator.Allocate<EVT>(3);
+  Array[0] = VT1;
+  Array[1] = VT2;
+  Array[2] = VT3;
+  SDVTList Result = makeVTList(Array, 3);
+  VTList.push_back(Result);
+  return Result;
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
+  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
+       E = VTList.rend(); I != E; ++I)
+    if (I->NumVTs == 4 && I->VTs[0] == VT1 && I->VTs[1] == VT2 &&
+                          I->VTs[2] == VT3 && I->VTs[3] == VT4)
+      return *I;
+
+  EVT *Array = Allocator.Allocate<EVT>(4);
+  Array[0] = VT1;
+  Array[1] = VT2;
+  Array[2] = VT3;
+  Array[3] = VT4;
+  SDVTList Result = makeVTList(Array, 4);
+  VTList.push_back(Result);
+  return Result;
+}
+
+SDVTList SelectionDAG::getVTList(const EVT *VTs, unsigned NumVTs) {
+  switch (NumVTs) {
+    case 0: llvm_unreachable("Cannot have nodes without results!");
+    case 1: return getVTList(VTs[0]);
+    case 2: return getVTList(VTs[0], VTs[1]);
+    case 3: return getVTList(VTs[0], VTs[1], VTs[2]);
+    case 4: return getVTList(VTs[0], VTs[1], VTs[2], VTs[3]);
+    default: break;
+  }
+
+  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
+       E = VTList.rend(); I != E; ++I) {
+    if (I->NumVTs != NumVTs || VTs[0] != I->VTs[0] || VTs[1] != I->VTs[1])
+      continue;
+
+    bool NoMatch = false;
+    for (unsigned i = 2; i != NumVTs; ++i)
+      if (VTs[i] != I->VTs[i]) {
+        NoMatch = true;
+        break;
+      }
+    if (!NoMatch)
+      return *I;
+  }
+
+  EVT *Array = Allocator.Allocate<EVT>(NumVTs);
+  std::copy(VTs, VTs+NumVTs, Array);
+  SDVTList Result = makeVTList(Array, NumVTs);
+  VTList.push_back(Result);
+  return Result;
+}
+
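+// The lists built above are uniqued per DAG, so repeated requests for the
+// same combination hand back the same backing array; a small sketch:
+//
+//   SDVTList A = DAG.getVTList(MVT::i32, MVT::Other);
+//   SDVTList B = DAG.getVTList(MVT::i32, MVT::Other);
+//   // A.VTs == B.VTs here, by construction.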
+
+/// UpdateNodeOperands - *Mutate* the specified node in-place to have the
+/// specified operands.  If the resultant node already exists in the DAG,
+/// this does not modify the specified node; instead it returns the node that
+/// already exists.  If the resultant node does not exist in the DAG, the
+/// input node is returned.  As a degenerate case, if you specify the same
+/// input operands as the node already has, the input node is returned.
+SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) {
+  assert(N->getNumOperands() == 1 && "Update with wrong number of operands");
+
+  // Check to see if there is no change.
+  if (Op == N->getOperand(0)) return N;
+
+  // See if the modified node already exists.
+  void *InsertPos = 0;
+  if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos))
+    return Existing;
+
+  // Nope it doesn't.  Remove the node from its current place in the maps.
+  if (InsertPos)
+    if (!RemoveNodeFromCSEMaps(N))
+      InsertPos = 0;
+
+  // Now we update the operands.
+  N->OperandList[0].set(Op);
+
+  // If this gets put into a CSE map, add it.
+  if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+  return N;
+}
+
+SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) {
+  assert(N->getNumOperands() == 2 && "Update with wrong number of operands");
+
+  // Check to see if there is no change.
+  if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1))
+    return N;   // No operands changed, just return the input node.
+
+  // See if the modified node already exists.
+  void *InsertPos = 0;
+  if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos))
+    return Existing;
+
+  // Nope it doesn't.  Remove the node from its current place in the maps.
+  if (InsertPos)
+    if (!RemoveNodeFromCSEMaps(N))
+      InsertPos = 0;
+
+  // Now we update the operands.
+  if (N->OperandList[0] != Op1)
+    N->OperandList[0].set(Op1);
+  if (N->OperandList[1] != Op2)
+    N->OperandList[1].set(Op2);
+
+  // If this gets put into a CSE map, add it.
+  if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+  return N;
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3) {
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return UpdateNodeOperands(N, Ops, 3);
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+                   SDValue Op3, SDValue Op4) {
+  SDValue Ops[] = { Op1, Op2, Op3, Op4 };
+  return UpdateNodeOperands(N, Ops, 4);
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+                   SDValue Op3, SDValue Op4, SDValue Op5) {
+  SDValue Ops[] = { Op1, Op2, Op3, Op4, Op5 };
+  return UpdateNodeOperands(N, Ops, 5);
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, const SDValue *Ops, unsigned NumOps) {
+  assert(N->getNumOperands() == NumOps &&
+         "Update with wrong number of operands");
+
+  // Check to see if there is no change.
+  bool AnyChange = false;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    if (Ops[i] != N->getOperand(i)) {
+      AnyChange = true;
+      break;
+    }
+  }
+
+  // No operands changed, just return the input node.
+  if (!AnyChange) return N;
+
+  // See if the modified node already exists.
+  void *InsertPos = 0;
+  if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, NumOps, InsertPos))
+    return Existing;
+
+  // Nope it doesn't.  Remove the node from its current place in the maps.
+  if (InsertPos)
+    if (!RemoveNodeFromCSEMaps(N))
+      InsertPos = 0;
+
+  // Now we update the operands.
+  for (unsigned i = 0; i != NumOps; ++i)
+    if (N->OperandList[i] != Ops[i])
+      N->OperandList[i].set(Ops[i]);
+
+  // If this gets put into a CSE map, add it.
+  if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+  return N;
+}
+
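+// A sketch of typical use during DAG combining: swap in a new operand and
+// accept whichever node the CSE maps hand back (N and NewLHS are assumed
+// from the calling code):
+//
+//   SDNode *NN = DAG.UpdateNodeOperands(N, NewLHS, N->getOperand(1));
+//   // NN may be N mutated in place, or a pre-existing identical node.
+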
+/// DropOperands - Release the operands and set this node to have
+/// zero operands.
+void SDNode::DropOperands() {
+  // Unlike the code in MorphNodeTo that does this, we don't need to
+  // watch for dead nodes here.
+  for (op_iterator I = op_begin(), E = op_end(); I != E; ) {
+    SDUse &Use = *I++;
+    Use.set(SDValue());
+  }
+}
+
+/// SelectNodeTo - These are wrappers around MorphNodeTo that accept a
+/// machine opcode.
+///
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT) {
+  SDVTList VTs = getVTList(VT);
+  return SelectNodeTo(N, MachineOpc, VTs, 0, 0);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT, SDValue Op1) {
+  SDVTList VTs = getVTList(VT);
+  SDValue Ops[] = { Op1 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 1);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT, SDValue Op1,
+                                   SDValue Op2) {
+  SDVTList VTs = getVTList(VT);
+  SDValue Ops[] = { Op1, Op2 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 2);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT, SDValue Op1,
+                                   SDValue Op2, SDValue Op3) {
+  SDVTList VTs = getVTList(VT);
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 3);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT, const SDValue *Ops,
+                                   unsigned NumOps) {
+  SDVTList VTs = getVTList(VT);
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2, const SDValue *Ops,
+                                   unsigned NumOps) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  return SelectNodeTo(N, MachineOpc, VTs, (SDValue *)0, 0);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2, EVT VT3,
+                                   const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3);
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2, EVT VT3, EVT VT4,
+                                   const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3, VT4);
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2,
+                                   SDValue Op1) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  SDValue Ops[] = { Op1 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 1);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2,
+                                   SDValue Op1, SDValue Op2) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  SDValue Ops[] = { Op1, Op2 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 2);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2,
+                                   SDValue Op1, SDValue Op2,
+                                   SDValue Op3) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 3);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2, EVT VT3,
+                                   SDValue Op1, SDValue Op2,
+                                   SDValue Op3) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3);
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 3);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   SDVTList VTs, const SDValue *Ops,
+                                   unsigned NumOps) {
+  N = MorphNodeTo(N, ~MachineOpc, VTs, Ops, NumOps);
+  // Reset the NodeID to -1.
+  N->setNodeId(-1);
+  return N;
+}
+
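+// A sketch of how instruction selection typically uses these wrappers;
+// CurDAG, TargetOpc and the operands are assumed from the selector:
+//
+//   SDNode *New = CurDAG->SelectNodeTo(N, TargetOpc, MVT::i32, Op0, Op1);
+//   // N is morphed in place into the machine node, with NodeID reset to -1.
+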
+/// MorphNodeTo - This *mutates* the specified node to have the specified
+/// return type, opcode, and operands.
+///
+/// Note that MorphNodeTo returns the resultant node.  If there is already a
+/// node of the specified opcode and operands, it returns that node instead of
+/// the current one.  Note that the DebugLoc need not be the same.
+///
+/// Using MorphNodeTo is faster than creating a new node and swapping it in
+/// with ReplaceAllUsesWith both because it often avoids allocating a new
+/// node, and because it doesn't require CSE recalculation for any of
+/// the node's users.
+///
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+                                  SDVTList VTs, const SDValue *Ops,
+                                  unsigned NumOps) {
+  // If an identical node already exists, use it.
+  void *IP = 0;
+  if (VTs.VTs[VTs.NumVTs-1] != MVT::Flag) {
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opc, VTs, Ops, NumOps);
+    if (SDNode *ON = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return ON;
+  }
+
+  if (!RemoveNodeFromCSEMaps(N))
+    IP = 0;
+
+  // Start the morphing.
+  N->NodeType = Opc;
+  N->ValueList = VTs.VTs;
+  N->NumValues = VTs.NumVTs;
+
+  // Clear the operands list, updating used nodes to remove this from their
+  // use list.  Keep track of any operands that become dead as a result.
+  SmallPtrSet<SDNode*, 16> DeadNodeSet;
+  for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
+    SDUse &Use = *I++;
+    SDNode *Used = Use.getNode();
+    Use.set(SDValue());
+    if (Used->use_empty())
+      DeadNodeSet.insert(Used);
+  }
+
+  if (MachineSDNode *MN = dyn_cast<MachineSDNode>(N)) {
+    // Initialize the memory references information.
+    MN->setMemRefs(0, 0);
+    // If NumOps is larger than the # of operands we can have in a
+    // MachineSDNode, reallocate the operand list.
+    if (NumOps > MN->NumOperands || !MN->OperandsNeedDelete) {
+      if (MN->OperandsNeedDelete)
+        delete[] MN->OperandList;
+      if (NumOps > array_lengthof(MN->LocalOperands))
+        // We're creating a final node that will live unmorphed for the
+        // remainder of the current SelectionDAG iteration, so we can allocate
+        // the operands directly out of a pool with no recycling metadata.
+        MN->InitOperands(OperandAllocator.Allocate<SDUse>(NumOps),
+                         Ops, NumOps);
+      else
+        MN->InitOperands(MN->LocalOperands, Ops, NumOps);
+      MN->OperandsNeedDelete = false;
+    } else
+      MN->InitOperands(MN->OperandList, Ops, NumOps);
+  } else {
+    // If NumOps is larger than the # of operands we currently have, reallocate
+    // the operand list.
+    if (NumOps > N->NumOperands) {
+      if (N->OperandsNeedDelete)
+        delete[] N->OperandList;
+      N->InitOperands(new SDUse[NumOps], Ops, NumOps);
+      N->OperandsNeedDelete = true;
+    } else
+      N->InitOperands(N->OperandList, Ops, NumOps);
+  }
+
+  // Delete any nodes that are still dead after adding the uses for the
+  // new operands.
+  if (!DeadNodeSet.empty()) {
+    SmallVector<SDNode *, 16> DeadNodes;
+    for (SmallPtrSet<SDNode *, 16>::iterator I = DeadNodeSet.begin(),
+         E = DeadNodeSet.end(); I != E; ++I)
+      if ((*I)->use_empty())
+        DeadNodes.push_back(*I);
+    RemoveDeadNodes(DeadNodes);
+  }
+
+  if (IP)
+    CSEMap.InsertNode(N, IP);   // Memoize the new node.
+  return N;
+}
+
+
+/// getMachineNode - These are used for target selectors to create a new node
+/// with specified return type(s), MachineInstr opcode, and operands.
+///
+/// Note that getMachineNode returns the resultant node.  If there is already a
+/// node of the specified opcode and operands, it returns that node instead of
+/// the current one.
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT) {
+  SDVTList VTs = getVTList(VT);
+  return getMachineNode(Opcode, dl, VTs, 0, 0);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT, SDValue Op1) {
+  SDVTList VTs = getVTList(VT);
+  SDValue Ops[] = { Op1 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT,
+                             SDValue Op1, SDValue Op2) {
+  SDVTList VTs = getVTList(VT);
+  SDValue Ops[] = { Op1, Op2 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT,
+                             SDValue Op1, SDValue Op2, SDValue Op3) {
+  SDVTList VTs = getVTList(VT);
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT,
+                             const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(VT);
+  return getMachineNode(Opcode, dl, VTs, Ops, NumOps);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT1, EVT VT2) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  return getMachineNode(Opcode, dl, VTs, 0, 0);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2, SDValue Op1) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  SDValue Ops[] = { Op1 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2, SDValue Op1, SDValue Op2) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  SDValue Ops[] = { Op1, Op2 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2, SDValue Op1,
+                             SDValue Op2, SDValue Op3) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2,
+                             const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  return getMachineNode(Opcode, dl, VTs, Ops, NumOps);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2, EVT VT3,
+                             SDValue Op1, SDValue Op2) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3);
+  SDValue Ops[] = { Op1, Op2 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2, EVT VT3,
+                             SDValue Op1, SDValue Op2, SDValue Op3) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3);
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2, EVT VT3,
+                             const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3);
+  return getMachineNode(Opcode, dl, VTs, Ops, NumOps);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT1,
+                             EVT VT2, EVT VT3, EVT VT4,
+                             const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3, VT4);
+  return getMachineNode(Opcode, dl, VTs, Ops, NumOps);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             const std::vector<EVT> &ResultTys,
+                             const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(&ResultTys[0], ResultTys.size());
+  return getMachineNode(Opcode, dl, VTs, Ops, NumOps);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc DL, SDVTList VTs,
+                             const SDValue *Ops, unsigned NumOps) {
+  bool DoCSE = VTs.VTs[VTs.NumVTs-1] != MVT::Flag;
+  MachineSDNode *N;
+  void *IP;
+
+  if (DoCSE) {
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, ~Opcode, VTs, Ops, NumOps);
+    IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return cast<MachineSDNode>(E);
+  }
+
+  // Allocate a new MachineSDNode.
+  N = new (NodeAllocator) MachineSDNode(~Opcode, DL, VTs);
+
+  // Initialize the operands list.
+  if (NumOps > array_lengthof(N->LocalOperands))
+    // We're creating a final node that will live unmorphed for the
+    // remainder of the current SelectionDAG iteration, so we can allocate
+    // the operands directly out of a pool with no recycling metadata.
+    N->InitOperands(OperandAllocator.Allocate<SDUse>(NumOps),
+                    Ops, NumOps);
+  else
+    N->InitOperands(N->LocalOperands, Ops, NumOps);
+  N->OperandsNeedDelete = false;
+
+  if (DoCSE)
+    CSEMap.InsertNode(N, IP);
+
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifyNode(N);
+#endif
+  return N;
+}
+
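+// A sketch of direct machine-node creation, using the generic
+// TargetOpcode::IMPLICIT_DEF opcode as a stand-in for a real instruction:
+//
+//   MachineSDNode *MN =
+//     CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::i32);
+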
+/// getTargetExtractSubreg - A convenience function for creating
+/// TargetOpcode::EXTRACT_SUBREG nodes.
+SDValue
+SelectionDAG::getTargetExtractSubreg(int SRIdx, DebugLoc DL, EVT VT,
+                                     SDValue Operand) {
+  SDValue SRIdxVal = getTargetConstant(SRIdx, MVT::i32);
+  SDNode *Subreg = getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+                                  VT, Operand, SRIdxVal);
+  return SDValue(Subreg, 0);
+}
+
+/// getTargetInsertSubreg - A convenience function for creating
+/// TargetOpcode::INSERT_SUBREG nodes.
+SDValue
+SelectionDAG::getTargetInsertSubreg(int SRIdx, DebugLoc DL, EVT VT,
+                                    SDValue Operand, SDValue Subreg) {
+  SDValue SRIdxVal = getTargetConstant(SRIdx, MVT::i32);
+  SDNode *Result = getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
+                                  VT, Operand, Subreg, SRIdxVal);
+  return SDValue(Result, 0);
+}
+
+/// getNodeIfExists - Get the specified node if it's already available, or
+/// else return NULL.
+SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
+                                      const SDValue *Ops, unsigned NumOps) {
+  if (VTList.VTs[VTList.NumVTs-1] != MVT::Flag) {
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
+    void *IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return E;
+  }
+  return NULL;
+}
+
+/// getDbgValue - Creates a SDDbgValue node.
+///
+SDDbgValue *
+SelectionDAG::getDbgValue(MDNode *MDPtr, SDNode *N, unsigned R, uint64_t Off,
+                          DebugLoc DL, unsigned O) {
+  return new (Allocator) SDDbgValue(MDPtr, N, R, Off, DL, O);
+}
+
+SDDbgValue *
+SelectionDAG::getDbgValue(MDNode *MDPtr, const Value *C, uint64_t Off,
+                          DebugLoc DL, unsigned O) {
+  return new (Allocator) SDDbgValue(MDPtr, C, Off, DL, O);
+}
+
+SDDbgValue *
+SelectionDAG::getDbgValue(MDNode *MDPtr, unsigned FI, uint64_t Off,
+                          DebugLoc DL, unsigned O) {
+  return new (Allocator) SDDbgValue(MDPtr, FI, Off, DL, O);
+}
+
+namespace {
+
+/// RAUWUpdateListener - Helper for ReplaceAllUsesWith - When the node
+/// pointed to by a use iterator is deleted, increment the use iterator
+/// so that it doesn't dangle.
+///
+/// This class also manages a "downlink" DAGUpdateListener, to forward
+/// messages to ReplaceAllUsesWith's callers.
+///
+class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener {
+  SelectionDAG::DAGUpdateListener *DownLink;
+  SDNode::use_iterator &UI;
+  SDNode::use_iterator &UE;
+
+  virtual void NodeDeleted(SDNode *N, SDNode *E) {
+    // Increment the iterator as needed.
+    while (UI != UE && N == *UI)
+      ++UI;
+
+    // Then forward the message.
+    if (DownLink) DownLink->NodeDeleted(N, E);
+  }
+
+  virtual void NodeUpdated(SDNode *N) {
+    // Just forward the message.
+    if (DownLink) DownLink->NodeUpdated(N);
+  }
+
+public:
+  RAUWUpdateListener(SelectionDAG::DAGUpdateListener *dl,
+                     SDNode::use_iterator &ui,
+                     SDNode::use_iterator &ue)
+    : DownLink(dl), UI(ui), UE(ue) {}
+};
+
+}
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version assumes From has a single result value.
+///
+void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To,
+                                      DAGUpdateListener *UpdateListener) {
+  SDNode *From = FromN.getNode();
+  assert(From->getNumValues() == 1 && FromN.getResNo() == 0 &&
+         "Cannot replace with this method!");
+  assert(From != To.getNode() && "Cannot replace uses of a node with itself!");
+
+  // Iterate over all the existing uses of From. New uses will be added
+  // to the beginning of the use list, which we avoid visiting.
+  // This specifically avoids visiting uses of From that arise while the
+  // replacement is happening, because any such uses would be the result
+  // of CSE: If an existing node looks like From after one of its operands
+  // is replaced by To, we don't want to replace all of its users with To
+  // too. See PR3018 for more info.
+  SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+  RAUWUpdateListener Listener(UpdateListener, UI, UE);
+  while (UI != UE) {
+    SDNode *User = *UI;
+
+    // This node is about to morph, remove its old self from the CSE maps.
+    RemoveNodeFromCSEMaps(User);
+
+    // A user can appear in a use list multiple times, and when this
+    // happens the uses are usually next to each other in the list.
+    // To help reduce the number of CSE recomputations, process all
+    // the uses of this user that we can find this way.
+    do {
+      SDUse &Use = UI.getUse();
+      ++UI;
+      Use.set(To);
+    } while (UI != UE && *UI == User);
+
+    // Now that we have modified User, add it back to the CSE maps.  If it
+    // already exists there, recursively merge the results together.
+    AddModifiedNodeToCSEMaps(User, &Listener);
+  }
+}
+
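+// A sketch of the common fold this enables: point every user of a node at
+// its input and let CSE merge the results (a null listener is permitted):
+//
+//   DAG.ReplaceAllUsesWith(SDValue(N, 0), N->getOperand(0), 0);
+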
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version assumes that for each value of From, there is a
+/// corresponding value in To in the same position with the same type.
+///
+void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To,
+                                      DAGUpdateListener *UpdateListener) {
+#ifndef NDEBUG
+  for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
+    assert((!From->hasAnyUseOfValue(i) ||
+            From->getValueType(i) == To->getValueType(i)) &&
+           "Cannot use this version of ReplaceAllUsesWith!");
+#endif
+
+  // Handle the trivial case.
+  if (From == To)
+    return;
+
+  // Iterate over just the existing users of From. See the comments in
+  // the ReplaceAllUsesWith above.
+  SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+  RAUWUpdateListener Listener(UpdateListener, UI, UE);
+  while (UI != UE) {
+    SDNode *User = *UI;
+
+    // This node is about to morph, remove its old self from the CSE maps.
+    RemoveNodeFromCSEMaps(User);
+
+    // A user can appear in a use list multiple times, and when this
+    // happens the uses are usually next to each other in the list.
+    // To help reduce the number of CSE recomputations, process all
+    // the uses of this user that we can find this way.
+    do {
+      SDUse &Use = UI.getUse();
+      ++UI;
+      Use.setNode(To);
+    } while (UI != UE && *UI == User);
+
+    // Now that we have modified User, add it back to the CSE maps.  If it
+    // already exists there, recursively merge the results together.
+    AddModifiedNodeToCSEMaps(User, &Listener);
+  }
+}
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version can replace From with any result values.  To must match the
+/// number and types of values returned by From.
+void SelectionDAG::ReplaceAllUsesWith(SDNode *From,
+                                      const SDValue *To,
+                                      DAGUpdateListener *UpdateListener) {
+  if (From->getNumValues() == 1)  // Handle the simple case efficiently.
+    return ReplaceAllUsesWith(SDValue(From, 0), To[0], UpdateListener);
+
+  // Iterate over just the existing users of From. See the comments in
+  // the ReplaceAllUsesWith above.
+  SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+  RAUWUpdateListener Listener(UpdateListener, UI, UE);
+  while (UI != UE) {
+    SDNode *User = *UI;
+
+    // This node is about to morph, remove its old self from the CSE maps.
+    RemoveNodeFromCSEMaps(User);
+
+    // A user can appear in a use list multiple times, and when this
+    // happens the uses are usually next to each other in the list.
+    // To help reduce the number of CSE recomputations, process all
+    // the uses of this user that we can find this way.
+    do {
+      SDUse &Use = UI.getUse();
+      const SDValue &ToOp = To[Use.getResNo()];
+      ++UI;
+      Use.set(ToOp);
+    } while (UI != UE && *UI == User);
+
+    // Now that we have modified User, add it back to the CSE maps.  If it
+    // already exists there, recursively merge the results together.
+    AddModifiedNodeToCSEMaps(User, &Listener);
+  }
+}
+
+/// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving
+/// uses of other values produced by From.getNode() alone.
+void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To,
+                                             DAGUpdateListener *UpdateListener){
+  // Handle the really simple, really trivial case efficiently.
+  if (From == To) return;
+
+  // Handle the simple, trivial, case efficiently.
+  if (From.getNode()->getNumValues() == 1) {
+    ReplaceAllUsesWith(From, To, UpdateListener);
+    return;
+  }
+
+  // Iterate over just the existing users of From. See the comments in
+  // the ReplaceAllUsesWith above.
+  SDNode::use_iterator UI = From.getNode()->use_begin(),
+                       UE = From.getNode()->use_end();
+  RAUWUpdateListener Listener(UpdateListener, UI, UE);
+  while (UI != UE) {
+    SDNode *User = *UI;
+    bool UserRemovedFromCSEMaps = false;
+
+    // A user can appear in a use list multiple times, and when this
+    // happens the uses are usually next to each other in the list.
+    // To help reduce the number of CSE recomputations, process all
+    // the uses of this user that we can find this way.
+    do {
+      SDUse &Use = UI.getUse();
+
+      // Skip uses of different values from the same node.
+      if (Use.getResNo() != From.getResNo()) {
+        ++UI;
+        continue;
+      }
+
+      // If this node hasn't been modified yet, it's still in the CSE maps,
+      // so remove its old self from the CSE maps.
+      if (!UserRemovedFromCSEMaps) {
+        RemoveNodeFromCSEMaps(User);
+        UserRemovedFromCSEMaps = true;
+      }
+
+      ++UI;
+      Use.set(To);
+    } while (UI != UE && *UI == User);
+
+    // We are iterating over all uses of the From node, so if a use
+    // doesn't use the specific value, no changes are made.
+    if (!UserRemovedFromCSEMaps)
+      continue;
+
+    // Now that we have modified User, add it back to the CSE maps.  If it
+    // already exists there, recursively merge the results together.
+    AddModifiedNodeToCSEMaps(User, &Listener);
+  }
+}
+
+namespace {
+  /// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith
+  /// to record information about a use.
+  struct UseMemo {
+    SDNode *User;
+    unsigned Index;
+    SDUse *Use;
+  };
+
+  /// operator< - Sort Memos by User.
+  bool operator<(const UseMemo &L, const UseMemo &R) {
+    return (intptr_t)L.User < (intptr_t)R.User;
+  }
+}
+
+/// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving
+/// uses of other values produced by From.getNode() alone.  The same value
+/// may appear in both the From and To list.
+void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
+                                              const SDValue *To,
+                                              unsigned Num,
+                                              DAGUpdateListener *UpdateListener){
+  // Handle the simple, trivial case efficiently.
+  if (Num == 1)
+    return ReplaceAllUsesOfValueWith(*From, *To, UpdateListener);
+
+  // Read up all the uses and make records of them. This helps
+  // processing new uses that are introduced during the
+  // replacement process.
+  SmallVector<UseMemo, 4> Uses;
+  for (unsigned i = 0; i != Num; ++i) {
+    unsigned FromResNo = From[i].getResNo();
+    SDNode *FromNode = From[i].getNode();
+    for (SDNode::use_iterator UI = FromNode->use_begin(),
+         E = FromNode->use_end(); UI != E; ++UI) {
+      SDUse &Use = UI.getUse();
+      if (Use.getResNo() == FromResNo) {
+        UseMemo Memo = { *UI, i, &Use };
+        Uses.push_back(Memo);
+      }
+    }
+  }
+
+  // Sort the uses, so that all the uses from a given User are together.
+  std::sort(Uses.begin(), Uses.end());
+
+  for (unsigned UseIndex = 0, UseIndexEnd = Uses.size();
+       UseIndex != UseIndexEnd; ) {
+    // We know that this user uses some value of From.  If it is the right
+    // value, update it.
+    SDNode *User = Uses[UseIndex].User;
+
+    // This node is about to morph, remove its old self from the CSE maps.
+    RemoveNodeFromCSEMaps(User);
+
+    // The Uses array is sorted, so all the uses for a given User
+    // are next to each other in the list.
+    // To help reduce the number of CSE recomputations, process all
+    // the uses of this user that we can find this way.
+    do {
+      unsigned i = Uses[UseIndex].Index;
+      SDUse &Use = *Uses[UseIndex].Use;
+      ++UseIndex;
+
+      Use.set(To[i]);
+    } while (UseIndex != UseIndexEnd && Uses[UseIndex].User == User);
+
+    // Now that we have modified User, add it back to the CSE maps.  If it
+    // already exists there, recursively merge the results together.
+    AddModifiedNodeToCSEMaps(User, UpdateListener);
+  }
+}
+
+/// AssignTopologicalOrder - Assign a unique node id for each node in the DAG
+/// based on their topological order, sorting the AllNodes list in place.  It
+/// returns the number of ids assigned (the maximum id plus one).
+unsigned SelectionDAG::AssignTopologicalOrder() {
+
+  unsigned DAGSize = 0;
+
+  // SortedPos tracks the progress of the algorithm. Nodes before it are
+  // sorted, nodes after it are unsorted. When the algorithm completes
+  // it is at the end of the list.
+  allnodes_iterator SortedPos = allnodes_begin();
+
+  // Visit all the nodes. Move nodes with no operands to the front of
+  // the list immediately. Annotate nodes that do have operands with their
+  // operand count. Before we do this, the Node Id fields of the nodes
+  // may contain arbitrary values. After, the Node Id fields for nodes
+  // before SortedPos will contain the topological sort index, and the
+  // Node Id fields for nodes at SortedPos and after will contain the
+  // count of outstanding operands.
+  for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) {
+    SDNode *N = I++;
+    checkForCycles(N);
+    unsigned Degree = N->getNumOperands();
+    if (Degree == 0) {
+      // A node with no operands; move it into sorted position immediately.
+      N->setNodeId(DAGSize++);
+      allnodes_iterator Q = N;
+      if (Q != SortedPos)
+        SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q));
+      assert(SortedPos != AllNodes.end() && "Overran node list");
+      ++SortedPos;
+    } else {
+      // Temporarily use the Node Id as scratch space for the degree count.
+      N->setNodeId(Degree);
+    }
+  }
+
+  // Visit all the nodes. As we iterate, move nodes into sorted order,
+  // such that by the time the end is reached all nodes will be sorted.
+  for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ++I) {
+    SDNode *N = I;
+    checkForCycles(N);
+    // N is in sorted position, so all its uses have one less operand
+    // that needs to be sorted.
+    for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+         UI != UE; ++UI) {
+      SDNode *P = *UI;
+      unsigned Degree = P->getNodeId();
+      assert(Degree != 0 && "Invalid node degree");
+      --Degree;
+      if (Degree == 0) {
+        // All of P's operands are sorted, so P may be sorted now.
+        P->setNodeId(DAGSize++);
+        if (P != SortedPos)
+          SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P));
+        assert(SortedPos != AllNodes.end() && "Overran node list");
+        ++SortedPos;
+      } else {
+        // Update P's outstanding operand count.
+        P->setNodeId(Degree);
+      }
+    }
+    if (I == SortedPos) {
+#ifndef NDEBUG
+      SDNode *S = ++I;
+      dbgs() << "Overran sorted position:\n";
+      S->dumprFull();
+#endif
+      llvm_unreachable(0);
+    }
+  }
+
+  assert(SortedPos == AllNodes.end() &&
+         "Topological sort incomplete!");
+  assert(AllNodes.front().getOpcode() == ISD::EntryToken &&
+         "First node in topological sort is not the entry token!");
+  assert(AllNodes.front().getNodeId() == 0 &&
+         "First node in topological sort has non-zero id!");
+  assert(AllNodes.front().getNumOperands() == 0 &&
+         "First node in topological sort has operands!");
+  assert(AllNodes.back().getNodeId() == (int)DAGSize-1 &&
+         "Last node in topological sort has unexpected id!");
+  assert(AllNodes.back().use_empty() &&
+         "Last node in topological sort has users!");
+  assert(DAGSize == allnodes_size() && "Node count mismatch!");
+  return DAGSize;
+}
+
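+// The two passes above amount to Kahn's algorithm run over the use lists:
+// the first pass moves zero-operand nodes (the entry token among them) into
+// sorted position and stashes each remaining node's operand count in its
+// NodeId; the second pass decrements that count in each user as its operands
+// become sorted, appending the user once the count reaches zero.
+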
+/// AssignOrdering - Assign an order to the SDNode.
+void SelectionDAG::AssignOrdering(const SDNode *SD, unsigned Order) {
+  assert(SD && "Trying to assign an order to a null node!");
+  Ordering->add(SD, Order);
+}
+
+/// GetOrdering - Get the order for the SDNode.
+unsigned SelectionDAG::GetOrdering(const SDNode *SD) const {
+  assert(SD && "Trying to get the order of a null node!");
+  return Ordering->getOrder(SD);
+}
+
+/// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the
+/// value is produced by SD.
+void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) {
+  DbgInfo->add(DB, SD, isParameter);
+  if (SD)
+    SD->setHasDebugValue(true);
+}
+
+//===----------------------------------------------------------------------===//
+//                              SDNode Class
+//===----------------------------------------------------------------------===//
+
+HandleSDNode::~HandleSDNode() {
+  DropOperands();
+}
+
+GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, DebugLoc DL,
+                                         const GlobalValue *GA,
+                                         EVT VT, int64_t o, unsigned char TF)
+  : SDNode(Opc, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
+  TheGlobal = GA;
+}
+
+MemSDNode::MemSDNode(unsigned Opc, DebugLoc dl, SDVTList VTs, EVT memvt,
+                     MachineMemOperand *mmo)
+ : SDNode(Opc, dl, VTs), MemoryVT(memvt), MMO(mmo) {
+  SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(),
+                                      MMO->isNonTemporal());
+  assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!");
+  assert(isNonTemporal() == MMO->isNonTemporal() &&
+         "Non-temporal encoding error!");
+  assert(memvt.getStoreSize() == MMO->getSize() && "Size mismatch!");
+}
+
+MemSDNode::MemSDNode(unsigned Opc, DebugLoc dl, SDVTList VTs,
+                     const SDValue *Ops, unsigned NumOps, EVT memvt, 
+                     MachineMemOperand *mmo)
+   : SDNode(Opc, dl, VTs, Ops, NumOps),
+     MemoryVT(memvt), MMO(mmo) {
+  SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(),
+                                      MMO->isNonTemporal());
+  assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!");
+  assert(memvt.getStoreSize() == MMO->getSize() && "Size mismatch!");
+}
+
+/// Profile - Gather unique data for the node.
+///
+void SDNode::Profile(FoldingSetNodeID &ID) const {
+  AddNodeIDNode(ID, this);
+}
+
+namespace {
+  struct EVTArray {
+    std::vector<EVT> VTs;
+    
+    EVTArray() {
+      VTs.reserve(MVT::LAST_VALUETYPE);
+      for (unsigned i = 0; i < MVT::LAST_VALUETYPE; ++i)
+        VTs.push_back(MVT((MVT::SimpleValueType)i));
+    }
+  };
+}
+
+static ManagedStatic<std::set<EVT, EVT::compareRawBits> > EVTs;
+static ManagedStatic<EVTArray> SimpleVTArray;
+static ManagedStatic<sys::SmartMutex<true> > VTMutex;
+
+/// getValueTypeList - Return a pointer to the specified value type.
+///
+const EVT *SDNode::getValueTypeList(EVT VT) {
+  if (VT.isExtended()) {
+    sys::SmartScopedLock<true> Lock(*VTMutex);
+    return &(*EVTs->insert(VT).first);
+  } else {
+    assert(VT.getSimpleVT().SimpleTy < MVT::LAST_VALUETYPE &&
+           "Value type out of range!");
+    return &SimpleVTArray->VTs[VT.getSimpleVT().SimpleTy];
+  }
+}
+
+/// hasNUsesOfValue - Return true if there are exactly NUSES uses of the
+/// indicated value.  This method ignores uses of other values defined by this
+/// operation.
+bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const {
+  assert(Value < getNumValues() && "Bad value!");
+
+  // TODO: Only iterate over uses of a given value of the node
+  for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
+    if (UI.getUse().getResNo() == Value) {
+      if (NUses == 0)
+        return false;
+      --NUses;
+    }
+  }
+
+  // Found exactly the right number of uses?
+  return NUses == 0;
+}
+
+
+/// hasAnyUseOfValue - Return true if there are any uses of the indicated
+/// value. This method ignores uses of other values defined by this operation.
+bool SDNode::hasAnyUseOfValue(unsigned Value) const {
+  assert(Value < getNumValues() && "Bad value!");
+
+  for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI)
+    if (UI.getUse().getResNo() == Value)
+      return true;
+
+  return false;
+}
+
+
+/// isOnlyUserOf - Return true if this node is the only use of N.
+///
+bool SDNode::isOnlyUserOf(SDNode *N) const {
+  bool Seen = false;
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    SDNode *User = *I;
+    if (User == this)
+      Seen = true;
+    else
+      return false;
+  }
+
+  return Seen;
+}
+
+/// isOperandOf - Return true if this value is an operand of N.
+///
+bool SDValue::isOperandOf(SDNode *N) const {
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    if (*this == N->getOperand(i))
+      return true;
+  return false;
+}
+
+bool SDNode::isOperandOf(SDNode *N) const {
+  for (unsigned i = 0, e = N->NumOperands; i != e; ++i)
+    if (this == N->OperandList[i].getNode())
+      return true;
+  return false;
+}
+
+/// reachesChainWithoutSideEffects - Return true if this operand (which must
+/// be a chain) reaches the specified operand without crossing any
+/// side-effecting instructions.  In practice, this looks through token
+/// factors and non-volatile loads.  In order to remain efficient, this only
+/// looks a couple of nodes in; it does not do an exhaustive search.
+bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
+                                               unsigned Depth) const {
+  if (*this == Dest) return true;
+
+  // Don't search too deeply, we just want to be able to see through
+  // TokenFactor's etc.
+  if (Depth == 0) return false;
+
+  // If this is a token factor, all inputs to the TF happen in parallel.  If any
+  // of the operands of the TF reach dest, then we can do the xform.
+  if (getOpcode() == ISD::TokenFactor) {
+    for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+      if (getOperand(i).reachesChainWithoutSideEffects(Dest, Depth-1))
+        return true;
+    return false;
+  }
+
+  // Loads don't have side effects, look through them.
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(*this)) {
+    if (!Ld->isVolatile())
+      return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1);
+  }
+  return false;
+}
+
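+// A sketch of the intended query, with LD and Chain assumed from a combine:
+//
+//   // Can Chain reach LD's output chain within two hops of token factors
+//   // and non-volatile loads?
+//   bool Safe = Chain.reachesChainWithoutSideEffects(SDValue(LD, 1), 2);
+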
+/// isPredecessorOf - Return true if this node is a predecessor of N, i.e. it
+/// is either an operand of N or can be reached by traversing up the operands.
+/// NOTE: this is an expensive method. Use it carefully.
+bool SDNode::isPredecessorOf(SDNode *N) const {
+  SmallPtrSet<SDNode *, 32> Visited;
+  SmallVector<SDNode *, 16> Worklist;
+  Worklist.push_back(N);
+
+  do {
+    N = Worklist.pop_back_val();
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+      SDNode *Op = N->getOperand(i).getNode();
+      if (Op == this)
+        return true;
+      if (Visited.insert(Op))
+        Worklist.push_back(Op);
+    }
+  } while (!Worklist.empty());
+
+  return false;
+}
+
+uint64_t SDNode::getConstantOperandVal(unsigned Num) const {
+  assert(Num < NumOperands && "Invalid child # of SDNode!");
+  return cast<ConstantSDNode>(OperandList[Num])->getZExtValue();
+}
+
+std::string SDNode::getOperationName(const SelectionDAG *G) const {
+  switch (getOpcode()) {
+  default:
+    if (getOpcode() < ISD::BUILTIN_OP_END)
+      return "<<Unknown DAG Node>>";
+    if (isMachineOpcode()) {
+      if (G)
+        if (const TargetInstrInfo *TII = G->getTarget().getInstrInfo())
+          if (getMachineOpcode() < TII->getNumOpcodes())
+            return TII->get(getMachineOpcode()).getName();
+      return "<<Unknown Machine Node #" + utostr(getOpcode()) + ">>";
+    }
+    if (G) {
+      const TargetLowering &TLI = G->getTargetLoweringInfo();
+      const char *Name = TLI.getTargetNodeName(getOpcode());
+      if (Name) return Name;
+      return "<<Unknown Target Node #" + utostr(getOpcode()) + ">>";
+    }
+    return "<<Unknown Node #" + utostr(getOpcode()) + ">>";
+
+#ifndef NDEBUG
+  case ISD::DELETED_NODE:
+    return "<<Deleted Node!>>";
+#endif
+  case ISD::PREFETCH:      return "Prefetch";
+  case ISD::MEMBARRIER:    return "MemBarrier";
+  case ISD::ATOMIC_CMP_SWAP:    return "AtomicCmpSwap";
+  case ISD::ATOMIC_SWAP:        return "AtomicSwap";
+  case ISD::ATOMIC_LOAD_ADD:    return "AtomicLoadAdd";
+  case ISD::ATOMIC_LOAD_SUB:    return "AtomicLoadSub";
+  case ISD::ATOMIC_LOAD_AND:    return "AtomicLoadAnd";
+  case ISD::ATOMIC_LOAD_OR:     return "AtomicLoadOr";
+  case ISD::ATOMIC_LOAD_XOR:    return "AtomicLoadXor";
+  case ISD::ATOMIC_LOAD_NAND:   return "AtomicLoadNand";
+  case ISD::ATOMIC_LOAD_MIN:    return "AtomicLoadMin";
+  case ISD::ATOMIC_LOAD_MAX:    return "AtomicLoadMax";
+  case ISD::ATOMIC_LOAD_UMIN:   return "AtomicLoadUMin";
+  case ISD::ATOMIC_LOAD_UMAX:   return "AtomicLoadUMax";
+  case ISD::PCMARKER:      return "PCMarker";
+  case ISD::READCYCLECOUNTER: return "ReadCycleCounter";
+  case ISD::SRCVALUE:      return "SrcValue";
+  case ISD::MDNODE_SDNODE: return "MDNode";
+  case ISD::EntryToken:    return "EntryToken";
+  case ISD::TokenFactor:   return "TokenFactor";
+  case ISD::AssertSext:    return "AssertSext";
+  case ISD::AssertZext:    return "AssertZext";
+
+  case ISD::BasicBlock:    return "BasicBlock";
+  case ISD::VALUETYPE:     return "ValueType";
+  case ISD::Register:      return "Register";
+
+  case ISD::Constant:      return "Constant";
+  case ISD::ConstantFP:    return "ConstantFP";
+  case ISD::GlobalAddress: return "GlobalAddress";
+  case ISD::GlobalTLSAddress: return "GlobalTLSAddress";
+  case ISD::FrameIndex:    return "FrameIndex";
+  case ISD::JumpTable:     return "JumpTable";
+  case ISD::GLOBAL_OFFSET_TABLE: return "GLOBAL_OFFSET_TABLE";
+  case ISD::RETURNADDR: return "RETURNADDR";
+  case ISD::FRAMEADDR: return "FRAMEADDR";
+  case ISD::FRAME_TO_ARGS_OFFSET: return "FRAME_TO_ARGS_OFFSET";
+  case ISD::LSDAADDR: return "LSDAADDR";
+  case ISD::ConstantPool:  return "ConstantPool";
+  case ISD::ExternalSymbol: return "ExternalSymbol";
+  case ISD::BlockAddress:  return "BlockAddress";
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned OpNo = getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 0 : 1;
+    unsigned IID = cast<ConstantSDNode>(getOperand(OpNo))->getZExtValue();
+    if (IID < Intrinsic::num_intrinsics)
+      return Intrinsic::getName((Intrinsic::ID)IID);
+    else if (const TargetIntrinsicInfo *TII = G->getTarget().getIntrinsicInfo())
+      return TII->getName(IID);
+    llvm_unreachable("Invalid intrinsic ID");
+  }
+
+  case ISD::BUILD_VECTOR:   return "BUILD_VECTOR";
+  case ISD::TargetConstant: return "TargetConstant";
+  case ISD::TargetConstantFP:return "TargetConstantFP";
+  case ISD::TargetGlobalAddress: return "TargetGlobalAddress";
+  case ISD::TargetGlobalTLSAddress: return "TargetGlobalTLSAddress";
+  case ISD::TargetFrameIndex: return "TargetFrameIndex";
+  case ISD::TargetJumpTable:  return "TargetJumpTable";
+  case ISD::TargetConstantPool:  return "TargetConstantPool";
+  case ISD::TargetExternalSymbol: return "TargetExternalSymbol";
+  case ISD::TargetBlockAddress: return "TargetBlockAddress";
+
+  case ISD::CopyToReg:     return "CopyToReg";
+  case ISD::CopyFromReg:   return "CopyFromReg";
+  case ISD::UNDEF:         return "undef";
+  case ISD::MERGE_VALUES:  return "merge_values";
+  case ISD::HANDLENODE:    return "handlenode";
+
+  // Unary operators
+  case ISD::FABS:   return "fabs";
+  case ISD::FNEG:   return "fneg";
+  case ISD::FSQRT:  return "fsqrt";
+  case ISD::FSIN:   return "fsin";
+  case ISD::FCOS:   return "fcos";
+  case ISD::FTRUNC: return "ftrunc";
+  case ISD::FFLOOR: return "ffloor";
+  case ISD::FCEIL:  return "fceil";
+  case ISD::FRINT:  return "frint";
+  case ISD::FNEARBYINT: return "fnearbyint";
+  case ISD::FEXP:   return "fexp";
+  case ISD::FEXP2:  return "fexp2";
+  case ISD::FLOG:   return "flog";
+  case ISD::FLOG2:  return "flog2";
+  case ISD::FLOG10: return "flog10";
+
+  // Binary operators
+  case ISD::ADD:    return "add";
+  case ISD::SUB:    return "sub";
+  case ISD::MUL:    return "mul";
+  case ISD::MULHU:  return "mulhu";
+  case ISD::MULHS:  return "mulhs";
+  case ISD::SDIV:   return "sdiv";
+  case ISD::UDIV:   return "udiv";
+  case ISD::SREM:   return "srem";
+  case ISD::UREM:   return "urem";
+  case ISD::SMUL_LOHI:  return "smul_lohi";
+  case ISD::UMUL_LOHI:  return "umul_lohi";
+  case ISD::SDIVREM:    return "sdivrem";
+  case ISD::UDIVREM:    return "udivrem";
+  case ISD::AND:    return "and";
+  case ISD::OR:     return "or";
+  case ISD::XOR:    return "xor";
+  case ISD::SHL:    return "shl";
+  case ISD::SRA:    return "sra";
+  case ISD::SRL:    return "srl";
+  case ISD::ROTL:   return "rotl";
+  case ISD::ROTR:   return "rotr";
+  case ISD::FADD:   return "fadd";
+  case ISD::FSUB:   return "fsub";
+  case ISD::FMUL:   return "fmul";
+  case ISD::FDIV:   return "fdiv";
+  case ISD::FREM:   return "frem";
+  case ISD::FCOPYSIGN: return "fcopysign";
+  case ISD::FGETSIGN:  return "fgetsign";
+  case ISD::FPOW:   return "fpow";
+
+  case ISD::FPOWI:  return "fpowi";
+  case ISD::SETCC:       return "setcc";
+  case ISD::VSETCC:      return "vsetcc";
+  case ISD::SELECT:      return "select";
+  case ISD::SELECT_CC:   return "select_cc";
+  case ISD::INSERT_VECTOR_ELT:   return "insert_vector_elt";
+  case ISD::EXTRACT_VECTOR_ELT:  return "extract_vector_elt";
+  case ISD::CONCAT_VECTORS:      return "concat_vectors";
+  case ISD::EXTRACT_SUBVECTOR:   return "extract_subvector";
+  case ISD::SCALAR_TO_VECTOR:    return "scalar_to_vector";
+  case ISD::VECTOR_SHUFFLE:      return "vector_shuffle";
+  case ISD::CARRY_FALSE:         return "carry_false";
+  case ISD::ADDC:        return "addc";
+  case ISD::ADDE:        return "adde";
+  case ISD::SADDO:       return "saddo";
+  case ISD::UADDO:       return "uaddo";
+  case ISD::SSUBO:       return "ssubo";
+  case ISD::USUBO:       return "usubo";
+  case ISD::SMULO:       return "smulo";
+  case ISD::UMULO:       return "umulo";
+  case ISD::SUBC:        return "subc";
+  case ISD::SUBE:        return "sube";
+  case ISD::SHL_PARTS:   return "shl_parts";
+  case ISD::SRA_PARTS:   return "sra_parts";
+  case ISD::SRL_PARTS:   return "srl_parts";
+
+  // Conversion operators.
+  case ISD::SIGN_EXTEND: return "sign_extend";
+  case ISD::ZERO_EXTEND: return "zero_extend";
+  case ISD::ANY_EXTEND:  return "any_extend";
+  case ISD::SIGN_EXTEND_INREG: return "sign_extend_inreg";
+  case ISD::TRUNCATE:    return "truncate";
+  case ISD::FP_ROUND:    return "fp_round";
+  case ISD::FLT_ROUNDS_: return "flt_rounds";
+  case ISD::FP_ROUND_INREG: return "fp_round_inreg";
+  case ISD::FP_EXTEND:   return "fp_extend";
+
+  case ISD::SINT_TO_FP:  return "sint_to_fp";
+  case ISD::UINT_TO_FP:  return "uint_to_fp";
+  case ISD::FP_TO_SINT:  return "fp_to_sint";
+  case ISD::FP_TO_UINT:  return "fp_to_uint";
+  case ISD::BIT_CONVERT: return "bit_convert";
+  case ISD::FP16_TO_FP32: return "fp16_to_fp32";
+  case ISD::FP32_TO_FP16: return "fp32_to_fp16";
+
+  case ISD::CONVERT_RNDSAT: {
+    switch (cast<CvtRndSatSDNode>(this)->getCvtCode()) {
+    default: llvm_unreachable("Unknown cvt code!");
+    case ISD::CVT_FF:  return "cvt_ff";
+    case ISD::CVT_FS:  return "cvt_fs";
+    case ISD::CVT_FU:  return "cvt_fu";
+    case ISD::CVT_SF:  return "cvt_sf";
+    case ISD::CVT_UF:  return "cvt_uf";
+    case ISD::CVT_SS:  return "cvt_ss";
+    case ISD::CVT_SU:  return "cvt_su";
+    case ISD::CVT_US:  return "cvt_us";
+    case ISD::CVT_UU:  return "cvt_uu";
+    }
+  }
+
+    // Control flow instructions
+  case ISD::BR:      return "br";
+  case ISD::BRIND:   return "brind";
+  case ISD::BR_JT:   return "br_jt";
+  case ISD::BRCOND:  return "brcond";
+  case ISD::BR_CC:   return "br_cc";
+  case ISD::CALLSEQ_START:  return "callseq_start";
+  case ISD::CALLSEQ_END:    return "callseq_end";
+
+    // Other operators
+  case ISD::LOAD:               return "load";
+  case ISD::STORE:              return "store";
+  case ISD::VAARG:              return "vaarg";
+  case ISD::VACOPY:             return "vacopy";
+  case ISD::VAEND:              return "vaend";
+  case ISD::VASTART:            return "vastart";
+  case ISD::DYNAMIC_STACKALLOC: return "dynamic_stackalloc";
+  case ISD::EXTRACT_ELEMENT:    return "extract_element";
+  case ISD::BUILD_PAIR:         return "build_pair";
+  case ISD::STACKSAVE:          return "stacksave";
+  case ISD::STACKRESTORE:       return "stackrestore";
+  case ISD::TRAP:               return "trap";
+
+  // Bit manipulation
+  case ISD::BSWAP:   return "bswap";
+  case ISD::CTPOP:   return "ctpop";
+  case ISD::CTTZ:    return "cttz";
+  case ISD::CTLZ:    return "ctlz";
+
+  // Trampolines
+  case ISD::TRAMPOLINE: return "trampoline";
+
+  case ISD::CONDCODE:
+    switch (cast<CondCodeSDNode>(this)->get()) {
+    default: llvm_unreachable("Unknown setcc condition!");
+    case ISD::SETOEQ:  return "setoeq";
+    case ISD::SETOGT:  return "setogt";
+    case ISD::SETOGE:  return "setoge";
+    case ISD::SETOLT:  return "setolt";
+    case ISD::SETOLE:  return "setole";
+    case ISD::SETONE:  return "setone";
+
+    case ISD::SETO:    return "seto";
+    case ISD::SETUO:   return "setuo";
+    case ISD::SETUEQ:  return "setueq";
+    case ISD::SETUGT:  return "setugt";
+    case ISD::SETUGE:  return "setuge";
+    case ISD::SETULT:  return "setult";
+    case ISD::SETULE:  return "setule";
+    case ISD::SETUNE:  return "setune";
+
+    case ISD::SETEQ:   return "seteq";
+    case ISD::SETGT:   return "setgt";
+    case ISD::SETGE:   return "setge";
+    case ISD::SETLT:   return "setlt";
+    case ISD::SETLE:   return "setle";
+    case ISD::SETNE:   return "setne";
+    }
+  }
+}
+
+const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) {
+  switch (AM) {
+  default:
+    return "";
+  case ISD::PRE_INC:
+    return "<pre-inc>";
+  case ISD::PRE_DEC:
+    return "<pre-dec>";
+  case ISD::POST_INC:
+    return "<post-inc>";
+  case ISD::POST_DEC:
+    return "<post-dec>";
+  }
+}
+
+std::string ISD::ArgFlagsTy::getArgFlagsString() {
+  std::string S = "< ";
+
+  if (isZExt())
+    S += "zext ";
+  if (isSExt())
+    S += "sext ";
+  if (isInReg())
+    S += "inreg ";
+  if (isSRet())
+    S += "sret ";
+  if (isByVal())
+    S += "byval ";
+  if (isNest())
+    S += "nest ";
+  if (getByValAlign())
+    S += "byval-align:" + utostr(getByValAlign()) + " ";
+  if (getOrigAlign())
+    S += "orig-align:" + utostr(getOrigAlign()) + " ";
+  if (getByValSize())
+    S += "byval-size:" + utostr(getByValSize()) + " ";
+  return S + ">";
+}
+
+void SDNode::dump() const { dump(0); }
+void SDNode::dump(const SelectionDAG *G) const {
+  print(dbgs(), G);
+  dbgs() << '\n';
+}
+
+void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const {
+  OS << (void*)this << ": ";
+
+  for (unsigned i = 0, e = getNumValues(); i != e; ++i) {
+    if (i) OS << ",";
+    if (getValueType(i) == MVT::Other)
+      OS << "ch";
+    else
+      OS << getValueType(i).getEVTString();
+  }
+  OS << " = " << getOperationName(G);
+}
+
+void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
+  if (const MachineSDNode *MN = dyn_cast<MachineSDNode>(this)) {
+    if (!MN->memoperands_empty()) {
+      OS << "<";
+      OS << "Mem:";
+      for (MachineSDNode::mmo_iterator i = MN->memoperands_begin(),
+           e = MN->memoperands_end(); i != e; ++i) {
+        OS << **i;
+        if (llvm::next(i) != e)
+          OS << " ";
+      }
+      OS << ">";
+    }
+  } else if (const ShuffleVectorSDNode *SVN =
+               dyn_cast<ShuffleVectorSDNode>(this)) {
+    OS << "<";
+    for (unsigned i = 0, e = ValueList[0].getVectorNumElements(); i != e; ++i) {
+      int Idx = SVN->getMaskElt(i);
+      if (i) OS << ",";
+      if (Idx < 0)
+        OS << "u";
+      else
+        OS << Idx;
+    }
+    OS << ">";
+  } else if (const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(this)) {
+    OS << '<' << CSDN->getAPIntValue() << '>';
+  } else if (const ConstantFPSDNode *CSDN = dyn_cast<ConstantFPSDNode>(this)) {
+    if (&CSDN->getValueAPF().getSemantics()==&APFloat::IEEEsingle)
+      OS << '<' << CSDN->getValueAPF().convertToFloat() << '>';
+    else if (&CSDN->getValueAPF().getSemantics()==&APFloat::IEEEdouble)
+      OS << '<' << CSDN->getValueAPF().convertToDouble() << '>';
+    else {
+      OS << "<APFloat(";
+      CSDN->getValueAPF().bitcastToAPInt().dump();
+      OS << ")>";
+    }
+  } else if (const GlobalAddressSDNode *GADN =
+             dyn_cast<GlobalAddressSDNode>(this)) {
+    int64_t offset = GADN->getOffset();
+    OS << '<';
+    WriteAsOperand(OS, GADN->getGlobal());
+    OS << '>';
+    if (offset > 0)
+      OS << " + " << offset;
+    else
+      OS << " " << offset;
+    if (unsigned int TF = GADN->getTargetFlags())
+      OS << " [TF=" << TF << ']';
+  } else if (const FrameIndexSDNode *FIDN = dyn_cast<FrameIndexSDNode>(this)) {
+    OS << "<" << FIDN->getIndex() << ">";
+  } else if (const JumpTableSDNode *JTDN = dyn_cast<JumpTableSDNode>(this)) {
+    OS << "<" << JTDN->getIndex() << ">";
+    if (unsigned int TF = JTDN->getTargetFlags())
+      OS << " [TF=" << TF << ']';
+  } else if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(this)){
+    int offset = CP->getOffset();
+    if (CP->isMachineConstantPoolEntry())
+      OS << "<" << *CP->getMachineCPVal() << ">";
+    else
+      OS << "<" << *CP->getConstVal() << ">";
+    if (offset > 0)
+      OS << " + " << offset;
+    else
+      OS << " " << offset;
+    if (unsigned int TF = CP->getTargetFlags())
+      OS << " [TF=" << TF << ']';
+  } else if (const BasicBlockSDNode *BBDN = dyn_cast<BasicBlockSDNode>(this)) {
+    OS << "<";
+    const Value *LBB = (const Value*)BBDN->getBasicBlock()->getBasicBlock();
+    if (LBB)
+      OS << LBB->getName() << " ";
+    OS << (const void*)BBDN->getBasicBlock() << ">";
+  } else if (const RegisterSDNode *R = dyn_cast<RegisterSDNode>(this)) {
+    if (G && R->getReg() &&
+        TargetRegisterInfo::isPhysicalRegister(R->getReg())) {
+      OS << " %" << G->getTarget().getRegisterInfo()->getName(R->getReg());
+    } else {
+      OS << " %reg" << R->getReg();
+    }
+  } else if (const ExternalSymbolSDNode *ES =
+             dyn_cast<ExternalSymbolSDNode>(this)) {
+    OS << "'" << ES->getSymbol() << "'";
+    if (unsigned int TF = ES->getTargetFlags())
+      OS << " [TF=" << TF << ']';
+  } else if (const SrcValueSDNode *M = dyn_cast<SrcValueSDNode>(this)) {
+    if (M->getValue())
+      OS << "<" << M->getValue() << ">";
+    else
+      OS << "<null>";
+  } else if (const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(this)) {
+    if (MD->getMD())
+      OS << "<" << MD->getMD() << ">";
+    else
+      OS << "<null>";
+  } else if (const VTSDNode *N = dyn_cast<VTSDNode>(this)) {
+    OS << ":" << N->getVT().getEVTString();
+  } else if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(this)) {
+    OS << "<" << *LD->getMemOperand();
+
+    bool doExt = true;
+    switch (LD->getExtensionType()) {
+    default: doExt = false; break;
+    case ISD::EXTLOAD: OS << ", anyext"; break;
+    case ISD::SEXTLOAD: OS << ", sext"; break;
+    case ISD::ZEXTLOAD: OS << ", zext"; break;
+    }
+    if (doExt)
+      OS << " from " << LD->getMemoryVT().getEVTString();
+
+    const char *AM = getIndexedModeName(LD->getAddressingMode());
+    if (*AM)
+      OS << ", " << AM;
+
+    OS << ">";
+  } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(this)) {
+    OS << "<" << *ST->getMemOperand();
+
+    if (ST->isTruncatingStore())
+      OS << ", trunc to " << ST->getMemoryVT().getEVTString();
+
+    const char *AM = getIndexedModeName(ST->getAddressingMode());
+    if (*AM)
+      OS << ", " << AM;
+    
+    OS << ">";
+  } else if (const MemSDNode* M = dyn_cast<MemSDNode>(this)) {
+    OS << "<" << *M->getMemOperand() << ">";
+  } else if (const BlockAddressSDNode *BA =
+               dyn_cast<BlockAddressSDNode>(this)) {
+    OS << "<";
+    WriteAsOperand(OS, BA->getBlockAddress()->getFunction(), false);
+    OS << ", ";
+    WriteAsOperand(OS, BA->getBlockAddress()->getBasicBlock(), false);
+    OS << ">";
+    if (unsigned int TF = BA->getTargetFlags())
+      OS << " [TF=" << TF << ']';
+  }
+
+  if (G)
+    if (unsigned Order = G->GetOrdering(this))
+      OS << " [ORD=" << Order << ']';
+
+  if (getNodeId() != -1)
+    OS << " [ID=" << getNodeId() << ']';
+
+  DebugLoc dl = getDebugLoc();
+  if (G && !dl.isUnknown()) {
+    DIScope
+      Scope(dl.getScope(G->getMachineFunction().getFunction()->getContext()));
+    OS << " dbg:";
+    // Omit the directory, since it's usually long and uninteresting.
+    if (Scope.Verify())
+      OS << Scope.getFilename();
+    else
+      OS << "<unknown>";
+    OS << ':' << dl.getLine();
+    if (dl.getCol() != 0)
+      OS << ':' << dl.getCol();
+  }
+}
+
+void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const {
+  print_types(OS, G);
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    if (i) OS << ", "; else OS << " ";
+    OS << (void*)getOperand(i).getNode();
+    if (unsigned RN = getOperand(i).getResNo())
+      OS << ":" << RN;
+  }
+  print_details(OS, G);
+}
+
+static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N,
+                                  const SelectionDAG *G, unsigned depth,
+                                  unsigned indent) {
+  if (depth == 0)
+    return;
+
+  OS.indent(indent);
+
+  N->print(OS, G);
+
+  if (depth < 1)
+    return;
+
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    OS << '\n';
+    printrWithDepthHelper(OS, N->getOperand(i).getNode(), G, depth-1, indent+2);
+  }
+}
+
+void SDNode::printrWithDepth(raw_ostream &OS, const SelectionDAG *G,
+                            unsigned depth) const {
+  printrWithDepthHelper(OS, this, G, depth, 0);
+} 
+
+void SDNode::printrFull(raw_ostream &OS, const SelectionDAG *G) const {
+  // Don't print impossibly deep things.
+  printrWithDepth(OS, G, 100);
+}
+
+void SDNode::dumprWithDepth(const SelectionDAG *G, unsigned depth) const {
+  printrWithDepth(dbgs(), G, depth);
+}
+
+void SDNode::dumprFull(const SelectionDAG *G) const {
+  // Don't print impossibly deep things.
+  dumprWithDepth(G, 100);
+} 
+
+static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) {
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    if (N->getOperand(i).getNode()->hasOneUse())
+      DumpNodes(N->getOperand(i).getNode(), indent+2, G);
+    else
+      dbgs() << "\n" << std::string(indent+2, ' ')
+           << (void*)N->getOperand(i).getNode() << ": <multiple use>";
+
+
+  dbgs() << "\n";
+  dbgs().indent(indent);
+  N->dump(G);
+}
+
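+/// UnrollVectorOp - Unroll a single-result vector operation into scalar
+/// operations: each element of every vector operand is extracted, the
+/// operation is applied element-wise, and the results are reassembled with a
+/// BUILD_VECTOR.  If ResNE is 0 the vector is unrolled fully; if it exceeds
+/// the source element count, the extra result elements are undef.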
+SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
+  assert(N->getNumValues() == 1 &&
+         "Can't unroll a vector with multiple results!");
+
+  EVT VT = N->getValueType(0);
+  unsigned NE = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  DebugLoc dl = N->getDebugLoc();
+
+  SmallVector<SDValue, 8> Scalars;
+  SmallVector<SDValue, 4> Operands(N->getNumOperands());
+
+  // If ResNE is 0, fully unroll the vector op.
+  if (ResNE == 0)
+    ResNE = NE;
+  else if (NE > ResNE)
+    NE = ResNE;
+
+  unsigned i;
+  for (i= 0; i != NE; ++i) {
+    for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) {
+      SDValue Operand = N->getOperand(j);
+      EVT OperandVT = Operand.getValueType();
+      if (OperandVT.isVector()) {
+        // A vector operand; extract a single element.
+        EVT OperandEltVT = OperandVT.getVectorElementType();
+        Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                              OperandEltVT,
+                              Operand,
+                              getConstant(i, MVT::i32));
+      } else {
+        // A scalar operand; just use it as is.
+        Operands[j] = Operand;
+      }
+    }
+
+    switch (N->getOpcode()) {
+    default:
+      Scalars.push_back(getNode(N->getOpcode(), dl, EltVT,
+                                &Operands[0], Operands.size()));
+      break;
+    case ISD::SHL:
+    case ISD::SRA:
+    case ISD::SRL:
+    case ISD::ROTL:
+    case ISD::ROTR:
+      Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0],
+                                getShiftAmountOperand(Operands[1])));
+      break;
+    case ISD::SIGN_EXTEND_INREG:
+    case ISD::FP_ROUND_INREG: {
+      EVT ExtVT = cast<VTSDNode>(Operands[1])->getVT().getVectorElementType();
+      Scalars.push_back(getNode(N->getOpcode(), dl, EltVT,
+                                Operands[0],
+                                getValueType(ExtVT)));
+    }
+    }
+  }
+
+  for (; i < ResNE; ++i)
+    Scalars.push_back(getUNDEF(EltVT));
+
+  return getNode(ISD::BUILD_VECTOR, dl,
+                 EVT::getVectorVT(*getContext(), EltVT, ResNE),
+                 &Scalars[0], Scalars.size());
+}
+
+
+/// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a 
+/// location that is 'Dist' units away from the location that the 'Base' load 
+/// is loading from.
+bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, 
+                                     unsigned Bytes, int Dist) const {
+  if (LD->getChain() != Base->getChain())
+    return false;
+  EVT VT = LD->getValueType(0);
+  if (VT.getSizeInBits() / 8 != Bytes)
+    return false;
+
+  SDValue Loc = LD->getOperand(1);
+  SDValue BaseLoc = Base->getOperand(1);
+  if (Loc.getOpcode() == ISD::FrameIndex) {
+    if (BaseLoc.getOpcode() != ISD::FrameIndex)
+      return false;
+    const MachineFrameInfo *MFI = getMachineFunction().getFrameInfo();
+    int FI  = cast<FrameIndexSDNode>(Loc)->getIndex();
+    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+    int FS  = MFI->getObjectSize(FI);
+    int BFS = MFI->getObjectSize(BFI);
+    if (FS != BFS || FS != (int)Bytes) return false;
+    return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
+  }
+  if (Loc.getOpcode() == ISD::ADD && Loc.getOperand(0) == BaseLoc) {
+    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Loc.getOperand(1));
+    if (V && (V->getSExtValue() == Dist*Bytes))
+      return true;
+  }
+
+  const GlobalValue *GV1 = NULL;
+  const GlobalValue *GV2 = NULL;
+  int64_t Offset1 = 0;
+  int64_t Offset2 = 0;
+  bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
+  bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
+  if (isGA1 && isGA2 && GV1 == GV2)
+    return Offset1 == (Offset2 + Dist*Bytes);
+  return false;
+}
+
+
+/// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if
+/// it cannot be inferred.
+unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
+  // If this is a GlobalAddress + cst, return the alignment.
+  const GlobalValue *GV;
+  int64_t GVOffset = 0;
+  if (TLI.isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
+    // If GV has specified alignment, then use it. Otherwise, use the preferred
+    // alignment.
+    unsigned Align = GV->getAlignment();
+    if (!Align) {
+      if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) {
+        if (GVar->hasInitializer()) {
+          const TargetData *TD = TLI.getTargetData();
+          Align = TD->getPreferredAlignment(GVar);
+        }
+      }
+    }
+    return MinAlign(Align, GVOffset);
+  }
+
+  // If this is a direct reference to a stack slot, use information about the
+  // stack slot's alignment.
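+  // FrameIdx starts at the sentinel value (1 << 31), meaning "not found yet".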
+  int FrameIdx = 1 << 31;
+  int64_t FrameOffset = 0;
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
+    FrameIdx = FI->getIndex();
+  } else if (Ptr.getOpcode() == ISD::ADD &&
+             isa<ConstantSDNode>(Ptr.getOperand(1)) &&
+             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+    FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+    FrameOffset = Ptr.getConstantOperandVal(1);
+  }
+
+  if (FrameIdx != (1 << 31)) {
+    // FIXME: Handle FI+CST.
+    const MachineFrameInfo &MFI = *getMachineFunction().getFrameInfo();
+    unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
+                                    FrameOffset);
+    return FIInfoAlign;
+  }
+
+  return 0;
+}
+
+void SelectionDAG::dump() const {
+  dbgs() << "SelectionDAG has " << AllNodes.size() << " nodes:";
+
+  for (allnodes_const_iterator I = allnodes_begin(), E = allnodes_end();
+       I != E; ++I) {
+    const SDNode *N = I;
+    if (!N->hasOneUse() && N != getRoot().getNode())
+      DumpNodes(N, 2, this);
+  }
+
+  if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this);
+
+  dbgs() << "\n\n";
+}
+
+void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const {
+  print_types(OS, G);
+  print_details(OS, G);
+}
+
+typedef SmallPtrSet<const SDNode *, 128> VisitedSDNodeSet;
+static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,
+                       const SelectionDAG *G, VisitedSDNodeSet &once) {
+  if (!once.insert(N))          // If we've been here before, return now.
+    return;
+
+  // Dump the current SDNode, but don't end the line yet.
+  OS << std::string(indent, ' ');
+  N->printr(OS, G);
+
+  // Having printed this SDNode, walk the children:
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    const SDNode *child = N->getOperand(i).getNode();
+
+    if (i) OS << ",";
+    OS << " ";
+
+    if (child->getNumOperands() == 0) {
+      // This child has no grandchildren; print it inline right here.
+      child->printr(OS, G);
+      once.insert(child);
+    } else {         // Just the address. FIXME: also print the child's opcode.
+      OS << (void*)child;
+      if (unsigned RN = N->getOperand(i).getResNo())
+        OS << ":" << RN;
+    }
+  }
+
+  OS << "\n";
+
+  // Dump children that have grandchildren on their own line(s).
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    const SDNode *child = N->getOperand(i).getNode();
+    DumpNodesr(OS, child, indent+2, G, once);
+  }
+}
+
+void SDNode::dumpr() const {
+  VisitedSDNodeSet once;
+  DumpNodesr(dbgs(), this, 0, 0, once);
+}
+
+void SDNode::dumpr(const SelectionDAG *G) const {
+  VisitedSDNodeSet once;
+  DumpNodesr(dbgs(), this, 0, G, once);
+}
+
+
+/// getAddressSpace - Return the address space this GlobalAddress belongs to.
+unsigned GlobalAddressSDNode::getAddressSpace() const {
+  return getGlobal()->getType()->getAddressSpace();
+}
+
+
+const Type *ConstantPoolSDNode::getType() const {
+  if (isMachineConstantPoolEntry())
+    return Val.MachineCPVal->getType();
+  return Val.ConstVal->getType();
+}
+
+bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue,
+                                        APInt &SplatUndef,
+                                        unsigned &SplatBitSize,
+                                        bool &HasAnyUndefs,
+                                        unsigned MinSplatBits,
+                                        bool isBigEndian) {
+  EVT VT = getValueType(0);
+  assert(VT.isVector() && "Expected a vector type");
+  unsigned sz = VT.getSizeInBits();
+  if (MinSplatBits > sz)
+    return false;
+
+  SplatValue = APInt(sz, 0);
+  SplatUndef = APInt(sz, 0);
+
+  // Get the bits.  Bits with undefined values (when the corresponding element
+  // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared
+  // in SplatValue.  If any of the values are not constant, give up and return
+  // false.
+  unsigned int nOps = getNumOperands();
+  assert(nOps > 0 && "isConstantSplat has 0-size build vector");
+  unsigned EltBitSize = VT.getVectorElementType().getSizeInBits();
+
+  for (unsigned j = 0; j < nOps; ++j) {
+    unsigned i = isBigEndian ? nOps-1-j : j;
+    SDValue OpVal = getOperand(i);
+    unsigned BitPos = j * EltBitSize;
+
+    if (OpVal.getOpcode() == ISD::UNDEF)
+      SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize);
+    else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal))
+      SplatValue |= APInt(CN->getAPIntValue()).zextOrTrunc(EltBitSize).
+                    zextOrTrunc(sz) << BitPos;
+    else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal))
+      SplatValue |= CN->getValueAPF().bitcastToAPInt().zextOrTrunc(sz) <<BitPos;
+    else
+      return false;
+  }
+
+  // The build_vector is all constants or undefs.  Find the smallest element
+  // size that splats the vector.
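+  //
+  // For illustration, a 32-bit pattern of 0x0A0A0A0A splits into equal
+  // 16-bit halves (0x0A0A / 0x0A0A), then into equal 8-bit halves
+  // (0x0A / 0x0A), so the loop below reports an 8-bit splat of 0x0A.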
+
+  HasAnyUndefs = (SplatUndef != 0);
+  while (sz > 8) {
+
+    unsigned HalfSize = sz / 2;
+    APInt HighValue = APInt(SplatValue).lshr(HalfSize).trunc(HalfSize);
+    APInt LowValue = APInt(SplatValue).trunc(HalfSize);
+    APInt HighUndef = APInt(SplatUndef).lshr(HalfSize).trunc(HalfSize);
+    APInt LowUndef = APInt(SplatUndef).trunc(HalfSize);
+
+    // If the two halves do not match (ignoring undef bits), stop here.
+    if ((HighValue & ~LowUndef) != (LowValue & ~HighUndef) ||
+        MinSplatBits > HalfSize)
+      break;
+
+    SplatValue = HighValue | LowValue;
+    SplatUndef = HighUndef & LowUndef;
+
+    sz = HalfSize;
+  }
+
+  SplatBitSize = sz;
+  return true;
+}
+
+bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
+  // Find the first non-undef value in the shuffle mask.
+  unsigned i, e;
+  for (i = 0, e = VT.getVectorNumElements(); i != e && Mask[i] < 0; ++i)
+    /* search */;
+
+  assert(i != e && "VECTOR_SHUFFLE node with all undef indices!");
+
+  // Make sure all remaining elements are either undef or the same as the first
+  // non-undef value.
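+  // (Idx is bound once to the first non-undef mask entry; only i advances.)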
+  for (int Idx = Mask[i]; i != e; ++i)
+    if (Mask[i] >= 0 && Mask[i] != Idx)
+      return false;
+  return true;
+}
+
+#ifdef XDEBUG
+static void checkForCyclesHelper(const SDNode *N,
+                                 SmallPtrSet<const SDNode*, 32> &Visited,
+                                 SmallPtrSet<const SDNode*, 32> &Checked) {
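+  // Visited holds the nodes on the current DFS path (erased again on the way
+  // back out); Checked memoizes nodes already proven cycle-free.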
+  // If this node has already been checked, don't check it again.
+  if (Checked.count(N))
+    return;
+  
+  // If a node has already been visited on this depth-first walk, reject it as
+  // a cycle.
+  if (!Visited.insert(N)) {
+    dbgs() << "Offending node:\n";
+    N->dumprFull();
+    errs() << "Detected cycle in SelectionDAG\n";
+    abort();
+  }
+  
+  for(unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    checkForCyclesHelper(N->getOperand(i).getNode(), Visited, Checked);
+  
+  Checked.insert(N);
+  Visited.erase(N);
+}
+#endif
+
+void llvm::checkForCycles(const llvm::SDNode *N) {
+#ifdef XDEBUG
+  assert(N && "Checking nonexistant SDNode");
+  SmallPtrSet<const SDNode*, 32> visited;
+  SmallPtrSet<const SDNode*, 32> checked;
+  checkForCyclesHelper(N, visited, checked);
+#endif
+}
+
+void llvm::checkForCycles(const llvm::SelectionDAG *DAG) {
+  checkForCycles(DAG->getRoot().getNode());
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
new file mode 100644
index 0000000..aa6df1d
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -0,0 +1,5312 @@
+//===-- SelectionDAGBuilder.cpp - Selection-DAG building ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating from LLVM IR into SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "isel"
+#include "SDNodeDbgValue.h"
+#include "SelectionDAGBuilder.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Constants.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+/// LimitFloatPrecision - Generate low-precision inline sequences for
+/// some float libcalls (6, 8 or 12 bits).
+static unsigned LimitFloatPrecision;
+
+static cl::opt<unsigned, true>
+LimitFPPrecision("limit-float-precision",
+                 cl::desc("Generate low-precision inline sequences "
+                          "for some float libcalls"),
+                 cl::location(LimitFloatPrecision),
+                 cl::init(0));
+
+/// getCopyFromParts - Create a value that contains the specified legal parts
+/// combined into the value they represent.  If the parts combine to a type
+/// larger than ValueVT, then AssertOp can be used to specify whether the extra
+/// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT
+/// (ISD::AssertSext).
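+///
+/// For example, on a little-endian target two i32 parts reassemble into an
+/// i64 as BUILD_PAIR(Parts[0], Parts[1]); a big-endian target swaps the two
+/// halves first.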
+static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc dl,
+                                const SDValue *Parts,
+                                unsigned NumParts, EVT PartVT, EVT ValueVT,
+                                ISD::NodeType AssertOp = ISD::DELETED_NODE) {
+  assert(NumParts > 0 && "No parts to assemble!");
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Val = Parts[0];
+
+  if (NumParts > 1) {
+    // Assemble the value from multiple parts.
+    if (!ValueVT.isVector() && ValueVT.isInteger()) {
+      unsigned PartBits = PartVT.getSizeInBits();
+      unsigned ValueBits = ValueVT.getSizeInBits();
+
+      // Assemble the power of 2 part.
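+      // (e.g. NumParts = 3 gives RoundParts = 2; the odd trailing part is
+      // assembled separately below and OR'd into the high bits.)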
+      unsigned RoundParts = NumParts & (NumParts - 1) ?
+        1 << Log2_32(NumParts) : NumParts;
+      unsigned RoundBits = PartBits * RoundParts;
+      EVT RoundVT = RoundBits == ValueBits ?
+        ValueVT : EVT::getIntegerVT(*DAG.getContext(), RoundBits);
+      SDValue Lo, Hi;
+
+      EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2);
+
+      if (RoundParts > 2) {
+        Lo = getCopyFromParts(DAG, dl, Parts, RoundParts / 2,
+                              PartVT, HalfVT);
+        Hi = getCopyFromParts(DAG, dl, Parts + RoundParts / 2,
+                              RoundParts / 2, PartVT, HalfVT);
+      } else {
+        Lo = DAG.getNode(ISD::BIT_CONVERT, dl, HalfVT, Parts[0]);
+        Hi = DAG.getNode(ISD::BIT_CONVERT, dl, HalfVT, Parts[1]);
+      }
+
+      if (TLI.isBigEndian())
+        std::swap(Lo, Hi);
+
+      Val = DAG.getNode(ISD::BUILD_PAIR, dl, RoundVT, Lo, Hi);
+
+      if (RoundParts < NumParts) {
+        // Assemble the trailing non-power-of-2 part.
+        unsigned OddParts = NumParts - RoundParts;
+        EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits);
+        Hi = getCopyFromParts(DAG, dl,
+                              Parts + RoundParts, OddParts, PartVT, OddVT);
+
+        // Combine the round and odd parts.
+        Lo = Val;
+        if (TLI.isBigEndian())
+          std::swap(Lo, Hi);
+        EVT TotalVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+        Hi = DAG.getNode(ISD::ANY_EXTEND, dl, TotalVT, Hi);
+        Hi = DAG.getNode(ISD::SHL, dl, TotalVT, Hi,
+                         DAG.getConstant(Lo.getValueType().getSizeInBits(),
+                                         TLI.getPointerTy()));
+        Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, TotalVT, Lo);
+        Val = DAG.getNode(ISD::OR, dl, TotalVT, Lo, Hi);
+      }
+    } else if (ValueVT.isVector()) {
+      // Handle a multi-element vector.
+      EVT IntermediateVT, RegisterVT;
+      unsigned NumIntermediates;
+      unsigned NumRegs =
+        TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
+                                   NumIntermediates, RegisterVT);
+      assert(NumRegs == NumParts
+             && "Part count doesn't match vector breakdown!");
+      NumParts = NumRegs; // Silence a compiler warning.
+      assert(RegisterVT == PartVT
+             && "Part type doesn't match vector breakdown!");
+      assert(RegisterVT == Parts[0].getValueType() &&
+             "Part type doesn't match part!");
+
+      // Assemble the parts into intermediate operands.
+      SmallVector<SDValue, 8> Ops(NumIntermediates);
+      if (NumIntermediates == NumParts) {
+        // If the register was not expanded, truncate or copy the value,
+        // as appropriate.
+        for (unsigned i = 0; i != NumParts; ++i)
+          Ops[i] = getCopyFromParts(DAG, dl, &Parts[i], 1,
+                                    PartVT, IntermediateVT);
+      } else if (NumParts > 0) {
+        // If the intermediate type was expanded, build the intermediate
+        // operands from the parts.
+        assert(NumParts % NumIntermediates == 0 &&
+               "Must expand into a divisible number of parts!");
+        unsigned Factor = NumParts / NumIntermediates;
+        for (unsigned i = 0; i != NumIntermediates; ++i)
+          Ops[i] = getCopyFromParts(DAG, dl, &Parts[i * Factor], Factor,
+                                    PartVT, IntermediateVT);
+      }
+
+      // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
+      // intermediate operands.
+      Val = DAG.getNode(IntermediateVT.isVector() ?
+                        ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, dl,
+                        ValueVT, &Ops[0], NumIntermediates);
+    } else if (PartVT.isFloatingPoint()) {
+      // FP split into multiple FP parts (for ppcf128)
+      assert(ValueVT == EVT(MVT::ppcf128) && PartVT == EVT(MVT::f64) &&
+             "Unexpected split");
+      SDValue Lo, Hi;
+      Lo = DAG.getNode(ISD::BIT_CONVERT, dl, EVT(MVT::f64), Parts[0]);
+      Hi = DAG.getNode(ISD::BIT_CONVERT, dl, EVT(MVT::f64), Parts[1]);
+      if (TLI.isBigEndian())
+        std::swap(Lo, Hi);
+      Val = DAG.getNode(ISD::BUILD_PAIR, dl, ValueVT, Lo, Hi);
+    } else {
+      // FP split into integer parts (soft fp)
+      assert(ValueVT.isFloatingPoint() && PartVT.isInteger() &&
+             !PartVT.isVector() && "Unexpected split");
+      EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
+      Val = getCopyFromParts(DAG, dl, Parts, NumParts, PartVT, IntVT);
+    }
+  }
+
+  // There is now one part, held in Val.  Correct it to match ValueVT.
+  PartVT = Val.getValueType();
+
+  if (PartVT == ValueVT)
+    return Val;
+
+  if (PartVT.isVector()) {
+    assert(ValueVT.isVector() && "Unknown vector conversion!");
+    return DAG.getNode(ISD::BIT_CONVERT, dl, ValueVT, Val);
+  }
+
+  if (ValueVT.isVector()) {
+    assert(ValueVT.getVectorElementType() == PartVT &&
+           ValueVT.getVectorNumElements() == 1 &&
+           "Only trivial scalar-to-vector conversions should get here!");
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, ValueVT, Val);
+  }
+
+  if (PartVT.isInteger() &&
+      ValueVT.isInteger()) {
+    if (ValueVT.bitsLT(PartVT)) {
+      // For a truncate, see if we have any information to
+      // indicate whether the truncated bits will always be
+      // zero or sign-extension.
+      if (AssertOp != ISD::DELETED_NODE)
+        Val = DAG.getNode(AssertOp, dl, PartVT, Val,
+                          DAG.getValueType(ValueVT));
+      return DAG.getNode(ISD::TRUNCATE, dl, ValueVT, Val);
+    } else {
+      return DAG.getNode(ISD::ANY_EXTEND, dl, ValueVT, Val);
+    }
+  }
+
+  if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
+    if (ValueVT.bitsLT(Val.getValueType())) {
+      // FP_ROUND's are always exact here.
+      return DAG.getNode(ISD::FP_ROUND, dl, ValueVT, Val,
+                         DAG.getIntPtrConstant(1));
+    }
+
+    return DAG.getNode(ISD::FP_EXTEND, dl, ValueVT, Val);
+  }
+
+  if (PartVT.getSizeInBits() == ValueVT.getSizeInBits())
+    return DAG.getNode(ISD::BIT_CONVERT, dl, ValueVT, Val);
+
+  llvm_unreachable("Unknown mismatch!");
+  return SDValue();
+}
+
+/// getCopyToParts - Create a series of nodes that contain the specified value
+/// split into legal parts.  If the parts contain more bits than Val, then, for
+/// integers, ExtendKind can be used to specify how to generate the extra bits.
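+///
+/// For example, splitting an i64 into two i32 parts bisects the value with a
+/// pair of EXTRACT_ELEMENT nodes (index 0 for the low half, index 1 for the
+/// high half); a big-endian target then reverses the parts.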
+static void getCopyToParts(SelectionDAG &DAG, DebugLoc dl,
+                           SDValue Val, SDValue *Parts, unsigned NumParts,
+                           EVT PartVT,
+                           ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT PtrVT = TLI.getPointerTy();
+  EVT ValueVT = Val.getValueType();
+  unsigned PartBits = PartVT.getSizeInBits();
+  unsigned OrigNumParts = NumParts;
+  assert(TLI.isTypeLegal(PartVT) && "Copying to an illegal type!");
+
+  if (!NumParts)
+    return;
+
+  if (!ValueVT.isVector()) {
+    if (PartVT == ValueVT) {
+      assert(NumParts == 1 && "No-op copy with multiple parts!");
+      Parts[0] = Val;
+      return;
+    }
+
+    if (NumParts * PartBits > ValueVT.getSizeInBits()) {
+      // If the parts cover more bits than the value has, promote the value.
+      if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
+        assert(NumParts == 1 && "Do not know what to promote to!");
+        Val = DAG.getNode(ISD::FP_EXTEND, dl, PartVT, Val);
+      } else if (PartVT.isInteger() && ValueVT.isInteger()) {
+        ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+        Val = DAG.getNode(ExtendKind, dl, ValueVT, Val);
+      } else {
+        llvm_unreachable("Unknown mismatch!");
+      }
+    } else if (PartBits == ValueVT.getSizeInBits()) {
+      // Different types of the same size.
+      assert(NumParts == 1 && PartVT != ValueVT);
+      Val = DAG.getNode(ISD::BIT_CONVERT, dl, PartVT, Val);
+    } else if (NumParts * PartBits < ValueVT.getSizeInBits()) {
+      // If the parts cover fewer bits than the value has, truncate the value.
+      if (PartVT.isInteger() && ValueVT.isInteger()) {
+        ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+        Val = DAG.getNode(ISD::TRUNCATE, dl, ValueVT, Val);
+      } else {
+        llvm_unreachable("Unknown mismatch!");
+      }
+    }
+
+    // The value may have changed - recompute ValueVT.
+    ValueVT = Val.getValueType();
+    assert(NumParts * PartBits == ValueVT.getSizeInBits() &&
+           "Failed to tile the value with PartVT!");
+
+    if (NumParts == 1) {
+      assert(PartVT == ValueVT && "Type conversion failed!");
+      Parts[0] = Val;
+      return;
+    }
+
+    // Expand the value into multiple parts.
+    if (NumParts & (NumParts - 1)) {
+      // The number of parts is not a power of 2.  Split off and copy the tail.
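+      // (e.g. NumParts = 3: RoundParts = 2, OddParts = 1; the high third is
+      // shifted down and copied on its own, and the rest is bisected below.)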
+      assert(PartVT.isInteger() && ValueVT.isInteger() &&
+             "Do not know what to expand to!");
+      unsigned RoundParts = 1 << Log2_32(NumParts);
+      unsigned RoundBits = RoundParts * PartBits;
+      unsigned OddParts = NumParts - RoundParts;
+      SDValue OddVal = DAG.getNode(ISD::SRL, dl, ValueVT, Val,
+                                   DAG.getConstant(RoundBits,
+                                                   TLI.getPointerTy()));
+      getCopyToParts(DAG, dl, OddVal, Parts + RoundParts,
+                     OddParts, PartVT);
+
+      if (TLI.isBigEndian())
+        // The odd parts were reversed by getCopyToParts - unreverse them.
+        std::reverse(Parts + RoundParts, Parts + NumParts);
+
+      NumParts = RoundParts;
+      ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+      Val = DAG.getNode(ISD::TRUNCATE, dl, ValueVT, Val);
+    }
+
+    // The number of parts is a power of 2.  Repeatedly bisect the value using
+    // EXTRACT_ELEMENT.
+    Parts[0] = DAG.getNode(ISD::BIT_CONVERT, dl,
+                           EVT::getIntegerVT(*DAG.getContext(),
+                                             ValueVT.getSizeInBits()),
+                           Val);
+
+    for (unsigned StepSize = NumParts; StepSize > 1; StepSize /= 2) {
+      for (unsigned i = 0; i < NumParts; i += StepSize) {
+        unsigned ThisBits = StepSize * PartBits / 2;
+        EVT ThisVT = EVT::getIntegerVT(*DAG.getContext(), ThisBits);
+        SDValue &Part0 = Parts[i];
+        SDValue &Part1 = Parts[i+StepSize/2];
+
+        Part1 = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+                            ThisVT, Part0,
+                            DAG.getConstant(1, PtrVT));
+        Part0 = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+                            ThisVT, Part0,
+                            DAG.getConstant(0, PtrVT));
+
+        if (ThisBits == PartBits && ThisVT != PartVT) {
+          Part0 = DAG.getNode(ISD::BIT_CONVERT, dl, PartVT, Part0);
+          Part1 = DAG.getNode(ISD::BIT_CONVERT, dl, PartVT, Part1);
+        }
+      }
+    }
+
+    if (TLI.isBigEndian())
+      std::reverse(Parts, Parts + OrigNumParts);
+
+    return;
+  }
+
+  // Vector ValueVT.
+  if (NumParts == 1) {
+    if (PartVT != ValueVT) {
+      if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) {
+        Val = DAG.getNode(ISD::BIT_CONVERT, dl, PartVT, Val);
+      } else {
+        assert(ValueVT.getVectorElementType() == PartVT &&
+               ValueVT.getVectorNumElements() == 1 &&
+               "Only trivial vector-to-scalar conversions should get here!");
+        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                          PartVT, Val,
+                          DAG.getConstant(0, PtrVT));
+      }
+    }
+
+    Parts[0] = Val;
+    return;
+  }
+
+  // Handle a multi-element vector.
+  EVT IntermediateVT, RegisterVT;
+  unsigned NumIntermediates;
+  unsigned NumRegs = TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT,
+                              IntermediateVT, NumIntermediates, RegisterVT);
+  unsigned NumElements = ValueVT.getVectorNumElements();
+
+  assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
+  NumParts = NumRegs; // Silence a compiler warning.
+  assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
+
+  // Split the vector into intermediate operands.
+  SmallVector<SDValue, 8> Ops(NumIntermediates);
+  for (unsigned i = 0; i != NumIntermediates; ++i) {
+    if (IntermediateVT.isVector())
+      Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
+                           IntermediateVT, Val,
+                           DAG.getConstant(i * (NumElements / NumIntermediates),
+                                           PtrVT));
+    else
+      Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                           IntermediateVT, Val,
+                           DAG.getConstant(i, PtrVT));
+  }
+
+  // Split the intermediate operands into legal parts.
+  if (NumParts == NumIntermediates) {
+    // If the register was not expanded, promote or copy the value,
+    // as appropriate.
+    for (unsigned i = 0; i != NumParts; ++i)
+      getCopyToParts(DAG, dl, Ops[i], &Parts[i], 1, PartVT);
+  } else if (NumParts > 0) {
+    // If the intermediate type was expanded, split each intermediate value
+    // into legal parts.
+    assert(NumParts % NumIntermediates == 0 &&
+           "Must expand into a divisible number of parts!");
+    unsigned Factor = NumParts / NumIntermediates;
+    for (unsigned i = 0; i != NumIntermediates; ++i)
+      getCopyToParts(DAG, dl, Ops[i], &Parts[i*Factor], Factor, PartVT);
+  }
+}
+
+namespace {
+  /// RegsForValue - This struct represents the registers (physical or virtual)
+  /// that a particular set of values is assigned, and the type information
+  /// about the value. The most common situation is to represent one value at a
+  /// time, but struct or array values are handled element-wise as multiple
+  /// values.  The splitting of aggregates is performed recursively, so that we
+  /// never have aggregate-typed registers. The values at this point do not
+  /// necessarily have legal types, so each value may require one or more
+  /// registers of some legal type.
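+  ///
+  /// For example (assuming 32-bit registers), an i64 value needs two i32
+  /// registers, and a {i64, float} aggregate is split into an i64 entry
+  /// (two registers) and an f32 entry (one register).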
+  ///
+  struct RegsForValue {
+    /// ValueVTs - The value types of the values, which may not be legal, and
+    /// may need to be promoted or synthesized from one or more registers.
+    ///
+    SmallVector<EVT, 4> ValueVTs;
+
+    /// RegVTs - The value types of the registers. This is the same size as
+    /// ValueVTs and it records, for each value, what the type of the assigned
+    /// register or registers are. (Individual values are never synthesized
+    /// from more than one type of register.)
+    ///
+    /// With virtual registers, the contents of RegVTs are redundant with
+    /// TLI's getRegisterType member function.  With physical registers,
+    /// however, a separate record of the types is necessary.
+    ///
+    SmallVector<EVT, 4> RegVTs;
+
+    /// Regs - This list holds the registers assigned to the values.
+    /// Each legal or promoted value requires one register, and each
+    /// expanded value requires multiple registers.
+    ///
+    SmallVector<unsigned, 4> Regs;
+
+    RegsForValue() {}
+
+    RegsForValue(const SmallVector<unsigned, 4> &regs,
+                 EVT regvt, EVT valuevt)
+      : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {}
+
+    RegsForValue(const SmallVector<unsigned, 4> &regs,
+                 const SmallVector<EVT, 4> &regvts,
+                 const SmallVector<EVT, 4> &valuevts)
+      : ValueVTs(valuevts), RegVTs(regvts), Regs(regs) {}
+
+    RegsForValue(LLVMContext &Context, const TargetLowering &tli,
+                 unsigned Reg, const Type *Ty) {
+      ComputeValueVTs(tli, Ty, ValueVTs);
+
+      for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
+        EVT ValueVT = ValueVTs[Value];
+        unsigned NumRegs = tli.getNumRegisters(Context, ValueVT);
+        EVT RegisterVT = tli.getRegisterType(Context, ValueVT);
+        for (unsigned i = 0; i != NumRegs; ++i)
+          Regs.push_back(Reg + i);
+        RegVTs.push_back(RegisterVT);
+        Reg += NumRegs;
+      }
+    }
+
+    /// areValueTypesLegal - Return true if types of all the values are legal.
+    bool areValueTypesLegal(const TargetLowering &TLI) {
+      for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
+        EVT RegisterVT = RegVTs[Value];
+        if (!TLI.isTypeLegal(RegisterVT))
+          return false;
+      }
+      return true;
+    }
+
+    /// append - Add the specified values to this one.
+    void append(const RegsForValue &RHS) {
+      ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end());
+      RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end());
+      Regs.append(RHS.Regs.begin(), RHS.Regs.end());
+    }
+
+    /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
+    /// this value and returns the result as a ValueVT value.  This uses
+    /// Chain/Flag as the input and updates them for the output Chain/Flag.
+    /// If the Flag pointer is NULL, no flag is used.
+    SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo,
+                            DebugLoc dl,
+                            SDValue &Chain, SDValue *Flag) const;
+
+    /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the
+    /// specified value into the registers specified by this object.  This uses
+    /// Chain/Flag as the input and updates them for the output Chain/Flag.
+    /// If the Flag pointer is NULL, no flag is used.
+    void getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
+                       SDValue &Chain, SDValue *Flag) const;
+
+    /// AddInlineAsmOperands - Add this value to the specified inlineasm node
+    /// operand list.  This adds the code marker, matching input operand index
+    /// (if applicable), and includes the number of values added into it.
+    void AddInlineAsmOperands(unsigned Kind,
+                              bool HasMatching, unsigned MatchingIdx,
+                              SelectionDAG &DAG,
+                              std::vector<SDValue> &Ops) const;
+  };
+}
+
+/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
+/// this value and returns the result as a ValueVT value.  This uses
+/// Chain/Flag as the input and updates them for the output Chain/Flag.
+/// If the Flag pointer is NULL, no flag is used.
+SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
+                                      FunctionLoweringInfo &FuncInfo,
+                                      DebugLoc dl,
+                                      SDValue &Chain, SDValue *Flag) const {
+  // A Value with type {} or [0 x %t] needs no registers.
+  if (ValueVTs.empty())
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Assemble the legal parts into the final values.
+  SmallVector<SDValue, 4> Values(ValueVTs.size());
+  SmallVector<SDValue, 8> Parts;
+  for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
+    // Copy the legal parts from the registers.
+    EVT ValueVT = ValueVTs[Value];
+    unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVT);
+    EVT RegisterVT = RegVTs[Value];
+
+    Parts.resize(NumRegs);
+    for (unsigned i = 0; i != NumRegs; ++i) {
+      SDValue P;
+      if (Flag == 0) {
+        P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT);
+      } else {
+        P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag);
+        *Flag = P.getValue(2);
+      }
+
+      Chain = P.getValue(1);
+
+      // If the source register was virtual and if we know something about it,
+      // add an assert node.
+      if (TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) &&
+          RegisterVT.isInteger() && !RegisterVT.isVector()) {
+        unsigned SlotNo = Regs[Part+i]-TargetRegisterInfo::FirstVirtualRegister;
+        if (FuncInfo.LiveOutRegInfo.size() > SlotNo) {
+          const FunctionLoweringInfo::LiveOutInfo &LOI =
+            FuncInfo.LiveOutRegInfo[SlotNo];
+
+          unsigned RegSize = RegisterVT.getSizeInBits();
+          unsigned NumSignBits = LOI.NumSignBits;
+          unsigned NumZeroBits = LOI.KnownZero.countLeadingOnes();
+
+          // FIXME: We capture more information than the dag can represent.  For
+          // now, just use the tightest assertzext/assertsext possible.
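+          // For example, with RegSize == 32 and NumSignBits == 25, the value
+          // fits in 32-25+1 = 8 signed bits, so NumSignBits > RegSize-8
+          // matches below and we assert a sign extension from i8.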
+          bool isSExt = true;
+          EVT FromVT(MVT::Other);
+          if (NumSignBits == RegSize)
+            isSExt = true, FromVT = MVT::i1;   // ASSERT SEXT 1
+          else if (NumZeroBits >= RegSize-1)
+            isSExt = false, FromVT = MVT::i1;  // ASSERT ZEXT 1
+          else if (NumSignBits > RegSize-8)
+            isSExt = true, FromVT = MVT::i8;   // ASSERT SEXT 8
+          else if (NumZeroBits >= RegSize-8)
+            isSExt = false, FromVT = MVT::i8;  // ASSERT ZEXT 8
+          else if (NumSignBits > RegSize-16)
+            isSExt = true, FromVT = MVT::i16;  // ASSERT SEXT 16
+          else if (NumZeroBits >= RegSize-16)
+            isSExt = false, FromVT = MVT::i16; // ASSERT ZEXT 16
+          else if (NumSignBits > RegSize-32)
+            isSExt = true, FromVT = MVT::i32;  // ASSERT SEXT 32
+          else if (NumZeroBits >= RegSize-32)
+            isSExt = false, FromVT = MVT::i32; // ASSERT ZEXT 32
+
+          if (FromVT != MVT::Other)
+            P = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl,
+                            RegisterVT, P, DAG.getValueType(FromVT));
+        }
+      }
+
+      Parts[i] = P;
+    }
+
+    Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(),
+                                     NumRegs, RegisterVT, ValueVT);
+    Part += NumRegs;
+    Parts.clear();
+  }
+
+  return DAG.getNode(ISD::MERGE_VALUES, dl,
+                     DAG.getVTList(&ValueVTs[0], ValueVTs.size()),
+                     &Values[0], ValueVTs.size());
+}
+
+/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the
+/// specified value into the registers specified by this object.  This uses
+/// Chain/Flag as the input and updates them for the output Chain/Flag.
+/// If the Flag pointer is NULL, no flag is used.
+void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
+                                 SDValue &Chain, SDValue *Flag) const {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Get the list of the value's legal parts.
+  unsigned NumRegs = Regs.size();
+  SmallVector<SDValue, 8> Parts(NumRegs);
+  for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
+    EVT ValueVT = ValueVTs[Value];
+    unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), ValueVT);
+    EVT RegisterVT = RegVTs[Value];
+
+    getCopyToParts(DAG, dl,
+                   Val.getValue(Val.getResNo() + Value),
+                   &Parts[Part], NumParts, RegisterVT);
+    Part += NumParts;
+  }
+
+  // Copy the parts into the registers.
+  SmallVector<SDValue, 8> Chains(NumRegs);
+  for (unsigned i = 0; i != NumRegs; ++i) {
+    SDValue Part;
+    if (Flag == 0) {
+      Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]);
+    } else {
+      Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag);
+      *Flag = Part.getValue(1);
+    }
+
+    Chains[i] = Part.getValue(0);
+  }
+
+  if (NumRegs == 1 || Flag)
+    // If NumRegs > 1 and a Flag is used, then the user of the last CopyToReg
+    // is flagged to it; that is, the CopyToReg nodes and the user are
+    // considered a single scheduling unit. If we created a TokenFactor and
+    // returned it as the chain, the TokenFactor would be both a predecessor
+    // (operand) of the user and a successor (its operands are flagged to the
+    // user):
+    // c1, f1 = CopyToReg
+    // c2, f2 = CopyToReg
+    // c3     = TokenFactor c1, c2
+    // ...
+    //        = op c3, ..., f2
+    Chain = Chains[NumRegs-1];
+  else
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], NumRegs);
+}
+
+/// AddInlineAsmOperands - Add this value to the specified inlineasm node
+/// operand list.  This adds the code marker, matching input operand index
+/// (if applicable), and includes the number of values added into it.
+void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
+                                        unsigned MatchingIdx,
+                                        SelectionDAG &DAG,
+                                        std::vector<SDValue> &Ops) const {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size());
+  if (HasMatching)
+    Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx);
+  SDValue Res = DAG.getTargetConstant(Flag, MVT::i32);
+  Ops.push_back(Res);
+
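+  // The flag word is followed by one register operand per register; e.g. a
+  // use of a two-register value contributes: flagword, reg0, reg1.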
+  for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) {
+    unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVTs[Value]);
+    EVT RegisterVT = RegVTs[Value];
+    for (unsigned i = 0; i != NumRegs; ++i) {
+      assert(Reg < Regs.size() && "Mismatch in # registers expected");
+      Ops.push_back(DAG.getRegister(Regs[Reg++], RegisterVT));
+    }
+  }
+}
+
+void SelectionDAGBuilder::init(AliasAnalysis &aa) {
+  AA = &aa;
+  TD = DAG.getTarget().getTargetData();
+}
+
+/// clear - Clear out the current SelectionDAG and the associated
+/// state and prepare this SelectionDAGBuilder object to be used
+/// for a new block. This doesn't clear out information about
+/// additional blocks that are needed to complete switch lowering
+/// or PHI node updating; that information is cleared out as it is
+/// consumed.
+void SelectionDAGBuilder::clear() {
+  NodeMap.clear();
+  UnusedArgNodeMap.clear();
+  PendingLoads.clear();
+  PendingExports.clear();
+  CurDebugLoc = DebugLoc();
+  HasTailCall = false;
+}
+
+/// getRoot - Return the current virtual root of the Selection DAG,
+/// flushing any PendingLoad items. This must be done before emitting
+/// a store or any other node that may need to be ordered after any
+/// prior load instructions.
+///
+SDValue SelectionDAGBuilder::getRoot() {
+  if (PendingLoads.empty())
+    return DAG.getRoot();
+
+  if (PendingLoads.size() == 1) {
+    SDValue Root = PendingLoads[0];
+    DAG.setRoot(Root);
+    PendingLoads.clear();
+    return Root;
+  }
+
+  // Otherwise, we have to make a token factor node.
+  SDValue Root = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(), MVT::Other,
+                               &PendingLoads[0], PendingLoads.size());
+  PendingLoads.clear();
+  DAG.setRoot(Root);
+  return Root;
+}
+
+/// getControlRoot - Similar to getRoot, but instead of flushing all the
+/// PendingLoad items, flush all the PendingExports items. It is necessary
+/// to do this before emitting a terminator instruction.
+///
+SDValue SelectionDAGBuilder::getControlRoot() {
+  SDValue Root = DAG.getRoot();
+
+  if (PendingExports.empty())
+    return Root;
+
+  // Turn all of the CopyToReg chains into one factored node.
+  if (Root.getOpcode() != ISD::EntryToken) {
+    unsigned i = 0, e = PendingExports.size();
+    for (; i != e; ++i) {
+      assert(PendingExports[i].getNode()->getNumOperands() > 1);
+      if (PendingExports[i].getNode()->getOperand(0) == Root)
+        break;  // Don't add the root if we already indirectly depend on it.
+    }
+
+    if (i == e)
+      PendingExports.push_back(Root);
+  }
+
+  Root = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(), MVT::Other,
+                     &PendingExports[0],
+                     PendingExports.size());
+  PendingExports.clear();
+  DAG.setRoot(Root);
+  return Root;
+}
+
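+/// AssignOrderingToNode - Assign the current SDNodeOrder to Node and,
+/// recursively, to its operands, stopping at nodes that already have an
+/// ordering assigned.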
+void SelectionDAGBuilder::AssignOrderingToNode(const SDNode *Node) {
+  if (DAG.GetOrdering(Node) != 0) return; // Already has ordering.
+  DAG.AssignOrdering(Node, SDNodeOrder);
+
+  for (unsigned I = 0, E = Node->getNumOperands(); I != E; ++I)
+    AssignOrderingToNode(Node->getOperand(I).getNode());
+}
+
+void SelectionDAGBuilder::visit(const Instruction &I) {
+  // Set up outgoing PHI node register values before emitting the terminator.
+  if (isa<TerminatorInst>(&I))
+    HandlePHINodesInSuccessorBlocks(I.getParent());
+
+  CurDebugLoc = I.getDebugLoc();
+
+  visit(I.getOpcode(), I);
+
+  if (!isa<TerminatorInst>(&I) && !HasTailCall)
+    CopyToExportRegsIfNeeded(&I);
+
+  CurDebugLoc = DebugLoc();
+}
+
+void SelectionDAGBuilder::visitPHI(const PHINode &) {
+  llvm_unreachable("SelectionDAGBuilder shouldn't visit PHI nodes!");
+}
+
+void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) {
+  // Note: this doesn't use InstVisitor, because it has to work with
+  // ConstantExprs in addition to instructions.
+  switch (Opcode) {
+  default: llvm_unreachable("Unknown instruction type encountered!");
+    // Build the switch statement using the Instruction.def file.
+#define HANDLE_INST(NUM, OPCODE, CLASS) \
+    case Instruction::OPCODE: visit##OPCODE((CLASS&)I); break;
+#include "llvm/Instruction.def"
+  }
+
+  // Assign the ordering to the freshly created DAG nodes.
+  if (NodeMap.count(&I)) {
+    ++SDNodeOrder;
+    AssignOrderingToNode(getValue(&I).getNode());
+  }
+}
+
+/// getValue - Return an SDValue for the given Value.
+SDValue SelectionDAGBuilder::getValue(const Value *V) {
+  // If we already have an SDValue for this value, use it. It's important
+  // to do this first, so that we don't create a CopyFromReg if we already
+  // have a regular SDValue.
+  SDValue &N = NodeMap[V];
+  if (N.getNode()) return N;
+
+  // If there's a virtual register allocated and initialized for this
+  // value, use it.
+  DenseMap<const Value *, unsigned>::iterator It = FuncInfo.ValueMap.find(V);
+  if (It != FuncInfo.ValueMap.end()) {
+    unsigned InReg = It->second;
+    RegsForValue RFV(*DAG.getContext(), TLI, InReg, V->getType());
+    SDValue Chain = DAG.getEntryNode();
+    return N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain,NULL);
+  }
+
+  // Otherwise create a new SDValue and remember it.
+  SDValue Val = getValueImpl(V);
+  NodeMap[V] = Val;
+
+  return Val;
+}
+
+/// getNonRegisterValue - Return an SDValue for the given Value, but
+/// don't look in FuncInfo.ValueMap for a virtual register.
+SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) {
+  // If we already have an SDValue for this value, use it.
+  SDValue &N = NodeMap[V];
+  if (N.getNode()) return N;
+
+  // Otherwise create a new SDValue and remember it.
+  SDValue Val = getValueImpl(V);
+  NodeMap[V] = Val;
+
+  return Val;
+}
+
+/// getValueImpl - Helper function for getValue and getNonRegisterValue.
+/// Create an SDValue for the given value.
+SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
+  if (const Constant *C = dyn_cast<Constant>(V)) {
+    EVT VT = TLI.getValueType(V->getType(), true);
+
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
+      return DAG.getConstant(*CI, VT);
+
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+      return DAG.getGlobalAddress(GV, getCurDebugLoc(), VT);
+
+    if (isa<ConstantPointerNull>(C))
+      return DAG.getConstant(0, TLI.getPointerTy());
+
+    if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+      return DAG.getConstantFP(*CFP, VT);
+
+    if (isa<UndefValue>(C) && !V->getType()->isAggregateType())
+      return DAG.getUNDEF(VT);
+
+    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+      visit(CE->getOpcode(), *CE);
+      SDValue N1 = NodeMap[V];
+      assert(N1.getNode() && "visit didn't populate the NodeMap!");
+      return N1;
+    }
+
+    if (isa<ConstantStruct>(C) || isa<ConstantArray>(C)) {
+      SmallVector<SDValue, 4> Constants;
+      for (User::const_op_iterator OI = C->op_begin(), OE = C->op_end();
+           OI != OE; ++OI) {
+        SDNode *Val = getValue(*OI).getNode();
+        // If the operand is an empty aggregate, there are no values.
+        if (!Val) continue;
+        // Add each leaf value from the operand to the Constants list
+        // to form a flattened list of all the values.
+        for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i)
+          Constants.push_back(SDValue(Val, i));
+      }
+
+      return DAG.getMergeValues(&Constants[0], Constants.size(),
+                                getCurDebugLoc());
+    }
+
+    if (C->getType()->isStructTy() || C->getType()->isArrayTy()) {
+      assert((isa<ConstantAggregateZero>(C) || isa<UndefValue>(C)) &&
+             "Unknown struct or array constant!");
+
+      SmallVector<EVT, 4> ValueVTs;
+      ComputeValueVTs(TLI, C->getType(), ValueVTs);
+      unsigned NumElts = ValueVTs.size();
+      if (NumElts == 0)
+        return SDValue(); // empty struct
+      SmallVector<SDValue, 4> Constants(NumElts);
+      for (unsigned i = 0; i != NumElts; ++i) {
+        EVT EltVT = ValueVTs[i];
+        if (isa<UndefValue>(C))
+          Constants[i] = DAG.getUNDEF(EltVT);
+        else if (EltVT.isFloatingPoint())
+          Constants[i] = DAG.getConstantFP(0, EltVT);
+        else
+          Constants[i] = DAG.getConstant(0, EltVT);
+      }
+
+      return DAG.getMergeValues(&Constants[0], NumElts,
+                                getCurDebugLoc());
+    }
+
+    if (const BlockAddress *BA = dyn_cast<BlockAddress>(C))
+      return DAG.getBlockAddress(BA, VT);
+
+    const VectorType *VecTy = cast<VectorType>(V->getType());
+    unsigned NumElements = VecTy->getNumElements();
+
+    // Now that we know the number and type of the elements, get that number of
+    // elements into the Ops array based on what kind of constant it is.
+    SmallVector<SDValue, 16> Ops;
+    if (const ConstantVector *CP = dyn_cast<ConstantVector>(C)) {
+      for (unsigned i = 0; i != NumElements; ++i)
+        Ops.push_back(getValue(CP->getOperand(i)));
+    } else {
+      assert(isa<ConstantAggregateZero>(C) && "Unknown vector constant!");
+      EVT EltVT = TLI.getValueType(VecTy->getElementType());
+
+      SDValue Op;
+      if (EltVT.isFloatingPoint())
+        Op = DAG.getConstantFP(0, EltVT);
+      else
+        Op = DAG.getConstant(0, EltVT);
+      Ops.assign(NumElements, Op);
+    }
+
+    // Create a BUILD_VECTOR node.
+    return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurDebugLoc(),
+                                    VT, &Ops[0], Ops.size());
+  }
+
+  // If this is a static alloca, generate it as the frameindex instead of
+  // computation.
+  if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+    DenseMap<const AllocaInst*, int>::iterator SI =
+      FuncInfo.StaticAllocaMap.find(AI);
+    if (SI != FuncInfo.StaticAllocaMap.end())
+      return DAG.getFrameIndex(SI->second, TLI.getPointerTy());
+  }
+
+  // If this is an instruction which fast-isel has deferred, select it now.
+  if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+    unsigned InReg = FuncInfo.InitializeRegForValue(Inst);
+    RegsForValue RFV(*DAG.getContext(), TLI, InReg, Inst->getType());
+    SDValue Chain = DAG.getEntryNode();
+    return RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL);
+  }
+
+  llvm_unreachable("Can't get register for value!");
+  return SDValue();
+}
+
+void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
+  SDValue Chain = getControlRoot();
+  SmallVector<ISD::OutputArg, 8> Outs;
+  SmallVector<SDValue, 8> OutVals;
+
+  if (!FuncInfo.CanLowerReturn) {
+    unsigned DemoteReg = FuncInfo.DemoteRegister;
+    const Function *F = I.getParent()->getParent();
+
+    // Emit a store of the return value through the virtual register.
+    // Leave Outs empty so that LowerReturn won't try to load return
+    // registers the usual way.
+    SmallVector<EVT, 1> PtrValueVTs;
+    ComputeValueVTs(TLI, PointerType::getUnqual(F->getReturnType()),
+                    PtrValueVTs);
+
+    SDValue RetPtr = DAG.getRegister(DemoteReg, PtrValueVTs[0]);
+    SDValue RetOp = getValue(I.getOperand(0));
+
+    SmallVector<EVT, 4> ValueVTs;
+    SmallVector<uint64_t, 4> Offsets;
+    ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs, &Offsets);
+    unsigned NumValues = ValueVTs.size();
+
+    SmallVector<SDValue, 4> Chains(NumValues);
+    EVT PtrVT = PtrValueVTs[0];
+    for (unsigned i = 0; i != NumValues; ++i) {
+      SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), PtrVT, RetPtr,
+                                DAG.getConstant(Offsets[i], PtrVT));
+      Chains[i] =
+        DAG.getStore(Chain, getCurDebugLoc(),
+                     SDValue(RetOp.getNode(), RetOp.getResNo() + i),
+                     Add, NULL, Offsets[i], false, false, 0);
+    }
+
+    Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+                        MVT::Other, &Chains[0], NumValues);
+  } else if (I.getNumOperands() != 0) {
+    SmallVector<EVT, 4> ValueVTs;
+    ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs);
+    unsigned NumValues = ValueVTs.size();
+    if (NumValues) {
+      SDValue RetOp = getValue(I.getOperand(0));
+      for (unsigned j = 0, f = NumValues; j != f; ++j) {
+        EVT VT = ValueVTs[j];
+
+        ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+
+        const Function *F = I.getParent()->getParent();
+        if (F->paramHasAttr(0, Attribute::SExt))
+          ExtendKind = ISD::SIGN_EXTEND;
+        else if (F->paramHasAttr(0, Attribute::ZExt))
+          ExtendKind = ISD::ZERO_EXTEND;
+
+        // FIXME: C calling convention requires the return type to be promoted
+        // to at least 32-bit. But this is not necessary for non-C calling
+        // conventions. The frontend should mark functions whose return values
+        // require promoting with signext or zeroext attributes.
+        if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) {
+          EVT MinVT = TLI.getRegisterType(*DAG.getContext(), MVT::i32);
+          if (VT.bitsLT(MinVT))
+            VT = MinVT;
+        }
+
+        unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), VT);
+        EVT PartVT = TLI.getRegisterType(*DAG.getContext(), VT);
+        SmallVector<SDValue, 4> Parts(NumParts);
+        getCopyToParts(DAG, getCurDebugLoc(),
+                       SDValue(RetOp.getNode(), RetOp.getResNo() + j),
+                       &Parts[0], NumParts, PartVT, ExtendKind);
+
+        // 'inreg' on function refers to return value
+        ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+        if (F->paramHasAttr(0, Attribute::InReg))
+          Flags.setInReg();
+
+        // Propagate extension type if any
+        if (F->paramHasAttr(0, Attribute::SExt))
+          Flags.setSExt();
+        else if (F->paramHasAttr(0, Attribute::ZExt))
+          Flags.setZExt();
+
+        for (unsigned i = 0; i < NumParts; ++i) {
+          Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(),
+                                        /*isfixed=*/true));
+          OutVals.push_back(Parts[i]);
+        }
+      }
+    }
+  }
+
+  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+  CallingConv::ID CallConv =
+    DAG.getMachineFunction().getFunction()->getCallingConv();
+  Chain = TLI.LowerReturn(Chain, CallConv, isVarArg,
+                          Outs, OutVals, getCurDebugLoc(), DAG);
+
+  // Verify that the target's LowerReturn behaved as expected.
+  assert(Chain.getNode() && Chain.getValueType() == MVT::Other &&
+         "LowerReturn didn't return a valid chain!");
+
+  // Update the DAG with the new chain value resulting from return lowering.
+  DAG.setRoot(Chain);
+}
+
+/// CopyToExportRegsIfNeeded - If the given value has virtual registers
+/// created for it, emit nodes to copy the value into the virtual
+/// registers.
+void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) {
+  DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
+  if (VMI != FuncInfo.ValueMap.end()) {
+    assert(!V->use_empty() && "Unused value assigned virtual registers!");
+    CopyValueToVirtualRegister(V, VMI->second);
+  }
+}
+
+/// ExportFromCurrentBlock - If this condition isn't known to be exported from
+/// the current basic block, add it to ValueMap now so that we'll get a
+/// CopyTo/FromReg.
+void SelectionDAGBuilder::ExportFromCurrentBlock(const Value *V) {
+  // No need to export constants.
+  if (!isa<Instruction>(V) && !isa<Argument>(V)) return;
+
+  // Already exported?
+  if (FuncInfo.isExportedInst(V)) return;
+
+  unsigned Reg = FuncInfo.InitializeRegForValue(V);
+  CopyValueToVirtualRegister(V, Reg);
+}
+
+bool SelectionDAGBuilder::isExportableFromCurrentBlock(const Value *V,
+                                                     const BasicBlock *FromBB) {
+  // The operands of the setcc have to be in this block.  We don't know
+  // how to export them from some other block.
+  if (const Instruction *VI = dyn_cast<Instruction>(V)) {
+    // Can export from current BB.
+    if (VI->getParent() == FromBB)
+      return true;
+
+    // Is already exported, noop.
+    return FuncInfo.isExportedInst(V);
+  }
+
+  // If this is an argument, we can export it if the BB is the entry block or
+  // if it is already exported.
+  if (isa<Argument>(V)) {
+    if (FromBB == &FromBB->getParent()->getEntryBlock())
+      return true;
+
+    // Otherwise, can only export this if it is already exported.
+    return FuncInfo.isExportedInst(V);
+  }
+
+  // Otherwise, constants can always be exported.
+  return true;
+}
+
+static bool InBlock(const Value *V, const BasicBlock *BB) {
+  if (const Instruction *I = dyn_cast<Instruction>(V))
+    return I->getParent() == BB;
+  return true;
+}
+
+/// EmitBranchForMergedCondition - Helper method for FindMergedConditions.
+/// This function emits a branch and is used at the leaves of an OR or an
+/// AND operator tree.
+///
+void
+SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
+                                                  MachineBasicBlock *TBB,
+                                                  MachineBasicBlock *FBB,
+                                                  MachineBasicBlock *CurBB,
+                                                  MachineBasicBlock *SwitchBB) {
+  const BasicBlock *BB = CurBB->getBasicBlock();
+
+  // If the leaf of the tree is a comparison, merge the condition into
+  // the caseblock.
+  if (const CmpInst *BOp = dyn_cast<CmpInst>(Cond)) {
+    // The operands of the cmp have to be in this block.  We don't know
+    // how to export them from some other block.  If this is the first block
+    // of the sequence, no exporting is needed.
+    if (CurBB == SwitchBB ||
+        (isExportableFromCurrentBlock(BOp->getOperand(0), BB) &&
+         isExportableFromCurrentBlock(BOp->getOperand(1), BB))) {
+      ISD::CondCode Condition;
+      if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) {
+        Condition = getICmpCondCode(IC->getPredicate());
+      } else if (const FCmpInst *FC = dyn_cast<FCmpInst>(Cond)) {
+        Condition = getFCmpCondCode(FC->getPredicate());
+      } else {
+        Condition = ISD::SETEQ; // silence warning.
+        llvm_unreachable("Unknown compare instruction");
+      }
+
+      CaseBlock CB(Condition, BOp->getOperand(0),
+                   BOp->getOperand(1), NULL, TBB, FBB, CurBB);
+      SwitchCases.push_back(CB);
+      return;
+    }
+  }
+
+  // Create a CaseBlock record representing this branch.
+  CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(*DAG.getContext()),
+               NULL, TBB, FBB, CurBB);
+  SwitchCases.push_back(CB);
+}
+
+/// FindMergedConditions - If Cond is an expression like (X && Y) or (X || Y),
+/// recursively emit the sequence of conditional branches that implements its
+/// short-circuit evaluation; otherwise emit a single branch for Cond via
+/// EmitBranchForMergedCondition.
+void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
+                                               MachineBasicBlock *TBB,
+                                               MachineBasicBlock *FBB,
+                                               MachineBasicBlock *CurBB,
+                                               MachineBasicBlock *SwitchBB,
+                                               unsigned Opc) {
+  // If this node is not part of the or/and tree, emit it as a branch.
+  const Instruction *BOp = dyn_cast<Instruction>(Cond);
+  if (!BOp || !(isa<BinaryOperator>(BOp) || isa<CmpInst>(BOp)) ||
+      (unsigned)BOp->getOpcode() != Opc || !BOp->hasOneUse() ||
+      BOp->getParent() != CurBB->getBasicBlock() ||
+      !InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) ||
+      !InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) {
+    EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB);
+    return;
+  }
+
+  //  Create TmpBB after CurBB.
+  MachineFunction::iterator BBI = CurBB;
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineBasicBlock *TmpBB = MF.CreateMachineBasicBlock(CurBB->getBasicBlock());
+  CurBB->getParent()->insert(++BBI, TmpBB);
+
+  if (Opc == Instruction::Or) {
+    // Codegen X | Y as:
+    //   jmp_if_X TBB
+    //   jmp TmpBB
+    // TmpBB:
+    //   jmp_if_Y TBB
+    //   jmp FBB
+    //
+
+    // Emit the LHS condition.
+    FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc);
+
+    // Emit the RHS condition into TmpBB.
+    FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc);
+  } else {
+    assert(Opc == Instruction::And && "Unknown merge op!");
+    // Codegen X & Y as:
+    //   jmp_if_X TmpBB
+    //   jmp FBB
+    // TmpBB:
+    //   jmp_if_Y TBB
+    //   jmp FBB
+    //
+    //  This requires creation of TmpBB after CurBB.
+
+    // Emit the LHS condition.
+    FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc);
+
+    // Emit the RHS condition into TmpBB.
+    FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc);
+  }
+}
+
+/// If the set of cases should be emitted as a series of branches, return true.
+/// If we should emit this as a bunch of and/or'd together conditions, return
+/// false.
+bool
+SelectionDAGBuilder::ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases){
+  if (Cases.size() != 2) return true;
+
+  // If this is two comparisons of the same values or'd or and'd together, they
+  // will get folded into a single comparison, so don't emit two blocks.
+  if ((Cases[0].CmpLHS == Cases[1].CmpLHS &&
+       Cases[0].CmpRHS == Cases[1].CmpRHS) ||
+      (Cases[0].CmpRHS == Cases[1].CmpLHS &&
+       Cases[0].CmpLHS == Cases[1].CmpRHS)) {
+    return false;
+  }
+
+  // Handle: (X != null) | (Y != null) --> (X|Y) != 0
+  // Handle: (X == null) & (Y == null) --> (X|Y) == 0
+  if (Cases[0].CmpRHS == Cases[1].CmpRHS &&
+      Cases[0].CC == Cases[1].CC &&
+      isa<Constant>(Cases[0].CmpRHS) &&
+      cast<Constant>(Cases[0].CmpRHS)->isNullValue()) {
+    if (Cases[0].CC == ISD::SETEQ && Cases[0].TrueBB == Cases[1].ThisBB)
+      return false;
+    if (Cases[0].CC == ISD::SETNE && Cases[0].FalseBB == Cases[1].ThisBB)
+      return false;
+  }
+  
+  return true;
+}
+
+void SelectionDAGBuilder::visitBr(const BranchInst &I) {
+  MachineBasicBlock *BrMBB = FuncInfo.MBB;
+
+  // Update machine-CFG edges.
+  MachineBasicBlock *Succ0MBB = FuncInfo.MBBMap[I.getSuccessor(0)];
+
+  // Figure out which block is immediately after the current one.
+  MachineBasicBlock *NextBlock = 0;
+  MachineFunction::iterator BBI = BrMBB;
+  if (++BBI != FuncInfo.MF->end())
+    NextBlock = BBI;
+
+  if (I.isUnconditional()) {
+    // Update machine-CFG edges.
+    BrMBB->addSuccessor(Succ0MBB);
+
+    // If this is not a fall-through branch, emit the branch.
+    if (Succ0MBB != NextBlock)
+      DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(),
+                              MVT::Other, getControlRoot(),
+                              DAG.getBasicBlock(Succ0MBB)));
+
+    return;
+  }
+
+  // If this condition is one of the special cases we handle, do special stuff
+  // now.
+  const Value *CondVal = I.getCondition();
+  MachineBasicBlock *Succ1MBB = FuncInfo.MBBMap[I.getSuccessor(1)];
+
+  // If this is a series of conditions that are or'd or and'd together, emit
+  // this as a sequence of branches instead of setcc's with and/or operations.
+  // For example, instead of something like:
+  //     cmp A, B
+  //     C = seteq
+  //     cmp D, E
+  //     F = setle
+  //     or C, F
+  //     jnz foo
+  // Emit:
+  //     cmp A, B
+  //     je foo
+  //     cmp D, E
+  //     jle foo
+  //
+  if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) {
+    if (BOp->hasOneUse() &&
+        (BOp->getOpcode() == Instruction::And ||
+         BOp->getOpcode() == Instruction::Or)) {
+      FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB,
+                           BOp->getOpcode());
+      // If the compares in later blocks need to use values not currently
+      // exported from this block, export them now.  This block should always
+      // be the first entry.
+      assert(SwitchCases[0].ThisBB == BrMBB && "Unexpected lowering!");
+
+      // Allow some cases to be rejected.
+      if (ShouldEmitAsBranches(SwitchCases)) {
+        for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i) {
+          ExportFromCurrentBlock(SwitchCases[i].CmpLHS);
+          ExportFromCurrentBlock(SwitchCases[i].CmpRHS);
+        }
+
+        // Emit the branch for this block.
+        visitSwitchCase(SwitchCases[0], BrMBB);
+        SwitchCases.erase(SwitchCases.begin());
+        return;
+      }
+
+      // Okay, we decided not to do this, remove any inserted MBB's and clear
+      // SwitchCases.
+      for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i)
+        FuncInfo.MF->erase(SwitchCases[i].ThisBB);
+
+      SwitchCases.clear();
+    }
+  }
+
+  // Create a CaseBlock record representing this branch.
+  CaseBlock CB(ISD::SETEQ, CondVal, ConstantInt::getTrue(*DAG.getContext()),
+               NULL, Succ0MBB, Succ1MBB, BrMBB);
+
+  // Use visitSwitchCase to actually insert the fast branch sequence for this
+  // cond branch.
+  visitSwitchCase(CB, BrMBB);
+}
+
+/// visitSwitchCase - Emits the necessary code to represent a single node in
+/// the binary search tree resulting from lowering a switch instruction.
+void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
+                                          MachineBasicBlock *SwitchBB) {
+  SDValue Cond;
+  SDValue CondLHS = getValue(CB.CmpLHS);
+  DebugLoc dl = getCurDebugLoc();
+
+  // Build the setcc now.
+  if (CB.CmpMHS == NULL) {
+    // Fold "(X == true)" to X and "(X == false)" to !X to
+    // handle common cases produced by branch lowering.
+    if (CB.CmpRHS == ConstantInt::getTrue(*DAG.getContext()) &&
+        CB.CC == ISD::SETEQ)
+      Cond = CondLHS;
+    else if (CB.CmpRHS == ConstantInt::getFalse(*DAG.getContext()) &&
+             CB.CC == ISD::SETEQ) {
+      SDValue True = DAG.getConstant(1, CondLHS.getValueType());
+      Cond = DAG.getNode(ISD::XOR, dl, CondLHS.getValueType(), CondLHS, True);
+    } else
+      Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, getValue(CB.CmpRHS), CB.CC);
+  } else {
+    assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now");
+
+    const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue();
+    const APInt& High  = cast<ConstantInt>(CB.CmpRHS)->getValue();
+
+    SDValue CmpOp = getValue(CB.CmpMHS);
+    EVT VT = CmpOp.getValueType();
+
+    if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) {
+      Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, VT),
+                          ISD::SETLE);
+    } else {
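+      // Emit the range test as an unsigned comparison against the width of
+      // the range; e.g. the range [10, 20] becomes (X - 10) <=u 10.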
+      SDValue SUB = DAG.getNode(ISD::SUB, dl,
+                                VT, CmpOp, DAG.getConstant(Low, VT));
+      Cond = DAG.getSetCC(dl, MVT::i1, SUB,
+                          DAG.getConstant(High-Low, VT), ISD::SETULE);
+    }
+  }
+
+  // Update successor info
+  SwitchBB->addSuccessor(CB.TrueBB);
+  SwitchBB->addSuccessor(CB.FalseBB);
+
+  // Set NextBlock to be the MBB immediately after the current one, if any.
+  // This is used to avoid emitting unnecessary branches to the next block.
+  MachineBasicBlock *NextBlock = 0;
+  MachineFunction::iterator BBI = SwitchBB;
+  if (++BBI != FuncInfo.MF->end())
+    NextBlock = BBI;
+
+  // If the lhs block is the next block, invert the condition so that we can
+  // fall through to the lhs instead of the rhs block.
+  if (CB.TrueBB == NextBlock) {
+    std::swap(CB.TrueBB, CB.FalseBB);
+    SDValue True = DAG.getConstant(1, Cond.getValueType());
+    Cond = DAG.getNode(ISD::XOR, dl, Cond.getValueType(), Cond, True);
+  }
+
+  SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
+                               MVT::Other, getControlRoot(), Cond,
+                               DAG.getBasicBlock(CB.TrueBB));
+
+  // Insert the false branch.
+  if (CB.FalseBB != NextBlock)
+    BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond,
+                         DAG.getBasicBlock(CB.FalseBB));
+
+  DAG.setRoot(BrCond);
+}
+
+/// visitJumpTable - Emit JumpTable node in the current MBB
+void SelectionDAGBuilder::visitJumpTable(JumpTable &JT) {
+  // Emit the code for the jump table
+  assert(JT.Reg != -1U && "Should lower JT Header first!");
+  EVT PTy = TLI.getPointerTy();
+  SDValue Index = DAG.getCopyFromReg(getControlRoot(), getCurDebugLoc(),
+                                     JT.Reg, PTy);
+  SDValue Table = DAG.getJumpTable(JT.JTI, PTy);
+  SDValue BrJumpTable = DAG.getNode(ISD::BR_JT, getCurDebugLoc(),
+                                    MVT::Other, Index.getValue(1),
+                                    Table, Index);
+  DAG.setRoot(BrJumpTable);
+}
+
+/// visitJumpTableHeader - This function emits the necessary code to produce
+/// the index into the JumpTable from the value being switched on.
+void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT,
+                                               JumpTableHeader &JTH,
+                                               MachineBasicBlock *SwitchBB) {
+  // Subtract the lowest switch case value from the value being switched on
+  // and conditionally branch to the default mbb if the result is greater
+  // than the difference between the smallest and largest cases.
+  SDValue SwitchOp = getValue(JTH.SValue);
+  EVT VT = SwitchOp.getValueType();
+  SDValue Sub = DAG.getNode(ISD::SUB, getCurDebugLoc(), VT, SwitchOp,
+                            DAG.getConstant(JTH.First, VT));
+
+  // The SDNode we just created, which holds the value being switched on minus
+  // the smallest case value, needs to be copied to a virtual register so it
+  // can be used as an index into the jump table in a subsequent basic block.
+  // This value may be smaller or larger than the target's pointer type, and
+  // may therefore require extension or truncation.
+  SwitchOp = DAG.getZExtOrTrunc(Sub, getCurDebugLoc(), TLI.getPointerTy());
+
+  unsigned JumpTableReg = FuncInfo.CreateReg(TLI.getPointerTy());
+  SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurDebugLoc(),
+                                    JumpTableReg, SwitchOp);
+  JT.Reg = JumpTableReg;
+
+  // Emit the range check for the jump table, and branch to the default block
+  // for the switch statement if the value being switched on exceeds the largest
+  // case in the switch.
+  SDValue CMP = DAG.getSetCC(getCurDebugLoc(),
+                             TLI.getSetCCResultType(Sub.getValueType()), Sub,
+                             DAG.getConstant(JTH.Last-JTH.First,VT),
+                             ISD::SETUGT);
+
+  // Set NextBlock to be the MBB immediately after the current one, if any.
+  // This is used to avoid emitting unnecessary branches to the next block.
+  MachineBasicBlock *NextBlock = 0;
+  MachineFunction::iterator BBI = SwitchBB;
+
+  if (++BBI != FuncInfo.MF->end())
+    NextBlock = BBI;
+
+  SDValue BrCond = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
+                               MVT::Other, CopyTo, CMP,
+                               DAG.getBasicBlock(JT.Default));
+
+  if (JT.MBB != NextBlock)
+    BrCond = DAG.getNode(ISD::BR, getCurDebugLoc(), MVT::Other, BrCond,
+                         DAG.getBasicBlock(JT.MBB));
+
+  DAG.setRoot(BrCond);
+}
+
+/// visitBitTestHeader - This function emits the necessary code to produce a
+/// value suitable for "bit tests".
+void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
+                                             MachineBasicBlock *SwitchBB) {
+  // Subtract the minimum value
+  SDValue SwitchOp = getValue(B.SValue);
+  EVT VT = SwitchOp.getValueType();
+  SDValue Sub = DAG.getNode(ISD::SUB, getCurDebugLoc(), VT, SwitchOp,
+                            DAG.getConstant(B.First, VT));
+
+  // Check range
+  SDValue RangeCmp = DAG.getSetCC(getCurDebugLoc(),
+                                  TLI.getSetCCResultType(Sub.getValueType()),
+                                  Sub, DAG.getConstant(B.Range, VT),
+                                  ISD::SETUGT);
+
+  SDValue ShiftOp = DAG.getZExtOrTrunc(Sub, getCurDebugLoc(),
+                                       TLI.getPointerTy());
+
+  B.Reg = FuncInfo.CreateReg(TLI.getPointerTy());
+  SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurDebugLoc(),
+                                    B.Reg, ShiftOp);
+
+  // Set NextBlock to be the MBB immediately after the current one, if any.
+  // This is used to avoid emitting unnecessary branches to the next block.
+  MachineBasicBlock *NextBlock = 0;
+  MachineFunction::iterator BBI = SwitchBB;
+  if (++BBI != FuncInfo.MF->end())
+    NextBlock = BBI;
+
+  MachineBasicBlock* MBB = B.Cases[0].ThisBB;
+
+  SwitchBB->addSuccessor(B.Default);
+  SwitchBB->addSuccessor(MBB);
+
+  SDValue BrRange = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
+                                MVT::Other, CopyTo, RangeCmp,
+                                DAG.getBasicBlock(B.Default));
+
+  if (MBB != NextBlock)
+    BrRange = DAG.getNode(ISD::BR, getCurDebugLoc(), MVT::Other, BrRange,
+                          DAG.getBasicBlock(MBB));
+
+  DAG.setRoot(BrRange);
+}
+
+/// visitBitTestCase - This function produces one "bit test".
+void SelectionDAGBuilder::visitBitTestCase(MachineBasicBlock* NextMBB,
+                                           unsigned Reg,
+                                           BitTestCase &B,
+                                           MachineBasicBlock *SwitchBB) {
+  SDValue ShiftOp = DAG.getCopyFromReg(getControlRoot(), getCurDebugLoc(), Reg,
+                                       TLI.getPointerTy());
+  SDValue Cmp;
+  if (CountPopulation_64(B.Mask) == 1) {
+    // Testing for a single bit; just compare the shift count with what it
+    // would need to be to shift a 1 bit in that position.
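+    // For example, a mask of 0x8 reduces to the test "ShiftOp == 3".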
+    Cmp = DAG.getSetCC(getCurDebugLoc(),
+                       TLI.getSetCCResultType(ShiftOp.getValueType()),
+                       ShiftOp,
+                       DAG.getConstant(CountTrailingZeros_64(B.Mask),
+                                       TLI.getPointerTy()),
+                       ISD::SETEQ);
+  } else {
+    // Make desired shift
+    SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurDebugLoc(),
+                                    TLI.getPointerTy(),
+                                    DAG.getConstant(1, TLI.getPointerTy()),
+                                    ShiftOp);
+
+    // Emit bit tests and jumps
+    SDValue AndOp = DAG.getNode(ISD::AND, getCurDebugLoc(),
+                                TLI.getPointerTy(), SwitchVal,
+                                DAG.getConstant(B.Mask, TLI.getPointerTy()));
+    Cmp = DAG.getSetCC(getCurDebugLoc(),
+                       TLI.getSetCCResultType(AndOp.getValueType()),
+                       AndOp, DAG.getConstant(0, TLI.getPointerTy()),
+                       ISD::SETNE);
+  }
+
+  SwitchBB->addSuccessor(B.TargetBB);
+  SwitchBB->addSuccessor(NextMBB);
+
+  SDValue BrAnd = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
+                              MVT::Other, getControlRoot(),
+                              Cmp, DAG.getBasicBlock(B.TargetBB));
+
+  // Set NextBlock to be the MBB immediately after the current one, if any.
+  // This is used to avoid emitting unnecessary branches to the next block.
+  MachineBasicBlock *NextBlock = 0;
+  MachineFunction::iterator BBI = SwitchBB;
+  if (++BBI != FuncInfo.MF->end())
+    NextBlock = BBI;
+
+  if (NextMBB != NextBlock)
+    BrAnd = DAG.getNode(ISD::BR, getCurDebugLoc(), MVT::Other, BrAnd,
+                        DAG.getBasicBlock(NextMBB));
+
+  DAG.setRoot(BrAnd);
+}
+
+/// handleSmallSwitchRange - Emit a series of specific tests (suitable for
+/// small case ranges).
+bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
+                                                 CaseRecVector& WorkList,
+                                                 const Value* SV,
+                                                 MachineBasicBlock *Default,
+                                                 MachineBasicBlock *SwitchBB) {
+  Case& BackCase  = *(CR.Range.second-1);
+
+  // Size is the number of Cases represented by this range.
+  size_t Size = CR.Range.second - CR.Range.first;
+  if (Size > 3)
+    return false;
+
+  // Get the MachineFunction which holds the current MBB.  This is used when
+  // inserting any additional MBBs necessary to represent the switch.
+  MachineFunction *CurMF = FuncInfo.MF;
+
+  // Figure out which block is immediately after the current one.
+  MachineBasicBlock *NextBlock = 0;
+  MachineFunction::iterator BBI = CR.CaseBB;
+
+  if (++BBI != FuncInfo.MF->end())
+    NextBlock = BBI;
+
+  // TODO: If any two of the cases has the same destination, and if one value
+  // is the same as the other, but has one bit unset that the other has set,
+  // use bit manipulation to do two compares at once.  For example:
+  // "if (X == 6 || X == 4)" -> "if ((X|2) == 6)"
+
+  // Rearrange the case blocks so that the last one falls through if possible.
+  if (NextBlock && Default != NextBlock && BackCase.BB != NextBlock) {
+    // The last case block won't fall through into 'NextBlock' if we emit the
+    // branches in this order.  See if rearranging a case value would help.
+    for (CaseItr I = CR.Range.first, E = CR.Range.second-1; I != E; ++I) {
+      if (I->BB == NextBlock) {
+        std::swap(*I, BackCase);
+        break;
+      }
+    }
+  }
+
+  // Create a CaseBlock record representing a conditional branch to
+  // the Case's target mbb if the value being switched on SV is equal
+  // to C.
+  MachineBasicBlock *CurBlock = CR.CaseBB;
+  for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) {
+    MachineBasicBlock *FallThrough;
+    if (I != E-1) {
+      FallThrough = CurMF->CreateMachineBasicBlock(CurBlock->getBasicBlock());
+      CurMF->insert(BBI, FallThrough);
+
+      // Put SV in a virtual register to make it available from the new blocks.
+      ExportFromCurrentBlock(SV);
+    } else {
+      // If the last case doesn't match, go to the default block.
+      FallThrough = Default;
+    }
+
+    const Value *RHS, *LHS, *MHS;
+    ISD::CondCode CC;
+    if (I->High == I->Low) {
+      // This is just a small case range :) containing exactly 1 case
+      CC = ISD::SETEQ;
+      LHS = SV; RHS = I->High; MHS = NULL;
+    } else {
+      CC = ISD::SETLE;
+      LHS = I->Low; MHS = SV; RHS = I->High;
+    }
+    CaseBlock CB(CC, LHS, RHS, MHS, I->BB, FallThrough, CurBlock);
+
+    // If emitting the first comparison, just call visitSwitchCase to emit the
+    // code into the current block.  Otherwise, push the CaseBlock onto the
+    // vector to be later processed by SDISel, and insert the node's MBB
+    // before the next MBB.
+    if (CurBlock == SwitchBB)
+      visitSwitchCase(CB, SwitchBB);
+    else
+      SwitchCases.push_back(CB);
+
+    CurBlock = FallThrough;
+  }
+
+  return true;
+}
+
+static inline bool areJTsAllowed(const TargetLowering &TLI) {
+  return !DisableJumpTables &&
+          (TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
+           TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other));
+}
+
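+// ComputeRange - Return the number of integer values in the range
+// [First, Last], sign-extending both endpoints to a common width first;
+// e.g. First = -2 and Last = 3 yield 6.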
+static APInt ComputeRange(const APInt &First, const APInt &Last) {
+  APInt LastExt(Last), FirstExt(First);
+  uint32_t BitWidth = std::max(Last.getBitWidth(), First.getBitWidth()) + 1;
+  LastExt.sext(BitWidth); FirstExt.sext(BitWidth);
+  return (LastExt - FirstExt + 1ULL);
+}
+
+/// handleJTSwitchCase - Emit jumptable for current switch case range
+bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec& CR,
+                                             CaseRecVector& WorkList,
+                                             const Value* SV,
+                                             MachineBasicBlock* Default,
+                                             MachineBasicBlock *SwitchBB) {
+  Case& FrontCase = *CR.Range.first;
+  Case& BackCase  = *(CR.Range.second-1);
+
+  const APInt &First = cast<ConstantInt>(FrontCase.Low)->getValue();
+  const APInt &Last  = cast<ConstantInt>(BackCase.High)->getValue();
+
+  APInt TSize(First.getBitWidth(), 0);
+  for (CaseItr I = CR.Range.first, E = CR.Range.second;
+       I!=E; ++I)
+    TSize += I->size();
+
+  if (!areJTsAllowed(TLI) || TSize.ult(4))
+    return false;
+
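+  // Only lower to a jump table when the cases are reasonably dense; e.g. 40
+  // cases spread over a range of 100 values gives a density of 0.4, the
+  // minimum accepted below.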
+  APInt Range = ComputeRange(First, Last);
+  double Density = TSize.roundToDouble() / Range.roundToDouble();
+  if (Density < 0.4)
+    return false;
+
+  DEBUG(dbgs() << "Lowering jump table\n"
+               << "First entry: " << First << ". Last entry: " << Last << '\n'
+               << "Range: " << Range << ". "
+               << "Size: " << TSize << ". Density: " << Density << "\n\n");
+
+  // Get the MachineFunction which holds the current MBB.  This is used when
+  // inserting any additional MBBs necessary to represent the switch.
+  MachineFunction *CurMF = FuncInfo.MF;
+
+  // Figure out which block is immediately after the current one.
+  MachineFunction::iterator BBI = CR.CaseBB;
+  ++BBI;
+
+  const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock();
+
+  // Create a new basic block to hold the code for loading the address
+  // of the jump table, and jumping to it.  Update successor information;
+  // we will either branch to the default case for the switch, or the jump
+  // table.
+  MachineBasicBlock *JumpTableBB = CurMF->CreateMachineBasicBlock(LLVMBB);
+  CurMF->insert(BBI, JumpTableBB);
+  CR.CaseBB->addSuccessor(Default);
+  CR.CaseBB->addSuccessor(JumpTableBB);
+
+  // Build a vector of destination BBs, corresponding to each target
+  // of the jump table. If the value of the jump table slot corresponds to
+  // a case statement, push the case's BB onto the vector; otherwise, push
+  // the default BB.
+  std::vector<MachineBasicBlock*> DestBBs;
+  APInt TEI = First;
+  for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++TEI) {
+    const APInt &Low = cast<ConstantInt>(I->Low)->getValue();
+    const APInt &High = cast<ConstantInt>(I->High)->getValue();
+
+    if (Low.sle(TEI) && TEI.sle(High)) {
+      DestBBs.push_back(I->BB);
+      if (TEI==High)
+        ++I;
+    } else {
+      DestBBs.push_back(Default);
+    }
+  }
+
+  // Update successor info. Add one edge to each unique successor.
+  BitVector SuccsHandled(CR.CaseBB->getParent()->getNumBlockIDs());
+  for (std::vector<MachineBasicBlock*>::iterator I = DestBBs.begin(),
+         E = DestBBs.end(); I != E; ++I) {
+    if (!SuccsHandled[(*I)->getNumber()]) {
+      SuccsHandled[(*I)->getNumber()] = true;
+      JumpTableBB->addSuccessor(*I);
+    }
+  }
+
+  // Create a jump table index for this jump table.
+  unsigned JTEncoding = TLI.getJumpTableEncoding();
+  unsigned JTI = CurMF->getOrCreateJumpTableInfo(JTEncoding)
+                       ->createJumpTableIndex(DestBBs);
+
+  // Set the jump table information so that we can codegen it as a second
+  // MachineBasicBlock
+  JumpTable JT(-1U, JTI, JumpTableBB, Default);
+  JumpTableHeader JTH(First, Last, SV, CR.CaseBB, (CR.CaseBB == SwitchBB));
+  if (CR.CaseBB == SwitchBB)
+    visitJumpTableHeader(JT, JTH, SwitchBB);
+
+  JTCases.push_back(JumpTableBlock(JTH, JT));
+
+  return true;
+}
+
+/// handleBTSplitSwitchCase - Emit a comparison and split the binary search
+/// tree into two subtrees.
+bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
+                                                  CaseRecVector& WorkList,
+                                                  const Value* SV,
+                                                  MachineBasicBlock *Default,
+                                                  MachineBasicBlock *SwitchBB) {
+  // Get the MachineFunction which holds the current MBB.  This is used when
+  // inserting any additional MBBs necessary to represent the switch.
+  MachineFunction *CurMF = FuncInfo.MF;
+
+  // Figure out which block is immediately after the current one.
+  MachineFunction::iterator BBI = CR.CaseBB;
+  ++BBI;
+
+  Case& FrontCase = *CR.Range.first;
+  Case& BackCase  = *(CR.Range.second-1);
+  const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock();
+
+  // Size is the number of Cases represented by this range.
+  unsigned Size = CR.Range.second - CR.Range.first;
+
+  const APInt &First = cast<ConstantInt>(FrontCase.Low)->getValue();
+  const APInt &Last  = cast<ConstantInt>(BackCase.High)->getValue();
+  double FMetric = 0;
+  CaseItr Pivot = CR.Range.first + Size/2;
+
+  // Select optimal pivot, maximizing sum density of LHS and RHS. This will
+  // (heuristically) allow us to emit JumpTables later.
+  APInt TSize(First.getBitWidth(), 0);
+  for (CaseItr I = CR.Range.first, E = CR.Range.second;
+       I!=E; ++I)
+    TSize += I->size();
+
+  APInt LSize = FrontCase.size();
+  APInt RSize = TSize-LSize;
+  DEBUG(dbgs() << "Selecting best pivot: \n"
+               << "First: " << First << ", Last: " << Last <<'\n'
+               << "LSize: " << LSize << ", RSize: " << RSize << '\n');
+  for (CaseItr I = CR.Range.first, J=I+1, E = CR.Range.second;
+       J!=E; ++I, ++J) {
+    const APInt &LEnd = cast<ConstantInt>(I->High)->getValue();
+    const APInt &RBegin = cast<ConstantInt>(J->Low)->getValue();
+    APInt Range = ComputeRange(LEnd, RBegin);
+    assert((Range - 2ULL).isNonNegative() &&
+           "Invalid case distance");
+    double LDensity = (double)LSize.roundToDouble() /
+                           (LEnd - First + 1ULL).roundToDouble();
+    double RDensity = (double)RSize.roundToDouble() /
+                           (Last - RBegin + 1ULL).roundToDouble();
+    double Metric = Range.logBase2()*(LDensity+RDensity);
+    // Should always split in some non-trivial place
+    DEBUG(dbgs() <<"=>Step\n"
+                 << "LEnd: " << LEnd << ", RBegin: " << RBegin << '\n'
+                 << "LDensity: " << LDensity
+                 << ", RDensity: " << RDensity << '\n'
+                 << "Metric: " << Metric << '\n');
+    if (FMetric < Metric) {
+      Pivot = J;
+      FMetric = Metric;
+      DEBUG(dbgs() << "Current metric set to: " << FMetric << '\n');
+    }
+
+    LSize += J->size();
+    RSize -= J->size();
+  }
+  if (areJTsAllowed(TLI)) {
+    // If our case is dense we *really* should handle it earlier!
+    assert((FMetric > 0) && "Should handle dense range earlier!");
+  } else {
+    Pivot = CR.Range.first + Size/2;
+  }
+
+  CaseRange LHSR(CR.Range.first, Pivot);
+  CaseRange RHSR(Pivot, CR.Range.second);
+  Constant *C = Pivot->Low;
+  MachineBasicBlock *FalseBB = 0, *TrueBB = 0;
+
+  // We know that we branch to the LHS if the Value being switched on is
+  // less than the Pivot value, C.  We use this to optimize our binary
+  // tree a bit, by recognizing that if SV is greater than or equal to the
+  // LHS's Case Value, and that Case Value is exactly one less than the
+  // Pivot's Value, then we can branch directly to the LHS's Target,
+  // rather than creating a leaf node for it.
+  if ((LHSR.second - LHSR.first) == 1 &&
+      LHSR.first->High == CR.GE &&
+      cast<ConstantInt>(C)->getValue() ==
+      (cast<ConstantInt>(CR.GE)->getValue() + 1LL)) {
+    TrueBB = LHSR.first->BB;
+  } else {
+    TrueBB = CurMF->CreateMachineBasicBlock(LLVMBB);
+    CurMF->insert(BBI, TrueBB);
+    WorkList.push_back(CaseRec(TrueBB, C, CR.GE, LHSR));
+
+    // Put SV in a virtual register to make it available from the new blocks.
+    ExportFromCurrentBlock(SV);
+  }
+
+  // Similar to the optimization above, if the Value being switched on is
+  // known to be less than the Constant CR.LT, and the current Case Value
+  // is CR.LT - 1, then we can branch directly to the target block for
+  // the current Case Value, rather than emitting a RHS leaf node for it.
+  if ((RHSR.second - RHSR.first) == 1 && CR.LT &&
+      cast<ConstantInt>(RHSR.first->Low)->getValue() ==
+      (cast<ConstantInt>(CR.LT)->getValue() - 1LL)) {
+    FalseBB = RHSR.first->BB;
+  } else {
+    FalseBB = CurMF->CreateMachineBasicBlock(LLVMBB);
+    CurMF->insert(BBI, FalseBB);
+    WorkList.push_back(CaseRec(FalseBB,CR.LT,C,RHSR));
+
+    // Put SV in a virtual register to make it available from the new blocks.
+    ExportFromCurrentBlock(SV);
+  }
+
+  // Create a CaseBlock record representing a conditional branch to
+  // the LHS node if the value being switched on SV is less than C.
+  // Otherwise, branch to RHS.
+  CaseBlock CB(ISD::SETLT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);
+
+  if (CR.CaseBB == SwitchBB)
+    visitSwitchCase(CB, SwitchBB);
+  else
+    SwitchCases.push_back(CB);
+
+  return true;
+}
+
+/// handleBitTestsSwitchCase - If the current case range has few destinations
+/// and spans less than the machine word bitwidth, encode the case range into
+/// a series of masks and emit bit tests with these masks.
+bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
+                                                   CaseRecVector& WorkList,
+                                                   const Value* SV,
+                                                   MachineBasicBlock* Default,
+                                                   MachineBasicBlock *SwitchBB){
+  EVT PTy = TLI.getPointerTy();
+  unsigned IntPtrBits = PTy.getSizeInBits();
+
+  Case& FrontCase = *CR.Range.first;
+  Case& BackCase  = *(CR.Range.second-1);
+
+  // Get the MachineFunction which holds the current MBB.  This is used when
+  // inserting any additional MBBs necessary to represent the switch.
+  MachineFunction *CurMF = FuncInfo.MF;
+
+  // If target does not have legal shift left, do not emit bit tests at all.
+  if (!TLI.isOperationLegal(ISD::SHL, TLI.getPointerTy()))
+    return false;
+
+  size_t numCmps = 0;
+  for (CaseItr I = CR.Range.first, E = CR.Range.second;
+       I!=E; ++I) {
+    // A single case counts as one comparison, a case range as two.
+    numCmps += (I->Low == I->High ? 1 : 2);
+  }
+
+  // Count unique destinations
+  SmallSet<MachineBasicBlock*, 4> Dests;
+  for (CaseItr I = CR.Range.first, E = CR.Range.second; I!=E; ++I) {
+    Dests.insert(I->BB);
+    if (Dests.size() > 3)
+      // Don't bother with the code below if there are too many unique destinations
+      return false;
+  }
+  DEBUG(dbgs() << "Total number of unique destinations: "
+        << Dests.size() << '\n'
+        << "Total number of comparisons: " << numCmps << '\n');
+
+  // Compute span of values.
+  const APInt& minValue = cast<ConstantInt>(FrontCase.Low)->getValue();
+  const APInt& maxValue = cast<ConstantInt>(BackCase.High)->getValue();
+  APInt cmpRange = maxValue - minValue;
+
+  DEBUG(dbgs() << "Compare range: " << cmpRange << '\n'
+               << "Low bound: " << minValue << '\n'
+               << "High bound: " << maxValue << '\n');
+
+  if (cmpRange.uge(IntPtrBits) ||
+      (!(Dests.size() == 1 && numCmps >= 3) &&
+       !(Dests.size() == 2 && numCmps >= 5) &&
+       !(Dests.size() >= 3 && numCmps >= 6)))
+    return false;
+
+  DEBUG(dbgs() << "Emitting bit tests\n");
+  APInt lowBound = APInt::getNullValue(cmpRange.getBitWidth());
+
+  // Optimize the case where all the case values fit in a word without
+  // having to subtract minValue; in that case we can avoid emitting the
+  // subtraction.
+  if (minValue.isNonNegative() && maxValue.slt(IntPtrBits)) {
+    cmpRange = maxValue;
+  } else {
+    lowBound = minValue;
+  }
+
+  CaseBitsVector CasesBits;
+  unsigned i, count = 0;
+
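+  // Build one (Mask, Bits, BB) record per destination; e.g. cases 1, 3 and 5
+  // that all branch to the same block, with lowBound 0, produce a single
+  // record with Mask = 0x2A.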
+  for (CaseItr I = CR.Range.first, E = CR.Range.second; I!=E; ++I) {
+    MachineBasicBlock* Dest = I->BB;
+    for (i = 0; i < count; ++i)
+      if (Dest == CasesBits[i].BB)
+        break;
+
+    if (i == count) {
+      assert((count < 3) && "Too many destinations to test!");
+      CasesBits.push_back(CaseBits(0, Dest, 0));
+      count++;
+    }
+
+    const APInt& lowValue = cast<ConstantInt>(I->Low)->getValue();
+    const APInt& highValue = cast<ConstantInt>(I->High)->getValue();
+
+    uint64_t lo = (lowValue - lowBound).getZExtValue();
+    uint64_t hi = (highValue - lowBound).getZExtValue();
+
+    for (uint64_t j = lo; j <= hi; j++) {
+      CasesBits[i].Mask |=  1ULL << j;
+      CasesBits[i].Bits++;
+    }
+
+  }
+  std::sort(CasesBits.begin(), CasesBits.end(), CaseBitsCmp());
+
+  BitTestInfo BTC;
+
+  // Figure out which block is immediately after the current one.
+  MachineFunction::iterator BBI = CR.CaseBB;
+  ++BBI;
+
+  const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock();
+
+  DEBUG(dbgs() << "Cases:\n");
+  for (unsigned i = 0, e = CasesBits.size(); i!=e; ++i) {
+    DEBUG(dbgs() << "Mask: " << CasesBits[i].Mask
+                 << ", Bits: " << CasesBits[i].Bits
+                 << ", BB: " << CasesBits[i].BB << '\n');
+
+    MachineBasicBlock *CaseBB = CurMF->CreateMachineBasicBlock(LLVMBB);
+    CurMF->insert(BBI, CaseBB);
+    BTC.push_back(BitTestCase(CasesBits[i].Mask,
+                              CaseBB,
+                              CasesBits[i].BB));
+
+    // Put SV in a virtual register to make it available from the new blocks.
+    ExportFromCurrentBlock(SV);
+  }
+
+  BitTestBlock BTB(lowBound, cmpRange, SV,
+                   -1U, (CR.CaseBB == SwitchBB),
+                   CR.CaseBB, Default, BTC);
+
+  if (CR.CaseBB == SwitchBB)
+    visitBitTestHeader(BTB, SwitchBB);
+
+  BitTestCases.push_back(BTB);
+
+  return true;
+}
+
+/// Clusterify - Transform a simple list of Cases into a list of CaseRanges.
+size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases,
+                                       const SwitchInst& SI) {
+  size_t numCmps = 0;
+
+  // Start with "simple" cases
+  for (size_t i = 1; i < SI.getNumSuccessors(); ++i) {
+    MachineBasicBlock *SMBB = FuncInfo.MBBMap[SI.getSuccessor(i)];
+    Cases.push_back(Case(SI.getSuccessorValue(i),
+                         SI.getSuccessorValue(i),
+                         SMBB));
+  }
+  std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+  // Merge cases into clusters.
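+  // E.g. cases (1 -> BB_A), (2 -> BB_A), (3 -> BB_A) collapse into the
+  // single range [1, 3] -> BB_A.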
+  if (Cases.size() >= 2)
+    // Must recompute end() each iteration because it may be
+    // invalidated by erase if we hold on to it
+    for (CaseItr I = Cases.begin(), J = ++(Cases.begin()); J != Cases.end(); ) {
+      const APInt& nextValue = cast<ConstantInt>(J->Low)->getValue();
+      const APInt& currentValue = cast<ConstantInt>(I->High)->getValue();
+      MachineBasicBlock* nextBB = J->BB;
+      MachineBasicBlock* currentBB = I->BB;
+
+      // If the two neighboring cases go to the same destination, merge them
+      // into a single case.
+      if ((nextValue - currentValue == 1) && (currentBB == nextBB)) {
+        I->High = J->High;
+        J = Cases.erase(J);
+      } else {
+        I = J++;
+      }
+    }
+
+  for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+    if (I->Low != I->High)
+      // A range counts double, since it requires two compares.
+      ++numCmps;
+  }
+
+  return numCmps;
+}
+
+void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
+  MachineBasicBlock *SwitchMBB = FuncInfo.MBB;
+
+  // Figure out which block is immediately after the current one.
+  MachineBasicBlock *NextBlock = 0;
+  MachineBasicBlock *Default = FuncInfo.MBBMap[SI.getDefaultDest()];
+
+  // If there is only the default destination, branch to it if it is not the
+  // next basic block.  Otherwise, just fall through.
+  if (SI.getNumOperands() == 2) {
+    // Update machine-CFG edges.
+
+    // If this is not a fall-through branch, emit the branch.
+    SwitchMBB->addSuccessor(Default);
+    if (Default != NextBlock)
+      DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(),
+                              MVT::Other, getControlRoot(),
+                              DAG.getBasicBlock(Default)));
+
+    return;
+  }
+
+  // If there are any non-default case statements, create a vector of Cases
+  // representing each one, and sort the vector so that we can efficiently
+  // create a binary search tree from them.
+  CaseVector Cases;
+  size_t numCmps = Clusterify(Cases, SI);
+  DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
+               << ". Total compares: " << numCmps << '\n');
+  numCmps = 0;
+
+  // Get the Value to be switched on and default basic blocks, which will be
+  // inserted into CaseBlock records, representing basic blocks in the binary
+  // search tree.
+  const Value *SV = SI.getOperand(0);
+
+  // Push the initial CaseRec onto the worklist
+  CaseRecVector WorkList;
+  WorkList.push_back(CaseRec(SwitchMBB,0,0,
+                             CaseRange(Cases.begin(),Cases.end())));
+
+  while (!WorkList.empty()) {
+    // Grab a record representing a case range to process off the worklist
+    CaseRec CR = WorkList.back();
+    WorkList.pop_back();
+
+    if (handleBitTestsSwitchCase(CR, WorkList, SV, Default, SwitchMBB))
+      continue;
+
+    // If the range has few cases (two or fewer), emit a series of specific
+    // tests.
+    if (handleSmallSwitchRange(CR, WorkList, SV, Default, SwitchMBB))
+      continue;
+
+    // If the switch has more than 5 blocks, and at least 40% dense, and the
+    // target supports indirect branches, then emit a jump table rather than
+    // lowering the switch to a binary tree of conditional branches.
+    if (handleJTSwitchCase(CR, WorkList, SV, Default, SwitchMBB))
+      continue;
+
+    // Emit a binary tree. We need to pick a pivot, and push left and right
+    // ranges onto the worklist. Leaves are handled via the
+    // handleSmallSwitchRange() call.
+    handleBTSplitSwitchCase(CR, WorkList, SV, Default, SwitchMBB);
+  }
+}
+
+void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
+  MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB;
+
+  // Update machine-CFG edges with unique successors.
+  SmallVector<BasicBlock*, 32> succs;
+  succs.reserve(I.getNumSuccessors());
+  for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i)
+    succs.push_back(I.getSuccessor(i));
+  array_pod_sort(succs.begin(), succs.end());
+  succs.erase(std::unique(succs.begin(), succs.end()), succs.end());
+  for (unsigned i = 0, e = succs.size(); i != e; ++i)
+    IndirectBrMBB->addSuccessor(FuncInfo.MBBMap[succs[i]]);
+
+  DAG.setRoot(DAG.getNode(ISD::BRIND, getCurDebugLoc(),
+                          MVT::Other, getControlRoot(),
+                          getValue(I.getAddress())));
+}
+
+void SelectionDAGBuilder::visitFSub(const User &I) {
+  // -0.0 - X --> fneg
+  const Type *Ty = I.getType();
+  if (Ty->isVectorTy()) {
+    if (ConstantVector *CV = dyn_cast<ConstantVector>(I.getOperand(0))) {
+      const VectorType *DestTy = cast<VectorType>(I.getType());
+      const Type *ElTy = DestTy->getElementType();
+      unsigned VL = DestTy->getNumElements();
+      std::vector<Constant*> NZ(VL, ConstantFP::getNegativeZero(ElTy));
+      Constant *CNZ = ConstantVector::get(&NZ[0], NZ.size());
+      if (CV == CNZ) {
+        SDValue Op2 = getValue(I.getOperand(1));
+        setValue(&I, DAG.getNode(ISD::FNEG, getCurDebugLoc(),
+                                 Op2.getValueType(), Op2));
+        return;
+      }
+    }
+  }
+
+  if (ConstantFP *CFP = dyn_cast<ConstantFP>(I.getOperand(0)))
+    if (CFP->isExactlyValue(ConstantFP::getNegativeZero(Ty)->getValueAPF())) {
+      SDValue Op2 = getValue(I.getOperand(1));
+      setValue(&I, DAG.getNode(ISD::FNEG, getCurDebugLoc(),
+                               Op2.getValueType(), Op2));
+      return;
+    }
+
+  visitBinary(I, ISD::FSUB);
+}
+
+void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
+  SDValue Op1 = getValue(I.getOperand(0));
+  SDValue Op2 = getValue(I.getOperand(1));
+  setValue(&I, DAG.getNode(OpCode, getCurDebugLoc(),
+                           Op1.getValueType(), Op1, Op2));
+}
+
+void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
+  SDValue Op1 = getValue(I.getOperand(0));
+  SDValue Op2 = getValue(I.getOperand(1));
+  if (!I.getType()->isVectorTy() &&
+      Op2.getValueType() != TLI.getShiftAmountTy()) {
+    // If the operand is smaller than the shift count type, promote it.
+    EVT PTy = TLI.getPointerTy();
+    EVT STy = TLI.getShiftAmountTy();
+    if (STy.bitsGT(Op2.getValueType()))
+      Op2 = DAG.getNode(ISD::ANY_EXTEND, getCurDebugLoc(),
+                        TLI.getShiftAmountTy(), Op2);
+    // If the operand is larger than the shift count type but the shift
+    // count type has enough bits to represent any shift value, truncate
+    // it now. This is a common case and it exposes the truncate to
+    // optimization early.
+    else if (STy.getSizeInBits() >=
+             Log2_32_Ceil(Op2.getValueType().getSizeInBits()))
+      Op2 = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(),
+                        TLI.getShiftAmountTy(), Op2);
+    // Otherwise we'll need to temporarily settle for some other
+    // convenient type; type legalization will make adjustments as
+    // needed.
+    else if (PTy.bitsLT(Op2.getValueType()))
+      Op2 = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(),
+                        TLI.getPointerTy(), Op2);
+    else if (PTy.bitsGT(Op2.getValueType()))
+      Op2 = DAG.getNode(ISD::ANY_EXTEND, getCurDebugLoc(),
+                        TLI.getPointerTy(), Op2);
+  }
+
+  setValue(&I, DAG.getNode(Opcode, getCurDebugLoc(),
+                           Op1.getValueType(), Op1, Op2));
+}
+
+void SelectionDAGBuilder::visitICmp(const User &I) {
+  ICmpInst::Predicate predicate = ICmpInst::BAD_ICMP_PREDICATE;
+  if (const ICmpInst *IC = dyn_cast<ICmpInst>(&I))
+    predicate = IC->getPredicate();
+  else if (const ConstantExpr *IC = dyn_cast<ConstantExpr>(&I))
+    predicate = ICmpInst::Predicate(IC->getPredicate());
+  SDValue Op1 = getValue(I.getOperand(0));
+  SDValue Op2 = getValue(I.getOperand(1));
+  ISD::CondCode Opcode = getICmpCondCode(predicate);
+
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getSetCC(getCurDebugLoc(), DestVT, Op1, Op2, Opcode));
+}
+
+void SelectionDAGBuilder::visitFCmp(const User &I) {
+  FCmpInst::Predicate predicate = FCmpInst::BAD_FCMP_PREDICATE;
+  if (const FCmpInst *FC = dyn_cast<FCmpInst>(&I))
+    predicate = FC->getPredicate();
+  else if (const ConstantExpr *FC = dyn_cast<ConstantExpr>(&I))
+    predicate = FCmpInst::Predicate(FC->getPredicate());
+  SDValue Op1 = getValue(I.getOperand(0));
+  SDValue Op2 = getValue(I.getOperand(1));
+  ISD::CondCode Condition = getFCmpCondCode(predicate);
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getSetCC(getCurDebugLoc(), DestVT, Op1, Op2, Condition));
+}
+
+void SelectionDAGBuilder::visitSelect(const User &I) {
+  SmallVector<EVT, 4> ValueVTs;
+  ComputeValueVTs(TLI, I.getType(), ValueVTs);
+  unsigned NumValues = ValueVTs.size();
+  if (NumValues == 0) return;
+
+  SmallVector<SDValue, 4> Values(NumValues);
+  SDValue Cond     = getValue(I.getOperand(0));
+  SDValue TrueVal  = getValue(I.getOperand(1));
+  SDValue FalseVal = getValue(I.getOperand(2));
+
+  for (unsigned i = 0; i != NumValues; ++i)
+    Values[i] = DAG.getNode(ISD::SELECT, getCurDebugLoc(),
+                          TrueVal.getNode()->getValueType(TrueVal.getResNo()+i),
+                            Cond,
+                            SDValue(TrueVal.getNode(),
+                                    TrueVal.getResNo() + i),
+                            SDValue(FalseVal.getNode(),
+                                    FalseVal.getResNo() + i));
+
+  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+                           DAG.getVTList(&ValueVTs[0], NumValues),
+                           &Values[0], NumValues));
+}
+
+void SelectionDAGBuilder::visitTrunc(const User &I) {
+  // TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest).
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitZExt(const User &I) {
+  // ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
+  // ZExt also can't be a cast to bool for the same reason, so there's
+  // nothing much to do here.
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitSExt(const User &I) {
+  // SExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
+  // SExt also can't be a cast to bool for the same reason, so there's
+  // nothing much to do here.
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitFPTrunc(const User &I) {
+  // FPTrunc is never a no-op cast, no need to check
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::FP_ROUND, getCurDebugLoc(),
+                           DestVT, N, DAG.getIntPtrConstant(0)));
+}
+
+void SelectionDAGBuilder::visitFPExt(const User &I){
+  // FPExt is never a no-op cast, no need to check
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitFPToUI(const User &I) {
+  // FPToUI is never a no-op cast, no need to check
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitFPToSI(const User &I) {
+  // FPToSI is never a no-op cast, no need to check
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitUIToFP(const User &I) {
+  // UIToFP is never a no-op cast, no need to check
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitSIToFP(const User &I){
+  // SIToFP is never a no-op cast, no need to check
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitPtrToInt(const User &I) {
+  // What to do depends on the size of the integer and the size of the pointer.
+  // We can either truncate, zero extend, or no-op, accordingly.
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getZExtOrTrunc(N, getCurDebugLoc(), DestVT));
+}
+
+void SelectionDAGBuilder::visitIntToPtr(const User &I) {
+  // What to do depends on the size of the integer and the size of the pointer.
+  // We can either truncate, zero extend, or no-op, accordingly.
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getZExtOrTrunc(N, getCurDebugLoc(), DestVT));
+}
+
+void SelectionDAGBuilder::visitBitCast(const User &I) {
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+
+  // BitCast assures us that source and destination are the same size so this is
+  // either a BIT_CONVERT or a no-op.
+  if (DestVT != N.getValueType())
+    setValue(&I, DAG.getNode(ISD::BIT_CONVERT, getCurDebugLoc(),
+                             DestVT, N)); // convert types.
+  else
+    setValue(&I, N);            // noop cast.
+}
+
+void SelectionDAGBuilder::visitInsertElement(const User &I) {
+  SDValue InVec = getValue(I.getOperand(0));
+  SDValue InVal = getValue(I.getOperand(1));
+  SDValue InIdx = DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(),
+                              TLI.getPointerTy(),
+                              getValue(I.getOperand(2)));
+  setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurDebugLoc(),
+                           TLI.getValueType(I.getType()),
+                           InVec, InVal, InIdx));
+}
+
+void SelectionDAGBuilder::visitExtractElement(const User &I) {
+  SDValue InVec = getValue(I.getOperand(0));
+  SDValue InIdx = DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(),
+                              TLI.getPointerTy(),
+                              getValue(I.getOperand(1)));
+  setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurDebugLoc(),
+                           TLI.getValueType(I.getType()), InVec, InIdx));
+}
+
+// Utility for visitShuffleVector - Returns true if the mask is a sequential
+// mask starting from SIndx and increasing up to the element length (undefs
+// are allowed).
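+// E.g. Mask = <4, 5, -1, 7> is a sequential mask for SIndx = 4.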
+static bool SequentialMask(SmallVectorImpl<int> &Mask, unsigned SIndx) {
+  unsigned MaskNumElts = Mask.size();
+  for (unsigned i = 0; i != MaskNumElts; ++i)
+    if ((Mask[i] >= 0) && (Mask[i] != (int)(i + SIndx)))
+      return false;
+  return true;
+}
+
+void SelectionDAGBuilder::visitShuffleVector(const User &I) {
+  SmallVector<int, 8> Mask;
+  SDValue Src1 = getValue(I.getOperand(0));
+  SDValue Src2 = getValue(I.getOperand(1));
+
+  // Convert the ConstantVector mask operand into an array of ints, with -1
+  // representing undef values.
+  SmallVector<Constant*, 8> MaskElts;
+  cast<Constant>(I.getOperand(2))->getVectorElements(MaskElts);
+  unsigned MaskNumElts = MaskElts.size();
+  for (unsigned i = 0; i != MaskNumElts; ++i) {
+    if (isa<UndefValue>(MaskElts[i]))
+      Mask.push_back(-1);
+    else
+      Mask.push_back(cast<ConstantInt>(MaskElts[i])->getSExtValue());
+  }
+
+  EVT VT = TLI.getValueType(I.getType());
+  EVT SrcVT = Src1.getValueType();
+  unsigned SrcNumElts = SrcVT.getVectorNumElements();
+
+  if (SrcNumElts == MaskNumElts) {
+    setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2,
+                                      &Mask[0]));
+    return;
+  }
+
+  // Normalize the shuffle vector since mask and vector length don't match.
+  if (SrcNumElts < MaskNumElts && MaskNumElts % SrcNumElts == 0) {
+    // The mask is longer than the source vectors, and its length is a
+    // multiple of the source vector length. We can use concat vectors to
+    // make the mask and vector lengths match.
+    if (SrcNumElts*2 == MaskNumElts && SequentialMask(Mask, 0)) {
+      // The shuffle is concatenating two vectors together.
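+      // E.g. shuffling two <2 x i32> vectors with mask <0, 1, 2, 3> yields
+      // exactly their concatenation.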
+      setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurDebugLoc(),
+                               VT, Src1, Src2));
+      return;
+    }
+
+    // Pad both vectors with undefs to make them the same length as the mask.
+    unsigned NumConcat = MaskNumElts / SrcNumElts;
+    bool Src1U = Src1.getOpcode() == ISD::UNDEF;
+    bool Src2U = Src2.getOpcode() == ISD::UNDEF;
+    SDValue UndefVal = DAG.getUNDEF(SrcVT);
+
+    SmallVector<SDValue, 8> MOps1(NumConcat, UndefVal);
+    SmallVector<SDValue, 8> MOps2(NumConcat, UndefVal);
+    MOps1[0] = Src1;
+    MOps2[0] = Src2;
+
+    Src1 = Src1U ? DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS,
+                                                  getCurDebugLoc(), VT,
+                                                  &MOps1[0], NumConcat);
+    Src2 = Src2U ? DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS,
+                                                  getCurDebugLoc(), VT,
+                                                  &MOps2[0], NumConcat);
+
+    // Readjust mask for new input vector length.
+    SmallVector<int, 8> MappedOps;
+    for (unsigned i = 0; i != MaskNumElts; ++i) {
+      int Idx = Mask[i];
+      if (Idx < (int)SrcNumElts)
+        MappedOps.push_back(Idx);
+      else
+        MappedOps.push_back(Idx + MaskNumElts - SrcNumElts);
+    }
+
+    setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2,
+                                      &MappedOps[0]));
+    return;
+  }
+
+  if (SrcNumElts > MaskNumElts) {
+    // Analyze the access pattern of the vector to see if we can extract
+    // two subvectors and do the shuffle. The analysis is done by calculating
+    // the range of elements the mask accesses on both vectors.
+    int MinRange[2] = { SrcNumElts+1, SrcNumElts+1};
+    int MaxRange[2] = {-1, -1};
+
+    for (unsigned i = 0; i != MaskNumElts; ++i) {
+      int Idx = Mask[i];
+      int Input = 0;
+      if (Idx < 0)
+        continue;
+
+      if (Idx >= (int)SrcNumElts) {
+        Input = 1;
+        Idx -= SrcNumElts;
+      }
+      if (Idx > MaxRange[Input])
+        MaxRange[Input] = Idx;
+      if (Idx < MinRange[Input])
+        MinRange[Input] = Idx;
+    }
+
+    // Check whether the access is smaller than the vector size and whether
+    // we can find a reasonable extract index.
+    int RangeUse[2] = { 2, 2 };  // 0 = Unused, 1 = Extract, 2 = Cannot
+                                 // extract.
+    int StartIdx[2];  // StartIdx to extract from
+    for (int Input=0; Input < 2; ++Input) {
+      if (MinRange[Input] == (int)(SrcNumElts+1) && MaxRange[Input] == -1) {
+        RangeUse[Input] = 0; // Unused
+        StartIdx[Input] = 0;
+      } else if (MaxRange[Input] - MinRange[Input] < (int)MaskNumElts) {
+        // Fits within range but we should see if we can find a good
+        // start index that is a multiple of the mask length.
+        if (MaxRange[Input] < (int)MaskNumElts) {
+          RangeUse[Input] = 1; // Extract from beginning of the vector
+          StartIdx[Input] = 0;
+        } else {
+          StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts;
+          if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts &&
+              StartIdx[Input] + MaskNumElts < SrcNumElts)
+            RangeUse[Input] = 1; // Extract from a multiple of the mask length.
+        }
+      }
+    }
+
+    if (RangeUse[0] == 0 && RangeUse[1] == 0) {
+      setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used.
+      return;
+    }
+    else if (RangeUse[0] < 2 && RangeUse[1] < 2) {
+      // Extract appropriate subvector and generate a vector shuffle
+      for (int Input=0; Input < 2; ++Input) {
+        SDValue &Src = Input == 0 ? Src1 : Src2;
+        if (RangeUse[Input] == 0)
+          Src = DAG.getUNDEF(VT);
+        else
+          Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, getCurDebugLoc(), VT,
+                            Src, DAG.getIntPtrConstant(StartIdx[Input]));
+      }
+
+      // Calculate new mask.
+      SmallVector<int, 8> MappedOps;
+      for (unsigned i = 0; i != MaskNumElts; ++i) {
+        int Idx = Mask[i];
+        if (Idx < 0)
+          MappedOps.push_back(Idx);
+        else if (Idx < (int)SrcNumElts)
+          MappedOps.push_back(Idx - StartIdx[0]);
+        else
+          MappedOps.push_back(Idx - SrcNumElts - StartIdx[1] + MaskNumElts);
+      }
+
+      setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2,
+                                        &MappedOps[0]));
+      return;
+    }
+  }
+
+  // We can't use either concat vectors or extract subvectors, so fall back
+  // to replacing the shuffle with extract and build vector.
+  EVT EltVT = VT.getVectorElementType();
+  EVT PtrVT = TLI.getPointerTy();
+  SmallVector<SDValue,8> Ops;
+  for (unsigned i = 0; i != MaskNumElts; ++i) {
+    if (Mask[i] < 0) {
+      Ops.push_back(DAG.getUNDEF(EltVT));
+    } else {
+      int Idx = Mask[i];
+      SDValue Res;
+
+      if (Idx < (int)SrcNumElts)
+        Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurDebugLoc(),
+                          EltVT, Src1, DAG.getConstant(Idx, PtrVT));
+      else
+        Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurDebugLoc(),
+                          EltVT, Src2,
+                          DAG.getConstant(Idx - SrcNumElts, PtrVT));
+
+      Ops.push_back(Res);
+    }
+  }
+
+  setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, getCurDebugLoc(),
+                           VT, &Ops[0], Ops.size()));
+}
+
+void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
+  const Value *Op0 = I.getOperand(0);
+  const Value *Op1 = I.getOperand(1);
+  const Type *AggTy = I.getType();
+  const Type *ValTy = Op1->getType();
+  bool IntoUndef = isa<UndefValue>(Op0);
+  bool FromUndef = isa<UndefValue>(Op1);
+
+  unsigned LinearIndex = ComputeLinearIndex(TLI, AggTy,
+                                            I.idx_begin(), I.idx_end());
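+  // E.g. for an aggregate of type {i32, {float, double}}, inserting at
+  // indices {1, 1} flattens to LinearIndex 2, since the scalar values are
+  // laid out as [i32, float, double].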
+
+  SmallVector<EVT, 4> AggValueVTs;
+  ComputeValueVTs(TLI, AggTy, AggValueVTs);
+  SmallVector<EVT, 4> ValValueVTs;
+  ComputeValueVTs(TLI, ValTy, ValValueVTs);
+
+  unsigned NumAggValues = AggValueVTs.size();
+  unsigned NumValValues = ValValueVTs.size();
+  SmallVector<SDValue, 4> Values(NumAggValues);
+
+  SDValue Agg = getValue(Op0);
+  SDValue Val = getValue(Op1);
+  unsigned i = 0;
+  // Copy the beginning value(s) from the original aggregate.
+  for (; i != LinearIndex; ++i)
+    Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+                SDValue(Agg.getNode(), Agg.getResNo() + i);
+  // Copy values from the inserted value(s).
+  for (; i != LinearIndex + NumValValues; ++i)
+    Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+                SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
+  // Copy remaining value(s) from the original aggregate.
+  for (; i != NumAggValues; ++i)
+    Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+                SDValue(Agg.getNode(), Agg.getResNo() + i);
+
+  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+                           DAG.getVTList(&AggValueVTs[0], NumAggValues),
+                           &Values[0], NumAggValues));
+}
+
+void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
+  const Value *Op0 = I.getOperand(0);
+  const Type *AggTy = Op0->getType();
+  const Type *ValTy = I.getType();
+  bool OutOfUndef = isa<UndefValue>(Op0);
+
+  unsigned LinearIndex = ComputeLinearIndex(TLI, AggTy,
+                                            I.idx_begin(), I.idx_end());
+
+  SmallVector<EVT, 4> ValValueVTs;
+  ComputeValueVTs(TLI, ValTy, ValValueVTs);
+
+  unsigned NumValValues = ValValueVTs.size();
+  SmallVector<SDValue, 4> Values(NumValValues);
+
+  SDValue Agg = getValue(Op0);
+  // Copy out the selected value(s).
+  for (unsigned i = LinearIndex; i != LinearIndex + NumValValues; ++i)
+    Values[i - LinearIndex] =
+      OutOfUndef ?
+        DAG.getUNDEF(Agg.getNode()->getValueType(Agg.getResNo() + i)) :
+        SDValue(Agg.getNode(), Agg.getResNo() + i);
+
+  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+                           DAG.getVTList(&ValValueVTs[0], NumValValues),
+                           &Values[0], NumValValues));
+}
+
+void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
+  SDValue N = getValue(I.getOperand(0));
+  const Type *Ty = I.getOperand(0)->getType();
+
+  for (GetElementPtrInst::const_op_iterator OI = I.op_begin()+1, E = I.op_end();
+       OI != E; ++OI) {
+    const Value *Idx = *OI;
+    if (const StructType *StTy = dyn_cast<StructType>(Ty)) {
+      unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
+      if (Field) {
+        // N = N + Offset
+        uint64_t Offset = TD->getStructLayout(StTy)->getElementOffset(Field);
+        N = DAG.getNode(ISD::ADD, getCurDebugLoc(), N.getValueType(), N,
+                        DAG.getIntPtrConstant(Offset));
+      }
+
+      Ty = StTy->getElementType(Field);
+    } else if (const UnionType *UnTy = dyn_cast<UnionType>(Ty)) {
+      unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
+      
+      // The offset is canonically 0 for unions, but the type changes.
+      Ty = UnTy->getElementType(Field);
+    } else {
+      Ty = cast<SequentialType>(Ty)->getElementType();
+
+      // If this is a constant subscript, handle it quickly.
+      if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) {
+        if (CI->isZero()) continue;
+        uint64_t Offs =
+            TD->getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue();
+        SDValue OffsVal;
+        EVT PTy = TLI.getPointerTy();
+        unsigned PtrBits = PTy.getSizeInBits();
+        if (PtrBits < 64)
+          OffsVal = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(),
+                                TLI.getPointerTy(),
+                                DAG.getConstant(Offs, MVT::i64));
+        else
+          OffsVal = DAG.getIntPtrConstant(Offs);
+
+        N = DAG.getNode(ISD::ADD, getCurDebugLoc(), N.getValueType(), N,
+                        OffsVal);
+        continue;
+      }
+
+      // N = N + Idx * ElementSize;
+      APInt ElementSize = APInt(TLI.getPointerTy().getSizeInBits(),
+                                TD->getTypeAllocSize(Ty));
+      SDValue IdxN = getValue(Idx);
+
+      // If the index is smaller or larger than intptr_t, truncate or extend
+      // it.
+      IdxN = DAG.getSExtOrTrunc(IdxN, getCurDebugLoc(), N.getValueType());
+
+      // If this is a multiply by a power of two, turn it into a shl
+      // immediately.  This is a very common case.
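+      // E.g. for an array of i32 the element size is 4, so the index is
+      // shifted left by 2 instead of multiplied.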
+      if (ElementSize != 1) {
+        if (ElementSize.isPowerOf2()) {
+          unsigned Amt = ElementSize.logBase2();
+          IdxN = DAG.getNode(ISD::SHL, getCurDebugLoc(),
+                             N.getValueType(), IdxN,
+                             DAG.getConstant(Amt, TLI.getPointerTy()));
+        } else {
+          SDValue Scale = DAG.getConstant(ElementSize, TLI.getPointerTy());
+          IdxN = DAG.getNode(ISD::MUL, getCurDebugLoc(),
+                             N.getValueType(), IdxN, Scale);
+        }
+      }
+
+      N = DAG.getNode(ISD::ADD, getCurDebugLoc(),
+                      N.getValueType(), N, IdxN);
+    }
+  }
+
+  setValue(&I, N);
+}
+
+void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
+  // If this is a fixed sized alloca in the entry block of the function,
+  // allocate it statically on the stack.
+  if (FuncInfo.StaticAllocaMap.count(&I))
+    return;   // getValue will auto-populate this.
+
+  const Type *Ty = I.getAllocatedType();
+  uint64_t TySize = TLI.getTargetData()->getTypeAllocSize(Ty);
+  unsigned Align =
+    std::max((unsigned)TLI.getTargetData()->getPrefTypeAlignment(Ty),
+             I.getAlignment());
+
+  SDValue AllocSize = getValue(I.getArraySize());
+
+  EVT IntPtr = TLI.getPointerTy();
+  if (AllocSize.getValueType() != IntPtr)
+    AllocSize = DAG.getZExtOrTrunc(AllocSize, getCurDebugLoc(), IntPtr);
+
+  AllocSize = DAG.getNode(ISD::MUL, getCurDebugLoc(), IntPtr,
+                          AllocSize,
+                          DAG.getConstant(TySize, IntPtr));
+
+  // Handle alignment.  If the requested alignment is less than or equal to
+  // the stack alignment, ignore it.  If it is greater than the stack
+  // alignment, we note this in the DYNAMIC_STACKALLOC node.
+  unsigned StackAlign = TM.getFrameInfo()->getStackAlignment();
+  if (Align <= StackAlign)
+    Align = 0;
+
+  // Round the size of the allocation up to the stack alignment size
+  // by adding SA-1 to the size.
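+  // E.g. with StackAlign = 16 and a raw size of 20: (20 + 15) & ~15 = 32,
+  // the next multiple of the stack alignment.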
+  AllocSize = DAG.getNode(ISD::ADD, getCurDebugLoc(),
+                          AllocSize.getValueType(), AllocSize,
+                          DAG.getIntPtrConstant(StackAlign-1));
+
+  // Mask out the low bits for alignment purposes.
+  AllocSize = DAG.getNode(ISD::AND, getCurDebugLoc(),
+                          AllocSize.getValueType(), AllocSize,
+                          DAG.getIntPtrConstant(~(uint64_t)(StackAlign-1)));
+
+  SDValue Ops[] = { getRoot(), AllocSize, DAG.getIntPtrConstant(Align) };
+  SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other);
+  SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, getCurDebugLoc(),
+                            VTs, Ops, 3);
+  setValue(&I, DSA);
+  DAG.setRoot(DSA.getValue(1));
+
+  // Inform the Frame Information that we have just allocated a variable-sized
+  // object.
+  FuncInfo.MF->getFrameInfo()->CreateVariableSizedObject(Align ? Align : 1);
+}
+
+void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
+  const Value *SV = I.getOperand(0);
+  SDValue Ptr = getValue(SV);
+
+  const Type *Ty = I.getType();
+
+  bool isVolatile = I.isVolatile();
+  bool isNonTemporal = I.getMetadata("nontemporal") != 0;
+  unsigned Alignment = I.getAlignment();
+
+  SmallVector<EVT, 4> ValueVTs;
+  SmallVector<uint64_t, 4> Offsets;
+  ComputeValueVTs(TLI, Ty, ValueVTs, &Offsets);
+  unsigned NumValues = ValueVTs.size();
+  if (NumValues == 0)
+    return;
+
+  SDValue Root;
+  bool ConstantMemory = false;
+  if (I.isVolatile())
+    // Serialize volatile loads with other side effects.
+    Root = getRoot();
+  else if (AA->pointsToConstantMemory(SV)) {
+    // Do not serialize (non-volatile) loads of constant memory with anything.
+    Root = DAG.getEntryNode();
+    ConstantMemory = true;
+  } else {
+    // Do not serialize non-volatile loads against each other.
+    Root = DAG.getRoot();
+  }
+
+  SmallVector<SDValue, 4> Values(NumValues);
+  SmallVector<SDValue, 4> Chains(NumValues);
+  EVT PtrVT = Ptr.getValueType();
+  for (unsigned i = 0; i != NumValues; ++i) {
+    SDValue A = DAG.getNode(ISD::ADD, getCurDebugLoc(),
+                            PtrVT, Ptr,
+                            DAG.getConstant(Offsets[i], PtrVT));
+    SDValue L = DAG.getLoad(ValueVTs[i], getCurDebugLoc(), Root,
+                            A, SV, Offsets[i], isVolatile, 
+                            isNonTemporal, Alignment);
+
+    Values[i] = L;
+    Chains[i] = L.getValue(1);
+  }
+
+  if (!ConstantMemory) {
+    SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+                                MVT::Other, &Chains[0], NumValues);
+    if (isVolatile)
+      DAG.setRoot(Chain);
+    else
+      PendingLoads.push_back(Chain);
+  }
+
+  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+                           DAG.getVTList(&ValueVTs[0], NumValues),
+                           &Values[0], NumValues));
+}
+
+void SelectionDAGBuilder::visitStore(const StoreInst &I) {
+  const Value *SrcV = I.getOperand(0);
+  const Value *PtrV = I.getOperand(1);
+
+  SmallVector<EVT, 4> ValueVTs;
+  SmallVector<uint64_t, 4> Offsets;
+  ComputeValueVTs(TLI, SrcV->getType(), ValueVTs, &Offsets);
+  unsigned NumValues = ValueVTs.size();
+  if (NumValues == 0)
+    return;
+
+  // Get the lowered operands. Note that we do this after
+  // checking if NumValues is zero, because with zero values
+  // the operands won't have values in the map.
+  SDValue Src = getValue(SrcV);
+  SDValue Ptr = getValue(PtrV);
+
+  SDValue Root = getRoot();
+  SmallVector<SDValue, 4> Chains(NumValues);
+  EVT PtrVT = Ptr.getValueType();
+  bool isVolatile = I.isVolatile();
+  bool isNonTemporal = I.getMetadata("nontemporal") != 0;
+  unsigned Alignment = I.getAlignment();
+
+  for (unsigned i = 0; i != NumValues; ++i) {
+    SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), PtrVT, Ptr,
+                              DAG.getConstant(Offsets[i], PtrVT));
+    Chains[i] = DAG.getStore(Root, getCurDebugLoc(),
+                             SDValue(Src.getNode(), Src.getResNo() + i),
+                             Add, PtrV, Offsets[i], isVolatile, 
+                             isNonTemporal, Alignment);
+  }
+
+  DAG.setRoot(DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+                          MVT::Other, &Chains[0], NumValues));
+}
+
+/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
+/// node.
+void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
+                                               unsigned Intrinsic) {
+  bool HasChain = !I.doesNotAccessMemory();
+  bool OnlyLoad = HasChain && I.onlyReadsMemory();
+
+  // Build the operand list.
+  SmallVector<SDValue, 8> Ops;
+  if (HasChain) {  // If this intrinsic has side-effects, chainify it.
+    if (OnlyLoad) {
+      // We don't need to serialize loads against other loads.
+      Ops.push_back(DAG.getRoot());
+    } else {
+      Ops.push_back(getRoot());
+    }
+  }
+
+  // Info is set by getTgtMemIntrinsic.
+  TargetLowering::IntrinsicInfo Info;
+  bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, Intrinsic);
+
+  // Add the intrinsic ID as an integer operand if it's not a target intrinsic.
+  if (!IsTgtIntrinsic)
+    Ops.push_back(DAG.getConstant(Intrinsic, TLI.getPointerTy()));
+
+  // Add all operands of the call to the operand list.
+  for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
+    SDValue Op = getValue(I.getArgOperand(i));
+    assert(TLI.isTypeLegal(Op.getValueType()) &&
+           "Intrinsic uses a non-legal type?");
+    Ops.push_back(Op);
+  }
+
+  SmallVector<EVT, 4> ValueVTs;
+  ComputeValueVTs(TLI, I.getType(), ValueVTs);
+#ifndef NDEBUG
+  for (unsigned Val = 0, E = ValueVTs.size(); Val != E; ++Val) {
+    assert(TLI.isTypeLegal(ValueVTs[Val]) &&
+           "Intrinsic uses a non-legal type?");
+  }
+#endif // NDEBUG
+
+  if (HasChain)
+    ValueVTs.push_back(MVT::Other);
+
+  SDVTList VTs = DAG.getVTList(ValueVTs.data(), ValueVTs.size());
+
+  // Create the node.
+  SDValue Result;
+  if (IsTgtIntrinsic) {
+    // This is a target intrinsic that touches memory.
+    Result = DAG.getMemIntrinsicNode(Info.opc, getCurDebugLoc(),
+                                     VTs, &Ops[0], Ops.size(),
+                                     Info.memVT, Info.ptrVal, Info.offset,
+                                     Info.align, Info.vol,
+                                     Info.readMem, Info.writeMem);
+  } else if (!HasChain) {
+    Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurDebugLoc(),
+                         VTs, &Ops[0], Ops.size());
+  } else if (!I.getType()->isVoidTy()) {
+    Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurDebugLoc(),
+                         VTs, &Ops[0], Ops.size());
+  } else {
+    Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurDebugLoc(),
+                         VTs, &Ops[0], Ops.size());
+  }
+
+  if (HasChain) {
+    SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1);
+    if (OnlyLoad)
+      PendingLoads.push_back(Chain);
+    else
+      DAG.setRoot(Chain);
+  }
+
+  if (!I.getType()->isVoidTy()) {
+    if (const VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
+      EVT VT = TLI.getValueType(PTy);
+      Result = DAG.getNode(ISD::BIT_CONVERT, getCurDebugLoc(), VT, Result);
+    }
+
+    setValue(&I, Result);
+  }
+}
+
+/// GetSignificand - Get the significand and build it into a floating-point
+/// number with exponent of 1:
+///
+///   Op = (Op & 0x007fffff) | 0x3f800000;
+///
+/// where Op is the i32 bit representation of the floating-point value.
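+///
+/// E.g. for Op = 0x40490fdb (3.14159274f), this yields 0x3fc90fdb,
+/// i.e. 1.57079637f: the significand of pi scaled to exponent 1.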
+static SDValue
+GetSignificand(SelectionDAG &DAG, SDValue Op, DebugLoc dl) {
+  SDValue t1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
+                           DAG.getConstant(0x007fffff, MVT::i32));
+  SDValue t2 = DAG.getNode(ISD::OR, dl, MVT::i32, t1,
+                           DAG.getConstant(0x3f800000, MVT::i32));
+  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, t2);
+}
+
+/// GetExponent - Get the exponent:
+///
+///   (float)(int)(((Op & 0x7f800000) >> 23) - 127);
+///
+/// where Op is the i32 bit representation of the floating-point value.
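+///
+/// E.g. for Op = 0x40490fdb (3.14159274f): ((0x40490fdb & 0x7f800000)
+/// >> 23) - 127 = 128 - 127 = 1, returned as 1.0f.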
+static SDValue
+GetExponent(SelectionDAG &DAG, SDValue Op, const TargetLowering &TLI,
+            DebugLoc dl) {
+  SDValue t0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
+                           DAG.getConstant(0x7f800000, MVT::i32));
+  SDValue t1 = DAG.getNode(ISD::SRL, dl, MVT::i32, t0,
+                           DAG.getConstant(23, TLI.getPointerTy()));
+  SDValue t2 = DAG.getNode(ISD::SUB, dl, MVT::i32, t1,
+                           DAG.getConstant(127, MVT::i32));
+  return DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, t2);
+}
+
+/// getF32Constant - Get a 32-bit floating-point constant.
+static SDValue
+getF32Constant(SelectionDAG &DAG, unsigned Flt) {
+  return DAG.getConstantFP(APFloat(APInt(32, Flt)), MVT::f32);
+}
+
+/// Inlined utility function to implement binary input atomic intrinsics for
+/// visitIntrinsicCall: I is the call instruction, and Op is the associated
+/// NodeType for I.
+const char *
+SelectionDAGBuilder::implVisitBinaryAtomic(const CallInst& I,
+                                           ISD::NodeType Op) {
+  SDValue Root = getRoot();
+  SDValue L =
+    DAG.getAtomic(Op, getCurDebugLoc(),
+                  getValue(I.getArgOperand(1)).getValueType().getSimpleVT(),
+                  Root,
+                  getValue(I.getArgOperand(0)),
+                  getValue(I.getArgOperand(1)),
+                  I.getArgOperand(0));
+  setValue(&I, L);
+  DAG.setRoot(L.getValue(1));
+  return 0;
+}
+
+// implVisitAluOverflow - Lower arithmetic overflow intrinsics.
+const char *
+SelectionDAGBuilder::implVisitAluOverflow(const CallInst &I, ISD::NodeType Op) {
+  SDValue Op1 = getValue(I.getArgOperand(0));
+  SDValue Op2 = getValue(I.getArgOperand(1));
+
+  SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1);
+  setValue(&I, DAG.getNode(Op, getCurDebugLoc(), VTs, Op1, Op2));
+  return 0;
+}
+
+/// visitExp - Lower an exp intrinsic. Handles the special sequences for
+/// limited-precision mode.
+void
+SelectionDAGBuilder::visitExp(const CallInst &I) {
+  SDValue result;
+  DebugLoc dl = getCurDebugLoc();
+
+  if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 &&
+      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+    SDValue Op = getValue(I.getArgOperand(0));
+
+    // Put the exponent in the right bit position for later addition to the
+    // final result:
+    //
+    //   #define LOG2OFe 1.4426950f
+    //   IntegerPartOfX = ((int32_t)(X * LOG2OFe));
+    SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op,
+                             getF32Constant(DAG, 0x3fb8aa3b));
+    SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0);
+
+    //   FractionalPartOfX = (X * LOG2OFe) - (float)IntegerPartOfX;
+    SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX);
+    SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1);
+
+    //   IntegerPartOfX <<= 23;
+    IntegerPartOfX = DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX,
+                                 DAG.getConstant(23, TLI.getPointerTy()));
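+
+    // E.g. for an input of 1.0f: 1.0 * LOG2OFe ~= 1.4427, so
+    // IntegerPartOfX = 1 and the fractional part is ~0.4427;
+    // 2^0.4427 ~= 1.3594, and adding 1 << 23 to its bit pattern scales it
+    // by 2^1, giving ~2.7188 ~= e.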
+
+    if (LimitFloatPrecision <= 6) {
+      // For floating-point precision of 6:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.997535578f +
+      //       (0.735607626f + 0.252464424f * x) * x;
+      //
+      // error 0.0144103317, which is 6 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3e814304));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3f3c50c8));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3f7f5e7e));
+      SDValue TwoToFracPartOfX = DAG.getNode(ISD::BIT_CONVERT, dl,MVT::i32, t5);
+
+      // Add the exponent into the result in integer domain.
+      SDValue t6 = DAG.getNode(ISD::ADD, dl, MVT::i32,
+                               TwoToFracPartOfX, IntegerPartOfX);
+
+      result = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, t6);
+    } else if (LimitFloatPrecision > 6 && LimitFloatPrecision <= 12) {
+      // For floating-point precision of 12:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.999892986f +
+      //       (0.696457318f +
+      //         (0.224338339f + 0.792043434e-1f * x) * x) * x;
+      //
+      // 0.000107046256 error, which is 13 to 14 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3da235e3));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3e65b8f3));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3f324b07));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x3f7ff8fd));
+      SDValue TwoToFracPartOfX = DAG.getNode(ISD::BIT_CONVERT, dl,MVT::i32, t7);
+
+      // Add the exponent into the result in integer domain.
+      SDValue t8 = DAG.getNode(ISD::ADD, dl, MVT::i32,
+                               TwoToFracPartOfX, IntegerPartOfX);
+
+      result = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, t8);
+    } else { // LimitFloatPrecision > 12 && LimitFloatPrecision <= 18
+      // For floating-point precision of 18:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.999999982f +
+      //       (0.693148872f +
+      //         (0.240227044f +
+      //           (0.554906021e-1f +
+      //             (0.961591928e-2f +
+      //               (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x;
+      //
+      // error 2.47208000*10^(-7), which is better than 18 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3924b03e));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3ab24b87));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3c1d8c17));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x3d634a1d));
+      SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+      SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+                               getF32Constant(DAG, 0x3e75fe14));
+      SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+      SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10,
+                                getF32Constant(DAG, 0x3f317234));
+      SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X);
+      SDValue t13 = DAG.getNode(ISD::FADD, dl, MVT::f32, t12,
+                                getF32Constant(DAG, 0x3f800000));
+      SDValue TwoToFracPartOfX = DAG.getNode(ISD::BIT_CONVERT, dl,
+                                             MVT::i32, t13);
+
+      // Add the exponent into the result in integer domain.
+      SDValue t14 = DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                TwoToFracPartOfX, IntegerPartOfX);
+
+      result = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, t14);
+    }
+  } else {
+    // No special expansion.
+    result = DAG.getNode(ISD::FEXP, dl,
+                         getValue(I.getArgOperand(0)).getValueType(),
+                         getValue(I.getArgOperand(0)));
+  }
+
+  setValue(&I, result);
+}
+
+/// visitLog - Lower a log intrinsic. Handles the special sequences for
+/// limited-precision mode.
+void
+SelectionDAGBuilder::visitLog(const CallInst &I) {
+  SDValue result;
+  DebugLoc dl = getCurDebugLoc();
+
+  if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 &&
+      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+    SDValue Op = getValue(I.getArgOperand(0));
+    SDValue Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op);
+
+    // Scale the exponent by log(2) [0.69314718f].
+    SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
+    SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
+                                        getF32Constant(DAG, 0x3f317218));
+
+    // Get the significand and build it into a floating-point number with
+    // exponent of 1.
+    SDValue X = GetSignificand(DAG, Op1, dl);
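+    // E.g. for an input of 8.0f the exponent is 3, so LogOfExponent is
+    // ~2.0794; the significand is 1.0, whose polynomial log is ~0, giving
+    // log(8.0f) ~= 2.079.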
+
+    if (LimitFloatPrecision <= 6) {
+      // For floating-point precision of 6:
+      //
+      //   LogofMantissa =
+      //     -1.1609546f +
+      //       (1.4034025f - 0.23903021f * x) * x;
+      //
+      // error 0.0034276066, which is better than 8 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbe74c456));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3fb3a2b1));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                                          getF32Constant(DAG, 0x3f949a29));
+
+      result = DAG.getNode(ISD::FADD, dl,
+                           MVT::f32, LogOfExponent, LogOfMantissa);
+    } else if (LimitFloatPrecision > 6 && LimitFloatPrecision <= 12) {
+      // For floating-point precision of 12:
+      //
+      //   LogOfMantissa =
+      //     -1.7417939f +
+      //       (2.8212026f +
+      //         (-1.4699568f +
+      //           (0.44717955f - 0.56570851e-1f * x) * x) * x) * x;
+      //
+      // error 0.000061011436, which is 14 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbd67b6d6));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3ee4f4b8));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3fbc278b));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x40348e95));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+                                          getF32Constant(DAG, 0x3fdef31a));
+
+      result = DAG.getNode(ISD::FADD, dl,
+                           MVT::f32, LogOfExponent, LogOfMantissa);
+    } else { // LimitFloatPrecision > 12 && LimitFloatPrecision <= 18
+      // For floating-point precision of 18:
+      //
+      //   LogOfMantissa =
+      //     -2.1072184f +
+      //       (4.2372794f +
+      //         (-3.7029485f +
+      //           (2.2781945f +
+      //             (-0.87823314f +
+      //               (0.19073739f - 0.17809712e-1f * x) * x) * x) * x) * x)*x;
+      //
+      // error 0.0000023660568, which is better than 18 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbc91e5ac));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3e4350aa));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3f60d3e3));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x4011cdf0));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x406cfd1c));
+      SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+      SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+                               getF32Constant(DAG, 0x408797cb));
+      SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+      SDValue LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
+                                          getF32Constant(DAG, 0x4006dcab));
+
+      result = DAG.getNode(ISD::FADD, dl,
+                           MVT::f32, LogOfExponent, LogOfMantissa);
+    }
+  } else {
+    // No special expansion.
+    result = DAG.getNode(ISD::FLOG, dl,
+                         getValue(I.getArgOperand(0)).getValueType(),
+                         getValue(I.getArgOperand(0)));
+  }
+
+  setValue(&I, result);
+}
+
+/// visitLog2 - Lower a log2 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+void
+SelectionDAGBuilder::visitLog2(const CallInst &I) {
+  SDValue result;
+  DebugLoc dl = getCurDebugLoc();
+
+  if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 &&
+      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+    SDValue Op = getValue(I.getArgOperand(0));
+    SDValue Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op);
+
+    // Get the exponent.
+    SDValue LogOfExponent = GetExponent(DAG, Op1, TLI, dl);
+
+    // Get the significand and build it into a floating-point number with
+    // exponent of 1.
+    SDValue X = GetSignificand(DAG, Op1, dl);
+
+    // Different possible minimax approximations of the significand in
+    // floating-point for various degrees of accuracy over [1,2].
+    if (LimitFloatPrecision <= 6) {
+      // For floating-point precision of 6:
+      //
+      //   Log2ofMantissa = -1.6749035f + (2.0246817f - .34484768f * x) * x;
+      //
+      // error 0.0049451742, which is more than 7 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbeb08fe0));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x40019463));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                                           getF32Constant(DAG, 0x3fd6633d));
+
+      result = DAG.getNode(ISD::FADD, dl,
+                           MVT::f32, LogOfExponent, Log2ofMantissa);
+    } else if (LimitFloatPrecision > 6 && LimitFloatPrecision <= 12) {
+      // For floating-point precision of 12:
+      //
+      //   Log2ofMantissa =
+      //     -2.51285454f +
+      //       (4.07009056f +
+      //         (-2.12067489f +
+      //           (.645142248f - 0.816157886e-1f * x) * x) * x) * x;
+      //
+      // error 0.0000876136000, which is better than 13 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbda7262e));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3f25280b));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x4007b923));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x40823e2f));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+                                           getF32Constant(DAG, 0x4020d29c));
+
+      result = DAG.getNode(ISD::FADD, dl,
+                           MVT::f32, LogOfExponent, Log2ofMantissa);
+    } else { // LimitFloatPrecision > 12 && LimitFloatPrecision <= 18
+      // For floating-point precision of 18:
+      //
+      //   Log2ofMantissa =
+      //     -3.0400495f +
+      //       (6.1129976f +
+      //         (-5.3420409f +
+      //           (3.2865683f +
+      //             (-1.2669343f +
+      //               (0.27515199f -
+      //                 0.25691327e-1f * x) * x) * x) * x) * x) * x;
+      //
+      // error 0.0000018516, which is better than 18 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbcd2769e));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3e8ce0b9));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3fa22ae7));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x40525723));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x40aaf200));
+      SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+      SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+                               getF32Constant(DAG, 0x40c39dad));
+      SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+      SDValue Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
+                                           getF32Constant(DAG, 0x4042902c));
+
+      result = DAG.getNode(ISD::FADD, dl,
+                           MVT::f32, LogOfExponent, Log2ofMantissa);
+    }
+  } else {
+    // No special expansion.
+    result = DAG.getNode(ISD::FLOG2, dl,
+                         getValue(I.getArgOperand(0)).getValueType(),
+                         getValue(I.getArgOperand(0)));
+  }
+
+  setValue(&I, result);
+}
+
+/// visitLog10 - Lower a log10 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+void
+SelectionDAGBuilder::visitLog10(const CallInst &I) {
+  SDValue result;
+  DebugLoc dl = getCurDebugLoc();
+
+  if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 &&
+      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+    SDValue Op = getValue(I.getArgOperand(0));
+    SDValue Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op);
+
+    // Scale the exponent by log10(2) [0.30102999f].
+    SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
+    SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
+                                        getF32Constant(DAG, 0x3e9a209a));
+
+    // Get the significand and build it into a floating-point number with
+    // exponent of 1.
+    SDValue X = GetSignificand(DAG, Op1, dl);
+
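+    // Each branch below uses the identity  log10(x) = e * log10(2) + log10(m)
+    // for x = 2^e * m with m in [1,2); log10(m) is approximated by the
+    // polynomial noted in that branch, evaluated in Horner form.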
+    if (LimitFloatPrecision <= 6) {
+      // For floating-point precision of 6:
+      //
+      //   Log10ofMantissa =
+      //     -0.50419619f +
+      //       (0.60948995f - 0.10380950f * x) * x;
+      //
+      // error 0.0014886165, which is 6 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbdd49a13));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3f1c0789));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                                            getF32Constant(DAG, 0x3f011300));
+
+      result = DAG.getNode(ISD::FADD, dl,
+                           MVT::f32, LogOfExponent, Log10ofMantissa);
+    } else if (LimitFloatPrecision > 6 && LimitFloatPrecision <= 12) {
+      // For floating-point precision of 12:
+      //
+      //   Log10ofMantissa =
+      //     -0.64831180f +
+      //       (0.91751397f +
+      //         (-0.31664806f + 0.47637168e-1f * x) * x) * x;
+      //
+      // error 0.00019228036, which is better than 12 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3d431f31));
+      SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3ea21fb2));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3f6ae232));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
+                                            getF32Constant(DAG, 0x3f25f7c3));
+
+      result = DAG.getNode(ISD::FADD, dl,
+                           MVT::f32, LogOfExponent, Log10ofMantissa);
+    } else { // LimitFloatPrecision > 12 && LimitFloatPrecision <= 18
+      // For floating-point precision of 18:
+      //
+      //   Log10ofMantissa =
+      //     -0.84299375f +
+      //       (1.5327582f +
+      //         (-1.0688956f +
+      //           (0.49102474f +
+      //             (-0.12539807f + 0.13508273e-1f * x) * x) * x) * x) * x;
+      //
+      // error 0.0000037995730, which is better than 18 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3c5d51ce));
+      SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3e00685a));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3efb6798));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3f88d192));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x3fc4316c));
+      SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+      SDValue Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t8,
+                                            getF32Constant(DAG, 0x3f57ce70));
+
+      result = DAG.getNode(ISD::FADD, dl,
+                           MVT::f32, LogOfExponent, Log10ofMantissa);
+    }
+  } else {
+    // No special expansion.
+    result = DAG.getNode(ISD::FLOG10, dl,
+                         getValue(I.getArgOperand(0)).getValueType(),
+                         getValue(I.getArgOperand(0)));
+  }
+
+  setValue(&I, result);
+}
+
+/// visitExp2 - Lower an exp2 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+void
+SelectionDAGBuilder::visitExp2(const CallInst &I) {
+  SDValue result;
+  DebugLoc dl = getCurDebugLoc();
+
+  if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 &&
+      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+    SDValue Op = getValue(I.getArgOperand(0));
+
+    SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Op);
+
+    //   FractionalPartOfX = x - (float)IntegerPartOfX;
+    SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX);
+    SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, Op, t1);
+
+    //   IntegerPartOfX <<= 23;
+    IntegerPartOfX = DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX,
+                                 DAG.getConstant(23, TLI.getPointerTy()));
+
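+    // With x = n + f (n = IntegerPartOfX, f the fractional remainder),
+    // 2^x = 2^n * 2^f.  The polynomials below approximate 2^f; integer-adding
+    // n << 23 to the bits of that float then increments the IEEE-754 exponent
+    // field by n, multiplying the result by 2^n (exponent overflow is not
+    // checked here).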
+    if (LimitFloatPrecision <= 6) {
+      // For floating-point precision of 6:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.997535578f +
+      //       (0.735607626f + 0.252464424f * x) * x;
+      //
+      // error 0.0144103317, which is 6 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3e814304));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3f3c50c8));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3f7f5e7e));
+      SDValue t6 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, t5);
+      SDValue TwoToFractionalPartOfX =
+        DAG.getNode(ISD::ADD, dl, MVT::i32, t6, IntegerPartOfX);
+
+      result = DAG.getNode(ISD::BIT_CONVERT, dl,
+                           MVT::f32, TwoToFractionalPartOfX);
+    } else if (LimitFloatPrecision > 6 && LimitFloatPrecision <= 12) {
+      // For floating-point precision of 12:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.999892986f +
+      //       (0.696457318f +
+      //         (0.224338339f + 0.792043434e-1f * x) * x) * x;
+      //
+      // error 0.000107046256, which is 13 to 14 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3da235e3));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3e65b8f3));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3f324b07));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x3f7ff8fd));
+      SDValue t8 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, t7);
+      SDValue TwoToFractionalPartOfX =
+        DAG.getNode(ISD::ADD, dl, MVT::i32, t8, IntegerPartOfX);
+
+      result = DAG.getNode(ISD::BIT_CONVERT, dl,
+                           MVT::f32, TwoToFractionalPartOfX);
+    } else { // LimitFloatPrecision > 12 && LimitFloatPrecision <= 18
+      // For floating-point precision of 18:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.999999982f +
+      //       (0.693148872f +
+      //         (0.240227044f +
+      //           (0.554906021e-1f +
+      //             (0.961591928e-2f +
+      //               (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x;
+      // error 2.47208000*10^(-7), which is better than 18 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3924b03e));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3ab24b87));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3c1d8c17));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x3d634a1d));
+      SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+      SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+                               getF32Constant(DAG, 0x3e75fe14));
+      SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+      SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10,
+                                getF32Constant(DAG, 0x3f317234));
+      SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X);
+      SDValue t13 = DAG.getNode(ISD::FADD, dl, MVT::f32, t12,
+                                getF32Constant(DAG, 0x3f800000));
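+      // (0x3f800000 is the bit pattern of 1.0f, the rounded leading
+      // coefficient above.)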
+      SDValue t14 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, t13);
+      SDValue TwoToFractionalPartOfX =
+        DAG.getNode(ISD::ADD, dl, MVT::i32, t14, IntegerPartOfX);
+
+      result = DAG.getNode(ISD::BIT_CONVERT, dl,
+                           MVT::f32, TwoToFractionalPartOfX);
+    }
+  } else {
+    // No special expansion.
+    result = DAG.getNode(ISD::FEXP2, dl,
+                         getValue(I.getArgOperand(0)).getValueType(),
+                         getValue(I.getArgOperand(0)));
+  }
+
+  setValue(&I, result);
+}
+
+/// visitPow - Lower a pow intrinsic. Handles the special sequences for
+/// limited-precision mode when the base is 10.0f.
+void
+SelectionDAGBuilder::visitPow(const CallInst &I) {
+  SDValue result;
+  const Value *Val = I.getArgOperand(0);
+  DebugLoc dl = getCurDebugLoc();
+  bool IsExp10 = false;
+
+  if (getValue(Val).getValueType() == MVT::f32 &&
+      getValue(I.getArgOperand(1)).getValueType() == MVT::f32 &&
+      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+    if (Constant *C = const_cast<Constant*>(dyn_cast<Constant>(Val))) {
+      if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
+        APFloat Ten(10.0f);
+        IsExp10 = CFP->getValueAPF().bitwiseIsEqual(Ten);
+      }
+    }
+  }
+
+  if (IsExp10 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+    SDValue Op = getValue(I.getArgOperand(1));
+
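+    // pow(10.0f, x) = exp2(x * log2(10)), so this path mirrors visitExp2 with
+    // the argument pre-scaled by log2(10).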
+    // Put the exponent in the right bit position for later addition to the
+    // final result:
+    //
+    //   #define LOG2OF10 3.3219281f
+    //   IntegerPartOfX = (int32_t)(x * LOG2OF10);
+    SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op,
+                             getF32Constant(DAG, 0x40549a78));
+    SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0);
+
+    //   FractionalPartOfX = x - (float)IntegerPartOfX;
+    SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX);
+    SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1);
+
+    //   IntegerPartOfX <<= 23;
+    IntegerPartOfX = DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX,
+                                 DAG.getConstant(23, TLI.getPointerTy()));
+
+    if (LimitFloatPrecision <= 6) {
+      // For floating-point precision of 6:
+      //
+      //   twoToFractionalPartOfX =
+      //     0.997535578f +
+      //       (0.735607626f + 0.252464424f * x) * x;
+      //
+      // error 0.0144103317, which is 6 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3e814304));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3f3c50c8));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3f7f5e7e));
+      SDValue t6 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, t5);
+      SDValue TwoToFractionalPartOfX =
+        DAG.getNode(ISD::ADD, dl, MVT::i32, t6, IntegerPartOfX);
+
+      result = DAG.getNode(ISD::BIT_CONVERT, dl,
+                           MVT::f32, TwoToFractionalPartOfX);
+    } else if (LimitFloatPrecision > 6 && LimitFloatPrecision <= 12) {
+      // For floating-point precision of 12:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.999892986f +
+      //       (0.696457318f +
+      //         (0.224338339f + 0.792043434e-1f * x) * x) * x;
+      //
+      // error 0.000107046256, which is 13 to 14 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3da235e3));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3e65b8f3));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3f324b07));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x3f7ff8fd));
+      SDValue t8 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, t7);
+      SDValue TwoToFractionalPartOfX =
+        DAG.getNode(ISD::ADD, dl, MVT::i32, t8, IntegerPartOfX);
+
+      result = DAG.getNode(ISD::BIT_CONVERT, dl,
+                           MVT::f32, TwoToFractionalPartOfX);
+    } else { // LimitFloatPrecision > 12 && LimitFloatPrecision <= 18
+      // For floating-point precision of 18:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.999999982f +
+      //       (0.693148872f +
+      //         (0.240227044f +
+      //           (0.554906021e-1f +
+      //             (0.961591928e-2f +
+      //               (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x;
+      // error 2.47208000*10^(-7), which is better than 18 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3924b03e));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3ab24b87));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3c1d8c17));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x3d634a1d));
+      SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+      SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+                               getF32Constant(DAG, 0x3e75fe14));
+      SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+      SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10,
+                                getF32Constant(DAG, 0x3f317234));
+      SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X);
+      SDValue t13 = DAG.getNode(ISD::FADD, dl, MVT::f32, t12,
+                                getF32Constant(DAG, 0x3f800000));
+      SDValue t14 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, t13);
+      SDValue TwoToFractionalPartOfX =
+        DAG.getNode(ISD::ADD, dl, MVT::i32, t14, IntegerPartOfX);
+
+      result = DAG.getNode(ISD::BIT_CONVERT, dl,
+                           MVT::f32, TwoToFractionalPartOfX);
+    }
+  } else {
+    // No special expansion.
+    result = DAG.getNode(ISD::FPOW, dl,
+                         getValue(I.getArgOperand(0)).getValueType(),
+                         getValue(I.getArgOperand(0)),
+                         getValue(I.getArgOperand(1)));
+  }
+
+  setValue(&I, result);
+}
+
+
+/// ExpandPowI - Expand a llvm.powi intrinsic.
+static SDValue ExpandPowI(DebugLoc DL, SDValue LHS, SDValue RHS,
+                          SelectionDAG &DAG) {
+  // If RHS is a constant, we can expand this out to a multiplication tree;
+  // otherwise we end up lowering to a call to __powidf2 (for example).  When
+  // optimizing for size, we only do this if the expansion would produce a
+  // small number of multiplies; when not optimizing for size, we always do
+  // the full expansion.
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+    // Get the exponent as a positive value.
+    unsigned Val = RHSC->getSExtValue();
+    if ((int)Val < 0) Val = -Val;
+
+    // powi(x, 0) -> 1.0
+    if (Val == 0)
+      return DAG.getConstantFP(1.0, LHS.getValueType());
+
+    const Function *F = DAG.getMachineFunction().getFunction();
+    if (!F->hasFnAttr(Attribute::OptimizeForSize) ||
+        // If optimizing for size, don't insert too many multiplies.  This
+        // inserts up to 5 multiplies: popcount(Val)-1 products into the
+        // result plus Log2_32(Val) useful squarings.
+        CountPopulation_32(Val)+Log2_32(Val) < 7) {
+      // We use the simple binary decomposition method to generate the multiply
+      // sequence.  There are more optimal ways to do this (for example,
+      // powi(x,15) generates one more multiply than it should), but this has
+      // the benefit of being both really simple and much better than a libcall.
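+      //
+      // E.g. powi(x, 13): 13 = 0b1101, scanning bits from the LSB:
+      //   bit 0 (1): Res = x             CurSquare = x^2
+      //   bit 1 (0):                     CurSquare = x^4
+      //   bit 2 (1): Res = x * x^4       CurSquare = x^8
+      //   bit 3 (1): Res = x^5 * x^8 = x^13
+      // (the final squaring of CurSquare is emitted but unused).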
+      SDValue Res;  // Logically starts equal to 1.0
+      SDValue CurSquare = LHS;
+      while (Val) {
+        if (Val & 1) {
+          if (Res.getNode())
+            Res = DAG.getNode(ISD::FMUL, DL, Res.getValueType(),
+                              Res, CurSquare);
+          else
+            Res = CurSquare;  // 1.0*CurSquare.
+        }
+
+        CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
+                                CurSquare, CurSquare);
+        Val >>= 1;
+      }
+
+      // If the original was negative, invert the result, producing 1/(x*x*x).
+      if (RHSC->getSExtValue() < 0)
+        Res = DAG.getNode(ISD::FDIV, DL, LHS.getValueType(),
+                          DAG.getConstantFP(1.0, LHS.getValueType()), Res);
+      return Res;
+    }
+  }
+
+  // Otherwise, expand to a libcall.
+  return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
+}
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp)
+#define setjmp_undefined_for_visual_studio
+#undef setjmp
+#endif
+
+/// visitIntrinsicCall - Lower the call to the specified intrinsic function.  If
+/// we want to emit this as a call to a named external function, return the
+/// name; otherwise lower it and return null.
+const char *
+SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
+  DebugLoc dl = getCurDebugLoc();
+  SDValue Res;
+
+  switch (Intrinsic) {
+  default:
+    // By default, turn this into a target intrinsic node.
+    visitTargetIntrinsic(I, Intrinsic);
+    return 0;
+  case Intrinsic::returnaddress:
+    setValue(&I, DAG.getNode(ISD::RETURNADDR, dl, TLI.getPointerTy(),
+                             getValue(I.getArgOperand(0))));
+    return 0;
+  case Intrinsic::frameaddress:
+    setValue(&I, DAG.getNode(ISD::FRAMEADDR, dl, TLI.getPointerTy(),
+                             getValue(I.getArgOperand(0))));
+    return 0;
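+  // The string arithmetic below picks between the underscored and plain
+  // names: adding !usesUnderscoreSetJmp() (0 or 1) to the literal either
+  // keeps or skips the leading underscore, yielding "_setjmp" or "setjmp".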
+  case Intrinsic::setjmp:
+    return "_setjmp"+!TLI.usesUnderscoreSetJmp();
+  case Intrinsic::longjmp:
+    return "_longjmp"+!TLI.usesUnderscoreLongJmp();
+  case Intrinsic::memcpy: {
+    // Assert that the address space is < 256, since we only support
+    // user-defined address spaces.
+    assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace()
+           < 256 &&
+           cast<PointerType>(I.getArgOperand(1)->getType())->getAddressSpace()
+           < 256 &&
+           "Unknown address space");
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    SDValue Op3 = getValue(I.getArgOperand(2));
+    unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+    bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
+    DAG.setRoot(DAG.getMemcpy(getRoot(), dl, Op1, Op2, Op3, Align, isVol, false,
+                              I.getArgOperand(0), 0, I.getArgOperand(1), 0));
+    return 0;
+  }
+  case Intrinsic::memset: {
+    // Assert that the address space is < 256, since we only support
+    // user-defined address spaces.
+    assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace()
+           < 256 &&
+           "Unknown address space");
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    SDValue Op3 = getValue(I.getArgOperand(2));
+    unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+    bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
+    DAG.setRoot(DAG.getMemset(getRoot(), dl, Op1, Op2, Op3, Align, isVol,
+                              I.getArgOperand(0), 0));
+    return 0;
+  }
+  case Intrinsic::memmove: {
+    // Assert that the address space is < 256, since we only support
+    // user-defined address spaces.
+    assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace()
+           < 256 &&
+           cast<PointerType>(I.getArgOperand(1)->getType())->getAddressSpace()
+           < 256 &&
+           "Unknown address space");
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    SDValue Op3 = getValue(I.getArgOperand(2));
+    unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+    bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
+
+    // If the source and destination are known to not be aliases, we can
+    // lower memmove as memcpy.
+    uint64_t Size = -1ULL;
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op3))
+      Size = C->getZExtValue();
+    if (AA->alias(I.getArgOperand(0), Size, I.getArgOperand(1), Size) ==
+        AliasAnalysis::NoAlias) {
+      DAG.setRoot(DAG.getMemcpy(getRoot(), dl, Op1, Op2, Op3, Align, isVol, 
+                                false, I.getArgOperand(0), 0,
+                                I.getArgOperand(1), 0));
+      return 0;
+    }
+
+    DAG.setRoot(DAG.getMemmove(getRoot(), dl, Op1, Op2, Op3, Align, isVol,
+                               I.getArgOperand(0), 0, I.getArgOperand(1), 0));
+    return 0;
+  }
+  case Intrinsic::convertff:
+  case Intrinsic::convertfsi:
+  case Intrinsic::convertfui:
+  case Intrinsic::convertsif:
+  case Intrinsic::convertuif:
+  case Intrinsic::convertss:
+  case Intrinsic::convertsu:
+  case Intrinsic::convertus:
+  case Intrinsic::convertuu: {
+    ISD::CvtCode Code = ISD::CVT_INVALID;
+    switch (Intrinsic) {
+    case Intrinsic::convertff:  Code = ISD::CVT_FF; break;
+    case Intrinsic::convertfsi: Code = ISD::CVT_FS; break;
+    case Intrinsic::convertfui: Code = ISD::CVT_FU; break;
+    case Intrinsic::convertsif: Code = ISD::CVT_SF; break;
+    case Intrinsic::convertuif: Code = ISD::CVT_UF; break;
+    case Intrinsic::convertss:  Code = ISD::CVT_SS; break;
+    case Intrinsic::convertsu:  Code = ISD::CVT_SU; break;
+    case Intrinsic::convertus:  Code = ISD::CVT_US; break;
+    case Intrinsic::convertuu:  Code = ISD::CVT_UU; break;
+    }
+    EVT DestVT = TLI.getValueType(I.getType());
+    const Value *Op1 = I.getArgOperand(0);
+    Res = DAG.getConvertRndSat(DestVT, getCurDebugLoc(), getValue(Op1),
+                               DAG.getValueType(DestVT),
+                               DAG.getValueType(getValue(Op1).getValueType()),
+                               getValue(I.getArgOperand(1)),
+                               getValue(I.getArgOperand(2)),
+                               Code);
+    setValue(&I, Res);
+    return 0;
+  }
+  case Intrinsic::sqrt:
+    setValue(&I, DAG.getNode(ISD::FSQRT, dl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0))));
+    return 0;
+  case Intrinsic::powi:
+    setValue(&I, ExpandPowI(dl, getValue(I.getArgOperand(0)),
+                            getValue(I.getArgOperand(1)), DAG));
+    return 0;
+  case Intrinsic::sin:
+    setValue(&I, DAG.getNode(ISD::FSIN, dl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0))));
+    return 0;
+  case Intrinsic::cos:
+    setValue(&I, DAG.getNode(ISD::FCOS, dl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0))));
+    return 0;
+  case Intrinsic::log:
+    visitLog(I);
+    return 0;
+  case Intrinsic::log2:
+    visitLog2(I);
+    return 0;
+  case Intrinsic::log10:
+    visitLog10(I);
+    return 0;
+  case Intrinsic::exp:
+    visitExp(I);
+    return 0;
+  case Intrinsic::exp2:
+    visitExp2(I);
+    return 0;
+  case Intrinsic::pow:
+    visitPow(I);
+    return 0;
+  case Intrinsic::convert_to_fp16:
+    setValue(&I, DAG.getNode(ISD::FP32_TO_FP16, dl,
+                             MVT::i16, getValue(I.getArgOperand(0))));
+    return 0;
+  case Intrinsic::convert_from_fp16:
+    setValue(&I, DAG.getNode(ISD::FP16_TO_FP32, dl,
+                             MVT::f32, getValue(I.getArgOperand(0))));
+    return 0;
+  case Intrinsic::pcmarker: {
+    SDValue Tmp = getValue(I.getArgOperand(0));
+    DAG.setRoot(DAG.getNode(ISD::PCMARKER, dl, MVT::Other, getRoot(), Tmp));
+    return 0;
+  }
+  case Intrinsic::readcyclecounter: {
+    SDValue Op = getRoot();
+    Res = DAG.getNode(ISD::READCYCLECOUNTER, dl,
+                      DAG.getVTList(MVT::i64, MVT::Other),
+                      &Op, 1);
+    setValue(&I, Res);
+    DAG.setRoot(Res.getValue(1));
+    return 0;
+  }
+  case Intrinsic::bswap:
+    setValue(&I, DAG.getNode(ISD::BSWAP, dl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0))));
+    return 0;
+  case Intrinsic::cttz: {
+    SDValue Arg = getValue(I.getArgOperand(0));
+    EVT Ty = Arg.getValueType();
+    setValue(&I, DAG.getNode(ISD::CTTZ, dl, Ty, Arg));
+    return 0;
+  }
+  case Intrinsic::ctlz: {
+    SDValue Arg = getValue(I.getArgOperand(0));
+    EVT Ty = Arg.getValueType();
+    setValue(&I, DAG.getNode(ISD::CTLZ, dl, Ty, Arg));
+    return 0;
+  }
+  case Intrinsic::ctpop: {
+    SDValue Arg = getValue(I.getArgOperand(0));
+    EVT Ty = Arg.getValueType();
+    setValue(&I, DAG.getNode(ISD::CTPOP, dl, Ty, Arg));
+    return 0;
+  }
+  case Intrinsic::stacksave: {
+    SDValue Op = getRoot();
+    Res = DAG.getNode(ISD::STACKSAVE, dl,
+                      DAG.getVTList(TLI.getPointerTy(), MVT::Other), &Op, 1);
+    setValue(&I, Res);
+    DAG.setRoot(Res.getValue(1));
+    return 0;
+  }
+  case Intrinsic::stackrestore: {
+    Res = getValue(I.getArgOperand(0));
+    DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, dl, MVT::Other, getRoot(), Res));
+    return 0;
+  }
+  case Intrinsic::stackprotector: {
+    // Emit code into the DAG to store the stack guard onto the stack.
+    MachineFunction &MF = DAG.getMachineFunction();
+    MachineFrameInfo *MFI = MF.getFrameInfo();
+    EVT PtrTy = TLI.getPointerTy();
+
+    SDValue Src = getValue(I.getArgOperand(0));   // The guard's value.
+    AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));
+
+    int FI = FuncInfo.StaticAllocaMap[Slot];
+    MFI->setStackProtectorIndex(FI);
+
+    SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
+
+    // Store the stack protector onto the stack.
+    Res = DAG.getStore(getRoot(), getCurDebugLoc(), Src, FIN,
+                       PseudoSourceValue::getFixedStack(FI),
+                       0, true, false, 0);
+    setValue(&I, Res);
+    DAG.setRoot(Res);
+    return 0;
+  }
+  case Intrinsic::objectsize: {
+    // If we don't know by now, we're never going to know.
+    ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1));
+
+    assert(CI && "Non-constant type in __builtin_object_size?");
+
+    SDValue Arg = getValue(I.getCalledValue());
+    EVT Ty = Arg.getValueType();
+
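+    // A zero flag requests the maximum object size, whose conservative
+    // answer is -1 (unknown); a nonzero flag requests the minimum, whose
+    // conservative answer is 0.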
+    if (CI->isZero())
+      Res = DAG.getConstant(-1ULL, Ty);
+    else
+      Res = DAG.getConstant(0, Ty);
+
+    setValue(&I, Res);
+    return 0;
+  }
+  case Intrinsic::var_annotation:
+    // Discard annotate attributes
+    return 0;
+
+  case Intrinsic::init_trampoline: {
+    const Function *F = cast<Function>(I.getArgOperand(1)->stripPointerCasts());
+
+    SDValue Ops[6];
+    Ops[0] = getRoot();
+    Ops[1] = getValue(I.getArgOperand(0));
+    Ops[2] = getValue(I.getArgOperand(1));
+    Ops[3] = getValue(I.getArgOperand(2));
+    Ops[4] = DAG.getSrcValue(I.getArgOperand(0));
+    Ops[5] = DAG.getSrcValue(F);
+
+    Res = DAG.getNode(ISD::TRAMPOLINE, dl,
+                      DAG.getVTList(TLI.getPointerTy(), MVT::Other),
+                      Ops, 6);
+
+    setValue(&I, Res);
+    DAG.setRoot(Res.getValue(1));
+    return 0;
+  }
+  case Intrinsic::flt_rounds:
+    setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, dl, MVT::i32));
+    return 0;
+  case Intrinsic::trap:
+    DAG.setRoot(DAG.getNode(ISD::TRAP, dl, MVT::Other, getRoot()));
+    return 0;
+  case Intrinsic::uadd_with_overflow:
+    return implVisitAluOverflow(I, ISD::UADDO);
+  case Intrinsic::sadd_with_overflow:
+    return implVisitAluOverflow(I, ISD::SADDO);
+  case Intrinsic::usub_with_overflow:
+    return implVisitAluOverflow(I, ISD::USUBO);
+  case Intrinsic::ssub_with_overflow:
+    return implVisitAluOverflow(I, ISD::SSUBO);
+  case Intrinsic::umul_with_overflow:
+    return implVisitAluOverflow(I, ISD::UMULO);
+  case Intrinsic::smul_with_overflow:
+    return implVisitAluOverflow(I, ISD::SMULO);
+
+  case Intrinsic::prefetch: {
+    SDValue Ops[4];
+    Ops[0] = getRoot();
+    Ops[1] = getValue(I.getArgOperand(0));
+    Ops[2] = getValue(I.getArgOperand(1));
+    Ops[3] = getValue(I.getArgOperand(2));
+    DAG.setRoot(DAG.getNode(ISD::PREFETCH, dl, MVT::Other, &Ops[0], 4));
+    return 0;
+  }
+
+  case Intrinsic::memory_barrier: {
+    SDValue Ops[6];
+    Ops[0] = getRoot();
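+    // The five i1 operands select which orderings the barrier enforces:
+    // load-load, load-store, store-load, store-store, plus a device flag.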
+    for (int x = 1; x < 6; ++x)
+      Ops[x] = getValue(I.getArgOperand(x - 1));
+
+    DAG.setRoot(DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, &Ops[0], 6));
+    return 0;
+  }
+  case Intrinsic::atomic_cmp_swap: {
+    SDValue Root = getRoot();
+    SDValue L =
+      DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, getCurDebugLoc(),
+                    getValue(I.getArgOperand(1)).getValueType().getSimpleVT(),
+                    Root,
+                    getValue(I.getArgOperand(0)),
+                    getValue(I.getArgOperand(1)),
+                    getValue(I.getArgOperand(2)),
+                    I.getArgOperand(0));
+    setValue(&I, L);
+    DAG.setRoot(L.getValue(1));
+    return 0;
+  }
+  case Intrinsic::atomic_load_add:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_ADD);
+  case Intrinsic::atomic_load_sub:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_SUB);
+  case Intrinsic::atomic_load_or:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_OR);
+  case Intrinsic::atomic_load_xor:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_XOR);
+  case Intrinsic::atomic_load_and:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_AND);
+  case Intrinsic::atomic_load_nand:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_NAND);
+  case Intrinsic::atomic_load_max:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_MAX);
+  case Intrinsic::atomic_load_min:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_MIN);
+  case Intrinsic::atomic_load_umin:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_UMIN);
+  case Intrinsic::atomic_load_umax:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_UMAX);
+  case Intrinsic::atomic_swap:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_SWAP);
+
+  case Intrinsic::invariant_start:
+  case Intrinsic::lifetime_start:
+    // Discard region information.
+    setValue(&I, DAG.getUNDEF(TLI.getPointerTy()));
+    return 0;
+  case Intrinsic::invariant_end:
+  case Intrinsic::lifetime_end:
+    // Discard region information.
+    return 0;
+  }
+}
+
+void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
+                                      bool isTailCall) {
+  const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  const FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  const Type *RetTy = FTy->getReturnType();
+  MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
+  MCSymbol *BeginLabel = 0;
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Args.reserve(CS.arg_size());
+
+  // Check whether the function can return without sret-demotion.
+  SmallVector<ISD::OutputArg, 4> Outs;
+  SmallVector<uint64_t, 4> Offsets;
+  GetReturnInfo(RetTy, CS.getAttributes().getRetAttributes(),
+                Outs, TLI, &Offsets);
+
+  bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
+                        FTy->isVarArg(), Outs, FTy->getContext());
+
+  SDValue DemoteStackSlot;
+
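+  // If the return value cannot be lowered entirely in registers, demote the
+  // call: pass a hidden pointer to a fresh stack slot as an sret argument,
+  // lower the call as returning void, and load the results back out of the
+  // slot after the call.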
+  if (!CanLowerReturn) {
+    uint64_t TySize = TLI.getTargetData()->getTypeAllocSize(
+                      FTy->getReturnType());
+    unsigned Align  = TLI.getTargetData()->getPrefTypeAlignment(
+                      FTy->getReturnType());
+    MachineFunction &MF = DAG.getMachineFunction();
+    int SSFI = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
+    const Type *StackSlotPtrType = PointerType::getUnqual(FTy->getReturnType());
+
+    DemoteStackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy());
+    Entry.Node = DemoteStackSlot;
+    Entry.Ty = StackSlotPtrType;
+    Entry.isSExt = false;
+    Entry.isZExt = false;
+    Entry.isInReg = false;
+    Entry.isSRet = true;
+    Entry.isNest = false;
+    Entry.isByVal = false;
+    Entry.Alignment = Align;
+    Args.push_back(Entry);
+    RetTy = Type::getVoidTy(FTy->getContext());
+  }
+
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    SDValue ArgNode = getValue(*i);
+    Entry.Node = ArgNode; Entry.Ty = (*i)->getType();
+
+    unsigned attrInd = i - CS.arg_begin() + 1;
+    Entry.isSExt  = CS.paramHasAttr(attrInd, Attribute::SExt);
+    Entry.isZExt  = CS.paramHasAttr(attrInd, Attribute::ZExt);
+    Entry.isInReg = CS.paramHasAttr(attrInd, Attribute::InReg);
+    Entry.isSRet  = CS.paramHasAttr(attrInd, Attribute::StructRet);
+    Entry.isNest  = CS.paramHasAttr(attrInd, Attribute::Nest);
+    Entry.isByVal = CS.paramHasAttr(attrInd, Attribute::ByVal);
+    Entry.Alignment = CS.getParamAlignment(attrInd);
+    Args.push_back(Entry);
+  }
+
+  // Check if target-independent constraints permit a tail call here.
+  // Target-dependent constraints are checked within TLI.LowerCallTo.
+  if (isTailCall &&
+      !isInTailCallPosition(CS, CS.getAttributes().getRetAttributes(), TLI))
+    isTailCall = false;
+
+  std::pair<SDValue,SDValue> Result =
+    TLI.LowerCallTo(getRoot(), RetTy,
+                    CS.paramHasAttr(0, Attribute::SExt),
+                    CS.paramHasAttr(0, Attribute::ZExt), FTy->isVarArg(),
+                    CS.paramHasAttr(0, Attribute::InReg), FTy->getNumParams(),
+                    CS.getCallingConv(),
+                    isTailCall,
+                    !CS.getInstruction()->use_empty(),
+                    Callee, Args, DAG, getCurDebugLoc());
+  assert((isTailCall || Result.second.getNode()) &&
+         "Non-null chain expected with non-tail call!");
+  assert((Result.second.getNode() || !Result.first.getNode()) &&
+         "Null value expected with tail call!");
+  if (Result.first.getNode()) {
+    setValue(CS.getInstruction(), Result.first);
+  } else if (!CanLowerReturn && Result.second.getNode()) {
+    // The instruction result is the result of loading from the
+    // hidden sret parameter.
+    SmallVector<EVT, 1> PVTs;
+    const Type *PtrRetTy = PointerType::getUnqual(FTy->getReturnType());
+
+    ComputeValueVTs(TLI, PtrRetTy, PVTs);
+    assert(PVTs.size() == 1 && "Pointers should fit in one register");
+    EVT PtrVT = PVTs[0];
+    unsigned NumValues = Outs.size();
+    SmallVector<SDValue, 4> Values(NumValues);
+    SmallVector<SDValue, 4> Chains(NumValues);
+
+    for (unsigned i = 0; i < NumValues; ++i) {
+      SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), PtrVT,
+                                DemoteStackSlot,
+                                DAG.getConstant(Offsets[i], PtrVT));
+      SDValue L = DAG.getLoad(Outs[i].VT, getCurDebugLoc(), Result.second,
+                              Add, NULL, Offsets[i], false, false, 1);
+      Values[i] = L;
+      Chains[i] = L.getValue(1);
+    }
+
+    SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+                                MVT::Other, &Chains[0], NumValues);
+    PendingLoads.push_back(Chain);
+    
+    // Collect the legal value parts into potentially illegal values
+    // that correspond to the original function's return values.
+    SmallVector<EVT, 4> RetTys;
+    RetTy = FTy->getReturnType();
+    ComputeValueVTs(TLI, RetTy, RetTys);
+    ISD::NodeType AssertOp = ISD::DELETED_NODE;
+    SmallVector<SDValue, 4> ReturnValues;
+    unsigned CurReg = 0;
+    for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+      EVT VT = RetTys[I];
+      EVT RegisterVT = TLI.getRegisterType(RetTy->getContext(), VT);
+      unsigned NumRegs = TLI.getNumRegisters(RetTy->getContext(), VT);
+  
+      SDValue ReturnValue =
+        getCopyFromParts(DAG, getCurDebugLoc(), &Values[CurReg], NumRegs,
+                         RegisterVT, VT, AssertOp);
+      ReturnValues.push_back(ReturnValue);
+      CurReg += NumRegs;
+    }
+
+    setValue(CS.getInstruction(),
+             DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+                         DAG.getVTList(&RetTys[0], RetTys.size()),
+                         &ReturnValues[0], ReturnValues.size()));
+
+  }
+
+  // As a special case, a null chain means that a tail call has been emitted and
+  // the DAG root is already updated.
+  if (Result.second.getNode())
+    DAG.setRoot(Result.second);
+  else
+    HasTailCall = true;
+}
+
+/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
+/// value is equal or not-equal to zero.
+static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) {
+  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end();
+       UI != E; ++UI) {
+    if (const ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
+      if (IC->isEquality())
+        if (const Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+          if (C->isNullValue())
+            continue;
+    // Unknown instruction.
+    return false;
+  }
+  return true;
+}
+
+static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
+                             const Type *LoadTy,
+                             SelectionDAGBuilder &Builder) {
+
+  // Check to see if this load can be trivially constant folded, e.g. if the
+  // input is from a string literal.
+  if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
+    // Cast pointer to the type we really want to load.
+    LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
+                                         PointerType::getUnqual(LoadTy));
+
+    if (const Constant *LoadCst =
+          ConstantFoldLoadFromConstPtr(const_cast<Constant *>(LoadInput),
+                                       Builder.TD))
+      return Builder.getValue(LoadCst);
+  }
+
+  // Otherwise, we have to emit the load.  If the pointer points to unfoldable
+  // but still constant memory, the input chain can be the entry node.
+  SDValue Root;
+  bool ConstantMemory = false;
+
+  // Do not serialize (non-volatile) loads of constant memory with anything.
+  if (Builder.AA->pointsToConstantMemory(PtrVal)) {
+    Root = Builder.DAG.getEntryNode();
+    ConstantMemory = true;
+  } else {
+    // Do not serialize non-volatile loads against each other.
+    Root = Builder.DAG.getRoot();
+  }
+
+  SDValue Ptr = Builder.getValue(PtrVal);
+  SDValue LoadVal = Builder.DAG.getLoad(LoadVT, Builder.getCurDebugLoc(), Root,
+                                        Ptr, PtrVal /*SrcValue*/, 0/*SVOffset*/,
+                                        false /*volatile*/,
+                                        false /*nontemporal*/, 1 /* align=1 */);
+
+  if (!ConstantMemory)
+    Builder.PendingLoads.push_back(LoadVal.getValue(1));
+  return LoadVal;
+}
+
+
+/// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form.
+/// If so, lower it and return true; otherwise return false and it will be
+/// lowered like a normal call.
+bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
+  // Verify that the prototype makes sense.  int memcmp(void*,void*,size_t)
+  if (I.getNumArgOperands() != 3)
+    return false;
+
+  const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1);
+  if (!LHS->getType()->isPointerTy() || !RHS->getType()->isPointerTy() ||
+      !I.getArgOperand(2)->getType()->isIntegerTy() ||
+      !I.getType()->isIntegerTy())
+    return false;
+
+  const ConstantInt *Size = dyn_cast<ConstantInt>(I.getArgOperand(2));
+
+  // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS)  != 0
+  // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS)  != 0
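+  // memcmp(S1,S2,8) != 0 -> (*(long long*)LHS != *(long long*)RHS) != 0,
+  //   when i64 is legal and unaligned i64 loads are supported (checked below).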
+  if (Size && IsOnlyUsedInZeroEqualityComparison(&I)) {
+    bool ActuallyDoIt = true;
+    MVT LoadVT;
+    const Type *LoadTy;
+    switch (Size->getZExtValue()) {
+    default:
+      LoadVT = MVT::Other;
+      LoadTy = 0;
+      ActuallyDoIt = false;
+      break;
+    case 2:
+      LoadVT = MVT::i16;
+      LoadTy = Type::getInt16Ty(Size->getContext());
+      break;
+    case 4:
+      LoadVT = MVT::i32;
+      LoadTy = Type::getInt32Ty(Size->getContext());
+      break;
+    case 8:
+      LoadVT = MVT::i64;
+      LoadTy = Type::getInt64Ty(Size->getContext());
+      break;
+        /*
+    case 16:
+      LoadVT = MVT::v4i32;
+      LoadTy = Type::getInt32Ty(Size->getContext());
+      LoadTy = VectorType::get(LoadTy, 4);
+      break;
+         */
+    }
+
+    // This turns into unaligned loads.  We only do this if the target natively
+    // supports the MVT we'll be loading or if it is small enough (<= 4) that
+    // we'll only produce a small number of byte loads.
+
+    // Require that we can find a legal MVT, and only do this if the target
+    // supports unaligned loads of that type.  Expanding into byte loads would
+    // bloat the code.
+    if (ActuallyDoIt && Size->getZExtValue() > 4) {
+      // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+      // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+      if (!TLI.isTypeLegal(LoadVT) ||
+          !TLI.allowsUnalignedMemoryAccesses(LoadVT))
+        ActuallyDoIt = false;
+    }
+
+    if (ActuallyDoIt) {
+      SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this);
+      SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this);
+
+      SDValue Res = DAG.getSetCC(getCurDebugLoc(), MVT::i1, LHSVal, RHSVal,
+                                 ISD::SETNE);
+      EVT CallVT = TLI.getValueType(I.getType(), true);
+      setValue(&I, DAG.getZExtOrTrunc(Res, getCurDebugLoc(), CallVT));
+      return true;
+    }
+  }
+
+
+  return false;
+}
+
+
+void SelectionDAGBuilder::visitCall(const CallInst &I) {
+  const char *RenameFn = 0;
+  if (Function *F = I.getCalledFunction()) {
+    if (F->isDeclaration()) {
+      if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo()) {
+        if (unsigned IID = II->getIntrinsicID(F)) {
+          RenameFn = visitIntrinsicCall(I, IID);
+          if (!RenameFn)
+            return;
+        }
+      }
+      if (unsigned IID = F->getIntrinsicID()) {
+        RenameFn = visitIntrinsicCall(I, IID);
+        if (!RenameFn)
+          return;
+      }
+    }
+
+    // Check for well-known libc/libm calls.  If the function is internal, it
+    // can't be a library call.
+    if (!F->hasLocalLinkage() && F->hasName()) {
+      StringRef Name = F->getName();
+      if (Name == "copysign" || Name == "copysignf" || Name == "copysignl") {
+        if (I.getNumArgOperands() == 2 &&   // Basic sanity checks.
+            I.getArgOperand(0)->getType()->isFloatingPointTy() &&
+            I.getType() == I.getArgOperand(0)->getType() &&
+            I.getType() == I.getArgOperand(1)->getType()) {
+          SDValue LHS = getValue(I.getArgOperand(0));
+          SDValue RHS = getValue(I.getArgOperand(1));
+          setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurDebugLoc(),
+                                   LHS.getValueType(), LHS, RHS));
+          return;
+        }
+      } else if (Name == "fabs" || Name == "fabsf" || Name == "fabsl") {
+        if (I.getNumArgOperands() == 1 &&   // Basic sanity checks.
+            I.getArgOperand(0)->getType()->isFloatingPointTy() &&
+            I.getType() == I.getArgOperand(0)->getType()) {
+          SDValue Tmp = getValue(I.getArgOperand(0));
+          setValue(&I, DAG.getNode(ISD::FABS, getCurDebugLoc(),
+                                   Tmp.getValueType(), Tmp));
+          return;
+        }
+      } else if (Name == "sin" || Name == "sinf" || Name == "sinl") {
+        if (I.getNumArgOperands() == 1 &&   // Basic sanity checks.
+            I.getArgOperand(0)->getType()->isFloatingPointTy() &&
+            I.getType() == I.getArgOperand(0)->getType() &&
+            I.onlyReadsMemory()) {
+          SDValue Tmp = getValue(I.getArgOperand(0));
+          setValue(&I, DAG.getNode(ISD::FSIN, getCurDebugLoc(),
+                                   Tmp.getValueType(), Tmp));
+          return;
+        }
+      } else if (Name == "cos" || Name == "cosf" || Name == "cosl") {
+        if (I.getNumArgOperands() == 1 &&   // Basic sanity checks.
+            I.getArgOperand(0)->getType()->isFloatingPointTy() &&
+            I.getType() == I.getArgOperand(0)->getType() &&
+            I.onlyReadsMemory()) {
+          SDValue Tmp = getValue(I.getArgOperand(0));
+          setValue(&I, DAG.getNode(ISD::FCOS, getCurDebugLoc(),
+                                   Tmp.getValueType(), Tmp));
+          return;
+        }
+      } else if (Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl") {
+        if (I.getNumArgOperands() == 1 &&   // Basic sanity checks.
+            I.getArgOperand(0)->getType()->isFloatingPointTy() &&
+            I.getType() == I.getArgOperand(0)->getType() &&
+            I.onlyReadsMemory()) {
+          SDValue Tmp = getValue(I.getArgOperand(0));
+          setValue(&I, DAG.getNode(ISD::FSQRT, getCurDebugLoc(),
+                                   Tmp.getValueType(), Tmp));
+          return;
+        }
+      } else if (Name == "memcmp") {
+        if (visitMemCmpCall(I))
+          return;
+      }
+    }
+  }
+  
+  SDValue Callee;
+  if (!RenameFn)
+    Callee = getValue(I.getCalledValue());
+  else
+    Callee = DAG.getExternalSymbol(RenameFn, TLI.getPointerTy());
+
+  // Check if we can potentially perform a tail call. More detailed checking
+  // will be done within LowerCallTo, after more information about the call
+  // is known.
+  LowerCallTo(&I, Callee, I.isTailCall());
+}
+
+namespace llvm {
+
+/// AsmOperandInfo - This contains information for each constraint that we are
+/// lowering.
+class LLVM_LIBRARY_VISIBILITY SDISelAsmOperandInfo :
+    public TargetLowering::AsmOperandInfo {
+public:
+  /// CallOperand - If this is the result output operand or a clobber,
+  /// this is null; otherwise it is the incoming operand to the CallInst.
+  /// This gets modified as the asm is processed.
+  SDValue CallOperand;
+
+  /// AssignedRegs - If this is a register or register class operand, this
+  /// contains the set of registers corresponding to the operand.
+  RegsForValue AssignedRegs;
+
+  explicit SDISelAsmOperandInfo(const InlineAsm::ConstraintInfo &info)
+    : TargetLowering::AsmOperandInfo(info), CallOperand(0,0) {
+  }
+
+  /// MarkAllocatedRegs - Once AssignedRegs is set, mark the assigned registers
+  /// busy in OutputRegs/InputRegs.
+  void MarkAllocatedRegs(bool isOutReg, bool isInReg,
+                         std::set<unsigned> &OutputRegs,
+                         std::set<unsigned> &InputRegs,
+                         const TargetRegisterInfo &TRI) const {
+    if (isOutReg) {
+      for (unsigned i = 0, e = AssignedRegs.Regs.size(); i != e; ++i)
+        MarkRegAndAliases(AssignedRegs.Regs[i], OutputRegs, TRI);
+    }
+    if (isInReg) {
+      for (unsigned i = 0, e = AssignedRegs.Regs.size(); i != e; ++i)
+        MarkRegAndAliases(AssignedRegs.Regs[i], InputRegs, TRI);
+    }
+  }
+
+  /// getCallOperandValEVT - Return the EVT of the Value* that this operand
+  /// corresponds to.  If there is no Value* for this operand, it returns
+  /// MVT::Other.
+  EVT getCallOperandValEVT(LLVMContext &Context,
+                           const TargetLowering &TLI,
+                           const TargetData *TD) const {
+    if (CallOperandVal == 0) return MVT::Other;
+
+    if (isa<BasicBlock>(CallOperandVal))
+      return TLI.getPointerTy();
+
+    const llvm::Type *OpTy = CallOperandVal->getType();
+
+    // If this is an indirect operand, the operand is a pointer to the
+    // accessed type.
+    if (isIndirect) {
+      const llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
+      if (!PtrTy)
+        report_fatal_error("Indirect operand for inline asm not a pointer!");
+      OpTy = PtrTy->getElementType();
+    }
+
+    // If OpTy is not a single value, it may be a struct/union that we
+    // can tile with integers.
+    if (!OpTy->isSingleValueType() && OpTy->isSized()) {
+      unsigned BitSize = TD->getTypeSizeInBits(OpTy);
+      switch (BitSize) {
+      default: break;
+      case 1:
+      case 8:
+      case 16:
+      case 32:
+      case 64:
+      case 128:
+        OpTy = IntegerType::get(Context, BitSize);
+        break;
+      }
+    }
+
+    return TLI.getValueType(OpTy, true);
+  }
+
+private:
+  /// MarkRegAndAliases - Mark the specified register and all aliases in the
+  /// specified set.
+  static void MarkRegAndAliases(unsigned Reg, std::set<unsigned> &Regs,
+                                const TargetRegisterInfo &TRI) {
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg) && "Isn't a physreg");
+    Regs.insert(Reg);
+    if (const unsigned *Aliases = TRI.getAliasSet(Reg))
+      for (; *Aliases; ++Aliases)
+        Regs.insert(*Aliases);
+  }
+};
+
+} // end llvm namespace.
+
+/// isAllocatableRegister - If the specified register is safe to allocate,
+/// i.e. it isn't a stack pointer or some other special register, return the
+/// register class for the register.  Otherwise, return null.
+static const TargetRegisterClass *
+isAllocatableRegister(unsigned Reg, MachineFunction &MF,
+                      const TargetLowering &TLI,
+                      const TargetRegisterInfo *TRI) {
+  EVT FoundVT = MVT::Other;
+  const TargetRegisterClass *FoundRC = 0;
+  for (TargetRegisterInfo::regclass_iterator RCI = TRI->regclass_begin(),
+       E = TRI->regclass_end(); RCI != E; ++RCI) {
+    EVT ThisVT = MVT::Other;
+
+    const TargetRegisterClass *RC = *RCI;
+    // If none of the value types for this register class are valid, we
+    // can't use it.  For example, 64-bit reg classes on 32-bit targets.
+    for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end();
+         I != E; ++I) {
+      if (TLI.isTypeLegal(*I)) {
+        // If we have already found this register in a different register class,
+        // choose the one with the largest VT specified.  For example, on
+        // PowerPC, we favor f64 register classes over f32.
+        if (FoundVT == MVT::Other || FoundVT.bitsLT(*I)) {
+          ThisVT = *I;
+          break;
+        }
+      }
+    }
+
+    if (ThisVT == MVT::Other) continue;
+
+    // NOTE: This isn't ideal.  In particular, this might allocate the
+    // frame pointer in functions that need it: the frame pointer has not
+    // been taken out of the allocation order yet, because a variable-sized
+    // allocation hasn't been seen.  This is a slight code pessimization,
+    // but should still work.
+    for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF),
+         E = RC->allocation_order_end(MF); I != E; ++I)
+      if (*I == Reg) {
+        // We found a matching register class.  Keep looking at others in case
+        // we find one with larger registers that this physreg is also in.
+        FoundRC = RC;
+        FoundVT = ThisVT;
+        break;
+      }
+  }
+  return FoundRC;
+}
+
+/// GetRegistersForValue - Assign registers (virtual or physical) for the
+/// specified operand.  We prefer to assign virtual registers, to allow the
+/// register allocator to handle the assignment process.  However, if the asm
+/// uses features that we can't model on machineinstrs, we have SDISel do the
+/// allocation.  This produces generally horrible, but correct, code.
+///
+///   OpInfo describes the operand.
+///   Input and OutputRegs are the set of already allocated physical registers.
+///
+void SelectionDAGBuilder::
+GetRegistersForValue(SDISelAsmOperandInfo &OpInfo,
+                     std::set<unsigned> &OutputRegs,
+                     std::set<unsigned> &InputRegs) {
+  LLVMContext &Context = FuncInfo.Fn->getContext();
+
+  // Compute whether this value requires an input register, an output register,
+  // or both.
+  bool isOutReg = false;
+  bool isInReg = false;
+  switch (OpInfo.Type) {
+  case InlineAsm::isOutput:
+    isOutReg = true;
+
+    // If there is an input constraint that matches this, we need to reserve
+    // the input register so no other inputs allocate to it.
+    isInReg = OpInfo.hasMatchingInput();
+    break;
+  case InlineAsm::isInput:
+    isInReg = true;
+    isOutReg = false;
+    break;
+  case InlineAsm::isClobber:
+    isOutReg = true;
+    isInReg = true;
+    break;
+  }
+
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  SmallVector<unsigned, 4> Regs;
+
+  // If this is a constraint for a single physreg, or a constraint for a
+  // register class, find it.
+  std::pair<unsigned, const TargetRegisterClass*> PhysReg =
+    TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
+                                     OpInfo.ConstraintVT);
+
+  unsigned NumRegs = 1;
+  if (OpInfo.ConstraintVT != MVT::Other) {
+    // If this is a FP input in an integer register (or vice versa) insert a bit
+    // cast of the input value.  More generally, handle any case where the input
+    // value disagrees with the register class we plan to stick this in.
+    if (OpInfo.Type == InlineAsm::isInput &&
+        PhysReg.second && !PhysReg.second->hasType(OpInfo.ConstraintVT)) {
+      // Try to convert to the first EVT that the reg class contains.  If the
+      // types are identical size, use a bitcast to convert (e.g. two differing
+      // vector types).
+      EVT RegVT = *PhysReg.second->vt_begin();
+      if (RegVT.getSizeInBits() == OpInfo.ConstraintVT.getSizeInBits()) {
+        OpInfo.CallOperand = DAG.getNode(ISD::BIT_CONVERT, getCurDebugLoc(),
+                                         RegVT, OpInfo.CallOperand);
+        OpInfo.ConstraintVT = RegVT;
+      } else if (RegVT.isInteger() && OpInfo.ConstraintVT.isFloatingPoint()) {
+        // If the input is a FP value and we want it in FP registers, do a
+        // bitcast to the corresponding integer type.  This turns an f64 value
+        // into i64, which can be passed with two i32 values on a 32-bit
+        // machine.
+        RegVT = EVT::getIntegerVT(Context,
+                                  OpInfo.ConstraintVT.getSizeInBits());
+        OpInfo.CallOperand = DAG.getNode(ISD::BIT_CONVERT, getCurDebugLoc(),
+                                         RegVT, OpInfo.CallOperand);
+        OpInfo.ConstraintVT = RegVT;
+      }
+    }
+
+    NumRegs = TLI.getNumRegisters(Context, OpInfo.ConstraintVT);
+  }
+
+  EVT RegVT;
+  EVT ValueVT = OpInfo.ConstraintVT;
+
+  // If this is a constraint for a specific physical register, like {r17},
+  // assign it now.
+  if (unsigned AssignedReg = PhysReg.first) {
+    const TargetRegisterClass *RC = PhysReg.second;
+    if (OpInfo.ConstraintVT == MVT::Other)
+      ValueVT = *RC->vt_begin();
+
+    // Get the actual register value type.  This is important, because the user
+    // may have asked for (e.g.) the AX register in i32 type.  We need to
+    // remember that AX is actually i16 to get the right extension.
+    RegVT = *RC->vt_begin();
+
+    // This is an explicit reference to a physical register.
+    Regs.push_back(AssignedReg);
+
+    // If this is an expanded reference, add the rest of the regs to Regs.
+    if (NumRegs != 1) {
+      TargetRegisterClass::iterator I = RC->begin();
+      // Advance to AssignedReg, checking bounds before dereferencing.
+      while (I != RC->end() && *I != AssignedReg)
+        ++I;
+      assert(I != RC->end() && "Didn't find reg!");
+
+      // Already added the first reg.
+      --NumRegs; ++I;
+      for (; NumRegs; --NumRegs, ++I) {
+        assert(I != RC->end() && "Ran out of registers to allocate!");
+        Regs.push_back(*I);
+      }
+    }
+
+    OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT);
+    const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
+    OpInfo.MarkAllocatedRegs(isOutReg, isInReg, OutputRegs, InputRegs, *TRI);
+    return;
+  }
+
+  // Otherwise, if this was a reference to an LLVM register class, create vregs
+  // for this reference.
+  if (const TargetRegisterClass *RC = PhysReg.second) {
+    RegVT = *RC->vt_begin();
+    if (OpInfo.ConstraintVT == MVT::Other)
+      ValueVT = RegVT;
+
+    // Create the appropriate number of virtual registers.
+    MachineRegisterInfo &RegInfo = MF.getRegInfo();
+    for (; NumRegs; --NumRegs)
+      Regs.push_back(RegInfo.createVirtualRegister(RC));
+
+    OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT);
+    return;
+  }
+
+  // This is a reference to a register class that doesn't directly correspond
+  // to an LLVM register class.  Allocate NumRegs consecutive, available
+  // registers from the class.
+  std::vector<unsigned> RegClassRegs
+    = TLI.getRegClassForInlineAsmConstraint(OpInfo.ConstraintCode,
+                                            OpInfo.ConstraintVT);
+
+  const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
+  unsigned NumAllocated = 0;
+  for (unsigned i = 0, e = RegClassRegs.size(); i != e; ++i) {
+    unsigned Reg = RegClassRegs[i];
+    // See if this register is available.
+    if ((isOutReg && OutputRegs.count(Reg)) ||   // Already used.
+        (isInReg  && InputRegs.count(Reg))) {    // Already used.
+      // Make sure we find consecutive registers.
+      NumAllocated = 0;
+      continue;
+    }
+
+    // Check to see if this register is allocatable (i.e. don't give out the
+    // stack pointer).
+    const TargetRegisterClass *RC = isAllocatableRegister(Reg, MF, TLI, TRI);
+    if (!RC) {        // Couldn't allocate this register.
+      // Reset NumAllocated to make sure we return consecutive registers.
+      NumAllocated = 0;
+      continue;
+    }
+
+    // Okay, this register is good, we can use it.
+    ++NumAllocated;
+
+    // If we allocated enough consecutive registers, succeed.
+    if (NumAllocated == NumRegs) {
+      unsigned RegStart = (i-NumAllocated)+1;
+      unsigned RegEnd   = i+1;
+      // Mark all of the allocated registers used.
+      for (unsigned i = RegStart; i != RegEnd; ++i)
+        Regs.push_back(RegClassRegs[i]);
+
+      OpInfo.AssignedRegs = RegsForValue(Regs, *RC->vt_begin(),
+                                         OpInfo.ConstraintVT);
+      OpInfo.MarkAllocatedRegs(isOutReg, isInReg, OutputRegs, InputRegs, *TRI);
+      return;
+    }
+  }
+
+  // Otherwise, we couldn't allocate enough registers for this.
+}
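+// Illustrative note (not part of the original source): for IR such as
+//   %lo = call i32 asm "bswap $0", "={eax},0"(i32 %x)
+// the "{eax}" constraint is resolved by getRegForInlineAsmConstraint to the
+// single-physreg path above, while a class constraint like "r" instead yields
+// a register class and fresh virtual registers.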
+
+/// TargetLowering::LowerCallTo - This is the default LowerCallTo
+/// implementation, which just calls LowerCall.
+/// FIXME: When all targets are
+/// migrated to using LowerCall, this hook should be integrated into SDISel.
+std::pair<SDValue, SDValue>
+TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy,
+                            bool RetSExt, bool RetZExt, bool isVarArg,
+                            bool isInreg, unsigned NumFixedArgs,
+                            CallingConv::ID CallConv, bool isTailCall,
+                            bool isReturnValueUsed,
+                            SDValue Callee,
+                            ArgListTy &Args, SelectionDAG &DAG,
+                            DebugLoc dl) const {
+  // Handle all of the outgoing arguments.
+  SmallVector<ISD::OutputArg, 32> Outs;
+  SmallVector<SDValue, 32> OutVals;
+  for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+    SmallVector<EVT, 4> ValueVTs;
+    ComputeValueVTs(*this, Args[i].Ty, ValueVTs);
+    for (unsigned Value = 0, NumValues = ValueVTs.size();
+         Value != NumValues; ++Value) {
+      EVT VT = ValueVTs[Value];
+      const Type *ArgTy = VT.getTypeForEVT(RetTy->getContext());
+      SDValue Op = SDValue(Args[i].Node.getNode(),
+                           Args[i].Node.getResNo() + Value);
+      ISD::ArgFlagsTy Flags;
+      unsigned OriginalAlignment =
+        getTargetData()->getABITypeAlignment(ArgTy);
+
+      if (Args[i].isZExt)
+        Flags.setZExt();
+      if (Args[i].isSExt)
+        Flags.setSExt();
+      if (Args[i].isInReg)
+        Flags.setInReg();
+      if (Args[i].isSRet)
+        Flags.setSRet();
+      if (Args[i].isByVal) {
+        Flags.setByVal();
+        const PointerType *Ty = cast<PointerType>(Args[i].Ty);
+        const Type *ElementTy = Ty->getElementType();
+        unsigned FrameAlign = getByValTypeAlignment(ElementTy);
+        unsigned FrameSize  = getTargetData()->getTypeAllocSize(ElementTy);
+        // For ByVal, the alignment should come from the front end; the back
+        // end will guess if this info is missing, but there are cases it
+        // cannot get right.
+        if (Args[i].Alignment)
+          FrameAlign = Args[i].Alignment;
+        Flags.setByValAlign(FrameAlign);
+        Flags.setByValSize(FrameSize);
+      }
+      if (Args[i].isNest)
+        Flags.setNest();
+      Flags.setOrigAlign(OriginalAlignment);
+
+      EVT PartVT = getRegisterType(RetTy->getContext(), VT);
+      unsigned NumParts = getNumRegisters(RetTy->getContext(), VT);
+      SmallVector<SDValue, 4> Parts(NumParts);
+      ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+
+      if (Args[i].isSExt)
+        ExtendKind = ISD::SIGN_EXTEND;
+      else if (Args[i].isZExt)
+        ExtendKind = ISD::ZERO_EXTEND;
+
+      getCopyToParts(DAG, dl, Op, &Parts[0], NumParts,
+                     PartVT, ExtendKind);
+
+      for (unsigned j = 0; j != NumParts; ++j) {
+        // If this isn't the first piece, the alignment must be 1.
+        ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(),
+                               i < NumFixedArgs);
+        if (NumParts > 1 && j == 0)
+          MyFlags.Flags.setSplit();
+        else if (j != 0)
+          MyFlags.Flags.setOrigAlign(1);
+
+        Outs.push_back(MyFlags);
+        OutVals.push_back(Parts[j]);
+      }
+    }
+  }
+
+  // Handle the incoming return values from the call.
+  SmallVector<ISD::InputArg, 32> Ins;
+  SmallVector<EVT, 4> RetTys;
+  ComputeValueVTs(*this, RetTy, RetTys);
+  for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+    EVT VT = RetTys[I];
+    EVT RegisterVT = getRegisterType(RetTy->getContext(), VT);
+    unsigned NumRegs = getNumRegisters(RetTy->getContext(), VT);
+    for (unsigned i = 0; i != NumRegs; ++i) {
+      ISD::InputArg MyFlags;
+      MyFlags.VT = RegisterVT;
+      MyFlags.Used = isReturnValueUsed;
+      if (RetSExt)
+        MyFlags.Flags.setSExt();
+      if (RetZExt)
+        MyFlags.Flags.setZExt();
+      if (isInreg)
+        MyFlags.Flags.setInReg();
+      Ins.push_back(MyFlags);
+    }
+  }
+
+  SmallVector<SDValue, 4> InVals;
+  Chain = LowerCall(Chain, Callee, CallConv, isVarArg, isTailCall,
+                    Outs, OutVals, Ins, dl, DAG, InVals);
+
+  // Verify that the target's LowerCall behaved as expected.
+  assert(Chain.getNode() && Chain.getValueType() == MVT::Other &&
+         "LowerCall didn't return a valid chain!");
+  assert((!isTailCall || InVals.empty()) &&
+         "LowerCall emitted a return value for a tail call!");
+  assert((isTailCall || InVals.size() == Ins.size()) &&
+         "LowerCall didn't emit the correct number of values!");
+
+  // For a tail call, the return value is merely live-out and there aren't
+  // any nodes in the DAG representing it. Return a special value to
+  // indicate that a tail call has been emitted and no more Instructions
+  // should be processed in the current block.
+  if (isTailCall) {
+    DAG.setRoot(Chain);
+    return std::make_pair(SDValue(), SDValue());
+  }
+
+  DEBUG(for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+          assert(InVals[i].getNode() &&
+                 "LowerCall emitted a null value!");
+          assert(Ins[i].VT == InVals[i].getValueType() &&
+                 "LowerCall emitted a value with the wrong type!");
+        });
+
+  // Collect the legal value parts into potentially illegal values
+  // that correspond to the original function's return values.
+  ISD::NodeType AssertOp = ISD::DELETED_NODE;
+  if (RetSExt)
+    AssertOp = ISD::AssertSext;
+  else if (RetZExt)
+    AssertOp = ISD::AssertZext;
+  SmallVector<SDValue, 4> ReturnValues;
+  unsigned CurReg = 0;
+  for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+    EVT VT = RetTys[I];
+    EVT RegisterVT = getRegisterType(RetTy->getContext(), VT);
+    unsigned NumRegs = getNumRegisters(RetTy->getContext(), VT);
+
+    ReturnValues.push_back(getCopyFromParts(DAG, dl, &InVals[CurReg],
+                                            NumRegs, RegisterVT, VT,
+                                            AssertOp));
+    CurReg += NumRegs;
+  }
+
+  // For a function returning void, there is no return value. We can't create
+  // such a node, so we just return a null return value in that case. Nothing
+  // will actually look at the value.
+  if (ReturnValues.empty())
+    return std::make_pair(SDValue(), Chain);
+
+  SDValue Res = DAG.getNode(ISD::MERGE_VALUES, dl,
+                            DAG.getVTList(&RetTys[0], RetTys.size()),
+                            &ReturnValues[0], ReturnValues.size());
+  return std::make_pair(Res, Chain);
+}
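+// Worked example (illustrative assumption): on a 32-bit target an i64
+// argument has getNumRegisters() == 2, so the loop above emits two i32
+// ISD::OutputArg pieces; the first is marked setSplit() and the second gets
+// setOrigAlign(1). An i64 return value is reassembled from two i32 InVals by
+// getCopyFromParts in the same fashion.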
+
+void TargetLowering::LowerOperationWrapper(SDNode *N,
+                                           SmallVectorImpl<SDValue> &Results,
+                                           SelectionDAG &DAG) const {
+  SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+  if (Res.getNode())
+    Results.push_back(Res);
+}
+
+SDValue TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  llvm_unreachable("LowerOperation not implemented for this target!");
+  return SDValue();
+}
+
+void
+SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
+  SDValue Op = getNonRegisterValue(V);
+  assert((Op.getOpcode() != ISD::CopyFromReg ||
+          cast<RegisterSDNode>(Op.getOperand(1))->getReg() != Reg) &&
+         "Copy from a reg to the same reg!");
+  assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg");
+
+  RegsForValue RFV(V->getContext(), TLI, Reg, V->getType());
+  SDValue Chain = DAG.getEntryNode();
+  RFV.getCopyToRegs(Op, DAG, getCurDebugLoc(), Chain, 0);
+  PendingExports.push_back(Chain);
+}
+
+#include "llvm/CodeGen/SelectionDAGISel.h"
+
+void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
+  // If this is the entry block, emit arguments.
+  const Function &F = *LLVMBB->getParent();
+  SelectionDAG &DAG = SDB->DAG;
+  DebugLoc dl = SDB->getCurDebugLoc();
+  const TargetData *TD = TLI.getTargetData();
+  SmallVector<ISD::InputArg, 16> Ins;
+
+  // Check whether the function can return without sret-demotion.
+  SmallVector<ISD::OutputArg, 4> Outs;
+  GetReturnInfo(F.getReturnType(), F.getAttributes().getRetAttributes(),
+                Outs, TLI);
+
+  if (!FuncInfo->CanLowerReturn) {
+    // Put in an sret pointer parameter before all the other parameters.
+    SmallVector<EVT, 1> ValueVTs;
+    ComputeValueVTs(TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs);
+
+    // NOTE: Assuming that a pointer will never break down to more than one VT
+    // or one register.
+    ISD::ArgFlagsTy Flags;
+    Flags.setSRet();
+    EVT RegisterVT = TLI.getRegisterType(*DAG.getContext(), ValueVTs[0]);
+    ISD::InputArg RetArg(Flags, RegisterVT, true);
+    Ins.push_back(RetArg);
+  }
+
+  // Set up the incoming argument description vector.
+  unsigned Idx = 1;
+  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
+       I != E; ++I, ++Idx) {
+    SmallVector<EVT, 4> ValueVTs;
+    ComputeValueVTs(TLI, I->getType(), ValueVTs);
+    bool isArgValueUsed = !I->use_empty();
+    for (unsigned Value = 0, NumValues = ValueVTs.size();
+         Value != NumValues; ++Value) {
+      EVT VT = ValueVTs[Value];
+      const Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
+      ISD::ArgFlagsTy Flags;
+      unsigned OriginalAlignment =
+        TD->getABITypeAlignment(ArgTy);
+
+      if (F.paramHasAttr(Idx, Attribute::ZExt))
+        Flags.setZExt();
+      if (F.paramHasAttr(Idx, Attribute::SExt))
+        Flags.setSExt();
+      if (F.paramHasAttr(Idx, Attribute::InReg))
+        Flags.setInReg();
+      if (F.paramHasAttr(Idx, Attribute::StructRet))
+        Flags.setSRet();
+      if (F.paramHasAttr(Idx, Attribute::ByVal)) {
+        Flags.setByVal();
+        const PointerType *Ty = cast<PointerType>(I->getType());
+        const Type *ElementTy = Ty->getElementType();
+        unsigned FrameAlign = TLI.getByValTypeAlignment(ElementTy);
+        unsigned FrameSize  = TD->getTypeAllocSize(ElementTy);
+        // For ByVal, the alignment should be passed from the front end; the
+        // back end will guess if this info is missing, but there are cases it
+        // cannot get right.
+        if (F.getParamAlignment(Idx))
+          FrameAlign = F.getParamAlignment(Idx);
+        Flags.setByValAlign(FrameAlign);
+        Flags.setByValSize(FrameSize);
+      }
+      if (F.paramHasAttr(Idx, Attribute::Nest))
+        Flags.setNest();
+      Flags.setOrigAlign(OriginalAlignment);
+
+      EVT RegisterVT = TLI.getRegisterType(*CurDAG->getContext(), VT);
+      unsigned NumRegs = TLI.getNumRegisters(*CurDAG->getContext(), VT);
+      for (unsigned i = 0; i != NumRegs; ++i) {
+        ISD::InputArg MyFlags(Flags, RegisterVT, isArgValueUsed);
+        if (NumRegs > 1 && i == 0)
+          MyFlags.Flags.setSplit();
+        // If this isn't the first piece, the alignment must be 1.
+        else if (i > 0)
+          MyFlags.Flags.setOrigAlign(1);
+        Ins.push_back(MyFlags);
+      }
+    }
+  }
+
+  // Call the target to set up the argument values.
+  SmallVector<SDValue, 8> InVals;
+  SDValue NewRoot = TLI.LowerFormalArguments(DAG.getRoot(), F.getCallingConv(),
+                                             F.isVarArg(), Ins,
+                                             dl, DAG, InVals);
+
+  // Verify that the target's LowerFormalArguments behaved as expected.
+  assert(NewRoot.getNode() && NewRoot.getValueType() == MVT::Other &&
+         "LowerFormalArguments didn't return a valid chain!");
+  assert(InVals.size() == Ins.size() &&
+         "LowerFormalArguments didn't emit the correct number of values!");
+  DEBUG({
+      for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+        assert(InVals[i].getNode() &&
+               "LowerFormalArguments emitted a null value!");
+        assert(Ins[i].VT == InVals[i].getValueType() &&
+               "LowerFormalArguments emitted a value with the wrong type!");
+      }
+    });
+
+  // Update the DAG with the new chain value resulting from argument lowering.
+  DAG.setRoot(NewRoot);
+
+  // Set up the argument values.
+  unsigned i = 0;
+  Idx = 1;
+  if (!FuncInfo->CanLowerReturn) {
+    // Create a virtual register for the sret pointer, and put in a copy
+    // from the sret argument into it.
+    SmallVector<EVT, 1> ValueVTs;
+    ComputeValueVTs(TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs);
+    EVT VT = ValueVTs[0];
+    EVT RegVT = TLI.getRegisterType(*CurDAG->getContext(), VT);
+    ISD::NodeType AssertOp = ISD::DELETED_NODE;
+    SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1,
+                                        RegVT, VT, AssertOp);
+
+    MachineFunction& MF = SDB->DAG.getMachineFunction();
+    MachineRegisterInfo& RegInfo = MF.getRegInfo();
+    unsigned SRetReg = RegInfo.createVirtualRegister(TLI.getRegClassFor(RegVT));
+    FuncInfo->DemoteRegister = SRetReg;
+    NewRoot = SDB->DAG.getCopyToReg(NewRoot, SDB->getCurDebugLoc(),
+                                    SRetReg, ArgValue);
+    DAG.setRoot(NewRoot);
+
+    // i indexes lowered arguments.  Bump it past the hidden sret argument.
+    // Idx indexes LLVM arguments.  Don't touch it.
+    ++i;
+  }
+
+  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
+      ++I, ++Idx) {
+    SmallVector<SDValue, 4> ArgValues;
+    SmallVector<EVT, 4> ValueVTs;
+    ComputeValueVTs(TLI, I->getType(), ValueVTs);
+    unsigned NumValues = ValueVTs.size();
+
+    // If this argument is unused, remember its value anyway; it is used to
+    // generate debugging information.
+    if (I->use_empty() && NumValues)
+      SDB->setUnusedArgValue(I, InVals[i]);
+
+    for (unsigned Value = 0; Value != NumValues; ++Value) {
+      EVT VT = ValueVTs[Value];
+      EVT PartVT = TLI.getRegisterType(*CurDAG->getContext(), VT);
+      unsigned NumParts = TLI.getNumRegisters(*CurDAG->getContext(), VT);
+
+      if (!I->use_empty()) {
+        ISD::NodeType AssertOp = ISD::DELETED_NODE;
+        if (F.paramHasAttr(Idx, Attribute::SExt))
+          AssertOp = ISD::AssertSext;
+        else if (F.paramHasAttr(Idx, Attribute::ZExt))
+          AssertOp = ISD::AssertZext;
+
+        ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i],
+                                             NumParts, PartVT, VT,
+                                             AssertOp));
+      }
+
+      i += NumParts;
+    }
+
+    if (!I->use_empty()) {
+      SDValue Res;
+      if (!ArgValues.empty())
+        Res = DAG.getMergeValues(&ArgValues[0], NumValues,
+                                 SDB->getCurDebugLoc());
+      SDB->setValue(I, Res);
+
+      // If this argument is live outside of the entry block, insert a copy
+      // from wherever we got it to the vreg that other BBs will use to
+      // reference it.
+      SDB->CopyToExportRegsIfNeeded(I);
+    }
+  }
+
+  assert(i == InVals.size() && "Argument register count mismatch!");
+
+  // Finally, if the target has anything special to do, allow it to do so.
+  // FIXME: this should insert code into the DAG!
+  EmitFunctionEntryCode();
+}
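+// Illustrative note on the sret demotion above (assumption, not from the
+// source): when a function returns a struct too large for the target's return
+// registers, CanLowerReturn is false and a hidden pointer argument is
+// prepended, so the callee behaves as if declared with an sret parameter;
+// DemoteRegister records the vreg holding that pointer so visitRet can store
+// the return value through it.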
+
+/// Handle PHI nodes in successor blocks.  Emit code into the SelectionDAG to
+/// ensure constants are generated when needed.  Remember the virtual registers
+/// that need to be added to the Machine PHI nodes as input.  We cannot just
+/// directly add them, because expansion might result in multiple MBB's for one
+/// BB.  As such, the start of the BB might correspond to a different MBB than
+/// the end.
+///
+void
+SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
+  const TerminatorInst *TI = LLVMBB->getTerminator();
+
+  SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
+
+  // Check successor nodes' PHI nodes that expect a constant to be available
+  // from this block.
+  for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) {
+    const BasicBlock *SuccBB = TI->getSuccessor(succ);
+    if (!isa<PHINode>(SuccBB->begin())) continue;
+    MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB];
+
+    // If this terminator has multiple identical successors (common for
+    // switches), only handle each succ once.
+    if (!SuccsHandled.insert(SuccMBB)) continue;
+
+    MachineBasicBlock::iterator MBBI = SuccMBB->begin();
+
+    // At this point we know that there is a 1-1 correspondence between LLVM PHI
+    // nodes and Machine PHI nodes, but the incoming operands have not been
+    // emitted yet.
+    for (BasicBlock::const_iterator I = SuccBB->begin();
+         const PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+      // Ignore dead PHI nodes.
+      if (PN->use_empty()) continue;
+
+      unsigned Reg;
+      const Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB);
+
+      if (const Constant *C = dyn_cast<Constant>(PHIOp)) {
+        unsigned &RegOut = ConstantsOut[C];
+        if (RegOut == 0) {
+          RegOut = FuncInfo.CreateRegs(C->getType());
+          CopyValueToVirtualRegister(C, RegOut);
+        }
+        Reg = RegOut;
+      } else {
+        DenseMap<const Value *, unsigned>::iterator I =
+          FuncInfo.ValueMap.find(PHIOp);
+        if (I != FuncInfo.ValueMap.end())
+          Reg = I->second;
+        else {
+          assert(isa<AllocaInst>(PHIOp) &&
+                 FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(PHIOp)) &&
+                 "Didn't codegen value into a register!??");
+          Reg = FuncInfo.CreateRegs(PHIOp->getType());
+          CopyValueToVirtualRegister(PHIOp, Reg);
+        }
+      }
+
+      // Remember that this register needs to be added to the machine PHI
+      // node as the input for this MBB.
+      SmallVector<EVT, 4> ValueVTs;
+      ComputeValueVTs(TLI, PN->getType(), ValueVTs);
+      for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
+        EVT VT = ValueVTs[vti];
+        unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT);
+        for (unsigned i = 0, e = NumRegisters; i != e; ++i)
+          FuncInfo.PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg+i));
+        Reg += NumRegisters;
+      }
+    }
+  }
+  ConstantsOut.clear();
+}
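+// Sketch of the case handled above (hypothetical IR, for illustration):
+//   succ:
+//     %p = phi i32 [ 7, %entry ], [ %v, %other ]
+// The constant 7 must be materialized in %entry's MBB, so it is copied into a
+// fresh vreg via CopyValueToVirtualRegister and that vreg is recorded in
+// PHINodesToUpdate; ConstantsOut ensures 7 is materialized only once even if
+// several successor PHIs use it.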
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
new file mode 100644
index 0000000..6bd7eb3
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -0,0 +1,495 @@
+//===-- SelectionDAGBuilder.h - Selection-DAG building --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating from LLVM IR into SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SELECTIONDAGBUILDER_H
+#define SELECTIONDAGBUILDER_H
+
+#include "llvm/Constants.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#ifndef NDEBUG
+#include "llvm/ADT/SmallSet.h"
+#endif
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <vector>
+#include <set>
+
+namespace llvm {
+
+class AliasAnalysis;
+class AllocaInst;
+class BasicBlock;
+class BitCastInst;
+class BranchInst;
+class CallInst;
+class ExtractElementInst;
+class ExtractValueInst;
+class FCmpInst;
+class FPExtInst;
+class FPToSIInst;
+class FPToUIInst;
+class FPTruncInst;
+class Function;
+class FunctionLoweringInfo;
+class GetElementPtrInst;
+class ICmpInst;
+class IntToPtrInst;
+class IndirectBrInst;
+class InsertElementInst;
+class InsertValueInst;
+class Instruction;
+class LoadInst;
+class MachineBasicBlock;
+class MachineInstr;
+class MachineRegisterInfo;
+class MDNode;
+class PHINode;
+class PtrToIntInst;
+class ReturnInst;
+class SDISelAsmOperandInfo;
+class SDDbgValue;
+class SExtInst;
+class SelectInst;
+class ShuffleVectorInst;
+class SIToFPInst;
+class StoreInst;
+class SwitchInst;
+class TargetData;
+class TargetLowering;
+class TruncInst;
+class UIToFPInst;
+class UnreachableInst;
+class VAArgInst;
+class ZExtInst;
+
+//===----------------------------------------------------------------------===//
+/// SelectionDAGBuilder - This is the common target-independent lowering
+/// implementation that is parameterized by a TargetLowering object.
+///
+class SelectionDAGBuilder {
+  /// CurDebugLoc - current file + line number.  Changes as we build the DAG.
+  DebugLoc CurDebugLoc;
+
+  DenseMap<const Value*, SDValue> NodeMap;
+  
+  /// UnusedArgNodeMap - Maps argument values for unused arguments. This is
+  /// used to preserve debug information for incoming arguments.
+  DenseMap<const Value*, SDValue> UnusedArgNodeMap;
+
+public:
+  /// PendingLoads - Loads are not emitted to the program immediately.  We bunch
+  /// them up and then emit token factor nodes when possible.  This allows us to
+  /// get simple disambiguation between loads without worrying about alias
+  /// analysis.
+  SmallVector<SDValue, 8> PendingLoads;
+private:
+
+  /// PendingExports - CopyToReg nodes that copy values to virtual registers
+  /// for export to other blocks need to be emitted before any terminator
+  /// instruction, but they have no other ordering requirements. We bunch them
+  /// up and then emit a single TokenFactor for them just before terminator
+  /// instructions.
+  SmallVector<SDValue, 8> PendingExports;
+
+  /// SDNodeOrder - A unique monotonically increasing number used to order the
+  /// SDNodes we create.
+  unsigned SDNodeOrder;
+
+  /// Case - A struct to record the Value for a switch case, and the
+  /// case's target basic block.
+  struct Case {
+    Constant* Low;
+    Constant* High;
+    MachineBasicBlock* BB;
+
+    Case() : Low(0), High(0), BB(0) { }
+    Case(Constant* low, Constant* high, MachineBasicBlock* bb) :
+      Low(low), High(high), BB(bb) { }
+    APInt size() const {
+      const APInt &rHigh = cast<ConstantInt>(High)->getValue();
+      const APInt &rLow  = cast<ConstantInt>(Low)->getValue();
+      return (rHigh - rLow + 1ULL);
+    }
+  };
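+  // Example (illustrative): a case range covering the values 2..5 is stored
+  // as Case(ConstantInt(2), ConstantInt(5), BB), and size() returns the APInt
+  // value 4, i.e. High - Low + 1 evaluated in the case value's bit width.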
+
+  struct CaseBits {
+    uint64_t Mask;
+    MachineBasicBlock* BB;
+    unsigned Bits;
+
+    CaseBits(uint64_t mask, MachineBasicBlock* bb, unsigned bits):
+      Mask(mask), BB(bb), Bits(bits) { }
+  };
+
+  typedef std::vector<Case>           CaseVector;
+  typedef std::vector<CaseBits>       CaseBitsVector;
+  typedef CaseVector::iterator        CaseItr;
+  typedef std::pair<CaseItr, CaseItr> CaseRange;
+
+  /// CaseRec - A struct with ctor used in lowering switches to a binary tree
+  /// of conditional branches.
+  struct CaseRec {
+    CaseRec(MachineBasicBlock *bb, const Constant *lt, const Constant *ge,
+            CaseRange r) :
+    CaseBB(bb), LT(lt), GE(ge), Range(r) {}
+
+    /// CaseBB - The MBB in which to emit the compare and branch
+    MachineBasicBlock *CaseBB;
+    /// LT, GE - If nonzero, we know the current case value must be less-than or
+    /// greater-than-or-equal-to these Constants.
+    const Constant *LT;
+    const Constant *GE;
+    /// Range - A pair of iterators representing the range of case values to be
+    /// processed at this point in the binary search tree.
+    CaseRange Range;
+  };
+
+  typedef std::vector<CaseRec> CaseRecVector;
+
+  /// The comparison function for sorting the switch case values in the vector.
+  /// WARNING: Case ranges should be disjoint!
+  struct CaseCmp {
+    bool operator()(const Case &C1, const Case &C2) {
+      assert(isa<ConstantInt>(C1.Low) && isa<ConstantInt>(C2.High));
+      const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+      const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+      return CI1->getValue().slt(CI2->getValue());
+    }
+  };
+
+  struct CaseBitsCmp {
+    bool operator()(const CaseBits &C1, const CaseBits &C2) {
+      return C1.Bits > C2.Bits;
+    }
+  };
+
+  size_t Clusterify(CaseVector &Cases, const SwitchInst &SI);
+
+  /// CaseBlock - This structure is used to communicate between
+  /// SelectionDAGBuilder and SDISel for the code generation of additional basic
+  /// blocks needed by multi-case switch statements.
+  struct CaseBlock {
+    CaseBlock(ISD::CondCode cc, const Value *cmplhs, const Value *cmprhs,
+              const Value *cmpmiddle,
+              MachineBasicBlock *truebb, MachineBasicBlock *falsebb,
+              MachineBasicBlock *me)
+      : CC(cc), CmpLHS(cmplhs), CmpMHS(cmpmiddle), CmpRHS(cmprhs),
+        TrueBB(truebb), FalseBB(falsebb), ThisBB(me) {}
+    // CC - the condition code to use for the case block's setcc node
+    ISD::CondCode CC;
+    // CmpLHS/CmpRHS/CmpMHS - The LHS/MHS/RHS of the comparison to emit.
+    // Emit by default LHS op RHS. MHS is used for range comparisons:
+    // If MHS is not null: (LHS <= MHS) and (MHS <= RHS).
+    const Value *CmpLHS, *CmpMHS, *CmpRHS;
+    // TrueBB/FalseBB - the block to branch to if the setcc is true/false.
+    MachineBasicBlock *TrueBB, *FalseBB;
+    // ThisBB - the block into which to emit the code for the setcc and branches
+    MachineBasicBlock *ThisBB;
+  };
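+  // Illustrative use (assumption): lowering "case 3..7" as a range test
+  // builds a CaseBlock with CmpLHS = 3, CmpMHS = the switch value and
+  // CmpRHS = 7, branching to TrueBB when (3 <= value) && (value <= 7).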
+  struct JumpTable {
+    JumpTable(unsigned R, unsigned J, MachineBasicBlock *M,
+              MachineBasicBlock *D): Reg(R), JTI(J), MBB(M), Default(D) {}
+  
+    /// Reg - the virtual register containing the index of the jump table entry
+    /// to jump to.
+    unsigned Reg;
+    /// JTI - the JumpTableIndex for this jump table in the function.
+    unsigned JTI;
+    /// MBB - the MBB into which to emit the code for the indirect jump.
+    MachineBasicBlock *MBB;
+    /// Default - the MBB of the default bb, which is a successor of the range
+    /// check MBB.  This is used when updating PHI nodes in successors.
+    MachineBasicBlock *Default;
+  };
+  struct JumpTableHeader {
+    JumpTableHeader(APInt F, APInt L, const Value *SV, MachineBasicBlock *H,
+                    bool E = false):
+      First(F), Last(L), SValue(SV), HeaderBB(H), Emitted(E) {}
+    APInt First;
+    APInt Last;
+    const Value *SValue;
+    MachineBasicBlock *HeaderBB;
+    bool Emitted;
+  };
+  typedef std::pair<JumpTableHeader, JumpTable> JumpTableBlock;
+
+  struct BitTestCase {
+    BitTestCase(uint64_t M, MachineBasicBlock* T, MachineBasicBlock* Tr):
+      Mask(M), ThisBB(T), TargetBB(Tr) { }
+    uint64_t Mask;
+    MachineBasicBlock *ThisBB;
+    MachineBasicBlock *TargetBB;
+  };
+
+  typedef SmallVector<BitTestCase, 3> BitTestInfo;
+
+  struct BitTestBlock {
+    BitTestBlock(APInt F, APInt R, const Value* SV,
+                 unsigned Rg, bool E,
+                 MachineBasicBlock* P, MachineBasicBlock* D,
+                 const BitTestInfo& C):
+      First(F), Range(R), SValue(SV), Reg(Rg), Emitted(E),
+      Parent(P), Default(D), Cases(C) { }
+    APInt First;
+    APInt Range;
+    const Value *SValue;
+    unsigned Reg;
+    bool Emitted;
+    MachineBasicBlock *Parent;
+    MachineBasicBlock *Default;
+    BitTestInfo Cases;
+  };
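+  // Example (illustrative): switch cases 1, 3 and 5 sharing one target become
+  // a single BitTestCase with Mask = (1<<1)|(1<<3)|(1<<5) = 0x2A; the header
+  // block range-checks the switch value, and one AND against the mask then
+  // replaces three separate compares.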
+
+public:
+  // TLI - This is information that describes the available target features we
+  // need for lowering.  This indicates when operations are unavailable,
+  // implemented with a libcall, etc.
+  const TargetMachine &TM;
+  const TargetLowering &TLI;
+  SelectionDAG &DAG;
+  const TargetData *TD;
+  AliasAnalysis *AA;
+
+  /// SwitchCases - Vector of CaseBlock structures used to communicate
+  /// SwitchInst code generation information.
+  std::vector<CaseBlock> SwitchCases;
+  /// JTCases - Vector of JumpTable structures used to communicate
+  /// SwitchInst code generation information.
+  std::vector<JumpTableBlock> JTCases;
+  /// BitTestCases - Vector of BitTestBlock structures used to communicate
+  /// SwitchInst code generation information.
+  std::vector<BitTestBlock> BitTestCases;
+
+  // Emit PHI-node-operand constants only once even if used by multiple
+  // PHI nodes.
+  DenseMap<const Constant *, unsigned> ConstantsOut;
+
+  /// FuncInfo - Information about the function as a whole.
+  ///
+  FunctionLoweringInfo &FuncInfo;
+
+  /// OptLevel - What optimization level we're generating code for.
+  /// 
+  CodeGenOpt::Level OptLevel;
+
+  /// HasTailCall - This is set to true if a call in the current
+  /// block has been translated as a tail call. In this case,
+  /// no subsequent DAG nodes should be created.
+  ///
+  bool HasTailCall;
+
+  LLVMContext *Context;
+
+  SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo,
+                      CodeGenOpt::Level ol)
+    : SDNodeOrder(0), TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()),
+      DAG(dag), FuncInfo(funcinfo), OptLevel(ol),
+      HasTailCall(false), Context(dag.getContext()) {
+  }
+
+  void init(AliasAnalysis &aa);
+
+  /// clear - Clear out the current SelectionDAG and the associated
+  /// state and prepare this SelectionDAGBuilder object to be used
+  /// for a new block. This doesn't clear out information about
+  /// additional blocks that are needed to complete switch lowering
+  /// or PHI node updating; that information is cleared out as it is
+  /// consumed.
+  void clear();
+
+  /// getRoot - Return the current virtual root of the Selection DAG,
+  /// flushing any PendingLoad items. This must be done before emitting
+  /// a store or any other node that may need to be ordered after any
+  /// prior load instructions.
+  ///
+  SDValue getRoot();
+
+  /// getControlRoot - Similar to getRoot, but instead of flushing all the
+  /// PendingLoad items, flush all the PendingExports items. It is necessary
+  /// to do this before emitting a terminator instruction.
+  ///
+  SDValue getControlRoot();
+
+  DebugLoc getCurDebugLoc() const { return CurDebugLoc; }
+
+  unsigned getSDNodeOrder() const { return SDNodeOrder; }
+
+  void CopyValueToVirtualRegister(const Value *V, unsigned Reg);
+
+  /// AssignOrderingToNode - Assign an ordering to the node. The ordering is
+  /// taken from how the code appeared in the source, and is used by the
+  /// scheduler to effectively turn off scheduling.
+  void AssignOrderingToNode(const SDNode *Node);
+
+  void visit(const Instruction &I);
+
+  void visit(unsigned Opcode, const User &I);
+
+  SDValue getValue(const Value *V);
+  SDValue getNonRegisterValue(const Value *V);
+  SDValue getValueImpl(const Value *V);
+
+  void setValue(const Value *V, SDValue NewN) {
+    SDValue &N = NodeMap[V];
+    assert(N.getNode() == 0 && "Already set a value for this node!");
+    N = NewN;
+  }
+  
+  void setUnusedArgValue(const Value *V, SDValue NewN) {
+    SDValue &N = UnusedArgNodeMap[V];
+    assert(N.getNode() == 0 && "Already set a value for this node!");
+    N = NewN;
+  }
+  
+  void GetRegistersForValue(SDISelAsmOperandInfo &OpInfo,
+                            std::set<unsigned> &OutputRegs, 
+                            std::set<unsigned> &InputRegs);
+
+  void FindMergedConditions(const Value *Cond, MachineBasicBlock *TBB,
+                            MachineBasicBlock *FBB, MachineBasicBlock *CurBB,
+                            MachineBasicBlock *SwitchBB, unsigned Opc);
+  void EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB,
+                                    MachineBasicBlock *FBB,
+                                    MachineBasicBlock *CurBB,
+                                    MachineBasicBlock *SwitchBB);
+  bool ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases);
+  bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB);
+  void CopyToExportRegsIfNeeded(const Value *V);
+  void ExportFromCurrentBlock(const Value *V);
+  void LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool IsTailCall);
+
+private:
+  // Terminator instructions.
+  void visitRet(const ReturnInst &I);
+  void visitBr(const BranchInst &I);
+  void visitSwitch(const SwitchInst &I);
+  void visitIndirectBr(const IndirectBrInst &I);
+  void visitUnreachable(const UnreachableInst &I) { /* noop */ }
+
+  // Helpers for visitSwitch
+  bool handleSmallSwitchRange(CaseRec& CR,
+                              CaseRecVector& WorkList,
+                              const Value* SV,
+                              MachineBasicBlock* Default,
+                              MachineBasicBlock *SwitchBB);
+  bool handleJTSwitchCase(CaseRec& CR,
+                          CaseRecVector& WorkList,
+                          const Value* SV,
+                          MachineBasicBlock* Default,
+                          MachineBasicBlock *SwitchBB);
+  bool handleBTSplitSwitchCase(CaseRec& CR,
+                               CaseRecVector& WorkList,
+                               const Value* SV,
+                               MachineBasicBlock* Default,
+                               MachineBasicBlock *SwitchBB);
+  bool handleBitTestsSwitchCase(CaseRec& CR,
+                                CaseRecVector& WorkList,
+                                const Value* SV,
+                                MachineBasicBlock* Default,
+                                MachineBasicBlock *SwitchBB);
+public:
+  void visitSwitchCase(CaseBlock &CB,
+                       MachineBasicBlock *SwitchBB);
+  void visitBitTestHeader(BitTestBlock &B, MachineBasicBlock *SwitchBB);
+  void visitBitTestCase(MachineBasicBlock* NextMBB,
+                        unsigned Reg,
+                        BitTestCase &B,
+                        MachineBasicBlock *SwitchBB);
+  void visitJumpTable(JumpTable &JT);
+  void visitJumpTableHeader(JumpTable &JT, JumpTableHeader &JTH,
+                            MachineBasicBlock *SwitchBB);
+  
+private:
+  void visitBinary(const User &I, unsigned OpCode);
+  void visitShift(const User &I, unsigned Opcode);
+  void visitAdd(const User &I)  { visitBinary(I, ISD::ADD); }
+  void visitFAdd(const User &I) { visitBinary(I, ISD::FADD); }
+  void visitSub(const User &I)  { visitBinary(I, ISD::SUB); }
+  void visitFSub(const User &I);
+  void visitMul(const User &I)  { visitBinary(I, ISD::MUL); }
+  void visitFMul(const User &I) { visitBinary(I, ISD::FMUL); }
+  void visitURem(const User &I) { visitBinary(I, ISD::UREM); }
+  void visitSRem(const User &I) { visitBinary(I, ISD::SREM); }
+  void visitFRem(const User &I) { visitBinary(I, ISD::FREM); }
+  void visitUDiv(const User &I) { visitBinary(I, ISD::UDIV); }
+  void visitSDiv(const User &I) { visitBinary(I, ISD::SDIV); }
+  void visitFDiv(const User &I) { visitBinary(I, ISD::FDIV); }
+  void visitAnd (const User &I) { visitBinary(I, ISD::AND); }
+  void visitOr  (const User &I) { visitBinary(I, ISD::OR); }
+  void visitXor (const User &I) { visitBinary(I, ISD::XOR); }
+  void visitShl (const User &I) { visitShift(I, ISD::SHL); }
+  void visitLShr(const User &I) { visitShift(I, ISD::SRL); }
+  void visitAShr(const User &I) { visitShift(I, ISD::SRA); }
+  void visitICmp(const User &I);
+  void visitFCmp(const User &I);
+  // Visit the conversion instructions
+  void visitTrunc(const User &I);
+  void visitZExt(const User &I);
+  void visitSExt(const User &I);
+  void visitFPTrunc(const User &I);
+  void visitFPExt(const User &I);
+  void visitFPToUI(const User &I);
+  void visitFPToSI(const User &I);
+  void visitUIToFP(const User &I);
+  void visitSIToFP(const User &I);
+  void visitPtrToInt(const User &I);
+  void visitIntToPtr(const User &I);
+  void visitBitCast(const User &I);
+
+  void visitExtractElement(const User &I);
+  void visitInsertElement(const User &I);
+  void visitShuffleVector(const User &I);
+
+  void visitExtractValue(const ExtractValueInst &I);
+  void visitInsertValue(const InsertValueInst &I);
+
+  void visitGetElementPtr(const User &I);
+  void visitSelect(const User &I);
+
+  void visitAlloca(const AllocaInst &I);
+  void visitLoad(const LoadInst &I);
+  void visitStore(const StoreInst &I);
+  void visitPHI(const PHINode &I);
+  void visitCall(const CallInst &I);
+  bool visitMemCmpCall(const CallInst &I);
+  
+  const char *visitIntrinsicCall(const CallInst &I, unsigned Intrinsic);
+  void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
+
+  void visitPow(const CallInst &I);
+  void visitExp2(const CallInst &I);
+  void visitExp(const CallInst &I);
+  void visitLog(const CallInst &I);
+  void visitLog2(const CallInst &I);
+  void visitLog10(const CallInst &I);
+
+  void visitUserOp1(const Instruction &I) {
+    llvm_unreachable("UserOp1 should not exist at instruction selection time!");
+  }
+  void visitUserOp2(const Instruction &I) {
+    llvm_unreachable("UserOp2 should not exist at instruction selection time!");
+  }
+  
+  const char *implVisitBinaryAtomic(const CallInst& I, ISD::NodeType Op);
+  const char *implVisitAluOverflow(const CallInst &I, ISD::NodeType Op);
+
+  void HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
new file mode 100644
index 0000000..4f93fd2
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -0,0 +1,2417 @@
+//===-- SelectionDAGISel.cpp - Implement the SelectionDAGISel class -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAGISel class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "isel"
+#include "ScheduleDAGSDNodes.h"
+#include "SelectionDAGBuilder.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumFastIselFailures, "Number of instructions fast isel failed on");
+STATISTIC(NumDAGIselRetries, "Number of times dag isel has to try another path");
+
+static cl::opt<bool>
+EnableFastISelVerbose("fast-isel-verbose", cl::Hidden,
+          cl::desc("Enable verbose messages in the \"fast\" "
+                   "instruction selector"));
+static cl::opt<bool>
+EnableFastISelAbort("fast-isel-abort", cl::Hidden,
+          cl::desc("Enable abort calls when \"fast\" instruction fails"));
+
+#ifndef NDEBUG
+static cl::opt<bool>
+ViewDAGCombine1("view-dag-combine1-dags", cl::Hidden,
+          cl::desc("Pop up a window to show dags before the first "
+                   "dag combine pass"));
+static cl::opt<bool>
+ViewLegalizeTypesDAGs("view-legalize-types-dags", cl::Hidden,
+          cl::desc("Pop up a window to show dags before legalize types"));
+static cl::opt<bool>
+ViewLegalizeDAGs("view-legalize-dags", cl::Hidden,
+          cl::desc("Pop up a window to show dags before legalize"));
+static cl::opt<bool>
+ViewDAGCombine2("view-dag-combine2-dags", cl::Hidden,
+          cl::desc("Pop up a window to show dags before the second "
+                   "dag combine pass"));
+static cl::opt<bool>
+ViewDAGCombineLT("view-dag-combine-lt-dags", cl::Hidden,
+          cl::desc("Pop up a window to show dags before the post legalize types"
+                   " dag combine pass"));
+static cl::opt<bool>
+ViewISelDAGs("view-isel-dags", cl::Hidden,
+          cl::desc("Pop up a window to show isel dags as they are selected"));
+static cl::opt<bool>
+ViewSchedDAGs("view-sched-dags", cl::Hidden,
+          cl::desc("Pop up a window to show sched dags as they are processed"));
+static cl::opt<bool>
+ViewSUnitDAGs("view-sunit-dags", cl::Hidden,
+      cl::desc("Pop up a window to show SUnit dags after they are processed"));
+#else
+static const bool ViewDAGCombine1 = false,
+                  ViewLegalizeTypesDAGs = false, ViewLegalizeDAGs = false,
+                  ViewDAGCombine2 = false,
+                  ViewDAGCombineLT = false,
+                  ViewISelDAGs = false, ViewSchedDAGs = false,
+                  ViewSUnitDAGs = false;
+#endif
+
+//===---------------------------------------------------------------------===//
+///
+/// RegisterScheduler class - Track the registration of instruction schedulers.
+///
+//===---------------------------------------------------------------------===//
+MachinePassRegistry RegisterScheduler::Registry;
+
+//===---------------------------------------------------------------------===//
+///
+/// ISHeuristic command line option for instruction schedulers.
+///
+//===---------------------------------------------------------------------===//
+static cl::opt<RegisterScheduler::FunctionPassCtor, false,
+               RegisterPassParser<RegisterScheduler> >
+ISHeuristic("pre-RA-sched",
+            cl::init(&createDefaultScheduler),
+            cl::desc("Instruction schedulers available (before register"
+                     " allocation):"));
+
+static RegisterScheduler
+defaultListDAGScheduler("default", "Best scheduler for the target",
+                        createDefaultScheduler);
+
+namespace llvm {
+  //===--------------------------------------------------------------------===//
+  /// createDefaultScheduler - This creates an instruction scheduler appropriate
+  /// for the target.
+  ScheduleDAGSDNodes* createDefaultScheduler(SelectionDAGISel *IS,
+                                             CodeGenOpt::Level OptLevel) {
+    const TargetLowering &TLI = IS->getTargetLowering();
+
+    if (OptLevel == CodeGenOpt::None)
+      return createSourceListDAGScheduler(IS, OptLevel);
+    if (TLI.getSchedulingPreference() == Sched::Latency)
+      return createTDListDAGScheduler(IS, OptLevel);
+    if (TLI.getSchedulingPreference() == Sched::RegPressure)
+      return createBURRListDAGScheduler(IS, OptLevel);
+    if (TLI.getSchedulingPreference() == Sched::Hybrid)
+      return createHybridListDAGScheduler(IS, OptLevel);
+    assert(TLI.getSchedulingPreference() == Sched::ILP &&
+           "Unknown sched type!");
+    return createILPListDAGScheduler(IS, OptLevel);
+  }
+}
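+// For example (illustrative), a target whose TargetLowering constructor calls
+// setSchedulingPreference(Sched::RegPressure) gets the bottom-up
+// register-reduction list scheduler here; passing -pre-RA-sched=<name> on the
+// command line overrides this default through the ISHeuristic option above.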
+
+// EmitInstrWithCustomInserter - This method should be implemented by targets
+// that mark instructions with the 'usesCustomInserter' flag.  These
+// instructions are special in various ways, which require special support to
+// insert.  The specified MachineInstr is created but not inserted into any
+// basic blocks, and this method is called to expand it into a sequence of
+// instructions, potentially also creating new basic blocks and control flow.
+// When new basic blocks are inserted and the edges from MBB to its successors
+// are modified, the method should insert pairs of <OldSucc, NewSucc> into the
+// DenseMap.
+MachineBasicBlock *
+TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                            MachineBasicBlock *MBB) const {
+#ifndef NDEBUG
+  dbgs() << "If a target marks an instruction with "
+          "'usesCustomInserter', it must implement "
+          "TargetLowering::EmitInstrWithCustomInserter!";
+#endif
+  llvm_unreachable(0);
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// SelectionDAGISel code
+//===----------------------------------------------------------------------===//
+
+SelectionDAGISel::SelectionDAGISel(const TargetMachine &tm, CodeGenOpt::Level OL) :
+  MachineFunctionPass(ID), TM(tm), TLI(*tm.getTargetLowering()),
+  FuncInfo(new FunctionLoweringInfo(TLI)),
+  CurDAG(new SelectionDAG(tm)),
+  SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)),
+  OptLevel(OL),
+  DAGSize(0)
+{}
+
+SelectionDAGISel::~SelectionDAGISel() {
+  delete SDB;
+  delete CurDAG;
+  delete FuncInfo;
+}
+
+void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<AliasAnalysis>();
+  AU.addPreserved<AliasAnalysis>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// FunctionCallsSetJmp - Return true if the function has a call to setjmp or
+/// another function that gcc recognizes as "returning twice". This is used to
+/// limit code-gen optimizations on the machine function.
+///
+/// FIXME: Remove after <rdar://problem/8031714> is fixed.
+static bool FunctionCallsSetJmp(const Function *F) {
+  const Module *M = F->getParent();
+  static const char *ReturnsTwiceFns[] = {
+    "setjmp",
+    "sigsetjmp",
+    "setjmp_syscall",
+    "savectx",
+    "qsetjmp",
+    "vfork",
+    "getcontext"
+  };
+#define NUM_RETURNS_TWICE_FNS (sizeof(ReturnsTwiceFns) / sizeof(const char *))
+
+  for (unsigned I = 0; I < NUM_RETURNS_TWICE_FNS; ++I)
+    if (const Function *Callee = M->getFunction(ReturnsTwiceFns[I])) {
+      if (!Callee->use_empty())
+        for (Value::const_use_iterator
+               I = Callee->use_begin(), E = Callee->use_end();
+             I != E; ++I)
+          if (const CallInst *CI = dyn_cast<CallInst>(*I))
+            if (CI->getParent()->getParent() == F)
+              return true;
+    }
+
+  return false;
+#undef NUM_RETURNS_TWICE_FNS
+}
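+// Illustrative consequence (assumption): a function containing
+//   if (setjmp(buf)) ...
+// is flagged by the scan above, and setCallsSetJmp(true) below then disables
+// machine-level optimizations that would be unsound for a call that can
+// return twice.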
+
+bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
+  const Function &Fn = *mf.getFunction();
+  const TargetInstrInfo &TII = *TM.getInstrInfo();
+  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+
+  MF = &mf;
+  RegInfo = &MF->getRegInfo();
+  AA = &getAnalysis<AliasAnalysis>();
+
+  DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
+
+  CurDAG->init(*MF);
+  FuncInfo->set(Fn, *MF);
+  SDB->init(*AA);
+
+  SelectAllBasicBlocks(Fn);
+
+  // If the first basic block in the function has live ins that need to be
+  // copied into vregs, emit the copies into the top of the block before
+  // emitting the code for the block.
+  MachineBasicBlock *EntryMBB = MF->begin();
+  RegInfo->EmitLiveInCopies(EntryMBB, TRI, TII);
+
+  DenseMap<unsigned, unsigned> LiveInMap;
+  if (!FuncInfo->ArgDbgValues.empty())
+    for (MachineRegisterInfo::livein_iterator LI = RegInfo->livein_begin(),
+           E = RegInfo->livein_end(); LI != E; ++LI)
+      if (LI->second) 
+        LiveInMap.insert(std::make_pair(LI->first, LI->second));
+
+  // Insert DBG_VALUE instructions for function arguments to the entry block.
+  for (unsigned i = 0, e = FuncInfo->ArgDbgValues.size(); i != e; ++i) {
+    MachineInstr *MI = FuncInfo->ArgDbgValues[e-i-1];
+    unsigned Reg = MI->getOperand(0).getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(Reg))
+      EntryMBB->insert(EntryMBB->begin(), MI);
+    else {
+      MachineInstr *Def = RegInfo->getVRegDef(Reg);
+      MachineBasicBlock::iterator InsertPos = Def;
+      // FIXME: VR def may not be in entry block.
+      Def->getParent()->insert(llvm::next(InsertPos), MI);
+    }
+
+    // If Reg is live-in then update debug info to track its copy in a vreg.
+    DenseMap<unsigned, unsigned>::iterator LDI = LiveInMap.find(Reg);
+    if (LDI != LiveInMap.end()) {
+      MachineInstr *Def = RegInfo->getVRegDef(LDI->second);
+      MachineBasicBlock::iterator InsertPos = Def;
+      const MDNode *Variable = 
+        MI->getOperand(MI->getNumOperands()-1).getMetadata();
+      unsigned Offset = MI->getOperand(1).getImm();
+      // Def is never a terminator here, so it is ok to increment InsertPos.
+      BuildMI(*EntryMBB, ++InsertPos, MI->getDebugLoc(), 
+              TII.get(TargetOpcode::DBG_VALUE))
+        .addReg(LDI->second, RegState::Debug)
+        .addImm(Offset).addMetadata(Variable);
+    }
+  }
+
+  // Determine if there are any calls in this machine function.
+  MachineFrameInfo *MFI = MF->getFrameInfo();
+  if (!MFI->hasCalls()) {
+    for (MachineFunction::const_iterator
+           I = MF->begin(), E = MF->end(); I != E; ++I) {
+      const MachineBasicBlock *MBB = I;
+      for (MachineBasicBlock::const_iterator
+             II = MBB->begin(), IE = MBB->end(); II != IE; ++II) {
+        const TargetInstrDesc &TID = TM.getInstrInfo()->get(II->getOpcode());
+
+        // A call that is not also a return means this function makes calls
+        // and therefore needs a call frame.
+        if (TID.isCall() && !TID.isReturn()) {
+          MFI->setHasCalls(true);
+          goto done;
+        }
+      }
+    }
+  done:;
+  }
+
+  // Determine if there is a call to setjmp in the machine function.
+  MF->setCallsSetJmp(FunctionCallsSetJmp(&Fn));
+
+  // Replace forward-declared registers with the registers containing
+  // the desired value.
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  for (DenseMap<unsigned, unsigned>::iterator
+       I = FuncInfo->RegFixups.begin(), E = FuncInfo->RegFixups.end();
+       I != E; ++I) {
+    unsigned From = I->first;
+    unsigned To = I->second;
+    // If To is also scheduled to be replaced, find what its ultimate
+    // replacement is.
+    for (;;) {
+      DenseMap<unsigned, unsigned>::iterator J =
+        FuncInfo->RegFixups.find(To);
+      if (J == E) break;
+      To = J->second;
+    }
+    // Replace it.
+    MRI.replaceRegWith(From, To);
+  }
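+  // Example (illustrative): with RegFixups = {v1 -> v2, v2 -> v3}, the inner
+  // loop chases v2's own fixup first, so uses of v1 are rewritten directly
+  // to v3 in a single replaceRegWith call.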
+
+  // Release function-specific state. SDB and CurDAG are already cleared
+  // at this point.
+  FuncInfo->clear();
+
+  return true;
+}
+
+void
+SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
+                                   BasicBlock::const_iterator End,
+                                   bool &HadTailCall) {
+  // Lower all of the non-terminator instructions. If a call is emitted
+  // as a tail call, cease emitting nodes for this block. Terminators
+  // are handled below.
+  for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I)
+    SDB->visit(*I);
+
+  // Make sure the root of the DAG is up-to-date.
+  CurDAG->setRoot(SDB->getControlRoot());
+  HadTailCall = SDB->HasTailCall;
+  SDB->clear();
+
+  // Final step, emit the lowered DAG as machine code.
+  CodeGenAndEmitDAG();
+}
+
+namespace {
+/// SDOPsWorkListRemover - This class is a DAGUpdateListener that removes any
+/// deleted nodes from the worklist.
+class SDOPsWorkListRemover : public SelectionDAG::DAGUpdateListener {
+  SmallVector<SDNode*, 128> &Worklist;
+  SmallPtrSet<SDNode*, 128> &InWorklist;
+public:
+  SDOPsWorkListRemover(SmallVector<SDNode*, 128> &wl,
+                       SmallPtrSet<SDNode*, 128> &inwl)
+    : Worklist(wl), InWorklist(inwl) {}
+
+  void RemoveFromWorklist(SDNode *N) {
+    if (!InWorklist.erase(N)) return;
+    
+    SmallVector<SDNode*, 128>::iterator I =
+    std::find(Worklist.begin(), Worklist.end(), N);
+    assert(I != Worklist.end() && "Not in worklist");
+    
+    *I = Worklist.back();
+    Worklist.pop_back();
+  }
+  
+  virtual void NodeDeleted(SDNode *N, SDNode *E) {
+    RemoveFromWorklist(N);
+  }
+
+  virtual void NodeUpdated(SDNode *N) {
+    // Ignore updates.
+  }
+};
+}
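+// Usage sketch (hypothetical): code that owns an SDNode worklist installs the
+// listener so DAG mutations cannot leave dangling worklist entries:
+//   SDOPsWorkListRemover Remover(Worklist, InWorklist);
+//   CurDAG->RemoveDeadNode(N, &Remover); // NodeDeleted fires; N is dropped.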
+
+void SelectionDAGISel::ComputeLiveOutVRegInfo() {
+  SmallPtrSet<SDNode*, 128> VisitedNodes;
+  SmallVector<SDNode*, 128> Worklist;
+
+  Worklist.push_back(CurDAG->getRoot().getNode());
+
+  APInt Mask;
+  APInt KnownZero;
+  APInt KnownOne;
+
+  do {
+    SDNode *N = Worklist.pop_back_val();
+
+    // If we've already seen this node, ignore it.
+    if (!VisitedNodes.insert(N))
+      continue;
+
+    // Otherwise, add all chain operands to the worklist.
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+      if (N->getOperand(i).getValueType() == MVT::Other)
+        Worklist.push_back(N->getOperand(i).getNode());
+
+    // If this is a CopyToReg with a vreg dest, process it.
+    if (N->getOpcode() != ISD::CopyToReg)
+      continue;
+
+    unsigned DestReg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(DestReg))
+      continue;
+
+    // Ignore non-scalar or non-integer values.
+    SDValue Src = N->getOperand(2);
+    EVT SrcVT = Src.getValueType();
+    if (!SrcVT.isInteger() || SrcVT.isVector())
+      continue;
+
+    unsigned NumSignBits = CurDAG->ComputeNumSignBits(Src);
+    Mask = APInt::getAllOnesValue(SrcVT.getSizeInBits());
+    CurDAG->ComputeMaskedBits(Src, Mask, KnownZero, KnownOne);
+
+    // Only install this information if it tells us something.
+    if (NumSignBits != 1 || KnownZero != 0 || KnownOne != 0) {
+      DestReg -= TargetRegisterInfo::FirstVirtualRegister;
+      if (DestReg >= FuncInfo->LiveOutRegInfo.size())
+        FuncInfo->LiveOutRegInfo.resize(DestReg+1);
+      FunctionLoweringInfo::LiveOutInfo &LOI =
+        FuncInfo->LiveOutRegInfo[DestReg];
+      LOI.NumSignBits = NumSignBits;
+      LOI.KnownOne = KnownOne;
+      LOI.KnownZero = KnownZero;
+    }
+  } while (!Worklist.empty());
+}
+
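+/// CodeGenAndEmitDAG - Run the full back-end pipeline on the current DAG:
+/// combine, legalize types, legalize vectors, legalize operations, combine
+/// again, select instructions, schedule them, and emit the machine code into
+/// the current MachineBasicBlock.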
+void SelectionDAGISel::CodeGenAndEmitDAG() {
+  std::string GroupName;
+  if (TimePassesIsEnabled)
+    GroupName = "Instruction Selection and Scheduling";
+  std::string BlockName;
+  if (ViewDAGCombine1 || ViewLegalizeTypesDAGs || ViewLegalizeDAGs ||
+      ViewDAGCombine2 || ViewDAGCombineLT || ViewISelDAGs || ViewSchedDAGs ||
+      ViewSUnitDAGs)
+    BlockName = MF->getFunction()->getNameStr() + ":" +
+                FuncInfo->MBB->getBasicBlock()->getNameStr();
+
+  DEBUG(dbgs() << "Initial selection DAG:\n"; CurDAG->dump());
+
+  if (ViewDAGCombine1) CurDAG->viewGraph("dag-combine1 input for " + BlockName);
+
+  // Run the DAG combiner in pre-legalize mode.
+  {
+    NamedRegionTimer T("DAG Combining 1", GroupName, TimePassesIsEnabled);
+    CurDAG->Combine(Unrestricted, *AA, OptLevel);
+  }
+
+  DEBUG(dbgs() << "Optimized lowered selection DAG:\n"; CurDAG->dump());
+
+  // Second step, hack on the DAG until it only uses operations and types that
+  // the target supports.
+  if (ViewLegalizeTypesDAGs) CurDAG->viewGraph("legalize-types input for " +
+                                               BlockName);
+
+  bool Changed;
+  {
+    NamedRegionTimer T("Type Legalization", GroupName, TimePassesIsEnabled);
+    Changed = CurDAG->LegalizeTypes();
+  }
+
+  DEBUG(dbgs() << "Type-legalized selection DAG:\n"; CurDAG->dump());
+
+  if (Changed) {
+    if (ViewDAGCombineLT)
+      CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
+
+    // Run the DAG combiner in post-type-legalize mode.
+    {
+      NamedRegionTimer T("DAG Combining after legalize types", GroupName,
+                         TimePassesIsEnabled);
+      CurDAG->Combine(NoIllegalTypes, *AA, OptLevel);
+    }
+
+    DEBUG(dbgs() << "Optimized type-legalized selection DAG:\n";
+          CurDAG->dump());
+  }
+
+  {
+    NamedRegionTimer T("Vector Legalization", GroupName, TimePassesIsEnabled);
+    Changed = CurDAG->LegalizeVectors();
+  }
+
+  if (Changed) {
+    {
+      NamedRegionTimer T("Type Legalization 2", GroupName, TimePassesIsEnabled);
+      CurDAG->LegalizeTypes();
+    }
+
+    if (ViewDAGCombineLT)
+      CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
+
+    // Run the DAG combiner in post-type-legalize mode.
+    {
+      NamedRegionTimer T("DAG Combining after legalize vectors", GroupName,
+                         TimePassesIsEnabled);
+      CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
+    }
+
+    DEBUG(dbgs() << "Optimized vector-legalized selection DAG:\n";
+          CurDAG->dump());
+  }
+
+  if (ViewLegalizeDAGs) CurDAG->viewGraph("legalize input for " + BlockName);
+
+  {
+    NamedRegionTimer T("DAG Legalization", GroupName, TimePassesIsEnabled);
+    CurDAG->Legalize(OptLevel);
+  }
+
+  DEBUG(dbgs() << "Legalized selection DAG:\n"; CurDAG->dump());
+
+  if (ViewDAGCombine2) CurDAG->viewGraph("dag-combine2 input for " + BlockName);
+
+  // Run the DAG combiner in post-legalize mode.
+  {
+    NamedRegionTimer T("DAG Combining 2", GroupName, TimePassesIsEnabled);
+    CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
+  }
+
+  DEBUG(dbgs() << "Optimized legalized selection DAG:\n"; CurDAG->dump());
+
+  if (OptLevel != CodeGenOpt::None)
+    ComputeLiveOutVRegInfo();
+
+  if (ViewISelDAGs) CurDAG->viewGraph("isel input for " + BlockName);
+
+  // Third, instruction select all of the operations to machine code, adding the
+  // code to the MachineBasicBlock.
+  {
+    NamedRegionTimer T("Instruction Selection", GroupName, TimePassesIsEnabled);
+    DoInstructionSelection();
+  }
+
+  DEBUG(dbgs() << "Selected selection DAG:\n"; CurDAG->dump());
+
+  if (ViewSchedDAGs) CurDAG->viewGraph("scheduler input for " + BlockName);
+
+  // Schedule machine code.
+  ScheduleDAGSDNodes *Scheduler = CreateScheduler();
+  {
+    NamedRegionTimer T("Instruction Scheduling", GroupName,
+                       TimePassesIsEnabled);
+    Scheduler->Run(CurDAG, FuncInfo->MBB, FuncInfo->InsertPt);
+  }
+
+  if (ViewSUnitDAGs) Scheduler->viewGraph();
+
+  // Emit machine code to BB.  This can change 'BB' to the last block being
+  // inserted into.
+  {
+    NamedRegionTimer T("Instruction Creation", GroupName, TimePassesIsEnabled);
+
+    FuncInfo->MBB = Scheduler->EmitSchedule();
+    FuncInfo->InsertPt = Scheduler->InsertPos;
+  }
+
+  // Free the scheduler state.
+  {
+    NamedRegionTimer T("Instruction Scheduling Cleanup", GroupName,
+                       TimePassesIsEnabled);
+    delete Scheduler;
+  }
+
+  // Free the SelectionDAG state, now that we're finished with it.
+  CurDAG->clear();
+}
+
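+/// DoInstructionSelection - Visit the DAG nodes in reverse topological order
+/// (root first), calling the target's Select hook on each one and replacing
+/// matched nodes with the machine nodes it produces.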
+void SelectionDAGISel::DoInstructionSelection() {
+  DEBUG(errs() << "===== Instruction selection begins:\n");
+
+  PreprocessISelDAG();
+  
+  // Select target instructions for the DAG.
+  {
+    // Number all nodes with a topological order and set DAGSize.
+    DAGSize = CurDAG->AssignTopologicalOrder();
+    
+    // Create a dummy node (which is not added to allnodes), that adds
+    // a reference to the root node, preventing it from being deleted,
+    // and tracking any changes of the root.
+    HandleSDNode Dummy(CurDAG->getRoot());
+    ISelPosition = SelectionDAG::allnodes_iterator(CurDAG->getRoot().getNode());
+    ++ISelPosition;
+    
+    // The AllNodes list is now topological-sorted. Visit the
+    // nodes by starting at the end of the list (the root of the
+    // graph) and proceeding back toward the beginning (the entry
+    // node).
+    while (ISelPosition != CurDAG->allnodes_begin()) {
+      SDNode *Node = --ISelPosition;
+      // Skip dead nodes. DAGCombiner is expected to eliminate all dead nodes,
+      // but there are currently some corner cases that it misses. Also, this
+      // makes it theoretically possible to disable the DAGCombiner.
+      if (Node->use_empty())
+        continue;
+      
+      SDNode *ResNode = Select(Node);
+      
+      // FIXME: This is pretty gross.  'Select' should be changed to not return
+      // anything at all and this code should be nuked with a tactical strike.
+      
+      // If node should not be replaced, continue with the next one.
+      if (ResNode == Node || Node->getOpcode() == ISD::DELETED_NODE)
+        continue;
+      // Replace node.
+      if (ResNode)
+        ReplaceUses(Node, ResNode);
+      
+      // If after the replacement this node is not used any more,
+      // remove this dead node.
+      if (Node->use_empty()) { // Don't delete EntryToken, etc.
+        ISelUpdater ISU(ISelPosition);
+        CurDAG->RemoveDeadNode(Node, &ISU);
+      }
+    }
+    
+    CurDAG->setRoot(Dummy.getValue());
+  }    
+
+  DEBUG(errs() << "===== Instruction selection ends:\n");
+
+  PostprocessISelDAG();
+}
+
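+/// SelectAllBasicBlocks - Build, select, and emit a SelectionDAG for each
+/// basic block in the function, then fix up any PHI nodes recorded while
+/// lowering the block's terminators.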
+void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
+  // Iterate over all basic blocks in the function.
+  for (Function::const_iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) {
+    const BasicBlock *LLVMBB = &*I;
+    FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB];
+    FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI();
+
+    BasicBlock::const_iterator const Begin = LLVMBB->getFirstNonPHI();
+    BasicBlock::const_iterator const End = LLVMBB->end();
+    BasicBlock::const_iterator BI = End;
+
+    // Lower any arguments needed in this block if this is the entry block.
+    if (LLVMBB == &Fn.getEntryBlock())
+      LowerArguments(LLVMBB);
+
+    // Run SelectionDAG instruction selection on the remainder of the block
+    // not handled by FastISel. If FastISel is not run, this is the entire
+    // block.
+    bool HadTailCall;
+    SelectBasicBlock(Begin, BI, HadTailCall);
+
+    FinishBasicBlock();
+    FuncInfo->PHINodesToUpdate.clear();
+  }
+}
+
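+/// FinishBasicBlock - Update the PHI nodes that were deferred while lowering
+/// the block, and emit the extra machine basic blocks needed for any bit
+/// tests, jump tables, and switch cases recorded by the builder.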
+void
+SelectionDAGISel::FinishBasicBlock() {
+
+  DEBUG(dbgs() << "Total amount of phi nodes to update: "
+               << FuncInfo->PHINodesToUpdate.size() << "\n";
+        for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i)
+          dbgs() << "Node " << i << " : ("
+                 << FuncInfo->PHINodesToUpdate[i].first
+                 << ", " << FuncInfo->PHINodesToUpdate[i].second << ")\n");
+
+  // Now that we know which MBB is the last one the LLVM BB expanded into,
+  // update PHI nodes in successors.
+  if (SDB->SwitchCases.empty() &&
+      SDB->JTCases.empty() &&
+      SDB->BitTestCases.empty()) {
+    for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) {
+      MachineInstr *PHI = FuncInfo->PHINodesToUpdate[i].first;
+      assert(PHI->isPHI() &&
+             "This is not a machine PHI node that we are updating!");
+      if (!FuncInfo->MBB->isSuccessor(PHI->getParent()))
+        continue;
+      PHI->addOperand(
+        MachineOperand::CreateReg(FuncInfo->PHINodesToUpdate[i].second, false));
+      PHI->addOperand(MachineOperand::CreateMBB(FuncInfo->MBB));
+    }
+    return;
+  }
+
+  for (unsigned i = 0, e = SDB->BitTestCases.size(); i != e; ++i) {
+    // Lower header first, if it wasn't already lowered
+    if (!SDB->BitTestCases[i].Emitted) {
+      // Set the current basic block to the mbb we wish to insert the code into
+      FuncInfo->MBB = SDB->BitTestCases[i].Parent;
+      FuncInfo->InsertPt = FuncInfo->MBB->end();
+      // Emit the code
+      SDB->visitBitTestHeader(SDB->BitTestCases[i], FuncInfo->MBB);
+      CurDAG->setRoot(SDB->getRoot());
+      SDB->clear();
+      CodeGenAndEmitDAG();
+    }
+
+    for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j) {
+      // Set the current basic block to the mbb we wish to insert the code into
+      FuncInfo->MBB = SDB->BitTestCases[i].Cases[j].ThisBB;
+      FuncInfo->InsertPt = FuncInfo->MBB->end();
+      // Emit the code
+      if (j+1 != ej)
+        SDB->visitBitTestCase(SDB->BitTestCases[i].Cases[j+1].ThisBB,
+                              SDB->BitTestCases[i].Reg,
+                              SDB->BitTestCases[i].Cases[j],
+                              FuncInfo->MBB);
+      else
+        SDB->visitBitTestCase(SDB->BitTestCases[i].Default,
+                              SDB->BitTestCases[i].Reg,
+                              SDB->BitTestCases[i].Cases[j],
+                              FuncInfo->MBB);
+
+      CurDAG->setRoot(SDB->getRoot());
+      SDB->clear();
+      CodeGenAndEmitDAG();
+    }
+
+    // Update PHI Nodes
+    for (unsigned pi = 0, pe = FuncInfo->PHINodesToUpdate.size();
+         pi != pe; ++pi) {
+      MachineInstr *PHI = FuncInfo->PHINodesToUpdate[pi].first;
+      MachineBasicBlock *PHIBB = PHI->getParent();
+      assert(PHI->isPHI() &&
+             "This is not a machine PHI node that we are updating!");
+      // This is the "default" BB. There are two jumps to it: from the
+      // "header" BB and from the last "case" BB.
+      if (PHIBB == SDB->BitTestCases[i].Default) {
+        PHI->addOperand(MachineOperand::
+                        CreateReg(FuncInfo->PHINodesToUpdate[pi].second,
+                                  false));
+        PHI->addOperand(MachineOperand::CreateMBB(SDB->BitTestCases[i].Parent));
+        PHI->addOperand(MachineOperand::
+                        CreateReg(FuncInfo->PHINodesToUpdate[pi].second,
+                                  false));
+        PHI->addOperand(MachineOperand::CreateMBB(SDB->BitTestCases[i].Cases.
+                                                  back().ThisBB));
+      }
+      // One of "cases" BB.
+      for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size();
+           j != ej; ++j) {
+        MachineBasicBlock* cBB = SDB->BitTestCases[i].Cases[j].ThisBB;
+        if (cBB->isSuccessor(PHIBB)) {
+          PHI->addOperand(MachineOperand::
+                          CreateReg(FuncInfo->PHINodesToUpdate[pi].second,
+                                    false));
+          PHI->addOperand(MachineOperand::CreateMBB(cBB));
+        }
+      }
+    }
+  }
+  SDB->BitTestCases.clear();
+
+  // If the JumpTable record is filled in, then we need to emit a jump table.
+  // Updating the PHI nodes is tricky in this case, since we need to determine
+  // whether the PHI is a successor of the range check MBB or the jump table MBB
+  for (unsigned i = 0, e = SDB->JTCases.size(); i != e; ++i) {
+    // Lower header first, if it wasn't already lowered
+    if (!SDB->JTCases[i].first.Emitted) {
+      // Set the current basic block to the mbb we wish to insert the code into
+      FuncInfo->MBB = SDB->JTCases[i].first.HeaderBB;
+      FuncInfo->InsertPt = FuncInfo->MBB->end();
+      // Emit the code
+      SDB->visitJumpTableHeader(SDB->JTCases[i].second, SDB->JTCases[i].first,
+                                FuncInfo->MBB);
+      CurDAG->setRoot(SDB->getRoot());
+      SDB->clear();
+      CodeGenAndEmitDAG();
+    }
+
+    // Set the current basic block to the mbb we wish to insert the code into
+    FuncInfo->MBB = SDB->JTCases[i].second.MBB;
+    FuncInfo->InsertPt = FuncInfo->MBB->end();
+    // Emit the code
+    SDB->visitJumpTable(SDB->JTCases[i].second);
+    CurDAG->setRoot(SDB->getRoot());
+    SDB->clear();
+    CodeGenAndEmitDAG();
+
+    // Update PHI Nodes
+    for (unsigned pi = 0, pe = FuncInfo->PHINodesToUpdate.size();
+         pi != pe; ++pi) {
+      MachineInstr *PHI = FuncInfo->PHINodesToUpdate[pi].first;
+      MachineBasicBlock *PHIBB = PHI->getParent();
+      assert(PHI->isPHI() &&
+             "This is not a machine PHI node that we are updating!");
+      // "default" BB. We can go there only from header BB.
+      if (PHIBB == SDB->JTCases[i].second.Default) {
+        PHI->addOperand
+          (MachineOperand::CreateReg(FuncInfo->PHINodesToUpdate[pi].second,
+                                     false));
+        PHI->addOperand
+          (MachineOperand::CreateMBB(SDB->JTCases[i].first.HeaderBB));
+      }
+      // JT BB. Just iterate over successors here
+      if (FuncInfo->MBB->isSuccessor(PHIBB)) {
+        PHI->addOperand
+          (MachineOperand::CreateReg(FuncInfo->PHINodesToUpdate[pi].second,
+                                     false));
+        PHI->addOperand(MachineOperand::CreateMBB(FuncInfo->MBB));
+      }
+    }
+  }
+  SDB->JTCases.clear();
+
+  // If the switch block involved a branch to one of the actual successors, we
+  // need to update PHI nodes in that block.
+  for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) {
+    MachineInstr *PHI = FuncInfo->PHINodesToUpdate[i].first;
+    assert(PHI->isPHI() &&
+           "This is not a machine PHI node that we are updating!");
+    if (FuncInfo->MBB->isSuccessor(PHI->getParent())) {
+      PHI->addOperand(
+        MachineOperand::CreateReg(FuncInfo->PHINodesToUpdate[i].second, false));
+      PHI->addOperand(MachineOperand::CreateMBB(FuncInfo->MBB));
+    }
+  }
+
+  // If we generated any switch lowering information, build and codegen any
+  // additional DAGs necessary.
+  for (unsigned i = 0, e = SDB->SwitchCases.size(); i != e; ++i) {
+    // Set the current basic block to the mbb we wish to insert the code into
+    MachineBasicBlock *ThisBB = FuncInfo->MBB = SDB->SwitchCases[i].ThisBB;
+    FuncInfo->InsertPt = FuncInfo->MBB->end();
+
+    // Determine the unique successors.
+    SmallVector<MachineBasicBlock *, 2> Succs;
+    Succs.push_back(SDB->SwitchCases[i].TrueBB);
+    if (SDB->SwitchCases[i].TrueBB != SDB->SwitchCases[i].FalseBB)
+      Succs.push_back(SDB->SwitchCases[i].FalseBB);
+
+    // Emit the code. Note that this could result in ThisBB being split, so
+    // we need to check for updates.
+    SDB->visitSwitchCase(SDB->SwitchCases[i], FuncInfo->MBB);
+    CurDAG->setRoot(SDB->getRoot());
+    SDB->clear();
+    CodeGenAndEmitDAG();
+    ThisBB = FuncInfo->MBB;
+
+    // Handle any PHI nodes in successors of this chunk, as if we were coming
+    // from the original BB before switch expansion.  Note that PHI nodes can
+    // occur multiple times in PHINodesToUpdate.  We have to be very careful to
+    // handle them the right number of times.
+    for (unsigned i = 0, e = Succs.size(); i != e; ++i) {
+      FuncInfo->MBB = Succs[i];
+      FuncInfo->InsertPt = FuncInfo->MBB->end();
+      // FuncInfo->MBB may have been removed from the CFG if a branch was
+      // constant folded.
+      if (ThisBB->isSuccessor(FuncInfo->MBB)) {
+        for (MachineBasicBlock::iterator Phi = FuncInfo->MBB->begin();
+             Phi != FuncInfo->MBB->end() && Phi->isPHI();
+             ++Phi) {
+          // This value for this PHI node is recorded in PHINodesToUpdate.
+          for (unsigned pn = 0; ; ++pn) {
+            assert(pn != FuncInfo->PHINodesToUpdate.size() &&
+                   "Didn't find PHI entry!");
+            if (FuncInfo->PHINodesToUpdate[pn].first == Phi) {
+              Phi->addOperand(MachineOperand::
+                              CreateReg(FuncInfo->PHINodesToUpdate[pn].second,
+                                        false));
+              Phi->addOperand(MachineOperand::CreateMBB(ThisBB));
+              break;
+            }
+          }
+        }
+      }
+    }
+  }
+  SDB->SwitchCases.clear();
+}
+
+
+/// Create the scheduler. If a specific scheduler was specified
+/// via the SchedulerRegistry, use it, otherwise select the
+/// one preferred by the target.
+///
+ScheduleDAGSDNodes *SelectionDAGISel::CreateScheduler() {
+  RegisterScheduler::FunctionPassCtor Ctor = RegisterScheduler::getDefault();
+
+  if (!Ctor) {
+    Ctor = ISHeuristic;
+    RegisterScheduler::setDefault(Ctor);
+  }
+
+  return Ctor(this, OptLevel);
+}
+
+ScheduleHazardRecognizer *SelectionDAGISel::CreateTargetHazardRecognizer() {
+  return new ScheduleHazardRecognizer();
+}
+
+//===----------------------------------------------------------------------===//
+// Helper functions used by the generated instruction selector.
+//===----------------------------------------------------------------------===//
+// Calls to these methods are generated by tblgen.
+
+/// CheckAndMask - The isel is trying to match something like (and X, 255).  If
+/// the dag combiner simplified the 255, we still want to match.  RHS is the
+/// actual value in the DAG on the RHS of an AND, and DesiredMaskS is the value
+/// specified in the .td file (e.g. 255).
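+///
+/// For example, if the pattern wants (and X, 255) but the combiner shrank the
+/// mask to 15 because bits 4-7 of X are known zero, the needed bits (240) are
+/// all known zero, so the pattern still matches.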
+bool SelectionDAGISel::CheckAndMask(SDValue LHS, ConstantSDNode *RHS,
+                                    int64_t DesiredMaskS) const {
+  const APInt &ActualMask = RHS->getAPIntValue();
+  const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS);
+
+  // If the actual mask exactly matches, success!
+  if (ActualMask == DesiredMask)
+    return true;
+
+  // If the actual AND mask lets through bits the desired mask disallows,
+  // this doesn't match.
+  if (ActualMask.intersects(~DesiredMask))
+    return false;
+
+  // Otherwise, the DAG Combiner may have proven that the value coming in is
+  // either already zero or is not demanded.  Check for known zero input bits.
+  APInt NeededMask = DesiredMask & ~ActualMask;
+  if (CurDAG->MaskedValueIsZero(LHS, NeededMask))
+    return true;
+
+  // TODO: check to see if missing bits are just not demanded.
+
+  // Otherwise, this pattern doesn't match.
+  return false;
+}
+
+/// CheckOrMask - The isel is trying to match something like (or X, 255).  If
+/// the dag combiner simplified the 255, we still want to match.  RHS is the
+/// actual value in the DAG on the RHS of an OR, and DesiredMaskS is the value
+/// specified in the .td file (e.g. 255).
+bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS,
+                                   int64_t DesiredMaskS) const {
+  const APInt &ActualMask = RHS->getAPIntValue();
+  const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS);
+
+  // If the actual mask exactly matches, success!
+  if (ActualMask == DesiredMask)
+    return true;
+
+  // If the actual OR mask sets bits the desired mask disallows,
+  // this doesn't match.
+  if (ActualMask.intersects(~DesiredMask))
+    return false;
+
+  // Otherwise, the DAG Combiner may have proven that the value coming in is
+  // either already zero or is not demanded.  Check for known zero input bits.
+  APInt NeededMask = DesiredMask & ~ActualMask;
+
+  APInt KnownZero, KnownOne;
+  CurDAG->ComputeMaskedBits(LHS, NeededMask, KnownZero, KnownOne);
+
+  // If all the missing bits in the or are already known to be set, match!
+  if ((NeededMask & KnownOne) == NeededMask)
+    return true;
+
+  // TODO: check to see if missing bits are just not demanded.
+
+  // Otherwise, this pattern doesn't match.
+  return false;
+}
+
+
+/// SelectInlineAsmMemoryOperands - Calls to this are automatically generated
+/// by tblgen.  Others should not call it.
+void SelectionDAGISel::
+SelectInlineAsmMemoryOperands(std::vector<SDValue> &Ops) {
+  std::vector<SDValue> InOps;
+  std::swap(InOps, Ops);
+
+  Ops.push_back(InOps[InlineAsm::Op_InputChain]); // 0
+  Ops.push_back(InOps[InlineAsm::Op_AsmString]);  // 1
+  Ops.push_back(InOps[InlineAsm::Op_MDNode]);     // 2, !srcloc
+  Ops.push_back(InOps[InlineAsm::Op_IsAlignStack]);  // 3
+
+  unsigned i = InlineAsm::Op_FirstOperand, e = InOps.size();
+  if (InOps[e-1].getValueType() == MVT::Flag)
+    --e;  // Don't process a flag operand if it is here.
+
+  while (i != e) {
+    unsigned Flags = cast<ConstantSDNode>(InOps[i])->getZExtValue();
+    if (!InlineAsm::isMemKind(Flags)) {
+      // Just skip over this operand, copying the operands verbatim.
+      Ops.insert(Ops.end(), InOps.begin()+i,
+                 InOps.begin()+i+InlineAsm::getNumOperandRegisters(Flags) + 1);
+      i += InlineAsm::getNumOperandRegisters(Flags) + 1;
+    } else {
+      assert(InlineAsm::getNumOperandRegisters(Flags) == 1 &&
+             "Memory operand with multiple values?");
+      // Otherwise, this is a memory operand.  Ask the target to select it.
+      std::vector<SDValue> SelOps;
+      if (SelectInlineAsmMemoryOperand(InOps[i+1], 'm', SelOps))
+        report_fatal_error("Could not match memory address.  Inline asm"
+                           " failure!");
+
+      // Add this to the output node.
+      unsigned NewFlags =
+        InlineAsm::getFlagWord(InlineAsm::Kind_Mem, SelOps.size());
+      Ops.push_back(CurDAG->getTargetConstant(NewFlags, MVT::i32));
+      Ops.insert(Ops.end(), SelOps.begin(), SelOps.end());
+      i += 2;
+    }
+  }
+
+  // Add the flag input back if present.
+  if (e != InOps.size())
+    Ops.push_back(InOps.back());
+}
+
+/// findFlagUse - Return the use of the MVT::Flag value produced by the
+/// specified SDNode.
+///
+static SDNode *findFlagUse(SDNode *N) {
+  unsigned FlagResNo = N->getNumValues()-1;
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    SDUse &Use = I.getUse();
+    if (Use.getResNo() == FlagResNo)
+      return Use.getUser();
+  }
+  return NULL;
+}
+
+/// findNonImmUse - Return true if "Use" is a non-immediate use of "Def".
+/// This function recursively traverses up the operand chain, ignoring
+/// certain nodes.
+static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
+                          SDNode *Root, SmallPtrSet<SDNode*, 16> &Visited,
+                          bool IgnoreChains) {
+  // Nodes are assigned unique IDs, and a node's ID is guaranteed to be
+  // greater than the IDs of all of its (recursive) operands.  If we scan to a
+  // point where 'use' is smaller than the node we're scanning for, then we
+  // know we will never find it.
+  //
+  // The Use's ID may be -1 (unassigned) if it is a newly allocated node.  This can
+  // happen because we scan down to newly selected nodes in the case of flag
+  // uses.
+  if ((Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1))
+    return false;
+  
+  // Don't revisit nodes if we already scanned it and didn't fail, we know we
+  // won't fail if we scan it again.
+  if (!Visited.insert(Use))
+    return false;
+
+  for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
+    // Ignore chain uses, they are validated by HandleMergeInputChains.
+    if (Use->getOperand(i).getValueType() == MVT::Other && IgnoreChains)
+      continue;
+    
+    SDNode *N = Use->getOperand(i).getNode();
+    if (N == Def) {
+      if (Use == ImmedUse || Use == Root)
+        continue;  // We are not looking for immediate use.
+      assert(N != Root);
+      return true;
+    }
+
+    // Traverse up the operand chain.
+    if (findNonImmUse(N, Def, ImmedUse, Root, Visited, IgnoreChains))
+      return true;
+  }
+  return false;
+}
+
+/// IsProfitableToFold - Returns true if it's profitable to fold the specific
+/// operand node N of U during instruction selection that starts at Root.
+bool SelectionDAGISel::IsProfitableToFold(SDValue N, SDNode *U,
+                                          SDNode *Root) const {
+  if (OptLevel == CodeGenOpt::None) return false;
+  return N.hasOneUse();
+}
+
+/// IsLegalToFold - Returns true if the specific operand node N of
+/// U can be folded during instruction selection that starts at Root.
+bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,
+                                     CodeGenOpt::Level OptLevel,
+                                     bool IgnoreChains) {
+  if (OptLevel == CodeGenOpt::None) return false;
+
+  // If Root can somehow reach N through a path that doesn't contain U, then
+  // folding N would create a cycle.  For example, in the following diagram,
+  // Root can reach N through X.  If N is folded into Root, then X becomes
+  // both a predecessor and a successor of U.
+  //
+  //          [N*]           //
+  //         ^   ^           //
+  //        /     \          //
+  //      [U*]    [X]?       //
+  //        ^     ^          //
+  //         \   /           //
+  //          \ /            //
+  //         [Root*]         //
+  //
+  // * indicates nodes to be folded together.
+  //
+  // If Root produces a flag, then it gets (even more) interesting. Since it
+  // will be "glued" together with its flag use in the scheduler, we need to
+  // check if it might reach N.
+  //
+  //          [N*]           //
+  //         ^   ^           //
+  //        /     \          //
+  //      [U*]    [X]?       //
+  //        ^       ^        //
+  //         \       \       //
+  //          \      |       //
+  //         [Root*] |       //
+  //          ^      |       //
+  //          f      |       //
+  //          |      /       //
+  //         [Y]    /        //
+  //           ^   /         //
+  //           f  /          //
+  //           | /           //
+  //          [FU]           //
+  //
+  // If FU (flag use) indirectly reaches N (the load), and Root folds N
+  // (call it Fold), then X is a predecessor of FU and a successor of
+  // Fold. But since Fold and FU are flagged together, this will create
+  // a cycle in the scheduling graph.
+
+  // If the node has flags, walk down the graph to the "lowest" node in the
+  // flagged set.
+  EVT VT = Root->getValueType(Root->getNumValues()-1);
+  while (VT == MVT::Flag) {
+    SDNode *FU = findFlagUse(Root);
+    if (FU == NULL)
+      break;
+    Root = FU;
+    VT = Root->getValueType(Root->getNumValues()-1);
+    
+    // If our query node has a flag result with a use, we've walked up it.  If
+    // the user (which has already been selected) has a chain or indirectly uses
+    // the chain, our WalkChainUsers predicate will not consider it.  Because of
+    // this, we cannot ignore chains in this predicate.
+    IgnoreChains = false;
+  }
+  
+
+  SmallPtrSet<SDNode*, 16> Visited;
+  return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains);
+}
+
+SDNode *SelectionDAGISel::Select_UNDEF(SDNode *N) {
+  return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF,N->getValueType(0));
+}
+
+/// GetVBR - decode a vbr encoding whose top bit is set.
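+/// Each byte carries seven payload bits (low bits first) and uses the top bit
+/// as a continuation marker; e.g. the value 300 is encoded as 0xAC 0x02.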
+ALWAYS_INLINE static uint64_t
+GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) {
+  assert(Val >= 128 && "Not a VBR");
+  Val &= 127;  // Remove first vbr bit.
+  
+  unsigned Shift = 7;
+  uint64_t NextBits;
+  do {
+    NextBits = MatcherTable[Idx++];
+    Val |= (NextBits&127) << Shift;
+    Shift += 7;
+  } while (NextBits & 128);
+  
+  return Val;
+}
+
+
+/// UpdateChainsAndFlags - When a match is complete, this method updates uses of
+/// interior flag and chain results to use the new flag and chain results.
+void SelectionDAGISel::
+UpdateChainsAndFlags(SDNode *NodeToMatch, SDValue InputChain,
+                     const SmallVectorImpl<SDNode*> &ChainNodesMatched,
+                     SDValue InputFlag,
+                     const SmallVectorImpl<SDNode*> &FlagResultNodesMatched,
+                     bool isMorphNodeTo) {
+  SmallVector<SDNode*, 4> NowDeadNodes;
+  
+  ISelUpdater ISU(ISelPosition);
+
+  // Now that all the normal results are replaced, we replace the chain and
+  // flag results if present.
+  if (!ChainNodesMatched.empty()) {
+    assert(InputChain.getNode() != 0 &&
+           "Matched input chains but didn't produce a chain");
+    // Loop over all of the nodes we matched that produced a chain result.
+    // Replace all the chain results with the final chain we ended up with.
+    for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
+      SDNode *ChainNode = ChainNodesMatched[i];
+      
+      // If this node was already deleted, don't look at it.
+      if (ChainNode->getOpcode() == ISD::DELETED_NODE)
+        continue;
+      
+      // Don't replace the results of the root node if we're doing a
+      // MorphNodeTo.
+      if (ChainNode == NodeToMatch && isMorphNodeTo)
+        continue;
+      
+      SDValue ChainVal = SDValue(ChainNode, ChainNode->getNumValues()-1);
+      if (ChainVal.getValueType() == MVT::Flag)
+        ChainVal = ChainVal.getValue(ChainVal->getNumValues()-2);
+      assert(ChainVal.getValueType() == MVT::Other && "Not a chain?");
+      CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain, &ISU);
+      
+      // If the node became dead and we haven't already seen it, delete it.
+      if (ChainNode->use_empty() &&
+          !std::count(NowDeadNodes.begin(), NowDeadNodes.end(), ChainNode))
+        NowDeadNodes.push_back(ChainNode);
+    }
+  }
+  
+  // If the result produces a flag, update any flag results in the matched
+  // pattern with the flag result.
+  if (InputFlag.getNode() != 0) {
+    // Handle any interior nodes explicitly marked.
+    for (unsigned i = 0, e = FlagResultNodesMatched.size(); i != e; ++i) {
+      SDNode *FRN = FlagResultNodesMatched[i];
+      
+      // If this node was already deleted, don't look at it.
+      if (FRN->getOpcode() == ISD::DELETED_NODE)
+        continue;
+      
+      assert(FRN->getValueType(FRN->getNumValues()-1) == MVT::Flag &&
+             "Doesn't have a flag result");
+      CurDAG->ReplaceAllUsesOfValueWith(SDValue(FRN, FRN->getNumValues()-1),
+                                        InputFlag, &ISU);
+      
+      // If the node became dead and we haven't already seen it, delete it.
+      if (FRN->use_empty() &&
+          !std::count(NowDeadNodes.begin(), NowDeadNodes.end(), FRN))
+        NowDeadNodes.push_back(FRN);
+    }
+  }
+  
+  if (!NowDeadNodes.empty())
+    CurDAG->RemoveDeadNodes(NowDeadNodes, &ISU);
+  
+  DEBUG(errs() << "ISEL: Match complete!\n");
+}
+
+enum ChainResult {
+  CR_Simple,               // Only already-selected nodes below this chain.
+  CR_InducesCycle,         // Folding this chain would create a cycle.
+  CR_LeadsToInteriorNode   // The chain leads back into the matched pattern.
+};
+
+/// WalkChainUsers - Walk down the users of the specified chained node that is
+/// part of the pattern we're matching, looking at all of the users we find.
+/// This determines whether something is an interior node, whether we have a
+/// non-pattern node in between two pattern nodes (which prevents folding because
+/// it would induce a cycle) and whether we have a TokenFactor node sandwiched
+/// between pattern nodes (in which case the TF becomes part of the pattern).
+///
+/// The walk we do here is guaranteed to be small because we quickly get down to
+/// already selected nodes "below" us.
+static ChainResult 
+WalkChainUsers(SDNode *ChainedNode,
+               SmallVectorImpl<SDNode*> &ChainedNodesInPattern,
+               SmallVectorImpl<SDNode*> &InteriorChainedNodes) {
+  ChainResult Result = CR_Simple;
+  
+  for (SDNode::use_iterator UI = ChainedNode->use_begin(),
+         E = ChainedNode->use_end(); UI != E; ++UI) {
+    // Make sure the use is of the chain, not some other value we produce.
+    if (UI.getUse().getValueType() != MVT::Other) continue;
+    
+    SDNode *User = *UI;
+
+    // If we see an already-selected machine node, then we've gone beyond the
+    // pattern that we're selecting down into the already selected chunk of the
+    // DAG.
+    if (User->isMachineOpcode() ||
+        User->getOpcode() == ISD::HANDLENODE)  // Root of the graph.
+      continue;
+    
+    if (User->getOpcode() == ISD::CopyToReg ||
+        User->getOpcode() == ISD::CopyFromReg) {
+      // If their node ID got reset to -1 then they've already been selected.
+      // Treat them like a MachineOpcode.
+      if (User->getNodeId() == -1)
+        continue;
+    }
+
+    // If we have a TokenFactor, we handle it specially.
+    if (User->getOpcode() != ISD::TokenFactor) {
+      // If the node isn't a token factor and isn't part of our pattern, then it
+      // must be a random chained node in between two nodes we're selecting.
+      // This happens when we have something like:
+      //   x = load ptr
+      //   call
+      //   y = x+4
+      //   store y -> ptr
+      // Because we structurally match the load/store as a read/modify/write,
+      // but the call is chained between them.  We cannot fold in this case
+      // because it would induce a cycle in the graph.
+      if (!std::count(ChainedNodesInPattern.begin(),
+                      ChainedNodesInPattern.end(), User))
+        return CR_InducesCycle;
+      
+      // Otherwise we found a node that is part of our pattern.  For example in:
+      //   x = load ptr
+      //   y = x+4
+      //   store y -> ptr
+      // This would happen when we're scanning down from the load and see the
+      // store as a user.  Record that there is a use of ChainedNode that is
+      // part of the pattern and keep scanning uses.
+      Result = CR_LeadsToInteriorNode;
+      InteriorChainedNodes.push_back(User);
+      continue;
+    }
+    
+    // If we found a TokenFactor, there are two cases to consider: first if the
+    // TokenFactor is just hanging "below" the pattern we're matching (i.e. no
+    // uses of the TF are in our pattern) we just want to ignore it.  Second,
+    // the TokenFactor can be sandwiched in between two chained nodes, like so:
+    //     [Load chain]
+    //         ^
+    //         |
+    //       [Load]
+    //       ^    ^
+    //       |    \                    DAG's like cheese
+    //      /       \                       do you?
+    //     /         |
+    // [TokenFactor] [Op]
+    //     ^          ^
+    //     |          |
+    //      \        /
+    //       \      /
+    //       [Store]
+    //
+    // In this case, the TokenFactor becomes part of our match and we rewrite it
+    // as a new TokenFactor.
+    //
+    // To distinguish these two cases, do a recursive walk down the uses.
+    switch (WalkChainUsers(User, ChainedNodesInPattern, InteriorChainedNodes)) {
+    case CR_Simple:
+      // If the uses of the TokenFactor are just already-selected nodes, ignore
+      // it, it is "below" our pattern.
+      continue;
+    case CR_InducesCycle:
+      // If the uses of the TokenFactor lead to nodes that are not part of our
+      // pattern that are not selected, folding would turn this into a cycle,
+      // bail out now.
+      return CR_InducesCycle;
+    case CR_LeadsToInteriorNode:
+      break;  // Otherwise, keep processing.
+    }
+    
+    // Okay, we know we're in the interesting interior case.  The TokenFactor
+    // is now going to be considered part of the pattern so that we rewrite its
+    // uses (it may have uses that are not part of the pattern) with the
+    // ultimate chain result of the generated code.  We will also add its chain
+    // inputs as inputs to the ultimate TokenFactor we create.
+    Result = CR_LeadsToInteriorNode;
+    ChainedNodesInPattern.push_back(User);
+    InteriorChainedNodes.push_back(User);
+    continue;
+  }
+  
+  return Result;
+}
+
+/// HandleMergeInputChains - This implements the OPC_EmitMergeInputChains
+/// operation for when the pattern matched at least one node with a chain.  The
+/// input vector contains a list of all of the chained nodes that we match.  We
+/// must determine if this is a valid thing to cover (i.e. matching it won't
+/// induce cycles in the DAG) and if so, create a TokenFactor node that will
+/// be used as the input node chain for the generated nodes.
+static SDValue
+HandleMergeInputChains(SmallVectorImpl<SDNode*> &ChainNodesMatched,
+                       SelectionDAG *CurDAG) {
+  // Walk all of the chained nodes we've matched, recursively scanning down the
+  // users of the chain result. This adds any TokenFactor nodes that are caught
+  // in between chained nodes to the chained and interior nodes list.
+  SmallVector<SDNode*, 3> InteriorChainedNodes;
+  for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
+    if (WalkChainUsers(ChainNodesMatched[i], ChainNodesMatched,
+                       InteriorChainedNodes) == CR_InducesCycle)
+      return SDValue(); // Would induce a cycle.
+  }
+  
+  // Okay, we have walked all the matched nodes and collected TokenFactor nodes
+  // that we are interested in.  Form our input TokenFactor node.
+  SmallVector<SDValue, 3> InputChains;
+  for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
+    // Add the input chain of this node to the InputChains list (which will be
+    // the operands of the generated TokenFactor) if it's not an interior node.
+    SDNode *N = ChainNodesMatched[i];
+    if (N->getOpcode() != ISD::TokenFactor) {
+      if (std::count(InteriorChainedNodes.begin(),InteriorChainedNodes.end(),N))
+        continue;
+      
+      // Otherwise, add the input chain.
+      SDValue InChain = ChainNodesMatched[i]->getOperand(0);
+      assert(InChain.getValueType() == MVT::Other && "Not a chain");
+      InputChains.push_back(InChain);
+      continue;
+    }
+    
+    // If we have a token factor, we want to add all inputs of the token factor
+    // that are not part of the pattern we're matching.
+    for (unsigned op = 0, e = N->getNumOperands(); op != e; ++op) {
+      if (!std::count(ChainNodesMatched.begin(), ChainNodesMatched.end(),
+                      N->getOperand(op).getNode()))
+        InputChains.push_back(N->getOperand(op));
+    }
+  }
+  
+  if (InputChains.size() == 1)
+    return InputChains[0];
+  return CurDAG->getNode(ISD::TokenFactor, ChainNodesMatched[0]->getDebugLoc(),
+                         MVT::Other, &InputChains[0], InputChains.size());
+}  
+
+/// MorphNode - Handle morphing a node in place for the selector.
+SDNode *SelectionDAGISel::
+MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
+          const SDValue *Ops, unsigned NumOps, unsigned EmitNodeInfo) {
+  // It is possible we're using MorphNodeTo to replace a node with no
+  // normal results with one that has a normal result (or we could be
+  // adding a chain) and the input could have flags and chains as well.
+  // In this case we need to shift the operands down.
+  // FIXME: This is a horrible hack and broken in obscure cases, no worse
+  // than the old isel though.
+  int OldFlagResultNo = -1, OldChainResultNo = -1;
+
+  unsigned NTMNumResults = Node->getNumValues();
+  if (Node->getValueType(NTMNumResults-1) == MVT::Flag) {
+    OldFlagResultNo = NTMNumResults-1;
+    if (NTMNumResults != 1 &&
+        Node->getValueType(NTMNumResults-2) == MVT::Other)
+      OldChainResultNo = NTMNumResults-2;
+  } else if (Node->getValueType(NTMNumResults-1) == MVT::Other)
+    OldChainResultNo = NTMNumResults-1;
+
+  // Call the underlying SelectionDAG routine to do the transmogrification. Note
+  // that this deletes operands of the old node that become dead.
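+  // Machine opcodes are encoded in SDNodes as the bitwise complement of the
+  // target opcode, hence the ~TargetOpc below.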
+  SDNode *Res = CurDAG->MorphNodeTo(Node, ~TargetOpc, VTList, Ops, NumOps);
+
+  // MorphNodeTo can operate in two ways: if an existing node with the
+  // specified operands exists, it can just return it.  Otherwise, it
+  // updates the node in place to have the requested operands.
+  if (Res == Node) {
+    // If we updated the node in place, reset the node ID.  To the isel,
+    // this should be just like a newly allocated machine node.
+    Res->setNodeId(-1);
+  }
+
+  unsigned ResNumResults = Res->getNumValues();
+  // Move the flag if needed.
+  if ((EmitNodeInfo & OPFL_FlagOutput) && OldFlagResultNo != -1 &&
+      (unsigned)OldFlagResultNo != ResNumResults-1)
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldFlagResultNo), 
+                                      SDValue(Res, ResNumResults-1));
+
+  if ((EmitNodeInfo & OPFL_FlagOutput) != 0)
+    --ResNumResults;
+
+  // Move the chain reference if needed.
+  if ((EmitNodeInfo & OPFL_Chain) && OldChainResultNo != -1 &&
+      (unsigned)OldChainResultNo != ResNumResults-1)
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldChainResultNo), 
+                                      SDValue(Res, ResNumResults-1));
+
+  // Otherwise, no replacement happened because the node already exists. Replace
+  // Uses of the old node with the new one.
+  if (Res != Node)
+    CurDAG->ReplaceAllUsesWith(Node, Res);
+  
+  return Res;
+}
+
+/// CheckSame - Implements OPC_CheckSame.
+ALWAYS_INLINE static bool
+CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+          SDValue N, const SmallVectorImpl<SDValue> &RecordedNodes) {
+  // Accept if it is exactly the same as a previously recorded node.
+  unsigned RecNo = MatcherTable[MatcherIndex++];
+  assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+  return N == RecordedNodes[RecNo];
+}
+  
+/// CheckPatternPredicate - Implements OPC_CheckPatternPredicate.
+ALWAYS_INLINE static bool
+CheckPatternPredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+                      SelectionDAGISel &SDISel) {
+  return SDISel.CheckPatternPredicate(MatcherTable[MatcherIndex++]);
+}
+
+/// CheckNodePredicate - Implements OPC_CheckPredicate.
+ALWAYS_INLINE static bool
+CheckNodePredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+                   SelectionDAGISel &SDISel, SDNode *N) {
+  return SDISel.CheckNodePredicate(N, MatcherTable[MatcherIndex++]);
+}
+
+ALWAYS_INLINE static bool
+CheckOpcode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+            SDNode *N) {
+  uint16_t Opc = MatcherTable[MatcherIndex++];
+  Opc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
+  return N->getOpcode() == Opc;
+}
+
+ALWAYS_INLINE static bool
+CheckType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+          SDValue N, const TargetLowering &TLI) {
+  MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+  if (N.getValueType() == VT) return true;
+  
+  // Handle the case when VT is iPTR.
+  return VT == MVT::iPTR && N.getValueType() == TLI.getPointerTy();
+}
+
+ALWAYS_INLINE static bool
+CheckChildType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+               SDValue N, const TargetLowering &TLI,
+               unsigned ChildNo) {
+  if (ChildNo >= N.getNumOperands())
+    return false;  // Match fails if out of range child #.
+  return ::CheckType(MatcherTable, MatcherIndex, N.getOperand(ChildNo), TLI);
+}
+
+
+ALWAYS_INLINE static bool
+CheckCondCode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+              SDValue N) {
+  return cast<CondCodeSDNode>(N)->get() ==
+      (ISD::CondCode)MatcherTable[MatcherIndex++];
+}
+
+ALWAYS_INLINE static bool
+CheckValueType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+               SDValue N, const TargetLowering &TLI) {
+  MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+  if (cast<VTSDNode>(N)->getVT() == VT)
+    return true;
+  
+  // Handle the case when VT is iPTR.
+  return VT == MVT::iPTR && cast<VTSDNode>(N)->getVT() == TLI.getPointerTy();
+}
+
+ALWAYS_INLINE static bool
+CheckInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+             SDValue N) {
+  int64_t Val = MatcherTable[MatcherIndex++];
+  if (Val & 128)
+    Val = GetVBR(Val, MatcherTable, MatcherIndex);
+  
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
+  return C != 0 && C->getSExtValue() == Val;
+}
+
+ALWAYS_INLINE static bool
+CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+            SDValue N, SelectionDAGISel &SDISel) {
+  int64_t Val = MatcherTable[MatcherIndex++];
+  if (Val & 128)
+    Val = GetVBR(Val, MatcherTable, MatcherIndex);
+  
+  if (N->getOpcode() != ISD::AND) return false;
+  
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  return C != 0 && SDISel.CheckAndMask(N.getOperand(0), C, Val);
+}
+
+ALWAYS_INLINE static bool
+CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+           SDValue N, SelectionDAGISel &SDISel) {
+  int64_t Val = MatcherTable[MatcherIndex++];
+  if (Val & 128)
+    Val = GetVBR(Val, MatcherTable, MatcherIndex);
+  
+  if (N->getOpcode() != ISD::OR) return false;
+  
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  return C != 0 && SDISel.CheckOrMask(N.getOperand(0), C, Val);
+}
+
+/// IsPredicateKnownToFail - If we know how and can do so without pushing a
+/// scope, evaluate the current node.  If the current predicate is known to
+/// fail, set Result=true and return anything.  If the current predicate is
+/// known to pass, set Result=false and return the MatcherIndex to continue
+/// with.  If the current predicate cannot be evaluated here, set Result=false
+/// and return the index of the predicate itself so it is evaluated normally.
+static unsigned IsPredicateKnownToFail(const unsigned char *Table,
+                                       unsigned Index, SDValue N,
+                                       bool &Result, SelectionDAGISel &SDISel,
+                                       SmallVectorImpl<SDValue> &RecordedNodes){
+  switch (Table[Index++]) {
+  default:
+    Result = false;
+    return Index-1;  // Could not evaluate this predicate.
+  case SelectionDAGISel::OPC_CheckSame:
+    Result = !::CheckSame(Table, Index, N, RecordedNodes);
+    return Index;
+  case SelectionDAGISel::OPC_CheckPatternPredicate:
+    Result = !::CheckPatternPredicate(Table, Index, SDISel);
+    return Index;
+  case SelectionDAGISel::OPC_CheckPredicate:
+    Result = !::CheckNodePredicate(Table, Index, SDISel, N.getNode());
+    return Index;
+  case SelectionDAGISel::OPC_CheckOpcode:
+    Result = !::CheckOpcode(Table, Index, N.getNode());
+    return Index;
+  case SelectionDAGISel::OPC_CheckType:
+    Result = !::CheckType(Table, Index, N, SDISel.TLI);
+    return Index;
+  case SelectionDAGISel::OPC_CheckChild0Type:
+  case SelectionDAGISel::OPC_CheckChild1Type:
+  case SelectionDAGISel::OPC_CheckChild2Type:
+  case SelectionDAGISel::OPC_CheckChild3Type:
+  case SelectionDAGISel::OPC_CheckChild4Type:
+  case SelectionDAGISel::OPC_CheckChild5Type:
+  case SelectionDAGISel::OPC_CheckChild6Type:
+  case SelectionDAGISel::OPC_CheckChild7Type:
+    Result = !::CheckChildType(Table, Index, N, SDISel.TLI,
+                        Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Type);
+    return Index;
+  case SelectionDAGISel::OPC_CheckCondCode:
+    Result = !::CheckCondCode(Table, Index, N);
+    return Index;
+  case SelectionDAGISel::OPC_CheckValueType:
+    Result = !::CheckValueType(Table, Index, N, SDISel.TLI);
+    return Index;
+  case SelectionDAGISel::OPC_CheckInteger:
+    Result = !::CheckInteger(Table, Index, N);
+    return Index;
+  case SelectionDAGISel::OPC_CheckAndImm:
+    Result = !::CheckAndImm(Table, Index, N, SDISel);
+    return Index;
+  case SelectionDAGISel::OPC_CheckOrImm:
+    Result = !::CheckOrImm(Table, Index, N, SDISel);
+    return Index;
+  }
+}
+
+namespace {
+
+struct MatchScope {
+  /// FailIndex - If this match fails, this is the index to continue with.
+  unsigned FailIndex;
+  
+  /// NodeStack - The node stack when the scope was formed.
+  SmallVector<SDValue, 4> NodeStack;
+  
+  /// NumRecordedNodes - The number of recorded nodes when the scope was formed.
+  unsigned NumRecordedNodes;
+  
+  /// NumMatchedMemRefs - The number of matched memref entries.
+  unsigned NumMatchedMemRefs;
+  
+  /// InputChain/InputFlag - The current chain/flag 
+  SDValue InputChain, InputFlag;
+
+  /// HasChainNodesMatched - True if the ChainNodesMatched list is non-empty.
+  bool HasChainNodesMatched, HasFlagResultNodesMatched;
+};
+
+}
+
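+/// SelectCodeCommon - The common table-driven matcher: interpret the
+/// tblgen-generated MatcherTable against NodeToMatch, backtracking through
+/// MatchScopes on failure, and emit the matched machine nodes.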
+SDNode *SelectionDAGISel::
+SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
+                 unsigned TableSize) {
+  // FIXME: Should these even be selected?  Handle these cases in the caller?
+  switch (NodeToMatch->getOpcode()) {
+  default:
+    break;
+  case ISD::EntryToken:       // These nodes remain the same.
+  case ISD::BasicBlock:
+  case ISD::Register:
+  //case ISD::VALUETYPE:
+  //case ISD::CONDCODE:
+  case ISD::HANDLENODE:
+  case ISD::MDNODE_SDNODE:
+  case ISD::TargetConstant:
+  case ISD::TargetConstantFP:
+  case ISD::TargetConstantPool:
+  case ISD::TargetFrameIndex:
+  case ISD::TargetExternalSymbol:
+  case ISD::TargetBlockAddress:
+  case ISD::TargetJumpTable:
+  case ISD::TargetGlobalTLSAddress:
+  case ISD::TargetGlobalAddress:
+  case ISD::TokenFactor:
+  case ISD::CopyFromReg:
+  case ISD::CopyToReg:
+    NodeToMatch->setNodeId(-1); // Mark selected.
+    return 0;
+  case ISD::AssertSext:
+  case ISD::AssertZext:
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, 0),
+                                      NodeToMatch->getOperand(0));
+    return 0;
+  case ISD::UNDEF:     return Select_UNDEF(NodeToMatch);
+  }
+  
+  assert(!NodeToMatch->isMachineOpcode() && "Node already selected!");
+
+  // Set up the node stack with NodeToMatch as the only node on the stack.
+  SmallVector<SDValue, 8> NodeStack;
+  SDValue N = SDValue(NodeToMatch, 0);
+  NodeStack.push_back(N);
+
+  // MatchScopes - Scopes used when matching, if a match failure happens, this
+  // indicates where to continue checking.
+  SmallVector<MatchScope, 8> MatchScopes;
+  
+  // RecordedNodes - This is the set of nodes that have been recorded by the
+  // state machine.
+  SmallVector<SDValue, 8> RecordedNodes;
+  
+  // MatchedMemRefs - This is the set of MemRef's we've seen in the input
+  // pattern.
+  SmallVector<MachineMemOperand*, 2> MatchedMemRefs;
+  
+  // These are the current input chain and flag for use when generating nodes.
+  // Various Emit operations change these.  For example, emitting a copytoreg
+  // uses and updates these.
+  SDValue InputChain, InputFlag;
+  
+  // ChainNodesMatched - If a pattern matches nodes that have input/output
+  // chains, the OPC_EmitMergeInputChains operation is emitted which indicates
+  // which ones they are.  The result is captured into this list so that we can
+  // update the chain results when the pattern is complete.
+  SmallVector<SDNode*, 3> ChainNodesMatched;
+  SmallVector<SDNode*, 3> FlagResultNodesMatched;
+  
+  DEBUG(errs() << "ISEL: Starting pattern match on root node: ";
+        NodeToMatch->dump(CurDAG);
+        errs() << '\n');
+  
+  // Determine where to start the interpreter.  Normally we start at opcode #0,
+  // but if the state machine starts with an OPC_SwitchOpcode, then we
+  // accelerate the first lookup (which is guaranteed to be hot) with the
+  // OpcodeOffset table.
+  unsigned MatcherIndex = 0;
+  
+  if (!OpcodeOffset.empty()) {
+    // Already computed the OpcodeOffset table, just index into it.
+    if (N.getOpcode() < OpcodeOffset.size())
+      MatcherIndex = OpcodeOffset[N.getOpcode()];
+    DEBUG(errs() << "  Initial Opcode index to " << MatcherIndex << "\n");
+
+  } else if (MatcherTable[0] == OPC_SwitchOpcode) {
+    // Otherwise, the table isn't computed, but the state machine does start
+    // with an OPC_SwitchOpcode instruction.  Populate the table now, since this
+    // is the first time we're selecting an instruction.
+    unsigned Idx = 1;
+    while (1) {
+      // Get the size of this case.
+      unsigned CaseSize = MatcherTable[Idx++];
+      if (CaseSize & 128)
+        CaseSize = GetVBR(CaseSize, MatcherTable, Idx);
+      if (CaseSize == 0) break;
+
+      // Get the opcode, add the index to the table.
+      uint16_t Opc = MatcherTable[Idx++];
+      Opc |= (unsigned short)MatcherTable[Idx++] << 8;
+      if (Opc >= OpcodeOffset.size())
+        OpcodeOffset.resize((Opc+1)*2);
+      OpcodeOffset[Opc] = Idx;
+      Idx += CaseSize;
+    }
+
+    // Okay, do the lookup for the first opcode.
+    if (N.getOpcode() < OpcodeOffset.size())
+      MatcherIndex = OpcodeOffset[N.getOpcode()];
+  }
+  
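+  // Interpret the matcher table one opcode at a time until the pattern either
+  // matches or definitively fails.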
+  while (1) {
+    assert(MatcherIndex < TableSize && "Invalid index");
+#ifndef NDEBUG
+    unsigned CurrentOpcodeIndex = MatcherIndex;
+#endif
+    BuiltinOpcodes Opcode = (BuiltinOpcodes)MatcherTable[MatcherIndex++];
+    switch (Opcode) {
+    case OPC_Scope: {
+      // Okay, the semantics of this operation are that we should push a scope
+      // then evaluate the first child.  However, pushing a scope only to have
+      // the first check fail (which then pops it) is inefficient.  If we can
+      // determine immediately that the first check (or first several) will
+      // immediately fail, don't even bother pushing a scope for them.
+      unsigned FailIndex;
+      
+      while (1) {
+        unsigned NumToSkip = MatcherTable[MatcherIndex++];
+        if (NumToSkip & 128)
+          NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex);
+        // Found the end of the scope with no match.
+        if (NumToSkip == 0) {
+          FailIndex = 0;
+          break;
+        }
+        
+        FailIndex = MatcherIndex+NumToSkip;
+        
+        unsigned MatcherIndexOfPredicate = MatcherIndex;
+        (void)MatcherIndexOfPredicate; // silence warning.
+        
+        // If we can't evaluate this predicate without pushing a scope (e.g. if
+        // it is a 'MoveParent') or if the predicate succeeds on this node, we
+        // push the scope and evaluate the full predicate chain.
+        bool Result;
+        MatcherIndex = IsPredicateKnownToFail(MatcherTable, MatcherIndex, N,
+                                              Result, *this, RecordedNodes);
+        if (!Result)
+          break;
+        
+        DEBUG(errs() << "  Skipped scope entry (due to false predicate) at "
+                     << "index " << MatcherIndexOfPredicate
+                     << ", continuing at " << FailIndex << "\n");
+        ++NumDAGIselRetries;
+        
+        // Otherwise, we know that this case of the Scope is guaranteed to
+        // fail, so move on to the next case.
+        MatcherIndex = FailIndex;
+      }
+      
+      // If the whole scope failed to match, bail.
+      if (FailIndex == 0) break;
+      
+      // Push a MatchScope which indicates where to go if the first child fails
+      // to match.
+      MatchScope NewEntry;
+      NewEntry.FailIndex = FailIndex;
+      NewEntry.NodeStack.append(NodeStack.begin(), NodeStack.end());
+      NewEntry.NumRecordedNodes = RecordedNodes.size();
+      NewEntry.NumMatchedMemRefs = MatchedMemRefs.size();
+      NewEntry.InputChain = InputChain;
+      NewEntry.InputFlag = InputFlag;
+      NewEntry.HasChainNodesMatched = !ChainNodesMatched.empty();
+      NewEntry.HasFlagResultNodesMatched = !FlagResultNodesMatched.empty();
+      MatchScopes.push_back(NewEntry);
+      continue;
+    }
+    case OPC_RecordNode:
+      // Remember this node, it may end up being an operand in the pattern.
+      RecordedNodes.push_back(N);
+      continue;
+        
+    case OPC_RecordChild0: case OPC_RecordChild1:
+    case OPC_RecordChild2: case OPC_RecordChild3:
+    case OPC_RecordChild4: case OPC_RecordChild5:
+    case OPC_RecordChild6: case OPC_RecordChild7: {
+      unsigned ChildNo = Opcode-OPC_RecordChild0;
+      if (ChildNo >= N.getNumOperands())
+        break;  // Match fails if out of range child #.
+
+      RecordedNodes.push_back(N->getOperand(ChildNo));
+      continue;
+    }
+    case OPC_RecordMemRef:
+      MatchedMemRefs.push_back(cast<MemSDNode>(N)->getMemOperand());
+      continue;
+        
+    case OPC_CaptureFlagInput:
+      // If the current node has an input flag, capture it in InputFlag.
+      if (N->getNumOperands() != 0 &&
+          N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Flag)
+        InputFlag = N->getOperand(N->getNumOperands()-1);
+      continue;
+        
+    case OPC_MoveChild: {
+      unsigned ChildNo = MatcherTable[MatcherIndex++];
+      if (ChildNo >= N.getNumOperands())
+        break;  // Match fails if out of range child #.
+      N = N.getOperand(ChildNo);
+      NodeStack.push_back(N);
+      continue;
+    }
+        
+    case OPC_MoveParent:
+      // Pop the current node off the NodeStack.
+      NodeStack.pop_back();
+      assert(!NodeStack.empty() && "Node stack imbalance!");
+      N = NodeStack.back();  
+      continue;
+     
+    case OPC_CheckSame:
+      if (!::CheckSame(MatcherTable, MatcherIndex, N, RecordedNodes)) break;
+      continue;
+    case OPC_CheckPatternPredicate:
+      if (!::CheckPatternPredicate(MatcherTable, MatcherIndex, *this)) break;
+      continue;
+    case OPC_CheckPredicate:
+      if (!::CheckNodePredicate(MatcherTable, MatcherIndex, *this,
+                                N.getNode()))
+        break;
+      continue;
+    case OPC_CheckComplexPat: {
+      unsigned CPNum = MatcherTable[MatcherIndex++];
+      unsigned RecNo = MatcherTable[MatcherIndex++];
+      assert(RecNo < RecordedNodes.size() && "Invalid CheckComplexPat");
+      if (!CheckComplexPattern(NodeToMatch, RecordedNodes[RecNo], CPNum,
+                               RecordedNodes))
+        break;
+      continue;
+    }
+    case OPC_CheckOpcode:
+      if (!::CheckOpcode(MatcherTable, MatcherIndex, N.getNode())) break;
+      continue;
+        
+    case OPC_CheckType:
+      if (!::CheckType(MatcherTable, MatcherIndex, N, TLI)) break;
+      continue;
+        
+    case OPC_SwitchOpcode: {
+      unsigned CurNodeOpcode = N.getOpcode();
+      unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
+      unsigned CaseSize;
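+      // Each case appears to be laid out as [CaseSize (VBR)] [Opc low byte]
+      // [Opc high byte] [case body]; a CaseSize of 0 terminates the switch,
+      // mirroring the reads in the loop below.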
+      while (1) {
+        // Get the size of this case.
+        CaseSize = MatcherTable[MatcherIndex++];
+        if (CaseSize & 128)
+          CaseSize = GetVBR(CaseSize, MatcherTable, MatcherIndex);
+        if (CaseSize == 0) break;
+
+        uint16_t Opc = MatcherTable[MatcherIndex++];
+        Opc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
+
+        // If the opcode matches, then we will execute this case.
+        if (CurNodeOpcode == Opc)
+          break;
+      
+        // Otherwise, skip over this case.
+        MatcherIndex += CaseSize;
+      }
+      
+      // If no cases matched, bail out.
+      if (CaseSize == 0) break;
+      
+      // Otherwise, execute the case we found.
+      DEBUG(errs() << "  OpcodeSwitch from " << SwitchStart
+                   << " to " << MatcherIndex << "\n");
+      continue;
+    }
+        
+    case OPC_SwitchType: {
+      MVT::SimpleValueType CurNodeVT = N.getValueType().getSimpleVT().SimpleTy;
+      unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
+      unsigned CaseSize;
+      while (1) {
+        // Get the size of this case.
+        CaseSize = MatcherTable[MatcherIndex++];
+        if (CaseSize & 128)
+          CaseSize = GetVBR(CaseSize, MatcherTable, MatcherIndex);
+        if (CaseSize == 0) break;
+        
+        MVT::SimpleValueType CaseVT =
+          (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+        if (CaseVT == MVT::iPTR)
+          CaseVT = TLI.getPointerTy().SimpleTy;
+        
+        // If the VT matches, then we will execute this case.
+        if (CurNodeVT == CaseVT)
+          break;
+        
+        // Otherwise, skip over this case.
+        MatcherIndex += CaseSize;
+      }
+      
+      // If no cases matched, bail out.
+      if (CaseSize == 0) break;
+      
+      // Otherwise, execute the case we found.
+      DEBUG(errs() << "  TypeSwitch[" << EVT(CurNodeVT).getEVTString()
+                   << "] from " << SwitchStart << " to " << MatcherIndex<<'\n');
+      continue;
+    }
+    case OPC_CheckChild0Type: case OPC_CheckChild1Type:
+    case OPC_CheckChild2Type: case OPC_CheckChild3Type:
+    case OPC_CheckChild4Type: case OPC_CheckChild5Type:
+    case OPC_CheckChild6Type: case OPC_CheckChild7Type:
+      if (!::CheckChildType(MatcherTable, MatcherIndex, N, TLI,
+                            Opcode-OPC_CheckChild0Type))
+        break;
+      continue;
+    case OPC_CheckCondCode:
+      if (!::CheckCondCode(MatcherTable, MatcherIndex, N)) break;
+      continue;
+    case OPC_CheckValueType:
+      if (!::CheckValueType(MatcherTable, MatcherIndex, N, TLI)) break;
+      continue;
+    case OPC_CheckInteger:
+      if (!::CheckInteger(MatcherTable, MatcherIndex, N)) break;
+      continue;
+    case OPC_CheckAndImm:
+      if (!::CheckAndImm(MatcherTable, MatcherIndex, N, *this)) break;
+      continue;
+    case OPC_CheckOrImm:
+      if (!::CheckOrImm(MatcherTable, MatcherIndex, N, *this)) break;
+      continue;
+        
+    case OPC_CheckFoldableChainNode: {
+      assert(NodeStack.size() != 1 && "No parent node");
+      // Verify that all intermediate nodes between the root and this one have
+      // a single use.
+      bool HasMultipleUses = false;
+      for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i)
+        if (!NodeStack[i].hasOneUse()) {
+          HasMultipleUses = true;
+          break;
+        }
+      if (HasMultipleUses) break;
+
+      // Check to see that the target thinks this is profitable to fold and that
+      // we can fold it without inducing cycles in the graph.
+      if (!IsProfitableToFold(N, NodeStack[NodeStack.size()-2].getNode(),
+                              NodeToMatch) ||
+          !IsLegalToFold(N, NodeStack[NodeStack.size()-2].getNode(),
+                         NodeToMatch, OptLevel,
+                         true/*We validate our own chains*/))
+        break;
+      
+      continue;
+    }
+    case OPC_EmitInteger: {
+      MVT::SimpleValueType VT =
+        (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+      int64_t Val = MatcherTable[MatcherIndex++];
+      if (Val & 128)
+        Val = GetVBR(Val, MatcherTable, MatcherIndex);
+      RecordedNodes.push_back(CurDAG->getTargetConstant(Val, VT));
+      continue;
+    }
+    case OPC_EmitRegister: {
+      MVT::SimpleValueType VT =
+        (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+      unsigned RegNo = MatcherTable[MatcherIndex++];
+      RecordedNodes.push_back(CurDAG->getRegister(RegNo, VT));
+      continue;
+    }
+        
+    case OPC_EmitConvertToTarget:  {
+      // Convert from IMM/FPIMM to target version.
+      unsigned RecNo = MatcherTable[MatcherIndex++];
+      assert(RecNo < RecordedNodes.size() && "Invalid EmitConvertToTarget");
+      SDValue Imm = RecordedNodes[RecNo];
+
+      if (Imm->getOpcode() == ISD::Constant) {
+        int64_t Val = cast<ConstantSDNode>(Imm)->getZExtValue();
+        Imm = CurDAG->getTargetConstant(Val, Imm.getValueType());
+      } else if (Imm->getOpcode() == ISD::ConstantFP) {
+        const ConstantFP *Val=cast<ConstantFPSDNode>(Imm)->getConstantFPValue();
+        Imm = CurDAG->getTargetConstantFP(*Val, Imm.getValueType());
+      }
+      
+      RecordedNodes.push_back(Imm);
+      continue;
+    }
+        
+    case OPC_EmitMergeInputChains1_0:    // OPC_EmitMergeInputChains, 1, 0
+    case OPC_EmitMergeInputChains1_1: {  // OPC_EmitMergeInputChains, 1, 1
+      // These are space-optimized forms of OPC_EmitMergeInputChains.
+      assert(InputChain.getNode() == 0 &&
+             "EmitMergeInputChains should be the first chain producing node");
+      assert(ChainNodesMatched.empty() &&
+             "Should only have one EmitMergeInputChains per match");
+      
+      // Read all of the chained nodes.
+      unsigned RecNo = Opcode == OPC_EmitMergeInputChains1_1;
+      assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains");
+      ChainNodesMatched.push_back(RecordedNodes[RecNo].getNode());
+        
+      // FIXME: What if other value results of the node have uses not matched
+      // by this pattern?
+      if (ChainNodesMatched.back() != NodeToMatch &&
+          !RecordedNodes[RecNo].hasOneUse()) {
+        ChainNodesMatched.clear();
+        break;
+      }
+      
+      // Merge the input chains if they are not intra-pattern references.
+      InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG);
+      
+      if (InputChain.getNode() == 0)
+        break;  // Failed to merge.
+      continue;
+    }
+        
+    case OPC_EmitMergeInputChains: {
+      assert(InputChain.getNode() == 0 &&
+             "EmitMergeInputChains should be the first chain producing node");
+      // This node gets a list of nodes we matched in the input that have
+      // chains.  We want to token factor all of the input chains to these nodes
+      // together.  However, if any of the input chains is actually one of the
+      // nodes matched in this pattern, then we have an intra-match reference.
+      // Ignore these because the newly token factored chain should not refer to
+      // the old nodes.
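+      //
+      // Illustrative example: when matching (add (load A), (load B)), both
+      // loads are recorded chained nodes, and their input chains are token
+      // factored into the single InputChain fed to the new node.  If one
+      // load's chain were the other matched load itself, that would be an
+      // intra-match reference and must not appear in the new chain.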
+      unsigned NumChains = MatcherTable[MatcherIndex++];
+      assert(NumChains != 0 && "Can't TF zero chains");
+
+      assert(ChainNodesMatched.empty() &&
+             "Should only have one EmitMergeInputChains per match");
+
+      // Read all of the chained nodes.
+      for (unsigned i = 0; i != NumChains; ++i) {
+        unsigned RecNo = MatcherTable[MatcherIndex++];
+        assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains");
+        ChainNodesMatched.push_back(RecordedNodes[RecNo].getNode());
+        
+        // FIXME: What if other value results of the node have uses not matched
+        // by this pattern?
+        if (ChainNodesMatched.back() != NodeToMatch &&
+            !RecordedNodes[RecNo].hasOneUse()) {
+          ChainNodesMatched.clear();
+          break;
+        }
+      }
+      
+      // If the inner loop broke out, the match fails.
+      if (ChainNodesMatched.empty())
+        break;
+
+      // Merge the input chains if they are not intra-pattern references.
+      InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG);
+      
+      if (InputChain.getNode() == 0)
+        break;  // Failed to merge.
+
+      continue;
+    }
+        
+    case OPC_EmitCopyToReg: {
+      unsigned RecNo = MatcherTable[MatcherIndex++];
+      assert(RecNo < RecordedNodes.size() && "Invalid EmitCopyToReg");
+      unsigned DestPhysReg = MatcherTable[MatcherIndex++];
+      
+      if (InputChain.getNode() == 0)
+        InputChain = CurDAG->getEntryNode();
+      
+      InputChain = CurDAG->getCopyToReg(InputChain, NodeToMatch->getDebugLoc(),
+                                        DestPhysReg, RecordedNodes[RecNo],
+                                        InputFlag);
+      
+      InputFlag = InputChain.getValue(1);
+      continue;
+    }
+        
+    case OPC_EmitNodeXForm: {
+      unsigned XFormNo = MatcherTable[MatcherIndex++];
+      unsigned RecNo = MatcherTable[MatcherIndex++];
+      assert(RecNo < RecordedNodes.size() && "Invalid EmitNodeXForm");
+      RecordedNodes.push_back(RunSDNodeXForm(RecordedNodes[RecNo], XFormNo));
+      continue;
+    }
+        
+    case OPC_EmitNode:
+    case OPC_MorphNodeTo: {
+      uint16_t TargetOpc = MatcherTable[MatcherIndex++];
+      TargetOpc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
+      unsigned EmitNodeInfo = MatcherTable[MatcherIndex++];
+      // Get the result VT list.
+      unsigned NumVTs = MatcherTable[MatcherIndex++];
+      SmallVector<EVT, 4> VTs;
+      for (unsigned i = 0; i != NumVTs; ++i) {
+        MVT::SimpleValueType VT =
+          (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+        if (VT == MVT::iPTR) VT = TLI.getPointerTy().SimpleTy;
+        VTs.push_back(VT);
+      }
+      
+      if (EmitNodeInfo & OPFL_Chain)
+        VTs.push_back(MVT::Other);
+      if (EmitNodeInfo & OPFL_FlagOutput)
+        VTs.push_back(MVT::Flag);
+      
+      // This is hot code, so optimize the two most common cases of 1 and 2
+      // results.
+      SDVTList VTList;
+      if (VTs.size() == 1)
+        VTList = CurDAG->getVTList(VTs[0]);
+      else if (VTs.size() == 2)
+        VTList = CurDAG->getVTList(VTs[0], VTs[1]);
+      else
+        VTList = CurDAG->getVTList(VTs.data(), VTs.size());
+
+      // Get the operand list.
+      unsigned NumOps = MatcherTable[MatcherIndex++];
+      SmallVector<SDValue, 8> Ops;
+      for (unsigned i = 0; i != NumOps; ++i) {
+        unsigned RecNo = MatcherTable[MatcherIndex++];
+        if (RecNo & 128)
+          RecNo = GetVBR(RecNo, MatcherTable, MatcherIndex);
+        
+        assert(RecNo < RecordedNodes.size() && "Invalid EmitNode");
+        Ops.push_back(RecordedNodes[RecNo]);
+      }
+      
+      // If there are variadic operands to add, handle them now.
+      if (EmitNodeInfo & OPFL_VariadicInfo) {
+        // Determine the start index to copy from.
+        unsigned FirstOpToCopy = getNumFixedFromVariadicInfo(EmitNodeInfo);
+        FirstOpToCopy += (EmitNodeInfo & OPFL_Chain) ? 1 : 0;
+        assert(NodeToMatch->getNumOperands() >= FirstOpToCopy &&
+               "Invalid variadic node");
+        // Copy all of the variadic operands, not including a potential flag
+        // input.
+        for (unsigned i = FirstOpToCopy, e = NodeToMatch->getNumOperands();
+             i != e; ++i) {
+          SDValue V = NodeToMatch->getOperand(i);
+          if (V.getValueType() == MVT::Flag) break;
+          Ops.push_back(V);
+        }
+      }
+      
+      // If this has chain/flag inputs, add them.
+      if (EmitNodeInfo & OPFL_Chain)
+        Ops.push_back(InputChain);
+      if ((EmitNodeInfo & OPFL_FlagInput) && InputFlag.getNode() != 0)
+        Ops.push_back(InputFlag);
+      
+      // Create the node.
+      SDNode *Res = 0;
+      if (Opcode != OPC_MorphNodeTo) {
+        // If this is a normal EmitNode command, just create the new node and
+        // add the results to the RecordedNodes list.
+        Res = CurDAG->getMachineNode(TargetOpc, NodeToMatch->getDebugLoc(),
+                                     VTList, Ops.data(), Ops.size());
+        
+        // Add all the non-flag/non-chain results to the RecordedNodes list.
+        for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+          if (VTs[i] == MVT::Other || VTs[i] == MVT::Flag) break;
+          RecordedNodes.push_back(SDValue(Res, i));
+        }
+        
+      } else {
+        Res = MorphNode(NodeToMatch, TargetOpc, VTList, Ops.data(), Ops.size(),
+                        EmitNodeInfo);
+      }
+      
+      // If the node had chain/flag results, update our notion of the current
+      // chain and flag.
+      if (EmitNodeInfo & OPFL_FlagOutput) {
+        InputFlag = SDValue(Res, VTs.size()-1);
+        if (EmitNodeInfo & OPFL_Chain)
+          InputChain = SDValue(Res, VTs.size()-2);
+      } else if (EmitNodeInfo & OPFL_Chain)
+        InputChain = SDValue(Res, VTs.size()-1);
+
+      // If the OPFL_MemRefs flag is set on this node, slap all of the
+      // accumulated memrefs onto it.
+      //
+      // FIXME: This is vastly incorrect for patterns with multiple-output
+      // instructions that access memory, and for ComplexPatterns that match
+      // loads.
+      if (EmitNodeInfo & OPFL_MemRefs) {
+        MachineSDNode::mmo_iterator MemRefs =
+          MF->allocateMemRefsArray(MatchedMemRefs.size());
+        std::copy(MatchedMemRefs.begin(), MatchedMemRefs.end(), MemRefs);
+        cast<MachineSDNode>(Res)
+          ->setMemRefs(MemRefs, MemRefs + MatchedMemRefs.size());
+      }
+      
+      DEBUG(errs() << "  "
+                   << (Opcode == OPC_MorphNodeTo ? "Morphed" : "Created")
+                   << " node: "; Res->dump(CurDAG); errs() << "\n");
+      
+      // If this was a MorphNodeTo then we're completely done!
+      if (Opcode == OPC_MorphNodeTo) {
+        // Update chain and flag uses.
+        UpdateChainsAndFlags(NodeToMatch, InputChain, ChainNodesMatched,
+                             InputFlag, FlagResultNodesMatched, true);
+        return Res;
+      }
+      
+      continue;
+    }
+        
+    case OPC_MarkFlagResults: {
+      unsigned NumNodes = MatcherTable[MatcherIndex++];
+      
+      // Read and remember all the flag-result nodes.
+      for (unsigned i = 0; i != NumNodes; ++i) {
+        unsigned RecNo = MatcherTable[MatcherIndex++];
+        if (RecNo & 128)
+          RecNo = GetVBR(RecNo, MatcherTable, MatcherIndex);
+
+        assert(RecNo < RecordedNodes.size() && "Invalid MarkFlagResults");
+        FlagResultNodesMatched.push_back(RecordedNodes[RecNo].getNode());
+      }
+      continue;
+    }
+      
+    case OPC_CompleteMatch: {
+      // The match has been completed, and any new nodes (if any) have been
+      // created.  Patch up references to the matched dag to use the newly
+      // created nodes.
+      unsigned NumResults = MatcherTable[MatcherIndex++];
+
+      for (unsigned i = 0; i != NumResults; ++i) {
+        unsigned ResSlot = MatcherTable[MatcherIndex++];
+        if (ResSlot & 128)
+          ResSlot = GetVBR(ResSlot, MatcherTable, MatcherIndex);
+        
+        assert(ResSlot < RecordedNodes.size() && "Invalid CompleteMatch");
+        SDValue Res = RecordedNodes[ResSlot];
+        
+        assert(i < NodeToMatch->getNumValues() &&
+               NodeToMatch->getValueType(i) != MVT::Other &&
+               NodeToMatch->getValueType(i) != MVT::Flag &&
+               "Invalid number of results to complete!");
+        assert((NodeToMatch->getValueType(i) == Res.getValueType() ||
+                NodeToMatch->getValueType(i) == MVT::iPTR ||
+                Res.getValueType() == MVT::iPTR ||
+                NodeToMatch->getValueType(i).getSizeInBits() ==
+                    Res.getValueType().getSizeInBits()) &&
+               "invalid replacement");
+        CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, i), Res);
+      }
+
+      // If the root node defines a flag, add it to the flag nodes to update
+      // list.
+      if (NodeToMatch->getValueType(NodeToMatch->getNumValues()-1) == MVT::Flag)
+        FlagResultNodesMatched.push_back(NodeToMatch);
+      
+      // Update chain and flag uses.
+      UpdateChainsAndFlags(NodeToMatch, InputChain, ChainNodesMatched,
+                           InputFlag, FlagResultNodesMatched, false);
+      
+      assert(NodeToMatch->use_empty() &&
+             "Didn't replace all uses of the node?");
+      
+      // FIXME: We just return here, which interacts correctly with SelectRoot
+      // above.  We should fix this to not return an SDNode* anymore.
+      return 0;
+    }
+    }
+    
+    // If the code reached this point, then the match failed.  See if there is
+    // another child to try in the current 'Scope', otherwise pop it until we
+    // find a case to check.
+    DEBUG(errs() << "  Match failed at index " << CurrentOpcodeIndex << "\n");
+    ++NumDAGIselRetries;
+    while (1) {
+      if (MatchScopes.empty()) {
+        CannotYetSelect(NodeToMatch);
+        return 0;
+      }
+
+      // Restore the interpreter state back to the point where the scope was
+      // formed.
+      MatchScope &LastScope = MatchScopes.back();
+      RecordedNodes.resize(LastScope.NumRecordedNodes);
+      NodeStack.clear();
+      NodeStack.append(LastScope.NodeStack.begin(), LastScope.NodeStack.end());
+      N = NodeStack.back();
+
+      if (LastScope.NumMatchedMemRefs != MatchedMemRefs.size())
+        MatchedMemRefs.resize(LastScope.NumMatchedMemRefs);
+      MatcherIndex = LastScope.FailIndex;
+      
+      DEBUG(errs() << "  Continuing at " << MatcherIndex << "\n");
+    
+      InputChain = LastScope.InputChain;
+      InputFlag = LastScope.InputFlag;
+      if (!LastScope.HasChainNodesMatched)
+        ChainNodesMatched.clear();
+      if (!LastScope.HasFlagResultNodesMatched)
+        FlagResultNodesMatched.clear();
+
+      // Check to see what the offset is at the new MatcherIndex.  If it is zero
+      // we have reached the end of this scope, otherwise we have another child
+      // in the current scope to try.
+      unsigned NumToSkip = MatcherTable[MatcherIndex++];
+      if (NumToSkip & 128)
+        NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex);
+
+      // If we have another child in this scope to match, update FailIndex and
+      // try it.
+      if (NumToSkip != 0) {
+        LastScope.FailIndex = MatcherIndex+NumToSkip;
+        break;
+      }
+      
+      // End of this scope, pop it and try the next child in the containing
+      // scope.
+      MatchScopes.pop_back();
+    }
+  }
+}
+    
+
+
+void SelectionDAGISel::CannotYetSelect(SDNode *N) {
+  std::string msg;
+  raw_string_ostream Msg(msg);
+  Msg << "Cannot yet select: ";
+  
+  if (N->getOpcode() != ISD::INTRINSIC_W_CHAIN &&
+      N->getOpcode() != ISD::INTRINSIC_WO_CHAIN &&
+      N->getOpcode() != ISD::INTRINSIC_VOID) {
+    N->printrFull(Msg, CurDAG);
+  } else {
+    bool HasInputChain = N->getOperand(0).getValueType() == MVT::Other;
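+    // The intrinsic ID is operand 0, or operand 1 if the node has an input
+    // chain, so the bool doubles as the operand index.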
+    unsigned iid =
+      cast<ConstantSDNode>(N->getOperand(HasInputChain))->getZExtValue();
+    if (iid < Intrinsic::num_intrinsics)
+      Msg << "intrinsic %" << Intrinsic::getName((Intrinsic::ID)iid);
+    else if (const TargetIntrinsicInfo *TII = TM.getIntrinsicInfo())
+      Msg << "target intrinsic %" << TII->getName(iid);
+    else
+      Msg << "unknown intrinsic #" << iid;
+  }
+  report_fatal_error(Msg.str());
+}
+
+char SelectionDAGISel::ID = 0;
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
new file mode 100644
index 0000000..8313de5
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -0,0 +1,301 @@
+//===-- SelectionDAGPrinter.cpp - Implement SelectionDAG::viewGraph() -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAG::viewGraph method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Config/config.h"
+using namespace llvm;
+
+namespace llvm {
+  template<>
+  struct DOTGraphTraits<SelectionDAG*> : public DefaultDOTGraphTraits {
+
+    explicit DOTGraphTraits(bool isSimple=false) :
+      DefaultDOTGraphTraits(isSimple) {}
+
+    static bool hasEdgeDestLabels() {
+      return true;
+    }
+
+    static unsigned numEdgeDestLabels(const void *Node) {
+      return ((const SDNode *) Node)->getNumValues();
+    }
+
+    static std::string getEdgeDestLabel(const void *Node, unsigned i) {
+      return ((const SDNode *) Node)->getValueType(i).getEVTString();
+    }
+
+    template<typename EdgeIter>
+    static std::string getEdgeSourceLabel(const void *Node, EdgeIter I) {
+      return itostr(I - SDNodeIterator::begin((SDNode *) Node));
+    }
+
+    /// edgeTargetsEdgeSource - This method returns true if this outgoing edge
+    /// should actually target another edge source, not a node.  If this method
+    /// is implemented, getEdgeTarget should be implemented.
+    template<typename EdgeIter>
+    static bool edgeTargetsEdgeSource(const void *Node, EdgeIter I) {
+      return true;
+    }
+
+    /// getEdgeTarget - If edgeTargetsEdgeSource returns true, this method is
+    /// called to determine which outgoing edge of Node is the target of this
+    /// edge.
+    template<typename EdgeIter>
+    static EdgeIter getEdgeTarget(const void *Node, EdgeIter I) {
+      SDNode *TargetNode = *I;
+      SDNodeIterator NI = SDNodeIterator::begin(TargetNode);
+      std::advance(NI, I.getNode()->getOperand(I.getOperand()).getResNo());
+      return NI;
+    }
+
+    static std::string getGraphName(const SelectionDAG *G) {
+      return G->getMachineFunction().getFunction()->getName();
+    }
+
+    static bool renderGraphFromBottomUp() {
+      return true;
+    }
+
+    static bool hasNodeAddressLabel(const SDNode *Node,
+                                    const SelectionDAG *Graph) {
+      return true;
+    }
+
+    /// If you want to override the dot attributes printed for a particular
+    /// edge, override this method.
+    template<typename EdgeIter>
+    static std::string getEdgeAttributes(const void *Node, EdgeIter EI) {
+      SDValue Op = EI.getNode()->getOperand(EI.getOperand());
+      EVT VT = Op.getValueType();
+      if (VT == MVT::Flag)
+        return "color=red,style=bold";
+      else if (VT == MVT::Other)
+        return "color=blue,style=dashed";
+      return "";
+    }
+
+
+    static std::string getSimpleNodeLabel(const SDNode *Node,
+                                          const SelectionDAG *G) {
+      std::string Result = Node->getOperationName(G);
+      {
+        raw_string_ostream OS(Result);
+        Node->print_details(OS, G);
+      }
+      return Result;
+    }
+    std::string getNodeLabel(const SDNode *Node, const SelectionDAG *Graph);
+    static std::string getNodeAttributes(const SDNode *N,
+                                         const SelectionDAG *Graph) {
+#ifndef NDEBUG
+      const std::string &Attrs = Graph->getGraphAttrs(N);
+      if (!Attrs.empty()) {
+        if (Attrs.find("shape=") == std::string::npos)
+          return std::string("shape=Mrecord,") + Attrs;
+        else
+          return Attrs;
+      }
+#endif
+      return "shape=Mrecord";
+    }
+
+    static void addCustomGraphFeatures(SelectionDAG *G,
+                                       GraphWriter<SelectionDAG*> &GW) {
+      GW.emitSimpleNode(0, "plaintext=circle", "GraphRoot");
+      if (G->getRoot().getNode())
+        GW.emitEdge(0, -1, G->getRoot().getNode(), G->getRoot().getResNo(),
+                    "color=blue,style=dashed");
+    }
+  };
+}
+
+std::string DOTGraphTraits<SelectionDAG*>::getNodeLabel(const SDNode *Node,
+                                                        const SelectionDAG *G) {
+  return DOTGraphTraits<SelectionDAG*>::getSimpleNodeLabel(Node, G);
+}
+
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void SelectionDAG::viewGraph(const std::string &Title) {
+// This code is only for debugging!
+#ifndef NDEBUG
+  ViewGraph(this, "dag." + getMachineFunction().getFunction()->getNameStr(),
+            false, Title);
+#else
+  errs() << "SelectionDAG::viewGraph is only available in debug builds on "
+         << "systems with Graphviz or gv!\n";
+#endif  // NDEBUG
+}
+
+// This overload is defined out-of-line here instead of just using a
+// default parameter because this is easiest for gdb to call.
+void SelectionDAG::viewGraph() {
+  viewGraph("");
+}
+
+/// clearGraphAttrs - Clear all previously defined node graph attributes.
+/// Intended to be used from a debugging tool (e.g. gdb).
+void SelectionDAG::clearGraphAttrs() {
+#ifndef NDEBUG
+  NodeGraphAttrs.clear();
+#else
+  errs() << "SelectionDAG::clearGraphAttrs is only available in debug builds"
+         << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+
+/// setGraphAttrs - Set graph attributes for a node. (e.g. "color=red".)
+///
+void SelectionDAG::setGraphAttrs(const SDNode *N, const char *Attrs) {
+#ifndef NDEBUG
+  NodeGraphAttrs[N] = Attrs;
+#else
+  errs() << "SelectionDAG::setGraphAttrs is only available in debug builds"
+         << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+
+/// getGraphAttrs - Get graph attributes for a node. (e.g. "color=red".)
+/// Used from getNodeAttributes.
+const std::string SelectionDAG::getGraphAttrs(const SDNode *N) const {
+#ifndef NDEBUG
+  std::map<const SDNode *, std::string>::const_iterator I =
+    NodeGraphAttrs.find(N);
+
+  if (I != NodeGraphAttrs.end())
+    return I->second;
+  else
+    return "";
+#else
+  errs() << "SelectionDAG::getGraphAttrs is only available in debug builds"
+         << " on systems with Graphviz or gv!\n";
+  return std::string();
+#endif
+}
+
+/// setGraphColor - Convenience for setting node color attribute.
+///
+void SelectionDAG::setGraphColor(const SDNode *N, const char *Color) {
+#ifndef NDEBUG
+  NodeGraphAttrs[N] = std::string("color=") + Color;
+#else
+  errs() << "SelectionDAG::setGraphColor is only available in debug builds"
+         << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+/// setSubgraphColorHelper - Implement setSubgraphColor.  Return
+/// whether we truncated the search.
+///
+bool SelectionDAG::setSubgraphColorHelper(SDNode *N, const char *Color,
+                                          DenseSet<SDNode *> &visited,
+                                          int level, bool &printed) {
+  bool hit_limit = false;
+
+#ifndef NDEBUG
+  if (level >= 20) {
+    if (!printed) {
+      printed = true;
+      DEBUG(dbgs() << "setSubgraphColor hit max level\n");
+    }
+    return true;
+  }
+
+  unsigned oldSize = visited.size();
+  visited.insert(N);
+  if (visited.size() != oldSize) {
+    setGraphColor(N, Color);
+    for (SDNodeIterator i = SDNodeIterator::begin(N),
+                        iend = SDNodeIterator::end(N); i != iend; ++i) {
+      hit_limit = setSubgraphColorHelper(*i, Color, visited, level+1, printed)
+                  || hit_limit;
+    }
+  }
+#else
+  errs() << "SelectionDAG::setSubgraphColor is only available in debug builds"
+         << " on systems with Graphviz or gv!\n";
+#endif
+  return hit_limit;
+}
+
+/// setSubgraphColor - Convenience for setting subgraph color attribute.
+///
+void SelectionDAG::setSubgraphColor(SDNode *N, const char *Color) {
+#ifndef NDEBUG
+  DenseSet<SDNode *> visited;
+  bool printed = false;
+  if (setSubgraphColorHelper(N, Color, visited, 0, printed)) {
+    // Visually mark that we hit the limit
+    if (strcmp(Color, "red") == 0) {
+      setSubgraphColorHelper(N, "blue", visited, 0, printed);
+    } else if (strcmp(Color, "yellow") == 0) {
+      setSubgraphColorHelper(N, "green", visited, 0, printed);
+    }
+  }
+
+#else
+  errs() << "SelectionDAG::setSubgraphColor is only available in debug builds"
+         << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+std::string ScheduleDAGSDNodes::getGraphNodeLabel(const SUnit *SU) const {
+  std::string s;
+  raw_string_ostream O(s);
+  O << "SU(" << SU->NodeNum << "): ";
+  if (SU->getNode()) {
+    SmallVector<SDNode *, 4> FlaggedNodes;
+    for (SDNode *N = SU->getNode(); N; N = N->getFlaggedNode())
+      FlaggedNodes.push_back(N);
+    while (!FlaggedNodes.empty()) {
+      O << DOTGraphTraits<SelectionDAG*>
+        ::getSimpleNodeLabel(FlaggedNodes.back(), DAG);
+      FlaggedNodes.pop_back();
+      if (!FlaggedNodes.empty())
+        O << "\n    ";
+    }
+  } else {
+    O << "CROSS RC COPY";
+  }
+  return O.str();
+}
+
+void ScheduleDAGSDNodes::getCustomGraphFeatures(GraphWriter<ScheduleDAG*> &GW) const {
+  if (DAG) {
+    // Draw a special "GraphRoot" node to indicate the root of the graph.
+    GW.emitSimpleNode(0, "plaintext=circle", "GraphRoot");
+    const SDNode *N = DAG->getRoot().getNode();
+    if (N && N->getNodeId() != -1)
+      GW.emitEdge(0, -1, &SUnits[N->getNodeId()], -1,
+                  "color=blue,style=dashed");
+  }
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/TargetLowering.cpp
new file mode 100644
index 0000000..c6a4b71
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -0,0 +1,2907 @@
+//===-- TargetLowering.cpp - Implement the TargetLowering class -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+namespace llvm {
+TLSModel::Model getTLSModel(const GlobalValue *GV, Reloc::Model reloc) {
+  bool isLocal = GV->hasLocalLinkage();
+  bool isDeclaration = GV->isDeclaration();
+  // FIXME: what should we do for protected and internal visibility?
+  // For variables, is internal different from hidden?
+  bool isHidden = GV->hasHiddenVisibility();
+
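+  // Summary of the branches below: PIC code prefers LocalDynamic for
+  // locally binding symbols and GeneralDynamic otherwise; non-PIC code
+  // prefers LocalExec for symbols defined (or hidden) in this module and
+  // InitialExec otherwise.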
+  if (reloc == Reloc::PIC_) {
+    if (isLocal || isHidden)
+      return TLSModel::LocalDynamic;
+    else
+      return TLSModel::GeneralDynamic;
+  } else {
+    if (!isDeclaration || isHidden)
+      return TLSModel::LocalExec;
+    else
+      return TLSModel::InitialExec;
+  }
+}
+}
+
+/// InitLibcallNames - Set default libcall names.
+///
+static void InitLibcallNames(const char **Names) {
+  Names[RTLIB::SHL_I16] = "__ashlhi3";
+  Names[RTLIB::SHL_I32] = "__ashlsi3";
+  Names[RTLIB::SHL_I64] = "__ashldi3";
+  Names[RTLIB::SHL_I128] = "__ashlti3";
+  Names[RTLIB::SRL_I16] = "__lshrhi3";
+  Names[RTLIB::SRL_I32] = "__lshrsi3";
+  Names[RTLIB::SRL_I64] = "__lshrdi3";
+  Names[RTLIB::SRL_I128] = "__lshrti3";
+  Names[RTLIB::SRA_I16] = "__ashrhi3";
+  Names[RTLIB::SRA_I32] = "__ashrsi3";
+  Names[RTLIB::SRA_I64] = "__ashrdi3";
+  Names[RTLIB::SRA_I128] = "__ashrti3";
+  Names[RTLIB::MUL_I8] = "__mulqi3";
+  Names[RTLIB::MUL_I16] = "__mulhi3";
+  Names[RTLIB::MUL_I32] = "__mulsi3";
+  Names[RTLIB::MUL_I64] = "__muldi3";
+  Names[RTLIB::MUL_I128] = "__multi3";
+  Names[RTLIB::SDIV_I8] = "__divqi3";
+  Names[RTLIB::SDIV_I16] = "__divhi3";
+  Names[RTLIB::SDIV_I32] = "__divsi3";
+  Names[RTLIB::SDIV_I64] = "__divdi3";
+  Names[RTLIB::SDIV_I128] = "__divti3";
+  Names[RTLIB::UDIV_I8] = "__udivqi3";
+  Names[RTLIB::UDIV_I16] = "__udivhi3";
+  Names[RTLIB::UDIV_I32] = "__udivsi3";
+  Names[RTLIB::UDIV_I64] = "__udivdi3";
+  Names[RTLIB::UDIV_I128] = "__udivti3";
+  Names[RTLIB::SREM_I8] = "__modqi3";
+  Names[RTLIB::SREM_I16] = "__modhi3";
+  Names[RTLIB::SREM_I32] = "__modsi3";
+  Names[RTLIB::SREM_I64] = "__moddi3";
+  Names[RTLIB::SREM_I128] = "__modti3";
+  Names[RTLIB::UREM_I8] = "__umodqi3";
+  Names[RTLIB::UREM_I16] = "__umodhi3";
+  Names[RTLIB::UREM_I32] = "__umodsi3";
+  Names[RTLIB::UREM_I64] = "__umoddi3";
+  Names[RTLIB::UREM_I128] = "__umodti3";
+  Names[RTLIB::NEG_I32] = "__negsi2";
+  Names[RTLIB::NEG_I64] = "__negdi2";
+  Names[RTLIB::ADD_F32] = "__addsf3";
+  Names[RTLIB::ADD_F64] = "__adddf3";
+  Names[RTLIB::ADD_F80] = "__addxf3";
+  Names[RTLIB::ADD_PPCF128] = "__gcc_qadd";
+  Names[RTLIB::SUB_F32] = "__subsf3";
+  Names[RTLIB::SUB_F64] = "__subdf3";
+  Names[RTLIB::SUB_F80] = "__subxf3";
+  Names[RTLIB::SUB_PPCF128] = "__gcc_qsub";
+  Names[RTLIB::MUL_F32] = "__mulsf3";
+  Names[RTLIB::MUL_F64] = "__muldf3";
+  Names[RTLIB::MUL_F80] = "__mulxf3";
+  Names[RTLIB::MUL_PPCF128] = "__gcc_qmul";
+  Names[RTLIB::DIV_F32] = "__divsf3";
+  Names[RTLIB::DIV_F64] = "__divdf3";
+  Names[RTLIB::DIV_F80] = "__divxf3";
+  Names[RTLIB::DIV_PPCF128] = "__gcc_qdiv";
+  Names[RTLIB::REM_F32] = "fmodf";
+  Names[RTLIB::REM_F64] = "fmod";
+  Names[RTLIB::REM_F80] = "fmodl";
+  Names[RTLIB::REM_PPCF128] = "fmodl";
+  Names[RTLIB::POWI_F32] = "__powisf2";
+  Names[RTLIB::POWI_F64] = "__powidf2";
+  Names[RTLIB::POWI_F80] = "__powixf2";
+  Names[RTLIB::POWI_PPCF128] = "__powitf2";
+  Names[RTLIB::SQRT_F32] = "sqrtf";
+  Names[RTLIB::SQRT_F64] = "sqrt";
+  Names[RTLIB::SQRT_F80] = "sqrtl";
+  Names[RTLIB::SQRT_PPCF128] = "sqrtl";
+  Names[RTLIB::LOG_F32] = "logf";
+  Names[RTLIB::LOG_F64] = "log";
+  Names[RTLIB::LOG_F80] = "logl";
+  Names[RTLIB::LOG_PPCF128] = "logl";
+  Names[RTLIB::LOG2_F32] = "log2f";
+  Names[RTLIB::LOG2_F64] = "log2";
+  Names[RTLIB::LOG2_F80] = "log2l";
+  Names[RTLIB::LOG2_PPCF128] = "log2l";
+  Names[RTLIB::LOG10_F32] = "log10f";
+  Names[RTLIB::LOG10_F64] = "log10";
+  Names[RTLIB::LOG10_F80] = "log10l";
+  Names[RTLIB::LOG10_PPCF128] = "log10l";
+  Names[RTLIB::EXP_F32] = "expf";
+  Names[RTLIB::EXP_F64] = "exp";
+  Names[RTLIB::EXP_F80] = "expl";
+  Names[RTLIB::EXP_PPCF128] = "expl";
+  Names[RTLIB::EXP2_F32] = "exp2f";
+  Names[RTLIB::EXP2_F64] = "exp2";
+  Names[RTLIB::EXP2_F80] = "exp2l";
+  Names[RTLIB::EXP2_PPCF128] = "exp2l";
+  Names[RTLIB::SIN_F32] = "sinf";
+  Names[RTLIB::SIN_F64] = "sin";
+  Names[RTLIB::SIN_F80] = "sinl";
+  Names[RTLIB::SIN_PPCF128] = "sinl";
+  Names[RTLIB::COS_F32] = "cosf";
+  Names[RTLIB::COS_F64] = "cos";
+  Names[RTLIB::COS_F80] = "cosl";
+  Names[RTLIB::COS_PPCF128] = "cosl";
+  Names[RTLIB::POW_F32] = "powf";
+  Names[RTLIB::POW_F64] = "pow";
+  Names[RTLIB::POW_F80] = "powl";
+  Names[RTLIB::POW_PPCF128] = "powl";
+  Names[RTLIB::CEIL_F32] = "ceilf";
+  Names[RTLIB::CEIL_F64] = "ceil";
+  Names[RTLIB::CEIL_F80] = "ceill";
+  Names[RTLIB::CEIL_PPCF128] = "ceill";
+  Names[RTLIB::TRUNC_F32] = "truncf";
+  Names[RTLIB::TRUNC_F64] = "trunc";
+  Names[RTLIB::TRUNC_F80] = "truncl";
+  Names[RTLIB::TRUNC_PPCF128] = "truncl";
+  Names[RTLIB::RINT_F32] = "rintf";
+  Names[RTLIB::RINT_F64] = "rint";
+  Names[RTLIB::RINT_F80] = "rintl";
+  Names[RTLIB::RINT_PPCF128] = "rintl";
+  Names[RTLIB::NEARBYINT_F32] = "nearbyintf";
+  Names[RTLIB::NEARBYINT_F64] = "nearbyint";
+  Names[RTLIB::NEARBYINT_F80] = "nearbyintl";
+  Names[RTLIB::NEARBYINT_PPCF128] = "nearbyintl";
+  Names[RTLIB::FLOOR_F32] = "floorf";
+  Names[RTLIB::FLOOR_F64] = "floor";
+  Names[RTLIB::FLOOR_F80] = "floorl";
+  Names[RTLIB::FLOOR_PPCF128] = "floorl";
+  Names[RTLIB::COPYSIGN_F32] = "copysignf";
+  Names[RTLIB::COPYSIGN_F64] = "copysign";
+  Names[RTLIB::COPYSIGN_F80] = "copysignl";
+  Names[RTLIB::COPYSIGN_PPCF128] = "copysignl";
+  Names[RTLIB::FPEXT_F32_F64] = "__extendsfdf2";
+  Names[RTLIB::FPEXT_F16_F32] = "__gnu_h2f_ieee";
+  Names[RTLIB::FPROUND_F32_F16] = "__gnu_f2h_ieee";
+  Names[RTLIB::FPROUND_F64_F32] = "__truncdfsf2";
+  Names[RTLIB::FPROUND_F80_F32] = "__truncxfsf2";
+  Names[RTLIB::FPROUND_PPCF128_F32] = "__trunctfsf2";
+  Names[RTLIB::FPROUND_F80_F64] = "__truncxfdf2";
+  Names[RTLIB::FPROUND_PPCF128_F64] = "__trunctfdf2";
+  Names[RTLIB::FPTOSINT_F32_I8] = "__fixsfqi";
+  Names[RTLIB::FPTOSINT_F32_I16] = "__fixsfhi";
+  Names[RTLIB::FPTOSINT_F32_I32] = "__fixsfsi";
+  Names[RTLIB::FPTOSINT_F32_I64] = "__fixsfdi";
+  Names[RTLIB::FPTOSINT_F32_I128] = "__fixsfti";
+  Names[RTLIB::FPTOSINT_F64_I8] = "__fixdfqi";
+  Names[RTLIB::FPTOSINT_F64_I16] = "__fixdfhi";
+  Names[RTLIB::FPTOSINT_F64_I32] = "__fixdfsi";
+  Names[RTLIB::FPTOSINT_F64_I64] = "__fixdfdi";
+  Names[RTLIB::FPTOSINT_F64_I128] = "__fixdfti";
+  Names[RTLIB::FPTOSINT_F80_I32] = "__fixxfsi";
+  Names[RTLIB::FPTOSINT_F80_I64] = "__fixxfdi";
+  Names[RTLIB::FPTOSINT_F80_I128] = "__fixxfti";
+  Names[RTLIB::FPTOSINT_PPCF128_I32] = "__fixtfsi";
+  Names[RTLIB::FPTOSINT_PPCF128_I64] = "__fixtfdi";
+  Names[RTLIB::FPTOSINT_PPCF128_I128] = "__fixtfti";
+  Names[RTLIB::FPTOUINT_F32_I8] = "__fixunssfqi";
+  Names[RTLIB::FPTOUINT_F32_I16] = "__fixunssfhi";
+  Names[RTLIB::FPTOUINT_F32_I32] = "__fixunssfsi";
+  Names[RTLIB::FPTOUINT_F32_I64] = "__fixunssfdi";
+  Names[RTLIB::FPTOUINT_F32_I128] = "__fixunssfti";
+  Names[RTLIB::FPTOUINT_F64_I8] = "__fixunsdfqi";
+  Names[RTLIB::FPTOUINT_F64_I16] = "__fixunsdfhi";
+  Names[RTLIB::FPTOUINT_F64_I32] = "__fixunsdfsi";
+  Names[RTLIB::FPTOUINT_F64_I64] = "__fixunsdfdi";
+  Names[RTLIB::FPTOUINT_F64_I128] = "__fixunsdfti";
+  Names[RTLIB::FPTOUINT_F80_I32] = "__fixunsxfsi";
+  Names[RTLIB::FPTOUINT_F80_I64] = "__fixunsxfdi";
+  Names[RTLIB::FPTOUINT_F80_I128] = "__fixunsxfti";
+  Names[RTLIB::FPTOUINT_PPCF128_I32] = "__fixunstfsi";
+  Names[RTLIB::FPTOUINT_PPCF128_I64] = "__fixunstfdi";
+  Names[RTLIB::FPTOUINT_PPCF128_I128] = "__fixunstfti";
+  Names[RTLIB::SINTTOFP_I32_F32] = "__floatsisf";
+  Names[RTLIB::SINTTOFP_I32_F64] = "__floatsidf";
+  Names[RTLIB::SINTTOFP_I32_F80] = "__floatsixf";
+  Names[RTLIB::SINTTOFP_I32_PPCF128] = "__floatsitf";
+  Names[RTLIB::SINTTOFP_I64_F32] = "__floatdisf";
+  Names[RTLIB::SINTTOFP_I64_F64] = "__floatdidf";
+  Names[RTLIB::SINTTOFP_I64_F80] = "__floatdixf";
+  Names[RTLIB::SINTTOFP_I64_PPCF128] = "__floatditf";
+  Names[RTLIB::SINTTOFP_I128_F32] = "__floattisf";
+  Names[RTLIB::SINTTOFP_I128_F64] = "__floattidf";
+  Names[RTLIB::SINTTOFP_I128_F80] = "__floattixf";
+  Names[RTLIB::SINTTOFP_I128_PPCF128] = "__floattitf";
+  Names[RTLIB::UINTTOFP_I32_F32] = "__floatunsisf";
+  Names[RTLIB::UINTTOFP_I32_F64] = "__floatunsidf";
+  Names[RTLIB::UINTTOFP_I32_F80] = "__floatunsixf";
+  Names[RTLIB::UINTTOFP_I32_PPCF128] = "__floatunsitf";
+  Names[RTLIB::UINTTOFP_I64_F32] = "__floatundisf";
+  Names[RTLIB::UINTTOFP_I64_F64] = "__floatundidf";
+  Names[RTLIB::UINTTOFP_I64_F80] = "__floatundixf";
+  Names[RTLIB::UINTTOFP_I64_PPCF128] = "__floatunditf";
+  Names[RTLIB::UINTTOFP_I128_F32] = "__floatuntisf";
+  Names[RTLIB::UINTTOFP_I128_F64] = "__floatuntidf";
+  Names[RTLIB::UINTTOFP_I128_F80] = "__floatuntixf";
+  Names[RTLIB::UINTTOFP_I128_PPCF128] = "__floatuntitf";
+  Names[RTLIB::OEQ_F32] = "__eqsf2";
+  Names[RTLIB::OEQ_F64] = "__eqdf2";
+  Names[RTLIB::UNE_F32] = "__nesf2";
+  Names[RTLIB::UNE_F64] = "__nedf2";
+  Names[RTLIB::OGE_F32] = "__gesf2";
+  Names[RTLIB::OGE_F64] = "__gedf2";
+  Names[RTLIB::OLT_F32] = "__ltsf2";
+  Names[RTLIB::OLT_F64] = "__ltdf2";
+  Names[RTLIB::OLE_F32] = "__lesf2";
+  Names[RTLIB::OLE_F64] = "__ledf2";
+  Names[RTLIB::OGT_F32] = "__gtsf2";
+  Names[RTLIB::OGT_F64] = "__gtdf2";
+  Names[RTLIB::UO_F32] = "__unordsf2";
+  Names[RTLIB::UO_F64] = "__unorddf2";
+  Names[RTLIB::O_F32] = "__unordsf2";
+  Names[RTLIB::O_F64] = "__unorddf2";
+  Names[RTLIB::MEMCPY] = "memcpy";
+  Names[RTLIB::MEMMOVE] = "memmove";
+  Names[RTLIB::MEMSET] = "memset";
+  Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = "__sync_val_compare_and_swap_4";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = "__sync_val_compare_and_swap_8";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = "__sync_lock_test_and_set_1";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = "__sync_lock_test_and_set_2";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = "__sync_lock_test_and_set_4";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = "__sync_lock_test_and_set_8";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_1] = "__sync_fetch_and_add_1";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_2] = "__sync_fetch_and_add_2";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_4] = "__sync_fetch_and_add_4";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_8] = "__sync_fetch_and_add_8";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_1] = "__sync_fetch_and_sub_1";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_2] = "__sync_fetch_and_sub_2";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_4] = "__sync_fetch_and_sub_4";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_8] = "__sync_fetch_and_sub_8";
+  Names[RTLIB::SYNC_FETCH_AND_AND_1] = "__sync_fetch_and_and_1";
+  Names[RTLIB::SYNC_FETCH_AND_AND_2] = "__sync_fetch_and_and_2";
+  Names[RTLIB::SYNC_FETCH_AND_AND_4] = "__sync_fetch_and_and_4";
+  Names[RTLIB::SYNC_FETCH_AND_AND_8] = "__sync_fetch_and_and_8";
+  Names[RTLIB::SYNC_FETCH_AND_OR_1] = "__sync_fetch_and_or_1";
+  Names[RTLIB::SYNC_FETCH_AND_OR_2] = "__sync_fetch_and_or_2";
+  Names[RTLIB::SYNC_FETCH_AND_OR_4] = "__sync_fetch_and_or_4";
+  Names[RTLIB::SYNC_FETCH_AND_OR_8] = "__sync_fetch_and_or_8";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_1] = "__sync_fetch_and_xor_1";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_2] = "__sync_fetch_and_xor_2";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and_xor_4";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_8] = "__sync_fetch_and_xor_8";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_1] = "__sync_fetch_and_nand_1";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_2] = "__sync_fetch_and_nand_2";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_4] = "__sync_fetch_and_nand_4";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_8] = "__sync_fetch_and_nand_8";
+}
+
+/// InitLibcallCallingConvs - Set default libcall CallingConvs.
+///
+static void InitLibcallCallingConvs(CallingConv::ID *CCs) {
+  for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
+    CCs[i] = CallingConv::C;
+  }
+}
+
+/// getFPEXT - Return the FPEXT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) {
+  if (OpVT == MVT::f32) {
+    if (RetVT == MVT::f64)
+      return FPEXT_F32_F64;
+  }
+
+  return UNKNOWN_LIBCALL;
+}
+
+/// getFPROUND - Return the FPROUND_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) {
+  if (RetVT == MVT::f32) {
+    if (OpVT == MVT::f64)
+      return FPROUND_F64_F32;
+    if (OpVT == MVT::f80)
+      return FPROUND_F80_F32;
+    if (OpVT == MVT::ppcf128)
+      return FPROUND_PPCF128_F32;
+  } else if (RetVT == MVT::f64) {
+    if (OpVT == MVT::f80)
+      return FPROUND_F80_F64;
+    if (OpVT == MVT::ppcf128)
+      return FPROUND_PPCF128_F64;
+  }
+
+  return UNKNOWN_LIBCALL;
+}
+
+/// getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
+  if (OpVT == MVT::f32) {
+    if (RetVT == MVT::i8)
+      return FPTOSINT_F32_I8;
+    if (RetVT == MVT::i16)
+      return FPTOSINT_F32_I16;
+    if (RetVT == MVT::i32)
+      return FPTOSINT_F32_I32;
+    if (RetVT == MVT::i64)
+      return FPTOSINT_F32_I64;
+    if (RetVT == MVT::i128)
+      return FPTOSINT_F32_I128;
+  } else if (OpVT == MVT::f64) {
+    if (RetVT == MVT::i8)
+      return FPTOSINT_F64_I8;
+    if (RetVT == MVT::i16)
+      return FPTOSINT_F64_I16;
+    if (RetVT == MVT::i32)
+      return FPTOSINT_F64_I32;
+    if (RetVT == MVT::i64)
+      return FPTOSINT_F64_I64;
+    if (RetVT == MVT::i128)
+      return FPTOSINT_F64_I128;
+  } else if (OpVT == MVT::f80) {
+    if (RetVT == MVT::i32)
+      return FPTOSINT_F80_I32;
+    if (RetVT == MVT::i64)
+      return FPTOSINT_F80_I64;
+    if (RetVT == MVT::i128)
+      return FPTOSINT_F80_I128;
+  } else if (OpVT == MVT::ppcf128) {
+    if (RetVT == MVT::i32)
+      return FPTOSINT_PPCF128_I32;
+    if (RetVT == MVT::i64)
+      return FPTOSINT_PPCF128_I64;
+    if (RetVT == MVT::i128)
+      return FPTOSINT_PPCF128_I128;
+  }
+  return UNKNOWN_LIBCALL;
+}
+
+/// getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
+  if (OpVT == MVT::f32) {
+    if (RetVT == MVT::i8)
+      return FPTOUINT_F32_I8;
+    if (RetVT == MVT::i16)
+      return FPTOUINT_F32_I16;
+    if (RetVT == MVT::i32)
+      return FPTOUINT_F32_I32;
+    if (RetVT == MVT::i64)
+      return FPTOUINT_F32_I64;
+    if (RetVT == MVT::i128)
+      return FPTOUINT_F32_I128;
+  } else if (OpVT == MVT::f64) {
+    if (RetVT == MVT::i8)
+      return FPTOUINT_F64_I8;
+    if (RetVT == MVT::i16)
+      return FPTOUINT_F64_I16;
+    if (RetVT == MVT::i32)
+      return FPTOUINT_F64_I32;
+    if (RetVT == MVT::i64)
+      return FPTOUINT_F64_I64;
+    if (RetVT == MVT::i128)
+      return FPTOUINT_F64_I128;
+  } else if (OpVT == MVT::f80) {
+    if (RetVT == MVT::i32)
+      return FPTOUINT_F80_I32;
+    if (RetVT == MVT::i64)
+      return FPTOUINT_F80_I64;
+    if (RetVT == MVT::i128)
+      return FPTOUINT_F80_I128;
+  } else if (OpVT == MVT::ppcf128) {
+    if (RetVT == MVT::i32)
+      return FPTOUINT_PPCF128_I32;
+    if (RetVT == MVT::i64)
+      return FPTOUINT_PPCF128_I64;
+    if (RetVT == MVT::i128)
+      return FPTOUINT_PPCF128_I128;
+  }
+  return UNKNOWN_LIBCALL;
+}
+
+/// getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getSINTTOFP(EVT OpVT, EVT RetVT) {
+  if (OpVT == MVT::i32) {
+    if (RetVT == MVT::f32)
+      return SINTTOFP_I32_F32;
+    else if (RetVT == MVT::f64)
+      return SINTTOFP_I32_F64;
+    else if (RetVT == MVT::f80)
+      return SINTTOFP_I32_F80;
+    else if (RetVT == MVT::ppcf128)
+      return SINTTOFP_I32_PPCF128;
+  } else if (OpVT == MVT::i64) {
+    if (RetVT == MVT::f32)
+      return SINTTOFP_I64_F32;
+    else if (RetVT == MVT::f64)
+      return SINTTOFP_I64_F64;
+    else if (RetVT == MVT::f80)
+      return SINTTOFP_I64_F80;
+    else if (RetVT == MVT::ppcf128)
+      return SINTTOFP_I64_PPCF128;
+  } else if (OpVT == MVT::i128) {
+    if (RetVT == MVT::f32)
+      return SINTTOFP_I128_F32;
+    else if (RetVT == MVT::f64)
+      return SINTTOFP_I128_F64;
+    else if (RetVT == MVT::f80)
+      return SINTTOFP_I128_F80;
+    else if (RetVT == MVT::ppcf128)
+      return SINTTOFP_I128_PPCF128;
+  }
+  return UNKNOWN_LIBCALL;
+}
+
+/// getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getUINTTOFP(EVT OpVT, EVT RetVT) {
+  if (OpVT == MVT::i32) {
+    if (RetVT == MVT::f32)
+      return UINTTOFP_I32_F32;
+    else if (RetVT == MVT::f64)
+      return UINTTOFP_I32_F64;
+    else if (RetVT == MVT::f80)
+      return UINTTOFP_I32_F80;
+    else if (RetVT == MVT::ppcf128)
+      return UINTTOFP_I32_PPCF128;
+  } else if (OpVT == MVT::i64) {
+    if (RetVT == MVT::f32)
+      return UINTTOFP_I64_F32;
+    else if (RetVT == MVT::f64)
+      return UINTTOFP_I64_F64;
+    else if (RetVT == MVT::f80)
+      return UINTTOFP_I64_F80;
+    else if (RetVT == MVT::ppcf128)
+      return UINTTOFP_I64_PPCF128;
+  } else if (OpVT == MVT::i128) {
+    if (RetVT == MVT::f32)
+      return UINTTOFP_I128_F32;
+    else if (RetVT == MVT::f64)
+      return UINTTOFP_I128_F64;
+    else if (RetVT == MVT::f80)
+      return UINTTOFP_I128_F80;
+    else if (RetVT == MVT::ppcf128)
+      return UINTTOFP_I128_PPCF128;
+  }
+  return UNKNOWN_LIBCALL;
+}
+
+/// InitCmpLibcallCCs - Set default comparison libcall CC.
+///
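+///
+/// Soft-float comparisons are lowered to a libcall whose integer result is
+/// then compared against zero using the condition registered here; for
+/// example (a sketch), a == b becomes (__eqsf2(a, b) SETEQ 0).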
+static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
+  // memset would replicate only the low byte of SETCC_INVALID into each
+  // element, so initialize the array with an explicit loop instead.
+  for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
+    CCs[i] = ISD::SETCC_INVALID;
+  CCs[RTLIB::OEQ_F32] = ISD::SETEQ;
+  CCs[RTLIB::OEQ_F64] = ISD::SETEQ;
+  CCs[RTLIB::UNE_F32] = ISD::SETNE;
+  CCs[RTLIB::UNE_F64] = ISD::SETNE;
+  CCs[RTLIB::OGE_F32] = ISD::SETGE;
+  CCs[RTLIB::OGE_F64] = ISD::SETGE;
+  CCs[RTLIB::OLT_F32] = ISD::SETLT;
+  CCs[RTLIB::OLT_F64] = ISD::SETLT;
+  CCs[RTLIB::OLE_F32] = ISD::SETLE;
+  CCs[RTLIB::OLE_F64] = ISD::SETLE;
+  CCs[RTLIB::OGT_F32] = ISD::SETGT;
+  CCs[RTLIB::OGT_F64] = ISD::SETGT;
+  CCs[RTLIB::UO_F32] = ISD::SETNE;
+  CCs[RTLIB::UO_F64] = ISD::SETNE;
+  CCs[RTLIB::O_F32] = ISD::SETEQ;
+  CCs[RTLIB::O_F64] = ISD::SETEQ;
+}
+
+/// NOTE: The constructor takes ownership of TLOF.
+TargetLowering::TargetLowering(const TargetMachine &tm)
+  : TM(tm), TD(TM.getTargetData()) {
+  // All operations default to being supported.
+  memset(OpActions, 0, sizeof(OpActions));
+  memset(LoadExtActions, 0, sizeof(LoadExtActions));
+  memset(TruncStoreActions, 0, sizeof(TruncStoreActions));
+  memset(IndexedModeActions, 0, sizeof(IndexedModeActions));
+  memset(CondCodeActions, 0, sizeof(CondCodeActions));
+
+  // Set default actions for various operations.
+  for (unsigned VT = 0; VT != (unsigned)MVT::LAST_VALUETYPE; ++VT) {
+    // Default all indexed load / store to expand.
+    for (unsigned IM = (unsigned)ISD::PRE_INC;
+         IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) {
+      setIndexedLoadAction(IM, (MVT::SimpleValueType)VT, Expand);
+      setIndexedStoreAction(IM, (MVT::SimpleValueType)VT, Expand);
+    }
+    
+    // These operations default to expand.
+    setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand);
+  }
+
+  // Most targets ignore the @llvm.prefetch intrinsic.
+  setOperationAction(ISD::PREFETCH, MVT::Other, Expand);
+  
+  // ConstantFP nodes default to expand.  Targets can either change this to 
+  // Legal, in which case all fp constants are legal, or use isFPImmLegal()
+  // to optimize expansions for certain constants.
+  setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+  setOperationAction(ISD::ConstantFP, MVT::f80, Expand);
+
+  // These library functions default to expand.
+  setOperationAction(ISD::FLOG,   MVT::f64, Expand);
+  setOperationAction(ISD::FLOG2,  MVT::f64, Expand);
+  setOperationAction(ISD::FLOG10, MVT::f64, Expand);
+  setOperationAction(ISD::FEXP,   MVT::f64, Expand);
+  setOperationAction(ISD::FEXP2,  MVT::f64, Expand);
+  setOperationAction(ISD::FLOG,   MVT::f32, Expand);
+  setOperationAction(ISD::FLOG2,  MVT::f32, Expand);
+  setOperationAction(ISD::FLOG10, MVT::f32, Expand);
+  setOperationAction(ISD::FEXP,   MVT::f32, Expand);
+  setOperationAction(ISD::FEXP2,  MVT::f32, Expand);
+
+  // Default ISD::TRAP to expand (which turns it into abort).
+  setOperationAction(ISD::TRAP, MVT::Other, Expand);
+    
+  IsLittleEndian = TD->isLittleEndian();
+  ShiftAmountTy = PointerTy = MVT::getIntegerVT(8*TD->getPointerSize());
+  memset(RegClassForVT, 0, MVT::LAST_VALUETYPE*sizeof(TargetRegisterClass*));
+  memset(TargetDAGCombineArray, 0, array_lengthof(TargetDAGCombineArray));
+  maxStoresPerMemset = maxStoresPerMemcpy = maxStoresPerMemmove = 8;
+  benefitFromCodePlacementOpt = false;
+  UseUnderscoreSetJmp = false;
+  UseUnderscoreLongJmp = false;
+  SelectIsExpensive = false;
+  IntDivIsCheap = false;
+  Pow2DivIsCheap = false;
+  StackPointerRegisterToSaveRestore = 0;
+  BooleanContents = UndefinedBooleanContent;
+  SchedPreferenceInfo = Sched::Latency;
+  JumpBufSize = 0;
+  JumpBufAlignment = 0;
+  PrefLoopAlignment = 0;
+  MinStackArgumentAlignment = 1;
+  ShouldFoldAtomicFences = false;
+
+  InitLibcallNames(LibcallRoutineNames);
+  InitCmpLibcallCCs(CmpLibcallCCs);
+  InitLibcallCallingConvs(LibcallCallingConvs);
+}
+
+TargetLowering::~TargetLowering() {
+}
+
+/// canOpTrap - Returns true if the operation can trap for the value type.
+/// VT must be a legal type.
+bool TargetLowering::canOpTrap(unsigned Op, EVT VT) const {
+  assert(isTypeLegal(VT));
+  switch (Op) {
+  default:
+    return false;
+  case ISD::FDIV:
+  case ISD::FREM:
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM:
+    return true;
+  }
+}
+
+
+static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT,
+                                          unsigned &NumIntermediates,
+                                          EVT &RegisterVT,
+                                          TargetLowering *TLI) {
+  // Figure out the right, legal destination reg to copy into.
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT EltTy = VT.getVectorElementType();
+  
+  unsigned NumVectorRegs = 1;
+  
+  // FIXME: We don't support non-power-of-2-sized vectors for now.  Ideally we 
+  // could break down into LHS/RHS like LegalizeDAG does.
+  if (!isPowerOf2_32(NumElts)) {
+    NumVectorRegs = NumElts;
+    NumElts = 1;
+  }
+  
+  // Divide the input until we get to a supported size.  This will always
+  // end with a scalar if the target doesn't support vectors.
+  while (NumElts > 1 && !TLI->isTypeLegal(MVT::getVectorVT(EltTy, NumElts))) {
+    NumElts >>= 1;
+    NumVectorRegs <<= 1;
+  }
+
+  NumIntermediates = NumVectorRegs;
+  
+  MVT NewVT = MVT::getVectorVT(EltTy, NumElts);
+  if (!TLI->isTypeLegal(NewVT))
+    NewVT = EltTy;
+  IntermediateVT = NewVT;
+
+  EVT DestVT = TLI->getRegisterType(NewVT);
+  RegisterVT = DestVT;
+  if (EVT(DestVT).bitsLT(NewVT))    // Value is expanded, e.g. i64 -> i16.
+    return NumVectorRegs*(NewVT.getSizeInBits()/DestVT.getSizeInBits());
+  
+  // Otherwise, promotion or legal types use the same number of registers as
+  // the vector decimated to the appropriate level.
+  return NumVectorRegs;
+}
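+
+// Illustrative trace of getVectorTypeBreakdownMVT, assuming a hypothetical
+// target whose widest legal vector type is v4f32: for VT = v8f32 the loop
+// halves NumElts from 8 to 4 and doubles NumVectorRegs to 2, so
+// IntermediateVT = RegisterVT = v4f32, NumIntermediates = 2, and the
+// function returns 2 registers.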
+
+/// isLegalRC - Return true if at least one of the value types that can be
+/// represented by the specified register class is legal.
+bool TargetLowering::isLegalRC(const TargetRegisterClass *RC) const {
+  for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end();
+       I != E; ++I) {
+    if (isTypeLegal(*I))
+      return true;
+  }
+  return false;
+}
+
+/// hasLegalSuperRegRegClasses - Return true if the specified register class
+/// has one or more super-reg register classes that are legal.
+bool
+TargetLowering::hasLegalSuperRegRegClasses(const TargetRegisterClass *RC) const{
+  if (*RC->superregclasses_begin() == 0)
+    return false;
+  for (TargetRegisterInfo::regclass_iterator I = RC->superregclasses_begin(),
+         E = RC->superregclasses_end(); I != E; ++I) {
+    const TargetRegisterClass *RRC = *I;
+    if (isLegalRC(RRC))
+      return true;
+  }
+  return false;
+}
+
+/// findRepresentativeClass - Return the largest legal super-reg register class
+/// of the register class for the specified type and its associated "cost".
+std::pair<const TargetRegisterClass*, uint8_t>
+TargetLowering::findRepresentativeClass(EVT VT) const {
+  const TargetRegisterClass *RC = RegClassForVT[VT.getSimpleVT().SimpleTy];
+  if (!RC)
+    return std::make_pair(RC, 0);
+  const TargetRegisterClass *BestRC = RC;
+  for (TargetRegisterInfo::regclass_iterator I = RC->superregclasses_begin(),
+         E = RC->superregclasses_end(); I != E; ++I) {
+    const TargetRegisterClass *RRC = *I;
+    if (RRC->isASubClass() || !isLegalRC(RRC))
+      continue;
+    if (!hasLegalSuperRegRegClasses(RRC))
+      return std::make_pair(RRC, 1);
+    BestRC = RRC;
+  }
+  return std::make_pair(BestRC, 1);
+}
+
+/// computeRegisterProperties - Once all of the register classes are added,
+/// this allows us to compute derived properties we expose.
+void TargetLowering::computeRegisterProperties() {
+  assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE &&
+         "Too many value types for ValueTypeActions to hold!");
+
+  // Everything defaults to needing one register.
+  for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) {
+    NumRegistersForVT[i] = 1;
+    RegisterTypeForVT[i] = TransformToType[i] = (MVT::SimpleValueType)i;
+  }
+  // ...except isVoid, which doesn't need any registers.
+  NumRegistersForVT[MVT::isVoid] = 0;
+
+  // Find the largest integer register class.
+  unsigned LargestIntReg = MVT::LAST_INTEGER_VALUETYPE;
+  for (; RegClassForVT[LargestIntReg] == 0; --LargestIntReg)
+    assert(LargestIntReg != MVT::i1 && "No integer registers defined!");
+
+  // Every integer value type larger than this largest register takes twice as
+  // many registers to represent as the previous ValueType.
+  for (unsigned ExpandedReg = LargestIntReg + 1; ; ++ExpandedReg) {
+    EVT ExpandedVT = (MVT::SimpleValueType)ExpandedReg;
+    if (!ExpandedVT.isInteger())
+      break;
+    NumRegistersForVT[ExpandedReg] = 2*NumRegistersForVT[ExpandedReg-1];
+    RegisterTypeForVT[ExpandedReg] = (MVT::SimpleValueType)LargestIntReg;
+    TransformToType[ExpandedReg] = (MVT::SimpleValueType)(ExpandedReg - 1);
+    ValueTypeActions.setTypeAction(ExpandedVT, Expand);
+  }
+
+  // Inspect all of the ValueTypes smaller than the largest integer
+  // register to see which ones need promotion.
+  unsigned LegalIntReg = LargestIntReg;
+  for (unsigned IntReg = LargestIntReg - 1;
+       IntReg >= (unsigned)MVT::i1; --IntReg) {
+    EVT IVT = (MVT::SimpleValueType)IntReg;
+    if (isTypeLegal(IVT)) {
+      LegalIntReg = IntReg;
+    } else {
+      RegisterTypeForVT[IntReg] = TransformToType[IntReg] =
+        (MVT::SimpleValueType)LegalIntReg;
+      ValueTypeActions.setTypeAction(IVT, Promote);
+    }
+  }
+
+  // ppcf128 type is really two f64's.
+  if (!isTypeLegal(MVT::ppcf128)) {
+    NumRegistersForVT[MVT::ppcf128] = 2*NumRegistersForVT[MVT::f64];
+    RegisterTypeForVT[MVT::ppcf128] = MVT::f64;
+    TransformToType[MVT::ppcf128] = MVT::f64;
+    ValueTypeActions.setTypeAction(MVT::ppcf128, Expand);
+  }    
+
+  // Decide how to handle f64. If the target does not have native f64 support,
+  // expand it to i64 and we will be generating soft float library calls.
+  if (!isTypeLegal(MVT::f64)) {
+    NumRegistersForVT[MVT::f64] = NumRegistersForVT[MVT::i64];
+    RegisterTypeForVT[MVT::f64] = RegisterTypeForVT[MVT::i64];
+    TransformToType[MVT::f64] = MVT::i64;
+    ValueTypeActions.setTypeAction(MVT::f64, Expand);
+  }
+
+  // Decide how to handle f32. If the target does not have native support for
+  // f32, promote it to f64 if it is legal. Otherwise, expand it to i32.
+  if (!isTypeLegal(MVT::f32)) {
+    if (isTypeLegal(MVT::f64)) {
+      NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::f64];
+      RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::f64];
+      TransformToType[MVT::f32] = MVT::f64;
+      ValueTypeActions.setTypeAction(MVT::f32, Promote);
+    } else {
+      NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::i32];
+      RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::i32];
+      TransformToType[MVT::f32] = MVT::i32;
+      ValueTypeActions.setTypeAction(MVT::f32, Expand);
+    }
+  }
+  
+  // Loop over all of the vector value types to see which need transformations.
+  for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE;
+       i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+    MVT VT = (MVT::SimpleValueType)i;
+    if (isTypeLegal(VT)) continue;
+    
+    MVT IntermediateVT;
+    EVT RegisterVT;
+    unsigned NumIntermediates;
+    NumRegistersForVT[i] =
+      getVectorTypeBreakdownMVT(VT, IntermediateVT, NumIntermediates,
+                                RegisterVT, this);
+    RegisterTypeForVT[i] = RegisterVT;
+    
+    // Determine if there is a legal wider type.
+    bool IsLegalWiderType = false;
+    EVT EltVT = VT.getVectorElementType();
+    unsigned NElts = VT.getVectorNumElements();
+    for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+      EVT SVT = (MVT::SimpleValueType)nVT;
+      if (isTypeSynthesizable(SVT) && SVT.getVectorElementType() == EltVT &&
+          SVT.getVectorNumElements() > NElts && NElts != 1) {
+        TransformToType[i] = SVT;
+        ValueTypeActions.setTypeAction(VT, Promote);
+        IsLegalWiderType = true;
+        break;
+      }
+    }
+    if (!IsLegalWiderType) {
+      EVT NVT = VT.getPow2VectorType();
+      if (NVT == VT) {
+        // Type is already a power of 2.  The default action is to split.
+        TransformToType[i] = MVT::Other;
+        ValueTypeActions.setTypeAction(VT, Expand);
+      } else {
+        TransformToType[i] = NVT;
+        ValueTypeActions.setTypeAction(VT, Promote);
+      }
+    }
+  }
+
+  // Determine the 'representative' register class for each value type.
+  // A representative register class is the largest legal register class for
+  // a group of value types, i.e. one that is not a sub-register class of
+  // another legal class. For example, on i386 the representative class for
+  // i8, i16, and i32 is GR32; on x86-64 it is GR64.
+  for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) {
+    const TargetRegisterClass* RRC;
+    uint8_t Cost;
+    tie(RRC, Cost) =  findRepresentativeClass((MVT::SimpleValueType)i);
+    RepRegClassForVT[i] = RRC;
+    RepRegClassCostForVT[i] = Cost;
+  }
+}
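+
+// Example of the derived properties, assuming a hypothetical 32-bit target
+// with only legal i32 integer registers and no FP support:
+//   i16 -> Promote to i32 (one register);
+//   i64 -> Expand to 2 x i32 registers;
+//   f64 -> Expand: soft float, same register count as i64;
+//   f32 -> Expand: carried bitwise in an i32 register.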
+
+const char *TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  return NULL;
+}
+
+
+MVT::SimpleValueType TargetLowering::getSetCCResultType(EVT VT) const {
+  return PointerTy.SimpleTy;
+}
+
+MVT::SimpleValueType TargetLowering::getCmpLibcallReturnType() const {
+  return MVT::i32; // return the default value
+}
+
+/// getVectorTypeBreakdown - Vector types are broken down into some number of
+/// legal first class types.  For example, MVT::v8f32 maps to 2 MVT::v4f32
+/// with Altivec or SSE1, or 8 promoted MVT::f64 values with the X86 FP stack.
+/// Similarly, MVT::v2i64 turns into 4 MVT::i32 values with both PPC and X86.
+///
+/// This method returns the number of registers needed, and the VT for each
+/// register.  It also returns the VT and quantity of the intermediate values
+/// before they are promoted/expanded.
+///
+unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
+                                                EVT &IntermediateVT,
+                                                unsigned &NumIntermediates,
+                                                EVT &RegisterVT) const {
+  // Figure out the right, legal destination reg to copy into.
+  unsigned NumElts = VT.getVectorNumElements();
+  EVT EltTy = VT.getVectorElementType();
+  
+  unsigned NumVectorRegs = 1;
+  
+  // FIXME: We don't support non-power-of-2-sized vectors for now.  Ideally we 
+  // could break down into LHS/RHS like LegalizeDAG does.
+  if (!isPowerOf2_32(NumElts)) {
+    NumVectorRegs = NumElts;
+    NumElts = 1;
+  }
+  
+  // Divide the input until we get to a supported size.  This will always
+  // end with a scalar if the target doesn't support vectors.
+  while (NumElts > 1 && !isTypeLegal(
+                                   EVT::getVectorVT(Context, EltTy, NumElts))) {
+    NumElts >>= 1;
+    NumVectorRegs <<= 1;
+  }
+
+  NumIntermediates = NumVectorRegs;
+  
+  EVT NewVT = EVT::getVectorVT(Context, EltTy, NumElts);
+  if (!isTypeLegal(NewVT))
+    NewVT = EltTy;
+  IntermediateVT = NewVT;
+
+  EVT DestVT = getRegisterType(Context, NewVT);
+  RegisterVT = DestVT;
+  if (DestVT.bitsLT(NewVT))    // Value is expanded, e.g. i64 -> i16.
+    return NumVectorRegs*(NewVT.getSizeInBits()/DestVT.getSizeInBits());
+
+  // Otherwise, promotion or legal types use the same number of registers as
+  // the vector decimated to the appropriate level.
+  return NumVectorRegs;
+}
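+
+// Illustrative use, assuming a TargetLowering instance TLI for which v4f32 is
+// legal but v8f32 is not (Ctx, IntVT, RegVT, and NumInt are hypothetical
+// caller-side names):
+//   unsigned NumRegs = TLI.getVectorTypeBreakdown(Ctx, MVT::v8f32, IntVT,
+//                                                 NumInt, RegVT);
+//   // NumRegs == 2, NumInt == 2, IntVT == RegVT == MVT::v4f32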
+
+/// Get the EVTs and ArgFlags collections that represent the legalized return 
+/// type of the given function.  This does not require a DAG or a return value,
+/// and is suitable for use before any DAGs for the function are constructed.
+/// TODO: Move this out of TargetLowering.cpp.
+void llvm::GetReturnInfo(const Type* ReturnType, Attributes attr,
+                         SmallVectorImpl<ISD::OutputArg> &Outs,
+                         const TargetLowering &TLI,
+                         SmallVectorImpl<uint64_t> *Offsets) {
+  SmallVector<EVT, 4> ValueVTs;
+  ComputeValueVTs(TLI, ReturnType, ValueVTs);
+  unsigned NumValues = ValueVTs.size();
+  if (NumValues == 0) return;
+  unsigned Offset = 0;
+
+  for (unsigned j = 0, f = NumValues; j != f; ++j) {
+    EVT VT = ValueVTs[j];
+    ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+
+    if (attr & Attribute::SExt)
+      ExtendKind = ISD::SIGN_EXTEND;
+    else if (attr & Attribute::ZExt)
+      ExtendKind = ISD::ZERO_EXTEND;
+
+    // FIXME: C calling convention requires the return type to be promoted to
+    // at least 32-bit. But this is not necessary for non-C calling
+    // conventions. The frontend should mark functions whose return values
+    // require promoting with signext or zeroext attributes.
+    if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) {
+      EVT MinVT = TLI.getRegisterType(ReturnType->getContext(), MVT::i32);
+      if (VT.bitsLT(MinVT))
+        VT = MinVT;
+    }
+
+    unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT);
+    EVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT);
+    unsigned PartSize = TLI.getTargetData()->getTypeAllocSize(
+                        PartVT.getTypeForEVT(ReturnType->getContext()));
+
+    // 'inreg' on function refers to return value
+    ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+    if (attr & Attribute::InReg)
+      Flags.setInReg();
+
+    // Propagate extension type if any
+    if (attr & Attribute::SExt)
+      Flags.setSExt();
+    else if (attr & Attribute::ZExt)
+      Flags.setZExt();
+
+    for (unsigned i = 0; i < NumParts; ++i) {
+      Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true));
+      if (Offsets) {
+        Offsets->push_back(Offset);
+        Offset += PartSize;
+      }
+    }
+  }
+}
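+
+// Illustrative result: for an i8 return value carrying the signext attribute
+// on a target whose return registers are 32-bit, VT is raised to MinVT = i32,
+// NumParts = 1, PartVT = i32, and a single OutputArg with the SExt flag set
+// is appended to Outs.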
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area.  This is the actual
+/// alignment, not its logarithm.
+unsigned TargetLowering::getByValTypeAlignment(const Type *Ty) const {
+  return TD->getCallFrameTypeAlignment(Ty);
+}
+
+/// getJumpTableEncoding - Return the entry encoding for a jump table in the
+/// current function.  The returned value is a member of the
+/// MachineJumpTableInfo::JTEntryKind enum.
+unsigned TargetLowering::getJumpTableEncoding() const {
+  // In non-pic modes, just use the address of a block.
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+    return MachineJumpTableInfo::EK_BlockAddress;
+  
+  // In PIC mode, if the target supports a GPRel32 directive, use it.
+  if (getTargetMachine().getMCAsmInfo()->getGPRel32Directive() != 0)
+    return MachineJumpTableInfo::EK_GPRel32BlockAddress;
+  
+  // Otherwise, use a label difference.
+  return MachineJumpTableInfo::EK_LabelDifference32;
+}
+
+SDValue TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+                                                 SelectionDAG &DAG) const {
+  // If our PIC model is GP relative, use the global offset table as the base.
+  if (getJumpTableEncoding() == MachineJumpTableInfo::EK_GPRel32BlockAddress)
+    return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
+  return Table;
+}
+
+/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
+/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
+/// MCExpr.
+const MCExpr *
+TargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+                                             unsigned JTI,MCContext &Ctx) const{
+  // The normal PIC reloc base is the label at the start of the jump table.
+  return MCSymbolRefExpr::Create(MF->getJTISymbol(JTI, Ctx), Ctx);
+}
+
+bool
+TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // Assume that everything is safe in static mode.
+  if (getTargetMachine().getRelocationModel() == Reloc::Static)
+    return true;
+
+  // In dynamic-no-pic mode, assume that known defined values are safe.
+  if (getTargetMachine().getRelocationModel() == Reloc::DynamicNoPIC &&
+      GA &&
+      !GA->getGlobal()->isDeclaration() &&
+      !GA->getGlobal()->isWeakForLinker())
+    return true;
+
+  // Otherwise assume nothing is safe.
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+//  Optimization Methods
+//===----------------------------------------------------------------------===//
+
+/// ShrinkDemandedConstant - Check to see if the specified operand of the 
+/// specified instruction is a constant integer.  If so, check to see if there
+/// are any bits set in the constant that are not demanded.  If so, shrink the
+/// constant and return true.
+bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op, 
+                                                        const APInt &Demanded) {
+  DebugLoc dl = Op.getDebugLoc();
+
+  // FIXME: ISD::SELECT, ISD::SELECT_CC
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::XOR:
+  case ISD::AND:
+  case ISD::OR: {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    if (!C) return false;
+
+    if (Op.getOpcode() == ISD::XOR &&
+        (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
+      return false;
+
+    // If the constant sets bits that are not demanded, clear those bits and
+    // shrink the constant to just the demanded ones.
+    if (C->getAPIntValue().intersects(~Demanded)) {
+      EVT VT = Op.getValueType();
+      SDValue New = DAG.getNode(Op.getOpcode(), dl, VT, Op.getOperand(0),
+                                DAG.getConstant(Demanded &
+                                                C->getAPIntValue(), 
+                                                VT));
+      return CombineTo(Op, New);
+    }
+
+    break;
+  }
+  }
+
+  return false;
+}
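+
+// Worked example: for Op = (or X, 0xFF00FF00) with Demanded = 0x0000FFFF, the
+// constant intersects ~Demanded, so the node is rebuilt as (or X, 0x0000FF00);
+// only the demanded bits of the constant survive.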
+
+/// ShrinkDemandedOp - Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the
+/// casts are free.  This uses isZExtFree and ZERO_EXTEND for the widening
+/// cast, but it could be generalized for targets with other types of
+/// implicit widening casts.
+bool
+TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
+                                                    unsigned BitWidth,
+                                                    const APInt &Demanded,
+                                                    DebugLoc dl) {
+  assert(Op.getNumOperands() == 2 &&
+         "ShrinkDemandedOp only supports binary operators!");
+  assert(Op.getNode()->getNumValues() == 1 &&
+         "ShrinkDemandedOp only supports nodes with one result!");
+
+  // Don't do this if the node has another user, which may require the
+  // full value.
+  if (!Op.getNode()->hasOneUse())
+    return false;
+
+  // Search for the smallest integer type with free casts to and from
+  // Op's type. For expedience, just check power-of-2 integer types.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned SmallVTBits = BitWidth - Demanded.countLeadingZeros();
+  if (!isPowerOf2_32(SmallVTBits))
+    SmallVTBits = NextPowerOf2(SmallVTBits);
+  for (; SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
+    EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), SmallVTBits);
+    if (TLI.isTruncateFree(Op.getValueType(), SmallVT) &&
+        TLI.isZExtFree(SmallVT, Op.getValueType())) {
+      // We found a type with free casts.
+      SDValue X = DAG.getNode(Op.getOpcode(), dl, SmallVT,
+                              DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
+                                          Op.getNode()->getOperand(0)),
+                              DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
+                                          Op.getNode()->getOperand(1)));
+      SDValue Z = DAG.getNode(ISD::ZERO_EXTEND, dl, Op.getValueType(), X);
+      return CombineTo(Op, Z);
+    }
+  }
+  return false;
+}
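+
+// Worked example: for a 32-bit (add x, y) of which only the low 8 bits are
+// demanded, SmallVTBits = 8; if the target reports the i32->i8 truncate and
+// the i8->i32 zext as free, the node is rewritten as
+//   (zero_extend i32 (add i8 (truncate x), (truncate y)))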
+
+/// SimplifyDemandedBits - Look at Op.  At this point, we know that only the
+/// DemandedMask bits of the result of Op are ever used downstream.  If we can
+/// use this information to simplify Op, create a new simplified DAG node,
+/// record the replacement through TLO, and return true. Otherwise,
+/// analyze the expression and return a mask of KnownOne and KnownZero bits for
+/// the expression (used to simplify the caller).  The KnownZero/One bits may
+/// only be accurate for those bits in the DemandedMask.
+bool TargetLowering::SimplifyDemandedBits(SDValue Op,
+                                          const APInt &DemandedMask,
+                                          APInt &KnownZero,
+                                          APInt &KnownOne,
+                                          TargetLoweringOpt &TLO,
+                                          unsigned Depth) const {
+  unsigned BitWidth = DemandedMask.getBitWidth();
+  assert(Op.getValueType().getScalarType().getSizeInBits() == BitWidth &&
+         "Mask size mismatches value type size!");
+  APInt NewMask = DemandedMask;
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Don't know anything.
+  KnownZero = KnownOne = APInt(BitWidth, 0);
+
+  // Other users may use these bits.
+  if (!Op.getNode()->hasOneUse()) { 
+    if (Depth != 0) {
+      // If not at the root, just compute the KnownZero/KnownOne bits to
+      // simplify things downstream.
+      TLO.DAG.ComputeMaskedBits(Op, DemandedMask, KnownZero, KnownOne, Depth);
+      return false;
+    }
+    // If this is the root being simplified, allow it to have multiple uses,
+    // just set the NewMask to all bits.
+    NewMask = APInt::getAllOnesValue(BitWidth);
+  } else if (DemandedMask == 0) {   
+    // Not demanding any bits from Op.
+    if (Op.getOpcode() != ISD::UNDEF)
+      return TLO.CombineTo(Op, TLO.DAG.getUNDEF(Op.getValueType()));
+    return false;
+  } else if (Depth == 6) {        // Limit search depth.
+    return false;
+  }
+
+  APInt KnownZero2, KnownOne2, KnownZeroOut, KnownOneOut;
+  switch (Op.getOpcode()) {
+  case ISD::Constant:
+    // We know all of the bits for a constant!
+    KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue() & NewMask;
+    KnownZero = ~KnownOne & NewMask;
+    return false;   // Don't fall through, will infinitely loop.
+  case ISD::AND:
+    // If the RHS is a constant, check to see if the LHS would be zero without
+    // using the bits from the RHS.  Below, we use knowledge about the RHS to
+    // simplify the LHS; here we use information from the LHS to simplify
+    // the RHS.
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      APInt LHSZero, LHSOne;
+      TLO.DAG.ComputeMaskedBits(Op.getOperand(0), NewMask,
+                                LHSZero, LHSOne, Depth+1);
+      // If the LHS already has zeros where RHSC does, this 'and' is dead.
+      if ((LHSZero & NewMask) == (~RHSC->getAPIntValue() & NewMask))
+        return TLO.CombineTo(Op, Op.getOperand(0));
+      // If any of the set bits in the RHS are known zero on the LHS, shrink
+      // the constant.
+      if (TLO.ShrinkDemandedConstant(Op, ~LHSZero & NewMask))
+        return true;
+    }
+    
+    if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
+                             KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+    if (SimplifyDemandedBits(Op.getOperand(0), ~KnownZero & NewMask,
+                             KnownZero2, KnownOne2, TLO, Depth+1))
+      return true;
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
+      
+    // If all of the demanded bits are known one on one side, return the other.
+    // These bits cannot contribute to the result of the 'and'.
+    if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask))
+      return TLO.CombineTo(Op, Op.getOperand(0));
+    if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask))
+      return TLO.CombineTo(Op, Op.getOperand(1));
+    // If all of the demanded bits in the inputs are known zeros, return zero.
+    if ((NewMask & (KnownZero|KnownZero2)) == NewMask)
+      return TLO.CombineTo(Op, TLO.DAG.getConstant(0, Op.getValueType()));
+    // If the RHS is a constant, see if we can simplify it.
+    if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask))
+      return true;
+    // If the operation can be done in a smaller type, do so.
+    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+      return true;
+
+    // Output known-1 bits are only known if set in both the LHS & RHS.
+    KnownOne &= KnownOne2;
+    // Output known-0 bits are known to be clear if zero in either the LHS
+    // or the RHS.
+    KnownZero |= KnownZero2;
+    break;
+  case ISD::OR:
+    if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero, 
+                             KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+    if (SimplifyDemandedBits(Op.getOperand(0), ~KnownOne & NewMask,
+                             KnownZero2, KnownOne2, TLO, Depth+1))
+      return true;
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
+    
+    // If all of the demanded bits are known zero on one side, return the other.
+    // These bits cannot contribute to the result of the 'or'.
+    if ((NewMask & ~KnownOne2 & KnownZero) == (~KnownOne2 & NewMask))
+      return TLO.CombineTo(Op, Op.getOperand(0));
+    if ((NewMask & ~KnownOne & KnownZero2) == (~KnownOne & NewMask))
+      return TLO.CombineTo(Op, Op.getOperand(1));
+    // If all of the potentially set bits on one side are known to be set on
+    // the other side, just use the 'other' side.
+    if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask))
+      return TLO.CombineTo(Op, Op.getOperand(0));
+    if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask))
+      return TLO.CombineTo(Op, Op.getOperand(1));
+    // If the RHS is a constant, see if we can simplify it.
+    if (TLO.ShrinkDemandedConstant(Op, NewMask))
+      return true;
+    // If the operation can be done in a smaller type, do so.
+    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+      return true;
+
+    // Output known-0 bits are only known if clear in both the LHS & RHS.
+    KnownZero &= KnownZero2;
+    // Output known-1 bits are known to be set if set in either the LHS or
+    // the RHS.
+    KnownOne |= KnownOne2;
+    break;
+  case ISD::XOR:
+    if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero, 
+                             KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+    if (SimplifyDemandedBits(Op.getOperand(0), NewMask, KnownZero2,
+                             KnownOne2, TLO, Depth+1))
+      return true;
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
+    
+    // If all of the demanded bits are known zero on one side, return the other.
+    // These bits cannot contribute to the result of the 'xor'.
+    if ((KnownZero & NewMask) == NewMask)
+      return TLO.CombineTo(Op, Op.getOperand(0));
+    if ((KnownZero2 & NewMask) == NewMask)
+      return TLO.CombineTo(Op, Op.getOperand(1));
+    // If the operation can be done in a smaller type, do so.
+    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+      return true;
+
+    // If all of the unknown bits are known to be zero on one side or the other
+    // (but not both) turn this into an *inclusive* or.
+    //    e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
+    if ((NewMask & ~KnownZero & ~KnownZero2) == 0)
+      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, Op.getValueType(),
+                                               Op.getOperand(0),
+                                               Op.getOperand(1)));
+    
+    // Output known-0 bits are those known equal in the LHS and RHS (clear in
+    // both or set in both).
+    KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+    // Output known-1 bits are those known to differ: set in exactly one of
+    // the LHS and RHS.
+    KnownOneOut = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
+    
+    // If all of the demanded bits on one side are known, and all of the set
+    // bits on that side are also known to be set on the other side, turn this
+    // into an AND, as we know the bits will be cleared.
+    //    e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
+    if ((NewMask & (KnownZero|KnownOne)) == NewMask) { // all known
+      if ((KnownOne & KnownOne2) == KnownOne) {
+        EVT VT = Op.getValueType();
+        SDValue ANDC = TLO.DAG.getConstant(~KnownOne & NewMask, VT);
+        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, dl, VT, 
+                                                 Op.getOperand(0), ANDC));
+      }
+    }
+    
+    // If the RHS is a constant, see if we can simplify it.
+    // For XOR, we prefer to force bits to 1 if doing so yields an all-ones
+    // constant; if we can't force bits, try to shrink the constant.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      APInt Expanded = C->getAPIntValue() | (~NewMask);
+      // if we can expand it to have all bits set, do it
+      if (Expanded.isAllOnesValue()) {
+        if (Expanded != C->getAPIntValue()) {
+          EVT VT = Op.getValueType();
+          SDValue New = TLO.DAG.getNode(Op.getOpcode(), dl,VT, Op.getOperand(0),
+                                          TLO.DAG.getConstant(Expanded, VT));
+          return TLO.CombineTo(Op, New);
+        }
+        // if it already has all the bits set, nothing to change
+        // but don't shrink either!
+      } else if (TLO.ShrinkDemandedConstant(Op, NewMask)) {
+        return true;
+      }
+    }
+
+    KnownZero = KnownZeroOut;
+    KnownOne  = KnownOneOut;
+    break;
+  case ISD::SELECT:
+    if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero, 
+                             KnownOne, TLO, Depth+1))
+      return true;
+    if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero2,
+                             KnownOne2, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
+    
+    // If the operands are constants, see if we can simplify them.
+    if (TLO.ShrinkDemandedConstant(Op, NewMask))
+      return true;
+    
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    break;
+  case ISD::SELECT_CC:
+    if (SimplifyDemandedBits(Op.getOperand(3), NewMask, KnownZero, 
+                             KnownOne, TLO, Depth+1))
+      return true;
+    if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero2,
+                             KnownOne2, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
+    
+    // If the operands are constants, see if we can simplify them.
+    if (TLO.ShrinkDemandedConstant(Op, NewMask))
+      return true;
+      
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    break;
+  case ISD::SHL:
+    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      unsigned ShAmt = SA->getZExtValue();
+      SDValue InOp = Op.getOperand(0);
+
+      // If the shift count is an invalid immediate, don't do anything.
+      if (ShAmt >= BitWidth)
+        break;
+
+      // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
+      // single shift.  We can do this if the bottom bits (which are shifted
+      // out) are never demanded.
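+      // For example, with ShAmt = 4 and InOp = (srl X, 8), if the low 4 bits
+      // of the result are not demanded, the pair is collapsed to (srl X, 4).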
+      if (InOp.getOpcode() == ISD::SRL &&
+          isa<ConstantSDNode>(InOp.getOperand(1))) {
+        if (ShAmt && (NewMask & APInt::getLowBitsSet(BitWidth, ShAmt)) == 0) {
+          unsigned C1= cast<ConstantSDNode>(InOp.getOperand(1))->getZExtValue();
+          unsigned Opc = ISD::SHL;
+          int Diff = ShAmt-C1;
+          if (Diff < 0) {
+            Diff = -Diff;
+            Opc = ISD::SRL;
+          }          
+          
+          SDValue NewSA = 
+            TLO.DAG.getConstant(Diff, Op.getOperand(1).getValueType());
+          EVT VT = Op.getValueType();
+          return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
+                                                   InOp.getOperand(0), NewSA));
+        }
+      }      
+      
+      if (SimplifyDemandedBits(InOp, NewMask.lshr(ShAmt),
+                               KnownZero, KnownOne, TLO, Depth+1))
+        return true;
+
+      // Convert (shl (anyext x), c) to (anyext (shl x, c)) if the high bits
+      // are not demanded. This will likely allow the anyext to be folded away.
+      if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) {
+        SDValue InnerOp = InOp.getNode()->getOperand(0);
+        EVT InnerVT = InnerOp.getValueType();
+        if ((APInt::getHighBitsSet(BitWidth,
+                                   BitWidth - InnerVT.getSizeInBits()) &
+               DemandedMask) == 0 &&
+            isTypeDesirableForOp(ISD::SHL, InnerVT)) {
+          EVT ShTy = getShiftAmountTy();
+          if (!APInt(BitWidth, ShAmt).isIntN(ShTy.getSizeInBits()))
+            ShTy = InnerVT;
+          SDValue NarrowShl =
+            TLO.DAG.getNode(ISD::SHL, dl, InnerVT, InnerOp,
+                            TLO.DAG.getConstant(ShAmt, ShTy));
+          return
+            TLO.CombineTo(Op,
+                          TLO.DAG.getNode(ISD::ANY_EXTEND, dl, Op.getValueType(),
+                                          NarrowShl));
+        }
+      }
+
+      KnownZero <<= SA->getZExtValue();
+      KnownOne  <<= SA->getZExtValue();
+      // low bits known zero.
+      KnownZero |= APInt::getLowBitsSet(BitWidth, SA->getZExtValue());
+    }
+    break;
+  case ISD::SRL:
+    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      EVT VT = Op.getValueType();
+      unsigned ShAmt = SA->getZExtValue();
+      unsigned VTSize = VT.getSizeInBits();
+      SDValue InOp = Op.getOperand(0);
+      
+      // If the shift count is an invalid immediate, don't do anything.
+      if (ShAmt >= BitWidth)
+        break;
+
+      // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
+      // single shift.  We can do this if the top bits (which are shifted out)
+      // are never demanded.
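+      // For example, with ShAmt = 4 and InOp = (shl X, 8), if the top 4 bits
+      // of the result are not demanded, the pair is collapsed to (shl X, 4).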
+      if (InOp.getOpcode() == ISD::SHL &&
+          isa<ConstantSDNode>(InOp.getOperand(1))) {
+        if (ShAmt && (NewMask & APInt::getHighBitsSet(VTSize, ShAmt)) == 0) {
+          unsigned C1= cast<ConstantSDNode>(InOp.getOperand(1))->getZExtValue();
+          unsigned Opc = ISD::SRL;
+          int Diff = ShAmt-C1;
+          if (Diff < 0) {
+            Diff = -Diff;
+            Opc = ISD::SHL;
+          }          
+          
+          SDValue NewSA =
+            TLO.DAG.getConstant(Diff, Op.getOperand(1).getValueType());
+          return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
+                                                   InOp.getOperand(0), NewSA));
+        }
+      }      
+      
+      // Compute the new bits that are at the top now.
+      if (SimplifyDemandedBits(InOp, (NewMask << ShAmt),
+                               KnownZero, KnownOne, TLO, Depth+1))
+        return true;
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+      KnownZero = KnownZero.lshr(ShAmt);
+      KnownOne  = KnownOne.lshr(ShAmt);
+
+      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
+      KnownZero |= HighBits;  // High bits known zero.
+    }
+    break;
+  case ISD::SRA:
+    // If this is an arithmetic shift right and only the low-bit is set, we can
+    // always convert this into a logical shr, even if the shift amount is
+    // variable.  The low bit of the shift cannot be an input sign bit unless
+    // the shift amount is >= the size of the datatype, which is undefined.
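+    // For example, (and (sra X, Y), 1) computes the same bit as
+    // (and (srl X, Y), 1) for any in-range shift amount Y.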
+    if (DemandedMask == 1)
+      return TLO.CombineTo(Op,
+                           TLO.DAG.getNode(ISD::SRL, dl, Op.getValueType(),
+                                           Op.getOperand(0), Op.getOperand(1)));
+
+    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      EVT VT = Op.getValueType();
+      unsigned ShAmt = SA->getZExtValue();
+      
+      // If the shift count is an invalid immediate, don't do anything.
+      if (ShAmt >= BitWidth)
+        break;
+
+      APInt InDemandedMask = (NewMask << ShAmt);
+
+      // If any of the demanded bits are produced by the sign extension, we also
+      // demand the input sign bit.
+      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
+      if (HighBits.intersects(NewMask))
+        InDemandedMask |= APInt::getSignBit(VT.getScalarType().getSizeInBits());
+      
+      if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask,
+                               KnownZero, KnownOne, TLO, Depth+1))
+        return true;
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+      KnownZero = KnownZero.lshr(ShAmt);
+      KnownOne  = KnownOne.lshr(ShAmt);
+      
+      // Handle the sign bit, adjusted to where it is now in the mask.
+      APInt SignBit = APInt::getSignBit(BitWidth).lshr(ShAmt);
+      
+      // If the input sign bit is known to be zero, or if none of the top bits
+      // are demanded, turn this into an unsigned shift right.
+      if (KnownZero.intersects(SignBit) || (HighBits & ~NewMask) == HighBits) {
+        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, 
+                                                 Op.getOperand(0),
+                                                 Op.getOperand(1)));
+      } else if (KnownOne.intersects(SignBit)) { // New bits are known one.
+        KnownOne |= HighBits;
+      }
+    }
+    break;
+  case ISD::SIGN_EXTEND_INREG: {
+    EVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+
+    // Sign extension.  Compute the demanded bits in the result that are not 
+    // present in the input.
+    APInt NewBits =
+      APInt::getHighBitsSet(BitWidth,
+                            BitWidth - EVT.getScalarType().getSizeInBits());
+    
+    // If none of the extended bits are demanded, eliminate the sextinreg.
+    if ((NewBits & NewMask) == 0)
+      return TLO.CombineTo(Op, Op.getOperand(0));
+
+    APInt InSignBit = APInt::getSignBit(EVT.getScalarType().getSizeInBits());
+    InSignBit.zext(BitWidth);
+    APInt InputDemandedBits =
+      APInt::getLowBitsSet(BitWidth,
+                           EVT.getScalarType().getSizeInBits()) &
+      NewMask;
+    
+    // Since the sign extended bits are demanded, we know that the sign
+    // bit is demanded.
+    InputDemandedBits |= InSignBit;
+
+    if (SimplifyDemandedBits(Op.getOperand(0), InputDemandedBits,
+                             KnownZero, KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+
+    // If the sign bit of the input is known set or clear, then we know the
+    // top bits of the result.
+    
+    // If the input sign bit is known zero, convert this into a zero extension.
+    if (KnownZero.intersects(InSignBit))
+      return TLO.CombineTo(Op, 
+                           TLO.DAG.getZeroExtendInReg(Op.getOperand(0),dl,EVT));
+    
+    if (KnownOne.intersects(InSignBit)) {    // Input sign bit known set
+      KnownOne |= NewBits;
+      KnownZero &= ~NewBits;
+    } else {                       // Input sign bit unknown
+      KnownZero &= ~NewBits;
+      KnownOne &= ~NewBits;
+    }
+    break;
+  }
+  case ISD::ZERO_EXTEND: {
+    unsigned OperandBitWidth =
+      Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+    APInt InMask = NewMask;
+    InMask.trunc(OperandBitWidth);
+    
+    // If none of the top bits are demanded, convert this into an any_extend.
+    APInt NewBits =
+      APInt::getHighBitsSet(BitWidth, BitWidth - OperandBitWidth) & NewMask;
+    if (!NewBits.intersects(NewMask))
+      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl,
+                                               Op.getValueType(), 
+                                               Op.getOperand(0)));
+    
+    if (SimplifyDemandedBits(Op.getOperand(0), InMask,
+                             KnownZero, KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+    KnownZero.zext(BitWidth);
+    KnownOne.zext(BitWidth);
+    KnownZero |= NewBits;
+    break;
+  }
+  case ISD::SIGN_EXTEND: {
+    EVT InVT = Op.getOperand(0).getValueType();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
+    APInt InMask    = APInt::getLowBitsSet(BitWidth, InBits);
+    APInt InSignBit = APInt::getBitsSet(BitWidth, InBits - 1, InBits);
+    APInt NewBits   = ~InMask & NewMask;
+    
+    // If none of the top bits are demanded, convert this into an any_extend.
+    if (NewBits == 0)
+      return TLO.CombineTo(Op,TLO.DAG.getNode(ISD::ANY_EXTEND, dl,
+                                              Op.getValueType(),
+                                              Op.getOperand(0)));
+    
+    // Since some of the sign extended bits are demanded, we know that the sign
+    // bit is demanded.
+    APInt InDemandedBits = InMask & NewMask;
+    InDemandedBits |= InSignBit;
+    InDemandedBits.trunc(InBits);
+    
+    if (SimplifyDemandedBits(Op.getOperand(0), InDemandedBits, KnownZero, 
+                             KnownOne, TLO, Depth+1))
+      return true;
+    KnownZero.zext(BitWidth);
+    KnownOne.zext(BitWidth);
+    
+    // If the sign bit is known zero, convert this to a zero extend.
+    if (KnownZero.intersects(InSignBit))
+      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl,
+                                               Op.getValueType(), 
+                                               Op.getOperand(0)));
+    
+    // If the sign bit is known one, the top bits match.
+    if (KnownOne.intersects(InSignBit)) {
+      KnownOne  |= NewBits;
+      KnownZero &= ~NewBits;
+    } else {   // Otherwise, top bits aren't known.
+      KnownOne  &= ~NewBits;
+      KnownZero &= ~NewBits;
+    }
+    break;
+  }
+  case ISD::ANY_EXTEND: {
+    unsigned OperandBitWidth =
+      Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+    APInt InMask = NewMask;
+    InMask.trunc(OperandBitWidth);
+    if (SimplifyDemandedBits(Op.getOperand(0), InMask,
+                             KnownZero, KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+    KnownZero.zext(BitWidth);
+    KnownOne.zext(BitWidth);
+    break;
+  }
+  case ISD::TRUNCATE: {
+    // Simplify the input, using demanded bit information, and compute the known
+    // zero/one bits live out.
+    unsigned OperandBitWidth =
+      Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+    APInt TruncMask = NewMask;
+    TruncMask.zext(OperandBitWidth);
+    if (SimplifyDemandedBits(Op.getOperand(0), TruncMask,
+                             KnownZero, KnownOne, TLO, Depth+1))
+      return true;
+    KnownZero.trunc(BitWidth);
+    KnownOne.trunc(BitWidth);
+    
+    // If the input is only used by this truncate, see if we can shrink it based
+    // on the known demanded bits.
+    if (Op.getOperand(0).getNode()->hasOneUse()) {
+      SDValue In = Op.getOperand(0);
+      switch (In.getOpcode()) {
+      default: break;
+      case ISD::SRL:
+        // Shrink SRL by a constant if none of the high bits shifted in are
+        // demanded.
+        if (TLO.LegalTypes() &&
+            !isTypeDesirableForOp(ISD::SRL, Op.getValueType()))
+          // Do not turn (vt1 truncate (vt2 srl)) into (vt1 srl) if vt1 is
+          // undesirable.
+          break;
+        ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
+        if (!ShAmt)
+          break;
+        APInt HighBits = APInt::getHighBitsSet(OperandBitWidth,
+                                               OperandBitWidth - BitWidth);
+        HighBits = HighBits.lshr(ShAmt->getZExtValue());
+        HighBits.trunc(BitWidth);
+
+        if (ShAmt->getZExtValue() < BitWidth && !(HighBits & NewMask)) {
+          // None of the shifted in bits are needed.  Add a truncate of the
+          // shift input, then shift it.
+          SDValue NewTrunc = TLO.DAG.getNode(ISD::TRUNCATE, dl,
+                                             Op.getValueType(), 
+                                             In.getOperand(0));
+          return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl,
+                                                   Op.getValueType(),
+                                                   NewTrunc, 
+                                                   In.getOperand(1)));
+        }
+        break;
+      }
+    }
+    
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+    break;
+  }
+  case ISD::AssertZext: {
+    // Demand all the bits of the input that are demanded in the output.
+    // The low bits are obvious; the high bits are demanded because we're
+    // asserting that they're zero here.
+    if (SimplifyDemandedBits(Op.getOperand(0), NewMask,
+                             KnownZero, KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+
+    EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+    APInt InMask = APInt::getLowBitsSet(BitWidth,
+                                        VT.getSizeInBits());
+    KnownZero |= ~InMask & NewMask;
+    break;
+  }
+  case ISD::BIT_CONVERT:
+#if 0
+    // If this is an FP->Int bitcast and if the sign bit is the only thing that
+    // is demanded, turn this into a FGETSIGN.
+    if (NewMask == EVT::getIntegerVTSignBit(Op.getValueType()) &&
+        MVT::isFloatingPoint(Op.getOperand(0).getValueType()) &&
+        !MVT::isVector(Op.getOperand(0).getValueType())) {
+      // Only do this xform if FGETSIGN is valid or if before legalize.
+      if (!TLO.AfterLegalize ||
+          isOperationLegal(ISD::FGETSIGN, Op.getValueType())) {
+        // Make a FGETSIGN + SHL to move the sign bit into the appropriate
+        // place.  We expect the SHL to be eliminated by other optimizations.
+        SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, Op.getValueType(), 
+                                         Op.getOperand(0));
+        unsigned ShVal = Op.getValueType().getSizeInBits()-1;
+        SDValue ShAmt = TLO.DAG.getConstant(ShVal, getShiftAmountTy());
+        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, Op.getValueType(),
+                                                 Sign, ShAmt));
+      }
+    }
+#endif
+    break;
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::SUB: {
+    // Add, Sub, and Mul don't demand any bits in positions beyond that
+    // of the highest bit demanded of them.
+    APInt LoMask = APInt::getLowBitsSet(BitWidth,
+                                        BitWidth - NewMask.countLeadingZeros());
+    if (SimplifyDemandedBits(Op.getOperand(0), LoMask, KnownZero2,
+                             KnownOne2, TLO, Depth+1))
+      return true;
+    if (SimplifyDemandedBits(Op.getOperand(1), LoMask, KnownZero2,
+                             KnownOne2, TLO, Depth+1))
+      return true;
+    // See if the operation should be performed at a smaller bit width.
+    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+      return true;
+  }
+  // FALL THROUGH
+  default:
+    // Just use ComputeMaskedBits to compute output bits.
+    TLO.DAG.ComputeMaskedBits(Op, NewMask, KnownZero, KnownOne, Depth);
+    break;
+  }
+  
+  // If we know the value of all of the demanded bits, return this as a
+  // constant.
+  if ((NewMask & (KnownZero|KnownOne)) == NewMask)
+    return TLO.CombineTo(Op, TLO.DAG.getConstant(KnownOne, Op.getValueType()));
+  
+  return false;
+}
+
+/// computeMaskedBitsForTargetNode - Determine which of the bits specified 
+/// in Mask are known to be either zero or one and return them in the 
+/// KnownZero/KnownOne bitsets.
+void TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 
+                                                    const APInt &Mask,
+                                                    APInt &KnownZero, 
+                                                    APInt &KnownOne,
+                                                    const SelectionDAG &DAG,
+                                                    unsigned Depth) const {
+  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+          Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+         "Should use MaskedValueIsZero if you don't know whether Op"
+         " is a target node!");
+  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
+}
+
+/// ComputeNumSignBitsForTargetNode - This method can be implemented by
+/// targets that want to expose additional information about sign bits to the
+/// DAG Combiner.
+unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+                                                         unsigned Depth) const {
+  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+          Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+         "Should use ComputeNumSignBits if you don't know whether Op"
+         " is a target node!");
+  return 1;
+}
+
+/// ValueHasExactlyOneBitSet - Test if the given value is known to have exactly
+/// one bit set. This differs from ComputeMaskedBits in that it doesn't need to
+/// determine which bit is set.
+///
+static bool ValueHasExactlyOneBitSet(SDValue Val, const SelectionDAG &DAG) {
+  // A left-shift of a constant one will have exactly one bit set, because
+  // shifting the bit off the end is undefined.
+  if (Val.getOpcode() == ISD::SHL)
+    if (ConstantSDNode *C =
+         dyn_cast<ConstantSDNode>(Val.getNode()->getOperand(0)))
+      if (C->getAPIntValue() == 1)
+        return true;
+
+  // Similarly, a right-shift of a constant sign-bit will have exactly
+  // one bit set.
+  if (Val.getOpcode() == ISD::SRL)
+    if (ConstantSDNode *C =
+         dyn_cast<ConstantSDNode>(Val.getNode()->getOperand(0)))
+      if (C->getAPIntValue().isSignBit())
+        return true;
+
+  // More could be done here, though the above checks are enough
+  // to handle some common cases.
+
+  // Fall back to ComputeMaskedBits to catch other known cases.
+  EVT OpVT = Val.getValueType();
+  unsigned BitWidth = OpVT.getScalarType().getSizeInBits();
+  APInt Mask = APInt::getAllOnesValue(BitWidth);
+  APInt KnownZero, KnownOne;
+  DAG.ComputeMaskedBits(Val, Mask, KnownZero, KnownOne);
+  return (KnownZero.countPopulation() == BitWidth - 1) &&
+         (KnownOne.countPopulation() == 1);
+}
+
+/// SimplifySetCC - Try to simplify a setcc built with the specified operands 
+/// and cc. If it is unable to simplify it, return a null SDValue.
+SDValue
+TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
+                              ISD::CondCode Cond, bool foldBooleans,
+                              DAGCombinerInfo &DCI, DebugLoc dl) const {
+  SelectionDAG &DAG = DCI.DAG;
+  LLVMContext &Context = *DAG.getContext();
+
+  // These setcc operations always fold.
+  switch (Cond) {
+  default: break;
+  case ISD::SETFALSE:
+  case ISD::SETFALSE2: return DAG.getConstant(0, VT);
+  case ISD::SETTRUE:
+  case ISD::SETTRUE2:  return DAG.getConstant(1, VT);
+  }
+
+  if (isa<ConstantSDNode>(N0.getNode())) {
+    // Ensure that the constant occurs on the RHS, and fold constant
+    // comparisons.
+    return DAG.getSetCC(dl, VT, N1, N0, ISD::getSetCCSwappedOperands(Cond));
+  }
+
+  if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
+    const APInt &C1 = N1C->getAPIntValue();
+
+    // If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an
+    // equality comparison, then we're just comparing whether X itself is
+    // zero.
+    if (N0.getOpcode() == ISD::SRL && (C1 == 0 || C1 == 1) &&
+        N0.getOperand(0).getOpcode() == ISD::CTLZ &&
+        N0.getOperand(1).getOpcode() == ISD::Constant) {
+      const APInt &ShAmt
+        = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+      if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+          ShAmt == Log2_32(N0.getValueType().getSizeInBits())) {
+        if ((C1 == 0) == (Cond == ISD::SETEQ)) {
+          // (srl (ctlz x), 5) == 0  -> X != 0
+          // (srl (ctlz x), 5) != 1  -> X != 0
+          Cond = ISD::SETNE;
+        } else {
+          // (srl (ctlz x), 5) != 0  -> X == 0
+          // (srl (ctlz x), 5) == 1  -> X == 0
+          Cond = ISD::SETEQ;
+        }
+        SDValue Zero = DAG.getConstant(0, N0.getValueType());
+        return DAG.getSetCC(dl, VT, N0.getOperand(0).getOperand(0),
+                            Zero, Cond);
+      }
+    }
+
+    // If the LHS is '(and load, const)', the RHS is 0,
+    // the test is for equality or unsigned, and all 1 bits of the const are
+    // in the same partial word, see if we can shorten the load.
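+    // For example, on a little-endian target, ((i32 load P) & 0xFF00) == 0
+    // can be narrowed to ((i8 load P+1) & 0xFF) == 0, touching one byte.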
+    if (DCI.isBeforeLegalize() &&
+        N0.getOpcode() == ISD::AND && C1 == 0 &&
+        N0.getNode()->hasOneUse() &&
+        isa<LoadSDNode>(N0.getOperand(0)) &&
+        N0.getOperand(0).getNode()->hasOneUse() &&
+        isa<ConstantSDNode>(N0.getOperand(1))) {
+      LoadSDNode *Lod = cast<LoadSDNode>(N0.getOperand(0));
+      APInt bestMask;
+      unsigned bestWidth = 0, bestOffset = 0;
+      if (!Lod->isVolatile() && Lod->isUnindexed()) {
+        unsigned origWidth = N0.getValueType().getSizeInBits();
+        unsigned maskWidth = origWidth;
+        // We can narrow (e.g.) 16-bit extending loads on a 32-bit target to
+        // 8 bits, but have to be careful...
+        if (Lod->getExtensionType() != ISD::NON_EXTLOAD)
+          origWidth = Lod->getMemoryVT().getSizeInBits();
+        const APInt &Mask =
+          cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+        for (unsigned width = origWidth / 2; width>=8; width /= 2) {
+          APInt newMask = APInt::getLowBitsSet(maskWidth, width);
+          for (unsigned offset=0; offset<origWidth/width; offset++) {
+            if ((newMask & Mask) == Mask) {
+              if (!TD->isLittleEndian())
+                bestOffset = (origWidth/width - offset - 1) * (width/8);
+              else
+                bestOffset = (uint64_t)offset * (width/8);
+              bestMask = Mask.lshr(offset * (width/8) * 8);
+              bestWidth = width;
+              break;
+            }
+            newMask = newMask << width;
+          }
+        }
+      }
+      if (bestWidth) {
+        EVT newVT = EVT::getIntegerVT(Context, bestWidth);
+        if (newVT.isRound()) {
+          EVT PtrType = Lod->getOperand(1).getValueType();
+          SDValue Ptr = Lod->getBasePtr();
+          if (bestOffset != 0)
+            Ptr = DAG.getNode(ISD::ADD, dl, PtrType, Lod->getBasePtr(),
+                              DAG.getConstant(bestOffset, PtrType));
+          unsigned NewAlign = MinAlign(Lod->getAlignment(), bestOffset);
+          SDValue NewLoad = DAG.getLoad(newVT, dl, Lod->getChain(), Ptr,
+                                        Lod->getSrcValue(), 
+                                        Lod->getSrcValueOffset() + bestOffset,
+                                        false, false, NewAlign);
+          return DAG.getSetCC(dl, VT, 
+                              DAG.getNode(ISD::AND, dl, newVT, NewLoad,
+                                      DAG.getConstant(bestMask.trunc(bestWidth),
+                                                      newVT)),
+                              DAG.getConstant(0LL, newVT), Cond);
+        }
+      }
+    }
+
+    // If the LHS is a ZERO_EXTEND, perform the comparison on the input.
+    if (N0.getOpcode() == ISD::ZERO_EXTEND) {
+      unsigned InSize = N0.getOperand(0).getValueType().getSizeInBits();
+
+      // If the comparison constant has bits in the upper part, the
+      // zero-extended value could never match.
+      if (C1.intersects(APInt::getHighBitsSet(C1.getBitWidth(),
+                                              C1.getBitWidth() - InSize))) {
+        switch (Cond) {
+        case ISD::SETUGT:
+        case ISD::SETUGE:
+        case ISD::SETEQ: return DAG.getConstant(0, VT);
+        case ISD::SETULT:
+        case ISD::SETULE:
+        case ISD::SETNE: return DAG.getConstant(1, VT);
+        case ISD::SETGT:
+        case ISD::SETGE:
+          // True if the sign bit of C1 is set.
+          return DAG.getConstant(C1.isNegative(), VT);
+        case ISD::SETLT:
+        case ISD::SETLE:
+          // True if the sign bit of C1 isn't set.
+          return DAG.getConstant(C1.isNonNegative(), VT);
+        default:
+          break;
+        }
+      }
+
+      // Otherwise, we can perform the comparison with the low bits.
+      switch (Cond) {
+      case ISD::SETEQ:
+      case ISD::SETNE:
+      case ISD::SETUGT:
+      case ISD::SETUGE:
+      case ISD::SETULT:
+      case ISD::SETULE: {
+        EVT newVT = N0.getOperand(0).getValueType();
+        if (DCI.isBeforeLegalizeOps() ||
+            (isOperationLegal(ISD::SETCC, newVT) &&
+              getCondCodeAction(Cond, newVT)==Legal))
+          return DAG.getSetCC(dl, VT, N0.getOperand(0),
+                              DAG.getConstant(APInt(C1).trunc(InSize), newVT),
+                              Cond);
+        break;
+      }
+      default:
+        break;   // TODO: be more careful with signed comparisons
+      }
+    } else if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+               (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+      EVT ExtSrcTy = cast<VTSDNode>(N0.getOperand(1))->getVT();
+      unsigned ExtSrcTyBits = ExtSrcTy.getSizeInBits();
+      EVT ExtDstTy = N0.getValueType();
+      unsigned ExtDstTyBits = ExtDstTy.getSizeInBits();
+
+      // If the constant doesn't fit into the number of bits for the source of
+      // the sign extension, it is impossible for both sides to be equal.
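+      // For example (illustrative): a value sign-extended from i8 can never
+      // equal 300, which needs 10 signed bits, so the comparison folds below.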
+      if (C1.getMinSignedBits() > ExtSrcTyBits)
+        return DAG.getConstant(Cond == ISD::SETNE, VT);
+      
+      SDValue ZextOp;
+      EVT Op0Ty = N0.getOperand(0).getValueType();
+      if (Op0Ty == ExtSrcTy) {
+        ZextOp = N0.getOperand(0);
+      } else {
+        APInt Imm = APInt::getLowBitsSet(ExtDstTyBits, ExtSrcTyBits);
+        ZextOp = DAG.getNode(ISD::AND, dl, Op0Ty, N0.getOperand(0),
+                              DAG.getConstant(Imm, Op0Ty));
+      }
+      if (!DCI.isCalledByLegalizer())
+        DCI.AddToWorklist(ZextOp.getNode());
+      // Otherwise, make this a use of a zext.
+      return DAG.getSetCC(dl, VT, ZextOp, 
+                          DAG.getConstant(C1 & APInt::getLowBitsSet(
+                                                              ExtDstTyBits,
+                                                              ExtSrcTyBits), 
+                                          ExtDstTy),
+                          Cond);
+    } else if ((N1C->isNullValue() || N1C->getAPIntValue() == 1) &&
+                (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+      // SETCC (SETCC), [0|1], [EQ|NE]  -> SETCC
+      if (N0.getOpcode() == ISD::SETCC &&
+          isTypeLegal(VT) && VT.bitsLE(N0.getValueType())) {
+        bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (N1C->getAPIntValue() != 1);
+        if (TrueWhenTrue)
+          return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);        
+        // Invert the condition.
+        ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+        CC = ISD::getSetCCInverse(CC, 
+                                  N0.getOperand(0).getValueType().isInteger());
+        return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
+      }
+
+      if ((N0.getOpcode() == ISD::XOR ||
+           (N0.getOpcode() == ISD::AND && 
+            N0.getOperand(0).getOpcode() == ISD::XOR &&
+            N0.getOperand(1) == N0.getOperand(0).getOperand(1))) &&
+          isa<ConstantSDNode>(N0.getOperand(1)) &&
+          cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue() == 1) {
+        // If this is (X^1) == 0/1, swap the RHS and eliminate the xor.  We
+        // can only do this if the top bits are known zero.
+        unsigned BitWidth = N0.getValueSizeInBits();
+        if (DAG.MaskedValueIsZero(N0,
+                                  APInt::getHighBitsSet(BitWidth,
+                                                        BitWidth-1))) {
+          // Okay, get the un-inverted input value.
+          SDValue Val;
+          if (N0.getOpcode() == ISD::XOR)
+            Val = N0.getOperand(0);
+          else {
+            assert(N0.getOpcode() == ISD::AND && 
+                    N0.getOperand(0).getOpcode() == ISD::XOR);
+            // ((X^1)&1)^1 -> X & 1
+            Val = DAG.getNode(ISD::AND, dl, N0.getValueType(),
+                              N0.getOperand(0).getOperand(0),
+                              N0.getOperand(1));
+          }
+
+          return DAG.getSetCC(dl, VT, Val, N1,
+                              Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
+        }
+      } else if (N1C->getAPIntValue() == 1 &&
+                 (VT == MVT::i1 ||
+                  getBooleanContents() == ZeroOrOneBooleanContent)) {
+        SDValue Op0 = N0;
+        if (Op0.getOpcode() == ISD::TRUNCATE)
+          Op0 = Op0.getOperand(0);
+
+        if ((Op0.getOpcode() == ISD::XOR) &&
+            Op0.getOperand(0).getOpcode() == ISD::SETCC &&
+            Op0.getOperand(1).getOpcode() == ISD::SETCC) {
+          // (xor (setcc), (setcc)) == / != 1 -> (setcc) != / == (setcc)
+          Cond = (Cond == ISD::SETEQ) ? ISD::SETNE : ISD::SETEQ;
+          return DAG.getSetCC(dl, VT, Op0.getOperand(0), Op0.getOperand(1),
+                              Cond);
+        } else if (Op0.getOpcode() == ISD::AND &&
+                isa<ConstantSDNode>(Op0.getOperand(1)) &&
+                cast<ConstantSDNode>(Op0.getOperand(1))->getAPIntValue() == 1) {
+          // If this is (X&1) == / != 1, normalize it to (X&1) != / == 0.
+          if (Op0.getValueType().bitsGT(VT))
+            Op0 = DAG.getNode(ISD::AND, dl, VT,
+                          DAG.getNode(ISD::TRUNCATE, dl, VT, Op0.getOperand(0)),
+                          DAG.getConstant(1, VT));
+          else if (Op0.getValueType().bitsLT(VT))
+            Op0 = DAG.getNode(ISD::AND, dl, VT,
+                        DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op0.getOperand(0)),
+                        DAG.getConstant(1, VT));
+
+          return DAG.getSetCC(dl, VT, Op0,
+                              DAG.getConstant(0, Op0.getValueType()),
+                              Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
+        }
+      }
+    }
+    
+    APInt MinVal, MaxVal;
+    unsigned OperandBitSize = N1C->getValueType(0).getSizeInBits();
+    if (ISD::isSignedIntSetCC(Cond)) {
+      MinVal = APInt::getSignedMinValue(OperandBitSize);
+      MaxVal = APInt::getSignedMaxValue(OperandBitSize);
+    } else {
+      MinVal = APInt::getMinValue(OperandBitSize);
+      MaxVal = APInt::getMaxValue(OperandBitSize);
+    }
+
+    // Canonicalize GE/LE comparisons to use GT/LT comparisons.
+    if (Cond == ISD::SETGE || Cond == ISD::SETUGE) {
+      if (C1 == MinVal) return DAG.getConstant(1, VT);   // X >= MIN --> true
+      // X >= C0 --> X > (C0-1)
+      return DAG.getSetCC(dl, VT, N0, 
+                          DAG.getConstant(C1-1, N1.getValueType()),
+                          (Cond == ISD::SETGE) ? ISD::SETGT : ISD::SETUGT);
+    }
+
+    if (Cond == ISD::SETLE || Cond == ISD::SETULE) {
+      if (C1 == MaxVal) return DAG.getConstant(1, VT);   // X <= MAX --> true
+      // X <= C0 --> X < (C0+1)
+      return DAG.getSetCC(dl, VT, N0, 
+                          DAG.getConstant(C1+1, N1.getValueType()),
+                          (Cond == ISD::SETLE) ? ISD::SETLT : ISD::SETULT);
+    }
+
+    if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MinVal)
+      return DAG.getConstant(0, VT);      // X < MIN --> false
+    if ((Cond == ISD::SETGE || Cond == ISD::SETUGE) && C1 == MinVal)
+      return DAG.getConstant(1, VT);      // X >= MIN --> true
+    if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MaxVal)
+      return DAG.getConstant(0, VT);      // X > MAX --> false
+    if ((Cond == ISD::SETLE || Cond == ISD::SETULE) && C1 == MaxVal)
+      return DAG.getConstant(1, VT);      // X <= MAX --> true
+
+    // Canonicalize setgt X, Min --> setne X, Min
+    if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MinVal)
+      return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
+    // Canonicalize setlt X, Max --> setne X, Max
+    if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MaxVal)
+      return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
+
+    // If we have setult X, 1, turn it into seteq X, 0
+    if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MinVal+1)
+      return DAG.getSetCC(dl, VT, N0, 
+                          DAG.getConstant(MinVal, N0.getValueType()), 
+                          ISD::SETEQ);
+    // If we have setugt X, Max-1, turn it into seteq X, Max
+    else if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MaxVal-1)
+      return DAG.getSetCC(dl, VT, N0, 
+                          DAG.getConstant(MaxVal, N0.getValueType()),
+                          ISD::SETEQ);
+
+    // If we have "setcc X, C0", check to see if we can shrink the immediate
+    // by changing cc.
+
+    // SETUGT X, SINTMAX  -> SETLT X, 0
+    if (Cond == ISD::SETUGT && 
+        C1 == APInt::getSignedMaxValue(OperandBitSize))
+      return DAG.getSetCC(dl, VT, N0, 
+                          DAG.getConstant(0, N1.getValueType()),
+                          ISD::SETLT);
+
+    // SETULT X, SINTMIN  -> SETGT X, -1
+    if (Cond == ISD::SETULT &&
+        C1 == APInt::getSignedMinValue(OperandBitSize)) {
+      SDValue ConstMinusOne =
+          DAG.getConstant(APInt::getAllOnesValue(OperandBitSize),
+                          N1.getValueType());
+      return DAG.getSetCC(dl, VT, N0, ConstMinusOne, ISD::SETGT);
+    }
+
+    // Fold bit comparisons when we can.
+    if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+        (VT == N0.getValueType() ||
+         (isTypeLegal(VT) && VT.bitsLE(N0.getValueType()))) &&
+        N0.getOpcode() == ISD::AND)
+      if (ConstantSDNode *AndRHS =
+                  dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+        EVT ShiftTy = DCI.isBeforeLegalize() ?
+          getPointerTy() : getShiftAmountTy();
+        if (Cond == ISD::SETNE && C1 == 0) { // (X & 8) != 0  -->  (X & 8) >> 3
+          // Perform the xform if the AND RHS is a single bit.
+          if (AndRHS->getAPIntValue().isPowerOf2()) {
+            return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                              DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
+                   DAG.getConstant(AndRHS->getAPIntValue().logBase2(), ShiftTy)));
+          }
+        } else if (Cond == ISD::SETEQ && C1 == AndRHS->getAPIntValue()) {
+          // (X & 8) == 8  -->  (X & 8) >> 3
+          // Perform the xform if C1 is a single bit.
+          if (C1.isPowerOf2()) {
+            return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                               DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
+                                      DAG.getConstant(C1.logBase2(), ShiftTy)));
+          }
+        }
+      }
+  }
+
+  if (isa<ConstantFPSDNode>(N0.getNode())) {
+    // Constant fold or commute setcc.
+    SDValue O = DAG.FoldSetCC(VT, N0, N1, Cond, dl);
+    if (O.getNode()) return O;
+  } else if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1.getNode())) {
+    // If the RHS of an FP comparison is a constant, simplify it away in
+    // some cases.
+    if (CFP->getValueAPF().isNaN()) {
+      // If an operand is known to be a nan, we can fold it.
+      switch (ISD::getUnorderedFlavor(Cond)) {
+      default: llvm_unreachable("Unknown flavor!");
+      case 0:  // Known false.
+        return DAG.getConstant(0, VT);
+      case 1:  // Known true.
+        return DAG.getConstant(1, VT);
+      case 2:  // Undefined.
+        return DAG.getUNDEF(VT);
+      }
+    }
+    
+    // Otherwise, we know the RHS is not a NaN.  Simplify the node to drop the
+    // constant if knowing that the operand is non-nan is enough.  We prefer to
+    // have SETO(x,x) instead of SETO(x, 0.0) because this avoids having to
+    // materialize 0.0.
+    if (Cond == ISD::SETO || Cond == ISD::SETUO)
+      return DAG.getSetCC(dl, VT, N0, N0, Cond);
+
+    // If the condition is not legal, see if we can find an equivalent one
+    // which is legal.
+    if (!isCondCodeLegal(Cond, N0.getValueType())) {
+      // If the comparison was an awkward floating-point == or != and one of
+      // the comparison operands is infinity or negative infinity, convert the
+      // condition to a less-awkward <= or >=.
+      if (CFP->getValueAPF().isInfinity()) {
+        if (CFP->getValueAPF().isNegative()) {
+          if (Cond == ISD::SETOEQ &&
+              isCondCodeLegal(ISD::SETOLE, N0.getValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLE);
+          if (Cond == ISD::SETUEQ &&
+              isCondCodeLegal(ISD::SETULE, N0.getValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULE);
+          if (Cond == ISD::SETUNE &&
+              isCondCodeLegal(ISD::SETUGT, N0.getValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGT);
+          if (Cond == ISD::SETONE &&
+              isCondCodeLegal(ISD::SETOGT, N0.getValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGT);
+        } else {
+          if (Cond == ISD::SETOEQ &&
+              isCondCodeLegal(ISD::SETOGE, N0.getValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGE);
+          if (Cond == ISD::SETUEQ &&
+              isCondCodeLegal(ISD::SETUGE, N0.getValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGE);
+          if (Cond == ISD::SETUNE &&
+              isCondCodeLegal(ISD::SETULT, N0.getValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULT);
+          if (Cond == ISD::SETONE &&
+              isCondCodeLegal(ISD::SETOLT, N0.getValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLT);
+        }
+      }
+    }
+  }
+
+  if (N0 == N1) {
+    // We can always fold X == X for integer setcc's.
+    if (N0.getValueType().isInteger())
+      return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT);
+    unsigned UOF = ISD::getUnorderedFlavor(Cond);
+    if (UOF == 2)   // FP operators that are undefined on NaNs.
+      return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT);
+    if (UOF == unsigned(ISD::isTrueWhenEqual(Cond)))
+      return DAG.getConstant(UOF, VT);
+    // Otherwise, we can't fold it.  However, we can simplify it to SETUO/SETO
+    // if it is not already.
+    ISD::CondCode NewCond = UOF == 0 ? ISD::SETO : ISD::SETUO;
+    if (NewCond != Cond)
+      return DAG.getSetCC(dl, VT, N0, N1, NewCond);
+  }
+
+  if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+      N0.getValueType().isInteger()) {
+    if (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::SUB ||
+        N0.getOpcode() == ISD::XOR) {
+      // Simplify (X+Y) == (X+Z) -->  Y == Z
+      if (N0.getOpcode() == N1.getOpcode()) {
+        if (N0.getOperand(0) == N1.getOperand(0))
+          return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(1), Cond);
+        if (N0.getOperand(1) == N1.getOperand(1))
+          return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(0), Cond);
+        if (DAG.isCommutativeBinOp(N0.getOpcode())) {
+          // If X op Y == Y op X, try other combinations.
+          if (N0.getOperand(0) == N1.getOperand(1))
+            return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(0), 
+                                Cond);
+          if (N0.getOperand(1) == N1.getOperand(0))
+            return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(1), 
+                                Cond);
+        }
+      }
+      
+      if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(N1)) {
+        if (ConstantSDNode *LHSR = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+          // Turn (X+C1) == C2 --> X == C2-C1
+          if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse()) {
+            return DAG.getSetCC(dl, VT, N0.getOperand(0),
+                                DAG.getConstant(RHSC->getAPIntValue()-
+                                                LHSR->getAPIntValue(),
+                                N0.getValueType()), Cond);
+          }
+          
+          // Turn (X^C1) == C2 into X == C1^C2 iff X&~C1 = 0.
+          if (N0.getOpcode() == ISD::XOR)
+            // If we know that all of the inverted bits are zero, don't bother
+            // performing the inversion.
+            if (DAG.MaskedValueIsZero(N0.getOperand(0), ~LHSR->getAPIntValue()))
+              return
+                DAG.getSetCC(dl, VT, N0.getOperand(0),
+                             DAG.getConstant(LHSR->getAPIntValue() ^
+                                               RHSC->getAPIntValue(),
+                                             N0.getValueType()),
+                             Cond);
+        }
+        
+        // Turn (C1-X) == C2 --> X == C1-C2
+        if (ConstantSDNode *SUBC = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
+          if (N0.getOpcode() == ISD::SUB && N0.getNode()->hasOneUse()) {
+            return
+              DAG.getSetCC(dl, VT, N0.getOperand(1),
+                           DAG.getConstant(SUBC->getAPIntValue() -
+                                             RHSC->getAPIntValue(),
+                                           N0.getValueType()),
+                           Cond);
+          }
+        }          
+      }
+
+      // Simplify (X+Z) == X -->  Z == 0
+      if (N0.getOperand(0) == N1)
+        return DAG.getSetCC(dl, VT, N0.getOperand(1),
+                        DAG.getConstant(0, N0.getValueType()), Cond);
+      if (N0.getOperand(1) == N1) {
+        if (DAG.isCommutativeBinOp(N0.getOpcode()))
+          return DAG.getSetCC(dl, VT, N0.getOperand(0),
+                          DAG.getConstant(0, N0.getValueType()), Cond);
+        else if (N0.getNode()->hasOneUse()) {
+          assert(N0.getOpcode() == ISD::SUB && "Unexpected operation!");
+          // (Z-X) == X  --> Z == X<<1
+          SDValue SH = DAG.getNode(ISD::SHL, dl, N1.getValueType(),
+                                     N1, 
+                                     DAG.getConstant(1, getShiftAmountTy()));
+          if (!DCI.isCalledByLegalizer())
+            DCI.AddToWorklist(SH.getNode());
+          return DAG.getSetCC(dl, VT, N0.getOperand(0), SH, Cond);
+        }
+      }
+    }
+
+    if (N1.getOpcode() == ISD::ADD || N1.getOpcode() == ISD::SUB ||
+        N1.getOpcode() == ISD::XOR) {
+      // Simplify  X == (X+Z) -->  Z == 0
+      if (N1.getOperand(0) == N0) {
+        return DAG.getSetCC(dl, VT, N1.getOperand(1),
+                        DAG.getConstant(0, N1.getValueType()), Cond);
+      } else if (N1.getOperand(1) == N0) {
+        if (DAG.isCommutativeBinOp(N1.getOpcode())) {
+          return DAG.getSetCC(dl, VT, N1.getOperand(0),
+                          DAG.getConstant(0, N1.getValueType()), Cond);
+        } else if (N1.getNode()->hasOneUse()) {
+          assert(N1.getOpcode() == ISD::SUB && "Unexpected operation!");
+          // X == (Z-X)  --> X<<1 == Z
+          SDValue SH = DAG.getNode(ISD::SHL, dl, N1.getValueType(), N0, 
+                                     DAG.getConstant(1, getShiftAmountTy()));
+          if (!DCI.isCalledByLegalizer())
+            DCI.AddToWorklist(SH.getNode());
+          return DAG.getSetCC(dl, VT, SH, N1.getOperand(0), Cond);
+        }
+      }
+    }
+
+    // Simplify x&y == y to x&y != 0 if y has exactly one bit set.
+    // Note that where y is variable and is known to have at most
+    // one bit set (for example, if it is z&1) we cannot do this;
+    // the expressions are not equivalent when y==0.
+    if (N0.getOpcode() == ISD::AND)
+      if (N0.getOperand(0) == N1 || N0.getOperand(1) == N1) {
+        if (ValueHasExactlyOneBitSet(N1, DAG)) {
+          Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
+          SDValue Zero = DAG.getConstant(0, N1.getValueType());
+          return DAG.getSetCC(dl, VT, N0, Zero, Cond);
+        }
+      }
+    if (N1.getOpcode() == ISD::AND)
+      if (N1.getOperand(0) == N0 || N1.getOperand(1) == N0) {
+        if (ValueHasExactlyOneBitSet(N0, DAG)) {
+          Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
+          SDValue Zero = DAG.getConstant(0, N0.getValueType());
+          return DAG.getSetCC(dl, VT, N1, Zero, Cond);
+        }
+      }
+  }
+
+  // Fold away ALL boolean setcc's.
+  SDValue Temp;
+  if (N0.getValueType() == MVT::i1 && foldBooleans) {
+    switch (Cond) {
+    default: llvm_unreachable("Unknown integer setcc!");
+    case ISD::SETEQ:  // X == Y  -> ~(X^Y)
+      Temp = DAG.getNode(ISD::XOR, dl, MVT::i1, N0, N1);
+      N0 = DAG.getNOT(dl, Temp, MVT::i1);
+      if (!DCI.isCalledByLegalizer())
+        DCI.AddToWorklist(Temp.getNode());
+      break;
+    case ISD::SETNE:  // X != Y   -->  (X^Y)
+      N0 = DAG.getNode(ISD::XOR, dl, MVT::i1, N0, N1);
+      break;
+    case ISD::SETGT:  // X >s Y   -->  X == 0 & Y == 1  -->  ~X & Y
+    case ISD::SETULT: // X <u Y   -->  X == 0 & Y == 1  -->  ~X & Y
+      Temp = DAG.getNOT(dl, N0, MVT::i1);
+      N0 = DAG.getNode(ISD::AND, dl, MVT::i1, N1, Temp);
+      if (!DCI.isCalledByLegalizer())
+        DCI.AddToWorklist(Temp.getNode());
+      break;
+    case ISD::SETLT:  // X <s Y   --> X == 1 & Y == 0  -->  ~Y & X
+    case ISD::SETUGT: // X >u Y   --> X == 1 & Y == 0  -->  ~Y & X
+      Temp = DAG.getNOT(dl, N1, MVT::i1);
+      N0 = DAG.getNode(ISD::AND, dl, MVT::i1, N0, Temp);
+      if (!DCI.isCalledByLegalizer())
+        DCI.AddToWorklist(Temp.getNode());
+      break;
+    case ISD::SETULE: // X <=u Y  --> X == 0 | Y == 1  -->  ~X | Y
+    case ISD::SETGE:  // X >=s Y  --> X == 0 | Y == 1  -->  ~X | Y
+      Temp = DAG.getNOT(dl, N0, MVT::i1);
+      N0 = DAG.getNode(ISD::OR, dl, MVT::i1, N1, Temp);
+      if (!DCI.isCalledByLegalizer())
+        DCI.AddToWorklist(Temp.getNode());
+      break;
+    case ISD::SETUGE: // X >=u Y  --> X == 1 | Y == 0  -->  ~Y | X
+    case ISD::SETLE:  // X <=s Y  --> X == 1 | Y == 0  -->  ~Y | X
+      Temp = DAG.getNOT(dl, N1, MVT::i1);
+      N0 = DAG.getNode(ISD::OR, dl, MVT::i1, N0, Temp);
+      break;
+    }
+    if (VT != MVT::i1) {
+      if (!DCI.isCalledByLegalizer())
+        DCI.AddToWorklist(N0.getNode());
+      // FIXME: If running after legalize, we probably can't do this.
+      N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, N0);
+    }
+    return N0;
+  }
+
+  // Could not fold it.
+  return SDValue();
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + offset.
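+/// For example (illustrative): for the node (add (GlobalAddress @g + 8), 16),
+/// this returns true with GA = @g and Offset increased by 24.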
+bool TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
+                                    int64_t &Offset) const {
+  if (isa<GlobalAddressSDNode>(N)) {
+    GlobalAddressSDNode *GASD = cast<GlobalAddressSDNode>(N);
+    GA = GASD->getGlobal();
+    Offset += GASD->getOffset();
+    return true;
+  }
+
+  if (N->getOpcode() == ISD::ADD) {
+    SDValue N1 = N->getOperand(0);
+    SDValue N2 = N->getOperand(1);
+    if (isGAPlusOffset(N1.getNode(), GA, Offset)) {
+      ConstantSDNode *V = dyn_cast<ConstantSDNode>(N2);
+      if (V) {
+        Offset += V->getSExtValue();
+        return true;
+      }
+    } else if (isGAPlusOffset(N2.getNode(), GA, Offset)) {
+      ConstantSDNode *V = dyn_cast<ConstantSDNode>(N1);
+      if (V) {
+        Offset += V->getSExtValue();
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+
+SDValue TargetLowering::
+PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
+  // Default implementation: no optimization.
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//  Inline Assembler Implementation Methods
+//===----------------------------------------------------------------------===//
+
+
+TargetLowering::ConstraintType
+TargetLowering::getConstraintType(const std::string &Constraint) const {
+  // FIXME: lots more standard ones to handle.
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;
+    case 'r': return C_RegisterClass;
+    case 'm':    // memory
+    case 'o':    // offsetable
+    case 'V':    // not offsetable
+      return C_Memory;
+    case 'i':    // Simple Integer or Relocatable Constant
+    case 'n':    // Simple Integer
+    case 's':    // Relocatable Constant
+    case 'X':    // Allow ANY value.
+    case 'I':    // Target registers.
+    case 'J':
+    case 'K':
+    case 'L':
+    case 'M':
+    case 'N':
+    case 'O':
+    case 'P':
+      return C_Other;
+    }
+  }
+  
+  if (Constraint.size() > 1 && Constraint[0] == '{' && 
+      Constraint[Constraint.size()-1] == '}')
+    return C_Register;
+  return C_Unknown;
+}
+
+/// LowerXConstraint - try to replace an X constraint, which matches anything,
+/// with another that has more specific requirements based on the type of the
+/// corresponding operand.
+const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
+  if (ConstraintVT.isInteger())
+    return "r";
+  if (ConstraintVT.isFloatingPoint())
+    return "f";      // works for many targets
+  return 0;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                  char ConstraintLetter,
+                                                  std::vector<SDValue> &Ops,
+                                                  SelectionDAG &DAG) const {
+  switch (ConstraintLetter) {
+  default: break;
+  case 'X':     // Allows any operand; labels (basic block) use this.
+    if (Op.getOpcode() == ISD::BasicBlock) {
+      Ops.push_back(Op);
+      return;
+    }
+    // fall through
+  case 'i':    // Simple Integer or Relocatable Constant
+  case 'n':    // Simple Integer
+  case 's': {  // Relocatable Constant
+    // These operands are interested in values of the form (GV+C), where C may
+    // be folded in as an offset of GV, or it may be explicitly added.  Also, it
+    // is possible and fine if either GV or C are missing.
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
+    
+    // If we have "(add GV, C)", pull out GV/C
+    if (Op.getOpcode() == ISD::ADD) {
+      C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+      GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
+      if (C == 0 || GA == 0) {
+        C = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+        GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(1));
+      }
+      if (C == 0 || GA == 0)
+        C = 0, GA = 0;
+    }
+    
+    // If we find a valid operand, map to the TargetXXX version so that the
+    // value itself doesn't get selected.
+    if (GA) {   // Either &GV   or   &GV+C
+      if (ConstraintLetter != 'n') {
+        int64_t Offs = GA->getOffset();
+        if (C) Offs += C->getZExtValue();
+        Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), 
+                                                 C ? C->getDebugLoc() : DebugLoc(),
+                                                 Op.getValueType(), Offs));
+        return;
+      }
+    }
+    if (C) {   // just C, no GV.
+      // Simple constants are not allowed for 's'.
+      if (ConstraintLetter != 's') {
+        // gcc prints these as sign extended.  Sign extend value to 64 bits
+        // now; without this it would get ZExt'd later in
+        // ScheduleDAGSDNodes::EmitNode, which is very generic.
+        Ops.push_back(DAG.getTargetConstant(C->getAPIntValue().getSExtValue(),
+                                            MVT::i64));
+        return;
+      }
+    }
+    break;
+  }
+  }
+}
+
+std::vector<unsigned> TargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                  EVT VT) const {
+  return std::vector<unsigned>();
+}
+
+
+std::pair<unsigned, const TargetRegisterClass*> TargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint,
+                             EVT VT) const {
+  if (Constraint[0] != '{')
+    return std::make_pair(0u, static_cast<TargetRegisterClass*>(0));
+  assert(*(Constraint.end()-1) == '}' && "Not a brace enclosed constraint?");
+
+  // Remove the braces from around the name.
+  StringRef RegName(Constraint.data()+1, Constraint.size()-2);
+
+  // Figure out which register class contains this reg.
+  const TargetRegisterInfo *RI = TM.getRegisterInfo();
+  for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(),
+       E = RI->regclass_end(); RCI != E; ++RCI) {
+    const TargetRegisterClass *RC = *RCI;
+    
+    // If none of the value types for this register class are valid, we 
+    // can't use it.  For example, 64-bit reg classes on 32-bit targets.
+    bool isLegal = false;
+    for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end();
+         I != E; ++I) {
+      if (isTypeLegal(*I)) {
+        isLegal = true;
+        break;
+      }
+    }
+    
+    if (!isLegal) continue;
+    
+    for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); 
+         I != E; ++I) {
+      if (RegName.equals_lower(RI->getName(*I)))
+        return std::make_pair(*I, RC);
+    }
+  }
+  
+  return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
+}
+
+//===----------------------------------------------------------------------===//
+// Constraint Selection.
+
+/// isMatchingInputConstraint - Return true if this is an input operand that is
+/// a matching constraint like "4".
+bool TargetLowering::AsmOperandInfo::isMatchingInputConstraint() const {
+  assert(!ConstraintCode.empty() && "No known constraint!");
+  return isdigit(ConstraintCode[0]);
+}
+
+/// getMatchedOperand - If this is an input matching constraint, this method
+/// returns the output operand it matches.
+unsigned TargetLowering::AsmOperandInfo::getMatchedOperand() const {
+  assert(!ConstraintCode.empty() && "No known constraint!");
+  return atoi(ConstraintCode.c_str());
+}
+
+
+/// getConstraintGenerality - Return an integer indicating how general CT
+/// is.
+static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) {
+  switch (CT) {
+  default: llvm_unreachable("Unknown constraint type!");
+  case TargetLowering::C_Other:
+  case TargetLowering::C_Unknown:
+    return 0;
+  case TargetLowering::C_Register:
+    return 1;
+  case TargetLowering::C_RegisterClass:
+    return 2;
+  case TargetLowering::C_Memory:
+    return 3;
+  }
+}
+
+/// ChooseConstraint - If there are multiple different constraints that we
+/// could pick for this operand (e.g. "imr") try to pick the 'best' one.
+/// This is somewhat tricky: constraints fall into four classes:
+///    Other         -> immediates and magic values
+///    Register      -> one specific register
+///    RegisterClass -> a group of regs
+///    Memory        -> memory
+/// Ideally, we would pick the most specific constraint possible: if we have
+/// something that fits into a register, we would pick it.  The problem here
+/// is that if we have something that could either be in a register or in
+/// memory, then use of the register could cause selection of *other*
+/// operands to fail: they might only succeed if we pick memory.  Because of
+/// this the heuristic we use is:
+///
+///  1) If there is an 'other' constraint, and if the operand is valid for
+///     that constraint, use it.  This makes us take advantage of 'i'
+///     constraints when available.
+///  2) Otherwise, pick the most general constraint present.  This prefers
+///     'm' over 'r', for example.
+///
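+/// For example (illustrative): given the multi-letter code "imr", an
+/// immediate operand such as 4 satisfies 'i' and is chosen by rule 1, while a
+/// variable operand fails 'i', so rule 2 picks 'm' (most general) over 'r'.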
+static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo,
+                             const TargetLowering &TLI,
+                             SDValue Op, SelectionDAG *DAG) {
+  assert(OpInfo.Codes.size() > 1 && "Doesn't have multiple constraint options");
+  unsigned BestIdx = 0;
+  TargetLowering::ConstraintType BestType = TargetLowering::C_Unknown;
+  int BestGenerality = -1;
+
+  // Loop over the options, keeping track of the most general one.
+  for (unsigned i = 0, e = OpInfo.Codes.size(); i != e; ++i) {
+    TargetLowering::ConstraintType CType =
+      TLI.getConstraintType(OpInfo.Codes[i]);
+
+    // If this is an 'other' constraint, see if the operand is valid for it.
+    // For example, on X86 we might have an 'rI' constraint.  If the operand
+    // is an integer in the range [0..31] we want to use I (saving a load
+    // of a register), otherwise we must use 'r'.
+    if (CType == TargetLowering::C_Other && Op.getNode()) {
+      assert(OpInfo.Codes[i].size() == 1 &&
+             "Unhandled multi-letter 'other' constraint");
+      std::vector<SDValue> ResultOps;
+      TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i][0],
+                                       ResultOps, *DAG);
+      if (!ResultOps.empty()) {
+        BestType = CType;
+        BestIdx = i;
+        break;
+      }
+    }
+    
+    // Things with matching constraints can only be registers, per gcc
+    // documentation.  This mainly affects "g" constraints.
+    if (CType == TargetLowering::C_Memory && OpInfo.hasMatchingInput())
+      continue;
+    
+    // This constraint letter is more general than the previous one, use it.
+    int Generality = getConstraintGenerality(CType);
+    if (Generality > BestGenerality) {
+      BestType = CType;
+      BestIdx = i;
+      BestGenerality = Generality;
+    }
+  }
+  
+  OpInfo.ConstraintCode = OpInfo.Codes[BestIdx];
+  OpInfo.ConstraintType = BestType;
+}
+
+/// ComputeConstraintToUse - Determines the constraint code and constraint
+/// type to use for the specific AsmOperandInfo, setting
+/// OpInfo.ConstraintCode and OpInfo.ConstraintType.
+void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo,
+                                            SDValue Op, 
+                                            SelectionDAG *DAG) const {
+  assert(!OpInfo.Codes.empty() && "Must have at least one constraint");
+  
+  // Single-letter constraints ('r') are very common.
+  if (OpInfo.Codes.size() == 1) {
+    OpInfo.ConstraintCode = OpInfo.Codes[0];
+    OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
+  } else {
+    ChooseConstraint(OpInfo, *this, Op, DAG);
+  }
+  
+  // 'X' matches anything.
+  if (OpInfo.ConstraintCode == "X" && OpInfo.CallOperandVal) {
+    // Labels and constants are handled elsewhere ('X' is the only thing
+    // that matches labels).  For Functions, the type here is the type of
+    // the result, which is not what we want to look at; leave them alone.
+    Value *v = OpInfo.CallOperandVal;
+    if (isa<BasicBlock>(v) || isa<ConstantInt>(v) || isa<Function>(v)) {
+      OpInfo.CallOperandVal = v;
+      return;
+    }
+    
+    // Otherwise, try to resolve it to something we know about by looking at
+    // the actual operand type.
+    if (const char *Repl = LowerXConstraint(OpInfo.ConstraintVT)) {
+      OpInfo.ConstraintCode = Repl;
+      OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Loop Strength Reduction hooks
+//===----------------------------------------------------------------------===//
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
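+///
+/// For example, under the conservative default below, "r1 + r2", "r1 + 1024"
+/// and "2*r1" are all accepted, while "r1 + r2 + 8", "2*r1 + 4" and any
+/// global-based address are rejected.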
+bool TargetLowering::isLegalAddressingMode(const AddrMode &AM, 
+                                           const Type *Ty) const {
+  // The default implementation of this hook supports a conservative RISC-style
+  // r+r and r+i addressing mode.
+
+  // Allows a sign-extended 16-bit immediate field.
+  if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
+    return false;
+  
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+  
+  // Only support r+i, r+r, and 2*r addressing forms:
+  switch (AM.Scale) {
+  case 0:  // "r+i" or just "i", depending on HasBaseReg.
+    break;
+  case 1:
+    if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
+      return false;
+    // Otherwise we have r+r or r+i.
+    break;
+  case 2:
+    if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r  or  2*r+i is not allowed.
+      return false;
+    // Allow 2*r as r+r.
+    break;
+  default:  // Don't allow n*r for any other scale.
+    return false;
+  }
+  
+  return true;
+}
+
+/// BuildSDIV - Given an ISD::SDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.  See:
+/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
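+///
+/// Illustrative i32 example (magic values taken from Hacker's Delight, stated
+/// here as an assumption): for N / 7, magic(7) = { m = 0x92492493, s = 2 },
+/// so the emitted sequence is:
+///   q = MULHS(N, m);  q = q + N;          // d > 0 and m < 0, so add N
+///   q = q >>s 2;      res = q + (q >>u 31);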
+SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG, 
+                                  std::vector<SDNode*>* Created) const {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl= N->getDebugLoc();
+  
+  // Check to see if we can do this.
+  // FIXME: We should be more aggressive here.
+  if (!isTypeLegal(VT))
+    return SDValue();
+  
+  APInt d = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
+  APInt::ms magics = d.magic();
+  
+  // Multiply the numerator (operand 0) by the magic value
+  // FIXME: We should support doing a MUL in a wider type
+  SDValue Q;
+  if (isOperationLegalOrCustom(ISD::MULHS, VT))
+    Q = DAG.getNode(ISD::MULHS, dl, VT, N->getOperand(0),
+                    DAG.getConstant(magics.m, VT));
+  else if (isOperationLegalOrCustom(ISD::SMUL_LOHI, VT))
+    Q = SDValue(DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT),
+                              N->getOperand(0),
+                              DAG.getConstant(magics.m, VT)).getNode(), 1);
+  else
+    return SDValue();       // No mulhs or equivalent
+  // If d > 0 and m < 0, add the numerator
+  if (d.isStrictlyPositive() && magics.m.isNegative()) { 
+    Q = DAG.getNode(ISD::ADD, dl, VT, Q, N->getOperand(0));
+    if (Created)
+      Created->push_back(Q.getNode());
+  }
+  // If d < 0 and m > 0, subtract the numerator.
+  if (d.isNegative() && magics.m.isStrictlyPositive()) {
+    Q = DAG.getNode(ISD::SUB, dl, VT, Q, N->getOperand(0));
+    if (Created)
+      Created->push_back(Q.getNode());
+  }
+  // Shift right algebraic if shift value is nonzero
+  if (magics.s > 0) {
+    Q = DAG.getNode(ISD::SRA, dl, VT, Q, 
+                    DAG.getConstant(magics.s, getShiftAmountTy()));
+    if (Created)
+      Created->push_back(Q.getNode());
+  }
+  // Extract the sign bit and add it to the quotient
+  SDValue T =
+    DAG.getNode(ISD::SRL, dl, VT, Q, DAG.getConstant(VT.getSizeInBits()-1,
+                                                 getShiftAmountTy()));
+  if (Created)
+    Created->push_back(T.getNode());
+  return DAG.getNode(ISD::ADD, dl, VT, Q, T);
+}
+
+/// BuildUDIV - Given an ISD::UDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.  See:
+/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
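+///
+/// Illustrative i32 example (magic values taken from Hacker's Delight, stated
+/// here as an assumption): for N /u 7, magicu(7) = { m = 0x24924925, a = 1,
+/// s = 3 }, so the emitted sequence is:
+///   q = MULHU(N, m);  t = (N - q) >>u 1;
+///   res = (t + q) >>u (s - 1);            // i.e. >>u 2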
+SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
+                                  std::vector<SDNode*>* Created) const {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  // Check to see if we can do this.
+  // FIXME: We should be more aggressive here.
+  if (!isTypeLegal(VT))
+    return SDValue();
+
+  // FIXME: We should use a narrower constant when the upper
+  // bits are known to be zero.
+  ConstantSDNode *N1C = cast<ConstantSDNode>(N->getOperand(1));
+  APInt::mu magics = N1C->getAPIntValue().magicu();
+
+  // Multiply the numerator (operand 0) by the magic value
+  // FIXME: We should support doing a MUL in a wider type
+  SDValue Q;
+  if (isOperationLegalOrCustom(ISD::MULHU, VT))
+    Q = DAG.getNode(ISD::MULHU, dl, VT, N->getOperand(0),
+                    DAG.getConstant(magics.m, VT));
+  else if (isOperationLegalOrCustom(ISD::UMUL_LOHI, VT))
+    Q = SDValue(DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT),
+                              N->getOperand(0),
+                              DAG.getConstant(magics.m, VT)).getNode(), 1);
+  else
+    return SDValue();       // No mulhu or equivalent
+  if (Created)
+    Created->push_back(Q.getNode());
+
+  if (magics.a == 0) {
+    assert(magics.s < N1C->getAPIntValue().getBitWidth() &&
+           "We shouldn't generate an undefined shift!");
+    return DAG.getNode(ISD::SRL, dl, VT, Q, 
+                       DAG.getConstant(magics.s, getShiftAmountTy()));
+  } else {
+    SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Q);
+    if (Created)
+      Created->push_back(NPQ.getNode());
+    NPQ = DAG.getNode(ISD::SRL, dl, VT, NPQ, 
+                      DAG.getConstant(1, getShiftAmountTy()));
+    if (Created)
+      Created->push_back(NPQ.getNode());
+    NPQ = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
+    if (Created)
+      Created->push_back(NPQ.getNode());
+    return DAG.getNode(ISD::SRL, dl, VT, NPQ, 
+                       DAG.getConstant(magics.s-1, getShiftAmountTy()));
+  }
+}
diff --git a/src/LLVM/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp b/src/LLVM/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
new file mode 100644
index 0000000..a081e3c
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
@@ -0,0 +1,23 @@
+//===-- TargetSelectionDAGInfo.cpp - SelectionDAG Info --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the TargetSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+TargetSelectionDAGInfo::TargetSelectionDAGInfo(const TargetMachine &TM)
+  : TD(TM.getTargetData()) {
+}
+
+TargetSelectionDAGInfo::~TargetSelectionDAGInfo() {
+}
diff --git a/src/LLVM/lib/CodeGen/ShrinkWrapping.cpp b/src/LLVM/lib/CodeGen/ShrinkWrapping.cpp
new file mode 100644
index 0000000..aeaa38b
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/ShrinkWrapping.cpp
@@ -0,0 +1,1152 @@
+//===-- ShrinkWrapping.cpp - Reduce spills/restores of callee-saved regs --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a shrink wrapping variant of prolog/epilog insertion:
+// - Spills and restores of callee-saved registers (CSRs) are placed in the
+//   machine CFG to tightly surround their uses so that execution paths that
+//   do not use CSRs do not pay the spill/restore penalty.
+//
+// - Avoiding placement of spills/restores in loops: if a CSR is used inside a
+//   loop the spills are placed in the loop preheader, and restores are
+//   placed in the loop exit nodes (the successors of loop _exiting_ nodes).
+//
+// - Covering paths without CSR uses:
+//   If a region in a CFG uses CSRs and has multiple entry and/or exit points,
+//   the use info for the CSRs inside the region is propagated outward in the
+//   CFG to ensure validity of the spill/restore placements. This decreases
+//   the effectiveness of shrink wrapping but does not require edge splitting
+//   in the machine CFG.
+//
+// This shrink wrapping implementation uses an iterative analysis to determine
+// which basic blocks require spills and restores for CSRs.
+//
+// This pass uses MachineDominators and MachineLoopInfo. Loop information
+// is used to prevent placement of callee-saved register spills/restores
+// in the bodies of loops.
+//
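+// Illustrative example (not part of the original notes): given the CFG
+//
+//              entry
+//             /     \
+//       fast path   slow path (uses CSR %csr)
+//             \     /
+//              exit
+//
+// the spill of %csr is placed at the top of the slow-path block and its
+// restore at the bottom of that block, so the fast path pays no
+// spill/restore penalty.
+//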
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "shrink-wrap"
+
+#include "PrologEpilogInserter.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include <sstream>
+
+using namespace llvm;
+
+STATISTIC(numSRReduced, "Number of CSR spills+restores reduced.");
+
+// Shrink Wrapping:
+static cl::opt<bool>
+ShrinkWrapping("shrink-wrap",
+               cl::desc("Shrink wrap callee-saved register spills/restores"));
+
+// Shrink wrap only the specified function, a debugging aid.
+static cl::opt<std::string>
+ShrinkWrapFunc("shrink-wrap-func", cl::Hidden,
+               cl::desc("Shrink wrap the specified function"),
+               cl::value_desc("funcname"),
+               cl::init(""));
+
+// Debugging level for shrink wrapping.
+enum ShrinkWrapDebugLevel {
+  None, BasicInfo, Iterations, Details
+};
+
+static cl::opt<enum ShrinkWrapDebugLevel>
+ShrinkWrapDebugging("shrink-wrap-dbg", cl::Hidden,
+  cl::desc("Print shrink wrapping debugging information"),
+  cl::values(
+    clEnumVal(None      , "disable debug output"),
+    clEnumVal(BasicInfo , "print basic DF sets"),
+    clEnumVal(Iterations, "print SR sets for each iteration"),
+    clEnumVal(Details   , "print all DF sets"),
+    clEnumValEnd));
+
+
+void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  if (ShrinkWrapping || ShrinkWrapFunc != "") {
+    AU.addRequired<MachineLoopInfo>();
+    AU.addRequired<MachineDominatorTree>();
+  }
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addPreserved<MachineDominatorTree>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+//===----------------------------------------------------------------------===//
+//  ShrinkWrapping implementation
+//===----------------------------------------------------------------------===//
+
+// Conveniences for dealing with machine loops.
+MachineBasicBlock* PEI::getTopLevelLoopPreheader(MachineLoop* LP) {
+  assert(LP && "Machine loop is NULL.");
+  MachineBasicBlock* PHDR = LP->getLoopPreheader();
+  MachineLoop* PLP = LP->getParentLoop();
+  while (PLP) {
+    PHDR = PLP->getLoopPreheader();
+    PLP = PLP->getParentLoop();
+  }
+  return PHDR;
+}
+
+MachineLoop* PEI::getTopLevelLoopParent(MachineLoop *LP) {
+  if (LP == 0)
+    return 0;
+  MachineLoop* PLP = LP->getParentLoop();
+  while (PLP) {
+    LP = PLP;
+    PLP = PLP->getParentLoop();
+  }
+  return LP;
+}
+
+bool PEI::isReturnBlock(MachineBasicBlock* MBB) {
+  return (MBB && !MBB->empty() && MBB->back().getDesc().isReturn());
+}
+
+// Initialize shrink wrapping DFA sets, called before iterations.
+void PEI::clearAnticAvailSets() {
+  AnticIn.clear();
+  AnticOut.clear();
+  AvailIn.clear();
+  AvailOut.clear();
+}
+
+// Clear all sets constructed by shrink wrapping.
+void PEI::clearAllSets() {
+  ReturnBlocks.clear();
+  clearAnticAvailSets();
+  UsedCSRegs.clear();
+  CSRUsed.clear();
+  TLLoops.clear();
+  CSRSave.clear();
+  CSRRestore.clear();
+}
+
+// Initialize all shrink wrapping data.
+void PEI::initShrinkWrappingInfo() {
+  clearAllSets();
+  EntryBlock = 0;
+#ifndef NDEBUG
+  HasFastExitPath = false;
+#endif
+  ShrinkWrapThisFunction = ShrinkWrapping;
+  // DEBUG: enable or disable shrink wrapping for the current function
+  // via --shrink-wrap-func=<funcname>.
+#ifndef NDEBUG
+  if (ShrinkWrapFunc != "") {
+    std::string MFName = MF->getFunction()->getNameStr();
+    ShrinkWrapThisFunction = (MFName == ShrinkWrapFunc);
+  }
+#endif
+}
+
+
+/// placeCSRSpillsAndRestores - determine which MBBs of the function
+/// need save, restore code for callee-saved registers by doing a DF analysis
+/// similar to the one used in code motion (GVNPRE). This produces maps of MBBs
+/// to sets of registers (CSRs) for saves and restores. MachineLoopInfo
+/// is used to ensure that CSR save/restore code is not placed inside loops.
+/// This function computes the maps of MBBs -> CSRs to spill and restore
+/// in CSRSave, CSRRestore.
+///
+/// If shrink wrapping is not being performed, place all spills in
+/// the entry block, all restores in return blocks. In this case,
+/// CSRSave has a single mapping, CSRRestore has mappings for each
+/// return block.
+///
+void PEI::placeCSRSpillsAndRestores(MachineFunction &Fn) {
+
+  DEBUG(MF = &Fn);
+
+  initShrinkWrappingInfo();
+
+  DEBUG(if (ShrinkWrapThisFunction) {
+      dbgs() << "Place CSR spills/restores for "
+             << MF->getFunction()->getName() << "\n";
+    });
+
+  if (calculateSets(Fn))
+    placeSpillsAndRestores(Fn);
+}
+
+/// calcAnticInOut - calculate the anticipated in/out reg sets
+/// for the given MBB by looking forward in the MCFG at MBB's
+/// successors.
+///
+bool PEI::calcAnticInOut(MachineBasicBlock* MBB) {
+  bool changed = false;
+
+  // AnticOut[MBB] = INTERSECT(AnticIn[S] for S in SUCCESSORS(MBB))
+  SmallVector<MachineBasicBlock*, 4> successors;
+  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+    MachineBasicBlock* SUCC = *SI;
+    if (SUCC != MBB)
+      successors.push_back(SUCC);
+  }
+
+  unsigned i = 0, e = successors.size();
+  if (i != e) {
+    CSRegSet prevAnticOut = AnticOut[MBB];
+    MachineBasicBlock* SUCC = successors[i];
+
+    AnticOut[MBB] = AnticIn[SUCC];
+    for (++i; i != e; ++i) {
+      SUCC = successors[i];
+      AnticOut[MBB] &= AnticIn[SUCC];
+    }
+    if (prevAnticOut != AnticOut[MBB])
+      changed = true;
+  }
+
+  // AnticIn[MBB] = UNION(CSRUsed[MBB], AnticOut[MBB]);
+  CSRegSet prevAnticIn = AnticIn[MBB];
+  AnticIn[MBB] = CSRUsed[MBB] | AnticOut[MBB];
+  if (prevAnticIn |= AnticIn[MBB])
+    changed = true;
+  return changed;
+}
+
+/// calcAvailInOut - calculate the available in/out reg sets
+/// for the given MBB by looking backward in the MCFG at MBB's
+/// predecessors.
+///
+bool PEI::calcAvailInOut(MachineBasicBlock* MBB) {
+  bool changed = false;
+
+  // AvailIn[MBB] = INTERSECT(AvailOut[P] for P in PREDECESSORS(MBB))
+  SmallVector<MachineBasicBlock*, 4> predecessors;
+  for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+         PE = MBB->pred_end(); PI != PE; ++PI) {
+    MachineBasicBlock* PRED = *PI;
+    if (PRED != MBB)
+      predecessors.push_back(PRED);
+  }
+
+  unsigned i = 0, e = predecessors.size();
+  if (i != e) {
+    CSRegSet prevAvailIn = AvailIn[MBB];
+    MachineBasicBlock* PRED = predecessors[i];
+
+    AvailIn[MBB] = AvailOut[PRED];
+    for (++i; i != e; ++i) {
+      PRED = predecessors[i];
+      AvailIn[MBB] &= AvailOut[PRED];
+    }
+    if (prevAvailIn != AvailIn[MBB])
+      changed = true;
+  }
+
+  // AvailOut[MBB] = UNION(CSRUsed[MBB], AvailIn[MBB]);
+  CSRegSet prevAvailOut = AvailOut[MBB];
+  AvailOut[MBB] = CSRUsed[MBB] | AvailIn[MBB];
+  if (prevAvailOut |= AvailOut[MBB])
+    changed = true;
+  return changed;
+}
+
+/// calculateAnticAvail - build the sets anticipated and available
+/// registers in the MCFG of the current function iteratively,
+/// doing a combined forward and backward analysis.
+///
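+/// Anticipation is the backward problem (which CSRs are used on every path
+/// *from* a block) and availability the forward problem (which CSRs have been
+/// used on every path *to* a block). Roughly, spills belong where a CSR
+/// becomes anticipated but is not yet available, and restores where it stops
+/// being anticipated; see placeSpillsAndRestores for the actual placement.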
+void PEI::calculateAnticAvail(MachineFunction &Fn) {
+  // Initialize data flow sets.
+  clearAnticAvailSets();
+
+  // Calculate Antic{In,Out} and Avail{In,Out} iteratively on the MCFG.
+  bool changed = true;
+  unsigned iterations = 0;
+  while (changed) {
+    changed = false;
+    ++iterations;
+    for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+         MBBI != MBBE; ++MBBI) {
+      MachineBasicBlock* MBB = MBBI;
+
+      // Calculate anticipated in, out regs at MBB from
+      // anticipated at successors of MBB.
+      changed |= calcAnticInOut(MBB);
+
+      // Calculate available in, out regs at MBB from
+      // available at predecessors of MBB.
+      changed |= calcAvailInOut(MBB);
+    }
+  }
+
+  DEBUG({
+      if (ShrinkWrapDebugging >= Details) {
+        dbgs()
+          << "-----------------------------------------------------------\n"
+          << " Antic/Avail Sets:\n"
+          << "-----------------------------------------------------------\n"
+          << "iterations = " << iterations << "\n"
+          << "-----------------------------------------------------------\n"
+          << "MBB | USED | ANTIC_IN | ANTIC_OUT | AVAIL_IN | AVAIL_OUT\n"
+          << "-----------------------------------------------------------\n";
+
+        for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+             MBBI != MBBE; ++MBBI) {
+          MachineBasicBlock* MBB = MBBI;
+          dumpSets(MBB);
+        }
+
+        dbgs()
+          << "-----------------------------------------------------------\n";
+      }
+    });
+}
+
+/// propagateUsesAroundLoop - copy used register info from MBB to all blocks
+/// of the loop given by LP and its parent loops. This prevents spills/restores
+/// from being placed in the bodies of loops.
+///
+void PEI::propagateUsesAroundLoop(MachineBasicBlock* MBB, MachineLoop* LP) {
+  if (! MBB || !LP)
+    return;
+
+  std::vector<MachineBasicBlock*> loopBlocks = LP->getBlocks();
+  for (unsigned i = 0, e = loopBlocks.size(); i != e; ++i) {
+    MachineBasicBlock* LBB = loopBlocks[i];
+    if (LBB == MBB)
+      continue;
+    if (CSRUsed[LBB].contains(CSRUsed[MBB]))
+      continue;
+    CSRUsed[LBB] |= CSRUsed[MBB];
+  }
+}
+
+/// calculateSets - collect the CSRs used in this function, compute
+/// the DF sets that describe the initial minimal regions in the
+/// Machine CFG around which CSR spills and restores must be placed.
+///
+/// Additionally, this function decides if shrink wrapping should
+/// be disabled for the current function, checking the following:
+///  1. the current function has more than 500 MBBs: heuristic limit
+///     on function size to reduce compile time impact of the current
+///     iterative algorithm.
+///  2. all CSRs are used in the entry block.
+///  3. all CSRs are used in all immediate successors of the entry block.
+///  4. all CSRs are used in a subset of blocks, each of which dominates
+///     all return blocks. These blocks, taken as a subgraph of the MCFG,
+///     are equivalent to the entry block since all execution paths pass
+///     through them.
+///
+bool PEI::calculateSets(MachineFunction &Fn) {
+  // Sets used to compute spill, restore placement sets.
+  const std::vector<CalleeSavedInfo> CSI =
+    Fn.getFrameInfo()->getCalleeSavedInfo();
+
+  // If no CSRs used, we are done.
+  if (CSI.empty()) {
+    DEBUG(if (ShrinkWrapThisFunction)
+            dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+                   << ": uses no callee-saved registers\n");
+    return false;
+  }
+
+  // Save refs to entry and return blocks.
+  EntryBlock = Fn.begin();
+  for (MachineFunction::iterator MBB = Fn.begin(), E = Fn.end();
+       MBB != E; ++MBB)
+    if (isReturnBlock(MBB))
+      ReturnBlocks.push_back(MBB);
+
+  // Determine if this function has fast exit paths.
+  DEBUG(if (ShrinkWrapThisFunction)
+          findFastExitPath());
+
+  // Limit shrink wrapping via the current iterative bit vector
+  // implementation to functions with <= 500 MBBs.
+  if (Fn.size() > 500) {
+    DEBUG(if (ShrinkWrapThisFunction)
+            dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+                   << ": too large (" << Fn.size() << " MBBs)\n");
+    ShrinkWrapThisFunction = false;
+  }
+
+  // Return now if not shrink wrapping.
+  if (! ShrinkWrapThisFunction)
+    return false;
+
+  // Collect set of used CSRs.
+  for (unsigned inx = 0, e = CSI.size(); inx != e; ++inx) {
+    UsedCSRegs.set(inx);
+  }
+
+  // Walk instructions in all MBBs, create CSRUsed[] sets, choose
+  // whether or not to shrink wrap this function.
+  MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
+  MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
+  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
+
+  bool allCSRUsesInEntryBlock = true;
+  for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+       MBBI != MBBE; ++MBBI) {
+    MachineBasicBlock* MBB = MBBI;
+    for (MachineBasicBlock::iterator I = MBB->begin(); I != MBB->end(); ++I) {
+      for (unsigned inx = 0, e = CSI.size(); inx != e; ++inx) {
+        unsigned Reg = CSI[inx].getReg();
+        // If instruction I reads or modifies Reg, add it to UsedCSRegs,
+        // CSRUsed map for the current block.
+        for (unsigned opInx = 0, opEnd = I->getNumOperands();
+             opInx != opEnd; ++opInx) {
+          const MachineOperand &MO = I->getOperand(opInx);
+          if (! (MO.isReg() && (MO.isUse() || MO.isDef())))
+            continue;
+          unsigned MOReg = MO.getReg();
+          if (!MOReg)
+            continue;
+          if (MOReg == Reg ||
+              (TargetRegisterInfo::isPhysicalRegister(MOReg) &&
+               TargetRegisterInfo::isPhysicalRegister(Reg) &&
+               TRI->isSubRegister(Reg, MOReg))) {
+            // CSR Reg is defined/used in block MBB.
+            CSRUsed[MBB].set(inx);
+            // Check for uses in EntryBlock.
+            if (MBB != EntryBlock)
+              allCSRUsesInEntryBlock = false;
+          }
+        }
+      }
+    }
+
+    if (CSRUsed[MBB].empty())
+      continue;
+
+    // Propagate CSRUsed[MBB] in loops
+    if (MachineLoop* LP = LI.getLoopFor(MBB)) {
+      // Add top level loop to work list.
+      MachineBasicBlock* HDR = getTopLevelLoopPreheader(LP);
+      MachineLoop* PLP = getTopLevelLoopParent(LP);
+
+      if (! HDR) {
+        HDR = PLP->getHeader();
+        assert(HDR->pred_size() > 0 && "Loop header has no predecessors?");
+        MachineBasicBlock::pred_iterator PI = HDR->pred_begin();
+        HDR = *PI;
+      }
+      TLLoops[HDR] = PLP;
+
+      // Push uses from inside loop to its parent loops,
+      // or to all other MBBs in its loop.
+      if (LP->getLoopDepth() > 1) {
+        for (MachineLoop* PLP = LP->getParentLoop(); PLP;
+             PLP = PLP->getParentLoop()) {
+          propagateUsesAroundLoop(MBB, PLP);
+        }
+      } else {
+        propagateUsesAroundLoop(MBB, LP);
+      }
+    }
+  }
+
+  if (allCSRUsesInEntryBlock) {
+    DEBUG(dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+                 << ": all CSRs used in EntryBlock\n");
+    ShrinkWrapThisFunction = false;
+  } else {
+    bool allCSRsUsedInEntryFanout = true;
+    for (MachineBasicBlock::succ_iterator SI = EntryBlock->succ_begin(),
+           SE = EntryBlock->succ_end(); SI != SE; ++SI) {
+      MachineBasicBlock* SUCC = *SI;
+      if (CSRUsed[SUCC] != UsedCSRegs)
+        allCSRsUsedInEntryFanout = false;
+    }
+    if (allCSRsUsedInEntryFanout) {
+      DEBUG(dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+                   << ": all CSRs used in imm successors of EntryBlock\n");
+      ShrinkWrapThisFunction = false;
+    }
+  }
+
+  if (ShrinkWrapThisFunction) {
+    // Check if MBB uses CSRs and dominates all exit nodes.
+    // Such nodes are equiv. to the entry node w.r.t.
+    // CSR uses: every path through the function must
+    // pass through this node. If each CSR is used at least
+    // once by these nodes, shrink wrapping is disabled.
+    CSRegSet CSRUsedInChokePoints;
+    for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+         MBBI != MBBE; ++MBBI) {
+      MachineBasicBlock* MBB = MBBI;
+      if (MBB == EntryBlock || CSRUsed[MBB].empty() || MBB->succ_size() < 1)
+        continue;
+      bool dominatesExitNodes = true;
+      for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri)
+        if (! DT.dominates(MBB, ReturnBlocks[ri])) {
+          dominatesExitNodes = false;
+          break;
+        }
+      if (dominatesExitNodes) {
+        CSRUsedInChokePoints |= CSRUsed[MBB];
+        if (CSRUsedInChokePoints == UsedCSRegs) {
+          DEBUG(dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+                       << ": all CSRs used in choke point(s) at "
+                       << getBasicBlockName(MBB) << "\n");
+          ShrinkWrapThisFunction = false;
+          break;
+        }
+      }
+    }
+  }
+
+  // Return now if we have decided not to apply shrink wrapping
+  // to the current function.
+  if (! ShrinkWrapThisFunction)
+    return false;
+
+  DEBUG({
+      dbgs() << "ENABLED: " << Fn.getFunction()->getName();
+      if (HasFastExitPath)
+        dbgs() << " (fast exit path)";
+      dbgs() << "\n";
+      if (ShrinkWrapDebugging >= BasicInfo) {
+        dbgs() << "------------------------------"
+             << "-----------------------------\n";
+        dbgs() << "UsedCSRegs = " << stringifyCSRegSet(UsedCSRegs) << "\n";
+        if (ShrinkWrapDebugging >= Details) {
+          dbgs() << "------------------------------"
+               << "-----------------------------\n";
+          dumpAllUsed();
+        }
+      }
+    });
+
+  // Build initial DF sets to determine minimal regions in the
+  // Machine CFG around which CSRs must be spilled and restored.
+  calculateAnticAvail(Fn);
+
+  return true;
+}
+
+/// addUsesForMEMERegion - add uses of CSRs spilled or restored in
+/// multi-entry, multi-exit (MEME) regions so spill and restore
+/// placement will not break code that enters or leaves a
+/// shrink-wrapped region by inducing spills with no matching
+/// restores or restores with no matching spills. A MEME region
+/// is a subgraph of the MCFG with multiple entry edges, multiple
+/// exit edges, or both. This code propagates use information
+/// through the MCFG until all paths requiring spills and restores
+/// _outside_ the computed minimal placement regions have been covered.
+///
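+/// The set propagated from a block mirrors the code below: CSRSave[MBB]
+/// if non-empty, else CSRRestore[MBB] if non-empty, else CSRUsed[MBB];
+/// the chosen set is OR'd into CSRUsed[] of MBB's predecessors and
+/// successors that do not already contain it.
+///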
+bool PEI::addUsesForMEMERegion(MachineBasicBlock* MBB,
+                               SmallVector<MachineBasicBlock*, 4>& blks) {
+  if (MBB->succ_size() < 2 && MBB->pred_size() < 2) {
+    bool processThisBlock = false;
+    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+           SE = MBB->succ_end(); SI != SE; ++SI) {
+      MachineBasicBlock* SUCC = *SI;
+      if (SUCC->pred_size() > 1) {
+        processThisBlock = true;
+        break;
+      }
+    }
+    if (!CSRRestore[MBB].empty() && MBB->succ_size() > 0) {
+      for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+             PE = MBB->pred_end(); PI != PE; ++PI) {
+        MachineBasicBlock* PRED = *PI;
+        if (PRED->succ_size() > 1) {
+          processThisBlock = true;
+          break;
+        }
+      }
+    }
+    if (! processThisBlock)
+      return false;
+  }
+
+  CSRegSet prop;
+  if (!CSRSave[MBB].empty())
+    prop = CSRSave[MBB];
+  else if (!CSRRestore[MBB].empty())
+    prop = CSRRestore[MBB];
+  else
+    prop = CSRUsed[MBB];
+  if (prop.empty())
+    return false;
+
+  // Propagate selected bits to successors, predecessors of MBB.
+  bool addedUses = false;
+  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+    MachineBasicBlock* SUCC = *SI;
+    // Self-loop
+    if (SUCC == MBB)
+      continue;
+    if (! CSRUsed[SUCC].contains(prop)) {
+      CSRUsed[SUCC] |= prop;
+      addedUses = true;
+      blks.push_back(SUCC);
+      DEBUG(if (ShrinkWrapDebugging >= Iterations)
+              dbgs() << getBasicBlockName(MBB)
+                   << "(" << stringifyCSRegSet(prop) << ")->"
+                   << "successor " << getBasicBlockName(SUCC) << "\n");
+    }
+  }
+  for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+         PE = MBB->pred_end(); PI != PE; ++PI) {
+    MachineBasicBlock* PRED = *PI;
+    // Self-loop
+    if (PRED == MBB)
+      continue;
+    if (! CSRUsed[PRED].contains(prop)) {
+      CSRUsed[PRED] |= prop;
+      addedUses = true;
+      blks.push_back(PRED);
+      DEBUG(if (ShrinkWrapDebugging >= Iterations)
+              dbgs() << getBasicBlockName(MBB)
+                   << "(" << stringifyCSRegSet(prop) << ")->"
+                   << "predecessor " << getBasicBlockName(PRED) << "\n");
+    }
+  }
+  return addedUses;
+}
+
+/// addUsesForTopLevelLoops - add uses for CSRs used inside top
+/// level loops to the exit blocks of those loops.
+///
+bool PEI::addUsesForTopLevelLoops(SmallVector<MachineBasicBlock*, 4>& blks) {
+  bool addedUses = false;
+
+  // Place restores for top level loops where needed.
+  for (DenseMap<MachineBasicBlock*, MachineLoop*>::iterator
+         I = TLLoops.begin(), E = TLLoops.end(); I != E; ++I) {
+    MachineBasicBlock* MBB = I->first;
+    MachineLoop* LP = I->second;
+    MachineBasicBlock* HDR = LP->getHeader();
+    SmallVector<MachineBasicBlock*, 4> exitBlocks;
+    CSRegSet loopSpills;
+
+    loopSpills = CSRSave[MBB];
+    if (CSRSave[MBB].empty()) {
+      loopSpills = CSRUsed[HDR];
+      assert(!loopSpills.empty() && "No CSRs used in loop?");
+    } else if (CSRRestore[MBB].contains(CSRSave[MBB]))
+      continue;
+
+    LP->getExitBlocks(exitBlocks);
+    assert(exitBlocks.size() > 0 && "Loop has no top level exit blocks?");
+    for (unsigned i = 0, e = exitBlocks.size(); i != e; ++i) {
+      MachineBasicBlock* EXB = exitBlocks[i];
+      if (! CSRUsed[EXB].contains(loopSpills)) {
+        CSRUsed[EXB] |= loopSpills;
+        addedUses = true;
+        DEBUG(if (ShrinkWrapDebugging >= Iterations)
+                dbgs() << "LOOP " << getBasicBlockName(MBB)
+                     << "(" << stringifyCSRegSet(loopSpills) << ")->"
+                     << getBasicBlockName(EXB) << "\n");
+        if (EXB->succ_size() > 1 || EXB->pred_size() > 1)
+          blks.push_back(EXB);
+      }
+    }
+  }
+  return addedUses;
+}
+
+/// calcSpillPlacements - determine which CSRs should be spilled
+/// in MBB using AnticIn sets of MBB's predecessors, keeping track
+/// of changes to spilled reg sets. Add MBB to the set of blocks
+/// that need to be processed for propagating use info to cover
+/// multi-entry/exit regions.
+///
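+/// A sketch of the placement computation performed below, in set
+/// notation (self-loop predecessors are ignored):
+///
+///   SAVE[MBB] |= (AnticIn[MBB] - AvailIn[MBB]) &
+///                intersect over P in preds(MBB) of (UsedCSRegs - AnticIn[P])
+///
+/// where the intersection defaults to UsedCSRegs when MBB has no
+/// predecessors (the entry block case).
+///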
+bool PEI::calcSpillPlacements(MachineBasicBlock* MBB,
+                              SmallVector<MachineBasicBlock*, 4> &blks,
+                              CSRegBlockMap &prevSpills) {
+  bool placedSpills = false;
+  // Intersect (CSRegs - AnticIn[P]) for P in Predecessors(MBB)
+  CSRegSet anticInPreds;
+  SmallVector<MachineBasicBlock*, 4> predecessors;
+  for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+         PE = MBB->pred_end(); PI != PE; ++PI) {
+    MachineBasicBlock* PRED = *PI;
+    if (PRED != MBB)
+      predecessors.push_back(PRED);
+  }
+  unsigned i = 0, e = predecessors.size();
+  if (i != e) {
+    MachineBasicBlock* PRED = predecessors[i];
+    anticInPreds = UsedCSRegs - AnticIn[PRED];
+    for (++i; i != e; ++i) {
+      PRED = predecessors[i];
+      anticInPreds &= (UsedCSRegs - AnticIn[PRED]);
+    }
+  } else {
+    // Handle uses in entry blocks (which have no predecessors).
+    // This is necessary because the DFA formulation assumes the
+    // entry and (multiple) exit nodes cannot have CSR uses, which
+    // is not the case in the real world.
+    anticInPreds = UsedCSRegs;
+  }
+  // Compute spills required at MBB:
+  CSRSave[MBB] |= (AnticIn[MBB] - AvailIn[MBB]) & anticInPreds;
+
+  if (! CSRSave[MBB].empty()) {
+    if (MBB == EntryBlock) {
+      for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri)
+        CSRRestore[ReturnBlocks[ri]] |= CSRSave[MBB];
+    } else {
+      // Reset all regs spilled in MBB that are also spilled in EntryBlock.
+      if (CSRSave[EntryBlock].intersects(CSRSave[MBB])) {
+        CSRSave[MBB] = CSRSave[MBB] - CSRSave[EntryBlock];
+      }
+    }
+  }
+  placedSpills = (CSRSave[MBB] != prevSpills[MBB]);
+  prevSpills[MBB] = CSRSave[MBB];
+  // Remember this block for adding restores to successor
+  // blocks for multi-entry region.
+  if (placedSpills)
+    blks.push_back(MBB);
+
+  DEBUG(if (! CSRSave[MBB].empty() && ShrinkWrapDebugging >= Iterations)
+          dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
+               << stringifyCSRegSet(CSRSave[MBB]) << "\n");
+
+  return placedSpills;
+}
+
+/// calcRestorePlacements - determine which CSRs should be restored
+/// in MBB using AvailOut sets of MBB's successors, keeping track
+/// of changes to restored reg sets. Add MBB to the set of blocks
+/// that need to be processed for propagating use info to cover
+/// multi-entry/exit regions.
+///
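+/// A sketch of the dual computation performed below, in set notation
+/// (self-loop successors are ignored):
+///
+///   RESTORE[MBB] |= (AvailOut[MBB] - AnticOut[MBB]) &
+///                   intersect over S in succs(MBB) of (UsedCSRegs - AvailOut[S])
+///
+/// where the intersection defaults to UsedCSRegs for successor-less
+/// blocks that use CSRs or have CSRs available.
+///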
+bool PEI::calcRestorePlacements(MachineBasicBlock* MBB,
+                                SmallVector<MachineBasicBlock*, 4> &blks,
+                                CSRegBlockMap &prevRestores) {
+  bool placedRestores = false;
+  // Intersect (CSRegs - AvailOut[S]) for S in Successors(MBB)
+  CSRegSet availOutSucc;
+  SmallVector<MachineBasicBlock*, 4> successors;
+  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+    MachineBasicBlock* SUCC = *SI;
+    if (SUCC != MBB)
+      successors.push_back(SUCC);
+  }
+  unsigned i = 0, e = successors.size();
+  if (i != e) {
+    MachineBasicBlock* SUCC = successors[i];
+    availOutSucc = UsedCSRegs - AvailOut[SUCC];
+    for (++i; i != e; ++i) {
+      SUCC = successors[i];
+      availOutSucc &= (UsedCSRegs - AvailOut[SUCC]);
+    }
+  } else {
+    if (! CSRUsed[MBB].empty() || ! AvailOut[MBB].empty()) {
+      // Handle uses in return blocks (which have no successors).
+      // This is necessary because the DFA formulation assumes the
+      // entry and (multiple) exit nodes cannot have CSR uses, which
+      // is not the case in the real world.
+      availOutSucc = UsedCSRegs;
+    }
+  }
+  // Compute restores required at MBB:
+  CSRRestore[MBB] |= (AvailOut[MBB] - AnticOut[MBB]) & availOutSucc;
+
+  // Postprocess restore placements at MBB.
+  // Remove the CSRs that are restored in the return blocks.
+  // Lest this be confusing, note that:
+  // CSRSave[EntryBlock] == CSRRestore[B] for all B in ReturnBlocks.
+  if (MBB->succ_size() && ! CSRRestore[MBB].empty()) {
+    if (! CSRSave[EntryBlock].empty())
+      CSRRestore[MBB] = CSRRestore[MBB] - CSRSave[EntryBlock];
+  }
+  placedRestores = (CSRRestore[MBB] != prevRestores[MBB]);
+  prevRestores[MBB] = CSRRestore[MBB];
+  // Remember this block for adding saves to predecessor
+  // blocks for multi-entry region.
+  if (placedRestores)
+    blks.push_back(MBB);
+
+  DEBUG(if (! CSRRestore[MBB].empty() && ShrinkWrapDebugging >= Iterations)
+          dbgs() << "RESTORE[" << getBasicBlockName(MBB) << "] = "
+               << stringifyCSRegSet(CSRRestore[MBB]) << "\n");
+
+  return placedRestores;
+}
+
+/// placeSpillsAndRestores - place spills and restores of CSRs
+/// used in MBBs in minimal regions that contain the uses.
+///
+void PEI::placeSpillsAndRestores(MachineFunction &Fn) {
+  CSRegBlockMap prevCSRSave;
+  CSRegBlockMap prevCSRRestore;
+  SmallVector<MachineBasicBlock*, 4> cvBlocks, ncvBlocks;
+  bool changed = true;
+  unsigned iterations = 0;
+
+  // Iterate computation of spill and restore placements in the MCFG until:
+  //   1. CSR use info has been fully propagated around the MCFG, and
+  //   2. computation of CSRSave[], CSRRestore[] reach fixed points.
+  while (changed) {
+    changed = false;
+    ++iterations;
+
+    DEBUG(if (ShrinkWrapDebugging >= Iterations)
+            dbgs() << "iter " << iterations
+                 << " --------------------------------------------------\n");
+
+    // Calculate CSR{Save,Restore} sets using Antic, Avail on the MCFG,
+    // which determines the placements of spills and restores.
+    // Keep track of changes to spills, restores in each iteration to
+    // minimize the total iterations.
+    bool SRChanged = false;
+    for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+         MBBI != MBBE; ++MBBI) {
+      MachineBasicBlock* MBB = MBBI;
+
+      // Place spills for CSRs in MBB.
+      SRChanged |= calcSpillPlacements(MBB, cvBlocks, prevCSRSave);
+
+      // Place restores for CSRs in MBB.
+      SRChanged |= calcRestorePlacements(MBB, cvBlocks, prevCSRRestore);
+    }
+
+    // Add uses of CSRs used inside loops where needed.
+    changed |= addUsesForTopLevelLoops(cvBlocks);
+
+    // Add uses for CSRs spilled or restored at branch, join points.
+    if (changed || SRChanged) {
+      while (! cvBlocks.empty()) {
+        MachineBasicBlock* MBB = cvBlocks.pop_back_val();
+        changed |= addUsesForMEMERegion(MBB, ncvBlocks);
+      }
+      if (! ncvBlocks.empty()) {
+        cvBlocks = ncvBlocks;
+        ncvBlocks.clear();
+      }
+    }
+
+    if (changed) {
+      calculateAnticAvail(Fn);
+      CSRSave.clear();
+      CSRRestore.clear();
+    }
+  }
+
+  // Check for effectiveness:
+  //  SR0 = {r | r in CSRSave[EntryBlock], CSRRestore[RB], RB in ReturnBlocks}
+  //  numSRReduced = |(UsedCSRegs - SR0)|, approx. SR0 by CSRSave[EntryBlock]
+  // Gives a measure of how many CSR spills have been moved from EntryBlock
+  // to minimal regions enclosing their uses.
+  CSRegSet notSpilledInEntryBlock = (UsedCSRegs - CSRSave[EntryBlock]);
+  unsigned numSRReducedThisFunc = notSpilledInEntryBlock.count();
+  numSRReduced += numSRReducedThisFunc;
+  DEBUG(if (ShrinkWrapDebugging >= BasicInfo) {
+      dbgs() << "-----------------------------------------------------------\n";
+      dbgs() << "total iterations = " << iterations << " ( "
+           << Fn.getFunction()->getName()
+           << " " << numSRReducedThisFunc
+           << " " << Fn.size()
+           << " )\n";
+      dbgs() << "-----------------------------------------------------------\n";
+      dumpSRSets();
+      dbgs() << "-----------------------------------------------------------\n";
+      if (numSRReducedThisFunc)
+        verifySpillRestorePlacement();
+    });
+}
+
+// Debugging methods.
+#ifndef NDEBUG
+/// findFastExitPath - debugging method used to detect functions
+/// with at least one path from the entry block to a return block,
+/// either directly or via a very small number of edges.
+///
+void PEI::findFastExitPath() {
+  if (! EntryBlock)
+    return;
+  // Find a path from EntryBlock to any return block that does not branch:
+  //        Entry
+  //          |     ...
+  //          v      |
+  //         B1<-----+
+  //          |
+  //          v
+  //       Return
+  for (MachineBasicBlock::succ_iterator SI = EntryBlock->succ_begin(),
+         SE = EntryBlock->succ_end(); SI != SE; ++SI) {
+    MachineBasicBlock* SUCC = *SI;
+
+    // Assume positive, disprove existence of fast path.
+    HasFastExitPath = true;
+
+    // Check the immediate successors.
+    if (isReturnBlock(SUCC)) {
+      if (ShrinkWrapDebugging >= BasicInfo)
+        dbgs() << "Fast exit path: " << getBasicBlockName(EntryBlock)
+             << "->" << getBasicBlockName(SUCC) << "\n";
+      break;
+    }
+    // Traverse df from SUCC, look for a branch block.
+    std::string exitPath = getBasicBlockName(SUCC);
+    for (df_iterator<MachineBasicBlock*> BI = df_begin(SUCC),
+           BE = df_end(SUCC); BI != BE; ++BI) {
+      MachineBasicBlock* SBB = *BI;
+      // Reject paths with branch nodes.
+      if (SBB->succ_size() > 1) {
+        HasFastExitPath = false;
+        break;
+      }
+      exitPath += "->" + getBasicBlockName(SBB);
+    }
+    if (HasFastExitPath) {
+      if (ShrinkWrapDebugging >= BasicInfo)
+        dbgs() << "Fast exit path: " << getBasicBlockName(EntryBlock)
+             << "->" << exitPath << "\n";
+      break;
+    }
+  }
+}
+
+/// verifySpillRestorePlacement - check the current spill/restore
+/// sets for safety. Attempt to find spills without restores or
+/// restores without spills.
+/// Spills: walk df from each MBB in spill set ensuring that
+///         all CSRs spilled at MBB are restored on all paths
+///         from MBB to all exit blocks.
+/// Restores: walk idf from each MBB in restore set ensuring that
+///           all CSRs restored at MBB are spilled on all paths
+///           reaching MBB.
+///
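+/// Informally: for each block B with non-empty SAVE[B], the union of
+/// the RESTORE[] sets seen along any DF path from B to a return block
+/// must cover SAVE[B]; the symmetric condition is checked for
+/// RESTORE[] via inverse-DF walks.
+///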
+void PEI::verifySpillRestorePlacement() {
+  unsigned numReturnBlocks = 0;
+  for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+       MBBI != MBBE; ++MBBI) {
+    MachineBasicBlock* MBB = MBBI;
+    if (isReturnBlock(MBB) || MBB->succ_size() == 0)
+      ++numReturnBlocks;
+  }
+  for (CSRegBlockMap::iterator BI = CSRSave.begin(),
+         BE = CSRSave.end(); BI != BE; ++BI) {
+    MachineBasicBlock* MBB = BI->first;
+    CSRegSet spilled = BI->second;
+    CSRegSet restored;
+
+    if (spilled.empty())
+      continue;
+
+    DEBUG(dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
+                 << stringifyCSRegSet(spilled)
+                 << "  RESTORE[" << getBasicBlockName(MBB) << "] = "
+                 << stringifyCSRegSet(CSRRestore[MBB]) << "\n");
+
+    if (CSRRestore[MBB].intersects(spilled)) {
+      restored |= (CSRRestore[MBB] & spilled);
+    }
+
+    // Walk depth first from MBB to find restores of all CSRs spilled at MBB:
+    // we must find restores for all spills w/no intervening spills on all
+    // paths from MBB to all return blocks.
+    for (df_iterator<MachineBasicBlock*> BI = df_begin(MBB),
+           BE = df_end(MBB); BI != BE; ++BI) {
+      MachineBasicBlock* SBB = *BI;
+      if (SBB == MBB)
+        continue;
+      // Stop when we encounter spills of any CSRs spilled at MBB that
+      // have not yet been seen to be restored.
+      if (CSRSave[SBB].intersects(spilled) &&
+          !restored.contains(CSRSave[SBB] & spilled))
+        break;
+      // Collect the CSRs spilled at MBB that are restored
+      // at this DF successor of MBB.
+      if (CSRRestore[SBB].intersects(spilled))
+        restored |= (CSRRestore[SBB] & spilled);
+      // If we are at a return block, check that the restores
+      // we have seen so far exhaust the spills at MBB, then
+      // reset the restores.
+      if (isReturnBlock(SBB) || SBB->succ_size() == 0) {
+        if (restored != spilled) {
+          CSRegSet notRestored = (spilled - restored);
+          DEBUG(dbgs() << MF->getFunction()->getName() << ": "
+                       << stringifyCSRegSet(notRestored)
+                       << " spilled at " << getBasicBlockName(MBB)
+                       << " are never restored on path to return "
+                       << getBasicBlockName(SBB) << "\n");
+        }
+        restored.clear();
+      }
+    }
+  }
+
+  // Check restore placements.
+  for (CSRegBlockMap::iterator BI = CSRRestore.begin(),
+         BE = CSRRestore.end(); BI != BE; ++BI) {
+    MachineBasicBlock* MBB = BI->first;
+    CSRegSet restored = BI->second;
+    CSRegSet spilled;
+
+    if (restored.empty())
+      continue;
+
+    DEBUG(dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
+                 << stringifyCSRegSet(CSRSave[MBB])
+                 << "  RESTORE[" << getBasicBlockName(MBB) << "] = "
+                 << stringifyCSRegSet(restored) << "\n");
+
+    if (CSRSave[MBB].intersects(restored)) {
+      spilled |= (CSRSave[MBB] & restored);
+    }
+    // Walk inverse depth first from MBB to find spills of all
+    // CSRs restored at MBB:
+    for (idf_iterator<MachineBasicBlock*> BI = idf_begin(MBB),
+           BE = idf_end(MBB); BI != BE; ++BI) {
+      MachineBasicBlock* PBB = *BI;
+      if (PBB == MBB)
+        continue;
+      // Stop when we encounter restores of any CSRs restored at MBB that
+      // have not yet been seen to be spilled.
+      if (CSRRestore[PBB].intersects(restored) &&
+          !spilled.contains(CSRRestore[PBB] & restored))
+        break;
+      // Collect the CSRs restored at MBB that are spilled
+      // at this DF predecessor of MBB.
+      if (CSRSave[PBB].intersects(restored))
+        spilled |= (CSRSave[PBB] & restored);
+    }
+    if (spilled != restored) {
+      CSRegSet notSpilled = (restored - spilled);
+      DEBUG(dbgs() << MF->getFunction()->getName() << ": "
+                   << stringifyCSRegSet(notSpilled)
+                   << " restored at " << getBasicBlockName(MBB)
+                   << " are never spilled\n");
+    }
+  }
+}
+
+// Debugging print methods.
+std::string PEI::getBasicBlockName(const MachineBasicBlock* MBB) {
+  if (!MBB)
+    return "";
+
+  if (MBB->getBasicBlock())
+    return MBB->getBasicBlock()->getNameStr();
+
+  std::ostringstream name;
+  name << "_MBB_" << MBB->getNumber();
+  return name.str();
+}
+
+std::string PEI::stringifyCSRegSet(const CSRegSet& s) {
+  const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo();
+  const std::vector<CalleeSavedInfo> CSI =
+    MF->getFrameInfo()->getCalleeSavedInfo();
+
+  std::ostringstream srep;
+  if (CSI.size() == 0) {
+    srep << "[]";
+    return srep.str();
+  }
+  srep << "[";
+  CSRegSet::iterator I = s.begin(), E = s.end();
+  if (I != E) {
+    unsigned reg = CSI[*I].getReg();
+    srep << TRI->getName(reg);
+    for (++I; I != E; ++I) {
+      reg = CSI[*I].getReg();
+      srep << ",";
+      srep << TRI->getName(reg);
+    }
+  }
+  srep << "]";
+  return srep.str();
+}
+
+void PEI::dumpSet(const CSRegSet& s) {
+  DEBUG(dbgs() << stringifyCSRegSet(s) << "\n");
+}
+
+void PEI::dumpUsed(MachineBasicBlock* MBB) {
+  DEBUG({
+      if (MBB)
+        dbgs() << "CSRUsed[" << getBasicBlockName(MBB) << "] = "
+               << stringifyCSRegSet(CSRUsed[MBB])  << "\n";
+    });
+}
+
+void PEI::dumpAllUsed() {
+    for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+         MBBI != MBBE; ++MBBI) {
+      MachineBasicBlock* MBB = MBBI;
+      dumpUsed(MBB);
+    }
+}
+
+void PEI::dumpSets(MachineBasicBlock* MBB) {
+  DEBUG({
+      if (MBB)
+        dbgs() << getBasicBlockName(MBB)           << " | "
+               << stringifyCSRegSet(CSRUsed[MBB])  << " | "
+               << stringifyCSRegSet(AnticIn[MBB])  << " | "
+               << stringifyCSRegSet(AnticOut[MBB]) << " | "
+               << stringifyCSRegSet(AvailIn[MBB])  << " | "
+               << stringifyCSRegSet(AvailOut[MBB]) << "\n";
+    });
+}
+
+void PEI::dumpSets1(MachineBasicBlock* MBB) {
+  DEBUG({
+      if (MBB)
+        dbgs() << getBasicBlockName(MBB)             << " | "
+               << stringifyCSRegSet(CSRUsed[MBB])    << " | "
+               << stringifyCSRegSet(AnticIn[MBB])    << " | "
+               << stringifyCSRegSet(AnticOut[MBB])   << " | "
+               << stringifyCSRegSet(AvailIn[MBB])    << " | "
+               << stringifyCSRegSet(AvailOut[MBB])   << " | "
+               << stringifyCSRegSet(CSRSave[MBB])    << " | "
+               << stringifyCSRegSet(CSRRestore[MBB]) << "\n";
+    });
+}
+
+void PEI::dumpAllSets() {
+    for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+         MBBI != MBBE; ++MBBI) {
+      MachineBasicBlock* MBB = MBBI;
+      dumpSets1(MBB);
+    }
+}
+
+void PEI::dumpSRSets() {
+  DEBUG({
+      for (MachineFunction::iterator MBB = MF->begin(), E = MF->end();
+           MBB != E; ++MBB) {
+        if (!CSRSave[MBB].empty()) {
+          dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
+                 << stringifyCSRegSet(CSRSave[MBB]);
+          if (CSRRestore[MBB].empty())
+            dbgs() << '\n';
+        }
+
+        if (!CSRRestore[MBB].empty() && !CSRSave[MBB].empty())
+          dbgs() << "    "
+                 << "RESTORE[" << getBasicBlockName(MBB) << "] = "
+                 << stringifyCSRegSet(CSRRestore[MBB]) << "\n";
+      }
+    });
+}
+#endif
diff --git a/src/LLVM/lib/CodeGen/SimpleRegisterCoalescing.cpp b/src/LLVM/lib/CodeGen/SimpleRegisterCoalescing.cpp
new file mode 100644
index 0000000..1dcc674
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SimpleRegisterCoalescing.cpp
@@ -0,0 +1,1838 @@
+//===-- SimpleRegisterCoalescing.cpp - Register Coalescing ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple register coalescing pass that attempts to
+// aggressively coalesce every register copy that it can.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regcoalescing"
+#include "SimpleRegisterCoalescing.h"
+#include "VirtRegMap.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/Value.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterCoalescer.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <cmath>
+using namespace llvm;
+
+STATISTIC(numJoins    , "Number of interval joins performed");
+STATISTIC(numCrossRCs , "Number of cross class joins performed");
+STATISTIC(numCommutes , "Number of instructions commuted");
+STATISTIC(numExtends  , "Number of copies extended");
+STATISTIC(NumReMats   , "Number of instructions re-materialized");
+STATISTIC(numPeep     , "Number of identity moves eliminated after coalescing");
+STATISTIC(numAborts   , "Number of times interval joining aborted");
+STATISTIC(numDeadValNo, "Number of valno defs marked dead");
+
+char SimpleRegisterCoalescing::ID = 0;
+static cl::opt<bool>
+EnableJoining("join-liveintervals",
+              cl::desc("Coalesce copies (default=true)"),
+              cl::init(true));
+
+static cl::opt<bool>
+DisableCrossClassJoin("disable-cross-class-join",
+               cl::desc("Avoid coalescing cross register class copies"),
+               cl::init(false), cl::Hidden);
+
+static RegisterPass<SimpleRegisterCoalescing>
+X("simple-register-coalescing", "Simple Register Coalescing");
+
+// Declare that we implement the RegisterCoalescer interface
+static RegisterAnalysisGroup<RegisterCoalescer, true/*The Default*/> V(X);
+
+char &llvm::SimpleRegisterCoalescingID = SimpleRegisterCoalescing::ID;
+
+void SimpleRegisterCoalescing::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<AliasAnalysis>();
+  AU.addRequired<LiveIntervals>();
+  AU.addPreserved<LiveIntervals>();
+  AU.addPreserved<SlotIndexes>();
+  AU.addRequired<MachineLoopInfo>();
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addPreservedID(MachineDominatorsID);
+  if (StrongPHIElim)
+    AU.addPreservedID(StrongPHIEliminationID);
+  else
+    AU.addPreservedID(PHIEliminationID);
+  AU.addPreservedID(TwoAddressInstructionPassID);
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy with IntA
+/// being the source and IntB being the dest, thus this defines a value number
+/// in IntB.  If the source value number (in IntA) is defined by a copy from B,
+/// see if we can merge these two pieces of B into a single value number,
+/// eliminating a copy.  For example:
+///
+///  A3 = B0
+///    ...
+///  B1 = A3      <- this copy
+///
+/// In this case, B0 can be extended to where the B1 copy lives, allowing the B1
+/// value number to be replaced with B0 (which simplifies the B liveinterval).
+///
+/// This returns true if an interval was modified.
+///
+bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(const CoalescerPair &CP,
+                                                    MachineInstr *CopyMI) {
+  // Bail if there is no dst interval - can happen when merging physical subreg
+  // operations.
+  if (!li_->hasInterval(CP.getDstReg()))
+    return false;
+
+  LiveInterval &IntA =
+    li_->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+  LiveInterval &IntB =
+    li_->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+  SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI).getDefIndex();
+
+  // BValNo is a value number in B that is defined by a copy from A.  'B1' in
+  // the example above.
+  LiveInterval::iterator BLR = IntB.FindLiveRangeContaining(CopyIdx);
+  if (BLR == IntB.end()) return false;
+  VNInfo *BValNo = BLR->valno;
+
+  // Get the location that B is defined at.  Two options: either this value has
+  // an unknown definition point or it is defined at CopyIdx.  If unknown, we
+  // can't process it.
+  if (!BValNo->getCopy()) return false;
+  assert(BValNo->def == CopyIdx && "Copy doesn't define the value?");
+
+  // AValNo is the value number in A that defines the copy, A3 in the example.
+  SlotIndex CopyUseIdx = CopyIdx.getUseIndex();
+  LiveInterval::iterator ALR = IntA.FindLiveRangeContaining(CopyUseIdx);
+  // The live range might not exist after fun with physreg coalescing.
+  if (ALR == IntA.end()) return false;
+  VNInfo *AValNo = ALR->valno;
+  // If it's re-defined by an early clobber somewhere in the live range, then
+  // it's not safe to eliminate the copy. FIXME: This is a temporary workaround.
+  // See PR3149:
+  // 172     %ECX<def> = MOV32rr %reg1039<kill>
+  // 180     INLINEASM <es:subl $5,$1
+  //         sbbl $3,$0>, 10, %EAX<def>, 14, %ECX<earlyclobber,def>, 9,
+  //         %EAX<kill>,
+  // 36, <fi#0>, 1, %reg0, 0, 9, %ECX<kill>, 36, <fi#1>, 1, %reg0, 0
+  // 188     %EAX<def> = MOV32rr %EAX<kill>
+  // 196     %ECX<def> = MOV32rr %ECX<kill>
+  // 204     %ECX<def> = MOV32rr %ECX<kill>
+  // 212     %EAX<def> = MOV32rr %EAX<kill>
+  // 220     %EAX<def> = MOV32rr %EAX
+  // 228     %reg1039<def> = MOV32rr %ECX<kill>
+  // The early clobber operand ties ECX input to the ECX def.
+  //
+  // The live interval of ECX is represented as this:
+  // %reg20,inf = [46,47:1)[174,230:0)  0@174-(230) 1@46-(47)
+  // The coalescer has no idea there was a def in the middle of [174,230].
+  if (AValNo->hasRedefByEC())
+    return false;
+
+  // If AValNo is defined as a copy from IntB, we can potentially process this.
+  // Get the instruction that defines this value number.
+  if (!CP.isCoalescable(AValNo->getCopy()))
+    return false;
+
+  // Get the LiveRange in IntB that this value number starts with.
+  LiveInterval::iterator ValLR =
+    IntB.FindLiveRangeContaining(AValNo->def.getPrevSlot());
+  if (ValLR == IntB.end())
+    return false;
+
+  // Make sure that the end of the live range is inside the same block as
+  // CopyMI.
+  MachineInstr *ValLREndInst =
+    li_->getInstructionFromIndex(ValLR->end.getPrevSlot());
+  if (!ValLREndInst || ValLREndInst->getParent() != CopyMI->getParent())
+    return false;
+
+  // Okay, we now know that ValLR ends in the same block that the CopyMI
+  // live-range starts.  If there are no intervening live ranges between them in
+  // IntB, we can merge them.
+  if (ValLR+1 != BLR) return false;
+
+  // If a live interval is a physical register, conservatively check if any
+  // of its sub-registers is overlapping the live interval of the virtual
+  // register. If so, do not coalesce.
+  if (TargetRegisterInfo::isPhysicalRegister(IntB.reg) &&
+      *tri_->getSubRegisters(IntB.reg)) {
+    for (const unsigned* SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR)
+      if (li_->hasInterval(*SR) && IntA.overlaps(li_->getInterval(*SR))) {
+        DEBUG({
+            dbgs() << "\t\tInterfere with sub-register ";
+            li_->getInterval(*SR).print(dbgs(), tri_);
+          });
+        return false;
+      }
+  }
+
+  DEBUG({
+      dbgs() << "Extending: ";
+      IntB.print(dbgs(), tri_);
+    });
+
+  SlotIndex FillerStart = ValLR->end, FillerEnd = BLR->start;
+  // We are about to delete CopyMI, so need to remove it as the 'instruction
+  // that defines this value #'. Update the valnum with the new defining
+  // instruction #.
+  BValNo->def  = FillerStart;
+  BValNo->setCopy(0);
+
+  // Okay, we can merge them.  We need to insert a new liverange:
+  // [ValLR.end, BLR.begin) of either value number, then we merge the
+  // two value numbers.
+  IntB.addRange(LiveRange(FillerStart, FillerEnd, BValNo));
+
+  // If the IntB live range is assigned to a physical register, and if that
+  // physreg has sub-registers, update their live intervals as well.
+  if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) {
+    for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) {
+      if (!li_->hasInterval(*SR))
+        continue;
+      LiveInterval &SRLI = li_->getInterval(*SR);
+      SRLI.addRange(LiveRange(FillerStart, FillerEnd,
+                              SRLI.getNextValue(FillerStart, 0, true,
+                                                li_->getVNInfoAllocator())));
+    }
+  }
+
+  // Okay, merge "B1" into the same value number as "B0".
+  if (BValNo != ValLR->valno) {
+    IntB.MergeValueNumberInto(BValNo, ValLR->valno);
+  }
+  DEBUG({
+      dbgs() << "   result = ";
+      IntB.print(dbgs(), tri_);
+      dbgs() << "\n";
+    });
+
+  // If the source instruction was killing the source register before the
+  // merge, unset the isKill marker given the live range has been extended.
+  int UIdx = ValLREndInst->findRegisterUseOperandIdx(IntB.reg, true);
+  if (UIdx != -1) {
+    ValLREndInst->getOperand(UIdx).setIsKill(false);
+  }
+
+  // If the copy instruction was killing the destination register before the
+  // merge, find the last use and trim the live range. That will also add the
+  // isKill marker.
+  if (ALR->end == CopyIdx)
+    TrimLiveIntervalToLastUse(CopyUseIdx, CopyMI->getParent(), IntA, ALR);
+
+  ++numExtends;
+  return true;
+}
+
+/// HasOtherReachingDefs - Return true if there are definitions of IntB
+/// other than BValNo val# that can reach uses of AValNo val# of IntA.
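+///
+/// For example (hypothetical), the merge would be unsafe here because a
+/// second def of B overlaps the range where AValNo's value is live:
+///
+///   B0 = ...
+///   A3 = B0        <- AValNo, defined by a copy from B
+///   B1 = ...       <- other def of B
+///      = A3        <- use of AValNo reached while B1 is live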
+bool SimpleRegisterCoalescing::HasOtherReachingDefs(LiveInterval &IntA,
+                                                    LiveInterval &IntB,
+                                                    VNInfo *AValNo,
+                                                    VNInfo *BValNo) {
+  for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
+       AI != AE; ++AI) {
+    if (AI->valno != AValNo) continue;
+    LiveInterval::Ranges::iterator BI =
+      std::upper_bound(IntB.ranges.begin(), IntB.ranges.end(), AI->start);
+    if (BI != IntB.ranges.begin())
+      --BI;
+    for (; BI != IntB.ranges.end() && AI->end >= BI->start; ++BI) {
+      if (BI->valno == BValNo)
+        continue;
+      // When BValNo is null, we're looking for a dummy clobber-value for a subreg.
+      if (!BValNo && !BI->valno->isDefAccurate() && !BI->valno->getCopy())
+        continue;
+      if (BI->start <= AI->start && BI->end > AI->start)
+        return true;
+      if (BI->start > AI->start && BI->start < AI->end)
+        return true;
+    }
+  }
+  return false;
+}
+
+static void
+TransferImplicitOps(MachineInstr *MI, MachineInstr *NewMI) {
+  for (unsigned i = MI->getDesc().getNumOperands(), e = MI->getNumOperands();
+       i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isImplicit())
+      NewMI->addOperand(MO);
+  }
+}
+
+/// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy with
+/// IntA being the source and IntB being the dest, thus this defines a value
+/// number in IntB.  If the source value number (in IntA) is defined by a
+/// commutable instruction and its other operand is coalesced to the copy dest
+/// register, see if we can transform the copy into a noop by commuting the
+/// definition. For example,
+///
+///  A3 = op A2 B0<kill>
+///    ...
+///  B1 = A3      <- this copy
+///    ...
+///     = op A3   <- more uses
+///
+/// ==>
+///
+///  B2 = op B0 A2<kill>
+///    ...
+///  B1 = B2      <- now an identity copy
+///    ...
+///     = op B2   <- more uses
+///
+/// This returns true if an interval was modified.
+///
+bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(const CoalescerPair &CP,
+                                                        MachineInstr *CopyMI) {
+  // FIXME: For now, only eliminate the copy by commuting its def when the
+  // source register is a virtual register. We want to guard against cases
+  // where the copy is a back edge copy and commuting the def lengthens the
+  // live interval of the source register to the entire loop.
+  if (CP.isPhys() && CP.isFlipped())
+    return false;
+
+  // Bail if there is no dst interval.
+  if (!li_->hasInterval(CP.getDstReg()))
+    return false;
+
+  SlotIndex CopyIdx =
+    li_->getInstructionIndex(CopyMI).getDefIndex();
+
+  LiveInterval &IntA =
+    li_->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+  LiveInterval &IntB =
+    li_->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+
+  // BValNo is a value number in B that is defined by a copy from A. 'B1' in
+  // the example above.
+  LiveInterval::iterator BLR = IntB.FindLiveRangeContaining(CopyIdx);
+  if (BLR == IntB.end()) return false;
+  VNInfo *BValNo = BLR->valno;
+
+  // Get the location that B is defined at.  Two options: either this value has
+  // an unknown definition point or it is defined at CopyIdx.  If unknown, we
+  // can't process it.
+  if (!BValNo->getCopy()) return false;
+  assert(BValNo->def == CopyIdx && "Copy doesn't define the value?");
+
+  // AValNo is the value number in A that defines the copy, A3 in the example.
+  LiveInterval::iterator ALR =
+    IntA.FindLiveRangeContaining(CopyIdx.getUseIndex());
+
+  assert(ALR != IntA.end() && "Live range not found!");
+  VNInfo *AValNo = ALR->valno;
+  // If other defs can reach uses of this def, then it's not safe to perform
+  // the optimization. FIXME: Do isPHIDef and isDefAccurate both need to be
+  // tested?
+  if (AValNo->isPHIDef() || !AValNo->isDefAccurate() ||
+      AValNo->isUnused() || AValNo->hasPHIKill())
+    return false;
+  MachineInstr *DefMI = li_->getInstructionFromIndex(AValNo->def);
+  if (!DefMI)
+    return false;
+  const TargetInstrDesc &TID = DefMI->getDesc();
+  if (!TID.isCommutable())
+    return false;
+  // If DefMI is a two-address instruction then commuting it will change the
+  // destination register.
+  int DefIdx = DefMI->findRegisterDefOperandIdx(IntA.reg);
+  assert(DefIdx != -1);
+  unsigned UseOpIdx;
+  if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx))
+    return false;
+  unsigned Op1, Op2, NewDstIdx;
+  if (!tii_->findCommutedOpIndices(DefMI, Op1, Op2))
+    return false;
+  if (Op1 == UseOpIdx)
+    NewDstIdx = Op2;
+  else if (Op2 == UseOpIdx)
+    NewDstIdx = Op1;
+  else
+    return false;
+
+  MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
+  unsigned NewReg = NewDstMO.getReg();
+  if (NewReg != IntB.reg || !NewDstMO.isKill())
+    return false;
+
+  // Make sure there are no other definitions of IntB that would reach the
+  // uses which the new definition can reach.
+  if (HasOtherReachingDefs(IntA, IntB, AValNo, BValNo))
+    return false;
+
+  bool BHasSubRegs = false;
+  if (TargetRegisterInfo::isPhysicalRegister(IntB.reg))
+    BHasSubRegs = *tri_->getSubRegisters(IntB.reg);
+
+  // Abort if the subregisters of IntB.reg have values that are not simply the
+  // clobbers from the superreg.
+  if (BHasSubRegs)
+    for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR)
+      if (li_->hasInterval(*SR) &&
+          HasOtherReachingDefs(IntA, li_->getInterval(*SR), AValNo, 0))
+        return false;
+
+  // If some of the uses of IntA.reg are already coalesced away, return false.
+  // It's not possible to determine whether it's safe to perform the coalescing.
+  for (MachineRegisterInfo::use_nodbg_iterator UI =
+         mri_->use_nodbg_begin(IntA.reg),
+       UE = mri_->use_nodbg_end(); UI != UE; ++UI) {
+    MachineInstr *UseMI = &*UI;
+    SlotIndex UseIdx = li_->getInstructionIndex(UseMI);
+    LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
+    if (ULR == IntA.end())
+      continue;
+    if (ULR->valno == AValNo && JoinedCopies.count(UseMI))
+      return false;
+  }
+
+  // At this point we have decided that it is legal to do this
+  // transformation.  Start by commuting the instruction.
+  MachineBasicBlock *MBB = DefMI->getParent();
+  MachineInstr *NewMI = tii_->commuteInstruction(DefMI);
+  if (!NewMI)
+    return false;
+  if (NewMI != DefMI) {
+    li_->ReplaceMachineInstrInMaps(DefMI, NewMI);
+    MBB->insert(DefMI, NewMI);
+    MBB->erase(DefMI);
+  }
+  unsigned OpIdx = NewMI->findRegisterUseOperandIdx(IntA.reg, false);
+  NewMI->getOperand(OpIdx).setIsKill();
+
+  bool BHasPHIKill = BValNo->hasPHIKill();
+  SmallVector<VNInfo*, 4> BDeadValNos;
+  std::map<SlotIndex, SlotIndex> BExtend;
+
+  // If ALR and BLR overlap and the end of BLR extends beyond the end of ALR,
+  // e.g.
+  // A = or A, B
+  // ...
+  // B = A
+  // ...
+  // C = A<kill>
+  // ...
+  //   = B
+  bool Extended = BLR->end > ALR->end && ALR->end != ALR->start;
+  if (Extended)
+    BExtend[ALR->end] = BLR->end;
+
+  // Update uses of IntA of the specific Val# with IntB.
+  for (MachineRegisterInfo::use_iterator UI = mri_->use_begin(IntA.reg),
+         UE = mri_->use_end(); UI != UE;) {
+    MachineOperand &UseMO = UI.getOperand();
+    MachineInstr *UseMI = &*UI;
+    ++UI;
+    if (JoinedCopies.count(UseMI))
+      continue;
+    if (UseMI->isDebugValue()) {
+      // FIXME: These don't have an instruction index. Not clear we have enough
+      // info to decide whether to do this replacement or not. For now do it.
+      UseMO.setReg(NewReg);
+      continue;
+    }
+    SlotIndex UseIdx = li_->getInstructionIndex(UseMI).getUseIndex();
+    LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
+    if (ULR == IntA.end() || ULR->valno != AValNo)
+      continue;
+    if (TargetRegisterInfo::isPhysicalRegister(NewReg))
+      UseMO.substPhysReg(NewReg, *tri_);
+    else
+      UseMO.setReg(NewReg);
+    if (UseMI == CopyMI)
+      continue;
+    if (UseMO.isKill()) {
+      if (Extended)
+        UseMO.setIsKill(false);
+    }
+    if (!UseMI->isCopy())
+      continue;
+    if (UseMI->getOperand(0).getReg() != IntB.reg ||
+        UseMI->getOperand(0).getSubReg())
+      continue;
+
+    // This copy will become a noop. If it's defining a new val#,
+    // remove that val# as well. However this live range is being
+    // extended to the end of the existing live range defined by the copy.
+    SlotIndex DefIdx = UseIdx.getDefIndex();
+    const LiveRange *DLR = IntB.getLiveRangeContaining(DefIdx);
+    if (!DLR)
+      continue;
+    BHasPHIKill |= DLR->valno->hasPHIKill();
+    assert(DLR->valno->def == DefIdx);
+    BDeadValNos.push_back(DLR->valno);
+    BExtend[DLR->start] = DLR->end;
+    JoinedCopies.insert(UseMI);
+  }
+
+  // We need to insert a new liverange: [ALR.start, LastUse). We may be able
+  // to simply extend BLR if CopyMI doesn't end the range.
+  DEBUG({
+      dbgs() << "Extending: ";
+      IntB.print(dbgs(), tri_);
+    });
+
+  // Remove val#'s defined by copies that will be coalesced away.
+  for (unsigned i = 0, e = BDeadValNos.size(); i != e; ++i) {
+    VNInfo *DeadVNI = BDeadValNos[i];
+    if (BHasSubRegs) {
+      for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) {
+        if (!li_->hasInterval(*SR))
+          continue;
+        LiveInterval &SRLI = li_->getInterval(*SR);
+        if (const LiveRange *SRLR = SRLI.getLiveRangeContaining(DeadVNI->def))
+          SRLI.removeValNo(SRLR->valno);
+      }
+    }
+    IntB.removeValNo(BDeadValNos[i]);
+  }
+
+  // Extend BValNo by merging in IntA live ranges of AValNo. Val# definition
+  // is updated.
+  VNInfo *ValNo = BValNo;
+  ValNo->def = AValNo->def;
+  ValNo->setCopy(0);
+  for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
+       AI != AE; ++AI) {
+    if (AI->valno != AValNo) continue;
+    SlotIndex End = AI->end;
+    std::map<SlotIndex, SlotIndex>::iterator
+      EI = BExtend.find(End);
+    if (EI != BExtend.end())
+      End = EI->second;
+    IntB.addRange(LiveRange(AI->start, End, ValNo));
+  }
+  ValNo->setHasPHIKill(BHasPHIKill);
+
+  DEBUG({
+      dbgs() << "   result = ";
+      IntB.print(dbgs(), tri_);
+      dbgs() << "\nShortening: ";
+      IntA.print(dbgs(), tri_);
+    });
+
+  IntA.removeValNo(AValNo);
+
+  DEBUG({
+      dbgs() << "   result = ";
+      IntA.print(dbgs(), tri_);
+      dbgs() << '\n';
+    });
+
+  ++numCommutes;
+  return true;
+}
+
+/// isSameOrFallThroughBB - Return true if MBB == SuccMBB or MBB simply
+/// falls through to SuccMBB.
+static bool isSameOrFallThroughBB(MachineBasicBlock *MBB,
+                                  MachineBasicBlock *SuccMBB,
+                                  const TargetInstrInfo *tii_) {
+  if (MBB == SuccMBB)
+    return true;
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  return !tii_->AnalyzeBranch(*MBB, TBB, FBB, Cond) && !TBB && !FBB &&
+    MBB->isSuccessor(SuccMBB);
+}
+
+/// removeRange - Wrapper for LiveInterval::removeRange. This removes a range
+/// from a physical register live interval as well as from the live intervals
+/// of its sub-registers.
+static void removeRange(LiveInterval &li,
+                        SlotIndex Start, SlotIndex End,
+                        LiveIntervals *li_, const TargetRegisterInfo *tri_) {
+  li.removeRange(Start, End, true);
+  if (TargetRegisterInfo::isPhysicalRegister(li.reg)) {
+    for (const unsigned* SR = tri_->getSubRegisters(li.reg); *SR; ++SR) {
+      if (!li_->hasInterval(*SR))
+        continue;
+      LiveInterval &sli = li_->getInterval(*SR);
+      SlotIndex RemoveStart = Start;
+      SlotIndex RemoveEnd = Start;
+
+      while (RemoveEnd != End) {
+        LiveInterval::iterator LR = sli.FindLiveRangeContaining(RemoveStart);
+        if (LR == sli.end())
+          break;
+        RemoveEnd = (LR->end < End) ? LR->end : End;
+        sli.removeRange(RemoveStart, RemoveEnd, true);
+        RemoveStart = RemoveEnd;
+      }
+    }
+  }
+}
+
+/// TrimLiveIntervalToLastUse - If there is a last use in the same basic block
+/// as the copy instruction, trim the live interval to the last use and return
+/// true.
+bool
+SimpleRegisterCoalescing::TrimLiveIntervalToLastUse(SlotIndex CopyIdx,
+                                                    MachineBasicBlock *CopyMBB,
+                                                    LiveInterval &li,
+                                                    const LiveRange *LR) {
+  SlotIndex MBBStart = li_->getMBBStartIdx(CopyMBB);
+  SlotIndex LastUseIdx;
+  MachineOperand *LastUse =
+    lastRegisterUse(LR->start, CopyIdx.getPrevSlot(), li.reg, LastUseIdx);
+  if (LastUse) {
+    MachineInstr *LastUseMI = LastUse->getParent();
+    if (!isSameOrFallThroughBB(LastUseMI->getParent(), CopyMBB, tii_)) {
+      // r1024 = op
+      // ...
+      // BB1:
+      //       = r1024
+      //
+      // BB2:
+      // r1025<dead> = r1024<kill>
+      if (MBBStart < LR->end)
+        removeRange(li, MBBStart, LR->end, li_, tri_);
+      return true;
+    }
+
+    // There are uses before the copy; just shorten the live range to the end
+    // of the last use.
+    LastUse->setIsKill();
+    removeRange(li, LastUseIdx.getDefIndex(), LR->end, li_, tri_);
+    if (LastUseMI->isCopy()) {
+      MachineOperand &DefMO = LastUseMI->getOperand(0);
+      if (DefMO.getReg() == li.reg && !DefMO.getSubReg())
+        DefMO.setIsDead();
+    }
+    return true;
+  }
+
+  // Is it livein?
+  if (LR->start <= MBBStart && LR->end > MBBStart) {
+    if (LR->start == li_->getZeroIndex()) {
+      assert(TargetRegisterInfo::isPhysicalRegister(li.reg));
+      // Live-in to the function but dead. Remove it from entry live-in set.
+      mf_->begin()->removeLiveIn(li.reg);
+    }
+    // FIXME: Shorten intervals in BBs that reach this BB.
+  }
+
+  return false;
+}
+
+/// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial
+/// computation, replace the copy by rematerializing the definition.
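+///
+/// For example (hypothetical registers and opcode, for illustration):
+///
+///   %reg1024 = MOV32ri 7
+///     ...
+///   %reg1025 = %reg1024    <- CopyMI
+/// ==>
+///   %reg1025 = MOV32ri 7   <- definition rematerialized, copy removed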
+bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt,
+                                                       unsigned DstReg,
+                                                       unsigned DstSubIdx,
+                                                       MachineInstr *CopyMI) {
+  SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI).getUseIndex();
+  LiveInterval::iterator SrcLR = SrcInt.FindLiveRangeContaining(CopyIdx);
+  assert(SrcLR != SrcInt.end() && "Live range not found!");
+  VNInfo *ValNo = SrcLR->valno;
+  // If other defs can reach uses of this def, then it's not safe to perform
+  // the optimization. FIXME: Do isPHIDef and isDefAccurate both need to be
+  // tested?
+  if (ValNo->isPHIDef() || !ValNo->isDefAccurate() ||
+      ValNo->isUnused() || ValNo->hasPHIKill())
+    return false;
+  MachineInstr *DefMI = li_->getInstructionFromIndex(ValNo->def);
+  assert(DefMI && "Defining instruction disappeared");
+  const TargetInstrDesc &TID = DefMI->getDesc();
+  if (!TID.isAsCheapAsAMove())
+    return false;
+  if (!tii_->isTriviallyReMaterializable(DefMI, AA))
+    return false;
+  bool SawStore = false;
+  if (!DefMI->isSafeToMove(tii_, AA, SawStore))
+    return false;
+  if (TID.getNumDefs() != 1)
+    return false;
+  if (!DefMI->isImplicitDef()) {
+    // Make sure the copy destination register class fits the instruction
+    // definition register class. The mismatch can happen as a result of earlier
+    // extract_subreg, insert_subreg, subreg_to_reg coalescing.
+    const TargetRegisterClass *RC = TID.OpInfo[0].getRegClass(tri_);
+    if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+      if (mri_->getRegClass(DstReg) != RC)
+        return false;
+    } else if (!RC->contains(DstReg))
+      return false;
+  }
+
+  // If the destination register has a sub-register index on it, make sure it
+  // matches the instruction register class.
+  if (DstSubIdx) {
+    const TargetInstrDesc &TID = DefMI->getDesc();
+    if (TID.getNumDefs() != 1)
+      return false;
+    const TargetRegisterClass *DstRC = mri_->getRegClass(DstReg);
+    const TargetRegisterClass *DstSubRC =
+      DstRC->getSubRegisterRegClass(DstSubIdx);
+    const TargetRegisterClass *DefRC = TID.OpInfo[0].getRegClass(tri_);
+    if (DefRC == DstRC)
+      DstSubIdx = 0;
+    else if (DefRC != DstSubRC)
+      return false;
+  }
+
+  RemoveCopyFlag(DstReg, CopyMI);
+
+  // If copy kills the source register, find the last use and propagate
+  // kill.
+  bool checkForDeadDef = false;
+  MachineBasicBlock *MBB = CopyMI->getParent();
+  if (SrcLR->end == CopyIdx.getDefIndex())
+    if (!TrimLiveIntervalToLastUse(CopyIdx, MBB, SrcInt, SrcLR)) {
+      checkForDeadDef = true;
+    }
+
+  MachineBasicBlock::iterator MII =
+    llvm::next(MachineBasicBlock::iterator(CopyMI));
+  tii_->reMaterialize(*MBB, MII, DstReg, DstSubIdx, DefMI, *tri_);
+  MachineInstr *NewMI = prior(MII);
+
+  if (checkForDeadDef) {
+    // PR4090 fix: Trim interval failed because there was no use of the
+    // source interval in this MBB. If the def is in this MBB too then we
+    // should mark it dead:
+    if (DefMI->getParent() == MBB) {
+      DefMI->addRegisterDead(SrcInt.reg, tri_);
+      SrcLR->end = SrcLR->start.getNextSlot();
+    }
+  }
+
+  // Update the implicit def interval valnos. The implicit operands
+  // themselves are transferred to NewMI by TransferImplicitOps below;
+  // adding them here as well would duplicate them on the rematerialized
+  // instruction.
+  for (unsigned i = CopyMI->getDesc().getNumOperands(),
+         e = CopyMI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = CopyMI->getOperand(i);
+    if (MO.isDef())
+      RemoveCopyFlag(MO.getReg(), CopyMI);
+  }
+
+  TransferImplicitOps(CopyMI, NewMI);
+  li_->ReplaceMachineInstrInMaps(CopyMI, NewMI);
+  CopyMI->eraseFromParent();
+  ReMatCopies.insert(CopyMI);
+  ReMatDefs.insert(DefMI);
+  DEBUG(dbgs() << "Remat: " << *NewMI);
+  ++NumReMats;
+  return true;
+}
+
+/// UpdateRegDefsUses - Replace all defs and uses of SrcReg with DstReg and
+/// update the subregister number if it is not zero. If DstReg is a
+/// physical register and the existing subregister number of the def / use
+/// being updated is not zero, make sure to set it to the correct physical
+/// subregister.
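+///
+/// For example (hypothetical registers and index): with a virtual
+/// DstReg and SubIdx = 1, a use of SrcReg %reg1024 is rewritten to
+/// %reg1025:1; with a physical DstReg such as %EAX and a non-zero
+/// sub-register index on the operand, the operand is rewritten to the
+/// matching physical sub-register, e.g. %AX.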
+void
+SimpleRegisterCoalescing::UpdateRegDefsUses(const CoalescerPair &CP) {
+  bool DstIsPhys = CP.isPhys();
+  unsigned SrcReg = CP.getSrcReg();
+  unsigned DstReg = CP.getDstReg();
+  unsigned SubIdx = CP.getSubIdx();
+
+  for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(SrcReg);
+       MachineInstr *UseMI = I.skipInstruction();) {
+    // A PhysReg copy that won't be coalesced can perhaps be rematerialized
+    // instead.
+    if (DstIsPhys) {
+      if (UseMI->isCopy() &&
+          !UseMI->getOperand(1).getSubReg() &&
+          !UseMI->getOperand(0).getSubReg() &&
+          UseMI->getOperand(1).getReg() == SrcReg &&
+          UseMI->getOperand(0).getReg() != SrcReg &&
+          UseMI->getOperand(0).getReg() != DstReg &&
+          !JoinedCopies.count(UseMI) &&
+          ReMaterializeTrivialDef(li_->getInterval(SrcReg),
+                                  UseMI->getOperand(0).getReg(), 0, UseMI))
+        continue;
+    }
+
+    SmallVector<unsigned,8> Ops;
+    bool Reads, Writes;
+    tie(Reads, Writes) = UseMI->readsWritesVirtualRegister(SrcReg, &Ops);
+    bool Kills = false, Deads = false;
+
+    // Replace SrcReg with DstReg in all UseMI operands.
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+      MachineOperand &MO = UseMI->getOperand(Ops[i]);
+      Kills |= MO.isKill();
+      Deads |= MO.isDead();
+
+      if (DstIsPhys)
+        MO.substPhysReg(DstReg, *tri_);
+      else
+        MO.substVirtReg(DstReg, SubIdx, *tri_);
+    }
+
+    // This instruction is a copy that will be removed.
+    if (JoinedCopies.count(UseMI))
+      continue;
+
+    if (SubIdx) {
+      // If UseMI was a simple SrcReg def, make sure we didn't turn it into a
+      // read-modify-write of DstReg.
+      if (Deads)
+        UseMI->addRegisterDead(DstReg, tri_);
+      else if (!Reads && Writes)
+        UseMI->addRegisterDefined(DstReg, tri_);
+
+      // Kill flags apply to the whole physical register.
+      if (DstIsPhys && Kills)
+        UseMI->addRegisterKilled(DstReg, tri_);
+    }
+
+    DEBUG({
+        dbgs() << "\t\tupdated: ";
+        if (!UseMI->isDebugValue())
+          dbgs() << li_->getInstructionIndex(UseMI) << "\t";
+        dbgs() << *UseMI;
+      });
+  }
+}
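+
+// A small example of the rewrite above (hypothetical registers and subreg
+// index): coalescing virtual %reg1024 into %reg1025 with SubIdx sub_16 turns
+//
+//   %reg1024 = ...                  %reg1025:sub_16 = ...
+//   ... = use %reg1024       =>     ... = use %reg1025:sub_16
+//
+// and when DstReg is a physreg such as %EAX, the matching physical
+// subregister (%AX for sub_16) is substituted instead.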
+
+/// removeIntervalIfEmpty - Check if the live interval of a physical register
+/// is empty, if so remove it and also remove the empty intervals of its
+/// sub-registers. Return true if live interval is removed.
+static bool removeIntervalIfEmpty(LiveInterval &li, LiveIntervals *li_,
+                                  const TargetRegisterInfo *tri_) {
+  if (li.empty()) {
+    if (TargetRegisterInfo::isPhysicalRegister(li.reg))
+      for (const unsigned* SR = tri_->getSubRegisters(li.reg); *SR; ++SR) {
+        if (!li_->hasInterval(*SR))
+          continue;
+        LiveInterval &sli = li_->getInterval(*SR);
+        if (sli.empty())
+          li_->removeInterval(*SR);
+      }
+    li_->removeInterval(li.reg);
+    return true;
+  }
+  return false;
+}
+
+/// ShortenDeadCopyLiveRange - Shorten a live range defined by a dead copy.
+/// Return true if live interval is removed.
+bool SimpleRegisterCoalescing::ShortenDeadCopyLiveRange(LiveInterval &li,
+                                                        MachineInstr *CopyMI) {
+  SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI);
+  LiveInterval::iterator MLR =
+    li.FindLiveRangeContaining(CopyIdx.getDefIndex());
+  if (MLR == li.end())
+    return false;  // Already removed by ShortenDeadCopySrcLiveRange.
+  SlotIndex RemoveStart = MLR->start;
+  SlotIndex RemoveEnd = MLR->end;
+  SlotIndex DefIdx = CopyIdx.getDefIndex();
+  // Remove the liverange that's defined by this.
+  if (RemoveStart == DefIdx && RemoveEnd == DefIdx.getStoreIndex()) {
+    removeRange(li, RemoveStart, RemoveEnd, li_, tri_);
+    return removeIntervalIfEmpty(li, li_, tri_);
+  }
+  return false;
+}
+
+/// RemoveDeadDef - If a def of a live interval is now determined dead, remove
+/// the val# it defines. If the live interval becomes empty, remove it as well.
+bool SimpleRegisterCoalescing::RemoveDeadDef(LiveInterval &li,
+                                             MachineInstr *DefMI) {
+  SlotIndex DefIdx = li_->getInstructionIndex(DefMI).getDefIndex();
+  LiveInterval::iterator MLR = li.FindLiveRangeContaining(DefIdx);
+  if (DefIdx != MLR->valno->def)
+    return false;
+  li.removeValNo(MLR->valno);
+  return removeIntervalIfEmpty(li, li_, tri_);
+}
+
+void SimpleRegisterCoalescing::RemoveCopyFlag(unsigned DstReg,
+                                              const MachineInstr *CopyMI) {
+  SlotIndex DefIdx = li_->getInstructionIndex(CopyMI).getDefIndex();
+  if (li_->hasInterval(DstReg)) {
+    LiveInterval &LI = li_->getInterval(DstReg);
+    if (const LiveRange *LR = LI.getLiveRangeContaining(DefIdx))
+      if (LR->valno->getCopy() == CopyMI)
+        LR->valno->setCopy(0);
+  }
+  if (!TargetRegisterInfo::isPhysicalRegister(DstReg))
+    return;
+  for (const unsigned* AS = tri_->getAliasSet(DstReg); *AS; ++AS) {
+    if (!li_->hasInterval(*AS))
+      continue;
+    LiveInterval &LI = li_->getInterval(*AS);
+    if (const LiveRange *LR = LI.getLiveRangeContaining(DefIdx))
+      if (LR->valno->getCopy() == CopyMI)
+        LR->valno->setCopy(0);
+  }
+}
+
+/// PropagateDeadness - Propagate the dead marker to the instruction which
+/// defines the val#.
+static void PropagateDeadness(LiveInterval &li, MachineInstr *CopyMI,
+                              SlotIndex &LRStart, LiveIntervals *li_,
+                              const TargetRegisterInfo* tri_) {
+  MachineInstr *DefMI =
+    li_->getInstructionFromIndex(LRStart.getDefIndex());
+  if (DefMI && DefMI != CopyMI) {
+    int DeadIdx = DefMI->findRegisterDefOperandIdx(li.reg);
+    if (DeadIdx != -1)
+      DefMI->getOperand(DeadIdx).setIsDead();
+    else
+      DefMI->addOperand(MachineOperand::CreateReg(li.reg,
+                   /*def*/true, /*implicit*/true, /*kill*/false, /*dead*/true));
+    LRStart = LRStart.getNextSlot();
+  }
+}
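+
+// Effect sketch (hypothetical vreg): if the only kill of a value is a dead
+// copy, the instruction defining the value gets its def of the register
+// marked <dead> (an implicit dead def operand is appended if no explicit def
+// operand exists), and LRStart is bumped one slot so the removed range no
+// longer covers the def itself.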
+
+/// ShortenDeadCopySrcLiveRange - Shorten a live range as it's artificially
+/// extended by a dead copy. Mark the last use (if any) of the val# as kill,
+/// since it ends the live range there. If there isn't another use, then this
+/// live range is dead. Return true if live interval is removed.
+bool
+SimpleRegisterCoalescing::ShortenDeadCopySrcLiveRange(LiveInterval &li,
+                                                      MachineInstr *CopyMI) {
+  SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI);
+  if (CopyIdx == SlotIndex()) {
+    // FIXME: special case: function live in. It can be a general case if the
+    // first instruction index starts at > 0 value.
+    assert(TargetRegisterInfo::isPhysicalRegister(li.reg));
+    // Live-in to the function but dead. Remove it from entry live-in set.
+    if (mf_->begin()->isLiveIn(li.reg))
+      mf_->begin()->removeLiveIn(li.reg);
+    if (const LiveRange *LR = li.getLiveRangeContaining(CopyIdx))
+      removeRange(li, LR->start, LR->end, li_, tri_);
+    return removeIntervalIfEmpty(li, li_, tri_);
+  }
+
+  LiveInterval::iterator LR =
+    li.FindLiveRangeContaining(CopyIdx.getPrevIndex().getStoreIndex());
+  if (LR == li.end())
+    // Live-in but defined by a phi.
+    return false;
+
+  SlotIndex RemoveStart = LR->start;
+  SlotIndex RemoveEnd = CopyIdx.getStoreIndex();
+  if (LR->end > RemoveEnd)
+    // More uses past this copy? Nothing to do.
+    return false;
+
+  // If there is a last use in the same bb, we can't remove the live range.
+  // Shorten the live interval and return.
+  MachineBasicBlock *CopyMBB = CopyMI->getParent();
+  if (TrimLiveIntervalToLastUse(CopyIdx, CopyMBB, li, LR))
+    return false;
+
+  // There are other kills of the val#. Nothing to do.
+  if (!li.isOnlyLROfValNo(LR))
+    return false;
+
+  MachineBasicBlock *StartMBB = li_->getMBBFromIndex(RemoveStart);
+  if (!isSameOrFallThroughBB(StartMBB, CopyMBB, tii_))
+    // If the live range starts in another mbb and the copy mbb is not a fall
+    // through mbb, then we can only cut the range from the beginning of the
+    // copy mbb.
+    RemoveStart = li_->getMBBStartIdx(CopyMBB).getNextIndex().getBaseIndex();
+
+  if (LR->valno->def == RemoveStart) {
+    // If the def MI defines the val# and this copy is the only kill of the
+    // val#, then propagate the dead marker.
+    PropagateDeadness(li, CopyMI, RemoveStart, li_, tri_);
+    ++numDeadValNo;
+  }
+
+  removeRange(li, RemoveStart, RemoveEnd, li_, tri_);
+  return removeIntervalIfEmpty(li, li_, tri_);
+}
+
+
+/// isWinToJoinCrossClass - Return true if it's profitable to coalesce
+/// two virtual registers from different register classes.
+bool
+SimpleRegisterCoalescing::isWinToJoinCrossClass(unsigned SrcReg,
+                                                unsigned DstReg,
+                                             const TargetRegisterClass *SrcRC,
+                                             const TargetRegisterClass *DstRC,
+                                             const TargetRegisterClass *NewRC) {
+  unsigned NewRCCount = allocatableRCRegs_[NewRC].count();
+  // This heuristic is good enough in practice, but it's obviously not *right*.
+  // 4 is a magic number that works well enough for x86, ARM, etc. It filters
+  // out all but the most restrictive register classes.
+  if (NewRCCount > 4 ||
+      // Also coalesce aggressively (early exit) if the function is fairly
+      // small. For really special register classes with 3 or fewer
+      // registers, be a bit more careful.
+      (li_->getFuncInstructionCount() / NewRCCount) < 8)
+    return true;
+  LiveInterval &SrcInt = li_->getInterval(SrcReg);
+  LiveInterval &DstInt = li_->getInterval(DstReg);
+  unsigned SrcSize = li_->getApproximateInstructionCount(SrcInt);
+  unsigned DstSize = li_->getApproximateInstructionCount(DstInt);
+  if (SrcSize <= NewRCCount && DstSize <= NewRCCount)
+    return true;
+  // Estimate the *register use density*. If coalescing would at least
+  // double it, abort.
+  unsigned SrcUses = std::distance(mri_->use_nodbg_begin(SrcReg),
+                                   mri_->use_nodbg_end());
+  unsigned DstUses = std::distance(mri_->use_nodbg_begin(DstReg),
+                                   mri_->use_nodbg_end());
+  unsigned NewUses = SrcUses + DstUses;
+  unsigned NewSize = SrcSize + DstSize;
+  if (SrcRC != NewRC && SrcSize > NewRCCount) {
+    unsigned SrcRCCount = allocatableRCRegs_[SrcRC].count();
+    if (NewUses*SrcSize*SrcRCCount > 2*SrcUses*NewSize*NewRCCount)
+      return false;
+  }
+  if (DstRC != NewRC && DstSize > NewRCCount) {
+    unsigned DstRCCount = allocatableRCRegs_[DstRC].count();
+    if (NewUses*DstSize*DstRCCount > 2*DstUses*NewSize*NewRCCount)
+      return false;
+  }
+  return true;
+}
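+
+// Worked example of the density check (all numbers hypothetical): with
+// NewRCCount = 2 in a 200-instruction function, 200/2 = 100 >= 8, so no early
+// exit. Take SrcSize = 20, DstSize = 30, SrcUses = 10, DstUses = 15, giving
+// NewUses = 25 and NewSize = 50. If SrcRC != NewRC with SrcRCCount = 8:
+//   NewUses*SrcSize*SrcRCCount   = 25*20*8   = 4000
+//   2*SrcUses*NewSize*NewRCCount = 2*10*50*2 = 2000
+// 4000 > 2000, so the join is rejected as more than doubling use density.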
+
+
+/// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
+/// which are the src/dst of the copy instruction CopyMI.  This returns true
+/// if the copy was successfully coalesced away. If it is not currently
+/// possible to coalesce this interval, but it may be possible if other
+/// things get coalesced, then it returns true by reference in 'Again'.
+bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
+  MachineInstr *CopyMI = TheCopy.MI;
+
+  Again = false;
+  if (JoinedCopies.count(CopyMI) || ReMatCopies.count(CopyMI))
+    return false; // Already done.
+
+  DEBUG(dbgs() << li_->getInstructionIndex(CopyMI) << '\t' << *CopyMI);
+
+  CoalescerPair CP(*tii_, *tri_);
+  if (!CP.setRegisters(CopyMI)) {
+    DEBUG(dbgs() << "\tNot coalescable.\n");
+    return false;
+  }
+
+  // If they are already joined there is nothing to do.
+  if (CP.getSrcReg() == CP.getDstReg()) {
+    DEBUG(dbgs() << "\tCopy already coalesced.\n");
+    return false;  // Not coalescable.
+  }
+
+  DEBUG(dbgs() << "\tConsidering merging %reg" << CP.getSrcReg());
+
+  // Enforce policies.
+  if (CP.isPhys()) {
+    DEBUG(dbgs() <<" with physreg %" << tri_->getName(CP.getDstReg()) << "\n");
+    // Only coalesce to allocatable physreg.
+    if (!li_->isAllocatable(CP.getDstReg())) {
+      DEBUG(dbgs() << "\tRegister is an unallocatable physreg.\n");
+      return false;  // Not coalescable.
+    }
+  } else {
+    DEBUG({
+      dbgs() << " with reg%" << CP.getDstReg();
+      if (CP.getSubIdx())
+        dbgs() << ":" << tri_->getSubRegIndexName(CP.getSubIdx());
+      dbgs() << " to " << CP.getNewRC()->getName() << "\n";
+    });
+
+    // Avoid constraining virtual register regclass too much.
+    if (CP.isCrossClass()) {
+      if (DisableCrossClassJoin) {
+        DEBUG(dbgs() << "\tCross-class joins disabled.\n");
+        return false;
+      }
+      if (!isWinToJoinCrossClass(CP.getSrcReg(), CP.getDstReg(),
+                                 mri_->getRegClass(CP.getSrcReg()),
+                                 mri_->getRegClass(CP.getDstReg()),
+                                 CP.getNewRC())) {
+        DEBUG(dbgs() << "\tAvoid coalescing to constrained register class: "
+                     << CP.getNewRC()->getName() << ".\n");
+        Again = true;  // May be possible to coalesce later.
+        return false;
+      }
+    }
+
+    // When possible, let DstReg be the larger interval.
+    if (!CP.getSubIdx() && li_->getInterval(CP.getSrcReg()).ranges.size() >
+                           li_->getInterval(CP.getDstReg()).ranges.size())
+      CP.flip();
+  }
+
+  // We need to be careful about coalescing a source physical register with a
+  // virtual register. Once the coalescing is done, it cannot be broken and
+  // these are not spillable! If the destination interval uses are far away,
+  // think twice about coalescing them!
+  // FIXME: Why are we skipping this test for partial copies?
+  //        CodeGen/X86/phys_subreg_coalesce-3.ll needs it.
+  if (!CP.isPartial() && CP.isPhys()) {
+    LiveInterval &JoinVInt = li_->getInterval(CP.getSrcReg());
+
+    // Don't join with physregs that have a ridiculous number of live
+    // ranges. The data structure performance is really bad when that
+    // happens.
+    if (li_->hasInterval(CP.getDstReg()) &&
+        li_->getInterval(CP.getDstReg()).ranges.size() > 1000) {
+      ++numAborts;
+      DEBUG(dbgs()
+           << "\tPhysical register live interval too complicated, abort!\n");
+      return false;
+    }
+
+    const TargetRegisterClass *RC = mri_->getRegClass(CP.getSrcReg());
+    unsigned Threshold = allocatableRCRegs_[RC].count() * 2;
+    unsigned Length = li_->getApproximateInstructionCount(JoinVInt);
+    if (Length > Threshold &&
+        std::distance(mri_->use_nodbg_begin(CP.getSrcReg()),
+                      mri_->use_nodbg_end()) * Threshold < Length) {
+      // Before giving up coalescing, if definition of source is defined by
+      // trivial computation, try rematerializing it.
+      if (!CP.isFlipped() &&
+          ReMaterializeTrivialDef(JoinVInt, CP.getDstReg(), 0, CopyMI))
+        return true;
+
+      ++numAborts;
+      DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n");
+      Again = true;  // May be possible to coalesce later.
+      return false;
+    }
+  }
+
+  // Okay, attempt to join these two intervals.  On failure, this returns false.
+  // Otherwise, if one of the intervals being joined is a physreg, this method
+  // always canonicalizes DstInt to be it.  The output "SrcInt" will not have
+  // been modified, so we can use this information below to update aliases.
+  if (!JoinIntervals(CP)) {
+    // Coalescing failed.
+
+    // If definition of source is defined by trivial computation, try
+    // rematerializing it.
+    if (!CP.isFlipped() &&
+        ReMaterializeTrivialDef(li_->getInterval(CP.getSrcReg()),
+                                CP.getDstReg(), 0, CopyMI))
+      return true;
+
+    // If we can eliminate the copy without merging the live ranges, do so now.
+    if (!CP.isPartial()) {
+      if (AdjustCopiesBackFrom(CP, CopyMI) ||
+          RemoveCopyByCommutingDef(CP, CopyMI)) {
+        JoinedCopies.insert(CopyMI);
+        DEBUG(dbgs() << "\tTrivial!\n");
+        return true;
+      }
+    }
+
+    // Otherwise, we are unable to join the intervals.
+    DEBUG(dbgs() << "\tInterference!\n");
+    Again = true;  // May be possible to coalesce later.
+    return false;
+  }
+
+  // Coalescing to a virtual register that is of a sub-register class of the
+  // other. Make sure the resulting register is set to the right register class.
+  if (CP.isCrossClass()) {
+    ++numCrossRCs;
+    mri_->setRegClass(CP.getDstReg(), CP.getNewRC());
+  }
+
+  // Remember to delete the copy instruction.
+  JoinedCopies.insert(CopyMI);
+
+  UpdateRegDefsUses(CP);
+
+  // If we have extended the live range of a physical register, make sure we
+  // update live-in lists as well.
+  if (CP.isPhys()) {
+    SmallVector<MachineBasicBlock*, 16> BlockSeq;
+    // JoinIntervals invalidates the VNInfos in SrcInt, but we only need the
+    // ranges for this, and they are preserved.
+    LiveInterval &SrcInt = li_->getInterval(CP.getSrcReg());
+    for (LiveInterval::const_iterator I = SrcInt.begin(), E = SrcInt.end();
+         I != E; ++I ) {
+      li_->findLiveInMBBs(I->start, I->end, BlockSeq);
+      for (unsigned idx = 0, size = BlockSeq.size(); idx != size; ++idx) {
+        MachineBasicBlock &block = *BlockSeq[idx];
+        if (!block.isLiveIn(CP.getDstReg()))
+          block.addLiveIn(CP.getDstReg());
+      }
+      BlockSeq.clear();
+    }
+  }
+
+  // SrcReg is guaranteed to be the register whose live interval is being
+  // merged.
+  li_->removeInterval(CP.getSrcReg());
+
+  // Update regalloc hint.
+  tri_->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *mf_);
+
+  DEBUG({
+    LiveInterval &DstInt = li_->getInterval(CP.getDstReg());
+    dbgs() << "\tJoined. Result = ";
+    DstInt.print(dbgs(), tri_);
+    dbgs() << "\n";
+  });
+
+  ++numJoins;
+  return true;
+}
+
+/// ComputeUltimateVN - Assuming we are going to join two live intervals,
+/// compute what the resultant value numbers for each value in the input two
+/// ranges will be.  This is complicated by copies between the two which can
+/// and will commonly cause multiple value numbers to be merged into one.
+///
+/// VN is the value number that we're trying to resolve.  InstDefiningValue
+/// keeps track of the new InstDefiningValue assignment for the result
+/// LiveInterval.  ThisFromOther/OtherFromThis are sets that keep track of
+/// whether a value in this or other is a copy from the opposite set.
+/// ThisValNoAssignments/OtherValNoAssignments keep track of value #'s that have
+/// already been assigned.
+///
+/// ThisFromOther[x] - If x is defined as a copy from the other interval, this
+/// contains the value number the copy is from.
+///
+static unsigned ComputeUltimateVN(VNInfo *VNI,
+                                  SmallVector<VNInfo*, 16> &NewVNInfo,
+                                  DenseMap<VNInfo*, VNInfo*> &ThisFromOther,
+                                  DenseMap<VNInfo*, VNInfo*> &OtherFromThis,
+                                  SmallVector<int, 16> &ThisValNoAssignments,
+                                  SmallVector<int, 16> &OtherValNoAssignments) {
+  unsigned VN = VNI->id;
+
+  // If the VN has already been computed, just return it.
+  if (ThisValNoAssignments[VN] >= 0)
+    return ThisValNoAssignments[VN];
+  assert(ThisValNoAssignments[VN] != -2 && "Cyclic value numbers");
+
+  // If this val is not a copy from the other val, then it must be a new value
+  // number in the destination.
+  DenseMap<VNInfo*, VNInfo*>::iterator I = ThisFromOther.find(VNI);
+  if (I == ThisFromOther.end()) {
+    NewVNInfo.push_back(VNI);
+    return ThisValNoAssignments[VN] = NewVNInfo.size()-1;
+  }
+  VNInfo *OtherValNo = I->second;
+
+  // Otherwise, this *is* a copy from the RHS.  If the other side has already
+  // been computed, return it.
+  if (OtherValNoAssignments[OtherValNo->id] >= 0)
+    return ThisValNoAssignments[VN] = OtherValNoAssignments[OtherValNo->id];
+
+  // Mark this value number as currently being computed, then ask what the
+  // ultimate value # of the other value is.
+  ThisValNoAssignments[VN] = -2;
+  unsigned UltimateVN =
+    ComputeUltimateVN(OtherValNo, NewVNInfo, OtherFromThis, ThisFromOther,
+                      OtherValNoAssignments, ThisValNoAssignments);
+  return ThisValNoAssignments[VN] = UltimateVN;
+}
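+
+// A short trace (hypothetical value numbers): suppose LHS val#0 is defined by
+// a copy from RHS val#1, and RHS val#1 is not itself a copy from the LHS.
+// Resolving LHS val#0 hits the ThisFromOther entry and recurses on RHS val#1;
+// that side has no matching entry, so RHS val#1 becomes NewVNInfo[0], and
+// both assignments resolve to 0 -- the two values merge into one.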
+
+/// JoinIntervals - Attempt to join these two intervals.  On failure, this
+/// returns false.
+bool SimpleRegisterCoalescing::JoinIntervals(CoalescerPair &CP) {
+  LiveInterval &RHS = li_->getInterval(CP.getSrcReg());
+  DEBUG({ dbgs() << "\t\tRHS = "; RHS.print(dbgs(), tri_); dbgs() << "\n"; });
+
+  // If a live interval is a physical register, check for interference with any
+  // aliases. The interference check implemented here is a bit more conservative
+  // than the full interference check below. We allow overlapping live ranges
+  // only when one is a copy of the other.
+  if (CP.isPhys()) {
+    for (const unsigned *AS = tri_->getAliasSet(CP.getDstReg()); *AS; ++AS){
+      if (!li_->hasInterval(*AS))
+        continue;
+      const LiveInterval &LHS = li_->getInterval(*AS);
+      LiveInterval::const_iterator LI = LHS.begin();
+      for (LiveInterval::const_iterator RI = RHS.begin(), RE = RHS.end();
+           RI != RE; ++RI) {
+        LI = std::lower_bound(LI, LHS.end(), RI->start);
+        // Does LHS have an overlapping live range starting before RI?
+        if ((LI != LHS.begin() && LI[-1].end > RI->start) &&
+            (RI->start != RI->valno->def ||
+             !CP.isCoalescable(li_->getInstructionFromIndex(RI->start)))) {
+          DEBUG({
+            dbgs() << "\t\tInterference from alias: ";
+            LHS.print(dbgs(), tri_);
+            dbgs() << "\n\t\tOverlap at " << RI->start << " and no copy.\n";
+          });
+          return false;
+        }
+
+        // Check that LHS ranges beginning in this range are copies.
+        for (; LI != LHS.end() && LI->start < RI->end; ++LI) {
+          if (LI->start != LI->valno->def ||
+              !CP.isCoalescable(li_->getInstructionFromIndex(LI->start))) {
+            DEBUG({
+              dbgs() << "\t\tInterference from alias: ";
+              LHS.print(dbgs(), tri_);
+              dbgs() << "\n\t\tDef at " << LI->start << " is not a copy.\n";
+            });
+            return false;
+          }
+        }
+      }
+    }
+  }
+
+  // Compute the final value assignment, assuming that the live ranges can be
+  // coalesced.
+  SmallVector<int, 16> LHSValNoAssignments;
+  SmallVector<int, 16> RHSValNoAssignments;
+  DenseMap<VNInfo*, VNInfo*> LHSValsDefinedFromRHS;
+  DenseMap<VNInfo*, VNInfo*> RHSValsDefinedFromLHS;
+  SmallVector<VNInfo*, 16> NewVNInfo;
+
+  LiveInterval &LHS = li_->getOrCreateInterval(CP.getDstReg());
+  DEBUG({ dbgs() << "\t\tLHS = "; LHS.print(dbgs(), tri_); dbgs() << "\n"; });
+
+  // Loop over the value numbers of the LHS, seeing if any are defined from
+  // the RHS.
+  for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end();
+       i != e; ++i) {
+    VNInfo *VNI = *i;
+    if (VNI->isUnused() || VNI->getCopy() == 0)  // Src not defined by a copy?
+      continue;
+
+    // Never join with a register that has EarlyClobber redefs.
+    if (VNI->hasRedefByEC())
+      return false;
+
+    // DstReg is known to be a register in the LHS interval.  If the src is
+    // from the RHS interval, we can use its value #.
+    if (!CP.isCoalescable(VNI->getCopy()))
+      continue;
+
+    // Figure out the value # from the RHS.
+    LiveRange *lr = RHS.getLiveRangeContaining(VNI->def.getPrevSlot());
+    // The copy could be to an aliased physreg.
+    if (!lr) continue;
+    LHSValsDefinedFromRHS[VNI] = lr->valno;
+  }
+
+  // Loop over the value numbers of the RHS, seeing if any are defined from
+  // the LHS.
+  for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end();
+       i != e; ++i) {
+    VNInfo *VNI = *i;
+    if (VNI->isUnused() || VNI->getCopy() == 0)  // Src not defined by a copy?
+      continue;
+
+    // Never join with a register that has EarlyClobber redefs.
+    if (VNI->hasRedefByEC())
+      return false;
+
+    // DstReg is known to be a register in the RHS interval.  If the src is
+    // from the LHS interval, we can use its value #.
+    if (!CP.isCoalescable(VNI->getCopy()))
+      continue;
+
+    // Figure out the value # from the LHS.
+    LiveRange *lr = LHS.getLiveRangeContaining(VNI->def.getPrevSlot());
+    // The copy could be to an aliased physreg.
+    if (!lr) continue;
+    RHSValsDefinedFromLHS[VNI] = lr->valno;
+  }
+
+  LHSValNoAssignments.resize(LHS.getNumValNums(), -1);
+  RHSValNoAssignments.resize(RHS.getNumValNums(), -1);
+  NewVNInfo.reserve(LHS.getNumValNums() + RHS.getNumValNums());
+
+  for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end();
+       i != e; ++i) {
+    VNInfo *VNI = *i;
+    unsigned VN = VNI->id;
+    if (LHSValNoAssignments[VN] >= 0 || VNI->isUnused())
+      continue;
+    ComputeUltimateVN(VNI, NewVNInfo,
+                      LHSValsDefinedFromRHS, RHSValsDefinedFromLHS,
+                      LHSValNoAssignments, RHSValNoAssignments);
+  }
+  for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end();
+       i != e; ++i) {
+    VNInfo *VNI = *i;
+    unsigned VN = VNI->id;
+    if (RHSValNoAssignments[VN] >= 0 || VNI->isUnused())
+      continue;
+    // If this value number isn't a copy from the LHS, it's a new number.
+    if (RHSValsDefinedFromLHS.find(VNI) == RHSValsDefinedFromLHS.end()) {
+      NewVNInfo.push_back(VNI);
+      RHSValNoAssignments[VN] = NewVNInfo.size()-1;
+      continue;
+    }
+
+    ComputeUltimateVN(VNI, NewVNInfo,
+                      RHSValsDefinedFromLHS, LHSValsDefinedFromRHS,
+                      RHSValNoAssignments, LHSValNoAssignments);
+  }
+
+  // Armed with the mappings of LHS/RHS values to ultimate values, walk the
+  // interval lists to see if these intervals are coalescable.
+  LiveInterval::const_iterator I = LHS.begin();
+  LiveInterval::const_iterator IE = LHS.end();
+  LiveInterval::const_iterator J = RHS.begin();
+  LiveInterval::const_iterator JE = RHS.end();
+
+  // Skip ahead until the first place of potential sharing.
+  if (I != IE && J != JE) {
+    if (I->start < J->start) {
+      I = std::upper_bound(I, IE, J->start);
+      if (I != LHS.begin()) --I;
+    } else if (J->start < I->start) {
+      J = std::upper_bound(J, JE, I->start);
+      if (J != RHS.begin()) --J;
+    }
+  }
+
+  while (I != IE && J != JE) {
+    // Determine if these two live ranges overlap.
+    bool Overlaps;
+    if (I->start < J->start) {
+      Overlaps = I->end > J->start;
+    } else {
+      Overlaps = J->end > I->start;
+    }
+
+    // If so, check value # info to determine if they are really different.
+    if (Overlaps) {
+      // If the live range overlap will map to the same value number in the
+      // result liverange, we can still coalesce them.  If not, we can't.
+      if (LHSValNoAssignments[I->valno->id] !=
+          RHSValNoAssignments[J->valno->id])
+        return false;
+      // If it's re-defined by an early clobber somewhere in the live range,
+      // then conservatively abort coalescing.
+      if (NewVNInfo[LHSValNoAssignments[I->valno->id]]->hasRedefByEC())
+        return false;
+    }
+
+    if (I->end < J->end)
+      ++I;
+    else
+      ++J;
+  }
+
+  // Update kill info. Some live ranges are extended due to copy coalescing.
+  for (DenseMap<VNInfo*, VNInfo*>::iterator I = LHSValsDefinedFromRHS.begin(),
+         E = LHSValsDefinedFromRHS.end(); I != E; ++I) {
+    VNInfo *VNI = I->first;
+    unsigned LHSValID = LHSValNoAssignments[VNI->id];
+    if (VNI->hasPHIKill())
+      NewVNInfo[LHSValID]->setHasPHIKill(true);
+  }
+
+  // Update kill info. Some live ranges are extended due to copy coalescing.
+  for (DenseMap<VNInfo*, VNInfo*>::iterator I = RHSValsDefinedFromLHS.begin(),
+         E = RHSValsDefinedFromLHS.end(); I != E; ++I) {
+    VNInfo *VNI = I->first;
+    unsigned RHSValID = RHSValNoAssignments[VNI->id];
+    if (VNI->hasPHIKill())
+      NewVNInfo[RHSValID]->setHasPHIKill(true);
+  }
+
+  if (LHSValNoAssignments.empty())
+    LHSValNoAssignments.push_back(-1);
+  if (RHSValNoAssignments.empty())
+    RHSValNoAssignments.push_back(-1);
+
+  // If we get here, we know that we can coalesce the live ranges.  Ask the
+  // intervals to coalesce themselves now.
+  LHS.join(RHS, &LHSValNoAssignments[0], &RHSValNoAssignments[0], NewVNInfo,
+           mri_);
+  return true;
+}
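+
+// The overlap walk above is a two-pointer merge over the sorted range lists.
+// A minimal standalone sketch of the same idea (plain C++ with a simplified
+// Range type; not the LiveRange class used here):
+//
+//   struct Range { unsigned start, end, valno; };
+//   bool compatible(const std::vector<Range> &A, const std::vector<Range> &B,
+//                   const std::vector<int> &AMap, const std::vector<int> &BMap) {
+//     size_t i = 0, j = 0;
+//     while (i != A.size() && j != B.size()) {
+//       bool Overlaps = A[i].start < B[j].start ? A[i].end > B[j].start
+//                                               : B[j].end > A[i].start;
+//       if (Overlaps && AMap[A[i].valno] != BMap[B[j].valno])
+//         return false;                   // same slots, different final values
+//       (A[i].end < B[j].end ? i : j)++;  // advance the range that ends first
+//     }
+//     return true;
+//   }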
+
+namespace {
+  // DepthMBBCompare - Comparison predicate that sort first based on the loop
+  // depth of the basic block (the unsigned), and then on the MBB number.
+  struct DepthMBBCompare {
+    typedef std::pair<unsigned, MachineBasicBlock*> DepthMBBPair;
+    bool operator()(const DepthMBBPair &LHS, const DepthMBBPair &RHS) const {
+      // Deeper loops first
+      if (LHS.first != RHS.first)
+        return LHS.first > RHS.first;
+
+      // Prefer blocks that are more connected in the CFG. This takes care of
+      // the most difficult copies first while intervals are short.
+      unsigned cl = LHS.second->pred_size() + LHS.second->succ_size();
+      unsigned cr = RHS.second->pred_size() + RHS.second->succ_size();
+      if (cl != cr)
+        return cl > cr;
+
+      // As a last resort, sort by block number.
+      return LHS.second->getNumber() < RHS.second->getNumber();
+    }
+  };
+}
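+
+// Example ordering (hypothetical blocks): sorting { (0, bb0), (2, bb5),
+// (2, bb3) } with this predicate visits the depth-2 loop blocks before bb0;
+// between bb5 and bb3 it prefers the one with more CFG pred+succ edges and
+// falls back to the lower block number on a tie.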
+
+void SimpleRegisterCoalescing::CopyCoalesceInMBB(MachineBasicBlock *MBB,
+                                               std::vector<CopyRec> &TryAgain) {
+  DEBUG(dbgs() << MBB->getName() << ":\n");
+
+  std::vector<CopyRec> VirtCopies;
+  std::vector<CopyRec> PhysCopies;
+  std::vector<CopyRec> ImpDefCopies;
+  for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end();
+       MII != E;) {
+    MachineInstr *Inst = MII++;
+
+    // If this isn't a copy or a subreg_to_reg, we can't join intervals.
+    unsigned SrcReg, DstReg;
+    if (Inst->isCopy()) {
+      DstReg = Inst->getOperand(0).getReg();
+      SrcReg = Inst->getOperand(1).getReg();
+    } else if (Inst->isSubregToReg()) {
+      DstReg = Inst->getOperand(0).getReg();
+      SrcReg = Inst->getOperand(2).getReg();
+    } else
+      continue;
+
+    bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg);
+    bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+    if (li_->hasInterval(SrcReg) && li_->getInterval(SrcReg).empty())
+      ImpDefCopies.push_back(CopyRec(Inst, 0));
+    else if (SrcIsPhys || DstIsPhys)
+      PhysCopies.push_back(CopyRec(Inst, 0));
+    else
+      VirtCopies.push_back(CopyRec(Inst, 0));
+  }
+
+  // Try coalescing implicit copies and insert_subreg <undef> first,
+  // followed by copies to / from physical registers, then finally copies
+  // from virtual registers to virtual registers.
+  for (unsigned i = 0, e = ImpDefCopies.size(); i != e; ++i) {
+    CopyRec &TheCopy = ImpDefCopies[i];
+    bool Again = false;
+    if (!JoinCopy(TheCopy, Again))
+      if (Again)
+        TryAgain.push_back(TheCopy);
+  }
+  for (unsigned i = 0, e = PhysCopies.size(); i != e; ++i) {
+    CopyRec &TheCopy = PhysCopies[i];
+    bool Again = false;
+    if (!JoinCopy(TheCopy, Again))
+      if (Again)
+        TryAgain.push_back(TheCopy);
+  }
+  for (unsigned i = 0, e = VirtCopies.size(); i != e; ++i) {
+    CopyRec &TheCopy = VirtCopies[i];
+    bool Again = false;
+    if (!JoinCopy(TheCopy, Again))
+      if (Again)
+        TryAgain.push_back(TheCopy);
+  }
+}
+
+void SimpleRegisterCoalescing::joinIntervals() {
+  DEBUG(dbgs() << "********** JOINING INTERVALS ***********\n");
+
+  std::vector<CopyRec> TryAgainList;
+  if (loopInfo->empty()) {
+    // If there are no loops in the function, join intervals in function order.
+    for (MachineFunction::iterator I = mf_->begin(), E = mf_->end();
+         I != E; ++I)
+      CopyCoalesceInMBB(I, TryAgainList);
+  } else {
+    // Otherwise, join intervals in inner loops before other intervals.
+    // Unfortunately we can't just iterate over loop hierarchy here because
+    // there may be more MBB's than BB's.  Collect MBB's for sorting.
+
+    // Join intervals in the function prolog first. We want to join physical
+    // registers with virtual registers before the intervals get too long.
+    std::vector<std::pair<unsigned, MachineBasicBlock*> > MBBs;
+    for (MachineFunction::iterator I = mf_->begin(), E = mf_->end();I != E;++I){
+      MachineBasicBlock *MBB = I;
+      MBBs.push_back(std::make_pair(loopInfo->getLoopDepth(MBB), I));
+    }
+
+    // Sort by loop depth.
+    std::sort(MBBs.begin(), MBBs.end(), DepthMBBCompare());
+
+    // Finally, join intervals in loop nest order.
+    for (unsigned i = 0, e = MBBs.size(); i != e; ++i)
+      CopyCoalesceInMBB(MBBs[i].second, TryAgainList);
+  }
+
+  // Joining intervals can allow other intervals to be joined.  Iteratively join
+  // until we make no progress.
+  bool ProgressMade = true;
+  while (ProgressMade) {
+    ProgressMade = false;
+
+    for (unsigned i = 0, e = TryAgainList.size(); i != e; ++i) {
+      CopyRec &TheCopy = TryAgainList[i];
+      if (!TheCopy.MI)
+        continue;
+
+      bool Again = false;
+      bool Success = JoinCopy(TheCopy, Again);
+      if (Success || !Again) {
+        TheCopy.MI = 0;   // Mark this one as done.
+        ProgressMade = true;
+      }
+    }
+  }
+}
+
+/// Return true if the two specified registers belong to different register
+/// classes.  The registers may be either phys or virt regs.
+bool
+SimpleRegisterCoalescing::differingRegisterClasses(unsigned RegA,
+                                                   unsigned RegB) const {
+  // Get the register classes for the first reg.
+  if (TargetRegisterInfo::isPhysicalRegister(RegA)) {
+    assert(TargetRegisterInfo::isVirtualRegister(RegB) &&
+           "Shouldn't consider two physregs!");
+    return !mri_->getRegClass(RegB)->contains(RegA);
+  }
+
+  // Compare against the regclass for the second reg.
+  const TargetRegisterClass *RegClassA = mri_->getRegClass(RegA);
+  if (TargetRegisterInfo::isVirtualRegister(RegB)) {
+    const TargetRegisterClass *RegClassB = mri_->getRegClass(RegB);
+    return RegClassA != RegClassB;
+  }
+  return !RegClassA->contains(RegB);
+}
+
+/// lastRegisterUse - Returns the last (non-debug) use of the specified
+/// register between cycles Start and End or NULL if there are no uses.
+MachineOperand *
+SimpleRegisterCoalescing::lastRegisterUse(SlotIndex Start,
+                                          SlotIndex End,
+                                          unsigned Reg,
+                                          SlotIndex &UseIdx) const{
+  UseIdx = SlotIndex();
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    MachineOperand *LastUse = NULL;
+    for (MachineRegisterInfo::use_nodbg_iterator I = mri_->use_nodbg_begin(Reg),
+           E = mri_->use_nodbg_end(); I != E; ++I) {
+      MachineOperand &Use = I.getOperand();
+      MachineInstr *UseMI = Use.getParent();
+      if (UseMI->isIdentityCopy())
+        continue;
+      SlotIndex Idx = li_->getInstructionIndex(UseMI);
+      // FIXME: Should this be Idx != UseIdx? SlotIndex() will return something
+      // that compares higher than any other interval.
+      if (Idx >= Start && Idx < End && Idx >= UseIdx) {
+        LastUse = &Use;
+        UseIdx = Idx.getUseIndex();
+      }
+    }
+    return LastUse;
+  }
+
+  SlotIndex s = Start;
+  SlotIndex e = End.getPrevSlot().getBaseIndex();
+  while (e >= s) {
+    // Skip deleted instructions
+    MachineInstr *MI = li_->getInstructionFromIndex(e);
+    while (e != SlotIndex() && e.getPrevIndex() >= s && !MI) {
+      e = e.getPrevIndex();
+      MI = li_->getInstructionFromIndex(e);
+    }
+    if (e < s || MI == NULL)
+      return NULL;
+
+    // Ignore identity copies.
+    if (!MI->isIdentityCopy())
+      for (unsigned i = 0, NumOps = MI->getNumOperands(); i != NumOps; ++i) {
+        MachineOperand &Use = MI->getOperand(i);
+        if (Use.isReg() && Use.isUse() && Use.getReg() &&
+            tri_->regsOverlap(Use.getReg(), Reg)) {
+          UseIdx = e.getUseIndex();
+          return &Use;
+        }
+      }
+
+    e = e.getPrevIndex();
+  }
+
+  return NULL;
+}
+
+void SimpleRegisterCoalescing::releaseMemory() {
+  JoinedCopies.clear();
+  ReMatCopies.clear();
+  ReMatDefs.clear();
+}
+
+bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
+  mf_ = &fn;
+  mri_ = &fn.getRegInfo();
+  tm_ = &fn.getTarget();
+  tri_ = tm_->getRegisterInfo();
+  tii_ = tm_->getInstrInfo();
+  li_ = &getAnalysis<LiveIntervals>();
+  AA = &getAnalysis<AliasAnalysis>();
+  loopInfo = &getAnalysis<MachineLoopInfo>();
+
+  DEBUG(dbgs() << "********** SIMPLE REGISTER COALESCING **********\n"
+               << "********** Function: "
+               << ((Value*)mf_->getFunction())->getName() << '\n');
+
+  for (TargetRegisterInfo::regclass_iterator I = tri_->regclass_begin(),
+         E = tri_->regclass_end(); I != E; ++I)
+    allocatableRCRegs_.insert(std::make_pair(*I,
+                                             tri_->getAllocatableSet(fn, *I)));
+
+  // Join (coalesce) intervals if requested.
+  if (EnableJoining) {
+    joinIntervals();
+    DEBUG({
+        dbgs() << "********** INTERVALS POST JOINING **********\n";
+        for (LiveIntervals::iterator I = li_->begin(), E = li_->end();
+             I != E; ++I){
+          I->second->print(dbgs(), tri_);
+          dbgs() << "\n";
+        }
+      });
+  }
+
+  // Perform a final pass over the instructions and compute spill weights
+  // and remove identity moves.
+  SmallVector<unsigned, 4> DeadDefs;
+  for (MachineFunction::iterator mbbi = mf_->begin(), mbbe = mf_->end();
+       mbbi != mbbe; ++mbbi) {
+    MachineBasicBlock* mbb = mbbi;
+    for (MachineBasicBlock::iterator mii = mbb->begin(), mie = mbb->end();
+         mii != mie; ) {
+      MachineInstr *MI = mii;
+      if (JoinedCopies.count(MI)) {
+        // Delete all coalesced copies.
+        bool DoDelete = true;
+        assert(MI->isCopyLike() && "Unrecognized copy instruction");
+        unsigned SrcReg = MI->getOperand(MI->isSubregToReg() ? 2 : 1).getReg();
+        if (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+            MI->getNumOperands() > 2)
+          // Do not delete extract_subreg, insert_subreg of physical
+          // registers unless the definition is dead. e.g.
+          // %D0<def> = INSERT_SUBREG %D0<undef>, %S0<kill>, 1
+          // or else the scavenger may complain. LowerSubregs will
+          // delete them later.
+          DoDelete = false;
+
+        if (MI->allDefsAreDead()) {
+          LiveInterval &li = li_->getInterval(SrcReg);
+          if (!ShortenDeadCopySrcLiveRange(li, MI))
+            ShortenDeadCopyLiveRange(li, MI);
+          DoDelete = true;
+        }
+        if (!DoDelete) {
+          // We need the instruction to adjust liveness, so make it a KILL.
+          if (MI->isSubregToReg()) {
+            MI->RemoveOperand(3);
+            MI->RemoveOperand(1);
+          }
+          MI->setDesc(tii_->get(TargetOpcode::KILL));
+          mii = llvm::next(mii);
+        } else {
+          li_->RemoveMachineInstrFromMaps(MI);
+          mii = mbbi->erase(mii);
+          ++numPeep;
+        }
+        continue;
+      }
+
+      // Now check if this is a remat'ed def instruction which is now dead.
+      if (ReMatDefs.count(MI)) {
+        bool isDead = true;
+        for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+          const MachineOperand &MO = MI->getOperand(i);
+          if (!MO.isReg())
+            continue;
+          unsigned Reg = MO.getReg();
+          if (!Reg)
+            continue;
+          if (TargetRegisterInfo::isVirtualRegister(Reg))
+            DeadDefs.push_back(Reg);
+          if (MO.isDead())
+            continue;
+          if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+              !mri_->use_nodbg_empty(Reg)) {
+            isDead = false;
+            break;
+          }
+        }
+        if (isDead) {
+          while (!DeadDefs.empty()) {
+            unsigned DeadDef = DeadDefs.back();
+            DeadDefs.pop_back();
+            RemoveDeadDef(li_->getInterval(DeadDef), MI);
+          }
+          li_->RemoveMachineInstrFromMaps(mii);
+          mii = mbbi->erase(mii);
+          continue;
+        } else
+          DeadDefs.clear();
+      }
+
+      // If the move is now an identity move, delete it.
+      if (MI->isIdentityCopy()) {
+        unsigned SrcReg = MI->getOperand(1).getReg();
+        if (li_->hasInterval(SrcReg)) {
+          LiveInterval &RegInt = li_->getInterval(SrcReg);
+          // If def of this move instruction is dead, remove its live range
+          // from the destination register's live interval.
+          if (MI->allDefsAreDead()) {
+            if (!ShortenDeadCopySrcLiveRange(RegInt, MI))
+              ShortenDeadCopyLiveRange(RegInt, MI);
+          }
+        }
+        li_->RemoveMachineInstrFromMaps(MI);
+        mii = mbbi->erase(mii);
+        ++numPeep;
+        continue;
+      }
+
+      ++mii;
+
+      // Check for now unnecessary kill flags.
+      if (li_->isNotInMIMap(MI)) continue;
+      SlotIndex DefIdx = li_->getInstructionIndex(MI).getDefIndex();
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (!MO.isReg() || !MO.isKill()) continue;
+        unsigned reg = MO.getReg();
+        if (!reg || !li_->hasInterval(reg)) continue;
+        if (!li_->getInterval(reg).killedAt(DefIdx))
+          MO.setIsKill(false);
+      }
+    }
+  }
+
+  DEBUG(dump());
+  return true;
+}
+
+/// print - Implement the dump method.
+void SimpleRegisterCoalescing::print(raw_ostream &O, const Module* m) const {
+   li_->print(O, m);
+}
+
+RegisterCoalescer* llvm::createSimpleRegisterCoalescer() {
+  return new SimpleRegisterCoalescing();
+}
+
+// Make sure that anything that uses RegisterCoalescer pulls in this file...
+DEFINING_FILE_FOR(SimpleRegisterCoalescing)
diff --git a/src/LLVM/lib/CodeGen/SimpleRegisterCoalescing.h b/src/LLVM/lib/CodeGen/SimpleRegisterCoalescing.h
new file mode 100644
index 0000000..855bdb9
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SimpleRegisterCoalescing.h
@@ -0,0 +1,188 @@
+//===-- SimpleRegisterCoalescing.h - Register Coalescing --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple register copy coalescing phase.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SIMPLE_REGISTER_COALESCING_H
+#define LLVM_CODEGEN_SIMPLE_REGISTER_COALESCING_H
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/RegisterCoalescer.h"
+#include "llvm/ADT/BitVector.h"
+
+namespace llvm {
+  class SimpleRegisterCoalescing;
+  class LiveVariables;
+  class TargetRegisterInfo;
+  class TargetInstrInfo;
+  class VirtRegMap;
+  class MachineLoopInfo;
+
+  /// CopyRec - Representation for copy instructions in coalescer queue.
+  ///
+  struct CopyRec {
+    MachineInstr *MI;
+    unsigned LoopDepth;
+    CopyRec(MachineInstr *mi, unsigned depth)
+      : MI(mi), LoopDepth(depth) {}
+  };
+
+  class SimpleRegisterCoalescing : public MachineFunctionPass,
+                                   public RegisterCoalescer {
+    MachineFunction* mf_;
+    MachineRegisterInfo* mri_;
+    const TargetMachine* tm_;
+    const TargetRegisterInfo* tri_;
+    const TargetInstrInfo* tii_;
+    LiveIntervals *li_;
+    const MachineLoopInfo* loopInfo;
+    AliasAnalysis *AA;
+    
+    DenseMap<const TargetRegisterClass*, BitVector> allocatableRCRegs_;
+
+    /// JoinedCopies - Keep track of copies eliminated due to coalescing.
+    ///
+    SmallPtrSet<MachineInstr*, 32> JoinedCopies;
+
+    /// ReMatCopies - Keep track of copies eliminated due to remat.
+    ///
+    SmallPtrSet<MachineInstr*, 32> ReMatCopies;
+
+    /// ReMatDefs - Keep track of definition instructions which have
+    /// been remat'ed.
+    SmallPtrSet<MachineInstr*, 8> ReMatDefs;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    SimpleRegisterCoalescing() : MachineFunctionPass(ID) {}
+
+    struct InstrSlots {
+      enum {
+        LOAD  = 0,
+        USE   = 1,
+        DEF   = 2,
+        STORE = 3,
+        NUM   = 4
+      };
+    };
+    
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual void releaseMemory();
+
+    /// runOnMachineFunction - pass entry point
+    virtual bool runOnMachineFunction(MachineFunction&);
+
+    bool coalesceFunction(MachineFunction &mf, RegallocQuery &) {
+      // This runs as an independent pass, so don't do anything.
+      return false;
+    }
+
+    /// print - Implement the dump method.
+    virtual void print(raw_ostream &O, const Module* = 0) const;
+
+  private:
+    /// joinIntervals - join compatible live intervals
+    void joinIntervals();
+
+    /// CopyCoalesceInMBB - Coalesce copies in the specified MBB, putting
+    /// copies that cannot yet be coalesced into the "TryAgain" list.
+    void CopyCoalesceInMBB(MachineBasicBlock *MBB,
+                           std::vector<CopyRec> &TryAgain);
+
+    /// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
+    /// which are the src/dst of the copy instruction CopyMI.  This returns true
+    /// if the copy was successfully coalesced away. If it is not currently
+    /// possible to coalesce this interval, but it may be possible if other
+    /// things get coalesced, then it returns true by reference in 'Again'.
+    bool JoinCopy(CopyRec &TheCopy, bool &Again);
+
+    /// JoinIntervals - Attempt to join these two intervals.  On failure, this
+    /// returns false.  The output "SrcInt" will not have been modified, so we can
+    /// use this information below to update aliases.
+    bool JoinIntervals(CoalescerPair &CP);
+
+    /// Return true if the two specified registers belong to different register
+    /// classes.  The registers may be either phys or virt regs.
+    bool differingRegisterClasses(unsigned RegA, unsigned RegB) const;
+
+    /// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy. If
+    /// the source value number is defined by a copy from the destination reg
+    /// see if we can merge these two destination reg valno# into a single
+    /// value number, eliminating a copy.
+    bool AdjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);
+
+    /// HasOtherReachingDefs - Return true if there are definitions of IntB
+    /// other than BValNo val# that can reach uses of AValno val# of IntA.
+    bool HasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB,
+                              VNInfo *AValNo, VNInfo *BValNo);
+
+    /// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy.
+    /// If the source value number is defined by a commutable instruction and
+    /// its other operand is coalesced to the copy dest register, see if we
+    /// can transform the copy into a noop by commuting the definition.
+    bool RemoveCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
+
+    /// TrimLiveIntervalToLastUse - If there is a last use in the same basic
+    /// block as the copy instruction, trim the live interval to the last use
+    /// and return true.
+    bool TrimLiveIntervalToLastUse(SlotIndex CopyIdx,
+                                   MachineBasicBlock *CopyMBB,
+                                   LiveInterval &li, const LiveRange *LR);
+
+    /// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial
+    /// computation, replace the copy by rematerializing the definition.
+    bool ReMaterializeTrivialDef(LiveInterval &SrcInt, unsigned DstReg,
+                                 unsigned DstSubIdx, MachineInstr *CopyMI);
+
+    /// isWinToJoinCrossClass - Return true if it's profitable to coalesce
+    /// two virtual registers from different register classes.
+    bool isWinToJoinCrossClass(unsigned SrcReg,
+                               unsigned DstReg,
+                               const TargetRegisterClass *SrcRC,
+                               const TargetRegisterClass *DstRC,
+                               const TargetRegisterClass *NewRC);
+
+    /// UpdateRegDefsUses - Replace all defs and uses of SrcReg with DstReg and
+    /// update the subregister number if it is not zero. If DstReg is a
+    /// physical register and the existing subregister number of the def / use
+    /// being updated is not zero, make sure to set it to the correct physical
+    /// subregister.
+    void UpdateRegDefsUses(const CoalescerPair &CP);
+
+    /// ShortenDeadCopyLiveRange - Shorten a live range defined by a dead copy.
+    /// Return true if live interval is removed.
+    bool ShortenDeadCopyLiveRange(LiveInterval &li, MachineInstr *CopyMI);
+
+    /// ShortenDeadCopySrcLiveRange - Shorten a live range as it's artificially
+    /// extended by a dead copy. Mark the last use (if any) of the val# as kill,
+    /// since it ends the live range there. If there isn't another use, then this
+    /// live range is dead. Return true if live interval is removed.
+    bool ShortenDeadCopySrcLiveRange(LiveInterval &li, MachineInstr *CopyMI);
+
+    /// RemoveDeadDef - If a def of a live interval is now determined dead,
+    /// remove the val# it defines. If the live interval becomes empty, remove
+    /// it as well.
+    bool RemoveDeadDef(LiveInterval &li, MachineInstr *DefMI);
+
+    /// RemoveCopyFlag - If DstReg is no longer defined by CopyMI, clear the
+    /// VNInfo copy flag for DstReg and all aliases.
+    void RemoveCopyFlag(unsigned DstReg, const MachineInstr *CopyMI);
+
+    /// lastRegisterUse - Returns the last use of the specified register
+    /// between cycles Start and End or NULL if there are no uses.
+    MachineOperand *lastRegisterUse(SlotIndex Start, SlotIndex End,
+                                    unsigned Reg, SlotIndex &LastUseIdx) const;
+  };
+
+} // End llvm namespace
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/SlotIndexes.cpp b/src/LLVM/lib/CodeGen/SlotIndexes.cpp
new file mode 100644
index 0000000..1bc148f
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SlotIndexes.cpp
@@ -0,0 +1,229 @@
+//===-- SlotIndexes.cpp - Slot Indexes Pass  ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "slotindexes"
+
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+
+// Yep - these are thread safe. See the header for details. 
+namespace {
+
+
+  class EmptyIndexListEntry : public IndexListEntry {
+  public:
+    EmptyIndexListEntry() : IndexListEntry(EMPTY_KEY) {}
+  };
+
+  class TombstoneIndexListEntry : public IndexListEntry {
+  public:
+    TombstoneIndexListEntry() : IndexListEntry(TOMBSTONE_KEY) {}
+  };
+
+  // The following statics are thread safe. They're read only, and you
+  // can't step from them to any other list entries.
+  ManagedStatic<EmptyIndexListEntry> IndexListEntryEmptyKey;
+  ManagedStatic<TombstoneIndexListEntry> IndexListEntryTombstoneKey;
+}
+
+char SlotIndexes::ID = 0;
+INITIALIZE_PASS(SlotIndexes, "slotindexes",
+                "Slot index numbering", false, false);
+
+IndexListEntry* IndexListEntry::getEmptyKeyEntry() {
+  return &*IndexListEntryEmptyKey;
+}
+
+IndexListEntry* IndexListEntry::getTombstoneKeyEntry() {
+  return &*IndexListEntryTombstoneKey;
+}
+
+
+void SlotIndexes::getAnalysisUsage(AnalysisUsage &au) const {
+  au.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(au);
+}
+
+void SlotIndexes::releaseMemory() {
+  mi2iMap.clear();
+  mbb2IdxMap.clear();
+  idx2MBBMap.clear();
+  terminatorGaps.clear();
+  clearList();
+}
+
+bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
+
+  // Compute numbering as follows:
+  // Grab an iterator to the start of the index list.
+  // Iterate over all MBBs, and within each MBB all MIs, keeping the MI
+  // iterator in lock-step (though skipping it over indexes which have
+  // null pointers in the instruction field).
+  // At each iteration assert that the instruction pointed to in the index
+  // is the same one pointed to by the MI iterator.
+
+  // FIXME: This can be simplified. The mi2iMap_, Idx2MBBMap, etc. should
+  // only need to be set up once after the first numbering is computed.
+
+  mf = &fn;
+  initList();
+
+  // Check that the list contains only the sentinel.
+  assert(indexListHead->getNext() == 0 &&
+         "Index list non-empty at initial numbering?");
+  assert(idx2MBBMap.empty() &&
+         "Index -> MBB mapping non-empty at initial numbering?");
+  assert(mbb2IdxMap.empty() &&
+         "MBB -> Index mapping non-empty at initial numbering?");
+  assert(mi2iMap.empty() &&
+         "MachineInstr -> Index mapping non-empty at initial numbering?");
+
+  functionSize = 0;
+  unsigned index = 0;
+
+  push_back(createEntry(0, index));
+
+  // Iterate over the function.
+  for (MachineFunction::iterator mbbItr = mf->begin(), mbbEnd = mf->end();
+       mbbItr != mbbEnd; ++mbbItr) {
+    MachineBasicBlock *mbb = &*mbbItr;
+
+    // Insert an index for the MBB start.
+    SlotIndex blockStartIndex(back(), SlotIndex::LOAD);
+
+    index += SlotIndex::NUM;
+
+    for (MachineBasicBlock::iterator miItr = mbb->begin(), miEnd = mbb->end();
+         miItr != miEnd; ++miItr) {
+      MachineInstr *mi = miItr;
+      if (mi->isDebugValue())
+        continue;
+
+      if (miItr == mbb->getFirstTerminator()) {
+        push_back(createEntry(0, index));
+        terminatorGaps.insert(
+          std::make_pair(mbb, SlotIndex(back(), SlotIndex::PHI_BIT)));
+        index += SlotIndex::NUM;
+      }
+
+      // Insert a store index for the instr.
+      push_back(createEntry(mi, index));
+
+      // Save this base index in the maps.
+      mi2iMap.insert(
+        std::make_pair(mi, SlotIndex(back(), SlotIndex::LOAD)));
+ 
+      ++functionSize;
+
+      unsigned Slots = mi->getDesc().getNumDefs();
+      if (Slots == 0)
+        Slots = 1;
+
+      index += (Slots + 1) * SlotIndex::NUM;
+    }
+
+    if (mbb->getFirstTerminator() == mbb->end()) {
+      push_back(createEntry(0, index));
+      terminatorGaps.insert(
+        std::make_pair(mbb, SlotIndex(back(), SlotIndex::PHI_BIT)));
+      index += SlotIndex::NUM;
+    }
+
+    // One blank index entry at the end.
+    push_back(createEntry(0, index));    
+
+    SlotIndex blockEndIndex(back(), SlotIndex::LOAD);
+    mbb2IdxMap.insert(
+      std::make_pair(mbb, std::make_pair(blockStartIndex, blockEndIndex)));
+
+    idx2MBBMap.push_back(IdxMBBPair(blockStartIndex, mbb));
+  }
+
+  // Sort the Idx2MBBMap
+  std::sort(idx2MBBMap.begin(), idx2MBBMap.end(), Idx2MBBCompare());
+
+  DEBUG(dump());
+
+  // And we're done!
+  return false;
+}
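+
+// Rough numbering sketch (assuming SlotIndex::NUM == 4 and one def per
+// instruction): the block-start entry is numbered 0, the first instruction's
+// entry 4, and each instruction then advances the counter by (1+1)*4 = 8, so
+// successive instruction entries land at 4, 12, 20, ... with extra gap
+// entries inserted before the first terminator and at the block end.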
+
+void SlotIndexes::renumberIndexes() {
+
+  // Renumber updates the index of every element of the index list.
+  // If all instrs in the function have been allocated an index (which has been
+  // placed in the index list in the order of instruction iteration) then the
+  // resulting numbering will match what would have been generated by the
+  // pass during the initial numbering of the function if the new instructions
+  // had been present.
+
+  functionSize = 0;
+  unsigned index = 0;
+
+  for (IndexListEntry *curEntry = front(); curEntry != getTail();
+       curEntry = curEntry->getNext()) {
+
+    curEntry->setIndex(index);
+
+    if (curEntry->getInstr() == 0) {
+      // MBB start entry or terminator gap. Just step index by 1.
+      index += SlotIndex::NUM;
+    }
+    else {
+      ++functionSize;
+      unsigned Slots = curEntry->getInstr()->getDesc().getNumDefs();
+      if (Slots == 0)
+        Slots = 1;
+
+      index += (Slots + 1) * SlotIndex::NUM;
+    }
+  }
+}
+
+void SlotIndexes::dump() const {
+  for (const IndexListEntry *itr = front(); itr != getTail();
+       itr = itr->getNext()) {
+    dbgs() << itr->getIndex() << " ";
+
+    if (itr->getInstr() != 0) {
+      dbgs() << *itr->getInstr();
+    } else {
+      dbgs() << "\n";
+    }
+  }
+
+  for (MBB2IdxMap::const_iterator itr = mbb2IdxMap.begin();
+       itr != mbb2IdxMap.end(); ++itr) {
+    dbgs() << "MBB " << itr->first->getNumber() << " (" << itr->first << ") - ["
+           << itr->second.first << ", " << itr->second.second << "]\n";
+  }
+}
+
+// Print a SlotIndex to a raw_ostream.
+void SlotIndex::print(raw_ostream &os) const {
+  os << entry().getIndex();
+  if (isPHI())
+    os << "*";
+  else
+    os << "LudS"[getSlot()];
+}
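+
+// For example, an index whose entry is numbered 12 prints as "12L", "12u",
+// "12d" or "12S" depending on the slot, and as "12*" when the PHI bit is set.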
+
+// Dump a SlotIndex to stderr.
+void SlotIndex::dump() const {
+  print(dbgs());
+  dbgs() << "\n";
+}
+
diff --git a/src/LLVM/lib/CodeGen/Spiller.cpp b/src/LLVM/lib/CodeGen/Spiller.cpp
new file mode 100644
index 0000000..1c17af8
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/Spiller.cpp
@@ -0,0 +1,529 @@
+//===-- llvm/CodeGen/Spiller.cpp -  Spiller -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "spiller"
+
+#include "Spiller.h"
+#include "VirtRegMap.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <set>
+
+using namespace llvm;
+
+namespace {
+  enum SpillerName { trivial, standard, splitting, inline_ };
+}
+
+static cl::opt<SpillerName>
+spillerOpt("spiller",
+           cl::desc("Spiller to use: (default: standard)"),
+           cl::Prefix,
+           cl::values(clEnumVal(trivial,   "trivial spiller"),
+                      clEnumVal(standard,  "default spiller"),
+                      clEnumVal(splitting, "splitting spiller"),
+                      clEnumValN(inline_,  "inline", "inline spiller"),
+                      clEnumValEnd),
+           cl::init(standard));
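+
+// This option is registered with LLVM's command-line machinery, so a tool
+// linking this file can typically select an implementation with e.g.
+// -spiller=splitting; whether a given driver exposes the flag is assumed
+// here, not guaranteed.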
+
+// Spiller virtual destructor implementation.
+Spiller::~Spiller() {}
+
+namespace {
+
+/// Utility class for spillers.
+class SpillerBase : public Spiller {
+protected:
+  MachineFunctionPass *pass;
+  MachineFunction *mf;
+  VirtRegMap *vrm;
+  LiveIntervals *lis;
+  MachineFrameInfo *mfi;
+  MachineRegisterInfo *mri;
+  const TargetInstrInfo *tii;
+  const TargetRegisterInfo *tri;
+
+  /// Construct a spiller base.
+  SpillerBase(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm)
+    : pass(&pass), mf(&mf), vrm(&vrm)
+  {
+    lis = &pass.getAnalysis<LiveIntervals>();
+    mfi = mf.getFrameInfo();
+    mri = &mf.getRegInfo();
+    tii = mf.getTarget().getInstrInfo();
+    tri = mf.getTarget().getRegisterInfo();
+  }
+
+  /// Add spill ranges for every use/def of the live interval, inserting loads
+  /// immediately before each use, and stores after each def. No folding or
+  /// remat is attempted.
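+  ///
+  /// Schematically (a sketch, not actual MachineInstr syntax), an instruction
+  /// that both uses and defines the register is rewritten, with a fresh vreg
+  /// v and stack slot ss, as:
+  ///   v = load ss;   <instr using/defining v>;   store v, ss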
+  void trivialSpillEverywhere(LiveInterval *li,
+                              std::vector<LiveInterval*> &newIntervals) {
+    DEBUG(dbgs() << "Spilling everywhere " << *li << "\n");
+
+    assert(li->weight != HUGE_VALF &&
+           "Attempting to spill already spilled value.");
+
+    assert(!li->isStackSlot() &&
+           "Trying to spill a stack slot.");
+
+    DEBUG(dbgs() << "Trivial spill everywhere of reg" << li->reg << "\n");
+
+    const TargetRegisterClass *trc = mri->getRegClass(li->reg);
+    unsigned ss = vrm->assignVirt2StackSlot(li->reg);
+
+    // Iterate over reg uses/defs.
+    for (MachineRegisterInfo::reg_iterator
+         regItr = mri->reg_begin(li->reg); regItr != mri->reg_end();) {
+
+      // Grab the use/def instr.
+      MachineInstr *mi = &*regItr;
+
+      DEBUG(dbgs() << "  Processing " << *mi);
+
+      // Step regItr to the next use/def instr.
+      do {
+        ++regItr;
+      } while (regItr != mri->reg_end() && (&*regItr == mi));
+
+      // Collect uses & defs for this instr.
+      SmallVector<unsigned, 2> indices;
+      bool hasUse = false;
+      bool hasDef = false;
+      for (unsigned i = 0; i != mi->getNumOperands(); ++i) {
+        MachineOperand &op = mi->getOperand(i);
+        if (!op.isReg() || op.getReg() != li->reg)
+          continue;
+        hasUse |= mi->getOperand(i).isUse();
+        hasDef |= mi->getOperand(i).isDef();
+        indices.push_back(i);
+      }
+
+      // Create a new vreg & interval for this instr.
+      unsigned newVReg = mri->createVirtualRegister(trc);
+      vrm->grow();
+      vrm->assignVirt2StackSlot(newVReg, ss);
+      LiveInterval *newLI = &lis->getOrCreateInterval(newVReg);
+      newLI->weight = HUGE_VALF;
+
+      // Update the reg operands & kill flags.
+      for (unsigned i = 0; i < indices.size(); ++i) {
+        unsigned mopIdx = indices[i];
+        MachineOperand &mop = mi->getOperand(mopIdx);
+        mop.setReg(newVReg);
+        if (mop.isUse() && !mi->isRegTiedToDefOperand(mopIdx)) {
+          mop.setIsKill(true);
+        }
+      }
+      assert(hasUse || hasDef);
+
+      // Insert reload if necessary.
+      MachineBasicBlock::iterator miItr(mi);
+      if (hasUse) {
+        tii->loadRegFromStackSlot(*mi->getParent(), miItr, newVReg, ss, trc,
+                                  tri);
+        MachineInstr *loadInstr(prior(miItr));
+        SlotIndex loadIndex =
+          lis->InsertMachineInstrInMaps(loadInstr).getDefIndex();
+        vrm->addSpillSlotUse(ss, loadInstr);
+        SlotIndex endIndex = loadIndex.getNextIndex();
+        VNInfo *loadVNI =
+          newLI->getNextValue(loadIndex, 0, true, lis->getVNInfoAllocator());
+        newLI->addRange(LiveRange(loadIndex, endIndex, loadVNI));
+      }
+
+      // Insert store if necessary.
+      if (hasDef) {
+        tii->storeRegToStackSlot(*mi->getParent(), llvm::next(miItr), newVReg,
+                                 true, ss, trc, tri);
+        MachineInstr *storeInstr(llvm::next(miItr));
+        SlotIndex storeIndex =
+          lis->InsertMachineInstrInMaps(storeInstr).getDefIndex();
+        vrm->addSpillSlotUse(ss, storeInstr);
+        SlotIndex beginIndex = storeIndex.getPrevIndex();
+        VNInfo *storeVNI =
+          newLI->getNextValue(beginIndex, 0, true, lis->getVNInfoAllocator());
+        newLI->addRange(LiveRange(beginIndex, storeIndex, storeVNI));
+      }
+
+      newIntervals.push_back(newLI);
+    }
+  }
+};
+
+} // end anonymous namespace
+
+namespace {
+
+/// Spills any live range using the spill-everywhere method with no attempt at
+/// folding.
+class TrivialSpiller : public SpillerBase {
+public:
+
+  TrivialSpiller(MachineFunctionPass &pass, MachineFunction &mf,
+                 VirtRegMap &vrm)
+    : SpillerBase(pass, mf, vrm) {}
+
+  void spill(LiveInterval *li,
+             std::vector<LiveInterval*> &newIntervals,
+             SmallVectorImpl<LiveInterval*> &,
+             SlotIndex*) {
+    // Ignore spillIs - we don't use it.
+    trivialSpillEverywhere(li, newIntervals);
+  }
+};
+
+} // end anonymous namespace
+
+namespace {
+
+/// Falls back on LiveIntervals::addIntervalsForSpills.
+class StandardSpiller : public Spiller {
+protected:
+  LiveIntervals *lis;
+  MachineLoopInfo *loopInfo;
+  VirtRegMap *vrm;
+public:
+  StandardSpiller(MachineFunctionPass &pass, MachineFunction &mf,
+                  VirtRegMap &vrm)
+    : lis(&pass.getAnalysis<LiveIntervals>()),
+      loopInfo(pass.getAnalysisIfAvailable<MachineLoopInfo>()),
+      vrm(&vrm) {}
+
+  /// Falls back on LiveIntervals::addIntervalsForSpills.
+  void spill(LiveInterval *li,
+             std::vector<LiveInterval*> &newIntervals,
+             SmallVectorImpl<LiveInterval*> &spillIs,
+             SlotIndex*) {
+    std::vector<LiveInterval*> added =
+      lis->addIntervalsForSpills(*li, spillIs, loopInfo, *vrm);
+    newIntervals.insert(newIntervals.end(), added.begin(), added.end());
+  }
+};
+
+} // end anonymous namespace
+
+namespace {
+
+/// When a call to spill is placed, this spiller first tries to break the
+/// interval up into its component values (one new interval per value).
+/// If this fails, or if a call is placed to spill a previously split
+/// interval, then the spiller falls back on the standard spilling mechanism.
+class SplittingSpiller : public StandardSpiller {
+public:
+  SplittingSpiller(MachineFunctionPass &pass, MachineFunction &mf,
+                   VirtRegMap &vrm)
+    : StandardSpiller(pass, mf, vrm) {
+    mri = &mf.getRegInfo();
+    tii = mf.getTarget().getInstrInfo();
+    tri = mf.getTarget().getRegisterInfo();
+  }
+
+  void spill(LiveInterval *li,
+             std::vector<LiveInterval*> &newIntervals,
+             SmallVectorImpl<LiveInterval*> &spillIs,
+             SlotIndex *earliestStart) {
+    if (worthTryingToSplit(li))
+      tryVNISplit(li, earliestStart);
+    else
+      StandardSpiller::spill(li, newIntervals, spillIs, earliestStart);
+  }
+
+private:
+
+  MachineRegisterInfo *mri;
+  const TargetInstrInfo *tii;
+  const TargetRegisterInfo *tri;
+  DenseSet<LiveInterval*> alreadySplit;
+
+  bool worthTryingToSplit(LiveInterval *li) const {
+    return (!alreadySplit.count(li) && li->getNumValNums() > 1);
+  }
+
+  /// Try to break a LiveInterval into its component values.
+  std::vector<LiveInterval*> tryVNISplit(LiveInterval *li,
+                                         SlotIndex *earliestStart) {
+
+    DEBUG(dbgs() << "Trying VNI split of %reg" << *li << "\n");
+
+    std::vector<LiveInterval*> added;
+    SmallVector<VNInfo*, 4> vnis;
+
+    std::copy(li->vni_begin(), li->vni_end(), std::back_inserter(vnis));
+
+    for (SmallVectorImpl<VNInfo*>::iterator vniItr = vnis.begin(),
+         vniEnd = vnis.end(); vniItr != vniEnd; ++vniItr) {
+      VNInfo *vni = *vniItr;
+
+      // Skip unused VNIs.
+      if (vni->isUnused())
+        continue;
+
+      DEBUG(dbgs() << "  Extracted Val #" << vni->id << " as ");
+      LiveInterval *splitInterval = extractVNI(li, vni);
+
+      if (splitInterval != 0) {
+        DEBUG(dbgs() << *splitInterval << "\n");
+        added.push_back(splitInterval);
+        alreadySplit.insert(splitInterval);
+        if (earliestStart != 0) {
+          if (splitInterval->beginIndex() < *earliestStart)
+            *earliestStart = splitInterval->beginIndex();
+        }
+      } else {
+        DEBUG(dbgs() << "0\n");
+      }
+    }
+
+    DEBUG(dbgs() << "Original LI: " << *li << "\n");
+
+    // If the original interval still contains some live ranges,
+    // add it to added and alreadySplit.
+    if (!li->empty()) {
+      added.push_back(li);
+      alreadySplit.insert(li);
+      if (earliestStart != 0) {
+        if (li->beginIndex() < *earliestStart)
+          *earliestStart = li->beginIndex();
+      }
+    }
+
+    return added;
+  }
+
+  /// Extract the given value number from the interval.
+  LiveInterval* extractVNI(LiveInterval *li, VNInfo *vni) const {
+    assert(vni->isDefAccurate() || vni->isPHIDef());
+
+    // Create a new vreg and live interval, copy VNI ranges over.
+    const TargetRegisterClass *trc = mri->getRegClass(li->reg);
+    unsigned newVReg = mri->createVirtualRegister(trc);
+    vrm->grow();
+    LiveInterval *newLI = &lis->getOrCreateInterval(newVReg);
+    VNInfo *newVNI = newLI->createValueCopy(vni, lis->getVNInfoAllocator());
+
+    // Start by copying all live ranges in the VN to the new interval.
+    for (LiveInterval::iterator rItr = li->begin(), rEnd = li->end();
+         rItr != rEnd; ++rItr) {
+      if (rItr->valno == vni) {
+        newLI->addRange(LiveRange(rItr->start, rItr->end, newVNI));
+      }
+    }
+
+    // Erase the old VNI & ranges.
+    li->removeValNo(vni);
+
+    // Collect all current uses of the register belonging to the given VNI.
+    // We'll use this to rename the register after we've dealt with the def.
+    std::set<MachineInstr*> uses;
+    for (MachineRegisterInfo::use_iterator
+         useItr = mri->use_begin(li->reg), useEnd = mri->use_end();
+         useItr != useEnd; ++useItr) {
+      uses.insert(&*useItr);
+    }
+
+    // Process the def instruction for this VNI.
+    if (newVNI->isPHIDef()) {
+      // Insert a copy at the start of the MBB. The range preceding the
+      // copy will be attached to the original LiveInterval.
+      MachineBasicBlock *defMBB = lis->getMBBFromIndex(newVNI->def);
+      MachineInstr *copyMI = BuildMI(*defMBB, defMBB->begin(), DebugLoc(),
+                                     tii->get(TargetOpcode::COPY), newVReg)
+                               .addReg(li->reg, RegState::Kill);
+      SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
+      VNInfo *phiDefVNI = li->getNextValue(lis->getMBBStartIdx(defMBB),
+                                           0, false, lis->getVNInfoAllocator());
+      phiDefVNI->setIsPHIDef(true);
+      li->addRange(LiveRange(phiDefVNI->def, copyIdx.getDefIndex(), phiDefVNI));
+      LiveRange *oldPHIDefRange =
+        newLI->getLiveRangeContaining(lis->getMBBStartIdx(defMBB));
+
+      // If the old phi def starts in the middle of the range chop it up.
+      if (oldPHIDefRange->start < lis->getMBBStartIdx(defMBB)) {
+        LiveRange oldPHIDefRange2(copyIdx.getDefIndex(), oldPHIDefRange->end,
+                                  oldPHIDefRange->valno);
+        oldPHIDefRange->end = lis->getMBBStartIdx(defMBB);
+        newLI->addRange(oldPHIDefRange2);
+      } else if (oldPHIDefRange->start == lis->getMBBStartIdx(defMBB)) {
+        // Otherwise if it's at the start of the range just trim it.
+        oldPHIDefRange->start = copyIdx.getDefIndex();
+      } else {
+        assert(false && "PHI def range doesn't cover PHI def?");
+      }
+
+      newVNI->def = copyIdx.getDefIndex();
+      newVNI->setCopy(copyMI);
+      newVNI->setIsPHIDef(false); // not a PHI def anymore.
+      newVNI->setIsDefAccurate(true);
+    } else {
+      // non-PHI def. Rename the def. If it's two-addr that means renaming the
+      // use and inserting a new copy too.
+      MachineInstr *defInst = lis->getInstructionFromIndex(newVNI->def);
+      // We'll rename this now, so we can remove it from uses.
+      uses.erase(defInst);
+      unsigned defOpIdx = defInst->findRegisterDefOperandIdx(li->reg);
+      bool isTwoAddr = defInst->isRegTiedToUseOperand(defOpIdx),
+        twoAddrUseIsUndef = false;
+
+      for (unsigned i = 0; i < defInst->getNumOperands(); ++i) {
+        MachineOperand &mo = defInst->getOperand(i);
+        if (mo.isReg() && (mo.isDef() || isTwoAddr) && (mo.getReg()==li->reg)) {
+          mo.setReg(newVReg);
+          if (isTwoAddr && mo.isUse() && mo.isUndef())
+            twoAddrUseIsUndef = true;
+        }
+      }
+
+      SlotIndex defIdx = lis->getInstructionIndex(defInst);
+      newVNI->def = defIdx.getDefIndex();
+
+      if (isTwoAddr && !twoAddrUseIsUndef) {
+        MachineBasicBlock *defMBB = defInst->getParent();
+        MachineInstr *copyMI = BuildMI(*defMBB, defInst, DebugLoc(),
+                                       tii->get(TargetOpcode::COPY), newVReg)
+                                 .addReg(li->reg, RegState::Kill);
+        SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
+        LiveRange *origUseRange =
+          li->getLiveRangeContaining(newVNI->def.getUseIndex());
+        origUseRange->end = copyIdx.getDefIndex();
+        VNInfo *copyVNI = newLI->getNextValue(copyIdx.getDefIndex(), copyMI,
+                                              true, lis->getVNInfoAllocator());
+        LiveRange copyRange(copyIdx.getDefIndex(),defIdx.getDefIndex(),copyVNI);
+        newLI->addRange(copyRange);
+      }
+    }
+
+    for (std::set<MachineInstr*>::iterator
+         usesItr = uses.begin(), usesEnd = uses.end();
+         usesItr != usesEnd; ++usesItr) {
+      MachineInstr *useInst = *usesItr;
+      SlotIndex useIdx = lis->getInstructionIndex(useInst);
+      LiveRange *useRange =
+        newLI->getLiveRangeContaining(useIdx.getUseIndex());
+
+      // If this use doesn't belong to the new interval skip it.
+      if (useRange == 0)
+        continue;
+
+      // This use doesn't belong to the VNI, skip it.
+      if (useRange->valno != newVNI)
+        continue;
+
+      // Check if this instr is two address.
+      unsigned useOpIdx = useInst->findRegisterUseOperandIdx(li->reg);
+      bool isTwoAddress = useInst->isRegTiedToDefOperand(useOpIdx);
+
+      // Rename uses (and defs for two-address instrs).
+      for (unsigned i = 0; i < useInst->getNumOperands(); ++i) {
+        MachineOperand &mo = useInst->getOperand(i);
+        if (mo.isReg() && (mo.isUse() || isTwoAddress) &&
+            (mo.getReg() == li->reg)) {
+          mo.setReg(newVReg);
+        }
+      }
+
+      // If this is a two address instruction we've got some extra work to do.
+      if (isTwoAddress) {
+        // We modified the def operand, so we need to copy back to the original
+        // reg.
+        MachineBasicBlock *useMBB = useInst->getParent();
+        MachineBasicBlock::iterator useItr(useInst);
+        MachineInstr *copyMI = BuildMI(*useMBB, llvm::next(useItr), DebugLoc(),
+                                       tii->get(TargetOpcode::COPY), newVReg)
+                                 .addReg(li->reg, RegState::Kill);
+        SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
+
+        // Change the old two-address defined range & vni to start at
+        // (and be defined by) the copy.
+        LiveRange *origDefRange =
+          li->getLiveRangeContaining(useIdx.getDefIndex());
+        origDefRange->start = copyIdx.getDefIndex();
+        origDefRange->valno->def = copyIdx.getDefIndex();
+        origDefRange->valno->setCopy(copyMI);
+
+        // Insert a new range & vni for the two-address-to-copy value. This
+        // will be attached to the new live interval.
+        VNInfo *copyVNI =
+          newLI->getNextValue(useIdx.getDefIndex(), 0, true,
+                              lis->getVNInfoAllocator());
+        LiveRange copyRange(useIdx.getDefIndex(),copyIdx.getDefIndex(),copyVNI);
+        newLI->addRange(copyRange);
+      }
+    }
+
+    // Iterate over any PHI kills - we'll need to insert new copies for them.
+    for (LiveInterval::iterator LRI = newLI->begin(), LRE = newLI->end();
+         LRI != LRE; ++LRI) {
+      if (LRI->valno != newVNI || LRI->end.isPHI())
+        continue;
+      SlotIndex killIdx = LRI->end;
+      MachineBasicBlock *killMBB = lis->getMBBFromIndex(killIdx);
+      MachineInstr *copyMI = BuildMI(*killMBB, killMBB->getFirstTerminator(),
+                                     DebugLoc(), tii->get(TargetOpcode::COPY),
+                                     li->reg)
+                               .addReg(newVReg, RegState::Kill);
+      SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
+
+      // Save the current end. We may need it to add a new range if the
+      // current range runs off the end of the MBB.
+      SlotIndex newKillRangeEnd = LRI->end;
+      LRI->end = copyIdx.getDefIndex();
+
+      if (newKillRangeEnd != lis->getMBBEndIdx(killMBB)) {
+        assert(newKillRangeEnd > lis->getMBBEndIdx(killMBB) &&
+               "PHI kill range doesn't reach kill-block end. Not sane.");
+        newLI->addRange(LiveRange(lis->getMBBEndIdx(killMBB),
+                                  newKillRangeEnd, newVNI));
+      }
+
+      VNInfo *newKillVNI = li->getNextValue(copyIdx.getDefIndex(),
+                                            copyMI, true,
+                                            lis->getVNInfoAllocator());
+      newKillVNI->setHasPHIKill(true);
+      li->addRange(LiveRange(copyIdx.getDefIndex(),
+                             lis->getMBBEndIdx(killMBB),
+                             newKillVNI));
+    }
+    newVNI->setHasPHIKill(false);
+
+    return newLI;
+  }
+
+};
+
+} // end anonymous namespace
+
+
+namespace llvm {
+Spiller *createInlineSpiller(MachineFunctionPass &pass,
+                             MachineFunction &mf,
+                             VirtRegMap &vrm);
+}
+
+llvm::Spiller* llvm::createSpiller(MachineFunctionPass &pass,
+                                   MachineFunction &mf,
+                                   VirtRegMap &vrm) {
+  switch (spillerOpt) {
+  default: assert(0 && "unknown spiller");
+  case trivial: return new TrivialSpiller(pass, mf, vrm);
+  case standard: return new StandardSpiller(pass, mf, vrm);
+  case splitting: return new SplittingSpiller(pass, mf, vrm);
+  case inline_: return createInlineSpiller(pass, mf, vrm);
+  }
+}
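+
+// Illustrative usage (a hedged sketch of how an allocator pass might drive a
+// spiller; the pass, MachineFunction, VirtRegMap and LiveInterval are assumed
+// to exist in the caller):
+//
+//   std::vector<LiveInterval*> newIntervals;
+//   SmallVector<LiveInterval*, 8> spillIs;
+//   OwningPtr<Spiller> spiller(createSpiller(pass, mf, vrm));
+//   spiller->spill(li, newIntervals, spillIs);
+//   // newIntervals now holds the replacement intervals to re-queue.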
diff --git a/src/LLVM/lib/CodeGen/Spiller.h b/src/LLVM/lib/CodeGen/Spiller.h
new file mode 100644
index 0000000..3d5172a
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/Spiller.h
@@ -0,0 +1,54 @@
+//===-- llvm/CodeGen/Spiller.h - Spiller -*- C++ -*------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SPILLER_H
+#define LLVM_CODEGEN_SPILLER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include <vector>
+
+namespace llvm {
+
+  class LiveInterval;
+  class MachineFunction;
+  class MachineFunctionPass;
+  class SlotIndex;
+  class VirtRegMap;
+
+  /// Spiller interface.
+  ///
+  /// Implementations are utility classes which insert spill or remat code on
+  /// demand.
+  class Spiller {
+  public:
+    virtual ~Spiller() = 0;
+
+    /// spill - Spill the given live interval. The method used will depend on
+    /// the Spiller implementation selected.
+    ///
+    /// @param li            The live interval to be spilled.
+    /// @param spillIs       A list of intervals that are about to be spilled,
+    ///                      and so cannot be used for remat etc.
+    /// @param newIntervals  The newly created intervals will be appended here.
+    /// @param earliestIndex The earliest point for splitting. (OK, it's another
+    ///                      pointer to the allocator guts).
+    virtual void spill(LiveInterval *li,
+                       std::vector<LiveInterval*> &newIntervals,
+                       SmallVectorImpl<LiveInterval*> &spillIs,
+                       SlotIndex *earliestIndex = 0) = 0;
+
+  };
+
+  /// Create and return a spiller object, as specified on the command line.
+  Spiller* createSpiller(MachineFunctionPass &pass,
+                         MachineFunction &mf,
+                         VirtRegMap &vrm);
+}
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/SplitKit.cpp b/src/LLVM/lib/CodeGen/SplitKit.cpp
new file mode 100644
index 0000000..a37a4d6
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SplitKit.cpp
@@ -0,0 +1,622 @@
+//===---------- SplitKit.cpp - Toolkit for splitting live ranges ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SplitAnalysis class as well as mutator functions for
+// live range splitting.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "splitter"
+#include "SplitKit.h"
+#include "VirtRegMap.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+AllowSplit("spiller-splits-edges",
+           cl::desc("Allow critical edge splitting during spilling"));
+
+//===----------------------------------------------------------------------===//
+//                                 Split Analysis
+//===----------------------------------------------------------------------===//
+
+SplitAnalysis::SplitAnalysis(const MachineFunction &mf,
+                             const LiveIntervals &lis,
+                             const MachineLoopInfo &mli)
+  : mf_(mf),
+    lis_(lis),
+    loops_(mli),
+    tii_(*mf.getTarget().getInstrInfo()),
+    curli_(0) {}
+
+void SplitAnalysis::clear() {
+  usingInstrs_.clear();
+  usingBlocks_.clear();
+  usingLoops_.clear();
+  curli_ = 0;
+}
+
+bool SplitAnalysis::canAnalyzeBranch(const MachineBasicBlock *MBB) {
+  MachineBasicBlock *T, *F;
+  SmallVector<MachineOperand, 4> Cond;
+  return !tii_.AnalyzeBranch(const_cast<MachineBasicBlock&>(*MBB), T, F, Cond);
+}
+
+/// analyzeUses - Count instructions, basic blocks, and loops using curli.
+void SplitAnalysis::analyzeUses() {
+  const MachineRegisterInfo &MRI = mf_.getRegInfo();
+  for (MachineRegisterInfo::reg_iterator I = MRI.reg_begin(curli_->reg);
+       MachineInstr *MI = I.skipInstruction();) {
+    if (MI->isDebugValue() || !usingInstrs_.insert(MI))
+      continue;
+    MachineBasicBlock *MBB = MI->getParent();
+    if (usingBlocks_[MBB]++)
+      continue;
+    if (MachineLoop *Loop = loops_.getLoopFor(MBB))
+      usingLoops_.insert(Loop);
+  }
+  DEBUG(dbgs() << "Counted "
+               << usingInstrs_.size() << " instrs, "
+               << usingBlocks_.size() << " blocks, "
+               << usingLoops_.size()  << " loops in "
+               << *curli_ << "\n");
+}
+
+// Get three sets of basic blocks surrounding a loop: Blocks inside the loop,
+// predecessor blocks, and exit blocks.
+void SplitAnalysis::getLoopBlocks(const MachineLoop *Loop, LoopBlocks &Blocks) {
+  Blocks.clear();
+
+  // Blocks in the loop.
+  Blocks.Loop.insert(Loop->block_begin(), Loop->block_end());
+
+  // Predecessor blocks.
+  const MachineBasicBlock *Header = Loop->getHeader();
+  for (MachineBasicBlock::const_pred_iterator I = Header->pred_begin(),
+       E = Header->pred_end(); I != E; ++I)
+    if (!Blocks.Loop.count(*I))
+      Blocks.Preds.insert(*I);
+
+  // Exit blocks.
+  for (MachineLoop::block_iterator I = Loop->block_begin(),
+       E = Loop->block_end(); I != E; ++I) {
+    const MachineBasicBlock *MBB = *I;
+    for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+       SE = MBB->succ_end(); SI != SE; ++SI)
+      if (!Blocks.Loop.count(*SI))
+        Blocks.Exits.insert(*SI);
+  }
+}
+
+/// analyzeLoopPeripheralUse - Return an enum describing how curli_ is used in
+/// and around the Loop.
+SplitAnalysis::LoopPeripheralUse SplitAnalysis::
+analyzeLoopPeripheralUse(const SplitAnalysis::LoopBlocks &Blocks) {
+  LoopPeripheralUse use = ContainedInLoop;
+  for (BlockCountMap::iterator I = usingBlocks_.begin(), E = usingBlocks_.end();
+       I != E; ++I) {
+    const MachineBasicBlock *MBB = I->first;
+    // Is this a peripheral block?
+    if (use < MultiPeripheral &&
+        (Blocks.Preds.count(MBB) || Blocks.Exits.count(MBB))) {
+      if (I->second > 1) use = MultiPeripheral;
+      else               use = SinglePeripheral;
+      continue;
+    }
+    // Is it a loop block?
+    if (Blocks.Loop.count(MBB))
+      continue;
+    // It must be an unrelated block.
+    return OutsideLoop;
+  }
+  return use;
+}
+
+/// getCriticalExits - It may be necessary to partially break critical edges
+/// leaving the loop if an exit block has phi uses of curli. Collect the exit
+/// blocks that need special treatment into CriticalExits.
+void SplitAnalysis::getCriticalExits(const SplitAnalysis::LoopBlocks &Blocks,
+                                     BlockPtrSet &CriticalExits) {
+  CriticalExits.clear();
+
+  // A critical exit block contains a phi def of curli, and has a predecessor
+  // that is neither in the loop nor a loop predecessor.
+  // For such an exit block, the edges carrying the new variable must be moved
+  // to a new pre-exit block.
+  for (BlockPtrSet::iterator I = Blocks.Exits.begin(), E = Blocks.Exits.end();
+       I != E; ++I) {
+    const MachineBasicBlock *Succ = *I;
+    SlotIndex SuccIdx = lis_.getMBBStartIdx(Succ);
+    VNInfo *SuccVNI = curli_->getVNInfoAt(SuccIdx);
+    // This exit may not have curli live in at all. No need to split.
+    if (!SuccVNI)
+      continue;
+    // If this is not a PHI def, it is either using a value from before the
+    // loop, or a value defined inside the loop. Both are safe.
+    if (!SuccVNI->isPHIDef() || SuccVNI->def.getBaseIndex() != SuccIdx)
+      continue;
+    // This exit block does have a PHI. Does it also have a predecessor that is
+    // not a loop block or loop predecessor?
+    for (MachineBasicBlock::const_pred_iterator PI = Succ->pred_begin(),
+         PE = Succ->pred_end(); PI != PE; ++PI) {
+      const MachineBasicBlock *Pred = *PI;
+      if (Blocks.Loop.count(Pred) || Blocks.Preds.count(Pred))
+        continue;
+      // This is a critical exit block, and we need to split the exit edge.
+      CriticalExits.insert(Succ);
+      break;
+    }
+  }
+}
+
+/// canSplitCriticalExits - Return true if it is possible to insert new exit
+/// blocks before the blocks in CriticalExits.
+bool
+SplitAnalysis::canSplitCriticalExits(const SplitAnalysis::LoopBlocks &Blocks,
+                                     BlockPtrSet &CriticalExits) {
+  // If we don't allow critical edge splitting, require no critical exits.
+  if (!AllowSplit)
+    return CriticalExits.empty();
+
+  for (BlockPtrSet::iterator I = CriticalExits.begin(), E = CriticalExits.end();
+       I != E; ++I) {
+    const MachineBasicBlock *Succ = *I;
+    // We want to insert a new pre-exit MBB before Succ, and change all the
+    // in-loop blocks to branch to the pre-exit instead of Succ.
+    // Check that all the in-loop predecessors can be changed.
+    for (MachineBasicBlock::const_pred_iterator PI = Succ->pred_begin(),
+         PE = Succ->pred_end(); PI != PE; ++PI) {
+      const MachineBasicBlock *Pred = *PI;
+      // The external predecessors won't be altered.
+      if (!Blocks.Loop.count(Pred) && !Blocks.Preds.count(Pred))
+        continue;
+      if (!canAnalyzeBranch(Pred))
+        return false;
+    }
+
+    // If Succ's layout predecessor falls through, that too must be analyzable.
+    // We need to insert the pre-exit block in the gap.
+    MachineFunction::const_iterator MFI = Succ;
+    if (MFI == mf_.begin())
+      continue;
+    if (!canAnalyzeBranch(--MFI))
+      return false;
+  }
+  // No problems found.
+  return true;
+}
+
+void SplitAnalysis::analyze(const LiveInterval *li) {
+  clear();
+  curli_ = li;
+  analyzeUses();
+}
+
+const MachineLoop *SplitAnalysis::getBestSplitLoop() {
+  assert(curli_ && "Call analyze() before getBestSplitLoop");
+  if (usingLoops_.empty())
+    return 0;
+
+  LoopPtrSet Loops, SecondLoops;
+  LoopBlocks Blocks;
+  BlockPtrSet CriticalExits;
+
+  // Find first-class and second-class candidate loops.
+  // We prefer to split around loops where curli is used outside the periphery.
+  for (LoopPtrSet::const_iterator I = usingLoops_.begin(),
+       E = usingLoops_.end(); I != E; ++I) {
+    getLoopBlocks(*I, Blocks);
+
+    // FIXME: We need an SSA updater to properly handle multiple exit blocks.
+    if (Blocks.Exits.size() > 1) {
+      DEBUG(dbgs() << "MultipleExits: " << **I);
+      continue;
+    }
+
+    LoopPtrSet *LPS = 0;
+    switch(analyzeLoopPeripheralUse(Blocks)) {
+    case OutsideLoop:
+      LPS = &Loops;
+      break;
+    case MultiPeripheral:
+      LPS = &SecondLoops;
+      break;
+    case ContainedInLoop:
+      DEBUG(dbgs() << "ContainedInLoop: " << **I);
+      continue;
+    case SinglePeripheral:
+      DEBUG(dbgs() << "SinglePeripheral: " << **I);
+      continue;
+    }
+    // Will it be possible to split around this loop?
+    getCriticalExits(Blocks, CriticalExits);
+    DEBUG(dbgs() << CriticalExits.size() << " critical exits: " << **I);
+    if (!canSplitCriticalExits(Blocks, CriticalExits))
+      continue;
+    // This is a possible split.
+    assert(LPS);
+    LPS->insert(*I);
+  }
+
+  DEBUG(dbgs() << "Got " << Loops.size() << " + " << SecondLoops.size()
+               << " candidate loops\n");
+
+  // If there are no first-class loops available, look at second-class loops.
+  if (Loops.empty())
+    Loops = SecondLoops;
+
+  if (Loops.empty())
+    return 0;
+
+  // Pick the earliest loop.
+  // FIXME: Are there other heuristics to consider?
+  const MachineLoop *Best = 0;
+  SlotIndex BestIdx;
+  for (LoopPtrSet::const_iterator I = Loops.begin(), E = Loops.end(); I != E;
+       ++I) {
+    SlotIndex Idx = lis_.getMBBStartIdx((*I)->getHeader());
+    if (!Best || Idx < BestIdx)
+      Best = *I, BestIdx = Idx;
+  }
+  DEBUG(dbgs() << "Best: " << *Best);
+  return Best;
+}
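+
+// Illustrative flow (a hedged sketch of how a caller might drive the split
+// machinery; the identifiers mirror the declarations in SplitKit.h):
+//
+//   sa.analyze(li);
+//   if (const MachineLoop *loop = sa.getBestSplitLoop()) {
+//     std::vector<LiveInterval*> newIntervals;
+//     SplitEditor(sa, lis, vrm, newIntervals).splitAroundLoop(loop);
+//   }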
+
+
+//===----------------------------------------------------------------------===//
+//                               Split Editor
+//===----------------------------------------------------------------------===//
+
+/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
+SplitEditor::SplitEditor(SplitAnalysis &sa, LiveIntervals &lis, VirtRegMap &vrm,
+                         std::vector<LiveInterval*> &intervals)
+  : sa_(sa), lis_(lis), vrm_(vrm),
+    mri_(vrm.getMachineFunction().getRegInfo()),
+    tii_(*vrm.getMachineFunction().getTarget().getInstrInfo()),
+    curli_(sa_.getCurLI()),
+    dupli_(0), openli_(0),
+    intervals_(intervals),
+    firstInterval(intervals_.size())
+{
+  assert(curli_ && "SplitEditor created from empty SplitAnalysis");
+
+  // Make sure curli_ is assigned a stack slot, so all our intervals get the
+  // same slot as curli_.
+  if (vrm_.getStackSlot(curli_->reg) == VirtRegMap::NO_STACK_SLOT)
+    vrm_.assignVirt2StackSlot(curli_->reg);
+
+}
+
+LiveInterval *SplitEditor::createInterval() {
+  unsigned curli = sa_.getCurLI()->reg;
+  unsigned Reg = mri_.createVirtualRegister(mri_.getRegClass(curli));
+  LiveInterval &Intv = lis_.getOrCreateInterval(Reg);
+  vrm_.grow();
+  vrm_.assignVirt2StackSlot(Reg, vrm_.getStackSlot(curli));
+  return &Intv;
+}
+
+LiveInterval *SplitEditor::getDupLI() {
+  if (!dupli_) {
+    // Create an interval for dupli that is a copy of curli.
+    dupli_ = createInterval();
+    dupli_->Copy(*curli_, &mri_, lis_.getVNInfoAllocator());
+    DEBUG(dbgs() << "SplitEditor DupLI: " << *dupli_ << '\n');
+  }
+  return dupli_;
+}
+
+VNInfo *SplitEditor::mapValue(const VNInfo *curliVNI) {
+  VNInfo *&VNI = valueMap_[curliVNI];
+  if (!VNI)
+    VNI = openli_->createValueCopy(curliVNI, lis_.getVNInfoAllocator());
+  return VNI;
+}
+
+/// Insert a COPY instruction curli -> li. Allocate a new value from li
+/// defined by the COPY. Note that rewrite() will deal with the curli
+/// register, so this function can be used to copy from any interval - openli,
+/// curli, or dupli.
+VNInfo *SplitEditor::insertCopy(LiveInterval &LI,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) {
+  MachineInstr *MI = BuildMI(MBB, I, DebugLoc(), tii_.get(TargetOpcode::COPY),
+                             LI.reg).addReg(curli_->reg);
+  SlotIndex DefIdx = lis_.InsertMachineInstrInMaps(MI).getDefIndex();
+  return LI.getNextValue(DefIdx, MI, true, lis_.getVNInfoAllocator());
+}
+
+/// Create a new virtual register and live interval.
+void SplitEditor::openIntv() {
+  assert(!openli_ && "Previous LI not closed before openIntv");
+  openli_ = createInterval();
+  intervals_.push_back(openli_);
+  liveThrough_ = false;
+}
+
+/// enterIntvAtEnd - Enter openli at the end of MBB.
+/// PhiMBB is a successor inside openli where a PHI value is created.
+/// Currently, all entries must share the same PhiMBB.
+void SplitEditor::enterIntvAtEnd(MachineBasicBlock &A, MachineBasicBlock &B) {
+  assert(openli_ && "openIntv not called before enterIntvAtEnd");
+
+  SlotIndex EndA = lis_.getMBBEndIdx(&A);
+  VNInfo *CurVNIA = curli_->getVNInfoAt(EndA.getPrevIndex());
+  if (!CurVNIA) {
+    DEBUG(dbgs() << "  ignoring enterIntvAtEnd, curli not live out of BB#"
+                 << A.getNumber() << ".\n");
+    return;
+  }
+
+  // Add a phi kill value and live range out of A.
+  VNInfo *VNIA = insertCopy(*openli_, A, A.getFirstTerminator());
+  openli_->addRange(LiveRange(VNIA->def, EndA, VNIA));
+
+  // FIXME: If this is the only entry edge, we don't need the extra PHI value.
+  // FIXME: If there are multiple entry blocks (so not a loop), we need proper
+  // SSA update.
+
+  // Now look at the start of B.
+  SlotIndex StartB = lis_.getMBBStartIdx(&B);
+  SlotIndex EndB = lis_.getMBBEndIdx(&B);
+  const LiveRange *CurB = curli_->getLiveRangeContaining(StartB);
+  if (!CurB) {
+    DEBUG(dbgs() << "  enterIntvAtEnd: curli not live in to BB#"
+                 << B.getNumber() << ".\n");
+    return;
+  }
+
+  VNInfo *VNIB = openli_->getVNInfoAt(StartB);
+  if (!VNIB) {
+    // Create a phi value.
+    VNIB = openli_->getNextValue(SlotIndex(StartB, true), 0, false,
+                                 lis_.getVNInfoAllocator());
+    VNIB->setIsPHIDef(true);
+    // Add a minimal range for the new value.
+    openli_->addRange(LiveRange(VNIB->def, std::min(EndB, CurB->end), VNIB));
+
+    VNInfo *&mapVNI = valueMap_[CurB->valno];
+    if (mapVNI) {
+      // Multiple copies - must create PHI value.
+      abort();
+    } else {
+      // This is the first copy of this value. Mark the mapping.
+      mapVNI = VNIB;
+    }
+
+  }
+
+  DEBUG(dbgs() << "  enterIntvAtEnd: " << *openli_ << '\n');
+}
+
+/// useIntv - indicate that all instructions in MBB should use openli.
+void SplitEditor::useIntv(const MachineBasicBlock &MBB) {
+  useIntv(lis_.getMBBStartIdx(&MBB), lis_.getMBBEndIdx(&MBB));
+}
+
+void SplitEditor::useIntv(SlotIndex Start, SlotIndex End) {
+  assert(openli_ && "openIntv not called before useIntv");
+
+  // Map the curli values from the interval into openli_
+  LiveInterval::const_iterator B = curli_->begin(), E = curli_->end();
+  LiveInterval::const_iterator I = std::lower_bound(B, E, Start);
+
+  if (I != B) {
+    --I;
+    // I begins before Start, but overlaps. openli may already have a value.
+    if (I->end > Start && !openli_->liveAt(Start))
+      openli_->addRange(LiveRange(Start, std::min(End, I->end),
+                        mapValue(I->valno)));
+    ++I;
+  }
+
+  // The remaining ranges begin after Start.
+  for (;I != E && I->start < End; ++I)
+    openli_->addRange(LiveRange(I->start, std::min(End, I->end),
+                                mapValue(I->valno)));
+  DEBUG(dbgs() << "  added range [" << Start << ';' << End << "): " << *openli_
+               << '\n');
+}
+
+/// leaveIntvAtTop - Leave the interval at the top of MBB.
+/// Currently, only one value can leave the interval.
+void SplitEditor::leaveIntvAtTop(MachineBasicBlock &MBB) {
+  assert(openli_ && "openIntv not called before leaveIntvAtTop");
+
+  SlotIndex Start = lis_.getMBBStartIdx(&MBB);
+  const LiveRange *CurLR = curli_->getLiveRangeContaining(Start);
+
+  // Is curli even live-in to MBB?
+  if (!CurLR) {
+    DEBUG(dbgs() << "  leaveIntvAtTop at " << Start << ": not live\n");
+    return;
+  }
+
+  // Is curli defined by PHI at the beginning of MBB?
+  bool isPHIDef = CurLR->valno->isPHIDef() &&
+                  CurLR->valno->def.getBaseIndex() == Start;
+
+  // If MBB is using a value of curli that was defined outside the openli range,
+  // we don't want to copy it back here.
+  if (!isPHIDef && !openli_->liveAt(CurLR->valno->def)) {
+    DEBUG(dbgs() << "  leaveIntvAtTop at " << Start
+                 << ": using external value\n");
+    liveThrough_ = true;
+    return;
+  }
+
+  // We are going to insert a back copy, so we must have a dupli_.
+  LiveRange *DupLR = getDupLI()->getLiveRangeContaining(Start);
+  assert(DupLR && "dupli not live into block, but curli is?");
+
+  // Insert the COPY instruction.
+  MachineInstr *MI = BuildMI(MBB, MBB.begin(), DebugLoc(),
+                             tii_.get(TargetOpcode::COPY), dupli_->reg)
+                       .addReg(openli_->reg);
+  SlotIndex Idx = lis_.InsertMachineInstrInMaps(MI).getDefIndex();
+
+  // Adjust dupli and openli values.
+  if (isPHIDef) {
+    // dupli was already a PHI on entry to MBB. Simply insert an openli PHI,
+    // and shift the dupli def down to the COPY.
+    VNInfo *VNI = openli_->getNextValue(SlotIndex(Start, true), 0, false,
+                                        lis_.getVNInfoAllocator());
+    VNI->setIsPHIDef(true);
+    openli_->addRange(LiveRange(VNI->def, Idx, VNI));
+
+    dupli_->removeRange(Start, Idx);
+    DupLR->valno->def = Idx;
+    DupLR->valno->setIsPHIDef(false);
+  } else {
+    // The dupli value was defined somewhere inside the openli range.
+    DEBUG(dbgs() << "  leaveIntvAtTop source value defined at "
+                 << DupLR->valno->def << "\n");
+    // FIXME: We may not need a PHI here if all predecessors have the same
+    // value.
+    VNInfo *VNI = openli_->getNextValue(SlotIndex(Start, true), 0, false,
+                                        lis_.getVNInfoAllocator());
+    VNI->setIsPHIDef(true);
+    openli_->addRange(LiveRange(VNI->def, Idx, VNI));
+
+    // FIXME: What if DupLR->valno is used by multiple exits? SSA Update.
+
+    // closeIntv is going to remove the superfluous live ranges.
+    DupLR->valno->def = Idx;
+    DupLR->valno->setIsPHIDef(false);
+  }
+
+  DEBUG(dbgs() << "  leaveIntvAtTop at " << Idx << ": " << *openli_ << '\n');
+}
+
+/// closeIntv - Indicate that we are done editing the currently open
+/// LiveInterval, and ranges can be trimmed.
+void SplitEditor::closeIntv() {
+  assert(openli_ && "openIntv not called before closeIntv");
+
+  DEBUG(dbgs() << "  closeIntv cleaning up\n");
+  DEBUG(dbgs() << "    open " << *openli_ << '\n');
+
+  if (liveThrough_) {
+    DEBUG(dbgs() << "  value live through region, leaving dupli as is.\n");
+  } else {
+    // Live out with copies inserted, or killed by the region. Either way we
+    // need to remove the overlapping region from dupli.
+    getDupLI();
+    for (LiveInterval::iterator I = openli_->begin(), E = openli_->end();
+         I != E; ++I) {
+      dupli_->removeRange(I->start, I->end);
+    }
+    // FIXME: A block branching to the entry block may also branch elsewhere
+    // while curli is live. We need both openli and curli to be live in that
+    // case.
+    DEBUG(dbgs() << "    dup2 " << *dupli_ << '\n');
+  }
+  openli_ = 0;
+}
+
+/// rewrite - after all the new live ranges have been created, rewrite
+/// instructions using curli to use the new intervals.
+void SplitEditor::rewrite() {
+  assert(!openli_ && "Previous LI not closed before rewrite");
+  const LiveInterval *curli = sa_.getCurLI();
+  for (MachineRegisterInfo::reg_iterator RI = mri_.reg_begin(curli->reg),
+       RE = mri_.reg_end(); RI != RE;) {
+    MachineOperand &MO = RI.getOperand();
+    MachineInstr *MI = MO.getParent();
+    ++RI;
+    if (MI->isDebugValue()) {
+      DEBUG(dbgs() << "Zapping " << *MI);
+      // FIXME: We can do much better with debug values.
+      MO.setReg(0);
+      continue;
+    }
+    SlotIndex Idx = lis_.getInstructionIndex(MI);
+    Idx = MO.isUse() ? Idx.getUseIndex() : Idx.getDefIndex();
+    LiveInterval *LI = dupli_;
+    for (unsigned i = firstInterval, e = intervals_.size(); i != e; ++i) {
+      LiveInterval *testli = intervals_[i];
+      if (testli->liveAt(Idx)) {
+        LI = testli;
+        break;
+      }
+    }
+    if (LI)
+      MO.setReg(LI->reg);
+    DEBUG(dbgs() << "rewrite " << Idx << '\t' << *MI);
+  }
+
+  // dupli_ goes in last, after rewriting.
+  if (dupli_) {
+    dupli_->RenumberValues();
+    intervals_.push_back(dupli_);
+  }
+
+  // Calculate spill weight and allocation hints for new intervals.
+  VirtRegAuxInfo vrai(vrm_.getMachineFunction(), lis_, sa_.loops_);
+  for (unsigned i = firstInterval, e = intervals_.size(); i != e; ++i) {
+    LiveInterval &li = *intervals_[i];
+    vrai.CalculateRegClass(li.reg);
+    vrai.CalculateWeightAndHint(li);
+    DEBUG(dbgs() << "new intv " << mri_.getRegClass(li.reg)->getName() << ":"
+                 << li << '\n');
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//                               Loop Splitting
+//===----------------------------------------------------------------------===//
+
+bool SplitEditor::splitAroundLoop(const MachineLoop *Loop) {
+  SplitAnalysis::LoopBlocks Blocks;
+  sa_.getLoopBlocks(Loop, Blocks);
+
+  // Break critical edges as needed.
+  SplitAnalysis::BlockPtrSet CriticalExits;
+  sa_.getCriticalExits(Blocks, CriticalExits);
+  assert(CriticalExits.empty() && "Cannot break critical exits yet");
+
+  // Create new live interval for the loop.
+  openIntv();
+
+  // Insert copies in the predecessors.
+  for (SplitAnalysis::BlockPtrSet::iterator I = Blocks.Preds.begin(),
+       E = Blocks.Preds.end(); I != E; ++I) {
+    MachineBasicBlock &MBB = const_cast<MachineBasicBlock&>(**I);
+    enterIntvAtEnd(MBB, *Loop->getHeader());
+  }
+
+  // Switch all loop blocks.
+  for (SplitAnalysis::BlockPtrSet::iterator I = Blocks.Loop.begin(),
+       E = Blocks.Loop.end(); I != E; ++I)
+     useIntv(**I);
+
+  // Insert back copies in the exit blocks.
+  for (SplitAnalysis::BlockPtrSet::iterator I = Blocks.Exits.begin(),
+       E = Blocks.Exits.end(); I != E; ++I) {
+    MachineBasicBlock &MBB = const_cast<MachineBasicBlock&>(**I);
+    leaveIntvAtTop(MBB);
+  }
+
+  // Done.
+  closeIntv();
+  rewrite();
+  return dupli_;
+}
+
diff --git a/src/LLVM/lib/CodeGen/SplitKit.h b/src/LLVM/lib/CodeGen/SplitKit.h
new file mode 100644
index 0000000..d125a45
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/SplitKit.h
@@ -0,0 +1,228 @@
+//===------------ SplitKit.h - Toolkit for splitting live ranges ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SplitAnalysis class as well as mutator functions for
+// live range splitting.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+
+namespace llvm {
+
+class LiveInterval;
+class LiveIntervals;
+class MachineInstr;
+class MachineLoop;
+class MachineLoopInfo;
+class MachineRegisterInfo;
+class TargetInstrInfo;
+class VirtRegMap;
+class VNInfo;
+
+/// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
+/// opportunities.
+class SplitAnalysis {
+public:
+  const MachineFunction &mf_;
+  const LiveIntervals &lis_;
+  const MachineLoopInfo &loops_;
+  const TargetInstrInfo &tii_;
+
+private:
+  // Current live interval.
+  const LiveInterval *curli_;
+
+  // Instructions using the current register.
+  typedef SmallPtrSet<const MachineInstr*, 16> InstrPtrSet;
+  InstrPtrSet usingInstrs_;
+
+  // The number of instructions using curli in each basic block.
+  typedef DenseMap<const MachineBasicBlock*, unsigned> BlockCountMap;
+  BlockCountMap usingBlocks_;
+
+  // Loops where the current interval is used.
+  typedef SmallPtrSet<const MachineLoop*, 16> LoopPtrSet;
+  LoopPtrSet usingLoops_;
+
+  // Summarize statistics by counting instructions using curli_.
+  void analyzeUses();
+
+  /// canAnalyzeBranch - Return true if MBB ends in a branch that can be
+  /// analyzed.
+  bool canAnalyzeBranch(const MachineBasicBlock *MBB);
+
+public:
+  SplitAnalysis(const MachineFunction &mf, const LiveIntervals &lis,
+                const MachineLoopInfo &mli);
+
+  /// analyze - set curli to the specified interval, and analyze how it may be
+  /// split.
+  void analyze(const LiveInterval *li);
+
+  const LiveInterval *getCurLI() { return curli_; }
+
+  /// clear - clear all data structures so SplitAnalysis is ready to analyze a
+  /// new interval.
+  void clear();
+
+  typedef SmallPtrSet<const MachineBasicBlock*, 16> BlockPtrSet;
+
+  // Sets of basic blocks surrounding a machine loop.
+  struct LoopBlocks {
+    BlockPtrSet Loop;  // Blocks in the loop.
+    BlockPtrSet Preds; // Loop predecessor blocks.
+    BlockPtrSet Exits; // Loop exit blocks.
+
+    void clear() {
+      Loop.clear();
+      Preds.clear();
+      Exits.clear();
+    }
+  };
+
+  // Calculate the block sets surrounding the loop.
+  void getLoopBlocks(const MachineLoop *Loop, LoopBlocks &Blocks);
+
+  /// LoopPeripheralUse - how is a variable used in and around a loop?
+  /// Peripheral blocks are the loop predecessors and exit blocks.
+  enum LoopPeripheralUse {
+    ContainedInLoop,  // All uses are inside the loop.
+    SinglePeripheral, // At most one instruction per peripheral block.
+    MultiPeripheral,  // Multiple instructions in some peripheral blocks.
+    OutsideLoop       // Uses outside loop periphery.
+  };
+
+  /// analyzeLoopPeripheralUse - Return an enum describing how curli_ is used in
+  /// and around the Loop.
+  LoopPeripheralUse analyzeLoopPeripheralUse(const LoopBlocks&);
+
+  /// getCriticalExits - It may be necessary to partially break critical edges
+  /// leaving the loop if an exit block has phi uses of curli. Collect the exit
+  /// blocks that need special treatment into CriticalExits.
+  void getCriticalExits(const LoopBlocks &Blocks, BlockPtrSet &CriticalExits);
+
+  /// canSplitCriticalExits - Return true if it is possible to insert new exit
+  /// blocks before the blocks in CriticalExits.
+  bool canSplitCriticalExits(const LoopBlocks &Blocks,
+                             BlockPtrSet &CriticalExits);
+
+  /// getBestSplitLoop - Return the loop where curli may best be split to a
+  /// separate register, or NULL.
+  const MachineLoop *getBestSplitLoop();
+};
+
+/// SplitEditor - Edit machine code and LiveIntervals for live range
+/// splitting.
+///
+/// - Create a SplitEditor from a SplitAnalysis.
+/// - Start a new live interval with openIntv.
+/// - Mark the places where the new interval is entered using enterIntv*.
+/// - Mark the ranges where the new interval is used with useIntv*.
+/// - Mark the places where the interval is exited with leaveIntv*.
+/// - Finish the current interval with closeIntv and repeat from openIntv.
+/// - Rewrite instructions with rewrite().
+///
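+/// A minimal sketch of the intended call sequence (assuming a SplitEditor se
+/// and loop block sets computed by SplitAnalysis; illustrative, not
+/// normative):
+/// \code
+///   se.openIntv();
+///   se.enterIntvAtEnd(pred, header);  // for each loop predecessor
+///   se.useIntv(mbb);                  // for each block inside the loop
+///   se.leaveIntvAtTop(exit);          // for each loop exit block
+///   se.closeIntv();
+///   se.rewrite();
+/// \endcode
+///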
+class SplitEditor {
+  SplitAnalysis &sa_;
+  LiveIntervals &lis_;
+  VirtRegMap &vrm_;
+  MachineRegisterInfo &mri_;
+  const TargetInstrInfo &tii_;
+
+  /// curli_ - The immutable interval we are currently splitting.
+  const LiveInterval *const curli_;
+
+  /// dupli_ - Created as a copy of curli_, ranges are carved out as new
+  /// intervals get added through openIntv / closeIntv. This is used to avoid
+  /// editing curli_.
+  LiveInterval *dupli_;
+
+  /// Currently open LiveInterval.
+  LiveInterval *openli_;
+
+  /// createInterval - Create a new virtual register and LiveInterval with same
+  /// register class and spill slot as curli.
+  LiveInterval *createInterval();
+
+  /// getDupLI - Ensure dupli is created and return it.
+  LiveInterval *getDupLI();
+
+  /// valueMap_ - Map values in curli to values in openli. These are direct 1-1
+  /// mappings, and do not include values created by inserted copies.
+  DenseMap<const VNInfo*, VNInfo*> valueMap_;
+
+  /// mapValue - Return the openIntv value that corresponds to the given curli
+  /// value.
+  VNInfo *mapValue(const VNInfo *curliVNI);
+
+  /// A dupli value is live through openIntv.
+  bool liveThrough_;
+
+  /// All the new intervals created for this split are added to intervals_.
+  std::vector<LiveInterval*> &intervals_;
+
+  /// The index into intervals_ of the first interval we added. There may be
+  /// others from before we got it.
+  unsigned firstInterval;
+
+  /// Insert a COPY instruction curli -> li. Allocate a new value from li
+  /// defined by the COPY
+  VNInfo *insertCopy(LiveInterval &LI,
+                     MachineBasicBlock &MBB,
+                     MachineBasicBlock::iterator I);
+
+public:
+  /// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
+  /// Newly created intervals will be appended to newIntervals.
+  SplitEditor(SplitAnalysis &SA, LiveIntervals&, VirtRegMap&,
+              std::vector<LiveInterval*> &newIntervals);
+
+  /// getAnalysis - Get the corresponding analysis.
+  SplitAnalysis &getAnalysis() { return sa_; }
+
+  /// Create a new virtual register and live interval.
+  void openIntv();
+
+  /// enterIntvAtEnd - Enter openli at the end of MBB.
+  /// PhiMBB is a successor inside openli where a PHI value is created.
+  /// Currently, all entries must share the same PhiMBB.
+  void enterIntvAtEnd(MachineBasicBlock &MBB, MachineBasicBlock &PhiMBB);
+
+  /// useIntv - indicate that all instructions in MBB should use openli.
+  void useIntv(const MachineBasicBlock &MBB);
+
+  /// useIntv - indicate that all instructions in range should use openli.
+  void useIntv(SlotIndex Start, SlotIndex End);
+
+  /// leaveIntvAtTop - Leave the interval at the top of MBB.
+  /// Currently, only one value can leave the interval.
+  void leaveIntvAtTop(MachineBasicBlock &MBB);
+
+  /// closeIntv - Indicate that we are done editing the currently open
+  /// LiveInterval, and ranges can be trimmed.
+  void closeIntv();
+
+  /// rewrite - after all the new live ranges have been created, rewrite
+  /// instructions using curli to use the new intervals.
+  void rewrite();
+
+  // ===--- High level methods ---===
+
+  /// splitAroundLoop - Split curli into a separate live interval inside
+  /// the loop. Return true if curli has been completely replaced, false if
+  /// curli is still intact, and needs to be spilled or split further.
+  bool splitAroundLoop(const MachineLoop*);
+
+};
+
+
+}
+
+#endif // LLVM_CODEGEN_SPLITKIT_H
diff --git a/src/LLVM/lib/CodeGen/Splitter.h b/src/LLVM/lib/CodeGen/Splitter.h
new file mode 100644
index 0000000..a726a7b
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/Splitter.h
@@ -0,0 +1,99 @@
+//===-- llvm/CodeGen/Splitter.h - Splitter -*- C++ -*----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SPLITTER_H
+#define LLVM_CODEGEN_SPLITTER_H
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+
+#include <deque>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+  class LiveInterval;
+  class LiveIntervals;
+  struct LiveRange;
+  class LoopSplit;
+  class MachineDominatorTree;
+  class MachineRegisterInfo;
+  class SlotIndexes;
+  class TargetInstrInfo;
+  class VNInfo;
+
+  class LoopSplitter : public MachineFunctionPass {
+    friend class LoopSplit;
+  public:
+    static char ID;
+
+    LoopSplitter() : MachineFunctionPass(ID) {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &au) const;
+
+    virtual bool runOnMachineFunction(MachineFunction &fn);
+
+    virtual void releaseMemory();
+
+
+  private:
+
+    MachineFunction *mf;
+    LiveIntervals *lis;
+    MachineLoopInfo *mli;
+    MachineRegisterInfo *mri;
+    MachineDominatorTree *mdt;
+    SlotIndexes *sis;
+    const TargetInstrInfo *tii;
+    const TargetRegisterInfo *tri;
+
+    std::string fqn;
+    std::deque<LiveInterval*> intervals;
+
+    typedef std::pair<SlotIndex, SlotIndex> SlotPair;
+    typedef std::vector<SlotPair> LoopRanges;
+    typedef std::map<MachineLoop*, LoopRanges> LoopRangeMap;
+    LoopRangeMap loopRangeMap;
+
+    void dumpLoopInfo(MachineLoop &loop);
+
+    void dumpOddTerminators();
+
+    void updateTerminators(MachineBasicBlock &mbb);
+
+    bool canInsertPreHeader(MachineLoop &loop);
+    MachineBasicBlock& insertPreHeader(MachineLoop &loop);
+
+    bool isCriticalEdge(MachineLoop::Edge &edge);
+    bool canSplitEdge(MachineLoop::Edge &edge);
+    MachineBasicBlock& splitEdge(MachineLoop::Edge &edge, MachineLoop &loop);
+
+    LoopRanges& getLoopRanges(MachineLoop &loop);
+    std::pair<bool, SlotPair> getLoopSubRange(const LiveRange &lr,
+                                              MachineLoop &loop);
+
+    void dumpLoopRanges(MachineLoop &loop);
+
+    void processHeader(LoopSplit &split);
+    void processLoopExits(LoopSplit &split);
+    void processLoopUses(LoopSplit &split);
+
+    bool splitOverLoop(LiveInterval &li, MachineLoop &loop);
+
+    void processInterval(LiveInterval &li);
+
+    void processIntervals();
+  };
+
+}
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/StackProtector.cpp b/src/LLVM/lib/CodeGen/StackProtector.cpp
new file mode 100644
index 0000000..a16feb0
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/StackProtector.cpp
@@ -0,0 +1,240 @@
+//===-- StackProtector.cpp - Stack Protector Insertion --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass inserts stack protectors into functions which need them. A variable
+// with a random value in it is stored onto the stack before the local variables
+// are allocated. Upon exiting the function, the stored value is checked. If it's
+// changed, then there was some sort of violation and the program aborts.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "stack-protector"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Attributes.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+// SSPBufferSize - The lower bound for a buffer to be considered for stack
+// smashing protection.
+static cl::opt<unsigned>
+SSPBufferSize("stack-protector-buffer-size", cl::init(8),
+              cl::desc("Lower bound for a buffer to be considered for "
+                       "stack protection"));
+
+namespace {
+  class StackProtector : public FunctionPass {
+    /// TLI - Keep a pointer to a TargetLowering to consult for determining
+    /// target type sizes.
+    const TargetLowering *TLI;
+
+    Function *F;
+    Module *M;
+
+    /// InsertStackProtectors - Insert code into the prologue and epilogue of
+    /// the function.
+    ///
+    ///  - The prologue code loads and stores the stack guard onto the stack.
+    ///  - The epilogue checks the value stored in the prologue against the
+    ///    original value. It calls __stack_chk_fail if they differ.
+    bool InsertStackProtectors();
+
+    /// CreateFailBB - Create a basic block to jump to when the stack protector
+    /// check fails.
+    BasicBlock *CreateFailBB();
+
+    /// RequiresStackProtector - Check whether or not this function needs a
+    /// stack protector based upon the stack protector level.
+    bool RequiresStackProtector() const;
+  public:
+    static char ID;             // Pass identification, replacement for typeid.
+    StackProtector() : FunctionPass(ID), TLI(0) {}
+    StackProtector(const TargetLowering *tli)
+      : FunctionPass(ID), TLI(tli) {}
+
+    virtual bool runOnFunction(Function &Fn);
+  };
+} // end anonymous namespace
+
+char StackProtector::ID = 0;
+INITIALIZE_PASS(StackProtector, "stack-protector",
+                "Insert stack protectors", false, false);
+
+FunctionPass *llvm::createStackProtectorPass(const TargetLowering *tli) {
+  return new StackProtector(tli);
+}
+
+bool StackProtector::runOnFunction(Function &Fn) {
+  F = &Fn;
+  M = F->getParent();
+
+  if (!RequiresStackProtector()) return false;
+  
+  return InsertStackProtectors();
+}
+
+/// RequiresStackProtector - Check whether or not this function needs a stack
+/// protector based upon the stack protector level. The heuristic we use is to
+/// add a guard variable to functions that call alloca, and functions with
+/// buffers larger than SSPBufferSize bytes.
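+///
+/// For illustration (hypothetical source, not part of this pass): with the
+/// default SSPBufferSize of 8 and the StackProtect attribute set, a function
+/// like
+///
+///   void f(const char *s) {
+///     char buf[16];        // i8 array of 16 >= 8 bytes: guard emitted
+///     strcpy(buf, s);
+///   }
+///
+/// gets a protector, while one with only scalar locals does not.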
+bool StackProtector::RequiresStackProtector() const {
+  if (F->hasFnAttr(Attribute::StackProtectReq))
+    return true;
+
+  if (!F->hasFnAttr(Attribute::StackProtect))
+    return false;
+
+  const TargetData *TD = TLI->getTargetData();
+
+  for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
+    BasicBlock *BB = I;
+
+    for (BasicBlock::iterator
+           II = BB->begin(), IE = BB->end(); II != IE; ++II)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+        if (AI->isArrayAllocation())
+          // This is a call to alloca with a variable size. Emit stack
+          // protectors.
+          return true;
+
+        if (const ArrayType *AT = dyn_cast<ArrayType>(AI->getAllocatedType())) {
+          // We apparently only care about character arrays.
+          if (!AT->getElementType()->isIntegerTy(8))
+            continue;
+
+          // If an array has more than SSPBufferSize bytes of allocated space,
+          // then we emit stack protectors.
+          if (SSPBufferSize <= TD->getTypeAllocSize(AT))
+            return true;
+        }
+      }
+  }
+
+  return false;
+}
+
+/// InsertStackProtectors - Insert code into the prologue and epilogue of the
+/// function.
+///
+///  - The prologue code loads and stores the stack guard onto the stack.
+///  - The epilogue checks the value stored in the prologue against the original
+///    value. It calls __stack_chk_fail if they differ.
+bool StackProtector::InsertStackProtectors() {
+  BasicBlock *FailBB = 0;       // The basic block to jump to if check fails.
+  AllocaInst *AI = 0;           // Place on stack that stores the stack guard.
+  Value *StackGuardVar = 0;  // The stack guard variable.
+
+  for (Function::iterator I = F->begin(), E = F->end(); I != E; ) {
+    BasicBlock *BB = I++;
+
+    ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator());
+    if (!RI) continue;
+
+    if (!FailBB) {
+      // Insert code into the entry block that stores the __stack_chk_guard
+      // variable onto the stack:
+      //
+      //   entry:
+      //     StackGuardSlot = alloca i8*
+      //     StackGuard = load __stack_chk_guard
+      //     call void @llvm.stackprotect.create(StackGuard, StackGuardSlot)
+      // 
+      const PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
+      unsigned AddressSpace, Offset;
+      if (TLI->getStackCookieLocation(AddressSpace, Offset)) {
+        Constant *OffsetVal =
+          ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset);
+        
+        StackGuardVar = ConstantExpr::getIntToPtr(OffsetVal,
+                                      PointerType::get(PtrTy, AddressSpace));
+      } else {
+        StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy); 
+      }
+
+      BasicBlock &Entry = F->getEntryBlock();
+      Instruction *InsPt = &Entry.front();
+
+      AI = new AllocaInst(PtrTy, InsPt);
+      LoadInst *LI = new LoadInst(StackGuardVar, false, InsPt);
+
+      Value *Args[] = { LI, AI };
+      CallInst::
+        Create(Intrinsic::getDeclaration(M, Intrinsic::stackprotector),
+               &Args[0], array_endof(Args), "", InsPt);
+
+      // Create the basic block to jump to when the guard check fails.
+      FailBB = CreateFailBB();
+    }
+
+    // For each block with a return instruction, convert this:
+    //
+    //   return:
+    //     ...
+    //     ret ...
+    //
+    // into this:
+    //
+    //   return:
+    //     ...
+    //     %1 = load __stack_chk_guard
+    //     %2 = load StackGuardSlot
+    //     %3 = cmp i1 %1, %2
+    //     br i1 %3, label %SP_return, label %CallStackCheckFailBlk
+    //
+    //   SP_return:
+    //     ret ...
+    //
+    //   CallStackCheckFailBlk:
+    //     call void @__stack_chk_fail()
+    //     unreachable
+
+    // Split the basic block before the return instruction.
+    BasicBlock *NewBB = BB->splitBasicBlock(RI);
+
+    // Remove default branch instruction to the new BB.
+    BB->getTerminator()->eraseFromParent();
+
+    // Move the newly created basic block to the point right after the old basic
+    // block so that it's in the "fall through" position.
+    NewBB->moveAfter(BB);
+
+    // Generate the stack protector instructions in the old basic block.
+    LoadInst *LI1 = new LoadInst(StackGuardVar, "", false, BB);
+    LoadInst *LI2 = new LoadInst(AI, "", true, BB);
+    ICmpInst *Cmp = new ICmpInst(*BB, CmpInst::ICMP_EQ, LI1, LI2);
+    BranchInst::Create(NewBB, FailBB, Cmp, BB);
+  }
+
+  // Return if we didn't modify any basic blocks. I.e., there are no return
+  // statements in the function.
+  if (!FailBB) return false;
+
+  return true;
+}
+
+/// CreateFailBB - Create a basic block to jump to when the stack protector
+/// check fails.
+BasicBlock *StackProtector::CreateFailBB() {
+  BasicBlock *FailBB = BasicBlock::Create(F->getContext(),
+                                          F);
+  Constant *StackChkFail =
+    M->getOrInsertFunction("__stack_chk_fail",
+                           Type::getVoidTy(F->getContext()), NULL);
+  CallInst::Create(StackChkFail, "", FailBB);
+  new UnreachableInst(F->getContext(), FailBB);
+  return FailBB;
+}
diff --git a/src/LLVM/lib/CodeGen/StackSlotColoring.cpp b/src/LLVM/lib/CodeGen/StackSlotColoring.cpp
new file mode 100644
index 0000000..dab6bc3
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/StackSlotColoring.cpp
@@ -0,0 +1,759 @@
+//===-- StackSlotColoring.cpp - Stack slot coloring pass. -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the stack slot coloring pass.
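+//
+// For illustration (hypothetical example): if spill slots fi#0 and fi#2 have
+// live intervals that never overlap, both can be assigned the same "color"
+// (fi#0); every reference to fi#2 is then rewritten to fi#0 and fi#2 is
+// removed, shrinking the stack frame.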
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "stackcoloring"
+#include "VirtRegMap.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include <vector>
+using namespace llvm;
+
+static cl::opt<bool>
+DisableSharing("no-stack-slot-sharing",
+             cl::init(false), cl::Hidden,
+             cl::desc("Suppress slot sharing during stack coloring"));
+
+static cl::opt<bool>
+ColorWithRegsOpt("color-ss-with-regs",
+                 cl::init(false), cl::Hidden,
+                 cl::desc("Color stack slots with free registers"));
+
+
+static cl::opt<int> DCELimit("ssc-dce-limit", cl::init(-1), cl::Hidden);
+
+STATISTIC(NumEliminated, "Number of stack slots eliminated due to coloring");
+STATISTIC(NumRegRepl,    "Number of stack slot refs replaced with reg refs");
+STATISTIC(NumLoadElim,   "Number of loads eliminated");
+STATISTIC(NumStoreElim,  "Number of stores eliminated");
+STATISTIC(NumDead,       "Number of trivially dead stack accesses eliminated");
+
+namespace {
+  class StackSlotColoring : public MachineFunctionPass {
+    bool ColorWithRegs;
+    LiveStacks* LS;
+    VirtRegMap* VRM;
+    MachineFrameInfo *MFI;
+    MachineRegisterInfo *MRI;
+    const TargetInstrInfo  *TII;
+    const TargetRegisterInfo *TRI;
+    const MachineLoopInfo *loopInfo;
+
+    // SSIntervals - Spill slot intervals.
+    std::vector<LiveInterval*> SSIntervals;
+
+    // SSRefs - Keep a list of frame index references for each spill slot.
+    SmallVector<SmallVector<MachineInstr*, 8>, 16> SSRefs;
+
+    // OrigAlignments - Alignments of stack objects before coloring.
+    SmallVector<unsigned, 16> OrigAlignments;
+
+    // OrigSizes - Sizes of stack objects before coloring.
+    SmallVector<unsigned, 16> OrigSizes;
+
+    // AllColors - If index is set, it's a spill slot, i.e. color.
+    // FIXME: This assumes PEI locates spill slots with smaller indices
+    // closest to the stack pointer / frame pointer. Therefore, smaller
+    // index == better color.
+    BitVector AllColors;
+
+    // NextColor - Next "color" that's not yet used.
+    int NextColor;
+
+    // UsedColors - "Colors" that have been assigned.
+    BitVector UsedColors;
+
+    // Assignments - Color to intervals mapping.
+    SmallVector<SmallVector<LiveInterval*,4>, 16> Assignments;
+
+  public:
+    static char ID; // Pass identification
+    StackSlotColoring() :
+      MachineFunctionPass(ID), ColorWithRegs(false), NextColor(-1) {}
+    StackSlotColoring(bool RegColor) :
+      MachineFunctionPass(ID), ColorWithRegs(RegColor), NextColor(-1) {}
+    
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<SlotIndexes>();
+      AU.addPreserved<SlotIndexes>();
+      AU.addRequired<LiveStacks>();
+      AU.addRequired<VirtRegMap>();
+      AU.addPreserved<VirtRegMap>();      
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
+      AU.addPreservedID(MachineDominatorsID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+    virtual const char* getPassName() const {
+      return "Stack Slot Coloring";
+    }
+
+  private:
+    void InitializeSlots();
+    bool CheckForSetJmpCall(const MachineFunction &MF) const;
+    void ScanForSpillSlotRefs(MachineFunction &MF);
+    bool OverlapWithAssignments(LiveInterval *li, int Color) const;
+    int ColorSlot(LiveInterval *li);
+    bool ColorSlots(MachineFunction &MF);
+    bool ColorSlotsWithFreeRegs(SmallVector<int, 16> &SlotMapping,
+                                SmallVector<SmallVector<int, 4>, 16> &RevMap,
+                                BitVector &SlotIsReg);
+    void RewriteInstruction(MachineInstr *MI, int OldFI, int NewFI,
+                            MachineFunction &MF);
+    bool PropagateBackward(MachineBasicBlock::iterator MII,
+                           MachineBasicBlock *MBB,
+                           unsigned OldReg, unsigned NewReg);
+    bool PropagateForward(MachineBasicBlock::iterator MII,
+                          MachineBasicBlock *MBB,
+                          unsigned OldReg, unsigned NewReg);
+    void UnfoldAndRewriteInstruction(MachineInstr *MI, int OldFI,
+                                    unsigned Reg, const TargetRegisterClass *RC,
+                                    SmallSet<unsigned, 4> &Defs,
+                                    MachineFunction &MF);
+    bool AllMemRefsCanBeUnfolded(int SS);
+    bool RemoveDeadStores(MachineBasicBlock* MBB);
+  };
+} // end anonymous namespace
+
+char StackSlotColoring::ID = 0;
+
+INITIALIZE_PASS(StackSlotColoring, "stack-slot-coloring",
+                "Stack Slot Coloring", false, false);
+
+FunctionPass *llvm::createStackSlotColoringPass(bool RegColor) {
+  return new StackSlotColoring(RegColor);
+}
+
+namespace {
+  // IntervalSorter - Comparison predicate that sorts live intervals by
+  // their weight.
+  struct IntervalSorter {
+    bool operator()(LiveInterval* LHS, LiveInterval* RHS) const {
+      return LHS->weight > RHS->weight;
+    }
+  };
+}
+
+/// ScanForSpillSlotRefs - Scan all the machine instructions for spill slot
+/// references and update spill slot weights.
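+///
+/// Each non-debug reference adds LiveIntervals::getSpillWeight(false, true,
+/// loopDepth) to the slot's interval weight; in this tree that is one use
+/// scaled by roughly 10^loopDepth, so e.g. a single reference inside a
+/// depth-1 loop contributes about 10.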
+void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) {
+  SSRefs.resize(MFI->getObjectIndexEnd());
+
+  // FIXME: Need the equivalent of MachineRegisterInfo for frameindex operands.
+  for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock *MBB = &*MBBI;
+    unsigned loopDepth = loopInfo->getLoopDepth(MBB);
+    for (MachineBasicBlock::iterator MII = MBB->begin(), EE = MBB->end();
+         MII != EE; ++MII) {
+      MachineInstr *MI = &*MII;
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (!MO.isFI())
+          continue;
+        int FI = MO.getIndex();
+        if (FI < 0)
+          continue;
+        if (!LS->hasInterval(FI))
+          continue;
+        LiveInterval &li = LS->getInterval(FI);
+        if (!MI->isDebugValue())
+          li.weight += LiveIntervals::getSpillWeight(false, true, loopDepth);
+        SSRefs[FI].push_back(MI);
+      }
+    }
+  }
+}
+
+/// InitializeSlots - Process all spill stack slot liveintervals and add them
+/// to a sorted (by weight) list.
+void StackSlotColoring::InitializeSlots() {
+  int LastFI = MFI->getObjectIndexEnd();
+  OrigAlignments.resize(LastFI);
+  OrigSizes.resize(LastFI);
+  AllColors.resize(LastFI);
+  UsedColors.resize(LastFI);
+  Assignments.resize(LastFI);
+
+  // Gather all spill slots into a list.
+  DEBUG(dbgs() << "Spill slot intervals:\n");
+  for (LiveStacks::iterator i = LS->begin(), e = LS->end(); i != e; ++i) {
+    LiveInterval &li = i->second;
+    DEBUG(li.dump());
+    int FI = li.getStackSlotIndex();
+    if (MFI->isDeadObjectIndex(FI))
+      continue;
+    SSIntervals.push_back(&li);
+    OrigAlignments[FI] = MFI->getObjectAlignment(FI);
+    OrigSizes[FI]      = MFI->getObjectSize(FI);
+    AllColors.set(FI);
+  }
+  DEBUG(dbgs() << '\n');
+
+  // Sort them by weight.
+  std::stable_sort(SSIntervals.begin(), SSIntervals.end(), IntervalSorter());
+
+  // Get first "color".
+  NextColor = AllColors.find_first();
+}
+
+/// OverlapWithAssignments - Return true if LiveInterval overlaps with any
+/// LiveIntervals that have already been assigned to the specified color.
+bool
+StackSlotColoring::OverlapWithAssignments(LiveInterval *li, int Color) const {
+  const SmallVector<LiveInterval*,4> &OtherLIs = Assignments[Color];
+  for (unsigned i = 0, e = OtherLIs.size(); i != e; ++i) {
+    LiveInterval *OtherLI = OtherLIs[i];
+    if (OtherLI->overlaps(*li))
+      return true;
+  }
+  return false;
+}
+
+/// ColorSlotsWithFreeRegs - If there are any free registers available, try
+/// replacing spill slot references with registers instead.
+bool
+StackSlotColoring::ColorSlotsWithFreeRegs(SmallVector<int, 16> &SlotMapping,
+                                   SmallVector<SmallVector<int, 4>, 16> &RevMap,
+                                   BitVector &SlotIsReg) {
+  if (!(ColorWithRegs || ColorWithRegsOpt) || !VRM->HasUnusedRegisters())
+    return false;
+
+  bool Changed = false;
+  DEBUG(dbgs() << "Assigning unused registers to spill slots:\n");
+  for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
+    LiveInterval *li = SSIntervals[i];
+    int SS = li->getStackSlotIndex();
+    if (!UsedColors[SS] || li->weight < 20)
+      // If the weight is < 20, i.e. two references in a loop with depth 1,
+      // don't bother with it.
+      continue;
+
+    // These slots are allowed to share the same registers.
+    bool AllColored = true;
+    SmallVector<unsigned, 4> ColoredRegs;
+    for (unsigned j = 0, ee = RevMap[SS].size(); j != ee; ++j) {
+      int RSS = RevMap[SS][j];
+      const TargetRegisterClass *RC = LS->getIntervalRegClass(RSS);
+      // If it's not colored to another stack slot, try coloring it
+      // to a "free" register.
+      if (!RC) {
+        AllColored = false;
+        continue;
+      }
+      unsigned Reg = VRM->getFirstUnusedRegister(RC);
+      if (!Reg) {
+        AllColored = false;
+        continue;
+      }
+      if (!AllMemRefsCanBeUnfolded(RSS)) {
+        AllColored = false;
+        continue;
+      } else {
+        DEBUG(dbgs() << "Assigning fi#" << RSS << " to "
+                     << TRI->getName(Reg) << '\n');
+        ColoredRegs.push_back(Reg);
+        SlotMapping[RSS] = Reg;
+        SlotIsReg.set(RSS);
+        Changed = true;
+      }
+    }
+
+    // Register and its sub-registers are no longer free.
+    while (!ColoredRegs.empty()) {
+      unsigned Reg = ColoredRegs.back();
+      ColoredRegs.pop_back();
+      VRM->setRegisterUsed(Reg);
+      // If reg is a callee-saved register, it will have to be spilled in
+      // the prologue.
+      MRI->setPhysRegUsed(Reg);
+      for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) {
+        VRM->setRegisterUsed(*AS);
+        MRI->setPhysRegUsed(*AS);
+      }
+    }
+    // This spill slot is dead after the rewrites
+    if (AllColored) {
+      MFI->RemoveStackObject(SS);
+      ++NumEliminated;
+    }
+  }
+  DEBUG(dbgs() << '\n');
+
+  return Changed;
+}
+
+/// ColorSlot - Assign a "color" (stack slot) to the specified stack slot.
+///
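+/// Sharing example (illustrative): if fi#3's live interval does not overlap
+/// any interval already assigned to color fi#1, fi#3 is colored fi#1 and the
+/// shared slot's size and alignment are grown to cover both objects;
+/// otherwise fi#3 receives the next unused color.
+///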
+int StackSlotColoring::ColorSlot(LiveInterval *li) {
+  int Color = -1;
+  bool Share = false;
+  if (!DisableSharing) {
+    // Check if it's possible to reuse any of the used colors.
+    Color = UsedColors.find_first();
+    while (Color != -1) {
+      if (!OverlapWithAssignments(li, Color)) {
+        Share = true;
+        ++NumEliminated;
+        break;
+      }
+      Color = UsedColors.find_next(Color);
+    }
+  }
+
+  // Assign it to the first available color (assumed to be the best) if it's
+  // not possible to share a used color with other objects.
+  if (!Share) {
+    assert(NextColor != -1 && "No more spill slots?");
+    Color = NextColor;
+    UsedColors.set(Color);
+    NextColor = AllColors.find_next(NextColor);
+  }
+
+  // Record the assignment.
+  Assignments[Color].push_back(li);
+  int FI = li->getStackSlotIndex();
+  DEBUG(dbgs() << "Assigning fi#" << FI << " to fi#" << Color << "\n");
+
+  // Change size and alignment of the allocated slot. If there are multiple
+  // objects sharing the same slot, then make sure the size and alignment
+  // are large enough for all.
+  unsigned Align = OrigAlignments[FI];
+  if (!Share || Align > MFI->getObjectAlignment(Color))
+    MFI->setObjectAlignment(Color, Align);
+  int64_t Size = OrigSizes[FI];
+  if (!Share || Size > MFI->getObjectSize(Color))
+    MFI->setObjectSize(Color, Size);
+  return Color;
+}
+
+/// ColorSlots - Color all spill stack slots and rewrite all frameindex machine
+/// operands in the function.
+bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
+  unsigned NumObjs = MFI->getObjectIndexEnd();
+  SmallVector<int, 16> SlotMapping(NumObjs, -1);
+  SmallVector<float, 16> SlotWeights(NumObjs, 0.0);
+  SmallVector<SmallVector<int, 4>, 16> RevMap(NumObjs);
+  BitVector SlotIsReg(NumObjs);
+  BitVector UsedColors(NumObjs);
+
+  DEBUG(dbgs() << "Color spill slot intervals:\n");
+  bool Changed = false;
+  for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
+    LiveInterval *li = SSIntervals[i];
+    int SS = li->getStackSlotIndex();
+    int NewSS = ColorSlot(li);
+    assert(NewSS >= 0 && "Stack coloring failed?");
+    SlotMapping[SS] = NewSS;
+    RevMap[NewSS].push_back(SS);
+    SlotWeights[NewSS] += li->weight;
+    UsedColors.set(NewSS);
+    Changed |= (SS != NewSS);
+  }
+
+  DEBUG(dbgs() << "\nSpill slots after coloring:\n");
+  for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
+    LiveInterval *li = SSIntervals[i];
+    int SS = li->getStackSlotIndex();
+    li->weight = SlotWeights[SS];
+  }
+  // Sort them by new weight.
+  std::stable_sort(SSIntervals.begin(), SSIntervals.end(), IntervalSorter());
+
+#ifndef NDEBUG
+  for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i)
+    DEBUG(SSIntervals[i]->dump());
+  DEBUG(dbgs() << '\n');
+#endif
+
+  // Can we "color" a stack slot with a unused register?
+  Changed |= ColorSlotsWithFreeRegs(SlotMapping, RevMap, SlotIsReg);
+
+  if (!Changed)
+    return false;
+
+  // Rewrite all MO_FrameIndex operands.
+  SmallVector<SmallSet<unsigned, 4>, 4> NewDefs(MF.getNumBlockIDs());
+  for (unsigned SS = 0, SE = SSRefs.size(); SS != SE; ++SS) {
+    bool isReg = SlotIsReg[SS];
+    int NewFI = SlotMapping[SS];
+    if (NewFI == -1 || (NewFI == (int)SS && !isReg))
+      continue;
+
+    const TargetRegisterClass *RC = LS->getIntervalRegClass(SS);
+    SmallVector<MachineInstr*, 8> &RefMIs = SSRefs[SS];
+    for (unsigned i = 0, e = RefMIs.size(); i != e; ++i)
+      if (!isReg)
+        RewriteInstruction(RefMIs[i], SS, NewFI, MF);
+      else {
+        // Rewrite to use a register instead.
+        unsigned MBBId = RefMIs[i]->getParent()->getNumber();
+        SmallSet<unsigned, 4> &Defs = NewDefs[MBBId];
+        UnfoldAndRewriteInstruction(RefMIs[i], SS, NewFI, RC, Defs, MF);
+      }
+  }
+
+  // Delete unused stack slots.
+  while (NextColor != -1) {
+    DEBUG(dbgs() << "Removing unused stack object fi#" << NextColor << "\n");
+    MFI->RemoveStackObject(NextColor);
+    NextColor = AllColors.find_next(NextColor);
+  }
+
+  return true;
+}
+
+/// AllMemRefsCanBeUnfolded - Return true if all references of the specified
+/// spill slot index can be unfolded.
+bool StackSlotColoring::AllMemRefsCanBeUnfolded(int SS) {
+  SmallVector<MachineInstr*, 8> &RefMIs = SSRefs[SS];
+  for (unsigned i = 0, e = RefMIs.size(); i != e; ++i) {
+    MachineInstr *MI = RefMIs[i];
+    if (TII->isLoadFromStackSlot(MI, SS) ||
+        TII->isStoreToStackSlot(MI, SS))
+      // Restore and spill will become copies.
+      return true;
+    if (!TII->getOpcodeAfterMemoryUnfold(MI->getOpcode(), false, false))
+      return false;
+    for (unsigned j = 0, ee = MI->getNumOperands(); j != ee; ++j) {
+      MachineOperand &MO = MI->getOperand(j);
+      if (MO.isFI() && MO.getIndex() != SS)
+        // If it uses another frameindex, we can't currently unfold it.
+        return false;
+    }
+  }
+  return true;
+}
+
+/// RewriteInstruction - Rewrite specified instruction by replacing references
+/// to old frame index with new one.
+void StackSlotColoring::RewriteInstruction(MachineInstr *MI, int OldFI,
+                                           int NewFI, MachineFunction &MF) {
+  // Update the operands.
+  for (unsigned i = 0, ee = MI->getNumOperands(); i != ee; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isFI())
+      continue;
+    int FI = MO.getIndex();
+    if (FI != OldFI)
+      continue;
+    MO.setIndex(NewFI);
+  }
+
+  // Update the memory references. This changes the MachineMemOperands
+  // directly. They may be in use by multiple instructions, however all
+  // instructions using OldFI are being rewritten to use NewFI.
+  const Value *OldSV = PseudoSourceValue::getFixedStack(OldFI);
+  const Value *NewSV = PseudoSourceValue::getFixedStack(NewFI);
+  for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
+       E = MI->memoperands_end(); I != E; ++I)
+    if ((*I)->getValue() == OldSV)
+      (*I)->setValue(NewSV);
+}
+
+/// PropagateBackward - Traverse backward and look for the definition of
+/// OldReg. If it can successfully update all of the references with NewReg,
+/// do so and return true.
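+///
+/// For example (illustrative pseudo machine code): when a store to a slot
+/// colored to register R is being removed,
+///
+///   %old = ADD %a, %b     ; def of OldReg found while walking backward
+///   STORE %old -> fi#1    ; store being eliminated
+///
+/// the def is rewritten to "R = ADD %a, %b", provided every reference to
+/// %old along the way can legally use R.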
+bool StackSlotColoring::PropagateBackward(MachineBasicBlock::iterator MII,
+                                          MachineBasicBlock *MBB,
+                                          unsigned OldReg, unsigned NewReg) {
+  if (MII == MBB->begin())
+    return false;
+
+  SmallVector<MachineOperand*, 4> Uses;
+  SmallVector<MachineOperand*, 4> Refs;
+  while (--MII != MBB->begin()) {
+    bool FoundDef = false;  // Not counting 2address def.
+
+    Uses.clear();
+    const TargetInstrDesc &TID = MII->getDesc();
+    for (unsigned i = 0, e = MII->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MII->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (Reg == 0)
+        continue;
+      if (Reg == OldReg) {
+        if (MO.isImplicit())
+          return false;
+
+        // Abort if the use is actually a sub-register def. We don't have enough
+        // information to figure out if it is really legal.
+        if (MO.getSubReg() || MII->isSubregToReg())
+          return false;
+
+        const TargetRegisterClass *RC = TID.OpInfo[i].getRegClass(TRI);
+        if (RC && !RC->contains(NewReg))
+          return false;
+
+        if (MO.isUse()) {
+          Uses.push_back(&MO);
+        } else {
+          Refs.push_back(&MO);
+          if (!MII->isRegTiedToUseOperand(i))
+            FoundDef = true;
+        }
+      } else if (TRI->regsOverlap(Reg, NewReg)) {
+        return false;
+      } else if (TRI->regsOverlap(Reg, OldReg)) {
+        if (!MO.isUse() || !MO.isKill())
+          return false;
+      }
+    }
+
+    if (FoundDef) {
+      // Found non-two-address def. Stop here.
+      for (unsigned i = 0, e = Refs.size(); i != e; ++i)
+        Refs[i]->setReg(NewReg);
+      return true;
+    }
+
+    // Two-address uses must be updated as well.
+    for (unsigned i = 0, e = Uses.size(); i != e; ++i)
+      Refs.push_back(Uses[i]);
+  }
+  return false;
+}
+
+/// PropagateForward - Traverse forward and look for the kill of OldReg. If
+/// it can successfully update all of the uses with NewReg, do so and
+/// return true.
+bool StackSlotColoring::PropagateForward(MachineBasicBlock::iterator MII,
+                                         MachineBasicBlock *MBB,
+                                         unsigned OldReg, unsigned NewReg) {
+  if (MII == MBB->end())
+    return false;
+
+  SmallVector<MachineOperand*, 4> Uses;
+  while (++MII != MBB->end()) {
+    bool FoundKill = false;
+    const TargetInstrDesc &TID = MII->getDesc();
+    for (unsigned i = 0, e = MII->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MII->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (Reg == 0)
+        continue;
+      if (Reg == OldReg) {
+        if (MO.isDef() || MO.isImplicit())
+          return false;
+
+        // Abort if the use is actually a sub-register use. We don't have enough
+        // information to figure out if it is really legal.
+        if (MO.getSubReg())
+          return false;
+
+        const TargetRegisterClass *RC = TID.OpInfo[i].getRegClass(TRI);
+        if (RC && !RC->contains(NewReg))
+          return false;
+        if (MO.isKill())
+          FoundKill = true;
+
+        Uses.push_back(&MO);
+      } else if (TRI->regsOverlap(Reg, NewReg) ||
+                 TRI->regsOverlap(Reg, OldReg))
+        return false;
+    }
+    if (FoundKill) {
+      for (unsigned i = 0, e = Uses.size(); i != e; ++i)
+        Uses[i]->setReg(NewReg);
+      return true;
+    }
+  }
+  return false;
+}
+
+/// UnfoldAndRewriteInstruction - Rewrite specified instruction by unfolding
+/// folded memory references and replacing those references with register
+/// references instead.
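+///
+/// For example (illustrative): a load "%r = LOAD fi#1" whose slot was colored
+/// to register R either disappears entirely (PropagateForward rewrites its
+/// uses to R) or becomes "%r = COPY R"; any other user of fi#1 is unfolded
+/// into register form via unfoldMemoryOperand.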
+void
+StackSlotColoring::UnfoldAndRewriteInstruction(MachineInstr *MI, int OldFI,
+                                               unsigned Reg,
+                                               const TargetRegisterClass *RC,
+                                               SmallSet<unsigned, 4> &Defs,
+                                               MachineFunction &MF) {
+  MachineBasicBlock *MBB = MI->getParent();
+  if (unsigned DstReg = TII->isLoadFromStackSlot(MI, OldFI)) {
+    if (PropagateForward(MI, MBB, DstReg, Reg)) {
+      DEBUG(dbgs() << "Eliminated load: ");
+      DEBUG(MI->dump());
+      ++NumLoadElim;
+    } else {
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY),
+              DstReg).addReg(Reg);
+      ++NumRegRepl;
+    }
+
+    if (!Defs.count(Reg)) {
+      // If this is the first use of Reg in this MBB and it wasn't previously
+      // defined in MBB, add it to livein.
+      MBB->addLiveIn(Reg);
+      Defs.insert(Reg);
+    }
+  } else if (unsigned SrcReg = TII->isStoreToStackSlot(MI, OldFI)) {
+    if (MI->killsRegister(SrcReg) && PropagateBackward(MI, MBB, SrcReg, Reg)) {
+      DEBUG(dbgs() << "Eliminated store: ");
+      DEBUG(MI->dump());
+      ++NumStoreElim;
+    } else {
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY), Reg)
+        .addReg(SrcReg);
+      ++NumRegRepl;
+    }
+
+    // Remember reg has been defined in MBB.
+    Defs.insert(Reg);
+  } else {
+    SmallVector<MachineInstr*, 4> NewMIs;
+    bool Success = TII->unfoldMemoryOperand(MF, MI, Reg, false, false, NewMIs);
+    (void)Success; // Silence compiler warning.
+    assert(Success && "Failed to unfold!");
+    MachineInstr *NewMI = NewMIs[0];
+    MBB->insert(MI, NewMI);
+    ++NumRegRepl;
+
+    if (NewMI->readsRegister(Reg)) {
+      if (!Defs.count(Reg))
+        // If this is the first use of Reg in this MBB and it wasn't previously
+        // defined in MBB, add it to livein.
+        MBB->addLiveIn(Reg);
+      Defs.insert(Reg);
+    }
+  }
+  MBB->erase(MI);
+}
+
+/// RemoveDeadStores - Scan through a basic block and look for loads followed
+/// by stores.  If they're both using the same stack slot, then the store is
+/// definitely dead.  This could obviously be much more aggressive (consider
+/// pairs with instructions between them), but such extensions might have a
+/// considerable compile time impact.
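+///
+/// For example (illustrative):
+///
+///   %r = LOAD fi#3
+///   STORE %r -> fi#3    ; dead: fi#3 already holds %r
+///
+/// The store is deleted, and the load as well when the store kills %r.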
+bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
+  // FIXME: This could be much more aggressive, but we need to investigate
+  // the compile time impact of doing so.
+  bool changed = false;
+
+  SmallVector<MachineInstr*, 4> toErase;
+
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+       I != E; ++I) {
+    if (DCELimit != -1 && (int)NumDead >= DCELimit)
+      break;
+    
+    MachineBasicBlock::iterator NextMI = llvm::next(I);
+    if (NextMI == MBB->end()) continue;
+    
+    int FirstSS, SecondSS;
+    unsigned LoadReg = 0;
+    unsigned StoreReg = 0;
+    if (!(LoadReg = TII->isLoadFromStackSlot(I, FirstSS))) continue;
+    if (!(StoreReg = TII->isStoreToStackSlot(NextMI, SecondSS))) continue;
+    if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue;
+    
+    ++NumDead;
+    changed = true;
+    
+    if (NextMI->findRegisterUseOperandIdx(LoadReg, true, 0) != -1) {
+      ++NumDead;
+      toErase.push_back(I);
+    }
+    
+    toErase.push_back(NextMI);
+    ++I;
+  }
+  
+  for (SmallVector<MachineInstr*, 4>::iterator I = toErase.begin(),
+       E = toErase.end(); I != E; ++I)
+    (*I)->eraseFromParent();
+  
+  return changed;
+}
+
+
+bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+      dbgs() << "********** Stack Slot Coloring **********\n"
+             << "********** Function: " 
+             << MF.getFunction()->getName() << '\n';
+    });
+
+  MFI = MF.getFrameInfo();
+  MRI = &MF.getRegInfo(); 
+  TII = MF.getTarget().getInstrInfo();
+  TRI = MF.getTarget().getRegisterInfo();
+  LS = &getAnalysis<LiveStacks>();
+  VRM = &getAnalysis<VirtRegMap>();
+  loopInfo = &getAnalysis<MachineLoopInfo>();
+
+  bool Changed = false;
+
+  unsigned NumSlots = LS->getNumIntervals();
+  if (NumSlots < 2) {
+    if (NumSlots == 0 || !VRM->HasUnusedRegisters())
+      // Nothing to do!
+      return false;
+  }
+
+  // If there are calls to setjmp or sigsetjmp, don't perform stack slot
+  // coloring. The stack could be modified before the longjmp is executed,
+  // resulting in the wrong value being used afterwards. (See
+  // <rdar://problem/8007500>.)
+  if (MF.callsSetJmp())
+    return false;
+
+  // Gather spill slot references
+  ScanForSpillSlotRefs(MF);
+  InitializeSlots();
+  Changed = ColorSlots(MF);
+
+  NextColor = -1;
+  SSIntervals.clear();
+  for (unsigned i = 0, e = SSRefs.size(); i != e; ++i)
+    SSRefs[i].clear();
+  SSRefs.clear();
+  OrigAlignments.clear();
+  OrigSizes.clear();
+  AllColors.clear();
+  UsedColors.clear();
+  for (unsigned i = 0, e = Assignments.size(); i != e; ++i)
+    Assignments[i].clear();
+  Assignments.clear();
+
+  if (Changed) {
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+      Changed |= RemoveDeadStores(I);
+  }
+
+  return Changed;
+}
diff --git a/src/LLVM/lib/CodeGen/StrongPHIElimination.cpp b/src/LLVM/lib/CodeGen/StrongPHIElimination.cpp
new file mode 100644
index 0000000..5e2d414
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/StrongPHIElimination.cpp
@@ -0,0 +1,1046 @@
+//===- StrongPhiElimination.cpp - Eliminate PHI nodes by inserting copies -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates machine instruction PHI nodes by inserting copy
+// instructions, using an intelligent copy-folding technique based on
+// dominator information.  This technique is derived from:
+// 
+//    Budimlic, et al. Fast copy coalescing and live-range identification.
+//    In Proceedings of the ACM SIGPLAN 2002 Conference on Programming Language
+//    Design and Implementation (Berlin, Germany, June 17 - 19, 2002).
+//    PLDI '02. ACM, New York, NY, 25-32.
+//    DOI= http://doi.acm.org/10.1145/512529.512534
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "strongphielim"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterCoalescer.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+namespace {
+  struct StrongPHIElimination : public MachineFunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    StrongPHIElimination() : MachineFunctionPass(ID) {}
+
+    // Waiting holds, for each MBB, the set of copies that need to
+    // be inserted into that MBB.
+    DenseMap<MachineBasicBlock*,
+             std::multimap<unsigned, unsigned> > Waiting;
+    
+    // Stacks holds the renaming stack for each register
+    std::map<unsigned, std::vector<unsigned> > Stacks;
+    
+    // Registers in UsedByAnother are PHI nodes that are themselves
+    // used as operands to another PHI node
+    std::set<unsigned> UsedByAnother;
+    
+    // RenameSets is a map from a PHI-defined register to its input
+    // registers to be coalesced, along with the predecessor block for
+    // those input registers.
+    std::map<unsigned, std::map<unsigned, MachineBasicBlock*> > RenameSets;
+    
+    // PhiValueNumber holds the ID numbers of the VNs for each phi that we're
+    // eliminating, indexed by the register defined by that phi.
+    std::map<unsigned, unsigned> PhiValueNumber;
+
+    // Store the DFS-in number of each block
+    DenseMap<MachineBasicBlock*, unsigned> preorder;
+    
+    // Store the DFS-out number of each block
+    DenseMap<MachineBasicBlock*, unsigned> maxpreorder;
+
+    bool runOnMachineFunction(MachineFunction &Fn);
+    
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<SlotIndexes>();
+      AU.addPreserved<SlotIndexes>();
+      AU.addRequired<LiveIntervals>();
+      
+      // TODO: Actually make this true.
+      AU.addPreserved<LiveIntervals>();
+      AU.addPreserved<RegisterCoalescer>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+    
+    virtual void releaseMemory() {
+      preorder.clear();
+      maxpreorder.clear();
+      
+      Waiting.clear();
+      Stacks.clear();
+      UsedByAnother.clear();
+      RenameSets.clear();
+    }
+
+  private:
+    
+    /// DomForestNode - Represents a node in the "dominator forest".  This is
+    /// a forest in which the nodes represent registers and the edges
+    /// represent a dominance relation in the block defining those registers.
+    struct DomForestNode {
+    private:
+      // Store references to our children
+      std::vector<DomForestNode*> children;
+      // The register we represent
+      unsigned reg;
+      
+      // Add another node as our child
+      void addChild(DomForestNode* DFN) { children.push_back(DFN); }
+      
+    public:
+      typedef std::vector<DomForestNode*>::iterator iterator;
+      
+      // Create a DomForestNode by providing the register it represents, and
+      // the node to be its parent.  The virtual root node has register 0
+      // and a null parent.
+      DomForestNode(unsigned r, DomForestNode* parent) : reg(r) {
+        if (parent)
+          parent->addChild(this);
+      }
+      
+      ~DomForestNode() {
+        for (iterator I = begin(), E = end(); I != E; ++I)
+          delete *I;
+      }
+      
+      /// getReg - Return the register that this node represents
+      inline unsigned getReg() { return reg; }
+      
+      // Provide iterator access to our children
+      inline DomForestNode::iterator begin() { return children.begin(); }
+      inline DomForestNode::iterator end() { return children.end(); }
+    };
+    
+    void computeDFS(MachineFunction& MF);
+    void processBlock(MachineBasicBlock* MBB);
+    
+    std::vector<DomForestNode*> computeDomForest(
+                           std::map<unsigned, MachineBasicBlock*>& instrs,
+                                                 MachineRegisterInfo& MRI);
+    void processPHIUnion(MachineInstr* Inst,
+                         std::map<unsigned, MachineBasicBlock*>& PHIUnion,
+                         std::vector<StrongPHIElimination::DomForestNode*>& DF,
+                         std::vector<std::pair<unsigned, unsigned> >& locals);
+    void ScheduleCopies(MachineBasicBlock* MBB, std::set<unsigned>& pushed);
+    void InsertCopies(MachineDomTreeNode* MBB,
+                      SmallPtrSet<MachineBasicBlock*, 16>& v);
+    bool mergeLiveIntervals(unsigned primary, unsigned secondary);
+  };
+}
+
+char StrongPHIElimination::ID = 0;
+static RegisterPass<StrongPHIElimination>
+X("strong-phi-node-elimination",
+  "Eliminate PHI nodes for register allocation, intelligently");
+
+char &llvm::StrongPHIEliminationID = StrongPHIElimination::ID;
+
+/// computeDFS - Computes the DFS-in and DFS-out numbers of the dominator tree
+/// of the given MachineFunction.  These numbers are then used in other parts
+/// of the PHI elimination process.
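+///
+/// For example (illustrative): for a dominator tree A -> {B, C} visited in
+/// that order, the (DFS-in, DFS-out) numbers are A:(1,3), B:(2,2), C:(3,3);
+/// X dominates Y exactly when in(X) <= in(Y) <= out(X).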
+void StrongPHIElimination::computeDFS(MachineFunction& MF) {
+  SmallPtrSet<MachineDomTreeNode*, 8> frontier;
+  SmallPtrSet<MachineDomTreeNode*, 8> visited;
+  
+  unsigned time = 0;
+  
+  MachineDominatorTree& DT = getAnalysis<MachineDominatorTree>();
+  
+  MachineDomTreeNode* node = DT.getRootNode();
+  
+  std::vector<MachineDomTreeNode*> worklist;
+  worklist.push_back(node);
+  
+  while (!worklist.empty()) {
+    MachineDomTreeNode* currNode = worklist.back();
+    
+    if (!frontier.count(currNode)) {
+      frontier.insert(currNode);
+      ++time;
+      preorder.insert(std::make_pair(currNode->getBlock(), time));
+    }
+    
+    bool inserted = false;
+    for (MachineDomTreeNode::iterator I = currNode->begin(), E = currNode->end();
+         I != E; ++I)
+      if (!frontier.count(*I) && !visited.count(*I)) {
+        worklist.push_back(*I);
+        inserted = true;
+        break;
+      }
+    
+    if (!inserted) {
+      frontier.erase(currNode);
+      visited.insert(currNode);
+      maxpreorder.insert(std::make_pair(currNode->getBlock(), time));
+      
+      worklist.pop_back();
+    }
+  }
+}
+
+namespace {
+
+/// PreorderSorter - a helper class that is used to sort registers
+/// according to the preorder number of their defining blocks
+class PreorderSorter {
+private:
+  DenseMap<MachineBasicBlock*, unsigned>& preorder;
+  MachineRegisterInfo& MRI;
+  
+public:
+  PreorderSorter(DenseMap<MachineBasicBlock*, unsigned>& p,
+                MachineRegisterInfo& M) : preorder(p), MRI(M) { }
+  
+  bool operator()(unsigned A, unsigned B) {
+    if (A == B)
+      return false;
+    
+    MachineBasicBlock* ABlock = MRI.getVRegDef(A)->getParent();
+    MachineBasicBlock* BBlock = MRI.getVRegDef(B)->getParent();
+    
+    return preorder[ABlock] < preorder[BBlock];
+  }
+};
+
+}
+
+/// computeDomForest - compute the subforest of the DomTree corresponding
+/// to the defining blocks of the registers in question
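+///
+/// For example (illustrative): for vregs %v1, %v2, %v3 where %v1's defining
+/// block dominates those of %v2 and %v3, while the latter two blocks are
+/// unrelated, the resulting forest is the single tree %v1 -> {%v2, %v3}.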
+std::vector<StrongPHIElimination::DomForestNode*>
+StrongPHIElimination::computeDomForest(
+                  std::map<unsigned, MachineBasicBlock*>& regs, 
+                                       MachineRegisterInfo& MRI) {
+  // Begin by creating a virtual root node, since the actual results
+  // may well be a forest.  Assume this node has maximum DFS-out number.
+  DomForestNode* VirtualRoot = new DomForestNode(0, 0);
+  maxpreorder.insert(std::make_pair((MachineBasicBlock*)0, ~0UL));
+  
+  // Populate a worklist with the registers
+  std::vector<unsigned> worklist;
+  worklist.reserve(regs.size());
+  for (std::map<unsigned, MachineBasicBlock*>::iterator I = regs.begin(),
+       E = regs.end(); I != E; ++I)
+    worklist.push_back(I->first);
+  
+  // Sort the registers by the DFS-in number of their defining block
+  PreorderSorter PS(preorder, MRI);
+  std::sort(worklist.begin(), worklist.end(), PS);
+  
+  // Create a "current parent" stack, and put the virtual root on top of it
+  DomForestNode* CurrentParent = VirtualRoot;
+  std::vector<DomForestNode*> stack;
+  stack.push_back(VirtualRoot);
+  
+  // Iterate over all the registers in the previously computed order
+  for (std::vector<unsigned>::iterator I = worklist.begin(), E = worklist.end();
+       I != E; ++I) {
+    unsigned pre = preorder[MRI.getVRegDef(*I)->getParent()];
+    MachineBasicBlock* parentBlock = CurrentParent->getReg() ?
+                 MRI.getVRegDef(CurrentParent->getReg())->getParent() :
+                 0;
+    
+    // If the DFS-in number of the register is greater than the DFS-out number
+    // of the current parent, repeatedly pop the parent stack until it isn't.
+    while (pre > maxpreorder[parentBlock]) {
+      stack.pop_back();
+      CurrentParent = stack.back();
+      
+      parentBlock = CurrentParent->getReg() ?
+                   MRI.getVRegDef(CurrentParent->getReg())->getParent() :
+                   0;
+    }
+    
+    // Now that we've found the appropriate parent, create a DomForestNode for
+    // this register and attach it to the forest
+    DomForestNode* child = new DomForestNode(*I, CurrentParent);
+    
+    // Push this new node on the "current parent" stack
+    stack.push_back(child);
+    CurrentParent = child;
+  }
+  
+  // Return a vector containing the children of the virtual root node
+  std::vector<DomForestNode*> ret;
+  ret.insert(ret.end(), VirtualRoot->begin(), VirtualRoot->end());
+  return ret;
+}
+
+/// isLiveIn - helper method that determines, from a regno, if a register
+/// is live into a block
+static bool isLiveIn(unsigned r, MachineBasicBlock* MBB,
+                     LiveIntervals& LI) {
+  LiveInterval& I = LI.getOrCreateInterval(r);
+  SlotIndex idx = LI.getMBBStartIdx(MBB);
+  return I.liveAt(idx);
+}
+
+/// isLiveOut - helper method that determines, from a regno, if a register is
+/// live out of a block.
+static bool isLiveOut(unsigned r, MachineBasicBlock* MBB,
+                      LiveIntervals& LI) {
+  for (MachineBasicBlock::succ_iterator PI = MBB->succ_begin(),
+       E = MBB->succ_end(); PI != E; ++PI)
+    if (isLiveIn(r, *PI, LI))
+      return true;
+  
+  return false;
+}
+
+/// interferes - checks for local interferences by scanning a block.  The only
+/// tricky parameter is 'mode', which tells it the relationship between the two
+/// registers: 0 - defined in the same block, 1 - first properly dominates
+/// second, 2 - second properly dominates first.
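+///
+/// For example (illustrative), in mode 1: a kill of the first register seen
+/// before the definition of the second means the ranges do not interfere;
+/// reaching the second register's definition with no such kill is an
+/// interference.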
+static bool interferes(unsigned a, unsigned b, MachineBasicBlock* scan,
+                       LiveIntervals& LV, unsigned mode) {
+  MachineInstr* def = 0;
+  MachineInstr* kill = 0;
+  
+  // The code is still in SSA form at this point, so there is only one
+  // definition per VReg.  Thus we can safely use MRI->getVRegDef().
+  const MachineRegisterInfo* MRI = &scan->getParent()->getRegInfo();
+  
+  bool interference = false;
+  
+  // Walk the block, checking for interferences
+  for (MachineBasicBlock::iterator MBI = scan->begin(), MBE = scan->end();
+       MBI != MBE; ++MBI) {
+    MachineInstr* curr = MBI;
+    
+    // Same defining block...
+    if (mode == 0) {
+      if (curr == MRI->getVRegDef(a)) {
+        // If we find our first definition, save it
+        if (!def) {
+          def = curr;
+        // If there's already an unkilled definition, then 
+        // this is an interference
+        } else if (!kill) {
+          interference = true;
+          break;
+        // If there's a definition followed by a KillInst, then
+        // they can't interfere
+        } else {
+          interference = false;
+          break;
+        }
+      // Symmetric with the above
+      } else if (curr == MRI->getVRegDef(b)) {
+        if (!def) {
+          def = curr;
+        } else if (!kill) {
+          interference = true;
+          break;
+        } else {
+          interference = false;
+          break;
+        }
+      // Store KillInsts if they match up with the definition
+      } else if (curr->killsRegister(a)) {
+        if (def == MRI->getVRegDef(a)) {
+          kill = curr;
+        }
+      } else if (curr->killsRegister(b)) {
+        if (def == MRI->getVRegDef(b)) {
+          kill = curr;
+        }
+      }
+    // First properly dominates second...
+    } else if (mode == 1) {
+      if (curr == MRI->getVRegDef(b)) {
+        // Definition of second without kill of first is an interference
+        if (!kill) {
+          interference = true;
+          break;
+        // Definition after a kill is a non-interference
+        } else {
+          interference = false;
+          break;
+        }
+      // Save KillInsts of First
+      } else if (curr->killsRegister(a)) {
+        kill = curr;
+      }
+    // Symmetric with the above
+    } else if (mode == 2) {
+      if (curr == MRI->getVRegDef(a)) {
+        if (!kill) {
+          interference = true;
+          break;
+        } else {
+          interference = false;
+          break;
+        }
+      } else if (curr->killsRegister(b)) {
+        kill = curr;
+      }
+    }
+  }
+  
+  return interference;
+}
+
+/// processBlock - Determine how to break up PHIs in the current block.  Each
+/// PHI is broken up by some combination of renaming its operands and inserting
+/// copies.  This method is responsible for determining which operands receive
+/// which treatment.
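+///
+/// For example (illustrative): given "%d = PHI [%a, BB1], [%b, BB2]", if %a
+/// shows no interference it joins the renaming set for %d, while %b, say
+/// live into the PHI's block, instead has the copy (%b -> %d) queued in
+/// Waiting[BB2] for later insertion at the end of BB2.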
+void StrongPHIElimination::processBlock(MachineBasicBlock* MBB) {
+  LiveIntervals& LI = getAnalysis<LiveIntervals>();
+  MachineRegisterInfo& MRI = MBB->getParent()->getRegInfo();
+  
+  // Holds names that have been added to a set in any PHI within this block
+  // before the current one.
+  std::set<unsigned> ProcessedNames;
+  
+  // Iterate over all the PHI nodes in this block
+  MachineBasicBlock::iterator P = MBB->begin();
+  while (P != MBB->end() && P->isPHI()) {
+    unsigned DestReg = P->getOperand(0).getReg();
+    
+    // Don't bother doing PHI elimination for dead PHIs.
+    if (P->registerDefIsDead(DestReg)) {
+      ++P;
+      continue;
+    }
+
+    LiveInterval& PI = LI.getOrCreateInterval(DestReg);
+    SlotIndex pIdx = LI.getInstructionIndex(P).getDefIndex();
+    VNInfo* PVN = PI.getLiveRangeContaining(pIdx)->valno;
+    PhiValueNumber.insert(std::make_pair(DestReg, PVN->id));
+
+    // PHIUnion is the set of incoming registers to the PHI node that
+    // are going to be renamed rather than having copies inserted.  This set
+    // is refined over the course of this function.  UnionedBlocks is the set
+    // of corresponding MBBs.
+    std::map<unsigned, MachineBasicBlock*> PHIUnion;
+    SmallPtrSet<MachineBasicBlock*, 8> UnionedBlocks;
+  
+    // Iterate over the operands of the PHI node
+    for (int i = P->getNumOperands() - 1; i >= 2; i-=2) {
+      unsigned SrcReg = P->getOperand(i-1).getReg();
+      
+      // Don't need to try to coalesce a register with itself.
+      if (SrcReg == DestReg) {
+        ProcessedNames.insert(SrcReg);
+        continue;
+      }
+      
+      // We don't need to insert copies for implicit_defs.
+      MachineInstr* DefMI = MRI.getVRegDef(SrcReg);
+      if (DefMI->isImplicitDef())
+        ProcessedNames.insert(SrcReg);
+    
+      // Check for trivial interferences via liveness information, allowing us
+      // to avoid extra work later.  Any registers that interfere cannot both
+      // be in the renaming set, so choose one and add copies for it instead.
+      // The conditions are:
+      //   1) if the operand is live into the PHI node's block OR
+      //   2) if the PHI node is live out of the operand's defining block OR
+      //   3) if the operand is itself a PHI node and the original PHI is
+      //      live into the operand's defining block OR
+      //   4) if the operand is already being renamed for another PHI node
+      //      in this block OR
+      //   5) if any two operands are defined in the same block, insert copies
+      //      for one of them
+      if (isLiveIn(SrcReg, P->getParent(), LI) ||
+          isLiveOut(P->getOperand(0).getReg(),
+                    MRI.getVRegDef(SrcReg)->getParent(), LI) ||
+          ( MRI.getVRegDef(SrcReg)->isPHI() &&
+            isLiveIn(P->getOperand(0).getReg(),
+                     MRI.getVRegDef(SrcReg)->getParent(), LI) ) ||
+          ProcessedNames.count(SrcReg) ||
+          UnionedBlocks.count(MRI.getVRegDef(SrcReg)->getParent())) {
+        
+        // Add a copy for the selected register
+        MachineBasicBlock* From = P->getOperand(i).getMBB();
+        Waiting[From].insert(std::make_pair(SrcReg, DestReg));
+        UsedByAnother.insert(SrcReg);
+      } else {
+        // Otherwise, add it to the renaming set
+        PHIUnion.insert(std::make_pair(SrcReg,P->getOperand(i).getMBB()));
+        UnionedBlocks.insert(MRI.getVRegDef(SrcReg)->getParent());
+      }
+    }
+    
+    // Compute the dominator forest for the renaming set.  This is a forest
+    // where the nodes are the registers and the edges represent dominance 
+    // relations between the defining blocks of the registers
+    std::vector<StrongPHIElimination::DomForestNode*> DF = 
+                                                computeDomForest(PHIUnion, MRI);
+    
+    // Walk DomForest to resolve interferences at an inter-block level.  This
+    // will remove registers from the renaming set (and insert copies for them)
+    // if interferences are found.
+    std::vector<std::pair<unsigned, unsigned> > localInterferences;
+    processPHIUnion(P, PHIUnion, DF, localInterferences);
+    
+    // If one of the inputs is defined in the same block as the current PHI
+    // then we need to check for a local interference between that input and
+    // the PHI.
+    for (std::map<unsigned, MachineBasicBlock*>::iterator I = PHIUnion.begin(),
+         E = PHIUnion.end(); I != E; ++I)
+      if (MRI.getVRegDef(I->first)->getParent() == P->getParent())
+        localInterferences.push_back(std::make_pair(I->first,
+                                                    P->getOperand(0).getReg()));
+    
+    // The dominator forest walk may have returned some register pairs whose
+    // interference cannot be determined from dominator analysis.  We now 
+    // examine these pairs for local interferences.
+    for (std::vector<std::pair<unsigned, unsigned> >::iterator I =
+        localInterferences.begin(), E = localInterferences.end(); I != E; ++I) {
+      std::pair<unsigned, unsigned> p = *I;
+      
+      MachineDominatorTree& MDT = getAnalysis<MachineDominatorTree>();
+      
+      // Determine the block we need to scan and the relationship between
+      // the two registers
+      MachineBasicBlock* scan = 0;
+      unsigned mode = 0;
+      if (MRI.getVRegDef(p.first)->getParent() ==
+          MRI.getVRegDef(p.second)->getParent()) {
+        scan = MRI.getVRegDef(p.first)->getParent();
+        mode = 0; // Same block
+      } else if (MDT.dominates(MRI.getVRegDef(p.first)->getParent(),
+                               MRI.getVRegDef(p.second)->getParent())) {
+        scan = MRI.getVRegDef(p.second)->getParent();
+        mode = 1; // First dominates second
+      } else {
+        scan = MRI.getVRegDef(p.first)->getParent();
+        mode = 2; // Second dominates first
+      }
+      
+      // If there's an interference, we need to insert copies
+      if (interferes(p.first, p.second, scan, LI, mode)) {
+        // Insert copies for First
+        for (int i = P->getNumOperands() - 1; i >= 2; i-=2) {
+          if (P->getOperand(i-1).getReg() == p.first) {
+            unsigned SrcReg = p.first;
+            MachineBasicBlock* From = P->getOperand(i).getMBB();
+            
+            Waiting[From].insert(std::make_pair(SrcReg,
+                                                P->getOperand(0).getReg()));
+            UsedByAnother.insert(SrcReg);
+            
+            PHIUnion.erase(SrcReg);
+          }
+        }
+      }
+    }
+    
+    // Add the renaming set for this PHI node to our overall renaming information
+    for (std::map<unsigned, MachineBasicBlock*>::iterator QI = PHIUnion.begin(),
+         QE = PHIUnion.end(); QI != QE; ++QI) {
+      DEBUG(dbgs() << "Adding Renaming: " << QI->first << " -> "
+                   << P->getOperand(0).getReg() << "\n");
+    }
+    
+    RenameSets.insert(std::make_pair(P->getOperand(0).getReg(), PHIUnion));
+    
+    // Remember which registers are already renamed, so that we don't try to 
+    // rename them for another PHI node in this block
+    for (std::map<unsigned, MachineBasicBlock*>::iterator I = PHIUnion.begin(),
+         E = PHIUnion.end(); I != E; ++I)
+      ProcessedNames.insert(I->first);
+    
+    ++P;
+  }
+}
+
+/// processPHIUnion - Take a set of candidate registers to be coalesced when
+/// decomposing the PHI instruction.  Use the DominanceForest to remove the ones
+/// that are known to interfere, and flag others that need to be checked for
+/// local interferences.
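+///
+/// For illustration: given candidates a and b where a's defining block
+/// dominates b's, if a is live-out of b's defining block the two values
+/// overlap and cannot share a register, so a copy is inserted for a and it
+/// is removed from the renaming set. If a is only live-in to b's block,
+/// any interference is local to that block, so the pair is pushed onto
+/// 'locals' for the later per-block scan. (Register names illustrative.)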
+void StrongPHIElimination::processPHIUnion(MachineInstr* Inst,
+                        std::map<unsigned, MachineBasicBlock*>& PHIUnion,
+                        std::vector<StrongPHIElimination::DomForestNode*>& DF,
+                        std::vector<std::pair<unsigned, unsigned> >& locals) {
+  
+  std::vector<DomForestNode*> worklist(DF.begin(), DF.end());
+  SmallPtrSet<DomForestNode*, 4> visited;
+  
+  // Code is still in SSA form, so we can use MRI::getVRegDef()
+  MachineRegisterInfo& MRI = Inst->getParent()->getParent()->getRegInfo();
+  
+  LiveIntervals& LI = getAnalysis<LiveIntervals>();
+  unsigned DestReg = Inst->getOperand(0).getReg();
+  
+  // DFS walk on the DomForest
+  while (!worklist.empty()) {
+    DomForestNode* DFNode = worklist.back();
+    
+    visited.insert(DFNode);
+    
+    bool inserted = false;
+    for (DomForestNode::iterator CI = DFNode->begin(), CE = DFNode->end();
+         CI != CE; ++CI) {
+      DomForestNode* child = *CI;   
+      
+      // If the current node is live-out of the defining block of one of its
+      // children, insert a copy for it.  NOTE: The paper actually calls for
+      // a more elaborate heuristic for determining whether to insert copies
+      // for the child or the parent.  In the interest of simplicity, we're
+      // just always choosing the parent.
+      if (isLiveOut(DFNode->getReg(),
+          MRI.getVRegDef(child->getReg())->getParent(), LI)) {
+        // Insert copies for parent
+        for (int i = Inst->getNumOperands() - 1; i >= 2; i-=2) {
+          if (Inst->getOperand(i-1).getReg() == DFNode->getReg()) {
+            unsigned SrcReg = DFNode->getReg();
+            MachineBasicBlock* From = Inst->getOperand(i).getMBB();
+            
+            Waiting[From].insert(std::make_pair(SrcReg, DestReg));
+            UsedByAnother.insert(SrcReg);
+            
+            PHIUnion.erase(SrcReg);
+          }
+        }
+      
+      // If a node is live-in to the defining block of one of its children, but
+      // not live-out, then we need to scan that block for local interferences.
+      } else if (isLiveIn(DFNode->getReg(),
+                          MRI.getVRegDef(child->getReg())->getParent(), LI) ||
+                 MRI.getVRegDef(DFNode->getReg())->getParent() ==
+                                 MRI.getVRegDef(child->getReg())->getParent()) {
+        // Add (p, c) to possible local interferences
+        locals.push_back(std::make_pair(DFNode->getReg(), child->getReg()));
+      }
+      
+      if (!visited.count(child)) {
+        worklist.push_back(child);
+        inserted = true;
+      }
+    }
+    
+    if (!inserted) worklist.pop_back();
+  }
+}
+
+/// ScheduleCopies - Insert copies into predecessor blocks, scheduling
+/// them properly so as to avoid the 'lost copy' and the 'virtual swap'
+/// problems.
+///
+/// Based on "Practical Improvements to the Construction and Destruction
+/// of Static Single Assignment Form" by Briggs, et al.
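+///
+/// For example, a 'virtual swap' arises when two lowered PHI copies form a
+/// cycle such as
+///   a = b
+///   b = a
+/// which no emission order satisfies; the cycle is broken by copying one
+/// value to a fresh temporary first (t = b; b = a; a = t). The 'lost copy'
+/// problem is avoided the same way: a value still live-out of the block is
+/// saved to a temporary before the copy overwrites it. (Names illustrative.)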
+void StrongPHIElimination::ScheduleCopies(MachineBasicBlock* MBB,
+                                          std::set<unsigned>& pushed) {
+  // FIXME: This function needs to update LiveIntervals
+  std::multimap<unsigned, unsigned>& copy_set = Waiting[MBB];
+  
+  std::multimap<unsigned, unsigned> worklist;
+  std::map<unsigned, unsigned> map;
+  
+  // Setup worklist of initial copies
+  for (std::multimap<unsigned, unsigned>::iterator I = copy_set.begin(),
+       E = copy_set.end(); I != E; ) {
+    map.insert(std::make_pair(I->first, I->first));
+    map.insert(std::make_pair(I->second, I->second));
+         
+    if (!UsedByAnother.count(I->second)) {
+      worklist.insert(*I);
+      
+      // Avoid iterator invalidation
+      std::multimap<unsigned, unsigned>::iterator OI = I;
+      ++I;
+      copy_set.erase(OI);
+    } else {
+      ++I;
+    }
+  }
+  
+  LiveIntervals& LI = getAnalysis<LiveIntervals>();
+  MachineFunction* MF = MBB->getParent();
+  MachineRegisterInfo& MRI = MF->getRegInfo();
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  
+  SmallVector<std::pair<unsigned, MachineInstr*>, 4> InsertedPHIDests;
+  
+  // Iterate over the worklist, inserting copies
+  while (!worklist.empty() || !copy_set.empty()) {
+    while (!worklist.empty()) {
+      std::multimap<unsigned, unsigned>::iterator WI = worklist.begin();
+      std::pair<unsigned, unsigned> curr = *WI;
+      worklist.erase(WI);
+      
+      const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(curr.first);
+      
+      if (isLiveOut(curr.second, MBB, LI)) {
+        // Create a temporary
+        unsigned t = MF->getRegInfo().createVirtualRegister(RC);
+        
+        // Insert copy from curr.second to a temporary at
+        // the PHI defining curr.second
+        MachineBasicBlock::iterator PI = MRI.getVRegDef(curr.second);
+        BuildMI(*PI->getParent(), PI, DebugLoc(), TII->get(TargetOpcode::COPY),
+                t).addReg(curr.second);
+        DEBUG(dbgs() << "Inserted copy from " << curr.second << " to " << t
+                     << "\n");
+        
+        // Push temporary on Stacks
+        Stacks[curr.second].push_back(t);
+        
+        // Insert curr.second in pushed
+        pushed.insert(curr.second);
+        
+        // Create a live interval for this temporary
+        InsertedPHIDests.push_back(std::make_pair(t, --PI));
+      }
+      
+      // Insert copy from map[curr.first] to curr.second
+      BuildMI(*MBB, MBB->getFirstTerminator(), DebugLoc(),
+             TII->get(TargetOpcode::COPY), curr.second).addReg(map[curr.first]);
+      map[curr.first] = curr.second;
+      DEBUG(dbgs() << "Inserted copy from " << curr.first << " to "
+                   << curr.second << "\n");
+      
+      // Push this copy onto InsertedPHICopies so we can
+      // update LiveIntervals with it.
+      MachineBasicBlock::iterator MI = MBB->getFirstTerminator();
+      InsertedPHIDests.push_back(std::make_pair(curr.second, --MI));
+      
+      // If curr.first is a destination in copy_set...
+      for (std::multimap<unsigned, unsigned>::iterator I = copy_set.begin(),
+           E = copy_set.end(); I != E; )
+        if (curr.first == I->second) {
+          std::pair<unsigned, unsigned> temp = *I;
+          worklist.insert(temp);
+          
+          // Avoid iterator invalidation
+          std::multimap<unsigned, unsigned>::iterator OI = I;
+          ++I;
+          copy_set.erase(OI);
+          
+          break;
+        } else {
+          ++I;
+        }
+    }
+    
+    if (!copy_set.empty()) {
+      std::multimap<unsigned, unsigned>::iterator CI = copy_set.begin();
+      std::pair<unsigned, unsigned> curr = *CI;
+      worklist.insert(curr);
+      copy_set.erase(CI);
+      
+      LiveInterval& I = LI.getInterval(curr.second);
+      MachineBasicBlock::iterator term = MBB->getFirstTerminator();
+      SlotIndex endIdx = SlotIndex();
+      if (term != MBB->end())
+        endIdx = LI.getInstructionIndex(term);
+      else
+        endIdx = LI.getMBBEndIdx(MBB);
+      
+      if (I.liveAt(endIdx)) {
+        const TargetRegisterClass *RC =
+                                       MF->getRegInfo().getRegClass(curr.first);
+        
+        // Insert a copy from dest to a new temporary t at the end of b
+        unsigned t = MF->getRegInfo().createVirtualRegister(RC);
+        BuildMI(*MBB, MBB->getFirstTerminator(), DebugLoc(),
+                TII->get(TargetOpcode::COPY), t).addReg(curr.second);
+        map[curr.second] = t;
+        
+        MachineBasicBlock::iterator TI = MBB->getFirstTerminator();
+        InsertedPHIDests.push_back(std::make_pair(t, --TI));
+      }
+    }
+  }
+  
+  // Renumber the instructions so that we can perform the index computations
+  // needed to create new live intervals.
+  LI.renumber();
+  
+  // For copies that we inserted at the ends of predecessors, we construct
+  // live intervals.  This is pretty easy, since we know that the destination
+  // register cannot have been live at that point previously.  We just have
+  // to make sure that, for registers that serve as inputs to more than one
+  // PHI, we don't create multiple overlapping live intervals.
+  std::set<unsigned> RegHandled;
+  for (SmallVector<std::pair<unsigned, MachineInstr*>, 4>::iterator I =
+       InsertedPHIDests.begin(), E = InsertedPHIDests.end(); I != E; ++I) {
+    if (RegHandled.insert(I->first).second) {
+      LiveInterval& Int = LI.getOrCreateInterval(I->first);
+      SlotIndex instrIdx = LI.getInstructionIndex(I->second);
+      if (Int.liveAt(instrIdx.getDefIndex()))
+        Int.removeRange(instrIdx.getDefIndex(),
+                        LI.getMBBEndIdx(I->second->getParent()).getNextSlot(),
+                        true);
+      
+      LiveRange R = LI.addLiveRangeToEndOfBlock(I->first, I->second);
+      R.valno->setCopy(I->second);
+      R.valno->def = LI.getInstructionIndex(I->second).getDefIndex();
+    }
+  }
+}
+
+/// InsertCopies - insert copies into MBB and all of its successors
+void StrongPHIElimination::InsertCopies(MachineDomTreeNode* MDTN,
+                                 SmallPtrSet<MachineBasicBlock*, 16>& visited) {
+  MachineBasicBlock* MBB = MDTN->getBlock();
+  visited.insert(MBB);
+  
+  std::set<unsigned> pushed;
+  
+  LiveIntervals& LI = getAnalysis<LiveIntervals>();
+  // Rewrite register uses from Stacks
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+      I != E; ++I) {
+    if (I->isPHI())
+      continue;
+    
+    for (unsigned i = 0; i < I->getNumOperands(); ++i)
+      if (I->getOperand(i).isReg() &&
+          Stacks[I->getOperand(i).getReg()].size()) {
+        // Remove the live range for the old vreg.
+        LiveInterval& OldInt = LI.getInterval(I->getOperand(i).getReg());
+        LiveInterval::iterator OldLR =
+          OldInt.FindLiveRangeContaining(LI.getInstructionIndex(I).getUseIndex());
+        if (OldLR != OldInt.end())
+          OldInt.removeRange(*OldLR, true);
+        
+        // Change the register
+        I->getOperand(i).setReg(Stacks[I->getOperand(i).getReg()].back());
+        
+        // Add a live range for the new vreg
+        LiveInterval& Int = LI.getInterval(I->getOperand(i).getReg());
+        VNInfo* FirstVN = *Int.vni_begin();
+        FirstVN->setHasPHIKill(false);
+        LiveRange LR (LI.getMBBStartIdx(I->getParent()),
+                      LI.getInstructionIndex(I).getUseIndex().getNextSlot(),
+                      FirstVN);
+        
+        Int.addRange(LR);
+      }
+  }    
+  
+  // Schedule the copies for this block
+  ScheduleCopies(MBB, pushed);
+  
+  // Recur down the dominator tree.
+  for (MachineDomTreeNode::iterator I = MDTN->begin(),
+       E = MDTN->end(); I != E; ++I)
+    if (!visited.count((*I)->getBlock()))
+      InsertCopies(*I, visited);
+  
+  // As we exit this block, pop the names we pushed while processing it
+  for (std::set<unsigned>::iterator I = pushed.begin(), 
+       E = pushed.end(); I != E; ++I)
+    Stacks[*I].pop_back();
+}
+
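+/// mergeLiveIntervals - Merge the live interval of 'secondary' into that of
+/// 'primary'. The merge only proceeds when none of secondary's live ranges
+/// overlaps a range already present in primary; otherwise both intervals are
+/// left untouched and false is returned.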
+bool StrongPHIElimination::mergeLiveIntervals(unsigned primary,
+                                              unsigned secondary) {
+  
+  LiveIntervals& LI = getAnalysis<LiveIntervals>();
+  LiveInterval& LHS = LI.getOrCreateInterval(primary);
+  LiveInterval& RHS = LI.getOrCreateInterval(secondary);
+  
+  LI.renumber();
+  
+  DenseMap<VNInfo*, VNInfo*> VNMap;
+  for (LiveInterval::iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) {
+    LiveRange R = *I;
+ 
+    SlotIndex Start = R.start;
+    SlotIndex End = R.end;
+    if (LHS.getLiveRangeContaining(Start))
+      return false;
+    
+    if (LHS.getLiveRangeContaining(End))
+      return false;
+    
+    LiveInterval::iterator RI = std::upper_bound(LHS.begin(), LHS.end(), R);
+    if (RI != LHS.end() && RI->start < End)
+      return false;
+  }
+  
+  for (LiveInterval::iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) {
+    LiveRange R = *I;
+    VNInfo* OldVN = R.valno;
+    VNInfo*& NewVN = VNMap[OldVN];
+    if (!NewVN) {
+      NewVN = LHS.createValueCopy(OldVN, LI.getVNInfoAllocator());
+    }
+    
+    LiveRange LR (R.start, R.end, NewVN);
+    LHS.addRange(LR);
+  }
+  
+  LI.removeInterval(RHS.reg);
+  
+  return true;
+}
+
+bool StrongPHIElimination::runOnMachineFunction(MachineFunction &Fn) {
+  LiveIntervals& LI = getAnalysis<LiveIntervals>();
+  
+  // Compute DFS numbers of each block
+  computeDFS(Fn);
+  
+  // Determine which phi node operands need copies
+  for (MachineFunction::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I)
+    if (!I->empty() && I->begin()->isPHI())
+      processBlock(I);
+  
+  // Break interferences where two different phis want to coalesce
+  // in the same register.
+  std::set<unsigned> seen;
+  typedef std::map<unsigned, std::map<unsigned, MachineBasicBlock*> >
+          RenameSetType;
+  for (RenameSetType::iterator I = RenameSets.begin(), E = RenameSets.end();
+       I != E; ++I) {
+    for (std::map<unsigned, MachineBasicBlock*>::iterator
+         OI = I->second.begin(), OE = I->second.end(); OI != OE; ) {
+      if (!seen.count(OI->first)) {
+        seen.insert(OI->first);
+        ++OI;
+      } else {
+        Waiting[OI->second].insert(std::make_pair(OI->first, I->first));
+        unsigned reg = OI->first;
+        ++OI;
+        I->second.erase(reg);
+        DEBUG(dbgs() << "Removing Renaming: " << reg << " -> " << I->first
+                     << "\n");
+      }
+    }
+  }
+  
+  // Insert copies
+  // FIXME: This process should probably preserve LiveIntervals
+  SmallPtrSet<MachineBasicBlock*, 16> visited;
+  MachineDominatorTree& MDT = getAnalysis<MachineDominatorTree>();
+  InsertCopies(MDT.getRootNode(), visited);
+  
+  // Perform renaming
+  for (RenameSetType::iterator I = RenameSets.begin(), E = RenameSets.end();
+       I != E; ++I)
+    while (I->second.size()) {
+      std::map<unsigned, MachineBasicBlock*>::iterator SI = I->second.begin();
+      
+      DEBUG(dbgs() << "Renaming: " << SI->first << " -> " << I->first << "\n");
+      
+      if (SI->first != I->first) {
+        if (mergeLiveIntervals(I->first, SI->first)) {
+          Fn.getRegInfo().replaceRegWith(SI->first, I->first);
+      
+          if (RenameSets.count(SI->first)) {
+            I->second.insert(RenameSets[SI->first].begin(),
+                             RenameSets[SI->first].end());
+            RenameSets.erase(SI->first);
+          }
+        } else {
+          // Insert a last-minute copy if a conflict was detected.
+          const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+          BuildMI(*SI->second, SI->second->getFirstTerminator(), DebugLoc(),
+                  TII->get(TargetOpcode::COPY), I->first).addReg(SI->first);
+          
+          LI.renumber();
+          
+          LiveInterval& Int = LI.getOrCreateInterval(I->first);
+          SlotIndex instrIdx =
+                     LI.getInstructionIndex(--SI->second->getFirstTerminator());
+          if (Int.liveAt(instrIdx.getDefIndex()))
+            Int.removeRange(instrIdx.getDefIndex(),
+                            LI.getMBBEndIdx(SI->second).getNextSlot(), true);
+
+          LiveRange R = LI.addLiveRangeToEndOfBlock(I->first,
+                                            --SI->second->getFirstTerminator());
+          R.valno->setCopy(--SI->second->getFirstTerminator());
+          R.valno->def = instrIdx.getDefIndex();
+          
+          DEBUG(dbgs() << "Renaming failed: " << SI->first << " -> "
+                       << I->first << "\n");
+        }
+      }
+      
+      LiveInterval& Int = LI.getOrCreateInterval(I->first);
+      const LiveRange* LR =
+                       Int.getLiveRangeContaining(LI.getMBBEndIdx(SI->second));
+      LR->valno->setHasPHIKill(true);
+      
+      I->second.erase(SI->first);
+    }
+  
+  // Remove PHIs
+  std::vector<MachineInstr*> phis;
+  for (MachineFunction::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) {
+    for (MachineBasicBlock::iterator BI = I->begin(), BE = I->end();
+         BI != BE; ++BI)
+      if (BI->isPHI())
+        phis.push_back(BI);
+  }
+  
+  for (std::vector<MachineInstr*>::iterator I = phis.begin(), E = phis.end();
+       I != E; ) {
+    MachineInstr* PInstr = *(I++);
+    
+    // If this is a dead PHI node, then remove it from LiveIntervals.
+    unsigned DestReg = PInstr->getOperand(0).getReg();
+    LiveInterval& PI = LI.getInterval(DestReg);
+    if (PInstr->registerDefIsDead(DestReg)) {
+      if (PI.containsOneValue()) {
+        LI.removeInterval(DestReg);
+      } else {
+        SlotIndex idx = LI.getInstructionIndex(PInstr).getDefIndex();
+        PI.removeRange(*PI.getLiveRangeContaining(idx), true);
+      }
+    } else {
+      // Trim live intervals of input registers.  They are no longer live into
+      // this block if they died after the PHI.  If they lived after it, don't
+      // trim them because they might have other legitimate uses.
+      for (unsigned i = 1; i < PInstr->getNumOperands(); i += 2) {
+        unsigned reg = PInstr->getOperand(i).getReg();
+        
+        MachineBasicBlock* MBB = PInstr->getOperand(i+1).getMBB();
+        LiveInterval& InputI = LI.getInterval(reg);
+        if (MBB != PInstr->getParent() &&
+            InputI.liveAt(LI.getMBBStartIdx(PInstr->getParent())) &&
+            InputI.expiredAt(LI.getInstructionIndex(PInstr).getNextIndex()))
+          InputI.removeRange(LI.getMBBStartIdx(PInstr->getParent()),
+                             LI.getInstructionIndex(PInstr),
+                             true);
+      }
+      
+      // If the PHI is not dead, then the valno defined by the PHI
+      // now has an unknown def.
+      SlotIndex idx = LI.getInstructionIndex(PInstr).getDefIndex();
+      const LiveRange* PLR = PI.getLiveRangeContaining(idx);
+      PLR->valno->setIsPHIDef(true);
+      LiveRange R (LI.getMBBStartIdx(PInstr->getParent()),
+                   PLR->start, PLR->valno);
+      PI.addRange(R);
+    }
+    
+    LI.RemoveMachineInstrFromMaps(PInstr);
+    PInstr->eraseFromParent();
+  }
+  
+  LI.renumber();
+  
+  return true;
+}
diff --git a/src/LLVM/lib/CodeGen/TailDuplication.cpp b/src/LLVM/lib/CodeGen/TailDuplication.cpp
new file mode 100644
index 0000000..a815b36
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/TailDuplication.cpp
@@ -0,0 +1,653 @@
+//===-- TailDuplication.cpp - Duplicate blocks into predecessors' tails ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass duplicates basic blocks ending in unconditional branches into
+// the tails of their predecessors.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "tailduplication"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumTails     , "Number of tails duplicated");
+STATISTIC(NumTailDups  , "Number of tail duplicated blocks");
+STATISTIC(NumInstrDups , "Additional instructions due to tail duplication");
+STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
+
+// Heuristic for tail duplication.
+static cl::opt<unsigned>
+TailDuplicateSize("tail-dup-size",
+                  cl::desc("Maximum instructions to consider tail duplicating"),
+                  cl::init(2), cl::Hidden);
+
+static cl::opt<bool>
+TailDupVerify("tail-dup-verify",
+              cl::desc("Verify sanity of PHI instructions during taildup"),
+              cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned>
+TailDupLimit("tail-dup-limit", cl::init(~0U), cl::Hidden);
+
+typedef std::vector<std::pair<MachineBasicBlock*,unsigned> > AvailableValsTy;
+
+namespace {
+  /// TailDuplicatePass - Perform tail duplication.
+  class TailDuplicatePass : public MachineFunctionPass {
+    bool PreRegAlloc;
+    const TargetInstrInfo *TII;
+    MachineModuleInfo *MMI;
+    MachineRegisterInfo *MRI;
+
+    // SSAUpdateVRs - A list of virtual registers for which to update SSA form.
+    SmallVector<unsigned, 16> SSAUpdateVRs;
+
+    // SSAUpdateVals - For each virtual register in SSAUpdateVals keep a list of
+    // source virtual registers.
+    DenseMap<unsigned, AvailableValsTy> SSAUpdateVals;
+
+  public:
+    static char ID;
+    explicit TailDuplicatePass(bool PreRA) :
+      MachineFunctionPass(ID), PreRegAlloc(PreRA) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+    virtual const char *getPassName() const { return "Tail Duplication"; }
+
+  private:
+    void AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
+                           MachineBasicBlock *BB);
+    void ProcessPHI(MachineInstr *MI, MachineBasicBlock *TailBB,
+                    MachineBasicBlock *PredBB,
+                    DenseMap<unsigned, unsigned> &LocalVRMap,
+                    SmallVector<std::pair<unsigned,unsigned>, 4> &Copies);
+    void DuplicateInstruction(MachineInstr *MI,
+                              MachineBasicBlock *TailBB,
+                              MachineBasicBlock *PredBB,
+                              MachineFunction &MF,
+                              DenseMap<unsigned, unsigned> &LocalVRMap);
+    void UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
+                              SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                              SmallSetVector<MachineBasicBlock*, 8> &Succs);
+    bool TailDuplicateBlocks(MachineFunction &MF);
+    bool TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
+                       SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                       SmallVector<MachineInstr*, 16> &Copies);
+    void RemoveDeadBlock(MachineBasicBlock *MBB);
+  };
+
+  char TailDuplicatePass::ID = 0;
+}
+
+FunctionPass *llvm::createTailDuplicatePass(bool PreRegAlloc) {
+  return new TailDuplicatePass(PreRegAlloc);
+}
+
+bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getTarget().getInstrInfo();
+  MRI = &MF.getRegInfo();
+  MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+
+  bool MadeChange = false;
+  while (TailDuplicateBlocks(MF))
+    MadeChange = true;
+
+  return MadeChange;
+}
+
+static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
+  for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = I;
+    SmallSetVector<MachineBasicBlock*, 8> Preds(MBB->pred_begin(),
+                                                MBB->pred_end());
+    MachineBasicBlock::iterator MI = MBB->begin();
+    while (MI != MBB->end()) {
+      if (!MI->isPHI())
+        break;
+      for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+             PE = Preds.end(); PI != PE; ++PI) {
+        MachineBasicBlock *PredBB = *PI;
+        bool Found = false;
+        for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+          MachineBasicBlock *PHIBB = MI->getOperand(i+1).getMBB();
+          if (PHIBB == PredBB) {
+            Found = true;
+            break;
+          }
+        }
+        if (!Found) {
+          dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
+          dbgs() << "  missing input from predecessor BB#"
+                 << PredBB->getNumber() << '\n';
+          llvm_unreachable(0);
+        }
+      }
+
+      for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+        MachineBasicBlock *PHIBB = MI->getOperand(i+1).getMBB();
+        if (CheckExtra && !Preds.count(PHIBB)) {
+          // This is not a hard error.
+          dbgs() << "Warning: malformed PHI in BB#" << MBB->getNumber()
+                 << ": " << *MI;
+          dbgs() << "  extra input from predecessor BB#"
+                 << PHIBB->getNumber() << '\n';
+        }
+        if (PHIBB->getNumber() < 0) {
+          dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
+          dbgs() << "  non-existing BB#" << PHIBB->getNumber() << '\n';
+          llvm_unreachable(0);
+        }
+      }
+      ++MI;
+    }
+  }
+}
+
+/// TailDuplicateBlocks - Look for small blocks that are unconditionally
+/// branched to and do not fall through. Tail-duplicate their instructions
+/// into their predecessors to eliminate (dynamic) branches.
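+///
+/// For example, if blocks A and B each end with an unconditional branch to
+/// a two-instruction block C, C's instructions are cloned into the tails of
+/// A and B and the branches removed; if C is left with no predecessors it
+/// is deleted as dead.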
+bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) {
+  bool MadeChange = false;
+
+  if (PreRegAlloc && TailDupVerify) {
+    DEBUG(dbgs() << "\n*** Before tail-duplicating\n");
+    VerifyPHIs(MF, true);
+  }
+
+  SmallVector<MachineInstr*, 8> NewPHIs;
+  MachineSSAUpdater SSAUpdate(MF, &NewPHIs);
+
+  for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ) {
+    MachineBasicBlock *MBB = I++;
+
+    if (NumTails == TailDupLimit)
+      break;
+
+    // Only duplicate blocks that end with unconditional branches.
+    if (MBB->canFallThrough())
+      continue;
+
+    // Save the successors list.
+    SmallSetVector<MachineBasicBlock*, 8> Succs(MBB->succ_begin(),
+                                                MBB->succ_end());
+
+    SmallVector<MachineBasicBlock*, 8> TDBBs;
+    SmallVector<MachineInstr*, 16> Copies;
+    if (TailDuplicate(MBB, MF, TDBBs, Copies)) {
+      ++NumTails;
+
+      // TailBB's immediate successors are now successors of those predecessors
+      // which duplicated TailBB. Add the predecessors as sources to the PHI
+      // instructions.
+      bool isDead = MBB->pred_empty();
+      if (PreRegAlloc)
+        UpdateSuccessorsPHIs(MBB, isDead, TDBBs, Succs);
+
+      // If it is dead, remove it.
+      if (isDead) {
+        NumInstrDups -= MBB->size();
+        RemoveDeadBlock(MBB);
+        ++NumDeadBlocks;
+      }
+
+      // Update SSA form.
+      if (!SSAUpdateVRs.empty()) {
+        for (unsigned i = 0, e = SSAUpdateVRs.size(); i != e; ++i) {
+          unsigned VReg = SSAUpdateVRs[i];
+          SSAUpdate.Initialize(VReg);
+
+          // If the original definition is still around, add it as an available
+          // value.
+          MachineInstr *DefMI = MRI->getVRegDef(VReg);
+          MachineBasicBlock *DefBB = 0;
+          if (DefMI) {
+            DefBB = DefMI->getParent();
+            SSAUpdate.AddAvailableValue(DefBB, VReg);
+          }
+
+          // Add the new vregs as available values.
+          DenseMap<unsigned, AvailableValsTy>::iterator LI =
+            SSAUpdateVals.find(VReg);  
+          for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
+            MachineBasicBlock *SrcBB = LI->second[j].first;
+            unsigned SrcReg = LI->second[j].second;
+            SSAUpdate.AddAvailableValue(SrcBB, SrcReg);
+          }
+
+          // Rewrite uses that are outside of the original def's block.
+          MachineRegisterInfo::use_iterator UI = MRI->use_begin(VReg);
+          while (UI != MRI->use_end()) {
+            MachineOperand &UseMO = UI.getOperand();
+            MachineInstr *UseMI = &*UI;
+            ++UI;
+            if (UseMI->getParent() == DefBB)
+              continue;
+            SSAUpdate.RewriteUse(UseMO);
+          }
+        }
+
+        SSAUpdateVRs.clear();
+        SSAUpdateVals.clear();
+      }
+
+      // Eliminate some of the copies inserted by tail duplication to maintain
+      // SSA form.
+      for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
+        MachineInstr *Copy = Copies[i];
+        if (!Copy->isCopy())
+          continue;
+        unsigned Dst = Copy->getOperand(0).getReg();
+        unsigned Src = Copy->getOperand(1).getReg();
+        MachineRegisterInfo::use_iterator UI = MRI->use_begin(Src);
+        if (++UI == MRI->use_end()) {
+          // Copy is the only use. Do trivial copy propagation here.
+          MRI->replaceRegWith(Dst, Src);
+          Copy->eraseFromParent();
+        }
+      }
+
+      if (PreRegAlloc && TailDupVerify)
+        VerifyPHIs(MF, false);
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+static bool isDefLiveOut(unsigned Reg, MachineBasicBlock *BB,
+                         const MachineRegisterInfo *MRI) {
+  for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg),
+         UE = MRI->use_end(); UI != UE; ++UI) {
+    MachineInstr *UseMI = &*UI;
+    if (UseMI->getParent() != BB)
+      return true;
+  }
+  return false;
+}
+
+static unsigned getPHISrcRegOpIdx(MachineInstr *MI, MachineBasicBlock *SrcBB) {
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2)
+    if (MI->getOperand(i+1).getMBB() == SrcBB)
+      return i;
+  return 0;
+}
+
+/// AddSSAUpdateEntry - Add a (definition, source virtual register) pair for
+/// SSA update.
+void TailDuplicatePass::AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
+                                          MachineBasicBlock *BB) {
+  DenseMap<unsigned, AvailableValsTy>::iterator LI= SSAUpdateVals.find(OrigReg);
+  if (LI != SSAUpdateVals.end())
+    LI->second.push_back(std::make_pair(BB, NewReg));
+  else {
+    AvailableValsTy Vals;
+    Vals.push_back(std::make_pair(BB, NewReg));
+    SSAUpdateVals.insert(std::make_pair(OrigReg, Vals));
+    SSAUpdateVRs.push_back(OrigReg);
+  }
+}
+
+/// ProcessPHI - Process PHI node in TailBB by turning it into a copy in PredBB.
+/// Remember the source register that's contributed by PredBB and update the
+/// SSA update map.
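+///
+/// For example, given a PHI in TailBB such as
+///   %v2 = PHI %v0, <PredBB>, %v1, <OtherBB>
+/// the copy placed in PredBB uses %v0 as its source, the mapping
+/// %v2 -> %v0 is recorded in LocalVRMap, and the <PredBB> entry is removed
+/// from the PHI. (Register names illustrative.)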
+void TailDuplicatePass::ProcessPHI(MachineInstr *MI,
+                                   MachineBasicBlock *TailBB,
+                                   MachineBasicBlock *PredBB,
+                                   DenseMap<unsigned, unsigned> &LocalVRMap,
+                         SmallVector<std::pair<unsigned,unsigned>, 4> &Copies) {
+  unsigned DefReg = MI->getOperand(0).getReg();
+  unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB);
+  assert(SrcOpIdx && "Unable to find matching PHI source?");
+  unsigned SrcReg = MI->getOperand(SrcOpIdx).getReg();
+  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+  LocalVRMap.insert(std::make_pair(DefReg, SrcReg));
+
+  // Insert a copy from source to the end of the block. The def register is the
+  // available value liveout of the block.
+  unsigned NewDef = MRI->createVirtualRegister(RC);
+  Copies.push_back(std::make_pair(NewDef, SrcReg));
+  if (isDefLiveOut(DefReg, TailBB, MRI))
+    AddSSAUpdateEntry(DefReg, NewDef, PredBB);
+
+  // Remove PredBB from the PHI node.
+  MI->RemoveOperand(SrcOpIdx+1);
+  MI->RemoveOperand(SrcOpIdx);
+  if (MI->getNumOperands() == 1)
+    MI->eraseFromParent();
+}
+
+/// DuplicateInstruction - Duplicate a TailBB instruction to PredBB and update
+/// the source operands due to earlier PHI translation.
+void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI,
+                                     MachineBasicBlock *TailBB,
+                                     MachineBasicBlock *PredBB,
+                                     MachineFunction &MF,
+                                     DenseMap<unsigned, unsigned> &LocalVRMap) {
+  MachineInstr *NewMI = TII->duplicate(MI, MF);
+  for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = NewMI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+    if (MO.isDef()) {
+      const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+      unsigned NewReg = MRI->createVirtualRegister(RC);
+      MO.setReg(NewReg);
+      LocalVRMap.insert(std::make_pair(Reg, NewReg));
+      if (isDefLiveOut(Reg, TailBB, MRI))
+        AddSSAUpdateEntry(Reg, NewReg, PredBB);
+    } else {
+      DenseMap<unsigned, unsigned>::iterator VI = LocalVRMap.find(Reg);
+      if (VI != LocalVRMap.end())
+        MO.setReg(VI->second);
+    }
+  }
+  PredBB->insert(PredBB->end(), NewMI);
+}
+
+/// UpdateSuccessorsPHIs - After FromBB is tail duplicated into its predecessor
+/// blocks, the successors have gained new predecessors. Update the PHI
+/// instructions in them accordingly.
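+///
+/// For example, a successor PHI operand pair (%v0, FromBB) is replaced with
+/// one pair per duplicating predecessor, using the new virtual register
+/// recorded in SSAUpdateVals when %v0 was defined in the tail block, or %v0
+/// itself when it was merely live through it. (Names illustrative.)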
+void
+TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
+                                  SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                                  SmallSetVector<MachineBasicBlock*,8> &Succs) {
+  for (SmallSetVector<MachineBasicBlock*, 8>::iterator SI = Succs.begin(),
+         SE = Succs.end(); SI != SE; ++SI) {
+    MachineBasicBlock *SuccBB = *SI;
+    for (MachineBasicBlock::iterator II = SuccBB->begin(), EE = SuccBB->end();
+         II != EE; ++II) {
+      if (!II->isPHI())
+        break;
+      unsigned Idx = 0;
+      for (unsigned i = 1, e = II->getNumOperands(); i != e; i += 2) {
+        MachineOperand &MO = II->getOperand(i+1);
+        if (MO.getMBB() == FromBB) {
+          Idx = i;
+          break;
+        }
+      }
+
+      assert(Idx != 0);
+      MachineOperand &MO0 = II->getOperand(Idx);
+      unsigned Reg = MO0.getReg();
+      if (isDead) {
+        // Folded into the previous BB.
+        // There could be duplicate phi source entries. FIXME: Should sdisel
+        // or an earlier pass have fixed this?
+        for (unsigned i = II->getNumOperands()-2; i != Idx; i -= 2) {
+          MachineOperand &MO = II->getOperand(i+1);
+          if (MO.getMBB() == FromBB) {
+            II->RemoveOperand(i+1);
+            II->RemoveOperand(i);
+          }
+        }
+      } else
+        Idx = 0;
+
+      // If Idx is set, the operands at Idx and Idx+1 must be removed.
+      // We reuse the location to avoid expensive RemoveOperand calls.
+
+      DenseMap<unsigned,AvailableValsTy>::iterator LI=SSAUpdateVals.find(Reg);
+      if (LI != SSAUpdateVals.end()) {
+        // This register is defined in the tail block.
+        for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
+          MachineBasicBlock *SrcBB = LI->second[j].first;
+          unsigned SrcReg = LI->second[j].second;
+          if (Idx != 0) {
+            II->getOperand(Idx).setReg(SrcReg);
+            II->getOperand(Idx+1).setMBB(SrcBB);
+            Idx = 0;
+          } else {
+            II->addOperand(MachineOperand::CreateReg(SrcReg, false));
+            II->addOperand(MachineOperand::CreateMBB(SrcBB));
+          }
+        }
+      } else {
+        // Live in tail block, must also be live in predecessors.
+        for (unsigned j = 0, ee = TDBBs.size(); j != ee; ++j) {
+          MachineBasicBlock *SrcBB = TDBBs[j];
+          if (Idx != 0) {
+            II->getOperand(Idx).setReg(Reg);
+            II->getOperand(Idx+1).setMBB(SrcBB);
+            Idx = 0;
+          } else {
+            II->addOperand(MachineOperand::CreateReg(Reg, false));
+            II->addOperand(MachineOperand::CreateMBB(SrcBB));
+          }
+        }
+      }
+      if (Idx != 0) {
+        II->RemoveOperand(Idx+1);
+        II->RemoveOperand(Idx);
+      }
+    }
+  }
+}
+
+/// TailDuplicate - If it is profitable, duplicate TailBB's contents in each
+/// of its predecessors.
+bool
+TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
+                                 SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                                 SmallVector<MachineInstr*, 16> &Copies) {
+  // Set the limit on the number of instructions to duplicate, with a default
+  // of one less than the tail-merge threshold. When optimizing for size,
+  // duplicate only one, because one branch instruction can be eliminated to
+  // compensate for the duplication.
+  unsigned MaxDuplicateCount;
+  if (MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+    MaxDuplicateCount = 1;
+  else
+    MaxDuplicateCount = TailDuplicateSize;
+
+  if (PreRegAlloc) {
+    // Pre-regalloc tail duplication hurts compile time and doesn't help
+    // much except for indirect branches.
+    if (TailBB->empty() || !TailBB->back().getDesc().isIndirectBranch())
+      return false;
+    // If the target has hardware branch prediction that can handle indirect
+    // branches, duplicating them can often make them predictable when there
+    // are common paths through the code.  The limit needs to be high enough
+    // to allow undoing the effects of tail merging and other optimizations
+    // that rearrange the predecessors of the indirect branch.
+    MaxDuplicateCount = 20;
+  }
+
+  // Don't try to tail-duplicate single-block loops.
+  if (TailBB->isSuccessor(TailBB))
+    return false;
+
+  // Check the instructions in the block to determine whether tail-duplication
+  // is invalid or unlikely to be profitable.
+  unsigned InstrCount = 0;
+  bool HasCall = false;
+  for (MachineBasicBlock::iterator I = TailBB->begin();
+       I != TailBB->end(); ++I) {
+    // Non-duplicable things shouldn't be tail-duplicated.
+    if (I->getDesc().isNotDuplicable()) return false;
+    // Do not duplicate 'return' instructions if this is a pre-regalloc run.
+    // A return may expand into a lot more instructions (e.g. reload of callee
+    // saved registers) after PEI.
+    if (PreRegAlloc && I->getDesc().isReturn()) return false;
+    // Don't duplicate more than the threshold.
+    if (InstrCount == MaxDuplicateCount) return false;
+    // Remember if we saw a call.
+    if (I->getDesc().isCall()) HasCall = true;
+    if (!I->isPHI() && !I->isDebugValue())
+      InstrCount += 1;
+  }
+  // Heuristically, don't tail-duplicate calls if it would expand code size,
+  // as it's less likely to be worth the extra cost.
+  if (InstrCount > 1 && HasCall)
+    return false;
+
+  DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n');
+
+  // Iterate through all the unique predecessors and tail-duplicate this
+  // block into them, if possible. Copying the list ahead of time also
+  // avoids trouble with the predecessor list reallocating.
+  bool Changed = false;
+  SmallSetVector<MachineBasicBlock*, 8> Preds(TailBB->pred_begin(),
+                                              TailBB->pred_end());
+  for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+       PE = Preds.end(); PI != PE; ++PI) {
+    MachineBasicBlock *PredBB = *PI;
+
+    assert(TailBB != PredBB &&
+           "Single-block loop should have been rejected earlier!");
+    if (PredBB->succ_size() > 1) continue;
+
+    MachineBasicBlock *PredTBB, *PredFBB;
+    SmallVector<MachineOperand, 4> PredCond;
+    if (TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
+      continue;
+    if (!PredCond.empty())
+      continue;
+    // EH edges are ignored by AnalyzeBranch.
+    if (PredBB->succ_size() != 1)
+      continue;
+    // Don't duplicate into a fall-through predecessor (at least for now).
+    if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
+      continue;
+
+    DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
+                 << "From Succ: " << *TailBB);
+
+    TDBBs.push_back(PredBB);
+
+    // Remove PredBB's unconditional branch.
+    TII->RemoveBranch(*PredBB);
+
+    // Clone the contents of TailBB into PredBB.
+    DenseMap<unsigned, unsigned> LocalVRMap;
+    SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
+    MachineBasicBlock::iterator I = TailBB->begin();
+    while (I != TailBB->end()) {
+      MachineInstr *MI = &*I;
+      ++I;
+      if (MI->isPHI()) {
+        // Replace the uses of the def of the PHI with the register coming
+        // from PredBB.
+        ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos);
+      } else {
+        // Replace def of virtual registers with new registers, and update
+        // uses with PHI source register or the new registers.
+        DuplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap);
+      }
+    }
+    MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator();
+    for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
+      Copies.push_back(BuildMI(*PredBB, Loc, DebugLoc(),
+                               TII->get(TargetOpcode::COPY),
+                               CopyInfos[i].first).addReg(CopyInfos[i].second));
+    }
+    NumInstrDups += TailBB->size() - 1; // subtract one for removed branch
+
+    // Update the CFG.
+    PredBB->removeSuccessor(PredBB->succ_begin());
+    assert(PredBB->succ_empty() &&
+           "TailDuplicate called on block with multiple successors!");
+    for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(),
+           E = TailBB->succ_end(); I != E; ++I)
+      PredBB->addSuccessor(*I);
+
+    Changed = true;
+    ++NumTailDups;
+  }
+
+  // If TailBB was duplicated into all its predecessors except for the prior
+  // block, which falls through unconditionally, move the contents of this
+  // block into the prior block.
+  MachineBasicBlock *PrevBB = prior(MachineFunction::iterator(TailBB));
+  MachineBasicBlock *PriorTBB = 0, *PriorFBB = 0;
+  SmallVector<MachineOperand, 4> PriorCond;
+  bool PriorUnAnalyzable =
+    TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true);
+  // This has to check PrevBB->succ_size() because EH edges are ignored by
+  // AnalyzeBranch.
+  if (!PriorUnAnalyzable && PriorCond.empty() && !PriorTBB &&
+      TailBB->pred_size() == 1 && PrevBB->succ_size() == 1 &&
+      !TailBB->hasAddressTaken()) {
+    DEBUG(dbgs() << "\nMerging into block: " << *PrevBB
+          << "From MBB: " << *TailBB);
+    if (PreRegAlloc) {
+      DenseMap<unsigned, unsigned> LocalVRMap;
+      SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
+      MachineBasicBlock::iterator I = TailBB->begin();
+      // Process PHI instructions first.
+      while (I != TailBB->end() && I->isPHI()) {
+        // Replace the uses of the def of the PHI with the register coming
+        // from PredBB.
+        MachineInstr *MI = &*I++;
+        ProcessPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos);
+        if (MI->getParent())
+          MI->eraseFromParent();
+      }
+
+      // Now copy the non-PHI instructions.
+      while (I != TailBB->end()) {
+        // Replace def of virtual registers with new registers, and update
+        // uses with PHI source register or the new registers.
+        MachineInstr *MI = &*I++;
+        DuplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap);
+        MI->eraseFromParent();
+      }
+      MachineBasicBlock::iterator Loc = PrevBB->getFirstTerminator();
+      for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
+        Copies.push_back(BuildMI(*PrevBB, Loc, DebugLoc(),
+                                 TII->get(TargetOpcode::COPY),
+                                 CopyInfos[i].first)
+                           .addReg(CopyInfos[i].second));
+      }
+    } else {
+      // No PHIs to worry about, just splice the instructions over.
+      PrevBB->splice(PrevBB->end(), TailBB, TailBB->begin(), TailBB->end());
+    }
+    PrevBB->removeSuccessor(PrevBB->succ_begin());
+    assert(PrevBB->succ_empty());
+    PrevBB->transferSuccessors(TailBB);
+    TDBBs.push_back(PrevBB);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+/// RemoveDeadBlock - Remove the specified dead machine basic block from the
+/// function, updating the CFG.
+void TailDuplicatePass::RemoveDeadBlock(MachineBasicBlock *MBB) {
+  assert(MBB->pred_empty() && "MBB must be dead!");
+  DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
+
+  // Remove all successors.
+  while (!MBB->succ_empty())
+    MBB->removeSuccessor(MBB->succ_end()-1);
+
+  // Remove the block.
+  MBB->eraseFromParent();
+}
+
diff --git a/src/LLVM/lib/CodeGen/TargetInstrInfoImpl.cpp b/src/LLVM/lib/CodeGen/TargetInstrInfoImpl.cpp
new file mode 100644
index 0000000..6e4a0d8
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/TargetInstrInfoImpl.cpp
@@ -0,0 +1,421 @@
+//===-- TargetInstrInfoImpl.cpp - Target Instruction Information ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetInstrInfoImpl class, which provides default
+// implementations of various methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PostRAHazardRecognizer.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything
+/// after it, replacing it with an unconditional branch to NewDest.
+void
+TargetInstrInfoImpl::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
+                                             MachineBasicBlock *NewDest) const {
+  MachineBasicBlock *MBB = Tail->getParent();
+
+  // Remove all the old successors of MBB from the CFG.
+  while (!MBB->succ_empty())
+    MBB->removeSuccessor(MBB->succ_begin());
+
+  // Remove all the dead instructions from the end of MBB.
+  MBB->erase(Tail, MBB->end());
+
+  // If NewDest isn't immediately after MBB, insert a branch to it.
+  if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(NewDest))
+    InsertBranch(*MBB, NewDest, 0, SmallVector<MachineOperand, 0>(),
+                 Tail->getDebugLoc());
+  MBB->addSuccessor(NewDest);
+}
+
+// commuteInstruction - The default implementation of this method just exchanges
+// the two operands returned by findCommutedOpIndices.
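+//
+// For example, commuting a three-address instruction of the form
+//   %r0 = ADD %r1, %r2
+// produces
+//   %r0 = ADD %r2, %r1
+// with the kill flags swapped along with the registers; the ADD opcode is
+// purely illustrative. Targets whose instructions don't fit this pattern
+// must provide their own implementation.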
+MachineInstr *TargetInstrInfoImpl::commuteInstruction(MachineInstr *MI,
+                                                      bool NewMI) const {
+  const TargetInstrDesc &TID = MI->getDesc();
+  bool HasDef = TID.getNumDefs();
+  if (HasDef && !MI->getOperand(0).isReg())
+    // No idea how to commute this instruction. Target should implement its own.
+    return 0;
+  unsigned Idx1, Idx2;
+  if (!findCommutedOpIndices(MI, Idx1, Idx2)) {
+    std::string msg;
+    raw_string_ostream Msg(msg);
+    Msg << "Don't know how to commute: " << *MI;
+    report_fatal_error(Msg.str());
+  }
+
+  assert(MI->getOperand(Idx1).isReg() && MI->getOperand(Idx2).isReg() &&
+         "This only knows how to commute register operands so far");
+  unsigned Reg1 = MI->getOperand(Idx1).getReg();
+  unsigned Reg2 = MI->getOperand(Idx2).getReg();
+  bool Reg1IsKill = MI->getOperand(Idx1).isKill();
+  bool Reg2IsKill = MI->getOperand(Idx2).isKill();
+  bool ChangeReg0 = false;
+  if (HasDef && MI->getOperand(0).getReg() == Reg1) {
+    // Must be two address instruction!
+    assert(MI->getDesc().getOperandConstraint(0, TOI::TIED_TO) &&
+           "Expecting a two-address instruction!");
+    Reg2IsKill = false;
+    ChangeReg0 = true;
+  }
+
+  if (NewMI) {
+    // Create a new instruction.
+    unsigned Reg0 = HasDef
+      ? (ChangeReg0 ? Reg2 : MI->getOperand(0).getReg()) : 0;
+    bool Reg0IsDead = HasDef ? MI->getOperand(0).isDead() : false;
+    MachineFunction &MF = *MI->getParent()->getParent();
+    if (HasDef)
+      return BuildMI(MF, MI->getDebugLoc(), MI->getDesc())
+        .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead))
+        .addReg(Reg2, getKillRegState(Reg2IsKill))
+        .addReg(Reg1, getKillRegState(Reg1IsKill));
+    else
+      return BuildMI(MF, MI->getDebugLoc(), MI->getDesc())
+        .addReg(Reg2, getKillRegState(Reg2IsKill))
+        .addReg(Reg1, getKillRegState(Reg1IsKill));
+  }
+
+  if (ChangeReg0)
+    MI->getOperand(0).setReg(Reg2);
+  MI->getOperand(Idx2).setReg(Reg1);
+  MI->getOperand(Idx1).setReg(Reg2);
+  MI->getOperand(Idx2).setIsKill(Reg1IsKill);
+  MI->getOperand(Idx1).setIsKill(Reg2IsKill);
+  return MI;
+}
+
+/// findCommutedOpIndices - If specified MI is commutable, return the two
+/// operand indices that would swap value. Return false if the instruction
+/// is not in a form which this routine understands.
+bool TargetInstrInfoImpl::findCommutedOpIndices(MachineInstr *MI,
+                                                unsigned &SrcOpIdx1,
+                                                unsigned &SrcOpIdx2) const {
+  const TargetInstrDesc &TID = MI->getDesc();
+  if (!TID.isCommutable())
+    return false;
+  // This assumes v0 = op v1, v2 and commuting would swap v1 and v2. If this
+  // is not true, then the target must implement this.
+  SrcOpIdx1 = TID.getNumDefs();
+  SrcOpIdx2 = SrcOpIdx1 + 1;
+  if (!MI->getOperand(SrcOpIdx1).isReg() ||
+      !MI->getOperand(SrcOpIdx2).isReg())
+    // No idea.
+    return false;
+  return true;
+}
+
+
+bool TargetInstrInfoImpl::PredicateInstruction(MachineInstr *MI,
+                            const SmallVectorImpl<MachineOperand> &Pred) const {
+  bool MadeChange = false;
+  const TargetInstrDesc &TID = MI->getDesc();
+  if (!TID.isPredicable())
+    return false;
+  
+  for (unsigned j = 0, i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    if (TID.OpInfo[i].isPredicate()) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg()) {
+        MO.setReg(Pred[j].getReg());
+        MadeChange = true;
+      } else if (MO.isImm()) {
+        MO.setImm(Pred[j].getImm());
+        MadeChange = true;
+      } else if (MO.isMBB()) {
+        MO.setMBB(Pred[j].getMBB());
+        MadeChange = true;
+      }
+      ++j;
+    }
+  }
+  return MadeChange;
+}
+
+void TargetInstrInfoImpl::reMaterialize(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned DestReg,
+                                        unsigned SubIdx,
+                                        const MachineInstr *Orig,
+                                        const TargetRegisterInfo &TRI) const {
+  MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+  MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+  MBB.insert(I, MI);
+}
+
+bool TargetInstrInfoImpl::produceSameValue(const MachineInstr *MI0,
+                                           const MachineInstr *MI1) const {
+  return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
+}
+
+MachineInstr *TargetInstrInfoImpl::duplicate(MachineInstr *Orig,
+                                             MachineFunction &MF) const {
+  assert(!Orig->getDesc().isNotDuplicable() &&
+         "Instruction cannot be duplicated");
+  return MF.CloneMachineInstr(Orig);
+}
+
+// If the COPY instruction in MI can be folded to a stack operation, return
+// the register class to use.
+static const TargetRegisterClass *canFoldCopy(const MachineInstr *MI,
+                                              unsigned FoldIdx) {
+  assert(MI->isCopy() && "MI must be a COPY instruction");
+  if (MI->getNumOperands() != 2)
+    return 0;
+  assert(FoldIdx < 2 && "FoldIdx refers to a nonexistent operand");
+
+  const MachineOperand &FoldOp = MI->getOperand(FoldIdx);
+  const MachineOperand &LiveOp = MI->getOperand(1-FoldIdx);
+
+  if (FoldOp.getSubReg() || LiveOp.getSubReg())
+    return 0;
+
+  unsigned FoldReg = FoldOp.getReg();
+  unsigned LiveReg = LiveOp.getReg();
+
+  assert(TargetRegisterInfo::isVirtualRegister(FoldReg) &&
+         "Cannot fold physregs");
+
+  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  const TargetRegisterClass *RC = MRI.getRegClass(FoldReg);
+
+  if (TargetRegisterInfo::isPhysicalRegister(LiveOp.getReg()))
+    return RC->contains(LiveOp.getReg()) ? RC : 0;
+
+  const TargetRegisterClass *LiveRC = MRI.getRegClass(LiveReg);
+  if (RC == LiveRC || RC->hasSubClass(LiveRC))
+    return RC;
+
+  // FIXME: Allow folding when register classes are memory compatible.
+  return 0;
+}
+
+bool TargetInstrInfoImpl::
+canFoldMemoryOperand(const MachineInstr *MI,
+                     const SmallVectorImpl<unsigned> &Ops) const {
+  return MI->isCopy() && Ops.size() == 1 && canFoldCopy(MI, Ops[0]);
+}
+
+/// foldMemoryOperand - Attempt to fold a load or store of the specified stack
+/// slot into the specified machine instruction for the specified operand(s).
+/// If this is possible, a new instruction is returned with the specified
+/// operand folded, otherwise NULL is returned. The client is responsible for
+/// removing the old instruction and adding the new one in the instruction
+/// stream.
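+///
+/// For example, on a target with reg/mem instruction forms, a use of a
+/// register spilled to stack slot FI can be folded so the new instruction
+/// reads the slot directly, eliminating an explicit reload. Plain COPY
+/// instructions are handled generically below by rewriting them as
+/// loadRegFromStackSlot / storeRegToStackSlot calls.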
+MachineInstr*
+TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
+                                   const SmallVectorImpl<unsigned> &Ops,
+                                   int FI) const {
+  unsigned Flags = 0;
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    if (MI->getOperand(Ops[i]).isDef())
+      Flags |= MachineMemOperand::MOStore;
+    else
+      Flags |= MachineMemOperand::MOLoad;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  assert(MBB && "foldMemoryOperand needs an inserted instruction");
+  MachineFunction &MF = *MBB->getParent();
+
+  // Ask the target to do the actual folding.
+  if (MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, FI)) {
+    // Add a memory operand, foldMemoryOperandImpl doesn't do that.
+    assert((!(Flags & MachineMemOperand::MOStore) ||
+            NewMI->getDesc().mayStore()) &&
+           "Folded a def to a non-store!");
+    assert((!(Flags & MachineMemOperand::MOLoad) ||
+            NewMI->getDesc().mayLoad()) &&
+           "Folded a use to a non-load!");
+    const MachineFrameInfo &MFI = *MF.getFrameInfo();
+    assert(MFI.getObjectOffset(FI) != -1);
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+                              Flags, /*Offset=*/0,
+                              MFI.getObjectSize(FI),
+                              MFI.getObjectAlignment(FI));
+    NewMI->addMemOperand(MF, MMO);
+
+    // FIXME: change foldMemoryOperandImpl semantics to also insert NewMI.
+    return MBB->insert(MI, NewMI);
+  }
+
+  // Straight COPY may fold as load/store.
+  if (!MI->isCopy() || Ops.size() != 1)
+    return 0;
+
+  const TargetRegisterClass *RC = canFoldCopy(MI, Ops[0]);
+  if (!RC)
+    return 0;
+
+  const MachineOperand &MO = MI->getOperand(1-Ops[0]);
+  MachineBasicBlock::iterator Pos = MI;
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+
+  if (Flags == MachineMemOperand::MOStore)
+    storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI);
+  else
+    loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI);
+  return --Pos;
+}
+
+/// foldMemoryOperand - Same as the previous version except it allows folding
+/// of any load and store from / to any address, not just from a specific
+/// stack slot.
+MachineInstr*
+TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
+                                   const SmallVectorImpl<unsigned> &Ops,
+                                   MachineInstr* LoadMI) const {
+  assert(LoadMI->getDesc().canFoldAsLoad() && "LoadMI isn't foldable!");
+#ifndef NDEBUG
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    assert(MI->getOperand(Ops[i]).isUse() && "Folding load into def!");
+#endif
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+
+  // Ask the target to do the actual folding.
+  MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, LoadMI);
+  if (!NewMI) return 0;
+
+  NewMI = MBB.insert(MI, NewMI);
+
+  // Copy the memoperands from the load to the folded instruction.
+  NewMI->setMemRefs(LoadMI->memoperands_begin(),
+                    LoadMI->memoperands_end());
+
+  return NewMI;
+}
+
+bool TargetInstrInfo::
+isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI,
+                                         AliasAnalysis *AA) const {
+  const MachineFunction &MF = *MI->getParent()->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetMachine &TM = MF.getTarget();
+  const TargetInstrInfo &TII = *TM.getInstrInfo();
+  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+
+  // A load from a fixed stack slot can be rematerialized. This may be
+  // redundant with subsequent checks, but it's target-independent,
+  // simple, and a common case.
+  int FrameIdx = 0;
+  if (TII.isLoadFromStackSlot(MI, FrameIdx) &&
+      MF.getFrameInfo()->isImmutableObjectIndex(FrameIdx))
+    return true;
+
+  const TargetInstrDesc &TID = MI->getDesc();
+
+  // Avoid instructions obviously unsafe for remat.
+  if (TID.hasUnmodeledSideEffects() || TID.isNotDuplicable() ||
+      TID.mayStore())
+    return false;
+
+  // Avoid instructions which load from potentially varying memory.
+  if (TID.mayLoad() && !MI->isInvariantLoad(AA))
+    return false;
+
+  // If any of the registers accessed are non-constant, conservatively assume
+  // the instruction is not rematerializable.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0)
+      continue;
+
+    // Check for a well-behaved physical register.
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      if (MO.isUse()) {
+        // If the physreg has no defs anywhere, it's just an ambient register
+        // and we can freely move its uses. Alternatively, if it's allocatable,
+        // it could get allocated to something with a def during allocation.
+        if (!MRI.def_empty(Reg))
+          return false;
+        BitVector AllocatableRegs = TRI.getAllocatableSet(MF, 0);
+        if (AllocatableRegs.test(Reg))
+          return false;
+        // Check for a def among the register's aliases too.
+        for (const unsigned *Alias = TRI.getAliasSet(Reg); *Alias; ++Alias) {
+          unsigned AliasReg = *Alias;
+          if (!MRI.def_empty(AliasReg))
+            return false;
+          if (AllocatableRegs.test(AliasReg))
+            return false;
+        }
+      } else {
+        // A physreg def. We can't remat it.
+        return false;
+      }
+      continue;
+    }
+
+    // Only allow one virtual-register def, and that in the first operand.
+    if (MO.isDef() != (i == 0))
+      return false;
+
+    // For the def, it should be the only def of that register.
+    if (MO.isDef() && (llvm::next(MRI.def_begin(Reg)) != MRI.def_end() ||
+                       MRI.isLiveIn(Reg)))
+      return false;
+
+    // Don't allow any virtual-register uses. Rematting an instruction with
+    // virtual register uses would lengthen the live ranges of the uses, which
+    // is not necessarily a good idea, certainly not "trivial".
+    if (MO.isUse())
+      return false;
+  }
+
+  // Everything checked out.
+  return true;
+}
+
+/// isSchedulingBoundary - Test if the given instruction should be
+/// considered a scheduling boundary. This primarily includes labels
+/// and terminators.
+bool TargetInstrInfoImpl::isSchedulingBoundary(const MachineInstr *MI,
+                                               const MachineBasicBlock *MBB,
+                                               const MachineFunction &MF) const{
+  // Terminators and labels can't be scheduled around.
+  if (MI->getDesc().isTerminator() || MI->isLabel())
+    return true;
+
+  // Don't attempt to schedule around any instruction that defines
+  // a stack-oriented pointer, as it's unlikely to be profitable. This
+  // saves compile time, because it doesn't require every single
+  // stack slot reference to depend on the instruction that does the
+  // modification.
+  const TargetLowering &TLI = *MF.getTarget().getTargetLowering();
+  if (MI->definesRegister(TLI.getStackPointerRegisterToSaveRestore()))
+    return true;
+
+  return false;
+}
+
+// Default implementation of CreateTargetPostRAHazardRecognizer.
+ScheduleHazardRecognizer *TargetInstrInfoImpl::
+CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const {
+  return (ScheduleHazardRecognizer *)new PostRAHazardRecognizer(II);
+}
diff --git a/src/LLVM/lib/CodeGen/TwoAddressInstructionPass.cpp b/src/LLVM/lib/CodeGen/TwoAddressInstructionPass.cpp
new file mode 100644
index 0000000..c5d025b
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -0,0 +1,1497 @@
+//===-- TwoAddressInstructionPass.cpp - Two-Address instruction pass ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TwoAddress instruction pass which is used
+// by most register allocators. Two-Address instructions are rewritten
+// from:
+//
+//     A = B op C
+//
+// to:
+//
+//     A = B
+//     A op= C
+//
+// Note that if a register allocator chooses to use this pass, it has to be
+// capable of handling the non-SSA nature of these rewritten virtual
+// registers.
+//
+// It is also worth noting that the duplicate operand of the two-address
+// instruction is removed.
+//
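+// For example (illustrative), a two-address add
+//
+//     %reg1026 = ADD %reg1024, %reg1025
+//
+// is rewritten to
+//
+//     %reg1026 = COPY %reg1024
+//     %reg1026 = ADD %reg1026, %reg1025
+//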
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "twoaddrinstr"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+STATISTIC(NumTwoAddressInstrs, "Number of two-address instructions");
+STATISTIC(NumCommuted        , "Number of instructions commuted to coalesce");
+STATISTIC(NumAggrCommuted    , "Number of instructions aggressively commuted");
+STATISTIC(NumConvertedTo3Addr, "Number of instructions promoted to 3-address");
+STATISTIC(Num3AddrSunk,        "Number of 3-address instructions sunk");
+STATISTIC(NumReMats,           "Number of instructions re-materialized");
+STATISTIC(NumDeletes,          "Number of dead instructions deleted");
+
+namespace {
+  class TwoAddressInstructionPass : public MachineFunctionPass {
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineRegisterInfo *MRI;
+    LiveVariables *LV;
+    AliasAnalysis *AA;
+
+    // DistanceMap - Keeps track of the distance of each MI from the start of
+    // the current basic block.
+    DenseMap<MachineInstr*, unsigned> DistanceMap;
+
+    // SrcRegMap - A map from virtual registers to the physical registers they
+    // are likely to be coalesced with, due to copies from physical to virtual
+    // registers, e.g. v1024 = move r0.
+    DenseMap<unsigned, unsigned> SrcRegMap;
+
+    // DstRegMap - A map from virtual registers to the physical registers they
+    // are likely to be coalesced with, due to copies to physical registers
+    // from virtual registers, e.g. r1 = move v1024.
+    DenseMap<unsigned, unsigned> DstRegMap;
+
+    /// RegSequences - Keep track of the REG_SEQUENCE instructions seen
+    /// during the initial walk of the machine function.
+    SmallVector<MachineInstr*, 16> RegSequences;
+
+    bool Sink3AddrInstruction(MachineBasicBlock *MBB, MachineInstr *MI,
+                              unsigned Reg,
+                              MachineBasicBlock::iterator OldPos);
+
+    bool isProfitableToReMat(unsigned Reg, const TargetRegisterClass *RC,
+                             MachineInstr *MI, MachineInstr *DefMI,
+                             MachineBasicBlock *MBB, unsigned Loc);
+
+    bool NoUseAfterLastDef(unsigned Reg, MachineBasicBlock *MBB, unsigned Dist,
+                           unsigned &LastDef);
+
+    MachineInstr *FindLastUseInMBB(unsigned Reg, MachineBasicBlock *MBB,
+                                   unsigned Dist);
+
+    bool isProfitableToCommute(unsigned regB, unsigned regC,
+                               MachineInstr *MI, MachineBasicBlock *MBB,
+                               unsigned Dist);
+
+    bool CommuteInstruction(MachineBasicBlock::iterator &mi,
+                            MachineFunction::iterator &mbbi,
+                            unsigned RegB, unsigned RegC, unsigned Dist);
+
+    bool isProfitableToConv3Addr(unsigned RegA);
+
+    bool ConvertInstTo3Addr(MachineBasicBlock::iterator &mi,
+                            MachineBasicBlock::iterator &nmi,
+                            MachineFunction::iterator &mbbi,
+                            unsigned RegB, unsigned Dist);
+
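+    // NewKill pairs (register, "the killing instruction also redefines it")
+    // with the instruction that becomes the new kill point.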
+    typedef std::pair<std::pair<unsigned, bool>, MachineInstr*> NewKill;
+    bool canUpdateDeletedKills(SmallVector<unsigned, 4> &Kills,
+                               SmallVector<NewKill, 4> &NewKills,
+                               MachineBasicBlock *MBB, unsigned Dist);
+    bool DeleteUnusedInstr(MachineBasicBlock::iterator &mi,
+                           MachineBasicBlock::iterator &nmi,
+                           MachineFunction::iterator &mbbi, unsigned Dist);
+
+    bool TryInstructionTransform(MachineBasicBlock::iterator &mi,
+                                 MachineBasicBlock::iterator &nmi,
+                                 MachineFunction::iterator &mbbi,
+                                 unsigned SrcIdx, unsigned DstIdx,
+                                 unsigned Dist);
+
+    void ProcessCopy(MachineInstr *MI, MachineBasicBlock *MBB,
+                     SmallPtrSet<MachineInstr*, 8> &Processed);
+
+    void CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs, unsigned DstReg);
+
+    /// EliminateRegSequences - Eliminate REG_SEQUENCE instructions as part
+    /// of the de-ssa process. This replaces sources of REG_SEQUENCE as
+    /// sub-register references of the register defined by REG_SEQUENCE.
+    bool EliminateRegSequences();
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    TwoAddressInstructionPass() : MachineFunctionPass(ID) {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<AliasAnalysis>();
+      AU.addPreserved<LiveVariables>();
+      AU.addPreservedID(MachineLoopInfoID);
+      AU.addPreservedID(MachineDominatorsID);
+      if (StrongPHIElim)
+        AU.addPreservedID(StrongPHIEliminationID);
+      else
+        AU.addPreservedID(PHIEliminationID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    /// runOnMachineFunction - Pass entry point.
+    bool runOnMachineFunction(MachineFunction&);
+  };
+}
+
+char TwoAddressInstructionPass::ID = 0;
+static RegisterPass<TwoAddressInstructionPass>
+X("twoaddressinstruction", "Two-Address instruction pass");
+
+char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID;
+
+/// Sink3AddrInstruction - A two-address instruction has been converted to a
+/// three-address instruction to avoid clobbering a register. Try to sink it
+/// past the instruction that would kill the above-mentioned register to reduce
+/// register pressure.
+bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB,
+                                           MachineInstr *MI, unsigned SavedReg,
+                                           MachineBasicBlock::iterator OldPos) {
+  // Check if it's safe to move this instruction.
+  bool SeenStore = true; // Be conservative.
+  if (!MI->isSafeToMove(TII, AA, SeenStore))
+    return false;
+
+  unsigned DefReg = 0;
+  SmallSet<unsigned, 4> UseRegs;
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned MOReg = MO.getReg();
+    if (!MOReg)
+      continue;
+    if (MO.isUse() && MOReg != SavedReg)
+      UseRegs.insert(MO.getReg());
+    if (!MO.isDef())
+      continue;
+    if (MO.isImplicit())
+      // Don't try to move it if it implicitly defines a register.
+      return false;
+    if (DefReg)
+      // For now, don't move any instructions that define multiple registers.
+      return false;
+    DefReg = MO.getReg();
+  }
+
+  // Find the instruction that kills SavedReg.
+  MachineInstr *KillMI = NULL;
+  for (MachineRegisterInfo::use_nodbg_iterator
+         UI = MRI->use_nodbg_begin(SavedReg),
+         UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
+    MachineOperand &UseMO = UI.getOperand();
+    if (!UseMO.isKill())
+      continue;
+    KillMI = UseMO.getParent();
+    break;
+  }
+
+  if (!KillMI || KillMI->getParent() != MBB || KillMI == MI)
+    return false;
+
+  // If any of the definitions are used by another instruction between the
+  // position and the kill use, then it's not safe to sink it.
+  // 
+  // FIXME: This can be sped up if there is an easy way to query whether an
+  // instruction is before or after another instruction. Then we can use
+  // MachineRegisterInfo def / use instead.
+  MachineOperand *KillMO = NULL;
+  MachineBasicBlock::iterator KillPos = KillMI;
+  ++KillPos;
+
+  unsigned NumVisited = 0;
+  for (MachineBasicBlock::iterator I = llvm::next(OldPos); I != KillPos; ++I) {
+    MachineInstr *OtherMI = I;
+    // DBG_VALUE cannot be counted against the limit.
+    if (OtherMI->isDebugValue())
+      continue;
+    if (NumVisited > 30)  // FIXME: Arbitrary limit to reduce compile time cost.
+      return false;
+    ++NumVisited;
+    for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = OtherMI->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned MOReg = MO.getReg();
+      if (!MOReg)
+        continue;
+      if (DefReg == MOReg)
+        return false;
+
+      if (MO.isKill()) {
+        if (OtherMI == KillMI && MOReg == SavedReg)
+          // Save the operand that kills the register. We want to unset the kill
+          // marker if we can sink MI past it.
+          KillMO = &MO;
+        else if (UseRegs.count(MOReg))
+          // One of the uses is killed before the destination.
+          return false;
+      }
+    }
+  }
+
+  // Update kill and LV information.
+  KillMO->setIsKill(false);
+  KillMO = MI->findRegisterUseOperand(SavedReg, false, TRI);
+  KillMO->setIsKill(true);
+  
+  if (LV)
+    LV->replaceKillInstruction(SavedReg, KillMI, MI);
+
+  // Move instruction to its destination.
+  MBB->remove(MI);
+  MBB->insert(KillPos, MI);
+
+  ++Num3AddrSunk;
+  return true;
+}
+
+/// isTwoAddrUse - Return true if the specified MI is using the specified
+/// register as a two-address operand.
+static bool isTwoAddrUse(MachineInstr *UseMI, unsigned Reg) {
+  const TargetInstrDesc &TID = UseMI->getDesc();
+  for (unsigned i = 0, e = TID.getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = UseMI->getOperand(i);
+    if (MO.isReg() && MO.getReg() == Reg &&
+        (MO.isDef() || UseMI->isRegTiedToDefOperand(i)))
+      // Earlier use is a two-address one.
+      return true;
+  }
+  return false;
+}
+
+/// isProfitableToReMat - Return true if the heuristic determines it is likely
+/// to be profitable to re-materialize the definition of Reg rather than copy
+/// the register.
+bool
+TwoAddressInstructionPass::isProfitableToReMat(unsigned Reg,
+                                         const TargetRegisterClass *RC,
+                                         MachineInstr *MI, MachineInstr *DefMI,
+                                         MachineBasicBlock *MBB, unsigned Loc) {
+  bool OtherUse = false;
+  for (MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(Reg),
+         UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
+    MachineOperand &UseMO = UI.getOperand();
+    MachineInstr *UseMI = UseMO.getParent();
+    MachineBasicBlock *UseMBB = UseMI->getParent();
+    if (UseMBB == MBB) {
+      DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UseMI);
+      if (DI != DistanceMap.end() && DI->second == Loc)
+        continue;  // Current use.
+      OtherUse = true;
+      // There is at least one other use in the MBB that will clobber the
+      // register.
+      if (isTwoAddrUse(UseMI, Reg))
+        return true;
+    }
+  }
+
+  // If other uses in MBB are not two-address uses, then don't remat.
+  if (OtherUse)
+    return false;
+
+  // No other uses in the same block, remat if it's defined in the same
+  // block so it does not unnecessarily extend the live range.
+  return MBB == DefMI->getParent();
+}
+
+/// NoUseAfterLastDef - Return true if there are no intervening uses between the
+/// last instruction in the MBB that defines the specified register and the
+/// two-address instruction which is being processed. It also returns the last
+/// def location by reference.
+bool TwoAddressInstructionPass::NoUseAfterLastDef(unsigned Reg,
+                                           MachineBasicBlock *MBB, unsigned Dist,
+                                           unsigned &LastDef) {
+  LastDef = 0;
+  unsigned LastUse = Dist;
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Reg),
+         E = MRI->reg_end(); I != E; ++I) {
+    MachineOperand &MO = I.getOperand();
+    MachineInstr *MI = MO.getParent();
+    if (MI->getParent() != MBB || MI->isDebugValue())
+      continue;
+    DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+    if (DI == DistanceMap.end())
+      continue;
+    if (MO.isUse() && DI->second < LastUse)
+      LastUse = DI->second;
+    if (MO.isDef() && DI->second > LastDef)
+      LastDef = DI->second;
+  }
+
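+  // There is an intervening use only if some use falls strictly after the
+  // last def and strictly before the current instruction (at distance Dist).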
+  return !(LastUse > LastDef && LastUse < Dist);
+}
+
+MachineInstr *TwoAddressInstructionPass::FindLastUseInMBB(unsigned Reg,
+                                                         MachineBasicBlock *MBB,
+                                                         unsigned Dist) {
+  unsigned LastUseDist = 0;
+  MachineInstr *LastUse = 0;
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Reg),
+         E = MRI->reg_end(); I != E; ++I) {
+    MachineOperand &MO = I.getOperand();
+    MachineInstr *MI = MO.getParent();
+    if (MI->getParent() != MBB || MI->isDebugValue())
+      continue;
+    DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+    if (DI == DistanceMap.end())
+      continue;
+    if (DI->second >= Dist)
+      continue;
+
+    if (MO.isUse() && DI->second > LastUseDist) {
+      LastUse = DI->first;
+      LastUseDist = DI->second;
+    }
+  }
+  return LastUse;
+}
+
+/// isCopyToReg - Return true if the specified MI is a copy, insert_subreg, or
+/// subreg_to_reg instruction. It also returns the source and destination
+/// registers, and whether they are physical registers, by reference.
+static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII,
+                        unsigned &SrcReg, unsigned &DstReg,
+                        bool &IsSrcPhys, bool &IsDstPhys) {
+  SrcReg = 0;
+  DstReg = 0;
+  if (MI.isCopy()) {
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+  } else if (MI.isInsertSubreg() || MI.isSubregToReg()) {
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(2).getReg();
+  } else
+    return false;
+
+  IsSrcPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg);
+  IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+  return true;
+}
+
+/// isKilled - Test if the given register value, which is used by the given
+/// instruction, is killed by the given instruction. This looks through
+/// coalescable copies to see if the original value is potentially not killed.
+///
+/// For example, in this code:
+///
+///   %reg1034 = copy %reg1024
+///   %reg1035 = copy %reg1025<kill>
+///   %reg1036 = add %reg1034<kill>, %reg1035<kill>
+///
+/// %reg1034 is not considered to be killed, since it is copied from a
+/// register which is not killed. Treating it as not killed lets the
+/// normal heuristics commute the (two-address) add, which lets
+/// coalescing eliminate the extra copy.
+///
+static bool isKilled(MachineInstr &MI, unsigned Reg,
+                     const MachineRegisterInfo *MRI,
+                     const TargetInstrInfo *TII) {
+  MachineInstr *DefMI = &MI;
+  for (;;) {
+    if (!DefMI->killsRegister(Reg))
+      return false;
+    if (TargetRegisterInfo::isPhysicalRegister(Reg))
+      return true;
+    MachineRegisterInfo::def_iterator Begin = MRI->def_begin(Reg);
+    // If there are multiple defs, we can't do a simple analysis, so just
+    // go with what the kill flag says.
+    if (llvm::next(Begin) != MRI->def_end())
+      return true;
+    DefMI = &*Begin;
+    bool IsSrcPhys, IsDstPhys;
+    unsigned SrcReg,  DstReg;
+    // If the def is something other than a copy, then it isn't going to
+    // be coalesced, so follow the kill flag.
+    if (!isCopyToReg(*DefMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys))
+      return true;
+    Reg = SrcReg;
+  }
+}
+
+/// isTwoAddrUse - Return true if the specified MI uses the specified register
+/// as a two-address use. If so, return the destination register by reference.
+static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) {
+  const TargetInstrDesc &TID = MI.getDesc();
+  unsigned NumOps = TID.getNumOperands();
+  for (unsigned i = 0; i != NumOps; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg)
+      continue;
+    unsigned ti;
+    if (MI.isRegTiedToDefOperand(i, &ti)) {
+      DstReg = MI.getOperand(ti).getReg();
+      return true;
+    }
+  }
+  return false;
+}
+
+/// findOnlyInterestingUse - Given a register, if it has a single use inside
+/// the basic block, return the use instruction if it's a copy or a two-address
+/// use.
+static
+MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB,
+                                     MachineRegisterInfo *MRI,
+                                     const TargetInstrInfo *TII,
+                                     bool &IsCopy,
+                                     unsigned &DstReg, bool &IsDstPhys) {
+  if (!MRI->hasOneNonDBGUse(Reg))
+    // None or more than one use.
+    return 0;
+  MachineInstr &UseMI = *MRI->use_nodbg_begin(Reg);
+  if (UseMI.getParent() != MBB)
+    return 0;
+  unsigned SrcReg;
+  bool IsSrcPhys;
+  if (isCopyToReg(UseMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys)) {
+    IsCopy = true;
+    return &UseMI;
+  }
+  IsDstPhys = false;
+  if (isTwoAddrUse(UseMI, Reg, DstReg)) {
+    IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+    return &UseMI;
+  }
+  return 0;
+}
+
+/// getMappedReg - Return the physical register the specified virtual register
+/// might be mapped to.
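+/// E.g. (illustrative), with map entries v1026 -> v1024 and v1024 -> r0,
+/// getMappedReg(v1026, RegMap) follows the chain and returns r0.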
+static unsigned
+getMappedReg(unsigned Reg, DenseMap<unsigned, unsigned> &RegMap) {
+  while (TargetRegisterInfo::isVirtualRegister(Reg))  {
+    DenseMap<unsigned, unsigned>::iterator SI = RegMap.find(Reg);
+    if (SI == RegMap.end())
+      return 0;
+    Reg = SI->second;
+  }
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return Reg;
+  return 0;
+}
+
+/// regsAreCompatible - Return true if the two registers are equal or aliased.
+///
+static bool
+regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) {
+  if (RegA == RegB)
+    return true;
+  if (!RegA || !RegB)
+    return false;
+  return TRI->regsOverlap(RegA, RegB);
+}
+
+
+/// isProfitableToCommute - Return true if it's potentially profitable to
+/// commute the two-address instruction that's being processed.
+bool
+TwoAddressInstructionPass::isProfitableToCommute(unsigned regB, unsigned regC,
+                                       MachineInstr *MI, MachineBasicBlock *MBB,
+                                       unsigned Dist) {
+  // Determine if it's profitable to commute this two address instruction. In
+  // general, we want no uses between this instruction and the definition of
+  // the two-address register.
+  // e.g.
+  // %reg1028<def> = EXTRACT_SUBREG %reg1027<kill>, 1
+  // %reg1029<def> = MOV8rr %reg1028
+  // %reg1029<def> = SHR8ri %reg1029, 7, %EFLAGS<imp-def,dead>
+  // insert => %reg1030<def> = MOV8rr %reg1028
+  // %reg1030<def> = ADD8rr %reg1028<kill>, %reg1029<kill>, %EFLAGS<imp-def,dead>
+  // In this case, it might not be possible to coalesce the second MOV8rr
+  // instruction if the first one is coalesced. So it would be profitable to
+  // commute it:
+  // %reg1028<def> = EXTRACT_SUBREG %reg1027<kill>, 1
+  // %reg1029<def> = MOV8rr %reg1028
+  // %reg1029<def> = SHR8ri %reg1029, 7, %EFLAGS<imp-def,dead>
+  // insert => %reg1030<def> = MOV8rr %reg1029
+  // %reg1030<def> = ADD8rr %reg1029<kill>, %reg1028<kill>, %EFLAGS<imp-def,dead>  
+
+  if (!MI->killsRegister(regC))
+    return false;
+
+  // Ok, we have something like:
+  // %reg1030<def> = ADD8rr %reg1028<kill>, %reg1029<kill>, %EFLAGS<imp-def,dead>
+  // let's see if it's worth commuting it.
+
+  // Look for situations like this:
+  // %reg1024<def> = MOV r1
+  // %reg1025<def> = MOV r0
+  // %reg1026<def> = ADD %reg1024, %reg1025
+  // r0            = MOV %reg1026
+  // Commute the ADD to hopefully eliminate an otherwise unavoidable copy.
+  unsigned FromRegB = getMappedReg(regB, SrcRegMap);
+  unsigned FromRegC = getMappedReg(regC, SrcRegMap);
+  unsigned ToRegB = getMappedReg(regB, DstRegMap);
+  unsigned ToRegC = getMappedReg(regC, DstRegMap);
+  if (!regsAreCompatible(FromRegB, ToRegB, TRI) &&
+      (regsAreCompatible(FromRegB, ToRegC, TRI) ||
+       regsAreCompatible(FromRegC, ToRegB, TRI)))
+    return true;
+
+  // If there is a use of regC between its last def (could be livein) and this
+  // instruction, then bail.
+  unsigned LastDefC = 0;
+  if (!NoUseAfterLastDef(regC, MBB, Dist, LastDefC))
+    return false;
+
+  // If there is a use of regB between its last def (could be livein) and this
+  // instruction, then go ahead and make this transformation.
+  unsigned LastDefB = 0;
+  if (!NoUseAfterLastDef(regB, MBB, Dist, LastDefB))
+    return true;
+
+  // Since there are no intervening uses of either register, commute if the
+  // def of regC is closer: its live interval will be shorter.
+  return LastDefB && LastDefC && LastDefC > LastDefB;
+}
+
+/// CommuteInstruction - Commute a two-address instruction and update the basic
+/// block, distance map, and live variables if needed. Return true if it is
+/// successful.
+bool
+TwoAddressInstructionPass::CommuteInstruction(MachineBasicBlock::iterator &mi,
+                               MachineFunction::iterator &mbbi,
+                               unsigned RegB, unsigned RegC, unsigned Dist) {
+  MachineInstr *MI = mi;
+  DEBUG(dbgs() << "2addr: COMMUTING  : " << *MI);
+  MachineInstr *NewMI = TII->commuteInstruction(MI);
+
+  if (NewMI == 0) {
+    DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n");
+    return false;
+  }
+
+  DEBUG(dbgs() << "2addr: COMMUTED TO: " << *NewMI);
+  // If commuting produced a new instruction, update live variables.
+  if (NewMI != MI) {
+    if (LV)
+      // Update live variables
+      LV->replaceKillInstruction(RegC, MI, NewMI);
+
+    mbbi->insert(mi, NewMI);           // Insert the new inst
+    mbbi->erase(mi);                   // Nuke the old inst.
+    mi = NewMI;
+    DistanceMap.insert(std::make_pair(NewMI, Dist));
+  }
+
+  // Update source register map.
+  unsigned FromRegC = getMappedReg(RegC, SrcRegMap);
+  if (FromRegC) {
+    unsigned RegA = MI->getOperand(0).getReg();
+    SrcRegMap[RegA] = FromRegC;
+  }
+
+  return true;
+}
+
+/// isProfitableToConv3Addr - Return true if it is profitable to convert the
+/// given 2-address instruction to a 3-address one.
+bool
+TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA) {
+  // Look for situations like this:
+  // %reg1024<def> = MOV r1
+  // %reg1025<def> = MOV r0
+  // %reg1026<def> = ADD %reg1024, %reg1025
+  // r2            = MOV %reg1026
+  // Turn ADD into a 3-address instruction to avoid a copy.
+  unsigned FromRegA = getMappedReg(RegA, SrcRegMap);
+  unsigned ToRegA = getMappedReg(RegA, DstRegMap);
+  return (FromRegA && ToRegA && !regsAreCompatible(FromRegA, ToRegA, TRI));
+}
+
+/// ConvertInstTo3Addr - Convert the specified two-address instruction into a
+/// three address one. Return true if this transformation was successful.
+bool
+TwoAddressInstructionPass::ConvertInstTo3Addr(MachineBasicBlock::iterator &mi,
+                                              MachineBasicBlock::iterator &nmi,
+                                              MachineFunction::iterator &mbbi,
+                                              unsigned RegB, unsigned Dist) {
+  MachineInstr *NewMI = TII->convertToThreeAddress(mbbi, mi, LV);
+  if (NewMI) {
+    DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+    DEBUG(dbgs() << "2addr:         TO 3-ADDR: " << *NewMI);
+    bool Sunk = false;
+
+    if (NewMI->findRegisterUseOperand(RegB, false, TRI))
+      // FIXME: Temporary workaround. If the new instruction doesn't
+      // use RegB, convertToThreeAddress must have created more
+      // than one instruction.
+      Sunk = Sink3AddrInstruction(mbbi, NewMI, RegB, mi);
+
+    mbbi->erase(mi); // Nuke the old inst.
+
+    if (!Sunk) {
+      DistanceMap.insert(std::make_pair(NewMI, Dist));
+      mi = NewMI;
+      nmi = llvm::next(mi);
+    }
+    return true;
+  }
+
+  return false;
+}
+
+/// ProcessCopy - If the specified instruction is not yet processed, process it
+/// if it's a copy. For a copy instruction, we find the physical registers the
+/// source and destination registers might be mapped to. These are kept in
+/// point-to maps used to determine future optimizations. e.g.
+/// v1024 = mov r0
+/// v1025 = mov r1
+/// v1026 = add v1024, v1025
+/// r1    = mov v1026
+/// If 'add' is a two-address instruction, v1024, v1026 are both potentially
+/// coalesced to r0 (from the input side). v1025 is mapped to r1. v1026 is
+/// potentially joined with r1 on the output side. It's worthwhile to commute
+/// 'add' to eliminate a copy.
+void TwoAddressInstructionPass::ProcessCopy(MachineInstr *MI,
+                                     MachineBasicBlock *MBB,
+                                     SmallPtrSet<MachineInstr*, 8> &Processed) {
+  if (Processed.count(MI))
+    return;
+
+  bool IsSrcPhys, IsDstPhys;
+  unsigned SrcReg, DstReg;
+  if (!isCopyToReg(*MI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys))
+    return;
+
+  if (IsDstPhys && !IsSrcPhys)
+    DstRegMap.insert(std::make_pair(SrcReg, DstReg));
+  else if (!IsDstPhys && IsSrcPhys) {
+    bool isNew = SrcRegMap.insert(std::make_pair(DstReg, SrcReg)).second;
+    if (!isNew)
+      assert(SrcRegMap[DstReg] == SrcReg &&
+             "Can't map to two src physical registers!");
+
+    SmallVector<unsigned, 4> VirtRegPairs;
+    bool IsCopy = false;
+    unsigned NewReg = 0;
+    while (MachineInstr *UseMI = findOnlyInterestingUse(DstReg, MBB, MRI,TII,
+                                                   IsCopy, NewReg, IsDstPhys)) {
+      if (IsCopy) {
+        if (!Processed.insert(UseMI))
+          break;
+      }
+
+      DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UseMI);
+      if (DI != DistanceMap.end())
+        // Earlier in the same MBB. Reached via a back edge.
+        break;
+
+      if (IsDstPhys) {
+        VirtRegPairs.push_back(NewReg);
+        break;
+      }
+      bool isNew = SrcRegMap.insert(std::make_pair(NewReg, DstReg)).second;
+      if (!isNew)
+        assert(SrcRegMap[NewReg] == DstReg &&
+               "Can't map to two src physical registers!");
+      VirtRegPairs.push_back(NewReg);
+      DstReg = NewReg;
+    }
+
+    if (!VirtRegPairs.empty()) {
+      unsigned ToReg = VirtRegPairs.back();
+      VirtRegPairs.pop_back();
+      while (!VirtRegPairs.empty()) {
+        unsigned FromReg = VirtRegPairs.back();
+        VirtRegPairs.pop_back();
+        bool isNew = DstRegMap.insert(std::make_pair(FromReg, ToReg)).second;
+        if (!isNew)
+          assert(DstRegMap[FromReg] == ToReg &&
+                 "Can't map to two dst physical registers!");
+        ToReg = FromReg;
+      }
+    }
+  }
+
+  Processed.insert(MI);
+}
+
+/// isSafeToDelete - If the specified instruction does not produce any side
+/// effects and all of its defs are dead, then it's safe to delete.
+static bool isSafeToDelete(MachineInstr *MI,
+                           const TargetInstrInfo *TII,
+                           SmallVector<unsigned, 4> &Kills) {
+  const TargetInstrDesc &TID = MI->getDesc();
+  if (TID.mayStore() || TID.isCall())
+    return false;
+  if (TID.isTerminator() || TID.hasUnmodeledSideEffects())
+    return false;
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    if (MO.isDef() && !MO.isDead())
+      return false;
+    if (MO.isUse() && MO.isKill())
+      Kills.push_back(MO.getReg());
+  }
+  return true;
+}
+
+/// canUpdateDeletedKills - Check if all the registers listed in Kills are
+/// killed by instructions in MBB preceding the current instruction at
+/// position Dist.  If so, return true and record information about the
+/// preceding kills in NewKills.
+bool TwoAddressInstructionPass::
+canUpdateDeletedKills(SmallVector<unsigned, 4> &Kills,
+                      SmallVector<NewKill, 4> &NewKills,
+                      MachineBasicBlock *MBB, unsigned Dist) {
+  while (!Kills.empty()) {
+    unsigned Kill = Kills.back();
+    Kills.pop_back();
+    if (TargetRegisterInfo::isPhysicalRegister(Kill))
+      return false;
+
+    MachineInstr *LastKill = FindLastUseInMBB(Kill, MBB, Dist);
+    if (!LastKill)
+      return false;
+
+    bool isModRef = LastKill->definesRegister(Kill);
+    NewKills.push_back(std::make_pair(std::make_pair(Kill, isModRef),
+                                      LastKill));
+  }
+  return true;
+}
+
+/// DeleteUnusedInstr - If an instruction with a tied register operand can
+/// be safely deleted, just delete it.
+bool
+TwoAddressInstructionPass::DeleteUnusedInstr(MachineBasicBlock::iterator &mi,
+                                             MachineBasicBlock::iterator &nmi,
+                                             MachineFunction::iterator &mbbi,
+                                             unsigned Dist) {
+  // Check if the instruction has no side effects and if all its defs are dead.
+  SmallVector<unsigned, 4> Kills;
+  if (!isSafeToDelete(mi, TII, Kills))
+    return false;
+
+  // If this instruction kills some virtual registers, we need to
+  // update the kill information. If it's not possible to do so,
+  // then bail out.
+  SmallVector<NewKill, 4> NewKills;
+  if (!canUpdateDeletedKills(Kills, NewKills, &*mbbi, Dist))
+    return false;
+
+  if (LV) {
+    while (!NewKills.empty()) {
+      MachineInstr *NewKill = NewKills.back().second;
+      unsigned Kill = NewKills.back().first.first;
+      bool isDead = NewKills.back().first.second;
+      NewKills.pop_back();
+      if (LV->removeVirtualRegisterKilled(Kill, mi)) {
+        if (isDead)
+          LV->addVirtualRegisterDead(Kill, NewKill);
+        else
+          LV->addVirtualRegisterKilled(Kill, NewKill);
+      }
+    }
+  }
+
+  mbbi->erase(mi); // Nuke the old inst.
+  mi = nmi;
+  return true;
+}
+
+/// TryInstructionTransform - For the case where an instruction has a single
+/// pair of tied register operands, attempt some transformations that may
+/// either eliminate the tied operands or improve the opportunities for
+/// coalescing away the register copy.  Returns true if the tied operands
+/// are eliminated altogether.
+bool TwoAddressInstructionPass::
+TryInstructionTransform(MachineBasicBlock::iterator &mi,
+                        MachineBasicBlock::iterator &nmi,
+                        MachineFunction::iterator &mbbi,
+                        unsigned SrcIdx, unsigned DstIdx, unsigned Dist) {
+  const TargetInstrDesc &TID = mi->getDesc();
+  unsigned regA = mi->getOperand(DstIdx).getReg();
+  unsigned regB = mi->getOperand(SrcIdx).getReg();
+
+  assert(TargetRegisterInfo::isVirtualRegister(regB) &&
+         "cannot make instruction into two-address form");
+
+  // If regA is dead and the instruction can be deleted, just delete
+  // it so it doesn't clobber regB.
+  bool regBKilled = isKilled(*mi, regB, MRI, TII);
+  if (!regBKilled && mi->getOperand(DstIdx).isDead() &&
+      DeleteUnusedInstr(mi, nmi, mbbi, Dist)) {
+    ++NumDeletes;
+    return true; // Done with this instruction.
+  }
+
+  // Check if it is profitable to commute the operands.
+  unsigned SrcOp1, SrcOp2;
+  unsigned regC = 0;
+  unsigned regCIdx = ~0U;
+  bool TryCommute = false;
+  bool AggressiveCommute = false;
+  if (TID.isCommutable() && mi->getNumOperands() >= 3 &&
+      TII->findCommutedOpIndices(mi, SrcOp1, SrcOp2)) {
+    if (SrcIdx == SrcOp1)
+      regCIdx = SrcOp2;
+    else if (SrcIdx == SrcOp2)
+      regCIdx = SrcOp1;
+
+    if (regCIdx != ~0U) {
+      regC = mi->getOperand(regCIdx).getReg();
+      if (!regBKilled && isKilled(*mi, regC, MRI, TII))
+        // If C dies but B does not, swap the B and C operands.
+        // This makes the live ranges of A and C joinable.
+        TryCommute = true;
+      else if (isProfitableToCommute(regB, regC, mi, mbbi, Dist)) {
+        TryCommute = true;
+        AggressiveCommute = true;
+      }
+    }
+  }
+
+  // If it's profitable to commute, try to do so.
+  if (TryCommute && CommuteInstruction(mi, mbbi, regB, regC, Dist)) {
+    ++NumCommuted;
+    if (AggressiveCommute)
+      ++NumAggrCommuted;
+    return false;
+  }
+
+  if (TID.isConvertibleTo3Addr()) {
+    // This instruction is potentially convertible to a true
+    // three-address instruction.  Check if it is profitable.
+    if (!regBKilled || isProfitableToConv3Addr(regA)) {
+      // Try to convert it.
+      if (ConvertInstTo3Addr(mi, nmi, mbbi, regB, Dist)) {
+        ++NumConvertedTo3Addr;
+        return true; // Done with this instruction.
+      }
+    }
+  }
+
+  // If this is an instruction with a load folded into it, try unfolding
+  // the load, e.g. avoid this:
+  //   movq %rdx, %rcx
+  //   addq (%rax), %rcx
+  // in favor of this:
+  //   movq (%rax), %rcx
+  //   addq %rdx, %rcx
+  // because it's preferable to schedule a load than a register copy.
+  if (TID.mayLoad() && !regBKilled) {
+    // Determine if a load can be unfolded.
+    unsigned LoadRegIndex;
+    unsigned NewOpc =
+      TII->getOpcodeAfterMemoryUnfold(mi->getOpcode(),
+                                      /*UnfoldLoad=*/true,
+                                      /*UnfoldStore=*/false,
+                                      &LoadRegIndex);
+    if (NewOpc != 0) {
+      const TargetInstrDesc &UnfoldTID = TII->get(NewOpc);
+      if (UnfoldTID.getNumDefs() == 1) {
+        MachineFunction &MF = *mbbi->getParent();
+
+        // Unfold the load.
+        DEBUG(dbgs() << "2addr:   UNFOLDING: " << *mi);
+        const TargetRegisterClass *RC =
+          UnfoldTID.OpInfo[LoadRegIndex].getRegClass(TRI);
+        unsigned Reg = MRI->createVirtualRegister(RC);
+        SmallVector<MachineInstr *, 2> NewMIs;
+        if (!TII->unfoldMemoryOperand(MF, mi, Reg,
+                                      /*UnfoldLoad=*/true,/*UnfoldStore=*/false,
+                                      NewMIs)) {
+          DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
+          return false;
+        }
+        assert(NewMIs.size() == 2 &&
+               "Unfolded a load into multiple instructions!");
+        // The load was previously folded, so this is the only use.
+        NewMIs[1]->addRegisterKilled(Reg, TRI);
+
+        // Tentatively insert the instructions into the block so that they
+        // look "normal" to the transformation logic.
+        mbbi->insert(mi, NewMIs[0]);
+        mbbi->insert(mi, NewMIs[1]);
+
+        DEBUG(dbgs() << "2addr:    NEW LOAD: " << *NewMIs[0]
+                     << "2addr:    NEW INST: " << *NewMIs[1]);
+
+        // Transform the instruction, now that it no longer has a load.
+        unsigned NewDstIdx = NewMIs[1]->findRegisterDefOperandIdx(regA);
+        unsigned NewSrcIdx = NewMIs[1]->findRegisterUseOperandIdx(regB);
+        MachineBasicBlock::iterator NewMI = NewMIs[1];
+        bool TransformSuccess =
+          TryInstructionTransform(NewMI, mi, mbbi,
+                                  NewSrcIdx, NewDstIdx, Dist);
+        if (TransformSuccess ||
+            NewMIs[1]->getOperand(NewSrcIdx).isKill()) {
+          // Success, or at least we made an improvement. Keep the unfolded
+          // instructions and discard the original.
+          if (LV) {
+            for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
+              MachineOperand &MO = mi->getOperand(i);
+              if (MO.isReg() && MO.getReg() != 0 &&
+                  TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+                if (MO.isUse()) {
+                  if (MO.isKill()) {
+                    if (NewMIs[0]->killsRegister(MO.getReg()))
+                      LV->replaceKillInstruction(MO.getReg(), mi, NewMIs[0]);
+                    else {
+                      assert(NewMIs[1]->killsRegister(MO.getReg()) &&
+                             "Kill missing after load unfold!");
+                      LV->replaceKillInstruction(MO.getReg(), mi, NewMIs[1]);
+                    }
+                  }
+                } else if (LV->removeVirtualRegisterDead(MO.getReg(), mi)) {
+                  if (NewMIs[1]->registerDefIsDead(MO.getReg()))
+                    LV->addVirtualRegisterDead(MO.getReg(), NewMIs[1]);
+                  else {
+                    assert(NewMIs[0]->registerDefIsDead(MO.getReg()) &&
+                           "Dead flag missing after load unfold!");
+                    LV->addVirtualRegisterDead(MO.getReg(), NewMIs[0]);
+                  }
+                }
+              }
+            }
+            LV->addVirtualRegisterKilled(Reg, NewMIs[1]);
+          }
+          mi->eraseFromParent();
+          mi = NewMIs[1];
+          if (TransformSuccess)
+            return true;
+        } else {
+          // Transforming didn't eliminate the tie and didn't lead to an
+          // improvement. Clean up the unfolded instructions and keep the
+          // original.
+          DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
+          NewMIs[0]->eraseFromParent();
+          NewMIs[1]->eraseFromParent();
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+/// runOnMachineFunction - Reduce two-address instructions to two operands.
+///
+bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "Machine Function\n");
+  const TargetMachine &TM = MF.getTarget();
+  MRI = &MF.getRegInfo();
+  TII = TM.getInstrInfo();
+  TRI = TM.getRegisterInfo();
+  LV = getAnalysisIfAvailable<LiveVariables>();
+  AA = &getAnalysis<AliasAnalysis>();
+
+  bool MadeChange = false;
+
+  DEBUG(dbgs() << "********** REWRITING TWO-ADDR INSTRS **********\n");
+  DEBUG(dbgs() << "********** Function: " 
+        << MF.getFunction()->getName() << '\n');
+
+  // ReMatRegs - Keep track of the registers whose def's are remat'ed.
+  BitVector ReMatRegs;
+  ReMatRegs.resize(MRI->getLastVirtReg()+1);
+
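+  // TiedOperands maps each tied source register to the list of
+  // (use operand index, def operand index) pairs that reference it.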
+  typedef DenseMap<unsigned, SmallVector<std::pair<unsigned, unsigned>, 4> >
+    TiedOperandMap;
+  TiedOperandMap TiedOperands(4);
+
+  SmallPtrSet<MachineInstr*, 8> Processed;
+  for (MachineFunction::iterator mbbi = MF.begin(), mbbe = MF.end();
+       mbbi != mbbe; ++mbbi) {
+    unsigned Dist = 0;
+    DistanceMap.clear();
+    SrcRegMap.clear();
+    DstRegMap.clear();
+    Processed.clear();
+    for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
+         mi != me; ) {
+      MachineBasicBlock::iterator nmi = llvm::next(mi);
+      if (mi->isDebugValue()) {
+        mi = nmi;
+        continue;
+      }
+
+      // Remember REG_SEQUENCE instructions, we'll deal with them later.
+      if (mi->isRegSequence())
+        RegSequences.push_back(&*mi);
+
+      const TargetInstrDesc &TID = mi->getDesc();
+      bool FirstTied = true;
+
+      DistanceMap.insert(std::make_pair(mi, ++Dist));
+
+      ProcessCopy(&*mi, &*mbbi, Processed);
+
+      // First scan through all the tied register uses in this instruction
+      // and record a list of pairs of tied operands for each register.
+      unsigned NumOps = TID.getNumOperands();
+      for (unsigned SrcIdx = 0; SrcIdx < NumOps; ++SrcIdx) {
+        unsigned DstIdx = 0;
+        if (!mi->isRegTiedToDefOperand(SrcIdx, &DstIdx))
+          continue;
+
+        if (FirstTied) {
+          FirstTied = false;
+          ++NumTwoAddressInstrs;
+          DEBUG(dbgs() << '\t' << *mi);
+        }
+
+        assert(mi->getOperand(SrcIdx).isReg() &&
+               mi->getOperand(SrcIdx).getReg() &&
+               mi->getOperand(SrcIdx).isUse() &&
+               "two address instruction invalid");
+
+        unsigned regB = mi->getOperand(SrcIdx).getReg();
+        TiedOperandMap::iterator OI = TiedOperands.find(regB);
+        if (OI == TiedOperands.end()) {
+          SmallVector<std::pair<unsigned, unsigned>, 4> TiedPair;
+          OI = TiedOperands.insert(std::make_pair(regB, TiedPair)).first;
+        }
+        OI->second.push_back(std::make_pair(SrcIdx, DstIdx));
+      }
+
+      // Now iterate over the information collected above.
+      for (TiedOperandMap::iterator OI = TiedOperands.begin(),
+             OE = TiedOperands.end(); OI != OE; ++OI) {
+        SmallVector<std::pair<unsigned, unsigned>, 4> &TiedPairs = OI->second;
+
+        // If the instruction has a single pair of tied operands, try some
+        // transformations that may either eliminate the tied operands or
+        // improve the opportunities for coalescing away the register copy.
+        if (TiedOperands.size() == 1 && TiedPairs.size() == 1) {
+          unsigned SrcIdx = TiedPairs[0].first;
+          unsigned DstIdx = TiedPairs[0].second;
+
+          // If the registers are already equal, nothing needs to be done.
+          if (mi->getOperand(SrcIdx).getReg() ==
+              mi->getOperand(DstIdx).getReg())
+            break; // Done with this instruction.
+
+          if (TryInstructionTransform(mi, nmi, mbbi, SrcIdx, DstIdx, Dist))
+            break; // The tied operands have been eliminated.
+        }
+
+        bool RemovedKillFlag = false;
+        bool AllUsesCopied = true;
+        unsigned LastCopiedReg = 0;
+        unsigned regB = OI->first;
+        for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) {
+          unsigned SrcIdx = TiedPairs[tpi].first;
+          unsigned DstIdx = TiedPairs[tpi].second;
+          unsigned regA = mi->getOperand(DstIdx).getReg();
+          // Grab regB from the instruction because it may have changed if the
+          // instruction was commuted.
+          regB = mi->getOperand(SrcIdx).getReg();
+
+          if (regA == regB) {
+            // The register is tied to multiple destinations (or else we would
+            // not have continued this far), but this use of the register
+            // already matches the tied destination.  Leave it.
+            AllUsesCopied = false;
+            continue;
+          }
+          LastCopiedReg = regA;
+
+          assert(TargetRegisterInfo::isVirtualRegister(regB) &&
+                 "cannot make instruction into two-address form");
+
+#ifndef NDEBUG
+          // First, verify that we don't have a use of "a" in the instruction
+          // (a = b + a for example) because our transformation will not
+          // work. This should never occur because we are in SSA form.
+          for (unsigned i = 0; i != mi->getNumOperands(); ++i)
+            assert(i == DstIdx ||
+                   !mi->getOperand(i).isReg() ||
+                   mi->getOperand(i).getReg() != regA);
+#endif
+
+          // Emit a copy or rematerialize the definition.
+          const TargetRegisterClass *rc = MRI->getRegClass(regB);
+          MachineInstr *DefMI = MRI->getVRegDef(regB);
+          // If it's safe and profitable, remat the definition instead of
+          // copying it.
+          if (DefMI &&
+              DefMI->getDesc().isAsCheapAsAMove() &&
+              DefMI->isSafeToReMat(TII, AA, regB) &&
+              isProfitableToReMat(regB, rc, mi, DefMI, mbbi, Dist)){
+            DEBUG(dbgs() << "2addr: REMATTING : " << *DefMI << "\n");
+            unsigned regASubIdx = mi->getOperand(DstIdx).getSubReg();
+            TII->reMaterialize(*mbbi, mi, regA, regASubIdx, DefMI, *TRI);
+            ReMatRegs.set(regB);
+            ++NumReMats;
+          } else {
+            BuildMI(*mbbi, mi, mi->getDebugLoc(), TII->get(TargetOpcode::COPY),
+                    regA).addReg(regB);
+          }
+
+          MachineBasicBlock::iterator prevMI = prior(mi);
+          // Update DistanceMap.
+          DistanceMap.insert(std::make_pair(prevMI, Dist));
+          DistanceMap[mi] = ++Dist;
+
+          DEBUG(dbgs() << "\t\tprepend:\t" << *prevMI);
+
+          MachineOperand &MO = mi->getOperand(SrcIdx);
+          assert(MO.isReg() && MO.getReg() == regB && MO.isUse() &&
+                 "inconsistent operand info for 2-reg pass");
+          if (MO.isKill()) {
+            MO.setIsKill(false);
+            RemovedKillFlag = true;
+          }
+          MO.setReg(regA);
+        }
+
+        if (AllUsesCopied) {
+          // Replace other (un-tied) uses of regB with LastCopiedReg.
+          for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
+            MachineOperand &MO = mi->getOperand(i);
+            if (MO.isReg() && MO.getReg() == regB && MO.isUse()) {
+              if (MO.isKill()) {
+                MO.setIsKill(false);
+                RemovedKillFlag = true;
+              }
+              MO.setReg(LastCopiedReg);
+            }
+          }
+
+          // Update live variables for regB.
+          if (RemovedKillFlag && LV && LV->getVarInfo(regB).removeKill(mi))
+            LV->addVirtualRegisterKilled(regB, prior(mi));
+
+        } else if (RemovedKillFlag) {
+          // Some tied uses of regB matched their destination registers, so
+          // regB is still used in this instruction, but a kill flag was
+          // removed from a different tied use of regB, so now we need to add
+          // a kill flag to one of the remaining uses of regB.
+          for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
+            MachineOperand &MO = mi->getOperand(i);
+            if (MO.isReg() && MO.getReg() == regB && MO.isUse()) {
+              MO.setIsKill(true);
+              break;
+            }
+          }
+        }
+
+        // Schedule the source copy / remat inserted to form the two-address
+        // instruction. FIXME: Does it matter that the distance map may not be
+        // accurate after it's scheduled?
+        TII->scheduleTwoAddrSource(prior(mi), mi, *TRI);
+
+        MadeChange = true;
+
+        DEBUG(dbgs() << "\t\trewrite to:\t" << *mi);
+      }
+
+      // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form.
+      if (mi->isInsertSubreg()) {
+        // From %reg = INSERT_SUBREG %reg, %subreg, subidx
+        // To   %reg:subidx = COPY %subreg
+        unsigned SubIdx = mi->getOperand(3).getImm();
+        mi->RemoveOperand(3);
+        assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx");
+        mi->getOperand(0).setSubReg(SubIdx);
+        mi->RemoveOperand(1);
+        mi->setDesc(TII->get(TargetOpcode::COPY));
+        DEBUG(dbgs() << "\t\tconvert to:\t" << *mi);
+      }
+
+      // Clear TiedOperands here instead of at the top of the loop
+      // since most instructions do not have tied operands.
+      TiedOperands.clear();
+      mi = nmi;
+    }
+  }
+
+  // Some remat'ed instructions are dead.
+  int VReg = ReMatRegs.find_first();
+  while (VReg != -1) {
+    if (MRI->use_nodbg_empty(VReg)) {
+      MachineInstr *DefMI = MRI->getVRegDef(VReg);
+      DefMI->eraseFromParent();
+    }
+    VReg = ReMatRegs.find_next(VReg);
+  }
+
+  // Eliminate REG_SEQUENCE instructions. Their whole purpose was to preserve
+  // SSA form. It's now safe to de-SSA.
+  MadeChange |= EliminateRegSequences();
+
+  return MadeChange;
+}
+
+static void UpdateRegSequenceSrcs(unsigned SrcReg,
+                                  unsigned DstReg, unsigned SubIdx,
+                                  MachineRegisterInfo *MRI,
+                                  const TargetRegisterInfo &TRI) {
+  for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg),
+         RE = MRI->reg_end(); RI != RE; ) {
+    MachineOperand &MO = RI.getOperand();
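+    // Advance the iterator before rewriting: substVirtReg moves MO off
+    // SrcReg's use list, which would otherwise invalidate RI.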
+    ++RI;
+    MO.substVirtReg(DstReg, SubIdx, TRI);
+  }
+}
+
+/// CoalesceExtSubRegs - If a number of sources of the REG_SEQUENCE are
+/// EXTRACT_SUBREG from the same register and to the same virtual register
+/// with different sub-register indices, attempt to combine the
+/// EXTRACT_SUBREGs and pre-coalesce them. e.g.
+/// %reg1026<def> = VLDMQ %reg1025<kill>, 260, pred:14, pred:%reg0
+/// %reg1029:6<def> = EXTRACT_SUBREG %reg1026, 6
+/// %reg1029:5<def> = EXTRACT_SUBREG %reg1026<kill>, 5
+/// Since D subregs 5, 6 can combine to a Q register, we can coalesce
+/// reg1026 to reg1029.
+void
+TwoAddressInstructionPass::CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs,
+                                              unsigned DstReg) {
+  SmallSet<unsigned, 4> Seen;
+  for (unsigned i = 0, e = Srcs.size(); i != e; ++i) {
+    unsigned SrcReg = Srcs[i];
+    if (!Seen.insert(SrcReg))
+      continue;
+
+    // Check that the instructions are all in the same basic block.
+    MachineInstr *SrcDefMI = MRI->getVRegDef(SrcReg);
+    MachineInstr *DstDefMI = MRI->getVRegDef(DstReg);
+    if (SrcDefMI->getParent() != DstDefMI->getParent())
+      continue;
+
+    // If the only uses are copies that feed into the reg_sequence, then we
+    // might be able to coalesce them.
+    bool CanCoalesce = true;
+    SmallVector<unsigned, 4> SrcSubIndices, DstSubIndices;
+    for (MachineRegisterInfo::use_nodbg_iterator
+           UI = MRI->use_nodbg_begin(SrcReg),
+           UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
+      MachineInstr *UseMI = &*UI;
+      if (!UseMI->isCopy() || UseMI->getOperand(0).getReg() != DstReg) {
+        CanCoalesce = false;
+        break;
+      }
+      SrcSubIndices.push_back(UseMI->getOperand(1).getSubReg());
+      DstSubIndices.push_back(UseMI->getOperand(0).getSubReg());
+    }
+
+    if (!CanCoalesce || SrcSubIndices.size() < 2)
+      continue;
+
+    // Check that the source subregisters can be combined.
+    std::sort(SrcSubIndices.begin(), SrcSubIndices.end());
+    unsigned NewSrcSubIdx = 0;
+    if (!TRI->canCombineSubRegIndices(MRI->getRegClass(SrcReg), SrcSubIndices,
+                                      NewSrcSubIdx))
+      continue;
+
+    // Check that the destination subregisters can also be combined.
+    std::sort(DstSubIndices.begin(), DstSubIndices.end());
+    unsigned NewDstSubIdx = 0;
+    if (!TRI->canCombineSubRegIndices(MRI->getRegClass(DstReg), DstSubIndices,
+                                      NewDstSubIdx))
+      continue;
+
+    // If neither source nor destination can be combined to the full register,
+    // just give up.  This could be improved if it ever matters.
+    if (NewSrcSubIdx != 0 && NewDstSubIdx != 0)
+      continue;
+
+    // Now that we know that all the uses are extract_subregs and that those
+    // subregs can somehow be combined, scan all the extract_subregs again to
+    // make sure the subregs are in the right order and can be composed.
+    MachineInstr *SomeMI = 0;
+    CanCoalesce = true;
+    for (MachineRegisterInfo::use_nodbg_iterator
+           UI = MRI->use_nodbg_begin(SrcReg),
+           UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
+      MachineInstr *UseMI = &*UI;
+      assert(UseMI->isCopy());
+      unsigned DstSubIdx = UseMI->getOperand(0).getSubReg();
+      unsigned SrcSubIdx = UseMI->getOperand(1).getSubReg();
+      assert(DstSubIdx != 0 && "missing subreg from RegSequence elimination");
+      if ((NewDstSubIdx == 0 &&
+           TRI->composeSubRegIndices(NewSrcSubIdx, DstSubIdx) != SrcSubIdx) ||
+          (NewSrcSubIdx == 0 &&
+           TRI->composeSubRegIndices(NewDstSubIdx, SrcSubIdx) != DstSubIdx)) {
+        CanCoalesce = false;
+        break;
+      }
+      // Keep track of one of the uses.
+      SomeMI = UseMI;
+    }
+    if (!CanCoalesce)
+      continue;
+
+    // Insert a copy to replace the original.
+    MachineBasicBlock::iterator InsertLoc = SomeMI;
+    MachineInstr *CopyMI = BuildMI(*SomeMI->getParent(), SomeMI,
+                                   SomeMI->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY))
+      .addReg(DstReg, RegState::Define, NewDstSubIdx)
+      .addReg(SrcReg, 0, NewSrcSubIdx);
+
+    // Remove all the old extract instructions.
+    for (MachineRegisterInfo::use_nodbg_iterator
+           UI = MRI->use_nodbg_begin(SrcReg),
+           UE = MRI->use_nodbg_end(); UI != UE; ) {
+      MachineInstr *UseMI = &*UI;
+      ++UI;
+      if (UseMI == CopyMI)
+        continue;
+      assert(UseMI->isCopy());
+      // Move any kills to the new copy or extract instruction.
+      if (UseMI->getOperand(1).isKill()) {
+        CopyMI->getOperand(1).setIsKill();
+        if (LV)
+          // Update live variables
+          LV->replaceKillInstruction(SrcReg, UseMI, &*CopyMI);
+      }
+      UseMI->eraseFromParent();
+    }
+  }
+}
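+// Composition check sketch (sub-register indices hypothetical): when the
+// destination indices combine to the whole register (NewDstSubIdx == 0),
+// a copy "DstReg:D = SrcReg:S" is only acceptable if composing NewSrcSubIdx
+// with D yields S; any other pairing would permute the lanes, so the loop
+// above clears CanCoalesce and leaves the registers alone.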
+
+static bool HasOtherRegSequenceUses(unsigned Reg, MachineInstr *RegSeq,
+                                    MachineRegisterInfo *MRI) {
+  for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg),
+         UE = MRI->use_end(); UI != UE; ++UI) {
+    MachineInstr *UseMI = &*UI;
+    if (UseMI != RegSeq && UseMI->isRegSequence())
+      return true;
+  }
+  return false;
+}
+
+/// EliminateRegSequences - Eliminate REG_SEQUENCE instructions as part
+/// of the de-SSA process. This replaces the sources of a REG_SEQUENCE with
+/// sub-register references of the register defined by the REG_SEQUENCE. e.g.
+///
+/// %reg1029<def>, %reg1030<def> = VLD1q16 %reg1024<kill>, ...
+/// %reg1031<def> = REG_SEQUENCE %reg1029<kill>, 5, %reg1030<kill>, 6
+/// =>
+/// %reg1031:5<def>, %reg1031:6<def> = VLD1q16 %reg1024<kill>, ...
+bool TwoAddressInstructionPass::EliminateRegSequences() {
+  if (RegSequences.empty())
+    return false;
+
+  for (unsigned i = 0, e = RegSequences.size(); i != e; ++i) {
+    MachineInstr *MI = RegSequences[i];
+    unsigned DstReg = MI->getOperand(0).getReg();
+    if (MI->getOperand(0).getSubReg() ||
+        TargetRegisterInfo::isPhysicalRegister(DstReg) ||
+        !(MI->getNumOperands() & 1)) {
+      DEBUG(dbgs() << "Illegal REG_SEQUENCE instruction:" << *MI);
+      llvm_unreachable(0);
+    }
+
+    bool IsImpDef = true;
+    SmallVector<unsigned, 4> RealSrcs;
+    SmallSet<unsigned, 4> Seen;
+    for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) {
+      unsigned SrcReg = MI->getOperand(i).getReg();
+      if (MI->getOperand(i).getSubReg() ||
+          TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+        DEBUG(dbgs() << "Illegal REG_SEQUENCE instruction:" << *MI);
+        llvm_unreachable(0);
+      }
+
+      MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
+      if (DefMI->isImplicitDef()) {
+        DefMI->eraseFromParent();
+        continue;
+      }
+      IsImpDef = false;
+
+      // Remember COPY sources. These might be candidates for coalescing.
+      if (DefMI->isCopy() && DefMI->getOperand(1).getSubReg())
+        RealSrcs.push_back(DefMI->getOperand(1).getReg());
+
+      bool isKill = MI->getOperand(i).isKill();
+      if (!Seen.insert(SrcReg) || MI->getParent() != DefMI->getParent() ||
+          !isKill || HasOtherRegSequenceUses(SrcReg, MI, MRI)) {
+        // REG_SEQUENCE cannot have duplicated operands, add a copy.
+        // Also add a copy if the source is live-in to the block. We don't want
+        // to end up with a partial-redef of a livein, e.g.
+        // BB0:
+        // reg1051:10<def> =
+        // ...
+        // BB1:
+        // ... = reg1051:10
+        // BB2:
+        // reg1051:9<def> =
+        // LiveIntervalAnalysis won't like it.
+        //
+        // If the REG_SEQUENCE doesn't kill its source, keeping live variables
+        // correctly up to date becomes very difficult. Insert a copy.
+
+        // Defer any kill flag to the last operand using SrcReg. Otherwise, we
+        // might insert a COPY that uses SrcReg after it was killed.
+        if (isKill)
+          for (unsigned j = i + 2; j < e; j += 2)
+            if (MI->getOperand(j).getReg() == SrcReg) {
+              MI->getOperand(j).setIsKill();
+              isKill = false;
+              break;
+            }
+
+        MachineBasicBlock::iterator InsertLoc = MI;
+        MachineInstr *CopyMI = BuildMI(*MI->getParent(), InsertLoc,
+                                MI->getDebugLoc(), TII->get(TargetOpcode::COPY))
+            .addReg(DstReg, RegState::Define, MI->getOperand(i+1).getImm())
+            .addReg(SrcReg, getKillRegState(isKill));
+        MI->getOperand(i).setReg(0);
+        if (LV && isKill)
+          LV->replaceKillInstruction(SrcReg, MI, CopyMI);
+        DEBUG(dbgs() << "Inserted: " << *CopyMI);
+      }
+    }
+
+    for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) {
+      unsigned SrcReg = MI->getOperand(i).getReg();
+      if (!SrcReg) continue;
+      unsigned SubIdx = MI->getOperand(i+1).getImm();
+      UpdateRegSequenceSrcs(SrcReg, DstReg, SubIdx, MRI, *TRI);
+    }
+
+    if (IsImpDef) {
+      DEBUG(dbgs() << "Turned: " << *MI << " into an IMPLICIT_DEF");
+      MI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
+      for (int j = MI->getNumOperands() - 1, ee = 0; j > ee; --j)
+        MI->RemoveOperand(j);
+    } else {
+      DEBUG(dbgs() << "Eliminated: " << *MI);
+      MI->eraseFromParent();
+    }
+
+    // Try coalescing some EXTRACT_SUBREG instructions. This can create
+    // INSERT_SUBREG instructions that must have <undef> flags added by
+    // LiveIntervalAnalysis, so only run it when LiveVariables is available.
+    if (LV)
+      CoalesceExtSubRegs(RealSrcs, DstReg);
+  }
+
+  RegSequences.clear();
+  return true;
+}
diff --git a/src/LLVM/lib/CodeGen/UnreachableBlockElim.cpp b/src/LLVM/lib/CodeGen/UnreachableBlockElim.cpp
new file mode 100644
index 0000000..ade2ab4
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/UnreachableBlockElim.cpp
@@ -0,0 +1,206 @@
+//===-- UnreachableBlockElim.cpp - Remove unreachable blocks for codegen --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is an extremely simple version of the SimplifyCFG pass.  Its sole
+// job is to delete LLVM basic blocks that are not reachable from the entry
+// node.  To do this, it performs a simple depth first traversal of the CFG,
+// then deletes any unvisited nodes.
+//
+// Note that this pass is really a hack.  In particular, the instruction
+// selectors for various targets should just not generate code for unreachable
+// blocks.  Until LLVM has a more systematic way of defining instruction
+// selectors, however, we cannot really expect them to handle additional
+// complexity.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Constant.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+using namespace llvm;
+
+namespace {
+  class UnreachableBlockElim : public FunctionPass {
+    virtual bool runOnFunction(Function &F);
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    UnreachableBlockElim() : FunctionPass(ID) {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addPreserved<ProfileInfo>();
+    }
+  };
+}
+char UnreachableBlockElim::ID = 0;
+INITIALIZE_PASS(UnreachableBlockElim, "unreachableblockelim",
+                "Remove unreachable blocks from the CFG", false, false);
+
+FunctionPass *llvm::createUnreachableBlockEliminationPass() {
+  return new UnreachableBlockElim();
+}
+
+bool UnreachableBlockElim::runOnFunction(Function &F) {
+  SmallPtrSet<BasicBlock*, 8> Reachable;
+
+  // Mark all reachable blocks.
+  for (df_ext_iterator<Function*, SmallPtrSet<BasicBlock*, 8> > I =
+       df_ext_begin(&F, Reachable), E = df_ext_end(&F, Reachable); I != E; ++I)
+    /* Mark all reachable blocks */;
+
+  // Loop over all dead blocks, remembering them and deleting all instructions
+  // in them.
+  std::vector<BasicBlock*> DeadBlocks;
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+    if (!Reachable.count(I)) {
+      BasicBlock *BB = I;
+      DeadBlocks.push_back(BB);
+      while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+        PN->replaceAllUsesWith(Constant::getNullValue(PN->getType()));
+        BB->getInstList().pop_front();
+      }
+      for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
+        (*SI)->removePredecessor(BB);
+      BB->dropAllReferences();
+    }
+
+  // Actually remove the blocks now.
+  ProfileInfo *PI = getAnalysisIfAvailable<ProfileInfo>();
+  for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i) {
+    if (PI) PI->removeBlock(DeadBlocks[i]);
+    DeadBlocks[i]->eraseFromParent();
+  }
+
+  return DeadBlocks.size();
+}
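+// A minimal sketch of the transformation (IR names hypothetical):
+//
+//   entry:
+//     ret void
+//   dead:                                ; preds = %dead
+//     %p = phi i32 [ 0, %dead ]
+//     br label %dead
+//
+// The DFS from 'entry' never reaches 'dead', so %p's uses are replaced
+// with a null value, 'dead' is dropped from its successors' predecessor
+// lists, and the block is erased.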
+
+
+namespace {
+  class UnreachableMachineBlockElim : public MachineFunctionPass {
+    virtual bool runOnMachineFunction(MachineFunction &F);
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    MachineModuleInfo *MMI;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    UnreachableMachineBlockElim() : MachineFunctionPass(ID) {}
+  };
+}
+char UnreachableMachineBlockElim::ID = 0;
+
+static RegisterPass<UnreachableMachineBlockElim>
+Y("unreachable-mbb-elimination",
+  "Remove unreachable machine basic blocks");
+
+char &llvm::UnreachableMachineBlockElimID = UnreachableMachineBlockElim::ID;
+
+void UnreachableMachineBlockElim::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addPreserved<MachineDominatorTree>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
+  SmallPtrSet<MachineBasicBlock*, 8> Reachable;
+
+  MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+  MachineDominatorTree *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+  MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+
+  // Mark all reachable blocks.
+  for (df_ext_iterator<MachineFunction*, SmallPtrSet<MachineBasicBlock*, 8> >
+       I = df_ext_begin(&F, Reachable), E = df_ext_end(&F, Reachable);
+       I != E; ++I)
+    /* Mark all reachable blocks */;
+
+  // Loop over all dead blocks, remembering them and deleting all instructions
+  // in them.
+  std::vector<MachineBasicBlock*> DeadBlocks;
+  for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    MachineBasicBlock *BB = I;
+
+    // Test for deadness.
+    if (!Reachable.count(BB)) {
+      DeadBlocks.push_back(BB);
+
+      // Update dominator and loop info.
+      if (MLI) MLI->removeBlock(BB);
+      if (MDT && MDT->getNode(BB)) MDT->eraseNode(BB);
+
+      while (BB->succ_begin() != BB->succ_end()) {
+        MachineBasicBlock* succ = *BB->succ_begin();
+
+        MachineBasicBlock::iterator start = succ->begin();
+        while (start != succ->end() && start->isPHI()) {
+          for (unsigned i = start->getNumOperands() - 1; i >= 2; i-=2)
+            if (start->getOperand(i).isMBB() &&
+                start->getOperand(i).getMBB() == BB) {
+              start->RemoveOperand(i);
+              start->RemoveOperand(i-1);
+            }
+
+          start++;
+        }
+
+        BB->removeSuccessor(BB->succ_begin());
+      }
+    }
+  }
+
+  // Actually remove the blocks now.
+  for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i)
+    DeadBlocks[i]->eraseFromParent();
+
+  // Cleanup PHI nodes.
+  for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    MachineBasicBlock *BB = I;
+    // Prune unneeded PHI entries.
+    SmallPtrSet<MachineBasicBlock*, 8> preds(BB->pred_begin(),
+                                             BB->pred_end());
+    MachineBasicBlock::iterator phi = BB->begin();
+    while (phi != BB->end() && phi->isPHI()) {
+      for (unsigned i = phi->getNumOperands() - 1; i >= 2; i-=2)
+        if (!preds.count(phi->getOperand(i).getMBB())) {
+          phi->RemoveOperand(i);
+          phi->RemoveOperand(i-1);
+        }
+
+      if (phi->getNumOperands() == 3) {
+        unsigned Input = phi->getOperand(1).getReg();
+        unsigned Output = phi->getOperand(0).getReg();
+
+        MachineInstr* temp = phi;
+        ++phi;
+        temp->eraseFromParent();
+
+        if (Input != Output)
+          F.getRegInfo().replaceRegWith(Output, Input);
+
+        continue;
+      }
+
+      ++phi;
+    }
+  }
+
+  F.RenumberBlocks();
+
+  return DeadBlocks.size();
+}
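+// PHI cleanup sketch (vreg/block numbers hypothetical): after a dead
+// predecessor BB#2 is erased, an instruction such as
+//   %reg1027 = PHI %reg1025, <BB#1>, %reg1026, <BB#2>
+// loses its <BB#2> pair. If a single entry remains,
+//   %reg1027 = PHI %reg1025, <BB#1>
+// the PHI is erased and %reg1027 is replaced by %reg1025 everywhere.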
diff --git a/src/LLVM/lib/CodeGen/VirtRegMap.cpp b/src/LLVM/lib/CodeGen/VirtRegMap.cpp
new file mode 100644
index 0000000..20ffcff
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/VirtRegMap.cpp
@@ -0,0 +1,283 @@
+//===-- llvm/CodeGen/VirtRegMap.cpp - Virtual Register Map ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the VirtRegMap class.
+//
+// It also contains implementations of the Spiller interface, which, given a
+// virtual register map and a machine function, eliminates all virtual
+// references by replacing them with physical register references - adding spill
+// code as necessary.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "virtregmap"
+#include "VirtRegMap.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumSpills  , "Number of register spills");
+
+//===----------------------------------------------------------------------===//
+//  VirtRegMap implementation
+//===----------------------------------------------------------------------===//
+
+char VirtRegMap::ID = 0;
+
+INITIALIZE_PASS(VirtRegMap, "virtregmap", "Virtual Register Map", false, false);
+
+bool VirtRegMap::runOnMachineFunction(MachineFunction &mf) {
+  MRI = &mf.getRegInfo();
+  TII = mf.getTarget().getInstrInfo();
+  TRI = mf.getTarget().getRegisterInfo();
+  MF = &mf;
+
+  ReMatId = MAX_STACK_SLOT+1;
+  LowSpillSlot = HighSpillSlot = NO_STACK_SLOT;
+  
+  Virt2PhysMap.clear();
+  Virt2StackSlotMap.clear();
+  Virt2ReMatIdMap.clear();
+  Virt2SplitMap.clear();
+  Virt2SplitKillMap.clear();
+  ReMatMap.clear();
+  ImplicitDefed.clear();
+  SpillSlotToUsesMap.clear();
+  MI2VirtMap.clear();
+  SpillPt2VirtMap.clear();
+  RestorePt2VirtMap.clear();
+  EmergencySpillMap.clear();
+  EmergencySpillSlots.clear();
+  
+  SpillSlotToUsesMap.resize(8);
+  ImplicitDefed.resize(MF->getRegInfo().getLastVirtReg()+1-
+                       TargetRegisterInfo::FirstVirtualRegister);
+
+  allocatableRCRegs.clear();
+  for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+         E = TRI->regclass_end(); I != E; ++I)
+    allocatableRCRegs.insert(std::make_pair(*I,
+                                            TRI->getAllocatableSet(mf, *I)));
+
+  grow();
+  
+  return false;
+}
+
+void VirtRegMap::grow() {
+  unsigned LastVirtReg = MF->getRegInfo().getLastVirtReg();
+  Virt2PhysMap.grow(LastVirtReg);
+  Virt2StackSlotMap.grow(LastVirtReg);
+  Virt2ReMatIdMap.grow(LastVirtReg);
+  Virt2SplitMap.grow(LastVirtReg);
+  Virt2SplitKillMap.grow(LastVirtReg);
+  ReMatMap.grow(LastVirtReg);
+  ImplicitDefed.resize(LastVirtReg-TargetRegisterInfo::FirstVirtualRegister+1);
+}
+
+unsigned VirtRegMap::getRegAllocPref(unsigned virtReg) {
+  std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(virtReg);
+  unsigned physReg = Hint.second;
+  if (physReg &&
+      TargetRegisterInfo::isVirtualRegister(physReg) && hasPhys(physReg))
+    physReg = getPhys(physReg);
+  if (Hint.first == 0)
+    return (physReg && TargetRegisterInfo::isPhysicalRegister(physReg))
+      ? physReg : 0;
+  return TRI->ResolveRegAllocHint(Hint.first, physReg, *MF);
+}
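+// Hint resolution sketch (registers hypothetical): if %reg1030 carries the
+// plain hint (0, %reg1025) and %reg1025 has already been mapped to EAX,
+// getRegAllocPref(%reg1030) returns EAX; target-specific hint types
+// (Hint.first != 0) are resolved via TRI->ResolveRegAllocHint instead.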
+
+int VirtRegMap::assignVirt2StackSlot(unsigned virtReg) {
+  assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+  assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT &&
+         "attempt to assign stack slot to already spilled register");
+  const TargetRegisterClass* RC = MF->getRegInfo().getRegClass(virtReg);
+  int SS = MF->getFrameInfo()->CreateSpillStackObject(RC->getSize(),
+                                                 RC->getAlignment());
+  if (LowSpillSlot == NO_STACK_SLOT)
+    LowSpillSlot = SS;
+  if (HighSpillSlot == NO_STACK_SLOT || SS > HighSpillSlot)
+    HighSpillSlot = SS;
+  unsigned Idx = SS-LowSpillSlot;
+  while (Idx >= SpillSlotToUsesMap.size())
+    SpillSlotToUsesMap.resize(SpillSlotToUsesMap.size()*2);
+  Virt2StackSlotMap[virtReg] = SS;
+  ++NumSpills;
+  return SS;
+}
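+// Usage sketch (register and slot values hypothetical): a spiller would
+// typically do
+//   int SS = VRM.assignVirt2StackSlot(VirtReg); // e.g. returns fi#3
+// and then emit stores/reloads against frame index SS; the slot is sized
+// and aligned for VirtReg's register class by CreateSpillStackObject.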
+
+void VirtRegMap::assignVirt2StackSlot(unsigned virtReg, int SS) {
+  assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+  assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT &&
+         "attempt to assign stack slot to already spilled register");
+  assert((SS >= 0 ||
+          (SS >= MF->getFrameInfo()->getObjectIndexBegin())) &&
+         "illegal fixed frame index");
+  Virt2StackSlotMap[virtReg] = SS;
+}
+
+int VirtRegMap::assignVirtReMatId(unsigned virtReg) {
+  assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+  assert(Virt2ReMatIdMap[virtReg] == NO_STACK_SLOT &&
+         "attempt to assign re-mat id to already spilled register");
+  Virt2ReMatIdMap[virtReg] = ReMatId;
+  return ReMatId++;
+}
+
+void VirtRegMap::assignVirtReMatId(unsigned virtReg, int id) {
+  assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+  assert(Virt2ReMatIdMap[virtReg] == NO_STACK_SLOT &&
+         "attempt to assign re-mat id to already spilled register");
+  Virt2ReMatIdMap[virtReg] = id;
+}
+
+int VirtRegMap::getEmergencySpillSlot(const TargetRegisterClass *RC) {
+  std::map<const TargetRegisterClass*, int>::iterator I =
+    EmergencySpillSlots.find(RC);
+  if (I != EmergencySpillSlots.end())
+    return I->second;
+  int SS = MF->getFrameInfo()->CreateSpillStackObject(RC->getSize(),
+                                                 RC->getAlignment());
+  if (LowSpillSlot == NO_STACK_SLOT)
+    LowSpillSlot = SS;
+  if (HighSpillSlot == NO_STACK_SLOT || SS > HighSpillSlot)
+    HighSpillSlot = SS;
+  EmergencySpillSlots[RC] = SS;
+  return SS;
+}
+
+void VirtRegMap::addSpillSlotUse(int FI, MachineInstr *MI) {
+  if (!MF->getFrameInfo()->isFixedObjectIndex(FI)) {
+    // If FI < LowSpillSlot, this stack reference was produced by
+    // instruction selection and is not a spill
+    if (FI >= LowSpillSlot) {
+      assert(FI >= 0 && "Spill slot index should not be negative!");
+      assert((unsigned)FI-LowSpillSlot < SpillSlotToUsesMap.size()
+             && "Invalid spill slot");
+      SpillSlotToUsesMap[FI-LowSpillSlot].insert(MI);
+    }
+  }
+}
+
+void VirtRegMap::virtFolded(unsigned VirtReg, MachineInstr *OldMI,
+                            MachineInstr *NewMI, ModRef MRInfo) {
+  // Move previous memory references folded to new instruction.
+  MI2VirtMapTy::iterator IP = MI2VirtMap.lower_bound(NewMI);
+  for (MI2VirtMapTy::iterator I = MI2VirtMap.lower_bound(OldMI),
+         E = MI2VirtMap.end(); I != E && I->first == OldMI; ) {
+    MI2VirtMap.insert(IP, std::make_pair(NewMI, I->second));
+    MI2VirtMap.erase(I++);
+  }
+
+  // add new memory reference
+  MI2VirtMap.insert(IP, std::make_pair(NewMI, std::make_pair(VirtReg, MRInfo)));
+}
+
+void VirtRegMap::virtFolded(unsigned VirtReg, MachineInstr *MI, ModRef MRInfo) {
+  MI2VirtMapTy::iterator IP = MI2VirtMap.lower_bound(MI);
+  MI2VirtMap.insert(IP, std::make_pair(MI, std::make_pair(VirtReg, MRInfo)));
+}
+
+void VirtRegMap::RemoveMachineInstrFromMaps(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isFI())
+      continue;
+    int FI = MO.getIndex();
+    if (MF->getFrameInfo()->isFixedObjectIndex(FI))
+      continue;
+    // This stack reference was produced by instruction selection and
+    // is not a spill
+    if (FI < LowSpillSlot)
+      continue;
+    assert((unsigned)FI-LowSpillSlot < SpillSlotToUsesMap.size()
+           && "Invalid spill slot");
+    SpillSlotToUsesMap[FI-LowSpillSlot].erase(MI);
+  }
+  MI2VirtMap.erase(MI);
+  SpillPt2VirtMap.erase(MI);
+  RestorePt2VirtMap.erase(MI);
+  EmergencySpillMap.erase(MI);
+}
+
+/// FindUnusedRegisters - Gather a list of allocatable registers that
+/// have not been allocated to any virtual register.
+bool VirtRegMap::FindUnusedRegisters(LiveIntervals* LIs) {
+  unsigned NumRegs = TRI->getNumRegs();
+  UnusedRegs.reset();
+  UnusedRegs.resize(NumRegs);
+
+  BitVector Used(NumRegs);
+  for (unsigned i = TargetRegisterInfo::FirstVirtualRegister,
+         e = MF->getRegInfo().getLastVirtReg(); i <= e; ++i)
+    if (Virt2PhysMap[i] != (unsigned)VirtRegMap::NO_PHYS_REG)
+      Used.set(Virt2PhysMap[i]);
+
+  BitVector Allocatable = TRI->getAllocatableSet(*MF);
+  bool AnyUnused = false;
+  for (unsigned Reg = 1; Reg < NumRegs; ++Reg) {
+    if (Allocatable[Reg] && !Used[Reg] && !LIs->hasInterval(Reg)) {
+      bool ReallyUnused = true;
+      for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) {
+        if (Used[*AS] || LIs->hasInterval(*AS)) {
+          ReallyUnused = false;
+          break;
+        }
+      }
+      if (ReallyUnused) {
+        AnyUnused = true;
+        UnusedRegs.set(Reg);
+      }
+    }
+  }
+
+  return AnyUnused;
+}
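+// Alias sketch (x86 names for illustration only): AX is reported unused
+// only if AX itself and every alias (AL, AH, EAX) is neither mapped to a
+// virtual register nor carries a live interval; allocating EAX therefore
+// removes AX from the unused set as well.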
+
+void VirtRegMap::print(raw_ostream &OS, const Module* M) const {
+  const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo();
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  OS << "********** REGISTER MAP **********\n";
+  for (unsigned i = TargetRegisterInfo::FirstVirtualRegister,
+         e = MF->getRegInfo().getLastVirtReg(); i <= e; ++i) {
+    if (Virt2PhysMap[i] != (unsigned)VirtRegMap::NO_PHYS_REG)
+      OS << "[reg" << i << " -> " << TRI->getName(Virt2PhysMap[i])
+         << "] " << MRI.getRegClass(i)->getName() << "\n";
+  }
+
+  for (unsigned i = TargetRegisterInfo::FirstVirtualRegister,
+         e = MF->getRegInfo().getLastVirtReg(); i <= e; ++i)
+    if (Virt2StackSlotMap[i] != VirtRegMap::NO_STACK_SLOT)
+      OS << "[reg" << i << " -> fi#" << Virt2StackSlotMap[i]
+         << "] " << MRI.getRegClass(i)->getName() << "\n";
+  OS << '\n';
+}
+
+void VirtRegMap::dump() const {
+  print(dbgs());
+}
diff --git a/src/LLVM/lib/CodeGen/VirtRegMap.h b/src/LLVM/lib/CodeGen/VirtRegMap.h
new file mode 100644
index 0000000..8b6082d
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/VirtRegMap.h
@@ -0,0 +1,500 @@
+//===-- llvm/CodeGen/VirtRegMap.h - Virtual Register Map -*- C++ -*--------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a virtual register map. This maps virtual registers to
+// physical registers and virtual registers to stack slots. It is created and
+// updated by a register allocator and then used by a machine code rewriter that
+// adds spill code and rewrites virtual into physical register references.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_VIRTREGMAP_H
+#define LLVM_CODEGEN_VIRTREGMAP_H
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include <map>
+
+namespace llvm {
+  class LiveIntervals;
+  class MachineInstr;
+  class MachineFunction;
+  class MachineRegisterInfo;
+  class TargetInstrInfo;
+  class TargetRegisterInfo;
+  class raw_ostream;
+
+  class VirtRegMap : public MachineFunctionPass {
+  public:
+    enum {
+      NO_PHYS_REG = 0,
+      NO_STACK_SLOT = (1L << 30)-1,
+      MAX_STACK_SLOT = (1L << 18)-1
+    };
+
+    enum ModRef { isRef = 1, isMod = 2, isModRef = 3 };
+    typedef std::multimap<MachineInstr*,
+                          std::pair<unsigned, ModRef> > MI2VirtMapTy;
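+    // Note: isModRef is the bitwise OR of isRef and isMod, so a folded
+    // access that both reads and writes the virtual register carries
+    // both bits.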
+
+  private:
+    MachineRegisterInfo *MRI;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineFunction *MF;
+
+    DenseMap<const TargetRegisterClass*, BitVector> allocatableRCRegs;
+
+    /// Virt2PhysMap - This is a virtual to physical register
+    /// mapping. Each virtual register is required to have an entry in
+    /// it; even spilled virtual registers (the register mapped to a
+    /// spilled register is the temporary used to load it from the
+    /// stack).
+    IndexedMap<unsigned, VirtReg2IndexFunctor> Virt2PhysMap;
+
+    /// Virt2StackSlotMap - This is virtual register to stack slot
+    /// mapping. Each spilled virtual register has an entry in it
+    /// which corresponds to the stack slot this register is spilled
+    /// at.
+    IndexedMap<int, VirtReg2IndexFunctor> Virt2StackSlotMap;
+
+    /// Virt2ReMatIdMap - This is virtual register to rematerialization id
+    /// mapping. Each spilled virtual register that should be remat'd has an
+    /// entry in it which corresponds to the remat id.
+    IndexedMap<int, VirtReg2IndexFunctor> Virt2ReMatIdMap;
+
+    /// Virt2SplitMap - This is virtual register to split virtual register
+    /// mapping.
+    IndexedMap<unsigned, VirtReg2IndexFunctor> Virt2SplitMap;
+
+    /// Virt2SplitKillMap - This is split virtual register to its last use
+    /// (kill) index mapping.
+    IndexedMap<SlotIndex> Virt2SplitKillMap;
+
+    /// ReMatMap - This is virtual register to re-materialized instruction
+    /// mapping. Each virtual register whose definition is going to be
+    /// re-materialized has an entry in it.
+    IndexedMap<MachineInstr*, VirtReg2IndexFunctor> ReMatMap;
+
+    /// MI2VirtMap - This is MachineInstr to virtual register
+    /// mapping. In the case of memory spill code being folded into
+    /// instructions, we need to know which virtual register was
+    /// read/written by this instruction.
+    MI2VirtMapTy MI2VirtMap;
+
+    /// SpillPt2VirtMap - This records the virtual registers which should
+    /// be spilled right after the MachineInstr due to live interval
+    /// splitting.
+    std::map<MachineInstr*, std::vector<std::pair<unsigned,bool> > >
+    SpillPt2VirtMap;
+
+    /// RestorePt2VirtMap - This records the virtual registers which should
+    /// be restored right before the MachineInstr due to live interval
+    /// splitting.
+    std::map<MachineInstr*, std::vector<unsigned> > RestorePt2VirtMap;
+
+    /// EmergencySpillMap - This records the physical registers that should
+    /// be spilled / restored around the MachineInstr since the register
+    /// allocator has run out of registers.
+    std::map<MachineInstr*, std::vector<unsigned> > EmergencySpillMap;
+
+    /// EmergencySpillSlots - This records emergency spill slots used to
+    /// spill physical registers when the register allocator runs out of
+    /// registers. Ideally only one stack slot is used per function per
+    /// register class.
+    std::map<const TargetRegisterClass*, int> EmergencySpillSlots;
+
+    /// ReMatId - Instead of assigning a stack slot to a to-be-rematerialized
+    /// virtual register, a unique id is assigned. This keeps track of
+    /// the highest id used so far. Note, this starts at (1<<18) to avoid
+    /// conflicts with stack slot numbers.
+    int ReMatId;
+
+    /// LowSpillSlot, HighSpillSlot - Lowest and highest spill slot indexes.
+    int LowSpillSlot, HighSpillSlot;
+
+    /// SpillSlotToUsesMap - Records uses for each register spill slot.
+    SmallVector<SmallPtrSet<MachineInstr*, 4>, 8> SpillSlotToUsesMap;
+
+    /// ImplicitDefed - One bit for each virtual register. If set, it indicates
+    /// the register is implicitly defined.
+    BitVector ImplicitDefed;
+
+    /// UnusedRegs - A list of physical registers that have not been used.
+    BitVector UnusedRegs;
+
+    VirtRegMap(const VirtRegMap&);     // DO NOT IMPLEMENT
+    void operator=(const VirtRegMap&); // DO NOT IMPLEMENT
+
+  public:
+    static char ID;
+    VirtRegMap() : MachineFunctionPass(ID), Virt2PhysMap(NO_PHYS_REG),
+                   Virt2StackSlotMap(NO_STACK_SLOT), 
+                   Virt2ReMatIdMap(NO_STACK_SLOT), Virt2SplitMap(0),
+                   Virt2SplitKillMap(SlotIndex()), ReMatMap(NULL),
+                   ReMatId(MAX_STACK_SLOT+1),
+                   LowSpillSlot(NO_STACK_SLOT), HighSpillSlot(NO_STACK_SLOT) { }
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    MachineFunction &getMachineFunction() const {
+      assert(MF && "getMachineFunction called before runOnMachineFunction");
+      return *MF;
+    }
+
+    void grow();
+
+    /// @brief returns true if the specified virtual register is
+    /// mapped to a physical register
+    bool hasPhys(unsigned virtReg) const {
+      return getPhys(virtReg) != NO_PHYS_REG;
+    }
+
+    /// @brief returns the physical register mapped to the specified
+    /// virtual register
+    unsigned getPhys(unsigned virtReg) const {
+      assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+      return Virt2PhysMap[virtReg];
+    }
+
+    /// @brief creates a mapping for the specified virtual register to
+    /// the specified physical register
+    void assignVirt2Phys(unsigned virtReg, unsigned physReg) {
+      assert(TargetRegisterInfo::isVirtualRegister(virtReg) &&
+             TargetRegisterInfo::isPhysicalRegister(physReg));
+      assert(Virt2PhysMap[virtReg] == NO_PHYS_REG &&
+             "attempt to assign physical register to already mapped "
+             "virtual register");
+      Virt2PhysMap[virtReg] = physReg;
+    }
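+    // Usage sketch (the register numbers are hypothetical):
+    //   VRM.assignVirt2Phys(VirtReg, X86::EAX);
+    //   assert(VRM.hasPhys(VirtReg) && VRM.getPhys(VirtReg) == X86::EAX);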
+
+    /// @brief clears the specified virtual register's physical
+    /// register mapping
+    void clearVirt(unsigned virtReg) {
+      assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+      assert(Virt2PhysMap[virtReg] != NO_PHYS_REG &&
+             "attempt to clear a not assigned virtual register");
+      Virt2PhysMap[virtReg] = NO_PHYS_REG;
+    }
+
+    /// @brief clears all virtual to physical register mappings
+    void clearAllVirt() {
+      Virt2PhysMap.clear();
+      grow();
+    }
+
+    /// @brief returns the register allocation preference.
+    unsigned getRegAllocPref(unsigned virtReg);
+
+    /// @brief records virtReg is a split live interval from SReg.
+    void setIsSplitFromReg(unsigned virtReg, unsigned SReg) {
+      Virt2SplitMap[virtReg] = SReg;
+    }
+
+    /// @brief returns the live interval virtReg is split from.
+    unsigned getPreSplitReg(unsigned virtReg) {
+      return Virt2SplitMap[virtReg];
+    }
+
+    /// @brief returns true if the specified virtual register is not
+    /// mapped to a stack slot or rematerialized.
+    bool isAssignedReg(unsigned virtReg) const {
+      if (getStackSlot(virtReg) == NO_STACK_SLOT &&
+          getReMatId(virtReg) == NO_STACK_SLOT)
+        return true;
+      // Split register can be assigned a physical register as well as a
+      // stack slot or remat id.
+      return (Virt2SplitMap[virtReg] && Virt2PhysMap[virtReg] != NO_PHYS_REG);
+    }
+
+    /// @brief returns the stack slot mapped to the specified virtual
+    /// register
+    int getStackSlot(unsigned virtReg) const {
+      assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+      return Virt2StackSlotMap[virtReg];
+    }
+
+    /// @brief returns the rematerialization id mapped to the specified virtual
+    /// register
+    int getReMatId(unsigned virtReg) const {
+      assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+      return Virt2ReMatIdMap[virtReg];
+    }
+
+    /// @brief create a mapping for the specified virtual register to
+    /// the next available stack slot
+    int assignVirt2StackSlot(unsigned virtReg);
+    /// @brief create a mapping for the specified virtual register to
+    /// the specified stack slot
+    void assignVirt2StackSlot(unsigned virtReg, int frameIndex);
+
+    /// @brief assign a unique re-materialization id to the specified
+    /// virtual register.
+    int assignVirtReMatId(unsigned virtReg);
+    /// @brief assign a unique re-materialization id to the specified
+    /// virtual register.
+    void assignVirtReMatId(unsigned virtReg, int id);
+
+    /// @brief returns true if the specified virtual register is being
+    /// re-materialized.
+    bool isReMaterialized(unsigned virtReg) const {
+      return ReMatMap[virtReg] != NULL;
+    }
+
+    /// @brief returns the original machine instruction being re-issued
+    /// to re-materialize the specified virtual register.
+    MachineInstr *getReMaterializedMI(unsigned virtReg) const {
+      return ReMatMap[virtReg];
+    }
+
+    /// @brief records the specified virtual register will be
+    /// re-materialized and the original instruction which will be re-issued
+    /// for this purpose.  If parameter all is true, then all uses of the
+    /// registers are rematerialized and it's safe to delete the definition.
+    void setVirtIsReMaterialized(unsigned virtReg, MachineInstr *def) {
+      ReMatMap[virtReg] = def;
+    }
+
+    /// @brief record the last use (kill) of a split virtual register.
+    void addKillPoint(unsigned virtReg, SlotIndex index) {
+      Virt2SplitKillMap[virtReg] = index;
+    }
+
+    SlotIndex getKillPoint(unsigned virtReg) const {
+      return Virt2SplitKillMap[virtReg];
+    }
+
+    /// @brief remove the last use (kill) of a split virtual register.
+    void removeKillPoint(unsigned virtReg) {
+      Virt2SplitKillMap[virtReg] = SlotIndex();
+    }
+
+    /// @brief returns true if the specified MachineInstr is a spill point.
+    bool isSpillPt(MachineInstr *Pt) const {
+      return SpillPt2VirtMap.find(Pt) != SpillPt2VirtMap.end();
+    }
+
+    /// @brief returns the virtual registers that should be spilled due to
+    /// splitting right after the specified MachineInstr.
+    std::vector<std::pair<unsigned,bool> > &getSpillPtSpills(MachineInstr *Pt) {
+      return SpillPt2VirtMap[Pt];
+    }
+
+    /// @brief records the specified MachineInstr as a spill point for virtReg.
+    void addSpillPoint(unsigned virtReg, bool isKill, MachineInstr *Pt) {
+      std::map<MachineInstr*, std::vector<std::pair<unsigned,bool> > >::iterator
+        I = SpillPt2VirtMap.find(Pt);
+      if (I != SpillPt2VirtMap.end())
+        I->second.push_back(std::make_pair(virtReg, isKill));
+      else {
+        std::vector<std::pair<unsigned,bool> > Virts;
+        Virts.push_back(std::make_pair(virtReg, isKill));
+        SpillPt2VirtMap.insert(std::make_pair(Pt, Virts));
+      }
+    }
+
+    /// @brief - transfer spill point information from one instruction to
+    /// another.
+    void transferSpillPts(MachineInstr *Old, MachineInstr *New) {
+      std::map<MachineInstr*, std::vector<std::pair<unsigned,bool> > >::iterator
+        I = SpillPt2VirtMap.find(Old);
+      if (I == SpillPt2VirtMap.end())
+        return;
+      while (!I->second.empty()) {
+        unsigned virtReg = I->second.back().first;
+        bool isKill = I->second.back().second;
+        I->second.pop_back();
+        addSpillPoint(virtReg, isKill, New);
+      }
+      SpillPt2VirtMap.erase(I);
+    }
+
+    /// @brief returns true if the specified MachineInstr is a restore point.
+    bool isRestorePt(MachineInstr *Pt) const {
+      return RestorePt2VirtMap.find(Pt) != RestorePt2VirtMap.end();
+    }
+
+    /// @brief returns the virtual registers that should be restored due to
+    /// splitting right before the specified MachineInstr.
+    std::vector<unsigned> &getRestorePtRestores(MachineInstr *Pt) {
+      return RestorePt2VirtMap[Pt];
+    }
+
+    /// @brief records the specified MachineInstr as a restore point for virtReg.
+    void addRestorePoint(unsigned virtReg, MachineInstr *Pt) {
+      std::map<MachineInstr*, std::vector<unsigned> >::iterator I =
+        RestorePt2VirtMap.find(Pt);
+      if (I != RestorePt2VirtMap.end())
+        I->second.push_back(virtReg);
+      else {
+        std::vector<unsigned> Virts;
+        Virts.push_back(virtReg);
+        RestorePt2VirtMap.insert(std::make_pair(Pt, Virts));
+      }
+    }
+
+    /// @brief - transfer restore point information from one instruction to
+    /// another.
+    void transferRestorePts(MachineInstr *Old, MachineInstr *New) {
+      std::map<MachineInstr*, std::vector<unsigned> >::iterator I =
+        RestorePt2VirtMap.find(Old);
+      if (I == RestorePt2VirtMap.end())
+        return;
+      while (!I->second.empty()) {
+        unsigned virtReg = I->second.back();
+        I->second.pop_back();
+        addRestorePoint(virtReg, New);
+      }
+      RestorePt2VirtMap.erase(I);
+    }
+
+    /// @brief records that the specified physical register must be spilled
+    /// around the specified machine instr.
+    void addEmergencySpill(unsigned PhysReg, MachineInstr *MI) {
+      if (EmergencySpillMap.find(MI) != EmergencySpillMap.end())
+        EmergencySpillMap[MI].push_back(PhysReg);
+      else {
+        std::vector<unsigned> PhysRegs;
+        PhysRegs.push_back(PhysReg);
+        EmergencySpillMap.insert(std::make_pair(MI, PhysRegs));
+      }
+    }
+
+    /// @brief returns true if one or more physical registers must be spilled
+    /// around the specified instruction.
+    bool hasEmergencySpills(MachineInstr *MI) const {
+      return EmergencySpillMap.find(MI) != EmergencySpillMap.end();
+    }
+
+    /// @brief returns the physical registers to be spilled and restored around
+    /// the instruction.
+    std::vector<unsigned> &getEmergencySpills(MachineInstr *MI) {
+      return EmergencySpillMap[MI];
+    }
+
+    /// @brief - transfer emergency spill information from one instruction to
+    /// another.
+    void transferEmergencySpills(MachineInstr *Old, MachineInstr *New) {
+      std::map<MachineInstr*,std::vector<unsigned> >::iterator I =
+        EmergencySpillMap.find(Old);
+      if (I == EmergencySpillMap.end())
+        return;
+      while (!I->second.empty()) {
+        unsigned virtReg = I->second.back();
+        I->second.pop_back();
+        addEmergencySpill(virtReg, New);
+      }
+      EmergencySpillMap.erase(I);
+    }
+
+    /// @brief return or create an emergency spill slot for the register class.
+    int getEmergencySpillSlot(const TargetRegisterClass *RC);
+
+    /// @brief Return lowest spill slot index.
+    int getLowSpillSlot() const {
+      return LowSpillSlot;
+    }
+
+    /// @brief Return highest spill slot index.
+    int getHighSpillSlot() const {
+      return HighSpillSlot;
+    }
+
+    /// @brief Records a spill slot use.
+    void addSpillSlotUse(int FrameIndex, MachineInstr *MI);
+
+    /// @brief Returns true if spill slot has been used.
+    bool isSpillSlotUsed(int FrameIndex) const {
+      assert(FrameIndex >= 0 && "Spill slot index should not be negative!");
+      return !SpillSlotToUsesMap[FrameIndex-LowSpillSlot].empty();
+    }
+
+    /// @brief Mark the specified register as being implicitly defined.
+    void setIsImplicitlyDefined(unsigned VirtReg) {
+      ImplicitDefed.set(VirtReg-TargetRegisterInfo::FirstVirtualRegister);
+    }
+
+    /// @brief Returns true if the virtual register is implicitly defined.
+    bool isImplicitlyDefined(unsigned VirtReg) const {
+      return ImplicitDefed[VirtReg-TargetRegisterInfo::FirstVirtualRegister];
+    }
+
+    /// @brief Updates information about the specified virtual register's value
+    /// folded into newMI machine instruction.
+    void virtFolded(unsigned VirtReg, MachineInstr *OldMI, MachineInstr *NewMI,
+                    ModRef MRInfo);
+
+    /// @brief Updates information about the specified virtual register's value
+    /// folded into the specified machine instruction.
+    void virtFolded(unsigned VirtReg, MachineInstr *MI, ModRef MRInfo);
+
+    /// @brief returns the virtual registers' values folded in memory
+    /// operands of this instruction
+    std::pair<MI2VirtMapTy::const_iterator, MI2VirtMapTy::const_iterator>
+    getFoldedVirts(MachineInstr* MI) const {
+      return MI2VirtMap.equal_range(MI);
+    }
+    
+    /// RemoveMachineInstrFromMaps - MI is being erased, remove it from the
+    /// folded instruction map and spill point map.
+    void RemoveMachineInstrFromMaps(MachineInstr *MI);
+
+    /// FindUnusedRegisters - Gather a list of allocatable registers that
+    /// have not been allocated to any virtual register.
+    bool FindUnusedRegisters(LiveIntervals* LIs);
+
+    /// HasUnusedRegisters - Return true if there are any allocatable registers
+    /// that have not been allocated to any virtual register.
+    bool HasUnusedRegisters() const {
+      return !UnusedRegs.none();
+    }
+
+    /// setRegisterUsed - Remember the physical register is now used.
+    void setRegisterUsed(unsigned Reg) {
+      UnusedRegs.reset(Reg);
+    }
+
+    /// isRegisterUnused - Return true if the physical register has not been
+    /// used.
+    bool isRegisterUnused(unsigned Reg) const {
+      return UnusedRegs[Reg];
+    }
+
+    /// getFirstUnusedRegister - Return the first physical register that has not
+    /// been used.
+    unsigned getFirstUnusedRegister(const TargetRegisterClass *RC) {
+      int Reg = UnusedRegs.find_first();
+      while (Reg != -1) {
+        if (allocatableRCRegs[RC][Reg])
+          return (unsigned)Reg;
+        Reg = UnusedRegs.find_next(Reg);
+      }
+      return 0;
+    }
+
+    void print(raw_ostream &OS, const Module* M = 0) const;
+    void dump() const;
+  };
+
+  inline raw_ostream &operator<<(raw_ostream &OS, const VirtRegMap &VRM) {
+    VRM.print(OS);
+    return OS;
+  }
+} // End llvm namespace
+
+#endif
diff --git a/src/LLVM/lib/CodeGen/VirtRegRewriter.cpp b/src/LLVM/lib/CodeGen/VirtRegRewriter.cpp
new file mode 100644
index 0000000..0a9b58d
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/VirtRegRewriter.cpp
@@ -0,0 +1,2545 @@
+//===-- llvm/CodeGen/Rewriter.cpp -  Rewriter -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "virtregrewriter"
+#include "VirtRegRewriter.h"
+#include "VirtRegMap.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumDSE     , "Number of dead stores elided");
+STATISTIC(NumDSS     , "Number of dead spill slots removed");
+STATISTIC(NumCommutes, "Number of instructions commuted");
+STATISTIC(NumDRM     , "Number of re-materializable defs elided");
+STATISTIC(NumStores  , "Number of stores added");
+STATISTIC(NumPSpills , "Number of physical register spills");
+STATISTIC(NumOmitted , "Number of reloads omitted");
+STATISTIC(NumAvoided , "Number of reloads deemed unnecessary");
+STATISTIC(NumCopified, "Number of available reloads turned into copies");
+STATISTIC(NumReMats  , "Number of re-materializations");
+STATISTIC(NumLoads   , "Number of loads added");
+STATISTIC(NumReused  , "Number of values reused");
+STATISTIC(NumDCE     , "Number of copies elided");
+STATISTIC(NumSUnfold , "Number of stores unfolded");
+STATISTIC(NumModRefUnfold, "Number of modref unfolded");
+
+namespace {
+  enum RewriterName { local, trivial };
+}
+
+static cl::opt<RewriterName>
+RewriterOpt("rewriter",
+            cl::desc("Rewriter to use (default=local)"),
+            cl::Prefix,
+            cl::values(clEnumVal(local,   "local rewriter"),
+                       clEnumVal(trivial, "trivial rewriter"),
+                       clEnumValEnd),
+            cl::init(local));
+
+static cl::opt<bool>
+ScheduleSpills("schedule-spills",
+               cl::desc("Schedule spill code"),
+               cl::init(false));
+
+VirtRegRewriter::~VirtRegRewriter() {}
+
+/// substitutePhysReg - Replace virtual register in MachineOperand with a
+/// physical register. Do the right thing with the sub-register index.
+/// Note that operands may be added, so the MO reference is no longer valid.
+static void substitutePhysReg(MachineOperand &MO, unsigned Reg,
+                              const TargetRegisterInfo &TRI) {
+  if (unsigned SubIdx = MO.getSubReg()) {
+    // Insert the physical subreg and reset the subreg field.
+    MO.setReg(TRI.getSubReg(Reg, SubIdx));
+    MO.setSubReg(0);
+
+    // Any def, dead, and kill flags apply to the full virtual register, so they
+    // also apply to the full physical register. Add imp-def/dead and imp-kill
+    // as needed.
+    MachineInstr &MI = *MO.getParent();
+    if (MO.isDef())
+      if (MO.isDead())
+        MI.addRegisterDead(Reg, &TRI, /*AddIfNotFound=*/ true);
+      else
+        MI.addRegisterDefined(Reg, &TRI);
+    else if (!MO.isUndef() &&
+             (MO.isKill() ||
+              MI.isRegTiedToDefOperand(&MO-&MI.getOperand(0))))
+      MI.addRegisterKilled(Reg, &TRI, /*AddIfNotFound=*/ true);
+  } else {
+    MO.setReg(Reg);
+  }
+}
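+// Substitution sketch (x86 names for illustration): a use of
+// %reg1024:sub_32bit whose vreg was assigned RAX is rewritten to read EAX
+// (TRI.getSubReg(RAX, sub_32bit)), and because a kill of the vreg kills
+// all of RAX, an implicit kill of RAX is added to the instruction.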
+
+namespace {
+
+/// This class is intended for use with the new spilling framework only. It
+/// rewrites vreg def/uses to use the assigned preg, but does not insert any
+/// spill code.
+struct TrivialRewriter : public VirtRegRewriter {
+
+  bool runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM,
+                            LiveIntervals* LIs) {
+    DEBUG(dbgs() << "********** REWRITE MACHINE CODE **********\n");
+    DEBUG(dbgs() << "********** Function: "
+          << MF.getFunction()->getName() << '\n');
+    DEBUG(dbgs() << "**** Machine Instrs"
+          << "(NOTE! Does not include spills and reloads!) ****\n");
+    DEBUG(MF.dump());
+
+    MachineRegisterInfo *mri = &MF.getRegInfo();
+    const TargetRegisterInfo *tri = MF.getTarget().getRegisterInfo();
+
+    bool changed = false;
+
+    for (LiveIntervals::iterator liItr = LIs->begin(), liEnd = LIs->end();
+         liItr != liEnd; ++liItr) {
+
+      const LiveInterval *li = liItr->second;
+      unsigned reg = li->reg;
+
+      if (TargetRegisterInfo::isPhysicalRegister(reg)) {
+        if (!li->empty())
+          mri->setPhysRegUsed(reg);
+      }
+      else {
+        if (!VRM.hasPhys(reg))
+          continue;
+        unsigned pReg = VRM.getPhys(reg);
+        mri->setPhysRegUsed(pReg);
+        // Copy the register use-list before traversing it.
+        SmallVector<std::pair<MachineInstr*, unsigned>, 32> reglist;
+        for (MachineRegisterInfo::reg_iterator I = mri->reg_begin(reg),
+               E = mri->reg_end(); I != E; ++I)
+          reglist.push_back(std::make_pair(&*I, I.getOperandNo()));
+        for (unsigned N=0; N != reglist.size(); ++N)
+          substitutePhysReg(reglist[N].first->getOperand(reglist[N].second),
+                            pReg, *tri);
+        changed |= !reglist.empty();
+      }
+    }
+
+    DEBUG(dbgs() << "**** Post Machine Instrs ****\n");
+    DEBUG(MF.dump());
+
+    return changed;
+  }
+
+};
+
+}
+
+// ************************************************************************ //
+
+namespace {
+
+/// AvailableSpills - As the local rewriter is scanning and rewriting an MBB
+/// from top down, keep track of which spill slots or remat are available in
+/// each register.
+///
+/// Note that not all physregs are created equal here.  In particular, some
+/// physregs are reloads that we are allowed to clobber or ignore at any time.
+/// Other physregs are values that the register allocated program is using
+/// that we cannot CHANGE, but we can read if we like.  We keep track of this
+/// on a per-stack-slot / remat id basis as the low bit in the value of the
+/// SpillSlotsAvailable entries.  The predicate 'canClobberPhysReg()' checks
+/// this bit and addAvailable sets it when called with CanClobber = true.
+class AvailableSpills {
+  const TargetRegisterInfo *TRI;
+  const TargetInstrInfo *TII;
+
+  // SpillSlotsOrReMatsAvailable - This map keeps track of all of the spilled
+  // or remat'ed virtual register values that are still available, due to
+  // being loaded or stored to, but not invalidated yet.
+  std::map<int, unsigned> SpillSlotsOrReMatsAvailable;
+
+  // PhysRegsAvailable - This is the inverse of SpillSlotsOrReMatsAvailable,
+  // indicating which stack slot values are currently held by a physreg.  This
+  // is used to invalidate entries in SpillSlotsOrReMatsAvailable when a
+  // physreg is modified.
+  std::multimap<unsigned, int> PhysRegsAvailable;
+
+  void disallowClobberPhysRegOnly(unsigned PhysReg);
+
+  void ClobberPhysRegOnly(unsigned PhysReg);
+public:
+  AvailableSpills(const TargetRegisterInfo *tri, const TargetInstrInfo *tii)
+    : TRI(tri), TII(tii) {
+  }
+
+  /// clear - Reset the state.
+  void clear() {
+    SpillSlotsOrReMatsAvailable.clear();
+    PhysRegsAvailable.clear();
+  }
+
+  const TargetRegisterInfo *getRegInfo() const { return TRI; }
+
+  /// getSpillSlotOrReMatPhysReg - If the specified stack slot or remat is
+  /// available in a physical register, return that PhysReg, otherwise
+  /// return 0.
+  unsigned getSpillSlotOrReMatPhysReg(int Slot) const {
+    std::map<int, unsigned>::const_iterator I =
+      SpillSlotsOrReMatsAvailable.find(Slot);
+    if (I != SpillSlotsOrReMatsAvailable.end()) {
+      return I->second >> 1;  // Remove the CanClobber bit.
+    }
+    return 0;
+  }
+
+  /// addAvailable - Mark that the specified stack slot / remat is available
+  /// in the specified physreg.  If CanClobber is true, the physreg can be
+  /// modified at any time without changing the semantics of the program.
+  void addAvailable(int SlotOrReMat, unsigned Reg, bool CanClobber = true) {
+    // If this stack slot is thought to be available in some other physreg,
+    // remove its record.
+    ModifyStackSlotOrReMat(SlotOrReMat);
+
+    PhysRegsAvailable.insert(std::make_pair(Reg, SlotOrReMat));
+    SpillSlotsOrReMatsAvailable[SlotOrReMat]= (Reg << 1) |
+                                              (unsigned)CanClobber;
+
+    if (SlotOrReMat > VirtRegMap::MAX_STACK_SLOT)
+      DEBUG(dbgs() << "Remembering RM#"
+                   << SlotOrReMat-VirtRegMap::MAX_STACK_SLOT-1);
+    else
+      DEBUG(dbgs() << "Remembering SS#" << SlotOrReMat);
+    DEBUG(dbgs() << " in physreg " << TRI->getName(Reg) << "\n");
+  }
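+  // Encoding sketch (values hypothetical): addAvailable(4, X86::EAX, true)
+  // records SpillSlotsOrReMatsAvailable[4] = (EAX << 1) | 1; the low bit is
+  // the CanClobber flag, and getSpillSlotOrReMatPhysReg(4) shifts it away
+  // before returning the physreg.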
+
+  /// canClobberPhysRegForSS - Return true if the spiller is allowed to change
+  /// the value of the specified stackslot register if it desires. The
+  /// specified stack slot must be available in a physreg for this query to
+  /// make sense.
+  bool canClobberPhysRegForSS(int SlotOrReMat) const {
+    assert(SpillSlotsOrReMatsAvailable.count(SlotOrReMat) &&
+           "Value not available!");
+    return SpillSlotsOrReMatsAvailable.find(SlotOrReMat)->second & 1;
+  }
+
+  /// canClobberPhysReg - Return true if the spiller is allowed to clobber the
+  /// physical register where values for some stack slot(s) might be
+  /// available.
+  bool canClobberPhysReg(unsigned PhysReg) const {
+    std::multimap<unsigned, int>::const_iterator I =
+      PhysRegsAvailable.lower_bound(PhysReg);
+    while (I != PhysRegsAvailable.end() && I->first == PhysReg) {
+      int SlotOrReMat = I->second;
+      I++;
+      if (!canClobberPhysRegForSS(SlotOrReMat))
+        return false;
+    }
+    return true;
+  }
+
+  /// disallowClobberPhysReg - Unset the CanClobber bit of the specified
+  /// stackslot register. The register is still available but is no longer
+  /// allowed to be modified.
+  void disallowClobberPhysReg(unsigned PhysReg);
+
+  /// ClobberPhysReg - This is called when the specified physreg changes
+  /// value.  We use this to invalidate any info about stuff that lives in
+  /// it and any of its aliases.
+  void ClobberPhysReg(unsigned PhysReg);
+
+  /// ModifyStackSlotOrReMat - This method is called when the value in a stack
+  /// slot changes.  This removes information about which register the
+  /// previous value for this slot lives in (as the previous value is dead
+  /// now).
+  void ModifyStackSlotOrReMat(int SlotOrReMat);
+
+  /// AddAvailableRegsToLiveIn - Availability information is carried into
+  /// the specified MBB. Add available physical registers as potential
+  /// live-ins. If they are reused in the MBB, they will be added to the
+  /// live-in set to keep the register scavenger and post-allocation
+  /// scheduler happy.
+  void AddAvailableRegsToLiveIn(MachineBasicBlock &MBB, BitVector &RegKills,
+                                std::vector<MachineOperand*> &KillOps);
+};
+
+}
+
+// ************************************************************************ //
+
+// Given a location where a reload of a spilled register or a remat of
+// a constant is to be inserted, attempt to find a safe location to
+// insert the load at an earlier point in the basic-block, to hide
+// latency of the load and to avoid address-generation interlock
+// issues.
+static MachineBasicBlock::iterator
+ComputeReloadLoc(MachineBasicBlock::iterator const InsertLoc,
+                 MachineBasicBlock::iterator const Begin,
+                 unsigned PhysReg,
+                 const TargetRegisterInfo *TRI,
+                 bool DoReMat,
+                 int SSorRMId,
+                 const TargetInstrInfo *TII,
+                 const MachineFunction &MF)
+{
+  if (!ScheduleSpills)
+    return InsertLoc;
+
+  // Spill backscheduling is of primary interest to addresses, so
+  // don't do anything if the register isn't in the register class
+  // used for pointers.
+
+  const TargetLowering *TL = MF.getTarget().getTargetLowering();
+
+  if (!TL->isTypeLegal(TL->getPointerTy()))
+    // Believe it or not, this is true on PIC16.
+    return InsertLoc;
+
+  const TargetRegisterClass *ptrRegClass =
+    TL->getRegClassFor(TL->getPointerTy());
+  if (!ptrRegClass->contains(PhysReg))
+    return InsertLoc;
+
+  // Scan upwards through the preceding instructions. If an instruction doesn't
+  // reference the stack slot or the register we're loading, we can
+  // backschedule the reload up past it.
+  MachineBasicBlock::iterator NewInsertLoc = InsertLoc;
+  while (NewInsertLoc != Begin) {
+    MachineBasicBlock::iterator Prev = prior(NewInsertLoc);
+    for (unsigned i = 0; i < Prev->getNumOperands(); ++i) {
+      MachineOperand &Op = Prev->getOperand(i);
+      if (!DoReMat && Op.isFI() && Op.getIndex() == SSorRMId)
+        goto stop;
+    }
+    if (Prev->findRegisterUseOperandIdx(PhysReg) != -1 ||
+        Prev->findRegisterDefOperand(PhysReg))
+      goto stop;
+    for (const unsigned *Alias = TRI->getAliasSet(PhysReg); *Alias; ++Alias)
+      if (Prev->findRegisterUseOperandIdx(*Alias) != -1 ||
+          Prev->findRegisterDefOperand(*Alias))
+        goto stop;
+    NewInsertLoc = Prev;
+  }
+stop:;
+
+  // If we made it to the beginning of the block, turn around and move back
+  // down just past any existing reloads. They're likely to be reloads/remats
+  // for instructions earlier than what our current reload/remat is for, so
+  // they should be scheduled earlier.
+  if (NewInsertLoc == Begin) {
+    int FrameIdx;
+    while (InsertLoc != NewInsertLoc &&
+           (TII->isLoadFromStackSlot(NewInsertLoc, FrameIdx) ||
+            TII->isTriviallyReMaterializable(NewInsertLoc)))
+      ++NewInsertLoc;
+  }
+
+  return NewInsertLoc;
+}
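+
+// Illustration (hypothetical x86 sequence): if the reload "movl fi#2, %eax"
+// is to be inserted after "addl %ebx, %ecx", and the addl references neither
+// fi#2 nor %eax nor an alias of %eax, the reload can be back-scheduled above
+// the addl, hiding part of the load latency.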
+
+namespace {
+
+// ReusedOp - For each reused operand, we keep track of a bit of information,
+// in case we need to roll back upon processing a new operand.  See comments
+// below.
+struct ReusedOp {
+  // The MachineInstr operand that reused an available value.
+  unsigned Operand;
+
+  // StackSlotOrReMat - The spill slot or remat id of the value being reused.
+  unsigned StackSlotOrReMat;
+
+  // PhysRegReused - The physical register the value was available in.
+  unsigned PhysRegReused;
+
+  // AssignedPhysReg - The physreg that was assigned for use by the reload.
+  unsigned AssignedPhysReg;
+
+  // VirtReg - The virtual register itself.
+  unsigned VirtReg;
+
+  ReusedOp(unsigned o, unsigned ss, unsigned prr, unsigned apr,
+           unsigned vreg)
+    : Operand(o), StackSlotOrReMat(ss), PhysRegReused(prr),
+      AssignedPhysReg(apr), VirtReg(vreg) {}
+};
+
+/// ReuseInfo - This maintains a collection of ReuseOp's for each operand that
+/// is reused instead of reloaded.
+class ReuseInfo {
+  MachineInstr &MI;
+  std::vector<ReusedOp> Reuses;
+  BitVector PhysRegsClobbered;
+public:
+  ReuseInfo(MachineInstr &mi, const TargetRegisterInfo *tri) : MI(mi) {
+    PhysRegsClobbered.resize(tri->getNumRegs());
+  }
+
+  bool hasReuses() const {
+    return !Reuses.empty();
+  }
+
+  /// addReuse - If we choose to reuse a virtual register that is already
+  /// available instead of reloading it, remember that we did so.
+  void addReuse(unsigned OpNo, unsigned StackSlotOrReMat,
+                unsigned PhysRegReused, unsigned AssignedPhysReg,
+                unsigned VirtReg) {
+    // If the reload is to the assigned register anyway, no undo will be
+    // required.
+    if (PhysRegReused == AssignedPhysReg) return;
+
+    // Otherwise, remember this.
+    Reuses.push_back(ReusedOp(OpNo, StackSlotOrReMat, PhysRegReused,
+                              AssignedPhysReg, VirtReg));
+  }
+
+  void markClobbered(unsigned PhysReg) {
+    PhysRegsClobbered.set(PhysReg);
+  }
+
+  bool isClobbered(unsigned PhysReg) const {
+    return PhysRegsClobbered.test(PhysReg);
+  }
+
+  /// GetRegForReload - We are about to emit a reload into PhysReg.  If there
+  /// is some other operand that is using the specified register, either pick
+  /// a new register to use, or evict the previous reload and use this reg.
+  unsigned GetRegForReload(const TargetRegisterClass *RC, unsigned PhysReg,
+                           MachineFunction &MF, MachineInstr *MI,
+                           AvailableSpills &Spills,
+                           std::vector<MachineInstr*> &MaybeDeadStores,
+                           SmallSet<unsigned, 8> &Rejected,
+                           BitVector &RegKills,
+                           std::vector<MachineOperand*> &KillOps,
+                           VirtRegMap &VRM);
+
+  /// GetRegForReload - Helper for the above GetRegForReload(). Add a
+  /// 'Rejected' set to remember which registers have been considered and
+  /// rejected for the reload. This avoids infinite looping in cases like
+  /// this:
+  /// t1 := op t2, t3
+  /// t2 <- assigned r0 for use by the reload but ended up reusing r1
+  /// t3 <- assigned r1 for use by the reload but ended up reusing r0
+  /// t1 <- desires r1
+  ///       sees r1 is taken by t2, tries t2's reload register r0
+  ///       sees r0 is taken by t3, tries t3's reload register r1
+  ///       sees r1 is taken by t2, tries t2's reload register r0 ...
+  unsigned GetRegForReload(unsigned VirtReg, unsigned PhysReg, MachineInstr *MI,
+                           AvailableSpills &Spills,
+                           std::vector<MachineInstr*> &MaybeDeadStores,
+                           BitVector &RegKills,
+                           std::vector<MachineOperand*> &KillOps,
+                           VirtRegMap &VRM) {
+    SmallSet<unsigned, 8> Rejected;
+    MachineFunction &MF = *MI->getParent()->getParent();
+    const TargetRegisterClass* RC = MF.getRegInfo().getRegClass(VirtReg);
+    return GetRegForReload(RC, PhysReg, MF, MI, Spills, MaybeDeadStores,
+                           Rejected, RegKills, KillOps, VRM);
+  }
+};
+
+}
+
+// ****************** //
+// Utility Functions  //
+// ****************** //
+
+/// findSinglePredSuccessor - Return via reference a vector of machine basic
+/// blocks each of which is a successor of the specified BB and has no other
+/// predecessor.
+static void findSinglePredSuccessor(MachineBasicBlock *MBB,
+                                   SmallVectorImpl<MachineBasicBlock *> &Succs){
+  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+    MachineBasicBlock *SuccMBB = *SI;
+    if (SuccMBB->pred_size() == 1)
+      Succs.push_back(SuccMBB);
+  }
+}
+
+/// InvalidateKill - Invalidate register kill information for a specific
+/// register. This also unsets the kill marker on the last kill operand.
+static void InvalidateKill(unsigned Reg,
+                           const TargetRegisterInfo* TRI,
+                           BitVector &RegKills,
+                           std::vector<MachineOperand*> &KillOps) {
+  if (RegKills[Reg]) {
+    KillOps[Reg]->setIsKill(false);
+    // KillOps[Reg] might be a def of a super-register.
+    unsigned KReg = KillOps[Reg]->getReg();
+    KillOps[KReg] = NULL;
+    RegKills.reset(KReg);
+    for (const unsigned *SR = TRI->getSubRegisters(KReg); *SR; ++SR) {
+      if (RegKills[*SR]) {
+        KillOps[*SR]->setIsKill(false);
+        KillOps[*SR] = NULL;
+        RegKills.reset(*SR);
+      }
+    }
+  }
+}
+
+/// InvalidateKills - MI is going to be deleted. If any of its operands are
+/// marked kill, then invalidate the information.
+static void InvalidateKills(MachineInstr &MI,
+                            const TargetRegisterInfo* TRI,
+                            BitVector &RegKills,
+                            std::vector<MachineOperand*> &KillOps,
+                            SmallVector<unsigned, 2> *KillRegs = NULL) {
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || !MO.isUse() || !MO.isKill() || MO.isUndef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    if (KillRegs)
+      KillRegs->push_back(Reg);
+    assert(Reg < KillOps.size());
+    if (KillOps[Reg] == &MO) {
+      KillOps[Reg] = NULL;
+      RegKills.reset(Reg);
+      for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR) {
+        if (RegKills[*SR]) {
+          KillOps[*SR] = NULL;
+          RegKills.reset(*SR);
+        }
+      }
+    }
+  }
+}
+
+/// InvalidateRegDef - If the def operand of the specified def MI is now dead
+/// (since its spill instruction is removed), mark it isDead. Also checks
+/// whether the def MI has other definition operands that are not dead, and
+/// returns that result by reference in HasLiveDef.
+static bool InvalidateRegDef(MachineBasicBlock::iterator I,
+                             MachineInstr &NewDef, unsigned Reg,
+                             bool &HasLiveDef,
+                             const TargetRegisterInfo *TRI) {
+  // Due to remat, it's possible this reg isn't being reused. That is,
+  // the def of this reg (by prev MI) is now dead.
+  MachineInstr *DefMI = I;
+  MachineOperand *DefOp = NULL;
+  for (unsigned i = 0, e = DefMI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = DefMI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef() || !MO.isKill() || MO.isUndef())
+      continue;
+    if (MO.getReg() == Reg)
+      DefOp = &MO;
+    else if (!MO.isDead())
+      HasLiveDef = true;
+  }
+  if (!DefOp)
+    return false;
+
+  bool FoundUse = false, Done = false;
+  MachineBasicBlock::iterator E = &NewDef;
+  ++I; ++E;
+  for (; !Done && I != E; ++I) {
+    MachineInstr *NMI = I;
+    for (unsigned j = 0, ee = NMI->getNumOperands(); j != ee; ++j) {
+      MachineOperand &MO = NMI->getOperand(j);
+      if (!MO.isReg() || MO.getReg() == 0 ||
+          (MO.getReg() != Reg && !TRI->isSubRegister(Reg, MO.getReg())))
+        continue;
+      if (MO.isUse())
+        FoundUse = true;
+      Done = true; // Stop after scanning all the operands of this MI.
+    }
+  }
+  if (!FoundUse) {
+    // Def is dead!
+    DefOp->setIsDead();
+    return true;
+  }
+  return false;
+}
+
+/// UpdateKills - Track and update kill info. If a MI reads a register that is
+/// marked kill, then it must be due to register reuse. Transfer the kill info
+/// over.
+static void UpdateKills(MachineInstr &MI, const TargetRegisterInfo* TRI,
+                        BitVector &RegKills,
+                        std::vector<MachineOperand*> &KillOps) {
+  // These do not affect kill info at all.
+  if (MI.isDebugValue())
+    return;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || !MO.isUse() || MO.isUndef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0)
+      continue;
+
+    if (RegKills[Reg] && KillOps[Reg]->getParent() != &MI) {
+      // That can't be right. Register is killed but not re-defined and it's
+      // being reused. Let's fix that.
+      KillOps[Reg]->setIsKill(false);
+      // KillOps[Reg] might be a def of a super-register.
+      unsigned KReg = KillOps[Reg]->getReg();
+      KillOps[KReg] = NULL;
+      RegKills.reset(KReg);
+
+      // Must be a def of a super-register. Its other sub-registers are no
+      // longer killed as well.
+      for (const unsigned *SR = TRI->getSubRegisters(KReg); *SR; ++SR) {
+        KillOps[*SR] = NULL;
+        RegKills.reset(*SR);
+      }
+    } else {
+      // Check for subreg kills as well.
+      // d4 =
+      // store d4, fi#0
+      // ...
+      //    = s8<kill>
+      // ...
+      //    = d4  <avoiding reload>
+      for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR) {
+        unsigned SReg = *SR;
+        if (RegKills[SReg] && KillOps[SReg]->getParent() != &MI) {
+          KillOps[SReg]->setIsKill(false);
+          unsigned KReg = KillOps[SReg]->getReg();
+          KillOps[KReg] = NULL;
+          RegKills.reset(KReg);
+
+          for (const unsigned *SSR = TRI->getSubRegisters(KReg); *SSR; ++SSR) {
+            KillOps[*SSR] = NULL;
+            RegKills.reset(*SSR);
+          }
+        }
+      }
+    }
+
+    if (MO.isKill()) {
+      RegKills.set(Reg);
+      KillOps[Reg] = &MO;
+      for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR) {
+        RegKills.set(*SR);
+        KillOps[*SR] = &MO;
+      }
+    }
+  }
+
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || !MO.getReg() || !MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    RegKills.reset(Reg);
+    KillOps[Reg] = NULL;
+    // It also defines (or partially defines) aliases.
+    for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR) {
+      RegKills.reset(*SR);
+      KillOps[*SR] = NULL;
+    }
+    for (const unsigned *SR = TRI->getSuperRegisters(Reg); *SR; ++SR) {
+      RegKills.reset(*SR);
+      KillOps[*SR] = NULL;
+    }
+  }
+}
+
+/// ReMaterialize - Re-materialize definition for Reg targeting DestReg.
+///
+static void ReMaterialize(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator &MII,
+                          unsigned DestReg, unsigned Reg,
+                          const TargetInstrInfo *TII,
+                          const TargetRegisterInfo *TRI,
+                          VirtRegMap &VRM) {
+  MachineInstr *ReMatDefMI = VRM.getReMaterializedMI(Reg);
+#ifndef NDEBUG
+  const TargetInstrDesc &TID = ReMatDefMI->getDesc();
+  assert(TID.getNumDefs() == 1 &&
+         "Don't know how to remat instructions that define > 1 values!");
+#endif
+  TII->reMaterialize(MBB, MII, DestReg, 0, ReMatDefMI, *TRI);
+  MachineInstr *NewMI = prior(MII);
+  for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = NewMI->getOperand(i);
+    if (!MO.isReg() || MO.getReg() == 0)
+      continue;
+    unsigned VirtReg = MO.getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(VirtReg))
+      continue;
+    assert(MO.isUse());
+    unsigned Phys = VRM.getPhys(VirtReg);
+    assert(Phys && "Virtual register is not assigned a register?");
+    substitutePhysReg(MO, Phys, *TRI);
+  }
+  ++NumReMats;
+}
+
+/// findSuperReg - Find the SubReg's super-register of given register class
+/// where its SubIdx sub-register is SubReg.
+static unsigned findSuperReg(const TargetRegisterClass *RC, unsigned SubReg,
+                             unsigned SubIdx, const TargetRegisterInfo *TRI) {
+  for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
+       I != E; ++I) {
+    unsigned Reg = *I;
+    if (TRI->getSubReg(Reg, SubIdx) == SubReg)
+      return Reg;
+  }
+  return 0;
+}
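+
+// Illustration (hypothetical x86 query): findSuperReg(GR32, AX, <16-bit
+// sub-register index>, TRI) would scan GR32 and return EAX, since EAX's
+// 16-bit sub-register is AX; 0 is returned when no register in RC matches.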
+
+// ******************************** //
+// Available Spills Implementation  //
+// ******************************** //
+
+/// disallowClobberPhysRegOnly - Unset the CanClobber bit of the specified
+/// stackslot register. The register is still available but is no longer
+/// allowed to be modified.
+void AvailableSpills::disallowClobberPhysRegOnly(unsigned PhysReg) {
+  std::multimap<unsigned, int>::iterator I =
+    PhysRegsAvailable.lower_bound(PhysReg);
+  while (I != PhysRegsAvailable.end() && I->first == PhysReg) {
+    int SlotOrReMat = I->second;
+    I++;
+    assert((SpillSlotsOrReMatsAvailable[SlotOrReMat] >> 1) == PhysReg &&
+           "Bidirectional map mismatch!");
+    SpillSlotsOrReMatsAvailable[SlotOrReMat] &= ~1;
+    DEBUG(dbgs() << "PhysReg " << TRI->getName(PhysReg)
+         << " copied, it is available for use but can no longer be modified\n");
+  }
+}
+
+/// disallowClobberPhysReg - Unset the CanClobber bit of the specified
+/// stackslot register and its aliases. The register and its aliases may
+/// still be available but are no longer allowed to be modified.
+void AvailableSpills::disallowClobberPhysReg(unsigned PhysReg) {
+  for (const unsigned *AS = TRI->getAliasSet(PhysReg); *AS; ++AS)
+    disallowClobberPhysRegOnly(*AS);
+  disallowClobberPhysRegOnly(PhysReg);
+}
+
+/// ClobberPhysRegOnly - This is called when the specified physreg changes
+/// value.  We use this to invalidate any info about stuff we think lives in it.
+void AvailableSpills::ClobberPhysRegOnly(unsigned PhysReg) {
+  std::multimap<unsigned, int>::iterator I =
+    PhysRegsAvailable.lower_bound(PhysReg);
+  while (I != PhysRegsAvailable.end() && I->first == PhysReg) {
+    int SlotOrReMat = I->second;
+    PhysRegsAvailable.erase(I++);
+    assert((SpillSlotsOrReMatsAvailable[SlotOrReMat] >> 1) == PhysReg &&
+           "Bidirectional map mismatch!");
+    SpillSlotsOrReMatsAvailable.erase(SlotOrReMat);
+    DEBUG(dbgs() << "PhysReg " << TRI->getName(PhysReg)
+          << " clobbered, invalidating ");
+    if (SlotOrReMat > VirtRegMap::MAX_STACK_SLOT)
+      DEBUG(dbgs() << "RM#" << SlotOrReMat-VirtRegMap::MAX_STACK_SLOT-1 <<"\n");
+    else
+      DEBUG(dbgs() << "SS#" << SlotOrReMat << "\n");
+  }
+}
+
+/// ClobberPhysReg - This is called when the specified physreg changes
+/// value.  We use this to invalidate any info about stuff we think lives in
+/// it and any of its aliases.
+void AvailableSpills::ClobberPhysReg(unsigned PhysReg) {
+  for (const unsigned *AS = TRI->getAliasSet(PhysReg); *AS; ++AS)
+    ClobberPhysRegOnly(*AS);
+  ClobberPhysRegOnly(PhysReg);
+}
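+
+// Illustration (hypothetical x86 aliasing): ClobberPhysReg(EAX) invalidates
+// not only values tracked in EAX but, via the alias set, values tracked in
+// AX, AL and AH as well.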
+
+/// AddAvailableRegsToLiveIn - Availability information is being kept coming
+/// into the specified MBB. Add available physical registers as potential
+/// live-ins. If they are reused in the MBB, they will be added to the
+/// live-in set to keep the register scavenger and post-allocation scheduler
+/// happy.
+void AvailableSpills::AddAvailableRegsToLiveIn(MachineBasicBlock &MBB,
+                                        BitVector &RegKills,
+                                        std::vector<MachineOperand*> &KillOps) {
+  std::set<unsigned> NotAvailable;
+  for (std::multimap<unsigned, int>::iterator
+         I = PhysRegsAvailable.begin(), E = PhysRegsAvailable.end();
+       I != E; ++I) {
+    unsigned Reg = I->first;
+    const TargetRegisterClass* RC = TRI->getMinimalPhysRegClass(Reg);
+    // FIXME: A temporary workaround. We can't reuse an available value if
+    // it's not safe to move defs of the virtual register's class, e.g. the
+    // X86::RFP* register classes. Do not add it as a live-in.
+    if (!TII->isSafeToMoveRegClassDefs(RC))
+      // This is no longer available.
+      NotAvailable.insert(Reg);
+    else {
+      MBB.addLiveIn(Reg);
+      InvalidateKill(Reg, TRI, RegKills, KillOps);
+    }
+
+    // Skip over the same register.
+    std::multimap<unsigned, int>::iterator NI = llvm::next(I);
+    while (NI != E && NI->first == Reg) {
+      ++I;
+      ++NI;
+    }
+  }
+
+  for (std::set<unsigned>::iterator I = NotAvailable.begin(),
+         E = NotAvailable.end(); I != E; ++I) {
+    ClobberPhysReg(*I);
+    for (const unsigned *SubRegs = TRI->getSubRegisters(*I);
+       *SubRegs; ++SubRegs)
+      ClobberPhysReg(*SubRegs);
+  }
+}
+
+/// ModifyStackSlotOrReMat - This method is called when the value in a stack
+/// slot changes.  This removes information about which register the previous
+/// value for this slot lives in (as the previous value is dead now).
+void AvailableSpills::ModifyStackSlotOrReMat(int SlotOrReMat) {
+  std::map<int, unsigned>::iterator It =
+    SpillSlotsOrReMatsAvailable.find(SlotOrReMat);
+  if (It == SpillSlotsOrReMatsAvailable.end()) return;
+  unsigned Reg = It->second >> 1;
+  SpillSlotsOrReMatsAvailable.erase(It);
+
+  // This register may hold the value of multiple stack slots, only remove this
+  // stack slot from the set of values the register contains.
+  std::multimap<unsigned, int>::iterator I = PhysRegsAvailable.lower_bound(Reg);
+  for (; ; ++I) {
+    assert(I != PhysRegsAvailable.end() && I->first == Reg &&
+           "Map inverse broken!");
+    if (I->second == SlotOrReMat) break;
+  }
+  PhysRegsAvailable.erase(I);
+}
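+
+// Illustration (hypothetical contents): if EAX currently holds the values of
+// both SS#1 and SS#4, ModifyStackSlotOrReMat(1) erases only the (EAX, SS#1)
+// pairing; SS#4 stays recorded as available in EAX.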
+
+// ************************** //
+// Reuse Info Implementation  //
+// ************************** //
+
+/// GetRegForReload - We are about to emit a reload into PhysReg.  If there
+/// is some other operand that is using the specified register, either pick
+/// a new register to use, or evict the previous reload and use this reg.
+unsigned ReuseInfo::GetRegForReload(const TargetRegisterClass *RC,
+                         unsigned PhysReg,
+                         MachineFunction &MF,
+                         MachineInstr *MI, AvailableSpills &Spills,
+                         std::vector<MachineInstr*> &MaybeDeadStores,
+                         SmallSet<unsigned, 8> &Rejected,
+                         BitVector &RegKills,
+                         std::vector<MachineOperand*> &KillOps,
+                         VirtRegMap &VRM) {
+  const TargetInstrInfo* TII = MF.getTarget().getInstrInfo();
+  const TargetRegisterInfo *TRI = Spills.getRegInfo();
+
+  if (Reuses.empty()) return PhysReg;  // This is most often empty.
+
+  for (unsigned ro = 0, e = Reuses.size(); ro != e; ++ro) {
+    ReusedOp &Op = Reuses[ro];
+    // If we find some other reuse that was supposed to use this register
+    // exactly for its reload, we can change this reload to use ITS reload
+    // register. That is, unless its reload register has already been
+    // considered and subsequently rejected because it has also been reused
+    // by another operand.
+    if (Op.PhysRegReused == PhysReg &&
+        Rejected.count(Op.AssignedPhysReg) == 0 &&
+        RC->contains(Op.AssignedPhysReg)) {
+      // Yup, use the reload register that we didn't use before.
+      unsigned NewReg = Op.AssignedPhysReg;
+      Rejected.insert(PhysReg);
+      return GetRegForReload(RC, NewReg, MF, MI, Spills, MaybeDeadStores,
+                             Rejected, RegKills, KillOps, VRM);
+    } else {
+      // Otherwise, we might also have a problem if a previously reused
+      // value aliases the new register. If so, codegen the previous reload
+      // and use this one.
+      unsigned PRRU = Op.PhysRegReused;
+      if (TRI->regsOverlap(PRRU, PhysReg)) {
+        // Okay, we found out that an alias of a reused register
+        // was used.  This isn't good because it means we have
+        // to undo a previous reuse.
+        MachineBasicBlock *MBB = MI->getParent();
+        const TargetRegisterClass *AliasRC =
+          MBB->getParent()->getRegInfo().getRegClass(Op.VirtReg);
+
+        // Copy Op out of the vector and remove it, we're going to insert an
+        // explicit load for it.
+        ReusedOp NewOp = Op;
+        Reuses.erase(Reuses.begin()+ro);
+
+        // MI may be using only a sub-register of PhysRegUsed.
+        unsigned RealPhysRegUsed = MI->getOperand(NewOp.Operand).getReg();
+        unsigned SubIdx = 0;
+        assert(TargetRegisterInfo::isPhysicalRegister(RealPhysRegUsed) &&
+               "A reuse cannot be a virtual register");
+        if (PRRU != RealPhysRegUsed) {
+          // What was the sub-register index?
+          SubIdx = TRI->getSubRegIndex(PRRU, RealPhysRegUsed);
+          assert(SubIdx &&
+                 "Operand physreg is not a sub-register of PhysRegUsed");
+        }
+
+        // Ok, we're going to try to reload the assigned physreg into the
+        // slot that we were supposed to in the first place.  However, that
+        // register could hold a reuse.  Check to see if it conflicts or
+        // would prefer us to use a different register.
+        unsigned NewPhysReg = GetRegForReload(RC, NewOp.AssignedPhysReg,
+                                              MF, MI, Spills, MaybeDeadStores,
+                                              Rejected, RegKills, KillOps, VRM);
+
+        bool DoReMat = NewOp.StackSlotOrReMat > VirtRegMap::MAX_STACK_SLOT;
+        int SSorRMId = DoReMat
+          ? VRM.getReMatId(NewOp.VirtReg) : (int) NewOp.StackSlotOrReMat;
+
+        // Back-schedule reloads and remats.
+        MachineBasicBlock::iterator InsertLoc =
+          ComputeReloadLoc(MI, MBB->begin(), PhysReg, TRI,
+                           DoReMat, SSorRMId, TII, MF);
+
+        if (DoReMat) {
+          ReMaterialize(*MBB, InsertLoc, NewPhysReg, NewOp.VirtReg, TII,
+                        TRI, VRM);
+        } else {
+          TII->loadRegFromStackSlot(*MBB, InsertLoc, NewPhysReg,
+                                    NewOp.StackSlotOrReMat, AliasRC, TRI);
+          MachineInstr *LoadMI = prior(InsertLoc);
+          VRM.addSpillSlotUse(NewOp.StackSlotOrReMat, LoadMI);
+          // Any stores to this stack slot are not dead anymore.
+          MaybeDeadStores[NewOp.StackSlotOrReMat] = NULL;
+          ++NumLoads;
+        }
+        Spills.ClobberPhysReg(NewPhysReg);
+        Spills.ClobberPhysReg(NewOp.PhysRegReused);
+
+        unsigned RReg = SubIdx ? TRI->getSubReg(NewPhysReg, SubIdx) :NewPhysReg;
+        MI->getOperand(NewOp.Operand).setReg(RReg);
+        MI->getOperand(NewOp.Operand).setSubReg(0);
+
+        Spills.addAvailable(NewOp.StackSlotOrReMat, NewPhysReg);
+        UpdateKills(*prior(InsertLoc), TRI, RegKills, KillOps);
+        DEBUG(dbgs() << '\t' << *prior(InsertLoc));
+
+        DEBUG(dbgs() << "Reuse undone!\n");
+        --NumReused;
+
+        // Finally, PhysReg is now available, go ahead and use it.
+        return PhysReg;
+      }
+    }
+  }
+  return PhysReg;
+}
+
+// ************************************************************************ //
+
+/// FoldsStackSlotModRef - Return true if the specified MI folds the specified
+/// stack slot mod/ref. It also checks if it's possible to unfold the
+/// instruction by having it define a specified physical register instead.
+static bool FoldsStackSlotModRef(MachineInstr &MI, int SS, unsigned PhysReg,
+                                 const TargetInstrInfo *TII,
+                                 const TargetRegisterInfo *TRI,
+                                 VirtRegMap &VRM) {
+  if (VRM.hasEmergencySpills(&MI) || VRM.isSpillPt(&MI))
+    return false;
+
+  bool Found = false;
+  VirtRegMap::MI2VirtMapTy::const_iterator I, End;
+  for (tie(I, End) = VRM.getFoldedVirts(&MI); I != End; ++I) {
+    unsigned VirtReg = I->second.first;
+    VirtRegMap::ModRef MR = I->second.second;
+    if (MR & VirtRegMap::isModRef)
+      if (VRM.getStackSlot(VirtReg) == SS) {
+        Found= TII->getOpcodeAfterMemoryUnfold(MI.getOpcode(), true, true) != 0;
+        break;
+      }
+  }
+  if (!Found)
+    return false;
+
+  // Does the instruction use a register that overlaps the scratch register?
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.getReg() == 0)
+      continue;
+    unsigned Reg = MO.getReg();
+    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+      if (!VRM.hasPhys(Reg))
+        continue;
+      Reg = VRM.getPhys(Reg);
+    }
+    if (TRI->regsOverlap(PhysReg, Reg))
+      return false;
+  }
+  return true;
+}
+
+/// FindFreeRegister - Find a free register of a given register class by looking
+/// at (at most) the last two machine instructions.
+static unsigned FindFreeRegister(MachineBasicBlock::iterator MII,
+                                 MachineBasicBlock &MBB,
+                                 const TargetRegisterClass *RC,
+                                 const TargetRegisterInfo *TRI,
+                                 BitVector &AllocatableRegs) {
+  BitVector Defs(TRI->getNumRegs());
+  BitVector Uses(TRI->getNumRegs());
+  SmallVector<unsigned, 4> LocalUses;
+  SmallVector<unsigned, 4> Kills;
+
+  // Take a look at 2 instructions at most.
+  unsigned Count = 0;
+  while (Count < 2) {
+    if (MII == MBB.begin())
+      break;
+    MachineInstr *PrevMI = prior(MII);
+    MII = PrevMI;
+
+    if (PrevMI->isDebugValue())
+      continue; // Skip over dbg_value instructions.
+    ++Count;
+
+    for (unsigned i = 0, e = PrevMI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = PrevMI->getOperand(i);
+      if (!MO.isReg() || MO.getReg() == 0)
+        continue;
+      unsigned Reg = MO.getReg();
+      if (MO.isDef()) {
+        Defs.set(Reg);
+        for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+          Defs.set(*AS);
+      } else  {
+        LocalUses.push_back(Reg);
+        if (MO.isKill() && AllocatableRegs[Reg])
+          Kills.push_back(Reg);
+      }
+    }
+
+    for (unsigned i = 0, e = Kills.size(); i != e; ++i) {
+      unsigned Kill = Kills[i];
+      if (!Defs[Kill] && !Uses[Kill] &&
+          RC->contains(Kill))
+        return Kill;
+    }
+    for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) {
+      unsigned Reg = LocalUses[i];
+      Uses.set(Reg);
+      for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+        Uses.set(*AS);
+    }
+  }
+
+  return 0;
+}
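+
+// Illustration (hypothetical sequence): if a preceding instruction is
+// "xorq %r12<kill>, %r13" and %r12 is allocatable, belongs to RC, and is
+// neither defined nor used between that point and MII, FindFreeRegister
+// returns %r12 as a scratch register.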
+
+static
+void AssignPhysToVirtReg(MachineInstr *MI, unsigned VirtReg, unsigned PhysReg,
+                         const TargetRegisterInfo &TRI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.getReg() == VirtReg)
+      substitutePhysReg(MO, PhysReg, TRI);
+  }
+}
+
+namespace {
+
+struct RefSorter {
+  bool operator()(const std::pair<MachineInstr*, int> &A,
+                  const std::pair<MachineInstr*, int> &B) {
+    return A.second < B.second;
+  }
+};
+
+// ***************************** //
+// Local Spiller Implementation  //
+// ***************************** //
+
+class LocalRewriter : public VirtRegRewriter {
+  MachineRegisterInfo *MRI;
+  const TargetRegisterInfo *TRI;
+  const TargetInstrInfo *TII;
+  VirtRegMap *VRM;
+  BitVector AllocatableRegs;
+  DenseMap<MachineInstr*, unsigned> DistanceMap;
+  DenseMap<int, SmallVector<MachineInstr*,4> > Slot2DbgValues;
+
+  MachineBasicBlock *MBB;       // Basic block currently being processed.
+
+public:
+
+  bool runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM,
+                            LiveIntervals* LIs);
+
+private:
+
+  bool OptimizeByUnfold2(unsigned VirtReg, int SS,
+                         MachineBasicBlock::iterator &MII,
+                         std::vector<MachineInstr*> &MaybeDeadStores,
+                         AvailableSpills &Spills,
+                         BitVector &RegKills,
+                         std::vector<MachineOperand*> &KillOps);
+
+  bool OptimizeByUnfold(MachineBasicBlock::iterator &MII,
+                        std::vector<MachineInstr*> &MaybeDeadStores,
+                        AvailableSpills &Spills,
+                        BitVector &RegKills,
+                        std::vector<MachineOperand*> &KillOps);
+
+  bool CommuteToFoldReload(MachineBasicBlock::iterator &MII,
+                           unsigned VirtReg, unsigned SrcReg, int SS,
+                           AvailableSpills &Spills,
+                           BitVector &RegKills,
+                           std::vector<MachineOperand*> &KillOps,
+                           const TargetRegisterInfo *TRI);
+
+  void SpillRegToStackSlot(MachineBasicBlock::iterator &MII,
+                           int Idx, unsigned PhysReg, int StackSlot,
+                           const TargetRegisterClass *RC,
+                           bool isAvailable, MachineInstr *&LastStore,
+                           AvailableSpills &Spills,
+                           SmallSet<MachineInstr*, 4> &ReMatDefs,
+                           BitVector &RegKills,
+                           std::vector<MachineOperand*> &KillOps);
+
+  void TransferDeadness(unsigned Reg, BitVector &RegKills,
+                        std::vector<MachineOperand*> &KillOps);
+
+  bool InsertEmergencySpills(MachineInstr *MI);
+
+  bool InsertRestores(MachineInstr *MI,
+                      AvailableSpills &Spills,
+                      BitVector &RegKills,
+                      std::vector<MachineOperand*> &KillOps);
+
+  bool InsertSpills(MachineInstr *MI);
+
+  void RewriteMBB(LiveIntervals *LIs,
+                  AvailableSpills &Spills, BitVector &RegKills,
+                  std::vector<MachineOperand*> &KillOps);
+};
+}
+
+bool LocalRewriter::runOnMachineFunction(MachineFunction &MF, VirtRegMap &vrm,
+                                         LiveIntervals* LIs) {
+  MRI = &MF.getRegInfo();
+  TRI = MF.getTarget().getRegisterInfo();
+  TII = MF.getTarget().getInstrInfo();
+  VRM = &vrm;
+  AllocatableRegs = TRI->getAllocatableSet(MF);
+  DEBUG(dbgs() << "\n**** Local spiller rewriting function '"
+        << MF.getFunction()->getName() << "':\n");
+  DEBUG(dbgs() << "**** Machine Instrs (NOTE! Does not include spills and"
+        " reloads!) ****\n");
+  DEBUG(MF.dump());
+
+  // Spills - Keep track of which spilled values are available in physregs
+  // so that we can choose to reuse the physregs instead of emitting
+  // reloads. This is usually refreshed per basic block.
+  AvailableSpills Spills(TRI, TII);
+
+  // Keep track of kill information.
+  BitVector RegKills(TRI->getNumRegs());
+  std::vector<MachineOperand*> KillOps;
+  KillOps.resize(TRI->getNumRegs(), NULL);
+
+  // SinglePredSuccs - Successor blocks which have a single predecessor.
+  SmallVector<MachineBasicBlock*, 4> SinglePredSuccs;
+  SmallPtrSet<MachineBasicBlock*,16> EarlyVisited;
+
+  // Traverse the basic blocks depth first.
+  MachineBasicBlock *Entry = MF.begin();
+  SmallPtrSet<MachineBasicBlock*,16> Visited;
+  for (df_ext_iterator<MachineBasicBlock*,
+         SmallPtrSet<MachineBasicBlock*,16> >
+         DFI = df_ext_begin(Entry, Visited), E = df_ext_end(Entry, Visited);
+       DFI != E; ++DFI) {
+    MBB = *DFI;
+    if (!EarlyVisited.count(MBB))
+      RewriteMBB(LIs, Spills, RegKills, KillOps);
+
+    // If this MBB is the only predecessor of a successor, keep the
+    // availability information and visit it next.
+    do {
+      // Keep visiting single predecessor successor as long as possible.
+      SinglePredSuccs.clear();
+      findSinglePredSuccessor(MBB, SinglePredSuccs);
+      if (SinglePredSuccs.empty())
+        MBB = 0;
+      else {
+        // FIXME: There may be more than one successor, each of which has
+        // MBB as its only predecessor.
+        MBB = SinglePredSuccs[0];
+        if (!Visited.count(MBB) && EarlyVisited.insert(MBB)) {
+          Spills.AddAvailableRegsToLiveIn(*MBB, RegKills, KillOps);
+          RewriteMBB(LIs, Spills, RegKills, KillOps);
+        }
+      }
+    } while (MBB);
+
+    // Clear the availability info.
+    Spills.clear();
+  }
+
+  DEBUG(dbgs() << "**** Post Machine Instrs ****\n");
+  DEBUG(MF.dump());
+
+  // Mark unused spill slots.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  int SS = VRM->getLowSpillSlot();
+  if (SS != VirtRegMap::NO_STACK_SLOT) {
+    for (int e = VRM->getHighSpillSlot(); SS <= e; ++SS) {
+      SmallVector<MachineInstr*, 4> &DbgValues = Slot2DbgValues[SS];
+      if (!VRM->isSpillSlotUsed(SS)) {
+        MFI->RemoveStackObject(SS);
+        for (unsigned j = 0, ee = DbgValues.size(); j != ee; ++j) {
+          MachineInstr *DVMI = DbgValues[j];
+          MachineBasicBlock *DVMBB = DVMI->getParent();
+          DEBUG(dbgs() << "Removing debug info referencing FI#" << SS << '\n');
+          VRM->RemoveMachineInstrFromMaps(DVMI);
+          DVMBB->erase(DVMI);
+        }
+        ++NumDSS;
+      }
+      DbgValues.clear();
+    }
+  }
+  Slot2DbgValues.clear();
+
+  return true;
+}
+
+/// OptimizeByUnfold2 - Unfold a series of load / store folding instructions if
+/// a scratch register is available.
+///     xorq  %r12<kill>, %r13
+///     addq  %rax, -184(%rbp)
+///     addq  %r13, -184(%rbp)
+/// ==>
+///     xorq  %r12<kill>, %r13
+///     movq  -184(%rbp), %r12
+///     addq  %rax, %r12
+///     addq  %r13, %r12
+///     movq  %r12, -184(%rbp)
+bool LocalRewriter::
+OptimizeByUnfold2(unsigned VirtReg, int SS,
+                  MachineBasicBlock::iterator &MII,
+                  std::vector<MachineInstr*> &MaybeDeadStores,
+                  AvailableSpills &Spills,
+                  BitVector &RegKills,
+                  std::vector<MachineOperand*> &KillOps) {
+
+  MachineBasicBlock::iterator NextMII = llvm::next(MII);
+  // Skip over dbg_value instructions.
+  while (NextMII != MBB->end() && NextMII->isDebugValue())
+    NextMII = llvm::next(NextMII);
+  if (NextMII == MBB->end())
+    return false;
+
+  if (TII->getOpcodeAfterMemoryUnfold(MII->getOpcode(), true, true) == 0)
+    return false;
+
+  // Now let's see if the last couple of instructions happen to have freed up
+  // a register.
+  const TargetRegisterClass* RC = MRI->getRegClass(VirtReg);
+  unsigned PhysReg = FindFreeRegister(MII, *MBB, RC, TRI, AllocatableRegs);
+  if (!PhysReg)
+    return false;
+
+  MachineFunction &MF = *MBB->getParent();
+  TRI = MF.getTarget().getRegisterInfo();
+  MachineInstr &MI = *MII;
+  if (!FoldsStackSlotModRef(MI, SS, PhysReg, TII, TRI, *VRM))
+    return false;
+
+  // If the next instruction also folds the same SS modref and can be unfolded,
+  // then it's worthwhile to issue a load from SS into the free register and
+  // then unfold these instructions.
+  if (!FoldsStackSlotModRef(*NextMII, SS, PhysReg, TII, TRI, *VRM))
+    return false;
+
+  // Back-schedule reloads and remats.
+  ComputeReloadLoc(MII, MBB->begin(), PhysReg, TRI, false, SS, TII, MF);
+
+  // Load from SS to the spare physical register.
+  TII->loadRegFromStackSlot(*MBB, MII, PhysReg, SS, RC, TRI);
+  // This invalidates PhysReg.
+  Spills.ClobberPhysReg(PhysReg);
+  // Remember it's available.
+  Spills.addAvailable(SS, PhysReg);
+  MaybeDeadStores[SS] = NULL;
+
+  // Unfold current MI.
+  SmallVector<MachineInstr*, 4> NewMIs;
+  if (!TII->unfoldMemoryOperand(MF, &MI, VirtReg, false, false, NewMIs))
+    llvm_unreachable("Unable unfold the load / store folding instruction!");
+  assert(NewMIs.size() == 1);
+  AssignPhysToVirtReg(NewMIs[0], VirtReg, PhysReg, *TRI);
+  VRM->transferRestorePts(&MI, NewMIs[0]);
+  MII = MBB->insert(MII, NewMIs[0]);
+  InvalidateKills(MI, TRI, RegKills, KillOps);
+  VRM->RemoveMachineInstrFromMaps(&MI);
+  MBB->erase(&MI);
+  ++NumModRefUnfold;
+
+  // Unfold next instructions that fold the same SS.
+  do {
+    MachineInstr &NextMI = *NextMII;
+    NextMII = llvm::next(NextMII);
+    NewMIs.clear();
+    if (!TII->unfoldMemoryOperand(MF, &NextMI, VirtReg, false, false, NewMIs))
+      llvm_unreachable("Unable unfold the load / store folding instruction!");
+    assert(NewMIs.size() == 1);
+    AssignPhysToVirtReg(NewMIs[0], VirtReg, PhysReg, *TRI);
+    VRM->transferRestorePts(&NextMI, NewMIs[0]);
+    MBB->insert(NextMII, NewMIs[0]);
+    InvalidateKills(NextMI, TRI, RegKills, KillOps);
+    VRM->RemoveMachineInstrFromMaps(&NextMI);
+    MBB->erase(&NextMI);
+    ++NumModRefUnfold;
+    // Skip over dbg_value instructions.
+    while (NextMII != MBB->end() && NextMII->isDebugValue())
+      NextMII = llvm::next(NextMII);
+    if (NextMII == MBB->end())
+      break;
+  } while (FoldsStackSlotModRef(*NextMII, SS, PhysReg, TII, TRI, *VRM));
+
+  // Store the value back into SS.
+  TII->storeRegToStackSlot(*MBB, NextMII, PhysReg, true, SS, RC, TRI);
+  MachineInstr *StoreMI = prior(NextMII);
+  VRM->addSpillSlotUse(SS, StoreMI);
+  VRM->virtFolded(VirtReg, StoreMI, VirtRegMap::isMod);
+
+  return true;
+}
+
+/// OptimizeByUnfold - Turn a store folding instruction into a load folding
+/// instruction. e.g.
+///     xorl  %edi, %eax
+///     movl  %eax, -32(%ebp)
+///     movl  -36(%ebp), %eax
+///     orl   %eax, -32(%ebp)
+/// ==>
+///     xorl  %edi, %eax
+///     orl   -36(%ebp), %eax
+///     mov   %eax, -32(%ebp)
+/// This enables unfolding optimization for a subsequent instruction which will
+/// also eliminate the newly introduced store instruction.
+bool LocalRewriter::
+OptimizeByUnfold(MachineBasicBlock::iterator &MII,
+                 std::vector<MachineInstr*> &MaybeDeadStores,
+                 AvailableSpills &Spills,
+                 BitVector &RegKills,
+                 std::vector<MachineOperand*> &KillOps) {
+  MachineFunction &MF = *MBB->getParent();
+  MachineInstr &MI = *MII;
+  unsigned UnfoldedOpc = 0;
+  unsigned UnfoldPR = 0;
+  unsigned UnfoldVR = 0;
+  int FoldedSS = VirtRegMap::NO_STACK_SLOT;
+  VirtRegMap::MI2VirtMapTy::const_iterator I, End;
+  for (tie(I, End) = VRM->getFoldedVirts(&MI); I != End; ) {
+    // Only transform a MI that folds a single register.
+    if (UnfoldedOpc)
+      return false;
+    UnfoldVR = I->second.first;
+    VirtRegMap::ModRef MR = I->second.second;
+    // MI2VirtMap can be updated, which invalidates the iterator.
+    // Increment the iterator first.
+    ++I;
+    if (VRM->isAssignedReg(UnfoldVR))
+      continue;
+    // If this reference is not a use, any previous store is now dead.
+    // Otherwise, the store to this stack slot is not dead anymore.
+    FoldedSS = VRM->getStackSlot(UnfoldVR);
+    MachineInstr* DeadStore = MaybeDeadStores[FoldedSS];
+    if (DeadStore && (MR & VirtRegMap::isModRef)) {
+      unsigned PhysReg = Spills.getSpillSlotOrReMatPhysReg(FoldedSS);
+      if (!PhysReg || !DeadStore->readsRegister(PhysReg))
+        continue;
+      UnfoldPR = PhysReg;
+      UnfoldedOpc = TII->getOpcodeAfterMemoryUnfold(MI.getOpcode(),
+                                                    false, true);
+    }
+  }
+
+  if (!UnfoldedOpc) {
+    if (!UnfoldVR)
+      return false;
+
+    // Look for other unfolding opportunities.
+    return OptimizeByUnfold2(UnfoldVR, FoldedSS, MII, MaybeDeadStores, Spills,
+                             RegKills, KillOps);
+  }
+
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.getReg() == 0 || !MO.isUse())
+      continue;
+    unsigned VirtReg = MO.getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(VirtReg) || MO.getSubReg())
+      continue;
+    if (VRM->isAssignedReg(VirtReg)) {
+      unsigned PhysReg = VRM->getPhys(VirtReg);
+      if (PhysReg && TRI->regsOverlap(PhysReg, UnfoldPR))
+        return false;
+    } else if (VRM->isReMaterialized(VirtReg))
+      continue;
+    int SS = VRM->getStackSlot(VirtReg);
+    unsigned PhysReg = Spills.getSpillSlotOrReMatPhysReg(SS);
+    if (PhysReg) {
+      if (TRI->regsOverlap(PhysReg, UnfoldPR))
+        return false;
+      continue;
+    }
+    if (VRM->hasPhys(VirtReg)) {
+      PhysReg = VRM->getPhys(VirtReg);
+      if (!TRI->regsOverlap(PhysReg, UnfoldPR))
+        continue;
+    }
+
+    // Ok, we'll need to reload the value into a register which makes
+    // it impossible to perform the store unfolding optimization later.
+    // Let's see if it is possible to fold the load if the store is
+    // unfolded. This allows us to perform the store unfolding
+    // optimization.
+    SmallVector<MachineInstr*, 4> NewMIs;
+    if (TII->unfoldMemoryOperand(MF, &MI, UnfoldVR, false, false, NewMIs)) {
+      assert(NewMIs.size() == 1);
+      MachineInstr *NewMI = NewMIs.back();
+      MBB->insert(MII, NewMI);
+      NewMIs.clear();
+      int Idx = NewMI->findRegisterUseOperandIdx(VirtReg, false);
+      assert(Idx != -1);
+      SmallVector<unsigned, 1> Ops;
+      Ops.push_back(Idx);
+      MachineInstr *FoldedMI = TII->foldMemoryOperand(NewMI, Ops, SS);
+      NewMI->eraseFromParent();
+      if (FoldedMI) {
+        VRM->addSpillSlotUse(SS, FoldedMI);
+        if (!VRM->hasPhys(UnfoldVR))
+          VRM->assignVirt2Phys(UnfoldVR, UnfoldPR);
+        VRM->virtFolded(VirtReg, FoldedMI, VirtRegMap::isRef);
+        MII = FoldedMI;
+        InvalidateKills(MI, TRI, RegKills, KillOps);
+        VRM->RemoveMachineInstrFromMaps(&MI);
+        MBB->erase(&MI);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+/// CommuteChangesDestination - We are looking for r0 = op r1, r2, where
+/// SrcReg is r1 and it is tied to r0. Return true if after commuting this
+/// instruction it will be r0 = op r2, r1.
+static bool CommuteChangesDestination(MachineInstr *DefMI,
+                                      const TargetInstrDesc &TID,
+                                      unsigned SrcReg,
+                                      const TargetInstrInfo *TII,
+                                      unsigned &DstIdx) {
+  if (TID.getNumDefs() != 1 && TID.getNumOperands() != 3)
+    return false;
+  if (!DefMI->getOperand(1).isReg() ||
+      DefMI->getOperand(1).getReg() != SrcReg)
+    return false;
+  unsigned DefIdx;
+  if (!DefMI->isRegTiedToDefOperand(1, &DefIdx) || DefIdx != 0)
+    return false;
+  unsigned SrcIdx1, SrcIdx2;
+  if (!TII->findCommutedOpIndices(DefMI, SrcIdx1, SrcIdx2))
+    return false;
+  if (SrcIdx1 == 1 && SrcIdx2 == 2) {
+    DstIdx = 2;
+    return true;
+  }
+  return false;
+}
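+
+// Illustration: given "r0 = op r1, r2" with r1 tied to r0, a successful
+// commute yields "r0 = op r2, r1"; DstIdx is set to 2, the pre-commute index
+// of r2, whose register becomes tied to the destination after the commute.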
+
+/// CommuteToFoldReload -
+/// Look for
+/// r1 = load fi#1
+/// r1 = op r1, r2<kill>
+/// store r1, fi#1
+///
+/// If op is commutable and r2 is killed, then we can xform these to
+/// r2 = op r2, fi#1
+/// store r2, fi#1
+bool LocalRewriter::
+CommuteToFoldReload(MachineBasicBlock::iterator &MII,
+                    unsigned VirtReg, unsigned SrcReg, int SS,
+                    AvailableSpills &Spills,
+                    BitVector &RegKills,
+                    std::vector<MachineOperand*> &KillOps,
+                    const TargetRegisterInfo *TRI) {
+  if (MII == MBB->begin() || !MII->killsRegister(SrcReg))
+    return false;
+
+  MachineInstr &MI = *MII;
+  MachineBasicBlock::iterator DefMII = prior(MII);
+  MachineInstr *DefMI = DefMII;
+  const TargetInstrDesc &TID = DefMI->getDesc();
+  unsigned NewDstIdx;
+  if (DefMII != MBB->begin() &&
+      TID.isCommutable() &&
+      CommuteChangesDestination(DefMI, TID, SrcReg, TII, NewDstIdx)) {
+    MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
+    unsigned NewReg = NewDstMO.getReg();
+    if (!NewDstMO.isKill() || TRI->regsOverlap(NewReg, SrcReg))
+      return false;
+    MachineInstr *ReloadMI = prior(DefMII);
+    int FrameIdx;
+    unsigned DestReg = TII->isLoadFromStackSlot(ReloadMI, FrameIdx);
+    if (DestReg != SrcReg || FrameIdx != SS)
+      return false;
+    int UseIdx = DefMI->findRegisterUseOperandIdx(DestReg, false);
+    if (UseIdx == -1)
+      return false;
+    unsigned DefIdx;
+    if (!MI.isRegTiedToDefOperand(UseIdx, &DefIdx))
+      return false;
+    assert(DefMI->getOperand(DefIdx).isReg() &&
+           DefMI->getOperand(DefIdx).getReg() == SrcReg);
+
+    // Now commute def instruction.
+    MachineInstr *CommutedMI = TII->commuteInstruction(DefMI, true);
+    if (!CommutedMI)
+      return false;
+    MBB->insert(MII, CommutedMI);
+    SmallVector<unsigned, 1> Ops;
+    Ops.push_back(NewDstIdx);
+    MachineInstr *FoldedMI = TII->foldMemoryOperand(CommutedMI, Ops, SS);
+    // Not needed since foldMemoryOperand returns new MI.
+    CommutedMI->eraseFromParent();
+    if (!FoldedMI)
+      return false;
+
+    VRM->addSpillSlotUse(SS, FoldedMI);
+    VRM->virtFolded(VirtReg, FoldedMI, VirtRegMap::isRef);
+    // Insert new def MI and spill MI.
+    const TargetRegisterClass* RC = MRI->getRegClass(VirtReg);
+    TII->storeRegToStackSlot(*MBB, &MI, NewReg, true, SS, RC, TRI);
+    MII = prior(MII);
+    MachineInstr *StoreMI = MII;
+    VRM->addSpillSlotUse(SS, StoreMI);
+    VRM->virtFolded(VirtReg, StoreMI, VirtRegMap::isMod);
+    MII = FoldedMI;  // Update MII to backtrack.
+
+    // Delete all 3 old instructions.
+    InvalidateKills(*ReloadMI, TRI, RegKills, KillOps);
+    VRM->RemoveMachineInstrFromMaps(ReloadMI);
+    MBB->erase(ReloadMI);
+    InvalidateKills(*DefMI, TRI, RegKills, KillOps);
+    VRM->RemoveMachineInstrFromMaps(DefMI);
+    MBB->erase(DefMI);
+    InvalidateKills(MI, TRI, RegKills, KillOps);
+    VRM->RemoveMachineInstrFromMaps(&MI);
+    MBB->erase(&MI);
+
+    // If NewReg was previously holding value of some SS, it's now clobbered.
+    // This has to be done now because it's a physical register. When this
+    // instruction is re-visited, it's ignored.
+    Spills.ClobberPhysReg(NewReg);
+
+    ++NumCommutes;
+    return true;
+  }
+
+  return false;
+}
+
+/// SpillRegToStackSlot - Spill a register to a specified stack slot. Check if
+/// the last store to the same slot is now dead. If so, remove the last store.
+void LocalRewriter::
+SpillRegToStackSlot(MachineBasicBlock::iterator &MII,
+                    int Idx, unsigned PhysReg, int StackSlot,
+                    const TargetRegisterClass *RC,
+                    bool isAvailable, MachineInstr *&LastStore,
+                    AvailableSpills &Spills,
+                    SmallSet<MachineInstr*, 4> &ReMatDefs,
+                    BitVector &RegKills,
+                    std::vector<MachineOperand*> &KillOps) {
+
+  MachineBasicBlock::iterator oldNextMII = llvm::next(MII);
+  TII->storeRegToStackSlot(*MBB, llvm::next(MII), PhysReg, true, StackSlot, RC,
+                           TRI);
+  MachineInstr *StoreMI = prior(oldNextMII);
+  VRM->addSpillSlotUse(StackSlot, StoreMI);
+  DEBUG(dbgs() << "Store:\t" << *StoreMI);
+
+  // If there is a dead store to this stack slot, nuke it now.
+  if (LastStore) {
+    DEBUG(dbgs() << "Removed dead store:\t" << *LastStore);
+    ++NumDSE;
+    SmallVector<unsigned, 2> KillRegs;
+    InvalidateKills(*LastStore, TRI, RegKills, KillOps, &KillRegs);
+    MachineBasicBlock::iterator PrevMII = LastStore;
+    bool CheckDef = PrevMII != MBB->begin();
+    if (CheckDef)
+      --PrevMII;
+    VRM->RemoveMachineInstrFromMaps(LastStore);
+    MBB->erase(LastStore);
+    if (CheckDef) {
+      // Look at defs of killed registers on the store. Mark the defs
+      // as dead since the store has been deleted and they aren't
+      // being reused.
+      for (unsigned j = 0, ee = KillRegs.size(); j != ee; ++j) {
+        bool HasOtherDef = false;
+        if (InvalidateRegDef(PrevMII, *MII, KillRegs[j], HasOtherDef, TRI)) {
+          MachineInstr *DeadDef = PrevMII;
+          if (ReMatDefs.count(DeadDef) && !HasOtherDef) {
+            // FIXME: This assumes a remat def does not have side effects.
+            VRM->RemoveMachineInstrFromMaps(DeadDef);
+            MBB->erase(DeadDef);
+            ++NumDRM;
+          }
+        }
+      }
+    }
+  }
+
+  // Allow for multi-instruction spill sequences, as on PPC Altivec.  Presume
+  // the last of multiple instructions is the actual store.
+  LastStore = prior(oldNextMII);
+
+  // If the stack slot value was previously available in some other
+  // register, change it now.  Otherwise, make the register available
+  // in PhysReg.
+  Spills.ModifyStackSlotOrReMat(StackSlot);
+  Spills.ClobberPhysReg(PhysReg);
+  Spills.addAvailable(StackSlot, PhysReg, isAvailable);
+  ++NumStores;
+}
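+
+// Illustration (hypothetical sequence): given
+//     store %eax, fi#3    <- LastStore, value never reloaded
+//     ...
+//     store %ecx, fi#3    <- store just emitted by SpillRegToStackSlot
+// the earlier store is dead, so it is erased and counted in NumDSE.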
+
+/// isSafeToDelete - Return true if this instruction doesn't produce any side
+/// effect and all of its defs are dead.
+static bool isSafeToDelete(MachineInstr &MI) {
+  const TargetInstrDesc &TID = MI.getDesc();
+  if (TID.mayLoad() || TID.mayStore() || TID.isCall() || TID.isTerminator() ||
+      TID.isBarrier() || TID.isReturn() || TID.hasUnmodeledSideEffects())
+    return false;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || !MO.getReg())
+      continue;
+    if (MO.isDef() && !MO.isDead())
+      return false;
+    if (MO.isUse() && MO.isKill())
+      // FIXME: We can't remove kill markers or else the scavenger will assert.
+      // An alternative is to add an ADD pseudo instruction to replace kill
+      // markers.
+      return false;
+  }
+  return true;
+}
+
+/// TransferDeadness - An identity copy definition is dead and it's being
+/// removed. Find the last def or use and mark it as dead / kill.
+void LocalRewriter::
+TransferDeadness(unsigned Reg, BitVector &RegKills,
+                 std::vector<MachineOperand*> &KillOps) {
+  SmallPtrSet<MachineInstr*, 4> Seens;
+  SmallVector<std::pair<MachineInstr*, int>,8> Refs;
+  for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(Reg),
+         RE = MRI->reg_end(); RI != RE; ++RI) {
+    MachineInstr *UDMI = &*RI;
+    if (UDMI->isDebugValue() || UDMI->getParent() != MBB)
+      continue;
+    DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UDMI);
+    if (DI == DistanceMap.end())
+      continue;
+    if (Seens.insert(UDMI))
+      Refs.push_back(std::make_pair(UDMI, DI->second));
+  }
+
+  if (Refs.empty())
+    return;
+  std::sort(Refs.begin(), Refs.end(), RefSorter());
+
+  while (!Refs.empty()) {
+    MachineInstr *LastUDMI = Refs.back().first;
+    Refs.pop_back();
+
+    MachineOperand *LastUD = NULL;
+    for (unsigned i = 0, e = LastUDMI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = LastUDMI->getOperand(i);
+      if (!MO.isReg() || MO.getReg() != Reg)
+        continue;
+      if (!LastUD || (LastUD->isUse() && MO.isDef()))
+        LastUD = &MO;
+      if (LastUDMI->isRegTiedToDefOperand(i))
+        break;
+    }
+    if (LastUD->isDef()) {
+      // If the instruction has no side effect, delete it and propagate
+      // backward further. Otherwise, mark it dead and we are done.
+      if (!isSafeToDelete(*LastUDMI)) {
+        LastUD->setIsDead();
+        break;
+      }
+      VRM->RemoveMachineInstrFromMaps(LastUDMI);
+      MBB->erase(LastUDMI);
+    } else {
+      LastUD->setIsKill();
+      RegKills.set(Reg);
+      KillOps[Reg] = LastUD;
+      break;
+    }
+  }
+}
+
+/// InsertEmergencySpills - Insert emergency spills before MI if requested by
+/// VRM. Return true if spills were inserted.
+bool LocalRewriter::InsertEmergencySpills(MachineInstr *MI) {
+  if (!VRM->hasEmergencySpills(MI))
+    return false;
+  MachineBasicBlock::iterator MII = MI;
+  SmallSet<int, 4> UsedSS;
+  std::vector<unsigned> &EmSpills = VRM->getEmergencySpills(MI);
+  for (unsigned i = 0, e = EmSpills.size(); i != e; ++i) {
+    unsigned PhysReg = EmSpills[i];
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
+    assert(RC && "Unable to determine register class!");
+    int SS = VRM->getEmergencySpillSlot(RC);
+    if (UsedSS.count(SS))
+      llvm_unreachable("Need to spill more than one physical registers!");
+    UsedSS.insert(SS);
+    TII->storeRegToStackSlot(*MBB, MII, PhysReg, true, SS, RC, TRI);
+    MachineInstr *StoreMI = prior(MII);
+    VRM->addSpillSlotUse(SS, StoreMI);
+
+    // Back-schedule reloads and remats.
+    MachineBasicBlock::iterator InsertLoc =
+      ComputeReloadLoc(llvm::next(MII), MBB->begin(), PhysReg, TRI, false, SS,
+                       TII, *MBB->getParent());
+
+    TII->loadRegFromStackSlot(*MBB, InsertLoc, PhysReg, SS, RC, TRI);
+
+    MachineInstr *LoadMI = prior(InsertLoc);
+    VRM->addSpillSlotUse(SS, LoadMI);
+    ++NumPSpills;
+    DistanceMap.insert(std::make_pair(LoadMI, DistanceMap.size()));
+  }
+  return true;
+}
+
+/// InsertRestores - Restore registers before MI if requested by VRM. Return
+/// true if any instructions were inserted.
+bool LocalRewriter::InsertRestores(MachineInstr *MI,
+                                   AvailableSpills &Spills,
+                                   BitVector &RegKills,
+                                   std::vector<MachineOperand*> &KillOps) {
+  if (!VRM->isRestorePt(MI))
+    return false;
+  MachineBasicBlock::iterator MII = MI;
+  std::vector<unsigned> &RestoreRegs = VRM->getRestorePtRestores(MI);
+  for (unsigned i = 0, e = RestoreRegs.size(); i != e; ++i) {
+    unsigned VirtReg = RestoreRegs[e-i-1];  // Reverse order.
+    if (!VRM->getPreSplitReg(VirtReg))
+      continue; // Split interval spilled again.
+    unsigned Phys = VRM->getPhys(VirtReg);
+    MRI->setPhysRegUsed(Phys);
+
+    // Check if the value being restored is available. If so, it must be
+    // from a predecessor BB that falls through into this BB. We do not
+    // expect:
+    // BB1:
+    // r1 = load fi#1
+    // ...
+    //    = r1<kill>
+    // ... # r1 not clobbered
+    // ...
+    //    = load fi#1
+    bool DoReMat = VRM->isReMaterialized(VirtReg);
+    int SSorRMId = DoReMat
+      ? VRM->getReMatId(VirtReg) : VRM->getStackSlot(VirtReg);
+    unsigned InReg = Spills.getSpillSlotOrReMatPhysReg(SSorRMId);
+    if (InReg == Phys) {
+      // If the value is already available in the expected register, save
+      // a reload / remat.
+      if (SSorRMId > VirtRegMap::MAX_STACK_SLOT)
+        DEBUG(dbgs() << "Reusing RM#"
+                     << SSorRMId-VirtRegMap::MAX_STACK_SLOT-1);
+      else
+        DEBUG(dbgs() << "Reusing SS#" << SSorRMId);
+      DEBUG(dbgs() << " from physreg "
+                   << TRI->getName(InReg) << " for vreg"
+                   << VirtReg <<" instead of reloading into physreg "
+                   << TRI->getName(Phys) << '\n');
+      ++NumOmitted;
+      continue;
+    } else if (InReg && InReg != Phys) {
+      if (DoReMat)
+        DEBUG(dbgs() << "Reusing RM#"
+                     << SSorRMId-VirtRegMap::MAX_STACK_SLOT-1);
+      else
+        DEBUG(dbgs() << "Reusing SS#" << SSorRMId);
+      DEBUG(dbgs() << " from physreg "
+                   << TRI->getName(InReg) << " for vreg"
+                   << VirtReg <<" by copying it into physreg "
+                   << TRI->getName(Phys) << '\n');
+
+      // If the reloaded / remat value is available in another register,
+      // copy it to the desired register.
+
+      // Back-schedule reloads and remats.
+      MachineBasicBlock::iterator InsertLoc =
+        ComputeReloadLoc(MII, MBB->begin(), Phys, TRI, DoReMat, SSorRMId, TII,
+                         *MBB->getParent());
+      MachineInstr *CopyMI = BuildMI(*MBB, InsertLoc, MI->getDebugLoc(),
+                                     TII->get(TargetOpcode::COPY), Phys)
+                               .addReg(InReg, RegState::Kill);
+
+      // This invalidates Phys.
+      Spills.ClobberPhysReg(Phys);
+      // Remember it's available.
+      Spills.addAvailable(SSorRMId, Phys);
+
+      CopyMI->setAsmPrinterFlag(MachineInstr::ReloadReuse);
+      UpdateKills(*CopyMI, TRI, RegKills, KillOps);
+
+      DEBUG(dbgs() << '\t' << *CopyMI);
+      ++NumCopified;
+      continue;
+    }
+
+    // Back-schedule reloads and remats.
+    MachineBasicBlock::iterator InsertLoc =
+      ComputeReloadLoc(MII, MBB->begin(), Phys, TRI, DoReMat, SSorRMId, TII,
+                       *MBB->getParent());
+
+    if (VRM->isReMaterialized(VirtReg)) {
+      ReMaterialize(*MBB, InsertLoc, Phys, VirtReg, TII, TRI, *VRM);
+    } else {
+      const TargetRegisterClass* RC = MRI->getRegClass(VirtReg);
+      TII->loadRegFromStackSlot(*MBB, InsertLoc, Phys, SSorRMId, RC, TRI);
+      MachineInstr *LoadMI = prior(InsertLoc);
+      VRM->addSpillSlotUse(SSorRMId, LoadMI);
+      ++NumLoads;
+      DistanceMap.insert(std::make_pair(LoadMI, DistanceMap.size()));
+    }
+
+    // This invalidates Phys.
+    Spills.ClobberPhysReg(Phys);
+    // Remember it's available.
+    Spills.addAvailable(SSorRMId, Phys);
+
+    UpdateKills(*prior(InsertLoc), TRI, RegKills, KillOps);
+    DEBUG(dbgs() << '\t' << *prior(InsertLoc));
+  }
+  return true;
+}
+
+/// InsertSpills - Insert spills after MI if requested by VRM. Return
+/// true if spills were inserted.
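+/// As a hypothetical sketch (fi#2, EAX and the x86-style opcode are
+/// illustrative, not from this code), a spill point for a split virtual
+/// register assigned to EAX and stack slot fi#2 becomes:
+///   MI ...
+///   MOV32mr fi#2, EAX    ; store inserted after MI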
+bool LocalRewriter::InsertSpills(MachineInstr *MI) {
+  if (!VRM->isSpillPt(MI))
+    return false;
+  MachineBasicBlock::iterator MII = MI;
+  std::vector<std::pair<unsigned,bool> > &SpillRegs =
+    VRM->getSpillPtSpills(MI);
+  for (unsigned i = 0, e = SpillRegs.size(); i != e; ++i) {
+    unsigned VirtReg = SpillRegs[i].first;
+    bool isKill = SpillRegs[i].second;
+    if (!VRM->getPreSplitReg(VirtReg))
+      continue; // Split interval spilled again.
+    const TargetRegisterClass *RC = MRI->getRegClass(VirtReg);
+    unsigned Phys = VRM->getPhys(VirtReg);
+    int StackSlot = VRM->getStackSlot(VirtReg);
+    MachineBasicBlock::iterator oldNextMII = llvm::next(MII);
+    TII->storeRegToStackSlot(*MBB, llvm::next(MII), Phys, isKill, StackSlot,
+                             RC, TRI);
+    MachineInstr *StoreMI = prior(oldNextMII);
+    VRM->addSpillSlotUse(StackSlot, StoreMI);
+    DEBUG(dbgs() << "Store:\t" << *StoreMI);
+    VRM->virtFolded(VirtReg, StoreMI, VirtRegMap::isMod);
+  }
+  return true;
+}
+
+/// RewriteMBB - Keep track of which spills are available even after the
+/// register allocator is done with them.  If possible, avoid reloading vregs.
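+///
+/// A sketch of the reuse this enables (hypothetical registers and slots):
+///   r1 = load fi#1       ; reload makes fi#1 available in r1
+///   ...                  ; r1 not clobbered
+///      = use fi#1        ; r1 is reused directly; no second reload emitted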
+void
+LocalRewriter::RewriteMBB(LiveIntervals *LIs,
+                          AvailableSpills &Spills, BitVector &RegKills,
+                          std::vector<MachineOperand*> &KillOps) {
+
+  DEBUG(dbgs() << "\n**** Local spiller rewriting MBB '"
+               << MBB->getName() << "':\n");
+
+  MachineFunction &MF = *MBB->getParent();
+
+  // MaybeDeadStores - When we need to write a value back into a stack slot,
+  // keep track of the inserted store.  If the stack slot value is never read
+  // (because the value was used from some available register, for example), and
+  // subsequently stored to, the original store is dead.  This map keeps track
+  // of inserted stores that are not used.  If we see a subsequent store to the
+  // same stack slot, the original store is deleted.
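+  // For instance (hypothetical sketch):
+  //   store fi#3 = r0     ; tracked in MaybeDeadStores[3]
+  //   ...                 ; fi#3 never read in between
+  //   store fi#3 = r1     ; first store is now provably dead and deleted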
+  std::vector<MachineInstr*> MaybeDeadStores;
+  MaybeDeadStores.resize(MF.getFrameInfo()->getObjectIndexEnd(), NULL);
+
+  // ReMatDefs - These are rematerializable def MIs which are not deleted.
+  SmallSet<MachineInstr*, 4> ReMatDefs;
+
+  // Clear kill info.
+  SmallSet<unsigned, 2> KilledMIRegs;
+
+  // Keep track of the registers we have already spilled in case there are
+  // multiple defs of the same register in MI.
+  SmallSet<unsigned, 8> SpilledMIRegs;
+
+  RegKills.reset();
+  KillOps.clear();
+  KillOps.resize(TRI->getNumRegs(), NULL);
+
+  DistanceMap.clear();
+  for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end();
+       MII != E; ) {
+    MachineBasicBlock::iterator NextMII = llvm::next(MII);
+
+    if (OptimizeByUnfold(MII, MaybeDeadStores, Spills, RegKills, KillOps))
+      NextMII = llvm::next(MII);
+
+    if (InsertEmergencySpills(MII))
+      NextMII = llvm::next(MII);
+
+    InsertRestores(MII, Spills, RegKills, KillOps);
+
+    if (InsertSpills(MII))
+      NextMII = llvm::next(MII);
+
+    bool Erased = false;
+    bool BackTracked = false;
+    MachineInstr &MI = *MII;
+
+    // Remember DbgValue's which reference stack slots.
+    if (MI.isDebugValue() && MI.getOperand(0).isFI())
+      Slot2DbgValues[MI.getOperand(0).getIndex()].push_back(&MI);
+
+    /// ReusedOperands - Keep track of operand reuse in case we need to undo
+    /// reuse.
+    ReuseInfo ReusedOperands(MI, TRI);
+    SmallVector<unsigned, 4> VirtUseOps;
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI.getOperand(i);
+      if (!MO.isReg() || MO.getReg() == 0)
+        continue;   // Ignore non-register operands.
+
+      unsigned VirtReg = MO.getReg();
+      if (TargetRegisterInfo::isPhysicalRegister(VirtReg)) {
+        // Ignore physregs for spilling, but remember that they are used by
+        // this function.
+        MRI->setPhysRegUsed(VirtReg);
+        continue;
+      }
+
+      // We want to process implicit virtual register uses first.
+      if (MO.isImplicit())
+        // If the virtual register is implicitly defined, emit an implicit_def
+        // before it so the scavenger knows it's "defined".
+        // FIXME: This is a horrible hack done by the register allocator to
+        // remat a definition with virtual register operand.
+        VirtUseOps.insert(VirtUseOps.begin(), i);
+      else
+        VirtUseOps.push_back(i);
+    }
+
+    // Process all of the spilled uses and all non-spilled reg references.
+    SmallVector<int, 2> PotentialDeadStoreSlots;
+    KilledMIRegs.clear();
+    for (unsigned j = 0, e = VirtUseOps.size(); j != e; ++j) {
+      unsigned i = VirtUseOps[j];
+      unsigned VirtReg = MI.getOperand(i).getReg();
+      assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+             "Not a virtual register?");
+
+      unsigned SubIdx = MI.getOperand(i).getSubReg();
+      if (VRM->isAssignedReg(VirtReg)) {
+        // This virtual register was assigned a physreg!
+        unsigned Phys = VRM->getPhys(VirtReg);
+        MRI->setPhysRegUsed(Phys);
+        if (MI.getOperand(i).isDef())
+          ReusedOperands.markClobbered(Phys);
+        substitutePhysReg(MI.getOperand(i), Phys, *TRI);
+        if (VRM->isImplicitlyDefined(VirtReg))
+          // FIXME: Is this needed?
+          BuildMI(*MBB, &MI, MI.getDebugLoc(),
+                  TII->get(TargetOpcode::IMPLICIT_DEF), Phys);
+        continue;
+      }
+
+      // This virtual register is now known to be a spilled value.
+      if (!MI.getOperand(i).isUse())
+        continue;  // Handle defs in the loop below (handle use&def here though)
+
+      bool AvoidReload = MI.getOperand(i).isUndef();
+      // Check if it is defined by an implicit def. It should not be spilled.
+      // Note, this is for correctness reasons, e.g.
+      // 8   %reg1024<def> = IMPLICIT_DEF
+      // 12  %reg1024<def> = INSERT_SUBREG %reg1024<kill>, %reg1025, 2
+      // The live range [12, 14) is not part of the r1024 live interval since
+      // it's defined by an implicit def. It will not conflict with the live
+      // interval of r1025. Now suppose both registers are spilled: you can
+      // easily see a situation where both registers are reloaded before
+      // the INSERT_SUBREG into target registers that overlap.
+      bool DoReMat = VRM->isReMaterialized(VirtReg);
+      int SSorRMId = DoReMat
+        ? VRM->getReMatId(VirtReg) : VRM->getStackSlot(VirtReg);
+      int ReuseSlot = SSorRMId;
+
+      // Check to see if this stack slot is available.
+      unsigned PhysReg = Spills.getSpillSlotOrReMatPhysReg(SSorRMId);
+
+      // If this is a sub-register use, make sure the reuse register is in the
+      // right register class. For example, for x86 not all of the 32-bit
+      // registers have accessible sub-registers.
+      // Similarly so for EXTRACT_SUBREG. Consider this:
+      // EDI = op
+      // MOV32_mr fi#1, EDI
+      // ...
+      //       = EXTRACT_SUBREG fi#1
+      // fi#1 is available in EDI, but it cannot be reused because it's not in
+      // the right register file.
+      if (PhysReg && !AvoidReload && SubIdx) {
+        const TargetRegisterClass* RC = MRI->getRegClass(VirtReg);
+        if (!RC->contains(PhysReg))
+          PhysReg = 0;
+      }
+
+      if (PhysReg && !AvoidReload) {
+        // This spilled operand might be part of a two-address operand.  If this
+        // is the case, then changing it will necessarily require changing the
+        // def part of the instruction as well.  However, in some cases, we
+        // aren't allowed to modify the reused register.  If none of these cases
+        // apply, reuse it.
+        bool CanReuse = true;
+        bool isTied = MI.isRegTiedToDefOperand(i);
+        if (isTied) {
+          // Okay, we have a two address operand.  We can reuse this physreg as
+          // long as we are allowed to clobber the value and there isn't an
+          // earlier def that has already clobbered the physreg.
+          CanReuse = !ReusedOperands.isClobbered(PhysReg) &&
+            Spills.canClobberPhysReg(PhysReg);
+        }
+
+        if (CanReuse) {
+          // If this stack slot value is already available, reuse it!
+          if (ReuseSlot > VirtRegMap::MAX_STACK_SLOT)
+            DEBUG(dbgs() << "Reusing RM#"
+                  << ReuseSlot-VirtRegMap::MAX_STACK_SLOT-1);
+          else
+            DEBUG(dbgs() << "Reusing SS#" << ReuseSlot);
+          DEBUG(dbgs() << " from physreg "
+                << TRI->getName(PhysReg) << " for vreg"
+                << VirtReg <<" instead of reloading into physreg "
+                << TRI->getName(VRM->getPhys(VirtReg)) << '\n');
+          unsigned RReg = SubIdx ? TRI->getSubReg(PhysReg, SubIdx) : PhysReg;
+          MI.getOperand(i).setReg(RReg);
+          MI.getOperand(i).setSubReg(0);
+
+          // The only technical wrinkle is that we don't know that
+          // PhysReg won't be clobbered by a reloaded stack slot that occurs
+          // later in the instruction.  In particular, consider 'op V1, V2'.
+          // If V1 is available in physreg R0, we would choose to reuse it
+          // here, instead of reloading it into the register the allocator
+          // indicated (say R1).  However, V2 might have to be reloaded
+          // later, and it might indicate that it needs to live in R0.  When
+          // this occurs, we need to have information available that
+          // indicates it is safe to use R1 for the reload instead of R0.
+          //
+          // To further complicate matters, we might conflict with an alias,
+          // or R0 and R1 might not be compatible with each other.  In this
+          // case, we actually insert a reload for V1 in R1, ensuring that
+          // we can get at R0 or its alias.
+          ReusedOperands.addReuse(i, ReuseSlot, PhysReg,
+                                  VRM->getPhys(VirtReg), VirtReg);
+          if (isTied)
+            // Only mark it clobbered if this is a use&def operand.
+            ReusedOperands.markClobbered(PhysReg);
+          ++NumReused;
+
+          if (MI.getOperand(i).isKill() &&
+              ReuseSlot <= VirtRegMap::MAX_STACK_SLOT) {
+
+            // The store of this spilled value is potentially dead, but we
+            // won't know for certain until we've confirmed that the re-use
+            // above is valid, which means waiting until the other operands
+            // are processed. For now we just track the spill slot, we'll
+            // remove it after the other operands are processed if valid.
+
+            PotentialDeadStoreSlots.push_back(ReuseSlot);
+          }
+
+          // Mark it isKill if there are no other uses of the same virtual
+          // register and it's not a two-address operand. IsKill will be
+          // unset if the reg is reused.
+          if (!isTied && KilledMIRegs.count(VirtReg) == 0) {
+            MI.getOperand(i).setIsKill();
+            KilledMIRegs.insert(VirtReg);
+          }
+
+          continue;
+        }  // CanReuse
+
+        // Otherwise we have a situation where we have a two-address instruction
+        // whose mod/ref operand needs to be reloaded.  This reload is already
+        // available in some register "PhysReg", but if we used PhysReg as the
+        // operand to our 2-addr instruction, the instruction would modify
+        // PhysReg.  This isn't cool if something later uses PhysReg and expects
+        // to get its initial value.
+        //
+        // To avoid this problem, and to avoid doing a load right after a store,
+        // we emit a copy from PhysReg into the designated register for this
+        // operand.
+        //
+        // This case also applies to an earlyclobber'd PhysReg.
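+        // Sketch with hypothetical registers: if the reloaded value lives in
+        // PhysReg = R1 but the two-address instruction "R0 = op R0, ..."
+        // would clobber it, we emit
+        //   R0 = COPY R1
+        // and rewrite the operand to R0, leaving R1's value intact.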
+        unsigned DesignatedReg = VRM->getPhys(VirtReg);
+        assert(DesignatedReg && "Must map virtreg to physreg!");
+
+        // Note that, if we reused a register for a previous operand, the
+        // register we want to reload into might not actually be
+        // available.  If this occurs, use the register indicated by the
+        // reuser.
+        if (ReusedOperands.hasReuses())
+          DesignatedReg = ReusedOperands.
+            GetRegForReload(VirtReg, DesignatedReg, &MI, Spills,
+                            MaybeDeadStores, RegKills, KillOps, *VRM);
+
+        // If the mapped designated register is actually the physreg we have
+        // incoming, we don't need to insert a dead copy.
+        if (DesignatedReg == PhysReg) {
+          // If this stack slot value is already available, reuse it!
+          if (ReuseSlot > VirtRegMap::MAX_STACK_SLOT)
+            DEBUG(dbgs() << "Reusing RM#"
+                  << ReuseSlot-VirtRegMap::MAX_STACK_SLOT-1);
+          else
+            DEBUG(dbgs() << "Reusing SS#" << ReuseSlot);
+          DEBUG(dbgs() << " from physreg " << TRI->getName(PhysReg)
+                << " for vreg" << VirtReg
+                << " instead of reloading into same physreg.\n");
+          unsigned RReg = SubIdx ? TRI->getSubReg(PhysReg, SubIdx) : PhysReg;
+          MI.getOperand(i).setReg(RReg);
+          MI.getOperand(i).setSubReg(0);
+          ReusedOperands.markClobbered(RReg);
+          ++NumReused;
+          continue;
+        }
+
+        MRI->setPhysRegUsed(DesignatedReg);
+        ReusedOperands.markClobbered(DesignatedReg);
+
+        // Back-schedule reloads and remats.
+        MachineBasicBlock::iterator InsertLoc =
+          ComputeReloadLoc(&MI, MBB->begin(), PhysReg, TRI, DoReMat,
+                           SSorRMId, TII, MF);
+        MachineInstr *CopyMI = BuildMI(*MBB, InsertLoc, MI.getDebugLoc(),
+                                       TII->get(TargetOpcode::COPY),
+                                       DesignatedReg).addReg(PhysReg);
+        CopyMI->setAsmPrinterFlag(MachineInstr::ReloadReuse);
+        UpdateKills(*CopyMI, TRI, RegKills, KillOps);
+
+        // This invalidates DesignatedReg.
+        Spills.ClobberPhysReg(DesignatedReg);
+
+        Spills.addAvailable(ReuseSlot, DesignatedReg);
+        unsigned RReg =
+          SubIdx ? TRI->getSubReg(DesignatedReg, SubIdx) : DesignatedReg;
+        MI.getOperand(i).setReg(RReg);
+        MI.getOperand(i).setSubReg(0);
+        DEBUG(dbgs() << '\t' << *CopyMI);
+        ++NumReused;
+        continue;
+      } // if (PhysReg)
+
+      // Otherwise, reload it and remember that we have it.
+      PhysReg = VRM->getPhys(VirtReg);
+      assert(PhysReg && "Must map virtreg to physreg!");
+
+      // Note that, if we reused a register for a previous operand, the
+      // register we want to reload into might not actually be
+      // available.  If this occurs, use the register indicated by the
+      // reuser.
+      if (ReusedOperands.hasReuses())
+        PhysReg = ReusedOperands.GetRegForReload(VirtReg, PhysReg, &MI,
+                    Spills, MaybeDeadStores, RegKills, KillOps, *VRM);
+
+      MRI->setPhysRegUsed(PhysReg);
+      ReusedOperands.markClobbered(PhysReg);
+      if (AvoidReload)
+        ++NumAvoided;
+      else {
+        // Back-schedule reloads and remats.
+        MachineBasicBlock::iterator InsertLoc =
+          ComputeReloadLoc(MII, MBB->begin(), PhysReg, TRI, DoReMat,
+                           SSorRMId, TII, MF);
+
+        if (DoReMat) {
+          ReMaterialize(*MBB, InsertLoc, PhysReg, VirtReg, TII, TRI, *VRM);
+        } else {
+          const TargetRegisterClass* RC = MRI->getRegClass(VirtReg);
+          TII->loadRegFromStackSlot(*MBB, InsertLoc, PhysReg, SSorRMId, RC,TRI);
+          MachineInstr *LoadMI = prior(InsertLoc);
+          VRM->addSpillSlotUse(SSorRMId, LoadMI);
+          ++NumLoads;
+          DistanceMap.insert(std::make_pair(LoadMI, DistanceMap.size()));
+        }
+        // This invalidates PhysReg.
+        Spills.ClobberPhysReg(PhysReg);
+
+        // Any stores to this stack slot are not dead anymore.
+        if (!DoReMat)
+          MaybeDeadStores[SSorRMId] = NULL;
+        Spills.addAvailable(SSorRMId, PhysReg);
+        // Assumes this is the last use. IsKill will be unset if reg is reused
+        // unless it's a two-address operand.
+        if (!MI.isRegTiedToDefOperand(i) &&
+            KilledMIRegs.count(VirtReg) == 0) {
+          MI.getOperand(i).setIsKill();
+          KilledMIRegs.insert(VirtReg);
+        }
+
+        UpdateKills(*prior(InsertLoc), TRI, RegKills, KillOps);
+        DEBUG(dbgs() << '\t' << *prior(InsertLoc));
+      }
+      unsigned RReg = SubIdx ? TRI->getSubReg(PhysReg, SubIdx) : PhysReg;
+      MI.getOperand(i).setReg(RReg);
+      MI.getOperand(i).setSubReg(0);
+    }
+
+    // Ok - now we can remove stores that have been confirmed dead.
+    for (unsigned j = 0, e = PotentialDeadStoreSlots.size(); j != e; ++j) {
+      // This was the last use and the spilled value is still available
+      // for reuse. That means the spill was unnecessary!
+      int PDSSlot = PotentialDeadStoreSlots[j];
+      MachineInstr* DeadStore = MaybeDeadStores[PDSSlot];
+      if (DeadStore) {
+        DEBUG(dbgs() << "Removed dead store:\t" << *DeadStore);
+        InvalidateKills(*DeadStore, TRI, RegKills, KillOps);
+        VRM->RemoveMachineInstrFromMaps(DeadStore);
+        MBB->erase(DeadStore);
+        MaybeDeadStores[PDSSlot] = NULL;
+        ++NumDSE;
+      }
+    }
+
+    DEBUG(dbgs() << '\t' << MI);
+
+    // If we have folded references to memory operands, make sure we clear all
+    // physical registers that may contain the value of the spilled virtual
+    // register.
+
+    // Copy the folded virts to a small vector, we may change MI2VirtMap.
+    SmallVector<std::pair<unsigned, VirtRegMap::ModRef>, 4> FoldedVirts;
+    // C++0x FTW!
+    for (std::pair<VirtRegMap::MI2VirtMapTy::const_iterator,
+                   VirtRegMap::MI2VirtMapTy::const_iterator> FVRange =
+           VRM->getFoldedVirts(&MI);
+         FVRange.first != FVRange.second; ++FVRange.first)
+      FoldedVirts.push_back(FVRange.first->second);
+
+    SmallSet<int, 2> FoldedSS;
+    for (unsigned FVI = 0, FVE = FoldedVirts.size(); FVI != FVE; ++FVI) {
+      unsigned VirtReg = FoldedVirts[FVI].first;
+      VirtRegMap::ModRef MR = FoldedVirts[FVI].second;
+      DEBUG(dbgs() << "Folded vreg: " << VirtReg << "  MR: " << MR);
+
+      int SS = VRM->getStackSlot(VirtReg);
+      if (SS == VirtRegMap::NO_STACK_SLOT)
+        continue;
+      FoldedSS.insert(SS);
+      DEBUG(dbgs() << " - StackSlot: " << SS << "\n");
+
+      // If this folded instruction is just a use, check to see if it's a
+      // straight load from the virt reg slot.
+      if ((MR & VirtRegMap::isRef) && !(MR & VirtRegMap::isMod)) {
+        int FrameIdx;
+        unsigned DestReg = TII->isLoadFromStackSlot(&MI, FrameIdx);
+        if (DestReg && FrameIdx == SS) {
+          // If this spill slot is available, turn it into a copy (or nothing)
+          // instead of leaving it as a load!
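+          // E.g. (hypothetical sketch): "r0 = load fi#1" becomes
+          // "r0 = COPY r1" when fi#1 is available in r1, or is deleted
+          // entirely when r1 == r0.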
+          if (unsigned InReg = Spills.getSpillSlotOrReMatPhysReg(SS)) {
+            DEBUG(dbgs() << "Promoted Load To Copy: " << MI);
+            if (DestReg != InReg) {
+              MachineOperand *DefMO = MI.findRegisterDefOperand(DestReg);
+              MachineInstr *CopyMI = BuildMI(*MBB, &MI, MI.getDebugLoc(),
+                                             TII->get(TargetOpcode::COPY))
+                .addReg(DestReg, RegState::Define, DefMO->getSubReg())
+                .addReg(InReg, RegState::Kill);
+              // Revisit the copy so we make sure to notice the effects of the
+              // operation on the destreg (either needing to RA it if it's
+              // virtual or needing to clobber any values if it's physical).
+              NextMII = CopyMI;
+              NextMII->setAsmPrinterFlag(MachineInstr::ReloadReuse);
+              BackTracked = true;
+            } else {
+              DEBUG(dbgs() << "Removing now-noop copy: " << MI);
+              // Unset last kill since it's being reused.
+              InvalidateKill(InReg, TRI, RegKills, KillOps);
+              Spills.disallowClobberPhysReg(InReg);
+            }
+
+            InvalidateKills(MI, TRI, RegKills, KillOps);
+            VRM->RemoveMachineInstrFromMaps(&MI);
+            MBB->erase(&MI);
+            Erased = true;
+            goto ProcessNextInst;
+          }
+        } else {
+          unsigned PhysReg = Spills.getSpillSlotOrReMatPhysReg(SS);
+          SmallVector<MachineInstr*, 4> NewMIs;
+          if (PhysReg &&
+              TII->unfoldMemoryOperand(MF, &MI, PhysReg, false, false, NewMIs)){
+            MBB->insert(MII, NewMIs[0]);
+            InvalidateKills(MI, TRI, RegKills, KillOps);
+            VRM->RemoveMachineInstrFromMaps(&MI);
+            MBB->erase(&MI);
+            Erased = true;
+            --NextMII;  // backtrack to the unfolded instruction.
+            BackTracked = true;
+            goto ProcessNextInst;
+          }
+        }
+      }
+
+      // If this reference is not a use, any previous store is now dead.
+      // Otherwise, the store to this stack slot is not dead anymore.
+      MachineInstr* DeadStore = MaybeDeadStores[SS];
+      if (DeadStore) {
+        bool isDead = !(MR & VirtRegMap::isRef);
+        MachineInstr *NewStore = NULL;
+        if (MR & VirtRegMap::isModRef) {
+          unsigned PhysReg = Spills.getSpillSlotOrReMatPhysReg(SS);
+          SmallVector<MachineInstr*, 4> NewMIs;
+          // We can reuse this physreg as long as we are allowed to clobber
+          // the value and there isn't an earlier def that has already clobbered
+          // the physreg.
+          if (PhysReg &&
+              !ReusedOperands.isClobbered(PhysReg) &&
+              Spills.canClobberPhysReg(PhysReg) &&
+              !TII->isStoreToStackSlot(&MI, SS)) { // Not profitable!
+            MachineOperand *KillOpnd =
+              DeadStore->findRegisterUseOperand(PhysReg, true);
+            // Note, if the store is storing a sub-register, it's possible the
+            // super-register is needed below.
+            if (KillOpnd && !KillOpnd->getSubReg() &&
+                TII->unfoldMemoryOperand(MF, &MI, PhysReg, false, true,NewMIs)){
+              MBB->insert(MII, NewMIs[0]);
+              NewStore = NewMIs[1];
+              MBB->insert(MII, NewStore);
+              VRM->addSpillSlotUse(SS, NewStore);
+              InvalidateKills(MI, TRI, RegKills, KillOps);
+              VRM->RemoveMachineInstrFromMaps(&MI);
+              MBB->erase(&MI);
+              Erased = true;
+              --NextMII;
+              --NextMII;  // backtrack to the unfolded instruction.
+              BackTracked = true;
+              isDead = true;
+              ++NumSUnfold;
+            }
+          }
+        }
+
+        if (isDead) {  // Previous store is dead.
+          // If we get here, the store is dead, nuke it now.
+          DEBUG(dbgs() << "Removed dead store:\t" << *DeadStore);
+          InvalidateKills(*DeadStore, TRI, RegKills, KillOps);
+          VRM->RemoveMachineInstrFromMaps(DeadStore);
+          MBB->erase(DeadStore);
+          if (!NewStore)
+            ++NumDSE;
+        }
+
+        MaybeDeadStores[SS] = NULL;
+        if (NewStore) {
+          // Treat this store as a spill merged into a copy. That makes the
+          // stack slot value available.
+          VRM->virtFolded(VirtReg, NewStore, VirtRegMap::isMod);
+          goto ProcessNextInst;
+        }
+      }
+
+      // If the spill slot value is available, and this is a new definition of
+      // the value, the value is not available anymore.
+      if (MR & VirtRegMap::isMod) {
+        // Notice that the value in this stack slot has been modified.
+        Spills.ModifyStackSlotOrReMat(SS);
+
+        // If this is *just* a mod of the value, check to see if this is just a
+        // store to the spill slot (i.e. the spill got merged into the copy). If
+        // so, realize that the vreg is available now, and add the store to the
+        // MaybeDeadStore info.
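+        // Sketch (hypothetical): "store fi#3 = r0" both modifies fi#3 and
+        // leaves its value available in r0, so later reads of fi#3 may
+        // reuse r0 until either the slot or the register is clobbered.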
+        int StackSlot;
+        if (!(MR & VirtRegMap::isRef)) {
+          if (unsigned SrcReg = TII->isStoreToStackSlot(&MI, StackSlot)) {
+            assert(TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+                   "Src hasn't been allocated yet?");
+
+            if (CommuteToFoldReload(MII, VirtReg, SrcReg, StackSlot,
+                                    Spills, RegKills, KillOps, TRI)) {
+              NextMII = llvm::next(MII);
+              BackTracked = true;
+              goto ProcessNextInst;
+            }
+
+            // Okay, this is certainly a store of SrcReg to [StackSlot].  Mark
+            // this as a potentially dead store in case there is a subsequent
+            // store into the stack slot without a read from it.
+            MaybeDeadStores[StackSlot] = &MI;
+
+            // If the stack slot value was previously available in some other
+            // register, change it now. Otherwise, make the value available
+            // in SrcReg.
+            Spills.addAvailable(StackSlot, SrcReg, MI.killsRegister(SrcReg));
+          }
+        }
+      }
+    }
+
+    // Process all of the spilled defs.
+    SpilledMIRegs.clear();
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI.getOperand(i);
+      if (!(MO.isReg() && MO.getReg() && MO.isDef()))
+        continue;
+
+      unsigned VirtReg = MO.getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(VirtReg)) {
+        // Check to see if this is a noop copy.  If so, eliminate the
+        // instruction before considering the dest reg to be changed.
+        // Also check if it's copying from an "undef"; if so, we can't
+        // eliminate this or else the undef marker is lost and it will
+        // confuse the scavenger. This is extremely rare.
+        if (MI.isIdentityCopy() && !MI.getOperand(1).isUndef() &&
+            MI.getNumOperands() == 2) {
+          ++NumDCE;
+          DEBUG(dbgs() << "Removing now-noop copy: " << MI);
+          SmallVector<unsigned, 2> KillRegs;
+          InvalidateKills(MI, TRI, RegKills, KillOps, &KillRegs);
+          if (MO.isDead() && !KillRegs.empty()) {
+            // Source register or an implicit super/sub-register use is killed.
+            assert(TRI->regsOverlap(KillRegs[0], MI.getOperand(0).getReg()));
+            // Last def is now dead.
+            TransferDeadness(MI.getOperand(1).getReg(), RegKills, KillOps);
+          }
+          VRM->RemoveMachineInstrFromMaps(&MI);
+          MBB->erase(&MI);
+          Erased = true;
+          Spills.disallowClobberPhysReg(VirtReg);
+          goto ProcessNextInst;
+        }
+
+        // If it's not a no-op copy, it clobbers the value in the destreg.
+        Spills.ClobberPhysReg(VirtReg);
+        ReusedOperands.markClobbered(VirtReg);
+
+        // Check to see if this instruction is a load from a stack slot into
+        // a register.  If so, this provides the stack slot value in the reg.
+        int FrameIdx;
+        if (unsigned DestReg = TII->isLoadFromStackSlot(&MI, FrameIdx)) {
+          assert(DestReg == VirtReg && "Unknown load situation!");
+
+          // If it is a folded reference, then it's not safe to clobber.
+          bool Folded = FoldedSS.count(FrameIdx);
+          // Otherwise, if it wasn't available, remember that it is now!
+          Spills.addAvailable(FrameIdx, DestReg, !Folded);
+          goto ProcessNextInst;
+        }
+
+        continue;
+      }
+
+      unsigned SubIdx = MO.getSubReg();
+      bool DoReMat = VRM->isReMaterialized(VirtReg);
+      if (DoReMat)
+        ReMatDefs.insert(&MI);
+
+      // The only vregs left are stack slot definitions.
+      int StackSlot = VRM->getStackSlot(VirtReg);
+      const TargetRegisterClass *RC = MRI->getRegClass(VirtReg);
+
+      // If this def is part of a two-address operand, make sure to execute
+      // the store from the correct physical register.
+      unsigned PhysReg;
+      unsigned TiedOp;
+      if (MI.isRegTiedToUseOperand(i, &TiedOp)) {
+        PhysReg = MI.getOperand(TiedOp).getReg();
+        if (SubIdx) {
+          unsigned SuperReg = findSuperReg(RC, PhysReg, SubIdx, TRI);
+          assert(SuperReg && TRI->getSubReg(SuperReg, SubIdx) == PhysReg &&
+                 "Can't find corresponding super-register!");
+          PhysReg = SuperReg;
+        }
+      } else {
+        PhysReg = VRM->getPhys(VirtReg);
+        if (ReusedOperands.isClobbered(PhysReg)) {
+          // Another def has taken the assigned physreg. It must have been a
+          // use&def which got it due to reuse. Undo the reuse!
+          PhysReg = ReusedOperands.GetRegForReload(VirtReg, PhysReg, &MI,
+                      Spills, MaybeDeadStores, RegKills, KillOps, *VRM);
+        }
+      }
+
+      assert(PhysReg && "VR not assigned a physical register?");
+      MRI->setPhysRegUsed(PhysReg);
+      unsigned RReg = SubIdx ? TRI->getSubReg(PhysReg, SubIdx) : PhysReg;
+      ReusedOperands.markClobbered(RReg);
+      MI.getOperand(i).setReg(RReg);
+      MI.getOperand(i).setSubReg(0);
+
+      if (!MO.isDead() && SpilledMIRegs.insert(VirtReg)) {
+        MachineInstr *&LastStore = MaybeDeadStores[StackSlot];
+        SpillRegToStackSlot(MII, -1, PhysReg, StackSlot, RC, true,
+          LastStore, Spills, ReMatDefs, RegKills, KillOps);
+        NextMII = llvm::next(MII);
+
+        // Check to see if this is a noop copy.  If so, eliminate the
+        // instruction before considering the dest reg to be changed.
+        if (MI.isIdentityCopy()) {
+          ++NumDCE;
+          DEBUG(dbgs() << "Removing now-noop copy: " << MI);
+          InvalidateKills(MI, TRI, RegKills, KillOps);
+          VRM->RemoveMachineInstrFromMaps(&MI);
+          MBB->erase(&MI);
+          Erased = true;
+          UpdateKills(*LastStore, TRI, RegKills, KillOps);
+          goto ProcessNextInst;
+        }
+      }
+    }
+    ProcessNextInst:
+    // Delete dead instructions without side effects.
+    if (!Erased && !BackTracked && isSafeToDelete(MI)) {
+      InvalidateKills(MI, TRI, RegKills, KillOps);
+      VRM->RemoveMachineInstrFromMaps(&MI);
+      MBB->erase(&MI);
+      Erased = true;
+    }
+    if (!Erased)
+      DistanceMap.insert(std::make_pair(&MI, DistanceMap.size()));
+    if (!Erased && !BackTracked) {
+      for (MachineBasicBlock::iterator II = &MI; II != NextMII; ++II)
+        UpdateKills(*II, TRI, RegKills, KillOps);
+    }
+    MII = NextMII;
+  }
+
+}
+
+llvm::VirtRegRewriter* llvm::createVirtRegRewriter() {
+  switch (RewriterOpt) {
+  default: llvm_unreachable("Unreachable!");
+  case local:
+    return new LocalRewriter();
+  case trivial:
+    return new TrivialRewriter();
+  }
+}
diff --git a/src/LLVM/lib/CodeGen/VirtRegRewriter.h b/src/LLVM/lib/CodeGen/VirtRegRewriter.h
new file mode 100644
index 0000000..93474e0
--- /dev/null
+++ b/src/LLVM/lib/CodeGen/VirtRegRewriter.h
@@ -0,0 +1,32 @@
+//===-- llvm/CodeGen/VirtRegRewriter.h - VirtRegRewriter -*- C++ -*--------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_VIRTREGREWRITER_H
+#define LLVM_CODEGEN_VIRTREGREWRITER_H
+
+namespace llvm {
+  class LiveIntervals;
+  class MachineFunction;
+  class VirtRegMap;
+  
+  /// VirtRegRewriter interface: Implementations of this interface assign
+  /// spilled virtual registers to stack slots, rewriting the code.
+  struct VirtRegRewriter {
+    virtual ~VirtRegRewriter();
+    virtual bool runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM,
+                                      LiveIntervals* LIs) = 0;
+  };
+
+  /// createVirtRegRewriter - Create and return a rewriter object, as specified
+  /// on the command line.
+  VirtRegRewriter* createVirtRegRewriter();
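+
+  /// A typical call sequence (illustrative sketch only; the caller owns the
+  /// returned object):
+  ///   VirtRegRewriter *RW = createVirtRegRewriter();
+  ///   bool Changed = RW->runOnMachineFunction(MF, VRM, LIs);
+  ///   delete RW;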
+
+}
+
+#endif