ARM: Add a post-RA pass to legalize stack offsets. Greedy approach (reserve IP).

Add a post-register-allocation, post-addProlog pass that walks
variables with stack offsets and legalizes them when the offsets are
not encodable. The basic approach is to reserve IP: movw/movt the
offset into IP, add/sub the frame/stack pointer into IP, and use IP
as the new base register instead of the frame/stack pointer. We do
some amount of CSE within a basic block, sharing the IP base pointer
as long as (a) it remains within range for later stack references,
and (b) IP hasn't been clobbered in between (e.g., by a function
call). I chose this greedy approach for both Om1 and O2, since it is
just a linear pass and it creates fewer variables/instructions than
the super-naive peephole approach (so it might be faster, too).
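
For illustration, the rewrite looks roughly like this (a sketch; the
exact offsets come from the new lit test below). Before, with an
offset that doesn't fit in ldr/str's imm12 field:

  str r0, [sp, #4212]   ; offset > 4095, not encodable

After legalization, with IP as the scratch base:

  movw ip, #4232
  add ip, sp, ip        ; IP is the new base, still in terms of SP
  str r0, [ip, #-20]    ; sp + 4212 == ip - 20

Later stack references in the block reuse IP until it goes out of
range or a call clobbers it, at which point the movw/add pair is
re-emitted.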

Introduce a test-only flag (--test-stack-extra) to artificially
bloat the stack frame so that spill offsets go out of range for ARM.
Use that flag in the cross tests to stress this new code harder than
a lit test + FileCheck alone would.
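
For example, the RUN line in the new lit test passes:

  --args -Om1 --skip-unimplemented --test-stack-extra 4096

so that even Om1's modest frames are pushed past the +/- 4095
immediate range of ldr/str.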

Also, the previous version of emitVariable() used the Var's type to
determine the legal offset range (only +/- 255 for i16, vs. +/- 4095
for i32), even though mov's emit() always uses a full 32-bit "ldr"
instead of a 16-bit "ldrh". Switch to a common legality check that
uses the stackSlotType instead of the Var's type. The old check made
test_bitmanip spuriously complain, even though the Om1 offsets were
"only" in the 300-byte range. With this fixed, we can enable the
test_bitmanip test too.
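
A minimal sketch of that common check (hypothetical names, not the
exact Subzero code), keyed on the type used for the slot's actual
load/store rather than on the Var's declared type:

  // ARM load/store immediate ranges:
  //   ldr/str   take a full imm12 -> offsets in [-4095, 4095]
  //   ldrh/strh take a split imm8 -> offsets in [-255, 255]
  static bool isLegalMemOffset(Type Ty, int32_t Offset) {
    const int32_t Limit = (Ty == IceType_i16) ? 255 : 4095;
    return -Limit <= Offset && Offset <= Limit;
  }

Calling it with the stackSlotType (i32) rather than the Var's type
means an i16 Var spilled with a 32-bit "ldr" correctly gets the full
+/- 4095 range.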

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1241763002 .
diff --git a/tests_lit/llvm2ice_tests/large_stack_offs.ll b/tests_lit/llvm2ice_tests/large_stack_offs.ll
new file mode 100644
index 0000000..6fba4df
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/large_stack_offs.ll
@@ -0,0 +1,133 @@
+; This tries to create variables with very large stack offsets, which
+; requires a lot of variables/register pressure. To keep this simple, we
+; rely on Om1's poor register allocation, plus a flag that forces the
+; frame to include K extra bytes of unused stack for testing.
+; We only need to test ARM and other architectures that have limited
+; room for specifying an offset within an instruction.
+
+; RUN: %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
+; RUN:   -i %s --args -Om1 --skip-unimplemented --test-stack-extra 4096 \
+; RUN:   | %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command FileCheck --check-prefix ARM32 %s
+
+declare i64 @dummy(i32 %t1, i32 %t2, i32 %t3, i64 %t4, i64 %t5)
+
+; Test a function that requires lots of stack (due to the test flag), and
+; uses SP as the base register (originally).
+define internal i64 @lotsOfStack(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+  %t1 = xor i32 %a, %b
+  %t2 = or i32 %c, %d
+  %cmp = icmp eq i32 %t1, %t2
+  br i1 %cmp, label %br_1, label %br_2
+
+br_1:
+  %x1 = zext i32 %t1 to i64
+  %y1 = ashr i64 %x1, 17
+  ; Use some stack during the call, so that references to %t1 and %t2's
+  ; stack slots require stack adjustment.
+  %r1 = call i64 @dummy(i32 123, i32 321, i32 %t2, i64 %x1, i64 %y1)
+  %z1 = sub i64 %r1, %y1
+  br label %end
+
+br_2:
+  %x2 = zext i32 %t2 to i64
+  %y2 = and i64 %x2, 123
+  %r2 = call i64 @dummy(i32 123, i32 321, i32 %t2, i64 %x2, i64 %y2)
+  %z2 = and i64 %r2, %y2
+  br label %end
+
+end:
+  %x3 = phi i64 [ %x1, %br_1 ], [ %x2, %br_2 ]
+  %z3 = phi i64 [ %z1, %br_1 ], [ %z2, %br_2 ]
+  %r3 = and i64 %x3, %z3
+  ret i64 %r3
+}
+; ARM32-LABEL: lotsOfStack
+; ARM32-NOT: mov fp, sp
+; ARM32: movw ip, #4{{.*}}
+; ARM32-NEXT: sub sp, sp, ip
+; ARM32: movw ip, #4232
+; ARM32-NEXT: add ip, sp, ip
+; ARM32-NOT: movw ip
+; %t2 is the result of the "or", and %t2 will be passed via r2 to the call.
+; Use that to check the stack offset of %t2. The first offset and the
+; later offset right before the call should be 16 bytes apart,
+; because of the "sub sp, sp, #16".
+; ARM32: orr [[REG:r.*]], {{.*}},
+; I.e., the slot for t2 is (sp0 + 4232 - 20) == sp0 + 4212.
+; ARM32: str [[REG]], [ip, #-20]
+; ARM32: b {{[a-f0-9]+}}
+; Now skip ahead to where the call in br_1 begins, to check how %t2 is used.
+; ARM32: movw ip, #4216
+; ARM32-NEXT: add ip, sp, ip
+; ARM32: sub sp, sp, #16
+; Now sp1 = sp0 - 16, but ip is still in terms of sp0.
+; So, sp0 + 4212 == ip - 4.
+; ARM32: ldr r2, [ip, #-4]
+; ARM32: bl {{.*}} dummy
+; ARM32: add sp, sp
+; The call clobbers ip, so we need to re-create the base register.
+; ARM32: movw ip, #4{{.*}}
+; ARM32: b {{[a-f0-9]+}}
+; ARM32: bl {{.*}} dummy
+
+; Similar, but test a function that uses FP as the base register (originally).
+define internal i64 @usesFrameReg(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+  %p = alloca i8, i32 %d, align 4
+  %t1 = xor i32 %a, %b
+  %t2 = or i32 %c, %d
+  %cmp = icmp eq i32 %t1, %t2
+  br i1 %cmp, label %br_1, label %br_2
+
+br_1:
+  %x1 = zext i32 %t1 to i64
+  %y1 = ashr i64 %x1, 17
+  %p32 = ptrtoint i8* %p to i32
+  %r1 = call i64 @dummy(i32 %p32, i32 321, i32 %t2, i64 %x1, i64 %y1)
+  %z1 = sub i64 %r1, %y1
+  br label %end
+
+br_2:
+  %x2 = zext i32 %t2 to i64
+  %y2 = and i64 %x2, 123
+  %r2 = call i64 @dummy(i32 123, i32 321, i32 %d, i64 %x2, i64 %y2)
+  %z2 = and i64 %r2, %y2
+  br label %end
+
+end:
+  %x3 = phi i64 [ %x1, %br_1 ], [ %x2, %br_2 ]
+  %z3 = phi i64 [ %z1, %br_1 ], [ %z2, %br_2 ]
+  %r3 = and i64 %x3, %z3
+  ret i64 %r3
+}
+; ARM32-LABEL: usesFrameReg
+; ARM32: mov fp, sp
+; ARM32: movw ip, #4{{.*}}
+; ARM32-NEXT: sub sp, sp, ip
+; ARM32: movw ip, #4100
+; ARM32-NEXT: sub ip, fp, ip
+; ARM32-NOT: movw ip
+; %t2 is the result of the "or", and %t2 will be passed via r2 to the call.
+; Use that to check the stack offset of %t2. It should be the same offset
+; even after "sub sp, sp, #16", because the base register was originally
+; the FP and not the SP.
+; ARM32: orr [[REG:r.*]], {{.*}},
+; I.e., the slot for t2 is (fp0 - 4100 - 24) == fp0 - 4124.
+; ARM32: str [[REG]], [ip, #-24]
+; ARM32: b {{[a-f0-9]+}}
+; Now skip ahead to where the call in br_1 begins, to check how %t2 is used.
+; ARM32: movw ip, #4120
+; ARM32-NEXT: sub ip, fp, ip
+; ARM32: sub sp, sp, #16
+; Now sp1 = sp0 - 16, but ip is still in terms of fp0.
+; So, fp0 - 4124 == ip - 4.
+; ARM32: ldr r2, [ip, #-4]
+; ARM32: bl {{.*}} dummy
+; ARM32: add sp, sp
+; The call clobbers ip, so we need to re-create the base register.
+; ARM32: movw ip, #4{{.*}}
+; ARM32: b {{[a-f0-9]+}}
+; ARM32: bl {{.*}} dummy