Subzero: Emit functions and global initializers in a separate thread.

(This is a continuation of https://codereview.chromium.org/876083007/ .)

Emission is done in a separate thread when -threads=N with N>0 is specified.  This includes both functions and global initializers.

Emission is deterministic.  The parser assigns sequence numbers, and the emitter thread reassembles work units into their original order, regardless of the number of threads.

Dump output, however, is not intended to be emitted in deterministic, reassembled order.  As such, lit tests that test dump output (i.e., '-verbose inst') are explicitly run with -threads=0.

For -elf-writer and -ias=1, the translator thread invokes Cfg::emitIAS() and the assembler buffer is passed to the emitter thread.  For -ias=0, the translator thread passes the Cfg to the emitter thread, which then invokes Cfg::emit() to produce the textual asm.

Minor cleanup along the way:
  * Removed Flags from the Ice::Translator object and ctor, since it was redundant with Ctx->getFlags().
  * Cfg::getAssembler<> is the same as Cfg::getAssembler<Assembler> and is useful for just passing the assembler around.
  * Removed the redundant Ctx argument from TargetDataLowering::lowerConstants().

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4075
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/916653004
diff --git a/tests_lit/llvm2ice_tests/arith-opt.ll b/tests_lit/llvm2ice_tests/arith-opt.ll
index 1606916..5ec805d 100644
--- a/tests_lit/llvm2ice_tests/arith-opt.ll
+++ b/tests_lit/llvm2ice_tests/arith-opt.ll
@@ -3,7 +3,7 @@
 
 ; REQUIRES: allow_dump
 
-; RUN: %p2i -i %s --args --verbose inst -ias=0 | FileCheck %s
+; RUN: %p2i -i %s --args --verbose inst -threads=0 -ias=0 | FileCheck %s
 
 define i32 @Add(i32 %a, i32 %b) {
 ; CHECK: define i32 @Add
diff --git a/tests_lit/llvm2ice_tests/branch-simple.ll b/tests_lit/llvm2ice_tests/branch-simple.ll
index 3e763e5..5f821d8 100644
--- a/tests_lit/llvm2ice_tests/branch-simple.ll
+++ b/tests_lit/llvm2ice_tests/branch-simple.ll
@@ -6,8 +6,8 @@
 
 ; REQUIRES: allow_dump
 
-; RUN: %p2i -i %s --args -O2 --verbose inst | FileCheck %s
-; RUN: %p2i -i %s --args -Om1 --verbose inst | FileCheck %s
+; RUN: %p2i -i %s --args -O2 --verbose inst -threads=0 | FileCheck %s
+; RUN: %p2i -i %s --args -Om1 --verbose inst -threads=0 | FileCheck %s
 
 define i32 @simple_cond_branch(i32 %foo, i32 %bar) {
 entry:
diff --git a/tests_lit/llvm2ice_tests/globalrelocs.ll b/tests_lit/llvm2ice_tests/globalrelocs.ll
index 3184022..41fa66d 100644
--- a/tests_lit/llvm2ice_tests/globalrelocs.ll
+++ b/tests_lit/llvm2ice_tests/globalrelocs.ll
@@ -3,13 +3,19 @@
 ; REQUIRES: allow_dump
 
 ; Test that we handle it in the ICE converter.
-; RUN: %lc2i -i %s --args -verbose inst | %iflc FileCheck %s
+; RUN: %lc2i -i %s --args -verbose inst -threads=0 \
+; RUN:     | %iflc FileCheck %s
+; RUN: %lc2i -i %s --args -verbose inst -threads=0 \
+; RUN:     | %iflc FileCheck --check-prefix=DUMP %s
 
 ; Test that we handle it using Subzero's bitcode reader.
-; RUN: %p2i -i %s --args -verbose inst | FileCheck %s
+; RUN: %p2i -i %s --args -verbose inst -threads=0 \
+; RUN:     | FileCheck %s
+; RUN: %p2i -i %s --args -verbose inst -threads=0 \
+; RUN:     | FileCheck --check-prefix=DUMP %s
 
 @bytes = internal global [7 x i8] c"abcdefg"
-; CHECK: @bytes = internal global [7 x i8] c"abcdefg"
+; DUMP: @bytes = internal global [7 x i8] c"abcdefg"
 ; CHECK:	.type	bytes,@object
 ; CHECK:	.section	.data,"aw",@progbits
 ; CHECK:bytes:
@@ -23,7 +29,7 @@
 ; CHECK:	.size	bytes, 7
 
 @const_bytes = internal constant [7 x i8] c"abcdefg"
-; CHECK: @const_bytes = internal constant [7 x i8] c"abcdefg"
+; DUMP: @const_bytes = internal constant [7 x i8] c"abcdefg"
 ; CHECK:	.type	const_bytes,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:const_bytes:
@@ -37,7 +43,7 @@
 ; CHECK:	.size	const_bytes, 7
 
 @ptr_to_ptr = internal global i32 ptrtoint (i32* @ptr to i32)
-; CHECK: @ptr_to_ptr = internal global i32 ptrtoint (i32* @ptr to i32)
+; DUMP: @ptr_to_ptr = internal global i32 ptrtoint (i32* @ptr to i32)
 ; CHECK:	.type	ptr_to_ptr,@object
 ; CHECK:	.section	.data,"aw",@progbits
 ; CHECK:ptr_to_ptr:
@@ -45,7 +51,7 @@
 ; CHECK:	.size	ptr_to_ptr, 4
 
 @const_ptr_to_ptr = internal constant i32 ptrtoint (i32* @ptr to i32)
-; CHECK: @const_ptr_to_ptr = internal constant i32 ptrtoint (i32* @ptr to i32)
+; DUMP: @const_ptr_to_ptr = internal constant i32 ptrtoint (i32* @ptr to i32)
 ; CHECK:	.type	const_ptr_to_ptr,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:const_ptr_to_ptr:
@@ -53,7 +59,7 @@
 ; CHECK:	.size	const_ptr_to_ptr, 4
 
 @ptr_to_func = internal global i32 ptrtoint (void ()* @func to i32)
-; CHECK: @ptr_to_func = internal global i32 ptrtoint (void ()* @func to i32)
+; DUMP: @ptr_to_func = internal global i32 ptrtoint (void ()* @func to i32)
 ; CHECK:	.type	ptr_to_func,@object
 ; CHECK:	.section	.data,"aw",@progbits
 ; CHECK:ptr_to_func:
@@ -61,7 +67,7 @@
 ; CHECK:	.size	ptr_to_func, 4
 
 @const_ptr_to_func = internal constant i32 ptrtoint (void ()* @func to i32)
-; CHECK: @const_ptr_to_func = internal constant i32 ptrtoint (void ()* @func to i32)
+; DUMP: @const_ptr_to_func = internal constant i32 ptrtoint (void ()* @func to i32)
 ; CHECK:	.type	const_ptr_to_func,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:const_ptr_to_func:
@@ -69,7 +75,7 @@
 ; CHECK:	.size	const_ptr_to_func, 4
 
 @compound = internal global <{ [3 x i8], i32 }> <{ [3 x i8] c"foo", i32 ptrtoint (void ()* @func to i32) }>
-; CHECK: @compound = internal global <{ [3 x i8], i32 }> <{ [3 x i8] c"foo", i32 ptrtoint (void ()* @func to i32) }>
+; DUMP: @compound = internal global <{ [3 x i8], i32 }> <{ [3 x i8] c"foo", i32 ptrtoint (void ()* @func to i32) }>
 ; CHECK:	.type	compound,@object
 ; CHECK:	.section	.data,"aw",@progbits
 ; CHECK:compound:
@@ -80,7 +86,7 @@
 ; CHECK:	.size	compound, 7
 
 @const_compound = internal constant <{ [3 x i8], i32 }> <{ [3 x i8] c"foo", i32 ptrtoint (void ()* @func to i32) }>
-; CHECK: @const_compound = internal constant <{ [3 x i8], i32 }> <{ [3 x i8] c"foo", i32 ptrtoint (void ()* @func to i32) }>
+; DUMP: @const_compound = internal constant <{ [3 x i8], i32 }> <{ [3 x i8] c"foo", i32 ptrtoint (void ()* @func to i32) }>
 ; CHECK:	.type	const_compound,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:const_compound:
@@ -91,7 +97,7 @@
 ; CHECK:	.size	const_compound, 7
 
 @ptr = internal global i32 ptrtoint ([7 x i8]* @bytes to i32)
-; CHECK: @ptr = internal global i32 ptrtoint ([7 x i8]* @bytes to i32)
+; DUMP: @ptr = internal global i32 ptrtoint ([7 x i8]* @bytes to i32)
 ; CHECK:	.type	ptr,@object
 ; CHECK:	.section	.data,"aw",@progbits
 ; CHECK:ptr:
@@ -99,7 +105,7 @@
 ; CHECK:	.size	ptr, 4
 
 @const_ptr = internal constant i32 ptrtoint ([7 x i8]* @bytes to i32)
-; CHECK: @const_ptr = internal constant i32 ptrtoint ([7 x i8]* @bytes to i32)
+; DUMP: @const_ptr = internal constant i32 ptrtoint ([7 x i8]* @bytes to i32)
 ; CHECK:	.type	const_ptr,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:const_ptr:
@@ -107,7 +113,7 @@
 ; CHECK:	.size	const_ptr, 4
 
 @addend_ptr = internal global i32 add (i32 ptrtoint (i32* @ptr to i32), i32 1)
-; CHECK: @addend_ptr = internal global i32 add (i32 ptrtoint (i32* @ptr to i32), i32 1)
+; DUMP: @addend_ptr = internal global i32 add (i32 ptrtoint (i32* @ptr to i32), i32 1)
 ; CHECK:	.type	addend_ptr,@object
 ; CHECK:	.section	.data,"aw",@progbits
 ; CHECK:addend_ptr:
@@ -115,7 +121,7 @@
 ; CHECK:	.size	addend_ptr, 4
 
 @const_addend_ptr = internal constant i32 add (i32 ptrtoint (i32* @ptr to i32), i32 1)
-; CHECK: @const_addend_ptr = internal constant i32 add (i32 ptrtoint (i32* @ptr to i32), i32 1)
+; DUMP: @const_addend_ptr = internal constant i32 add (i32 ptrtoint (i32* @ptr to i32), i32 1)
 ; CHECK:	.type	const_addend_ptr,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:const_addend_ptr:
@@ -123,7 +129,7 @@
 ; CHECK:	.size	const_addend_ptr, 4
 
 @addend_negative = internal global i32 add (i32 ptrtoint (i32* @ptr to i32), i32 -1)
-; CHECK: @addend_negative = internal global i32 add (i32 ptrtoint (i32* @ptr to i32), i32 -1)
+; DUMP: @addend_negative = internal global i32 add (i32 ptrtoint (i32* @ptr to i32), i32 -1)
 ; CHECK:	.type	addend_negative,@object
 ; CHECK:	.section	.data,"aw",@progbits
 ; CHECK:addend_negative:
@@ -131,7 +137,7 @@
 ; CHECK:	.size	addend_negative, 4
 
 @const_addend_negative = internal constant i32 add (i32 ptrtoint (i32* @ptr to i32), i32 -1)
-; CHECK: @const_addend_negative = internal constant i32 add (i32 ptrtoint (i32* @ptr to i32), i32 -1)
+; DUMP: @const_addend_negative = internal constant i32 add (i32 ptrtoint (i32* @ptr to i32), i32 -1)
 ; CHECK:	.type	const_addend_negative,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:const_addend_negative:
@@ -139,7 +145,7 @@
 ; CHECK:	.size	const_addend_negative, 4
 
 @addend_array1 = internal global i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 1)
-; CHECK: @addend_array1 = internal global i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 1)
+; DUMP: @addend_array1 = internal global i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 1)
 ; CHECK:	.type	addend_array1,@object
 ; CHECK:	.section	.data,"aw",@progbits
 ; CHECK:addend_array1:
@@ -147,7 +153,7 @@
 ; CHECK:	.size	addend_array1, 4
 
 @const_addend_array1 = internal constant i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 1)
-; CHECK: @const_addend_array1 = internal constant i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 1)
+; DUMP: @const_addend_array1 = internal constant i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 1)
 ; CHECK:	.type	const_addend_array1,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:const_addend_array1:
@@ -155,7 +161,7 @@
 ; CHECK:	.size	const_addend_array1, 4
 
 @addend_array2 = internal global i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 7)
-; CHECK: @addend_array2 = internal global i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 7)
+; DUMP: @addend_array2 = internal global i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 7)
 ; CHECK:	.type	addend_array2,@object
 ; CHECK:	.section	.data,"aw",@progbits
 ; CHECK:addend_array2:
@@ -163,7 +169,7 @@
 ; CHECK:	.size	addend_array2, 4
 
 @const_addend_array2 = internal constant i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 7)
-; CHECK: @const_addend_array2 = internal constant i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 7)
+; DUMP: @const_addend_array2 = internal constant i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 7)
 ; CHECK:	.type	const_addend_array2,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:const_addend_array2:
@@ -171,7 +177,7 @@
 ; CHECK:	.size	const_addend_array2, 4
 
 @addend_array3 = internal global i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 9)
-; CHECK: @addend_array3 = internal global i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 9)
+; DUMP: @addend_array3 = internal global i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 9)
 ; CHECK:	.type	addend_array3,@object
 ; CHECK:	.section	.data,"aw",@progbits
 ; CHECK:addend_array3:
@@ -179,7 +185,7 @@
 ; CHECK:	.size	addend_array3, 4
 
 @const_addend_array3 = internal constant i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 9)
-; CHECK: @const_addend_array3 = internal constant i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 9)
+; DUMP: @const_addend_array3 = internal constant i32 add (i32 ptrtoint ([7 x i8]* @bytes to i32), i32 9)
 ; CHECK:	.type	const_addend_array3,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:const_addend_array3:
@@ -187,7 +193,7 @@
 ; CHECK:	.size	const_addend_array3, 4
 
 @addend_struct1 = internal global i32 add (i32 ptrtoint (<{ [3 x i8], i32 }>* @compound to i32), i32 1)
-; CHECK: @addend_struct1 = internal global i32 add (i32 ptrtoint (<{ [3 x i8], i32 }>* @compound to i32), i32 1)
+; DUMP: @addend_struct1 = internal global i32 add (i32 ptrtoint (<{ [3 x i8], i32 }>* @compound to i32), i32 1)
 ; CHECK:	.type	addend_struct1,@object
 ; CHECK:	.section	.data,"aw",@progbits
 ; CHECK:addend_struct1:
@@ -195,7 +201,7 @@
 ; CHECK:	.size	addend_struct1, 4
 
 @const_addend_struct1 = internal constant i32 add (i32 ptrtoint (<{ [3 x i8], i32 }>* @compound to i32), i32 1)
-; CHECK: @const_addend_struct1 = internal constant i32 add (i32 ptrtoint (<{ [3 x i8], i32 }>* @compound to i32), i32 1)
+; DUMP: @const_addend_struct1 = internal constant i32 add (i32 ptrtoint (<{ [3 x i8], i32 }>* @compound to i32), i32 1)
 ; CHECK:	.type	const_addend_struct1,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:const_addend_struct1:
@@ -203,7 +209,7 @@
 ; CHECK:	.size	const_addend_struct1, 4
 
 @addend_struct2 = internal global i32 add (i32 ptrtoint (<{ [3 x i8], i32 }>* @compound to i32), i32 4)
-; CHECK: @addend_struct2 = internal global i32 add (i32 ptrtoint (<{ [3 x i8], i32 }>* @compound to i32), i32 4)
+; DUMP: @addend_struct2 = internal global i32 add (i32 ptrtoint (<{ [3 x i8], i32 }>* @compound to i32), i32 4)
 ; CHECK:	.type	addend_struct2,@object
 ; CHECK:	.section	.data,"aw",@progbits
 ; CHECK:addend_struct2:
@@ -211,7 +217,7 @@
 ; CHECK:	.size	addend_struct2, 4
 
 @const_addend_struct2 = internal constant i32 add (i32 ptrtoint (<{ [3 x i8], i32 }>* @compound to i32), i32 4)
-; CHECK: @const_addend_struct2 = internal constant i32 add (i32 ptrtoint (<{ [3 x i8], i32 }>* @compound to i32), i32 4)
+; DUMP: @const_addend_struct2 = internal constant i32 add (i32 ptrtoint (<{ [3 x i8], i32 }>* @compound to i32), i32 4)
 ; CHECK:	.type	const_addend_struct2,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:const_addend_struct2:
@@ -219,7 +225,7 @@
 ; CHECK:	.size	const_addend_struct2, 4
 
 @ptr_to_func_align = internal global i32 ptrtoint (void ()* @func to i32), align 8
-; CHECK: @ptr_to_func_align = internal global i32 ptrtoint (void ()* @func to i32), align 8
+; DUMP: @ptr_to_func_align = internal global i32 ptrtoint (void ()* @func to i32), align 8
 ; CHECK:	.type	ptr_to_func_align,@object
 ; CHECK:	.section	.data,"aw",@progbits
 ; CHECK:	.align	8
@@ -228,7 +234,7 @@
 ; CHECK:	.size	ptr_to_func_align, 4
 
 @const_ptr_to_func_align = internal constant i32 ptrtoint (void ()* @func to i32), align 8
-; CHECK: @const_ptr_to_func_align = internal constant i32 ptrtoint (void ()* @func to i32), align 8
+; DUMP: @const_ptr_to_func_align = internal constant i32 ptrtoint (void ()* @func to i32), align 8
 ; CHECK:	.type	const_ptr_to_func_align,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:	.align	8
@@ -237,7 +243,7 @@
 ; CHECK:	.size	const_ptr_to_func_align, 4
 
 @char = internal constant [1 x i8] c"0"
-; CHECK: @char = internal constant [1 x i8] c"0"
+; DUMP: @char = internal constant [1 x i8] c"0"
 ; CHECK:	.type	char,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:char:
@@ -245,7 +251,7 @@
 ; CHECK:	.size	char, 1
 
 @short = internal constant [2 x i8] zeroinitializer
-; CHECK: @short = internal constant [2 x i8] zeroinitializer
+; DUMP: @short = internal constant [2 x i8] zeroinitializer
 ; CHECK:	.type	short,@object
 ; CHECK:	.section	.rodata,"a",@progbits
 ; CHECK:short:
diff --git a/tests_lit/llvm2ice_tests/load.ll b/tests_lit/llvm2ice_tests/load.ll
index a8be9f3..caf99e3 100644
--- a/tests_lit/llvm2ice_tests/load.ll
+++ b/tests_lit/llvm2ice_tests/load.ll
@@ -2,7 +2,7 @@
 
 ; REQUIRES: allow_dump
 
-; RUN: %p2i -i %s --args --verbose inst | FileCheck %s
+; RUN: %p2i -i %s --args --verbose inst -threads=0 | FileCheck %s
 
 define void @load_i64(i32 %addr_arg) {
 entry:
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-errors.ll b/tests_lit/llvm2ice_tests/nacl-atomic-errors.ll
index ea38e3d..92de7aa 100644
--- a/tests_lit/llvm2ice_tests/nacl-atomic-errors.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-errors.ll
@@ -1,7 +1,8 @@
 ; Test that some errors trigger when the usage of NaCl atomic
 ; intrinsics does not match the required ABI.
 
-; RUN: %p2i -i %s --args --verbose none --exit-success 2>&1 | FileCheck %s
+; RUN: %p2i -i %s --args --verbose none --exit-success -threads=0 2>&1 \
+; RUN:   | FileCheck %s
 
 declare i8 @llvm.nacl.atomic.load.i8(i8*, i32)
 declare i16 @llvm.nacl.atomic.load.i16(i16*, i32)
diff --git a/tests_lit/llvm2ice_tests/store.ll b/tests_lit/llvm2ice_tests/store.ll
index ae65a59..ec85020 100644
--- a/tests_lit/llvm2ice_tests/store.ll
+++ b/tests_lit/llvm2ice_tests/store.ll
@@ -2,7 +2,7 @@
 
 ; REQUIRES: allow_dump
 
-; RUN: %p2i -i %s --args --verbose inst | FileCheck %s
+; RUN: %p2i -i %s --args --verbose inst -threads=0 | FileCheck %s
 
 define void @store_i64(i32 %addr_arg) {
 entry:
diff --git a/tests_lit/llvm2ice_tests/struct-arith.pnacl.ll b/tests_lit/llvm2ice_tests/struct-arith.pnacl.ll
index 5ad603e..b3e4e36 100644
--- a/tests_lit/llvm2ice_tests/struct-arith.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/struct-arith.pnacl.ll
@@ -3,7 +3,7 @@
 
 ; TODO(kschimpf) Find out why lc2i is needed.
 ; REQUIRES: allow_llvm_ir_as_input
-; RUN: %lc2i -i %s --args --verbose inst | FileCheck %s
+; RUN: %lc2i -i %s --args --verbose inst -threads=0 | FileCheck %s
 
 define internal i32 @compute_important_function(i32 %v1, i32 %v2) {
 entry: