| ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s | 
 |  | 
 | declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone | 
 |  | 
 | ; SI-LABEL: {{^}}test_if: | 
 | ; Make sure the i1 values created by the cfg structurizer pass are | 
 | ; moved using VALU instructions | 
 |  | 
 |  | 
 | ; waitcnt should be inserted after exec modification | 
 | ; SI: v_cmp_lt_i32_e32 vcc, 0, | 
 | ; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc | 
 | ; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]] | 
 | ; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]] | 
 | ; SI-NEXT: s_cbranch_execz [[FLOW_BB]] | 
 |  | 
 | ; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3 | 
 | ; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1 | 
 | ; SI: v_mov_b32_e32 v{{[0-9]}}, -1 | 
 | ; SI: s_and_saveexec_b64 | 
 | ; SI-NEXT: ; mask branch | 
 |  | 
 | ; v_mov should be after exec modification | 
 | ; SI: [[FLOW_BB]]: | 
 | ; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]] | 
 | ; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}} | 
 | ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]] | 
 | ; SI-NEXT: ; mask branch | 
 | ; | 
 | define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { | 
 | entry: | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone | 
 |   switch i32 %tid, label %default [ | 
 |     i32 0, label %case0 | 
 |     i32 1, label %case1 | 
 |   ] | 
 |  | 
 | case0: | 
 |   %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b | 
 |   store i32 13, i32 addrspace(1)* %arrayidx1, align 4 | 
 |   br label %end | 
 |  | 
 | case1: | 
 |   %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b | 
 |   store i32 17, i32 addrspace(1)* %arrayidx5, align 4 | 
 |   br label %end | 
 |  | 
 | default: | 
 |   %cmp8 = icmp eq i32 %tid, 2 | 
 |   %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b | 
 |   br i1 %cmp8, label %if, label %else | 
 |  | 
 | if: | 
 |   store i32 19, i32 addrspace(1)* %arrayidx10, align 4 | 
 |   br label %end | 
 |  | 
 | else: | 
 |   store i32 21, i32 addrspace(1)* %arrayidx10, align 4 | 
 |   br label %end | 
 |  | 
 | end: | 
 |   ret void | 
 | } | 
 |  | 
 | ; SI-LABEL: {{^}}simple_test_v_if: | 
 | ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} | 
 | ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc | 
 | ; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] | 
 |  | 
 | ; SI-NEXT: BB{{[0-9]+_[0-9]+}}: | 
 | ; SI: buffer_store_dword | 
 |  | 
 | ; SI-NEXT: {{^}}[[EXIT]]: | 
 | ; SI: s_endpgm | 
 | define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone | 
 |   %is.0 = icmp ne i32 %tid, 0 | 
 |   br i1 %is.0, label %then, label %exit | 
 |  | 
 | then: | 
 |   %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid | 
 |   store i32 999, i32 addrspace(1)* %gep | 
 |   br label %exit | 
 |  | 
 | exit: | 
 |   ret void | 
 | } | 
 |  | 
 | ; FIXME: It would be better to endpgm in the then block. | 
 |  | 
 | ; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret: | 
 | ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} | 
 | ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc | 
 | ; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] | 
 |  | 
 | ; SI-NEXT: BB{{[0-9]+_[0-9]+}}: | 
 | ; SI: buffer_store_dword | 
 |  | 
 | ; SI-NEXT: {{^}}[[EXIT]]: | 
 | ; SI: s_endpgm | 
 | define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %is.0 = icmp ne i32 %tid, 0 | 
 |   br i1 %is.0, label %then, label %exit | 
 |  | 
 | then: | 
 |   %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid | 
 |   store i32 999, i32 addrspace(1)* %gep | 
 |   ret void | 
 |  | 
 | exit: | 
 |   ret void | 
 | } | 
 |  | 
 | ; Final block has more than a ret to execute. This was miscompiled | 
 | ; before function exit blocks were unified since the endpgm would | 
 | ; terminate the then wavefront before reaching the store. | 
 |  | 
 | ; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret: | 
 | ; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} | 
 | ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc | 
 | ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] | 
 | ; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]] | 
 |  | 
 | ; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit | 
 | ; SI: ds_write_b32 | 
 |  | 
 | ; SI-NEXT: {{^}}[[FLOW]]: | 
 | ; SI-NEXT: s_or_saveexec_b64 | 
 | ; SI-NEXT: s_xor_b64 exec, exec | 
 | ; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]] | 
 |  | 
 | ; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then | 
 | ; SI: s_waitcnt | 
 | ; SI-NEXT: buffer_store_dword | 
 |  | 
 | ; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock | 
 | ; SI: s_endpgm | 
 | define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %is.0 = icmp ne i32 %tid, 0 | 
 |   br i1 %is.0, label %then, label %exit | 
 |  | 
 | then: | 
 |   %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid | 
 |   store i32 999, i32 addrspace(1)* %gep | 
 |   ret void | 
 |  | 
 | exit: | 
 |   store volatile i32 7, i32 addrspace(3)* undef | 
 |   ret void | 
 | } | 
 |  | 
 | ; SI-LABEL: {{^}}simple_test_v_loop: | 
 | ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} | 
 | ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc | 
 | ; SI-NEXT: ; mask branch | 
 | ; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]] | 
 |  | 
 | ; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} | 
 |  | 
 | ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]: | 
 | ; SI: buffer_load_dword | 
 | ; SI-DAG: buffer_store_dword | 
 | ; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100 | 
 | ; SI: s_cbranch_vccz [[LABEL_LOOP]] | 
 | ; SI: [[LABEL_EXIT]]: | 
 | ; SI: s_endpgm | 
 |  | 
 | define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { | 
 | entry: | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone | 
 |   %is.0 = icmp ne i32 %tid, 0 | 
 |   %limit = add i32 %tid, 64 | 
 |   br i1 %is.0, label %loop, label %exit | 
 |  | 
 | loop: | 
 |   %i = phi i32 [%tid, %entry], [%i.inc, %loop] | 
 |   %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i | 
 |   %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i | 
 |   %load = load i32, i32 addrspace(1)* %src | 
 |   store i32 %load, i32 addrspace(1)* %gep.dst | 
 |   %i.inc = add nsw i32 %i, 1 | 
 |   %cmp = icmp eq i32 %limit, %i.inc | 
 |   br i1 %cmp, label %exit, label %loop | 
 |  | 
 | exit: | 
 |   ret void | 
 | } | 
 |  | 
 | ; SI-LABEL: {{^}}multi_vcond_loop: | 
 |  | 
 | ; Load loop limit from buffer | 
 | ; Branch to exit if uniformly not taken | 
 | ; SI: ; %bb.0: | 
 | ; SI: buffer_load_dword [[VBOUND:v[0-9]+]] | 
 | ; SI: v_cmp_lt_i32_e32 vcc | 
 | ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc | 
 | ; SI-NEXT: ; mask branch | 
 | ; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]] | 
 |  | 
 | ; Initialize inner condition to false | 
 | ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader | 
 | ; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}} | 
 | ; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]] | 
 |  | 
 | ; Clear exec bits for workitems that load -1s | 
 | ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]: | 
 | ; SI: buffer_load_dword [[B:v[0-9]+]] | 
 | ; SI: buffer_load_dword [[A:v[0-9]+]] | 
 | ; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]] | 
 | ; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] | 
 | ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] | 
 | ; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]] | 
 | ; SI: s_xor_b64 [[ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG2]] | 
 | ; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]] | 
 |  | 
 | ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20 | 
 | ; SI: buffer_store_dword | 
 | ; SI: v_cmp_ge_i64_e{{32|64}} [[CMP:s\[[0-9]+:[0-9]+\]|vcc]] | 
 | ; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]] | 
 |  | 
 | ; SI: [[LABEL_FLOW]]: | 
 | ; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]] | 
 | ; SI-NEXT: s_or_b64 exec, exec, [[ORNEG3]] | 
 | ; SI-NEXT: s_mov_b64 [[MOVED_TMP:s\[[0-9]+:[0-9]+\]]], [[TMP]] | 
 | ; SI-NEXT: s_and_b64 [[MASKED_ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG3]] | 
 | ; SI-NEXT: s_or_b64 [[COND_STATE]], [[MASKED_ORNEG3]], [[MOVED_TMP]] | 
 | ; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]] | 
 | ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]] | 
 |  | 
 | ; SI: [[LABEL_EXIT]]: | 
 | ; SI-NOT: [[COND_STATE]] | 
 | ; SI: s_endpgm | 
 |  | 
 | define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { | 
 | bb: | 
 |   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 | 
 |   %tmp4 = sext i32 %tmp to i64 | 
 |   %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4 | 
 |   %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4 | 
 |   %tmp7 = icmp sgt i32 %tmp6, 0 | 
 |   %tmp8 = sext i32 %tmp6 to i64 | 
 |   br i1 %tmp7, label %bb10, label %bb26 | 
 |  | 
 | bb10:                                             ; preds = %bb, %bb20 | 
 |   %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ] | 
 |   %tmp12 = add nsw i64 %tmp11, %tmp4 | 
 |   %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12 | 
 |   %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4 | 
 |   %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12 | 
 |   %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4 | 
 |   %tmp17 = icmp ne i32 %tmp14, -1 | 
 |   %tmp18 = icmp ne i32 %tmp16, -1 | 
 |   %tmp19 = and i1 %tmp17, %tmp18 | 
 |   br i1 %tmp19, label %bb20, label %bb26 | 
 |  | 
 | bb20:                                             ; preds = %bb10 | 
 |   %tmp21 = add nsw i32 %tmp16, %tmp14 | 
 |   %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12 | 
 |   store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4 | 
 |   %tmp23 = add nuw nsw i64 %tmp11, 1 | 
 |   %tmp24 = icmp slt i64 %tmp23, %tmp8 | 
 |   br i1 %tmp24, label %bb10, label %bb26 | 
 |  | 
 | bb26:                                             ; preds = %bb10, %bb20, %bb | 
 |   ret void | 
 | } | 
 |  | 
 | attributes #0 = { nounwind readnone } | 
 | attributes #1 = { nounwind } |