; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s

 | ; Use bar.sync to arrive at a pre-computed barrier number and | 
 | ; wait for all threads in CTA to also arrive: | 
 | define ptx_device void @test_barrier_named_cta() { | 
 | ; CHECK: mov.u32  %r[[REG0:[0-9]+]], 0; | 
 | ; CHECK: bar.sync %r[[REG0]]; | 
 | ; CHECK: mov.u32  %r[[REG1:[0-9]+]], 10; | 
 | ; CHECK: bar.sync %r[[REG1]]; | 
 | ; CHECK: mov.u32  %r[[REG2:[0-9]+]], 15; | 
 | ; CHECK: bar.sync %r[[REG2]]; | 
 | ; CHECK: ret; | 
 |   call void @llvm.nvvm.barrier.n(i32 0) | 
 |   call void @llvm.nvvm.barrier.n(i32 10) | 
 |   call void @llvm.nvvm.barrier.n(i32 15) | 
 |   ret void | 
 | } | 
 |  | 
 | ; Use bar.sync to arrive at a pre-computed barrier number and | 
 | ; wait for fixed number of cooperating threads to arrive: | 
 | define ptx_device void @test_barrier_named() { | 
 | ; CHECK: mov.u32  %r[[REG0A:[0-9]+]], 32; | 
 | ; CHECK: mov.u32  %r[[REG0B:[0-9]+]], 0; | 
 | ; CHECK: bar.sync %r[[REG0B]], %r[[REG0A]]; | 
 | ; CHECK: mov.u32  %r[[REG1A:[0-9]+]], 352; | 
 | ; CHECK: mov.u32  %r[[REG1B:[0-9]+]], 10; | 
 | ; CHECK: bar.sync %r[[REG1B]], %r[[REG1A]]; | 
 | ; CHECK: mov.u32  %r[[REG2A:[0-9]+]], 992; | 
 | ; CHECK: mov.u32  %r[[REG2B:[0-9]+]], 15; | 
 | ; CHECK: bar.sync %r[[REG2B]], %r[[REG2A]]; | 
 | ; CHECK: ret; | 
 |   call void @llvm.nvvm.barrier(i32 0, i32 32) | 
 |   call void @llvm.nvvm.barrier(i32 10, i32 352) | 
 |   call void @llvm.nvvm.barrier(i32 15, i32 992) | 
 |   ret void | 
 | } | 
 |  | 
 | declare void @llvm.nvvm.barrier(i32, i32) | 
 | declare void @llvm.nvvm.barrier.n(i32) |