| ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s |
| ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s |
| |
| ; Use bar.sync to arrive at a pre-computed barrier number and |
| ; wait for all threads in CTA to also arrive: |
| define ptx_device void @test_barrier_named_cta() { |
| ; CHECK: mov.u32 %r[[REG0:[0-9]+]], 0; |
| ; CHECK: bar.sync %r[[REG0]]; |
| ; CHECK: mov.u32 %r[[REG1:[0-9]+]], 10; |
| ; CHECK: bar.sync %r[[REG1]]; |
| ; CHECK: mov.u32 %r[[REG2:[0-9]+]], 15; |
| ; CHECK: bar.sync %r[[REG2]]; |
| ; CHECK: ret; |
| call void @llvm.nvvm.barrier.n(i32 0) |
| call void @llvm.nvvm.barrier.n(i32 10) |
| call void @llvm.nvvm.barrier.n(i32 15) |
| ret void |
| } |
| |
| ; Use bar.sync to arrive at a pre-computed barrier number and |
| ; wait for fixed number of cooperating threads to arrive: |
| define ptx_device void @test_barrier_named() { |
| ; CHECK: mov.u32 %r[[REG0A:[0-9]+]], 32; |
| ; CHECK: mov.u32 %r[[REG0B:[0-9]+]], 0; |
| ; CHECK: bar.sync %r[[REG0B]], %r[[REG0A]]; |
| ; CHECK: mov.u32 %r[[REG1A:[0-9]+]], 352; |
| ; CHECK: mov.u32 %r[[REG1B:[0-9]+]], 10; |
| ; CHECK: bar.sync %r[[REG1B]], %r[[REG1A]]; |
| ; CHECK: mov.u32 %r[[REG2A:[0-9]+]], 992; |
| ; CHECK: mov.u32 %r[[REG2B:[0-9]+]], 15; |
| ; CHECK: bar.sync %r[[REG2B]], %r[[REG2A]]; |
| ; CHECK: ret; |
| call void @llvm.nvvm.barrier(i32 0, i32 32) |
| call void @llvm.nvvm.barrier(i32 10, i32 352) |
| call void @llvm.nvvm.barrier(i32 15, i32 992) |
| ret void |
| } |
| |
| declare void @llvm.nvvm.barrier(i32, i32) |
| declare void @llvm.nvvm.barrier.n(i32) |