CUDA-2.1¶

Only include device-threads participating in a warp collective operation in the mask parameter

Required inputs: IR, StaticSemanticAnalysis

CUDA 2.1 [collective.warp.participants.active] Only include device-threads participating in a warp collective operation in the mask parameter

When a group of device-threads participating in a warp collective operation, the mask parameter for the operation should only include the device-threads that are participating. No inactive device-threads should be specified in the mask parameter. Overlapping warp wide collective operations may occur. The following are warp collective operations:

__syncwarp.
__all_sync, __any_sync, and __ballot_sync.
__match_any_sync, and __match_all_sync.

The following are warp collective shuffle operations:

__shfl_sync.
__shfl_up_sync.
__shfl_down_sync.
__shfl_xor_sync.

Scope: Device.

Audience: CUDA C++.

Category: Advisory.

Hardware Applicability: All Compute Capabilities.

Rationale

A subset of the active device-threads may participate in a warp collective operation but specifying a superset of active device-threads via the mask parameter is undefined behavior, because that superset includes inactive device-threads. The value returned by a warp wide collective shuffle operation is unspecified if the target device-thread is inactive.

Example 1 (Bad)

__global__ void kernel() {
  auto const idx = blockIdx.x * blockDim.x + threadIdx.x;
  if(!(idx % 16)) { // diverge intentionally to show false positive problem
    __syncwarp(0b10000000000000001000000000000000); // non-compliant: wrong mask
  }
  if(!((blockIdx.x * blockDim.x + threadIdx.x) % 16)) { // same problem as above without idx variable
    __syncwarp(0b10000000000000001000000000000000); // non-compliant: wrong mask
  }
  __syncwarp(~1u); // non-compliant: thread lane 0 is not in mask
}

Example 2 (Good)

__global__ void kernel() {
  __syncwarp(~0u); // compliant: all threads in the mask
}

__global__ void kernel2(int32_t *u) {
    auto const idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(idx % 2)) {
    u[idx] += u[idx + 1];
    __syncwarp(0b01010101010101010101010101010101);
  }
}

int main() {
  constexpr int32_t n = 32;
  auto deleter = [] (int32_t* ptr) {
    cudaError_t const error0 = cudaFree(ptr);
    assert(cudaSuccess == error0);
  };
  int32_t* raw_u = nullptr;
  cudaError_t const error1 = cudaMallocManaged(&raw_u, n * sizeof(int32_t));
  testTerminate(error1);
  std::unique_ptr<int32_t[], decltype(deleter)> up(raw_u, deleter);
  std::fill(up.get(), up.get() + n, 1);
  kernel<<<1,n>>>(); // this does not generate a false positive.
  kernel2<<<1, n>>>(up.get()); // this generates a false positive.
  cudaError_t const error2 = cudaGetLastError();
  testTerminate(error2);
  cudaError_t const error3 = cudaDeviceSynchronize();
  assert(cudaSuccess == error3);
  assert(n == up[0]);
}

Example 2 (Bad)

# include <cassert>
# include "testTerminate.h"

__global__ void broadcast(int32_t u) {
  auto const idx = threadIdx.x & 31;
  int32_t v = 0;
  if (0 == idx)
    v = u;
  else
    // This `__shfl_sync`'s target device-thread (the third parameter) is the
    // first device-thread in the warp, which isn't participating in the
    // collective operation, and thus the value retrieved will be undefined.
    v = __shfl_sync(0b11111111111111111111111111111110, v, 0);
  *(int*)0 = 0;
}

int main() {
  constexpr int32_t n = 32;
  broadcast<<<1, n>>>(17);
  cudaError_t const error0 = cudaGetLastError();
  testTerminate(error0);
  cudaError_t const error1 = cudaDeviceSynchronize();
  assert(cudaSuccess == error1);
}

Example 4 (Good)

# include <cassert>
# include "testTerminate.h"

__global__ void broadcast(int32_t u) {
  auto const idx = threadIdx.x & 31;
  int32_t v = 0;
  if (0 == idx)
    v = u;
  v = __shfl_sync(0b11111111111111111111111111111111, v, 0);
  if(v != u) {
    *(int*)0 = 0;
  }
}

int main() {
  constexpr int32_t n = 32;
  broadcast<<<1, n>>>(17);
  cudaError_t const error0 = cudaGetLastError();
  testTerminate(error0);
  cudaError_t const error1 = cudaDeviceSynchronize();
  assert(cudaSuccess == error1);
}

Possible Messages

Key	Text	Severity	Disabled
extra_lane_in_mask	Lane(s) {lanes} are named in sync mask 0b{mask:032b} but they do not reach any matching intrinsic call within {routine} in {context}	None	False
potentially_extra_lane_in_mask	Lane(s) {lanes} are named in sync mask 0b{mask:032b} but they potentially do not reach any matching intrinsic call within {routine} in {context}	None	True

Options

This rule shares the following common options: exclude_in_macros, exclude_messages_in_system_headers, excludes, extend_exclude_to_macro_invocations, includes, justification_checker, languages, post_processing, provider, report_at, severity
The following places define options that affect this rule: Stylechecks, Analysis-GlobalOptions
This rule has no individual options.

Axivion Suite 7.12.2-public

Navigation