CUDA-2.1ΒΆ

Only include device-threads participating in a warp collective operation in the mask parameter

Required inputs: IR, StaticSemanticAnalysis

CUDA 2.1 [collective.warp.participants.active] Only include device-threads participating in a warp collective operation in the mask parameter

When a group of device-threads participating in a warp collective operation, the mask parameter for the operation should only include the device-threads that are participating. No inactive device-threads should be specified in the mask parameter. Overlapping warp wide collective operations may occur. The following are warp collective operations:

  • __syncwarp.
  • __all_sync, __any_sync, and __ballot_sync.
  • __match_any_sync, and __match_all_sync.

The following are warp collective shuffle operations:

  • __shfl_sync.
  • __shfl_up_sync.
  • __shfl_down_sync.
  • __shfl_xor_sync.
Scope: Device.
Audience: CUDA C++.
Category: Advisory.
Hardware Applicability: All Compute Capabilities.
Rationale

A subset of the active device-threads may participate in a warp collective operation but specifying a superset of active device-threads via the mask parameter is undefined behavior, because that superset includes inactive device-threads. The value returned by a warp wide collective shuffle operation is unspecified if the target device-thread is inactive.

Example 1 (Bad)
__global__ void kernel() {
  auto const idx = blockIdx.x * blockDim.x + threadIdx.x;
  if(!(idx % 16)) { // diverge intentionally to show false positive problem
    __syncwarp(0b10000000000000001000000000000000); // non-compliant: wrong mask
  }
  if(!((blockIdx.x * blockDim.x + threadIdx.x) % 16)) { // same problem as above without idx variable
    __syncwarp(0b10000000000000001000000000000000); // non-compliant: wrong mask
  }
  __syncwarp(~1u); // non-compliant: thread lane 0 is not in mask
}
Example 2 (Good)
__global__ void kernel() {
  __syncwarp(~0u); // compliant: all threads in the mask
}

__global__ void kernel2(int32_t *u) {
    auto const idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(idx % 2)) {
    u[idx] += u[idx + 1];
    __syncwarp(0b01010101010101010101010101010101);
  }
}

int main() {
  constexpr int32_t n = 32;
  auto deleter = [] (int32_t* ptr) {
    cudaError_t const error0 = cudaFree(ptr);
    assert(cudaSuccess == error0);
  };
  int32_t* raw_u = nullptr;
  cudaError_t const error1 = cudaMallocManaged(&raw_u, n * sizeof(int32_t));
  testTerminate(error1);
  std::unique_ptr<int32_t[], decltype(deleter)> up(raw_u, deleter);
  std::fill(up.get(), up.get() + n, 1);
  kernel<<<1,n>>>(); // this does not generate a false positive.
  kernel2<<<1, n>>>(up.get()); // this generates a false positive.
  cudaError_t const error2 = cudaGetLastError();
  testTerminate(error2);
  cudaError_t const error3 = cudaDeviceSynchronize();
  assert(cudaSuccess == error3);
  assert(n == up[0]);
}
Example 2 (Bad)
# include <cassert>
# include "testTerminate.h"

__global__ void broadcast(int32_t u) {
  auto const idx = threadIdx.x & 31;
  int32_t v = 0;
  if (0 == idx)
    v = u;
  else
    // This `__shfl_sync`'s target device-thread (the third parameter) is the
    // first device-thread in the warp, which isn't participating in the
    // collective operation, and thus the value retrieved will be undefined.
    v = __shfl_sync(0b11111111111111111111111111111110, v, 0);
  *(int*)0 = 0;
}

int main() {
  constexpr int32_t n = 32;
  broadcast<<<1, n>>>(17);
  cudaError_t const error0 = cudaGetLastError();
  testTerminate(error0);
  cudaError_t const error1 = cudaDeviceSynchronize();
  assert(cudaSuccess == error1);
}
Example 4 (Good)
# include <cassert>
# include "testTerminate.h"

__global__ void broadcast(int32_t u) {
  auto const idx = threadIdx.x & 31;
  int32_t v = 0;
  if (0 == idx)
    v = u;
  v = __shfl_sync(0b11111111111111111111111111111111, v, 0);
  if(v != u) {
    *(int*)0 = 0;
  }
}

int main() {
  constexpr int32_t n = 32;
  broadcast<<<1, n>>>(17);
  cudaError_t const error0 = cudaGetLastError();
  testTerminate(error0);
  cudaError_t const error1 = cudaDeviceSynchronize();
  assert(cudaSuccess == error1);
}
Excerpt from NVIDIA CUDA C++ Guidelines for robust and safety-critical programming, Version 3.0.1, Copyright (C) 2018-2023 NVIDIA Corporation.

Possible Messages

Key

Text

Severity

Disabled

extra_lane_in_mask

Lane(s) {lanes} are named in sync mask 0b{mask:032b} but they do not reach any matching intrinsic call within {routine} in {context}

None

False

potentially_extra_lane_in_mask

Lane(s) {lanes} are named in sync mask 0b{mask:032b} but they potentially do not reach any matching intrinsic call within {routine} in {context}

None

True

Options