CUDA-3.3¶

Only use objects in the execution space they are associated with

Required inputs: IR, StaticSemanticAnalysis

CUDA 3.3 [share.object.usage] Only use objects in the execution space they are associated with

Supersedes:

[share.object.polymorphic]
[share.object.host-private]
[share.object.device-thread]
[share.object.device-thread-block]
[share.object.device-private]
[share.object.stream-associated]

As seen from a device, CPU or GPU, objects may be in one of two types of memory, accessible or inaccessible. Accessible memory is either directly connected to the device or is connected to another device that has been made available to the current device in some manner. Inaccessible memory is connected to another device and has not been made available to the current device in any manner. Accessible memory can be obtained from multiple sources.

Unified Virtual memory
Managed memory
Stream association
device specific allocation mechanisms

Inaccessible memory is obtained by device specific allocation mechanisms that were not run on the current device. Objects can be copied between memory types, however some objects like Polymorphic objects, are only properly defined on the device where they where created. Objects may only be ODR-used on the devices where they are accessible and properly defined.

Scope: Host, Device.

Audience: CUDA C++, CUDA Libraries.

Category: Required.

Hardware Applicability: All Compute Capabilities.

Rationale

Calling a virtual method of a polymorphic object on a host or device where it was not created has undefined behavior. Calling a function via a pointer or reference on a host or device where the address was not captured has undefined behavior. Inaccessible objects do not exist on the current device. Their addresses will either be invalid or will refer to different storage. Accessible objects exist in the current device's memory space. Their addresses will always refer to the correct object. Accessing a stream-attached region of memory from host-threads or device-threads outside of the stream that the memory is attached to has undefined behavior.

Example 1 (Bad)

# include <memory>
# include <cassert>
# include "testTerminate.h"

struct scalable {
  __host__ __device__ virtual ~scalable() {}
  __host__ __device__ virtual void scale(int32_t c) = 0;
};

struct point2d final : scalable {
  point2d(int32_t x_, int32_t y_) : x(x_), y(y_) {}
  __host__ __device__ void scale(int32_t c) override final {
    x = x * c;
    y = y * c;
  }
  int32_t x;
  int32_t y;
};

__global__ void scale_object(scalable* s, int32_t c) {
  s->scale(2);
}

int main() {
  auto deleter = [] (scalable* ptr) {
    ptr->~scalable();
    cudaError_t const error0 = cudaFree(ptr);
    assert(cudaSuccess == error0);
  };
  scalable* raw_u = nullptr;
  cudaError_t const error1 = cudaMallocManaged(&raw_u, sizeof(point2d));
  testTerminate(error1);
  new (raw_u) point2d(2, 4);
  std::unique_ptr<scalable, decltype(deleter)> up(raw_u, deleter);
  // We incorrectly pass the address of a polymorphic object constructed on the
  // host to our kernel, which leads to a runtime error when the kernel attempts
  // to call a virtual function.
  scale_object<<<1, 1>>>(up.get(), 2);
  cudaError_t const error2 = cudaGetLastError();
  testTerminate(error2);
  cudaError_t const error3 = cudaDeviceSynchronize();
  assert(cudaSuccess == error3);
  point2d* raw_p = dynamic_cast<point2d*>(up.get());
  assert(raw_p != nullptr);
  assert(raw_p->x == 4 && raw_p->y == 8);
}

Example 2 (Bad)

# include <cassert>
# include <memory>

int main() {
  int32_t* raw_u = nullptr;
  cudaError_t const error1 = cudaMalloc(&raw_u, sizeof(int32_t));
  assert(cudaSuccess == error1);
  // `up` points to a device dynamic storage duration object, so accessing it
  // from the host below is undefined behavior.
  *raw_u = 42;
  cudaError_t const error2 = cudaFree(raw_u);
  assert(cudaSuccess == error2);
}

Example 3 (Bad)

# include <cassert>
# include <memory>
# include "testTerminate.h"

__device__ int32_t const i = 42;

__global__ void address_of_i(int32_t const** u) {
  *u = &i;
}

int main() {
  int32_t const** raw_u = nullptr;
  cudaError_t const error1 = cudaMallocManaged(&raw_u, sizeof(int32_t const*));
  testTerminate(error1);
  address_of_i<<<1, 1>>>(raw_u);
  cudaError_t const error2 = cudaGetLastError();
  testTerminate(error2);
  cudaError_t const error3 = cudaDeviceSynchronize();
  testTerminate(error3);
  // The address of `i`, a device-private variable, is stored in `up`. Accessing
  // it here on the host has undefined behavior.
  assert(42 == **raw_u);
  cudaError_t const error4 = cudaFree(raw_u);
  testTerminate(error4);
}

Example 4 (Bad)

# include <cassert>
# include <memory>
# include "testTerminate.h"
__shared__ int32_t i;

__global__ void address_of_i(int32_t** u) {
  if (0 == threadIdx.x)
    i = 42;
  *u = &i;
}

int main() {
  int32_t** raw_u = nullptr;
  cudaError_t const error1 = cudaMallocManaged(&raw_u, sizeof(int32_t*));
  testTerminate(error1);
  address_of_i<<<1, 1>>>(raw_u);
  cudaError_t const error2 = cudaGetLastError();
  testTerminate(error2);
  cudaError_t const error3 = cudaDeviceSynchronize();
  testTerminate(error3);
  // The address of `i`, a device-private variable, is stored in `up`. Accessing
  // it here on the host has undefined behavior.
  assert(42 == **raw_u);
  cudaError_t const error4 = cudaFree(raw_u);
  testTerminate(error4);
}

Example 5 (Bad)

__shared__ int x, *pz;

__device__ int* px, py;

__global__ void kernel(int *p) {
  int z;
  if (!px)
    px = &x;// Non-compliant because address of x escapes the shared scope
  *px += 1; // Non-compliant if evaluated by more than one block: the assignment
// above makes the address of a thread-block shared variable
// accessible to other thread-blocks.
  if (!pz)
    pz = &z; // Non-compliant because address of z escapes block scope
  cudaDeviceSynchronize();
  *pz += 1; // Non-compliant if evaluated by more than one thread per block: the
// address of a thread-private memory location is accessed by other
// device threads.
  *p = 1; // Potentially non-compliant if the pointer is not referring to a
// a memory location accessible by this device thread.
}

int main() {
  int w = x; // Non-compliant: read from device thread-block shared variable in host code.
  x = 123; // Non-compliant: write to device thread-block shared variable in host code.
  kernel<<<2,2>>>(&w); // Compliant: If host stack objects are available to the device, ATS or HMM.
// Non-compliant: host address passed to and used by
// device code.
}

Example 6 (Bad)

void foo() { }

__global__ void kernel(void(*pf)()) {
  pf();
}

int main() {
  kernel<<<1,1>>>(&foo); // Non-compliant: the kernel attempts to call the host
// function 'foo' through a function pointer.
}

Possible Messages

Key	Text	Severity	Disabled
block_variable_accessed	Access to thread-block shared variable potentially from outside the thread-block	None	False
device_func_indirectly_called_from_host	This call is in host code but might call a device function	None	False
device_obj_used_from_host	Device object is used in host code	None	False
host_func_indirectly_called_from_device	This call is in device code but might call a host function	None	False
host_obj_used_from_device	Host object is used in device code	None	False
thread_private_variable_accessed	Access to thread-private variable potentially from different thread	None	False

Options

This rule shares the following common options: exclude_in_macros, exclude_messages_in_system_headers, excludes, extend_exclude_to_macro_invocations, includes, justification_checker, languages, post_processing, provider, report_at, severity
The following places define options that affect this rule: Stylechecks, Analysis-GlobalOptions
This rule has no individual options.

Axivion Suite 7.12.2-public

Navigation