CUDA-3.1ΒΆ

Use functions only in the execution spaces they target

Required inputs: IR

CUDA 3.1 [share.function] Use functions only in the execution spaces they target

Functions should only be executed in the execution space(s) they target.

  • __host__ __device__ functions target both the host and device execution space.
  • __host__ functions and functions with no execution space specifier target the host execution space.
  • __device__ functions and kernels (__global__ functions) target the device execution space.
Scope: Host, Device.
Audience: CUDA C++, CUDA Libraries.
Category: Required.
Hardware Applicability: All Compute Capabilities.
Rationale

Functions are only compiled for the architectures necessary for their execution space. __device__ only functions may appear to have a definition in __host__ code, however, these definitions are no-ops and are purely an implementation detail. Any ODR-use of a function in an execution space that it does not support results in undefined behavior.

Example 1 (Bad)
# include <memory>
# include <algorithm>
# include <vector>
# include <cassert>

struct square {
  template <typename T>
  __device__ T operator()(T t) {
    return t * t;
  }
};

template <typename ForwardIt, typename Size, typename UnaryOp>

__host__ __device__

void inplace_transform_n(ForwardIt first, Size n, UnaryOp op) {
  for (Size i = 0; i < n; ++first, (void) ++i)
    *first = op(*first);
}

void vectorSquare(std::vector<int32_t> &u) {
  // This instantiates and uses `inplace_transform_n` in `__host__` code with a
  // `__device__` only `UnaryOp`, which is invalid and leads to a compile time
  // warning and a run time failure.
  inplace_transform_n(u.begin(), u.size(), square{});
}
Example 2 (Bad)
# include <memory>
# include <algorithm>
# include <vector>
# include <cassert>

struct square {
  template <typename T>
  __device__ T operator()(T t) {
    return t * t;
  }
};
// This template function can be instantiated and used in `__host__` code with
// a `__host__` only `UnaryOp`, but this will generate a vacuous warning about
// using a `__host__` only function in a `__host__ __device__` function. This
// warning can be disabled with `#pragma nv_exec_check_disable`, but this also
// disables relevant warnings for any invalid instantiations in `__device__`
// code with `__host__` only `UnaryOp`s.
# pragma nv_exec_check_disable

template <typename ForwardIt, typename Size, typename UnaryOp>

__host__ __device__

void inplace_transform_n(ForwardIt first, Size n, UnaryOp op) {
  for (Size i = 0; i < n; ++first, (void) ++i)
    *first = op(*first);
}

void vectorSquare(std::vector<int32_t> &u)) {
  // This instantiates and uses `inplace_transform_n` in `__host__` code with a
  // `__device__` only `UnaryOp`, which is invalid and leads to a compile time
  // warning and a run time failure.
  inplace_transform_n(u.begin(), u.size(), square{});
}
Example 3 (Bad)
# include <memory>
# include <algorithm>
# include <cassert>
# include "testTerminate.h"

struct square {
  template <typename T>
  T operator()(T t) {
    return t * t;
  }
};

template <typename ForwardIt, typename Size, typename UnaryOp>

__host__ __device__

void inplace_transform_n(ForwardIt first, Size n, UnaryOp op) {
  for (Size i = 0; i < n; ++first, (void) ++i)
    *first = op(*first);
}

__global__ void square_each_element(int32_t* u, ptrdiff_t per_thread) {
  inplace_transform_n(u, per_thread, square{});
}

void ptrSquare(std::unique_ptr<int32_t> &up, int32_t n, int32_t m) {
  // This kernel instantiates and uses `inplace_transform_n` in `__device__`
  // code with a `__host__` only `UnaryOp`, which is invalid and leads to a
  // compile time warning and a run time failure.
  square_each_element<<<1, n>>>(up.get(), m);
  cudaError_t const error2 = cudaGetLastError();
  testTerminate(error2);
  cudaError_t const error3 = cudaDeviceSynchronize();
  assert(cudaSuccess == error3);
}
Example 4 (Bad)
# include <memory>
# include <algorithm>
# include <cassert>
# include "testTerminate.h"

struct square {
  template <typename T>
  T operator()(T t) {
    return t * t;
  }
};
// This template function can be instantiated and used in `__host__` code with
// a `__host__` only `UnaryOp`, but this will generate a vacuous warning about
// using a `__host__` only function in a `__host__ __device__` function. This
// warning can be disabled with `#pragma nv_exec_check_disable`, but this also
// disables relevant warnings for any invalid instantiations in `__device__`
// code with `__host__` only `UnaryOp`s.
# pragma nv_exec_check_disable

template <typename ForwardIt, typename Size, typename UnaryOp>

__host__ __device__

void inplace_transform_n(ForwardIt first, Size n, UnaryOp f) {
  for (Size i = 0; i < n; ++first, (void) ++i)
    *first = f(*first);
}

__global__ void square_each_element(int32_t* u, ptrdiff_t per_thread) {
  inplace_transform_n(u, per_thread, square{});
}

void ptrSquare(std::unique_ptr<int32_t> &up, int32_t n, int32_t m) {
  // This kernel instantiates and uses `inplace_transform_n` in `__device__`
  // code with a `__host__` only `UnaryOp`, which is invalid and leads to a run
  // time failure, but no compile time warning is produced.
  square_each_element<<<1, n>>>(up.get(), m);
  cudaError_t const error2 = cudaGetLastError();
  testTerminate(error2);
  cudaError_t const error3 = cudaDeviceSynchronize();
  assert(cudaSuccess == error3);
}
Excerpt from NVIDIA CUDA C++ Guidelines for robust and safety-critical programming, Version 3.0.1, Copyright (C) 2018-2023 NVIDIA Corporation.

Possible Messages

Key

Text

Severity

Disabled

constexpr_device_func_called_from_const_host

This call is in host code but attempts to call a constexpr device function

None

False

constexpr_device_func_called_from_host

This call is in host code but attempts to call a constexpr device function outside of a manifestly constant evaluated context

None

False

constexpr_host_func_called_from_const_device

This call is in device code but attempts to call a constexpr host function

None

False

constexpr_host_func_called_from_device

This call is in device code but attempts to call a constexpr host function outside of a manifestly constant evaluated context

None

False

device_func_called_from_host

This call is in host code but attempts to call a device function

None

False

host_func_called_from_device

This call is in device code but attempts to call a host function

None

False

Options