CMakeLists.txt
# Build script for the CudaCppExample executable (host driver + CUDA kernel).

# The CUDA_ARCHITECTURES target property used below was introduced in
# CMake 3.18; older versions silently ignore it.
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)

# Declare both languages up front instead of enabling CUDA afterwards.
project(CudaCppExample LANGUAGES CXX CUDA)

add_executable(CudaCppExample main.cpp kernel.cu)

# main.cpp contains a <<<...>>> kernel launch, which only the CUDA compiler
# understands, so it must not be handed to the plain C++ compiler.
set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA)

# Target the same GPU architectures as before. Relocatable device code is
# enabled because the kernel is defined in kernel.cu but launched from
# main.cpp, i.e. device code is referenced across translation units.
set_target_properties(CudaCppExample PROPERTIES
    CUDA_ARCHITECTURES "50;72"
    CUDA_SEPARABLE_COMPILATION ON)
main.cpp
#include <iostream>
#include <cuda_runtime.h>
// Number of elements processed by the example; also passed to the kernel as its `input` argument.
constexpr int N = 10;
/**
 * Aborts the program if a CUDA runtime call reported a failure.
 *
 * @param error  Status code returned by a CUDA runtime call.
 * @param file   Source file name to include in the diagnostic.
 * @param line   Source line number to include in the diagnostic.
 *
 * On cudaSuccess the function returns without doing anything. On any other
 * status it prints the CUDA error string with the file/line location to
 * stderr and terminates the process with EXIT_FAILURE.
 */
void cudaErrorCheck(cudaError_t error, const char *file, int line) {
    if (error == cudaSuccess) {
        return;
    }

    const char *message = cudaGetErrorString(error);
    std::cerr << "CUDA error: " << message << " at " << file << ":" << line << std::endl;
    exit(EXIT_FAILURE);
}
// Wraps a CUDA runtime call: forwards its status to cudaErrorCheck together
// with the file and line of the call site.
#define CUDA_CHECK(call) cudaErrorCheck((call), __FILE__, __LINE__)
// The kernel is defined in kernel.cu; declare it here so the launch below compiles.
__global__ void kernel(int *output, int input);

/**
 * Host driver: fills an N-element integer array, runs `kernel` over it on the
 * GPU and prints the values the kernel wrote back.
 *
 * The buffers are `int` because the kernel's output parameter is `int *`.
 *
 * @return 0 on success. Any CUDA failure terminates the process via CUDA_CHECK.
 */
int main() {
    // Launch configuration: enough 256-thread blocks to cover N elements.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Only N elements travel between host and device.
    const size_t hostBytes = N * sizeof(int);
    // Pad the device buffer to the full launch size so that every launched
    // thread has a valid slot to write, including the threads past N.
    const size_t deviceBytes =
        static_cast<size_t>(blocksPerGrid) * threadsPerBlock * sizeof(int);

    // Initialize data
    int *h_data = new int[N];
    for (int i = 0; i < N; ++i) {
        h_data[i] = i;
    }

    int *d_data = nullptr;
    CUDA_CHECK(cudaMalloc(&d_data, deviceBytes));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, hostBytes, cudaMemcpyHostToDevice));

    // Launch kernel
    kernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, N);
    // A launch returns no status itself: pick up configuration errors first,
    // then wait for the kernel so execution faults are reported here.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h_data, d_data, hostBytes, cudaMemcpyDeviceToHost));

    // Print results
    for (int i = 0; i < N; ++i) {
        std::cout << "Result[" << i << "]: " << h_data[i] << std::endl;
    }

    // Cleanup
    delete[] h_data;
    CUDA_CHECK(cudaFree(d_data));
    return 0;
}
kernel.cu
/**
 * Device-side integer addition, exported with C linkage.
 *
 * @param a  First addend.
 * @param b  Second addend.
 * @return   a + b.
 */
extern "C" __device__ int add(int a, int b) {
    const int total = a + b;
    return total;
}
/**
 * Device-side function object that multiplies two integers.
 */
struct Multiply {
    /**
     * @param a  First factor.
     * @param b  Second factor.
     * @return   a * b.
     */
    __device__ int operator()(int a, int b) {
        const int product = a * b;
        return product;
    }
};
/**
 * CUDA kernel that combines a C-linkage device function (`add`) with a C++
 * function object (`Multiply`).
 *
 * Each thread derives a flat index from its block and thread x-coordinates
 * and stores add(input, index) + Multiply()(input, index) at output[index].
 *
 * @param output  Array written by the kernel, one element per launched thread.
 * @param input   Integer operand combined with each thread's index.
 *
 * There is no bounds check: `output` must hold at least
 * gridDim.x * blockDim.x elements for the launch configuration used.
 */
__global__ void kernel(int *output, int input) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // C-linkage helper
    const int added = add(input, idx);

    // C++ functor
    const int scaled = Multiply()(input, idx);

    // One result per thread
    output[idx] = added + scaled;
}