CUDA setup not found with update-alternatives symlinks on ubuntu

maichmueller · September 22, 2024, 2:29pm

Hi,

CMake (3.26.5 and also 3.30.3) seems to fail to find the CUDA toolkit when it is symlinked via Ubuntu’s update-alternatives. More precisely, I have this sample project CMakeLists.txt from CLion:

cmake_minimum_required(VERSION 3.24)
project(xmimir LANGUAGES CXX CUDA)

# Default CUDA C++ standard for targets defined below.
set(CMAKE_CUDA_STANDARD 20)

# Print the current values of the CUDA-related variables into the configure log.
message(STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}")
message(STATUS "CUDAToolkit_ROOT: ${CUDAToolkit_ROOT}")

find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)

# Static CUDA library built with separable compilation enabled.
add_library(xmimir STATIC library.cu)
set_property(TARGET xmimir PROPERTY CUDA_SEPARABLE_COMPILATION ON)

At work we have an IT setup in which /usr/local is mounted via NFSv4 from another server. The CUDA toolkit is then located in the default /usr/local directory:

l /usr/local/                                                                                                                                                                        
drwxr-xr-x@     - root 20 Aug 11:21 admin/
drwxr-xr-x@     - root 5 Jun  2023  bib/
drwxr-xr-x@     - root 19 Sep 16:41 bin/
lrwxrwxrwx      - root 19 Sep 16:38 cuda -> /etc/alternatives/cuda/
drwxr-xr-x@     - root 9 Aug  2022  cuda-10.1/
drwxr-xr-x@     - root 9 Aug  2022  cuda-11.2/
drwxr-xr-x@     - root 4 Aug  2022  cuda-11.7/
lrwxrwxrwx      - root 19 Sep 16:38 cuda-12 -> /etc/alternatives/cuda-12/
drwxr-xr-x@     - root 19 Sep 16:38 cuda-12.6/
drwxr-xr-x@     - root 9 Aug  2022  cudnn-10.1-v7.6/
drwxr-xr-x@     - root 9 Aug  2022  cudnn-11.X-v8.4/

and

l /usr/bin/nvcc                                                                                                                                                                      
lrwxrwxrwx - root 21 Sep 19:12 /usr/bin/nvcc -> /etc/alternatives/nvcc*

as well as the config in update-alternatives:

update-alternatives --display nvcc           
                                                                                                                                        
nvcc - auto mode
  link best version is /usr/local/cuda-12.6/bin/nvcc
  link currently points to /usr/local/cuda-12.6/bin/nvcc
  link nvcc is /usr/bin/nvcc
/usr/local/cuda-12.6/bin/nvcc - priority 100

update-alternatives --display cuda           
                                                                                                                                        
cuda - auto mode
  link best version is /usr/local/cuda-12
  link currently points to /usr/local/cuda-12
  link cuda is /usr/local/cuda
/usr/local/cuda-12 - priority 100

update-alternatives --display cuda-12              
                                                                                                                                  
cuda-12 - auto mode
  link best version is /usr/local/cuda-12.6
  link currently points to /usr/local/cuda-12.6
  link cuda-12 is /usr/local/cuda-12
/usr/local/cuda-12.6 - priority 100

If I configure this project with the symlinked versions of the CUDA toolkit, i.e.
nvcc → /usr/bin/nvcc
cuda → /usr/local/cuda,
then cmake cannot find the cuda toolkit / compiler:

cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_COMPILER=/usr/bin/gcc-12 -DCMAKE_CXX_COMPILER=/usr/bin/g++-12 -G Ninja -DCUDAToolkit_ROOT=/usr/local/cuda -DCMAKE_CUDA_ARCHITECTURES=80 -DCMAKE_CUDA_COMPILER=/usr/bin/nvcc --debug-output -S /work/rleap1/michael.aichmueller/github/xmimir -B /work/rleap1/michael.aichmueller/github/xmimir/cmake-build-debug-gcc12-nvcc-polonium-broken
Running with debug output on.
-- The CXX compiler identification is GNU 12.3.0
   Called from: [3]	/u/michael.aichmueller/cmake-3.26.5-linux-x86_64/share/cmake-3.26/Modules/CMakeDetermineCompilerId.cmake
                [2]	/u/michael.aichmueller/cmake-3.26.5-linux-x86_64/share/cmake-3.26/Modules/CMakeDetermineCXXCompiler.cmake
                [1]	/work/rleap1/michael.aichmueller/github/xmimir/CMakeLists.txt
CMake Error at /u/michael.aichmueller/cmake-3.26.5-linux-x86_64/share/cmake-3.26/Modules/CMakeDetermineCUDACompiler.cmake:227 (message):
  Couldn't find CUDA library root.
Call Stack (most recent call first):
  CMakeLists.txt:2 (project)


   Called from: [2]	/u/michael.aichmueller/cmake-3.26.5-linux-x86_64/share/cmake-3.26/Modules/CMakeDetermineCUDACompiler.cmake
                [1]	/work/rleap1/michael.aichmueller/github/xmimir/CMakeLists.txt
-- Configuring incomplete, errors occurred!

But if I give the real paths, i.e.
nvcc → /usr/local/cuda-12.6/bin/nvcc
cuda → /usr/local/cuda-12.6,
then cmake is able to find everything:

cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_COMPILER=/usr/bin/gcc-12 -DCMAKE_CXX_COMPILER=/usr/bin/g++-12 -G Ninja -DCUDAToolkit_ROOT=/usr/local/cuda-12.6 -DCMAKE_CUDA_ARCHITECTURES=80 -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.6/bin/nvcc --debug-output -S /work/rleap1/michael.aichmueller/github/xmimir -B /work/rleap1/michael.aichmueller/github/xmimir/cmake-build-debug-gcc12-nvcc-polonium
Running with debug output on.
-- CMAKE_CUDA_COMPILER: /usr/local/cuda-12.6/bin/nvcc
   Called from: [1]	/work/rleap1/michael.aichmueller/github/xmimir/CMakeLists.txt
-- CUDAToolkit_ROOT: /usr/local/cuda-12.6
   Called from: [1]	/work/rleap1/michael.aichmueller/github/xmimir/CMakeLists.txt
-- Configuring done (0.3s)
-- Generating /work/rleap1/michael.aichmueller/github/xmimir/cmake-build-debug-gcc12-nvcc-polonium
   Called from: [1]	/work/rleap1/michael.aichmueller/github/xmimir/CMakeLists.txt
-- Generating done (0.0s)
-- Build files have been written to: /work/rleap1/michael.aichmueller/github/xmimir/cmake-build-debug-gcc12-nvcc-polonium

Is this behaviour expected?
Is it due to symlinking with update-alternatives or does cmake simply see multiple cuda-xx folders and decide not to choose, despite being given an explicit path?
If the latter, shouldn’t nvcc at least be found, since there is no alternative in /usr/bin?

craig.scott · September 22, 2024, 10:01pm

Might be something @robert.maynard knows more about? If I had to guess, I’d assume CMake probably looks for some things in locations relative to where it finds nvcc, but it doesn’t follow symlinks and therefore doesn’t see the real locations, but that’s all just a guess.

robert.maynard · September 30, 2024, 6:47pm

You don’t want to symlink nvcc directly, as that breaks the compiler’s internal detection logic and makes it unusable:

~  $ ln -s /usr/local/cuda-12.4/bin/nvcc nvcc

~  $ ./nvcc -v -c test.cu
#$ _NVVM_BRANCH_=nvvm
#$ _NVVM_BRANCH_SUFFIX_=
#$ _SPACE_= 
#$ _CUDART_=cudart
#$ _HERE_=.
#$ _THERE_=.
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_SIZE_=64
#$ gcc -D__CUDA_ARCH_LIST__=520 -D__NV_LEGACY_LAUNCH -E -x c++ -D__CUDACC__ -D__NVCC__   -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=4 -D__CUDACC_VER_BUILD__=131 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=4 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "test.cu" -o "/tmp/tmpxft_00008507_00000000-5_test.cpp4.ii" 
cc1plus: fatal error: cuda_runtime.h: No such file or directory
compilation terminated.
# --error 0x1 --


~  $ nvcc -v -c test.cu
#$ _NVVM_BRANCH_=nvvm
#$ _NVVM_BRANCH_SUFFIX_=
#$ _SPACE_= 
#$ _CUDART_=cudart
#$ _HERE_=/usr/local/cuda/bin
#$ _THERE_=/usr/local/cuda/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_DIR_=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/usr/local/cuda/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/home/rmaynard/.local/bin:/usr/local/cuda/bin:/home/rmaynard/.cargo/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin:/home/rmaynard/.cargo/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"  
#$ LIBRARIES=  "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -D__CUDA_ARCH_LIST__=520 -D__NV_LEGACY_LAUNCH -E -x c++ -D__CUDACC__ -D__NVCC__  "-I/usr/local/cuda/bin/../targets/x86_64-linux/include"    -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=4 -D__CUDACC_VER_BUILD__=131 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=4 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "test.cu" -o "/tmp/tmpxft_00008518_00000000-5_test.cpp4.ii" 
...
#$ gcc -D__CUDA_ARCH__=520 -D__CUDA_ARCH_LIST__=520 -D__NV_LEGACY_LAUNCH -c -x c++  -DCUDA_DOUBLE_MATH_FUNCTIONS -Wno-psabi "-I/usr/local/cuda/bin/../targets/x86_64-linux/include"   -m64 "/tmp/tmpxft_00008518_00000000-6_test.cudafe1.cpp" -o "test.o"

What you want to do is use your update-alternatives setup to switch the cuda symlink directory (which it looks like you already have set up).