ld: cannot find library (CMake found it earlier?)

tbouvier · June 21, 2023, 5:15pm

Hello,

I’m trying to compile a shared library I wrote, named neomem. However, when running the following cmake . && make, I get the following output:

-- Found pybind11: /mnt/view/chifflot-v100/include (found version "2.10.0")
-- MKL_ARCH: intel64
-- MKL_ROOT /mnt/view/._chifflot-v100/dxci44kjpbx3fierqjhlpz2fihbz3lwv
-- MKL_LINK: dynamic
-- MKL_INTERFACE_FULL: intel_ilp64
-- MKL_THREADING: intel_thread
-- MKL_MPI: intelmpi
-- Found MKL: intelmpi
-- Found MKL: /mnt/view/._chifflot-v100/dxci44kjpbx3fierqjhlpz2fihbz3lwv
-- Found MKL: /mnt/view/chifflot-v100/lib/intel64/libmkl_intel_ilp64.so
-- Found MKL: /mnt/view/chifflot-v100/lib/intel64/libmkl_intel_thread.so
-- Found MKL: /mnt/view/chifflot-v100/lib/intel64/libmkl_core.so
-- Found MKL: /mnt/view/chifflot-v100/compiler/latest/linux/compiler/lib/intel64_lin/libiomp5.so
-- Caffe2: Found protobuf with new-style protobuf targets.
-- Caffe2: Protobuf version 3.20.3.0
-- Caffe2: CUDA detected: 11.8
-- Caffe2: CUDA nvcc is: /mnt/view/chifflot-v100/bin/nvcc
-- Caffe2: CUDA toolkit directory: /mnt/view/chifflot-v100
-- Caffe2: Header version is: 11.8
-- Found cuDNN: v8.4.0  (include: /mnt/view/chifflot-v100/include, library: /mnt/view/chifflot-v100/lib/libcudnn.so)
-- /mnt/view/chifflot-v100/lib64/libnvrtc.so shorthash is 672ee683
-- Autodetected CUDA architecture(s):  7.0 7.0
-- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70

...

-- Configuring done (4.4s)
-- Generating done (0.0s)
-- Build files have been written to: /root/distributed-continual-learning/cpp_loader
[ 12%] Linking CXX shared library neomem.cpython-310-x86_64-linux-gnu.so
/usr/bin/ld: cannot find -lmkl_intel_ilp64
/usr/bin/ld: cannot find -lmkl_intel_thread
/usr/bin/ld: cannot find -lmkl_core

ld can’t find mkl_intel_ilp64, mkl_intel_thread and mkl_core and the compilation is failing as a result. Please notice that these libraries are found earlier in the logs (Found MKL lines).

Besides, MKL is installed in a non-standard location lib/intel64.

Here are my CMakeLists files:

CMakeLists.txt

cmake_minimum_required(VERSION 3.19 FATAL_ERROR)

project(Neomem VERSION "0.0.1")
set(CMAKE_CXX_STANDARD 17)

option(WITHOUT_CUDA "Disable CUDA support" OFF)

find_package(Python COMPONENTS Interpreter Development REQUIRED)
include_directories(${Python_INCLUDE_DIRS})

execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import pybind11; print(pybind11.get_cmake_dir())"
    OUTPUT_VARIABLE CUSTOM_PYTHON_PYBIND11_PATH
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_QUIET)
set(pybind11_DIR ${CUSTOM_PYTHON_PYBIND11_PATH})

find_package(pybind11 CONFIG REQUIRED)
include_directories(${pybind11_INCLUDE_DIR})

execute_process(
  COMMAND python -c "import torch; import os; print(os.path.dirname(torch.__file__), end='')"
  OUTPUT_VARIABLE TORCH_PATH
)
list(APPEND CMAKE_PREFIX_PATH ${TORCH_PATH})

set(TORCH_PYTHON_LIBRARIES "${TORCH_PATH}/lib/libtorch_python.so")

#
# This is where I find the failing package
#
find_package(MKL CONFIG REQUIRED)

find_package(Torch REQUIRED)
include_directories(${TORCH_INCLUDE_DIRS})

find_package(MPI REQUIRED)
include_directories(${MPI_INCLUDE_PATH})

find_package(Thallium REQUIRED)

if (NOT WITHOUT_CUDA)
  find_package(CUDA REQUIRED)
else ()
  add_definitions(-DWITHOUT_CUDA)
endif ()

execute_process(COMMAND python -c "import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))"
                OUTPUT_VARIABLE WITH_ABI OUTPUT_STRIP_TRAILING_WHITESPACE)
add_definitions(
  -O3
  -g
  -Wall
  -fPIC
  -Wl,--no-as-needed
  -D__ASSERT
  -D__DEBUG
)

add_subdirectory(src)

src/CMakeLists.txt

# Set sources and executable
set(LIBRARY_SOURCES
    engine_loader.cpp
    distributed_stream_loader.cpp
    rehearsal.cpp
)
pybind11_add_module(neomem SHARED ${LIBRARY_SOURCES})
target_include_directories(neomem PUBLIC ${PYTHON_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS})
target_link_libraries(neomem PUBLIC ${Python_LIBRARIES} ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARIES} ${MPI_CXX_LIBRARIES} thallium)

I suspect this is related to the target_link_libraries() function, which seems incomplete.

Could someone provide some guidance? Thank you

bill.hoffman · June 21, 2023, 8:09pm

tbouvier:

-- Found MKL: /mnt/view/chifflot-v100/lib/intel64/libmkl_intel_ilp64.so
-- Found MKL: /mnt/view/chifflot-v100/lib/intel64/libmkl_intel_thread.so
-- Found MKL: /mnt/view/chifflot-v100/lib/intel64/libmkl_core.so
-- Found MKL: /mnt/view/chifflot-v100/compiler/latest/linux/compiler/lib/intel64_lin/libiomp5.so

I see find_package(MKL), and as you point out it is finding it. However, I don’t see anything that uses the result of that find. Somewhere the cmake code is adding -l flags to mkl libraries and not the full paths that should be used. So, I don’t think there is enough code here to tell what is going wrong. The MKL module is not part of CMake, so that might be doing something odd as well.

tbouvier · June 22, 2023, 1:47pm

“I don’t see anything that uses the result of that find”. This is my observation too. In the command issued by make, all the other shared libraries are referred to by their full path i.e., /mnt/view/._chifflot-v100/dxci44kjpbx3fierqjhlpz2fihbz3lwv/lib/python3.10/site-packages/torch/lib/libtorch_python.so. However the 3 missing libraries appear as -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core. I can’t see what parameter I can act on to change this behavior, since I copied all my CMake code in my previous post.

I’ve installed my entire environment using Spack. Could my problem be related to this? Alternatively, could my issue be caused by this file in the PyTorch repo, as MKL is a dependency of PyTorch? I don’t know what to look for, or where to look

tbouvier · June 22, 2023, 11:14pm

I was able to link the missing libraries by adding -D MKL_LIBRARIES=/mnt/view/chifflot-v100/lib/intel64 to my CMake command. Still, I would expect this path to be passed automatically, as it is found earlier by CMake. If anyone has an idea

ben.boeckel · June 25, 2023, 7:48pm

Yes, I think the MKL_LIBRARIES variable (there should be docs in the module about what variables/targets it provides) probably needs to be used somewhere…

bill.hoffman · June 26, 2023, 6:36pm

This could be a Spack issue. Spack can inject things like linker flags into builds by itself. Can you run this outside of spack and reproduce?