
Packaged the script as a Docker container stored in gcr.io/google-containers. A daemonset deployment is included to make it easy to consume the installer. A cluster e2e test has been added to exercise the installation daemonset and to verify the installation by running a sample CUDA application. The node e2e tests for GPUs have been updated to avoid running on nodes without GPU devices.

Signed-off-by: Vishnu kannan <vishnuk@google.com>
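A minimal sketch of consuming the installer on a cluster (the manifest file name below is hypothetical; the actual daemonset spec ships with this change):

    kubectl create -f nvidia-driver-installer-daemonset.yaml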
208 lines
7.0 KiB
Bash
#!/bin/bash
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script dynamically installs Nvidia kernel drivers on Container-Optimized OS (COS).

set -o errexit
set -o pipefail
set -x

# The script must be run as root.
# Prerequisites:
#
# LAKITU_KERNEL_SHA1 - Expected to be set to the git SHA1, in the ChromiumOS
#                      kernel repository, of the kernel version running on the host.
# BASE_DIR - Directory that is mapped to a stateful partition on the host. Defaults to `/rootfs/nvidia`.
#
# The script will output the following artifacts:
# ${BASE_DIR}/lib/* --> Nvidia CUDA libraries
# ${BASE_DIR}/bin/* --> Nvidia debug utilities
# ${BASE_DIR}/.cache/* --> Nvidia driver artifacts cached for idempotency.
#
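# A minimal invocation sketch (hypothetical file name; the daemonset added in
# this change is expected to set up the /rootfs mount and these variables):
#
#   LAKITU_KERNEL_SHA1=<kernel-commit-sha1> BASE_DIR=/rootfs/nvidia bash nvidia-installer.sh
#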

BASE_DIR=${BASE_DIR:-"/rootfs/nvidia"}
CACHE_DIR="${BASE_DIR}/.cache"
USR_WORK_DIR="${CACHE_DIR}/usr-work"
USR_WRITABLE_DIR="${CACHE_DIR}/usr-writable"
LIB_WORK_DIR="${CACHE_DIR}/lib-work"
LIB_WRITABLE_DIR="${CACHE_DIR}/lib-writable"

LIB_OUTPUT_DIR="${BASE_DIR}/lib"
BIN_OUTPUT_DIR="${BASE_DIR}/bin"

KERNEL_SRC_DIR="/lakitu-kernel"
NVIDIA_DRIVER_DIR="/nvidia"
NVIDIA_DRIVER_VERSION="375.26"

# Source: https://developer.nvidia.com/cuda-downloads
NVIDIA_CUDA_URL="https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_375.26_linux-run"
NVIDIA_CUDA_MD5SUM="33e1bd980e91af4e55f3ef835c103f9b"
NVIDIA_CUDA_PKG_NAME="cuda_8.0.61_375.26_linux.run"
NVIDIA_DRIVER_PKG_NAME="NVIDIA-Linux-x86_64-375.26.run"

check_nvidia_device() {
  # Log all PCI devices to aid debugging.
  lspci
  if ! lspci | grep -i -q NVIDIA; then
    echo "No NVIDIA devices attached to this instance."
    exit 0
  fi
  echo "Found NVIDIA device on this instance."
}

prepare_kernel_source() {
  # Kept for reference; the sources are expected to be pre-mounted at ${KERNEL_SRC_DIR}.
  local kernel_git_repo="https://chromium.googlesource.com/chromiumos/third_party/kernel"
  local kernel_version="$(uname -r)"
  local kernel_version_stripped="$(echo ${kernel_version} | sed 's/\+//')"

  # Checkout the correct tag.
  echo "Downloading kernel source at tag ${kernel_version_stripped} ..."
  pushd "${KERNEL_SRC_DIR}"
  # TODO: Consume KERNEL SHA1 from COS image directly.
  # git checkout "tags/v${kernel_version_stripped}"
  git checkout "${LAKITU_KERNEL_SHA1}"

  # Prepare kernel config and source for modules.
  echo "Preparing kernel sources ..."
  zcat "/proc/config.gz" > ".config"
  # `olddefconfig` fills in defaults for any options missing from the running
  # kernel's config; `modules_prepare` builds just enough of the tree to
  # compile out-of-tree modules against it.
  make olddefconfig
  make modules_prepare
  # Done.
  popd
}

download_install_nvidia() {
  local pkg_name="${NVIDIA_CUDA_PKG_NAME}"
  local url="${NVIDIA_CUDA_URL}"
  local log_file_name="${NVIDIA_DRIVER_DIR}/nvidia-installer.log"

  mkdir -p "${NVIDIA_DRIVER_DIR}"
  pushd "${NVIDIA_DRIVER_DIR}"

  echo "Downloading Nvidia CUDA package from ${url} ..."
  curl -L -s "${url}" -o "${pkg_name}"
  echo "${NVIDIA_CUDA_MD5SUM} ${pkg_name}" | md5sum --check

  echo "Extracting Nvidia CUDA package ..."
  sh "${pkg_name}" --extract="$(pwd)"

  echo "Running the Nvidia driver installer ..."
  if ! sh "${NVIDIA_DRIVER_PKG_NAME}" --kernel-source-path="${KERNEL_SRC_DIR}" --silent --accept-license --keep --log-file-name="${log_file_name}"; then
    echo "Nvidia installer failed, log below:"
    echo "==================================="
    tail -50 "${log_file_name}"
    echo "==================================="
    exit 1
  fi
  popd
}

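# COS ships with the LoadPin security module, which only allows kernel modules
# to be loaded from the verified root filesystem. Adding lsm.module_locking=0
# to the kernel command line in the GRUB config (followed by a reboot)
# disables it so that the freshly built Nvidia modules can be loaded.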
unlock_loadpin_and_reboot_if_needed() {
  kernel_cmdline="$(cat /proc/cmdline)"
  if ! echo "${kernel_cmdline}" | grep -q "lsm.module_locking=0"; then
    local -r esp_partition="/dev/sda12"
    local -r mount_path="/tmp/esp"
    local -r grub_cfg="efi/boot/grub.cfg"

    mkdir -p "${mount_path}"
    mount "${esp_partition}" "${mount_path}"

    pushd "${mount_path}"
    cp "${grub_cfg}" "${grub_cfg}.orig"
    sed 's/cros_efi/cros_efi lsm.module_locking=0/g' -i "${grub_cfg}"
    cat "${grub_cfg}"
    popd
    sync
    umount "${mount_path}"
    # Restart the node for loadpin to be disabled. /sysrq is expected to be
    # the host's /proc/sysrq-trigger mounted into this container; writing 'b'
    # reboots the machine immediately. After the reboot, the daemonset
    # restarts this installer, which then skips this branch.
    echo b > /sysrq
  fi
}

create_uvm_device() {
  # Create unified memory device file.
  nvidia-modprobe -c0 -u
}

verify_base_image() {
  mount --bind /rootfs/etc/os-release /etc/os-release
  local id="$(grep "^ID=" /etc/os-release)"
  if [[ "${id#*=}" != "cos" ]]; then
    echo "This installer is designed to run on Container-Optimized OS only"
    exit 1
  fi
}

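# Mount overlayfs instances over /usr and /lib so that everything the driver
# installer writes there lands in the upperdirs under ${CACHE_DIR}, which sit
# on the host's stateful partition and survive container restarts.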
setup_overlay_mounts() {
  mkdir -p "${USR_WRITABLE_DIR}" "${USR_WORK_DIR}" "${LIB_WRITABLE_DIR}" "${LIB_WORK_DIR}"
  mount -t overlay -o "lowerdir=/usr,upperdir=${USR_WRITABLE_DIR},workdir=${USR_WORK_DIR}" none /usr
  mount -t overlay -o "lowerdir=/lib,upperdir=${LIB_WRITABLE_DIR},workdir=${LIB_WORK_DIR}" none /lib
}

exit_if_install_not_needed() {
  if nvidia-smi; then
    echo "Nvidia drivers already installed. Skipping installation."
    post_installation_sequence
    exit 0
  fi
}

restart_kubelet() {
  echo "Sending SIGTERM to kubelet"
  # The kubelet runs under a process manager (systemd on COS) that is expected
  # to restart it; on startup it rediscovers the node's GPU devices.
  pkill -SIGTERM kubelet
}

# Copy user space libraries and debug utilities to a special output directory on the host.
# Make these artifacts world readable and executable.
copy_files_to_host() {
  mkdir -p "${LIB_OUTPUT_DIR}" "${BIN_OUTPUT_DIR}"
  cp -r "${USR_WRITABLE_DIR}/lib/x86_64-linux-gnu/"* "${LIB_OUTPUT_DIR}/"
  cp -r "${USR_WRITABLE_DIR}/bin/"* "${BIN_OUTPUT_DIR}/"
  chmod -R a+rx "${LIB_OUTPUT_DIR}"
  chmod -R a+rx "${BIN_OUTPUT_DIR}"
}

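# A consumption sketch (an assumption, not enforced by this script): workloads
# on the node can hostPath-mount ${BASE_DIR} and add the copied libraries in
# ${LIB_OUTPUT_DIR} to their LD_LIBRARY_PATH.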
post_installation_sequence() {
  create_uvm_device
  # Copy nvidia user space libraries and debug tools to the host for use from other containers.
  copy_files_to_host
  # Restart the kubelet for it to pick up the GPU devices.
  restart_kubelet
}

main() {
  # Do not run the installer unless the base image is Container-Optimized OS (COS).
  verify_base_image
  # Do not run the installer unless an Nvidia device is found on the PCI bus.
  check_nvidia_device
  # Set up overlay mounts to capture nvidia driver artifacts in more permanent storage on the host.
  setup_overlay_mounts
  # Disable LoadPin, a COS security feature that would otherwise prevent the
  # dynamically built Nvidia kernel modules from being loaded.
  unlock_loadpin_and_reboot_if_needed
  # Exit if installation is not required (for idempotency).
  exit_if_install_not_needed
  # Checkout kernel sources appropriate for the base image.
  prepare_kernel_source
  # Download, compile and install nvidia drivers.
  download_install_nvidia
  # Verify that the Nvidia drivers have been successfully installed.
  nvidia-smi
  # Perform post installation steps - copying artifacts, restarting kubelet, etc.
  post_installation_sequence
}

main "$@"