Merge pull request #55911 from davidz627/localSSDUUID
Automatic merge from submit-queue (batch tested with PRs 54824, 55911, 55730, 55979, 55961). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Add options for mounting SCSI or NVMe local SSD through Block or Filesystem and do all of that with UUID Fixes: #51431 Fixed version of: #53466 Mount SCSI local SSD by UUID in /mnt/disks/by-uuid/, also allows for users to request and mount NVMe disks. Both types of disks will be accessible either through block or file-system. I have confirmed that it is no longer crashing when nodes are initialized on GKE.
This commit is contained in:
@@ -668,6 +668,7 @@ ENABLE_PROMETHEUS_TO_SD: $(yaml-quote ${ENABLE_PROMETHEUS_TO_SD:-false})
|
||||
ENABLE_POD_PRIORITY: $(yaml-quote ${ENABLE_POD_PRIORITY:-})
|
||||
CONTAINER_RUNTIME: $(yaml-quote ${CONTAINER_RUNTIME:-})
|
||||
CONTAINER_RUNTIME_ENDPOINT: $(yaml-quote ${CONTAINER_RUNTIME_ENDPOINT:-})
|
||||
NODE_LOCAL_SSDS_EXT: $(yaml-quote ${NODE_LOCAL_SSDS_EXT:-})
|
||||
LOAD_IMAGE_COMMAND: $(yaml-quote ${LOAD_IMAGE_COMMAND:-})
|
||||
EOF
|
||||
if [ -n "${KUBELET_PORT:-}" ]; then
|
||||
|
@@ -36,6 +36,11 @@ MASTER_ROOT_DISK_SIZE=${MASTER_ROOT_DISK_SIZE:-$(get-master-root-disk-size)}
|
||||
NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard}
|
||||
NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB}
|
||||
NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0}
|
||||
# An extension to local SSDs allowing users to specify block/fs and SCSI/NVMe devices
|
||||
# Format of this variable will be "#,scsi/nvme,block/fs" you can specify multiple
|
||||
# configurations by separating them with a semicolon, e.g. "2,scsi,fs;1,nvme,block"
|
||||
# is a request for 2 SCSI formatted and mounted SSDs and 1 NVMe block device SSD.
|
||||
NODE_LOCAL_SSDS_EXT=${NODE_LOCAL_SSDS_EXT:-}
|
||||
# Accelerators to be attached to each node. Format "type=<accelerator-type>,count=<accelerator-count>"
|
||||
# More information on available GPUs here - https://cloud.google.com/compute/docs/gpus/
|
||||
NODE_ACCELERATORS=${NODE_ACCELERATORS:-""}
|
||||
|
@@ -25,6 +25,9 @@ set -o errexit
|
||||
set -o nounset
|
||||
set -o pipefail
|
||||
|
||||
readonly UUID_MNT_PREFIX="/mnt/disks/by-uuid/google-local-ssds"
|
||||
readonly UUID_BLOCK_PREFIX="/dev/disk/by-uuid/google-local-ssds"
|
||||
|
||||
function setup-os-params {
|
||||
# Reset core_pattern. On GCI, the default core_pattern pipes the core dumps to
|
||||
# /sbin/crash_reporter which is more restrictive in saving crash dumps. So for
|
||||
@@ -85,11 +88,85 @@ function create-dirs {
|
||||
fi
|
||||
}
|
||||
|
||||
# Formats the given device ($1) if needed and mounts it at given mount point
|
||||
# Gets the total number of disks with interface ($1) and format ($2) that the
# user requested in ${NODE_LOCAL_SSDS_EXT}.
#
# Globals:
#   NODE_LOCAL_SSDS_EXT (read): ";"-separated groups, each of the form
#     "count,interface,format" (for example "2,scsi,fs;1,nvme,block").
#   localdisknum (written): sum of the counts of every matching group; this is
#     how the result is handed back to the caller.
# Arguments:
#   $1 - interface to match (compared case-insensitively)
#   $2 - format to match (compared case-insensitively)
function get-local-disk-num() {
  local interface="${1}"
  local format="${2}"
  # Keep the scratch variables local so they do not leak into the caller.
  local -a ssdgroups=()
  local -a ssdopts=()
  local ssdgroup=""
  local opnum=""
  local opinterface=""
  local opformat=""

  localdisknum=0
  if [[ -z "${NODE_LOCAL_SSDS_EXT:-}" ]]; then
    return 0
  fi

  IFS=";" read -r -a ssdgroups <<< "${NODE_LOCAL_SSDS_EXT}"
  for ssdgroup in "${ssdgroups[@]}"; do
    IFS="," read -r -a ssdopts <<< "${ssdgroup}"
    # Default missing fields to empty so that a malformed group is skipped
    # instead of aborting the script under "set -o nounset".
    opnum="${ssdopts[0]:-}"
    opinterface="${ssdopts[1]:-}"
    opformat="${ssdopts[2]:-}"

    if [[ "${opformat,,}" == "${format,,}" && "${opinterface,,}" == "${interface,,}" ]]; then
      # Only add well-formed counts; force base 10 so "08" is not read as octal.
      if [[ "${opnum}" =~ ^[0-9]+$ ]]; then
        localdisknum=$((localdisknum + 10#${opnum}))
      fi
    fi
  done
}
|
||||
|
||||
# Creates a symlink to the device ($1) inside the directory ($2) so that the
# device may be used as block storage. The link is named
# "local-ssd-<uuid>", where <uuid> is obtained from get-or-generate-uuid.
#
# Globals:
#   retuuid (read): set by get-or-generate-uuid.
# Arguments:
#   $1 - path of the device to link to
#   $2 - directory in which the symlink is created (created if missing)
function safe-block-symlink(){
  local device="${1}"
  local symdir="${2}"

  mkdir -p "${symdir}"

  # get-or-generate-uuid hands its result back through ${retuuid}.
  get-or-generate-uuid "${device}"
  local myuuid="${retuuid}"

  local sym="${symdir}/local-ssd-${myuuid}"
  # Do not "mkdir -p ${sym}" as that will cause unintended symlink behavior
  # -f/-n replace a link left over from a previous run instead of failing.
  ln -sfn "${device}" "${sym}"
  # Report the device we were given rather than relying on a caller's ${ssd}.
  echo "Created a symlink for SSD ${device} at ${sym}"
  chmod a+w "${sym}"
}
|
||||
|
||||
# Gets the previously generated UUID for a device from the ssd map file if one
# is recorded there; otherwise generates a new UUID with uuidgen and appends
# it to the map file.
#
# Globals:
#   retuuid (written): the UUID associated with the device.
# Arguments:
#   $1 - device path, used as the lookup key in the map file
#   $2 - optional path of the map file
#        (default: /home/kubernetes/localssdmap.txt)
# Exits with status 2 if no UUID could be obtained.
function get-or-generate-uuid(){
  local device="${1}"
  local ssdmap="${2:-/home/kubernetes/localssdmap.txt}"
  local myuuid=""
  local mapped_device=""
  local mapped_uuid=""

  echo "Generating or getting UUID from ${ssdmap}"

  if [[ ! -e "${ssdmap}" ]]; then
    touch "${ssdmap}"
    chmod +w "${ssdmap}"
  fi

  # each line of the ssdmap looks like "${device} persistent-uuid"
  # Compare the whole device field so that e.g. /dev/nvme0n1 does not pick up
  # the entry recorded for /dev/nvme0n10.
  while read -r mapped_device mapped_uuid _; do
    if [[ "${mapped_device}" == "${device}" ]]; then
      myuuid="${mapped_uuid}"
      break
    fi
  done < "${ssdmap}"

  if [[ -z "${myuuid}" ]]; then
    # generate new uuid and add it to the map
    # The assignment is kept separate from "local" so that a uuidgen failure
    # is actually visible in the exit status.
    if ! myuuid="$(uuidgen)"; then
      echo "Failed to generate valid UUID with uuidgen" >&2
      exit 2
    fi
    if [[ -z "${myuuid}" ]]; then
      echo "Failed to get a uuid for device ${device} when symlinking." >&2
      exit 2
    fi
    echo "${device} ${myuuid}" >> "${ssdmap}"
  fi

  retuuid="${myuuid}"
}
|
||||
|
||||
#Formats the given device ($1) if needed and mounts it at given mount point
|
||||
# ($2).
|
||||
function safe-format-and-mount() {
|
||||
device=$1
|
||||
mountpoint=$2
|
||||
local device="${1}"
|
||||
local mountpoint="${2}"
|
||||
|
||||
# Format only if the disk is not already formatted.
|
||||
if ! tune2fs -l "${device}" ; then
|
||||
@@ -102,18 +179,135 @@ function safe-format-and-mount() {
|
||||
mount -o discard,defaults "${device}" "${mountpoint}"
|
||||
}
|
||||
|
||||
# Local ssds, if present, are mounted at /mnt/disks/ssdN.
|
||||
# Gets a device's UUID from /dev/disk/by-uuid/ and bind mounts the device's
# existing mount point at
# ${UUID_MNT_PREFIX}-<interface>-fs/local-ssd-<uuid>.
#
# Globals:
#   UUID_MNT_PREFIX (read): prefix of the bind mount directory.
#   interface (read): used when $3 is not given; mount-ext declares it as a
#     local before calling this function, so it is visible here.
# Arguments:
#   $1 - directory where the device is already mounted
#   $2 - kernel name of the device (the last path component of the target of
#        its /dev/disk/by-uuid/ symlink)
#   $3 - optional interface name; defaults to the caller's ${interface}
# Exits with status 2 if the interface or the UUID cannot be determined.
function unique-uuid-bind-mount(){
  local mountpoint="${1}"
  local actual_device="${2}"
  # The expansion happens before "local" takes effect, so the right-hand side
  # still sees the caller's ${interface}.
  local interface="${3:-${interface:-}}"
  local uuid_link=""
  local myuuid=""

  if [[ -z "${interface}" ]]; then
    echo "Failed to determine the interface of device ${actual_device} when mounting." >&2
    exit 2
  fi

  # Trigger udev refresh so that newly formatted devices are propagated in by-uuid
  udevadm control --reload-rules
  udevadm trigger
  udevadm settle

  # Walk the by-uuid symlinks instead of parsing "ls -l". Matching on
  # "*/<device>" requires the whole last path component to be equal, which
  # prevents substring matching.
  for uuid_link in /dev/disk/by-uuid/*; do
    [[ -L "${uuid_link}" ]] || continue
    if [[ "$(readlink "${uuid_link}")" == */"${actual_device}" ]]; then
      myuuid="${uuid_link##*/}"
      break
    fi
  done

  # myuuid should be the uuid of the device as found in /dev/disk/by-uuid/
  if [[ -z "${myuuid}" ]]; then
    echo "Failed to get a uuid for device ${actual_device} when mounting." >&2
    exit 2
  fi

  # bindpoint should be the full path of the to-be-bound device
  local bindpoint="${UUID_MNT_PREFIX}-${interface}-fs/local-ssd-${myuuid}"

  safe-bind-mount "${mountpoint}" "${bindpoint}"
}
|
||||
|
||||
# Bind mounts the directory ($1) onto the directory ($2).
#
# Arguments:
#   $1 - existing mount point that should be made available elsewhere
#   $2 - directory to bind it to; created if it does not exist and made
#        writable by all users after the bind mount
function safe-bind-mount(){
  local mountpoint="${1}"
  local bindpoint="${2}"

  # The target of a bind mount has to exist before mounting.
  mkdir -p "${bindpoint}"
  echo "Binding '${mountpoint}' at '${bindpoint}'"
  mount --bind "${mountpoint}" "${bindpoint}"
  chmod a+w "${bindpoint}"
}
|
||||
|
||||
|
||||
# Mounts, bind mounts, or symlinks a device depending on its interface and
# format.
#
# For format "fs" the device is formatted and mounted at /mnt/disks/ssd<N>
# (interface "scsi") or /mnt/disks/ssd-nvme<N> (any other interface); when
# NODE_LOCAL_SSDS_EXT is set it is additionally bind mounted by UUID.
# For format "block" a symlink is created under
# ${UUID_BLOCK_PREFIX}-<interface>-block.
#
# Globals:
#   NODE_LOCAL_SSDS_EXT (read), UUID_BLOCK_PREFIX (read)
# Arguments:
#   $1 - path of the device
#   $2 - device number used in the mount point name
#   $3 - interface ("scsi" or "nvme")
#   $4 - format ("fs" or "block")
# Exits with status 2 if $2 is empty and with status 1 if the kernel name of
# the device does not look like sd* / nvme*.
function mount-ext(){
  # ssd and interface stay plain locals: helpers called from here can read
  # them through dynamic scoping.
  local ssd="${1}"
  local devicenum="${2}"
  local interface="${3}"
  local format="${4}"
  local actual_device=""
  local mountpoint=""
  local symdir=""

  if [[ -z "${devicenum}" ]]; then
    echo "Failed to get the local disk number for device ${ssd}" >&2
    exit 2
  fi

  # TODO: Handle partitioned disks. Right now this code just ignores partitions
  if [[ "${format}" == "fs" ]]; then
    if [[ "${interface}" == "scsi" ]]; then
      # Resolve the by-id symlink to the kernel device and keep its name.
      # A failing readlink leaves the name empty so the check below reports it.
      actual_device="$(readlink -f "${ssd}" | cut -d '/' -f 3)" || actual_device=""
      # Error checking
      if [[ "${actual_device}" != sd* ]]; then
        echo "'actual_device' is not of the correct format. It must be the kernel name of the device, got ${actual_device} instead" >&2
        exit 1
      fi
      mountpoint="/mnt/disks/ssd${devicenum}"
    else
      # This path is required because the existing Google images do not
      # expose NVMe devices in /dev/disk/by-id so we are using the /dev/nvme instead
      actual_device="$(cut -d '/' -f 3 <<< "${ssd}")"
      # Error checking
      if [[ "${actual_device}" != nvme* ]]; then
        echo "'actual_device' is not of the correct format. It must be the kernel name of the device, got ${actual_device} instead" >&2
        exit 1
      fi
      mountpoint="/mnt/disks/ssd-nvme${devicenum}"
    fi

    safe-format-and-mount "${ssd}" "${mountpoint}"
    # We only do the bindmount if users are using the new local ssd request method
    # see https://github.com/kubernetes/kubernetes/pull/53466#discussion_r146431894
    if [[ -n "${NODE_LOCAL_SSDS_EXT:-}" ]]; then
      unique-uuid-bind-mount "${mountpoint}" "${actual_device}"
    fi
  elif [[ "${format}" == "block" ]]; then
    symdir="${UUID_BLOCK_PREFIX}-${interface}-block"
    safe-block-symlink "${ssd}" "${symdir}"
  else
    # Diagnostics go to stderr like every other error in this function; the
    # function still returns successfully, as before.
    echo "Disk format must be either fs or block, got ${format}" >&2
  fi
}
|
||||
|
||||
# Local ssds, if present, are mounted or symlinked to their appropriate
# locations.
#
# SCSI disks are discovered through /dev/disk/by-id/google-local-ssd-* and NVMe
# disks through /dev/nvme*. For each interface the first N disks found, where N
# is the number of "block" disks requested for that interface in
# NODE_LOCAL_SSDS_EXT, are handed to mount-ext as block devices; every
# remaining disk is handed to mount-ext as a filesystem device.
#
# Globals:
#   localdisknum (read): set by get-local-disk-num.
function ensure-local-ssds() {
  local ssd=""
  local devicenum=""
  local devtype=""
  local i=0

  get-local-disk-num "scsi" "block"
  local scsiblocknum="${localdisknum}"
  for ssd in /dev/disk/by-id/google-local-ssd-*; do
    # When the glob matches nothing it stays literal, so -e is false.
    if [[ -e "${ssd}" ]]; then
      devicenum="${ssd#/dev/disk/by-id/google-local-ssd-}"
      if [[ "${i}" -lt "${scsiblocknum}" ]]; then
        mount-ext "${ssd}" "${devicenum}" "scsi" "block"
      else
        # GKE does not set NODE_LOCAL_SSDS so all non-block devices
        # are assumed to be filesystem devices
        mount-ext "${ssd}" "${devicenum}" "scsi" "fs"
      fi
      i=$((i+1))
    else
      echo "No local SCSI SSD disks found."
    fi
  done

  # The following mounts or symlinks NVMe devices
  get-local-disk-num "nvme" "block"
  local nvmeblocknum="${localdisknum}"
  i=0
  for ssd in /dev/nvme*; do
    if [[ -e "${ssd}" ]]; then
      # This workaround to find if the NVMe device is a disk is required because
      # the existing Google images does not expose NVMe devices in /dev/disk/by-id
      # A failed query is treated as "not a disk" rather than aborting.
      devtype="$(udevadm info --query=property --name="${ssd}" | sed -n 's/^DEVTYPE=//p')" || devtype=""
      if [[ "${devtype}" == "disk" ]]; then
        devicenum="${ssd#/dev/nvme0n}"
        if [[ "${i}" -lt "${nvmeblocknum}" ]]; then
          mount-ext "${ssd}" "${devicenum}" "nvme" "block"
        else
          mount-ext "${ssd}" "${devicenum}" "nvme" "fs"
        fi
        i=$((i+1))
      fi
    else
      echo "No local NVMe SSD disks found."
    fi
  done
}
|
||||
|
@@ -18,6 +18,8 @@
|
||||
|
||||
# Use the config file specified in $KUBE_CONFIG_FILE, or default to
|
||||
# config-default.sh.
|
||||
readonly GCE_MAX_LOCAL_SSD=8
|
||||
|
||||
KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../..
|
||||
source "${KUBE_ROOT}/cluster/gce/${KUBE_CONFIG_FILE-"config-default.sh"}"
|
||||
source "${KUBE_ROOT}/cluster/common.sh"
|
||||
@@ -37,6 +39,11 @@ else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ${NODE_LOCAL_SSDS:-} -ge 1 ]] && [[ ! -z ${NODE_LOCAL_SSDS_EXT:-} ]] ; then
|
||||
echo -e "${color_red}Local SSD: Only one of NODE_LOCAL_SSDS and NODE_LOCAL_SSDS_EXT can be specified at once${color_norm}" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
if [[ "${MASTER_OS_DISTRIBUTION}" == "gci" ]]; then
|
||||
DEFAULT_GCI_PROJECT=google-containers
|
||||
if [[ "${GCI_VERSION}" == "cos"* ]]; then
|
||||
@@ -546,6 +553,29 @@ function get-template-name-from-version() {
|
||||
echo "${NODE_INSTANCE_PREFIX}-template-${1}" | cut -c 1-63 | sed 's/[\.\+]/-/g;s/-*$//g'
|
||||
}
|
||||
|
||||
# Validates one "count,interface,format" group of the NODE_LOCAL_SSDS_EXT
# variable and adds its count to the running total.
#
# Globals:
#   ssdopts (read): when fewer than three arguments are passed, the interface
#     and format are taken from ${ssdopts[1]} and ${ssdopts[2]} of the caller's
#     array (this is how the function is invoked with "${ssdopts}").
#   local_ssd_ext_count (read/written): running total of requested disks.
#   GCE_MAX_LOCAL_SSD (read): upper bound for the running total.
#   color_red, color_norm (read): optional colour escape sequences.
# Arguments:
#   $1 - number of disks in the group
#   $2 - optional interface ("scsi" or "nvme")
#   $3 - optional format ("fs" or "block")
# Exits with status 2 if the group is invalid or the total is out of range.
function validate-node-local-ssds-ext(){
  local count="${1:-}"
  local interface=""
  local format=""

  if [[ "$#" -ge 3 ]]; then
    interface="${2}"
    format="${3}"
  else
    # ":-" keeps a short or unset array from tripping "set -o nounset" before
    # the malformed-input message below can be printed.
    interface="${ssdopts[1]:-}"
    format="${ssdopts[2]:-}"
  fi

  if [[ -z "${count}" || -z "${interface}" || -z "${format}" ]]; then
    echo -e "${color_red:-}Local SSD: NODE_LOCAL_SSDS_EXT is malformed, found ${count:-_},${interface:-_},${format:-_} ${color_norm:-}" >&2
    exit 2
  fi
  # Reject anything that is not a plain number before using it in arithmetic.
  if [[ ! "${count}" =~ ^[0-9]+$ ]]; then
    echo -e "${color_red:-}Local SSD: Number of disks must be a non-negative integer, found: ${count} ${color_norm:-}" >&2
    exit 2
  fi
  if [[ "${interface}" != "scsi" && "${interface}" != "nvme" ]]; then
    echo -e "${color_red:-}Local SSD: Interface must be scsi or nvme, found: ${interface} ${color_norm:-}" >&2
    exit 2
  fi
  if [[ "${format}" != "fs" && "${format}" != "block" ]]; then
    echo -e "${color_red:-}Local SSD: Filesystem type must be fs or block, found: ${format} ${color_norm:-}" >&2
    exit 2
  fi

  # Force base 10 so that a count such as "08" is not parsed as octal.
  local_ssd_ext_count=$(( ${local_ssd_ext_count:-0} + 10#${count} ))
  if [[ "${local_ssd_ext_count}" -gt "${GCE_MAX_LOCAL_SSD}" || "${local_ssd_ext_count}" -lt 1 ]]; then
    echo -e "${color_red:-}Local SSD: Total number of local ssds must range from 1 to ${GCE_MAX_LOCAL_SSD}, found: ${local_ssd_ext_count} ${color_norm:-}" >&2
    exit 2
  fi
}
|
||||
|
||||
# Robustly try to create an instance template.
|
||||
# $1: The name of the instance template.
|
||||
# $2: The scopes flag.
|
||||
@@ -587,6 +617,19 @@ function create-node-template() {
|
||||
fi
|
||||
|
||||
local local_ssds=""
|
||||
local_ssd_ext_count=0
|
||||
if [[ ! -z ${NODE_LOCAL_SSDS_EXT:-} ]]; then
|
||||
IFS=";" read -r -a ssdgroups <<< "${NODE_LOCAL_SSDS_EXT:-}"
|
||||
for ssdgroup in "${ssdgroups[@]}"
|
||||
do
|
||||
IFS="," read -r -a ssdopts <<< "${ssdgroup}"
|
||||
validate-node-local-ssds-ext "${ssdopts}"
|
||||
for i in $(seq ${ssdopts[0]}); do
|
||||
local_ssds="$local_ssds--local-ssd=interface=${ssdopts[1]} "
|
||||
done
|
||||
done
|
||||
fi
|
||||
|
||||
if [[ ! -z ${NODE_LOCAL_SSDS+x} ]]; then
|
||||
# The NODE_LOCAL_SSDS check below fixes issue #49171
|
||||
# Some versions of seq will count down from 1 if "seq 0" is specified
|
||||
@@ -596,6 +639,7 @@ function create-node-template() {
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
local network=$(make-gcloud-network-argument \
|
||||
"${NETWORK_PROJECT}" \
|
||||
|
Reference in New Issue
Block a user