test: Add e2e to verify static pod termination

Add a node e2e test to verify that if a static pod is terminated while
the container runtime or CRI is returning errors, the pod is still
eventually terminated successfully.

This test serves as a regression test for k8s.io/issue/113145, which
fixes an issue where force-deleted pods may not be terminated if the
container runtime fails during `syncTerminatingPod`.

To test this behavior, start a static pod, stop the container runtime,
force delete the pod, and then start the container runtime again. The
static pod is expected to eventually terminate successfully, as in the
sketch below.
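
For illustration, the test body could look roughly like the following
sketch. `stopContainerRuntime` and `startContainerRuntime` are the
helpers introduced by this commit; `createStaticPod` and
`checkMirrorPodDisappear` stand in for the mirror pod utilities in
test/e2e_node and are assumptions here, not necessarily the exact
helpers the test uses:

    ginkgo.It("should terminate a force-deleted static pod once the runtime recovers", func(ctx context.Context) {
        ginkgo.By("creating a static pod")
        createStaticPod(podPath, staticPodName, ns, imageutils.GetPauseImageName(), v1.RestartPolicyAlways)

        ginkgo.By("stopping the container runtime")
        framework.ExpectNoError(stopContainerRuntime())

        ginkgo.By("force deleting the mirror pod while the runtime is down")
        err := f.ClientSet.CoreV1().Pods(ns).Delete(ctx, mirrorPodName, *metav1.NewDeleteOptions(0))
        framework.ExpectNoError(err)

        ginkgo.By("starting the container runtime again")
        framework.ExpectNoError(startContainerRuntime())

        ginkgo.By("waiting for the mirror pod to disappear")
        gomega.Eventually(ctx, func() error {
            return checkMirrorPodDisappear(ctx, f.ClientSet, mirrorPodName, ns)
        }, 2*time.Minute, time.Second).Should(gomega.Succeed())
    })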

To start and stop the container runtime, we need the container
runtime's systemd unit name. Introduce a util function
`findContainerRuntimeServiceName`, which finds the unit name by getting
the pid of the container runtime from the existing
`ContainerRuntimeProcessName` flag passed into node e2e, then using the
systemd dbus `GetUnitNameByPID` function to map that pid to a unit
name. Using the unit name, introduce helper functions to start and stop
the container runtime; the core of the lookup is sketched below.
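
Stripped of test plumbing, the lookup is just the following (a sketch,
assuming a go-systemd v22 dbus connection; the full version appears in
the diff below):

    // conn is a systemd dbus connection from dbus.NewWithContext(ctx).
    unitName, err := conn.GetUnitNameByPID(ctx, uint32(containerRuntimePid))
    // unitName is then e.g. "containerd.service" or "crio.service".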

Signed-off-by: David Porter <david@porter.me>

@@ -25,17 +25,21 @@ import (
 	"io"
 	"net"
 	"net/http"
+	"os"
 	"os/exec"
 	"regexp"
 	"strconv"
 	"strings"
 	"time"
 
+	"k8s.io/kubernetes/pkg/util/procfs"
+
 	oteltrace "go.opentelemetry.io/otel/trace"
 
 	v1 "k8s.io/api/core/v1"
 	apiequality "k8s.io/apimachinery/pkg/api/equality"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apimachinery/pkg/util/sets"
 	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	clientset "k8s.io/client-go/kubernetes"
@@ -55,6 +59,7 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/pkg/kubelet/util"
 
+	"github.com/coreos/go-systemd/v22/dbus"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet"
 	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
@@ -84,12 +89,14 @@ const (
 var kubeletHealthCheckURL = fmt.Sprintf("http://127.0.0.1:%d/healthz", ports.KubeletHealthzPort)
 
+// containerRuntimeUnitName caches the container runtime's systemd unit name,
+// resolved lazily by findContainerRuntimeServiceName.
+var containerRuntimeUnitName = ""
+
 func getNodeSummary(ctx context.Context) (*stats.Summary, error) {
 	kubeletConfig, err := getCurrentKubeletConfig(ctx)
 	if err != nil {
 		return nil, fmt.Errorf("failed to get current kubelet config")
 	}
-	req, err := http.NewRequest("GET", fmt.Sprintf("http://%s/stats/summary", net.JoinHostPort(kubeletConfig.Address, strconv.Itoa(int(kubeletConfig.ReadOnlyPort)))), nil)
+	req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("http://%s/stats/summary", net.JoinHostPort(kubeletConfig.Address, strconv.Itoa(int(kubeletConfig.ReadOnlyPort)))), nil)
 	if err != nil {
 		return nil, fmt.Errorf("failed to build http request: %w", err)
 	}
@@ -340,6 +347,71 @@ func findKubeletServiceName(running bool) string {
 	return kubeletServiceName
 }
 
+// findContainerRuntimeServiceName resolves the container runtime's systemd
+// unit name by looking up the runtime's pid and asking systemd (over dbus)
+// which unit owns that pid.
+func findContainerRuntimeServiceName() (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	conn, err := dbus.NewWithContext(ctx)
+	framework.ExpectNoError(err, "Failed to setup dbus connection")
+	defer conn.Close()
+
+	runtimePids, err := getPidsForProcess(framework.TestContext.ContainerRuntimeProcessName, framework.TestContext.ContainerRuntimePidFile)
+	framework.ExpectNoError(err, "failed to get list of container runtime pids")
+	framework.ExpectEqual(len(runtimePids), 1, "Unexpected number of container runtime pids. Expected 1 but got %v", len(runtimePids))
+
+	containerRuntimePid := runtimePids[0]
+
+	unitName, err := conn.GetUnitNameByPID(ctx, uint32(containerRuntimePid))
+	framework.ExpectNoError(err, "Failed to get container runtime unit name")
+
+	return unitName, nil
+}
+
+type containerRuntimeUnitOp int
+
+const (
+	startContainerRuntimeUnitOp containerRuntimeUnitOp = iota
+	stopContainerRuntimeUnitOp
+)
+
+// performContainerRuntimeUnitOp starts or stops the container runtime's
+// systemd unit and blocks until the resulting systemd job completes.
+func performContainerRuntimeUnitOp(op containerRuntimeUnitOp) error {
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	conn, err := dbus.NewWithContext(ctx)
+	framework.ExpectNoError(err, "Failed to setup dbus connection")
+	defer conn.Close()
+
+	if containerRuntimeUnitName == "" {
+		containerRuntimeUnitName, err = findContainerRuntimeServiceName()
+		framework.ExpectNoError(err, "Failed to find container runtime name")
+	}
+
+	reschan := make(chan string)
+
+	// "replace" mode replaces any already-queued jobs for the unit; the
+	// result of the systemd job is delivered on reschan.
+	switch op {
+	case startContainerRuntimeUnitOp:
+		_, err = conn.StartUnitContext(ctx, containerRuntimeUnitName, "replace", reschan)
+	case stopContainerRuntimeUnitOp:
+		_, err = conn.StopUnitContext(ctx, containerRuntimeUnitName, "replace", reschan)
+	default:
+		framework.Failf("Unexpected container runtime op: %v", op)
+	}
+	framework.ExpectNoError(err, "Failed to enqueue systemd unit job")
+
+	job := <-reschan
+	framework.ExpectEqual(job, "done", "Expected job to complete with done")
+
+	return nil
+}
+
+func stopContainerRuntime() error {
+	return performContainerRuntimeUnitOp(stopContainerRuntimeUnitOp)
+}
+
+func startContainerRuntime() error {
+	return performContainerRuntimeUnitOp(startContainerRuntimeUnitOp)
+}
+
 // restartKubelet restarts the current kubelet service.
 // the "current" kubelet service is the instance managed by the current e2e_node test run.
 // If `running` is true, restarts only if the current kubelet is actually running. In some cases,
@@ -465,3 +537,35 @@ func waitForAllContainerRemoval(ctx context.Context, podName, podNS string) {
 		return nil
 	}, 2*time.Minute, 1*time.Second).Should(gomega.Succeed())
 }
+
+// getPidsForProcess returns the pid(s) of the named process, preferring the
+// pid file when one is provided and falling back to a procfs lookup.
+func getPidsForProcess(name, pidFile string) ([]int, error) {
+	if len(pidFile) > 0 {
+		pid, err := getPidFromPidFile(pidFile)
+		if err == nil {
+			return []int{pid}, nil
+		}
+		// log the error and fall back to pidof
+		runtime.HandleError(err)
+	}
+	return procfs.PidOf(name)
+}
+
+func getPidFromPidFile(pidFile string) (int, error) {
+	file, err := os.Open(pidFile)
+	if err != nil {
+		return 0, fmt.Errorf("error opening pid file %s: %v", pidFile, err)
+	}
+	defer file.Close()
+
+	data, err := io.ReadAll(file)
+	if err != nil {
+		return 0, fmt.Errorf("error reading pid file %s: %v", pidFile, err)
+	}
+
+	// Trim whitespace, since pid files commonly end with a newline.
+	pid, err := strconv.Atoi(strings.TrimSpace(string(data)))
+	if err != nil {
+		return 0, fmt.Errorf("error parsing %s as a number: %v", string(data), err)
+	}
+
+	return pid, nil
+}