Switch 'ContainerCheckpoint' from Alpha to Beta

* Forensic Container Checkpointing as described in KEP 2008 moves from
   Alpha to Beta. This is the corresponding code change.

 * Adapt e2e test to handle
   '(rpc error: code = Unimplemented desc = unknown method CheckpointContainer'
   and
   '(rpc error: code = Unimplemented desc = method CheckpointContainer not implemented)'
   and
   '(rpc error: code = Unknown desc = checkpoint/restore support not available)'
   The first error message is returned if the CRI implementation does
   not implement the CRI RPC (too old) and the second is returned
   if the CRI implementation explicitly does not support the feature.
   The third error message can be seen if the container engine
   explicitly disabled the checkpoint/restore support.

 * As described in the corresponding KEP 2008 explicitly test for
   disabled functionality.

 * Extended test to look for the checkpoint kubelet metric.

 * Extended test to look for the CRI error metric.

 *  Add separate sub-resource permission to control permissions on
    the checkpoint kubelet API endpoint

Signed-off-by: Adrian Reber <areber@redhat.com>
This commit is contained in:
Adrian Reber 2024-02-09 11:02:25 +00:00
parent f99638d315
commit da8ffcd1dc
No known key found for this signature in database
GPG Key ID: 82C9378ED3C4906A
6 changed files with 135 additions and 13 deletions

View File

@ -182,6 +182,7 @@ const (
// owner: @adrianreber
// kep: https://kep.k8s.io/2008
// alpha: v1.25
// beta: v1.30
//
// Enables container Checkpoint support in the kubelet
ContainerCheckpoint featuregate.Feature = "ContainerCheckpoint"
@ -966,7 +967,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
CloudControllerManagerWebhook: {Default: false, PreRelease: featuregate.Alpha},
ContainerCheckpoint: {Default: false, PreRelease: featuregate.Alpha},
ContainerCheckpoint: {Default: true, PreRelease: featuregate.Beta},
ConsistentHTTPGetHandlers: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.31

View File

@ -105,6 +105,8 @@ func (n nodeAuthorizerAttributesGetter) GetRequestAttributes(u user.Info, r *htt
case isSubpath(requestPath, logsPath):
// "log" to match other log subresources (pods/log, etc)
attrs.Subresource = "log"
case isSubpath(requestPath, checkpointPath):
attrs.Subresource = "checkpoint"
}
klog.V(5).InfoS("Node request attributes", "user", attrs.GetUser().GetName(), "verb", attrs.GetVerb(), "resource", attrs.GetResource(), "subresource", attrs.GetSubresource())

View File

@ -110,7 +110,7 @@ func AuthzTestCases() []AuthzTestCase {
testPaths := map[string]string{
"/attach/{podNamespace}/{podID}/{containerName}": "proxy",
"/attach/{podNamespace}/{podID}/{uid}/{containerName}": "proxy",
"/checkpoint/{podNamespace}/{podID}/{containerName}": "proxy",
"/checkpoint/{podNamespace}/{podID}/{containerName}": "checkpoint",
"/configz": "proxy",
"/containerLogs/{podNamespace}/{podID}/{containerName}": "proxy",
"/debug/flags/v": "proxy",

View File

@ -98,6 +98,7 @@ const (
proberMetricsPath = "/metrics/probes"
statsPath = "/stats/"
logsPath = "/logs/"
checkpointPath = "/checkpoint/"
pprofBasePath = "/debug/pprof/"
debugFlagPath = "/debug/flags/v"
)
@ -441,7 +442,7 @@ func (s *Server) InstallDefaultHandlers() {
if utilfeature.DefaultFeatureGate.Enabled(features.ContainerCheckpoint) {
s.addMetricsBucketMatcher("checkpoint")
ws = &restful.WebService{}
ws.Path("/checkpoint").Produces(restful.MIME_JSON)
ws.Path(checkpointPath).Produces(restful.MIME_JSON)
ws.Route(ws.POST("/{podNamespace}/{podID}/{containerName}").
To(s.checkpoint).
Operation("checkpoint"))

View File

@ -858,18 +858,24 @@ func TestContainerLogsWithInvalidTail(t *testing.T) {
}
func TestCheckpointContainer(t *testing.T) {
// Enable features.ContainerCheckpoint during test
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ContainerCheckpoint, true)()
fw := newServerTest()
defer fw.testHTTPServer.Close()
podNamespace := "other"
podName := "foo"
expectedContainerName := "baz"
setupTest := func(featureGate bool) *serverTestFramework {
// Enable features.ContainerCheckpoint during test
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ContainerCheckpoint, featureGate)()
fw := newServerTest()
// GetPodByName() should always fail
fw.fakeKubelet.podByNameFunc = func(namespace, name string) (*v1.Pod, bool) {
return nil, false
}
return fw
}
fw := setupTest(true)
defer fw.testHTTPServer.Close()
t.Run("wrong pod namespace", func(t *testing.T) {
resp, err := http.Post(fw.testHTTPServer.URL+"/checkpoint/"+podNamespace+"/"+podName+"/"+expectedContainerName, "", nil)
if err != nil {
@ -927,6 +933,19 @@ func TestCheckpointContainer(t *testing.T) {
}
assert.Equal(t, resp.StatusCode, 200)
})
// Now test for 404 if checkpointing support is explicitly disabled.
fw.testHTTPServer.Close()
fw = setupTest(false)
defer fw.testHTTPServer.Close()
setPodByNameFunc(fw, podNamespace, podName, expectedContainerName)
t.Run("checkpointing fails because disabled", func(t *testing.T) {
resp, err := http.Post(fw.testHTTPServer.URL+"/checkpoint/"+podNamespace+"/"+podName+"/"+expectedContainerName, "", nil)
if err != nil {
t.Errorf("Got error POSTing: %v", err)
}
assert.Equal(t, 404, resp.StatusCode)
})
}
func makeReq(t *testing.T, method, url, clientProtocol string) *http.Request {

View File

@ -34,11 +34,14 @@ import (
clientset "k8s.io/client-go/kubernetes"
restclient "k8s.io/client-go/rest"
"k8s.io/kubernetes/test/e2e/framework"
e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
"k8s.io/kubernetes/test/e2e/nodefeature"
testutils "k8s.io/kubernetes/test/utils"
imageutils "k8s.io/kubernetes/test/utils/image"
admissionapi "k8s.io/pod-security-admission/api"
"github.com/onsi/gomega"
)
const (
@ -75,6 +78,58 @@ func proxyPostRequest(ctx context.Context, c clientset.Interface, node, endpoint
}
}
// getCheckpointContainerMetric returns the value of the
// "kubelet_runtime_operations_total" sample whose "operation_type" label is
// "checkpoint_container", read from the kubelet metrics fetched for
// pod.Spec.NodeName.
//
// It returns 0 and a nil error when no such sample is present, and 0 together
// with the underlying error when the kubelet metrics cannot be fetched.
func getCheckpointContainerMetric(ctx context.Context, f *framework.Framework, pod *v1.Pod) (int, error) {
	framework.Logf("Getting 'checkpoint_container' metrics from %q", pod.Spec.NodeName)
	return checkpointContainerSampleValue(ctx, f, pod, "runtime_operations_total", "kubelet_runtime_operations_total")
}

// getCheckpointContainerErrorMetric returns the value of the
// "kubelet_runtime_operations_errors_total" sample whose "operation_type"
// label is "checkpoint_container", read from the kubelet metrics fetched for
// pod.Spec.NodeName.
//
// It returns 0 and a nil error when no such sample is present, and 0 together
// with the underlying error when the kubelet metrics cannot be fetched.
func getCheckpointContainerErrorMetric(ctx context.Context, f *framework.Framework, pod *v1.Pod) (int, error) {
	framework.Logf("Getting 'checkpoint_container' error metrics from %q", pod.Spec.NodeName)
	return checkpointContainerSampleValue(ctx, f, pod, "runtime_operations_errors_total", "kubelet_runtime_operations_errors_total")
}

// checkpointContainerSampleValue holds the lookup shared by the two getters
// above. It fetches the kubelet metrics for pod.Spec.NodeName, selects the
// metric family stored under familyKey and returns the value (truncated to
// int) of the first sample whose "__name__" label equals sampleName and whose
// "operation_type" label equals "checkpoint_container".
func checkpointContainerSampleValue(ctx context.Context, f *framework.Framework, pod *v1.Pod, familyKey, sampleName string) (int, error) {
	ms, err := e2emetrics.GetKubeletMetrics(
		ctx,
		f.ClientSet,
		pod.Spec.NodeName,
	)
	if err != nil {
		return 0, err
	}

	samples, ok := ms[familyKey]
	if !ok {
		// If the metric was not found it was probably not written to, yet.
		return 0, nil
	}

	for _, item := range samples {
		// The label values are converted to string so they can be compared
		// with the sampleName parameter regardless of the label value type.
		if string(item.Metric["__name__"]) == sampleName && item.Metric["operation_type"] == "checkpoint_container" {
			return int(item.Value), nil
		}
	}

	// If the metric was not found it was probably not written to, yet.
	return 0, nil
}
var _ = SIGDescribe("Checkpoint Container", nodefeature.CheckpointContainer, func() {
f := framework.NewDefaultFramework("checkpoint-container-test")
f.NamespacePodSecurityLevel = admissionapi.LevelBaseline
@ -82,7 +137,10 @@ var _ = SIGDescribe("Checkpoint Container", nodefeature.CheckpointContainer, fun
ginkgo.By("creating a target pod")
podClient := e2epod.NewPodClient(f)
pod := podClient.CreateSync(ctx, &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "checkpoint-container-pod"},
ObjectMeta: metav1.ObjectMeta{
Name: "checkpoint-container-pod",
Namespace: f.Namespace.Name,
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
@ -108,6 +166,15 @@ var _ = SIGDescribe("Checkpoint Container", nodefeature.CheckpointContainer, fun
framework.Failf("pod %q should be ready", p.Name)
}
// No checkpoint operation should have been logged
checkpointContainerMetric, err := getCheckpointContainerMetric(ctx, f, pod)
framework.ExpectNoError(err)
gomega.Expect(checkpointContainerMetric).To(gomega.Equal(0))
// No error should have been logged
checkpointContainerErrorMetric, err := getCheckpointContainerErrorMetric(ctx, f, pod)
framework.ExpectNoError(err)
gomega.Expect(checkpointContainerErrorMetric).To(gomega.Equal(0))
framework.Logf(
"About to checkpoint container %q on %q",
pod.Spec.Containers[0].Name,
@ -144,6 +211,12 @@ var _ = SIGDescribe("Checkpoint Container", nodefeature.CheckpointContainer, fun
// If the container engine has not implemented the Checkpoint CRI API
// we will get 500 and a message with
// '(rpc error: code = Unimplemented desc = unknown method CheckpointContainer'
// or
// '(rpc error: code = Unimplemented desc = method CheckpointContainer not implemented)'
// if the container engine returns that it explicitly has disabled support for it.
// or
// '(rpc error: code = Unknown desc = checkpoint/restore support not available)'
// if the container engine explicitly disabled the checkpoint/restore support
if (int(statusError.ErrStatus.Code)) == http.StatusInternalServerError {
if strings.Contains(
statusError.ErrStatus.Message,
@ -152,8 +225,26 @@ var _ = SIGDescribe("Checkpoint Container", nodefeature.CheckpointContainer, fun
ginkgo.Skip("Container engine does not implement 'CheckpointContainer'")
return
}
if strings.Contains(
statusError.ErrStatus.Message,
"(rpc error: code = Unimplemented desc = method CheckpointContainer not implemented)",
) {
ginkgo.Skip("Container engine does not implement 'CheckpointContainer'")
return
}
framework.Failf("Unexpected status code (%d) during 'CheckpointContainer'", statusError.ErrStatus.Code)
if strings.Contains(
statusError.ErrStatus.Message,
"(rpc error: code = Unknown desc = checkpoint/restore support not available)",
) {
ginkgo.Skip("Container engine does not implement 'CheckpointContainer'")
return
}
}
framework.Failf(
"Unexpected status code (%d) during 'CheckpointContainer': %q",
statusError.ErrStatus.Code,
statusError.ErrStatus.Message,
)
}
framework.ExpectNoError(err)
@ -205,5 +296,13 @@ var _ = SIGDescribe("Checkpoint Container", nodefeature.CheckpointContainer, fun
// cleanup checkpoint archive
os.RemoveAll(item)
}
// Exactly one checkpoint operation should have happened
checkpointContainerMetric, err = getCheckpointContainerMetric(ctx, f, pod)
framework.ExpectNoError(err)
gomega.Expect(checkpointContainerMetric).To(gomega.Equal(1))
// No error should have been logged
checkpointContainerErrorMetric, err = getCheckpointContainerErrorMetric(ctx, f, pod)
framework.ExpectNoError(err)
gomega.Expect(checkpointContainerErrorMetric).To(gomega.Equal(0))
})
})