From 7a7519a780fd8407251c4759277d50546fe6b37e Mon Sep 17 00:00:00 2001 From: Danny Canter Date: Fri, 7 Apr 2023 18:59:03 -0700 Subject: [PATCH] CRI Sbserver: Make PodSandboxStatus friendlier to shim crashes Currently if you're using the shim-mode sandbox server support, if your shim that's hosting the Sandbox API dies for any reason that wasn't intentional (segfault, oom etc.) PodSandboxStatus is kind of wedged. We can use the fact that if we didn't go through the usual k8s flow of Stop->Remove and we still have an entry in our sandbox store, us not having a shim mapping anymore means this was likely unintentional. Signed-off-by: Danny Canter --- pkg/cri/sbserver/sandbox_status.go | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/pkg/cri/sbserver/sandbox_status.go b/pkg/cri/sbserver/sandbox_status.go index f4a02f2ab..0d547b4b6 100644 --- a/pkg/cri/sbserver/sandbox_status.go +++ b/pkg/cri/sbserver/sandbox_status.go @@ -21,6 +21,7 @@ import ( "fmt" "time" + "github.com/containerd/containerd/errdefs" sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" ) @@ -42,12 +43,32 @@ func (c *criService) PodSandboxStatus(ctx context.Context, r *runtime.PodSandbox return nil, fmt.Errorf("failed to get sandbox controller: %w", err) } + var ( + createdAt time.Time + state string + info map[string]string + ) cstatus, err := controller.Status(ctx, sandbox.ID, r.GetVerbose()) if err != nil { - return nil, fmt.Errorf("failed to query controller status: %w", err) + // If the shim died unexpectedly (segfault etc.) let's set the state as + // NOTREADY and not just error out to make k8s and clients like crictl + // happy. If we get back ErrNotFound from controller.Status above while + // we're using the shim-mode controller, this is a decent indicator it + // exited unexpectedly. We can use the fact that we successfully retrieved + // the sandbox object from the store above to tell that this is true, otherwise + // if we followed the normal k8s convention of StopPodSandbox -> RemovePodSandbox, + // we wouldn't have that object in the store anymore. + if !errdefs.IsNotFound(err) { + return nil, fmt.Errorf("failed to query controller status: %w", err) + } + state = runtime.PodSandboxState_SANDBOX_NOTREADY.String() + } else { + state = cstatus.State + createdAt = cstatus.CreatedAt + info = cstatus.Info } - status := toCRISandboxStatus(sandbox.Metadata, cstatus.State, cstatus.CreatedAt, ip, additionalIPs) + status := toCRISandboxStatus(sandbox.Metadata, state, createdAt, ip, additionalIPs) if status.GetCreatedAt() == 0 { // CRI doesn't allow CreatedAt == 0. sandboxInfo, err := c.client.SandboxStore().Get(ctx, sandbox.ID) @@ -59,7 +80,7 @@ func (c *criService) PodSandboxStatus(ctx context.Context, r *runtime.PodSandbox return &runtime.PodSandboxStatusResponse{ Status: status, - Info: cstatus.Info, + Info: info, }, nil }