diff --git a/integration/sandbox_run_rollback_test.go b/integration/sandbox_run_rollback_test.go index 9a152e164..d7b9b8e91 100644 --- a/integration/sandbox_run_rollback_test.go +++ b/integration/sandbox_run_rollback_test.go @@ -20,6 +20,7 @@ package integration import ( + "context" "encoding/json" "fmt" "os" @@ -36,6 +37,7 @@ import ( "github.com/stretchr/testify/require" criapiv1 "k8s.io/cri-api/pkg/apis/runtime/v1" + "github.com/containerd/containerd/pkg/cri/sbserver" "github.com/containerd/containerd/pkg/cri/store/sandbox" "github.com/containerd/containerd/pkg/failpoint" "github.com/containerd/typeurl" @@ -103,9 +105,6 @@ func TestRunPodSandboxWithShimDeleteFailure(t *testing.T) { if runtime.GOOS != "linux" { t.Skip() } - if os.Getenv("ENABLE_CRI_SANDBOXES") != "" { - t.Skip() - } testCase := func(restart bool) func(*testing.T) { return func(t *testing.T) { @@ -177,9 +176,6 @@ func TestRunPodSandboxWithShimStartAndTeardownCNIFailure(t *testing.T) { if runtime.GOOS != "linux" { t.Skip() } - if os.Getenv("ENABLE_CRI_SANDBOXES") != "" { - t.Skip() - } testCase := func(restart bool) func(*testing.T) { return func(t *testing.T) { @@ -243,9 +239,6 @@ func TestRunPodSandboxAndTeardownCNISlow(t *testing.T) { if runtime.GOOS != "linux" { t.Skip() } - if os.Getenv("ENABLE_CRI_SANDBOXES") != "" { - t.Skip() - } t.Log("Init PodSandboxConfig with specific key") sbName := t.Name() @@ -301,31 +294,63 @@ func TestRunPodSandboxAndTeardownCNISlow(t *testing.T) { assert.Equal(t, sb.Metadata.Uid, sbConfig.Metadata.Uid) assert.Equal(t, sb.Metadata.Attempt, sbConfig.Metadata.Attempt) - t.Log("Get sandbox info") - _, info, err := SandboxInfo(sb.Id) - require.NoError(t, err) - require.False(t, info.NetNSClosed) - - var netNS string - for _, n := range info.RuntimeSpec.Linux.Namespaces { - if n.Type == runtimespec.NetworkNamespace { - netNS = n.Path + switch os.Getenv("ENABLE_CRI_SANDBOXES") { + case "": + // non-sbserver + t.Log("Get sandbox info (non-sbserver)") + _, info, err := SandboxInfo(sb.Id) + require.NoError(t, err) + require.False(t, info.NetNSClosed) + var netNS string + for _, n := range info.RuntimeSpec.Linux.Namespaces { + if n.Type == runtimespec.NetworkNamespace { + netNS = n.Path + } } - } - assert.NotEmpty(t, netNS, "network namespace should be set") + assert.NotEmpty(t, netNS, "network namespace should be set") - t.Log("Get sandbox container") - c, err := GetContainer(sb.Id) - require.NoError(t, err) - any, ok := c.Extensions["io.cri-containerd.sandbox.metadata"] - require.True(t, ok, "sandbox metadata should exist in extension") - i, err := typeurl.UnmarshalAny(any) - require.NoError(t, err) - require.IsType(t, &sandbox.Metadata{}, i) - metadata, ok := i.(*sandbox.Metadata) - require.True(t, ok) - assert.NotEmpty(t, metadata.NetNSPath) - assert.Equal(t, netNS, metadata.NetNSPath, "network namespace path should be the same in runtime spec and sandbox metadata") + t.Log("Get sandbox container") + c, err := GetContainer(sb.Id) + require.NoError(t, err) + any, ok := c.Extensions["io.cri-containerd.sandbox.metadata"] + require.True(t, ok, "sandbox metadata should exist in extension") + i, err := typeurl.UnmarshalAny(any) + require.NoError(t, err) + require.IsType(t, &sandbox.Metadata{}, i) + metadata, ok := i.(*sandbox.Metadata) + require.True(t, ok) + assert.Equal(t, netNS, metadata.NetNSPath, "network namespace path should be the same in runtime spec and sandbox metadata") + default: + // sbserver + t.Log("Get sandbox info (sbserver)") + _, info, err := sbserverSandboxInfo(sb.Id) + require.NoError(t, err) + require.False(t, info.NetNSClosed) + + assert.NotEmpty(t, info.Metadata.NetNSPath, "network namespace should be set") + } + +} + +// sbserverSandboxInfo gets sandbox info. +func sbserverSandboxInfo(id string) (*criapiv1.PodSandboxStatus, *sbserver.SandboxInfo, error) { + client, err := RawRuntimeClient() + if err != nil { + return nil, nil, fmt.Errorf("failed to get raw runtime client: %w", err) + } + resp, err := client.PodSandboxStatus(context.Background(), &criapiv1.PodSandboxStatusRequest{ + PodSandboxId: id, + Verbose: true, + }) + if err != nil { + return nil, nil, fmt.Errorf("failed to get sandbox status: %w", err) + } + status := resp.GetStatus() + var info sbserver.SandboxInfo + if err := json.Unmarshal([]byte(resp.GetInfo()["info"]), &info); err != nil { + return nil, nil, fmt.Errorf("failed to unmarshal sandbox info: %w", err) + } + return status, &info, nil } func ensureCNIAddRunning(t *testing.T, sbName string) error { diff --git a/pkg/cri/sbserver/podsandbox/helpers.go b/pkg/cri/sbserver/podsandbox/helpers.go index 03202e017..d3f90888c 100644 --- a/pkg/cri/sbserver/podsandbox/helpers.go +++ b/pkg/cri/sbserver/podsandbox/helpers.go @@ -63,6 +63,8 @@ const ( sandboxMetadataExtension = criContainerdPrefix + ".sandbox.metadata" // runtimeRunhcsV1 is the runtime type for runhcs. runtimeRunhcsV1 = "io.containerd.runhcs.v1" + // MetadataKey is the key used for storing metadata in the sandbox extensions + MetadataKey = "metadata" ) const ( diff --git a/pkg/cri/sbserver/podsandbox/sandbox_delete.go b/pkg/cri/sbserver/podsandbox/sandbox_delete.go index b40c768ca..73b67e58f 100644 --- a/pkg/cri/sbserver/podsandbox/sandbox_delete.go +++ b/pkg/cri/sbserver/podsandbox/sandbox_delete.go @@ -49,11 +49,13 @@ func (c *Controller) Delete(ctx context.Context, sandboxID string) (*api.Control } // Delete sandbox container. - if err := sandbox.Container.Delete(ctx, containerd.WithSnapshotCleanup); err != nil { - if !errdefs.IsNotFound(err) { - return nil, fmt.Errorf("failed to delete sandbox container %q: %w", sandboxID, err) + if sandbox.Container != nil { + if err := sandbox.Container.Delete(ctx, containerd.WithSnapshotCleanup); err != nil { + if !errdefs.IsNotFound(err) { + return nil, fmt.Errorf("failed to delete sandbox container %q: %w", sandboxID, err) + } + log.G(ctx).Tracef("Sandbox controller Delete called for sandbox container %q that does not exist", sandboxID) } - log.G(ctx).Tracef("Sandbox controller Delete called for sandbox container %q that does not exist", sandboxID) } return &api.ControllerDeleteResponse{}, nil diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run.go b/pkg/cri/sbserver/podsandbox/sandbox_run.go index 3f372a3b2..d5aef016f 100644 --- a/pkg/cri/sbserver/podsandbox/sandbox_run.go +++ b/pkg/cri/sbserver/podsandbox/sandbox_run.go @@ -21,6 +21,13 @@ import ( "errors" "fmt" + "github.com/containerd/nri" + v1 "github.com/containerd/nri/types/v1" + "github.com/containerd/typeurl" + "github.com/davecgh/go-spew/spew" + "github.com/opencontainers/selinux/go-selinux" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + "github.com/containerd/containerd" api "github.com/containerd/containerd/api/services/sandbox/v1" containerdio "github.com/containerd/containerd/cio" @@ -33,12 +40,6 @@ import ( ctrdutil "github.com/containerd/containerd/pkg/cri/util" "github.com/containerd/containerd/protobuf" "github.com/containerd/containerd/snapshots" - "github.com/containerd/nri" - v1 "github.com/containerd/nri/types/v1" - "github.com/containerd/typeurl" - "github.com/davecgh/go-spew/spew" - "github.com/opencontainers/selinux/go-selinux" - runtime "k8s.io/cri-api/pkg/apis/runtime/v1" ) func init() { @@ -46,14 +47,26 @@ func init() { "github.com/containerd/cri/pkg/store/sandbox", "Metadata") } -func (c *Controller) Start(ctx context.Context, id string) (_ *api.ControllerStartResponse, retErr error) { +// Start creates resources required for the sandbox and starts the sandbox. If an error occurs, Start attempts to tear +// down the created resources. If an error occurs while tearing down resources, a zero-valued response is returned +// alongside the error. If the teardown was successful, a nil response is returned with the error. +// TODO(samuelkarp) Determine whether this error indication is reasonable to retain once controller.Delete is implemented. +func (c *Controller) Start(ctx context.Context, id string) (resp *api.ControllerStartResponse, retErr error) { + var cleanupErr error + defer func() { + if retErr != nil && cleanupErr != nil { + log.G(ctx).WithField("id", id).WithError(cleanupErr).Errorf("failed to fully teardown sandbox resources after earlier error: %s", retErr) + resp = &api.ControllerStartResponse{} + } + }() + sandboxInfo, err := c.client.SandboxStore().Get(ctx, id) if err != nil { return nil, fmt.Errorf("unable to find sandbox with id %q: %w", id, err) } var metadata sandboxstore.Metadata - if err := sandboxInfo.GetExtension("metadata", &metadata); err != nil { + if err := sandboxInfo.GetExtension(MetadataKey, &metadata); err != nil { return nil, fmt.Errorf("failed to get sandbox %q metadata: %w", id, err) } @@ -137,11 +150,11 @@ func (c *Controller) Start(ctx context.Context, id string) (_ *api.ControllerSta return nil, fmt.Errorf("failed to create containerd container: %w", err) } defer func() { - if retErr != nil { + if retErr != nil && cleanupErr == nil { deferCtx, deferCancel := ctrdutil.DeferContext() defer deferCancel() - if err := container.Delete(deferCtx, containerd.WithSnapshotCleanup); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to delete containerd container %q", id) + if cleanupErr = container.Delete(deferCtx, containerd.WithSnapshotCleanup); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to delete containerd container %q", id) } } }() @@ -153,10 +166,10 @@ func (c *Controller) Start(ctx context.Context, id string) (_ *api.ControllerSta sandboxRootDir, err) } defer func() { - if retErr != nil { + if retErr != nil && cleanupErr == nil { // Cleanup the sandbox root directory. - if err := c.os.RemoveAll(sandboxRootDir); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to remove sandbox root directory %q", + if cleanupErr = c.os.RemoveAll(sandboxRootDir); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove sandbox root directory %q", sandboxRootDir) } } @@ -168,10 +181,10 @@ func (c *Controller) Start(ctx context.Context, id string) (_ *api.ControllerSta volatileSandboxRootDir, err) } defer func() { - if retErr != nil { + if retErr != nil && cleanupErr == nil { // Cleanup the volatile sandbox root directory. - if err := c.os.RemoveAll(volatileSandboxRootDir); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to remove volatile sandbox root directory %q", + if cleanupErr = c.os.RemoveAll(volatileSandboxRootDir); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove volatile sandbox root directory %q", volatileSandboxRootDir) } } @@ -182,9 +195,9 @@ func (c *Controller) Start(ctx context.Context, id string) (_ *api.ControllerSta return nil, fmt.Errorf("failed to setup sandbox files: %w", err) } defer func() { - if retErr != nil { - if err = c.cleanupSandboxFiles(id, config); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to cleanup sandbox files in %q", + if retErr != nil && cleanupErr == nil { + if cleanupErr = c.cleanupSandboxFiles(id, config); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to cleanup sandbox files in %q", sandboxRootDir) } } @@ -210,12 +223,13 @@ func (c *Controller) Start(ctx context.Context, id string) (_ *api.ControllerSta return nil, fmt.Errorf("failed to create containerd task: %w", err) } defer func() { - if retErr != nil { + if retErr != nil && cleanupErr == nil { deferCtx, deferCancel := ctrdutil.DeferContext() defer deferCancel() // Cleanup the sandbox container if an error is returned. if _, err := task.Delete(deferCtx, WithNRISandboxDelete(id), containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) { log.G(ctx).WithError(err).Errorf("Failed to delete sandbox container %q", id) + cleanupErr = err } } }() @@ -245,7 +259,7 @@ func (c *Controller) Start(ctx context.Context, id string) (_ *api.ControllerSta return nil, fmt.Errorf("failed to start sandbox container task %q: %w", id, err) } - resp := &api.ControllerStartResponse{ + resp = &api.ControllerStartResponse{ SandboxID: id, Pid: task.Pid(), CreatedAt: protobuf.ToTimestamp(info.CreatedAt), diff --git a/pkg/cri/sbserver/podsandbox/sandbox_stop.go b/pkg/cri/sbserver/podsandbox/sandbox_stop.go index 5173431ba..0f817e08f 100644 --- a/pkg/cri/sbserver/podsandbox/sandbox_stop.go +++ b/pkg/cri/sbserver/podsandbox/sandbox_stop.go @@ -21,13 +21,14 @@ import ( "fmt" "syscall" + "github.com/sirupsen/logrus" + eventtypes "github.com/containerd/containerd/api/events" api "github.com/containerd/containerd/api/services/sandbox/v1" "github.com/containerd/containerd/errdefs" sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox" ctrdutil "github.com/containerd/containerd/pkg/cri/util" "github.com/containerd/containerd/protobuf" - "github.com/sirupsen/logrus" ) func (c *Controller) Stop(ctx context.Context, sandboxID string) (*api.ControllerStopResponse, error) { @@ -44,7 +45,7 @@ func (c *Controller) Stop(ctx context.Context, sandboxID string) (*api.Controlle // TODO: The Controller maintains its own Status instead of CRI's sandboxStore. // Only stop sandbox container when it's running or unknown. state := sandbox.Status.Get().State - if state == sandboxstore.StateReady || state == sandboxstore.StateUnknown { + if (state == sandboxstore.StateReady || state == sandboxstore.StateUnknown) && sandbox.Container != nil { if err := c.stopSandboxContainer(ctx, sandbox); err != nil { return nil, fmt.Errorf("failed to stop sandbox container %q in %q state: %w", sandboxID, state, err) } diff --git a/pkg/cri/sbserver/restart.go b/pkg/cri/sbserver/restart.go index 6bc91968c..547dd1093 100644 --- a/pkg/cri/sbserver/restart.go +++ b/pkg/cri/sbserver/restart.go @@ -30,6 +30,7 @@ import ( "github.com/containerd/containerd/errdefs" containerdimages "github.com/containerd/containerd/images" "github.com/containerd/containerd/log" + "github.com/containerd/containerd/pkg/cri/sbserver/podsandbox" "github.com/containerd/containerd/platforms" "github.com/containerd/typeurl" "golang.org/x/sync/errgroup" @@ -82,6 +83,28 @@ func (c *criService) recover(ctx context.Context) error { return err } + // Recover sandboxes in the new SandboxStore + storedSandboxes, err := c.client.SandboxStore().List(ctx) + if err != nil { + return fmt.Errorf("failed to list sandboxes from API: %w", err) + } + for _, sbx := range storedSandboxes { + if _, err := c.sandboxStore.Get(sbx.ID); err == nil { + continue + } + metadata := sandboxstore.Metadata{} + err := sbx.GetExtension(podsandbox.MetadataKey, &metadata) + if err != nil { + return fmt.Errorf("failed to get metadata for stored sandbox %q: %w", sbx.ID, err) + } + sb := sandboxstore.NewSandbox(metadata, sandboxstore.Status{State: sandboxstore.StateUnknown}) + // Load network namespace. + sb.NetNS = getNetNS(&metadata) + if err := c.sandboxStore.Add(sb); err != nil { + return fmt.Errorf("failed to add stored sandbox %q to store: %w", sbx.ID, err) + } + } + // Recover all containers. containers, err := c.client.Containers(ctx, filterLabel(containerKindLabel, containerKindContainer)) if err != nil { @@ -427,15 +450,7 @@ func (c *criService) loadSandbox(ctx context.Context, cntr containerd.Container) sandbox.Container = cntr // Load network namespace. - if goruntime.GOOS != "windows" && - meta.Config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetNetwork() == runtime.NamespaceMode_NODE { - // Don't need to load netns for host network sandbox. - return sandbox, nil - } - if goruntime.GOOS == "windows" && meta.Config.GetWindows().GetSecurityContext().GetHostProcess() { - return sandbox, nil - } - sandbox.NetNS = netns.LoadNetNS(meta.NetNSPath) + sandbox.NetNS = getNetNS(meta) // It doesn't matter whether task is running or not. If it is running, sandbox // status will be `READY`; if it is not running, sandbox status will be `NOT_READY`, @@ -443,6 +458,18 @@ func (c *criService) loadSandbox(ctx context.Context, cntr containerd.Container) return sandbox, nil } +func getNetNS(meta *sandboxstore.Metadata) *netns.NetNS { + if goruntime.GOOS != "windows" && + meta.Config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetNetwork() == runtime.NamespaceMode_NODE { + // Don't need to load netns for host network sandbox. + return nil + } + if goruntime.GOOS == "windows" && meta.Config.GetWindows().GetSecurityContext().GetHostProcess() { + return nil + } + return netns.LoadNetNS(meta.NetNSPath) +} + // loadImages loads images from containerd. func (c *criService) loadImages(ctx context.Context, cImages []containerd.Image) { snapshotter := c.config.ContainerdConfig.Snapshotter diff --git a/pkg/cri/sbserver/sandbox_run.go b/pkg/cri/sbserver/sandbox_run.go index 8bc0ca499..4fda0b739 100644 --- a/pkg/cri/sbserver/sandbox_run.go +++ b/pkg/cri/sbserver/sandbox_run.go @@ -27,14 +27,16 @@ import ( "strings" "time" - eventtypes "github.com/containerd/containerd/api/events" - "github.com/containerd/containerd/protobuf" - sb "github.com/containerd/containerd/sandbox" "github.com/containerd/go-cni" "github.com/containerd/typeurl" "github.com/sirupsen/logrus" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + eventtypes "github.com/containerd/containerd/api/events" + "github.com/containerd/containerd/pkg/cri/sbserver/podsandbox" + "github.com/containerd/containerd/protobuf" + sb "github.com/containerd/containerd/sandbox" + "github.com/containerd/containerd/log" "github.com/containerd/containerd/pkg/cri/annotations" criconfig "github.com/containerd/containerd/pkg/cri/config" @@ -64,6 +66,13 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox } name := makeSandboxName(metadata) log.G(ctx).WithField("podsandboxid", id).Debugf("generated id for sandbox name %q", name) + + // cleanupErr records the last error returned by the critical cleanup operations in deferred functions, + // like CNI teardown and stopping the running sandbox task. + // If cleanup is not completed for some reason, the CRI-plugin will leave the sandbox + // in a not-ready state, which can later be cleaned up by the next execution of the kubelet's syncPod workflow. + var cleanupErr error + // Reserve the sandbox name to avoid concurrent `RunPodSandbox` request starting the // same sandbox. if err := c.sandboxNameIndex.Reserve(name, id); err != nil { @@ -71,7 +80,8 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox } defer func() { // Release the name if the function returns with an error. - if retErr != nil { + // When cleanupErr != nil, the name will be cleaned in sandbox_remove. + if retErr != nil && cleanupErr == nil { c.sandboxNameIndex.ReleaseByName(name) } }() @@ -98,6 +108,25 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox }, ) + if _, err := c.client.SandboxStore().Create(ctx, sandboxInfo); err != nil { + return nil, fmt.Errorf("failed to save sandbox metadata: %w", err) + } + defer func() { + if retErr != nil && cleanupErr == nil { + cleanupErr = c.client.SandboxStore().Delete(ctx, id) + } + }() + + defer func() { + // Put the sandbox into sandbox store when some resources fail to be cleaned. + if retErr != nil && cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("encountered an error cleaning up failed sandbox %q, marking sandbox state as SANDBOX_UNKNOWN", id) + if err := c.sandboxStore.Add(sandbox); err != nil { + log.G(ctx).WithError(err).Errorf("failed to add sandbox %+v into store", sandbox) + } + } + }() + var ( podNetwork = true err error @@ -110,7 +139,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox } if goruntime.GOOS == "windows" && config.GetWindows().GetSecurityContext().GetHostProcess() { - //Windows HostProcess pods can only run on the host network + // Windows HostProcess pods can only run on the host network podNetwork = false } @@ -128,20 +157,40 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox if err != nil { return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) } + // Update network namespace in the store, which is used to generate the container's spec sandbox.NetNSPath = sandbox.NetNS.GetPath() defer func() { - if retErr != nil { + // Remove the network namespace only if all the resource cleanup is done + if retErr != nil && cleanupErr == nil { + if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id) + return + } + sandbox.NetNSPath = "" + } + }() + + if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil { + return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err) + } + // Save sandbox metadata to store + if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil { + return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err) + } + + // Define this defer to teardownPodNetwork prior to the setupPodNetwork function call. + // This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource + // creation functions. + defer func() { + // Remove the network namespace only if all the resource cleanup is done. + if retErr != nil && cleanupErr == nil { deferCtx, deferCancel := ctrdutil.DeferContext() defer deferCancel() // Teardown network if an error is returned. - if err := c.teardownPodNetwork(deferCtx, sandbox); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to destroy network for sandbox %q", id) + if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id) } - if err := sandbox.NetNS.Remove(); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id) - } - sandbox.NetNSPath = "" } }() @@ -159,8 +208,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox sandboxCreateNetworkTimer.UpdateSince(netStart) } - // Save sandbox metadata to store - if err := sandboxInfo.AddExtension("metadata", &sandbox.Metadata); err != nil { + if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil { return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err) } @@ -169,8 +217,9 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("failed to get sandbox controller: %w", err) } - if _, err := c.client.SandboxStore().Create(ctx, sandboxInfo); err != nil { - return nil, fmt.Errorf("failed to save sandbox metadata: %w", err) + // Save sandbox metadata to store + if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil { + return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err) } runtimeStart := time.Now() @@ -181,8 +230,19 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox resp, err := controller.Start(ctx, id) if err != nil { + sandbox.Container, _ = c.client.LoadContainer(ctx, id) + if resp != nil && resp.SandboxID == "" && resp.Pid == 0 && resp.CreatedAt == nil && len(resp.Labels) == 0 { + // if resp is a non-nil zero-value, an error occurred during cleanup + cleanupErr = fmt.Errorf("failed to cleanup sandbox") + } return nil, fmt.Errorf("failed to start sandbox %q: %w", id, err) } + // TODO: get rid of this. sandbox object should no longer have Container field. + container, err := c.client.LoadContainer(ctx, id) + if err != nil { + return nil, fmt.Errorf("failed to load container %q for sandbox: %w", id, err) + } + sandbox.Container = container labels := resp.GetLabels() if labels == nil { @@ -201,14 +261,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("failed to update sandbox status: %w", err) } - // TODO: get rid of this. sandbox object should no longer have Container field. - container, err := c.client.LoadContainer(ctx, id) - if err != nil { - return nil, fmt.Errorf("failed to load container %q for sandbox: %w", id, err) - } // Add sandbox into sandbox store in INIT state. - sandbox.Container = container - if err := c.sandboxStore.Add(sandbox); err != nil { return nil, fmt.Errorf("failed to add sandbox %+v into store: %w", sandbox, err) } diff --git a/pkg/cri/sbserver/sandbox_status.go b/pkg/cri/sbserver/sandbox_status.go index a9508451e..97e8aae4e 100644 --- a/pkg/cri/sbserver/sandbox_status.go +++ b/pkg/cri/sbserver/sandbox_status.go @@ -141,38 +141,66 @@ type SandboxInfo struct { RuntimeType string `json:"runtimeType"` RuntimeOptions interface{} `json:"runtimeOptions"` Config *runtime.PodSandboxConfig `json:"config"` - RuntimeSpec *runtimespec.Spec `json:"runtimeSpec"` - CNIResult *cni.Result `json:"cniResult"` + // Note: RuntimeSpec may not be populated if the sandbox has not been fully created. + RuntimeSpec *runtimespec.Spec `json:"runtimeSpec"` + CNIResult *cni.Result `json:"cniResult"` + Metadata *sandboxstore.Metadata `json:"sandboxMetadata"` } // toCRISandboxInfo converts internal container object information to CRI sandbox status response info map. func toCRISandboxInfo(ctx context.Context, sandbox sandboxstore.Sandbox) (map[string]string, error) { - container := sandbox.Container - task, err := container.Task(ctx, nil) - if err != nil && !errdefs.IsNotFound(err) { - return nil, fmt.Errorf("failed to get sandbox container task: %w", err) - } - - var processStatus containerd.ProcessStatus - if task != nil { - if taskStatus, err := task.Status(ctx); err != nil { - if !errdefs.IsNotFound(err) { - return nil, fmt.Errorf("failed to get task status: %w", err) - } - processStatus = containerd.Unknown - } else { - processStatus = taskStatus.Status - } - } - si := &SandboxInfo{ Pid: sandbox.Status.Get().Pid, - RuntimeHandler: sandbox.RuntimeHandler, - Status: string(processStatus), Config: sandbox.Config, + RuntimeHandler: sandbox.RuntimeHandler, CNIResult: sandbox.CNIResult, } + if sandbox.Container != nil { + container := sandbox.Container + task, err := container.Task(ctx, nil) + if err != nil && !errdefs.IsNotFound(err) { + return nil, fmt.Errorf("failed to get sandbox container task: %w", err) + } + + var processStatus containerd.ProcessStatus + if task != nil { + if taskStatus, err := task.Status(ctx); err != nil { + if !errdefs.IsNotFound(err) { + return nil, fmt.Errorf("failed to get task status: %w", err) + } + processStatus = containerd.Unknown + } else { + processStatus = taskStatus.Status + } + } + si.Status = string(processStatus) + + spec, err := container.Spec(ctx) + if err != nil { + return nil, fmt.Errorf("failed to get sandbox container runtime spec: %w", err) + } + si.RuntimeSpec = spec + + ctrInfo, err := container.Info(ctx) + if err != nil { + return nil, fmt.Errorf("failed to get sandbox container info: %w", err) + } + // Do not use config.SandboxImage because the configuration might + // be changed during restart. It may not reflect the actual image + // used by the sandbox container. + si.Image = ctrInfo.Image + si.SnapshotKey = ctrInfo.SnapshotKey + si.Snapshotter = ctrInfo.Snapshotter + + runtimeOptions, err := getRuntimeOptions(ctrInfo) + if err != nil { + return nil, fmt.Errorf("failed to get runtime options: %w", err) + } + si.RuntimeType = ctrInfo.Runtime.Name + si.RuntimeOptions = runtimeOptions + } + if si.Status == "" { // If processStatus is empty, it means that the task is deleted. Apply "deleted" // status which does not exist in containerd. @@ -188,29 +216,7 @@ func toCRISandboxInfo(ctx context.Context, sandbox sandboxstore.Sandbox) (map[st si.NetNSClosed = closed } - spec, err := container.Spec(ctx) - if err != nil { - return nil, fmt.Errorf("failed to get sandbox container runtime spec: %w", err) - } - si.RuntimeSpec = spec - - ctrInfo, err := container.Info(ctx) - if err != nil { - return nil, fmt.Errorf("failed to get sandbox container info: %w", err) - } - // Do not use config.SandboxImage because the configuration might - // be changed during restart. It may not reflect the actual image - // used by the sandbox container. - si.Image = ctrInfo.Image - si.SnapshotKey = ctrInfo.SnapshotKey - si.Snapshotter = ctrInfo.Snapshotter - - runtimeOptions, err := getRuntimeOptions(ctrInfo) - if err != nil { - return nil, fmt.Errorf("failed to get runtime options: %w", err) - } - si.RuntimeType = ctrInfo.Runtime.Name - si.RuntimeOptions = runtimeOptions + si.Metadata = &sandbox.Metadata infoBytes, err := json.Marshal(si) if err != nil {