From 601699a184fa20f4f9dfd89a78e650a5aeddbef1 Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Fri, 11 Aug 2023 08:06:15 +0000 Subject: [PATCH] integration: add ShouldRetryShutdown case based on #7496 Since the moby/moby can't handle duplicate exit event well, it's hard for containerd to retry shutdown if there is error, like context canceled. In order to prevent from regression like #4769, I add skipped integration case as TODO item and we should rethink about how to handle the task/shim lifecycle. Signed-off-by: Wei Fu --- integration/issue7496_shutdown_linux_test.go | 65 ++++++++++++++++++++ runtime/v2/shim.go | 12 +++- 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 integration/issue7496_shutdown_linux_test.go diff --git a/integration/issue7496_shutdown_linux_test.go b/integration/issue7496_shutdown_linux_test.go new file mode 100644 index 000000000..e9e1cdc58 --- /dev/null +++ b/integration/issue7496_shutdown_linux_test.go @@ -0,0 +1,65 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package integration + +import ( + "context" + "testing" + + "github.com/stretchr/testify/require" + + apitask "github.com/containerd/containerd/api/runtime/task/v2" + "github.com/containerd/containerd/namespaces" +) + +// TestIssue7496_ShouldRetryShutdown is based on https://github.com/containerd/containerd/issues/7496. 
+//
+// Assume that the shim.Delete takes almost 10 seconds and returns successfully
+// and there is no container in shim. However, the context is close to being
+// canceled. It will fail to call Shutdown. If we ignore the Canceled error,
+// the shim will be leaked. In order to reproduce this, this case will use
+// failpoint to inject error into Shutdown API, and then check whether the shim
+// is leaked.
+func TestIssue7496_ShouldRetryShutdown(t *testing.T) {
+	// TODO: re-enable if we can retry Shutdown API.
+	t.Skipf("Please re-enable me if we can retry Shutdown API")
+
+	ctx := namespaces.WithNamespace(context.Background(), "k8s.io")
+
+	t.Logf("Create a pod config with shutdown failpoint")
+	sbConfig := PodSandboxConfig("sandbox", "issue7496_shouldretryshutdown")
+	injectShimFailpoint(t, sbConfig, map[string]string{
+		"Shutdown": "1*error(please retry)",
+	})
+
+	t.Logf("RunPodSandbox")
+	sbID, err := runtimeService.RunPodSandbox(sbConfig, failpointRuntimeHandler)
+	require.NoError(t, err)
+
+	t.Logf("Connect to the shim %s", sbID)
+	shimCli := connectToShim(ctx, t, sbID)
+
+	t.Logf("Log shim %s's pid: %d", sbID, shimPid(ctx, t, shimCli))
+
+	t.Logf("StopPodSandbox and RemovePodSandbox")
+	require.NoError(t, runtimeService.StopPodSandbox(sbID))
+	require.NoError(t, runtimeService.RemovePodSandbox(sbID))
+
+	t.Logf("Check the shim connection")
+	_, err = shimCli.Connect(ctx, &apitask.ConnectRequest{})
+	require.Error(t, err, "should failed to call shim connect API")
+}
diff --git a/runtime/v2/shim.go b/runtime/v2/shim.go
index 7524fe280..f4265c17f 100644
--- a/runtime/v2/shim.go
+++ b/runtime/v2/shim.go
@@ -458,6 +458,12 @@ func (s *shimTask) delete(ctx context.Context, sandboxed bool, removeTask func(c
 	// If not, the shim has been delivered the exit and delete events.
 	// So we should remove the record and prevent duplicate events from
 	// ttrpc-callback-on-close.
+	//
+	// TODO: It's hard to guarantee that the event is unique and sent only
+	// once. 
The moby/moby should not rely on the assumption that there is
+	// only one exit event. The moby/moby should handle the duplicate events.
+	//
+	// REF: https://github.com/containerd/containerd/issues/4769
 	if shimErr == nil {
 		removeTask(ctx, s.ID())
 	}
@@ -466,7 +472,11 @@ func (s *shimTask) delete(ctx context.Context, sandboxed bool, removeTask func(c
 	// Let controller decide when to shutdown.
 	if !sandboxed {
 		if err := s.waitShutdown(ctx); err != nil {
-			log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to shutdown shim task")
+			// FIXME(fuweid):
+			//
+			// If the error is context canceled, should we use context.TODO()
+			// to wait for it?
+			log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to shutdown shim task and the shim might be leaked")
 		}
 	}