From 601699a184fa20f4f9dfd89a78e650a5aeddbef1 Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Fri, 11 Aug 2023 08:06:15 +0000 Subject: [PATCH] integration: add ShouldRetryShutdown case based on #7496 Since the moby/moby can't handle duplicate exit event well, it's hard for containerd to retry shutdown if there is error, like context canceled. In order to prevent from regression like #4769, I add skipped integration case as TODO item and we should rethink about how to handle the task/shim lifecycle. Signed-off-by: Wei Fu --- integration/issue7496_shutdown_linux_test.go | 65 ++++++++++++++++++++ runtime/v2/shim.go | 12 +++- 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 integration/issue7496_shutdown_linux_test.go diff --git a/integration/issue7496_shutdown_linux_test.go b/integration/issue7496_shutdown_linux_test.go new file mode 100644 index 000000000..e9e1cdc58 --- /dev/null +++ b/integration/issue7496_shutdown_linux_test.go @@ -0,0 +1,65 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package integration + +import ( + "context" + "testing" + + "github.com/stretchr/testify/require" + + apitask "github.com/containerd/containerd/api/runtime/task/v2" + "github.com/containerd/containerd/namespaces" +) + +// TestIssue7496_ShouldRetryShutdown is based on https://github.com/containerd/containerd/issues/7496. 
+//
+// Assume that the shim.Delete takes almost 10 seconds and returns successfully
+// and there is no container in shim. However, the context is close to being
+// canceled. It will fail to call Shutdown. If we ignore the Canceled error,
+// the shim will be leaked. In order to reproduce this, this case will use
+// failpoint to inject error into Shutdown API, and then check whether the shim
+// is leaked.
+func TestIssue7496_ShouldRetryShutdown(t *testing.T) {
+	// TODO: re-enable if we can retry Shutdown API.
+	t.Skipf("Please re-enable me if we can retry Shutdown API")
+
+	ctx := namespaces.WithNamespace(context.Background(), "k8s.io")
+
+	t.Logf("Create a pod config with shutdown failpoint")
+	sbConfig := PodSandboxConfig("sandbox", "issue7496_shouldretryshutdown")
+	injectShimFailpoint(t, sbConfig, map[string]string{
+		"Shutdown": "1*error(please retry)",
+	})
+
+	t.Logf("RunPodSandbox")
+	sbID, err := runtimeService.RunPodSandbox(sbConfig, failpointRuntimeHandler)
+	require.NoError(t, err)
+
+	t.Logf("Connect to the shim %s", sbID)
+	shimCli := connectToShim(ctx, t, sbID)
+
+	t.Logf("Log shim %s's pid: %d", sbID, shimPid(ctx, t, shimCli))
+
+	t.Logf("StopPodSandbox and RemovePodSandbox")
+	require.NoError(t, runtimeService.StopPodSandbox(sbID))
+	require.NoError(t, runtimeService.RemovePodSandbox(sbID))
+
+	t.Logf("Check the shim connection")
+	_, err = shimCli.Connect(ctx, &apitask.ConnectRequest{})
+	require.Error(t, err, "should failed to call shim connect API")
+}
diff --git a/runtime/v2/shim.go b/runtime/v2/shim.go
index 7524fe280..f4265c17f 100644
--- a/runtime/v2/shim.go
+++ b/runtime/v2/shim.go
@@ -458,6 +458,12 @@ func (s *shimTask) delete(ctx context.Context, sandboxed bool, removeTask func(c
 	// If not, the shim has been delivered the exit and delete events.
 	// So we should remove the record and prevent duplicate events from
 	// ttrpc-callback-on-close.
+	//
+	// TODO: It's hard to guarantee that the event is unique and sent only
+	// once. 
The moby/moby should not rely on the assumption that there is
+	// only one exit event. The moby/moby should handle the duplicate events.
+	//
+	// REF: https://github.com/containerd/containerd/issues/4769
 	if shimErr == nil {
 		removeTask(ctx, s.ID())
 	}
@@ -466,7 +472,11 @@ func (s *shimTask) delete(ctx context.Context, sandboxed bool, removeTask func(c
 	// Let controller decide when to shutdown.
 	if !sandboxed {
 		if err := s.waitShutdown(ctx); err != nil {
-			log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to shutdown shim task")
+			// FIXME(fuweid):
+			//
+			// If the error is context canceled, should we use context.TODO()
+			// to wait for it?
+			log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to shutdown shim task and the shim might be leaked")
 		}
 	}