integration: add ShouldRetryShutdown case based on #7496

Since moby/moby can't handle duplicate exit events well, it's hard for
containerd to retry shutdown when there is an error, such as a canceled
context.

To prevent a regression like #4769, I add a skipped integration case
as a TODO item, and we should rethink how to handle the task/shim
lifecycle.

Signed-off-by: Wei Fu <fuweid89@gmail.com>
Wei Fu 2023-08-11 08:06:15 +00:00
parent 8dcb2a6e6d
commit 601699a184
2 changed files with 76 additions and 1 deletion

@@ -0,0 +1,65 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package integration

import (
	"context"
	"testing"

	apitask "github.com/containerd/containerd/api/runtime/task/v2"
	"github.com/containerd/containerd/namespaces"
	"github.com/stretchr/testify/require"
)

// TestIssue7496_ShouldRetryShutdown is based on https://github.com/containerd/containerd/issues/7496.
//
// Assume that shim.Delete takes almost 10 seconds, returns successfully,
// and there is no container left in the shim. However, the context is
// close to being canceled, so the call to Shutdown fails. If we ignore
// the Canceled error, the shim will be leaked. To reproduce this, the
// case uses a failpoint to inject an error into the Shutdown API and
// then checks whether the shim is leaked.
func TestIssue7496_ShouldRetryShutdown(t *testing.T) {
	// TODO: re-enable once the Shutdown API can be retried.
	t.Skipf("Please re-enable me if we can retry Shutdown API")

	ctx := namespaces.WithNamespace(context.Background(), "k8s.io")

	t.Logf("Create a pod config with shutdown failpoint")
	sbConfig := PodSandboxConfig("sandbox", "issue7496_shouldretryshutdown")
	injectShimFailpoint(t, sbConfig, map[string]string{
		"Shutdown": "1*error(please retry)",
	})

	t.Logf("RunPodSandbox")
	sbID, err := runtimeService.RunPodSandbox(sbConfig, failpointRuntimeHandler)
	require.NoError(t, err)

	t.Logf("Connect to the shim %s", sbID)
	shimCli := connectToShim(ctx, t, sbID)

	t.Logf("Log shim %s's pid: %d", sbID, shimPid(ctx, t, shimCli))

	t.Logf("StopPodSandbox and RemovePodSandbox")
	require.NoError(t, runtimeService.StopPodSandbox(sbID))
	require.NoError(t, runtimeService.RemovePodSandbox(sbID))

	t.Logf("Check the shim connection")
	_, err = shimCli.Connect(ctx, &apitask.ConnectRequest{})
	require.Error(t, err, "should fail to call the shim Connect API")
}
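
For context, the failpoint term "1*error(please retry)" passed to the injectShimFailpoint helper above fails the first Shutdown call with that message and lets later calls succeed. A minimal self-contained sketch of that one-shot semantics (illustrative only, not containerd's actual pkg/failpoint implementation):

package main

import (
	"errors"
	"fmt"
	"sync"
)

// errorNTimes mimics a failpoint term like "1*error(please retry)":
// the first n evaluations fail with msg, then the hooked method
// behaves normally again.
type errorNTimes struct {
	mu  sync.Mutex
	n   int
	msg string
}

// Evaluate returns the injected error while the budget lasts, nil afterwards.
func (f *errorNTimes) Evaluate() error {
	f.mu.Lock()
	defer f.mu.Unlock()
	if f.n > 0 {
		f.n--
		return errors.New(f.msg)
	}
	return nil
}

func main() {
	fp := &errorNTimes{n: 1, msg: "please retry"}
	fmt.Println(fp.Evaluate()) // please retry
	fmt.Println(fp.Evaluate()) // <nil>
}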

@@ -458,6 +458,12 @@ func (s *shimTask) delete(ctx context.Context, sandboxed bool, removeTask func(c
 	// If not, the shim has been delivered the exit and delete events.
 	// So we should remove the record and prevent duplicate events from
 	// ttrpc-callback-on-close.
+	//
+	// TODO: It's hard to guarantee that the event is unique and sent only
+	// once. moby/moby should not rely on the assumption that there is
+	// only one exit event; it should handle duplicate events.
+	//
+	// REF: https://github.com/containerd/containerd/issues/4769
 	if shimErr == nil {
 		removeTask(ctx, s.ID())
 	}
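
For illustration, deduplication on the moby/moby side could be as simple as remembering which exit events have already been seen. A hedged sketch, assuming an event is identified by container ID and PID (not moby/moby's actual code):

package events

import "sync"

// exitKey identifies an exit event; the fields are assumptions for
// illustration, not moby/moby's actual event schema.
type exitKey struct {
	containerID string
	pid         uint32
}

// exitDeduper drops exit events that were already observed.
type exitDeduper struct {
	mu   sync.Mutex
	seen map[exitKey]struct{}
}

// Observe reports whether the event is new; duplicates return false.
func (d *exitDeduper) Observe(id string, pid uint32) bool {
	d.mu.Lock()
	defer d.mu.Unlock()
	if d.seen == nil {
		d.seen = make(map[exitKey]struct{})
	}
	k := exitKey{containerID: id, pid: pid}
	if _, dup := d.seen[k]; dup {
		return false
	}
	d.seen[k] = struct{}{}
	return true
}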
@@ -466,7 +472,11 @@ func (s *shimTask) delete(ctx context.Context, sandboxed bool, removeTask func(c
 	// Let controller decide when to shutdown.
 	if !sandboxed {
 		if err := s.waitShutdown(ctx); err != nil {
-			log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to shutdown shim task")
+			// FIXME(fuweid):
+			//
+			// If the error is context canceled, should we use context.TODO()
+			// to wait for it?
+			log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to shutdown shim task and the shim might be leaked")
 		}
 	}
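
Finally, the retry that the FIXME and the skipped test ask for could look roughly like the following: on context.Canceled, fall back to a detached, bounded context so the shim still gets a shutdown attempt instead of being leaked. This is a sketch under assumptions (the callback shape and the 10-second budget are hypothetical), not the actual containerd change:

package shimutil

import (
	"context"
	"errors"
	"time"
)

// shutdownWithRetry calls shutdown with the caller's context and, if that
// attempt failed only because the caller's context was canceled, retries
// once on a detached context with its own deadline.
func shutdownWithRetry(ctx context.Context, shutdown func(context.Context) error) error {
	err := shutdown(ctx)
	if err == nil || !errors.Is(err, context.Canceled) {
		return err
	}
	// The caller's context is gone; use a bounded background context so
	// the retry cannot hang forever (the 10s budget is an assumption).
	dctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	return shutdown(dctx)
}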