integration: add ShouldRetryShutdown case based on #7496
Since the moby/moby can't handle duplicate exit event well, it's hard for containerd to retry shutdown if there is error, like context canceled. In order to prevent from regression like #4769, I add skipped integration case as TODO item and we should rethink about how to handle the task/shim lifecycle. Signed-off-by: Wei Fu <fuweid89@gmail.com>
This commit is contained in:
parent
8dcb2a6e6d
commit
601699a184
65
integration/issue7496_shutdown_linux_test.go
Normal file
65
integration/issue7496_shutdown_linux_test.go
Normal file
@ -0,0 +1,65 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
apitask "github.com/containerd/containerd/api/runtime/task/v2"
|
||||
"github.com/containerd/containerd/namespaces"
|
||||
)
|
||||
|
||||
// TestIssue7496_ShouldRetryShutdown is based on https://github.com/containerd/containerd/issues/7496.
|
||||
//
|
||||
// Assume that the shim.Delete takes almost 10 seconds and returns successfully
|
||||
// and there is no container in shim. However, the context is close to be
|
||||
// canceled. It will fail to call Shutdown. If we ignores the Canceled error,
|
||||
// the shim will be leaked. In order to reproduce this, this case will use
|
||||
// failpoint to inject error into Shutdown API, and then check whether the shim
|
||||
// is leaked.
|
||||
func TestIssue7496_ShouldRetryShutdown(t *testing.T) {
|
||||
// TODO: re-enable if we can retry Shutdown API.
|
||||
t.Skipf("Please re-enable me if we can retry Shutdown API")
|
||||
|
||||
ctx := namespaces.WithNamespace(context.Background(), "k8s.io")
|
||||
|
||||
t.Logf("Create a pod config with shutdown failpoint")
|
||||
sbConfig := PodSandboxConfig("sandbox", "issue7496_shouldretryshutdown")
|
||||
injectShimFailpoint(t, sbConfig, map[string]string{
|
||||
"Shutdown": "1*error(please retry)",
|
||||
})
|
||||
|
||||
t.Logf("RunPodSandbox")
|
||||
sbID, err := runtimeService.RunPodSandbox(sbConfig, failpointRuntimeHandler)
|
||||
require.NoError(t, err)
|
||||
|
||||
t.Logf("Connect to the shim %s", sbID)
|
||||
shimCli := connectToShim(ctx, t, sbID)
|
||||
|
||||
t.Logf("Log shim %s's pid: %d", sbID, shimPid(ctx, t, shimCli))
|
||||
|
||||
t.Logf("StopPodSandbox and RemovePodSandbox")
|
||||
require.NoError(t, runtimeService.StopPodSandbox(sbID))
|
||||
require.NoError(t, runtimeService.RemovePodSandbox(sbID))
|
||||
|
||||
t.Logf("Check the shim connection")
|
||||
_, err = shimCli.Connect(ctx, &apitask.ConnectRequest{})
|
||||
require.Error(t, err, "should failed to call shim connect API")
|
||||
}
|
@ -458,6 +458,12 @@ func (s *shimTask) delete(ctx context.Context, sandboxed bool, removeTask func(c
|
||||
// If not, the shim has been delivered the exit and delete events.
|
||||
// So we should remove the record and prevent duplicate events from
|
||||
// ttrpc-callback-on-close.
|
||||
//
|
||||
// TODO: It's hard to guarantee that the event is unique and sent only
|
||||
// once. The moby/moby should not rely on that assumption that there is
|
||||
// only one exit event. The moby/moby should handle the duplicate events.
|
||||
//
|
||||
// REF: https://github.com/containerd/containerd/issues/4769
|
||||
if shimErr == nil {
|
||||
removeTask(ctx, s.ID())
|
||||
}
|
||||
@ -466,7 +472,11 @@ func (s *shimTask) delete(ctx context.Context, sandboxed bool, removeTask func(c
|
||||
// Let controller decide when to shutdown.
|
||||
if !sandboxed {
|
||||
if err := s.waitShutdown(ctx); err != nil {
|
||||
log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to shutdown shim task")
|
||||
// FIXME(fuweid):
|
||||
//
|
||||
// If the error is context canceled, should we use context.TODO()
|
||||
// to wait for it?
|
||||
log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to shutdown shim task and the shim might be leaked")
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user