integration: deflake TestContainerdRestart
The CRI-plugin sets up a watcher for each container after StartContainer or RunPodSandbox. It cleans up the task (container/sandbox) when it receives the exit event from the watcher. The original test design was to `Delete` the sandbox container to get the NOT_READY state, expecting to receive a NotFound error. This depends on the CRI-plugin cleaning up the container after the `Delete` API call. If it does not, the shim will be cleaned up and the test code will receive `ttrpc: closed: unknown` or some other unknown error, which makes the test flaky. In this patch, the test only sends the kill signal and waits for the exit event. When the sandbox exits, its state will and must be NOT_READY. ```plain // test fail log === RUN TestContainerdRestart restart_test.go:92: Make sure no sandbox is running before test restart_test.go:97: Start test sandboxes and containers common.go:115: Image "k8s.gcr.io/pause:3.6" already exists, not pulling. common.go:115: Image "k8s.gcr.io/pause:3.6" already exists, not pulling. restart_test.go:139: Error Trace: restart_test.go:139 Error: Should be true Test: TestContainerdRestart Messages: delete should return not found error but returned failed to delete task: ttrpc: closed: unknown --- FAIL: TestContainerdRestart (4.25s) // containerd log &TaskExit{ContainerID:4b4c1d1d303c14a2cc759631d163f153ba8536e9ea6821744a509e4a17346184,ID:4b4c1d1d303c14a2cc759631d163f153ba8536e9ea6821744a509e4a17346184,Pid:28430,ExitStatus:137,ExitedAt:2021-12-12 07:56:01.400753012 +0000 UTC,XXX_unrecognized:[],}" time="2021-12-12T07:56:01.401120516Z" level=debug msg="event forwarded" ns=k8s.io topic=/tasks/exit type=containerd.events.TaskExit time="2021-12-12T07:56:01.418934208Z" level=debug msg="event forwarded" ns=k8s.io topic=/tasks/delete type=containerd.events.TaskDelete time="2021-12-12T07:56:01.419192910Z" level=info msg="shim disconnected" id=4b4c1d1d303c14a2cc759631d163f153ba8536e9ea6821744a509e4a17346184 time="2021-12-12T07:56:01.419235911Z" level=warning msg="cleaning up after shim disconnected" 
id=4b4c1d1d303c14a2cc759631d163f153ba8536e9ea6821744a509e4a17346184 namespace=k8s.io time="2021-12-12T07:56:01.419247711Z" level=info msg="cleaning up dead shim" time="2021-12-12T07:56:01.419235311Z" level=error msg="failed sending message on channel" error="write unix /run/containerd/s/18afde7fcde70236eb31b9f43f3bd92af1dc1186583c501aa1396255f87f95d4->@: write: broken pipe" time="2021-12-12T07:56:01.419354712Z" level=debug msg="failed to delete task" error="ttrpc: closed" id=4b4c1d1d303c14a2cc759631d163f153ba8536e9ea6821744a509e4a17346184 ``` CI Link: `https://pipelines.actions.githubusercontent.com/G4SighzWVVZ6vsyiz7FFMFjLjRzveJHseEnVyibkSq87Cl2x4O/_apis/pipelines/1/runs/9501/signedlogcontent/76?urlExpires=2021-12-12T08%3A42%3A08.0765750Z&urlSigningMethod=HMACV1&urlSignature=pH93isMSFdZUo1ndnZynJpZbPGrEyvt12MO03fgUU7I%3D` Signed-off-by: Wei Fu <fuweid89@gmail.com>
This commit is contained in:
parent
4236f6b225
commit
9e9ee66bfd
@ -19,10 +19,11 @@ package integration
|
||||
import (
|
||||
goruntime "runtime"
|
||||
"sort"
|
||||
"syscall"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/containerd/containerd"
|
||||
"github.com/containerd/containerd/errdefs"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/net/context"
|
||||
@ -134,9 +135,21 @@ func TestContainerdRestart(t *testing.T) {
|
||||
require.NoError(t, err)
|
||||
task, err := cntr.Task(ctx, nil)
|
||||
require.NoError(t, err)
|
||||
_, err = task.Delete(ctx, containerd.WithProcessKill)
|
||||
if err != nil {
|
||||
require.True(t, errdefs.IsNotFound(err), "delete should return not found error but returned %v", err)
|
||||
|
||||
waitCh, err := task.Wait(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
// NOTE: CRI-plugin setups watcher for each container and
|
||||
// cleanups container when the watcher returns exit event.
|
||||
// We just need to kill that sandbox and wait for exit
|
||||
// event from waitCh. If the sandbox container exits,
|
||||
// the state of sandbox must be NOT_READY.
|
||||
require.NoError(t, task.Kill(ctx, syscall.SIGKILL, containerd.WithKillAll))
|
||||
|
||||
select {
|
||||
case <-waitCh:
|
||||
case <-time.After(30 * time.Second):
|
||||
t.Fatalf("expected to receive exit event in time, but timeout")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user