diff --git a/pkg/server/container_stop.go b/pkg/server/container_stop.go index 4dcf2d521..0605d4d85 100644 --- a/pkg/server/container_stop.go +++ b/pkg/server/container_stop.go @@ -71,10 +71,10 @@ func (c *criContainerdService) StopContainer(ctx context.Context, r *runtime.Sto glog.V(2).Infof("Stop container %q with signal %v", id, stopSignal) _, err = c.containerService.Kill(ctx, &execution.KillRequest{ID: id, Signal: uint32(stopSignal)}) if err != nil { - if isContainerdContainerNotExistError(err) { - return &runtime.StopContainerResponse{}, nil + if !isContainerdContainerNotExistError(err) && !isRuncProcessAlreadyFinishedError(err) { + return nil, fmt.Errorf("failed to stop container %q: %v", id, err) } - return nil, fmt.Errorf("failed to stop container %q: %v", id, err) + // Move on to make sure container status is updated. } err = c.waitContainerStop(ctx, id, time.Duration(r.GetTimeout())*time.Second) @@ -84,20 +84,19 @@ func (c *criContainerdService) StopContainer(ctx context.Context, r *runtime.Sto glog.Errorf("Stop container %q timed out: %v", id, err) } - glog.V(2).Infof("Delete container from containerd %q", id) - // Delete sends SIGKILL to the container in the containerd version we are using. - // TODO(random-liu): Replace with `Kill` to avoid race soon. - _, err = c.containerService.Delete(ctx, &execution.DeleteRequest{ID: id}) + // Event handler will Delete the container from containerd after it handles the Exited event. + glog.V(2).Infof("Kill container %q", id) + _, err = c.containerService.Kill(ctx, &execution.KillRequest{ID: id, Signal: uint32(unix.SIGKILL)}) if err != nil { - if isContainerdContainerNotExistError(err) { - return &runtime.StopContainerResponse{}, nil + if !isContainerdContainerNotExistError(err) && !isRuncProcessAlreadyFinishedError(err) { + return nil, fmt.Errorf("failed to kill container %q: %v", id, err) } - return nil, fmt.Errorf("failed to delete container %q: %v", id, err) + // Move on to make sure container status is updated. } - // Wait forever until container stop is observed by event monitor. + // Wait for a fixed timeout until container stop is observed by event monitor. if err := c.waitContainerStop(ctx, id, killContainerTimeout); err != nil { - return nil, fmt.Errorf("error occurs during waiting for container %q to stop: %v", + return nil, fmt.Errorf("an error occurs during waiting for container %q to stop: %v", id, err) } return &runtime.StopContainerResponse{}, nil diff --git a/pkg/server/container_stop_test.go b/pkg/server/container_stop_test.go index 85e32ccc1..7971c916b 100644 --- a/pkg/server/container_stop_test.go +++ b/pkg/server/container_stop_test.go @@ -25,6 +25,7 @@ import ( "github.com/containerd/containerd/api/types/container" "github.com/stretchr/testify/assert" "golang.org/x/net/context" + "golang.org/x/sys/unix" runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1" "github.com/kubernetes-incubator/cri-containerd/pkg/metadata" @@ -106,17 +107,15 @@ func TestStopContainer(t *testing.T) { for desc, test := range map[string]struct { metadata *metadata.ContainerMetadata containerdContainer *container.Container - killErr error - deleteErr error - discardEvents int + stopErr error noTimeout bool expectErr bool - expectCalls []string + expectCalls []servertesting.CalledDetail }{ "should return error when container does not exist": { metadata: nil, expectErr: true, - expectCalls: []string{}, + expectCalls: []servertesting.CalledDetail{}, }, "should not return error when container is not running": { metadata: &metadata.ContainerMetadata{ @@ -124,52 +123,99 @@ func TestStopContainer(t *testing.T) { CreatedAt: time.Now().UnixNano(), }, expectErr: false, - expectCalls: []string{}, + expectCalls: []servertesting.CalledDetail{}, }, "should not return error if containerd container does not exist": { - metadata: &testMetadata, - expectErr: false, - expectCalls: []string{"kill"}, + metadata: &testMetadata, + containerdContainer: &testContainer, + // Since it's hard to inject event during `StopContainer` is running, + // we only test the case that first stop returns error, but container + // status is not updated yet. + // We also leverage this behavior to test that when graceful + // stop doesn't take effect, container should be SIGKILL-ed. + stopErr: servertesting.ContainerNotExistError, + expectErr: false, + expectCalls: []servertesting.CalledDetail{ + { + Name: "kill", + Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGTERM)}, + }, + { + Name: "kill", + Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGKILL)}, + }, + { + Name: "delete", + Argument: &execution.DeleteRequest{ID: testID}, + }, + }, }, - "should not return error if containerd container is killed": { + "should not return error if containerd container process already finished": { + metadata: &testMetadata, + containerdContainer: &testContainer, + stopErr: errors.New("os: process already finished"), + expectErr: false, + expectCalls: []servertesting.CalledDetail{ + { + Name: "kill", + Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGTERM)}, + }, + { + Name: "kill", + Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGKILL)}, + }, + { + Name: "delete", + Argument: &execution.DeleteRequest{ID: testID}, + }, + }, + }, + "should return error if graceful stop returns random error": { + metadata: &testMetadata, + containerdContainer: &testContainer, + stopErr: errors.New("random stop error"), + expectErr: true, + expectCalls: []servertesting.CalledDetail{ + { + Name: "kill", + Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGTERM)}, + }, + }, + }, + "should not return error if containerd container is gracefully stopped": { metadata: &testMetadata, containerdContainer: &testContainer, expectErr: false, // deleted by the event monitor. - expectCalls: []string{"kill", "delete"}, - }, - "should not return error if containerd container is deleted": { - metadata: &testMetadata, - containerdContainer: &testContainer, - // discard killed events to force a delete. This is only - // for testing. Actually real containerd should only generate - // one EXIT event. - discardEvents: 1, - expectErr: false, - // one more delete from the event monitor. - expectCalls: []string{"kill", "delete", "delete"}, - }, - "should return error if kill failed": { - metadata: &testMetadata, - containerdContainer: &testContainer, - killErr: errors.New("random error"), - expectErr: true, - expectCalls: []string{"kill"}, + expectCalls: []servertesting.CalledDetail{ + { + Name: "kill", + Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGTERM)}, + }, + { + Name: "delete", + Argument: &execution.DeleteRequest{ID: testID}, + }, + }, }, "should directly kill container if timeout is 0": { metadata: &testMetadata, containerdContainer: &testContainer, noTimeout: true, - expectCalls: []string{"delete", "delete"}, - }, - "should return error if delete failed": { - metadata: &testMetadata, - containerdContainer: &testContainer, - deleteErr: errors.New("random error"), - discardEvents: 1, - expectErr: true, - expectCalls: []string{"kill", "delete"}, + expectErr: false, + expectCalls: []servertesting.CalledDetail{ + { + Name: "kill", + Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGKILL)}, + }, + { + Name: "delete", + Argument: &execution.DeleteRequest{ID: testID}, + }, + }, }, + // TODO(random-liu): Test "should return error if both failed" after we have + // fake clock for test. } { t.Logf("TestCase %q", desc) c := newTestCRIContainerdService() @@ -185,28 +231,19 @@ func TestStopContainer(t *testing.T) { if test.containerdContainer != nil { fake.SetFakeContainers([]container.Container{*test.containerdContainer}) } - if test.killErr != nil { - fake.InjectError("kill", test.killErr) - } - if test.deleteErr != nil { - fake.InjectError("delete", test.deleteErr) + if test.stopErr != nil { + fake.InjectError("kill", test.stopErr) } eventClient, err := fake.Events(context.Background(), &execution.EventsRequest{}) assert.NoError(t, err) // Start a simple test event monitor. - go func(e execution.ContainerService_EventsClient, discard int) { + go func(e execution.ContainerService_EventsClient) { for { - e, err := e.Recv() // nolint: vetshadow - if err != nil { + if err := c.handleEventStream(e); err != nil { // nolint: vetshadow return } - if discard > 0 { - discard-- - continue - } - c.handleEvent(e) } - }(eventClient, test.discardEvents) + }(eventClient) fake.ClearCalls() timeout := int64(1) if test.noTimeout { @@ -225,6 +262,6 @@ func TestStopContainer(t *testing.T) { assert.NoError(t, err) assert.NotNil(t, resp) } - assert.Equal(t, test.expectCalls, fake.GetCalledNames()) + assert.Equal(t, test.expectCalls, fake.GetCalledDetails()) } } diff --git a/pkg/server/helpers.go b/pkg/server/helpers.go index 218e944ab..46fac2cba 100644 --- a/pkg/server/helpers.go +++ b/pkg/server/helpers.go @@ -226,6 +226,13 @@ func isContainerdContainerNotExistError(grpcError error) bool { return grpc.ErrorDesc(grpcError) == containerd.ErrContainerNotExist.Error() } +// isRuncProcessAlreadyFinishedError checks whether a grpc error is a process already +// finished error. +// TODO(random-liu): Containerd should expose this error in api. (containerd#999) +func isRuncProcessAlreadyFinishedError(grpcError error) bool { + return strings.Contains(grpc.ErrorDesc(grpcError), "os: process already finished") +} + // getSandbox gets the sandbox metadata from the sandbox store. It returns nil without // error if the sandbox metadata is not found. It also tries to get full sandbox id and // retry if the sandbox metadata is not found with the initial id. diff --git a/pkg/server/testing/fake_execution_client.go b/pkg/server/testing/fake_execution_client.go index 6a00127c4..b6ef3fd23 100644 --- a/pkg/server/testing/fake_execution_client.go +++ b/pkg/server/testing/fake_execution_client.go @@ -31,7 +31,8 @@ import ( "google.golang.org/grpc/codes" ) -var containerNotExistError = grpc.Errorf(codes.Unknown, containerd.ErrContainerNotExist.Error()) +// ContainerNotExistError is the fake error returned when container does not exist. +var ContainerNotExistError = grpc.Errorf(codes.Unknown, containerd.ErrContainerNotExist.Error()) // CalledDetail is the struct contains called function name and arguments. type CalledDetail struct { @@ -229,7 +230,7 @@ func (f *FakeExecutionClient) Start(ctx context.Context, startOpts *execution.St } c, ok := f.ContainerList[startOpts.ID] if !ok { - return nil, containerNotExistError + return nil, ContainerNotExistError } f.sendEvent(&container.Event{ ID: c.ID, @@ -260,7 +261,7 @@ func (f *FakeExecutionClient) Delete(ctx context.Context, deleteOpts *execution. } c, ok := f.ContainerList[deleteOpts.ID] if !ok { - return nil, containerNotExistError + return nil, ContainerNotExistError } delete(f.ContainerList, deleteOpts.ID) f.sendEvent(&container.Event{ @@ -281,7 +282,7 @@ func (f *FakeExecutionClient) Info(ctx context.Context, infoOpts *execution.Info } c, ok := f.ContainerList[infoOpts.ID] if !ok { - return nil, containerNotExistError + return nil, ContainerNotExistError } return &c, nil } @@ -315,7 +316,7 @@ func (f *FakeExecutionClient) Kill(ctx context.Context, killOpts *execution.Kill } c, ok := f.ContainerList[killOpts.ID] if !ok { - return nil, containerNotExistError + return nil, ContainerNotExistError } c.Status = container.Status_STOPPED f.ContainerList[killOpts.ID] = c