Fix Delete race.

Signed-off-by: Lantao Liu <lantaol@google.com>
This commit is contained in:
Lantao Liu 2017-06-12 23:54:39 +00:00
parent 7050011faa
commit bd09d31777
4 changed files with 114 additions and 70 deletions

View File

@ -71,10 +71,10 @@ func (c *criContainerdService) StopContainer(ctx context.Context, r *runtime.Sto
glog.V(2).Infof("Stop container %q with signal %v", id, stopSignal) glog.V(2).Infof("Stop container %q with signal %v", id, stopSignal)
_, err = c.containerService.Kill(ctx, &execution.KillRequest{ID: id, Signal: uint32(stopSignal)}) _, err = c.containerService.Kill(ctx, &execution.KillRequest{ID: id, Signal: uint32(stopSignal)})
if err != nil { if err != nil {
if isContainerdContainerNotExistError(err) { if !isContainerdContainerNotExistError(err) && !isRuncProcessAlreadyFinishedError(err) {
return &runtime.StopContainerResponse{}, nil return nil, fmt.Errorf("failed to stop container %q: %v", id, err)
} }
return nil, fmt.Errorf("failed to stop container %q: %v", id, err) // Move on to make sure container status is updated.
} }
err = c.waitContainerStop(ctx, id, time.Duration(r.GetTimeout())*time.Second) err = c.waitContainerStop(ctx, id, time.Duration(r.GetTimeout())*time.Second)
@ -84,20 +84,19 @@ func (c *criContainerdService) StopContainer(ctx context.Context, r *runtime.Sto
glog.Errorf("Stop container %q timed out: %v", id, err) glog.Errorf("Stop container %q timed out: %v", id, err)
} }
glog.V(2).Infof("Delete container from containerd %q", id) // Event handler will Delete the container from containerd after it handles the Exited event.
// Delete sends SIGKILL to the container in the containerd version we are using. glog.V(2).Infof("Kill container %q", id)
// TODO(random-liu): Replace with `Kill` to avoid race soon. _, err = c.containerService.Kill(ctx, &execution.KillRequest{ID: id, Signal: uint32(unix.SIGKILL)})
_, err = c.containerService.Delete(ctx, &execution.DeleteRequest{ID: id})
if err != nil { if err != nil {
if isContainerdContainerNotExistError(err) { if !isContainerdContainerNotExistError(err) && !isRuncProcessAlreadyFinishedError(err) {
return &runtime.StopContainerResponse{}, nil return nil, fmt.Errorf("failed to kill container %q: %v", id, err)
} }
return nil, fmt.Errorf("failed to delete container %q: %v", id, err) // Move on to make sure container status is updated.
} }
// Wait forever until container stop is observed by event monitor. // Wait for a fixed timeout until container stop is observed by event monitor.
if err := c.waitContainerStop(ctx, id, killContainerTimeout); err != nil { if err := c.waitContainerStop(ctx, id, killContainerTimeout); err != nil {
return nil, fmt.Errorf("error occurs during waiting for container %q to stop: %v", return nil, fmt.Errorf("an error occurs during waiting for container %q to stop: %v",
id, err) id, err)
} }
return &runtime.StopContainerResponse{}, nil return &runtime.StopContainerResponse{}, nil

View File

@ -25,6 +25,7 @@ import (
"github.com/containerd/containerd/api/types/container" "github.com/containerd/containerd/api/types/container"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"golang.org/x/net/context" "golang.org/x/net/context"
"golang.org/x/sys/unix"
runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1" runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1"
"github.com/kubernetes-incubator/cri-containerd/pkg/metadata" "github.com/kubernetes-incubator/cri-containerd/pkg/metadata"
@ -106,17 +107,15 @@ func TestStopContainer(t *testing.T) {
for desc, test := range map[string]struct { for desc, test := range map[string]struct {
metadata *metadata.ContainerMetadata metadata *metadata.ContainerMetadata
containerdContainer *container.Container containerdContainer *container.Container
killErr error stopErr error
deleteErr error
discardEvents int
noTimeout bool noTimeout bool
expectErr bool expectErr bool
expectCalls []string expectCalls []servertesting.CalledDetail
}{ }{
"should return error when container does not exist": { "should return error when container does not exist": {
metadata: nil, metadata: nil,
expectErr: true, expectErr: true,
expectCalls: []string{}, expectCalls: []servertesting.CalledDetail{},
}, },
"should not return error when container is not running": { "should not return error when container is not running": {
metadata: &metadata.ContainerMetadata{ metadata: &metadata.ContainerMetadata{
@ -124,52 +123,99 @@ func TestStopContainer(t *testing.T) {
CreatedAt: time.Now().UnixNano(), CreatedAt: time.Now().UnixNano(),
}, },
expectErr: false, expectErr: false,
expectCalls: []string{}, expectCalls: []servertesting.CalledDetail{},
}, },
"should not return error if containerd container does not exist": { "should not return error if containerd container does not exist": {
metadata: &testMetadata, metadata: &testMetadata,
expectErr: false, containerdContainer: &testContainer,
expectCalls: []string{"kill"}, // Since it's hard to inject an event while `StopContainer` is running,
// we only test the case where the first stop returns an error but the
// container status is not updated yet.
// We also leverage this behavior to test that when graceful
// stop doesn't take effect, the container should be SIGKILL-ed.
stopErr: servertesting.ContainerNotExistError,
expectErr: false,
expectCalls: []servertesting.CalledDetail{
{
Name: "kill",
Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGTERM)},
},
{
Name: "kill",
Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGKILL)},
},
{
Name: "delete",
Argument: &execution.DeleteRequest{ID: testID},
},
},
}, },
"should not return error if containerd container is killed": { "should not return error if containerd container process already finished": {
metadata: &testMetadata,
containerdContainer: &testContainer,
stopErr: errors.New("os: process already finished"),
expectErr: false,
expectCalls: []servertesting.CalledDetail{
{
Name: "kill",
Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGTERM)},
},
{
Name: "kill",
Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGKILL)},
},
{
Name: "delete",
Argument: &execution.DeleteRequest{ID: testID},
},
},
},
"should return error if graceful stop returns random error": {
metadata: &testMetadata,
containerdContainer: &testContainer,
stopErr: errors.New("random stop error"),
expectErr: true,
expectCalls: []servertesting.CalledDetail{
{
Name: "kill",
Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGTERM)},
},
},
},
"should not return error if containerd container is gracefully stopped": {
metadata: &testMetadata, metadata: &testMetadata,
containerdContainer: &testContainer, containerdContainer: &testContainer,
expectErr: false, expectErr: false,
// deleted by the event monitor. // deleted by the event monitor.
expectCalls: []string{"kill", "delete"}, expectCalls: []servertesting.CalledDetail{
}, {
"should not return error if containerd container is deleted": { Name: "kill",
metadata: &testMetadata, Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGTERM)},
containerdContainer: &testContainer, },
// discard killed events to force a delete. This is only {
// for testing. Actually real containerd should only generate Name: "delete",
// one EXIT event. Argument: &execution.DeleteRequest{ID: testID},
discardEvents: 1, },
expectErr: false, },
// one more delete from the event monitor.
expectCalls: []string{"kill", "delete", "delete"},
},
"should return error if kill failed": {
metadata: &testMetadata,
containerdContainer: &testContainer,
killErr: errors.New("random error"),
expectErr: true,
expectCalls: []string{"kill"},
}, },
"should directly kill container if timeout is 0": { "should directly kill container if timeout is 0": {
metadata: &testMetadata, metadata: &testMetadata,
containerdContainer: &testContainer, containerdContainer: &testContainer,
noTimeout: true, noTimeout: true,
expectCalls: []string{"delete", "delete"}, expectErr: false,
}, expectCalls: []servertesting.CalledDetail{
"should return error if delete failed": { {
metadata: &testMetadata, Name: "kill",
containerdContainer: &testContainer, Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGKILL)},
deleteErr: errors.New("random error"), },
discardEvents: 1, {
expectErr: true, Name: "delete",
expectCalls: []string{"kill", "delete"}, Argument: &execution.DeleteRequest{ID: testID},
},
},
}, },
// TODO(random-liu): Test "should return error if both failed" after we have
// fake clock for test.
} { } {
t.Logf("TestCase %q", desc) t.Logf("TestCase %q", desc)
c := newTestCRIContainerdService() c := newTestCRIContainerdService()
@ -185,28 +231,19 @@ func TestStopContainer(t *testing.T) {
if test.containerdContainer != nil { if test.containerdContainer != nil {
fake.SetFakeContainers([]container.Container{*test.containerdContainer}) fake.SetFakeContainers([]container.Container{*test.containerdContainer})
} }
if test.killErr != nil { if test.stopErr != nil {
fake.InjectError("kill", test.killErr) fake.InjectError("kill", test.stopErr)
}
if test.deleteErr != nil {
fake.InjectError("delete", test.deleteErr)
} }
eventClient, err := fake.Events(context.Background(), &execution.EventsRequest{}) eventClient, err := fake.Events(context.Background(), &execution.EventsRequest{})
assert.NoError(t, err) assert.NoError(t, err)
// Start a simple test event monitor. // Start a simple test event monitor.
go func(e execution.ContainerService_EventsClient, discard int) { go func(e execution.ContainerService_EventsClient) {
for { for {
e, err := e.Recv() // nolint: vetshadow if err := c.handleEventStream(e); err != nil { // nolint: vetshadow
if err != nil {
return return
} }
if discard > 0 {
discard--
continue
}
c.handleEvent(e)
} }
}(eventClient, test.discardEvents) }(eventClient)
fake.ClearCalls() fake.ClearCalls()
timeout := int64(1) timeout := int64(1)
if test.noTimeout { if test.noTimeout {
@ -225,6 +262,6 @@ func TestStopContainer(t *testing.T) {
assert.NoError(t, err) assert.NoError(t, err)
assert.NotNil(t, resp) assert.NotNil(t, resp)
} }
assert.Equal(t, test.expectCalls, fake.GetCalledNames()) assert.Equal(t, test.expectCalls, fake.GetCalledDetails())
} }
} }

View File

@ -226,6 +226,13 @@ func isContainerdContainerNotExistError(grpcError error) bool {
return grpc.ErrorDesc(grpcError) == containerd.ErrContainerNotExist.Error() return grpc.ErrorDesc(grpcError) == containerd.ErrContainerNotExist.Error()
} }
// isRuncProcessAlreadyFinishedError reports whether the description of the
// given grpc error contains the text "os: process already finished".
// The check is a substring match on the error description; see the TODO
// below about exposing this error through the containerd API instead.
// TODO(random-liu): Containerd should expose this error in api. (containerd#999)
func isRuncProcessAlreadyFinishedError(grpcError error) bool {
	const finishedMsg = "os: process already finished"
	desc := grpc.ErrorDesc(grpcError)
	return strings.Contains(desc, finishedMsg)
}
// getSandbox gets the sandbox metadata from the sandbox store. It returns nil without // getSandbox gets the sandbox metadata from the sandbox store. It returns nil without
// error if the sandbox metadata is not found. It also tries to get full sandbox id and // error if the sandbox metadata is not found. It also tries to get full sandbox id and
// retry if the sandbox metadata is not found with the initial id. // retry if the sandbox metadata is not found with the initial id.

View File

@ -31,7 +31,8 @@ import (
"google.golang.org/grpc/codes" "google.golang.org/grpc/codes"
) )
var containerNotExistError = grpc.Errorf(codes.Unknown, containerd.ErrContainerNotExist.Error()) // ContainerNotExistError is the fake error returned when container does not exist.
var ContainerNotExistError = grpc.Errorf(codes.Unknown, containerd.ErrContainerNotExist.Error())
// CalledDetail is the struct that contains the called function name and arguments. // CalledDetail is the struct that contains the called function name and arguments.
type CalledDetail struct { type CalledDetail struct {
@ -229,7 +230,7 @@ func (f *FakeExecutionClient) Start(ctx context.Context, startOpts *execution.St
} }
c, ok := f.ContainerList[startOpts.ID] c, ok := f.ContainerList[startOpts.ID]
if !ok { if !ok {
return nil, containerNotExistError return nil, ContainerNotExistError
} }
f.sendEvent(&container.Event{ f.sendEvent(&container.Event{
ID: c.ID, ID: c.ID,
@ -260,7 +261,7 @@ func (f *FakeExecutionClient) Delete(ctx context.Context, deleteOpts *execution.
} }
c, ok := f.ContainerList[deleteOpts.ID] c, ok := f.ContainerList[deleteOpts.ID]
if !ok { if !ok {
return nil, containerNotExistError return nil, ContainerNotExistError
} }
delete(f.ContainerList, deleteOpts.ID) delete(f.ContainerList, deleteOpts.ID)
f.sendEvent(&container.Event{ f.sendEvent(&container.Event{
@ -281,7 +282,7 @@ func (f *FakeExecutionClient) Info(ctx context.Context, infoOpts *execution.Info
} }
c, ok := f.ContainerList[infoOpts.ID] c, ok := f.ContainerList[infoOpts.ID]
if !ok { if !ok {
return nil, containerNotExistError return nil, ContainerNotExistError
} }
return &c, nil return &c, nil
} }
@ -315,7 +316,7 @@ func (f *FakeExecutionClient) Kill(ctx context.Context, killOpts *execution.Kill
} }
c, ok := f.ContainerList[killOpts.ID] c, ok := f.ContainerList[killOpts.ID]
if !ok { if !ok {
return nil, containerNotExistError return nil, ContainerNotExistError
} }
c.Status = container.Status_STOPPED c.Status = container.Status_STOPPED
f.ContainerList[killOpts.ID] = c f.ContainerList[killOpts.ID] = c