Merge pull request #73 from Random-Liu/fix-delete-race

Fix Delete race.
This commit is contained in:
Lantao Liu 2017-06-14 14:04:24 -07:00 committed by GitHub
commit d4f7380f59
4 changed files with 114 additions and 70 deletions

View File

@ -71,10 +71,10 @@ func (c *criContainerdService) StopContainer(ctx context.Context, r *runtime.Sto
glog.V(2).Infof("Stop container %q with signal %v", id, stopSignal)
_, err = c.containerService.Kill(ctx, &execution.KillRequest{ID: id, Signal: uint32(stopSignal)})
if err != nil {
if isContainerdContainerNotExistError(err) {
return &runtime.StopContainerResponse{}, nil
if !isContainerdContainerNotExistError(err) && !isRuncProcessAlreadyFinishedError(err) {
return nil, fmt.Errorf("failed to stop container %q: %v", id, err)
}
return nil, fmt.Errorf("failed to stop container %q: %v", id, err)
// Move on to make sure container status is updated.
}
err = c.waitContainerStop(ctx, id, time.Duration(r.GetTimeout())*time.Second)
@ -84,20 +84,19 @@ func (c *criContainerdService) StopContainer(ctx context.Context, r *runtime.Sto
glog.Errorf("Stop container %q timed out: %v", id, err)
}
glog.V(2).Infof("Delete container from containerd %q", id)
// Delete sends SIGKILL to the container in the containerd version we are using.
// TODO(random-liu): Replace with `Kill` to avoid race soon.
_, err = c.containerService.Delete(ctx, &execution.DeleteRequest{ID: id})
// Event handler will Delete the container from containerd after it handles the Exited event.
glog.V(2).Infof("Kill container %q", id)
_, err = c.containerService.Kill(ctx, &execution.KillRequest{ID: id, Signal: uint32(unix.SIGKILL)})
if err != nil {
if isContainerdContainerNotExistError(err) {
return &runtime.StopContainerResponse{}, nil
if !isContainerdContainerNotExistError(err) && !isRuncProcessAlreadyFinishedError(err) {
return nil, fmt.Errorf("failed to kill container %q: %v", id, err)
}
return nil, fmt.Errorf("failed to delete container %q: %v", id, err)
// Move on to make sure container status is updated.
}
// Wait forever until container stop is observed by event monitor.
// Wait for a fixed timeout until container stop is observed by event monitor.
if err := c.waitContainerStop(ctx, id, killContainerTimeout); err != nil {
return nil, fmt.Errorf("error occurs during waiting for container %q to stop: %v",
return nil, fmt.Errorf("an error occurs during waiting for container %q to stop: %v",
id, err)
}
return &runtime.StopContainerResponse{}, nil

View File

@ -25,6 +25,7 @@ import (
"github.com/containerd/containerd/api/types/container"
"github.com/stretchr/testify/assert"
"golang.org/x/net/context"
"golang.org/x/sys/unix"
runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1"
"github.com/kubernetes-incubator/cri-containerd/pkg/metadata"
@ -106,17 +107,15 @@ func TestStopContainer(t *testing.T) {
for desc, test := range map[string]struct {
metadata *metadata.ContainerMetadata
containerdContainer *container.Container
killErr error
deleteErr error
discardEvents int
stopErr error
noTimeout bool
expectErr bool
expectCalls []string
expectCalls []servertesting.CalledDetail
}{
"should return error when container does not exist": {
metadata: nil,
expectErr: true,
expectCalls: []string{},
expectCalls: []servertesting.CalledDetail{},
},
"should not return error when container is not running": {
metadata: &metadata.ContainerMetadata{
@ -124,52 +123,99 @@ func TestStopContainer(t *testing.T) {
CreatedAt: time.Now().UnixNano(),
},
expectErr: false,
expectCalls: []string{},
expectCalls: []servertesting.CalledDetail{},
},
"should not return error if containerd container does not exist": {
metadata: &testMetadata,
expectErr: false,
expectCalls: []string{"kill"},
metadata: &testMetadata,
containerdContainer: &testContainer,
// Since it's hard to inject event during `StopContainer` is running,
// we only test the case that first stop returns error, but container
// status is not updated yet.
// We also leverage this behavior to test that when graceful
// stop doesn't take effect, container should be SIGKILL-ed.
stopErr: servertesting.ContainerNotExistError,
expectErr: false,
expectCalls: []servertesting.CalledDetail{
{
Name: "kill",
Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGTERM)},
},
{
Name: "kill",
Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGKILL)},
},
{
Name: "delete",
Argument: &execution.DeleteRequest{ID: testID},
},
},
},
"should not return error if containerd container is killed": {
"should not return error if containerd container process already finished": {
metadata: &testMetadata,
containerdContainer: &testContainer,
stopErr: errors.New("os: process already finished"),
expectErr: false,
expectCalls: []servertesting.CalledDetail{
{
Name: "kill",
Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGTERM)},
},
{
Name: "kill",
Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGKILL)},
},
{
Name: "delete",
Argument: &execution.DeleteRequest{ID: testID},
},
},
},
"should return error if graceful stop returns random error": {
metadata: &testMetadata,
containerdContainer: &testContainer,
stopErr: errors.New("random stop error"),
expectErr: true,
expectCalls: []servertesting.CalledDetail{
{
Name: "kill",
Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGTERM)},
},
},
},
"should not return error if containerd container is gracefully stopped": {
metadata: &testMetadata,
containerdContainer: &testContainer,
expectErr: false,
// deleted by the event monitor.
expectCalls: []string{"kill", "delete"},
},
"should not return error if containerd container is deleted": {
metadata: &testMetadata,
containerdContainer: &testContainer,
// discard killed events to force a delete. This is only
// for testing. Actually real containerd should only generate
// one EXIT event.
discardEvents: 1,
expectErr: false,
// one more delete from the event monitor.
expectCalls: []string{"kill", "delete", "delete"},
},
"should return error if kill failed": {
metadata: &testMetadata,
containerdContainer: &testContainer,
killErr: errors.New("random error"),
expectErr: true,
expectCalls: []string{"kill"},
expectCalls: []servertesting.CalledDetail{
{
Name: "kill",
Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGTERM)},
},
{
Name: "delete",
Argument: &execution.DeleteRequest{ID: testID},
},
},
},
"should directly kill container if timeout is 0": {
metadata: &testMetadata,
containerdContainer: &testContainer,
noTimeout: true,
expectCalls: []string{"delete", "delete"},
},
"should return error if delete failed": {
metadata: &testMetadata,
containerdContainer: &testContainer,
deleteErr: errors.New("random error"),
discardEvents: 1,
expectErr: true,
expectCalls: []string{"kill", "delete"},
expectErr: false,
expectCalls: []servertesting.CalledDetail{
{
Name: "kill",
Argument: &execution.KillRequest{ID: testID, Signal: uint32(unix.SIGKILL)},
},
{
Name: "delete",
Argument: &execution.DeleteRequest{ID: testID},
},
},
},
// TODO(random-liu): Test "should return error if both failed" after we have
// fake clock for test.
} {
t.Logf("TestCase %q", desc)
c := newTestCRIContainerdService()
@ -185,28 +231,19 @@ func TestStopContainer(t *testing.T) {
if test.containerdContainer != nil {
fake.SetFakeContainers([]container.Container{*test.containerdContainer})
}
if test.killErr != nil {
fake.InjectError("kill", test.killErr)
}
if test.deleteErr != nil {
fake.InjectError("delete", test.deleteErr)
if test.stopErr != nil {
fake.InjectError("kill", test.stopErr)
}
eventClient, err := fake.Events(context.Background(), &execution.EventsRequest{})
assert.NoError(t, err)
// Start a simple test event monitor.
go func(e execution.ContainerService_EventsClient, discard int) {
go func(e execution.ContainerService_EventsClient) {
for {
e, err := e.Recv() // nolint: vetshadow
if err != nil {
if err := c.handleEventStream(e); err != nil { // nolint: vetshadow
return
}
if discard > 0 {
discard--
continue
}
c.handleEvent(e)
}
}(eventClient, test.discardEvents)
}(eventClient)
fake.ClearCalls()
timeout := int64(1)
if test.noTimeout {
@ -225,6 +262,6 @@ func TestStopContainer(t *testing.T) {
assert.NoError(t, err)
assert.NotNil(t, resp)
}
assert.Equal(t, test.expectCalls, fake.GetCalledNames())
assert.Equal(t, test.expectCalls, fake.GetCalledDetails())
}
}

View File

@ -226,6 +226,13 @@ func isContainerdContainerNotExistError(grpcError error) bool {
return grpc.ErrorDesc(grpcError) == containerd.ErrContainerNotExist.Error()
}
// isRuncProcessAlreadyFinishedError checks whether a grpc error is a process already
// finished error.
// TODO(random-liu): Containerd should expose this error in api. (containerd#999)
func isRuncProcessAlreadyFinishedError(grpcError error) bool {
return strings.Contains(grpc.ErrorDesc(grpcError), "os: process already finished")
}
// getSandbox gets the sandbox metadata from the sandbox store. It returns nil without
// error if the sandbox metadata is not found. It also tries to get full sandbox id and
// retry if the sandbox metadata is not found with the initial id.

View File

@ -31,7 +31,8 @@ import (
"google.golang.org/grpc/codes"
)
var containerNotExistError = grpc.Errorf(codes.Unknown, containerd.ErrContainerNotExist.Error())
// ContainerNotExistError is the fake error returned when container does not exist.
var ContainerNotExistError = grpc.Errorf(codes.Unknown, containerd.ErrContainerNotExist.Error())
// CalledDetail is the struct contains called function name and arguments.
type CalledDetail struct {
@ -230,7 +231,7 @@ func (f *FakeExecutionClient) Start(ctx context.Context, startOpts *execution.St
}
c, ok := f.ContainerList[startOpts.ID]
if !ok {
return nil, containerNotExistError
return nil, ContainerNotExistError
}
f.sendEvent(&container.Event{
ID: c.ID,
@ -261,7 +262,7 @@ func (f *FakeExecutionClient) Delete(ctx context.Context, deleteOpts *execution.
}
c, ok := f.ContainerList[deleteOpts.ID]
if !ok {
return nil, containerNotExistError
return nil, ContainerNotExistError
}
delete(f.ContainerList, deleteOpts.ID)
f.sendEvent(&container.Event{
@ -282,7 +283,7 @@ func (f *FakeExecutionClient) Info(ctx context.Context, infoOpts *execution.Info
}
c, ok := f.ContainerList[infoOpts.ID]
if !ok {
return nil, containerNotExistError
return nil, ContainerNotExistError
}
return &c, nil
}
@ -316,7 +317,7 @@ func (f *FakeExecutionClient) Kill(ctx context.Context, killOpts *execution.Kill
}
c, ok := f.ContainerList[killOpts.ID]
if !ok {
return nil, containerNotExistError
return nil, ContainerNotExistError
}
c.Status = container.Status_STOPPED
f.ContainerList[killOpts.ID] = c