From 4eaaee380fbb76ee1a1b973a82b3955610b29472 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Mon, 30 Oct 2017 04:28:53 +0000 Subject: [PATCH] Fix removing state recover. Signed-off-by: Lantao Liu --- pkg/server/container_remove.go | 22 +++++++++++----------- pkg/server/container_start.go | 2 +- pkg/server/events.go | 4 ++-- pkg/store/container/fake_status.go | 4 ++++ pkg/store/container/status.go | 22 +++++++++++++++++----- pkg/store/container/status_test.go | 29 ++++++++++++++++++++++++++++- 6 files changed, 63 insertions(+), 20 deletions(-) diff --git a/pkg/server/container_remove.go b/pkg/server/container_remove.go index 7caa54bda..c1c09901b 100644 --- a/pkg/server/container_remove.go +++ b/pkg/server/container_remove.go @@ -31,6 +31,7 @@ import ( ) // RemoveContainer removes the container. +// TODO(random-liu): Forcibly stop container if it's running. func (c *criContainerdService) RemoveContainer(ctx context.Context, r *runtime.RemoveContainerRequest) (_ *runtime.RemoveContainerResponse, retErr error) { container, err := c.containerStore.Get(r.GetContainerId()) if err != nil { @@ -52,7 +53,6 @@ func (c *criContainerdService) RemoveContainer(ctx context.Context, r *runtime.R if retErr != nil { // Reset removing if remove failed. if err := resetContainerRemoving(container); err != nil { - // TODO(random-liu): Do not checkpoint `Removing` state. glog.Errorf("failed to reset removing state for container %q: %v", id, err) } } @@ -63,10 +63,12 @@ func (c *criContainerdService) RemoveContainer(ctx context.Context, r *runtime.R // kubelet implementation, we'll never start a container once we decide to remove it, // so we don't need the "Dead" state for now. - containerRootDir := getContainerRootDir(c.config.RootDir, id) - if err := system.EnsureRemoveAll(containerRootDir); err != nil { - return nil, fmt.Errorf("failed to remove container root directory %q: %v", - containerRootDir, err) + // Delete containerd container. + if err := container.Container.Delete(ctx, containerd.WithSnapshotCleanup); err != nil { + if !errdefs.IsNotFound(err) { + return nil, fmt.Errorf("failed to delete containerd container %q: %v", id, err) + } + glog.V(5).Infof("Remove called for containerd container %q that does not exist", id, err) } // Delete container checkpoint. @@ -74,12 +76,10 @@ func (c *criContainerdService) RemoveContainer(ctx context.Context, r *runtime.R return nil, fmt.Errorf("failed to delete container checkpoint for %q: %v", id, err) } - // Delete containerd container. - if err := container.Container.Delete(ctx, containerd.WithSnapshotCleanup); err != nil { - if !errdefs.IsNotFound(err) { - return nil, fmt.Errorf("failed to delete containerd container %q: %v", id, err) - } - glog.V(5).Infof("Remove called for containerd container %q that does not exist", id, err) + containerRootDir := getContainerRootDir(c.config.RootDir, id) + if err := system.EnsureRemoveAll(containerRootDir); err != nil { + return nil, fmt.Errorf("failed to remove container root directory %q: %v", + containerRootDir, err) } c.containerStore.Delete(id) diff --git a/pkg/server/container_start.go b/pkg/server/container_start.go index e2fb5b871..9febe006f 100644 --- a/pkg/server/container_start.go +++ b/pkg/server/container_start.go @@ -39,7 +39,7 @@ func (c *criContainerdService) StartContainer(ctx context.Context, r *runtime.St var startErr error // update container status in one transaction to avoid race with event monitor. - if err := container.Status.Update(func(status containerstore.Status) (containerstore.Status, error) { + if err := container.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) { // Always apply status change no matter startContainer fails or not. Because startContainer // may change container state no matter it fails or succeeds. startErr = c.startContainer(ctx, container, &status) diff --git a/pkg/server/events.go b/pkg/server/events.go index f38c0e2f9..b361d4ac6 100644 --- a/pkg/server/events.go +++ b/pkg/server/events.go @@ -125,7 +125,7 @@ func (em *eventMonitor) handleEvent(evt *events.Envelope) { // Move on to make sure container status is updated. } } - err = cntr.Status.Update(func(status containerstore.Status) (containerstore.Status, error) { + err = cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) { // If FinishedAt has been set (e.g. with start failure), keep as // it is. if status.FinishedAt != 0 { @@ -151,7 +151,7 @@ func (em *eventMonitor) handleEvent(evt *events.Envelope) { } glog.Errorf("Failed to get container %q: %v", e.ContainerID, err) } - err = cntr.Status.Update(func(status containerstore.Status) (containerstore.Status, error) { + err = cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) { status.Reason = oomExitReason return status, nil }) diff --git a/pkg/store/container/fake_status.go b/pkg/store/container/fake_status.go index 2f0f5fbff..7cc7c83e8 100644 --- a/pkg/store/container/fake_status.go +++ b/pkg/store/container/fake_status.go @@ -38,6 +38,10 @@ func (f *fakeStatusStorage) Get() Status { return f.status } +func (f *fakeStatusStorage) UpdateSync(u UpdateFunc) error { + return f.Update(u) +} + func (f *fakeStatusStorage) Update(u UpdateFunc) error { f.Lock() defer f.Unlock() diff --git a/pkg/store/container/status.go b/pkg/store/container/status.go index a8390c81c..0e6a21baa 100644 --- a/pkg/store/container/status.go +++ b/pkg/store/container/status.go @@ -106,11 +106,11 @@ type UpdateFunc func(Status) (Status, error) type StatusStorage interface { // Get a container status. Get() Status + // UpdateSync updates the container status and the on disk checkpoint. + // Note that the update MUST be applied in one transaction. + UpdateSync(UpdateFunc) error // Update the container status. Note that the update MUST be applied // in one transaction. - // TODO(random-liu): Distinguish `UpdateSync` and `Update`, only - // `UpdateSync` should sync data onto disk, so that disk operation - // for non-critical status change could be avoided. Update(UpdateFunc) error // Delete the container status. // Note: @@ -167,8 +167,8 @@ func (s *statusStorage) Get() Status { return s.status } -// Update the container status. -func (s *statusStorage) Update(u UpdateFunc) error { +// UpdateSync updates the container status and the on disk checkpoint. +func (s *statusStorage) UpdateSync(u UpdateFunc) error { s.Lock() defer s.Unlock() newStatus, err := u(s.status) @@ -186,6 +186,18 @@ func (s *statusStorage) Update(u UpdateFunc) error { return nil } +// Update the container status. +func (s *statusStorage) Update(u UpdateFunc) error { + s.Lock() + defer s.Unlock() + newStatus, err := u(s.status) + if err != nil { + return err + } + s.status = newStatus + return nil +} + // Delete deletes the container status from disk atomically. func (s *statusStorage) Delete() error { temp := filepath.Dir(s.path) + ".del-" + filepath.Base(s.path) diff --git a/pkg/store/container/status_test.go b/pkg/store/container/status_test.go index 5e8560b4e..8be47fb31 100644 --- a/pkg/store/container/status_test.go +++ b/pkg/store/container/status_test.go @@ -132,7 +132,7 @@ func TestStatus(t *testing.T) { require.NoError(err) assert.Equal(testStatus, loaded) - t.Logf("successful update should take effect") + t.Logf("successful update should take effect but not checkpoint") err = s.Update(func(o Status) (Status, error) { o = updateStatus return o, nil @@ -141,6 +141,33 @@ func TestStatus(t *testing.T) { assert.Equal(updateStatus, s.Get()) loaded, err = LoadStatus(tempDir, testID) require.NoError(err) + assert.Equal(testStatus, loaded) + // Recover status. + assert.NoError(s.Update(func(o Status) (Status, error) { + o = testStatus + return o, nil + })) + + t.Logf("failed update sync should not take effect") + err = s.UpdateSync(func(o Status) (Status, error) { + o = updateStatus + return o, updateErr + }) + assert.Equal(updateErr, err) + assert.Equal(testStatus, s.Get()) + loaded, err = LoadStatus(tempDir, testID) + require.NoError(err) + assert.Equal(testStatus, loaded) + + t.Logf("successful update sync should take effect and checkpoint") + err = s.UpdateSync(func(o Status) (Status, error) { + o = updateStatus + return o, nil + }) + assert.NoError(err) + assert.Equal(updateStatus, s.Get()) + loaded, err = LoadStatus(tempDir, testID) + require.NoError(err) assert.Equal(updateStatus, loaded) t.Logf("successful update should not affect existing snapshot")