Merge pull request #8954 from fuweid/fix-shim-leak

Fu Wei 2023-08-17 08:16:20 +08:00 committed by GitHub
commit ba852faf41
7 changed files with 450 additions and 1 deletion

Vagrantfile

@@ -80,6 +80,7 @@ Vagrant.configure("2") do |config|
          libselinux-devel \
          lsof \
          make \
          strace \
          ${INSTALL_PACKAGES}
      SHELL
  end


@@ -0,0 +1,172 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package integration

import (
	"bufio"
	"context"
	"io"
	"net"
	"os"
	"strconv"
	"strings"
	"syscall"
	"testing"
	"time"

	apitask "github.com/containerd/containerd/api/runtime/task/v2"
	"github.com/containerd/containerd/integration/images"
	"github.com/containerd/containerd/namespaces"
	"github.com/containerd/containerd/runtime/v2/shim"
	"github.com/containerd/ttrpc"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	exec "golang.org/x/sys/execabs"
)

// TestIssue7496 is used to reproduce https://github.com/containerd/containerd/issues/7496
//
// NOTE: https://github.com/containerd/containerd/issues/8931 is the same issue.
func TestIssue7496(t *testing.T) {
	ctx := namespaces.WithNamespace(context.Background(), "k8s.io")

	t.Logf("Create a pod config and run sandbox container")
	sbConfig := PodSandboxConfig("sandbox", "issue7496")
	sbID, err := runtimeService.RunPodSandbox(sbConfig, *runtimeHandler)
	require.NoError(t, err)

	shimCli := connectToShim(ctx, t, sbID)

	delayInSec := 12
	t.Logf("[shim pid: %d]: Injecting %d seconds delay to umount2 syscall",
		shimPid(ctx, t, shimCli),
		delayInSec)

	doneCh := injectDelayToUmount2(ctx, t, shimCli, delayInSec /* CRI plugin uses 10 seconds to delete task */)

	t.Logf("Create a container config and run container in a pod")
	pauseImage := images.Get(images.Pause)
	EnsureImageExists(t, pauseImage)

	containerConfig := ContainerConfig("pausecontainer", pauseImage)
	cnID, err := runtimeService.CreateContainer(sbID, containerConfig, sbConfig)
	require.NoError(t, err)
	require.NoError(t, runtimeService.StartContainer(cnID))

	t.Logf("Start to StopPodSandbox and RemovePodSandbox")
	ctx, cancelFn := context.WithTimeout(ctx, 3*time.Minute)
	defer cancelFn()

	for {
		select {
		case <-ctx.Done():
			require.NoError(t, ctx.Err(), "The StopPodSandbox should be done in time")
		default:
		}

		err := runtimeService.StopPodSandbox(sbID)
		if err != nil {
			t.Logf("Failed to StopPodSandbox: %v", err)
			continue
		}

		err = runtimeService.RemovePodSandbox(sbID)
		if err == nil {
			break
		}
		t.Logf("Failed to RemovePodSandbox: %v", err)
		time.Sleep(1 * time.Second)
	}

	t.Logf("PodSandbox %s has been deleted and start to wait for strace exit", sbID)
	select {
	case <-time.After(15 * time.Second):
		resp, err := shimCli.Connect(ctx, &apitask.ConnectRequest{})
		assert.Error(t, err, "should fail to call shim connect API")

		t.Errorf("strace did not exit in time")

		t.Logf("Cleanup the shim (pid: %d)", resp.GetShimPid())
		syscall.Kill(int(resp.GetShimPid()), syscall.SIGKILL)
		<-doneCh
	case <-doneCh:
	}
}

// injectDelayToUmount2 uses strace(1) to inject a delay into the umount2
// syscall to simulate IO pressure, because umount2 might force the kernel to
// syncfs, for example, when unmounting an overlayfs rootfs that isn't mounted
// with the volatile option.
//
// REF: https://man7.org/linux/man-pages/man1/strace.1.html
func injectDelayToUmount2(ctx context.Context, t *testing.T, shimCli apitask.TaskService, delayInSec int) chan struct{} {
	pid := shimPid(ctx, t, shimCli)

	doneCh := make(chan struct{})

	cmd := exec.CommandContext(ctx, "strace",
		"-p", strconv.Itoa(int(pid)), "-f", // attach to all the threads
		"--detach-on=execve", // stop attaching to runc child processes
		"--trace=umount2",    // only trace umount2 syscall
		"-e", "inject=umount2:delay_enter="+strconv.Itoa(delayInSec)+"s",
	)
	cmd.SysProcAttr = &syscall.SysProcAttr{Pdeathsig: syscall.SIGKILL}

	pipeR, pipeW := io.Pipe()
	cmd.Stdout = pipeW
	cmd.Stderr = pipeW

	require.NoError(t, cmd.Start())

	// ensure that strace has attached to the shim
	readyCh := make(chan struct{})
	go func() {
		defer close(doneCh)

		bufReader := bufio.NewReader(pipeR)
		_, err := bufReader.Peek(1)
		assert.NoError(t, err, "failed to ensure that strace has attached to shim")

		close(readyCh)
		io.Copy(os.Stdout, bufReader)
		t.Logf("Strace has exited")
	}()

	go func() {
		defer pipeW.Close()
		assert.NoError(t, cmd.Wait(), "strace should exit with zero code")
	}()

	<-readyCh
	return doneCh
}

func connectToShim(ctx context.Context, t *testing.T, id string) apitask.TaskService {
	addr, err := shim.SocketAddress(ctx, containerdEndpoint, id)
	require.NoError(t, err)
	addr = strings.TrimPrefix(addr, "unix://")

	conn, err := net.Dial("unix", addr)
	require.NoError(t, err)

	client := ttrpc.NewClient(conn)
	return apitask.NewTaskClient(client)
}

func shimPid(ctx context.Context, t *testing.T, shimCli apitask.TaskService) uint32 {
	resp, err := shimCli.Connect(ctx, &apitask.ConnectRequest{})
	require.NoError(t, err)
	return resp.GetShimPid()
}
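
The test above detects a leaked shim through the shim's ttrpc Connect API. As a side note, a leak can also be spotted out of band by scanning the process table for a shim whose command line references the sandbox ID. The snippet below is an illustrative, standalone sketch, not part of this commit; the findShimProcess helper name is made up here.

package main

import (
	"bytes"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
)

// findShimProcess is a hypothetical helper: it walks /proc and returns the
// PIDs of processes whose command line mentions both a containerd shim
// binary and the given sandbox ID. An empty result suggests no shim was
// left behind.
func findShimProcess(sandboxID string) ([]int, error) {
	entries, err := os.ReadDir("/proc")
	if err != nil {
		return nil, err
	}
	var pids []int
	for _, e := range entries {
		pid, err := strconv.Atoi(e.Name())
		if err != nil {
			continue // not a process directory
		}
		data, err := os.ReadFile(filepath.Join("/proc", e.Name(), "cmdline"))
		if err != nil {
			continue // process may have exited in the meantime
		}
		// /proc/<pid>/cmdline separates arguments with NUL bytes.
		cmdline := bytes.ReplaceAll(data, []byte{0}, []byte{' '})
		if bytes.Contains(cmdline, []byte("containerd-shim")) &&
			bytes.Contains(cmdline, []byte(sandboxID)) {
			pids = append(pids, pid)
		}
	}
	return pids, nil
}

func main() {
	if len(os.Args) < 2 {
		fmt.Println("usage: leakcheck <sandbox-id>")
		return
	}
	pids, err := findShimProcess(os.Args[1])
	if err != nil {
		panic(err)
	}
	fmt.Println("possible leaked shim pids:", pids)
}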


@@ -0,0 +1,65 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package integration

import (
	"context"
	"testing"

	"github.com/stretchr/testify/require"

	apitask "github.com/containerd/containerd/api/runtime/task/v2"
	"github.com/containerd/containerd/namespaces"
)

// TestIssue7496_ShouldRetryShutdown is based on https://github.com/containerd/containerd/issues/7496.
//
// Assume that shim.Delete takes almost 10 seconds and returns successfully,
// and there is no container left in the shim. However, the context is close
// to being canceled, so the Shutdown call fails. If we ignore the Canceled
// error, the shim is leaked. In order to reproduce this, this case uses a
// failpoint to inject an error into the Shutdown API, and then checks whether
// the shim is leaked.
func TestIssue7496_ShouldRetryShutdown(t *testing.T) {
	// TODO: re-enable if we can retry Shutdown API.
	t.Skipf("Please re-enable me if we can retry Shutdown API")

	ctx := namespaces.WithNamespace(context.Background(), "k8s.io")

	t.Logf("Create a pod config with shutdown failpoint")
	sbConfig := PodSandboxConfig("sandbox", "issue7496_shouldretryshutdown")
	injectShimFailpoint(t, sbConfig, map[string]string{
		"Shutdown": "1*error(please retry)",
	})

	t.Logf("RunPodSandbox")
	sbID, err := runtimeService.RunPodSandbox(sbConfig, failpointRuntimeHandler)
	require.NoError(t, err)

	t.Logf("Connect to the shim %s", sbID)
	shimCli := connectToShim(ctx, t, sbID)

	t.Logf("Log shim %s's pid: %d", sbID, shimPid(ctx, t, shimCli))

	t.Logf("StopPodSandbox and RemovePodSandbox")
	require.NoError(t, runtimeService.StopPodSandbox(sbID))
	require.NoError(t, runtimeService.RemovePodSandbox(sbID))

	t.Logf("Check the shim connection")
	_, err = shimCli.Connect(ctx, &apitask.ConnectRequest{})
	require.Error(t, err, "should fail to call shim connect API")
}
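
The failpoint spec "1*error(please retry)" means the Shutdown handler fails once with the given message and then behaves normally on subsequent calls. As an illustration of those semantics only, the following is a simplified standalone sketch of a counter-based evaluator, not containerd's real failpoint package.

package main

import (
	"errors"
	"fmt"
	"sync"
)

// failpoint fails the next `count` evaluations with `msg`, then succeeds.
// It mimics the "N*error(msg)" term used in the test above in spirit, but is
// a toy illustration rather than containerd's failpoint engine.
type failpoint struct {
	mu    sync.Mutex
	count int
	msg   string
}

func (f *failpoint) Evaluate() error {
	f.mu.Lock()
	defer f.mu.Unlock()
	if f.count > 0 {
		f.count--
		return errors.New(f.msg)
	}
	return nil
}

func main() {
	// Equivalent in spirit to "Shutdown": "1*error(please retry)".
	fp := &failpoint{count: 1, msg: "please retry"}

	fmt.Println(fp.Evaluate()) // "please retry" -- the first Shutdown call fails
	fmt.Println(fp.Evaluate()) // <nil> -- a retry would succeed
}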


@@ -25,6 +25,7 @@ import (
	"github.com/containerd/containerd"
	eventtypes "github.com/containerd/containerd/api/events"
	apitasks "github.com/containerd/containerd/api/services/tasks/v1"
	containerdio "github.com/containerd/containerd/cio"
	"github.com/containerd/containerd/errdefs"
	"github.com/containerd/containerd/events"
@@ -404,6 +405,51 @@ func handleContainerExit(ctx context.Context, e *eventtypes.TaskExit, cntr conta
			// Move on to make sure container status is updated.
		}
	}
	// NOTE: Both the sb.Container.Task and task.Delete interfaces always
	// report the status of the target task. However, they may return
	// ErrNotFound, and that does not mean the shim instance no longer
	// exists.
	//
	// There are two caches for a task in containerd:
	//
	//   1. io.containerd.service.v1.tasks-service
	//   2. io.containerd.runtime.v2.task
	//
	// The first one maintains the shim connection and shuts down the shim
	// in its Delete API. The second one maintains the lifecycle of the
	// task in the shim server.
	//
	// So, if the shim instance is still running but the task has already
	// been deleted in the shim server, sb.Container.Task and task.Delete
	// will receive ErrNotFound. If we don't delete the shim instance from
	// io.containerd.service.v1.tasks-service, the shim is leaked.
	//
	// Based on issue containerd/containerd#7496: when the host is under IO
	// pressure, the umount2 syscall can take more than 10 seconds, so the
	// CRI plugin cancels this task.Delete call. However, the shim server
	// isn't aware of the cancellation. After returning from the umount2
	// syscall, the shim server continues deleting the task record. The CRI
	// plugin then retries the deletion, receives ErrNotFound and marks the
	// task as stopped. As a result, the shim is leaked.
	//
	// It's hard to handle connection loss or request cancellation in the
	// shim server. We should call the Delete API of
	// io.containerd.service.v1.tasks-service to ensure that the shim
	// instance is shut down.
	//
	// REF:
	// 1. https://github.com/containerd/containerd/issues/7496#issuecomment-1671100968
	// 2. https://github.com/containerd/containerd/issues/8931
	if errdefs.IsNotFound(err) {
		_, err = c.client.TaskService().Delete(ctx, &apitasks.DeleteTaskRequest{ContainerID: cntr.Container.ID()})
		if err != nil {
			err = errdefs.FromGRPC(err)
			if !errdefs.IsNotFound(err) {
				return fmt.Errorf("failed to cleanup container %s in task-service: %w", cntr.Container.ID(), err)
			}
		}
		log.L.Infof("Ensure that container %s in task-service has been cleaned up successfully", cntr.Container.ID())
	}
	err = cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
		if status.FinishedAt == 0 {
			status.Pid = 0
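
After the cleanup above, the tasks-service should no longer hold a record for the container. For illustration, a standalone client could confirm that as sketched below; this is a hedged sketch against the public containerd client API, not part of this commit, and the container ID is a placeholder.

package main

import (
	"context"
	"fmt"

	"github.com/containerd/containerd"
	apitasks "github.com/containerd/containerd/api/services/tasks/v1"
	"github.com/containerd/containerd/errdefs"
	"github.com/containerd/containerd/namespaces"
)

func main() {
	client, err := containerd.New("/run/containerd/containerd.sock")
	if err != nil {
		panic(err)
	}
	defer client.Close()

	ctx := namespaces.WithNamespace(context.Background(), "k8s.io")

	// Replace with the real container ID under test.
	containerID := "<container-id>"

	// Ask the tasks-service (io.containerd.service.v1.tasks-service) for the
	// task record. After a successful cleanup it should answer NotFound,
	// which suggests the shim instance has been shut down as well.
	_, err = client.TaskService().Get(ctx, &apitasks.GetRequest{ContainerID: containerID})
	if errdefs.IsNotFound(errdefs.FromGRPC(err)) {
		fmt.Println("task record is gone; no shim should be left behind")
		return
	}
	fmt.Println("task record still present (or another error):", err)
}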


@@ -21,6 +21,7 @@ import (
	"fmt"
	"github.com/containerd/containerd"
	apitasks "github.com/containerd/containerd/api/services/tasks/v1"
	"github.com/containerd/containerd/errdefs"
	"github.com/containerd/containerd/log"
)
@@ -49,6 +50,10 @@ func (c *Controller) Shutdown(ctx context.Context, sandboxID string) error {
	// Delete sandbox container.
	if sandbox.Container != nil {
		if err := c.cleanupSandboxTask(ctx, sandbox.Container); err != nil {
			return fmt.Errorf("failed to delete sandbox task %q: %w", sandboxID, err)
		}
		if err := sandbox.Container.Delete(ctx, containerd.WithSnapshotCleanup); err != nil {
			if !errdefs.IsNotFound(err) {
				return fmt.Errorf("failed to delete sandbox container %q: %w", sandboxID, err)
@@ -59,3 +64,63 @@ func (c *Controller) Shutdown(ctx context.Context, sandboxID string) error {
	return nil
}

func (c *Controller) cleanupSandboxTask(ctx context.Context, sbCntr containerd.Container) error {
	task, err := sbCntr.Task(ctx, nil)
	if err != nil {
		if !errdefs.IsNotFound(err) {
			return fmt.Errorf("failed to load task for sandbox: %w", err)
		}
	} else {
		if _, err = task.Delete(ctx, containerd.WithProcessKill); err != nil {
			if !errdefs.IsNotFound(err) {
				return fmt.Errorf("failed to stop sandbox: %w", err)
			}
		}
	}
	// NOTE: Both the sb.Container.Task and task.Delete interfaces always
	// report the status of the target task. However, they may return
	// ErrNotFound, and that does not mean the shim instance no longer
	// exists.
	//
	// There are two caches for a task in containerd:
	//
	//   1. io.containerd.service.v1.tasks-service
	//   2. io.containerd.runtime.v2.task
	//
	// The first one maintains the shim connection and shuts down the shim
	// in its Delete API. The second one maintains the lifecycle of the
	// task in the shim server.
	//
	// So, if the shim instance is still running but the task has already
	// been deleted in the shim server, sb.Container.Task and task.Delete
	// will receive ErrNotFound. If we don't delete the shim instance from
	// io.containerd.service.v1.tasks-service, the shim is leaked.
	//
	// Based on issue containerd/containerd#7496: when the host is under IO
	// pressure, the umount2 syscall can take more than 10 seconds, so the
	// CRI plugin cancels this task.Delete call. However, the shim server
	// isn't aware of the cancellation. After returning from the umount2
	// syscall, the shim server continues deleting the task record. The CRI
	// plugin then retries the deletion, receives ErrNotFound and marks the
	// task as stopped. As a result, the shim is leaked.
	//
	// It's hard to handle connection loss or request cancellation in the
	// shim server. We should call the Delete API of
	// io.containerd.service.v1.tasks-service to ensure that the shim
	// instance is shut down.
	//
	// REF:
	// 1. https://github.com/containerd/containerd/issues/7496#issuecomment-1671100968
	// 2. https://github.com/containerd/containerd/issues/8931
	if errdefs.IsNotFound(err) {
		_, err = c.client.TaskService().Delete(ctx, &apitasks.DeleteTaskRequest{ContainerID: sbCntr.ID()})
		if err != nil {
			err = errdefs.FromGRPC(err)
			if !errdefs.IsNotFound(err) {
				return fmt.Errorf("failed to cleanup sandbox %s in task-service: %w", sbCntr.ID(), err)
			}
		}
		log.G(ctx).Infof("Ensure that sandbox %s in task-service has been cleaned up successfully", sbCntr.ID())
	}
	return nil
}


@@ -25,6 +25,7 @@ import (
	"github.com/containerd/containerd"
	eventtypes "github.com/containerd/containerd/api/events"
	apitasks "github.com/containerd/containerd/api/services/tasks/v1"
	containerdio "github.com/containerd/containerd/cio"
	"github.com/containerd/containerd/errdefs"
	"github.com/containerd/containerd/events"
@@ -393,6 +394,51 @@ func handleContainerExit(ctx context.Context, e *eventtypes.TaskExit, cntr conta
			// Move on to make sure container status is updated.
		}
	}
	// NOTE: Both the sb.Container.Task and task.Delete interfaces always
	// report the status of the target task. However, they may return
	// ErrNotFound, and that does not mean the shim instance no longer
	// exists.
	//
	// There are two caches for a task in containerd:
	//
	//   1. io.containerd.service.v1.tasks-service
	//   2. io.containerd.runtime.v2.task
	//
	// The first one maintains the shim connection and shuts down the shim
	// in its Delete API. The second one maintains the lifecycle of the
	// task in the shim server.
	//
	// So, if the shim instance is still running but the task has already
	// been deleted in the shim server, sb.Container.Task and task.Delete
	// will receive ErrNotFound. If we don't delete the shim instance from
	// io.containerd.service.v1.tasks-service, the shim is leaked.
	//
	// Based on issue containerd/containerd#7496: when the host is under IO
	// pressure, the umount2 syscall can take more than 10 seconds, so the
	// CRI plugin cancels this task.Delete call. However, the shim server
	// isn't aware of the cancellation. After returning from the umount2
	// syscall, the shim server continues deleting the task record. The CRI
	// plugin then retries the deletion, receives ErrNotFound and marks the
	// task as stopped. As a result, the shim is leaked.
	//
	// It's hard to handle connection loss or request cancellation in the
	// shim server. We should call the Delete API of
	// io.containerd.service.v1.tasks-service to ensure that the shim
	// instance is shut down.
	//
	// REF:
	// 1. https://github.com/containerd/containerd/issues/7496#issuecomment-1671100968
	// 2. https://github.com/containerd/containerd/issues/8931
	if errdefs.IsNotFound(err) {
		_, err = c.client.TaskService().Delete(ctx, &apitasks.DeleteTaskRequest{ContainerID: cntr.Container.ID()})
		if err != nil {
			err = errdefs.FromGRPC(err)
			if !errdefs.IsNotFound(err) {
				return fmt.Errorf("failed to cleanup container %s in task-service: %w", cntr.Container.ID(), err)
			}
		}
		logrus.Infof("Ensure that container %s in task-service has been cleaned up successfully", cntr.Container.ID())
	}
	err = cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
		if status.FinishedAt == 0 {
			status.Pid = 0
@@ -434,6 +480,50 @@ func handleSandboxExit(ctx context.Context, e *eventtypes.TaskExit, sb sandboxst
			// Move on to make sure container status is updated.
		}
	}
	// NOTE: Both the sb.Container.Task and task.Delete interfaces always
	// report the status of the target task. However, they may return
	// ErrNotFound, and that does not mean the shim instance no longer
	// exists.
	//
	// There are two caches for a task in containerd:
	//
	//   1. io.containerd.service.v1.tasks-service
	//   2. io.containerd.runtime.v2.task
	//
	// The first one maintains the shim connection and shuts down the shim
	// in its Delete API. The second one maintains the lifecycle of the
	// task in the shim server.
	//
	// So, if the shim instance is still running but the task has already
	// been deleted in the shim server, sb.Container.Task and task.Delete
	// will receive ErrNotFound. If we don't delete the shim instance from
	// io.containerd.service.v1.tasks-service, the shim is leaked.
	//
	// Based on issue containerd/containerd#7496: when the host is under IO
	// pressure, the umount2 syscall can take more than 10 seconds, so the
	// CRI plugin cancels this task.Delete call. However, the shim server
	// isn't aware of the cancellation. After returning from the umount2
	// syscall, the shim server continues deleting the task record. The CRI
	// plugin then retries the deletion, receives ErrNotFound and marks the
	// task as stopped. As a result, the shim is leaked.
	//
	// It's hard to handle connection loss or request cancellation in the
	// shim server. We should call the Delete API of
	// io.containerd.service.v1.tasks-service to ensure that the shim
	// instance is shut down.
	//
	// REF:
	// 1. https://github.com/containerd/containerd/issues/7496#issuecomment-1671100968
	// 2. https://github.com/containerd/containerd/issues/8931
	if errdefs.IsNotFound(err) {
		_, err = c.client.TaskService().Delete(ctx, &apitasks.DeleteTaskRequest{ContainerID: sb.Container.ID()})
		if err != nil {
			err = errdefs.FromGRPC(err)
			if !errdefs.IsNotFound(err) {
				return fmt.Errorf("failed to cleanup sandbox %s in task-service: %w", sb.Container.ID(), err)
			}
		}
		logrus.Infof("Ensure that sandbox %s in task-service has been cleaned up successfully", sb.Container.ID())
	}
	err = sb.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) {
		status.State = sandboxstore.StateNotReady
		status.Pid = 0


@@ -458,6 +458,12 @@ func (s *shimTask) delete(ctx context.Context, sandboxed bool, removeTask func(c
	// If not, the shim has been delivered the exit and delete events.
	// So we should remove the record and prevent duplicate events from
	// ttrpc-callback-on-close.
	//
	// TODO: It's hard to guarantee that the event is unique and sent only
	// once. moby/moby should not rely on the assumption that there is only
	// one exit event; it should handle duplicate events.
	//
	// REF: https://github.com/containerd/containerd/issues/4769
	if shimErr == nil {
		removeTask(ctx, s.ID())
	}
@@ -466,7 +472,11 @@ func (s *shimTask) delete(ctx context.Context, sandboxed bool, removeTask func(c
	// Let controller decide when to shutdown.
	if !sandboxed {
		if err := s.waitShutdown(ctx); err != nil {
-			log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to shutdown shim task")
+			// FIXME(fuweid):
+			//
+			// If the error is context canceled, should we use context.TODO()
+			// to wait for it?
+			log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to shutdown shim task and the shim might be leaked")
		}
	}
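
The FIXME above asks whether a canceled context should be replaced before waiting for shim shutdown. One possible shape of such a retry is sketched below; this is an illustration of the idea only, not what the commit implements, and the shutdownWithDetach/waitShutdown-style names are stand-ins.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// shutdownWithDetach retries a shutdown function once with a context that is
// detached from the caller's cancellation, so that a canceled request does
// not leave the shim running. Sketch only; containerd's shimTask does not
// currently do this.
func shutdownWithDetach(ctx context.Context, shutdown func(context.Context) error) error {
	err := shutdown(ctx)
	if err == nil || !errors.Is(err, context.Canceled) {
		return err
	}
	// The caller gave up, but the shim still needs to be shut down. Retry
	// with a bounded, detached context (context.WithoutCancel needs Go 1.21+).
	detached, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
	defer cancel()
	return shutdown(detached)
}

func main() {
	// Simulate a caller whose context is already canceled.
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	err := shutdownWithDetach(ctx, func(c context.Context) error {
		// Stand-in for waiting on the shim's Shutdown API: fail if the
		// context is already done, otherwise report success.
		if cerr := c.Err(); cerr != nil {
			return cerr
		}
		fmt.Println("shim shutdown completed")
		return nil
	})
	fmt.Println("final error:", err)
}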