Merge pull request #9104 from cyyzero/fix-deadlock
This commit is contained in:
commit
8db0d39c68
5
Makefile
5
Makefile
@ -236,6 +236,11 @@ bin/cni-bridge-fp: integration/failpoint/cmd/cni-bridge-fp FORCE
|
||||
@echo "$(WHALE) $@"
|
||||
@$(GO) build ${GO_BUILD_FLAGS} -o $@ ./integration/failpoint/cmd/cni-bridge-fp
|
||||
|
||||
# build runc-fp as runc wrapper to support failpoint, only used by integration test
|
||||
bin/runc-fp: integration/failpoint/cmd/runc-fp FORCE
|
||||
@echo "$(WHALE) $@"
|
||||
@$(GO) build ${GO_BUILD_FLAGS} -o $@ ./integration/failpoint/cmd/runc-fp
|
||||
|
||||
benchmark: ## run benchmarks tests
|
||||
@echo "$(WHALE) $@"
|
||||
@$(GO) test ${TESTFLAGS} -bench . -run Benchmark -test.root
|
||||
|
@ -41,7 +41,9 @@ import (
|
||||
"github.com/containerd/containerd/plugin"
|
||||
"github.com/containerd/containerd/runtime/v2/runc/options"
|
||||
"github.com/containerd/containerd/sys"
|
||||
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/stretchr/testify/require"
|
||||
exec "golang.org/x/sys/execabs"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
@ -1417,3 +1419,80 @@ func TestShimOOMScore(t *testing.T) {
|
||||
case <-statusC:
|
||||
}
|
||||
}
|
||||
|
||||
// TestIssue9103 is used as regression case for issue 9103.
|
||||
//
|
||||
// The runc-fp will kill the init process so that the shim should return stopped
|
||||
// status after container.NewTask. It's used to simulate that the runc-init
|
||||
// might be killed by oom-kill.
|
||||
func TestIssue9103(t *testing.T) {
|
||||
if os.Getenv("RUNC_FLAVOR") == "crun" {
|
||||
t.Skip("skip it when using crun")
|
||||
}
|
||||
|
||||
client, err := newClient(t, address)
|
||||
require.NoError(t, err)
|
||||
defer client.Close()
|
||||
|
||||
var (
|
||||
image Image
|
||||
ctx, cancel = testContext(t)
|
||||
id = t.Name()
|
||||
)
|
||||
defer cancel()
|
||||
|
||||
image, err = client.GetImage(ctx, testImage)
|
||||
require.NoError(t, err)
|
||||
|
||||
for idx, tc := range []struct {
|
||||
desc string
|
||||
cntrOpts []NewContainerOpts
|
||||
expectedStatus ProcessStatus
|
||||
}{
|
||||
{
|
||||
desc: "should be created status",
|
||||
cntrOpts: []NewContainerOpts{
|
||||
WithNewSpec(oci.WithImageConfig(image),
|
||||
withProcessArgs("sleep", "30"),
|
||||
),
|
||||
},
|
||||
expectedStatus: Created,
|
||||
},
|
||||
{
|
||||
desc: "should be stopped status if init has been killed",
|
||||
cntrOpts: []NewContainerOpts{
|
||||
WithNewSpec(oci.WithImageConfig(image),
|
||||
withProcessArgs("sleep", "30"),
|
||||
oci.WithAnnotations(map[string]string{
|
||||
"oci.runc.failpoint.profile": "issue9103",
|
||||
}),
|
||||
),
|
||||
WithRuntime(client.Runtime(), &options.Options{
|
||||
BinaryName: "runc-fp",
|
||||
}),
|
||||
},
|
||||
expectedStatus: Stopped,
|
||||
},
|
||||
} {
|
||||
tc := tc
|
||||
tName := fmt.Sprintf("%s%d", id, idx)
|
||||
t.Run(tc.desc, func(t *testing.T) {
|
||||
container, err := client.NewContainer(ctx, tName,
|
||||
append([]NewContainerOpts{WithNewSnapshot(tName, image)}, tc.cntrOpts...)...,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
defer container.Delete(ctx, WithSnapshotCleanup)
|
||||
|
||||
cctx, ccancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
task, err := container.NewTask(cctx, empty())
|
||||
ccancel()
|
||||
require.NoError(t, err)
|
||||
|
||||
defer task.Delete(ctx, WithProcessKill)
|
||||
|
||||
status, err := task.Status(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, status.Status, tc.expectedStatus)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
69
integration/failpoint/cmd/runc-fp/issue9103.go
Normal file
69
integration/failpoint/cmd/runc-fp/issue9103.go
Normal file
@ -0,0 +1,69 @@
|
||||
//go:build linux
|
||||
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// issue9103KillInitAfterCreate kills the runc.Init process after creating
|
||||
// command returns successfully.
|
||||
//
|
||||
// REF: https://github.com/containerd/containerd/issues/9103
|
||||
func issue9103KillInitAfterCreate(ctx context.Context, method invoker) error {
|
||||
isCreated := strings.Contains(strings.Join(os.Args, ","), ",create,")
|
||||
|
||||
if err := method(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if !isCreated {
|
||||
return nil
|
||||
}
|
||||
|
||||
initPidPath := "init.pid"
|
||||
data, err := os.ReadFile(initPidPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read %s: %w", initPidPath, err)
|
||||
}
|
||||
|
||||
pid, err := strconv.Atoi(string(data))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get init pid from string %s: %w", string(data), err)
|
||||
}
|
||||
|
||||
if pid <= 0 {
|
||||
return fmt.Errorf("unexpected init pid %v", pid)
|
||||
}
|
||||
|
||||
if err := syscall.Kill(pid, syscall.SIGKILL); err != nil {
|
||||
return fmt.Errorf("failed to kill the init pid %v: %w", pid, err)
|
||||
}
|
||||
|
||||
// Ensure that the containerd-shim has received the SIGCHLD and start
|
||||
// to cleanup
|
||||
time.Sleep(3 * time.Second)
|
||||
return nil
|
||||
}
|
108
integration/failpoint/cmd/runc-fp/main.go
Normal file
108
integration/failpoint/cmd/runc-fp/main.go
Normal file
@ -0,0 +1,108 @@
|
||||
//go:build linux
|
||||
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"syscall"
|
||||
|
||||
"github.com/containerd/containerd/oci"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
const (
|
||||
failpointProfileKey = "oci.runc.failpoint.profile"
|
||||
)
|
||||
|
||||
type invoker func(context.Context) error
|
||||
|
||||
type invokerInterceptor func(context.Context, invoker) error
|
||||
|
||||
var (
|
||||
failpointProfiles = map[string]invokerInterceptor{
|
||||
"issue9103": issue9103KillInitAfterCreate,
|
||||
}
|
||||
)
|
||||
|
||||
// setupLog setups messages into log file.
|
||||
func setupLog() {
|
||||
// containerd/go-runc always add --log option
|
||||
idx := 2
|
||||
for ; idx < len(os.Args); idx++ {
|
||||
if os.Args[idx] == "--log" {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if idx >= len(os.Args)-1 || os.Args[idx] != "--log" {
|
||||
panic("option --log required")
|
||||
}
|
||||
|
||||
logFile := os.Args[idx+1]
|
||||
f, err := os.OpenFile(logFile, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0o644)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("failed to open %s: %w", logFile, err))
|
||||
}
|
||||
|
||||
logrus.SetOutput(f)
|
||||
logrus.SetFormatter(new(logrus.JSONFormatter))
|
||||
}
|
||||
|
||||
func main() {
|
||||
setupLog()
|
||||
|
||||
fpProfile, err := failpointProfileFromOCIAnnotation()
|
||||
if err != nil {
|
||||
logrus.WithError(err).Fatal("failed to get failpoint profile")
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
if err := fpProfile(ctx, defaultRuncInvoker); err != nil {
|
||||
logrus.WithError(err).Fatal("failed to exec failpoint profile")
|
||||
}
|
||||
}
|
||||
|
||||
// defaultRuncInvoker is to call the runc command with same arguments.
|
||||
func defaultRuncInvoker(ctx context.Context) error {
|
||||
cmd := exec.CommandContext(ctx, "runc", os.Args[1:]...)
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Pdeathsig: syscall.SIGKILL}
|
||||
return cmd.Run()
|
||||
}
|
||||
|
||||
// failpointProfileFromOCIAnnotation gets the profile from OCI annotations.
|
||||
func failpointProfileFromOCIAnnotation() (invokerInterceptor, error) {
|
||||
spec, err := oci.ReadSpec(oci.ConfigFilename)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read %s: %w", oci.ConfigFilename, err)
|
||||
}
|
||||
|
||||
profileName, ok := spec.Annotations[failpointProfileKey]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("failpoint profile is required")
|
||||
}
|
||||
|
||||
fp, ok := failpointProfiles[profileName]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("no such failpoint profile %s", profileName)
|
||||
}
|
||||
return fp, nil
|
||||
}
|
@ -137,11 +137,14 @@ type containerProcess struct {
|
||||
// that its exit can be handled efficiently. If the process has already exited,
|
||||
// it handles the exit immediately. handleStarted should be called after the
|
||||
// event announcing the start of the process has been published.
|
||||
// Note that handleStarted needs to be aware of whether s.mu is already held
|
||||
// when it is called. If s.mu has been held, we don't need to lock it when
|
||||
// calling handleProcessExit.
|
||||
//
|
||||
// The returned cleanup closure releases resources used to handle early exits.
|
||||
// It must be called before the caller of preStart returns, otherwise severe
|
||||
// memory leaks will occur.
|
||||
func (s *service) preStart(c *runc.Container) (handleStarted func(*runc.Container, process.Process), cleanup func()) {
|
||||
func (s *service) preStart(c *runc.Container) (handleStarted func(*runc.Container, process.Process, bool), cleanup func()) {
|
||||
exits := make(map[int][]runcC.Exit)
|
||||
|
||||
s.lifecycleMu.Lock()
|
||||
@ -165,7 +168,7 @@ func (s *service) preStart(c *runc.Container) (handleStarted func(*runc.Containe
|
||||
}
|
||||
}
|
||||
|
||||
handleStarted = func(c *runc.Container, p process.Process) {
|
||||
handleStarted = func(c *runc.Container, p process.Process, muLocked bool) {
|
||||
var pid int
|
||||
if p != nil {
|
||||
pid = p.Pid()
|
||||
@ -180,7 +183,13 @@ func (s *service) preStart(c *runc.Container) (handleStarted func(*runc.Containe
|
||||
} else if exited {
|
||||
s.lifecycleMu.Unlock()
|
||||
for _, ee := range ees {
|
||||
s.handleProcessExit(ee, c, p)
|
||||
if muLocked {
|
||||
s.handleProcessExit(ee, c, p)
|
||||
} else {
|
||||
s.mu.Lock()
|
||||
s.handleProcessExit(ee, c, p)
|
||||
s.mu.Unlock()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
s.running[pid] = append(s.running[pid], containerProcess{
|
||||
@ -235,7 +244,7 @@ func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ *
|
||||
// could happen would also cause the container.Pid() call above to
|
||||
// nil-deference panic.
|
||||
proc, _ := container.Process("")
|
||||
handleStarted(container, proc)
|
||||
handleStarted(container, proc, true)
|
||||
|
||||
return &taskAPI.CreateTaskResponse{
|
||||
Pid: uint32(container.Pid()),
|
||||
@ -262,7 +271,7 @@ func (s *service) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.
|
||||
defer cleanup()
|
||||
p, err := container.Start(ctx, r)
|
||||
if err != nil {
|
||||
handleStarted(container, p)
|
||||
handleStarted(container, p, false)
|
||||
return nil, errdefs.ToGRPC(err)
|
||||
}
|
||||
|
||||
@ -302,7 +311,7 @@ func (s *service) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.
|
||||
Pid: uint32(p.Pid()),
|
||||
})
|
||||
}
|
||||
handleStarted(container, p)
|
||||
handleStarted(container, p, false)
|
||||
return &taskAPI.StartResponse{
|
||||
Pid: uint32(p.Pid()),
|
||||
}, nil
|
||||
@ -462,7 +471,7 @@ func (s *service) Pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.Pi
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pids, err := s.getContainerPids(ctx, r.ID)
|
||||
pids, err := s.getContainerPids(ctx, container)
|
||||
if err != nil {
|
||||
return nil, errdefs.ToGRPC(err)
|
||||
}
|
||||
@ -631,7 +640,9 @@ func (s *service) processExits() {
|
||||
s.lifecycleMu.Unlock()
|
||||
|
||||
for _, cp := range cps {
|
||||
s.mu.Lock()
|
||||
s.handleProcessExit(e, cp.Container, cp.Process)
|
||||
s.mu.Unlock()
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -640,10 +651,8 @@ func (s *service) send(evt interface{}) {
|
||||
s.events <- evt
|
||||
}
|
||||
|
||||
// s.mu must be locked when calling handleProcessExit
|
||||
func (s *service) handleProcessExit(e runcC.Exit, c *runc.Container, p process.Process) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
if ip, ok := p.(*process.Init); ok {
|
||||
// Ensure all children are killed
|
||||
if runc.ShouldKillAllOnExit(s.context, c.Bundle) {
|
||||
@ -664,16 +673,12 @@ func (s *service) handleProcessExit(e runcC.Exit, c *runc.Container, p process.P
|
||||
})
|
||||
}
|
||||
|
||||
func (s *service) getContainerPids(ctx context.Context, id string) ([]uint32, error) {
|
||||
container, err := s.getContainer(id)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
func (s *service) getContainerPids(ctx context.Context, container *runc.Container) ([]uint32, error) {
|
||||
p, err := container.Process("")
|
||||
if err != nil {
|
||||
return nil, errdefs.ToGRPC(err)
|
||||
}
|
||||
ps, err := p.(*process.Init).Runtime().Ps(ctx, id)
|
||||
ps, err := p.(*process.Init).Runtime().Ps(ctx, container.ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -33,3 +33,7 @@ sudo install bin/cni-bridge-fp "${CNI_BIN_DIR}"
|
||||
SHIM_BIN_DIR=${SHIM_BIN_DIR:-"/usr/local/bin"}
|
||||
make bin/containerd-shim-runc-fp-v1
|
||||
sudo install bin/containerd-shim-runc-fp-v1 "${SHIM_BIN_DIR}"
|
||||
|
||||
RUNCFP_BIN_DIR=${RUNCFP_BIN_DIR:-"/usr/local/bin"}
|
||||
make bin/runc-fp
|
||||
sudo install bin/runc-fp "${RUNCFP_BIN_DIR}"
|
||||
|
Loading…
Reference in New Issue
Block a user