integration: regression test for issue 10589
This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist. Relates to https://github.com/containerd/containerd/issues/10589 Signed-off-by: Samuel Karp <samuelkarp@google.com>
This commit is contained in:
parent
5f37a2c205
commit
18725f010b
@ -34,14 +34,18 @@ import (
|
||||
"github.com/containerd/cgroups/v3/cgroup1"
|
||||
cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2"
|
||||
"github.com/containerd/containerd/api/types/runc/options"
|
||||
"github.com/containerd/errdefs"
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
. "github.com/containerd/containerd/v2/client"
|
||||
"github.com/containerd/containerd/v2/core/containers"
|
||||
"github.com/containerd/containerd/v2/integration/failpoint"
|
||||
"github.com/containerd/containerd/v2/pkg/cio"
|
||||
"github.com/containerd/containerd/v2/pkg/fifosync"
|
||||
"github.com/containerd/containerd/v2/pkg/oci"
|
||||
"github.com/containerd/containerd/v2/pkg/shim"
|
||||
"github.com/containerd/containerd/v2/pkg/sys"
|
||||
"github.com/containerd/containerd/v2/plugins"
|
||||
"github.com/containerd/errdefs"
|
||||
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/stretchr/testify/require"
|
||||
@ -1551,3 +1555,207 @@ func TestIssue9103(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestIssue10589 is used as regression case for issue 10589.
|
||||
//
|
||||
// This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates
|
||||
// by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This
|
||||
// allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist.
|
||||
//
|
||||
// The workflow is as follows:
|
||||
// 1. Create a container as normal
|
||||
// 2. Make an exec1 using runc-fp with delayexec
|
||||
// 3. Wait until the exec is waiting to start (triggered by delayexec)
|
||||
// 4. Kill the container init process (signalling it is easiest)
|
||||
// 5. Make an exec2 using runc-fp with delayexec
|
||||
// 6. Wait until the exec is waiting to start
|
||||
// 7. Allow exec1 to proceed
|
||||
// 8. Allow exec2 to proceed
|
||||
// 9. See that the container has exited and all execs have exited too
|
||||
//
|
||||
// https://github.com/containerd/containerd/issues/10589
|
||||
func TestIssue10589(t *testing.T) {
|
||||
if f := os.Getenv("RUNC_FLAVOR"); f != "" && f != "runc" {
|
||||
t.Skip("test requires runc")
|
||||
}
|
||||
|
||||
client, err := newClient(t, address)
|
||||
require.NoError(t, err)
|
||||
t.Cleanup(func() {
|
||||
client.Close()
|
||||
})
|
||||
|
||||
var (
|
||||
image Image
|
||||
ctx, cancel = testContext(t)
|
||||
id = t.Name()
|
||||
)
|
||||
t.Cleanup(cancel)
|
||||
|
||||
image, err = client.GetImage(ctx, testImage)
|
||||
require.NoError(t, err)
|
||||
|
||||
// 1. Create a sleeping container
|
||||
t.Log("1. Create a sleeping container")
|
||||
container, err := client.NewContainer(ctx, id,
|
||||
WithNewSnapshot(id, image),
|
||||
WithNewSpec(oci.WithImageConfig(image),
|
||||
withProcessArgs("sleep", "inf"),
|
||||
oci.WithAnnotations(map[string]string{
|
||||
"oci.runc.failpoint.profile": "delayExec",
|
||||
}),
|
||||
),
|
||||
WithRuntime(client.Runtime(), &options.Options{
|
||||
BinaryName: "runc-fp",
|
||||
}),
|
||||
)
|
||||
require.NoError(t, err, "create container")
|
||||
t.Cleanup(func() {
|
||||
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
||||
err := container.Delete(ctx, WithSnapshotCleanup)
|
||||
if err != nil {
|
||||
t.Log("delete err", err)
|
||||
}
|
||||
cancel()
|
||||
})
|
||||
|
||||
task, err := container.NewTask(ctx, empty())
|
||||
require.NoError(t, err, "create task")
|
||||
t.Cleanup(func() {
|
||||
ctx, cancel := context.WithTimeout(ctx, 2*time.Second)
|
||||
st, err := task.Delete(ctx, WithProcessKill)
|
||||
t.Log("exit status", st)
|
||||
if err != nil {
|
||||
t.Log("kill err", err)
|
||||
}
|
||||
cancel()
|
||||
})
|
||||
|
||||
err = task.Start(ctx)
|
||||
require.NoError(t, err, "start container")
|
||||
|
||||
status, err := task.Status(ctx)
|
||||
require.NoError(t, err, "container status")
|
||||
require.Equal(t, Running, status.Status)
|
||||
|
||||
// 2. Create an exec
|
||||
t.Log("2. Create exec1")
|
||||
exec1ReadyFifo, err := fifosync.NewWaiter(filepath.Join(t.TempDir(), "exec1-ready.fifo"), 0600)
|
||||
require.NoError(t, err, "create exec1 ready fifo")
|
||||
exec1DelayFifo, err := fifosync.NewTrigger(filepath.Join(t.TempDir(), "exec1-delay.fifo"), 0600)
|
||||
require.NoError(t, err, "create exec1 delay fifo")
|
||||
exec1, err := task.Exec(ctx, "exec1", &specs.Process{
|
||||
Args: []string{"/bin/sleep", "301"},
|
||||
Cwd: "/",
|
||||
Env: []string{
|
||||
failpoint.DelayExecReadyEnv + "=" + exec1ReadyFifo.Name(),
|
||||
failpoint.DelayExecDelayEnv + "=" + exec1DelayFifo.Name(),
|
||||
},
|
||||
}, cio.NullIO)
|
||||
require.NoError(t, err, "create exec1")
|
||||
|
||||
exec1done := make(chan struct{})
|
||||
go func() {
|
||||
defer close(exec1done)
|
||||
t.Log("Starting exec1")
|
||||
err := exec1.Start(ctx)
|
||||
assert.Error(t, err, "start exec1")
|
||||
t.Logf("error starting exec1: %s", err)
|
||||
}()
|
||||
|
||||
// 3. Wait until the exec is waiting to start
|
||||
t.Log("3. Wait until exec1 is waiting to start")
|
||||
err = exec1ReadyFifo.Wait()
|
||||
require.NoError(t, err, "open exec1 fifo")
|
||||
|
||||
// 4. Kill the container init process
|
||||
t.Log("4. Kill the container init process")
|
||||
target := task.Pid()
|
||||
t.Logf("Killing main pid (%v) of container %s", target, container.ID())
|
||||
syscall.Kill(int(target), syscall.SIGKILL)
|
||||
status, err = task.Status(ctx)
|
||||
require.NoError(t, err, "container status")
|
||||
t.Log("container status", status.Status)
|
||||
|
||||
// 5. Make an exec (2) using this failpoint
|
||||
t.Log("5. Create exec2")
|
||||
exec2ReadyFifo, err := fifosync.NewWaiter(filepath.Join(t.TempDir(), "exec2-ready.fifo"), 0600)
|
||||
require.NoError(t, err, "create exec2 ready fifo: %q", exec2ReadyFifo)
|
||||
exec2DelayFifo, err := fifosync.NewTrigger(filepath.Join(t.TempDir(), "exec2-delay.fifo"), 0600)
|
||||
require.NoError(t, err, "create exec2 delay fifo: %q", exec2DelayFifo)
|
||||
exec2, err := task.Exec(ctx, "exec2", &specs.Process{
|
||||
Args: []string{"/bin/sleep", "302"},
|
||||
Cwd: "/",
|
||||
Env: []string{
|
||||
failpoint.DelayExecReadyEnv + "=" + exec2ReadyFifo.Name(),
|
||||
failpoint.DelayExecDelayEnv + "=" + exec2DelayFifo.Name(),
|
||||
},
|
||||
}, cio.NullIO)
|
||||
require.NoError(t, err, "create exec2")
|
||||
|
||||
exec2done := make(chan struct{})
|
||||
didExec2Run := true
|
||||
go func() {
|
||||
defer close(exec2done)
|
||||
t.Log("Starting exec2")
|
||||
err := exec2.Start(ctx)
|
||||
assert.Error(t, err, "start exec2")
|
||||
t.Logf("error starting exec2: %s", err)
|
||||
}()
|
||||
|
||||
// 6. Wait until the exec is waiting to start
|
||||
t.Log("6. Wait until exec2 is waiting to start")
|
||||
exec2ready := make(chan struct{})
|
||||
go func() {
|
||||
exec2ReadyFifo.Wait()
|
||||
close(exec2ready)
|
||||
}()
|
||||
select {
|
||||
case <-exec2ready:
|
||||
case <-exec2done:
|
||||
didExec2Run = false
|
||||
}
|
||||
|
||||
// 7. Allow exec=1 to proceed
|
||||
t.Log("7. Allow exec=1 to proceed")
|
||||
err = exec1DelayFifo.Trigger()
|
||||
assert.NoError(t, err, "trigger exec1 fifo")
|
||||
status, err = task.Status(ctx)
|
||||
require.NoError(t, err, "container status")
|
||||
t.Log("container status", status.Status)
|
||||
<-exec1done
|
||||
status, err = task.Status(ctx)
|
||||
require.NoError(t, err, "container status")
|
||||
t.Log("container status", status.Status)
|
||||
|
||||
// 8. Allow exec=2 to proceed
|
||||
if didExec2Run {
|
||||
t.Log("8. Allow exec2 to proceed")
|
||||
err = exec2DelayFifo.Trigger()
|
||||
assert.NoError(t, err, "trigger exec2 fifo")
|
||||
status, err = task.Status(ctx)
|
||||
require.NoError(t, err, "container status")
|
||||
t.Log("container status", status.Status)
|
||||
<-exec2done
|
||||
status, err = task.Status(ctx)
|
||||
require.NoError(t, err, "container status")
|
||||
t.Log("container status", status.Status)
|
||||
} else {
|
||||
t.Log("8. Skip exec2")
|
||||
}
|
||||
|
||||
// 9. Validate
|
||||
t.Log("9. Validate")
|
||||
status, err = exec1.Status(ctx)
|
||||
require.NoError(t, err, "exec1 status")
|
||||
t.Logf("exec1 status: %s", status.Status)
|
||||
assert.Equal(t, Created, status.Status)
|
||||
status, err = exec2.Status(ctx)
|
||||
require.NoError(t, err, "exec2 status")
|
||||
t.Logf("exec2 status: %s", status.Status)
|
||||
assert.Equal(t, Created, status.Status)
|
||||
status, err = task.Status(ctx)
|
||||
t.Logf("task status: %s", status.Status)
|
||||
require.NoError(t, err, "container status")
|
||||
assert.Equal(t, Stopped, status.Status)
|
||||
}
|
||||
|
131
integration/failpoint/cmd/runc-fp/delayexec.go
Normal file
131
integration/failpoint/cmd/runc-fp/delayexec.go
Normal file
@ -0,0 +1,131 @@
|
||||
//go:build linux
|
||||
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/sirupsen/logrus"
|
||||
|
||||
"github.com/containerd/containerd/v2/integration/failpoint"
|
||||
"github.com/containerd/containerd/v2/pkg/fifosync"
|
||||
)
|
||||
|
||||
// delayExec delays an "exec" command until a trigger is received from the calling test program. This can be used to
|
||||
// test races around container lifecycle and exec processes.
|
||||
func delayExec(ctx context.Context, method invoker) error {
|
||||
isExec := strings.Contains(strings.Join(os.Args, ","), ",exec,")
|
||||
if !isExec {
|
||||
if err := method(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
logrus.Debug("EXEC!")
|
||||
|
||||
if err := delay(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := method(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func delay() error {
|
||||
ready, delay, err := fifoFromProcessEnv()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ready.Trigger(); err != nil {
|
||||
return err
|
||||
}
|
||||
return delay.Wait()
|
||||
}
|
||||
|
||||
// fifoFromProcessEnv finds a fifo specified in the environment variables of an exec process
|
||||
func fifoFromProcessEnv() (fifosync.Trigger, fifosync.Waiter, error) {
|
||||
env, err := processEnvironment()
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
readyName, ok := env[failpoint.DelayExecReadyEnv]
|
||||
if !ok {
|
||||
return nil, nil, fmt.Errorf("fifo: failed to find %q env var in %v", failpoint.DelayExecReadyEnv, env)
|
||||
}
|
||||
delayName, ok := env[failpoint.DelayExecDelayEnv]
|
||||
if !ok {
|
||||
return nil, nil, fmt.Errorf("fifo: failed to find %q env var in %v", failpoint.DelayExecDelayEnv, env)
|
||||
}
|
||||
logrus.WithField("ready", readyName).WithField("delay", delayName).Debug("Found FIFOs!")
|
||||
readyFIFO, err := fifosync.NewTrigger(readyName, 0600)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
delayFIFO, err := fifosync.NewWaiter(delayName, 0600)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
return readyFIFO, delayFIFO, nil
|
||||
}
|
||||
|
||||
func processEnvironment() (map[string]string, error) {
|
||||
idx := 2
|
||||
for ; idx < len(os.Args); idx++ {
|
||||
if os.Args[idx] == "--process" {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if idx >= len(os.Args)-1 || os.Args[idx] != "--process" {
|
||||
return nil, errors.New("env: option --process required")
|
||||
}
|
||||
|
||||
specFile := os.Args[idx+1]
|
||||
f, err := os.OpenFile(specFile, os.O_RDONLY, 0o644)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("env: failed to open %s: %w", specFile, err)
|
||||
}
|
||||
|
||||
b, err := io.ReadAll(f)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("env: failed to read spec from %q", specFile)
|
||||
}
|
||||
var spec specs.Process
|
||||
if err := json.Unmarshal(b, &spec); err != nil {
|
||||
return nil, fmt.Errorf("env: failed to unmarshal spec from %q: %w", specFile, err)
|
||||
}
|
||||
|
||||
// XXX: env vars can be specified multiple times, but we only keep one
|
||||
env := make(map[string]string)
|
||||
for _, e := range spec.Env {
|
||||
k, v, _ := strings.Cut(e, "=")
|
||||
env[k] = v
|
||||
}
|
||||
|
||||
return env, nil
|
||||
}
|
@ -25,8 +25,9 @@ import (
|
||||
"os/exec"
|
||||
"syscall"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/oci"
|
||||
"github.com/sirupsen/logrus"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/oci"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -40,6 +41,7 @@ type invokerInterceptor func(context.Context, invoker) error
|
||||
var (
|
||||
failpointProfiles = map[string]invokerInterceptor{
|
||||
"issue9103": issue9103KillInitAfterCreate,
|
||||
"delayExec": delayExec,
|
||||
}
|
||||
)
|
||||
|
||||
|
22
integration/failpoint/const.go
Normal file
22
integration/failpoint/const.go
Normal file
@ -0,0 +1,22 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package failpoint
|
||||
|
||||
const (
|
||||
DelayExecReadyEnv = "_RUNC_FP_DELAY_EXEC_READY"
|
||||
DelayExecDelayEnv = "_RUNC_FP_DELAY_EXEC_DELAY"
|
||||
)
|
Loading…
Reference in New Issue
Block a user