Merge pull request #7069 from fuweid/failpoint-in-runc-shimv2

test: introduce failpoint control to runc-shimv2 and cni
This commit is contained in:
Derek McGowan
2022-07-26 23:12:20 -07:00
committed by GitHub
24 changed files with 1700 additions and 14 deletions

View File

@@ -0,0 +1,159 @@
## cni-bridge-f(ail)p(oint)
### Overview
The `cni-bridge-fp` is a CNI plugin which delegates interface-creating function
to [CNI bridge plugin][1] and allows user to inject failpoint before delegation.
Since the CNI plugin is invoked by binary call from CRI and it is short-lived,
the failpoint needs to be configured by a JSON file, which can be persisted.
Here is an example of a failpoint description.
```json
{
"cmdAdd": "1*error(you-shall-not-pass!)->1*panic(again)",
"cmdDel": "1*error(try-again)",
"cmdCheck": "10*off"
}
```
* `cmdAdd` (string, optional): The failpoint for `ADD` command.
* `cmdDel` (string, optional): The failpoint for `DEL` command.
* `cmdCheck` (string, optional): The failpoint for `CHECK` command.
Since a `cmdXXX` value can chain multiple failpoints, each CNI binary call
updates the persisted state so that the failpoints fire in the expected order.
The failpoint injection is enabled by a pod annotation. Currently, the key
of customized CNI capabilities in containerd can only be `io.kubernetes.cri.pod-annotations`,
and containerd passes the pod's annotations to CNI under that object. The
user can use the `failpoint.cni.containerd.io/confpath` annotation to enable
failpoints for the pod.
```yaml
apiVersion: v1
kind: Pod
metadata:
name: nginx
annotations:
failpoint.cni.containerd.io/confpath: "/tmp/pod-failpoints.json"
spec:
containers:
- name: nginx
image: nginx:1.14.2
ports:
- containerPort: 80
```
### Example
Let's use the following JSON as the failpoint description.
```bash
$ cat <<EOF | tee /tmp/cni-failpoint.json
{
"cmdAdd": "1*error(try-again)",
"cmdDel": "2*error(oops)",
"cmdCheck": "1*off->1*panic(sorry)"
}
EOF
```
Then use `ip netns` to create a persistent network namespace named `failpoint`.
```bash
$ sudo ip netns add failpoint
```
Then set up the following bash script for the demo.
```bash
$ cat <<EOFDEMO | tee /tmp/cni-failpoint-demo-helper.sh
#!/usr/bin/env bash
export CNI_CONTAINERID=failpoint-testing
export CNI_NETNS=/run/netns/failpoint
export CNI_IFNAME=fpeni0
export CNI_PATH=/opt/cni/bin/
cat <<EOF | /opt/cni/bin/cni-bridge-fp
{
"cniVersion": "0.3.0",
"name": "containerd-net-fp",
"type": "cni-bridge-fp",
"bridge": "fp-cni0",
"isGateway": true,
"ipMasq": true,
"promiscMode": true,
"ipam": {
"type": "host-local",
"ranges": [
[{
"subnet": "10.88.0.0/16"
}],
[{
"subnet": "2001:4860:4860::/64"
}]
],
"routes": [
{ "dst": "0.0.0.0/0" },
{ "dst": "::/0" }
]
},
"runtimeConfig": {
"io.kubernetes.cri.pod-annotations": {
"failpoint.cni.containerd.io/confpath": "/tmp/cni-failpoint.json"
}
}
}
EOF
EOFDEMO
```
Let's try to set up CNI; we should get the error `try-again`.
```bash
$ sudo CNI_COMMAND=ADD bash /tmp/cni-failpoint-demo-helper.sh
{
"code": 999,
"msg": "try-again"
}
# there is no failpoint left for the ADD command.
$ cat /tmp/cni-failpoint.json | jq .
{
"cmdAdd": "0*error(try-again)",
"cmdDel": "2*error(oops)",
"cmdCheck": "1*off->1*panic(sorry)"
}
```
We should set up CNI successfully after a retry. When we tear down the interface,
the two `DEL` failpoints should fire.
```bash
$ sudo CNI_COMMAND=ADD bash /tmp/cni-failpoint-demo-helper.sh
...
$ sudo CNI_COMMAND=DEL bash /tmp/cni-failpoint-demo-helper.sh
{
"code": 999,
"msg": "oops"
}
$ sudo CNI_COMMAND=DEL bash /tmp/cni-failpoint-demo-helper.sh
{
"code": 999,
"msg": "oops"
}
$ cat /tmp/cni-failpoint.json | jq .
{
"cmdAdd": "0*error(try-again)",
"cmdDel": "0*error(oops)",
"cmdCheck": "1*off->1*panic(sorry)"
}
```
[1]: <https://www.cni.dev/plugins/current/main/bridge/>

View File

@@ -0,0 +1,202 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"context"
"encoding/json"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"syscall"
"github.com/containerd/containerd/pkg/failpoint"
"github.com/containerd/continuity"
"github.com/containernetworking/cni/pkg/invoke"
"github.com/containernetworking/cni/pkg/skel"
"github.com/containernetworking/cni/pkg/version"
)
const delegatedPlugin = "bridge"
// netConf is the part of the CNI network configuration (received on stdin)
// that this plugin decodes itself. Only the runtimeConfig section is read
// here; the raw stdin bytes are what get handed to the delegated plugin.
type netConf struct {
// RuntimeConfig maps the "runtimeConfig" object of the network configuration.
RuntimeConfig struct {
// PodAnnotations is decoded from the "io.kubernetes.cri.pod-annotations"
// object inside runtimeConfig.
PodAnnotations inheritedPodAnnotations `json:"io.kubernetes.cri.pod-annotations"`
} `json:"runtimeConfig,omitempty"`
}
// inheritedPodAnnotations holds the pod annotations this plugin looks at.
// Annotations other than the ones listed here are ignored when decoding.
type inheritedPodAnnotations struct {
// FailpointConfPath represents filepath of failpoint settings. It is read
// from the "failpoint.cni.containerd.io/confpath" annotation; when it is
// empty, handleFailpoint injects nothing.
FailpointConfPath string `json:"failpoint.cni.containerd.io/confpath,omitempty"`
}
// failpointConf is used to describe cmdAdd/cmdDel/cmdCheck command's failpoint.
//
// It is the JSON document stored at the failpoint confPath. Each field holds
// a failpoint description string that is parsed with failpoint.NewFailpoint;
// an empty field means no failpoint for that command.
type failpointConf struct {
// Add is the failpoint description used for the ADD command.
Add string `json:"cmdAdd,omitempty"`
// Del is the failpoint description used for the DEL command.
Del string `json:"cmdDel,omitempty"`
// Check is the failpoint description used for the CHECK command.
Check string `json:"cmdCheck,omitempty"`
}
// main registers the ADD, CHECK and DEL handlers of this plugin with the CNI
// skeleton, advertising support for every CNI version known to the library.
func main() {
	const about = "bridge with failpoint support"

	skel.PluginMain(cmdAdd, cmdCheck, cmdDel, version.All, about)
}
// cmdAdd handles the CNI ADD command.
//
// The ADD failpoint (if one is configured for the pod) is evaluated first and
// its error is returned as-is. Otherwise the unmodified stdin configuration is
// delegated to the bridge plugin and the delegate's result is printed.
func cmdAdd(args *skel.CmdArgs) error {
	fpErr := handleFailpoint(args, "ADD")
	if fpErr != nil {
		return fpErr
	}

	res, addErr := invoke.DelegateAdd(context.TODO(), delegatedPlugin, args.StdinData, nil)
	if addErr != nil {
		return addErr
	}
	return res.Print()
}
// cmdCheck handles the CNI CHECK command.
//
// The CHECK failpoint (if one is configured for the pod) is evaluated first
// and its error is returned as-is. Otherwise the unmodified stdin
// configuration is delegated to the bridge plugin.
func cmdCheck(args *skel.CmdArgs) error {
	fpErr := handleFailpoint(args, "CHECK")
	if fpErr != nil {
		return fpErr
	}

	ctx := context.TODO()
	return invoke.DelegateCheck(ctx, delegatedPlugin, args.StdinData, nil)
}
// cmdDel handles the CNI DEL command.
//
// The DEL failpoint (if one is configured for the pod) is evaluated first and
// its error is returned as-is. Otherwise the unmodified stdin configuration is
// delegated to the bridge plugin.
func cmdDel(args *skel.CmdArgs) error {
	fpErr := handleFailpoint(args, "DEL")
	if fpErr != nil {
		return fpErr
	}

	ctx := context.TODO()
	return invoke.DelegateDel(ctx, delegatedPlugin, args.StdinData, nil)
}
// handleFailpoint evaluates the failpoint configured for cmdKind, if any.
//
// The failpoint configuration path is taken from the
// "failpoint.cni.containerd.io/confpath" pod annotation carried in the
// runtimeConfig of the network configuration on stdin. When the annotation is
// absent or empty, nothing is injected and nil is returned.
//
// cmdKind is one of "ADD", "DEL" or "CHECK". The returned error is either a
// parsing/IO error or whatever the evaluated failpoint produced.
func handleFailpoint(args *skel.CmdArgs, cmdKind string) error {
	parsed := netConf{}
	if err := json.Unmarshal(args.StdinData, &parsed); err != nil {
		return fmt.Errorf("failed to parse network configuration: %w", err)
	}

	fpPath := parsed.RuntimeConfig.PodAnnotations.FailpointConfPath
	if fpPath == "" {
		// Failpoint injection was not requested for this pod.
		return nil
	}

	ctl, err := newFailpointControl(fpPath)
	if err != nil {
		return err
	}

	eval, err := ctl.delegatedEvalFn(cmdKind)
	if err != nil {
		return err
	}
	return eval()
}
// failpointControl gives access to the failpoint settings stored in the file
// at confPath.
type failpointControl struct {
	// confPath is the absolute path of the JSON failpoint settings file.
	confPath string
}

// newFailpointControl returns a failpointControl bound to confPath.
//
// confPath must be an absolute path; otherwise an error is returned and the
// control is nil. The file itself is not touched here.
func newFailpointControl(confPath string) (*failpointControl, error) {
	if filepath.IsAbs(confPath) {
		return &failpointControl{confPath: confPath}, nil
	}
	return nil, fmt.Errorf("failpoint confPath(%s) is required to be absolute", confPath)
}
// delegatedEvalFn consumes one step of the failpoint configured for cmdKind
// and returns the function that carries out that step.
//
// Inside a single updateTx transaction the failpoint description for cmdKind
// ("ADD", "DEL" or "CHECK") is parsed, its delegated evaluation function is
// taken, and the description is replaced by the failpoint's re-marshalled
// form so that the stored settings reflect the step that was taken.
//
// When cmdKind is unknown or its description is empty, the returned function
// is nopEvalFn and the stored description is left as it was.
func (c *failpointControl) delegatedEvalFn(cmdKind string) (failpoint.EvalFn, error) {
	// descriptionFor selects the settings field that belongs to cmdKind, or
	// nil when cmdKind is not a known command.
	descriptionFor := func(conf *failpointConf) *string {
		switch cmdKind {
		case "ADD":
			return &conf.Add
		case "DEL":
			return &conf.Del
		case "CHECK":
			return &conf.Check
		}
		return nil
	}

	var evalFn failpoint.EvalFn = nopEvalFn
	txErr := c.updateTx(func(conf *failpointConf) error {
		desc := descriptionFor(conf)
		if desc == nil || *desc == "" {
			return nil
		}

		fp, err := failpoint.NewFailpoint(cmdKind, *desc)
		if err != nil {
			return fmt.Errorf("failed to parse failpoint %s: %w", *desc, err)
		}

		// Take the evaluation function first, then store the marshalled
		// state back into the settings.
		evalFn = fp.DelegatedEval()
		*desc = fp.Marshal()
		return nil
	})
	if txErr != nil {
		return nil, txErr
	}
	return evalFn, nil
}
// updateTx runs updateFn against the failpoint settings stored at c.confPath
// while holding an exclusive flock on that file, and then persists the
// (possibly modified) settings.
//
// The settings are written back with continuity.AtomicWriteFile, which
// replaces the file at confPath rather than rewriting it in place. See
// openLocked for how a concurrent replacement is detected.
//
// An error is returned when the file cannot be opened, locked, read or
// decoded, when updateFn fails, or when the result cannot be encoded or
// written. If updateFn fails, nothing is written.
func (c *failpointControl) updateTx(updateFn func(conf *failpointConf) error) error {
	f, err := c.openLocked()
	if err != nil {
		return err
	}
	defer f.Close()
	defer unflock(f.Fd())

	data, err := ioutil.ReadAll(f)
	if err != nil {
		return fmt.Errorf("failed to read failpoint setting %s: %w", c.confPath, err)
	}

	var conf failpointConf
	if err := json.Unmarshal(data, &conf); err != nil {
		return fmt.Errorf("failed to unmarshal failpoint conf %s: %w", string(data), err)
	}

	if err := updateFn(&conf); err != nil {
		return err
	}

	data, err = json.Marshal(conf)
	if err != nil {
		return fmt.Errorf("failed to marshal failpoint conf: %w", err)
	}
	return continuity.AtomicWriteFile(c.confPath, data, 0666)
}

// openLocked opens c.confPath for read/write and returns it with an exclusive
// flock held. The caller is responsible for unlocking and closing the file.
//
// Because updateTx replaces the file at confPath on every update, a process
// that opened the file and then waited for the lock may end up holding a lock
// on a file that is no longer reachable through confPath: its content is
// stale and the lock no longer excludes anybody. After the lock is acquired,
// the opened file is therefore compared with the file confPath currently
// names, and the open+lock sequence is retried when they differ.
func (c *failpointControl) openLocked() (*os.File, error) {
	for {
		f, err := os.OpenFile(c.confPath, os.O_RDWR, 0666)
		if err != nil {
			return nil, fmt.Errorf("failed to open confPath %s: %w", c.confPath, err)
		}

		if err := flock(f.Fd()); err != nil {
			f.Close()
			return nil, fmt.Errorf("failed to lock failpoint setting %s: %w", c.confPath, err)
		}

		held, err := f.Stat()
		if err == nil {
			var current os.FileInfo
			current, err = os.Stat(c.confPath)
			if err == nil && os.SameFile(held, current) {
				return f, nil
			}
		}

		// Either the file was replaced while waiting for the lock, or it
		// could not be inspected. Release this handle in both cases; the
		// unlock/close errors carry no extra information worth reporting.
		unflock(f.Fd())
		f.Close()
		if err != nil {
			return nil, fmt.Errorf("failed to stat failpoint setting %s: %w", c.confPath, err)
		}
	}
}
// nopEvalFn is the evaluation function used when no failpoint applies. It
// does nothing and always reports success.
func nopEvalFn() error { return nil }
// flock places an exclusive advisory lock (LOCK_EX) on the open file referred
// to by fd and returns the error reported by the flock system call, if any.
// LOCK_NB is not set, so the call waits until the lock can be granted.
func flock(fd uintptr) error {
	const exclusive = syscall.LOCK_EX

	return syscall.Flock(int(fd), exclusive)
}
// unflock removes the advisory lock (LOCK_UN) held on the open file referred
// to by fd and returns the error reported by the flock system call, if any.
func unflock(fd uintptr) error {
	const unlock = syscall.LOCK_UN

	return syscall.Flock(int(fd), unlock)
}

View File

@@ -0,0 +1,29 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"context"
"github.com/containerd/containerd/runtime/v2/runc/manager"
_ "github.com/containerd/containerd/runtime/v2/runc/pause"
"github.com/containerd/containerd/runtime/v2/shim"
)
// main runs the shim manager for the failpoint-enabled runc shim, registered
// under the runtime name "io.containerd.runc-fp.v1".
func main() {
	const runtimeName = "io.containerd.runc-fp.v1"

	ctx := context.Background()
	shim.RunManager(ctx, manager.NewShimManager(runtimeName))
}

View File

@@ -0,0 +1,141 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"context"
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
taskapi "github.com/containerd/containerd/api/runtime/task/v2"
"github.com/containerd/containerd/oci"
"github.com/containerd/containerd/pkg/failpoint"
"github.com/containerd/containerd/pkg/shutdown"
"github.com/containerd/containerd/plugin"
"github.com/containerd/containerd/runtime/v2/runc/task"
"github.com/containerd/containerd/runtime/v2/shim"
"github.com/containerd/ttrpc"
)
const (
// ociConfigFilename is the name of the OCI runtime spec file that is looked
// up in the shim's current working directory.
ociConfigFilename = "config.json"
// failpointPrefixKey is the prefix of the OCI annotation keys that define
// failpoints; the remainder of the key is used as the task API method name.
failpointPrefixKey = "io.containerd.runtime.v2.shim.failpoint."
)
// init registers the "task" TTRPC plugin, which serves the runc task service
// wrapped with failpoint support. The plugin depends on the event and
// internal plugin types.
func init() {
	plugin.Register(&plugin.Registration{
		Type:     plugin.TTRPCPlugin,
		ID:       "task",
		Requires: []plugin.Type{plugin.EventPlugin, plugin.InternalPlugin},
		InitFn:   newTaskServiceWithFp,
	})
}

// newTaskServiceWithFp is the InitFn of the "task" plugin.
//
// It resolves the "publisher" event plugin and the "shutdown" internal plugin,
// loads the failpoints from the OCI annotations of the bundle, creates the
// regular runc task service and returns it wrapped in a taskServiceWithFp.
// The first failing step aborts the initialization with its error.
func newTaskServiceWithFp(ic *plugin.InitContext) (interface{}, error) {
	publisher, err := ic.GetByID(plugin.EventPlugin, "publisher")
	if err != nil {
		return nil, err
	}

	shutdownSvc, err := ic.GetByID(plugin.InternalPlugin, "shutdown")
	if err != nil {
		return nil, err
	}

	failpoints, err := newFailpointFromOCIAnnotation()
	if err != nil {
		return nil, err
	}

	local, err := task.NewTaskService(ic.Context, publisher.(shim.Publisher), shutdownSvc.(shutdown.Service))
	if err != nil {
		return nil, err
	}

	wrapped := &taskServiceWithFp{
		fps:   failpoints,
		local: local,
	}
	return wrapped, nil
}
// taskServiceWithFp couples a task service with a set of failpoints that are
// evaluated by its unary interceptor before a request reaches the service.
type taskServiceWithFp struct {
// fps maps a task API method name to the failpoint configured for it.
fps map[string]*failpoint.Failpoint
// local is the task service that actually handles the requests.
local taskapi.TaskService
}
// RegisterTTRPC registers the wrapped task service (s.local) on server. It
// always returns nil.
func (s *taskServiceWithFp) RegisterTTRPC(server *ttrpc.Server) error {
taskapi.RegisterTaskService(server, s.local)
return nil
}
// UnaryInterceptor returns a ttrpc unary server interceptor that evaluates the
// failpoint configured for the called method before invoking it.
//
// The method name is the last path element of info.FullMethod. When a
// failpoint exists for that name and its evaluation returns an error, the
// error is returned and the method is not invoked. In every other case the
// call is passed through unchanged.
func (s *taskServiceWithFp) UnaryInterceptor() ttrpc.UnaryServerInterceptor {
	return func(ctx context.Context, unmarshal ttrpc.Unmarshaler, info *ttrpc.UnaryServerInfo, method ttrpc.Method) (interface{}, error) {
		fp, found := s.fps[filepath.Base(info.FullMethod)]
		if !found {
			return method(ctx, unmarshal)
		}

		if err := fp.Evaluate(); err != nil {
			return nil, err
		}
		return method(ctx, unmarshal)
	}
}
// newFailpointFromOCIAnnotation builds the failpoint set of the shim from the
// annotations of the OCI spec stored in bundle-path/config.json.
//
// An annotation that controls a task API failpoint looks like:
//
//	io.containerd.runtime.v2.shim.failpoint.Create = 1*off->1*error(please retry)
//
// The key suffix (`Create` above) names the shim unary API and the annotation
// value is the failpoint description. Annotations without the failpoint prefix
// are ignored. The result maps the API name to its failpoint; it is empty when
// the spec defines none. Any read or parse failure is returned as an error.
func newFailpointFromOCIAnnotation() (map[string]*failpoint.Failpoint, error) {
	// NOTE: the shim is started with the bundle dir as its working dir.
	bundleDir, err := os.Getwd()
	if err != nil {
		return nil, fmt.Errorf("failed to get current working dir: %w", err)
	}

	specPath := filepath.Join(bundleDir, ociConfigFilename)
	raw, err := os.ReadFile(specPath)
	if err != nil {
		return nil, fmt.Errorf("failed to read %v: %w", specPath, err)
	}

	var spec oci.Spec
	if err := json.Unmarshal(raw, &spec); err != nil {
		return nil, fmt.Errorf("failed to parse oci.Spec(%v): %w", string(raw), err)
	}

	failpoints := make(map[string]*failpoint.Failpoint)
	for key, desc := range spec.Annotations {
		if strings.HasPrefix(key, failpointPrefixKey) {
			method := strings.TrimPrefix(key, failpointPrefixKey)

			fp, err := failpoint.NewFailpoint(method, desc)
			if err != nil {
				return nil, fmt.Errorf("failed to parse failpoint %v: %w", desc, err)
			}
			failpoints[method] = fp
		}
	}
	return failpoints, nil
}

View File

@@ -174,7 +174,8 @@ func PodSandboxConfig(name, ns string, opts ...PodSandboxOpts) *runtime.PodSandb
Uid: util.GenerateID(),
Namespace: Randomize(ns),
},
Linux: &runtime.LinuxPodSandboxConfig{},
Linux: &runtime.LinuxPodSandboxConfig{},
Annotations: make(map[string]string),
}
for _, opt := range opts {
opt(config)

View File

@@ -0,0 +1,122 @@
//go:build linux
// +build linux
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package integration
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
"testing"
criapiv1 "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/pkg/failpoint"
"github.com/stretchr/testify/require"
)
const (
// failpointRuntimeHandler is the runtime handler passed to RunPodSandbox by
// the failpoint tests.
failpointRuntimeHandler = "runc-fp"
// failpointShimPrefixKey is the annotation-key prefix for shim failpoints;
// the shim method name is appended to it.
failpointShimPrefixKey = "io.containerd.runtime.v2.shim.failpoint."
// failpointCNIConfPathKey is the annotation key whose value is the path of
// the CNI failpoint settings file.
failpointCNIConfPathKey = "failpoint.cni.containerd.io/confpath"
)
func TestRunPodSandboxWithSetupCNIFailure(t *testing.T) {
if runtime.GOOS != "linux" {
t.Skip()
}
t.Logf("Inject CNI failpoint")
conf := &failpointConf{
Add: "1*error(you-shall-not-pass!)",
}
sbConfig := PodSandboxConfig(t.Name(), "failpoint")
injectCNIFailpoint(t, sbConfig, conf)
t.Logf("Create a sandbox")
_, err := runtimeService.RunPodSandbox(sbConfig, failpointRuntimeHandler)
require.Error(t, err)
require.Equal(t, true, strings.Contains(err.Error(), "you-shall-not-pass!"))
t.Logf("Retry to create sandbox with same config")
sb, err := runtimeService.RunPodSandbox(sbConfig, failpointRuntimeHandler)
require.NoError(t, err)
err = runtimeService.StopPodSandbox(sb)
require.NoError(t, err)
err = runtimeService.RemovePodSandbox(sb)
require.NoError(t, err)
}
// TestRunPodSandboxWithShimStartFailure injects a one-shot error into the
// shim's Start method and checks that RunPodSandbox fails with the injected
// message.
func TestRunPodSandboxWithShimStartFailure(t *testing.T) {
	if runtime.GOOS != "linux" {
		t.Skip()
	}

	t.Log("Inject Shim failpoint")
	shimFailpoints := map[string]string{
		"Start": "1*error(no hard feelings)",
	}
	sbConfig := PodSandboxConfig(t.Name(), "failpoint")
	injectShimFailpoint(t, sbConfig, shimFailpoints)

	t.Log("Create a sandbox")
	_, runErr := runtimeService.RunPodSandbox(sbConfig, failpointRuntimeHandler)
	require.Error(t, runErr)
	require.True(t, strings.Contains(runErr.Error(), "no hard feelings"))
}
// failpointConf is used to describe cmdAdd/cmdDel/cmdCheck command's failpoint.
//
// It is marshalled to JSON by injectCNIFailpoint and written to the file that
// the CNI failpoint annotation points at. The fields carry no omitempty, so
// unset failpoints are written as empty strings.
type failpointConf struct {
// Add is the failpoint description stored under "cmdAdd".
Add string `json:"cmdAdd"`
// Del is the failpoint description stored under "cmdDel".
Del string `json:"cmdDel"`
// Check is the failpoint description stored under "cmdCheck".
Check string `json:"cmdCheck"`
}
// injectCNIFailpoint writes conf as JSON into a file inside a per-test
// temporary directory and points the sandbox's CNI failpoint annotation at
// that file. The file is named "<namespace>-<name>.json" after the sandbox
// metadata. Any failure aborts the test.
//
// sbConfig.Annotations must be a non-nil map.
func injectCNIFailpoint(t *testing.T, sbConfig *criapiv1.PodSandboxConfig, conf *failpointConf) {
	meta := sbConfig.Metadata
	confName := fmt.Sprintf("%s-%s.json", meta.Namespace, meta.Name)
	confPath := filepath.Join(t.TempDir(), confName)

	payload, err := json.Marshal(conf)
	require.NoError(t, err)

	require.NoError(t, os.WriteFile(confPath, payload, 0666))

	sbConfig.Annotations[failpointCNIConfPathKey] = confPath
}
// injectShimFailpoint adds one shim failpoint annotation to sbConfig for each
// entry of methodFps, which maps a shim method name to a failpoint
// description. Every description is validated with failpoint.NewFailpoint
// first; an invalid one aborts the test.
//
// sbConfig.Annotations must be a non-nil map.
func injectShimFailpoint(t *testing.T, sbConfig *criapiv1.PodSandboxConfig, methodFps map[string]string) {
	for name, desc := range methodFps {
		_, parseErr := failpoint.NewFailpoint(name, desc)
		require.NoError(t, parseErr, "check failpoint %s for shim method %s", desc, name)

		annotationKey := failpointShimPrefixKey + name
		sbConfig.Annotations[annotationKey] = desc
	}
}