diff --git a/cmd/cri-containerd/options/options.go b/cmd/cri-containerd/options/options.go index 945a1098e..da8dba7ee 100644 --- a/cmd/cri-containerd/options/options.go +++ b/cmd/cri-containerd/options/options.go @@ -87,6 +87,9 @@ type PluginConfig struct { // SkipImageFSUUID skips retrieving imagefs uuid. // TODO(random-liu): Remove this after we find a generic way to get imagefs uuid. SkipImageFSUUID bool `toml:"skip_imagefs_uuid" json:"skipImageFSUUID,omitempty"` + // EnableIPv6DAD enables IPv6 DAD. + // TODO(random-liu): Use optimistic_dad when it's GA. + EnableIPv6DAD bool `toml:"enable_ipv6_dad" json:"enableIPv6DAD,omitempty"` } // CRIConfig contains toml config related to CRI service. @@ -189,6 +192,8 @@ func (c *CRIContainerdOptions) AddFlags(fs *pflag.FlagSet) { defaults.ProfilingAddress, "Profiling address for web interface host:port/debug/pprof/.") fs.BoolVar(&c.SkipImageFSUUID, "skip-imagefs-uuid", defaults.SkipImageFSUUID, "Skip retrieval of imagefs uuid. When turned on, kubelet will not be able to get imagefs capacity or perform imagefs disk eviction.") + fs.BoolVar(&c.EnableIPv6DAD, "enable-ipv6-dad", + defaults.EnableIPv6DAD, "Enable IPv6 DAD (duplicate address detection) for pod sandbox network. Enabling this will increase pod sandbox start latency by several seconds.") } // InitFlags load configurations from config file, and then overwrite with flags. 
@@ -257,6 +262,7 @@ func DefaultConfig() Config { StatsCollectPeriod: 10, SystemdCgroup: false, SkipImageFSUUID: false, + EnableIPv6DAD: false, }, ContainerdRootDir: "/var/lib/containerd", ContainerdEndpoint: "/run/containerd/containerd.sock", diff --git a/pkg/server/helpers.go b/pkg/server/helpers.go index 2d9399ff2..48b98f2c7 100644 --- a/pkg/server/helpers.go +++ b/pkg/server/helpers.go @@ -19,6 +19,7 @@ package server import ( "encoding/json" "fmt" + "os/exec" "path" "path/filepath" "strconv" @@ -36,6 +37,7 @@ import ( "github.com/opencontainers/selinux/go-selinux/label" "golang.org/x/net/context" "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime" + "k8s.io/kubernetes/pkg/util/sysctl" "github.com/containerd/cri-containerd/pkg/store" imagestore "github.com/containerd/cri-containerd/pkg/store/image" @@ -383,3 +385,35 @@ func newSpecGenerator(spec *runtimespec.Spec) generate.Generator { g.HostSpecific = true return g } + +// disableNetNSDAD disables duplicate address detection in the network namespace. +// DAD has a negative effect on sandbox start latency, since we have to wait +// a second or more for the addresses to leave the "tentative" state. +func disableNetNSDAD(ns string) error { + dad := "net/ipv6/conf/default/accept_dad" + + sysctlBin, err := exec.LookPath("sysctl") + if err != nil { + return fmt.Errorf("could not find sysctl binary: %v", err) + } + + nsenterBin, err := exec.LookPath("nsenter") + if err != nil { + return fmt.Errorf("could not find nsenter binary: %v", err) + } + + // If the sysctl doesn't exist, it means ipv6 is disabled. 
+ if _, err := sysctl.New().GetSysctl(dad); err != nil { + return nil + } + + output, err := exec.Command(nsenterBin, + fmt.Sprintf("--net=%s", ns), "-F", "--", + sysctlBin, "-w", fmt.Sprintf("%s=%s", dad, "0"), + ).CombinedOutput() + if err != nil { + return fmt.Errorf("failed to write sysctl %q - output: %s, error: %s", + dad, output, err) + } + return nil +} diff --git a/pkg/server/sandbox_run.go b/pkg/server/sandbox_run.go index 0d0dfd4a2..ff26f9f19 100644 --- a/pkg/server/sandbox_run.go +++ b/pkg/server/sandbox_run.go @@ -102,6 +102,16 @@ func (c *criContainerdService) RunPodSandbox(ctx context.Context, r *runtime.Run sandbox.NetNSPath = "" } }() + if !c.config.EnableIPv6DAD { + // It's a known issue that IPv6 DAD increases sandbox start latency by several seconds. + // Disable it when it's not enabled to avoid the latency. + // See: + // * https://github.com/kubernetes/kubernetes/issues/54651 + // * https://www.agwa.name/blog/post/beware_the_ipv6_dad_race_condition + if err := disableNetNSDAD(sandbox.NetNSPath); err != nil { + return nil, fmt.Errorf("failed to disable DAD for sandbox %q: %v", id, err) + } + } // Setup network for sandbox. podNetwork := ocicni.PodNetwork{ Name: config.GetMetadata().GetName(), diff --git a/vendor/github.com/docker/docker/hack/README.md b/vendor/github.com/docker/docker/hack/README.md deleted file mode 100644 index 9e588db25..000000000 --- a/vendor/github.com/docker/docker/hack/README.md +++ /dev/null @@ -1,60 +0,0 @@ -## About - -This directory contains a collection of scripts used to build and manage this -repository. If there are any issues regarding the intention of a particular -script (or even part of a certain script), please reach out to us. -It may help us either refine our current scripts, or add on new ones -that are appropriate for a given use case. - -## DinD (dind.sh) - -DinD is a wrapper script which allows Docker to be run inside a Docker -container. 
DinD requires the container to -be run with privileged mode enabled. - -## Generate Authors (generate-authors.sh) - -Generates AUTHORS; a file with all the names and corresponding emails of -individual contributors. AUTHORS can be found in the home directory of -this repository. - -## Make - -There are two make files, each with different extensions. Neither are supposed -to be called directly; only invoke `make`. Both scripts run inside a Docker -container. - -### make.ps1 - -- The Windows native build script that uses PowerShell semantics; it is limited -unlike `hack\make.sh` since it does not provide support for the full set of -operations provided by the Linux counterpart, `make.sh`. However, `make.ps1` -does provide support for local Windows development and Windows to Windows CI. -More information is found within `make.ps1` by the author, @jhowardmsft - -### make.sh - -- Referenced via `make test` when running tests on a local machine, -or directly referenced when running tests inside a Docker development container. -- When running on a local machine, `make test` to run all tests found in -`test`, `test-unit`, `test-integration`, and `test-docker-py` on -your local machine. The default timeout is set in `make.sh` to 60 minutes -(`${TIMEOUT:=60m}`), since it currently takes up to an hour to run -all of the tests. -- When running inside a Docker development container, `hack/make.sh` does -not have a single target that runs all the tests. You need to provide a -single command line with multiple targets that performs the same thing. 
-An example referenced from [Run targets inside a development container](https://docs.docker.com/opensource/project/test-and-docs/#run-targets-inside-a-development-container): `root@5f8630b873fe:/go/src/github.com/moby/moby# hack/make.sh dynbinary binary cross test-unit test-integration test-docker-py` -- For more information related to testing outside the scope of this README, -refer to -[Run tests and test documentation](https://docs.docker.com/opensource/project/test-and-docs/) - -## Release (release.sh) - -Releases any bundles built by `make` on a public AWS S3 bucket. -For information regarding configuration, please view `release.sh`. - -## Vendor (vendor.sh) - -A shell script that is a wrapper around Vndr. For information on how to use -this, please refer to [vndr's README](https://github.com/LK4D4/vndr/blob/master/README.md) diff --git a/vendor/github.com/docker/docker/hack/integration-cli-on-swarm/README.md b/vendor/github.com/docker/docker/hack/integration-cli-on-swarm/README.md deleted file mode 100644 index 1cea52526..000000000 --- a/vendor/github.com/docker/docker/hack/integration-cli-on-swarm/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# Integration Testing on Swarm - -IT on Swarm allows you to execute integration test in parallel across a Docker Swarm cluster - -## Architecture - -### Master service - - - Works as a funker caller - - Calls a worker funker (`-worker-service`) with a chunk of `-check.f` filter strings (passed as a file via `-input` flag, typically `/mnt/input`) - -### Worker service - - - Works as a funker callee - - Executes an equivalent of `TESTFLAGS=-check.f TestFoo|TestBar|TestBaz ... make test-integration-cli` using the bind-mounted API socket (`docker.sock`) - -### Client - - - Controls master and workers via `docker stack` - - No need to have a local daemon - -Typically, the master and workers are supposed to be running on a cloud environment, -while the client is supposed to be running on a laptop, e.g. Docker for Mac/Windows. 
- -## Requirement - - - Docker daemon 1.13 or later - - Private registry for distributed execution with multiple nodes - -## Usage - -### Step 1: Prepare images - - $ make build-integration-cli-on-swarm - -Following environment variables are known to work in this step: - - - `BUILDFLAGS` - - `DOCKER_INCREMENTAL_BINARY` - -Note: during the transition into Moby Project, you might need to create a symbolic link `$GOPATH/src/github.com/docker/docker` to `$GOPATH/src/github.com/moby/moby`. - -### Step 2: Execute tests - - $ ./hack/integration-cli-on-swarm/integration-cli-on-swarm -replicas 40 -push-worker-image YOUR_REGISTRY.EXAMPLE.COM/integration-cli-worker:latest - -Following environment variables are known to work in this step: - - - `DOCKER_GRAPHDRIVER` - - `DOCKER_EXPERIMENTAL` - -#### Flags - -Basic flags: - - - `-replicas N`: the number of worker service replicas. i.e. degree of parallelism. - - `-chunks N`: the number of chunks. By default, `chunks` == `replicas`. - - `-push-worker-image REGISTRY/IMAGE:TAG`: push the worker image to the registry. Note that if you have only single node and hence you do not need a private registry, you do not need to specify `-push-worker-image`. - -Experimental flags for mitigating makespan nonuniformity: - - - `-shuffle`: Shuffle the test filter strings - -Flags for debugging IT on Swarm itself: - - - `-rand-seed N`: the random seed. This flag is useful for deterministic replaying. By default(0), the timestamp is used. - - `-filters-file FILE`: the file contains `-check.f` strings. By default, the file is automatically generated. 
- - `-dry-run`: skip the actual workload - - `keep-executor`: do not auto-remove executor containers, which is used for running privileged programs on Swarm diff --git a/vendor/github.com/docker/docker/hack/integration-cli-on-swarm/agent/vendor.conf b/vendor/github.com/docker/docker/hack/integration-cli-on-swarm/agent/vendor.conf deleted file mode 100644 index efd6d6d04..000000000 --- a/vendor/github.com/docker/docker/hack/integration-cli-on-swarm/agent/vendor.conf +++ /dev/null @@ -1,2 +0,0 @@ -# dependencies specific to worker (i.e. github.com/docker/docker/...) are not vendored here -github.com/bfirsh/funker-go eaa0a2e06f30e72c9a0b7f858951e581e26ef773 diff --git a/vendor/k8s.io/kubernetes/pkg/util/sysctl/sysctl.go b/vendor/k8s.io/kubernetes/pkg/util/sysctl/sysctl.go new file mode 100644 index 000000000..5c01dd88e --- /dev/null +++ b/vendor/k8s.io/kubernetes/pkg/util/sysctl/sysctl.go @@ -0,0 +1,78 @@ +/* +Copyright 2015 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package sysctl + +import ( + "io/ioutil" + "path" + "strconv" + "strings" +) + +const ( + sysctlBase = "/proc/sys" + VmOvercommitMemory = "vm/overcommit_memory" + VmPanicOnOOM = "vm/panic_on_oom" + KernelPanic = "kernel/panic" + KernelPanicOnOops = "kernel/panic_on_oops" + RootMaxKeys = "kernel/keys/root_maxkeys" + RootMaxBytes = "kernel/keys/root_maxbytes" + + VmOvercommitMemoryAlways = 1 // kernel performs no memory over-commit handling + VmPanicOnOOMInvokeOOMKiller = 0 // kernel calls the oom_killer function when OOM occurs + + KernelPanicOnOopsAlways = 1 // kernel panics on kernel oops + KernelPanicRebootTimeout = 10 // seconds after a panic for the kernel to reboot + + RootMaxKeysSetting = 1000000 // Needed since docker creates a new key per container + RootMaxBytesSetting = RootMaxKeysSetting * 25 // allocate 25 bytes per key * number of MaxKeys +) + +// An injectable interface for running sysctl commands. +type Interface interface { + // GetSysctl returns the value for the specified sysctl setting + GetSysctl(sysctl string) (int, error) + // SetSysctl modifies the specified sysctl flag to the new value + SetSysctl(sysctl string, newVal int) error +} + +// New returns a new Interface for accessing sysctl +func New() Interface { + return &procSysctl{} +} + +// procSysctl implements Interface by reading and writing files under /proc/sys +type procSysctl struct { +} + +// GetSysctl returns the value for the specified sysctl setting +func (_ *procSysctl) GetSysctl(sysctl string) (int, error) { + data, err := ioutil.ReadFile(path.Join(sysctlBase, sysctl)) + if err != nil { + return -1, err + } + val, err := strconv.Atoi(strings.Trim(string(data), " \n")) + if err != nil { + return -1, err + } + return val, nil +} + +// SetSysctl modifies the specified sysctl flag to the new value +func (_ *procSysctl) SetSysctl(sysctl string, newVal int) error { + return ioutil.WriteFile(path.Join(sysctlBase, sysctl), []byte(strconv.Itoa(newVal)), 0640) +}