diff --git a/cluster/saltbase/salt/kubelet/default b/cluster/saltbase/salt/kubelet/default
index e34369ea1bc..02c39c49821 100644
--- a/cluster/saltbase/salt/kubelet/default
+++ b/cluster/saltbase/salt/kubelet/default
@@ -58,10 +58,12 @@
   {% set configure_cbr0 = "--configure-cbr0=" + pillar['allocate_node_cidrs'] -%}
 {% endif -%}
 
-# Run containers under the root cgroup.
+# Run containers under the root cgroup and create a system container.
+{% set system_container = "" -%}
 {% set cgroup_root = "" -%}
 {% if grains['os_family'] == 'Debian' -%}
+  {% set system_container = "--system-container=/system" -%}
   {% set cgroup_root = "--cgroup_root=/" -%}
 {% endif -%}
 
-DAEMON_ARGS="{{daemon_args}} {{api_servers_with_port}} {{hostname_override}} {{cloud_provider}} {{config}} --allow_privileged={{pillar['allow_privileged']}} {{pillar['log_level']}} {{cluster_dns}} {{cluster_domain}} {{docker_root}} {{configure_cbr0}} {{cgroup_root}}"
+DAEMON_ARGS="{{daemon_args}} {{api_servers_with_port}} {{hostname_override}} {{cloud_provider}} {{config}} --allow_privileged={{pillar['allow_privileged']}} {{pillar['log_level']}} {{cluster_dns}} {{cluster_domain}} {{docker_root}} {{configure_cbr0}} {{cgroup_root}} {{system_container}}"
diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go
index e9845700fc7..0c6d75cc30d 100644
--- a/cmd/kubelet/app/server.go
+++ b/cmd/kubelet/app/server.go
@@ -108,6 +108,7 @@ type KubeletServer struct {
 	CgroupRoot            string
 	ContainerRuntime      string
 	DockerDaemonContainer string
+	SystemContainer       string
 	ConfigureCBR0         bool
 	MaxPods               int
 
@@ -170,6 +171,7 @@ func NewKubeletServer() *KubeletServer {
 		CgroupRoot:            "",
 		ContainerRuntime:      "docker",
 		DockerDaemonContainer: "/docker-daemon",
+		SystemContainer:       "",
 		ConfigureCBR0:         false,
 	}
 }
@@ -228,7 +230,8 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
 	fs.StringVar(&s.ResourceContainer, "resource-container", s.ResourceContainer, "Absolute name of the resource-only container to create and run the Kubelet in (Default: /kubelet).")
 	fs.StringVar(&s.CgroupRoot, "cgroup_root", s.CgroupRoot, "Optional root cgroup to use for pods. This is handled by the container runtime on a best effort basis. Default: '', which means use the container runtime default.")
 	fs.StringVar(&s.ContainerRuntime, "container_runtime", s.ContainerRuntime, "The container runtime to use. Possible values: 'docker', 'rkt'. Default: 'docker'.")
 	fs.StringVar(&s.DockerDaemonContainer, "docker-daemon-container", s.DockerDaemonContainer, "Optional resource-only container in which to place the Docker Daemon. Empty for no container (Default: /docker-daemon).")
+	fs.StringVar(&s.SystemContainer, "system-container", s.SystemContainer, "Optional resource-only container in which to place all non-kernel processes that are not already in a container. Empty for no container. Rolling back the flag requires a reboot. (Default: \"\").")
 	fs.BoolVar(&s.ConfigureCBR0, "configure-cbr0", s.ConfigureCBR0, "If true, kubelet will configure cbr0 based on Node.Spec.PodCIDR.")
 	fs.IntVar(&s.MaxPods, "max-pods", 100, "Number of Pods that can run on this Kubelet.")
 
@@ -347,6 +350,7 @@ func (s *KubeletServer) Run(_ []string) error {
 		ContainerRuntime:      s.ContainerRuntime,
 		Mounter:               mounter,
 		DockerDaemonContainer: s.DockerDaemonContainer,
+		SystemContainer:       s.SystemContainer,
 		ConfigureCBR0:         s.ConfigureCBR0,
 		MaxPods:               s.MaxPods,
 	}
@@ -513,6 +517,7 @@ func SimpleKubelet(client *client.Client,
 		ContainerRuntime:      "docker",
 		Mounter:               mount.New(),
 		DockerDaemonContainer: "/docker-daemon",
+		SystemContainer:       "",
 		MaxPods:               32,
 	}
 	return &kcfg
@@ -648,6 +653,7 @@ type KubeletConfig struct {
 	ContainerRuntime      string
 	Mounter               mount.Interface
 	DockerDaemonContainer string
+	SystemContainer       string
 	ConfigureCBR0         bool
 	MaxPods               int
 }
@@ -701,6 +707,7 @@ func createAndInitKubelet(kc *KubeletConfig) (k KubeletBootstrap, pc *config.Pod
 		kc.ContainerRuntime,
 		kc.Mounter,
 		kc.DockerDaemonContainer,
+		kc.SystemContainer,
 		kc.ConfigureCBR0,
 		kc.MaxPods)
 
diff --git
a/pkg/kubelet/container_manager.go b/pkg/kubelet/container_manager.go
index 84b20de18da..188646c45d3 100644
--- a/pkg/kubelet/container_manager.go
+++ b/pkg/kubelet/container_manager.go
@@ -20,5 +20,6 @@ package kubelet
 type containerManager interface {
 	// Runs the container manager's housekeeping.
 	// - Ensures that the Docker daemon is in a container.
+	// - Creates the system container where all non-containerized processes run.
 	Start() error
 }
diff --git a/pkg/kubelet/container_manager_linux.go b/pkg/kubelet/container_manager_linux.go
index 330e4bf43f9..af1d53a5007 100644
--- a/pkg/kubelet/container_manager_linux.go
+++ b/pkg/kubelet/container_manager_linux.go
@@ -35,33 +35,60 @@ import (
 )
 
 type containerManagerImpl struct {
-	// Absolute name of the desired container that Docker should be in.
-	dockerContainerName string
+	// Whether to create and use the specified containers.
+	useDockerContainer bool
+	useSystemContainer bool
 
-	// The manager of the resource-only container Docker should be in.
-	manager fs.Manager
+	// OOM score for the Docker container.
 	dockerOomScoreAdj int
+
+	// Managers for containers.
+	dockerContainer fs.Manager
+	systemContainer fs.Manager
+	rootContainer   fs.Manager
 }
 
 var _ containerManager = &containerManagerImpl{}
 
-// Takes the absolute name that the Docker daemon should be in.
-// Empty container name disables moving the Docker daemon.
-func newContainerManager(dockerDaemonContainer string) (containerManager, error) {
+// Takes the absolute name of the specified containers.
+// Empty container name disables use of the specified container.
+func newContainerManager(dockerDaemonContainer, systemContainer string) (containerManager, error) {
+	if systemContainer == "/" {
+		return nil, fmt.Errorf("system container cannot be root (\"/\")")
+	}
+
 	return &containerManagerImpl{
-		dockerContainerName: dockerDaemonContainer,
-		manager: fs.Manager{
+		useDockerContainer: dockerDaemonContainer != "",
+		useSystemContainer: systemContainer != "",
+		dockerOomScoreAdj:  -900,
+		dockerContainer: fs.Manager{
 			Cgroups: &configs.Cgroup{
 				Name:            dockerDaemonContainer,
 				AllowAllDevices: true,
 			},
 		},
-		dockerOomScoreAdj: -900,
+		systemContainer: fs.Manager{
+			Cgroups: &configs.Cgroup{
+				Name:            systemContainer,
+				AllowAllDevices: true,
+			},
+		},
+		rootContainer: fs.Manager{
+			Cgroups: &configs.Cgroup{
+				Name: "/",
+			},
+		},
 	}, nil
 }
 
 func (cm *containerManagerImpl) Start() error {
-	if cm.dockerContainerName != "" {
+	if cm.useSystemContainer {
+		err := cm.ensureSystemContainer()
+		if err != nil {
+			return err
+		}
+	}
+	if cm.useDockerContainer {
 		go util.Until(func() {
 			err := cm.ensureDockerInContainer()
 			if err != nil {
@@ -99,10 +126,10 @@ func (cm *containerManagerImpl) ensureDockerInContainer() error {
 			errs = append(errs, fmt.Errorf("failed to find container of PID %q: %v", pid, err))
 		}
 
-		if cont != cm.dockerContainerName {
-			err = cm.manager.Apply(pid)
+		if cont != cm.dockerContainer.Cgroups.Name {
+			err = cm.dockerContainer.Apply(pid)
 			if err != nil {
-				errs = append(errs, fmt.Errorf("failed to move PID %q (in %q) to %q", pid, cont, cm.dockerContainerName))
+				errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, cm.dockerContainer.Cgroups.Name, err))
 			}
 		}
 
@@ -125,3 +152,64 @@ func getContainer(pid int) (string, error) {
 
 	return cgroups.ParseCgroupFile("cpu", f)
 }
+
+// Ensures the system container is created and all non-kernel processes without
+// a container are moved to it.
+//
+// The PIDs of the root container are listed again on every attempt. Returns nil
+// as soon as a listing contains no non-kernel PIDs. Once the attempts are used
+// up, returns the errors of the last attempt plus an out-of-attempts error.
+func (cm *containerManagerImpl) ensureSystemContainer() error {
+	// Move non-kernel PIDs to the system container.
+	attemptsRemaining := 10
+	var errs []error
+	for attemptsRemaining >= 0 {
+		// Only keep errors on latest attempt.
+		errs = []error{}
+		attemptsRemaining--
+
+		allPids, err := cm.rootContainer.GetPids()
+		if err != nil {
+			errs = append(errs, fmt.Errorf("Failed to list PIDs for root: %v", err))
+			continue
+		}
+
+		// Remove kernel pids
+		pids := make([]int, 0, len(allPids))
+		for _, pid := range allPids {
+			if isKernelPid(pid) {
+				continue
+			}
+
+			pids = append(pids, pid)
+		}
+		glog.Infof("Found %d PIDs in root, %d of them are kernel related", len(allPids), len(allPids)-len(pids))
+
+		// All the non-kernel PIDs have been moved. Return here instead of
+		// breaking out of the loop so that success on the final attempt is not
+		// reported as running out of attempts.
+		if len(pids) == 0 {
+			return nil
+		}
+
+		glog.Infof("Moving non-kernel threads: %v", pids)
+		for _, pid := range pids {
+			err := cm.systemContainer.Apply(pid)
+			if err != nil {
+				errs = append(errs, fmt.Errorf("failed to move PID %d into the system container %q: %v", pid, cm.systemContainer.Cgroups.Name, err))
+			}
+		}
+	}
+
+	// The loop only falls through when no attempt succeeded.
+	errs = append(errs, fmt.Errorf("ran out of attempts to create system containers %q", cm.systemContainer.Cgroups.Name))
+	return errors.NewAggregate(errs)
+}
+
+// Determines whether the specified PID is a kernel PID.
+// Any failure to read the /proc/<pid>/exe link is treated as "kernel PID".
+func isKernelPid(pid int) bool {
+	// Kernel threads have no associated executable.
+	_, err := os.Readlink(fmt.Sprintf("/proc/%d/exe", pid))
+	return err != nil
+}
diff --git a/pkg/kubelet/container_manager_unsupported.go b/pkg/kubelet/container_manager_unsupported.go
index 6c543e1e617..77246f174a4 100644
--- a/pkg/kubelet/container_manager_unsupported.go
+++ b/pkg/kubelet/container_manager_unsupported.go
@@ -31,6 +31,6 @@ func (unsupportedContainerManager) Start() error {
 	return fmt.Errorf("Container Manager is unsupported in this build")
 }
 
-func newContainerManager(dockerDaemonContainer string) (containerManager, error) {
+func newContainerManager(dockerDaemonContainer, systemContainer string) (containerManager, error) {
 	return &unsupportedContainerManager{}, nil
 }
diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go
index 7614a1149dc..ad2c09feb77 100644
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -139,6 +139,7 @@ func NewMainKubelet(
 	containerRuntime string,
 	mounter mount.Interface,
 	dockerDaemonContainer string,
+	systemContainer string,
 	configureCBR0 bool,
 	pods int) (*Kubelet, error) {
 	if rootDirectory == "" {
@@ -147,6 +148,9 @@ func NewMainKubelet(
 	if resyncInterval <= 0 {
 		return nil, fmt.Errorf("invalid sync frequency %d", resyncInterval)
 	}
+	if systemContainer != "" && cgroupRoot == "" {
+		return nil, fmt.Errorf("invalid configuration: system container was specified and cgroup root was not specified")
+	}
 	dockerClient = dockertools.NewInstrumentedDockerInterface(dockerClient)
 
 	serviceStore := cache.NewStore(cache.MetaNamespaceKeyFunc)
@@ -295,7 +299,9 @@ func NewMainKubelet(
 		return nil, fmt.Errorf("unsupported container runtime %q specified", containerRuntime)
 	}
 
-	containerManager, err := newContainerManager(dockerDaemonContainer)
+	// Setup container manager, can fail if the devices hierarchy is not mounted
+	// (it is required by Docker however).
+	containerManager, err := newContainerManager(dockerDaemonContainer, systemContainer)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create the Container Manager: %v", err)
 	}