382 lines
14 KiB
Go
382 lines
14 KiB
Go
/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
|
|
|
|
package minion
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"os"
|
|
"os/signal"
|
|
"path"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
|
|
log "github.com/golang/glog"
|
|
"github.com/kardianos/osext"
|
|
"github.com/spf13/pflag"
|
|
"gopkg.in/natefinch/lumberjack.v2"
|
|
kubeletapp "k8s.io/kubernetes/cmd/kubelet/app"
|
|
exservice "k8s.io/kubernetes/contrib/mesos/pkg/executor/service"
|
|
"k8s.io/kubernetes/contrib/mesos/pkg/flagutil"
|
|
"k8s.io/kubernetes/contrib/mesos/pkg/hyperkube"
|
|
"k8s.io/kubernetes/contrib/mesos/pkg/minion/config"
|
|
"k8s.io/kubernetes/contrib/mesos/pkg/minion/tasks"
|
|
"k8s.io/kubernetes/pkg/api/resource"
|
|
"k8s.io/kubernetes/pkg/client/restclient"
|
|
)
|
|
|
|
// Log file names handed to launchHyperkubeServer for the two kinds of
// sub-processes spawned by the minion.
const (
	// proxyLogFilename receives the output of the kube-proxy sub-process.
	proxyLogFilename = "proxy.log"

	// executorLogFilename receives the output of the kubelet-executor sub-process.
	executorLogFilename = "executor.log"
)
|
|
|
|
type MinionServer struct {
|
|
// embed the executor server to be able to use its flags
|
|
// TODO(sttts): get rid of this mixing of the minion and the executor server with a multiflags implementation for km
|
|
KubeletExecutorServer *exservice.KubeletExecutorServer
|
|
|
|
privateMountNS bool
|
|
hks hyperkube.Interface
|
|
clientConfig *restclient.Config
|
|
kmBinary string
|
|
tasks []*tasks.Task
|
|
|
|
pathOverride string // the PATH environment for the sub-processes
|
|
cgroupPrefix string // e.g. mesos
|
|
cgroupRoot string // the cgroupRoot that we pass to the kubelet-executor, depends on containPodResources
|
|
mesosCgroup string // discovered mesos cgroup root, e.g. /mesos/{container-id}
|
|
containPodResources bool
|
|
|
|
logMaxSize resource.Quantity
|
|
logMaxBackups int
|
|
logMaxAgeInDays int
|
|
logVerbosity int32 // see glog.Level
|
|
|
|
runProxy bool
|
|
proxyKubeconfig string
|
|
proxyLogV int
|
|
proxyBindall bool
|
|
proxyMode string
|
|
conntrackMax int
|
|
conntrackTCPTimeoutEstablished int
|
|
}
|
|
|
|
// NewMinionServer creates the MinionServer struct with default values to be used by hyperkube
|
|
func NewMinionServer() *MinionServer {
|
|
s := &MinionServer{
|
|
KubeletExecutorServer: exservice.NewKubeletExecutorServer(),
|
|
privateMountNS: false, // disabled until Docker supports customization of the parent mount namespace
|
|
cgroupPrefix: config.DefaultCgroupPrefix,
|
|
containPodResources: true,
|
|
logMaxSize: config.DefaultLogMaxSize(),
|
|
logMaxBackups: config.DefaultLogMaxBackups,
|
|
logMaxAgeInDays: config.DefaultLogMaxAgeInDays,
|
|
runProxy: true,
|
|
proxyMode: "userspace", // upstream default is "iptables" post-v1.1
|
|
}
|
|
|
|
// cache this for later use
|
|
binary, err := osext.Executable()
|
|
if err != nil {
|
|
log.Fatalf("failed to determine currently running executable: %v", err)
|
|
}
|
|
s.kmBinary = binary
|
|
|
|
return s
|
|
}
|
|
|
|
// filterArgsByFlagSet returns a list of args which are parsed by the given flag set
|
|
// and another list with those which do not match
|
|
func filterArgsByFlagSet(args []string, flags *pflag.FlagSet) ([]string, []string) {
|
|
matched := []string{}
|
|
notMatched := []string{}
|
|
for _, arg := range args {
|
|
err := flags.Parse([]string{arg})
|
|
if err != nil {
|
|
notMatched = append(notMatched, arg)
|
|
} else {
|
|
matched = append(matched, arg)
|
|
}
|
|
}
|
|
return matched, notMatched
|
|
}
|
|
|
|
func findMesosCgroup(prefix string) (cgroupPath string, containerID string) {
|
|
// derive our cgroup from MESOS_DIRECTORY environment
|
|
mesosDir := os.Getenv("MESOS_DIRECTORY")
|
|
if mesosDir == "" {
|
|
log.V(2).Infof("cannot derive executor's cgroup because MESOS_DIRECTORY is empty")
|
|
return
|
|
}
|
|
|
|
containerID = path.Base(mesosDir)
|
|
if containerID == "" {
|
|
log.V(2).Infof("cannot derive executor's cgroup from MESOS_DIRECTORY=%q", mesosDir)
|
|
return
|
|
}
|
|
|
|
cgroupPath = path.Join("/", prefix, containerID)
|
|
return
|
|
}
|
|
|
|
func (ms *MinionServer) launchProxyServer() {
|
|
bindAddress := "0.0.0.0"
|
|
if !ms.proxyBindall {
|
|
bindAddress = ms.KubeletExecutorServer.Address
|
|
}
|
|
args := []string{
|
|
fmt.Sprintf("--bind-address=%s", bindAddress),
|
|
fmt.Sprintf("--v=%d", ms.proxyLogV),
|
|
"--logtostderr=true",
|
|
// TODO(jdef) resource-container is going away completely at some point, but
|
|
// we need to override it here to disable the current default behavior
|
|
"--resource-container=", // disable this; mesos slave doesn't like sub-containers yet
|
|
"--proxy-mode=" + ms.proxyMode,
|
|
"--conntrack-max=" + strconv.Itoa(ms.conntrackMax),
|
|
"--conntrack-tcp-timeout-established=" + strconv.Itoa(ms.conntrackTCPTimeoutEstablished),
|
|
}
|
|
if ms.proxyKubeconfig != "" {
|
|
args = append(args, fmt.Sprintf("--kubeconfig=%s", ms.proxyKubeconfig))
|
|
}
|
|
if ms.clientConfig.Host != "" {
|
|
args = append(args, fmt.Sprintf("--master=%s", ms.clientConfig.Host))
|
|
}
|
|
if ms.KubeletExecutorServer.HostnameOverride != "" {
|
|
args = append(args, fmt.Sprintf("--hostname-override=%s", ms.KubeletExecutorServer.HostnameOverride))
|
|
}
|
|
|
|
ms.launchHyperkubeServer(hyperkube.CommandProxy, args, proxyLogFilename)
|
|
}
|
|
|
|
// launchExecutorServer returns a chan that closes upon kubelet-executor death. since the kubelet-
|
|
// executor doesn't support failover right now, the right thing to do is to fail completely since all
|
|
// pods will be lost upon restart and we want mesos to recover the resources from them.
|
|
func (ms *MinionServer) launchExecutorServer(containerID string) <-chan struct{} {
|
|
allArgs := os.Args[2:]
|
|
|
|
// filter out minion flags, leaving those for the executor
|
|
executorFlags := pflag.NewFlagSet("executor", pflag.ContinueOnError)
|
|
executorFlags.SetOutput(ioutil.Discard)
|
|
ms.AddExecutorFlags(executorFlags)
|
|
executorArgs, _ := filterArgsByFlagSet(allArgs, executorFlags)
|
|
|
|
// disable resource-container; mesos slave doesn't like sub-containers yet
|
|
executorArgs = append(executorArgs, "--kubelet-cgroups=")
|
|
|
|
appendOptional := func(name, value string) {
|
|
if value != "" {
|
|
executorArgs = append(executorArgs, "--"+name+"="+value)
|
|
}
|
|
}
|
|
appendOptional("cgroup-root", ms.cgroupRoot)
|
|
|
|
// forward global cadvisor flag values to the executor
|
|
// TODO(jdef) remove this code once cadvisor global flags have been cleaned up
|
|
appendOptional(flagutil.Cadvisor.HousekeepingInterval.NameValue())
|
|
appendOptional(flagutil.Cadvisor.GlobalHousekeepingInterval.NameValue())
|
|
|
|
// forward containerID so that the executor may pass it along to containers that it launches
|
|
var ctidOpt tasks.Option
|
|
ctidOpt = func(t *tasks.Task) tasks.Option {
|
|
oldenv := t.Env[:]
|
|
t.Env = append(t.Env, "MESOS_EXECUTOR_CONTAINER_UUID="+containerID)
|
|
return func(t2 *tasks.Task) tasks.Option {
|
|
t2.Env = oldenv
|
|
return ctidOpt
|
|
}
|
|
}
|
|
|
|
// run executor and quit minion server when this exits cleanly
|
|
execDied := make(chan struct{})
|
|
ms.launchHyperkubeServer(hyperkube.CommandExecutor, executorArgs, executorLogFilename, tasks.NoRespawn(execDied), ctidOpt)
|
|
return execDied
|
|
}
|
|
|
|
func (ms *MinionServer) launchHyperkubeServer(server string, args []string, logFileName string, options ...tasks.Option) {
|
|
log.V(2).Infof("Spawning hyperkube %v with args '%+v'", server, args)
|
|
|
|
kmArgs := append([]string{server}, args...)
|
|
maxSize := ms.logMaxSize.Value()
|
|
if maxSize > 0 {
|
|
// convert to MB
|
|
maxSize = maxSize / 1024 / 1024
|
|
if maxSize == 0 {
|
|
log.Warning("maximal log file size is rounded to 1 MB")
|
|
maxSize = 1
|
|
}
|
|
}
|
|
|
|
writerFunc := func() io.WriteCloser {
|
|
return &lumberjack.Logger{
|
|
Filename: logFileName,
|
|
MaxSize: int(maxSize),
|
|
MaxBackups: ms.logMaxBackups,
|
|
MaxAge: ms.logMaxAgeInDays,
|
|
}
|
|
}
|
|
|
|
// prepend env, allow later options to customize further
|
|
options = append([]tasks.Option{tasks.Environment(os.Environ()), ms.applyPathOverride()}, options...)
|
|
|
|
t := tasks.New(server, ms.kmBinary, kmArgs, writerFunc, options...)
|
|
go t.Start()
|
|
ms.tasks = append(ms.tasks, t)
|
|
}
|
|
|
|
// applyPathOverride overrides PATH and also adds $SANDBOX/bin (needed for locating bundled binary deps
|
|
// as well as external deps like iptables)
|
|
func (ms *MinionServer) applyPathOverride() tasks.Option {
|
|
return func(t *tasks.Task) tasks.Option {
|
|
kmEnv := make([]string, 0, len(t.Env))
|
|
for _, e := range t.Env {
|
|
if !strings.HasPrefix(e, "PATH=") {
|
|
kmEnv = append(kmEnv, e)
|
|
} else {
|
|
if ms.pathOverride != "" {
|
|
e = "PATH=" + ms.pathOverride
|
|
}
|
|
pwd, err := os.Getwd()
|
|
if err != nil {
|
|
panic(fmt.Errorf("Cannot get current directory: %v", err))
|
|
}
|
|
kmEnv = append(kmEnv, fmt.Sprintf("%s:%s", e, path.Join(pwd, "bin")))
|
|
}
|
|
}
|
|
oldenv := t.Env
|
|
t.Env = kmEnv
|
|
return tasks.Environment(oldenv)
|
|
}
|
|
}
|
|
|
|
// runs the main kubelet loop, closing the kubeletFinished chan when the loop exits.
|
|
// never returns.
|
|
func (ms *MinionServer) Run(hks hyperkube.Interface, _ []string) error {
|
|
if ms.privateMountNS {
|
|
// only the Linux version will do anything
|
|
enterPrivateMountNamespace()
|
|
}
|
|
|
|
// create apiserver client
|
|
clientConfig, err := kubeletapp.CreateAPIServerClientConfig(ms.KubeletExecutorServer.KubeletServer)
|
|
if err != nil {
|
|
// required for k8sm since we need to send api.Binding information
|
|
// back to the apiserver
|
|
log.Fatalf("No API client: %v", err)
|
|
}
|
|
ms.clientConfig = clientConfig
|
|
|
|
// derive the executor cgroup and use it as:
|
|
// - pod container cgroup root (e.g. docker cgroup-parent, optionally; see comments below)
|
|
// - parent of kubelet container
|
|
// - parent of kube-proxy container
|
|
containerID := ""
|
|
ms.mesosCgroup, containerID = findMesosCgroup(ms.cgroupPrefix)
|
|
log.Infof("discovered mesos cgroup at %q", ms.mesosCgroup)
|
|
|
|
// hack alert, this helps to work around systemd+docker+mesos integration problems
|
|
// when docker's cgroup-parent flag is used (!containPodResources = don't use the docker flag)
|
|
if ms.containPodResources {
|
|
ms.cgroupRoot = ms.mesosCgroup
|
|
}
|
|
|
|
cgroupLogger := log.Infof
|
|
if ms.cgroupRoot == "" {
|
|
cgroupLogger = log.Warningf
|
|
}
|
|
|
|
cgroupLogger("using cgroup-root %q", ms.cgroupRoot)
|
|
|
|
// run subprocesses until ms.done is closed on return of this function
|
|
if ms.runProxy {
|
|
ms.launchProxyServer()
|
|
}
|
|
|
|
// abort closes when the kubelet-executor dies
|
|
abort := ms.launchExecutorServer(containerID)
|
|
shouldQuit := termSignalListener(abort)
|
|
te := tasks.MergeOutput(ms.tasks, shouldQuit)
|
|
|
|
// TODO(jdef) do something fun here, such as reporting task completion to the apiserver
|
|
|
|
<-te.Close().Done() // we don't listen for any specific events yet; wait for all tasks to finish
|
|
return nil
|
|
}
|
|
|
|
// termSignalListener returns a signal chan that closes when either (a) the process receives a termination
|
|
// signal: SIGTERM, SIGINT, or SIGHUP; or (b) the abort chan closes.
|
|
func termSignalListener(abort <-chan struct{}) <-chan struct{} {
|
|
shouldQuit := make(chan struct{})
|
|
sigCh := make(chan os.Signal, 1)
|
|
signal.Notify(sigCh)
|
|
|
|
go func() {
|
|
defer close(shouldQuit)
|
|
for {
|
|
select {
|
|
case <-abort:
|
|
log.Infof("executor died, aborting")
|
|
return
|
|
case s, ok := <-sigCh:
|
|
if !ok {
|
|
return
|
|
}
|
|
switch s {
|
|
case os.Interrupt, os.Signal(syscall.SIGTERM), os.Signal(syscall.SIGINT), os.Signal(syscall.SIGHUP):
|
|
log.Infof("received signal %q, aborting", s)
|
|
return
|
|
case os.Signal(syscall.SIGCHLD): // who cares?
|
|
default:
|
|
log.Errorf("unexpected signal: %T %#v", s, s)
|
|
}
|
|
|
|
}
|
|
}
|
|
}()
|
|
return shouldQuit
|
|
}
|
|
|
|
func (ms *MinionServer) AddExecutorFlags(fs *pflag.FlagSet) {
|
|
ms.KubeletExecutorServer.AddFlags(fs)
|
|
|
|
// hack to forward log verbosity flag to the executor
|
|
fs.Int32Var(&ms.logVerbosity, "v", ms.logVerbosity, "log level for V logs")
|
|
}
|
|
|
|
func (ms *MinionServer) AddMinionFlags(fs *pflag.FlagSet) {
|
|
// general minion flags
|
|
fs.StringVar(&ms.cgroupPrefix, "mesos-cgroup-prefix", ms.cgroupPrefix, "The cgroup prefix concatenated with MESOS_DIRECTORY must give the executor cgroup set by Mesos")
|
|
fs.BoolVar(&ms.privateMountNS, "private-mountns", ms.privateMountNS, "Enter a private mount NS before spawning procs (linux only). Experimental, not yet compatible with k8s volumes.")
|
|
fs.StringVar(&ms.pathOverride, "path-override", ms.pathOverride, "Override the PATH in the environment of the sub-processes.")
|
|
fs.BoolVar(&ms.containPodResources, "contain-pod-resources", ms.containPodResources, "Allocate pod CPU and memory resources from offers and reparent pod containers into mesos cgroups; disable if you're having strange mesos/docker/systemd interactions.")
|
|
|
|
// log file flags
|
|
fs.Var(resource.NewQuantityFlagValue(&ms.logMaxSize), "max-log-size", "Maximum log file size for the executor and proxy before rotation")
|
|
fs.IntVar(&ms.logMaxAgeInDays, "max-log-age", ms.logMaxAgeInDays, "Maximum log file age of the executor and proxy in days")
|
|
fs.IntVar(&ms.logMaxBackups, "max-log-backups", ms.logMaxBackups, "Maximum log file backups of the executor and proxy to keep after rotation")
|
|
|
|
// proxy flags
|
|
fs.BoolVar(&ms.runProxy, "run-proxy", ms.runProxy, "Maintain a running kube-proxy instance as a child proc of this kubelet-executor.")
|
|
fs.StringVar(&ms.proxyKubeconfig, "proxy-kubeconfig", ms.proxyKubeconfig, "Path to kubeconfig file used by the child kube-proxy.")
|
|
fs.IntVar(&ms.proxyLogV, "proxy-logv", ms.proxyLogV, "Log verbosity of the child kube-proxy.")
|
|
fs.BoolVar(&ms.proxyBindall, "proxy-bindall", ms.proxyBindall, "When true will cause kube-proxy to bind to 0.0.0.0.")
|
|
fs.StringVar(&ms.proxyMode, "proxy-mode", ms.proxyMode, "Which proxy mode to use: 'userspace' (older) or 'iptables' (faster). If the iptables proxy is selected, regardless of how, but the system's kernel or iptables versions are insufficient, this always falls back to the userspace proxy.")
|
|
fs.IntVar(&ms.conntrackMax, "conntrack-max", ms.conntrackMax, "Maximum number of NAT connections to track on agent nodes (0 to leave as-is)")
|
|
fs.IntVar(&ms.conntrackTCPTimeoutEstablished, "conntrack-tcp-timeout-established", ms.conntrackTCPTimeoutEstablished, "Idle timeout for established TCP connections on agent nodes (0 to leave as-is)")
|
|
}
|