429 lines
12 KiB
Go
429 lines
12 KiB
Go
/*
|
|
Copyright The containerd Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package server
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"path"
|
|
"path/filepath"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/containerd/containerd"
|
|
"github.com/containerd/containerd/mount"
|
|
"github.com/containerd/containerd/pkg/apparmor"
|
|
"github.com/containerd/containerd/pkg/seccomp"
|
|
"github.com/containerd/containerd/pkg/seutil"
|
|
"github.com/containerd/containerd/snapshots"
|
|
"github.com/containerd/log"
|
|
"github.com/moby/sys/mountinfo"
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/opencontainers/selinux/go-selinux/label"
|
|
"golang.org/x/sys/unix"
|
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
|
)
|
|
|
|
const (
|
|
// defaultSandboxOOMAdj is default omm adj for sandbox container. (kubernetes#47938).
|
|
defaultSandboxOOMAdj = -998
|
|
// defaultShmSize is the default size of the sandbox shm.
|
|
defaultShmSize = int64(1024 * 1024 * 64)
|
|
// relativeRootfsPath is the rootfs path relative to bundle path.
|
|
relativeRootfsPath = "rootfs"
|
|
// devShm is the default path of /dev/shm.
|
|
devShm = "/dev/shm"
|
|
// etcHosts is the default path of /etc/hosts file.
|
|
etcHosts = "/etc/hosts"
|
|
// etcHostname is the default path of /etc/hostname file.
|
|
etcHostname = "/etc/hostname"
|
|
// resolvConfPath is the abs path of resolv.conf on host or container.
|
|
resolvConfPath = "/etc/resolv.conf"
|
|
// hostnameEnv is the key for HOSTNAME env.
|
|
hostnameEnv = "HOSTNAME"
|
|
)
|
|
|
|
// getCgroupsPath generates container cgroups path.
|
|
func getCgroupsPath(cgroupsParent, id string) string {
|
|
base := path.Base(cgroupsParent)
|
|
if strings.HasSuffix(base, ".slice") {
|
|
// For a.slice/b.slice/c.slice, base is c.slice.
|
|
// runc systemd cgroup path format is "slice:prefix:name".
|
|
return strings.Join([]string{base, "cri-containerd", id}, ":")
|
|
}
|
|
return filepath.Join(cgroupsParent, id)
|
|
}
|
|
|
|
// getSandboxHostname returns the hostname file path inside the sandbox root directory.
|
|
func (c *criService) getSandboxHostname(id string) string {
|
|
return filepath.Join(c.getSandboxRootDir(id), "hostname")
|
|
}
|
|
|
|
// getSandboxHosts returns the hosts file path inside the sandbox root directory.
|
|
func (c *criService) getSandboxHosts(id string) string {
|
|
return filepath.Join(c.getSandboxRootDir(id), "hosts")
|
|
}
|
|
|
|
// getResolvPath returns resolv.conf filepath for specified sandbox.
|
|
func (c *criService) getResolvPath(id string) string {
|
|
return filepath.Join(c.getSandboxRootDir(id), "resolv.conf")
|
|
}
|
|
|
|
// getSandboxDevShm returns the shm file path inside the sandbox root directory.
|
|
func (c *criService) getSandboxDevShm(id string) string {
|
|
return filepath.Join(c.getVolatileSandboxRootDir(id), "shm")
|
|
}
|
|
|
|
func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) {
|
|
var labels []string
|
|
|
|
if selinuxOptions == nil {
|
|
return nil, nil
|
|
}
|
|
if err := checkSelinuxLevel(selinuxOptions.Level); err != nil {
|
|
return nil, err
|
|
}
|
|
if selinuxOptions.User != "" {
|
|
labels = append(labels, "user:"+selinuxOptions.User)
|
|
}
|
|
if selinuxOptions.Role != "" {
|
|
labels = append(labels, "role:"+selinuxOptions.Role)
|
|
}
|
|
if selinuxOptions.Type != "" {
|
|
labels = append(labels, "type:"+selinuxOptions.Type)
|
|
}
|
|
if selinuxOptions.Level != "" {
|
|
labels = append(labels, "level:"+selinuxOptions.Level)
|
|
}
|
|
|
|
return labels, nil
|
|
}
|
|
|
|
func initLabelsFromOpt(selinuxOpts *runtime.SELinuxOption) (string, string, error) {
|
|
labels, err := toLabel(selinuxOpts)
|
|
if err != nil {
|
|
return "", "", err
|
|
}
|
|
return label.InitLabels(labels)
|
|
}
|
|
|
|
func checkSelinuxLevel(level string) error {
|
|
if len(level) == 0 {
|
|
return nil
|
|
}
|
|
|
|
matched, err := regexp.MatchString(`^s\d(-s\d)??(:c\d{1,4}(\.c\d{1,4})?(,c\d{1,4}(\.c\d{1,4})?)*)?$`, level)
|
|
if err != nil {
|
|
return fmt.Errorf("the format of 'level' %q is not correct: %w", level, err)
|
|
}
|
|
if !matched {
|
|
return fmt.Errorf("the format of 'level' %q is not correct", level)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// apparmorEnabled returns true if apparmor is enabled, supported by the host,
|
|
// if apparmor_parser is installed, and if we are not running docker-in-docker.
|
|
func (c *criService) apparmorEnabled() bool {
|
|
if c.config.DisableApparmor {
|
|
return false
|
|
}
|
|
return apparmor.HostSupports()
|
|
}
|
|
|
|
func (c *criService) seccompEnabled() bool {
|
|
return seccomp.IsEnabled()
|
|
}
|
|
|
|
// openLogFile opens/creates a container log file.
|
|
func openLogFile(path string) (*os.File, error) {
|
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
|
return nil, err
|
|
}
|
|
return os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0640)
|
|
}
|
|
|
|
// unmountRecursive unmounts the target and all mounts underneath, starting with
|
|
// the deepest mount first.
|
|
func unmountRecursive(ctx context.Context, target string) error {
|
|
target, err := mount.CanonicalizePath(target)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
toUnmount, err := mountinfo.GetMounts(mountinfo.PrefixFilter(target))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Make the deepest mount be first
|
|
sort.Slice(toUnmount, func(i, j int) bool {
|
|
return len(toUnmount[i].Mountpoint) > len(toUnmount[j].Mountpoint)
|
|
})
|
|
|
|
for i, m := range toUnmount {
|
|
if err := mount.UnmountAll(m.Mountpoint, unix.MNT_DETACH); err != nil {
|
|
if i == len(toUnmount)-1 { // last mount
|
|
return err
|
|
}
|
|
// This is some submount, we can ignore this error for now, the final unmount will fail if this is a real problem
|
|
log.G(ctx).WithError(err).Debugf("failed to unmount submount %s", m.Mountpoint)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ensureRemoveAll wraps `os.RemoveAll` to check for specific errors that can
|
|
// often be remedied.
|
|
// Only use `ensureRemoveAll` if you really want to make every effort to remove
|
|
// a directory.
|
|
//
|
|
// Because of the way `os.Remove` (and by extension `os.RemoveAll`) works, there
|
|
// can be a race between reading directory entries and then actually attempting
|
|
// to remove everything in the directory.
|
|
// These types of errors do not need to be returned since it's ok for the dir to
|
|
// be gone we can just retry the remove operation.
|
|
//
|
|
// This should not return a `os.ErrNotExist` kind of error under any circumstances
|
|
func ensureRemoveAll(ctx context.Context, dir string) error {
|
|
notExistErr := make(map[string]bool)
|
|
|
|
// track retries
|
|
exitOnErr := make(map[string]int)
|
|
maxRetry := 50
|
|
|
|
// Attempt to unmount anything beneath this dir first.
|
|
if err := unmountRecursive(ctx, dir); err != nil {
|
|
log.G(ctx).WithError(err).Debugf("failed to do initial unmount of %s", dir)
|
|
}
|
|
|
|
for {
|
|
err := os.RemoveAll(dir)
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
|
|
pe, ok := err.(*os.PathError)
|
|
if !ok {
|
|
return err
|
|
}
|
|
|
|
if os.IsNotExist(err) {
|
|
if notExistErr[pe.Path] {
|
|
return err
|
|
}
|
|
notExistErr[pe.Path] = true
|
|
|
|
// There is a race where some subdir can be removed but after the
|
|
// parent dir entries have been read.
|
|
// So the path could be from `os.Remove(subdir)`
|
|
// If the reported non-existent path is not the passed in `dir` we
|
|
// should just retry, but otherwise return with no error.
|
|
if pe.Path == dir {
|
|
return nil
|
|
}
|
|
continue
|
|
}
|
|
|
|
if pe.Err != syscall.EBUSY {
|
|
return err
|
|
}
|
|
if e := mount.Unmount(pe.Path, unix.MNT_DETACH); e != nil {
|
|
return fmt.Errorf("error while removing %s: %w", dir, e)
|
|
}
|
|
|
|
if exitOnErr[pe.Path] == maxRetry {
|
|
return err
|
|
}
|
|
exitOnErr[pe.Path]++
|
|
time.Sleep(100 * time.Millisecond)
|
|
}
|
|
}
|
|
|
|
var vmbasedRuntimes = []string{
|
|
"io.containerd.kata",
|
|
}
|
|
|
|
func isVMBasedRuntime(runtimeType string) bool {
|
|
for _, rt := range vmbasedRuntimes {
|
|
if strings.Contains(runtimeType, rt) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
|
|
if !isVMBasedRuntime(runtimeType) {
|
|
return nil
|
|
}
|
|
l, err := seutil.ChangeToKVM(spec.Process.SelinuxLabel)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get selinux kvm label: %w", err)
|
|
}
|
|
spec.Process.SelinuxLabel = l
|
|
return nil
|
|
}
|
|
|
|
func parseUsernsIDMap(runtimeIDMap []*runtime.IDMapping) ([]specs.LinuxIDMapping, error) {
|
|
var m []specs.LinuxIDMapping
|
|
|
|
if len(runtimeIDMap) == 0 {
|
|
return m, nil
|
|
}
|
|
|
|
if len(runtimeIDMap) > 1 {
|
|
// We only accept 1 line, because containerd.WithRemappedSnapshot() only supports that.
|
|
return m, fmt.Errorf("only one mapping line supported, got %v mapping lines", len(runtimeIDMap))
|
|
}
|
|
|
|
// We know len is 1 now.
|
|
if runtimeIDMap[0] == nil {
|
|
return m, nil
|
|
}
|
|
uidMap := *runtimeIDMap[0]
|
|
|
|
if uidMap.Length < 1 {
|
|
return m, fmt.Errorf("invalid mapping length: %v", uidMap.Length)
|
|
}
|
|
|
|
m = []specs.LinuxIDMapping{
|
|
{
|
|
ContainerID: uidMap.ContainerId,
|
|
HostID: uidMap.HostId,
|
|
Size: uidMap.Length,
|
|
},
|
|
}
|
|
|
|
return m, nil
|
|
}
|
|
|
|
func parseUsernsIDs(userns *runtime.UserNamespace) (uids, gids []specs.LinuxIDMapping, retErr error) {
|
|
if userns == nil {
|
|
// If userns is not set, the kubelet doesn't support this option
|
|
// and we should just fallback to no userns. This is completely
|
|
// valid.
|
|
return nil, nil, nil
|
|
}
|
|
|
|
uids, err := parseUsernsIDMap(userns.GetUids())
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("UID mapping: %w", err)
|
|
}
|
|
|
|
gids, err = parseUsernsIDMap(userns.GetGids())
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("GID mapping: %w", err)
|
|
}
|
|
|
|
switch mode := userns.GetMode(); mode {
|
|
case runtime.NamespaceMode_NODE:
|
|
if len(uids) != 0 || len(gids) != 0 {
|
|
return nil, nil, fmt.Errorf("can't use user namespace mode %q with mappings. Got %v UID mappings and %v GID mappings", mode, len(uids), len(gids))
|
|
}
|
|
case runtime.NamespaceMode_POD:
|
|
// This is valid, we will handle it in WithPodNamespaces().
|
|
if len(uids) == 0 || len(gids) == 0 {
|
|
return nil, nil, fmt.Errorf("can't use user namespace mode %q without UID and GID mappings", mode)
|
|
}
|
|
default:
|
|
return nil, nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
|
|
}
|
|
|
|
return uids, gids, nil
|
|
}
|
|
|
|
// sameUsernsConfig checks if the userns configs are the same. If the mappings
|
|
// on each config are the same but in different order, it returns false.
|
|
// XXX: If the runtime.UserNamespace struct changes, we should update this
|
|
// function accordingly.
|
|
func sameUsernsConfig(a, b *runtime.UserNamespace) bool {
|
|
// If both are nil, they are the same.
|
|
if a == nil && b == nil {
|
|
return true
|
|
}
|
|
// If only one is nil, they are different.
|
|
if a == nil || b == nil {
|
|
return false
|
|
}
|
|
// At this point, a is not nil nor b.
|
|
|
|
if a.GetMode() != b.GetMode() {
|
|
return false
|
|
}
|
|
|
|
aUids, aGids, err := parseUsernsIDs(a)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
bUids, bGids, err := parseUsernsIDs(b)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
|
|
if !sameMapping(aUids, bUids) {
|
|
return false
|
|
}
|
|
if !sameMapping(aGids, bGids) {
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
// sameMapping checks if the mappings are the same. If the mappings are the same
|
|
// but in different order, it returns false.
|
|
func sameMapping(a, b []specs.LinuxIDMapping) bool {
|
|
if len(a) != len(b) {
|
|
return false
|
|
}
|
|
|
|
for x := range a {
|
|
if a[x].ContainerID != b[x].ContainerID {
|
|
return false
|
|
}
|
|
if a[x].HostID != b[x].HostID {
|
|
return false
|
|
}
|
|
if a[x].Size != b[x].Size {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
func snapshotterRemapOpts(nsOpts *runtime.NamespaceOption) ([]snapshots.Opt, error) {
|
|
snapshotOpt := []snapshots.Opt{}
|
|
usernsOpts := nsOpts.GetUsernsOptions()
|
|
if usernsOpts == nil {
|
|
return snapshotOpt, nil
|
|
}
|
|
|
|
uids, gids, err := parseUsernsIDs(usernsOpts)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("user namespace configuration: %w", err)
|
|
}
|
|
|
|
if usernsOpts.GetMode() == runtime.NamespaceMode_POD {
|
|
snapshotOpt = append(snapshotOpt, containerd.WithRemapperLabels(0, uids[0].HostID, 0, gids[0].HostID, uids[0].Size))
|
|
}
|
|
return snapshotOpt, nil
|
|
}
|