diff --git a/pkg/cri/sbserver/helpers.go b/pkg/cri/sbserver/helpers.go index 88ea39698..90d6888c9 100644 --- a/pkg/cri/sbserver/helpers.go +++ b/pkg/cri/sbserver/helpers.go @@ -219,9 +219,9 @@ func getUserFromImage(user string) (*int64, string) { return &uid, "" } -// ensureImageExists returns corresponding metadata of the image reference, if image is not +// EnsureImageExists returns corresponding metadata of the image reference, if image is not // pulled yet, the function will pull the image. -func (c *criService) ensureImageExists(ctx context.Context, ref string, config *runtime.PodSandboxConfig) (*imagestore.Image, error) { +func (c *criService) EnsureImageExists(ctx context.Context, ref string, config *runtime.PodSandboxConfig) (*imagestore.Image, error) { image, err := c.localResolve(ref) if err != nil && !errdefs.IsNotFound(err) { return nil, fmt.Errorf("failed to get image %q: %w", ref, err) diff --git a/pkg/cri/sbserver/podsandbox/container_linux.go b/pkg/cri/sbserver/podsandbox/container_linux.go new file mode 100644 index 000000000..19f3ba0ac --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/container_linux.go @@ -0,0 +1,148 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// These are copied from container_create_linux.go and should be consolidated later. + +package podsandbox + +import ( + "errors" + "fmt" + "strconv" + "strings" + + "github.com/containerd/containerd/contrib/seccomp" + "github.com/containerd/containerd/oci" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +const ( + // profileNamePrefix is the prefix for loading profiles on a localhost. Eg. AppArmor localhost/profileName. + profileNamePrefix = "localhost/" // TODO (mikebrow): get localhost/ & runtime/default from CRI kubernetes/kubernetes#51747 + // runtimeDefault indicates that we should use or create a runtime default profile. + runtimeDefault = "runtime/default" + // dockerDefault indicates that we should use or create a docker default profile. + dockerDefault = "docker/default" + // unconfinedProfile is a string indicating one should run a pod/containerd without a security profile + unconfinedProfile = "unconfined" +) + +// generateSeccompSpecOpts generates containerd SpecOpts for seccomp. +func (c *Controller) generateSeccompSpecOpts(sp *runtime.SecurityProfile, privileged, seccompEnabled bool) (oci.SpecOpts, error) { + if privileged { + // Do not set seccomp profile when container is privileged + return nil, nil + } + if !seccompEnabled { + if sp != nil { + if sp.ProfileType != runtime.SecurityProfile_Unconfined { + return nil, errors.New("seccomp is not supported") + } + } + return nil, nil + } + + if sp == nil { + return nil, nil + } + + if sp.ProfileType != runtime.SecurityProfile_Localhost && sp.LocalhostRef != "" { + return nil, errors.New("seccomp config invalid LocalhostRef must only be set if ProfileType is Localhost") + } + switch sp.ProfileType { + case runtime.SecurityProfile_Unconfined: + // Do not set seccomp profile. + return nil, nil + case runtime.SecurityProfile_RuntimeDefault: + return seccomp.WithDefaultProfile(), nil + case runtime.SecurityProfile_Localhost: + // trimming the localhost/ prefix just in case even though it should not + // be necessary with the new SecurityProfile struct + return seccomp.WithProfile(strings.TrimPrefix(sp.LocalhostRef, profileNamePrefix)), nil + default: + return nil, errors.New("seccomp unknown ProfileType") + } +} + +func generateSeccompSecurityProfile(profilePath string, unsetProfilePath string) (*runtime.SecurityProfile, error) { + if profilePath != "" { + return generateSecurityProfile(profilePath) + } + if unsetProfilePath != "" { + return generateSecurityProfile(unsetProfilePath) + } + return nil, nil +} + +func generateSecurityProfile(profilePath string) (*runtime.SecurityProfile, error) { + switch profilePath { + case runtimeDefault, dockerDefault, "": + return &runtime.SecurityProfile{ + ProfileType: runtime.SecurityProfile_RuntimeDefault, + }, nil + case unconfinedProfile: + return &runtime.SecurityProfile{ + ProfileType: runtime.SecurityProfile_Unconfined, + }, nil + default: + // Require and Trim default profile name prefix + if !strings.HasPrefix(profilePath, profileNamePrefix) { + return nil, fmt.Errorf("invalid profile %q", profilePath) + } + return &runtime.SecurityProfile{ + ProfileType: runtime.SecurityProfile_Localhost, + LocalhostRef: strings.TrimPrefix(profilePath, profileNamePrefix), + }, nil + } +} + +// generateUserString generates valid user string based on OCI Image Spec +// v1.0.0. +// +// CRI defines that the following combinations are valid: +// +// (none) -> "" +// username -> username +// username, uid -> username +// username, uid, gid -> username:gid +// username, gid -> username:gid +// uid -> uid +// uid, gid -> uid:gid +// gid -> error +// +// TODO(random-liu): Add group name support in CRI. +func generateUserString(username string, uid, gid *runtime.Int64Value) (string, error) { + var userstr, groupstr string + if uid != nil { + userstr = strconv.FormatInt(uid.GetValue(), 10) + } + if username != "" { + userstr = username + } + if gid != nil { + groupstr = strconv.FormatInt(gid.GetValue(), 10) + } + if userstr == "" { + if groupstr != "" { + return "", fmt.Errorf("user group %q is specified without user", groupstr) + } + return "", nil + } + if groupstr != "" { + userstr = userstr + ":" + groupstr + } + return userstr, nil +} diff --git a/pkg/cri/sbserver/podsandbox/controller.go b/pkg/cri/sbserver/podsandbox/controller.go new file mode 100644 index 000000000..d6c1e58f3 --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/controller.go @@ -0,0 +1,86 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "context" + + "github.com/containerd/containerd" + api "github.com/containerd/containerd/api/services/sandbox/v1" + "github.com/containerd/containerd/oci" + criconfig "github.com/containerd/containerd/pkg/cri/config" + imagestore "github.com/containerd/containerd/pkg/cri/store/image" + sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox" + osinterface "github.com/containerd/containerd/pkg/os" + "github.com/containerd/containerd/sandbox" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +// CRIService interface contains things required by controller, but not yet refactored from criService. +// This will be removed in subsequent iterations. +type CRIService interface { + EnsureImageExists(ctx context.Context, ref string, config *runtime.PodSandboxConfig) (*imagestore.Image, error) + + StartSandboxExitMonitor(ctx context.Context, id string, pid uint32, exitCh <-chan containerd.ExitStatus) <-chan struct{} +} + +type Controller struct { + // config contains all configurations. + config criconfig.Config + // client is an instance of the containerd client + client *containerd.Client + // sandboxStore stores all resources associated with sandboxes. + sandboxStore *sandboxstore.Store + // os is an interface for all required os operations. + os osinterface.OS + // cri is CRI service that provides missing gaps needed by controller. + cri CRIService + // baseOCISpecs contains cached OCI specs loaded via `Runtime.BaseRuntimeSpec` + baseOCISpecs map[string]*oci.Spec +} + +func New( + config criconfig.Config, + client *containerd.Client, + sandboxStore *sandboxstore.Store, + os osinterface.OS, + cri CRIService, + baseOCISpecs map[string]*oci.Spec, +) *Controller { + return &Controller{ + config: config, + client: client, + sandboxStore: sandboxStore, + os: os, + cri: cri, + baseOCISpecs: baseOCISpecs, + } +} + +var _ sandbox.Controller = (*Controller)(nil) + +func (c *Controller) Shutdown(ctx context.Context, sandboxID string) error { + panic("implement me") +} + +func (c *Controller) Wait(ctx context.Context, sandboxID string) (*api.ControllerWaitResponse, error) { + panic("implement me") +} + +func (c *Controller) Status(ctx context.Context, sandboxID string) (*api.ControllerStatusResponse, error) { + panic("implement me") +} diff --git a/pkg/cri/sbserver/podsandbox/controller_test.go b/pkg/cri/sbserver/podsandbox/controller_test.go new file mode 100644 index 000000000..6dc68fe30 --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/controller_test.go @@ -0,0 +1,52 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + criconfig "github.com/containerd/containerd/pkg/cri/config" + "github.com/containerd/containerd/pkg/cri/store/label" + sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox" + ostesting "github.com/containerd/containerd/pkg/os/testing" +) + +const ( + testRootDir = "/test/root" + testStateDir = "/test/state" + // Use an image id as test sandbox image to avoid image name resolve. + // TODO(random-liu): Change this to image name after we have complete image + // management unit test framework. + testSandboxImage = "sha256:c75bebcdd211f41b3a460c7bf82970ed6c75acaab9cd4c9a4e125b03ca113798" +) + +var testConfig = criconfig.Config{ + RootDir: testRootDir, + StateDir: testStateDir, + PluginConfig: criconfig.PluginConfig{ + SandboxImage: testSandboxImage, + TolerateMissingHugetlbController: true, + }, +} + +// newControllerService creates a fake criService for test. +func newControllerService() *Controller { + labels := label.NewStore() + return &Controller{ + config: testConfig, + os: ostesting.NewFakeOS(), + sandboxStore: sandboxstore.NewStore(labels), + } +} diff --git a/pkg/cri/sbserver/podsandbox/helpers.go b/pkg/cri/sbserver/podsandbox/helpers.go new file mode 100644 index 000000000..1705fe2d2 --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/helpers.go @@ -0,0 +1,402 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "context" + "fmt" + "path" + "path/filepath" + "strconv" + "strings" + + "github.com/containerd/containerd" + "github.com/containerd/containerd/containers" + clabels "github.com/containerd/containerd/labels" + "github.com/containerd/containerd/log" + "github.com/containerd/containerd/oci" + criconfig "github.com/containerd/containerd/pkg/cri/config" + containerstore "github.com/containerd/containerd/pkg/cri/store/container" + imagestore "github.com/containerd/containerd/pkg/cri/store/image" + sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox" + ctrdutil "github.com/containerd/containerd/pkg/cri/util" + runtimeoptions "github.com/containerd/containerd/pkg/runtimeoptions/v1" + "github.com/containerd/containerd/plugin" + "github.com/containerd/containerd/reference/docker" + "github.com/containerd/containerd/runtime/linux/runctypes" + runcoptions "github.com/containerd/containerd/runtime/v2/runc/options" + "github.com/containerd/typeurl" + runtimespec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + + runhcsoptions "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options" + imagedigest "github.com/opencontainers/go-digest" + "github.com/pelletier/go-toml" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +const ( + // errorStartReason is the exit reason when fails to start container. + errorStartReason = "StartError" + // errorStartExitCode is the exit code when fails to start container. + // 128 is the same with Docker's behavior. + // TODO(windows): Figure out what should be used for windows. + errorStartExitCode = 128 + // completeExitReason is the exit reason when container exits with code 0. + completeExitReason = "Completed" + // errorExitReason is the exit reason when container exits with code non-zero. + errorExitReason = "Error" + // oomExitReason is the exit reason when process in container is oom killed. + oomExitReason = "OOMKilled" + + // sandboxesDir contains all sandbox root. A sandbox root is the running + // directory of the sandbox, all files created for the sandbox will be + // placed under this directory. + sandboxesDir = "sandboxes" + // containersDir contains all container root. + containersDir = "containers" + // Delimiter used to construct container/sandbox names. + nameDelimiter = "_" + + // criContainerdPrefix is common prefix for cri-containerd + criContainerdPrefix = "io.cri-containerd" + // containerKindLabel is a label key indicating container is sandbox container or application container + containerKindLabel = criContainerdPrefix + ".kind" + // containerKindSandbox is a label value indicating container is sandbox container + containerKindSandbox = "sandbox" + // containerKindContainer is a label value indicating container is application container + containerKindContainer = "container" + // imageLabelKey is the label key indicating the image is managed by cri plugin. + imageLabelKey = criContainerdPrefix + ".image" + // imageLabelValue is the label value indicating the image is managed by cri plugin. + imageLabelValue = "managed" + // sandboxMetadataExtension is an extension name that identify metadata of sandbox in CreateContainerRequest + sandboxMetadataExtension = criContainerdPrefix + ".sandbox.metadata" + // containerMetadataExtension is an extension name that identify metadata of container in CreateContainerRequest + containerMetadataExtension = criContainerdPrefix + ".container.metadata" + + // defaultIfName is the default network interface for the pods + defaultIfName = "eth0" + + // runtimeRunhcsV1 is the runtime type for runhcs. + runtimeRunhcsV1 = "io.containerd.runhcs.v1" +) + +// makeSandboxName generates sandbox name from sandbox metadata. The name +// generated is unique as long as sandbox metadata is unique. +func makeSandboxName(s *runtime.PodSandboxMetadata) string { + return strings.Join([]string{ + s.Name, // 0 + s.Namespace, // 1 + s.Uid, // 2 + fmt.Sprintf("%d", s.Attempt), // 3 + }, nameDelimiter) +} + +// makeContainerName generates container name from sandbox and container metadata. +// The name generated is unique as long as the sandbox container combination is +// unique. +func makeContainerName(c *runtime.ContainerMetadata, s *runtime.PodSandboxMetadata) string { + return strings.Join([]string{ + c.Name, // 0: container name + s.Name, // 1: pod name + s.Namespace, // 2: pod namespace + s.Uid, // 3: pod uid + fmt.Sprintf("%d", c.Attempt), // 4: attempt number of creating the container + }, nameDelimiter) +} + +// getSandboxRootDir returns the root directory for managing sandbox files, +// e.g. hosts files. +func (c *Controller) getSandboxRootDir(id string) string { + return filepath.Join(c.config.RootDir, sandboxesDir, id) +} + +// getVolatileSandboxRootDir returns the root directory for managing volatile sandbox files, +// e.g. named pipes. +func (c *Controller) getVolatileSandboxRootDir(id string) string { + return filepath.Join(c.config.StateDir, sandboxesDir, id) +} + +// getContainerRootDir returns the root directory for managing container files, +// e.g. state checkpoint. +func (c *Controller) getContainerRootDir(id string) string { + return filepath.Join(c.config.RootDir, containersDir, id) +} + +// getVolatileContainerRootDir returns the root directory for managing volatile container files, +// e.g. named pipes. +func (c *Controller) getVolatileContainerRootDir(id string) string { + return filepath.Join(c.config.StateDir, containersDir, id) +} + +// criContainerStateToString formats CRI container state to string. +func criContainerStateToString(state runtime.ContainerState) string { + return runtime.ContainerState_name[int32(state)] +} + +// getRepoDigestAngTag returns image repoDigest and repoTag of the named image reference. +func getRepoDigestAndTag(namedRef docker.Named, digest imagedigest.Digest, schema1 bool) (string, string) { + var repoTag, repoDigest string + if _, ok := namedRef.(docker.NamedTagged); ok { + repoTag = namedRef.String() + } + if _, ok := namedRef.(docker.Canonical); ok { + repoDigest = namedRef.String() + } else if !schema1 { + // digest is not actual repo digest for schema1 image. + repoDigest = namedRef.Name() + "@" + digest.String() + } + return repoDigest, repoTag +} + +// toContainerdImage converts an image object in image store to containerd image handler. +func (c *Controller) toContainerdImage(ctx context.Context, image imagestore.Image) (containerd.Image, error) { + // image should always have at least one reference. + if len(image.References) == 0 { + return nil, fmt.Errorf("invalid image with no reference %q", image.ID) + } + return c.client.GetImage(ctx, image.References[0]) +} + +// getUserFromImage gets uid or user name of the image user. +// If user is numeric, it will be treated as uid; or else, it is treated as user name. +func getUserFromImage(user string) (*int64, string) { + // return both empty if user is not specified in the image. + if user == "" { + return nil, "" + } + // split instances where the id may contain user:group + user = strings.Split(user, ":")[0] + // user could be either uid or user name. Try to interpret as numeric uid. + uid, err := strconv.ParseInt(user, 10, 64) + if err != nil { + // If user is non numeric, assume it's user name. + return nil, user + } + // If user is a numeric uid. + return &uid, "" +} + +// isInCRIMounts checks whether a destination is in CRI mount list. +func isInCRIMounts(dst string, mounts []*runtime.Mount) bool { + for _, m := range mounts { + if filepath.Clean(m.ContainerPath) == filepath.Clean(dst) { + return true + } + } + return false +} + +// filterLabel returns a label filter. Use `%q` here because containerd +// filter needs extra quote to work properly. +func filterLabel(k, v string) string { + return fmt.Sprintf("labels.%q==%q", k, v) +} + +// buildLabel builds the labels from config to be passed to containerd +func buildLabels(configLabels, imageConfigLabels map[string]string, containerType string) map[string]string { + labels := make(map[string]string) + + for k, v := range imageConfigLabels { + if err := clabels.Validate(k, v); err == nil { + labels[k] = v + } else { + // In case the image label is invalid, we output a warning and skip adding it to the + // container. + logrus.WithError(err).Warnf("unable to add image label with key %s to the container", k) + } + } + // labels from the CRI request (config) will override labels in the image config + for k, v := range configLabels { + labels[k] = v + } + labels[containerKindLabel] = containerType + return labels +} + +// toRuntimeAuthConfig converts cri plugin auth config to runtime auth config. +func toRuntimeAuthConfig(a criconfig.AuthConfig) *runtime.AuthConfig { + return &runtime.AuthConfig{ + Username: a.Username, + Password: a.Password, + Auth: a.Auth, + IdentityToken: a.IdentityToken, + } +} + +// parseImageReferences parses a list of arbitrary image references and returns +// the repotags and repodigests +func parseImageReferences(refs []string) ([]string, []string) { + var tags, digests []string + for _, ref := range refs { + parsed, err := docker.ParseAnyReference(ref) + if err != nil { + continue + } + if _, ok := parsed.(docker.Canonical); ok { + digests = append(digests, parsed.String()) + } else if _, ok := parsed.(docker.Tagged); ok { + tags = append(tags, parsed.String()) + } + } + return tags, digests +} + +// generateRuntimeOptions generates runtime options from cri plugin config. +func generateRuntimeOptions(r criconfig.Runtime, c criconfig.Config) (interface{}, error) { + if r.Options == nil { + if r.Type != plugin.RuntimeLinuxV1 { + return nil, nil + } + // This is a legacy config, generate runctypes.RuncOptions. + return &runctypes.RuncOptions{ + Runtime: r.Engine, + RuntimeRoot: r.Root, + SystemdCgroup: c.SystemdCgroup, + }, nil + } + optionsTree, err := toml.TreeFromMap(r.Options) + if err != nil { + return nil, err + } + options := getRuntimeOptionsType(r.Type) + if err := optionsTree.Unmarshal(options); err != nil { + return nil, err + } + return options, nil +} + +// getRuntimeOptionsType gets empty runtime options by the runtime type name. +func getRuntimeOptionsType(t string) interface{} { + switch t { + case plugin.RuntimeRuncV1: + fallthrough + case plugin.RuntimeRuncV2: + return &runcoptions.Options{} + case plugin.RuntimeLinuxV1: + return &runctypes.RuncOptions{} + case runtimeRunhcsV1: + return &runhcsoptions.Options{} + default: + return &runtimeoptions.Options{} + } +} + +// getRuntimeOptions get runtime options from container metadata. +func getRuntimeOptions(c containers.Container) (interface{}, error) { + from := c.Runtime.Options + if from == nil || from.GetValue() == nil { + return nil, nil + } + opts, err := typeurl.UnmarshalAny(from) + if err != nil { + return nil, err + } + return opts, nil +} + +const ( + // unknownExitCode is the exit code when exit reason is unknown. + unknownExitCode = 255 + // unknownExitReason is the exit reason when exit reason is unknown. + unknownExitReason = "Unknown" +) + +// unknownContainerStatus returns the default container status when its status is unknown. +func unknownContainerStatus() containerstore.Status { + return containerstore.Status{ + CreatedAt: 0, + StartedAt: 0, + FinishedAt: 0, + ExitCode: unknownExitCode, + Reason: unknownExitReason, + Unknown: true, + } +} + +// unknownSandboxStatus returns the default sandbox status when its status is unknown. +func unknownSandboxStatus() sandboxstore.Status { + return sandboxstore.Status{ + State: sandboxstore.StateUnknown, + } +} + +// getPassthroughAnnotations filters requested pod annotations by comparing +// against permitted annotations for the given runtime. +func getPassthroughAnnotations(podAnnotations map[string]string, + runtimePodAnnotations []string) (passthroughAnnotations map[string]string) { + passthroughAnnotations = make(map[string]string) + + for podAnnotationKey, podAnnotationValue := range podAnnotations { + for _, pattern := range runtimePodAnnotations { + // Use path.Match instead of filepath.Match here. + // filepath.Match treated `\\` as path separator + // on windows, which is not what we want. + if ok, _ := path.Match(pattern, podAnnotationKey); ok { + passthroughAnnotations[podAnnotationKey] = podAnnotationValue + } + } + } + return passthroughAnnotations +} + +// runtimeSpec returns a default runtime spec used in cri-containerd. +func (c *Controller) runtimeSpec(id string, baseSpecFile string, opts ...oci.SpecOpts) (*runtimespec.Spec, error) { + // GenerateSpec needs namespace. + ctx := ctrdutil.NamespacedContext() + container := &containers.Container{ID: id} + + if baseSpecFile != "" { + baseSpec, ok := c.baseOCISpecs[baseSpecFile] + if !ok { + return nil, fmt.Errorf("can't find base OCI spec %q", baseSpecFile) + } + + spec := oci.Spec{} + if err := ctrdutil.DeepCopy(&spec, &baseSpec); err != nil { + return nil, fmt.Errorf("failed to clone OCI spec: %w", err) + } + + // Fix up cgroups path + applyOpts := append([]oci.SpecOpts{oci.WithNamespacedCgroup()}, opts...) + + if err := oci.ApplyOpts(ctx, nil, container, &spec, applyOpts...); err != nil { + return nil, fmt.Errorf("failed to apply OCI options: %w", err) + } + + return &spec, nil + } + + spec, err := oci.GenerateSpec(ctx, nil, container, opts...) + if err != nil { + return nil, fmt.Errorf("failed to generate spec: %w", err) + } + + return spec, nil +} + +// Overrides the default snapshotter if Snapshotter is set for this runtime. +// See See https://github.com/containerd/containerd/issues/6657 +func (c *Controller) runtimeSnapshotter(ctx context.Context, ociRuntime criconfig.Runtime) string { + if ociRuntime.Snapshotter == "" { + return c.config.ContainerdConfig.Snapshotter + } + + log.G(ctx).Debugf("Set snapshotter for runtime %s to %s", ociRuntime.Type, ociRuntime.Snapshotter) + return ociRuntime.Snapshotter +} diff --git a/pkg/cri/sbserver/podsandbox/helpers_linux.go b/pkg/cri/sbserver/podsandbox/helpers_linux.go new file mode 100644 index 000000000..861514787 --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/helpers_linux.go @@ -0,0 +1,277 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "context" + "fmt" + "os" + "path" + "path/filepath" + "regexp" + "sort" + "strings" + "syscall" + "time" + + "github.com/containerd/containerd/log" + "github.com/containerd/containerd/mount" + "github.com/containerd/containerd/pkg/apparmor" + "github.com/containerd/containerd/pkg/seccomp" + "github.com/containerd/containerd/pkg/seutil" + "github.com/moby/sys/mountinfo" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux/label" + "golang.org/x/sys/unix" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +const ( + // defaultSandboxOOMAdj is default omm adj for sandbox container. (kubernetes#47938). + defaultSandboxOOMAdj = -998 + // defaultShmSize is the default size of the sandbox shm. + defaultShmSize = int64(1024 * 1024 * 64) + // relativeRootfsPath is the rootfs path relative to bundle path. + relativeRootfsPath = "rootfs" + // devShm is the default path of /dev/shm. + devShm = "/dev/shm" + // etcHosts is the default path of /etc/hosts file. + etcHosts = "/etc/hosts" + // etcHostname is the default path of /etc/hostname file. + etcHostname = "/etc/hostname" + // resolvConfPath is the abs path of resolv.conf on host or container. + resolvConfPath = "/etc/resolv.conf" + // hostnameEnv is the key for HOSTNAME env. + hostnameEnv = "HOSTNAME" +) + +// getCgroupsPath generates container cgroups path. +func getCgroupsPath(cgroupsParent, id string) string { + base := path.Base(cgroupsParent) + if strings.HasSuffix(base, ".slice") { + // For a.slice/b.slice/c.slice, base is c.slice. + // runc systemd cgroup path format is "slice:prefix:name". + return strings.Join([]string{base, "cri-containerd", id}, ":") + } + return filepath.Join(cgroupsParent, id) +} + +// getSandboxHostname returns the hostname file path inside the sandbox root directory. +func (c *Controller) getSandboxHostname(id string) string { + return filepath.Join(c.getSandboxRootDir(id), "hostname") +} + +// getSandboxHosts returns the hosts file path inside the sandbox root directory. +func (c *Controller) getSandboxHosts(id string) string { + return filepath.Join(c.getSandboxRootDir(id), "hosts") +} + +// getResolvPath returns resolv.conf filepath for specified sandbox. +func (c *Controller) getResolvPath(id string) string { + return filepath.Join(c.getSandboxRootDir(id), "resolv.conf") +} + +// getSandboxDevShm returns the shm file path inside the sandbox root directory. +func (c *Controller) getSandboxDevShm(id string) string { + return filepath.Join(c.getVolatileSandboxRootDir(id), "shm") +} + +func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) { + var labels []string + + if selinuxOptions == nil { + return nil, nil + } + if err := checkSelinuxLevel(selinuxOptions.Level); err != nil { + return nil, err + } + if selinuxOptions.User != "" { + labels = append(labels, "user:"+selinuxOptions.User) + } + if selinuxOptions.Role != "" { + labels = append(labels, "role:"+selinuxOptions.Role) + } + if selinuxOptions.Type != "" { + labels = append(labels, "type:"+selinuxOptions.Type) + } + if selinuxOptions.Level != "" { + labels = append(labels, "level:"+selinuxOptions.Level) + } + + return labels, nil +} + +func initLabelsFromOpt(selinuxOpts *runtime.SELinuxOption) (string, string, error) { + labels, err := toLabel(selinuxOpts) + if err != nil { + return "", "", err + } + return label.InitLabels(labels) +} + +func checkSelinuxLevel(level string) error { + if len(level) == 0 { + return nil + } + + matched, err := regexp.MatchString(`^s\d(-s\d)??(:c\d{1,4}(\.c\d{1,4})?(,c\d{1,4}(\.c\d{1,4})?)*)?$`, level) + if err != nil { + return fmt.Errorf("the format of 'level' %q is not correct: %w", level, err) + } + if !matched { + return fmt.Errorf("the format of 'level' %q is not correct", level) + } + return nil +} + +// apparmorEnabled returns true if apparmor is enabled, supported by the host, +// if apparmor_parser is installed, and if we are not running docker-in-docker. +func (c *Controller) apparmorEnabled() bool { + if c.config.DisableApparmor { + return false + } + return apparmor.HostSupports() +} + +func (c *Controller) seccompEnabled() bool { + return seccomp.IsEnabled() +} + +// openLogFile opens/creates a container log file. +func openLogFile(path string) (*os.File, error) { + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return nil, err + } + return os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0640) +} + +// unmountRecursive unmounts the target and all mounts underneath, starting with +// the deepest mount first. +func unmountRecursive(ctx context.Context, target string) error { + toUnmount, err := mountinfo.GetMounts(mountinfo.PrefixFilter(target)) + if err != nil { + return err + } + + // Make the deepest mount be first + sort.Slice(toUnmount, func(i, j int) bool { + return len(toUnmount[i].Mountpoint) > len(toUnmount[j].Mountpoint) + }) + + for i, m := range toUnmount { + if err := mount.UnmountAll(m.Mountpoint, unix.MNT_DETACH); err != nil { + if i == len(toUnmount)-1 { // last mount + return err + } + // This is some submount, we can ignore this error for now, the final unmount will fail if this is a real problem + log.G(ctx).WithError(err).Debugf("failed to unmount submount %s", m.Mountpoint) + } + } + return nil +} + +// ensureRemoveAll wraps `os.RemoveAll` to check for specific errors that can +// often be remedied. +// Only use `ensureRemoveAll` if you really want to make every effort to remove +// a directory. +// +// Because of the way `os.Remove` (and by extension `os.RemoveAll`) works, there +// can be a race between reading directory entries and then actually attempting +// to remove everything in the directory. +// These types of errors do not need to be returned since it's ok for the dir to +// be gone we can just retry the remove operation. +// +// This should not return a `os.ErrNotExist` kind of error under any circumstances +func ensureRemoveAll(ctx context.Context, dir string) error { + notExistErr := make(map[string]bool) + + // track retries + exitOnErr := make(map[string]int) + maxRetry := 50 + + // Attempt to unmount anything beneath this dir first. + if err := unmountRecursive(ctx, dir); err != nil { + log.G(ctx).WithError(err).Debugf("failed to do initial unmount of %s", dir) + } + + for { + err := os.RemoveAll(dir) + if err == nil { + return nil + } + + pe, ok := err.(*os.PathError) + if !ok { + return err + } + + if os.IsNotExist(err) { + if notExistErr[pe.Path] { + return err + } + notExistErr[pe.Path] = true + + // There is a race where some subdir can be removed but after the + // parent dir entries have been read. + // So the path could be from `os.Remove(subdir)` + // If the reported non-existent path is not the passed in `dir` we + // should just retry, but otherwise return with no error. + if pe.Path == dir { + return nil + } + continue + } + + if pe.Err != syscall.EBUSY { + return err + } + if e := mount.Unmount(pe.Path, unix.MNT_DETACH); e != nil { + return fmt.Errorf("error while removing %s: %w", dir, e) + } + + if exitOnErr[pe.Path] == maxRetry { + return err + } + exitOnErr[pe.Path]++ + time.Sleep(100 * time.Millisecond) + } +} + +var vmbasedRuntimes = []string{ + "io.containerd.kata", +} + +func isVMBasedRuntime(runtimeType string) bool { + for _, rt := range vmbasedRuntimes { + if strings.Contains(runtimeType, rt) { + return true + } + } + return false +} + +func modifyProcessLabel(runtimeType string, spec *specs.Spec) error { + if !isVMBasedRuntime(runtimeType) { + return nil + } + l, err := seutil.ChangeToKVM(spec.Process.SelinuxLabel) + if err != nil { + return fmt.Errorf("failed to get selinux kvm label: %w", err) + } + spec.Process.SelinuxLabel = l + return nil +} diff --git a/pkg/cri/sbserver/podsandbox/helpers_linux_test.go b/pkg/cri/sbserver/podsandbox/helpers_linux_test.go new file mode 100644 index 000000000..c61383636 --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/helpers_linux_test.go @@ -0,0 +1,100 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "context" + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "golang.org/x/sys/unix" +) + +func TestGetCgroupsPath(t *testing.T) { + testID := "test-id" + for desc, test := range map[string]struct { + cgroupsParent string + expected string + }{ + "should support regular cgroup path": { + cgroupsParent: "/a/b", + expected: "/a/b/test-id", + }, + "should support systemd cgroup path": { + cgroupsParent: "/a.slice/b.slice", + expected: "b.slice:cri-containerd:test-id", + }, + "should support tailing slash for regular cgroup path": { + cgroupsParent: "/a/b/", + expected: "/a/b/test-id", + }, + "should support tailing slash for systemd cgroup path": { + cgroupsParent: "/a.slice/b.slice/", + expected: "b.slice:cri-containerd:test-id", + }, + "should treat root cgroup as regular cgroup path": { + cgroupsParent: "/", + expected: "/test-id", + }, + } { + t.Run(desc, func(t *testing.T) { + got := getCgroupsPath(test.cgroupsParent, testID) + assert.Equal(t, test.expected, got) + }) + } +} + +func TestEnsureRemoveAllWithMount(t *testing.T) { + if os.Getuid() != 0 { + t.Skip("skipping test that requires root") + } + + var err error + dir1 := t.TempDir() + dir2 := t.TempDir() + + bindDir := filepath.Join(dir1, "bind") + if err := os.MkdirAll(bindDir, 0755); err != nil { + t.Fatal(err) + } + + if err := unix.Mount(dir2, bindDir, "none", unix.MS_BIND, ""); err != nil { + t.Fatal(err) + } + + done := make(chan struct{}) + go func() { + err = ensureRemoveAll(context.Background(), dir1) + close(done) + }() + + select { + case <-done: + if err != nil { + t.Fatal(err) + } + case <-time.After(5 * time.Second): + t.Fatal("timeout waiting for EnsureRemoveAll to finish") + } + + if _, err := os.Stat(dir1); !os.IsNotExist(err) { + t.Fatalf("expected %q to not exist", dir1) + } +} diff --git a/pkg/cri/sbserver/podsandbox/helpers_other.go b/pkg/cri/sbserver/podsandbox/helpers_other.go new file mode 100644 index 000000000..b66c26a95 --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/helpers_other.go @@ -0,0 +1,44 @@ +//go:build !windows && !linux +// +build !windows,!linux + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "context" + "os" + + "github.com/opencontainers/runtime-spec/specs-go" +) + +// openLogFile opens/creates a container log file. +func openLogFile(path string) (*os.File, error) { + return os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0640) +} + +// ensureRemoveAll wraps `os.RemoveAll` to check for specific errors that can +// often be remedied. +// Only use `ensureRemoveAll` if you really want to make every effort to remove +// a directory. +func ensureRemoveAll(ctx context.Context, dir string) error { + return os.RemoveAll(dir) +} + +func modifyProcessLabel(runtimeType string, spec *specs.Spec) error { + return nil +} diff --git a/pkg/cri/sbserver/podsandbox/helpers_selinux_linux_test.go b/pkg/cri/sbserver/podsandbox/helpers_selinux_linux_test.go new file mode 100644 index 000000000..8b38bde25 --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/helpers_selinux_linux_test.go @@ -0,0 +1,157 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "testing" + + "github.com/opencontainers/selinux/go-selinux" + "github.com/stretchr/testify/assert" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +func TestInitSelinuxOpts(t *testing.T) { + if !selinux.GetEnabled() { + t.Skip("selinux is not enabled") + } + + for desc, test := range map[string]struct { + selinuxOpt *runtime.SELinuxOption + processLabel string + mountLabel string + expectErr bool + }{ + "Should return empty strings for processLabel and mountLabel when selinuxOpt is nil": { + selinuxOpt: nil, + processLabel: ".*:c[0-9]{1,3},c[0-9]{1,3}", + mountLabel: ".*:c[0-9]{1,3},c[0-9]{1,3}", + }, + "Should overlay fields on processLabel when selinuxOpt has been initialized partially": { + selinuxOpt: &runtime.SELinuxOption{ + User: "", + Role: "user_r", + Type: "", + Level: "s0:c1,c2", + }, + processLabel: "system_u:user_r:(container_file_t|svirt_lxc_net_t):s0:c1,c2", + mountLabel: "system_u:object_r:(container_file_t|svirt_sandbox_file_t):s0:c1,c2", + }, + "Should be resolved correctly when selinuxOpt has been initialized completely": { + selinuxOpt: &runtime.SELinuxOption{ + User: "user_u", + Role: "user_r", + Type: "user_t", + Level: "s0:c1,c2", + }, + processLabel: "user_u:user_r:user_t:s0:c1,c2", + mountLabel: "user_u:object_r:(container_file_t|svirt_sandbox_file_t):s0:c1,c2", + }, + "Should be resolved correctly when selinuxOpt has been initialized with level=''": { + selinuxOpt: &runtime.SELinuxOption{ + User: "user_u", + Role: "user_r", + Type: "user_t", + Level: "", + }, + processLabel: "user_u:user_r:user_t:s0:c[0-9]{1,3},c[0-9]{1,3}", + mountLabel: "user_u:object_r:(container_file_t|svirt_sandbox_file_t):s0", + }, + "Should return error when the format of 'level' is not correct": { + selinuxOpt: &runtime.SELinuxOption{ + User: "user_u", + Role: "user_r", + Type: "user_t", + Level: "s0,c1,c2", + }, + expectErr: true, + }, + } { + t.Run(desc, func(t *testing.T) { + processLabel, mountLabel, err := initLabelsFromOpt(test.selinuxOpt) + if test.expectErr { + assert.Error(t, err) + } else { + assert.Regexp(t, test.processLabel, processLabel) + assert.Regexp(t, test.mountLabel, mountLabel) + } + }) + } +} + +func TestCheckSelinuxLevel(t *testing.T) { + for desc, test := range map[string]struct { + level string + expectNoMatch bool + }{ + "s0": { + level: "s0", + }, + "s0-s0": { + level: "s0-s0", + }, + "s0:c0": { + level: "s0:c0", + }, + "s0:c0.c3": { + level: "s0:c0.c3", + }, + "s0:c0,c3": { + level: "s0:c0,c3", + }, + "s0-s0:c0,c3": { + level: "s0-s0:c0,c3", + }, + "s0-s0:c0,c3.c6": { + level: "s0-s0:c0,c3.c6", + }, + "s0-s0:c0,c3.c6,c8.c10": { + level: "s0-s0:c0,c3.c6,c8.c10", + }, + "s0-s0:c0,c3.c6,c8,c10": { + level: "s0-s0:c0,c3.c6", + }, + "s0,c0,c3": { + level: "s0,c0,c3", + expectNoMatch: true, + }, + "s0:c0.c3.c6": { + level: "s0:c0.c3.c6", + expectNoMatch: true, + }, + "s0-s0,c0,c3": { + level: "s0-s0,c0,c3", + expectNoMatch: true, + }, + "s0-s0:c0.c3.c6": { + level: "s0-s0:c0.c3.c6", + expectNoMatch: true, + }, + "s0-s0:c0,c3.c6.c8": { + level: "s0-s0:c0,c3.c6.c8", + expectNoMatch: true, + }, + } { + t.Run(desc, func(t *testing.T) { + err := checkSelinuxLevel(test.level) + if test.expectNoMatch { + assert.Error(t, err) + } else { + assert.NoError(t, err) + } + }) + } +} diff --git a/pkg/cri/sbserver/podsandbox/helpers_test.go b/pkg/cri/sbserver/podsandbox/helpers_test.go new file mode 100644 index 000000000..1edb2cada --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/helpers_test.go @@ -0,0 +1,483 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "context" + "os" + "strings" + "testing" + + "github.com/containerd/containerd/containers" + "github.com/containerd/containerd/oci" + criconfig "github.com/containerd/containerd/pkg/cri/config" + "github.com/containerd/containerd/plugin" + "github.com/containerd/containerd/protobuf/types" + "github.com/containerd/containerd/reference/docker" + "github.com/containerd/containerd/runtime/linux/runctypes" + runcoptions "github.com/containerd/containerd/runtime/v2/runc/options" + "github.com/containerd/typeurl" + + imagedigest "github.com/opencontainers/go-digest" + runtimespec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pelletier/go-toml" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestGetUserFromImage tests the logic of getting image uid or user name of image user. +func TestGetUserFromImage(t *testing.T) { + newI64 := func(i int64) *int64 { return &i } + for c, test := range map[string]struct { + user string + uid *int64 + name string + }{ + "no gid": { + user: "0", + uid: newI64(0), + }, + "uid/gid": { + user: "0:1", + uid: newI64(0), + }, + "empty user": { + user: "", + }, + "multiple separators": { + user: "1:2:3", + uid: newI64(1), + }, + "root username": { + user: "root:root", + name: "root", + }, + "username": { + user: "test:test", + name: "test", + }, + } { + t.Run(c, func(t *testing.T) { + actualUID, actualName := getUserFromImage(test.user) + assert.Equal(t, test.uid, actualUID) + assert.Equal(t, test.name, actualName) + }) + } +} + +func TestGetRepoDigestAndTag(t *testing.T) { + digest := imagedigest.Digest("sha256:e6693c20186f837fc393390135d8a598a96a833917917789d63766cab6c59582") + for desc, test := range map[string]struct { + ref string + schema1 bool + expectedRepoDigest string + expectedRepoTag string + }{ + "repo tag should be empty if original ref has no tag": { + ref: "gcr.io/library/busybox@" + digest.String(), + expectedRepoDigest: "gcr.io/library/busybox@" + digest.String(), + }, + "repo tag should not be empty if original ref has tag": { + ref: "gcr.io/library/busybox:latest", + expectedRepoDigest: "gcr.io/library/busybox@" + digest.String(), + expectedRepoTag: "gcr.io/library/busybox:latest", + }, + "repo digest should be empty if original ref is schema1 and has no digest": { + ref: "gcr.io/library/busybox:latest", + schema1: true, + expectedRepoDigest: "", + expectedRepoTag: "gcr.io/library/busybox:latest", + }, + "repo digest should not be empty if original ref is schema1 but has digest": { + ref: "gcr.io/library/busybox@sha256:e6693c20186f837fc393390135d8a598a96a833917917789d63766cab6c59594", + schema1: true, + expectedRepoDigest: "gcr.io/library/busybox@sha256:e6693c20186f837fc393390135d8a598a96a833917917789d63766cab6c59594", + expectedRepoTag: "", + }, + } { + t.Run(desc, func(t *testing.T) { + named, err := docker.ParseDockerRef(test.ref) + assert.NoError(t, err) + repoDigest, repoTag := getRepoDigestAndTag(named, digest, test.schema1) + assert.Equal(t, test.expectedRepoDigest, repoDigest) + assert.Equal(t, test.expectedRepoTag, repoTag) + }) + } +} + +func TestBuildLabels(t *testing.T) { + imageConfigLabels := map[string]string{ + "a": "z", + "d": "y", + "long-label": strings.Repeat("example", 10000), + } + configLabels := map[string]string{ + "a": "b", + "c": "d", + } + newLabels := buildLabels(configLabels, imageConfigLabels, containerKindSandbox) + assert.Len(t, newLabels, 4) + assert.Equal(t, "b", newLabels["a"]) + assert.Equal(t, "d", newLabels["c"]) + assert.Equal(t, "y", newLabels["d"]) + assert.Equal(t, containerKindSandbox, newLabels[containerKindLabel]) + assert.NotContains(t, newLabels, "long-label") + + newLabels["a"] = "e" + assert.Empty(t, configLabels[containerKindLabel], "should not add new labels into original label") + assert.Equal(t, "b", configLabels["a"], "change in new labels should not affect original label") +} + +func TestParseImageReferences(t *testing.T) { + refs := []string{ + "gcr.io/library/busybox@sha256:e6693c20186f837fc393390135d8a598a96a833917917789d63766cab6c59582", + "gcr.io/library/busybox:1.2", + "sha256:e6693c20186f837fc393390135d8a598a96a833917917789d63766cab6c59582", + "arbitrary-ref", + } + expectedTags := []string{ + "gcr.io/library/busybox:1.2", + } + expectedDigests := []string{"gcr.io/library/busybox@sha256:e6693c20186f837fc393390135d8a598a96a833917917789d63766cab6c59582"} + tags, digests := parseImageReferences(refs) + assert.Equal(t, expectedTags, tags) + assert.Equal(t, expectedDigests, digests) +} + +func TestGenerateRuntimeOptions(t *testing.T) { + nilOpts := ` +systemd_cgroup = true +[containerd] + no_pivot = true + default_runtime_name = "default" +[containerd.runtimes.legacy] + runtime_type = "` + plugin.RuntimeLinuxV1 + `" +[containerd.runtimes.runc] + runtime_type = "` + plugin.RuntimeRuncV1 + `" +[containerd.runtimes.runcv2] + runtime_type = "` + plugin.RuntimeRuncV2 + `" +` + nonNilOpts := ` +systemd_cgroup = true +[containerd] + no_pivot = true + default_runtime_name = "default" +[containerd.runtimes.legacy] + runtime_type = "` + plugin.RuntimeLinuxV1 + `" +[containerd.runtimes.legacy.options] + Runtime = "legacy" + RuntimeRoot = "/legacy" +[containerd.runtimes.runc] + runtime_type = "` + plugin.RuntimeRuncV1 + `" +[containerd.runtimes.runc.options] + BinaryName = "runc" + Root = "/runc" + NoNewKeyring = true +[containerd.runtimes.runcv2] + runtime_type = "` + plugin.RuntimeRuncV2 + `" +[containerd.runtimes.runcv2.options] + BinaryName = "runc" + Root = "/runcv2" + NoNewKeyring = true +` + var nilOptsConfig, nonNilOptsConfig criconfig.Config + tree, err := toml.Load(nilOpts) + require.NoError(t, err) + err = tree.Unmarshal(&nilOptsConfig) + require.NoError(t, err) + require.Len(t, nilOptsConfig.Runtimes, 3) + + tree, err = toml.Load(nonNilOpts) + require.NoError(t, err) + err = tree.Unmarshal(&nonNilOptsConfig) + require.NoError(t, err) + require.Len(t, nonNilOptsConfig.Runtimes, 3) + + for desc, test := range map[string]struct { + r criconfig.Runtime + c criconfig.Config + expectedOptions interface{} + }{ + "when options is nil, should return nil option for io.containerd.runc.v1": { + r: nilOptsConfig.Runtimes["runc"], + c: nilOptsConfig, + expectedOptions: nil, + }, + "when options is nil, should return nil option for io.containerd.runc.v2": { + r: nilOptsConfig.Runtimes["runcv2"], + c: nilOptsConfig, + expectedOptions: nil, + }, + "when options is nil, should use legacy fields for legacy runtime": { + r: nilOptsConfig.Runtimes["legacy"], + c: nilOptsConfig, + expectedOptions: &runctypes.RuncOptions{ + SystemdCgroup: true, + }, + }, + "when options is not nil, should be able to decode for io.containerd.runc.v1": { + r: nonNilOptsConfig.Runtimes["runc"], + c: nonNilOptsConfig, + expectedOptions: &runcoptions.Options{ + BinaryName: "runc", + Root: "/runc", + NoNewKeyring: true, + }, + }, + "when options is not nil, should be able to decode for io.containerd.runc.v2": { + r: nonNilOptsConfig.Runtimes["runcv2"], + c: nonNilOptsConfig, + expectedOptions: &runcoptions.Options{ + BinaryName: "runc", + Root: "/runcv2", + NoNewKeyring: true, + }, + }, + "when options is not nil, should be able to decode for legacy runtime": { + r: nonNilOptsConfig.Runtimes["legacy"], + c: nonNilOptsConfig, + expectedOptions: &runctypes.RuncOptions{ + Runtime: "legacy", + RuntimeRoot: "/legacy", + }, + }, + } { + t.Run(desc, func(t *testing.T) { + opts, err := generateRuntimeOptions(test.r, test.c) + assert.NoError(t, err) + assert.Equal(t, test.expectedOptions, opts) + }) + } +} + +func TestEnvDeduplication(t *testing.T) { + for desc, test := range map[string]struct { + existing []string + kv [][2]string + expected []string + }{ + "single env": { + kv: [][2]string{ + {"a", "b"}, + }, + expected: []string{"a=b"}, + }, + "multiple envs": { + kv: [][2]string{ + {"a", "b"}, + {"c", "d"}, + {"e", "f"}, + }, + expected: []string{ + "a=b", + "c=d", + "e=f", + }, + }, + "env override": { + kv: [][2]string{ + {"k1", "v1"}, + {"k2", "v2"}, + {"k3", "v3"}, + {"k3", "v4"}, + {"k1", "v5"}, + {"k4", "v6"}, + }, + expected: []string{ + "k1=v5", + "k2=v2", + "k3=v4", + "k4=v6", + }, + }, + "existing env": { + existing: []string{ + "k1=v1", + "k2=v2", + "k3=v3", + }, + kv: [][2]string{ + {"k3", "v4"}, + {"k2", "v5"}, + {"k4", "v6"}, + }, + expected: []string{ + "k1=v1", + "k2=v5", + "k3=v4", + "k4=v6", + }, + }, + } { + t.Run(desc, func(t *testing.T) { + var spec runtimespec.Spec + if len(test.existing) > 0 { + spec.Process = &runtimespec.Process{ + Env: test.existing, + } + } + for _, kv := range test.kv { + oci.WithEnv([]string{kv[0] + "=" + kv[1]})(context.Background(), nil, nil, &spec) + } + assert.Equal(t, test.expected, spec.Process.Env) + }) + } +} + +func TestPassThroughAnnotationsFilter(t *testing.T) { + for desc, test := range map[string]struct { + podAnnotations map[string]string + runtimePodAnnotations []string + passthroughAnnotations map[string]string + }{ + "should support direct match": { + podAnnotations: map[string]string{"c": "d", "d": "e"}, + runtimePodAnnotations: []string{"c"}, + passthroughAnnotations: map[string]string{"c": "d"}, + }, + "should support wildcard match": { + podAnnotations: map[string]string{ + "t.f": "j", + "z.g": "o", + "z": "o", + "y.ca": "b", + "y": "b", + }, + runtimePodAnnotations: []string{"*.f", "z*g", "y.c*"}, + passthroughAnnotations: map[string]string{ + "t.f": "j", + "z.g": "o", + "y.ca": "b", + }, + }, + "should support wildcard match all": { + podAnnotations: map[string]string{ + "t.f": "j", + "z.g": "o", + "z": "o", + "y.ca": "b", + "y": "b", + }, + runtimePodAnnotations: []string{"*"}, + passthroughAnnotations: map[string]string{ + "t.f": "j", + "z.g": "o", + "z": "o", + "y.ca": "b", + "y": "b", + }, + }, + "should support match including path separator": { + podAnnotations: map[string]string{ + "matchend.com/end": "1", + "matchend.com/end1": "2", + "matchend.com/1end": "3", + "matchmid.com/mid": "4", + "matchmid.com/mi1d": "5", + "matchmid.com/mid1": "6", + "matchhead.com/head": "7", + "matchhead.com/1head": "8", + "matchhead.com/head1": "9", + "matchall.com/abc": "10", + "matchall.com/def": "11", + "end/matchend": "12", + "end1/matchend": "13", + "1end/matchend": "14", + "mid/matchmid": "15", + "mi1d/matchmid": "16", + "mid1/matchmid": "17", + "head/matchhead": "18", + "1head/matchhead": "19", + "head1/matchhead": "20", + "abc/matchall": "21", + "def/matchall": "22", + "match1/match2": "23", + "nomatch/nomatch": "24", + }, + runtimePodAnnotations: []string{ + "matchend.com/end*", + "matchmid.com/mi*d", + "matchhead.com/*head", + "matchall.com/*", + "end*/matchend", + "mi*d/matchmid", + "*head/matchhead", + "*/matchall", + "match*/match*", + }, + passthroughAnnotations: map[string]string{ + "matchend.com/end": "1", + "matchend.com/end1": "2", + "matchmid.com/mid": "4", + "matchmid.com/mi1d": "5", + "matchhead.com/head": "7", + "matchhead.com/1head": "8", + "matchall.com/abc": "10", + "matchall.com/def": "11", + "end/matchend": "12", + "end1/matchend": "13", + "mid/matchmid": "15", + "mi1d/matchmid": "16", + "head/matchhead": "18", + "1head/matchhead": "19", + "abc/matchall": "21", + "def/matchall": "22", + "match1/match2": "23", + }, + }, + } { + t.Run(desc, func(t *testing.T) { + passthroughAnnotations := getPassthroughAnnotations(test.podAnnotations, test.runtimePodAnnotations) + assert.Equal(t, test.passthroughAnnotations, passthroughAnnotations) + }) + } +} + +func TestEnsureRemoveAllNotExist(t *testing.T) { + // should never return an error for a non-existent path + if err := ensureRemoveAll(context.Background(), "/non/existent/path"); err != nil { + t.Fatal(err) + } +} + +func TestEnsureRemoveAllWithDir(t *testing.T) { + dir := t.TempDir() + if err := ensureRemoveAll(context.Background(), dir); err != nil { + t.Fatal(err) + } +} + +func TestEnsureRemoveAllWithFile(t *testing.T) { + tmp, err := os.CreateTemp("", "test-ensure-removeall-with-dir") + if err != nil { + t.Fatal(err) + } + tmp.Close() + if err := ensureRemoveAll(context.Background(), tmp.Name()); err != nil { + t.Fatal(err) + } +} + +func TestGetRuntimeOptions(t *testing.T) { + _, err := getRuntimeOptions(containers.Container{}) + require.NoError(t, err) + + var pbany *types.Any // This is nil. + var typeurlAny typeurl.Any = pbany // This is typed nil. + _, err = getRuntimeOptions(containers.Container{Runtime: containers.RuntimeInfo{Options: typeurlAny}}) + require.NoError(t, err) +} diff --git a/pkg/cri/sbserver/podsandbox/helpers_windows.go b/pkg/cri/sbserver/podsandbox/helpers_windows.go new file mode 100644 index 000000000..a98563fd2 --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/helpers_windows.go @@ -0,0 +1,168 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "context" + "os" + "path/filepath" + "syscall" + + "github.com/opencontainers/runtime-spec/specs-go" +) + +// openLogFile opens/creates a container log file. +// It specifies `FILE_SHARE_DELETE` option to make sure +// log files can be rotated by kubelet. +// TODO(windows): Use golang support after 1.14. (https://github.com/golang/go/issues/32088) +func openLogFile(path string) (*os.File, error) { + path = fixLongPath(path) + if len(path) == 0 { + return nil, syscall.ERROR_FILE_NOT_FOUND + } + pathp, err := syscall.UTF16PtrFromString(path) + if err != nil { + return nil, err + } + createmode := uint32(syscall.OPEN_ALWAYS) + access := uint32(syscall.FILE_APPEND_DATA) + sharemode := uint32(syscall.FILE_SHARE_READ | syscall.FILE_SHARE_WRITE | syscall.FILE_SHARE_DELETE) + h, err := syscall.CreateFile(pathp, access, sharemode, nil, createmode, syscall.FILE_ATTRIBUTE_NORMAL, 0) + if err != nil { + return nil, err + } + return os.NewFile(uintptr(h), path), nil +} + +// Copyright (c) 2009 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// fixLongPath returns the extended-length (\\?\-prefixed) form of +// path when needed, in order to avoid the default 260 character file +// path limit imposed by Windows. If path is not easily converted to +// the extended-length form (for example, if path is a relative path +// or contains .. elements), or is short enough, fixLongPath returns +// path unmodified. +// +// See https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx#maxpath +// +// This is copied from https://golang.org/src/path/filepath/path_windows.go. +func fixLongPath(path string) string { + // Do nothing (and don't allocate) if the path is "short". + // Empirically (at least on the Windows Server 2013 builder), + // the kernel is arbitrarily okay with < 248 bytes. That + // matches what the docs above say: + // "When using an API to create a directory, the specified + // path cannot be so long that you cannot append an 8.3 file + // name (that is, the directory name cannot exceed MAX_PATH + // minus 12)." Since MAX_PATH is 260, 260 - 12 = 248. + // + // The MSDN docs appear to say that a normal path that is 248 bytes long + // will work; empirically the path must be less then 248 bytes long. + if len(path) < 248 { + // Don't fix. (This is how Go 1.7 and earlier worked, + // not automatically generating the \\?\ form) + return path + } + + // The extended form begins with \\?\, as in + // \\?\c:\windows\foo.txt or \\?\UNC\server\share\foo.txt. + // The extended form disables evaluation of . and .. path + // elements and disables the interpretation of / as equivalent + // to \. The conversion here rewrites / to \ and elides + // . elements as well as trailing or duplicate separators. For + // simplicity it avoids the conversion entirely for relative + // paths or paths containing .. elements. For now, + // \\server\share paths are not converted to + // \\?\UNC\server\share paths because the rules for doing so + // are less well-specified. + if len(path) >= 2 && path[:2] == `\\` { + // Don't canonicalize UNC paths. + return path + } + if !filepath.IsAbs(path) { + // Relative path + return path + } + + const prefix = `\\?` + + pathbuf := make([]byte, len(prefix)+len(path)+len(`\`)) + copy(pathbuf, prefix) + n := len(path) + r, w := 0, len(prefix) + for r < n { + switch { + case os.IsPathSeparator(path[r]): + // empty block + r++ + case path[r] == '.' && (r+1 == n || os.IsPathSeparator(path[r+1])): + // /./ + r++ + case r+1 < n && path[r] == '.' && path[r+1] == '.' && (r+2 == n || os.IsPathSeparator(path[r+2])): + // /../ is currently unhandled + return path + default: + pathbuf[w] = '\\' + w++ + for ; r < n && !os.IsPathSeparator(path[r]); r++ { + pathbuf[w] = path[r] + w++ + } + } + } + // A drive's root directory needs a trailing \ + if w == len(`\\?\c:`) { + pathbuf[w] = '\\' + w++ + } + return string(pathbuf[:w]) +} + +// ensureRemoveAll is a wrapper for os.RemoveAll on Windows. +func ensureRemoveAll(_ context.Context, dir string) error { + return os.RemoveAll(dir) +} + +func modifyProcessLabel(runtimeType string, spec *specs.Spec) error { + return nil +} diff --git a/pkg/cri/sbserver/podsandbox/opts.go b/pkg/cri/sbserver/podsandbox/opts.go new file mode 100644 index 000000000..539a2b3d5 --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/opts.go @@ -0,0 +1,51 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "context" + + "github.com/containerd/containerd" + "github.com/containerd/containerd/log" + "github.com/containerd/nri" + v1 "github.com/containerd/nri/types/v1" +) + +// WithNRISandboxDelete calls delete for a sandbox'd task +func WithNRISandboxDelete(sandboxID string) containerd.ProcessDeleteOpts { + return func(ctx context.Context, p containerd.Process) error { + task, ok := p.(containerd.Task) + if !ok { + return nil + } + nric, err := nri.New() + if err != nil { + log.G(ctx).WithError(err).Error("unable to create nri client") + return nil + } + if nric == nil { + return nil + } + sb := &nri.Sandbox{ + ID: sandboxID, + } + if _, err := nric.InvokeWithSandbox(ctx, task, v1.Delete, sb); err != nil { + log.G(ctx).WithError(err).Errorf("Failed to delete nri for %q", task.ID()) + } + return nil + } +} diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run.go b/pkg/cri/sbserver/podsandbox/sandbox_run.go new file mode 100644 index 000000000..542ccef92 --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/sandbox_run.go @@ -0,0 +1,302 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "context" + "errors" + "fmt" + + "github.com/containerd/containerd" + containerdio "github.com/containerd/containerd/cio" + "github.com/containerd/containerd/errdefs" + "github.com/containerd/containerd/log" + "github.com/containerd/containerd/pkg/cri/annotations" + criconfig "github.com/containerd/containerd/pkg/cri/config" + customopts "github.com/containerd/containerd/pkg/cri/opts" + sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox" + ctrdutil "github.com/containerd/containerd/pkg/cri/util" + "github.com/containerd/containerd/snapshots" + "github.com/containerd/nri" + v1 "github.com/containerd/nri/types/v1" + "github.com/containerd/typeurl" + "github.com/davecgh/go-spew/spew" + "github.com/opencontainers/selinux/go-selinux" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +func init() { + typeurl.Register(&sandboxstore.Metadata{}, + "github.com/containerd/cri/pkg/store/sandbox", "Metadata") +} + +func (c *Controller) Start(ctx context.Context, id string) (_ uint32, retErr error) { + sandboxInfo, err := c.client.SandboxStore().Get(ctx, id) + if err != nil { + return 0, fmt.Errorf("unable to find sandbox with id %q: %w", id, err) + } + + var metadata sandboxstore.Metadata + if err := sandboxInfo.GetExtension("metadata", &metadata); err != nil { + return 0, fmt.Errorf("failed to get sandbox %q metadata: %w", id, err) + } + + var config = metadata.Config + + // Ensure sandbox container image snapshot. + image, err := c.cri.EnsureImageExists(ctx, c.config.SandboxImage, config) + if err != nil { + return 0, fmt.Errorf("failed to get sandbox image %q: %w", c.config.SandboxImage, err) + } + + containerdImage, err := c.toContainerdImage(ctx, *image) + if err != nil { + return 0, fmt.Errorf("failed to get image from containerd %q: %w", image.ID, err) + } + + ociRuntime, err := c.getSandboxRuntime(config, sandboxInfo.Runtime.Name) + if err != nil { + return 0, fmt.Errorf("failed to get sandbox runtime: %w", err) + } + log.G(ctx).WithField("podsandboxid", id).Debugf("use OCI runtime %+v", ociRuntime) + + // Create sandbox container. + // NOTE: sandboxContainerSpec SHOULD NOT have side + // effect, e.g. accessing/creating files, so that we can test + // it safely. + spec, err := c.sandboxContainerSpec(id, config, &image.ImageSpec.Config, metadata.NetNSPath, ociRuntime.PodAnnotations) + if err != nil { + return 0, fmt.Errorf("failed to generate sandbox container spec: %w", err) + } + log.G(ctx).WithField("podsandboxid", id).Debugf("sandbox container spec: %#+v", spew.NewFormatter(spec)) + + processLabel := spec.Process.SelinuxLabel + defer func() { + if retErr != nil { + selinux.ReleaseLabel(processLabel) + } + }() + + // handle any KVM based runtime + if err := modifyProcessLabel(ociRuntime.Type, spec); err != nil { + return 0, err + } + + if config.GetLinux().GetSecurityContext().GetPrivileged() { + // If privileged don't set selinux label, but we still record the MCS label so that + // the unused label can be freed later. + spec.Process.SelinuxLabel = "" + } + + // Generate spec options that will be applied to the spec later. + specOpts, err := c.sandboxContainerSpecOpts(config, &image.ImageSpec.Config) + if err != nil { + return 0, fmt.Errorf("failed to generate sandbox container spec options: %w", err) + } + + sandboxLabels := buildLabels(config.Labels, image.ImageSpec.Config.Labels, containerKindSandbox) + + runtimeOpts, err := generateRuntimeOptions(ociRuntime, c.config) + if err != nil { + return 0, fmt.Errorf("failed to generate runtime options: %w", err) + } + + snapshotterOpt := snapshots.WithLabels(snapshots.FilterInheritedLabels(config.Annotations)) + opts := []containerd.NewContainerOpts{ + containerd.WithSnapshotter(c.runtimeSnapshotter(ctx, ociRuntime)), + customopts.WithNewSnapshot(id, containerdImage, snapshotterOpt), + containerd.WithSpec(spec, specOpts...), + containerd.WithContainerLabels(sandboxLabels), + containerd.WithContainerExtension(sandboxMetadataExtension, &metadata), + containerd.WithRuntime(ociRuntime.Type, runtimeOpts)} + + container, err := c.client.NewContainer(ctx, id, opts...) + if err != nil { + return 0, fmt.Errorf("failed to create containerd container: %w", err) + } + defer func() { + if retErr != nil { + deferCtx, deferCancel := ctrdutil.DeferContext() + defer deferCancel() + if err := container.Delete(deferCtx, containerd.WithSnapshotCleanup); err != nil { + log.G(ctx).WithError(err).Errorf("Failed to delete containerd container %q", id) + } + } + }() + + // Create sandbox container root directories. + sandboxRootDir := c.getSandboxRootDir(id) + if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil { + return 0, fmt.Errorf("failed to create sandbox root directory %q: %w", + sandboxRootDir, err) + } + defer func() { + if retErr != nil { + // Cleanup the sandbox root directory. + if err := c.os.RemoveAll(sandboxRootDir); err != nil { + log.G(ctx).WithError(err).Errorf("Failed to remove sandbox root directory %q", + sandboxRootDir) + } + } + }() + + volatileSandboxRootDir := c.getVolatileSandboxRootDir(id) + if err := c.os.MkdirAll(volatileSandboxRootDir, 0755); err != nil { + return 0, fmt.Errorf("failed to create volatile sandbox root directory %q: %w", + volatileSandboxRootDir, err) + } + defer func() { + if retErr != nil { + // Cleanup the volatile sandbox root directory. + if err := c.os.RemoveAll(volatileSandboxRootDir); err != nil { + log.G(ctx).WithError(err).Errorf("Failed to remove volatile sandbox root directory %q", + volatileSandboxRootDir) + } + } + }() + + // Setup files required for the sandbox. + if err = c.setupSandboxFiles(id, config); err != nil { + return 0, fmt.Errorf("failed to setup sandbox files: %w", err) + } + defer func() { + if retErr != nil { + if err = c.cleanupSandboxFiles(id, config); err != nil { + log.G(ctx).WithError(err).Errorf("Failed to cleanup sandbox files in %q", + sandboxRootDir) + } + } + }() + + // Update sandbox created timestamp. + info, err := container.Info(ctx) // TODO: return info.CreatedAt. + if err != nil { + return 0, fmt.Errorf("failed to get sandbox container info: %w", err) + } + log.G(ctx).Debugf("container info: %+v", info) + + // Create sandbox task in containerd. + log.G(ctx).Tracef("Create sandbox container (id=%q, name=%q).", id, metadata.Name) + + taskOpts := c.taskOpts(ociRuntime.Type) + if ociRuntime.Path != "" { + taskOpts = append(taskOpts, containerd.WithRuntimePath(ociRuntime.Path)) + } + + // We don't need stdio for sandbox container. + task, err := container.NewTask(ctx, containerdio.NullIO, taskOpts...) + if err != nil { + return 0, fmt.Errorf("failed to create containerd task: %w", err) + } + defer func() { + if retErr != nil { + deferCtx, deferCancel := ctrdutil.DeferContext() + defer deferCancel() + // Cleanup the sandbox container if an error is returned. + if _, err := task.Delete(deferCtx, WithNRISandboxDelete(id), containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) { + log.G(ctx).WithError(err).Errorf("Failed to delete sandbox container %q", id) + } + } + }() + + // wait is a long running background request, no timeout needed. + exitCh, err := task.Wait(ctrdutil.NamespacedContext()) + if err != nil { + return 0, fmt.Errorf("failed to wait for sandbox container task: %w", err) + } + + nric, err := nri.New() + if err != nil { + return 0, fmt.Errorf("unable to create nri client: %w", err) + } + if nric != nil { + nriSB := &nri.Sandbox{ + ID: id, + Labels: config.Labels, + } + if _, err := nric.InvokeWithSandbox(ctx, task, v1.Create, nriSB); err != nil { + return 0, fmt.Errorf("nri invoke: %w", err) + } + } + + if err := task.Start(ctx); err != nil { + return 0, fmt.Errorf("failed to start sandbox container task %q: %w", id, err) + } + + // start the monitor after adding sandbox into the store, this ensures + // that sandbox is in the store, when event monitor receives the TaskExit event. + // + // TaskOOM from containerd may come before sandbox is added to store, + // but we don't care about sandbox TaskOOM right now, so it is fine. + c.cri.StartSandboxExitMonitor(context.Background(), id, task.Pid(), exitCh) // TODO: Move back to CRI service. + + return 0, nil +} + +// untrustedWorkload returns true if the sandbox contains untrusted workload. +func untrustedWorkload(config *runtime.PodSandboxConfig) bool { + return config.GetAnnotations()[annotations.UntrustedWorkload] == "true" +} + +// hostAccessingSandbox returns true if the sandbox configuration +// requires additional host access for the sandbox. +func hostAccessingSandbox(config *runtime.PodSandboxConfig) bool { + securityContext := config.GetLinux().GetSecurityContext() + + namespaceOptions := securityContext.GetNamespaceOptions() + if namespaceOptions.GetNetwork() == runtime.NamespaceMode_NODE || + namespaceOptions.GetPid() == runtime.NamespaceMode_NODE || + namespaceOptions.GetIpc() == runtime.NamespaceMode_NODE { + return true + } + + return false +} + +// getSandboxRuntime returns the runtime configuration for sandbox. +// If the sandbox contains untrusted workload, runtime for untrusted workload will be returned, +// or else default runtime will be returned. +func (c *Controller) getSandboxRuntime(config *runtime.PodSandboxConfig, runtimeHandler string) (criconfig.Runtime, error) { + if untrustedWorkload(config) { + // If the untrusted annotation is provided, runtimeHandler MUST be empty. + if runtimeHandler != "" && runtimeHandler != criconfig.RuntimeUntrusted { + return criconfig.Runtime{}, errors.New("untrusted workload with explicit runtime handler is not allowed") + } + + // If the untrusted workload is requesting access to the host/node, this request will fail. + // + // Note: If the workload is marked untrusted but requests privileged, this can be granted, as the + // runtime may support this. For example, in a virtual-machine isolated runtime, privileged + // is a supported option, granting the workload to access the entire guest VM instead of host. + // TODO(windows): Deprecate this so that we don't need to handle it for windows. + if hostAccessingSandbox(config) { + return criconfig.Runtime{}, errors.New("untrusted workload with host access is not allowed") + } + + runtimeHandler = criconfig.RuntimeUntrusted + } + + if runtimeHandler == "" { + runtimeHandler = c.config.ContainerdConfig.DefaultRuntimeName + } + + handler, ok := c.config.ContainerdConfig.Runtimes[runtimeHandler] + if !ok { + return criconfig.Runtime{}, fmt.Errorf("no runtime for %q is configured", runtimeHandler) + } + return handler, nil +} diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run_linux.go b/pkg/cri/sbserver/podsandbox/sandbox_run_linux.go new file mode 100644 index 000000000..06116d73f --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/sandbox_run_linux.go @@ -0,0 +1,350 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "fmt" + "os" + "strconv" + "strings" + + "github.com/containerd/containerd" + "github.com/containerd/containerd/oci" + "github.com/containerd/containerd/plugin" + imagespec "github.com/opencontainers/image-spec/specs-go/v1" + runtimespec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux" + "golang.org/x/sys/unix" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + + "github.com/containerd/containerd/pkg/cri/annotations" + customopts "github.com/containerd/containerd/pkg/cri/opts" + osinterface "github.com/containerd/containerd/pkg/os" + "github.com/containerd/containerd/pkg/userns" +) + +func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig, + imageConfig *imagespec.ImageConfig, nsPath string, runtimePodAnnotations []string) (_ *runtimespec.Spec, retErr error) { + // Creates a spec Generator with the default spec. + // TODO(random-liu): [P1] Compare the default settings with docker and containerd default. + specOpts := []oci.SpecOpts{ + oci.WithoutRunMount, + customopts.WithoutDefaultSecuritySettings, + customopts.WithRelativeRoot(relativeRootfsPath), + oci.WithEnv(imageConfig.Env), + oci.WithRootFSReadonly(), + oci.WithHostname(config.GetHostname()), + } + if imageConfig.WorkingDir != "" { + specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir)) + } + + if len(imageConfig.Entrypoint) == 0 && len(imageConfig.Cmd) == 0 { + // Pause image must have entrypoint or cmd. + return nil, fmt.Errorf("invalid empty entrypoint and cmd in image config %+v", imageConfig) + } + specOpts = append(specOpts, oci.WithProcessArgs(append(imageConfig.Entrypoint, imageConfig.Cmd...)...)) + + // Set cgroups parent. + if c.config.DisableCgroup { + specOpts = append(specOpts, customopts.WithDisabledCgroups) + } else { + if config.GetLinux().GetCgroupParent() != "" { + cgroupsPath := getCgroupsPath(config.GetLinux().GetCgroupParent(), id) + specOpts = append(specOpts, oci.WithCgroup(cgroupsPath)) + } + } + + // When cgroup parent is not set, containerd-shim will create container in a child cgroup + // of the cgroup itself is in. + // TODO(random-liu): [P2] Set default cgroup path if cgroup parent is not specified. + + // Set namespace options. + var ( + securityContext = config.GetLinux().GetSecurityContext() + nsOptions = securityContext.GetNamespaceOptions() + ) + if nsOptions.GetNetwork() == runtime.NamespaceMode_NODE { + specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.NetworkNamespace)) + specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.UTSNamespace)) + } else { + specOpts = append(specOpts, oci.WithLinuxNamespace( + runtimespec.LinuxNamespace{ + Type: runtimespec.NetworkNamespace, + Path: nsPath, + })) + } + if nsOptions.GetPid() == runtime.NamespaceMode_NODE { + specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.PIDNamespace)) + } + if nsOptions.GetIpc() == runtime.NamespaceMode_NODE { + specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.IPCNamespace)) + } + + // It's fine to generate the spec before the sandbox /dev/shm + // is actually created. + sandboxDevShm := c.getSandboxDevShm(id) + if nsOptions.GetIpc() == runtime.NamespaceMode_NODE { + sandboxDevShm = devShm + } + // Remove the default /dev/shm mount from defaultMounts, it is added in oci/mounts.go. + specOpts = append(specOpts, oci.WithoutMounts(devShm)) + // In future the when user-namespace is enabled, the `nosuid, nodev, noexec` flags are + // required, otherwise the remount will fail with EPERM. Just use them unconditionally, + // they are nice to have anyways. + specOpts = append(specOpts, oci.WithMounts([]runtimespec.Mount{ + { + Source: sandboxDevShm, + Destination: devShm, + Type: "bind", + Options: []string{"rbind", "ro", "nosuid", "nodev", "noexec"}, + }, + // Add resolv.conf for katacontainers to setup the DNS of pod VM properly. + { + Source: c.getResolvPath(id), + Destination: resolvConfPath, + Type: "bind", + Options: []string{"rbind", "ro"}, + }, + })) + + processLabel, mountLabel, err := initLabelsFromOpt(securityContext.GetSelinuxOptions()) + if err != nil { + return nil, fmt.Errorf("failed to init selinux options %+v: %w", securityContext.GetSelinuxOptions(), err) + } + defer func() { + if retErr != nil { + selinux.ReleaseLabel(processLabel) + } + }() + + supplementalGroups := securityContext.GetSupplementalGroups() + specOpts = append(specOpts, + customopts.WithSelinuxLabels(processLabel, mountLabel), + customopts.WithSupplementalGroups(supplementalGroups), + ) + + // Add sysctls + sysctls := config.GetLinux().GetSysctls() + if sysctls == nil { + sysctls = make(map[string]string) + } + _, ipUnprivilegedPortStart := sysctls["net.ipv4.ip_unprivileged_port_start"] + _, pingGroupRange := sysctls["net.ipv4.ping_group_range"] + if nsOptions.GetNetwork() != runtime.NamespaceMode_NODE { + if c.config.EnableUnprivilegedPorts && !ipUnprivilegedPortStart { + sysctls["net.ipv4.ip_unprivileged_port_start"] = "0" + } + if c.config.EnableUnprivilegedICMP && !pingGroupRange && !userns.RunningInUserNS() { + sysctls["net.ipv4.ping_group_range"] = "0 2147483647" + } + } + specOpts = append(specOpts, customopts.WithSysctls(sysctls)) + + // Note: LinuxSandboxSecurityContext does not currently provide an apparmor profile + + if !c.config.DisableCgroup { + specOpts = append(specOpts, customopts.WithDefaultSandboxShares) + } + + if res := config.GetLinux().GetResources(); res != nil { + specOpts = append(specOpts, + customopts.WithAnnotation(annotations.SandboxCPUPeriod, strconv.FormatInt(res.CpuPeriod, 10)), + customopts.WithAnnotation(annotations.SandboxCPUQuota, strconv.FormatInt(res.CpuQuota, 10)), + customopts.WithAnnotation(annotations.SandboxCPUShares, strconv.FormatInt(res.CpuShares, 10)), + customopts.WithAnnotation(annotations.SandboxMem, strconv.FormatInt(res.MemoryLimitInBytes, 10))) + } + + specOpts = append(specOpts, customopts.WithPodOOMScoreAdj(int(defaultSandboxOOMAdj), c.config.RestrictOOMScoreAdj)) + + for pKey, pValue := range getPassthroughAnnotations(config.Annotations, + runtimePodAnnotations) { + specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue)) + } + + specOpts = append(specOpts, + customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeSandbox), + customopts.WithAnnotation(annotations.SandboxID, id), + customopts.WithAnnotation(annotations.SandboxNamespace, config.GetMetadata().GetNamespace()), + customopts.WithAnnotation(annotations.SandboxName, config.GetMetadata().GetName()), + customopts.WithAnnotation(annotations.SandboxLogDir, config.GetLogDirectory()), + ) + + return c.runtimeSpec(id, "", specOpts...) +} + +// sandboxContainerSpecOpts generates OCI spec options for +// the sandbox container. +func (c *Controller) sandboxContainerSpecOpts(config *runtime.PodSandboxConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) { + var ( + securityContext = config.GetLinux().GetSecurityContext() + specOpts []oci.SpecOpts + err error + ) + ssp := securityContext.GetSeccomp() + if ssp == nil { + ssp, err = generateSeccompSecurityProfile( + securityContext.GetSeccompProfilePath(), //nolint:staticcheck // Deprecated but we don't want to remove yet + c.config.UnsetSeccompProfile) + if err != nil { + return nil, fmt.Errorf("failed to generate seccomp spec opts: %w", err) + } + } + seccompSpecOpts, err := c.generateSeccompSpecOpts( + ssp, + securityContext.GetPrivileged(), + c.seccompEnabled()) + if err != nil { + return nil, fmt.Errorf("failed to generate seccomp spec opts: %w", err) + } + if seccompSpecOpts != nil { + specOpts = append(specOpts, seccompSpecOpts) + } + + userstr, err := generateUserString( + "", + securityContext.GetRunAsUser(), + securityContext.GetRunAsGroup(), + ) + if err != nil { + return nil, fmt.Errorf("failed to generate user string: %w", err) + } + if userstr == "" { + // Lastly, since no user override was passed via CRI try to set via OCI + // Image + userstr = imageConfig.User + } + if userstr != "" { + specOpts = append(specOpts, oci.WithUser(userstr)) + } + return specOpts, nil +} + +// setupSandboxFiles sets up necessary sandbox files including /dev/shm, /etc/hosts, +// /etc/resolv.conf and /etc/hostname. +func (c *Controller) setupSandboxFiles(id string, config *runtime.PodSandboxConfig) error { + sandboxEtcHostname := c.getSandboxHostname(id) + hostname := config.GetHostname() + if hostname == "" { + var err error + hostname, err = c.os.Hostname() + if err != nil { + return fmt.Errorf("failed to get hostname: %w", err) + } + } + if err := c.os.WriteFile(sandboxEtcHostname, []byte(hostname+"\n"), 0644); err != nil { + return fmt.Errorf("failed to write hostname to %q: %w", sandboxEtcHostname, err) + } + + // TODO(random-liu): Consider whether we should maintain /etc/hosts and /etc/resolv.conf in kubelet. + sandboxEtcHosts := c.getSandboxHosts(id) + if err := c.os.CopyFile(etcHosts, sandboxEtcHosts, 0644); err != nil { + return fmt.Errorf("failed to generate sandbox hosts file %q: %w", sandboxEtcHosts, err) + } + + // Set DNS options. Maintain a resolv.conf for the sandbox. + var err error + resolvContent := "" + if dnsConfig := config.GetDnsConfig(); dnsConfig != nil { + resolvContent, err = parseDNSOptions(dnsConfig.Servers, dnsConfig.Searches, dnsConfig.Options) + if err != nil { + return fmt.Errorf("failed to parse sandbox DNSConfig %+v: %w", dnsConfig, err) + } + } + resolvPath := c.getResolvPath(id) + if resolvContent == "" { + // copy host's resolv.conf to resolvPath + err = c.os.CopyFile(resolvConfPath, resolvPath, 0644) + if err != nil { + return fmt.Errorf("failed to copy host's resolv.conf to %q: %w", resolvPath, err) + } + } else { + err = c.os.WriteFile(resolvPath, []byte(resolvContent), 0644) + if err != nil { + return fmt.Errorf("failed to write resolv content to %q: %w", resolvPath, err) + } + } + + // Setup sandbox /dev/shm. + if config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetIpc() == runtime.NamespaceMode_NODE { + if _, err := c.os.Stat(devShm); err != nil { + return fmt.Errorf("host %q is not available for host ipc: %w", devShm, err) + } + } else { + sandboxDevShm := c.getSandboxDevShm(id) + if err := c.os.MkdirAll(sandboxDevShm, 0700); err != nil { + return fmt.Errorf("failed to create sandbox shm: %w", err) + } + shmproperty := fmt.Sprintf("mode=1777,size=%d", defaultShmSize) + if err := c.os.(osinterface.UNIX).Mount("shm", sandboxDevShm, "tmpfs", uintptr(unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV), shmproperty); err != nil { + return fmt.Errorf("failed to mount sandbox shm: %w", err) + } + } + + return nil +} + +// parseDNSOptions parse DNS options into resolv.conf format content, +// if none option is specified, will return empty with no error. +func parseDNSOptions(servers, searches, options []string) (string, error) { + resolvContent := "" + + if len(searches) > 0 { + resolvContent += fmt.Sprintf("search %s\n", strings.Join(searches, " ")) + } + + if len(servers) > 0 { + resolvContent += fmt.Sprintf("nameserver %s\n", strings.Join(servers, "\nnameserver ")) + } + + if len(options) > 0 { + resolvContent += fmt.Sprintf("options %s\n", strings.Join(options, " ")) + } + + return resolvContent, nil +} + +// cleanupSandboxFiles unmount some sandbox files, we rely on the removal of sandbox root directory to +// remove these files. Unmount should *NOT* return error if the mount point is already unmounted. +func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxConfig) error { + if config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetIpc() != runtime.NamespaceMode_NODE { + path, err := c.os.FollowSymlinkInScope(c.getSandboxDevShm(id), "/") + if err != nil { + return fmt.Errorf("failed to follow symlink: %w", err) + } + if err := c.os.(osinterface.UNIX).Unmount(path); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to unmount %q: %w", path, err) + } + } + return nil +} + +// taskOpts generates task options for a (sandbox) container. +func (c *Controller) taskOpts(runtimeType string) []containerd.NewTaskOpts { + // TODO(random-liu): Remove this after shim v1 is deprecated. + var taskOpts []containerd.NewTaskOpts + + // c.config.NoPivot is only supported for RuntimeLinuxV1 = "io.containerd.runtime.v1.linux" legacy linux runtime + // and is not supported for RuntimeRuncV1 = "io.containerd.runc.v1" or RuntimeRuncV2 = "io.containerd.runc.v2" + // for RuncV1/2 no pivot is set under the containerd.runtimes.runc.options config see + // https://github.com/containerd/containerd/blob/v1.3.2/runtime/v2/runc/options/oci.pb.go#L26 + if c.config.NoPivot && runtimeType == plugin.RuntimeLinuxV1 { + taskOpts = append(taskOpts, containerd.WithNoPivotRoot) + } + + return taskOpts +} diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run_linux_test.go b/pkg/cri/sbserver/podsandbox/sandbox_run_linux_test.go new file mode 100644 index 000000000..37f9fcdf4 --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/sandbox_run_linux_test.go @@ -0,0 +1,526 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "os" + "path/filepath" + "strconv" + "testing" + + imagespec "github.com/opencontainers/image-spec/specs-go/v1" + runtimespec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + v1 "k8s.io/cri-api/pkg/apis/runtime/v1" + + "github.com/containerd/containerd/pkg/cri/annotations" + "github.com/containerd/containerd/pkg/cri/opts" + ostesting "github.com/containerd/containerd/pkg/os/testing" +) + +func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { + config := &runtime.PodSandboxConfig{ + Metadata: &runtime.PodSandboxMetadata{ + Name: "test-name", + Uid: "test-uid", + Namespace: "test-ns", + Attempt: 1, + }, + Hostname: "test-hostname", + LogDirectory: "test-log-directory", + Labels: map[string]string{"a": "b"}, + Annotations: map[string]string{"c": "d"}, + Linux: &runtime.LinuxPodSandboxConfig{ + CgroupParent: "/test/cgroup/parent", + }, + } + imageConfig := &imagespec.ImageConfig{ + Env: []string{"a=b", "c=d"}, + Entrypoint: []string{"/pause"}, + Cmd: []string{"forever"}, + WorkingDir: "/workspace", + } + specCheck := func(t *testing.T, id string, spec *runtimespec.Spec) { + assert.Equal(t, "test-hostname", spec.Hostname) + assert.Equal(t, getCgroupsPath("/test/cgroup/parent", id), spec.Linux.CgroupsPath) + assert.Equal(t, relativeRootfsPath, spec.Root.Path) + assert.Equal(t, true, spec.Root.Readonly) + assert.Contains(t, spec.Process.Env, "a=b", "c=d") + assert.Equal(t, []string{"/pause", "forever"}, spec.Process.Args) + assert.Equal(t, "/workspace", spec.Process.Cwd) + assert.EqualValues(t, *spec.Linux.Resources.CPU.Shares, opts.DefaultSandboxCPUshares) + assert.EqualValues(t, *spec.Process.OOMScoreAdj, defaultSandboxOOMAdj) + + t.Logf("Check PodSandbox annotations") + assert.Contains(t, spec.Annotations, annotations.SandboxID) + assert.EqualValues(t, spec.Annotations[annotations.SandboxID], id) + + assert.Contains(t, spec.Annotations, annotations.ContainerType) + assert.EqualValues(t, spec.Annotations[annotations.ContainerType], annotations.ContainerTypeSandbox) + + assert.Contains(t, spec.Annotations, annotations.SandboxNamespace) + assert.EqualValues(t, spec.Annotations[annotations.SandboxNamespace], "test-ns") + + assert.Contains(t, spec.Annotations, annotations.SandboxName) + assert.EqualValues(t, spec.Annotations[annotations.SandboxName], "test-name") + + assert.Contains(t, spec.Annotations, annotations.SandboxLogDir) + assert.EqualValues(t, spec.Annotations[annotations.SandboxLogDir], "test-log-directory") + + if selinux.GetEnabled() { + assert.NotEqual(t, "", spec.Process.SelinuxLabel) + assert.NotEqual(t, "", spec.Linux.MountLabel) + } + } + return config, imageConfig, specCheck +} + +func TestLinuxSandboxContainerSpec(t *testing.T) { + testID := "test-id" + nsPath := "test-cni" + for desc, test := range map[string]struct { + configChange func(*runtime.PodSandboxConfig) + specCheck func(*testing.T, *runtimespec.Spec) + expectErr bool + }{ + "spec should reflect original config": { + specCheck: func(t *testing.T, spec *runtimespec.Spec) { + // runtime spec should have expected namespaces enabled by default. + require.NotNil(t, spec.Linux) + assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.NetworkNamespace, + Path: nsPath, + }) + assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.UTSNamespace, + }) + assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.PIDNamespace, + }) + assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.IPCNamespace, + }) + assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "0") + assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") + }, + }, + "host namespace": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + Network: runtime.NamespaceMode_NODE, + Pid: runtime.NamespaceMode_NODE, + Ipc: runtime.NamespaceMode_NODE, + }, + } + }, + specCheck: func(t *testing.T, spec *runtimespec.Spec) { + // runtime spec should disable expected namespaces in host mode. + require.NotNil(t, spec.Linux) + assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.NetworkNamespace, + }) + assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.UTSNamespace, + }) + assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.PIDNamespace, + }) + assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.IPCNamespace, + }) + assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "0") + assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") + }, + }, + "should set supplemental groups correctly": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + SupplementalGroups: []int64{1111, 2222}, + } + }, + specCheck: func(t *testing.T, spec *runtimespec.Spec) { + require.NotNil(t, spec.Process) + assert.Contains(t, spec.Process.User.AdditionalGids, uint32(1111)) + assert.Contains(t, spec.Process.User.AdditionalGids, uint32(2222)) + }, + }, + "should overwrite default sysctls": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.Sysctls = map[string]string{ + "net.ipv4.ip_unprivileged_port_start": "500", + "net.ipv4.ping_group_range": "1 1000", + } + }, + specCheck: func(t *testing.T, spec *runtimespec.Spec) { + require.NotNil(t, spec.Process) + assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "500") + assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "1 1000") + }, + }, + "sandbox sizing annotations should be set if LinuxContainerResources were provided": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.Resources = &v1.LinuxContainerResources{ + CpuPeriod: 100, + CpuQuota: 200, + CpuShares: 5000, + MemoryLimitInBytes: 1024, + } + }, + specCheck: func(t *testing.T, spec *runtimespec.Spec) { + value, ok := spec.Annotations[annotations.SandboxCPUPeriod] + assert.True(t, ok) + assert.EqualValues(t, strconv.FormatInt(100, 10), value) + assert.EqualValues(t, "100", value) + + value, ok = spec.Annotations[annotations.SandboxCPUQuota] + assert.True(t, ok) + assert.EqualValues(t, "200", value) + + value, ok = spec.Annotations[annotations.SandboxCPUShares] + assert.True(t, ok) + assert.EqualValues(t, "5000", value) + + value, ok = spec.Annotations[annotations.SandboxMem] + assert.True(t, ok) + assert.EqualValues(t, "1024", value) + }, + }, + "sandbox sizing annotations should not be set if LinuxContainerResources were not provided": { + specCheck: func(t *testing.T, spec *runtimespec.Spec) { + _, ok := spec.Annotations[annotations.SandboxCPUPeriod] + assert.False(t, ok) + _, ok = spec.Annotations[annotations.SandboxCPUQuota] + assert.False(t, ok) + _, ok = spec.Annotations[annotations.SandboxCPUShares] + assert.False(t, ok) + _, ok = spec.Annotations[annotations.SandboxMem] + assert.False(t, ok) + }, + }, + "sandbox sizing annotations are zero if the resources are set to 0": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.Resources = &v1.LinuxContainerResources{} + }, + specCheck: func(t *testing.T, spec *runtimespec.Spec) { + value, ok := spec.Annotations[annotations.SandboxCPUPeriod] + assert.True(t, ok) + assert.EqualValues(t, "0", value) + value, ok = spec.Annotations[annotations.SandboxCPUQuota] + assert.True(t, ok) + assert.EqualValues(t, "0", value) + value, ok = spec.Annotations[annotations.SandboxCPUShares] + assert.True(t, ok) + assert.EqualValues(t, "0", value) + value, ok = spec.Annotations[annotations.SandboxMem] + assert.True(t, ok) + assert.EqualValues(t, "0", value) + }, + }, + } { + t.Run(desc, func(t *testing.T) { + c := newControllerService() + c.config.EnableUnprivilegedICMP = true + c.config.EnableUnprivilegedPorts = true + config, imageConfig, specCheck := getRunPodSandboxTestData() + if test.configChange != nil { + test.configChange(config) + } + spec, err := c.sandboxContainerSpec(testID, config, imageConfig, nsPath, nil) + if test.expectErr { + assert.Error(t, err) + assert.Nil(t, spec) + return + } + assert.NoError(t, err) + assert.NotNil(t, spec) + specCheck(t, testID, spec) + if test.specCheck != nil { + test.specCheck(t, spec) + } + }) + } +} + +func TestSetupSandboxFiles(t *testing.T) { + const ( + testID = "test-id" + realhostname = "test-real-hostname" + ) + for desc, test := range map[string]struct { + dnsConfig *runtime.DNSConfig + hostname string + ipcMode runtime.NamespaceMode + expectedCalls []ostesting.CalledDetail + }{ + "should check host /dev/shm existence when ipc mode is NODE": { + ipcMode: runtime.NamespaceMode_NODE, + expectedCalls: []ostesting.CalledDetail{ + { + Name: "Hostname", + }, + { + Name: "WriteFile", + Arguments: []interface{}{ + filepath.Join(testRootDir, sandboxesDir, testID, "hostname"), + []byte(realhostname + "\n"), + os.FileMode(0644), + }, + }, + { + Name: "CopyFile", + Arguments: []interface{}{ + "/etc/hosts", + filepath.Join(testRootDir, sandboxesDir, testID, "hosts"), + os.FileMode(0644), + }, + }, + { + Name: "CopyFile", + Arguments: []interface{}{ + "/etc/resolv.conf", + filepath.Join(testRootDir, sandboxesDir, testID, "resolv.conf"), + os.FileMode(0644), + }, + }, + { + Name: "Stat", + Arguments: []interface{}{"/dev/shm"}, + }, + }, + }, + "should create new /etc/resolv.conf if DNSOptions is set": { + dnsConfig: &runtime.DNSConfig{ + Servers: []string{"8.8.8.8"}, + Searches: []string{"114.114.114.114"}, + Options: []string{"timeout:1"}, + }, + ipcMode: runtime.NamespaceMode_NODE, + expectedCalls: []ostesting.CalledDetail{ + { + Name: "Hostname", + }, + { + Name: "WriteFile", + Arguments: []interface{}{ + filepath.Join(testRootDir, sandboxesDir, testID, "hostname"), + []byte(realhostname + "\n"), + os.FileMode(0644), + }, + }, + { + Name: "CopyFile", + Arguments: []interface{}{ + "/etc/hosts", + filepath.Join(testRootDir, sandboxesDir, testID, "hosts"), + os.FileMode(0644), + }, + }, + { + Name: "WriteFile", + Arguments: []interface{}{ + filepath.Join(testRootDir, sandboxesDir, testID, "resolv.conf"), + []byte(`search 114.114.114.114 +nameserver 8.8.8.8 +options timeout:1 +`), os.FileMode(0644), + }, + }, + { + Name: "Stat", + Arguments: []interface{}{"/dev/shm"}, + }, + }, + }, + "should create sandbox shm when ipc namespace mode is not NODE": { + ipcMode: runtime.NamespaceMode_POD, + expectedCalls: []ostesting.CalledDetail{ + { + Name: "Hostname", + }, + { + Name: "WriteFile", + Arguments: []interface{}{ + filepath.Join(testRootDir, sandboxesDir, testID, "hostname"), + []byte(realhostname + "\n"), + os.FileMode(0644), + }, + }, + { + Name: "CopyFile", + Arguments: []interface{}{ + "/etc/hosts", + filepath.Join(testRootDir, sandboxesDir, testID, "hosts"), + os.FileMode(0644), + }, + }, + { + Name: "CopyFile", + Arguments: []interface{}{ + "/etc/resolv.conf", + filepath.Join(testRootDir, sandboxesDir, testID, "resolv.conf"), + os.FileMode(0644), + }, + }, + { + Name: "MkdirAll", + Arguments: []interface{}{ + filepath.Join(testStateDir, sandboxesDir, testID, "shm"), + os.FileMode(0700), + }, + }, + { + Name: "Mount", + // Ignore arguments which are too complex to check. + }, + }, + }, + "should create /etc/hostname when hostname is set": { + hostname: "test-hostname", + ipcMode: runtime.NamespaceMode_NODE, + expectedCalls: []ostesting.CalledDetail{ + { + Name: "WriteFile", + Arguments: []interface{}{ + filepath.Join(testRootDir, sandboxesDir, testID, "hostname"), + []byte("test-hostname\n"), + os.FileMode(0644), + }, + }, + { + Name: "CopyFile", + Arguments: []interface{}{ + "/etc/hosts", + filepath.Join(testRootDir, sandboxesDir, testID, "hosts"), + os.FileMode(0644), + }, + }, + { + Name: "CopyFile", + Arguments: []interface{}{ + "/etc/resolv.conf", + filepath.Join(testRootDir, sandboxesDir, testID, "resolv.conf"), + os.FileMode(0644), + }, + }, + { + Name: "Stat", + Arguments: []interface{}{"/dev/shm"}, + }, + }, + }, + } { + t.Run(desc, func(t *testing.T) { + c := newControllerService() + c.os.(*ostesting.FakeOS).HostnameFn = func() (string, error) { + return realhostname, nil + } + cfg := &runtime.PodSandboxConfig{ + Hostname: test.hostname, + DnsConfig: test.dnsConfig, + Linux: &runtime.LinuxPodSandboxConfig{ + SecurityContext: &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + Ipc: test.ipcMode, + }, + }, + }, + } + c.setupSandboxFiles(testID, cfg) + calls := c.os.(*ostesting.FakeOS).GetCalls() + assert.Len(t, calls, len(test.expectedCalls)) + for i, expected := range test.expectedCalls { + if expected.Arguments == nil { + // Ignore arguments. + expected.Arguments = calls[i].Arguments + } + assert.Equal(t, expected, calls[i]) + } + }) + } +} + +func TestParseDNSOption(t *testing.T) { + for desc, test := range map[string]struct { + servers []string + searches []string + options []string + expectedContent string + expectErr bool + }{ + "empty dns options should return empty content": {}, + "non-empty dns options should return correct content": { + servers: []string{"8.8.8.8", "server.google.com"}, + searches: []string{"114.114.114.114"}, + options: []string{"timeout:1"}, + expectedContent: `search 114.114.114.114 +nameserver 8.8.8.8 +nameserver server.google.com +options timeout:1 +`, + }, + "expanded dns config should return correct content on modern libc (e.g. glibc 2.26 and above)": { + servers: []string{"8.8.8.8", "server.google.com"}, + searches: []string{ + "server0.google.com", + "server1.google.com", + "server2.google.com", + "server3.google.com", + "server4.google.com", + "server5.google.com", + "server6.google.com", + }, + options: []string{"timeout:1"}, + expectedContent: `search server0.google.com server1.google.com server2.google.com server3.google.com server4.google.com server5.google.com server6.google.com +nameserver 8.8.8.8 +nameserver server.google.com +options timeout:1 +`, + }, + } { + t.Run(desc, func(t *testing.T) { + resolvContent, err := parseDNSOptions(test.servers, test.searches, test.options) + if test.expectErr { + assert.Error(t, err) + return + } + assert.NoError(t, err) + assert.Equal(t, resolvContent, test.expectedContent) + }) + } +} + +func TestSandboxDisableCgroup(t *testing.T) { + config, imageConfig, _ := getRunPodSandboxTestData() + c := newControllerService() + c.config.DisableCgroup = true + spec, err := c.sandboxContainerSpec("test-id", config, imageConfig, "test-cni", []string{}) + require.NoError(t, err) + + t.Log("resource limit should not be set") + assert.Nil(t, spec.Linux.Resources.Memory) + assert.Nil(t, spec.Linux.Resources.CPU) + + t.Log("cgroup path should be empty") + assert.Empty(t, spec.Linux.CgroupsPath) +} + +// TODO(random-liu): [P1] Add unit test for different error cases to make sure +// the function cleans up on error properly. diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run_other.go b/pkg/cri/sbserver/podsandbox/sandbox_run_other.go new file mode 100644 index 000000000..b629566ba --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/sandbox_run_other.go @@ -0,0 +1,56 @@ +//go:build !windows && !linux +// +build !windows,!linux + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "github.com/containerd/containerd" + "github.com/containerd/containerd/oci" + imagespec "github.com/opencontainers/image-spec/specs-go/v1" + runtimespec "github.com/opencontainers/runtime-spec/specs-go" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig, + imageConfig *imagespec.ImageConfig, nsPath string, runtimePodAnnotations []string) (_ *runtimespec.Spec, retErr error) { + return c.runtimeSpec(id, "") +} + +// sandboxContainerSpecOpts generates OCI spec options for +// the sandbox container. +func (c *Controller) sandboxContainerSpecOpts(config *runtime.PodSandboxConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) { + return []oci.SpecOpts{}, nil +} + +// setupSandboxFiles sets up necessary sandbox files including /dev/shm, /etc/hosts, +// /etc/resolv.conf and /etc/hostname. +func (c *Controller) setupSandboxFiles(id string, config *runtime.PodSandboxConfig) error { + return nil +} + +// cleanupSandboxFiles unmount some sandbox files, we rely on the removal of sandbox root directory to +// remove these files. Unmount should *NOT* return error if the mount point is already unmounted. +func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxConfig) error { + return nil +} + +// taskOpts generates task options for a (sandbox) container. +func (c *Controller) taskOpts(runtimeType string) []containerd.NewTaskOpts { + return []containerd.NewTaskOpts{} +} diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run_other_test.go b/pkg/cri/sbserver/podsandbox/sandbox_run_other_test.go new file mode 100644 index 000000000..5d5ad96ec --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/sandbox_run_other_test.go @@ -0,0 +1,36 @@ +//go:build !windows && !linux +// +build !windows,!linux + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "testing" + + imagespec "github.com/opencontainers/image-spec/specs-go/v1" + runtimespec "github.com/opencontainers/runtime-spec/specs-go" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { + config := &runtime.PodSandboxConfig{} + imageConfig := &imagespec.ImageConfig{} + specCheck := func(t *testing.T, id string, spec *runtimespec.Spec) { + } + return config, imageConfig, specCheck +} diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run_test.go b/pkg/cri/sbserver/podsandbox/sandbox_run_test.go new file mode 100644 index 000000000..7d775d307 --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/sandbox_run_test.go @@ -0,0 +1,372 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + goruntime "runtime" + "testing" + + "github.com/containerd/typeurl" + imagespec "github.com/opencontainers/image-spec/specs-go/v1" + runtimespec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/stretchr/testify/assert" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + + "github.com/containerd/containerd/pkg/cri/annotations" + criconfig "github.com/containerd/containerd/pkg/cri/config" + sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox" +) + +func TestSandboxContainerSpec(t *testing.T) { + switch goruntime.GOOS { + case "darwin": + t.Skip("not implemented on Darwin") + case "freebsd": + t.Skip("not implemented on FreeBSD") + } + testID := "test-id" + nsPath := "test-cni" + for desc, test := range map[string]struct { + configChange func(*runtime.PodSandboxConfig) + podAnnotations []string + imageConfigChange func(*imagespec.ImageConfig) + specCheck func(*testing.T, *runtimespec.Spec) + expectErr bool + }{ + "should return error when entrypoint and cmd are empty": { + imageConfigChange: func(c *imagespec.ImageConfig) { + c.Entrypoint = nil + c.Cmd = nil + }, + expectErr: true, + }, + "a passthrough annotation should be passed as an OCI annotation": { + podAnnotations: []string{"c"}, + specCheck: func(t *testing.T, spec *runtimespec.Spec) { + assert.Equal(t, spec.Annotations["c"], "d") + }, + }, + "a non-passthrough annotation should not be passed as an OCI annotation": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Annotations["d"] = "e" + }, + podAnnotations: []string{"c"}, + specCheck: func(t *testing.T, spec *runtimespec.Spec) { + assert.Equal(t, spec.Annotations["c"], "d") + _, ok := spec.Annotations["d"] + assert.False(t, ok) + }, + }, + "passthrough annotations should support wildcard match": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Annotations["t.f"] = "j" + c.Annotations["z.g"] = "o" + c.Annotations["z"] = "o" + c.Annotations["y.ca"] = "b" + c.Annotations["y"] = "b" + }, + podAnnotations: []string{"t*", "z.*", "y.c*"}, + specCheck: func(t *testing.T, spec *runtimespec.Spec) { + assert.Equal(t, spec.Annotations["t.f"], "j") + assert.Equal(t, spec.Annotations["z.g"], "o") + assert.Equal(t, spec.Annotations["y.ca"], "b") + _, ok := spec.Annotations["y"] + assert.False(t, ok) + _, ok = spec.Annotations["z"] + assert.False(t, ok) + }, + }, + } { + t.Run(desc, func(t *testing.T) { + c := newControllerService() + config, imageConfig, specCheck := getRunPodSandboxTestData() + if test.configChange != nil { + test.configChange(config) + } + + if test.imageConfigChange != nil { + test.imageConfigChange(imageConfig) + } + spec, err := c.sandboxContainerSpec(testID, config, imageConfig, nsPath, + test.podAnnotations) + if test.expectErr { + assert.Error(t, err) + assert.Nil(t, spec) + return + } + assert.NoError(t, err) + assert.NotNil(t, spec) + specCheck(t, testID, spec) + if test.specCheck != nil { + test.specCheck(t, spec) + } + }) + } +} + +func TestTypeurlMarshalUnmarshalSandboxMeta(t *testing.T) { + for desc, test := range map[string]struct { + configChange func(*runtime.PodSandboxConfig) + }{ + "should marshal original config": {}, + "should marshal Linux": { + configChange: func(c *runtime.PodSandboxConfig) { + if c.Linux == nil { + c.Linux = &runtime.LinuxPodSandboxConfig{} + } + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + Network: runtime.NamespaceMode_NODE, + Pid: runtime.NamespaceMode_NODE, + Ipc: runtime.NamespaceMode_NODE, + }, + SupplementalGroups: []int64{1111, 2222}, + } + }, + }, + } { + t.Run(desc, func(t *testing.T) { + meta := &sandboxstore.Metadata{ + ID: "1", + Name: "sandbox_1", + NetNSPath: "/home/cloud", + } + meta.Config, _, _ = getRunPodSandboxTestData() + if test.configChange != nil { + test.configChange(meta.Config) + } + + any, err := typeurl.MarshalAny(meta) + assert.NoError(t, err) + data, err := typeurl.UnmarshalAny(any) + assert.NoError(t, err) + assert.IsType(t, &sandboxstore.Metadata{}, data) + curMeta, ok := data.(*sandboxstore.Metadata) + assert.True(t, ok) + assert.Equal(t, meta, curMeta) + }) + } +} + +func TestHostAccessingSandbox(t *testing.T) { + privilegedContext := &runtime.PodSandboxConfig{ + Linux: &runtime.LinuxPodSandboxConfig{ + SecurityContext: &runtime.LinuxSandboxSecurityContext{ + Privileged: true, + }, + }, + } + nonPrivilegedContext := &runtime.PodSandboxConfig{ + Linux: &runtime.LinuxPodSandboxConfig{ + SecurityContext: &runtime.LinuxSandboxSecurityContext{ + Privileged: false, + }, + }, + } + hostNamespace := &runtime.PodSandboxConfig{ + Linux: &runtime.LinuxPodSandboxConfig{ + SecurityContext: &runtime.LinuxSandboxSecurityContext{ + Privileged: false, + NamespaceOptions: &runtime.NamespaceOption{ + Network: runtime.NamespaceMode_NODE, + Pid: runtime.NamespaceMode_NODE, + Ipc: runtime.NamespaceMode_NODE, + }, + }, + }, + } + tests := []struct { + name string + config *runtime.PodSandboxConfig + want bool + }{ + {"Security Context is nil", nil, false}, + {"Security Context is privileged", privilegedContext, false}, + {"Security Context is not privileged", nonPrivilegedContext, false}, + {"Security Context namespace host access", hostNamespace, true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := hostAccessingSandbox(tt.config); got != tt.want { + t.Errorf("hostAccessingSandbox() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestGetSandboxRuntime(t *testing.T) { + untrustedWorkloadRuntime := criconfig.Runtime{ + Type: "io.containerd.runtime.v1.linux", + Engine: "untrusted-workload-runtime", + Root: "", + } + + defaultRuntime := criconfig.Runtime{ + Type: "io.containerd.runtime.v1.linux", + Engine: "default-runtime", + Root: "", + } + + fooRuntime := criconfig.Runtime{ + Type: "io.containerd.runtime.v1.linux", + Engine: "foo-bar", + Root: "", + } + + for desc, test := range map[string]struct { + sandboxConfig *runtime.PodSandboxConfig + runtimeHandler string + runtimes map[string]criconfig.Runtime + expectErr bool + expectedRuntime criconfig.Runtime + }{ + "should return error if untrusted workload requires host access": { + sandboxConfig: &runtime.PodSandboxConfig{ + Linux: &runtime.LinuxPodSandboxConfig{ + SecurityContext: &runtime.LinuxSandboxSecurityContext{ + Privileged: false, + NamespaceOptions: &runtime.NamespaceOption{ + Network: runtime.NamespaceMode_NODE, + Pid: runtime.NamespaceMode_NODE, + Ipc: runtime.NamespaceMode_NODE, + }, + }, + }, + Annotations: map[string]string{ + annotations.UntrustedWorkload: "true", + }, + }, + runtimes: map[string]criconfig.Runtime{ + criconfig.RuntimeDefault: defaultRuntime, + criconfig.RuntimeUntrusted: untrustedWorkloadRuntime, + }, + expectErr: true, + }, + "should use untrusted workload runtime for untrusted workload": { + sandboxConfig: &runtime.PodSandboxConfig{ + Annotations: map[string]string{ + annotations.UntrustedWorkload: "true", + }, + }, + runtimes: map[string]criconfig.Runtime{ + criconfig.RuntimeDefault: defaultRuntime, + criconfig.RuntimeUntrusted: untrustedWorkloadRuntime, + }, + expectedRuntime: untrustedWorkloadRuntime, + }, + "should use default runtime for regular workload": { + sandboxConfig: &runtime.PodSandboxConfig{}, + runtimes: map[string]criconfig.Runtime{ + criconfig.RuntimeDefault: defaultRuntime, + }, + expectedRuntime: defaultRuntime, + }, + "should use default runtime for trusted workload": { + sandboxConfig: &runtime.PodSandboxConfig{ + Annotations: map[string]string{ + annotations.UntrustedWorkload: "false", + }, + }, + runtimes: map[string]criconfig.Runtime{ + criconfig.RuntimeDefault: defaultRuntime, + criconfig.RuntimeUntrusted: untrustedWorkloadRuntime, + }, + expectedRuntime: defaultRuntime, + }, + "should return error if untrusted workload runtime is required but not configured": { + sandboxConfig: &runtime.PodSandboxConfig{ + Annotations: map[string]string{ + annotations.UntrustedWorkload: "true", + }, + }, + runtimes: map[string]criconfig.Runtime{ + criconfig.RuntimeDefault: defaultRuntime, + }, + expectErr: true, + }, + "should use 'untrusted' runtime for untrusted workload": { + sandboxConfig: &runtime.PodSandboxConfig{ + Annotations: map[string]string{ + annotations.UntrustedWorkload: "true", + }, + }, + runtimes: map[string]criconfig.Runtime{ + criconfig.RuntimeDefault: defaultRuntime, + criconfig.RuntimeUntrusted: untrustedWorkloadRuntime, + }, + expectedRuntime: untrustedWorkloadRuntime, + }, + "should use 'untrusted' runtime for untrusted workload & handler": { + sandboxConfig: &runtime.PodSandboxConfig{ + Annotations: map[string]string{ + annotations.UntrustedWorkload: "true", + }, + }, + runtimeHandler: "untrusted", + runtimes: map[string]criconfig.Runtime{ + criconfig.RuntimeDefault: defaultRuntime, + criconfig.RuntimeUntrusted: untrustedWorkloadRuntime, + }, + expectedRuntime: untrustedWorkloadRuntime, + }, + "should return an error if untrusted annotation with conflicting handler": { + sandboxConfig: &runtime.PodSandboxConfig{ + Annotations: map[string]string{ + annotations.UntrustedWorkload: "true", + }, + }, + runtimeHandler: "foo", + runtimes: map[string]criconfig.Runtime{ + criconfig.RuntimeDefault: defaultRuntime, + criconfig.RuntimeUntrusted: untrustedWorkloadRuntime, + "foo": fooRuntime, + }, + expectErr: true, + }, + "should use correct runtime for a runtime handler": { + sandboxConfig: &runtime.PodSandboxConfig{}, + runtimeHandler: "foo", + runtimes: map[string]criconfig.Runtime{ + criconfig.RuntimeDefault: defaultRuntime, + criconfig.RuntimeUntrusted: untrustedWorkloadRuntime, + "foo": fooRuntime, + }, + expectedRuntime: fooRuntime, + }, + "should return error if runtime handler is required but not configured": { + sandboxConfig: &runtime.PodSandboxConfig{}, + runtimeHandler: "bar", + runtimes: map[string]criconfig.Runtime{ + criconfig.RuntimeDefault: defaultRuntime, + "foo": fooRuntime, + }, + expectErr: true, + }, + } { + t.Run(desc, func(t *testing.T) { + cri := newControllerService() + cri.config = criconfig.Config{ + PluginConfig: criconfig.DefaultConfig(), + } + cri.config.ContainerdConfig.DefaultRuntimeName = criconfig.RuntimeDefault + cri.config.ContainerdConfig.Runtimes = test.runtimes + r, err := cri.getSandboxRuntime(test.sandboxConfig, test.runtimeHandler) + assert.Equal(t, test.expectErr, err != nil) + assert.Equal(t, test.expectedRuntime, r) + }) + } +} diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run_windows.go b/pkg/cri/sbserver/podsandbox/sandbox_run_windows.go new file mode 100644 index 000000000..569281b32 --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/sandbox_run_windows.go @@ -0,0 +1,113 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "fmt" + "strconv" + + "github.com/containerd/containerd" + "github.com/containerd/containerd/oci" + imagespec "github.com/opencontainers/image-spec/specs-go/v1" + runtimespec "github.com/opencontainers/runtime-spec/specs-go" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + + "github.com/containerd/containerd/pkg/cri/annotations" + customopts "github.com/containerd/containerd/pkg/cri/opts" +) + +func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig, + imageConfig *imagespec.ImageConfig, nsPath string, runtimePodAnnotations []string) (*runtimespec.Spec, error) { + // Creates a spec Generator with the default spec. + specOpts := []oci.SpecOpts{ + oci.WithEnv(imageConfig.Env), + oci.WithHostname(config.GetHostname()), + } + if imageConfig.WorkingDir != "" { + specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir)) + } + + if len(imageConfig.Entrypoint) == 0 && len(imageConfig.Cmd) == 0 { + // Pause image must have entrypoint or cmd. + return nil, fmt.Errorf("invalid empty entrypoint and cmd in image config %+v", imageConfig) + } + specOpts = append(specOpts, oci.WithProcessArgs(append(imageConfig.Entrypoint, imageConfig.Cmd...)...)) + + specOpts = append(specOpts, + // Clear the root location since hcsshim expects it. + // NOTE: readonly rootfs doesn't work on windows. + customopts.WithoutRoot, + customopts.WithWindowsNetworkNamespace(nsPath), + ) + + specOpts = append(specOpts, customopts.WithWindowsDefaultSandboxShares) + + // Start with the image config user and override below if RunAsUsername is not "". + username := imageConfig.User + + runAsUser := config.GetWindows().GetSecurityContext().GetRunAsUsername() + if runAsUser != "" { + username = runAsUser + } + + cs := config.GetWindows().GetSecurityContext().GetCredentialSpec() + if cs != "" { + specOpts = append(specOpts, customopts.WithWindowsCredentialSpec(cs)) + } + + // There really isn't a good Windows way to verify that the username is available in the + // image as early as here like there is for Linux. Later on in the stack hcsshim + // will handle the behavior of erroring out if the user isn't available in the image + // when trying to run the init process. + specOpts = append(specOpts, oci.WithUser(username)) + + for pKey, pValue := range getPassthroughAnnotations(config.Annotations, + runtimePodAnnotations) { + specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue)) + } + + specOpts = append(specOpts, + customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeSandbox), + customopts.WithAnnotation(annotations.SandboxID, id), + customopts.WithAnnotation(annotations.SandboxNamespace, config.GetMetadata().GetNamespace()), + customopts.WithAnnotation(annotations.SandboxName, config.GetMetadata().GetName()), + customopts.WithAnnotation(annotations.SandboxLogDir, config.GetLogDirectory()), + customopts.WithAnnotation(annotations.WindowsHostProcess, strconv.FormatBool(config.GetWindows().GetSecurityContext().GetHostProcess())), + ) + + return c.runtimeSpec(id, "", specOpts...) +} + +// No sandbox container spec options for windows yet. +func (c *Controller) sandboxContainerSpecOpts(config *runtime.PodSandboxConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) { + return nil, nil +} + +// No sandbox files needed for windows. +func (c *Controller) setupSandboxFiles(id string, config *runtime.PodSandboxConfig) error { + return nil +} + +// No sandbox files needed for windows. +func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxConfig) error { + return nil +} + +// No task options needed for windows. +func (c *Controller) taskOpts(runtimeType string) []containerd.NewTaskOpts { + return nil +} diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run_windows_test.go b/pkg/cri/sbserver/podsandbox/sandbox_run_windows_test.go new file mode 100644 index 000000000..958de454f --- /dev/null +++ b/pkg/cri/sbserver/podsandbox/sandbox_run_windows_test.go @@ -0,0 +1,108 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package podsandbox + +import ( + "testing" + + imagespec "github.com/opencontainers/image-spec/specs-go/v1" + runtimespec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/stretchr/testify/assert" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + + "github.com/containerd/containerd/pkg/cri/annotations" + "github.com/containerd/containerd/pkg/cri/opts" +) + +func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { + config := &runtime.PodSandboxConfig{ + Metadata: &runtime.PodSandboxMetadata{ + Name: "test-name", + Uid: "test-uid", + Namespace: "test-ns", + Attempt: 1, + }, + Hostname: "test-hostname", + LogDirectory: "test-log-directory", + Labels: map[string]string{"a": "b"}, + Annotations: map[string]string{"c": "d"}, + Windows: &runtime.WindowsPodSandboxConfig{ + SecurityContext: &runtime.WindowsSandboxSecurityContext{ + RunAsUsername: "test-user", + CredentialSpec: "{\"test\": \"spec\"}", + HostProcess: false, + }, + }, + } + imageConfig := &imagespec.ImageConfig{ + Env: []string{"a=b", "c=d"}, + Entrypoint: []string{"/pause"}, + Cmd: []string{"forever"}, + WorkingDir: "/workspace", + User: "test-image-user", + } + specCheck := func(t *testing.T, id string, spec *runtimespec.Spec) { + assert.Equal(t, "test-hostname", spec.Hostname) + assert.Nil(t, spec.Root) + assert.Contains(t, spec.Process.Env, "a=b", "c=d") + assert.Equal(t, []string{"/pause", "forever"}, spec.Process.Args) + assert.Equal(t, "/workspace", spec.Process.Cwd) + assert.EqualValues(t, *spec.Windows.Resources.CPU.Shares, opts.DefaultSandboxCPUshares) + + // Also checks if override of the image configs user is behaving. + t.Logf("Check username") + assert.Contains(t, spec.Process.User.Username, "test-user") + + t.Logf("Check credential spec") + assert.Contains(t, spec.Windows.CredentialSpec, "{\"test\": \"spec\"}") + + t.Logf("Check PodSandbox annotations") + assert.Contains(t, spec.Annotations, annotations.SandboxID) + assert.EqualValues(t, spec.Annotations[annotations.SandboxID], id) + + assert.Contains(t, spec.Annotations, annotations.ContainerType) + assert.EqualValues(t, spec.Annotations[annotations.ContainerType], annotations.ContainerTypeSandbox) + + assert.Contains(t, spec.Annotations, annotations.SandboxNamespace) + assert.EqualValues(t, spec.Annotations[annotations.SandboxNamespace], "test-ns") + + assert.Contains(t, spec.Annotations, annotations.SandboxName) + assert.EqualValues(t, spec.Annotations[annotations.SandboxName], "test-name") + + assert.Contains(t, spec.Annotations, annotations.SandboxLogDir) + assert.EqualValues(t, spec.Annotations[annotations.SandboxLogDir], "test-log-directory") + + assert.Contains(t, spec.Annotations, annotations.WindowsHostProcess) + assert.EqualValues(t, spec.Annotations[annotations.WindowsHostProcess], "false") + } + return config, imageConfig, specCheck +} + +func TestSandboxWindowsNetworkNamespace(t *testing.T) { + testID := "test-id" + nsPath := "test-cni" + c := newControllerService() + + config, imageConfig, specCheck := getRunPodSandboxTestData() + spec, err := c.sandboxContainerSpec(testID, config, imageConfig, nsPath, nil) + assert.NoError(t, err) + assert.NotNil(t, spec) + specCheck(t, testID, spec) + assert.NotNil(t, spec.Windows) + assert.NotNil(t, spec.Windows.Network) + assert.Equal(t, nsPath, spec.Windows.Network.NetworkNamespace) +} diff --git a/pkg/cri/sbserver/sandbox_remove.go b/pkg/cri/sbserver/sandbox_remove.go index 5771a7a24..74cdb00e5 100644 --- a/pkg/cri/sbserver/sandbox_remove.go +++ b/pkg/cri/sbserver/sandbox_remove.go @@ -108,6 +108,10 @@ func (c *criService) RemovePodSandbox(ctx context.Context, r *runtime.RemovePodS // 3) On-going operations which have held the reference will not be affected. c.sandboxStore.Delete(id) + if err := c.client.SandboxStore().Delete(ctx, id); err != nil { + return nil, fmt.Errorf("failed to remove sandbox metadata from store: %w", err) + } + // Release the sandbox name reserved for the sandbox. c.sandboxNameIndex.ReleaseByKey(id) diff --git a/pkg/cri/sbserver/sandbox_run.go b/pkg/cri/sbserver/sandbox_run.go index 39f4a63a6..2c742167d 100644 --- a/pkg/cri/sbserver/sandbox_run.go +++ b/pkg/cri/sbserver/sandbox_run.go @@ -27,28 +27,20 @@ import ( "strings" "time" + sb "github.com/containerd/containerd/sandbox" "github.com/containerd/go-cni" - "github.com/containerd/nri" - v1 "github.com/containerd/nri/types/v1" "github.com/containerd/typeurl" - "github.com/davecgh/go-spew/spew" - "github.com/opencontainers/selinux/go-selinux" "github.com/sirupsen/logrus" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" - "github.com/containerd/containerd" - containerdio "github.com/containerd/containerd/cio" - "github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/log" "github.com/containerd/containerd/pkg/cri/annotations" criconfig "github.com/containerd/containerd/pkg/cri/config" - customopts "github.com/containerd/containerd/pkg/cri/opts" "github.com/containerd/containerd/pkg/cri/server/bandwidth" sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox" "github.com/containerd/containerd/pkg/cri/util" ctrdutil "github.com/containerd/containerd/pkg/cri/util" "github.com/containerd/containerd/pkg/netns" - "github.com/containerd/containerd/snapshots" ) func init() { @@ -82,6 +74,14 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox } }() + sandboxInfo := sb.Sandbox{ + ID: id, + Runtime: sb.RuntimeOpts{Name: r.GetRuntimeHandler()}, + } + + // Save sandbox name + sandboxInfo.AddLabel("name", name) + // Create initial internal sandbox object. sandbox := sandboxstore.NewSandbox( sandboxstore.Metadata{ @@ -95,23 +95,10 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox }, ) - // Ensure sandbox container image snapshot. - image, err := c.ensureImageExists(ctx, c.config.SandboxImage, config) - if err != nil { - return nil, fmt.Errorf("failed to get sandbox image %q: %w", c.config.SandboxImage, err) - } - containerdImage, err := c.toContainerdImage(ctx, *image) - if err != nil { - return nil, fmt.Errorf("failed to get image from containerd %q: %w", image.ID, err) - } - - ociRuntime, err := c.getSandboxRuntime(config, r.GetRuntimeHandler()) - if err != nil { - return nil, fmt.Errorf("failed to get sandbox runtime: %w", err) - } - log.G(ctx).WithField("podsandboxid", id).Debugf("use OCI runtime %+v", ociRuntime) - - podNetwork := true + var ( + podNetwork = true + err error + ) if goruntime.GOOS != "windows" && config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetNetwork() == runtime.NamespaceMode_NODE { @@ -169,176 +156,40 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox sandboxCreateNetworkTimer.UpdateSince(netStart) } + // Save sandbox metadata to store + if err := sandboxInfo.AddExtension("metadata", &sandbox.Metadata); err != nil { + return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err) + } + + if _, err := c.client.SandboxStore().Create(ctx, sandboxInfo); err != nil { + return nil, fmt.Errorf("failed to save sandbox metadata: %w", err) + } + runtimeStart := time.Now() - // Create sandbox container. - // NOTE: sandboxContainerSpec SHOULD NOT have side - // effect, e.g. accessing/creating files, so that we can test - // it safely. - spec, err := c.sandboxContainerSpec(id, config, &image.ImageSpec.Config, sandbox.NetNSPath, ociRuntime.PodAnnotations) + + pid, err := c.sandboxController.Start(ctx, id) if err != nil { - return nil, fmt.Errorf("failed to generate sandbox container spec: %w", err) - } - log.G(ctx).WithField("podsandboxid", id).Debugf("sandbox container spec: %#+v", spew.NewFormatter(spec)) - sandbox.ProcessLabel = spec.Process.SelinuxLabel - defer func() { - if retErr != nil { - selinux.ReleaseLabel(sandbox.ProcessLabel) - } - }() - - // handle any KVM based runtime - if err := modifyProcessLabel(ociRuntime.Type, spec); err != nil { - return nil, err + return nil, fmt.Errorf("failed to start sandbox %q: %w", id, err) } - if config.GetLinux().GetSecurityContext().GetPrivileged() { - // If privileged don't set selinux label, but we still record the MCS label so that - // the unused label can be freed later. - spec.Process.SelinuxLabel = "" - } - - // Generate spec options that will be applied to the spec later. - specOpts, err := c.sandboxContainerSpecOpts(config, &image.ImageSpec.Config) - if err != nil { - return nil, fmt.Errorf("failed to generate sandbox container spec options: %w", err) - } - - sandboxLabels := buildLabels(config.Labels, image.ImageSpec.Config.Labels, containerKindSandbox) - - runtimeOpts, err := generateRuntimeOptions(ociRuntime, c.config) - if err != nil { - return nil, fmt.Errorf("failed to generate runtime options: %w", err) - } - snapshotterOpt := snapshots.WithLabels(snapshots.FilterInheritedLabels(config.Annotations)) - opts := []containerd.NewContainerOpts{ - containerd.WithSnapshotter(c.runtimeSnapshotter(ctx, ociRuntime)), - customopts.WithNewSnapshot(id, containerdImage, snapshotterOpt), - containerd.WithSpec(spec, specOpts...), - containerd.WithContainerLabels(sandboxLabels), - containerd.WithContainerExtension(sandboxMetadataExtension, &sandbox.Metadata), - containerd.WithRuntime(ociRuntime.Type, runtimeOpts)} - - container, err := c.client.NewContainer(ctx, id, opts...) - if err != nil { - return nil, fmt.Errorf("failed to create containerd container: %w", err) - } - defer func() { - if retErr != nil { - deferCtx, deferCancel := ctrdutil.DeferContext() - defer deferCancel() - if err := container.Delete(deferCtx, containerd.WithSnapshotCleanup); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to delete containerd container %q", id) - } - } - }() - - // Create sandbox container root directories. - sandboxRootDir := c.getSandboxRootDir(id) - if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil { - return nil, fmt.Errorf("failed to create sandbox root directory %q: %w", - sandboxRootDir, err) - } - defer func() { - if retErr != nil { - // Cleanup the sandbox root directory. - if err := c.os.RemoveAll(sandboxRootDir); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to remove sandbox root directory %q", - sandboxRootDir) - } - } - }() - volatileSandboxRootDir := c.getVolatileSandboxRootDir(id) - if err := c.os.MkdirAll(volatileSandboxRootDir, 0755); err != nil { - return nil, fmt.Errorf("failed to create volatile sandbox root directory %q: %w", - volatileSandboxRootDir, err) - } - defer func() { - if retErr != nil { - // Cleanup the volatile sandbox root directory. - if err := c.os.RemoveAll(volatileSandboxRootDir); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to remove volatile sandbox root directory %q", - volatileSandboxRootDir) - } - } - }() - - // Setup files required for the sandbox. - if err = c.setupSandboxFiles(id, config); err != nil { - return nil, fmt.Errorf("failed to setup sandbox files: %w", err) - } - defer func() { - if retErr != nil { - if err = c.cleanupSandboxFiles(id, config); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to cleanup sandbox files in %q", - sandboxRootDir) - } - } - }() - - // Update sandbox created timestamp. - info, err := container.Info(ctx) - if err != nil { - return nil, fmt.Errorf("failed to get sandbox container info: %w", err) - } - - // Create sandbox task in containerd. - log.G(ctx).Tracef("Create sandbox container (id=%q, name=%q).", - id, name) - - taskOpts := c.taskOpts(ociRuntime.Type) - if ociRuntime.Path != "" { - taskOpts = append(taskOpts, containerd.WithRuntimePath(ociRuntime.Path)) - } - // We don't need stdio for sandbox container. - task, err := container.NewTask(ctx, containerdio.NullIO, taskOpts...) - if err != nil { - return nil, fmt.Errorf("failed to create containerd task: %w", err) - } - defer func() { - if retErr != nil { - deferCtx, deferCancel := ctrdutil.DeferContext() - defer deferCancel() - // Cleanup the sandbox container if an error is returned. - if _, err := task.Delete(deferCtx, WithNRISandboxDelete(id), containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) { - log.G(ctx).WithError(err).Errorf("Failed to delete sandbox container %q", id) - } - } - }() - - // wait is a long running background request, no timeout needed. - exitCh, err := task.Wait(ctrdutil.NamespacedContext()) - if err != nil { - return nil, fmt.Errorf("failed to wait for sandbox container task: %w", err) - } - - nric, err := nri.New() - if err != nil { - return nil, fmt.Errorf("unable to create nri client: %w", err) - } - if nric != nil { - nriSB := &nri.Sandbox{ - ID: id, - Labels: config.Labels, - } - if _, err := nric.InvokeWithSandbox(ctx, task, v1.Create, nriSB); err != nil { - return nil, fmt.Errorf("nri invoke: %w", err) - } - } - - if err := task.Start(ctx); err != nil { - return nil, fmt.Errorf("failed to start sandbox container task %q: %w", id, err) - } + createdAt := time.Now() // TODO: return created at from Start. if err := sandbox.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) { // Set the pod sandbox as ready after successfully start sandbox container. - status.Pid = task.Pid() + status.Pid = pid status.State = sandboxstore.StateReady - status.CreatedAt = info.CreatedAt + status.CreatedAt = createdAt return status, nil }); err != nil { return nil, fmt.Errorf("failed to update sandbox status: %w", err) } + // TODO: get rid of this. sandbox object should no longer have Container field. + container, err := c.client.LoadContainer(ctx, id) + if err != nil { + return nil, fmt.Errorf("failed to load container %q for sandbox: %w", id, err) + } + // Add sandbox into sandbox store in INIT state. sandbox.Container = container @@ -346,13 +197,11 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("failed to add sandbox %+v into store: %w", sandbox, err) } - // start the monitor after adding sandbox into the store, this ensures - // that sandbox is in the store, when event monitor receives the TaskExit event. - // - // TaskOOM from containerd may come before sandbox is added to store, - // but we don't care about sandbox TaskOOM right now, so it is fine. - c.eventMonitor.startSandboxExitMonitor(context.Background(), id, task.Pid(), exitCh) - + // TODO: Remove this, this is needed only to update the timer below. + ociRuntime, err := c.getSandboxRuntime(config, sandboxInfo.Runtime.Name) + if err != nil { + return nil, fmt.Errorf("failed to get sandbox runtime: %w", err) + } sandboxRuntimeCreateTimer.WithValues(ociRuntime.Type).UpdateSince(runtimeStart) return &runtime.RunPodSandboxResponse{PodSandboxId: id}, nil diff --git a/pkg/cri/sbserver/service.go b/pkg/cri/sbserver/service.go index 96502cb7c..537cb1a8c 100644 --- a/pkg/cri/sbserver/service.go +++ b/pkg/cri/sbserver/service.go @@ -17,6 +17,7 @@ package sbserver import ( + "context" "encoding/json" "fmt" "io" @@ -28,9 +29,11 @@ import ( "github.com/containerd/containerd" "github.com/containerd/containerd/oci" + "github.com/containerd/containerd/pkg/cri/sbserver/podsandbox" "github.com/containerd/containerd/pkg/cri/streaming" "github.com/containerd/containerd/pkg/kmutex" "github.com/containerd/containerd/plugin" + "github.com/containerd/containerd/sandbox" "github.com/containerd/go-cni" "github.com/sirupsen/logrus" "google.golang.org/grpc" @@ -88,6 +91,8 @@ type criService struct { sandboxNameIndex *registrar.Registrar // containerStore stores all resources associated with containers. containerStore *containerstore.Store + // sandboxController controls sandbox lifecycle (and hides implementation details behind). + sandboxController sandbox.Controller // containerNameIndex stores all container names and make sure each // name is unique. containerNameIndex *registrar.Registrar @@ -181,9 +186,17 @@ func NewCRIService(config criconfig.Config, client *containerd.Client) (CRIServi return nil, err } + c.sandboxController = podsandbox.New(config, client, c.sandboxStore, c.os, c, c.baseOCISpecs) + return c, nil } +// StartSandboxExitMonitor is a temporary workaround to call monitor from pause controller. +// TODO: get rid of this. +func (c *criService) StartSandboxExitMonitor(ctx context.Context, id string, pid uint32, exitCh <-chan containerd.ExitStatus) <-chan struct{} { + return c.eventMonitor.startSandboxExitMonitor(ctx, id, pid, exitCh) +} + // Register registers all required services onto a specific grpc server. // This is used by containerd cri plugin. func (c *criService) Register(s *grpc.Server) error {