diff --git a/pkg/opts/container.go b/pkg/opts/container.go new file mode 100644 index 000000000..7a158fd7e --- /dev/null +++ b/pkg/opts/container.go @@ -0,0 +1,95 @@ +/* +Copyright 2017 The Kubernetes Authors. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package opts + +import ( + "context" + "io/ioutil" + "os" + "path/filepath" + + "github.com/containerd/containerd" + "github.com/containerd/containerd/containers" + "github.com/docker/docker/pkg/system" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +// WithVolumes copies ownership of volume in rootfs to its corresponding host path. +// It doesn't update runtime spec. +// The passed in map is a host path to container path map for all volumes. +// TODO(random-liu): Figure out whether we need to copy volume content. +func WithVolumes(volumeMounts map[string]string) containerd.NewContainerOpts { + return func(ctx context.Context, client *containerd.Client, c *containers.Container) error { + if c.Snapshotter == "" { + return errors.Errorf("no snapshotter set for container") + } + if c.SnapshotKey == "" { + return errors.Errorf("rootfs not created for container") + } + snapshotter := client.SnapshotService(c.Snapshotter) + mounts, err := snapshotter.Mounts(ctx, c.SnapshotKey) + if err != nil { + return err + } + root, err := ioutil.TempDir("", "ctd-volume") + if err != nil { + return err + } + defer os.RemoveAll(root) // nolint: errcheck + for _, m := range mounts { + if err := m.Mount(root); err != nil { + return err + } + } + defer unix.Unmount(root, 0) // nolint: errcheck + + for host, volume := range volumeMounts { + if err := copyOwnership(filepath.Join(root, volume), host); err != nil { + return err + } + } + return nil + } +} + +// copyOwnership copies the permissions and uid:gid of the src file +// to the dst file +func copyOwnership(src, dst string) error { + stat, err := system.Stat(src) + if err != nil { + return err + } + + dstStat, err := system.Stat(dst) + if err != nil { + return err + } + + // In some cases, even though UID/GID match and it would effectively be a no-op, + // this can return a permission denied error... for example if this is an NFS + // mount. + // Since it's not really an error that we can't chown to the same UID/GID, don't + // even bother trying in such cases. + if stat.UID() != dstStat.UID() || stat.GID() != dstStat.GID() { + if err := os.Chown(dst, int(stat.UID()), int(stat.GID())); err != nil { + return err + } + } + + if stat.Mode() != dstStat.Mode() { + return os.Chmod(dst, os.FileMode(stat.Mode())) + } + return nil +} diff --git a/pkg/os/os.go b/pkg/os/os.go index 6c562d427..ad5be5c0b 100644 --- a/pkg/os/os.go +++ b/pkg/os/os.go @@ -20,6 +20,7 @@ import ( "io" "io/ioutil" "os" + "path/filepath" "github.com/containerd/fifo" "github.com/docker/docker/pkg/mount" @@ -34,6 +35,7 @@ type OS interface { RemoveAll(path string) error OpenFifo(ctx context.Context, fn string, flag int, perm os.FileMode) (io.ReadWriteCloser, error) Stat(name string) (os.FileInfo, error) + ResolveSymbolicLink(name string) (string, error) CopyFile(src, dest string, perm os.FileMode) error WriteFile(filename string, data []byte, perm os.FileMode) error Mount(source string, target string, fstype string, flags uintptr, data string) error @@ -63,6 +65,18 @@ func (RealOS) Stat(name string) (os.FileInfo, error) { return os.Stat(name) } +// ResolveSymbolicLink will follow any symbolic links +func (RealOS) ResolveSymbolicLink(path string) (string, error) { + info, err := os.Lstat(path) + if err != nil { + return "", err + } + if info.Mode()&os.ModeSymlink != os.ModeSymlink { + return path, nil + } + return filepath.EvalSymlinks(path) +} + // CopyFile copys src file to dest file func (RealOS) CopyFile(src, dest string, perm os.FileMode) error { in, err := os.Open(src) diff --git a/pkg/os/testing/fake_os.go b/pkg/os/testing/fake_os.go index c1d5aa9b9..9ac5029b5 100644 --- a/pkg/os/testing/fake_os.go +++ b/pkg/os/testing/fake_os.go @@ -39,16 +39,17 @@ type CalledDetail struct { // of the real call. type FakeOS struct { sync.Mutex - MkdirAllFn func(string, os.FileMode) error - RemoveAllFn func(string) error - OpenFifoFn func(context.Context, string, int, os.FileMode) (io.ReadWriteCloser, error) - StatFn func(string) (os.FileInfo, error) - CopyFileFn func(string, string, os.FileMode) error - WriteFileFn func(string, []byte, os.FileMode) error - MountFn func(source string, target string, fstype string, flags uintptr, data string) error - UnmountFn func(target string, flags int) error - calls []CalledDetail - errors map[string]error + MkdirAllFn func(string, os.FileMode) error + RemoveAllFn func(string) error + OpenFifoFn func(context.Context, string, int, os.FileMode) (io.ReadWriteCloser, error) + StatFn func(string) (os.FileInfo, error) + ResolveSymbolicLinkFn func(string) (string, error) + CopyFileFn func(string, string, os.FileMode) error + WriteFileFn func(string, []byte, os.FileMode) error + MountFn func(source string, target string, fstype string, flags uintptr, data string) error + UnmountFn func(target string, flags int) error + calls []CalledDetail + errors map[string]error } var _ osInterface.OS = &FakeOS{} @@ -160,6 +161,19 @@ func (f *FakeOS) Stat(name string) (os.FileInfo, error) { return nil, nil } +// ResolveSymbolicLink is a fake call that invokes ResolveSymbolicLinkFn or returns its input +func (f *FakeOS) ResolveSymbolicLink(path string) (string, error) { + f.appendCalls("ResolveSymbolicLink", path) + if err := f.getError("ResolveSymbolicLink"); err != nil { + return "", err + } + + if f.ResolveSymbolicLinkFn != nil { + return f.ResolveSymbolicLinkFn(path) + } + return path, nil +} + // CopyFile is a fake call that invokes CopyFileFn or just return nil. func (f *FakeOS) CopyFile(src, dest string, perm os.FileMode) error { f.appendCalls("CopyFile", src, dest, perm) diff --git a/pkg/server/container_create.go b/pkg/server/container_create.go index 2c4914ebd..427501339 100644 --- a/pkg/server/container_create.go +++ b/pkg/server/container_create.go @@ -19,6 +19,7 @@ package server import ( "fmt" "os" + "path/filepath" "strings" "time" @@ -36,6 +37,7 @@ import ( "golang.org/x/sys/unix" "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime" + customopts "github.com/kubernetes-incubator/cri-containerd/pkg/opts" cio "github.com/kubernetes-incubator/cri-containerd/pkg/server/io" containerstore "github.com/kubernetes-incubator/cri-containerd/pkg/store/container" "github.com/kubernetes-incubator/cri-containerd/pkg/util" @@ -101,26 +103,6 @@ func (c *criContainerdService) CreateContainer(ctx context.Context, r *runtime.C return nil, fmt.Errorf("image %q not found", imageRef) } - // Generate container runtime spec. - mounts := c.generateContainerMounts(getSandboxRootDir(c.rootDir, sandboxID), config) - spec, err := c.generateContainerSpec(id, sandboxPid, config, sandboxConfig, image.Config, mounts) - if err != nil { - return nil, fmt.Errorf("failed to generate container %q spec: %v", id, err) - } - glog.V(4).Infof("Container spec: %+v", spec) - - // Set snapshotter before any other options. - opts := []containerd.NewContainerOpts{ - containerd.WithSnapshotter(c.snapshotter), - } - // Prepare container rootfs. This is always writeable even if - // the container wants a readonly rootfs since we want to give - // the runtime (runc) a chance to modify (e.g. to create mount - // points corresponding to spec.Mounts) before making the - // rootfs readonly (requested by spec.Root.Readonly). - opts = append(opts, containerd.WithNewSnapshot(id, image.Image)) - meta.ImageRef = image.ID - // Create container root directory. containerRootDir := getContainerRootDir(c.rootDir, id) if err = c.os.MkdirAll(containerRootDir, 0755); err != nil { @@ -137,6 +119,39 @@ func (c *criContainerdService) CreateContainer(ctx context.Context, r *runtime.C } }() + // Create container volumes mounts. + // TODO(random-liu): Add cri-containerd integration test for image volume. + volumeMounts := c.generateVolumeMounts(containerRootDir, config.GetMounts(), image.Config) + + // Generate container runtime spec. + mounts := c.generateContainerMounts(getSandboxRootDir(c.rootDir, sandboxID), config) + + spec, err := c.generateContainerSpec(id, sandboxPid, config, sandboxConfig, image.Config, append(mounts, volumeMounts...)) + if err != nil { + return nil, fmt.Errorf("failed to generate container %q spec: %v", id, err) + } + glog.V(4).Infof("Container spec: %+v", spec) + + // Set snapshotter before any other options. + opts := []containerd.NewContainerOpts{ + containerd.WithSnapshotter(c.snapshotter), + // Prepare container rootfs. This is always writeable even if + // the container wants a readonly rootfs since we want to give + // the runtime (runc) a chance to modify (e.g. to create mount + // points corresponding to spec.Mounts) before making the + // rootfs readonly (requested by spec.Root.Readonly). + containerd.WithNewSnapshot(id, image.Image), + } + + if len(volumeMounts) > 0 { + mountMap := make(map[string]string) + for _, v := range volumeMounts { + mountMap[v.HostPath] = v.ContainerPath + } + opts = append(opts, customopts.WithVolumes(mountMap)) + } + meta.ImageRef = image.ID + containerIO, err := cio.NewContainerIO(id, cio.WithStdin(config.GetStdin()), cio.WithTerminal(config.GetTty()), @@ -277,7 +292,7 @@ func (c *criContainerdService) generateContainerSpec(id string, sandboxPid uint3 // Add extra mounts first so that CRI specified mounts can override. mounts := append(extraMounts, config.GetMounts()...) - if err := addOCIBindMounts(&g, mounts, mountLabel); err != nil { + if err := c.addOCIBindMounts(&g, mounts, mountLabel); err != nil { return nil, fmt.Errorf("failed to set OCI bind mounts %+v: %v", mounts, err) } @@ -289,7 +304,7 @@ func (c *criContainerdService) generateContainerSpec(id string, sandboxPid uint3 return nil, err } } else { - if err := addOCIDevices(&g, config.GetDevices()); err != nil { + if err := c.addOCIDevices(&g, config.GetDevices()); err != nil { return nil, fmt.Errorf("failed to set devices mapping %+v: %v", config.GetDevices(), err) } @@ -328,34 +343,69 @@ func (c *criContainerdService) generateContainerSpec(id string, sandboxPid uint3 return g.Spec(), nil } +// generateVolumeMounts sets up image volumes for container. Rely on the removal of container +// root directory to do cleanup. Note that image volume will be skipped, if there is criMounts +// specified with the same destination. +func (c *criContainerdService) generateVolumeMounts(containerRootDir string, criMounts []*runtime.Mount, config *imagespec.ImageConfig) []*runtime.Mount { + if len(config.Volumes) == 0 { + return nil + } + var mounts []*runtime.Mount + for dst := range config.Volumes { + if isInCRIMounts(dst, criMounts) { + // Skip the image volume, if there is CRI defined volume mapping. + // TODO(random-liu): This should be handled by Kubelet in the future. + // Kubelet should decide what to use for image volume, and also de-duplicate + // the image volume and user mounts. + continue + } + volumeID := util.GenerateID() + src := filepath.Join(containerRootDir, "volumes", volumeID) + // addOCIBindMounts will create these volumes. + mounts = append(mounts, &runtime.Mount{ + ContainerPath: dst, + HostPath: src, + // Use default mount propagation. + // TODO(random-liu): What about selinux relabel? + }) + } + return mounts +} + // generateContainerMounts sets up necessary container mounts including /dev/shm, /etc/hosts // and /etc/resolv.conf. func (c *criContainerdService) generateContainerMounts(sandboxRootDir string, config *runtime.ContainerConfig) []*runtime.Mount { var mounts []*runtime.Mount securityContext := config.GetLinux().GetSecurityContext() - mounts = append(mounts, &runtime.Mount{ - ContainerPath: etcHosts, - HostPath: getSandboxHosts(sandboxRootDir), - Readonly: securityContext.GetReadonlyRootfs(), - }) + if !isInCRIMounts(etcHosts, config.GetMounts()) { + mounts = append(mounts, &runtime.Mount{ + ContainerPath: etcHosts, + HostPath: getSandboxHosts(sandboxRootDir), + Readonly: securityContext.GetReadonlyRootfs(), + }) + } // Mount sandbox resolv.config. // TODO: Need to figure out whether we should always mount it as read-only - mounts = append(mounts, &runtime.Mount{ - ContainerPath: resolvConfPath, - HostPath: getResolvPath(sandboxRootDir), - Readonly: securityContext.GetReadonlyRootfs(), - }) - - sandboxDevShm := getSandboxDevShm(sandboxRootDir) - if securityContext.GetNamespaceOptions().GetHostIpc() { - sandboxDevShm = devShm + if !isInCRIMounts(resolvConfPath, config.GetMounts()) { + mounts = append(mounts, &runtime.Mount{ + ContainerPath: resolvConfPath, + HostPath: getResolvPath(sandboxRootDir), + Readonly: securityContext.GetReadonlyRootfs(), + }) + } + + if !isInCRIMounts(devShm, config.GetMounts()) { + sandboxDevShm := getSandboxDevShm(sandboxRootDir) + if securityContext.GetNamespaceOptions().GetHostIpc() { + sandboxDevShm = devShm + } + mounts = append(mounts, &runtime.Mount{ + ContainerPath: devShm, + HostPath: sandboxDevShm, + Readonly: false, + }) } - mounts = append(mounts, &runtime.Mount{ - ContainerPath: devShm, - HostPath: sandboxDevShm, - Readonly: false, - }) return mounts } @@ -414,10 +464,10 @@ func clearReadOnly(m *runtimespec.Mount) { } // addDevices set device mapping without privilege. -func addOCIDevices(g *generate.Generator, devs []*runtime.Device) error { +func (c *criContainerdService) addOCIDevices(g *generate.Generator, devs []*runtime.Device) error { spec := g.Spec() for _, device := range devs { - path, err := resolveSymbolicLink(device.HostPath) + path, err := c.os.ResolveSymbolicLink(device.HostPath) if err != nil { return err } @@ -479,7 +529,7 @@ func setOCIDevicesPrivileged(g *generate.Generator) error { // addOCIBindMounts adds bind mounts. // TODO(random-liu): Figure out whether we need to change all CRI mounts to readonly when // rootfs is readonly. (https://github.com/moby/moby/blob/master/daemon/oci_linux.go) -func addOCIBindMounts(g *generate.Generator, mounts []*runtime.Mount, mountLabel string) error { +func (c *criContainerdService) addOCIBindMounts(g *generate.Generator, mounts []*runtime.Mount, mountLabel string) error { // Mount cgroup into the container as readonly, which inherits docker's behavior. g.AddCgroupsMount("ro") // nolint: errcheck for _, mount := range mounts { @@ -487,17 +537,17 @@ func addOCIBindMounts(g *generate.Generator, mounts []*runtime.Mount, mountLabel src := mount.GetHostPath() // Create the host path if it doesn't exist. // TODO(random-liu): Add CRI validation test for this case. - if _, err := os.Stat(src); err != nil { + if _, err := c.os.Stat(src); err != nil { if !os.IsNotExist(err) { return fmt.Errorf("failed to stat %q: %v", src, err) } - if err := os.MkdirAll(src, 0755); err != nil { + if err := c.os.MkdirAll(src, 0755); err != nil { return fmt.Errorf("failed to mkdir %q: %v", src, err) } } // TODO(random-liu): Add cri-containerd integration test or cri validation test // for this. - src, err := resolveSymbolicLink(src) + src, err := c.os.ResolveSymbolicLink(src) if err != nil { return fmt.Errorf("failed to resolve symlink %q: %v", src, err) } diff --git a/pkg/server/container_create_test.go b/pkg/server/container_create_test.go index 7d0addd30..39b9aade0 100644 --- a/pkg/server/container_create_test.go +++ b/pkg/server/container_create_test.go @@ -17,6 +17,7 @@ limitations under the License. package server import ( + "path/filepath" "testing" imagespec "github.com/opencontainers/image-spec/specs-go/v1" @@ -203,7 +204,7 @@ func TestGeneralContainerSpec(t *testing.T) { config, sandboxConfig, imageConfig, specCheck := getCreateContainerTestData() c := newTestCRIContainerdService() spec, err := c.generateContainerSpec(testID, testPid, config, sandboxConfig, imageConfig, nil) - assert.NoError(t, err) + require.NoError(t, err) specCheck(t, testID, testPid, spec) } @@ -257,7 +258,7 @@ func TestContainerCapabilities(t *testing.T) { t.Logf("TestCase %q", desc) config.Linux.SecurityContext.Capabilities = test.capability spec, err := c.generateContainerSpec(testID, testPid, config, sandboxConfig, imageConfig, nil) - assert.NoError(t, err) + require.NoError(t, err) specCheck(t, testID, testPid, spec) t.Log(spec.Process.Capabilities.Bounding) for _, include := range test.includes { @@ -283,7 +284,7 @@ func TestContainerSpecTty(t *testing.T) { for _, tty := range []bool{true, false} { config.Tty = tty spec, err := c.generateContainerSpec(testID, testPid, config, sandboxConfig, imageConfig, nil) - assert.NoError(t, err) + require.NoError(t, err) specCheck(t, testID, testPid, spec) assert.Equal(t, tty, spec.Process.Terminal) if tty { @@ -302,7 +303,7 @@ func TestContainerSpecReadonlyRootfs(t *testing.T) { for _, readonly := range []bool{true, false} { config.Linux.SecurityContext.ReadonlyRootfs = readonly spec, err := c.generateContainerSpec(testID, testPid, config, sandboxConfig, imageConfig, nil) - assert.NoError(t, err) + require.NoError(t, err) specCheck(t, testID, testPid, spec) assert.Equal(t, readonly, spec.Root.Readonly) } @@ -325,7 +326,7 @@ func TestContainerSpecWithExtraMounts(t *testing.T) { Readonly: true, } spec, err := c.generateContainerSpec(testID, testPid, config, sandboxConfig, imageConfig, []*runtime.Mount{extraMount}) - assert.NoError(t, err) + require.NoError(t, err) specCheck(t, testID, testPid, spec) var mounts []runtimespec.Mount for _, m := range spec.Mounts { @@ -402,9 +403,66 @@ func TestContainerSpecCommand(t *testing.T) { } } +func TestGenerateVolumeMounts(t *testing.T) { + testContainerRootDir := "test-container-root" + for desc, test := range map[string]struct { + criMounts []*runtime.Mount + imageVolumes map[string]struct{} + expectedMountDest []string + }{ + "should setup rw mount for image volumes": { + imageVolumes: map[string]struct{}{ + "/test-volume-1": {}, + "/test-volume-2": {}, + }, + expectedMountDest: []string{ + "/test-volume-1", + "/test-volume-2", + }, + }, + "should skip image volumes if already mounted by CRI": { + criMounts: []*runtime.Mount{ + { + ContainerPath: "/test-volume-1", + HostPath: "/test-hostpath-1", + }, + }, + imageVolumes: map[string]struct{}{ + "/test-volume-1": {}, + "/test-volume-2": {}, + }, + expectedMountDest: []string{ + "/test-volume-2", + }, + }, + } { + t.Logf("TestCase %q", desc) + config := &imagespec.ImageConfig{ + Volumes: test.imageVolumes, + } + c := newTestCRIContainerdService() + got := c.generateVolumeMounts(testContainerRootDir, test.criMounts, config) + assert.Len(t, got, len(test.expectedMountDest)) + for _, dest := range test.expectedMountDest { + found := false + for _, m := range got { + if m.ContainerPath == dest { + found = true + assert.Equal(t, + filepath.Dir(m.HostPath), + filepath.Join(testContainerRootDir, "volumes")) + break + } + } + assert.True(t, found) + } + } +} + func TestGenerateContainerMounts(t *testing.T) { testSandboxRootDir := "test-sandbox-root" for desc, test := range map[string]struct { + criMounts []*runtime.Mount securityContext *runtime.LinuxContainerSecurityContext expectedMounts []*runtime.Mount }{ @@ -472,12 +530,31 @@ func TestGenerateContainerMounts(t *testing.T) { }, }, }, + "should skip contaner mounts if already mounted by CRI": { + criMounts: []*runtime.Mount{ + { + ContainerPath: "/etc/hosts", + HostPath: "/test-etc-host", + }, + { + ContainerPath: resolvConfPath, + HostPath: "test-resolv-conf", + }, + { + ContainerPath: "/dev/shm", + HostPath: "test-dev-shm", + }, + }, + securityContext: &runtime.LinuxContainerSecurityContext{}, + expectedMounts: nil, + }, } { config := &runtime.ContainerConfig{ Metadata: &runtime.ContainerMetadata{ Name: "test-name", Attempt: 1, }, + Mounts: test.criMounts, Linux: &runtime.LinuxContainerConfig{ SecurityContext: test.securityContext, }, @@ -514,7 +591,8 @@ func TestPrivilegedBindMount(t *testing.T) { t.Logf("TestCase %q", desc) g := generate.New() g.SetRootReadonly(test.readonlyRootFS) - addOCIBindMounts(&g, nil, "") + c := newTestCRIContainerdService() + c.addOCIBindMounts(&g, nil, "") if test.privileged { setOCIBindMountsPrivileged(&g) } @@ -540,7 +618,7 @@ func TestPidNamespace(t *testing.T) { t.Logf("should not set pid namespace when host pid is true") config.Linux.SecurityContext.NamespaceOptions = &runtime.NamespaceOption{HostPid: true} spec, err := c.generateContainerSpec(testID, testPid, config, sandboxConfig, imageConfig, nil) - assert.NoError(t, err) + require.NoError(t, err) specCheck(t, testID, testPid, spec) for _, ns := range spec.Linux.Namespaces { assert.NotEqual(t, ns.Type, runtimespec.PIDNamespace) @@ -549,7 +627,7 @@ func TestPidNamespace(t *testing.T) { t.Logf("should set pid namespace when host pid is false") config.Linux.SecurityContext.NamespaceOptions = &runtime.NamespaceOption{HostPid: false} spec, err = c.generateContainerSpec(testID, testPid, config, sandboxConfig, imageConfig, nil) - assert.NoError(t, err) + require.NoError(t, err) specCheck(t, testID, testPid, spec) assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ Type: runtimespec.PIDNamespace, diff --git a/pkg/server/helpers.go b/pkg/server/helpers.go index ebeba75fc..30482d702 100644 --- a/pkg/server/helpers.go +++ b/pkg/server/helpers.go @@ -297,19 +297,6 @@ func (c *criContainerdService) ensureImageExists(ctx context.Context, ref string return &newImage, nil } -// resolveSymbolicLink resolves a possbile symlink path. If the path is a symlink, returns resolved -// path; if not, returns the original path. -func resolveSymbolicLink(path string) (string, error) { - info, err := os.Lstat(path) - if err != nil { - return "", err - } - if info.Mode()&os.ModeSymlink != os.ModeSymlink { - return path, nil - } - return filepath.EvalSymlinks(path) -} - // loadCgroup loads the cgroup associated with path if it exists and moves the current process into the cgroup. If the cgroup // is not created it is created and returned. func loadCgroup(cgroupPath string) (cgroups.Cgroup, error) { @@ -387,3 +374,13 @@ func initSelinuxOpts(selinuxOpt *runtime.SELinuxOption) (string, string, error) selinuxOpt.GetType()) return label.InitLabels(selinux.DupSecOpt(labelOpts)) } + +// isInCRIMounts checks whether a destination is in CRI mount list. +func isInCRIMounts(dst string, mounts []*runtime.Mount) bool { + for _, m := range mounts { + if m.ContainerPath == dst { + return true + } + } + return false +}