Update libcontainer dependency

This commit is contained in:
Buddha Prakash 2016-07-10 20:29:06 -07:00
parent 710374b65f
commit f6186afe99
60 changed files with 2415 additions and 946 deletions

67
Godeps/Godeps.json generated
View File

@ -1,7 +1,7 @@
{
"ImportPath": "k8s.io/kubernetes",
"GoVersion": "go1.6",
"GodepVersion": "v67",
"GodepVersion": "v74",
"Packages": [
"github.com/ugorji/go/codec/codecgen",
"github.com/onsi/ginkgo/ginkgo",
@ -1511,78 +1511,83 @@
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/configs",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/keys",
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/label",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/selinux",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/system",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/user",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/utils",
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
"Comment": "v1.0.0-rc1-57-g4eb8c2f",
"Rev": "4eb8c2fb1dcb10fa3bf9bd7031f3a25a8ce2fef6"
},
{
"ImportPath": "github.com/pborman/uuid",

199
Godeps/LICENSES generated
View File

@ -48436,6 +48436,205 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================================================
================================================================================
= vendor/github.com/opencontainers/runc/libcontainer/keys licensed under: =
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Copyright 2014 Docker, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
= vendor/github.com/opencontainers/runc/LICENSE 435b266b3899aa8a959f17d41c56def8 -
================================================================================
================================================================================
= vendor/github.com/opencontainers/runc/libcontainer/label licensed under: =

View File

@ -76,7 +76,7 @@ config := &configs.Config{
Name: "test-container",
Parent: "system",
Resources: &configs.Resources{
MemorySwappiness: -1,
MemorySwappiness: nil,
AllowAllDevices: false,
AllowedDevices: configs.DefaultAllowedDevices,
},
@ -133,15 +133,15 @@ config := &configs.Config{
UidMappings: []configs.IDMap{
{
ContainerID: 0,
Host: 1000,
size: 65536,
HostID: 1000,
Size: 65536,
},
},
GidMappings: []configs.IDMap{
{
ContainerID: 0,
Host: 1000,
size: 65536,
HostID: 1000,
Size: 65536,
},
},
Networks: []*configs.Network{
@ -186,8 +186,8 @@ process := &libcontainer.Process{
err := container.Start(process)
if err != nil {
logrus.Fatal(err)
container.Destroy()
logrus.Fatal(err)
return
}
@ -216,6 +216,12 @@ container.Pause()
// resume all paused processes.
container.Resume()
// send signal to container's init process.
container.Signal(signal)
// update container resource constraints.
container.Set(config)
```

View File

@ -90,7 +90,7 @@ in tmpfs.
After `/dev/null` has been setup we check for any external links between
the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing
to `/dev/null` outside the container we close and `dup2` the the `/dev/null`
to `/dev/null` outside the container we close and `dup2` the `/dev/null`
that is local to the container's rootfs.
@ -142,6 +142,7 @@ system resources like cpu, memory, and device access.
| perf_event | 1 |
| freezer | 1 |
| hugetlb | 1 |
| pids | 1 |
All cgroup subsystem are joined so that statistics can be collected from
@ -199,7 +200,7 @@ provide a good default for security and flexibility for the applications.
| CAP_SYS_BOOT | 0 |
| CAP_LEASE | 0 |
| CAP_WAKE_ALARM | 0 |
| CAP_BLOCK_SUSPE | 0 |
| CAP_BLOCK_SUSPEND | 0 |
Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)
@ -296,7 +297,7 @@ a container.
| -------------- | ------------------------------------------------------------------ |
| Get processes | Return all the pids for processes running inside a container |
| Get Stats | Return resource statistics for the container as a whole |
| Wait | Wait waits on the container's init process ( pid 1 ) |
| Wait | Waits on the container's init process ( pid 1 ) |
| Wait Process | Wait on any of the container's processes returning the exit status |
| Destroy | Kill the container's init process and remove any filesystem state |
| Signal | Send a signal to the container's init process |

View File

@ -7,6 +7,7 @@ package apparmor
// #include <stdlib.h>
import "C"
import (
"fmt"
"io/ioutil"
"os"
"unsafe"
@ -32,7 +33,7 @@ func ApplyProfile(name string) error {
cName := C.CString(name)
defer C.free(unsafe.Pointer(cName))
if _, err := C.aa_change_onexec(cName); err != nil {
return err
return fmt.Errorf("apparmor failed to apply profile: %s", err)
}
return nil
}

View File

@ -9,7 +9,7 @@ import (
)
type Manager interface {
// Apply cgroup configuration to the process with the specified pid
// Applies cgroup configuration to the process with the specified pid
Apply(pid int) error
// Returns the PIDs inside the cgroup set

View File

@ -14,6 +14,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
)
var (
@ -30,6 +31,7 @@ var (
&NetPrioGroup{},
&PerfEventGroup{},
&FreezerGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
}
CgroupProcesses = "cgroup.procs"
HugePageSizes, _ = cgroups.GetHugePageSize()
@ -94,11 +96,10 @@ func getCgroupRoot() (string, error) {
}
type cgroupData struct {
root string
parent string
name string
config *configs.Cgroup
pid int
root string
innerPath string
config *configs.Cgroup
pid int
}
func (m *Manager) Apply(pid int) (err error) {
@ -129,12 +130,9 @@ func (m *Manager) Apply(pid int) (err error) {
return cgroups.EnterPid(m.Paths, pid)
}
m.mu.Lock()
defer m.mu.Unlock()
paths := make(map[string]string)
defer func() {
if err != nil {
cgroups.RemovePaths(paths)
}
}()
for _, sys := range subsystems {
if err := sys.Apply(d); err != nil {
return err
@ -144,7 +142,9 @@ func (m *Manager) Apply(pid int) (err error) {
// created then join consists of writing the process pids to cgroup.procs
p, err := d.path(sys.Name())
if err != nil {
if cgroups.IsNotFound(err) {
// The non-presence of the devices subsystem is
// considered fatal for security reasons.
if cgroups.IsNotFound(err) && sys.Name() != "devices" {
continue
}
return err
@ -267,45 +267,31 @@ func getCgroupPath(c *configs.Cgroup) (string, error) {
return d.path("devices")
}
// pathClean makes a path safe for use with filepath.Join. This is done by not
// only cleaning the path, but also (if the path is relative) adding a leading
// '/' and cleaning it (then removing the leading '/'). This ensures that a
// path resulting from prepending another path will always resolve to lexically
// be a subdirectory of the prefixed path. This is all done lexically, so paths
// that include symlinks won't be safe as a result of using pathClean.
func pathClean(path string) string {
// Ensure that all paths are cleaned (especially problematic ones like
// "/../../../../../" which can cause lots of issues).
path = filepath.Clean(path)
// If the path isn't absolute, we need to do more processing to fix paths
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
// paths to relative ones.
if !filepath.IsAbs(path) {
path = filepath.Clean(string(os.PathSeparator) + path)
// This can't fail, as (by definition) all paths are relative to root.
path, _ = filepath.Rel(string(os.PathSeparator), path)
}
// Clean the path again for good measure.
return filepath.Clean(path)
}
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
root, err := getCgroupRoot()
if err != nil {
return nil, err
}
// Clean the parent slice path.
c.Parent = pathClean(c.Parent)
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove this code. Path safety is important! -- cyphar
cgPath := libcontainerUtils.CleanPath(c.Path)
cgParent := libcontainerUtils.CleanPath(c.Parent)
cgName := libcontainerUtils.CleanPath(c.Name)
innerPath := cgPath
if innerPath == "" {
innerPath = filepath.Join(cgParent, cgName)
}
return &cgroupData{
root: root,
parent: c.Parent,
name: c.Name,
config: c,
pid: pid,
root: root,
innerPath: innerPath,
config: c,
pid: pid,
}, nil
}
@ -333,11 +319,10 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
return "", err
}
cgPath := filepath.Join(raw.parent, raw.name)
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(cgPath) {
if filepath.IsAbs(raw.innerPath) {
// Sometimes subsystems can be mounted togethger as 'cpu,cpuacct'.
return filepath.Join(raw.root, filepath.Base(mnt), cgPath), nil
return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
}
parentPath, err := raw.parentPath(subsystem, mnt, root)
@ -345,7 +330,7 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
return "", err
}
return filepath.Join(parentPath, cgPath), nil
return filepath.Join(parentPath, raw.innerPath), nil
}
func (raw *cgroupData) join(subsystem string) (string, error) {
@ -366,9 +351,12 @@ func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s.", file)
return fmt.Errorf("no such directory for %s", file)
}
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", data, file, err)
}
return nil
}
func readFile(dir, file string) (string, error) {

View File

@ -12,6 +12,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
)
type CpusetGroup struct {
@ -88,7 +89,7 @@ func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []b
// it's parent.
func (s *CpusetGroup) ensureParent(current, root string) error {
parent := filepath.Dir(current)
if filepath.Clean(parent) == root {
if libcontainerUtils.CleanPath(parent) == root {
return nil
}
// Avoid infinite recursion.

View File

@ -5,6 +5,7 @@ package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
)
type DevicesGroup struct {
@ -25,6 +26,23 @@ func (s *DevicesGroup) Apply(d *cgroupData) error {
}
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if system.RunningInUserNS() {
return nil
}
devices := cgroup.Resources.Devices
if len(devices) > 0 {
for _, dev := range devices {
file := "devices.deny"
if dev.Allow {
file = "devices.allow"
}
if err := writeFile(path, file, dev.CgroupString()); err != nil {
return err
}
}
return nil
}
if !cgroup.Resources.AllowAllDevices {
if err := writeFile(path, "devices.deny", "a"); err != nil {
return err

View File

@ -9,6 +9,7 @@ import (
"path/filepath"
"strconv"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
@ -26,38 +27,75 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if memoryAssigned(d.config) {
if path != "" {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
}
// We have to set kernel memory here, as we can't change it once
// processes have been attached.
if err := s.SetKernelMemory(path, d.config); err != nil {
return err
}
// reset error.
err = nil
if path == "" {
// Invalid input.
return fmt.Errorf("invalid path for memory cgroups: %+v", d)
}
defer func() {
if err != nil {
os.RemoveAll(path)
}
}()
if !cgroups.PathExists(path) {
if err = os.MkdirAll(path, 0755); err != nil {
return err
}
}
if memoryAssigned(d.config) {
// We have to set kernel memory here, as we can't change it once
// processes have been attached to the cgroup.
if err = s.SetKernelMemory(path, d.config); err != nil {
return err
}
}
// We need to join memory cgroup after set memory limits, because
// kmem.limit_in_bytes can only be set when the cgroup is empty.
_, err = d.join("memory")
if err != nil && !cgroups.IsNotFound(err) {
if _, jerr := d.join("memory"); jerr != nil && !cgroups.IsNotFound(jerr) {
err = jerr
return err
}
return nil
}
func getModifyTime(path string) (time.Time, error) {
stat, err := os.Stat(path)
if err != nil {
return time.Time{}, fmt.Errorf("failed to get memory cgroups creation time: %v", err)
}
return stat.ModTime(), nil
}
func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
// This has to be done separately because it has special constraints (it
// can't be done after there are processes attached to the cgroup).
if cgroup.Resources.KernelMemory > 0 {
// This has to be done separately because it has special
// constraints (it can only be initialized before setting up a
// hierarchy or adding a task to the cgroups. However, if
// sucessfully initialized, it can be updated anytime afterwards)
if cgroup.Resources.KernelMemory != 0 {
// Is kmem.limit_in_bytes already set?
// memory.kmem.max_usage_in_bytes is a read-only file. Use it to get cgroups creation time.
kmemCreationTime, err := getModifyTime(filepath.Join(path, "memory.kmem.max_usage_in_bytes"))
if err != nil {
return err
}
kmemLimitsUpdateTime, err := getModifyTime(filepath.Join(path, "memory.kmem.limit_in_bytes"))
if err != nil {
return err
}
// kmem.limit_in_bytes has already been set if its update time is after that of creation time.
// We use `!=` op instead of `>` because updates are losing precision compared to creation.
kmemInitialized := !kmemLimitsUpdateTime.Equal(kmemCreationTime)
if !kmemInitialized {
// If there's already tasks in the cgroup, we can't change the limit either
tasks, err := getCgroupParamString(path, "tasks")
if err != nil {
return err
}
if tasks != "" {
return fmt.Errorf("cannot set kmem.limit_in_bytes after task have joined this cgroup")
}
}
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
return err
}
@ -65,19 +103,65 @@ func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error
return nil
}
func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.Memory != 0 {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
// When memory and swap memory are both set, we need to handle the cases
// for updating container.
if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap > 0 {
memoryUsage, err := getMemoryData(path, "")
if err != nil {
return err
}
// When update memory limit, we should adapt the write sequence
// for memory and swap memory, so it won't fail because the new
// value and the old value don't fit kernel's validation.
if memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
} else {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
} else {
if cgroup.Resources.Memory != 0 {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
}
if cgroup.Resources.MemorySwap > 0 {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
}
return nil
}
func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
if err := setMemoryAndSwap(path, cgroup); err != nil {
return err
}
if err := s.SetKernelMemory(path, cgroup); err != nil {
return err
}
if cgroup.Resources.MemoryReservation != 0 {
if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
return err
}
}
if cgroup.Resources.MemorySwap > 0 {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
if cgroup.Resources.KernelMemoryTCP != 0 {
if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
return err
}
}
@ -86,14 +170,14 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
return err
}
}
if cgroup.Resources.MemorySwappiness >= 0 && cgroup.Resources.MemorySwappiness <= 100 {
if err := writeFile(path, "memory.swappiness", strconv.FormatInt(cgroup.Resources.MemorySwappiness, 10)); err != nil {
if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
return nil
} else if int64(*cgroup.Resources.MemorySwappiness) >= 0 && int64(*cgroup.Resources.MemorySwappiness) <= 100 {
if err := writeFile(path, "memory.swappiness", strconv.FormatInt(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
return err
}
} else if cgroup.Resources.MemorySwappiness == -1 {
return nil
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", cgroup.Resources.MemorySwappiness)
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", int64(*cgroup.Resources.MemorySwappiness))
}
return nil
@ -139,6 +223,11 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
return err
}
stats.MemoryStats.KernelUsage = kernelUsage
kernelTCPUsage, err := getMemoryData(path, "kmem.tcp")
if err != nil {
return err
}
stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
return nil
}
@ -148,8 +237,9 @@ func memoryAssigned(cgroup *configs.Cgroup) bool {
cgroup.Resources.MemoryReservation != 0 ||
cgroup.Resources.MemorySwap > 0 ||
cgroup.Resources.KernelMemory > 0 ||
cgroup.Resources.KernelMemoryTCP > 0 ||
cgroup.Resources.OomKillDisable ||
cgroup.Resources.MemorySwappiness != -1
(cgroup.Resources.MemorySwappiness != nil && *cgroup.Resources.MemorySwappiness != -1)
}
func getMemoryData(path, name string) (cgroups.MemoryData, error) {
@ -162,6 +252,7 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")
value, err := getCgroupParamUint(path, usage)
if err != nil {
@ -187,6 +278,14 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
}
memoryData.Failcnt = value
value, err = getCgroupParamUint(path, limit)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
}
memoryData.Limit = value
return memoryData, nil
}

View File

@ -9,6 +9,7 @@ import (
type NameGroup struct {
GroupName string
Join bool
}
func (s *NameGroup) Name() string {
@ -16,6 +17,10 @@ func (s *NameGroup) Name() string {
}
func (s *NameGroup) Apply(d *cgroupData) error {
if s.Join {
// ignore errors if the named cgroup does not exist
d.join(s.GroupName)
}
return nil
}
@ -24,6 +29,9 @@ func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error {
}
func (s *NameGroup) Remove(d *cgroupData) error {
if s.Join {
removePath(d.path(s.GroupName))
}
return nil
}

View File

@ -4,6 +4,7 @@ package fs
import (
"fmt"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@ -47,11 +48,26 @@ func (s *PidsGroup) Remove(d *cgroupData) error {
}
func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
value, err := getCgroupParamUint(path, "pids.current")
current, err := getCgroupParamUint(path, "pids.current")
if err != nil {
return fmt.Errorf("failed to parse pids.current - %s", err)
}
stats.PidsStats.Current = value
maxString, err := getCgroupParamString(path, "pids.max")
if err != nil {
return fmt.Errorf("failed to parse pids.max - %s", err)
}
// Default if pids.max == "max" is 0 -- which represents "no limit".
var max uint64
if maxString != "max" {
max, err = parseUint(maxString, 10, 64)
if err != nil {
return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max"))
}
}
stats.PidsStats.Current = current
stats.PidsStats.Limit = max
return nil
}

View File

@ -12,7 +12,6 @@ import (
)
var (
ErrNotSupportStat = errors.New("stats are not supported for subsystem")
ErrNotValidFormat = errors.New("line is not a valid key value format")
)

View File

@ -11,6 +11,7 @@ type ThrottlingData struct {
ThrottledTime uint64 `json:"throttled_time,omitempty"`
}
// CpuUsage denotes the usage of a CPU.
// All CPU stats are aggregate since container inception.
type CpuUsage struct {
// Total CPU time consumed.
@ -36,7 +37,9 @@ type MemoryData struct {
Usage uint64 `json:"usage,omitempty"`
MaxUsage uint64 `json:"max_usage,omitempty"`
Failcnt uint64 `json:"failcnt"`
Limit uint64 `json:"limit"`
}
type MemoryStats struct {
// memory used for cache
Cache uint64 `json:"cache,omitempty"`
@ -44,14 +47,19 @@ type MemoryStats struct {
Usage MemoryData `json:"usage,omitempty"`
// usage of memory + swap
SwapUsage MemoryData `json:"swap_usage,omitempty"`
// usafe of kernel memory
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
Stats map[string]uint64 `json:"stats,omitempty"`
// usage of kernel memory
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
// usage of kernel TCP memory
KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
Stats map[string]uint64 `json:"stats,omitempty"`
}
type PidsStats struct {
// number of pids in the cgroup
Current uint64 `json:"current,omitempty"`
// active pids hard limit
Limit uint64 `json:"limit,omitempty"`
}
type BlkioStatEntry struct {
@ -78,7 +86,7 @@ type HugetlbStats struct {
Usage uint64 `json:"usage,omitempty"`
// maximum usage ever recorded.
MaxUsage uint64 `json:"max_usage,omitempty"`
// number of times htgetlb usage allocation failure.
// number of times hugetlb usage allocation failure.
Failcnt uint64 `json:"failcnt"`
}

View File

@ -74,6 +74,7 @@ var (
theConn *systemdDbus.Conn
hasStartTransientUnit bool
hasTransientDefaultDependencies bool
hasDelegate bool
)
func newProp(name string, units interface{}) systemdDbus.Property {
@ -146,20 +147,24 @@ func UseSystemd() bool {
// Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil)
// Assume StartTransientUnit on a scope allows Delegate
hasDelegate = true
dl := newProp("Delegate", true)
if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
hasDelegate = false
}
}
}
// Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil)
}
return hasStartTransientUnit
}
func getIfaceForUnit(unitName string) string {
if strings.HasSuffix(unitName, ".scope") {
return "Scope"
}
if strings.HasSuffix(unitName, ".service") {
return "Service"
}
return "Unit"
}
func (m *Manager) Apply(pid int) error {
var (
c = m.Cgroups
@ -195,6 +200,11 @@ func (m *Manager) Apply(pid int) error {
newProp("PIDs", []uint32{uint32(pid)}),
)
if hasDelegate {
// This is only supported on systemd versions 218 and above.
properties = append(properties, newProp("Delegate", true))
}
// Always enable accounting, this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time.
properties = append(properties,
@ -222,11 +232,9 @@ func (m *Manager) Apply(pid int) error {
newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
}
// We need to set kernel memory before processes join cgroup because
// kmem.limit_in_bytes can only be set when the cgroup is empty.
// And swap memory limit needs to be set after memory limit, only
// memory limit is handled by systemd, so it's kind of ugly here.
if c.Resources.KernelMemory > 0 {
// We have to set kernel memory here, as we can't change it once
// processes have been attached to the cgroup.
if c.Resources.KernelMemory != 0 {
if err := setKernelMemory(c); err != nil {
return err
}
@ -236,53 +244,7 @@ func (m *Manager) Apply(pid int) error {
return err
}
if err := joinDevices(c, pid); err != nil {
return err
}
// TODO: CpuQuota and CpuPeriod not available in systemd
// we need to manually join the cpu.cfs_quota_us and cpu.cfs_period_us
if err := joinCpu(c, pid); err != nil {
return err
}
// TODO: MemoryReservation and MemorySwap not available in systemd
if err := joinMemory(c, pid); err != nil {
return err
}
// we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd
// because it does not currently support it via the dbus api.
if err := joinFreezer(c, pid); err != nil {
return err
}
if err := joinNetPrio(c, pid); err != nil {
return err
}
if err := joinNetCls(c, pid); err != nil {
return err
}
if err := joinPids(c, pid); err != nil {
return err
}
if err := joinCpuset(c, pid); err != nil {
return err
}
if err := joinHugetlb(c, pid); err != nil {
return err
}
if err := joinPerfEvent(c, pid); err != nil {
return err
}
// FIXME: Systemd does have `BlockIODeviceWeight` property, but we got problem
// using that (at least on systemd 208, see https://github.com/opencontainers/runc/libcontainer/pull/354),
// so use fs work around for now.
if err := joinBlkio(c, pid); err != nil {
if err := joinCgroups(c, pid); err != nil {
return err
}
@ -327,7 +289,7 @@ func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s.", file)
return fmt.Errorf("no such directory for %s", file)
}
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
}
@ -347,43 +309,41 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
return path, nil
}
func joinCpu(c *configs.Cgroup, pid int) error {
_, err := join(c, "cpu", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
func joinCgroups(c *configs.Cgroup, pid int) error {
for _, sys := range subsystems {
name := sys.Name()
switch name {
case "name=systemd":
// let systemd handle this
break
case "cpuset":
path, err := getSubsystemPath(c, name)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
s := &fs.CpusetGroup{}
if err := s.ApplyDir(path, c, pid); err != nil {
return err
}
break
default:
_, err := join(c, name, pid)
if err != nil {
// Even if it's `not found` error, we'll return err
// because devices cgroup is hard requirement for
// container security.
if name == "devices" {
return err
}
// For other subsystems, omit the `not found` error
// because they are optional.
if !cgroups.IsNotFound(err) {
return err
}
}
}
}
return nil
}
func joinFreezer(c *configs.Cgroup, pid int) error {
_, err := join(c, "freezer", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinNetPrio(c *configs.Cgroup, pid int) error {
_, err := join(c, "net_prio", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinNetCls(c *configs.Cgroup, pid int) error {
_, err := join(c, "net_cls", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinPids(c *configs.Cgroup, pid int) error {
_, err := join(c, "pids", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
@ -392,9 +352,18 @@ func joinPids(c *configs.Cgroup, pid int) error {
// test.slice/test-a.slice/test-a-b.slice.
func expandSlice(slice string) (string, error) {
suffix := ".slice"
sliceName := strings.TrimSuffix(slice, suffix)
// Name has to end with ".slice", but can't be just ".slice".
if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
// Path-separators are not allowed.
if strings.Contains(slice, "/") {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
var path, prefix string
sliceName := strings.TrimSuffix(slice, suffix)
for _, component := range strings.Split(sliceName, "-") {
// test--a.slice isn't permitted, nor is -test.slice.
if component == "" {
@ -510,87 +479,11 @@ func getUnitName(c *configs.Cgroup) string {
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
}
// Atm we can't use the systemd device support because of two missing things:
// * Support for wildcards to allow mknod on any device
// * Support for wildcards to allow /dev/pts support
//
// The second is available in more recent systemd as "char-pts", but not in e.g. v208 which is
// in wide use. When both these are available we will be able to switch, but need to keep the old
// implementation for backwards compat.
//
// Note: we can't use systemd to set up the initial limits, and then change the cgroup
// because systemd will re-write the device settings if it needs to re-apply the cgroup context.
// This happens at least for v208 when any sibling unit is started.
func joinDevices(c *configs.Cgroup, pid int) error {
_, err := join(c, "devices", pid)
// Even if it's `not found` error, we'll return err because devices cgroup
// is hard requirement for container security.
if err != nil {
return err
}
return nil
}
func setKernelMemory(c *configs.Cgroup) error {
path, err := getSubsystemPath(c, "memory")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
// This doesn't get called by manager.Set, so we need to do it here.
s := &fs.MemoryGroup{}
return s.SetKernelMemory(path, c)
}
func joinMemory(c *configs.Cgroup, pid int) error {
_, err := join(c, "memory", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
// systemd does not atm set up the cpuset controller, so we must manually
// join it. Additionally that is a very finicky controller where each
// level must have a full setup as the default for a new directory is "no cpus"
func joinCpuset(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "cpuset")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
s := &fs.CpusetGroup{}
return s.ApplyDir(path, c, pid)
}
// `BlockIODeviceWeight` property of systemd does not work properly, and systemd
// expects device path instead of major minor numbers, which is also confusing
// for users. So we use fs work around for now.
func joinBlkio(c *configs.Cgroup, pid int) error {
_, err := join(c, "blkio", pid)
if err != nil {
return err
}
return nil
}
func joinHugetlb(c *configs.Cgroup, pid int) error {
_, err := join(c, "hugetlb", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinPerfEvent(c *configs.Cgroup, pid int) error {
_, err := join(c, "perf_event", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
return os.MkdirAll(path, 0755)
}

View File

@ -5,6 +5,7 @@ package cgroups
import (
"bufio"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
@ -12,17 +13,19 @@ import (
"strings"
"time"
"github.com/docker/docker/pkg/mount"
"github.com/docker/go-units"
)
const cgroupNamePrefix = "name="
// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(subsystem string) (string, error) {
// We are not using mount.GetMounts() because it's super-inefficient,
// parsing it directly sped up x10 times because of not using Sscanf.
// It was one of two major performance drawbacks in container start.
if !isSubsystemAvailable(subsystem) {
return "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", err
@ -47,6 +50,9 @@ func FindCgroupMountpoint(subsystem string) (string, error) {
}
func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
if !isSubsystemAvailable(subsystem) {
return "", "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", "", err
@ -70,6 +76,15 @@ func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
return "", "", NewNotFoundError(subsystem)
}
func isSubsystemAvailable(subsystem string) bool {
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return false
}
_, avail := cgroups[subsystem]
return avail
}
func FindCgroupMountpointDir() (string, error) {
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
@ -121,42 +136,63 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
return getControllerPath(m.Subsystems[0], cgroups)
}
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
res := make([]Mount, 0, len(ss))
scanner := bufio.NewScanner(mi)
numFound := 0
for scanner.Scan() && numFound < len(ss) {
txt := scanner.Text()
sepIdx := strings.Index(txt, " - ")
if sepIdx == -1 {
return nil, fmt.Errorf("invalid mountinfo format")
}
if txt[sepIdx+3:sepIdx+9] != "cgroup" {
continue
}
fields := strings.Split(txt, " ")
m := Mount{
Mountpoint: fields[4],
Root: fields[3],
}
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if !ss[opt] {
continue
}
if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
} else {
m.Subsystems = append(m.Subsystems, opt)
}
numFound++
}
res = append(res, m)
}
if err := scanner.Err(); err != nil {
return nil, err
}
return res, nil
}
func GetCgroupMounts() ([]Mount, error) {
mounts, err := mount.GetMounts()
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return nil, err
}
defer f.Close()
all, err := GetAllSubsystems()
all, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return nil, err
}
allMap := make(map[string]bool)
for _, s := range all {
for s := range all {
allMap[s] = true
}
res := []Mount{}
for _, mount := range mounts {
if mount.Fstype == "cgroup" {
m := Mount{Mountpoint: mount.Mountpoint, Root: mount.Root}
for _, opt := range strings.Split(mount.VfsOpts, ",") {
if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
}
if allMap[opt] {
m.Subsystems = append(m.Subsystems, opt)
}
}
res = append(res, m)
}
}
return res, nil
return getCgroupMountsHelper(allMap, f)
}
// Returns all the cgroup subsystems supported by the kernel
// GetAllSubsystems returns all the cgroup subsystems supported by the kernel
func GetAllSubsystems() ([]string, error) {
f, err := os.Open("/proc/cgroups")
if err != nil {
@ -182,7 +218,7 @@ func GetAllSubsystems() ([]string, error) {
return subsystems, nil
}
// Returns the relative path to the cgroup docker is running in.
// GetThisCgroupDir returns the relative path to the cgroup docker is running in.
func GetThisCgroupDir(subsystem string) (string, error) {
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
@ -226,6 +262,8 @@ func readProcsFile(dir string) ([]int, error) {
return out, nil
}
// ParseCgroupFile parses the given cgroup file, typically from
// /proc/<pid>/cgroup, into a map of subgroups to cgroup names.
func ParseCgroupFile(path string) (map[string]string, error) {
f, err := os.Open(path)
if err != nil {
@ -233,7 +271,12 @@ func ParseCgroupFile(path string) (map[string]string, error) {
}
defer f.Close()
s := bufio.NewScanner(f)
return parseCgroupFromReader(f)
}
// helper function for ParseCgroupFile to make testing easier
func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
s := bufio.NewScanner(r)
cgroups := make(map[string]string)
for s.Scan() {
@ -242,7 +285,16 @@ func ParseCgroupFile(path string) (map[string]string, error) {
}
text := s.Text()
parts := strings.Split(text, ":")
// from cgroups(7):
// /proc/[pid]/cgroup
// ...
// For each cgroup hierarchy ... there is one entry
// containing three colon-separated fields of the form:
// hierarchy-ID:subsystem-list:cgroup-path
parts := strings.SplitN(text, ":", 3)
if len(parts) < 3 {
return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
}
for _, subs := range strings.Split(parts[1], ",") {
cgroups[subs] = parts[2]
@ -309,7 +361,7 @@ func RemovePaths(paths map[string]string) (err error) {
return nil
}
}
return fmt.Errorf("Failed to remove paths: %s", paths)
return fmt.Errorf("Failed to remove paths: %v", paths)
}
func GetHugePageSize() ([]string, error) {

View File

@ -11,15 +11,22 @@ const (
)
type Cgroup struct {
Name string `json:"name"`
// Deprecated, use Path instead
Name string `json:"name,omitempty"`
// name of parent cgroup or slice
Parent string `json:"parent"`
// name of parent of cgroup or slice
// Deprecated, use Path instead
Parent string `json:"parent,omitempty"`
// Path specifies the path to cgroups that are created and/or joined by the container.
// The path is assumed to be relative to the host system cgroup mountpoint.
Path string `json:"path"`
// ScopePrefix decribes prefix for the scope name
ScopePrefix string `json:"scope_prefix"`
// Paths represent the cgroups paths to join
// Paths represent the absolute cgroups paths to join.
// This takes precedence over Path.
Paths map[string]string
// Resources contains various cgroups settings to apply
@ -28,11 +35,14 @@ type Cgroup struct {
type Resources struct {
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
AllowAllDevices bool `json:"allow_all_devices"`
// Deprecated
AllowAllDevices bool `json:"allow_all_devices,omitempty"`
// Deprecated
AllowedDevices []*Device `json:"allowed_devices,omitempty"`
// Deprecated
DeniedDevices []*Device `json:"denied_devices,omitempty"`
AllowedDevices []*Device `json:"allowed_devices"`
DeniedDevices []*Device `json:"denied_devices"`
Devices []*Device `json:"devices"`
// Memory limit (in bytes)
Memory int64 `json:"memory"`
@ -46,6 +56,9 @@ type Resources struct {
// Kernel memory limit (in bytes)
KernelMemory int64 `json:"kernel_memory"`
// Kernel memory limit for TCP use (in bytes)
KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
// CPU shares (relative weight vs. other containers)
CpuShares int64 `json:"cpu_shares"`
@ -101,7 +114,7 @@ type Resources struct {
OomKillDisable bool `json:"oom_kill_disable"`
// Tuning swappiness behaviour per cgroup
MemorySwappiness int64 `json:"memory_swappiness"`
MemorySwappiness *int64 `json:"memory_swappiness"`
// Set priority of network traffic for container
NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`

View File

@ -3,7 +3,11 @@ package configs
import (
"bytes"
"encoding/json"
"fmt"
"os/exec"
"time"
"github.com/Sirupsen/logrus"
)
type Rlimit struct {
@ -29,7 +33,7 @@ type Seccomp struct {
Syscalls []*Syscall `json:"syscalls"`
}
// An action to be taken upon rule match in Seccomp
// Action is taken upon rule match in Seccomp
type Action int
const (
@ -40,7 +44,7 @@ const (
Trace
)
// A comparison operator to be used when matching syscall arguments in Seccomp
// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
type Operator int
const (
@ -53,7 +57,7 @@ const (
MaskEqualTo
)
// A rule to match a specific syscall argument in Seccomp
// Arg is a rule to match a specific syscall argument in Seccomp
type Arg struct {
Index uint `json:"index"`
Value uint64 `json:"value"`
@ -61,7 +65,7 @@ type Arg struct {
Op Operator `json:"op"`
}
// An rule to match a syscall in Seccomp
// Syscall is a rule to match a syscall in Seccomp
type Syscall struct {
Name string `json:"name"`
Action Action `json:"action"`
@ -128,15 +132,15 @@ type Config struct {
// AppArmorProfile specifies the profile to apply to the process running in the container and is
// change at the time the process is execed
AppArmorProfile string `json:"apparmor_profile"`
AppArmorProfile string `json:"apparmor_profile,omitempty"`
// ProcessLabel specifies the label to apply to the process running in the container. It is
// commonly used by selinux
ProcessLabel string `json:"process_label"`
ProcessLabel string `json:"process_label,omitempty"`
// Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []Rlimit `json:"rlimits"`
Rlimits []Rlimit `json:"rlimits,omitempty"`
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
// for a process. Valid values are between the range [-1000, '1000'], where processes with
@ -171,12 +175,22 @@ type Config struct {
// A default action to be taken if no rules match is also given.
Seccomp *Seccomp `json:"seccomp"`
// NoNewPrivileges controls whether processes in the container can gain additional privileges.
NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
// Hooks are a collection of actions to perform at various container lifecycle events.
// Hooks are not able to be marshaled to json but they are also not needed to.
Hooks *Hooks `json:"-"`
// CommandHooks are serialized to JSON, but other hooks are not.
Hooks *Hooks
// Version is the version of opencontainer specification that is supported.
Version string `json:"version"`
// Labels are user defined metadata that is stored in the config and populated on the state
Labels []string `json:"labels"`
// NoNewKeyring will not allocated a new session keyring for the container. It will use the
// callers keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"`
}
type Hooks struct {
@ -191,12 +205,59 @@ type Hooks struct {
Poststop []Hook
}
func (hooks *Hooks) UnmarshalJSON(b []byte) error {
var state struct {
Prestart []CommandHook
Poststart []CommandHook
Poststop []CommandHook
}
if err := json.Unmarshal(b, &state); err != nil {
return err
}
deserialize := func(shooks []CommandHook) (hooks []Hook) {
for _, shook := range shooks {
hooks = append(hooks, shook)
}
return hooks
}
hooks.Prestart = deserialize(state.Prestart)
hooks.Poststart = deserialize(state.Poststart)
hooks.Poststop = deserialize(state.Poststop)
return nil
}
func (hooks Hooks) MarshalJSON() ([]byte, error) {
serialize := func(hooks []Hook) (serializableHooks []CommandHook) {
for _, hook := range hooks {
switch chook := hook.(type) {
case CommandHook:
serializableHooks = append(serializableHooks, chook)
default:
logrus.Warnf("cannot serialize hook of type %T, skipping", hook)
}
}
return serializableHooks
}
return json.Marshal(map[string]interface{}{
"prestart": serialize(hooks.Prestart),
"poststart": serialize(hooks.Poststart),
"poststop": serialize(hooks.Poststop),
})
}
// HookState is the payload provided to a hook on execution.
type HookState struct {
Version string `json:"version"`
ID string `json:"id"`
Pid int `json:"pid"`
Root string `json:"root"`
Version string `json:"ociVersion"`
ID string `json:"id"`
Pid int `json:"pid"`
Root string `json:"root"`
BundlePath string `json:"bundlePath"`
}
type Hook interface {
@ -204,7 +265,7 @@ type Hook interface {
Run(HookState) error
}
// NewFunctionHooks will call the provided function when the hook is run.
// NewFunctionHook will call the provided function when the hook is run.
func NewFunctionHook(f func(HookState) error) FuncHook {
return FuncHook{
run: f,
@ -220,13 +281,14 @@ func (f FuncHook) Run(s HookState) error {
}
type Command struct {
Path string `json:"path"`
Args []string `json:"args"`
Env []string `json:"env"`
Dir string `json:"dir"`
Path string `json:"path"`
Args []string `json:"args"`
Env []string `json:"env"`
Dir string `json:"dir"`
Timeout *time.Duration `json:"timeout"`
}
// NewCommandHooks will execute the provided command when the hook is run.
// NewCommandHook will execute the provided command when the hook is run.
func NewCommandHook(cmd Command) CommandHook {
return CommandHook{
Command: cmd,
@ -248,5 +310,23 @@ func (c Command) Run(s HookState) error {
Env: c.Env,
Stdin: bytes.NewReader(b),
}
return cmd.Run()
errC := make(chan error, 1)
go func() {
out, err := cmd.CombinedOutput()
if err != nil {
err = fmt.Errorf("%s: %s", err, out)
}
errC <- err
}()
if c.Timeout != nil {
select {
case err := <-errC:
return err
case <-time.After(*c.Timeout):
cmd.Process.Kill()
cmd.Wait()
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
}
}
return <-errC
}

View File

@ -4,7 +4,7 @@ package configs
import "fmt"
// Gets the root uid for the process on host which could be non-zero
// HostUID gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostUID() (int, error) {
if c.Namespaces.Contains(NEWUSER) {
@ -21,7 +21,7 @@ func (c Config) HostUID() (int, error) {
return 0, nil
}
// Gets the root gid for the process on host which could be non-zero
// HostGID gets the root gid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostGID() (int, error) {
if c.Namespaces.Contains(NEWUSER) {

View File

@ -35,6 +35,9 @@ type Device struct {
// Gid of the device.
Gid uint32 `json:"gid"`
// Write the file to the allowed list
Allow bool `json:"allow"`
}
func (d *Device) CgroupString() string {

View File

@ -3,7 +3,7 @@
package configs
var (
// These are devices that are to be both allowed and created.
// DefaultSimpleDevices are devices that are to be both allowed and created.
DefaultSimpleDevices = []*Device{
// /dev/null and zero
{

View File

@ -18,7 +18,7 @@ var namespaceInfo = map[NamespaceType]int{
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This functions returns flags only for new namespaces.
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
var flag int
for _, v := range *n {

View File

@ -8,7 +8,7 @@ func (n *Namespace) Syscall() int {
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This functions returns flags only for new namespaces.
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
panic("No namespace syscall support")
return uintptr(0)

View File

@ -2,7 +2,11 @@
package configs
import "fmt"
import (
"fmt"
"os"
"sync"
)
const (
NEWNET NamespaceType = "NEWNET"
@ -13,6 +17,51 @@ const (
NEWUSER NamespaceType = "NEWUSER"
)
var (
nsLock sync.Mutex
supportedNamespaces = make(map[NamespaceType]bool)
)
// nsToFile converts the namespace type to its filename
func nsToFile(ns NamespaceType) string {
switch ns {
case NEWNET:
return "net"
case NEWNS:
return "mnt"
case NEWPID:
return "pid"
case NEWIPC:
return "ipc"
case NEWUSER:
return "user"
case NEWUTS:
return "uts"
}
return ""
}
// IsNamespaceSupported returns whether a namespace is available or
// not
func IsNamespaceSupported(ns NamespaceType) bool {
nsLock.Lock()
defer nsLock.Unlock()
supported, ok := supportedNamespaces[ns]
if ok {
return supported
}
nsFile := nsToFile(ns)
// if the namespace type is unknown, just return false
if nsFile == "" {
return false
}
_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
// a namespace is supported if it exists and we have permissions to read it
supported = err == nil
supportedNamespaces[ns] = supported
return supported
}
func NamespaceTypes() []NamespaceType {
return []NamespaceType{
NEWNET,
@ -35,26 +84,7 @@ func (n *Namespace) GetPath(pid int) string {
if n.Path != "" {
return n.Path
}
return fmt.Sprintf("/proc/%d/ns/%s", pid, n.file())
}
func (n *Namespace) file() string {
file := ""
switch n.Type {
case NEWNET:
file = "net"
case NEWNS:
file = "mnt"
case NEWPID:
file = "pid"
case NEWIPC:
file = "ipc"
case NEWUSER:
file = "user"
case NEWUTS:
file = "uts"
}
return file
return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type))
}
func (n *Namespaces) Remove(t NamespaceType) bool {
@ -87,3 +117,11 @@ func (n *Namespaces) index(t NamespaceType) int {
func (n *Namespaces) Contains(t NamespaceType) bool {
return n.index(t) != -1
}
func (n *Namespaces) PathOf(t NamespaceType) string {
i := n.index(t)
if i == -1 {
return ""
}
return (*n)[i].Path
}

View File

@ -4,8 +4,10 @@ import (
"fmt"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/selinux"
)
type Validator interface {
@ -35,10 +37,13 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.usernamespace(config); err != nil {
return err
}
if err := v.sysctl(config); err != nil {
return err
}
return nil
}
// rootfs validates the the rootfs is an absolute path and is not a symlink
// rootfs validates if the rootfs is an absolute path and is not a symlink
// to the container's root filesystem.
func (v *ConfigValidator) rootfs(config *configs.Config) error {
cleaned, err := filepath.Abs(config.Rootfs)
@ -48,7 +53,7 @@ func (v *ConfigValidator) rootfs(config *configs.Config) error {
if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
return err
}
if config.Rootfs != cleaned {
if filepath.Clean(config.Rootfs) != cleaned {
return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs)
}
return nil
@ -76,6 +81,10 @@ func (v *ConfigValidator) security(config *configs.Config) error {
!config.Namespaces.Contains(configs.NEWNS) {
return fmt.Errorf("unable to restrict sys entries without a private MNT namespace")
}
if config.ProcessLabel != "" && !selinux.SelinuxEnabled() {
return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported")
}
return nil
}
@ -91,3 +100,39 @@ func (v *ConfigValidator) usernamespace(config *configs.Config) error {
}
return nil
}
// sysctl validates that the specified sysctl keys are valid or not.
// /proc/sys isn't completely namespaced and depending on which namespaces
// are specified, a subset of sysctls are permitted.
func (v *ConfigValidator) sysctl(config *configs.Config) error {
validSysctlMap := map[string]bool{
"kernel.msgmax": true,
"kernel.msgmnb": true,
"kernel.msgmni": true,
"kernel.sem": true,
"kernel.shmall": true,
"kernel.shmmax": true,
"kernel.shmmni": true,
"kernel.shm_rmid_forced": true,
}
for s := range config.Sysctl {
if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") {
if config.Namespaces.Contains(configs.NEWIPC) {
continue
} else {
return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s)
}
}
if strings.HasPrefix(s, "net.") {
if config.Namespaces.Contains(configs.NEWNET) {
continue
} else {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s)
}
}
return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s)
}
return nil
}

View File

@ -0,0 +1,11 @@
package libcontainer
import (
"errors"
)
// NewConsole returns an initalized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func NewConsole(uid, gid int) (Console, error) {
return nil, errors.New("libcontainer console is not supported on Solaris")
}

View File

@ -1,4 +1,4 @@
// Libcontainer provides a native Go implementation for creating containers
// Package libcontainer provides a native Go implementation for creating containers
// with namespaces, cgroups, capabilities, and filesystem access controls.
// It allows you to manage the lifecycle of the container performing additional operations
// after the container is created.
@ -6,31 +6,25 @@ package libcontainer
import (
"os"
"time"
"github.com/opencontainers/runc/libcontainer/configs"
)
// The status of a container.
// Status is the status of a container.
type Status int
const (
// The container exists but has not been run yet
// Created is the status that denotes the container exists but has not been run yet.
Created Status = iota
// The container exists and is running.
// Running is the status that denotes the container exists and is running.
Running
// The container exists, it is in the process of being paused.
// Pausing is the status that denotes the container exists, it is in the process of being paused.
Pausing
// The container exists, but all its processes are paused.
// Paused is the status that denotes the container exists, but all its processes are paused.
Paused
// The container exists, but its state is saved on disk
Checkpointed
// The container does not exist.
Destroyed
// Stopped is the status that denotes the container does not have a created or running process.
Stopped
)
func (s Status) String() string {
@ -43,10 +37,8 @@ func (s Status) String() string {
return "pausing"
case Paused:
return "paused"
case Checkpointed:
return "checkpointed"
case Destroyed:
return "destroyed"
case Stopped:
return "stopped"
default:
return "unknown"
}
@ -61,14 +53,17 @@ type BaseState struct {
// InitProcessPid is the init process id in the parent namespace.
InitProcessPid int `json:"init_process_pid"`
// InitProcessStartTime is the init process start time.
// InitProcessStartTime is the init process start time in clock cycles since boot time.
InitProcessStartTime string `json:"init_process_start"`
// Created is the unix timestamp for the creation time of the container in UTC
Created time.Time `json:"created"`
// Config is the container's configuration.
Config configs.Config `json:"config"`
}
// A libcontainer container object.
// BaseContainer is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
@ -81,13 +76,13 @@ type BaseContainer interface {
//
// errors:
// ContainerDestroyed - Container no longer exists,
// Systemerror - System error.
// SystemError - System error.
Status() (Status, error)
// State returns the current container's state information.
//
// errors:
// Systemerror - System error.
// SystemError - System error.
State() (*State, error)
// Returns the current config of the container.
@ -97,7 +92,7 @@ type BaseContainer interface {
//
// errors:
// ContainerDestroyed - Container no longer exists,
// Systemerror - System error.
// SystemError - System error.
//
// Some of the returned PIDs may no longer refer to processes in the Container, unless
// the Container state is PAUSED in which case every PID in the slice is valid.
@ -107,7 +102,7 @@ type BaseContainer interface {
//
// errors:
// ContainerDestroyed - Container no longer exists,
// Systemerror - System error.
// SystemError - System error.
Stats() (*Stats, error)
// Set resources of container as configured
@ -115,7 +110,7 @@ type BaseContainer interface {
// We can use this to change resources when containers are running.
//
// errors:
// Systemerror - System error.
// SystemError - System error.
Set(config configs.Config) error
// Start a process inside the container. Returns error if process fails to
@ -125,21 +120,38 @@ type BaseContainer interface {
// ContainerDestroyed - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// Systemerror - System error.
// SystemError - System error.
Start(process *Process) (err error)
// Run immediatly starts the process inside the conatiner. Returns error if process
// fails to start. It does not block waiting for the exec fifo after start returns but
// opens the fifo after start returns.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// SystemError - System error.
Run(process *Process) (err error)
// Destroys the container after killing all running processes.
//
// Any event registrations are removed before the container is destroyed.
// No error is returned if the container is already destroyed.
//
// errors:
// Systemerror - System error.
// SystemError - System error.
Destroy() error
// Signal sends the provided signal code to the container's initial process.
//
// errors:
// Systemerror - System error.
// SystemError - System error.
Signal(s os.Signal) error
// Exec signals the container to exec the users process at the end of the init.
//
// errors:
// SystemError - System error.
Exec() error
}

View File

@ -15,30 +15,35 @@ import (
"strings"
"sync"
"syscall"
"time"
"github.com/Sirupsen/logrus"
"github.com/golang/protobuf/proto"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/criurpc"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/syndtr/gocapability/capability"
"github.com/vishvananda/netlink/nl"
)
const stdioFdCount = 3
type linuxContainer struct {
id string
root string
config *configs.Config
cgroupManager cgroups.Manager
initPath string
initArgs []string
initProcess parentProcess
criuPath string
m sync.Mutex
criuVersion int
state containerState
id string
root string
config *configs.Config
cgroupManager cgroups.Manager
initPath string
initArgs []string
initProcess parentProcess
initProcessStartTime string
criuPath string
m sync.Mutex
criuVersion int
state containerState
created time.Time
}
// State represents a running container's state
@ -59,7 +64,7 @@ type State struct {
ExternalDescriptors []string `json:"external_descriptors,omitempty"`
}
// A libcontainer container object.
// Container is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
@ -75,13 +80,13 @@ type Container interface {
// Systemerror - System error.
Checkpoint(criuOpts *CriuOpts) error
// Restore restores the checkpointed container to a running state using the criu(8) utiity.
// Restore restores the checkpointed container to a running state using the criu(8) utility.
//
// errors:
// Systemerror - System error.
Restore(process *Process, criuOpts *CriuOpts) error
// If the Container state is RUNNING or PAUSING, sets the Container state to PAUSING and pauses
// If the Container state is RUNNING, sets the Container state to PAUSING and pauses
// the execution of any user processes. Asynchronously, when the container finished being paused the
// state is changed to PAUSED.
// If the Container state is PAUSED, do nothing.
@ -138,7 +143,7 @@ func (c *linuxContainer) State() (*State, error) {
func (c *linuxContainer) Processes() ([]int, error) {
pids, err := c.cgroupManager.GetAllPids()
if err != nil {
return nil, newSystemError(err)
return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
}
return pids, nil
}
@ -149,14 +154,14 @@ func (c *linuxContainer) Stats() (*Stats, error) {
stats = &Stats{}
)
if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
return stats, newSystemError(err)
return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
}
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
if err != nil {
return stats, newSystemError(err)
return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
}
stats.Interfaces = append(stats.Interfaces, istats)
}
@ -167,6 +172,13 @@ func (c *linuxContainer) Stats() (*Stats, error) {
func (c *linuxContainer) Set(config configs.Config) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status == Stopped {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
}
c.config = &config
return c.cgroupManager.Set(c.config)
}
@ -178,38 +190,90 @@ func (c *linuxContainer) Start(process *Process) error {
if err != nil {
return err
}
doInit := status == Destroyed
parent, err := c.newParentProcess(process, doInit)
return c.start(process, status == Stopped)
}
func (c *linuxContainer) Run(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return newSystemError(err)
return err
}
if err := c.start(process, status == Stopped); err != nil {
return err
}
if status == Stopped {
return c.exec()
}
return nil
}
func (c *linuxContainer) Exec() error {
c.m.Lock()
defer c.m.Unlock()
return c.exec()
}
func (c *linuxContainer) exec() error {
path := filepath.Join(c.root, execFifoFilename)
f, err := os.OpenFile(path, os.O_RDONLY, 0)
if err != nil {
return newSystemErrorWithCause(err, "open exec fifo for reading")
}
defer f.Close()
data, err := ioutil.ReadAll(f)
if err != nil {
return err
}
if len(data) > 0 {
os.Remove(path)
return nil
}
return fmt.Errorf("cannot start an already running container")
}
func (c *linuxContainer) start(process *Process, isInit bool) error {
parent, err := c.newParentProcess(process, isInit)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
}
if err := parent.start(); err != nil {
// terminate the process to ensure that it properly is reaped.
if err := parent.terminate(); err != nil {
logrus.Warn(err)
}
return newSystemError(err)
return newSystemErrorWithCause(err, "starting container process")
}
// generate a timestamp indicating when the container was started
c.created = time.Now().UTC()
c.state = &runningState{
c: c,
}
if doInit {
if err := c.updateState(parent); err != nil {
if isInit {
c.state = &createdState{
c: c,
}
state, err := c.updateState(parent)
if err != nil {
return err
}
c.initProcessStartTime = state.InitProcessStartTime
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
Version: c.config.Version,
ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
}
for _, hook := range c.config.Hooks.Poststart {
for i, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil {
if err := parent.terminate(); err != nil {
logrus.Warn(err)
}
return newSystemError(err)
return newSystemErrorWithCausef(err, "running poststart hook %d", i)
}
}
}
@ -219,7 +283,7 @@ func (c *linuxContainer) Start(process *Process) error {
func (c *linuxContainer) Signal(s os.Signal) error {
if err := c.initProcess.signal(s); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "signaling init process")
}
return nil
}
@ -227,19 +291,23 @@ func (c *linuxContainer) Signal(s os.Signal) error {
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
parentPipe, childPipe, err := newPipe()
if err != nil {
return nil, newSystemError(err)
return nil, newSystemErrorWithCause(err, "creating new init pipe")
}
cmd, err := c.commandTemplate(p, childPipe)
rootDir, err := os.Open(c.root)
if err != nil {
return nil, newSystemError(err)
return nil, err
}
cmd, err := c.commandTemplate(p, childPipe, rootDir)
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new command template")
}
if !doInit {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir)
}
return c.newInitProcess(p, cmd, parentPipe, childPipe)
return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
}
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) {
cmd := &exec.Cmd{
Path: c.initPath,
Args: c.initArgs,
@ -251,8 +319,10 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &syscall.SysProcAttr{}
}
cmd.ExtraFiles = append(p.ExtraFiles, childPipe)
cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2),
fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
// NOTE: when running a container with no PID namespace and the parent process spawning the container is
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
// even with the parent still running.
@ -262,38 +332,42 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
return cmd, nil
}
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
t := "_LIBCONTAINER_INITTYPE=" + string(initStandard)
cloneFlags := c.config.Namespaces.CloneFlags()
if cloneFlags&syscall.CLONE_NEWUSER != 0 {
if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
// user mappings are not supported
return nil, err
}
enableSetgroups(cmd.SysProcAttr)
// Default to root user when user namespaces are enabled.
if cmd.SysProcAttr.Credential == nil {
cmd.SysProcAttr.Credential = &syscall.Credential{}
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
if ns.Path != "" {
nsMaps[ns.Type] = ns.Path
}
}
cmd.Env = append(cmd.Env, t)
cmd.SysProcAttr.Cloneflags = cloneFlags
_, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "")
if err != nil {
return nil, err
}
return &initProcess{
cmd: cmd,
childPipe: childPipe,
parentPipe: parentPipe,
manager: c.cgroupManager,
config: c.newInitConfig(p),
container: c,
process: p,
cmd: cmd,
childPipe: childPipe,
parentPipe: parentPipe,
manager: c.cgroupManager,
config: c.newInitConfig(p),
container: c,
process: p,
bootstrapData: data,
sharePidns: sharePidns,
rootDir: rootDir,
}, nil
}
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
state, err := c.currentState()
if err != nil {
return nil, newSystemErrorWithCause(err, "getting container's current state")
}
// for setns process, we dont have to set cloneflags as the process namespaces
// will only be set via setns syscall
data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath)
if err != nil {
return nil, err
}
@ -306,11 +380,12 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe,
config: c.newInitConfig(p),
process: p,
bootstrapData: data,
rootDir: rootDir,
}, nil
}
func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
return &initConfig{
cfg := &initConfig{
Config: c.config,
Args: process.Args,
Env: process.Env,
@ -319,7 +394,26 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
Console: process.consolePath,
Capabilities: process.Capabilities,
PassedFilesCount: len(process.ExtraFiles),
ContainerId: c.ID(),
NoNewPrivileges: c.config.NoNewPrivileges,
AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits,
ExecFifoPath: filepath.Join(c.root, execFifoFilename),
}
if process.NoNewPrivileges != nil {
cfg.NoNewPrivileges = *process.NoNewPrivileges
}
if process.AppArmorProfile != "" {
cfg.AppArmorProfile = process.AppArmorProfile
}
if process.Label != "" {
cfg.ProcessLabel = process.Label
}
if len(process.Rlimits) > 0 {
cfg.Rlimits = process.Rlimits
}
return cfg
}
func newPipe() (parent *os.File, child *os.File, err error) {
@ -343,15 +437,16 @@ func (c *linuxContainer) Pause() error {
if err != nil {
return err
}
if status != Running {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
switch status {
case Running, Created:
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
}
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
return newGenericError(fmt.Errorf("container not running: %s", status), ContainerNotRunning)
}
func (c *linuxContainer) Resume() error {
@ -380,23 +475,13 @@ func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struc
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
}
// XXX debug support, remove when debugging done.
func addArgsFromEnv(evar string, args *[]string) {
if e := os.Getenv(evar); e != "" {
for _, f := range strings.Fields(e) {
*args = append(*args, f)
}
}
fmt.Printf(">>> criu %v\n", *args)
}
// check Criu version greater than or equal to min_version
func (c *linuxContainer) checkCriuVersion(min_version string) error {
// checkCriuVersion checks Criu version greater than or equal to minVersion
func (c *linuxContainer) checkCriuVersion(minVersion string) error {
var x, y, z, versionReq int
_, err := fmt.Sscanf(min_version, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
_, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
if err != nil {
_, err = fmt.Sscanf(min_version, "Version: %d.%d\n", &x, &y) // 1.6
_, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6
}
versionReq = x*10000 + y*100 + z
@ -441,7 +526,7 @@ func (c *linuxContainer) checkCriuVersion(min_version string) error {
c.criuVersion = x*10000 + y*100 + z
if c.criuVersion < versionReq {
return fmt.Errorf("CRIU version must be %s or higher", min_version)
return fmt.Errorf("CRIU version must be %s or higher", minVersion)
}
return nil
@ -514,6 +599,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
}
// append optional criu opts, e.g., page-server and port
@ -529,7 +615,8 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
if err := c.checkCriuVersion("1.7"); err != nil {
return err
}
rpcOpts.ManageCgroupsMode = proto.Uint32(uint32(criuOpts.ManageCgroupsMode))
mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
rpcOpts.ManageCgroupsMode = &mode
}
t := criurpc.CriuReqType_DUMP
@ -587,6 +674,27 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(iface.HostInterfaceName)
veth.IfIn = proto.String(iface.Name)
req.Opts.Veths = append(req.Opts.Veths, veth)
break
case "loopback":
break
}
}
for _, i := range criuOpts.VethPairs {
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(i.HostInterfaceName)
veth.IfIn = proto.String(i.ContainerInterfaceName)
req.Opts.Veths = append(req.Opts.Veths, veth)
}
}
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
@ -650,6 +758,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
},
}
@ -669,23 +778,9 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
break
}
}
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(iface.HostInterfaceName)
veth.IfIn = proto.String(iface.Name)
req.Opts.Veths = append(req.Opts.Veths, veth)
break
case "loopback":
break
}
}
for _, i := range criuOpts.VethPairs {
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(i.HostInterfaceName)
veth.IfIn = proto.String(i.ContainerInterfaceName)
req.Opts.Veths = append(req.Opts.Veths, veth)
if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
c.restoreNetwork(req, criuOpts)
}
// append optional manage cgroups mode
@ -693,7 +788,8 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
if err := c.checkCriuVersion("1.7"); err != nil {
return err
}
req.Opts.ManageCgroupsMode = proto.Uint32(uint32(criuOpts.ManageCgroupsMode))
mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
req.Opts.ManageCgroupsMode = &mode
}
var (
@ -850,7 +946,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
if err != nil {
return err
}
n, err = criuClient.Write(data)
_, err = criuClient.Write(data)
if err != nil {
return err
}
@ -925,6 +1021,20 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
if err := lockNetwork(c.config); err != nil {
return err
}
case notify.GetScript() == "setup-namespaces":
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: int(notify.GetPid()),
Root: c.config.Rootfs,
}
for i, hook := range c.config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
case notify.GetScript() == "post-restore":
pid := notify.GetPid()
r, err := newRestoredProcess(int(pid), fds)
@ -938,7 +1048,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
}); err != nil {
return err
}
if err := c.updateState(r); err != nil {
if _, err := c.updateState(r); err != nil {
return err
}
if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
@ -950,13 +1060,17 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
return nil
}
func (c *linuxContainer) updateState(process parentProcess) error {
func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
c.initProcess = process
state, err := c.currentState()
if err != nil {
return err
return nil, err
}
return c.saveState(state)
err = c.saveState(state)
if err != nil {
return nil, err
}
return state, nil
}
func (c *linuxContainer) saveState(s *State) error {
@ -991,37 +1105,75 @@ func (c *linuxContainer) refreshState() error {
if paused {
return c.state.transition(&pausedState{c: c})
}
running, err := c.isRunning()
t, err := c.runType()
if err != nil {
return err
}
if running {
switch t {
case Created:
return c.state.transition(&createdState{c: c})
case Running:
return c.state.transition(&runningState{c: c})
}
return c.state.transition(&stoppedState{c: c})
}
func (c *linuxContainer) isRunning() (bool, error) {
if c.initProcess == nil {
// doesInitProcessExist checks if the init process is still the same process
// as the initial one, it could happen that the original process has exited
// and a new process has been created with the same pid, in this case, the
// container would already be stopped.
func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) {
startTime, err := system.GetProcessStartTime(initPid)
if err != nil {
return false, newSystemErrorWithCausef(err, "getting init process %d start time", initPid)
}
if c.initProcessStartTime != startTime {
return false, nil
}
// return Running if the init process is alive
if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
if err == syscall.ESRCH {
return false, nil
}
return false, newSystemError(err)
}
return true, nil
}
func (c *linuxContainer) runType() (Status, error) {
if c.initProcess == nil {
return Stopped, nil
}
pid := c.initProcess.pid()
// return Running if the init process is alive
if err := syscall.Kill(pid, 0); err != nil {
if err == syscall.ESRCH {
// It means the process does not exist anymore, could happen when the
// process exited just when we call the function, we should not return
// error in this case.
return Stopped, nil
}
return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid)
}
// check if the process is still the original init process.
exist, err := c.doesInitProcessExist(pid)
if !exist || err != nil {
return Stopped, err
}
// check if the process that is running is the init process or the user's process.
// this is the difference between the container Running and Created.
environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid))
if err != nil {
return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid)
}
check := []byte("_LIBCONTAINER")
if bytes.Contains(environ, check) {
return Created, nil
}
return Running, nil
}
func (c *linuxContainer) isPaused() (bool, error) {
data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
if err != nil {
// If freezer cgroup is not mounted, the container would just be not paused.
if os.IsNotExist(err) {
return false, nil
}
return false, newSystemError(err)
return false, newSystemErrorWithCause(err, "checking if container is paused")
}
return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
}
@ -1043,6 +1195,7 @@ func (c *linuxContainer) currentState() (*State, error) {
Config: *c.config,
InitProcessPid: pid,
InitProcessStartTime: startTime,
Created: c.created,
},
CgroupPaths: c.cgroupManager.GetPaths(),
NamespacePaths: make(map[configs.NamespaceType]string),
@ -1053,6 +1206,9 @@ func (c *linuxContainer) currentState() (*State, error) {
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
}
for _, nsType := range configs.NamespaceTypes() {
if !configs.IsNamespaceSupported(nsType) {
continue
}
if _, ok := state.NamespacePaths[nsType]; !ok {
ns := configs.Namespace{Type: nsType}
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
@ -1062,18 +1218,69 @@ func (c *linuxContainer) currentState() (*State, error) {
return state, nil
}
// bootstrapData encodes the necessary data in netlink binary format as a io.Reader.
// Consumer can write the data to a bootstrap program such as one that uses
// nsenter package to bootstrap the container's init process correctly, i.e. with
// correct namespaces, uid/gid mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) {
// orderNamespacePaths sorts namespace paths into a list of paths that we
// can setns in order.
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
paths := []string{}
nsTypes := []configs.NamespaceType{
configs.NEWIPC,
configs.NEWUTS,
configs.NEWNET,
configs.NEWPID,
configs.NEWNS,
}
// join userns if the init process explicitly requires NEWUSER
if c.config.Namespaces.Contains(configs.NEWUSER) {
nsTypes = append(nsTypes, configs.NEWUSER)
}
for _, nsType := range nsTypes {
if p, ok := namespaces[nsType]; ok && p != "" {
// check if the requested namespace is supported
if !configs.IsNamespaceSupported(nsType) {
return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType))
}
// only set to join this namespace if it exists
if _, err := os.Lstat(p); err != nil {
return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
}
// do not allow namespace path with comma as we use it to separate
// the namespace paths
if strings.ContainsRune(p, ',') {
return nil, newSystemError(fmt.Errorf("invalid path %s", p))
}
paths = append(paths, p)
}
}
return paths, nil
}
func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
data := bytes.NewBuffer(nil)
for _, im := range idMap {
line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
if _, err := data.WriteString(line); err != nil {
return nil, err
}
}
return data.Bytes(), nil
}
// bootstrapData encodes the necessary data in netlink binary format
// as a io.Reader.
// Consumer can write the data to a bootstrap program
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) {
// create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0)
// write pid
// write cloneFlags
r.AddData(&Int32msg{
Type: PidAttr,
Value: uint32(pid),
Type: CloneFlagsAttr,
Value: uint32(cloneFlags),
})
// write console path
if consolePath != "" {
r.AddData(&Bytemsg{
@ -1081,5 +1288,57 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath
Value: []byte(consolePath),
})
}
// write custom namespace paths
if len(nsMaps) > 0 {
nsPaths, err := c.orderNamespacePaths(nsMaps)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: NsPathsAttr,
Value: []byte(strings.Join(nsPaths, ",")),
})
}
// write namespace paths only when we are not joining an existing user ns
_, joinExistingUser := nsMaps[configs.NEWUSER]
if !joinExistingUser {
// write uid mappings
if len(c.config.UidMappings) > 0 {
b, err := encodeIDMapping(c.config.UidMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: UidmapAttr,
Value: b,
})
}
// write gid mappings
if len(c.config.GidMappings) > 0 {
b, err := encodeIDMapping(c.config.GidMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: GidmapAttr,
Value: b,
})
// check if we have CAP_SETGID to setgroup properly
pid, err := capability.NewPid(os.Getpid())
if err != nil {
return nil, err
}
if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
r.AddData(&Boolmsg{
Type: SetgroupAttr,
Value: true,
})
}
}
}
return bytes.NewReader(r.Serialize()), nil
}

View File

@ -1,13 +0,0 @@
// +build !go1.4
package libcontainer
import (
"fmt"
"syscall"
)
// not available before go 1.4
func (c *linuxContainer) addUidGidMappings(sys *syscall.SysProcAttr) error {
return fmt.Errorf("User namespace is not supported in golang < 1.4")
}

View File

@ -0,0 +1,20 @@
package libcontainer
// State represents a running container's state
type State struct {
BaseState
// Platform specific fields below here
}
// A libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found.
type Container interface {
BaseContainer
// Methods below here are platform specific
}

View File

@ -1,26 +0,0 @@
// +build go1.4
package libcontainer
import "syscall"
// Converts IDMap to SysProcIDMap array and adds it to SysProcAttr.
func (c *linuxContainer) addUidGidMappings(sys *syscall.SysProcAttr) error {
if c.config.UidMappings != nil {
sys.UidMappings = make([]syscall.SysProcIDMap, len(c.config.UidMappings))
for i, um := range c.config.UidMappings {
sys.UidMappings[i].ContainerID = um.ContainerID
sys.UidMappings[i].HostID = um.HostID
sys.UidMappings[i].Size = um.Size
}
}
if c.config.GidMappings != nil {
sys.GidMappings = make([]syscall.SysProcIDMap, len(c.config.GidMappings))
for i, gm := range c.config.GidMappings {
sys.GidMappings[i].ContainerID = gm.ContainerID
sys.GidMappings[i].HostID = gm.HostID
sys.GidMappings[i].Size = gm.Size
}
}
return nil
}

View File

@ -3,13 +3,13 @@
package libcontainer
// cgroup restoring strategy provided by criu
type cg_mode uint32
type cgMode uint32
const (
CRIU_CG_MODE_SOFT cg_mode = 3 + iota // restore cgroup properties if only dir created by criu
CRIU_CG_MODE_FULL // always restore all cgroups and their properties
CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system
CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT
CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu
CRIU_CG_MODE_FULL // always restore all cgroups and their properties
CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system
CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT
)
type CriuPageServerInfo struct {
@ -32,5 +32,6 @@ type CriuOpts struct {
FileLocks bool // handle file locks, for safety
PageServer CriuPageServerInfo // allow to dump to criu page server
VethPairs []VethPairName // pass the veth to criu when restore
ManageCgroupsMode cg_mode // dump or restore cgroup mode
ManageCgroupsMode cgMode // dump or restore cgroup mode
EmptyNs uint32 // don't c/r properties for namespace from this mask
}

View File

@ -19,6 +19,7 @@ It has these top-level messages:
CriuDumpResp
CriuRestoreResp
CriuNotify
CriuFeatures
CriuReq
CriuResp
*/
@ -31,6 +32,54 @@ import math "math"
var _ = proto.Marshal
var _ = math.Inf
type CriuCgMode int32
const (
CriuCgMode_IGNORE CriuCgMode = 0
CriuCgMode_NONE CriuCgMode = 1
CriuCgMode_PROPS CriuCgMode = 2
CriuCgMode_SOFT CriuCgMode = 3
CriuCgMode_FULL CriuCgMode = 4
CriuCgMode_STRICT CriuCgMode = 5
CriuCgMode_DEFAULT CriuCgMode = 6
)
var CriuCgMode_name = map[int32]string{
0: "IGNORE",
1: "NONE",
2: "PROPS",
3: "SOFT",
4: "FULL",
5: "STRICT",
6: "DEFAULT",
}
var CriuCgMode_value = map[string]int32{
"IGNORE": 0,
"NONE": 1,
"PROPS": 2,
"SOFT": 3,
"FULL": 4,
"STRICT": 5,
"DEFAULT": 6,
}
func (x CriuCgMode) Enum() *CriuCgMode {
p := new(CriuCgMode)
*p = x
return p
}
func (x CriuCgMode) String() string {
return proto.EnumName(CriuCgMode_name, int32(x))
}
func (x *CriuCgMode) UnmarshalJSON(data []byte) error {
value, err := proto.UnmarshalJSONEnum(CriuCgMode_value, data, "CriuCgMode")
if err != nil {
return err
}
*x = CriuCgMode(value)
return nil
}
type CriuReqType int32
const (
@ -43,6 +92,7 @@ const (
CriuReqType_NOTIFY CriuReqType = 6
CriuReqType_CPUINFO_DUMP CriuReqType = 7
CriuReqType_CPUINFO_CHECK CriuReqType = 8
CriuReqType_FEATURE_CHECK CriuReqType = 9
)
var CriuReqType_name = map[int32]string{
@ -55,6 +105,7 @@ var CriuReqType_name = map[int32]string{
6: "NOTIFY",
7: "CPUINFO_DUMP",
8: "CPUINFO_CHECK",
9: "FEATURE_CHECK",
}
var CriuReqType_value = map[string]int32{
"EMPTY": 0,
@ -66,6 +117,7 @@ var CriuReqType_value = map[string]int32{
"NOTIFY": 6,
"CPUINFO_DUMP": 7,
"CPUINFO_CHECK": 8,
"FEATURE_CHECK": 9,
}
func (x CriuReqType) Enum() *CriuReqType {
@ -271,7 +323,12 @@ type CriuOpts struct {
SkipMnt []string `protobuf:"bytes,31,rep,name=skip_mnt" json:"skip_mnt,omitempty"`
EnableFs []string `protobuf:"bytes,32,rep,name=enable_fs" json:"enable_fs,omitempty"`
UnixSkIno []*UnixSk `protobuf:"bytes,33,rep,name=unix_sk_ino" json:"unix_sk_ino,omitempty"`
ManageCgroupsMode *uint32 `protobuf:"varint,34,opt,name=manage_cgroups_mode" json:"manage_cgroups_mode,omitempty"`
ManageCgroupsMode *CriuCgMode `protobuf:"varint,34,opt,name=manage_cgroups_mode,enum=CriuCgMode" json:"manage_cgroups_mode,omitempty"`
GhostLimit *uint32 `protobuf:"varint,35,opt,name=ghost_limit,def=1048576" json:"ghost_limit,omitempty"`
IrmapScanPaths []string `protobuf:"bytes,36,rep,name=irmap_scan_paths" json:"irmap_scan_paths,omitempty"`
External []string `protobuf:"bytes,37,rep,name=external" json:"external,omitempty"`
EmptyNs *uint32 `protobuf:"varint,38,opt,name=empty_ns" json:"empty_ns,omitempty"`
NoSeccomp *bool `protobuf:"varint,39,opt,name=no_seccomp" json:"no_seccomp,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
@ -281,6 +338,7 @@ func (*CriuOpts) ProtoMessage() {}
const Default_CriuOpts_LogLevel int32 = 2
const Default_CriuOpts_CpuCap uint32 = 4294967295
const Default_CriuOpts_GhostLimit uint32 = 1048576
func (m *CriuOpts) GetImagesDirFd() int32 {
if m != nil && m.ImagesDirFd != nil {
@ -513,13 +571,48 @@ func (m *CriuOpts) GetUnixSkIno() []*UnixSk {
return nil
}
func (m *CriuOpts) GetManageCgroupsMode() uint32 {
func (m *CriuOpts) GetManageCgroupsMode() CriuCgMode {
if m != nil && m.ManageCgroupsMode != nil {
return *m.ManageCgroupsMode
}
return CriuCgMode_IGNORE
}
func (m *CriuOpts) GetGhostLimit() uint32 {
if m != nil && m.GhostLimit != nil {
return *m.GhostLimit
}
return Default_CriuOpts_GhostLimit
}
func (m *CriuOpts) GetIrmapScanPaths() []string {
if m != nil {
return m.IrmapScanPaths
}
return nil
}
func (m *CriuOpts) GetExternal() []string {
if m != nil {
return m.External
}
return nil
}
func (m *CriuOpts) GetEmptyNs() uint32 {
if m != nil && m.EmptyNs != nil {
return *m.EmptyNs
}
return 0
}
func (m *CriuOpts) GetNoSeccomp() bool {
if m != nil && m.NoSeccomp != nil {
return *m.NoSeccomp
}
return false
}
type CriuDumpResp struct {
Restored *bool `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"`
XXX_unrecognized []byte `json:"-"`
@ -576,6 +669,25 @@ func (m *CriuNotify) GetPid() int32 {
return 0
}
//
// List of features which can queried via
// CRIU_REQ_TYPE__FEATURE_CHECK
type CriuFeatures struct {
MemTrack *bool `protobuf:"varint,1,opt,name=mem_track" json:"mem_track,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *CriuFeatures) Reset() { *m = CriuFeatures{} }
func (m *CriuFeatures) String() string { return proto.CompactTextString(m) }
func (*CriuFeatures) ProtoMessage() {}
func (m *CriuFeatures) GetMemTrack() bool {
if m != nil && m.MemTrack != nil {
return *m.MemTrack
}
return false
}
type CriuReq struct {
Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"`
Opts *CriuOpts `protobuf:"bytes,2,opt,name=opts" json:"opts,omitempty"`
@ -584,8 +696,13 @@ type CriuReq struct {
// When set service won't close the connection but
// will wait for more req-s to appear. Works not
// for all request types.
KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open" json:"keep_open,omitempty"`
XXX_unrecognized []byte `json:"-"`
KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open" json:"keep_open,omitempty"`
//
// 'features' can be used to query which features
// are supported by the installed criu/kernel
// via RPC.
Features *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *CriuReq) Reset() { *m = CriuReq{} }
@ -620,6 +737,13 @@ func (m *CriuReq) GetKeepOpen() bool {
return false
}
func (m *CriuReq) GetFeatures() *CriuFeatures {
if m != nil {
return m.Features
}
return nil
}
type CriuResp struct {
Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"`
Success *bool `protobuf:"varint,2,req,name=success" json:"success,omitempty"`
@ -628,6 +752,7 @@ type CriuResp struct {
Notify *CriuNotify `protobuf:"bytes,5,opt,name=notify" json:"notify,omitempty"`
Ps *CriuPageServerInfo `protobuf:"bytes,6,opt,name=ps" json:"ps,omitempty"`
CrErrno *int32 `protobuf:"varint,7,opt,name=cr_errno" json:"cr_errno,omitempty"`
Features *CriuFeatures `protobuf:"bytes,8,opt,name=features" json:"features,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
@ -684,6 +809,14 @@ func (m *CriuResp) GetCrErrno() int32 {
return 0
}
func (m *CriuResp) GetFeatures() *CriuFeatures {
if m != nil {
return m.Features
}
return nil
}
func init() {
proto.RegisterEnum("CriuCgMode", CriuCgMode_name, CriuCgMode_value)
proto.RegisterEnum("CriuReqType", CriuReqType_name, CriuReqType_value)
}

View File

@ -29,6 +29,16 @@ message unix_sk {
required uint32 inode = 1;
};
enum criu_cg_mode {
IGNORE = 0;
NONE = 1;
PROPS = 2;
SOFT = 3;
FULL = 4;
STRICT = 5;
DEFAULT = 6;
};
message criu_opts {
required int32 images_dir_fd = 1;
optional int32 pid = 2; /* if not set on dump, will dump requesting process */
@ -75,7 +85,12 @@ message criu_opts {
repeated unix_sk unix_sk_ino = 33;
optional uint32 manage_cgroups_mode = 34;
optional criu_cg_mode manage_cgroups_mode = 34;
optional uint32 ghost_limit = 35 [default = 0x100000];
repeated string irmap_scan_paths = 36;
repeated string external = 37;
optional uint32 empty_ns = 38;
optional bool no_seccomp = 39;
}
message criu_dump_resp {
@ -103,6 +118,16 @@ enum criu_req_type {
CPUINFO_DUMP = 7;
CPUINFO_CHECK = 8;
FEATURE_CHECK = 9;
}
/*
* List of features which can queried via
* CRIU_REQ_TYPE__FEATURE_CHECK
*/
message criu_features {
optional bool mem_track = 1;
}
/*
@ -122,11 +147,17 @@ message criu_req {
* for all request types.
*/
optional bool keep_open = 4;
/*
* 'features' can be used to query which features
* are supported by the installed criu/kernel
* via RPC.
*/
optional criu_features features = 5;
}
/*
* Responce -- it states whether the request was served
* and additional request-specific informarion
* Response -- it states whether the request was served
* and additional request-specific information
*/
message criu_resp {
@ -139,4 +170,5 @@ message criu_resp {
optional criu_page_server_info ps = 6;
optional int32 cr_errno = 7;
optional criu_features features = 8;
}

View File

@ -2,7 +2,7 @@ package libcontainer
import "io"
// API error code type.
// ErrorCode is the API error code type.
type ErrorCode int
// API error codes.
@ -19,7 +19,7 @@ const (
ContainerNotPaused
// Process errors
ProcessNotExecuted
NoProcessOps
// Common errors
ConfigInvalid
@ -49,12 +49,14 @@ func (c ErrorCode) String() string {
return "Console exists for process"
case ContainerNotPaused:
return "Container is not paused"
case NoProcessOps:
return "No process operations"
default:
return "Unknown error"
}
}
// API Error type.
// Error is the API error type.
type Error interface {
error

View File

@ -9,6 +9,7 @@ import (
"os/exec"
"path/filepath"
"regexp"
"runtime/debug"
"strconv"
"syscall"
@ -22,11 +23,12 @@ import (
)
const (
stateFilename = "state.json"
stateFilename = "state.json"
execFifoFilename = "exec.fifo"
)
var (
idRegex = regexp.MustCompile(`^[\w_-]+$`)
idRegex = regexp.MustCompile(`^[\w+-\.]+$`)
maxIdLen = 1024
)
@ -157,13 +159,34 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
}
uid, err := config.HostUID()
if err != nil {
return nil, newGenericError(err, SystemError)
}
gid, err := config.HostGID()
if err != nil {
return nil, newGenericError(err, SystemError)
}
containerRoot := filepath.Join(l.Root, id)
if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) {
return nil, newGenericError(err, SystemError)
}
if err := os.MkdirAll(containerRoot, 0700); err != nil {
if err := os.MkdirAll(containerRoot, 0711); err != nil {
return nil, newGenericError(err, SystemError)
}
if err := os.Chown(containerRoot, uid, gid); err != nil {
return nil, newGenericError(err, SystemError)
}
fifoName := filepath.Join(containerRoot, execFifoFilename)
oldMask := syscall.Umask(0000)
if err := syscall.Mkfifo(fifoName, 0622); err != nil {
syscall.Umask(oldMask)
return nil, newGenericError(err, SystemError)
}
syscall.Umask(oldMask)
if err := os.Chown(fifoName, uid, gid); err != nil {
return nil, newGenericError(err, SystemError)
}
c := &linuxContainer{
@ -194,16 +217,18 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
fds: state.ExternalDescriptors,
}
c := &linuxContainer{
initProcess: r,
id: id,
config: &state.Config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
root: containerRoot,
initProcess: r,
initProcessStartTime: state.InitProcessStartTime,
id: id,
config: &state.Config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
root: containerRoot,
created: state.Created,
}
c.state = &createdState{c: c, s: Created}
c.state = &loadedState{c: c}
if err := c.refreshState(); err != nil {
return nil, err
}
@ -217,10 +242,18 @@ func (l *LinuxFactory) Type() string {
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
fdStr := os.Getenv("_LIBCONTAINER_INITPIPE")
pipefd, err := strconv.Atoi(fdStr)
if err != nil {
return fmt.Errorf("error converting env var _LIBCONTAINER_INITPIPE(%q) to an int: %s", fdStr, err)
var pipefd, rootfd int
for k, v := range map[string]*int{
"_LIBCONTAINER_INITPIPE": &pipefd,
"_LIBCONTAINER_STATEDIR": &rootfd,
} {
s := os.Getenv(k)
i, err := strconv.Atoi(s)
if err != nil {
return fmt.Errorf("unable to convert %s=%s to int", k, s)
}
*v = i
}
var (
pipe = os.NewFile(uintptr(pipefd), "pipe")
@ -229,29 +262,31 @@ func (l *LinuxFactory) StartInitialization() (err error) {
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
var i initer
defer func() {
// if we have an error during the initialization of the container's init then send it back to the
// parent process in the form of an initError.
if err != nil {
if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init.
if err := utils.WriteJSON(pipe, syncT{procError}); err != nil {
panic(err)
}
}
if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
panic(err)
}
} else {
if err := utils.WriteJSON(pipe, syncT{procStart}); err != nil {
// We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError.
// If container's init successed, syscall.Exec will not return, hence
// this defer function will never be called.
if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init.
if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
panic(err)
}
}
if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
panic(err)
}
// ensure that this pipe is always closed
pipe.Close()
}()
i, err = newContainerInit(it, pipe)
defer func() {
if e := recover(); e != nil {
err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
}
}()
i, err = newContainerInit(it, pipe, rootfd)
if err != nil {
return err
}

View File

@ -14,8 +14,9 @@ type syncType uint8
const (
procReady syncType = iota
procError
procStart
procRun
procHooks
procResume
)
type syncT struct {
@ -51,6 +52,21 @@ func newGenericError(err error, c ErrorCode) Error {
}
func newSystemError(err error) Error {
return createSystemError(err, "")
}
func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error {
return createSystemError(err, fmt.Sprintf(cause, v...))
}
func newSystemErrorWithCause(err error, cause string) Error {
return createSystemError(err, cause)
}
// createSystemError creates the specified error with the correct number of
// stack frames skipped. This is only to be called by the other functions for
// formatting the error.
func createSystemError(err error, cause string) Error {
if le, ok := err.(Error); ok {
return le
}
@ -58,7 +74,8 @@ func newSystemError(err error) Error {
Timestamp: time.Now(),
Err: err,
ECode: SystemError,
Stack: stacktrace.Capture(1),
Cause: cause,
Stack: stacktrace.Capture(2),
}
if err != nil {
gerr.Message = err.Error()
@ -70,12 +87,17 @@ type genericError struct {
Timestamp time.Time
ECode ErrorCode
Err error `json:"-"`
Cause string
Message string
Stack stacktrace.Stacktrace
}
func (e *genericError) Error() string {
return fmt.Sprintf("[%d] %s: %s", e.ECode, e.ECode, e.Message)
if e.Cause == "" {
return e.Message
}
frame := e.Stack.Frames[0]
return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message)
}
func (e *genericError) Code() ErrorCode {

View File

@ -44,22 +44,28 @@ type network struct {
// initConfig is used for transferring parameters from Exec() to Init()
type initConfig struct {
Args []string `json:"args"`
Env []string `json:"env"`
Cwd string `json:"cwd"`
Capabilities []string `json:"capabilities"`
User string `json:"user"`
Config *configs.Config `json:"config"`
Console string `json:"console"`
Networks []*network `json:"network"`
PassedFilesCount int `json:"passed_files_count"`
Args []string `json:"args"`
Env []string `json:"env"`
Cwd string `json:"cwd"`
Capabilities []string `json:"capabilities"`
ProcessLabel string `json:"process_label"`
AppArmorProfile string `json:"apparmor_profile"`
NoNewPrivileges bool `json:"no_new_privileges"`
User string `json:"user"`
Config *configs.Config `json:"config"`
Console string `json:"console"`
Networks []*network `json:"network"`
PassedFilesCount int `json:"passed_files_count"`
ContainerId string `json:"containerid"`
Rlimits []configs.Rlimit `json:"rlimits"`
ExecFifoPath string `json:"start_pipe_path"`
}
type initer interface {
Init() error
}
func newContainerInit(t initType, pipe *os.File) (initer, error) {
func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
@ -74,9 +80,10 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
}, nil
case initStandard:
return &linuxStandardInit{
pipe: pipe,
parentPid: syscall.Getppid(),
config: config,
pipe: pipe,
parentPid: syscall.Getppid(),
config: config,
stateDirFD: stateDirFD,
}, nil
}
return nil, fmt.Errorf("unknown init type %q", t)
@ -163,20 +170,22 @@ func syncParentReady(pipe io.ReadWriter) error {
return nil
}
// joinExistingNamespaces gets all the namespace paths specified for the container and
// does a setns on the namespace fd so that the current process joins the namespace.
func joinExistingNamespaces(namespaces []configs.Namespace) error {
for _, ns := range namespaces {
if ns.Path != "" {
f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0)
if err != nil {
return err
}
err = system.Setns(f.Fd(), uintptr(ns.Syscall()))
f.Close()
if err != nil {
return err
}
// syncParentHooks sends to the given pipe a JSON payload which indicates that
// the parent should execute pre-start hooks. It then waits for the parent to
// indicate that it is cleared to resume.
func syncParentHooks(pipe io.ReadWriter) error {
// Tell parent.
if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil {
return err
}
// Wait for parent to give the all-clear.
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type != procResume {
return fmt.Errorf("invalid synchronisation flag from parent")
}
}
return nil
@ -309,19 +318,19 @@ func setupRoute(config *configs.Config) error {
return nil
}
func setupRlimits(config *configs.Config) error {
for _, rlimit := range config.Rlimits {
l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}
if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
func setupRlimits(limits []configs.Rlimit, pid int) error {
for _, rlimit := range limits {
if err := system.Prlimit(pid, rlimit.Type, syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil {
return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
}
}
return nil
}
func setOomScoreAdj(oomScoreAdj int) error {
path := "/proc/self/oom_score_adj"
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0700)
func setOomScoreAdj(oomScoreAdj int, pid int) error {
path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600)
}
// killCgroupProcesses freezes then iterates over all the processes inside the
@ -338,11 +347,14 @@ func killCgroupProcesses(m cgroups.Manager) error {
return err
}
for _, pid := range pids {
if p, err := os.FindProcess(pid); err == nil {
procs = append(procs, p)
if err := p.Kill(); err != nil {
logrus.Warn(err)
}
p, err := os.FindProcess(pid)
if err != nil {
logrus.Warn(err)
continue
}
procs = append(procs, p)
if err := p.Kill(); err != nil {
logrus.Warn(err)
}
}
if err := m.Freeze(configs.Thawed); err != nil {

View File

@ -0,0 +1,66 @@
// +build linux
package keyctl
import (
"fmt"
"strconv"
"strings"
"syscall"
"unsafe"
)
const KEYCTL_JOIN_SESSION_KEYRING = 1
const KEYCTL_SETPERM = 5
const KEYCTL_DESCRIBE = 6
type KeySerial uint32
func JoinSessionKeyring(name string) (KeySerial, error) {
var _name *byte
var err error
if len(name) > 0 {
_name, err = syscall.BytePtrFromString(name)
if err != nil {
return KeySerial(0), err
}
}
sessKeyId, _, errn := syscall.Syscall(syscall.SYS_KEYCTL, KEYCTL_JOIN_SESSION_KEYRING, uintptr(unsafe.Pointer(_name)), 0)
if errn != 0 {
return 0, fmt.Errorf("could not create session key: %v", errn)
}
return KeySerial(sessKeyId), nil
}
// ModKeyringPerm modifies permissions on a keyring by reading the current permissions,
// anding the bits with the given mask (clearing permissions) and setting
// additional permission bits
func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
dest := make([]byte, 1024)
destBytes := unsafe.Pointer(&dest[0])
if _, _, err := syscall.Syscall6(syscall.SYS_KEYCTL, uintptr(KEYCTL_DESCRIBE), uintptr(ringId), uintptr(destBytes), uintptr(len(dest)), 0, 0); err != 0 {
return err
}
res := strings.Split(string(dest), ";")
if len(res) < 5 {
return fmt.Errorf("Destination buffer for key description is too small")
}
// parse permissions
perm64, err := strconv.ParseUint(res[3], 16, 32)
if err != nil {
return err
}
perm := (uint32(perm64) & mask) | setbits
if _, _, err := syscall.Syscall(syscall.SYS_KEYCTL, uintptr(KEYCTL_SETPERM), uintptr(ringId), uintptr(perm)); err != 0 {
return err
}
return nil
}

View File

@ -21,6 +21,10 @@ func SetProcessLabel(processLabel string) error {
return nil
}
func GetFileLabel(path string) (string, error) {
return "", nil
}
func SetFileLabel(path string, fileLabel string) error {
return nil
}
@ -48,7 +52,7 @@ func UnreserveLabel(label string) error {
return nil
}
// DupSecOpt takes an process label and returns security options that
// DupSecOpt takes a process label and returns security options that
// can be used to set duplicate labels on future container processes
func DupSecOpt(src string) []string {
return nil

View File

@ -94,6 +94,11 @@ func GetProcessLabel() (string, error) {
return selinux.Getexeccon()
}
// GetFileLabel returns the label for specified path
func GetFileLabel(path string) (string, error) {
return selinux.Getfilecon(path)
}
// SetFileLabel modifies the "path" label to the specified file label
func SetFileLabel(path string, fileLabel string) error {
if selinux.SelinuxEnabled() && fileLabel != "" {
@ -102,7 +107,7 @@ func SetFileLabel(path string, fileLabel string) error {
return nil
}
// Tell the kernel the label for all files to be created
// SetFileCreateLabel tells the kernel the label for all files to be created
func SetFileCreateLabel(fileLabel string) error {
if selinux.SelinuxEnabled() {
return selinux.Setfscreatecon(fileLabel)
@ -110,7 +115,7 @@ func SetFileCreateLabel(fileLabel string) error {
return nil
}
// Change the label of path to the filelabel string.
// Relabel changes the label of path to the filelabel string.
// It changes the MCS label to s0 if shared is true.
// This will allow all containers to share the content.
func Relabel(path string, fileLabel string, shared bool) error {

View File

@ -12,8 +12,12 @@ import (
// The number is randomly chosen to not conflict with known netlink types
const (
InitMsg uint16 = 62000
PidAttr uint16 = 27281
CloneFlagsAttr uint16 = 27281
ConsolePathAttr uint16 = 27282
NsPathsAttr uint16 = 27283
UidmapAttr uint16 = 27284
GidmapAttr uint16 = 27285
SetgroupAttr uint16 = 27286
// When syscall.NLA_HDRLEN is in gccgo, take this out.
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
)
@ -23,7 +27,8 @@ type Int32msg struct {
Value uint32
}
// int32msg has the following representation
// Serialize serializes the message.
// Int32msg has the following representation
// | nlattr len | nlattr type |
// | uint32 value |
func (msg *Int32msg) Serialize() []byte {
@ -39,7 +44,7 @@ func (msg *Int32msg) Len() int {
return syscall_NLA_HDRLEN + 4
}
// bytemsg has the following representation
// Bytemsg has the following representation
// | nlattr len | nlattr type |
// | value | pad |
type Bytemsg struct {
@ -60,3 +65,25 @@ func (msg *Bytemsg) Serialize() []byte {
func (msg *Bytemsg) Len() int {
return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
}
type Boolmsg struct {
Type uint16
Value bool
}
func (msg *Boolmsg) Serialize() []byte {
buf := make([]byte, msg.Len())
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(msg.Len()))
native.PutUint16(buf[2:4], msg.Type)
if msg.Value {
buf[4] = 1
} else {
buf[4] = 0
}
return buf
}
func (msg *Boolmsg) Len() int {
return syscall_NLA_HDRLEN + 1
}

View File

@ -5,6 +5,8 @@ import (
"io"
"math"
"os"
"github.com/opencontainers/runc/libcontainer/configs"
)
type processOperations interface {
@ -48,6 +50,20 @@ type Process struct {
// All capabilities not specified will be dropped from the processes capability mask
Capabilities []string
// AppArmorProfile specifies the profile to apply to the process and is
// changed at the time the process is execed
AppArmorProfile string
// Label specifies the label to apply to the process. It is commonly used by selinux
Label string
// NoNewPrivileges controls whether processes can gain additional privileges.
NoNewPrivileges *bool
// Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []configs.Rlimit
ops processOperations
}
@ -55,7 +71,7 @@ type Process struct {
// Wait releases any resources associated with the Process
func (p Process) Wait() (*os.ProcessState, error) {
if p.ops == nil {
return nil, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
}
return p.ops.wait()
}
@ -65,7 +81,7 @@ func (p Process) Pid() (int, error) {
// math.MinInt32 is returned here, because it's invalid value
// for the kill() system call.
if p.ops == nil {
return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
}
return p.ops.pid(), nil
}
@ -73,7 +89,7 @@ func (p Process) Pid() (int, error) {
// Signal sends a signal to the Process.
func (p Process) Signal(sig os.Signal) error {
if p.ops == nil {
return newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
}
return p.ops.signal(sig)
}
@ -86,8 +102,8 @@ type IO struct {
}
// NewConsole creates new console for process and returns it
func (p *Process) NewConsole(rootuid int) (Console, error) {
console, err := NewConsole(rootuid, rootuid)
func (p *Process) NewConsole(rootuid, rootgid int) (Console, error) {
console, err := NewConsole(rootuid, rootgid)
if err != nil {
return nil, err
}

View File

@ -51,6 +51,7 @@ type setnsProcess struct {
fds []string
process *Process
bootstrapData io.Reader
rootDir *os.File
}
func (p *setnsProcess) startTime() (string, error) {
@ -69,39 +70,49 @@ func (p *setnsProcess) start() (err error) {
defer p.parentPipe.Close()
err = p.cmd.Start()
p.childPipe.Close()
p.rootDir.Close()
if err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "starting setns process")
}
if p.bootstrapData != nil {
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
}
if err = p.execSetns(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "executing setns process")
}
if len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
}
}
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting oom score")
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for process")
}
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "writing config to pipe")
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "calling shutdown on init pipe")
}
// wait for the child process to fully complete and receive an error message
// if one was encoutered
var ierr *genericError
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
return newSystemErrorWithCause(err, "decoding init error from pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
return ierr
}
return nil
}
@ -114,7 +125,7 @@ func (p *setnsProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
return newSystemError(err)
return newSystemErrorWithCause(err, "waiting on setns process to finish")
}
if !status.Success() {
p.cmd.Wait()
@ -123,7 +134,7 @@ func (p *setnsProcess) execSetns() error {
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return newSystemError(err)
return newSystemErrorWithCause(err, "reading pid from init pipe")
}
process, err := os.FindProcess(pid.Pid)
if err != nil {
@ -167,14 +178,17 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) {
}
type initProcess struct {
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
config *initConfig
manager cgroups.Manager
container *linuxContainer
fds []string
process *Process
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
config *initConfig
manager cgroups.Manager
container *linuxContainer
fds []string
process *Process
bootstrapData io.Reader
sharePidns bool
rootDir *os.File
}
func (p *initProcess) pid() int {
@ -185,27 +199,63 @@ func (p *initProcess) externalDescriptors() []string {
return p.fds
}
func (p *initProcess) start() (err error) {
// execSetns runs the process that executes C code to perform the setns calls
// because setns support requires the C process to fork off a child and perform the setns
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
// This is called by initProcess.start function
func (p *initProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
return err
}
if !status.Success() {
p.cmd.Wait()
return &exec.ExitError{ProcessState: status}
}
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return err
}
process, err := os.FindProcess(pid.Pid)
if err != nil {
return err
}
p.cmd.Process = process
p.process.ops = p
return nil
}
func (p *initProcess) start() error {
defer p.parentPipe.Close()
err = p.cmd.Start()
err := p.cmd.Start()
p.process.ops = p
p.childPipe.Close()
p.rootDir.Close()
if err != nil {
p.process.ops = nil
return newSystemError(err)
return newSystemErrorWithCause(err, "starting init process command")
}
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return err
}
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "running exec setns process for init")
}
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(p.pid())
if err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
}
p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children
// can escape the cgroup
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
}
defer func() {
if err != nil {
@ -213,56 +263,88 @@ func (p *initProcess) start() (err error) {
p.manager.Destroy()
}
}()
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Root: p.config.Config.Rootfs,
}
for _, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemError(err)
}
}
}
if err := p.createNetworkInterfaces(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "creating nework interfaces")
}
if err := p.sendConfig(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "sending config to init process")
}
var (
procSync syncT
sentRun bool
ierr *genericError
procSync syncT
sentRun bool
sentResume bool
ierr *genericError
)
dec := json.NewDecoder(p.parentPipe)
loop:
for {
if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
if err := dec.Decode(&procSync); err != nil {
if err == io.EOF {
break loop
}
return newSystemError(err)
return newSystemErrorWithCause(err, "decoding sync type from init pipe")
}
switch procSync.Type {
case procStart:
break loop
case procReady:
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
}
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting oom score for ready process")
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for ready process")
}
// call prestart hooks
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Root: p.config.Config.Rootfs,
}
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "reading syncT run type")
}
sentRun = true
case procHooks:
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Root: p.config.Config.Rootfs,
BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
}
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil {
return newSystemErrorWithCause(err, "reading syncT resume type")
}
sentResume = true
case procError:
// wait for the child process to fully complete and receive an error message
// if one was encoutered
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
if err := dec.Decode(&ierr); err != nil && err != io.EOF {
return newSystemErrorWithCause(err, "decoding proc error from init")
}
if ierr != nil {
break loop
@ -270,19 +352,22 @@ loop:
// Programmer error.
panic("No error following JSON procError payload.")
default:
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
return newSystemError(fmt.Errorf("invalid JSON payload from child"))
}
}
if !sentRun {
return newSystemError(fmt.Errorf("could not synchronise with container process"))
return newSystemErrorWithCause(ierr, "container init failed")
}
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "shutting down init pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
return ierr
}
return nil
}
@ -293,7 +378,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) {
return p.cmd.ProcessState, err
}
// we should kill all processes in cgroup when init is died if we use host PID namespace
if p.cmd.SysProcAttr.Cloneflags&syscall.CLONE_NEWPID == 0 {
if p.sharePidns {
killCgroupProcesses(p.manager)
}
return p.cmd.ProcessState, nil
@ -315,7 +400,9 @@ func (p *initProcess) startTime() (string, error) {
}
func (p *initProcess) sendConfig() error {
// send the state to the container's init process then shutdown writes for the parent
// send the config to the container's init process, we don't use JSON Encode
// here because there might be a problem in JSON decoder in some cases, see:
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
return utils.WriteJSON(p.parentPipe, p.config)
}
@ -365,7 +452,7 @@ func getPipeFds(pid int) ([]string, error) {
// InitializeIO creates pipes for use with the process's STDIO
// and returns the opposite side for each
func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
var fds []uintptr
i = &IO{}
// cleanup in case of an error
@ -397,7 +484,7 @@ func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
p.Stderr, i.Stderr = w, r
// change ownership of the pipes incase we are in a user namespace
for _, fd := range fds {
if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil {
if err := syscall.Fchown(int(fd), rootuid, rootgid); err != nil {
return nil, err
}
}

View File

@ -4,6 +4,7 @@ package libcontainer
import (
"fmt"
"io"
"io/ioutil"
"os"
"os/exec"
@ -19,47 +20,65 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/system"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
)
const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
// needsSetupDev returns true if /dev needs to be set up.
func needsSetupDev(config *configs.Config) bool {
for _, m := range config.Mounts {
if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" {
return false
}
}
return true
}
// setupRootfs sets up the devices, mount points, and filesystems for use inside a
// new mount namespace.
func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) {
if err := prepareRoot(config); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "preparing rootfs")
}
setupDev := len(config.Devices) != 0
setupDev := needsSetupDev(config)
for _, m := range config.Mounts {
for _, precmd := range m.PremountCmds {
if err := mountCmd(precmd); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "running premount command")
}
}
if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "mounting %q to rootfs %q", m.Destination, config.Rootfs)
}
for _, postcmd := range m.PostmountCmds {
if err := mountCmd(postcmd); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "running postmount command")
}
}
}
if setupDev {
if err := createDevices(config); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "creating device nodes")
}
if err := setupPtmx(config, console); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting up ptmx")
}
if err := setupDevSymlinks(config.Rootfs); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting up /dev symlinks")
}
}
// Signal the parent to run the pre-start hooks.
// The hooks are run after the mounts are setup, but before we switch to the new
// root, so that the old root is still available in the hooks for any mount
// manipulations.
if err := syncParentHooks(pipe); err != nil {
return err
}
if err := syscall.Chdir(config.Rootfs); err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs)
}
if config.NoPivotRoot {
err = msMoveRoot(config.Rootfs)
@ -67,16 +86,28 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
err = pivotRoot(config.Rootfs, config.PivotDir)
}
if err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "jailing process inside rootfs")
}
if setupDev {
if err := reOpenDevNull(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "reopening /dev/null inside container")
}
}
// remount dev as ro if specifed
for _, m := range config.Mounts {
if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
if m.Flags&syscall.MS_RDONLY != 0 {
if err := remountReadonly(m.Destination); err != nil {
return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
}
}
break
}
}
// set rootfs ( / ) as readonly
if config.Readonlyfs {
if err := setReadonly(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting rootfs as readonly")
}
}
syscall.Umask(0022)
@ -84,14 +115,12 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
}
func mountCmd(cmd configs.Command) error {
command := exec.Command(cmd.Path, cmd.Args[:]...)
command.Env = cmd.Env
command.Dir = cmd.Dir
if out, err := command.CombinedOutput(); err != nil {
return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err)
}
return nil
}
@ -119,8 +148,9 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
if err := mountPropagate(m, rootfs, ""); err != nil {
return err
}
return label.SetFileLabel(dest, mountLabel)
}
return label.SetFileLabel(dest, mountLabel)
return nil
case "tmpfs":
stat, err := os.Stat(dest)
if err != nil {
@ -137,16 +167,6 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
}
}
return nil
case "devpts":
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
case "securityfs":
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
case "bind":
stat, err := os.Stat(m.Source)
if err != nil {
@ -218,41 +238,33 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
return err
}
}
// create symlinks for merged cgroups
cwd, err := os.Getwd()
if err != nil {
return err
}
if err := os.Chdir(filepath.Join(rootfs, m.Destination)); err != nil {
return err
}
for _, mc := range merged {
for _, ss := range strings.Split(mc, ",") {
if err := os.Symlink(mc, ss); err != nil {
// if cgroup already exists, then okay(it could have been created before)
if os.IsExist(err) {
continue
}
os.Chdir(cwd)
// symlink(2) is very dumb, it will just shove the path into
// the link and doesn't do any checks or relative path
// conversion. Also, don't error out if the cgroup already exists.
if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) {
return err
}
}
}
if err := os.Chdir(cwd); err != nil {
return err
}
if m.Flags&syscall.MS_RDONLY != 0 {
// remount cgroup root as readonly
mcgrouproot := &configs.Mount{
Source: m.Destination,
Device: "bind",
Destination: m.Destination,
Flags: defaultMountFlags | syscall.MS_RDONLY,
Flags: defaultMountFlags | syscall.MS_RDONLY | syscall.MS_BIND,
}
if err := remount(mcgrouproot, rootfs); err != nil {
return err
}
}
default:
return fmt.Errorf("unknown mount device %q to %q", m.Device, m.Destination)
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
}
return nil
}
@ -294,7 +306,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
// checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
// dest is required to be an abs path and have any symlinks resolved before calling this function.
func checkMountDestination(rootfs, dest string) error {
if filepath.Clean(rootfs) == filepath.Clean(dest) {
if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) {
return fmt.Errorf("mounting into / is prohibited")
}
invalidDestinations := []string{
@ -307,7 +319,8 @@ func checkMountDestination(rootfs, dest string) error {
"/proc/cpuinfo",
"/proc/diskstats",
"/proc/meminfo",
"/proc/stats",
"/proc/stat",
"/proc/net/dev",
}
for _, valid := range validDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
@ -340,7 +353,7 @@ func setupDevSymlinks(rootfs string) error {
// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
// in /dev if it exists in /proc.
if _, err := os.Stat("/proc/kcore"); err == nil {
links = append(links, [2]string{"/proc/kcore", "/dev/kcore"})
links = append(links, [2]string{"/proc/kcore", "/dev/core"})
}
for _, link := range links {
var (
@ -489,10 +502,10 @@ func getParentMount(rootfs string) (string, string, error) {
}
// Make parent mount private if it was shared
func rootfsParentMountPrivate(config *configs.Config) error {
func rootfsParentMountPrivate(rootfs string) error {
sharedMount := false
parentMount, optionalOpts, err := getParentMount(config.Rootfs)
parentMount, optionalOpts, err := getParentMount(rootfs)
if err != nil {
return err
}
@ -524,9 +537,10 @@ func prepareRoot(config *configs.Config) error {
if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil {
return err
}
if err := rootfsParentMountPrivate(config); err != nil {
return err
if config.NoPivotRoot {
if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
return err
}
}
return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "")
@ -550,7 +564,7 @@ func setupPtmx(config *configs.Config, console *linuxConsole) error {
return nil
}
func pivotRoot(rootfs, pivotBaseDir string) error {
func pivotRoot(rootfs, pivotBaseDir string) (err error) {
if pivotBaseDir == "" {
pivotBaseDir = "/"
}
@ -562,8 +576,21 @@ func pivotRoot(rootfs, pivotBaseDir string) error {
if err != nil {
return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err)
}
defer func() {
errVal := os.Remove(pivotDir)
if err == nil {
err = errVal
}
}()
if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
return fmt.Errorf("pivot_root %s", err)
// Make the parent mount private
if err := rootfsParentMountPrivate(rootfs); err != nil {
return err
}
// Try again
if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
return fmt.Errorf("pivot_root %s", err)
}
}
if err := syscall.Chdir("/"); err != nil {
return fmt.Errorf("chdir / %s", err)
@ -580,7 +607,7 @@ func pivotRoot(rootfs, pivotBaseDir string) error {
if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
return fmt.Errorf("unmount pivot_root dir %s", err)
}
return os.Remove(pivotDir)
return nil
}
func msMoveRoot(rootfs string) error {
@ -669,14 +696,18 @@ func remount(m *configs.Mount, rootfs string) error {
// of propagation flags.
func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
var (
dest = m.Destination
data = label.FormatMountLabel(m.Data, mountLabel)
dest = m.Destination
data = label.FormatMountLabel(m.Data, mountLabel)
flags = m.Flags
)
if libcontainerUtils.CleanPath(dest) == "/dev" {
flags &= ^syscall.MS_RDONLY
}
if !strings.HasPrefix(dest, rootfs) {
dest = filepath.Join(rootfs, dest)
}
if err := syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data); err != nil {
if err := syscall.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
return err
}

View File

@ -36,6 +36,11 @@ var archs = map[string]string{
"SCMP_ARCH_MIPSEL": "mipsel",
"SCMP_ARCH_MIPSEL64": "mipsel64",
"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
"SCMP_ARCH_PPC": "ppc",
"SCMP_ARCH_PPC64": "ppc64",
"SCMP_ARCH_PPC64LE": "ppc64le",
"SCMP_ARCH_S390": "s390",
"SCMP_ARCH_S390X": "s390x",
}
// ConvertStringToOperator converts a string into a Seccomp comparison operator.

View File

@ -5,7 +5,6 @@ package seccomp
import (
"bufio"
"fmt"
"log"
"os"
"strings"
"syscall"
@ -167,7 +166,6 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
// Ignore it, don't error out
callNum, err := libseccomp.GetSyscallFromName(call.Name)
if err != nil {
log.Printf("Error resolving syscall name %s: %s - ignoring syscall.", call.Name, err)
return nil
}

View File

@ -10,7 +10,7 @@ import (
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
// Seccomp not supported, do nothing
// InitSeccomp does nothing because seccomp is not supported.
func InitSeccomp(config *configs.Seccomp) error {
if config != nil {
return ErrSeccompNotEnabled

View File

@ -13,9 +13,9 @@ import (
"regexp"
"strconv"
"strings"
"sync"
"syscall"
"github.com/docker/docker/pkg/mount"
"github.com/opencontainers/runc/libcontainer/system"
)
@ -35,6 +35,7 @@ const (
var (
assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`)
mcsList = make(map[string]bool)
mcsLock sync.Mutex
selinuxfs = "unknown"
selinuxEnabled = false // Stores whether selinux is currently enabled
selinuxEnabledChecked = false // Stores whether selinux enablement has been checked or established yet
@ -58,16 +59,31 @@ func getSelinuxMountPoint() string {
}
selinuxfs = ""
mounts, err := mount.GetMounts()
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return selinuxfs
}
for _, mount := range mounts {
if mount.Fstype == "selinuxfs" {
selinuxfs = mount.Mountpoint
break
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
txt := scanner.Text()
// Safe as mountinfo encodes mountpoints with spaces as \040.
sepIdx := strings.Index(txt, " - ")
if sepIdx == -1 {
continue
}
if !strings.Contains(txt[sepIdx:], "selinuxfs") {
continue
}
fields := strings.Split(txt, " ")
if len(fields) < 5 {
continue
}
selinuxfs = fields[4]
break
}
if selinuxfs != "" {
var buf syscall.Statfs_t
syscall.Statfs(selinuxfs, &buf)
@ -158,12 +174,14 @@ func Setfilecon(path string, scon string) error {
// Getfilecon returns the SELinux label for this path or returns an error.
func Getfilecon(path string) (string, error) {
con, err := system.Lgetxattr(path, xattrNameSelinux)
if err != nil {
return "", err
}
// Trim the NUL byte at the end of the byte buffer, if present.
if con[len(con)-1] == '\x00' {
if len(con) > 0 && con[len(con)-1] == '\x00' {
con = con[:len(con)-1]
}
return string(con), err
return string(con), nil
}
func Setfscreatecon(scon string) error {
@ -265,6 +283,8 @@ func SelinuxGetEnforceMode() int {
}
func mcsAdd(mcs string) error {
mcsLock.Lock()
defer mcsLock.Unlock()
if mcsList[mcs] {
return fmt.Errorf("MCS Label already exists")
}
@ -273,7 +293,9 @@ func mcsAdd(mcs string) error {
}
func mcsDelete(mcs string) {
mcsLock.Lock()
mcsList[mcs] = false
mcsLock.Unlock()
}
func IntToMcs(id int, catRange uint32) string {
@ -289,7 +311,7 @@ func IntToMcs(id int, catRange uint32) string {
for ORD > TIER {
ORD = ORD - TIER
TIER -= 1
TIER--
}
TIER = SETSIZE - TIER
ORD = ORD + TIER
@ -430,7 +452,7 @@ func badPrefix(fpath string) error {
return nil
}
// Change the fpath file object to the SELinux label scon.
// Chcon changes the fpath file object to the SELinux label scon.
// If the fpath is a directory and recurse is true Chcon will walk the
// directory tree setting the label
func Chcon(fpath string, scon string, recurse bool) error {
@ -464,14 +486,14 @@ func DupSecOpt(src string) []string {
con["level"] == "" {
return nil
}
return []string{"label:user:" + con["user"],
"label:role:" + con["role"],
"label:type:" + con["type"],
"label:level:" + con["level"]}
return []string{"label=user:" + con["user"],
"label=role:" + con["role"],
"label=type:" + con["type"],
"label=level:" + con["level"]}
}
// DisableSecOpt returns a security opt that can be used to disabling SELinux
// labeling support for future container processes
func DisableSecOpt() []string {
return []string{"label:disable"}
return []string{"label=disable"}
}

View File

@ -3,9 +3,11 @@
package libcontainer
import (
"fmt"
"os"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
@ -17,12 +19,21 @@ type linuxSetnsInit struct {
config *initConfig
}
func (l *linuxSetnsInit) getSessionRingName() string {
return fmt.Sprintf("_ses.%s", l.config.ContainerId)
}
func (l *linuxSetnsInit) Init() error {
if err := setupRlimits(l.config.Config); err != nil {
return err
if !l.config.Config.NoNewKeyring {
// do not inherit the parent's session keyring
if _, err := keyctl.JoinSessionKeyring(l.getSessionRingName()); err != nil {
return err
}
}
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
return err
if l.config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
if l.config.Config.Seccomp != nil {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
@ -32,13 +43,11 @@ func (l *linuxSetnsInit) Init() error {
if err := finalizeNamespace(l.config); err != nil {
return err
}
if err := apparmor.ApplyProfile(l.config.Config.AppArmorProfile); err != nil {
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
if l.config.Config.ProcessLabel != "" {
if err := label.SetProcessLabel(l.config.Config.ProcessLabel); err != nil {
return err
}
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}

View File

@ -2,14 +2,14 @@ package stacktrace
import "runtime"
// Caputure captures a stacktrace for the current calling go program
// Capture captures a stacktrace for the current calling go program
//
// skip is the number of frames to skip
func Capture(userSkip int) Stacktrace {
var (
skip = userSkip + 1 // add one for our own function
frames []Frame
prevPc uintptr = 0
prevPc uintptr
)
for i := skip; ; i++ {
pc, file, line, ok := runtime.Caller(i)

View File

@ -3,28 +3,62 @@
package libcontainer
import (
"fmt"
"io"
"os"
"os/exec"
"syscall"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
)
type linuxStandardInit struct {
pipe io.ReadWriter
parentPid int
config *initConfig
pipe io.ReadWriteCloser
parentPid int
stateDirFD int
config *initConfig
}
func (l *linuxStandardInit) Init() error {
// join any namespaces via a path to the namespace fd if provided
if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil {
return err
func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
var newperms uint32
if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
// with user ns we need 'other' search permissions
newperms = 0x8
} else {
// without user ns we need 'UID' search permissions
newperms = 0x80000
}
// create a unique per session container name that we can
// join in setns; however, other containers can also join it
return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
}
// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value
// the kernel
const PR_SET_NO_NEW_PRIVS = 0x26
func (l *linuxStandardInit) Init() error {
if !l.config.Config.NoNewKeyring {
ringname, keepperms, newperms := l.getSessionRingParams()
// do not inherit the parent's session keyring
sessKeyId, err := keyctl.JoinSessionKeyring(ringname)
if err != nil {
return err
}
// make session keyring searcheable
if err := keyctl.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err
}
}
var console *linuxConsole
if l.config.Console != "" {
console = newConsoleFromPath(l.config.Console)
@ -32,9 +66,6 @@ func (l *linuxStandardInit) Init() error {
return err
}
}
if _, err := syscall.Setsid(); err != nil {
return err
}
if console != nil {
if err := system.Setctty(); err != nil {
return err
@ -46,16 +77,11 @@ func (l *linuxStandardInit) Init() error {
if err := setupRoute(l.config.Config); err != nil {
return err
}
if err := setupRlimits(l.config.Config); err != nil {
return err
}
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
return err
}
label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := setupRootfs(l.config.Config, console); err != nil {
if err := setupRootfs(l.config.Config, console, l.pipe); err != nil {
return err
}
}
@ -64,10 +90,10 @@ func (l *linuxStandardInit) Init() error {
return err
}
}
if err := apparmor.ApplyProfile(l.config.Config.AppArmorProfile); err != nil {
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
if err := label.SetProcessLabel(l.config.Config.ProcessLabel); err != nil {
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
@ -90,13 +116,21 @@ func (l *linuxStandardInit) Init() error {
if err != nil {
return err
}
if l.config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return err
}
if l.config.Config.Seccomp != nil {
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err
}
@ -115,5 +149,30 @@ func (l *linuxStandardInit) Init() error {
if syscall.Getppid() != l.parentPid {
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
// check for the arg before waiting to make sure it exists and it is returned
// as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// close the pipe to signal that we have completed our init.
l.pipe.Close()
// wait for the fifo to be opened on the other side before
// exec'ing the users process.
fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
if err != nil {
return newSystemErrorWithCause(err, "openat exec fifo")
}
if _, err := syscall.Write(fd, []byte("0")); err != nil {
return newSystemErrorWithCause(err, "write 0 exec fifo")
}
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
}
}
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}
return nil
}

View File

@ -6,9 +6,11 @@ import (
"fmt"
"os"
"path/filepath"
"syscall"
"github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
)
func newStateTransitionError(from, to containerState) error {
@ -56,9 +58,10 @@ func destroy(c *linuxContainer) error {
func runPoststopHooks(c *linuxContainer) error {
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Root: c.config.Rootfs,
Version: c.config.Version,
ID: c.id,
Root: c.config.Rootfs,
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
}
for _, hook := range c.config.Hooks.Poststop {
if err := hook.Run(s); err != nil {
@ -75,7 +78,7 @@ type stoppedState struct {
}
func (b *stoppedState) status() Status {
return Destroyed
return Stopped
}
func (b *stoppedState) transition(s containerState) error {
@ -108,11 +111,11 @@ func (r *runningState) status() Status {
func (r *runningState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
running, err := r.c.isRunning()
t, err := r.c.runType()
if err != nil {
return err
}
if running {
if t == Running {
return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
}
r.c.state = s
@ -127,16 +130,40 @@ func (r *runningState) transition(s containerState) error {
}
func (r *runningState) destroy() error {
running, err := r.c.isRunning()
t, err := r.c.runType()
if err != nil {
return err
}
if running {
if t == Running {
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
}
return destroy(r.c)
}
type createdState struct {
c *linuxContainer
}
func (i *createdState) status() Status {
return Created
}
func (i *createdState) transition(s containerState) error {
switch s.(type) {
case *runningState, *pausedState, *stoppedState:
i.c.state = s
return nil
case *createdState:
return nil
}
return newStateTransitionError(i, s)
}
func (i *createdState) destroy() error {
i.c.initProcess.signal(syscall.SIGKILL)
return destroy(i.c)
}
// pausedState represents a container that is currently pause. It cannot be destroyed in a
// paused state and must transition back to running first.
type pausedState struct {
@ -159,11 +186,11 @@ func (p *pausedState) transition(s containerState) error {
}
func (p *pausedState) destroy() error {
isRunning, err := p.c.isRunning()
t, err := p.c.runType()
if err != nil {
return err
}
if !isRunning {
if t != Running && t != Created {
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
@ -173,7 +200,7 @@ func (p *pausedState) destroy() error {
}
// restoredState is the same as the running state but also has accociated checkpoint
// information that maybe need destroyed when the container is stopped and destory is called.
// information that maybe need destroyed when the container is stopped and destroy is called.
type restoredState struct {
imageDir string
c *linuxContainer
@ -202,22 +229,25 @@ func (r *restoredState) destroy() error {
return destroy(r.c)
}
// createdState is used whenever a container is restored, loaded, or setting additional
// loadedState is used whenever a container is restored, loaded, or setting additional
// processes inside and it should not be destroyed when it is exiting.
type createdState struct {
type loadedState struct {
c *linuxContainer
s Status
}
func (n *createdState) status() Status {
func (n *loadedState) status() Status {
return n.s
}
func (n *createdState) transition(s containerState) error {
func (n *loadedState) transition(s containerState) error {
n.c.state = s
return nil
}
func (n *createdState) destroy() error {
return nil
func (n *loadedState) destroy() error {
if err := n.c.refreshState(); err != nil {
return err
}
return n.c.state.destroy()
}

View File

@ -0,0 +1,7 @@
package libcontainer
// Solaris - TODO
type Stats struct {
Interfaces []*NetworkInterface
}

View File

@ -11,6 +11,19 @@ import (
"unsafe"
)
// If arg2 is nonzero, set the "child subreaper" attribute of the
// calling process; if arg2 is zero, unset the attribute. When a
// process is marked as a child subreaper, all of the children
// that it creates, and their descendants, will be marked as
// having a subreaper. In effect, a subreaper fulfills the role
// of init(1) for its descendant processes. Upon termination of
// a process that is orphaned (i.e., its immediate parent has
// already terminated) and marked as having a subreaper, the
// nearest still living ancestor subreaper will receive a SIGCHLD
// signal and be able to wait(2) on the process to discover its
// termination status.
const PR_SET_CHILD_SUBREAPER = 36
type ParentDeathSignal int
func (p ParentDeathSignal) Restore() error {
@ -40,6 +53,14 @@ func Execv(cmd string, args []string, env []string) error {
return syscall.Exec(name, args, env)
}
func Prlimit(pid, resource int, limit syscall.Rlimit) error {
_, _, err := syscall.RawSyscall6(syscall.SYS_PRLIMIT64, uintptr(pid), uintptr(resource), uintptr(unsafe.Pointer(&limit)), uintptr(unsafe.Pointer(&limit)), 0, 0)
if err != 0 {
return err
}
return nil
}
func SetParentDeathSignal(sig uintptr) error {
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, sig, 0); err != 0 {
return err
@ -79,17 +100,12 @@ func Setctty() error {
return nil
}
/*
* Detect whether we are currently running in a user namespace.
* Copied from github.com/lxc/lxd/shared/util.go
*/
// RunningInUserNS detects whether we are currently running in a user namespace.
// Copied from github.com/lxc/lxd/shared/util.go
func RunningInUserNS() bool {
file, err := os.Open("/proc/self/uid_map")
if err != nil {
/*
* This kernel-provided file only exists if user namespaces are
* supported
*/
// This kernel-provided file only exists if user namespaces are supported
return false
}
defer file.Close()
@ -112,3 +128,16 @@ func RunningInUserNS() bool {
}
return true
}
// SetSubreaper sets the value i as the subreaper setting for the calling process
func SetSubreaper(i int) error {
return Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
}
func Prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) {
_, _, e1 := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0)
if e1 != 0 {
err = e1
}
return
}

View File

@ -0,0 +1,9 @@
// +build !linux
package system
// RunningInUserNS is a stub for non-Linux systems
// Always returns false
func RunningInUserNS() bool {
return false
}

View File

@ -2,13 +2,15 @@ package user
import (
"errors"
"fmt"
"syscall"
)
var (
// The current operating system does not provide the required data for user lookups.
ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
// No matching entries found in file.
ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
ErrNoGroupEntries = errors.New("no matching entries in group file")
)
func lookupUser(filter func(u User) bool) (User, error) {
@ -27,7 +29,7 @@ func lookupUser(filter func(u User) bool) (User, error) {
// No user entries found.
if len(users) == 0 {
return User{}, fmt.Errorf("no matching entries in passwd file")
return User{}, ErrNoPasswdEntries
}
// Assume the first entry is the "correct" one.
@ -75,7 +77,7 @@ func lookupGroup(filter func(g Group) bool) (Group, error) {
// No user entries found.
if len(groups) == 0 {
return Group{}, fmt.Errorf("no matching entries in group file")
return Group{}, ErrNoGroupEntries
}
// Assume the first entry is the "correct" one.

View File

@ -15,7 +15,7 @@ const (
)
var (
ErrRange = fmt.Errorf("Uids and gids must be in range %d-%d", minId, maxId)
ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
)
type User struct {
@ -42,29 +42,30 @@ func parseLine(line string, v ...interface{}) {
parts := strings.Split(line, ":")
for i, p := range parts {
// Ignore cases where we don't have enough fields to populate the arguments.
// Some configuration files like to misbehave.
if len(v) <= i {
// if we have more "parts" than we have places to put them, bail for great "tolerance" of naughty configuration files
break
}
// Use the type of the argument to figure out how to parse it, scanf() style.
// This is legit.
switch e := v[i].(type) {
case *string:
// "root", "adm", "/bin/bash"
*e = p
case *int:
// "0", "4", "1000"
// ignore string to int conversion errors, for great "tolerance" of naughty configuration files
// "numbers", with conversion errors ignored because of some misbehaving configuration files.
*e, _ = strconv.Atoi(p)
case *[]string:
// "", "root", "root,adm,daemon"
// Comma-separated lists.
if p != "" {
*e = strings.Split(p, ",")
} else {
*e = []string{}
}
default:
// panic, because this is a programming/logic error, not a runtime one
panic("parseLine expects only pointers! argument " + strconv.Itoa(i) + " is not a pointer!")
// Someone goof'd when writing code using this function. Scream so they can hear us.
panic(fmt.Sprintf("parseLine only accepts {*string, *int, *[]string} as arguments! %#v is not a pointer!", e))
}
}
}
@ -106,8 +107,8 @@ func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
return nil, err
}
text := strings.TrimSpace(s.Text())
if text == "" {
line := strings.TrimSpace(s.Text())
if line == "" {
continue
}
@ -117,10 +118,7 @@ func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
// root:x:0:0:root:/root:/bin/bash
// adm:x:3:4:adm:/var/adm:/bin/false
p := User{}
parseLine(
text,
&p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell,
)
parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell)
if filter == nil || filter(p) {
out = append(out, p)
@ -135,6 +133,7 @@ func ParseGroupFile(path string) ([]Group, error) {
if err != nil {
return nil, err
}
defer group.Close()
return ParseGroup(group)
}
@ -178,10 +177,7 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
// root:x:0:root
// adm:x:4:root,adm,daemon
p := Group{}
parseLine(
text,
&p.Name, &p.Pass, &p.Gid, &p.List,
)
parseLine(text, &p.Name, &p.Pass, &p.Gid, &p.List)
if filter == nil || filter(p) {
out = append(out, p)
@ -192,9 +188,10 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
}
type ExecUser struct {
Uid, Gid int
Sgids []int
Home string
Uid int
Gid int
Sgids []int
Home string
}
// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the
@ -235,12 +232,12 @@ func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath
// * "uid:gid
// * "user:gid"
// * "uid:group"
//
// It should be noted that if you specify a numeric user or group id, they will
// not be evaluated as usernames (only the metadata will be filled). So attempting
// to parse a user with user.Name = "1337" will produce the user with a UID of
// 1337.
func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) {
var (
userArg, groupArg string
name string
)
if defaults == nil {
defaults = new(ExecUser)
}
@ -258,87 +255,113 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (
user.Sgids = []int{}
}
// allow for userArg to have either "user" syntax, or optionally "user:group" syntax
// Allow for userArg to have either "user" syntax, or optionally "user:group" syntax
var userArg, groupArg string
parseLine(userSpec, &userArg, &groupArg)
// Convert userArg and groupArg to be numeric, so we don't have to execute
// Atoi *twice* for each iteration over lines.
uidArg, uidErr := strconv.Atoi(userArg)
gidArg, gidErr := strconv.Atoi(groupArg)
// Find the matching user.
users, err := ParsePasswdFilter(passwd, func(u User) bool {
if userArg == "" {
// Default to current state of the user.
return u.Uid == user.Uid
}
return u.Name == userArg || strconv.Itoa(u.Uid) == userArg
if uidErr == nil {
// If the userArg is numeric, always treat it as a UID.
return uidArg == u.Uid
}
return u.Name == userArg
})
// If we can't find the user, we have to bail.
if err != nil && passwd != nil {
if userArg == "" {
userArg = strconv.Itoa(user.Uid)
}
return nil, fmt.Errorf("Unable to find user %v: %v", userArg, err)
return nil, fmt.Errorf("unable to find user %s: %v", userArg, err)
}
haveUser := users != nil && len(users) > 0
if haveUser {
// if we found any user entries that matched our filter, let's take the first one as "correct"
name = users[0].Name
var matchedUserName string
if len(users) > 0 {
// First match wins, even if there's more than one matching entry.
matchedUserName = users[0].Name
user.Uid = users[0].Uid
user.Gid = users[0].Gid
user.Home = users[0].Home
} else if userArg != "" {
// we asked for a user but didn't find them... let's check to see if we wanted a numeric user
user.Uid, err = strconv.Atoi(userArg)
if err != nil {
// not numeric - we have to bail
return nil, fmt.Errorf("Unable to find user %v", userArg)
// If we can't find a user with the given username, the only other valid
// option is if it's a numeric username with no associated entry in passwd.
if uidErr != nil {
// Not numeric.
return nil, fmt.Errorf("unable to find user %s: %v", userArg, ErrNoPasswdEntries)
}
user.Uid = uidArg
// Must be inside valid uid range.
if user.Uid < minId || user.Uid > maxId {
return nil, ErrRange
}
// if userArg couldn't be found in /etc/passwd but is numeric, just roll with it - this is legit
// Okay, so it's numeric. We can just roll with this.
}
if groupArg != "" || name != "" {
// On to the groups. If we matched a username, we need to do this because of
// the supplementary group IDs.
if groupArg != "" || matchedUserName != "" {
groups, err := ParseGroupFilter(group, func(g Group) bool {
// Explicit group format takes precedence.
if groupArg != "" {
return g.Name == groupArg || strconv.Itoa(g.Gid) == groupArg
}
// Check if user is a member.
for _, u := range g.List {
if u == name {
return true
// If the group argument isn't explicit, we'll just search for it.
if groupArg == "" {
// Check if user is a member of this group.
for _, u := range g.List {
if u == matchedUserName {
return true
}
}
return false
}
return false
if gidErr == nil {
// If the groupArg is numeric, always treat it as a GID.
return gidArg == g.Gid
}
return g.Name == groupArg
})
if err != nil && group != nil {
return nil, fmt.Errorf("Unable to find groups for user %v: %v", users[0].Name, err)
return nil, fmt.Errorf("unable to find groups for spec %v: %v", matchedUserName, err)
}
haveGroup := groups != nil && len(groups) > 0
// Only start modifying user.Gid if it is in explicit form.
if groupArg != "" {
if haveGroup {
// if we found any group entries that matched our filter, let's take the first one as "correct"
if len(groups) > 0 {
// First match wins, even if there's more than one matching entry.
user.Gid = groups[0].Gid
} else {
// we asked for a group but didn't find id... let's check to see if we wanted a numeric group
user.Gid, err = strconv.Atoi(groupArg)
if err != nil {
// not numeric - we have to bail
return nil, fmt.Errorf("Unable to find group %v", groupArg)
}
} else if groupArg != "" {
// If we can't find a group with the given name, the only other valid
// option is if it's a numeric group name with no associated entry in group.
// Ensure gid is inside gid range.
if gidErr != nil {
// Not numeric.
return nil, fmt.Errorf("unable to find group %s: %v", groupArg, ErrNoGroupEntries)
}
user.Gid = gidArg
// Must be inside valid gid range.
if user.Gid < minId || user.Gid > maxId {
return nil, ErrRange
}
// if groupArg couldn't be found in /etc/group but is numeric, just roll with it - this is legit
// Okay, so it's numeric. We can just roll with this.
}
} else if haveGroup {
// If implicit group format, fill supplementary gids.
} else if len(groups) > 0 {
// Supplementary group ids only make sense if in the implicit form.
user.Sgids = make([]int, len(groups))
for i, group := range groups {
user.Sgids[i] = group.Gid

View File

@ -5,7 +5,9 @@ import (
"encoding/hex"
"encoding/json"
"io"
"os"
"path/filepath"
"strings"
"syscall"
)
@ -54,3 +56,66 @@ func WriteJSON(w io.Writer, v interface{}) error {
_, err = w.Write(data)
return err
}
// CleanPath makes a path safe for use with filepath.Join. This is done by not
// only cleaning the path, but also (if the path is relative) adding a leading
// '/' and cleaning it (then removing the leading '/'). This ensures that a
// path resulting from prepending another path will always resolve to lexically
// be a subdirectory of the prefixed path. This is all done lexically, so paths
// that include symlinks won't be safe as a result of using CleanPath.
func CleanPath(path string) string {
// Deal with empty strings nicely.
if path == "" {
return ""
}
// Ensure that all paths are cleaned (especially problematic ones like
// "/../../../../../" which can cause lots of issues).
path = filepath.Clean(path)
// If the path isn't absolute, we need to do more processing to fix paths
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
// paths to relative ones.
if !filepath.IsAbs(path) {
path = filepath.Clean(string(os.PathSeparator) + path)
// This can't fail, as (by definition) all paths are relative to root.
path, _ = filepath.Rel(string(os.PathSeparator), path)
}
// Clean the path again for good measure.
return filepath.Clean(path)
}
// SearchLabels searches a list of key-value pairs for the provided key and
// returns the corresponding value. The pairs must be separated with '='.
func SearchLabels(labels []string, query string) string {
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == query {
return parts[1]
}
}
return ""
}
// Annotations returns the bundle path and user defined annotations from the
// libcontianer state. We need to remove the bundle because that is a label
// added by libcontainer.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
userAnnotations = make(map[string]string)
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == "bundle" {
bundle = parts[1]
} else {
userAnnotations[parts[0]] = parts[1]
}
}
return
}

View File

@ -295,7 +295,9 @@ func (c *Command) HelpTemplate() string {
if c.HasParent() {
return c.parent.HelpTemplate()
}
return `{{with or .Long .Short }}{{. | trim}}{{end}}{{if or .Runnable .HasSubCommands}}{{.UsageString}}{{end}}`
return `{{with or .Long .Short }}{{. | trim}}
{{end}}{{if or .Runnable .HasSubCommands}}{{.UsageString}}{{end}}`
}
// Really only used when casting a command to a commander