Update libcontainer dependency

2016-07-10 20:29:06 -07:00
parent 710374b65f
commit f6186afe99
60 changed files with 2415 additions and 946 deletions
--- a/vendor/github.com/opencontainers/runc/libcontainer/README.md
+++ b/vendor/github.com/opencontainers/runc/libcontainer/README.md
@@ -76,7 +76,7 @@ config := &configs.Config{
 		Name:   "test-container",
 		Parent: "system",
 		Resources: &configs.Resources{
-			MemorySwappiness: -1,
+			MemorySwappiness: nil,
 			AllowAllDevices:  false,
 			AllowedDevices:   configs.DefaultAllowedDevices,
 		},
@@ -133,15 +133,15 @@ config := &configs.Config{
 	UidMappings: []configs.IDMap{
 		{
 			ContainerID: 0,
-			Host: 1000,
-			size: 65536,
+			HostID: 1000,
+			Size: 65536,
 		},
 	},
 	GidMappings: []configs.IDMap{
 		{
 			ContainerID: 0,
-			Host: 1000,
-			size: 65536,
+			HostID: 1000,
+			Size: 65536,
 		},
 	},
 	Networks: []*configs.Network{
@@ -186,8 +186,8 @@ process := &libcontainer.Process{

 err := container.Start(process)
 if err != nil {
-	logrus.Fatal(err)
 	container.Destroy()
+	logrus.Fatal(err)
 	return
 }

@@ -216,6 +216,12 @@ container.Pause()

 // resume all paused processes.
 container.Resume()
+
+// send signal to container's init process.
+container.Signal(signal)
+
+// update container resource constraints.
+container.Set(config)
 ```


--- a/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md
+++ b/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md
@@ -90,7 +90,7 @@ in tmpfs.

 After `/dev/null` has been setup we check for any external links between
 the container's io, STDIN, STDOUT, STDERR.  If the container's io is pointing
-to `/dev/null` outside the container we close and `dup2` the the `/dev/null` 
+to `/dev/null` outside the container we close and `dup2` the `/dev/null` 
 that is local to the container's rootfs.


@@ -142,6 +142,7 @@ system resources like cpu, memory, and device access.
 | perf_event | 1       |
 | freezer    | 1       |
 | hugetlb    | 1       |
+| pids       | 1       |


 All cgroup subsystem are joined so that statistics can be collected from
@@ -199,7 +200,7 @@ provide a good default for security and flexibility for the applications.
 | CAP_SYS_BOOT         | 0       |
 | CAP_LEASE            | 0       |
 | CAP_WAKE_ALARM       | 0       |
-| CAP_BLOCK_SUSPE      | 0       |
+| CAP_BLOCK_SUSPEND    | 0       |


 Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)
@@ -296,7 +297,7 @@ a container.
 | -------------- | ------------------------------------------------------------------ |
 | Get processes  | Return all the pids for processes running inside a container       | 
 | Get Stats      | Return resource statistics for the container as a whole            |
-| Wait           | Wait waits on the container's init process ( pid 1 )               |
+| Wait           | Waits on the container's init process ( pid 1 )                    |
 | Wait Process   | Wait on any of the container's processes returning the exit status | 
 | Destroy        | Kill the container's init process and remove any filesystem state  |
 | Signal         | Send a signal to the container's init process                      |
--- a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go
@@ -7,6 +7,7 @@ package apparmor
 // #include <stdlib.h>
 import "C"
 import (
+	"fmt"
 	"io/ioutil"
 	"os"
 	"unsafe"
@@ -32,7 +33,7 @@ func ApplyProfile(name string) error {
 	cName := C.CString(name)
 	defer C.free(unsafe.Pointer(cName))
 	if _, err := C.aa_change_onexec(cName); err != nil {
-		return err
+		return fmt.Errorf("apparmor failed to apply profile: %s", err)
 	}
 	return nil
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go
@@ -9,7 +9,7 @@ import (
 )

 type Manager interface {
-	// Apply cgroup configuration to the process with the specified pid
+	// Applies cgroup configuration to the process with the specified pid
 	Apply(pid int) error

 	// Returns the PIDs inside the cgroup set
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go
@@ -14,6 +14,7 @@ import (

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
 )

 var (
@@ -30,6 +31,7 @@ var (
 		&NetPrioGroup{},
 		&PerfEventGroup{},
 		&FreezerGroup{},
+		&NameGroup{GroupName: "name=systemd", Join: true},
 	}
 	CgroupProcesses  = "cgroup.procs"
 	HugePageSizes, _ = cgroups.GetHugePageSize()
@@ -94,11 +96,10 @@ func getCgroupRoot() (string, error) {
 }

 type cgroupData struct {
-	root   string
-	parent string
-	name   string
-	config *configs.Cgroup
-	pid    int
+	root      string
+	innerPath string
+	config    *configs.Cgroup
+	pid       int
 }

 func (m *Manager) Apply(pid int) (err error) {
@@ -129,12 +130,9 @@ func (m *Manager) Apply(pid int) (err error) {
 		return cgroups.EnterPid(m.Paths, pid)
 	}

+	m.mu.Lock()
+	defer m.mu.Unlock()
 	paths := make(map[string]string)
-	defer func() {
-		if err != nil {
-			cgroups.RemovePaths(paths)
-		}
-	}()
 	for _, sys := range subsystems {
 		if err := sys.Apply(d); err != nil {
 			return err
@@ -144,7 +142,9 @@ func (m *Manager) Apply(pid int) (err error) {
 		// created then join consists of writing the process pids to cgroup.procs
 		p, err := d.path(sys.Name())
 		if err != nil {
-			if cgroups.IsNotFound(err) {
+			// The non-presence of the devices subsystem is
+			// considered fatal for security reasons.
+			if cgroups.IsNotFound(err) && sys.Name() != "devices" {
 				continue
 			}
 			return err
@@ -267,45 +267,31 @@ func getCgroupPath(c *configs.Cgroup) (string, error) {
 	return d.path("devices")
 }

-// pathClean makes a path safe for use with filepath.Join. This is done by not
-// only cleaning the path, but also (if the path is relative) adding a leading
-// '/' and cleaning it (then removing the leading '/'). This ensures that a
-// path resulting from prepending another path will always resolve to lexically
-// be a subdirectory of the prefixed path. This is all done lexically, so paths
-// that include symlinks won't be safe as a result of using pathClean.
-func pathClean(path string) string {
-	// Ensure that all paths are cleaned (especially problematic ones like
-	// "/../../../../../" which can cause lots of issues).
-	path = filepath.Clean(path)
-
-	// If the path isn't absolute, we need to do more processing to fix paths
-	// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
-	// paths to relative ones.
-	if !filepath.IsAbs(path) {
-		path = filepath.Clean(string(os.PathSeparator) + path)
-		// This can't fail, as (by definition) all paths are relative to root.
-		path, _ = filepath.Rel(string(os.PathSeparator), path)
-	}
-
-	// Clean the path again for good measure.
-	return filepath.Clean(path)
-}
-
 func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
 	root, err := getCgroupRoot()
 	if err != nil {
 		return nil, err
 	}

-	// Clean the parent slice path.
-	c.Parent = pathClean(c.Parent)
+	if (c.Name != "" || c.Parent != "") && c.Path != "" {
+		return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
+	}
+
+	// XXX: Do not remove this code. Path safety is important! -- cyphar
+	cgPath := libcontainerUtils.CleanPath(c.Path)
+	cgParent := libcontainerUtils.CleanPath(c.Parent)
+	cgName := libcontainerUtils.CleanPath(c.Name)
+
+	innerPath := cgPath
+	if innerPath == "" {
+		innerPath = filepath.Join(cgParent, cgName)
+	}

 	return &cgroupData{
-		root:   root,
-		parent: c.Parent,
-		name:   c.Name,
-		config: c,
-		pid:    pid,
+		root:      root,
+		innerPath: innerPath,
+		config:    c,
+		pid:       pid,
 	}, nil
 }

@@ -333,11 +319,10 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
 		return "", err
 	}

-	cgPath := filepath.Join(raw.parent, raw.name)
 	// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
-	if filepath.IsAbs(cgPath) {
+	if filepath.IsAbs(raw.innerPath) {
 		// Sometimes subsystems can be mounted togethger as 'cpu,cpuacct'.
-		return filepath.Join(raw.root, filepath.Base(mnt), cgPath), nil
+		return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
 	}

 	parentPath, err := raw.parentPath(subsystem, mnt, root)
@@ -345,7 +330,7 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
 		return "", err
 	}

-	return filepath.Join(parentPath, cgPath), nil
+	return filepath.Join(parentPath, raw.innerPath), nil
 }

 func (raw *cgroupData) join(subsystem string) (string, error) {
@@ -366,9 +351,12 @@ func writeFile(dir, file, data string) error {
 	// Normally dir should not be empty, one case is that cgroup subsystem
 	// is not mounted, we will get empty dir, and we want it fail here.
 	if dir == "" {
-		return fmt.Errorf("no such directory for %s.", file)
+		return fmt.Errorf("no such directory for %s", file)
 	}
-	return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
+	if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700); err != nil {
+		return fmt.Errorf("failed to write %v to %v: %v", data, file, err)
+	}
+	return nil
 }

 func readFile(dir, file string) (string, error) {
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go
@@ -12,6 +12,7 @@ import (

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
 )

 type CpusetGroup struct {
@@ -88,7 +89,7 @@ func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []b
 // it's parent.
 func (s *CpusetGroup) ensureParent(current, root string) error {
 	parent := filepath.Dir(current)
-	if filepath.Clean(parent) == root {
+	if libcontainerUtils.CleanPath(parent) == root {
 		return nil
 	}
 	// Avoid infinite recursion.
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go
@@ -5,6 +5,7 @@ package fs
 import (
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/system"
 )

 type DevicesGroup struct {
@@ -25,6 +26,23 @@ func (s *DevicesGroup) Apply(d *cgroupData) error {
 }

 func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
+	if system.RunningInUserNS() {
+		return nil
+	}
+
+	devices := cgroup.Resources.Devices
+	if len(devices) > 0 {
+		for _, dev := range devices {
+			file := "devices.deny"
+			if dev.Allow {
+				file = "devices.allow"
+			}
+			if err := writeFile(path, file, dev.CgroupString()); err != nil {
+				return err
+			}
+		}
+		return nil
+	}
 	if !cgroup.Resources.AllowAllDevices {
 		if err := writeFile(path, "devices.deny", "a"); err != nil {
 			return err
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go
@@ -9,6 +9,7 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
+	"time"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
@@ -26,38 +27,75 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-	if memoryAssigned(d.config) {
-		if path != "" {
-			if err := os.MkdirAll(path, 0755); err != nil {
-				return err
-			}
-		}
-		// We have to set kernel memory here, as we can't change it once
-		// processes have been attached.
-		if err := s.SetKernelMemory(path, d.config); err != nil {
-			return err
-		}
+	// reset error.
+	err = nil
+	if path == "" {
+		// Invalid input.
+		return fmt.Errorf("invalid path for memory cgroups: %+v", d)
 	}
-
 	defer func() {
 		if err != nil {
 			os.RemoveAll(path)
 		}
 	}()
-
+	if !cgroups.PathExists(path) {
+		if err = os.MkdirAll(path, 0755); err != nil {
+			return err
+		}
+	}
+	if memoryAssigned(d.config) {
+		// We have to set kernel memory here, as we can't change it once
+		// processes have been attached to the cgroup.
+		if err = s.SetKernelMemory(path, d.config); err != nil {
+			return err
+		}
+	}
 	// We need to join memory cgroup after set memory limits, because
 	// kmem.limit_in_bytes can only be set when the cgroup is empty.
-	_, err = d.join("memory")
-	if err != nil && !cgroups.IsNotFound(err) {
+	if _, jerr := d.join("memory"); jerr != nil && !cgroups.IsNotFound(jerr) {
+		err = jerr
 		return err
 	}
 	return nil
 }

+func getModifyTime(path string) (time.Time, error) {
+	stat, err := os.Stat(path)
+	if err != nil {
+		return time.Time{}, fmt.Errorf("failed to get memory cgroups creation time: %v", err)
+	}
+	return stat.ModTime(), nil
+}
+
 func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
-	// This has to be done separately because it has special constraints (it
-	// can't be done after there are processes attached to the cgroup).
-	if cgroup.Resources.KernelMemory > 0 {
+	// This has to be done separately because it has special
+	// constraints (it can only be initialized before setting up a
+	// hierarchy or adding a task to the cgroups. However, if
+	// successfully initialized, it can be updated anytime afterwards)
+	if cgroup.Resources.KernelMemory != 0 {
+		// Is kmem.limit_in_bytes already set?
+		// memory.kmem.max_usage_in_bytes is a read-only file. Use it to get cgroups creation time.
+		kmemCreationTime, err := getModifyTime(filepath.Join(path, "memory.kmem.max_usage_in_bytes"))
+		if err != nil {
+			return err
+		}
+		kmemLimitsUpdateTime, err := getModifyTime(filepath.Join(path, "memory.kmem.limit_in_bytes"))
+		if err != nil {
+			return err
+		}
+		// kmem.limit_in_bytes has already been set if its update time is after that of creation time.
+		// We use `!=` op instead of `>` because updates are losing precision compared to creation.
+		kmemInitialized := !kmemLimitsUpdateTime.Equal(kmemCreationTime)
+		if !kmemInitialized {
+			// If there are already tasks in the cgroup, we can't change the limit either
+			tasks, err := getCgroupParamString(path, "tasks")
+			if err != nil {
+				return err
+			}
+			if tasks != "" {
+				return fmt.Errorf("cannot set kmem.limit_in_bytes after task have joined this cgroup")
+			}
+		}
 		if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
 			return err
 		}
@@ -65,19 +103,65 @@ func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error
 	return nil
 }

-func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
-	if cgroup.Resources.Memory != 0 {
-		if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
+func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
+	// When memory and swap memory are both set, we need to handle the cases
+	// for updating container.
+	if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap > 0 {
+		memoryUsage, err := getMemoryData(path, "")
+		if err != nil {
 			return err
 		}
+
+		// When updating the memory limit, we should adapt the write sequence
+		// for memory and swap memory, so it won't fail because the new
+		// value and the old value don't fit kernel's validation.
+		if memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
+			if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+				return err
+			}
+			if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
+				return err
+			}
+		} else {
+			if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
+				return err
+			}
+			if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+				return err
+			}
+		}
+	} else {
+		if cgroup.Resources.Memory != 0 {
+			if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
+				return err
+			}
+		}
+		if cgroup.Resources.MemorySwap > 0 {
+			if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+				return err
+			}
+		}
 	}
+
+	return nil
+}
+
+func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
+	if err := setMemoryAndSwap(path, cgroup); err != nil {
+		return err
+	}
+
+	if err := s.SetKernelMemory(path, cgroup); err != nil {
+		return err
+	}
+
 	if cgroup.Resources.MemoryReservation != 0 {
 		if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
 			return err
 		}
 	}
-	if cgroup.Resources.MemorySwap > 0 {
-		if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+	if cgroup.Resources.KernelMemoryTCP != 0 {
+		if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
 			return err
 		}
 	}
@@ -86,14 +170,14 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
 			return err
 		}
 	}
-	if cgroup.Resources.MemorySwappiness >= 0 && cgroup.Resources.MemorySwappiness <= 100 {
-		if err := writeFile(path, "memory.swappiness", strconv.FormatInt(cgroup.Resources.MemorySwappiness, 10)); err != nil {
+	if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
+		return nil
+	} else if int64(*cgroup.Resources.MemorySwappiness) >= 0 && int64(*cgroup.Resources.MemorySwappiness) <= 100 {
+		if err := writeFile(path, "memory.swappiness", strconv.FormatInt(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
 			return err
 		}
-	} else if cgroup.Resources.MemorySwappiness == -1 {
-		return nil
 	} else {
-		return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", cgroup.Resources.MemorySwappiness)
+		return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", int64(*cgroup.Resources.MemorySwappiness))
 	}

 	return nil
@@ -139,6 +223,11 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
 		return err
 	}
 	stats.MemoryStats.KernelUsage = kernelUsage
+	kernelTCPUsage, err := getMemoryData(path, "kmem.tcp")
+	if err != nil {
+		return err
+	}
+	stats.MemoryStats.KernelTCPUsage = kernelTCPUsage

 	return nil
 }
@@ -148,8 +237,9 @@ func memoryAssigned(cgroup *configs.Cgroup) bool {
 		cgroup.Resources.MemoryReservation != 0 ||
 		cgroup.Resources.MemorySwap > 0 ||
 		cgroup.Resources.KernelMemory > 0 ||
+		cgroup.Resources.KernelMemoryTCP > 0 ||
 		cgroup.Resources.OomKillDisable ||
-		cgroup.Resources.MemorySwappiness != -1
+		(cgroup.Resources.MemorySwappiness != nil && *cgroup.Resources.MemorySwappiness != -1)
 }

 func getMemoryData(path, name string) (cgroups.MemoryData, error) {
@@ -162,6 +252,7 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
 	usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
 	maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
 	failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
+	limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")

 	value, err := getCgroupParamUint(path, usage)
 	if err != nil {
@@ -187,6 +278,14 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
 		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
 	}
 	memoryData.Failcnt = value
+	value, err = getCgroupParamUint(path, limit)
+	if err != nil {
+		if moduleName != "memory" && os.IsNotExist(err) {
+			return cgroups.MemoryData{}, nil
+		}
+		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
+	}
+	memoryData.Limit = value

 	return memoryData, nil
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go
@@ -9,6 +9,7 @@ import (

 type NameGroup struct {
 	GroupName string
+	Join      bool
 }

 func (s *NameGroup) Name() string {
@@ -16,6 +17,10 @@ func (s *NameGroup) Name() string {
 }

 func (s *NameGroup) Apply(d *cgroupData) error {
+	if s.Join {
+		// ignore errors if the named cgroup does not exist
+		d.join(s.GroupName)
+	}
 	return nil
 }

@@ -24,6 +29,9 @@ func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error {
 }

 func (s *NameGroup) Remove(d *cgroupData) error {
+	if s.Join {
+		removePath(d.path(s.GroupName))
+	}
 	return nil
 }

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go
@@ -4,6 +4,7 @@ package fs

 import (
 	"fmt"
+	"path/filepath"
 	"strconv"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -47,11 +48,26 @@ func (s *PidsGroup) Remove(d *cgroupData) error {
 }

 func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
-	value, err := getCgroupParamUint(path, "pids.current")
+	current, err := getCgroupParamUint(path, "pids.current")
 	if err != nil {
 		return fmt.Errorf("failed to parse pids.current - %s", err)
 	}

-	stats.PidsStats.Current = value
+	maxString, err := getCgroupParamString(path, "pids.max")
+	if err != nil {
+		return fmt.Errorf("failed to parse pids.max - %s", err)
+	}
+
+	// Default if pids.max == "max" is 0 -- which represents "no limit".
+	var max uint64
+	if maxString != "max" {
+		max, err = parseUint(maxString, 10, 64)
+		if err != nil {
+			return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max"))
+		}
+	}
+
+	stats.PidsStats.Current = current
+	stats.PidsStats.Limit = max
 	return nil
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go
@@ -12,7 +12,6 @@ import (
 )

 var (
-	ErrNotSupportStat = errors.New("stats are not supported for subsystem")
 	ErrNotValidFormat = errors.New("line is not a valid key value format")
 )

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go
@@ -11,6 +11,7 @@ type ThrottlingData struct {
 	ThrottledTime uint64 `json:"throttled_time,omitempty"`
 }

+// CpuUsage denotes the usage of a CPU.
 // All CPU stats are aggregate since container inception.
 type CpuUsage struct {
 	// Total CPU time consumed.
@@ -36,7 +37,9 @@ type MemoryData struct {
 	Usage    uint64 `json:"usage,omitempty"`
 	MaxUsage uint64 `json:"max_usage,omitempty"`
 	Failcnt  uint64 `json:"failcnt"`
+	Limit    uint64 `json:"limit"`
 }
+
 type MemoryStats struct {
 	// memory used for cache
 	Cache uint64 `json:"cache,omitempty"`
@@ -44,14 +47,19 @@ type MemoryStats struct {
 	Usage MemoryData `json:"usage,omitempty"`
 	// usage of memory + swap
 	SwapUsage MemoryData `json:"swap_usage,omitempty"`
-	// usafe of kernel memory
-	KernelUsage MemoryData        `json:"kernel_usage,omitempty"`
-	Stats       map[string]uint64 `json:"stats,omitempty"`
+	// usage of kernel memory
+	KernelUsage MemoryData `json:"kernel_usage,omitempty"`
+	// usage of kernel TCP memory
+	KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
+
+	Stats map[string]uint64 `json:"stats,omitempty"`
 }

 type PidsStats struct {
 	// number of pids in the cgroup
 	Current uint64 `json:"current,omitempty"`
+	// active pids hard limit
+	Limit uint64 `json:"limit,omitempty"`
 }

 type BlkioStatEntry struct {
@@ -78,7 +86,7 @@ type HugetlbStats struct {
 	Usage uint64 `json:"usage,omitempty"`
 	// maximum usage ever recorded.
 	MaxUsage uint64 `json:"max_usage,omitempty"`
-	// number of times htgetlb usage allocation failure.
+	// number of times hugetlb usage allocation failure.
 	Failcnt uint64 `json:"failcnt"`
 }

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go
@@ -74,6 +74,7 @@ var (
 	theConn                         *systemdDbus.Conn
 	hasStartTransientUnit           bool
 	hasTransientDefaultDependencies bool
+	hasDelegate                     bool
 )

 func newProp(name string, units interface{}) systemdDbus.Property {
@@ -146,20 +147,24 @@ func UseSystemd() bool {

 		// Not critical because of the stop unit logic above.
 		theConn.StopUnit(scope, "replace", nil)
+
+		// Assume StartTransientUnit on a scope allows Delegate
+		hasDelegate = true
+		dl := newProp("Delegate", true)
+		if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil {
+			if dbusError, ok := err.(dbus.Error); ok {
+				if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
+					hasDelegate = false
+				}
+			}
+		}
+
+		// Not critical because of the stop unit logic above.
+		theConn.StopUnit(scope, "replace", nil)
 	}
 	return hasStartTransientUnit
 }

-func getIfaceForUnit(unitName string) string {
-	if strings.HasSuffix(unitName, ".scope") {
-		return "Scope"
-	}
-	if strings.HasSuffix(unitName, ".service") {
-		return "Service"
-	}
-	return "Unit"
-}
-
 func (m *Manager) Apply(pid int) error {
 	var (
 		c          = m.Cgroups
@@ -195,6 +200,11 @@ func (m *Manager) Apply(pid int) error {
 		newProp("PIDs", []uint32{uint32(pid)}),
 	)

+	if hasDelegate {
+		// This is only supported on systemd versions 218 and above.
+		properties = append(properties, newProp("Delegate", true))
+	}
+
 	// Always enable accounting, this gets us the same behaviour as the fs implementation,
 	// plus the kernel has some problems with joining the memory cgroup at a later time.
 	properties = append(properties,
@@ -222,11 +232,9 @@ func (m *Manager) Apply(pid int) error {
 			newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
 	}

-	// We need to set kernel memory before processes join cgroup because
-	// kmem.limit_in_bytes can only be set when the cgroup is empty.
-	// And swap memory limit needs to be set after memory limit, only
-	// memory limit is handled by systemd, so it's kind of ugly here.
-	if c.Resources.KernelMemory > 0 {
+	// We have to set kernel memory here, as we can't change it once
+	// processes have been attached to the cgroup.
+	if c.Resources.KernelMemory != 0 {
 		if err := setKernelMemory(c); err != nil {
 			return err
 		}
@@ -236,53 +244,7 @@ func (m *Manager) Apply(pid int) error {
 		return err
 	}

-	if err := joinDevices(c, pid); err != nil {
-		return err
-	}
-
-	// TODO: CpuQuota and CpuPeriod not available in systemd
-	// we need to manually join the cpu.cfs_quota_us and cpu.cfs_period_us
-	if err := joinCpu(c, pid); err != nil {
-		return err
-	}
-
-	// TODO: MemoryReservation and MemorySwap not available in systemd
-	if err := joinMemory(c, pid); err != nil {
-		return err
-	}
-
-	// we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd
-	// because it does not currently support it via the dbus api.
-	if err := joinFreezer(c, pid); err != nil {
-		return err
-	}
-
-	if err := joinNetPrio(c, pid); err != nil {
-		return err
-	}
-	if err := joinNetCls(c, pid); err != nil {
-		return err
-	}
-
-	if err := joinPids(c, pid); err != nil {
-		return err
-	}
-
-	if err := joinCpuset(c, pid); err != nil {
-		return err
-	}
-
-	if err := joinHugetlb(c, pid); err != nil {
-		return err
-	}
-
-	if err := joinPerfEvent(c, pid); err != nil {
-		return err
-	}
-	// FIXME: Systemd does have `BlockIODeviceWeight` property, but we got problem
-	// using that (at least on systemd 208, see https://github.com/opencontainers/runc/libcontainer/pull/354),
-	// so use fs work around for now.
-	if err := joinBlkio(c, pid); err != nil {
+	if err := joinCgroups(c, pid); err != nil {
 		return err
 	}

@@ -327,7 +289,7 @@ func writeFile(dir, file, data string) error {
 	// Normally dir should not be empty, one case is that cgroup subsystem
 	// is not mounted, we will get empty dir, and we want it fail here.
 	if dir == "" {
-		return fmt.Errorf("no such directory for %s.", file)
+		return fmt.Errorf("no such directory for %s", file)
 	}
 	return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
 }
@@ -347,43 +309,41 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
 	return path, nil
 }

-func joinCpu(c *configs.Cgroup, pid int) error {
-	_, err := join(c, "cpu", pid)
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
+func joinCgroups(c *configs.Cgroup, pid int) error {
+	for _, sys := range subsystems {
+		name := sys.Name()
+		switch name {
+		case "name=systemd":
+			// let systemd handle this
+			break
+		case "cpuset":
+			path, err := getSubsystemPath(c, name)
+			if err != nil && !cgroups.IsNotFound(err) {
+				return err
+			}
+			s := &fs.CpusetGroup{}
+			if err := s.ApplyDir(path, c, pid); err != nil {
+				return err
+			}
+			break
+		default:
+			_, err := join(c, name, pid)
+			if err != nil {
+				// Even if it's `not found` error, we'll return err
+				// because devices cgroup is hard requirement for
+				// container security.
+				if name == "devices" {
+					return err
+				}
+				// For other subsystems, omit the `not found` error
+				// because they are optional.
+				if !cgroups.IsNotFound(err) {
+					return err
+				}
+			}
+		}
 	}
-	return nil
-}

-func joinFreezer(c *configs.Cgroup, pid int) error {
-	_, err := join(c, "freezer", pid)
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
-}
-
-func joinNetPrio(c *configs.Cgroup, pid int) error {
-	_, err := join(c, "net_prio", pid)
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
-}
-
-func joinNetCls(c *configs.Cgroup, pid int) error {
-	_, err := join(c, "net_cls", pid)
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
-}
-
-func joinPids(c *configs.Cgroup, pid int) error {
-	_, err := join(c, "pids", pid)
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
 	return nil
 }

@@ -392,9 +352,18 @@ func joinPids(c *configs.Cgroup, pid int) error {
 // test.slice/test-a.slice/test-a-b.slice.
 func expandSlice(slice string) (string, error) {
 	suffix := ".slice"
-	sliceName := strings.TrimSuffix(slice, suffix)
+	// Name has to end with ".slice", but can't be just ".slice".
+	if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
+		return "", fmt.Errorf("invalid slice name: %s", slice)
+	}
+
+	// Path-separators are not allowed.
+	if strings.Contains(slice, "/") {
+		return "", fmt.Errorf("invalid slice name: %s", slice)
+	}

 	var path, prefix string
+	sliceName := strings.TrimSuffix(slice, suffix)
 	for _, component := range strings.Split(sliceName, "-") {
 		// test--a.slice isn't permitted, nor is -test.slice.
 		if component == "" {
@@ -510,87 +479,11 @@ func getUnitName(c *configs.Cgroup) string {
 	return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
 }

-// Atm we can't use the systemd device support because of two missing things:
-// * Support for wildcards to allow mknod on any device
-// * Support for wildcards to allow /dev/pts support
-//
-// The second is available in more recent systemd as "char-pts", but not in e.g. v208 which is
-// in wide use. When both these are available we will be able to switch, but need to keep the old
-// implementation for backwards compat.
-//
-// Note: we can't use systemd to set up the initial limits, and then change the cgroup
-// because systemd will re-write the device settings if it needs to re-apply the cgroup context.
-// This happens at least for v208 when any sibling unit is started.
-func joinDevices(c *configs.Cgroup, pid int) error {
-	_, err := join(c, "devices", pid)
-	// Even if it's `not found` error, we'll return err because devices cgroup
-	// is hard requirement for container security.
-	if err != nil {
-		return err
-	}
-	return nil
-}
-
 func setKernelMemory(c *configs.Cgroup) error {
 	path, err := getSubsystemPath(c, "memory")
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}

-	if err := os.MkdirAll(path, 0755); err != nil {
-		return err
-	}
-
-	// This doesn't get called by manager.Set, so we need to do it here.
-	s := &fs.MemoryGroup{}
-	return s.SetKernelMemory(path, c)
-}
-
-func joinMemory(c *configs.Cgroup, pid int) error {
-	_, err := join(c, "memory", pid)
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
-}
-
-// systemd does not atm set up the cpuset controller, so we must manually
-// join it. Additionally that is a very finicky controller where each
-// level must have a full setup as the default for a new directory is "no cpus"
-func joinCpuset(c *configs.Cgroup, pid int) error {
-	path, err := getSubsystemPath(c, "cpuset")
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-
-	s := &fs.CpusetGroup{}
-
-	return s.ApplyDir(path, c, pid)
-}
-
-// `BlockIODeviceWeight` property of systemd does not work properly, and systemd
-// expects device path instead of major minor numbers, which is also confusing
-// for users. So we use fs work around for now.
-func joinBlkio(c *configs.Cgroup, pid int) error {
-	_, err := join(c, "blkio", pid)
-	if err != nil {
-		return err
-	}
-	return nil
-}
-
-func joinHugetlb(c *configs.Cgroup, pid int) error {
-	_, err := join(c, "hugetlb", pid)
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
-}
-
-func joinPerfEvent(c *configs.Cgroup, pid int) error {
-	_, err := join(c, "perf_event", pid)
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
+	return os.MkdirAll(path, 0755)
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go
@@ -5,6 +5,7 @@ package cgroups
 import (
 	"bufio"
 	"fmt"
+	"io"
 	"io/ioutil"
 	"os"
 	"path/filepath"
@@ -12,17 +13,19 @@ import (
 	"strings"
 	"time"

-	"github.com/docker/docker/pkg/mount"
 	"github.com/docker/go-units"
 )

 const cgroupNamePrefix = "name="

-// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt
+// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
 func FindCgroupMountpoint(subsystem string) (string, error) {
 	// We are not using mount.GetMounts() because it's super-inefficient,
 	// parsing it directly sped up x10 times because of not using Sscanf.
 	// It was one of two major performance drawbacks in container start.
+	if !isSubsystemAvailable(subsystem) {
+		return "", NewNotFoundError(subsystem)
+	}
 	f, err := os.Open("/proc/self/mountinfo")
 	if err != nil {
 		return "", err
@@ -47,6 +50,9 @@ func FindCgroupMountpoint(subsystem string) (string, error) {
 }

 func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
+	if !isSubsystemAvailable(subsystem) {
+		return "", "", NewNotFoundError(subsystem)
+	}
 	f, err := os.Open("/proc/self/mountinfo")
 	if err != nil {
 		return "", "", err
@@ -70,6 +76,15 @@ func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
 	return "", "", NewNotFoundError(subsystem)
 }

+// isSubsystemAvailable reports whether subsystem appears in /proc/self/cgroup.
+func isSubsystemAvailable(subsystem string) bool {
+	if known, err := ParseCgroupFile("/proc/self/cgroup"); err == nil {
+		_, found := known[subsystem]
+		return found
+	}
+	return false // a parse failure is treated as "not available"
+}
+
 func FindCgroupMountpointDir() (string, error) {
 	f, err := os.Open("/proc/self/mountinfo")
 	if err != nil {
@@ -121,42 +136,63 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
 	return getControllerPath(m.Subsystems[0], cgroups)
 }

+// getCgroupMountsHelper scans mountinfo-formatted text from mi and returns one
+// Mount per cgroup line, listing only the mount options found in ss as its
+// Subsystems. Scanning stops once len(ss) matching options have been seen.
+func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
+	res := make([]Mount, 0, len(ss))
+	scanner := bufio.NewScanner(mi)
+	for numFound := 0; scanner.Scan() && numFound < len(ss); {
+		txt := scanner.Text()
+		sepIdx := strings.Index(txt, " - ")
+		if sepIdx == -1 {
+			return nil, fmt.Errorf("invalid mountinfo format")
+		}
+		// HasPrefix cannot panic on a short line, unlike a fixed-width slice.
+		if !strings.HasPrefix(txt[sepIdx+3:], "cgroup") {
+			continue
+		}
+		fields := strings.Split(txt, " ")
+		if len(fields) < 5 {
+			return nil, fmt.Errorf("invalid mountinfo format")
+		}
+		m := Mount{Mountpoint: fields[4], Root: fields[3]}
+		for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+			if !ss[opt] {
+				continue
+			}
+			// Named hierarchies ("name=...") are stored without the prefix.
+			m.Subsystems = append(m.Subsystems, strings.TrimPrefix(opt, cgroupNamePrefix))
+			numFound++
+		}
+		res = append(res, m)
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+	return res, nil
+}
+
 func GetCgroupMounts() ([]Mount, error) {
-	mounts, err := mount.GetMounts()
+	f, err := os.Open("/proc/self/mountinfo")
 	if err != nil {
 		return nil, err
 	}
+	defer f.Close()

-	all, err := GetAllSubsystems()
+	all, err := ParseCgroupFile("/proc/self/cgroup")
 	if err != nil {
 		return nil, err
 	}

 	allMap := make(map[string]bool)
-	for _, s := range all {
+	for s := range all {
 		allMap[s] = true
 	}
-
-	res := []Mount{}
-	for _, mount := range mounts {
-		if mount.Fstype == "cgroup" {
-			m := Mount{Mountpoint: mount.Mountpoint, Root: mount.Root}
-
-			for _, opt := range strings.Split(mount.VfsOpts, ",") {
-				if strings.HasPrefix(opt, cgroupNamePrefix) {
-					m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
-				}
-				if allMap[opt] {
-					m.Subsystems = append(m.Subsystems, opt)
-				}
-			}
-			res = append(res, m)
-		}
-	}
-	return res, nil
+	return getCgroupMountsHelper(allMap, f)
 }

-// Returns all the cgroup subsystems supported by the kernel
+// GetAllSubsystems returns all the cgroup subsystems supported by the kernel
 func GetAllSubsystems() ([]string, error) {
 	f, err := os.Open("/proc/cgroups")
 	if err != nil {
@@ -182,7 +218,7 @@ func GetAllSubsystems() ([]string, error) {
 	return subsystems, nil
 }

-// Returns the relative path to the cgroup docker is running in.
+// GetThisCgroupDir returns the relative path to the cgroup docker is running in.
 func GetThisCgroupDir(subsystem string) (string, error) {
 	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
 	if err != nil {
@@ -226,6 +262,8 @@ func readProcsFile(dir string) ([]int, error) {
 	return out, nil
 }

+// ParseCgroupFile parses the given cgroup file, typically from
+// /proc/<pid>/cgroup, into a map of subsystems to cgroup paths.
 func ParseCgroupFile(path string) (map[string]string, error) {
 	f, err := os.Open(path)
 	if err != nil {
@@ -233,7 +271,12 @@ func ParseCgroupFile(path string) (map[string]string, error) {
 	}
 	defer f.Close()

-	s := bufio.NewScanner(f)
+	return parseCgroupFromReader(f)
+}
+
+// helper function for ParseCgroupFile to make testing easier
+func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
+	s := bufio.NewScanner(r)
 	cgroups := make(map[string]string)

 	for s.Scan() {
@@ -242,7 +285,16 @@ func ParseCgroupFile(path string) (map[string]string, error) {
 		}

 		text := s.Text()
-		parts := strings.Split(text, ":")
+		// from cgroups(7):
+		// /proc/[pid]/cgroup
+		// ...
+		// For each cgroup hierarchy ... there is one entry
+		// containing three colon-separated fields of the form:
+		//     hierarchy-ID:subsystem-list:cgroup-path
+		parts := strings.SplitN(text, ":", 3)
+		if len(parts) < 3 {
+			return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
+		}

 		for _, subs := range strings.Split(parts[1], ",") {
 			cgroups[subs] = parts[2]
@@ -309,7 +361,7 @@ func RemovePaths(paths map[string]string) (err error) {
 			return nil
 		}
 	}
-	return fmt.Errorf("Failed to remove paths: %s", paths)
+	return fmt.Errorf("Failed to remove paths: %v", paths)
 }

 func GetHugePageSize() ([]string, error) {
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go
@@ -11,15 +11,22 @@ const (
 )

 type Cgroup struct {
-	Name string `json:"name"`
+	// Deprecated, use Path instead
+	Name string `json:"name,omitempty"`

-	// name of parent cgroup or slice
-	Parent string `json:"parent"`
+	// name of parent of cgroup or slice
+	// Deprecated, use Path instead
+	Parent string `json:"parent,omitempty"`
+
+	// Path specifies the path to cgroups that are created and/or joined by the container.
+	// The path is assumed to be relative to the host system cgroup mountpoint.
+	Path string `json:"path"`

 	// ScopePrefix decribes prefix for the scope name
 	ScopePrefix string `json:"scope_prefix"`

-	// Paths represent the cgroups paths to join
+	// Paths represent the absolute cgroups paths to join.
+	// This takes precedence over Path.
 	Paths map[string]string

 	// Resources contains various cgroups settings to apply
@@ -28,11 +35,14 @@ type Cgroup struct {

 type Resources struct {
 	// If this is true allow access to any kind of device within the container.  If false, allow access only to devices explicitly listed in the allowed_devices list.
-	AllowAllDevices bool `json:"allow_all_devices"`
+	// Deprecated
+	AllowAllDevices bool `json:"allow_all_devices,omitempty"`
+	// Deprecated
+	AllowedDevices []*Device `json:"allowed_devices,omitempty"`
+	// Deprecated
+	DeniedDevices []*Device `json:"denied_devices,omitempty"`

-	AllowedDevices []*Device `json:"allowed_devices"`
-
-	DeniedDevices []*Device `json:"denied_devices"`
+	Devices []*Device `json:"devices"`

 	// Memory limit (in bytes)
 	Memory int64 `json:"memory"`
@@ -46,6 +56,9 @@ type Resources struct {
 	// Kernel memory limit (in bytes)
 	KernelMemory int64 `json:"kernel_memory"`

+	// Kernel memory limit for TCP use (in bytes)
+	KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
+
 	// CPU shares (relative weight vs. other containers)
 	CpuShares int64 `json:"cpu_shares"`

@@ -101,7 +114,7 @@ type Resources struct {
 	OomKillDisable bool `json:"oom_kill_disable"`

 	// Tuning swappiness behaviour per cgroup
-	MemorySwappiness int64 `json:"memory_swappiness"`
+	MemorySwappiness *int64 `json:"memory_swappiness"`

 	// Set priority of network traffic for container
 	NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go
@@ -3,7 +3,11 @@ package configs
 import (
 	"bytes"
 	"encoding/json"
+	"fmt"
 	"os/exec"
+	"time"
+
+	"github.com/Sirupsen/logrus"
 )

 type Rlimit struct {
@@ -29,7 +33,7 @@ type Seccomp struct {
 	Syscalls      []*Syscall `json:"syscalls"`
 }

-// An action to be taken upon rule match in Seccomp
+// Action is taken upon rule match in Seccomp
 type Action int

 const (
@@ -40,7 +44,7 @@ const (
 	Trace
 )

-// A comparison operator to be used when matching syscall arguments in Seccomp
+// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
 type Operator int

 const (
@@ -53,7 +57,7 @@ const (
 	MaskEqualTo
 )

-// A rule to match a specific syscall argument in Seccomp
+// Arg is a rule to match a specific syscall argument in Seccomp
 type Arg struct {
 	Index    uint     `json:"index"`
 	Value    uint64   `json:"value"`
@@ -61,7 +65,7 @@ type Arg struct {
 	Op       Operator `json:"op"`
 }

-// An rule to match a syscall in Seccomp
+// Syscall is a rule to match a syscall in Seccomp
 type Syscall struct {
 	Name   string `json:"name"`
 	Action Action `json:"action"`
@@ -128,15 +132,15 @@ type Config struct {

 	// AppArmorProfile specifies the profile to apply to the process running in the container and is
 	// change at the time the process is execed
-	AppArmorProfile string `json:"apparmor_profile"`
+	AppArmorProfile string `json:"apparmor_profile,omitempty"`

 	// ProcessLabel specifies the label to apply to the process running in the container.  It is
 	// commonly used by selinux
-	ProcessLabel string `json:"process_label"`
+	ProcessLabel string `json:"process_label,omitempty"`

 	// Rlimits specifies the resource limits, such as max open files, to set in the container
 	// If Rlimits are not set, the container will inherit rlimits from the parent process
-	Rlimits []Rlimit `json:"rlimits"`
+	Rlimits []Rlimit `json:"rlimits,omitempty"`

 	// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
 	// for a process. Valid values are between the range [-1000, '1000'], where processes with
@@ -171,12 +175,22 @@ type Config struct {
 	// A default action to be taken if no rules match is also given.
 	Seccomp *Seccomp `json:"seccomp"`

+	// NoNewPrivileges controls whether processes in the container can gain additional privileges.
+	NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
+
 	// Hooks are a collection of actions to perform at various container lifecycle events.
-	// Hooks are not able to be marshaled to json but they are also not needed to.
-	Hooks *Hooks `json:"-"`
+	// CommandHooks are serialized to JSON, but other hooks are not.
+	Hooks *Hooks

 	// Version is the version of opencontainer specification that is supported.
 	Version string `json:"version"`
+
+	// Labels are user defined metadata that is stored in the config and populated on the state
+	Labels []string `json:"labels"`
+
+	// NoNewKeyring will not allocate a new session keyring for the container.  It will use the
+	// caller's keyring in this case.
+	NoNewKeyring bool `json:"no_new_keyring"`
 }

 type Hooks struct {
@@ -191,12 +205,59 @@ type Hooks struct {
 	Poststop []Hook
 }

+// UnmarshalJSON decodes the prestart, poststart and poststop lists as
+// CommandHooks and stores them on hooks as generic Hook values.
+func (hooks *Hooks) UnmarshalJSON(b []byte) error {
+	var decoded struct {
+		Prestart, Poststart, Poststop []CommandHook
+	}
+	if err := json.Unmarshal(b, &decoded); err != nil {
+		return err
+	}
+
+	// toHooks widens a []CommandHook to []Hook; an empty input stays nil.
+	toHooks := func(cmds []CommandHook) []Hook {
+		var out []Hook
+		for _, cmd := range cmds {
+			out = append(out, cmd)
+		}
+		return out
+	}
+
+	hooks.Prestart = toHooks(decoded.Prestart)
+	hooks.Poststart = toHooks(decoded.Poststart)
+	hooks.Poststop = toHooks(decoded.Poststop)
+	return nil
+}
+
+// MarshalJSON encodes only the CommandHook entries of each hook list; hooks of
+// any other type are skipped with a warning.
+func (hooks Hooks) MarshalJSON() ([]byte, error) {
+	onlyCommands := func(all []Hook) []CommandHook {
+		var kept []CommandHook // stays nil (JSON null) when nothing is kept
+		for _, h := range all {
+			if cmd, ok := h.(CommandHook); ok {
+				kept = append(kept, cmd)
+				continue
+			}
+			logrus.Warnf("cannot serialize hook of type %T, skipping", h)
+		}
+		return kept
+	}
+	return json.Marshal(map[string]interface{}{
+		"prestart":  onlyCommands(hooks.Prestart),
+		"poststart": onlyCommands(hooks.Poststart),
+		"poststop":  onlyCommands(hooks.Poststop),
+	})
+}
+
 // HookState is the payload provided to a hook on execution.
 type HookState struct {
-	Version string `json:"version"`
-	ID      string `json:"id"`
-	Pid     int    `json:"pid"`
-	Root    string `json:"root"`
+	Version    string `json:"ociVersion"`
+	ID         string `json:"id"`
+	Pid        int    `json:"pid"`
+	Root       string `json:"root"`
+	BundlePath string `json:"bundlePath"`
 }

 type Hook interface {
@@ -204,7 +265,7 @@ type Hook interface {
 	Run(HookState) error
 }

-// NewFunctionHooks will call the provided function when the hook is run.
+// NewFunctionHook will call the provided function when the hook is run.
 func NewFunctionHook(f func(HookState) error) FuncHook {
 	return FuncHook{
 		run: f,
@@ -220,13 +281,14 @@ func (f FuncHook) Run(s HookState) error {
 }

 type Command struct {
-	Path string   `json:"path"`
-	Args []string `json:"args"`
-	Env  []string `json:"env"`
-	Dir  string   `json:"dir"`
+	Path    string         `json:"path"`
+	Args    []string       `json:"args"`
+	Env     []string       `json:"env"`
+	Dir     string         `json:"dir"`
+	Timeout *time.Duration `json:"timeout"`
 }

-// NewCommandHooks will execute the provided command when the hook is run.
+// NewCommandHook will execute the provided command when the hook is run.
 func NewCommandHook(cmd Command) CommandHook {
 	return CommandHook{
 		Command: cmd,
@@ -248,5 +310,23 @@ func (c Command) Run(s HookState) error {
 		Env:   c.Env,
 		Stdin: bytes.NewReader(b),
 	}
-	return cmd.Run()
+	errC := make(chan error, 1)
+	go func() {
+		out, err := cmd.CombinedOutput()
+		if err != nil {
+			err = fmt.Errorf("%s: %s", err, out)
+		}
+		errC <- err
+	}()
+	if c.Timeout != nil {
+		select {
+		case err := <-errC:
+			return err
+		case <-time.After(*c.Timeout):
+			cmd.Process.Kill()
+			cmd.Wait()
+			return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
+		}
+	}
+	return <-errC
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go
@@ -4,7 +4,7 @@ package configs

 import "fmt"

-// Gets the root uid for the process on host which could be non-zero
+// HostUID gets the root uid for the process on host which could be non-zero
 // when user namespaces are enabled.
 func (c Config) HostUID() (int, error) {
 	if c.Namespaces.Contains(NEWUSER) {
@@ -21,7 +21,7 @@ func (c Config) HostUID() (int, error) {
 	return 0, nil
 }

-// Gets the root gid for the process on host which could be non-zero
+// HostGID gets the root gid for the process on host which could be non-zero
 // when user namespaces are enabled.
 func (c Config) HostGID() (int, error) {
 	if c.Namespaces.Contains(NEWUSER) {
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go
@@ -35,6 +35,9 @@ type Device struct {

 	// Gid of the device.
 	Gid uint32 `json:"gid"`
+
+	// Write the file to the allowed list
+	Allow bool `json:"allow"`
 }

 func (d *Device) CgroupString() string {
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go
@@ -3,7 +3,7 @@
 package configs

 var (
-	// These are devices that are to be both allowed and created.
+	// DefaultSimpleDevices are devices that are to be both allowed and created.
 	DefaultSimpleDevices = []*Device{
 		// /dev/null and zero
 		{
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go
@@ -18,7 +18,7 @@ var namespaceInfo = map[NamespaceType]int{
 }

 // CloneFlags parses the container's Namespaces options to set the correct
-// flags on clone, unshare. This functions returns flags only for new namespaces.
+// flags on clone, unshare. This function returns flags only for new namespaces.
 func (n *Namespaces) CloneFlags() uintptr {
 	var flag int
 	for _, v := range *n {
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go
@@ -8,7 +8,7 @@ func (n *Namespace) Syscall() int {
 }

 // CloneFlags parses the container's Namespaces options to set the correct
-// flags on clone, unshare. This functions returns flags only for new namespaces.
+// flags on clone, unshare. This function returns flags only for new namespaces.
 func (n *Namespaces) CloneFlags() uintptr {
 	panic("No namespace syscall support")
 	return uintptr(0)
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go
@@ -2,7 +2,11 @@

 package configs

-import "fmt"
+import (
+	"fmt"
+	"os"
+	"sync"
+)

 const (
 	NEWNET  NamespaceType = "NEWNET"
@@ -13,6 +17,51 @@ const (
 	NEWUSER NamespaceType = "NEWUSER"
 )

+var (
+	nsLock              sync.Mutex
+	supportedNamespaces = make(map[NamespaceType]bool)
+)
+
+// nsToFile maps a namespace type to its file name under /proc/<pid>/ns ("" if unknown)
+func nsToFile(ns NamespaceType) (file string) {
+	switch ns {
+	case NEWNET:
+		file = "net"
+	case NEWNS:
+		file = "mnt"
+	case NEWPID:
+		file = "pid"
+	case NEWIPC:
+		file = "ipc"
+	case NEWUSER:
+		file = "user"
+	case NEWUTS:
+		file = "uts"
+	}
+	return file
+}
+
+// IsNamespaceSupported reports whether the namespace type ns is available,
+// judged by whether /proc/self/ns/<file> can be stat'ed. Results are cached.
+func IsNamespaceSupported(ns NamespaceType) bool {
+	nsLock.Lock()
+	defer nsLock.Unlock()
+
+	// Serve repeated queries from the cache.
+	if cached, seen := supportedNamespaces[ns]; seen {
+		return cached
+	}
+	// Unknown namespace types map to no file; they are reported as
+	// unsupported and are not cached.
+	name := nsToFile(ns)
+	if name == "" {
+		return false
+	}
+	_, statErr := os.Stat(fmt.Sprintf("/proc/self/ns/%s", name))
+	supportedNamespaces[ns] = statErr == nil
+	return statErr == nil
+}
+
 func NamespaceTypes() []NamespaceType {
 	return []NamespaceType{
 		NEWNET,
@@ -35,26 +84,7 @@ func (n *Namespace) GetPath(pid int) string {
 	if n.Path != "" {
 		return n.Path
 	}
-	return fmt.Sprintf("/proc/%d/ns/%s", pid, n.file())
-}
-
-func (n *Namespace) file() string {
-	file := ""
-	switch n.Type {
-	case NEWNET:
-		file = "net"
-	case NEWNS:
-		file = "mnt"
-	case NEWPID:
-		file = "pid"
-	case NEWIPC:
-		file = "ipc"
-	case NEWUSER:
-		file = "user"
-	case NEWUTS:
-		file = "uts"
-	}
-	return file
+	return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type))
 }

 func (n *Namespaces) Remove(t NamespaceType) bool {
@@ -87,3 +117,11 @@ func (n *Namespaces) index(t NamespaceType) int {
 func (n *Namespaces) Contains(t NamespaceType) bool {
 	return n.index(t) != -1
 }
+
+// PathOf returns the Path of the entry index finds for t, or "" when index reports -1.
+func (n *Namespaces) PathOf(t NamespaceType) string {
+	if i := n.index(t); i != -1 {
+		return (*n)[i].Path
+	}
+	return ""
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go
@@ -4,8 +4,10 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"strings"

 	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/selinux"
 )

 type Validator interface {
@@ -35,10 +37,13 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
 	if err := v.usernamespace(config); err != nil {
 		return err
 	}
+	if err := v.sysctl(config); err != nil {
+		return err
+	}
 	return nil
 }

-// rootfs validates the the rootfs is an absolute path and is not a symlink
+// rootfs validates if the rootfs is an absolute path and is not a symlink
 // to the container's root filesystem.
 func (v *ConfigValidator) rootfs(config *configs.Config) error {
 	cleaned, err := filepath.Abs(config.Rootfs)
@@ -48,7 +53,7 @@ func (v *ConfigValidator) rootfs(config *configs.Config) error {
 	if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
 		return err
 	}
-	if config.Rootfs != cleaned {
+	if filepath.Clean(config.Rootfs) != cleaned {
 		return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs)
 	}
 	return nil
@@ -76,6 +81,10 @@ func (v *ConfigValidator) security(config *configs.Config) error {
 		!config.Namespaces.Contains(configs.NEWNS) {
 		return fmt.Errorf("unable to restrict sys entries without a private MNT namespace")
 	}
+	if config.ProcessLabel != "" && !selinux.SelinuxEnabled() {
+		return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported")
+	}
+
 	return nil
 }

@@ -91,3 +100,39 @@ func (v *ConfigValidator) usernamespace(config *configs.Config) error {
 	}
 	return nil
 }
+
+// sysctl checks every key in config.Sysctl against the configured namespaces.
+// /proc/sys isn't completely namespaced, so only a subset of sysctls is
+// permitted, and each permitted key needs its matching new namespace.
+func (v *ConfigValidator) sysctl(config *configs.Config) error {
+	ipcSysctls := map[string]bool{
+		"kernel.msgmax":          true,
+		"kernel.msgmnb":          true,
+		"kernel.msgmni":          true,
+		"kernel.sem":             true,
+		"kernel.shmall":          true,
+		"kernel.shmmax":          true,
+		"kernel.shmmni":          true,
+		"kernel.shm_rmid_forced": true,
+	}
+
+	for key := range config.Sysctl {
+		switch {
+		case ipcSysctls[key] || strings.HasPrefix(key, "fs.mqueue."):
+			// IPC keys and the fs.mqueue.* family require a new IPC namespace.
+			if !config.Namespaces.Contains(configs.NEWIPC) {
+				return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", key)
+			}
+		case strings.HasPrefix(key, "net."):
+			// net.* keys require a new network namespace.
+			if !config.Namespaces.Contains(configs.NEWNET) {
+				return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", key)
+			}
+		default:
+			// Anything else is rejected outright.
+			return fmt.Errorf("sysctl %q is not in a separate kernel namespace", key)
+		}
+	}
+
+	return nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go
@@ -0,0 +1,11 @@
+package libcontainer
+
+import (
+	"errors"
+)
+
+// NewConsole is a stub on Solaris: libcontainer consoles are not supported on
+// this platform, so it always returns a nil Console and an error.
+func NewConsole(uid, gid int) (Console, error) {
+	return nil, errors.New("libcontainer console is not supported on Solaris")
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/container.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/container.go
@@ -1,4 +1,4 @@
-// Libcontainer provides a native Go implementation for creating containers
+// Package libcontainer provides a native Go implementation for creating containers
 // with namespaces, cgroups, capabilities, and filesystem access controls.
 // It allows you to manage the lifecycle of the container performing additional operations
 // after the container is created.
@@ -6,31 +6,25 @@ package libcontainer

 import (
 	"os"
+	"time"

 	"github.com/opencontainers/runc/libcontainer/configs"
 )

-// The status of a container.
+// Status is the status of a container.
 type Status int

 const (
-	// The container exists but has not been run yet
+	// Created is the status that denotes the container exists but has not been run yet.
 	Created Status = iota
-
-	// The container exists and is running.
+	// Running is the status that denotes the container exists and is running.
 	Running
-
-	// The container exists, it is in the process of being paused.
+	// Pausing is the status that denotes the container exists, it is in the process of being paused.
 	Pausing
-
-	// The container exists, but all its processes are paused.
+	// Paused is the status that denotes the container exists, but all its processes are paused.
 	Paused
-
-	// The container exists, but its state is saved on disk
-	Checkpointed
-
-	// The container does not exist.
-	Destroyed
+	// Stopped is the status that denotes the container does not have a created or running process.
+	Stopped
 )

 func (s Status) String() string {
@@ -43,10 +37,8 @@ func (s Status) String() string {
 		return "pausing"
 	case Paused:
 		return "paused"
-	case Checkpointed:
-		return "checkpointed"
-	case Destroyed:
-		return "destroyed"
+	case Stopped:
+		return "stopped"
 	default:
 		return "unknown"
 	}
@@ -61,14 +53,17 @@ type BaseState struct {
 	// InitProcessPid is the init process id in the parent namespace.
 	InitProcessPid int `json:"init_process_pid"`

-	// InitProcessStartTime is the init process start time.
+	// InitProcessStartTime is the init process start time in clock cycles since boot time.
 	InitProcessStartTime string `json:"init_process_start"`

+	// Created is the unix timestamp for the creation time of the container in UTC
+	Created time.Time `json:"created"`
+
 	// Config is the container's configuration.
 	Config configs.Config `json:"config"`
 }

-// A libcontainer container object.
+// BaseContainer is a libcontainer container object.
 //
 // Each container is thread-safe within the same process. Since a container can
 // be destroyed by a separate process, any function may return that the container
@@ -81,13 +76,13 @@ type BaseContainer interface {
 	//
 	// errors:
 	// ContainerDestroyed - Container no longer exists,
-	// Systemerror - System error.
+	// SystemError - System error.
 	Status() (Status, error)

 	// State returns the current container's state information.
 	//
 	// errors:
-	// Systemerror - System error.
+	// SystemError - System error.
 	State() (*State, error)

 	// Returns the current config of the container.
@@ -97,7 +92,7 @@ type BaseContainer interface {
 	//
 	// errors:
 	// ContainerDestroyed - Container no longer exists,
-	// Systemerror - System error.
+	// SystemError - System error.
 	//
 	// Some of the returned PIDs may no longer refer to processes in the Container, unless
 	// the Container state is PAUSED in which case every PID in the slice is valid.
@@ -107,7 +102,7 @@ type BaseContainer interface {
 	//
 	// errors:
 	// ContainerDestroyed - Container no longer exists,
-	// Systemerror - System error.
+	// SystemError - System error.
 	Stats() (*Stats, error)

 	// Set resources of container as configured
@@ -115,7 +110,7 @@ type BaseContainer interface {
 	// We can use this to change resources when containers are running.
 	//
 	// errors:
-	// Systemerror - System error.
+	// SystemError - System error.
 	Set(config configs.Config) error

 	// Start a process inside the container. Returns error if process fails to
@@ -125,21 +120,38 @@ type BaseContainer interface {
 	// ContainerDestroyed - Container no longer exists,
 	// ConfigInvalid - config is invalid,
 	// ContainerPaused - Container is paused,
-	// Systemerror - System error.
+	// SystemError - System error.
 	Start(process *Process) (err error)

+	// Run immediately starts the process inside the container.  Returns error if process
+	// fails to start.  It does not block waiting for the exec fifo after start returns but
+	// opens the fifo after start returns.
+	//
+	// errors:
+	// ContainerDestroyed - Container no longer exists,
+	// ConfigInvalid - config is invalid,
+	// ContainerPaused - Container is paused,
+	// SystemError - System error.
+	Run(process *Process) (err error)
+
 	// Destroys the container after killing all running processes.
 	//
 	// Any event registrations are removed before the container is destroyed.
 	// No error is returned if the container is already destroyed.
 	//
 	// errors:
-	// Systemerror - System error.
+	// SystemError - System error.
 	Destroy() error

 	// Signal sends the provided signal code to the container's initial process.
 	//
 	// errors:
-	// Systemerror - System error.
+	// SystemError - System error.
 	Signal(s os.Signal) error
+
+	// Exec signals the container to exec the user's process at the end of the init.
+	//
+	// errors:
+	// SystemError - System error.
+	Exec() error
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go
@@ -15,30 +15,35 @@ import (
 	"strings"
 	"sync"
 	"syscall"
+	"time"

 	"github.com/Sirupsen/logrus"
 	"github.com/golang/protobuf/proto"
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/criurpc"
+	"github.com/opencontainers/runc/libcontainer/system"
 	"github.com/opencontainers/runc/libcontainer/utils"
+	"github.com/syndtr/gocapability/capability"
 	"github.com/vishvananda/netlink/nl"
 )

 const stdioFdCount = 3

 type linuxContainer struct {
-	id            string
-	root          string
-	config        *configs.Config
-	cgroupManager cgroups.Manager
-	initPath      string
-	initArgs      []string
-	initProcess   parentProcess
-	criuPath      string
-	m             sync.Mutex
-	criuVersion   int
-	state         containerState
+	id                   string
+	root                 string
+	config               *configs.Config
+	cgroupManager        cgroups.Manager
+	initPath             string
+	initArgs             []string
+	initProcess          parentProcess
+	initProcessStartTime string
+	criuPath             string
+	m                    sync.Mutex
+	criuVersion          int
+	state                containerState
+	created              time.Time
 }

 // State represents a running container's state
@@ -59,7 +64,7 @@ type State struct {
 	ExternalDescriptors []string `json:"external_descriptors,omitempty"`
 }

-// A libcontainer container object.
+// Container is a libcontainer container object.
 //
 // Each container is thread-safe within the same process. Since a container can
 // be destroyed by a separate process, any function may return that the container
@@ -75,13 +80,13 @@ type Container interface {
 	// Systemerror - System error.
 	Checkpoint(criuOpts *CriuOpts) error

-	// Restore restores the checkpointed container to a running state using the criu(8) utiity.
+	// Restore restores the checkpointed container to a running state using the criu(8) utility.
 	//
 	// errors:
 	// Systemerror - System error.
 	Restore(process *Process, criuOpts *CriuOpts) error

-	// If the Container state is RUNNING or PAUSING, sets the Container state to PAUSING and pauses
+	// If the Container state is RUNNING, sets the Container state to PAUSING and pauses
 	// the execution of any user processes. Asynchronously, when the container finished being paused the
 	// state is changed to PAUSED.
 	// If the Container state is PAUSED, do nothing.
@@ -138,7 +143,7 @@ func (c *linuxContainer) State() (*State, error) {
 func (c *linuxContainer) Processes() ([]int, error) {
 	pids, err := c.cgroupManager.GetAllPids()
 	if err != nil {
-		return nil, newSystemError(err)
+		return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
 	}
 	return pids, nil
 }
@@ -149,14 +154,14 @@ func (c *linuxContainer) Stats() (*Stats, error) {
 		stats = &Stats{}
 	)
 	if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
-		return stats, newSystemError(err)
+		return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
 	}
 	for _, iface := range c.config.Networks {
 		switch iface.Type {
 		case "veth":
 			istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
 			if err != nil {
-				return stats, newSystemError(err)
+				return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
 			}
 			stats.Interfaces = append(stats.Interfaces, istats)
 		}
@@ -167,6 +172,13 @@ func (c *linuxContainer) Stats() (*Stats, error) {
 func (c *linuxContainer) Set(config configs.Config) error {
 	c.m.Lock()
 	defer c.m.Unlock()
+	status, err := c.currentStatus()
+	if err != nil {
+		return err
+	}
+	if status == Stopped {
+		return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
+	}
 	c.config = &config
 	return c.cgroupManager.Set(c.config)
 }
@@ -178,38 +190,90 @@ func (c *linuxContainer) Start(process *Process) error {
 	if err != nil {
 		return err
 	}
-	doInit := status == Destroyed
-	parent, err := c.newParentProcess(process, doInit)
+	return c.start(process, status == Stopped)
+}
+
+func (c *linuxContainer) Run(process *Process) error {
+	c.m.Lock()
+	defer c.m.Unlock()
+	status, err := c.currentStatus()
 	if err != nil {
-		return newSystemError(err)
+		return err
+	}
+	if err := c.start(process, status == Stopped); err != nil {
+		return err
+	}
+	if status == Stopped {
+		return c.exec()
+	}
+	return nil
+}
+
+func (c *linuxContainer) Exec() error {
+	c.m.Lock()
+	defer c.m.Unlock()
+	return c.exec()
+}
+
+func (c *linuxContainer) exec() error {
+	path := filepath.Join(c.root, execFifoFilename)
+	f, err := os.OpenFile(path, os.O_RDONLY, 0)
+	if err != nil {
+		return newSystemErrorWithCause(err, "open exec fifo for reading")
+	}
+	defer f.Close()
+	data, err := ioutil.ReadAll(f)
+	if err != nil {
+		return err
+	}
+	if len(data) > 0 {
+		os.Remove(path)
+		return nil
+	}
+	return fmt.Errorf("cannot start an already running container")
+}
+
+func (c *linuxContainer) start(process *Process, isInit bool) error {
+	parent, err := c.newParentProcess(process, isInit)
+	if err != nil {
+		return newSystemErrorWithCause(err, "creating new parent process")
 	}
 	if err := parent.start(); err != nil {
 		// terminate the process to ensure that it properly is reaped.
 		if err := parent.terminate(); err != nil {
 			logrus.Warn(err)
 		}
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "starting container process")
 	}
+	// generate a timestamp indicating when the container was started
+	c.created = time.Now().UTC()
 	c.state = &runningState{
 		c: c,
 	}
-	if doInit {
-		if err := c.updateState(parent); err != nil {
+	if isInit {
+		c.state = &createdState{
+			c: c,
+		}
+		state, err := c.updateState(parent)
+		if err != nil {
 			return err
 		}
+		c.initProcessStartTime = state.InitProcessStartTime
+
 		if c.config.Hooks != nil {
 			s := configs.HookState{
-				Version: c.config.Version,
-				ID:      c.id,
-				Pid:     parent.pid(),
-				Root:    c.config.Rootfs,
+				Version:    c.config.Version,
+				ID:         c.id,
+				Pid:        parent.pid(),
+				Root:       c.config.Rootfs,
+				BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
 			}
-			for _, hook := range c.config.Hooks.Poststart {
+			for i, hook := range c.config.Hooks.Poststart {
 				if err := hook.Run(s); err != nil {
 					if err := parent.terminate(); err != nil {
 						logrus.Warn(err)
 					}
-					return newSystemError(err)
+					return newSystemErrorWithCausef(err, "running poststart hook %d", i)
 				}
 			}
 		}
@@ -219,7 +283,7 @@ func (c *linuxContainer) Start(process *Process) error {

 func (c *linuxContainer) Signal(s os.Signal) error {
 	if err := c.initProcess.signal(s); err != nil {
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "signaling init process")
 	}
 	return nil
 }
@@ -227,19 +291,23 @@ func (c *linuxContainer) Signal(s os.Signal) error {
 func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
 	parentPipe, childPipe, err := newPipe()
 	if err != nil {
-		return nil, newSystemError(err)
+		return nil, newSystemErrorWithCause(err, "creating new init pipe")
 	}
-	cmd, err := c.commandTemplate(p, childPipe)
+	rootDir, err := os.Open(c.root)
 	if err != nil {
-		return nil, newSystemError(err)
+		return nil, err
+	}
+	cmd, err := c.commandTemplate(p, childPipe, rootDir)
+	if err != nil {
+		return nil, newSystemErrorWithCause(err, "creating new command template")
 	}
 	if !doInit {
-		return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
+		return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir)
 	}
-	return c.newInitProcess(p, cmd, parentPipe, childPipe)
+	return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
 }

-func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
+func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) {
 	cmd := &exec.Cmd{
 		Path: c.initPath,
 		Args: c.initArgs,
@@ -251,8 +319,10 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
 	if cmd.SysProcAttr == nil {
 		cmd.SysProcAttr = &syscall.SysProcAttr{}
 	}
-	cmd.ExtraFiles = append(p.ExtraFiles, childPipe)
-	cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
+	cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir)
+	cmd.Env = append(cmd.Env,
+		fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2),
+		fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
 	// NOTE: when running a container with no PID namespace and the parent process spawning the container is
 	// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
 	// even with the parent still running.
@@ -262,38 +332,42 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
 	return cmd, nil
 }

-func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
-	t := "_LIBCONTAINER_INITTYPE=" + string(initStandard)
-	cloneFlags := c.config.Namespaces.CloneFlags()
-	if cloneFlags&syscall.CLONE_NEWUSER != 0 {
-		if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
-			// user mappings are not supported
-			return nil, err
-		}
-		enableSetgroups(cmd.SysProcAttr)
-		// Default to root user when user namespaces are enabled.
-		if cmd.SysProcAttr.Credential == nil {
-			cmd.SysProcAttr.Credential = &syscall.Credential{}
+func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
+	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
+	nsMaps := make(map[configs.NamespaceType]string)
+	for _, ns := range c.config.Namespaces {
+		if ns.Path != "" {
+			nsMaps[ns.Type] = ns.Path
 		}
 	}
-	cmd.Env = append(cmd.Env, t)
-	cmd.SysProcAttr.Cloneflags = cloneFlags
+	_, sharePidns := nsMaps[configs.NEWPID]
+	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "")
+	if err != nil {
+		return nil, err
+	}
 	return &initProcess{
-		cmd:        cmd,
-		childPipe:  childPipe,
-		parentPipe: parentPipe,
-		manager:    c.cgroupManager,
-		config:     c.newInitConfig(p),
-		container:  c,
-		process:    p,
+		cmd:           cmd,
+		childPipe:     childPipe,
+		parentPipe:    parentPipe,
+		manager:       c.cgroupManager,
+		config:        c.newInitConfig(p),
+		container:     c,
+		process:       p,
+		bootstrapData: data,
+		sharePidns:    sharePidns,
+		rootDir:       rootDir,
 	}, nil
 }

-func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
+func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) {
 	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
+	state, err := c.currentState()
+	if err != nil {
+		return nil, newSystemErrorWithCause(err, "getting container's current state")
+	}
 	// for setns process, we dont have to set cloneflags as the process namespaces
 	// will only be set via setns syscall
-	data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
+	data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath)
 	if err != nil {
 		return nil, err
 	}
@@ -306,11 +380,12 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe,
 		config:        c.newInitConfig(p),
 		process:       p,
 		bootstrapData: data,
+		rootDir:       rootDir,
 	}, nil
 }

 func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
-	return &initConfig{
+	cfg := &initConfig{
 		Config:           c.config,
 		Args:             process.Args,
 		Env:              process.Env,
@@ -319,7 +394,26 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
 		Console:          process.consolePath,
 		Capabilities:     process.Capabilities,
 		PassedFilesCount: len(process.ExtraFiles),
+		ContainerId:      c.ID(),
+		NoNewPrivileges:  c.config.NoNewPrivileges,
+		AppArmorProfile:  c.config.AppArmorProfile,
+		ProcessLabel:     c.config.ProcessLabel,
+		Rlimits:          c.config.Rlimits,
+		ExecFifoPath:     filepath.Join(c.root, execFifoFilename),
 	}
+	if process.NoNewPrivileges != nil {
+		cfg.NoNewPrivileges = *process.NoNewPrivileges
+	}
+	if process.AppArmorProfile != "" {
+		cfg.AppArmorProfile = process.AppArmorProfile
+	}
+	if process.Label != "" {
+		cfg.ProcessLabel = process.Label
+	}
+	if len(process.Rlimits) > 0 {
+		cfg.Rlimits = process.Rlimits
+	}
+	return cfg
 }

 func newPipe() (parent *os.File, child *os.File, err error) {
@@ -343,15 +437,16 @@ func (c *linuxContainer) Pause() error {
 	if err != nil {
 		return err
 	}
-	if status != Running {
-		return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
+	switch status {
+	case Running, Created:
+		if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
+			return err
+		}
+		return c.state.transition(&pausedState{
+			c: c,
+		})
 	}
-	if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
-		return err
-	}
-	return c.state.transition(&pausedState{
-		c: c,
-	})
+	return newGenericError(fmt.Errorf("container not running: %s", status), ContainerNotRunning)
 }

 func (c *linuxContainer) Resume() error {
@@ -380,23 +475,13 @@ func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struc
 	return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
 }

-// XXX debug support, remove when debugging done.
-func addArgsFromEnv(evar string, args *[]string) {
-	if e := os.Getenv(evar); e != "" {
-		for _, f := range strings.Fields(e) {
-			*args = append(*args, f)
-		}
-	}
-	fmt.Printf(">>> criu %v\n", *args)
-}
-
-// check Criu version greater than or equal to min_version
-func (c *linuxContainer) checkCriuVersion(min_version string) error {
+// checkCriuVersion checks that the Criu version is greater than or equal to minVersion
+func (c *linuxContainer) checkCriuVersion(minVersion string) error {
 	var x, y, z, versionReq int

-	_, err := fmt.Sscanf(min_version, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
+	_, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
 	if err != nil {
-		_, err = fmt.Sscanf(min_version, "Version: %d.%d\n", &x, &y) // 1.6
+		_, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6
 	}
 	versionReq = x*10000 + y*100 + z

@@ -441,7 +526,7 @@ func (c *linuxContainer) checkCriuVersion(min_version string) error {
 	c.criuVersion = x*10000 + y*100 + z

 	if c.criuVersion < versionReq {
-		return fmt.Errorf("CRIU version must be %s or higher", min_version)
+		return fmt.Errorf("CRIU version must be %s or higher", minVersion)
 	}

 	return nil
@@ -514,6 +599,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
 		TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
 		ExtUnixSk:      proto.Bool(criuOpts.ExternalUnixConnections),
 		FileLocks:      proto.Bool(criuOpts.FileLocks),
+		EmptyNs:        proto.Uint32(criuOpts.EmptyNs),
 	}

 	// append optional criu opts, e.g., page-server and port
@@ -529,7 +615,8 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
 		if err := c.checkCriuVersion("1.7"); err != nil {
 			return err
 		}
-		rpcOpts.ManageCgroupsMode = proto.Uint32(uint32(criuOpts.ManageCgroupsMode))
+		mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
+		rpcOpts.ManageCgroupsMode = &mode
 	}

 	t := criurpc.CriuReqType_DUMP
@@ -587,6 +674,27 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo
 	req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
 }

+func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
+	for _, iface := range c.config.Networks {
+		switch iface.Type {
+		case "veth":
+			veth := new(criurpc.CriuVethPair)
+			veth.IfOut = proto.String(iface.HostInterfaceName)
+			veth.IfIn = proto.String(iface.Name)
+			req.Opts.Veths = append(req.Opts.Veths, veth)
+			break
+		case "loopback":
+			break
+		}
+	}
+	for _, i := range criuOpts.VethPairs {
+		veth := new(criurpc.CriuVethPair)
+		veth.IfOut = proto.String(i.HostInterfaceName)
+		veth.IfIn = proto.String(i.ContainerInterfaceName)
+		req.Opts.Veths = append(req.Opts.Veths, veth)
+	}
+}
+
 func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
 	c.m.Lock()
 	defer c.m.Unlock()
@@ -650,6 +758,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
 			ExtUnixSk:      proto.Bool(criuOpts.ExternalUnixConnections),
 			TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
 			FileLocks:      proto.Bool(criuOpts.FileLocks),
+			EmptyNs:        proto.Uint32(criuOpts.EmptyNs),
 		},
 	}

@@ -669,23 +778,9 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
 			break
 		}
 	}
-	for _, iface := range c.config.Networks {
-		switch iface.Type {
-		case "veth":
-			veth := new(criurpc.CriuVethPair)
-			veth.IfOut = proto.String(iface.HostInterfaceName)
-			veth.IfIn = proto.String(iface.Name)
-			req.Opts.Veths = append(req.Opts.Veths, veth)
-			break
-		case "loopback":
-			break
-		}
-	}
-	for _, i := range criuOpts.VethPairs {
-		veth := new(criurpc.CriuVethPair)
-		veth.IfOut = proto.String(i.HostInterfaceName)
-		veth.IfIn = proto.String(i.ContainerInterfaceName)
-		req.Opts.Veths = append(req.Opts.Veths, veth)
+
+	if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
+		c.restoreNetwork(req, criuOpts)
 	}

 	// append optional manage cgroups mode
@@ -693,7 +788,8 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
 		if err := c.checkCriuVersion("1.7"); err != nil {
 			return err
 		}
-		req.Opts.ManageCgroupsMode = proto.Uint32(uint32(criuOpts.ManageCgroupsMode))
+		mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
+		req.Opts.ManageCgroupsMode = &mode
 	}

 	var (
@@ -850,7 +946,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
 			if err != nil {
 				return err
 			}
-			n, err = criuClient.Write(data)
+			_, err = criuClient.Write(data)
 			if err != nil {
 				return err
 			}
@@ -925,6 +1021,20 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
 		if err := lockNetwork(c.config); err != nil {
 			return err
 		}
+	case notify.GetScript() == "setup-namespaces":
+		if c.config.Hooks != nil {
+			s := configs.HookState{
+				Version: c.config.Version,
+				ID:      c.id,
+				Pid:     int(notify.GetPid()),
+				Root:    c.config.Rootfs,
+			}
+			for i, hook := range c.config.Hooks.Prestart {
+				if err := hook.Run(s); err != nil {
+					return newSystemErrorWithCausef(err, "running prestart hook %d", i)
+				}
+			}
+		}
 	case notify.GetScript() == "post-restore":
 		pid := notify.GetPid()
 		r, err := newRestoredProcess(int(pid), fds)
@@ -938,7 +1048,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
 		}); err != nil {
 			return err
 		}
-		if err := c.updateState(r); err != nil {
+		if _, err := c.updateState(r); err != nil {
 			return err
 		}
 		if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
@@ -950,13 +1060,17 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
 	return nil
 }

-func (c *linuxContainer) updateState(process parentProcess) error {
+func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
 	c.initProcess = process
 	state, err := c.currentState()
 	if err != nil {
-		return err
+		return nil, err
 	}
-	return c.saveState(state)
+	err = c.saveState(state)
+	if err != nil {
+		return nil, err
+	}
+	return state, nil
 }

 func (c *linuxContainer) saveState(s *State) error {
@@ -991,37 +1105,75 @@ func (c *linuxContainer) refreshState() error {
 	if paused {
 		return c.state.transition(&pausedState{c: c})
 	}
-	running, err := c.isRunning()
+	t, err := c.runType()
 	if err != nil {
 		return err
 	}
-	if running {
+	switch t {
+	case Created:
+		return c.state.transition(&createdState{c: c})
+	case Running:
 		return c.state.transition(&runningState{c: c})
 	}
 	return c.state.transition(&stoppedState{c: c})
 }

-func (c *linuxContainer) isRunning() (bool, error) {
-	if c.initProcess == nil {
+// doesInitProcessExist checks if the init process is still the same process
+// as the initial one. It could happen that the original process has exited
+// and a new process has been created with the same pid; in this case, the
+// container would already be stopped.
+func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) {
+	startTime, err := system.GetProcessStartTime(initPid)
+	if err != nil {
+		return false, newSystemErrorWithCausef(err, "getting init process %d start time", initPid)
+	}
+	if c.initProcessStartTime != startTime {
 		return false, nil
 	}
-	// return Running if the init process is alive
-	if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
-		if err == syscall.ESRCH {
-			return false, nil
-		}
-		return false, newSystemError(err)
-	}
 	return true, nil
 }

+func (c *linuxContainer) runType() (Status, error) {
+	if c.initProcess == nil {
+		return Stopped, nil
+	}
+	pid := c.initProcess.pid()
+	// return Running if the init process is alive
+	if err := syscall.Kill(pid, 0); err != nil {
+		if err == syscall.ESRCH {
+			// It means the process does not exist anymore, could happen when the
+			// process exited just when we call the function, we should not return
+			// error in this case.
+			return Stopped, nil
+		}
+		return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid)
+	}
+	// check if the process is still the original init process.
+	exist, err := c.doesInitProcessExist(pid)
+	if !exist || err != nil {
+		return Stopped, err
+	}
+	// check if the process that is running is the init process or the user's process.
+	// this is the difference between the container Running and Created.
+	environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid))
+	if err != nil {
+		return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid)
+	}
+	check := []byte("_LIBCONTAINER")
+	if bytes.Contains(environ, check) {
+		return Created, nil
+	}
+	return Running, nil
+}
+
 func (c *linuxContainer) isPaused() (bool, error) {
 	data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
 	if err != nil {
+		// If freezer cgroup is not mounted, the container would just be not paused.
 		if os.IsNotExist(err) {
 			return false, nil
 		}
-		return false, newSystemError(err)
+		return false, newSystemErrorWithCause(err, "checking if container is paused")
 	}
 	return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
 }
@@ -1043,6 +1195,7 @@ func (c *linuxContainer) currentState() (*State, error) {
 			Config:               *c.config,
 			InitProcessPid:       pid,
 			InitProcessStartTime: startTime,
+			Created:              c.created,
 		},
 		CgroupPaths:         c.cgroupManager.GetPaths(),
 		NamespacePaths:      make(map[configs.NamespaceType]string),
@@ -1053,6 +1206,9 @@ func (c *linuxContainer) currentState() (*State, error) {
 			state.NamespacePaths[ns.Type] = ns.GetPath(pid)
 		}
 		for _, nsType := range configs.NamespaceTypes() {
+			if !configs.IsNamespaceSupported(nsType) {
+				continue
+			}
 			if _, ok := state.NamespacePaths[nsType]; !ok {
 				ns := configs.Namespace{Type: nsType}
 				state.NamespacePaths[ns.Type] = ns.GetPath(pid)
@@ -1062,18 +1218,69 @@ func (c *linuxContainer) currentState() (*State, error) {
 	return state, nil
 }

-// bootstrapData encodes the necessary data in netlink binary format as a io.Reader.
-// Consumer can write the data to a bootstrap program such as one that uses
-// nsenter package to bootstrap the container's init process correctly, i.e. with
-// correct namespaces, uid/gid mapping etc.
-func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) {
+// orderNamespacePaths sorts namespace paths into a list of paths that we
+// can setns in order.
+func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
+	paths := []string{}
+	nsTypes := []configs.NamespaceType{
+		configs.NEWIPC,
+		configs.NEWUTS,
+		configs.NEWNET,
+		configs.NEWPID,
+		configs.NEWNS,
+	}
+	// join userns if the init process explicitly requires NEWUSER
+	if c.config.Namespaces.Contains(configs.NEWUSER) {
+		nsTypes = append(nsTypes, configs.NEWUSER)
+	}
+	for _, nsType := range nsTypes {
+		if p, ok := namespaces[nsType]; ok && p != "" {
+			// check if the requested namespace is supported
+			if !configs.IsNamespaceSupported(nsType) {
+				return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType))
+			}
+			// only set to join this namespace if it exists
+			if _, err := os.Lstat(p); err != nil {
+				return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
+			}
+			// do not allow namespace path with comma as we use it to separate
+			// the namespace paths
+			if strings.ContainsRune(p, ',') {
+				return nil, newSystemError(fmt.Errorf("invalid path %s", p))
+			}
+			paths = append(paths, p)
+		}
+	}
+	return paths, nil
+}
+
+func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
+	data := bytes.NewBuffer(nil)
+	for _, im := range idMap {
+		line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
+		if _, err := data.WriteString(line); err != nil {
+			return nil, err
+		}
+	}
+	return data.Bytes(), nil
+}
+
+// bootstrapData encodes the necessary data in netlink binary format
+// as a io.Reader.
+// Consumer can write the data to a bootstrap program
+// such as one that uses nsenter package to bootstrap the container's
+// init process correctly, i.e. with correct namespaces, uid/gid
+// mapping etc.
+func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) {
 	// create the netlink message
 	r := nl.NewNetlinkRequest(int(InitMsg), 0)
-	// write pid
+
+	// write cloneFlags
 	r.AddData(&Int32msg{
-		Type:  PidAttr,
-		Value: uint32(pid),
+		Type:  CloneFlagsAttr,
+		Value: uint32(cloneFlags),
 	})
+
 	// write console path
 	if consolePath != "" {
 		r.AddData(&Bytemsg{
@@ -1081,5 +1288,57 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath
 			Value: []byte(consolePath),
 		})
 	}
+
+	// write custom namespace paths
+	if len(nsMaps) > 0 {
+		nsPaths, err := c.orderNamespacePaths(nsMaps)
+		if err != nil {
+			return nil, err
+		}
+		r.AddData(&Bytemsg{
+			Type:  NsPathsAttr,
+			Value: []byte(strings.Join(nsPaths, ",")),
+		})
+	}
+
+	// write uid/gid mappings only when we are not joining an existing user ns
+	_, joinExistingUser := nsMaps[configs.NEWUSER]
+	if !joinExistingUser {
+		// write uid mappings
+		if len(c.config.UidMappings) > 0 {
+			b, err := encodeIDMapping(c.config.UidMappings)
+			if err != nil {
+				return nil, err
+			}
+			r.AddData(&Bytemsg{
+				Type:  UidmapAttr,
+				Value: b,
+			})
+		}
+
+		// write gid mappings
+		if len(c.config.GidMappings) > 0 {
+			b, err := encodeIDMapping(c.config.GidMappings)
+			if err != nil {
+				return nil, err
+			}
+			r.AddData(&Bytemsg{
+				Type:  GidmapAttr,
+				Value: b,
+			})
+			// check if we have CAP_SETGID to setgroup properly
+			pid, err := capability.NewPid(os.Getpid())
+			if err != nil {
+				return nil, err
+			}
+			if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
+				r.AddData(&Boolmsg{
+					Type:  SetgroupAttr,
+					Value: true,
+				})
+			}
+		}
+	}
+
 	return bytes.NewReader(r.Serialize()), nil
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/container_nouserns_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/container_nouserns_linux.go
@@ -1,13 +0,0 @@
-// +build !go1.4
-
-package libcontainer
-
-import (
-	"fmt"
-	"syscall"
-)
-
-// not available before go 1.4
-func (c *linuxContainer) addUidGidMappings(sys *syscall.SysProcAttr) error {
-	return fmt.Errorf("User namespace is not supported in golang < 1.4")
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go
@@ -0,0 +1,20 @@
+package libcontainer
+
+// State represents a running container's state
+type State struct {
+	BaseState
+
+	// Platform specific fields below here
+}
+
+// Container is a libcontainer container object.
+//
+// Each container is thread-safe within the same process. Since a container can
+// be destroyed by a separate process, any function may return that the container
+// was not found.
+type Container interface {
+	BaseContainer
+
+	// Methods below here are platform specific
+
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/container_userns_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/container_userns_linux.go
@@ -1,26 +0,0 @@
-// +build go1.4
-
-package libcontainer
-
-import "syscall"
-
-// Converts IDMap to SysProcIDMap array and adds it to SysProcAttr.
-func (c *linuxContainer) addUidGidMappings(sys *syscall.SysProcAttr) error {
-	if c.config.UidMappings != nil {
-		sys.UidMappings = make([]syscall.SysProcIDMap, len(c.config.UidMappings))
-		for i, um := range c.config.UidMappings {
-			sys.UidMappings[i].ContainerID = um.ContainerID
-			sys.UidMappings[i].HostID = um.HostID
-			sys.UidMappings[i].Size = um.Size
-		}
-	}
-	if c.config.GidMappings != nil {
-		sys.GidMappings = make([]syscall.SysProcIDMap, len(c.config.GidMappings))
-		for i, gm := range c.config.GidMappings {
-			sys.GidMappings[i].ContainerID = gm.ContainerID
-			sys.GidMappings[i].HostID = gm.HostID
-			sys.GidMappings[i].Size = gm.Size
-		}
-	}
-	return nil
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go
@@ -3,13 +3,13 @@
 package libcontainer

 // cgroup restoring strategy provided by criu
-type cg_mode uint32
+type cgMode uint32

 const (
-	CRIU_CG_MODE_SOFT    cg_mode = 3 + iota // restore cgroup properties if only dir created by criu
-	CRIU_CG_MODE_FULL                       // always restore all cgroups and their properties
-	CRIU_CG_MODE_STRICT                     // restore all, requiring them to not present in the system
-	CRIU_CG_MODE_DEFAULT                    // the same as CRIU_CG_MODE_SOFT
+	CRIU_CG_MODE_SOFT    cgMode = 3 + iota // restore cgroup properties only if the dir was created by criu
+	CRIU_CG_MODE_FULL                      // always restore all cgroups and their properties
+	CRIU_CG_MODE_STRICT                    // restore all, requiring them to not be present in the system
+	CRIU_CG_MODE_DEFAULT                   // the same as CRIU_CG_MODE_SOFT
 )

 type CriuPageServerInfo struct {
@@ -32,5 +32,6 @@ type CriuOpts struct {
 	FileLocks               bool               // handle file locks, for safety
 	PageServer              CriuPageServerInfo // allow to dump to criu page server
 	VethPairs               []VethPairName     // pass the veth to criu when restore
-	ManageCgroupsMode       cg_mode            // dump or restore cgroup mode
+	ManageCgroupsMode       cgMode             // dump or restore cgroup mode
+	EmptyNs                 uint32             // don't c/r properties for namespace from this mask
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go
@@ -19,6 +19,7 @@ It has these top-level messages:
 	CriuDumpResp
 	CriuRestoreResp
 	CriuNotify
+	CriuFeatures
 	CriuReq
 	CriuResp
 */
@@ -31,6 +32,54 @@ import math "math"
 var _ = proto.Marshal
 var _ = math.Inf

+type CriuCgMode int32
+
+const (
+	CriuCgMode_IGNORE  CriuCgMode = 0
+	CriuCgMode_NONE    CriuCgMode = 1
+	CriuCgMode_PROPS   CriuCgMode = 2
+	CriuCgMode_SOFT    CriuCgMode = 3
+	CriuCgMode_FULL    CriuCgMode = 4
+	CriuCgMode_STRICT  CriuCgMode = 5
+	CriuCgMode_DEFAULT CriuCgMode = 6
+)
+
+var CriuCgMode_name = map[int32]string{
+	0: "IGNORE",
+	1: "NONE",
+	2: "PROPS",
+	3: "SOFT",
+	4: "FULL",
+	5: "STRICT",
+	6: "DEFAULT",
+}
+var CriuCgMode_value = map[string]int32{
+	"IGNORE":  0,
+	"NONE":    1,
+	"PROPS":   2,
+	"SOFT":    3,
+	"FULL":    4,
+	"STRICT":  5,
+	"DEFAULT": 6,
+}
+
+func (x CriuCgMode) Enum() *CriuCgMode {
+	p := new(CriuCgMode)
+	*p = x
+	return p
+}
+func (x CriuCgMode) String() string {
+	return proto.EnumName(CriuCgMode_name, int32(x))
+}
+func (x *CriuCgMode) UnmarshalJSON(data []byte) error {
+	value, err := proto.UnmarshalJSONEnum(CriuCgMode_value, data, "CriuCgMode")
+	if err != nil {
+		return err
+	}
+	*x = CriuCgMode(value)
+	return nil
+}
+
 type CriuReqType int32

 const (
@@ -43,6 +92,7 @@ const (
 	CriuReqType_NOTIFY        CriuReqType = 6
 	CriuReqType_CPUINFO_DUMP  CriuReqType = 7
 	CriuReqType_CPUINFO_CHECK CriuReqType = 8
+	CriuReqType_FEATURE_CHECK CriuReqType = 9
 )

 var CriuReqType_name = map[int32]string{
@@ -55,6 +105,7 @@ var CriuReqType_name = map[int32]string{
 	6: "NOTIFY",
 	7: "CPUINFO_DUMP",
 	8: "CPUINFO_CHECK",
+	9: "FEATURE_CHECK",
 }
 var CriuReqType_value = map[string]int32{
 	"EMPTY":         0,
@@ -66,6 +117,7 @@ var CriuReqType_value = map[string]int32{
 	"NOTIFY":        6,
 	"CPUINFO_DUMP":  7,
 	"CPUINFO_CHECK": 8,
+	"FEATURE_CHECK": 9,
 }

 func (x CriuReqType) Enum() *CriuReqType {
@@ -271,7 +323,12 @@ type CriuOpts struct {
 	SkipMnt           []string            `protobuf:"bytes,31,rep,name=skip_mnt" json:"skip_mnt,omitempty"`
 	EnableFs          []string            `protobuf:"bytes,32,rep,name=enable_fs" json:"enable_fs,omitempty"`
 	UnixSkIno         []*UnixSk           `protobuf:"bytes,33,rep,name=unix_sk_ino" json:"unix_sk_ino,omitempty"`
-	ManageCgroupsMode *uint32             `protobuf:"varint,34,opt,name=manage_cgroups_mode" json:"manage_cgroups_mode,omitempty"`
+	ManageCgroupsMode *CriuCgMode         `protobuf:"varint,34,opt,name=manage_cgroups_mode,enum=CriuCgMode" json:"manage_cgroups_mode,omitempty"`
+	GhostLimit        *uint32             `protobuf:"varint,35,opt,name=ghost_limit,def=1048576" json:"ghost_limit,omitempty"`
+	IrmapScanPaths    []string            `protobuf:"bytes,36,rep,name=irmap_scan_paths" json:"irmap_scan_paths,omitempty"`
+	External          []string            `protobuf:"bytes,37,rep,name=external" json:"external,omitempty"`
+	EmptyNs           *uint32             `protobuf:"varint,38,opt,name=empty_ns" json:"empty_ns,omitempty"`
+	NoSeccomp         *bool               `protobuf:"varint,39,opt,name=no_seccomp" json:"no_seccomp,omitempty"`
 	XXX_unrecognized  []byte              `json:"-"`
 }

@@ -281,6 +338,7 @@ func (*CriuOpts) ProtoMessage()    {}

 const Default_CriuOpts_LogLevel int32 = 2
 const Default_CriuOpts_CpuCap uint32 = 4294967295
+const Default_CriuOpts_GhostLimit uint32 = 1048576

 func (m *CriuOpts) GetImagesDirFd() int32 {
 	if m != nil && m.ImagesDirFd != nil {
@@ -513,13 +571,48 @@ func (m *CriuOpts) GetUnixSkIno() []*UnixSk {
 	return nil
 }

-func (m *CriuOpts) GetManageCgroupsMode() uint32 {
+func (m *CriuOpts) GetManageCgroupsMode() CriuCgMode {
 	if m != nil && m.ManageCgroupsMode != nil {
 		return *m.ManageCgroupsMode
 	}
+	return CriuCgMode_IGNORE
+}
+
+func (m *CriuOpts) GetGhostLimit() uint32 {
+	if m != nil && m.GhostLimit != nil {
+		return *m.GhostLimit
+	}
+	return Default_CriuOpts_GhostLimit
+}
+
+func (m *CriuOpts) GetIrmapScanPaths() []string {
+	if m != nil {
+		return m.IrmapScanPaths
+	}
+	return nil
+}
+
+func (m *CriuOpts) GetExternal() []string {
+	if m != nil {
+		return m.External
+	}
+	return nil
+}
+
+func (m *CriuOpts) GetEmptyNs() uint32 {
+	if m != nil && m.EmptyNs != nil {
+		return *m.EmptyNs
+	}
 	return 0
 }

+func (m *CriuOpts) GetNoSeccomp() bool {
+	if m != nil && m.NoSeccomp != nil {
+		return *m.NoSeccomp
+	}
+	return false
+}
+
 type CriuDumpResp struct {
 	Restored         *bool  `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"`
 	XXX_unrecognized []byte `json:"-"`
@@ -576,6 +669,25 @@ func (m *CriuNotify) GetPid() int32 {
 	return 0
 }

+//
+// List of features which can be queried via
+// CRIU_REQ_TYPE__FEATURE_CHECK
+type CriuFeatures struct {
+	MemTrack         *bool  `protobuf:"varint,1,opt,name=mem_track" json:"mem_track,omitempty"`
+	XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *CriuFeatures) Reset()         { *m = CriuFeatures{} }
+func (m *CriuFeatures) String() string { return proto.CompactTextString(m) }
+func (*CriuFeatures) ProtoMessage()    {}
+
+func (m *CriuFeatures) GetMemTrack() bool {
+	if m != nil && m.MemTrack != nil {
+		return *m.MemTrack
+	}
+	return false
+}
+
 type CriuReq struct {
 	Type          *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"`
 	Opts          *CriuOpts    `protobuf:"bytes,2,opt,name=opts" json:"opts,omitempty"`
@@ -584,8 +696,13 @@ type CriuReq struct {
 	// When set service won't close the connection but
 	// will wait for more req-s to appear. Works not
 	// for all request types.
-	KeepOpen         *bool  `protobuf:"varint,4,opt,name=keep_open" json:"keep_open,omitempty"`
-	XXX_unrecognized []byte `json:"-"`
+	KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open" json:"keep_open,omitempty"`
+	//
+	// 'features' can be used to query which features
+	// are supported by the installed criu/kernel
+	// via RPC.
+	Features         *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"`
+	XXX_unrecognized []byte        `json:"-"`
 }

 func (m *CriuReq) Reset()         { *m = CriuReq{} }
@@ -620,6 +737,13 @@ func (m *CriuReq) GetKeepOpen() bool {
 	return false
 }

+func (m *CriuReq) GetFeatures() *CriuFeatures {
+	if m != nil {
+		return m.Features
+	}
+	return nil
+}
+
 type CriuResp struct {
 	Type             *CriuReqType        `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"`
 	Success          *bool               `protobuf:"varint,2,req,name=success" json:"success,omitempty"`
@@ -628,6 +752,7 @@ type CriuResp struct {
 	Notify           *CriuNotify         `protobuf:"bytes,5,opt,name=notify" json:"notify,omitempty"`
 	Ps               *CriuPageServerInfo `protobuf:"bytes,6,opt,name=ps" json:"ps,omitempty"`
 	CrErrno          *int32              `protobuf:"varint,7,opt,name=cr_errno" json:"cr_errno,omitempty"`
+	Features         *CriuFeatures       `protobuf:"bytes,8,opt,name=features" json:"features,omitempty"`
 	XXX_unrecognized []byte              `json:"-"`
 }

@@ -684,6 +809,14 @@ func (m *CriuResp) GetCrErrno() int32 {
 	return 0
 }

+func (m *CriuResp) GetFeatures() *CriuFeatures {
+	if m != nil {
+		return m.Features
+	}
+	return nil
+}
+
 func init() {
+	proto.RegisterEnum("CriuCgMode", CriuCgMode_name, CriuCgMode_value)
 	proto.RegisterEnum("CriuReqType", CriuReqType_name, CriuReqType_value)
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.proto
+++ b/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.proto
@@ -29,6 +29,16 @@ message unix_sk {
 	required uint32		inode 	= 1;
 };

+enum criu_cg_mode {
+	IGNORE	= 0;
+	NONE	= 1;
+	PROPS	= 2;
+	SOFT	= 3;
+	FULL	= 4;
+	STRICT	= 5;
+	DEFAULT = 6;
+};
+
 message criu_opts {
 	required int32			images_dir_fd	= 1;
 	optional int32			pid		= 2; /* if not set on dump, will dump requesting process */
@@ -75,7 +85,12 @@ message criu_opts {

 	repeated unix_sk                unix_sk_ino     = 33;

-	optional uint32                 manage_cgroups_mode = 34;
+	optional criu_cg_mode		manage_cgroups_mode = 34;
+	optional uint32			ghost_limit	= 35 [default = 0x100000];
+	repeated string			irmap_scan_paths = 36;
+	repeated string			external	= 37;
+	optional uint32			empty_ns	= 38;
+	optional bool			no_seccomp	= 39;
 }

 message criu_dump_resp {
@@ -103,6 +118,16 @@ enum criu_req_type {

 	CPUINFO_DUMP	= 7;
 	CPUINFO_CHECK	= 8;
+
+	FEATURE_CHECK	= 9;
+}
+
+/*
+ * List of features which can be queried via
+ * CRIU_REQ_TYPE__FEATURE_CHECK
+ */
+message criu_features {
+	optional bool			mem_track	= 1;
 }

 /*
@@ -122,11 +147,17 @@ message criu_req {
 	 * for all request types.
 	 */
 	optional bool			keep_open	= 4;
+	/*
+	 * 'features' can be used to query which features
+	 * are supported by the installed criu/kernel
+	 * via RPC.
+	 */
+	optional criu_features		features	= 5;
 }

 /*
- * Responce -- it states whether the request was served
- * and additional request-specific informarion
+ * Response -- it states whether the request was served
+ * and additional request-specific information
 */

 message criu_resp {
@@ -139,4 +170,5 @@ message criu_resp {
 	optional criu_page_server_info	ps		= 6;

 	optional int32			cr_errno	= 7;
+	optional criu_features		features	= 8;
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/error.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/error.go
@@ -2,7 +2,7 @@ package libcontainer

 import "io"

-// API error code type.
+// ErrorCode is the API error code type.
 type ErrorCode int

 // API error codes.
@@ -19,7 +19,7 @@ const (
 	ContainerNotPaused

 	// Process errors
-	ProcessNotExecuted
+	NoProcessOps

 	// Common errors
 	ConfigInvalid
@@ -49,12 +49,14 @@ func (c ErrorCode) String() string {
 		return "Console exists for process"
 	case ContainerNotPaused:
 		return "Container is not paused"
+	case NoProcessOps:
+		return "No process operations"
 	default:
 		return "Unknown error"
 	}
 }

-// API Error type.
+// Error is the API error type.
 type Error interface {
 	error

--- a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go
@@ -9,6 +9,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"regexp"
+	"runtime/debug"
 	"strconv"
 	"syscall"

@@ -22,11 +23,12 @@ import (
 )

 const (
-	stateFilename = "state.json"
+	stateFilename    = "state.json"
+	execFifoFilename = "exec.fifo"
 )

 var (
-	idRegex  = regexp.MustCompile(`^[\w_-]+$`)
+	idRegex  = regexp.MustCompile(`^[\w+-\.]+$`)
 	maxIdLen = 1024
 )

@@ -157,13 +159,34 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
 	if err := l.Validator.Validate(config); err != nil {
 		return nil, newGenericError(err, ConfigInvalid)
 	}
+	uid, err := config.HostUID()
+	if err != nil {
+		return nil, newGenericError(err, SystemError)
+	}
+	gid, err := config.HostGID()
+	if err != nil {
+		return nil, newGenericError(err, SystemError)
+	}
 	containerRoot := filepath.Join(l.Root, id)
 	if _, err := os.Stat(containerRoot); err == nil {
 		return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
 	} else if !os.IsNotExist(err) {
 		return nil, newGenericError(err, SystemError)
 	}
-	if err := os.MkdirAll(containerRoot, 0700); err != nil {
+	if err := os.MkdirAll(containerRoot, 0711); err != nil {
+		return nil, newGenericError(err, SystemError)
+	}
+	if err := os.Chown(containerRoot, uid, gid); err != nil {
+		return nil, newGenericError(err, SystemError)
+	}
+	fifoName := filepath.Join(containerRoot, execFifoFilename)
+	oldMask := syscall.Umask(0000)
+	if err := syscall.Mkfifo(fifoName, 0622); err != nil {
+		syscall.Umask(oldMask)
+		return nil, newGenericError(err, SystemError)
+	}
+	syscall.Umask(oldMask)
+	if err := os.Chown(fifoName, uid, gid); err != nil {
 		return nil, newGenericError(err, SystemError)
 	}
 	c := &linuxContainer{
@@ -194,16 +217,18 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
 		fds:              state.ExternalDescriptors,
 	}
 	c := &linuxContainer{
-		initProcess:   r,
-		id:            id,
-		config:        &state.Config,
-		initPath:      l.InitPath,
-		initArgs:      l.InitArgs,
-		criuPath:      l.CriuPath,
-		cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
-		root:          containerRoot,
+		initProcess:          r,
+		initProcessStartTime: state.InitProcessStartTime,
+		id:                   id,
+		config:               &state.Config,
+		initPath:             l.InitPath,
+		initArgs:             l.InitArgs,
+		criuPath:             l.CriuPath,
+		cgroupManager:        l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
+		root:                 containerRoot,
+		created:              state.Created,
 	}
-	c.state = &createdState{c: c, s: Created}
+	c.state = &loadedState{c: c}
 	if err := c.refreshState(); err != nil {
 		return nil, err
 	}
@@ -217,10 +242,18 @@ func (l *LinuxFactory) Type() string {
 // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
 // This is a low level implementation detail of the reexec and should not be consumed externally
 func (l *LinuxFactory) StartInitialization() (err error) {
-	fdStr := os.Getenv("_LIBCONTAINER_INITPIPE")
-	pipefd, err := strconv.Atoi(fdStr)
-	if err != nil {
-		return fmt.Errorf("error converting env var _LIBCONTAINER_INITPIPE(%q) to an int: %s", fdStr, err)
+	var pipefd, rootfd int
+	for k, v := range map[string]*int{
+		"_LIBCONTAINER_INITPIPE": &pipefd,
+		"_LIBCONTAINER_STATEDIR": &rootfd,
+	} {
+		s := os.Getenv(k)
+
+		i, err := strconv.Atoi(s)
+		if err != nil {
+			return fmt.Errorf("unable to convert %s=%s to int", k, s)
+		}
+		*v = i
 	}
 	var (
 		pipe = os.NewFile(uintptr(pipefd), "pipe")
@@ -229,29 +262,31 @@ func (l *LinuxFactory) StartInitialization() (err error) {
 	// clear the current process's environment to clean any libcontainer
 	// specific env vars.
 	os.Clearenv()
+
 	var i initer
 	defer func() {
-		// if we have an error during the initialization of the container's init then send it back to the
-		// parent process in the form of an initError.
-		if err != nil {
-			if _, ok := i.(*linuxStandardInit); ok {
-				//  Synchronisation only necessary for standard init.
-				if err := utils.WriteJSON(pipe, syncT{procError}); err != nil {
-					panic(err)
-				}
-			}
-			if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
-				panic(err)
-			}
-		} else {
-			if err := utils.WriteJSON(pipe, syncT{procStart}); err != nil {
+		// We have an error during the initialization of the container's init,
+		// send it back to the parent process in the form of an initError.
+		// If container's init succeeded, syscall.Exec will not return, hence
+		// this defer function will never be called.
+		if _, ok := i.(*linuxStandardInit); ok {
+			//  Synchronisation only necessary for standard init.
+			if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
 				panic(err)
 			}
 		}
+		if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
+			panic(err)
+		}
 		// ensure that this pipe is always closed
 		pipe.Close()
 	}()
-	i, err = newContainerInit(it, pipe)
+	defer func() {
+		if e := recover(); e != nil {
+			err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
+		}
+	}()
+	i, err = newContainerInit(it, pipe, rootfd)
 	if err != nil {
 		return err
 	}
--- a/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go
@@ -14,8 +14,9 @@ type syncType uint8
 const (
 	procReady syncType = iota
 	procError
-	procStart
 	procRun
+	procHooks
+	procResume
 )

 type syncT struct {
@@ -51,6 +52,21 @@ func newGenericError(err error, c ErrorCode) Error {
 }

 func newSystemError(err error) Error {
+	return createSystemError(err, "")
+}
+
+func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error {
+	return createSystemError(err, fmt.Sprintf(cause, v...))
+}
+
+func newSystemErrorWithCause(err error, cause string) Error {
+	return createSystemError(err, cause)
+}
+
+// createSystemError creates the specified error with the correct number of
+// stack frames skipped. This is only to be called by the other functions for
+// formatting the error.
+func createSystemError(err error, cause string) Error {
 	if le, ok := err.(Error); ok {
 		return le
 	}
@@ -58,7 +74,8 @@ func newSystemError(err error) Error {
 		Timestamp: time.Now(),
 		Err:       err,
 		ECode:     SystemError,
-		Stack:     stacktrace.Capture(1),
+		Cause:     cause,
+		Stack:     stacktrace.Capture(2),
 	}
 	if err != nil {
 		gerr.Message = err.Error()
@@ -70,12 +87,17 @@ type genericError struct {
 	Timestamp time.Time
 	ECode     ErrorCode
 	Err       error `json:"-"`
+	Cause     string
 	Message   string
 	Stack     stacktrace.Stacktrace
 }

 func (e *genericError) Error() string {
-	return fmt.Sprintf("[%d] %s: %s", e.ECode, e.ECode, e.Message)
+	if e.Cause == "" {
+		return e.Message
+	}
+	frame := e.Stack.Frames[0]
+	return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message)
 }

 func (e *genericError) Code() ErrorCode {
--- a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go
@@ -44,22 +44,28 @@ type network struct {

 // initConfig is used for transferring parameters from Exec() to Init()
 type initConfig struct {
-	Args             []string        `json:"args"`
-	Env              []string        `json:"env"`
-	Cwd              string          `json:"cwd"`
-	Capabilities     []string        `json:"capabilities"`
-	User             string          `json:"user"`
-	Config           *configs.Config `json:"config"`
-	Console          string          `json:"console"`
-	Networks         []*network      `json:"network"`
-	PassedFilesCount int             `json:"passed_files_count"`
+	Args             []string         `json:"args"`
+	Env              []string         `json:"env"`
+	Cwd              string           `json:"cwd"`
+	Capabilities     []string         `json:"capabilities"`
+	ProcessLabel     string           `json:"process_label"`
+	AppArmorProfile  string           `json:"apparmor_profile"`
+	NoNewPrivileges  bool             `json:"no_new_privileges"`
+	User             string           `json:"user"`
+	Config           *configs.Config  `json:"config"`
+	Console          string           `json:"console"`
+	Networks         []*network       `json:"network"`
+	PassedFilesCount int              `json:"passed_files_count"`
+	ContainerId      string           `json:"containerid"`
+	Rlimits          []configs.Rlimit `json:"rlimits"`
+	ExecFifoPath     string           `json:"start_pipe_path"`
 }

 type initer interface {
 	Init() error
 }

-func newContainerInit(t initType, pipe *os.File) (initer, error) {
+func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) {
 	var config *initConfig
 	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
 		return nil, err
@@ -74,9 +80,10 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
 		}, nil
 	case initStandard:
 		return &linuxStandardInit{
-			pipe:      pipe,
-			parentPid: syscall.Getppid(),
-			config:    config,
+			pipe:       pipe,
+			parentPid:  syscall.Getppid(),
+			config:     config,
+			stateDirFD: stateDirFD,
 		}, nil
 	}
 	return nil, fmt.Errorf("unknown init type %q", t)
@@ -163,20 +170,22 @@ func syncParentReady(pipe io.ReadWriter) error {
 	return nil
 }

-// joinExistingNamespaces gets all the namespace paths specified for the container and
-// does a setns on the namespace fd so that the current process joins the namespace.
-func joinExistingNamespaces(namespaces []configs.Namespace) error {
-	for _, ns := range namespaces {
-		if ns.Path != "" {
-			f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0)
-			if err != nil {
-				return err
-			}
-			err = system.Setns(f.Fd(), uintptr(ns.Syscall()))
-			f.Close()
-			if err != nil {
-				return err
-			}
+// syncParentHooks sends to the given pipe a JSON payload which indicates that
+// the parent should execute pre-start hooks. It then waits for the parent to
+// indicate that it is cleared to resume.
+func syncParentHooks(pipe io.ReadWriter) error {
+	// Tell parent.
+	if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil {
+		return err
+	}
+	// Wait for parent to give the all-clear.
+	var procSync syncT
+	if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
+		if err == io.EOF {
+			return fmt.Errorf("parent closed synchronisation channel")
+		}
+		if procSync.Type != procResume {
+			return fmt.Errorf("invalid synchronisation flag from parent")
 		}
 	}
 	return nil
@@ -309,19 +318,19 @@ func setupRoute(config *configs.Config) error {
 	return nil
 }

-func setupRlimits(config *configs.Config) error {
-	for _, rlimit := range config.Rlimits {
-		l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}
-		if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
+func setupRlimits(limits []configs.Rlimit, pid int) error {
+	for _, rlimit := range limits {
+		if err := system.Prlimit(pid, rlimit.Type, syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil {
 			return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
 		}
 	}
 	return nil
 }

-func setOomScoreAdj(oomScoreAdj int) error {
-	path := "/proc/self/oom_score_adj"
-	return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0700)
+func setOomScoreAdj(oomScoreAdj int, pid int) error {
+	path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
+
+	return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600)
 }

 // killCgroupProcesses freezes then iterates over all the processes inside the
@@ -338,11 +347,14 @@ func killCgroupProcesses(m cgroups.Manager) error {
 		return err
 	}
 	for _, pid := range pids {
-		if p, err := os.FindProcess(pid); err == nil {
-			procs = append(procs, p)
-			if err := p.Kill(); err != nil {
-				logrus.Warn(err)
-			}
+		p, err := os.FindProcess(pid)
+		if err != nil {
+			logrus.Warn(err)
+			continue
+		}
+		procs = append(procs, p)
+		if err := p.Kill(); err != nil {
+			logrus.Warn(err)
 		}
 	}
 	if err := m.Freeze(configs.Thawed); err != nil {
--- a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go
@@ -0,0 +1,66 @@
+// +build linux
+
+package keyctl
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"syscall"
+	"unsafe"
+)
+
+const KEYCTL_JOIN_SESSION_KEYRING = 1
+const KEYCTL_SETPERM = 5
+const KEYCTL_DESCRIBE = 6
+
+type KeySerial uint32
+
+func JoinSessionKeyring(name string) (KeySerial, error) {
+	var _name *byte
+	var err error
+
+	if len(name) > 0 {
+		_name, err = syscall.BytePtrFromString(name)
+		if err != nil {
+			return KeySerial(0), err
+		}
+	}
+
+	sessKeyId, _, errn := syscall.Syscall(syscall.SYS_KEYCTL, KEYCTL_JOIN_SESSION_KEYRING, uintptr(unsafe.Pointer(_name)), 0)
+	if errn != 0 {
+		return 0, fmt.Errorf("could not create session key: %v", errn)
+	}
+	return KeySerial(sessKeyId), nil
+}
+
+// ModKeyringPerm modifies permissions on a keyring by reading the current permissions,
+// anding the bits with the given mask (clearing permissions) and setting
+// additional permission bits
+func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
+	dest := make([]byte, 1024)
+	destBytes := unsafe.Pointer(&dest[0])
+
+	if _, _, err := syscall.Syscall6(syscall.SYS_KEYCTL, uintptr(KEYCTL_DESCRIBE), uintptr(ringId), uintptr(destBytes), uintptr(len(dest)), 0, 0); err != 0 {
+		return err
+	}
+
+	res := strings.Split(string(dest), ";")
+	if len(res) < 5 {
+		return fmt.Errorf("Destination buffer for key description is too small")
+	}
+
+	// parse permissions
+	perm64, err := strconv.ParseUint(res[3], 16, 32)
+	if err != nil {
+		return err
+	}
+
+	perm := (uint32(perm64) & mask) | setbits
+
+	if _, _, err := syscall.Syscall(syscall.SYS_KEYCTL, uintptr(KEYCTL_SETPERM), uintptr(ringId), uintptr(perm)); err != 0 {
+		return err
+	}
+
+	return nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/label/label.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/label/label.go
@@ -21,6 +21,10 @@ func SetProcessLabel(processLabel string) error {
 	return nil
 }

+func GetFileLabel(path string) (string, error) {
+	return "", nil
+}
+
 func SetFileLabel(path string, fileLabel string) error {
 	return nil
 }
@@ -48,7 +52,7 @@ func UnreserveLabel(label string) error {
 	return nil
 }

-// DupSecOpt takes an process label and returns security options that
+// DupSecOpt takes a process label and returns security options that
 // can be used to set duplicate labels on future container processes
 func DupSecOpt(src string) []string {
 	return nil
--- a/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go
@@ -94,6 +94,11 @@ func GetProcessLabel() (string, error) {
 	return selinux.Getexeccon()
 }

+// GetFileLabel returns the label for specified path
+func GetFileLabel(path string) (string, error) {
+	return selinux.Getfilecon(path)
+}
+
 // SetFileLabel modifies the "path" label to the specified file label
 func SetFileLabel(path string, fileLabel string) error {
 	if selinux.SelinuxEnabled() && fileLabel != "" {
@@ -102,7 +107,7 @@ func SetFileLabel(path string, fileLabel string) error {
 	return nil
 }

-// Tell the kernel the label for all files to be created
+// SetFileCreateLabel tells the kernel the label for all files to be created
 func SetFileCreateLabel(fileLabel string) error {
 	if selinux.SelinuxEnabled() {
 		return selinux.Setfscreatecon(fileLabel)
@@ -110,7 +115,7 @@ func SetFileCreateLabel(fileLabel string) error {
 	return nil
 }

-// Change the label of path to the filelabel string.
+// Relabel changes the label of path to the filelabel string.
 // It changes the MCS label to s0 if shared is true.
 // This will allow all containers to share the content.
 func Relabel(path string, fileLabel string, shared bool) error {
--- a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go
@@ -12,8 +12,12 @@ import (
 // The number is randomly chosen to not conflict with known netlink types
 const (
 	InitMsg         uint16 = 62000
-	PidAttr         uint16 = 27281
+	CloneFlagsAttr  uint16 = 27281
 	ConsolePathAttr uint16 = 27282
+	NsPathsAttr     uint16 = 27283
+	UidmapAttr      uint16 = 27284
+	GidmapAttr      uint16 = 27285
+	SetgroupAttr    uint16 = 27286
 	// When syscall.NLA_HDRLEN is in gccgo, take this out.
 	syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
 )
@@ -23,7 +27,8 @@ type Int32msg struct {
 	Value uint32
 }

-// int32msg has the following representation
+// Serialize serializes the message.
+// Int32msg has the following representation
 // | nlattr len | nlattr type |
 // | uint32 value             |
 func (msg *Int32msg) Serialize() []byte {
@@ -39,7 +44,7 @@ func (msg *Int32msg) Len() int {
 	return syscall_NLA_HDRLEN + 4
 }

-// bytemsg has the following representation
+// Bytemsg has the following representation
 // | nlattr len | nlattr type |
 // | value              | pad |
 type Bytemsg struct {
@@ -60,3 +65,25 @@ func (msg *Bytemsg) Serialize() []byte {
 func (msg *Bytemsg) Len() int {
 	return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
 }
+
+type Boolmsg struct {
+	Type  uint16
+	Value bool
+}
+
+func (msg *Boolmsg) Serialize() []byte {
+	buf := make([]byte, msg.Len())
+	native := nl.NativeEndian()
+	native.PutUint16(buf[0:2], uint16(msg.Len()))
+	native.PutUint16(buf[2:4], msg.Type)
+	if msg.Value {
+		buf[4] = 1
+	} else {
+		buf[4] = 0
+	}
+	return buf
+}
+
+func (msg *Boolmsg) Len() int {
+	return syscall_NLA_HDRLEN + 1
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/process.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/process.go
@@ -5,6 +5,8 @@ import (
 	"io"
 	"math"
 	"os"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
 )

 type processOperations interface {
@@ -48,6 +50,20 @@ type Process struct {
 	// All capabilities not specified will be dropped from the processes capability mask
 	Capabilities []string

+	// AppArmorProfile specifies the profile to apply to the process and is
+	// changed at the time the process is execed
+	AppArmorProfile string
+
+	// Label specifies the label to apply to the process.  It is commonly used by selinux
+	Label string
+
+	// NoNewPrivileges controls whether processes can gain additional privileges.
+	NoNewPrivileges *bool
+
+	// Rlimits specifies the resource limits, such as max open files, to set in the container
+	// If Rlimits are not set, the container will inherit rlimits from the parent process
+	Rlimits []configs.Rlimit
+
 	ops processOperations
 }

@@ -55,7 +71,7 @@ type Process struct {
 // Wait releases any resources associated with the Process
 func (p Process) Wait() (*os.ProcessState, error) {
 	if p.ops == nil {
-		return nil, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
+		return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
 	}
 	return p.ops.wait()
 }
@@ -65,7 +81,7 @@ func (p Process) Pid() (int, error) {
 	// math.MinInt32 is returned here, because it's invalid value
 	// for the kill() system call.
 	if p.ops == nil {
-		return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
+		return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
 	}
 	return p.ops.pid(), nil
 }
@@ -73,7 +89,7 @@ func (p Process) Pid() (int, error) {
 // Signal sends a signal to the Process.
 func (p Process) Signal(sig os.Signal) error {
 	if p.ops == nil {
-		return newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
+		return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
 	}
 	return p.ops.signal(sig)
 }
@@ -86,8 +102,8 @@ type IO struct {
 }

 // NewConsole creates new console for process and returns it
-func (p *Process) NewConsole(rootuid int) (Console, error) {
-	console, err := NewConsole(rootuid, rootuid)
+func (p *Process) NewConsole(rootuid, rootgid int) (Console, error) {
+	console, err := NewConsole(rootuid, rootgid)
 	if err != nil {
 		return nil, err
 	}
--- a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go
@@ -51,6 +51,7 @@ type setnsProcess struct {
 	fds           []string
 	process       *Process
 	bootstrapData io.Reader
+	rootDir       *os.File
 }

 func (p *setnsProcess) startTime() (string, error) {
@@ -69,39 +70,49 @@ func (p *setnsProcess) start() (err error) {
 	defer p.parentPipe.Close()
 	err = p.cmd.Start()
 	p.childPipe.Close()
+	p.rootDir.Close()
 	if err != nil {
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "starting setns process")
 	}
 	if p.bootstrapData != nil {
 		if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
-			return newSystemError(err)
+			return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
 		}
 	}
 	if err = p.execSetns(); err != nil {
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "executing setns process")
 	}
 	if len(p.cgroupPaths) > 0 {
 		if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
-			return newSystemError(err)
+			return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
 		}
 	}
+	// set oom_score_adj
+	if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
+		return newSystemErrorWithCause(err, "setting oom score")
+	}
+	// set rlimits, this has to be done here because we lose permissions
+	// to raise the limits once we enter a user-namespace
+	if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
+		return newSystemErrorWithCause(err, "setting rlimits for process")
+	}
 	if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "writing config to pipe")
 	}

 	if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "calling shutdown on init pipe")
 	}
 	// wait for the child process to fully complete and receive an error message
 	// if one was encoutered
 	var ierr *genericError
 	if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "decoding init error from pipe")
 	}
 	// Must be done after Shutdown so the child will exit and we can wait for it.
 	if ierr != nil {
 		p.wait()
-		return newSystemError(ierr)
+		return ierr
 	}
 	return nil
 }
@@ -114,7 +125,7 @@ func (p *setnsProcess) execSetns() error {
 	status, err := p.cmd.Process.Wait()
 	if err != nil {
 		p.cmd.Wait()
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "waiting on setns process to finish")
 	}
 	if !status.Success() {
 		p.cmd.Wait()
@@ -123,7 +134,7 @@ func (p *setnsProcess) execSetns() error {
 	var pid *pid
 	if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
 		p.cmd.Wait()
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "reading pid from init pipe")
 	}
 	process, err := os.FindProcess(pid.Pid)
 	if err != nil {
@@ -167,14 +178,17 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) {
 }

 type initProcess struct {
-	cmd        *exec.Cmd
-	parentPipe *os.File
-	childPipe  *os.File
-	config     *initConfig
-	manager    cgroups.Manager
-	container  *linuxContainer
-	fds        []string
-	process    *Process
+	cmd           *exec.Cmd
+	parentPipe    *os.File
+	childPipe     *os.File
+	config        *initConfig
+	manager       cgroups.Manager
+	container     *linuxContainer
+	fds           []string
+	process       *Process
+	bootstrapData io.Reader
+	sharePidns    bool
+	rootDir       *os.File
 }

 func (p *initProcess) pid() int {
@@ -185,27 +199,63 @@ func (p *initProcess) externalDescriptors() []string {
 	return p.fds
 }

-func (p *initProcess) start() (err error) {
+// execSetns runs the process that executes C code to perform the setns calls
+// because setns support requires the C process to fork off a child and perform the setns
+// before the go runtime boots, we wait on the process to die and receive the child's pid
+// over the provided pipe.
+// This is called by initProcess.start function
+func (p *initProcess) execSetns() error {
+	status, err := p.cmd.Process.Wait()
+	if err != nil {
+		p.cmd.Wait()
+		return err
+	}
+	if !status.Success() {
+		p.cmd.Wait()
+		return &exec.ExitError{ProcessState: status}
+	}
+	var pid *pid
+	if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
+		p.cmd.Wait()
+		return err
+	}
+	process, err := os.FindProcess(pid.Pid)
+	if err != nil {
+		return err
+	}
+	p.cmd.Process = process
+	p.process.ops = p
+	return nil
+}
+
+func (p *initProcess) start() error {
 	defer p.parentPipe.Close()
-	err = p.cmd.Start()
+	err := p.cmd.Start()
 	p.process.ops = p
 	p.childPipe.Close()
+	p.rootDir.Close()
 	if err != nil {
 		p.process.ops = nil
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "starting init process command")
+	}
+	if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
+		return err
+	}
+	if err := p.execSetns(); err != nil {
+		return newSystemErrorWithCause(err, "running exec setns process for init")
 	}
 	// Save the standard descriptor names before the container process
 	// can potentially move them (e.g., via dup2()).  If we don't do this now,
 	// we won't know at checkpoint time which file descriptor to look up.
 	fds, err := getPipeFds(p.pid())
 	if err != nil {
-		return newSystemError(err)
+		return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
 	}
 	p.setExternalDescriptors(fds)
 	// Do this before syncing with child so that no children
 	// can escape the cgroup
 	if err := p.manager.Apply(p.pid()); err != nil {
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "applying cgroup configuration for process")
 	}
 	defer func() {
 		if err != nil {
@@ -213,56 +263,88 @@ func (p *initProcess) start() (err error) {
 			p.manager.Destroy()
 		}
 	}()
-	if p.config.Config.Hooks != nil {
-		s := configs.HookState{
-			Version: p.container.config.Version,
-			ID:      p.container.id,
-			Pid:     p.pid(),
-			Root:    p.config.Config.Rootfs,
-		}
-		for _, hook := range p.config.Config.Hooks.Prestart {
-			if err := hook.Run(s); err != nil {
-				return newSystemError(err)
-			}
-		}
-	}
 	if err := p.createNetworkInterfaces(); err != nil {
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "creating nework interfaces")
 	}
 	if err := p.sendConfig(); err != nil {
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "sending config to init process")
 	}
 	var (
-		procSync syncT
-		sentRun  bool
-		ierr     *genericError
+		procSync   syncT
+		sentRun    bool
+		sentResume bool
+		ierr       *genericError
 	)

+	dec := json.NewDecoder(p.parentPipe)
 loop:
 	for {
-		if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
+		if err := dec.Decode(&procSync); err != nil {
 			if err == io.EOF {
 				break loop
 			}
-			return newSystemError(err)
+			return newSystemErrorWithCause(err, "decoding sync type from init pipe")
 		}
 		switch procSync.Type {
-		case procStart:
-			break loop
 		case procReady:
 			if err := p.manager.Set(p.config.Config); err != nil {
-				return newSystemError(err)
+				return newSystemErrorWithCause(err, "setting cgroup config for ready process")
+			}
+			// set oom_score_adj
+			if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
+				return newSystemErrorWithCause(err, "setting oom score for ready process")
+			}
+			// set rlimits, this has to be done here because we lose permissions
+			// to raise the limits once we enter a user-namespace
+			if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
+				return newSystemErrorWithCause(err, "setting rlimits for ready process")
+			}
+			// call prestart hooks
+			if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
+				if p.config.Config.Hooks != nil {
+					s := configs.HookState{
+						Version: p.container.config.Version,
+						ID:      p.container.id,
+						Pid:     p.pid(),
+						Root:    p.config.Config.Rootfs,
+					}
+					for i, hook := range p.config.Config.Hooks.Prestart {
+						if err := hook.Run(s); err != nil {
+							return newSystemErrorWithCausef(err, "running prestart hook %d", i)
+						}
+					}
+				}
 			}
 			// Sync with child.
 			if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
-				return newSystemError(err)
+				return newSystemErrorWithCause(err, "reading syncT run type")
 			}
 			sentRun = true
+		case procHooks:
+			if p.config.Config.Hooks != nil {
+				s := configs.HookState{
+					Version:    p.container.config.Version,
+					ID:         p.container.id,
+					Pid:        p.pid(),
+					Root:       p.config.Config.Rootfs,
+					BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
+				}
+				for i, hook := range p.config.Config.Hooks.Prestart {
+					if err := hook.Run(s); err != nil {
+						return newSystemErrorWithCausef(err, "running prestart hook %d", i)
+					}
+				}
+			}
+			// Sync with child.
+			if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil {
+				return newSystemErrorWithCause(err, "reading syncT resume type")
+			}
+			sentResume = true
 		case procError:
 			// wait for the child process to fully complete and receive an error message
 			// if one was encoutered
-			if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
-				return newSystemError(err)
+			if err := dec.Decode(&ierr); err != nil && err != io.EOF {
+				return newSystemErrorWithCause(err, "decoding proc error from init")
 			}
 			if ierr != nil {
 				break loop
@@ -270,19 +352,22 @@ loop:
 			// Programmer error.
 			panic("No error following JSON procError payload.")
 		default:
-			return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
+			return newSystemError(fmt.Errorf("invalid JSON payload from child"))
 		}
 	}
 	if !sentRun {
-		return newSystemError(fmt.Errorf("could not synchronise with container process"))
+		return newSystemErrorWithCause(ierr, "container init failed")
+	}
+	if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
+		return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
 	}
 	if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "shutting down init pipe")
 	}
 	// Must be done after Shutdown so the child will exit and we can wait for it.
 	if ierr != nil {
 		p.wait()
-		return newSystemError(ierr)
+		return ierr
 	}
 	return nil
 }
@@ -293,7 +378,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) {
 		return p.cmd.ProcessState, err
 	}
 	// we should kill all processes in cgroup when init is died if we use host PID namespace
-	if p.cmd.SysProcAttr.Cloneflags&syscall.CLONE_NEWPID == 0 {
+	if p.sharePidns {
 		killCgroupProcesses(p.manager)
 	}
 	return p.cmd.ProcessState, nil
@@ -315,7 +400,9 @@ func (p *initProcess) startTime() (string, error) {
 }

 func (p *initProcess) sendConfig() error {
-	// send the state to the container's init process then shutdown writes for the parent
+	// send the config to the container's init process, we don't use JSON Encode
+	// here because there might be a problem in JSON decoder in some cases, see:
+	// https://github.com/docker/docker/issues/14203#issuecomment-174177790
 	return utils.WriteJSON(p.parentPipe, p.config)
 }

@@ -365,7 +452,7 @@ func getPipeFds(pid int) ([]string, error) {

 // InitializeIO creates pipes for use with the process's STDIO
 // and returns the opposite side for each
-func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
+func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
 	var fds []uintptr
 	i = &IO{}
 	// cleanup in case of an error
@@ -397,7 +484,7 @@ func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
 	p.Stderr, i.Stderr = w, r
 	// change ownership of the pipes incase we are in a user namespace
 	for _, fd := range fds {
-		if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil {
+		if err := syscall.Fchown(int(fd), rootuid, rootgid); err != nil {
 			return nil, err
 		}
 	}
--- a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go
@@ -4,6 +4,7 @@ package libcontainer

 import (
 	"fmt"
+	"io"
 	"io/ioutil"
 	"os"
 	"os/exec"
@@ -19,47 +20,65 @@ import (
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/label"
 	"github.com/opencontainers/runc/libcontainer/system"
+	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
 )

 const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV

+// needsSetupDev returns true if /dev needs to be set up.
+func needsSetupDev(config *configs.Config) bool {
+	for _, m := range config.Mounts {
+		if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" {
+			return false
+		}
+	}
+	return true
+}
+
 // setupRootfs sets up the devices, mount points, and filesystems for use inside a
 // new mount namespace.
-func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
+func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) {
 	if err := prepareRoot(config); err != nil {
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "preparing rootfs")
 	}

-	setupDev := len(config.Devices) != 0
+	setupDev := needsSetupDev(config)
 	for _, m := range config.Mounts {
 		for _, precmd := range m.PremountCmds {
 			if err := mountCmd(precmd); err != nil {
-				return newSystemError(err)
+				return newSystemErrorWithCause(err, "running premount command")
 			}
 		}
 		if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil {
-			return newSystemError(err)
+			return newSystemErrorWithCausef(err, "mounting %q to rootfs %q", m.Destination, config.Rootfs)
 		}

 		for _, postcmd := range m.PostmountCmds {
 			if err := mountCmd(postcmd); err != nil {
-				return newSystemError(err)
+				return newSystemErrorWithCause(err, "running postmount command")
 			}
 		}
 	}
 	if setupDev {
 		if err := createDevices(config); err != nil {
-			return newSystemError(err)
+			return newSystemErrorWithCause(err, "creating device nodes")
 		}
 		if err := setupPtmx(config, console); err != nil {
-			return newSystemError(err)
+			return newSystemErrorWithCause(err, "setting up ptmx")
 		}
 		if err := setupDevSymlinks(config.Rootfs); err != nil {
-			return newSystemError(err)
+			return newSystemErrorWithCause(err, "setting up /dev symlinks")
 		}
 	}
+	// Signal the parent to run the pre-start hooks.
+	// The hooks are run after the mounts are setup, but before we switch to the new
+	// root, so that the old root is still available in the hooks for any mount
+	// manipulations.
+	if err := syncParentHooks(pipe); err != nil {
+		return err
+	}
 	if err := syscall.Chdir(config.Rootfs); err != nil {
-		return newSystemError(err)
+		return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs)
 	}
 	if config.NoPivotRoot {
 		err = msMoveRoot(config.Rootfs)
@@ -67,16 +86,28 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
 		err = pivotRoot(config.Rootfs, config.PivotDir)
 	}
 	if err != nil {
-		return newSystemError(err)
+		return newSystemErrorWithCause(err, "jailing process inside rootfs")
 	}
 	if setupDev {
 		if err := reOpenDevNull(); err != nil {
-			return newSystemError(err)
+			return newSystemErrorWithCause(err, "reopening /dev/null inside container")
 		}
 	}
+	// remount dev as ro if specified
+	for _, m := range config.Mounts {
+		if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
+			if m.Flags&syscall.MS_RDONLY != 0 {
+				if err := remountReadonly(m.Destination); err != nil {
+					return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
+				}
+			}
+			break
+		}
+	}
+	// set rootfs ( / ) as readonly
 	if config.Readonlyfs {
 		if err := setReadonly(); err != nil {
-			return newSystemError(err)
+			return newSystemErrorWithCause(err, "setting rootfs as readonly")
 		}
 	}
 	syscall.Umask(0022)
@@ -84,14 +115,12 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
 }

 func mountCmd(cmd configs.Command) error {
-
 	command := exec.Command(cmd.Path, cmd.Args[:]...)
 	command.Env = cmd.Env
 	command.Dir = cmd.Dir
 	if out, err := command.CombinedOutput(); err != nil {
 		return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err)
 	}
-
 	return nil
 }

@@ -119,8 +148,9 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
 			if err := mountPropagate(m, rootfs, ""); err != nil {
 				return err
 			}
+			return label.SetFileLabel(dest, mountLabel)
 		}
-		return label.SetFileLabel(dest, mountLabel)
+		return nil
 	case "tmpfs":
 		stat, err := os.Stat(dest)
 		if err != nil {
@@ -137,16 +167,6 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
 			}
 		}
 		return nil
-	case "devpts":
-		if err := os.MkdirAll(dest, 0755); err != nil {
-			return err
-		}
-		return mountPropagate(m, rootfs, mountLabel)
-	case "securityfs":
-		if err := os.MkdirAll(dest, 0755); err != nil {
-			return err
-		}
-		return mountPropagate(m, rootfs, mountLabel)
 	case "bind":
 		stat, err := os.Stat(m.Source)
 		if err != nil {
@@ -218,41 +238,33 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
 				return err
 			}
 		}
-		// create symlinks for merged cgroups
-		cwd, err := os.Getwd()
-		if err != nil {
-			return err
-		}
-		if err := os.Chdir(filepath.Join(rootfs, m.Destination)); err != nil {
-			return err
-		}
 		for _, mc := range merged {
 			for _, ss := range strings.Split(mc, ",") {
-				if err := os.Symlink(mc, ss); err != nil {
-					// if cgroup already exists, then okay(it could have been created before)
-					if os.IsExist(err) {
-						continue
-					}
-					os.Chdir(cwd)
+				// symlink(2) is very dumb, it will just shove the path into
+				// the link and doesn't do any checks or relative path
+				// conversion. Also, don't error out if the cgroup already exists.
+				if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) {
 					return err
 				}
 			}
 		}
-		if err := os.Chdir(cwd); err != nil {
-			return err
-		}
 		if m.Flags&syscall.MS_RDONLY != 0 {
 			// remount cgroup root as readonly
 			mcgrouproot := &configs.Mount{
+				Source:      m.Destination,
+				Device:      "bind",
 				Destination: m.Destination,
-				Flags:       defaultMountFlags | syscall.MS_RDONLY,
+				Flags:       defaultMountFlags | syscall.MS_RDONLY | syscall.MS_BIND,
 			}
 			if err := remount(mcgrouproot, rootfs); err != nil {
 				return err
 			}
 		}
 	default:
-		return fmt.Errorf("unknown mount device %q to %q", m.Device, m.Destination)
+		if err := os.MkdirAll(dest, 0755); err != nil {
+			return err
+		}
+		return mountPropagate(m, rootfs, mountLabel)
 	}
 	return nil
 }
@@ -294,7 +306,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
 // checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
 // dest is required to be an abs path and have any symlinks resolved before calling this function.
 func checkMountDestination(rootfs, dest string) error {
-	if filepath.Clean(rootfs) == filepath.Clean(dest) {
+	if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) {
 		return fmt.Errorf("mounting into / is prohibited")
 	}
 	invalidDestinations := []string{
@@ -307,7 +319,8 @@ func checkMountDestination(rootfs, dest string) error {
 		"/proc/cpuinfo",
 		"/proc/diskstats",
 		"/proc/meminfo",
-		"/proc/stats",
+		"/proc/stat",
+		"/proc/net/dev",
 	}
 	for _, valid := range validDestinations {
 		path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
@@ -340,7 +353,7 @@ func setupDevSymlinks(rootfs string) error {
 	// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
 	// in /dev if it exists in /proc.
 	if _, err := os.Stat("/proc/kcore"); err == nil {
-		links = append(links, [2]string{"/proc/kcore", "/dev/kcore"})
+		links = append(links, [2]string{"/proc/kcore", "/dev/core"})
 	}
 	for _, link := range links {
 		var (
@@ -489,10 +502,10 @@ func getParentMount(rootfs string) (string, string, error) {
 }

 // Make parent mount private if it was shared
-func rootfsParentMountPrivate(config *configs.Config) error {
+func rootfsParentMountPrivate(rootfs string) error {
 	sharedMount := false

-	parentMount, optionalOpts, err := getParentMount(config.Rootfs)
+	parentMount, optionalOpts, err := getParentMount(rootfs)
 	if err != nil {
 		return err
 	}
@@ -524,9 +537,10 @@ func prepareRoot(config *configs.Config) error {
 	if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil {
 		return err
 	}
-
-	if err := rootfsParentMountPrivate(config); err != nil {
-		return err
+	if config.NoPivotRoot {
+		if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
+			return err
+		}
 	}

 	return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "")
@@ -550,7 +564,7 @@ func setupPtmx(config *configs.Config, console *linuxConsole) error {
 	return nil
 }

-func pivotRoot(rootfs, pivotBaseDir string) error {
+func pivotRoot(rootfs, pivotBaseDir string) (err error) {
 	if pivotBaseDir == "" {
 		pivotBaseDir = "/"
 	}
@@ -562,8 +576,21 @@ func pivotRoot(rootfs, pivotBaseDir string) error {
 	if err != nil {
 		return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err)
 	}
+	defer func() {
+		errVal := os.Remove(pivotDir)
+		if err == nil {
+			err = errVal
+		}
+	}()
 	if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
-		return fmt.Errorf("pivot_root %s", err)
+		// Make the parent mount private
+		if err := rootfsParentMountPrivate(rootfs); err != nil {
+			return err
+		}
+		// Try again
+		if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
+			return fmt.Errorf("pivot_root %s", err)
+		}
 	}
 	if err := syscall.Chdir("/"); err != nil {
 		return fmt.Errorf("chdir / %s", err)
@@ -580,7 +607,7 @@ func pivotRoot(rootfs, pivotBaseDir string) error {
 	if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
 		return fmt.Errorf("unmount pivot_root dir %s", err)
 	}
-	return os.Remove(pivotDir)
+	return nil
 }

 func msMoveRoot(rootfs string) error {
@@ -669,14 +696,18 @@ func remount(m *configs.Mount, rootfs string) error {
 // of propagation flags.
 func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
 	var (
-		dest = m.Destination
-		data = label.FormatMountLabel(m.Data, mountLabel)
+		dest  = m.Destination
+		data  = label.FormatMountLabel(m.Data, mountLabel)
+		flags = m.Flags
 	)
+	if libcontainerUtils.CleanPath(dest) == "/dev" {
+		flags &= ^syscall.MS_RDONLY
+	}
 	if !strings.HasPrefix(dest, rootfs) {
 		dest = filepath.Join(rootfs, dest)
 	}

-	if err := syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data); err != nil {
+	if err := syscall.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
 		return err
 	}

--- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go
@@ -36,6 +36,11 @@ var archs = map[string]string{
 	"SCMP_ARCH_MIPSEL":      "mipsel",
 	"SCMP_ARCH_MIPSEL64":    "mipsel64",
 	"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
+	"SCMP_ARCH_PPC":         "ppc",
+	"SCMP_ARCH_PPC64":       "ppc64",
+	"SCMP_ARCH_PPC64LE":     "ppc64le",
+	"SCMP_ARCH_S390":        "s390",
+	"SCMP_ARCH_S390X":       "s390x",
 }

 // ConvertStringToOperator converts a string into a Seccomp comparison operator.
--- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go
@@ -5,7 +5,6 @@ package seccomp
 import (
 	"bufio"
 	"fmt"
-	"log"
 	"os"
 	"strings"
 	"syscall"
@@ -167,7 +166,6 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
 	// Ignore it, don't error out
 	callNum, err := libseccomp.GetSyscallFromName(call.Name)
 	if err != nil {
-		log.Printf("Error resolving syscall name %s: %s - ignoring syscall.", call.Name, err)
 		return nil
 	}

--- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go
@@ -10,7 +10,7 @@ import (

 var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")

-// Seccomp not supported, do nothing
+// InitSeccomp does nothing because seccomp is not supported.
 func InitSeccomp(config *configs.Seccomp) error {
 	if config != nil {
 		return ErrSeccompNotEnabled
--- a/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go
@@ -13,9 +13,9 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"sync"
 	"syscall"

-	"github.com/docker/docker/pkg/mount"
 	"github.com/opencontainers/runc/libcontainer/system"
 )

@@ -35,6 +35,7 @@ const (
 var (
 	assignRegex           = regexp.MustCompile(`^([^=]+)=(.*)$`)
 	mcsList               = make(map[string]bool)
+	mcsLock               sync.Mutex
 	selinuxfs             = "unknown"
 	selinuxEnabled        = false // Stores whether selinux is currently enabled
 	selinuxEnabledChecked = false // Stores whether selinux enablement has been checked or established yet
@@ -58,16 +59,31 @@ func getSelinuxMountPoint() string {
 	}
 	selinuxfs = ""

-	mounts, err := mount.GetMounts()
+	f, err := os.Open("/proc/self/mountinfo")
 	if err != nil {
 		return selinuxfs
 	}
-	for _, mount := range mounts {
-		if mount.Fstype == "selinuxfs" {
-			selinuxfs = mount.Mountpoint
-			break
+	defer f.Close()
+
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		txt := scanner.Text()
+		// Safe as mountinfo encodes mountpoints with spaces as \040.
+		sepIdx := strings.Index(txt, " - ")
+		if sepIdx == -1 {
+			continue
 		}
+		if !strings.Contains(txt[sepIdx:], "selinuxfs") {
+			continue
+		}
+		fields := strings.Split(txt, " ")
+		if len(fields) < 5 {
+			continue
+		}
+		selinuxfs = fields[4]
+		break
 	}
+
 	if selinuxfs != "" {
 		var buf syscall.Statfs_t
 		syscall.Statfs(selinuxfs, &buf)
@@ -158,12 +174,14 @@ func Setfilecon(path string, scon string) error {
 // Getfilecon returns the SELinux label for this path or returns an error.
 func Getfilecon(path string) (string, error) {
 	con, err := system.Lgetxattr(path, xattrNameSelinux)
-
+	if err != nil {
+		return "", err
+	}
 	// Trim the NUL byte at the end of the byte buffer, if present.
-	if con[len(con)-1] == '\x00' {
+	if len(con) > 0 && con[len(con)-1] == '\x00' {
 		con = con[:len(con)-1]
 	}
-	return string(con), err
+	return string(con), nil
 }

 func Setfscreatecon(scon string) error {
@@ -265,6 +283,8 @@ func SelinuxGetEnforceMode() int {
 }

 func mcsAdd(mcs string) error {
+	mcsLock.Lock()
+	defer mcsLock.Unlock()
 	if mcsList[mcs] {
 		return fmt.Errorf("MCS Label already exists")
 	}
@@ -273,7 +293,9 @@ func mcsAdd(mcs string) error {
 }

 func mcsDelete(mcs string) {
+	mcsLock.Lock()
 	mcsList[mcs] = false
+	mcsLock.Unlock()
 }

 func IntToMcs(id int, catRange uint32) string {
@@ -289,7 +311,7 @@ func IntToMcs(id int, catRange uint32) string {

 	for ORD > TIER {
 		ORD = ORD - TIER
-		TIER -= 1
+		TIER--
 	}
 	TIER = SETSIZE - TIER
 	ORD = ORD + TIER
@@ -430,7 +452,7 @@ func badPrefix(fpath string) error {
 	return nil
 }

-// Change the fpath file object to the SELinux label scon.
+// Chcon changes the fpath file object to the SELinux label scon.
 // If the fpath is a directory and recurse is true Chcon will walk the
 // directory tree setting the label
 func Chcon(fpath string, scon string, recurse bool) error {
@@ -464,14 +486,14 @@ func DupSecOpt(src string) []string {
 		con["level"] == "" {
 		return nil
 	}
-	return []string{"label:user:" + con["user"],
-		"label:role:" + con["role"],
-		"label:type:" + con["type"],
-		"label:level:" + con["level"]}
+	return []string{"label=user:" + con["user"],
+		"label=role:" + con["role"],
+		"label=type:" + con["type"],
+		"label=level:" + con["level"]}
 }

 // DisableSecOpt returns a security opt that can be used to disabling SELinux
 // labeling support for future container processes
 func DisableSecOpt() []string {
-	return []string{"label:disable"}
+	return []string{"label=disable"}
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go
@@ -3,9 +3,11 @@
 package libcontainer

 import (
+	"fmt"
 	"os"

 	"github.com/opencontainers/runc/libcontainer/apparmor"
+	"github.com/opencontainers/runc/libcontainer/keys"
 	"github.com/opencontainers/runc/libcontainer/label"
 	"github.com/opencontainers/runc/libcontainer/seccomp"
 	"github.com/opencontainers/runc/libcontainer/system"
@@ -17,12 +19,21 @@ type linuxSetnsInit struct {
 	config *initConfig
 }

+func (l *linuxSetnsInit) getSessionRingName() string {
+	return fmt.Sprintf("_ses.%s", l.config.ContainerId)
+}
+
 func (l *linuxSetnsInit) Init() error {
-	if err := setupRlimits(l.config.Config); err != nil {
-		return err
+	if !l.config.Config.NoNewKeyring {
+		// do not inherit the parent's session keyring
+		if _, err := keyctl.JoinSessionKeyring(l.getSessionRingName()); err != nil {
+			return err
+		}
 	}
-	if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
-		return err
+	if l.config.NoNewPrivileges {
+		if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
+			return err
+		}
 	}
 	if l.config.Config.Seccomp != nil {
 		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
@@ -32,13 +43,11 @@ func (l *linuxSetnsInit) Init() error {
 	if err := finalizeNamespace(l.config); err != nil {
 		return err
 	}
-	if err := apparmor.ApplyProfile(l.config.Config.AppArmorProfile); err != nil {
+	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
 		return err
 	}
-	if l.config.Config.ProcessLabel != "" {
-		if err := label.SetProcessLabel(l.config.Config.ProcessLabel); err != nil {
-			return err
-		}
+	if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
+		return err
 	}
 	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go
@@ -2,14 +2,14 @@ package stacktrace

 import "runtime"

-// Caputure captures a stacktrace for the current calling go program
+// Capture captures a stacktrace for the current calling go program
 //
 // skip is the number of frames to skip
 func Capture(userSkip int) Stacktrace {
 	var (
 		skip   = userSkip + 1 // add one for our own function
 		frames []Frame
-		prevPc uintptr = 0
+		prevPc uintptr
 	)
 	for i := skip; ; i++ {
 		pc, file, line, ok := runtime.Caller(i)
--- a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go
@@ -3,28 +3,62 @@
 package libcontainer

 import (
+	"fmt"
 	"io"
 	"os"
+	"os/exec"
 	"syscall"

 	"github.com/opencontainers/runc/libcontainer/apparmor"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/keys"
 	"github.com/opencontainers/runc/libcontainer/label"
 	"github.com/opencontainers/runc/libcontainer/seccomp"
 	"github.com/opencontainers/runc/libcontainer/system"
 )

 type linuxStandardInit struct {
-	pipe      io.ReadWriter
-	parentPid int
-	config    *initConfig
+	pipe       io.ReadWriteCloser
+	parentPid  int
+	stateDirFD int
+	config     *initConfig
 }

-func (l *linuxStandardInit) Init() error {
-	// join any namespaces via a path to the namespace fd if provided
-	if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil {
-		return err
+func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
+	var newperms uint32
+
+	if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
+		// with user ns we need 'other' search permissions
+		newperms = 0x8
+	} else {
+		// without user ns we need 'UID' search permissions
+		newperms = 0x80000
 	}
+
+	// create a unique per session container name that we can
+	// join in setns; however, other containers can also join it
+	return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
+}
+
+// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value
+// from the kernel
+const PR_SET_NO_NEW_PRIVS = 0x26
+
+func (l *linuxStandardInit) Init() error {
+	if !l.config.Config.NoNewKeyring {
+		ringname, keepperms, newperms := l.getSessionRingParams()
+
+		// do not inherit the parent's session keyring
+		sessKeyId, err := keyctl.JoinSessionKeyring(ringname)
+		if err != nil {
+			return err
+		}
+		// make session keyring searchable
+		if err := keyctl.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
+			return err
+		}
+	}
+
 	var console *linuxConsole
 	if l.config.Console != "" {
 		console = newConsoleFromPath(l.config.Console)
@@ -32,9 +66,6 @@ func (l *linuxStandardInit) Init() error {
 			return err
 		}
 	}
-	if _, err := syscall.Setsid(); err != nil {
-		return err
-	}
 	if console != nil {
 		if err := system.Setctty(); err != nil {
 			return err
@@ -46,16 +77,11 @@ func (l *linuxStandardInit) Init() error {
 	if err := setupRoute(l.config.Config); err != nil {
 		return err
 	}
-	if err := setupRlimits(l.config.Config); err != nil {
-		return err
-	}
-	if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
-		return err
-	}
+
 	label.Init()
 	// InitializeMountNamespace() can be executed only for a new mount namespace
 	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
-		if err := setupRootfs(l.config.Config, console); err != nil {
+		if err := setupRootfs(l.config.Config, console, l.pipe); err != nil {
 			return err
 		}
 	}
@@ -64,10 +90,10 @@ func (l *linuxStandardInit) Init() error {
 			return err
 		}
 	}
-	if err := apparmor.ApplyProfile(l.config.Config.AppArmorProfile); err != nil {
+	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
 		return err
 	}
-	if err := label.SetProcessLabel(l.config.Config.ProcessLabel); err != nil {
+	if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
 		return err
 	}

@@ -90,13 +116,21 @@ func (l *linuxStandardInit) Init() error {
 	if err != nil {
 		return err
 	}
+	if l.config.NoNewPrivileges {
+		if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
+			return err
+		}
+	}
 	// Tell our parent that we're ready to Execv. This must be done before the
 	// Seccomp rules have been applied, because we need to be able to read and
 	// write to a socket.
 	if err := syncParentReady(l.pipe); err != nil {
 		return err
 	}
-	if l.config.Config.Seccomp != nil {
+	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
+	// do this before dropping capabilities; otherwise do it as late as possible
+	// just before execve so as few syscalls take place after it as possible.
+	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
 		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
 			return err
 		}
@@ -115,5 +149,30 @@ func (l *linuxStandardInit) Init() error {
 	if syscall.Getppid() != l.parentPid {
 		return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
 	}
-	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
+	// check for the arg before waiting to make sure it exists and it is returned
+	// as a create time error.
+	name, err := exec.LookPath(l.config.Args[0])
+	if err != nil {
+		return err
+	}
+	// close the pipe to signal that we have completed our init.
+	l.pipe.Close()
+	// wait for the fifo to be opened on the other side before
+	// exec'ing the users process.
+	fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
+	if err != nil {
+		return newSystemErrorWithCause(err, "openat exec fifo")
+	}
+	if _, err := syscall.Write(fd, []byte("0")); err != nil {
+		return newSystemErrorWithCause(err, "write 0 exec fifo")
+	}
+	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
+		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
+			return newSystemErrorWithCause(err, "init seccomp")
+		}
+	}
+	if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
+		return newSystemErrorWithCause(err, "exec user process")
+	}
+	return nil
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go
@@ -6,9 +6,11 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"syscall"

 	"github.com/Sirupsen/logrus"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/utils"
 )

 func newStateTransitionError(from, to containerState) error {
@@ -56,9 +58,10 @@ func destroy(c *linuxContainer) error {
 func runPoststopHooks(c *linuxContainer) error {
 	if c.config.Hooks != nil {
 		s := configs.HookState{
-			Version: c.config.Version,
-			ID:      c.id,
-			Root:    c.config.Rootfs,
+			Version:    c.config.Version,
+			ID:         c.id,
+			Root:       c.config.Rootfs,
+			BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
 		}
 		for _, hook := range c.config.Hooks.Poststop {
 			if err := hook.Run(s); err != nil {
@@ -75,7 +78,7 @@ type stoppedState struct {
 }

 func (b *stoppedState) status() Status {
-	return Destroyed
+	return Stopped
 }

 func (b *stoppedState) transition(s containerState) error {
@@ -108,11 +111,11 @@ func (r *runningState) status() Status {
 func (r *runningState) transition(s containerState) error {
 	switch s.(type) {
 	case *stoppedState:
-		running, err := r.c.isRunning()
+		t, err := r.c.runType()
 		if err != nil {
 			return err
 		}
-		if running {
+		if t == Running {
 			return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
 		}
 		r.c.state = s
@@ -127,16 +130,40 @@ func (r *runningState) transition(s containerState) error {
 }

 func (r *runningState) destroy() error {
-	running, err := r.c.isRunning()
+	t, err := r.c.runType()
 	if err != nil {
 		return err
 	}
-	if running {
+	if t == Running {
 		return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
 	}
 	return destroy(r.c)
 }

+type createdState struct {
+	c *linuxContainer
+}
+
+func (i *createdState) status() Status {
+	return Created
+}
+
+func (i *createdState) transition(s containerState) error {
+	switch s.(type) {
+	case *runningState, *pausedState, *stoppedState:
+		i.c.state = s
+		return nil
+	case *createdState:
+		return nil
+	}
+	return newStateTransitionError(i, s)
+}
+
+func (i *createdState) destroy() error {
+	i.c.initProcess.signal(syscall.SIGKILL)
+	return destroy(i.c)
+}
+
 // pausedState represents a container that is currently pause.  It cannot be destroyed in a
 // paused state and must transition back to running first.
 type pausedState struct {
@@ -159,11 +186,11 @@ func (p *pausedState) transition(s containerState) error {
 }

 func (p *pausedState) destroy() error {
-	isRunning, err := p.c.isRunning()
+	t, err := p.c.runType()
 	if err != nil {
 		return err
 	}
-	if !isRunning {
+	if t != Running && t != Created {
 		if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
 			return err
 		}
@@ -173,7 +200,7 @@ func (p *pausedState) destroy() error {
 }

 // restoredState is the same as the running state but also has accociated checkpoint
-// information that maybe need destroyed when the container is stopped and destory is called.
+// information that may need to be destroyed when the container is stopped and destroy is called.
 type restoredState struct {
 	imageDir string
 	c        *linuxContainer
@@ -202,22 +229,25 @@ func (r *restoredState) destroy() error {
 	return destroy(r.c)
 }

-// createdState is used whenever a container is restored, loaded, or setting additional
+// loadedState is used whenever a container is restored, loaded, or setting additional
 // processes inside and it should not be destroyed when it is exiting.
-type createdState struct {
+type loadedState struct {
 	c *linuxContainer
 	s Status
 }

-func (n *createdState) status() Status {
+func (n *loadedState) status() Status {
 	return n.s
 }

-func (n *createdState) transition(s containerState) error {
+func (n *loadedState) transition(s containerState) error {
 	n.c.state = s
 	return nil
 }

-func (n *createdState) destroy() error {
-	return nil
+func (n *loadedState) destroy() error {
+	if err := n.c.refreshState(); err != nil {
+		return err
+	}
+	return n.c.state.destroy()
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go
@@ -0,0 +1,7 @@
+package libcontainer
+
+// Solaris - TODO
+
+type Stats struct {
+	Interfaces []*NetworkInterface
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go
@@ -11,6 +11,19 @@ import (
 	"unsafe"
 )

+// If arg2 is nonzero, set the "child subreaper" attribute of the
+// calling process; if arg2 is zero, unset the attribute.  When a
+// process is marked as a child subreaper, all of the children
+// that it creates, and their descendants, will be marked as
+// having a subreaper.  In effect, a subreaper fulfills the role
+// of init(1) for its descendant processes.  Upon termination of
+// a process that is orphaned (i.e., its immediate parent has
+// already terminated) and marked as having a subreaper, the
+// nearest still living ancestor subreaper will receive a SIGCHLD
+// signal and be able to wait(2) on the process to discover its
+// termination status.
+const PR_SET_CHILD_SUBREAPER = 36
+
 type ParentDeathSignal int

 func (p ParentDeathSignal) Restore() error {
@@ -40,6 +53,14 @@ func Execv(cmd string, args []string, env []string) error {
 	return syscall.Exec(name, args, env)
 }

+func Prlimit(pid, resource int, limit syscall.Rlimit) error {
+	_, _, err := syscall.RawSyscall6(syscall.SYS_PRLIMIT64, uintptr(pid), uintptr(resource), uintptr(unsafe.Pointer(&limit)), uintptr(unsafe.Pointer(&limit)), 0, 0)
+	if err != 0 {
+		return err
+	}
+	return nil
+}
+
 func SetParentDeathSignal(sig uintptr) error {
 	if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, sig, 0); err != 0 {
 		return err
@@ -79,17 +100,12 @@ func Setctty() error {
 	return nil
 }

-/*
- * Detect whether we are currently running in a user namespace.
- * Copied from github.com/lxc/lxd/shared/util.go
- */
+// RunningInUserNS detects whether we are currently running in a user namespace.
+// Copied from github.com/lxc/lxd/shared/util.go
 func RunningInUserNS() bool {
 	file, err := os.Open("/proc/self/uid_map")
 	if err != nil {
-		/*
-		 * This kernel-provided file only exists if user namespaces are
-		 * supported
-		 */
+		// This kernel-provided file only exists if user namespaces are supported
 		return false
 	}
 	defer file.Close()
@@ -112,3 +128,16 @@ func RunningInUserNS() bool {
 	}
 	return true
 }
+
+// SetSubreaper sets the value i as the subreaper setting for the calling process
+func SetSubreaper(i int) error {
+	return Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
+}
+
+func Prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) {
+	_, _, e1 := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0)
+	if e1 != 0 {
+		err = e1
+	}
+	return
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go
@@ -0,0 +1,9 @@
+// +build !linux
+
+package system
+
+// RunningInUserNS is a stub for non-Linux systems
+// Always returns false
+func RunningInUserNS() bool {
+	return false
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go
@@ -2,13 +2,15 @@ package user

 import (
 	"errors"
-	"fmt"
 	"syscall"
 )

 var (
 	// The current operating system does not provide the required data for user lookups.
 	ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
+	// No matching entries found in file.
+	ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
+	ErrNoGroupEntries  = errors.New("no matching entries in group file")
 )

 func lookupUser(filter func(u User) bool) (User, error) {
@@ -27,7 +29,7 @@ func lookupUser(filter func(u User) bool) (User, error) {

 	// No user entries found.
 	if len(users) == 0 {
-		return User{}, fmt.Errorf("no matching entries in passwd file")
+		return User{}, ErrNoPasswdEntries
 	}

 	// Assume the first entry is the "correct" one.
@@ -75,7 +77,7 @@ func lookupGroup(filter func(g Group) bool) (Group, error) {

 	// No user entries found.
 	if len(groups) == 0 {
-		return Group{}, fmt.Errorf("no matching entries in group file")
+		return Group{}, ErrNoGroupEntries
 	}

 	// Assume the first entry is the "correct" one.
--- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go
@@ -15,7 +15,7 @@ const (
 )

 var (
-	ErrRange = fmt.Errorf("Uids and gids must be in range %d-%d", minId, maxId)
+	ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
 )

 type User struct {
@@ -42,29 +42,30 @@ func parseLine(line string, v ...interface{}) {

 	parts := strings.Split(line, ":")
 	for i, p := range parts {
+		// Ignore cases where we don't have enough fields to populate the arguments.
+		// Some configuration files like to misbehave.
 		if len(v) <= i {
-			// if we have more "parts" than we have places to put them, bail for great "tolerance" of naughty configuration files
 			break
 		}

+		// Use the type of the argument to figure out how to parse it, scanf() style.
+		// This is legit.
 		switch e := v[i].(type) {
 		case *string:
-			// "root", "adm", "/bin/bash"
 			*e = p
 		case *int:
-			// "0", "4", "1000"
-			// ignore string to int conversion errors, for great "tolerance" of naughty configuration files
+			// "numbers", with conversion errors ignored because of some misbehaving configuration files.
 			*e, _ = strconv.Atoi(p)
 		case *[]string:
-			// "", "root", "root,adm,daemon"
+			// Comma-separated lists.
 			if p != "" {
 				*e = strings.Split(p, ",")
 			} else {
 				*e = []string{}
 			}
 		default:
-			// panic, because this is a programming/logic error, not a runtime one
-			panic("parseLine expects only pointers!  argument " + strconv.Itoa(i) + " is not a pointer!")
+			// Someone goof'd when writing code using this function. Scream so they can hear us.
+			panic(fmt.Sprintf("parseLine only accepts {*string, *int, *[]string} as arguments! %#v is not a pointer!", e))
 		}
 	}
 }
@@ -106,8 +107,8 @@ func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
 			return nil, err
 		}

-		text := strings.TrimSpace(s.Text())
-		if text == "" {
+		line := strings.TrimSpace(s.Text())
+		if line == "" {
 			continue
 		}

@@ -117,10 +118,7 @@ func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
 		//  root:x:0:0:root:/root:/bin/bash
 		//  adm:x:3:4:adm:/var/adm:/bin/false
 		p := User{}
-		parseLine(
-			text,
-			&p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell,
-		)
+		parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell)

 		if filter == nil || filter(p) {
 			out = append(out, p)
@@ -135,6 +133,7 @@ func ParseGroupFile(path string) ([]Group, error) {
 	if err != nil {
 		return nil, err
 	}
+
 	defer group.Close()
 	return ParseGroup(group)
 }
@@ -178,10 +177,7 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
 		//  root:x:0:root
 		//  adm:x:4:root,adm,daemon
 		p := Group{}
-		parseLine(
-			text,
-			&p.Name, &p.Pass, &p.Gid, &p.List,
-		)
+		parseLine(text, &p.Name, &p.Pass, &p.Gid, &p.List)

 		if filter == nil || filter(p) {
 			out = append(out, p)
@@ -192,9 +188,10 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
 }

 type ExecUser struct {
-	Uid, Gid int
-	Sgids    []int
-	Home     string
+	Uid   int
+	Gid   int
+	Sgids []int
+	Home  string
 }

 // GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the
@@ -235,12 +232,12 @@ func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath
 //     * "uid:gid
 //     * "user:gid"
 //     * "uid:group"
+//
+// It should be noted that if you specify a numeric user or group id, they will
+// not be evaluated as usernames (only the metadata will be filled). So attempting
+// to parse a user with user.Name = "1337" will produce the user with a UID of
+// 1337.
 func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) {
-	var (
-		userArg, groupArg string
-		name              string
-	)
-
 	if defaults == nil {
 		defaults = new(ExecUser)
 	}
@@ -258,87 +255,113 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (
 		user.Sgids = []int{}
 	}

-	// allow for userArg to have either "user" syntax, or optionally "user:group" syntax
+	// Allow for userArg to have either "user" syntax, or optionally "user:group" syntax
+	var userArg, groupArg string
 	parseLine(userSpec, &userArg, &groupArg)

+	// Convert userArg and groupArg to be numeric, so we don't have to execute
+	// Atoi *twice* for each iteration over lines.
+	uidArg, uidErr := strconv.Atoi(userArg)
+	gidArg, gidErr := strconv.Atoi(groupArg)
+
+	// Find the matching user.
 	users, err := ParsePasswdFilter(passwd, func(u User) bool {
 		if userArg == "" {
+			// Default to current state of the user.
 			return u.Uid == user.Uid
 		}
-		return u.Name == userArg || strconv.Itoa(u.Uid) == userArg
+
+		if uidErr == nil {
+			// If the userArg is numeric, always treat it as a UID.
+			return uidArg == u.Uid
+		}
+
+		return u.Name == userArg
 	})
+
+	// If we can't find the user, we have to bail.
 	if err != nil && passwd != nil {
 		if userArg == "" {
 			userArg = strconv.Itoa(user.Uid)
 		}
-		return nil, fmt.Errorf("Unable to find user %v: %v", userArg, err)
+		return nil, fmt.Errorf("unable to find user %s: %v", userArg, err)
 	}

-	haveUser := users != nil && len(users) > 0
-	if haveUser {
-		// if we found any user entries that matched our filter, let's take the first one as "correct"
-		name = users[0].Name
+	var matchedUserName string
+	if len(users) > 0 {
+		// First match wins, even if there's more than one matching entry.
+		matchedUserName = users[0].Name
 		user.Uid = users[0].Uid
 		user.Gid = users[0].Gid
 		user.Home = users[0].Home
 	} else if userArg != "" {
-		// we asked for a user but didn't find them...  let's check to see if we wanted a numeric user
-		user.Uid, err = strconv.Atoi(userArg)
-		if err != nil {
-			// not numeric - we have to bail
-			return nil, fmt.Errorf("Unable to find user %v", userArg)
+		// If we can't find a user with the given username, the only other valid
+		// option is if it's a numeric username with no associated entry in passwd.
+
+		if uidErr != nil {
+			// Not numeric.
+			return nil, fmt.Errorf("unable to find user %s: %v", userArg, ErrNoPasswdEntries)
 		}
+		user.Uid = uidArg

 		// Must be inside valid uid range.
 		if user.Uid < minId || user.Uid > maxId {
 			return nil, ErrRange
 		}

-		// if userArg couldn't be found in /etc/passwd but is numeric, just roll with it - this is legit
+		// Okay, so it's numeric. We can just roll with this.
 	}

-	if groupArg != "" || name != "" {
+	// On to the groups. If we matched a username, we need to do this because of
+	// the supplementary group IDs.
+	if groupArg != "" || matchedUserName != "" {
 		groups, err := ParseGroupFilter(group, func(g Group) bool {
-			// Explicit group format takes precedence.
-			if groupArg != "" {
-				return g.Name == groupArg || strconv.Itoa(g.Gid) == groupArg
-			}
-
-			// Check if user is a member.
-			for _, u := range g.List {
-				if u == name {
-					return true
+			// If the group argument isn't explicit, we'll just search for it.
+			if groupArg == "" {
+				// Check if user is a member of this group.
+				for _, u := range g.List {
+					if u == matchedUserName {
+						return true
+					}
 				}
+				return false
 			}

-			return false
+			if gidErr == nil {
+				// If the groupArg is numeric, always treat it as a GID.
+				return gidArg == g.Gid
+			}
+
+			return g.Name == groupArg
 		})
 		if err != nil && group != nil {
-			return nil, fmt.Errorf("Unable to find groups for user %v: %v", users[0].Name, err)
+			return nil, fmt.Errorf("unable to find groups for spec %v: %v", matchedUserName, err)
 		}

-		haveGroup := groups != nil && len(groups) > 0
+		// Only start modifying user.Gid if it is in explicit form.
 		if groupArg != "" {
-			if haveGroup {
-				// if we found any group entries that matched our filter, let's take the first one as "correct"
+			if len(groups) > 0 {
+				// First match wins, even if there's more than one matching entry.
 				user.Gid = groups[0].Gid
-			} else {
-				// we asked for a group but didn't find id...  let's check to see if we wanted a numeric group
-				user.Gid, err = strconv.Atoi(groupArg)
-				if err != nil {
-					// not numeric - we have to bail
-					return nil, fmt.Errorf("Unable to find group %v", groupArg)
-				}
+			} else if groupArg != "" {
+				// If we can't find a group with the given name, the only other valid
+				// option is if it's a numeric group name with no associated entry in group.

-				// Ensure gid is inside gid range.
+				if gidErr != nil {
+					// Not numeric.
+					return nil, fmt.Errorf("unable to find group %s: %v", groupArg, ErrNoGroupEntries)
+				}
+				user.Gid = gidArg
+
+				// Must be inside valid gid range.
 				if user.Gid < minId || user.Gid > maxId {
 					return nil, ErrRange
 				}

-				// if groupArg couldn't be found in /etc/group but is numeric, just roll with it - this is legit
+				// Okay, so it's numeric. We can just roll with this.
 			}
-		} else if haveGroup {
-			// If implicit group format, fill supplementary gids.
+		} else if len(groups) > 0 {
+			// Supplementary group ids only make sense if in the implicit form.
 			user.Sgids = make([]int, len(groups))
 			for i, group := range groups {
 				user.Sgids[i] = group.Gid
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
@@ -5,7 +5,9 @@ import (
 	"encoding/hex"
 	"encoding/json"
 	"io"
+	"os"
 	"path/filepath"
+	"strings"
 	"syscall"
 )

@@ -54,3 +56,66 @@ func WriteJSON(w io.Writer, v interface{}) error {
 	_, err = w.Write(data)
 	return err
 }
+
+// CleanPath makes a path safe for use with filepath.Join. This is done by not
+// only cleaning the path, but also (if the path is relative) adding a leading
+// '/' and cleaning it (then removing the leading '/'). This ensures that a
+// path resulting from prepending another path will always resolve to lexically
+// be a subdirectory of the prefixed path. This is all done lexically, so paths
+// that include symlinks won't be safe as a result of using CleanPath.
+func CleanPath(path string) string {
+	// Deal with empty strings nicely.
+	if path == "" {
+		return ""
+	}
+
+	// Ensure that all paths are cleaned (especially problematic ones like
+	// "/../../../../../" which can cause lots of issues).
+	path = filepath.Clean(path)
+
+	// If the path isn't absolute, we need to do more processing to fix paths
+	// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
+	// paths to relative ones.
+	if !filepath.IsAbs(path) {
+		path = filepath.Clean(string(os.PathSeparator) + path)
+		// This can't fail, as (by definition) all paths are relative to root.
+		path, _ = filepath.Rel(string(os.PathSeparator), path)
+	}
+
+	// Clean the path again for good measure.
+	return filepath.Clean(path)
+}
+
+// SearchLabels searches a list of key-value pairs for the provided key and
+// returns the corresponding value. The pairs must be separated with '='.
+func SearchLabels(labels []string, query string) string {
+	for _, l := range labels {
+		parts := strings.SplitN(l, "=", 2)
+		if len(parts) < 2 {
+			continue
+		}
+		if parts[0] == query {
+			return parts[1]
+		}
+	}
+	return ""
+}
+
+// Annotations returns the bundle path and user defined annotations from the
+// libcontainer state.  We need to remove the bundle because that is a label
+// added by libcontainer.
+func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
+	userAnnotations = make(map[string]string)
+	for _, l := range labels {
+		parts := strings.SplitN(l, "=", 2)
+		if len(parts) < 2 {
+			continue
+		}
+		if parts[0] == "bundle" {
+			bundle = parts[1]
+		} else {
+			userAnnotations[parts[0]] = parts[1]
+		}
+	}
+	return
+}