330
vendor/github.com/opencontainers/runc/libcontainer/README.md
generated
vendored
330
vendor/github.com/opencontainers/runc/libcontainer/README.md
generated
vendored
@@ -1,330 +0,0 @@
|
||||
# libcontainer
|
||||
|
||||
[](https://godoc.org/github.com/opencontainers/runc/libcontainer)
|
||||
|
||||
Libcontainer provides a native Go implementation for creating containers
|
||||
with namespaces, cgroups, capabilities, and filesystem access controls.
|
||||
It allows you to manage the lifecycle of the container performing additional operations
|
||||
after the container is created.
|
||||
|
||||
|
||||
#### Container
|
||||
A container is a self contained execution environment that shares the kernel of the
|
||||
host system and which is (optionally) isolated from other containers in the system.
|
||||
|
||||
#### Using libcontainer
|
||||
|
||||
Because containers are spawned in a two step process you will need a binary that
|
||||
will be executed as the init process for the container. In libcontainer, we use
|
||||
the current binary (/proc/self/exe) to be executed as the init process, and use
|
||||
arg "init", we call the first step process "bootstrap", so you always need a "init"
|
||||
function as the entry of "bootstrap".
|
||||
|
||||
In addition to the go init function the early stage bootstrap is handled by importing
|
||||
[nsenter](https://github.com/opencontainers/runc/blob/master/libcontainer/nsenter/README.md).
|
||||
|
||||
```go
|
||||
import (
|
||||
_ "github.com/opencontainers/runc/libcontainer/nsenter"
|
||||
)
|
||||
|
||||
func init() {
|
||||
if len(os.Args) > 1 && os.Args[1] == "init" {
|
||||
runtime.GOMAXPROCS(1)
|
||||
runtime.LockOSThread()
|
||||
factory, _ := libcontainer.New("")
|
||||
if err := factory.StartInitialization(); err != nil {
|
||||
logrus.Fatal(err)
|
||||
}
|
||||
panic("--this line should have never been executed, congratulations--")
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Then to create a container you first have to initialize an instance of a factory
|
||||
that will handle the creation and initialization for a container.
|
||||
|
||||
```go
|
||||
factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
|
||||
if err != nil {
|
||||
logrus.Fatal(err)
|
||||
return
|
||||
}
|
||||
```
|
||||
|
||||
Once you have an instance of the factory created we can create a configuration
|
||||
struct describing how the container is to be created. A sample would look similar to this:
|
||||
|
||||
```go
|
||||
defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
|
||||
config := &configs.Config{
|
||||
Rootfs: "/your/path/to/rootfs",
|
||||
Capabilities: &configs.Capabilities{
|
||||
Bounding: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
Effective: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
Inheritable: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
Permitted: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
Ambient: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
},
|
||||
Namespaces: configs.Namespaces([]configs.Namespace{
|
||||
{Type: configs.NEWNS},
|
||||
{Type: configs.NEWUTS},
|
||||
{Type: configs.NEWIPC},
|
||||
{Type: configs.NEWPID},
|
||||
{Type: configs.NEWUSER},
|
||||
{Type: configs.NEWNET},
|
||||
{Type: configs.NEWCGROUP},
|
||||
}),
|
||||
Cgroups: &configs.Cgroup{
|
||||
Name: "test-container",
|
||||
Parent: "system",
|
||||
Resources: &configs.Resources{
|
||||
MemorySwappiness: nil,
|
||||
Devices: specconv.AllowedDevices,
|
||||
},
|
||||
},
|
||||
MaskPaths: []string{
|
||||
"/proc/kcore",
|
||||
"/sys/firmware",
|
||||
},
|
||||
ReadonlyPaths: []string{
|
||||
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
|
||||
},
|
||||
Devices: specconv.AllowedDevices,
|
||||
Hostname: "testing",
|
||||
Mounts: []*configs.Mount{
|
||||
{
|
||||
Source: "proc",
|
||||
Destination: "/proc",
|
||||
Device: "proc",
|
||||
Flags: defaultMountFlags,
|
||||
},
|
||||
{
|
||||
Source: "tmpfs",
|
||||
Destination: "/dev",
|
||||
Device: "tmpfs",
|
||||
Flags: unix.MS_NOSUID | unix.MS_STRICTATIME,
|
||||
Data: "mode=755",
|
||||
},
|
||||
{
|
||||
Source: "devpts",
|
||||
Destination: "/dev/pts",
|
||||
Device: "devpts",
|
||||
Flags: unix.MS_NOSUID | unix.MS_NOEXEC,
|
||||
Data: "newinstance,ptmxmode=0666,mode=0620,gid=5",
|
||||
},
|
||||
{
|
||||
Device: "tmpfs",
|
||||
Source: "shm",
|
||||
Destination: "/dev/shm",
|
||||
Data: "mode=1777,size=65536k",
|
||||
Flags: defaultMountFlags,
|
||||
},
|
||||
{
|
||||
Source: "mqueue",
|
||||
Destination: "/dev/mqueue",
|
||||
Device: "mqueue",
|
||||
Flags: defaultMountFlags,
|
||||
},
|
||||
{
|
||||
Source: "sysfs",
|
||||
Destination: "/sys",
|
||||
Device: "sysfs",
|
||||
Flags: defaultMountFlags | unix.MS_RDONLY,
|
||||
},
|
||||
},
|
||||
UidMappings: []configs.IDMap{
|
||||
{
|
||||
ContainerID: 0,
|
||||
HostID: 1000,
|
||||
Size: 65536,
|
||||
},
|
||||
},
|
||||
GidMappings: []configs.IDMap{
|
||||
{
|
||||
ContainerID: 0,
|
||||
HostID: 1000,
|
||||
Size: 65536,
|
||||
},
|
||||
},
|
||||
Networks: []*configs.Network{
|
||||
{
|
||||
Type: "loopback",
|
||||
Address: "127.0.0.1/0",
|
||||
Gateway: "localhost",
|
||||
},
|
||||
},
|
||||
Rlimits: []configs.Rlimit{
|
||||
{
|
||||
Type: unix.RLIMIT_NOFILE,
|
||||
Hard: uint64(1025),
|
||||
Soft: uint64(1025),
|
||||
},
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
Once you have the configuration populated you can create a container:
|
||||
|
||||
```go
|
||||
container, err := factory.Create("container-id", config)
|
||||
if err != nil {
|
||||
logrus.Fatal(err)
|
||||
return
|
||||
}
|
||||
```
|
||||
|
||||
To spawn bash as the initial process inside the container and have the
|
||||
processes pid returned in order to wait, signal, or kill the process:
|
||||
|
||||
```go
|
||||
process := &libcontainer.Process{
|
||||
Args: []string{"/bin/bash"},
|
||||
Env: []string{"PATH=/bin"},
|
||||
User: "daemon",
|
||||
Stdin: os.Stdin,
|
||||
Stdout: os.Stdout,
|
||||
Stderr: os.Stderr,
|
||||
Init: true,
|
||||
}
|
||||
|
||||
err := container.Run(process)
|
||||
if err != nil {
|
||||
container.Destroy()
|
||||
logrus.Fatal(err)
|
||||
return
|
||||
}
|
||||
|
||||
// wait for the process to finish.
|
||||
_, err := process.Wait()
|
||||
if err != nil {
|
||||
logrus.Fatal(err)
|
||||
}
|
||||
|
||||
// destroy the container.
|
||||
container.Destroy()
|
||||
```
|
||||
|
||||
Additional ways to interact with a running container are:
|
||||
|
||||
```go
|
||||
// return all the pids for all processes running inside the container.
|
||||
processes, err := container.Processes()
|
||||
|
||||
// get detailed cpu, memory, io, and network statistics for the container and
|
||||
// it's processes.
|
||||
stats, err := container.Stats()
|
||||
|
||||
// pause all processes inside the container.
|
||||
container.Pause()
|
||||
|
||||
// resume all paused processes.
|
||||
container.Resume()
|
||||
|
||||
// send signal to container's init process.
|
||||
container.Signal(signal)
|
||||
|
||||
// update container resource constraints.
|
||||
container.Set(config)
|
||||
|
||||
// get current status of the container.
|
||||
status, err := container.Status()
|
||||
|
||||
// get current container's state information.
|
||||
state, err := container.State()
|
||||
```
|
||||
|
||||
|
||||
#### Checkpoint & Restore
|
||||
|
||||
libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
|
||||
This let's you save the state of a process running inside a container to disk, and then restore
|
||||
that state into a new process, on the same machine or on another machine.
|
||||
|
||||
`criu` version 1.5.2 or higher is required to use checkpoint and restore.
|
||||
If you don't already have `criu` installed, you can build it from source, following the
|
||||
[online instructions](http://criu.org/Installation). `criu` is also installed in the docker image
|
||||
generated when building libcontainer with docker.
|
||||
|
||||
|
||||
## Copyright and license
|
||||
|
||||
Code and documentation copyright 2014 Docker, inc.
|
||||
The code and documentation are released under the [Apache 2.0 license](../LICENSE).
|
||||
The documentation is also released under Creative Commons Attribution 4.0 International License.
|
||||
You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/.
|
||||
44
vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md
generated
vendored
44
vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md
generated
vendored
@@ -1,44 +0,0 @@
|
||||
## nsenter
|
||||
|
||||
The `nsenter` package registers a special init constructor that is called before
|
||||
the Go runtime has a chance to boot. This provides us the ability to `setns` on
|
||||
existing namespaces and avoid the issues that the Go runtime has with multiple
|
||||
threads. This constructor will be called if this package is registered,
|
||||
imported, in your go application.
|
||||
|
||||
The `nsenter` package will `import "C"` and it uses [cgo](https://golang.org/cmd/cgo/)
|
||||
package. In cgo, if the import of "C" is immediately preceded by a comment, that comment,
|
||||
called the preamble, is used as a header when compiling the C parts of the package.
|
||||
So every time we import package `nsenter`, the C code function `nsexec()` would be
|
||||
called. And package `nsenter` is only imported in `init.go`, so every time the runc
|
||||
`init` command is invoked, that C code is run.
|
||||
|
||||
Because `nsexec()` must be run before the Go runtime in order to use the
|
||||
Linux kernel namespace, you must `import` this library into a package if
|
||||
you plan to use `libcontainer` directly. Otherwise Go will not execute
|
||||
the `nsexec()` constructor, which means that the re-exec will not cause
|
||||
the namespaces to be joined. You can import it like this:
|
||||
|
||||
```go
|
||||
import _ "github.com/opencontainers/runc/libcontainer/nsenter"
|
||||
```
|
||||
|
||||
`nsexec()` will first get the file descriptor number for the init pipe
|
||||
from the environment variable `_LIBCONTAINER_INITPIPE` (which was opened
|
||||
by the parent and kept open across the fork-exec of the `nsexec()` init
|
||||
process). The init pipe is used to read bootstrap data (namespace paths,
|
||||
clone flags, uid and gid mappings, and the console path) from the parent
|
||||
process. `nsexec()` will then call `setns(2)` to join the namespaces
|
||||
provided in the bootstrap data (if available), `clone(2)` a child process
|
||||
with the provided clone flags, update the user and group ID mappings, do
|
||||
some further miscellaneous setup steps, and then send the PID of the
|
||||
child process to the parent of the `nsexec()` "caller". Finally,
|
||||
the parent `nsexec()` will exit and the child `nsexec()` process will
|
||||
return to allow the Go runtime take over.
|
||||
|
||||
NOTE: We do both `setns(2)` and `clone(2)` even if we don't have any
|
||||
`CLONE_NEW*` clone flags because we must fork a new process in order to
|
||||
enter the PID namespace.
|
||||
|
||||
|
||||
|
||||
542
vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c
generated
vendored
542
vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c
generated
vendored
@@ -1,542 +0,0 @@
|
||||
// SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
|
||||
/*
|
||||
* Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
|
||||
* Copyright (C) 2019 SUSE LLC
|
||||
*
|
||||
* This work is dual licensed under the following licenses. You may use,
|
||||
* redistribute, and/or modify the work under the conditions of either (or
|
||||
* both) licenses.
|
||||
*
|
||||
* === Apache-2.0 ===
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* === LGPL-2.1-or-later ===
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library. If not, see
|
||||
* <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*/
|
||||
|
||||
#define _GNU_SOURCE
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/statfs.h>
|
||||
#include <sys/vfs.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/mount.h>
|
||||
#include <sys/sendfile.h>
|
||||
#include <sys/syscall.h>
|
||||
|
||||
/* Use our own wrapper for memfd_create. */
|
||||
#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
|
||||
# define SYS_memfd_create __NR_memfd_create
|
||||
#endif
|
||||
/* memfd_create(2) flags -- copied from <linux/memfd.h>. */
|
||||
#ifndef MFD_CLOEXEC
|
||||
# define MFD_CLOEXEC 0x0001U
|
||||
# define MFD_ALLOW_SEALING 0x0002U
|
||||
#endif
|
||||
int memfd_create(const char *name, unsigned int flags)
|
||||
{
|
||||
#ifdef SYS_memfd_create
|
||||
return syscall(SYS_memfd_create, name, flags);
|
||||
#else
|
||||
errno = ENOSYS;
|
||||
return -1;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/* This comes directly from <linux/fcntl.h>. */
|
||||
#ifndef F_LINUX_SPECIFIC_BASE
|
||||
# define F_LINUX_SPECIFIC_BASE 1024
|
||||
#endif
|
||||
#ifndef F_ADD_SEALS
|
||||
# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
|
||||
# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
|
||||
#endif
|
||||
#ifndef F_SEAL_SEAL
|
||||
# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
|
||||
# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
|
||||
# define F_SEAL_GROW 0x0004 /* prevent file from growing */
|
||||
# define F_SEAL_WRITE 0x0008 /* prevent writes */
|
||||
#endif
|
||||
|
||||
#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
|
||||
#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
|
||||
#define RUNC_MEMFD_SEALS \
|
||||
(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
|
||||
|
||||
static void *must_realloc(void *ptr, size_t size)
|
||||
{
|
||||
void *old = ptr;
|
||||
do {
|
||||
ptr = realloc(old, size);
|
||||
} while(!ptr);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify whether we are currently in a self-cloned program (namely, is
|
||||
* /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
|
||||
* for shmem files), and we want to be sure it's actually sealed.
|
||||
*/
|
||||
static int is_self_cloned(void)
|
||||
{
|
||||
int fd, ret, is_cloned = 0;
|
||||
struct stat statbuf = {};
|
||||
struct statfs fsbuf = {};
|
||||
|
||||
fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
|
||||
if (fd < 0) {
|
||||
fprintf(stderr, "you have no read access to runc binary file\n");
|
||||
return -ENOTRECOVERABLE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
|
||||
* this, because you cannot write to a sealed memfd no matter what (so
|
||||
* sharing it isn't a bad thing -- and an admin could bind-mount a sealed
|
||||
* memfd to /usr/bin/runc to allow re-use).
|
||||
*/
|
||||
ret = fcntl(fd, F_GET_SEALS);
|
||||
if (ret >= 0) {
|
||||
is_cloned = (ret == RUNC_MEMFD_SEALS);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* All other forms require CLONED_BINARY_ENV, since they are potentially
|
||||
* writeable (or we can't tell if they're fully safe) and thus we must
|
||||
* check the environment as an extra layer of defence.
|
||||
*/
|
||||
if (!getenv(CLONED_BINARY_ENV)) {
|
||||
is_cloned = false;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Is the binary on a read-only filesystem? We can't detect bind-mounts in
|
||||
* particular (in-kernel they are identical to regular mounts) but we can
|
||||
* at least be sure that it's read-only. In addition, to make sure that
|
||||
* it's *our* bind-mount we check CLONED_BINARY_ENV.
|
||||
*/
|
||||
if (fstatfs(fd, &fsbuf) >= 0)
|
||||
is_cloned |= (fsbuf.f_flags & MS_RDONLY);
|
||||
|
||||
/*
|
||||
* Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
|
||||
* which appears to have a borked backport of F_GET_SEALS. Either way,
|
||||
* having a file which has no hardlinks indicates that we aren't using
|
||||
* a host-side "runc" binary and this is something that a container
|
||||
* cannot fake (because unlinking requires being able to resolve the
|
||||
* path that you want to unlink).
|
||||
*/
|
||||
if (fstat(fd, &statbuf) >= 0)
|
||||
is_cloned |= (statbuf.st_nlink == 0);
|
||||
|
||||
out:
|
||||
close(fd);
|
||||
return is_cloned;
|
||||
}
|
||||
|
||||
/* Read a given file into a new buffer, and providing the length. */
|
||||
static char *read_file(char *path, size_t *length)
|
||||
{
|
||||
int fd;
|
||||
char buf[4096], *copy = NULL;
|
||||
|
||||
if (!length)
|
||||
return NULL;
|
||||
|
||||
fd = open(path, O_RDONLY | O_CLOEXEC);
|
||||
if (fd < 0)
|
||||
return NULL;
|
||||
|
||||
*length = 0;
|
||||
for (;;) {
|
||||
ssize_t n;
|
||||
|
||||
n = read(fd, buf, sizeof(buf));
|
||||
if (n < 0)
|
||||
goto error;
|
||||
if (!n)
|
||||
break;
|
||||
|
||||
copy = must_realloc(copy, (*length + n) * sizeof(*copy));
|
||||
memcpy(copy + *length, buf, n);
|
||||
*length += n;
|
||||
}
|
||||
close(fd);
|
||||
return copy;
|
||||
|
||||
error:
|
||||
close(fd);
|
||||
free(copy);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* A poor-man's version of "xargs -0". Basically parses a given block of
|
||||
* NUL-delimited data, within the given length and adds a pointer to each entry
|
||||
* to the array of pointers.
|
||||
*/
|
||||
static int parse_xargs(char *data, int data_length, char ***output)
|
||||
{
|
||||
int num = 0;
|
||||
char *cur = data;
|
||||
|
||||
if (!data || *output != NULL)
|
||||
return -1;
|
||||
|
||||
while (cur < data + data_length) {
|
||||
num++;
|
||||
*output = must_realloc(*output, (num + 1) * sizeof(**output));
|
||||
(*output)[num - 1] = cur;
|
||||
cur += strlen(cur) + 1;
|
||||
}
|
||||
(*output)[num] = NULL;
|
||||
return num;
|
||||
}
|
||||
|
||||
/*
|
||||
* "Parse" out argv from /proc/self/cmdline.
|
||||
* This is necessary because we are running in a context where we don't have a
|
||||
* main() that we can just get the arguments from.
|
||||
*/
|
||||
static int fetchve(char ***argv)
|
||||
{
|
||||
char *cmdline = NULL;
|
||||
size_t cmdline_size;
|
||||
|
||||
cmdline = read_file("/proc/self/cmdline", &cmdline_size);
|
||||
if (!cmdline)
|
||||
goto error;
|
||||
|
||||
if (parse_xargs(cmdline, cmdline_size, argv) <= 0)
|
||||
goto error;
|
||||
|
||||
return 0;
|
||||
|
||||
error:
|
||||
free(cmdline);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
enum {
|
||||
EFD_NONE = 0,
|
||||
EFD_MEMFD,
|
||||
EFD_FILE,
|
||||
};
|
||||
|
||||
/*
|
||||
* This comes from <linux/fcntl.h>. We can't hard-code __O_TMPFILE because it
|
||||
* changes depending on the architecture. If we don't have O_TMPFILE we always
|
||||
* have the mkostemp(3) fallback.
|
||||
*/
|
||||
#ifndef O_TMPFILE
|
||||
# if defined(__O_TMPFILE) && defined(O_DIRECTORY)
|
||||
# define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
|
||||
# endif
|
||||
#endif
|
||||
|
||||
static int make_execfd(int *fdtype)
|
||||
{
|
||||
int fd = -1;
|
||||
char template[PATH_MAX] = {0};
|
||||
char *prefix = getenv("_LIBCONTAINER_STATEDIR");
|
||||
|
||||
if (!prefix || *prefix != '/')
|
||||
prefix = "/tmp";
|
||||
if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
|
||||
return -1;
|
||||
|
||||
/*
|
||||
* Now try memfd, it's much nicer than actually creating a file in STATEDIR
|
||||
* since it's easily detected thanks to sealing and also doesn't require
|
||||
* assumptions about STATEDIR.
|
||||
*/
|
||||
*fdtype = EFD_MEMFD;
|
||||
fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
|
||||
if (fd >= 0)
|
||||
return fd;
|
||||
if (errno != ENOSYS && errno != EINVAL)
|
||||
goto error;
|
||||
|
||||
#ifdef O_TMPFILE
|
||||
/*
|
||||
* Try O_TMPFILE to avoid races where someone might snatch our file. Note
|
||||
* that O_EXCL isn't actually a security measure here (since you can just
|
||||
* fd re-open it and clear O_EXCL).
|
||||
*/
|
||||
*fdtype = EFD_FILE;
|
||||
fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
|
||||
if (fd >= 0) {
|
||||
struct stat statbuf = {};
|
||||
bool working_otmpfile = false;
|
||||
|
||||
/*
|
||||
* open(2) ignores unknown O_* flags -- yeah, I was surprised when I
|
||||
* found this out too. As a result we can't check for EINVAL. However,
|
||||
* if we get nlink != 0 (or EISDIR) then we know that this kernel
|
||||
* doesn't support O_TMPFILE.
|
||||
*/
|
||||
if (fstat(fd, &statbuf) >= 0)
|
||||
working_otmpfile = (statbuf.st_nlink == 0);
|
||||
|
||||
if (working_otmpfile)
|
||||
return fd;
|
||||
|
||||
/* Pretend that we got EISDIR since O_TMPFILE failed. */
|
||||
close(fd);
|
||||
errno = EISDIR;
|
||||
}
|
||||
if (errno != EISDIR)
|
||||
goto error;
|
||||
#endif /* defined(O_TMPFILE) */
|
||||
|
||||
/*
|
||||
* Our final option is to create a temporary file the old-school way, and
|
||||
* then unlink it so that nothing else sees it by accident.
|
||||
*/
|
||||
*fdtype = EFD_FILE;
|
||||
fd = mkostemp(template, O_CLOEXEC);
|
||||
if (fd >= 0) {
|
||||
if (unlink(template) >= 0)
|
||||
return fd;
|
||||
close(fd);
|
||||
}
|
||||
|
||||
error:
|
||||
*fdtype = EFD_NONE;
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int seal_execfd(int *fd, int fdtype)
|
||||
{
|
||||
switch (fdtype) {
|
||||
case EFD_MEMFD:
|
||||
return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
|
||||
case EFD_FILE: {
|
||||
/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
|
||||
int newfd;
|
||||
char fdpath[PATH_MAX] = {0};
|
||||
|
||||
if (fchmod(*fd, 0100) < 0)
|
||||
return -1;
|
||||
|
||||
if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
|
||||
return -1;
|
||||
|
||||
newfd = open(fdpath, O_PATH | O_CLOEXEC);
|
||||
if (newfd < 0)
|
||||
return -1;
|
||||
|
||||
close(*fd);
|
||||
*fd = newfd;
|
||||
return 0;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int try_bindfd(void)
|
||||
{
|
||||
int fd, ret = -1;
|
||||
char template[PATH_MAX] = {0};
|
||||
char *prefix = getenv("_LIBCONTAINER_STATEDIR");
|
||||
|
||||
if (!prefix || *prefix != '/')
|
||||
prefix = "/tmp";
|
||||
if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* We need somewhere to mount it, mounting anything over /proc/self is a
|
||||
* BAD idea on the host -- even if we do it temporarily.
|
||||
*/
|
||||
fd = mkstemp(template);
|
||||
if (fd < 0)
|
||||
return ret;
|
||||
close(fd);
|
||||
|
||||
/*
|
||||
* For obvious reasons this won't work in rootless mode because we haven't
|
||||
* created a userns+mntns -- but getting that to work will be a bit
|
||||
* complicated and it's only worth doing if someone actually needs it.
|
||||
*/
|
||||
ret = -EPERM;
|
||||
if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
|
||||
goto out;
|
||||
if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
|
||||
goto out_umount;
|
||||
|
||||
|
||||
/* Get read-only handle that we're sure can't be made read-write. */
|
||||
ret = open(template, O_PATH | O_CLOEXEC);
|
||||
|
||||
out_umount:
|
||||
/*
|
||||
* Make sure the MNT_DETACH works, otherwise we could get remounted
|
||||
* read-write and that would be quite bad (the fd would be made read-write
|
||||
* too, invalidating the protection).
|
||||
*/
|
||||
if (umount2(template, MNT_DETACH) < 0) {
|
||||
if (ret >= 0)
|
||||
close(ret);
|
||||
ret = -ENOTRECOVERABLE;
|
||||
}
|
||||
|
||||
out:
|
||||
/*
|
||||
* We don't care about unlink errors, the worst that happens is that
|
||||
* there's an empty file left around in STATEDIR.
|
||||
*/
|
||||
unlink(template);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t fd_to_fd(int outfd, int infd)
|
||||
{
|
||||
ssize_t total = 0;
|
||||
char buffer[4096];
|
||||
|
||||
for (;;) {
|
||||
ssize_t nread, nwritten = 0;
|
||||
|
||||
nread = read(infd, buffer, sizeof(buffer));
|
||||
if (nread < 0)
|
||||
return -1;
|
||||
if (!nread)
|
||||
break;
|
||||
|
||||
do {
|
||||
ssize_t n = write(outfd, buffer + nwritten, nread - nwritten);
|
||||
if (n < 0)
|
||||
return -1;
|
||||
nwritten += n;
|
||||
} while(nwritten < nread);
|
||||
|
||||
total += nwritten;
|
||||
}
|
||||
|
||||
return total;
|
||||
}
|
||||
|
||||
static int clone_binary(void)
|
||||
{
|
||||
int binfd, execfd;
|
||||
struct stat statbuf = {};
|
||||
size_t sent = 0;
|
||||
int fdtype = EFD_NONE;
|
||||
|
||||
/*
|
||||
* Before we resort to copying, let's try creating an ro-binfd in one shot
|
||||
* by getting a handle for a read-only bind-mount of the execfd.
|
||||
*/
|
||||
execfd = try_bindfd();
|
||||
if (execfd >= 0)
|
||||
return execfd;
|
||||
|
||||
/*
|
||||
* Dammit, that didn't work -- time to copy the binary to a safe place we
|
||||
* can seal the contents.
|
||||
*/
|
||||
execfd = make_execfd(&fdtype);
|
||||
if (execfd < 0 || fdtype == EFD_NONE)
|
||||
return -ENOTRECOVERABLE;
|
||||
|
||||
binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
|
||||
if (binfd < 0)
|
||||
goto error;
|
||||
|
||||
if (fstat(binfd, &statbuf) < 0)
|
||||
goto error_binfd;
|
||||
|
||||
while (sent < statbuf.st_size) {
|
||||
int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent);
|
||||
if (n < 0) {
|
||||
/* sendfile can fail so we fallback to a dumb user-space copy. */
|
||||
n = fd_to_fd(execfd, binfd);
|
||||
if (n < 0)
|
||||
goto error_binfd;
|
||||
}
|
||||
sent += n;
|
||||
}
|
||||
close(binfd);
|
||||
if (sent != statbuf.st_size)
|
||||
goto error;
|
||||
|
||||
if (seal_execfd(&execfd, fdtype) < 0)
|
||||
goto error;
|
||||
|
||||
return execfd;
|
||||
|
||||
error_binfd:
|
||||
close(binfd);
|
||||
error:
|
||||
close(execfd);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
/* Get cheap access to the environment. */
|
||||
extern char **environ;
|
||||
|
||||
int ensure_cloned_binary(void)
|
||||
{
|
||||
int execfd;
|
||||
char **argv = NULL;
|
||||
|
||||
/* Check that we're not self-cloned, and if we are then bail. */
|
||||
int cloned = is_self_cloned();
|
||||
if (cloned > 0 || cloned == -ENOTRECOVERABLE)
|
||||
return cloned;
|
||||
|
||||
if (fetchve(&argv) < 0)
|
||||
return -EINVAL;
|
||||
|
||||
execfd = clone_binary();
|
||||
if (execfd < 0)
|
||||
return -EIO;
|
||||
|
||||
if (putenv(CLONED_BINARY_ENV "=1"))
|
||||
goto error;
|
||||
|
||||
fexecve(execfd, argv, environ);
|
||||
error:
|
||||
close(execfd);
|
||||
return -ENOEXEC;
|
||||
}
|
||||
32
vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h
generated
vendored
32
vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h
generated
vendored
@@ -1,32 +0,0 @@
|
||||
#ifndef NSENTER_NAMESPACE_H
|
||||
#define NSENTER_NAMESPACE_H
|
||||
|
||||
#ifndef _GNU_SOURCE
|
||||
# define _GNU_SOURCE
|
||||
#endif
|
||||
#include <sched.h>
|
||||
|
||||
/* All of these are taken from include/uapi/linux/sched.h */
|
||||
#ifndef CLONE_NEWNS
|
||||
# define CLONE_NEWNS 0x00020000 /* New mount namespace group */
|
||||
#endif
|
||||
#ifndef CLONE_NEWCGROUP
|
||||
# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
|
||||
#endif
|
||||
#ifndef CLONE_NEWUTS
|
||||
# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
|
||||
#endif
|
||||
#ifndef CLONE_NEWIPC
|
||||
# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
|
||||
#endif
|
||||
#ifndef CLONE_NEWUSER
|
||||
# define CLONE_NEWUSER 0x10000000 /* New user namespace */
|
||||
#endif
|
||||
#ifndef CLONE_NEWPID
|
||||
# define CLONE_NEWPID 0x20000000 /* New pid namespace */
|
||||
#endif
|
||||
#ifndef CLONE_NEWNET
|
||||
# define CLONE_NEWNET 0x40000000 /* New network namespace */
|
||||
#endif
|
||||
|
||||
#endif /* NSENTER_NAMESPACE_H */
|
||||
12
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go
generated
vendored
12
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go
generated
vendored
@@ -1,12 +0,0 @@
|
||||
// +build linux,!gccgo
|
||||
|
||||
package nsenter
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -Wall
|
||||
extern void nsexec();
|
||||
void __attribute__((constructor)) init(void) {
|
||||
nsexec();
|
||||
}
|
||||
*/
|
||||
import "C"
|
||||
25
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go
generated
vendored
25
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go
generated
vendored
@@ -1,25 +0,0 @@
|
||||
// +build linux,gccgo
|
||||
|
||||
package nsenter
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -Wall
|
||||
extern void nsexec();
|
||||
void __attribute__((constructor)) init(void) {
|
||||
nsexec();
|
||||
}
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// AlwaysFalse is here to stay false
|
||||
// (and be exported so the compiler doesn't optimize out its reference)
|
||||
var AlwaysFalse bool
|
||||
|
||||
func init() {
|
||||
if AlwaysFalse {
|
||||
// by referencing this C init() in a noop test, it will ensure the compiler
|
||||
// links in the C function.
|
||||
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134
|
||||
C.init()
|
||||
}
|
||||
}
|
||||
3
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go
generated
vendored
3
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go
generated
vendored
@@ -1,3 +0,0 @@
|
||||
// +build !linux !cgo
|
||||
|
||||
package nsenter
|
||||
1032
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
generated
vendored
1032
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
generated
vendored
File diff suppressed because it is too large
Load Diff
2
vendor/github.com/opencontainers/runc/libcontainer/user/MAINTAINERS
generated
vendored
Normal file
2
vendor/github.com/opencontainers/runc/libcontainer/user/MAINTAINERS
generated
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
Tianon Gravi <admwiggin@gmail.com> (@tianon)
|
||||
Aleksa Sarai <cyphar@cyphar.com> (@cyphar)
|
||||
Reference in New Issue
Block a user