containerd/runtime/v1/linux/proc/init.go
Michael Crosby e6ae9cc64f Shim pluggable logging
Closes #603

This adds logging facilities at the shim level to provide minimal I/O
overhead and pluggable logging options.  Log handling is done within the
shim so that all I/O, cpu, and memory can be charged to the container.

A sample logging driver that forwards a container's output to the systemd
journal looks like this:

```go
package main

import (
	"bufio"
	"context"
	"fmt"
	"io"
	"sync"

	"github.com/containerd/containerd/runtime/v2/logging"
	"github.com/coreos/go-systemd/journal"
)

func main() {
	logging.Run(log)
}

func log(ctx context.Context, config *logging.Config, ready func() error) error {
	// construct any log metadata for the container
	vars := map[string]string{
		"SYSLOG_IDENTIFIER": fmt.Sprintf("%s:%s", config.Namespace, config.ID),
	}
	var wg sync.WaitGroup
	wg.Add(2)
	// forward both stdout and stderr to the journal
	go copy(&wg, config.Stdout, journal.PriInfo, vars)
	go copy(&wg, config.Stderr, journal.PriErr, vars)

	// signal that we are ready and setup for the container to be started
	if err := ready(); err != nil {
		return err
	}
	wg.Wait()
	return nil
}

func copy(wg *sync.WaitGroup, r io.Reader, pri journal.Priority, vars map[string]string) {
	defer wg.Done()
	s := bufio.NewScanner(r)
	// Scan returns false on EOF or error, so the loop exits cleanly either way
	for s.Scan() {
		journal.Send(s.Text(), pri, vars)
	}
}
```

A `logging` package has been created to help developers write logging
plugins for containerd.

This uses a URI based approach for logging drivers that can be expanded
in the future.

Supported URI schemes are:

* binary
* fifo
* file
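
For illustration only (the exact host and path semantics of each scheme are
not spelled out in this change, so the values below are assumptions), such
URIs can be built with the standard `net/url` package:

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	// binary: exec a logging plugin binary from the shim; the install path
	// used here for the journald example above is hypothetical.
	binary := url.URL{Scheme: "binary", Path: "/usr/local/bin/shim-journald"}

	// fifo and file: send logs to a fifo or a plain file; both paths are
	// illustrative placeholders.
	fifo := url.URL{Scheme: "fifo", Path: "/run/containerd/redis-stdout"}
	file := url.URL{Scheme: "file", Path: "/var/log/containers/redis.log"}

	fmt.Println(binary.String())
	fmt.Println(fifo.String())
	fmt.Println(file.String())
}
```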

You can pass the log URI to ctr on the command line:

```bash
> ctr run --rm --runtime io.containerd.runc.v2 --log-uri binary://shim-journald docker.io/library/redis:alpine redis
```

```bash
> journalctl -f -t default:redis

-- Logs begin at Tue 2018-12-11 16:29:51 EST. --
Mar 08 16:08:22 deathstar default:redis[120760]: 1:C 08 Mar 2019 21:08:22.703 # Warning: no config file specified, using the default config. In order to specify a config file use redis-server /path/to/redis.conf
Mar 08 16:08:22 deathstar default:redis[120760]: 1:M 08 Mar 2019 21:08:22.704 # You requested maxclients of 10000 requiring at least 10032 max file descriptors.
Mar 08 16:08:22 deathstar default:redis[120760]: 1:M 08 Mar 2019 21:08:22.704 # Server can't set maximum open files to 10032 because of OS error: Operation not permitted.
Mar 08 16:08:22 deathstar default:redis[120760]: 1:M 08 Mar 2019 21:08:22.704 # Current maximum open files is 1024. maxclients has been reduced to 992 to compensate for low ulimit. If you need higher maxclients increase 'ulimit -n'.
Mar 08 16:08:22 deathstar default:redis[120760]: 1:M 08 Mar 2019 21:08:22.705 * Running mode=standalone, port=6379.
Mar 08 16:08:22 deathstar default:redis[120760]: 1:M 08 Mar 2019 21:08:22.705 # WARNING: The TCP backlog setting of 511 cannot be enforced because /proc/sys/net/core/somaxconn is set to the lower value of 128.
Mar 08 16:08:22 deathstar default:redis[120760]: 1:M 08 Mar 2019 21:08:22.705 # Server initialized
Mar 08 16:08:22 deathstar default:redis[120760]: 1:M 08 Mar 2019 21:08:22.705 # WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.
Mar 08 16:08:22 deathstar default:redis[120760]: 1:M 08 Mar 2019 21:08:22.705 # WARNING you have Transparent Huge Pages (THP) support enabled in your kernel. This will create latency and memory usage issues with Redis. To fix this issue run the command 'echo never > /sys/kernel/mm/transparent_hugepage/enabled' as root, and add it to your /etc/rc.local in order to retain the setting after a reboot. Redis must be restarted after THP is disabled.
Mar 08 16:08:22 deathstar default:redis[120760]: 1:M 08 Mar 2019 21:08:22.705 * Ready to accept connections
Mar 08 16:08:50 deathstar default:redis[120760]: 1:signal-handler (1552079330) Received SIGINT scheduling shutdown...
Mar 08 16:08:50 deathstar default:redis[120760]: 1:M 08 Mar 2019 21:08:50.405 # User requested shutdown...
Mar 08 16:08:50 deathstar default:redis[120760]: 1:M 08 Mar 2019 21:08:50.406 * Saving the final RDB snapshot before exiting.
Mar 08 16:08:50 deathstar default:redis[120760]: 1:M 08 Mar 2019 21:08:50.452 * DB saved on disk
Mar 08 16:08:50 deathstar default:redis[120760]: 1:M 08 Mar 2019 21:08:50.453 # Redis is now ready to exit, bye bye...
```

The following client-side Opts are added:

```go
// LogURI provides the raw logging URI
func LogURI(uri *url.URL) Creator {}

// BinaryIO forwards container STDOUT|STDERR directly to a logging binary
func BinaryIO(binary string, args map[string]string) Creator {}
```
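
A minimal sketch of wiring one of these Opts into a client when creating a
task; the `cio` import path, the `containerd` client types, and the `NewTask`
signature are assumptions based on the Go client API of this period rather
than part of this change:

```go
package example

import (
	"context"
	"net/url"

	"github.com/containerd/containerd"
	"github.com/containerd/containerd/cio"
)

// newTaskWithJournaldLogger is a hypothetical helper: it builds a binary log
// URI for the shim-journald plugin shown above and passes it as the task's
// IO creator, so the shim launches the plugin and feeds it the container's
// stdout and stderr.
func newTaskWithJournaldLogger(ctx context.Context, container containerd.Container) (containerd.Task, error) {
	u, err := url.Parse("binary:///usr/local/bin/shim-journald")
	if err != nil {
		return nil, err
	}
	return container.NewTask(ctx, cio.LogURI(u))
}
```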

Signed-off-by: Michael Crosby <crosbymichael@gmail.com>
2019-03-12 12:18:28 -04:00


// +build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package proc

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/containerd/console"
	"github.com/containerd/containerd/log"
	"github.com/containerd/containerd/mount"
	"github.com/containerd/containerd/runtime/proc"
	"github.com/containerd/fifo"
	runc "github.com/containerd/go-runc"
	google_protobuf "github.com/gogo/protobuf/types"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/pkg/errors"
)
// Init represents an initial process for a container
type Init struct {
	wg        sync.WaitGroup
	initState initState

	// mu is used to ensure that `Start()` and `Exited()` calls return in
	// the right order when invoked in separate go routines.
	// This is the case within the shim implementation as it makes use of
	// the reaper interface.
	mu sync.Mutex

	waitBlock chan struct{}

	WorkDir string

	id       string
	Bundle   string
	console  console.Console
	Platform proc.Platform
	io       *processIO
	runtime  *runc.Runc
	status   int
	exited   time.Time
	pid      int
	closers  []io.Closer
	stdin    io.Closer
	stdio    proc.Stdio
	Rootfs   string
	IoUID    int
	IoGID    int

	NoPivotRoot  bool
	NoNewKeyring bool
	CriuWorkPath string
}

// NewRunc returns a new runc instance for a process
func NewRunc(root, path, namespace, runtime, criu string, systemd bool) *runc.Runc {
	if root == "" {
		root = RuncRoot
	}
	return &runc.Runc{
		Command:       runtime,
		Log:           filepath.Join(path, "log.json"),
		LogFormat:     runc.JSON,
		PdeathSignal:  syscall.SIGKILL,
		Root:          filepath.Join(root, namespace),
		Criu:          criu,
		SystemdCgroup: systemd,
	}
}

// New returns a new process
func New(id string, runtime *runc.Runc, stdio proc.Stdio) *Init {
	p := &Init{
		id:        id,
		runtime:   runtime,
		stdio:     stdio,
		status:    0,
		waitBlock: make(chan struct{}),
	}
	p.initState = &createdState{p: p}
	return p
}
// Create the process with the provided config
func (p *Init) Create(ctx context.Context, r *CreateConfig) error {
	var (
		err     error
		socket  *runc.Socket
		pio     *processIO
		pidFile = newPidFile(p.Bundle)
	)

	if r.Terminal {
		if socket, err = runc.NewTempConsoleSocket(); err != nil {
			return errors.Wrap(err, "failed to create OCI runtime console socket")
		}
		defer socket.Close()
	} else {
		if pio, err = createIO(ctx, p.id, p.IoUID, p.IoGID, p.stdio); err != nil {
			return errors.Wrap(err, "failed to create init process I/O")
		}
		p.io = pio
	}
	if r.Checkpoint != "" {
		return p.createCheckpointedState(r, pidFile)
	}
	opts := &runc.CreateOpts{
		PidFile:      pidFile.Path(),
		NoPivot:      p.NoPivotRoot,
		NoNewKeyring: p.NoNewKeyring,
	}
	if p.io != nil {
		opts.IO = p.io.IO()
	}
	if socket != nil {
		opts.ConsoleSocket = socket
	}
	if err := p.runtime.Create(ctx, r.ID, r.Bundle, opts); err != nil {
		return p.runtimeError(err, "OCI runtime create failed")
	}
	if r.Stdin != "" {
		if err := p.openStdin(r.Stdin); err != nil {
			return err
		}
	}
	ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
	defer cancel()
	if socket != nil {
		console, err := socket.ReceiveMaster()
		if err != nil {
			return errors.Wrap(err, "failed to retrieve console master")
		}
		console, err = p.Platform.CopyConsole(ctx, console, r.Stdin, r.Stdout, r.Stderr, &p.wg)
		if err != nil {
			return errors.Wrap(err, "failed to start console copy")
		}
		p.console = console
	} else {
		if err := pio.Copy(ctx, &p.wg); err != nil {
			return errors.Wrap(err, "failed to start io pipe copy")
		}
	}
	pid, err := pidFile.Read()
	if err != nil {
		return errors.Wrap(err, "failed to retrieve OCI runtime container pid")
	}
	p.pid = pid
	return nil
}

func (p *Init) openStdin(path string) error {
	sc, err := fifo.OpenFifo(context.Background(), path, syscall.O_WRONLY|syscall.O_NONBLOCK, 0)
	if err != nil {
		return errors.Wrapf(err, "failed to open stdin fifo %s", path)
	}
	p.stdin = sc
	p.closers = append(p.closers, sc)
	return nil
}
func (p *Init) createCheckpointedState(r *CreateConfig, pidFile *pidFile) error {
	opts := &runc.RestoreOpts{
		CheckpointOpts: runc.CheckpointOpts{
			ImagePath:  r.Checkpoint,
			WorkDir:    p.CriuWorkPath,
			ParentPath: r.ParentCheckpoint,
		},
		PidFile:     pidFile.Path(),
		IO:          p.io.IO(),
		NoPivot:     p.NoPivotRoot,
		Detach:      true,
		NoSubreaper: true,
	}
	p.initState = &createdCheckpointState{
		p:    p,
		opts: opts,
	}
	return nil
}

// Wait for the process to exit
func (p *Init) Wait() {
	<-p.waitBlock
}

// ID of the process
func (p *Init) ID() string {
	return p.id
}

// Pid of the process
func (p *Init) Pid() int {
	return p.pid
}

// ExitStatus of the process
func (p *Init) ExitStatus() int {
	p.mu.Lock()
	defer p.mu.Unlock()

	return p.status
}

// ExitedAt at time when the process exited
func (p *Init) ExitedAt() time.Time {
	p.mu.Lock()
	defer p.mu.Unlock()

	return p.exited
}

// Status of the process
func (p *Init) Status(ctx context.Context) (string, error) {
	p.mu.Lock()
	defer p.mu.Unlock()

	c, err := p.runtime.State(ctx, p.id)
	if err != nil {
		if strings.Contains(err.Error(), "does not exist") {
			return "stopped", nil
		}
		return "", p.runtimeError(err, "OCI runtime state failed")
	}
	return c.Status, nil
}

// Start the init process
func (p *Init) Start(ctx context.Context) error {
	p.mu.Lock()
	defer p.mu.Unlock()

	return p.initState.Start(ctx)
}

func (p *Init) start(ctx context.Context) error {
	err := p.runtime.Start(ctx, p.id)
	return p.runtimeError(err, "OCI runtime start failed")
}

// SetExited of the init process with the next status
func (p *Init) SetExited(status int) {
	p.mu.Lock()
	defer p.mu.Unlock()

	p.initState.SetExited(status)
}

func (p *Init) setExited(status int) {
	p.exited = time.Now()
	p.status = status
	p.Platform.ShutdownConsole(context.Background(), p.console)
	close(p.waitBlock)
}
// Delete the init process
func (p *Init) Delete(ctx context.Context) error {
	p.mu.Lock()
	defer p.mu.Unlock()

	return p.initState.Delete(ctx)
}

func (p *Init) delete(ctx context.Context) error {
	p.wg.Wait()
	err := p.runtime.Delete(ctx, p.id, nil)
	// ignore errors if a runtime has already deleted the process
	// but we still hold metadata and pipes
	//
	// this is common during a checkpoint, runc will delete the container state
	// after a checkpoint and the container will no longer exist within runc
	if err != nil {
		if strings.Contains(err.Error(), "does not exist") {
			err = nil
		} else {
			err = p.runtimeError(err, "failed to delete task")
		}
	}
	if p.io != nil {
		for _, c := range p.closers {
			c.Close()
		}
		p.io.Close()
	}
	if err2 := mount.UnmountAll(p.Rootfs, 0); err2 != nil {
		log.G(ctx).WithError(err2).Warn("failed to cleanup rootfs mount")
		if err == nil {
			err = errors.Wrap(err2, "failed rootfs umount")
		}
	}
	return err
}

// Resize the init processes console
func (p *Init) Resize(ws console.WinSize) error {
	p.mu.Lock()
	defer p.mu.Unlock()

	if p.console == nil {
		return nil
	}
	return p.console.Resize(ws)
}

func (p *Init) resize(ws console.WinSize) error {
	if p.console == nil {
		return nil
	}
	return p.console.Resize(ws)
}

// Pause the init process and all its child processes
func (p *Init) Pause(ctx context.Context) error {
	p.mu.Lock()
	defer p.mu.Unlock()

	return p.initState.Pause(ctx)
}

// Resume the init process and all its child processes
func (p *Init) Resume(ctx context.Context) error {
	p.mu.Lock()
	defer p.mu.Unlock()

	return p.initState.Resume(ctx)
}

// Kill the init process
func (p *Init) Kill(ctx context.Context, signal uint32, all bool) error {
	p.mu.Lock()
	defer p.mu.Unlock()

	return p.initState.Kill(ctx, signal, all)
}

func (p *Init) kill(ctx context.Context, signal uint32, all bool) error {
	err := p.runtime.Kill(ctx, p.id, int(signal), &runc.KillOpts{
		All: all,
	})
	return checkKillError(err)
}

// KillAll processes belonging to the init process
func (p *Init) KillAll(ctx context.Context) error {
	p.mu.Lock()
	defer p.mu.Unlock()

	err := p.runtime.Kill(ctx, p.id, int(syscall.SIGKILL), &runc.KillOpts{
		All: true,
	})
	return p.runtimeError(err, "OCI runtime killall failed")
}
// Stdin of the process
func (p *Init) Stdin() io.Closer {
	return p.stdin
}

// Runtime returns the OCI runtime configured for the init process
func (p *Init) Runtime() *runc.Runc {
	return p.runtime
}

// Exec returns a new child process
func (p *Init) Exec(ctx context.Context, path string, r *ExecConfig) (proc.Process, error) {
	p.mu.Lock()
	defer p.mu.Unlock()

	return p.initState.Exec(ctx, path, r)
}

// exec returns a new exec'd process
func (p *Init) exec(ctx context.Context, path string, r *ExecConfig) (proc.Process, error) {
	// process exec request
	var spec specs.Process
	if err := json.Unmarshal(r.Spec.Value, &spec); err != nil {
		return nil, err
	}
	spec.Terminal = r.Terminal

	e := &execProcess{
		id:     r.ID,
		path:   path,
		parent: p,
		spec:   spec,
		stdio: proc.Stdio{
			Stdin:    r.Stdin,
			Stdout:   r.Stdout,
			Stderr:   r.Stderr,
			Terminal: r.Terminal,
		},
		waitBlock: make(chan struct{}),
		pid:       &safePid{},
	}
	e.execState = &execCreatedState{p: e}
	return e, nil
}
// Checkpoint the init process
func (p *Init) Checkpoint(ctx context.Context, r *CheckpointConfig) error {
	p.mu.Lock()
	defer p.mu.Unlock()

	return p.initState.Checkpoint(ctx, r)
}

func (p *Init) checkpoint(ctx context.Context, r *CheckpointConfig) error {
	var actions []runc.CheckpointAction
	if !r.Exit {
		actions = append(actions, runc.LeaveRunning)
	}
	// keep the criu work directory if one was provided; otherwise use a
	// temporary directory and remove it when done
	work := r.WorkDir
	if work == "" {
		work = filepath.Join(p.WorkDir, "criu-work")
		defer os.RemoveAll(work)
	}
	if err := p.runtime.Checkpoint(ctx, p.id, &runc.CheckpointOpts{
		WorkDir:                  work,
		ImagePath:                r.Path,
		AllowOpenTCP:             r.AllowOpenTCP,
		AllowExternalUnixSockets: r.AllowExternalUnixSockets,
		AllowTerminal:            r.AllowTerminal,
		FileLocks:                r.FileLocks,
		EmptyNamespaces:          r.EmptyNamespaces,
	}, actions...); err != nil {
		dumpLog := filepath.Join(p.Bundle, "criu-dump.log")
		if cerr := copyFile(dumpLog, filepath.Join(work, "dump.log")); cerr != nil {
			log.G(ctx).Error(cerr)
		}
		return fmt.Errorf("%s path= %s", criuError(err), dumpLog)
	}
	return nil
}
// Update the processes resource configuration
func (p *Init) Update(ctx context.Context, r *google_protobuf.Any) error {
	p.mu.Lock()
	defer p.mu.Unlock()

	return p.initState.Update(ctx, r)
}

func (p *Init) update(ctx context.Context, r *google_protobuf.Any) error {
	var resources specs.LinuxResources
	if err := json.Unmarshal(r.Value, &resources); err != nil {
		return err
	}
	return p.runtime.Update(ctx, p.id, &resources)
}

// Stdio of the process
func (p *Init) Stdio() proc.Stdio {
	return p.stdio
}

func (p *Init) runtimeError(rErr error, msg string) error {
	if rErr == nil {
		return nil
	}

	rMsg, err := getLastRuntimeError(p.runtime)
	switch {
	case err != nil:
		return errors.Wrapf(rErr, "%s: %s (%s)", msg, "unable to retrieve OCI runtime error", err.Error())
	case rMsg == "":
		return errors.Wrap(rErr, msg)
	default:
		return errors.Errorf("%s: %s", msg, rMsg)
	}
}

func withConditionalIO(c proc.Stdio) runc.IOOpt {
	return func(o *runc.IOOption) {
		o.OpenStdin = c.Stdin != ""
		o.OpenStdout = c.Stdout != ""
		o.OpenStderr = c.Stderr != ""
	}
}