containerd/pkg/oom/v2/v2.go
Jeremi Piotrowski 7275411ec8 cgroup2: monitor OOMKill instead of OOM to prevent missing container OOM events
With the cgroupv2 configuration employed by Kubernetes, the pod cgroup (slice)
and container cgroup (scope) will both have the same memory limit applied. In
that situation, the kernel will consider an OOM event to be triggered by the
parent cgroup (slice), and increment 'oom' there. The child cgroup (scope) only
sees an oom_kill increment. Since we monitor child cgroups for oom events,
check the OOMKill field so that we don't miss events.
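
For illustration only (the counter values are made up), the container scope's
memory.events after such a kill would look roughly like:

    oom 0
    oom_kill 1

while the matching 'oom' increment lands in the parent pod slice's
memory.events instead.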

This is not visible when running containers through docker or ctr, because they
set the limits differently (only at the container level). An alternative would be
to not configure limits at the pod level - that way the container limit is hit
and the OOM is generated correctly. An interesting consequence is that when
spawning a pod with multiple containers, the OOM events also work correctly,
because:

a) if one of the containers has no limit, the pod has no limit either, so OOM
   events in another container are reported correctly.
b) if all of the containers have limits, then the pod limit is the sum of the
   container limits, so a container will hit its own limit first (see the
   sketch below).

Signed-off-by: Jeremi Piotrowski <jpiotrowski@microsoft.com>
2022-02-03 13:39:16 +01:00

//go:build linux
// +build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
	"context"

	cgroupsv2 "github.com/containerd/cgroups/v2"
	eventstypes "github.com/containerd/containerd/api/events"
	"github.com/containerd/containerd/pkg/oom"
	"github.com/containerd/containerd/runtime"
	"github.com/containerd/containerd/runtime/v2/shim"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
)

// New returns an implementation that listens to OOM events
// from a container's cgroups.
func New(publisher shim.Publisher) (oom.Watcher, error) {
	return &watcher{
		itemCh:    make(chan item),
		publisher: publisher,
	}, nil
}

// watcher implementation for handling OOM events from a container's cgroup
type watcher struct {
	itemCh    chan item
	publisher shim.Publisher
}

type item struct {
	id  string
	ev  cgroupsv2.Event
	err error
}

// Close closes the watcher
func (w *watcher) Close() error {
	return nil
}

// Run the loop
func (w *watcher) Run(ctx context.Context) {
	lastOOMMap := make(map[string]uint64) // key: id, value: ev.OOMKill
	for {
		select {
		case <-ctx.Done():
			w.Close()
			return
		case i := <-w.itemCh:
			if i.err != nil {
				delete(lastOOMMap, i.id)
				continue
			}
			// Compare OOMKill rather than OOM: when pod- and container-level limits
			// match (as with Kubernetes on cgroup v2), the kernel attributes the 'oom'
			// event to the parent cgroup, and the container cgroup only sees its
			// 'oom_kill' counter increase.
			lastOOM := lastOOMMap[i.id]
			if i.ev.OOMKill > lastOOM {
				if err := w.publisher.Publish(ctx, runtime.TaskOOMEventTopic, &eventstypes.TaskOOM{
					ContainerID: i.id,
				}); err != nil {
					logrus.WithError(err).Error("publish OOM event")
				}
			}
			if i.ev.OOMKill > 0 {
				lastOOMMap[i.id] = i.ev.OOMKill
			}
		}
	}
}

// Add cgroups.Cgroup to the epoll monitor
func (w *watcher) Add(id string, cgx interface{}) error {
	cg, ok := cgx.(*cgroupsv2.Manager)
	if !ok {
		return errors.Errorf("expected *cgroupsv2.Manager, got: %T", cgx)
	}
	// FIXME: cgroupsv2.Manager does not currently support closing the eventCh routine.
	// The routine shuts down when an error happens, mostly when the cgroup is deleted.
	eventCh, errCh := cg.EventChan()
	go func() {
		for {
			i := item{id: id}
			select {
			case ev := <-eventCh:
				i.ev = ev
				w.itemCh <- i
			case err := <-errCh:
				i.err = err
				w.itemCh <- i
				// no further events or errors arrive once an error has been received
				logrus.WithError(err).Warn("error from *cgroupsv2.Manager.EventChan")
				return
			}
		}
	}()
	return nil
}