Retry with backoff when the connection to containerd is lost.

Signed-off-by: Lantao Liu <lantaol@google.com>
This commit is contained in:
Lantao Liu
2017-05-31 21:18:22 +00:00
parent 6e27320f40
commit 0179d0fbaf
7 changed files with 273 additions and 25 deletions

View File

@@ -17,47 +17,65 @@ limitations under the License.
package server
import (
"github.com/golang/glog"
"golang.org/x/net/context"
"time"
"github.com/containerd/containerd/api/services/execution"
"github.com/containerd/containerd/api/types/container"
"github.com/golang/glog"
"github.com/jpillora/backoff"
"golang.org/x/net/context"
"github.com/kubernetes-incubator/cri-containerd/pkg/metadata"
)
const (
// minRetryInterval is the minimum interval to wait before retrying after the
// connection to containerd is lost.
minRetryInterval = 100 * time.Millisecond
// maxRetryInterval is the maximum interval to wait before retrying after the
// connection to containerd is lost.
maxRetryInterval = 30 * time.Second
// exponentialFactor is the factor used for the exponential backoff between retries.
exponentialFactor = 2.0
)
// startEventMonitor starts an event monitor which monitors and handles all
// container events.
// TODO(random-liu): [P1] Figure out:
// 1) Is it possible for events to be dropped while containerd is running?
// 2) How to deal with containerd being down? We should restart the event
// monitor, and we should recover all container state.
func (c *criContainerdService) startEventMonitor() error {
events, err := c.containerService.Events(context.Background(), &execution.EventsRequest{})
if err != nil {
return err
// TODO(random-liu): [P1] Is it possible for events to be dropped while containerd is running?
func (c *criContainerdService) startEventMonitor() {
// Backoff state for retries, configured from minRetryInterval,
// maxRetryInterval and exponentialFactor.
b := backoff.Backoff{
Min: minRetryInterval,
Max: maxRetryInterval,
Factor: exponentialFactor,
}
go func() {
for {
c.handleEventStream(events)
events, err := c.containerService.Events(context.Background(), &execution.EventsRequest{})
if err != nil {
glog.Errorf("Failed to connect to containerd event stream: %v", err)
// Wait for the next backoff interval before trying to reconnect.
time.Sleep(b.Duration())
continue
}
// Successfully connected to containerd; reset the backoff.
b.Reset()
// TODO(random-liu): Relist to recover state, should prevent other operations
// until state is fully recovered.
// NOTE(review): handleEventStream calls Recv once per invocation, so this
// loop appears to request a new event stream from containerd for every
// event; confirm whether the stream should be reused instead.
// NOTE(review): the backoff was reset just above, so the sleep on this
// error path presumably always uses the minimum interval; confirm.
if err := c.handleEventStream(events); err != nil {
glog.Errorf("Failed to handle event stream: %v", err)
time.Sleep(b.Duration())
continue
}
}
}()
return nil
}
// handleEventStream receives a single event from the containerd event stream
// via Recv and passes it to handleEvent.
func (c *criContainerdService) handleEventStream(events execution.ContainerService_EventsClient) {
// TODO(random-liu): [P1] Should back off on this error, or else this will
// cause a busy loop.
// TODO(random-liu): Handle io.EOF.
func (c *criContainerdService) handleEventStream(events execution.ContainerService_EventsClient) error {
e, err := events.Recv()
if err != nil {
glog.Errorf("Failed to receive event: %v", err)
return
return err
}
glog.V(2).Infof("Received container event: %+v", e)
c.handleEvent(e)
return
return nil
}
// handleEvent handles a containerd event.

View File

@@ -54,7 +54,7 @@ import (
// CRIContainerdService is the interface that implements the CRI remote service server.
type CRIContainerdService interface {
Start() error
Start()
runtime.RuntimeServiceServer
runtime.ImageServiceServer
}
@@ -128,6 +128,6 @@ func NewCRIContainerdService(conn *grpc.ClientConn, rootDir, networkPluginBinDir
return c, nil
}
// Start starts the service by calling startEventMonitor.
func (c *criContainerdService) Start() error {
return c.startEventMonitor()
func (c *criContainerdService) Start() {
c.startEventMonitor()
}