Support unknown state for sandbox and container
Signed-off-by: Lantao Liu <lantaol@google.com>
This commit is contained in:
@@ -179,140 +179,130 @@ func (c *criService) loadContainer(ctx context.Context, cntr containerd.Containe
|
||||
status = unknownContainerStatus()
|
||||
}
|
||||
|
||||
// Load up-to-date status from containerd.
|
||||
var containerIO *cio.ContainerIO
|
||||
t, err := cntr.Task(ctx, func(fifos *containerdio.FIFOSet) (_ containerdio.IO, err error) {
|
||||
stdoutWC, stderrWC, err := c.createContainerLoggers(meta.LogPath, meta.Config.GetTty())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer func() {
|
||||
err = func() error {
|
||||
// Load up-to-date status from containerd.
|
||||
t, err := cntr.Task(ctx, func(fifos *containerdio.FIFOSet) (_ containerdio.IO, err error) {
|
||||
stdoutWC, stderrWC, err := c.createContainerLoggers(meta.LogPath, meta.Config.GetTty())
|
||||
if err != nil {
|
||||
if stdoutWC != nil {
|
||||
stdoutWC.Close()
|
||||
}
|
||||
if stderrWC != nil {
|
||||
stderrWC.Close()
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
}()
|
||||
containerIO, err = cio.NewContainerIO(id,
|
||||
cio.WithFIFOs(fifos),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
containerIO.AddOutput("log", stdoutWC, stderrWC)
|
||||
containerIO.Pipe()
|
||||
return containerIO, nil
|
||||
})
|
||||
if err != nil && !errdefs.IsNotFound(err) {
|
||||
return container, errors.Wrap(err, "failed to load task")
|
||||
}
|
||||
var s containerd.Status
|
||||
var notFound bool
|
||||
if errdefs.IsNotFound(err) {
|
||||
// Task is not found.
|
||||
notFound = true
|
||||
} else {
|
||||
// Task is found. Get task status.
|
||||
s, err = t.Status(ctx)
|
||||
if err != nil {
|
||||
// It's still possible that task is deleted during this window.
|
||||
if !errdefs.IsNotFound(err) {
|
||||
return container, errors.Wrap(err, "failed to get task status")
|
||||
}
|
||||
notFound = true
|
||||
}
|
||||
}
|
||||
if notFound {
|
||||
// Task is not created or has been deleted, use the checkpointed status
|
||||
// to generate container status.
|
||||
switch status.State() {
|
||||
case runtime.ContainerState_CONTAINER_CREATED:
|
||||
// NOTE: Another possibility is that we've tried to start the container, but
|
||||
// containerd got restarted during that. In that case, we still
|
||||
// treat the container as `CREATED`.
|
||||
defer func() {
|
||||
if err != nil {
|
||||
if stdoutWC != nil {
|
||||
stdoutWC.Close()
|
||||
}
|
||||
if stderrWC != nil {
|
||||
stderrWC.Close()
|
||||
}
|
||||
}
|
||||
}()
|
||||
containerIO, err = cio.NewContainerIO(id,
|
||||
cio.WithNewFIFOs(volatileContainerDir, meta.Config.GetTty(), meta.Config.GetStdin()),
|
||||
cio.WithFIFOs(fifos),
|
||||
)
|
||||
if err != nil {
|
||||
return container, errors.Wrap(err, "failed to create container io")
|
||||
return nil, err
|
||||
}
|
||||
case runtime.ContainerState_CONTAINER_RUNNING:
|
||||
// Container was in running state, but its task has been deleted,
|
||||
// set unknown exited state. Container io is not needed in this case.
|
||||
status.FinishedAt = time.Now().UnixNano()
|
||||
status.ExitCode = unknownExitCode
|
||||
status.Reason = unknownExitReason
|
||||
default:
|
||||
// Container is in exited/unknown state, return the status as it is.
|
||||
containerIO.AddOutput("log", stdoutWC, stderrWC)
|
||||
containerIO.Pipe()
|
||||
return containerIO, nil
|
||||
})
|
||||
if err != nil && !errdefs.IsNotFound(err) {
|
||||
return errors.Wrap(err, "failed to load task")
|
||||
}
|
||||
} else {
|
||||
// Task status is found. Update container status based on the up-to-date task status.
|
||||
switch s.Status {
|
||||
case containerd.Created:
|
||||
// Task has been created, but not started yet. This could only happen if containerd
|
||||
// gets restarted during container start.
|
||||
// Container must be in `CREATED` state.
|
||||
if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
|
||||
return container, errors.Wrap(err, "failed to delete task")
|
||||
var s containerd.Status
|
||||
var notFound bool
|
||||
if errdefs.IsNotFound(err) {
|
||||
// Task is not found.
|
||||
notFound = true
|
||||
} else {
|
||||
// Task is found. Get task status.
|
||||
s, err = t.Status(ctx)
|
||||
if err != nil {
|
||||
// It's still possible that task is deleted during this window.
|
||||
if !errdefs.IsNotFound(err) {
|
||||
return errors.Wrap(err, "failed to get task status")
|
||||
}
|
||||
notFound = true
|
||||
}
|
||||
if status.State() != runtime.ContainerState_CONTAINER_CREATED {
|
||||
return container, errors.Errorf("unexpected container state for created task: %q", status.State())
|
||||
}
|
||||
case containerd.Running:
|
||||
// Task is running. Container must be in `RUNNING` state, based on our assuption that
|
||||
// "task should not be started when containerd is down".
|
||||
}
|
||||
if notFound {
|
||||
// Task is not created or has been deleted, use the checkpointed status
|
||||
// to generate container status.
|
||||
switch status.State() {
|
||||
case runtime.ContainerState_CONTAINER_EXITED:
|
||||
return container, errors.Errorf("unexpected container state for running task: %q", status.State())
|
||||
case runtime.ContainerState_CONTAINER_CREATED:
|
||||
// NOTE: Another possibility is that we've tried to start the container, but
|
||||
// containerd got restarted during that. In that case, we still
|
||||
// treat the container as `CREATED`.
|
||||
containerIO, err = cio.NewContainerIO(id,
|
||||
cio.WithNewFIFOs(volatileContainerDir, meta.Config.GetTty(), meta.Config.GetStdin()),
|
||||
)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "failed to create container io")
|
||||
}
|
||||
case runtime.ContainerState_CONTAINER_RUNNING:
|
||||
// Container was in running state, but its task has been deleted,
|
||||
// set unknown exited state. Container io is not needed in this case.
|
||||
status.FinishedAt = time.Now().UnixNano()
|
||||
status.ExitCode = unknownExitCode
|
||||
status.Reason = unknownExitReason
|
||||
default:
|
||||
// This may happen if containerd gets restarted after task is started, but
|
||||
// before status is checkpointed.
|
||||
status.StartedAt = time.Now().UnixNano()
|
||||
status.Pid = t.Pid()
|
||||
// Container is in exited/unknown state, return the status as it is.
|
||||
}
|
||||
case containerd.Stopped:
|
||||
// Task is stopped. Updata status and delete the task.
|
||||
if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
|
||||
return container, errors.Wrap(err, "failed to delete task")
|
||||
} else {
|
||||
// Task status is found. Update container status based on the up-to-date task status.
|
||||
switch s.Status {
|
||||
case containerd.Created:
|
||||
// Task has been created, but not started yet. This could only happen if containerd
|
||||
// gets restarted during container start.
|
||||
// Container must be in `CREATED` state.
|
||||
if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
|
||||
return errors.Wrap(err, "failed to delete task")
|
||||
}
|
||||
if status.State() != runtime.ContainerState_CONTAINER_CREATED {
|
||||
return errors.Errorf("unexpected container state for created task: %q", status.State())
|
||||
}
|
||||
case containerd.Running:
|
||||
// Task is running. Container must be in `RUNNING` state, based on our assuption that
|
||||
// "task should not be started when containerd is down".
|
||||
switch status.State() {
|
||||
case runtime.ContainerState_CONTAINER_EXITED:
|
||||
return errors.Errorf("unexpected container state for running task: %q", status.State())
|
||||
case runtime.ContainerState_CONTAINER_RUNNING:
|
||||
default:
|
||||
// This may happen if containerd gets restarted after task is started, but
|
||||
// before status is checkpointed.
|
||||
status.StartedAt = time.Now().UnixNano()
|
||||
status.Pid = t.Pid()
|
||||
}
|
||||
case containerd.Stopped:
|
||||
// Task is stopped. Updata status and delete the task.
|
||||
if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
|
||||
return errors.Wrap(err, "failed to delete task")
|
||||
}
|
||||
status.FinishedAt = s.ExitTime.UnixNano()
|
||||
status.ExitCode = int32(s.ExitStatus)
|
||||
default:
|
||||
return errors.Errorf("unexpected task status %q", s.Status)
|
||||
}
|
||||
status.FinishedAt = s.ExitTime.UnixNano()
|
||||
status.ExitCode = int32(s.ExitStatus)
|
||||
default:
|
||||
return container, errors.Errorf("unexpected task status %q", s.Status)
|
||||
}
|
||||
return nil
|
||||
}()
|
||||
if err != nil {
|
||||
logrus.WithError(err).Errorf("Failed to load container status for %q", id)
|
||||
status = unknownContainerStatus()
|
||||
}
|
||||
opts := []containerstore.Opts{
|
||||
containerstore.WithStatus(status, containerDir),
|
||||
containerstore.WithContainer(cntr),
|
||||
}
|
||||
// containerIO could be nil for container in unknown state.
|
||||
if containerIO != nil {
|
||||
opts = append(opts, containerstore.WithContainerIO(containerIO))
|
||||
}
|
||||
return containerstore.NewContainer(*meta, opts...)
|
||||
}
|
||||
|
||||
const (
|
||||
// unknownExitCode is the exit code when exit reason is unknown.
|
||||
unknownExitCode = 255
|
||||
// unknownExitReason is the exit reason when exit reason is unknown.
|
||||
unknownExitReason = "Unknown"
|
||||
)
|
||||
|
||||
// unknownContainerStatus returns the default container status when its status is unknown.
|
||||
func unknownContainerStatus() containerstore.Status {
|
||||
return containerstore.Status{
|
||||
CreatedAt: 0,
|
||||
StartedAt: 0,
|
||||
FinishedAt: 0,
|
||||
ExitCode: unknownExitCode,
|
||||
Reason: unknownExitReason,
|
||||
}
|
||||
}
|
||||
|
||||
// loadSandbox loads sandbox from containerd.
|
||||
func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout)
|
||||
@@ -333,61 +323,59 @@ func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.S
|
||||
}
|
||||
meta := data.(*sandboxstore.Metadata)
|
||||
|
||||
// Load sandbox created timestamp.
|
||||
info, err := cntr.Info(ctx)
|
||||
if err != nil {
|
||||
return sandbox, errors.Wrap(err, "failed to get sandbox container info")
|
||||
}
|
||||
createdAt := info.CreatedAt
|
||||
|
||||
// Load sandbox status.
|
||||
t, err := cntr.Task(ctx, nil)
|
||||
if err != nil && !errdefs.IsNotFound(err) {
|
||||
return sandbox, errors.Wrap(err, "failed to load task")
|
||||
}
|
||||
var s containerd.Status
|
||||
var notFound bool
|
||||
if errdefs.IsNotFound(err) {
|
||||
// Task is not found.
|
||||
notFound = true
|
||||
} else {
|
||||
// Task is found. Get task status.
|
||||
s, err = t.Status(ctx)
|
||||
s, err := func() (sandboxstore.Status, error) {
|
||||
status := unknownSandboxStatus()
|
||||
// Load sandbox created timestamp.
|
||||
info, err := cntr.Info(ctx)
|
||||
if err != nil {
|
||||
// It's still possible that task is deleted during this window.
|
||||
if !errdefs.IsNotFound(err) {
|
||||
return sandbox, errors.Wrap(err, "failed to get task status")
|
||||
}
|
||||
return status, errors.Wrap(err, "failed to get sandbox container info")
|
||||
}
|
||||
status.CreatedAt = info.CreatedAt
|
||||
|
||||
// Load sandbox state.
|
||||
t, err := cntr.Task(ctx, nil)
|
||||
if err != nil && !errdefs.IsNotFound(err) {
|
||||
return status, errors.Wrap(err, "failed to load task")
|
||||
}
|
||||
var taskStatus containerd.Status
|
||||
var notFound bool
|
||||
if errdefs.IsNotFound(err) {
|
||||
// Task is not found.
|
||||
notFound = true
|
||||
}
|
||||
}
|
||||
var state sandboxstore.State
|
||||
var pid uint32
|
||||
if notFound {
|
||||
// Task does not exist, set sandbox state as NOTREADY.
|
||||
state = sandboxstore.StateNotReady
|
||||
} else {
|
||||
if s.Status == containerd.Running {
|
||||
// Task is running, set sandbox state as READY.
|
||||
state = sandboxstore.StateReady
|
||||
pid = t.Pid()
|
||||
} else {
|
||||
// Task is not running. Delete the task and set sandbox state as NOTREADY.
|
||||
if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
|
||||
return sandbox, errors.Wrap(err, "failed to delete task")
|
||||
// Task is found. Get task status.
|
||||
taskStatus, err = t.Status(ctx)
|
||||
if err != nil {
|
||||
// It's still possible that task is deleted during this window.
|
||||
if !errdefs.IsNotFound(err) {
|
||||
return status, errors.Wrap(err, "failed to get task status")
|
||||
}
|
||||
notFound = true
|
||||
}
|
||||
state = sandboxstore.StateNotReady
|
||||
}
|
||||
if notFound {
|
||||
// Task does not exist, set sandbox state as NOTREADY.
|
||||
status.State = sandboxstore.StateNotReady
|
||||
} else {
|
||||
if taskStatus.Status == containerd.Running {
|
||||
// Task is running, set sandbox state as READY.
|
||||
status.State = sandboxstore.StateReady
|
||||
status.Pid = t.Pid()
|
||||
} else {
|
||||
// Task is not running. Delete the task and set sandbox state as NOTREADY.
|
||||
if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
|
||||
return status, errors.Wrap(err, "failed to delete task")
|
||||
}
|
||||
status.State = sandboxstore.StateNotReady
|
||||
}
|
||||
}
|
||||
return status, nil
|
||||
}()
|
||||
if err != nil {
|
||||
logrus.WithError(err).Errorf("Failed to load sandbox status for %q", cntr.ID())
|
||||
}
|
||||
|
||||
sandbox = sandboxstore.NewSandbox(
|
||||
*meta,
|
||||
sandboxstore.Status{
|
||||
Pid: pid,
|
||||
CreatedAt: createdAt,
|
||||
State: state,
|
||||
},
|
||||
)
|
||||
sandbox = sandboxstore.NewSandbox(*meta, s)
|
||||
sandbox.Container = cntr
|
||||
|
||||
// Load network namespace.
|
||||
|
||||
Reference in New Issue
Block a user