Fix process locking and state management
There were races with the way process states. This displayed in ways, especially around pausing the container for atomic operations. Users would get errors like, cannnot delete container in paused state and such. This can be eaisly reproduced with `docker` and the following command: ```bash > (for i in `seq 1 25`; do id=$(docker create alpine usleep 50000);docker start $id;docker commit $id;docker wait $id;docker rm $id; done) ``` This two issues that this fixes are: * locks must be held by the owning process, not the state operations. * If a container ends up being paused but before the operation completes, the process exists, make sure we resume the container before setting the the process as exited. Signed-off-by: Michael Crosby <crosbymichael@gmail.com>
This commit is contained in:
@@ -30,16 +30,20 @@ import (
|
||||
runc "github.com/containerd/go-runc"
|
||||
google_protobuf "github.com/gogo/protobuf/types"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
type initState interface {
|
||||
proc.State
|
||||
|
||||
Resize(console.WinSize) error
|
||||
Start(context.Context) error
|
||||
Delete(context.Context) error
|
||||
Pause(context.Context) error
|
||||
Resume(context.Context) error
|
||||
Update(context.Context, *google_protobuf.Any) error
|
||||
Checkpoint(context.Context, *CheckpointConfig) error
|
||||
Exec(context.Context, string, *ExecConfig) (proc.Process, error)
|
||||
Kill(context.Context, uint32, bool) error
|
||||
SetExited(int)
|
||||
}
|
||||
|
||||
type createdState struct {
|
||||
@@ -61,43 +65,26 @@ func (s *createdState) transition(name string) error {
|
||||
}
|
||||
|
||||
func (s *createdState) Pause(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot pause task in created state")
|
||||
}
|
||||
|
||||
func (s *createdState) Resume(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot resume task in created state")
|
||||
}
|
||||
|
||||
func (s *createdState) Update(context context.Context, r *google_protobuf.Any) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.update(context, r)
|
||||
func (s *createdState) Update(ctx context.Context, r *google_protobuf.Any) error {
|
||||
return s.p.update(ctx, r)
|
||||
}
|
||||
|
||||
func (s *createdState) Checkpoint(context context.Context, r *CheckpointConfig) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
func (s *createdState) Checkpoint(ctx context.Context, r *CheckpointConfig) error {
|
||||
return errors.Errorf("cannot checkpoint a task in created state")
|
||||
}
|
||||
|
||||
func (s *createdState) Resize(ws console.WinSize) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.resize(ws)
|
||||
}
|
||||
|
||||
func (s *createdState) Start(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
if err := s.p.start(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -105,8 +92,6 @@ func (s *createdState) Start(ctx context.Context) error {
|
||||
}
|
||||
|
||||
func (s *createdState) Delete(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
if err := s.p.delete(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -114,16 +99,10 @@ func (s *createdState) Delete(ctx context.Context) error {
|
||||
}
|
||||
|
||||
func (s *createdState) Kill(ctx context.Context, sig uint32, all bool) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.kill(ctx, sig, all)
|
||||
}
|
||||
|
||||
func (s *createdState) SetExited(status int) {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
s.p.setExited(status)
|
||||
|
||||
if err := s.transition("stopped"); err != nil {
|
||||
@@ -132,8 +111,6 @@ func (s *createdState) SetExited(status int) {
|
||||
}
|
||||
|
||||
func (s *createdState) Exec(ctx context.Context, path string, r *ExecConfig) (proc.Process, error) {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
return s.p.exec(ctx, path, r)
|
||||
}
|
||||
|
||||
@@ -157,43 +134,26 @@ func (s *createdCheckpointState) transition(name string) error {
|
||||
}
|
||||
|
||||
func (s *createdCheckpointState) Pause(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot pause task in created state")
|
||||
}
|
||||
|
||||
func (s *createdCheckpointState) Resume(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot resume task in created state")
|
||||
}
|
||||
|
||||
func (s *createdCheckpointState) Update(context context.Context, r *google_protobuf.Any) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.update(context, r)
|
||||
func (s *createdCheckpointState) Update(ctx context.Context, r *google_protobuf.Any) error {
|
||||
return s.p.update(ctx, r)
|
||||
}
|
||||
|
||||
func (s *createdCheckpointState) Checkpoint(context context.Context, r *CheckpointConfig) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
func (s *createdCheckpointState) Checkpoint(ctx context.Context, r *CheckpointConfig) error {
|
||||
return errors.Errorf("cannot checkpoint a task in created state")
|
||||
}
|
||||
|
||||
func (s *createdCheckpointState) Resize(ws console.WinSize) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.resize(ws)
|
||||
}
|
||||
|
||||
func (s *createdCheckpointState) Start(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
p := s.p
|
||||
sio := p.stdio
|
||||
|
||||
@@ -247,8 +207,6 @@ func (s *createdCheckpointState) Start(ctx context.Context) error {
|
||||
}
|
||||
|
||||
func (s *createdCheckpointState) Delete(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
if err := s.p.delete(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -256,16 +214,10 @@ func (s *createdCheckpointState) Delete(ctx context.Context) error {
|
||||
}
|
||||
|
||||
func (s *createdCheckpointState) Kill(ctx context.Context, sig uint32, all bool) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.kill(ctx, sig, all)
|
||||
}
|
||||
|
||||
func (s *createdCheckpointState) SetExited(status int) {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
s.p.setExited(status)
|
||||
|
||||
if err := s.transition("stopped"); err != nil {
|
||||
@@ -274,9 +226,6 @@ func (s *createdCheckpointState) SetExited(status int) {
|
||||
}
|
||||
|
||||
func (s *createdCheckpointState) Exec(ctx context.Context, path string, r *ExecConfig) (proc.Process, error) {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return nil, errors.Errorf("cannot exec in a created state")
|
||||
}
|
||||
|
||||
@@ -297,67 +246,42 @@ func (s *runningState) transition(name string) error {
|
||||
}
|
||||
|
||||
func (s *runningState) Pause(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
if err := s.p.pause(ctx); err != nil {
|
||||
return err
|
||||
if err := s.p.runtime.Pause(ctx, s.p.id); err != nil {
|
||||
return s.p.runtimeError(err, "OCI runtime pause failed")
|
||||
}
|
||||
|
||||
return s.transition("paused")
|
||||
}
|
||||
|
||||
func (s *runningState) Resume(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot resume a running process")
|
||||
}
|
||||
|
||||
func (s *runningState) Update(context context.Context, r *google_protobuf.Any) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.update(context, r)
|
||||
func (s *runningState) Update(ctx context.Context, r *google_protobuf.Any) error {
|
||||
return s.p.update(ctx, r)
|
||||
}
|
||||
|
||||
func (s *runningState) Checkpoint(ctx context.Context, r *CheckpointConfig) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.checkpoint(ctx, r)
|
||||
}
|
||||
|
||||
func (s *runningState) Resize(ws console.WinSize) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.resize(ws)
|
||||
}
|
||||
|
||||
func (s *runningState) Start(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot start a running process")
|
||||
}
|
||||
|
||||
func (s *runningState) Delete(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot delete a running process")
|
||||
}
|
||||
|
||||
func (s *runningState) Kill(ctx context.Context, sig uint32, all bool) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.kill(ctx, sig, all)
|
||||
}
|
||||
|
||||
func (s *runningState) SetExited(status int) {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
s.p.setExited(status)
|
||||
|
||||
if err := s.transition("stopped"); err != nil {
|
||||
@@ -366,8 +290,6 @@ func (s *runningState) SetExited(status int) {
|
||||
}
|
||||
|
||||
func (s *runningState) Exec(ctx context.Context, path string, r *ExecConfig) (proc.Process, error) {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
return s.p.exec(ctx, path, r)
|
||||
}
|
||||
|
||||
@@ -388,79 +310,54 @@ func (s *pausedState) transition(name string) error {
|
||||
}
|
||||
|
||||
func (s *pausedState) Pause(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot pause a paused container")
|
||||
}
|
||||
|
||||
func (s *pausedState) Resume(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
if err := s.p.resume(ctx); err != nil {
|
||||
return err
|
||||
if err := s.p.runtime.Resume(ctx, s.p.id); err != nil {
|
||||
return s.p.runtimeError(err, "OCI runtime resume failed")
|
||||
}
|
||||
|
||||
return s.transition("running")
|
||||
}
|
||||
|
||||
func (s *pausedState) Update(context context.Context, r *google_protobuf.Any) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.update(context, r)
|
||||
func (s *pausedState) Update(ctx context.Context, r *google_protobuf.Any) error {
|
||||
return s.p.update(ctx, r)
|
||||
}
|
||||
|
||||
func (s *pausedState) Checkpoint(ctx context.Context, r *CheckpointConfig) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.checkpoint(ctx, r)
|
||||
}
|
||||
|
||||
func (s *pausedState) Resize(ws console.WinSize) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.resize(ws)
|
||||
}
|
||||
|
||||
func (s *pausedState) Start(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot start a paused process")
|
||||
}
|
||||
|
||||
func (s *pausedState) Delete(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot delete a paused process")
|
||||
}
|
||||
|
||||
func (s *pausedState) Kill(ctx context.Context, sig uint32, all bool) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return s.p.kill(ctx, sig, all)
|
||||
}
|
||||
|
||||
func (s *pausedState) SetExited(status int) {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
s.p.setExited(status)
|
||||
|
||||
if err := s.p.runtime.Resume(context.Background(), s.p.id); err != nil {
|
||||
logrus.WithError(err).Error("resuming exited container from paused state")
|
||||
}
|
||||
|
||||
if err := s.transition("stopped"); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *pausedState) Exec(ctx context.Context, path string, r *ExecConfig) (proc.Process, error) {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return nil, errors.Errorf("cannot exec in a paused state")
|
||||
}
|
||||
|
||||
@@ -479,50 +376,30 @@ func (s *stoppedState) transition(name string) error {
|
||||
}
|
||||
|
||||
func (s *stoppedState) Pause(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot pause a stopped container")
|
||||
}
|
||||
|
||||
func (s *stoppedState) Resume(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot resume a stopped container")
|
||||
}
|
||||
|
||||
func (s *stoppedState) Update(context context.Context, r *google_protobuf.Any) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
func (s *stoppedState) Update(ctx context.Context, r *google_protobuf.Any) error {
|
||||
return errors.Errorf("cannot update a stopped container")
|
||||
}
|
||||
|
||||
func (s *stoppedState) Checkpoint(ctx context.Context, r *CheckpointConfig) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot checkpoint a stopped container")
|
||||
}
|
||||
|
||||
func (s *stoppedState) Resize(ws console.WinSize) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot resize a stopped container")
|
||||
}
|
||||
|
||||
func (s *stoppedState) Start(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return errors.Errorf("cannot start a stopped process")
|
||||
}
|
||||
|
||||
func (s *stoppedState) Delete(ctx context.Context) error {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
if err := s.p.delete(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -538,8 +415,5 @@ func (s *stoppedState) SetExited(status int) {
|
||||
}
|
||||
|
||||
func (s *stoppedState) Exec(ctx context.Context, path string, r *ExecConfig) (proc.Process, error) {
|
||||
s.p.mu.Lock()
|
||||
defer s.p.mu.Unlock()
|
||||
|
||||
return nil, errors.Errorf("cannot exec in a stopped state")
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user