
Currently, kubelet volume management is built on the concept of a desired state and an actual state of the world. The volume manager periodically compares the two worlds and performs volume mount/unmount and/or attach/detach operations. When kubelet restarts, the caches of those two worlds are lost. Although the desired world can be recovered through the apiserver, the actual world cannot be recovered, which may leave some volumes that cannot be cleaned up if their information has already been deleted from the apiserver. This change adds reconstruction of the actual world by reading the pod directories from disk. The reconstructed volume information is added to both the desired world and the actual world if it cannot be found in either world. The rest of the logic is the same as before: the desired world populator may clean up the volume entry if it is no longer in the apiserver, and the volume manager should then invoke unmount to clean it up.
308 lines
9.4 KiB
Go
308 lines
9.4 KiB
Go
/*
|
|
Copyright 2016 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
/*
|
|
Package nestedpendingoperations is a modified implementation of
|
|
pkg/util/goroutinemap. It implements a data structure for managing go routines
|
|
by volume/pod name. It prevents the creation of new go routines if an existing
|
|
go routine for the volume already exists. It also allows multiple operations to
|
|
execute in parallel for the same volume as long as they are operating on
|
|
different pods.
|
|
*/
|
|
package nestedpendingoperations
|
|
|
|
import (
|
|
"fmt"
|
|
"sync"
|
|
|
|
"github.com/golang/glog"
|
|
"k8s.io/kubernetes/pkg/api"
|
|
"k8s.io/kubernetes/pkg/util/goroutinemap/exponentialbackoff"
|
|
k8sRuntime "k8s.io/kubernetes/pkg/util/runtime"
|
|
"k8s.io/kubernetes/pkg/volume/util/types"
|
|
)
|
|
|
|
const (
|
|
// emptyUniquePodName is a UniquePodName for empty string.
|
|
emptyUniquePodName types.UniquePodName = types.UniquePodName("")
|
|
)
|
|
|
|
// NestedPendingOperations defines the supported set of operations.
|
|
type NestedPendingOperations interface {
|
|
// Run adds the concatenation of volumeName and podName to the list of
|
|
// running operations and spawns a new go routine to execute operationFunc.
|
|
// If an operation with the same volumeName and same or empty podName
|
|
// exists, an AlreadyExists or ExponentialBackoff error is returned.
|
|
// This enables multiple operations to execute in parallel for the same
|
|
// volumeName as long as they have different podName.
|
|
// Once the operation is complete, the go routine is terminated and the
|
|
// concatenation of volumeName and podName is removed from the list of
|
|
// executing operations allowing a new operation to be started with the
|
|
// volumeName without error.
|
|
Run(volumeName api.UniqueVolumeName, podName types.UniquePodName, operationFunc func() error) error
|
|
|
|
// Wait blocks until all operations are completed. This is typically
|
|
// necessary during tests - the test should wait until all operations finish
|
|
// and evaluate results after that.
|
|
Wait()
|
|
|
|
// IsOperationPending returns true if an operation for the given volumeName and podName is pending,
|
|
// otherwise it returns false
|
|
IsOperationPending(volumeName api.UniqueVolumeName, podName types.UniquePodName) bool
|
|
}
|
|
|
|
// NewNestedPendingOperations returns a new instance of NestedPendingOperations.
|
|
func NewNestedPendingOperations(exponentialBackOffOnError bool) NestedPendingOperations {
|
|
g := &nestedPendingOperations{
|
|
operations: []operation{},
|
|
exponentialBackOffOnError: exponentialBackOffOnError,
|
|
}
|
|
g.cond = sync.NewCond(&g.lock)
|
|
return g
|
|
}
|
|
|
|
type nestedPendingOperations struct {
|
|
operations []operation
|
|
exponentialBackOffOnError bool
|
|
cond *sync.Cond
|
|
lock sync.RWMutex
|
|
}
|
|
|
|
type operation struct {
|
|
volumeName api.UniqueVolumeName
|
|
podName types.UniquePodName
|
|
operationPending bool
|
|
expBackoff exponentialbackoff.ExponentialBackoff
|
|
}
|
|
|
|
func (grm *nestedPendingOperations) Run(
|
|
volumeName api.UniqueVolumeName,
|
|
podName types.UniquePodName,
|
|
operationFunc func() error) error {
|
|
grm.lock.Lock()
|
|
defer grm.lock.Unlock()
|
|
opExists, previousOpIndex := grm.isOperationExists(volumeName, podName)
|
|
if opExists {
|
|
previousOp := grm.operations[previousOpIndex]
|
|
// Operation already exists
|
|
if previousOp.operationPending {
|
|
// Operation is pending
|
|
operationName := getOperationName(volumeName, podName)
|
|
return NewAlreadyExistsError(operationName)
|
|
}
|
|
|
|
operationName := getOperationName(volumeName, podName)
|
|
if err := previousOp.expBackoff.SafeToRetry(operationName); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Update existing operation to mark as pending.
|
|
grm.operations[previousOpIndex].operationPending = true
|
|
grm.operations[previousOpIndex].volumeName = volumeName
|
|
grm.operations[previousOpIndex].podName = podName
|
|
} else {
|
|
// Create a new operation
|
|
grm.operations = append(grm.operations,
|
|
operation{
|
|
operationPending: true,
|
|
volumeName: volumeName,
|
|
podName: podName,
|
|
expBackoff: exponentialbackoff.ExponentialBackoff{},
|
|
})
|
|
}
|
|
|
|
go func() (err error) {
|
|
// Handle unhandled panics (very unlikely)
|
|
defer k8sRuntime.HandleCrash()
|
|
// Handle completion of and error, if any, from operationFunc()
|
|
defer grm.operationComplete(volumeName, podName, &err)
|
|
// Handle panic, if any, from operationFunc()
|
|
defer k8sRuntime.RecoverFromPanic(&err)
|
|
return operationFunc()
|
|
}()
|
|
|
|
return nil
|
|
}
|
|
|
|
func (grm *nestedPendingOperations) IsOperationPending(
|
|
volumeName api.UniqueVolumeName,
|
|
podName types.UniquePodName) bool {
|
|
|
|
grm.lock.RLock()
|
|
defer grm.lock.RUnlock()
|
|
|
|
exist, previousOpIndex := grm.isOperationExists(volumeName, podName)
|
|
if exist && grm.operations[previousOpIndex].operationPending {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (grm *nestedPendingOperations) isOperationExists(
|
|
volumeName api.UniqueVolumeName,
|
|
podName types.UniquePodName) (bool, int) {
|
|
|
|
for previousOpIndex, previousOp := range grm.operations {
|
|
if previousOp.volumeName != volumeName {
|
|
// No match, keep searching
|
|
continue
|
|
}
|
|
|
|
if previousOp.podName != emptyUniquePodName &&
|
|
podName != emptyUniquePodName &&
|
|
previousOp.podName != podName {
|
|
// No match, keep searching
|
|
continue
|
|
}
|
|
|
|
// Match
|
|
return true, previousOpIndex
|
|
}
|
|
return false, -1
|
|
}
|
|
|
|
func (grm *nestedPendingOperations) getOperation(
|
|
volumeName api.UniqueVolumeName,
|
|
podName types.UniquePodName) (uint, error) {
|
|
// Assumes lock has been acquired by caller.
|
|
|
|
for i, op := range grm.operations {
|
|
if op.volumeName == volumeName &&
|
|
op.podName == podName {
|
|
return uint(i), nil
|
|
}
|
|
}
|
|
|
|
logOperationName := getOperationName(volumeName, podName)
|
|
return 0, fmt.Errorf("Operation %q not found.", logOperationName)
|
|
}
|
|
|
|
func (grm *nestedPendingOperations) deleteOperation(
|
|
// Assumes lock has been acquired by caller.
|
|
volumeName api.UniqueVolumeName,
|
|
podName types.UniquePodName) {
|
|
|
|
opIndex := -1
|
|
for i, op := range grm.operations {
|
|
if op.volumeName == volumeName &&
|
|
op.podName == podName {
|
|
opIndex = i
|
|
break
|
|
}
|
|
}
|
|
|
|
// Delete index without preserving order
|
|
grm.operations[opIndex] = grm.operations[len(grm.operations)-1]
|
|
grm.operations = grm.operations[:len(grm.operations)-1]
|
|
}
|
|
|
|
func (grm *nestedPendingOperations) operationComplete(
|
|
volumeName api.UniqueVolumeName, podName types.UniquePodName, err *error) {
|
|
// Defer operations are executed in Last-In is First-Out order. In this case
|
|
// the lock is acquired first when operationCompletes begins, and is
|
|
// released when the method finishes, after the lock is released cond is
|
|
// signaled to wake waiting goroutine.
|
|
defer grm.cond.Signal()
|
|
grm.lock.Lock()
|
|
defer grm.lock.Unlock()
|
|
|
|
if *err == nil || !grm.exponentialBackOffOnError {
|
|
// Operation completed without error, or exponentialBackOffOnError disabled
|
|
grm.deleteOperation(volumeName, podName)
|
|
if *err != nil {
|
|
// Log error
|
|
logOperationName := getOperationName(volumeName, podName)
|
|
glog.Errorf("operation %s failed with: %v",
|
|
logOperationName,
|
|
*err)
|
|
}
|
|
return
|
|
}
|
|
|
|
// Operation completed with error and exponentialBackOffOnError Enabled
|
|
existingOpIndex, getOpErr := grm.getOperation(volumeName, podName)
|
|
if getOpErr != nil {
|
|
// Failed to find existing operation
|
|
logOperationName := getOperationName(volumeName, podName)
|
|
glog.Errorf("Operation %s completed. error: %v. exponentialBackOffOnError is enabled, but failed to get operation to update.",
|
|
logOperationName,
|
|
*err)
|
|
return
|
|
}
|
|
|
|
grm.operations[existingOpIndex].expBackoff.Update(err)
|
|
grm.operations[existingOpIndex].operationPending = false
|
|
|
|
// Log error
|
|
operationName :=
|
|
getOperationName(volumeName, podName)
|
|
glog.Errorf("%v", grm.operations[existingOpIndex].expBackoff.
|
|
GenerateNoRetriesPermittedMsg(operationName))
|
|
}
|
|
|
|
func (grm *nestedPendingOperations) Wait() {
|
|
grm.lock.Lock()
|
|
defer grm.lock.Unlock()
|
|
|
|
for len(grm.operations) > 0 {
|
|
grm.cond.Wait()
|
|
}
|
|
}
|
|
|
|
func getOperationName(
|
|
volumeName api.UniqueVolumeName, podName types.UniquePodName) string {
|
|
podNameStr := ""
|
|
if podName != emptyUniquePodName {
|
|
podNameStr = fmt.Sprintf(" (%q)", podName)
|
|
}
|
|
|
|
return fmt.Sprintf("%q%s",
|
|
volumeName,
|
|
podNameStr)
|
|
}
|
|
|
|
// NewAlreadyExistsError returns a new instance of AlreadyExists error.
|
|
func NewAlreadyExistsError(operationName string) error {
|
|
return alreadyExistsError{operationName}
|
|
}
|
|
|
|
// IsAlreadyExists returns true if an error returned from
|
|
// NestedPendingOperations indicates a new operation can not be started because
|
|
// an operation with the same operation name is already executing.
|
|
func IsAlreadyExists(err error) bool {
|
|
switch err.(type) {
|
|
case alreadyExistsError:
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
// alreadyExistsError is returned by NestedPendingOperations when a new
// operation can not be started because an operation with the same operation
// name is already executing.
type alreadyExistsError struct {
	// operationName is the name of the conflicting operation.
	operationName string
}

// Compile-time check that alreadyExistsError satisfies the error interface.
var _ error = alreadyExistsError{}

// Error returns the message naming the operation that is already executing.
func (e alreadyExistsError) Error() string {
	const format = "Failed to create operation with name %q. An operation with that name is already executing."
	return fmt.Sprintf(format, e.operationName)
}
|