consolidate node deletion logic between node lifecycle and cloud node controller

This commit is contained in:
andrewsykim
2018-10-28 21:57:23 -04:00
parent 6be4f1bbf3
commit 5329f09663
18 changed files with 750 additions and 702 deletions

View File

@@ -37,9 +37,6 @@ import (
"k8s.io/client-go/tools/record"
clientretry "k8s.io/client-go/util/retry"
cloudprovider "k8s.io/cloud-provider"
nodeutilv1 "k8s.io/kubernetes/pkg/api/v1/node"
"k8s.io/kubernetes/pkg/controller"
nodectrlutil "k8s.io/kubernetes/pkg/controller/util/node"
kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
nodeutil "k8s.io/kubernetes/pkg/util/node"
@@ -58,11 +55,6 @@ type CloudNodeController struct {
cloud cloudprovider.Interface
// Value controlling NodeController monitoring period, i.e. how often does NodeController
// check node status posted from kubelet. This value should be lower than nodeMonitorGracePeriod
// set in controller-manager
nodeMonitorPeriod time.Duration
nodeStatusUpdateFrequency time.Duration
}
@@ -79,7 +71,6 @@ func NewCloudNodeController(
nodeInformer coreinformers.NodeInformer,
kubeClient clientset.Interface,
cloud cloudprovider.Interface,
nodeMonitorPeriod time.Duration,
nodeStatusUpdateFrequency time.Duration) *CloudNodeController {
eventBroadcaster := record.NewBroadcaster()
@@ -97,7 +88,6 @@ func NewCloudNodeController(
kubeClient: kubeClient,
recorder: recorder,
cloud: cloud,
nodeMonitorPeriod: nodeMonitorPeriod,
nodeStatusUpdateFrequency: nodeStatusUpdateFrequency,
}
@@ -111,8 +101,9 @@ func NewCloudNodeController(
return cnc
}
// This controller deletes a node if kubelet is not reporting
// and the node is gone from the cloud provider.
// This controller updates newly registered nodes with information
// from the cloud provider. This call is blocking so should be called
// via a goroutine
func (cnc *CloudNodeController) Run(stopCh <-chan struct{}) {
defer utilruntime.HandleCrash()
@@ -121,10 +112,7 @@ func (cnc *CloudNodeController) Run(stopCh <-chan struct{}) {
// very infrequently. DO NOT MODIFY this to perform frequent operations.
// Start a loop to periodically update the node addresses obtained from the cloud
go wait.Until(cnc.UpdateNodeStatus, cnc.nodeStatusUpdateFrequency, stopCh)
// Start a loop to periodically check if any nodes have been deleted from cloudprovider
go wait.Until(cnc.MonitorNode, cnc.nodeMonitorPeriod, stopCh)
wait.Until(cnc.UpdateNodeStatus, cnc.nodeStatusUpdateFrequency, stopCh)
}
// UpdateNodeStatus updates the node status, such as node addresses
@@ -210,108 +198,6 @@ func (cnc *CloudNodeController) updateNodeAddress(node *v1.Node, instances cloud
}
}
// Monitor node queries the cloudprovider for non-ready nodes and deletes them
// if they cannot be found in the cloud provider
func (cnc *CloudNodeController) MonitorNode() {
instances, ok := cnc.cloud.Instances()
if !ok {
utilruntime.HandleError(fmt.Errorf("failed to get instances from cloud provider"))
return
}
nodes, err := cnc.kubeClient.CoreV1().Nodes().List(metav1.ListOptions{ResourceVersion: "0"})
if err != nil {
klog.Errorf("Error monitoring node status: %v", err)
return
}
for i := range nodes.Items {
var currentReadyCondition *v1.NodeCondition
node := &nodes.Items[i]
// Try to get the current node status
// If node status is empty, then kubelet has not posted ready status yet. In this case, process next node
for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
_, currentReadyCondition = nodeutilv1.GetNodeCondition(&node.Status, v1.NodeReady)
if currentReadyCondition != nil {
break
}
name := node.Name
node, err = cnc.kubeClient.CoreV1().Nodes().Get(name, metav1.GetOptions{})
if err != nil {
klog.Errorf("Failed while getting a Node to retry updating NodeStatus. Probably Node %s was deleted.", name)
break
}
time.Sleep(retrySleepTime)
}
if currentReadyCondition == nil {
klog.Errorf("Update status of Node %v from CloudNodeController exceeds retry count or the Node was deleted.", node.Name)
continue
}
// If the known node status says that Node is NotReady, then check if the node has been removed
// from the cloud provider. If node cannot be found in cloudprovider, then delete the node immediately
if currentReadyCondition != nil {
if currentReadyCondition.Status != v1.ConditionTrue {
// we need to check this first to get taint working in similar in all cloudproviders
// current problem is that shutdown nodes are not working in similar way ie. all cloudproviders
// does not delete node from kubernetes cluster when instance it is shutdown see issue #46442
shutdown, err := nodectrlutil.ShutdownInCloudProvider(context.TODO(), cnc.cloud, node)
if err != nil {
klog.Errorf("Error checking if node %s is shutdown: %v", node.Name, err)
}
if shutdown && err == nil {
// if node is shutdown add shutdown taint
err = controller.AddOrUpdateTaintOnNode(cnc.kubeClient, node.Name, controller.ShutdownTaint)
if err != nil {
klog.Errorf("Error patching node taints: %v", err)
}
// Continue checking the remaining nodes since the current one is shutdown.
continue
}
// Check with the cloud provider to see if the node still exists. If it
// doesn't, delete the node immediately.
exists, err := ensureNodeExistsByProviderID(instances, node)
if err != nil {
klog.Errorf("Error checking if node %s exists: %v", node.Name, err)
continue
}
if exists {
// Continue checking the remaining nodes since the current one is fine.
continue
}
klog.V(2).Infof("Deleting node since it is no longer present in cloud provider: %s", node.Name)
ref := &v1.ObjectReference{
Kind: "Node",
Name: node.Name,
UID: types.UID(node.UID),
Namespace: "",
}
klog.V(2).Infof("Recording %s event message for node %s", "DeletingNode", node.Name)
cnc.recorder.Eventf(ref, v1.EventTypeNormal, fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name), "Node %s event: %s", node.Name, "DeletingNode")
go func(nodeName string) {
defer utilruntime.HandleCrash()
if err := cnc.kubeClient.CoreV1().Nodes().Delete(nodeName, nil); err != nil {
klog.Errorf("unable to delete node %q: %v", nodeName, err)
}
}(node.Name)
} else {
// if taint exist remove taint
err = controller.RemoveTaintOffNode(cnc.kubeClient, node.Name, node, controller.ShutdownTaint)
if err != nil {
klog.Errorf("Error patching node taints: %v", err)
}
}
}
}
}
func (cnc *CloudNodeController) UpdateCloudNode(_, newObj interface{}) {
if _, ok := newObj.(*v1.Node); !ok {
utilruntime.HandleError(fmt.Errorf("unexpected object type: %v", newObj))