Merge pull request #118480 from carlory/gc_metrics
podgc metrics should count all pod deletion behaviors
This commit is contained in:
		@@ -37,6 +37,7 @@ import (
 | 
			
		||||
	"k8s.io/client-go/tools/cache"
 | 
			
		||||
	"k8s.io/client-go/util/workqueue"
 | 
			
		||||
	"k8s.io/klog/v2"
 | 
			
		||||
	"k8s.io/kubernetes/pkg/controller/podgc/metrics"
 | 
			
		||||
	"k8s.io/kubernetes/pkg/features"
 | 
			
		||||
	"k8s.io/kubernetes/pkg/kubelet/eviction"
 | 
			
		||||
	nodeutil "k8s.io/kubernetes/pkg/util/node"
 | 
			
		||||
@@ -69,11 +70,6 @@ type PodGCController struct {
 | 
			
		||||
	quarantineTime         time.Duration
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func init() {
 | 
			
		||||
	// Register prometheus metrics
 | 
			
		||||
	RegisterMetrics()
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func NewPodGC(ctx context.Context, kubeClient clientset.Interface, podInformer coreinformers.PodInformer,
 | 
			
		||||
	nodeInformer coreinformers.NodeInformer, terminatedPodThreshold int) *PodGCController {
 | 
			
		||||
	return NewPodGCInternal(ctx, kubeClient, podInformer, nodeInformer, terminatedPodThreshold, gcCheckPeriod, quarantineTime)
 | 
			
		||||
@@ -94,6 +90,8 @@ func NewPodGCInternal(ctx context.Context, kubeClient clientset.Interface, podIn
 | 
			
		||||
		quarantineTime:         quarantineTime,
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// Register prometheus metrics
 | 
			
		||||
	metrics.RegisterMetrics()
 | 
			
		||||
	return gcc
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -182,11 +180,11 @@ func (gcc *PodGCController) gcTerminating(ctx context.Context, pods []*v1.Pod) {
 | 
			
		||||
		wait.Add(1)
 | 
			
		||||
		go func(pod *v1.Pod) {
 | 
			
		||||
			defer wait.Done()
 | 
			
		||||
			deletingPodsTotal.WithLabelValues().Inc()
 | 
			
		||||
			metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingOutOfService).Inc()
 | 
			
		||||
			if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil {
 | 
			
		||||
				// ignore not founds
 | 
			
		||||
				utilruntime.HandleError(err)
 | 
			
		||||
				deletingPodsErrorTotal.WithLabelValues().Inc()
 | 
			
		||||
				metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingOutOfService).Inc()
 | 
			
		||||
			}
 | 
			
		||||
		}(terminatingPods[i])
 | 
			
		||||
	}
 | 
			
		||||
@@ -220,7 +218,9 @@ func (gcc *PodGCController) gcTerminated(ctx context.Context, pods []*v1.Pod) {
 | 
			
		||||
			if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil {
 | 
			
		||||
				// ignore not founds
 | 
			
		||||
				defer utilruntime.HandleError(err)
 | 
			
		||||
				metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminated).Inc()
 | 
			
		||||
			}
 | 
			
		||||
			metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminated).Inc()
 | 
			
		||||
		}(terminatedPods[i])
 | 
			
		||||
	}
 | 
			
		||||
	wait.Wait()
 | 
			
		||||
@@ -259,9 +259,11 @@ func (gcc *PodGCController) gcOrphaned(ctx context.Context, pods []*v1.Pod, node
 | 
			
		||||
			WithLastTransitionTime(metav1.Now())
 | 
			
		||||
		if err := gcc.markFailedAndDeletePodWithCondition(ctx, pod, condition); err != nil {
 | 
			
		||||
			utilruntime.HandleError(err)
 | 
			
		||||
			metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonOrphaned).Inc()
 | 
			
		||||
		} else {
 | 
			
		||||
			logger.Info("Forced deletion of orphaned Pod succeeded", "pod", klog.KObj(pod))
 | 
			
		||||
		}
 | 
			
		||||
		metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonOrphaned).Inc()
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -309,9 +311,11 @@ func (gcc *PodGCController) gcUnscheduledTerminating(ctx context.Context, pods [
 | 
			
		||||
		logger.V(2).Info("Found unscheduled terminating Pod not assigned to any Node, deleting", "pod", klog.KObj(pod))
 | 
			
		||||
		if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil {
 | 
			
		||||
			utilruntime.HandleError(err)
 | 
			
		||||
			metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingUnscheduled).Inc()
 | 
			
		||||
		} else {
 | 
			
		||||
			logger.Info("Forced deletion of unscheduled terminating Pod succeeded", "pod", klog.KObj(pod))
 | 
			
		||||
		}
 | 
			
		||||
		metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingUnscheduled).Inc()
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -38,6 +38,7 @@ import (
 | 
			
		||||
	metricstestutil "k8s.io/component-base/metrics/testutil"
 | 
			
		||||
	"k8s.io/klog/v2/ktesting"
 | 
			
		||||
	"k8s.io/kubernetes/pkg/controller"
 | 
			
		||||
	"k8s.io/kubernetes/pkg/controller/podgc/metrics"
 | 
			
		||||
	"k8s.io/kubernetes/pkg/controller/testutil"
 | 
			
		||||
	"k8s.io/kubernetes/pkg/features"
 | 
			
		||||
	"k8s.io/kubernetes/pkg/kubelet/eviction"
 | 
			
		||||
@@ -160,7 +161,7 @@ func TestGCTerminated(t *testing.T) {
 | 
			
		||||
			for _, pod := range test.pods {
 | 
			
		||||
				creationTime = creationTime.Add(1 * time.Hour)
 | 
			
		||||
				pods = append(pods, &v1.Pod{
 | 
			
		||||
					ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime}},
 | 
			
		||||
					ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime}},
 | 
			
		||||
					Status:     v1.PodStatus{Phase: pod.phase, Reason: pod.reason},
 | 
			
		||||
					Spec:       v1.PodSpec{NodeName: "node"},
 | 
			
		||||
				})
 | 
			
		||||
@@ -176,12 +177,16 @@ func TestGCTerminated(t *testing.T) {
 | 
			
		||||
			verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
 | 
			
		||||
		})
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// testDeletingPodsMetrics is 9 in this test
 | 
			
		||||
	testDeletingPodsMetrics(t, 9, metrics.PodGCReasonTerminated)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func makePod(name string, nodeName string, phase v1.PodPhase) *v1.Pod {
 | 
			
		||||
	return &v1.Pod{
 | 
			
		||||
		ObjectMeta: metav1.ObjectMeta{
 | 
			
		||||
			Name: name,
 | 
			
		||||
			Name:      name,
 | 
			
		||||
			Namespace: metav1.NamespaceDefault,
 | 
			
		||||
		},
 | 
			
		||||
		Spec:   v1.PodSpec{NodeName: nodeName},
 | 
			
		||||
		Status: v1.PodStatus{Phase: phase},
 | 
			
		||||
@@ -408,6 +413,9 @@ func TestGCOrphaned(t *testing.T) {
 | 
			
		||||
			verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
 | 
			
		||||
		})
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// testDeletingPodsMetrics is 10 in this test
 | 
			
		||||
	testDeletingPodsMetrics(t, 10, metrics.PodGCReasonOrphaned)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func TestGCUnscheduledTerminating(t *testing.T) {
 | 
			
		||||
@@ -466,7 +474,7 @@ func TestGCUnscheduledTerminating(t *testing.T) {
 | 
			
		||||
			for _, pod := range test.pods {
 | 
			
		||||
				creationTime = creationTime.Add(1 * time.Hour)
 | 
			
		||||
				pods = append(pods, &v1.Pod{
 | 
			
		||||
					ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime},
 | 
			
		||||
					ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime},
 | 
			
		||||
						DeletionTimestamp: pod.deletionTimeStamp},
 | 
			
		||||
					Status: v1.PodStatus{Phase: pod.phase},
 | 
			
		||||
					Spec:   v1.PodSpec{NodeName: pod.nodeName},
 | 
			
		||||
@@ -489,6 +497,9 @@ func TestGCUnscheduledTerminating(t *testing.T) {
 | 
			
		||||
			verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
 | 
			
		||||
		})
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// testDeletingPodsMetrics is 6 in this test
 | 
			
		||||
	testDeletingPodsMetrics(t, 6, metrics.PodGCReasonTerminatingUnscheduled)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func TestGCTerminating(t *testing.T) {
 | 
			
		||||
@@ -637,7 +648,7 @@ func TestGCTerminating(t *testing.T) {
 | 
			
		||||
			for _, pod := range test.pods {
 | 
			
		||||
				creationTime = creationTime.Add(1 * time.Hour)
 | 
			
		||||
				pods = append(pods, &v1.Pod{
 | 
			
		||||
					ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime},
 | 
			
		||||
					ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime},
 | 
			
		||||
						DeletionTimestamp: pod.deletionTimeStamp},
 | 
			
		||||
					Status: v1.PodStatus{Phase: pod.phase},
 | 
			
		||||
					Spec:   v1.PodSpec{NodeName: pod.nodeName},
 | 
			
		||||
@@ -657,8 +668,8 @@ func TestGCTerminating(t *testing.T) {
 | 
			
		||||
			verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
 | 
			
		||||
		})
 | 
			
		||||
	}
 | 
			
		||||
	// deletingPodsTotal is 7 in this test
 | 
			
		||||
	testDeletingPodsMetrics(t, 7)
 | 
			
		||||
	// testDeletingPodsMetrics is 7 in this test
 | 
			
		||||
	testDeletingPodsMetrics(t, 7, metrics.PodGCReasonTerminatingOutOfService)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func verifyDeletedAndPatchedPods(t *testing.T, client *fake.Clientset, wantDeletedPodNames, wantPatchedPodNames sets.String) {
 | 
			
		||||
@@ -673,18 +684,18 @@ func verifyDeletedAndPatchedPods(t *testing.T, client *fake.Clientset, wantDelet
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func testDeletingPodsMetrics(t *testing.T, inputDeletingPodsTotal int) {
 | 
			
		||||
func testDeletingPodsMetrics(t *testing.T, total int, reason string) {
 | 
			
		||||
	t.Helper()
 | 
			
		||||
 | 
			
		||||
	actualDeletingPodsTotal, err := metricstestutil.GetCounterMetricValue(deletingPodsTotal.WithLabelValues())
 | 
			
		||||
	actualDeletingPodsTotal, err := metricstestutil.GetCounterMetricValue(metrics.DeletingPodsTotal.WithLabelValues(metav1.NamespaceDefault, reason))
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		t.Errorf("Error getting actualDeletingPodsTotal")
 | 
			
		||||
	}
 | 
			
		||||
	if actualDeletingPodsTotal != float64(inputDeletingPodsTotal) {
 | 
			
		||||
		t.Errorf("Expected desiredDeletingPodsTotal to be %d, got %v", inputDeletingPodsTotal, actualDeletingPodsTotal)
 | 
			
		||||
	if actualDeletingPodsTotal != float64(total) {
 | 
			
		||||
		t.Errorf("Expected desiredDeletingPodsTotal to be %d, got %v", total, actualDeletingPodsTotal)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	actualDeletingPodsErrorTotal, err := metricstestutil.GetCounterMetricValue(deletingPodsErrorTotal.WithLabelValues())
 | 
			
		||||
	actualDeletingPodsErrorTotal, err := metricstestutil.GetCounterMetricValue(metrics.DeletingPodsErrorTotal.WithLabelValues("", reason))
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		t.Errorf("Error getting actualDeletingPodsErrorTotal")
 | 
			
		||||
	}
 | 
			
		||||
 
 | 
			
		||||
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 | 
			
		||||
limitations under the License.
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
package podgc
 | 
			
		||||
package metrics
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"sync"
 | 
			
		||||
@@ -28,32 +28,47 @@ const (
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
var (
 | 
			
		||||
	deletingPodsTotal = metrics.NewCounterVec(
 | 
			
		||||
	DeletingPodsTotal = metrics.NewCounterVec(
 | 
			
		||||
		&metrics.CounterOpts{
 | 
			
		||||
			Subsystem:      podGCController,
 | 
			
		||||
			Name:           "force_delete_pods_total",
 | 
			
		||||
			Help:           "Number of pods that are being forcefully deleted since the Pod GC Controller started.",
 | 
			
		||||
			StabilityLevel: metrics.ALPHA,
 | 
			
		||||
		},
 | 
			
		||||
		[]string{},
 | 
			
		||||
		[]string{"namespace", "reason"},
 | 
			
		||||
	)
 | 
			
		||||
	deletingPodsErrorTotal = metrics.NewCounterVec(
 | 
			
		||||
	DeletingPodsErrorTotal = metrics.NewCounterVec(
 | 
			
		||||
		&metrics.CounterOpts{
 | 
			
		||||
			Subsystem:      podGCController,
 | 
			
		||||
			Name:           "force_delete_pod_errors_total",
 | 
			
		||||
			Help:           "Number of errors encountered when forcefully deleting the pods since the Pod GC Controller started.",
 | 
			
		||||
			StabilityLevel: metrics.ALPHA,
 | 
			
		||||
		},
 | 
			
		||||
		[]string{},
 | 
			
		||||
		[]string{"namespace", "reason"},
 | 
			
		||||
	)
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
const (
 | 
			
		||||
	// Possible values for the "reason" label in the above metrics.
 | 
			
		||||
 | 
			
		||||
	// PodGCReasonTerminated is used when the pod is terminated.
 | 
			
		||||
	PodGCReasonTerminated = "terminated"
 | 
			
		||||
	// PodGCReasonCompleted is used when the pod is terminating and the corresponding node
 | 
			
		||||
	// is not ready and has `node.kubernetes.io/out-of-service` taint.
 | 
			
		||||
	PodGCReasonTerminatingOutOfService = "out-of-service"
 | 
			
		||||
	// PodGCReasonOrphaned is used when the pod is orphaned which means the corresponding node
 | 
			
		||||
	// has been deleted.
 | 
			
		||||
	PodGCReasonOrphaned = "orphaned"
 | 
			
		||||
	// PodGCReasonUnscheduled is used when the pod is terminating and unscheduled.
 | 
			
		||||
	PodGCReasonTerminatingUnscheduled = "unscheduled"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
var registerMetrics sync.Once
 | 
			
		||||
 | 
			
		||||
// Register the metrics that are to be monitored.
 | 
			
		||||
func RegisterMetrics() {
 | 
			
		||||
	registerMetrics.Do(func() {
 | 
			
		||||
		legacyregistry.MustRegister(deletingPodsTotal)
 | 
			
		||||
		legacyregistry.MustRegister(deletingPodsErrorTotal)
 | 
			
		||||
		legacyregistry.MustRegister(DeletingPodsTotal)
 | 
			
		||||
		legacyregistry.MustRegister(DeletingPodsErrorTotal)
 | 
			
		||||
	})
 | 
			
		||||
}
 | 
			
		||||
		Reference in New Issue
	
	Block a user