Merge pull request #41147 from derekwaynecarr/improve-eviction-logs
Automatic merge from submit-queue (batch tested with PRs 41074, 41147, 40854, 41167, 40045)

Add debug logging to eviction manager

**What this PR does / why we need it**: This PR adds debug logging to the eviction manager. We need it to help users understand when and why the eviction manager is or is not making eviction decisions, and to aid information gathering during support.
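All of the new messages are emitted at glog verbosity level 3, so they only appear when the kubelet runs with `--v=3` or higher. Since every new line shares the `eviction manager:` prefix, they are easy to filter; a minimal sketch, assuming local-up-cluster.sh's default log location of /tmp (adjust the path if LOG_DIR is overridden):

    # show only the eviction manager's debug output from a local cluster run
    grep 'eviction manager:' /tmp/kubelet.log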
hack/local-up-cluster.sh

@@ -41,6 +41,11 @@ CGROUP_ROOT=${CGROUP_ROOT:-""}
 # name of the cgroup driver, i.e. cgroupfs or systemd
 CGROUP_DRIVER=${CGROUP_DRIVER:-""}

+# enables testing eviction scenarios locally.
+EVICTION_HARD=${EVICTION_HARD:-"memory.available<100Mi"}
+EVICTION_SOFT=${EVICTION_SOFT:-""}
+EVICTION_PRESSURE_TRANSITION_PERIOD=${EVICTION_PRESSURE_TRANSITION_PERIOD:-"1m"}
+
 # We disable cluster DNS by default because this script uses docker0 (or whatever
 # container bridge docker is currently using) and we don't know the IP of the
 # DNS pod to pass in as --cluster-dns. To set this up by hand, set this flag
@@ -558,6 +563,9 @@ function start_kubelet {
         --cgroup-driver=${CGROUP_DRIVER} \
         --cgroup-root=${CGROUP_ROOT} \
         --keep-terminated-pod-volumes=true \
+        --eviction-hard=${EVICTION_HARD} \
+        --eviction-soft=${EVICTION_SOFT} \
+        --eviction-pressure-transition-period=${EVICTION_PRESSURE_TRANSITION_PERIOD} \
         ${auth_args} \
         ${dns_args} \
         ${net_plugin_dir_args} \
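Each variable falls back to its default via the `${VAR:-default}` shell pattern, so eviction behavior can be tuned from the environment when launching the script; for example (the 500Mi value is purely illustrative):

    # make hard memory eviction easier to trigger than the 100Mi default allows
    EVICTION_HARD="memory.available<500Mi" hack/local-up-cluster.sh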
pkg/kubelet/eviction/eviction_manager.go

@@ -187,6 +187,8 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 		return
 	}

+	glog.V(3).Infof("eviction manager: synchronize housekeeping")
+
 	// build the ranking functions (if not yet known)
 	// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
 	if len(m.resourceToRankFunc) == 0 || len(m.resourceToNodeReclaimFuncs) == 0 {
@@ -205,6 +207,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 		glog.Errorf("eviction manager: unexpected err: %v", err)
 		return
 	}
+	debugLogObservations("observations", observations)

 	// attempt to create a threshold notifier to improve eviction response time
 	if m.config.KernelMemcgNotification && !m.notifiersInitialized {
@@ -231,15 +234,18 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act

 	// determine the set of thresholds met independent of grace period
 	thresholds = thresholdsMet(thresholds, observations, false)
+	debugLogThresholdsWithObservation("thresholds - ignoring grace period", thresholds, observations)

 	// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
 	if len(m.thresholdsMet) > 0 {
 		thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
 		thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
 	}
+	debugLogThresholdsWithObservation("thresholds - reclaim not satisfied", thresholds, observations)

 	// determine the set of thresholds whose stats have been updated since the last sync
 	thresholds = thresholdsUpdatedStats(thresholds, observations, m.lastObservations)
+	debugLogThresholdsWithObservation("thresholds - updated stats", thresholds, observations)

 	// track when a threshold was first observed
 	now := m.clock.Now()
@@ -247,15 +253,22 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act

 	// the set of node conditions that are triggered by currently observed thresholds
 	nodeConditions := nodeConditions(thresholds)
+	if len(nodeConditions) > 0 {
+		glog.V(3).Infof("eviction manager: node conditions - observed: %v", nodeConditions)
+	}

 	// track when a node condition was last observed
 	nodeConditionsLastObservedAt := nodeConditionsLastObservedAt(nodeConditions, m.nodeConditionsLastObservedAt, now)

 	// node conditions report true if they have been observed within the transition period window
 	nodeConditions = nodeConditionsObservedSince(nodeConditionsLastObservedAt, m.config.PressureTransitionPeriod, now)
+	if len(nodeConditions) > 0 {
+		glog.V(3).Infof("eviction manager: node conditions - transition period not met: %v", nodeConditions)
+	}

 	// determine the set of thresholds we need to drive eviction behavior (i.e. all grace periods are met)
 	thresholds = thresholdsMetGracePeriod(thresholdsFirstObservedAt, now)
+	debugLogThresholdsWithObservation("thresholds - grace periods satisfied", thresholds, observations)

 	// update internal state
 	m.Lock()
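Taken together, these log lines trace each stage of a synchronize pass: the raw signal observations, the thresholds met when grace periods are ignored, the thresholds whose min-reclaim is not yet satisfied, the thresholds with freshly updated stats, the node conditions derived from them, and finally the thresholds whose grace periods are satisfied. Reading a -v=3 log top to bottom therefore shows exactly at which stage a threshold was kept or dropped.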
pkg/kubelet/eviction/helpers.go

@@ -694,6 +694,29 @@ func thresholdsMet(thresholds []Threshold, observations signalObservations, enfo
 	return results
 }

+func debugLogObservations(logPrefix string, observations signalObservations) {
+	for k, v := range observations {
+		if !v.time.IsZero() {
+			glog.V(3).Infof("eviction manager: %v: signal=%v, available: %v, capacity: %v, time: %v", logPrefix, k, v.available, v.capacity, v.time)
+		} else {
+			glog.V(3).Infof("eviction manager: %v: signal=%v, available: %v, capacity: %v", logPrefix, k, v.available, v.capacity)
+		}
+	}
+}
+
+func debugLogThresholdsWithObservation(logPrefix string, thresholds []Threshold, observations signalObservations) {
+	for i := range thresholds {
+		threshold := thresholds[i]
+		observed, found := observations[threshold.Signal]
+		if found {
+			quantity := getThresholdQuantity(threshold.Value, observed.capacity)
+			glog.V(3).Infof("eviction manager: %v: threshold [signal=%v, quantity=%v] observed %v", logPrefix, threshold.Signal, quantity, observed.available)
+		} else {
+			glog.V(3).Infof("eviction manager: %v: threshold [signal=%v] had no observation", logPrefix, threshold.Signal)
+		}
+	}
+}
+
 func thresholdsUpdatedStats(thresholds []Threshold, observations, lastObservations signalObservations) []Threshold {
 	results := []Threshold{}
 	for i := range thresholds {
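Given the format strings above, a single synchronize pass under memory pressure would emit lines like the following (values and timestamps are hypothetical, and glog's date/file prefixes are omitted):

    eviction manager: observations: signal=memory.available, available: 90Mi, capacity: 4Gi, time: 2017-02-08 10:00:00 +0000 UTC
    eviction manager: thresholds - ignoring grace period: threshold [signal=memory.available, quantity=100Mi] observed 90Mi
    eviction manager: node conditions - observed: [MemoryPressure]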