Merge pull request #35932 from jayunit100/sched_events_spam_reduce
Automatic merge from submit-queue Reduce spam in Events from scheduler by counter aggregation of failure Fixes #35842 Part of overall #35555
This commit is contained in:
		| @@ -49,14 +49,24 @@ var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods") | ||||
| func (f *FitError) Error() string { | ||||
| 	var buf bytes.Buffer | ||||
| 	buf.WriteString(fmt.Sprintf("pod (%s) failed to fit in any node\n", f.Pod.Name)) | ||||
| 	for node, predicates := range f.FailedPredicates { | ||||
| 		reasons := make([]string, 0) | ||||
| 	reasons := make(map[string]int) | ||||
| 	for _, predicates := range f.FailedPredicates { | ||||
| 		for _, pred := range predicates { | ||||
| 			reasons = append(reasons, pred.GetReason()) | ||||
| 			reasons[pred.GetReason()] += 1 | ||||
| 		} | ||||
| 		reasonMsg := fmt.Sprintf("fit failure on node (%s): %s\n", node, strings.Join(reasons, ", ")) | ||||
| 		buf.WriteString(reasonMsg) | ||||
| 	} | ||||
|  | ||||
| 	sortReasonsHistogram := func() []string { | ||||
| 		reasonStrings := []string{} | ||||
| 		for k, v := range reasons { | ||||
| 			reasonStrings = append(reasonStrings, fmt.Sprintf("%v (%v)", k, v)) | ||||
| 		} | ||||
| 		sort.Strings(reasonStrings) | ||||
| 		return reasonStrings | ||||
| 	} | ||||
|  | ||||
| 	reasonMsg := fmt.Sprintf("fit failure summary on nodes : %v", strings.Join(sortReasonsHistogram(), ", ")) | ||||
| 	buf.WriteString(reasonMsg) | ||||
| 	return buf.String() | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -18,6 +18,7 @@ package scheduler | ||||
|  | ||||
| import ( | ||||
| 	"errors" | ||||
| 	"fmt" | ||||
| 	"reflect" | ||||
| 	"testing" | ||||
| 	"time" | ||||
| @@ -331,49 +332,66 @@ func TestSchedulerFailedSchedulingReasons(t *testing.T) { | ||||
| 	defer close(stop) | ||||
| 	queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) | ||||
| 	scache := schedulercache.New(10*time.Minute, stop) | ||||
| 	node := api.Node{ | ||||
| 		ObjectMeta: api.ObjectMeta{Name: "machine1"}, | ||||
| 		Status: api.NodeStatus{ | ||||
| 			Capacity: api.ResourceList{ | ||||
| 				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)), | ||||
| 				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)), | ||||
| 				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)), | ||||
| 			}, | ||||
| 			Allocatable: api.ResourceList{ | ||||
| 				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)), | ||||
| 				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)), | ||||
| 				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)), | ||||
| 			}}, | ||||
|  | ||||
| 	// Design the baseline for the pods, and we will make nodes that dont fit it later. | ||||
| 	var cpu = int64(4) | ||||
| 	var mem = int64(500) | ||||
| 	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{ | ||||
| 		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)), | ||||
| 		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)), | ||||
| 	}, api.ResourceList{ | ||||
| 		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)), | ||||
| 		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)), | ||||
| 	}) | ||||
|  | ||||
| 	// create several nodes which cannot schedule the above pod | ||||
| 	nodes := []*api.Node{} | ||||
| 	for i := 0; i < 100; i++ { | ||||
| 		node := api.Node{ | ||||
| 			ObjectMeta: api.ObjectMeta{Name: fmt.Sprintf("machine%v", i)}, | ||||
| 			Status: api.NodeStatus{ | ||||
| 				Capacity: api.ResourceList{ | ||||
| 					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)), | ||||
| 					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)), | ||||
| 					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)), | ||||
| 				}, | ||||
| 				Allocatable: api.ResourceList{ | ||||
| 					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)), | ||||
| 					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)), | ||||
| 					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)), | ||||
| 				}}, | ||||
| 		} | ||||
| 		scache.AddNode(&node) | ||||
| 		nodes = append(nodes, &node) | ||||
| 	} | ||||
| 	scache.AddNode(&node) | ||||
| 	nodeLister := algorithm.FakeNodeLister([]*api.Node{&node}) | ||||
| 	nodeLister := algorithm.FakeNodeLister(nodes) | ||||
| 	predicateMap := map[string]algorithm.FitPredicate{ | ||||
| 		"PodFitsResources": predicates.PodFitsResources, | ||||
| 	} | ||||
|  | ||||
| 	// Create expected failure reasons for all the nodes.  Hopefully they will get rolled up into a non-spammy summary. | ||||
| 	failedPredicatesMap := FailedPredicateMap{} | ||||
| 	for _, node := range nodes { | ||||
| 		failedPredicatesMap[node.Name] = []algorithm.PredicateFailureReason{ | ||||
| 			predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000), | ||||
| 			predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100), | ||||
| 		} | ||||
| 	} | ||||
| 	scheduler, _, errChan := setupTestScheduler(queuedPodStore, scache, nodeLister, predicateMap) | ||||
|  | ||||
| 	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{ | ||||
| 		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)), | ||||
| 		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)), | ||||
| 	}, api.ResourceList{ | ||||
| 		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)), | ||||
| 		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)), | ||||
| 	}) | ||||
| 	queuedPodStore.Add(podWithTooBigResourceRequests) | ||||
| 	scheduler.scheduleOne() | ||||
|  | ||||
| 	select { | ||||
| 	case err := <-errChan: | ||||
| 		expectErr := &FitError{ | ||||
| 			Pod: podWithTooBigResourceRequests, | ||||
| 			FailedPredicates: FailedPredicateMap{node.Name: []algorithm.PredicateFailureReason{ | ||||
| 				predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000), | ||||
| 				predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100), | ||||
| 			}}, | ||||
| 			Pod:              podWithTooBigResourceRequests, | ||||
| 			FailedPredicates: failedPredicatesMap, | ||||
| 		} | ||||
| 		if len(fmt.Sprint(expectErr)) > 150 { | ||||
| 			t.Errorf("message is too spammy ! %v ", len(fmt.Sprint(expectErr))) | ||||
| 		} | ||||
| 		if !reflect.DeepEqual(expectErr, err) { | ||||
| 			t.Errorf("err want=%+v, get=%+v", expectErr, err) | ||||
| 			t.Errorf("\n err \nWANT=%+v,\nGOT=%+v", expectErr, err) | ||||
| 		} | ||||
| 	case <-time.After(wait.ForeverTestTimeout): | ||||
| 		t.Fatalf("timeout after %v", wait.ForeverTestTimeout) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Kubernetes Submit Queue
					Kubernetes Submit Queue