Merge pull request #126067 from tenzen-y/implement-job-success-policy-e2e

Graduate the JobSuccessPolicy to Beta
This commit is contained in:
Kubernetes Prow Robot
2024-07-23 06:14:23 -07:00
committed by GitHub
14 changed files with 374 additions and 62 deletions

View File

@@ -988,7 +988,12 @@ func (jm *Controller) newSuccessCondition() *batch.JobCondition {
if delayTerminalCondition() {
cType = batch.JobSuccessCriteriaMet
}
return newCondition(cType, v1.ConditionTrue, "", "", jm.clock.Now())
var reason, message string
if feature.DefaultFeatureGate.Enabled(features.JobSuccessPolicy) {
reason = batch.JobReasonCompletionsReached
message = "Reached expected number of succeeded pods"
}
return newCondition(cType, v1.ConditionTrue, reason, message, jm.clock.Now())
}
func delayTerminalCondition() bool {
@@ -1419,7 +1424,7 @@ func (jm *Controller) recordJobFinished(job *batch.Job, finishedCond *batch.JobC
jm.recorder.Event(job, v1.EventTypeWarning, "TooManySucceededPods", "Too many succeeded pods running after completion count reached")
}
jm.recorder.Event(job, v1.EventTypeNormal, "Completed", "Job completed")
metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded", "").Inc()
metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded", finishedCond.Reason).Inc()
} else {
jm.recorder.Event(job, v1.EventTypeWarning, finishedCond.Reason, finishedCond.Message)
metrics.JobFinishedNum.WithLabelValues(completionMode, "failed", finishedCond.Reason).Inc()

View File

@@ -4991,6 +4991,45 @@ func TestSyncJobWithJobSuccessPolicy(t *testing.T) {
},
},
},
"job without successPolicy; jobSuccessPolicy is enabled; job got SuccessCriteriaMet and Completion with CompletionsReached reason conditions": {
enableJobSuccessPolicy: true,
enableJobManagedBy: true,
job: batch.Job{
TypeMeta: validTypeMeta,
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
CompletionMode: ptr.To(batch.IndexedCompletion),
Completions: ptr.To[int32](1),
Parallelism: ptr.To[int32](1),
BackoffLimit: ptr.To[int32](math.MaxInt32),
},
},
pods: []v1.Pod{
*buildPod().uid("a1").index("0").phase(v1.PodSucceeded).trackingFinalizer().Pod,
},
wantStatus: batch.JobStatus{
Failed: 0,
Succeeded: 1,
CompletedIndexes: "0",
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
Conditions: []batch.JobCondition{
{
Type: batch.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
Reason: batch.JobReasonCompletionsReached,
Message: "Reached expected number of succeeded pods",
},
{
Type: batch.JobComplete,
Status: v1.ConditionTrue,
Reason: batch.JobReasonCompletionsReached,
Message: "Reached expected number of succeeded pods",
},
},
},
},
"when the JobSuccessPolicy is disabled, the Job never got SuccessCriteriaMet condition even if the Job has the successPolicy field": {
job: batch.Job{
TypeMeta: validTypeMeta,
@@ -5132,12 +5171,16 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
Conditions: []batch.JobCondition{
{
Type: batch.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
Type: batch.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
Reason: batch.JobReasonCompletionsReached,
Message: "Reached expected number of succeeded pods",
},
{
Type: batch.JobComplete,
Status: v1.ConditionTrue,
Type: batch.JobComplete,
Status: v1.ConditionTrue,
Reason: batch.JobReasonCompletionsReached,
Message: "Reached expected number of succeeded pods",
},
},
},
@@ -7066,8 +7109,10 @@ func TestJobBackoffForOnFailure(t *testing.T) {
expectedFailed: 0,
expectedConditions: []batch.JobCondition{
{
Type: batch.JobComplete,
Status: v1.ConditionTrue,
Type: batch.JobComplete,
Status: v1.ConditionTrue,
Reason: batch.JobReasonCompletionsReached,
Message: "Reached expected number of succeeded pods",
},
},
},
@@ -7085,12 +7130,16 @@ func TestJobBackoffForOnFailure(t *testing.T) {
expectedFailed: 0,
expectedConditions: []batch.JobCondition{
{
Type: batch.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
Type: batch.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
Reason: batch.JobReasonCompletionsReached,
Message: "Reached expected number of succeeded pods",
},
{
Type: batch.JobComplete,
Status: v1.ConditionTrue,
Type: batch.JobComplete,
Status: v1.ConditionTrue,
Reason: batch.JobReasonCompletionsReached,
Message: "Reached expected number of succeeded pods",
},
},
},
@@ -7108,12 +7157,16 @@ func TestJobBackoffForOnFailure(t *testing.T) {
expectedFailed: 0,
expectedConditions: []batch.JobCondition{
{
Type: batch.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
Type: batch.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
Reason: batch.JobReasonCompletionsReached,
Message: "Reached expected number of succeeded pods",
},
{
Type: batch.JobComplete,
Status: v1.ConditionTrue,
Type: batch.JobComplete,
Status: v1.ConditionTrue,
Reason: batch.JobReasonCompletionsReached,
Message: "Reached expected number of succeeded pods",
},
},
},

View File

@@ -55,12 +55,14 @@ var (
},
[]string{"completion_mode", "result", "action"},
)
// JobFinishedNum tracks the number of Jobs that finish. Empty reason label
// is used to count successful jobs.
// JobFinishedNum tracks the number of Jobs that finish.
// TODO: Once the JobSuccessPolicy feature gate is removed, drop the "" (empty) reason label from this comment.
// When the JobSuccessPolicy feature gate is disabled, successful jobs are counted with an empty reason label.
// Otherwise, successful jobs are counted with the reason of the finished condition (e.g. "CompletionsReached").
// Possible label values:
// completion_mode: Indexed, NonIndexed
// result: failed, succeeded
// reason: "BackoffLimitExceeded", "DeadlineExceeded", "PodFailurePolicy", "FailedIndexes", "MaxFailedIndexesExceeded", ""
// reason: "BackoffLimitExceeded", "DeadlineExceeded", "PodFailurePolicy", "FailedIndexes", "MaxFailedIndexesExceeded", "SuccessPolicy", "CompletionsReached", ""
JobFinishedNum = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: JobControllerSubsystem,