Support handling of pod failures with respect to the specified rules
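The Job controller now handles pod failures according to the rules in the Job's spec.podFailurePolicy (behind the alpha JobPodFailurePolicy feature gate). A failed pod can be ignored (not counted against spec.backoffLimit), counted as before, or cause the whole Job to fail immediately with a condition whose reason is PodFailurePolicy.

For orientation, here is a minimal sketch of the kind of policy this change evaluates, written against the batch/v1 types used throughout the diff; the rule values are illustrative, not part of the commit:

	policy := &batch.PodFailurePolicy{
		Rules: []batch.PodFailurePolicyRule{
			{
				// Failures with these exit codes do not count towards backoffLimit.
				Action: batch.PodFailurePolicyActionIgnore,
				OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
					Operator: batch.PodFailurePolicyOnExitCodesOpIn,
					Values:   []int32{1, 2, 3},
				},
			},
			{
				// Any container exiting with 42 fails the whole Job immediately.
				Action: batch.PodFailurePolicyActionFailJob,
				OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
					Operator: batch.PodFailurePolicyOnExitCodesOpIn,
					Values:   []int32{42},
				},
			},
		},
	}
	job.Spec.PodFailurePolicy = policy

Rules are evaluated in order and only the first matching rule is applied.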
@@ -61,6 +61,12 @@ import (
// a Job. It is used if the feature gate JobReadyPods is enabled.
const podUpdateBatchPeriod = time.Second

const (
	// PodFailurePolicy reason indicates a job failure condition is added due to
	// a failed pod matching a pod failure policy rule
	jobConditionReasonPodFailurePolicy = "PodFailurePolicy"
)

// controllerKind contains the schema.GroupVersionKind for this controller type.
var controllerKind = batch.SchemeGroupVersion.WithKind("Job")

@@ -758,16 +764,31 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (forget bool, rEr
	exceedsBackoffLimit := jobHasNewFailure && (active != *job.Spec.Parallelism) &&
		(failed > *job.Spec.BackoffLimit)

	if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
		// check if the number of pod restarts exceeds the backoff limit (for restartPolicy OnFailure only)
		// OR if the number of failed pods increased since the last syncJob
		finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "BackoffLimitExceeded", "Job has reached the specified backoff limit")
	} else if pastActiveDeadline(&job) {
		finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "DeadlineExceeded", "Job was active longer than specified deadline")
	} else if job.Spec.ActiveDeadlineSeconds != nil && !jobSuspended(&job) {
		syncDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds)*time.Second - time.Since(job.Status.StartTime.Time)
		klog.V(2).InfoS("Job has activeDeadlineSeconds configuration. Will sync this job again", "job", key, "nextSyncIn", syncDuration)
		jm.queue.AddAfter(key, syncDuration)
	if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
		if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.AlphaNoCompatGuaranteeJobFailureTarget); failureTargetCondition != nil {
			finishedCondition = newFailedConditionForFailureTarget(failureTargetCondition)
		} else if failJobMessage := getFailJobMessage(&job, pods, uncounted.Failed()); failJobMessage != nil {
			if uncounted != nil {
				// Prepare the interim FailureTarget condition to record the failure message before the finalizers are removed (allowing the pods to be deleted).
				finishedCondition = newCondition(batch.AlphaNoCompatGuaranteeJobFailureTarget, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage)
			} else {
				// Prepare the Failed job condition for the legacy path without finalizers (don't use the interim FailureTarget condition).
				finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage)
			}
		}
	}
	if finishedCondition == nil {
		if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
			// check if the number of pod restarts exceeds the backoff limit (for restartPolicy OnFailure only)
			// OR if the number of failed pods increased since the last syncJob
			finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "BackoffLimitExceeded", "Job has reached the specified backoff limit")
		} else if pastActiveDeadline(&job) {
			finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "DeadlineExceeded", "Job was active longer than specified deadline")
		} else if job.Spec.ActiveDeadlineSeconds != nil && !jobSuspended(&job) {
			syncDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds)*time.Second - time.Since(job.Status.StartTime.Time)
			klog.V(2).InfoS("Job has activeDeadlineSeconds configuration. Will sync this job again", "job", key, "nextSyncIn", syncDuration)
			jm.queue.AddAfter(key, syncDuration)
		}
	}

	var prevSucceededIndexes, succeededIndexes orderedIntervals
@@ -1039,8 +1060,16 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
	} else if pod.Status.Phase == v1.PodFailed || podTerminating {
		ix := getCompletionIndex(pod.Annotations)
		if !uncounted.failed.Has(string(pod.UID)) && (!isIndexed || (ix != unknownCompletionIndex && ix < int(*job.Spec.Completions))) {
			needsFlush = true
			uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID)
			if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil {
				_, countFailed := matchPodFailurePolicy(job.Spec.PodFailurePolicy, pod)
				if countFailed {
					needsFlush = true
					uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID)
				}
			} else {
				needsFlush = true
				uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID)
			}
		}
	}
	if len(newSucceededIndexes)+len(uncountedStatus.Succeeded)+len(uncountedStatus.Failed) >= MaxUncountedPods {
@@ -1060,6 +1089,18 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
		job.Status.Succeeded = int32(succeededIndexes.total())
		job.Status.CompletedIndexes = succeededIndexes.String()
	}
	if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
		if finishedCond != nil && finishedCond.Type == batch.AlphaNoCompatGuaranteeJobFailureTarget {

			// Append the interim FailureTarget condition to update the job status with before finalizers are removed.
			job.Status.Conditions = append(job.Status.Conditions, *finishedCond)
			needsFlush = true

			// Prepare the final Failed condition to update the job status with after the finalizers are removed.
			// It is also used in the enactJobFinished function for reporting.
			finishedCond = newFailedConditionForFailureTarget(finishedCond)
		}
	}
	var err error
	if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, needsFlush); err != nil {
		return err
@@ -1077,7 +1118,8 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
}

// flushUncountedAndRemoveFinalizers does:
// 1. flush the Job status that might include new uncounted Pod UIDs.
// 1. flush the Job status that might include new uncounted Pod UIDs. Also flush the interim FailureTarget condition
//    if present.
// 2. perform the removal of finalizers from Pods which are in the uncounted
//    lists.
// 3. update the counters based on the Pods for which it successfully removed
@@ -1231,6 +1273,12 @@ func filterInUncountedUIDs(uncounted []types.UID, include sets.String) []types.U
	return newUncounted
}

// newFailedConditionForFailureTarget creates a job Failed condition based on
// the interim FailureTarget condition.
func newFailedConditionForFailureTarget(condition *batch.JobCondition) *batch.JobCondition {
	return newCondition(batch.JobFailed, v1.ConditionTrue, condition.Reason, condition.Message)
}

// pastBackoffLimitOnFailure checks if container restartCounts sum exceeds BackoffLimit
// this method applies only to pods with restartPolicy == OnFailure
func pastBackoffLimitOnFailure(job *batch.Job, pods []*v1.Pod) bool {
@@ -1282,7 +1330,24 @@ func newCondition(conditionType batch.JobConditionType, status v1.ConditionStatu
	}
}

// getStatus returns number of succeeded and failed pods running a job
// getFailJobMessage returns a job failure message if the job should fail with the current counters
func getFailJobMessage(job *batch.Job, pods []*v1.Pod, uncounted sets.String) *string {
	if !feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) || job.Spec.PodFailurePolicy == nil {
		return nil
	}
	for _, p := range pods {
		if isPodFailed(p, uncounted != nil) {
			jobFailureMessage, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
			if jobFailureMessage != nil {
				return jobFailureMessage
			}
		}
	}
	return nil
}

// getStatus returns number of succeeded and failed pods running a job. The number
// of failed pods can be affected by the podFailurePolicy.
func getStatus(job *batch.Job, pods []*v1.Pod, uncounted *uncountedTerminatedPods, expectedRmFinalizers sets.String) (succeeded, failed int32) {
	if uncounted != nil {
		succeeded = job.Status.Succeeded
@@ -1292,13 +1357,15 @@ func getStatus(job *batch.Job, pods []*v1.Pod, uncounted *uncountedTerminatedPod
		return p.Status.Phase == v1.PodSucceeded
	}))
	failed += int32(countValidPodsWithFilter(job, pods, uncounted.Failed(), expectedRmFinalizers, func(p *v1.Pod) bool {
		if p.Status.Phase == v1.PodFailed {
			return true
		if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil {
			if !isPodFailed(p, uncounted != nil) {
				return false
			}
			_, countFailed := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
			return countFailed
		} else {
			return isPodFailed(p, uncounted != nil)
		}
		// When tracking with finalizers: counting deleted Pods as failures to
		// account for orphan Pods that never have a chance to reach the Failed
		// phase.
		return uncounted != nil && p.DeletionTimestamp != nil && p.Status.Phase != v1.PodSucceeded
	}))
	return succeeded, failed
}
@@ -1667,6 +1734,16 @@ func ensureJobConditionStatus(list []batch.JobCondition, cType batch.JobConditio
	return list, false
}

func isPodFailed(p *v1.Pod, wFinalizers bool) bool {
	if p.Status.Phase == v1.PodFailed {
		return true
	}
	// When tracking with finalizers: counting deleted Pods as failures to
	// account for orphan Pods that never have a chance to reach the Failed
	// phase.
	return wFinalizers && p.DeletionTimestamp != nil && p.Status.Phase != v1.PodSucceeded
}

func findConditionByType(list []batch.JobCondition, cType batch.JobConditionType) *batch.JobCondition {
	for i := range list {
		if list[i].Type == cType {
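The interplay between the interim FailureTarget condition and the final Failed condition above deserves a short illustration. A rough sketch of the two-phase flow when tracking with finalizers, using only the helpers added in this diff (failJobMessage stands in for the message returned by getFailJobMessage):

	// Phase 1: persist the failure message before pod finalizers are removed.
	failJobMessage := "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1"
	interim := newCondition(batch.AlphaNoCompatGuaranteeJobFailureTarget, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, failJobMessage)
	job.Status.Conditions = append(job.Status.Conditions, *interim)
	// ... the status is flushed and pod finalizers are removed ...
	// Phase 2: derive the final Failed condition, keeping the recorded reason and message.
	finished := newFailedConditionForFailureTarget(interim)
	job.Status.Conditions = append(job.Status.Conditions, *finished)

If the controller restarts between the two phases, findConditionByType picks up the persisted FailureTarget condition on the next sync and syncJob resumes at phase 2, so the failure message is not lost.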
@@ -2019,6 +2019,962 @@ func TestSyncJobDeleted(t *testing.T) {
	}
}

func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
	indexedCompletionMode := batch.IndexedCompletion
	validObjectMeta := metav1.ObjectMeta{
		Name:      "foobar",
		UID:       uuid.NewUUID(),
		Namespace: metav1.NamespaceDefault,
	}
	validSelector := &metav1.LabelSelector{
		MatchLabels: map[string]string{"foo": "bar"},
	}
	validTemplate := v1.PodTemplateSpec{
		ObjectMeta: metav1.ObjectMeta{
			Labels: map[string]string{
				"foo": "bar",
			},
		},
		Spec: v1.PodSpec{
			Containers: []v1.Container{
				{Image: "foo/bar"},
			},
		},
	}

	onExitCodeRules := []batch.PodFailurePolicyRule{
		{
			Action: batch.PodFailurePolicyActionIgnore,
			OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
				Operator: batch.PodFailurePolicyOnExitCodesOpIn,
				Values:   []int32{1, 2, 3},
			},
		},
		{
			Action: batch.PodFailurePolicyActionFailJob,
			OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
				Operator: batch.PodFailurePolicyOnExitCodesOpIn,
				Values:   []int32{5, 6, 7},
			},
		},
	}

	testCases := map[string]struct {
		enableJobPodFailurePolicy bool
		job                       batch.Job
		pods                      []v1.PodStatus
		wantConditions            *[]batch.JobCondition
		wantStatusFailed          int32
		wantStatusActive          int32
		wantStatusSucceeded       int32
	}{
"default handling for pod failure if the container matching the exit codes does not match the containerName restriction": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionIgnore,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
ContainerName: pointer.String("main-container"),
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||
Values: []int32{1, 2, 3},
|
||||
},
|
||||
},
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionFailJob,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
ContainerName: pointer.String("main-container"),
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||
Values: []int32{5, 6, 7},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "monitoring-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 42,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: nil,
|
||||
wantStatusActive: 1,
|
||||
wantStatusSucceeded: 0,
|
||||
wantStatusFailed: 1,
|
||||
},
|
||||
"running pod should not result in job fail based on OnExitCodes": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: onExitCodeRules,
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodRunning,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: nil,
|
||||
wantStatusActive: 1,
|
||||
wantStatusFailed: 0,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"fail job based on OnExitCodes": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: onExitCodeRules,
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: &[]batch.JobCondition{
|
||||
{
|
||||
Type: batch.JobFailed,
|
||||
Status: v1.ConditionTrue,
|
||||
Reason: "PodFailurePolicy",
|
||||
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
|
||||
},
|
||||
},
|
||||
wantStatusActive: 0,
|
||||
wantStatusFailed: 1,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"job marked already as failure target with failed pod": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: onExitCodeRules,
|
||||
},
|
||||
},
|
||||
Status: batch.JobStatus{
|
||||
Conditions: []batch.JobCondition{
|
||||
{
|
||||
Type: batch.AlphaNoCompatGuaranteeJobFailureTarget,
|
||||
Status: v1.ConditionTrue,
|
||||
Reason: "PodFailurePolicy",
|
||||
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: &[]batch.JobCondition{
|
||||
{
|
||||
Type: batch.JobFailed,
|
||||
Status: v1.ConditionTrue,
|
||||
Reason: "PodFailurePolicy",
|
||||
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
|
||||
},
|
||||
},
|
||||
wantStatusActive: 0,
|
||||
wantStatusFailed: 1,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"job marked already as failure target with failed pod, message based on already deleted pod": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: onExitCodeRules,
|
||||
},
|
||||
},
|
||||
Status: batch.JobStatus{
|
||||
Conditions: []batch.JobCondition{
|
||||
{
|
||||
Type: batch.AlphaNoCompatGuaranteeJobFailureTarget,
|
||||
Status: v1.ConditionTrue,
|
||||
Reason: "PodFailurePolicy",
|
||||
Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: &[]batch.JobCondition{
|
||||
{
|
||||
Type: batch.JobFailed,
|
||||
Status: v1.ConditionTrue,
|
||||
Reason: "PodFailurePolicy",
|
||||
Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1",
|
||||
},
|
||||
},
|
||||
wantStatusActive: 0,
|
||||
wantStatusFailed: 1,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"default handling for a failed pod when the feature is disabled even, despite matching rule": {
|
||||
enableJobPodFailurePolicy: false,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: onExitCodeRules,
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: nil,
|
||||
wantStatusActive: 1,
|
||||
wantStatusFailed: 1,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"fail job with multiple pods": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(2),
|
||||
Completions: pointer.Int32(2),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: onExitCodeRules,
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodRunning,
|
||||
},
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: &[]batch.JobCondition{
|
||||
{
|
||||
Type: batch.JobFailed,
|
||||
Status: v1.ConditionTrue,
|
||||
Reason: "PodFailurePolicy",
|
||||
Message: "Container main-container for pod default/mypod-1 failed with exit code 5 matching FailJob rule at index 1",
|
||||
},
|
||||
},
|
||||
wantStatusActive: 0,
|
||||
wantStatusFailed: 2,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"fail indexed job based on OnExitCodes": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
CompletionMode: &indexedCompletionMode,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: onExitCodeRules,
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: &[]batch.JobCondition{
|
||||
{
|
||||
Type: batch.JobFailed,
|
||||
Status: v1.ConditionTrue,
|
||||
Reason: "PodFailurePolicy",
|
||||
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
|
||||
},
|
||||
},
|
||||
wantStatusActive: 0,
|
||||
wantStatusFailed: 1,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"fail job based on OnExitCodes with NotIn operator": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionFailJob,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
|
||||
Values: []int32{5, 6, 7},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 42,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: &[]batch.JobCondition{
|
||||
{
|
||||
Type: batch.JobFailed,
|
||||
Status: v1.ConditionTrue,
|
||||
Reason: "PodFailurePolicy",
|
||||
Message: "Container main-container for pod default/mypod-0 failed with exit code 42 matching FailJob rule at index 0",
|
||||
},
|
||||
},
|
||||
wantStatusActive: 0,
|
||||
wantStatusFailed: 1,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"default handling job based on OnExitCodes with NotIn operator": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionFailJob,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
|
||||
Values: []int32{5, 6, 7},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: nil,
|
||||
wantStatusActive: 1,
|
||||
wantStatusFailed: 1,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"fail job based on OnExitCodes for InitContainer": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: onExitCodeRules,
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
InitContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "init-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 143,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: &[]batch.JobCondition{
|
||||
{
|
||||
Type: batch.JobFailed,
|
||||
Status: v1.ConditionTrue,
|
||||
Reason: "PodFailurePolicy",
|
||||
Message: "Container init-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
|
||||
},
|
||||
},
|
||||
wantStatusActive: 0,
|
||||
wantStatusFailed: 1,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"ignore pod failure; both rules are matching, the first is executed only": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(0),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: onExitCodeRules,
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "container1",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "container2",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 6,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: nil,
|
||||
wantStatusActive: 1,
|
||||
wantStatusFailed: 0,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"ignore pod failure based on OnExitCodes": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(0),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: onExitCodeRules,
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: nil,
|
||||
wantStatusActive: 1,
|
||||
wantStatusFailed: 0,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"default job based on OnExitCodes": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(0),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: onExitCodeRules,
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: &[]batch.JobCondition{
|
||||
{
|
||||
Type: batch.JobFailed,
|
||||
Status: v1.ConditionTrue,
|
||||
Reason: "BackoffLimitExceeded",
|
||||
Message: "Job has reached the specified backoff limit",
|
||||
},
|
||||
},
|
||||
wantStatusActive: 0,
|
||||
wantStatusFailed: 1,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"count pod failure based on OnExitCodes; both rules are matching, the first is executed only": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionCount,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||
Values: []int32{1, 2},
|
||||
},
|
||||
},
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionIgnore,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||
Values: []int32{2, 3},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: nil,
|
||||
wantStatusActive: 1,
|
||||
wantStatusFailed: 1,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"count pod failure based on OnPodConditions; both rules are matching, the first is executed only": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionCount,
|
||||
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
|
||||
{
|
||||
Type: v1.PodConditionType("ResourceLimitExceeded"),
|
||||
Status: v1.ConditionTrue,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionIgnore,
|
||||
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
|
||||
{
|
||||
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
|
||||
Status: v1.ConditionTrue,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
Conditions: []v1.PodCondition{
|
||||
{
|
||||
Type: v1.PodConditionType("ResourceLimitExceeded"),
|
||||
Status: v1.ConditionTrue,
|
||||
},
|
||||
{
|
||||
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
|
||||
Status: v1.ConditionTrue,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: nil,
|
||||
wantStatusActive: 1,
|
||||
wantStatusFailed: 1,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"ignore pod failure based on OnPodConditions": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(0),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionIgnore,
|
||||
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
|
||||
{
|
||||
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
|
||||
Status: v1.ConditionTrue,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
Conditions: []v1.PodCondition{
|
||||
{
|
||||
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
|
||||
Status: v1.ConditionTrue,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: nil,
|
||||
wantStatusActive: 1,
|
||||
wantStatusFailed: 0,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
"fail job based on OnPodConditions": {
|
||||
enableJobPodFailurePolicy: true,
|
||||
job: batch.Job{
|
||||
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||
ObjectMeta: validObjectMeta,
|
||||
Spec: batch.JobSpec{
|
||||
Selector: validSelector,
|
||||
Template: validTemplate,
|
||||
Parallelism: pointer.Int32(1),
|
||||
Completions: pointer.Int32(1),
|
||||
BackoffLimit: pointer.Int32(6),
|
||||
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionFailJob,
|
||||
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
|
||||
{
|
||||
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
|
||||
Status: v1.ConditionTrue,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
pods: []v1.PodStatus{
|
||||
{
|
||||
Phase: v1.PodFailed,
|
||||
Conditions: []v1.PodCondition{
|
||||
{
|
||||
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
|
||||
Status: v1.ConditionTrue,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantConditions: &[]batch.JobCondition{
|
||||
{
|
||||
Type: batch.JobFailed,
|
||||
Status: v1.ConditionTrue,
|
||||
Reason: "PodFailurePolicy",
|
||||
Message: "Pod default/mypod-0 has condition DisruptionTarget matching FailJob rule at index 0",
|
||||
},
|
||||
},
|
||||
wantStatusActive: 0,
|
||||
wantStatusFailed: 1,
|
||||
wantStatusSucceeded: 0,
|
||||
},
|
||||
}
|
||||
	for _, wFinalizers := range []bool{false, true} {
		for name, tc := range testCases {
			t.Run(fmt.Sprintf("%s; finalizers=%t", name, wFinalizers), func(t *testing.T) {
				defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, wFinalizers)()
				defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)()
				clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
				manager, sharedInformerFactory := newControllerFromClient(clientset, controller.NoResyncPeriodFunc)
				fakePodControl := controller.FakePodControl{}
				manager.podControl = &fakePodControl
				manager.podStoreSynced = alwaysReady
				manager.jobStoreSynced = alwaysReady
				job := &tc.job

				if wFinalizers {
					job.Annotations = map[string]string{
						batch.JobTrackingFinalizer: "",
					}
				}

				actual := job
				manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) {
					actual = job
					return job, nil
				}
				sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
				for i, podStatus := range tc.pods {
					pb := buildPod().name(fmt.Sprintf("mypod-%d", i)).job(job).status(podStatus)
					if job.Spec.CompletionMode != nil && *job.Spec.CompletionMode == batch.IndexedCompletion {
						pb.index(fmt.Sprintf("%v", i))
					}
					if wFinalizers {
						pb.trackingFinalizer()
					}
					sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer().Add(pb.Pod)
				}

				manager.syncJob(context.TODO(), testutil.GetKey(job, t))

				if tc.wantConditions != nil {
					for _, wantCondition := range *tc.wantConditions {
						conditions := getConditionsByType(actual.Status.Conditions, wantCondition.Type)
						if len(conditions) != 1 {
							t.Fatalf("Expected a single completion condition. Got %#v for type: %q", conditions, wantCondition.Type)
						}
						condition := *conditions[0]
						if diff := cmp.Diff(wantCondition, condition, cmpopts.IgnoreFields(batch.JobCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" {
							t.Errorf("Unexpected job condition (-want,+got):\n%s", diff)
						}
					}
				} else {
					if cond := hasTrueCondition(actual); cond != nil {
						t.Errorf("Got condition %s, want none", *cond)
					}
				}
				// validate status
				if actual.Status.Active != tc.wantStatusActive {
					t.Errorf("unexpected number of active pods. Expected %d, saw %d\n", tc.wantStatusActive, actual.Status.Active)
				}
				if actual.Status.Succeeded != tc.wantStatusSucceeded {
					t.Errorf("unexpected number of succeeded pods. Expected %d, saw %d\n", tc.wantStatusSucceeded, actual.Status.Succeeded)
				}
				if actual.Status.Failed != tc.wantStatusFailed {
					t.Errorf("unexpected number of failed pods. Expected %d, saw %d\n", tc.wantStatusFailed, actual.Status.Failed)
				}
			})
		}
	}
}

func TestSyncJobUpdateRequeue(t *testing.T) {
	clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
	DefaultJobBackOff = time.Duration(0) // overwrite the default value for testing
@@ -3449,6 +4405,11 @@ func (pb podBuilder) index(ix string) podBuilder {
	return pb
}

func (pb podBuilder) status(s v1.PodStatus) podBuilder {
	pb.Status = s
	return pb
}

func (pb podBuilder) phase(p v1.PodPhase) podBuilder {
	pb.Status.Phase = p
	return pb
pkg/controller/job/pod_failure_policy.go (new file, 117 lines)
@@ -0,0 +1,117 @@
/*
Copyright 2021 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package job

import (
	"fmt"

	batch "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
)

// matchPodFailurePolicy returns information about matching a given failed pod
// against the pod failure policy rules. The information is represented as an
// optional job failure message (present in case the pod matched a 'FailJob'
// rule) and a boolean indicating if the failure should be counted towards
// backoffLimit (it should not be counted if the pod matched an 'Ignore' rule).
func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *v1.Pod) (*string, bool) {
	if podFailurePolicy == nil {
		return nil, true
	}
	for index, podFailurePolicyRule := range podFailurePolicy.Rules {
		if podFailurePolicyRule.OnExitCodes != nil {
			if containerStatus := matchOnExitCodes(&failedPod.Status, podFailurePolicyRule.OnExitCodes); containerStatus != nil {
				switch podFailurePolicyRule.Action {
				case batch.PodFailurePolicyActionIgnore:
					return nil, false
				case batch.PodFailurePolicyActionCount:
					return nil, true
				case batch.PodFailurePolicyActionFailJob:
					msg := fmt.Sprintf("Container %s for pod %s/%s failed with exit code %v matching %v rule at index %d",
						containerStatus.Name, failedPod.Namespace, failedPod.Name, containerStatus.State.Terminated.ExitCode, podFailurePolicyRule.Action, index)
					return &msg, true
				}
			}
		} else if podFailurePolicyRule.OnPodConditions != nil {
			if podCondition := matchOnPodConditions(&failedPod.Status, podFailurePolicyRule.OnPodConditions); podCondition != nil {
				switch podFailurePolicyRule.Action {
				case batch.PodFailurePolicyActionIgnore:
					return nil, false
				case batch.PodFailurePolicyActionCount:
					return nil, true
				case batch.PodFailurePolicyActionFailJob:
					msg := fmt.Sprintf("Pod %s/%s has condition %v matching %v rule at index %d",
						failedPod.Namespace, failedPod.Name, podCondition.Type, podFailurePolicyRule.Action, index)
					return &msg, true
				}
			}
		}
	}
	return nil, true
}

func matchOnExitCodes(podStatus *v1.PodStatus, requirement *batch.PodFailurePolicyOnExitCodesRequirement) *v1.ContainerStatus {
	if containerStatus := getMatchingContainerFromList(podStatus.ContainerStatuses, requirement); containerStatus != nil {
		return containerStatus
	}
	return getMatchingContainerFromList(podStatus.InitContainerStatuses, requirement)
}

func matchOnPodConditions(podStatus *v1.PodStatus, requirement []batch.PodFailurePolicyOnPodConditionsPattern) *v1.PodCondition {
	for _, podCondition := range podStatus.Conditions {
		for _, pattern := range requirement {
			if podCondition.Type == pattern.Type && podCondition.Status == pattern.Status {
				return &podCondition
			}
		}
	}
	return nil
}

func getMatchingContainerFromList(containerStatuses []v1.ContainerStatus, requirement *batch.PodFailurePolicyOnExitCodesRequirement) *v1.ContainerStatus {
	for _, containerStatus := range containerStatuses {
		if requirement.ContainerName == nil || *requirement.ContainerName == containerStatus.Name {
			if containerStatus.State.Terminated.ExitCode != 0 {
				if isOnExitCodesOperatorMatching(containerStatus.State.Terminated.ExitCode, requirement) {
					return &containerStatus
				}
			}
		}
	}
	return nil
}

func isOnExitCodesOperatorMatching(exitCode int32, requirement *batch.PodFailurePolicyOnExitCodesRequirement) bool {
	switch requirement.Operator {
	case batch.PodFailurePolicyOnExitCodesOpIn:
		for _, value := range requirement.Values {
			if value == exitCode {
				return true
			}
		}
		return false
	case batch.PodFailurePolicyOnExitCodesOpNotIn:
		for _, value := range requirement.Values {
			if value == exitCode {
				return false
			}
		}
		return true
	default:
		return false
	}
}
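To make the return contract of matchPodFailurePolicy concrete, here is a minimal usage sketch; the pod literal is illustrative, policy is assumed to be the Job's spec.podFailurePolicy, and the imports mirror those of the test file below:

	failedPod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "mypod"},
		Status: v1.PodStatus{
			Phase: v1.PodFailed,
			ContainerStatuses: []v1.ContainerStatus{{
				Name:  "main-container",
				State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ExitCode: 5}},
			}},
		},
	}
	// With the onExitCodeRules fixture from the controller tests, exit code 5
	// matches the FailJob rule at index 1.
	jobFailureMessage, countFailed := matchPodFailurePolicy(policy, failedPod)
	// jobFailureMessage != nil: fail the whole Job, using *jobFailureMessage as
	// the condition message. countFailed: the failure still counts towards
	// backoffLimit (false only when an Ignore rule matched).

Note that getMatchingContainerFromList only considers containers with a non-zero exit code, so containers that exited successfully never match an OnExitCodes rule.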
pkg/controller/job/pod_failure_policy_test.go (new file, 707 lines)
@@ -0,0 +1,707 @@
|
||||
/*
|
||||
Copyright 2015 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package job
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
batch "k8s.io/api/batch/v1"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
_ "k8s.io/kubernetes/pkg/apis/core/install"
|
||||
"k8s.io/utils/pointer"
|
||||
)
|
||||
|
||||
func TestMatchPodFailurePolicy(t *testing.T) {
|
||||
validPodObjectMeta := metav1.ObjectMeta{
|
||||
Namespace: "default",
|
||||
Name: "mypod",
|
||||
}
|
||||
|
||||
testCases := map[string]struct {
|
||||
podFailurePolicy *batch.PodFailurePolicy
|
||||
failedPod *v1.Pod
|
||||
wantJobFailureMessage *string
|
||||
wantCountFailed bool
|
||||
}{
|
||||
"unknown action for rule matching by exit codes - skip rule with unknown action": {
|
||||
podFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: "UnknownAction",
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||
Values: []int32{1, 2},
|
||||
},
|
||||
},
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionFailJob,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||
Values: []int32{2, 3},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
failedPod: &v1.Pod{
|
||||
ObjectMeta: validPodObjectMeta,
|
||||
Status: v1.PodStatus{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 1"),
|
||||
wantCountFailed: true,
|
||||
},
|
||||
"unknown action for rule matching by pod conditions - skip rule with unknown action": {
|
||||
podFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: "UnkonwnAction",
|
||||
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
|
||||
{
|
||||
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
|
||||
Status: v1.ConditionTrue,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionIgnore,
|
||||
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
|
||||
{
|
||||
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
|
||||
Status: v1.ConditionTrue,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
failedPod: &v1.Pod{
|
||||
ObjectMeta: validPodObjectMeta,
|
||||
Status: v1.PodStatus{
|
||||
Phase: v1.PodFailed,
|
||||
Conditions: []v1.PodCondition{
|
||||
{
|
||||
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
|
||||
Status: v1.ConditionTrue,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantJobFailureMessage: nil,
|
||||
wantCountFailed: false,
|
||||
},
|
||||
"unknown operator - rule with unknown action is skipped for onExitCodes": {
|
||||
podFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionIgnore,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: "UnknownOperator",
|
||||
Values: []int32{1, 2},
|
||||
},
|
||||
},
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionFailJob,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||
Values: []int32{2, 3},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
failedPod: &v1.Pod{
|
||||
ObjectMeta: validPodObjectMeta,
|
||||
Status: v1.PodStatus{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 1"),
|
||||
wantCountFailed: true,
|
||||
},
|
||||
"no policy rules": {
|
||||
podFailurePolicy: nil,
|
||||
failedPod: &v1.Pod{
|
||||
ObjectMeta: validPodObjectMeta,
|
||||
Status: v1.PodStatus{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantJobFailureMessage: nil,
|
||||
wantCountFailed: true,
|
||||
},
|
||||
"ignore rule matched for exit codes": {
|
||||
podFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionIgnore,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||
Values: []int32{1, 2, 3},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
failedPod: &v1.Pod{
|
||||
ObjectMeta: validPodObjectMeta,
|
||||
Status: v1.PodStatus{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantJobFailureMessage: nil,
|
||||
wantCountFailed: false,
|
||||
},
|
||||
"FailJob rule matched for exit codes": {
|
||||
podFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionFailJob,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||
Values: []int32{1, 2, 3},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
failedPod: &v1.Pod{
|
||||
ObjectMeta: validPodObjectMeta,
|
||||
Status: v1.PodStatus{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 0"),
|
||||
wantCountFailed: true,
|
||||
},
|
||||
"successful containers are skipped by the rules": {
|
||||
podFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionFailJob,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
|
||||
Values: []int32{111},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
failedPod: &v1.Pod{
|
||||
ObjectMeta: validPodObjectMeta,
|
||||
Status: v1.PodStatus{
|
||||
Phase: v1.PodFailed,
|
||||
InitContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "init-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 0,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 111,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "suppport-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 0,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantJobFailureMessage: nil,
|
||||
wantCountFailed: true,
|
||||
},
|
||||
"pod failure policy with NotIn operator and value 0": {
|
||||
podFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionFailJob,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
|
||||
Values: []int32{0},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
failedPod: &v1.Pod{
|
||||
ObjectMeta: validPodObjectMeta,
|
||||
Status: v1.PodStatus{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "suppport-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 0,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 1 matching FailJob rule at index 0"),
|
||||
wantCountFailed: true,
|
||||
},
|
||||
"second jobfail rule matched for exit codes": {
|
||||
podFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionCount,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||
Values: []int32{1, 2, 3},
|
||||
},
|
||||
},
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionFailJob,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||
Values: []int32{4, 5, 6},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
failedPod: &v1.Pod{
|
||||
ObjectMeta: validPodObjectMeta,
|
||||
Status: v1.PodStatus{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 6,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 6 matching FailJob rule at index 1"),
|
||||
wantCountFailed: true,
|
||||
},
|
||||
"count rule matched for exit codes": {
|
||||
podFailurePolicy: &batch.PodFailurePolicy{
|
||||
Rules: []batch.PodFailurePolicyRule{
|
||||
{
|
||||
Action: batch.PodFailurePolicyActionCount,
|
||||
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||
Values: []int32{1, 2, 3},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
failedPod: &v1.Pod{
|
||||
ObjectMeta: validPodObjectMeta,
|
||||
Status: v1.PodStatus{
|
||||
Phase: v1.PodFailed,
|
||||
ContainerStatuses: []v1.ContainerStatus{
|
||||
{
|
||||
Name: "main-container",
|
||||
State: v1.ContainerState{
|
||||
Terminated: &v1.ContainerStateTerminated{
|
||||
ExitCode: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
wantJobFailureMessage: nil,
|
||||
wantCountFailed: true,
|
||||
},
		"ignore rule matched for pod conditions": {
			podFailurePolicy: &batch.PodFailurePolicy{
				Rules: []batch.PodFailurePolicyRule{
					{
						Action: batch.PodFailurePolicyActionIgnore,
						OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
							{
								Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
								Status: v1.ConditionTrue,
							},
						},
					},
				},
			},
			failedPod: &v1.Pod{
				ObjectMeta: validPodObjectMeta,
				Status: v1.PodStatus{
					Phase: v1.PodFailed,
					Conditions: []v1.PodCondition{
						{
							Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
							Status: v1.ConditionTrue,
						},
					},
				},
			},
			wantJobFailureMessage: nil,
			wantCountFailed:       false,
		},
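		// Condition patterns compare the status verbatim, so Ignore rules also apply to
		// conditions reported with status False or Unknown when the pattern asks for
		// that exact status.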
		"ignore rule matches by the status=False": {
			podFailurePolicy: &batch.PodFailurePolicy{
				Rules: []batch.PodFailurePolicyRule{
					{
						Action: batch.PodFailurePolicyActionIgnore,
						OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
							{
								Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
								Status: v1.ConditionFalse,
							},
						},
					},
				},
			},
			failedPod: &v1.Pod{
				ObjectMeta: validPodObjectMeta,
				Status: v1.PodStatus{
					Phase: v1.PodFailed,
					Conditions: []v1.PodCondition{
						{
							Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
							Status: v1.ConditionFalse,
						},
					},
				},
			},
			wantJobFailureMessage: nil,
			wantCountFailed:       false,
		},
		"ignore rule matches by the status=Unknown": {
			podFailurePolicy: &batch.PodFailurePolicy{
				Rules: []batch.PodFailurePolicyRule{
					{
						Action: batch.PodFailurePolicyActionIgnore,
						OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
							{
								Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
								Status: v1.ConditionUnknown,
							},
						},
					},
				},
			},
			failedPod: &v1.Pod{
				ObjectMeta: validPodObjectMeta,
				Status: v1.PodStatus{
					Phase: v1.PodFailed,
					Conditions: []v1.PodCondition{
						{
							Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
							Status: v1.ConditionUnknown,
						},
					},
				},
			},
			wantJobFailureMessage: nil,
			wantCountFailed:       false,
		},
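		// A status mismatch between the pattern and the actual condition means the rule
		// does not apply, so the failure falls back to being counted.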
		"ignore rule does not match when status for pattern is False, but actual True": {
			podFailurePolicy: &batch.PodFailurePolicy{
				Rules: []batch.PodFailurePolicyRule{
					{
						Action: batch.PodFailurePolicyActionIgnore,
						OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
							{
								Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
								Status: v1.ConditionFalse,
							},
						},
					},
				},
			},
			failedPod: &v1.Pod{
				ObjectMeta: validPodObjectMeta,
				Status: v1.PodStatus{
					Phase: v1.PodFailed,
					Conditions: []v1.PodCondition{
						{
							Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
							Status: v1.ConditionTrue,
						},
					},
				},
			},
			wantJobFailureMessage: nil,
			wantCountFailed:       true,
		},
		"ignore rule does not match when status for pattern is True, but actual False": {
			podFailurePolicy: &batch.PodFailurePolicy{
				Rules: []batch.PodFailurePolicyRule{
					{
						Action: batch.PodFailurePolicyActionIgnore,
						OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
							{
								Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
								Status: v1.ConditionTrue,
							},
						},
					},
				},
			},
			failedPod: &v1.Pod{
				ObjectMeta: validPodObjectMeta,
				Status: v1.PodStatus{
					Phase: v1.PodFailed,
					Conditions: []v1.PodCondition{
						{
							Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
							Status: v1.ConditionFalse,
						},
					},
				},
			},
			wantJobFailureMessage: nil,
			wantCountFailed:       true,
		},
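		// Sanity check: a condition reported with status=False is never treated as
		// present for matching purposes.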
		"default - do not match condition with status=False": {
			podFailurePolicy: &batch.PodFailurePolicy{
				Rules: []batch.PodFailurePolicyRule{
					{
						Action: batch.PodFailurePolicyActionIgnore,
						OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
							{
								Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
								Status: v1.ConditionTrue,
							},
						},
					},
				},
			},
			failedPod: &v1.Pod{
				ObjectMeta: validPodObjectMeta,
				Status: v1.PodStatus{
					Phase: v1.PodFailed,
					Conditions: []v1.PodCondition{
						{
							Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
							Status: v1.ConditionFalse,
						},
					},
				},
			},
			wantJobFailureMessage: nil,
			wantCountFailed:       true,
		},
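		// FailJob on a matching pod condition produces a failure message naming the
		// condition type and the index of the matched rule.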
		"job fail rule matched for pod conditions": {
			podFailurePolicy: &batch.PodFailurePolicy{
				Rules: []batch.PodFailurePolicyRule{
					{
						Action: batch.PodFailurePolicyActionFailJob,
						OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
							{
								Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
								Status: v1.ConditionTrue,
							},
						},
					},
				},
			},
			failedPod: &v1.Pod{
				ObjectMeta: validPodObjectMeta,
				Status: v1.PodStatus{
					Phase: v1.PodFailed,
					Conditions: []v1.PodCondition{
						{
							Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
							Status: v1.ConditionTrue,
						},
					},
				},
			},
			wantJobFailureMessage: pointer.String("Pod default/mypod has condition DisruptionTarget matching FailJob rule at index 0"),
			wantCountFailed:       true,
		},
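		// Count on a matching pod condition increments the failed counter without
		// failing the job.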
		"count rule matched for pod conditions": {
			podFailurePolicy: &batch.PodFailurePolicy{
				Rules: []batch.PodFailurePolicyRule{
					{
						Action: batch.PodFailurePolicyActionCount,
						OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
							{
								Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
								Status: v1.ConditionTrue,
							},
						},
					},
				},
			},
			failedPod: &v1.Pod{
				ObjectMeta: validPodObjectMeta,
				Status: v1.PodStatus{
					Phase: v1.PodFailed,
					Conditions: []v1.PodCondition{
						{
							Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
							Status: v1.ConditionTrue,
						},
					},
				},
			},
			wantJobFailureMessage: nil,
			wantCountFailed:       true,
		},
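		// For exit code 32 the expected outcome is the default one: the failure is
		// counted, but no rule fails the job or ignores the pod.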
		"no rule matched": {
			podFailurePolicy: &batch.PodFailurePolicy{
				Rules: []batch.PodFailurePolicyRule{
					{
						Action: batch.PodFailurePolicyActionCount,
						OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
							Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
							Values:   []int32{8},
						},
					},
					{
						Action: batch.PodFailurePolicyActionIgnore,
						OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
							Operator: batch.PodFailurePolicyOnExitCodesOpIn,
							Values:   []int32{1, 2, 3},
						},
					},
					{
						Action: batch.PodFailurePolicyActionFailJob,
						OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
							Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
							Values:   []int32{5, 6, 7},
						},
					},
					{
						Action: batch.PodFailurePolicyActionCount,
						OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
							{
								Type:   v1.PodConditionType("ResourceLimitExceeded"),
								Status: v1.ConditionTrue,
							},
						},
					},
					{
						Action: batch.PodFailurePolicyActionIgnore,
						OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
							{
								Type:   v1.AlphaNoCompatGuaranteeDisruptionTarget,
								Status: v1.ConditionTrue,
							},
						},
					},
				},
			},
			failedPod: &v1.Pod{
				ObjectMeta: validPodObjectMeta,
				Status: v1.PodStatus{
					Phase: v1.PodFailed,
					ContainerStatuses: []v1.ContainerStatus{
						{
							State: v1.ContainerState{
								Terminated: &v1.ContainerStateTerminated{
									ExitCode: 32,
								},
							},
						},
					},
				},
			},
			wantJobFailureMessage: nil,
			wantCountFailed:       true,
		},
	}
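	// Each case exercises matchPodFailurePolicy directly and compares the returned
	// job failure message and count decision against the expectations.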
	for name, tc := range testCases {
		t.Run(name, func(t *testing.T) {
			jobFailMessage, countFailed := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod)
			if tc.wantJobFailureMessage == nil {
				if jobFailMessage != nil {
					t.Errorf("Unexpected job fail message. Got: %q", *jobFailMessage)
				}
			} else {
				if jobFailMessage == nil {
					t.Errorf("Missing job fail message. want: %q", *tc.wantJobFailureMessage)
				} else if *tc.wantJobFailureMessage != *jobFailMessage {
					t.Errorf("Unexpected job fail message. want: %q. got: %q", *tc.wantJobFailureMessage, *jobFailMessage)
				}
			}
			if tc.wantCountFailed != countFailed {
				t.Errorf("Unexpected count failed. want: %v. got: %v", tc.wantCountFailed, countFailed)
			}
		})
	}
}