Support handling of pod failures with respect to the specified rules

Michal Wozniak
2022-08-04 08:21:32 +02:00
parent c8edeab234
commit bf9ce70de3
43 changed files with 5934 additions and 127 deletions


@@ -61,6 +61,12 @@ import (
// a Job. It is used if the feature gate JobReadyPods is enabled.
const podUpdateBatchPeriod = time.Second
const (
// PodFailurePolicy reason indicates that a job failure condition was added due to
// a failed pod matching a pod failure policy rule
jobConditionReasonPodFailurePolicy = "PodFailurePolicy"
)
// controllerKind contains the schema.GroupVersionKind for this controller type.
var controllerKind = batch.SchemeGroupVersion.WithKind("Job")
@@ -758,16 +764,31 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (forget bool, rEr
exceedsBackoffLimit := jobHasNewFailure && (active != *job.Spec.Parallelism) &&
(failed > *job.Spec.BackoffLimit)
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.AlphaNoCompatGuaranteeJobFailureTarget); failureTargetCondition != nil {
finishedCondition = newFailedConditionForFailureTarget(failureTargetCondition)
} else if failJobMessage := getFailJobMessage(&job, pods, uncounted.Failed()); failJobMessage != nil {
if uncounted != nil {
// Prepare the interim FailureTarget condition to record the failure message before the finalizers (which block removal of the pods) are removed.
finishedCondition = newCondition(batch.AlphaNoCompatGuaranteeJobFailureTarget, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage)
} else {
// Prepare the Failed job condition for the legacy path without finalizers (don't use the interim FailureTarget condition).
finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage)
}
}
}
if finishedCondition == nil {
if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
// check if the number of pod restarts exceeds the backoff limit (for restartPolicy == OnFailure only)
// OR if the number of failed pods increased since the last syncJob
finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "BackoffLimitExceeded", "Job has reached the specified backoff limit")
} else if pastActiveDeadline(&job) {
finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "DeadlineExceeded", "Job was active longer than specified deadline")
} else if job.Spec.ActiveDeadlineSeconds != nil && !jobSuspended(&job) {
syncDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds)*time.Second - time.Since(job.Status.StartTime.Time)
klog.V(2).InfoS("Job has activeDeadlineSeconds configuration. Will sync this job again", "job", key, "nextSyncIn", syncDuration)
jm.queue.AddAfter(key, syncDuration)
}
}
var prevSucceededIndexes, succeededIndexes orderedIntervals
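In outline, when the JobPodFailurePolicy feature gate is enabled and finalizer tracking is used, failing a job is a two-phase process: an interim FailureTarget condition first records the failure message, and only after the pod finalizers are removed is it promoted to the terminal Failed condition. A minimal sketch of that flow using the helpers from this commit (the message literal is hypothetical):

	// Stand-in for the message returned by getFailJobMessage on a FailJob rule match.
	failJobMessage := "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1"
	// Phase 1: record the interim FailureTarget condition carrying the message.
	interim := newCondition(batch.AlphaNoCompatGuaranteeJobFailureTarget, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, failJobMessage)
	// Phase 2: once the finalizers are removed, promote it to the final Failed condition.
	finished := newFailedConditionForFailureTarget(interim)
	_ = finished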
@@ -1039,8 +1060,16 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
} else if pod.Status.Phase == v1.PodFailed || podTerminating {
ix := getCompletionIndex(pod.Annotations)
if !uncounted.failed.Has(string(pod.UID)) && (!isIndexed || (ix != unknownCompletionIndex && ix < int(*job.Spec.Completions))) {
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil {
_, countFailed := matchPodFailurePolicy(job.Spec.PodFailurePolicy, pod)
if countFailed {
needsFlush = true
uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID)
}
} else {
needsFlush = true
uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID)
}
}
}
if len(newSucceededIndexes)+len(uncountedStatus.Succeeded)+len(uncountedStatus.Failed) >= MaxUncountedPods {
@@ -1060,6 +1089,18 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
job.Status.Succeeded = int32(succeededIndexes.total())
job.Status.CompletedIndexes = succeededIndexes.String()
}
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
if finishedCond != nil && finishedCond.Type == batch.AlphaNoCompatGuaranteeJobFailureTarget {
// Append the interim FailureTarget condition to the job status before the finalizers are removed.
job.Status.Conditions = append(job.Status.Conditions, *finishedCond)
needsFlush = true
// Prepare the final Failed condition to apply to the job status once the finalizers are removed.
// It is also used by the enactJobFinished function for reporting.
finishedCond = newFailedConditionForFailureTarget(finishedCond)
}
}
var err error
if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, needsFlush); err != nil {
return err
@@ -1077,7 +1118,8 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
}
// flushUncountedAndRemoveFinalizers does:
// 1. flush the Job status that might include new uncounted Pod UIDs. Also flush the interim FailureTarget condition
// if present.
// 2. perform the removal of finalizers from Pods which are in the uncounted
// lists.
// 3. update the counters based on the Pods for which it successfully removed
@@ -1231,6 +1273,12 @@ func filterInUncountedUIDs(uncounted []types.UID, include sets.String) []types.U
return newUncounted
}
// newFailedConditionForFailureTarget creates a job Failed condition based on
// the interim FailureTarget condition.
func newFailedConditionForFailureTarget(condition *batch.JobCondition) *batch.JobCondition {
return newCondition(batch.JobFailed, v1.ConditionTrue, condition.Reason, condition.Message)
}
// pastBackoffLimitOnFailure checks if the sum of container restart counts exceeds the BackoffLimit.
// This method applies only to pods with restartPolicy == OnFailure.
func pastBackoffLimitOnFailure(job *batch.Job, pods []*v1.Pod) bool {
@@ -1282,7 +1330,24 @@ func newCondition(conditionType batch.JobConditionType, status v1.ConditionStatu
}
}
// getFailJobMessage returns a job failure message if any failed pod matches a FailJob rule of the pod failure policy; otherwise it returns nil.
func getFailJobMessage(job *batch.Job, pods []*v1.Pod, uncounted sets.String) *string {
if !feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) || job.Spec.PodFailurePolicy == nil {
return nil
}
for _, p := range pods {
if isPodFailed(p, uncounted != nil) {
jobFailureMessage, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
if jobFailureMessage != nil {
return jobFailureMessage
}
}
}
return nil
}
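// Illustrative call site for getFailJobMessage (a hedged sketch, not part of this
// commit): a non-nil result means a failed pod matched a FailJob rule, so the job
// should be failed with that message, e.g.:
//
//	if failJobMessage := getFailJobMessage(&job, pods, uncounted.Failed()); failJobMessage != nil {
//		finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage)
//	}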
// getStatus returns the number of succeeded and failed pods of a job. The number
// of failed pods can be affected by the podFailurePolicy.
func getStatus(job *batch.Job, pods []*v1.Pod, uncounted *uncountedTerminatedPods, expectedRmFinalizers sets.String) (succeeded, failed int32) {
if uncounted != nil {
succeeded = job.Status.Succeeded
@@ -1292,13 +1357,15 @@ func getStatus(job *batch.Job, pods []*v1.Pod, uncounted *uncountedTerminatedPod
return p.Status.Phase == v1.PodSucceeded
}))
failed += int32(countValidPodsWithFilter(job, pods, uncounted.Failed(), expectedRmFinalizers, func(p *v1.Pod) bool {
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil {
if !isPodFailed(p, uncounted != nil) {
return false
}
_, countFailed := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
return countFailed
} else {
return isPodFailed(p, uncounted != nil)
}
}))
return succeeded, failed
}
@@ -1667,6 +1734,16 @@ func ensureJobConditionStatus(list []batch.JobCondition, cType batch.JobConditio
return list, false
}
func isPodFailed(p *v1.Pod, wFinalizers bool) bool {
if p.Status.Phase == v1.PodFailed {
return true
}
// When tracking with finalizers: counting deleted Pods as failures to
// account for orphan Pods that never have a chance to reach the Failed
// phase.
return wFinalizers && p.DeletionTimestamp != nil && p.Status.Phase != v1.PodSucceeded
}
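// isPodFailedExample is an illustrative sketch added for this writeup (not part
// of the commit; it assumes the package's existing v1 and metav1 imports). With
// finalizer tracking enabled, a deleted pod that never reached the Failed phase
// still counts as failed.
func isPodFailedExample() bool {
	deletedPod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &metav1.Time{}},
		Status:     v1.PodStatus{Phase: v1.PodRunning},
	}
	return isPodFailed(deletedPod, true) // true: counted as a failure
}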
func findConditionByType(list []batch.JobCondition, cType batch.JobConditionType) *batch.JobCondition {
for i := range list {
if list[i].Type == cType {


@@ -2019,6 +2019,962 @@ func TestSyncJobDeleted(t *testing.T) {
}
}
func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
indexedCompletionMode := batch.IndexedCompletion
validObjectMeta := metav1.ObjectMeta{
Name: "foobar",
UID: uuid.NewUUID(),
Namespace: metav1.NamespaceDefault,
}
validSelector := &metav1.LabelSelector{
MatchLabels: map[string]string{"foo": "bar"},
}
validTemplate := v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"foo": "bar",
},
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{Image: "foo/bar"},
},
},
}
onExitCodeRules := []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{5, 6, 7},
},
},
}
testCases := map[string]struct {
enableJobPodFailurePolicy bool
job batch.Job
pods []v1.PodStatus
wantConditions *[]batch.JobCondition
wantStatusFailed int32
wantStatusActive int32
wantStatusSucceeded int32
}{
"default handling for pod failure if the container matching the exit codes does not match the containerName restriction": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
ContainerName: pointer.String("main-container"),
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
ContainerName: pointer.String("main-container"),
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{5, 6, 7},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "monitoring-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 42,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusSucceeded: 0,
wantStatusFailed: 1,
},
"running pod should not result in job fail based on OnExitCodes": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodRunning,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 0,
wantStatusSucceeded: 0,
},
"fail job based on OnExitCodes": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"job marked already as failure target with failed pod": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
Status: batch.JobStatus{
Conditions: []batch.JobCondition{
{
Type: batch.AlphaNoCompatGuaranteeJobFailureTarget,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"job marked already as failure target with failed pod, message based on already deleted pod": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
Status: batch.JobStatus{
Conditions: []batch.JobCondition{
{
Type: batch.AlphaNoCompatGuaranteeJobFailureTarget,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1",
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"default handling for a failed pod when the feature is disabled even, despite matching rule": {
enableJobPodFailurePolicy: false,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"fail job with multiple pods": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(2),
Completions: pointer.Int32(2),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodRunning,
},
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/mypod-1 failed with exit code 5 matching FailJob rule at index 1",
},
},
wantStatusActive: 0,
wantStatusFailed: 2,
wantStatusSucceeded: 0,
},
"fail indexed job based on OnExitCodes": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
CompletionMode: &indexedCompletionMode,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"fail job based on OnExitCodes with NotIn operator": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
Values: []int32{5, 6, 7},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 42,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/mypod-0 failed with exit code 42 matching FailJob rule at index 0",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"default handling job based on OnExitCodes with NotIn operator": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
Values: []int32{5, 6, 7},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"fail job based on OnExitCodes for InitContainer": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
InitContainerStatuses: []v1.ContainerStatus{
{
Name: "init-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 143,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container init-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"ignore pod failure; both rules are matching, the first is executed only": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(0),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "container1",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
{
Name: "container2",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 6,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 0,
wantStatusSucceeded: 0,
},
"ignore pod failure based on OnExitCodes": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(0),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 1,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 0,
wantStatusSucceeded: 0,
},
"default job based on OnExitCodes": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(0),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 10,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "BackoffLimitExceeded",
Message: "Job has reached the specified backoff limit",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"count pod failure based on OnExitCodes; both rules are matching, the first is executed only": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionCount,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2},
},
},
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{2, 3},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"count pod failure based on OnPodConditions; both rules are matching, the first is executed only": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionCount,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.PodConditionType("ResourceLimitExceeded"),
Status: v1.ConditionTrue,
},
},
},
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.PodConditionType("ResourceLimitExceeded"),
Status: v1.ConditionTrue,
},
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"ignore pod failure based on OnPodConditions": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(0),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 0,
wantStatusSucceeded: 0,
},
"fail job based on OnPodConditions": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Pod default/mypod-0 has condition DisruptionTarget matching FailJob rule at index 0",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
}
for _, wFinalizers := range []bool{false, true} {
for name, tc := range testCases {
t.Run(fmt.Sprintf("%s; finalizers=%t", name, wFinalizers), func(t *testing.T) {
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, wFinalizers)()
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)()
clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
manager, sharedInformerFactory := newControllerFromClient(clientset, controller.NoResyncPeriodFunc)
fakePodControl := controller.FakePodControl{}
manager.podControl = &fakePodControl
manager.podStoreSynced = alwaysReady
manager.jobStoreSynced = alwaysReady
job := &tc.job
if wFinalizers {
job.Annotations = map[string]string{
batch.JobTrackingFinalizer: "",
}
}
actual := job
manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) {
actual = job
return job, nil
}
sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
for i, podStatus := range tc.pods {
pb := buildPod().name(fmt.Sprintf("mypod-%d", i)).job(job).status(podStatus)
if job.Spec.CompletionMode != nil && *job.Spec.CompletionMode == batch.IndexedCompletion {
pb.index(fmt.Sprintf("%v", i))
}
if wFinalizers {
pb.trackingFinalizer()
}
sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer().Add(pb.Pod)
}
manager.syncJob(context.TODO(), testutil.GetKey(job, t))
if tc.wantConditions != nil {
for _, wantCondition := range *tc.wantConditions {
conditions := getConditionsByType(actual.Status.Conditions, wantCondition.Type)
if len(conditions) != 1 {
t.Fatalf("Expected a single completion condition. Got %#v for type: %q", conditions, wantCondition.Type)
}
condition := *conditions[0]
if diff := cmp.Diff(wantCondition, condition, cmpopts.IgnoreFields(batch.JobCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" {
t.Errorf("Unexpected job condition (-want,+got):\n%s", diff)
}
}
} else {
if cond := hasTrueCondition(actual); cond != nil {
t.Errorf("Got condition %s, want none", *cond)
}
}
// validate status
if actual.Status.Active != tc.wantStatusActive {
t.Errorf("unexpected number of active pods. Expected %d, saw %d\n", tc.wantStatusActive, actual.Status.Active)
}
if actual.Status.Succeeded != tc.wantStatusSucceeded {
t.Errorf("unexpected number of succeeded pods. Expected %d, saw %d\n", tc.wantStatusSucceeded, actual.Status.Succeeded)
}
if actual.Status.Failed != tc.wantStatusFailed {
t.Errorf("unexpected number of failed pods. Expected %d, saw %d\n", tc.wantStatusFailed, actual.Status.Failed)
}
})
}
}
}
func TestSyncJobUpdateRequeue(t *testing.T) {
clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
DefaultJobBackOff = time.Duration(0) // overwrite the default value for testing
@@ -3449,6 +4405,11 @@ func (pb podBuilder) index(ix string) podBuilder {
return pb
}
func (pb podBuilder) status(s v1.PodStatus) podBuilder {
pb.Status = s
return pb
}
func (pb podBuilder) phase(p v1.PodPhase) podBuilder {
pb.Status.Phase = p
return pb


@@ -0,0 +1,117 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"fmt"
batch "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
)
// matchPodFailurePolicy returns information about matching a given failed pod
// against the pod failure policy rules. The information is represented as an
// optional job failure message (present in case the pod matched a 'FailJob'
// rule) and a boolean indicating if the failure should be counted towards
// backoffLimit (it should not be counted if the pod matched an 'Ignore' rule).
func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *v1.Pod) (*string, bool) {
if podFailurePolicy == nil {
return nil, true
}
for index, podFailurePolicyRule := range podFailurePolicy.Rules {
if podFailurePolicyRule.OnExitCodes != nil {
if containerStatus := matchOnExitCodes(&failedPod.Status, podFailurePolicyRule.OnExitCodes); containerStatus != nil {
switch podFailurePolicyRule.Action {
case batch.PodFailurePolicyActionIgnore:
return nil, false
case batch.PodFailurePolicyActionCount:
return nil, true
case batch.PodFailurePolicyActionFailJob:
msg := fmt.Sprintf("Container %s for pod %s/%s failed with exit code %v matching %v rule at index %d",
containerStatus.Name, failedPod.Namespace, failedPod.Name, containerStatus.State.Terminated.ExitCode, podFailurePolicyRule.Action, index)
return &msg, true
}
}
} else if podFailurePolicyRule.OnPodConditions != nil {
if podCondition := matchOnPodConditions(&failedPod.Status, podFailurePolicyRule.OnPodConditions); podCondition != nil {
switch podFailurePolicyRule.Action {
case batch.PodFailurePolicyActionIgnore:
return nil, false
case batch.PodFailurePolicyActionCount:
return nil, true
case batch.PodFailurePolicyActionFailJob:
msg := fmt.Sprintf("Pod %s/%s has condition %v matching %v rule at index %d",
failedPod.Namespace, failedPod.Name, podCondition.Type, podFailurePolicyRule.Action, index)
return &msg, true
}
}
}
}
return nil, true
}
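// exampleMatchPodFailurePolicy is an illustrative sketch added for this writeup
// (not part of the commit). A failed pod whose container exited with code 5
// matches the hypothetical FailJob rule below, so a failure message is returned
// and the failure still counts towards backoffLimit.
func exampleMatchPodFailurePolicy() {
	policy := &batch.PodFailurePolicy{
		Rules: []batch.PodFailurePolicyRule{{
			Action: batch.PodFailurePolicyActionFailJob,
			OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
				Operator: batch.PodFailurePolicyOnExitCodesOpIn,
				Values:   []int32{5},
			},
		}},
	}
	failedPod := &v1.Pod{Status: v1.PodStatus{
		Phase: v1.PodFailed,
		ContainerStatuses: []v1.ContainerStatus{{
			Name:  "main-container",
			State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ExitCode: 5}},
		}},
	}}
	msg, countFailed := matchPodFailurePolicy(policy, failedPod)
	fmt.Printf("msg=%q countFailed=%v\n", *msg, countFailed) // FailJob message, true
}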
func matchOnExitCodes(podStatus *v1.PodStatus, requirement *batch.PodFailurePolicyOnExitCodesRequirement) *v1.ContainerStatus {
if containerStatus := getMatchingContainerFromList(podStatus.ContainerStatuses, requirement); containerStatus != nil {
return containerStatus
}
return getMatchingContainerFromList(podStatus.InitContainerStatuses, requirement)
}
func matchOnPodConditions(podStatus *v1.PodStatus, requirement []batch.PodFailurePolicyOnPodConditionsPattern) *v1.PodCondition {
for _, podCondition := range podStatus.Conditions {
for _, pattern := range requirement {
if podCondition.Type == pattern.Type && podCondition.Status == pattern.Status {
return &podCondition
}
}
}
return nil
}
func getMatchingContainerFromList(containerStatuses []v1.ContainerStatus, requirement *batch.PodFailurePolicyOnExitCodesRequirement) *v1.ContainerStatus {
	for _, containerStatus := range containerStatuses {
		if requirement.ContainerName == nil || *requirement.ContainerName == containerStatus.Name {
			// Skip containers that have not terminated; they do not have an exit code yet.
			if containerStatus.State.Terminated == nil {
				continue
			}
			if containerStatus.State.Terminated.ExitCode != 0 {
				if isOnExitCodesOperatorMatching(containerStatus.State.Terminated.ExitCode, requirement) {
					return &containerStatus
				}
			}
		}
	}
	return nil
}
func isOnExitCodesOperatorMatching(exitCode int32, requirement *batch.PodFailurePolicyOnExitCodesRequirement) bool {
switch requirement.Operator {
case batch.PodFailurePolicyOnExitCodesOpIn:
for _, value := range requirement.Values {
if value == exitCode {
return true
}
}
return false
case batch.PodFailurePolicyOnExitCodesOpNotIn:
for _, value := range requirement.Values {
if value == exitCode {
return false
}
}
return true
default:
return false
}
}
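A quick sketch of the operator semantics (hedged; the requirement literal is hypothetical): with NotIn over Values {0}, any nonzero exit code matches, which lets a single rule cover "any failing exit code".

	req := &batch.PodFailurePolicyOnExitCodesRequirement{
		Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
		Values:   []int32{0},
	}
	matched := isOnExitCodesOperatorMatching(42, req) // true: 42 is not in {0}
	_ = matched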


@@ -0,0 +1,707 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"testing"
batch "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
_ "k8s.io/kubernetes/pkg/apis/core/install"
"k8s.io/utils/pointer"
)
func TestMatchPodFailurePolicy(t *testing.T) {
validPodObjectMeta := metav1.ObjectMeta{
Namespace: "default",
Name: "mypod",
}
testCases := map[string]struct {
podFailurePolicy *batch.PodFailurePolicy
failedPod *v1.Pod
wantJobFailureMessage *string
wantCountFailed bool
}{
"unknown action for rule matching by exit codes - skip rule with unknown action": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: "UnknownAction",
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{2, 3},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 1"),
wantCountFailed: true,
},
"unknown action for rule matching by pod conditions - skip rule with unknown action": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: "UnknownAction",
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: false,
},
"unknown operator - rule with unknown action is skipped for onExitCodes": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: "UnknownOperator",
Values: []int32{1, 2},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{2, 3},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 1"),
wantCountFailed: true,
},
"no policy rules": {
podFailurePolicy: nil,
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"ignore rule matched for exit codes": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: false,
},
"FailJob rule matched for exit codes": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 0"),
wantCountFailed: true,
},
"successful containers are skipped by the rules": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
Values: []int32{111},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
InitContainerStatuses: []v1.ContainerStatus{
{
Name: "init-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 0,
},
},
},
},
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 111,
},
},
},
{
Name: "support-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 0,
},
},
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"pod failure policy with NotIn operator and value 0": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
Values: []int32{0},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 1,
},
},
},
{
Name: "support-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 0,
},
},
},
},
},
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 1 matching FailJob rule at index 0"),
wantCountFailed: true,
},
"second jobfail rule matched for exit codes": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionCount,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{4, 5, 6},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 6,
},
},
},
},
},
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 6 matching FailJob rule at index 1"),
wantCountFailed: true,
},
"count rule matched for exit codes": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionCount,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"ignore rule matched for pod conditions": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: false,
},
"ignore rule matches by the status=False": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionFalse,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionFalse,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: false,
},
"ignore rule matches by the status=Unknown": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionUnknown,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionUnknown,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: false,
},
"ignore rule does not match when status for pattern is False, but actual True": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionFalse,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"ignore rule does not match when status for pattern is True, but actual False": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionFalse,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"default - do not match condition with status=False": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionFalse,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"job fail rule matched for pod conditions": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantJobFailureMessage: pointer.String("Pod default/mypod has condition DisruptionTarget matching FailJob rule at index 0"),
wantCountFailed: true,
},
"count rule matched for pod conditions": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionCount,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"no rule matched": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionCount,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
Values: []int32{8},
},
},
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
Values: []int32{5, 6, 7},
},
},
{
Action: batch.PodFailurePolicyActionCount,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.PodConditionType("ResourceLimitExceeded"),
Status: v1.ConditionTrue,
},
},
},
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 32,
},
},
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
jobFailMessage, countFailed := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod)
if tc.wantJobFailureMessage == nil {
if jobFailMessage != nil {
t.Errorf("Unexpected job fail message. Got: %q", *jobFailMessage)
}
} else {
if jobFailMessage == nil {
t.Errorf("Missing job fail message. want: %q", *tc.wantJobFailureMessage)
} else if *tc.wantJobFailureMessage != *jobFailMessage {
t.Errorf("Unexpected job fail message. want: %q. got: %q", *tc.wantJobFailureMessage, *jobFailMessage)
}
}
if tc.wantCountFailed != countFailed {
t.Errorf("Unexpected count failed. want: %v. got: %v", tc.wantCountFailed, countFailed)
}
})
}
}