Support handling of pod failures with respect to the specified rules

Michal Wozniak 2022-08-04 08:21:32 +02:00
parent c8edeab234
commit bf9ce70de3
43 changed files with 5934 additions and 127 deletions

View File

@ -3707,6 +3707,10 @@
"format": "int32",
"type": "integer"
},
"podFailurePolicy": {
"$ref": "#/definitions/io.k8s.api.batch.v1.PodFailurePolicy",
"description": "Specifies the policy of handling failed pods. In particular, it allows to specify the set of actions and conditions which need to be satisfied to take the associated action. If empty, the default behaviour applies - the counter of failed pods, represented by the jobs's .status.failed field, is incremented and it is checked against the backoffLimit. This field cannot be used in combination with restartPolicy=OnFailure.\n\nThis field is alpha-level. To use this field, you must enable the `JobPodFailurePolicy` feature gate (disabled by default)."
},
"selector": {
"$ref": "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector",
"description": "A label query over pods that should match the pod count. Normally, the system sets this field for you. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors"
@ -3796,6 +3800,94 @@
},
"type": "object"
},
"io.k8s.api.batch.v1.PodFailurePolicy": {
"description": "PodFailurePolicy describes how failed pods influence the backoffLimit.",
"properties": {
"rules": {
"description": "A list of pod failure policy rules. The rules are evaluated in order. Once a rule matches a Pod failure, the remaining of the rules are ignored. When no rule matches the Pod failure, the default handling applies - the counter of pod failures is incremented and it is checked against the backoffLimit. At most 20 elements are allowed.",
"items": {
"$ref": "#/definitions/io.k8s.api.batch.v1.PodFailurePolicyRule"
},
"type": "array",
"x-kubernetes-list-type": "atomic"
}
},
"required": [
"rules"
],
"type": "object"
},
"io.k8s.api.batch.v1.PodFailurePolicyOnExitCodesRequirement": {
"description": "PodFailurePolicyOnExitCodesRequirement describes the requirement for handling a failed pod based on its container exit codes. In particular, it lookups the .state.terminated.exitCode for each app container and init container status, represented by the .status.containerStatuses and .status.initContainerStatuses fields in the Pod status, respectively. Containers completed with success (exit code 0) are excluded from the requirement check.",
"properties": {
"containerName": {
"description": "Restricts the check for exit codes to the container with the specified name. When null, the rule applies to all containers. When specified, it should match one the container or initContainer names in the pod template.",
"type": "string"
},
"operator": {
"description": "Represents the relationship between the container exit code(s) and the specified values. Containers completed with success (exit code 0) are excluded from the requirement check. Possible values are: - In: the requirement is satisfied if at least one container exit code\n (might be multiple if there are multiple containers not restricted\n by the 'containerName' field) is in the set of specified values.\n- NotIn: the requirement is satisfied if at least one container exit code\n (might be multiple if there are multiple containers not restricted\n by the 'containerName' field) is not in the set of specified values.\nAdditional values are considered to be added in the future. Clients should react to an unknown operator by assuming the requirement is not satisfied.\n\n",
"type": "string"
},
"values": {
"description": "Specifies the set of values. Each returned container exit code (might be multiple in case of multiple containers) is checked against this set of values with respect to the operator. The list of values must be ordered and must not contain duplicates. Value '0' cannot be used for the In operator. At least one element is required. At most 255 elements are allowed.",
"items": {
"format": "int32",
"type": "integer"
},
"type": "array",
"x-kubernetes-list-type": "set"
}
},
"required": [
"operator",
"values"
],
"type": "object"
},
"io.k8s.api.batch.v1.PodFailurePolicyOnPodConditionsPattern": {
"description": "PodFailurePolicyOnPodConditionsPattern describes a pattern for matching an actual pod condition type.",
"properties": {
"status": {
"description": "Specifies the required Pod condition status. To match a pod condition it is required that the specified status equals the pod condition status. Defaults to True.",
"type": "string"
},
"type": {
"description": "Specifies the required Pod condition type. To match a pod condition it is required that specified type equals the pod condition type.",
"type": "string"
}
},
"required": [
"type",
"status"
],
"type": "object"
},
"io.k8s.api.batch.v1.PodFailurePolicyRule": {
"description": "PodFailurePolicyRule describes how a pod failure is handled when the requirements are met. One of OnExitCodes and onPodConditions, but not both, can be used in each rule.",
"properties": {
"action": {
"description": "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are: - FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.\n\n",
"type": "string"
},
"onExitCodes": {
"$ref": "#/definitions/io.k8s.api.batch.v1.PodFailurePolicyOnExitCodesRequirement",
"description": "Represents the requirement on the container exit codes."
},
"onPodConditions": {
"description": "Represents the requirement on the pod conditions. The requirement is represented as a list of pod condition patterns. The requirement is satisfied if at least one pattern matches an actual pod condition. At most 20 elements are allowed.",
"items": {
"$ref": "#/definitions/io.k8s.api.batch.v1.PodFailurePolicyOnPodConditionsPattern"
},
"type": "array",
"x-kubernetes-list-type": "atomic"
}
},
"required": [
"action",
"onPodConditions"
],
"type": "object"
},
"io.k8s.api.batch.v1.UncountedTerminatedPods": {
"description": "UncountedTerminatedPods holds UIDs of Pods that have terminated but haven't been accounted in Job status counters.",
"properties": {

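To illustrate the schema above, here is a minimal, hypothetical Go sketch (not part of this commit) that builds a Job spec using the new podFailurePolicy field through the typed batch/v1 API. It assumes the JobPodFailurePolicy feature gate is enabled and sets restartPolicy=Never, which the field requires.

package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/utils/pointer"
)

func main() {
	job := batchv1.Job{
		Spec: batchv1.JobSpec{
			BackoffLimit: pointer.Int32Ptr(6),
			PodFailurePolicy: &batchv1.PodFailurePolicy{
				Rules: []batchv1.PodFailurePolicyRule{
					{
						// Fail the whole Job if any container exits with code 42.
						Action: batchv1.PodFailurePolicyActionFailJob,
						OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
							Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
							Values:   []int32{42},
						},
					},
					{
						// Do not count pods terminated due to a disruption against backoffLimit.
						Action: batchv1.PodFailurePolicyActionIgnore,
						OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
							{
								Type:   corev1.AlphaNoCompatGuaranteeDisruptionTarget,
								Status: corev1.ConditionTrue,
							},
						},
					},
				},
			},
		},
	}
	// podFailurePolicy is only allowed with restartPolicy=Never in the pod template.
	job.Spec.Template.Spec.RestartPolicy = corev1.RestartPolicyNever
	fmt.Printf("%+v\n", job.Spec.PodFailurePolicy)
}
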
View File

@ -350,6 +350,14 @@
"format": "int32",
"type": "integer"
},
"podFailurePolicy": {
"allOf": [
{
"$ref": "#/components/schemas/io.k8s.api.batch.v1.PodFailurePolicy"
}
],
"description": "Specifies the policy of handling failed pods. In particular, it allows to specify the set of actions and conditions which need to be satisfied to take the associated action. If empty, the default behaviour applies - the counter of failed pods, represented by the jobs's .status.failed field, is incremented and it is checked against the backoffLimit. This field cannot be used in combination with restartPolicy=OnFailure.\n\nThis field is alpha-level. To use this field, you must enable the `JobPodFailurePolicy` feature gate (disabled by default)."
},
"selector": {
"allOf": [
{
@ -475,6 +483,113 @@
},
"type": "object"
},
"io.k8s.api.batch.v1.PodFailurePolicy": {
"description": "PodFailurePolicy describes how failed pods influence the backoffLimit.",
"properties": {
"rules": {
"description": "A list of pod failure policy rules. The rules are evaluated in order. Once a rule matches a Pod failure, the remaining of the rules are ignored. When no rule matches the Pod failure, the default handling applies - the counter of pod failures is incremented and it is checked against the backoffLimit. At most 20 elements are allowed.",
"items": {
"allOf": [
{
"$ref": "#/components/schemas/io.k8s.api.batch.v1.PodFailurePolicyRule"
}
],
"default": {}
},
"type": "array",
"x-kubernetes-list-type": "atomic"
}
},
"required": [
"rules"
],
"type": "object"
},
"io.k8s.api.batch.v1.PodFailurePolicyOnExitCodesRequirement": {
"description": "PodFailurePolicyOnExitCodesRequirement describes the requirement for handling a failed pod based on its container exit codes. In particular, it lookups the .state.terminated.exitCode for each app container and init container status, represented by the .status.containerStatuses and .status.initContainerStatuses fields in the Pod status, respectively. Containers completed with success (exit code 0) are excluded from the requirement check.",
"properties": {
"containerName": {
"description": "Restricts the check for exit codes to the container with the specified name. When null, the rule applies to all containers. When specified, it should match one the container or initContainer names in the pod template.",
"type": "string"
},
"operator": {
"default": "",
"description": "Represents the relationship between the container exit code(s) and the specified values. Containers completed with success (exit code 0) are excluded from the requirement check. Possible values are: - In: the requirement is satisfied if at least one container exit code\n (might be multiple if there are multiple containers not restricted\n by the 'containerName' field) is in the set of specified values.\n- NotIn: the requirement is satisfied if at least one container exit code\n (might be multiple if there are multiple containers not restricted\n by the 'containerName' field) is not in the set of specified values.\nAdditional values are considered to be added in the future. Clients should react to an unknown operator by assuming the requirement is not satisfied.\n\n",
"type": "string"
},
"values": {
"description": "Specifies the set of values. Each returned container exit code (might be multiple in case of multiple containers) is checked against this set of values with respect to the operator. The list of values must be ordered and must not contain duplicates. Value '0' cannot be used for the In operator. At least one element is required. At most 255 elements are allowed.",
"items": {
"default": 0,
"format": "int32",
"type": "integer"
},
"type": "array",
"x-kubernetes-list-type": "set"
}
},
"required": [
"operator",
"values"
],
"type": "object"
},
"io.k8s.api.batch.v1.PodFailurePolicyOnPodConditionsPattern": {
"description": "PodFailurePolicyOnPodConditionsPattern describes a pattern for matching an actual pod condition type.",
"properties": {
"status": {
"default": "",
"description": "Specifies the required Pod condition status. To match a pod condition it is required that the specified status equals the pod condition status. Defaults to True.",
"type": "string"
},
"type": {
"default": "",
"description": "Specifies the required Pod condition type. To match a pod condition it is required that specified type equals the pod condition type.",
"type": "string"
}
},
"required": [
"type",
"status"
],
"type": "object"
},
"io.k8s.api.batch.v1.PodFailurePolicyRule": {
"description": "PodFailurePolicyRule describes how a pod failure is handled when the requirements are met. One of OnExitCodes and onPodConditions, but not both, can be used in each rule.",
"properties": {
"action": {
"default": "",
"description": "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are: - FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.\n\n",
"type": "string"
},
"onExitCodes": {
"allOf": [
{
"$ref": "#/components/schemas/io.k8s.api.batch.v1.PodFailurePolicyOnExitCodesRequirement"
}
],
"description": "Represents the requirement on the container exit codes."
},
"onPodConditions": {
"description": "Represents the requirement on the pod conditions. The requirement is represented as a list of pod condition patterns. The requirement is satisfied if at least one pattern matches an actual pod condition. At most 20 elements are allowed.",
"items": {
"allOf": [
{
"$ref": "#/components/schemas/io.k8s.api.batch.v1.PodFailurePolicyOnPodConditionsPattern"
}
],
"default": {}
},
"type": "array",
"x-kubernetes-list-type": "atomic"
}
},
"required": [
"action",
"onPodConditions"
],
"type": "object"
},
"io.k8s.api.batch.v1.UncountedTerminatedPods": {
"description": "UncountedTerminatedPods holds UIDs of Pods that have terminated but haven't been accounted in Job status counters.",
"properties": {

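The operator description above leaves the evaluation to clients. The following hypothetical sketch (not part of this commit, and not the job controller's implementation) shows one way to apply an onExitCodes requirement to a failed pod's container exit codes: exit code 0 is skipped, In/NotIn are checked against the value set, and an unknown operator is treated as "requirement not satisfied", as the description asks.

package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
)

// exitCodesRequirementSatisfied is an illustrative helper: it checks the
// collected container exit codes of a failed pod against an onExitCodes requirement.
func exitCodesRequirementSatisfied(req batchv1.PodFailurePolicyOnExitCodesRequirement, exitCodes []int32) bool {
	inValues := func(code int32) bool {
		for _, v := range req.Values {
			if v == code {
				return true
			}
		}
		return false
	}
	for _, code := range exitCodes {
		if code == 0 {
			// Containers that completed with success are excluded from the check.
			continue
		}
		switch req.Operator {
		case batchv1.PodFailurePolicyOnExitCodesOpIn:
			if inValues(code) {
				return true
			}
		case batchv1.PodFailurePolicyOnExitCodesOpNotIn:
			if !inValues(code) {
				return true
			}
		default:
			// Unknown operator: assume the requirement is not satisfied.
			return false
		}
	}
	return false
}

func main() {
	req := batchv1.PodFailurePolicyOnExitCodesRequirement{
		Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
		Values:   []int32{42},
	}
	fmt.Println(exitCodesRequirementSatisfied(req, []int32{0, 42})) // true
}
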
View File

@ -110,6 +110,119 @@ const (
IndexedCompletion CompletionMode = "Indexed"
)
// PodFailurePolicyAction specifies how a Pod failure is handled.
// +enum
type PodFailurePolicyAction string
const (
// This is an action which might be taken on a pod failure - mark the
// pod's job as Failed and terminate all running pods.
PodFailurePolicyActionFailJob PodFailurePolicyAction = "FailJob"
// This is an action which might be taken on a pod failure - the counter towards
// .backoffLimit, represented by the job's .status.failed field, is not
// incremented and a replacement pod is created.
PodFailurePolicyActionIgnore PodFailurePolicyAction = "Ignore"
// This is an action which might be taken on a pod failure - the pod failure
// is handled in the default way - the counter towards .backoffLimit,
// represented by the job's .status.failed field, is incremented.
PodFailurePolicyActionCount PodFailurePolicyAction = "Count"
)
// +enum
type PodFailurePolicyOnExitCodesOperator string
const (
PodFailurePolicyOnExitCodesOpIn PodFailurePolicyOnExitCodesOperator = "In"
PodFailurePolicyOnExitCodesOpNotIn PodFailurePolicyOnExitCodesOperator = "NotIn"
)
// PodFailurePolicyOnExitCodesRequirement describes the requirement for handling
// a failed pod based on its container exit codes. In particular, it looks up the
// .state.terminated.exitCode for each app container and init container status,
// represented by the .status.containerStatuses and .status.initContainerStatuses
// fields in the Pod status, respectively. Containers completed with success
// (exit code 0) are excluded from the requirement check.
type PodFailurePolicyOnExitCodesRequirement struct {
// Restricts the check for exit codes to the container with the
// specified name. When null, the rule applies to all containers.
// When specified, it should match one of the container or initContainer
// names in the pod template.
// +optional
ContainerName *string
// Represents the relationship between the container exit code(s) and the
// specified values. Containers completed with success (exit code 0) are
// excluded from the requirement check. Possible values are:
// - In: the requirement is satisfied if at least one container exit code
// (might be multiple if there are multiple containers not restricted
// by the 'containerName' field) is in the set of specified values.
// - NotIn: the requirement is satisfied if at least one container exit code
// (might be multiple if there are multiple containers not restricted
// by the 'containerName' field) is not in the set of specified values.
// Additional values may be added in the future. Clients should
// react to an unknown operator by assuming the requirement is not satisfied.
Operator PodFailurePolicyOnExitCodesOperator
// Specifies the set of values. Each returned container exit code (might be
// multiple in case of multiple containers) is checked against this set of
// values with respect to the operator. The list of values must be ordered
// and must not contain duplicates. Value '0' cannot be used for the In operator.
// At least one element is required. At most 255 elements are allowed.
// +listType=set
Values []int32
}
// PodFailurePolicyOnPodConditionsPattern describes a pattern for matching
// an actual pod condition type.
type PodFailurePolicyOnPodConditionsPattern struct {
// Specifies the required Pod condition type. To match a pod condition
// it is required that the specified type equals the pod condition type.
Type api.PodConditionType
// Specifies the required Pod condition status. To match a pod condition
// it is required that the specified status equals the pod condition status.
// Defaults to True.
Status api.ConditionStatus
}
// PodFailurePolicyRule describes how a pod failure is handled when the requirements are met.
// One of OnExitCodes and OnPodConditions, but not both, can be used in each rule.
type PodFailurePolicyRule struct {
// Specifies the action taken on a pod failure when the requirements are satisfied.
// Possible values are:
// - FailJob: indicates that the pod's job is marked as Failed and all
// running pods are terminated.
// - Ignore: indicates that the counter towards the .backoffLimit is not
// incremented and a replacement pod is created.
// - Count: indicates that the pod is handled in the default way - the
// counter towards the .backoffLimit is incremented.
// Additional values may be added in the future. Clients should
// react to an unknown action by skipping the rule.
Action PodFailurePolicyAction
// Represents the requirement on the container exit codes.
// +optional
OnExitCodes *PodFailurePolicyOnExitCodesRequirement
// Represents the requirement on the pod conditions. The requirement is represented
// as a list of pod condition patterns. The requirement is satisfied if at
// least one pattern matches an actual pod condition. At most 20 elements are allowed.
// +listType=atomic
OnPodConditions []PodFailurePolicyOnPodConditionsPattern
}
// PodFailurePolicy describes how failed pods influence the backoffLimit.
type PodFailurePolicy struct {
// A list of pod failure policy rules. The rules are evaluated in order.
// Once a rule matches a Pod failure, the remaining rules are ignored.
// When no rule matches the Pod failure, the default handling applies - the
// counter of pod failures is incremented and it is checked against
// the backoffLimit. At most 20 elements are allowed.
// +listType=atomic
Rules []PodFailurePolicyRule
}
// JobSpec describes how the job execution will look.
type JobSpec struct {
@ -128,6 +241,19 @@ type JobSpec struct {
// +optional
Completions *int32
// Specifies the policy of handling failed pods. In particular, it allows
// specifying the set of actions and conditions which need to be
// satisfied to take the associated action.
// If empty, the default behaviour applies - the counter of failed pods,
// represented by the job's .status.failed field, is incremented and it is
// checked against the backoffLimit. This field cannot be used in combination
// with .spec.template.spec.restartPolicy=OnFailure.
//
// This field is alpha-level. To use this field, you must enable the
// `JobPodFailurePolicy` feature gate (disabled by default).
// +optional
PodFailurePolicy *PodFailurePolicy
// Specifies the duration in seconds relative to the startTime that the job
// may be continuously active before the system tries to terminate it; value
// must be positive integer. If a Job is suspended (at creation or through an
@ -313,6 +439,9 @@ const (
JobComplete JobConditionType = "Complete"
// JobFailed means the job has failed its execution.
JobFailed JobConditionType = "Failed"
// FailureTarget means the job is about to fail its execution.
// The constant is to be renamed once the name is accepted within KEP-3329.
AlphaNoCompatGuaranteeJobFailureTarget JobConditionType = "FailureTarget"
)
// JobCondition describes current state of a job.

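The comments above define how rules are matched and how clients should treat unknown actions. Below is a hypothetical sketch (using the public batch/v1 mirror of these types, not the job controller's code) of that evaluation order: the first matching rule decides the action, rules with an unrecognized action are skipped, and when nothing matches the failure is simply counted against backoffLimit. The matcher predicate is a stand-in for whatever onExitCodes/onPodConditions check the caller performs.

package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
)

// actionForFailedPod is an illustrative helper: it walks the rules in order
// and returns the action of the first matching rule, skipping rules whose
// action it does not recognize; when nothing matches, the failure is counted.
func actionForFailedPod(policy *batchv1.PodFailurePolicy, matches func(batchv1.PodFailurePolicyRule) bool) batchv1.PodFailurePolicyAction {
	if policy != nil {
		for _, rule := range policy.Rules {
			switch rule.Action {
			case batchv1.PodFailurePolicyActionFailJob,
				batchv1.PodFailurePolicyActionIgnore,
				batchv1.PodFailurePolicyActionCount:
				if matches(rule) {
					return rule.Action
				}
			default:
				// Unknown action: skip the rule, as the API comments require.
			}
		}
	}
	// No rule matched: default handling, count the failure against backoffLimit.
	return batchv1.PodFailurePolicyActionCount
}

func main() {
	policy := &batchv1.PodFailurePolicy{
		Rules: []batchv1.PodFailurePolicyRule{
			{Action: batchv1.PodFailurePolicyActionIgnore},
			{Action: batchv1.PodFailurePolicyActionFailJob},
		},
	}
	// Pretend only the second rule matches this particular failure.
	match := func(r batchv1.PodFailurePolicyRule) bool { return r.Action == batchv1.PodFailurePolicyActionFailJob }
	fmt.Println(actionForFailedPod(policy, match)) // FailJob
}
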
View File

@ -18,6 +18,7 @@ package v1
import (
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
utilpointer "k8s.io/utils/pointer"
)
@ -50,6 +51,17 @@ func SetDefaults_Job(obj *batchv1.Job) {
if obj.Spec.Suspend == nil {
obj.Spec.Suspend = utilpointer.BoolPtr(false)
}
if obj.Spec.PodFailurePolicy != nil {
for _, rule := range obj.Spec.PodFailurePolicy.Rules {
if rule.OnPodConditions != nil {
for i, pattern := range rule.OnPodConditions {
if pattern.Status == "" {
rule.OnPodConditions[i].Status = corev1.ConditionTrue
}
}
}
}
}
}
func SetDefaults_CronJob(obj *batchv1.CronJob) {

View File

@ -22,7 +22,7 @@ import (
"github.com/google/go-cmp/cmp"
batchv1 "k8s.io/api/batch/v1"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/kubernetes/pkg/api/legacyscheme"
@ -40,6 +40,97 @@ func TestSetDefaultJob(t *testing.T) {
expected *batchv1.Job
expectLabels bool
}{
"Pod failure policy with some field values unspecified -> set default values": {
original: &batchv1.Job{
Spec: batchv1.JobSpec{
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{Labels: defaultLabels},
},
PodFailurePolicy: &batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
{
Type: v1.PodConditionType("MemoryLimitExceeded"),
Status: v1.ConditionFalse,
},
{
Type: v1.PodConditionType("DiskLimitExceeded"),
},
},
},
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1},
},
},
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
},
},
},
},
},
},
},
expected: &batchv1.Job{
Spec: batchv1.JobSpec{
Completions: pointer.Int32Ptr(1),
Parallelism: pointer.Int32Ptr(1),
BackoffLimit: pointer.Int32Ptr(6),
CompletionMode: completionModePtr(batchv1.NonIndexedCompletion),
Suspend: pointer.BoolPtr(false),
PodFailurePolicy: &batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
{
Type: v1.PodConditionType("MemoryLimitExceeded"),
Status: v1.ConditionFalse,
},
{
Type: v1.PodConditionType("DiskLimitExceeded"),
Status: v1.ConditionTrue,
},
},
},
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1},
},
},
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
},
},
expectLabels: true,
},
"All unspecified -> sets all to default values": {
original: &batchv1.Job{
Spec: batchv1.JobSpec{
@ -267,6 +358,9 @@ func TestSetDefaultJob(t *testing.T) {
validateDefaultInt32(t, "Parallelism", actual.Spec.Parallelism, expected.Spec.Parallelism)
validateDefaultInt32(t, "BackoffLimit", actual.Spec.BackoffLimit, expected.Spec.BackoffLimit)
if diff := cmp.Diff(expected.Spec.PodFailurePolicy, actual.Spec.PodFailurePolicy); diff != "" {
t.Errorf("unexpected diff in errors (-want, +got):\n%s", diff)
}
if test.expectLabels != reflect.DeepEqual(actual.Labels, actual.Spec.Template.Labels) {
if test.expectLabels {
t.Errorf("Expected labels: %v, got: %v", actual.Spec.Template.Labels, actual.Labels)

View File

@ -132,6 +132,46 @@ func RegisterConversions(s *runtime.Scheme) error {
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*v1.PodFailurePolicy)(nil), (*batch.PodFailurePolicy)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_PodFailurePolicy_To_batch_PodFailurePolicy(a.(*v1.PodFailurePolicy), b.(*batch.PodFailurePolicy), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*batch.PodFailurePolicy)(nil), (*v1.PodFailurePolicy)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_batch_PodFailurePolicy_To_v1_PodFailurePolicy(a.(*batch.PodFailurePolicy), b.(*v1.PodFailurePolicy), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*v1.PodFailurePolicyOnExitCodesRequirement)(nil), (*batch.PodFailurePolicyOnExitCodesRequirement)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_PodFailurePolicyOnExitCodesRequirement_To_batch_PodFailurePolicyOnExitCodesRequirement(a.(*v1.PodFailurePolicyOnExitCodesRequirement), b.(*batch.PodFailurePolicyOnExitCodesRequirement), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*batch.PodFailurePolicyOnExitCodesRequirement)(nil), (*v1.PodFailurePolicyOnExitCodesRequirement)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_batch_PodFailurePolicyOnExitCodesRequirement_To_v1_PodFailurePolicyOnExitCodesRequirement(a.(*batch.PodFailurePolicyOnExitCodesRequirement), b.(*v1.PodFailurePolicyOnExitCodesRequirement), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*v1.PodFailurePolicyOnPodConditionsPattern)(nil), (*batch.PodFailurePolicyOnPodConditionsPattern)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_PodFailurePolicyOnPodConditionsPattern_To_batch_PodFailurePolicyOnPodConditionsPattern(a.(*v1.PodFailurePolicyOnPodConditionsPattern), b.(*batch.PodFailurePolicyOnPodConditionsPattern), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*batch.PodFailurePolicyOnPodConditionsPattern)(nil), (*v1.PodFailurePolicyOnPodConditionsPattern)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_batch_PodFailurePolicyOnPodConditionsPattern_To_v1_PodFailurePolicyOnPodConditionsPattern(a.(*batch.PodFailurePolicyOnPodConditionsPattern), b.(*v1.PodFailurePolicyOnPodConditionsPattern), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*v1.PodFailurePolicyRule)(nil), (*batch.PodFailurePolicyRule)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_PodFailurePolicyRule_To_batch_PodFailurePolicyRule(a.(*v1.PodFailurePolicyRule), b.(*batch.PodFailurePolicyRule), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*batch.PodFailurePolicyRule)(nil), (*v1.PodFailurePolicyRule)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_batch_PodFailurePolicyRule_To_v1_PodFailurePolicyRule(a.(*batch.PodFailurePolicyRule), b.(*v1.PodFailurePolicyRule), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*v1.UncountedTerminatedPods)(nil), (*batch.UncountedTerminatedPods)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_UncountedTerminatedPods_To_batch_UncountedTerminatedPods(a.(*v1.UncountedTerminatedPods), b.(*batch.UncountedTerminatedPods), scope)
}); err != nil {
@ -399,6 +439,7 @@ func autoConvert_v1_JobSpec_To_batch_JobSpec(in *v1.JobSpec, out *batch.JobSpec,
out.Parallelism = (*int32)(unsafe.Pointer(in.Parallelism))
out.Completions = (*int32)(unsafe.Pointer(in.Completions))
out.ActiveDeadlineSeconds = (*int64)(unsafe.Pointer(in.ActiveDeadlineSeconds))
out.PodFailurePolicy = (*batch.PodFailurePolicy)(unsafe.Pointer(in.PodFailurePolicy))
out.BackoffLimit = (*int32)(unsafe.Pointer(in.BackoffLimit))
out.Selector = (*metav1.LabelSelector)(unsafe.Pointer(in.Selector))
out.ManualSelector = (*bool)(unsafe.Pointer(in.ManualSelector))
@ -414,6 +455,7 @@ func autoConvert_v1_JobSpec_To_batch_JobSpec(in *v1.JobSpec, out *batch.JobSpec,
func autoConvert_batch_JobSpec_To_v1_JobSpec(in *batch.JobSpec, out *v1.JobSpec, s conversion.Scope) error {
out.Parallelism = (*int32)(unsafe.Pointer(in.Parallelism))
out.Completions = (*int32)(unsafe.Pointer(in.Completions))
out.PodFailurePolicy = (*v1.PodFailurePolicy)(unsafe.Pointer(in.PodFailurePolicy))
out.ActiveDeadlineSeconds = (*int64)(unsafe.Pointer(in.ActiveDeadlineSeconds))
out.BackoffLimit = (*int32)(unsafe.Pointer(in.BackoffLimit))
out.Selector = (*metav1.LabelSelector)(unsafe.Pointer(in.Selector))
@ -489,6 +531,96 @@ func Convert_batch_JobTemplateSpec_To_v1_JobTemplateSpec(in *batch.JobTemplateSp
return autoConvert_batch_JobTemplateSpec_To_v1_JobTemplateSpec(in, out, s)
}
func autoConvert_v1_PodFailurePolicy_To_batch_PodFailurePolicy(in *v1.PodFailurePolicy, out *batch.PodFailurePolicy, s conversion.Scope) error {
out.Rules = *(*[]batch.PodFailurePolicyRule)(unsafe.Pointer(&in.Rules))
return nil
}
// Convert_v1_PodFailurePolicy_To_batch_PodFailurePolicy is an autogenerated conversion function.
func Convert_v1_PodFailurePolicy_To_batch_PodFailurePolicy(in *v1.PodFailurePolicy, out *batch.PodFailurePolicy, s conversion.Scope) error {
return autoConvert_v1_PodFailurePolicy_To_batch_PodFailurePolicy(in, out, s)
}
func autoConvert_batch_PodFailurePolicy_To_v1_PodFailurePolicy(in *batch.PodFailurePolicy, out *v1.PodFailurePolicy, s conversion.Scope) error {
out.Rules = *(*[]v1.PodFailurePolicyRule)(unsafe.Pointer(&in.Rules))
return nil
}
// Convert_batch_PodFailurePolicy_To_v1_PodFailurePolicy is an autogenerated conversion function.
func Convert_batch_PodFailurePolicy_To_v1_PodFailurePolicy(in *batch.PodFailurePolicy, out *v1.PodFailurePolicy, s conversion.Scope) error {
return autoConvert_batch_PodFailurePolicy_To_v1_PodFailurePolicy(in, out, s)
}
func autoConvert_v1_PodFailurePolicyOnExitCodesRequirement_To_batch_PodFailurePolicyOnExitCodesRequirement(in *v1.PodFailurePolicyOnExitCodesRequirement, out *batch.PodFailurePolicyOnExitCodesRequirement, s conversion.Scope) error {
out.ContainerName = (*string)(unsafe.Pointer(in.ContainerName))
out.Operator = batch.PodFailurePolicyOnExitCodesOperator(in.Operator)
out.Values = *(*[]int32)(unsafe.Pointer(&in.Values))
return nil
}
// Convert_v1_PodFailurePolicyOnExitCodesRequirement_To_batch_PodFailurePolicyOnExitCodesRequirement is an autogenerated conversion function.
func Convert_v1_PodFailurePolicyOnExitCodesRequirement_To_batch_PodFailurePolicyOnExitCodesRequirement(in *v1.PodFailurePolicyOnExitCodesRequirement, out *batch.PodFailurePolicyOnExitCodesRequirement, s conversion.Scope) error {
return autoConvert_v1_PodFailurePolicyOnExitCodesRequirement_To_batch_PodFailurePolicyOnExitCodesRequirement(in, out, s)
}
func autoConvert_batch_PodFailurePolicyOnExitCodesRequirement_To_v1_PodFailurePolicyOnExitCodesRequirement(in *batch.PodFailurePolicyOnExitCodesRequirement, out *v1.PodFailurePolicyOnExitCodesRequirement, s conversion.Scope) error {
out.ContainerName = (*string)(unsafe.Pointer(in.ContainerName))
out.Operator = v1.PodFailurePolicyOnExitCodesOperator(in.Operator)
out.Values = *(*[]int32)(unsafe.Pointer(&in.Values))
return nil
}
// Convert_batch_PodFailurePolicyOnExitCodesRequirement_To_v1_PodFailurePolicyOnExitCodesRequirement is an autogenerated conversion function.
func Convert_batch_PodFailurePolicyOnExitCodesRequirement_To_v1_PodFailurePolicyOnExitCodesRequirement(in *batch.PodFailurePolicyOnExitCodesRequirement, out *v1.PodFailurePolicyOnExitCodesRequirement, s conversion.Scope) error {
return autoConvert_batch_PodFailurePolicyOnExitCodesRequirement_To_v1_PodFailurePolicyOnExitCodesRequirement(in, out, s)
}
func autoConvert_v1_PodFailurePolicyOnPodConditionsPattern_To_batch_PodFailurePolicyOnPodConditionsPattern(in *v1.PodFailurePolicyOnPodConditionsPattern, out *batch.PodFailurePolicyOnPodConditionsPattern, s conversion.Scope) error {
out.Type = core.PodConditionType(in.Type)
out.Status = core.ConditionStatus(in.Status)
return nil
}
// Convert_v1_PodFailurePolicyOnPodConditionsPattern_To_batch_PodFailurePolicyOnPodConditionsPattern is an autogenerated conversion function.
func Convert_v1_PodFailurePolicyOnPodConditionsPattern_To_batch_PodFailurePolicyOnPodConditionsPattern(in *v1.PodFailurePolicyOnPodConditionsPattern, out *batch.PodFailurePolicyOnPodConditionsPattern, s conversion.Scope) error {
return autoConvert_v1_PodFailurePolicyOnPodConditionsPattern_To_batch_PodFailurePolicyOnPodConditionsPattern(in, out, s)
}
func autoConvert_batch_PodFailurePolicyOnPodConditionsPattern_To_v1_PodFailurePolicyOnPodConditionsPattern(in *batch.PodFailurePolicyOnPodConditionsPattern, out *v1.PodFailurePolicyOnPodConditionsPattern, s conversion.Scope) error {
out.Type = corev1.PodConditionType(in.Type)
out.Status = corev1.ConditionStatus(in.Status)
return nil
}
// Convert_batch_PodFailurePolicyOnPodConditionsPattern_To_v1_PodFailurePolicyOnPodConditionsPattern is an autogenerated conversion function.
func Convert_batch_PodFailurePolicyOnPodConditionsPattern_To_v1_PodFailurePolicyOnPodConditionsPattern(in *batch.PodFailurePolicyOnPodConditionsPattern, out *v1.PodFailurePolicyOnPodConditionsPattern, s conversion.Scope) error {
return autoConvert_batch_PodFailurePolicyOnPodConditionsPattern_To_v1_PodFailurePolicyOnPodConditionsPattern(in, out, s)
}
func autoConvert_v1_PodFailurePolicyRule_To_batch_PodFailurePolicyRule(in *v1.PodFailurePolicyRule, out *batch.PodFailurePolicyRule, s conversion.Scope) error {
out.Action = batch.PodFailurePolicyAction(in.Action)
out.OnExitCodes = (*batch.PodFailurePolicyOnExitCodesRequirement)(unsafe.Pointer(in.OnExitCodes))
out.OnPodConditions = *(*[]batch.PodFailurePolicyOnPodConditionsPattern)(unsafe.Pointer(&in.OnPodConditions))
return nil
}
// Convert_v1_PodFailurePolicyRule_To_batch_PodFailurePolicyRule is an autogenerated conversion function.
func Convert_v1_PodFailurePolicyRule_To_batch_PodFailurePolicyRule(in *v1.PodFailurePolicyRule, out *batch.PodFailurePolicyRule, s conversion.Scope) error {
return autoConvert_v1_PodFailurePolicyRule_To_batch_PodFailurePolicyRule(in, out, s)
}
func autoConvert_batch_PodFailurePolicyRule_To_v1_PodFailurePolicyRule(in *batch.PodFailurePolicyRule, out *v1.PodFailurePolicyRule, s conversion.Scope) error {
out.Action = v1.PodFailurePolicyAction(in.Action)
out.OnExitCodes = (*v1.PodFailurePolicyOnExitCodesRequirement)(unsafe.Pointer(in.OnExitCodes))
out.OnPodConditions = *(*[]v1.PodFailurePolicyOnPodConditionsPattern)(unsafe.Pointer(&in.OnPodConditions))
return nil
}
// Convert_batch_PodFailurePolicyRule_To_v1_PodFailurePolicyRule is an autogenerated conversion function.
func Convert_batch_PodFailurePolicyRule_To_v1_PodFailurePolicyRule(in *batch.PodFailurePolicyRule, out *v1.PodFailurePolicyRule, s conversion.Scope) error {
return autoConvert_batch_PodFailurePolicyRule_To_v1_PodFailurePolicyRule(in, out, s)
}
func autoConvert_v1_UncountedTerminatedPods_To_batch_UncountedTerminatedPods(in *v1.UncountedTerminatedPods, out *batch.UncountedTerminatedPods, s conversion.Scope) error {
out.Succeeded = *(*[]types.UID)(unsafe.Pointer(&in.Succeeded))
out.Failed = *(*[]types.UID)(unsafe.Pointer(&in.Failed))

View File

@ -22,6 +22,7 @@ import (
"time"
"github.com/robfig/cron/v3"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
unversionedvalidation "k8s.io/apimachinery/pkg/apis/meta/v1/validation"
"k8s.io/apimachinery/pkg/labels"
@ -39,6 +40,33 @@ import (
// .status.completedIndexes.
const maxParallelismForIndexedJob = 100000
const (
// maximum number of rules in pod failure policy
maxPodFailurePolicyRules = 20
// maximum number of values for an OnExitCodes requirement in pod failure policy
maxPodFailurePolicyOnExitCodesValues = 255
// maximum number of patterns for an OnPodConditions requirement in pod failure policy
maxPodFailurePolicyOnPodConditionsPatterns = 20
)
var (
supportedPodFailurePolicyActions sets.String = sets.NewString(
string(batch.PodFailurePolicyActionCount),
string(batch.PodFailurePolicyActionFailJob),
string(batch.PodFailurePolicyActionIgnore))
supportedPodFailurePolicyOnExitCodesOperator sets.String = sets.NewString(
string(batch.PodFailurePolicyOnExitCodesOpIn),
string(batch.PodFailurePolicyOnExitCodesOpNotIn))
supportedPodFailurePolicyOnPodConditionsStatus sets.String = sets.NewString(
string(v1.ConditionFalse),
string(v1.ConditionTrue),
string(v1.ConditionUnknown))
)
// ValidateGeneratedSelector validates that the generated selector on a controller object matches the controller object
// metadata, and the labels on the pod template are as generated.
//
@ -168,6 +196,10 @@ func validateJobSpec(spec *batch.JobSpec, fldPath *field.Path, opts apivalidatio
}
}
if spec.PodFailurePolicy != nil {
allErrs = append(allErrs, validatePodFailurePolicy(spec, fldPath.Child("podFailurePolicy"))...)
}
allErrs = append(allErrs, apivalidation.ValidatePodTemplateSpec(&spec.Template, fldPath.Child("template"), opts)...)
// spec.Template.Spec.RestartPolicy can be defaulted as RestartPolicyAlways
@ -179,10 +211,113 @@ func validateJobSpec(spec *batch.JobSpec, fldPath *field.Path, opts apivalidatio
} else if spec.Template.Spec.RestartPolicy != api.RestartPolicyOnFailure && spec.Template.Spec.RestartPolicy != api.RestartPolicyNever {
allErrs = append(allErrs, field.NotSupported(fldPath.Child("template", "spec", "restartPolicy"),
spec.Template.Spec.RestartPolicy, []string{string(api.RestartPolicyOnFailure), string(api.RestartPolicyNever)}))
} else if spec.PodFailurePolicy != nil && spec.Template.Spec.RestartPolicy != api.RestartPolicyNever {
allErrs = append(allErrs, field.Invalid(fldPath.Child("template", "spec", "restartPolicy"),
spec.Template.Spec.RestartPolicy, fmt.Sprintf("only %q is supported when podFailurePolicy is specified", api.RestartPolicyNever)))
}
return allErrs
}
func validatePodFailurePolicy(spec *batch.JobSpec, fldPath *field.Path) field.ErrorList {
var allErrs field.ErrorList
rulesPath := fldPath.Child("rules")
if len(spec.PodFailurePolicy.Rules) > maxPodFailurePolicyRules {
allErrs = append(allErrs, field.TooMany(rulesPath, len(spec.PodFailurePolicy.Rules), maxPodFailurePolicyRules))
}
containerNames := sets.NewString()
for _, containerSpec := range spec.Template.Spec.Containers {
containerNames.Insert(containerSpec.Name)
}
for _, containerSpec := range spec.Template.Spec.InitContainers {
containerNames.Insert(containerSpec.Name)
}
for i, rule := range spec.PodFailurePolicy.Rules {
allErrs = append(allErrs, validatePodFailurePolicyRule(&rule, rulesPath.Index(i), containerNames)...)
}
return allErrs
}
func validatePodFailurePolicyRule(rule *batch.PodFailurePolicyRule, rulePath *field.Path, containerNames sets.String) field.ErrorList {
var allErrs field.ErrorList
actionPath := rulePath.Child("action")
if rule.Action == "" {
allErrs = append(allErrs, field.Required(actionPath, fmt.Sprintf("valid values: %q", supportedPodFailurePolicyActions.List())))
} else if !supportedPodFailurePolicyActions.Has(string(rule.Action)) {
allErrs = append(allErrs, field.NotSupported(actionPath, rule.Action, supportedPodFailurePolicyActions.List()))
}
if rule.OnExitCodes != nil {
allErrs = append(allErrs, validatePodFailurePolicyRuleOnExitCodes(rule.OnExitCodes, rulePath.Child("onExitCodes"), containerNames)...)
}
if len(rule.OnPodConditions) > 0 {
allErrs = append(allErrs, validatePodFailurePolicyRuleOnPodConditions(rule.OnPodConditions, rulePath.Child("onPodConditions"))...)
}
if rule.OnExitCodes != nil && len(rule.OnPodConditions) > 0 {
allErrs = append(allErrs, field.Invalid(rulePath, field.OmitValueType{}, "specifying both OnExitCodes and OnPodConditions is not supported"))
}
if rule.OnExitCodes == nil && len(rule.OnPodConditions) == 0 {
allErrs = append(allErrs, field.Invalid(rulePath, field.OmitValueType{}, "specifying one of OnExitCodes and OnPodConditions is required"))
}
return allErrs
}
func validatePodFailurePolicyRuleOnPodConditions(onPodConditions []batch.PodFailurePolicyOnPodConditionsPattern, onPodConditionsPath *field.Path) field.ErrorList {
var allErrs field.ErrorList
if len(onPodConditions) > maxPodFailurePolicyOnPodConditionsPatterns {
allErrs = append(allErrs, field.TooMany(onPodConditionsPath, len(onPodConditions), maxPodFailurePolicyOnPodConditionsPatterns))
}
for j, pattern := range onPodConditions {
patternPath := onPodConditionsPath.Index(j)
statusPath := patternPath.Child("status")
allErrs = append(allErrs, apivalidation.ValidateQualifiedName(string(pattern.Type), patternPath.Child("type"))...)
if pattern.Status == "" {
allErrs = append(allErrs, field.Required(statusPath, fmt.Sprintf("valid values: %q", supportedPodFailurePolicyOnPodConditionsStatus.List())))
} else if !supportedPodFailurePolicyOnPodConditionsStatus.Has(string(pattern.Status)) {
allErrs = append(allErrs, field.NotSupported(statusPath, pattern.Status, supportedPodFailurePolicyOnPodConditionsStatus.List()))
}
}
return allErrs
}
func validatePodFailurePolicyRuleOnExitCodes(onExitCode *batch.PodFailurePolicyOnExitCodesRequirement, onExitCodesPath *field.Path, containerNames sets.String) field.ErrorList {
var allErrs field.ErrorList
operatorPath := onExitCodesPath.Child("operator")
if onExitCode.Operator == "" {
allErrs = append(allErrs, field.Required(operatorPath, fmt.Sprintf("valid values: %q", supportedPodFailurePolicyOnExitCodesOperator.List())))
} else if !supportedPodFailurePolicyOnExitCodesOperator.Has(string(onExitCode.Operator)) {
allErrs = append(allErrs, field.NotSupported(operatorPath, onExitCode.Operator, supportedPodFailurePolicyOnExitCodesOperator.List()))
}
if onExitCode.ContainerName != nil && !containerNames.Has(*onExitCode.ContainerName) {
allErrs = append(allErrs, field.Invalid(onExitCodesPath.Child("containerName"), *onExitCode.ContainerName, "must be one of the container or initContainer names in the pod template"))
}
valuesPath := onExitCodesPath.Child("values")
if len(onExitCode.Values) == 0 {
allErrs = append(allErrs, field.Invalid(valuesPath, onExitCode.Values, "at least one value is required"))
} else if len(onExitCode.Values) > maxPodFailurePolicyOnExitCodesValues {
allErrs = append(allErrs, field.TooMany(valuesPath, len(onExitCode.Values), maxPodFailurePolicyOnExitCodesValues))
}
isOrdered := true
uniqueValues := sets.NewInt32()
for j, exitCodeValue := range onExitCode.Values {
valuePath := valuesPath.Index(j)
if onExitCode.Operator == batch.PodFailurePolicyOnExitCodesOpIn && exitCodeValue == 0 {
allErrs = append(allErrs, field.Invalid(valuePath, exitCodeValue, "must not be 0 for the In operator"))
}
if uniqueValues.Has(exitCodeValue) {
allErrs = append(allErrs, field.Duplicate(valuePath, exitCodeValue))
} else {
uniqueValues.Insert(exitCodeValue)
}
if j > 0 && onExitCode.Values[j-1] > exitCodeValue {
isOrdered = false
}
}
if !isOrdered {
allErrs = append(allErrs, field.Invalid(valuesPath, onExitCode.Values, "must be ordered"))
}
return allErrs
}
// validateJobStatus validates a JobStatus and returns an ErrorList with any errors.
func validateJobStatus(status *batch.JobStatus, fldPath *field.Path) field.ErrorList {
allErrs := field.ErrorList{}
@ -241,6 +376,7 @@ func ValidateJobSpecUpdate(spec, oldSpec batch.JobSpec, fldPath *field.Path, opt
allErrs = append(allErrs, apivalidation.ValidateImmutableField(spec.Selector, oldSpec.Selector, fldPath.Child("selector"))...)
allErrs = append(allErrs, validatePodTemplateUpdate(spec, oldSpec, fldPath, opts)...)
allErrs = append(allErrs, apivalidation.ValidateImmutableField(spec.CompletionMode, oldSpec.CompletionMode, fldPath.Child("completionMode"))...)
allErrs = append(allErrs, apivalidation.ValidateImmutableField(spec.PodFailurePolicy, oldSpec.PodFailurePolicy, fldPath.Child("podFailurePolicy"))...)
return allErrs
}

View File

@ -18,6 +18,7 @@ package validation
import (
"archive/zip"
"fmt"
"io"
"os"
"path/filepath"
@ -80,23 +81,85 @@ func getValidPodTemplateSpecForGenerated(selector *metav1.LabelSelector) api.Pod
Labels: selector.MatchLabels,
},
Spec: api.PodSpec{
RestartPolicy: api.RestartPolicyOnFailure,
DNSPolicy: api.DNSClusterFirst,
Containers: []api.Container{{Name: "abc", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: api.TerminationMessageReadFile}},
RestartPolicy: api.RestartPolicyOnFailure,
DNSPolicy: api.DNSClusterFirst,
Containers: []api.Container{{Name: "abc", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: api.TerminationMessageReadFile}},
InitContainers: []api.Container{{Name: "def", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: api.TerminationMessageReadFile}},
},
}
}
func TestValidateJob(t *testing.T) {
validJobObjectMeta := metav1.ObjectMeta{
Name: "myjob",
Namespace: metav1.NamespaceDefault,
UID: types.UID("1a2b3c"),
}
validManualSelector := getValidManualSelector()
validPodTemplateSpecForManual := getValidPodTemplateSpecForManual(validManualSelector)
validGeneratedSelector := getValidGeneratedSelector()
validPodTemplateSpecForGenerated := getValidPodTemplateSpecForGenerated(validGeneratedSelector)
validPodTemplateSpecForGeneratedRestartPolicyNever := getValidPodTemplateSpecForGenerated(validGeneratedSelector)
validPodTemplateSpecForGeneratedRestartPolicyNever.Spec.RestartPolicy = api.RestartPolicyNever
successCases := map[string]struct {
opts JobValidationOptions
job batch.Job
}{
"valid pod failure policy": {
job: batch.Job{
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Status: api.ConditionTrue,
},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.PodConditionType("CustomConditionType"),
Status: api.ConditionFalse,
},
},
},
{
Action: batch.PodFailurePolicyActionCount,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
ContainerName: pointer.String("abc"),
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
ContainerName: pointer.String("def"),
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{4},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
Values: []int32{5, 6, 7},
},
},
},
},
},
},
},
"valid manual selector": {
job: batch.Job{
ObjectMeta: metav1.ObjectMeta{
@ -185,6 +248,402 @@ func TestValidateJob(t *testing.T) {
negative := int32(-1)
negative64 := int64(-1)
errorCases := map[string]batch.Job{
`spec.podFailurePolicy.rules[0]: Invalid value: specifying one of OnExitCodes and OnPodConditions is required`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
},
},
},
},
},
`spec.podFailurePolicy.rules[0].onExitCodes.values[1]: Duplicate value: 11`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{11, 11},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[0].onExitCodes.values: Too many: 256: must have at most 255 items`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: func() (values []int32) {
tooManyValues := make([]int32, maxPodFailurePolicyOnExitCodesValues+1)
for i := range tooManyValues {
tooManyValues[i] = int32(i)
}
return tooManyValues
}(),
},
},
},
},
},
},
`spec.podFailurePolicy.rules: Too many: 21: must have at most 20 items`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: func() []batch.PodFailurePolicyRule {
tooManyRules := make([]batch.PodFailurePolicyRule, maxPodFailurePolicyRules+1)
for i := range tooManyRules {
tooManyRules[i] = batch.PodFailurePolicyRule{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{int32(i + 1)},
},
}
}
return tooManyRules
}(),
},
},
},
`spec.podFailurePolicy.rules[0].onPodConditions: Too many: 21: must have at most 20 items`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnPodConditions: func() []batch.PodFailurePolicyOnPodConditionsPattern {
tooManyPatterns := make([]batch.PodFailurePolicyOnPodConditionsPattern, maxPodFailurePolicyOnPodConditionsPatterns+1)
for i := range tooManyPatterns {
tooManyPatterns[i] = batch.PodFailurePolicyOnPodConditionsPattern{
Type: api.PodConditionType(fmt.Sprintf("CustomType_%d", i)),
Status: api.ConditionTrue,
}
}
return tooManyPatterns
}(),
},
},
},
},
},
`spec.podFailurePolicy.rules[0].onExitCodes.values[2]: Duplicate value: 13`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{12, 13, 13, 13},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[0].onExitCodes.values: Invalid value: []int32{19, 11}: must be ordered`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{19, 11},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[0].onExitCodes.values: Invalid value: []int32{}: at least one value is required`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[0].action: Required value: valid values: ["Count" "FailJob" "Ignore"]`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: "",
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[0].onExitCodes.operator: Required value: valid values: ["In" "NotIn"]`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: "",
Values: []int32{1, 2, 3},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[0]: Invalid value: specifying both OnExitCodes and OnPodConditions is not supported`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
ContainerName: pointer.String("abc"),
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Status: api.ConditionTrue,
},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[0].onExitCodes.values[1]: Invalid value: 0: must not be 0 for the In operator`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 0, 2},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[1].onExitCodes.containerName: Invalid value: "xyz": must be one of the container or initContainer names in the pod template`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
ContainerName: pointer.String("abc"),
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
ContainerName: pointer.String("xyz"),
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{5, 6, 7},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[0].action: Unsupported value: "UnknownAction": supported values: "Count", "FailJob", "Ignore"`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: "UnknownAction",
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
ContainerName: pointer.String("abc"),
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[0].onExitCodes.operator: Unsupported value: "UnknownOperator": supported values: "In", "NotIn"`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: "UnknownOperator",
Values: []int32{1, 2, 3},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[0].onPodConditions[0].status: Required value: valid values: ["False" "True" "Unknown"]`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[0].onPodConditions[0].status: Unsupported value: "UnknownStatus": supported values: "False", "True", "Unknown"`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Status: "UnknownStatus",
},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[0].onPodConditions[0].type: Invalid value: "": name part must be non-empty`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Status: api.ConditionTrue,
},
},
},
},
},
},
},
`spec.podFailurePolicy.rules[0].onPodConditions[0].type: Invalid value: "Invalid Condition Type": name part must consist of alphanumeric characters, '-', '_' or '.', and must start and end with an alphanumeric character (e.g. 'MyName', or 'my.name', or '123-abc', regex used for validation is '([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]')`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.PodConditionType("Invalid Condition Type"),
Status: api.ConditionTrue,
},
},
},
},
},
},
},
`spec.template.spec.restartPolicy: Invalid value: "OnFailure": only "Never" is supported when podFailurePolicy is specified`: {
ObjectMeta: validJobObjectMeta,
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: api.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: validGeneratedSelector.MatchLabels,
},
Spec: api.PodSpec{
RestartPolicy: api.RestartPolicyOnFailure,
DNSPolicy: api.DNSClusterFirst,
Containers: []api.Container{{Name: "abc", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: api.TerminationMessageReadFile}},
},
},
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{},
},
},
},
"spec.parallelism:must be greater than or equal to 0": {
ObjectMeta: metav1.ObjectMeta{
Name: "myjob",
@ -388,6 +847,9 @@ func TestValidateJob(t *testing.T) {
func TestValidateJobUpdate(t *testing.T) {
validGeneratedSelector := getValidGeneratedSelector()
validPodTemplateSpecForGenerated := getValidPodTemplateSpecForGenerated(validGeneratedSelector)
validPodTemplateSpecForGeneratedRestartPolicyNever := getValidPodTemplateSpecForGenerated(validGeneratedSelector)
validPodTemplateSpecForGeneratedRestartPolicyNever.Spec.RestartPolicy = api.RestartPolicyNever
validNodeAffinity := &api.Affinity{
NodeAffinity: &api.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &api.NodeSelector{
@ -491,6 +953,100 @@ func TestValidateJobUpdate(t *testing.T) {
Field: "spec.selector",
},
},
"add pod failure policy": {
old: batch.Job{
ObjectMeta: metav1.ObjectMeta{Name: "abc", Namespace: metav1.NamespaceDefault},
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
},
},
update: func(job *batch.Job) {
job.Spec.PodFailurePolicy = &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Status: api.ConditionTrue,
},
},
},
},
}
},
err: &field.Error{
Type: field.ErrorTypeInvalid,
Field: "spec.podFailurePolicy",
},
},
"remove pod failure policy": {
old: batch.Job{
ObjectMeta: metav1.ObjectMeta{Name: "abc", Namespace: metav1.NamespaceDefault},
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Status: api.ConditionTrue,
},
},
},
},
},
},
},
update: func(job *batch.Job) {
job.Spec.PodFailurePolicy.Rules = append(job.Spec.PodFailurePolicy.Rules, batch.PodFailurePolicyRule{
Action: batch.PodFailurePolicyActionCount,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Status: api.ConditionTrue,
},
},
})
},
err: &field.Error{
Type: field.ErrorTypeInvalid,
Field: "spec.podFailurePolicy",
},
},
"update pod failure policy": {
old: batch.Job{
ObjectMeta: metav1.ObjectMeta{Name: "abc", Namespace: metav1.NamespaceDefault},
Spec: batch.JobSpec{
Selector: validGeneratedSelector,
Template: validPodTemplateSpecForGeneratedRestartPolicyNever,
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Status: api.ConditionTrue,
},
},
},
},
},
},
},
update: func(job *batch.Job) {
job.Spec.PodFailurePolicy = nil
},
err: &field.Error{
Type: field.ErrorTypeInvalid,
Field: "spec.podFailurePolicy",
},
},
"immutable pod template": {
old: batch.Job{
ObjectMeta: metav1.ObjectMeta{Name: "abc", Namespace: metav1.NamespaceDefault},

View File

@ -252,6 +252,11 @@ func (in *JobSpec) DeepCopyInto(out *JobSpec) {
*out = new(int32)
**out = **in
}
if in.PodFailurePolicy != nil {
in, out := &in.PodFailurePolicy, &out.PodFailurePolicy
*out = new(PodFailurePolicy)
(*in).DeepCopyInto(*out)
}
if in.ActiveDeadlineSeconds != nil {
in, out := &in.ActiveDeadlineSeconds, &out.ActiveDeadlineSeconds
*out = new(int64)
@ -387,6 +392,97 @@ func (in *JobTemplateSpec) DeepCopy() *JobTemplateSpec {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PodFailurePolicy) DeepCopyInto(out *PodFailurePolicy) {
*out = *in
if in.Rules != nil {
in, out := &in.Rules, &out.Rules
*out = make([]PodFailurePolicyRule, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodFailurePolicy.
func (in *PodFailurePolicy) DeepCopy() *PodFailurePolicy {
if in == nil {
return nil
}
out := new(PodFailurePolicy)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PodFailurePolicyOnExitCodesRequirement) DeepCopyInto(out *PodFailurePolicyOnExitCodesRequirement) {
*out = *in
if in.ContainerName != nil {
in, out := &in.ContainerName, &out.ContainerName
*out = new(string)
**out = **in
}
if in.Values != nil {
in, out := &in.Values, &out.Values
*out = make([]int32, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodFailurePolicyOnExitCodesRequirement.
func (in *PodFailurePolicyOnExitCodesRequirement) DeepCopy() *PodFailurePolicyOnExitCodesRequirement {
if in == nil {
return nil
}
out := new(PodFailurePolicyOnExitCodesRequirement)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PodFailurePolicyOnPodConditionsPattern) DeepCopyInto(out *PodFailurePolicyOnPodConditionsPattern) {
*out = *in
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodFailurePolicyOnPodConditionsPattern.
func (in *PodFailurePolicyOnPodConditionsPattern) DeepCopy() *PodFailurePolicyOnPodConditionsPattern {
if in == nil {
return nil
}
out := new(PodFailurePolicyOnPodConditionsPattern)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PodFailurePolicyRule) DeepCopyInto(out *PodFailurePolicyRule) {
*out = *in
if in.OnExitCodes != nil {
in, out := &in.OnExitCodes, &out.OnExitCodes
*out = new(PodFailurePolicyOnExitCodesRequirement)
(*in).DeepCopyInto(*out)
}
if in.OnPodConditions != nil {
in, out := &in.OnPodConditions, &out.OnPodConditions
*out = make([]PodFailurePolicyOnPodConditionsPattern, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodFailurePolicyRule.
func (in *PodFailurePolicyRule) DeepCopy() *PodFailurePolicyRule {
if in == nil {
return nil
}
out := new(PodFailurePolicyRule)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *UncountedTerminatedPods) DeepCopyInto(out *UncountedTerminatedPods) {
*out = *in

View File

@ -61,6 +61,12 @@ import (
// a Job. It is used if the feature gate JobReadyPods is enabled.
const podUpdateBatchPeriod = time.Second
const (
// PodFailurePolicy reason indicates that a job failure condition was added due to
// a failed pod matching a pod failure policy rule
jobConditionReasonPodFailurePolicy = "PodFailurePolicy"
)
// controllerKind contains the schema.GroupVersionKind for this controller type.
var controllerKind = batch.SchemeGroupVersion.WithKind("Job")
@ -758,16 +764,31 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (forget bool, rEr
exceedsBackoffLimit := jobHasNewFailure && (active != *job.Spec.Parallelism) &&
(failed > *job.Spec.BackoffLimit)
if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
// check if the number of pod restarts exceeds the backoff limit (for restartPolicy OnFailure only)
// OR if the number of failed pods increased since the last syncJob
finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "BackoffLimitExceeded", "Job has reached the specified backoff limit")
} else if pastActiveDeadline(&job) {
finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "DeadlineExceeded", "Job was active longer than specified deadline")
} else if job.Spec.ActiveDeadlineSeconds != nil && !jobSuspended(&job) {
syncDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds)*time.Second - time.Since(job.Status.StartTime.Time)
klog.V(2).InfoS("Job has activeDeadlineSeconds configuration. Will sync this job again", "job", key, "nextSyncIn", syncDuration)
jm.queue.AddAfter(key, syncDuration)
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.AlphaNoCompatGuaranteeJobFailureTarget); failureTargetCondition != nil {
finishedCondition = newFailedConditionForFailureTarget(failureTargetCondition)
} else if failJobMessage := getFailJobMessage(&job, pods, uncounted.Failed()); failJobMessage != nil {
if uncounted != nil {
// Prepare the interim FailureTarget condition to record the failure message before the finalizers are removed (removing them allows the pods to be deleted).
finishedCondition = newCondition(batch.AlphaNoCompatGuaranteeJobFailureTarget, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage)
} else {
// Prepare the Failed job condition for the legacy path without finalizers (don't use the interim FailureTarget condition).
finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage)
}
}
}
if finishedCondition == nil {
if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
// check if the number of pod restarts exceeds the backoff limit (for restartPolicy OnFailure only)
// OR if the number of failed pods increased since the last syncJob
finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "BackoffLimitExceeded", "Job has reached the specified backoff limit")
} else if pastActiveDeadline(&job) {
finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "DeadlineExceeded", "Job was active longer than specified deadline")
} else if job.Spec.ActiveDeadlineSeconds != nil && !jobSuspended(&job) {
syncDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds)*time.Second - time.Since(job.Status.StartTime.Time)
klog.V(2).InfoS("Job has activeDeadlineSeconds configuration. Will sync this job again", "job", key, "nextSyncIn", syncDuration)
jm.queue.AddAfter(key, syncDuration)
}
}
var prevSucceededIndexes, succeededIndexes orderedIntervals
@ -1039,8 +1060,16 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
} else if pod.Status.Phase == v1.PodFailed || podTerminating {
ix := getCompletionIndex(pod.Annotations)
if !uncounted.failed.Has(string(pod.UID)) && (!isIndexed || (ix != unknownCompletionIndex && ix < int(*job.Spec.Completions))) {
needsFlush = true
uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID)
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil {
_, countFailed := matchPodFailurePolicy(job.Spec.PodFailurePolicy, pod)
if countFailed {
needsFlush = true
uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID)
}
} else {
needsFlush = true
uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID)
}
}
}
if len(newSucceededIndexes)+len(uncountedStatus.Succeeded)+len(uncountedStatus.Failed) >= MaxUncountedPods {
@ -1060,6 +1089,18 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
job.Status.Succeeded = int32(succeededIndexes.total())
job.Status.CompletedIndexes = succeededIndexes.String()
}
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
if finishedCond != nil && finishedCond.Type == batch.AlphaNoCompatGuaranteeJobFailureTarget {
// Append the interim FailureTarget condition to update the job status with before finalizers are removed.
job.Status.Conditions = append(job.Status.Conditions, *finishedCond)
needsFlush = true
// Prepare the final Failed condition to update the job status with after the finalizers are removed.
// It is also used in the enactJobFinished function for reporting.
finishedCond = newFailedConditionForFailureTarget(finishedCond)
}
}
var err error
if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, needsFlush); err != nil {
return err
@ -1077,7 +1118,8 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
}
// flushUncountedAndRemoveFinalizers does:
// 1. flush the Job status that might include new uncounted Pod UIDs.
// 1. flush the Job status that might include new uncounted Pod UIDs. Also flush the interim FailureTarget condition
// if present.
// 2. perform the removal of finalizers from Pods which are in the uncounted
// lists.
// 3. update the counters based on the Pods for which it successfully removed
@ -1231,6 +1273,12 @@ func filterInUncountedUIDs(uncounted []types.UID, include sets.String) []types.U
return newUncounted
}
// newFailedConditionForFailureTarget creates a job Failed condition based on
// the interim FailureTarget condition.
func newFailedConditionForFailureTarget(condition *batch.JobCondition) *batch.JobCondition {
return newCondition(batch.JobFailed, v1.ConditionTrue, condition.Reason, condition.Message)
}
// pastBackoffLimitOnFailure checks if container restartCounts sum exceeds BackoffLimit
// this method applies only to pods with restartPolicy == OnFailure
func pastBackoffLimitOnFailure(job *batch.Job, pods []*v1.Pod) bool {
@ -1282,7 +1330,24 @@ func newCondition(conditionType batch.JobConditionType, status v1.ConditionStatu
}
}
// getStatus returns number of succeeded and failed pods running a job
// getFailJobMessage returns the job failure message when one of the failed pods matches a FailJob pod failure policy rule; otherwise it returns nil
func getFailJobMessage(job *batch.Job, pods []*v1.Pod, uncounted sets.String) *string {
if !feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) || job.Spec.PodFailurePolicy == nil {
return nil
}
for _, p := range pods {
if isPodFailed(p, uncounted != nil) {
jobFailureMessage, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
if jobFailureMessage != nil {
return jobFailureMessage
}
}
}
return nil
}
// getStatus returns number of succeeded and failed pods running a job. The number
// of failed pods can be affected by the podFailurePolicy.
func getStatus(job *batch.Job, pods []*v1.Pod, uncounted *uncountedTerminatedPods, expectedRmFinalizers sets.String) (succeeded, failed int32) {
if uncounted != nil {
succeeded = job.Status.Succeeded
@ -1292,13 +1357,15 @@ func getStatus(job *batch.Job, pods []*v1.Pod, uncounted *uncountedTerminatedPod
return p.Status.Phase == v1.PodSucceeded
}))
failed += int32(countValidPodsWithFilter(job, pods, uncounted.Failed(), expectedRmFinalizers, func(p *v1.Pod) bool {
if p.Status.Phase == v1.PodFailed {
return true
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil {
if !isPodFailed(p, uncounted != nil) {
return false
}
_, countFailed := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
return countFailed
} else {
return isPodFailed(p, uncounted != nil)
}
// When tracking with finalizers: counting deleted Pods as failures to
// account for orphan Pods that never have a chance to reach the Failed
// phase.
return uncounted != nil && p.DeletionTimestamp != nil && p.Status.Phase != v1.PodSucceeded
}))
return succeeded, failed
}
@ -1667,6 +1734,16 @@ func ensureJobConditionStatus(list []batch.JobCondition, cType batch.JobConditio
return list, false
}
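// isPodFailed reports whether the pod should be counted as failed: it is either
// in the Failed phase or, when tracking with finalizers (wFinalizers), it is
// being deleted without having succeeded.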
func isPodFailed(p *v1.Pod, wFinalizers bool) bool {
if p.Status.Phase == v1.PodFailed {
return true
}
// When tracking with finalizers: counting deleted Pods as failures to
// account for orphan Pods that never have a chance to reach the Failed
// phase.
return wFinalizers && p.DeletionTimestamp != nil && p.Status.Phase != v1.PodSucceeded
}
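// findConditionByType returns a pointer to the first condition of the given
// type in the list, or nil when no such condition is present.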
func findConditionByType(list []batch.JobCondition, cType batch.JobConditionType) *batch.JobCondition {
for i := range list {
if list[i].Type == cType {

View File

@ -2019,6 +2019,962 @@ func TestSyncJobDeleted(t *testing.T) {
}
}
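// TestSyncJobWithJobPodFailurePolicy verifies that syncJob honors the pod
// failure policy rules - counting or ignoring failed pods and adding the Job
// failure condition - with the JobPodFailurePolicy feature gate enabled and disabled.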
func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
indexedCompletionMode := batch.IndexedCompletion
validObjectMeta := metav1.ObjectMeta{
Name: "foobar",
UID: uuid.NewUUID(),
Namespace: metav1.NamespaceDefault,
}
validSelector := &metav1.LabelSelector{
MatchLabels: map[string]string{"foo": "bar"},
}
validTemplate := v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"foo": "bar",
},
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{Image: "foo/bar"},
},
},
}
onExitCodeRules := []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{5, 6, 7},
},
},
}
testCases := map[string]struct {
enableJobPodFailurePolicy bool
job batch.Job
pods []v1.PodStatus
wantConditions *[]batch.JobCondition
wantStatusFailed int32
wantStatusActive int32
wantStatusSucceeded int32
}{
"default handling for pod failure if the container matching the exit codes does not match the containerName restriction": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
ContainerName: pointer.String("main-container"),
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
ContainerName: pointer.String("main-container"),
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{5, 6, 7},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "monitoring-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 42,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusSucceeded: 0,
wantStatusFailed: 1,
},
"running pod should not result in job fail based on OnExitCodes": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodRunning,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 0,
wantStatusSucceeded: 0,
},
"fail job based on OnExitCodes": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"job marked already as failure target with failed pod": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
Status: batch.JobStatus{
Conditions: []batch.JobCondition{
{
Type: batch.AlphaNoCompatGuaranteeJobFailureTarget,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"job marked already as failure target with failed pod, message based on already deleted pod": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
Status: batch.JobStatus{
Conditions: []batch.JobCondition{
{
Type: batch.AlphaNoCompatGuaranteeJobFailureTarget,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1",
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"default handling for a failed pod when the feature is disabled even, despite matching rule": {
enableJobPodFailurePolicy: false,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"fail job with multiple pods": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(2),
Completions: pointer.Int32(2),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodRunning,
},
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/mypod-1 failed with exit code 5 matching FailJob rule at index 1",
},
},
wantStatusActive: 0,
wantStatusFailed: 2,
wantStatusSucceeded: 0,
},
"fail indexed job based on OnExitCodes": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
CompletionMode: &indexedCompletionMode,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"fail job based on OnExitCodes with NotIn operator": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
Values: []int32{5, 6, 7},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 42,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/mypod-0 failed with exit code 42 matching FailJob rule at index 0",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"default handling job based on OnExitCodes with NotIn operator": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
Values: []int32{5, 6, 7},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"fail job based on OnExitCodes for InitContainer": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
InitContainerStatuses: []v1.ContainerStatus{
{
Name: "init-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 143,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container init-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"ignore pod failure; both rules are matching, the first is executed only": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(0),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "container1",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
{
Name: "container2",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 6,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 0,
wantStatusSucceeded: 0,
},
"ignore pod failure based on OnExitCodes": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(0),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 1,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 0,
wantStatusSucceeded: 0,
},
"default job based on OnExitCodes": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(0),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: onExitCodeRules,
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 10,
},
},
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "BackoffLimitExceeded",
Message: "Job has reached the specified backoff limit",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"count pod failure based on OnExitCodes; both rules are matching, the first is executed only": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionCount,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2},
},
},
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{2, 3},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"count pod failure based on OnPodConditions; both rules are matching, the first is executed only": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionCount,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.PodConditionType("ResourceLimitExceeded"),
Status: v1.ConditionTrue,
},
},
},
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.PodConditionType("ResourceLimitExceeded"),
Status: v1.ConditionTrue,
},
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
"ignore pod failure based on OnPodConditions": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(0),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantConditions: nil,
wantStatusActive: 1,
wantStatusFailed: 0,
wantStatusSucceeded: 0,
},
"fail job based on OnPodConditions": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(6),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
},
},
pods: []v1.PodStatus{
{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantConditions: &[]batch.JobCondition{
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Pod default/mypod-0 has condition DisruptionTarget matching FailJob rule at index 0",
},
},
wantStatusActive: 0,
wantStatusFailed: 1,
wantStatusSucceeded: 0,
},
}
for _, wFinalizers := range []bool{false, true} {
for name, tc := range testCases {
t.Run(fmt.Sprintf("%s; finalizers=%t", name, wFinalizers), func(t *testing.T) {
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, wFinalizers)()
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)()
clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
manager, sharedInformerFactory := newControllerFromClient(clientset, controller.NoResyncPeriodFunc)
fakePodControl := controller.FakePodControl{}
manager.podControl = &fakePodControl
manager.podStoreSynced = alwaysReady
manager.jobStoreSynced = alwaysReady
job := &tc.job
if wFinalizers {
job.Annotations = map[string]string{
batch.JobTrackingFinalizer: "",
}
}
actual := job
manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) {
actual = job
return job, nil
}
sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
for i, podStatus := range tc.pods {
pb := buildPod().name(fmt.Sprintf("mypod-%d", i)).job(job).status(podStatus)
if job.Spec.CompletionMode != nil && *job.Spec.CompletionMode == batch.IndexedCompletion {
pb.index(fmt.Sprintf("%v", i))
}
if wFinalizers {
pb.trackingFinalizer()
}
sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer().Add(pb.Pod)
}
manager.syncJob(context.TODO(), testutil.GetKey(job, t))
if tc.wantConditions != nil {
for _, wantCondition := range *tc.wantConditions {
conditions := getConditionsByType(actual.Status.Conditions, wantCondition.Type)
if len(conditions) != 1 {
t.Fatalf("Expected a single completion condition. Got %#v for type: %q", conditions, wantCondition.Type)
}
condition := *conditions[0]
if diff := cmp.Diff(wantCondition, condition, cmpopts.IgnoreFields(batch.JobCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" {
t.Errorf("Unexpected job condition (-want,+got):\n%s", diff)
}
}
} else {
if cond := hasTrueCondition(actual); cond != nil {
t.Errorf("Got condition %s, want none", *cond)
}
}
// validate status
if actual.Status.Active != tc.wantStatusActive {
t.Errorf("unexpected number of active pods. Expected %d, saw %d\n", tc.wantStatusActive, actual.Status.Active)
}
if actual.Status.Succeeded != tc.wantStatusSucceeded {
t.Errorf("unexpected number of succeeded pods. Expected %d, saw %d\n", tc.wantStatusSucceeded, actual.Status.Succeeded)
}
if actual.Status.Failed != tc.wantStatusFailed {
t.Errorf("unexpected number of failed pods. Expected %d, saw %d\n", tc.wantStatusFailed, actual.Status.Failed)
}
})
}
}
}
func TestSyncJobUpdateRequeue(t *testing.T) {
clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
DefaultJobBackOff = time.Duration(0) // overwrite the default value for testing
@ -3449,6 +4405,11 @@ func (pb podBuilder) index(ix string) podBuilder {
return pb
}
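// status sets the whole pod status on the pod being built and returns the
// builder for chaining.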
func (pb podBuilder) status(s v1.PodStatus) podBuilder {
pb.Status = s
return pb
}
func (pb podBuilder) phase(p v1.PodPhase) podBuilder {
pb.Status.Phase = p
return pb

View File

@ -0,0 +1,117 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"fmt"
batch "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
)
// matchPodFailurePolicy returns information about matching a given failed pod
// against the pod failure policy rules. The information is represented as an
// optional job failure message (present in case the pod matched a 'FailJob'
// rule) and a boolean indicating if the failure should be counted towards
// backoffLimit (it should not be counted if the pod matched an 'Ignore' rule).
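//
// For illustration (hypothetical values): a failed pod whose only container
// terminated with exit code 5, matched against a policy whose first rule is
// {Action: FailJob, OnExitCodes: {Operator: In, Values: [5]}}, yields a
// non-nil failure message and countFailed=true.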
func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *v1.Pod) (*string, bool) {
if podFailurePolicy == nil {
return nil, true
}
for index, podFailurePolicyRule := range podFailurePolicy.Rules {
if podFailurePolicyRule.OnExitCodes != nil {
if containerStatus := matchOnExitCodes(&failedPod.Status, podFailurePolicyRule.OnExitCodes); containerStatus != nil {
switch podFailurePolicyRule.Action {
case batch.PodFailurePolicyActionIgnore:
return nil, false
case batch.PodFailurePolicyActionCount:
return nil, true
case batch.PodFailurePolicyActionFailJob:
msg := fmt.Sprintf("Container %s for pod %s/%s failed with exit code %v matching %v rule at index %d",
containerStatus.Name, failedPod.Namespace, failedPod.Name, containerStatus.State.Terminated.ExitCode, podFailurePolicyRule.Action, index)
return &msg, true
}
}
} else if podFailurePolicyRule.OnPodConditions != nil {
if podCondition := matchOnPodConditions(&failedPod.Status, podFailurePolicyRule.OnPodConditions); podCondition != nil {
switch podFailurePolicyRule.Action {
case batch.PodFailurePolicyActionIgnore:
return nil, false
case batch.PodFailurePolicyActionCount:
return nil, true
case batch.PodFailurePolicyActionFailJob:
msg := fmt.Sprintf("Pod %s/%s has condition %v matching %v rule at index %d",
failedPod.Namespace, failedPod.Name, podCondition.Type, podFailurePolicyRule.Action, index)
return &msg, true
}
}
}
}
return nil, true
}
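// matchOnExitCodes returns the first container status (app containers first,
// then init containers) whose terminated exit code satisfies the requirement,
// or nil when none matches.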
func matchOnExitCodes(podStatus *v1.PodStatus, requirement *batch.PodFailurePolicyOnExitCodesRequirement) *v1.ContainerStatus {
if containerStatus := getMatchingContainerFromList(podStatus.ContainerStatuses, requirement); containerStatus != nil {
return containerStatus
}
return getMatchingContainerFromList(podStatus.InitContainerStatuses, requirement)
}
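// matchOnPodConditions returns the first pod condition that matches any of the
// patterns by both type and status, or nil when none matches.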
func matchOnPodConditions(podStatus *v1.PodStatus, requirement []batch.PodFailurePolicyOnPodConditionsPattern) *v1.PodCondition {
for _, podCondition := range podStatus.Conditions {
for _, pattern := range requirement {
if podCondition.Type == pattern.Type && podCondition.Status == pattern.Status {
return &podCondition
}
}
}
return nil
}
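// getMatchingContainerFromList returns the first container status in the list
// that is allowed by the containerName restriction, terminated with a non-zero
// exit code, and whose exit code satisfies the operator requirement; nil otherwise.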
func getMatchingContainerFromList(containerStatuses []v1.ContainerStatus, requirement *batch.PodFailurePolicyOnExitCodesRequirement) *v1.ContainerStatus {
for _, containerStatus := range containerStatuses {
if requirement.ContainerName == nil || *requirement.ContainerName == containerStatus.Name {
if containerStatus.State.Terminated.ExitCode != 0 {
if isOnExitCodesOperatorMatching(containerStatus.State.Terminated.ExitCode, requirement) {
return &containerStatus
}
}
}
}
return nil
}
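// isOnExitCodesOperatorMatching reports whether the exit code satisfies the
// requirement for the In and NotIn operators; unknown operators never match.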
func isOnExitCodesOperatorMatching(exitCode int32, requirement *batch.PodFailurePolicyOnExitCodesRequirement) bool {
switch requirement.Operator {
case batch.PodFailurePolicyOnExitCodesOpIn:
for _, value := range requirement.Values {
if value == exitCode {
return true
}
}
return false
case batch.PodFailurePolicyOnExitCodesOpNotIn:
for _, value := range requirement.Values {
if value == exitCode {
return false
}
}
return true
default:
return false
}
}

View File

@ -0,0 +1,707 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"testing"
batch "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
_ "k8s.io/kubernetes/pkg/apis/core/install"
"k8s.io/utils/pointer"
)
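// TestMatchPodFailurePolicy verifies the job failure message and the
// count-towards-backoffLimit decision returned by matchPodFailurePolicy for a
// range of policies and failed pod statuses.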
func TestMatchPodFailurePolicy(t *testing.T) {
validPodObjectMeta := metav1.ObjectMeta{
Namespace: "default",
Name: "mypod",
}
testCases := map[string]struct {
podFailurePolicy *batch.PodFailurePolicy
failedPod *v1.Pod
wantJobFailureMessage *string
wantCountFailed bool
}{
"unknown action for rule matching by exit codes - skip rule with unknown action": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: "UnknownAction",
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{2, 3},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 1"),
wantCountFailed: true,
},
"unknown action for rule matching by pod conditions - skip rule with unknown action": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: "UnkonwnAction",
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: false,
},
"unknown operator - rule with unknown action is skipped for onExitCodes": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: "UnknownOperator",
Values: []int32{1, 2},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{2, 3},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 1"),
wantCountFailed: true,
},
"no policy rules": {
podFailurePolicy: nil,
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"ignore rule matched for exit codes": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: false,
},
"FailJob rule matched for exit codes": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 2 matching FailJob rule at index 0"),
wantCountFailed: true,
},
"successful containers are skipped by the rules": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
Values: []int32{111},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
InitContainerStatuses: []v1.ContainerStatus{
{
Name: "init-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 0,
},
},
},
},
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 111,
},
},
},
{
Name: "suppport-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 0,
},
},
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"pod failure policy with NotIn operator and value 0": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
Values: []int32{0},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 1,
},
},
},
{
Name: "suppport-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 0,
},
},
},
},
},
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 1 matching FailJob rule at index 0"),
wantCountFailed: true,
},
"second jobfail rule matched for exit codes": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionCount,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{4, 5, 6},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 6,
},
},
},
},
},
},
wantJobFailureMessage: pointer.String("Container main-container for pod default/mypod failed with exit code 6 matching FailJob rule at index 1"),
wantCountFailed: true,
},
"count rule matched for exit codes": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionCount,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 2,
},
},
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"ignore rule matched for pod conditions": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: false,
},
"ignore rule matches by the status=False": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionFalse,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionFalse,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: false,
},
"ignore rule matches by the status=Unknown": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionUnknown,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionUnknown,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: false,
},
"ignore rule does not match when status for pattern is False, but actual True": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionFalse,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"ignore rule does not match when status for pattern is True, but actual False": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionFalse,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"default - do not match condition with status=False": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionFalse,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"job fail rule matched for pod conditions": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantJobFailureMessage: pointer.String("Pod default/mypod has condition DisruptionTarget matching FailJob rule at index 0"),
wantCountFailed: true,
},
"count rule matched for pod conditions": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionCount,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
"no rule matched": {
podFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionCount,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
Values: []int32{8},
},
},
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 2, 3},
},
},
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
Operator: batch.PodFailurePolicyOnExitCodesOpNotIn,
Values: []int32{5, 6, 7},
},
},
{
Action: batch.PodFailurePolicyActionCount,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.PodConditionType("ResourceLimitExceeded"),
Status: v1.ConditionTrue,
},
},
},
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
failedPod: &v1.Pod{
ObjectMeta: validPodObjectMeta,
Status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 32,
},
},
},
},
},
},
wantJobFailureMessage: nil,
wantCountFailed: true,
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
jobFailMessage, countFailed := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod)
if tc.wantJobFailureMessage == nil {
if jobFailMessage != nil {
t.Errorf("Unexpected job fail message. Got: %q", *jobFailMessage)
}
} else {
if jobFailMessage == nil {
t.Errorf("Missing job fail message. want: %q", *tc.wantJobFailureMessage)
} else if *tc.wantJobFailureMessage != *jobFailMessage {
t.Errorf("Unexpected job fail message. want: %q. got: %q", *tc.wantJobFailureMessage, *jobFailMessage)
}
}
if tc.wantCountFailed != countFailed {
t.Errorf("Unexpected count failed. want: %v. got: %v", tc.wantCountFailed, countFailed)
}
})
}
}
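
For readers following the table above, here is a minimal, self-contained sketch of the onPodConditions matching behaviour these cases exercise. It uses simplified stand-in types and an illustrative helper name (matchesPodConditions), not the actual matchPodFailurePolicy code: a pattern matches only when both the condition type and the condition status are equal, and a matching Ignore rule means the failure is not counted against the backoffLimit.

package main

import "fmt"

// Simplified stand-ins for the API types; the real code lives in the job
// controller and uses the batch and core/v1 types.
type podCondition struct {
	Type   string
	Status string
}

type onPodConditionsPattern struct {
	Type   string
	Status string
}

type rule struct {
	Action          string // "FailJob", "Ignore" or "Count"
	OnPodConditions []onPodConditionsPattern
}

// matchesPodConditions reports whether any pattern of the rule matches an
// actual pod condition. Both the type and the status must be equal, which is
// why the cases above where the pattern says False/Unknown but the pod
// reports True (or vice versa) fall through to the default counting.
func matchesPodConditions(r rule, conditions []podCondition) bool {
	for _, pattern := range r.OnPodConditions {
		for _, cond := range conditions {
			if cond.Type == pattern.Type && cond.Status == pattern.Status {
				return true
			}
		}
	}
	return false
}

func main() {
	ignoreRule := rule{
		Action:          "Ignore",
		OnPodConditions: []onPodConditionsPattern{{Type: "DisruptionTarget", Status: "True"}},
	}
	failedPod := []podCondition{{Type: "DisruptionTarget", Status: "True"}}
	// Matches, so the Ignore action applies and the failure is not counted
	// against the backoffLimit (wantCountFailed: false above).
	fmt.Println(matchesPodConditions(ignoreRule, failedPod)) // true
}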

View File

@ -448,6 +448,14 @@ const (
// Causes kubelet to no longer create legacy IPTables rules
IPTablesOwnershipCleanup featuregate.Feature = "IPTablesOwnershipCleanup"
// owner: @mimowo
// kep: http://kep.k8s.io/3329
// alpha: v1.25
//
// Allow users to specify handling of pod failures based on container exit codes
// and pod conditions.
JobPodFailurePolicy featuregate.Feature = "JobPodFailurePolicy"
// owner: @ahg
// beta: v1.23
//
@ -987,6 +995,8 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
IPTablesOwnershipCleanup: {Default: false, PreRelease: featuregate.Alpha},
JobPodFailurePolicy: {Default: false, PreRelease: featuregate.Alpha},
JobMutableNodeSchedulingDirectives: {Default: true, PreRelease: featuregate.Beta},
JobReadyPods: {Default: true, PreRelease: featuregate.Beta},
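
As the defaults above show, JobPodFailurePolicy ships as an alpha gate that is off by default, so the new field has no effect until the gate is explicitly enabled (for example with --feature-gates=JobPodFailurePolicy=true on the kube-apiserver and kube-controller-manager). A rough sketch of what the default table amounts to, using a plain map rather than the real featuregate machinery:

package main

import "fmt"

// Simplified stand-in for the registered defaults; the real code consults
// utilfeature.DefaultFeatureGate rather than a plain map.
var featureDefaults = map[string]bool{
	"JobPodFailurePolicy":                false, // alpha in v1.25, disabled by default
	"JobMutableNodeSchedulingDirectives": true,  // beta
	"JobReadyPods":                       true,  // beta
}

func main() {
	// Until the gate is turned on, the API strategy code strips
	// spec.podFailurePolicy, as shown further below.
	fmt.Println(featureDefaults["JobPodFailurePolicy"]) // false
}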

View File

@ -279,6 +279,10 @@ func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenA
"k8s.io/api/batch/v1.JobSpec": schema_k8sio_api_batch_v1_JobSpec(ref),
"k8s.io/api/batch/v1.JobStatus": schema_k8sio_api_batch_v1_JobStatus(ref),
"k8s.io/api/batch/v1.JobTemplateSpec": schema_k8sio_api_batch_v1_JobTemplateSpec(ref),
"k8s.io/api/batch/v1.PodFailurePolicy": schema_k8sio_api_batch_v1_PodFailurePolicy(ref),
"k8s.io/api/batch/v1.PodFailurePolicyOnExitCodesRequirement": schema_k8sio_api_batch_v1_PodFailurePolicyOnExitCodesRequirement(ref),
"k8s.io/api/batch/v1.PodFailurePolicyOnPodConditionsPattern": schema_k8sio_api_batch_v1_PodFailurePolicyOnPodConditionsPattern(ref),
"k8s.io/api/batch/v1.PodFailurePolicyRule": schema_k8sio_api_batch_v1_PodFailurePolicyRule(ref),
"k8s.io/api/batch/v1.UncountedTerminatedPods": schema_k8sio_api_batch_v1_UncountedTerminatedPods(ref),
"k8s.io/api/batch/v1beta1.CronJob": schema_k8sio_api_batch_v1beta1_CronJob(ref),
"k8s.io/api/batch/v1beta1.CronJobList": schema_k8sio_api_batch_v1beta1_CronJobList(ref),
@ -12994,6 +12998,12 @@ func schema_k8sio_api_batch_v1_JobSpec(ref common.ReferenceCallback) common.Open
Format: "int64",
},
},
"podFailurePolicy": {
SchemaProps: spec.SchemaProps{
Description: "Specifies the policy of handling failed pods. In particular, it allows to specify the set of actions and conditions which need to be satisfied to take the associated action. If empty, the default behaviour applies - the counter of failed pods, represented by the jobs's .status.failed field, is incremented and it is checked against the backoffLimit. This field cannot be used in combination with restartPolicy=OnFailure.\n\nThis field is alpha-level. To use this field, you must enable the `JobPodFailurePolicy` feature gate (disabled by default).",
Ref: ref("k8s.io/api/batch/v1.PodFailurePolicy"),
},
},
"backoffLimit": {
SchemaProps: spec.SchemaProps{
Description: "Specifies the number of retries before marking this job failed. Defaults to 6",
@ -13047,7 +13057,7 @@ func schema_k8sio_api_batch_v1_JobSpec(ref common.ReferenceCallback) common.Open
},
},
Dependencies: []string{
"k8s.io/api/core/v1.PodTemplateSpec", "k8s.io/apimachinery/pkg/apis/meta/v1.LabelSelector"},
"k8s.io/api/batch/v1.PodFailurePolicy", "k8s.io/api/core/v1.PodTemplateSpec", "k8s.io/apimachinery/pkg/apis/meta/v1.LabelSelector"},
}
}
@ -13169,6 +13179,169 @@ func schema_k8sio_api_batch_v1_JobTemplateSpec(ref common.ReferenceCallback) com
}
}
func schema_k8sio_api_batch_v1_PodFailurePolicy(ref common.ReferenceCallback) common.OpenAPIDefinition {
return common.OpenAPIDefinition{
Schema: spec.Schema{
SchemaProps: spec.SchemaProps{
Description: "PodFailurePolicy describes how failed pods influence the backoffLimit.",
Type: []string{"object"},
Properties: map[string]spec.Schema{
"rules": {
VendorExtensible: spec.VendorExtensible{
Extensions: spec.Extensions{
"x-kubernetes-list-type": "atomic",
},
},
SchemaProps: spec.SchemaProps{
Description: "A list of pod failure policy rules. The rules are evaluated in order. Once a rule matches a Pod failure, the remaining of the rules are ignored. When no rule matches the Pod failure, the default handling applies - the counter of pod failures is incremented and it is checked against the backoffLimit. At most 20 elements are allowed.",
Type: []string{"array"},
Items: &spec.SchemaOrArray{
Schema: &spec.Schema{
SchemaProps: spec.SchemaProps{
Default: map[string]interface{}{},
Ref: ref("k8s.io/api/batch/v1.PodFailurePolicyRule"),
},
},
},
},
},
},
Required: []string{"rules"},
},
},
Dependencies: []string{
"k8s.io/api/batch/v1.PodFailurePolicyRule"},
}
}
func schema_k8sio_api_batch_v1_PodFailurePolicyOnExitCodesRequirement(ref common.ReferenceCallback) common.OpenAPIDefinition {
return common.OpenAPIDefinition{
Schema: spec.Schema{
SchemaProps: spec.SchemaProps{
Description: "PodFailurePolicyOnExitCodesRequirement describes the requirement for handling a failed pod based on its container exit codes. In particular, it lookups the .state.terminated.exitCode for each app container and init container status, represented by the .status.containerStatuses and .status.initContainerStatuses fields in the Pod status, respectively. Containers completed with success (exit code 0) are excluded from the requirement check.",
Type: []string{"object"},
Properties: map[string]spec.Schema{
"containerName": {
SchemaProps: spec.SchemaProps{
Description: "Restricts the check for exit codes to the container with the specified name. When null, the rule applies to all containers. When specified, it should match one the container or initContainer names in the pod template.",
Type: []string{"string"},
Format: "",
},
},
"operator": {
SchemaProps: spec.SchemaProps{
Description: "Represents the relationship between the container exit code(s) and the specified values. Containers completed with success (exit code 0) are excluded from the requirement check. Possible values are: - In: the requirement is satisfied if at least one container exit code\n (might be multiple if there are multiple containers not restricted\n by the 'containerName' field) is in the set of specified values.\n- NotIn: the requirement is satisfied if at least one container exit code\n (might be multiple if there are multiple containers not restricted\n by the 'containerName' field) is not in the set of specified values.\nAdditional values are considered to be added in the future. Clients should react to an unknown operator by assuming the requirement is not satisfied.\n\nPossible enum values:\n - `\"In\"`\n - `\"NotIn\"`",
Default: "",
Type: []string{"string"},
Format: "",
Enum: []interface{}{"In", "NotIn"}},
},
"values": {
VendorExtensible: spec.VendorExtensible{
Extensions: spec.Extensions{
"x-kubernetes-list-type": "set",
},
},
SchemaProps: spec.SchemaProps{
Description: "Specifies the set of values. Each returned container exit code (might be multiple in case of multiple containers) is checked against this set of values with respect to the operator. The list of values must be ordered and must not contain duplicates. Value '0' cannot be used for the In operator. At least one element is required. At most 255 elements are allowed.",
Type: []string{"array"},
Items: &spec.SchemaOrArray{
Schema: &spec.Schema{
SchemaProps: spec.SchemaProps{
Default: 0,
Type: []string{"integer"},
Format: "int32",
},
},
},
},
},
},
Required: []string{"operator", "values"},
},
},
}
}
func schema_k8sio_api_batch_v1_PodFailurePolicyOnPodConditionsPattern(ref common.ReferenceCallback) common.OpenAPIDefinition {
return common.OpenAPIDefinition{
Schema: spec.Schema{
SchemaProps: spec.SchemaProps{
Description: "PodFailurePolicyOnPodConditionsPattern describes a pattern for matching an actual pod condition type.",
Type: []string{"object"},
Properties: map[string]spec.Schema{
"type": {
SchemaProps: spec.SchemaProps{
Description: "Specifies the required Pod condition type. To match a pod condition it is required that specified type equals the pod condition type.",
Default: "",
Type: []string{"string"},
Format: "",
},
},
"status": {
SchemaProps: spec.SchemaProps{
Description: "Specifies the required Pod condition status. To match a pod condition it is required that the specified status equals the pod condition status. Defaults to True.",
Default: "",
Type: []string{"string"},
Format: "",
},
},
},
Required: []string{"type", "status"},
},
},
}
}
func schema_k8sio_api_batch_v1_PodFailurePolicyRule(ref common.ReferenceCallback) common.OpenAPIDefinition {
return common.OpenAPIDefinition{
Schema: spec.Schema{
SchemaProps: spec.SchemaProps{
Description: "PodFailurePolicyRule describes how a pod failure is handled when the requirements are met. One of OnExitCodes and onPodConditions, but not both, can be used in each rule.",
Type: []string{"object"},
Properties: map[string]spec.Schema{
"action": {
SchemaProps: spec.SchemaProps{
Description: "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are: - FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.\n\nPossible enum values:\n - `\"Count\"` This is an action which might be taken on a pod failure - the pod failure is handled in the default way - the counter towards .backoffLimit, represented by the job's .status.failed field, is incremented.\n - `\"FailJob\"` This is an action which might be taken on a pod failure - mark the pod's job as Failed and terminate all running pods.\n - `\"Ignore\"` This is an action which might be taken on a pod failure - the counter towards .backoffLimit, represented by the job's .status.failed field, is not incremented and a replacement pod is created.",
Default: "",
Type: []string{"string"},
Format: "",
Enum: []interface{}{"Count", "FailJob", "Ignore"}},
},
"onExitCodes": {
SchemaProps: spec.SchemaProps{
Description: "Represents the requirement on the container exit codes.",
Ref: ref("k8s.io/api/batch/v1.PodFailurePolicyOnExitCodesRequirement"),
},
},
"onPodConditions": {
VendorExtensible: spec.VendorExtensible{
Extensions: spec.Extensions{
"x-kubernetes-list-type": "atomic",
},
},
SchemaProps: spec.SchemaProps{
Description: "Represents the requirement on the pod conditions. The requirement is represented as a list of pod condition patterns. The requirement is satisfied if at least one pattern matches an actual pod condition. At most 20 elements are allowed.",
Type: []string{"array"},
Items: &spec.SchemaOrArray{
Schema: &spec.Schema{
SchemaProps: spec.SchemaProps{
Default: map[string]interface{}{},
Ref: ref("k8s.io/api/batch/v1.PodFailurePolicyOnPodConditionsPattern"),
},
},
},
},
},
},
Required: []string{"action", "onPodConditions"},
},
},
Dependencies: []string{
"k8s.io/api/batch/v1.PodFailurePolicyOnExitCodesRequirement", "k8s.io/api/batch/v1.PodFailurePolicyOnPodConditionsPattern"},
}
}
func schema_k8sio_api_batch_v1_UncountedTerminatedPods(ref common.ReferenceCallback) common.OpenAPIDefinition {
return common.OpenAPIDefinition{
Schema: spec.Schema{

View File

@ -100,6 +100,9 @@ func (jobStrategy) PrepareForCreate(ctx context.Context, obj runtime.Object) {
} else {
dropJobTrackingAnnotation(job)
}
if !utilfeature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
job.Spec.PodFailurePolicy = nil
}
pod.DropDisabledTemplateFields(&job.Spec.Template, nil)
}
@ -133,6 +136,10 @@ func (jobStrategy) PrepareForUpdate(ctx context.Context, obj, old runtime.Object
dropJobTrackingAnnotation(newJob)
}
if !utilfeature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && oldJob.Spec.PodFailurePolicy == nil {
newJob.Spec.PodFailurePolicy = nil
}
pod.DropDisabledTemplateFields(&newJob.Spec.Template, &oldJob.Spec.Template)
// Any changes to the spec increment the generation number.
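
The two gate checks above follow the usual drop-disabled-fields pattern: on create the field is always cleared when the gate is off, while on update it is cleared only if the old object did not already use it, so data on existing Jobs survives the gate being switched off (matching the test expectations below). A minimal sketch of that pattern with illustrative stand-in types and a hypothetical helper name:

package main

import "fmt"

// Stand-ins for the real batch.JobSpec and batch.PodFailurePolicy types.
type PodFailurePolicy struct{ Rules []string }

type JobSpec struct{ PodFailurePolicy *PodFailurePolicy }

// dropDisabledPodFailurePolicy mirrors the gating above: on create
// (oldSpec == nil) the field is always cleared when the gate is off; on
// update it is kept whenever the stored object already uses it.
func dropDisabledPodFailurePolicy(gateEnabled bool, newSpec, oldSpec *JobSpec) {
	if gateEnabled {
		return
	}
	if oldSpec == nil || oldSpec.PodFailurePolicy == nil {
		newSpec.PodFailurePolicy = nil
	}
}

func main() {
	updated := &JobSpec{PodFailurePolicy: &PodFailurePolicy{}}
	existing := &JobSpec{PodFailurePolicy: &PodFailurePolicy{}}
	dropDisabledPodFailurePolicy(false, updated, existing)
	fmt.Println(updated.PodFailurePolicy != nil) // true: pre-existing policy preserved on update
}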

View File

@ -40,6 +40,244 @@ import (
var ignoreErrValueDetail = cmpopts.IgnoreFields(field.Error{}, "BadValue", "Detail")
// TestJobStrategy_PrepareForUpdate tests various scenarios for PrepareForUpdate
func TestJobStrategy_PrepareForUpdate(t *testing.T) {
validSelector := getValidLabelSelector()
validPodTemplateSpec := getValidPodTemplateSpecForSelector(validSelector)
podFailurePolicy := &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
ContainerName: pointer.String("container-name"),
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1},
},
},
},
}
updatedPodFailurePolicy := &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
ContainerName: pointer.String("updated-container-name"),
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{2},
},
},
},
}
cases := map[string]struct {
enableJobPodFailurePolicy bool
job batch.Job
updatedJob batch.Job
wantJob batch.Job
}{
"update job with a new field; updated when JobPodFailurePolicy enabled": {
enableJobPodFailurePolicy: true,
job: batch.Job{
ObjectMeta: getValidObjectMeta(0),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: nil,
},
},
updatedJob: batch.Job{
ObjectMeta: getValidObjectMeta(0),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: updatedPodFailurePolicy,
},
},
wantJob: batch.Job{
ObjectMeta: getValidObjectMeta(1),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: updatedPodFailurePolicy,
},
},
},
"update job with a new field; not updated when JobPodFailurePolicy disabled": {
enableJobPodFailurePolicy: false,
job: batch.Job{
ObjectMeta: getValidObjectMeta(0),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: nil,
},
},
updatedJob: batch.Job{
ObjectMeta: getValidObjectMeta(0),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: updatedPodFailurePolicy,
},
},
wantJob: batch.Job{
ObjectMeta: getValidObjectMeta(0),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: nil,
},
},
},
"update pre-existing field; updated when JobPodFailurePolicy enabled": {
enableJobPodFailurePolicy: true,
job: batch.Job{
ObjectMeta: getValidObjectMeta(0),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: podFailurePolicy,
},
},
updatedJob: batch.Job{
ObjectMeta: getValidObjectMeta(0),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: updatedPodFailurePolicy,
},
},
wantJob: batch.Job{
ObjectMeta: getValidObjectMeta(1),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: updatedPodFailurePolicy,
},
},
},
"update pre-existing field; updated when JobPodFailurePolicy disabled": {
enableJobPodFailurePolicy: false,
job: batch.Job{
ObjectMeta: getValidObjectMeta(0),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: podFailurePolicy,
},
},
updatedJob: batch.Job{
ObjectMeta: getValidObjectMeta(0),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: updatedPodFailurePolicy,
},
},
wantJob: batch.Job{
ObjectMeta: getValidObjectMeta(1),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: updatedPodFailurePolicy,
},
},
},
}
for name, tc := range cases {
t.Run(name, func(t *testing.T) {
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)()
ctx := genericapirequest.NewDefaultContext()
Strategy.PrepareForUpdate(ctx, &tc.updatedJob, &tc.job)
if diff := cmp.Diff(tc.wantJob, tc.updatedJob); diff != "" {
t.Errorf("Job pod failure policy (-want,+got):\n%s", diff)
}
})
}
}
// TestJobStrategy_PrepareForCreate tests various scenarios for PrepareForCreate
func TestJobStrategy_PrepareForCreate(t *testing.T) {
validSelector := getValidLabelSelector()
validPodTemplateSpec := getValidPodTemplateSpecForSelector(validSelector)
podFailurePolicy := &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionFailJob,
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
ContainerName: pointer.String("container-name"),
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1},
},
},
},
}
cases := map[string]struct {
enableJobPodFailurePolicy bool
job batch.Job
wantJob batch.Job
}{
"create job with a new field; JobPodFailurePolicy enabled": {
enableJobPodFailurePolicy: true,
job: batch.Job{
ObjectMeta: getValidObjectMeta(0),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: podFailurePolicy,
},
},
wantJob: batch.Job{
ObjectMeta: getValidObjectMetaWithAnnotations(1, map[string]string{batchv1.JobTrackingFinalizer: ""}),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: podFailurePolicy,
},
},
},
"create job with a new field; JobPodFailurePolicy disabled": {
enableJobPodFailurePolicy: false,
job: batch.Job{
ObjectMeta: getValidObjectMeta(0),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: podFailurePolicy,
},
},
wantJob: batch.Job{
ObjectMeta: getValidObjectMetaWithAnnotations(1, map[string]string{batchv1.JobTrackingFinalizer: ""}),
Spec: batch.JobSpec{
Selector: validSelector,
Template: validPodTemplateSpec,
PodFailurePolicy: nil,
},
},
},
}
for name, tc := range cases {
t.Run(name, func(t *testing.T) {
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)()
ctx := genericapirequest.NewDefaultContext()
Strategy.PrepareForCreate(ctx, &tc.job)
if diff := cmp.Diff(tc.wantJob, tc.job); diff != "" {
t.Errorf("Job pod failure policy (-want,+got):\n%s", diff)
}
})
}
}
// TODO(#111514): refactor by splitting into dedicated test functions
func TestJobStrategy(t *testing.T) {
cases := map[string]struct {
trackingWithFinalizersEnabled bool
@ -589,3 +827,35 @@ func TestSelectableFieldLabelConversions(t *testing.T) {
func completionModePtr(m batch.CompletionMode) *batch.CompletionMode {
return &m
}
func getValidObjectMeta(generation int64) metav1.ObjectMeta {
return getValidObjectMetaWithAnnotations(generation, nil)
}
func getValidObjectMetaWithAnnotations(generation int64, annotations map[string]string) metav1.ObjectMeta {
return metav1.ObjectMeta{
Name: "myjob",
Namespace: metav1.NamespaceDefault,
Generation: generation,
Annotations: annotations,
}
}
func getValidLabelSelector() *metav1.LabelSelector {
return &metav1.LabelSelector{
MatchLabels: map[string]string{"a": "b"},
}
}
func getValidPodTemplateSpecForSelector(validSelector *metav1.LabelSelector) api.PodTemplateSpec {
return api.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: validSelector.MatchLabels,
},
Spec: api.PodSpec{
RestartPolicy: api.RestartPolicyOnFailure,
DNSPolicy: api.DNSClusterFirst,
Containers: []api.Container{{Name: "abc", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: api.TerminationMessageReadFile}},
},
}
}

File diff suppressed because it is too large Load Diff

View File

@ -205,6 +205,19 @@ message JobSpec {
// +optional
optional int64 activeDeadlineSeconds = 3;
// Specifies the policy of handling failed pods. In particular, it allows
// specifying the set of actions and conditions which need to be
// satisfied to take the associated action.
// If empty, the default behaviour applies - the counter of failed pods,
// represented by the job's .status.failed field, is incremented and it is
// checked against the backoffLimit. This field cannot be used in combination
// with restartPolicy=OnFailure.
//
// This field is alpha-level. To use this field, you must enable the
// `JobPodFailurePolicy` feature gate (disabled by default).
// +optional
optional PodFailurePolicy podFailurePolicy = 11;
// Specifies the number of retries before marking this job failed.
// Defaults to 6
// +optional
@ -371,6 +384,92 @@ message JobTemplateSpec {
optional JobSpec spec = 2;
}
// PodFailurePolicy describes how failed pods influence the backoffLimit.
message PodFailurePolicy {
// A list of pod failure policy rules. The rules are evaluated in order.
// Once a rule matches a Pod failure, the remaining rules are ignored.
// When no rule matches the Pod failure, the default handling applies - the
// counter of pod failures is incremented and it is checked against
// the backoffLimit. At most 20 elements are allowed.
// +listType=atomic
repeated PodFailurePolicyRule rules = 1;
}
// PodFailurePolicyOnExitCodesRequirement describes the requirement for handling
// a failed pod based on its container exit codes. In particular, it looks up the
// .state.terminated.exitCode for each app container and init container status,
// represented by the .status.containerStatuses and .status.initContainerStatuses
// fields in the Pod status, respectively. Containers completed with success
// (exit code 0) are excluded from the requirement check.
message PodFailurePolicyOnExitCodesRequirement {
// Restricts the check for exit codes to the container with the
// specified name. When null, the rule applies to all containers.
// When specified, it should match one of the container or initContainer
// names in the pod template.
// +optional
optional string containerName = 1;
// Represents the relationship between the container exit code(s) and the
// specified values. Containers completed with success (exit code 0) are
// excluded from the requirement check. Possible values are:
// - In: the requirement is satisfied if at least one container exit code
// (might be multiple if there are multiple containers not restricted
// by the 'containerName' field) is in the set of specified values.
// - NotIn: the requirement is satisfied if at least one container exit code
// (might be multiple if there are multiple containers not restricted
// by the 'containerName' field) is not in the set of specified values.
// Additional values are considered to be added in the future. Clients should
// react to an unknown operator by assuming the requirement is not satisfied.
optional string operator = 2;
// Specifies the set of values. Each returned container exit code (might be
// multiple in case of multiple containers) is checked against this set of
// values with respect to the operator. The list of values must be ordered
// and must not contain duplicates. Value '0' cannot be used for the In operator.
// At least one element is required. At most 255 elements are allowed.
// +listType=set
repeated int32 values = 3;
}
// PodFailurePolicyOnPodConditionsPattern describes a pattern for matching
// an actual pod condition type.
message PodFailurePolicyOnPodConditionsPattern {
// Specifies the required Pod condition type. To match a pod condition
// it is required that the specified type equals the pod condition type.
optional string type = 1;
// Specifies the required Pod condition status. To match a pod condition
// it is required that the specified status equals the pod condition status.
// Defaults to True.
optional string status = 2;
}
// PodFailurePolicyRule describes how a pod failure is handled when the requirements are met.
// One of OnExitCodes and onPodConditions, but not both, can be used in each rule.
message PodFailurePolicyRule {
// Specifies the action taken on a pod failure when the requirements are satisfied.
// Possible values are:
// - FailJob: indicates that the pod's job is marked as Failed and all
// running pods are terminated.
// - Ignore: indicates that the counter towards the .backoffLimit is not
// incremented and a replacement pod is created.
// - Count: indicates that the pod is handled in the default way - the
// counter towards the .backoffLimit is incremented.
// Additional values are considered to be added in the future. Clients should
// react to an unknown action by skipping the rule.
optional string action = 1;
// Represents the requirement on the container exit codes.
// +optional
optional PodFailurePolicyOnExitCodesRequirement onExitCodes = 2;
// Represents the requirement on the pod conditions. The requirement is represented
// as a list of pod condition patterns. The requirement is satisfied if at
// least one pattern matches an actual pod condition. At most 20 elements are allowed.
// +listType=atomic
repeated PodFailurePolicyOnPodConditionsPattern onPodConditions = 3;
}
// UncountedTerminatedPods holds UIDs of Pods that have terminated but haven't
// been accounted in Job status counters.
message UncountedTerminatedPods {

View File

@ -87,6 +87,120 @@ const (
IndexedCompletion CompletionMode = "Indexed"
)
// PodFailurePolicyAction specifies how a Pod failure is handled.
// +enum
type PodFailurePolicyAction string
const (
// This is an action which might be taken on a pod failure - mark the
// pod's job as Failed and terminate all running pods.
PodFailurePolicyActionFailJob PodFailurePolicyAction = "FailJob"
// This is an action which might be taken on a pod failure - the counter towards
// .backoffLimit, represented by the job's .status.failed field, is not
// incremented and a replacement pod is created.
PodFailurePolicyActionIgnore PodFailurePolicyAction = "Ignore"
// This is an action which might be taken on a pod failure - the pod failure
// is handled in the default way - the counter towards .backoffLimit,
// represented by the job's .status.failed field, is incremented.
PodFailurePolicyActionCount PodFailurePolicyAction = "Count"
)
// +enum
type PodFailurePolicyOnExitCodesOperator string
const (
PodFailurePolicyOnExitCodesOpIn PodFailurePolicyOnExitCodesOperator = "In"
PodFailurePolicyOnExitCodesOpNotIn PodFailurePolicyOnExitCodesOperator = "NotIn"
)
// PodFailurePolicyOnExitCodesRequirement describes the requirement for handling
// a failed pod based on its container exit codes. In particular, it looks up the
// .state.terminated.exitCode for each app container and init container status,
// represented by the .status.containerStatuses and .status.initContainerStatuses
// fields in the Pod status, respectively. Containers completed with success
// (exit code 0) are excluded from the requirement check.
type PodFailurePolicyOnExitCodesRequirement struct {
// Restricts the check for exit codes to the container with the
// specified name. When null, the rule applies to all containers.
// When specified, it should match one of the container or initContainer
// names in the pod template.
// +optional
ContainerName *string `json:"containerName" protobuf:"bytes,1,opt,name=containerName"`
// Represents the relationship between the container exit code(s) and the
// specified values. Containers completed with success (exit code 0) are
// excluded from the requirement check. Possible values are:
// - In: the requirement is satisfied if at least one container exit code
// (might be multiple if there are multiple containers not restricted
// by the 'containerName' field) is in the set of specified values.
// - NotIn: the requirement is satisfied if at least one container exit code
// (might be multiple if there are multiple containers not restricted
// by the 'containerName' field) is not in the set of specified values.
// Additional values are considered to be added in the future. Clients should
// react to an unknown operator by assuming the requirement is not satisfied.
Operator PodFailurePolicyOnExitCodesOperator `json:"operator" protobuf:"bytes,2,req,name=operator"`
// Specifies the set of values. Each returned container exit code (might be
// multiple in case of multiple containers) is checked against this set of
// values with respect to the operator. The list of values must be ordered
// and must not contain duplicates. Value '0' cannot be used for the In operator.
// At least one element is required. At most 255 elements are allowed.
// +listType=set
Values []int32 `json:"values" protobuf:"varint,3,rep,name=values"`
}
// PodFailurePolicyOnPodConditionsPattern describes a pattern for matching
// an actual pod condition type.
type PodFailurePolicyOnPodConditionsPattern struct {
// Specifies the required Pod condition type. To match a pod condition
// it is required that the specified type equals the pod condition type.
Type corev1.PodConditionType `json:"type" protobuf:"bytes,1,req,name=type"`
// Specifies the required Pod condition status. To match a pod condition
// it is required that the specified status equals the pod condition status.
// Defaults to True.
Status corev1.ConditionStatus `json:"status" protobuf:"bytes,2,req,name=status"`
}
// PodFailurePolicyRule describes how a pod failure is handled when the requirements are met.
// One of OnExitCodes and onPodConditions, but not both, can be used in each rule.
type PodFailurePolicyRule struct {
// Specifies the action taken on a pod failure when the requirements are satisfied.
// Possible values are:
// - FailJob: indicates that the pod's job is marked as Failed and all
// running pods are terminated.
// - Ignore: indicates that the counter towards the .backoffLimit is not
// incremented and a replacement pod is created.
// - Count: indicates that the pod is handled in the default way - the
// counter towards the .backoffLimit is incremented.
// Additional values are considered to be added in the future. Clients should
// react to an unknown action by skipping the rule.
Action PodFailurePolicyAction `json:"action" protobuf:"bytes,1,req,name=action"`
// Represents the requirement on the container exit codes.
// +optional
OnExitCodes *PodFailurePolicyOnExitCodesRequirement `json:"onExitCodes" protobuf:"bytes,2,opt,name=onExitCodes"`
// Represents the requirement on the pod conditions. The requirement is represented
// as a list of pod condition patterns. The requirement is satisfied if at
// least one pattern matches an actual pod condition. At most 20 elements are allowed.
// +listType=atomic
OnPodConditions []PodFailurePolicyOnPodConditionsPattern `json:"onPodConditions" protobuf:"bytes,3,opt,name=onPodConditions"`
}
// PodFailurePolicy describes how failed pods influence the backoffLimit.
type PodFailurePolicy struct {
// A list of pod failure policy rules. The rules are evaluated in order.
// Once a rule matches a Pod failure, the remaining rules are ignored.
// When no rule matches the Pod failure, the default handling applies - the
// counter of pod failures is incremented and it is checked against
// the backoffLimit. At most 20 elements are allowed.
// +listType=atomic
Rules []PodFailurePolicyRule `json:"rules" protobuf:"bytes,1,opt,name=rules"`
}
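
For illustration, a small sketch of how a client could populate this field using the k8s.io/api/batch/v1 types introduced here, assuming a module version that ships them (v1.25 or later); the container name, exit code, and condition values are hypothetical:

package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/utils/pointer"
)

func main() {
	spec := batchv1.JobSpec{
		PodFailurePolicy: &batchv1.PodFailurePolicy{
			Rules: []batchv1.PodFailurePolicyRule{
				{
					// Fail the whole Job when the "main" container exits with code 42.
					Action: batchv1.PodFailurePolicyActionFailJob,
					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
						ContainerName: pointer.String("main"),
						Operator:      batchv1.PodFailurePolicyOnExitCodesOpIn,
						Values:        []int32{42},
					},
				},
				{
					// Do not count failures caused by node disruption; this is the
					// condition type used by the tests earlier in this change.
					Action: batchv1.PodFailurePolicyActionIgnore,
					OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
						{Type: corev1.PodConditionType("DisruptionTarget"), Status: corev1.ConditionTrue},
					},
				},
			},
		},
	}
	fmt.Println(len(spec.PodFailurePolicy.Rules)) // 2
}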
// JobSpec describes how the job execution will look like.
type JobSpec struct {
@ -115,6 +229,19 @@ type JobSpec struct {
// +optional
ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty" protobuf:"varint,3,opt,name=activeDeadlineSeconds"`
// Specifies the policy of handling failed pods. In particular, it allows
// specifying the set of actions and conditions which need to be
// satisfied to take the associated action.
// If empty, the default behaviour applies - the counter of failed pods,
// represented by the job's .status.failed field, is incremented and it is
// checked against the backoffLimit. This field cannot be used in combination
// with restartPolicy=OnFailure.
//
// This field is alpha-level. To use this field, you must enable the
// `JobPodFailurePolicy` feature gate (disabled by default).
// +optional
PodFailurePolicy *PodFailurePolicy `json:"podFailurePolicy,omitempty" protobuf:"bytes,11,opt,name=podFailurePolicy"`
// Specifies the number of retries before marking this job failed.
// Defaults to 6
// +optional
@ -297,6 +424,9 @@ const (
JobComplete JobConditionType = "Complete"
// JobFailed means the job has failed its execution.
JobFailed JobConditionType = "Failed"
// FailureTarget means the job is about to fail its execution.
// The constant is to be renamed once the name is accepted within the KEP-3329.
AlphaNoCompatGuaranteeJobFailureTarget JobConditionType = "FailureTarget"
)
// JobCondition describes current state of a job.

View File

@ -115,6 +115,7 @@ var map_JobSpec = map[string]string{
"parallelism": "Specifies the maximum desired number of pods the job should run at any given time. The actual number of pods running in steady state will be less than this number when ((.spec.completions - .status.successful) < .spec.parallelism), i.e. when the work left to do is less than max parallelism. More info: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/",
"completions": "Specifies the desired number of successfully finished pods the job should be run with. Setting to nil means that the success of any pod signals the success of all pods, and allows parallelism to have any positive value. Setting to 1 means that parallelism is limited to 1 and the success of that pod signals the success of the job. More info: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/",
"activeDeadlineSeconds": "Specifies the duration in seconds relative to the startTime that the job may be continuously active before the system tries to terminate it; value must be positive integer. If a Job is suspended (at creation or through an update), this timer will effectively be stopped and reset when the Job is resumed again.",
"podFailurePolicy": "Specifies the policy of handling failed pods. In particular, it allows to specify the set of actions and conditions which need to be satisfied to take the associated action. If empty, the default behaviour applies - the counter of failed pods, represented by the jobs's .status.failed field, is incremented and it is checked against the backoffLimit. This field cannot be used in combination with restartPolicy=OnFailure.\n\nThis field is alpha-level. To use this field, you must enable the `JobPodFailurePolicy` feature gate (disabled by default).",
"backoffLimit": "Specifies the number of retries before marking this job failed. Defaults to 6",
"selector": "A label query over pods that should match the pod count. Normally, the system sets this field for you. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors",
"manualSelector": "manualSelector controls generation of pod labels and pod selectors. Leave `manualSelector` unset unless you are certain what you are doing. When false or unset, the system pick labels unique to this job and appends those labels to the pod template. When true, the user is responsible for picking unique labels and specifying the selector. Failure to pick a unique label may cause this and other jobs to not function correctly. However, You may see `manualSelector=true` in jobs that were created with the old `extensions/v1beta1` API. More info: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#specifying-your-own-pod-selector",
@ -155,6 +156,47 @@ func (JobTemplateSpec) SwaggerDoc() map[string]string {
return map_JobTemplateSpec
}
var map_PodFailurePolicy = map[string]string{
"": "PodFailurePolicy describes how failed pods influence the backoffLimit.",
"rules": "A list of pod failure policy rules. The rules are evaluated in order. Once a rule matches a Pod failure, the remaining of the rules are ignored. When no rule matches the Pod failure, the default handling applies - the counter of pod failures is incremented and it is checked against the backoffLimit. At most 20 elements are allowed.",
}
func (PodFailurePolicy) SwaggerDoc() map[string]string {
return map_PodFailurePolicy
}
var map_PodFailurePolicyOnExitCodesRequirement = map[string]string{
"": "PodFailurePolicyOnExitCodesRequirement describes the requirement for handling a failed pod based on its container exit codes. In particular, it lookups the .state.terminated.exitCode for each app container and init container status, represented by the .status.containerStatuses and .status.initContainerStatuses fields in the Pod status, respectively. Containers completed with success (exit code 0) are excluded from the requirement check.",
"containerName": "Restricts the check for exit codes to the container with the specified name. When null, the rule applies to all containers. When specified, it should match one the container or initContainer names in the pod template.",
"operator": "Represents the relationship between the container exit code(s) and the specified values. Containers completed with success (exit code 0) are excluded from the requirement check. Possible values are: - In: the requirement is satisfied if at least one container exit code\n (might be multiple if there are multiple containers not restricted\n by the 'containerName' field) is in the set of specified values.\n- NotIn: the requirement is satisfied if at least one container exit code\n (might be multiple if there are multiple containers not restricted\n by the 'containerName' field) is not in the set of specified values.\nAdditional values are considered to be added in the future. Clients should react to an unknown operator by assuming the requirement is not satisfied.",
"values": "Specifies the set of values. Each returned container exit code (might be multiple in case of multiple containers) is checked against this set of values with respect to the operator. The list of values must be ordered and must not contain duplicates. Value '0' cannot be used for the In operator. At least one element is required. At most 255 elements are allowed.",
}
func (PodFailurePolicyOnExitCodesRequirement) SwaggerDoc() map[string]string {
return map_PodFailurePolicyOnExitCodesRequirement
}
var map_PodFailurePolicyOnPodConditionsPattern = map[string]string{
"": "PodFailurePolicyOnPodConditionsPattern describes a pattern for matching an actual pod condition type.",
"type": "Specifies the required Pod condition type. To match a pod condition it is required that specified type equals the pod condition type.",
"status": "Specifies the required Pod condition status. To match a pod condition it is required that the specified status equals the pod condition status. Defaults to True.",
}
func (PodFailurePolicyOnPodConditionsPattern) SwaggerDoc() map[string]string {
return map_PodFailurePolicyOnPodConditionsPattern
}
var map_PodFailurePolicyRule = map[string]string{
"": "PodFailurePolicyRule describes how a pod failure is handled when the requirements are met. One of OnExitCodes and onPodConditions, but not both, can be used in each rule.",
"action": "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are: - FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.",
"onExitCodes": "Represents the requirement on the container exit codes.",
"onPodConditions": "Represents the requirement on the pod conditions. The requirement is represented as a list of pod condition patterns. The requirement is satisfied if at least one pattern matches an actual pod condition. At most 20 elements are allowed.",
}
func (PodFailurePolicyRule) SwaggerDoc() map[string]string {
return map_PodFailurePolicyRule
}
var map_UncountedTerminatedPods = map[string]string{
"": "UncountedTerminatedPods holds UIDs of Pods that have terminated but haven't been accounted in Job status counters.",
"succeeded": "Succeeded holds UIDs of succeeded Pods.",

View File

@ -257,6 +257,11 @@ func (in *JobSpec) DeepCopyInto(out *JobSpec) {
*out = new(int64)
**out = **in
}
if in.PodFailurePolicy != nil {
in, out := &in.PodFailurePolicy, &out.PodFailurePolicy
*out = new(PodFailurePolicy)
(*in).DeepCopyInto(*out)
}
if in.BackoffLimit != nil {
in, out := &in.BackoffLimit, &out.BackoffLimit
*out = new(int32)
@ -360,6 +365,97 @@ func (in *JobTemplateSpec) DeepCopy() *JobTemplateSpec {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PodFailurePolicy) DeepCopyInto(out *PodFailurePolicy) {
*out = *in
if in.Rules != nil {
in, out := &in.Rules, &out.Rules
*out = make([]PodFailurePolicyRule, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodFailurePolicy.
func (in *PodFailurePolicy) DeepCopy() *PodFailurePolicy {
if in == nil {
return nil
}
out := new(PodFailurePolicy)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PodFailurePolicyOnExitCodesRequirement) DeepCopyInto(out *PodFailurePolicyOnExitCodesRequirement) {
*out = *in
if in.ContainerName != nil {
in, out := &in.ContainerName, &out.ContainerName
*out = new(string)
**out = **in
}
if in.Values != nil {
in, out := &in.Values, &out.Values
*out = make([]int32, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodFailurePolicyOnExitCodesRequirement.
func (in *PodFailurePolicyOnExitCodesRequirement) DeepCopy() *PodFailurePolicyOnExitCodesRequirement {
if in == nil {
return nil
}
out := new(PodFailurePolicyOnExitCodesRequirement)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PodFailurePolicyOnPodConditionsPattern) DeepCopyInto(out *PodFailurePolicyOnPodConditionsPattern) {
*out = *in
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodFailurePolicyOnPodConditionsPattern.
func (in *PodFailurePolicyOnPodConditionsPattern) DeepCopy() *PodFailurePolicyOnPodConditionsPattern {
if in == nil {
return nil
}
out := new(PodFailurePolicyOnPodConditionsPattern)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PodFailurePolicyRule) DeepCopyInto(out *PodFailurePolicyRule) {
*out = *in
if in.OnExitCodes != nil {
in, out := &in.OnExitCodes, &out.OnExitCodes
*out = new(PodFailurePolicyOnExitCodesRequirement)
(*in).DeepCopyInto(*out)
}
if in.OnPodConditions != nil {
in, out := &in.OnPodConditions, &out.OnPodConditions
*out = make([]PodFailurePolicyOnPodConditionsPattern, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodFailurePolicyRule.
func (in *PodFailurePolicyRule) DeepCopy() *PodFailurePolicyRule {
if in == nil {
return nil
}
out := new(PodFailurePolicyRule)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *UncountedTerminatedPods) DeepCopyInto(out *UncountedTerminatedPods) {
*out = *in

View File

@ -96,6 +96,26 @@
"parallelism": 1,
"completions": 2,
"activeDeadlineSeconds": 3,
"podFailurePolicy": {
"rules": [
{
"action": "actionValue",
"onExitCodes": {
"containerName": "containerNameValue",
"operator": "operatorValue",
"values": [
3
]
},
"onPodConditions": [
{
"type": "typeValue",
"status": "statusValue"
}
]
}
]
},
"backoffLimit": 7,
"selector": {
"matchLabels": {

View File

@ -75,6 +75,17 @@ spec:
completions: 2
manualSelector: true
parallelism: 1
podFailurePolicy:
rules:
- action: actionValue
onExitCodes:
containerName: containerNameValue
operator: operatorValue
values:
- 3
onPodConditions:
- status: statusValue
type: typeValue
selector:
matchExpressions:
- key: keyValue

View File

@ -47,6 +47,26 @@
"parallelism": 1,
"completions": 2,
"activeDeadlineSeconds": 3,
"podFailurePolicy": {
"rules": [
{
"action": "actionValue",
"onExitCodes": {
"containerName": "containerNameValue",
"operator": "operatorValue",
"values": [
3
]
},
"onPodConditions": [
{
"type": "typeValue",
"status": "statusValue"
}
]
}
]
},
"backoffLimit": 7,
"selector": {
"matchLabels": {

View File

@ -39,6 +39,17 @@ spec:
completions: 2
manualSelector: true
parallelism: 1
podFailurePolicy:
rules:
- action: actionValue
onExitCodes:
containerName: containerNameValue
operator: operatorValue
values:
- 3
onPodConditions:
- status: statusValue
type: typeValue
selector:
matchExpressions:
- key: keyValue

View File

@ -96,6 +96,26 @@
"parallelism": 1,
"completions": 2,
"activeDeadlineSeconds": 3,
"podFailurePolicy": {
"rules": [
{
"action": "actionValue",
"onExitCodes": {
"containerName": "containerNameValue",
"operator": "operatorValue",
"values": [
3
]
},
"onPodConditions": [
{
"type": "typeValue",
"status": "statusValue"
}
]
}
]
},
"backoffLimit": 7,
"selector": {
"matchLabels": {

View File

@ -75,6 +75,17 @@ spec:
completions: 2
manualSelector: true
parallelism: 1
podFailurePolicy:
rules:
- action: actionValue
onExitCodes:
containerName: containerNameValue
operator: operatorValue
values:
- 3
onPodConditions:
- status: statusValue
type: typeValue
selector:
matchExpressions:
- key: keyValue

View File

@ -90,6 +90,26 @@
"parallelism": 1,
"completions": 2,
"activeDeadlineSeconds": 3,
"podFailurePolicy": {
"rules": [
{
"action": "actionValue",
"onExitCodes": {
"containerName": "containerNameValue",
"operator": "operatorValue",
"values": [
3
]
},
"onPodConditions": [
{
"type": "typeValue",
"status": "statusValue"
}
]
}
]
},
"backoffLimit": 7,
"selector": {
"matchLabels": {

View File

@ -72,6 +72,17 @@ template:
completions: 2
manualSelector: true
parallelism: 1
podFailurePolicy:
rules:
- action: actionValue
onExitCodes:
containerName: containerNameValue
operator: operatorValue
values:
- 3
onPodConditions:
- status: statusValue
type: typeValue
selector:
matchExpressions:
- key: keyValue

View File

@ -42,9 +42,9 @@ func (v *Error) Error() string {
return fmt.Sprintf("%s: %s", v.Field, v.ErrorBody())
}
type omitValueType struct{}
type OmitValueType struct{}
var omitValue = omitValueType{}
var omitValue = OmitValueType{}
// ErrorBody returns the error message without the field name. This is useful
// for building nice-looking higher-level error reporting.

View File

@ -21,7 +21,7 @@ package v1
import (
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/client-go/applyconfigurations/core/v1"
v1 "k8s.io/client-go/applyconfigurations/meta/v1"
metav1 "k8s.io/client-go/applyconfigurations/meta/v1"
)
// JobSpecApplyConfiguration represents a declarative configuration of the JobSpec type for use
@ -30,8 +30,9 @@ type JobSpecApplyConfiguration struct {
Parallelism *int32 `json:"parallelism,omitempty"`
Completions *int32 `json:"completions,omitempty"`
ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"`
PodFailurePolicy *PodFailurePolicyApplyConfiguration `json:"podFailurePolicy,omitempty"`
BackoffLimit *int32 `json:"backoffLimit,omitempty"`
Selector *v1.LabelSelectorApplyConfiguration `json:"selector,omitempty"`
Selector *metav1.LabelSelectorApplyConfiguration `json:"selector,omitempty"`
ManualSelector *bool `json:"manualSelector,omitempty"`
Template *corev1.PodTemplateSpecApplyConfiguration `json:"template,omitempty"`
TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"`
@ -69,6 +70,14 @@ func (b *JobSpecApplyConfiguration) WithActiveDeadlineSeconds(value int64) *JobS
return b
}
// WithPodFailurePolicy sets the PodFailurePolicy field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the PodFailurePolicy field is set to the value of the last call.
func (b *JobSpecApplyConfiguration) WithPodFailurePolicy(value *PodFailurePolicyApplyConfiguration) *JobSpecApplyConfiguration {
b.PodFailurePolicy = value
return b
}
// WithBackoffLimit sets the BackoffLimit field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the BackoffLimit field is set to the value of the last call.
@ -80,7 +89,7 @@ func (b *JobSpecApplyConfiguration) WithBackoffLimit(value int32) *JobSpecApplyC
// WithSelector sets the Selector field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the Selector field is set to the value of the last call.
func (b *JobSpecApplyConfiguration) WithSelector(value *v1.LabelSelectorApplyConfiguration) *JobSpecApplyConfiguration {
func (b *JobSpecApplyConfiguration) WithSelector(value *metav1.LabelSelectorApplyConfiguration) *JobSpecApplyConfiguration {
b.Selector = value
return b
}

View File

@ -0,0 +1,44 @@
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by applyconfiguration-gen. DO NOT EDIT.
package v1
// PodFailurePolicyApplyConfiguration represents a declarative configuration of the PodFailurePolicy type for use
// with apply.
type PodFailurePolicyApplyConfiguration struct {
Rules []PodFailurePolicyRuleApplyConfiguration `json:"rules,omitempty"`
}
// PodFailurePolicyApplyConfiguration constructs a declarative configuration of the PodFailurePolicy type for use with
// apply.
func PodFailurePolicy() *PodFailurePolicyApplyConfiguration {
return &PodFailurePolicyApplyConfiguration{}
}
// WithRules adds the given value to the Rules field in the declarative configuration
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, values provided by each call will be appended to the Rules field.
func (b *PodFailurePolicyApplyConfiguration) WithRules(values ...*PodFailurePolicyRuleApplyConfiguration) *PodFailurePolicyApplyConfiguration {
for i := range values {
if values[i] == nil {
panic("nil value passed to WithRules")
}
b.Rules = append(b.Rules, *values[i])
}
return b
}
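
A hypothetical usage sketch of the new builders for composing a server-side-apply configuration, assuming the generated constructors (JobSpec, PodFailurePolicy, PodFailurePolicyRule, PodFailurePolicyOnExitCodesRequirement) and their With* setters follow the usual applyconfiguration-gen naming shown in these files:

package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	batchv1ac "k8s.io/client-go/applyconfigurations/batch/v1"
)

func main() {
	// Compose spec.podFailurePolicy for an apply request: fail the Job when
	// the (hypothetical) "main" container exits with code 42.
	spec := batchv1ac.JobSpec().
		WithBackoffLimit(6).
		WithPodFailurePolicy(batchv1ac.PodFailurePolicy().
			WithRules(batchv1ac.PodFailurePolicyRule().
				WithAction(batchv1.PodFailurePolicyActionFailJob).
				WithOnExitCodes(batchv1ac.PodFailurePolicyOnExitCodesRequirement().
					WithContainerName("main").
					WithOperator(batchv1.PodFailurePolicyOnExitCodesOpIn).
					WithValues(42))))
	fmt.Println(*spec.PodFailurePolicy.Rules[0].Action) // FailJob
}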

View File

@ -0,0 +1,63 @@
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by applyconfiguration-gen. DO NOT EDIT.
package v1
import (
v1 "k8s.io/api/batch/v1"
)
// PodFailurePolicyOnExitCodesRequirementApplyConfiguration represents a declarative configuration of the PodFailurePolicyOnExitCodesRequirement type for use
// with apply.
type PodFailurePolicyOnExitCodesRequirementApplyConfiguration struct {
ContainerName *string `json:"containerName,omitempty"`
Operator *v1.PodFailurePolicyOnExitCodesOperator `json:"operator,omitempty"`
Values []int32 `json:"values,omitempty"`
}
// PodFailurePolicyOnExitCodesRequirementApplyConfiguration constructs a declarative configuration of the PodFailurePolicyOnExitCodesRequirement type for use with
// apply.
func PodFailurePolicyOnExitCodesRequirement() *PodFailurePolicyOnExitCodesRequirementApplyConfiguration {
return &PodFailurePolicyOnExitCodesRequirementApplyConfiguration{}
}
// WithContainerName sets the ContainerName field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the ContainerName field is set to the value of the last call.
func (b *PodFailurePolicyOnExitCodesRequirementApplyConfiguration) WithContainerName(value string) *PodFailurePolicyOnExitCodesRequirementApplyConfiguration {
b.ContainerName = &value
return b
}
// WithOperator sets the Operator field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the Operator field is set to the value of the last call.
func (b *PodFailurePolicyOnExitCodesRequirementApplyConfiguration) WithOperator(value v1.PodFailurePolicyOnExitCodesOperator) *PodFailurePolicyOnExitCodesRequirementApplyConfiguration {
b.Operator = &value
return b
}
// WithValues adds the given value to the Values field in the declarative configuration
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, values provided by each call will be appended to the Values field.
func (b *PodFailurePolicyOnExitCodesRequirementApplyConfiguration) WithValues(values ...int32) *PodFailurePolicyOnExitCodesRequirementApplyConfiguration {
for i := range values {
b.Values = append(b.Values, values[i])
}
return b
}

View File

@@ -0,0 +1,52 @@
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by applyconfiguration-gen. DO NOT EDIT.
package v1
import (
v1 "k8s.io/api/core/v1"
)
// PodFailurePolicyOnPodConditionsPatternApplyConfiguration represents a declarative configuration of the PodFailurePolicyOnPodConditionsPattern type for use
// with apply.
type PodFailurePolicyOnPodConditionsPatternApplyConfiguration struct {
Type *v1.PodConditionType `json:"type,omitempty"`
Status *v1.ConditionStatus `json:"status,omitempty"`
}
// PodFailurePolicyOnPodConditionsPatternApplyConfiguration constructs a declarative configuration of the PodFailurePolicyOnPodConditionsPattern type for use with
// apply.
func PodFailurePolicyOnPodConditionsPattern() *PodFailurePolicyOnPodConditionsPatternApplyConfiguration {
return &PodFailurePolicyOnPodConditionsPatternApplyConfiguration{}
}
// WithType sets the Type field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the Type field is set to the value of the last call.
func (b *PodFailurePolicyOnPodConditionsPatternApplyConfiguration) WithType(value v1.PodConditionType) *PodFailurePolicyOnPodConditionsPatternApplyConfiguration {
b.Type = &value
return b
}
// WithStatus sets the Status field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the Status field is set to the value of the last call.
func (b *PodFailurePolicyOnPodConditionsPatternApplyConfiguration) WithStatus(value v1.ConditionStatus) *PodFailurePolicyOnPodConditionsPatternApplyConfiguration {
b.Status = &value
return b
}

View File

@@ -0,0 +1,66 @@
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by applyconfiguration-gen. DO NOT EDIT.
package v1
import (
v1 "k8s.io/api/batch/v1"
)
// PodFailurePolicyRuleApplyConfiguration represents a declarative configuration of the PodFailurePolicyRule type for use
// with apply.
type PodFailurePolicyRuleApplyConfiguration struct {
Action *v1.PodFailurePolicyAction `json:"action,omitempty"`
OnExitCodes *PodFailurePolicyOnExitCodesRequirementApplyConfiguration `json:"onExitCodes,omitempty"`
OnPodConditions []PodFailurePolicyOnPodConditionsPatternApplyConfiguration `json:"onPodConditions,omitempty"`
}
// PodFailurePolicyRuleApplyConfiguration constructs a declarative configuration of the PodFailurePolicyRule type for use with
// apply.
func PodFailurePolicyRule() *PodFailurePolicyRuleApplyConfiguration {
return &PodFailurePolicyRuleApplyConfiguration{}
}
// WithAction sets the Action field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the Action field is set to the value of the last call.
func (b *PodFailurePolicyRuleApplyConfiguration) WithAction(value v1.PodFailurePolicyAction) *PodFailurePolicyRuleApplyConfiguration {
b.Action = &value
return b
}
// WithOnExitCodes sets the OnExitCodes field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the OnExitCodes field is set to the value of the last call.
func (b *PodFailurePolicyRuleApplyConfiguration) WithOnExitCodes(value *PodFailurePolicyOnExitCodesRequirementApplyConfiguration) *PodFailurePolicyRuleApplyConfiguration {
b.OnExitCodes = value
return b
}
// WithOnPodConditions adds the given value to the OnPodConditions field in the declarative configuration
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, values provided by each call will be appended to the OnPodConditions field.
func (b *PodFailurePolicyRuleApplyConfiguration) WithOnPodConditions(values ...*PodFailurePolicyOnPodConditionsPatternApplyConfiguration) *PodFailurePolicyRuleApplyConfiguration {
for i := range values {
if values[i] == nil {
panic("nil value passed to WithOnPodConditions")
}
b.OnPodConditions = append(b.OnPodConditions, *values[i])
}
return b
}
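Taken together, these generated builders are meant to be chained when a client assembles a server-side apply patch for the new field. The following is a minimal sketch of such a chain; the import alias batchv1ac, the container name, and the exit code are illustrative only, and the package path assumes the usual client-go location for generated apply configurations.
package main
import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	batchv1ac "k8s.io/client-go/applyconfigurations/batch/v1"
)
func main() {
	// Fail the whole Job when the (illustrative) "main-container" exits with code 42.
	// Each With* call fills one optional field and returns the receiver, so the
	// builders compose by chaining.
	policy := batchv1ac.PodFailurePolicy().
		WithRules(
			batchv1ac.PodFailurePolicyRule().
				WithAction(batchv1.PodFailurePolicyActionFailJob).
				WithOnExitCodes(
					batchv1ac.PodFailurePolicyOnExitCodesRequirement().
						WithContainerName("main-container").
						WithOperator(batchv1.PodFailurePolicyOnExitCodesOpIn).
						WithValues(42),
				),
		)
	fmt.Printf("%+v\n", *policy)
}
The resulting policy would then be attached to a JobSpec apply configuration (presumably through a WithPodFailurePolicy setter generated elsewhere in this change) before being sent in a server-side apply request.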

View File

@@ -3041,6 +3041,9 @@ var schemaYAML = typed.YAMLObject(`types:
- name: parallelism
type:
scalar: numeric
- name: podFailurePolicy
type:
namedType: io.k8s.api.batch.v1.PodFailurePolicy
- name: selector
type:
namedType: io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector
@@ -3098,6 +3101,58 @@
type:
namedType: io.k8s.api.batch.v1.JobSpec
default: {}
- name: io.k8s.api.batch.v1.PodFailurePolicy
map:
fields:
- name: rules
type:
list:
elementType:
namedType: io.k8s.api.batch.v1.PodFailurePolicyRule
elementRelationship: atomic
- name: io.k8s.api.batch.v1.PodFailurePolicyOnExitCodesRequirement
map:
fields:
- name: containerName
type:
scalar: string
- name: operator
type:
scalar: string
default: ""
- name: values
type:
list:
elementType:
scalar: numeric
elementRelationship: associative
- name: io.k8s.api.batch.v1.PodFailurePolicyOnPodConditionsPattern
map:
fields:
- name: status
type:
scalar: string
default: ""
- name: type
type:
scalar: string
default: ""
- name: io.k8s.api.batch.v1.PodFailurePolicyRule
map:
fields:
- name: action
type:
scalar: string
default: ""
- name: onExitCodes
type:
namedType: io.k8s.api.batch.v1.PodFailurePolicyOnExitCodesRequirement
- name: onPodConditions
type:
list:
elementType:
namedType: io.k8s.api.batch.v1.PodFailurePolicyOnPodConditionsPattern
elementRelationship: atomic
- name: io.k8s.api.batch.v1.UncountedTerminatedPods
map:
fields:

View File

@@ -447,6 +447,14 @@ func ForKind(kind schema.GroupVersionKind) interface{} {
return &applyconfigurationsbatchv1.JobStatusApplyConfiguration{}
case batchv1.SchemeGroupVersion.WithKind("JobTemplateSpec"):
return &applyconfigurationsbatchv1.JobTemplateSpecApplyConfiguration{}
case batchv1.SchemeGroupVersion.WithKind("PodFailurePolicy"):
return &applyconfigurationsbatchv1.PodFailurePolicyApplyConfiguration{}
case batchv1.SchemeGroupVersion.WithKind("PodFailurePolicyOnExitCodesRequirement"):
return &applyconfigurationsbatchv1.PodFailurePolicyOnExitCodesRequirementApplyConfiguration{}
case batchv1.SchemeGroupVersion.WithKind("PodFailurePolicyOnPodConditionsPattern"):
return &applyconfigurationsbatchv1.PodFailurePolicyOnPodConditionsPatternApplyConfiguration{}
case batchv1.SchemeGroupVersion.WithKind("PodFailurePolicyRule"):
return &applyconfigurationsbatchv1.PodFailurePolicyRuleApplyConfiguration{}
case batchv1.SchemeGroupVersion.WithKind("UncountedTerminatedPods"):
return &applyconfigurationsbatchv1.UncountedTerminatedPodsApplyConfiguration{}
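These new cases extend the kind-to-apply-configuration registry consulted by the generated client helpers. A hypothetical lookup is sketched below; the registry's import path is assumed to be the usual client-go location, and the kind string must match the case labels above.
package main
import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	"k8s.io/client-go/applyconfigurations" // assumed location of the ForKind registry
	batchv1ac "k8s.io/client-go/applyconfigurations/batch/v1"
)
func main() {
	// Ask the registry for an empty apply configuration for one of the new kinds
	// and type-assert it back to the concrete builder type.
	obj := applyconfigurations.ForKind(batchv1.SchemeGroupVersion.WithKind("PodFailurePolicyRule"))
	if rule, ok := obj.(*batchv1ac.PodFailurePolicyRuleApplyConfiguration); ok {
		fmt.Printf("%+v\n", rule.WithAction(batchv1.PodFailurePolicyActionFailJob))
	}
}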

View File

@@ -48,6 +48,7 @@ import (
"k8s.io/client-go/util/retry"
featuregatetesting "k8s.io/component-base/featuregate/testing"
"k8s.io/controller-manager/pkg/informerfactory"
"k8s.io/klog/v2"
kubeapiservertesting "k8s.io/kubernetes/cmd/kube-apiserver/app/testing"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
"k8s.io/kubernetes/pkg/controller/garbagecollector"
@@ -59,6 +60,303 @@ import (
const waitInterval = time.Second
// TestJobPodFailurePolicyWithFailedPodDeletedDuringControllerRestart verifies that the job is properly marked as Failed
// in a scenario when the job controller crashes between removing pod finalizers and marking the job as Failed (based on
// the pod failure policy). After the finalizer for the failed pod is removed, we delete the failed pod. This step is
// done to simulate what PodGC would do. Then, the test spawns the second instance of the controller to check that it
// will pick up the job state properly and will mark it as Failed, even if the pod triggering the pod failure policy is
// already deleted.
// Note: this scenario requires the use of finalizers. Without finalizers there is no guarantee a failed pod would be
// checked against the pod failure policy rules before its removal by PodGC.
func TestJobPodFailurePolicyWithFailedPodDeletedDuringControllerRestart(t *testing.T) {
count := 3
job := batchv1.Job{
Spec: batchv1.JobSpec{
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "main-container",
Image: "foo",
ImagePullPolicy: v1.PullIfNotPresent,
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
},
},
Parallelism: pointer.Int32(int32(count)),
Completions: pointer.Int32(int32(count)),
PodFailurePolicy: &batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{5},
},
},
},
},
},
}
podStatusMatchingOnExitCodesTerminateRule := v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
}
wFinalizers := true
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, wFinalizers)()
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, true)()
closeFn, restConfig, cs, ns := setup(t, "simple")
defer closeFn()
// Make the job controller significantly slower to trigger the race condition.
restConfig.QPS = 1
restConfig.Burst = 1
ctx, cancel := startJobControllerAndWaitForCaches(restConfig)
defer func() {
cancel()
}()
restConfig.QPS = 200
restConfig.Burst = 200
// create a job with a failed pod matching the exit code rule and a couple of successful pods
jobObj, err := createJobWithDefaults(ctx, cs, ns.Name, &job)
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
validateJobPodsStatus(ctx, t, cs, jobObj, podsByStatus{
Active: count,
Ready: pointer.Int32(0),
}, wFinalizers)
jobPods, err := getJobPods(ctx, t, cs, jobObj)
if err != nil {
t.Fatalf("Failed to list Job Pods: %v", err)
}
failedIndex := 1
wg := sync.WaitGroup{}
wg.Add(1)
// Wait for the failed pod (with index failedIndex) to have its finalizer
// removed. The finalizer will be removed by the job controller just after
// appending the FailureTarget condition to the job to mark it as targeted
// for failure.
go func() {
err := wait.PollImmediate(10*time.Millisecond, time.Minute, func() (bool, error) {
failedPodUpdated, err := cs.CoreV1().Pods(jobObj.Namespace).Get(ctx, jobPods[failedIndex].Name, metav1.GetOptions{})
if err != nil {
return true, err
}
if len(failedPodUpdated.Finalizers) == 0 {
return true, nil
}
return false, nil
})
if err != nil {
t.Logf("Failed awaiting for the the finalizer removal for pod %v", klog.KObj(jobPods[failedIndex]))
}
wg.Done()
}()
// We update one pod as failed with a status matching the pod failure policy rule. This causes the job
// controller to remove the finalizer from the pod.
failedPod := jobPods[failedIndex]
updatedPod := failedPod.DeepCopy()
updatedPod.Status = podStatusMatchingOnExitCodesTerminateRule
err, _ = updatePodStatuses(ctx, cs, []v1.Pod{*updatedPod})
if err != nil {
t.Fatalf("Failed to update pod statuses %q for pods of job %q", err, klog.KObj(jobObj))
}
wg.Wait()
t.Logf("Finalizer is removed for the failed pod %q. Shutting down the controller.", klog.KObj(failedPod))
// shut down the first job controller as soon as it removed the finalizer for the failed pod. This will
// likely happen before the first controller is able to mark the job as Failed.
cancel()
// Delete the failed pod to make sure it is not used by the second instance of the controller
ctx, cancel = context.WithCancel(context.Background())
err = cs.CoreV1().Pods(failedPod.Namespace).Delete(ctx, failedPod.Name, metav1.DeleteOptions{GracePeriodSeconds: pointer.Int64(0)})
if err != nil {
t.Fatalf("Error: '%v' while deleting pod: '%v'", err, klog.KObj(failedPod))
}
t.Logf("The failed pod %q is deleted", klog.KObj(failedPod))
cancel()
// start the second controller to promote the interim FailureTarget job condition to Failed
ctx, cancel = startJobControllerAndWaitForCaches(restConfig)
// verify the job is correctly marked as Failed
validateJobFailed(ctx, t, cs, jobObj)
validateNoOrphanPodsWithFinalizers(ctx, t, cs, jobObj)
}
// TestJobPodFailurePolicy tests handling of pod failures with respect to the
// configured pod failure policy rules
func TestJobPodFailurePolicy(t *testing.T) {
job := batchv1.Job{
Spec: batchv1.JobSpec{
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "main-container",
Image: "foo",
ImagePullPolicy: v1.PullIfNotPresent,
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
},
},
PodFailurePolicy: &batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
Action: batchv1.PodFailurePolicyActionIgnore,
OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
},
},
},
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{5, 6, 7},
},
},
},
},
},
}
podStatusMatchingOnExitCodesTerminateRule := v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
}
podStatusMatchingOnPodConditionsIgnoreRule := v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Status: v1.ConditionTrue,
},
},
}
testCases := map[string]struct {
enableJobPodFailurePolicy bool
restartController bool
job batchv1.Job
podStatus v1.PodStatus
wantActive int
wantFailed int
wantJobConditionType batchv1.JobConditionType
}{
"pod status matching the configured FailJob rule on exit codes; job terminated when JobPodFailurePolicy enabled": {
enableJobPodFailurePolicy: true,
job: job,
podStatus: podStatusMatchingOnExitCodesTerminateRule,
wantActive: 0,
wantFailed: 1,
wantJobConditionType: batchv1.JobFailed,
},
"pod status matching the configured FailJob rule on exit codes; with controller restart; job terminated when JobPodFailurePolicy enabled": {
enableJobPodFailurePolicy: true,
restartController: true,
job: job,
podStatus: podStatusMatchingOnExitCodesTerminateRule,
wantActive: 0,
wantFailed: 1,
wantJobConditionType: batchv1.JobFailed,
},
"pod status matching the configured FailJob rule on exit codes; default handling when JobPodFailurePolicy disabled": {
enableJobPodFailurePolicy: false,
job: job,
podStatus: podStatusMatchingOnExitCodesTerminateRule,
wantActive: 1,
wantFailed: 1,
wantJobConditionType: batchv1.JobComplete,
},
"pod status matching the configured Ignore rule on pod conditions; pod failure not counted when JobPodFailurePolicy enabled": {
enableJobPodFailurePolicy: true,
job: job,
podStatus: podStatusMatchingOnPodConditionsIgnoreRule,
wantActive: 1,
wantFailed: 0,
wantJobConditionType: batchv1.JobComplete,
},
}
for name, test := range testCases {
for _, wFinalizers := range []bool{false, true} {
t.Run(fmt.Sprintf("%s; finalizers=%t", name, wFinalizers), func(t *testing.T) {
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, wFinalizers)()
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, test.enableJobPodFailurePolicy)()
closeFn, restConfig, clientSet, ns := setup(t, "simple")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(restConfig)
defer func() {
cancel()
}()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &test.job)
if err != nil {
t.Fatalf("Error %q while creating the job %q", err, jobObj.Name)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Ready: pointer.Int32(0),
}, wFinalizers)
op := func(p *v1.Pod) bool {
p.Status = test.podStatus
return true
}
if err, _ := updateJobPodsStatus(ctx, clientSet, jobObj, op, 1); err != nil {
t.Fatalf("Error %q while updating pod status for Job: %q", err, jobObj.Name)
}
if test.restartController {
cancel()
ctx, cancel = startJobControllerAndWaitForCaches(restConfig)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: test.wantActive,
Failed: test.wantFailed,
Ready: pointer.Int32(0),
}, wFinalizers)
if test.wantJobConditionType == batchv1.JobComplete {
if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
t.Fatalf("Failed setting phase %q on Job Pod: %q", v1.PodSucceeded, err)
}
}
validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType)
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
})
}
}
}
// TestNonParallelJob tests a Job that only executes one Pod. The test
// recreates the Job controller at some points to make sure a new controller
// is able to pick up where the previous one left off.
@@ -676,7 +974,7 @@ func TestJobFailedWithInterrupts(t *testing.T) {
func validateNoOrphanPodsWithFinalizers(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job) {
t.Helper()
orphanPods := 0
if err := wait.Poll(waitInterval, wait.ForeverTestTimeout, func() (done bool, err error) {
if err := wait.PollImmediate(waitInterval, wait.ForeverTestTimeout, func() (done bool, err error) {
pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{
LabelSelector: metav1.FormatLabelSelector(jobObj.Spec.Selector),
})
@@ -735,7 +1033,7 @@ func TestOrphanPodsFinalizersClearedWithFeatureDisabled(t *testing.T) {
// Restart controller.
ctx, cancel = startJobControllerAndWaitForCaches(restConfig)
if err := wait.Poll(waitInterval, wait.ForeverTestTimeout, func() (done bool, err error) {
if err := wait.PollImmediate(waitInterval, wait.ForeverTestTimeout, func() (done bool, err error) {
pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
t.Fatalf("Failed to list Job Pods: %v", err)
@@ -939,7 +1237,7 @@ type podsByStatus struct {
func validateJobPodsStatus(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job, desired podsByStatus, wFinalizer bool) {
t.Helper()
var actualCounts podsByStatus
if err := wait.Poll(waitInterval, wait.ForeverTestTimeout, func() (bool, error) {
if err := wait.PollImmediate(waitInterval, wait.ForeverTestTimeout, func() (bool, error) {
updatedJob, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("Failed to get updated Job: %v", err)
@@ -982,6 +1280,23 @@ func validateJobPodsStatus(ctx context.Context, t *testing.T, clientSet clientse
}
}
func getJobPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job) ([]*v1.Pod, error) {
t.Helper()
allPods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return nil, err
}
jobPods := make([]*v1.Pod, 0)
for _, pod := range allPods.Items {
phase := pod.Status.Phase
if metav1.IsControlledBy(&pod, jobObj) && (phase == v1.PodPending || phase == v1.PodRunning) {
p := pod
jobPods = append(jobPods, &p)
}
}
return jobPods, nil
}
func validateFinishedPodsNoFinalizer(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job) {
t.Helper()
pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
@@ -1041,7 +1356,7 @@ func waitForEvent(events watch.Interface, uid types.UID, reason string) error {
if reason == "" {
return nil
}
return wait.Poll(waitInterval, wait.ForeverTestTimeout, func() (bool, error) {
return wait.PollImmediate(waitInterval, wait.ForeverTestTimeout, func() (bool, error) {
for {
var ev watch.Event
select {
@@ -1082,7 +1397,7 @@ func validateJobSucceeded(ctx context.Context, t *testing.T, clientSet clientset
func validateJobCondition(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job, cond batchv1.JobConditionType) {
t.Helper()
if err := wait.Poll(waitInterval, wait.ForeverTestTimeout, func() (bool, error) {
if err := wait.PollImmediate(waitInterval, wait.ForeverTestTimeout, func() (bool, error) {
j, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("Failed to obtain updated Job: %v", err)