kubernetes/pkg/controller/job/pod_failure_policy.go
Aldo Culquicondor 7dc36bdf82
Wait for Pods to finish before considering Failed in Job (#113860)
* Wait for Pods to finish before considering Failed

Limit behavior to feature gates PodDisruptionConditions and
JobPodFailurePolicy and jobs with a podFailurePolicy.

Change-Id: I926391cc2521b389c8e52962afb0d4a6a845ab8f

* Remove check for unsheduled terminating pod

Change-Id: I3dc05bb4ea3738604f01bf8cb5fc8cc0f6ea54ec
2022-11-15 09:44:53 -08:00

130 lines
5.0 KiB
Go

/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"fmt"
batch "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
)
// matchPodFailurePolicy returns information about matching a given failed pod
// against the pod failure policy rules. The information is represented as an
// optional job failure message (present in case the pod matched a 'FailJob'
// rule), a boolean indicating if the failure should be counted towards
// backoffLimit (it should not be counted if the pod matched an 'Ignore' rule),
// and a pointer to the matched pod failure policy action.
func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *v1.Pod) (*string, bool, *batch.PodFailurePolicyAction) {
if podFailurePolicy == nil {
return nil, true, nil
}
ignore := batch.PodFailurePolicyActionIgnore
failJob := batch.PodFailurePolicyActionFailJob
count := batch.PodFailurePolicyActionCount
for index, podFailurePolicyRule := range podFailurePolicy.Rules {
if podFailurePolicyRule.OnExitCodes != nil {
if containerStatus := matchOnExitCodes(&failedPod.Status, podFailurePolicyRule.OnExitCodes); containerStatus != nil {
switch podFailurePolicyRule.Action {
case batch.PodFailurePolicyActionIgnore:
return nil, false, &ignore
case batch.PodFailurePolicyActionCount:
return nil, true, &count
case batch.PodFailurePolicyActionFailJob:
msg := fmt.Sprintf("Container %s for pod %s/%s failed with exit code %v matching %v rule at index %d",
containerStatus.Name, failedPod.Namespace, failedPod.Name, containerStatus.State.Terminated.ExitCode, podFailurePolicyRule.Action, index)
return &msg, true, &failJob
}
}
} else if podFailurePolicyRule.OnPodConditions != nil {
if podCondition := matchOnPodConditions(&failedPod.Status, podFailurePolicyRule.OnPodConditions); podCondition != nil {
switch podFailurePolicyRule.Action {
case batch.PodFailurePolicyActionIgnore:
return nil, false, &ignore
case batch.PodFailurePolicyActionCount:
return nil, true, &count
case batch.PodFailurePolicyActionFailJob:
msg := fmt.Sprintf("Pod %s/%s has condition %v matching %v rule at index %d",
failedPod.Namespace, failedPod.Name, podCondition.Type, podFailurePolicyRule.Action, index)
return &msg, true, &failJob
}
}
}
}
return nil, true, nil
}
// matchOnExitCodes returns a terminated container status that matches the error code requirement, if any exists.
// If the returned status is non-nil, it has a non-nil Terminated field.
func matchOnExitCodes(podStatus *v1.PodStatus, requirement *batch.PodFailurePolicyOnExitCodesRequirement) *v1.ContainerStatus {
if containerStatus := getMatchingContainerFromList(podStatus.ContainerStatuses, requirement); containerStatus != nil {
return containerStatus
}
return getMatchingContainerFromList(podStatus.InitContainerStatuses, requirement)
}
func matchOnPodConditions(podStatus *v1.PodStatus, requirement []batch.PodFailurePolicyOnPodConditionsPattern) *v1.PodCondition {
for _, podCondition := range podStatus.Conditions {
for _, pattern := range requirement {
if podCondition.Type == pattern.Type && podCondition.Status == pattern.Status {
return &podCondition
}
}
}
return nil
}
// getMatchingContainerFromList returns the first terminated container status in the list that matches the error code requirement, or nil if none match.
// If the returned status is non-nil, it has a non-nil Terminated field
func getMatchingContainerFromList(containerStatuses []v1.ContainerStatus, requirement *batch.PodFailurePolicyOnExitCodesRequirement) *v1.ContainerStatus {
for _, containerStatus := range containerStatuses {
if containerStatus.State.Terminated == nil {
// This container is still be terminating. There is no exit code to match.
continue
}
if requirement.ContainerName == nil || *requirement.ContainerName == containerStatus.Name {
if containerStatus.State.Terminated.ExitCode != 0 {
if isOnExitCodesOperatorMatching(containerStatus.State.Terminated.ExitCode, requirement) {
return &containerStatus
}
}
}
}
return nil
}
func isOnExitCodesOperatorMatching(exitCode int32, requirement *batch.PodFailurePolicyOnExitCodesRequirement) bool {
switch requirement.Operator {
case batch.PodFailurePolicyOnExitCodesOpIn:
for _, value := range requirement.Values {
if value == exitCode {
return true
}
}
return false
case batch.PodFailurePolicyOnExitCodesOpNotIn:
for _, value := range requirement.Values {
if value == exitCode {
return false
}
}
return true
default:
return false
}
}