Modify mostRecentScheduleTime to return more detailed information about missed schedules

Initially this method was returning a number of missed schedules, but
that turned out to be not reliable for some complex schedules. For
example, those which are being run only during week days. The second
approach was to only return a boolean indicating the too many missed
information. It turns out that we need to return all three values:
none missed, few missed and many missed, to let consumers know what to
do, but don't leak the wrong number out of mostRecentScheduleTime.
This commit is contained in:
Maciej Szulik
2023-10-18 19:29:03 +02:00
parent 6c4f71b31c
commit db8b303156
2 changed files with 125 additions and 30 deletions

View File

@@ -36,6 +36,27 @@ import (
// Utilities for dealing with Jobs and CronJobs and time.
type missedSchedulesType int
const (
noneMissed missedSchedulesType = iota
fewMissed
manyMissed
)
func (e missedSchedulesType) String() string {
switch e {
case noneMissed:
return "none"
case fewMissed:
return "few"
case manyMissed:
return "many"
default:
return fmt.Sprintf("unknown(%d)", int(e))
}
}
// inActiveList checks if cronjob's .status.active has a job with the same UID.
func inActiveList(cj *batchv1.CronJob, uid types.UID) bool {
for _, j := range cj.Status.Active {
@@ -75,11 +96,12 @@ func deleteFromActiveList(cj *batchv1.CronJob, uid types.UID) {
// mostRecentScheduleTime returns:
// - the last schedule time or CronJob's creation time,
// - the most recent time a Job should be created or nil, if that's after now,
// - boolean indicating an excessive number of missed schedules,
// - value indicating either none missed schedules, a few missed or many missed
// - error in an edge case where the schedule specification is grammatically correct,
// but logically doesn't make sense (31st day for months with only 30 days, for example).
func mostRecentScheduleTime(cj *batchv1.CronJob, now time.Time, schedule cron.Schedule, includeStartingDeadlineSeconds bool) (time.Time, *time.Time, bool, error) {
func mostRecentScheduleTime(cj *batchv1.CronJob, now time.Time, schedule cron.Schedule, includeStartingDeadlineSeconds bool) (time.Time, *time.Time, missedSchedulesType, error) {
earliestTime := cj.ObjectMeta.CreationTimestamp.Time
missedSchedules := noneMissed
if cj.Status.LastScheduleTime != nil {
earliestTime = cj.Status.LastScheduleTime.Time
}
@@ -96,10 +118,10 @@ func mostRecentScheduleTime(cj *batchv1.CronJob, now time.Time, schedule cron.Sc
t2 := schedule.Next(t1)
if now.Before(t1) {
return earliestTime, nil, false, nil
return earliestTime, nil, missedSchedules, nil
}
if now.Before(t2) {
return earliestTime, &t1, false, nil
return earliestTime, &t1, missedSchedules, nil
}
// It is possible for cron.ParseStandard("59 23 31 2 *") to return an invalid schedule
@@ -107,7 +129,7 @@ func mostRecentScheduleTime(cj *batchv1.CronJob, now time.Time, schedule cron.Sc
// In this case the timeBetweenTwoSchedules will be 0, and we error out the invalid schedule
timeBetweenTwoSchedules := int64(t2.Sub(t1).Round(time.Second).Seconds())
if timeBetweenTwoSchedules < 1 {
return earliestTime, nil, false, fmt.Errorf("time difference between two schedules is less than 1 second")
return earliestTime, nil, missedSchedules, fmt.Errorf("time difference between two schedules is less than 1 second")
}
// this logic used for calculating number of missed schedules does a rough
// approximation, by calculating a diff between two schedules (t1 and t2),
@@ -146,12 +168,18 @@ func mostRecentScheduleTime(cj *batchv1.CronJob, now time.Time, schedule cron.Sc
//
// I've somewhat arbitrarily picked 100, as more than 80,
// but less than "lots".
tooManyMissed := numberOfMissedSchedules > 100
switch {
case numberOfMissedSchedules > 100:
missedSchedules = manyMissed
// inform about few missed, still
case numberOfMissedSchedules > 0:
missedSchedules = fewMissed
}
if mostRecentTime.IsZero() {
return earliestTime, nil, tooManyMissed, nil
return earliestTime, nil, missedSchedules, nil
}
return earliestTime, &mostRecentTime, tooManyMissed, nil
return earliestTime, &mostRecentTime, missedSchedules, nil
}
// nextScheduleTimeDuration returns the time duration to requeue based on
@@ -160,13 +188,18 @@ func mostRecentScheduleTime(cj *batchv1.CronJob, now time.Time, schedule cron.Sc
// realistic cases should be around 100s, the job will still be executed without missing
// the schedule.
func nextScheduleTimeDuration(cj *batchv1.CronJob, now time.Time, schedule cron.Schedule) *time.Duration {
earliestTime, mostRecentTime, _, err := mostRecentScheduleTime(cj, now, schedule, false)
earliestTime, mostRecentTime, missedSchedules, err := mostRecentScheduleTime(cj, now, schedule, false)
if err != nil {
// we still have to requeue at some point, so aim for the next scheduling slot from now
mostRecentTime = &now
} else if mostRecentTime == nil {
// no missed schedules since earliestTime
mostRecentTime = &earliestTime
if missedSchedules == noneMissed {
// no missed schedules since earliestTime
mostRecentTime = &earliestTime
} else {
// if there are missed schedules since earliestTime, always use now
mostRecentTime = &now
}
}
t := schedule.Next(*mostRecentTime).Add(nextScheduleDelta).Sub(now)
@@ -177,13 +210,13 @@ func nextScheduleTimeDuration(cj *batchv1.CronJob, now time.Time, schedule cron.
// and before now, or nil if no unmet schedule times, and an error.
// If there are too many (>100) unstarted times, it will also record a warning.
func nextScheduleTime(logger klog.Logger, cj *batchv1.CronJob, now time.Time, schedule cron.Schedule, recorder record.EventRecorder) (*time.Time, error) {
_, mostRecentTime, tooManyMissed, err := mostRecentScheduleTime(cj, now, schedule, true)
_, mostRecentTime, missedSchedules, err := mostRecentScheduleTime(cj, now, schedule, true)
if mostRecentTime == nil || mostRecentTime.After(now) {
return nil, err
}
if tooManyMissed {
if missedSchedules == manyMissed {
recorder.Eventf(cj, corev1.EventTypeWarning, "TooManyMissedTimes", "too many missed start times. Set or decrease .spec.startingDeadlineSeconds or check clock skew")
logger.Info("too many missed times", "cronjob", klog.KObj(cj))
}

View File

@@ -364,7 +364,7 @@ func TestMostRecentScheduleTime(t *testing.T) {
now time.Time
expectedEarliestTime time.Time
expectedRecentTime *time.Time
expectedTooManyMissed bool
expectedTooManyMissed missedSchedulesType
wantErr bool
}{
{
@@ -405,9 +405,10 @@ func TestMostRecentScheduleTime(t *testing.T) {
Schedule: "0 * * * *",
},
},
now: *deltaTimeAfterTopOfTheHour(301 * time.Minute),
expectedRecentTime: deltaTimeAfterTopOfTheHour(300 * time.Minute),
expectedEarliestTime: *deltaTimeAfterTopOfTheHour(10 * time.Second),
now: *deltaTimeAfterTopOfTheHour(301 * time.Minute),
expectedRecentTime: deltaTimeAfterTopOfTheHour(300 * time.Minute),
expectedEarliestTime: *deltaTimeAfterTopOfTheHour(10 * time.Second),
expectedTooManyMissed: fewMissed,
},
{
name: "complex schedule",
@@ -422,9 +423,10 @@ func TestMostRecentScheduleTime(t *testing.T) {
LastScheduleTime: &metav1HalfPastTheHour,
},
},
now: *deltaTimeAfterTopOfTheHour(24*time.Hour + 31*time.Minute),
expectedRecentTime: deltaTimeAfterTopOfTheHour(24*time.Hour + 30*time.Minute),
expectedEarliestTime: *deltaTimeAfterTopOfTheHour(30 * time.Minute),
now: *deltaTimeAfterTopOfTheHour(24*time.Hour + 31*time.Minute),
expectedRecentTime: deltaTimeAfterTopOfTheHour(24*time.Hour + 30*time.Minute),
expectedEarliestTime: *deltaTimeAfterTopOfTheHour(30 * time.Minute),
expectedTooManyMissed: fewMissed,
},
{
name: "another complex schedule",
@@ -439,9 +441,10 @@ func TestMostRecentScheduleTime(t *testing.T) {
LastScheduleTime: &metav1HalfPastTheHour,
},
},
now: *deltaTimeAfterTopOfTheHour(30*time.Hour + 30*time.Minute),
expectedRecentTime: nil,
expectedEarliestTime: *deltaTimeAfterTopOfTheHour(30 * time.Minute),
now: *deltaTimeAfterTopOfTheHour(30*time.Hour + 30*time.Minute),
expectedRecentTime: nil,
expectedEarliestTime: *deltaTimeAfterTopOfTheHour(30 * time.Minute),
expectedTooManyMissed: fewMissed,
},
{
name: "complex schedule with longer diff between executions",
@@ -456,9 +459,10 @@ func TestMostRecentScheduleTime(t *testing.T) {
LastScheduleTime: &metav1HalfPastTheHour,
},
},
now: *deltaTimeAfterTopOfTheHour(96*time.Hour + 31*time.Minute),
expectedRecentTime: deltaTimeAfterTopOfTheHour(96*time.Hour + 30*time.Minute),
expectedEarliestTime: *deltaTimeAfterTopOfTheHour(30 * time.Minute),
now: *deltaTimeAfterTopOfTheHour(96*time.Hour + 31*time.Minute),
expectedRecentTime: deltaTimeAfterTopOfTheHour(96*time.Hour + 30*time.Minute),
expectedEarliestTime: *deltaTimeAfterTopOfTheHour(30 * time.Minute),
expectedTooManyMissed: fewMissed,
},
{
name: "complex schedule with shorter diff between executions",
@@ -470,9 +474,10 @@ func TestMostRecentScheduleTime(t *testing.T) {
Schedule: "30 6-16/4 * * 1-5",
},
},
now: *deltaTimeAfterTopOfTheHour(24*time.Hour + 31*time.Minute),
expectedRecentTime: deltaTimeAfterTopOfTheHour(24*time.Hour + 30*time.Minute),
expectedEarliestTime: *topOfTheHour(),
now: *deltaTimeAfterTopOfTheHour(24*time.Hour + 31*time.Minute),
expectedRecentTime: deltaTimeAfterTopOfTheHour(24*time.Hour + 30*time.Minute),
expectedEarliestTime: *topOfTheHour(),
expectedTooManyMissed: fewMissed,
},
{
name: "@every schedule",
@@ -491,7 +496,7 @@ func TestMostRecentScheduleTime(t *testing.T) {
now: *deltaTimeAfterTopOfTheHour(7 * 24 * time.Hour),
expectedRecentTime: deltaTimeAfterTopOfTheHour((6 * 24 * time.Hour) + 23*time.Hour + 1*time.Minute),
expectedEarliestTime: *deltaTimeAfterTopOfTheHour(1 * time.Minute),
expectedTooManyMissed: true,
expectedTooManyMissed: manyMissed,
},
{
name: "rogue cronjob",
@@ -611,6 +616,63 @@ func TestMostRecentScheduleTime(t *testing.T) {
}
}
func TestNextScheduleTimeDuration(t *testing.T) {
metav1TopOfTheHour := metav1.NewTime(*topOfTheHour())
metav1HalfPastTheHour := metav1.NewTime(*deltaTimeAfterTopOfTheHour(30 * time.Minute))
tests := []struct {
name string
cj *batchv1.CronJob
now time.Time
expectedDuration time.Duration
}{
{
name: "complex schedule skipping weekend",
cj: &batchv1.CronJob{
ObjectMeta: metav1.ObjectMeta{
CreationTimestamp: metav1TopOfTheHour,
},
Spec: batchv1.CronJobSpec{
Schedule: "30 6-16/4 * * 1-5",
},
Status: batchv1.CronJobStatus{
LastScheduleTime: &metav1HalfPastTheHour,
},
},
now: *deltaTimeAfterTopOfTheHour(24*time.Hour + 31*time.Minute),
expectedDuration: 3*time.Hour + 59*time.Minute + nextScheduleDelta,
},
{
name: "another complex schedule skipping weekend",
cj: &batchv1.CronJob{
ObjectMeta: metav1.ObjectMeta{
CreationTimestamp: metav1TopOfTheHour,
},
Spec: batchv1.CronJobSpec{
Schedule: "30 10,11,12 * * 1-5",
},
Status: batchv1.CronJobStatus{
LastScheduleTime: &metav1HalfPastTheHour,
},
},
now: *deltaTimeAfterTopOfTheHour(30*time.Hour + 30*time.Minute),
expectedDuration: 66*time.Hour + nextScheduleDelta,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
sched, err := cron.ParseStandard(tt.cj.Spec.Schedule)
if err != nil {
t.Errorf("error setting up the test, %s", err)
}
gotScheduleTimeDuration := nextScheduleTimeDuration(tt.cj, tt.now, sched)
if !reflect.DeepEqual(gotScheduleTimeDuration, &tt.expectedDuration) {
t.Errorf("scheduleTimeDuration - got %s, want %s", gotScheduleTimeDuration, tt.expectedDuration)
}
})
}
}
func topOfTheHour() *time.Time {
T1, err := time.Parse(time.RFC3339, "2016-05-19T10:00:00Z")
if err != nil {