kubelet: improve checkpoint errors
Recording the expected and actual checksum in the error makes it possible to provide that information, for example in a failed test like the ones for DRA. Otherwise developers have to manually step through the test with a debugger to figure out what the new checksum is.
This commit is contained in:
		@@ -28,8 +28,9 @@ type Checksum uint64
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
// Verify verifies that passed checksum is same as calculated checksum
 | 
					// Verify verifies that passed checksum is same as calculated checksum
 | 
				
			||||||
func (cs Checksum) Verify(data interface{}) error {
 | 
					func (cs Checksum) Verify(data interface{}) error {
 | 
				
			||||||
	if cs != New(data) {
 | 
						actualCS := New(data)
 | 
				
			||||||
		return errors.ErrCorruptCheckpoint
 | 
						if cs != actualCS {
 | 
				
			||||||
 | 
							return &errors.CorruptCheckpointError{ActualCS: uint64(actualCS), ExpectedCS: uint64(cs)}
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	return nil
 | 
						return nil
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -18,8 +18,28 @@ package errors
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
import "fmt"
 | 
					import "fmt"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// ErrCorruptCheckpoint error is reported when checksum does not match
 | 
					// ErrCorruptCheckpoint error is reported when checksum does not match.
 | 
				
			||||||
var ErrCorruptCheckpoint = fmt.Errorf("checkpoint is corrupted")
 | 
					// Check for it with:
 | 
				
			||||||
 | 
					//
 | 
				
			||||||
 | 
					//	var csErr *CorruptCheckpointError
 | 
				
			||||||
 | 
					//	if errors.As(err, &csErr) { ... }
 | 
				
			||||||
 | 
					//	if errors.Is(err, CorruptCheckpointError{}) { ... }
 | 
				
			||||||
 | 
					type CorruptCheckpointError struct {
 | 
				
			||||||
 | 
						ActualCS, ExpectedCS uint64
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func (err CorruptCheckpointError) Error() string {
 | 
				
			||||||
 | 
						return "checkpoint is corrupted"
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func (err CorruptCheckpointError) Is(target error) bool {
 | 
				
			||||||
 | 
						switch target.(type) {
 | 
				
			||||||
 | 
						case *CorruptCheckpointError, CorruptCheckpointError:
 | 
				
			||||||
 | 
							return true
 | 
				
			||||||
 | 
						default:
 | 
				
			||||||
 | 
							return false
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// ErrCheckpointNotFound is reported when checkpoint is not found for a given key
 | 
					// ErrCheckpointNotFound is reported when checkpoint is not found for a given key
 | 
				
			||||||
var ErrCheckpointNotFound = fmt.Errorf("checkpoint is not found")
 | 
					var ErrCheckpointNotFound = fmt.Errorf("checkpoint is not found")
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -110,8 +110,12 @@ func (cp *CPUManagerCheckpointV1) VerifyChecksum() error {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	hash := fnv.New32a()
 | 
						hash := fnv.New32a()
 | 
				
			||||||
	fmt.Fprintf(hash, "%v", object)
 | 
						fmt.Fprintf(hash, "%v", object)
 | 
				
			||||||
	if cp.Checksum != checksum.Checksum(hash.Sum32()) {
 | 
						actualCS := checksum.Checksum(hash.Sum32())
 | 
				
			||||||
		return errors.ErrCorruptCheckpoint
 | 
						if cp.Checksum != actualCS {
 | 
				
			||||||
 | 
							return &errors.CorruptCheckpointError{
 | 
				
			||||||
 | 
								ActualCS:   uint64(actualCS),
 | 
				
			||||||
 | 
								ExpectedCS: uint64(cp.Checksum),
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return nil
 | 
						return nil
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -18,6 +18,7 @@ package state
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
import (
 | 
					import (
 | 
				
			||||||
	"encoding/json"
 | 
						"encoding/json"
 | 
				
			||||||
 | 
						"errors"
 | 
				
			||||||
	"fmt"
 | 
						"fmt"
 | 
				
			||||||
	"hash/fnv"
 | 
						"hash/fnv"
 | 
				
			||||||
	"strings"
 | 
						"strings"
 | 
				
			||||||
@@ -25,7 +26,7 @@ import (
 | 
				
			|||||||
	"k8s.io/apimachinery/pkg/util/dump"
 | 
						"k8s.io/apimachinery/pkg/util/dump"
 | 
				
			||||||
	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
 | 
						"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
 | 
				
			||||||
	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
 | 
						"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
 | 
				
			||||||
	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
 | 
						cmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
var _ checkpointmanager.Checkpoint = &DRAManagerCheckpoint{}
 | 
					var _ checkpointmanager.Checkpoint = &DRAManagerCheckpoint{}
 | 
				
			||||||
@@ -79,7 +80,7 @@ func (dc *DRAManagerCheckpoint) VerifyChecksum() error {
 | 
				
			|||||||
	ck := dc.Checksum
 | 
						ck := dc.Checksum
 | 
				
			||||||
	dc.Checksum = 0
 | 
						dc.Checksum = 0
 | 
				
			||||||
	err := ck.Verify(dc)
 | 
						err := ck.Verify(dc)
 | 
				
			||||||
	if err == errors.ErrCorruptCheckpoint {
 | 
						if errors.Is(err, cmerrors.CorruptCheckpointError{}) {
 | 
				
			||||||
		// Verify with old structs without ResourceHandles field
 | 
							// Verify with old structs without ResourceHandles field
 | 
				
			||||||
		// TODO: remove in Beta
 | 
							// TODO: remove in Beta
 | 
				
			||||||
		err = verifyChecksumWithoutResourceHandles(dc, ck)
 | 
							err = verifyChecksumWithoutResourceHandles(dc, ck)
 | 
				
			||||||
@@ -115,8 +116,12 @@ func verifyChecksumWithoutResourceHandles(dc *DRAManagerCheckpoint, checkSum che
 | 
				
			|||||||
	object = strings.Replace(object, "ClaimInfoStateListWithoutResourceHandles", "ClaimInfoStateList", 1)
 | 
						object = strings.Replace(object, "ClaimInfoStateListWithoutResourceHandles", "ClaimInfoStateList", 1)
 | 
				
			||||||
	hash := fnv.New32a()
 | 
						hash := fnv.New32a()
 | 
				
			||||||
	fmt.Fprintf(hash, "%v", object)
 | 
						fmt.Fprintf(hash, "%v", object)
 | 
				
			||||||
	if checkSum != checksum.Checksum(hash.Sum32()) {
 | 
						actualCS := checksum.Checksum(hash.Sum32())
 | 
				
			||||||
		return errors.ErrCorruptCheckpoint
 | 
						if checkSum != actualCS {
 | 
				
			||||||
 | 
							return &cmerrors.CorruptCheckpointError{
 | 
				
			||||||
 | 
								ActualCS:   uint64(actualCS),
 | 
				
			||||||
 | 
								ExpectedCS: uint64(checkSum),
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	return nil
 | 
						return nil
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -126,7 +126,7 @@ func (sc *stateCheckpoint) GetOrCreate() (ClaimInfoStateList, error) {
 | 
				
			|||||||
		return ClaimInfoStateList{}, nil
 | 
							return ClaimInfoStateList{}, nil
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	if err != nil {
 | 
						if err != nil {
 | 
				
			||||||
		return nil, fmt.Errorf("failed to get checkpoint %v: %v", sc.checkpointName, err)
 | 
							return nil, fmt.Errorf("failed to get checkpoint %v: %w", sc.checkpointName, err)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return checkpoint.Entries, nil
 | 
						return checkpoint.Entries, nil
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -17,16 +17,19 @@ limitations under the License.
 | 
				
			|||||||
package state
 | 
					package state
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import (
 | 
					import (
 | 
				
			||||||
 | 
						"errors"
 | 
				
			||||||
	"os"
 | 
						"os"
 | 
				
			||||||
	"path"
 | 
						"path"
 | 
				
			||||||
	"strings"
 | 
						"strings"
 | 
				
			||||||
	"testing"
 | 
						"testing"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	"github.com/stretchr/testify/assert"
 | 
						"github.com/stretchr/testify/assert"
 | 
				
			||||||
 | 
						"github.com/stretchr/testify/require"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
 | 
						resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
 | 
				
			||||||
	"k8s.io/apimachinery/pkg/util/sets"
 | 
						"k8s.io/apimachinery/pkg/util/sets"
 | 
				
			||||||
	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
 | 
						"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
 | 
				
			||||||
 | 
						cmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
 | 
				
			||||||
	testutil "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state/testing"
 | 
						testutil "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state/testing"
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -198,7 +201,7 @@ func TestCheckpointGetOrCreate(t *testing.T) {
 | 
				
			|||||||
				assert.Error(t, err)
 | 
									assert.Error(t, err)
 | 
				
			||||||
				assert.Contains(t, err.Error(), tc.expectedError)
 | 
									assert.Contains(t, err.Error(), tc.expectedError)
 | 
				
			||||||
			} else {
 | 
								} else {
 | 
				
			||||||
				assert.NoError(t, err, "unexpected error while creating checkpointState")
 | 
									requireNoCheckpointError(t, err)
 | 
				
			||||||
				// compare state after restoration with the one expected
 | 
									// compare state after restoration with the one expected
 | 
				
			||||||
				assertStateEqual(t, state, tc.expectedState)
 | 
									assertStateEqual(t, state, tc.expectedState)
 | 
				
			||||||
			}
 | 
								}
 | 
				
			||||||
@@ -270,7 +273,7 @@ func TestOldCheckpointRestore(t *testing.T) {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	checkpoint := NewDRAManagerCheckpoint()
 | 
						checkpoint := NewDRAManagerCheckpoint()
 | 
				
			||||||
	err = cpm.GetCheckpoint(testingCheckpoint, checkpoint)
 | 
						err = cpm.GetCheckpoint(testingCheckpoint, checkpoint)
 | 
				
			||||||
	assert.NoError(t, err, "could not restore checkpoint")
 | 
						requireNoCheckpointError(t, err)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	checkpointData, err := checkpoint.MarshalCheckpoint()
 | 
						checkpointData, err := checkpoint.MarshalCheckpoint()
 | 
				
			||||||
	assert.NoError(t, err, "could not Marshal Checkpoint")
 | 
						assert.NoError(t, err, "could not Marshal Checkpoint")
 | 
				
			||||||
@@ -278,3 +281,13 @@ func TestOldCheckpointRestore(t *testing.T) {
 | 
				
			|||||||
	expectedData := `{"version":"v1","entries":[{"DriverName":"test-driver.cdi.k8s.io","ClassName":"class-name","ClaimUID":"067798be-454e-4be4-9047-1aa06aea63f7","ClaimName":"example","Namespace":"default","PodUIDs":{"139cdb46-f989-4f17-9561-ca10cfb509a6":{}},"ResourceHandles":null,"CDIDevices":{"test-driver.cdi.k8s.io":["example.com/example=cdi-example"]}}],"checksum":453625682}`
 | 
						expectedData := `{"version":"v1","entries":[{"DriverName":"test-driver.cdi.k8s.io","ClassName":"class-name","ClaimUID":"067798be-454e-4be4-9047-1aa06aea63f7","ClaimName":"example","Namespace":"default","PodUIDs":{"139cdb46-f989-4f17-9561-ca10cfb509a6":{}},"ResourceHandles":null,"CDIDevices":{"test-driver.cdi.k8s.io":["example.com/example=cdi-example"]}}],"checksum":453625682}`
 | 
				
			||||||
	assert.Equal(t, expectedData, string(checkpointData), "expected ClaimInfoState does not equal to restored one")
 | 
						assert.Equal(t, expectedData, string(checkpointData), "expected ClaimInfoState does not equal to restored one")
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func requireNoCheckpointError(t *testing.T, err error) {
 | 
				
			||||||
 | 
						t.Helper()
 | 
				
			||||||
 | 
						var cksumErr *cmerrors.CorruptCheckpointError
 | 
				
			||||||
 | 
						if errors.As(err, &cksumErr) {
 | 
				
			||||||
 | 
							t.Fatalf("unexpected corrupt checkpoint, expected checksum %d, got %d", cksumErr.ExpectedCS, cksumErr.ActualCS)
 | 
				
			||||||
 | 
						} else {
 | 
				
			||||||
 | 
							require.NoError(t, err, "could not restore checkpoint")
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user