From d55d1b6fbeeabfcb16be71a2830ff69dce7cad96 Mon Sep 17 00:00:00 2001 From: Jason DeTiberus Date: Fri, 30 Mar 2018 10:19:26 -0400 Subject: [PATCH] [kubeadm] fix mirror-pod hash race condition - Update kubeadm static pod upgrades to use the kubetypes.ConfigHashAnnotationKey annotation on the mirror pod rather than generating a hash from the full object info. Previously, a status update for the pod would allow the upgrade to proceed before the new static pod manifest was actually deployed. Signed-off-by: Jason DeTiberus --- cmd/kubeadm/app/phases/upgrade/staticpods.go | 2 +- .../app/phases/upgrade/staticpods_test.go | 4 +-- cmd/kubeadm/app/util/apiclient/BUILD | 1 + cmd/kubeadm/app/util/apiclient/wait.go | 26 ++++++++----------- cmd/kubeadm/app/util/dryrun/dryrun.go | 7 +++-- 5 files changed, 18 insertions(+), 22 deletions(-) diff --git a/cmd/kubeadm/app/phases/upgrade/staticpods.go b/cmd/kubeadm/app/phases/upgrade/staticpods.go index 5d45fdb5606..eae2425c2c9 100644 --- a/cmd/kubeadm/app/phases/upgrade/staticpods.go +++ b/cmd/kubeadm/app/phases/upgrade/staticpods.go @@ -186,7 +186,7 @@ func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticP // notice the removal of the Static Pod, leading to a false positive below where we check that the API endpoint is healthy // If we don't do this, there is a case where we remove the Static Pod manifest, kubelet is slow to react, kubeadm checks the // API endpoint below of the OLD Static Pod component and proceeds quickly enough, which might lead to unexpected results. - if err := waiter.WaitForStaticPodControlPlaneHashChange(cfg.NodeName, component, beforePodHash); err != nil { + if err := waiter.WaitForStaticPodHashChange(cfg.NodeName, component, beforePodHash); err != nil { return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd) } diff --git a/cmd/kubeadm/app/phases/upgrade/staticpods_test.go b/cmd/kubeadm/app/phases/upgrade/staticpods_test.go index e8e8f3ffafc..91cb76d7e29 100644 --- a/cmd/kubeadm/app/phases/upgrade/staticpods_test.go +++ b/cmd/kubeadm/app/phases/upgrade/staticpods_test.go @@ -117,8 +117,8 @@ func (w *fakeWaiter) WaitForStaticPodSingleHash(_ string, _ string) (string, err return "", w.errsToReturn[waitForHashes] } -// WaitForStaticPodControlPlaneHashChange returns an error if set from errsToReturn -func (w *fakeWaiter) WaitForStaticPodControlPlaneHashChange(_, _, _ string) error { +// WaitForStaticPodHashChange returns an error if set from errsToReturn +func (w *fakeWaiter) WaitForStaticPodHashChange(_, _, _ string) error { return w.errsToReturn[waitForHashChange] } diff --git a/cmd/kubeadm/app/util/apiclient/BUILD b/cmd/kubeadm/app/util/apiclient/BUILD index f00dbf1232d..5704d1228a1 100644 --- a/cmd/kubeadm/app/util/apiclient/BUILD +++ b/cmd/kubeadm/app/util/apiclient/BUILD @@ -19,6 +19,7 @@ go_library( deps = [ "//cmd/kubeadm/app/constants:go_default_library", "//cmd/kubeadm/app/util:go_default_library", + "//pkg/kubelet/types:go_default_library", "//pkg/registry/core/service/ipallocator:go_default_library", "//vendor/k8s.io/api/apps/v1:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", diff --git a/cmd/kubeadm/app/util/apiclient/wait.go b/cmd/kubeadm/app/util/apiclient/wait.go index 6d76a8bac9e..3a75b8c6d1e 100644 --- a/cmd/kubeadm/app/util/apiclient/wait.go +++ b/cmd/kubeadm/app/util/apiclient/wait.go @@ -17,8 +17,6 @@ limitations under the License. package apiclient import ( - "crypto/sha256" - "encoding/json" "fmt" "io" "net/http" @@ -31,6 +29,7 @@ import ( "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" "k8s.io/kubernetes/cmd/kubeadm/app/constants" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" ) // Waiter is an interface for waiting for criteria in Kubernetes to happen @@ -43,11 +42,11 @@ type Waiter interface { WaitForPodToDisappear(staticPodName string) error // WaitForStaticPodSingleHash fetches sha256 hash for the control plane static pod WaitForStaticPodSingleHash(nodeName string, component string) (string, error) + // WaitForStaticPodHashChange waits for the given static pod component's static pod hash to get updated. + // By doing that we can be sure that the kubelet has restarted the given Static Pod + WaitForStaticPodHashChange(nodeName, component, previousHash string) error // WaitForStaticPodControlPlaneHashes fetches sha256 hashes for the control plane static pods WaitForStaticPodControlPlaneHashes(nodeName string) (map[string]string, error) - // WaitForStaticPodControlPlaneHashChange waits for the given static pod component's static pod hash to get updated. - // By doing that we can be sure that the kubelet has restarted the given Static Pod - WaitForStaticPodControlPlaneHashChange(nodeName, component, previousHash string) error // WaitForHealthyKubelet blocks until the kubelet /healthz endpoint returns 'ok' WaitForHealthyKubelet(initalTimeout time.Duration, healthzEndpoint string) error // SetTimeout adjusts the timeout to the specified duration @@ -194,17 +193,17 @@ func (w *KubeWaiter) WaitForStaticPodSingleHash(nodeName string, component strin return componentPodHash, err } -// WaitForStaticPodControlPlaneHashChange blocks until it timeouts or notices that the Mirror Pod (for the Static Pod, respectively) has changed +// WaitForStaticPodHashChange blocks until it timeouts or notices that the Mirror Pod (for the Static Pod, respectively) has changed // This implicitly means this function blocks until the kubelet has restarted the Static Pod in question -func (w *KubeWaiter) WaitForStaticPodControlPlaneHashChange(nodeName, component, previousHash string) error { +func (w *KubeWaiter) WaitForStaticPodHashChange(nodeName, component, previousHash string) error { return wait.PollImmediate(constants.APICallRetryInterval, w.timeout, func() (bool, error) { - hashes, err := getStaticPodControlPlaneHashes(w.client, nodeName) + hash, err := getStaticPodSingleHash(w.client, nodeName, component) if err != nil { return false, nil } // We should continue polling until the UID changes - if hashes[component] == previousHash { + if hash == previousHash { return false, nil } @@ -235,12 +234,9 @@ func getStaticPodSingleHash(client clientset.Interface, nodeName string, compone return "", err } - podBytes, err := json.Marshal(staticPod) - if err != nil { - return "", err - } - - return fmt.Sprintf("%x", sha256.Sum256(podBytes)), nil + staticPodHash := staticPod.Annotations[kubetypes.ConfigHashAnnotationKey] + fmt.Printf("Static pod: %s hash: %s\n", staticPodName, staticPodHash) + return staticPodHash, nil } // TryRunCommand runs a function a maximum of failureThreshold times, and retries on error. If failureThreshold is hit; the last error is returned diff --git a/cmd/kubeadm/app/util/dryrun/dryrun.go b/cmd/kubeadm/app/util/dryrun/dryrun.go index cf3b391b1a0..d02206f2331 100644 --- a/cmd/kubeadm/app/util/dryrun/dryrun.go +++ b/cmd/kubeadm/app/util/dryrun/dryrun.go @@ -106,8 +106,7 @@ func (w *Waiter) WaitForHealthyKubelet(_ time.Duration, healthzEndpoint string) // SetTimeout is a no-op; we don't wait in this implementation func (w *Waiter) SetTimeout(_ time.Duration) {} -// WaitForStaticPodControlPlaneHashes returns an empty hash for all control plane images; WaitForStaticPodControlPlaneHashChange won't block in any case -// but the empty strings there are needed +// WaitForStaticPodControlPlaneHashes returns an empty hash for all control plane images; func (w *Waiter) WaitForStaticPodControlPlaneHashes(_ string) (map[string]string, error) { return map[string]string{ constants.KubeAPIServer: "", @@ -122,7 +121,7 @@ func (w *Waiter) WaitForStaticPodSingleHash(_ string, _ string) (string, error) return "", nil } -// WaitForStaticPodControlPlaneHashChange returns a dummy nil error in order for the flow to just continue as we're dryrunning -func (w *Waiter) WaitForStaticPodControlPlaneHashChange(_, _, _ string) error { +// WaitForStaticPodHashChange returns a dummy nil error in order for the flow to just continue as we're dryrunning +func (w *Waiter) WaitForStaticPodHashChange(_, _, _ string) error { return nil }