diff --git a/pkg/cri/config/config.go b/pkg/cri/config/config.go index 43e954b5b..fb526ecd9 100644 --- a/pkg/cri/config/config.go +++ b/pkg/cri/config/config.go @@ -30,6 +30,30 @@ import ( "github.com/containerd/containerd/v2/pkg/deprecation" ) +const ( + // defaultImagePullProgressTimeoutDuration is the default value of imagePullProgressTimeout. + // + // NOTE: + // + // This ImagePullProgressTimeout feature is ported from kubelet/dockershim's + // --image-pull-progress-deadline. The original value is 1m0. Unlike docker + // daemon, the containerd doesn't have global concurrent download limitation + // before migrating to Transfer Service. If kubelet runs with concurrent + // image pull, the node will run under IO pressure. The ImagePull process + // could be impacted by self, if the target image is large one with a + // lot of layers. And also both container's writable layers and image's storage + // share one disk. The ImagePull process commits blob to content store + // with fsync, which might bring the unrelated files' dirty pages into + // disk in one transaction [1]. The 1m0 value isn't good enough. Based + // on #9347 case and kubernetes community's usage [2], the default value + // is updated to 5m0. If end-user still runs into unexpected cancel, + // they need to config it based on their environment. + // + // [1]: Fast commits for ext4 - https://lwn.net/Articles/842385/ + // [2]: https://github.com/kubernetes/kubernetes/blob/1635c380b26a1d8cc25d36e9feace9797f4bae3c/cluster/gce/util.sh#L882 + defaultImagePullProgressTimeoutDuration = 5 * time.Minute +) + type SandboxControllerMode string const ( diff --git a/pkg/cri/config/config_unix.go b/pkg/cri/config/config_unix.go index fbd308124..d3fe60e8e 100644 --- a/pkg/cri/config/config_unix.go +++ b/pkg/cri/config/config_unix.go @@ -19,8 +19,6 @@ package config import ( - "time" - containerd "github.com/containerd/containerd/v2/client" "github.com/pelletier/go-toml/v2" "k8s.io/kubelet/pkg/cri/streaming" @@ -100,7 +98,7 @@ func DefaultConfig() PluginConfig { }, EnableCDI: false, CDISpecDirs: []string{"/etc/cdi", "/var/run/cdi"}, - ImagePullProgressTimeout: time.Minute.String(), + ImagePullProgressTimeout: defaultImagePullProgressTimeoutDuration.String(), DrainExecSyncIOTimeout: "0s", EnableUnprivilegedPorts: true, EnableUnprivilegedICMP: true, diff --git a/pkg/cri/config/config_windows.go b/pkg/cri/config/config_windows.go index 53ee83f9f..f4f23ea7f 100644 --- a/pkg/cri/config/config_windows.go +++ b/pkg/cri/config/config_windows.go @@ -19,7 +19,6 @@ package config import ( "os" "path/filepath" - "time" containerd "github.com/containerd/containerd/v2/client" "k8s.io/kubelet/pkg/cri/streaming" @@ -84,7 +83,7 @@ func DefaultConfig() PluginConfig { ImageDecryption: ImageDecryption{ KeyModel: KeyModelNode, }, - ImagePullProgressTimeout: time.Minute.String(), + ImagePullProgressTimeout: defaultImagePullProgressTimeoutDuration.String(), DrainExecSyncIOTimeout: "0s", } } diff --git a/pkg/cri/server/images/image_pull.go b/pkg/cri/server/images/image_pull.go index 840015052..8de15f902 100644 --- a/pkg/cri/server/images/image_pull.go +++ b/pkg/cri/server/images/image_pull.go @@ -616,9 +616,9 @@ func (reporter *pullProgressReporter) start(ctx context.Context) { WithField("activeReqs", activeReqs). WithField("totalBytesRead", bytesRead). WithField("lastSeenBytesRead", lastSeenBytesRead). - WithField("lastSeenTimestamp", lastSeenTimestamp). + WithField("lastSeenTimestamp", lastSeenTimestamp.Format(time.RFC3339)). WithField("reportInterval", reportInterval). - Tracef("progress for image pull") + Debugf("progress for image pull") if activeReqs == 0 || bytesRead > lastSeenBytesRead { lastSeenBytesRead = bytesRead diff --git a/remotes/docker/httpreadseeker.go b/remotes/docker/httpreadseeker.go index 5ada7e247..5d873173c 100644 --- a/remotes/docker/httpreadseeker.go +++ b/remotes/docker/httpreadseeker.go @@ -76,6 +76,16 @@ func (hrs *httpReadSeeker) Read(p []byte) (n int, err error) { if _, err2 := hrs.reader(); err2 == nil { return n, nil } + } else if err == io.EOF { + // The CRI's imagePullProgressTimeout relies on responseBody.Close to + // update the process monitor's status. If the err is io.EOF, close + // the connection since there is no more available data. + if hrs.rc != nil { + if clsErr := hrs.rc.Close(); clsErr != nil { + log.L.WithError(clsErr).Error("httpReadSeeker: failed to close ReadCloser after io.EOF") + } + hrs.rc = nil + } } return }