40 Commits

Author SHA1 Message Date
90d73c9e88 Ignore reading-only judgment to support CDI volume
Downgrade log level for task exit.
2025-06-10 11:22:13 +08:00
Shiming Zhang
a3c777afd1 Add OCI/Image Volume Source support
Signed-off-by: Shiming Zhang <wzshiming@hotmail.com>
2025-06-05 10:21:04 +08:00
Brad Davidson
890953d3c6 Enable btrfs/fuse-overlayfs/stargz snapshotter plugins
Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
2025-05-06 22:38:41 +00:00
Brad Davidson
f660f4424f Add rewrite support to hosts.toml loader
Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
2025-05-06 22:38:14 +00:00
Jacob Blain Christen
ba6b205d0f Mirror repository rewrites (v1.1)
Support CRI configuration to allow for request-time rewrite rules
applicable only to the repository portion of resource paths when pulling
images. Because the rewrites are applied at request time, images
themselves will not be "rewritten" -- images as stored by CRI (and the
underlying containerd facility) will continue to present as normal.

As an example, if you use the following config for your containerd:
```toml
[plugins]
  [plugins."io.containerd.grpc.v1.cri"]
    [plugins."io.containerd.grpc.v1.cri".registry]
      [plugins."io.containerd.grpc.v1.cri".registry.mirrors]
        [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
          endpoint = ["https://registry-1.docker.io/v2"]
       	  [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io".rewrite]
            "^library/(.*)" = "my-org/$1"
```

And then subsequently invoke `crictl pull alpine:3.13` it will pull
content from `docker.io/my-org/alpine:3.13` but still show up as
`docker.io/library/alpine:3.13` in the `crictl images` listing.

This commit has been reworked from the original implementation. Rewrites
are now done when resolving instead of when building the request, so
that auth token scopes stored in the context properly reflect the
rewritten repository path. For the original implementation, see
06c4ea9baec2b278b8172a789bf601168292f645.
Ref: https://github.com/k3s-io/k3s/issues/11191#issuecomment-2455525773

Signed-off-by: Jacob Blain Christen <jacob@rancher.com>
Co-authored-by: Brad Davidson <brad.davidson@rancher.com>
Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
2025-05-06 22:38:14 +00:00
Brad Davidson
676ba43ad3 Remove GRPC metrics
These conflict with other GRPC servers when running embedded

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
2025-05-06 22:38:14 +00:00
ningmingxiao
b9ab7a3f49 cri:fix containerd panic when can't find sandbox extension
Signed-off-by: ningmingxiao <ning.mingxiao@zte.com.cn>
Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
2025-05-06 22:38:13 +00:00
Maksym Pavlenko
fb4c30d4ed Merge pull request #11717 from dmcgowan/backport-go-1.23.8
[release/2.0] Update go to 1.23.8
2025-04-17 12:39:09 -07:00
Maksym Pavlenko
d60230c0a2 Merge pull request #11713 from dmcgowan/prepare-v2.0.5
[release/2.0] Prepare release notes for v2.0.5
2025-04-17 12:37:45 -07:00
Akhil Mohan
5bcf0a95e3 use go1.23.8 as the default go version
use go1.23.8 as the default go version for running in CI and making
release binaries.

Signed-off-by: Akhil Mohan <akhilerm@gmail.com>
(cherry picked from commit 6f93c65f52c9e1c5e25595429fd50ce2e5da6843)
Signed-off-by: Derek McGowan <derek@mcg.dev>
2025-04-17 11:18:09 -07:00
Akhil Mohan
4838f33f7e update to go 1.24.2, 1.23.8
- go1.23.8 (released 2025-04-01) includes security fixes to the net/http
  package, as well as bug fixes to the runtime and the go command.
  Ref: https://github.com/golang/go/issues?q=milestone%3AGo1.23.8+label%3ACherryPickApproved
- go1.24.2 (released 2025-04-01) includes security fixes to the net/http
  package, as well as bug fixes to the compiler, the runtime, the go
  command, and the crypto/tls, go/types, net/http, and testing packages.
  Ref: https://github.com/golang/go/issues?q=milestone%3AGo1.24.2+label%3ACherryPickApproved

Signed-off-by: Akhil Mohan <akhilerm@gmail.com>
(cherry picked from commit 5629e9fff7de69a36f5f563d41966aa562866258)
Signed-off-by: Derek McGowan <derek@mcg.dev>
2025-04-17 11:18:09 -07:00
Derek McGowan
a8082cd60d Prepare release notes for v2.0.5
Signed-off-by: Derek McGowan <derek@mcg.dev>
2025-04-16 21:55:18 -07:00
Phil Estes
ab513cdec2 Merge pull request #11710 from dmcgowan/backport-11707
[release/2.0] Disable criu test on arm64
2025-04-16 20:59:16 -04:00
Phil Estes
58b715ad8d Disable arm64 criu testing in GH Actions
Due to current 100% failure rate on arm64 with the current OS image, disable criu testing for now

Signed-off-by: Phil Estes <estesp@amazon.com>
(cherry picked from commit 9ca6a7ee0aa0ea8added551dd16e00b2102fdea4)
Signed-off-by: Derek McGowan <derek@mcg.dev>
2025-04-16 16:17:25 -07:00
Akhil Mohan
b4a53e8264 disable portmap test in ubuntu-22 to make CI happy
Signed-off-by: Akhil Mohan <akhilerm@gmail.com>
(cherry picked from commit 338e0a1266302fed4b52a852735b90a832ad2b0f)
(cherry picked from commit 70db1bd00fb5db7c3958da4aefac0c41c89bb654)
Signed-off-by: Derek McGowan <derek@mcg.dev>
2025-04-16 16:15:12 -07:00
Akhil Mohan
4bcf472de6 add option to skip tests in critest
Signed-off-by: Akhil Mohan <akhilerm@gmail.com>
(cherry picked from commit 4ba3d82ca270937a428d4b6c006bd7e9f8261743)
(cherry picked from commit 8e6c93b6b057230915b449349836bf198f8ebcfe)
Signed-off-by: Derek McGowan <derek@mcg.dev>
2025-04-16 16:14:51 -07:00
Derek McGowan
ea7be04cb4 Merge pull request #11698 from k8s-infra-cherrypick-robot/cherry-pick-11670-to-release/2.0
[release/2.0] Prevent panic on zero length push
2025-04-16 08:57:13 +08:00
Derek McGowan
ebd9a50325 Merge pull request #11688 from estesp/cp-11641
[release/2.0] backport: Set default differ for the default unpack config of transfer service
2025-04-16 08:56:47 +08:00
Derek McGowan
ff47757ae4 Merge pull request #11703 from k8s-infra-cherrypick-robot/cherry-pick-11479-to-release/2.0
[release/2.0] ci: update GitHub Actions release runner to ubuntu-24.04
2025-04-16 08:53:11 +08:00
Austin Vazquez
b184a97d30 ci: update GitHub Actions release runner to ubuntu-24.04
Signed-off-by: Austin Vazquez <macedonv@amazon.com>
2025-04-16 00:16:49 +00:00
Cesar Talledo
8a638b71ae Prevent panic in Docker pusher.
Prevent a panic in the Docker pusher pushWriter, by checking that
the pipe is non nil before attempting to use it.

The panic was found by Moby issue #46746 (https://github.com/moby/moby/issues/46746).
With this fix the panic no longer reproduces.

Signed-off-by: Cesar Talledo <cesar.talledo@docker.com>
2025-04-14 21:25:41 +00:00
Henry Wang
84d9658c36 Set default differ for the default unpack config of transfer service
Signed-off-by: Henry Wang <henwang@amazon.com>
(cherry picked from commit a083b669c9412eef55ee103fe2bb1dec7c6178bc)
2025-04-11 12:09:53 -04:00
Fu Wei
9e97c2e626 Merge pull request #11621 from k8s-infra-cherrypick-robot/cherry-pick-11475-to-release/2.0
[release/2.0] fix: call checkCopyShimLogError(shimCtx) to avoid expected error log flood
2025-03-28 19:12:21 -04:00
yylt
e04543db09 use shimCtx for fifo copy
Signed-off-by: yylt <yang8518296@163.com>
2025-03-28 20:28:58 +00:00
Fu Wei
a5b872b5c8 Merge pull request #11618 from k8s-infra-cherrypick-robot/cherry-pick-11569-to-release/2.0
[release/2.0] update taskOptions based on runtimeOptions when creating a task
2025-03-28 15:24:53 -04:00
Iceber Gu
9f46e7a449 integration/client: add tests for TaskOptions is not empty
Co-authored-by: Wei Fu <fuweid89@gmail.com>
Signed-off-by: Iceber Gu <caiwei95@hotmail.com>
2025-03-28 18:33:13 +00:00
Iceber Gu
8a16a6a04a prefer task options for PluginInfo request
Signed-off-by: Iceber Gu <caiwei95@hotmail.com>
2025-03-28 18:33:13 +00:00
Iceber Gu
a183b2d232 update taskOptions based on runtimeOptions when creating a task
Signed-off-by: Iceber Gu <caiwei95@hotmail.com>
2025-03-28 18:33:13 +00:00
Fu Wei
c146996f3f Merge pull request #11599 from k8s-infra-cherrypick-robot/cherry-pick-11581-to-release/2.0
[release/2.0] *: CRIImageService should delete image synchronously
2025-03-25 09:47:53 -04:00
Wei Fu
091143135b *: CRIImageService should delete image synchronously
Use memory service instead of metadata store.

Signed-off-by: Wei Fu <fuweid89@gmail.com>
2025-03-25 02:36:45 +00:00
Phil Estes
148fbbb92f Merge pull request #11583 from k8s-infra-cherrypick-robot/cherry-pick-11560-to-release/2.0
[release/2.0] Update runc binary to v1.2.6
2025-03-24 16:16:45 +01:00
Austin Vazquez
c2372c072c Update runc binary to v1.2.6
Signed-off-by: Austin Vazquez <macedonv@amazon.com>
2025-03-22 18:53:57 +00:00
Akihiro Suda
ceb33770d2 Merge pull request #11566 from klihub/2.0/deps/bump-cdi-version
[release/2.0] go.{mod,sum}: bump CDI deps to stable v1.0.0.
2025-03-20 02:57:31 +09:00
Phil Estes
c0d93d20be Merge pull request #11571 from AkihiroSuda/dev-2.0
[release/2.0] silence govulncheck false positives
2025-03-19 17:58:09 +01:00
Krisztian Litkey
e8506511b2 go.{mod,sum}: bump CDI deps to stable v1.0.0.
Signed-off-by: Krisztian Litkey <krisztian.litkey@intel.com>
2025-03-19 17:29:35 +02:00
Akihiro Suda
4cfb89430c go.mod: github.com/go-jose/go-jose/v4
Signed-off-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
2025-03-19 23:00:47 +09:00
Akihiro Suda
2b9e6a29d7 go.mod: golang.org/x/oauth2 v0.28.0
Signed-off-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
2025-03-19 23:00:20 +09:00
Akihiro Suda
6df1ea0d9e go.mod: golang.org/x/net v0.37.0
Signed-off-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
2025-03-19 22:59:07 +09:00
Akihiro Suda
dcd661b138 Merge pull request #11567 from klihub/fixes/2.0/ci-lint-errors
[release/2.0] Fix CI lint error (cherry-picked #11555)
2025-03-19 22:49:23 +09:00
Jin Dong
16f20abdff Fix CI lint error
Signed-off-by: Jin Dong <djdongjin95@gmail.com>
(cherry picked from commit c8effff1a823bed757194584a80a043c3a69da1a)
2025-03-19 11:35:45 +02:00
2219 changed files with 545327 additions and 5156 deletions

View File

@@ -12,7 +12,7 @@
"features": {
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
"ghcr.io/devcontainers/features/go:1": {
"version": "1.23.7"
"version": "1.23.8"
}
},

View File

@@ -3,7 +3,7 @@ description: "Reusable action to install Go, so there is one place to bump Go ve
inputs:
go-version:
required: true
default: "1.23.7"
default: "1.23.8"
description: "Go version to install"
runs:

View File

@@ -6,7 +6,7 @@ on:
name: API Release
env:
GO_VERSION: "1.23.7"
GO_VERSION: "1.23.8"
permissions: # added using https://github.com/step-security/secure-workflows
contents: read

View File

@@ -189,7 +189,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-22.04, ubuntu-24.04, ubuntu-24.04-arm, macos-13, windows-2019, windows-2022]
go-version: ["1.24.1", "1.23.7"]
go-version: ["1.24.2", "1.23.8"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: ./.github/actions/install-go
@@ -405,7 +405,9 @@ jobs:
script/setup/install-critools
script/setup/install-failpoint-binaries
- name: Install criu
# Disable criu testing on arm64 until we can solve the consistent failures of restore testing
- if: matrix.os != 'ubuntu-24.04-arm'
name: Install criu
run: |
sudo add-apt-repository -y ppa:criu/ppa
sudo apt-get update
@@ -486,8 +488,12 @@ jobs:
TEST_RUNTIME: ${{ matrix.runtime }}
CGROUP_DRIVER: ${{ matrix.cgroup_driver }}
run: |
# skipping the ipv6 test till https://github.com/actions/runner-images/issues/11985 is fixed
if [[ ${{matrix.os}} == "ubuntu-22.04" ]]; then
skip_test="runtime should support port mapping with host port and container port"
fi
env
sudo -E PATH=$PATH ./script/critest.sh "${{github.workspace}}/report"
sudo -E PATH=$PATH SKIP_TEST="$skip_test" ./script/critest.sh "${{github.workspace}}/report"
# Log the status of this VM to investigate issues like
# https://github.com/containerd/containerd/issues/4969

View File

@@ -13,7 +13,7 @@ on:
name: Release
env:
GO_VERSION: "1.23.7"
GO_VERSION: "1.23.8"
permissions: # added using https://github.com/step-security/secure-workflows
contents: read
@@ -64,7 +64,7 @@ jobs:
build:
name: Build Release Binaries
runs-on: ubuntu-20.04
runs-on: ubuntu-24.04
timeout-minutes: 30
strategy:
matrix:

2
Vagrantfile vendored
View File

@@ -107,7 +107,7 @@ EOF
config.vm.provision "install-golang", type: "shell", run: "once" do |sh|
sh.upload_path = "/tmp/vagrant-install-golang"
sh.env = {
'GO_VERSION': ENV['GO_VERSION'] || "1.23.7",
'GO_VERSION': ENV['GO_VERSION'] || "1.23.8",
}
sh.inline = <<~SHELL
#!/usr/bin/env bash

View File

@@ -1,6 +1,6 @@
module github.com/containerd/containerd/api
go 1.21
go 1.23.0
require (
github.com/containerd/ttrpc v1.2.5
@@ -17,7 +17,7 @@ require (
github.com/golang/protobuf v1.5.3 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/sirupsen/logrus v1.9.3 // indirect
golang.org/x/net v0.23.0 // indirect
golang.org/x/sys v0.18.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/net v0.37.0 // indirect
golang.org/x/sys v0.31.0 // indirect
golang.org/x/text v0.23.0 // indirect
)

View File

@@ -42,8 +42,8 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs=
golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg=
golang.org/x/net v0.37.0 h1:1zLorHbz+LYj7MQlSf1+2tPIIgibq2eL5xkrGk6f+2c=
golang.org/x/net v0.37.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -51,12 +51,12 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4=
golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik=
golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=

View File

@@ -279,7 +279,8 @@ func (c *container) NewTask(ctx context.Context, ioCreate cio.Creator, opts ...N
}
}
info := TaskInfo{
runtime: r.Runtime.Name,
runtime: r.Runtime.Name,
runtimeOptions: r.Runtime.Options,
}
for _, o := range opts {
if err := o(ctx, c.client, &info); err != nil {

View File

@@ -146,6 +146,11 @@ type TaskInfo struct {
// runtime is the runtime name for the container, and cannot be changed.
runtime string
// runtimeOptions is the runtime options for the container, and when task options are set,
// they will be based on the runtimeOptions.
// https://github.com/containerd/containerd/issues/11568
runtimeOptions typeurl.Any
}
// Runtime name for the container
@@ -153,6 +158,29 @@ func (i *TaskInfo) Runtime() string {
return i.runtime
}
// getRuncOptions returns a reference to the runtime options for use by the task.
// If the set of options is not set by the opts passed into the NewTask creation
// this function first attempts to initialize the runtime options with a copy of the runtimeOptions,
// otherwise an empty set of options is assigned and returned
func (i *TaskInfo) getRuncOptions() (*options.Options, error) {
if i.Options != nil {
opts, ok := i.Options.(*options.Options)
if !ok {
return nil, errors.New("invalid runtime v2 options format")
}
return opts, nil
}
opts := &options.Options{}
if i.runtimeOptions != nil && i.runtimeOptions.GetValue() != nil {
if err := typeurl.UnmarshalTo(i.runtimeOptions, opts); err != nil {
return nil, fmt.Errorf("failed to get runtime v2 options: %w", err)
}
}
i.Options = opts
return opts, nil
}
// Task is the executable object within containerd
type Task interface {
Process

View File

@@ -54,12 +54,9 @@ func WithRuntimePath(absRuntimePath string) NewTaskOpts {
// usually it is served inside a sandbox, and we can get it from sandbox status.
func WithTaskAPIEndpoint(address string, version uint32) NewTaskOpts {
return func(ctx context.Context, client *Client, info *TaskInfo) error {
if info.Options == nil {
info.Options = &options.Options{}
}
opts, ok := info.Options.(*options.Options)
if !ok {
return errors.New("invalid runtime v2 options format")
opts, err := info.getRuncOptions()
if err != nil {
return err
}
opts.TaskApiAddress = address
opts.TaskApiVersion = version
@@ -119,12 +116,9 @@ func WithCheckpointImagePath(path string) CheckpointTaskOpts {
// WithRestoreImagePath sets image path for create option
func WithRestoreImagePath(path string) NewTaskOpts {
return func(ctx context.Context, c *Client, ti *TaskInfo) error {
if ti.Options == nil {
ti.Options = &options.Options{}
}
opts, ok := ti.Options.(*options.Options)
if !ok {
return errors.New("invalid runtime v2 options format")
opts, err := ti.getRuncOptions()
if err != nil {
return err
}
opts.CriuImagePath = path
return nil
@@ -134,12 +128,9 @@ func WithRestoreImagePath(path string) NewTaskOpts {
// WithRestoreWorkPath sets criu work path for create option
func WithRestoreWorkPath(path string) NewTaskOpts {
return func(ctx context.Context, c *Client, ti *TaskInfo) error {
if ti.Options == nil {
ti.Options = &options.Options{}
}
opts, ok := ti.Options.(*options.Options)
if !ok {
return errors.New("invalid runtime v2 options format")
opts, err := ti.getRuncOptions()
if err != nil {
return err
}
opts.CriuWorkPath = path
return nil

View File

@@ -20,20 +20,14 @@ package client
import (
"context"
"errors"
"github.com/containerd/containerd/api/types/runc/options"
)
// WithNoNewKeyring causes tasks not to be created with a new keyring for secret storage.
// There is an upper limit on the number of keyrings in a linux system
func WithNoNewKeyring(ctx context.Context, c *Client, ti *TaskInfo) error {
if ti.Options == nil {
ti.Options = &options.Options{}
}
opts, ok := ti.Options.(*options.Options)
if !ok {
return errors.New("invalid v2 shim create options format")
opts, err := ti.getRuncOptions()
if err != nil {
return err
}
opts.NoNewKeyring = true
return nil
@@ -41,12 +35,9 @@ func WithNoNewKeyring(ctx context.Context, c *Client, ti *TaskInfo) error {
// WithNoPivotRoot instructs the runtime not to you pivot_root
func WithNoPivotRoot(_ context.Context, _ *Client, ti *TaskInfo) error {
if ti.Options == nil {
ti.Options = &options.Options{}
}
opts, ok := ti.Options.(*options.Options)
if !ok {
return errors.New("invalid v2 shim create options format")
opts, err := ti.getRuncOptions()
if err != nil {
return err
}
opts.NoPivotRoot = true
return nil
@@ -55,12 +46,9 @@ func WithNoPivotRoot(_ context.Context, _ *Client, ti *TaskInfo) error {
// WithShimCgroup sets the existing cgroup for the shim
func WithShimCgroup(path string) NewTaskOpts {
return func(ctx context.Context, c *Client, ti *TaskInfo) error {
if ti.Options == nil {
ti.Options = &options.Options{}
}
opts, ok := ti.Options.(*options.Options)
if !ok {
return errors.New("invalid v2 shim create options format")
opts, err := ti.getRuncOptions()
if err != nil {
return err
}
opts.ShimCgroup = path
return nil
@@ -70,12 +58,9 @@ func WithShimCgroup(path string) NewTaskOpts {
// WithUIDOwner allows console I/O to work with the remapped UID in user namespace
func WithUIDOwner(uid uint32) NewTaskOpts {
return func(ctx context.Context, c *Client, ti *TaskInfo) error {
if ti.Options == nil {
ti.Options = &options.Options{}
}
opts, ok := ti.Options.(*options.Options)
if !ok {
return errors.New("invalid v2 shim create options format")
opts, err := ti.getRuncOptions()
if err != nil {
return err
}
opts.IoUid = uid
return nil
@@ -85,12 +70,9 @@ func WithUIDOwner(uid uint32) NewTaskOpts {
// WithGIDOwner allows console I/O to work with the remapped GID in user namespace
func WithGIDOwner(gid uint32) NewTaskOpts {
return func(ctx context.Context, c *Client, ti *TaskInfo) error {
if ti.Options == nil {
ti.Options = &options.Options{}
}
opts, ok := ti.Options.(*options.Options)
if !ok {
return errors.New("invalid v2 shim create options format")
opts, err := ti.getRuncOptions()
if err != nil {
return err
}
opts.IoGid = gid
return nil

View File

@@ -22,6 +22,9 @@ import (
_ "github.com/containerd/containerd/v2/core/metrics/cgroups/v2"
_ "github.com/containerd/containerd/v2/plugins/diff/walking/plugin"
_ "github.com/containerd/containerd/v2/plugins/snapshots/blockfile/plugin"
_ "github.com/containerd/containerd/v2/plugins/snapshots/btrfs/plugin"
_ "github.com/containerd/containerd/v2/plugins/snapshots/native/plugin"
_ "github.com/containerd/containerd/v2/plugins/snapshots/overlay/plugin"
_ "github.com/containerd/fuse-overlayfs-snapshotter/v2/plugin"
_ "github.com/containerd/stargz-snapshotter/service/plugin"
)

View File

@@ -37,9 +37,7 @@ import (
"github.com/containerd/log"
"github.com/containerd/ttrpc"
"github.com/docker/go-metrics"
grpc_prometheus "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus"
v1 "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/prometheus/client_golang/prometheus"
"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
"google.golang.org/grpc"
"google.golang.org/grpc/backoff"
@@ -151,25 +149,10 @@ func New(ctx context.Context, config *srvconfig.Config) (*Server, error) {
diff.RegisterProcessor(diff.BinaryHandler(id, p.Returns, p.Accepts, p.Path, p.Args, p.Env))
}
var prometheusServerMetricsOpts []grpc_prometheus.ServerMetricsOption
if config.Metrics.GRPCHistogram {
// Enable grpc time histograms to measure rpc latencies
prometheusServerMetricsOpts = append(prometheusServerMetricsOpts, grpc_prometheus.WithServerHandlingTimeHistogram())
}
prometheusServerMetrics := grpc_prometheus.NewServerMetrics(prometheusServerMetricsOpts...)
prometheus.MustRegister(prometheusServerMetrics)
serverOpts := []grpc.ServerOption{
grpc.StatsHandler(otelgrpc.NewServerHandler()),
grpc.ChainStreamInterceptor(
streamNamespaceInterceptor,
prometheusServerMetrics.StreamServerInterceptor(),
),
grpc.ChainUnaryInterceptor(
unaryNamespaceInterceptor,
prometheusServerMetrics.UnaryServerInterceptor(),
),
grpc.StreamInterceptor(streamNamespaceInterceptor),
grpc.UnaryInterceptor(unaryNamespaceInterceptor),
}
if config.GRPC.MaxRecvMsgSize > 0 {
serverOpts = append(serverOpts, grpc.MaxRecvMsgSize(config.GRPC.MaxRecvMsgSize))
@@ -229,11 +212,10 @@ func New(ctx context.Context, config *srvconfig.Config) (*Server, error) {
ttrpcServices []ttrpcService
s = &Server{
prometheusServerMetrics: prometheusServerMetrics,
grpcServer: grpcServer,
tcpServer: tcpServer,
ttrpcServer: ttrpcServer,
config: config,
grpcServer: grpcServer,
tcpServer: tcpServer,
ttrpcServer: ttrpcServer,
config: config,
}
initialized = plugin.NewPluginSet()
required = make(map[string]struct{})
@@ -381,18 +363,16 @@ func recordConfigDeprecations(ctx context.Context, config *srvconfig.Config, set
// Server is the containerd main daemon
type Server struct {
prometheusServerMetrics *grpc_prometheus.ServerMetrics
grpcServer *grpc.Server
ttrpcServer *ttrpc.Server
tcpServer *grpc.Server
config *srvconfig.Config
plugins []*plugin.Plugin
ready sync.WaitGroup
grpcServer *grpc.Server
ttrpcServer *ttrpc.Server
tcpServer *grpc.Server
config *srvconfig.Config
plugins []*plugin.Plugin
ready sync.WaitGroup
}
// ServeGRPC provides the containerd grpc APIs on the provided listener
func (s *Server) ServeGRPC(l net.Listener) error {
s.prometheusServerMetrics.InitializeMetrics(s.grpcServer)
return trapClosedConnErr(s.grpcServer.Serve(l))
}
@@ -414,7 +394,6 @@ func (s *Server) ServeMetrics(l net.Listener) error {
// ServeTCP allows services to serve over tcp
func (s *Server) ServeTCP(l net.Listener) error {
s.prometheusServerMetrics.InitializeMetrics(s.tcpServer)
return trapClosedConnErr(s.tcpServer.Serve(l))
}

View File

@@ -34,7 +34,7 @@
# docker run --privileged --group-add keep-groups -v ./critest_exit_code.txt:/tmp/critest_exit_code.txt containerd-test
# ------------------------------------------------------------------------------
ARG GOLANG_VERSION=1.23.7
ARG GOLANG_VERSION=1.23.8
ARG GOLANG_IMAGE=golang
FROM ${GOLANG_IMAGE}:${GOLANG_VERSION} AS golang

View File

@@ -43,11 +43,11 @@ go run main.go --target_dir $SRC/containerd/images
apt-get update && apt-get install -y wget
cd $SRC
wget --quiet https://go.dev/dl/go1.23.7.linux-amd64.tar.gz
wget --quiet https://go.dev/dl/go1.23.8.linux-amd64.tar.gz
mkdir temp-go
rm -rf /root/.go/*
tar -C temp-go/ -xzf go1.23.7.linux-amd64.tar.gz
tar -C temp-go/ -xzf go1.23.8.linux-amd64.tar.gz
mv temp-go/go/* /root/.go/
cd $SRC/containerd

View File

@@ -54,6 +54,7 @@ type hostConfig struct {
header http.Header
rewrites map[string]string
// TODO: Add credential configuration (domain alias, username)
}
@@ -263,6 +264,7 @@ func ConfigureHosts(ctx context.Context, options HostOptions) docker.RegistryHos
rhosts[i].Path = host.path
rhosts[i].Capabilities = host.capabilities
rhosts[i].Header = host.header
rhosts[i].Rewrites = host.rewrites
}
return rhosts, nil
@@ -352,6 +354,10 @@ type hostFileConfig struct {
// API root endpoint.
OverridePath bool `toml:"override_path"`
// Rewrites contains a map of regex/replacement values, used to modify
// the image name when pulling.
Rewrites map[string]string `toml:"rewrite"`
// TODO: Credentials: helper? name? username? alternate domain? token?
}
@@ -389,13 +395,22 @@ func parseHostsFile(baseDir string, b []byte) ([]hostConfig, error) {
hosts = append(hosts, parsed)
}
// Parse root host config and append it as the last element
parsed, err := parseHostConfig(c.Server, baseDir, c.hostFileConfig)
tree, err := getTopLevelKeyValues(b)
if err != nil {
return nil, err
}
hosts = append(hosts, parsed)
// If the server key is set at the root of the tree, or no other hosts are configured,
// parse the root host config and append it as the last element.
// This allows not using upstream by including hosts and excluding the server key as documented at
// https://github.com/containerd/containerd/blob/release/1.7/docs/hosts.md#setup-a-local-mirror-for-docker
if _, ok := tree["server"]; ok || len(hosts) == 0 {
parsed, err := parseHostConfig(c.Server, baseDir, c.hostFileConfig)
if err != nil {
return nil, err
}
hosts = append(hosts, parsed)
}
return hosts, nil
}
@@ -427,6 +442,7 @@ func parseHostConfig(server string, baseDir string, config hostFileConfig) (host
}
result.skipVerify = config.SkipVerify
result.rewrites = config.Rewrites
if len(config.Capabilities) > 0 {
for _, c := range config.Capabilities {
@@ -515,6 +531,32 @@ func parseHostConfig(server string, baseDir string, config hostFileConfig) (host
return result, nil
}
func getTopLevelKeyValues(b []byte) (map[string]string, error) {
keyVals := map[string]string{}
// Use toml unstable package for directly parsing toml
// See https://github.com/pelletier/go-toml/discussions/801#discussioncomment-7083586
p := tomlu.Parser{}
p.Reset(b)
// iterate over top level expressions until we find something that's not a simple KeyValue
for p.NextExpression() {
e := p.Expression()
if e.Kind != tomlu.KeyValue {
break
}
k := e.Key()
v := e.Value()
if keyNode := k.Node(); keyNode != nil && v != nil {
keyVals[string(keyNode.Data)] = string(v.Data)
}
}
return keyVals, p.Error()
}
// getSortedHosts returns the list of hosts in the order are they defined in the file.
func getSortedHosts(b []byte) ([]string, error) {
var hostsInOrder []string

View File

@@ -49,14 +49,14 @@ func (r dockerFetcher) Fetch(ctx context.Context, desc ocispec.Descriptor) (io.R
return nil, fmt.Errorf("no pull hosts: %w", errdefs.ErrNotFound)
}
ctx, err := ContextWithRepositoryScope(ctx, r.refspec, false)
if err != nil {
return nil, err
}
return newHTTPReadSeeker(desc.Size, func(offset int64) (io.ReadCloser, error) {
// firstly try fetch via external urls
for _, us := range desc.URLs {
ctx, err := ContextWithRepositoryScope(ctx, r.refspec, false)
if err != nil {
return nil, err
}
u, err := url.Parse(us)
if err != nil {
log.G(ctx).WithError(err).Debugf("failed to parse %q", us)
@@ -102,8 +102,14 @@ func (r dockerFetcher) Fetch(ctx context.Context, desc ocispec.Descriptor) (io.R
var firstErr error
for _, host := range r.hosts {
req := r.request(host, http.MethodGet, "manifests", desc.Digest.String())
if err := req.addNamespace(r.refspec.Hostname()); err != nil {
base := r.withRewritesFromHost(host)
ctx, err := ContextWithRepositoryScope(ctx, base.refspec, false)
if err != nil {
return nil, err
}
req := base.request(host, http.MethodGet, "manifests", desc.Digest.String())
if err := req.addNamespace(base.refspec.Hostname()); err != nil {
return nil, err
}
@@ -125,8 +131,14 @@ func (r dockerFetcher) Fetch(ctx context.Context, desc ocispec.Descriptor) (io.R
// Finally use blobs endpoints
var firstErr error
for _, host := range r.hosts {
req := r.request(host, http.MethodGet, "blobs", desc.Digest.String())
if err := req.addNamespace(r.refspec.Hostname()); err != nil {
base := r.withRewritesFromHost(host)
ctx, err := ContextWithRepositoryScope(ctx, base.refspec, false)
if err != nil {
return nil, err
}
req := base.request(host, http.MethodGet, "blobs", desc.Digest.String())
if err := req.addNamespace(base.refspec.Hostname()); err != nil {
return nil, err
}
@@ -154,8 +166,14 @@ func (r dockerFetcher) Fetch(ctx context.Context, desc ocispec.Descriptor) (io.R
}
func (r dockerFetcher) createGetReq(ctx context.Context, host RegistryHost, mediatype string, ps ...string) (*request, int64, error) {
headReq := r.request(host, http.MethodHead, ps...)
if err := headReq.addNamespace(r.refspec.Hostname()); err != nil {
base := r.withRewritesFromHost(host)
ctx, err := ContextWithRepositoryScope(ctx, base.refspec, false)
if err != nil {
return nil, 0, err
}
headReq := base.request(host, http.MethodHead, ps...)
if err := headReq.addNamespace(base.refspec.Hostname()); err != nil {
return nil, 0, err
}
@@ -176,8 +194,8 @@ func (r dockerFetcher) createGetReq(ctx context.Context, host RegistryHost, medi
return nil, 0, fmt.Errorf("unexpected HEAD status code %v: %s", headReq.String(), headResp.Status)
}
getReq := r.request(host, http.MethodGet, ps...)
if err := getReq.addNamespace(r.refspec.Hostname()); err != nil {
getReq := base.request(host, http.MethodGet, ps...)
if err := getReq.addNamespace(base.refspec.Hostname()); err != nil {
return nil, 0, err
}
return getReq, headResp.ContentLength, nil
@@ -198,15 +216,10 @@ func (r dockerFetcher) FetchByDigest(ctx context.Context, dgst digest.Digest, op
return nil, desc, fmt.Errorf("no pull hosts: %w", errdefs.ErrNotFound)
}
ctx, err := ContextWithRepositoryScope(ctx, r.refspec, false)
if err != nil {
return nil, desc, err
}
var (
getReq *request
sz int64
firstErr error
getReq *request
sz int64
err, firstErr error
)
for _, host := range r.hosts {

View File

@@ -72,10 +72,6 @@ func (p dockerPusher) push(ctx context.Context, desc ocispec.Descriptor, ref str
l.Lock(ref)
defer l.Unlock(ref)
}
ctx, err := ContextWithRepositoryScope(ctx, p.refspec, true)
if err != nil {
return nil, err
}
status, err := p.tracker.GetStatus(ref)
if err == nil {
if status.Committed && status.Offset == status.Total {
@@ -103,6 +99,12 @@ func (p dockerPusher) push(ctx context.Context, desc ocispec.Descriptor, ref str
host = hosts[0]
)
base := p.withRewritesFromHost(host)
ctx, err = ContextWithRepositoryScope(ctx, base.refspec, true)
if err != nil {
return nil, err
}
if images.IsManifestType(desc.MediaType) || images.IsIndexType(desc.MediaType) {
isManifest = true
existCheck = getManifestPath(p.object, desc.Digest)
@@ -110,7 +112,7 @@ func (p dockerPusher) push(ctx context.Context, desc ocispec.Descriptor, ref str
existCheck = []string{"blobs", desc.Digest.String()}
}
req := p.request(host, http.MethodHead, existCheck...)
req := base.request(host, http.MethodHead, existCheck...)
req.header.Set("Accept", strings.Join([]string{desc.MediaType, `*/*`}, ", "))
log.G(ctx).WithField("url", req.String()).Debugf("checking and pushing to")
@@ -158,11 +160,11 @@ func (p dockerPusher) push(ctx context.Context, desc ocispec.Descriptor, ref str
if isManifest {
putPath := getManifestPath(p.object, desc.Digest)
req = p.request(host, http.MethodPut, putPath...)
req = base.request(host, http.MethodPut, putPath...)
req.header.Add("Content-Type", desc.MediaType)
} else {
// Start upload request
req = p.request(host, http.MethodPost, "blobs", "uploads/")
req = base.request(host, http.MethodPost, "blobs", "uploads/")
mountedFrom := ""
var resp *http.Response
@@ -477,13 +479,15 @@ func (pw *pushWriter) Digest() digest.Digest {
func (pw *pushWriter) Commit(ctx context.Context, size int64, expected digest.Digest, opts ...content.Opt) error {
// Check whether read has already thrown an error
if _, err := pw.pipe.Write([]byte{}); err != nil && !errors.Is(err, io.ErrClosedPipe) {
return fmt.Errorf("pipe error before commit: %w", err)
if pw.pipe != nil {
if _, err := pw.pipe.Write([]byte{}); err != nil && !errors.Is(err, io.ErrClosedPipe) {
return fmt.Errorf("pipe error before commit: %w", err)
}
if err := pw.pipe.Close(); err != nil {
return err
}
}
if err := pw.pipe.Close(); err != nil {
return err
}
// TODO: timeout waiting for response
var resp *http.Response
select {

View File

@@ -74,6 +74,7 @@ type RegistryHost struct {
Path string
Capabilities HostCapabilities
Header http.Header
Rewrites map[string]string
}
func (h RegistryHost) isProxy(refhost string) bool {

View File

@@ -27,6 +27,7 @@ import (
"net/url"
"os"
"path"
"regexp"
"strings"
"sync"
@@ -239,14 +240,13 @@ func (r *dockerResolver) Resolve(ctx context.Context, ref string) (string, ocisp
if err != nil {
return "", ocispec.Descriptor{}, err
}
refspec := base.refspec
if refspec.Object == "" {
if base.refspec.Object == "" {
return "", ocispec.Descriptor{}, reference.ErrObjectRequired
}
var (
paths [][]string
dgst = refspec.Digest()
dgst = base.refspec.Digest()
caps = HostCapabilityPull
)
@@ -264,7 +264,7 @@ func (r *dockerResolver) Resolve(ctx context.Context, ref string) (string, ocisp
paths = append(paths, []string{"blobs", dgst.String()})
} else {
// Add
paths = append(paths, []string{"manifests", refspec.Object})
paths = append(paths, []string{"manifests", base.refspec.Object})
caps |= HostCapabilityResolve
}
@@ -273,11 +273,6 @@ func (r *dockerResolver) Resolve(ctx context.Context, ref string) (string, ocisp
return "", ocispec.Descriptor{}, fmt.Errorf("no resolve hosts: %w", errdefs.ErrNotFound)
}
ctx, err = ContextWithRepositoryScope(ctx, refspec, false)
if err != nil {
return "", ocispec.Descriptor{}, err
}
var (
// firstErr is the most relevant error encountered during resolution.
// We use this to determine the error to return, making sure that the
@@ -296,7 +291,11 @@ func (r *dockerResolver) Resolve(ctx context.Context, ref string) (string, ocisp
for _, u := range paths {
for i, host := range hosts {
ctx := log.WithLogger(ctx, log.G(ctx).WithField("host", host.Host))
base := base.withRewritesFromHost(host)
ctx, err = ContextWithRepositoryScope(ctx, base.refspec, false)
if err != nil {
return "", ocispec.Descriptor{}, err
}
req := base.request(host, http.MethodHead, u...)
if err := req.addNamespace(base.refspec.Hostname()); err != nil {
return "", ocispec.Descriptor{}, err
@@ -501,7 +500,6 @@ func (r *dockerBase) request(host RegistryHost, method string, ps ...string) *re
if header == nil {
header = http.Header{}
}
for key, value := range host.Header {
header[key] = append(header[key], value...)
}
@@ -519,6 +517,28 @@ func (r *dockerBase) request(host RegistryHost, method string, ps ...string) *re
}
}
// withRewritesFromHost returns a dockerBase whose repository path (and the
// Locator of its refspec) has been rewritten according to the first matching
// rule in host.Rewrites. If no rule changes the repository, the receiver is
// returned unmodified. The registry hostname and the reference Object
// (tag/digest) are preserved; only the repository portion is rewritten.
func (r *dockerBase) withRewritesFromHost(host RegistryHost) *dockerBase {
// NOTE(review): host.Rewrites is a map, and Go map iteration order is
// randomized — if more than one pattern can match the same repository,
// which rewrite wins is nondeterministic across calls. Presumably the
// rules are intended to be mutually exclusive; confirm with the
// hosts.toml loader.
for pattern, replace := range host.Rewrites {
exp, err := regexp.Compile(pattern)
if err != nil {
// An invalid pattern is logged and skipped rather than failing the
// whole operation, so one bad rule does not break image pulls.
log.L.WithError(err).Warnf("Failed to compile rewrite, `%s`, for %s", pattern, host.Host)
continue
}
// Apply the regexp replacement (capture groups like $1 are supported by
// ReplaceAllString). A rule "matches" only if it actually changes the
// repository string; the first such rule short-circuits the loop.
if rr := exp.ReplaceAllString(r.repository, replace); rr != r.repository {
log.L.Debugf("Rewrote repository for %s: %s => %s", r.refspec, r.repository, rr)
return &dockerBase{
refspec: reference.Spec{
// Rebuild the locator from the original hostname plus the
// rewritten repository; the Object (tag/digest) is untouched.
Locator: r.refspec.Hostname() + "/" + rr,
Object: r.refspec.Object,
},
repository: rr,
header: r.header,
}
}
}
// No rule applied: hand back the receiver unchanged (no allocation).
return r
}
func (r *request) authorize(ctx context.Context, req *http.Request) error {
// Check if has header for host
if r.host.Authorizer != nil {

View File

@@ -91,9 +91,9 @@ func loadShim(ctx context.Context, bundle *Bundle, onClose func()) (_ ShimInstan
// To prevent flood of error messages, the expected error
// should be reset, like os.ErrClosed or os.ErrNotExist, which
// depends on platform.
err = checkCopyShimLogError(ctx, err)
err = checkCopyShimLogError(shimCtx, err)
if err != nil {
log.G(ctx).WithError(err).Error("copy shim log after reload")
log.G(shimCtx).WithError(err).Error("copy shim log after reload")
}
}()
onCloseWithShimLog := func() {

View File

@@ -266,12 +266,12 @@ func (m *TaskManager) validateRuntimeFeatures(ctx context.Context, opts runtime.
return nil
}
ropts := opts.RuntimeOptions
if ropts == nil || ropts.GetValue() == nil {
ropts = opts.TaskOptions
topts := opts.TaskOptions
if topts == nil || topts.GetValue() == nil {
topts = opts.RuntimeOptions
}
pInfo, err := m.PluginInfo(ctx, &apitypes.RuntimeRequest{RuntimePath: opts.Runtime, Options: typeurl.MarshalProto(ropts)})
pInfo, err := m.PluginInfo(ctx, &apitypes.RuntimeRequest{RuntimePath: opts.Runtime, Options: typeurl.MarshalProto(topts)})
if err != nil {
return fmt.Errorf("runtime info: %w", err)
}

View File

@@ -21,4 +21,6 @@ const (
// This will be based on the client compilation target, so take that into
// account when choosing this value.
DefaultSnapshotter = "overlayfs"
// DefaultDiffer will set the default differ for the platform.
DefaultDiffer = "walking"
)

View File

@@ -23,4 +23,6 @@ const (
// This will be based on the client compilation target, so take that into
// account when choosing this value.
DefaultSnapshotter = "native"
// DefaultDiffer will set the default differ for the platform.
DefaultDiffer = "walking"
)

48
go.mod
View File

@@ -1,6 +1,6 @@
module github.com/containerd/containerd/v2
go 1.22.0
go 1.23.0
require (
dario.cat/mergo v1.0.1
@@ -14,10 +14,11 @@ require (
github.com/containerd/cgroups/v3 v3.0.3
github.com/containerd/console v1.0.4
github.com/containerd/containerd/api v1.8.0
github.com/containerd/continuity v0.4.4
github.com/containerd/continuity v0.4.5
github.com/containerd/errdefs v1.0.0
github.com/containerd/errdefs/pkg v0.3.0
github.com/containerd/fifo v1.1.0
github.com/containerd/fuse-overlayfs-snapshotter/v2 v2.1.0
github.com/containerd/go-cni v1.1.12
github.com/containerd/go-runc v1.1.0
github.com/containerd/imgcrypt/v2 v2.0.0
@@ -26,6 +27,7 @@ require (
github.com/containerd/otelttrpc v0.1.0
github.com/containerd/platforms v1.0.0-rc.1
github.com/containerd/plugin v1.0.0
github.com/containerd/stargz-snapshotter v0.16.3
github.com/containerd/ttrpc v1.2.7
github.com/containerd/typeurl/v2 v2.2.3
github.com/containerd/zfs/v2 v2.0.0-rc.0
@@ -40,7 +42,6 @@ require (
github.com/fsnotify/fsnotify v1.7.0
github.com/google/go-cmp v0.6.0
github.com/google/uuid v1.6.0
github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1
github.com/intel/goresctrl v0.8.0
github.com/klauspost/compress v1.17.11
github.com/mdlayher/vsock v1.2.1
@@ -73,19 +74,19 @@ require (
go.opentelemetry.io/otel/sdk v1.31.0
go.opentelemetry.io/otel/trace v1.31.0
golang.org/x/mod v0.21.0
golang.org/x/sync v0.10.0
golang.org/x/sys v0.28.0
golang.org/x/sync v0.12.0
golang.org/x/sys v0.31.0
google.golang.org/genproto/googleapis/rpc v0.0.0-20241021214115-324edc3d5d38
google.golang.org/grpc v1.68.1
google.golang.org/protobuf v1.35.2
k8s.io/apimachinery v0.31.2
k8s.io/client-go v0.31.2
k8s.io/component-base v0.31.2
k8s.io/cri-api v0.31.2
k8s.io/cri-api v0.32.0-alpha.0
k8s.io/klog/v2 v2.130.1
k8s.io/kubelet v0.31.2
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8
tags.cncf.io/container-device-interface v0.8.1
tags.cncf.io/container-device-interface v1.0.0
)
require (
@@ -93,23 +94,37 @@ require (
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cilium/ebpf v0.11.0 // indirect
github.com/containerd/stargz-snapshotter/estargz v0.16.3 // indirect
github.com/containers/ocicrypt v1.2.1 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.5 // indirect
github.com/docker/cli v27.3.1+incompatible // indirect
github.com/docker/docker-credential-helpers v0.7.0 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/go-jose/go-jose/v4 v4.0.4 // indirect
github.com/go-jose/go-jose/v4 v4.0.5 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.22.4 // indirect
github.com/godbus/dbus/v5 v5.1.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/gorilla/websocket v1.5.0 // indirect
github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 // indirect
github.com/hanwen/go-fuse/v2 v2.6.3 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-cleanhttp v0.5.2 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/hashicorp/go-retryablehttp v0.7.7 // indirect
github.com/imdario/mergo v0.3.13 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mdlayher/socket v0.4.1 // indirect
github.com/miekg/pkcs11 v1.1.1 // indirect
github.com/mistifyio/go-zfs/v3 v3.0.1 // indirect
@@ -127,20 +142,22 @@ require (
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/sasha-s/go-deadlock v0.3.5 // indirect
github.com/smallstep/pkcs7 v0.1.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/stefanberger/go-pkcs11uri v0.0.0-20230803200340-78284954bff6 // indirect
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect
github.com/vbatts/tar-split v0.11.6 // indirect
github.com/vishvananda/netns v0.0.4 // indirect
github.com/x448/float16 v0.8.4 // indirect
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/otel/metric v1.31.0 // indirect
go.opentelemetry.io/proto/otlp v1.3.1 // indirect
golang.org/x/crypto v0.31.0 // indirect
golang.org/x/crypto v0.36.0 // indirect
golang.org/x/exp v0.0.0-20231214170342-aacd6d4b4611 // indirect
golang.org/x/net v0.33.0 // indirect
golang.org/x/oauth2 v0.23.0 // indirect
golang.org/x/term v0.27.0 // indirect
golang.org/x/text v0.21.0 // indirect
golang.org/x/net v0.37.0 // indirect
golang.org/x/oauth2 v0.28.0 // indirect
golang.org/x/term v0.30.0 // indirect
golang.org/x/text v0.23.0 // indirect
golang.org/x/time v0.3.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20241007155032-5fefd90f89a9 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
@@ -148,10 +165,11 @@ require (
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/api v0.31.2 // indirect
k8s.io/apiserver v0.31.2 // indirect
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
tags.cncf.io/container-device-interface/specs-go v0.8.0 // indirect
tags.cncf.io/container-device-interface/specs-go v1.0.0 // indirect
)
exclude (

84
go.sum
View File

@@ -41,14 +41,16 @@ github.com/containerd/console v1.0.4 h1:F2g4+oChYvBTsASRTz8NP6iIAi97J3TtSAsLbIFn
github.com/containerd/console v1.0.4/go.mod h1:YynlIjWYF8myEu6sdkwKIvGQq+cOckRm6So2avqoYAk=
github.com/containerd/containerd/api v1.8.0 h1:hVTNJKR8fMc/2Tiw60ZRijntNMd1U+JVMyTRdsD2bS0=
github.com/containerd/containerd/api v1.8.0/go.mod h1:dFv4lt6S20wTu/hMcP4350RL87qPWLVa/OHOwmmdnYc=
github.com/containerd/continuity v0.4.4 h1:/fNVfTJ7wIl/YPMHjf+5H32uFhl63JucB34PlCpMKII=
github.com/containerd/continuity v0.4.4/go.mod h1:/lNJvtJKUQStBzpVQ1+rasXO1LAWtUQssk28EZvJ3nE=
github.com/containerd/continuity v0.4.5 h1:ZRoN1sXq9u7V6QoHMcVWGhOwDFqZ4B9i5H6un1Wh0x4=
github.com/containerd/continuity v0.4.5/go.mod h1:/lNJvtJKUQStBzpVQ1+rasXO1LAWtUQssk28EZvJ3nE=
github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI=
github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M=
github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE=
github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk=
github.com/containerd/fifo v1.1.0 h1:4I2mbh5stb1u6ycIABlBw9zgtlK8viPI9QkQNRQEEmY=
github.com/containerd/fifo v1.1.0/go.mod h1:bmC4NWMbXlt2EZ0Hc7Fx7QzTFxgPID13eH0Qu+MAb2o=
github.com/containerd/fuse-overlayfs-snapshotter/v2 v2.1.0 h1:okk7wQXjHJhG+Y+Rs3wToje/yHJInlE3DjLNQNJ1WGI=
github.com/containerd/fuse-overlayfs-snapshotter/v2 v2.1.0/go.mod h1:yK/eAdWigKE4XsBi8WDHV52jO8MJZcTRU5tCVOnEE9w=
github.com/containerd/go-cni v1.1.12 h1:wm/5VD/i255hjM4uIZjBRiEQ7y98W9ACy/mHeLi4+94=
github.com/containerd/go-cni v1.1.12/go.mod h1:+jaqRBdtW5faJxj2Qwg1Of7GsV66xcvnCx4mSJtUlxU=
github.com/containerd/go-runc v1.1.0 h1:OX4f+/i2y5sUT7LhmcJH7GYrjjhHa1QI4e8yO0gGleA=
@@ -65,6 +67,10 @@ github.com/containerd/platforms v1.0.0-rc.1 h1:83KIq4yy1erSRgOVHNk1HYdPvzdJ5CnsW
github.com/containerd/platforms v1.0.0-rc.1/go.mod h1:J71L7B+aiM5SdIEqmd9wp6THLVRzJGXfNuWCZCllLA4=
github.com/containerd/plugin v1.0.0 h1:c8Kf1TNl6+e2TtMHZt+39yAPDbouRH9WAToRjex483Y=
github.com/containerd/plugin v1.0.0/go.mod h1:hQfJe5nmWfImiqT1q8Si3jLv3ynMUIBB47bQ+KexvO8=
github.com/containerd/stargz-snapshotter v0.16.3 h1:zbQMm8dRuPHEOD4OqAYGajJJUwCeUzt4j7w9Iaw58u4=
github.com/containerd/stargz-snapshotter v0.16.3/go.mod h1:XPOl2oa9zjWidTM2IX191smolwWc3/zkKtp02TzTFb0=
github.com/containerd/stargz-snapshotter/estargz v0.16.3 h1:7evrXtoh1mSbGj/pfRccTampEyKpjpOnS3CyiV1Ebr8=
github.com/containerd/stargz-snapshotter/estargz v0.16.3/go.mod h1:uyr4BfYfOj3G9WBVE8cOlQmXAbPN9VEQpBBeJIuOipU=
github.com/containerd/ttrpc v1.2.7 h1:qIrroQvuOL9HQ1X6KHe2ohc7p+HP/0VE6XPU7elJRqQ=
github.com/containerd/ttrpc v1.2.7/go.mod h1:YCXHsb32f+Sq5/72xHubdiJRQY9inL4a4ZQrAbN1q9o=
github.com/containerd/typeurl/v2 v2.2.3 h1:yNA/94zxWdvYACdYO8zofhrTVuQY73fFU1y++dYSw40=
@@ -81,11 +87,16 @@ github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc=
github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
github.com/docker/cli v27.3.1+incompatible h1:qEGdFBF3Xu6SCvCYhc7CzaQTlBmqDuzxPDpigSyeKQQ=
github.com/docker/cli v27.3.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8=
github.com/docker/docker-credential-helpers v0.7.0 h1:xtCHsjxogADNZcdv1pKUHXryefjlVRqWqIhk/uXJp0A=
github.com/docker/docker-credential-helpers v0.7.0/go.mod h1:rETQfLdHNT3foU5kuNkFR1R1V12OJRRO5lzt2D1b5X0=
github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c h1:+pKlWGMw7gf6bQ+oDZB4KHQFypsfjYlq/C4rfL7D3g8=
github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c/go.mod h1:Uw6UezgYA44ePAFQYUehOuCzmy5zmg/+nl2ZfMWGkpA=
github.com/docker/go-metrics v0.0.1 h1:AgB/0SvBxihN0X8OR4SjsblXkbMvalQ8cjmtKQ2rQV8=
@@ -98,6 +109,8 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA=
@@ -106,8 +119,8 @@ github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nos
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E=
github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ=
github.com/go-jose/go-jose/v4 v4.0.4 h1:VsjPI33J0SB9vQM6PLmNjoHqMQNGPiZ0rHL7Ni7Q6/E=
github.com/go-jose/go-jose/v4 v4.0.4/go.mod h1:NKb5HO1EZccyMpiZNbdUw/14tiXNyUJh188dfnMCAfc=
github.com/go-jose/go-jose/v4 v4.0.5 h1:M6T8+mKZl/+fNNuFHvGIzDz7BTLQPIounk/b9dw3AaE=
github.com/go-jose/go-jose/v4 v4.0.5/go.mod h1:s3P1lRrkT8igV8D9OjyL4WRyHvjB6a4JSllnOrmmBOA=
github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE=
github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk=
@@ -120,6 +133,7 @@ github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn
github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE=
github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k=
github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
github.com/go-openapi/swag v0.22.4 h1:QLMzNJnMGPRNDCbySlcj1x01tzU8/9LTTL9hZZZogBU=
github.com/go-openapi/swag v0.22.4/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
@@ -171,17 +185,23 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc=
github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1 h1:qnpSQwGEnkcRpTqNOIR6bJbR0gAorgP9CSALpRcKoAA=
github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1/go.mod h1:lXGCsh6c22WGtjr+qGHj1otzZpV/1kwTMAqkwZsnWRU=
github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 h1:pRhl55Yx1eC7BZ1N+BBWwnKaMyD8uC+34TLdndZMAKk=
github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0/go.mod h1:XKMd7iuf/RGPSMJ/U4HP0zS2Z9Fh8Ps9a+6X26m/tmI=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 h1:asbCHRVmodnJTuQ3qamDwqVOIjwqUPTYmYuemVOx+Ys=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0/go.mod h1:ggCgvZ2r7uOoQjOyu2Y1NhHmEPPzzuhWgcza5M1Ji1I=
github.com/hanwen/go-fuse/v2 v2.6.3 h1:tDcEkLRx93lXu4XyN1/j8Z74VWvhHDl6qU1kNnvFUqI=
github.com/hanwen/go-fuse/v2 v2.6.3/go.mod h1:ugNaD/iv5JYyS1Rcvi57Wz7/vrLQJo10mmketmoef48=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I=
github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ=
github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48=
github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB11/k=
github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU=
github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk=
github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk=
github.com/imdario/mergo v0.3.13/go.mod h1:4lJ1jqUDcsbIECGy0RUJAXNIhg+6ocWgb1ALK2O4oXg=
github.com/intel/goresctrl v0.8.0 h1:N3shVbS3kA1Hk2AmcbHv8805Hjbv+zqsCIZCGktxx50=
github.com/intel/goresctrl v0.8.0/go.mod h1:T3ZZnuHSNouwELB5wvOoUJaB7l/4Rm23rJy/wuWJlr0=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
@@ -197,14 +217,21 @@ github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IX
github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U=
github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA=
@@ -325,6 +352,8 @@ github.com/tchap/go-patricia/v2 v2.3.1/go.mod h1:VZRHKAb53DLaG+nA9EaYYiaEx6YztwD
github.com/urfave/cli v1.19.1/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA=
github.com/urfave/cli/v2 v2.27.5 h1:WoHEJLdsXr6dDWoJgMq/CboDmyY/8HMMH1fTECbih+w=
github.com/urfave/cli/v2 v2.27.5/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ=
github.com/vbatts/tar-split v0.11.6 h1:4SjTW5+PU11n6fZenf2IPoV8/tz3AaYHMWjf23envGs=
github.com/vbatts/tar-split v0.11.6/go.mod h1:dqKNtesIOr2j2Qv3W/cHjnvk9I8+G7oAkFDFN6TCBEI=
github.com/vishvananda/netlink v1.3.0 h1:X7l42GfcV4S6E4vHTsw48qbrV+9PVojNfIhZcwQdrZk=
github.com/vishvananda/netlink v1.3.0/go.mod h1:i6NetklAujEcC6fK0JPjT8qSwWyO0HLn4UKG+hGqeJs=
github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8=
@@ -378,8 +407,8 @@ golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliY
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
golang.org/x/crypto v0.30.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U=
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20231214170342-aacd6d4b4611 h1:qCEDpW1G+vcj3Y7Fy52pEM1AWm3abj8WimGYejI3SC4=
golang.org/x/exp v0.0.0-20231214170342-aacd6d4b4611/go.mod h1:iRJReGqOEeBhDZGkGbynYwcHlctCvnjTYIamk7uXpHI=
@@ -413,11 +442,11 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
golang.org/x/net v0.37.0 h1:1zLorHbz+LYj7MQlSf1+2tPIIgibq2eL5xkrGk6f+2c=
golang.org/x/net v0.37.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs=
golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/oauth2 v0.28.0 h1:CrgCKl8PPAVtLnU3c+EDw6x11699EWlsDeWNWKdIOkc=
golang.org/x/oauth2 v0.28.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -429,8 +458,9 @@ golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -453,8 +483,9 @@ golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik=
golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
@@ -463,8 +494,9 @@ golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q=
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
@@ -473,8 +505,9 @@ golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4=
golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -532,8 +565,11 @@ gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gotest.tools/v3 v3.5.0 h1:Ljk6PdHdOhAb5aDMWXjDLMMhph+BpztA4v1QdqEW2eY=
gotest.tools/v3 v3.5.0/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
k8s.io/api v0.31.2 h1:3wLBbL5Uom/8Zy98GRPXpJ254nEFpl+hwndmk9RwmL0=
@@ -546,8 +582,8 @@ k8s.io/client-go v0.31.2 h1:Y2F4dxU5d3AQj+ybwSMqQnpZH9F30//1ObxOKlTI9yc=
k8s.io/client-go v0.31.2/go.mod h1:NPa74jSVR/+eez2dFsEIHNa+3o09vtNaWwWwb1qSxSs=
k8s.io/component-base v0.31.2 h1:Z1J1LIaC0AV+nzcPRFqfK09af6bZ4D1nAOpWsy9owlA=
k8s.io/component-base v0.31.2/go.mod h1:9PeyyFN/drHjtJZMCTkSpQJS3U9OXORnHQqMLDz0sUQ=
k8s.io/cri-api v0.31.2 h1:O/weUnSHvM59nTio0unxIUFyRHMRKkYn96YDILSQKmo=
k8s.io/cri-api v0.31.2/go.mod h1:Po3TMAYH/+KrZabi7QiwQI4a692oZcUOUThd/rqwxrI=
k8s.io/cri-api v0.32.0-alpha.0 h1:Rs9prajcHWZAdy9ueQdD2R+OOnDD3rKYbM9hQ90iEQU=
k8s.io/cri-api v0.32.0-alpha.0/go.mod h1:Po3TMAYH/+KrZabi7QiwQI4a692oZcUOUThd/rqwxrI=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag=
@@ -562,7 +598,7 @@ sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+s
sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08=
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=
tags.cncf.io/container-device-interface v0.8.1 h1:c0jN4Mt6781jD67NdPajmZlD1qrqQyov/Xfoab37lj0=
tags.cncf.io/container-device-interface v0.8.1/go.mod h1:Apb7N4VdILW0EVdEMRYXIDVRZfNJZ+kmEUss2kRRQ6Y=
tags.cncf.io/container-device-interface/specs-go v0.8.0 h1:QYGFzGxvYK/ZLMrjhvY0RjpUavIn4KcmRmVP/JjdBTA=
tags.cncf.io/container-device-interface/specs-go v0.8.0/go.mod h1:BhJIkjjPh4qpys+qm4DAYtUyryaTDg9zris+AczXyws=
tags.cncf.io/container-device-interface v1.0.0 h1:fbwPQiWZNpXUb9Os6t6JW52rsOppTFUbeJOpNtN1TmI=
tags.cncf.io/container-device-interface v1.0.0/go.mod h1:mmi2aRGmOjK/6NR3TXjLpEIarOJ9qwgZjQ3nTIRwAaA=
tags.cncf.io/container-device-interface/specs-go v1.0.0 h1:8gLw29hH1ZQP9K1YtAzpvkHCjjyIxHZYzBAvlQ+0vD8=
tags.cncf.io/container-device-interface/specs-go v1.0.0/go.mod h1:u86hoFWqnh3hWz3esofRFKbI261bUlvUfLKGrDhJkgQ=

View File

@@ -19,12 +19,26 @@
package client
import (
"context"
"strings"
"sync"
"testing"
"github.com/containerd/containerd/api/services/tasks/v1"
"github.com/containerd/containerd/api/types/runc/options"
. "github.com/containerd/containerd/v2/client"
"github.com/containerd/containerd/v2/integration/images"
"github.com/containerd/containerd/v2/pkg/deprecation"
"github.com/containerd/containerd/v2/pkg/oci"
"github.com/containerd/containerd/v2/pkg/protobuf"
"github.com/containerd/containerd/v2/plugins"
"github.com/containerd/errdefs"
"github.com/containerd/errdefs/pkg/errgrpc"
"github.com/containerd/platforms"
"github.com/containerd/typeurl/v2"
"github.com/google/go-cmp/cmp"
"github.com/stretchr/testify/require"
"google.golang.org/grpc"
)
var (
@@ -63,3 +77,118 @@ func TestImagePullSchema1WithEmptyLayers(t *testing.T) {
t.Fatal(err)
}
}
// TestNewTaskWithRuntimeOption verifies that runtime options passed via
// WithRuntime at container-creation time are merged with per-task NewTaskOpts
// (WithUIDOwner, WithGIDOwner, WithShimCgroup) before the CreateTaskRequest is
// sent to the tasks service. A fakeTaskService captures each request so the
// marshalled options can be inspected without spawning a real shim.
func TestNewTaskWithRuntimeOption(t *testing.T) {
	t.Parallel()

	// Fake task service that records every CreateTaskRequest keyed by
	// container ID; Create never actually starts a task.
	fakeTasks := &fakeTaskService{
		TasksClient:    tasks.NewTasksClient(nil),
		createRequests: map[string]*tasks.CreateTaskRequest{},
	}

	// Route the client's task service through the fake.
	cli, err := newClient(t, address,
		WithServices(WithTaskClient(fakeTasks)),
	)
	require.NoError(t, err)
	defer cli.Close()

	var (
		image       Image
		ctx, cancel = testContext(t)
	)
	defer cancel()

	image, err = cli.GetImage(ctx, testImage)
	require.NoError(t, err)

	for _, tc := range []struct {
		name            string
		runtimeOption   *options.Options
		taskOpts        []NewTaskOpts
		expectedOptions *options.Options
	}{
		{
			// No task opts: the request should carry no TaskOptions at all.
			name: "should be empty options",
			runtimeOption: &options.Options{
				BinaryName: "no-runc",
			},
			expectedOptions: nil,
		},
		{
			// Task opts must override the matching fields from the
			// container's runtime options while preserving the rest.
			name: "should overwrite IOUid/ShimCgroup",
			runtimeOption: &options.Options{
				BinaryName:    "no-runc",
				ShimCgroup:    "/abc",
				IoUid:         1000,
				SystemdCgroup: true,
			},
			taskOpts: []NewTaskOpts{
				WithUIDOwner(2000),
				WithGIDOwner(3000),
				WithShimCgroup("/def"),
			},
			expectedOptions: &options.Options{
				BinaryName:    "no-runc",
				ShimCgroup:    "/def",
				IoUid:         2000,
				IoGid:         3000,
				SystemdCgroup: true,
			},
		},
	} {
		t.Run(tc.name, func(t *testing.T) {
			// Subtest names contain "/", which is not valid in container IDs.
			id := strings.Replace(t.Name(), "/", "_", -1)

			container, err := cli.NewContainer(
				ctx,
				id,
				WithNewSnapshotView(id, image),
				WithNewSpec(oci.WithImageConfig(image), withExitStatus(7)),
				WithRuntime(plugins.RuntimeRuncV2, tc.runtimeOption),
			)
			require.NoError(t, err)
			defer container.Delete(ctx, WithSnapshotCleanup)

			_, err = container.NewTask(ctx, empty(), tc.taskOpts...)
			require.NoError(t, err)

			// Fetch the request the fake recorded for this container.
			fakeTasks.Lock()
			req := fakeTasks.createRequests[id]
			fakeTasks.Unlock()

			if tc.expectedOptions == nil {
				require.Nil(t, req.Options)
				return
			}

			gotOptions := &options.Options{}
			require.NoError(t, typeurl.UnmarshalTo(req.Options, gotOptions))
			require.True(t, cmp.Equal(tc.expectedOptions, gotOptions, protobuf.Compare))
		})
	}
}
// fakeTaskService is a tasks.TasksClient stub that records CreateTaskRequests
// so tests can inspect exactly what the client sent to the task service.
// The embedded mutex guards createRequests against concurrent subtests.
type fakeTaskService struct {
	sync.Mutex
	// createRequests maps container ID to the captured create request.
	createRequests map[string]*tasks.CreateTaskRequest
	tasks.TasksClient
}
// Create records the incoming request keyed by container ID and reports a
// fixed PID of 1 so the caller treats task creation as successful.
func (ts *fakeTaskService) Create(ctx context.Context, in *tasks.CreateTaskRequest, opts ...grpc.CallOption) (*tasks.CreateTaskResponse, error) {
	ts.Lock()
	ts.createRequests[in.ContainerID] = in
	ts.Unlock()

	resp := &tasks.CreateTaskResponse{
		ContainerID: in.ContainerID,
		Pid:         1,
	}
	return resp, nil
}
// Get always reports not-found, so the client never believes a task already
// exists for a container before Create is called.
func (ts *fakeTaskService) Get(ctx context.Context, in *tasks.GetRequest, opts ...grpc.CallOption) (*tasks.GetResponse, error) {
	return nil, errgrpc.ToGRPC(errdefs.ErrNotFound)
}
// Delete always reports not-found; there is no real task to tear down, and
// the not-found error lets container cleanup proceed without failing tests.
func (ts *fakeTaskService) Delete(ctx context.Context, in *tasks.DeleteTaskRequest, opts ...grpc.CallOption) (*tasks.DeleteResponse, error) {
	return nil, errgrpc.ToGRPC(errdefs.ErrNotFound)
}

View File

@@ -1090,6 +1090,19 @@ func TestContainerRuntimeOptionsv2(t *testing.T) {
if !strings.Contains(err.Error(), `"no-runc"`) {
t.Errorf("task creation should have failed because of lack of executable. Instead failed with: %v", err.Error())
}
// It doesn't matter what the NewTaskOpts function is. We are using an existing function in the client package,
// which will cause the TaskOptions in the new task request to be non-empty.
// https://github.com/containerd/containerd/issues/11568
task, err = container.NewTask(ctx, empty(), WithNoNewKeyring)
if err == nil {
t.Errorf("task creation should have failed")
task.Delete(ctx)
return
}
if !strings.Contains(err.Error(), `"no-runc"`) {
t.Errorf("task creation should have failed because of lack of executable. Instead failed with: %v", err.Error())
}
}
func TestContainerKillInitPidHost(t *testing.T) {

View File

@@ -0,0 +1,161 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package integration
import (
"fmt"
"os"
"path/filepath"
"runtime"
"testing"
"time"
"github.com/containerd/containerd/v2/integration/images"
"github.com/opencontainers/selinux/go-selinux"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
criruntime "k8s.io/cri-api/pkg/apis/runtime/v1"
)
// TestImageMount mounts the pause image as a read-only OCI image volume inside
// an alpine container and verifies that listing the mount path surfaces the
// image contents ("pause") in the container log. Linux only.
func TestImageMount(t *testing.T) {
	if runtime.GOOS != "linux" {
		t.Skip("Only running on linux")
	}

	var (
		testImage      = images.Get(images.Alpine)
		testMountImage = images.Get(images.Pause)
		mountPath      = "/image-mount"
	)
	EnsureImageExists(t, testMountImage)
	EnsureImageExists(t, testImage)

	cmd := []string{"ls", mountPath}
	want := []string{
		fmt.Sprintf("%s %s %s", criruntime.Stdout, criruntime.LogTagFull, "pause"),
	}
	testImageMount(t, testImage, testMountImage, mountPath, cmd, want)
}
// TestImageMountSELinux verifies that an OCI image volume mount is relabeled
// with the pod's SELinux level: files under the mount must carry the sandbox's
// category set. Requires Linux with SELinux enabled; skipped otherwise.
func TestImageMountSELinux(t *testing.T) {
	if runtime.GOOS != "linux" {
		t.Skip("Only running on linux")
	}
	if !selinux.GetEnabled() {
		t.Skip("SELinux is not enabled")
	}

	var (
		testImage      = images.Get(images.ResourceConsumer)
		testMountImage = images.Get(images.Pause)
		mountPath      = "/image-mount"
	)
	EnsureImageExists(t, testMountImage)
	EnsureImageExists(t, testImage)

	// Exercise two different level specs; note categories are normalized into
	// sorted order (c100,c200) regardless of the order given (c200,c100).
	testImageMountSELinux(t, testImage, testMountImage, mountPath, "s0:c4,c5", "system_u:object_r:container_file_t:s0:c4,c5 pause")
	testImageMountSELinux(t, testImage, testMountImage, mountPath, "s0:c200,c100", "system_u:object_r:container_file_t:s0:c100,c200 pause")
}
// testImageMountSELinux runs a container with an image volume mount inside a
// sandbox pinned to the given SELinux level, executes `ls -Z` on the mount
// path, and asserts the log shows the expected SELinux label line `want`.
func testImageMountSELinux(t *testing.T, testImage, testMountImage, mountPath string, level string, want string) {
	var (
		containerName = "test-image-mount-container"
	)
	testPodLogDir := t.TempDir()

	// Sandbox with host networking, the requested SELinux level, and a pod
	// log directory we can read back from the host.
	sb, sbConfig := PodSandboxConfigWithCleanup(t, "sandbox",
		"image-mount",
		WithHostNetwork,
		WithSelinuxLevel(level),
		WithPodLogDirectory(testPodLogDir),
	)

	// `ls -Z` prints the SELinux context of each entry under the mount.
	containerConfig := ContainerConfig(
		containerName,
		testImage,
		WithCommand("ls", "-Z", mountPath),
		WithImageVolumeMount(testMountImage, mountPath),
		WithLogPath(containerName),
	)

	cn, err := runtimeService.CreateContainer(sb, containerConfig, sbConfig)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, runtimeService.RemoveContainer(cn))
	}()
	require.NoError(t, runtimeService.StartContainer(cn))

	// Wait (up to 30s) for the one-shot container to exit before reading its log.
	require.NoError(t, Eventually(func() (bool, error) {
		s, err := runtimeService.ContainerStatus(cn)
		if err != nil {
			return false, err
		}
		if s.GetState() == criruntime.ContainerState_CONTAINER_EXITED {
			return true, nil
		}
		return false, nil
	}, time.Second, 30*time.Second))

	content, err := os.ReadFile(filepath.Join(testPodLogDir, containerName))
	assert.NoError(t, err)

	checkContainerLog(t, string(content), []string{
		fmt.Sprintf("%s %s %s", criruntime.Stdout, criruntime.LogTagFull, want),
	})
}
// testImageMount runs a container with testMountImage mounted as an image
// volume at mountPath, executes cmd, waits for the container to exit, and
// asserts the container log contains the `want` lines.
func testImageMount(t *testing.T, testImage, testMountImage, mountPath string, cmd, want []string) {
	var (
		containerName = "test-image-mount-container"
	)
	testPodLogDir := t.TempDir()

	// Sandbox with host networking and a pod log directory readable from the host.
	sb, sbConfig := PodSandboxConfigWithCleanup(t, "sandbox",
		"image-mount",
		WithHostNetwork,
		WithPodLogDirectory(testPodLogDir),
	)

	containerConfig := ContainerConfig(
		containerName,
		testImage,
		WithCommand(cmd[0], cmd[1:]...),
		WithImageVolumeMount(testMountImage, mountPath),
		WithLogPath(containerName),
	)

	cn, err := runtimeService.CreateContainer(sb, containerConfig, sbConfig)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, runtimeService.RemoveContainer(cn))
	}()
	require.NoError(t, runtimeService.StartContainer(cn))

	// Wait (up to 30s) for the one-shot container to exit before reading its log.
	require.NoError(t, Eventually(func() (bool, error) {
		s, err := runtimeService.ContainerStatus(cn)
		if err != nil {
			return false, err
		}
		if s.GetState() == criruntime.ContainerState_CONTAINER_EXITED {
			return true, nil
		}
		return false, nil
	}, time.Second, 30*time.Second))

	content, err := os.ReadFile(filepath.Join(testPodLogDir, containerName))
	assert.NoError(t, err)

	checkContainerLog(t, string(content), want)
}

View File

@@ -138,6 +138,22 @@ func WithHostNetwork(p *runtime.PodSandboxConfig) {
p.Linux.SecurityContext.NamespaceOptions.Network = runtime.NamespaceMode_NODE
}
// WithSelinuxLevel returns a PodSandboxOpts that sets the SELinux level on the
// sandbox's Linux security context, allocating any intermediate config structs
// that have not been populated yet.
func WithSelinuxLevel(level string) PodSandboxOpts {
	return func(p *runtime.PodSandboxConfig) {
		linux := p.Linux
		if linux == nil {
			linux = &runtime.LinuxPodSandboxConfig{}
			p.Linux = linux
		}
		sc := linux.SecurityContext
		if sc == nil {
			sc = &runtime.LinuxSandboxSecurityContext{}
			linux.SecurityContext = sc
		}
		seOpts := sc.SelinuxOptions
		if seOpts == nil {
			seOpts = &runtime.SELinuxOption{}
			sc.SelinuxOptions = seOpts
		}
		seOpts.Level = level
	}
}
// Set pod userns.
func WithPodUserNs(containerID, hostID, length uint32) PodSandboxOpts {
return func(p *runtime.PodSandboxConfig) {
@@ -338,6 +354,27 @@ func WithIDMapVolumeMount(hostPath, containerPath string, uidMaps, gidMaps []*ru
}
}
// WithImageVolumeMount adds a read-only OCI image volume mount for the given
// image at containerPath, with no user-namespace ID mappings.
func WithImageVolumeMount(image, containerPath string) ContainerOpts {
	return WithIDMapImageVolumeMount(image, containerPath, nil, nil)
}
// WithIDMapImageVolumeMount adds a read-only, SELinux-relabeled OCI image
// volume mount for image at containerPath, with the supplied UID/GID mappings.
func WithIDMapImageVolumeMount(image string, containerPath string, uidMaps, gidMaps []*runtime.IDMapping) ContainerOpts {
	return func(c *runtime.ContainerConfig) {
		// Error deliberately ignored, matching prior behavior; on failure
		// filepath.Abs yields "" and ContainerPath ends up empty.
		abs, _ := filepath.Abs(containerPath)
		c.Mounts = append(c.Mounts, &runtime.Mount{
			ContainerPath: abs,
			UidMappings:   uidMaps,
			GidMappings:   gidMaps,
			Image: &runtime.ImageSpec{
				Image: image,
			},
			Readonly:       true,
			SelinuxRelabel: true,
		})
	}
}
func WithWindowsUsername(username string) ContainerOpts {
return func(c *runtime.ContainerConfig) {
if c.Windows == nil {

View File

@@ -196,6 +196,23 @@ type Mirror struct {
// with host specified.
// The scheme, host and path from the endpoint URL will be used.
Endpoints []string `toml:"endpoint" json:"endpoint"`
// Rewrites is a map of repository rewrite rules for a namespace. When fetching image
// resources from an endpoint, if a key matches the repository portion of the request
// path (as a regular expression), it is replaced with the corresponding value before
// the resource request is made.
//
// This example configures CRI to pull docker.io/library/* images from docker.io/my-org/*:
//
// [plugins]
// [plugins."io.containerd.grpc.v1.cri"]
// [plugins."io.containerd.grpc.v1.cri".registry]
// [plugins."io.containerd.grpc.v1.cri".registry.mirrors]
// [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
// endpoint = ["https://registry-1.docker.io/v2"]
// [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io".rewrite]
// "^library/(.*)" = "my-org/$1"
//
Rewrites map[string]string `toml:"rewrite" json:"rewrite"`
}
// AuthConfig contains the config related to authentication to a specific registry

View File

@@ -161,18 +161,25 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta
return nil, fmt.Errorf("failed to query sandbox platform: %w", err)
}
ociRuntime, err := c.getPodSandboxRuntime(sandboxID)
if err != nil {
return nil, fmt.Errorf("failed to get sandbox runtime: %w", err)
}
// mutate the extra CRI volume mounts from the runtime spec to properly specify the OCI image volume mount requests as bind mounts for this container
err = c.mutateMounts(ctx, config.GetMounts(), c.RuntimeSnapshotter(ctx, ociRuntime), sandboxID, platform)
if err != nil {
return nil, fmt.Errorf("failed to mount image volume: %w", err)
}
var volumeMounts []*runtime.Mount
if !c.config.IgnoreImageDefinedVolumes {
// Create container image volumes mounts.
// create a list of image volume mounts from the image spec that are not also already in the runtime config volume list
volumeMounts = c.volumeMounts(platform, containerRootDir, config, &image.ImageSpec.Config)
} else if len(image.ImageSpec.Config.Volumes) != 0 {
log.G(ctx).Debugf("Ignoring volumes defined in image %v because IgnoreImageDefinedVolumes is set", image.ID)
}
ociRuntime, err := c.config.GetSandboxRuntime(sandboxConfig, sandbox.Metadata.RuntimeHandler)
if err != nil {
return nil, fmt.Errorf("failed to get sandbox runtime: %w", err)
}
var runtimeHandler *runtime.RuntimeHandler
for _, f := range c.runtimeHandlers {
f := f

View File

@@ -0,0 +1,196 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package server
import (
"context"
"fmt"
"os"
"path/filepath"
containerd "github.com/containerd/containerd/v2/client"
"github.com/containerd/containerd/v2/core/leases"
"github.com/containerd/containerd/v2/core/mount"
"github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/containerd/platforms"
"github.com/opencontainers/image-spec/identity"
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)
// mutateMounts rewrites OCI image volume mounts in extraMounts into bind
// mounts backed by unpacked snapshots of the referenced images. A lease named
// after the sandbox is created (if needed) and attached to ctx so prepared
// snapshots survive garbage collection for the sandbox's lifetime.
func (c *criService) mutateMounts(
	ctx context.Context,
	extraMounts []*runtime.Mount,
	snapshotter string,
	sandboxID string,
	platform imagespec.Platform,
) error {
	if err := c.ensureLeaseExist(ctx, sandboxID); err != nil {
		return fmt.Errorf("failed to ensure lease %v for sandbox: %w", sandboxID, err)
	}
	ctx = leases.WithLease(ctx, sandboxID)

	for _, extra := range extraMounts {
		if err := c.mutateImageMount(ctx, extra, snapshotter, sandboxID, platform); err != nil {
			return err
		}
	}
	return nil
}
// ensureLeaseExist creates a lease named after the sandbox ID. An
// already-existing lease is treated as success, making the call idempotent.
func (c *criService) ensureLeaseExist(ctx context.Context, sandboxID string) error {
	if _, err := c.client.LeasesService().Create(ctx, leases.WithID(sandboxID)); err != nil && !errdefs.IsAlreadyExists(err) {
		return err
	}
	return nil
}
// mutateImageMount resolves the image referenced by an image volume mount,
// unpacks it for the requested platform, mounts the resulting snapshot under
// the sandbox's image volume directory, and rewrites the mount's HostPath to
// point there. Mounts without an image spec are left untouched.
//
// The snapshot is keyed by its target path; if this function fails after
// preparing the snapshot, the snapshot is removed so a retry can re-prepare it.
func (c *criService) mutateImageMount(
	ctx context.Context,
	extraMount *runtime.Mount,
	snapshotter string,
	sandboxID string,
	platform imagespec.Platform,
) (retErr error) {
	imageSpec := extraMount.GetImage()
	if imageSpec == nil {
		// Not an image volume mount; nothing to do.
		return nil
	}
	if extraMount.GetHostPath() != "" {
		return fmt.Errorf("hostpath must be empty while mount image: %+v", extraMount)
	}
	// NOTE: extraMount.GetReadonly() is intentionally NOT validated here so
	// that non-readonly image mounts (e.g. CDI volumes) are accepted. The
	// original empty `if !extraMount.GetReadonly()` branch with a
	// commented-out return was dead code (staticcheck SA9003) and has been
	// replaced by this comment.

	ref := imageSpec.GetImage()
	if ref == "" {
		return fmt.Errorf("image not specified in: %+v", imageSpec)
	}
	image, err := c.LocalResolve(ref)
	if err != nil {
		return fmt.Errorf("failed to resolve image %q: %w", ref, err)
	}
	containerdImage, err := c.toContainerdImage(ctx, image)
	if err != nil {
		return fmt.Errorf("failed to get image from containerd %q: %w", image.ID, err)
	}

	// This is a digest of the manifest
	imageID := containerdImage.Target().Digest.Encoded()
	target := c.getImageVolumeHostPath(sandboxID, imageID)

	// Already mounted in another container on the same pod
	if stat, err := os.Stat(target); err == nil && stat.IsDir() {
		extraMount.HostPath = target
		return nil
	}

	img, err := c.client.ImageService().Get(ctx, ref)
	if err != nil {
		return fmt.Errorf("failed to get image volume ref %q: %w", ref, err)
	}

	i := containerd.NewImageWithPlatform(c.client, img, platforms.Only(platform))
	if err := i.Unpack(ctx, snapshotter); err != nil {
		return fmt.Errorf("failed to unpack image volume: %w", err)
	}

	diffIDs, err := i.RootFS(ctx)
	if err != nil {
		return fmt.Errorf("failed to get diff IDs for image volume %q: %w", ref, err)
	}
	chainID := identity.ChainID(diffIDs).String()

	s := c.client.SnapshotService(snapshotter)
	mounts, err := s.Prepare(ctx, target, chainID)
	if err != nil {
		return fmt.Errorf("failed to prepare for image volume %q: %w", ref, err)
	}
	defer func() {
		// Undo the prepared snapshot if anything below fails so a later
		// attempt can prepare it again under the same key.
		if retErr != nil {
			_ = s.Remove(ctx, target)
		}
	}()

	err = os.MkdirAll(target, 0755)
	if err != nil {
		return fmt.Errorf("failed to create directory to image volume target path %q: %w", target, err)
	}

	if err := mount.All(mounts, target); err != nil {
		return fmt.Errorf("failed to mount image volume component %q: %w", target, err)
	}

	extraMount.HostPath = target
	return nil
}
// cleanupImageMounts unmounts and removes every image volume snapshot created
// for the given sandbox, then removes the sandbox's image volume directory.
// A missing directory is treated as success so the call is idempotent.
//
// Fixes relative to the previous version: error messages read "failed to
// remove ..." (was "failed to removing ..."), and not-exist errors from
// os.Remove are detected with os.IsNotExist — errdefs.IsNotFound never matches
// filesystem errors, so the old checks could surface spurious failures.
func (c *criService) cleanupImageMounts(
	ctx context.Context,
	sandboxID string,
) (retErr error) {
	// Some checks to avoid affecting old pods.
	ociRuntime, err := c.getPodSandboxRuntime(sandboxID)
	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to get sandbox runtime handler %q", sandboxID)
		return nil
	}
	snapshotter := c.RuntimeSnapshotter(ctx, ociRuntime)
	s := c.client.SnapshotService(snapshotter)
	if s == nil {
		return nil
	}

	targetBase := c.getImageVolumeBaseDir(sandboxID)
	entries, err := os.ReadDir(targetBase)
	if err != nil {
		if os.IsNotExist(err) {
			// No image volumes were ever mounted for this sandbox.
			return nil
		}
		return fmt.Errorf("failed to read directory: %w", err)
	}

	for _, entry := range entries {
		target := filepath.Join(targetBase, entry.Name())

		err = mount.UnmountAll(target, 0)
		if err != nil {
			return fmt.Errorf("failed to unmount image volume component %q: %w", target, err)
		}
		err = s.Remove(ctx, target)
		if err != nil && !errdefs.IsNotFound(err) {
			return fmt.Errorf("failed to remove snapshot: %w", err)
		}
		// os.Remove returns filesystem errors, so check with os.IsNotExist
		// rather than errdefs.IsNotFound.
		err = os.Remove(target)
		if err != nil && !os.IsNotExist(err) {
			return fmt.Errorf("failed to remove mounts directory: %w", err)
		}
	}

	err = os.Remove(targetBase)
	if err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("failed to remove directory to cleanup image volume mounts: %w", err)
	}
	return nil
}

View File

@@ -33,6 +33,7 @@ import (
containerd "github.com/containerd/containerd/v2/client"
"github.com/containerd/containerd/v2/core/containers"
criconfig "github.com/containerd/containerd/v2/internal/cri/config"
containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
imagestore "github.com/containerd/containerd/v2/internal/cri/store/image"
"github.com/containerd/errdefs"
@@ -61,6 +62,8 @@ const (
sandboxesDir = "sandboxes"
// containersDir contains all container root.
containersDir = "containers"
// imageVolumeDir contains all image volume root.
imageVolumeDir = "image-volumes"
// Delimiter used to construct container/sandbox names.
nameDelimiter = "_"
@@ -139,6 +142,16 @@ func (c *criService) getContainerRootDir(id string) string {
return filepath.Join(c.config.RootDir, containersDir, id)
}
// getImageVolumeHostPath returns the per-pod, per-image directory under the
// CRI state dir at which an image volume snapshot is mounted; containers of
// the same pod share this path for the same image.
func (c *criService) getImageVolumeHostPath(podID, imageID string) string {
	return filepath.Join(c.config.StateDir, imageVolumeDir, podID, imageID)
}
// getImageVolumeBaseDir returns the per-pod parent directory that holds all of
// the pod's image volume mount points; used when cleaning them up on sandbox stop.
func (c *criService) getImageVolumeBaseDir(podID string) string {
	return filepath.Join(c.config.StateDir, imageVolumeDir, podID)
}
// getVolatileContainerRootDir returns the root directory for managing volatile container files,
// e.g. named pipes.
func (c *criService) getVolatileContainerRootDir(id string) string {
@@ -356,6 +369,18 @@ func (c *criService) generateAndSendContainerEvent(ctx context.Context, containe
c.containerEventsQ.Send(event)
}
// getPodSandboxRuntime looks up the sandbox by ID in the sandbox store and
// resolves the runtime config for its configured runtime handler.
func (c *criService) getPodSandboxRuntime(sandboxID string) (runtime criconfig.Runtime, err error) {
	sb, err := c.sandboxStore.Get(sandboxID)
	if err != nil {
		return criconfig.Runtime{}, err
	}
	if runtime, err = c.config.GetSandboxRuntime(sb.Config, sb.Metadata.RuntimeHandler); err != nil {
		return criconfig.Runtime{}, err
	}
	return runtime, nil
}
func (c *criService) getPodSandboxStatus(ctx context.Context, podSandboxID string) (*runtime.PodSandboxStatus, error) {
request := &runtime.PodSandboxStatusRequest{PodSandboxId: podSandboxID}
response, err := c.PodSandboxStatus(ctx, request)

View File

@@ -437,6 +437,10 @@ func (c *CRIImageService) registryHosts(ctx context.Context, credentials func(ho
return func(host string) ([]docker.RegistryHost, error) {
var registries []docker.RegistryHost
rewrites, err := c.registryRewrites(host)
if err != nil {
return nil, fmt.Errorf("get registry rewrites: %w", err)
}
endpoints, err := c.registryEndpoints(host)
if err != nil {
return nil, fmt.Errorf("get registry endpoints: %w", err)
@@ -492,6 +496,7 @@ func (c *CRIImageService) registryHosts(ctx context.Context, credentials func(ho
Scheme: u.Scheme,
Path: u.Path,
Capabilities: docker.HostCapabilityResolve | docker.HostCapabilityPull,
Rewrites: rewrites,
})
}
return registries, nil
@@ -564,6 +569,16 @@ func (c *CRIImageService) registryEndpoints(host string) ([]string, error) {
return append(endpoints, defaultScheme(defaultHost)+"://"+defaultHost), nil
}
// registryRewrites returns the repository rewrite rules configured for the
// given registry host. The host-specific mirror entry takes precedence over
// the wildcard "*" entry; (nil, nil) is returned when neither is configured.
//
// Renamed the loop and if-scoped variables: the original shadowed `host`
// three times (parameter, range variable, and the mirror value), which was
// confusing and lint-hostile. Behavior is unchanged.
func (c *CRIImageService) registryRewrites(host string) (map[string]string, error) {
	for _, key := range []string{host, "*"} {
		if mirror, ok := c.config.Registry.Mirrors[key]; ok {
			return mirror.Rewrites, nil
		}
	}
	return nil, nil
}
// newTransport returns a new HTTP transport used to pull image.
// TODO(random-liu): Create a library and share this code with `ctr`.
func newTransport() *http.Transport {

View File

@@ -43,7 +43,7 @@ type podSandboxEventHandler struct {
func (p *podSandboxEventHandler) HandleEvent(any interface{}) error {
switch e := any.(type) {
case *eventtypes.TaskExit:
log.L.Infof("TaskExit event in podsandbox handler %+v", e)
log.L.Debugf("TaskExit event in podsandbox handler %+v", e)
// Use ID instead of ContainerID to rule out TaskExit event for exec.
sb := p.controller.store.Get(e.ID)
if sb == nil {

View File

@@ -18,6 +18,7 @@ package server
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
@@ -107,6 +108,14 @@ func (c *criService) recover(ctx context.Context) error {
metadata := sandboxstore.Metadata{}
err := sbx.GetExtension(podsandbox.MetadataKey, &metadata)
if err != nil {
if errors.Is(err, errdefs.ErrNotFound) {
err = c.client.SandboxStore().Delete(ctx, sbx.ID)
if err != nil {
return fmt.Errorf("failed to delete sandbox in response to missing metadata for sandbox %q: %w", sbx.ID, err)
}
// TODO: cleanup network namespace
continue
}
return fmt.Errorf("failed to get metadata for stored sandbox %q: %w", sbx.ID, err)
}

View File

@@ -21,6 +21,7 @@ import (
"fmt"
"time"
"github.com/containerd/containerd/v2/core/leases"
"github.com/containerd/containerd/v2/pkg/tracing"
"github.com/containerd/errdefs"
"github.com/containerd/log"
@@ -59,6 +60,12 @@ func (c *criService) RemovePodSandbox(ctx context.Context, r *runtime.RemovePodS
return nil, fmt.Errorf("failed to forcibly stop sandbox %q: %w", id, err)
}
if err := c.client.LeasesService().Delete(ctx, leases.Lease{ID: id}); err != nil {
if !errdefs.IsNotFound(err) {
return nil, fmt.Errorf("failed to delete lease for sandbox %q: %w", id, err)
}
}
// Return error if sandbox network namespace is not closed yet.
if sandbox.NetNS != nil {
nsPath := sandbox.NetNS.GetPath()

View File

@@ -31,6 +31,7 @@ import (
"github.com/containerd/typeurl/v2"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/v2/core/leases"
sb "github.com/containerd/containerd/v2/core/sandbox"
"github.com/containerd/containerd/v2/internal/cri/annotations"
"github.com/containerd/containerd/v2/internal/cri/bandwidth"
@@ -87,6 +88,22 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
}
}()
leaseSvc := c.client.LeasesService()
ls, lerr := leaseSvc.Create(ctx, leases.WithID(id))
if lerr != nil {
return nil, fmt.Errorf("failed to create lease for sandbox name %q: %w", name, lerr)
}
defer func() {
if retErr != nil {
deferCtx, deferCancel := util.DeferContext()
defer deferCancel()
if derr := leaseSvc.Delete(deferCtx, ls); derr != nil {
log.G(deferCtx).WithError(derr).Error("failed to delete lease during cleanup")
}
}
}()
var (
err error
sandboxInfo = sb.Sandbox{ID: id}

View File

@@ -130,6 +130,11 @@ func (c *criService) stopPodSandbox(ctx context.Context, sandbox sandboxstore.Sa
}
log.G(ctx).Infof("TearDown network for sandbox %q successfully", id)
err = c.cleanupImageMounts(ctx, id)
if err != nil {
return fmt.Errorf("failed to cleanup image mounts for sandbox %q: %w", id, err)
}
return nil
}

View File

@@ -595,13 +595,13 @@ func WithUser(userstr string) SpecOpts {
setProcess(s)
s.Process.User.AdditionalGids = nil
// While the Linux kernel allows the max UID to be MaxUint32 - 2,
// and the OCI Runtime Spec has no definition about the max UID,
// the runc implementation is known to require the UID to be <= MaxInt32.
//
// containerd follows runc's limitation here.
//
// In future we may relax this limitation to allow MaxUint32 - 2,
// or, amend the OCI Runtime Spec to codify the implementation limitation.
const (
minUserID = 0
maxUserID = math.MaxInt32

View File

@@ -105,7 +105,6 @@ guest:x:100:guest
},
}
for _, testCase := range testCases {
testCase := testCase
t.Run(testCase.user, func(t *testing.T) {
t.Parallel()
s := Spec{

View File

@@ -73,13 +73,12 @@ func init() {
options := &images.CRIImageServiceOptions{
Content: mdb.ContentStore(),
Images: metadata.NewImageStore(mdb),
RuntimePlatforms: map[string]images.ImagePlatform{},
Snapshotters: map[string]snapshots.Snapshotter{},
ImageFSPaths: map[string]string{},
}
options.Client, err = containerd.New(
ctrdCli, err := containerd.New(
"",
containerd.WithDefaultNamespace(constants.K8sContainerdNamespace),
containerd.WithDefaultPlatform(platforms.Default()),
@@ -88,6 +87,8 @@ func init() {
if err != nil {
return nil, fmt.Errorf("unable to init client for cri image service: %w", err)
}
options.Images = ctrdCli.ImageService()
options.Client = ctrdCli
allSnapshotters := mdb.Snapshotters()
defaultSnapshotter := config.Snapshotter

View File

@@ -28,6 +28,7 @@ func defaultUnpackConfig() []unpackConfiguration {
{
Platform: platforms.Format(platforms.DefaultSpec()),
Snapshotter: defaults.DefaultSnapshotter,
Differ: defaults.DefaultDiffer,
},
}
}

27
releases/v2.0.5.toml Normal file
View File

@@ -0,0 +1,27 @@
# commit to be tagged for new release
commit = "HEAD"
project_name = "containerd"
github_repo = "containerd/containerd"
match_deps = "^github.com/(containerd/[a-zA-Z0-9-]+)$"
ignore_deps = [ "github.com/containerd/containerd" ]
# previous release
previous = "v2.0.4"
pre_release = false
preface = """\
The fifth patch release for containerd 2.0 includes various bug fixes and updates.
"""
postface = """\
### Which file should I download?
* `containerd-<VERSION>-<OS>-<ARCH>.tar.gz`: ✅Recommended. Dynamically linked with glibc 2.31 (Ubuntu 20.04).
* `containerd-static-<VERSION>-<OS>-<ARCH>.tar.gz`: Statically linked. Expected to be used on non-glibc Linux distributions. Not position-independent.
In addition to containerd, typically you will have to install [runc](https://github.com/opencontainers/runc/releases)
and [CNI plugins](https://github.com/containernetworking/plugins/releases) from their official sites too.
See also the [Getting Started](https://github.com/containerd/containerd/blob/main/docs/getting-started.md) documentation.
"""

View File

@@ -70,6 +70,11 @@ if [ ! -z "$CGROUP_DRIVER" ] && [ "$CGROUP_DRIVER" = "systemd" ];then
EOF
fi
GINKGO_SKIP_TEST=()
if [ ! -z "$SKIP_TEST" ]; then
GINKGO_SKIP_TEST+=("--ginkgo.skip" "$SKIP_TEST")
fi
ls /etc/cni/net.d
/usr/local/bin/containerd \
@@ -85,4 +90,4 @@ do
crictl --runtime-endpoint ${BDIR}/c.sock info && break || sleep 1
done
critest --report-dir "$report_dir" --runtime-endpoint=unix:///${BDIR}/c.sock --parallel=8 "${EXTRA_CRITEST_OPTIONS:-""}"
critest --report-dir "$report_dir" --runtime-endpoint=unix:///${BDIR}/c.sock --parallel=8 "${GINKGO_SKIP_TEST[@]}" "${EXTRA_CRITEST_OPTIONS:-""}"

View File

@@ -5,7 +5,7 @@
# lived test environment.
Set-MpPreference -DisableRealtimeMonitoring:$true
$PACKAGES= @{ mingw = "10.2.0"; git = ""; golang = "1.23.7"; make = ""; nssm = "" }
$PACKAGES= @{ mingw = "10.2.0"; git = ""; golang = "1.23.8"; make = ""; nssm = "" }
Write-Host "Downloading chocolatey package"
curl.exe -L "https://packages.chocolatey.org/chocolatey.0.10.15.nupkg" -o 'c:\choco.zip'

View File

@@ -1 +1 @@
v1.2.5
v1.2.6

View File

@@ -1,14 +1,14 @@
linters:
enable:
- staticcheck
- unconvert
- gofmt
- goimports
- govet
- ineffassign
- revive
- vet
- unused
- misspell
- revive
- staticcheck
- unconvert
- unused
disable:
- errcheck

View File

@@ -46,6 +46,7 @@ generate:
lint:
@echo "+ $@"
@golangci-lint run
@(cd cmd/continuity && golangci-lint --config=../../.golangci.yml run)
build:
@echo "+ $@"

View File

@@ -19,10 +19,36 @@
package fs
import (
"fmt"
"io/fs"
"syscall"
"time"
)
// Atime returns the access time recorded in st. st must wrap a
// *syscall.Stat_t (i.e. come from os.Stat/Lstat on this platform);
// any other Sys() type yields an error.
func Atime(st fs.FileInfo) (time.Time, error) {
	stSys, ok := st.Sys().(*syscall.Stat_t)
	if !ok {
		return time.Time{}, fmt.Errorf("expected st.Sys() to be *syscall.Stat_t, got %T", st.Sys())
	}
	// Atimespec is the Darwin/BSD field name; Unix() expands to (sec, nsec).
	return time.Unix(stSys.Atimespec.Unix()), nil
}
// Ctime returns the inode-change time recorded in st. st must wrap a
// *syscall.Stat_t (i.e. come from os.Stat/Lstat on this platform);
// any other Sys() type yields an error.
func Ctime(st fs.FileInfo) (time.Time, error) {
	stSys, ok := st.Sys().(*syscall.Stat_t)
	if !ok {
		return time.Time{}, fmt.Errorf("expected st.Sys() to be *syscall.Stat_t, got %T", st.Sys())
	}
	// Ctimespec is the Darwin/BSD field name; Unix() expands to (sec, nsec).
	return time.Unix(stSys.Ctimespec.Unix()), nil
}
// Mtime returns the modification time recorded in st. st must wrap a
// *syscall.Stat_t (i.e. come from os.Stat/Lstat on this platform);
// any other Sys() type yields an error.
func Mtime(st fs.FileInfo) (time.Time, error) {
	stSys, ok := st.Sys().(*syscall.Stat_t)
	if !ok {
		return time.Time{}, fmt.Errorf("expected st.Sys() to be *syscall.Stat_t, got %T", st.Sys())
	}
	// Mtimespec is the Darwin/BSD field name; Unix() expands to (sec, nsec).
	return time.Unix(stSys.Mtimespec.Unix()), nil
}
// StatAtime returns the access time from a stat struct
func StatAtime(st *syscall.Stat_t) syscall.Timespec {
return st.Atimespec

View File

@@ -30,7 +30,7 @@ func Atime(st fs.FileInfo) (time.Time, error) {
if !ok {
return time.Time{}, fmt.Errorf("expected st.Sys() to be *syscall.Stat_t, got %T", st.Sys())
}
return StatATimeAsTime(stSys), nil
return time.Unix(stSys.Atim.Unix()), nil
}
func Ctime(st fs.FileInfo) (time.Time, error) {
@@ -38,7 +38,7 @@ func Ctime(st fs.FileInfo) (time.Time, error) {
if !ok {
return time.Time{}, fmt.Errorf("expected st.Sys() to be *syscall.Stat_t, got %T", st.Sys())
}
return time.Unix(stSys.Atim.Unix()), nil
return time.Unix(stSys.Ctim.Unix()), nil
}
func Mtime(st fs.FileInfo) (time.Time, error) {

View File

@@ -56,8 +56,8 @@ func Unmarshal(p []byte) (*Manifest, error) {
func Marshal(m *Manifest) ([]byte, error) {
var bm pb.Manifest
for _, resource := range m.Resources {
bm.Resource = append(bm.Resource, toProto(resource))
for _, rsrc := range m.Resources {
bm.Resource = append(bm.Resource, toProto(rsrc))
}
return proto.Marshal(&bm)
@@ -65,8 +65,8 @@ func Marshal(m *Manifest) ([]byte, error) {
func MarshalText(w io.Writer, m *Manifest) error {
var bm pb.Manifest
for _, resource := range m.Resources {
bm.Resource = append(bm.Resource, toProto(resource))
for _, rsrc := range m.Resources {
bm.Resource = append(bm.Resource, toProto(rsrc))
}
b, err := prototext.Marshal(&bm)
@@ -78,11 +78,11 @@ func MarshalText(w io.Writer, m *Manifest) error {
}
// BuildManifest creates the manifest for the given context
func BuildManifest(ctx Context) (*Manifest, error) {
func BuildManifest(fsContext Context) (*Manifest, error) {
resourcesByPath := map[string]Resource{}
hardLinks := newHardlinkManager()
if err := ctx.Walk(func(p string, fi os.FileInfo, err error) error {
if err := fsContext.Walk(func(p string, fi os.FileInfo, err error) error {
if err != nil {
return fmt.Errorf("error walking %s: %w", p, err)
}
@@ -92,7 +92,7 @@ func BuildManifest(ctx Context) (*Manifest, error) {
return nil
}
resource, err := ctx.Resource(p, fi)
rsrc, err := fsContext.Resource(p, fi)
if err != nil {
if err == ErrNotFound {
return nil
@@ -101,7 +101,7 @@ func BuildManifest(ctx Context) (*Manifest, error) {
}
// add to the hardlink manager
if err := hardLinks.Add(fi, resource); err == nil {
if err := hardLinks.Add(fi, rsrc); err == nil {
// Resource has been accepted by hardlink manager so we don't add
// it to the resourcesByPath until we merge at the end.
return nil
@@ -110,7 +110,7 @@ func BuildManifest(ctx Context) (*Manifest, error) {
return fmt.Errorf("adding hardlink %s: %w", p, err)
}
resourcesByPath[p] = resource
resourcesByPath[p] = rsrc
return nil
}); err != nil {
@@ -123,13 +123,13 @@ func BuildManifest(ctx Context) (*Manifest, error) {
return nil, err
}
for _, resource := range hardLinked {
resourcesByPath[resource.Path()] = resource
for _, rsrc := range hardLinked {
resourcesByPath[rsrc.Path()] = rsrc
}
var resources []Resource
for _, resource := range resourcesByPath {
resources = append(resources, resource)
for _, rsrc := range resourcesByPath {
resources = append(resources, rsrc)
}
sort.Stable(ByPath(resources))
@@ -141,9 +141,9 @@ func BuildManifest(ctx Context) (*Manifest, error) {
// VerifyManifest verifies all the resources in a manifest
// against files from the given context.
func VerifyManifest(ctx Context, manifest *Manifest) error {
for _, resource := range manifest.Resources {
if err := ctx.Verify(resource); err != nil {
func VerifyManifest(fsContext Context, manifest *Manifest) error {
for _, rsrc := range manifest.Resources {
if err := fsContext.Verify(rsrc); err != nil {
return err
}
}
@@ -153,9 +153,9 @@ func VerifyManifest(ctx Context, manifest *Manifest) error {
// ApplyManifest applies on the resources in a manifest to
// the given context.
func ApplyManifest(ctx Context, manifest *Manifest) error {
for _, resource := range manifest.Resources {
if err := ctx.Apply(resource); err != nil {
func ApplyManifest(fsContext Context, manifest *Manifest) error {
for _, rsrc := range manifest.Resources {
if err := fsContext.Apply(rsrc); err != nil {
return err
}
}

View File

@@ -0,0 +1,2 @@
*.test
bin

View File

@@ -0,0 +1,3 @@
*.test
bin
/_output

View File

@@ -0,0 +1,65 @@
# Copyright The containerd Authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FUSEOVERLAYFS_COMMIT=main
ARG ROOTLESSKIT_COMMIT=v1.1.0
ARG GO_VERSION=1.22
ARG DEBIAN_VERSION=11
ARG ALPINE_VERSION=3.18
FROM golang:${GO_VERSION}-alpine AS containerd-fuse-overlayfs-test
COPY . /go/src/github.com/containerd/fuse-overlayfs-snapshotter
WORKDIR /go/src/github.com/containerd/fuse-overlayfs-snapshotter
ENV CGO_ENABLED=0
ENV GO111MODULE=on
RUN go build ./...
RUN mkdir /out && go test -c -o /out/containerd-fuse-overlayfs.test
# from https://github.com/containers/fuse-overlayfs/blob/53c17dab78b43de1cd121bf9260b20b76371bbaf/Dockerfile.static.ubuntu
FROM debian:${DEBIAN_VERSION} AS fuse-overlayfs
RUN apt-get update && \
apt-get install --no-install-recommends -y \
git ca-certificates libc6-dev gcc g++ make automake autoconf clang pkgconf libfuse3-dev
RUN git clone https://github.com/containers/fuse-overlayfs
WORKDIR fuse-overlayfs
ARG FUSEOVERLAYFS_COMMIT
RUN git pull && git checkout ${FUSEOVERLAYFS_COMMIT}
RUN ./autogen.sh && \
LIBS="-ldl" LDFLAGS="-static" ./configure && \
make && mkdir /out && cp fuse-overlayfs /out
FROM golang:${GO_VERSION}-alpine AS rootlesskit
RUN apk add --no-cache git
RUN git clone https://github.com/rootless-containers/rootlesskit.git /go/src/github.com/rootless-containers/rootlesskit
WORKDIR /go/src/github.com/rootless-containers/rootlesskit
ARG ROOTLESSKIT_COMMIT
RUN git pull && git checkout ${ROOTLESSKIT_COMMIT}
ENV CGO_ENABLED=0
RUN mkdir /out && go build -o /out/rootlesskit github.com/rootless-containers/rootlesskit/cmd/rootlesskit
FROM alpine:${ALPINE_VERSION}
COPY --from=containerd-fuse-overlayfs-test /out/containerd-fuse-overlayfs.test /usr/local/bin
COPY --from=rootlesskit /out/rootlesskit /usr/local/bin
COPY --from=fuse-overlayfs /out/fuse-overlayfs /usr/local/bin
RUN apk add --no-cache fuse3 libcap shadow-uidmap && \
setcap CAP_SETUID=ep /usr/bin/newuidmap && \
setcap CAP_SETGID=ep /usr/bin/newgidmap && \
adduser -D -u 1000 testuser && \
echo testuser:100000:65536 | tee /etc/subuid | tee /etc/subgid
USER testuser
# If /tmp is real overlayfs, some tests fail. Mount a volume to ensure /tmp to be a sane filesystem.
VOLUME /tmp
# requires --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/fuse
CMD ["rootlesskit", "containerd-fuse-overlayfs.test", "-test.root", "-test.v"]

View File

@@ -0,0 +1,191 @@
Apache License
Version 2.0, January 2004
https://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Copyright The containerd Authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -0,0 +1,4 @@
# fuse-overlayfs-snapshotter maintainers
#
# As a containerd sub-project, containerd maintainers are also included from https://github.com/containerd/project/blob/master/MAINTAINERS.
# See https://github.com/containerd/project/blob/master/GOVERNANCE.md for a description of the maintainer role.

View File

@@ -0,0 +1,108 @@
# Copyright The containerd Authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Deliverables path
DESTDIR ?= /usr/local
BINDIR ?= $(DESTDIR)/bin
# Tools path
ECHO ?= echo
DOCKER ?= docker
GO ?= go
MKDIR ?= mkdir
TAR ?= tar
INSTALL ?= install
GIT ?= git
TARGET_BIN=containerd-fuse-overlayfs-grpc
VERSION ?= $(shell $(GIT) describe --match 'v[0-9]*' --dirty='.m' --always --tags)
VERSION_TRIMMED := $(VERSION:v%=%)
REVISION ?= $(shell $(GIT) rev-parse HEAD)$(shell if ! $(GIT) diff --no-ext-diff --quiet --exit-code; then $(ECHO) .m; fi)
PKG_MAIN := github.com/containerd/fuse-overlayfs-snapshotter/v2/cmd/$(TARGET_BIN)
PKG_VERSION := github.com/containerd/fuse-overlayfs-snapshotter/v2/cmd/$(TARGET_BIN)/version
export GO_BUILD=GO111MODULE=on CGO_ENABLED=0 $(GO) build -ldflags "-s -w -X $(PKG_VERSION).Version=$(VERSION) -X $(PKG_VERSION).Revision=$(REVISION)"
bin/$(TARGET_BIN):
$(GO_BUILD) -o $@ $(PKG_MAIN)
all: binaries
help:
@$(ECHO) "Usage: make <target>"
@$(ECHO)
@$(ECHO) " * 'install' - Install binaries to system locations."
@$(ECHO) " * 'uninstall' - Uninstall binaries from system."
@$(ECHO) " * 'binaries' - Build $(TARGET_BIN)."
@$(ECHO) " * 'test' - Run tests."
@$(ECHO) " * 'clean' - Clean artifacts."
@$(ECHO) " * 'help' - Show this help message."
binaries: bin/$(TARGET_BIN)
$(TARGET_BIN):
$(GO_BUILD) -o $(CURDIR)/bin/$@ $(PKG_MAIN)
binaries: $(TARGET_BIN)
install:
$(INSTALL) -D -m 755 $(CURDIR)/bin/$(TARGET_BIN) $(BINDIR)/$(TARGET_BIN)
uninstall:
$(RM) $(BINDIR)/$(TARGET_BIN)
clean:
$(RM) -r $(CURDIR)/bin $(CURDIR)/_output
TEST_DOCKER_IMG_TAG=containerd-fuse-overlayfs-test
test:
DOCKER_BUILDKIT=1 $(DOCKER) build -t $(TEST_DOCKER_IMG_TAG) --build-arg FUSEOVERLAYFS_COMMIT=${FUSEOVERLAYFS_COMMIT} .
$(DOCKER) run --rm $(TEST_DOCKER_IMG_TAG) fuse-overlayfs -V
$(DOCKER) run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/fuse $(TEST_DOCKER_IMG_TAG)
$(DOCKER) rmi $(TEST_DOCKER_IMG_TAG)
_test:
$(GO) test -exec rootlesskit -test.v -test.root
TAR_FLAGS=--transform 's/.*\///g' --owner=0 --group=0
ARTIFACT_NAME=containerd-fuse-overlayfs-$(VERSION_TRIMMED)
artifacts: clean
$(MKDIR) -p _output
GOOS=linux GOARCH=amd64 make -B
$(TAR) $(TAR_FLAGS) -czvf _output/$(ARTIFACT_NAME)-linux-amd64.tar.gz $(CURDIR)/bin/*
GOOS=linux GOARCH=arm64 make -B
$(TAR) $(TAR_FLAGS) -czvf _output/$(ARTIFACT_NAME)-linux-arm64.tar.gz $(CURDIR)/bin/*
GOOS=linux GOARCH=arm GOARM=7 make -B
$(TAR) $(TAR_FLAGS) -czvf _output/$(ARTIFACT_NAME)-linux-arm-v7.tar.gz $(CURDIR)/bin/*
GOOS=linux GOARCH=ppc64le make -B
$(TAR) $(TAR_FLAGS) -czvf _output/$(ARTIFACT_NAME)-linux-ppc64le.tar.gz $(CURDIR)/bin/*
GOOS=linux GOARCH=s390x make -B
$(TAR) $(TAR_FLAGS) -czvf _output/$(ARTIFACT_NAME)-linux-s390x.tar.gz $(CURDIR)/bin/*
GOOS=linux GOARCH=riscv64 make -B
$(TAR) $(TAR_FLAGS) -czvf _output/$(ARTIFACT_NAME)-linux-riscv64.tar.gz $(CURDIR)/bin/*
.PHONY: \
$(TARGET_BIN) \
install \
uninstall \
clean \
test \
_test \
artifacts \
help

View File

@@ -0,0 +1,157 @@
# [`fuse-overlayfs`](https://github.com/containers/fuse-overlayfs) snapshotter plugin for [containerd](https://containerd.io)
Unlike `overlayfs`, `fuse-overlayfs` can be used as a non-root user on almost all recent distros.
You do NOT need this `fuse-overlayfs` plugin on the following environments, because they support the real `overlayfs` for non-root users:
- [kernel >= 5.11](https://github.com/torvalds/linux/commit/459c7c565ac36ba09ffbf24231147f408fde4203)
- [Ubuntu kernel, since circa 2015](https://kernel.ubuntu.com/git/ubuntu/ubuntu-bionic.git/commit/fs/overlayfs?id=3b7da90f28fe1ed4b79ef2d994c81efbc58f1144)
fuse-overlayfs-snapshotter is a **non-core** sub-project of containerd.
## Requirements
* kernel >= 4.18
* containerd >= 1.4
* fuse-overlayfs >= 0.7.0
## Setup
Two installation options are supported:
1. Embed `fuse-overlayfs` plugin into the containerd binary
2. Execute `fuse-overlayfs` plugin as a separate binary
Choose 1 if you don't mind recompiling containerd, otherwise choose 2.
### Option 1: Embed `fuse-overlayfs` plugin into the containerd binary
Create `builtins_fuseoverlayfs_linux.go` under [`$GOPATH/src/github.com/containerd/containerd/cmd/containerd/builtins`](https://github.com/containerd/containerd/tree/master/cmd/containerd/builtins)
with the following content, and recompile the containerd binary:
```go
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import _ "github.com/containerd/fuse-overlayfs-snapshotter/v2/plugin"
```
No extra configuration is needed.
See https://github.com/containerd/containerd/blob/master/docs/rootless.md for how to run containerd as a non-root user.
### Option 2: Execute `fuse-overlayfs` plugin as a separate binary
#### "Easy way"
The easiest way is to use `containerd-rootless-setuptool.sh` included in [nerdctl](https://github.com/containerd/nerdctl).
```console
$ containerd-rootless-setuptool.sh install
$ containerd-rootless-setuptool.sh install-fuse-overlayfs
[INFO] Creating "/home/exampleuser/.config/systemd/user/containerd-fuse-overlayfs.service"
...
[INFO] Installed "containerd-fuse-overlayfs.service" successfully.
[INFO] To control "containerd-fuse-overlayfs.service", run: `systemctl --user (start|stop|restart) containerd-fuse-overlayfs.service`
[INFO] Add the following lines to "/home/exampleuser/.config/containerd/config.toml" manually:
### BEGIN ###
[proxy_plugins]
[proxy_plugins."fuse-overlayfs"]
type = "snapshot"
address = "/run/user/1000/containerd-fuse-overlayfs.sock"
### END ###
[INFO] Set `export CONTAINERD_SNAPSHOTTER="fuse-overlayfs"` to use the fuse-overlayfs snapshotter.
```
Add the `[proxy_plugins."fuse-overlayfs"]` configuration shown above to `~/.config/containerd/config.toml`.
"1000" needs to be replaced with your actual UID.
#### "Hard way"
<details>
<summary>Click here to show the "hard way"</summary>
<p>
* Install `containerd-fuse-overlayfs-grpc` binary. The binary will be installed under `$DESTDIR/bin`.
```console
$ make && DESTDIR=$HOME make install
```
* Create the following configuration in `~/.config/containerd/config.toml`:
```toml
version = 2
# substitute "/home/suda" with your own $HOME
root = "/home/suda/.local/share/containerd"
# substitute "/run/user/1001" with your own $XDG_RUNTIME_DIR
state = "/run/user/1001/containerd"
[grpc]
address = "/run/user/1001/containerd/containerd.sock"
[proxy_plugins]
[proxy_plugins."fuse-overlayfs"]
type = "snapshot"
address = "/run/user/1001/containerd/fuse-overlayfs.sock"
```
* Start [RootlessKit](https://github.com/rootless-containers/rootlesskit) with `sleep infinity` (or any kind of "pause" command):
```console
$ rootlesskit \
--net=slirp4netns --disable-host-loopback \
--copy-up=/etc --copy-up=/run \
--state-dir=$XDG_RUNTIME_DIR/rootlesskit-containerd \
sh -c "rm -rf /run/containerd ; sleep infinity"
```
(Note: `rm -rf /run/containerd` is a workaround for [containerd/containerd#2767](https://github.com/containerd/containerd/issues/2767))
* Enter the RootlessKit namespaces and run `containerd-fuse-overlayfs-grpc`:
```console
$ nsenter -U --preserve-credentials -m -n -t $(cat $XDG_RUNTIME_DIR/rootlesskit-containerd/child_pid) \
containerd-fuse-overlayfs-grpc $XDG_RUNTIME_DIR/containerd/fuse-overlayfs.sock $HOME/.local/share/containerd-fuse-overlayfs
```
* Enter the same namespaces and run `containerd`:
```console
$ nsenter -U --preserve-credentials -m -n -t $(cat $XDG_RUNTIME_DIR/rootlesskit-containerd/child_pid) \
containerd -c $HOME/.config/containerd/config.toml
```
</p>
</details>
## Usage
```console
$ export CONTAINERD_SNAPSHOTTER=fuse-overlayfs
$ nerdctl run ...
```
## How to test
To run the test as a non-root user, [RootlessKit](https://github.com/rootless-containers/rootlesskit) needs to be installed.
```console
$ go test -exec rootlesskit -test.v -test.root
```
## Project details
fuse-overlayfs-snapshotter is a containerd **non-core** sub-project, licensed under the [Apache 2.0 license](./LICENSE).
As a containerd non-core sub-project, you will find the:
* [Project governance](https://github.com/containerd/project/blob/master/GOVERNANCE.md),
* [Maintainers](./MAINTAINERS),
* and [Contributing guidelines](https://github.com/containerd/project/blob/master/CONTRIBUTING.md)
information in our [`containerd/project`](https://github.com/containerd/project) repository.

View File

@@ -0,0 +1,82 @@
//go:build linux
// +build linux
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package fuseoverlayfs
import (
"fmt"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"github.com/containerd/containerd/v2/core/mount"
"github.com/containerd/log"
)
// supportsReadonlyMultipleLowerDir checks if read-only multiple lowerdirs can be mounted with fuse-overlayfs.
// https://github.com/containers/fuse-overlayfs/pull/133
//
// It builds a throwaway directory tree under d, attempts a fuse-overlayfs
// mount with two lowerdirs and no upperdir/workdir (a read-only mount),
// and returns whatever error the mount attempt produces.
func supportsReadonlyMultipleLowerDir(d string) error {
	// Scratch area for the probe; removed (best-effort) on exit.
	td, err := ioutil.TempDir(d, "fuseoverlayfs-check")
	if err != nil {
		return err
	}
	defer func() {
		if err := os.RemoveAll(td); err != nil {
			log.L.WithError(err).Warnf("Failed to remove check directory %v", td)
		}
	}()
	for _, dir := range []string{"lower1", "lower2", "merged"} {
		if err := os.Mkdir(filepath.Join(td, dir), 0755); err != nil {
			return err
		}
	}
	// Two lowerdirs and no upperdir: exercises exactly the read-only
	// multi-lower feature being probed.
	opts := []string{fmt.Sprintf("lowerdir=%s:%s", filepath.Join(td, "lower2"), filepath.Join(td, "lower1"))}
	m := mount.Mount{
		Type:    "fuse3." + fuseoverlayfsBinary,
		Source:  "overlay",
		Options: opts,
	}
	dest := filepath.Join(td, "merged")
	if err := m.Mount(dest); err != nil {
		return fmt.Errorf("failed to mount fuse-overlayfs (%+v) on %s: %w", m, dest, err)
	}
	// Unmount failure is only logged: the mount itself already proved support.
	if err := mount.UnmountAll(dest, 0); err != nil {
		log.L.WithError(err).Warnf("Failed to unmount check directory %v", dest)
	}
	return nil
}
// Supported returns nil when the overlayfs is functional on the system with the root directory.
// Supported is not called during plugin initialization, but exposed for downstream projects which use
// this snapshotter as a library.
//
// Three checks run in order: the fuse-overlayfs binary must be on PATH,
// root must be creatable, and a probe mount with multiple read-only
// lowerdirs must succeed (requires kernel >= 4.18).
func Supported(root string) error {
	_, lookErr := exec.LookPath(fuseoverlayfsBinary)
	if lookErr != nil {
		return fmt.Errorf("%s not installed: %w", fuseoverlayfsBinary, lookErr)
	}
	if mkErr := os.MkdirAll(root, 0700); mkErr != nil {
		return mkErr
	}
	probeErr := supportsReadonlyMultipleLowerDir(root)
	if probeErr != nil {
		return fmt.Errorf("fuse-overlayfs not functional, make sure running with kernel >= 4.18: %w", probeErr)
	}
	return nil
}

View File

@@ -0,0 +1,518 @@
//go:build linux
// +build linux
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package fuseoverlayfs
import (
"context"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"
"syscall"
"github.com/containerd/containerd/v2/core/mount"
"github.com/containerd/containerd/v2/core/snapshots"
"github.com/containerd/containerd/v2/core/snapshots/storage"
"github.com/containerd/continuity/fs"
"github.com/containerd/log"
)
const (
fuseoverlayfsBinary = "fuse-overlayfs"
)
// SnapshotterConfig is used to configure the overlay snapshotter instance
type SnapshotterConfig struct {
	// asyncRemove defers filesystem removal until Cleanup;
	// set via the AsynchronousRemove option below.
	asyncRemove bool
}

// Opt is an option to configure the overlay snapshotter
type Opt func(config *SnapshotterConfig) error

// AsynchronousRemove defers removal of filesystem content until
// the Cleanup method is called. Removals will make the snapshot
// referred to by the key unavailable and make the key immediately
// available for re-use.
//
// AsynchronousRemove is untested for fuse-overlayfs
func AsynchronousRemove(config *SnapshotterConfig) error {
	config.asyncRemove = true
	return nil
}
// snapshotter is the fuse-overlayfs-backed snapshots.Snapshotter
// implementation returned by NewSnapshotter.
type snapshotter struct {
	root        string             // base dir holding "snapshots/" and "metadata.db" (see NewSnapshotter)
	ms          *storage.MetaStore // snapshot metadata store backed by root/metadata.db
	asyncRemove bool               // when true, removals are deferred (see AsynchronousRemove)
}
// NewSnapshotter returns a Snapshotter which uses overlayfs. The overlayfs
// diffs are stored under the provided root. A metadata file is stored under
// the root.
func NewSnapshotter(root string, opts ...Opt) (snapshots.Snapshotter, error) {
	// Apply functional options onto a zero config.
	config := SnapshotterConfig{}
	for _, apply := range opts {
		if err := apply(&config); err != nil {
			return nil, err
		}
	}

	if err := os.MkdirAll(root, 0700); err != nil {
		return nil, err
	}
	ms, err := storage.NewMetaStore(filepath.Join(root, "metadata.db"))
	if err != nil {
		return nil, err
	}
	// "snapshots" may already exist from a previous run; only a genuine
	// creation failure is an error.
	if err := os.Mkdir(filepath.Join(root, "snapshots"), 0700); err != nil && !os.IsExist(err) {
		return nil, err
	}

	sn := &snapshotter{
		root:        root,
		ms:          ms,
		asyncRemove: config.asyncRemove,
	}
	return sn, nil
}
// Stat returns the info for an active or committed snapshot by name or
// key.
//
// Should be used for parent resolution, existence checks and to discern
// the kind of snapshot.
func (o *snapshotter) Stat(ctx context.Context, key string) (snapshots.Info, error) {
	// Read-only transaction; always rolled back since nothing is written.
	ctx, t, err := o.ms.TransactionContext(ctx, false)
	if err != nil {
		return snapshots.Info{}, err
	}
	defer t.Rollback()
	_, info, _, err := storage.GetInfo(ctx, key)
	if err != nil {
		return snapshots.Info{}, err
	}
	return info, nil
}
// Update modifies the stored info of a snapshot, restricted to the given
// fieldpaths, inside a writable metadata transaction.
func (o *snapshotter) Update(ctx context.Context, info snapshots.Info, fieldpaths ...string) (snapshots.Info, error) {
	ctx, t, err := o.ms.TransactionContext(ctx, true)
	if err != nil {
		return snapshots.Info{}, err
	}
	info, err = storage.UpdateInfo(ctx, info, fieldpaths...)
	if err != nil {
		// Discard the partial update before surfacing the error.
		t.Rollback()
		return snapshots.Info{}, err
	}
	if err := t.Commit(); err != nil {
		return snapshots.Info{}, err
	}
	return info, nil
}
// Usage returns the resources taken by the snapshot identified by key.
//
// For active snapshots, this will scan the usage of the overlay "diff" (aka
// "upper") directory and may take some time.
//
// For committed snapshots, the value is returned from the metadata database.
func (o *snapshotter) Usage(ctx context.Context, key string) (snapshots.Usage, error) {
	ctx, t, err := o.ms.TransactionContext(ctx, false)
	if err != nil {
		return snapshots.Usage{}, err
	}
	id, info, usage, err := storage.GetInfo(ctx, key)
	t.Rollback() // transaction no longer needed at this point.
	if err != nil {
		return snapshots.Usage{}, err
	}
	upperPath := o.upperPath(id)
	if info.Kind == snapshots.KindActive {
		// Active snapshots have live writes; measure the upper dir directly.
		du, err := fs.DiskUsage(ctx, upperPath)
		if err != nil {
			// TODO(stevvooe): Consider not reporting an error in this case.
			return snapshots.Usage{}, err
		}
		usage = snapshots.Usage(du)
	}
	return usage, nil
}
// Prepare creates an active (writable) snapshot identified by key, based on
// parent, and returns the mounts needed to access it.
func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
	return o.createSnapshot(ctx, snapshots.KindActive, key, parent, opts)
}
// View creates a read-only snapshot identified by key, based on parent, and
// returns the mounts needed to access it.
func (o *snapshotter) View(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
	return o.createSnapshot(ctx, snapshots.KindView, key, parent, opts)
}
// Mounts returns the mounts for the transaction identified by key. Can be
// called on a read-write or readonly transaction.
//
// This can be used to recover mounts after calling View or Prepare.
func (o *snapshotter) Mounts(ctx context.Context, key string) ([]mount.Mount, error) {
	ctx, t, err := o.ms.TransactionContext(ctx, false)
	if err != nil {
		return nil, err
	}
	// Read-only transaction: always release it, including on the
	// GetSnapshot error path (previously that path leaked the transaction).
	defer t.Rollback()
	s, err := storage.GetSnapshot(ctx, key)
	if err != nil {
		return nil, err
	}
	_, info, _, err := storage.GetInfo(ctx, key)
	if err != nil {
		return nil, fmt.Errorf("failed to get active mount: %w", err)
	}
	return o.mounts(s, info), nil
}
// Commit captures the active snapshot identified by key as the committed,
// immutable snapshot name. The recorded usage is the disk usage of the
// snapshot's upper directory at commit time.
func (o *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) error {
	ctx, t, err := o.ms.TransactionContext(ctx, true)
	if err != nil {
		return err
	}
	// Roll back only when a later step failed; the final t.Commit() error
	// is returned directly and does not re-enter this path.
	defer func() {
		if err != nil {
			if rerr := t.Rollback(); rerr != nil {
				log.G(ctx).WithError(rerr).Warn("failed to rollback transaction")
			}
		}
	}()
	// grab the existing id
	id, _, _, err := storage.GetInfo(ctx, key)
	if err != nil {
		return err
	}
	// Record the snapshot's current disk usage as part of the commit.
	usage, err := fs.DiskUsage(ctx, o.upperPath(id))
	if err != nil {
		return err
	}
	if _, err = storage.CommitActive(ctx, key, name, snapshots.Usage(usage), opts...); err != nil {
		return fmt.Errorf("failed to commit snapshot: %w", err)
	}
	return t.Commit()
}
// Remove abandons the snapshot identified by key. The snapshot will
// immediately become unavailable and unrecoverable. Disk space will
// be freed up on the next call to `Cleanup`.
func (o *snapshotter) Remove(ctx context.Context, key string) (err error) {
	ctx, t, err := o.ms.TransactionContext(ctx, true)
	if err != nil {
		return err
	}
	// Roll back the transaction on any error path below.
	defer func() {
		if err != nil {
			if rerr := t.Rollback(); rerr != nil {
				log.G(ctx).WithError(rerr).Warn("failed to rollback transaction")
			}
		}
	}()
	_, _, err = storage.Remove(ctx, key)
	if err != nil {
		return fmt.Errorf("failed to remove: %w", err)
	}
	if !o.asyncRemove {
		// Synchronous mode: collect the now-orphaned directories while still
		// holding the write transaction, then delete them after commit.
		var removals []string
		removals, err = o.getCleanupDirectories(ctx, t)
		if err != nil {
			return fmt.Errorf("unable to get directories for removal: %w", err)
		}
		// Remove directories after the transaction is closed, failures must not
		// return error since the transaction is committed with the removal
		// key no longer available.
		defer func() {
			if err == nil {
				for _, dir := range removals {
					if err := os.RemoveAll(dir); err != nil {
						log.G(ctx).WithError(err).WithField("path", dir).Warn("failed to remove directory")
					}
				}
			}
		}()
	}
	return t.Commit()
}
// Walk calls fn for each snapshot matching the provided filters inside a
// read-only transaction.
func (o *snapshotter) Walk(ctx context.Context, fn snapshots.WalkFunc, filters ...string) error {
	// Note: the parameter was renamed from "fs" to "filters" so it no longer
	// shadows the imported continuity "fs" package.
	ctx, t, err := o.ms.TransactionContext(ctx, false)
	if err != nil {
		return err
	}
	defer t.Rollback()
	return storage.WalkInfo(ctx, fn, filters...)
}
// Cleanup cleans up disk resources from removed or abandoned snapshots
func (o *snapshotter) Cleanup(ctx context.Context) error {
	dirs, err := o.cleanupDirectories(ctx)
	if err != nil {
		return err
	}
	// Best effort: log and continue on individual removal failures.
	for _, d := range dirs {
		if rmErr := os.RemoveAll(d); rmErr != nil {
			log.G(ctx).WithError(rmErr).WithField("path", d).Warn("failed to remove directory")
		}
	}
	return nil
}
// cleanupDirectories returns the on-disk snapshot directories that are no
// longer referenced by the metadata store.
func (o *snapshotter) cleanupDirectories(ctx context.Context) ([]string, error) {
	// Get a write transaction to ensure no other write transaction can be entered
	// while the cleanup is scanning.
	ctx, t, err := o.ms.TransactionContext(ctx, true)
	if err != nil {
		return nil, err
	}
	defer t.Rollback()
	return o.getCleanupDirectories(ctx, t)
}
// getCleanupDirectories lists the directories under the snapshots dir whose
// names do not appear in the metadata store's ID map, i.e. orphans that can
// be deleted. The caller must hold the transaction t.
func (o *snapshotter) getCleanupDirectories(ctx context.Context, t storage.Transactor) ([]string, error) {
	ids, err := storage.IDMap(ctx)
	if err != nil {
		return nil, err
	}

	snapshotDir := filepath.Join(o.root, "snapshots")
	dir, err := os.Open(snapshotDir)
	if err != nil {
		return nil, err
	}
	defer dir.Close()

	names, err := dir.Readdirnames(0)
	if err != nil {
		return nil, err
	}

	orphans := []string{}
	for _, name := range names {
		if _, live := ids[name]; live {
			continue
		}
		orphans = append(orphans, filepath.Join(snapshotDir, name))
	}
	return orphans, nil
}
// createSnapshot creates an active or view snapshot for key on top of parent
// and returns the mounts needed to access it. On failure the metadata
// transaction is rolled back and any partially-created directories removed.
func (o *snapshotter) createSnapshot(ctx context.Context, kind snapshots.Kind, key, parent string, opts []snapshots.Opt) (_ []mount.Mount, err error) {
	ctx, t, err := o.ms.TransactionContext(ctx, true)
	if err != nil {
		return nil, err
	}

	var td, path string
	// Reclaim the temp dir and/or the renamed snapshot dir on failure.
	defer func() {
		if err != nil {
			if td != "" {
				if err1 := os.RemoveAll(td); err1 != nil {
					log.G(ctx).WithError(err1).Warn("failed to cleanup temp snapshot directory")
				}
			}
			if path != "" {
				if err1 := os.RemoveAll(path); err1 != nil {
					log.G(ctx).WithError(err1).WithField("path", path).Error("failed to reclaim snapshot directory, directory may need removal")
					err = fmt.Errorf("failed to remove path: %v: %w", err1, err)
				}
			}
		}
	}()

	snapshotDir := filepath.Join(o.root, "snapshots")
	td, err = o.prepareDirectory(ctx, snapshotDir, kind)
	if err != nil {
		if rerr := t.Rollback(); rerr != nil {
			log.G(ctx).WithError(rerr).Warn("failed to rollback transaction")
		}
		return nil, fmt.Errorf("failed to create prepare snapshot dir: %w", err)
	}

	// From here on a single deferred rollback covers every error return.
	rollback := true
	defer func() {
		if rollback {
			if rerr := t.Rollback(); rerr != nil {
				log.G(ctx).WithError(rerr).Warn("failed to rollback transaction")
			}
		}
	}()

	s, err := storage.CreateSnapshot(ctx, kind, key, parent, opts...)
	if err != nil {
		return nil, fmt.Errorf("failed to create snapshot: %w", err)
	}

	if len(s.ParentIDs) > 0 {
		st, err := os.Stat(o.upperPath(s.ParentIDs[0]))
		if err != nil {
			return nil, fmt.Errorf("failed to stat parent: %w", err)
		}
		stat := st.Sys().(*syscall.Stat_t)
		// Match the new upper dir's ownership to the parent layer's.
		// Fix: the explicit rollback that used to sit here was redundant —
		// the deferred rollback above fires too, causing a double-rollback
		// warning on this path.
		if err := os.Lchown(filepath.Join(td, "fs"), int(stat.Uid), int(stat.Gid)); err != nil {
			return nil, fmt.Errorf("failed to chown: %w", err)
		}
	}

	path = filepath.Join(snapshotDir, s.ID)
	if err = os.Rename(td, path); err != nil {
		return nil, fmt.Errorf("failed to rename: %w", err)
	}
	td = ""

	// Fetch the stored info so mounts() can honor uid/gid mapping labels.
	// Fix: this error was previously discarded, which could silently drop
	// the mapping options from the returned mounts.
	_, info, _, err := storage.GetInfo(ctx, key)
	if err != nil {
		return nil, fmt.Errorf("failed to get snapshot info: %w", err)
	}

	rollback = false
	if err = t.Commit(); err != nil {
		return nil, fmt.Errorf("commit failed: %w", err)
	}
	return o.mounts(s, info), nil
}
// prepareDirectory creates a temporary snapshot directory under snapshotDir
// containing an "fs" dir and, for active snapshots, a "work" dir. The temp
// dir is returned even on partial failure so the caller can clean it up.
func (o *snapshotter) prepareDirectory(ctx context.Context, snapshotDir string, kind snapshots.Kind) (string, error) {
	td, err := ioutil.TempDir(snapshotDir, "new-")
	if err != nil {
		return "", fmt.Errorf("failed to create temp dir: %w", err)
	}
	if err := os.Mkdir(filepath.Join(td, "fs"), 0755); err != nil {
		return td, err
	}
	// Only active (writable) snapshots need an overlay work directory.
	if kind != snapshots.KindActive {
		return td, nil
	}
	if err := os.Mkdir(filepath.Join(td, "work"), 0711); err != nil {
		return td, err
	}
	return td, nil
}
// mounts builds the mount specification for snapshot s:
//
//   - no parents: a plain bind mount of the snapshot's own fs dir (rw, or ro for views)
//   - view with exactly one parent: read-only bind mount of the parent's fs dir
//   - otherwise: a fuse3.fuse-overlayfs mount with lowerdir (and, for active
//     snapshots, upperdir/workdir), plus uid/gid mapping options when the
//     corresponding snapshot labels are present
func (o *snapshotter) mounts(s storage.Snapshot, info snapshots.Info) []mount.Mount {
	if len(s.ParentIDs) == 0 {
		// if we only have one layer/no parents then just return a bind mount as overlay
		// will not work
		roFlag := "rw"
		if s.Kind == snapshots.KindView {
			roFlag = "ro"
		}
		return []mount.Mount{
			{
				Source: o.upperPath(s.ID),
				Type:   "bind",
				Options: []string{
					roFlag,
					"rbind",
				},
			},
		}
	}
	var options []string
	if s.Kind == snapshots.KindActive {
		options = append(options,
			fmt.Sprintf("workdir=%s", o.workPath(s.ID)),
			fmt.Sprintf("upperdir=%s", o.upperPath(s.ID)),
		)
	} else if len(s.ParentIDs) == 1 {
		// Read-only view of a single parent: no overlay needed.
		return []mount.Mount{
			{
				Source: o.upperPath(s.ParentIDs[0]),
				Type:   "bind",
				Options: []string{
					"ro",
					"rbind",
				},
			},
		}
	}
	parentPaths := make([]string, len(s.ParentIDs))
	for i := range s.ParentIDs {
		parentPaths[i] = o.upperPath(s.ParentIDs[i])
	}
	options = append(options, fmt.Sprintf("lowerdir=%s", strings.Join(parentPaths, ":")))
	// Pass ID-mapping labels through to fuse-overlayfs, converting the label
	// separator to the ':' form the binary expects.
	if mapping, ok := info.Labels["containerd.io/snapshot/uidmapping"]; ok {
		options = append(options, fmt.Sprintf("uidmapping=%s", convertIDMappingOption(mapping)))
	}
	if mapping, ok := info.Labels["containerd.io/snapshot/gidmapping"]; ok {
		options = append(options, fmt.Sprintf("gidmapping=%s", convertIDMappingOption(mapping)))
	}
	return []mount.Mount{
		{
			Type:    "fuse3." + fuseoverlayfsBinary,
			Source:  "overlay",
			Options: options,
		},
	}
}
// upperPath returns the overlay "upper" (diff) directory for snapshot id.
func (o *snapshotter) upperPath(id string) string {
	return filepath.Join(o.root, "snapshots", id, "fs")
}
// workPath returns the overlay work directory for snapshot id.
func (o *snapshotter) workPath(id string) string {
	return filepath.Join(o.root, "snapshots", id, "work")
}
// Close closes the snapshotter by closing the underlying metadata store.
func (o *snapshotter) Close() error {
	return o.ms.Close()
}
// convertIDMappingOption converts a mapping label whose entries are joined
// with ',' into the ':'-separated form expected by the fuse-overlayfs
// program:
// https://github.com/containers/fuse-overlayfs/blob/main/fuse-overlayfs.1.md
//
// (The comment previously referred to this function by a stale name,
// "fuseIDMappingOption".)
func convertIDMappingOption(label string) string {
	return strings.ReplaceAll(label, ",", ":")
}

View File

@@ -0,0 +1,60 @@
//go:build linux
// +build linux
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package fuseoverlayfs
import (
"errors"
"github.com/containerd/containerd/v2/plugins"
fuseoverlayfs "github.com/containerd/fuse-overlayfs-snapshotter/v2"
"github.com/containerd/platforms"
"github.com/containerd/plugin"
"github.com/containerd/plugin/registry"
)
// Config represents configuration for the fuse-overlayfs plugin.
type Config struct {
	// Root directory for the plugin; when non-empty it overrides the
	// containerd-provided default root.
	RootPath string `toml:"root_path"`
}
// init registers the fuse-overlayfs snapshotter as a containerd snapshot
// plugin under the ID "fuse-overlayfs".
func init() {
	registry.Register(&plugin.Registration{
		Type:   plugins.SnapshotPlugin,
		ID:     "fuse-overlayfs",
		Config: &Config{},
		InitFn: func(ic *plugin.InitContext) (interface{}, error) {
			ic.Meta.Platforms = append(ic.Meta.Platforms, platforms.DefaultSpec())
			config, ok := ic.Config.(*Config)
			if !ok {
				return nil, errors.New("invalid fuse-overlayfs configuration")
			}
			// Default to the plugin root dir, overridable via config.
			root := ic.Properties[plugins.PropertyRootDir]
			if config.RootPath != "" {
				root = config.RootPath
			}
			// Export the effective root so it is visible via introspection.
			ic.Meta.Exports["root"] = root
			return fuseoverlayfs.NewSnapshotter(root)
		},
	})
}

View File

@@ -1,4 +1,5 @@
Apache License
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
@@ -198,4 +199,4 @@
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
limitations under the License.

View File

@@ -0,0 +1,67 @@
The source code developed under the Stargz Snapshotter Project is licensed under Apache License 2.0.
However, the Stargz Snapshotter project contains modified subcomponents from the Container Registry Filesystem Project with separate copyright notices and license terms. Your use of the source code for these subcomponents is subject to the terms and conditions as defined by the source project. Files in these subcomponents contain the following file header.
```
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the NOTICE.md file.
```
This source code is governed by a 3-Clause BSD license. The copyright notice, list of conditions, and disclaimer are the following.
```
Copyright (c) 2019 Google LLC. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```
The Stargz Snapshotter project also contains modified benchmarking code from the HelloBench Project with separate copyright notices and license terms. Your use of the source code for the benchmarking code is subject to the terms and conditions as defined by the source project. This source code is governed by an MIT license. The copyright notice, conditions, and disclaimer are the following. The file in the benchmarking code contains it as the file header.
```
The MIT License (MIT)
Copyright (c) 2015 Tintri
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
```

View File

@@ -0,0 +1,440 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cache
import (
"bytes"
"fmt"
"io"
"os"
"path/filepath"
"sync"
"github.com/containerd/stargz-snapshotter/util/cacheutil"
"github.com/containerd/stargz-snapshotter/util/namedmutex"
"github.com/hashicorp/go-multierror"
)
// Default capacities used by NewDirectoryCache when the corresponding
// DirectoryCacheConfig fields are left zero.
const (
	defaultMaxLRUCacheEntry = 10 // on-memory data cache entries
	defaultMaxCacheFds      = 10 // cached open file descriptors
)
// DirectoryCacheConfig configures NewDirectoryCache. Zero values select the
// documented defaults.
type DirectoryCacheConfig struct {
	// Number of entries of LRU cache (default: 10).
	// This won't be used when DataCache is specified.
	MaxLRUCacheEntry int
	// Number of file descriptors to cache (default: 10).
	// This won't be used when FdCache is specified.
	MaxCacheFds int
	// On Add, wait until the data is fully written to the cache directory.
	SyncAdd bool
	// DataCache is an on-memory cache of the data.
	// OnEvicted will be overridden and replaced for internal use.
	DataCache *cacheutil.LRUCache
	// FdCache is a cache for opened file descriptors.
	// OnEvicted will be overridden and replaced for internal use.
	FdCache *cacheutil.LRUCache
	// BufPool will be used for pooling bytes.Buffer.
	BufPool *sync.Pool
	// Direct forcefully enables direct mode for all operation in cache.
	// Thus operation won't use on-memory caches.
	Direct bool
}
// TODO: contents validation.

// BlobCache represents a cache for bytes data
type BlobCache interface {
	// Add returns a writer to add contents to cache
	Add(key string, opts ...Option) (Writer, error)
	// Get returns a reader to read the specified contents
	// from cache
	Get(key string, opts ...Option) (Reader, error)
	// Close closes the cache and releases its resources
	Close() error
}
// Reader provides the data cached. Callers must Close it to release the
// underlying resource (memory-cache slot, file descriptor, or open file).
type Reader interface {
	io.ReaderAt
	Close() error
}
// Writer enables the client to cache byte data. Commit() must be
// called after data is fully written to Write(). To abort the written
// data, Abort() must be called.
type Writer interface {
	io.WriteCloser
	Commit() error
	Abort() error
}
// cacheOpt holds per-call options for BlobCache operations.
type cacheOpt struct {
	direct bool // bypass on-memory caches for this call
}

// Option mutates a cacheOpt and returns it, allowing options to be chained.
type Option func(o *cacheOpt) *cacheOpt
// Direct option lets FetchAt and Add methods not to use on-memory caches. When
// you know that the targeting value won't be used immediately, you can prevent
// the limited space of on-memory caches from being polluted by these unimportant
// values.
func Direct() Option {
	return func(opt *cacheOpt) *cacheOpt {
		opt.direct = true
		return opt
	}
}
// NewDirectoryCache creates a BlobCache backed by the given directory, with
// optional on-memory data and file-descriptor caches in front of it.
// The directory must be an absolute path.
func NewDirectoryCache(directory string, config DirectoryCacheConfig) (BlobCache, error) {
	if !filepath.IsAbs(directory) {
		return nil, fmt.Errorf("dir cache path must be an absolute path; got %q", directory)
	}

	pool := config.BufPool
	if pool == nil {
		pool = &sync.Pool{New: func() interface{} { return new(bytes.Buffer) }}
	}

	dataCache := config.DataCache
	if dataCache == nil {
		capacity := config.MaxLRUCacheEntry
		if capacity == 0 {
			capacity = defaultMaxLRUCacheEntry
		}
		dataCache = cacheutil.NewLRUCache(capacity)
		// Return evicted buffers to the pool for reuse.
		dataCache.OnEvicted = func(key string, value interface{}) {
			value.(*bytes.Buffer).Reset()
			pool.Put(value)
		}
	}

	fdCache := config.FdCache
	if fdCache == nil {
		capacity := config.MaxCacheFds
		if capacity == 0 {
			capacity = defaultMaxCacheFds
		}
		fdCache = cacheutil.NewLRUCache(capacity)
		// Close file descriptors when they fall out of the cache.
		fdCache.OnEvicted = func(key string, value interface{}) {
			value.(*os.File).Close()
		}
	}

	if err := os.MkdirAll(directory, 0700); err != nil {
		return nil, err
	}
	wipdir := filepath.Join(directory, "wip")
	if err := os.MkdirAll(wipdir, 0700); err != nil {
		return nil, err
	}

	return &directoryCache{
		cache:        dataCache,
		fileCache:    fdCache,
		wipLock:      new(namedmutex.NamedMutex),
		directory:    directory,
		wipDirectory: wipdir,
		bufPool:      pool,
		syncAdd:      config.SyncAdd,
		direct:       config.Direct,
	}, nil
}
// directoryCache is a cache implementation which backend is a directory.
type directoryCache struct {
	cache        *cacheutil.LRUCache   // on-memory data cache (bytes.Buffer values)
	fileCache    *cacheutil.LRUCache   // cache of opened *os.File descriptors
	wipDirectory string                // directory holding in-progress writes
	directory    string                // root of the on-disk cache
	wipLock      *namedmutex.NamedMutex
	bufPool      *sync.Pool            // pool of *bytes.Buffer used for staging
	syncAdd      bool                  // when true, Commit writes to disk synchronously
	direct       bool                  // when true, bypass on-memory caches entirely
	closed       bool                  // set once by Close
	closedMu     sync.Mutex            // guards closed
}
// Get returns a reader for the cached contents of key, preferring the
// on-memory data cache, then the cached open file descriptor, and finally
// opening the blob file from disk. With the Direct option (or a cache-wide
// direct mode) the on-memory caches are bypassed entirely.
func (dc *directoryCache) Get(key string, opts ...Option) (Reader, error) {
	if dc.isClosed() {
		return nil, fmt.Errorf("cache is already closed")
	}
	opt := &cacheOpt{}
	for _, o := range opts {
		opt = o(opt)
	}
	if !dc.direct && !opt.direct {
		// Get data from memory
		if b, done, ok := dc.cache.Get(key); ok {
			return &reader{
				ReaderAt: bytes.NewReader(b.(*bytes.Buffer).Bytes()),
				closeFunc: func() error {
					// Release the cache entry reference; the buffer stays cached.
					done()
					return nil
				},
			}, nil
		}
		// Get data from disk. If the file is already opened, use it.
		if f, done, ok := dc.fileCache.Get(key); ok {
			return &reader{
				ReaderAt: f.(*os.File),
				closeFunc: func() error {
					done() // file will be closed when it's evicted from the cache
					return nil
				},
			}, nil
		}
	}
	// Open the cache file and read the target region
	// TODO: If the target cache is write-in-progress, should we wait for the completion
	// or simply report the cache miss?
	file, err := os.Open(dc.cachePath(key))
	if err != nil {
		return nil, fmt.Errorf("failed to open blob file for %q: %w", key, err)
	}
	// If "direct" option is specified, do not cache the file on memory.
	// This option is useful for preventing memory cache from being polluted by data
	// that won't be accessed immediately.
	if dc.direct || opt.direct {
		return &reader{
			ReaderAt:  file,
			closeFunc: func() error { return file.Close() },
		}, nil
	}
	// TODO: should we cache the entire file data on memory?
	// but making I/O (possibly huge) on every fetching
	// might be costly.
	return &reader{
		ReaderAt: file,
		closeFunc: func() error {
			// Hand the opened file to the fd cache on close so subsequent
			// Gets can reuse the descriptor.
			_, done, added := dc.fileCache.Add(key, file)
			defer done() // Release it immediately. Cleaned up on eviction.
			if !added {
				return file.Close() // file already exists in the cache. close it.
			}
			return nil
		},
	}, nil
}
// Add returns a writer for storing contents under key. Data is first
// written to a work-in-progress file; Commit atomically renames it into
// place under cachePath(key). Unless direct mode is requested, the data is
// additionally staged in the on-memory cache and flushed to disk —
// synchronously only when syncAdd is set, otherwise in a goroutine.
func (dc *directoryCache) Add(key string, opts ...Option) (Writer, error) {
	if dc.isClosed() {
		return nil, fmt.Errorf("cache is already closed")
	}
	opt := &cacheOpt{}
	for _, o := range opts {
		opt = o(opt)
	}
	wip, err := dc.wipFile(key)
	if err != nil {
		return nil, err
	}
	// w writes straight to the wip file and renames it into place on Commit.
	w := &writer{
		WriteCloser: wip,
		commitFunc: func() error {
			if dc.isClosed() {
				return fmt.Errorf("cache is already closed")
			}
			// Commit the cache contents
			c := dc.cachePath(key)
			if err := os.MkdirAll(filepath.Dir(c), os.ModePerm); err != nil {
				var allErr error
				if err := os.Remove(wip.Name()); err != nil {
					allErr = multierror.Append(allErr, err)
				}
				return multierror.Append(allErr,
					fmt.Errorf("failed to create cache directory %q: %w", c, err))
			}
			return os.Rename(wip.Name(), c)
		},
		abortFunc: func() error {
			return os.Remove(wip.Name())
		},
	}
	// If "direct" option is specified, do not cache the passed data on memory.
	// This option is useful for preventing memory cache from being polluted by data
	// that won't be accessed immediately.
	if dc.direct || opt.direct {
		return w, nil
	}
	// Otherwise stage the data in a pooled buffer; the disk write happens
	// when the memory writer is committed.
	b := dc.bufPool.Get().(*bytes.Buffer)
	memW := &writer{
		WriteCloser: nopWriteCloser(io.Writer(b)),
		commitFunc: func() error {
			if dc.isClosed() {
				w.Close()
				return fmt.Errorf("cache is already closed")
			}
			cached, done, added := dc.cache.Add(key, b)
			if !added {
				dc.putBuffer(b) // already exists in the cache. abort it.
			}
			commit := func() error {
				defer done()
				defer w.Close()
				n, err := w.Write(cached.(*bytes.Buffer).Bytes())
				if err != nil || n != cached.(*bytes.Buffer).Len() {
					w.Abort()
					return err
				}
				return w.Commit()
			}
			if dc.syncAdd {
				return commit()
			}
			// NOTE(review): errors from this asynchronous flush are only
			// printed to stdout — consider routing them to a logger.
			go func() {
				if err := commit(); err != nil {
					fmt.Println("failed to commit to file:", err)
				}
			}()
			return nil
		},
		abortFunc: func() error {
			defer w.Close()
			defer w.Abort()
			dc.putBuffer(b) // abort it.
			return nil
		},
	}
	return memW, nil
}
// putBuffer resets b and returns it to the shared buffer pool.
func (dc *directoryCache) putBuffer(b *bytes.Buffer) {
	b.Reset()
	dc.bufPool.Put(b)
}
// Close marks the cache closed and removes the whole cache directory from
// disk. It is idempotent: subsequent calls return nil without side effects.
func (dc *directoryCache) Close() error {
	dc.closedMu.Lock()
	defer dc.closedMu.Unlock()
	if dc.closed {
		return nil
	}
	dc.closed = true
	return os.RemoveAll(dc.directory)
}
// isClosed reports whether Close has been called, under the closed mutex.
func (dc *directoryCache) isClosed() bool {
	dc.closedMu.Lock()
	defer dc.closedMu.Unlock()
	return dc.closed
}
// cachePath returns the on-disk path for key, sharded into a subdirectory
// named after the key's first two characters.
// NOTE(review): this indexes key[:2], assuming keys are at least two
// characters long (they appear to be digest strings) — confirm with callers.
func (dc *directoryCache) cachePath(key string) string {
	return filepath.Join(dc.directory, key[:2], key)
}
// wipFile creates a unique work-in-progress file for key under the wip
// directory; committed writes are later renamed to cachePath(key).
func (dc *directoryCache) wipFile(key string) (*os.File, error) {
	return os.CreateTemp(dc.wipDirectory, key+"-*")
}
// NewMemoryCache creates a BlobCache that keeps all contents on memory.
func NewMemoryCache() BlobCache {
	mc := &MemoryCache{}
	mc.Membuf = make(map[string]*bytes.Buffer)
	return mc
}
// MemoryCache is a cache implementation which backend is a memory.
type MemoryCache struct {
	Membuf map[string]*bytes.Buffer // cached contents keyed by blob key
	mu     sync.Mutex               // guards Membuf
}
// Get returns a reader over the cached contents of key, or an error if the
// key is not present.
func (mc *MemoryCache) Get(key string, opts ...Option) (Reader, error) {
	mc.mu.Lock()
	defer mc.mu.Unlock()
	b, ok := mc.Membuf[key]
	if !ok {
		// Error strings are lowercase per Go convention (was "Missed cache").
		return nil, fmt.Errorf("missed cache: %q", key)
	}
	return &reader{bytes.NewReader(b.Bytes()), func() error { return nil }}, nil
}
// Add returns a writer whose Commit stores the written bytes under key.
// Abort discards them.
func (mc *MemoryCache) Add(key string, opts ...Option) (Writer, error) {
	buf := new(bytes.Buffer)
	commit := func() error {
		mc.mu.Lock()
		defer mc.mu.Unlock()
		mc.Membuf[key] = buf
		return nil
	}
	return &writer{
		WriteCloser: nopWriteCloser(io.Writer(buf)),
		commitFunc:  commit,
		abortFunc:   func() error { return nil },
	}, nil
}
// Close is a no-op for the memory-backed cache.
func (mc *MemoryCache) Close() error {
	return nil
}
// reader adapts an io.ReaderAt with a caller-supplied close function.
type reader struct {
	io.ReaderAt
	closeFunc func() error
}

// Close invokes the close function supplied at construction.
func (r *reader) Close() error { return r.closeFunc() }
// writer adapts an io.WriteCloser with commit/abort hooks.
type writer struct {
	io.WriteCloser
	commitFunc func() error
	abortFunc  func() error
}

// Commit finalizes the written data via the commit hook.
func (w *writer) Commit() error {
	return w.commitFunc()
}

// Abort discards the written data via the abort hook.
func (w *writer) Abort() error {
	return w.abortFunc()
}
type writeCloser struct {
io.Writer
closeFunc func() error
}
func (w *writeCloser) Close() error { return w.closeFunc() }
func nopWriteCloser(w io.Writer) io.WriteCloser {
return &writeCloser{w, func() error { return nil }}
}

View File

@@ -1,4 +1,5 @@
Apache License
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
@@ -198,4 +199,4 @@
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
limitations under the License.

View File

@@ -0,0 +1,689 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the LICENSE file.
*/
package estargz
import (
"archive/tar"
"bytes"
"compress/gzip"
"context"
"errors"
"fmt"
"io"
"os"
"path"
"runtime"
"strings"
"sync"
"github.com/containerd/stargz-snapshotter/estargz/errorutil"
"github.com/klauspost/compress/zstd"
digest "github.com/opencontainers/go-digest"
"golang.org/x/sync/errgroup"
)
// options holds the settings collected from Build's Option arguments.
type options struct {
	chunkSize              int             // chunk size of the eStargz blob; 0 uses the writer default
	compressionLevel       int             // gzip level; Build defaults this to gzip.BestCompression
	prioritizedFiles       []string        // files grouped first for runtime optimization (e.g. prefetch)
	missedPrioritizedFiles *[]string       // if non-nil, records prioritized files absent from the input tar
	compression            Compression     // compression algorithm; defaults to gzip
	ctx                    context.Context // optional context for cancellation during Build
	minChunkSize           int             // minimal bytes per gzip stream (adds a TOC property)
}
// Option configures the eStargz build (see Build).
type Option func(o *options) error
// WithChunkSize option specifies the chunk size of eStargz blob to build.
func WithChunkSize(chunkSize int) Option {
	return func(opts *options) error {
		opts.chunkSize = chunkSize
		return nil
	}
}
// WithCompressionLevel option specifies the gzip compression level.
// The default is gzip.BestCompression.
// This option will be ignored if WithCompression option is used.
// See also: https://godoc.org/compress/gzip#pkg-constants
func WithCompressionLevel(level int) Option {
	return func(opts *options) error {
		opts.compressionLevel = level
		return nil
	}
}
// WithPrioritizedFiles option specifies the list of prioritized files.
// These files must be complete paths that are absolute or relative to "/"
// For example, all of "foo/bar", "/foo/bar", "./foo/bar" and "../foo/bar"
// are treated as "/foo/bar".
func WithPrioritizedFiles(files []string) Option {
	return func(opts *options) error {
		opts.prioritizedFiles = files
		return nil
	}
}
// WithAllowPrioritizeNotFound makes Build continue the execution even if some
// of prioritized files specified by WithPrioritizedFiles option aren't found
// in the input tar. Instead, this records all missed file names to the passed
// slice.
func WithAllowPrioritizeNotFound(missedFiles *[]string) Option {
	return func(opts *options) error {
		// A destination slice is required to report the misses.
		if missedFiles == nil {
			return fmt.Errorf("WithAllowPrioritizeNotFound: slice must be passed")
		}
		opts.missedPrioritizedFiles = missedFiles
		return nil
	}
}
// WithCompression specifies compression algorithm to be used.
// Default is gzip.
func WithCompression(compression Compression) Option {
	return func(opts *options) error {
		opts.compression = compression
		return nil
	}
}
// WithContext specifies a context that can be used for clean cancellation
// of the build.
func WithContext(ctx context.Context) Option {
	return func(o *options) error {
		o.ctx = ctx
		return nil
	}
}
// WithMinChunkSize option specifies the minimal number of bytes of data
// must be written in one gzip stream.
// By increasing this number, one gzip stream can contain multiple files
// and it hopefully leads to smaller result blob.
// NOTE: This adds a TOC property that old reader doesn't understand.
func WithMinChunkSize(minChunkSize int) Option {
	return func(opts *options) error {
		opts.minChunkSize = minChunkSize
		return nil
	}
}
// Blob is an eStargz blob.
type Blob struct {
	io.ReadCloser               // stream of the built eStargz blob
	diffID    digest.Digester   // digester over the uncompressed stream; final after Close
	tocDigest digest.Digest     // digest of the uncompressed TOC JSON
}
// DiffID returns the digest of uncompressed blob.
// It is only valid to call DiffID after Close.
func (b *Blob) DiffID() digest.Digest {
	return b.diffID.Digest()
}
// TOCDigest returns the digest of uncompressed TOC JSON.
func (b *Blob) TOCDigest() digest.Digest {
	return b.tocDigest
}
// Build builds an eStargz blob which is an extended version of stargz, from a blob (gzip, zstd
// or plain tar) passed through the argument. If there are some prioritized files are listed in
// the option, these files are grouped as "prioritized" and can be used for runtime optimization
// (e.g. prefetch). This function builds a blob in parallel, with dividing that blob into several
// (at least the number of runtime.GOMAXPROCS(0)) sub-blobs.
func Build(tarBlob *io.SectionReader, opt ...Option) (_ *Blob, rErr error) {
	var opts options
	opts.compressionLevel = gzip.BestCompression // BestCompression by default
	for _, o := range opt {
		if err := o(&opts); err != nil {
			return nil, err
		}
	}
	if opts.compression == nil {
		opts.compression = newGzipCompressionWithLevel(opts.compressionLevel)
	}
	layerFiles := newTempFiles()
	ctx := opts.ctx
	if ctx == nil {
		ctx = context.Background()
	}
	// Watcher goroutine: if the caller's context is canceled before Build
	// returns, remove the temp files eagerly. Closing `done` (deferred below)
	// stops this goroutine when Build finishes normally.
	done := make(chan struct{})
	defer close(done)
	go func() {
		select {
		case <-done:
			// nop
		case <-ctx.Done():
			layerFiles.CleanupAll()
		}
	}()
	// On error, clean up temp files and, if the context was also canceled,
	// attach the context error to the returned error chain.
	defer func() {
		if rErr != nil {
			if err := layerFiles.CleanupAll(); err != nil {
				rErr = fmt.Errorf("failed to cleanup tmp files: %v: %w", err, rErr)
			}
		}
		if cErr := ctx.Err(); cErr != nil {
			rErr = fmt.Errorf("error from context %q: %w", cErr, rErr)
		}
	}()
	// Normalize the input to plain tar (decompressing gzip/zstd if needed).
	tarBlob, err := decompressBlob(tarBlob, layerFiles)
	if err != nil {
		return nil, err
	}
	// Reorder entries so prioritized files come first, followed by a landmark.
	entries, err := sortEntries(tarBlob, opts.prioritizedFiles, opts.missedPrioritizedFiles)
	if err != nil {
		return nil, err
	}
	var tarParts [][]*entry
	if opts.minChunkSize > 0 {
		// Each entry needs to know the size of the current gzip stream so they
		// cannot be processed in parallel.
		tarParts = [][]*entry{entries}
	} else {
		tarParts = divideEntries(entries, runtime.GOMAXPROCS(0))
	}
	writers := make([]*Writer, len(tarParts))
	payloads := make([]*os.File, len(tarParts))
	var mu sync.Mutex
	var eg errgroup.Group
	for i, parts := range tarParts {
		i, parts := i, parts // capture per-iteration values for the goroutine
		// builds verifiable stargz sub-blobs
		eg.Go(func() error {
			esgzFile, err := layerFiles.TempFile("", "esgzdata")
			if err != nil {
				return err
			}
			sw := NewWriterWithCompressor(esgzFile, opts.compression)
			sw.ChunkSize = opts.chunkSize
			sw.MinChunkSize = opts.minChunkSize
			if sw.needsOpenGzEntries == nil {
				sw.needsOpenGzEntries = make(map[string]struct{})
			}
			for _, f := range []string{PrefetchLandmark, NoPrefetchLandmark} {
				sw.needsOpenGzEntries[f] = struct{}{}
			}
			if err := sw.AppendTar(readerFromEntries(parts...)); err != nil {
				return err
			}
			mu.Lock()
			writers[i] = sw
			payloads[i] = esgzFile
			mu.Unlock()
			return nil
		})
	}
	if err := eg.Wait(); err != nil {
		rErr = err
		return nil, err
	}
	// Merge the per-part TOCs and produce the combined TOC + footer stream.
	tocAndFooter, tocDgst, err := closeWithCombine(writers...)
	if err != nil {
		rErr = err
		return nil, err
	}
	var rs []io.Reader
	for _, p := range payloads {
		fs, err := fileSectionReader(p)
		if err != nil {
			return nil, err
		}
		rs = append(rs, fs)
	}
	diffID := digest.Canonical.Digester()
	pr, pw := io.Pipe()
	// Stream the concatenated sub-blobs + TOC/footer through the pipe while
	// tee-ing the compressed bytes into a decompressor to compute the diffID
	// (digest of the uncompressed payload) on the fly.
	go func() {
		r, err := opts.compression.Reader(io.TeeReader(io.MultiReader(append(rs, tocAndFooter)...), pw))
		if err != nil {
			pw.CloseWithError(err)
			return
		}
		defer r.Close()
		if _, err := io.Copy(diffID.Hash(), r); err != nil {
			pw.CloseWithError(err)
			return
		}
		pw.Close()
	}()
	return &Blob{
		ReadCloser: readCloser{
			Reader:    pr,
			closeFunc: layerFiles.CleanupAll,
		},
		tocDigest: tocDgst,
		diffID:    diffID,
	}, nil
}
// closeWithCombine takes unclosed Writers and close them. This also returns the
// toc that combined all Writers into.
// Writers doesn't write TOC and footer to the underlying writers so they can be
// combined into a single eStargz and tocAndFooter returned by this function can
// be appended at the tail of that combined blob.
func closeWithCombine(ws ...*Writer) (tocAndFooterR io.Reader, tocDgst digest.Digest, err error) {
	if len(ws) == 0 {
		return nil, "", fmt.Errorf("at least one writer must be passed")
	}
	for _, w := range ws {
		if w.closed {
			return nil, "", fmt.Errorf("writer must be unclosed")
		}
		// Deliberate defer-in-loop: every writer is marked closed when this
		// function returns, regardless of which path returns.
		defer func(w *Writer) { w.closed = true }(w)
		if err := w.closeGz(); err != nil {
			return nil, "", err
		}
		if err := w.bw.Flush(); err != nil {
			return nil, "", err
		}
	}
	var (
		mtoc          = new(JTOC) // merged TOC across all writers
		currentOffset int64       // running byte offset of each sub-blob in the combined blob
	)
	mtoc.Version = ws[0].toc.Version
	for _, w := range ws {
		for _, e := range w.toc.Entries {
			// Recalculate Offset of non-empty files/chunks
			if (e.Type == "reg" && e.Size > 0) || e.Type == "chunk" {
				e.Offset += currentOffset
			}
			mtoc.Entries = append(mtoc.Entries, e)
		}
		// The merged TOC advertises the highest version seen.
		if w.toc.Version > mtoc.Version {
			mtoc.Version = w.toc.Version
		}
		currentOffset += w.cw.n
	}
	return tocAndFooter(ws[0].compressor, mtoc, currentOffset)
}
// tocAndFooter serializes toc with the given compressor and returns a reader
// over the serialized TOC-and-footer bytes together with the TOC digest.
// offset is the position in the combined blob where the TOC stream begins.
func tocAndFooter(compressor Compressor, toc *JTOC, offset int64) (io.Reader, digest.Digest, error) {
	var b bytes.Buffer
	dgst, err := compressor.WriteTOCAndFooter(&b, offset, toc, nil)
	if err != nil {
		return nil, "", err
	}
	return &b, dgst, nil
}
// divideEntries divides passed entries to the parts at least the number specified by the
// argument. Parts are split at roughly equal cumulative payload size.
func divideEntries(entries []*entry, minPartsNum int) (set [][]*entry) {
	// Estimate the total payload size to derive a per-part target.
	var total int64
	for _, e := range entries {
		total += e.header.Size
	}
	partSize := total / int64(minPartsNum)

	var (
		current  []*entry
		consumed int64
		boundary = partSize
	)
	for _, e := range entries {
		current = append(current, e)
		consumed += e.header.Size
		if consumed > boundary {
			// Crossed a part boundary: seal the current part and start a new one.
			set = append(set, current)
			current = nil
			boundary += partSize
		}
	}
	set = append(set, current)
	return set
}
// errNotFound is returned (wrapped) by moveRec when a requested entry exists
// in neither the input nor the output tar file.
var errNotFound = errors.New("not found")

// sortEntries reads the specified tar blob and returns a list of tar entries.
// If some of prioritized files are specified, the list starts from these
// files with keeping the order specified by the argument.
func sortEntries(in io.ReaderAt, prioritized []string, missedPrioritized *[]string) ([]*entry, error) {
	// Import tar file.
	intar, err := importTar(in)
	if err != nil {
		return nil, fmt.Errorf("failed to sort: %w", err)
	}

	// Sort the tar file respecting to the prioritized files list.
	sorted := &tarFile{}
	for _, l := range prioritized {
		if err := moveRec(l, intar, sorted); err != nil {
			if errors.Is(err, errNotFound) && missedPrioritized != nil {
				// Record the miss for the caller instead of failing the build.
				*missedPrioritized = append(*missedPrioritized, l)
				continue // allow not found
			}
			return nil, fmt.Errorf("failed to sort tar entries: %w", err)
		}
	}
	if len(prioritized) == 0 {
		// No prioritized files: insert the landmark indicating no prefetch.
		sorted.add(&entry{
			header: &tar.Header{
				Name:     NoPrefetchLandmark,
				Typeflag: tar.TypeReg,
				Size:     int64(len([]byte{landmarkContents})),
			},
			payload: bytes.NewReader([]byte{landmarkContents}),
		})
	} else {
		// Insert the landmark marking the end of the prioritized region.
		sorted.add(&entry{
			header: &tar.Header{
				Name:     PrefetchLandmark,
				Typeflag: tar.TypeReg,
				Size:     int64(len([]byte{landmarkContents})),
			},
			payload: bytes.NewReader([]byte{landmarkContents}),
		})
	}

	// Dump all entries and concatenate them: prioritized files (plus landmark)
	// first, then everything else in original order.
	return append(sorted.dump(), intar.dump()...), nil
}
// readerFromEntries returns a reader of tar archive that contains entries passed
// through the arguments. The archive is produced lazily through a pipe by a
// dedicated goroutine; any write failure is propagated to the reader side via
// CloseWithError.
func readerFromEntries(entries ...*entry) io.Reader {
	pr, pw := io.Pipe()
	go func() {
		tw := tar.NewWriter(pw)
		defer tw.Close()
		for _, entry := range entries {
			if err := tw.WriteHeader(entry.header); err != nil {
				// lowercase message + %w: keep Go error conventions and the chain.
				pw.CloseWithError(fmt.Errorf("failed to write tar header: %w", err))
				return
			}
			if _, err := io.Copy(tw, entry.payload); err != nil {
				pw.CloseWithError(fmt.Errorf("failed to write tar payload: %w", err))
				return
			}
		}
		pw.Close()
	}()
	return pr
}
// importTar reads the whole tar archive in `in` and indexes its entries into a
// tarFile. Payloads are not copied: each entry holds a SectionReader pointing
// back into `in` at the payload's offset.
func importTar(in io.ReaderAt) (*tarFile, error) {
	tf := &tarFile{}
	// The countReadSeeker tracks how far the tar reader has consumed `in`,
	// which tells us where each entry's payload starts.
	pw, err := newCountReadSeeker(in)
	if err != nil {
		return nil, fmt.Errorf("failed to make position watcher: %w", err)
	}
	tr := tar.NewReader(pw)

	// Walk through all nodes.
	for {
		// Fetch and parse next header.
		h, err := tr.Next()
		if err != nil {
			if err == io.EOF {
				break
			}
			return nil, fmt.Errorf("failed to parse tar file, %w", err)
		}
		switch cleanEntryName(h.Name) {
		case PrefetchLandmark, NoPrefetchLandmark:
			// Ignore existing landmark
			continue
		}

		// Add entry. If it already exists, replace it.
		if _, ok := tf.get(h.Name); ok {
			tf.remove(h.Name)
		}
		// pw.currentPos() is the position right after the header was parsed,
		// i.e. the start of this entry's payload in `in`.
		tf.add(&entry{
			header:  h,
			payload: io.NewSectionReader(in, pw.currentPos(), h.Size),
		})
	}
	return tf, nil
}
// moveRec moves the entry called name from `in` to `out`, together with
// everything it depends on: ancestor directories are moved first (so they
// precede the entry in the output), and a hardlink's target is moved ahead of
// the link itself. Returns an error wrapping errNotFound when name exists in
// neither file.
func moveRec(name string, in *tarFile, out *tarFile) error {
	name = cleanEntryName(name)
	if name == "" { // root directory. stop recursion.
		if e, ok := in.get(name); ok {
			// entry of the root directory exists. we should move it as well.
			// this case will occur if tar entries are prefixed with "./", "/", etc.
			out.add(e)
			in.remove(name)
		}
		return nil
	}

	_, okIn := in.get(name)
	_, okOut := out.get(name)
	if !okIn && !okOut {
		return fmt.Errorf("file: %q: %w", name, errNotFound)
	}

	// Move ancestor directories first.
	parent, _ := path.Split(strings.TrimSuffix(name, "/"))
	if err := moveRec(parent, in, out); err != nil {
		return err
	}
	// For hardlinks, move the link target before the link.
	if e, ok := in.get(name); ok && e.header.Typeflag == tar.TypeLink {
		if err := moveRec(e.header.Linkname, in, out); err != nil {
			return err
		}
	}
	// Finally move the entry itself (it may already live in `out` if a
	// previous call moved it; then `in` no longer has it and this is a no-op).
	if e, ok := in.get(name); ok {
		out.add(e)
		in.remove(name)
	}
	return nil
}
// entry is a single tar entry: its parsed header plus a seekable reader
// positioned at the entry's payload bytes.
type entry struct {
	header  *tar.Header
	payload io.ReadSeeker
}

// tarFile is an ordered collection of tar entries with name-based lookup.
type tarFile struct {
	index  map[string]*entry // cleaned entry name -> entry (lazy-initialized by add)
	stream []*entry          // entries in insertion order, used by dump
}
// add registers e under its cleaned entry name and appends it to the stream.
func (f *tarFile) add(e *entry) {
	name := cleanEntryName(e.header.Name)
	if f.index == nil {
		f.index = map[string]*entry{}
	}
	f.index[name] = e
	f.stream = append(f.stream, e)
}
// remove drops the entry called name from both the index and the stream.
func (f *tarFile) remove(name string) {
	name = cleanEntryName(name)
	if f.index != nil {
		delete(f.index, name)
	}
	// Rebuild the stream without the removed entry, preserving order.
	var kept []*entry
	for _, e := range f.stream {
		if cleanEntryName(e.header.Name) != name {
			kept = append(kept, e)
		}
	}
	f.stream = kept
}
// get returns the entry registered under the (cleaned) name, if any.
func (f *tarFile) get(name string) (e *entry, ok bool) {
	if f.index == nil {
		return nil, false
	}
	e, ok = f.index[cleanEntryName(name)]
	return
}

// dump returns all entries in insertion order.
func (f *tarFile) dump() []*entry {
	return f.stream
}
// readCloser wraps an io.Reader with a custom close function.
type readCloser struct {
	io.Reader
	closeFunc func() error
}

// Close invokes the wrapped close function.
func (rc readCloser) Close() error {
	return rc.closeFunc()
}
func fileSectionReader(file *os.File) (*io.SectionReader, error) {
info, err := file.Stat()
if err != nil {
return nil, err
}
return io.NewSectionReader(file, 0, info.Size()), nil
}
// newTempFiles returns an empty temp-file tracker.
func newTempFiles() *tempFiles {
	return &tempFiles{}
}

// tempFiles tracks temporary files so they can be closed and removed in one
// shot via CleanupAll.
type tempFiles struct {
	files       []*os.File // files created via TempFile, pending cleanup
	filesMu     sync.Mutex // guards files
	cleanupOnce sync.Once  // makes CleanupAll effective only once
}
// TempFile creates a temporary file (via os.CreateTemp) and records it so that
// CleanupAll will later close and remove it.
func (tf *tempFiles) TempFile(dir, pattern string) (*os.File, error) {
	f, err := os.CreateTemp(dir, pattern)
	if err != nil {
		return nil, err
	}
	tf.filesMu.Lock()
	defer tf.filesMu.Unlock()
	tf.files = append(tf.files, f)
	return f, nil
}
// CleanupAll closes and removes all tracked temp files. Only the first call
// performs the cleanup (and reports its error); subsequent calls return nil.
func (tf *tempFiles) CleanupAll() (err error) {
	tf.cleanupOnce.Do(func() {
		err = tf.cleanupAll()
	})
	return
}
// cleanupAll closes and removes every recorded temp file, aggregating all
// failures into a single error. The tracked list is reset afterwards.
func (tf *tempFiles) cleanupAll() error {
	tf.filesMu.Lock()
	defer tf.filesMu.Unlock()
	var errs []error
	for _, f := range tf.files {
		if cErr := f.Close(); cErr != nil {
			errs = append(errs, cErr)
		}
		if rErr := os.Remove(f.Name()); rErr != nil {
			errs = append(errs, rErr)
		}
	}
	tf.files = nil
	return errorutil.Aggregate(errs)
}
// newCountReadSeeker wraps r in a countReadSeeker whose position starts at 0.
// The returned error is always nil (kept for interface symmetry with callers).
func newCountReadSeeker(r io.ReaderAt) (*countReadSeeker, error) {
	pos := int64(0)
	return &countReadSeeker{r: r, cPos: &pos}, nil
}

// countReadSeeker adapts an io.ReaderAt into an io.ReadSeeker while tracking
// the current read position (exposed via currentPos).
type countReadSeeker struct {
	r    io.ReaderAt
	cPos *int64 // current read offset into r

	mu sync.Mutex // guards *cPos
}
// Read reads from the underlying io.ReaderAt at the tracked position and
// advances the position by the number of bytes actually read.
func (cr *countReadSeeker) Read(p []byte) (int, error) {
	cr.mu.Lock()
	defer cr.mu.Unlock()
	n, err := cr.r.ReadAt(p, *cr.cPos)
	// Advance by n even when err != nil: io.ReaderAt may return n > 0
	// together with io.EOF on a short read at the end of the source, and
	// those bytes have been delivered to the caller. The original only
	// advanced on err == nil, which would re-read the same bytes on the
	// next call and misreport currentPos after a short read.
	*cr.cPos += int64(n)
	return n, err
}
// Seek implements io.Seeker over the tracked position. io.SeekEnd is not
// supported because the total size of the underlying io.ReaderAt is unknown.
func (cr *countReadSeeker) Seek(offset int64, whence int) (int64, error) {
	cr.mu.Lock()
	defer cr.mu.Unlock()
	switch whence {
	default:
		// lowercase error strings per Go conventions (was "Unknown whence").
		return 0, fmt.Errorf("unknown whence: %v", whence)
	case io.SeekStart:
	case io.SeekCurrent:
		offset += *cr.cPos
	case io.SeekEnd:
		return 0, fmt.Errorf("unsupported whence: %v", whence)
	}

	if offset < 0 {
		return 0, fmt.Errorf("invalid offset")
	}
	*cr.cPos = offset
	return offset, nil
}
// currentPos returns the current read offset in a thread-safe manner.
func (cr *countReadSeeker) currentPos() int64 {
	cr.mu.Lock()
	defer cr.mu.Unlock()
	return *cr.cPos
}
// decompressBlob returns a SectionReader over the uncompressed payload of org.
// It sniffs the first bytes for gzip/zstd magic numbers; compressed input is
// decompressed into a temp file tracked by tmp, plain tar is returned as-is.
func decompressBlob(org *io.SectionReader, tmp *tempFiles) (*io.SectionReader, error) {
	if org.Size() < 4 {
		return org, nil
	}
	src := make([]byte, 4)
	// This Read advances org's internal position, but every reader created
	// below is a fresh SectionReader starting at offset 0, so that's harmless.
	if _, err := org.Read(src); err != nil && err != io.EOF {
		return nil, err
	}
	var dR io.Reader
	if bytes.Equal([]byte{0x1F, 0x8B, 0x08}, src[:3]) {
		// gzip
		dgR, err := gzip.NewReader(io.NewSectionReader(org, 0, org.Size()))
		if err != nil {
			return nil, err
		}
		defer dgR.Close()
		dR = io.Reader(dgR)
	} else if bytes.Equal([]byte{0x28, 0xb5, 0x2f, 0xfd}, src[:4]) {
		// zstd
		dzR, err := zstd.NewReader(io.NewSectionReader(org, 0, org.Size()))
		if err != nil {
			return nil, err
		}
		defer dzR.Close()
		dR = io.Reader(dzR)
	} else {
		// uncompressed
		return io.NewSectionReader(org, 0, org.Size()), nil
	}
	// Decompress the whole stream into a tracked temp file and hand back a
	// SectionReader over it.
	b, err := tmp.TempFile("", "uncompresseddata")
	if err != nil {
		return nil, err
	}
	if _, err := io.Copy(b, dR); err != nil {
		return nil, err
	}
	return fileSectionReader(b)
}

View File

@@ -0,0 +1,40 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package errorutil
import (
"errors"
"fmt"
"strings"
)
// Aggregate combines a list of errors into a single new error.
func Aggregate(errs []error) error {
switch len(errs) {
case 0:
return nil
case 1:
return errs[0]
default:
points := make([]string, len(errs)+1)
points[0] = fmt.Sprintf("%d error(s) occurred:", len(errs))
for i, err := range errs {
points[i+1] = fmt.Sprintf("* %s", err)
}
return errors.New(strings.Join(points, "\n\t"))
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,278 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the LICENSE file.
*/
package externaltoc
import (
"archive/tar"
"bytes"
"compress/gzip"
"encoding/binary"
"encoding/json"
"fmt"
"hash"
"io"
"sync"
"github.com/containerd/stargz-snapshotter/estargz"
digest "github.com/opencontainers/go-digest"
)
// GzipCompression is a gzip Compression/Decompression pair whose TOC is kept
// out of the layer blob and provided externally.
type GzipCompression struct {
	*GzipCompressor
	*GzipDecompressor
}

// NewGzipCompressionWithLevel returns an external-TOC gzip Compression using
// the given gzip level. provideTOC is used by the decompressor to fetch the
// raw TOC bytes on demand.
func NewGzipCompressionWithLevel(provideTOC func() ([]byte, error), level int) estargz.Compression {
	return &GzipCompression{
		NewGzipCompressorWithLevel(level),
		NewGzipDecompressor(provideTOC),
	}
}

// NewGzipCompressor returns a compressor using gzip.BestCompression.
func NewGzipCompressor() *GzipCompressor {
	return &GzipCompressor{compressionLevel: gzip.BestCompression}
}

// NewGzipCompressorWithLevel returns a compressor with the given gzip level.
func NewGzipCompressorWithLevel(level int) *GzipCompressor {
	return &GzipCompressor{compressionLevel: level}
}

// GzipCompressor builds eStargz layers whose TOC is stored externally (in buf)
// rather than embedded in the blob.
type GzipCompressor struct {
	compressionLevel int
	buf              *bytes.Buffer // gzipped TOC tar, populated by WriteTOCAndFooter
}
// WriteTOCTo writes the gzipped TOC (registered by a prior WriteTOCAndFooter
// call) to w and returns the number of bytes written.
func (gc *GzipCompressor) WriteTOCTo(w io.Writer) (int, error) {
	// gc.buf is only populated by WriteTOCAndFooter; the original checked
	// len(gc.buf.Bytes()) which dereferences a nil *bytes.Buffer and panics
	// when no TOC was registered yet. Guard the nil case explicitly.
	if gc.buf == nil || gc.buf.Len() == 0 {
		return 0, fmt.Errorf("TOC hasn't been registered")
	}
	return w.Write(gc.buf.Bytes())
}
// Writer returns a gzip writer compressing at the configured level.
func (gc *GzipCompressor) Writer(w io.Writer) (estargz.WriteFlushCloser, error) {
	return gzip.NewWriterLevel(w, gc.compressionLevel)
}
// WriteTOCAndFooter serializes toc into an in-memory gzipped tar (stored in
// gc.buf for later retrieval via WriteTOCTo) and writes ONLY the footer bytes
// to w: with an external TOC, the TOC itself is not part of the layer blob.
// It returns the digest of the raw TOC JSON.
func (gc *GzipCompressor) WriteTOCAndFooter(w io.Writer, off int64, toc *estargz.JTOC, diffHash hash.Hash) (digest.Digest, error) {
	tocJSON, err := json.MarshalIndent(toc, "", "\t")
	if err != nil {
		return "", err
	}
	buf := new(bytes.Buffer)
	gz, _ := gzip.NewWriterLevel(buf, gc.compressionLevel)
	// TOC isn't written to layer so no effect to diff ID
	tw := tar.NewWriter(gz)
	if err := tw.WriteHeader(&tar.Header{
		Typeflag: tar.TypeReg,
		Name:     estargz.TOCTarName,
		Size:     int64(len(tocJSON)),
	}); err != nil {
		return "", err
	}
	if _, err := tw.Write(tocJSON); err != nil {
		return "", err
	}
	if err := tw.Close(); err != nil {
		return "", err
	}
	if err := gz.Close(); err != nil {
		return "", err
	}
	// Keep the serialized TOC for WriteTOCTo.
	gc.buf = buf
	footerBytes, err := gzipFooterBytes()
	if err != nil {
		return "", err
	}
	if _, err := w.Write(footerBytes); err != nil {
		return "", err
	}
	return digest.FromBytes(tocJSON), nil
}
// The footer is an empty gzip stream with no compression and an Extra header.
//
// 46 comes from:
//
// 10 bytes  gzip header
// 2  bytes  XLEN (length of Extra field) = 21 (4 bytes header + len("STARGZEXTERNALTOC"))
// 2  bytes  Extra: SI1 = 'S', SI2 = 'G'
// 2  bytes  Extra: LEN = 17 (len("STARGZEXTERNALTOC"))
// 17 bytes  Extra: subfield = "STARGZEXTERNALTOC"
// 5  bytes  flate header
// 8  bytes  gzip footer
// (End of the eStargz blob)
const FooterSize = 46

// gzipFooterBytes returns the FooterSize (46) bytes footer.
// (Comment fixed: it previously claimed "104 bytes", contradicting FooterSize.)
func gzipFooterBytes() ([]byte, error) {
	buf := bytes.NewBuffer(make([]byte, 0, FooterSize))
	gz, _ := gzip.NewWriterLevel(buf, gzip.NoCompression) // MUST be NoCompression to keep FooterSize (46) bytes
	// Extra header indicating the offset of TOCJSON
	// https://tools.ietf.org/html/rfc1952#section-2.3.1.1
	header := make([]byte, 4)
	header[0], header[1] = 'S', 'G'
	subfield := "STARGZEXTERNALTOC" // len("STARGZEXTERNALTOC") = 17
	binary.LittleEndian.PutUint16(header[2:4], uint16(len(subfield))) // little-endian per RFC1952
	gz.Header.Extra = append(header, []byte(subfield)...)
	if err := gz.Close(); err != nil {
		return nil, err
	}
	if buf.Len() != FooterSize {
		// Invariant check: the layout above must always produce exactly FooterSize bytes.
		panic(fmt.Sprintf("footer buffer = %d, not %d", buf.Len(), FooterSize))
	}
	return buf.Bytes(), nil
}
// NewGzipDecompressor returns a decompressor that obtains the TOC through
// provideTOCFunc instead of reading it from the blob.
func NewGzipDecompressor(provideTOCFunc func() ([]byte, error)) *GzipDecompressor {
	return &GzipDecompressor{provideTOCFunc: provideTOCFunc}
}

// GzipDecompressor reads eStargz blobs whose TOC is provided externally.
type GzipDecompressor struct {
	provideTOCFunc func() ([]byte, error)
	rawTOC         []byte // Do not access this field directly. Get this through getTOC() method.
	getTOCOnce     sync.Once
}
// getTOC lazily fetches and caches the raw TOC bytes via provideTOCFunc.
// NOTE(review): the len(gz.rawTOC) check outside the Once is an unsynchronized
// read; concurrent first calls could race — confirm callers serialize access.
// NOTE(review): a provideTOCFunc error is not cached, so after a failed first
// attempt later calls report the generic "no TOC is provided" instead.
func (gz *GzipDecompressor) getTOC() ([]byte, error) {
	if len(gz.rawTOC) == 0 {
		var retErr error
		gz.getTOCOnce.Do(func() {
			if gz.provideTOCFunc == nil {
				retErr = fmt.Errorf("TOC hasn't been provided")
				return
			}
			rawTOC, err := gz.provideTOCFunc()
			if err != nil {
				retErr = err
				return
			}
			gz.rawTOC = rawTOC
		})
		if retErr != nil {
			return nil, retErr
		}
		if len(gz.rawTOC) == 0 {
			return nil, fmt.Errorf("no TOC is provided")
		}
	}
	return gz.rawTOC, nil
}
// Reader wraps r in a gzip reader.
func (gz *GzipDecompressor) Reader(r io.Reader) (io.ReadCloser, error) {
	return gzip.NewReader(r)
}

// ParseTOC parses the externally-provided TOC. r must be nil: with an external
// TOC there is no in-blob TOC stream to parse.
func (gz *GzipDecompressor) ParseTOC(r io.Reader) (toc *estargz.JTOC, tocDgst digest.Digest, err error) {
	if r != nil {
		return nil, "", fmt.Errorf("TOC must be provided externally but got internal one")
	}
	rawTOC, err := gz.getTOC()
	if err != nil {
		return nil, "", fmt.Errorf("failed to get TOC: %v", err)
	}
	return parseTOCEStargz(bytes.NewReader(rawTOC))
}
// ParseFooter validates the 46-byte external-TOC footer. Since the TOC lives
// outside the blob, it returns blobPayloadSize = -1 (entire blob) and
// tocOffset = -1 (external TOC).
func (gz *GzipDecompressor) ParseFooter(p []byte) (blobPayloadSize, tocOffset, tocSize int64, err error) {
	if len(p) != FooterSize {
		return 0, 0, 0, fmt.Errorf("invalid length %d cannot be parsed", len(p))
	}
	zr, err := gzip.NewReader(bytes.NewReader(p))
	if err != nil {
		return 0, 0, 0, err
	}
	defer zr.Close()
	extra := zr.Header.Extra
	const subfieldValue = "STARGZEXTERNALTOC"
	// Guard against a malformed footer whose Extra field is shorter than
	// SI1+SI2+LEN (4 bytes) + subfield; the original indexed unconditionally
	// and could panic on crafted input.
	if len(extra) < 4+len(subfieldValue) {
		return 0, 0, 0, fmt.Errorf("invalid extra field size %d", len(extra))
	}
	si1, si2, subfieldlen, subfield := extra[0], extra[1], extra[2:4], extra[4:]
	if si1 != 'S' || si2 != 'G' {
		// message fixed: the check is for 'S','G' (was "want E, S").
		return 0, 0, 0, fmt.Errorf("invalid subfield IDs: %q, %q; want S, G", si1, si2)
	}
	if slen := binary.LittleEndian.Uint16(subfieldlen); slen != uint16(len(subfieldValue)) {
		// message fixed: the expected length is 17 (was reported as 22).
		return 0, 0, 0, fmt.Errorf("invalid length of subfield %d; want %d", slen, len(subfieldValue))
	}
	if string(subfield) != subfieldValue {
		return 0, 0, 0, fmt.Errorf("STARGZ magic string must be included in the footer subfield")
	}
	// tocOffset < 0 indicates external TOC.
	// blobPayloadSize < 0 indicates the entire blob size.
	return -1, -1, 0, nil
}
// FooterSize returns the fixed size of the external-TOC footer.
func (gz *GzipDecompressor) FooterSize() int64 {
	return FooterSize
}

// DecompressTOC returns a reader of the raw TOC JSON. r must be nil because
// the TOC is provided externally via provideTOCFunc.
func (gz *GzipDecompressor) DecompressTOC(r io.Reader) (tocJSON io.ReadCloser, err error) {
	if r != nil {
		return nil, fmt.Errorf("TOC must be provided externally but got internal one")
	}
	rawTOC, err := gz.getTOC()
	if err != nil {
		return nil, fmt.Errorf("failed to get TOC: %v", err)
	}
	return decompressTOCEStargz(bytes.NewReader(rawTOC))
}
// parseTOCEStargz decompresses the gzipped TOC tar stream in r and decodes the
// TOC JSON, returning the TOC and the digest of its uncompressed JSON bytes.
func parseTOCEStargz(r io.Reader) (toc *estargz.JTOC, tocDgst digest.Digest, err error) {
	tr, err := decompressTOCEStargz(r)
	if err != nil {
		return nil, "", err
	}
	dgstr := digest.Canonical.Digester()
	toc = new(estargz.JTOC)
	// Tee the JSON bytes into the digester while decoding.
	if err := json.NewDecoder(io.TeeReader(tr, dgstr.Hash())).Decode(&toc); err != nil {
		return nil, "", fmt.Errorf("error decoding TOC JSON: %v", err)
	}
	if err := tr.Close(); err != nil {
		return nil, "", err
	}
	return toc, dgstr.Digest(), nil
}

// decompressTOCEStargz returns a ReadCloser over the raw TOC JSON contained in
// the gzipped TOC tar stream in r. Closing it closes the gzip reader.
func decompressTOCEStargz(r io.Reader) (tocJSON io.ReadCloser, err error) {
	zr, err := gzip.NewReader(r)
	if err != nil {
		return nil, fmt.Errorf("malformed TOC gzip header: %v", err)
	}
	// The TOC is its own gzip stream; don't read past its end.
	zr.Multistream(false)
	tr := tar.NewReader(zr)
	h, err := tr.Next()
	if err != nil {
		return nil, fmt.Errorf("failed to find tar header in TOC gzip stream: %v", err)
	}
	if h.Name != estargz.TOCTarName {
		return nil, fmt.Errorf("TOC tar entry had name %q; expected %q", h.Name, estargz.TOCTarName)
	}
	return readCloser{tr, zr.Close}, nil
}
// readCloser wraps an io.Reader with a custom close function.
type readCloser struct {
	io.Reader
	closeFunc func() error
}

// Close invokes the wrapped close function.
func (rc readCloser) Close() error {
	return rc.closeFunc()
}

View File

@@ -0,0 +1,237 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the LICENSE file.
*/
package estargz
import (
"archive/tar"
"bytes"
"compress/gzip"
"encoding/binary"
"encoding/json"
"fmt"
"hash"
"io"
"strconv"
digest "github.com/opencontainers/go-digest"
)
// gzipCompression bundles the gzip Compressor/Decompressor pair.
type gzipCompression struct {
	*GzipCompressor
	*GzipDecompressor
}

// newGzipCompressionWithLevel returns a gzip Compression using the given level.
func newGzipCompressionWithLevel(level int) Compression {
	return &gzipCompression{
		&GzipCompressor{level},
		&GzipDecompressor{},
	}
}

// NewGzipCompressor returns a compressor using gzip.BestCompression.
func NewGzipCompressor() *GzipCompressor {
	return &GzipCompressor{gzip.BestCompression}
}

// NewGzipCompressorWithLevel returns a compressor with the given gzip level.
func NewGzipCompressorWithLevel(level int) *GzipCompressor {
	return &GzipCompressor{level}
}

// GzipCompressor is the standard eStargz gzip compressor; the TOC is embedded
// in the blob just before the footer.
type GzipCompressor struct {
	compressionLevel int
}
// Writer returns a gzip writer compressing at the configured level.
func (gc *GzipCompressor) Writer(w io.Writer) (WriteFlushCloser, error) {
	return gzip.NewWriterLevel(w, gc.compressionLevel)
}
// WriteTOCAndFooter writes the gzipped TOC tar stream followed by the footer
// to w and returns the digest of the raw TOC JSON. off is the offset of the
// TOC stream in the blob, recorded inside the footer's gzip Extra field.
func (gc *GzipCompressor) WriteTOCAndFooter(w io.Writer, off int64, toc *JTOC, diffHash hash.Hash) (digest.Digest, error) {
	tocJSON, err := json.MarshalIndent(toc, "", "\t")
	if err != nil {
		return "", err
	}
	gz, _ := gzip.NewWriterLevel(w, gc.compressionLevel)
	gw := io.Writer(gz)
	// When provided, diffHash additionally receives the uncompressed TOC tar
	// bytes so the layer's diff ID covers the embedded TOC.
	if diffHash != nil {
		gw = io.MultiWriter(gz, diffHash)
	}
	tw := tar.NewWriter(gw)
	if err := tw.WriteHeader(&tar.Header{
		Typeflag: tar.TypeReg,
		Name:     TOCTarName,
		Size:     int64(len(tocJSON)),
	}); err != nil {
		return "", err
	}
	if _, err := tw.Write(tocJSON); err != nil {
		return "", err
	}
	if err := tw.Close(); err != nil {
		return "", err
	}
	if err := gz.Close(); err != nil {
		return "", err
	}
	if _, err := w.Write(gzipFooterBytes(off)); err != nil {
		return "", err
	}
	return digest.FromBytes(tocJSON), nil
}
// gzipFooterBytes returns the 51 bytes footer: an empty NoCompression gzip
// stream whose Extra field encodes the TOC offset as "%016xSTARGZ".
func gzipFooterBytes(tocOff int64) []byte {
	buf := bytes.NewBuffer(make([]byte, 0, FooterSize))
	gz, _ := gzip.NewWriterLevel(buf, gzip.NoCompression) // MUST be NoCompression to keep 51 bytes
	// Extra header indicating the offset of TOCJSON
	// https://tools.ietf.org/html/rfc1952#section-2.3.1.1
	header := make([]byte, 4)
	header[0], header[1] = 'S', 'G'
	subfield := fmt.Sprintf("%016xSTARGZ", tocOff)
	binary.LittleEndian.PutUint16(header[2:4], uint16(len(subfield))) // little-endian per RFC1952
	gz.Header.Extra = append(header, []byte(subfield)...)
	gz.Close() // writing to a bytes.Buffer; Close cannot fail meaningfully here
	if buf.Len() != FooterSize {
		// Invariant check: the layout above must always produce exactly FooterSize bytes.
		panic(fmt.Sprintf("footer buffer = %d, not %d", buf.Len(), FooterSize))
	}
	return buf.Bytes()
}
// GzipDecompressor parses eStargz blobs with the current (51-byte) footer.
type GzipDecompressor struct{}

// Reader wraps r in a gzip reader.
func (gz *GzipDecompressor) Reader(r io.Reader) (io.ReadCloser, error) {
	return gzip.NewReader(r)
}

// ParseTOC decodes the TOC JSON from the gzipped TOC tar stream in r.
func (gz *GzipDecompressor) ParseTOC(r io.Reader) (toc *JTOC, tocDgst digest.Digest, err error) {
	return parseTOCEStargz(r)
}
// ParseFooter parses the 51-byte footer and extracts the TOC offset from the
// "%016xSTARGZ" subfield in its gzip Extra header.
func (gz *GzipDecompressor) ParseFooter(p []byte) (blobPayloadSize, tocOffset, tocSize int64, err error) {
	if len(p) != FooterSize {
		return 0, 0, 0, fmt.Errorf("invalid length %d cannot be parsed", len(p))
	}
	zr, err := gzip.NewReader(bytes.NewReader(p))
	if err != nil {
		return 0, 0, 0, err
	}
	defer zr.Close()
	extra := zr.Header.Extra
	// Guard against a malformed footer whose Extra field is shorter than
	// SI1+SI2+LEN (4 bytes) + 16 hex digits + "STARGZ"; the original indexed
	// unconditionally and could panic on crafted input.
	if len(extra) < 4+16+len("STARGZ") {
		return 0, 0, 0, fmt.Errorf("invalid extra field size %d", len(extra))
	}
	si1, si2, subfieldlen, subfield := extra[0], extra[1], extra[2:4], extra[4:]
	if si1 != 'S' || si2 != 'G' {
		// message fixed: the check is for 'S','G' (was "want E, S").
		return 0, 0, 0, fmt.Errorf("invalid subfield IDs: %q, %q; want S, G", si1, si2)
	}
	if slen := binary.LittleEndian.Uint16(subfieldlen); slen != uint16(16+len("STARGZ")) {
		return 0, 0, 0, fmt.Errorf("invalid length of subfield %d; want %d", slen, 16+len("STARGZ"))
	}
	if string(subfield[16:]) != "STARGZ" {
		return 0, 0, 0, fmt.Errorf("STARGZ magic string must be included in the footer subfield")
	}
	tocOffset, err = strconv.ParseInt(string(subfield[:16]), 16, 64)
	if err != nil {
		// "legacy:" prefix removed: this is the current-format parser; the
		// prefix was copy-pasted from LegacyGzipDecompressor.ParseFooter.
		return 0, 0, 0, fmt.Errorf("failed to parse toc offset: %w", err)
	}
	return tocOffset, tocOffset, 0, nil
}
// FooterSize returns the size of the eStargz footer.
func (gz *GzipDecompressor) FooterSize() int64 {
	return FooterSize
}

// DecompressTOC returns a reader of the raw TOC JSON embedded in r.
func (gz *GzipDecompressor) DecompressTOC(r io.Reader) (tocJSON io.ReadCloser, err error) {
	return decompressTOCEStargz(r)
}
// LegacyGzipDecompressor parses blobs in the legacy stargz format, which uses
// a 47-byte footer whose Extra field holds only "%016xSTARGZ".
type LegacyGzipDecompressor struct{}

// Reader wraps r in a gzip reader.
func (gz *LegacyGzipDecompressor) Reader(r io.Reader) (io.ReadCloser, error) {
	return gzip.NewReader(r)
}

// ParseTOC decodes the TOC JSON from the gzipped TOC tar stream in r.
func (gz *LegacyGzipDecompressor) ParseTOC(r io.Reader) (toc *JTOC, tocDgst digest.Digest, err error) {
	return parseTOCEStargz(r)
}

// ParseFooter parses the legacy 47-byte footer and returns the TOC offset
// encoded in its gzip Extra field.
func (gz *LegacyGzipDecompressor) ParseFooter(p []byte) (blobPayloadSize, tocOffset, tocSize int64, err error) {
	if len(p) != legacyFooterSize {
		return 0, 0, 0, fmt.Errorf("legacy: invalid length %d cannot be parsed", len(p))
	}
	zr, err := gzip.NewReader(bytes.NewReader(p))
	if err != nil {
		return 0, 0, 0, fmt.Errorf("legacy: failed to get footer gzip reader: %w", err)
	}
	defer zr.Close()
	extra := zr.Header.Extra
	if len(extra) != 16+len("STARGZ") {
		return 0, 0, 0, fmt.Errorf("legacy: invalid stargz's extra field size")
	}
	if string(extra[16:]) != "STARGZ" {
		return 0, 0, 0, fmt.Errorf("legacy: magic string STARGZ not found")
	}
	tocOffset, err = strconv.ParseInt(string(extra[:16]), 16, 64)
	if err != nil {
		return 0, 0, 0, fmt.Errorf("legacy: failed to parse toc offset: %w", err)
	}
	// The legacy footer records no TOC size, so 0 is returned for tocSize.
	return tocOffset, tocOffset, 0, nil
}

// FooterSize returns the legacy footer size.
func (gz *LegacyGzipDecompressor) FooterSize() int64 {
	return legacyFooterSize
}

// DecompressTOC returns a reader of the raw TOC JSON embedded in r.
func (gz *LegacyGzipDecompressor) DecompressTOC(r io.Reader) (tocJSON io.ReadCloser, err error) {
	return decompressTOCEStargz(r)
}
// parseTOCEStargz decompresses the gzipped TOC tar stream in r and decodes the
// TOC JSON, returning the TOC and the digest of its uncompressed JSON bytes.
func parseTOCEStargz(r io.Reader) (toc *JTOC, tocDgst digest.Digest, err error) {
	tr, err := decompressTOCEStargz(r)
	if err != nil {
		return nil, "", err
	}
	dgstr := digest.Canonical.Digester()
	toc = new(JTOC)
	// Tee the JSON bytes into the digester while decoding.
	if err := json.NewDecoder(io.TeeReader(tr, dgstr.Hash())).Decode(&toc); err != nil {
		return nil, "", fmt.Errorf("error decoding TOC JSON: %v", err)
	}
	if err := tr.Close(); err != nil {
		return nil, "", err
	}
	return toc, dgstr.Digest(), nil
}

// decompressTOCEStargz returns a ReadCloser over the raw TOC JSON contained in
// the gzipped TOC tar stream in r. Closing it closes the gzip reader.
func decompressTOCEStargz(r io.Reader) (tocJSON io.ReadCloser, err error) {
	zr, err := gzip.NewReader(r)
	if err != nil {
		return nil, fmt.Errorf("malformed TOC gzip header: %v", err)
	}
	// The TOC is its own gzip stream; don't read past its end.
	zr.Multistream(false)
	tr := tar.NewReader(zr)
	h, err := tr.Next()
	if err != nil {
		return nil, fmt.Errorf("failed to find tar header in TOC gzip stream: %v", err)
	}
	if h.Name != TOCTarName {
		return nil, fmt.Errorf("TOC tar entry had name %q; expected %q", h.Name, TOCTarName)
	}
	return readCloser{tr, zr.Close}, nil
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,342 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the LICENSE file.
*/
package estargz
import (
"archive/tar"
"hash"
"io"
"os"
"path"
"time"
digest "github.com/opencontainers/go-digest"
)
const (
	// TOCTarName is the name of the JSON file in the tar archive in the
	// table of contents gzip stream.
	TOCTarName = "stargz.index.json"

	// FooterSize is the number of bytes in the footer
	//
	// The footer is an empty gzip stream with no compression and an Extra
	// header of the form "%016xSTARGZ", where the 64 bit hex-encoded
	// number is the offset to the gzip stream of JSON TOC.
	//
	// 51 comes from:
	//
	// 10 bytes  gzip header
	// 2  bytes  XLEN (length of Extra field) = 26 (4 bytes header + 16 hex digits + len("STARGZ"))
	// 2  bytes  Extra: SI1 = 'S', SI2 = 'G'
	// 2  bytes  Extra: LEN = 22 (16 hex digits + len("STARGZ"))
	// 22 bytes  Extra: subfield = fmt.Sprintf("%016xSTARGZ", offsetOfTOC)
	// 5  bytes  flate header
	// 8  bytes  gzip footer
	// (End of the eStargz blob)
	//
	// NOTE: For Extra fields, subfield IDs SI1='S' SI2='G' is used for eStargz.
	FooterSize = 51

	// legacyFooterSize is the number of bytes in the legacy stargz footer.
	//
	// 47 comes from:
	//
	//   10 byte gzip header +
	//   2 byte (LE16) length of extra, encoding 22 (16 hex digits + len("STARGZ")) == "\x16\x00" +
	//   22 bytes of extra (fmt.Sprintf("%016xSTARGZ", tocGzipOffset))
	//   5 byte flate header
	//   8 byte gzip footer (two little endian uint32s: digest, size)
	legacyFooterSize = 47

	// TOCJSONDigestAnnotation is an annotation for an image layer. This stores the
	// digest of the TOC JSON.
	// This annotation is valid only when it is specified in `.[]layers.annotations`
	// of an image manifest.
	TOCJSONDigestAnnotation = "containerd.io/snapshot/stargz/toc.digest"

	// StoreUncompressedSizeAnnotation is an additional annotation key for eStargz to enable lazy
	// pulling on containers/storage. Stargz Store is required to expose the layer's uncompressed size
	// to the runtime but current OCI image doesn't ship this information by default. So we store this
	// to the special annotation.
	StoreUncompressedSizeAnnotation = "io.containers.estargz.uncompressed-size"

	// PrefetchLandmark is a file entry which indicates the end position of
	// prefetch in the stargz file.
	PrefetchLandmark = ".prefetch.landmark"

	// NoPrefetchLandmark is a file entry which indicates that no prefetch should
	// occur in the stargz file.
	NoPrefetchLandmark = ".no.prefetch.landmark"

	// landmarkContents is the single payload byte stored in landmark files.
	landmarkContents = 0xf
)
// JTOC is the JSON-serialized table of contents index of the files in the stargz file.
type JTOC struct {
	Version int         `json:"version"` // TOC format version
	Entries []*TOCEntry `json:"entries"` // all entries recorded in the TOC
}
// TOCEntry is an entry in the stargz file's TOC (Table of Contents).
type TOCEntry struct {
	// Name is the tar entry's name. It is the complete path
	// stored in the tar file, not just the base name.
	Name string `json:"name"`

	// Type is one of "dir", "reg", "symlink", "hardlink", "char",
	// "block", "fifo", or "chunk".
	// The "chunk" type is used for regular file data chunks past the first
	// TOCEntry; the 2nd chunk and on have only Type ("chunk"), Offset,
	// ChunkOffset, and ChunkSize populated.
	Type string `json:"type"`

	// Size, for regular files, is the logical size of the file.
	Size int64 `json:"size,omitempty"`

	// ModTime3339 is the modification time of the tar entry. Empty
	// means zero or unknown. Otherwise it's in UTC RFC3339
	// format. Use the ModTime method to access the time.Time value.
	ModTime3339 string `json:"modtime,omitempty"`
	modTime     time.Time // parsed form of ModTime3339, returned by ModTime

	// LinkName, for symlinks and hardlinks, is the link target.
	LinkName string `json:"linkName,omitempty"`

	// Mode is the permission and mode bits.
	Mode int64 `json:"mode,omitempty"`

	// UID is the user ID of the owner.
	UID int `json:"uid,omitempty"`

	// GID is the group ID of the owner.
	GID int `json:"gid,omitempty"`

	// Uname is the username of the owner.
	//
	// In the serialized JSON, this field may only be present for
	// the first entry with the same UID.
	Uname string `json:"userName,omitempty"`

	// Gname is the group name of the owner.
	//
	// In the serialized JSON, this field may only be present for
	// the first entry with the same GID.
	Gname string `json:"groupName,omitempty"`

	// Offset, for regular files, provides the offset in the
	// stargz file to the file's data bytes. See ChunkOffset and
	// ChunkSize.
	Offset int64 `json:"offset,omitempty"`

	// InnerOffset is an optional field indicates uncompressed offset
	// of this "reg" or "chunk" payload in a stream starts from Offset.
	// This field enables to put multiple "reg" or "chunk" payloads
	// in one chunk with having the same Offset but different InnerOffset.
	InnerOffset int64 `json:"innerOffset,omitempty"`

	nextOffset int64 // the Offset of the next entry with a non-zero Offset

	// DevMajor is the major device number for "char" and "block" types.
	DevMajor int `json:"devMajor,omitempty"`

	// DevMinor is the minor device number for "char" and "block" types.
	// (Comment fixed: it previously said "major".)
	DevMinor int `json:"devMinor,omitempty"`

	// NumLink is the number of entry names pointing to this entry.
	// Zero means one name references this entry.
	// This field is calculated during runtime and not recorded in TOC JSON.
	NumLink int `json:"-"`

	// Xattrs are the extended attribute for the entry.
	Xattrs map[string][]byte `json:"xattrs,omitempty"`

	// Digest stores the OCI checksum for regular files payload.
	// It has the form "sha256:abcdef01234....".
	Digest string `json:"digest,omitempty"`

	// ChunkOffset is non-zero if this is a chunk of a large,
	// regular file. If so, the Offset is where the gzip header of
	// ChunkSize bytes at ChunkOffset in Name begin.
	//
	// In serialized form, a "chunkSize" JSON field of zero means
	// that the chunk goes to the end of the file. After reading
	// from the stargz TOC, though, the ChunkSize is initialized
	// to a non-zero file for when Type is either "reg" or
	// "chunk".
	ChunkOffset int64 `json:"chunkOffset,omitempty"`
	ChunkSize   int64 `json:"chunkSize,omitempty"`

	// ChunkDigest stores an OCI digest of the chunk. This must be formed
	// as "sha256:0123abcd...".
	ChunkDigest string `json:"chunkDigest,omitempty"`

	children map[string]*TOCEntry // child entries, keyed by base name

	// chunkTopIndex is index of the entry where Offset starts in the blob.
	chunkTopIndex int
}
// ModTime returns the entry's modification time.
func (e *TOCEntry) ModTime() time.Time { return e.modTime }

// NextOffset returns the position (relative to the start of the
// stargz file) of the next gzip boundary after e.Offset.
func (e *TOCEntry) NextOffset() int64 { return e.nextOffset }
// addChild registers child under baseName in e's children map, creating
// the map lazily on first use.
func (e *TOCEntry) addChild(baseName string, child *TOCEntry) {
	if e.children == nil {
		e.children = map[string]*TOCEntry{}
	}
	// A subdirectory's ".." entry references this directory, so a
	// directory child adds one more link to e.
	if child.Type == "dir" {
		e.NumLink++
	}
	e.children[baseName] = child
}
// isDataType reports whether TOCEntry is a regular file or chunk (something that
// contains regular file data).
func (e *TOCEntry) isDataType() bool { return e.Type == "reg" || e.Type == "chunk" }

// Stat returns a FileInfo value representing e.
func (e *TOCEntry) Stat() os.FileInfo { return fileInfo{e} }
// ForeachChild calls f for each child item. If f returns false, iteration ends.
// If e is not a directory, f is not called.
func (e *TOCEntry) ForeachChild(f func(baseName string, ent *TOCEntry) bool) {
	// Ranging over a nil map is a no-op, so non-directories fall through.
	for baseName, child := range e.children {
		if !f(baseName, child) {
			break
		}
	}
}
// LookupChild returns the directory e's child by its base name.
func (e *TOCEntry) LookupChild(baseName string) (child *TOCEntry, ok bool) {
	child, ok = e.children[baseName]
	return child, ok
}
// fileInfo implements os.FileInfo using the wrapped *TOCEntry.
type fileInfo struct{ e *TOCEntry }

// Compile-time check that fileInfo satisfies os.FileInfo.
var _ os.FileInfo = fileInfo{}

// Name returns the base name of the entry's full tar path.
func (fi fileInfo) Name() string { return path.Base(fi.e.Name) }

// IsDir reports whether the entry is a directory.
func (fi fileInfo) IsDir() bool { return fi.e.Type == "dir" }

// Size returns the logical file size recorded in the TOC.
func (fi fileInfo) Size() int64 { return fi.e.Size }

// ModTime returns the entry's modification time.
func (fi fileInfo) ModTime() time.Time { return fi.e.ModTime() }

// Sys returns the underlying *TOCEntry.
func (fi fileInfo) Sys() interface{} { return fi.e }
// Mode returns the entry's permission and type bits as an os.FileMode,
// deriving the type bits from the entry's Type string.
func (fi fileInfo) Mode() (m os.FileMode) {
	// TOCEntry.Mode carries tar.Header.Mode semantics, so let the `tar`
	// package interpret the raw bits, then keep only permission and
	// setuid/setgid/sticky bits before adding our own type bits.
	interpreted := (&tar.Header{Mode: fi.e.Mode}).FileInfo().Mode()
	m = interpreted & (os.ModePerm | os.ModeSetuid | os.ModeSetgid | os.ModeSticky)
	switch fi.e.Type {
	case "dir":
		m |= os.ModeDir
	case "symlink":
		m |= os.ModeSymlink
	case "char":
		m |= os.ModeDevice | os.ModeCharDevice
	case "block":
		m |= os.ModeDevice
	case "fifo":
		m |= os.ModeNamedPipe
	}
	return m
}
// TOCEntryVerifier holds verifiers that are usable for verifying chunks contained
// in an eStargz blob.
type TOCEntryVerifier interface {
	// Verifier provides a content verifier that can be used for verifying the
	// contents of the specified TOCEntry.
	Verifier(ce *TOCEntry) (digest.Verifier, error)
}

// Compression provides the compression helper to be used creating and parsing eStargz.
// This package provides gzip-based Compression by default, but any compression
// algorithm (e.g. zstd) can be used as long as it implements Compression.
type Compression interface {
	Compressor
	Decompressor
}
// Compressor represents the helper methods to be used for creating eStargz.
type Compressor interface {
	// Writer returns WriteCloser to be used for writing a chunk to eStargz.
	// Every time a chunk is written, the WriteCloser is closed and Writer is
	// called again for writing the next chunk.
	//
	// The returned writer should implement "Flush() error" function that flushes
	// any pending compressed data to the underlying writer.
	Writer(w io.Writer) (WriteFlushCloser, error)

	// WriteTOCAndFooter is called to write JTOC to the passed Writer.
	// diffHash calculates the DiffID (uncompressed sha256 hash) of the blob
	// WriteTOCAndFooter can optionally write anything that affects DiffID calculation
	// (e.g. uncompressed TOC JSON).
	//
	// This function returns tocDgst that represents the digest of TOC that will be used
	// to verify this blob when it's parsed.
	WriteTOCAndFooter(w io.Writer, off int64, toc *JTOC, diffHash hash.Hash) (tocDgst digest.Digest, err error)
}
// Decompressor represents the helper methods to be used for parsing eStargz.
type Decompressor interface {
	// Reader returns ReadCloser to be used for decompressing file payload.
	Reader(r io.Reader) (io.ReadCloser, error)

	// FooterSize returns the size of the footer of this blob.
	FooterSize() int64

	// ParseFooter parses the footer and returns the offset and (compressed) size of TOC.
	// payloadBlobSize is the (compressed) size of the blob payload (i.e. the size between
	// the top until the TOC JSON).
	//
	// If tocOffset < 0, we assume that TOC isn't contained in the blob and pass nil reader
	// to ParseTOC. We expect that ParseTOC acquire TOC from the external location and return it.
	//
	// tocSize is optional. If tocSize <= 0, it's by default the size of the range from tocOffset until the beginning of the
	// footer (blob size - tocOff - FooterSize).
	// If blobPayloadSize < 0, blobPayloadSize become the blob size.
	ParseFooter(p []byte) (blobPayloadSize, tocOffset, tocSize int64, err error)

	// ParseTOC parses TOC from the passed reader. The reader provides the partial contents
	// of the underlying blob that has the range specified by ParseFooter method.
	//
	// This function returns tocDgst that represents the digest of TOC that will be used
	// to verify this blob. This must match to the value returned from
	// Compressor.WriteTOCAndFooter that is used when creating this blob.
	//
	// If tocOffset returned by ParseFooter is < 0, we assume that TOC isn't contained in the blob.
	// Pass nil reader to ParseTOC then we expect that ParseTOC acquire TOC from the external location
	// and return it.
	ParseTOC(r io.Reader) (toc *JTOC, tocDgst digest.Digest, err error)
}
// WriteFlushCloser is an io.WriteCloser that can also flush pending
// compressed data to the underlying writer. See Compressor.Writer.
type WriteFlushCloser interface {
	io.WriteCloser
	Flush() error
}

View File

@@ -0,0 +1,201 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package zstdchunked
import (
"bufio"
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"hash"
"io"
"sync"
"github.com/containerd/stargz-snapshotter/estargz"
"github.com/klauspost/compress/zstd"
digest "github.com/opencontainers/go-digest"
)
const (
	// ManifestChecksumAnnotation is an annotation that contains the compressed TOC Digest.
	ManifestChecksumAnnotation = "io.containers.zstd-chunked.manifest-checksum"

	// ManifestPositionAnnotation is an annotation that contains the offset to the TOC.
	ManifestPositionAnnotation = "io.containers.zstd-chunked.manifest-position"

	// FooterSize is the size of the footer
	FooterSize = 40

	// manifestTypeCRFS is the zstd:chunked manifest-type value written into
	// the footer and ManifestPositionAnnotation.
	manifestTypeCRFS = 1
)
var (
	// skippableFrameMagic is the zstd skippable frame magic number, used to
	// wrap the TOC and footer so generic zstd decoders ignore them.
	skippableFrameMagic = []byte{0x50, 0x2a, 0x4d, 0x18}
	// zstdFrameMagic is the standard zstd frame magic number.
	zstdFrameMagic = []byte{0x28, 0xb5, 0x2f, 0xfd}
	// zstdChunkedFrameMagic identifies the zstd:chunked footer (bytes 32-40
	// of the footer; see zstdFooterBytes and ParseFooter).
	zstdChunkedFrameMagic = []byte{0x47, 0x6e, 0x55, 0x6c, 0x49, 0x6e, 0x55, 0x78}
)
// Decompressor implements the estargz decompressor interface for
// zstd:chunked blobs.
type Decompressor struct{}

// Reader wraps r in a zstd decoder and returns it as an io.ReadCloser.
func (zz *Decompressor) Reader(r io.Reader) (io.ReadCloser, error) {
	decoder, err := zstd.NewReader(r)
	if err != nil {
		return nil, err
	}
	rc := &zstdReadCloser{decoder}
	return rc, nil
}
// ParseTOC decompresses the TOC JSON from r and returns the decoded table
// of contents along with the canonical digest of the decompressed stream.
func (zz *Decompressor) ParseTOC(r io.Reader) (toc *estargz.JTOC, tocDgst digest.Digest, err error) {
	zr, err := zstd.NewReader(r)
	if err != nil {
		return nil, "", err
	}
	defer zr.Close()
	// Hash the decompressed bytes while decoding so no second pass over
	// the stream is needed.
	digester := digest.Canonical.Digester()
	tee := io.TeeReader(zr, digester.Hash())
	out := new(estargz.JTOC)
	if err := json.NewDecoder(tee).Decode(&out); err != nil {
		return nil, "", fmt.Errorf("error decoding TOC JSON: %w", err)
	}
	return out, digester.Digest(), nil
}
// ParseFooter parses the 40-byte zstd:chunked footer and returns the blob
// payload size, the TOC offset, and the compressed TOC size.
//
// Footer layout (little-endian): [0:8) TOC offset, [8:16) compressed TOC
// size, [32:40) zstd:chunked frame magic (see zstdFooterBytes).
//
// Fix over the original: the input length is validated before indexing
// (the original panicked on a truncated footer), and offsets smaller than
// the skippable frame header are rejected instead of underflowing.
func (zz *Decompressor) ParseFooter(p []byte) (blobPayloadSize, tocOffset, tocSize int64, err error) {
	if len(p) < FooterSize {
		return 0, 0, 0, fmt.Errorf("footer must be at least %d bytes; got %d", FooterSize, len(p))
	}
	offset := binary.LittleEndian.Uint64(p[0:8])
	compressedLength := binary.LittleEndian.Uint64(p[8:16])
	if !bytes.Equal(zstdChunkedFrameMagic, p[32:40]) {
		return 0, 0, 0, fmt.Errorf("invalid magic number")
	}
	// 8 is the size of the zstd skippable frame header + the frame size
	// (see WriteTOCAndFooter); the payload ends where that header begins.
	if offset < 8 {
		return 0, 0, 0, fmt.Errorf("invalid TOC offset %d", offset)
	}
	return int64(offset - 8), int64(offset), int64(compressedLength), nil
}
// FooterSize returns the fixed size (in bytes) of the zstd:chunked footer.
func (zz *Decompressor) FooterSize() int64 {
	return FooterSize
}
// DecompressTOC returns a ReadCloser over the decompressed TOC JSON read
// from r. One byte is peeked eagerly so that stream-setup errors surface
// here rather than on the caller's first Read.
func (zz *Decompressor) DecompressTOC(r io.Reader) (tocJSON io.ReadCloser, err error) {
	decoder, err := zstd.NewReader(r)
	if err != nil {
		return nil, err
	}
	buffered := bufio.NewReader(decoder)
	if _, err := buffered.Peek(1); err != nil {
		return nil, err
	}
	rc := &reader{buffered, decoder.Close}
	return rc, nil
}
// reader adapts an io.Reader plus a close callback into an io.ReadCloser.
type reader struct {
	io.Reader
	// closeFunc releases the underlying decoder; its error (if any) is
	// discarded by Close.
	closeFunc func()
}

// Close invokes the close callback and always reports success.
func (r *reader) Close() error { r.closeFunc(); return nil }

// zstdReadCloser adapts *zstd.Decoder (whose Close returns nothing) to
// io.ReadCloser.
type zstdReadCloser struct{ *zstd.Decoder }

// Close releases the decoder and always reports success.
func (z *zstdReadCloser) Close() error {
	z.Decoder.Close()
	return nil
}
// Compressor implements the estargz compressor interface for zstd:chunked.
type Compressor struct {
	// CompressionLevel is the zstd encoder level used for chunk and TOC
	// compression.
	CompressionLevel zstd.EncoderLevel
	// Metadata, when non-nil, receives the manifest checksum/position
	// annotations after WriteTOCAndFooter runs.
	Metadata map[string]string

	// pool recycles *zstd.Encoder values across Writer calls.
	pool sync.Pool
}
// Writer returns a WriteFlushCloser that zstd-compresses data into w.
// Encoders are recycled through zc.pool so encoder state is not
// re-allocated for every chunk.
func (zc *Compressor) Writer(w io.Writer) (estargz.WriteFlushCloser, error) {
	if cached := zc.pool.Get(); cached != nil {
		enc := cached.(*zstd.Encoder)
		enc.Reset(w)
		return &poolEncoder{enc, zc}, nil
	}
	enc, err := zstd.NewWriter(w, zstd.WithEncoderLevel(zc.CompressionLevel), zstd.WithLowerEncoderMem(true))
	if err != nil {
		return nil, err
	}
	return &poolEncoder{enc, zc}, nil
}
// poolEncoder wraps *zstd.Encoder so that Close returns the encoder to the
// owning Compressor's pool for reuse.
type poolEncoder struct {
	*zstd.Encoder
	zc *Compressor
}

// Close finishes the compressed stream and, on success, recycles the
// encoder into the pool.
func (w *poolEncoder) Close() error {
	if err := w.Encoder.Close(); err != nil {
		return err
	}
	w.zc.pool.Put(w.Encoder)
	return nil
}
// WriteTOCAndFooter serializes toc as indented JSON, zstd-compresses it,
// writes it to w wrapped in a skippable frame, then writes the 40-byte
// footer (also wrapped in a skippable frame). When zc.Metadata is non-nil
// it also records the zstd:chunked manifest checksum/position annotations.
// It returns the digest of the raw (uncompressed) TOC JSON.
//
// Note: the diffHash parameter is unused here; for zstd:chunked nothing
// beyond the compressed stream affects DiffID calculation.
//
// Fix over the original: the io.Copy error for the TOC frame was only
// checked at the final return, so the footer was still written (and
// Metadata populated) after a failed TOC write. The error is now checked
// immediately.
func (zc *Compressor) WriteTOCAndFooter(w io.Writer, off int64, toc *estargz.JTOC, diffHash hash.Hash) (digest.Digest, error) {
	tocJSON, err := json.MarshalIndent(toc, "", "\t")
	if err != nil {
		return "", err
	}
	buf := new(bytes.Buffer)
	encoder, err := zstd.NewWriter(buf, zstd.WithEncoderLevel(zc.CompressionLevel))
	if err != nil {
		return "", err
	}
	if _, err := encoder.Write(tocJSON); err != nil {
		return "", err
	}
	if err := encoder.Close(); err != nil {
		return "", err
	}
	compressedTOC := buf.Bytes()
	if _, err := io.Copy(w, bytes.NewReader(appendSkippableFrameMagic(compressedTOC))); err != nil {
		return "", err
	}

	// 8 is the size of the zstd skippable frame header + the frame size.
	tocOff := uint64(off) + 8
	if _, err := w.Write(appendSkippableFrameMagic(
		zstdFooterBytes(tocOff, uint64(len(tocJSON)), uint64(len(compressedTOC)))),
	); err != nil {
		return "", err
	}

	if zc.Metadata != nil {
		zc.Metadata[ManifestChecksumAnnotation] = digest.FromBytes(compressedTOC).String()
		zc.Metadata[ManifestPositionAnnotation] = fmt.Sprintf("%d:%d:%d:%d",
			tocOff, len(compressedTOC), len(tocJSON), manifestTypeCRFS)
	}

	return digest.FromBytes(tocJSON), nil
}
// zstdFooterBytes serializes the 40-byte zstd:chunked footer. Layout, all
// little-endian: [0:8) TOC offset, [8:16) compressed TOC size, [16:24)
// raw TOC size, [24:32) manifest type, [32:40) frame magic.
func zstdFooterBytes(tocOff, tocRawSize, tocCompressedSize uint64) []byte {
	footer := make([]byte, FooterSize)
	binary.LittleEndian.PutUint64(footer[0:], tocOff)
	binary.LittleEndian.PutUint64(footer[8:], tocCompressedSize)
	binary.LittleEndian.PutUint64(footer[16:], tocRawSize)
	binary.LittleEndian.PutUint64(footer[24:], uint64(manifestTypeCRFS))
	copy(footer[32:], zstdChunkedFrameMagic)
	return footer
}
// appendSkippableFrameMagic wraps b into a complete zstd skippable frame:
// the 4-byte skippable frame magic, b's length as a little-endian uint32,
// then b itself.
//
// Fix over the original: the original did append(skippableFrameMagic, ...),
// appending to a package-level slice. That is only safe while len == cap on
// that slice; building into a fresh pre-sized buffer removes the aliasing
// risk and avoids an intermediate allocation.
func appendSkippableFrameMagic(b []byte) []byte {
	out := make([]byte, 0, len(skippableFrameMagic)+4+len(b))
	out = append(out, skippableFrameMagic...)
	var size [4]byte
	binary.LittleEndian.PutUint32(size[:], uint32(len(b)))
	out = append(out, size[:]...)
	return append(out, b...)
}

View File

@@ -0,0 +1,151 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the NOTICE.md file.
*/
package config
const (
	// TargetSkipVerifyLabel is a snapshot label key that indicates to skip content
	// verification for the layer.
	TargetSkipVerifyLabel = "containerd.io/snapshot/remote/stargz.skipverify"

	// TargetPrefetchSizeLabel is a snapshot label key that indicates size to prefetch
	// the layer. If the layer is eStargz and contains prefetch landmarks, these config
	// will be respected.
	TargetPrefetchSizeLabel = "containerd.io/snapshot/remote/stargz.prefetch"
)
// Config is configuration for stargz snapshotter filesystem.
type Config struct {
	// Type of cache for compressed contents fetched from the registry. "memory" stores them on memory.
	// Other values default to cache them on disk.
	HTTPCacheType string `toml:"http_cache_type"`

	// Type of cache for uncompressed files contents. "memory" stores them on memory. Other values
	// default to cache them on disk.
	FSCacheType string `toml:"filesystem_cache_type"`

	// ResolveResultEntryTTLSec is TTL (in sec) to cache resolved layers for
	// future use. (default 120s)
	ResolveResultEntryTTLSec int `toml:"resolve_result_entry_ttl_sec"`

	// PrefetchSize is the default size (in bytes) to prefetch when mounting a layer. Default is 0. Stargz-snapshotter still
	// uses the value specified by the image using "containerd.io/snapshot/remote/stargz.prefetch" or the landmark file.
	PrefetchSize int64 `toml:"prefetch_size"`

	// PrefetchTimeoutSec is the default timeout (in seconds) when the prefetching takes long. Default is 10s.
	PrefetchTimeoutSec int64 `toml:"prefetch_timeout_sec"`

	// NoPrefetch disables prefetching. Default is false.
	NoPrefetch bool `toml:"noprefetch"`

	// NoBackgroundFetch disables the behaviour of fetching the entire layer contents in background. Default is false.
	NoBackgroundFetch bool `toml:"no_background_fetch"`

	// Debug enables filesystem debug log.
	Debug bool `toml:"debug"`

	// AllowNoVerification allows mounting images without verification. Default is false.
	AllowNoVerification bool `toml:"allow_no_verification"`

	// DisableVerification disables verifying layer contents. Default is false.
	DisableVerification bool `toml:"disable_verification"`

	// MaxConcurrency is max number of concurrent background tasks for fetching layer contents. Default is 2.
	MaxConcurrency int64 `toml:"max_concurrency"`

	// NoPrometheus disables exposing filesystem-related metrics. Default is false.
	NoPrometheus bool `toml:"no_prometheus"`

	// BlobConfig is config for layer blob management.
	BlobConfig `toml:"blob"`

	// DirectoryCacheConfig is config for directory-based cache.
	DirectoryCacheConfig `toml:"directory_cache"`

	// FuseConfig is configurations for FUSE fs.
	FuseConfig `toml:"fuse"`

	// ResolveResultEntry is a deprecated field.
	//
	// Deprecated: use ResolveResultEntryTTLSec instead.
	ResolveResultEntry int `toml:"resolve_result_entry"` // deprecated
}
// BlobConfig is configuration for the logic to fetching blobs.
type BlobConfig struct {
	// ValidInterval specifies a duration (in seconds) during which the layer can be reused without
	// checking the connection to the registry. Default is 60.
	ValidInterval int64 `toml:"valid_interval"`

	// CheckAlways overwrites ValidInterval to 0 if it's true. Default is false.
	CheckAlways bool `toml:"check_always"`

	// ChunkSize is the granularity (in bytes) at which background fetch and on-demand reads
	// are fetched from the remote registry. Default is 50000.
	ChunkSize int64 `toml:"chunk_size"`

	// FetchTimeoutSec is a timeout duration (in seconds) for fetching chunks from the registry. Default is 300.
	FetchTimeoutSec int64 `toml:"fetching_timeout_sec"`

	// ForceSingleRangeMode disables using of multiple ranges in a Range Request and always specifies one larger
	// region that covers them. Default is false.
	ForceSingleRangeMode bool `toml:"force_single_range_mode"`

	// PrefetchChunkSize is the maximum bytes transferred per http GET from remote registry
	// during prefetch. It is recommended to have PrefetchChunkSize > ChunkSize.
	// If PrefetchChunkSize < ChunkSize prefetch bytes will be fetched as a single http GET,
	// else total GET requests for prefetch = ceil(PrefetchSize / PrefetchChunkSize).
	// Default is 0.
	PrefetchChunkSize int64 `toml:"prefetch_chunk_size"`

	// MaxRetries is a max number of retries of a HTTP request. Default is 5.
	MaxRetries int `toml:"max_retries"`

	// MinWaitMSec is minimal delay (in milliseconds, per the field name) for the next retrying
	// after a request failure. Default is 30.
	MinWaitMSec int `toml:"min_wait_msec"`

	// MaxWaitMSec is maximum delay (in milliseconds, per the field name) for the next retrying
	// after a request failure. Default is 30.
	MaxWaitMSec int `toml:"max_wait_msec"`
}
// DirectoryCacheConfig is configuration for the disk-based cache.
type DirectoryCacheConfig struct {
	// MaxLRUCacheEntry is the number of entries of LRU cache to cache data on memory. Default is 10.
	MaxLRUCacheEntry int `toml:"max_lru_cache_entry"`

	// MaxCacheFds is the number of entries of LRU cache to hold fds of files of cached contents. Default is 10.
	MaxCacheFds int `toml:"max_cache_fds"`

	// SyncAdd being true means that each adding of data to the cache blocks until the data is fully written to the
	// cache directory. Default is false.
	SyncAdd bool `toml:"sync_add"`

	// Direct disables on-memory data cache. Default is true for saving memory usage.
	Direct bool `toml:"direct" default:"true"`
}
// FuseConfig is configuration for FUSE fs.
type FuseConfig struct {
	// AttrTimeout defines overall timeout attribute for a file system in seconds.
	AttrTimeout int64 `toml:"attr_timeout"`

	// EntryTimeout defines TTL for directory, name lookup in seconds.
	EntryTimeout int64 `toml:"entry_timeout"`
}

View File

@@ -0,0 +1,506 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the NOTICE.md file.
*/
//
// Example implementation of FileSystem.
//
// This implementation uses stargz by CRFS (https://github.com/google/crfs) as
// the image format, which has the following features:
// - We can use docker registry as a backend store (means w/o additional layer
// stores).
// - The stargz-formatted image is still docker-compatible (means normal
// runtimes can still use the formatted image).
//
// Currently, we reimplemented CRFS-like filesystem for ease of integration.
// But in the near future, we intend to integrate it with CRFS.
//
package fs
import (
"context"
"fmt"
"os/exec"
"strconv"
"sync"
"time"
"github.com/containerd/containerd/v2/core/remotes/docker"
"github.com/containerd/containerd/v2/pkg/reference"
"github.com/containerd/log"
"github.com/containerd/stargz-snapshotter/estargz"
"github.com/containerd/stargz-snapshotter/fs/config"
"github.com/containerd/stargz-snapshotter/fs/layer"
commonmetrics "github.com/containerd/stargz-snapshotter/fs/metrics/common"
layermetrics "github.com/containerd/stargz-snapshotter/fs/metrics/layer"
"github.com/containerd/stargz-snapshotter/fs/remote"
"github.com/containerd/stargz-snapshotter/fs/source"
"github.com/containerd/stargz-snapshotter/metadata"
memorymetadata "github.com/containerd/stargz-snapshotter/metadata/memory"
"github.com/containerd/stargz-snapshotter/snapshot"
"github.com/containerd/stargz-snapshotter/task"
metrics "github.com/docker/go-metrics"
fusefs "github.com/hanwen/go-fuse/v2/fs"
"github.com/hanwen/go-fuse/v2/fuse"
digest "github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"golang.org/x/sys/unix"
)
const (
	// defaultFuseTimeout is the fallback FUSE attr/entry timeout used when
	// the corresponding config values are zero.
	defaultFuseTimeout = time.Second

	// defaultMaxConcurrency is the fallback number of concurrent background
	// fetch tasks used when cfg.MaxConcurrency is zero.
	defaultMaxConcurrency = 2
)
// fusermountBin lists the fusermount binary names probed at mount time to
// choose between fusermount-assisted and direct mounting.
var fusermountBin = []string{"fusermount", "fusermount3"}

// Option configures optional behavior of the filesystem returned by
// NewFilesystem.
type Option func(*options)

// options aggregates the settings collected from Option values.
type options struct {
	getSources              source.GetSources
	resolveHandlers         map[string]remote.Handler
	metadataStore           metadata.Store
	metricsLogLevel         *log.Level
	overlayOpaqueType       layer.OverlayOpaqueType
	additionalDecompressors func(context.Context, source.RegistryHosts, reference.Spec, ocispec.Descriptor) []metadata.Decompressor
}
// WithGetSources sets the function used to derive layer sources from
// snapshot labels.
func WithGetSources(s source.GetSources) Option {
	return func(opts *options) {
		opts.getSources = s
	}
}

// WithResolveHandler registers a named remote handler used when resolving
// layers.
func WithResolveHandler(name string, handler remote.Handler) Option {
	return func(opts *options) {
		if opts.resolveHandlers == nil {
			opts.resolveHandlers = make(map[string]remote.Handler)
		}
		opts.resolveHandlers[name] = handler
	}
}

// WithMetadataStore sets the metadata store backing resolved layers.
func WithMetadataStore(metadataStore metadata.Store) Option {
	return func(opts *options) {
		opts.metadataStore = metadataStore
	}
}

// WithMetricsLogLevel sets the log level at which common metrics are
// emitted.
func WithMetricsLogLevel(logLevel log.Level) Option {
	return func(opts *options) {
		opts.metricsLogLevel = &logLevel
	}
}

// WithOverlayOpaqueType sets how overlayfs opaque directories are
// represented.
func WithOverlayOpaqueType(overlayOpaqueType layer.OverlayOpaqueType) Option {
	return func(opts *options) {
		opts.overlayOpaqueType = overlayOpaqueType
	}
}

// WithAdditionalDecompressors sets a provider of extra decompressors to
// try for a layer in addition to the built-in ones.
func WithAdditionalDecompressors(d func(context.Context, source.RegistryHosts, reference.Spec, ocispec.Descriptor) []metadata.Decompressor) Option {
	return func(opts *options) {
		opts.additionalDecompressors = d
	}
}
// NewFilesystem builds the stargz snapshot.FileSystem rooted at root,
// applying cfg and opts. It wires up the layer resolver, background task
// manager, and (unless disabled) Prometheus metrics.
func NewFilesystem(root string, cfg config.Config, opts ...Option) (_ snapshot.FileSystem, err error) {
	var fsOpts options
	for _, o := range opts {
		o(&fsOpts)
	}
	// Fall back to package defaults for unset (zero) config values.
	maxConcurrency := cfg.MaxConcurrency
	if maxConcurrency == 0 {
		maxConcurrency = defaultMaxConcurrency
	}

	attrTimeout := time.Duration(cfg.FuseConfig.AttrTimeout) * time.Second
	if attrTimeout == 0 {
		attrTimeout = defaultFuseTimeout
	}

	entryTimeout := time.Duration(cfg.FuseConfig.EntryTimeout) * time.Second
	if entryTimeout == 0 {
		entryTimeout = defaultFuseTimeout
	}

	metadataStore := fsOpts.metadataStore
	if metadataStore == nil {
		metadataStore = memorymetadata.NewReader
	}

	getSources := fsOpts.getSources
	if getSources == nil {
		// Default: read sources from snapshot labels and resolve hosts via
		// containerd's default registry configuration (plain HTTP for localhost).
		getSources = source.FromDefaultLabels(func(refspec reference.Spec) (hosts []docker.RegistryHost, _ error) {
			return docker.ConfigureDefaultRegistries(docker.WithPlainHTTP(docker.MatchLocalhost))(refspec.Hostname())
		})
	}
	tm := task.NewBackgroundTaskManager(maxConcurrency, 5*time.Second)
	r, err := layer.NewResolver(root, tm, cfg, fsOpts.resolveHandlers, metadataStore, fsOpts.overlayOpaqueType, fsOpts.additionalDecompressors)
	if err != nil {
		return nil, fmt.Errorf("failed to setup resolver: %w", err)
	}

	var ns *metrics.Namespace
	if !cfg.NoPrometheus {
		ns = metrics.NewNamespace("stargz", "fs", nil)
		logLevel := log.DebugLevel
		if fsOpts.metricsLogLevel != nil {
			logLevel = *fsOpts.metricsLogLevel
		}
		commonmetrics.Register(logLevel) // Register common metrics. This will happen only once.
	}
	c := layermetrics.NewLayerMetrics(ns)
	if ns != nil {
		metrics.Register(ns) // Register layer metrics.
	}

	return &filesystem{
		resolver:              r,
		getSources:            getSources,
		prefetchSize:          cfg.PrefetchSize,
		noprefetch:            cfg.NoPrefetch,
		noBackgroundFetch:     cfg.NoBackgroundFetch,
		debug:                 cfg.Debug,
		layer:                 make(map[string]layer.Layer),
		backgroundTaskManager: tm,
		allowNoVerification:   cfg.AllowNoVerification,
		disableVerification:   cfg.DisableVerification,
		metricsController:     c,
		attrTimeout:           attrTimeout,
		entryTimeout:          entryTimeout,
	}, nil
}
// filesystem implements snapshot.FileSystem, serving stargz layers over
// FUSE and tracking one layer per mountpoint.
type filesystem struct {
	resolver          *layer.Resolver
	prefetchSize      int64
	noprefetch        bool
	noBackgroundFetch bool
	debug             bool
	// layer maps mountpoint paths to their mounted layers; guarded by layerMu.
	layer                 map[string]layer.Layer
	layerMu               sync.Mutex
	backgroundTaskManager *task.BackgroundTaskManager
	allowNoVerification   bool
	disableVerification   bool
	getSources            source.GetSources
	metricsController     *layermetrics.Controller
	attrTimeout           time.Duration
	entryTimeout          time.Duration
}
// Mount resolves the layer described by labels from one of the configured
// sources, verifies its TOC digest (unless verification is skipped or
// disabled), registers it under mountpoint, and serves it via FUSE.
// Sibling layers of the same image are pre-resolved in parallel as a
// warm-up.
func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[string]string) (retErr error) {
	// Setting the start time to measure the Mount operation duration.
	start := time.Now()
	// This is a prioritized task and all background tasks will be stopped
	// execution so this can avoid being disturbed for NW traffic by background
	// tasks.
	fs.backgroundTaskManager.DoPrioritizedTask()
	defer fs.backgroundTaskManager.DonePrioritizedTask()
	ctx = log.WithLogger(ctx, log.G(ctx).WithField("mountpoint", mountpoint))

	// Get source information of this layer.
	src, err := fs.getSources(labels)
	if err != nil {
		return err
	} else if len(src) == 0 {
		return fmt.Errorf("source must be passed")
	}

	// The per-snapshot prefetch-size label overrides the configured default.
	defaultPrefetchSize := fs.prefetchSize
	if psStr, ok := labels[config.TargetPrefetchSizeLabel]; ok {
		if ps, err := strconv.ParseInt(psStr, 10, 64); err == nil {
			defaultPrefetchSize = ps
		}
	}

	// Resolve the target layer
	var (
		resultChan = make(chan layer.Layer)
		errChan    = make(chan error)
	)
	go func() {
		rErr := fmt.Errorf("failed to resolve target")
		for _, s := range src {
			l, err := fs.resolver.Resolve(ctx, s.Hosts, s.Name, s.Target)
			if err == nil {
				// NOTE(review): resultChan is unbuffered; if the 30s timeout
				// below wins the select, this send blocks forever and the
				// goroutine (plus the resolved layer) leaks — confirm intended.
				resultChan <- l
				fs.prefetch(ctx, l, defaultPrefetchSize, start)
				return
			}
			rErr = fmt.Errorf("failed to resolve layer %q from %q: %v: %w", s.Target.Digest, s.Name, err, rErr)
		}
		errChan <- rErr
	}()

	// Also resolve and cache other layers in parallel
	preResolve := src[0] // TODO: should we pre-resolve blobs in other sources as well?
	for _, desc := range neighboringLayers(preResolve.Manifest, preResolve.Target) {
		desc := desc
		go func() {
			// Avoids to get canceled by client.
			ctx := log.WithLogger(context.Background(), log.G(ctx).WithField("mountpoint", mountpoint))
			l, err := fs.resolver.Resolve(ctx, preResolve.Hosts, preResolve.Name, desc)
			if err != nil {
				log.G(ctx).WithError(err).Debug("failed to pre-resolve")
				return
			}
			fs.prefetch(ctx, l, defaultPrefetchSize, start)

			// Release this layer because this isn't target and we don't use it anymore here.
			// However, this will remain on the resolver cache until eviction.
			l.Done()
		}()
	}

	// Wait for resolving completion
	var l layer.Layer
	select {
	case l = <-resultChan:
	case err := <-errChan:
		log.G(ctx).WithError(err).Debug("failed to resolve layer")
		return fmt.Errorf("failed to resolve layer: %w", err)
	case <-time.After(30 * time.Second):
		log.G(ctx).Debug("failed to resolve layer (timeout)")
		return fmt.Errorf("failed to resolve layer (timeout)")
	}
	defer func() {
		if retErr != nil {
			l.Done() // don't use this layer.
		}
	}()

	// Verify layer's content
	if fs.disableVerification {
		// Skip if verification is disabled completely
		l.SkipVerify()
		log.G(ctx).Infof("Verification forcefully skipped")
	} else if tocDigest, ok := labels[estargz.TOCJSONDigestAnnotation]; ok {
		// Verify this layer using the TOC JSON digest passed through label.
		dgst, err := digest.Parse(tocDigest)
		if err != nil {
			log.G(ctx).WithError(err).Debugf("failed to parse passed TOC digest %q", dgst)
			return fmt.Errorf("invalid TOC digest: %v: %w", tocDigest, err)
		}
		if err := l.Verify(dgst); err != nil {
			log.G(ctx).WithError(err).Debugf("invalid layer")
			return fmt.Errorf("invalid stargz layer: %w", err)
		}
		log.G(ctx).Debugf("verified")
	} else if _, ok := labels[config.TargetSkipVerifyLabel]; ok && fs.allowNoVerification {
		// If unverified layer is allowed, use it with warning.
		// This mode is for legacy stargz archives which don't contain digests
		// necessary for layer verification.
		l.SkipVerify()
		log.G(ctx).Warningf("No verification is held for layer")
	} else {
		// Verification must be done. Don't mount this layer.
		return fmt.Errorf("digest of TOC JSON must be passed")
	}
	node, err := l.RootNode(0)
	if err != nil {
		log.G(ctx).WithError(err).Warnf("Failed to get root node")
		return fmt.Errorf("failed to get root node: %w", err)
	}

	// Measuring duration of Mount operation for resolved layer.
	digest := l.Info().Digest // get layer sha
	defer commonmetrics.MeasureLatencyInMilliseconds(commonmetrics.Mount, digest, start)

	// Register the mountpoint layer
	fs.layerMu.Lock()
	fs.layer[mountpoint] = l
	fs.layerMu.Unlock()
	fs.metricsController.Add(mountpoint, l)

	// mount the node to the specified mountpoint
	// TODO: bind mount the state directory as a read-only fs on snapshotter's side
	rawFS := fusefs.NewNodeFS(node, &fusefs.Options{
		AttrTimeout:     &fs.attrTimeout,
		EntryTimeout:    &fs.entryTimeout,
		NullPermissions: true,
	})
	mountOpts := &fuse.MountOptions{
		AllowOther: true,     // allow users other than root&mounter to access fs
		FsName:     "stargz", // name this filesystem as "stargz"
		Debug:      fs.debug,
	}
	if isFusermountBinExist() {
		log.G(ctx).Infof("fusermount detected")
		mountOpts.Options = []string{"suid"} // option for fusermount; allow setuid inside container
	} else {
		// NOTE(review): err is nil on this path (RootNode succeeded above), so
		// WithError(err) attaches no error to this log line — confirm intended.
		log.G(ctx).WithError(err).Infof("%s not installed; trying direct mount", fusermountBin)
		mountOpts.DirectMount = true
	}
	server, err := fuse.NewServer(rawFS, mountpoint, mountOpts)
	if err != nil {
		log.G(ctx).WithError(err).Debug("failed to make filesystem server")
		return err
	}

	go server.Serve()
	return server.WaitMount()
}
// Check reports the health of the layer mounted at mountpoint. When the
// layer content is not fully cached yet it verifies blob connectivity
// (refreshing the connection on failure), then waits for any in-flight
// prefetch to complete.
func (fs *filesystem) Check(ctx context.Context, mountpoint string, labels map[string]string) error {
	// This is a prioritized task and all background tasks will be stopped
	// execution so this can avoid being disturbed for NW traffic by background
	// tasks.
	fs.backgroundTaskManager.DoPrioritizedTask()
	defer fs.backgroundTaskManager.DonePrioritizedTask()

	// measuring the time the container launch is blocked on prefetch to complete
	defer commonmetrics.MeasureLatencyInMilliseconds(commonmetrics.PrefetchesCompleted, digest.FromString(""), time.Now())

	ctx = log.WithLogger(ctx, log.G(ctx).WithField("mountpoint", mountpoint))

	fs.layerMu.Lock()
	l := fs.layer[mountpoint]
	fs.layerMu.Unlock()
	if l == nil {
		log.G(ctx).Debug("layer not registered")
		return fmt.Errorf("layer not registered")
	}

	if l.Info().FetchedSize < l.Info().Size {
		// Image contents hasn't fully cached yet.
		// Check the blob connectivity and try to refresh the connection on failure
		if err := fs.check(ctx, l, labels); err != nil {
			log.G(ctx).WithError(err).Warn("check failed")
			return err
		}
	}

	// Wait for prefetch completion
	if !fs.noprefetch {
		if err := l.WaitForPrefetchCompletion(); err != nil {
			log.G(ctx).WithError(err).Warn("failed to sync with prefetch completion")
		}
	}
	return nil
}
// check probes the layer's blob connectivity and, on failure, tries to
// refresh the connection using fresh source information derived from labels.
func (fs *filesystem) check(ctx context.Context, l layer.Layer, labels map[string]string) error {
	err := l.Check()
	if err == nil {
		return nil
	}
	log.G(ctx).WithError(err).Warn("failed to connect to blob")

	// Check failed. Try to refresh the connection with fresh source information
	src, err := fs.getSources(labels)
	if err != nil {
		return err
	}
	const maxAttempts = 1
	refreshErr := fmt.Errorf("failed to refresh connection")
	for attempt := 0; attempt < maxAttempts; attempt++ {
		log.G(ctx).Warnf("refreshing(%d)...", attempt)
		for _, s := range src {
			rerr := l.Refresh(ctx, s.Hosts, s.Name, s.Target)
			if rerr == nil {
				log.G(ctx).Debug("Successfully refreshed connection")
				return nil
			}
			log.G(ctx).WithError(rerr).Warnf("failed to refresh the layer %q from %q", s.Target.Digest, s.Name)
			refreshErr = fmt.Errorf("failed(layer:%q, ref:%q): %v: %w", s.Target.Digest, s.Name, rerr, refreshErr)
		}
	}
	return refreshErr
}
// Unmount unregisters the layer mounted at mountpoint, releases its
// reference, and unmounts the FUSE mount — forcing the unmount when the
// mountpoint is busy.
func (fs *filesystem) Unmount(ctx context.Context, mountpoint string) error {
	if mountpoint == "" {
		return fmt.Errorf("mount point must be specified")
	}
	fs.layerMu.Lock()
	l, ok := fs.layer[mountpoint]
	if !ok {
		fs.layerMu.Unlock()
		return fmt.Errorf("specified path %q isn't a mountpoint", mountpoint)
	}
	// Unregister the layer and drop its reference while still holding the lock.
	delete(fs.layer, mountpoint)
	l.Done()
	fs.layerMu.Unlock()
	fs.metricsController.Remove(mountpoint)

	err := unmount(mountpoint, 0)
	if err == nil {
		return nil
	}
	if err != unix.EBUSY {
		return err
	}
	// The mountpoint is busy; fall back to a forced unmount.
	log.G(ctx).WithError(err).Debugf("trying force unmount %q", mountpoint)
	return unmount(mountpoint, unix.MNT_FORCE)
}
// unmount calls unix.Unmount on target, transparently retrying when the
// syscall is interrupted by a signal (EINTR).
func unmount(target string, flags int) error {
	for {
		err := unix.Unmount(target, flags)
		if err == unix.EINTR {
			continue
		}
		return err
	}
}
// prefetch kicks off asynchronous data retrieval for layer l: an optional
// prefetch of the default range (which the first Check() waits on) and,
// independently, an aggressive whole-layer background fetch.
func (fs *filesystem) prefetch(ctx context.Context, l layer.Layer, defaultPrefetchSize int64, start time.Time) {
	if !fs.noprefetch {
		// The first Check() for this layer waits for this prefetch to complete.
		go l.Prefetch(defaultPrefetchSize)
	}
	if !fs.noBackgroundFetch {
		// Fetch the whole layer aggressively in background.
		go func() {
			if err := l.BackgroundFetch(); err == nil {
				// Record the latency between mount start and the last on-demand fetch.
				commonmetrics.LogLatencyForLastOnDemandFetch(ctx, l.Info().Digest, start, l.Info().ReadTime)
			}
		}()
	}
}
// neighboringLayers returns the descriptors of every layer in the manifest
// other than the `target` layer itself.
func neighboringLayers(manifest ocispec.Manifest, target ocispec.Descriptor) []ocispec.Descriptor {
	var siblings []ocispec.Descriptor
	for _, d := range manifest.Layers {
		if d.Digest.String() == target.Digest.String() {
			continue
		}
		siblings = append(siblings, d)
	}
	return siblings
}
// isFusermountBinExist reports whether any of the known fusermount binaries
// can be located via PATH.
func isFusermountBinExist() bool {
	for _, candidate := range fusermountBin {
		_, lookupErr := exec.LookPath(candidate)
		if lookupErr == nil {
			return true
		}
	}
	return false
}

View File

@@ -0,0 +1,681 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the NOTICE.md file.
*/
package layer
import (
"bytes"
"context"
"fmt"
"io"
"os"
"path/filepath"
"sync"
"time"
"github.com/containerd/containerd/v2/pkg/reference"
"github.com/containerd/log"
"github.com/containerd/stargz-snapshotter/cache"
"github.com/containerd/stargz-snapshotter/estargz"
"github.com/containerd/stargz-snapshotter/estargz/zstdchunked"
"github.com/containerd/stargz-snapshotter/fs/config"
commonmetrics "github.com/containerd/stargz-snapshotter/fs/metrics/common"
"github.com/containerd/stargz-snapshotter/fs/reader"
"github.com/containerd/stargz-snapshotter/fs/remote"
"github.com/containerd/stargz-snapshotter/fs/source"
"github.com/containerd/stargz-snapshotter/metadata"
"github.com/containerd/stargz-snapshotter/task"
"github.com/containerd/stargz-snapshotter/util/cacheutil"
"github.com/containerd/stargz-snapshotter/util/namedmutex"
fusefs "github.com/hanwen/go-fuse/v2/fs"
digest "github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
const (
	// defaultResolveResultEntryTTLSec is the default TTL (seconds) for cached
	// resolve results (layers and blobs).
	defaultResolveResultEntryTTLSec = 120
	// defaultMaxLRUCacheEntry is the default capacity of the directory
	// cache's in-memory data LRU.
	defaultMaxLRUCacheEntry = 10
	// defaultMaxCacheFds is the default number of cached open file
	// descriptors kept by the directory cache.
	defaultMaxCacheFds = 10
	// defaultPrefetchTimeoutSec is the default timeout (seconds) for waiting
	// on prefetch completion.
	defaultPrefetchTimeoutSec = 10
	// memoryCacheType selects the in-memory blob cache implementation.
	memoryCacheType = "memory"
)
// Layer represents a layer.
type Layer interface {
	// Info returns the information of this layer.
	Info() Info

	// RootNode returns the root node of this layer.
	RootNode(baseInode uint32) (fusefs.InodeEmbedder, error)

	// Check checks if the layer is still connectable.
	Check() error

	// Refresh refreshes the layer connection.
	Refresh(ctx context.Context, hosts source.RegistryHosts, refspec reference.Spec, desc ocispec.Descriptor) error

	// Verify verifies this layer using the passed TOC Digest.
	// Nop if Verify() or SkipVerify() was already called.
	Verify(tocDigest digest.Digest) (err error)

	// SkipVerify skips verification for this layer.
	// Nop if Verify() or SkipVerify() was already called.
	SkipVerify()

	// Prefetch prefetches the specified size. If the layer is eStargz and contains landmark files,
	// the range indicated by these files is respected.
	Prefetch(prefetchSize int64) error

	// ReadAt reads this layer.
	ReadAt([]byte, int64, ...remote.Option) (int, error)

	// WaitForPrefetchCompletion waits until Prefetch completes.
	WaitForPrefetchCompletion() error

	// BackgroundFetch fetches the entire layer contents to the cache.
	// Fetching contents is done as a background task.
	BackgroundFetch() error

	// Done releases the reference to this layer. The resources related to this layer will be
	// discarded sooner or later. Queries after calling this function won't be serviced.
	Done()
}
// Info is the current status of a layer.
type Info struct {
	Digest       digest.Digest // content digest of the layer blob
	Size         int64         // layer size in bytes
	FetchedSize  int64         // layer fetched size in bytes
	PrefetchSize int64         // layer prefetch size in bytes
	ReadTime     time.Time     // last time the layer was read
	TOCDigest    digest.Digest // digest of the layer's table of contents
}
// Resolver resolves the layer location and provides the handler of that layer.
type Resolver struct {
	rootDir               string              // root directory for on-disk caches
	resolver              *remote.Resolver    // resolves remote blobs
	prefetchTimeout       time.Duration       // timeout for waiting on prefetch completion
	layerCache            *cacheutil.TTLCache // caches resolved layers; guarded by layerCacheMu
	layerCacheMu          sync.Mutex
	blobCache             *cacheutil.TTLCache // caches resolved blobs; guarded by blobCacheMu
	blobCacheMu           sync.Mutex
	backgroundTaskManager *task.BackgroundTaskManager
	resolveLock           *namedmutex.NamedMutex // serializes concurrent resolution of the same layer
	config                config.Config
	metadataStore         metadata.Store
	overlayOpaqueType     OverlayOpaqueType
	// additionalDecompressors, when non-nil, supplies extra decompressors used
	// when reading layer metadata.
	additionalDecompressors func(context.Context, source.RegistryHosts, reference.Spec, ocispec.Descriptor) []metadata.Decompressor
}
// NewResolver returns a new layer resolver rooted at root. Zero-valued TTL
// and prefetch-timeout settings in cfg fall back to package defaults. Cache
// eviction closes the evicted layer/blob so its resources are released.
func NewResolver(root string, backgroundTaskManager *task.BackgroundTaskManager, cfg config.Config, resolveHandlers map[string]remote.Handler, metadataStore metadata.Store, overlayOpaqueType OverlayOpaqueType, additionalDecompressors func(context.Context, source.RegistryHosts, reference.Spec, ocispec.Descriptor) []metadata.Decompressor) (*Resolver, error) {
	resolveResultEntryTTL := time.Duration(cfg.ResolveResultEntryTTLSec) * time.Second
	if resolveResultEntryTTL == 0 {
		resolveResultEntryTTL = defaultResolveResultEntryTTLSec * time.Second
	}
	prefetchTimeout := time.Duration(cfg.PrefetchTimeoutSec) * time.Second
	if prefetchTimeout == 0 {
		prefetchTimeout = defaultPrefetchTimeoutSec * time.Second
	}

	// layerCache caches resolved layers for future use. This is useful in a use-case where
	// the filesystem resolves and caches all layers in an image (not only queried one) in parallel,
	// before they are actually queried.
	layerCache := cacheutil.NewTTLCache(resolveResultEntryTTL)
	layerCache.OnEvicted = func(key string, value interface{}) {
		// Release the evicted layer's resources.
		if err := value.(*layer).close(); err != nil {
			log.L.WithField("key", key).WithError(err).Warnf("failed to clean up layer")
			return
		}
		log.L.WithField("key", key).Debugf("cleaned up layer")
	}

	// blobCache caches resolved blobs for future use. This is especially useful when a layer
	// isn't eStargz/stargz (the *layer object won't be created/cached in this case).
	blobCache := cacheutil.NewTTLCache(resolveResultEntryTTL)
	blobCache.OnEvicted = func(key string, value interface{}) {
		// Close the evicted blob's connection.
		if err := value.(remote.Blob).Close(); err != nil {
			log.L.WithField("key", key).WithError(err).Warnf("failed to clean up blob")
			return
		}
		log.L.WithField("key", key).Debugf("cleaned up blob")
	}

	if err := os.MkdirAll(root, 0700); err != nil {
		return nil, err
	}

	return &Resolver{
		rootDir:                 root,
		resolver:                remote.NewResolver(cfg.BlobConfig, resolveHandlers),
		layerCache:              layerCache,
		blobCache:               blobCache,
		prefetchTimeout:         prefetchTimeout,
		backgroundTaskManager:   backgroundTaskManager,
		config:                  cfg,
		resolveLock:             new(namedmutex.NamedMutex),
		metadataStore:           metadataStore,
		overlayOpaqueType:       overlayOpaqueType,
		additionalDecompressors: additionalDecompressors,
	}, nil
}
// newCache constructs a blob cache rooted at root. When cacheType is
// memoryCacheType an in-memory cache is returned; otherwise a directory cache
// backed by a fresh temporary directory under root is created using the
// directory-cache settings from cfg (zero values fall back to defaults).
func newCache(root string, cacheType string, cfg config.Config) (cache.BlobCache, error) {
	if cacheType == memoryCacheType {
		return cache.NewMemoryCache(), nil
	}

	dcc := cfg.DirectoryCacheConfig
	dataEntries := dcc.MaxLRUCacheEntry
	if dataEntries == 0 {
		dataEntries = defaultMaxLRUCacheEntry
	}
	fdEntries := dcc.MaxCacheFds
	if fdEntries == 0 {
		fdEntries = defaultMaxCacheFds
	}

	bufPool := &sync.Pool{
		New: func() interface{} { return new(bytes.Buffer) },
	}
	dataCache := cacheutil.NewLRUCache(dataEntries)
	dataCache.OnEvicted = func(key string, value interface{}) {
		// Return evicted buffers to the pool for reuse.
		value.(*bytes.Buffer).Reset()
		bufPool.Put(value)
	}
	fdCache := cacheutil.NewLRUCache(fdEntries)
	fdCache.OnEvicted = func(key string, value interface{}) {
		value.(*os.File).Close()
	}

	// Create the cache on a unique directory.
	if err := os.MkdirAll(root, 0700); err != nil {
		return nil, err
	}
	cachePath, err := os.MkdirTemp(root, "")
	if err != nil {
		return nil, fmt.Errorf("failed to initialize directory cache: %w", err)
	}
	return cache.NewDirectoryCache(
		cachePath,
		cache.DirectoryCacheConfig{
			SyncAdd:   dcc.SyncAdd,
			DataCache: dataCache,
			FdCache:   fdCache,
			BufPool:   bufPool,
			Direct:    dcc.Direct,
		},
	)
}
// Resolve resolves a layer based on the passed layer blob information.
// It first consults the layer cache; on a miss it resolves the blob, builds
// the metadata reader over it, and caches the resulting layer. The returned
// Layer holds a cache reference that the caller must release via Done().
func (r *Resolver) Resolve(ctx context.Context, hosts source.RegistryHosts, refspec reference.Spec, desc ocispec.Descriptor, esgzOpts ...metadata.Option) (_ Layer, retErr error) {
	name := refspec.String() + "/" + desc.Digest.String()

	// Wait if resolving this layer is already running. The result
	// can hopefully get from the cache.
	r.resolveLock.Lock(name)
	defer r.resolveLock.Unlock(name)

	ctx = log.WithLogger(ctx, log.G(ctx).WithField("src", name))

	// First, try to retrieve this layer from the underlying cache.
	r.layerCacheMu.Lock()
	c, done, ok := r.layerCache.Get(name)
	r.layerCacheMu.Unlock()
	if ok {
		if l := c.(*layer); l.Check() == nil {
			log.G(ctx).Debugf("hit layer cache %q", name)
			return &layerRef{l, done}, nil
		}
		// Cached layer is invalid; drop the reference and evict it.
		done()
		r.layerCacheMu.Lock()
		r.layerCache.Remove(name)
		r.layerCacheMu.Unlock()
	}

	log.G(ctx).Debugf("resolving")

	// Resolve the blob. On any later failure the blob reference is released
	// by the deferred cleanup below.
	blobR, err := r.resolveBlob(ctx, hosts, refspec, desc)
	if err != nil {
		return nil, fmt.Errorf("failed to resolve the blob: %w", err)
	}
	defer func() {
		if retErr != nil {
			blobR.done()
		}
	}()

	fsCache, err := newCache(filepath.Join(r.rootDir, "fscache"), r.config.FSCacheType, r.config)
	if err != nil {
		return nil, fmt.Errorf("failed to create fs cache: %w", err)
	}
	defer func() {
		if retErr != nil {
			fsCache.Close()
		}
	}()

	// Get a reader for stargz archive.
	// Each file's read operation is a prioritized task and all background tasks
	// will be stopped during the execution so this can avoid being disturbed for
	// NW traffic by background tasks.
	sr := io.NewSectionReader(readerAtFunc(func(p []byte, offset int64) (n int, err error) {
		r.backgroundTaskManager.DoPrioritizedTask()
		defer r.backgroundTaskManager.DonePrioritizedTask()
		return blobR.ReadAt(p, offset)
	}), 0, blobR.Size())

	// define telemetry hooks to measure latency metrics inside estargz package
	telemetry := metadata.Telemetry{
		GetFooterLatency: func(start time.Time) {
			commonmetrics.MeasureLatencyInMilliseconds(commonmetrics.StargzFooterGet, desc.Digest, start)
		},
		GetTocLatency: func(start time.Time) {
			commonmetrics.MeasureLatencyInMilliseconds(commonmetrics.StargzTocGet, desc.Digest, start)
		},
		DeserializeTocLatency: func(start time.Time) {
			commonmetrics.MeasureLatencyInMilliseconds(commonmetrics.DeserializeTocJSON, desc.Digest, start)
		},
	}

	// zstd:chunked support is always enabled; extra decompressors are appended
	// when configured.
	additionalDecompressors := []metadata.Decompressor{new(zstdchunked.Decompressor)}
	if r.additionalDecompressors != nil {
		additionalDecompressors = append(additionalDecompressors, r.additionalDecompressors(ctx, hosts, refspec, desc)...)
	}

	meta, err := r.metadataStore(sr,
		append(esgzOpts, metadata.WithTelemetry(&telemetry), metadata.WithDecompressors(additionalDecompressors...))...)
	if err != nil {
		return nil, err
	}

	vr, err := reader.NewReader(meta, fsCache, desc.Digest)
	if err != nil {
		return nil, fmt.Errorf("failed to read layer: %w", err)
	}

	// Combine layer information together and cache it.
	l := newLayer(r, desc, blobR, vr)
	r.layerCacheMu.Lock()
	cachedL, done2, added := r.layerCache.Add(name, l)
	r.layerCacheMu.Unlock()
	if !added {
		l.close() // layer already exists in the cache. discard this.
	}

	log.G(ctx).Debugf("resolved")
	return &layerRef{cachedL.(*layer), done2}, nil
}
// resolveBlob resolves a blob based on the passed layer blob information.
// It consults the blob cache first and re-resolves when the cached blob is
// stale. The returned blobRef holds a cache reference released via done().
func (r *Resolver) resolveBlob(ctx context.Context, hosts source.RegistryHosts, refspec reference.Spec, desc ocispec.Descriptor) (_ *blobRef, retErr error) {
	name := refspec.String() + "/" + desc.Digest.String()

	// Try to retrieve the blob from the underlying cache.
	r.blobCacheMu.Lock()
	c, done, ok := r.blobCache.Get(name)
	r.blobCacheMu.Unlock()
	if ok {
		if blob := c.(remote.Blob); blob.Check() == nil {
			return &blobRef{blob, done}, nil
		}
		// invalid blob. discard this.
		done()
		r.blobCacheMu.Lock()
		r.blobCache.Remove(name)
		r.blobCacheMu.Unlock()
	}

	httpCache, err := newCache(filepath.Join(r.rootDir, "httpcache"), r.config.HTTPCacheType, r.config)
	if err != nil {
		return nil, fmt.Errorf("failed to create http cache: %w", err)
	}
	defer func() {
		if retErr != nil {
			httpCache.Close()
		}
	}()

	// Resolve the blob and cache the result.
	b, err := r.resolver.Resolve(ctx, hosts, refspec, desc, httpCache)
	if err != nil {
		return nil, fmt.Errorf("failed to resolve the source: %w", err)
	}
	r.blobCacheMu.Lock()
	cachedB, done, added := r.blobCache.Add(name, b)
	r.blobCacheMu.Unlock()
	if !added {
		b.Close() // blob already exists in the cache. discard this.
	}
	return &blobRef{cachedB.(remote.Blob), done}, nil
}
// newLayer assembles a layer object from its resolved components.
func newLayer(
	resolver *Resolver,
	desc ocispec.Descriptor,
	blob *blobRef,
	vr *reader.VerifiableReader,
) *layer {
	l := &layer{
		resolver:         resolver,
		desc:             desc,
		blob:             blob,
		verifiableReader: vr,
	}
	l.prefetchWaiter = newWaiter()
	return l
}
// layer is the concrete Layer implementation backed by a resolved remote blob
// and a verifiable metadata reader.
type layer struct {
	resolver         *Resolver
	desc             ocispec.Descriptor
	blob             *blobRef
	verifiableReader *reader.VerifiableReader
	prefetchWaiter   *waiter // signaled when prefetch completes

	prefetchSize   int64      // recorded prefetch size; guarded by prefetchSizeMu
	prefetchSizeMu sync.Mutex

	r reader.Reader // set by Verify()/SkipVerify(); nil until then

	closed   bool       // set by close(); guarded by closedMu
	closedMu sync.Mutex

	prefetchOnce        sync.Once // ensures Prefetch runs at most once
	backgroundFetchOnce sync.Once // ensures BackgroundFetch runs at most once
}
// Info returns the current status of this layer. ReadTime is zero until the
// layer has been verified (or had verification skipped).
func (l *layer) Info() Info {
	var lastRead time.Time
	if l.r != nil {
		lastRead = l.r.LastOnDemandReadTime()
	}
	info := Info{
		Digest:       l.desc.Digest,
		Size:         l.blob.Size(),
		FetchedSize:  l.blob.FetchedSize(),
		PrefetchSize: l.prefetchedSize(),
		ReadTime:     lastRead,
		TOCDigest:    l.verifiableReader.Metadata().TOCDigest(),
	}
	return info
}
// prefetchedSize returns the recorded prefetch size under its mutex.
func (l *layer) prefetchedSize() int64 {
	l.prefetchSizeMu.Lock()
	defer l.prefetchSizeMu.Unlock()
	return l.prefetchSize
}
// Check reports whether this layer is still open and its blob reachable.
func (l *layer) Check() error {
	if closed := l.isClosed(); closed {
		return fmt.Errorf("layer is already closed")
	}
	return l.blob.Check()
}
// Refresh re-establishes this layer's blob connection using fresh source
// information.
func (l *layer) Refresh(ctx context.Context, hosts source.RegistryHosts, refspec reference.Spec, desc ocispec.Descriptor) error {
	if closed := l.isClosed(); closed {
		return fmt.Errorf("layer is already closed")
	}
	return l.blob.Refresh(ctx, hosts, refspec, desc)
}
// Verify validates this layer's TOC against tocDigest and installs the
// verified reader. It is a no-op when verification already happened or was
// skipped.
func (l *layer) Verify(tocDigest digest.Digest) error {
	if l.isClosed() {
		return fmt.Errorf("layer is already closed")
	}
	if l.r != nil {
		// Already verified (or verification was skipped).
		return nil
	}
	r, err := l.verifiableReader.VerifyTOC(tocDigest)
	l.r = r
	return err
}
// SkipVerify installs an unverified reader. No-op once a reader exists.
func (l *layer) SkipVerify() {
	if l.r == nil {
		l.r = l.verifiableReader.SkipVerify()
	}
}
// Prefetch fetches this layer's prefetch range exactly once, running as a
// prioritized task so background fetches don't compete for bandwidth.
func (l *layer) Prefetch(prefetchSize int64) (err error) {
	l.prefetchOnce.Do(func() {
		ctx := context.Background()
		l.resolver.backgroundTaskManager.DoPrioritizedTask()
		defer l.resolver.backgroundTaskManager.DonePrioritizedTask()
		if err = l.prefetch(ctx, prefetchSize); err != nil {
			log.G(ctx).WithError(err).Warnf("failed to prefetch layer=%v", l.desc.Digest)
			return
		}
		log.G(ctx).Debug("completed to prefetch")
	})
	return
}
// prefetch downloads and decompresses the head of this layer. The requested
// prefetchSize is overridden by eStargz landmark files when present: a
// no-prefetch landmark disables prefetch entirely, while a prefetch landmark
// pins the range to the landmark's offset. Always signals prefetchWaiter on
// return so waiters are released even on failure.
func (l *layer) prefetch(ctx context.Context, prefetchSize int64) error {
	defer l.prefetchWaiter.done() // Notify the completion
	// Measuring the total time to complete prefetch (use defer func() because l.Info().PrefetchSize is set later)
	start := time.Now()
	defer func() {
		commonmetrics.WriteLatencyWithBytesLogValue(ctx, l.desc.Digest, commonmetrics.PrefetchTotal, start, commonmetrics.PrefetchSize, l.prefetchedSize())
	}()
	if l.isClosed() {
		return fmt.Errorf("layer is already closed")
	}
	rootID := l.verifiableReader.Metadata().RootID()
	if _, _, err := l.verifiableReader.Metadata().GetChild(rootID, estargz.NoPrefetchLandmark); err == nil {
		// do not prefetch this layer
		return nil
	} else if id, _, err := l.verifiableReader.Metadata().GetChild(rootID, estargz.PrefetchLandmark); err == nil {
		offset, err := l.verifiableReader.Metadata().GetOffset(id)
		if err != nil {
			return fmt.Errorf("failed to get offset of prefetch landmark: %w", err)
		}
		// override the prefetch size with optimized value
		prefetchSize = offset
	} else if prefetchSize > l.blob.Size() {
		// adjust prefetch size not to exceed the whole layer size
		prefetchSize = l.blob.Size()
	}

	// Fetch the target range
	downloadStart := time.Now()
	err := l.blob.Cache(0, prefetchSize)
	commonmetrics.WriteLatencyLogValue(ctx, l.desc.Digest, commonmetrics.PrefetchDownload, downloadStart) // time to download prefetch data
	if err != nil {
		return fmt.Errorf("failed to prefetch layer: %w", err)
	}

	// Set prefetch size for metrics after prefetch completed
	l.prefetchSizeMu.Lock()
	l.prefetchSize = prefetchSize
	l.prefetchSizeMu.Unlock()

	// Cache uncompressed contents of the prefetched range
	decompressStart := time.Now()
	err = l.verifiableReader.Cache(reader.WithFilter(func(offset int64) bool {
		return offset < prefetchSize // Cache only prefetch target
	}))
	commonmetrics.WriteLatencyLogValue(ctx, l.desc.Digest, commonmetrics.PrefetchDecompress, decompressStart) // time to decompress prefetch data
	if err != nil {
		return fmt.Errorf("failed to cache prefetched layer: %w", err)
	}
	return nil
}
// WaitForPrefetchCompletion blocks until Prefetch finishes or the resolver's
// prefetch timeout elapses.
func (l *layer) WaitForPrefetchCompletion() error {
	if closed := l.isClosed(); closed {
		return fmt.Errorf("layer is already closed")
	}
	return l.prefetchWaiter.wait(l.resolver.prefetchTimeout)
}
// BackgroundFetch downloads the entire layer contents into the cache exactly
// once, as a background task.
func (l *layer) BackgroundFetch() (err error) {
	l.backgroundFetchOnce.Do(func() {
		ctx := context.Background()
		if err = l.backgroundFetch(ctx); err != nil {
			log.G(ctx).WithError(err).Warnf("failed to fetch whole layer=%v", l.desc.Digest)
			return
		}
		log.G(ctx).Debug("completed to fetch all layer data in background")
	})
	return
}
// backgroundFetch reads and caches the entire layer through the background
// task manager so it yields to prioritized (on-demand) reads. Reads go
// directly to the cache (cache.Direct) to avoid polluting the memory cache.
func (l *layer) backgroundFetch(ctx context.Context) error {
	defer commonmetrics.WriteLatencyLogValue(ctx, l.desc.Digest, commonmetrics.BackgroundFetchTotal, time.Now())
	if l.isClosed() {
		return fmt.Errorf("layer is already closed")
	}
	br := io.NewSectionReader(readerAtFunc(func(p []byte, offset int64) (retN int, retErr error) {
		l.resolver.backgroundTaskManager.InvokeBackgroundTask(func(ctx context.Context) {
			// Measuring the time to download background fetch data (in milliseconds)
			defer commonmetrics.MeasureLatencyInMilliseconds(commonmetrics.BackgroundFetchDownload, l.Info().Digest, time.Now()) // time to download background fetch data
			retN, retErr = l.blob.ReadAt(
				p,
				offset,
				remote.WithContext(ctx),              // Make cancellable
				remote.WithCacheOpts(cache.Direct()), // Do not pollute mem cache
			)
		}, 120*time.Second)
		return
	}), 0, l.blob.Size())
	defer commonmetrics.WriteLatencyLogValue(ctx, l.desc.Digest, commonmetrics.BackgroundFetchDecompress, time.Now()) // time to decompress background fetch data (in milliseconds)
	return l.verifiableReader.Cache(
		reader.WithReader(br),                // Read contents in background
		reader.WithCacheOpts(cache.Direct()), // Do not pollute mem cache
	)
}
// Done releases this reference to the layer in the layer cache.
func (l *layerRef) Done() {
	l.done()
}
// RootNode returns the root FUSE node of this layer. The layer must already
// have been verified (or had verification skipped).
func (l *layer) RootNode(baseInode uint32) (fusefs.InodeEmbedder, error) {
	switch {
	case l.isClosed():
		return nil, fmt.Errorf("layer is already closed")
	case l.r == nil:
		return nil, fmt.Errorf("layer hasn't been verified yet")
	}
	return newNode(l.desc.Digest, l.r, l.blob, baseInode, l.resolver.overlayOpaqueType)
}
// ReadAt reads from this layer's blob at offset, forwarding the given remote
// options to the underlying fetch.
func (l *layer) ReadAt(p []byte, offset int64, opts ...remote.Option) (int, error) {
	return l.blob.ReadAt(p, offset, opts...)
}
// close releases this layer's resources exactly once: the readers are closed
// first, then the blob reference is released (the deferred call runs last).
// Subsequent calls are no-ops returning nil.
func (l *layer) close() error {
	l.closedMu.Lock()
	defer l.closedMu.Unlock()
	if l.closed {
		return nil
	}
	l.closed = true
	defer l.blob.done() // Close reader first, then close the blob
	l.verifiableReader.Close()
	if l.r != nil {
		return l.r.Close()
	}
	return nil
}
// isClosed reports whether close() has been called on this layer.
func (l *layer) isClosed() bool {
	l.closedMu.Lock()
	defer l.closedMu.Unlock()
	return l.closed
}
// blobRef is a reference to the blob in the cache. Calling `done` decreases the reference counter
// of this blob in the underlying cache. When nobody refers to the blob in the cache, resources bound
// to this blob will be discarded.
type blobRef struct {
	remote.Blob
	done func() // releases this reference in the blob cache
}
// layerRef is a reference to the layer in the cache. Calling `Done` or `done` decreases the
// reference counter of this layer in the underlying cache. When nobody refers to the layer in the
// cache, resources bound to this layer will be discarded.
type layerRef struct {
	*layer
	done func() // releases this reference in the layer cache
}
func newWaiter() *waiter {
return &waiter{
completionCond: sync.NewCond(&sync.Mutex{}),
}
}
type waiter struct {
isDone bool
isDoneMu sync.Mutex
completionCond *sync.Cond
}
func (w *waiter) done() {
w.isDoneMu.Lock()
w.isDone = true
w.isDoneMu.Unlock()
w.completionCond.Broadcast()
}
func (w *waiter) wait(timeout time.Duration) error {
wait := func() <-chan struct{} {
ch := make(chan struct{})
go func() {
w.isDoneMu.Lock()
isDone := w.isDone
w.isDoneMu.Unlock()
w.completionCond.L.Lock()
if !isDone {
w.completionCond.Wait()
}
w.completionCond.L.Unlock()
ch <- struct{}{}
}()
return ch
}
select {
case <-time.After(timeout):
w.isDoneMu.Lock()
w.isDone = true
w.isDoneMu.Unlock()
w.completionCond.Broadcast()
return fmt.Errorf("timeout(%v)", timeout)
case <-wait():
return nil
}
}
// readerAtFunc adapts a plain function to the io.ReaderAt interface.
type readerAtFunc func([]byte, int64) (int, error)

// ReadAt implements io.ReaderAt by delegating to the wrapped function.
func (f readerAtFunc) ReadAt(p []byte, offset int64) (int, error) {
	return f(p, offset)
}

View File

@@ -0,0 +1,806 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the NOTICE.md file.
*/
package layer
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"os"
"sort"
"strings"
"sync"
"syscall"
"time"
"github.com/containerd/log"
"github.com/containerd/stargz-snapshotter/estargz"
commonmetrics "github.com/containerd/stargz-snapshotter/fs/metrics/common"
"github.com/containerd/stargz-snapshotter/fs/reader"
"github.com/containerd/stargz-snapshotter/fs/remote"
"github.com/containerd/stargz-snapshotter/metadata"
fusefs "github.com/hanwen/go-fuse/v2/fs"
"github.com/hanwen/go-fuse/v2/fuse"
digest "github.com/opencontainers/go-digest"
"golang.org/x/sys/unix"
)
const (
	// blockSize is the logical block size this filesystem reports.
	blockSize = 4096
	// physicalBlockSize is the unit size used for stat's block counts.
	physicalBlockSize = 512
	// physicalBlockRatio is the ratio of blockSize to physicalBlockSize.
	// It can be used to convert from # blockSize-byte blocks to # physicalBlockSize-byte blocks
	physicalBlockRatio = blockSize / physicalBlockSize
	// whiteoutPrefix is the name prefix of whiteout entries.
	whiteoutPrefix = ".wh."
	// whiteoutOpaqueDir marks a directory whose lower-layer contents are hidden.
	whiteoutOpaqueDir = whiteoutPrefix + whiteoutPrefix + ".opq"
	// opaqueXattrValue is the value exposed for overlayfs opaque xattrs.
	opaqueXattrValue = "y"
	// stateDirName is the name of the per-layer state directory shown in "/".
	stateDirName = ".stargz-snapshotter"

	statFileMode = syscall.S_IFREG | 0400 // -r--------
	stateDirMode = syscall.S_IFDIR | 0500 // dr-x------
)
// OverlayOpaqueType selects which xattr namespace(s) are used to expose
// overlayfs opaque-directory markers.
type OverlayOpaqueType int

const (
	// OverlayOpaqueAll exposes both trusted. and user. opaque xattrs.
	OverlayOpaqueAll OverlayOpaqueType = iota
	// OverlayOpaqueTrusted exposes only trusted.overlay.opaque.
	OverlayOpaqueTrusted
	// OverlayOpaqueUser exposes only user.overlay.opaque.
	OverlayOpaqueUser
)

// opaqueXattrs maps each OverlayOpaqueType to the xattr names it exposes.
var opaqueXattrs = map[OverlayOpaqueType][]string{
	OverlayOpaqueAll:     {"trusted.overlay.opaque", "user.overlay.opaque"},
	OverlayOpaqueTrusted: {"trusted.overlay.opaque"},
	OverlayOpaqueUser:    {"user.overlay.opaque"},
}
// newNode builds the root FUSE node for the layer identified by layerDgst,
// backed by reader r and blob. baseInode forms the upper 32 bits of every
// inode number issued for this layer; opaque selects which overlayfs opaque
// xattrs are exposed.
func newNode(layerDgst digest.Digest, r reader.Reader, blob remote.Blob, baseInode uint32, opaque OverlayOpaqueType) (fusefs.InodeEmbedder, error) {
	rootID := r.Metadata().RootID()
	rootAttr, err := r.Metadata().GetAttr(rootID)
	if err != nil {
		return nil, err
	}
	opq, ok := opaqueXattrs[opaque]
	if !ok {
		// Lowercase error string per Go convention (was "Unknown ...").
		return nil, errors.New("unknown overlay opaque type")
	}
	ffs := &fs{
		r:            r,
		layerDigest:  layerDgst,
		baseInode:    baseInode,
		rootID:       rootID,
		opaqueXattrs: opq,
	}
	ffs.s = ffs.newState(layerDgst, blob)
	return &node{
		id:   rootID,
		attr: rootAttr,
		fs:   ffs,
	}, nil
}
// fs contains global metadata used by nodes
type fs struct {
	r            reader.Reader // layer content reader
	s            *state        // per-layer state directory node
	layerDigest  digest.Digest // digest of the layer this fs serves
	baseInode    uint32        // upper 32 bits of every inode issued for this layer
	rootID       uint32        // metadata ID of the layer's root directory
	opaqueXattrs []string      // xattr names advertised for opaque directories
}
// inodeOfState returns the reserved inode number of the state directory.
func (fs *fs) inodeOfState() uint64 {
	return uint64(fs.baseInode)<<32 | 1 // reserved
}
// inodeOfStatFile returns the reserved inode number of the stat file.
func (fs *fs) inodeOfStatFile() uint64 {
	return uint64(fs.baseInode)<<32 | 2 // reserved
}
// inodeOfID maps a metadata entry ID to an inode number under this layer's
// baseInode prefix. Inode values 0-2 are reserved (0 by go-fuse, 1 and 2 by
// the state dir), so entry IDs are shifted up by 3.
func (fs *fs) inodeOfID(id uint32) (uint64, error) {
	const reserved = 3
	if id > ^uint32(0)-reserved {
		return 0, fmt.Errorf("too many inodes")
	}
	return uint64(fs.baseInode)<<32 | uint64(reserved+id), nil
}
// node is a filesystem inode abstraction.
type node struct {
	fusefs.Inode
	fs   *fs
	id   uint32        // metadata entry ID of this node
	attr metadata.Attr // cached attributes of this entry

	ents       []fuse.DirEntry // cached directory entries; guarded by entsMu
	entsCached bool            // whether ents has been populated
	entsMu     sync.Mutex
}
// isRootNode reports whether n is the root directory of this layer.
func (n *node) isRootNode() bool {
	return n.fs.rootID == n.id
}
// isOpaque reports whether this directory contains an opaque-whiteout marker.
func (n *node) isOpaque() bool {
	_, _, err := n.fs.r.Metadata().GetChild(n.id, whiteoutOpaqueDir)
	return err == nil
}
var _ = (fusefs.InodeEmbedder)((*node)(nil))

var _ = (fusefs.NodeReaddirer)((*node)(nil))

// Readdir lists this directory's entries as a go-fuse DirStream.
func (n *node) Readdir(ctx context.Context) (fusefs.DirStream, syscall.Errno) {
	entries, errno := n.readdir()
	if errno != 0 {
		return nil, errno
	}
	return fusefs.NewListDirStream(entries), 0
}
// readdir returns this directory's entries, computing and caching them on the
// first call. Prefetch landmarks are hidden at the root, whiteout entries are
// converted to overlayfs-style char-device entries (unless shadowed by a
// normal entry of the same name), and the result is sorted by name.
func (n *node) readdir() ([]fuse.DirEntry, syscall.Errno) {
	// Measure how long node_readdir operation takes (in microseconds).
	start := time.Now() // set start time
	defer commonmetrics.MeasureLatencyInMicroseconds(commonmetrics.NodeReaddir, n.fs.layerDigest, start)

	n.entsMu.Lock()
	if n.entsCached {
		ents := n.ents
		n.entsMu.Unlock()
		return ents, 0
	}
	n.entsMu.Unlock()

	isRoot := n.isRootNode()
	var ents []fuse.DirEntry
	whiteouts := map[string]uint32{}
	normalEnts := map[string]bool{}
	var lastErr error
	if err := n.fs.r.Metadata().ForeachChild(n.id, func(name string, id uint32, mode os.FileMode) bool {
		// We don't want to show prefetch landmarks in "/".
		if isRoot && (name == estargz.PrefetchLandmark || name == estargz.NoPrefetchLandmark) {
			return true
		}
		// We don't want to show whiteouts.
		if strings.HasPrefix(name, whiteoutPrefix) {
			if name == whiteoutOpaqueDir {
				return true
			}
			// Add the overlayfs-compliant whiteout later.
			whiteouts[name] = id
			return true
		}
		// This is a normal entry.
		normalEnts[name] = true
		ino, err := n.fs.inodeOfID(id)
		if err != nil {
			lastErr = err
			return false
		}
		ents = append(ents, fuse.DirEntry{
			Mode: fileModeToSystemMode(mode),
			Name: name,
			Ino:  ino,
		})
		return true
	}); err != nil || lastErr != nil {
		n.fs.s.report(fmt.Errorf("node.Readdir: err = %v; lastErr = %v", err, lastErr))
		return nil, syscall.EIO
	}

	// Append whiteouts if no entry replaces the target entry in the lower layer.
	for w, id := range whiteouts {
		if !normalEnts[w[len(whiteoutPrefix):]] {
			ino, err := n.fs.inodeOfID(id)
			if err != nil {
				n.fs.s.report(fmt.Errorf("node.Readdir: err = %v; lastErr = %v", err, lastErr))
				return nil, syscall.EIO
			}
			ents = append(ents, fuse.DirEntry{
				Mode: syscall.S_IFCHR,
				Name: w[len(whiteoutPrefix):],
				Ino:  ino,
			})
		}
	}

	// Avoid nondeterministic order of entries on each call
	sort.Slice(ents, func(i, j int) bool {
		return ents[i].Name < ents[j].Name
	})
	n.entsMu.Lock()
	defer n.entsMu.Unlock()
	n.ents, n.entsCached = ents, true // cache it
	return ents, 0
}
var _ = (fusefs.NodeLookuper)((*node)(nil))

// Lookup resolves the child `name` of this directory: it hides prefetch
// landmarks and raw whiteout names, serves the synthetic state directory at
// the root, reuses already-instantiated in-memory child nodes, and otherwise
// consults the layer metadata (mapping whiteout metadata entries to
// overlayfs-styled whiteout nodes).
func (n *node) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*fusefs.Inode, syscall.Errno) {
	isRoot := n.isRootNode()

	// We don't want to show prefetch landmarks in "/".
	if isRoot && (name == estargz.PrefetchLandmark || name == estargz.NoPrefetchLandmark) {
		return nil, syscall.ENOENT
	}

	// We don't want to show whiteouts.
	if strings.HasPrefix(name, whiteoutPrefix) {
		return nil, syscall.ENOENT
	}

	// state directory
	if isRoot && name == stateDirName {
		return n.NewInode(ctx, n.fs.s, n.fs.stateToAttr(&out.Attr)), 0
	}

	// lookup on memory nodes
	if cn := n.GetChild(name); cn != nil {
		switch tn := cn.Operations().(type) {
		case *node:
			ino, err := n.fs.inodeOfID(tn.id)
			if err != nil {
				n.fs.s.report(fmt.Errorf("node.Lookup: %v", err))
				return nil, syscall.EIO
			}
			entryToAttr(ino, tn.attr, &out.Attr)
		case *whiteout:
			ino, err := n.fs.inodeOfID(tn.id)
			if err != nil {
				n.fs.s.report(fmt.Errorf("node.Lookup: %v", err))
				return nil, syscall.EIO
			}
			entryToAttr(ino, tn.attr, &out.Attr)
		default:
			// Fixed typo in the reported message ("uknown" -> "unknown").
			n.fs.s.report(fmt.Errorf("node.Lookup: unknown node type detected"))
			return nil, syscall.EIO
		}
		return cn, 0
	}

	// early return if this entry doesn't exist
	n.entsMu.Lock()
	if n.entsCached {
		var found bool
		for _, e := range n.ents {
			if e.Name == name {
				found = true
			}
		}
		if !found {
			n.entsMu.Unlock()
			return nil, syscall.ENOENT
		}
	}
	n.entsMu.Unlock()

	id, ce, err := n.fs.r.Metadata().GetChild(n.id, name)
	if err != nil {
		// If the entry exists as a whiteout, show an overlayfs-styled whiteout node.
		if whID, wh, err := n.fs.r.Metadata().GetChild(n.id, fmt.Sprintf("%s%s", whiteoutPrefix, name)); err == nil {
			ino, err := n.fs.inodeOfID(whID)
			if err != nil {
				n.fs.s.report(fmt.Errorf("node.Lookup: %v", err))
				return nil, syscall.EIO
			}
			return n.NewInode(ctx, &whiteout{
				id:   whID,
				fs:   n.fs,
				attr: wh,
			}, entryToWhAttr(ino, wh, &out.Attr)), 0
		}
		n.readdir() // This code path is very expensive. Cache child entries here so that the next call don't reach here.
		return nil, syscall.ENOENT
	}
	ino, err := n.fs.inodeOfID(id)
	if err != nil {
		n.fs.s.report(fmt.Errorf("node.Lookup: %v", err))
		return nil, syscall.EIO
	}
	return n.NewInode(ctx, &node{
		id:   id,
		fs:   n.fs,
		attr: ce,
	}, entryToAttr(ino, ce, &out.Attr)), 0
}
var _ = (fusefs.NodeOpener)((*node)(nil))

// Open returns a read-only handle for this node's contents. FOPEN_KEEP_CACHE
// asks the kernel to retain page-cache data across opens.
func (n *node) Open(ctx context.Context, flags uint32) (fusefs.FileHandle, uint32, syscall.Errno) {
	ra, err := n.fs.r.OpenFile(n.id)
	if err != nil {
		n.fs.s.report(fmt.Errorf("node.Open: %v", err))
		return nil, 0, syscall.EIO
	}
	fh := &file{n: n, ra: ra}
	return fh, fuse.FOPEN_KEEP_CACHE, 0
}
var _ = (fusefs.NodeGetattrer)((*node)(nil))

// Getattr fills out with this node's cached attributes.
func (n *node) Getattr(ctx context.Context, f fusefs.FileHandle, out *fuse.AttrOut) syscall.Errno {
	ino, inoErr := n.fs.inodeOfID(n.id)
	if inoErr != nil {
		n.fs.s.report(fmt.Errorf("node.Getattr: %v", inoErr))
		return syscall.EIO
	}
	entryToAttr(ino, n.attr, &out.Attr)
	return 0
}
var _ = (fusefs.NodeGetxattrer)((*node)(nil))

// Getxattr returns the value of the named extended attribute. Opaque
// directories additionally answer for the configured overlayfs opaque xattrs.
// Returns ERANGE with the needed size when dest is too small.
func (n *node) Getxattr(ctx context.Context, attr string, dest []byte) (uint32, syscall.Errno) {
	if n.isOpaque() {
		for _, name := range n.fs.opaqueXattrs {
			if attr != name {
				continue
			}
			// This node is an opaque directory; give the overlayfs-compliant indicator.
			if len(dest) < len(opaqueXattrValue) {
				return uint32(len(opaqueXattrValue)), syscall.ERANGE
			}
			return uint32(copy(dest, opaqueXattrValue)), 0
		}
	}
	if v, ok := n.attr.Xattrs[attr]; ok {
		if len(dest) < len(v) {
			return uint32(len(v)), syscall.ERANGE
		}
		return uint32(copy(dest, v)), 0
	}
	return 0, syscall.ENODATA
}
var _ = (fusefs.NodeListxattrer)((*node)(nil))

// Listxattr writes the NUL-separated list of extended attribute names into
// dest, prepending the overlayfs opaque-indicator names for opaque
// directories. If dest is too small, the required size plus ERANGE is
// returned instead.
func (n *node) Listxattr(ctx context.Context, dest []byte) (uint32, syscall.Errno) {
	var names []byte
	if n.isOpaque() {
		// This node is an opaque directory so add overlayfs-compliant indicator.
		for _, x := range n.fs.opaqueXattrs {
			names = append(names, x...)
			names = append(names, 0)
		}
	}
	for k := range n.attr.Xattrs {
		names = append(names, k...)
		names = append(names, 0)
	}
	if len(dest) < len(names) {
		return uint32(len(names)), syscall.ERANGE
	}
	return uint32(copy(dest, names)), 0
}
var _ = (fusefs.NodeReadlinker)((*node)(nil))

// Readlink returns the symlink target recorded in the layer metadata.
func (n *node) Readlink(ctx context.Context) ([]byte, syscall.Errno) {
	return []byte(n.attr.LinkName), 0
}
var _ = (fusefs.NodeStatfser)((*node)(nil))

// Statfs reports placeholder filesystem-wide statistics (see defaultStatfs).
func (n *node) Statfs(ctx context.Context, out *fuse.StatfsOut) syscall.Errno {
	defaultStatfs(out)
	return 0
}

// file is a file abstraction which implements file handle in go-fuse.
type file struct {
	n  *node       // node this handle was opened from
	ra io.ReaderAt // reader over the (possibly lazily fetched) file contents
}

var _ = (fusefs.FileReader)((*file)(nil))

// Read reads up to len(dest) bytes at off from the layer contents.
// Each call is accounted in the per-layer on-demand read metrics.
func (f *file) Read(ctx context.Context, dest []byte, off int64) (fuse.ReadResult, syscall.Errno) {
	defer commonmetrics.MeasureLatencyInMicroseconds(commonmetrics.ReadOnDemand, f.n.fs.layerDigest, time.Now()) // measure time for on-demand file reads (in microseconds)
	defer commonmetrics.IncOperationCount(commonmetrics.OnDemandReadAccessCount, f.n.fs.layerDigest)             // increment the counter for on-demand file accesses
	n, err := f.ra.ReadAt(dest, off)
	// A short read at EOF is not an error for FUSE; return what was read.
	if err != nil && err != io.EOF {
		f.n.fs.s.report(fmt.Errorf("file.Read: %v", err))
		return nil, syscall.EIO
	}
	return fuse.ReadResultData(dest[:n]), 0
}

var _ = (fusefs.FileGetattrer)((*file)(nil))

// Getattr fills out.Attr with the attributes of the node backing this handle.
func (f *file) Getattr(ctx context.Context, out *fuse.AttrOut) syscall.Errno {
	ino, err := f.n.fs.inodeOfID(f.n.id)
	if err != nil {
		f.n.fs.s.report(fmt.Errorf("file.Getattr: %v", err))
		return syscall.EIO
	}
	entryToAttr(ino, f.n.attr, &out.Attr)
	return 0
}
// whiteout is a whiteout abstraction compliant to overlayfs.
// It presents as a 0/0 character device (see entryToWhAttr).
type whiteout struct {
	fusefs.Inode
	id   uint32        // metadata ID of the whiteout entry
	fs   *fs           // owning filesystem
	attr metadata.Attr // attributes of the underlying whiteout entry
}

var _ = (fusefs.NodeGetattrer)((*whiteout)(nil))

// Getattr fills out.Attr with overlayfs-styled whiteout attributes.
func (w *whiteout) Getattr(ctx context.Context, f fusefs.FileHandle, out *fuse.AttrOut) syscall.Errno {
	ino, err := w.fs.inodeOfID(w.id)
	if err != nil {
		w.fs.s.report(fmt.Errorf("whiteout.Getattr: %v", err))
		return syscall.EIO
	}
	entryToWhAttr(ino, w.attr, &out.Attr)
	return 0
}

var _ = (fusefs.NodeStatfser)((*whiteout)(nil))

// Statfs reports placeholder filesystem-wide statistics (see defaultStatfs).
func (w *whiteout) Statfs(ctx context.Context, out *fuse.StatfsOut) syscall.Errno {
	defaultStatfs(out)
	return 0
}
// newState provides new state directory node.
// It creates statFile at the same time to give it stable inode number.
func (fs *fs) newState(layerDigest digest.Digest, blob remote.Blob) *state {
	return &state{
		statFile: &statFile{
			name: layerDigest.String() + ".json",
			statJSON: statJSON{
				Digest: layerDigest.String(),
				Size:   blob.Size(),
			},
			blob: blob,
			fs:   fs,
		},
		fs: fs,
	}
}

// state is a directory which contain a "state file" of this layer aiming to
// observability. This filesystem uses it to report something(e.g. error) to
// the clients(e.g. Kubernetes's livenessProbe).
// This directory has mode "dr-x------ root root".
type state struct {
	fusefs.Inode
	statFile *statFile // the single file exposed by this directory
	fs       *fs       // owning filesystem
}

var _ = (fusefs.NodeReaddirer)((*state)(nil))

// Readdir lists the single stat file contained in this directory.
func (s *state) Readdir(ctx context.Context) (fusefs.DirStream, syscall.Errno) {
	return fusefs.NewListDirStream([]fuse.DirEntry{
		{
			Mode: statFileMode,
			Name: s.statFile.name,
			Ino:  s.fs.inodeOfStatFile(),
		},
	}), 0
}

var _ = (fusefs.NodeLookuper)((*state)(nil))

// Lookup resolves name within this directory; only the stat file exists.
func (s *state) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*fusefs.Inode, syscall.Errno) {
	if name != s.statFile.name {
		return nil, syscall.ENOENT
	}
	attr, errno := s.statFile.attr(&out.Attr)
	if errno != 0 {
		return nil, errno
	}
	return s.NewInode(ctx, s.statFile, attr), 0
}

var _ = (fusefs.NodeGetattrer)((*state)(nil))

// Getattr fills out.Attr with the fixed attributes of the state directory.
func (s *state) Getattr(ctx context.Context, f fusefs.FileHandle, out *fuse.AttrOut) syscall.Errno {
	s.fs.stateToAttr(&out.Attr)
	return 0
}

var _ = (fusefs.NodeStatfser)((*state)(nil))

// Statfs reports placeholder filesystem-wide statistics (see defaultStatfs).
func (s *state) Statfs(ctx context.Context, out *fuse.StatfsOut) syscall.Errno {
	defaultStatfs(out)
	return 0
}

// report records err in the stat file so clients observing it can see the
// most recent failure of this layer.
func (s *state) report(err error) {
	s.statFile.report(err)
}
// statJSON is the JSON document served through the stat file.
type statJSON struct {
	Error  string `json:"error,omitempty"` // most recently reported error, if any
	Digest string `json:"digest"`          // digest of this layer
	// URL is excluded for potential security reason
	Size           int64   `json:"size"`           // size of the layer blob
	FetchedSize    int64   `json:"fetchedSize"`    // bytes fetched so far
	FetchedPercent float64 `json:"fetchedPercent"` // Fetched / Size * 100.0
}

// statFile is a file which contain something to be reported from this layer.
// This filesystem uses statFile.report() to report something(e.g. error) to
// the clients(e.g. Kubernetes's livenessProbe).
// This file has mode "-r-------- root root".
type statFile struct {
	fusefs.Inode
	name     string      // file name (layer digest + ".json")
	blob     remote.Blob // blob whose fetch progress is reported
	statJSON statJSON    // current contents; guarded by mu
	mu       sync.Mutex  // guards statJSON
	fs       *fs         // owning filesystem
}

var _ = (fusefs.NodeOpener)((*statFile)(nil))

// Open returns no dedicated handle; reads go through the node itself.
func (sf *statFile) Open(ctx context.Context, flags uint32) (fh fusefs.FileHandle, fuseFlags uint32, errno syscall.Errno) {
	return nil, 0, 0
}

var _ = (fusefs.NodeReader)((*statFile)(nil))

// Read regenerates the stat JSON and serves the requested byte range of it.
func (sf *statFile) Read(ctx context.Context, f fusefs.FileHandle, dest []byte, off int64) (fuse.ReadResult, syscall.Errno) {
	sf.mu.Lock()
	defer sf.mu.Unlock()
	st, err := sf.updateStatUnlocked()
	if err != nil {
		return nil, syscall.EIO
	}
	n, err := bytes.NewReader(st).ReadAt(dest, off)
	if err != nil && err != io.EOF {
		return nil, syscall.EIO
	}
	return fuse.ReadResultData(dest[:n]), 0
}

var _ = (fusefs.NodeGetattrer)((*statFile)(nil))

// Getattr fills out.Attr with the stat file attributes (size reflects the
// freshly regenerated JSON).
func (sf *statFile) Getattr(ctx context.Context, f fusefs.FileHandle, out *fuse.AttrOut) syscall.Errno {
	_, errno := sf.attr(&out.Attr)
	return errno
}

var _ = (fusefs.NodeStatfser)((*statFile)(nil))

// Statfs reports placeholder filesystem-wide statistics (see defaultStatfs).
func (sf *statFile) Statfs(ctx context.Context, out *fuse.StatfsOut) syscall.Errno {
	defaultStatfs(out)
	return 0
}
// logContents puts the contents of statFile in the log
// to keep that information accessible for troubleshooting.
// The entries naming is kept to be consistent with the field naming in statJSON.
// Callers must hold sf.mu (report does).
func (sf *statFile) logContents() {
	ctx := context.Background()
	log.G(ctx).WithFields(log.Fields{
		"digest": sf.statJSON.Digest, "size": sf.statJSON.Size,
		"fetchedSize": sf.statJSON.FetchedSize, "fetchedPercent": sf.statJSON.FetchedPercent,
	}).WithError(errors.New(sf.statJSON.Error)).Error("statFile error")
}

// report records err as the current error of this layer and logs it.
func (sf *statFile) report(err error) {
	sf.mu.Lock()
	defer sf.mu.Unlock()
	sf.statJSON.Error = err.Error()
	sf.logContents()
}

// attr regenerates the stat JSON and returns stable attributes sized to it.
func (sf *statFile) attr(out *fuse.Attr) (fusefs.StableAttr, syscall.Errno) {
	sf.mu.Lock()
	defer sf.mu.Unlock()
	st, err := sf.updateStatUnlocked()
	if err != nil {
		return fusefs.StableAttr{}, syscall.EIO
	}
	return sf.fs.statFileToAttr(uint64(len(st)), out), 0
}
// updateStatUnlocked refreshes the fetch progress fields of statJSON and
// returns the serialized document terminated by a newline.
// Callers must hold sf.mu.
func (sf *statFile) updateStatUnlocked() ([]byte, error) {
	sf.statJSON.FetchedSize = sf.blob.FetchedSize()
	// Guard against Size == 0: 0/0 yields NaN and x/0 yields +Inf, both of
	// which json.Marshal rejects, which would turn every read of the stat
	// file into EIO. Report 0% instead.
	if sf.statJSON.Size > 0 {
		sf.statJSON.FetchedPercent = float64(sf.statJSON.FetchedSize) / float64(sf.statJSON.Size) * 100.0
	} else {
		sf.statJSON.FetchedPercent = 0
	}
	j, err := json.Marshal(&sf.statJSON)
	if err != nil {
		return nil, err
	}
	return append(j, '\n'), nil
}
// entryToAttr converts metadata.Attr to go-fuse's Attr.
func entryToAttr(ino uint64, e metadata.Attr, out *fuse.Attr) fusefs.StableAttr {
	out.Ino = ino
	out.Size = uint64(e.Size)
	// Symlinks report the length of their target, matching stat(2) behavior.
	if e.Mode&os.ModeSymlink != 0 {
		out.Size = uint64(len(e.LinkName))
	}
	out.Blksize = blockSize
	out.Blocks = (out.Size + uint64(out.Blksize) - 1) / uint64(out.Blksize) * physicalBlockRatio
	mtime := e.ModTime
	out.SetTimes(nil, &mtime, nil)
	out.Mode = fileModeToSystemMode(e.Mode)
	out.Owner = fuse.Owner{Uid: uint32(e.UID), Gid: uint32(e.GID)}
	out.Rdev = uint32(unix.Mkdev(uint32(e.DevMajor), uint32(e.DevMinor)))
	out.Nlink = uint32(e.NumLink)
	if out.Nlink == 0 {
		out.Nlink = 1 // zero "NumLink" means one.
	}
	out.Padding = 0 // TODO
	return fusefs.StableAttr{
		Mode: out.Mode,
		Ino:  out.Ino,
		// NOTE: The inode number is unique throughout the lifetime of
		// this filesystem so we don't consider about generation at this
		// moment.
	}
}

// entryToWhAttr converts metadata.Attr to go-fuse's Attr of whiteouts.
// Whiteouts present as 0/0 character devices, per the overlayfs convention.
func entryToWhAttr(ino uint64, e metadata.Attr, out *fuse.Attr) fusefs.StableAttr {
	out.Ino = ino
	out.Size = 0
	out.Blksize = blockSize
	out.Blocks = 0
	mtime := e.ModTime
	out.SetTimes(nil, &mtime, nil)
	out.Mode = syscall.S_IFCHR
	out.Owner = fuse.Owner{Uid: 0, Gid: 0}
	out.Rdev = uint32(unix.Mkdev(0, 0))
	out.Nlink = 1
	out.Padding = 0 // TODO
	return fusefs.StableAttr{
		Mode: out.Mode,
		Ino:  out.Ino,
		// NOTE: The inode number is unique throughout the lifetime of
		// this filesystem so we don't consider about generation at this
		// moment.
	}
}

// stateToAttr converts state directory to go-fuse's Attr.
func (fs *fs) stateToAttr(out *fuse.Attr) fusefs.StableAttr {
	out.Ino = fs.inodeOfState()
	out.Size = 0
	out.Blksize = blockSize
	out.Blocks = 0
	out.Nlink = 1

	// root can read and open it (dr-x------ root root).
	out.Mode = stateDirMode
	out.Owner = fuse.Owner{Uid: 0, Gid: 0}

	// dummy
	out.Mtime = 0
	out.Mtimensec = 0
	out.Rdev = 0
	out.Padding = 0

	return fusefs.StableAttr{
		Mode: out.Mode,
		Ino:  out.Ino,
		// NOTE: The inode number is unique throughout the lifetime of
		// this filesystem so we don't consider about generation at this
		// moment.
	}
}
// statFileToAttr converts stat file to go-fuse's Attr.
// size is the length in bytes of the freshly serialized stat JSON.
func (fs *fs) statFileToAttr(size uint64, out *fuse.Attr) fusefs.StableAttr {
	out.Ino = fs.inodeOfStatFile()
	out.Size = size
	out.Blksize = blockSize
	out.Blocks = (out.Size + uint64(out.Blksize) - 1) / uint64(out.Blksize) * physicalBlockRatio
	out.Nlink = 1

	// Root can read it ("-r-------- root root").
	out.Mode = statFileMode
	out.Owner = fuse.Owner{Uid: 0, Gid: 0}

	// dummy
	out.Mtime = 0
	out.Mtimensec = 0
	out.Rdev = 0
	out.Padding = 0

	return fusefs.StableAttr{
		Mode: out.Mode,
		Ino:  out.Ino,
		// NOTE: The inode number is unique throughout the lifetime of
		// this filesystem so we don't consider about generation at this
		// moment.
	}
}
func fileModeToSystemMode(m os.FileMode) uint32 {
// Permission bits
res := uint32(m & os.ModePerm)
// File type bits
switch m & os.ModeType {
case os.ModeDevice:
res |= syscall.S_IFBLK
case os.ModeDevice | os.ModeCharDevice:
res |= syscall.S_IFCHR
case os.ModeDir:
res |= syscall.S_IFDIR
case os.ModeNamedPipe:
res |= syscall.S_IFIFO
case os.ModeSymlink:
res |= syscall.S_IFLNK
case os.ModeSocket:
res |= syscall.S_IFSOCK
default: // regular file.
res |= syscall.S_IFREG
}
// suid, sgid, sticky bits
if m&os.ModeSetuid != 0 {
res |= syscall.S_ISUID
}
if m&os.ModeSetgid != 0 {
res |= syscall.S_ISGID
}
if m&os.ModeSticky != 0 {
res |= syscall.S_ISVTX
}
return res
}
// defaultStatfs fills stat with placeholder values; this filesystem has no
// meaningful block/inode accounting to report.
func defaultStatfs(stat *fuse.StatfsOut) {
	// http://man7.org/linux/man-pages/man2/statfs.2.html
	stat.Blocks = 0 // dummy
	stat.Bfree = 0
	stat.Bavail = 0
	stat.Files = 0 // dummy
	stat.Ffree = 0
	stat.Bsize = blockSize
	stat.NameLen = 1<<32 - 1 // effectively unlimited name length
	stat.Frsize = blockSize
	stat.Padding = 0
	stat.Spare = [6]uint32{}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,216 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package commonmetrics
import (
"context"
"sync"
"time"
"github.com/containerd/log"
digest "github.com/opencontainers/go-digest"
"github.com/prometheus/client_golang/prometheus"
)
const (
	// OperationLatencyKeyMilliseconds is the key for stargz operation latency metrics in milliseconds.
	OperationLatencyKeyMilliseconds = "operation_duration_milliseconds"

	// OperationLatencyKeyMicroseconds is the key for stargz operation latency metrics in microseconds.
	OperationLatencyKeyMicroseconds = "operation_duration_microseconds"

	// OperationCountKey is the key for stargz operation count metrics.
	OperationCountKey = "operation_count"

	// BytesServedKey is the key for any metric related to counting bytes served as the part of specific operation.
	BytesServedKey = "bytes_served"

	// Keep namespace as stargz and subsystem as fs.
	namespace = "stargz"
	subsystem = "fs"
)

// Lists all metric labels.
const (
	// prometheus metrics
	Mount                        = "mount"
	RemoteRegistryGet            = "remote_registry_get"
	NodeReaddir                  = "node_readdir"
	StargzHeaderGet              = "stargz_header_get"
	StargzFooterGet              = "stargz_footer_get"
	StargzTocGet                 = "stargz_toc_get"
	DeserializeTocJSON           = "stargz_toc_json_deserialize"
	PrefetchesCompleted          = "all_prefetches_completed"
	ReadOnDemand                 = "read_on_demand"
	MountLayerToLastOnDemandFetch = "mount_layer_to_last_on_demand_fetch"

	OnDemandReadAccessCount           = "on_demand_read_access_count"
	OnDemandRemoteRegistryFetchCount  = "on_demand_remote_registry_fetch_count"
	OnDemandBytesServed               = "on_demand_bytes_served"
	OnDemandBytesFetched              = "on_demand_bytes_fetched"

	// logs metrics
	PrefetchTotal              = "prefetch_total"
	PrefetchDownload           = "prefetch_download"
	PrefetchDecompress         = "prefetch_decompress"
	BackgroundFetchTotal       = "background_fetch_total"
	BackgroundFetchDownload    = "background_fetch_download"
	BackgroundFetchDecompress  = "background_fetch_decompress"
	PrefetchSize               = "prefetch_size"
)

var (
	// Buckets for OperationLatency metrics.
	latencyBucketsMilliseconds = []float64{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384} // in milliseconds
	latencyBucketsMicroseconds = []float64{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024}                          // in microseconds

	// operationLatencyMilliseconds collects operation latency numbers in milliseconds grouped by
	// operation, type and layer digest.
	operationLatencyMilliseconds = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      OperationLatencyKeyMilliseconds,
			Help:      "Latency in milliseconds of stargz snapshotter operations. Broken down by operation type and layer sha.",
			Buckets:   latencyBucketsMilliseconds,
		},
		[]string{"operation_type", "layer"},
	)

	// operationLatencyMicroseconds collects operation latency numbers in microseconds grouped by
	// operation, type and layer digest.
	operationLatencyMicroseconds = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      OperationLatencyKeyMicroseconds,
			Help:      "Latency in microseconds of stargz snapshotter operations. Broken down by operation type and layer sha.",
			Buckets:   latencyBucketsMicroseconds,
		},
		[]string{"operation_type", "layer"},
	)

	// operationCount collects operation count numbers by operation
	// type and layer sha.
	operationCount = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      OperationCountKey,
			Help:      "The count of stargz snapshotter operations. Broken down by operation type and layer sha.",
		},
		[]string{"operation_type", "layer"},
	)

	// bytesCount reflects the number of bytes served as the part of specitic operation type per layer sha.
	bytesCount = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      BytesServedKey,
			Help:      "The number of bytes served per stargz snapshotter operations. Broken down by operation type and layer sha.",
		},
		[]string{"operation_type", "layer"},
	)
)

// register ensures Register's body runs at most once.
var register sync.Once

// logLevel is the level used by the log-based metrics writers below.
// It is set once in Register before any metric is emitted.
var logLevel = log.DebugLevel
// sinceInMilliseconds gets the time since the specified start in milliseconds.
// The division by 1e6 is made to have the milliseconds value as floating point number, since the native method
// .Milliseconds() returns an integer value and you can lost a precision for sub-millisecond values.
func sinceInMilliseconds(start time.Time) float64 {
return float64(time.Since(start).Nanoseconds()) / 1e6
}
// sinceInMicroseconds gets the time since the specified start in microseconds.
// The division by 1e3 is made to have the microseconds value as floating point number, since the native method
// .Microseconds() returns an integer value and you can lost a precision for sub-microsecond values.
func sinceInMicroseconds(start time.Time) float64 {
return float64(time.Since(start).Nanoseconds()) / 1e3
}
// Register registers metrics. This is always called only once.
// l becomes the level used for the log-based metrics writers.
func Register(l log.Level) {
	register.Do(func() {
		logLevel = l
		prometheus.MustRegister(operationLatencyMilliseconds)
		prometheus.MustRegister(operationLatencyMicroseconds)
		prometheus.MustRegister(operationCount)
		prometheus.MustRegister(bytesCount)
	})
}

// MeasureLatencyInMilliseconds wraps the labels attachment as well as calling Observe into a single method.
// Right now we attach the operation and layer digest, so it's possible to see the breakdown for latency
// by operation and individual layers.
// If you want this to be layer agnostic, just pass the digest from empty string, e.g.
// layerDigest := digest.FromString("")
func MeasureLatencyInMilliseconds(operation string, layer digest.Digest, start time.Time) {
	operationLatencyMilliseconds.WithLabelValues(operation, layer.String()).Observe(sinceInMilliseconds(start))
}

// MeasureLatencyInMicroseconds wraps the labels attachment as well as calling Observe into a single method.
// Right now we attach the operation and layer digest, so it's possible to see the breakdown for latency
// by operation and individual layers.
// If you want this to be layer agnostic, just pass the digest from empty string, e.g.
// layerDigest := digest.FromString("")
func MeasureLatencyInMicroseconds(operation string, layer digest.Digest, start time.Time) {
	operationLatencyMicroseconds.WithLabelValues(operation, layer.String()).Observe(sinceInMicroseconds(start))
}

// IncOperationCount wraps the labels attachment as well as calling Inc into a single method.
func IncOperationCount(operation string, layer digest.Digest) {
	operationCount.WithLabelValues(operation, layer.String()).Inc()
}

// AddBytesCount wraps the labels attachment as well as calling Add into a single method.
func AddBytesCount(operation string, layer digest.Digest, bytes int64) {
	bytesCount.WithLabelValues(operation, layer.String()).Add(float64(bytes))
}
// WriteLatencyLogValue wraps writing the log info record for latency in milliseconds. The log record breaks down by operation and layer digest.
func WriteLatencyLogValue(ctx context.Context, layer digest.Digest, operation string, start time.Time) {
	ctx = log.WithLogger(ctx, log.G(ctx).WithField("metrics", "latency").WithField("operation", operation).WithField("layer_sha", layer.String()))
	log.G(ctx).Logf(logLevel, "value=%v milliseconds", sinceInMilliseconds(start))
}

// WriteLatencyWithBytesLogValue wraps writing the log info record for latency in milliseconds with adding the size in bytes.
// The log record breaks down by operation, layer digest and byte value.
func WriteLatencyWithBytesLogValue(ctx context.Context, layer digest.Digest, latencyOperation string, start time.Time, bytesMetricName string, bytesMetricValue int64) {
	ctx = log.WithLogger(ctx, log.G(ctx).WithField("metrics", "latency").WithField("operation", latencyOperation).WithField("layer_sha", layer.String()))
	log.G(ctx).Logf(logLevel, "value=%v milliseconds; %v=%v bytes", sinceInMilliseconds(start), bytesMetricName, bytesMetricValue)
}

// LogLatencyForLastOnDemandFetch implements a special case for measuring the latency of last on demand fetch, which must be invoked at the end of
// background fetch operation only. Since this is expected to happen only once per container launch, it writes a log line,
// instead of directly emitting a metric.
// We do that in the following way:
// 1. We record the mount start time
// 2. We constantly record the timestamps when we do on demand fetch for each layer sha
// 3. On background fetch completed we measure the difference between the last on demand fetch and mount start time
// and write it out as a log line
func LogLatencyForLastOnDemandFetch(ctx context.Context, layer digest.Digest, start time.Time, end time.Time) {
	diffInMilliseconds := float64(end.Sub(start).Milliseconds())
	// value can be negative if we pass the default value for time.Time as `end`
	// this can happen if there were no on-demand fetch for the particular layer
	if diffInMilliseconds > 0 {
		ctx = log.WithLogger(ctx, log.G(ctx).WithField("metrics", "latency").WithField("operation", MountLayerToLastOnDemandFetch).WithField("layer_sha", layer.String()))
		log.G(ctx).Logf(logLevel, "value=%v milliseconds", diffInMilliseconds)
	}
}

View File

@@ -0,0 +1,65 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package layermetrics
import (
"github.com/containerd/stargz-snapshotter/fs/layer"
metrics "github.com/docker/go-metrics"
"github.com/prometheus/client_golang/prometheus"
)
// layerMetrics is the fixed set of per-layer gauge-like metrics exported by
// the Controller; each reads one field of layer.Layer.Info().
var layerMetrics = []*metric{
	{
		name: "layer_fetched_size",
		help: "Total fetched size of the layer",
		unit: metrics.Bytes,
		vt:   prometheus.CounterValue,
		getValues: func(l layer.Layer) []value {
			return []value{
				{
					v: float64(l.Info().FetchedSize),
				},
			}
		},
	},
	{
		name: "layer_prefetch_size",
		help: "Total prefetched size of the layer",
		unit: metrics.Bytes,
		vt:   prometheus.CounterValue,
		getValues: func(l layer.Layer) []value {
			return []value{
				{
					v: float64(l.Info().PrefetchSize),
				},
			}
		},
	},
	{
		name: "layer_size",
		help: "Total size of the layer",
		unit: metrics.Bytes,
		vt:   prometheus.CounterValue,
		getValues: func(l layer.Layer) []value {
			return []value{
				{
					v: float64(l.Info().Size),
				},
			}
		},
	},
}

View File

@@ -0,0 +1,113 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package layermetrics
import (
"sync"
"github.com/containerd/stargz-snapshotter/fs/layer"
metrics "github.com/docker/go-metrics"
"github.com/prometheus/client_golang/prometheus"
)
// NewLayerMetrics returns a Controller that exports layerMetrics under ns.
// A nil ns disables collection entirely (all methods become no-ops).
func NewLayerMetrics(ns *metrics.Namespace) *Controller {
	if ns == nil {
		return &Controller{}
	}
	c := &Controller{
		ns:    ns,
		layer: make(map[string]layer.Layer),
	}
	c.metrics = append(c.metrics, layerMetrics...)
	ns.Add(c)
	return c
}

// Controller is a prometheus.Collector that reports layerMetrics for every
// registered layer, keyed by mountpoint.
type Controller struct {
	ns      *metrics.Namespace
	metrics []*metric
	layer   map[string]layer.Layer // mountpoint -> layer; guarded by layerMu
	layerMu sync.RWMutex
}

// Describe implements prometheus.Collector.
func (c *Controller) Describe(ch chan<- *prometheus.Desc) {
	for _, e := range c.metrics {
		ch <- e.desc(c.ns)
	}
}

// Collect implements prometheus.Collector. Each layer is collected in its
// own goroutine so that one slow layer doesn't serialize the whole scrape;
// the read lock is dropped before waiting for them to finish.
func (c *Controller) Collect(ch chan<- prometheus.Metric) {
	c.layerMu.RLock()
	wg := &sync.WaitGroup{}
	for mp, l := range c.layer {
		mp, l := mp, l // capture per-iteration copies for the goroutine
		wg.Add(1)
		go func() {
			defer wg.Done()
			for _, e := range c.metrics {
				e.collect(mp, l, c.ns, ch)
			}
		}()
	}
	c.layerMu.RUnlock()
	wg.Wait()
}

// Add registers the layer mounted at key for collection.
func (c *Controller) Add(key string, l layer.Layer) {
	if c.ns == nil {
		return
	}
	c.layerMu.Lock()
	c.layer[key] = l
	c.layerMu.Unlock()
}

// Remove stops collecting metrics for the layer mounted at key.
func (c *Controller) Remove(key string) {
	if c.ns == nil {
		return
	}
	c.layerMu.Lock()
	delete(c.layer, key)
	c.layerMu.Unlock()
}

// value is a single sample with its optional extra label values.
type value struct {
	v float64
	l []string
}

// metric describes one exported metric and how to read its samples from a layer.
type metric struct {
	name   string
	help   string
	unit   metrics.Unit
	vt     prometheus.ValueType
	labels []string
	// getValues returns the value and labels for the data
	getValues func(l layer.Layer) []value
}

// desc builds the prometheus descriptor; "digest" and "mountpoint" labels
// are always prepended to the metric-specific labels.
func (m *metric) desc(ns *metrics.Namespace) *prometheus.Desc {
	return ns.NewDesc(m.name, m.help, m.unit, append([]string{"digest", "mountpoint"}, m.labels...)...)
}

// collect emits all samples of this metric for the layer mounted at mountpoint.
func (m *metric) collect(mountpoint string, l layer.Layer, ns *metrics.Namespace, ch chan<- prometheus.Metric) {
	values := m.getValues(l)
	for _, v := range values {
		ch <- prometheus.MustNewConstMetric(m.desc(ns), m.vt, v.v, append([]string{l.Info().Digest.String(), mountpoint}, v.l...)...)
	}
}

View File

@@ -0,0 +1,579 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the NOTICE.md file.
*/
package reader
import (
"bufio"
"bytes"
"context"
"crypto/sha256"
"fmt"
"io"
"os"
"runtime"
"sync"
"time"
"github.com/containerd/stargz-snapshotter/cache"
"github.com/containerd/stargz-snapshotter/estargz"
commonmetrics "github.com/containerd/stargz-snapshotter/fs/metrics/common"
"github.com/containerd/stargz-snapshotter/metadata"
"github.com/hashicorp/go-multierror"
digest "github.com/opencontainers/go-digest"
"golang.org/x/sync/errgroup"
"golang.org/x/sync/semaphore"
)
// maxWalkDepth bounds directory recursion in cacheWithReader to protect
// against pathological (or malicious) layer trees.
const maxWalkDepth = 10000

// Reader provides random access to files in a stargz layer.
type Reader interface {
	OpenFile(id uint32) (io.ReaderAt, error)
	Metadata() metadata.Reader
	Close() error
	LastOnDemandReadTime() time.Time
}

// VerifiableReader produces a Reader with a given verifier.
type VerifiableReader struct {
	r *reader

	lastVerifyErr           error      // last content-verification failure; guarded by lastVerifyErrMu
	lastVerifyErrMu         sync.Mutex
	prohibitVerifyFailure   bool       // once true (set by VerifyTOC), verification failures become hard errors
	prohibitVerifyFailureMu sync.RWMutex

	closed   bool // guarded by closedMu
	closedMu sync.Mutex

	verifier func(uint32, string) (digest.Verifier, error)
}

// storeLastVerifyErr records a verification failure for VerifyTOC to report.
func (vr *VerifiableReader) storeLastVerifyErr(err error) {
	vr.lastVerifyErrMu.Lock()
	vr.lastVerifyErr = err
	vr.lastVerifyErrMu.Unlock()
}

// loadLastVerifyErr returns the most recently recorded verification failure.
func (vr *VerifiableReader) loadLastVerifyErr() error {
	vr.lastVerifyErrMu.Lock()
	err := vr.lastVerifyErr
	vr.lastVerifyErrMu.Unlock()
	return err
}

// SkipVerify returns the underlying Reader without TOC verification.
func (vr *VerifiableReader) SkipVerify() Reader {
	return vr.r
}

// VerifyTOC checks the layer's TOC digest against tocDigest and, on success,
// returns the underlying Reader with chunk verification enabled. From this
// point on, any verification failure is a hard error rather than being
// recorded for later.
func (vr *VerifiableReader) VerifyTOC(tocDigest digest.Digest) (Reader, error) {
	if vr.isClosed() {
		return nil, fmt.Errorf("reader is already closed")
	}
	vr.prohibitVerifyFailureMu.Lock()
	vr.prohibitVerifyFailure = true
	lastVerifyErr := vr.loadLastVerifyErr()
	vr.prohibitVerifyFailureMu.Unlock()
	if err := lastVerifyErr; err != nil {
		return nil, fmt.Errorf("content error occurs during caching contents: %w", err)
	}
	if actual := vr.r.r.TOCDigest(); actual != tocDigest {
		return nil, fmt.Errorf("invalid TOC JSON %q; want %q", actual, tocDigest)
	}
	vr.r.verify = true
	return vr.r, nil
}

// Metadata returns the metadata reader of the underlying layer.
func (vr *VerifiableReader) Metadata() metadata.Reader {
	// TODO: this shouldn't be called before verified
	return vr.r.r
}
// Cache walks the layer tree and populates the chunk cache with every
// regular file's contents (optionally restricted by a filter on file
// offset). Chunks are cached concurrently, bounded by GOMAXPROCS workers.
func (vr *VerifiableReader) Cache(opts ...CacheOption) (err error) {
	if vr.isClosed() {
		return fmt.Errorf("reader is already closed")
	}

	var cacheOpts cacheOptions
	for _, o := range opts {
		o(&cacheOpts)
	}

	gr := vr.r
	r := gr.r
	if cacheOpts.reader != nil {
		// Read layer contents through the caller-provided reader.
		r, err = r.Clone(cacheOpts.reader)
		if err != nil {
			return err
		}
	}
	rootID := r.RootID()

	filter := func(int64) bool {
		return true
	}
	if cacheOpts.filter != nil {
		filter = cacheOpts.filter
	}

	eg, egCtx := errgroup.WithContext(context.Background())
	eg.Go(func() error {
		return vr.cacheWithReader(egCtx,
			0, eg, semaphore.NewWeighted(int64(runtime.GOMAXPROCS(0))),
			rootID, r, filter, cacheOpts.cacheOpts...)
	})
	return eg.Wait()
}

// cacheWithReader recursively caches all regular files under dirID.
// currentDepth guards against trees deeper than maxWalkDepth. Chunk reads
// are dispatched to eg, throttled by sem; any error aborts the walk.
func (vr *VerifiableReader) cacheWithReader(ctx context.Context, currentDepth int, eg *errgroup.Group, sem *semaphore.Weighted, dirID uint32, r metadata.Reader, filter func(int64) bool, opts ...cache.Option) (rErr error) {
	if currentDepth > maxWalkDepth {
		return fmt.Errorf("tree is too deep (depth:%d)", currentDepth)
	}
	rootID := r.RootID()
	r.ForeachChild(dirID, func(name string, id uint32, mode os.FileMode) bool {
		e, err := r.GetAttr(id)
		if err != nil {
			rErr = err
			return false
		}
		if mode.IsDir() {
			// Walk through all files on this stargz file.

			// Ignore the entry of "./" (formated as "" by stargz lib) on root directory
			// because this points to the root directory itself.
			if dirID == rootID && name == "" {
				return true
			}

			if err := vr.cacheWithReader(ctx, currentDepth+1, eg, sem, id, r, filter, opts...); err != nil {
				rErr = err
				return false
			}
			return true
		} else if !mode.IsRegular() {
			// Only cache regular files
			return true
		} else if dirID == rootID && name == estargz.TOCTarName {
			// We don't need to cache TOC json file
			return true
		}

		offset, err := r.GetOffset(id)
		if err != nil {
			rErr = err
			return false
		}
		if !filter(offset) {
			// This entry need to be filtered out
			return true
		}

		fr, err := r.OpenFileWithPreReader(id, func(nid uint32, chunkOffset, chunkSize int64, chunkDigest string, r io.Reader) (retErr error) {
			// Chunks surfaced by the pre-reader are cached inline.
			return vr.readAndCache(nid, r, chunkOffset, chunkSize, chunkDigest, opts...)
		})
		if err != nil {
			rErr = err
			return false
		}

		var nr int64
		for nr < e.Size {
			chunkOffset, chunkSize, chunkDigestStr, ok := fr.ChunkEntryForOffset(nr)
			if !ok {
				break
			}
			nr += chunkSize

			if err := sem.Acquire(ctx, 1); err != nil {
				rErr = err
				return false
			}

			eg.Go(func() error {
				defer sem.Release(1)
				err := vr.readAndCache(id, io.NewSectionReader(fr, chunkOffset, chunkSize), chunkOffset, chunkSize, chunkDigestStr, opts...)
				if err != nil {
					return fmt.Errorf("failed to read %q (off:%d,size:%d): %w", name, chunkOffset, chunkSize, err)
				}
				return nil
			})
		}
		return true
	})
	return
}
// readAndCache reads one chunk (chunkOffset/chunkSize of file id) from fr,
// verifies it against chunkDigest, and commits it to the blob cache.
// Before VerifyTOC is called, verification failures are recorded (via
// storeLastVerifyErr) instead of failing; afterwards they are hard errors.
func (vr *VerifiableReader) readAndCache(id uint32, fr io.Reader, chunkOffset, chunkSize int64, chunkDigest string, opts ...cache.Option) (retErr error) {
	gr := vr.r

	// Record any error this function returns so VerifyTOC can refuse to hand
	// out a reader after a caching-time content error. The original code
	// checked retErr inline at the top of the function, where a named result
	// is necessarily nil; the check must run at return time, so defer it.
	defer func() {
		if retErr != nil {
			vr.storeLastVerifyErr(retErr)
		}
	}()

	// Check if it already exists in the cache
	cacheID := genID(id, chunkOffset, chunkSize)
	if r, err := gr.cache.Get(cacheID); err == nil {
		r.Close()
		return nil
	}

	// missed cache, needs to fetch and add it to the cache
	br := bufio.NewReaderSize(fr, int(chunkSize))
	if _, err := br.Peek(int(chunkSize)); err != nil {
		return fmt.Errorf("cacheWithReader.peek: %v", err)
	}
	w, err := gr.cache.Add(cacheID, opts...)
	if err != nil {
		return err
	}
	defer w.Close()
	v, err := vr.verifier(id, chunkDigest)
	if err != nil {
		vr.prohibitVerifyFailureMu.RLock()
		if vr.prohibitVerifyFailure {
			vr.prohibitVerifyFailureMu.RUnlock()
			return fmt.Errorf("verifier not found: %w", err)
		}
		vr.storeLastVerifyErr(err)
		vr.prohibitVerifyFailureMu.RUnlock()
	}
	tee := io.Discard
	if v != nil {
		tee = io.Writer(v) // verification is required
	}
	if _, err := io.CopyN(w, io.TeeReader(br, tee), chunkSize); err != nil {
		w.Abort()
		return fmt.Errorf("failed to cache file payload: %w", err)
	}
	if v != nil && !v.Verified() {
		err := fmt.Errorf("invalid chunk")
		vr.prohibitVerifyFailureMu.RLock()
		if vr.prohibitVerifyFailure {
			vr.prohibitVerifyFailureMu.RUnlock()
			w.Abort()
			return err
		}
		vr.storeLastVerifyErr(err)
		vr.prohibitVerifyFailureMu.RUnlock()
	}

	return w.Commit()
}
func (vr *VerifiableReader) Close() error {
vr.closedMu.Lock()
defer vr.closedMu.Unlock()
if vr.closed {
return nil
}
vr.closed = true
return vr.r.Close()
}
// isClosed reports whether Close has already been called.
func (vr *VerifiableReader) isClosed() bool {
	vr.closedMu.Lock()
	defer vr.closedMu.Unlock()
	return vr.closed
}
// NewReader creates a Reader based on the given stargz blob and cache implementation.
// It returns VerifiableReader so the caller must provide a metadata.ChunkVerifier
// to use for verifying file or chunk contained in this stargz blob.
func NewReader(r metadata.Reader, cache cache.BlobCache, layerSha digest.Digest) (*VerifiableReader, error) {
	gr := &reader{
		r:        r,
		cache:    cache,
		layerSha: layerSha,
		verifier: digestVerifier,
		bufPool: sync.Pool{
			New: func() interface{} { return new(bytes.Buffer) },
		},
	}
	return &VerifiableReader{r: gr, verifier: digestVerifier}, nil
}
// reader serves chunk-granularity reads of a stargz blob, backed by a
// metadata.Reader and a blob cache. It is the concrete reader handed out
// after TOC verification (or SkipVerify) on a VerifiableReader.
type reader struct {
	r        metadata.Reader
	cache    cache.BlobCache
	bufPool  sync.Pool // pool of *bytes.Buffer used as chunk-sized scratch buffers
	layerSha digest.Digest

	lastReadTime   time.Time // time of the most recent on-demand read (see setLastReadTime)
	lastReadTimeMu sync.Mutex

	closed   bool
	closedMu sync.Mutex

	verify   bool // when true, verifyChunk checks every chunk against its digest
	verifier func(uint32, string) (digest.Verifier, error)
}
// Metadata returns the metadata reader backing this blob reader.
func (gr *reader) Metadata() metadata.Reader {
	return gr.r
}
// setLastReadTime records the time of the latest on-demand read.
func (gr *reader) setLastReadTime(lastReadTime time.Time) {
	gr.lastReadTimeMu.Lock()
	defer gr.lastReadTimeMu.Unlock()
	gr.lastReadTime = lastReadTime
}
// LastOnDemandReadTime returns the time of the most recent on-demand read.
func (gr *reader) LastOnDemandReadTime() time.Time {
	gr.lastReadTimeMu.Lock()
	defer gr.lastReadTimeMu.Unlock()
	return gr.lastReadTime
}
// OpenFile returns an io.ReaderAt for the file entry identified by id.
// While opening, chunks that the metadata reader pre-reads are verified and
// added to the cache (via the callback) so later reads can be served locally.
func (gr *reader) OpenFile(id uint32) (io.ReaderAt, error) {
	if gr.isClosed() {
		return nil, fmt.Errorf("reader is already closed")
	}
	var fr metadata.File
	fr, err := gr.r.OpenFileWithPreReader(id, func(nid uint32, chunkOffset, chunkSize int64, chunkDigest string, r io.Reader) error {
		// Check if it already exists in the cache
		cacheID := genID(nid, chunkOffset, chunkSize)
		if r, err := gr.cache.Get(cacheID); err == nil {
			r.Close()
			return nil
		}
		// Read and cache
		b := gr.bufPool.Get().(*bytes.Buffer)
		b.Reset()
		b.Grow(int(chunkSize))
		// Use the pooled buffer's backing array as a chunk-sized scratch slice.
		ip := b.Bytes()[:chunkSize]
		if _, err := io.ReadFull(r, ip); err != nil {
			gr.putBuffer(b)
			return err
		}
		err := gr.verifyAndCache(nid, ip, chunkDigest, cacheID)
		gr.putBuffer(b)
		return err
	})
	if err != nil {
		return nil, fmt.Errorf("failed to open file %d: %w", id, err)
	}
	return &file{
		id: id,
		fr: fr,
		gr: gr,
	}, nil
}
// Close closes the cache and the underlying metadata reader, aggregating any
// errors. Subsequent calls are no-ops.
func (gr *reader) Close() (retErr error) {
	gr.closedMu.Lock()
	defer gr.closedMu.Unlock()
	if gr.closed {
		return nil
	}
	gr.closed = true
	// Close both resources even if the first close fails.
	for _, closeFn := range []func() error{gr.cache.Close, gr.r.Close} {
		if err := closeFn(); err != nil {
			retErr = multierror.Append(retErr, err)
		}
	}
	return retErr
}
// isClosed reports whether Close has already been called.
func (gr *reader) isClosed() bool {
	gr.closedMu.Lock()
	defer gr.closedMu.Unlock()
	return gr.closed
}
// putBuffer resets b and returns it to the scratch-buffer pool.
func (gr *reader) putBuffer(b *bytes.Buffer) {
	b.Reset()
	gr.bufPool.Put(b)
}
// file implements io.ReaderAt for a single file entry of the stargz blob.
type file struct {
	id uint32        // metadata ID of the file entry
	fr metadata.File // per-file chunk layout and data accessor
	gr *reader       // parent reader providing cache and verification
}
// ReadAt reads chunks from the stargz file with trying to fetch as many chunks
// as possible from the cache.
// The requested window [offset, offset+len(p)) may start/end in the middle of
// chunks; lowerDiscard/upperDiscard measure how much of the current chunk lies
// outside the window.
func (sf *file) ReadAt(p []byte, offset int64) (int, error) {
	nr := 0
	for nr < len(p) {
		chunkOffset, chunkSize, chunkDigestStr, ok := sf.fr.ChunkEntryForOffset(offset + int64(nr))
		if !ok {
			break
		}
		var (
			id           = genID(sf.id, chunkOffset, chunkSize)
			lowerDiscard = positive(offset - chunkOffset)
			upperDiscard = positive(chunkOffset + chunkSize - (offset + int64(len(p))))
			expectedSize = chunkSize - upperDiscard - lowerDiscard
		)

		// Check if the content exists in the cache
		if r, err := sf.gr.cache.Get(id); err == nil {
			n, err := r.ReadAt(p[nr:int64(nr)+expectedSize], lowerDiscard)
			if (err == nil || err == io.EOF) && int64(n) == expectedSize {
				nr += n
				r.Close()
				continue
			}
			// Short/failed cache read: fall through to the underlying reader.
			r.Close()
		}

		// We missed cache. Take it from underlying reader.
		// We read the whole chunk here and add it to the cache so that following
		// reads against neighboring chunks can take the data without decmpression.
		if lowerDiscard == 0 && upperDiscard == 0 {
			// We can directly store the result to the given buffer
			ip := p[nr : int64(nr)+chunkSize]
			n, err := sf.fr.ReadAt(ip, chunkOffset)
			if err != nil && err != io.EOF {
				return 0, fmt.Errorf("failed to read data: %w", err)
			}
			if err := sf.gr.verifyAndCache(sf.id, ip, chunkDigestStr, id); err != nil {
				return 0, err
			}
			nr += n
			continue
		}

		// Use temporally buffer for aligning this chunk
		b := sf.gr.bufPool.Get().(*bytes.Buffer)
		b.Reset()
		b.Grow(int(chunkSize))
		ip := b.Bytes()[:chunkSize]
		if _, err := sf.fr.ReadAt(ip, chunkOffset); err != nil && err != io.EOF {
			sf.gr.putBuffer(b)
			return 0, fmt.Errorf("failed to read data: %w", err)
		}
		if err := sf.gr.verifyAndCache(sf.id, ip, chunkDigestStr, id); err != nil {
			sf.gr.putBuffer(b)
			return 0, err
		}
		// Copy only the part of the chunk that falls inside the request window.
		n := copy(p[nr:], ip[lowerDiscard:chunkSize-upperDiscard])
		sf.gr.putBuffer(b)
		if int64(n) != expectedSize {
			return 0, fmt.Errorf("unexpected final data size %d; want %d", n, expectedSize)
		}
		nr += n
	}

	commonmetrics.AddBytesCount(commonmetrics.OnDemandBytesServed, sf.gr.layerSha, int64(nr)) // measure the number of on demand bytes served

	return nr, nil
}
// verifyAndCache verifies the freshly fetched chunk ip of entry entryID
// against chunkDigestStr and, on success, best-effort stores it in the cache
// under cacheID. Cache write failures are ignored; only verification failures
// are returned.
func (gr *reader) verifyAndCache(entryID uint32, ip []byte, chunkDigestStr string, cacheID string) error {
	// We can end up doing on demand registry fetch when aligning the chunk
	commonmetrics.IncOperationCount(commonmetrics.OnDemandRemoteRegistryFetchCount, gr.layerSha) // increment the number of on demand file fetches from remote registry
	commonmetrics.AddBytesCount(commonmetrics.OnDemandBytesFetched, gr.layerSha, int64(len(ip))) // record total bytes fetched
	gr.setLastReadTime(time.Now())

	// Verify this chunk
	if err := gr.verifyChunk(entryID, ip, chunkDigestStr); err != nil {
		return fmt.Errorf("invalid chunk: %w", err)
	}

	// Cache this chunk
	if w, err := gr.cache.Add(cacheID); err == nil {
		if cn, err := w.Write(ip); err != nil || cn != len(ip) {
			w.Abort()
		} else {
			w.Commit()
		}
		w.Close()
	}
	return nil
}
// verifyChunk checks chunk p of entry id against chunkDigestStr using the
// configured verifier. It is a no-op while verification is disabled
// (gr.verify == false, i.e. before TOC verification completes).
func (gr *reader) verifyChunk(id uint32, p []byte, chunkDigestStr string) error {
	if !gr.verify {
		return nil // verification is not required
	}
	v, err := gr.verifier(id, chunkDigestStr)
	if err != nil {
		return fmt.Errorf("invalid chunk: %w", err)
	}
	if _, err := v.Write(p); err != nil {
		return fmt.Errorf("invalid chunk: failed to write to verifier: %w", err)
	}
	if !v.Verified() {
		return fmt.Errorf("invalid chunk: not verified")
	}
	return nil
}
// genID derives a deterministic cache key for the chunk of entry id located
// at [offset, offset+size): the hex-encoded SHA-256 of "id-offset-size".
func genID(id uint32, offset, size int64) string {
	key := fmt.Sprintf("%d-%d-%d", id, offset, size)
	return fmt.Sprintf("%x", sha256.Sum256([]byte(key)))
}
// positive clamps n to be non-negative.
func positive(n int64) int64 {
	if n > 0 {
		return n
	}
	return 0
}
// CacheOption configures the behaviour of VerifiableReader.Cache.
type CacheOption func(*cacheOptions)

// cacheOptions aggregates the options passed to Cache.
type cacheOptions struct {
	cacheOpts []cache.Option    // options forwarded to the underlying blob cache
	filter    func(int64) bool  // optional per-offset filter selecting entries to cache
	reader    *io.SectionReader // optional alternative data source to cache from
}
// WithCacheOpts specifies additional cache options used when chunks are added
// to the blob cache during Cache().
func WithCacheOpts(cacheOpts ...cache.Option) CacheOption {
	return func(opts *cacheOptions) {
		opts.cacheOpts = cacheOpts
	}
}
// WithFilter restricts Cache() to entries whose offset the given filter
// accepts (returns true for).
func WithFilter(filter func(int64) bool) CacheOption {
	return func(opts *cacheOptions) {
		opts.filter = filter
	}
}
// WithReader makes Cache() read blob data from the given section reader
// instead of the default source.
func WithReader(sr *io.SectionReader) CacheOption {
	return func(opts *cacheOptions) {
		opts.reader = sr
	}
}
// digestVerifier is the default chunk verifier: it parses the recorded digest
// string and returns a digest.Verifier for it.
func digestVerifier(id uint32, chunkDigestStr string) (digest.Verifier, error) {
	chunkDigest, err := digest.Parse(chunkDigestStr)
	if err != nil {
		return nil, fmt.Errorf("invalid chunk: no digest is recorded(len=%d): %w", len(chunkDigestStr), err)
	}
	return chunkDigest.Verifier(), nil
}

View File

@@ -0,0 +1,821 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the NOTICE.md file.
*/
package reader
import (
"bytes"
"compress/gzip"
"fmt"
"io"
"os"
"path"
"path/filepath"
"strings"
"sync"
"testing"
"time"
"github.com/containerd/stargz-snapshotter/cache"
"github.com/containerd/stargz-snapshotter/estargz"
"github.com/containerd/stargz-snapshotter/metadata"
tutil "github.com/containerd/stargz-snapshotter/util/testutil"
"github.com/klauspost/compress/zstd"
digest "github.com/opencontainers/go-digest"
"golang.org/x/sync/errgroup"
)
// region represents an inclusive byte range [b, e] within a blob.
type region struct{ b, e int64 }

const (
	sampleChunkSize    = 3
	sampleMiddleOffset = sampleChunkSize / 2
	sampleData1        = "0123456789"
	// lastChunkOffset1 is the offset of the final chunk of sampleData1.
	lastChunkOffset1 = sampleChunkSize * (int64(len(sampleData1)) / sampleChunkSize)
)

// srcCompressions enumerates the compression formats the test suite runs against.
var srcCompressions = map[string]tutil.CompressionFactory{
	"zstd-fastest":               tutil.ZstdCompressionWithLevel(zstd.SpeedFastest),
	"gzip-bestspeed":             tutil.GzipCompressionWithLevel(gzip.BestSpeed),
	"externaltoc-gzip-bestspeed": tutil.ExternalTOCGzipCompressionWithLevel(gzip.BestSpeed),
}
// TestSuiteReader runs the full reader test suite against the given metadata
// store implementation.
func TestSuiteReader(t *testing.T, store metadata.Store) {
	testFileReadAt(t, store)
	testCacheVerify(t, store)
	testFailReader(t, store)
	testPreReader(t, store)
}
// testFileReadAt exercises file.ReadAt across combinations of read size, read
// offset, file size, pre-filled cache state and compression format, checking
// both the returned data and that the cache ends up holding valid chunks.
func testFileReadAt(t *testing.T, factory metadata.Store) {
	sizeCond := map[string]int64{
		"single_chunk": sampleChunkSize - sampleMiddleOffset,
		"multi_chunks": sampleChunkSize + sampleMiddleOffset,
	}
	innerOffsetCond := map[string]int64{
		"at_top":    0,
		"at_middle": sampleMiddleOffset,
	}
	baseOffsetCond := map[string]int64{
		"of_1st_chunk":  sampleChunkSize * 0,
		"of_2nd_chunk":  sampleChunkSize * 1,
		"of_last_chunk": lastChunkOffset1,
	}
	fileSizeCond := map[string]int64{
		"in_1_chunk_file":  sampleChunkSize * 1,
		"in_2_chunks_file": sampleChunkSize * 2,
		"in_max_size_file": int64(len(sampleData1)),
	}
	// cacheCond lists regions pre-populated in the cache; the test file is
	// wrapped so that reading those regions from the source is an error.
	cacheCond := map[string][]region{
		"with_clean_cache": nil,
		"with_edge_filled_cache": {
			region{0, sampleChunkSize - 1},
			region{lastChunkOffset1, int64(len(sampleData1)) - 1},
		},
		"with_sparse_cache": {
			region{0, sampleChunkSize - 1},
			region{2 * sampleChunkSize, 3*sampleChunkSize - 1},
		},
	}
	for sn, size := range sizeCond {
		for in, innero := range innerOffsetCond {
			for bo, baseo := range baseOffsetCond {
				for fn, filesize := range fileSizeCond {
					for cc, cacheExcept := range cacheCond {
						for srcCompressionName, srcCompression := range srcCompressions {
							srcCompression := srcCompression()
							t.Run(fmt.Sprintf("reading_%s_%s_%s_%s_%s_%s", sn, in, bo, fn, cc, srcCompressionName), func(t *testing.T) {
								if filesize > int64(len(sampleData1)) {
									t.Fatal("sample file size is larger than sample data")
								}
								wantN := size
								offset := baseo + innero
								if remain := filesize - offset; remain < wantN {
									if wantN = remain; wantN < 0 {
										wantN = 0
									}
								}
								// use constant string value as a data source.
								want := strings.NewReader(sampleData1)
								// data we want to get.
								wantData := make([]byte, wantN)
								_, err := want.ReadAt(wantData, offset)
								if err != nil && err != io.EOF {
									t.Fatalf("want.ReadAt (offset=%d,size=%d): %v", offset, wantN, err)
								}
								// data we get through a file.
								f, closeFn := makeFile(t, []byte(sampleData1)[:filesize], sampleChunkSize, factory, srcCompression)
								defer closeFn()
								f.fr = newExceptFile(t, f.fr, cacheExcept...)
								// Pre-populate the cache with the excepted regions.
								for _, reg := range cacheExcept {
									id := genID(f.id, reg.b, reg.e-reg.b+1)
									w, err := f.gr.cache.Add(id)
									if err != nil {
										w.Close()
										t.Fatalf("failed to add cache %v: %v", id, err)
									}
									if _, err := w.Write([]byte(sampleData1[reg.b : reg.e+1])); err != nil {
										w.Close()
										t.Fatalf("failed to write cache %v: %v", id, err)
									}
									if err := w.Commit(); err != nil {
										w.Close()
										t.Fatalf("failed to commit cache %v: %v", id, err)
									}
									w.Close()
								}
								respData := make([]byte, size)
								n, err := f.ReadAt(respData, offset)
								if err != nil {
									t.Errorf("failed to read off=%d, size=%d, filesize=%d: %v", offset, size, filesize, err)
									return
								}
								respData = respData[:n]
								if !bytes.Equal(wantData, respData) {
									t.Errorf("off=%d, filesize=%d; read data{size=%d,data=%q}; want (size=%d,data=%q)",
										offset, filesize, len(respData), string(respData), wantN, string(wantData))
									return
								}
								// check cache has valid contents.
								cn := 0
								nr := 0
								for int64(nr) < wantN {
									chunkOffset, chunkSize, _, ok := f.fr.ChunkEntryForOffset(offset + int64(nr))
									if !ok {
										break
									}
									data := make([]byte, chunkSize)
									id := genID(f.id, chunkOffset, chunkSize)
									r, err := f.gr.cache.Get(id)
									if err != nil {
										t.Errorf("missed cache of offset=%d, size=%d: %v(got size=%d)", chunkOffset, chunkSize, err, n)
										return
									}
									defer r.Close()
									if n, err := r.ReadAt(data, 0); (err != nil && err != io.EOF) || n != int(chunkSize) {
										t.Errorf("failed to read cache of offset=%d, size=%d: %v(got size=%d)", chunkOffset, chunkSize, err, n)
										return
									}
									nr += n
									cn++
								}
							})
						}
					}
				}
			}
		}
	}
}
// newExceptFile wraps fr so that any read of one of the given regions fails
// the test — used to prove that cached regions are never re-fetched.
func newExceptFile(t *testing.T, fr metadata.File, except ...region) metadata.File {
	prohibited := make(map[region]bool, len(except))
	for _, reg := range except {
		prohibited[reg] = true
	}
	return &exceptFile{fr: fr, t: t, except: prohibited}
}
// exceptFile is a metadata.File wrapper that fails the test when a prohibited
// region is read from the underlying source.
type exceptFile struct {
	fr     metadata.File
	except map[region]bool // regions that must be served from cache, never read here
	t      *testing.T
}

// ReadAt fails the test if the requested range is a prohibited region;
// otherwise it delegates to the wrapped file.
func (er *exceptFile) ReadAt(p []byte, offset int64) (int, error) {
	if er.except[region{offset, offset + int64(len(p)) - 1}] {
		er.t.Fatalf("Requested prohibited region of chunk: (%d, %d)", offset, offset+int64(len(p))-1)
	}
	return er.fr.ReadAt(p, offset)
}

// ChunkEntryForOffset delegates to the wrapped file.
func (er *exceptFile) ChunkEntryForOffset(offset int64) (off int64, size int64, dgst string, ok bool) {
	return er.fr.ChunkEntryForOffset(offset)
}
// makeFile builds a single-file eStargz blob from contents, opens it through a
// verified reader and returns the resulting *file plus a cleanup function.
func makeFile(t *testing.T, contents []byte, chunkSize int, factory metadata.Store, comp tutil.Compression) (*file, func() error) {
	testName := "test"
	sr, dgst, err := tutil.BuildEStargz([]tutil.TarEntry{
		tutil.File(testName, string(contents)),
	}, tutil.WithEStargzOptions(estargz.WithChunkSize(chunkSize), estargz.WithCompression(comp)))
	if err != nil {
		t.Fatalf("failed to build sample estargz")
	}
	mr, err := factory(sr, metadata.WithDecompressors(comp))
	if err != nil {
		t.Fatalf("failed to create reader: %v", err)
	}
	vr, err := NewReader(mr, cache.NewMemoryCache(), digest.FromString(""))
	if err != nil {
		mr.Close()
		t.Fatalf("failed to make new reader: %v", err)
	}
	r, err := vr.VerifyTOC(dgst)
	if err != nil {
		vr.Close()
		t.Fatalf("failed to verify TOC: %v", err)
	}
	tid, _, err := r.Metadata().GetChild(r.Metadata().RootID(), testName)
	if err != nil {
		vr.Close()
		t.Fatalf("failed to get %q: %v", testName, err)
	}
	ra, err := r.OpenFile(tid)
	if err != nil {
		vr.Close()
		t.Fatalf("Failed to open testing file: %v", err)
	}
	// The tests need the concrete *file to reach into its internals.
	f, ok := ra.(*file)
	if !ok {
		vr.Close()
		t.Fatalf("invalid type of file %q", tid)
	}
	return f, vr.Close
}
// testCacheVerify checks how verification failures interact with Cache():
// failures injected before TOC verification must surface from VerifyTOC,
// failures injected after it must surface from Cache(), and SkipVerify must
// suppress all of them.
func testCacheVerify(t *testing.T, factory metadata.Store) {
	for _, skipVerify := range [2]bool{true, false} {
		for _, invalidChunkBeforeVerify := range [2]bool{true, false} {
			for _, invalidChunkAfterVerify := range [2]bool{true, false} {
				for srcCompressionName, srcCompression := range srcCompressions {
					srcCompression := srcCompression()
					name := fmt.Sprintf("test_cache_verify_%v_%v_%v_%v",
						skipVerify, invalidChunkBeforeVerify, invalidChunkAfterVerify, srcCompressionName)
					t.Run(name, func(t *testing.T) {
						sr, tocDgst, err := tutil.BuildEStargz([]tutil.TarEntry{
							tutil.File("a", sampleData1+"a"),
							tutil.File("b", sampleData1+"b"),
						}, tutil.WithEStargzOptions(estargz.WithChunkSize(sampleChunkSize), estargz.WithCompression(srcCompression)))
						if err != nil {
							t.Fatalf("failed to build sample estargz")
						}

						// Determine the expected behaviour
						var wantVerifyFail, wantCacheFail, wantCacheFail2 bool
						if skipVerify {
							// always no error if verification is disabled
							wantVerifyFail, wantCacheFail, wantCacheFail2 = false, false, false
						} else if invalidChunkBeforeVerify {
							// errors occurred before verifying TOC must be reported via VerifyTOC()
							wantVerifyFail = true
						} else if invalidChunkAfterVerify {
							// errors occurred after verifying TOC must be reported via Cache()
							wantVerifyFail, wantCacheFail, wantCacheFail2 = false, true, true
						} else {
							// otherwise no verification error
							wantVerifyFail, wantCacheFail, wantCacheFail2 = false, false, false
						}

						// Prepare reader
						verifier := &failIDVerifier{}
						mr, err := factory(sr, metadata.WithDecompressors(srcCompression))
						if err != nil {
							t.Fatalf("failed to prepare reader %v", err)
						}
						defer mr.Close()
						vr, err := NewReader(mr, cache.NewMemoryCache(), digest.FromString(""))
						if err != nil {
							t.Fatalf("failed to make new reader: %v", err)
						}
						vr.verifier = verifier.verifier
						vr.r.verifier = verifier.verifier
						off2id, id2path, err := prepareMap(vr.Metadata(), vr.Metadata().RootID(), "")
						if err != nil || off2id == nil || id2path == nil {
							t.Fatalf("failed to prepare offset map %v, off2id = %+v, id2path = %+v", err, off2id, id2path)
						}

						// Perform Cache() before verification
						// 1. Either of "a" or "b" is read and verified
						// 2. VerifyTOC/SkipVerify is called
						// 3. Another entry ("a" or "b") is called
						verifyDone := make(chan struct{})
						var firstEntryCalled bool
						var eg errgroup.Group
						var mu sync.Mutex
						eg.Go(func() error {
							return vr.Cache(WithFilter(func(off int64) bool {
								id, ok := off2id[off]
								if !ok {
									t.Fatalf("no ID is assigned to offset %d", off)
								}
								name, ok := id2path[id]
								if !ok {
									t.Fatalf("no name is assigned to id %d", id)
								}
								if name == "a" || name == "b" {
									mu.Lock()
									if !firstEntryCalled {
										firstEntryCalled = true
										if invalidChunkBeforeVerify {
											verifier.registerFails([]uint32{id})
										}
										mu.Unlock()
										return true
									}
									mu.Unlock()
									// Block the second entry until verification finished.
									<-verifyDone
									if invalidChunkAfterVerify {
										verifier.registerFails([]uint32{id})
									}
									return true
								}
								return false
							}))
						})
						if invalidChunkBeforeVerify {
							// wait for encountering the error of the first chunk read
							start := time.Now()
							for {
								if err := vr.loadLastVerifyErr(); err != nil {
									break
								}
								if time.Since(start) > time.Second {
									t.Fatalf("timeout(1s): failed to wait for read error is registered")
								}
								time.Sleep(10 * time.Millisecond)
							}
						}

						// Perform verification
						if skipVerify {
							vr.SkipVerify()
						} else {
							_, err = vr.VerifyTOC(tocDgst)
						}
						if checkErr := checkError(wantVerifyFail, err); checkErr != nil {
							t.Errorf("verify: %v", checkErr)
							return
						}
						if err != nil {
							return
						}
						close(verifyDone)

						// Check the result of Cache()
						if checkErr := checkError(wantCacheFail, eg.Wait()); checkErr != nil {
							t.Errorf("cache: %v", checkErr)
							return
						}

						// Call Cache() again and check the result
						if checkErr := checkError(wantCacheFail2, vr.Cache()); checkErr != nil {
							t.Errorf("cache(2): %v", checkErr)
							return
						}
					})
				}
			}
		}
	}
}
// failIDVerifier is a chunk verifier whose verification can be made to fail
// for a chosen set of entry IDs.
type failIDVerifier struct {
	fails   []uint32 // IDs whose chunks must fail verification
	failsMu sync.Mutex
}
// registerFails replaces the set of entry IDs whose chunks must fail verification.
func (f *failIDVerifier) registerFails(fails []uint32) {
	f.failsMu.Lock()
	f.fails = fails
	f.failsMu.Unlock()
}
// verifier returns a verifier that fails iff id was registered via registerFails.
func (f *failIDVerifier) verifier(id uint32, chunkDigest string) (digest.Verifier, error) {
	f.failsMu.Lock()
	defer f.failsMu.Unlock()
	for _, failID := range f.fails {
		if failID == id {
			return &testVerifier{false}, nil
		}
	}
	return &testVerifier{true}, nil
}
// testVerifier is a digest.Verifier stub with a fixed verification outcome.
type testVerifier struct {
	success bool
}

// Write discards the data and always succeeds.
func (bv *testVerifier) Write(p []byte) (n int, err error) {
	return len(p), nil
}

// Verified returns the preconfigured outcome.
func (bv *testVerifier) Verified() bool {
	return bv.success
}
// checkError compares the observed error against the expectation: it returns
// nil when (err != nil) matches wantFail, and a descriptive error otherwise.
func checkError(wantFail bool, err error) error {
	gotFail := err != nil
	if wantFail && !gotFail {
		return fmt.Errorf("wanted to fail but succeeded")
	}
	if !wantFail && gotFail {
		return fmt.Errorf("wanted to succeed verification but failed: %w", err)
	}
	return nil
}
// prepareMap recursively walks the metadata tree rooted at id (whose path is
// p) and builds two lookup tables: off2id maps each regular file's data
// offset to its metadata ID, and id2path maps each ID to its path.
func prepareMap(mr metadata.Reader, id uint32, p string) (off2id map[int64]uint32, id2path map[uint32]string, _ error) {
	attr, err := mr.GetAttr(id)
	if err != nil {
		return nil, nil, err
	}
	id2path = map[uint32]string{id: p}
	off2id = make(map[int64]uint32)
	if attr.Mode.IsRegular() {
		off, err := mr.GetOffset(id)
		if err != nil {
			return nil, nil, err
		}
		off2id[off] = id
	}
	var retErr error
	mr.ForeachChild(id, func(name string, id uint32, mode os.FileMode) bool {
		// Merge the child's tables into ours; stop iterating on error.
		o2i, i2p, err := prepareMap(mr, id, path.Join(p, name))
		if err != nil {
			retErr = err
			return false
		}
		for k, v := range o2i {
			off2id[k] = v
		}
		for k, v := range i2p {
			id2path[k] = v
		}
		return true
	})
	if retErr != nil {
		return nil, nil, retErr
	}
	return off2id, id2path, nil
}
// testFailReader checks the reader's failure behaviour: opening a
// non-existent entry must fail, and file reads must fail whenever either the
// underlying data source (rs) or chunk verification (vs) is broken.
func testFailReader(t *testing.T, factory metadata.Store) {
	testFileName := "test"
	for srcCompressionName, srcCompression := range srcCompressions {
		srcCompression := srcCompression()
		t.Run(fmt.Sprintf("%v", srcCompressionName), func(t *testing.T) {
			for _, rs := range []bool{true, false} {
				for _, vs := range []bool{true, false} {
					stargzFile, tocDigest, err := tutil.BuildEStargz([]tutil.TarEntry{
						tutil.File(testFileName, sampleData1),
					}, tutil.WithEStargzOptions(estargz.WithChunkSize(sampleChunkSize), estargz.WithCompression(srcCompression)))
					if err != nil {
						t.Fatalf("failed to build sample estargz")
					}
					br := &breakReaderAt{
						ReaderAt: stargzFile,
						success:  true,
					}
					bev := &testChunkVerifier{true}
					mcache := cache.NewMemoryCache()
					mr, err := factory(io.NewSectionReader(br, 0, stargzFile.Size()), metadata.WithDecompressors(srcCompression))
					if err != nil {
						t.Fatalf("failed to prepare metadata reader")
					}
					defer mr.Close()
					vr, err := NewReader(mr, mcache, digest.FromString(""))
					if err != nil {
						t.Fatalf("failed to make new reader: %v", err)
					}
					defer vr.Close()
					vr.verifier = bev.verifier
					vr.r.verifier = bev.verifier
					gr, err := vr.VerifyTOC(tocDigest)
					if err != nil {
						t.Fatalf("failed to verify TOC: %v", err)
					}
					// Find an ID that is not assigned to any entry.
					notexist := uint32(0)
					found := false
					for i := uint32(0); i < 1000000; i++ {
						if _, err := gr.Metadata().GetAttr(i); err != nil {
							notexist, found = i, true
							break
						}
					}
					if !found {
						t.Fatalf("free ID not found")
					}

					// tests for opening non-existing file
					_, err = gr.OpenFile(notexist)
					if err == nil {
						t.Errorf("succeeded to open file but wanted to fail")
						return
					}

					// tests failure behaviour of a file read
					tid, _, err := gr.Metadata().GetChild(gr.Metadata().RootID(), testFileName)
					if err != nil {
						t.Errorf("failed to get %q: %v", testFileName, err)
						return
					}
					fr, err := gr.OpenFile(tid)
					if err != nil {
						t.Errorf("failed to open file but wanted to succeed: %v", err)
						return
					}
					// Drop cached data and break the source/verifier as configured.
					mcache.(*cache.MemoryCache).Membuf = map[string]*bytes.Buffer{}
					br.success = rs
					bev.success = vs

					// tests for reading file
					p := make([]byte, len(sampleData1))
					n, err := fr.ReadAt(p, 0)
					if rs && vs {
						if err != nil || n != len(sampleData1) || !bytes.Equal([]byte(sampleData1), p) {
							t.Errorf("failed to read data but wanted to succeed: %v", err)
							return
						}
					} else {
						if err == nil {
							t.Errorf("succeeded to read data but wanted to fail (reader:%v,verify:%v)", rs, vs)
							return
						}
					}
				}
			}
		})
	}
}
// breakReaderAt wraps an io.ReaderAt and can be switched to fail all reads,
// emulating a broken data source.
type breakReaderAt struct {
	io.ReaderAt
	success bool
}

// ReadAt delegates to the wrapped reader while success is true; otherwise it
// always errors.
func (br *breakReaderAt) ReadAt(p []byte, off int64) (int, error) {
	if br.success {
		return br.ReaderAt.ReadAt(p, off)
	}
	return 0, fmt.Errorf("failed")
}
// testChunkVerifier produces verifiers with a fixed, switchable outcome.
type testChunkVerifier struct {
	success bool
}

// verifier returns a stub verifier whose result is the current success flag.
func (bev *testChunkVerifier) verifier(id uint32, chunkDigest string) (digest.Verifier, error) {
	return &testVerifier{bev.success}, nil
}
// testPreReader verifies the pre-read path: when several small files share a
// chunk (minChunkSize), opening one file must pre-cache its neighbours so
// later reads are served from cache without touching the source reader.
func testPreReader(t *testing.T, factory metadata.Store) {
	data64KB := string(tutil.RandomBytes(t, 64000))
	tests := []struct {
		name         string
		chunkSize    int
		minChunkSize int
		in           []tutil.TarEntry
		want         []check
	}{
		{
			name:         "several_files_in_chunk",
			minChunkSize: 8000,
			in: []tutil.TarEntry{
				tutil.Dir("foo/"),
				tutil.File("foo/foo1", data64KB),
				tutil.File("foo2", "bb"),
				tutil.File("foo22", "ccc"),
				tutil.Dir("bar/"),
				tutil.File("bar/bar.txt", "aaa"),
				tutil.File("foo3", data64KB),
			},
			// NOTE: we assume that the compressed "data64KB" is still larger than 8KB
			// landmark+dir+foo1, foo2+foo22+dir+bar.txt+foo3, TOC, footer
			want: []check{
				hasFileContentsWithPreCached("foo22", 0, "ccc", chunkInfo{"foo2", "bb", 0, 2}, chunkInfo{"bar/bar.txt", "aaa", 0, 3}, chunkInfo{"foo3", data64KB, 0, 64000}),
				hasFileContentsOffset("foo2", 0, "bb", true),
				hasFileContentsOffset("bar/bar.txt", 0, "aaa", true),
				hasFileContentsOffset("bar/bar.txt", 1, "aa", true),
				hasFileContentsOffset("bar/bar.txt", 2, "a", true),
				hasFileContentsOffset("foo3", 0, data64KB, true),
				hasFileContentsOffset("foo22", 0, "ccc", true),
				hasFileContentsOffset("foo/foo1", 0, data64KB, false),
				hasFileContentsOffset("foo/foo1", 0, data64KB, true),
				hasFileContentsOffset("foo/foo1", 1, data64KB[1:], true),
				hasFileContentsOffset("foo/foo1", 2, data64KB[2:], true),
				hasFileContentsOffset("foo/foo1", 3, data64KB[3:], true),
			},
		},
		{
			name:         "several_files_in_chunk_chunked",
			minChunkSize: 8000,
			chunkSize:    32000,
			in: []tutil.TarEntry{
				tutil.Dir("foo/"),
				tutil.File("foo/foo1", data64KB),
				tutil.File("foo2", "bb"),
				tutil.Dir("bar/"),
				tutil.File("foo3", data64KB),
			},
			// NOTE: we assume that the compressed chunk of "data64KB" is still larger than 8KB
			// landmark+dir+foo1(1), foo1(2), foo2+dir+foo3(1), foo3(2), TOC, footer
			want: []check{
				hasFileContentsWithPreCached("foo2", 0, "bb", chunkInfo{"foo3", data64KB[:32000], 0, 32000}),
				hasFileContentsOffset("foo2", 0, "bb", true),
				hasFileContentsOffset("foo2", 1, "b", true),
				hasFileContentsOffset("foo3", 0, data64KB[:len(data64KB)/2], true),
				hasFileContentsOffset("foo3", 1, data64KB[1:len(data64KB)/2], true),
				hasFileContentsOffset("foo3", 2, data64KB[2:len(data64KB)/2], true),
				hasFileContentsOffset("foo3", int64(len(data64KB)/2), data64KB[len(data64KB)/2:], false),
				hasFileContentsOffset("foo3", int64(len(data64KB)-1), data64KB[len(data64KB)-1:], true),
				hasFileContentsOffset("foo/foo1", 0, data64KB, false),
				hasFileContentsOffset("foo/foo1", 1, data64KB[1:], true),
				hasFileContentsOffset("foo/foo1", 2, data64KB[2:], true),
				hasFileContentsOffset("foo/foo1", int64(len(data64KB)/2), data64KB[len(data64KB)/2:], true),
				hasFileContentsOffset("foo/foo1", int64(len(data64KB)-1), data64KB[len(data64KB)-1:], true),
			},
		},
	}
	for _, tt := range tests {
		for srcCompresionName, srcCompression := range srcCompressions {
			srcCompression := srcCompression()
			t.Run(tt.name+"-"+srcCompresionName, func(t *testing.T) {
				opts := []tutil.BuildEStargzOption{
					tutil.WithEStargzOptions(estargz.WithCompression(srcCompression)),
				}
				if tt.chunkSize > 0 {
					opts = append(opts, tutil.WithEStargzOptions(estargz.WithChunkSize(tt.chunkSize)))
				}
				if tt.minChunkSize > 0 {
					t.Logf("minChunkSize = %d", tt.minChunkSize)
					opts = append(opts, tutil.WithEStargzOptions(estargz.WithMinChunkSize(tt.minChunkSize)))
				}
				esgz, tocDgst, err := tutil.BuildEStargz(tt.in, opts...)
				if err != nil {
					t.Fatalf("failed to build sample eStargz: %v", err)
				}
				testR := &calledReaderAt{esgz, nil}
				mr, err := factory(io.NewSectionReader(testR, 0, esgz.Size()), metadata.WithDecompressors(srcCompression))
				if err != nil {
					t.Fatalf("failed to create new reader: %v", err)
				}
				defer mr.Close()
				memcache := cache.NewMemoryCache()
				vr, err := NewReader(mr, memcache, digest.FromString(""))
				if err != nil {
					t.Fatalf("failed to make new reader: %v", err)
				}
				rr, err := vr.VerifyTOC(tocDgst)
				if err != nil {
					t.Fatalf("failed to verify TOC: %v", err)
				}
				r := rr.(*reader)
				for _, want := range tt.want {
					want(t, r, testR)
				}
			})
		}
	}
}
// check is a single assertion run against a prepared reader.
type check func(*testing.T, *reader, *calledReaderAt)

// chunkInfo describes one chunk of a named file expected to be in the cache.
type chunkInfo struct {
	name        string // file path within the blob
	data        string // expected chunk contents
	chunkOffset int64
	chunkSize   int64
}
// hasFileContentsOffset asserts that reading name at off yields contents, and
// that the read hit (fromCache == true) or missed (false) the cache, judged by
// whether the underlying reader was called.
func hasFileContentsOffset(name string, off int64, contents string, fromCache bool) check {
	return func(t *testing.T, r *reader, cr *calledReaderAt) {
		tid, err := lookup(r, name)
		if err != nil {
			t.Fatalf("failed to lookup %q", name)
		}
		ra, err := r.OpenFile(tid)
		if err != nil {
			t.Fatalf("Failed to open testing file: %v", err)
		}
		cr.called = nil // reset test
		buf := make([]byte, len(contents))
		n, err := ra.ReadAt(buf, off)
		if err != nil {
			t.Fatalf("failed to readat %q: %v", name, err)
		}
		if n != len(contents) {
			t.Fatalf("failed to read contents %q (off:%d, want:%q) got %q", name, off, longBytesView([]byte(contents)), longBytesView(buf))
		}
		if string(buf) != contents {
			t.Fatalf("unexpected content of %q: %q want %q", name, longBytesView(buf), longBytesView([]byte(contents)))
		}
		t.Logf("reader calls for %q: offsets: %+v", name, cr.called)
		if fromCache {
			if len(cr.called) != 0 {
				t.Fatalf("unexpected read on %q: offsets: %v", name, cr.called)
			}
		} else {
			if len(cr.called) == 0 {
				t.Fatalf("no call happened to reader for %q", name)
			}
		}
	}
}
// hasFileContentsWithPreCached asserts that reading name at off yields
// contents and that, as a side effect of that read, the extra chunks have been
// pre-populated into the cache with the expected data.
func hasFileContentsWithPreCached(name string, off int64, contents string, extra ...chunkInfo) check {
	return func(t *testing.T, r *reader, cr *calledReaderAt) {
		tid, err := lookup(r, name)
		if err != nil {
			t.Fatalf("failed to lookup %q", name)
		}
		ra, err := r.OpenFile(tid)
		if err != nil {
			t.Fatalf("Failed to open testing file: %v", err)
		}
		buf := make([]byte, len(contents))
		n, err := ra.ReadAt(buf, off)
		if err != nil {
			t.Fatalf("failed to readat %q: %v", name, err)
		}
		if n != len(contents) {
			t.Fatalf("failed to read contents %q (off:%d, want:%q) got %q", name, off, longBytesView([]byte(contents)), longBytesView(buf))
		}
		if string(buf) != contents {
			t.Fatalf("unexpected content of %q: %q want %q", name, longBytesView(buf), longBytesView([]byte(contents)))
		}
		// Verify each expected neighbour chunk is now in the cache.
		for _, e := range extra {
			eid, err := lookup(r, e.name)
			if err != nil {
				t.Fatalf("failed to lookup %q", e.name)
			}
			cacheID := genID(eid, e.chunkOffset, e.chunkSize)
			er, err := r.cache.Get(cacheID)
			if err != nil {
				t.Fatalf("failed to get cache %q: %+v", cacheID, e)
			}
			data, err := io.ReadAll(io.NewSectionReader(er, 0, e.chunkSize))
			er.Close()
			if err != nil {
				t.Fatalf("failed to read cache %q: %+v", cacheID, e)
			}
			if string(data) != e.data {
				t.Fatalf("unexpected contents of cache %q (%+v): %q; wanted %q", cacheID, e, longBytesView(data), longBytesView([]byte(e.data)))
			}
		}
	}
}
// lookup resolves a slash-separated path to its metadata ID by walking the
// tree from the root, one component at a time (recursing on the parent dir).
func lookup(r *reader, name string) (uint32, error) {
	name = strings.TrimPrefix(path.Clean("/"+name), "/")
	if name == "" {
		// Empty path denotes the root entry.
		return r.Metadata().RootID(), nil
	}
	dir, base := filepath.Split(name)
	parentID, err := lookup(r, dir)
	if err != nil {
		return 0, err
	}
	childID, _, err := r.Metadata().GetChild(parentID, base)
	return childID, err
}
// calledReaderAt wraps an io.ReaderAt and records the offset of every ReadAt
// call, letting tests detect whether a read reached the underlying source.
type calledReaderAt struct {
	io.ReaderAt
	called []int64 // offsets of all reads since the last reset
}

// ReadAt records the requested offset and delegates to the wrapped reader.
func (r *calledReaderAt) ReadAt(p []byte, off int64) (int, error) {
	r.called = append(r.called, off)
	return r.ReaderAt.ReadAt(p, off)
}
// longBytesView is an alias of []byte suitable for printing a long data as an omitted string to avoid long data being printed.
type longBytesView []byte

// String renders the bytes directly when short, or the first and last 50
// bytes around an "...(omit)..." marker when 100 bytes or longer.
func (b longBytesView) String() string {
	const limit = 100
	if len(b) >= limit {
		return string(b[:50]) + "...(omit)..." + string(b[len(b)-50:])
	}
	return string(b)
}

View File

@@ -0,0 +1,535 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the NOTICE.md file.
*/
package remote
import (
"context"
"fmt"
"io"
"regexp"
"sort"
"strings"
"sync"
"time"
"github.com/containerd/containerd/v2/pkg/reference"
"github.com/containerd/stargz-snapshotter/cache"
"github.com/containerd/stargz-snapshotter/fs/source"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"golang.org/x/sync/errgroup"
"golang.org/x/sync/singleflight"
)
var contentRangeRegexp = regexp.MustCompile(`bytes ([0-9]+)-([0-9]+)/([0-9]+|\\*)`)
// Blob is a remote blob that can be partially fetched, cached, periodically
// checked and refreshed.
type Blob interface {
	// Check tests whether the remote blob is still reachable/valid.
	Check() error
	// Size returns the total size of the blob in bytes.
	Size() int64
	// FetchedSize returns the number of bytes fetched so far.
	FetchedSize() int64
	// ReadAt reads len(p) bytes at offset, fetching missing data as needed.
	ReadAt(p []byte, offset int64, opts ...Option) (int, error)
	// Cache fetches the range [offset, offset+size) into the local cache.
	Cache(offset int64, size int64, opts ...Option) error
	// Refresh re-resolves the remote fetcher for this blob.
	Refresh(ctx context.Context, host source.RegistryHosts, refspec reference.Spec, desc ocispec.Descriptor) error
	// Close releases the resources held by this blob.
	Close() error
}
// blob is the default Blob implementation: a remotely-fetched blob with a
// local chunk cache, periodic connectivity checks and a replaceable fetcher.
type blob struct {
	fetcher   fetcher // current remote fetcher; replaced by Refresh
	fetcherMu sync.Mutex

	size              int64
	chunkSize         int64
	prefetchChunkSize int64

	cache cache.BlobCache

	lastCheck     time.Time // time of the last successful remote check
	lastCheckMu   sync.Mutex
	checkInterval time.Duration
	fetchTimeout  time.Duration

	fetchedRegionSet    regionSet // regions fetched so far, for FetchedSize
	fetchedRegionSetMu  sync.Mutex
	fetchedRegionGroup  singleflight.Group // dedups concurrent fetches of the same regions
	fetchedRegionCopyMu sync.Mutex

	resolver *Resolver

	closed   bool
	closedMu sync.Mutex
}
// makeBlob assembles a blob from a resolved fetcher and its cache, sizing and
// validity-check parameters.
func makeBlob(fetcher fetcher, size int64, chunkSize int64, prefetchChunkSize int64,
	blobCache cache.BlobCache, lastCheck time.Time, checkInterval time.Duration,
	r *Resolver, fetchTimeout time.Duration) *blob {
	b := &blob{
		fetcher:           fetcher,
		size:              size,
		chunkSize:         chunkSize,
		prefetchChunkSize: prefetchChunkSize,
		cache:             blobCache,
		lastCheck:         lastCheck,
		checkInterval:     checkInterval,
		fetchTimeout:      fetchTimeout,
		resolver:          r,
	}
	return b
}
// Close marks the blob closed and releases its cache. Repeated calls are
// safe; only the first one closes the cache.
func (b *blob) Close() error {
	b.closedMu.Lock()
	defer b.closedMu.Unlock()
	if !b.closed {
		b.closed = true
		return b.cache.Close()
	}
	return nil
}
// isClosed reports whether Close has already been called.
func (b *blob) isClosed() bool {
	b.closedMu.Lock()
	defer b.closedMu.Unlock()
	return b.closed
}
// Refresh re-resolves the fetcher for this blob (e.g. when credentials or
// a redirected URL expired). The re-resolved blob must report the same
// size; on success the remote-check timer is reset.
func (b *blob) Refresh(ctx context.Context, hosts source.RegistryHosts, refspec reference.Spec, desc ocispec.Descriptor) error {
	if b.isClosed() {
		return fmt.Errorf("blob is already closed")
	}
	// refresh the fetcher
	newFetcher, newSize, err := b.resolver.resolveFetcher(ctx, hosts, refspec, desc)
	if err != nil {
		return err
	}
	if newSize != b.size {
		return fmt.Errorf("Invalid size of new blob %d; want %d", newSize, b.size)
	}
	// Swap in the new fetcher and treat this successful resolution as a
	// fresh remote check.
	b.fetcherMu.Lock()
	b.fetcher = newFetcher
	b.fetcherMu.Unlock()
	b.lastCheckMu.Lock()
	b.lastCheck = time.Now()
	b.lastCheckMu.Unlock()
	return nil
}
// Check verifies the remote blob is still reachable, but at most once per
// checkInterval; calls inside the valid window are no-ops.
func (b *blob) Check() error {
	if b.isClosed() {
		return fmt.Errorf("blob is already closed")
	}
	now := time.Now()
	b.lastCheckMu.Lock()
	lastCheck := b.lastCheck
	b.lastCheckMu.Unlock()
	if now.Sub(lastCheck) < b.checkInterval {
		// Still within the valid window; skip the remote round trip.
		return nil
	}
	// Snapshot the fetcher; Refresh may swap it concurrently.
	b.fetcherMu.Lock()
	fr := b.fetcher
	b.fetcherMu.Unlock()
	if err := fr.check(); err != nil {
		// Leave lastCheck untouched so the next call re-checks this layer.
		return err
	}
	// Record the successful check.
	b.lastCheckMu.Lock()
	b.lastCheck = now
	b.lastCheckMu.Unlock()
	return nil
}
// Size returns the total size of the blob in bytes.
func (b *blob) Size() int64 {
	return b.size
}
// FetchedSize returns how many bytes of this blob have been fetched and
// cached locally so far.
func (b *blob) FetchedSize() int64 {
	b.fetchedRegionSetMu.Lock()
	defer b.fetchedRegionSetMu.Unlock()
	return b.fetchedRegionSet.totalSize()
}
// makeSyncKey derives a deterministic singleflight key from the set of
// regions to fetch, so concurrent requests for the same region set
// collapse into a single remote round trip.
func makeSyncKey(allData map[region]io.Writer) string {
	keys := make([]string, 0, len(allData))
	for reg := range allData {
		keys = append(keys, fmt.Sprintf("[%d,%d]", reg.b, reg.e))
	}
	sort.Strings(keys)
	return strings.Join(keys, ",")
}
// cacheAt ensures the chunk-aligned region covering [offset, offset+size)
// is present in the local cache, fetching only the chunks that miss.
func (b *blob) cacheAt(offset int64, size int64, fr fetcher, cacheOpts *options) error {
	fetchReg := region{floor(offset, b.chunkSize), ceil(offset+size-1, b.chunkSize) - 1}
	discard := make(map[region]io.Writer)
	err := b.walkChunks(fetchReg, func(reg region) error {
		if r, err := b.cache.Get(fr.genID(reg), cacheOpts.cacheOpts...); err == nil {
			return r.Close() // nop if the cache hits
		}
		// Cache miss: fetch the chunk but discard its body — only the side
		// effect of populating the cache is wanted here.
		discard[reg] = io.Discard
		return nil
	})
	if err != nil {
		return err
	}
	return b.fetchRange(discard, cacheOpts)
}
// Cache prefetches [offset, offset+size) into the local cache. When
// prefetchChunkSize exceeds chunkSize, the range is split into
// prefetch-sized pieces fetched concurrently.
func (b *blob) Cache(offset int64, size int64, opts ...Option) error {
	if b.isClosed() {
		return fmt.Errorf("blob is already closed")
	}
	var cacheOpts options
	for _, o := range opts {
		o(&cacheOpts)
	}
	// Snapshot the fetcher; Refresh may swap it concurrently.
	b.fetcherMu.Lock()
	fr := b.fetcher
	b.fetcherMu.Unlock()
	if b.prefetchChunkSize <= b.chunkSize {
		return b.cacheAt(offset, size, fr, &cacheOpts)
	}
	eg, _ := errgroup.WithContext(context.Background())
	// Round the prefetch unit down to a multiple of the chunk size.
	fetchSize := b.chunkSize * (b.prefetchChunkSize / b.chunkSize)
	end := offset + size
	for i := offset; i < end; i += fetchSize {
		i, l := i, fetchSize // capture per-iteration values for the goroutine below
		if i+l > end {
			l = end - i // clamp the final piece to the requested range
		}
		eg.Go(func() error {
			return b.cacheAt(i, l, fr, &cacheOpts)
		})
	}
	return eg.Wait()
}
// ReadAt reads remote chunks from specified offset for the buffer size.
// It tries to fetch as many chunks as possible from local cache.
// We can configure this function with options.
//
// Chunks that miss the cache are fetched as whole chunks (and cached) in
// one batched request via fetchRange; only the slice of each chunk that
// overlaps p is copied into p.
func (b *blob) ReadAt(p []byte, offset int64, opts ...Option) (int, error) {
	if b.isClosed() {
		return 0, fmt.Errorf("blob is already closed")
	}
	if len(p) == 0 || offset > b.size {
		return 0, nil
	}
	// Make the buffer chunk aligned
	allRegion := region{floor(offset, b.chunkSize), ceil(offset+int64(len(p))-1, b.chunkSize) - 1}
	allData := make(map[region]io.Writer)
	var readAtOpts options
	for _, o := range opts {
		o(&readAtOpts)
	}
	// Fetcher can be suddenly updated so we take and use the snapshot of it for
	// consistency.
	b.fetcherMu.Lock()
	fr := b.fetcher
	b.fetcherMu.Unlock()
	b.walkChunks(allRegion, func(chunk region) error {
		var (
			base         = positive(chunk.b - offset)                      // where this chunk's data lands in p
			lowerUnread  = positive(offset - chunk.b)                      // leading chunk bytes outside p
			upperUnread  = positive(chunk.e + 1 - (offset + int64(len(p)))) // trailing chunk bytes outside p
			expectedSize = chunk.size() - upperUnread - lowerUnread        // bytes of this chunk that land in p
		)
		// Check if the content exists in the cache
		r, err := b.cache.Get(fr.genID(chunk), readAtOpts.cacheOpts...)
		if err == nil {
			defer r.Close()
			n, err := r.ReadAt(p[base:base+expectedSize], lowerUnread)
			if (err == nil || err == io.EOF) && int64(n) == expectedSize {
				return nil
			}
		}
		// We missed cache. Take it from remote registry.
		// We get the whole chunk here and add it to the cache so that following
		// reads against neighboring chunks can take the data without making HTTP requests.
		allData[chunk] = newBytesWriter(p[base:base+expectedSize], lowerUnread)
		return nil
	})
	// Read required data
	if err := b.fetchRange(allData, &readAtOpts); err != nil {
		return 0, err
	}
	// Adjust the buffer size according to the blob size
	if remain := b.size - offset; int64(len(p)) >= remain {
		if remain < 0 {
			remain = 0
		}
		p = p[:remain]
	}
	return len(p), nil
}
// fetchRegions fetches all specified chunks from remote blob and puts it in the local cache.
// It must be called from within fetchRange and need to ensure that it is inside the singleflight `Do` operation.
//
// fetched records, per requested region, whether its data was written to
// the corresponding allData writer; callers inspect it after Do returns.
func (b *blob) fetchRegions(allData map[region]io.Writer, fetched map[region]bool, opts *options) error {
	if len(allData) == 0 {
		return nil
	}
	// Fetcher can be suddenly updated so we take and use the snapshot of it for
	// consistency.
	b.fetcherMu.Lock()
	fr := b.fetcher
	b.fetcherMu.Unlock()
	// request missed regions
	var req []region
	for reg := range allData {
		req = append(req, reg)
		fetched[reg] = false
	}
	fetchCtx, cancel := context.WithTimeout(context.Background(), b.fetchTimeout)
	defer cancel()
	// NOTE(review): a caller-supplied context replaces the timeout context
	// entirely, so fetchTimeout is not applied in that case — confirm intended.
	if opts.ctx != nil {
		fetchCtx = opts.ctx
	}
	mr, err := fr.fetch(fetchCtx, req, true)
	if err != nil {
		return err
	}
	defer mr.Close()
	// Update the check timer because we succeeded to access the blob
	b.lastCheckMu.Lock()
	b.lastCheck = time.Now()
	b.lastCheckMu.Unlock()
	// chunk and cache responsed data. Regions must be aligned by chunk size.
	// TODO: Reorganize remoteData to make it be aligned by chunk size
	for {
		reg, p, err := mr.Next()
		if err == io.EOF {
			break
		} else if err != nil {
			return fmt.Errorf("failed to read multipart resp: %w", err)
		}
		if err := b.walkChunks(reg, func(chunk region) (retErr error) {
			id := fr.genID(chunk)
			cw, err := b.cache.Add(id, opts.cacheOpts...)
			if err != nil {
				return err
			}
			defer cw.Close()
			w := io.Writer(cw)
			// If this chunk is one of the targets, write the content to the
			// passed reader too.
			if _, ok := fetched[chunk]; ok {
				w = io.MultiWriter(w, allData[chunk])
			}
			// Copy the target chunk
			if _, err := io.CopyN(w, p, chunk.size()); err != nil {
				// Abort so a partial chunk is never committed to the cache.
				cw.Abort()
				return err
			}
			// Add the target chunk to the cache
			if err := cw.Commit(); err != nil {
				return err
			}
			b.fetchedRegionSetMu.Lock()
			b.fetchedRegionSet.add(chunk)
			b.fetchedRegionSetMu.Unlock()
			fetched[chunk] = true
			return nil
		}); err != nil {
			return fmt.Errorf("failed to get chunks: %w", err)
		}
	}
	// Check all chunks are fetched
	var unfetched []region
	for c, b := range fetched {
		if !b {
			unfetched = append(unfetched, c)
		}
	}
	if unfetched != nil {
		return fmt.Errorf("failed to fetch region %v", unfetched)
	}
	return nil
}
// fetchRange fetches all specified chunks from local cache and remote blob.
func (b *blob) fetchRange(allData map[region]io.Writer, opts *options) error {
	if len(allData) == 0 {
		return nil
	}
	// We build a key based on regions we need to fetch and pass it to singleflightGroup.Do(...)
	// to block simultaneous same requests. Once the request is finished and the data is ready,
	// all blocked callers will be unblocked and that same data will be returned by all blocked callers.
	key := makeSyncKey(allData)
	fetched := make(map[region]bool)
	_, err, shared := b.fetchedRegionGroup.Do(key, func() (interface{}, error) {
		return nil, b.fetchRegions(allData, fetched, opts)
	})
	// When unblocked try to read from cache in case if there were no errors
	// If we fail reading from cache, fetch from remote registry again
	if err == nil && shared {
		for reg := range allData {
			// Regions present in fetched were written by this caller's own
			// singleflight execution; only the other callers need to copy
			// the shared result out of the cache.
			if _, ok := fetched[reg]; ok {
				continue
			}
			err = b.walkChunks(reg, func(chunk region) error {
				b.fetcherMu.Lock()
				fr := b.fetcher
				b.fetcherMu.Unlock()
				// Check if the content exists in the cache
				// And if exists, read from cache
				r, err := b.cache.Get(fr.genID(chunk), opts.cacheOpts...)
				if err != nil {
					return err
				}
				defer r.Close()
				rr := io.NewSectionReader(r, 0, chunk.size())
				// Copy the target chunk
				b.fetchedRegionCopyMu.Lock()
				defer b.fetchedRegionCopyMu.Unlock()
				if _, err := io.CopyN(allData[chunk], rr, chunk.size()); err != nil {
					return err
				}
				return nil
			})
			if err != nil {
				break
			}
		}
		// if we cannot read the data from cache, do fetch again
		// NOTE(review): this retries recursively with no bound if cache
		// reads keep failing — confirm a cap is not needed.
		if err != nil {
			return b.fetchRange(allData, opts)
		}
	}
	return err
}
// walkFunc is the callback walkChunks invokes for each chunk-sized region.
type walkFunc func(reg region) error

// walkChunks walks chunks from begin to end in order in the specified region.
// specified region must be aligned by chunk size.
func (b *blob) walkChunks(allRegion region, walkFn walkFunc) error {
	if allRegion.b%b.chunkSize != 0 {
		return fmt.Errorf("region (%d, %d) must be aligned by chunk size",
			allRegion.b, allRegion.e)
	}
	for i := allRegion.b; i <= allRegion.e && i < b.size; i += b.chunkSize {
		reg := region{i, i + b.chunkSize - 1}
		// Clamp the final chunk to the end of the blob.
		if reg.e >= b.size {
			reg.e = b.size - 1
		}
		if err := walkFn(reg); err != nil {
			return err
		}
	}
	return nil
}
// newBytesWriter returns an io.Writer that copies the slice of the
// incoming stream starting at stream offset destOff into dest.
func newBytesWriter(dest []byte, destOff int64) io.Writer {
	return &bytesWriter{dest: dest, destOff: destOff}
}

// bytesWriter writes one window of a sequential stream into a fixed
// buffer: dest receives stream bytes [destOff, destOff+len(dest)), and
// current counts how many stream bytes have been consumed so far.
type bytesWriter struct {
	dest    []byte
	destOff int64
	current int64
}

// Write consumes p as the next piece of the stream, copying only the part
// that overlaps the destination window. It always reports all of p as
// written so io.Copy-style callers keep streaming past the window.
func (bw *bytesWriter) Write(p []byte) (int, error) {
	defer func() { bw.current += int64(len(p)) }()
	clamp := func(n int64) int64 {
		if n < 0 {
			return 0
		}
		return n
	}
	destBase := clamp(bw.current - bw.destOff)                    // write position within dest
	pBegin := clamp(bw.destOff - bw.current)                      // first byte of p inside the window
	pEnd := clamp(bw.destOff + int64(len(bw.dest)) - bw.current)  // one past the last usable byte of p
	switch {
	case destBase > int64(len(bw.dest)):
		return len(p), nil // already past the window
	case pBegin >= int64(len(p)):
		return len(p), nil // p ends before the window begins
	}
	if pEnd > int64(len(p)) {
		pEnd = int64(len(p))
	}
	copy(bw.dest[destBase:], p[pBegin:pEnd])
	return len(p), nil
}
// floor rounds n down to the nearest multiple of unit.
func floor(n int64, unit int64) int64 {
	return n - n%unit
}

// ceil returns the next multiple of unit after floor(n, unit); note that
// an exact multiple of unit is still rounded up by one unit.
func ceil(n int64, unit int64) int64 {
	return floor(n, unit) + unit
}

// positive clamps negative values to zero.
func positive(n int64) int64 {
	if n >= 0 {
		return n
	}
	return 0
}

View File

@@ -0,0 +1,728 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the NOTICE.md file.
*/
package remote
import (
"context"
"crypto/rand"
"crypto/sha256"
"fmt"
"io"
"math/big"
"mime"
"mime/multipart"
"net/http"
"path"
"strconv"
"strings"
"sync"
"time"
"github.com/containerd/containerd/v2/core/remotes/docker"
"github.com/containerd/containerd/v2/pkg/reference"
"github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/containerd/stargz-snapshotter/cache"
"github.com/containerd/stargz-snapshotter/fs/config"
commonmetrics "github.com/containerd/stargz-snapshotter/fs/metrics/common"
"github.com/containerd/stargz-snapshotter/fs/source"
"github.com/hashicorp/go-multierror"
rhttp "github.com/hashicorp/go-retryablehttp"
digest "github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// Defaults applied by NewResolver to zero-valued config.BlobConfig fields.
const (
	defaultChunkSize        = 50000  // bytes per cached chunk
	defaultValidIntervalSec = 60     // seconds between remote availability checks
	defaultFetchTimeoutSec  = 300    // timeout of a single fetch, in seconds
	defaultMaxRetries       = 5      // HTTP retry attempts
	defaultMinWaitMSec      = 30     // minimum retry backoff, in milliseconds
	defaultMaxWaitMSec      = 300000 // maximum retry backoff, in milliseconds
)
// NewResolver returns a Resolver for remote blobs, filling in defaults
// for zero-valued fields of cfg. handlers optionally provide alternative
// (non-registry) content sources tried before the default HTTP fetcher.
func NewResolver(cfg config.BlobConfig, handlers map[string]Handler) *Resolver {
	if cfg.ChunkSize == 0 { // zero means "use default chunk size"
		cfg.ChunkSize = defaultChunkSize
	}
	if cfg.ValidInterval == 0 { // zero means "use default interval"
		cfg.ValidInterval = defaultValidIntervalSec
	}
	if cfg.CheckAlways {
		// A zero interval makes every access re-check the remote blob.
		cfg.ValidInterval = 0
	}
	if cfg.FetchTimeoutSec == 0 {
		cfg.FetchTimeoutSec = defaultFetchTimeoutSec
	}
	if cfg.MaxRetries == 0 {
		cfg.MaxRetries = defaultMaxRetries
	}
	if cfg.MinWaitMSec == 0 {
		cfg.MinWaitMSec = defaultMinWaitMSec
	}
	if cfg.MaxWaitMSec == 0 {
		cfg.MaxWaitMSec = defaultMaxWaitMSec
	}
	return &Resolver{
		blobConfig: cfg,
		handlers:   handlers,
	}
}
// Resolver resolves layer descriptors into remote Blobs.
type Resolver struct {
	blobConfig config.BlobConfig  // tuning knobs, defaults already applied
	handlers   map[string]Handler // optional alternative content sources
}
// fetcher abstracts chunked download of a single blob.
type fetcher interface {
	// fetch downloads the given regions; when retry is true the fetcher
	// may refresh its state and retry once on certain failures.
	fetch(ctx context.Context, rs []region, retry bool) (multipartReadCloser, error)
	// check verifies the blob is still accessible.
	check() error
	// genID returns a stable cache key for a region of this blob.
	genID(reg region) string
}
// Resolve resolves the layer described by desc under refspec into a Blob
// backed by blobCache.
func (r *Resolver) Resolve(ctx context.Context, hosts source.RegistryHosts, refspec reference.Spec, desc ocispec.Descriptor, blobCache cache.BlobCache) (Blob, error) {
	f, size, err := r.resolveFetcher(ctx, hosts, refspec, desc)
	if err != nil {
		return nil, err
	}
	blobConfig := &r.blobConfig
	return makeBlob(f,
		size,
		blobConfig.ChunkSize,
		blobConfig.PrefetchChunkSize,
		blobCache,
		time.Now(), // the fetcher was just resolved, so the check timer starts now
		time.Duration(blobConfig.ValidInterval)*time.Second,
		r,
		time.Duration(blobConfig.FetchTimeoutSec)*time.Second), nil
}
// resolveFetcher creates a fetcher for desc, trying each registered
// handler first and falling back to the default HTTP registry fetcher.
func (r *Resolver) resolveFetcher(ctx context.Context, hosts source.RegistryHosts, refspec reference.Spec, desc ocispec.Descriptor) (f fetcher, size int64, err error) {
	blobConfig := &r.blobConfig
	fc := &fetcherConfig{
		hosts:       hosts,
		refspec:     refspec,
		desc:        desc,
		maxRetries:  blobConfig.MaxRetries,
		minWaitMSec: time.Duration(blobConfig.MinWaitMSec) * time.Millisecond,
		maxWaitMSec: time.Duration(blobConfig.MaxWaitMSec) * time.Millisecond,
	}
	var handlersErr error
	for name, p := range r.handlers {
		// TODO: allow to configure the selection of readers based on the hostname in refspec
		r, size, err := p.Handle(ctx, desc) // NOTE: this r shadows the receiver inside the loop
		if err != nil {
			// Remember why each handler declined and try the next one.
			handlersErr = multierror.Append(handlersErr, err)
			continue
		}
		log.G(ctx).WithField("handler name", name).WithField("ref", refspec.String()).WithField("digest", desc.Digest).
			Debugf("contents is provided by a handler")
		return &remoteFetcher{r}, size, nil
	}
	log.G(ctx).WithError(handlersErr).WithField("ref", refspec.String()).WithField("digest", desc.Digest).Debugf("using default handler")
	hf, size, err := newHTTPFetcher(ctx, fc)
	if err != nil {
		return nil, 0, err
	}
	if blobConfig.ForceSingleRangeMode {
		hf.singleRangeMode()
	}
	return hf, size, err
}
// fetcherConfig carries everything needed to construct an httpFetcher.
type fetcherConfig struct {
	hosts       source.RegistryHosts // resolves refspec to candidate registry hosts
	refspec     reference.Spec       // image reference being pulled
	desc        ocispec.Descriptor   // descriptor of the target layer blob
	maxRetries  int                  // HTTP retry attempts
	minWaitMSec time.Duration        // minimum retry backoff
	maxWaitMSec time.Duration        // maximum retry backoff
}
func jitter(duration time.Duration) time.Duration {
if duration <= 0 {
return duration
}
b, err := rand.Int(rand.Reader, big.NewInt(int64(duration)))
if err != nil {
panic(err)
}
return time.Duration(b.Int64() + int64(duration))
}
// backoffStrategy extends retryablehttp's DefaultBackoff to add a random jitter to avoid overwhelming the repository
// when it comes back online
// DefaultBackoff either tries to parse the 'Retry-After' header of the response; or, it uses an exponential backoff
// 2 ^ numAttempts, limited by max
func backoffStrategy(min, max time.Duration, attemptNum int, resp *http.Response) time.Duration {
delayTime := rhttp.DefaultBackoff(min, max, attemptNum, resp)
return jitter(delayTime)
}
// retryStrategy extends retryablehttp's DefaultRetryPolicy by debug
// logging the error whenever a retry is decided. DefaultRetryPolicy
// retries on most transport errors and on 429/5xx (except 501) statuses.
func retryStrategy(ctx context.Context, resp *http.Response, err error) (bool, error) {
	shouldRetry, policyErr := rhttp.DefaultRetryPolicy(ctx, resp, err)
	if shouldRetry {
		log.G(ctx).WithError(err).Debugf("Retrying request")
	}
	return shouldRetry, policyErr
}
// newHTTPFetcher resolves an httpFetcher (and the blob size) for the
// layer described by fc, trying each configured registry host in order
// until one can serve the blob.
func newHTTPFetcher(ctx context.Context, fc *fetcherConfig) (*httpFetcher, int64, error) {
	reghosts, err := fc.hosts(fc.refspec)
	if err != nil {
		return nil, 0, err
	}
	desc := fc.desc
	if desc.Digest.String() == "" {
		return nil, 0, fmt.Errorf("Digest is mandatory in layer descriptor")
	}
	digest := desc.Digest
	pullScope, err := docker.RepositoryScope(fc.refspec, false)
	if err != nil {
		return nil, 0, err
	}
	// Try to create fetcher until succeeded
	rErr := fmt.Errorf("failed to resolve")
	for _, host := range reghosts {
		if host.Host == "" || strings.Contains(host.Host, "/") {
			rErr = fmt.Errorf("invalid destination (host %q, ref:%q, digest:%q): %w", host.Host, fc.refspec, digest, rErr)
			continue // Try another
		}
		// Prepare transport with authorization functionality
		tr := host.Client.Transport
		timeout := host.Client.Timeout
		if rt, ok := tr.(*rhttp.RoundTripper); ok {
			// A retryable transport: install the configured retry policy.
			rt.Client.RetryMax = fc.maxRetries
			rt.Client.RetryWaitMin = fc.minWaitMSec
			rt.Client.RetryWaitMax = fc.maxWaitMSec
			rt.Client.Backoff = backoffStrategy
			rt.Client.CheckRetry = retryStrategy
			timeout = rt.Client.HTTPClient.Timeout
		}
		if host.Authorizer != nil {
			tr = &transport{
				inner: tr,
				auth:  host.Authorizer,
				scope: pullScope,
			}
		}
		// Resolve redirection and get blob URL
		blobURL := fmt.Sprintf("%s://%s/%s/blobs/%s",
			host.Scheme,
			path.Join(host.Host, host.Path),
			strings.TrimPrefix(fc.refspec.Locator, fc.refspec.Hostname()+"/"),
			digest)
		url, header, err := redirect(ctx, blobURL, tr, timeout, host.Header)
		if err != nil {
			rErr = fmt.Errorf("failed to redirect (host %q, ref:%q, digest:%q): %v: %w", host.Host, fc.refspec, digest, err, rErr)
			continue // Try another
		}
		// Get size information
		// TODO: we should try to use the Size field in the descriptor here.
		start := time.Now() // start time before getting layer header
		size, err := getSize(ctx, url, tr, timeout, header)
		commonmetrics.MeasureLatencyInMilliseconds(commonmetrics.StargzHeaderGet, digest, start) // time to get layer header
		if err != nil {
			rErr = fmt.Errorf("failed to get size (host %q, ref:%q, digest:%q): %v: %w", host.Host, fc.refspec, digest, err, rErr)
			continue // Try another
		}
		// Hit one destination
		return &httpFetcher{
			url:       url,
			tr:        tr,
			blobURL:   blobURL,
			digest:    digest,
			timeout:   timeout,
			header:    header,
			orgHeader: host.Header,
		}, size, nil
	}
	return nil, 0, fmt.Errorf("cannot resolve layer: %w", rErr)
}
// transport decorates an inner http.RoundTripper with docker-style
// authorization for a fixed pull scope.
type transport struct {
	inner http.RoundTripper
	auth  docker.Authorizer
	scope string
}
// RoundTrip implements http.RoundTripper. It authorizes every request for
// the fixed pull scope and, on a 401 response, refreshes credentials via
// the authorizer and retries the request once.
func (tr *transport) RoundTrip(req *http.Request) (*http.Response, error) {
	ctx := docker.WithScope(req.Context(), tr.scope)
	roundTrip := func(req *http.Request) (*http.Response, error) {
		// authorize the request using docker.Authorizer
		if err := tr.auth.Authorize(ctx, req); err != nil {
			return nil, err
		}
		// send the request
		return tr.inner.RoundTrip(req)
	}
	resp, err := roundTrip(req)
	if err != nil {
		return nil, err
	}
	// TODO: support more status codes and retries
	if resp.StatusCode == http.StatusUnauthorized {
		log.G(ctx).Infof("Received status code: %v. Refreshing creds...", resp.Status)
		// prepare authorization for the target host using docker.Authorizer
		if err := tr.auth.AddResponses(ctx, []*http.Response{resp}); err != nil {
			if errdefs.IsNotImplemented(err) {
				// The authorizer cannot refresh; surface the 401 as-is.
				return resp, nil
			}
			return nil, err
		}
		// re-authorize and send the request
		// NOTE(review): the 401 response body is not drained/closed before
		// the retry — confirm whether this blocks connection reuse.
		return roundTrip(req.Clone(ctx))
	}
	return resp, nil
}
// redirect resolves the final (possibly redirected) URL for blobURL by
// issuing a tiny ranged GET, returning the URL to fetch from plus the
// headers that must accompany subsequent requests to it.
func redirect(ctx context.Context, blobURL string, tr http.RoundTripper, timeout time.Duration, header http.Header) (url string, withHeader http.Header, err error) {
	if timeout > 0 {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, timeout)
		defer cancel()
	}
	// We use GET request for redirect.
	// gcr.io returns 200 on HEAD without Location header (2020).
	// ghcr.io returns 200 on HEAD without Location header (2020).
	req, err := http.NewRequestWithContext(ctx, "GET", blobURL, nil)
	if err != nil {
		return "", nil, fmt.Errorf("failed to make request to the registry: %w", err)
	}
	req.Header = http.Header{}
	for k, v := range header {
		req.Header[k] = v
	}
	req.Close = false
	// Only two bytes are requested; we care about status/Location, not the body.
	req.Header.Set("Range", "bytes=0-1")
	res, err := tr.RoundTrip(req)
	if err != nil {
		return "", nil, fmt.Errorf("failed to request: %w", err)
	}
	defer func() {
		// Drain the small body so the connection can be reused.
		io.Copy(io.Discard, res.Body)
		res.Body.Close()
	}()
	if res.StatusCode/100 == 2 {
		url = blobURL
		withHeader = header
	} else if redir := res.Header.Get("Location"); redir != "" && res.StatusCode/100 == 3 {
		// TODO: Support nested redirection
		url = redir
		// Do not pass headers to the redirected location.
	} else {
		return "", nil, fmt.Errorf("failed to access to the registry with code %v", res.StatusCode)
	}
	return
}
// getSize determines the blob's size, preferring a HEAD request and
// falling back to a tiny ranged GET for servers that reject HEAD.
func getSize(ctx context.Context, url string, tr http.RoundTripper, timeout time.Duration, header http.Header) (int64, error) {
	if timeout > 0 {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, timeout)
		defer cancel()
	}
	req, err := http.NewRequestWithContext(ctx, "HEAD", url, nil)
	if err != nil {
		return 0, err
	}
	req.Header = http.Header{}
	for k, v := range header {
		req.Header[k] = v
	}
	req.Close = false
	res, err := tr.RoundTrip(req)
	if err != nil {
		return 0, err
	}
	defer res.Body.Close()
	if res.StatusCode == http.StatusOK {
		return strconv.ParseInt(res.Header.Get("Content-Length"), 10, 64)
	}
	headStatusCode := res.StatusCode
	// Failed to do HEAD request. Fall back to GET.
	// ghcr.io (https://github-production-container-registry.s3.amazonaws.com) doesn't allow
	// HEAD request (2020).
	req, err = http.NewRequestWithContext(ctx, "GET", url, nil)
	if err != nil {
		return 0, fmt.Errorf("failed to make request to the registry: %w", err)
	}
	req.Header = http.Header{}
	for k, v := range header {
		req.Header[k] = v
	}
	req.Close = false
	// Request only two bytes; the full size comes from the response headers.
	req.Header.Set("Range", "bytes=0-1")
	res, err = tr.RoundTrip(req)
	if err != nil {
		return 0, fmt.Errorf("failed to request: %w", err)
	}
	defer func() {
		// Drain the small body so the connection can be reused.
		io.Copy(io.Discard, res.Body)
		res.Body.Close()
	}()
	if res.StatusCode == http.StatusOK {
		return strconv.ParseInt(res.Header.Get("Content-Length"), 10, 64)
	} else if res.StatusCode == http.StatusPartialContent {
		// Content-Range carries the total size: "bytes b-e/size".
		_, size, err := parseRange(res.Header.Get("Content-Range"))
		return size, err
	}
	return 0, fmt.Errorf("failed to get size with code (HEAD=%v, GET=%v)",
		headStatusCode, res.StatusCode)
}
// httpFetcher fetches a blob via HTTP range requests. url may be a
// redirected (e.g. presigned) location that is refreshed on 403; blobURL
// is the stable registry URL used for cache keys and refreshes.
type httpFetcher struct {
	url           string     // current (possibly redirected) URL to fetch from
	urlMu         sync.Mutex // guards url (and header, which refreshURL updates with it)
	tr            http.RoundTripper
	blobURL       string // original registry blob URL
	digest        digest.Digest
	singleRange   bool       // issue one range per request (see fetch's 400 fallback)
	singleRangeMu sync.Mutex // guards singleRange
	timeout       time.Duration
	header        http.Header // headers to send to url
	orgHeader     http.Header // headers for blobURL, used when re-resolving redirects
}
// multipartReadCloser iterates over the parts of a (possibly multipart)
// ranged response, yielding each part with the blob region it covers.
// Next returns io.EOF once all parts have been consumed.
type multipartReadCloser interface {
	Next() (region, io.Reader, error)
	Close() error
}
// fetch downloads the requested regions with an HTTP (multi-)range
// request. When retry is true it may refresh the redirected URL (on 403)
// or fall back to single-range mode (on 400) and retry once.
func (f *httpFetcher) fetch(ctx context.Context, rs []region, retry bool) (multipartReadCloser, error) {
	if len(rs) == 0 {
		return nil, fmt.Errorf("no request queried")
	}
	var (
		tr              = f.tr
		singleRangeMode = f.isSingleRangeMode()
	)
	// squash requesting chunks for reducing the total size of request header
	// (servers generally have limits for the size of headers)
	// TODO: when our request has too many ranges, we need to divide it into
	// multiple requests to avoid huge header.
	var s regionSet
	for _, reg := range rs {
		s.add(reg)
	}
	requests := s.rs
	if singleRangeMode {
		// Squash requests if the layer doesn't support multi range.
		requests = []region{superRegion(requests)}
	}
	// Request to the registry
	f.urlMu.Lock()
	url := f.url
	f.urlMu.Unlock()
	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
	if err != nil {
		return nil, err
	}
	req.Header = http.Header{}
	for k, v := range f.header {
		req.Header[k] = v
	}
	var ranges string
	for _, reg := range requests {
		ranges += fmt.Sprintf("%d-%d,", reg.b, reg.e)
	}
	// Drop the trailing comma from the assembled range list.
	req.Header.Add("Range", fmt.Sprintf("bytes=%s", ranges[:len(ranges)-1]))
	req.Header.Add("Accept-Encoding", "identity")
	req.Close = false
	// Recording the roundtrip latency for remote registry GET operation.
	start := time.Now()
	res, err := tr.RoundTrip(req) // NOT DefaultClient; don't want redirects
	commonmetrics.MeasureLatencyInMilliseconds(commonmetrics.RemoteRegistryGet, f.digest, start)
	if err != nil {
		return nil, err
	}
	if res.StatusCode == http.StatusOK {
		// We are getting the whole blob in one part (= status 200)
		size, err := strconv.ParseInt(res.Header.Get("Content-Length"), 10, 64)
		if err != nil {
			return nil, fmt.Errorf("failed to parse Content-Length: %w", err)
		}
		return newSinglePartReader(region{0, size - 1}, res.Body), nil
	} else if res.StatusCode == http.StatusPartialContent {
		mediaType, params, err := mime.ParseMediaType(res.Header.Get("Content-Type"))
		if err != nil {
			return nil, fmt.Errorf("invalid media type %q: %w", mediaType, err)
		}
		if strings.HasPrefix(mediaType, "multipart/") {
			// We are getting a set of chunks as a multipart body.
			return newMultiPartReader(res.Body, params["boundary"]), nil
		}
		// We are getting single range
		reg, _, err := parseRange(res.Header.Get("Content-Range"))
		if err != nil {
			return nil, fmt.Errorf("failed to parse Content-Range: %w", err)
		}
		return newSinglePartReader(reg, res.Body), nil
	} else if retry && res.StatusCode == http.StatusForbidden {
		log.G(ctx).Infof("Received status code: %v. Refreshing URL and retrying...", res.Status)
		// re-redirect and retry this once.
		// NOTE(review): res.Body is not closed on this and the following
		// non-2xx paths — confirm whether this leaks the connection.
		if err := f.refreshURL(ctx); err != nil {
			return nil, fmt.Errorf("failed to refresh URL on %v: %w", res.Status, err)
		}
		return f.fetch(ctx, rs, false)
	} else if retry && res.StatusCode == http.StatusBadRequest && !singleRangeMode {
		log.G(ctx).Infof("Received status code: %v. Setting single range mode and retrying...", res.Status)
		// gcr.io (https://storage.googleapis.com) returns 400 on multi-range request (2020 #81)
		f.singleRangeMode()            // fallbacks to singe range request mode
		return f.fetch(ctx, rs, false) // retries with the single range mode
	}
	return nil, fmt.Errorf("unexpected status code: %v", res.Status)
}
// check probes the blob with a tiny ranged GET to confirm it is still
// accessible; on 403 it attempts to refresh the redirected URL once.
func (f *httpFetcher) check() error {
	ctx := context.Background()
	if f.timeout > 0 {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, f.timeout)
		defer cancel()
	}
	f.urlMu.Lock()
	url := f.url
	f.urlMu.Unlock()
	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
	if err != nil {
		return fmt.Errorf("check failed: failed to make request: %w", err)
	}
	req.Header = http.Header{}
	for k, v := range f.header {
		req.Header[k] = v
	}
	req.Close = false
	// Two bytes suffice to confirm accessibility.
	req.Header.Set("Range", "bytes=0-1")
	res, err := f.tr.RoundTrip(req)
	if err != nil {
		return fmt.Errorf("check failed: failed to request to registry: %w", err)
	}
	defer func() {
		// Drain the small body so the connection can be reused.
		io.Copy(io.Discard, res.Body)
		res.Body.Close()
	}()
	if res.StatusCode == http.StatusOK || res.StatusCode == http.StatusPartialContent {
		return nil
	} else if res.StatusCode == http.StatusForbidden {
		// Try to re-redirect this blob
		rCtx := context.Background()
		if f.timeout > 0 {
			var rCancel context.CancelFunc
			rCtx, rCancel = context.WithTimeout(rCtx, f.timeout)
			defer rCancel()
		}
		if err := f.refreshURL(rCtx); err == nil {
			return nil
		}
		return fmt.Errorf("failed to refresh URL on status %v", res.Status)
	}
	return fmt.Errorf("unexpected status code %v", res.StatusCode)
}
// refreshURL re-resolves the redirected blob URL (and the headers to send
// with it) from the original blob URL, e.g. after a presigned URL expired.
func (f *httpFetcher) refreshURL(ctx context.Context) error {
	newURL, headers, err := redirect(ctx, f.blobURL, f.tr, f.timeout, f.orgHeader)
	if err != nil {
		return err
	}
	f.urlMu.Lock()
	defer f.urlMu.Unlock()
	f.url = newURL
	f.header = headers
	return nil
}
// genID returns a stable cache key for the given region of this blob,
// derived from the stable blob URL (not the volatile redirected URL).
func (f *httpFetcher) genID(reg region) string {
	key := fmt.Sprintf("%s-%d-%d", f.blobURL, reg.b, reg.e)
	return fmt.Sprintf("%x", sha256.Sum256([]byte(key)))
}
// singleRangeMode permanently switches this fetcher to issuing a single
// range per request (for servers that reject multi-range requests).
func (f *httpFetcher) singleRangeMode() {
	f.singleRangeMu.Lock()
	defer f.singleRangeMu.Unlock()
	f.singleRange = true
}

// isSingleRangeMode reports whether single-range mode is active.
func (f *httpFetcher) isSingleRangeMode() bool {
	f.singleRangeMu.Lock()
	defer f.singleRangeMu.Unlock()
	return f.singleRange
}
// newSinglePartReader wraps a plain response body as a multipartReadCloser
// that yields exactly one part covering reg.
func newSinglePartReader(reg region, rc io.ReadCloser) multipartReadCloser {
	sr := &singlepartReader{Closer: rc, r: rc, reg: reg}
	return sr
}
// singlepartReader adapts a single reader to the multipartReadCloser
// interface; Next yields the reader once, then io.EOF.
type singlepartReader struct {
	io.Closer
	r      io.Reader
	reg    region // the blob region the reader covers
	called bool   // set after the single part has been yielded
}
// Next yields the single wrapped part on the first call and io.EOF on
// every subsequent call.
func (sr *singlepartReader) Next() (region, io.Reader, error) {
	if sr.called {
		return region{}, nil, io.EOF
	}
	sr.called = true
	return sr.reg, sr.r, nil
}
// newMultiPartReader wraps a multipart/byteranges response body (with the
// given MIME boundary) as a multipartReadCloser.
func newMultiPartReader(rc io.ReadCloser, boundary string) multipartReadCloser {
	return &multipartReader{
		Closer: rc,
		m:      multipart.NewReader(rc, boundary),
	}
}
// multipartReader adapts a mime/multipart reader to the
// multipartReadCloser interface, one part per byte range.
type multipartReader struct {
	io.Closer
	m *multipart.Reader
}
// Next returns the next multipart part together with the blob region
// declared by its Content-Range header; io.EOF comes from NextPart once
// all parts are consumed.
func (sr *multipartReader) Next() (region, io.Reader, error) {
	part, err := sr.m.NextPart()
	if err != nil {
		return region{}, nil, err
	}
	reg, _, rangeErr := parseRange(part.Header.Get("Content-Range"))
	if rangeErr != nil {
		return region{}, nil, fmt.Errorf("failed to parse Content-Range: %w", rangeErr)
	}
	return reg, part, nil
}
// parseRange decodes a "bytes <begin>-<end>/<size>" Content-Range header
// into the region it covers and the total blob size.
func parseRange(header string) (region, int64, error) {
	m := contentRangeRegexp.FindStringSubmatch(header)
	if len(m) < 4 {
		return region{}, 0, fmt.Errorf("Content-Range %q doesn't have enough information", header)
	}
	begin, err := strconv.ParseInt(m[1], 10, 64)
	if err != nil {
		return region{}, 0, fmt.Errorf("failed to parse beginning offset %q: %w", m[1], err)
	}
	end, err := strconv.ParseInt(m[2], 10, 64)
	if err != nil {
		return region{}, 0, fmt.Errorf("failed to parse end offset %q: %w", m[2], err)
	}
	blobSize, err := strconv.ParseInt(m[3], 10, 64)
	if err != nil {
		// NOTE(review): an unknown total size ("*") also lands here —
		// confirm callers never need to accept it.
		return region{}, 0, fmt.Errorf("failed to parse blob size %q: %w", m[3], err)
	}
	return region{begin, end}, blobSize, nil
}
// Option configures per-call behavior of Blob operations.
type Option func(*options)

// options aggregates the values set by Option functions.
type options struct {
	ctx       context.Context // overrides the fetch context when non-nil
	cacheOpts []cache.Option  // forwarded to the underlying blob cache
}

// WithContext sets the context used for remote fetches.
func WithContext(ctx context.Context) Option {
	return func(opts *options) {
		opts.ctx = ctx
	}
}

// WithCacheOpts sets options forwarded to cache Get/Add calls.
func WithCacheOpts(cacheOpts ...cache.Option) Option {
	return func(opts *options) {
		opts.cacheOpts = cacheOpts
	}
}
// remoteFetcher adapts a handler-provided Fetcher to the internal
// fetcher interface.
type remoteFetcher struct {
	r Fetcher
}
// fetch downloads the smallest single range covering every requested
// region and returns it as a one-part reader: handlers expose a plain
// offset/size API, so multi-range requests are squashed.
func (r *remoteFetcher) fetch(ctx context.Context, rs []region, retry bool) (multipartReadCloser, error) {
	var set regionSet
	for _, reg := range rs {
		set.add(reg)
	}
	whole := superRegion(set.rs)
	rc, err := r.r.Fetch(ctx, whole.b, whole.size())
	if err != nil {
		return nil, err
	}
	return newSinglePartReader(whole, rc), nil
}
// check delegates the availability check to the underlying Fetcher.
func (r *remoteFetcher) check() error {
	return r.r.Check()
}

// genID delegates cache-key generation to the underlying Fetcher.
func (r *remoteFetcher) genID(reg region) string {
	return r.r.GenID(reg.b, reg.size())
}
// Handler hooks in an alternative source of layer contents, tried by the
// Resolver before falling back to the default registry fetcher.
type Handler interface {
	// Handle returns a Fetcher (and the blob size) for desc, or an error
	// when this handler cannot serve it.
	Handle(ctx context.Context, desc ocispec.Descriptor) (fetcher Fetcher, size int64, err error)
}

// Fetcher fetches (parts of) the contents of a specific blob.
type Fetcher interface {
	// Fetch returns a reader of size bytes of the blob starting at off.
	Fetch(ctx context.Context, off int64, size int64) (io.ReadCloser, error)
	// Check verifies the blob is still accessible.
	Check() error
	// GenID returns a stable cache key for the given range of the blob.
	GenID(off int64, size int64) string
}

View File

@@ -0,0 +1,109 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2019 The Go Authors. All rights reserved.
Use of this source code is governed by a BSD-style
license that can be found in the NOTICE.md file.
*/
package remote
// region is an HTTP-range-request-compliant byte range.
// "b" is the first byte of the range and "e" is the last; "e" is
// inclusive, matching HTTP's range expression.
type region struct{ b, e int64 }

// size returns the number of bytes covered by the inclusive range.
func (c region) size() int64 {
	return c.e + 1 - c.b
}

// superRegion returns the smallest region containing every region in
// regs; regs must be non-empty.
func superRegion(regs []region) region {
	super := regs[0]
	for _, reg := range regs[1:] {
		if reg.b < super.b {
			super.b = reg.b
		}
		if reg.e > super.e {
			super.e = reg.e
		}
	}
	return super
}
// regionSet is a set of non-overlapping regions.
type regionSet struct {
	rs []region // must be kept sorted
}

// add attempts to merge r to rs.rs with squashing the regions as
// small as possible. This operation takes O(n).
// TODO: more efficient way to do it.
func (rs *regionSet) add(r region) {
	// Iterate over the sorted region slice from the tail.
	// a) When an overlap occurs, adjust `r` to fully contain the looking region
	//    `l` and remove `l` from the region slice.
	// b) Once l.e becomes less than r.b, no overlap will occur again. So immediately
	//    insert `r`, which fully contains all overlapped regions, into the region slice.
	//    Here, `r` is inserted into the region slice with keeping it sorted, without
	//    overlapping any regions.
	// *) If any `l` contains `r`, we don't need to do anything so return immediately.
	for i := len(rs.rs) - 1; i >= 0; i-- {
		l := &rs.rs[i]
		// *) l contains r
		if l.b <= r.b && r.e <= l.e {
			return
		}
		// a) r overlaps (or is adjacent to) l on r's lower side, so grow r
		//    downward to cover l and remove l from the region slice.
		if l.b <= r.b && r.b <= l.e+1 && l.e <= r.e {
			r.b = l.b
			rs.rs = append(rs.rs[:i], rs.rs[i+1:]...)
			continue
		}
		// r overlaps (or is adjacent to) l on r's upper side: grow r upward
		// to cover l and remove l.
		if r.b <= l.b && l.b <= r.e+1 && r.e <= l.e {
			r.e = l.e
			rs.rs = append(rs.rs[:i], rs.rs[i+1:]...)
			continue
		}
		// r fully contains l: just remove l.
		if r.b <= l.b && l.e <= r.e {
			rs.rs = append(rs.rs[:i], rs.rs[i+1:]...)
			continue
		}
		// b) No overlap will occur after this iteration. Insert r into the
		//    region slice immediately.
		if l.e < r.b {
			rs.rs = append(rs.rs[:i+1], append([]region{r}, rs.rs[i+1:]...)...)
			return
		}
		// No overlap occurs yet. See the next region.
	}
	// r is the topmost region among regions in the slice.
	rs.rs = append([]region{r}, rs.rs...)
}
func (rs *regionSet) totalSize() int64 {
var sz int64
for _, f := range rs.rs {
sz += f.size()
}
return sz
}

View File

@@ -0,0 +1,271 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package source
import (
"context"
"fmt"
"strings"
"github.com/containerd/containerd/v2/core/images"
"github.com/containerd/containerd/v2/core/remotes/docker"
"github.com/containerd/containerd/v2/pkg/labels"
"github.com/containerd/containerd/v2/pkg/reference"
"github.com/containerd/stargz-snapshotter/fs/config"
digest "github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// GetSources is a function for converting snapshot labels into typed blob source
// information. This package defines a default converter which provides source
// information based on some labels, but implementations aren't required to use labels.
// Implementations are allowed to return several sources (registry config + image refs)
// for the blob.
type GetSources func(labels map[string]string) (source []Source, err error)

// RegistryHosts returns a list of registries that provide the specified image.
type RegistryHosts func(reference.Spec) ([]docker.RegistryHost, error)

// Source is typed blob source information. This contains information about
// a blob stored in registries and some contexts of the blob.
type Source struct {
	// Hosts is a registry configuration where this blob is stored.
	Hosts RegistryHosts
	// Name is an image reference which contains this blob.
	Name reference.Spec
	// Target is a descriptor of this blob.
	Target ocispec.Descriptor
	// Manifest is an image manifest which contains the blob. This will
	// be used by the filesystem to pre-resolve some layers contained in
	// the manifest.
	// Currently, only layer digests (Manifest.Layers.Digest) will be used.
	Manifest ocispec.Manifest
}
const (
	// targetRefLabel is a label which contains the image reference.
	targetRefLabel = "containerd.io/snapshot/remote/stargz.reference"
	// targetDigestLabel is a label which contains the layer digest.
	targetDigestLabel = "containerd.io/snapshot/remote/stargz.digest"
	// targetImageLayersLabel is a label which contains layer digests contained in
	// the target image.
	targetImageLayersLabel = "containerd.io/snapshot/remote/stargz.layers"
	// targetImageURLsLabelPrefix is a label prefix which constructs a map from the layer index to
	// urls of the layer descriptor.
	targetImageURLsLabelPrefix = "containerd.io/snapshot/remote/urls."
	// targetURLsLabel is a label which contains the layer URL. This is only used to pass URL from containerd
	// to snapshotter.
	targetURLsLabel = "containerd.io/snapshot/remote/urls"
)
// FromDefaultLabels returns a function for converting snapshot labels to
// source information based on labels.
func FromDefaultLabels(hosts RegistryHosts) GetSources {
	return func(snapshotLabels map[string]string) ([]Source, error) {
		refStr, ok := snapshotLabels[targetRefLabel]
		if !ok {
			return nil, fmt.Errorf("reference hasn't been passed")
		}
		refspec, err := reference.Parse(refStr)
		if err != nil {
			return nil, err
		}

		digestStr, ok := snapshotLabels[targetDigestLabel]
		if !ok {
			return nil, fmt.Errorf("digest hasn't been passed")
		}
		target, err := digest.Parse(digestStr)
		if err != nil {
			return nil, err
		}

		// Collect descriptors of the other layers of the same image (and
		// their recorded URLs, keyed by layer index) for pre-resolution.
		var neighboringLayers []ocispec.Descriptor
		if layersLabel, ok := snapshotLabels[targetImageLayersLabel]; ok {
			for i, layerDigest := range strings.Split(layersLabel, ",") {
				d, err := digest.Parse(layerDigest)
				if err != nil {
					return nil, err
				}
				if d.String() == target.String() {
					// Skip the target layer itself.
					continue
				}
				desc := ocispec.Descriptor{Digest: d}
				if urls, ok := snapshotLabels[targetImageURLsLabelPrefix+fmt.Sprintf("%d", i)]; ok {
					desc.URLs = strings.Split(urls, ",")
				}
				neighboringLayers = append(neighboringLayers, desc)
			}
		}

		targetDesc := ocispec.Descriptor{
			Digest:      target,
			Annotations: snapshotLabels,
		}
		if targetURLs, ok := snapshotLabels[targetURLsLabel]; ok {
			targetDesc.URLs = append(targetDesc.URLs, strings.Split(targetURLs, ",")...)
		}

		return []Source{
			{
				Hosts:    hosts,
				Name:     refspec,
				Target:   targetDesc,
				Manifest: ocispec.Manifest{Layers: append([]ocispec.Descriptor{targetDesc}, neighboringLayers...)},
			},
		}, nil
	}
}
// AppendDefaultLabelsHandlerWrapper makes a handler which appends the image's basic
// information to each layer descriptor as annotations during unpack. These
// annotations will be passed to this remote snapshotter as labels and used to
// construct source information (see FromDefaultLabels).
func AppendDefaultLabelsHandlerWrapper(ref string, prefetchSize int64) func(f images.Handler) images.Handler {
	return func(f images.Handler) images.Handler {
		return images.HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
			children, err := f.Handle(ctx, desc)
			if err != nil {
				return nil, err
			}
			switch desc.MediaType {
			case ocispec.MediaTypeImageManifest, images.MediaTypeDockerSchema2Manifest:
				for i := range children {
					c := &children[i]
					if images.IsLayerType(c.MediaType) {
						if c.Annotations == nil {
							c.Annotations = make(map[string]string)
						}
						c.Annotations[targetRefLabel] = ref
						c.Annotations[targetDigestLabel] = c.Digest.String()
						var layers string
						// NOTE: the inner index shadows the outer one; indices
						// recorded in the urls labels are therefore relative to
						// the current layer, matching how FromDefaultLabels
						// indexes the split layers label.
						for i, l := range children[i:] {
							if images.IsLayerType(l.MediaType) {
								ls := fmt.Sprintf("%s,", l.Digest.String())
								// This avoids the label hitting the size limitation.
								// Skipping layers is allowed here and only affects performance.
								if err := labels.Validate(targetImageLayersLabel, layers+ls); err != nil {
									break
								}
								layers += ls
								// Store URLs of the neighbouring layer as well.
								urlsKey := targetImageURLsLabelPrefix + fmt.Sprintf("%d", i)
								c.Annotations[urlsKey] = appendWithValidation(urlsKey, l.URLs)
							}
						}
						c.Annotations[targetImageLayersLabel] = strings.TrimSuffix(layers, ",")
						c.Annotations[config.TargetPrefetchSizeLabel] = fmt.Sprintf("%d", prefetchSize)
						// Store the URL in an annotation to let containerd pass it to the snapshotter.
						c.Annotations[targetURLsLabel] = appendWithValidation(targetURLsLabel, c.URLs)
					}
				}
			}
			return children, nil
		})
	}
}
func appendWithValidation(key string, values []string) string {
var v string
for _, u := range values {
s := fmt.Sprintf("%s,", u)
if err := labels.Validate(key, v+s); err != nil {
break
}
v += s
}
return strings.TrimSuffix(v, ",")
}
// TODO: switch to "github.com/containerd/containerd/pkg/snapshotters" once all tools using
//
// stargz-snapshotter (e.g. k3s) move to containerd version where that pkg is available.
const (
	// targetImageLayersLabelContainerd is the containerd-defined label which contains
	// layer digests contained in the target image and will be passed to snapshotters
	// for preparing layers in parallel. Skipping some layers is allowed and only
	// affects performance.
	targetImageLayersLabelContainerd = "containerd.io/snapshot/cri.image-layers"
)
// AppendExtraLabelsHandler adds optional labels that aren't provided by
// "github.com/containerd/containerd/pkg/snapshotters" but can be used for stargz snapshotter's extra functionalities.
func AppendExtraLabelsHandler(prefetchSize int64, wrapper func(images.Handler) images.Handler) func(images.Handler) images.Handler {
	return func(f images.Handler) images.Handler {
		return images.HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
			children, err := wrapper(f).Handle(ctx, desc)
			if err != nil {
				return nil, err
			}
			switch desc.MediaType {
			case ocispec.MediaTypeImageManifest, images.MediaTypeDockerSchema2Manifest:
				for i := range children {
					c := &children[i]
					if !images.IsLayerType(c.MediaType) {
						continue
					}
					// The wrapped handler normally initializes Annotations,
					// but guard against nil: assigning to a nil map panics
					// (AppendDefaultLabelsHandlerWrapper has the same guard).
					if c.Annotations == nil {
						c.Annotations = make(map[string]string)
					}
					if _, ok := c.Annotations[targetURLsLabel]; !ok { // nop if this key is already set
						c.Annotations[targetURLsLabel] = appendWithValidation(targetURLsLabel, c.URLs)
					}
					if _, ok := c.Annotations[config.TargetPrefetchSizeLabel]; !ok { // nop if this key is already set
						c.Annotations[config.TargetPrefetchSizeLabel] = fmt.Sprintf("%d", prefetchSize)
					}
					// Store URLs of the neighbouring layers as well, indexed to
					// match the layer list recorded by the wrapped handler.
					nlayers, ok := c.Annotations[targetImageLayersLabelContainerd]
					if !ok {
						continue
					}
					for j, dstr := range strings.Split(nlayers, ",") {
						d, err := digest.Parse(dstr)
						if err != nil {
							return nil, err
						}
						l, ok := layerFromDigest(children, d)
						if !ok {
							continue
						}
						urlsKey := targetImageURLsLabelPrefix + fmt.Sprintf("%d", j)
						if _, ok := c.Annotations[urlsKey]; !ok { // nop if this key is already set
							c.Annotations[urlsKey] = appendWithValidation(urlsKey, l.URLs)
						}
					}
				}
			}
			return children, nil
		})
	}
}
// layerFromDigest looks up the descriptor with the given digest among layers.
// The boolean result is true only when a matching descriptor exists and it
// has a layer media type.
func layerFromDigest(layers []ocispec.Descriptor, target digest.Digest) (ocispec.Descriptor, bool) {
	for _, desc := range layers {
		if desc.Digest != target {
			continue
		}
		return desc, images.IsLayerType(desc.MediaType)
	}
	return ocispec.Descriptor{}, false
}

View File

@@ -0,0 +1,283 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package memory
import (
"fmt"
"io"
"math"
"os"
"time"
"github.com/containerd/stargz-snapshotter/estargz"
"github.com/containerd/stargz-snapshotter/metadata"
digest "github.com/opencontainers/go-digest"
)
// reader is an in-memory metadata.Reader implementation backed by an open
// estargz.Reader plus the id maps built by assignIDs.
type reader struct {
	r           *estargz.Reader
	rootID      uint32                       // id of the root directory entry
	idMap       map[uint32]*estargz.TOCEntry // id -> TOC entry
	idOfEntry   map[string]uint32            // entry name -> id
	estargzOpts []estargz.OpenOption         // options used to open r; reused by Clone
}

// newReader bundles the given parts into a reader.
func newReader(er *estargz.Reader, rootID uint32, idMap map[uint32]*estargz.TOCEntry, idOfEntry map[string]uint32, estargzOpts []estargz.OpenOption) *reader {
	return &reader{r: er, rootID: rootID, idMap: idMap, idOfEntry: idOfEntry, estargzOpts: estargzOpts}
}
// NewReader opens the eStargz blob in sr and builds an in-memory metadata
// reader for it, eagerly assigning a numeric id to every TOC entry.
func NewReader(sr *io.SectionReader, opts ...metadata.Option) (metadata.Reader, error) {
	var rOpts metadata.Options
	for _, o := range opts {
		if err := o(&rOpts); err != nil {
			return nil, fmt.Errorf("failed to apply option: %w", err)
		}
	}
	telemetry := &estargz.Telemetry{}
	if rOpts.Telemetry != nil {
		telemetry.GetFooterLatency = estargz.MeasureLatencyHook(rOpts.Telemetry.GetFooterLatency)
		telemetry.GetTocLatency = estargz.MeasureLatencyHook(rOpts.Telemetry.GetTocLatency)
		telemetry.DeserializeTocLatency = estargz.MeasureLatencyHook(rOpts.Telemetry.DeserializeTocLatency)
	}
	// Convert []metadata.Decompressor to []estargz.Decompressor element-wise;
	// the interface types differ, so a direct assignment is not possible.
	var decompressors []estargz.Decompressor
	for _, d := range rOpts.Decompressors {
		decompressors = append(decompressors, d)
	}
	erOpts := []estargz.OpenOption{
		estargz.WithTOCOffset(rOpts.TOCOffset),
		estargz.WithTelemetry(telemetry),
		estargz.WithDecompressors(decompressors...),
	}
	er, err := estargz.Open(sr, erOpts...)
	if err != nil {
		return nil, err
	}
	root, ok := er.Lookup("")
	if !ok {
		return nil, fmt.Errorf("failed to get root node")
	}
	rootID, idMap, idOfEntry, err := assignIDs(er, root)
	if err != nil {
		return nil, err
	}
	// erOpts is retained so Clone can reopen another section reader with the
	// same configuration.
	r := newReader(er, rootID, idMap, idOfEntry, erOpts)
	return r, nil
}
// assignIDs assigns an id to each TOC entry and returns the id of the root
// entry along with a mapping from id to entry and from entry name to id.
func assignIDs(er *estargz.Reader, e *estargz.TOCEntry) (rootID uint32, idMap map[uint32]*estargz.TOCEntry, idOfEntry map[string]uint32, err error) {
	idMap = make(map[uint32]*estargz.TOCEntry)
	idOfEntry = make(map[string]uint32)

	// nextID hands out sequential ids starting from 1 and fails instead of
	// wrapping around when the 32-bit id space is exhausted.
	curID := uint32(0)
	nextID := func() (uint32, error) {
		if curID == math.MaxUint32 {
			return 0, fmt.Errorf("sequence id too large")
		}
		curID++
		return curID, nil
	}

	var mapChildren func(e *estargz.TOCEntry) (uint32, error)
	mapChildren = func(e *estargz.TOCEntry) (uint32, error) {
		if e.Type == "hardlink" {
			return 0, fmt.Errorf("unexpected type \"hardlink\": this should be replaced to the destination entry")
		}

		// Entries sharing a name keep a single id; only new names allocate.
		id, ok := idOfEntry[e.Name]
		if !ok {
			id, err = nextID()
			if err != nil {
				return 0, err
			}
			idMap[id] = e
			idOfEntry[e.Name] = id
		}

		// Recurse into children; the first error stops iteration and is
		// surfaced through the captured named result err.
		e.ForeachChild(func(_ string, ent *estargz.TOCEntry) bool {
			_, err = mapChildren(ent)
			return err == nil
		})
		if err != nil {
			return 0, err
		}
		return id, nil
	}

	rootID, err = mapChildren(e)
	if err != nil {
		return 0, nil, nil, err
	}
	return rootID, idMap, idOfEntry, nil
}
// RootID returns the id of the root directory entry.
func (r *reader) RootID() uint32 {
	return r.rootID
}

// TOCDigest returns the digest of the underlying blob's TOC.
func (r *reader) TOCDigest() digest.Digest {
	return r.r.TOCDigest()
}

// GetOffset returns the offset of the entry's data within the blob.
func (r *reader) GetOffset(id uint32) (offset int64, err error) {
	e, ok := r.idMap[id]
	if !ok {
		return 0, fmt.Errorf("entry %d not found", id)
	}
	return e.Offset, nil
}
// GetAttr returns the file attributes of the entry with the given id.
func (r *reader) GetAttr(id uint32) (attr metadata.Attr, err error) {
	e, ok := r.idMap[id]
	if !ok {
		return metadata.Attr{}, fmt.Errorf("entry %d not found", id)
	}
	// TODO: zero copy
	attrFromTOCEntry(e, &attr)
	return attr, nil
}
// GetChild resolves the child entry named base under the parent entry pid,
// returning the child's id and attributes.
func (r *reader) GetChild(pid uint32, base string) (id uint32, attr metadata.Attr, err error) {
	parent, ok := r.idMap[pid]
	if !ok {
		return 0, metadata.Attr{}, fmt.Errorf("parent entry %d not found", pid)
	}
	child, ok := parent.LookupChild(base)
	if !ok {
		return 0, metadata.Attr{}, fmt.Errorf("child %q of entry %d not found", base, pid)
	}
	cid, ok := r.idOfEntry[child.Name]
	if !ok {
		return 0, metadata.Attr{}, fmt.Errorf("id of entry %q not found", base)
	}
	// TODO: zero copy
	attrFromTOCEntry(child, &attr)
	return cid, attr, nil
}
// ForeachChild calls f for each direct child of entry id, stopping when f
// returns false. An error is returned if id is unknown or a child's id
// cannot be resolved.
func (r *reader) ForeachChild(id uint32, f func(name string, id uint32, mode os.FileMode) bool) error {
	e, ok := r.idMap[id]
	if !ok {
		return fmt.Errorf("parent entry %d not found", id)
	}
	var err error
	e.ForeachChild(func(baseName string, ent *estargz.TOCEntry) bool {
		// Resolve the child's id; an unresolvable child aborts iteration.
		id, ok := r.idOfEntry[ent.Name]
		if !ok {
			err = fmt.Errorf("id of child entry %q not found", baseName)
			return false
		}
		return f(baseName, id, ent.Stat().Mode())
	})
	return err
}
// OpenFile returns a metadata.File for reading the contents of entry id.
func (r *reader) OpenFile(id uint32) (metadata.File, error) {
	entry, found := r.idMap[id]
	if !found {
		return nil, fmt.Errorf("entry %d not found", id)
	}
	sectionReader, err := r.r.OpenFile(entry.Name)
	if err != nil {
		return nil, err
	}
	return &file{r: r, e: entry, sr: sectionReader}, nil
}
// OpenFileWithPreReader is like OpenFile but additionally registers preRead,
// which is invoked with chunks of other entries encountered while reading
// this file — presumably so callers can cache them (confirm against the fs
// implementation).
func (r *reader) OpenFileWithPreReader(id uint32, preRead func(id uint32, chunkOffset, chunkSize int64, chunkDigest string, r io.Reader) error) (metadata.File, error) {
	e, ok := r.idMap[id]
	if !ok {
		return nil, fmt.Errorf("entry %d not found", id)
	}
	sr, err := r.r.OpenFileWithPreReader(e.Name, func(e *estargz.TOCEntry, chunkR io.Reader) error {
		// Translate the TOC entry back to its numeric id before forwarding.
		cid, ok := r.idOfEntry[e.Name]
		if !ok {
			return fmt.Errorf("id of entry %q not found", e.Name)
		}
		return preRead(cid, e.ChunkOffset, e.ChunkSize, e.ChunkDigest, chunkR)
	})
	if err != nil {
		return nil, err
	}
	return &file{r, e, sr}, nil
}
// Clone returns a new Reader over sr opened with the same options as this
// reader. The id maps are shared (not copied) with the receiver; they are
// built once in NewReader and only read afterwards in this package.
func (r *reader) Clone(sr *io.SectionReader) (metadata.Reader, error) {
	er, err := estargz.Open(sr, r.estargzOpts...)
	if err != nil {
		return nil, err
	}
	return newReader(er, r.rootID, r.idMap, r.idOfEntry, r.estargzOpts), nil
}

// Close releases resources held by the reader. The in-memory implementation
// holds nothing that needs explicit release, so this is a no-op.
func (r *reader) Close() error {
	return nil
}
// file implements metadata.File for a single TOC entry, reading contents
// through the section reader obtained from estargz.Reader.OpenFile.
type file struct {
	r  *reader
	e  *estargz.TOCEntry
	sr *io.SectionReader
}
// ChunkEntryForOffset returns the chunk (offset, size, digest) that contains
// the given file offset, or ok=false when no chunk covers it.
func (r *file) ChunkEntryForOffset(offset int64) (off int64, size int64, dgst string, ok bool) {
	e, ok := r.r.r.ChunkEntryForOffset(r.e.Name, offset)
	if !ok {
		return 0, 0, "", false
	}
	// Prefer the per-chunk digest when present; fall back to the entry digest.
	dgst = e.Digest
	if e.ChunkDigest != "" {
		// NOTE* "reg" also can contain ChunkDigest (e.g. when "reg" is the first entry of
		// chunked file)
		dgst = e.ChunkDigest
	}
	return e.ChunkOffset, e.ChunkSize, dgst, true
}
// ReadAt reads the file's contents at offset off, following io.ReaderAt
// semantics; it delegates to the underlying section reader.
func (r *file) ReadAt(p []byte, off int64) (n int, err error) {
	return r.sr.ReadAt(p, off)
}
// NumOfNodes returns the number of nodes (TOC entries) tracked by this reader.
func (r *reader) NumOfNodes() (int, error) {
	return len(r.idMap), nil
}
// TODO: share it with db pkg

// attrFromTOCEntry copies the attributes of src into dst and returns dst.
func attrFromTOCEntry(src *estargz.TOCEntry, dst *metadata.Attr) *metadata.Attr {
	dst.Size = src.Size
	// A malformed timestamp yields the zero time.Time; the parse error is
	// deliberately discarded.
	dst.ModTime, _ = time.Parse(time.RFC3339, src.ModTime3339)
	dst.LinkName = src.LinkName
	dst.Mode = src.Stat().Mode()
	dst.UID = src.UID
	dst.GID = src.GID
	dst.DevMajor = src.DevMajor
	dst.DevMinor = src.DevMinor
	dst.Xattrs = src.Xattrs
	dst.NumLink = src.NumLink
	return dst
}

View File

@@ -0,0 +1,139 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metadata
import (
"io"
"os"
"time"
"github.com/containerd/stargz-snapshotter/estargz"
digest "github.com/opencontainers/go-digest"
)
// Attr represents the attributes of a node.
type Attr struct {
	// Size, for regular files, is the logical size of the file.
	Size int64
	// ModTime is the modification time of the node.
	ModTime time.Time
	// LinkName, for symlinks, is the link target.
	LinkName string
	// Mode is the permission and mode bits.
	Mode os.FileMode
	// UID is the user ID of the owner.
	UID int
	// GID is the group ID of the owner.
	GID int
	// DevMajor is the major device number for device nodes.
	DevMajor int
	// DevMinor is the minor device number for device nodes.
	DevMinor int
	// Xattrs are the extended attributes for the node.
	Xattrs map[string][]byte
	// NumLink is the number of names pointing to this node.
	NumLink int
}
// Store reads the provided eStargz blob and creates a metadata reader.
type Store func(sr *io.SectionReader, opts ...Option) (Reader, error)

// Reader provides access to file metadata of a blob.
type Reader interface {
	// RootID returns the id of the root directory entry.
	RootID() uint32
	// TOCDigest returns the digest of the blob's TOC.
	TOCDigest() digest.Digest
	// GetOffset returns the offset of the entry's data within the blob.
	GetOffset(id uint32) (offset int64, err error)
	// GetAttr returns the attributes of the entry.
	GetAttr(id uint32) (attr Attr, err error)
	// GetChild resolves the child named base under the parent entry pid.
	GetChild(pid uint32, base string) (id uint32, attr Attr, err error)
	// ForeachChild iterates over the direct children of id, stopping when f
	// returns false.
	ForeachChild(id uint32, f func(name string, id uint32, mode os.FileMode) bool) error
	// OpenFile opens the entry's contents for reading.
	OpenFile(id uint32) (File, error)
	// OpenFileWithPreReader is like OpenFile but also reports, via preRead,
	// chunks of other entries encountered while reading.
	OpenFileWithPreReader(id uint32, preRead func(id uint32, chunkOffset, chunkSize int64, chunkDigest string, r io.Reader) error) (File, error)
	// Clone returns a new Reader backed by sr with the same configuration.
	Clone(sr *io.SectionReader) (Reader, error)
	// Close releases resources held by the reader.
	Close() error
}

// File provides access to a file entry's chunk layout and contents.
type File interface {
	// ChunkEntryForOffset returns the chunk covering the given file offset.
	ChunkEntryForOffset(offset int64) (off int64, size int64, dgst string, ok bool)
	// ReadAt reads the file's contents at offset off, like io.ReaderAt.
	ReadAt(p []byte, off int64) (n int, err error)
}
// Decompressor extends estargz.Decompressor with the ability to produce the
// TOC JSON stream itself.
type Decompressor interface {
	estargz.Decompressor

	// DecompressTOC decompresses the passed blob and returns a reader of TOC JSON.
	//
	// If tocOffset returned by ParseFooter is < 0, we assume that TOC isn't contained in the blob.
	// Pass nil reader to DecompressTOC then we expect that DecompressTOC acquire TOC from the external
	// location and return it.
	DecompressTOC(io.Reader) (tocJSON io.ReadCloser, err error)
}

// Options holds the configuration accumulated from Option values.
type Options struct {
	// TOCOffset is the offset of the TOC in the blob (see WithTOCOffset).
	TOCOffset int64
	// Telemetry holds optional latency-measurement hooks.
	Telemetry *Telemetry
	// Decompressors lists decompressors to use; gzip is the default.
	Decompressors []Decompressor
}
// Option is an option to configure the behaviour of the reader.
type Option func(o *Options) error

// WithTOCOffset returns an Option that specifies the offset of the TOC.
func WithTOCOffset(tocOffset int64) Option {
	return func(o *Options) error {
		o.TOCOffset = tocOffset
		return nil
	}
}

// WithTelemetry returns an Option that specifies the telemetry hooks.
func WithTelemetry(telemetry *Telemetry) Option {
	return func(o *Options) error {
		o.Telemetry = telemetry
		return nil
	}
}

// WithDecompressors returns an Option that specifies decompressors to use.
// Default is gzip-based decompressor.
func WithDecompressors(decompressors ...Decompressor) Option {
	return func(o *Options) error {
		o.Decompressors = decompressors
		return nil
	}
}
// MeasureLatencyHook is a func which takes a start time and records the time
// elapsed since then.
type MeasureLatencyHook func(time.Time)

// Telemetry defines telemetry hooks. By implementing these hooks you should be able to record
// the latency metrics of the respective steps of the estargz open operation.
type Telemetry struct {
	GetFooterLatency      MeasureLatencyHook // measure time to get stargz footer (in milliseconds)
	GetTocLatency         MeasureLatencyHook // measure time to GET TOC JSON (in milliseconds)
	DeserializeTocLatency MeasureLatencyHook // measure time to deserialize TOC JSON (in milliseconds)
}

View File

@@ -0,0 +1,154 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package estargz
import (
"context"
"fmt"
"io"
"github.com/containerd/containerd/v2/core/content"
"github.com/containerd/containerd/v2/core/images"
"github.com/containerd/containerd/v2/core/images/converter"
"github.com/containerd/containerd/v2/core/images/converter/uncompress"
"github.com/containerd/containerd/v2/pkg/archive/compression"
"github.com/containerd/containerd/v2/pkg/labels"
"github.com/containerd/errdefs"
"github.com/containerd/stargz-snapshotter/estargz"
"github.com/containerd/stargz-snapshotter/util/ioutils"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// LayerConvertWithLayerAndCommonOptsFunc converts legacy tar.gz layers into eStargz tar.gz
// layers. Media type is unchanged. Should be used in conjunction with WithDockerToOCI(). See
// LayerConvertFunc for more details. The difference between this function and
// LayerConvertFunc is that this allows to specify additional eStargz options per layer.
func LayerConvertWithLayerAndCommonOptsFunc(opts map[digest.Digest][]estargz.Option, commonOpts ...estargz.Option) converter.ConvertFunc {
	if opts == nil {
		return LayerConvertFunc(commonOpts...)
	}
	return func(ctx context.Context, cs content.Store, desc ocispec.Descriptor) (*ocispec.Descriptor, error) {
		// TODO: enable to specify option per layer "index" because it's possible that there are
		// two layers having same digest in an image (but this should be rare case)
		return LayerConvertFunc(append(commonOpts, opts[desc.Digest]...)...)(ctx, cs, desc)
	}
}
// LayerConvertFunc converts legacy tar.gz layers into eStargz tar.gz layers.
// Media type is unchanged.
//
// Should be used in conjunction with WithDockerToOCI().
//
// Otherwise "containerd.io/snapshot/stargz/toc.digest" annotation will be lost,
// because the Docker media type does not support layer annotations.
func LayerConvertFunc(opts ...estargz.Option) converter.ConvertFunc {
	return func(ctx context.Context, cs content.Store, desc ocispec.Descriptor) (*ocispec.Descriptor, error) {
		if !images.IsLayerType(desc.MediaType) {
			// No conversion. No need to return an error here.
			return nil, nil
		}
		info, err := cs.Info(ctx, desc.Digest)
		if err != nil {
			return nil, err
		}
		// Preserve the existing content labels; the diffID label is updated below.
		labelz := info.Labels
		if labelz == nil {
			labelz = make(map[string]string)
		}
		ra, err := cs.ReaderAt(ctx, desc)
		if err != nil {
			return nil, err
		}
		defer ra.Close()
		sr := io.NewSectionReader(ra, 0, desc.Size)
		// Build the eStargz blob from the original layer content.
		blob, err := estargz.Build(sr, append(opts, estargz.WithContext(ctx))...)
		if err != nil {
			return nil, err
		}
		defer blob.Close()
		ref := fmt.Sprintf("convert-estargz-from-%s", desc.Digest)
		w, err := content.OpenWriter(ctx, cs, content.WithRef(ref))
		if err != nil {
			return nil, err
		}
		defer w.Close()
		// Reset the writing position
		// Old writer possibly remains without aborted
		// (e.g. conversion interrupted by a signal)
		if err := w.Truncate(0); err != nil {
			return nil, err
		}
		// Copy and count the contents: the blob is streamed into the content
		// writer and, via TeeReader+pipe, into a goroutine that decompresses
		// the stream and counts its uncompressed size.
		pr, pw := io.Pipe()
		c := new(ioutils.CountWriter)
		doneCount := make(chan struct{})
		go func() {
			defer close(doneCount)
			defer pr.Close()
			decompressR, err := compression.DecompressStream(pr)
			if err != nil {
				pr.CloseWithError(err)
				return
			}
			defer decompressR.Close()
			if _, err := io.Copy(c, decompressR); err != nil {
				pr.CloseWithError(err)
				return
			}
		}()
		n, err := io.Copy(w, io.TeeReader(blob, pw))
		if err != nil {
			return nil, err
		}
		if err := blob.Close(); err != nil {
			return nil, err
		}
		// Closing the pipe writer lets the counting goroutine finish; wait
		// for it before reading c.Size().
		if err := pw.Close(); err != nil {
			return nil, err
		}
		<-doneCount
		// update diffID label
		labelz[labels.LabelUncompressed] = blob.DiffID().String()
		// AlreadyExists is tolerated: identical content may have been
		// committed by a previous (or concurrent) conversion.
		if err = w.Commit(ctx, n, "", content.WithLabels(labelz)); err != nil && !errdefs.IsAlreadyExists(err) {
			return nil, err
		}
		if err := w.Close(); err != nil {
			return nil, err
		}
		// Keep the media type, only fixing up the compression suffix when the
		// source was uncompressed (the output is always gzip-compressed).
		newDesc := desc
		if uncompress.IsUncompressedType(newDesc.MediaType) {
			if images.IsDockerType(newDesc.MediaType) {
				newDesc.MediaType += ".gzip"
			} else {
				newDesc.MediaType += "+gzip"
			}
		}
		newDesc.Digest = w.Digest()
		newDesc.Size = n
		if newDesc.Annotations == nil {
			newDesc.Annotations = make(map[string]string, 1)
		}
		newDesc.Annotations[estargz.TOCJSONDigestAnnotation] = blob.TOCDigest().String()
		newDesc.Annotations[estargz.StoreUncompressedSizeAnnotation] = fmt.Sprintf("%d", c.Size())
		return &newDesc, nil
	}
}

View File

@@ -0,0 +1,398 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package externaltoc
import (
"context"
"encoding/json"
"fmt"
"io"
"sort"
"time"
"github.com/containerd/containerd/v2/core/content"
"github.com/containerd/containerd/v2/core/images"
"github.com/containerd/containerd/v2/core/images/converter"
"github.com/containerd/containerd/v2/core/images/converter/uncompress"
"github.com/containerd/containerd/v2/pkg/archive/compression"
"github.com/containerd/containerd/v2/pkg/labels"
"github.com/containerd/containerd/v2/pkg/reference"
"github.com/containerd/errdefs"
"github.com/containerd/stargz-snapshotter/estargz"
esgzexternaltoc "github.com/containerd/stargz-snapshotter/estargz/externaltoc"
estargzconvert "github.com/containerd/stargz-snapshotter/nativeconverter/estargz"
"github.com/containerd/stargz-snapshotter/util/ioutils"
"github.com/opencontainers/go-digest"
ocispecspec "github.com/opencontainers/image-spec/specs-go"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// LayerConvertFunc converts legacy tar.gz layers into eStargz tar.gz layers.
//
// The finalize() callback function returned by this function will return the image that
// contains the external TOC of each layer. Note that the returned image isn't stored to
// the containerd image store so far, so the caller needs to do it.
//
// Media type is unchanged.
//
// Should be used in conjunction with WithDockerToOCI().
//
// Otherwise "containerd.io/snapshot/stargz/toc.digest" annotation will be lost,
// because the Docker media type does not support layer annotations.
//
// WithCompression() in esgzOpts will be ignored; the compression for the external TOC is
// used instead.
func LayerConvertFunc(esgzOpts []estargz.Option, compressionLevel int) (convertFunc converter.ConvertFunc, finalize func(ctx context.Context, cs content.Store, ref string, desc *ocispec.Descriptor) (*images.Image, error)) {
	return layerConvert(func(c estargz.Compression) converter.ConvertFunc {
		return estargzconvert.LayerConvertFunc(append(esgzOpts, estargz.WithCompression(c))...)
	}, compressionLevel)
}
// LayerConvertWithLayerAndCommonOptsFunc converts legacy tar.gz layers into eStargz.
// Media type is unchanged. Should be used in conjunction with WithDockerToOCI(). See
// LayerConvertFunc for more details. The difference between this function and
// LayerConvertFunc is that this allows to specify additional eStargz options per layer.
func LayerConvertWithLayerAndCommonOptsFunc(opts map[digest.Digest][]estargz.Option, commonOpts []estargz.Option, compressionLevel int) (convertFunc converter.ConvertFunc, finalize func(ctx context.Context, cs content.Store, ref string, desc *ocispec.Descriptor) (*images.Image, error)) {
	return layerConvert(func(c estargz.Compression) converter.ConvertFunc {
		// The external-TOC compression is always appended so it is the one in effect.
		return estargzconvert.LayerConvertWithLayerAndCommonOptsFunc(opts, append(commonOpts,
			estargz.WithCompression(c),
		)...)
	}, compressionLevel)
}
// LayerConvertLossLessConfig is configuration for LayerConvertLossLessFunc.
type LayerConvertLossLessConfig struct {
	// CompressionLevel is the gzip compression level used for the external TOC.
	CompressionLevel int
	// ChunkSize is passed through to the lossless layer converter.
	ChunkSize int
	// MinChunkSize is passed through to the lossless layer converter.
	MinChunkSize int
}
// LayerConvertLossLessFunc converts legacy tar.gz layers into eStargz tar.gz layers without changing
// the diffIDs (i.e. uncompressed digest).
//
// The finalize() callback function returned by this function will return the image that
// contains the external TOC of each layer. Note that the returned image isn't stored to
// the containerd image store so far, so the caller needs to do it.
//
// Media type is unchanged.
//
// Should be used in conjunction with WithDockerToOCI().
//
// Otherwise "containerd.io/snapshot/stargz/toc.digest" annotation will be lost,
// because the Docker media type does not support layer annotations.
//
// WithCompression() in esgzOpts will be ignored; the compression for the external TOC is
// used instead.
func LayerConvertLossLessFunc(cfg LayerConvertLossLessConfig) (convertFunc converter.ConvertFunc, finalize func(ctx context.Context, cs content.Store, ref string, desc *ocispec.Descriptor) (*images.Image, error)) {
	return layerConvert(func(c estargz.Compression) converter.ConvertFunc {
		return layerLossLessConvertFunc(c, cfg.ChunkSize, cfg.MinChunkSize)
	}, cfg.CompressionLevel)
}
// layerConvert wires a per-layer conversion function to an external-TOC
// collector. The returned convertFunc converts each layer while recording the
// digest/size of that layer's external TOC; the returned finalize builds a
// manifest referencing every recorded TOC and returns it as an image named
// after ref (see getTOCReference).
func layerConvert(layerConvertFunc func(estargz.Compression) converter.ConvertFunc, compressionLevel int) (convertFunc converter.ConvertFunc, finalize func(ctx context.Context, cs content.Store, ref string, desc *ocispec.Descriptor) (*images.Image, error)) {
	type tocInfo struct {
		digest digest.Digest
		size   int64
	}
	// Maps converted (or passed-through) layer digest -> its external TOC.
	// NOTE(review): mutable state shared between convertFunc invocations;
	// looks like it assumes layers aren't converted concurrently — confirm
	// with callers.
	esgzDigest2TOC := make(map[digest.Digest]tocInfo)
	// TODO: currently, all layers of all platforms are combined to one TOC manifest. Maybe we can consider
	// having a separated TOC manifest per platform.
	converterFunc := func(ctx context.Context, cs content.Store, desc ocispec.Descriptor) (*ocispec.Descriptor, error) {
		// A fresh compressor per layer; the TOC it accumulates is written to
		// the content store below.
		cm := esgzexternaltoc.NewGzipCompressionWithLevel(nil, compressionLevel)
		c := cm.(*esgzexternaltoc.GzipCompression)
		cf := layerConvertFunc(c)
		desc2, err := cf(ctx, cs, desc)
		if err != nil {
			return desc2, err
		}
		var layerDgst digest.Digest
		if desc2 != nil {
			layerDgst = desc2.Digest
		} else {
			layerDgst = desc.Digest // no conversion happened
		}
		dgst, size, err := writeTOCTo(ctx, c, cs)
		if err != nil {
			return nil, err
		}
		esgzDigest2TOC[layerDgst] = tocInfo{dgst, size}
		return desc2, nil
	}
	finalizeFunc := func(ctx context.Context, cs content.Store, ref string, desc *ocispec.Descriptor) (*images.Image, error) {
		// Each TOC becomes a layer of the TOC image, annotated with the
		// digest of the eStargz layer it describes.
		var layers []ocispec.Descriptor
		for esgzDigest, toc := range esgzDigest2TOC {
			layers = append(layers, ocispec.Descriptor{
				MediaType: ocispec.MediaTypeImageLayerGzip,
				Digest:    toc.digest,
				Size:      toc.size,
				Annotations: map[string]string{
					"containerd.io/snapshot/stargz/layer.digest": esgzDigest.String(),
				},
			})
		}
		// Map iteration order is random; sort for a deterministic manifest.
		sort.Slice(layers, func(i, j int) bool {
			return layers[i].Digest.String() < layers[j].Digest.String()
		})
		mfst, err := createManifest(ctx, cs, ocispec.ImageConfig{}, layers)
		if err != nil {
			return nil, err
		}
		tocImgRef, err := getTOCReference(ref)
		if err != nil {
			return nil, err
		}
		return &images.Image{
			Name:   tocImgRef,
			Target: *mfst,
		}, nil
	}
	return converterFunc, finalizeFunc
}
// getTOCReference derives the image reference under which the external TOC
// manifest for ref is stored: the same repository with an "-esgztoc" suffix
// appended to the tag/digest object part.
func getTOCReference(ref string) (string, error) {
	spec, err := reference.Parse(ref)
	if err != nil {
		return "", err
	}
	// TODO: support custom location
	spec.Object += "-esgztoc"
	return spec.String(), nil
}
// layerLossLessConvertFunc returns a ConvertFunc that converts a layer blob
// into an eStargz blob in a lossless way, i.e. so that the original tar
// stream can be reproduced from the result. The conversion is verified by
// decompressing both the input and the output in parallel and comparing
// their uncompressed digests (diffIDs) and sizes. compressor may be nil, in
// which case the estargz writer's default compressor is used. Non-layer
// blobs are passed through unconverted (returns nil, nil).
func layerLossLessConvertFunc(compressor estargz.Compressor, chunkSize int, minChunkSize int) converter.ConvertFunc {
	return func(ctx context.Context, cs content.Store, desc ocispec.Descriptor) (*ocispec.Descriptor, error) {
		if !images.IsLayerType(desc.MediaType) {
			// No conversion. No need to return an error here.
			return nil, nil
		}
		info, err := cs.Info(ctx, desc.Digest)
		if err != nil {
			return nil, err
		}
		// Carry over the source blob's labels; the uncompressed-digest label
		// is added below before commit.
		labelz := info.Labels
		if labelz == nil {
			labelz = make(map[string]string)
		}
		ra, err := cs.ReaderAt(ctx, desc)
		if err != nil {
			return nil, err
		}
		defer ra.Close()
		sr := io.NewSectionReader(ra, 0, desc.Size)
		ref := fmt.Sprintf("convert-estargz-from-%s", desc.Digest)
		w, err := content.OpenWriter(ctx, cs, content.WithRef(ref))
		if err != nil {
			return nil, err
		}
		defer w.Close()

		// Reset the writing position
		// Old writer possibly remains without aborted
		// (e.g. conversion interrupted by a signal)
		if err := w.Truncate(0); err != nil {
			return nil, err
		}

		// Copy and count the contents.
		// esgzUW observes the converted (eStargz) stream and orgUW observes
		// the original stream; each computes the uncompressed digest and size
		// in a background goroutine (see calcUncompression).
		esgzUW, esgzUncompressedInfoCh := calcUncompression()
		orgUW, orgUncompressedInfoCh := calcUncompression()
		countW := new(ioutils.CountWriter)
		mw := io.MultiWriter(io.MultiWriter(w, countW), esgzUW)
		var ew *estargz.Writer
		if compressor != nil {
			ew = estargz.NewWriterWithCompressor(mw, compressor)
		} else {
			ew = estargz.NewWriter(mw)
		}
		if chunkSize > 0 {
			ew.ChunkSize = chunkSize
		}
		ew.MinChunkSize = minChunkSize
		if err := ew.AppendTarLossLess(io.TeeReader(sr, orgUW)); err != nil {
			return nil, fmt.Errorf("cannot perform compression in lossless way: %w", err)
		}
		tocDgst, err := ew.Close()
		if err != nil {
			return nil, err
		}
		n := countW.Size()
		// Close both pipe writers so the digest goroutines observe EOF and
		// send their results.
		if err := esgzUW.Close(); err != nil {
			return nil, err
		}
		if err := orgUW.Close(); err != nil {
			return nil, err
		}
		esgzUncompressedInfo := <-esgzUncompressedInfoCh
		orgUncompressedInfo := <-orgUncompressedInfoCh

		// check the lossless conversion
		if esgzUncompressedInfo.diffID.String() != orgUncompressedInfo.diffID.String() {
			return nil, fmt.Errorf("unexpected diffID %q; want %q",
				esgzUncompressedInfo.diffID.String(), orgUncompressedInfo.diffID.String())
		}
		if esgzUncompressedInfo.size != orgUncompressedInfo.size {
			return nil, fmt.Errorf("unexpected uncompressed size %q; want %q",
				esgzUncompressedInfo.size, orgUncompressedInfo.size)
		}

		// write diffID label
		labelz[labels.LabelUncompressed] = esgzUncompressedInfo.diffID.String()
		if err = w.Commit(ctx, n, "", content.WithLabels(labelz)); err != nil && !errdefs.IsAlreadyExists(err) {
			return nil, err
		}
		if err := w.Close(); err != nil {
			return nil, err
		}
		// The converted blob is gzip-compressed; adjust the media type when
		// the source was an uncompressed layer type.
		newDesc := desc
		if uncompress.IsUncompressedType(newDesc.MediaType) {
			if images.IsDockerType(newDesc.MediaType) {
				newDesc.MediaType += ".gzip"
			} else {
				newDesc.MediaType += "+gzip"
			}
		}
		newDesc.Digest = w.Digest()
		newDesc.Size = n
		if newDesc.Annotations == nil {
			newDesc.Annotations = make(map[string]string, 1)
		}
		newDesc.Annotations[estargz.TOCJSONDigestAnnotation] = tocDgst.String()
		newDesc.Annotations[estargz.StoreUncompressedSizeAnnotation] = fmt.Sprintf("%d", esgzUncompressedInfo.size)
		return &newDesc, nil
	}
}
// uncompressedInfo carries the digest (diffID) and byte size of an
// uncompressed layer stream, as computed by calcUncompression.
type uncompressedInfo struct {
	diffID digest.Digest // canonical digest of the uncompressed stream
	size   int64         // uncompressed size in bytes
}
// calcUncompression returns a pipe writer and an (unbuffered) result channel.
// Bytes written to the writer are decompressed by a background goroutine
// which, once the writer is closed, sends the uncompressed digest (diffID)
// and size on the channel. On a decompression or copy error the error is
// propagated back to the writer side via CloseWithError and the channel is
// closed without sending, so a receiver observes the zero uncompressedInfo.
func calcUncompression() (*io.PipeWriter, chan uncompressedInfo) {
	pr, pw := io.Pipe()
	infoCh := make(chan uncompressedInfo)
	go func() {
		defer pr.Close()
		c := new(ioutils.CountWriter)
		diffID := digest.Canonical.Digester()
		decompressR, err := compression.DecompressStream(pr)
		if err != nil {
			// Surface the failure to the producer writing into pw.
			pr.CloseWithError(err)
			close(infoCh)
			return
		}
		defer decompressR.Close()
		// Count bytes and hash the uncompressed stream in one pass.
		if _, err := io.Copy(io.MultiWriter(c, diffID.Hash()), decompressR); err != nil {
			pr.CloseWithError(err)
			close(infoCh)
			return
		}
		infoCh <- uncompressedInfo{
			diffID: diffID.Digest(),
			size:   c.Size(),
		}
	}()
	return pw, infoCh
}
// writeTOCTo flushes the TOC accumulated by gc into the content store and
// returns the digest and size of the written blob. An already-existing blob
// is not treated as an error.
func writeTOCTo(ctx context.Context, gc *esgzexternaltoc.GzipCompression, cs content.Store) (digest.Digest, int64, error) {
	// Timestamped ingest ref so concurrent conversions don't collide.
	ref := "external-toc" + time.Now().String()
	w, err := content.OpenWriter(ctx, cs, content.WithRef(ref))
	if err != nil {
		return "", 0, err
	}
	defer w.Close()
	// Discard any partial data left by an interrupted previous ingest.
	if err := w.Truncate(0); err != nil {
		return "", 0, err
	}
	counter := new(ioutils.CountWriter)
	digester := digest.Canonical.Digester()
	written, err := gc.WriteTOCTo(io.MultiWriter(w, digester.Hash(), counter))
	if err != nil {
		return "", 0, err
	}
	if err := w.Commit(ctx, int64(written), ""); err != nil && !errdefs.IsAlreadyExists(err) {
		return "", 0, err
	}
	if err := w.Close(); err != nil {
		return "", 0, err
	}
	return digester.Digest(), counter.Size(), nil
}
// createManifest stores config as an image config blob and wraps it together
// with layers into an OCI image manifest written to cs. GC labels on the
// manifest keep the config and every layer alive in containerd's content GC.
func createManifest(ctx context.Context, cs content.Store, config ocispec.ImageConfig, layers []ocispec.Descriptor) (*ocispec.Descriptor, error) {
	// Write the image config first; the manifest references it by digest.
	configDgst, configSize, err := writeJSON(ctx, cs, &config, nil)
	if err != nil {
		return nil, err
	}
	manifest := ocispec.Manifest{
		Versioned: ocispecspec.Versioned{SchemaVersion: 2},
		MediaType: ocispec.MediaTypeImageManifest,
		Config: ocispec.Descriptor{
			MediaType: ocispec.MediaTypeImageConfig,
			Digest:    configDgst,
			Size:      configSize,
		},
		Layers: layers,
	}
	// Label every referenced blob as a GC child of the manifest.
	gcLabels := make(map[string]string, len(manifest.Layers)+1)
	for i, layer := range manifest.Layers {
		gcLabels[fmt.Sprintf("containerd.io/gc.ref.content.l.%d", i)] = layer.Digest.String()
	}
	gcLabels["containerd.io/gc.ref.content.c.0"] = configDgst.String()
	manifestDgst, manifestSize, err := writeJSON(ctx, cs, &manifest, gcLabels)
	if err != nil {
		return nil, err
	}
	return &ocispec.Descriptor{
		MediaType: ocispec.MediaTypeImageManifest,
		Digest:    manifestDgst,
		Size:      manifestSize,
	}, nil
}
// writeJSON marshals data to JSON and stores it as a blob in cs, returning
// the digest and size of the stored content. blobLabels (may be nil) are
// attached to the blob on commit. An already-existing blob is not an error.
//
// The parameter is named blobLabels rather than labels to avoid shadowing
// the imported labels package used elsewhere in this file.
func writeJSON(ctx context.Context, cs content.Store, data interface{}, blobLabels map[string]string) (digest.Digest, int64, error) {
	raw, err := json.Marshal(data)
	if err != nil {
		return "", 0, err
	}
	size := len(raw)
	// Content-addressed ingest ref: retries of the same payload share one ingest.
	ref := "write-json-ref" + digest.FromBytes(raw).String()
	w, err := content.OpenWriter(ctx, cs, content.WithRef(ref))
	if err != nil {
		return "", 0, err
	}
	defer w.Close()
	// Discard any partial data left by an interrupted previous ingest.
	if err := w.Truncate(0); err != nil {
		return "", 0, err
	}
	if _, err := w.Write(raw); err != nil {
		return "", 0, err
	}
	if err = w.Commit(ctx, int64(size), "", content.WithLabels(blobLabels)); err != nil && !errdefs.IsAlreadyExists(err) {
		return "", 0, err
	}
	// Read the digest before closing the writer.
	dgst := w.Digest()
	if err := w.Close(); err != nil {
		return "", 0, err
	}
	return dgst, int64(size), nil
}

View File

@@ -0,0 +1,90 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package externaltoc
import (
"context"
"fmt"
"io"
"github.com/containerd/containerd/v2/core/remotes"
"github.com/containerd/containerd/v2/core/remotes/docker"
"github.com/containerd/containerd/v2/pkg/reference"
"github.com/containerd/platforms"
esgzexternaltoc "github.com/containerd/stargz-snapshotter/estargz/externaltoc"
"github.com/containerd/stargz-snapshotter/fs/source"
"github.com/containerd/stargz-snapshotter/util/containerdutil"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// NewRemoteDecompressor returns a gzip decompressor whose external TOC is
// fetched lazily from the registry serving refspec, using the provided
// registry hosts configuration. desc identifies the layer whose TOC is fetched.
func NewRemoteDecompressor(ctx context.Context, hosts source.RegistryHosts, refspec reference.Spec, desc ocispec.Descriptor) *esgzexternaltoc.GzipDecompressor {
	fetchTOC := func() ([]byte, error) {
		resolver := docker.NewResolver(docker.ResolverOptions{
			Hosts: func(host string) ([]docker.RegistryHost, error) {
				// Only the host of the target image may be contacted.
				if host != refspec.Hostname() {
					return nil, fmt.Errorf("unexpected host %q for image ref %q", host, refspec.String())
				}
				return hosts(refspec)
			},
		})
		return fetchTOCBlob(ctx, resolver, refspec, desc.Digest)
	}
	return esgzexternaltoc.NewGzipDecompressor(fetchTOC)
}
// fetchTOCBlob resolves the TOC manifest image associated with refspec (the
// same reference with an "-esgztoc" suffix, see getTOCReference) and fetches
// from it the external TOC blob for the layer with digest dgst.
func fetchTOCBlob(ctx context.Context, resolver remotes.Resolver, refspec reference.Spec, dgst digest.Digest) ([]byte, error) {
	// TODO: support custom location of TOC manifest and TOCs using annotations, etc.
	tocImgRef, err := getTOCReference(refspec.String())
	if err != nil {
		return nil, err
	}
	_, img, err := resolver.Resolve(ctx, tocImgRef)
	if err != nil {
		return nil, err
	}
	fetcher, err := resolver.Fetcher(ctx, tocImgRef)
	if err != nil {
		return nil, err
	}
	// TODO: cache this manifest
	manifest, err := containerdutil.FetchManifestPlatform(ctx, fetcher, img, platforms.DefaultSpec())
	if err != nil {
		return nil, err
	}
	return fetchTOCBlobFromManifest(ctx, fetcher, manifest, dgst)
}
// fetchTOCBlobFromManifest scans the TOC manifest for the layer annotated
// with the given eStargz layer digest and downloads that layer's content
// (the external TOC blob).
func fetchTOCBlobFromManifest(ctx context.Context, fetcher remotes.Fetcher, manifest ocispec.Manifest, layerDigest digest.Digest) ([]byte, error) {
	want := layerDigest.String()
	for _, layer := range manifest.Layers {
		if len(layer.Annotations) == 0 {
			continue
		}
		annotated, ok := layer.Annotations["containerd.io/snapshot/stargz/layer.digest"]
		if !ok || annotated != want {
			continue
		}
		r, err := fetcher.Fetch(ctx, layer)
		if err != nil {
			return nil, err
		}
		defer r.Close()
		return io.ReadAll(r)
	}
	return nil, fmt.Errorf("TOC not found")
}

View File

@@ -0,0 +1,70 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package service
import (
"github.com/containerd/stargz-snapshotter/fs/config"
"github.com/containerd/stargz-snapshotter/service/resolver"
)
// Config is configuration for stargz snapshotter service. All sections are
// decoded from the snapshotter's TOML configuration file.
type Config struct {
	// Config is the filesystem-level configuration, inlined at the top level
	// of the TOML document.
	config.Config

	// KubeconfigKeychainConfig is config for kubeconfig-based keychain.
	KubeconfigKeychainConfig `toml:"kubeconfig_keychain"`

	// CRIKeychainConfig is config for CRI-based keychain.
	CRIKeychainConfig `toml:"cri_keychain"`

	// ResolverConfig is config for resolving registries.
	ResolverConfig `toml:"resolver"`

	// SnapshotterConfig is snapshotter-related config.
	SnapshotterConfig `toml:"snapshotter"`
}

// KubeconfigKeychainConfig is config for kubeconfig-based keychain.
type KubeconfigKeychainConfig struct {
	// EnableKeychain enables kubeconfig-based keychain
	EnableKeychain bool `toml:"enable_keychain"`

	// KubeconfigPath is the path to kubeconfig which can be used to sync
	// secrets on the cluster into this snapshotter.
	KubeconfigPath string `toml:"kubeconfig_path"`
}

// CRIKeychainConfig is config for CRI-based keychain.
type CRIKeychainConfig struct {
	// EnableKeychain enables CRI-based keychain
	EnableKeychain bool `toml:"enable_keychain"`

	// ImageServicePath is the path to the unix socket of backing CRI Image Service (e.g. containerd CRI plugin)
	ImageServicePath string `toml:"image_service_path"`
}

// ResolverConfig is config for resolving registries. It is a local definition
// of the resolver package's Config type.
type ResolverConfig resolver.Config

// SnapshotterConfig is snapshotter-related config.
type SnapshotterConfig struct {
	// AllowInvalidMountsOnRestart allows that there are snapshot mounts that cannot access to the
	// data source when restarting the snapshotter.
	// NOTE: User needs to manually remove the snapshots from containerd's metadata store using
	// ctr (e.g. `ctr snapshot rm`).
	AllowInvalidMountsOnRestart bool `toml:"allow_invalid_mounts_on_restart"`
}

View File

@@ -0,0 +1,112 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package service
import (
"fmt"
"strings"
"github.com/containerd/containerd/v2/pkg/reference"
"github.com/containerd/stargz-snapshotter/fs/source"
digest "github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// TODO: switch to "github.com/containerd/containerd/pkg/snapshotters" once all tools using
//
// stargz-snapshotter (e.g. k3s) move to containerd version where that pkg is available.
const (
	// targetRefLabel is a label which contains image reference and will be passed
	// to snapshotters.
	targetRefLabel = "containerd.io/snapshot/cri.image-ref"
	// targetLayerDigestLabel is a label which contains layer digest and will be passed
	// to snapshotters.
	targetLayerDigestLabel = "containerd.io/snapshot/cri.layer-digest"
	// targetImageLayersLabel is a label which contains layer digests contained in
	// the target image and will be passed to snapshotters for preparing layers in
	// parallel. Skipping some layers is allowed and only affects performance.
	targetImageLayersLabel = "containerd.io/snapshot/cri.image-layers"
)

const (
	// targetImageURLsLabelPrefix is a label prefix which constructs a map from the layer index to
	// urls of the layer descriptor. This isn't contained in the set of the labels passed from CRI plugin but
	// some clients (e.g. nerdctl) passes this for preserving url field in the OCI descriptor.
	targetImageURLsLabelPrefix = "containerd.io/snapshot/remote/urls."

	// targetURLsLabel is a label which contains layer URL. This is only used to pass URL from containerd
	// to snapshotter. This isn't contained in the set of the labels passed from CRI plugin but
	// some clients (e.g. nerdctl) passes this for preserving url field in the OCI descriptor.
	targetURLsLabel = "containerd.io/snapshot/remote/urls"
)
// sourceFromCRILabels constructs a source.GetSources that reconstructs remote
// layer sources from the snapshot labels passed down by the CRI plugin: the
// image reference, the target layer digest, and (optionally) the digests and
// URLs of the image's other layers.
func sourceFromCRILabels(hosts source.RegistryHosts) source.GetSources {
	return func(labels map[string]string) ([]source.Source, error) {
		refStr, ok := labels[targetRefLabel]
		if !ok {
			return nil, fmt.Errorf("reference hasn't been passed")
		}
		refspec, err := reference.Parse(refStr)
		if err != nil {
			return nil, err
		}

		digestStr, ok := labels[targetLayerDigestLabel]
		if !ok {
			return nil, fmt.Errorf("digest hasn't been passed")
		}
		target, err := digest.Parse(digestStr)
		if err != nil {
			return nil, err
		}

		// Collect sibling layers (every listed layer except the target) so
		// the snapshotter can prepare them in parallel.
		var neighboringLayers []ocispec.Descriptor
		if listed, ok := labels[targetImageLayersLabel]; ok {
			for i, layerDigestStr := range strings.Split(listed, ",") {
				d, err := digest.Parse(layerDigestStr)
				if err != nil {
					return nil, err
				}
				if d.String() == target.String() {
					continue
				}
				desc := ocispec.Descriptor{Digest: d}
				// Restore per-layer URLs when a client preserved them.
				if urls, ok := labels[targetImageURLsLabelPrefix+fmt.Sprintf("%d", i)]; ok {
					desc.URLs = strings.Split(urls, ",")
				}
				neighboringLayers = append(neighboringLayers, desc)
			}
		}

		targetDesc := ocispec.Descriptor{
			Digest:      target,
			Annotations: labels,
		}
		if targetURLs, ok := labels[targetURLsLabel]; ok {
			targetDesc.URLs = append(targetDesc.URLs, strings.Split(targetURLs, ",")...)
		}

		return []source.Source{
			{
				Hosts:    hosts,
				Name:     refspec,
				Target:   targetDesc,
				Manifest: ocispec.Manifest{Layers: append([]ocispec.Descriptor{targetDesc}, neighboringLayers...)},
			},
		}, nil
	}
}

View File

@@ -0,0 +1,145 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cri
import (
"context"
"errors"
"fmt"
"sync"
"time"
"github.com/containerd/containerd/v2/pkg/reference"
"github.com/containerd/log"
"github.com/containerd/stargz-snapshotter/service/resolver"
distribution "github.com/distribution/reference"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)
// NewCRIKeychain provides creds passed through CRI PullImage API.
// This also returns a CRI image service server that works as a proxy backed by the specified CRI service.
// This server reads all PullImageRequest and uses PullImageRequest.AuthConfig for authenticating snapshots.
func NewCRIKeychain(ctx context.Context, connectCRI func() (runtime.ImageServiceClient, error)) (resolver.Credential, runtime.ImageServiceServer) {
	server := &instrumentedService{config: make(map[string]*runtime.AuthConfig)}
	// Connect to the backend CRI service in the background, retrying for a
	// bounded number of attempts; until then the proxy answers every request
	// with a "not initialized" error.
	go func() {
		log.G(ctx).Debugf("Waiting for CRI service is started...")
		const maxAttempts = 100
		for attempt := 0; attempt < maxAttempts; attempt++ {
			client, err := connectCRI()
			if err != nil {
				log.G(ctx).WithError(err).Warnf("failed to connect to CRI")
				time.Sleep(10 * time.Second)
				continue
			}
			server.criMu.Lock()
			server.cri = client
			server.criMu.Unlock()
			log.G(ctx).Info("connected to backend CRI service")
			return
		}
		log.G(ctx).Warnf("no connection is available to CRI")
	}()
	return server.credentials, server
}
// instrumentedService is an ImageServiceServer that proxies requests to a
// backend CRI image service while capturing the AuthConfig of every
// PullImage request (keyed by normalized image reference) so the snapshotter
// can reuse those credentials when pulling layer content.
type instrumentedService struct {
	cri      runtime.ImageServiceClient    // backend client; nil until the dialer goroutine connects
	criMu    sync.Mutex                    // guards cri
	config   map[string]*runtime.AuthConfig // image ref -> auth captured from PullImage
	configMu sync.Mutex                    // guards config
}
// credentials implements resolver.Credential using the auth config captured
// from the PullImage request for the given image reference. It returns empty
// strings (no error) when no auth was recorded for the reference.
func (in *instrumentedService) credentials(host string, refspec reference.Spec) (string, string, error) {
	if host == "docker.io" || host == "registry-1.docker.io" {
		// Creds of "docker.io" is stored keyed by "https://index.docker.io/v1/".
		host = "index.docker.io"
	}
	in.configMu.Lock()
	defer in.configMu.Unlock()
	cfg, ok := in.config[refspec.String()]
	if !ok {
		return "", "", nil
	}
	return resolver.ParseAuth(cfg, host)
}
// getCRI returns the backend CRI client, or nil when not yet connected.
func (in *instrumentedService) getCRI() (c runtime.ImageServiceClient) {
	in.criMu.Lock()
	defer in.criMu.Unlock()
	return in.cri
}
// ListImages proxies the request to the backend CRI service.
func (in *instrumentedService) ListImages(ctx context.Context, r *runtime.ListImagesRequest) (res *runtime.ListImagesResponse, err error) {
	backend := in.getCRI()
	if backend == nil {
		return nil, errors.New("server is not initialized yet")
	}
	return backend.ListImages(ctx, r)
}
// ImageStatus proxies the request to the backend CRI service.
func (in *instrumentedService) ImageStatus(ctx context.Context, r *runtime.ImageStatusRequest) (res *runtime.ImageStatusResponse, err error) {
	backend := in.getCRI()
	if backend == nil {
		return nil, errors.New("server is not initialized yet")
	}
	return backend.ImageStatus(ctx, r)
}
// PullImage records the request's AuthConfig keyed by the normalized image
// reference, then proxies the pull to the backend CRI service.
func (in *instrumentedService) PullImage(ctx context.Context, r *runtime.PullImageRequest) (res *runtime.PullImageResponse, err error) {
	backend := in.getCRI()
	if backend == nil {
		return nil, errors.New("server is not initialized yet")
	}
	refspec, err := parseReference(r.GetImage().GetImage())
	if err != nil {
		return nil, err
	}
	in.configMu.Lock()
	in.config[refspec.String()] = r.GetAuth()
	in.configMu.Unlock()
	return backend.PullImage(ctx, r)
}
// RemoveImage drops any auth recorded for the image, then proxies the
// removal to the backend CRI service.
func (in *instrumentedService) RemoveImage(ctx context.Context, r *runtime.RemoveImageRequest) (_ *runtime.RemoveImageResponse, err error) {
	backend := in.getCRI()
	if backend == nil {
		return nil, errors.New("server is not initialized yet")
	}
	refspec, err := parseReference(r.GetImage().GetImage())
	if err != nil {
		return nil, err
	}
	in.configMu.Lock()
	delete(in.config, refspec.String())
	in.configMu.Unlock()
	return backend.RemoveImage(ctx, r)
}
// ImageFsInfo proxies the request to the backend CRI service.
func (in *instrumentedService) ImageFsInfo(ctx context.Context, r *runtime.ImageFsInfoRequest) (res *runtime.ImageFsInfoResponse, err error) {
	backend := in.getCRI()
	if backend == nil {
		return nil, errors.New("server is not initialized yet")
	}
	return backend.ImageFsInfo(ctx, r)
}
// parseReference normalizes ref to a fully-qualified Docker reference and
// parses it into a containerd reference.Spec.
func parseReference(ref string) (reference.Spec, error) {
	named, err := distribution.ParseDockerRef(ref)
	if err != nil {
		return reference.Spec{}, fmt.Errorf("failed to parse image reference %q: %w", ref, err)
	}
	return reference.Parse(named.String())
}

View File

@@ -0,0 +1,49 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dockerconfig
import (
"context"
"github.com/containerd/containerd/v2/pkg/reference"
"github.com/containerd/log"
"github.com/containerd/stargz-snapshotter/service/resolver"
"github.com/docker/cli/cli/config"
)
func NewDockerconfigKeychain(ctx context.Context) resolver.Credential {
return func(host string, refspec reference.Spec) (string, string, error) {
cf, err := config.Load("")
if err != nil {
log.G(ctx).WithError(err).Warnf("failed to load docker config file")
return "", "", nil
}
if host == "docker.io" || host == "registry-1.docker.io" {
// Creds of docker.io is stored keyed by "https://index.docker.io/v1/".
host = "https://index.docker.io/v1/"
}
ac, err := cf.GetAuthConfig(host)
if err != nil {
return "", "", err
}
if ac.IdentityToken != "" {
return "", ac.IdentityToken, nil
}
return ac.Username, ac.Password, nil
}
}

View File

@@ -0,0 +1,262 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package kubeconfig
import (
"bytes"
"context"
"fmt"
"os"
"sync"
"time"
"github.com/containerd/containerd/v2/pkg/reference"
"github.com/containerd/log"
"github.com/containerd/stargz-snapshotter/service/resolver"
dcfile "github.com/docker/cli/cli/config/configfile"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/client-go/util/workqueue"
)
// dockerconfigSelector is the field selector used to list/watch only
// `kubernetes.io/dockerconfigjson` secrets.
const dockerconfigSelector = "type=" + string(corev1.SecretTypeDockerConfigJson)

// options holds optional settings for NewKubeconfigKeychain.
type options struct {
	// kubeconfigPath is an explicit kubeconfig file path; empty means the
	// default loading rules apply (KUBECONFIG env var, then ~/.kube/config).
	kubeconfigPath string
}

// Option configures NewKubeconfigKeychain.
type Option func(*options)

// WithKubeconfigPath sets the explicit kubeconfig path to use.
func WithKubeconfigPath(path string) Option {
	return func(opts *options) {
		opts.kubeconfigPath = path
	}
}
// NewKubeconfigKeychain provides a keychain which can sync its contents with
// the kubernetes API server by fetching all `kubernetes.io/dockerconfigjson`
// secrets in the cluster with the provided kubeconfig. The configured
// kubeconfig path may point to a file that doesn't exist yet; the keychain
// keeps polling the path periodically and starts syncing once the file
// appears. This is useful for some environments (e.g. single node cluster
// with containerized apiserver) where stargz snapshotter needs to start
// before everything, including booting containerd/kubelet/apiserver and
// configuring users/roles.
// TODO: support update of kubeconfig file
func NewKubeconfigKeychain(ctx context.Context, opts ...Option) resolver.Credential {
	var o options
	for _, apply := range opts {
		apply(&o)
	}
	return newKeychain(ctx, o.kubeconfigPath).credentials
}
// newKeychain constructs a keychain and starts a background goroutine that
// (1) waits for the kubeconfig file at kubeconfigPath to appear (when a path
// is given), (2) builds a kubernetes client from it, and (3) starts syncing
// dockerconfigjson secrets into kc.config. Any failure along the way disables
// syncing (logged) but leaves the returned keychain usable, just empty.
func newKeychain(ctx context.Context, kubeconfigPath string) *keychain {
	kc := &keychain{
		config: make(map[string]*dcfile.ConfigFile),
	}
	ctx = log.WithLogger(ctx, log.G(ctx).WithField("kubeconfig", kubeconfigPath))
	go func() {
		if kubeconfigPath != "" {
			log.G(ctx).Debugf("Waiting for kubeconfig being installed...")
			// Poll until the file exists; an unexpected stat error (e.g.
			// permission denied) disables syncing entirely.
			for {
				if _, err := os.Stat(kubeconfigPath); err == nil {
					break
				} else if !os.IsNotExist(err) {
					log.G(ctx).WithError(err).
						Warnf("failed to read; Disabling syncing")
					return
				}
				time.Sleep(10 * time.Second)
			}
		}

		// default loader for KUBECONFIG or `~/.kube/config`
		// if no explicit path provided, KUBECONFIG will be used.
		// if KUBECONFIG doesn't contain paths, `~/.kube/config` will be used.
		loadingRule := clientcmd.NewDefaultClientConfigLoadingRules()

		// explicitly provide path for kubeconfig.
		// if path isn't "", this path will be respected.
		loadingRule.ExplicitPath = kubeconfigPath

		// load and merge config files
		clientcfg, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(
			loadingRule,                  // loader for config files
			&clientcmd.ConfigOverrides{}, // no overrides for config
		).ClientConfig()
		if err != nil {
			log.G(ctx).WithError(err).Warnf("failed to load config; Disabling syncing")
			return
		}

		client, err := kubernetes.NewForConfig(clientcfg)
		if err != nil {
			log.G(ctx).WithError(err).Warnf("failed to prepare client; Disabling syncing")
			return
		}
		// Blocks for the lifetime of ctx; runs the secret sync worker loop.
		if err := kc.startSyncSecrets(ctx, client); err != nil {
			log.G(ctx).WithError(err).Warnf("failed to sync secrets")
		}
	}()
	return kc
}
// keychain resolves registry credentials from kubernetes
// `kubernetes.io/dockerconfigjson` secrets mirrored into memory.
type keychain struct {
	// config maps the secret's cache key (namespace/name) to its parsed
	// docker config file.
	config   map[string]*dcfile.ConfigFile
	configMu sync.Mutex // guards config

	// the following entries are used for syncing secrets with API server.
	// these fields are lazily filled after kubeconfig file is provided.
	queue    *workqueue.Typed[string]
	informer cache.SharedIndexInformer
}
// credentials looks up creds for host across all synced dockerconfig
// secrets. An identity token wins over username/password. Returns empty
// strings with no error when nothing matches.
func (kc *keychain) credentials(host string, refspec reference.Spec) (string, string, error) {
	// Creds of "docker.io" is stored keyed by "https://index.docker.io/v1/".
	if host == "docker.io" || host == "registry-1.docker.io" {
		host = "https://index.docker.io/v1/"
	}
	kc.configMu.Lock()
	defer kc.configMu.Unlock()
	for _, cfg := range kc.config {
		acfg, err := cfg.GetAuthConfig(host)
		if err != nil {
			continue
		}
		if acfg.IdentityToken != "" {
			return "", acfg.IdentityToken, nil
		}
		if acfg.Username != "" || acfg.Password != "" {
			return acfg.Username, acfg.Password, nil
		}
	}
	return "", "", nil
}
// startSyncSecrets starts mirroring all `kubernetes.io/dockerconfigjson`
// secrets in the cluster into kc.config via an informer plus workqueue.
// It blocks until ctx is done (the worker loop runs in this goroutine) and
// returns an error only when the initial cache sync times out.
func (kc *keychain) startSyncSecrets(ctx context.Context, client kubernetes.Interface) error {

	// don't let panics crash the process
	defer utilruntime.HandleCrash()

	// get informed on `kubernetes.io/dockerconfigjson` secrets in all namespaces
	informer := cache.NewSharedIndexInformer(
		&cache.ListWatch{
			ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
				// TODO: support legacy image secret `kubernetes.io/dockercfg`
				options.FieldSelector = dockerconfigSelector
				return client.CoreV1().Secrets(metav1.NamespaceAll).List(ctx, options)
			},
			WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
				// TODO: support legacy image secret `kubernetes.io/dockercfg`
				options.FieldSelector = dockerconfigSelector
				return client.CoreV1().Secrets(metav1.NamespaceAll).Watch(ctx, options)
			},
		},
		&corev1.Secret{},
		0, // no periodic resync
		cache.Indexers{},
	)

	// use workqueue because each task possibly takes long for parsing config,
	// waiting for lock, etc...
	queue := workqueue.NewTyped[string]()
	defer queue.ShutDown()
	// Every add/update/delete just enqueues the object's cache key; the
	// worker re-reads the current state from the informer's indexer.
	informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			key, err := cache.MetaNamespaceKeyFunc(obj)
			if err == nil {
				queue.Add(key)
			}
		},
		UpdateFunc: func(old, new interface{}) {
			key, err := cache.MetaNamespaceKeyFunc(new)
			if err == nil {
				queue.Add(key)
			}
		},
		DeleteFunc: func(obj interface{}) {
			// DeletionHandling... copes with tombstone objects.
			key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
			if err == nil {
				queue.Add(key)
			}
		},
	})
	go informer.Run(ctx.Done())
	if !cache.WaitForCacheSync(ctx.Done(), informer.HasSynced) {
		return fmt.Errorf("Timed out for syncing cache")
	}

	// get informer and queue
	kc.informer = informer
	kc.queue = queue

	// keep on syncing secrets
	wait.Until(kc.runWorker, time.Second, ctx.Done())
	return nil
}
// runWorker drains the workqueue until it is shut down.
func (kc *keychain) runWorker() {
	for {
		if !kc.processNextItem() {
			return
		}
	}
}
// processNextItem handles one secret key from the workqueue: it loads the
// secret from the informer cache, parses its .dockerconfigjson payload, and
// updates (or removes) the matching entry in kc.config. It returns false
// only when the queue has been shut down. Broken items are logged via
// utilruntime.HandleError and skipped.
// TODO: consider retrying?
func (kc *keychain) processNextItem() bool {
	key, quit := kc.queue.Get()
	if quit {
		return false
	}
	defer kc.queue.Done(key)

	obj, exists, err := kc.informer.GetIndexer().GetByKey(key)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("failed to get object; don't sync %q: %v", key, err))
		return true
	}
	if !exists {
		// The secret was deleted; drop its credentials.
		kc.configMu.Lock()
		delete(kc.config, key)
		kc.configMu.Unlock()
		return true
	}

	// TODO: support legacy image secret `kubernetes.io/dockercfg`
	data, ok := obj.(*corev1.Secret).Data[corev1.DockerConfigJsonKey]
	if !ok {
		utilruntime.HandleError(fmt.Errorf("no secret is provided; don't sync %q", key))
		return true
	}
	// Parse the payload from memory; the file name passed to New is unused here.
	configFile := dcfile.New("")
	if err := configFile.LoadFromReader(bytes.NewReader(data)); err != nil {
		utilruntime.HandleError(fmt.Errorf("broken data; don't sync %q: %v", key, err))
		return true
	}
	kc.configMu.Lock()
	kc.config[key] = configFile
	kc.configMu.Unlock()
	return true
}

Some files were not shown because too many files have changed in this diff Show More