containerd/vendor/github.com/hanwen/go-fuse/v2/fs/inode.go
Brad Davidson 63272c3b7a
Enable btrfs/fuse-overlayfs/stargz snapshotter plugins
Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
2025-03-20 21:34:00 +00:00

758 lines
21 KiB
Go

// Copyright 2019 the Go-FUSE Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package fs
import (
"context"
"fmt"
"log"
"math/rand"
"sort"
"strings"
"sync"
"syscall"
"unsafe"
"github.com/hanwen/go-fuse/v2/fuse"
)
// StableAttr holds immutable attributes of a object in the filesystem.
type StableAttr struct {
// Each Inode has a type, which does not change over the
// lifetime of the inode, for example fuse.S_IFDIR. The default (0)
// is interpreted as S_IFREG (regular file).
Mode uint32
// The inode number must be unique among the currently live
// objects in the file system. It is used to communicate to
// the kernel about this file object. The value uint64(-1)
// is reserved. When using Ino==0, a unique, sequential
// number is assigned (starting at 2^63 by default) on Inode creation.
Ino uint64
// When reusing a previously used inode number for a new
// object, the new object must have a different Gen
// number. This is irrelevant if the FS is not exported over
// NFS
Gen uint64
}
// Reserved returns if the StableAttr is using reserved Inode numbers.
func (i *StableAttr) Reserved() bool {
return i.Ino == ^uint64(0) // fuse.pollHackInode = ^uint64(0)
}
// Inode is a node in VFS tree. Inodes are one-to-one mapped to
// Operations instances, which is the extension interface for file
// systems. One can create fully-formed trees of Inodes ahead of time
// by creating "persistent" Inodes.
//
// The Inode struct contains a lock, so it should not be
// copied. Inodes should be obtained by calling Inode.NewInode() or
// Inode.NewPersistentInode().
type Inode struct {
stableAttr StableAttr
ops InodeEmbedder
bridge *rawBridge
// The *Node ID* is an arbitrary uint64 identifier chosen by the FUSE library.
// It is used the identify *nodes* (files/directories/symlinks/...) in the
// communication between the FUSE library and the Linux kernel.
nodeId uint64
// Following data is mutable.
// file handles.
// protected by bridge.mu
openFiles []uint32
// backing files, protected by bridge.mu
backingIDRefcount int
backingID int32
backingFd int
// mu protects the following mutable fields. When locking
// multiple Inodes, locks must be acquired using
// lockNodes/unlockNodes
mu sync.Mutex
// persistent indicates that this node should not be removed
// from the tree, even if there are no live references. This
// must be set on creation, and can only be changed to false
// by calling removeRef.
// When you change this, you MUST increment changeCounter.
persistent bool
// changeCounter increments every time the mutable state
// (lookupCount, persistent, children, parents) protected by
// mu is modified.
//
// This is used in places where we have to relock inode into inode
// group lock, and after locking the group we have to check if inode
// did not changed, and if it changed - retry the operation.
changeCounter uint32
// Number of kernel refs to this node.
// When you change this, you MUST increment changeCounter.
lookupCount uint64
// Children of this Inode.
// When you change this, you MUST increment changeCounter.
children inodeChildren
// Parents of this Inode. Can be more than one due to hard links.
// When you change this, you MUST increment changeCounter.
parents inodeParents
}
func (n *Inode) IsDir() bool {
return n.stableAttr.Mode&syscall.S_IFMT == syscall.S_IFDIR
}
func (n *Inode) embed() *Inode {
return n
}
func (n *Inode) EmbeddedInode() *Inode {
return n
}
func initInode(n *Inode, ops InodeEmbedder, attr StableAttr, bridge *rawBridge, persistent bool, nodeId uint64) {
n.ops = ops
n.stableAttr = attr
n.bridge = bridge
n.persistent = persistent
n.nodeId = nodeId
if attr.Mode == fuse.S_IFDIR {
n.children.init()
}
}
// Set node ID and mode in EntryOut
func (n *Inode) setEntryOut(out *fuse.EntryOut) {
out.NodeId = n.nodeId
out.Ino = n.stableAttr.Ino
out.Mode = (out.Attr.Mode & 07777) | n.stableAttr.Mode
}
// StableAttr returns the (Ino, Gen) tuple for this node.
func (n *Inode) StableAttr() StableAttr {
return n.stableAttr
}
// Mode returns the filetype
func (n *Inode) Mode() uint32 {
return n.stableAttr.Mode
}
// Returns the root of the tree
func (n *Inode) Root() *Inode {
return n.bridge.root
}
// Returns whether this is the root of the tree
func (n *Inode) IsRoot() bool {
return n.bridge.root == n
}
func modeStr(m uint32) string {
return map[uint32]string{
syscall.S_IFREG: "reg",
syscall.S_IFLNK: "lnk",
syscall.S_IFDIR: "dir",
syscall.S_IFSOCK: "soc",
syscall.S_IFIFO: "pip",
syscall.S_IFCHR: "chr",
syscall.S_IFBLK: "blk",
}[m]
}
func (a StableAttr) String() string {
return fmt.Sprintf("i%d g%d (%s)",
a.Ino, a.Gen, modeStr(a.Mode))
}
// debugString is used for debugging. Racy.
func (n *Inode) String() string {
n.mu.Lock()
defer n.mu.Unlock()
return fmt.Sprintf("%s: %s", n.stableAttr.String(), n.children.String())
}
// sortNodes rearranges inode group in consistent order.
//
// The nodes are ordered by their in-RAM address, which gives consistency
// property: for any A and B inodes, sortNodes will either always order A < B,
// or always order A > B.
//
// See lockNodes where this property is used to avoid deadlock when taking
// locks on inode group.
func sortNodes(ns []*Inode) {
sort.Slice(ns, func(i, j int) bool {
return nodeLess(ns[i], ns[j])
})
}
func nodeLess(a, b *Inode) bool {
return uintptr(unsafe.Pointer(a)) < uintptr(unsafe.Pointer(b))
}
// lockNodes locks group of inodes.
//
// It always lock the inodes in the same order - to avoid deadlocks.
// It also avoids locking an inode more than once, if it was specified multiple times.
// An example when an inode might be given multiple times is if dir/a and dir/b
// are hardlinked to the same inode and the caller needs to take locks on dir children.
func lockNodes(ns ...*Inode) {
sortNodes(ns)
// The default value nil prevents trying to lock nil nodes.
var nprev *Inode
for _, n := range ns {
if n != nprev {
n.mu.Lock()
nprev = n
}
}
}
// lockNode2 locks a and b in order consistent with lockNodes.
func lockNode2(a, b *Inode) {
if a == b {
a.mu.Lock()
} else if nodeLess(a, b) {
a.mu.Lock()
b.mu.Lock()
} else {
b.mu.Lock()
a.mu.Lock()
}
}
// unlockNode2 unlocks a and b
func unlockNode2(a, b *Inode) {
if a == b {
a.mu.Unlock()
} else {
a.mu.Unlock()
b.mu.Unlock()
}
}
// unlockNodes releases locks taken by lockNodes.
func unlockNodes(ns ...*Inode) {
// we don't need to unlock in the same order that was used in lockNodes.
// however it still helps to have nodes sorted to avoid duplicates.
sortNodes(ns)
var nprev *Inode
for _, n := range ns {
if n != nprev {
n.mu.Unlock()
nprev = n
}
}
}
// Forgotten returns true if the kernel holds no references to this
// inode. This can be used for background cleanup tasks, since the
// kernel has no way of reviving forgotten nodes by its own
// initiative.
//
// Bugs: Forgotten() may momentarily return true in the window between
// creation (NewInode) and adding the node into the tree, which
// happens after Lookup/Mkdir/etc. return.
//
// Deprecated: use NodeOnForgetter instead.
func (n *Inode) Forgotten() bool {
n.mu.Lock()
defer n.mu.Unlock()
return n.lookupCount == 0 && n.parents.count() == 0 && !n.persistent
}
// Operations returns the object implementing the file system
// operations.
func (n *Inode) Operations() InodeEmbedder {
return n.ops
}
// Path returns a path string to the inode relative to `root`.
// Pass nil to walk the hierarchy as far up as possible.
//
// If you set `root`, Path() warns if it finds an orphaned Inode, i.e.
// if it does not end up at `root` after walking the hierarchy.
func (n *Inode) Path(root *Inode) string {
var segments []string
p := n
for p != nil && p != root {
// We don't try to take all locks at the same time, because
// the caller won't use the "path" string under lock anyway.
p.mu.Lock()
// Get last known parent
pd := p.parents.get()
p.mu.Unlock()
if pd == nil {
p = nil
break
}
segments = append(segments, pd.name)
p = pd.parent
}
if root != nil && root != p {
deletedPlaceholder := fmt.Sprintf(".go-fuse.%d/deleted", rand.Uint64())
n.bridge.logf("warning: Inode.Path: n%d is orphaned, replacing segment with %q",
n.nodeId, deletedPlaceholder)
// NOSUBMIT - should replace rather than append?
segments = append(segments, deletedPlaceholder)
}
i := 0
j := len(segments) - 1
for i < j {
segments[i], segments[j] = segments[j], segments[i]
i++
j--
}
path := strings.Join(segments, "/")
return path
}
// setEntry does `iparent[name] = ichild` linking.
//
// setEntry must not be called simultaneously for any of iparent or ichild.
// This, for example could be satisfied if both iparent and ichild are locked,
// but it could be also valid if only iparent is locked and ichild was just
// created and only one goroutine keeps referencing it.
func (iparent *Inode) setEntry(name string, ichild *Inode) {
if ichild.stableAttr.Mode == syscall.S_IFDIR {
// Directories cannot have more than one parent. Clear the map.
// This special-case is neccessary because ichild may still have a
// parent that was forgotten (i.e. removed from bridge.inoMap).
ichild.parents.clear()
}
iparent.children.set(iparent, name, ichild)
}
// NewPersistentInode returns an Inode whose lifetime is not in
// control of the kernel.
//
// When the kernel is short on memory, it will forget cached file
// system information (directory entries and inode metadata). This is
// announced with FORGET messages. There are no guarantees if or when
// this happens. When it happens, these are handled transparently by
// go-fuse: all Inodes created with NewInode are released
// automatically. NewPersistentInode creates inodes that go-fuse keeps
// in memory, even if the kernel is not interested in them. This is
// convenient for building static trees up-front.
func (n *Inode) NewPersistentInode(ctx context.Context, node InodeEmbedder, id StableAttr) *Inode {
return n.newInode(ctx, node, id, true)
}
// ForgetPersistent manually marks the node as no longer important. If
// it has no children, and if the kernel as no references, the nodes
// gets removed from the tree.
func (n *Inode) ForgetPersistent() {
n.removeRef(0, true)
}
// NewInode returns an inode for the given InodeEmbedder. The mode
// should be standard mode argument (eg. S_IFDIR). The inode number in
// id.Ino argument is used to implement hard-links. If it is given,
// and another node with the same ID is known, the new inode may be
// ignored, and the old one used instead.
func (n *Inode) NewInode(ctx context.Context, node InodeEmbedder, id StableAttr) *Inode {
return n.newInode(ctx, node, id, false)
}
func (n *Inode) newInode(ctx context.Context, ops InodeEmbedder, id StableAttr, persistent bool) *Inode {
return n.bridge.newInode(ctx, ops, id, persistent)
}
// removeRef decreases references. Returns if this operation caused
// the node to be forgotten (for kernel references), and whether it is
// live (ie. was not dropped from the tree)
func (n *Inode) removeRef(nlookup uint64, dropPersistence bool) (hasLookups, isPersistent, hasChildren bool) {
var beforeLookups, beforePersistence, beforeChildren bool
var unusedParents []*Inode
beforeLookups, hasLookups, beforePersistence, isPersistent, beforeChildren, hasChildren, unusedParents = n.removeRefInner(nlookup, dropPersistence, unusedParents)
if !hasLookups && !isPersistent && !hasChildren && (beforeChildren || beforeLookups || beforePersistence) {
if nf, ok := n.ops.(NodeOnForgetter); ok {
nf.OnForget()
}
}
for len(unusedParents) > 0 {
l := len(unusedParents)
p := unusedParents[l-1]
unusedParents = unusedParents[:l-1]
_, _, _, _, _, _, unusedParents = p.removeRefInner(0, false, unusedParents)
if nf, ok := p.ops.(NodeOnForgetter); ok {
nf.OnForget()
}
}
return
}
func (n *Inode) removeRefInner(nlookup uint64, dropPersistence bool, inputUnusedParents []*Inode) (beforeLookups, hasLookups, beforePersistent, isPersistent, beforeChildren, hasChildren bool, unusedParents []*Inode) {
var lockme []*Inode
var parents []parentData
unusedParents = inputUnusedParents
n.mu.Lock()
beforeLookups = n.lookupCount > 0
beforePersistent = n.persistent
beforeChildren = n.children.len() > 0
if nlookup > 0 && dropPersistence {
log.Panic("only one allowed")
} else if nlookup > n.lookupCount {
log.Panicf("n%d lookupCount underflow: lookupCount=%d, decrement=%d", n.nodeId, n.lookupCount, nlookup)
} else if nlookup > 0 {
n.lookupCount -= nlookup
n.changeCounter++
} else if dropPersistence && n.persistent {
n.persistent = false
n.changeCounter++
}
n.bridge.mu.Lock()
if n.lookupCount == 0 {
// Dropping the node from stableAttrs guarantees that no new references to this node are
// handed out to the kernel, hence we can also safely delete it from kernelNodeIds.
delete(n.bridge.stableAttrs, n.stableAttr)
delete(n.bridge.kernelNodeIds, n.nodeId)
}
n.bridge.mu.Unlock()
retry:
for {
lockme = append(lockme[:0], n)
parents = parents[:0]
nChange := n.changeCounter
hasLookups = n.lookupCount > 0
hasChildren = n.children.len() > 0
isPersistent = n.persistent
for _, p := range n.parents.all() {
parents = append(parents, p)
lockme = append(lockme, p.parent)
}
n.mu.Unlock()
if hasLookups || hasChildren || isPersistent {
return
}
lockNodes(lockme...)
if n.changeCounter != nChange {
unlockNodes(lockme...)
// could avoid unlocking and relocking n here.
n.mu.Lock()
continue retry
}
for _, p := range parents {
parentNode := p.parent
if parentNode.children.get(p.name) != n {
// another node has replaced us already
continue
}
parentNode.children.del(p.parent, p.name)
if parentNode.children.len() == 0 && parentNode.lookupCount == 0 && !parentNode.persistent {
unusedParents = append(unusedParents, parentNode)
}
}
if n.lookupCount != 0 {
log.Panicf("n%d %p lookupCount changed: %d", n.nodeId, n, n.lookupCount)
}
unlockNodes(lockme...)
break
}
return
}
// GetChild returns a child node with the given name, or nil if the
// directory has no child by that name.
func (n *Inode) GetChild(name string) *Inode {
n.mu.Lock()
defer n.mu.Unlock()
return n.children.get(name)
}
// AddChild adds a child to this node. If overwrite is false, fail if
// the destination already exists.
func (n *Inode) AddChild(name string, ch *Inode, overwrite bool) (success bool) {
if len(name) == 0 {
log.Panic("empty name for inode")
}
retry:
for {
lockNode2(n, ch)
prev := n.children.get(name)
parentCounter := n.changeCounter
if prev == nil {
n.children.set(n, name, ch)
unlockNode2(n, ch)
return true
}
unlockNode2(n, ch)
if !overwrite {
return false
}
lockme := [3]*Inode{n, ch, prev}
lockNodes(lockme[:]...)
if parentCounter != n.changeCounter {
unlockNodes(lockme[:]...)
continue retry
}
prev.parents.delete(parentData{name, n})
n.children.set(n, name, ch)
prev.changeCounter++
unlockNodes(lockme[:]...)
return true
}
}
// Children returns the list of children of this directory Inode.
func (n *Inode) Children() map[string]*Inode {
n.mu.Lock()
defer n.mu.Unlock()
return n.children.toMap()
}
// childrenList returns the list of children of this directory Inode.
// The result is guaranteed to be stable as long as the directory did
// not change.
func (n *Inode) childrenList() []childEntry {
n.mu.Lock()
defer n.mu.Unlock()
return n.children.list()
}
// Parents returns a parent of this Inode, or nil if this Inode is
// deleted or is the root
func (n *Inode) Parent() (string, *Inode) {
n.mu.Lock()
defer n.mu.Unlock()
p := n.parents.get()
if p == nil {
return "", nil
}
return p.name, p.parent
}
// RmAllChildren recursively drops a tree, forgetting all persistent
// nodes.
func (n *Inode) RmAllChildren() {
for {
chs := n.Children()
if len(chs) == 0 {
break
}
for nm, ch := range chs {
ch.RmAllChildren()
n.RmChild(nm)
}
}
n.removeRef(0, true)
}
// RmChild removes multiple children. Returns whether the removal
// succeeded and whether the node is still live afterward. The removal
// is transactional: it only succeeds if all names are children, and
// if they all were removed successfully. If the removal was
// successful, and there are no children left, the node may be removed
// from the FS tree. In that case, RmChild returns live==false.
func (n *Inode) RmChild(names ...string) (success, live bool) {
var lockme []*Inode
retry:
for {
n.mu.Lock()
lockme = append(lockme[:0], n)
nChange := n.changeCounter
for _, nm := range names {
ch := n.children.get(nm)
if ch == nil {
n.mu.Unlock()
return false, true
}
lockme = append(lockme, ch)
}
n.mu.Unlock()
lockNodes(lockme...)
if n.changeCounter != nChange {
unlockNodes(lockme...)
continue retry
}
for _, nm := range names {
n.children.del(n, nm)
}
live = n.lookupCount > 0 || n.children.len() > 0 || n.persistent
unlockNodes(lockme...)
// removal successful
break
}
if !live {
hasLookups, isPersistent, hasChildren := n.removeRef(0, false)
return true, (hasLookups || isPersistent || hasChildren)
}
return true, true
}
// MvChild executes a rename. If overwrite is set, a child at the
// destination will be overwritten, should it exist. It returns false
// if 'overwrite' is false, and the destination exists.
func (n *Inode) MvChild(old string, newParent *Inode, newName string, overwrite bool) bool {
if len(newName) == 0 {
log.Panicf("empty newName for MvChild")
}
retry:
for {
lockNode2(n, newParent)
counter1 := n.changeCounter
counter2 := newParent.changeCounter
oldChild := n.children.get(old)
destChild := newParent.children.get(newName)
unlockNode2(n, newParent)
if destChild != nil && !overwrite {
return false
}
lockNodes(n, newParent, oldChild, destChild)
if counter2 != newParent.changeCounter || counter1 != n.changeCounter {
unlockNodes(n, newParent, oldChild, destChild)
continue retry
}
if oldChild != nil {
n.children.del(n, old)
}
if destChild != nil {
// This can cause the child to be slated for
// removal; see below
newParent.children.del(newParent, newName)
}
if oldChild != nil {
newParent.children.set(newParent, newName, oldChild)
}
unlockNodes(n, newParent, oldChild, destChild)
if destChild != nil {
destChild.removeRef(0, false)
}
return true
}
}
// ExchangeChild swaps the entries at (n, oldName) and (newParent,
// newName).
func (n *Inode) ExchangeChild(oldName string, newParent *Inode, newName string) {
oldParent := n
retry:
for {
lockNode2(oldParent, newParent)
counter1 := oldParent.changeCounter
counter2 := newParent.changeCounter
oldChild := oldParent.children.get(oldName)
destChild := newParent.children.get(newName)
unlockNode2(oldParent, newParent)
if destChild == oldChild {
return
}
lockNodes(oldParent, newParent, oldChild, destChild)
if counter2 != newParent.changeCounter || counter1 != oldParent.changeCounter {
unlockNodes(oldParent, newParent, oldChild, destChild)
continue retry
}
// Detach
if oldChild != nil {
oldParent.children.del(oldParent, oldName)
}
if destChild != nil {
newParent.children.del(newParent, newName)
}
// Attach
if oldChild != nil {
newParent.children.set(newParent, newName, oldChild)
}
if destChild != nil {
oldParent.children.set(oldParent, oldName, destChild)
}
unlockNodes(oldParent, newParent, oldChild, destChild)
return
}
}
// NotifyEntry notifies the kernel that data for a (directory, name)
// tuple should be invalidated. On next access, a LOOKUP operation
// will be started.
func (n *Inode) NotifyEntry(name string) syscall.Errno {
status := n.bridge.server.EntryNotify(n.nodeId, name)
return syscall.Errno(status)
}
// NotifyDelete notifies the kernel that the given inode was removed
// from this directory as entry under the given name. It is equivalent
// to NotifyEntry, but also sends an event to inotify watchers.
func (n *Inode) NotifyDelete(name string, child *Inode) syscall.Errno {
// XXX arg ordering?
return syscall.Errno(n.bridge.server.DeleteNotify(n.nodeId, child.nodeId, name))
}
// NotifyContent notifies the kernel that content under the given
// inode should be flushed from buffers.
func (n *Inode) NotifyContent(off, sz int64) syscall.Errno {
// XXX how does this work for directories?
return syscall.Errno(n.bridge.server.InodeNotify(n.nodeId, off, sz))
}
// WriteCache stores data in the kernel cache.
func (n *Inode) WriteCache(offset int64, data []byte) syscall.Errno {
return syscall.Errno(n.bridge.server.InodeNotifyStoreCache(n.nodeId, offset, data))
}
// ReadCache reads data from the kernel cache.
func (n *Inode) ReadCache(offset int64, dest []byte) (count int, errno syscall.Errno) {
c, s := n.bridge.server.InodeRetrieveCache(n.nodeId, offset, dest)
return c, syscall.Errno(s)
}