// Copyright 2019 the Go-FUSE Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package fs

import (
	"context"
	"fmt"
	"log"
	"math/rand"
	"sort"
	"strings"
	"sync"
	"syscall"
	"unsafe"

	"github.com/hanwen/go-fuse/v2/fuse"
)

// StableAttr holds immutable attributes of an object in the filesystem.
type StableAttr struct {
	// Each Inode has a type, which does not change over the
	// lifetime of the inode, for example fuse.S_IFDIR. The default (0)
	// is interpreted as S_IFREG (regular file).
	Mode uint32

	// The inode number must be unique among the currently live
	// objects in the file system. It is used to communicate to
	// the kernel about this file object. The value uint64(-1)
	// is reserved. When using Ino==0, a unique, sequential
	// number is assigned (starting at 2^63 by default) on Inode creation.
	Ino uint64

	// When reusing a previously used inode number for a new
	// object, the new object must have a different Gen
	// number. This is irrelevant if the FS is not exported over
	// NFS.
	Gen uint64
}

// Reserved returns whether the StableAttr uses a reserved inode number.
func (i *StableAttr) Reserved() bool {
	return i.Ino == ^uint64(0) // fuse.pollHackInode = ^uint64(0)
}
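// A minimal sketch of the two common ways to fill in StableAttr (the st
// variable is a hypothetical syscall.Stat_t of a backing file):
//
//	// Let go-fuse assign a unique, sequential inode number:
//	auto := StableAttr{Mode: syscall.S_IFREG}
//
//	// Pin the inode number to the backing file, so that hard-linked
//	// names resolve to the same Inode (see NewInode below):
//	pinned := StableAttr{Mode: st.Mode & syscall.S_IFMT, Ino: st.Ino}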
// Inode is a node in the VFS tree. Inodes are one-to-one mapped to
// Operations instances, which is the extension interface for file
// systems. One can create fully-formed trees of Inodes ahead of time
// by creating "persistent" Inodes.
//
// The Inode struct contains a lock, so it should not be
// copied. Inodes should be obtained by calling Inode.NewInode() or
// Inode.NewPersistentInode().
type Inode struct {
	stableAttr StableAttr

	ops    InodeEmbedder
	bridge *rawBridge

	// The *Node ID* is an arbitrary uint64 identifier chosen by the FUSE library.
	// It is used to identify *nodes* (files/directories/symlinks/...) in the
	// communication between the FUSE library and the Linux kernel.
	nodeId uint64

	// Following data is mutable.

	// file handles.
	// protected by bridge.mu
	openFiles []uint32

	// backing files, protected by bridge.mu
	backingIDRefcount int
	backingID         int32
	backingFd         int

	// mu protects the following mutable fields. When locking
	// multiple Inodes, locks must be acquired using
	// lockNodes/unlockNodes.
	mu sync.Mutex

	// persistent indicates that this node should not be removed
	// from the tree, even if there are no live references. This
	// must be set on creation, and can only be changed to false
	// by calling removeRef.
	// When you change this, you MUST increment changeCounter.
	persistent bool

	// changeCounter increments every time the mutable state
	// (lookupCount, persistent, children, parents) protected by
	// mu is modified.
	//
	// This is used in places where we have to relock the inode into an
	// inode group lock: after locking the group we check whether the
	// inode changed, and if it did, we retry the operation.
	changeCounter uint32

	// Number of kernel refs to this node.
	// When you change this, you MUST increment changeCounter.
	lookupCount uint64

	// Children of this Inode.
	// When you change this, you MUST increment changeCounter.
	children inodeChildren

	// Parents of this Inode. Can be more than one due to hard links.
	// When you change this, you MUST increment changeCounter.
	parents inodeParents
}

func (n *Inode) IsDir() bool {
	return n.stableAttr.Mode&syscall.S_IFMT == syscall.S_IFDIR
}

func (n *Inode) embed() *Inode {
	return n
}

func (n *Inode) EmbeddedInode() *Inode {
	return n
}

func initInode(n *Inode, ops InodeEmbedder, attr StableAttr, bridge *rawBridge, persistent bool, nodeId uint64) {
	n.ops = ops
	n.stableAttr = attr
	n.bridge = bridge
	n.persistent = persistent
	n.nodeId = nodeId
	if attr.Mode == fuse.S_IFDIR {
		n.children.init()
	}
}

// setEntryOut sets the node ID and mode in the EntryOut.
func (n *Inode) setEntryOut(out *fuse.EntryOut) {
	out.NodeId = n.nodeId
	out.Ino = n.stableAttr.Ino
	out.Mode = (out.Attr.Mode & 07777) | n.stableAttr.Mode
}

// StableAttr returns the (Ino, Gen) tuple for this node.
func (n *Inode) StableAttr() StableAttr {
	return n.stableAttr
}

// Mode returns the filetype.
func (n *Inode) Mode() uint32 {
	return n.stableAttr.Mode
}

// Root returns the root of the tree.
func (n *Inode) Root() *Inode {
	return n.bridge.root
}

// IsRoot returns whether this is the root of the tree.
func (n *Inode) IsRoot() bool {
	return n.bridge.root == n
}

func modeStr(m uint32) string {
	return map[uint32]string{
		syscall.S_IFREG:  "reg",
		syscall.S_IFLNK:  "lnk",
		syscall.S_IFDIR:  "dir",
		syscall.S_IFSOCK: "soc",
		syscall.S_IFIFO:  "pip",
		syscall.S_IFCHR:  "chr",
		syscall.S_IFBLK:  "blk",
	}[m]
}

func (a StableAttr) String() string {
	return fmt.Sprintf("i%d g%d (%s)", a.Ino, a.Gen, modeStr(a.Mode))
}

// String is used for debugging. Racy.
func (n *Inode) String() string {
	n.mu.Lock()
	defer n.mu.Unlock()
	return fmt.Sprintf("%s: %s", n.stableAttr.String(), n.children.String())
}

// sortNodes rearranges an inode group into a consistent order.
//
// The nodes are ordered by their in-RAM address, which gives the consistency
// property: for any inodes A and B, sortNodes will either always order A < B,
// or always order A > B.
//
// See lockNodes, where this property is used to avoid deadlock when taking
// locks on an inode group.
func sortNodes(ns []*Inode) {
	sort.Slice(ns, func(i, j int) bool {
		return nodeLess(ns[i], ns[j])
	})
}

func nodeLess(a, b *Inode) bool {
	return uintptr(unsafe.Pointer(a)) < uintptr(unsafe.Pointer(b))
}

// lockNodes locks a group of inodes.
//
// It always locks the inodes in the same order - to avoid deadlocks.
// It also avoids locking an inode more than once, if it was specified multiple times.
// An example when an inode might be given multiple times is if dir/a and dir/b
// are hardlinked to the same inode and the caller needs to take locks on dir children.
func lockNodes(ns ...*Inode) {
	sortNodes(ns)

	// The default value nil prevents trying to lock nil nodes.
	var nprev *Inode
	for _, n := range ns {
		if n != nprev {
			n.mu.Lock()
			nprev = n
		}
	}
}

// lockNode2 locks a and b in an order consistent with lockNodes.
func lockNode2(a, b *Inode) {
	if a == b {
		a.mu.Lock()
	} else if nodeLess(a, b) {
		a.mu.Lock()
		b.mu.Lock()
	} else {
		b.mu.Lock()
		a.mu.Lock()
	}
}

// unlockNode2 unlocks a and b.
func unlockNode2(a, b *Inode) {
	if a == b {
		a.mu.Unlock()
	} else {
		a.mu.Unlock()
		b.mu.Unlock()
	}
}

// unlockNodes releases locks taken by lockNodes.
func unlockNodes(ns ...*Inode) {
	// we don't need to unlock in the same order that was used in lockNodes.
	// however it still helps to have nodes sorted to avoid duplicates.
	sortNodes(ns)

	var nprev *Inode
	for _, n := range ns {
		if n != nprev {
			n.mu.Unlock()
			nprev = n
		}
	}
}
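// A sketch of the retry pattern that callers such as AddChild and RmChild
// below build on top of lockNodes/unlockNodes: snapshot changeCounter under
// the single-node lock, take the group lock, and retry if the node changed
// in between (illustrative only):
//
//	for {
//		n.mu.Lock()
//		prev := n.changeCounter
//		lockme := []*Inode{n, child}
//		n.mu.Unlock()
//
//		lockNodes(lockme...)
//		if n.changeCounter != prev {
//			unlockNodes(lockme...)
//			continue // something changed while unlocked; retry
//		}
//		// ... mutate n.children / child.parents here ...
//		unlockNodes(lockme...)
//		break
//	}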
// Forgotten returns true if the kernel holds no references to this
// inode. This can be used for background cleanup tasks, since the
// kernel has no way of reviving forgotten nodes on its own
// initiative.
//
// Bugs: Forgotten() may momentarily return true in the window between
// creation (NewInode) and adding the node into the tree, which
// happens after Lookup/Mkdir/etc. return.
//
// Deprecated: use NodeOnForgetter instead.
func (n *Inode) Forgotten() bool {
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.lookupCount == 0 && n.parents.count() == 0 && !n.persistent
}

// Operations returns the object implementing the file system
// operations.
func (n *Inode) Operations() InodeEmbedder {
	return n.ops
}

// Path returns a path string to the inode relative to `root`.
// Pass nil to walk the hierarchy as far up as possible.
//
// If you set `root`, Path() warns if it finds an orphaned Inode, i.e.
// if it does not end up at `root` after walking the hierarchy.
func (n *Inode) Path(root *Inode) string {
	var segments []string
	p := n
	for p != nil && p != root {
		// We don't try to take all locks at the same time, because
		// the caller won't use the "path" string under lock anyway.
		p.mu.Lock()

		// Get last known parent
		pd := p.parents.get()
		p.mu.Unlock()

		if pd == nil {
			p = nil
			break
		}
		segments = append(segments, pd.name)
		p = pd.parent
	}

	if root != nil && root != p {
		deletedPlaceholder := fmt.Sprintf(".go-fuse.%d/deleted", rand.Uint64())
		n.bridge.logf("warning: Inode.Path: n%d is orphaned, replacing segment with %q", n.nodeId, deletedPlaceholder)
		// NOSUBMIT - should replace rather than append?
		segments = append(segments, deletedPlaceholder)
	}

	i := 0
	j := len(segments) - 1
	for i < j {
		segments[i], segments[j] = segments[j], segments[i]
		i++
		j--
	}

	path := strings.Join(segments, "/")
	return path
}

// setEntry does `iparent[name] = ichild` linking.
//
// setEntry must not be called simultaneously for any of iparent or ichild.
// This, for example, could be satisfied if both iparent and ichild are locked,
// but it could also be valid if only iparent is locked and ichild was just
// created and only one goroutine keeps referencing it.
func (iparent *Inode) setEntry(name string, ichild *Inode) {
	if ichild.stableAttr.Mode == syscall.S_IFDIR {
		// Directories cannot have more than one parent. Clear the map.
		// This special-case is necessary because ichild may still have a
		// parent that was forgotten (i.e. removed from bridge.inoMap).
		ichild.parents.clear()
	}
	iparent.children.set(iparent, name, ichild)
}

// NewPersistentInode returns an Inode whose lifetime is not in
// control of the kernel.
//
// When the kernel is short on memory, it will forget cached file
// system information (directory entries and inode metadata). This is
// announced with FORGET messages. There are no guarantees if or when
// this happens. When it happens, these are handled transparently by
// go-fuse: all Inodes created with NewInode are released
// automatically. NewPersistentInode creates inodes that go-fuse keeps
// in memory, even if the kernel is not interested in them. This is
// convenient for building static trees up-front.
func (n *Inode) NewPersistentInode(ctx context.Context, node InodeEmbedder, id StableAttr) *Inode {
	return n.newInode(ctx, node, id, true)
}

// ForgetPersistent manually marks the node as no longer important. If
// it has no children, and if the kernel has no references, the node
// gets removed from the tree.
func (n *Inode) ForgetPersistent() {
	n.removeRef(0, true)
}
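// A minimal sketch of building a static subtree with NewPersistentInode from
// a root node's OnAdd callback (the myRoot type, which embeds Inode, and the
// file name are illustrative; MemRegularFile is the in-memory file node from
// this package):
//
//	func (r *myRoot) OnAdd(ctx context.Context) {
//		file := r.NewPersistentInode(ctx,
//			&MemRegularFile{Data: []byte("hello\n")},
//			StableAttr{Mode: syscall.S_IFREG})
//		r.AddChild("hello.txt", file, false)
//	}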
// NewInode returns an inode for the given InodeEmbedder. The mode
// should be a standard mode argument (e.g. S_IFDIR). The inode number
// in the id.Ino argument is used to implement hard-links. If it is
// given, and another node with the same ID is known, the new inode
// may be ignored, and the old one used instead.
func (n *Inode) NewInode(ctx context.Context, node InodeEmbedder, id StableAttr) *Inode {
	return n.newInode(ctx, node, id, false)
}

func (n *Inode) newInode(ctx context.Context, ops InodeEmbedder, id StableAttr, persistent bool) *Inode {
	return n.bridge.newInode(ctx, ops, id, persistent)
}

// removeRef decreases references. It reports whether the node still
// has kernel lookups, whether it is persistent, and whether it still
// has children. If all three are false, the node has been dropped
// from the tree.
func (n *Inode) removeRef(nlookup uint64, dropPersistence bool) (hasLookups, isPersistent, hasChildren bool) {
	var beforeLookups, beforePersistence, beforeChildren bool
	var unusedParents []*Inode
	beforeLookups, hasLookups, beforePersistence, isPersistent, beforeChildren, hasChildren, unusedParents = n.removeRefInner(nlookup, dropPersistence, unusedParents)
	if !hasLookups && !isPersistent && !hasChildren && (beforeChildren || beforeLookups || beforePersistence) {
		if nf, ok := n.ops.(NodeOnForgetter); ok {
			nf.OnForget()
		}
	}

	for len(unusedParents) > 0 {
		l := len(unusedParents)
		p := unusedParents[l-1]
		unusedParents = unusedParents[:l-1]
		_, _, _, _, _, _, unusedParents = p.removeRefInner(0, false, unusedParents)
		if nf, ok := p.ops.(NodeOnForgetter); ok {
			nf.OnForget()
		}
	}
	return
}

func (n *Inode) removeRefInner(nlookup uint64, dropPersistence bool, inputUnusedParents []*Inode) (beforeLookups, hasLookups, beforePersistent, isPersistent, beforeChildren, hasChildren bool, unusedParents []*Inode) {
	var lockme []*Inode
	var parents []parentData
	unusedParents = inputUnusedParents

	n.mu.Lock()
	beforeLookups = n.lookupCount > 0
	beforePersistent = n.persistent
	beforeChildren = n.children.len() > 0
	if nlookup > 0 && dropPersistence {
		log.Panic("only one allowed")
	} else if nlookup > n.lookupCount {
		log.Panicf("n%d lookupCount underflow: lookupCount=%d, decrement=%d", n.nodeId, n.lookupCount, nlookup)
	} else if nlookup > 0 {
		n.lookupCount -= nlookup
		n.changeCounter++
	} else if dropPersistence && n.persistent {
		n.persistent = false
		n.changeCounter++
	}

	n.bridge.mu.Lock()
	if n.lookupCount == 0 {
		// Dropping the node from stableAttrs guarantees that no new references to this node are
		// handed out to the kernel, hence we can also safely delete it from kernelNodeIds.
		delete(n.bridge.stableAttrs, n.stableAttr)
		delete(n.bridge.kernelNodeIds, n.nodeId)
	}
	n.bridge.mu.Unlock()

retry:
	for {
		lockme = append(lockme[:0], n)
		parents = parents[:0]
		nChange := n.changeCounter
		hasLookups = n.lookupCount > 0
		hasChildren = n.children.len() > 0
		isPersistent = n.persistent
		for _, p := range n.parents.all() {
			parents = append(parents, p)
			lockme = append(lockme, p.parent)
		}
		n.mu.Unlock()

		if hasLookups || hasChildren || isPersistent {
			return
		}

		lockNodes(lockme...)
		if n.changeCounter != nChange {
			unlockNodes(lockme...)
			// could avoid unlocking and relocking n here.
			n.mu.Lock()
			continue retry
		}

		for _, p := range parents {
			parentNode := p.parent
			if parentNode.children.get(p.name) != n {
				// another node has replaced us already
				continue
			}
			parentNode.children.del(p.parent, p.name)
			if parentNode.children.len() == 0 && parentNode.lookupCount == 0 && !parentNode.persistent {
				unusedParents = append(unusedParents, parentNode)
			}
		}

		if n.lookupCount != 0 {
			log.Panicf("n%d %p lookupCount changed: %d", n.nodeId, n, n.lookupCount)
		}

		unlockNodes(lockme...)
		break
	}

	return
}
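// Referring to NewInode above: a sketch of how a Lookup implementation might
// pass a stable Ino so that hard-linked names resolve to the same Inode
// (myNode is a hypothetical node type embedding Inode, and the backing path
// is made up; ToErrno and fuse.Attr.FromStat come from this module):
//
//	func (n *myNode) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*Inode, syscall.Errno) {
//		var st syscall.Stat_t
//		if err := syscall.Lstat("/backing/dir/"+name, &st); err != nil {
//			return nil, ToErrno(err)
//		}
//		out.Attr.FromStat(&st)
//		ch := n.NewInode(ctx, &myNode{}, StableAttr{Mode: st.Mode & syscall.S_IFMT, Ino: st.Ino})
//		return ch, 0
//	}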
// GetChild returns a child node with the given name, or nil if the
// directory has no child by that name.
func (n *Inode) GetChild(name string) *Inode {
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.children.get(name)
}

// AddChild adds a child to this node. If overwrite is false, the
// operation fails if the destination already exists.
func (n *Inode) AddChild(name string, ch *Inode, overwrite bool) (success bool) {
	if len(name) == 0 {
		log.Panic("empty name for inode")
	}

retry:
	for {
		lockNode2(n, ch)
		prev := n.children.get(name)
		parentCounter := n.changeCounter
		if prev == nil {
			n.children.set(n, name, ch)
			unlockNode2(n, ch)
			return true
		}
		unlockNode2(n, ch)
		if !overwrite {
			return false
		}

		lockme := [3]*Inode{n, ch, prev}
		lockNodes(lockme[:]...)
		if parentCounter != n.changeCounter {
			unlockNodes(lockme[:]...)
			continue retry
		}

		prev.parents.delete(parentData{name, n})
		n.children.set(n, name, ch)
		prev.changeCounter++
		unlockNodes(lockme[:]...)
		return true
	}
}

// Children returns the children of this directory Inode as a map
// keyed by name.
func (n *Inode) Children() map[string]*Inode {
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.children.toMap()
}

// childrenList returns the list of children of this directory Inode.
// The result is guaranteed to be stable as long as the directory did
// not change.
func (n *Inode) childrenList() []childEntry {
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.children.list()
}

// Parent returns a (name, parent) tuple for this Inode, or ("", nil)
// if this Inode is deleted or is the root.
func (n *Inode) Parent() (string, *Inode) {
	n.mu.Lock()
	defer n.mu.Unlock()
	p := n.parents.get()
	if p == nil {
		return "", nil
	}
	return p.name, p.parent
}

// RmAllChildren recursively drops a tree, forgetting all persistent
// nodes.
func (n *Inode) RmAllChildren() {
	for {
		chs := n.Children()
		if len(chs) == 0 {
			break
		}
		for nm, ch := range chs {
			ch.RmAllChildren()
			n.RmChild(nm)
		}
	}
	n.removeRef(0, true)
}

// RmChild removes multiple children. It returns whether the removal
// succeeded and whether the node is still live afterward. The removal
// is transactional: it only succeeds if all names are children, and
// if they all were removed successfully. If the removal was
// successful, and there are no children left, the node may be removed
// from the FS tree. In that case, RmChild returns live==false.
func (n *Inode) RmChild(names ...string) (success, live bool) {
	var lockme []*Inode

retry:
	for {
		n.mu.Lock()
		lockme = append(lockme[:0], n)
		nChange := n.changeCounter
		for _, nm := range names {
			ch := n.children.get(nm)
			if ch == nil {
				n.mu.Unlock()
				return false, true
			}
			lockme = append(lockme, ch)
		}
		n.mu.Unlock()

		lockNodes(lockme...)
		if n.changeCounter != nChange {
			unlockNodes(lockme...)
			continue retry
		}

		for _, nm := range names {
			n.children.del(n, nm)
		}

		live = n.lookupCount > 0 || n.children.len() > 0 || n.persistent
		unlockNodes(lockme...)

		// removal successful
		break
	}

	if !live {
		hasLookups, isPersistent, hasChildren := n.removeRef(0, false)
		return true, (hasLookups || isPersistent || hasChildren)
	}

	return true, true
}
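// A sketch of tearing down a subtree that was built with persistent inodes
// (the "static" name is illustrative):
//
//	if sub := root.GetChild("static"); sub != nil {
//		sub.RmAllChildren()
//		root.RmChild("static")
//	}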
// MvChild executes a rename. If overwrite is set, a child at the
// destination will be overwritten, should it exist. It returns false
// if 'overwrite' is false and the destination exists.
func (n *Inode) MvChild(old string, newParent *Inode, newName string, overwrite bool) bool {
	if len(newName) == 0 {
		log.Panicf("empty newName for MvChild")
	}

retry:
	for {
		lockNode2(n, newParent)
		counter1 := n.changeCounter
		counter2 := newParent.changeCounter
		oldChild := n.children.get(old)
		destChild := newParent.children.get(newName)
		unlockNode2(n, newParent)

		if destChild != nil && !overwrite {
			return false
		}

		lockNodes(n, newParent, oldChild, destChild)
		if counter2 != newParent.changeCounter || counter1 != n.changeCounter {
			unlockNodes(n, newParent, oldChild, destChild)
			continue retry
		}

		if oldChild != nil {
			n.children.del(n, old)
		}

		if destChild != nil {
			// This can cause the child to be slated for
			// removal; see below
			newParent.children.del(newParent, newName)
		}

		if oldChild != nil {
			newParent.children.set(newParent, newName, oldChild)
		}

		unlockNodes(n, newParent, oldChild, destChild)

		if destChild != nil {
			destChild.removeRef(0, false)
		}

		return true
	}
}

// ExchangeChild swaps the entries at (n, oldName) and (newParent,
// newName).
func (n *Inode) ExchangeChild(oldName string, newParent *Inode, newName string) {
	oldParent := n

retry:
	for {
		lockNode2(oldParent, newParent)
		counter1 := oldParent.changeCounter
		counter2 := newParent.changeCounter
		oldChild := oldParent.children.get(oldName)
		destChild := newParent.children.get(newName)
		unlockNode2(oldParent, newParent)

		if destChild == oldChild {
			return
		}

		lockNodes(oldParent, newParent, oldChild, destChild)
		if counter2 != newParent.changeCounter || counter1 != oldParent.changeCounter {
			unlockNodes(oldParent, newParent, oldChild, destChild)
			continue retry
		}

		// Detach
		if oldChild != nil {
			oldParent.children.del(oldParent, oldName)
		}

		if destChild != nil {
			newParent.children.del(newParent, newName)
		}

		// Attach
		if oldChild != nil {
			newParent.children.set(newParent, newName, oldChild)
		}

		if destChild != nil {
			oldParent.children.set(oldParent, oldName, destChild)
		}

		unlockNodes(oldParent, newParent, oldChild, destChild)
		return
	}
}

// NotifyEntry notifies the kernel that data for a (directory, name)
// tuple should be invalidated. On next access, a LOOKUP operation
// will be started.
func (n *Inode) NotifyEntry(name string) syscall.Errno {
	status := n.bridge.server.EntryNotify(n.nodeId, name)
	return syscall.Errno(status)
}

// NotifyDelete notifies the kernel that the given inode was removed
// from this directory as an entry under the given name. It is
// equivalent to NotifyEntry, but also sends an event to inotify
// watchers.
func (n *Inode) NotifyDelete(name string, child *Inode) syscall.Errno {
	// XXX arg ordering?
	return syscall.Errno(n.bridge.server.DeleteNotify(n.nodeId, child.nodeId, name))
}

// NotifyContent notifies the kernel that content under the given
// inode should be flushed from buffers.
func (n *Inode) NotifyContent(off, sz int64) syscall.Errno {
	// XXX how does this work for directories?
	return syscall.Errno(n.bridge.server.InodeNotify(n.nodeId, off, sz))
}

// WriteCache stores data in the kernel cache.
func (n *Inode) WriteCache(offset int64, data []byte) syscall.Errno {
	return syscall.Errno(n.bridge.server.InodeNotifyStoreCache(n.nodeId, offset, data))
}

// ReadCache reads data from the kernel cache.
func (n *Inode) ReadCache(offset int64, dest []byte) (count int, errno syscall.Errno) {
	c, s := n.bridge.server.InodeRetrieveCache(n.nodeId, offset, dest)
	return c, syscall.Errno(s)
}
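// A sketch of pushing invalidation to the kernel after the underlying data
// changed behind its back (the directory/file names and newSize are
// illustrative; the notify return values are ignored here for brevity):
//
//	if ch := dir.GetChild("log.txt"); ch != nil {
//		// drop cached pages for the range that changed:
//		ch.NotifyContent(0, newSize)
//	}
//	// the entry itself changed or disappeared:
//	dir.NotifyEntry("log.txt")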