/*
* Copyright (c) 1992, 1993, 1994 The Regents of the University of California.
* Copyright (c) 1992, 1993, 1994 Jan-Simon Pendry.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry and by John Heidemann of the UCLA Ficus project.
*
* %sccs.include.redist.c%
*
* @(#)union_vnops.c 1.1 (Berkeley) %G%
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/buf.h>
#include "union.h"
int union_bug_bypass = 0; /* for debugging: enables bypass printf'ing */
/*
* This is the 10-Apr-92 bypass routine.
* This version has been optimized for speed, throwing away some
* safety checks. It should still always work, but it's not as
* robust to programmer errors.
* Define SAFETY to include some error checking code.
*
* In general, we map all vnodes going down and unmap them on the way back.
* As an exception to this, vnodes can be marked "unmapped" by setting
 * the Nth bit in the operation's vdesc_flags.
*
* Also, some BSD vnode operations have the side effect of vrele'ing
* their arguments. With stacking, the reference counts are held
* by the upper node, not the lower one, so we must handle these
* side-effects here. This is not of concern in Sun-derived systems
* since there are no such side-effects.
*
* This makes the following assumptions:
* - only one returned vpp
* - no INOUT vpp's (Sun's vop_open has one of these)
* - the vnode operation vector of the first vnode should be used
* to determine what implementation of the op should be invoked
* - all mapped vnodes are of our vnode-type (NEEDSWORK:
* problems on rmdir'ing mount points and renaming?)
*/
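/*
 * For illustration only (nothing below is referenced by this file):
 * the mapping works because each vnodeop_desc records the byte
 * offsets of the vnode pointers inside that operation's argument
 * block, so a single bypass routine can rewrite any operation's
 * arguments without knowing which operation it is.  A hand-expanded
 * sketch for vop_open, assuming the standard 4.4BSD vnode_if.h
 * layout (vdesc_vp_offsets[0] holds the offset of a_vp):
 *
 *	struct vop_open_args oa;
 *	struct vnode **vpp;
 *	int error;
 *
 *	vpp = VOPARG_OFFSETTO(struct vnode **,
 *		vop_open_desc.vdesc_vp_offsets[0], &oa);
 *	*vpp = OTHERVP(*vpp);
 *	error = VCALL(*vpp, vop_open_desc.vdesc_offset, &oa);
 */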
int
union_bypass(ap)
struct vop_generic_args /* {
struct vnodeop_desc *a_desc;
<other random data follows, presumably>
} */ *ap;
{
struct vnode **this_vp_p;
int error;
struct vnode *old_vps[VDESC_MAX_VPS];
struct vnode **vps_p[VDESC_MAX_VPS];
struct vnode ***vppp;
struct vnodeop_desc *descp = ap->a_desc;
int reles, i;
if (union_bug_bypass)
printf ("union_bypass: %s\n", descp->vdesc_name);
#ifdef SAFETY
/*
* We require at least one vp.
*/
if (descp->vdesc_vp_offsets == NULL ||
descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET)
panic ("union_bypass: no vp's in map.\n");
#endif
/*
* Map the vnodes going in.
* Later, we'll invoke the operation based on
* the first mapped vnode's operation vector.
*/
reles = descp->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
vps_p[i] = this_vp_p =
VOPARG_OFFSETTO(struct vnode **, descp->vdesc_vp_offsets[i],ap);
/*
		 * We're not guaranteed that anything but the first vnode
		 * is of our type.  Check for and don't map any
		 * that aren't.  (We must always map the first vp
		 * or vclean fails.)
*/
if (i && (*this_vp_p)->v_op != union_vnodeop_p) {
old_vps[i] = NULL;
} else {
old_vps[i] = *this_vp_p;
*(vps_p[i]) = OTHERVP(*this_vp_p);
/*
* XXX - Several operations have the side effect
* of vrele'ing their vp's. We must account for
* that. (This should go away in the future.)
*/
if (reles & 1)
VREF(*this_vp_p);
}
}
/*
* Call the operation on the lower layer
* with the modified argument structure.
*/
error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap);
/*
* Maintain the illusion of call-by-value
* by restoring vnodes in the argument structure
* to their original value.
*/
reles = descp->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
if (old_vps[i]) {
*(vps_p[i]) = old_vps[i];
if (reles & 1)
vrele(*(vps_p[i]));
}
}
/*
* Map the possible out-going vpp
* (Assumes that the lower layer always returns
* a VREF'ed vpp unless it gets an error.)
*/
if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET &&
!(descp->vdesc_flags & VDESC_NOMAP_VPP) &&
!error) {
/*
		 * XXX - even though some ops return vp's via vpp,
		 * several ops actually vrele this before returning.
		 * We must avoid these ops.
		 * (This should go away when these ops are regularized.)
*/
if (descp->vdesc_flags & VDESC_VPP_WILLRELE)
goto out;
		vppp = VOPARG_OFFSETTO(struct vnode***,
				 descp->vdesc_vpp_offset,ap);
		panic("union: failed to handle returned vnode");
		/* NOTREACHED */
		error = union_allocvp(0, 0, 0, 0, 0, 0);
}
out:
return (error);
}
/*
 * Check access permission on the union vnode.
 * The access check is made against both the
 * lower vnode and any copied-up upper vnode.
 * This ensures that no additional file permissions
 * are given away simply because the user caused
 * an implicit file copy.
 */
int
union_access(ap)
struct vop_access_args /* {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
int a_mode;
struct ucred *a_cred;
struct proc *a_p;
} */ *ap;
{
struct union_node *un = VTOUNION(ap->a_vp);
struct vnode *vp;
if (vp = un->un_lowervp) {
int error;
error = VOP_ACCESS(vp, ap->a_mode, ap->a_cred, ap->a_p);
if (error)
return (error);
}
if (vp = un->un_uppervp)
return (VOP_ACCESS(vp, ap->a_mode, ap->a_cred, ap->a_p));
return (0);
}
static int
union_mkshadow(dvp, cnp, vpp)
struct vnode *dvp;
struct componentname *cnp;
	struct vnode **vpp;
{
int error;
struct vattr va;
struct proc *p = cnp->cn_proc;
int lockparent = (cnp->cn_flags & LOCKPARENT);
/*
* policy: when creating the shadow directory in the
* upper layer, create it owned by the current user,
* group from parent directory, and mode 777 modified
	 * by umask (i.e. mostly identical to the mkdir syscall).
* (jsp, kb)
* TODO: create the directory owned by the user who
* did the mount (um->um_cred).
*/
VATTR_NULL(&va);
va.va_type = VDIR;
va.va_mode = UN_DIRMODE &~ p->p_fd->fd_cmask;
if (lockparent)
VOP_UNLOCK(dvp);
LEASE_CHECK(dvp, p, p->p_ucred, LEASE_WRITE);
VOP_LOCK(dvp);
error = VOP_MKDIR(dvp, vpp, cnp, &va);
if (lockparent)
VOP_LOCK(dvp);
return (error);
}
static int
union_lookup1(dvp, vpp, cnp)
struct vnode *dvp;
struct vnode **vpp;
struct componentname *cnp;
{
int error;
struct vnode *tdvp;
struct mount *mp;
if (cnp->cn_flags & ISDOTDOT) {
for (;;) {
if ((dvp->v_flag & VROOT) == 0 ||
(cnp->cn_flags & NOCROSSMOUNT))
break;
tdvp = dvp;
dvp = dvp->v_mount->mnt_vnodecovered;
vput(tdvp);
VREF(dvp);
VOP_LOCK(dvp);
}
}
error = VOP_LOOKUP(dvp, &tdvp, cnp);
if (error)
return (error);
dvp = tdvp;
while ((dvp->v_type == VDIR) && (mp = dvp->v_mountedhere) &&
(cnp->cn_flags & NOCROSSMOUNT) == 0) {
if (mp->mnt_flag & MNT_MLOCK) {
mp->mnt_flag |= MNT_MWAIT;
sleep((caddr_t) mp, PVFS);
continue;
}
if (error = VFS_ROOT(mp, &tdvp)) {
vput(dvp);
return (error);
}
		vput(dvp);
		dvp = tdvp;
}
*vpp = dvp;
return (0);
}
int
union_lookup(ap)
struct vop_lookup_args /* {
struct vnodeop_desc *a_desc;
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap;
{
int uerror, lerror;
struct vnode *uppervp, *lowervp;
struct vnode *upperdvp, *lowerdvp;
struct vnode *dvp = ap->a_dvp;
struct union_node *dun = VTOUNION(ap->a_dvp);
struct componentname *cnp = ap->a_cnp;
int lockparent = cnp->cn_flags & LOCKPARENT;
upperdvp = dun->un_uppervp;
lowerdvp = dun->un_lowervp;
/*
* do the lookup in the upper level.
	 * if that level consumes additional pathname
	 * components, then assume that something special
	 * is going on and just return that vnode.
*/
uppervp = 0;
if (upperdvp) {
uerror = union_lookup1(upperdvp, &uppervp, cnp);
if (cnp->cn_consume != 0) {
*ap->a_vpp = uppervp;
return (uerror);
}
if (!lockparent)
VOP_LOCK(upperdvp);
} else {
uerror = ENOENT;
}
/*
* in a similar way to the upper layer, do the lookup
* in the lower layer. this time, if there is some
* component magic going on, then vput whatever we got
* back from the upper layer and return the lower vnode
* instead.
*/
lowervp = 0;
if (lowerdvp) {
lerror = union_lookup1(lowerdvp, &lowervp, cnp);
if (cnp->cn_consume != 0) {
if (uppervp) {
vput(uppervp);
uppervp = 0;
}
*ap->a_vpp = lowervp;
return (lerror);
}
if (!lockparent)
VOP_LOCK(lowerdvp);
} else {
lerror = ENOENT;
}
/*
* at this point, we have uerror and lerror indicating
* possible errors with the lookups in the upper and lower
* layers. additionally, uppervp and lowervp are (locked)
* references to existing vnodes in the upper and lower layers.
*
* there are now three cases to consider.
* 1. if both layers returned an error, then return whatever
* error the upper layer generated.
*
* 2. if the top layer failed and the bottom layer succeeded
* then two subcases occur.
* a. the bottom vnode is not a directory, in which
* case just return a new union vnode referencing
* an empty top layer and the existing bottom layer.
* b. the bottom vnode is a directory, in which case
* create a new directory in the top-level and
* continue as in case 3.
*
	 * 3. if the top layer succeeded then return a new union
	 *    vnode referencing whatever the top layer returned
	 *    and whatever the bottom layer returned.
*/
/* case 1. */
if ((uerror != 0) && (lerror != 0)) {
*ap->a_vpp = 0;
return (uerror);
}
/* case 2. */
if (uerror != 0 /* && (lerror == 0) */ ) {
if (lowervp->v_type == VDIR) { /* case 2b. */
uerror = union_mkshadow(upperdvp, cnp, &uppervp);
if (uerror) {
if (lowervp) {
vput(lowervp);
lowervp = 0;
}
return (uerror);
}
}
}
return (union_allocvp(ap->a_vpp, dvp->v_mount, dvp, cnp,
uppervp, lowervp));
}
/*
* copyfile. copy the vnode (fvp) to the vnode (tvp)
* using a sequence of reads and writes.
*/
static int
union_copyfile(p, cred, fvp, tvp)
struct proc *p;
struct ucred *cred;
struct vnode *fvp;
struct vnode *tvp;
{
char *buf;
struct uio uio;
struct iovec iov;
int error = 0;
off_t offset;
/*
* strategy:
* allocate a buffer of size MAXBSIZE.
* loop doing reads and writes, keeping track
* of the current uio offset.
* give up at the first sign of trouble.
*/
uio.uio_procp = p;
uio.uio_segflg = UIO_SYSSPACE;
offset = 0;
VOP_UNLOCK(fvp); /* XXX */
LEASE_CHECK(fvp, p, cred, LEASE_READ);
VOP_LOCK(fvp); /* XXX */
VOP_UNLOCK(tvp); /* XXX */
LEASE_CHECK(tvp, p, cred, LEASE_WRITE);
VOP_LOCK(tvp); /* XXX */
buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);
do {
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
iov.iov_base = buf;
iov.iov_len = MAXBSIZE;
uio.uio_resid = iov.iov_len;
uio.uio_offset = offset;
uio.uio_rw = UIO_READ;
error = VOP_READ(fvp, &uio, 0, cred);
		if (error == 0) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			iov.iov_base = buf;
			iov.iov_len = MAXBSIZE - uio.uio_resid;
			uio.uio_rw = UIO_WRITE;
			uio.uio_resid = iov.iov_len;
			uio.uio_offset = offset;
			/*
			 * A read which consumed no data means EOF;
			 * stop here, otherwise uio_resid is left at
			 * zero and the outer loop never terminates.
			 */
			if (iov.iov_len == 0)
				break;
			do {
				error = VOP_WRITE(tvp, &uio, 0, cred);
			} while (error == 0 && uio.uio_resid > 0);
			if (error == 0)
				offset = uio.uio_offset;
		}
} while ((uio.uio_resid == 0) && (error == 0));
free(buf, M_TEMP);
return (error);
}
int
union_open(ap)
struct vop_open_args /* {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
int a_mode;
struct ucred *a_cred;
struct proc *a_p;
} */ *ap;
{
struct union_node *un = VTOUNION(ap->a_vp);
int mode = ap->a_mode;
struct ucred *cred = ap->a_cred;
struct proc *p = ap->a_p;
/*
* If there is an existing upper vp then simply open that.
*/
if (un->un_uppervp)
return (VOP_OPEN(un->un_uppervp, mode, cred, p));
/*
* If the lower vnode is being opened for writing, then
* copy the file contents to the upper vnode and open that,
* otherwise can simply open the lower vnode.
*/
if ((ap->a_mode & FWRITE) && (un->un_lowervp->v_type == VREG)) {
int error;
struct nameidata nd;
struct filedesc *fdp = p->p_fd;
int fmode;
int cmode;
/*
* Open the named file in the upper layer. Note that
* the file may have come into existence *since* the lookup
* was done, since the upper layer may really be a
* loopback mount of some other filesystem... so open
* the file with exclusive create and barf if it already
* exists.
		 * XXX - perhaps should re-lookup the node (once more with
* feeling) and simply open that. Who knows.
*/
NDINIT(&nd, CREATE, 0, UIO_SYSSPACE, un->un_path, p);
fmode = (O_CREAT|O_TRUNC|O_EXCL);
cmode = UN_FILEMODE & ~fdp->fd_cmask;
error = vn_open(&nd, fmode, cmode);
if (error)
return (error);
un->un_uppervp = nd.ni_vp;
/*
* Now, if the file is being opened with truncation, then
* the (new) upper vnode is ready to fly, otherwise the
* data from the lower vnode must be copied to the upper
* layer first. This only works for regular files (check
* is made above).
*/
if ((mode & O_TRUNC) == 0) {
/* XXX - should not ignore errors from VOP_CLOSE */
error = VOP_OPEN(un->un_lowervp, FREAD, cred, p);
if (error == 0) {
error = union_copyfile(p, cred,
un->un_lowervp, un->un_uppervp);
				(void) VOP_CLOSE(un->un_lowervp, FREAD, cred, p);
}
			(void) VOP_CLOSE(un->un_uppervp, FWRITE, cred, p);
}
if (error == 0)
error = VOP_OPEN(un->un_uppervp, FREAD, cred, p);
return (error);
}
return (VOP_OPEN(un->un_lowervp, mode, cred, p));
}
/*
* We handle getattr only to change the fsid.
*/
int
union_getattr(ap)
struct vop_getattr_args /* {
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
struct proc *a_p;
} */ *ap;
{
int error;
if (error = union_bypass(ap))
return (error);
/* Requires that arguments be restored. */
ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
return (0);
}
/*
* union_readdir works in concert with getdirentries and
* readdir(3) to provide a list of entries in the unioned
* directories. getdirentries is responsible for walking
* down the union stack. readdir(3) is responsible for
* eliminating duplicate names from the returned data stream.
*/
int
union_readdir(ap)
struct vop_readdir_args /* {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
struct uio *a_uio;
struct ucred *a_cred;
} */ *ap;
{
struct union_node *un = VTOUNION(ap->a_vp);
if (un->un_uppervp)
return (union_bypass(ap));
return (0);
}
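/*
 * For illustration only: from userland the merge is invisible.
 * A name present in both layers comes back from getdirentries(2)
 * once per layer; the C library's readdir(3) is expected to filter
 * the duplicates, so an ordinary directory scan just works
 * (the mount point below is hypothetical):
 *
 *	DIR *dirp = opendir("/usr/union");
 *	struct dirent *dp;
 *
 *	while ((dp = readdir(dirp)) != NULL)
 *		printf("%s\n", dp->d_name);
 *	closedir(dirp);
 */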
int
union_inactive(ap)
struct vop_inactive_args /* {
struct vnode *a_vp;
} */ *ap;
{
/*
* Do nothing (and _don't_ bypass).
* Wait to vrele lowervp until reclaim,
* so that until then our union_node is in the
* cache and reusable.
*
* NEEDSWORK: Someday, consider inactive'ing
* the lowervp and then trying to reactivate it
* with capabilities (v_id)
* like they do in the name lookup cache code.
* That's too much work for now.
*/
return (0);
}
int
union_reclaim(ap)
struct vop_reclaim_args /* {
struct vnode *a_vp;
} */ *ap;
{
struct vnode *vp = ap->a_vp;
struct union_node *un = VTOUNION(vp);
struct vnode *uppervp = un->un_uppervp;
struct vnode *lowervp = un->un_lowervp;
struct vnode *dirvp = un->un_dirvp;
char *path = un->un_path;
/*
* Note: in vop_reclaim, vp->v_op == dead_vnodeop_p,
* so we can't call VOPs on ourself.
*/
/* After this assignment, this node will not be re-used. */
un->un_uppervp = 0;
un->un_lowervp = 0;
un->un_dirvp = 0;
un->un_path = NULL;
union_freevp(vp);
if (uppervp)
vrele(uppervp);
if (lowervp)
vrele(lowervp);
if (dirvp)
vrele(dirvp);
if (path)
free(path, M_TEMP);
return (0);
}
int
union_print(ap)
struct vop_print_args /* {
struct vnode *a_vp;
} */ *ap;
{
struct vnode *vp = ap->a_vp;
printf("\ttag VT_UNION, vp=%x, uppervp=%x, lowervp=%x\n",
vp, UPPERVP(vp), LOWERVP(vp));
return (0);
}
/*
* XXX - vop_strategy must be hand coded because it has no
* vnode in its arguments.
* This goes away with a merged VM/buffer cache.
*/
int
union_strategy(ap)
struct vop_strategy_args /* {
struct buf *a_bp;
} */ *ap;
{
struct buf *bp = ap->a_bp;
int error;
struct vnode *savedvp;
savedvp = bp->b_vp;
bp->b_vp = OTHERVP(bp->b_vp);
#ifdef DIAGNOSTIC
if (bp->b_vp == 0)
panic("union_strategy: nil vp");
if (((bp->b_flags & B_READ) == 0) &&
(bp->b_vp == LOWERVP(savedvp)))
panic("union_strategy: writing to lowervp");
#endif
error = VOP_STRATEGY(bp);
bp->b_vp = savedvp;
return (error);
}
/*
* XXX - like vop_strategy, vop_bwrite must be hand coded because it has no
* vnode in its arguments.
* This goes away with a merged VM/buffer cache.
*/
int
union_bwrite(ap)
struct vop_bwrite_args /* {
struct buf *a_bp;
} */ *ap;
{
struct buf *bp = ap->a_bp;
int error;
struct vnode *savedvp;
savedvp = bp->b_vp;
bp->b_vp = UPPERVP(bp->b_vp);
#ifdef DIAGNOSTIC
if (bp->b_vp == 0)
panic("union_bwrite: no upper vp");
#endif
error = VOP_BWRITE(bp);
bp->b_vp = savedvp;
return (error);
}
int
union_lock(ap)
struct vop_lock_args *ap;
{
struct union_node *un = VTOUNION(ap->a_vp);
#ifdef DIAGNOSTIC
if (un->un_pid == curproc->p_pid)
panic("union: locking agsinst myself");
#endif
while (un->un_flags & UN_LOCKED) {
un->un_flags |= UN_WANT;
sleep((caddr_t) &un->un_flags, PINOD);
}
un->un_flags |= UN_LOCKED;
#ifdef DIAGNOSTIC
un->un_pid = curproc->p_pid;
#endif
if (un->un_lowervp && !VOP_ISLOCKED(un->un_lowervp))
VOP_LOCK(un->un_lowervp);
if (un->un_uppervp && !VOP_ISLOCKED(un->un_uppervp))
VOP_LOCK(un->un_uppervp);
	return (0);
}
int
union_unlock(ap)
	struct vop_unlock_args *ap;
{
struct union_node *un = VTOUNION(ap->a_vp);
#ifdef DIAGNOSTIC
if (un->un_pid != curproc->p_pid)
panic("union: unlocking other process's union node");
if ((un->un_flags & UN_LOCKED) == 0)
panic("union: unlock unlocked node");
#endif
if (un->un_uppervp && VOP_ISLOCKED(un->un_uppervp))
VOP_UNLOCK(un->un_uppervp);
if (un->un_lowervp && VOP_ISLOCKED(un->un_lowervp))
VOP_UNLOCK(un->un_lowervp);
un->un_flags &= ~UN_LOCKED;
if (un->un_flags & UN_WANT) {
un->un_flags &= ~UN_WANT;
wakeup((caddr_t) &un->un_flags);
}
#ifdef DIAGNOSTIC
un->un_pid = 0;
#endif
	return (0);
}
/*
* Global vfs data structures
*/
int (**union_vnodeop_p)();
struct vnodeopv_entry_desc union_vnodeop_entries[] = {
{ &vop_default_desc, union_bypass },
{ &vop_getattr_desc, union_getattr },
{ &vop_inactive_desc, union_inactive },
{ &vop_reclaim_desc, union_reclaim },
{ &vop_print_desc, union_print },
{ &vop_strategy_desc, union_strategy },
{ &vop_bwrite_desc, union_bwrite },
{ &vop_lock_desc, union_lock },
{ &vop_unlock_desc, union_unlock },
{ (struct vnodeop_desc*)NULL, (int(*)())NULL }
};
struct vnodeopv_desc union_vnodeop_opv_desc =
{ &union_vnodeop_p, union_vnodeop_entries };