/*
* Copyright (c) 1989 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms are permitted
* provided that the above copyright notice and this paragraph are
* duplicated in all such forms and that any documentation,
* advertising materials, and other materials related to such
* distribution and use acknowledge that the software was developed
* by the University of California, Berkeley. The name of the
* University may not be used to endorse or promote products derived
* from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*
* @(#)nfs_socket.c 7.6 (Berkeley) %G%
*/
/*
* Socket operations for use by nfs (similar to uipc_socket.c, but never
* with copies to/from a uio vector)
* NB: For now, they only work for datagram sockets.
 * (Use on stream sockets would require some record boundary mark in the
 * stream, as defined by "RPC: Remote Procedure Call Protocol
 * Specification" RFC1057 Section 10, and different versions of send,
 * receive and reply that do not assume an atomic protocol.)
*/
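/*
 * (For reference, the record mark defined there is a single 4-byte
 * big-endian word prefixed to each fragment: the high-order bit is set
 * on the last fragment of a record and the remaining 31 bits give the
 * fragment length.  A stream transport version of these routines would
 * have to insert and strip such marks around every RPC message; the
 * datagram code below relies instead on each sockbuf record being one
 * complete message.)
 */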
#include "types.h"
#include "param.h"
#include "uio.h"
#include "user.h"
#include "proc.h"
#include "signal.h"
#include "mount.h"
#include "kernel.h"
#include "malloc.h"
#include "mbuf.h"
#include "vnode.h"
#include "domain.h"
#include "protosw.h"
#include "socket.h"
#include "socketvar.h"
#include "rpcv2.h"
#include "nfsv2.h"
#include "nfs.h"
#include "xdr_subs.h"
#include "nfsm_subs.h"
#include "nfsmount.h"
#include "syslog.h"
#define nfs_log(message, host) log(LOG_ERR, message, host)
#define TRUE 1
/* set lock on sockbuf sb, sleep at neg prio */
#define nfs_sblock(sb) { \
while ((sb)->sb_flags & SB_LOCK) { \
(sb)->sb_flags |= SB_WANT; \
sleep((caddr_t)&(sb)->sb_flags, PZERO-1); \
} \
(sb)->sb_flags |= SB_LOCK; \
}
/*
* nfs_sbwait() is simply sbwait() but at a negative priority so that it
* can not be interrupted by a signal.
*/
nfs_sbwait(sb)
struct sockbuf *sb;
{
sb->sb_flags |= SB_WAIT;
sleep((caddr_t)&sb->sb_cc, PZERO-2);
}
/*
* External data, mostly RPC constants in XDR form
*/
extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
rpc_msgaccepted, rpc_call;
extern u_long nfs_prog, nfs_vers;
int nfsrv_null(),
nfsrv_getattr(),
nfsrv_setattr(),
nfsrv_lookup(),
nfsrv_readlink(),
nfsrv_read(),
nfsrv_write(),
nfsrv_create(),
nfsrv_remove(),
nfsrv_rename(),
nfsrv_link(),
nfsrv_symlink(),
nfsrv_mkdir(),
nfsrv_rmdir(),
nfsrv_readdir(),
nfsrv_statfs(),
nfsrv_noop();
int (*nfsrv_procs[NFS_NPROCS])() = {
nfsrv_null,
nfsrv_getattr,
nfsrv_setattr,
nfsrv_noop,
nfsrv_lookup,
nfsrv_readlink,
nfsrv_read,
nfsrv_noop,
nfsrv_write,
nfsrv_create,
nfsrv_remove,
nfsrv_rename,
nfsrv_link,
nfsrv_symlink,
nfsrv_mkdir,
nfsrv_rmdir,
nfsrv_readdir,
nfsrv_statfs,
};
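/*
 * Slots 3 and 7 above correspond to the NFS version 2 ROOT and
 * WRITECACHE procedures, which are unused in the protocol, so they
 * are wired to nfsrv_noop.
 */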
struct nfshost *nfshosth;
struct nfsreq nfsreqh;
int nfsrexmtthresh = NFS_FISHY;
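/*
 * nfshosth heads the list of per-server nfshost structures, shared by
 * all mounts from the same server so that congestion state is kept per
 * host rather than per mount.  nfsreqh is the sentinel of a circular,
 * doubly-linked list of outstanding requests; new requests are appended
 * at the tail so that nfs_timer() scans the oldest ones first.
 * nfsrexmtthresh is the consecutive-retransmission count at which the
 * "server not responding" message is logged.
 */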
/*
* Initialize sockets and per-host congestion for a new NFS connection.
 * We do not free the sockaddr on error.
*/
nfs_connect(nmp, saddr)
register struct nfsmount *nmp;
struct mbuf *saddr;
{
int s, error, srvaddrlen;
struct mbuf *m;
register struct nfshost *nfshp;
nmp->nm_so = 0;
if (error = socreate(mtod(saddr, struct sockaddr *)->sa_family,
&nmp->nm_so, SOCK_DGRAM, 0))
goto bad;
	/* Unix-domain sockets get no implicit local binding, so bind
	 * one here for the server's reply */
if (mtod(saddr, struct sockaddr *)->sa_family == AF_UNIX) {
struct sockaddr *sa;
static char client[] = "/tmp/.nfs/nfsclient##";
static int serial;
int firstserial;
m = m_getclr(M_WAIT, MT_SONAME);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
m->m_len = sizeof (client) + 2;
sa = mtod(m, struct sockaddr *);
sa->sa_family = AF_UNIX;
#ifdef MSG_TRUNC /* Have sa_len to set? */
sa->sa_len = m->m_len;
#endif
bcopy(client, sa->sa_data, sizeof(client));
firstserial = serial;
do {
if (++serial >= 100) serial = 0;
sa->sa_data[19] = (serial / 10) + '0';
sa->sa_data[20] = (serial % 10) + '0';
error = sobind(nmp->nm_so, m);
if (firstserial == serial) break;
} while (error == EADDRINUSE);
m_freem(m);
if (error)
goto bad;
}
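	/*
	 * The two trailing '#' characters in the template above sit at
	 * sa_data[19] and sa_data[20]; they are overwritten with a
	 * two-digit serial number, retrying on EADDRINUSE until the
	 * counter wraps back to its starting value.
	 */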
if (error = soconnect(nmp->nm_so, saddr))
goto bad;
error = soreserve(nmp->nm_so, /* get space ! */
nmp->nm_wsize + 1024, /* one out */
(nmp->nm_rsize + 1024) * 4); /* four in */
if (error)
goto bad;
/*
* Search mount list for existing server entry.
*
* Note, even though we have a sockaddr, it is not quite reliable
* enough to bcmp against. For instance, a sockaddr_in has a
* sin_zero field which is not reliably zeroed by user code (e.g.
* mount). So what we do as an attempt at transport independence
* is to get the peeraddr of our connected socket into a zeroed
* sockaddr. Then we cache that and compare against it. This is
 * not exactly perfect, but it does not have to be: if we cannot match
 * the sockaddr we simply allocate a new nfshp per mount, which disables
 * the shared per-host congestion control but leaves everything else
 * working as normal.
*/
m = m_getclr(M_WAIT, MT_SONAME);
if (m && (*(nmp->nm_so->so_proto->pr_usrreq))(nmp->nm_so, PRU_PEERADDR,
(struct mbuf *)0, m, (struct mbuf *)0) == 0) {
m_freem(saddr);
saddr = m;
} else
m_freem(m);
srvaddrlen = saddr->m_len;
s = splnet();
for (nfshp = nfshosth; nfshp; nfshp = nfshp->nh_next) {
if (srvaddrlen != nfshp->nh_salen)
continue;
if (!bcmp(mtod(saddr,caddr_t),mtod(nfshp->nh_sockaddr,caddr_t),
srvaddrlen))
break;
}
if (nfshp) /* Have an existing mount host */
m_freem(saddr);
else {
MALLOC(nfshp,struct nfshost *,sizeof *nfshp,M_NFSMNT,M_WAITOK);
bzero((caddr_t)nfshp, sizeof *nfshp);
nfshp->nh_sockaddr = saddr;
nfshp->nh_salen = srvaddrlen;
/* Initialize other non-zero congestion variables */
nfshp->nh_currto = NFS_TIMEO;
nfshp->nh_window = 1; /* Initial send window */
nfshp->nh_ssthresh = NFS_MAXWINDOW; /* Slowstart threshold */
if (nfshosth) nfshosth->nh_prev = nfshp; /* Chain in */
nfshp->nh_next = nfshosth;
nfshosth = nfshp;
}
nfshp->nh_refcnt++;
splx(s);
nmp->nm_hostinfo = nfshp;
if (nmp->nm_rto == NFS_TIMEO) {
nmp->nm_rto = nfshp->nh_currto;
nmp->nm_rttvar = nmp->nm_rto << 1;
}
return (0);
bad:
if (nmp->nm_so) (void) soclose(nmp->nm_so);
nmp->nm_so = 0;
return (error);
}
/*
* NFS disconnect. Clean up and unlink.
*/
nfs_disconnect(nmp)
register struct nfsmount *nmp;
{
register struct nfshost *nfshp;
if (nmp->nm_so)
soclose(nmp->nm_so);
nmp->nm_so = 0;
if (nfshp = nmp->nm_hostinfo) {
int s = splnet();
if (--nfshp->nh_refcnt <= 0) {
if (nfshp->nh_next)
nfshp->nh_next->nh_prev = nfshp->nh_prev;
if (nfshp->nh_prev)
nfshp->nh_prev->nh_next = nfshp->nh_next;
else
nfshosth = nfshp->nh_next;
/* If unix family, remove the nfsclient from /tmp */
if (mtod(nfshp->nh_sockaddr,
struct sockaddr *)->sa_family == AF_UNIX) {
/* Lookup sa_data, do VOP_REMOVE... */
}
m_freem(nfshp->nh_sockaddr);
FREE(nfshp, M_NFSMNT);
}
nmp->nm_hostinfo = 0;
splx(s);
}
}
/*
* This is a stripped down non-interruptible version of sosend().
*/
nfs_send(so, nam, top, flags, siz)
register struct socket *so;
struct mbuf *nam;
struct mbuf *top;
int flags;
int siz;
{
int error, s;
#ifdef MGETHDR
top->m_pkthdr.len = siz;
#endif
for (;;) {
nfs_sblock(&so->so_snd);
s = splnet();
if (error = nfs_sockerr(so, 1)) {
splx(s);
m_freem(top);
break;
}
if (sbspace(&so->so_snd) < siz) {
sbunlock(&so->so_snd);
nfs_sbwait(&so->so_snd);
splx(s);
continue;
}
error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, top,
(struct mbuf *)nam, (struct mbuf *)0, (struct mbuf *)0);
splx(s);
break;
}
sbunlock(&so->so_snd);
return (error);
}
/*
* This is a stripped down datagram specific version of soreceive()
*/
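/*
 * Each sockbuf record is consumed whole: the leading MT_SONAME mbuf is
 * returned through *aname (or freed), any MT_CONTROL mbuf is dropped,
 * and the remaining data chain is handed back in *mp.  The msk and mtch
 * arguments are not examined by this routine.
 */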
nfs_dgreceive(so, msk, mtch, aname, mp)
register struct socket *so;
u_long msk;
u_long mtch;
struct mbuf **aname;
struct mbuf **mp;
{
register struct mbuf *m;
int s, error = 0;
struct mbuf *nextrecord;
if (aname)
*aname = 0;
for (;;) {
sblock(&so->so_rcv);
s = splnet();
if (so->so_rcv.sb_cc == 0) {
if (error = nfs_sockerr(so, 0)) {
so->so_error = 0;
break;
}
sbunlock(&so->so_rcv);
sbwait(&so->so_rcv);
splx(s);
continue;
}
m = so->so_rcv.sb_mb;
if (m == 0)
panic("nfs_dgreceive 1");
nextrecord = m->m_nextpkt;
/* Save sender's address */
if (m->m_type != MT_SONAME)
panic("nfs_dgreceive 1a");
sbfree(&so->so_rcv, m);
if (aname) {
*aname = m;
so->so_rcv.sb_mb = m->m_next;
m->m_next = 0;
m = so->so_rcv.sb_mb;
} else {
MFREE(m, so->so_rcv.sb_mb);
m = so->so_rcv.sb_mb;
}
/* Drop control mbuf's */
if (m && m->m_type == MT_RIGHTS)
panic("nfs_dgreceive 2");
if (m && m->m_type == MT_CONTROL) {
sbfree(&so->so_rcv, m);
MFREE(m, so->so_rcv.sb_mb);
m = so->so_rcv.sb_mb;
}
/* Dequeue packet from sockbuf */
*mp = m;
while (m) {
if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
panic("nfs_dgreceive 3");
sbfree(&so->so_rcv, m);
m = so->so_rcv.sb_mb = m->m_next;
}
so->so_rcv.sb_mb = nextrecord;
/* Return */
break;
}
sbunlock(&so->so_rcv);
splx(s);
return (error);
}
struct rpc_replyhead {
u_long r_xid;
u_long r_rep;
};
/*
* Implement NFS client side datagram receive.
* We depend on the way that records are added to the sockbuf
* by sbappend*. In particular, each record (mbufs linked through m_next)
* must begin with an address, followed by optional MT_CONTROL mbuf
* and then zero or more mbufs of data.
* We must search through the list of received datagrams matching them
* with outstanding requests using the xid, until ours is found.
*/
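/*
 * A queued datagram therefore looks like this in so_rcv:
 *
 *	sb_mb -> [MT_SONAME] -> ([MT_CONTROL]) -> [MT_DATA] -> ...  (m_next)
 *	            |
 *	        m_nextpkt -> next record ...
 *
 * The address and any control mbufs are stripped off before the RPC
 * header is examined.
 */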
nfs_dgreply(so, mntp, myrep)
register struct socket *so;
struct nfsmount *mntp;
struct nfsreq *myrep;
{
register struct mbuf *m;
register struct nfsreq *rep;
register int error = 0, s;
int logged = 0;
struct mbuf *nextrecord;
struct rpc_replyhead replyh;
restart:
nfs_sblock(&so->so_rcv);
s = splnet();
/* Already received and queued for us, bye bye */
if (myrep->r_mrep != NULL) {
error = 0;
goto release;
}
/* If we have run out of retries (hard mounts have bogus count) */
if (myrep->r_rexmit > myrep->r_retry) {
error = ETIMEDOUT;
nfsstats.rpctimeouts++;
giveup:
if (myrep->r_flags & R_TIMING) {
myrep->r_flags &= ~R_TIMING;
mntp->nm_rtt = -1;
}
if (myrep->r_flags & R_SENT) {
myrep->r_flags &= ~R_SENT;
--mntp->nm_hostinfo->nh_sent;
/* If count now 0, want to initiate new req */
}
goto release;
}
m = so->so_rcv.sb_mb;
if (m == 0) {
if (so->so_rcv.sb_cc)
panic("nfs_soreply 1");
if (error = nfs_sockerr(so, 0)) {
so->so_error = 0;
goto giveup;
}
/* Allow signals to interrupt request? (nfs_timer wakes up) */
if ((mntp->nm_flag & NFSMNT_INT) &&
u.u_procp->p_sig & ~u.u_procp->p_sigmask) {
error = EINTR;
goto giveup;
}
if (mntp->nm_rexmit >= nfsrexmtthresh && logged++ == 0)
uprintf("NFS server %s not responding, retrying\n",
mntp->nm_host);
sbunlock(&so->so_rcv);
nfs_sbwait(&so->so_rcv);
splx(s);
goto restart;
}
/*
* Take off the address, check for rights and ditch any control
* mbufs.
*/
nextrecord = m->m_nextpkt;
if (m->m_type != MT_SONAME)
panic("nfs reply SONAME");
sbfree(&so->so_rcv, m);
MFREE(m, so->so_rcv.sb_mb);
m = so->so_rcv.sb_mb;
if (m && m->m_type == MT_RIGHTS)
panic("nfs reply RIGHTS");
if (m && m->m_type == MT_CONTROL) {
sbfree(&so->so_rcv, m);
MFREE(m, so->so_rcv.sb_mb);
m = so->so_rcv.sb_mb;
}
if (m) {
m->m_nextpkt = nextrecord;
} else {
so->so_rcv.sb_mb = nextrecord;
sbunlock(&so->so_rcv);
splx(s);
goto restart;
}
/*
* Get the xid and check that it is an rpc reply
*/
if (m->m_len >= sizeof replyh)
bcopy(mtod(m, caddr_t), (caddr_t)&replyh, sizeof replyh);
else {
struct mbuf *mp = m;
caddr_t cp = (caddr_t)&replyh;
int cnt = sizeof replyh;
do {
if (mp->m_len > 0) {
int xfer = (mp->m_len >= cnt) ? cnt : mp->m_len;
bcopy(mtod(mp, caddr_t), cp, xfer);
cnt -= xfer;
cp += xfer;
}
if (cnt > 0)
mp = mp->m_next;
} while (mp && cnt > 0);
if (mp == NULL) { /* Insufficient length */
nfsstats.rpcinvalid++;
goto dropit;
}
}
if (replyh.r_rep != rpc_reply) { /* Not a reply */
nfsstats.rpcinvalid++;
goto dropit;
}
/*
* Loop through the request list to match up the reply
* If no match, just drop the datagram
*/
if (rep = nfsreqh.r_next) {
while (rep != &nfsreqh) {
/* The socket, being connected, will only queue matches */
if (replyh.r_xid == rep->r_xid && so == rep->r_mntp->nm_so) {
/* Found it.. */
if (rep->r_mrep) /* Already there - duplicate */
break;
rep->r_mrep = m;
while (m) {
if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
panic("nfs_soreply 3");
sbfree(&so->so_rcv, m);
m = so->so_rcv.sb_mb = m->m_next;
}
so->so_rcv.sb_mb = nextrecord;
if (rep->r_flags & R_TIMING) {
nfs_updatetimer(mntp);
rep->r_flags &= ~R_TIMING;
mntp->nm_rtt = -1; /* re-arm timer */
}
if (rep->r_flags & R_SENT) {
rep->r_flags &= ~R_SENT;
--mntp->nm_hostinfo->nh_sent;
/* If count now 0, want to initiate new req */
}
if (rep == myrep) { /* This is success */
if (logged)
uprintf("NFS server %s responded\n",
mntp->nm_host);
goto release;
}
/* Else wake up other sleeper and wait for next */
sbunlock(&so->so_rcv);
sorwakeup(so);
splx(s);
goto restart;
}
rep = rep->r_next;
}
}
/* If not matched to request, drop it */
nfsstats.rpcunexpected++;
dropit:
sbdroprecord(&so->so_rcv);
sbunlock(&so->so_rcv);
splx(s);
goto restart;
release:
sbunlock(&so->so_rcv);
splx(s);
return (error);
}
/*
* nfs_request - goes something like this
* - fill in request struct
* - links it into list
 * - calls nfs_send() for first transmit
 * - calls nfs_dgreply() to get reply
* - break down rpc header and return with nfs reply pointed to
* by mrep or error
* nb: always frees up mreq mbuf list
*/
nfs_request(vp, mreq, xid, idem, mp, mrp, mdp, dposp)
struct vnode *vp;
struct mbuf *mreq;
u_long xid;
int idem;
struct mount *mp;
struct mbuf **mrp;
struct mbuf **mdp;
caddr_t *dposp;
{
register struct mbuf *m, *mrep;
register struct nfsreq *rep;
register u_long *p;
register int len;
struct nfsmount *mntp;
struct mbuf *md;
struct nfsreq *reph;
caddr_t dpos;
char *cp2;
int t1;
int s;
int error;
mntp = vfs_to_nfs(mp);
m = mreq;
MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
rep->r_xid = xid;
rep->r_mntp = mntp;
rep->r_vp = vp;
if (mntp->nm_flag & NFSMNT_SOFT)
rep->r_retry = mntp->nm_retry;
else
rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
rep->r_flags = rep->r_rexmit = 0;
	/* Idempotency: non-idempotent requests wait an extra idem * NFS_MINTIMEO
	 * before the first retransmission; idempotent ones use 0 */
rep->r_timer = rep->r_timerinit = -(idem * NFS_MINTIMEO);
rep->r_mrep = NULL;
rep->r_mreq = m;
len = 0;
while (m) {
len += m->m_len;
m = m->m_next;
}
rep->r_msiz = len;
/*
* Do the client side RPC.
*/
nfsstats.rpcrequests++;
s = splnet();
/* Chain request into list of outstanding requests. Be sure
* to put it LAST so timer finds oldest requests first. */
reph = &nfsreqh;
if (reph->r_prev == NULL) {
reph->r_next = rep;
rep->r_prev = reph;
} else {
reph->r_prev->r_next = rep;
rep->r_prev = reph->r_prev;
}
reph->r_prev = rep;
rep->r_next = reph;
/*
* If backing off another request or avoiding congestion, don't
* send this one now but let timer do it. If not timing a request,
* do it now.
*/
if (mntp->nm_hostinfo->nh_sent > 0 &&
(mntp->nm_hostinfo->nh_currexmit != 0 ||
mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window)) {
splx(s);
goto skipsend;
}
++mntp->nm_hostinfo->nh_sent; /* Inconsistent if can't NFSMCOPY */
rep->r_flags |= R_SENT; /* But not a catastrophe */
if (mntp->nm_rtt == -1) {
mntp->nm_rtt = 0;
rep->r_flags |= R_TIMING;
}
splx(s);
/*
* If we can get a packet to send, send it off...
* otherwise the timer will retransmit later
*/
m = NFSMCOPY(mreq, 0, M_COPYALL, M_WAIT);
if (m != NULL)
(void) nfs_send(mntp->nm_so, (struct mbuf *)0, m, 0, len);
/*
* Wait for the reply from our send or the timer's.
*/
skipsend:
error = nfs_dgreply(mntp->nm_so, mntp, rep);
/*
* RPC done, unlink the request.
*/
s = splnet();
rep->r_prev->r_next = rep->r_next;
rep->r_next->r_prev = rep->r_prev;
splx(s);
m_freem(rep->r_mreq);
mrep = md = rep->r_mrep;
FREE((caddr_t)rep, M_NFSREQ);
if (error)
return (error);
/*
* break down the rpc header and check if ok
*/
dpos = mtod(md, caddr_t);
nfsm_disect(p, u_long *, 5*NFSX_UNSIGNED);
p += 2;
if (*p++ == rpc_msgdenied) {
if (*p == rpc_mismatch)
error = EOPNOTSUPP;
else
error = EACCES;
m_freem(mrep);
return (error);
}
/*
* skip over the auth_verf, someday we may want to cache auth_short's
* for nfs_reqhead(), but for now just dump it
*/
if (*++p != 0) {
len = nfsm_rndup(fxdr_unsigned(long, *p));
nfsm_adv(len);
}
nfsm_disect(p, u_long *, NFSX_UNSIGNED);
/* 0 == ok */
if (*p == 0) {
nfsm_disect(p, u_long *, NFSX_UNSIGNED);
if (*p != 0) {
error = fxdr_unsigned(int, *p);
m_freem(mrep);
return (error);
}
*mrp = mrep;
*mdp = md;
*dposp = dpos;
return (0);
}
m_freem(mrep);
return (EPROTONOSUPPORT);
nfsmout:
return (error);
}
/*
* Get a request for the server main loop
 * - receive a request via nfs_dgreceive()
* - verify it
* - fill in the cred struct.
*/
nfs_getreq(so, prog, vers, maxproc, nam, mrp, mdp, dposp, retxid, proc, cr,
msk, mtch)
struct socket *so;
u_long prog;
u_long vers;
int maxproc;
struct mbuf **nam;
struct mbuf **mrp;
struct mbuf **mdp;
caddr_t *dposp;
u_long *retxid;
u_long *proc;
register struct ucred *cr;
u_long msk;
u_long mtch;
{
register int i;
register u_long *p;
register long t1;
caddr_t dpos, cp2;
int error = 0;
struct mbuf *mrep, *md;
int len;
if (error = nfs_dgreceive(so, msk, mtch, nam, &mrep))
return (error);
md = mrep;
dpos = mtod(mrep, caddr_t);
nfsm_disect(p, u_long *, 10*NFSX_UNSIGNED);
*retxid = *p++;
if (*p++ != rpc_call) {
m_freem(mrep);
return (ERPCMISMATCH);
}
if (*p++ != rpc_vers) {
m_freem(mrep);
return (ERPCMISMATCH);
}
if (*p++ != prog) {
m_freem(mrep);
return (EPROGUNAVAIL);
}
if (*p++ != vers) {
m_freem(mrep);
return (EPROGMISMATCH);
}
*proc = fxdr_unsigned(u_long, *p++);
if (*proc == NFSPROC_NULL) {
*mrp = mrep;
return (0);
}
if (*proc > maxproc || *p++ != rpc_auth_unix) {
m_freem(mrep);
return (EPROCUNAVAIL);
}
(void) fxdr_unsigned(int, *p++);
len = fxdr_unsigned(int, *++p);
nfsm_adv(nfsm_rndup(len));
nfsm_disect(p, u_long *, 3*NFSX_UNSIGNED);
cr->cr_uid = fxdr_unsigned(uid_t, *p++);
cr->cr_gid = fxdr_unsigned(gid_t, *p++);
len = fxdr_unsigned(int, *p);
if (len > 10) {
m_freem(mrep);
return (EBADRPC);
}
nfsm_disect(p, u_long *, (len + 2)*NFSX_UNSIGNED);
for (i = 1; i <= len; i++)
cr->cr_groups[i] = fxdr_unsigned(gid_t, *p++);
cr->cr_ngroups = len + 1;
/*
 * Do we have any use for the verifier?
* According to the "Remote Procedure Call Protocol Spec." it
* should be AUTH_NULL, but some clients make it AUTH_UNIX?
* For now, just skip over it
*/
len = fxdr_unsigned(int, *++p);
if (len > 0)
nfsm_adv(nfsm_rndup(len));
*mrp = mrep;
*mdp = md;
*dposp = dpos;
return (0);
nfsmout:
return (error);
}
/*
* Generate the rpc reply header
* siz arg. is used to decide if adding a cluster is worthwhile
*/
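/*
 * For the usual accepted-reply case the six words built below are, in
 * order: xid, REPLY, MSG_ACCEPTED, verifier flavor (AUTH_NULL),
 * verifier length (0), and the accept status.  ERPCMISMATCH instead
 * produces MSG_DENIED/RPC_MISMATCH followed by the low and high
 * supported RPC version numbers (both 2 here).
 */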
nfs_rephead(siz, retxid, err, mrq, mbp, bposp)
int siz;
u_long retxid;
int err;
struct mbuf **mrq;
struct mbuf **mbp;
caddr_t *bposp;
{
register u_long *p;
register long t1;
caddr_t bpos;
struct mbuf *mreq, *mb, *mb2;
NFSMGETHDR(mreq);
mb = mreq;
if ((siz+RPC_REPLYSIZ) > MHLEN)
NFSMCLGET(mreq, M_WAIT);
p = mtod(mreq, u_long *);
mreq->m_len = 6*NFSX_UNSIGNED;
bpos = ((caddr_t)p)+mreq->m_len;
*p++ = retxid;
*p++ = rpc_reply;
if (err == ERPCMISMATCH) {
*p++ = rpc_msgdenied;
*p++ = rpc_mismatch;
*p++ = txdr_unsigned(2);
*p = txdr_unsigned(2);
} else {
*p++ = rpc_msgaccepted;
*p++ = 0;
*p++ = 0;
switch (err) {
case EPROGUNAVAIL:
*p = txdr_unsigned(RPC_PROGUNAVAIL);
break;
case EPROGMISMATCH:
*p = txdr_unsigned(RPC_PROGMISMATCH);
nfsm_build(p, u_long *, 2*NFSX_UNSIGNED);
*p++ = txdr_unsigned(2);
*p = txdr_unsigned(2); /* someday 3 */
break;
case EPROCUNAVAIL:
*p = txdr_unsigned(RPC_PROCUNAVAIL);
break;
default:
*p = 0;
if (err != VNOVAL) {
nfsm_build(p, u_long *, NFSX_UNSIGNED);
*p = txdr_unsigned(err);
}
break;
};
}
*mrq = mreq;
*mbp = mb;
*bposp = bpos;
if (err != 0 && err != VNOVAL)
nfsstats.srvrpc_errs++;
return (0);
}
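/*
 * (The server main loop, which lives outside this file, is expected to
 * pair these routines roughly as: nfs_getreq() to pull in and validate
 * a call, a dispatch through nfsrv_procs[proc], and nfs_rephead() from
 * within the service routine to start building the reply.)
 */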
/*
* Nfs timer routine
 * Scan the nfsreq list and retransmit any requests that have timed out
* To avoid retransmission attempts on STREAM sockets (in the future) make
* sure to set the r_retry field to 0 (implies nm_retry == 0).
*/
nfs_timer()
{
register struct nfsreq *rep;
register struct mbuf *m;
register struct socket *so;
register struct nfsmount *mntp;
int s, error;
s = splnet();
rep = nfsreqh.r_next;
if (rep) for ( ; rep != &nfsreqh; rep = rep->r_next) {
mntp = rep->r_mntp;
if (rep->r_flags & R_TIMING) /* update rtt in mount */
mntp->nm_rtt++;
/* If not timed out or reply already received, skip */
if (++rep->r_timer < mntp->nm_rto || rep->r_mrep)
continue;
/* Do backoff and save new timeout in mount */
if (rep->r_flags & R_TIMING) {
nfs_backofftimer(mntp);
rep->r_flags &= ~R_TIMING;
mntp->nm_rtt = -1;
}
if (rep->r_flags & R_SENT) {
rep->r_flags &= ~R_SENT;
--mntp->nm_hostinfo->nh_sent;
}
/* Check state of socket, cf nfs_send */
so = mntp->nm_so;
if (error = nfs_sockerr(so, 1))
goto wakeup;
if (sbspace(&so->so_snd) < rep->r_msiz)
goto wakeup;
/* Check for too many retries, cf nfs_dgreply */
if (++rep->r_rexmit > NFS_MAXREXMIT) /* clip */
rep->r_rexmit = NFS_MAXREXMIT;
if (rep->r_rexmit > rep->r_retry) /* too many */
goto wakeup;
/* Check for congestion control, cf nfs_request */
if (mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window)
goto wakeup;
/* Send it! */
m = NFSMCOPY(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT);
if (m == NULL)
goto wakeup;
nfsstats.rpcretries++;
#ifdef MGETHDR
m->m_pkthdr.len = rep->r_msiz;
#endif
(void)(*so->so_proto->pr_usrreq)(so, PRU_SEND, m,
(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
/* We need to time the request even though we're
* retransmitting, in order to maintain backoff. */
mntp->nm_rtt = 0;
++mntp->nm_hostinfo->nh_sent;
rep->r_flags |= (R_SENT|R_TIMING);
rep->r_timer = rep->r_timerinit;
wakeup:
/* If error or interruptible mount, give user a look */
if (error || (mntp->nm_flag & NFSMNT_INT))
sorwakeup(so);
}
splx(s);
timeout(nfs_timer, (caddr_t)0, hz/NFS_HZ);
}
/*
* NFS timer update and backoff. The "Jacobson/Karels/Karn" scheme is
* used here. The timer state is held in the nfsmount structure and
* a single request is used to clock the response. When successful
* the rtt smoothing in nfs_updatetimer is used, when failed the backoff
* is done by nfs_backofftimer. We also log failure messages in these
* routines.
*
* Congestion variables are held in the nfshost structure which
* is referenced by nfsmounts and shared per-server. This separation
* makes it possible to do per-mount timing which allows varying disk
* access times to be dealt with, while preserving a network oriented
* congestion control scheme.
*
* The windowing implements the Jacobson/Karels slowstart algorithm
* with adjusted scaling factors. We start with one request, then send
* 4 more after each success until the ssthresh limit is reached, then
* we increment at a rate proportional to the window. On failure, we
* remember 3/4 the current window and clamp the send limit to 1. Note
* ICMP source quench is not reflected in so->so_error so we ignore that
* for now.
*
* NFS behaves much more like a transport protocol with these changes,
* shedding the teenage pedal-to-the-metal tendencies of "other"
* implementations.
*
* Timers and congestion avoidance by Tom Talpey, Open Software Foundation.
*/
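/*
 * For example, with the initial nh_window of 1 and nh_ssthresh of
 * NFS_MAXWINDOW set up in nfs_connect(), the send window grows
 * 1, 5, 9, ... with each timed success, and above nh_ssthresh it creeps
 * up one request at a time as nh_winext accumulates.  A timeout with a
 * window of 16 sets nh_ssthresh to 12 (3/4 of 16) and drops the window
 * back to 1 for the next slowstart.
 */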
/*
* The TCP algorithm was not forgiving enough. Because the NFS server
* responds only after performing lookups/diskio/etc, we have to be
* more prepared to accept a spiky variance. The TCP algorithm is:
* TCP_RTO(mntp) ((((mntp)->nm_srtt >> 2) + (mntp)->nm_rttvar) >> 1)
*/
#define NFS_RTO(mntp) (((mntp)->nm_srtt >> 3) + (mntp)->nm_rttvar)
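/*
 * E.g. with nm_srtt == 24 and nm_rttvar == 8, the TCP form above yields
 * ((24 >> 2) + 8) >> 1 == 7 ticks while NFS_RTO gives (24 >> 3) + 8 == 11,
 * i.e. the variance term is weighted much more heavily to tolerate spiky
 * server response times.
 */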
nfs_updatetimer(mntp)
register struct nfsmount *mntp;
{
register struct nfshost *nfshp = mntp->nm_hostinfo;
/* If retransmitted, clear and return */
if (mntp->nm_rexmit || nfshp->nh_currexmit) {
if (nfshp->nh_currexmit >= nfsrexmtthresh)
nfs_log("NFS server %s OK\n", mntp->nm_host);
mntp->nm_rexmit = nfshp->nh_currexmit = 0;
return;
}
/* If have a measurement, do smoothing */
if (mntp->nm_srtt) {
register short delta;
delta = mntp->nm_rtt - (mntp->nm_srtt >> 3);
if ((mntp->nm_srtt += delta) <= 0)
mntp->nm_srtt = 1;
if (delta < 0)
delta = -delta;
delta -= (mntp->nm_rttvar >> 2);
if ((mntp->nm_rttvar += delta) <= 0)
mntp->nm_rttvar = 1;
/* Else initialize */
} else {
mntp->nm_rttvar = mntp->nm_rtt << 1;
if (mntp->nm_rttvar == 0) mntp->nm_rttvar = 2;
mntp->nm_srtt = mntp->nm_rttvar << 2;
}
/* Compute new Retransmission TimeOut and clip */
mntp->nm_rto = NFS_RTO(mntp);
if (mntp->nm_rto < NFS_MINTIMEO)
mntp->nm_rto = NFS_MINTIMEO;
else if (mntp->nm_rto > NFS_MAXTIMEO)
mntp->nm_rto = NFS_MAXTIMEO;
nfshp->nh_currto = mntp->nm_rto;
/* Update window estimate */
if (nfshp->nh_window < nfshp->nh_ssthresh) /* quickly */
nfshp->nh_window += 4;
else { /* slowly */
register long incr = ++nfshp->nh_winext;
incr = (incr * incr) / nfshp->nh_window;
if (incr > 0) {
nfshp->nh_winext = 0;
++nfshp->nh_window;
}
}
if (nfshp->nh_window > NFS_MAXWINDOW)
nfshp->nh_window = NFS_MAXWINDOW;
}
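/*
 * Worked example for nfs_updatetimer(): the first timed reply with
 * nm_rtt == 4 initializes nm_rttvar to 8 and nm_srtt to 32, giving
 * NFS_RTO == (32 >> 3) + 8 == 12.  A later measurement of 6 produces
 * delta == 6 - (32 >> 3) == 2, so nm_srtt becomes 34, nm_rttvar is
 * unchanged (2 - (8 >> 2) == 0) and the new nm_rto is (34 >> 3) + 8
 * == 12, subject to the NFS_MINTIMEO/NFS_MAXTIMEO clipping above.
 */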
nfs_backofftimer(mntp)
register struct nfsmount *mntp;
{
register struct nfshost *nfshp = mntp->nm_hostinfo;
register unsigned long newrto;
/* Clip shift count */
if (++mntp->nm_rexmit > 8 * sizeof mntp->nm_rto)
mntp->nm_rexmit = 8 * sizeof mntp->nm_rto;
/* Back off RTO exponentially */
newrto = NFS_RTO(mntp);
newrto <<= (mntp->nm_rexmit - 1);
if (newrto == 0 || newrto > NFS_MAXTIMEO)
newrto = NFS_MAXTIMEO;
mntp->nm_rto = nfshp->nh_currto = newrto;
	/* If too many retries: log a message, assume a bogus RTT and re-measure */
if (nfshp->nh_currexmit < mntp->nm_rexmit) {
nfshp->nh_currexmit = mntp->nm_rexmit;
if (nfshp->nh_currexmit >= nfsrexmtthresh) {
if (nfshp->nh_currexmit == nfsrexmtthresh) {
nfs_log("NFS server %s not responding\n",
mntp->nm_host);
mntp->nm_rttvar += (mntp->nm_srtt >> 2);
mntp->nm_srtt = 0;
}
/* The routing invalidation should be a usrreq PRU */
if (mtod(nfshp->nh_sockaddr,
struct sockaddr *)->sa_family == AF_INET)
in_losing(mntp->nm_so->so_pcb);
}
}
/* Close down window but remember this point (3/4 current) for later */
nfshp->nh_ssthresh = ((nfshp->nh_window << 1) + nfshp->nh_window) >> 2;
nfshp->nh_window = 1;
nfshp->nh_winext = 0;
}
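/*
 * Worked example for nfs_backofftimer(): with NFS_RTO() at 12, the third
 * consecutive timeout (nm_rexmit == 3) sets nm_rto to 12 << 2 == 48,
 * clipped at NFS_MAXTIMEO.  A window of 16 at that point leaves
 * nh_ssthresh at 12 and nh_window back at 1.
 */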
/*
 * Not all socket errors are fatal; transient network errors are cleared
 * and ignored.  The closed-socket checks deal with errors a little
 * differently for send and receive.
*/
nfs_sockerr(so, sending)
struct socket *so;
int sending;
{
if (sending && (so->so_state & SS_CANTSENDMORE)) {
so->so_error = EPIPE;
return (EPIPE);
}
switch (so->so_error) { /* inhibit certain errors */
case ENETDOWN:
case ENETUNREACH:
case EHOSTDOWN:
case EHOSTUNREACH:
so->so_error = 0;
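		/* FALLTHROUGH */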
case 0:
break;
default: /* return all others */
printf("nfs_sockerr: error %d on %s\n", so->so_error,
sending?"send":"receive");
return (so->so_error);
}
if (!sending && (so->so_state & SS_CANTRCVMORE)) {
so->so_error = 0; /* (no error) */
return (EPIPE);
}
return (so->so_error);
}