BSD 4_3_Net_2 release
[unix-history] / usr / src / sys / nfs / nfs_socket.c
index ac56ed1..ff092d4 100644 (file)
@@ -1,50 +1,62 @@
 /*
 /*
- * Copyright (c) 1989 The Regents of the University of California.
+ * Copyright (c) 1989, 1991 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
- * Redistribution and use in source and binary forms are permitted
- * provided that the above copyright notice and this paragraph are
- * duplicated in all such forms and that any documentation,
- * advertising materials, and other materials related to such
- * distribution and use acknowledge that the software was developed
- * by the University of California, Berkeley.  The name of the
- * University may not be used to endorse or promote products derived
- * from this software without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *     This product includes software developed by the University of
+ *     California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
  *
  *
- *     @(#)nfs_socket.c        7.3 (Berkeley) %G%
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)nfs_socket.c        7.23 (Berkeley) 4/20/91
  */
 
 /*
  */
 
 /*
- * Socket operations for use by nfs (similar to uipc_socket.c, but never
- * with copies to/from a uio vector)
- * NB: For now, they only work for UDP datagram sockets.
- * (Use on stream sockets would require some record boundary mark in the
- *  stream such as Sun's RM (Section 3.2 of the Sun RPC Message Protocol
- *  manual, in Networking on the Sun Workstation, Part #800-1324-03
- *  and different versions of send, receive and reply that do not assume
- *  an atomic protocol
+ * Socket operations for use by nfs
  */
 
  */
 
-#include "types.h"
 #include "param.h"
 #include "param.h"
-#include "uio.h"
-#include "user.h"
+#include "proc.h"
 #include "mount.h"
 #include "kernel.h"
 #include "malloc.h"
 #include "mbuf.h"
 #include "mount.h"
 #include "kernel.h"
 #include "malloc.h"
 #include "mbuf.h"
+#include "namei.h"
 #include "vnode.h"
 #include "domain.h"
 #include "protosw.h"
 #include "socket.h"
 #include "socketvar.h"
 #include "vnode.h"
 #include "domain.h"
 #include "protosw.h"
 #include "socket.h"
 #include "socketvar.h"
-#include "netinet/in.h"
+#include "syslog.h"
+#include "tprintf.h"
+#include "../netinet/in.h"
+#include "../netinet/tcp.h"
+
 #include "rpcv2.h"
 #include "nfsv2.h"
 #include "nfs.h"
 #include "rpcv2.h"
 #include "nfsv2.h"
 #include "nfs.h"
 #include "nfsmount.h"
 
 #define        TRUE    1
 #include "nfsmount.h"
 
 #define        TRUE    1
-
-/* set lock on sockbuf sb, sleep at neg prio */
-#define nfs_sblock(sb) { \
-       while ((sb)->sb_flags & SB_LOCK) { \
-               (sb)->sb_flags |= SB_WANT; \
-               sleep((caddr_t)&(sb)->sb_flags, PZERO-1); \
-       } \
-       (sb)->sb_flags |= SB_LOCK; \
-}
+#define        FALSE   0
 
 /*
  * External data, mostly RPC constants in XDR form
 
 /*
  * External data, mostly RPC constants in XDR form
 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
        rpc_msgaccepted, rpc_call;
 extern u_long nfs_prog, nfs_vers;
 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
        rpc_msgaccepted, rpc_call;
 extern u_long nfs_prog, nfs_vers;
+/* Maybe these should be bits in a u_long ?? */
+extern int nonidempotent[NFS_NPROCS];
+static int compressrequest[NFS_NPROCS] = {     /* one flag per NFS procedure number; presumably selects the requests handed to nfs_compress() -- use site not visible here, confirm */
+       FALSE,          /*  0 -- procedure names in this table assume RFC 1094 numbering (NULL here); confirm against nfsv2.h */
+       TRUE,           /*  1 GETATTR */
+       TRUE,           /*  2 SETATTR */
+       FALSE,          /*  3 ROOT */
+       TRUE,           /*  4 LOOKUP */
+       TRUE,           /*  5 READLINK */
+       TRUE,           /*  6 READ */
+       FALSE,          /*  7 WRITECACHE */
+       FALSE,          /*  8 WRITE */
+       TRUE,           /*  9 CREATE */
+       TRUE,           /* 10 REMOVE */
+       TRUE,           /* 11 RENAME */
+       TRUE,           /* 12 LINK */
+       TRUE,           /* 13 SYMLINK */
+       TRUE,           /* 14 MKDIR */
+       TRUE,           /* 15 RMDIR */
+       TRUE,           /* 16 READDIR */
+       TRUE,           /* 17 STATFS */
+};
+int    nfs_sbwait();
+void   nfs_disconnect();
+struct mbuf *nfs_compress(), *nfs_uncompress();
+
 int    nfsrv_null(),
        nfsrv_getattr(),
        nfsrv_setattr(),
 int    nfsrv_null(),
        nfsrv_getattr(),
        nfsrv_setattr(),
@@ -108,312 +138,579 @@ int (*nfsrv_procs[NFS_NPROCS])() = {
        nfsrv_statfs,
 };
 
        nfsrv_statfs,
 };
 
+struct nfsreq nfsreqh;         /* head of the outstanding-request list; nfs_reconnect() walks r_next until it is back at &nfsreqh */
+int nfsrexmtthresh = NFS_FISHY;        /* not referenced in this section; presumably a retransmit-count threshold -- confirm at use site */
+int nfs_tcpnodelay = 1;        /* nonzero: nfs_connect() sets TCP_NODELAY on AF_INET/IPPROTO_TCP sockets */
+
+/*
+ * Create and set up the socket for an NFS mount: reserve buffer space,
+ * connect unless NFSMNT_NOCONN was requested, choose socket timeouts and
+ * options, then reset the mount's congestion-control state.
+ * Returns 0 or an errno.  On failure the socket is torn down again via
+ * nfs_disconnect(), but the sockaddr in nm_nam is not freed here.
+ */
+nfs_connect(nmp)
+       register struct nfsmount *nmp;
+{
+       register struct socket *so;
+       struct mbuf *optm;
+       int spl, error, resv, tmo;
+
+       nmp->nm_so = (struct socket *)0;
+       error = socreate(mtod(nmp->nm_nam, struct sockaddr *)->sa_family,
+               &nmp->nm_so, nmp->nm_sotype, nmp->nm_soproto);
+       if (error)
+               goto bad;
+       so = nmp->nm_so;
+       nmp->nm_soflags = so->so_proto->pr_flags;
+
+       /* Non-datagram sockets get room for a u_long on top of each packet. */
+       if (nmp->nm_sotype != SOCK_DGRAM)
+               resv = min(4 * (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof(u_long)),
+                   NFS_MAXPACKET + sizeof(u_long));
+       else
+               resv = min(4 * (nmp->nm_wsize + NFS_MAXPKTHDR),
+                   NFS_MAXPACKET);
+       error = soreserve(so, resv, resv);
+       if (error)
+               goto bad;
+
+       /*
+        * A protocol that does not need a connection may be left
+        * unconnected (NFSMNT_NOCONN) for servers that reply from a port
+        * other than NFS_PORT.  Asking for that on a protocol that does
+        * require a connection is an error.
+        */
+       if ((nmp->nm_flag & NFSMNT_NOCONN) == 0) {
+               error = soconnect(so, nmp->nm_nam);
+               if (error)
+                       goto bad;
+
+               /*
+                * Sleep until the connect attempt has finished or failed,
+                * the same way the connect system call waits.
+                */
+               spl = splnet();
+               while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0)
+                       (void) tsleep((caddr_t)&so->so_timeo, PSOCK, "nfscon", 0);
+               splx(spl);
+               if (so->so_error) {
+                       error = so->so_error;
+                       goto bad;
+               }
+       } else if (nmp->nm_soflags & PR_CONNREQUIRED) {
+               error = ENOTCONN;
+               goto bad;
+       }
+
+       /*
+        * Soft, spongy and interruptible mounts use a socket buffer timeout
+        * of 5 * hz; all other mounts use none.  This is the same for every
+        * socket type.
+        */
+       if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_SPONGY | NFSMNT_INT))
+               tmo = 5 * hz;
+       else
+               tmo = 0;
+       so->so_rcv.sb_timeo = tmo;
+       so->so_snd.sb_timeo = tmo;
+
+       if (nmp->nm_sotype == SOCK_DGRAM) {
+               nmp->nm_rto = NFS_TIMEO;
+       } else {
+               /* Connection-based protocols get keepalives switched on. */
+               if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
+                       MGET(optm, M_WAIT, MT_SOOPTS);
+                       *mtod(optm, int *) = 1;
+                       optm->m_len = sizeof(int);
+                       sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, optm);
+               }
+               /* TCP over AF_INET: set TCP_NODELAY unless tuned off. */
+               if (nfs_tcpnodelay &&
+                   so->so_proto->pr_domain->dom_family == AF_INET &&
+                   so->so_proto->pr_protocol == IPPROTO_TCP) {
+                       MGET(optm, M_WAIT, MT_SOOPTS);
+                       *mtod(optm, int *) = 1;
+                       optm->m_len = sizeof(int);
+                       sosetopt(so, IPPROTO_TCP, TCP_NODELAY, optm);
+               }
+               nmp->nm_rto = 10 * NFS_TIMEO;           /* XXX */
+       }
+       so->so_rcv.sb_flags |= SB_NOINTR;
+       so->so_snd.sb_flags |= SB_NOINTR;
+
+       /* Reset the congestion-control variables for the new socket. */
+       nmp->nm_window = 2;                     /* initial send window */
+       nmp->nm_ssthresh = NFS_MAXWINDOW;       /* slow-start threshold */
+       nmp->nm_rttvar = nmp->nm_rto << 1;
+       nmp->nm_sent = 0;
+       nmp->nm_currexmit = 0;
+       return (0);
+
+bad:
+       nfs_disconnect(nmp);
+       return (error);
+}
+
+/*
+ * Re-establish a broken connection on a reliable protocol.
+ * nfs_connect() is retried, sleeping on lbolt between attempts, until it
+ * succeeds.  On an NFSMNT_INT mount the loop gives up with EINTR once
+ * nfs_sigintr() reports a signal for the requesting process.
+ * After a successful connect every outstanding request that belongs to
+ * this mount is flagged R_MUSTRESEND.
+ * The old socket is not closed here (nfs_connect() simply overwrites
+ * nm_so), so the caller has to dispose of it first.
+ * nb: Must be called with the nfs_solock() set on the mount point.
+ */
+nfs_reconnect(rep, nmp)
+       register struct nfsreq *rep;
+       register struct nfsmount *nmp;
+{
+       register struct nfsreq *req;
+       int error;
+
+       nfs_msg(rep->r_procp, nmp->nm_mountp->mnt_stat.f_mntfromname,
+           "trying reconnect");
+       for (;;) {
+               error = nfs_connect(nmp);
+               if (error == 0)
+                       break;
+               if ((nmp->nm_flag & NFSMNT_INT) && nfs_sigintr(rep->r_procp))
+                       return (EINTR);
+               (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
+       }
+       nfs_msg(rep->r_procp, nmp->nm_mountp->mnt_stat.f_mntfromname,
+           "reconnected");
+
+       /*
+        * Walk the request list (it ends back at nfsreqh) and flag each
+        * request of this mount for resending.
+        */
+       for (req = nfsreqh.r_next; req != &nfsreqh; req = req->r_next)
+               if (req->r_nmp == nmp)
+                       req->r_flags |= R_MUSTRESEND;
+       return (0);
+}
+
+/*
+ * Drop the mount's socket, if it has one.  nm_so is cleared before
+ * soshutdown(so, 2) and soclose() are called on the old socket.
+ */
+void
+nfs_disconnect(nmp)
+       register struct nfsmount *nmp;
+{
+       register struct socket *oldso;
+
+       if (nmp->nm_so == NULL)
+               return;
+       oldso = nmp->nm_so;
+       nmp->nm_so = (struct socket *)0;
+       soshutdown(oldso, 2);
+       soclose(oldso);
+}
 
 /*
 
 /*
- * This is a stripped down version of sosend() specific to
- * udp/ip and uses the mbuf list provdied
+ * This is the nfs send routine. For connection based socket types, it
+ * must be called with an nfs_solock() on the socket.
+ * "rep == NULL" indicates that it has been called from a server.
  */
  */
-nfs_udpsend(so, nam, top, flags, siz)
+nfs_send(so, nam, top, rep)
        register struct socket *so;
        struct mbuf *nam;
        register struct socket *so;
        struct mbuf *nam;
-       struct mbuf *top;
-       int flags;
-       int siz;
+       register struct mbuf *top;
+       struct nfsreq *rep;
 {
 {
-       register int space;
-       int error = 0, s, dontroute, first = 1;
-
-       dontroute =
-           (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
-           (so->so_proto->pr_flags & PR_ATOMIC);
-#define        snderr(errno)   { error = errno; splx(s); goto release; }
-
-#ifdef MGETHDR
-       top->m_pkthdr.len = siz;
-#endif
-restart:
-       nfs_sblock(&so->so_snd);
-       s = splnet();
-       if (so->so_state & SS_CANTSENDMORE)
-               snderr(EPIPE);
-       if (so->so_error)
-               snderr(so->so_error);
-       space = sbspace(&so->so_snd);
-       if (space < siz) {
-               sbunlock(&so->so_snd);
-               nfs_sbwait(&so->so_snd);
-               splx(s);
-               goto restart;
+       /*
+        * Hand the mbuf chain "top" to sosend().
+        *   so   socket to send on when rep is NULL; with a request it is
+        *        replaced by the mount's current socket
+        *   nam  destination address, passed on only when the socket is
+        *        neither connection-based nor already connected
+        *   top  message to send; this routine takes ownership of it on
+        *        every path (sosend() is expected to dispose of it, and
+        *        the early returns below free it explicitly)
+        *   rep  client request being sent, or NULL
+        * Returns 0, EINTR or ERESTART; every other send error is dropped.
+        */
+       struct mbuf *sendnam;
+       int error, soflags;
+
+       if (rep) {
+               if (rep->r_flags & R_SOFTTERM) {
+                       m_freem(top);
+                       return (EINTR);
+               }
+               if (rep->r_nmp->nm_so == NULL &&
+                   (error = nfs_reconnect(rep, rep->r_nmp))) {
+                       /*
+                        * Callers never free top after a failed send, so
+                        * release it here just like the R_SOFTTERM case;
+                        * otherwise the chain is leaked.
+                        */
+                       m_freem(top);
+                       return (error);
+               }
+               rep->r_flags &= ~R_MUSTRESEND;
+               so = rep->r_nmp->nm_so;
+               soflags = rep->r_nmp->nm_soflags;
+       } else
+               soflags = so->so_proto->pr_flags;
+       if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
+               sendnam = (struct mbuf *)0;
+       else
+               sendnam = nam;
+
+       error = sosend(so, sendnam, (struct uio *)0, top,
+               (struct mbuf *)0, 0);
+       /*
+        * EWOULDBLOCK on a client request: either the request was
+        * terminated meanwhile, or it is flagged to be sent again.
+        */
+       if (error == EWOULDBLOCK && rep) {
+               if (rep->r_flags & R_SOFTTERM)
+                       error = EINTR;
+               else {
+                       rep->r_flags |= R_MUSTRESEND;
+                       error = 0;
+               }
        }
        }
-       splx(s);
-       if (dontroute)
-               so->so_options |= SO_DONTROUTE;
-       s = splnet();                                   /* XXX */
-       error = (*so->so_proto->pr_usrreq)(so,
-           PRU_SEND,
-           top, (caddr_t)nam, (struct mbuf *)0, (struct mbuf *)0);
-       splx(s);
-       if (dontroute)
-               so->so_options &= ~SO_DONTROUTE;
-       top = (struct mbuf *)0;
-
-release:
-       sbunlock(&so->so_snd);
-       if (top)
-               m_freem(top);
+       /*
+        * Ignore socket errors??
+        * Only interruption (EINTR/ERESTART) is reported to the caller.
+        */
+       if (error && error != EINTR && error != ERESTART)
+               error = 0;
        return (error);
 }
 
 /*
        return (error);
 }
 
 /*
- * This is a stripped down udp specific version of soreceive()
+ * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
+ * done by soreceive(), but for SOCK_STREAM we must deal with the Record
+ * Mark and consolidate the data into a new mbuf list.
+ * nb: Sometimes TCP passes the data up to soreceive() in long lists of
+ *     small mbufs.
+ * For SOCK_STREAM we must be very careful to read an entire record once
+ * we have read any of it, even if the system call has been interrupted.
  */
  */
-nfs_udpreceive(so, aname, mp)
+nfs_receive(so, aname, mp, rep)        /* returns 0 or an errno; the received message is handed back in *mp */
        register struct socket *so;
        struct mbuf **aname;
        struct mbuf **mp;
        register struct socket *so;
        struct mbuf **aname;
        struct mbuf **mp;
+       register struct nfsreq *rep;    /* may be NULL; then so's own protocol flags are used */
 {
 {
+       struct uio auio;
+       struct iovec aio;
        register struct mbuf *m;
        register struct mbuf *m;
-       int s, error = 0;
-       struct protosw *pr = so->so_proto;
-       struct mbuf *nextrecord;
+       struct mbuf *m2, *mnew, **mbp;
+       caddr_t fcp, tcp;
+       u_long len;     /* first the record mark, later the count of message bytes not yet scanned */
+       struct mbuf **getnam;   /* where soreceive() may store the sender address, or 0 */
+       int error, siz, mlen, soflags, rcvflg;
 
 
-       if (aname)
-               *aname = 0;
-
-restart:
-       sblock(&so->so_rcv);
-       s = splnet();
+       /*
+        * Set up arguments for soreceive()
+        */
+       *mp = (struct mbuf *)0;
+       *aname = (struct mbuf *)0;      /* both outputs start out empty */
+       if (rep)
+               soflags = rep->r_nmp->nm_soflags;
+       else
+               soflags = so->so_proto->pr_flags;
 
 
-       if (so->so_rcv.sb_cc == 0) {
-               if (so->so_error) {
-                       error = so->so_error;
-                       so->so_error = 0;
-                       goto release;
+       /*
+        * For reliable protocols, lock against other senders/receivers
+        * in case a reconnect is necessary.
+        * For SOCK_STREAM, first get the Record Mark to find out how much
+        * more there is to get.
+        * We must lock the socket against other receivers
+        * until we have an entire rpc request/reply.
+        */
+       if (soflags & PR_CONNREQUIRED) {
+tryagain:
+               /*
+                * Check for fatal errors and resending request.
+                */
+               if (rep) {
+                       /*
+                        * Ugh: If a reconnect attempt just happened, nm_so
+                        * would have changed. NULL indicates a failed
+                        * attempt that has essentially shut down this
+                        * mount point.
+                        */
+                       if (rep->r_mrep || (so = rep->r_nmp->nm_so) == NULL ||
+                               (rep->r_flags & R_SOFTTERM))
+                               return (EINTR); /* reply already filled in, socket gone, or request terminated */
+                       while (rep->r_flags & R_MUSTRESEND) {
+                               m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT); /* send a copy; r_mreq stays with the request */
+                               nfsstats.rpcretries++;
+                               if (error = nfs_send(so, rep->r_nmp->nm_nam, m,
+                                       rep))
+                                       goto errout;
+                       }
+               }
+               if ((soflags & PR_ATOMIC) == 0) {
+                       aio.iov_base = (caddr_t) &len;  /* the record mark is read straight into len */
+                       aio.iov_len = sizeof(u_long);
+                       auio.uio_iov = &aio;
+                       auio.uio_iovcnt = 1;
+                       auio.uio_segflg = UIO_SYSSPACE;
+                       auio.uio_rw = UIO_READ;
+                       auio.uio_procp = (struct proc *)0;
+                       auio.uio_offset = 0;
+                       auio.uio_resid = sizeof(u_long);
+                       do {
+                           rcvflg = MSG_WAITALL;
+                           error = soreceive(so, (struct mbuf **)0, &auio,
+                               (struct mbuf **)0, (struct mbuf **)0, &rcvflg);
+                           if (error == EWOULDBLOCK && rep) {
+                               if (rep->r_flags & R_SOFTTERM)
+                                       return (EINTR);
+                               if (rep->r_flags & R_MUSTRESEND)
+                                       goto tryagain;
+                           }
+                       } while (error == EWOULDBLOCK);
+                       if (!error && auio.uio_resid > 0) {
+                           if (rep)
+                               log(LOG_INFO,
+                                  "short receive (%d/%d) from nfs server %s\n",
+                                  sizeof(u_long) - auio.uio_resid,
+                                  sizeof(u_long),
+                                rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
+                           error = EPIPE;
+                       }
+                       if (error)
+                               goto errout;
+                       len = ntohl(len) & ~0x80000000; /* keep the length, drop the top bit (presumably the RPC last-fragment flag -- confirm) */
+                       /*
+                        * This is SERIOUS! We are out of sync with the sender
+                        * and forcing a disconnect/reconnect is all I can do.
+                        */
+                       if (len > NFS_MAXPACKET) {
+                           if (rep)
+                               log(LOG_ERR, "%s (%d) from nfs server %s\n",
+                                   "impossible packet length",
+                                   len,
+                                rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
+                           error = EFBIG;
+                           goto errout;
+                       }
+                       auio.uio_resid = len;   /* now ask for exactly one record body */
+                       do {
+                           rcvflg = MSG_WAITALL;
+                           error =  soreceive(so, (struct mbuf **)0,
+                               &auio, mp, (struct mbuf **)0, &rcvflg);
+                       } while (error == EWOULDBLOCK || error == EINTR ||
+                                error == ERESTART);    /* once the mark has been read, the body read is retried even when interrupted */
+                       if (!error && auio.uio_resid > 0) {
+                           if (rep)
+                               log(LOG_INFO,
+                                  "short receive (%d/%d) from nfs server %s\n",
+                                  len - auio.uio_resid, len,
+                                rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
+                           error = EPIPE;
+                       }
+               } else {
+                       auio.uio_resid = len = 1000000; /* Anything Big */
+                       do {
+                           rcvflg = 0;
+                           error =  soreceive(so, (struct mbuf **)0,
+                               &auio, mp, (struct mbuf **)0, &rcvflg);
+                           if (error == EWOULDBLOCK && rep) {
+                               if (rep->r_flags & R_SOFTTERM)
+                                       return (EINTR);
+                               if (rep->r_flags & R_MUSTRESEND)
+                                       goto tryagain;
+                           }
+                       } while (error == EWOULDBLOCK);
+                       if (!error && *mp == NULL)
+                               error = EPIPE;
+                       len -= auio.uio_resid;  /* bytes actually received */
+               }
+errout:
+               if (error && rep && error != EINTR && error != ERESTART) {
+                       m_freem(*mp);
+                       *mp = (struct mbuf *)0;
+                       if (error != EPIPE && rep)
+                               log(LOG_INFO,
+                                   "receive error %d from nfs server %s\n",
+                                   error,
+                                rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
+                       nfs_disconnect(rep->r_nmp);     /* hard error on a client socket: drop it and reconnect */
+                       error = nfs_reconnect(rep, rep->r_nmp);
+                       if (!error)
+                               goto tryagain;  /* reconnected: start over on the new socket */
                }
                }
-               if (so->so_state & SS_CANTRCVMORE)
-                       goto release;
-               sbunlock(&so->so_rcv);
-               sbwait(&so->so_rcv);
-               splx(s);
-               goto restart;
-       }
-       m = so->so_rcv.sb_mb;
-       if (m == 0)
-               panic("nfs_receive 1");
-       nextrecord = m->m_nextpkt;
-       if (m->m_type != MT_SONAME)
-               panic("nfs_receive 1a");
-       sbfree(&so->so_rcv, m);
-       if (aname) {
-               *aname = m;
-               so->so_rcv.sb_mb = m->m_next;
-               m->m_next = 0;
-               m = so->so_rcv.sb_mb;
        } else {
        } else {
-               MFREE(m, so->so_rcv.sb_mb);
-               m = so->so_rcv.sb_mb;
+               if (so->so_state & SS_ISCONNECTED)
+                       getnam = (struct mbuf **)0;     /* connected socket: sender address not requested */
+               else
+                       getnam = aname;
+               auio.uio_resid = len = 1000000;
+               do {
+                       rcvflg = 0;
+                       error =  soreceive(so, getnam, &auio, mp,
+                               (struct mbuf **)0, &rcvflg);
+                       if (error == EWOULDBLOCK && rep &&
+                           (rep->r_flags & R_SOFTTERM))
+                               return (EINTR);
+               } while (error == EWOULDBLOCK);
+               len -= auio.uio_resid;  /* bytes actually received */
        }
        }
-       if (m && m->m_type == MT_RIGHTS)
-               panic("nfs_receive 2");
-       if (m && m->m_type == MT_CONTROL) {
-               sbfree(&so->so_rcv, m);
-               MFREE(m, so->so_rcv.sb_mb);
-               m = so->so_rcv.sb_mb;
+       if (error) {
+               m_freem(*mp);
+               *mp = (struct mbuf *)0;
        }
        }
-       *mp = m;
+       /*
+        * Search for any mbufs that are not a multiple of 4 bytes long.
+        * These could cause pointer alignment problems, so copy them to
+        * well aligned mbufs.
+        */
+       m = *mp;
+       mbp = mp;       /* mbp always points at the link that leads to m */
        while (m) {
        while (m) {
-               if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
-                       panic("nfs_receive 3");
-               sbfree(&so->so_rcv, m);
-               m = so->so_rcv.sb_mb = m->m_next;
-       }
-       so->so_rcv.sb_mb = nextrecord;
-       so->so_state &= ~SS_RCVATMARK;  /* Necessary ?? */
-release:
-       sbunlock(&so->so_rcv);
-       splx(s);
+               /*
+                * All this for something that may never happen.
+                */
+               if (m->m_next && (m->m_len & 0x3)) {
+                       printf("nfs_rcv odd length!\n");
+                       mlen = 0;       /* no destination mbuf allocated yet */
+                       while (m) {
+                               fcp = mtod(m, caddr_t);
+                               while (m->m_len > 0) {
+                                       if (mlen == 0) {
+                                               MGET(m2, M_WAIT, MT_DATA);
+                                               if (len >= MINCLSIZE)
+                                                       MCLGET(m2, M_WAIT);
+                                               m2->m_len = 0;
+                                               mlen = M_TRAILINGSPACE(m2);
+                                               tcp = mtod(m2, caddr_t);
+                                               *mbp = m2;      /* link the copy in where the old chain hung */
+                                               mbp = &m2->m_next;
+                                       }
+                                       siz = MIN(mlen, m->m_len);
+                                       bcopy(fcp, tcp, siz);
+                                       m2->m_len += siz;
+                                       mlen -= siz;
+                                       len -= siz;
+                                       tcp += siz;
+                                       m->m_len -= siz;
+                                       fcp += siz;
+                               }
+                               MFREE(m, mnew); /* release the drained mbuf; mnew is used as the next one to copy */
+                               m = mnew;
+                       }
+                       break;  /* the copy loop above ran to the end of the chain */
+               }
+               len -= m->m_len;
+               mbp = &m->m_next;
+               m = m->m_next;
+       }
        return (error);
 }
 
        return (error);
 }
 
-struct nfsreq nfsreqh = {
-       (struct nfsreq *)0,
-       (struct nfsreq *)0,
-       (struct mbuf *)0,
-       (struct mbuf *)0,
-       (struct nfsmount *)0,
-       0, 0, 0, 0, 0,
-};
-
-struct rpc_replyhead {
-       u_long  r_xid;
-       u_long  r_rep;
-};
-
 /*
  * Implement receipt of reply on a socket.
 /*
  * Implement receipt of reply on a socket.
- * We depend on the way that records are added to the sockbuf
- * by sbappend*.  In particular, each record (mbufs linked through m_next)
- * must begin with an address, followed by optional MT_CONTROL mbuf
- * and then zero or more mbufs of data.
- * Although the sockbuf is locked, new data may still be appended,
- * and thus we must maintain consistency of the sockbuf during that time.
  * We must search through the list of received datagrams matching them
  * with outstanding requests using the xid, until ours is found.
  */
  * We must search through the list of received datagrams matching them
  * with outstanding requests using the xid, until ours is found.
  */
-nfs_udpreply(so, mntp, myrep)
-       register struct socket *so;
-       struct nfsmount *mntp;
+/* ARGSUSED */
+nfs_reply(nmp, myrep)
+       struct nfsmount *nmp;
        struct nfsreq *myrep;
 {
        register struct mbuf *m;
        register struct nfsreq *rep;
        struct nfsreq *myrep;
 {
        register struct mbuf *m;
        register struct nfsreq *rep;
-       register int error = 0, s;
-       struct protosw *pr = so->so_proto;
-       struct mbuf *nextrecord;
-       struct sockaddr_in *saddr;
-       u_long inaddr;
-       struct rpc_replyhead replyh;
-       struct mbuf *mp;
+       register int error = 0;
+       u_long rxid;
+       struct mbuf *mp, *nam;
        char *cp;
        int cnt, xfer;
        char *cp;
        int cnt, xfer;
-       int found;
-
-restart:
-       nfs_sblock(&so->so_rcv);
-       /* Already received, bye bye */
-       if (myrep->r_mrep != NULL) {
-               sbunlock(&so->so_rcv);
-               return (0);
-       }
-       /* If a soft mount and we have run out of retries */
-       if (myrep->r_retry == 0 && myrep->r_timer == 0) {
-               sbunlock(&so->so_rcv);
-               return (ETIMEDOUT);
-       }
-       s = splnet();
-
-       m = so->so_rcv.sb_mb;
-       if (m == 0) {
-               if (so->so_rcv.sb_cc)
-                       panic("nfs_soreply 1");
-               if (so->so_error) {
-                       error = so->so_error;
-                       so->so_error = 0;
-                       goto release;
-               }
-               if (so->so_state & SS_CANTRCVMORE)
-                       goto release;
-               sbunlock(&so->so_rcv);
-               nfs_sbwait(&so->so_rcv);
-               splx(s);
-               goto restart;
-       }
-       nextrecord = m->m_nextpkt;
 
        /*
 
        /*
-        * Take off the address, check for rights and ditch any control
-        * mbufs.
+        * Loop around until we get our own reply
         */
         */
-       if (m->m_type != MT_SONAME)
-               panic("nfs reply SONAME");
-       saddr = mtod(m, struct sockaddr_in *);
-       inaddr = saddr->sin_addr.s_addr;
-       sbfree(&so->so_rcv, m);
-       MFREE(m, so->so_rcv.sb_mb);
-       m = so->so_rcv.sb_mb;
-       if (m && m->m_type == MT_RIGHTS)
-               panic("nfs reply RIGHTS");
-       if (m && m->m_type == MT_CONTROL) {
-               sbfree(&so->so_rcv, m);
-               MFREE(m, so->so_rcv.sb_mb);
-               m = so->so_rcv.sb_mb;
-       }
-       if (m) {
-               m->m_nextpkt = nextrecord;
-       } else {
-               so->so_rcv.sb_mb = nextrecord;
-               sbunlock(&so->so_rcv);
-               splx(s);
-               goto restart;
-       }
+       for (;;) {
+               /*
+                * Lock against other receivers so that I don't get stuck in
+                * sbwait() after someone else has received my reply for me.
+                * Also necessary for connection based protocols to avoid
+                * race conditions during a reconnect.
+                */
+               nfs_solock(&nmp->nm_flag);
+               /* Already received, bye bye */
+               if (myrep->r_mrep != NULL) {
+                       nfs_sounlock(&nmp->nm_flag);
+                       return (0);
+               }
+               /*
+                * Get the next Rpc reply off the socket
+                */
+               if (error = nfs_receive(nmp->nm_so, &nam, &mp, myrep)) {
+                       nfs_sounlock(&nmp->nm_flag);
 
 
-       /*
-        * Get the xid and check that it is an rpc reply
-        */
-       mp = m;
-       if (m->m_len >= 2*NFSX_UNSIGNED)
-               bcopy(mtod(m, caddr_t), (caddr_t)&replyh, 2*NFSX_UNSIGNED);
-       else {
-               cnt = 2*NFSX_UNSIGNED;
-               cp = (caddr_t)&replyh;
-               while (mp && cnt > 0) {
-                       if (mp->m_len > 0) {
-                               xfer = (mp->m_len >= cnt) ? cnt : mp->m_len;
-                               bcopy(mtod(mp, caddr_t), cp, xfer);
-                               cnt -= xfer;
-                               cp += xfer;
+                       /*
+                        * Ignore routing errors on connectionless protocols??
+                        */
+                       if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
+                               nmp->nm_so->so_error = 0;
+                               continue;
+                       }
+
+                       /*
+                        * Otherwise cleanup and return a fatal error.
+                        */
+                       if (myrep->r_flags & R_TIMING) {
+                               myrep->r_flags &= ~R_TIMING;
+                               nmp->nm_rtt = -1;
+                       }
+                       if (myrep->r_flags & R_SENT) {
+                               myrep->r_flags &= ~R_SENT;
+                               nmp->nm_sent--;
                        }
                        }
-                       if (cnt > 0)
-                               mp = mp->m_next;
+                       return (error);
                }
                }
-       }
-       found = 0;
-       if (replyh.r_rep != rpc_reply || mp == NULL)
-               goto dropit;
-       /*
-        * Loop through the request list to match up the reply
-        * Iff no match, just drop the datagram
-        */
-       rep = nfsreqh.r_next;
-       while (!found && rep != &nfsreqh) {
-               if (rep->r_mrep == NULL && replyh.r_xid == rep->r_xid &&
-                   inaddr == rep->r_inaddr) {
-                       /* Found it.. */
-                       rep->r_mrep = m;
-                       while (m) {
-                               if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
-                                       panic("nfs_soreply 3");
-                               sbfree(&so->so_rcv, m);
-                               m = so->so_rcv.sb_mb = m->m_next;
+       
+               /*
+                * Get the xid and check that it is an rpc reply
+                */
+               m = mp;
+               while (m && m->m_len == 0)
+                       m = m->m_next;
+               if (m == NULL) {
+                       nfsstats.rpcinvalid++;
+                       m_freem(mp);
+                       nfs_sounlock(&nmp->nm_flag);
+                       continue;
+               }
+               bcopy(mtod(m, caddr_t), (caddr_t)&rxid, NFSX_UNSIGNED);
+               /*
+                * Loop through the request list to match up the reply
+                * Iff no match, just drop the datagram
+                */
+               m = mp;
+               rep = nfsreqh.r_next;
+               while (rep != &nfsreqh) {
+                       if (rep->r_mrep == NULL && rxid == rep->r_xid) {
+                               /* Found it.. */
+                               rep->r_mrep = m;
+                               /*
+                                * Update timing
+                                */
+                               if (rep->r_flags & R_TIMING) {
+                                       nfs_updatetimer(rep->r_nmp);
+                                       rep->r_flags &= ~R_TIMING;
+                                       rep->r_nmp->nm_rtt = -1;
+                               }
+                               if (rep->r_flags & R_SENT) {
+                                       rep->r_flags &= ~R_SENT;
+                                       rep->r_nmp->nm_sent--;
+                               }
+                               break;
                        }
                        }
-                       so->so_rcv.sb_mb = nextrecord;
-                       if (rep == myrep)
-                               goto release;
-                       found++;
+                       rep = rep->r_next;
                }
                }
-               rep = rep->r_next;
+               nfs_sounlock(&nmp->nm_flag);
+               if (nam)
+                       m_freem(nam);
+               /*
+                * If not matched to a request, drop it.
+                * If it's mine, get out.
+                */
+               if (rep == &nfsreqh) {
+                       nfsstats.rpcunexpected++;
+                       m_freem(m);
+               } else if (rep == myrep)
+                       return (0);
        }
        }
-       /* Iff not matched to request, drop it */
-dropit:
-       if (!found) {
-               sbdroprecord(&so->so_rcv);
-       } else if (so->so_rcv.sb_flags & SB_WAIT) {
-               so->so_rcv.sb_flags &= ~SB_WAIT;
-               wakeup((caddr_t)&so->so_rcv.sb_cc);
-       }
-       sbunlock(&so->so_rcv);
-       splx(s);
-       goto restart;
-release:
-       sbunlock(&so->so_rcv);
-       splx(s);
-       return (error);
 }
 
 /*
  * nfs_request - goes something like this
  *     - fill in request struct
  *     - links it into list
 }
 
 /*
  * nfs_request - goes something like this
  *     - fill in request struct
  *     - links it into list
- *     - calls nfs_sosend() for first transmit
- *     - calls nfs_soreceive() to get reply
+ *     - calls nfs_send() for first transmit
+ *     - calls nfs_receive() to get reply
  *     - break down rpc header and return with nfs reply pointed to
  *       by mrep or error
  * nb: always frees up mreq mbuf list
  */
  *     - break down rpc header and return with nfs reply pointed to
  *       by mrep or error
  * nb: always frees up mreq mbuf list
  */
-nfs_request(vp, mreq, xid, mp, mrp, mdp, dposp)
+nfs_request(vp, mreq, xid, procnum, procp, tryhard, mp, mrp, mdp, dposp)
        struct vnode *vp;
        struct mbuf *mreq;
        u_long xid;
        struct vnode *vp;
        struct mbuf *mreq;
        u_long xid;
+       int procnum;
+       struct proc *procp;
+       int tryhard;
        struct mount *mp;
        struct mbuf **mrp;
        struct mbuf **mdp;
        struct mount *mp;
        struct mbuf **mrp;
        struct mbuf **mdp;
@@ -421,81 +718,148 @@ nfs_request(vp, mreq, xid, mp, mrp, mdp, dposp)
 {
        register struct mbuf *m, *mrep;
        register struct nfsreq *rep;
 {
        register struct mbuf *m, *mrep;
        register struct nfsreq *rep;
-       register u_long *p;
+       register u_long *tl;
        register int len;
        register int len;
-       struct nfsmount *mntp;
+       struct nfsmount *nmp;
        struct mbuf *md;
        struct mbuf *md;
-       struct sockaddr_in *saddr;
        struct nfsreq *reph;
        caddr_t dpos;
        char *cp2;
        int t1;
        struct nfsreq *reph;
        caddr_t dpos;
        char *cp2;
        int t1;
-       int s;
-       int error;
+       int s, compressed;
+       int error = 0;
 
 
-       mntp = vfs_to_nfs(mp);
+       nmp = VFSTONFS(mp);
        m = mreq;
        MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
        rep->r_xid = xid;
        m = mreq;
        MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
        rep->r_xid = xid;
-       rep->r_mntp = mntp;
-       saddr = mtod(mntp->nm_sockaddr, struct sockaddr_in *);
-       rep->r_inaddr = saddr->sin_addr.s_addr;
+       rep->r_nmp = nmp;
        rep->r_vp = vp;
        rep->r_vp = vp;
-       if (mntp->nm_flag & NFSMNT_SOFT)
-               rep->r_retry = mntp->nm_retrans;
+       rep->r_procp = procp;
+       if ((nmp->nm_flag & NFSMNT_SOFT) ||
+           ((nmp->nm_flag & NFSMNT_SPONGY) && !tryhard))
+               rep->r_retry = nmp->nm_retry;
+       else
+               rep->r_retry = NFS_MAXREXMIT + 1;       /* past clip limit */
+       rep->r_flags = rep->r_rexmit = 0;
+       /*
+        * Three cases:
+        * - non-idempotent requests on SOCK_DGRAM use NFS_MINIDEMTIMEO
+        * - idempotent requests on SOCK_DGRAM use 0
+        * - Reliable transports, NFS_RELIABLETIMEO
+        *   Timeouts are still done on reliable transports to ensure detection
+        *   of excessive connection delay.
+        */
+       if (nmp->nm_sotype != SOCK_DGRAM)
+               rep->r_timerinit = -NFS_RELIABLETIMEO;
+       else if (nonidempotent[procnum])
+               rep->r_timerinit = -NFS_MINIDEMTIMEO;
        else
        else
-               rep->r_retry = VNOVAL;
+               rep->r_timerinit = 0;
+       rep->r_timer = rep->r_timerinit;
        rep->r_mrep = NULL;
        rep->r_mrep = NULL;
-       rep->r_mreq = m;
-       rep->r_timer = rep->r_timeout = mntp->nm_timeo;
        len = 0;
        while (m) {
                len += m->m_len;
                m = m->m_next;
        }
        len = 0;
        while (m) {
                len += m->m_len;
                m = m->m_next;
        }
-       rep->r_msiz = len;
-       m = NFSMCOPY(mreq, 0, M_COPYALL, M_WAIT);
+       mreq->m_pkthdr.len = len;
+       mreq->m_pkthdr.rcvif = (struct ifnet *)0;
+       compressed = 0;
+       m = mreq;
+       if ((nmp->nm_flag & NFSMNT_COMPRESS) && compressrequest[procnum]) {
+               mreq = nfs_compress(mreq);
+               if (mreq != m) {
+                       len = mreq->m_pkthdr.len;
+                       compressed++;
+               }
+       }
+       /*
+        * For non-atomic protocols, insert a Sun RPC Record Mark.
+        */
+       if ((nmp->nm_soflags & PR_ATOMIC) == 0) {
+               M_PREPEND(mreq, sizeof(u_long), M_WAIT);
+               *mtod(mreq, u_long *) = htonl(0x80000000 | len);
+       }
+       rep->r_mreq = mreq;
 
 
-       /* Chain it into list of outstanding requests */
-       reph = &nfsreqh;
+       /*
+        * Do the client side RPC.
+        */
+       nfsstats.rpcrequests++;
+       /*
+        * Chain request into list of outstanding requests. Be sure
+        * to put it LAST so timer finds oldest requests first.
+        */
        s = splnet();
        s = splnet();
-       if (reph->r_prev == NULL) {
-               reph->r_next = rep;
-               rep->r_prev = reph;
-       } else {
-               reph->r_prev->r_next = rep;
-               rep->r_prev = reph->r_prev;
-       }
+       reph = &nfsreqh;
+       reph->r_prev->r_next = rep;
+       rep->r_prev = reph->r_prev;
        reph->r_prev = rep;
        rep->r_next = reph;
        reph->r_prev = rep;
        rep->r_next = reph;
-       splx(s);
+       /*
+        * If backing off another request or avoiding congestion, don't
+        * send this one now but let timer do it. If not timing a request,
+        * do it now.
+        */
+       if (nmp->nm_sent <= 0 || nmp->nm_sotype != SOCK_DGRAM ||
+           (nmp->nm_currexmit == 0 && nmp->nm_sent < nmp->nm_window)) {
+               nmp->nm_sent++;
+               rep->r_flags |= R_SENT;
+               if (nmp->nm_rtt == -1) {
+                       nmp->nm_rtt = 0;
+                       rep->r_flags |= R_TIMING;
+               }
+               splx(s);
+               m = m_copym(mreq, 0, M_COPYALL, M_WAIT);
+               if (nmp->nm_soflags & PR_CONNREQUIRED)
+                       nfs_solock(&nmp->nm_flag);
+               error = nfs_send(nmp->nm_so, nmp->nm_nam, m, rep);
+               if (nmp->nm_soflags & PR_CONNREQUIRED)
+                       nfs_sounlock(&nmp->nm_flag);
+               if (error && NFSIGNORE_SOERROR(nmp->nm_soflags, error))
+                       nmp->nm_so->so_error = error = 0;
+       } else
+               splx(s);
 
        /*
 
        /*
-        * Iff the NFSMCOPY above succeeded, send it off...
-        * otherwise the timer will retransmit later
+        * Wait for the reply from our send or the timer's.
         */
         */
-       if (m != NULL)
-               error = nfs_udpsend(mntp->nm_so, (struct mbuf *)0, m, 0, len);
-       error = nfs_udpreply(mntp->nm_so, mntp, rep);
+       if (!error)
+               error = nfs_reply(nmp, rep);
 
 
+       /*
+        * RPC done, unlink the request.
+        */
        s = splnet();
        rep->r_prev->r_next = rep->r_next;
        rep->r_next->r_prev = rep->r_prev;
        splx(s);
        s = splnet();
        rep->r_prev->r_next = rep->r_next;
        rep->r_next->r_prev = rep->r_prev;
        splx(s);
+
+       /*
+        * If there was a successful reply and a tprintf msg.
+        * tprintf a response.
+        */
+       if (!error && (rep->r_flags & R_TPRINTFMSG))
+               nfs_msg(rep->r_procp, nmp->nm_mountp->mnt_stat.f_mntfromname,
+                   "is alive again");
        m_freem(rep->r_mreq);
        m_freem(rep->r_mreq);
-       mrep = md = rep->r_mrep;
+       mrep = rep->r_mrep;
        FREE((caddr_t)rep, M_NFSREQ);
        if (error)
                return (error);
 
        FREE((caddr_t)rep, M_NFSREQ);
        if (error)
                return (error);
 
+       if (compressed)
+               mrep = nfs_uncompress(mrep);
+       md = mrep;
        /*
         * break down the rpc header and check if ok
         */
        dpos = mtod(md, caddr_t);
        /*
         * break down the rpc header and check if ok
         */
        dpos = mtod(md, caddr_t);
-       nfsm_disect(p, u_long *, 5*NFSX_UNSIGNED);
-       p += 2;
-       if (*p++ == rpc_msgdenied) {
-               if (*p == rpc_mismatch)
+       nfsm_disect(tl, u_long *, 5*NFSX_UNSIGNED);
+       tl += 2;
+       if (*tl++ == rpc_msgdenied) {
+               if (*tl == rpc_mismatch)
                        error = EOPNOTSUPP;
                else
                        error = EACCES;
                        error = EOPNOTSUPP;
                else
                        error = EACCES;
@@ -506,16 +870,16 @@ nfs_request(vp, mreq, xid, mp, mrp, mdp, dposp)
         * skip over the auth_verf, someday we may want to cache auth_short's
         * for nfs_reqhead(), but for now just dump it
         */
         * skip over the auth_verf, someday we may want to cache auth_short's
         * for nfs_reqhead(), but for now just dump it
         */
-       if (*++p != 0) {
-               len = nfsm_rndup(fxdr_unsigned(long, *p));
+       if (*++tl != 0) {
+               len = nfsm_rndup(fxdr_unsigned(long, *tl));
                nfsm_adv(len);
        }
                nfsm_adv(len);
        }
-       nfsm_disect(p, u_long *, NFSX_UNSIGNED);
+       nfsm_disect(tl, u_long *, NFSX_UNSIGNED);
        /* 0 == ok */
        /* 0 == ok */
-       if (*p == 0) {
-               nfsm_disect(p, u_long *, NFSX_UNSIGNED);
-               if (*p != 0) {
-                       error = fxdr_unsigned(int, *p);
+       if (*tl == 0) {
+               nfsm_disect(tl, u_long *, NFSX_UNSIGNED);
+               if (*tl != 0) {
+                       error = fxdr_unsigned(int, *tl);
                        m_freem(mrep);
                        return (error);
                }
                        m_freem(mrep);
                        return (error);
                }
@@ -536,7 +900,8 @@ nfsmout:
  * - verify it
  * - fill in the cred struct.
  */
  * - verify it
  * - fill in the cred struct.
  */
-nfs_getreq(so, prog, vers, maxproc, nam, mrp, mdp, dposp, retxid, proc, cr)
+nfs_getreq(so, prog, vers, maxproc, nam, mrp, mdp, dposp, retxid, procnum, cr,
+       msk, mtch, wascomp)
        struct socket *so;
        u_long prog;
        u_long vers;
        struct socket *so;
        u_long prog;
        u_long vers;
@@ -546,69 +911,107 @@ nfs_getreq(so, prog, vers, maxproc, nam, mrp, mdp, dposp, retxid, proc, cr)
        struct mbuf **mdp;
        caddr_t *dposp;
        u_long *retxid;
        struct mbuf **mdp;
        caddr_t *dposp;
        u_long *retxid;
-       u_long *proc;
+       u_long *procnum;
        register struct ucred *cr;
        register struct ucred *cr;
+       struct mbuf *msk, *mtch;
+       int *wascomp;
 {
        register int i;
 {
        register int i;
-       register struct mbuf *m;
-       nfsm_vars;
-       int len, len2;
+       register u_long *tl;
+       register long t1;
+       caddr_t dpos, cp2;
+       int error = 0;
+       struct mbuf *mrep, *md;
+       int len;
 
 
-       if (error = nfs_udpreceive(so, nam, &mrep))
+       if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
+               error = nfs_receive(so, nam, &mrep, (struct nfsreq *)0);
+       } else {
+               mrep = (struct mbuf *)0;
+               do {
+                       if (mrep) {
+                               m_freem(*nam);
+                               m_freem(mrep);
+                       }
+                       error = nfs_receive(so, nam, &mrep, (struct nfsreq *)0);
+               } while (!error && nfs_badnam(*nam, msk, mtch));
+       }
+       if (error)
                return (error);
        md = mrep;
                return (error);
        md = mrep;
+       mrep = nfs_uncompress(mrep);
+       if (mrep != md) {
+               *wascomp = 1;
+               md = mrep;
+       } else
+               *wascomp = 0;
        dpos = mtod(mrep, caddr_t);
        dpos = mtod(mrep, caddr_t);
-       nfsm_disect(p, u_long *, 10*NFSX_UNSIGNED);
-       *retxid = *p++;
-       if (*p++ != rpc_call) {
+       nfsm_disect(tl, u_long *, 10*NFSX_UNSIGNED);
+       *retxid = *tl++;
+       if (*tl++ != rpc_call) {
                m_freem(mrep);
                return (ERPCMISMATCH);
        }
                m_freem(mrep);
                return (ERPCMISMATCH);
        }
-       if (*p++ != rpc_vers) {
+       if (*tl++ != rpc_vers) {
                m_freem(mrep);
                return (ERPCMISMATCH);
        }
                m_freem(mrep);
                return (ERPCMISMATCH);
        }
-       if (*p++ != prog) {
+       if (*tl++ != prog) {
                m_freem(mrep);
                return (EPROGUNAVAIL);
        }
                m_freem(mrep);
                return (EPROGUNAVAIL);
        }
-       if (*p++ != vers) {
+       if (*tl++ != vers) {
                m_freem(mrep);
                return (EPROGMISMATCH);
        }
                m_freem(mrep);
                return (EPROGMISMATCH);
        }
-       *proc = fxdr_unsigned(u_long, *p++);
-       if (*proc == NFSPROC_NULL) {
+       *procnum = fxdr_unsigned(u_long, *tl++);
+       if (*procnum == NFSPROC_NULL) {
                *mrp = mrep;
                return (0);
        }
                *mrp = mrep;
                return (0);
        }
-       if (*proc > maxproc || *p++ != rpc_auth_unix) {
+       if (*procnum > maxproc || *tl++ != rpc_auth_unix) {
                m_freem(mrep);
                return (EPROCUNAVAIL);
        }
                m_freem(mrep);
                return (EPROCUNAVAIL);
        }
-       len = fxdr_unsigned(int, *p++);
-       len2 = fxdr_unsigned(int, *++p);
-       nfsm_adv(nfsm_rndup(len2));
-       nfsm_disect(p, u_long *, 3*NFSX_UNSIGNED);
-       cr->cr_uid = fxdr_unsigned(uid_t, *p++);
-       cr->cr_gid = fxdr_unsigned(gid_t, *p++);
-       len2 = fxdr_unsigned(int, *p);
-       if (len2 > 10) {
+       len = fxdr_unsigned(int, *tl++);
+       if (len < 0 || len > RPCAUTH_MAXSIZ) {
+               m_freem(mrep);
+               return (EBADRPC);
+       }
+       len = fxdr_unsigned(int, *++tl);
+       if (len < 0 || len > NFS_MAXNAMLEN) {
+               m_freem(mrep);
+               return (EBADRPC);
+       }
+       nfsm_adv(nfsm_rndup(len));
+       nfsm_disect(tl, u_long *, 3*NFSX_UNSIGNED);
+       cr->cr_uid = fxdr_unsigned(uid_t, *tl++);
+       cr->cr_gid = fxdr_unsigned(gid_t, *tl++);
+       len = fxdr_unsigned(int, *tl);
+       if (len < 0 || len > RPCAUTH_UNIXGIDS) {
                m_freem(mrep);
                return (EBADRPC);
        }
                m_freem(mrep);
                return (EBADRPC);
        }
-       nfsm_disect(p, u_long *, (len2+2)*NFSX_UNSIGNED);
-       for (i = 1; i <= len2; i++)
-               cr->cr_groups[i] = fxdr_unsigned(gid_t, *p++);
-       cr->cr_ngroups = len2+1;
+       nfsm_disect(tl, u_long *, (len + 2)*NFSX_UNSIGNED);
+       for (i = 1; i <= len; i++)
+               if (i < NGROUPS)
+                       cr->cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
+               else
+                       tl++;
+       cr->cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
        /*
         * Do we have any use for the verifier?
         * According to the "Remote Procedure Call Protocol Spec." it
         * should be AUTH_NULL, but some clients make it AUTH_UNIX?
         * For now, just skip over it
         */
        /*
         * Do we have any use for the verifier?
         * According to the "Remote Procedure Call Protocol Spec." it
         * should be AUTH_NULL, but some clients make it AUTH_UNIX?
         * For now, just skip over it
         */
-       len2 = fxdr_unsigned(int, *++p);
-       if (len2 > 0)
-               nfsm_adv(nfsm_rndup(len2));
+       len = fxdr_unsigned(int, *++tl);
+       if (len < 0 || len > RPCAUTH_MAXSIZ) {
+               m_freem(mrep);
+               return (EBADRPC);
+       }
+       if (len > 0)
+               nfsm_adv(nfsm_rndup(len));
        *mrp = mrep;
        *mdp = md;
        *dposp = dpos;
        *mrp = mrep;
        *mdp = md;
        *dposp = dpos;
@@ -629,44 +1032,47 @@ nfs_rephead(siz, retxid, err, mrq, mbp, bposp)
        struct mbuf **mbp;
        caddr_t *bposp;
 {
        struct mbuf **mbp;
        caddr_t *bposp;
 {
-       nfsm_vars;
+       register u_long *tl;
+       register long t1;
+       caddr_t bpos;
+       struct mbuf *mreq, *mb, *mb2;
 
        NFSMGETHDR(mreq);
        mb = mreq;
        if ((siz+RPC_REPLYSIZ) > MHLEN)
 
        NFSMGETHDR(mreq);
        mb = mreq;
        if ((siz+RPC_REPLYSIZ) > MHLEN)
-               NFSMCLGET(mreq, M_WAIT);
-       p = mtod(mreq, u_long *);
+               MCLGET(mreq, M_WAIT);
+       tl = mtod(mreq, u_long *);
        mreq->m_len = 6*NFSX_UNSIGNED;
        mreq->m_len = 6*NFSX_UNSIGNED;
-       bpos = ((caddr_t)p)+mreq->m_len;
-       *p++ = retxid;
-       *p++ = rpc_reply;
+       bpos = ((caddr_t)tl)+mreq->m_len;
+       *tl++ = retxid;
+       *tl++ = rpc_reply;
        if (err == ERPCMISMATCH) {
        if (err == ERPCMISMATCH) {
-               *p++ = rpc_msgdenied;
-               *p++ = rpc_mismatch;
-               *p++ = txdr_unsigned(2);
-               *p = txdr_unsigned(2);
+               *tl++ = rpc_msgdenied;
+               *tl++ = rpc_mismatch;
+               *tl++ = txdr_unsigned(2);
+               *tl = txdr_unsigned(2);
        } else {
        } else {
-               *p++ = rpc_msgaccepted;
-               *p++ = 0;
-               *p++ = 0;
+               *tl++ = rpc_msgaccepted;
+               *tl++ = 0;
+               *tl++ = 0;
                switch (err) {
                case EPROGUNAVAIL:
                switch (err) {
                case EPROGUNAVAIL:
-                       *p = txdr_unsigned(RPC_PROGUNAVAIL);
+                       *tl = txdr_unsigned(RPC_PROGUNAVAIL);
                        break;
                case EPROGMISMATCH:
                        break;
                case EPROGMISMATCH:
-                       *p = txdr_unsigned(RPC_PROGMISMATCH);
-                       nfsm_build(p, u_long *, 2*NFSX_UNSIGNED);
-                       *p++ = txdr_unsigned(2);
-                       *p = txdr_unsigned(2);  /* someday 3 */
+                       *tl = txdr_unsigned(RPC_PROGMISMATCH);
+                       nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED);
+                       *tl++ = txdr_unsigned(2);
+                       *tl = txdr_unsigned(2); /* someday 3 */
                        break;
                case EPROCUNAVAIL:
                        break;
                case EPROCUNAVAIL:
-                       *p = txdr_unsigned(RPC_PROCUNAVAIL);
+                       *tl = txdr_unsigned(RPC_PROCUNAVAIL);
                        break;
                default:
                        break;
                default:
-                       *p = 0;
+                       *tl = 0;
                        if (err != VNOVAL) {
                        if (err != VNOVAL) {
-                               nfsm_build(p, u_long *, NFSX_UNSIGNED);
-                               *p = txdr_unsigned(err);
+                               nfsm_build(tl, u_long *, NFSX_UNSIGNED);
+                               *tl = txdr_unsigned(err);
                        }
                        break;
                };
                        }
                        break;
                };
@@ -683,56 +1089,325 @@ nfs_rephead(siz, retxid, err, mrq, mbp, bposp)
  * Nfs timer routine
  * Scan the nfsreq list and retransmit any requests that have timed out
  * To avoid retransmission attempts on STREAM sockets (in the future) make
  * Nfs timer routine
  * Scan the nfsreq list and retransmit any requests that have timed out
  * To avoid retransmission attempts on STREAM sockets (in the future) make
- * sure to set the r_retry field to 0.
+ * sure to set the r_retry field to 0 (implies nm_retry == 0).
  */
 nfs_timer()
 {
        register struct nfsreq *rep;
        register struct mbuf *m;
        register struct socket *so;
  */
 nfs_timer()
 {
        register struct nfsreq *rep;
        register struct mbuf *m;
        register struct socket *so;
-       int s, len;
+       register struct nfsmount *nmp;
+       int s, error;
 
        s = splnet();
 
        s = splnet();
-       rep = nfsreqh.r_next;
-       while (rep && rep != &nfsreqh) {
-               if (rep->r_timer > 0)
-                       rep->r_timer--;
-               else if (rep->r_mrep == NULL && rep->r_retry > 0) {
-                       so = rep->r_mntp->nm_so;
-                       if ((so->so_state & SS_CANTSENDMORE) == 0 &&
-                           !so->so_error &&
-                           sbspace(&so->so_snd) >= rep->r_msiz) {
-                               m = NFSMCOPY(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT);
-                               if (m != NULL) {
-                                       nfsstats.rpcretries++;
-                                       rep->r_timeout <<= 2; /* x4 backoff */
-                                       if (rep->r_timeout > NFS_MAXTIMEO)
-                                               rep->r_timeout = NFS_MAXTIMEO;
-                                       rep->r_timer = rep->r_timeout;
-                                       if (rep->r_retry != VNOVAL)
-                                               rep->r_retry--;
-#ifdef MGETHDR
-                                       m->m_pkthdr.len = rep->r_msiz;
-#endif
-                                       (*so->so_proto->pr_usrreq)(so, PRU_SEND,
-                                               m, (caddr_t)0, (struct mbuf *)0,
-                                               (struct mbuf *)0);
-                               }
+       for (rep = nfsreqh.r_next; rep != &nfsreqh; rep = rep->r_next) {
+               nmp = rep->r_nmp;
+               if (rep->r_mrep || (rep->r_flags & R_SOFTTERM) ||
+                   (so = nmp->nm_so) == NULL)
+                       continue;       /* reply present, R_SOFTTERM set, or mount has no socket */
+               if ((nmp->nm_flag & NFSMNT_INT) && nfs_sigintr(rep->r_procp)) {
+                       rep->r_flags |= R_SOFTTERM;
+                       continue;
+               }
+               if (rep->r_flags & R_TIMING)    /* update rtt in mount */
+                       nmp->nm_rtt++;
+               /* If not timed out: r_timer is counted up, one per call,
+                * until it reaches the mount's nm_rto */
+               if (++rep->r_timer < nmp->nm_rto)
+                       continue;
+               /* Do backoff and save new timeout in mount */
+               if (rep->r_flags & R_TIMING) {
+                       nfs_backofftimer(nmp);
+                       rep->r_flags &= ~R_TIMING;
+                       nmp->nm_rtt = -1;
+               }
+               if (rep->r_flags & R_SENT) {    /* stop counting it against nm_window */
+                       rep->r_flags &= ~R_SENT;
+                       nmp->nm_sent--;
+               }
+
+               /*
+                * Check for too many retries on soft mount.
+                * nb: For hard mounts, r_retry == NFS_MAXREXMIT+1
+                * Here r_rexmit is only counted and clamped to NFS_MAXREXMIT;
+                * the comparison against r_retry that sets R_SOFTTERM is
+                * made further down, after the "not responding" check.
+                */
+               if (++rep->r_rexmit > NFS_MAXREXMIT)
+                       rep->r_rexmit = NFS_MAXREXMIT;
+
+               /*
+                * Check for server not responding
+                * (R_TPRINTFMSG ensures the message is issued once per request)
+                */
+               if ((rep->r_flags & R_TPRINTFMSG) == 0 &&
+                    rep->r_rexmit > NFS_FISHY) {
+                       nfs_msg(rep->r_procp,
+                           nmp->nm_mountp->mnt_stat.f_mntfromname,
+                           "not responding");
+                       rep->r_flags |= R_TPRINTFMSG;
+               }
+               if (rep->r_rexmit >= rep->r_retry) {    /* too many */
+                       nfsstats.rpctimeouts++;
+                       rep->r_flags |= R_SOFTTERM;
+                       continue;
+               }
+               if (nmp->nm_sotype != SOCK_DGRAM)
+                       continue;       /* only SOCK_DGRAM mounts are resent from here */
+
+               /*
+                * If there is enough space and the window allows..
+                *      Resend it
+                * A copy of r_mreq is sent so the original stays queued;
+                * nm_nam is passed as the address only for NFSMNT_NOCONN mounts.
+                */
+               if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
+                      nmp->nm_sent < nmp->nm_window &&
+                      (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
+                       nfsstats.rpcretries++;
+                       if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
+                           error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m,
+                           (caddr_t)0, (struct mbuf *)0, (struct mbuf *)0);
+                       else
+                           error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m,
+                           nmp->nm_nam, (struct mbuf *)0, (struct mbuf *)0);
+                       if (error) {
+                               if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
+                                       so->so_error = 0;
+                       } else {
+                               /*
+                                * We need to time the request even though we
+                                * are retransmitting.
+                                */
+                               nmp->nm_rtt = 0;
+                               nmp->nm_sent++;
+                               rep->r_flags |= (R_SENT|R_TIMING);
+                               rep->r_timer = rep->r_timerinit;
                        }
                }
                        }
                }
-               rep = rep->r_next;
        }
        splx(s);
        }
        splx(s);
-       timeout(nfs_timer, (caddr_t)0, hz/10);
+       timeout(nfs_timer, (caddr_t)0, hz/NFS_HZ);      /* re-arm this routine */
+}
+
+/*
+ * NFS timer update and backoff. The "Jacobson/Karels/Karn" scheme is
+ * used here. The timer state is held in the nfsmount structure and
+ * a single request is used to clock the response. When successful
+ * the rtt smoothing in nfs_updatetimer is used, when failed the backoff
+ * is done by nfs_backofftimer. We also log failure messages in these
+ * routines.
+ *
+ * Congestion variables are held in the nfshost structure which
+ * is referenced by nfsmounts and shared per-server. This separation
+ * makes it possible to do per-mount timing which allows varying disk
+ * access times to be dealt with, while preserving a network oriented
+ * congestion control scheme.
+ *
+ * The windowing implements the Jacobson/Karels slowstart algorithm
+ * with adjusted scaling factors. We start with one request, then send
+ * 4 more after each success until the ssthresh limit is reached, then
+ * we increment at a rate proportional to the window. On failure, we
+ * remember 3/4 the current window and clamp the send limit to 1. Note
+ * ICMP source quench is not reflected in so->so_error so we ignore that
+ * for now.
+ *
+ * NFS behaves much more like a transport protocol with these changes,
+ * shedding the teenage pedal-to-the-metal tendencies of "other"
+ * implementations.
+ *
+ * Timers and congestion avoidance by Tom Talpey, Open Software Foundation.
+ */
+
+/*
+ * The TCP algorithm was not forgiving enough. Because the NFS server
+ * responds only after performing lookups/diskio/etc, we have to be
+ * more prepared to accept a spiky variance. The TCP algorithm is:
+ * TCP_RTO(nmp) ((((nmp)->nm_srtt >> 2) + (nmp)->nm_rttvar) >> 1)
+ * NFS_RTO below drops the final halving: it keeps the same nm_srtt >> 3
+ * term but adds the whole of nm_rttvar, i.e. twice the variance
+ * contribution of the TCP formula.
+ */
+#define NFS_RTO(nmp)   (((nmp)->nm_srtt >> 3) + (nmp)->nm_rttvar)
+
+nfs_updatetimer(nmp)   /* Fold the mount's latest round-trip sample nm_rtt
+                        * into nm_srtt/nm_rttvar, recompute and clip nm_rto,
+                        * then grow nm_window (capped at NFS_MAXWINDOW). */
+       register struct nfsmount *nmp;
+{
+
+       /* If retransmitted, clear both retransmit counters and return
+        * without using this sample */
+       if (nmp->nm_rexmit || nmp->nm_currexmit) {
+               nmp->nm_rexmit = nmp->nm_currexmit = 0;
+               return;
+       }
+       /* If have a measurement, do smoothing.
+        * NOTE(review): the >> 3 and >> 2 below suggest nm_srtt is kept
+        * scaled by 8 and nm_rttvar by 4 - confirm against the struct. */
+       if (nmp->nm_srtt) {
+               register short delta;
+               delta = nmp->nm_rtt - (nmp->nm_srtt >> 3);
+               if ((nmp->nm_srtt += delta) <= 0)
+                       nmp->nm_srtt = 1;
+               if (delta < 0)
+                       delta = -delta;
+               delta -= (nmp->nm_rttvar >> 2);
+               if ((nmp->nm_rttvar += delta) <= 0)
+                       nmp->nm_rttvar = 1;
+       /* Else initialize: nm_rttvar = 2 * nm_rtt (at least 2),
+        * nm_srtt = 4 * nm_rttvar */
+       } else {
+               nmp->nm_rttvar = nmp->nm_rtt << 1;
+               if (nmp->nm_rttvar == 0) nmp->nm_rttvar = 2;
+               nmp->nm_srtt = nmp->nm_rttvar << 2;
+       }
+       /* Compute new Retransmission TimeOut and clip
+        * to the range NFS_MINTIMEO..NFS_MAXTIMEO */
+       nmp->nm_rto = NFS_RTO(nmp);
+       if (nmp->nm_rto < NFS_MINTIMEO)
+               nmp->nm_rto = NFS_MINTIMEO;
+       else if (nmp->nm_rto > NFS_MAXTIMEO)
+               nmp->nm_rto = NFS_MAXTIMEO;
+
+       /* Update window estimate */
+       if (nmp->nm_window < nmp->nm_ssthresh)  /* quickly */
+               nmp->nm_window += 4;
+       else {                                          /* slowly: nm_winext counts calls; the
+                                                        * window grows by one, and nm_winext
+                                                        * resets, once nm_winext squared
+                                                        * divided by nm_window is nonzero */
+               register long incr = ++nmp->nm_winext;
+               incr = (incr * incr) / nmp->nm_window;
+               if (incr > 0) {
+                       nmp->nm_winext = 0;
+                       ++nmp->nm_window;
+               }
+       }
+       if (nmp->nm_window > NFS_MAXWINDOW)
+               nmp->nm_window = NFS_MAXWINDOW;
+}
+
+nfs_backofftimer(nmp)  /* Called on a timeout: count the retransmit, back
+                        * nm_rto off exponentially (capped at NFS_MAXTIMEO)
+                        * and collapse the send window to 1. */
+       register struct nfsmount *nmp;
+{
+       register unsigned long newrto;
+
+       /* Clip shift count: nm_rexmit never exceeds 8 * sizeof nm_rto,
+        * so the shift below is at most one less than that */
+       if (++nmp->nm_rexmit > 8 * sizeof nmp->nm_rto)
+               nmp->nm_rexmit = 8 * sizeof nmp->nm_rto;
+       /* Back off RTO exponentially: NFS_RTO doubled (nm_rexmit - 1) times.
+        * A result of 0 is treated like an over-large one and replaced by
+        * NFS_MAXTIMEO. */
+       newrto = NFS_RTO(nmp);
+       newrto <<= (nmp->nm_rexmit - 1);
+       if (newrto == 0 || newrto > NFS_MAXTIMEO)
+               newrto = NFS_MAXTIMEO;
+       nmp->nm_rto = newrto;
+
+       /* If too many retries, message, assume a bogus RTT and re-measure.
+        * NOTE(review): no message is printed in this routine despite the
+        * wording above. When nm_currexmit first equals nfsrexmtthresh,
+        * nm_srtt is zeroed (after adding nm_srtt >> 2 to nm_rttvar) so
+        * nfs_updatetimer takes its "initialize" branch on the next sample. */
+       if (nmp->nm_currexmit < nmp->nm_rexmit) {
+               nmp->nm_currexmit = nmp->nm_rexmit;
+               if (nmp->nm_currexmit >= nfsrexmtthresh) {
+                       if (nmp->nm_currexmit == nfsrexmtthresh) {
+                               nmp->nm_rttvar += (nmp->nm_srtt >> 2);
+                               nmp->nm_srtt = 0;
+                       }
+               }
+       }
+       /* Close down window but remember this point (3/4 current) for later */
+       nmp->nm_ssthresh = ((nmp->nm_window << 1) + nmp->nm_window) >> 2;
+       nmp->nm_window = 1;
+       nmp->nm_winext = 0;
+}
+
+/*
+ * Test for a termination signal pending on procp.
+ * This is used for NFSMNT_INT mounts.
+ * Returns 1 when p is non-NULL and at least one signal in NFSINT_SIGMASK
+ * is set in p_sig but in neither p_sigmask nor p_sigignore; otherwise
+ * returns 0. A NULL p always yields 0.
+ */
+nfs_sigintr(p)
+       register struct proc *p;
+{
+       if (p && p->p_sig && (((p->p_sig &~ p->p_sigmask) &~ p->p_sigignore) &
+           NFSINT_SIGMASK))
+               return (1);
+       else
+               return (0);
+}
+
+nfs_msg(p, server, msg)        /* Emit "nfs server <server>: <msg>" through
+                                * tprintf. p selects the tprintf handle and
+                                * may be NULL; server and msg are C strings. */
+       struct proc *p;
+       char *server, *msg;
+{
+       tpr_t tpr;
+
+       if (p)
+               tpr = tprintf_open(p);
+       else
+               tpr = NULL;     /* no process: tprintf is still called, with a NULL handle */
+       tprintf(tpr, "nfs server %s: %s\n", server, msg);
+       tprintf_close(tpr);
+}
+
+/*
+ * Lock a socket against others.
+ * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
+ * and also to avoid race conditions between the processes with nfs requests
+ * in progress when a reconnect is necessary.
+ * flagp points at the flag word holding the NFSMNT_SCKLOCK and
+ * NFSMNT_WANTSCK bits. While the lock bit is set, the caller marks
+ * NFSMNT_WANTSCK and sleeps in tsleep on flagp; the tsleep result is
+ * discarded and the bit is simply re-tested. On return the lock bit is
+ * set on behalf of the caller.
+ */
+nfs_solock(flagp)
+       register int *flagp;
+{
+
+       while (*flagp & NFSMNT_SCKLOCK) {
+               *flagp |= NFSMNT_WANTSCK;
+               (void) tsleep((caddr_t)flagp, PZERO-1, "nfsolck", 0);
+       }
+       *flagp |= NFSMNT_SCKLOCK;
+}
+
+/*
+ * Unlock the stream socket for others.
+ * Panics if NFSMNT_SCKLOCK is not currently set in *flagp. After clearing
+ * the lock bit, sleepers on flagp are woken only when NFSMNT_WANTSCK was
+ * set, and that bit is cleared as well.
+ */
+nfs_sounlock(flagp)
+       register int *flagp;
+{
+
+       if ((*flagp & NFSMNT_SCKLOCK) == 0)
+               panic("nfs sounlock");
+       *flagp &= ~NFSMNT_SCKLOCK;
+       if (*flagp & NFSMNT_WANTSCK) {
+               *flagp &= ~NFSMNT_WANTSCK;
+               wakeup((caddr_t)flagp);
+       }
+}
+
+/*
+ * This function compares two net addresses by family and returns TRUE
+ * if they are the same.
+ * If there is any doubt, return FALSE.
+ * nam1 and nam2 are mbufs whose data is read as a struct sockaddr.
+ * Only AF_INET is handled, and only the sin_addr.s_addr fields are
+ * compared (the port is not looked at). Differing families, or any
+ * family other than AF_INET, return 0.
+ */
+nfs_netaddr_match(nam1, nam2)
+       struct mbuf *nam1, *nam2;
+{
+       register struct sockaddr *saddr1, *saddr2;
+
+       saddr1 = mtod(nam1, struct sockaddr *);
+       saddr2 = mtod(nam2, struct sockaddr *);
+       if (saddr1->sa_family != saddr2->sa_family)
+               return (0);
+
+       /*
+        * Must do each address family separately since unused fields
+        * are undefined values and not always zeroed.
+        */
+       switch (saddr1->sa_family) {
+       case AF_INET:
+               if (((struct sockaddr_in *)saddr1)->sin_addr.s_addr ==
+                   ((struct sockaddr_in *)saddr2)->sin_addr.s_addr)
+                       return (1);
+               break;
+       default:
+               break;
+       };
+       return (0);
 }
 
 /*
 }
 
 /*
- * nfs_sbwait() is simply sbwait() but at a negative priority so that it
- * can not be interrupted by a signal.
+ * Check the hostname fields for nfsd's mask and match fields.
+ * By address family:
+ * - Bitwise AND the mask with the host address field
+ * - Compare for == with match
+ * return TRUE if not equal
  */
  */
-nfs_sbwait(sb)
-       struct sockbuf *sb;
+nfs_badnam(nam, msk, mtch)     /* nam: address under test; msk, mtch: mask and
+                                * expected value, all mbufs holding a sockaddr.
+                                * Returns nonzero when (nam & msk) differs from
+                                * mtch. Only AF_INET is handled. */
+       register struct mbuf *nam, *msk, *mtch;
 {
 {
-       sb->sb_flags |= SB_WAIT;
-       sleep((caddr_t)&sb->sb_cc, PZERO-2);
+       switch (mtod(nam, struct sockaddr *)->sa_family) {
+       case AF_INET:
+               return ((mtod(nam, struct sockaddr_in *)->sin_addr.s_addr &
+                        mtod(msk, struct sockaddr_in *)->sin_addr.s_addr) !=
+                        mtod(mtch, struct sockaddr_in *)->sin_addr.s_addr);
+       default:
+               /* Unknown family: log under this routine's own name and return 0 */
+               printf("nfs_badnam, unknown sa_family\n");
+               return (0);
+       };
 }
 }