/*
 * Revision note: tsleep; sblock/sbwait catch/return interrupts; implement lowat,
 * Path: [unix-history] usr/src/sys/kern/uipc_socket.c
 */
/*
* Copyright (c) 1982, 1986, 1988 Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms are permitted
* provided that the above copyright notice and this paragraph are
* duplicated in all such forms and that any documentation,
* advertising materials, and other materials related to such
* distribution and use acknowledge that the software was developed
* by the University of California, Berkeley. The name of the
* University may not be used to endorse or promote products derived
* from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
* WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*
* @(#)uipc_socket.c 7.17 (Berkeley) %G%
*/
#include "param.h"
#include "user.h"
#include "proc.h"
#include "file.h"
#include "malloc.h"
#include "mbuf.h"
#include "domain.h"
#include "protosw.h"
#include "socket.h"
#include "socketvar.h"
/*
* Socket operation routines.
* These routines are called by the routines in
* sys_socket.c or from a system process, and
* implement the semantics of socket operations by
* switching out to the protocol specific routines.
*
* TODO:
* test socketpair
* clean up async
* out-of-band is a kludge
*/
/*ARGSUSED*/
socreate(dom, aso, type, proto)
struct socket **aso;
register int type;
int proto;
{
register struct protosw *prp;
register struct socket *so;
register int error;
if (proto)
prp = pffindproto(dom, proto, type);
else
prp = pffindtype(dom, type);
if (prp == 0)
return (EPROTONOSUPPORT);
if (prp->pr_type != type)
return (EPROTOTYPE);
MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
bzero((caddr_t)so, sizeof(*so));
so->so_type = type;
if (u.u_uid == 0)
so->so_state = SS_PRIV;
so->so_proto = prp;
error =
(*prp->pr_usrreq)(so, PRU_ATTACH,
(struct mbuf *)0, (struct mbuf *)proto, (struct mbuf *)0);
if (error) {
so->so_state |= SS_NOFDREF;
sofree(so);
return (error);
}
*aso = so;
return (0);
}
/*
 * Hand a PRU_BIND request carrying nam to the socket's protocol,
 * at splnet.  Returns whatever the protocol returns.
 */
sobind(so, nam)
	struct socket *so;
	struct mbuf *nam;
{
	int s, error;

	s = splnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND,
	    (struct mbuf *)0, nam, (struct mbuf *)0);
	splx(s);
	return (error);
}
solisten(so, backlog)
register struct socket *so;
int backlog;
{
int s = splnet(), error;
error =
(*so->so_proto->pr_usrreq)(so, PRU_LISTEN,
(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
if (error) {
splx(s);
return (error);
}
if (so->so_q == 0)
so->so_options |= SO_ACCEPTCONN;
if (backlog < 0)
backlog = 0;
so->so_qlimit = min(backlog, SOMAXCONN);
splx(s);
return (0);
}
/*
 * Release a socket's storage, but only once it has no protocol
 * control block and SS_NOFDREF is set; otherwise do nothing.
 * A socket that still has a so_head is first removed from whichever
 * of the head's queues it is on (panic if it is on neither).
 */
sofree(so)
	register struct socket *so;
{
	if (so->so_pcb)
		return;
	if (!(so->so_state & SS_NOFDREF))
		return;

	if (so->so_head) {
		/* Try queue 0 first, then queue 1. */
		if (!(soqremque(so, 0) || soqremque(so, 1)))
			panic("sofree dq");
		so->so_head = 0;
	}
	sbrelease(&so->so_snd);
	sorflush(so);
	FREE(so, M_SOCKET);
}
/*
* Close a socket on last file table reference removal.
* Initiate disconnect if connected.
* Free socket when disconnect complete.
*/
soclose(so)
register struct socket *so;
{
int s = splnet(); /* conservative */
int error = 0;
if (so->so_options & SO_ACCEPTCONN) {
while (so->so_q0)
(void) soabort(so->so_q0);
while (so->so_q)
(void) soabort(so->so_q);
}
if (so->so_pcb == 0)
goto discard;
if (so->so_state & SS_ISCONNECTED) {
if ((so->so_state & SS_ISDISCONNECTING) == 0) {
error = sodisconnect(so);
if (error)
goto drop;
}
if (so->so_options & SO_LINGER) {
if ((so->so_state & SS_ISDISCONNECTING) &&
(so->so_state & SS_NBIO))
goto drop;
while (so->so_state & SS_ISCONNECTED)
if (error = tsleep((caddr_t)&so->so_timeo,
PSOCK | PCATCH, netcls, so->so_linger))
break;
}
}
/*
* If there is an error on the socket, disregard any
* error from tsleep and return the socket error.
*/
if (so->so_error)
error = so->so_error;
drop:
if (so->so_pcb) {
int error2 =
(*so->so_proto->pr_usrreq)(so, PRU_DETACH,
(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
if (error == 0)
error = error2;
}
discard:
if (so->so_state & SS_NOFDREF)
panic("soclose: NOFDREF");
so->so_state |= SS_NOFDREF;
sofree(so);
splx(s);
return (error);
}
/*
* Must be called at splnet...
*/
soabort(so)
struct socket *so;
{
return (
(*so->so_proto->pr_usrreq)(so, PRU_ABORT,
(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0));
}
/*
 * Accept a connection on so: the socket must currently be marked
 * SS_NOFDREF (panic otherwise); that mark is cleared and PRU_ACCEPT
 * is issued with nam.  Returns the protocol's result.
 */
soaccept(so, nam)
	register struct socket *so;
	struct mbuf *nam;
{
	int s, error;

	s = splnet();
	if (!(so->so_state & SS_NOFDREF))
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
	    (struct mbuf *)0, nam, (struct mbuf *)0);
	splx(s);
	return (error);
}
/*
 * Connect so to the address in nam.
 * Listening sockets (SO_ACCEPTCONN) get EOPNOTSUPP.
 * If protocol is connection-based, can only connect once.
 * Otherwise, if connected, try to disconnect first.
 * This allows user to disconnect by connecting to, e.g.,
 * a null address.  A failed disconnect is reported as EISCONN.
 */
soconnect(so, nam)
	register struct socket *so;
	struct mbuf *nam;
{
	int s, error;
	int inuse;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);

	s = splnet();
	inuse = 0;
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
		if (so->so_proto->pr_flags & PR_CONNREQUIRED)
			inuse = 1;
		else if (sodisconnect(so) != 0)
			inuse = 1;
	}
	if (inuse)
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    (struct mbuf *)0, nam, (struct mbuf *)0);
	splx(s);
	return (error);
}
/*
 * Issue PRU_CONNECT2 on so1, passing so2 in the address argument
 * slot, at splnet.  Returns the protocol's result.
 */
soconnect2(so1, so2)
	register struct socket *so1;
	struct socket *so2;
{
	int s, error;

	s = splnet();
	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
	    (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0);
	splx(s);
	return (error);
}
/*
 * Start a disconnect on so.
 * Returns ENOTCONN if the socket is not connected, EALREADY if a
 * disconnect is already in progress, and otherwise the result of
 * the protocol's PRU_DISCONNECT request.
 */
sodisconnect(so)
	register struct socket *so;
{
	int s, error;

	s = splnet();
	if (!(so->so_state & SS_ISCONNECTED))
		error = ENOTCONN;
	else if (so->so_state & SS_ISDISCONNECTING)
		error = EALREADY;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
		    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
	splx(s);
	return (error);
}
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 *
 * so       socket to send on
 * nam      destination address handed to the protocol; must be
 *          non-null when the socket is unconnected and the protocol
 *          does not require connections (else EDESTADDRREQ)
 * uio      describes the user data; uio_resid bytes are sent
 * flags    MSG_OOB, MSG_DONTROUTE and MSG_EOR are examined here
 * control  optional control mbuf; its m_len is charged against the
 *          available space before the first protocol send
 *
 * Returns 0 or an errno value.
 */
sosend(so, nam, uio, flags, control)
	register struct socket *so;
	struct mbuf *nam;
	register struct uio *uio;
	int flags;
	struct mbuf *control;
{
	struct mbuf *top = 0, **mp;
	register struct mbuf *m;
	register int space, len;
	int rlen = 0, error, s, dontroute, mlen;
	int atomic = sosendallatonce(so);

	if (atomic && uio->uio_resid > so->so_snd.sb_hiwat)
		return (EMSGSIZE);
	/*
	 * SO_DONTROUTE is turned on temporarily around the protocol call
	 * only when the caller asked for it, the option is not already
	 * set, and the protocol is PR_ATOMIC.
	 */
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	u.u_ru.ru_msgsnd++;
	if (control)
		rlen = control->m_len;
	/*
	 * snderr() may only be used while the send buffer lock is held
	 * and the priority has been raised with splnet(): it restores
	 * the priority and exits through release:, which unlocks.
	 */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	if (error = sblock(&so->so_snd))
		return (error);
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error)
			snderr(so->so_error);
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0)
					snderr(ENOTCONN);
			} else if (nam == 0)
				snderr(EDESTADDRREQ);
		}
		if (flags & MSG_OOB)
			space = 1024;
		else {
			space = sbspace(&so->so_snd);
			if (space <= rlen ||
			   (atomic && space < uio->uio_resid + rlen) ||
			   (uio->uio_resid >= MCLBYTES && space < MCLBYTES &&
			   so->so_snd.sb_cc >= MCLBYTES &&
			   (so->so_state & SS_NBIO) == 0)) {
				if (so->so_state & SS_NBIO)
					snderr(EWOULDBLOCK);
				/*
				 * Give up the lock while waiting for space.
				 * If the wait fails we no longer hold the
				 * lock (another sender may), so leave via
				 * out: and skip the sbunlock at release:.
				 */
				sbunlock(&so->so_snd);
				error = sbwait(&so->so_snd);
				splx(s);
				if (error)
					goto out;
				goto restart;
			}
		}
		splx(s);
		mp = &top;
		space -= rlen;
		do {
		    /* Build an mbuf chain in top from the user's data. */
		    do {
			if (top == 0) {
				/* First mbuf of a chain carries the packet header. */
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			if (uio->uio_resid >= MINCLSIZE && space >= MCLBYTES) {
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
#ifdef	MAPPED_MBUFS
				len = min(MCLBYTES, uio->uio_resid);
				if (len < mlen - max_hdr)
					m->m_data += max_hdr;
#else
				len = min(MCLBYTES - max_hdr, uio->uio_resid);
				m->m_data += max_hdr;
#endif
				space -= MCLBYTES;
			} else {
nopages:
				len = min(min(mlen, uio->uio_resid), space);
				space -= len;
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			error = uiomove(mtod(m, caddr_t), len, uio);
			m->m_len = len;
			/* Link the mbuf in before checking error so release frees it. */
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (uio->uio_resid <= 0) {
				if ((flags & MSG_EOR) && top)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    error = (*so->so_proto->pr_usrreq)(so,
			(flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			top, (caddr_t)nam, control);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* The chain has been handed to the protocol; start afresh. */
		    rlen = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			    goto release;
		} while (uio->uio_resid && space > 0);
	} while (uio->uio_resid);

release:
	sbunlock(&so->so_snd);
out:
	/* Free any chain that was built but never handed to the protocol. */
	if (top)
		m_freem(top);
	return (error);
}
/*
* Implement receive operations on a socket.
* We depend on the way that records are added to the sockbuf
* by sbappend*. In particular, each record (mbufs linked through m_next)
* must begin with an address if the protocol so specifies,
* followed by an optional mbuf containing access rights if supported
* by the protocol, and then zero or more mbufs of data.
* In order to avoid blocking network interrupts for the entire time here,
* we splx() while doing the actual copy to user space.
* Although the sockbuf is locked, new data may still be appended,
* and thus we must maintain consistency of the sockbuf during that time.
*/
soreceive(so, aname, uio, flagsp, rightsp, controlp)
register struct socket *so;
struct mbuf **aname;
register struct uio *uio;
int *flagsp;
struct mbuf **rightsp, **controlp;
{
register struct mbuf *m;
register int flags, len, error, s, offset;
struct protosw *pr = so->so_proto;
struct mbuf *nextrecord, *m_with_eor;
int moff;
if (rightsp)
*rightsp = 0;
if (aname)
*aname = 0;
if (controlp)
*controlp = 0;
if (flagsp)
flags = *flagsp &~ MSG_EOR;
else
flags = 0;
if (flags & MSG_OOB) {
m = m_get(M_WAIT, MT_DATA);
error = (*pr->pr_usrreq)(so, PRU_RCVOOB,
m, (struct mbuf *)(flags & MSG_PEEK), (struct mbuf *)0);
if (error)
goto bad;
do {
len = uio->uio_resid;
if (len > m->m_len)
len = m->m_len;
error = uiomove(mtod(m, caddr_t), (int)len, uio);
m = m_free(m);
} while (uio->uio_resid && error == 0 && m);
bad:
if (m)
m_freem(m);
return (error);
}
if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
(struct mbuf *)0, (struct mbuf *)0);
restart:
if (error = sblock(&so->so_rcv))
return (error);
s = splnet();
m = so->so_rcv.sb_mb;
if (m == 0) {
if (so->so_rcv.sb_cc)
panic("receive 1");
if (so->so_error) {
error = so->so_error;
so->so_error = 0;
goto release;
}
if (so->so_state & SS_CANTRCVMORE)
goto release;
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
(so->so_proto->pr_flags & PR_CONNREQUIRED)) {
error = ENOTCONN;
goto release;
}
if (uio->uio_resid == 0)
goto release;
if (so->so_state & SS_NBIO) {
error = EWOULDBLOCK;
goto release;
}
sbunlock(&so->so_rcv);
if (error = sbwait(&so->so_rcv))
goto release;
splx(s);
goto restart;
}
u.u_ru.ru_msgrcv++;
if (m->m_type == 0)
panic("receive 3a");
nextrecord = m->m_nextpkt;
if (pr->pr_flags & PR_ADDR) {
if (m->m_type != MT_SONAME)
panic("receive 1a");
if (flags & MSG_PEEK) {
if (aname)
*aname = m_copy(m, 0, m->m_len);
m = m->m_next;
} else {
sbfree(&so->so_rcv, m);
if (aname) {
*aname = m;
so->so_rcv.sb_mb = m->m_next;
m->m_next = 0;
m = so->so_rcv.sb_mb;
} else {
MFREE(m, so->so_rcv.sb_mb);
m = so->so_rcv.sb_mb;
}
}
}
if (m && m->m_type == MT_RIGHTS) {
if ((pr->pr_flags & PR_RIGHTS) == 0)
panic("receive 2");
if (flags & MSG_PEEK) {
if (rightsp)
*rightsp = m_copy(m, 0, m->m_len);
m = m->m_next;
} else {
sbfree(&so->so_rcv, m);
if (rightsp) {
*rightsp = m;
so->so_rcv.sb_mb = m->m_next;
m->m_next = 0;
m = so->so_rcv.sb_mb;
} else {
MFREE(m, so->so_rcv.sb_mb);
m = so->so_rcv.sb_mb;
}
}
}
if (m && m->m_type == MT_CONTROL) {
if (flags & MSG_PEEK) {
if (controlp)
*controlp = m_copy(m, 0, m->m_len);
m = m->m_next;
} else {
sbfree(&so->so_rcv, m);
if (controlp) {
*controlp = m;
so->so_rcv.sb_mb = m->m_next;
m->m_next = 0;
m = so->so_rcv.sb_mb;
} else {
MFREE(m, so->so_rcv.sb_mb);
m = so->so_rcv.sb_mb;
}
}
}
if (m)
m->m_nextpkt = nextrecord;
moff = 0;
offset = 0;
m_with_eor = 0;
while (m && uio->uio_resid > 0 && error == 0) {
if (m->m_type == MT_OOBDATA)
flags |= MSG_OOB;
else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
panic("receive 3");
if (m->m_flags & M_EOR)
m_with_eor = m;
len = uio->uio_resid;
so->so_state &= ~SS_RCVATMARK;
if (so->so_oobmark && len > so->so_oobmark - offset)
len = so->so_oobmark - offset;
if (len > m->m_len - moff)
len = m->m_len - moff;
splx(s);
error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
s = splnet();
if (len == m->m_len - moff) {
if (flags & MSG_PEEK) {
m = m->m_next;
moff = 0;
} else {
nextrecord = m->m_nextpkt;
sbfree(&so->so_rcv, m);
MFREE(m, so->so_rcv.sb_mb);
m = so->so_rcv.sb_mb;
if (m)
m->m_nextpkt = nextrecord;
}
} else {
if (flags & MSG_PEEK)
moff += len;
else {
m->m_data += len;
m->m_len -= len;
so->so_rcv.sb_cc -= len;
}
}
if (so->so_oobmark) {
if ((flags & MSG_PEEK) == 0) {
so->so_oobmark -= len;
if (so->so_oobmark == 0) {
so->so_state |= SS_RCVATMARK;
break;
}
} else
offset += len;
}
if (m_with_eor)
break;
}
if (m_with_eor) {
if (m != m_with_eor)
flags |= MSG_EOR;
/* else data not consumed from mbuf */
}
if ((flags & MSG_PEEK) == 0) {
if (m == 0)
so->so_rcv.sb_mb = nextrecord;
else if (pr->pr_flags & PR_ATOMIC) {
flags |= MSG_TRUNC;
(void) sbdroprecord(&so->so_rcv);
}
if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
(struct mbuf *)flags, (struct mbuf *)0,
(struct mbuf *)0);
if (error == 0 && rightsp && *rightsp &&
pr->pr_domain->dom_externalize)
error = (*pr->pr_domain->dom_externalize)(*rightsp);
}
if (flagsp)
*flagsp |= flags;
release:
sbunlock(&so->so_rcv);
splx(s);
return (error);
}
/*
 * Shut down part of a connection.  how + 1 is tested against FREAD
 * and FWRITE: the read side is flushed with sorflush(), the write
 * side is passed to the protocol as PRU_SHUTDOWN.
 * Returns the protocol's result when the write side is shut down,
 * otherwise 0.
 * NOTE(review): presumably how is 0, 1 or 2 so that how + 1 forms
 * the FREAD/FWRITE bit mask -- confirm against the callers.
 */
soshutdown(so, how)
	register struct socket *so;
	register int how;
{
	register struct protosw *pr;
	register int which;

	pr = so->so_proto;
	which = how + 1;
	if (which & FREAD)
		sorflush(so);
	if ((which & FWRITE) == 0)
		return (0);
	return ((*pr->pr_usrreq)(so, PRU_SHUTDOWN,
	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0));
}
sorflush(so)
register struct socket *so;
{
register struct sockbuf *sb = &so->so_rcv;
register struct protosw *pr = so->so_proto;
register int s;
struct sockbuf asb;
sb->sb_flags |= SB_NOINTR;
(void) sblock(sb);
s = splimp();
socantrcvmore(so);
sbunlock(sb);
asb = *sb;
bzero((caddr_t)sb, sizeof (*sb));
splx(s);
if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
(*pr->pr_domain->dom_dispose)(asb.sb_mb);
sbrelease(&asb);
}
/*
 * Set a socket option.
 * Options at a level other than SOL_SOCKET are passed to the
 * protocol's pr_ctloutput routine (ENOPROTOOPT if there is none);
 * in that case the option mbuf is handed over with the call.
 * For SOL_SOCKET options the value is read from m0, which is
 * freed here before returning.
 * Returns 0, EINVAL for a missing or badly sized/valued option,
 * ENOBUFS if a buffer reservation fails, or ENOPROTOOPT for an
 * unknown option.
 */
sosetopt(so, level, optname, m0)
	register struct socket *so;
	int level, optname;
	struct mbuf *m0;
{
	register struct mbuf *m = m0;
	register struct sockbuf *sb;
	u_long val;
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
		goto bad;
	}

	switch (optname) {

	case SO_LINGER:
		if (m == NULL || m->m_len != sizeof (struct linger)) {
			error = EINVAL;
			goto bad;
		}
		so->so_linger = mtod(m, struct linger *)->l_linger;
		/* fall thru... */

	case SO_DEBUG:
	case SO_KEEPALIVE:
	case SO_DONTROUTE:
	case SO_USELOOPBACK:
	case SO_BROADCAST:
	case SO_REUSEADDR:
	case SO_OOBINLINE:
		/* Boolean options: the leading int turns the bit on or off. */
		if (m == NULL || m->m_len < sizeof (int)) {
			error = EINVAL;
			goto bad;
		}
		if (*mtod(m, int *) == 0)
			so->so_options &= ~optname;
		else
			so->so_options |= optname;
		break;

	case SO_SNDBUF:
	case SO_SNDLOWAT:
	case SO_SNDTIMEO:
	case SO_RCVBUF:
	case SO_RCVLOWAT:
	case SO_RCVTIMEO:
		/* Pick the send or receive buffer the option applies to. */
		if (optname == SO_SNDBUF || optname == SO_SNDLOWAT ||
		    optname == SO_SNDTIMEO)
			sb = &so->so_snd;
		else
			sb = &so->so_rcv;
		if (m == NULL || m->m_len < sizeof (int)) {
			error = EINVAL;
			goto bad;
		}
		if (optname == SO_SNDTIMEO || optname == SO_RCVTIMEO) {
			sb->sb_timeo = *mtod(m, int *);
			break;
		}
		val = (u_long) *mtod(m, int *);
		if (optname == SO_SNDBUF || optname == SO_RCVBUF) {
			if (val == 0) {
				error = EINVAL;
				goto bad;
			}
			if (sbreserve(sb, val) == 0) {
				error = ENOBUFS;
				goto bad;
			}
			/* Keep the low-water mark within the new limit. */
			if (sb->sb_lowat > sb->sb_hiwat)
				sb->sb_lowat = sb->sb_hiwat;
		} else {
			/* Low-water mark: nonzero and at most sb_hiwat. */
			if (val == 0 || val > sb->sb_hiwat) {
				error = EINVAL;
				goto bad;
			}
			sb->sb_lowat = val;
		}
		break;

	default:
		error = ENOPROTOOPT;
		break;
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
sogetopt(so, level, optname, mp)
register struct socket *so;
int level, optname;
struct mbuf **mp;
{
register struct mbuf *m;
if (level != SOL_SOCKET) {
if (so->so_proto && so->so_proto->pr_ctloutput) {
return ((*so->so_proto->pr_ctloutput)
(PRCO_GETOPT, so, level, optname, mp));
} else
return (ENOPROTOOPT);
} else {
m = m_get(M_WAIT, MT_SOOPTS);
m->m_len = sizeof (int);
switch (optname) {
case SO_LINGER:
m->m_len = sizeof (struct linger);
mtod(m, struct linger *)->l_onoff =
so->so_options & SO_LINGER;
mtod(m, struct linger *)->l_linger = so->so_linger;
break;
case SO_USELOOPBACK:
case SO_DONTROUTE:
case SO_DEBUG:
case SO_KEEPALIVE:
case SO_REUSEADDR:
case SO_BROADCAST:
case SO_OOBINLINE:
*mtod(m, int *) = so->so_options & optname;
break;
case SO_TYPE:
*mtod(m, int *) = so->so_type;
break;
case SO_ERROR:
*mtod(m, int *) = so->so_error;
so->so_error = 0;
break;
case SO_SNDBUF:
*mtod(m, int *) = so->so_snd.sb_hiwat;
break;
case SO_RCVBUF:
*mtod(m, int *) = so->so_rcv.sb_hiwat;
break;
case SO_SNDLOWAT:
*mtod(m, int *) = so->so_snd.sb_lowat;
break;
case SO_RCVLOWAT:
*mtod(m, int *) = so->so_rcv.sb_lowat;
break;
case SO_SNDTIMEO:
*mtod(m, int *) = so->so_snd.sb_timeo;
break;
case SO_RCVTIMEO:
*mtod(m, int *) = so->so_rcv.sb_timeo;
break;
default:
(void)m_free(m);
return (ENOPROTOOPT);
}
*mp = m;
return (0);
}
}
/*
 * Announce the arrival of out-of-band data on so.
 * SIGURG goes to the process group -so_pgid when so_pgid is
 * negative, or to the process so_pgid (if pfind() locates it) when
 * positive.  Any selector recorded on the receive buffer is then
 * woken and the record cleared.
 */
sohasoutofband(so)
	register struct socket *so;
{
	struct proc *p;

	if (so->so_pgid < 0)
		gsignal(-so->so_pgid, SIGURG);
	else if (so->so_pgid > 0) {
		p = pfind(so->so_pgid);
		if (p != 0)
			psignal(p, SIGURG);
	}

	if (so->so_rcv.sb_sel == 0)
		return;
	selwakeup(so->so_rcv.sb_sel, so->so_rcv.sb_flags & SB_COLL);
	so->so_rcv.sb_sel = 0;
	so->so_rcv.sb_flags &= ~SB_COLL;
}