X-Git-Url: https://git.subgeniuskitty.com/unix-history/.git/blobdiff_plain/02c1608bc36b1ae9b88347a4e120d7be4f3334df..05db08ca40dcf52f15be693f25a96a593409f690:/usr/src/sys/netinet/tcp_output.c diff --git a/usr/src/sys/netinet/tcp_output.c b/usr/src/sys/netinet/tcp_output.c index 64c3f2b3f4..4d150a48a6 100644 --- a/usr/src/sys/netinet/tcp_output.c +++ b/usr/src/sys/netinet/tcp_output.c @@ -1,39 +1,39 @@ -/* tcp_output.c 4.43 82/08/02 */ +/* + * Copyright (c) 1982, 1986 Regents of the University of California. + * All rights reserved. The Berkeley software License Agreement + * specifies the terms and conditions for redistribution. + * + * @(#)tcp_output.c 7.10 (Berkeley) %G% + */ + +#include "param.h" +#include "systm.h" +#include "mbuf.h" +#include "protosw.h" +#include "socket.h" +#include "socketvar.h" +#include "errno.h" -#include "../h/param.h" -#include "../h/systm.h" -#include "../h/mbuf.h" -#include "../h/protosw.h" -#include "../h/socket.h" -#include "../h/socketvar.h" -#include "../net/in.h" #include "../net/route.h" -#include "../net/in_pcb.h" -#include "../net/in_systm.h" -#include "../net/ip.h" -#include "../net/ip_var.h" -#include "../net/tcp.h" -#define TCPOUTFLAGS -#include "../net/tcp_fsm.h" -#include "../net/tcp_seq.h" -#include "../net/tcp_timer.h" -#include "../net/tcp_var.h" -#include "../net/tcpip.h" -#include "../net/tcp_debug.h" -#include -char *tcpstates[]; /* XXX */ +#include "in.h" +#include "in_pcb.h" +#include "in_systm.h" +#include "ip.h" +#include "ip_var.h" +#include "tcp.h" +#define TCPOUTFLAGS +#include "tcp_fsm.h" +#include "tcp_seq.h" +#include "tcp_timer.h" +#include "tcp_var.h" +#include "tcpip.h" +#include "tcp_debug.h" /* - * Initial options: indicate max segment length 1/2 of space - * allocated for receive; if TCPTRUEOOB is defined, indicate - * willingness to do true out-of-band. + * Initial options. */ -#ifndef TCPTRUEOOB u_char tcp_initopt[4] = { TCPOPT_MAXSEG, 4, 0x0, 0x0, }; -#else -u_char tcp_initopt[6] = { TCPOPT_MAXSEG, 4, 0x0, 0x0, TCPOPT_WILLOOB, 2 }; -#endif /* * Tcp output routine: figure out what should be sent and send it. @@ -42,15 +42,14 @@ tcp_output(tp) register struct tcpcb *tp; { register struct socket *so = tp->t_inpcb->inp_socket; - register int len; + register int len, win; struct mbuf *m0; - int off, flags, win, error; + int off, flags, error; register struct mbuf *m; register struct tcpiphdr *ti; u_char *opt; unsigned optlen = 0; - int sendalot; - + int idle, sendalot; /* * Determine length of data that should be transmitted, @@ -58,70 +57,126 @@ tcp_output(tp) * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ + idle = (tp->snd_max == tp->snd_una); again: sendalot = 0; off = tp->snd_nxt - tp->snd_una; - len = MIN(so->so_snd.sb_cc, tp->snd_wnd+tp->t_force) - off; - if (len < 0) - return (0); /* ??? */ /* past FIN */ + win = MIN(tp->snd_wnd, tp->snd_cwnd); + + /* + * If in persist timeout with window of 0, send 1 byte. + * Otherwise, if window is small but nonzero + * and timer expired, we will send what we can + * and go to transmit state. + */ + if (tp->t_force) { + if (win == 0) + win = 1; + else { + tp->t_timer[TCPT_PERSIST] = 0; + tp->t_rxtshift = 0; + } + } + + len = MIN(so->so_snd.sb_cc, win) - off; + flags = tcp_outflags[tp->t_state]; + + if (len < 0) { + /* + * If FIN has been sent but not acked, + * but we haven't been called to retransmit, + * len will be -1; transmit if acking, otherwise no need. + * Otherwise, window shrank after we sent into it. + * If window shrank to 0, cancel pending retransmit + * and pull snd_nxt back to (closed) window. + * We will enter persist state below. + * If the window didn't close completely, + * just wait for an ACK. + */ + if (flags & TH_FIN) { + if (tp->t_flags & TF_ACKNOW) + len = 0; + else + return (0); + } else if (win == 0) { + tp->t_timer[TCPT_REXMT] = 0; + tp->snd_nxt = tp->snd_una; + len = 0; + } else + return (0); + } if (len > tp->t_maxseg) { len = tp->t_maxseg; sendalot = 1; } - - flags = tcp_outflags[tp->t_state]; - if (tp->snd_nxt + len < tp->snd_una + so->so_snd.sb_cc) + if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; - if (flags & (TH_SYN|TH_RST|TH_FIN)) + win = sbspace(&so->so_rcv); + + + /* + * If our state indicates that FIN should be sent + * and we have not yet done so, or we're retransmitting the FIN, + * then we need to send. + */ + if (flags & TH_FIN && + ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) + goto send; + /* + * Send if we owe peer an ACK. + */ + if (tp->t_flags & TF_ACKNOW) + goto send; + if (flags & (TH_SYN|TH_RST)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* - * Sender silly window avoidance. If can send all data, - * a maximum segment, at least 1/4 of window do it, + * Sender silly window avoidance. If connection is idle + * and can send all data, a maximum segment, + * at least a maximum default-size segment do it, * or are forced, do it; otherwise don't bother. + * If peer's buffer is tiny, then send + * when window is at least half open. + * If retransmitting (possibly after persist timer forced us + * to send into a small window), then must resend. */ if (len) { - if (len == tp->t_maxseg || off+len >= so->so_snd.sb_cc) + if (len == tp->t_maxseg) goto send; - if (len * 4 >= tp->snd_wnd) /* a lot */ + if ((idle || tp->t_flags & TF_NODELAY) && + len + off >= so->so_snd.sb_cc) goto send; if (tp->t_force) goto send; + if (len >= tp->max_sndwnd / 2) + goto send; + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) + goto send; } /* - * Send if we owe peer an ACK. + * Compare available window to amount of window + * known to peer (as advertised window less + * next expected input.) If the difference is 35% or more of the + * maximum possible window, then want to send a window update to peer. */ - if (tp->t_flags&TF_ACKNOW) - goto send; - -#ifdef TCPTRUEOOB - /* - * Send if an out of band data or ack should be transmitted. - */ - if (tp->t_oobflags&(TCPOOB_OWEACK|TCPOOB_NEEDACK))) - goto send; -#endif + if (win > 0) { + int adv = win - (tp->rcv_adv - tp->rcv_nxt); - /* - * Calculate available window in i, and also amount - * of window known to peer (as advertised window less - * next expected input.) If this is 35% or more of the - * maximum possible window, then want to send a segment to peer. - */ - win = sbspace(&so->so_rcv); - if (win > 0 && - ((100*(win-(tp->rcv_adv-tp->rcv_nxt))/so->so_rcv.sb_hiwat) >= 35)) - goto send; + if (100 * adv / so->so_rcv.sb_hiwat >= 35) + goto send; + if (adv >= 2 * tp->t_maxseg && so->so_rcv.sb_cc == 0) + goto send; + } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists - * persisting to move a zero window + * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tp->t_timer[TCPT_PERSIST] @@ -132,13 +187,14 @@ again: * is set when we are retransmitting * The output side is idle when both timers are zero. * - * If send window is closed, there is data to transmit, and no - * retransmit or persist is pending, then go to persist state, - * arranging to force out a byte to get more current window information - * if nothing happens soon. + * If send window is too small, there is data to transmit, and no + * retransmit or persist is pending, then go to persist state. + * If nothing happens soon, send when timer expires: + * if window is nonzero, transmit what we can, + * otherwise force out a byte. */ - if (tp->snd_wnd == 0 && so->so_snd.sb_cc && - tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0) { + if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && + tp->t_timer[TCPT_PERSIST] == 0) { tp->t_rxtshift = 0; tcp_setpersist(tp); } @@ -154,16 +210,33 @@ send: * be transmitted, and initialize the header from * the template for sends on this connection. */ - MGET(m, 0); - if (m == 0) + MGET(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) return (ENOBUFS); m->m_off = MMAXOFF - sizeof (struct tcpiphdr); m->m_len = sizeof (struct tcpiphdr); if (len) { + if (tp->t_force && len == 1) + tcpstat.tcps_sndprobe++; + else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { + tcpstat.tcps_sndrexmitpack++; + tcpstat.tcps_sndrexmitbyte += len; + } else { + tcpstat.tcps_sndpack++; + tcpstat.tcps_sndbyte += len; + } m->m_next = m_copy(so->so_snd.sb_mb, off, len); if (m->m_next == 0) len = 0; - } + } else if (tp->t_flags & TF_ACKNOW) + tcpstat.tcps_sndacks++; + else if (flags & (TH_SYN|TH_FIN|TH_RST)) + tcpstat.tcps_sndctrl++; + else if (SEQ_GT(tp->snd_up, tp->snd_una)) + tcpstat.tcps_sndurg++; + else + tcpstat.tcps_sndwinup++; + ti = mtod(m, struct tcpiphdr *); if (tp->t_template == 0) panic("tcp_output"); @@ -172,40 +245,31 @@ send: /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. + * If resending a FIN, be sure not to use a new sequence number. */ - ti->ti_seq = tp->snd_nxt; - ti->ti_ack = tp->rcv_nxt; -#if vax - ti->ti_seq = htonl(ti->ti_seq); - ti->ti_ack = htonl(ti->ti_ack); -#endif + if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && + tp->snd_nxt == tp->snd_max) + tp->snd_nxt--; + ti->ti_seq = htonl(tp->snd_nxt); + ti->ti_ack = htonl(tp->rcv_nxt); /* * Before ESTABLISHED, force sending of initial options * unless TCP set to not do any options. */ - if (tp->t_state < TCPS_ESTABLISHED) { - if (tp->t_flags&TF_NOOPT) - goto noopt; - opt = tcp_initopt; - optlen = sizeof (tcp_initopt); - *(u_short *)(opt + 2) = so->so_rcv.sb_hiwat / 2; -#if vax - *(u_short *)(opt + 2) = htons(*(u_short *)(opt + 2)); -#endif - } else { - if (tp->t_tcpopt == 0) - goto noopt; - opt = mtod(tp->t_tcpopt, u_char *); - optlen = tp->t_tcpopt->m_len; + opt = NULL; + if (flags & TH_SYN && (tp->t_flags & TF_NOOPT) == 0) { + u_short mss; + + mss = MIN(so->so_rcv.sb_hiwat / 2, tcp_mss(tp)); + if (mss > IP_MSS - sizeof(struct tcpiphdr)) { + opt = tcp_initopt; + optlen = sizeof (tcp_initopt); + *(u_short *)(opt + 2) = htons(mss); + } } -#ifndef TCPTRUEOOB - if (opt) -#else - if (opt || (tp->t_oobflags&(TCPOOB_OWEACK|TCPOOB_NEEDACK))) -#endif - { + if (opt) { m0 = m->m_next; - m->m_next = m_get(M_DONTWAIT); + m->m_next = m_get(M_DONTWAIT, MT_DATA); if (m->m_next == 0) { (void) m_free(m); m_freem(m0); @@ -213,35 +277,9 @@ send: } m->m_next->m_next = m0; m0 = m->m_next; - m0->m_off = MMINOFF; m0->m_len = optlen; bcopy((caddr_t)opt, mtod(m0, caddr_t), optlen); opt = (u_char *)(mtod(m0, caddr_t) + optlen); -#ifdef TCPTRUEOOB - if (tp->t_oobflags&TCPOOB_OWEACK) { -printf("tp %x send OOBACK for %x\n", tp->t_iobseq); - *opt++ = TCPOPT_OOBACK; - *opt++ = 3; - *opt++ = tp->t_iobseq; - m0->m_len += 3; - tp->t_oobflags &= ~TCPOOB_OWEACK; - /* sender should rexmt oob to force ack repeat */ - } - if (tp->t_oobflags&TCPOOB_NEEDACK) { -printf("tp %x send OOBDATA seq %x data %x\n", tp->t_oobseq, tp->t_oobc); - *opt++ = TCPOPT_OOBDATA; - *opt++ = 8; - *opt++ = tp->t_oobseq; - *opt++ = tp->t_oobc; - *(tcp_seq *)opt = tp->t_oobmark - tp->snd_nxt; -#ifdef vax - *(tcp_seq *)opt = htonl((unsigned)*(tcp_seq *)opt); -#endif - m0->m_len += 8; - TCPT_RANGESET(tp->t_timer[TCPT_OOBREXMT], - tcp_beta * tp->t_srtt, TCPTV_MIN, TCPTV_MAX); - } -#endif while (m0->m_len & 0x3) { *opt++ = TCPOPT_EOL; m0->m_len++; @@ -249,22 +287,18 @@ printf("tp %x send OOBDATA seq %x data %x\n", tp->t_oobseq, tp->t_oobc); optlen = m0->m_len; ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2; } -noopt: ti->ti_flags = flags; - win = sbspace(&so->so_rcv); - if (win < so->so_rcv.sb_hiwat / 4) /* avoid silly window */ + /* + * Calculate receive window. Don't shrink window, + * but avoid silly window syndrome. + */ + if (win < so->so_rcv.sb_hiwat / 4 && win < tp->t_maxseg) win = 0; - if (win > 0) -#if vax - ti->ti_win = htons((u_short)win); -#else - ti->ti_win = win; -#endif + if (win < (int)(tp->rcv_adv - tp->rcv_nxt)) + win = (int)(tp->rcv_adv - tp->rcv_nxt); + ti->ti_win = htons((u_short)win); if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { - ti->ti_urp = tp->snd_up - tp->snd_nxt; -#if vax - ti->ti_urp = htons(ti->ti_urp); -#endif + ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); ti->ti_flags |= TH_URG; } else /* @@ -277,9 +311,8 @@ noopt: /* * If anything to send and we can send it all, set PUSH. * (This will keep happy those implementations which only - * give data to the user when a buffer fills or a PUSH comes in. + * give data to the user when a buffer fills or a PUSH comes in.) */ -/* if (len && (ti->ti_flags & (TH_FIN|TH_RST|TH_SYN)) == 0) */ if (len && off+len == so->so_snd.sb_cc) ti->ti_flags |= TH_PUSH; @@ -287,56 +320,60 @@ noopt: * Put TCP length in extended header, and then * checksum extended header and data. */ - if (len + optlen) { - ti->ti_len = sizeof (struct tcphdr) + optlen + len; -#if vax - ti->ti_len = htons((u_short)ti->ti_len); -#endif - } + if (len + optlen) + ti->ti_len = htons((u_short)(sizeof(struct tcphdr) + + optlen + len)); ti->ti_sum = in_cksum(m, sizeof (struct tcpiphdr) + (int)optlen + len); /* * In transmit state, time the transmission and arrange for - * the retransmit. In persist state, reset persist time for - * next persist. + * the retransmit. In persist state, just set snd_max. */ - if (tp->t_force == 0) { + if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { + tcp_seq startseq = tp->snd_nxt; + /* * Advance snd_nxt over sequence space of this segment. */ - if (flags & (TH_SYN|TH_FIN)) + if (flags & TH_SYN) + tp->snd_nxt++; + if (flags & TH_FIN) { tp->snd_nxt++; + tp->t_flags |= TF_SENTFIN; + } tp->snd_nxt += len; - if (SEQ_GT(tp->snd_nxt, tp->snd_max)) + if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; - - /* - * Time this transmission if not a retransmission and - * not currently timing anything. - */ - if (SEQ_GT(tp->snd_nxt, tp->snd_max) && tp->t_rtt == 0) { - tp->t_rtt = 1; - tp->t_rtseq = tp->snd_nxt - len; + /* + * Time this transmission if not a retransmission and + * not currently timing anything. + */ + if (tp->t_rtt == 0) { + tp->t_rtt = 1; + tp->t_rtseq = startseq; + tcpstat.tcps_segstimed++; + } } /* - * Set retransmit timer if not currently set. - * Initial value for retransmit timer to tcp_beta*tp->t_srtt. - * Initialize shift counter which is used for exponential - * backoff of retransmit time. + * Set retransmit timer if not currently set, + * and not doing an ack or a keep-alive probe. + * Initial value for retransmit timer is smoothed + * round-trip time + 2 * round-trip time variance. + * Initialize shift counter which is used for backoff + * of retransmit time. */ if (tp->t_timer[TCPT_REXMT] == 0 && tp->snd_nxt != tp->snd_una) { - TCPT_RANGESET(tp->t_timer[TCPT_REXMT], - tcp_beta * tp->t_srtt, TCPTV_MIN, TCPTV_MAX); - tp->t_rtt = 0; - tp->t_rxtshift = 0; + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + if (tp->t_timer[TCPT_PERSIST]) { + tp->t_timer[TCPT_PERSIST] = 0; + tp->t_rxtshift = 0; + } } - tp->t_timer[TCPT_PERSIST] = 0; - } else { - if (SEQ_GT(tp->snd_una+1, tp->snd_max)) - tp->snd_max = tp->snd_una+1; - } + } else + if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) + tp->snd_max = tp->snd_nxt + len; /* * Trace. @@ -350,20 +387,22 @@ noopt: */ ((struct ip *)ti)->ip_len = sizeof (struct tcpiphdr) + optlen + len; ((struct ip *)ti)->ip_ttl = TCP_TTL; - if (error = ip_output(m, tp->t_ipopt, (so->so_options & SO_DONTROUTE) ? - &routetoif : &tp->t_inpcb->inp_route, 0)) + error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + so->so_options & SO_DONTROUTE); + if (error) return (error); + tcpstat.tcps_sndtotal++; /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. - * Drop send for purpose of ACK requirements. + * Any pending ACK has now been sent. */ if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + win; tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); - if (sendalot && tp->t_force == 0) + if (sendalot) goto again; return (0); } @@ -371,6 +410,7 @@ noopt: tcp_setpersist(tp) register struct tcpcb *tp; { + register t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; if (tp->t_timer[TCPT_REXMT]) panic("tcp_output REXMT"); @@ -378,9 +418,8 @@ tcp_setpersist(tp) * Start/restart persistance timer. */ TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], - ((int)(tcp_beta * tp->t_srtt)) << tp->t_rxtshift, - TCPTV_PERSMIN, TCPTV_MAX); - tp->t_rxtshift++; - if (tp->t_rxtshift >= TCP_MAXRXTSHIFT) - tp->t_rxtshift = 0; + t * tcp_backoff[tp->t_rxtshift], + TCPTV_PERSMIN, TCPTV_PERSMAX); + if (tp->t_rxtshift < TCP_MAXRXTSHIFT) + tp->t_rxtshift++; }