-/* tcp_output.c 4.30 82/01/18 */
+/*
+ * Copyright (c) 1982, 1986 Regents of the University of California.
+ * All rights reserved. The Berkeley software License Agreement
+ * specifies the terms and conditions for redistribution.
+ *
+ * @(#)tcp_output.c 7.10 (Berkeley) %G%
+ */
-#include "../h/param.h"
-#include "../h/systm.h"
-#include "../h/mbuf.h"
-#include "../h/protosw.h"
-#include "../h/socket.h"
-#include "../h/socketvar.h"
-#include "../net/in.h"
-#include "../net/in_pcb.h"
-#include "../net/in_systm.h"
-#include "../net/ip.h"
-#include "../net/ip_var.h"
-#include "../net/tcp.h"
-#define TCPOUTFLAGS
-#include "../net/tcp_fsm.h"
-#include "../net/tcp_seq.h"
-#include "../net/tcp_timer.h"
-#include "../net/tcp_var.h"
-#include "../net/tcpip.h"
-#include "../net/tcp_debug.h"
-#include "../errno.h"
+#include "param.h"
+#include "systm.h"
+#include "mbuf.h"
+#include "protosw.h"
+#include "socket.h"
+#include "socketvar.h"
+#include "errno.h"
-char *tcpstates[]; /* XXX */
+#include "../net/route.h"
+
+#include "in.h"
+#include "in_pcb.h"
+#include "in_systm.h"
+#include "ip.h"
+#include "ip_var.h"
+#include "tcp.h"
+#define TCPOUTFLAGS
+#include "tcp_fsm.h"
+#include "tcp_seq.h"
+#include "tcp_timer.h"
+#include "tcp_var.h"
+#include "tcpip.h"
+#include "tcp_debug.h"
/*
- * Initial options: indicate max segment length 1/2 of space
- * allocated for receive; if TCPTRUEOOB is defined, indicate
- * willingness to do true out-of-band.
+ * Initial options.
*/
-#ifndef TCPTRUEOOB
u_char tcp_initopt[4] = { TCPOPT_MAXSEG, 4, 0x0, 0x0, };
-#else
-u_char tcp_initopt[6] = { TCPOPT_MAXSEG, 4, 0x0, 0x0, TCPOPT_WILLOOB, 2 };
-#endif
/*
* Tcp output routine: figure out what should be sent and send it.
register struct tcpcb *tp;
{
register struct socket *so = tp->t_inpcb->inp_socket;
- register int len;
+ register int len, win;
struct mbuf *m0;
- int off, flags;
+ int off, flags, error;
register struct mbuf *m;
register struct tcpiphdr *ti;
- int win, force;
u_char *opt;
unsigned optlen = 0;
-
-COUNT(TCP_OUTPUT);
+ int idle, sendalot;
/*
- * Determine length of data that can be transmitted,
+ * Determine length of data that should be transmitted,
* and flags that will be used.
* If there is some data or critical controls (SYN, RST)
* to send, then transmit; otherwise, investigate further.
*/
+ idle = (tp->snd_max == tp->snd_una);
+again:
+ sendalot = 0;
off = tp->snd_nxt - tp->snd_una;
- len = MIN(so->so_snd.sb_cc, tp->snd_wnd+tp->t_force) - off;
- if (len < 0)
- return; /* past FIN */
- if (len > tp->t_maxseg)
- len = tp->t_maxseg;
+ win = MIN(tp->snd_wnd, tp->snd_cwnd);
+
+ /*
+ * If in persist timeout with window of 0, send 1 byte.
+ * Otherwise, if window is small but nonzero
+ * and timer expired, we will send what we can
+ * and go to transmit state.
+ */
+ if (tp->t_force) {
+ if (win == 0)
+ win = 1;
+ else {
+ tp->t_timer[TCPT_PERSIST] = 0;
+ tp->t_rxtshift = 0;
+ }
+ }
+
+ len = MIN(so->so_snd.sb_cc, win) - off;
flags = tcp_outflags[tp->t_state];
- if (tp->snd_nxt + len < tp->snd_una + so->so_snd.sb_cc)
+
+ if (len < 0) {
+ /*
+ * If FIN has been sent but not acked,
+ * but we haven't been called to retransmit,
+ * len will be -1; transmit if acking, otherwise no need.
+ * Otherwise, window shrank after we sent into it.
+ * If window shrank to 0, cancel pending retransmit
+ * and pull snd_nxt back to (closed) window.
+ * We will enter persist state below.
+ * If the window didn't close completely,
+ * just wait for an ACK.
+ */
+ if (flags & TH_FIN) {
+ if (tp->t_flags & TF_ACKNOW)
+ len = 0;
+ else
+ return (0);
+ } else if (win == 0) {
+ tp->t_timer[TCPT_REXMT] = 0;
+ tp->snd_nxt = tp->snd_una;
+ len = 0;
+ } else
+ return (0);
+ }
+ if (len > tp->t_maxseg) {
+ len = tp->t_maxseg;
+ sendalot = 1;
+ }
+ if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
flags &= ~TH_FIN;
- if (len || (flags & (TH_SYN|TH_RST|TH_FIN)))
- goto send;
+ win = sbspace(&so->so_rcv);
+
+ /*
+ * If our state indicates that FIN should be sent
+ * and we have not yet done so, or we're retransmitting the FIN,
+ * then we need to send.
+ */
+ if (flags & TH_FIN &&
+ ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
+ goto send;
/*
* Send if we owe peer an ACK.
*/
- if (tp->t_flags&TF_ACKNOW)
+ if (tp->t_flags & TF_ACKNOW)
+ goto send;
+ if (flags & (TH_SYN|TH_RST))
+ goto send;
+ if (SEQ_GT(tp->snd_up, tp->snd_una))
goto send;
-#ifdef TCPTRUEOOB
/*
- * Send if an out of band data or ack should be transmitted.
+ * Sender silly window avoidance. If the connection is idle
+ * and we can send all the data, or we can send a maximum segment
+ * (or at least a maximum default-size segment), or we are
+ * forced, do it; otherwise don't bother.
+ * If peer's buffer is tiny, then send
+ * when window is at least half open.
+ * If retransmitting (possibly after persist timer forced us
+ * to send into a small window), then must resend.
*/
- if (tp->t_oobflags&(TCPOOB_OWEACK|TCPOOB_NEEDACK)))
- goto send;
-#endif
+ if (len) {
+ if (len == tp->t_maxseg)
+ goto send;
+ if ((idle || tp->t_flags & TF_NODELAY) &&
+ len + off >= so->so_snd.sb_cc)
+ goto send;
+ if (tp->t_force)
+ goto send;
+ if (len >= tp->max_sndwnd / 2)
+ goto send;
+ if (SEQ_LT(tp->snd_nxt, tp->snd_max))
+ goto send;
+ }
/*
- * Calculate available window in i, and also amount
- * of window known to peer (as advertised window less
- * next expected input.) If this is 35% or more of the
- * maximum possible window, then want to send a segment to peer.
+ * Compare available window to amount of window
+ * known to peer (as advertised window less
+ * next expected input.) If the difference is 35% or more of the
+ * maximum possible window, then want to send a window update to peer.
*/
- win = sbspace(&so->so_rcv);
- if (win > 0 &&
- ((100*(win-(tp->rcv_adv-tp->rcv_nxt))/so->so_rcv.sb_hiwat) >= 35))
- goto send;
+ if (win > 0) {
+ int adv = win - (tp->rcv_adv - tp->rcv_nxt);
+
+ if (100 * adv / so->so_rcv.sb_hiwat >= 35)
+ goto send;
+ if (adv >= 2 * tp->t_maxseg && so->so_rcv.sb_cc == 0)
+ goto send;
+ }
+
+ /*
+ * TCP window updates are not reliable, rather a polling protocol
+ * using ``persist'' packets is used to insure receipt of window
+ * updates. The three ``states'' for the output side are:
+ * idle not doing retransmits or persists
+ * persisting to move a small or zero window
+ * (re)transmitting and thereby not persisting
+ *
+ * tp->t_timer[TCPT_PERSIST]
+ * is set when we are in persist state.
+ * tp->t_force
+ * is set when we are called to send a persist packet.
+ * tp->t_timer[TCPT_REXMT]
+ * is set when we are retransmitting
+ * The output side is idle when both timers are zero.
+ *
+ * If send window is too small, there is data to transmit, and no
+ * retransmit or persist is pending, then go to persist state.
+ * If nothing happens soon, send when timer expires:
+ * if window is nonzero, transmit what we can,
+ * otherwise force out a byte.
+ */
+ if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
+ tp->t_timer[TCPT_PERSIST] == 0) {
+ tp->t_rxtshift = 0;
+ tcp_setpersist(tp);
+ }
/*
* No reason to send a segment, just return.
* be transmitted, and initialize the header from
* the template for sends on this connection.
*/
- MGET(m, 0);
- if (m == 0)
- return (0);
+ MGET(m, M_DONTWAIT, MT_HEADER);
+ if (m == NULL)
+ return (ENOBUFS);
m->m_off = MMAXOFF - sizeof (struct tcpiphdr);
m->m_len = sizeof (struct tcpiphdr);
if (len) {
+ if (tp->t_force && len == 1)
+ tcpstat.tcps_sndprobe++;
+ else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
+ tcpstat.tcps_sndrexmitpack++;
+ tcpstat.tcps_sndrexmitbyte += len;
+ } else {
+ tcpstat.tcps_sndpack++;
+ tcpstat.tcps_sndbyte += len;
+ }
m->m_next = m_copy(so->so_snd.sb_mb, off, len);
if (m->m_next == 0)
len = 0;
- }
+ } else if (tp->t_flags & TF_ACKNOW)
+ tcpstat.tcps_sndacks++;
+ else if (flags & (TH_SYN|TH_FIN|TH_RST))
+ tcpstat.tcps_sndctrl++;
+ else if (SEQ_GT(tp->snd_up, tp->snd_una))
+ tcpstat.tcps_sndurg++;
+ else
+ tcpstat.tcps_sndwinup++;
+
ti = mtod(m, struct tcpiphdr *);
if (tp->t_template == 0)
panic("tcp_output");
/*
* Fill in fields, remembering maximum advertised
* window for use in delaying messages about window sizes.
+ * If resending a FIN, be sure not to use a new sequence number.
*/
- ti->ti_seq = tp->snd_nxt;
- ti->ti_ack = tp->rcv_nxt;
-#if vax
- ti->ti_seq = htonl(ti->ti_seq);
- ti->ti_ack = htonl(ti->ti_ack);
-#endif
+ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
+ tp->snd_nxt == tp->snd_max)
+ tp->snd_nxt--;
+ ti->ti_seq = htonl(tp->snd_nxt);
+ ti->ti_ack = htonl(tp->rcv_nxt);
/*
* Before ESTABLISHED, force sending of initial options
* unless TCP set to not do any options.
*/
- if (tp->t_state < TCPS_ESTABLISHED) {
- if (tp->t_flags&TF_NOOPT)
- goto noopt;
- opt = tcp_initopt;
- optlen = sizeof (tcp_initopt);
- *(u_short *)(opt + 2) = so->so_rcv.sb_hiwat / 2;
-#if vax
- *(u_short *)(opt + 2) = htons(*(u_short *)(opt + 2));
-#endif
- } else {
- if (tp->t_tcpopt == 0)
- goto noopt;
- opt = mtod(tp->t_tcpopt, u_char *);
- optlen = tp->t_tcpopt->m_len;
+ opt = NULL;
+ if (flags & TH_SYN && (tp->t_flags & TF_NOOPT) == 0) {
+ u_short mss;
+
+ mss = MIN(so->so_rcv.sb_hiwat / 2, tcp_mss(tp));
+ if (mss > IP_MSS - sizeof(struct tcpiphdr)) {
+ opt = tcp_initopt;
+ optlen = sizeof (tcp_initopt);
+ *(u_short *)(opt + 2) = htons(mss);
+ }
}
-#ifndef TCPTRUEOOB
- if (opt)
-#else
- if (opt || (tp->t_oobflags&(TCPOOB_OWEACK|TCPOOB_NEEDACK)))
-#endif
- {
+ if (opt) {
m0 = m->m_next;
- m->m_next = m_get(0);
+ m->m_next = m_get(M_DONTWAIT, MT_DATA);
if (m->m_next == 0) {
(void) m_free(m);
m_freem(m0);
- return (0);
+ return (ENOBUFS);
}
m->m_next->m_next = m0;
m0 = m->m_next;
- m0->m_off = MMINOFF;
m0->m_len = optlen;
- bcopy(opt, mtod(m0, caddr_t), optlen);
+ bcopy((caddr_t)opt, mtod(m0, caddr_t), optlen);
opt = (u_char *)(mtod(m0, caddr_t) + optlen);
-#ifdef TCPTRUEOOB
- if (tp->t_oobflags&TCPOOB_OWEACK) {
-printf("tp %x send OOBACK for %x\n", tp->t_iobseq);
- *opt++ = TCPOPT_OOBACK;
- *opt++ = 3;
- *opt++ = tp->t_iobseq;
- m0->m_len += 3;
- tp->t_oobflags &= ~TCPOOB_OWEACK;
- /* sender should rexmt oob to force ack repeat */
- }
- if (tp->t_oobflags&TCPOOB_NEEDACK) {
-printf("tp %x send OOBDATA seq %x data %x\n", tp->t_oobseq, tp->t_oobc);
- *opt++ = TCPOPT_OOBDATA;
- *opt++ = 8;
- *opt++ = tp->t_oobseq;
- *opt++ = tp->t_oobc;
- *(tcp_seq *)opt = tp->t_oobmark - tp->snd_nxt;
-#ifdef vax
- *(tcp_seq *)opt = htonl((unsigned)*(tcp_seq *)opt);
-#endif
- m0->m_len += 8;
- TCPT_RANGESET(tp->t_timer[TCPT_OOBREXMT],
- tcp_beta * tp->t_srtt, TCPTV_MIN, TCPTV_MAX);
- }
-#endif
while (m0->m_len & 0x3) {
*opt++ = TCPOPT_EOL;
m0->m_len++;
optlen = m0->m_len;
ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2;
}
-noopt:
ti->ti_flags = flags;
- win = sbspace(&so->so_rcv);
- if (win > 0)
- ti->ti_win = htons((u_short)win);
+ /*
+ * Calculate receive window. Don't shrink window,
+ * but avoid silly window syndrome.
+ */
+ if (win < so->so_rcv.sb_hiwat / 4 && win < tp->t_maxseg)
+ win = 0;
+ if (win < (int)(tp->rcv_adv - tp->rcv_nxt))
+ win = (int)(tp->rcv_adv - tp->rcv_nxt);
+ ti->ti_win = htons((u_short)win);
if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
- ti->ti_urp = tp->snd_up - tp->snd_nxt;
-#if vax
- ti->ti_urp = htons(ti->ti_urp);
-#endif
+ ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
ti->ti_flags |= TH_URG;
} else
/*
* number wraparound.
*/
tp->snd_up = tp->snd_una; /* drag it along */
- /* PUSH */
-
/*
- * Put TCP length in extended header, and then
- * checksum extended header and data.
+ * If anything to send and we can send it all, set PUSH.
+ * (This will keep happy those implementations which only
+ * give data to the user when a buffer fills or a PUSH comes in.)
*/
- if (len + optlen) {
- ti->ti_len = sizeof (struct tcphdr) + optlen + len;
-#if vax
- ti->ti_len = htons((u_short)ti->ti_len);
-#endif
- }
- ti->ti_sum = in_cksum(m, sizeof (struct tcpiphdr) + optlen + len);
+ if (len && off+len == so->so_snd.sb_cc)
+ ti->ti_flags |= TH_PUSH;
/*
- * Advance snd_nxt over sequence space of this segment
+ * Put TCP length in extended header, and then
+ * checksum extended header and data.
*/
- if (flags & (TH_SYN|TH_FIN))
- tp->snd_nxt++;
- tp->snd_nxt += len;
+ if (len + optlen)
+ ti->ti_len = htons((u_short)(sizeof(struct tcphdr) +
+ optlen + len));
+ ti->ti_sum = in_cksum(m, sizeof (struct tcpiphdr) + (int)optlen + len);
/*
- * If this transmission closes the window,
- * start persistance timer at 2 round trip times
- * but at least TCPTV_PERSMIN ticks.
+ * In transmit state, time the transmission and arrange for
+ * the retransmit. In persist state, just set snd_max.
*/
- if (TCPS_HAVERCVDSYN(tp->t_state) &&
- SEQ_GEQ(tp->snd_nxt, tp->snd_una+tp->snd_wnd) &&
- tp->t_timer[TCPT_PERSIST] == 0)
- TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
- 2 * tp->t_srtt, TCPTV_PERSMIN, TCPTV_MAX);
+ if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
+ tcp_seq startseq = tp->snd_nxt;
- /*
- * Time this transmission if not a retransmission and
- * not currently timing anything.
- */
- if (SEQ_GT(tp->snd_nxt, tp->snd_max) && tp->t_rtt == 0) {
- tp->t_rtt = 1;
- tp->t_rtseq = tp->snd_nxt - len;
- }
+ /*
+ * Advance snd_nxt over sequence space of this segment.
+ */
+ if (flags & TH_SYN)
+ tp->snd_nxt++;
+ if (flags & TH_FIN) {
+ tp->snd_nxt++;
+ tp->t_flags |= TF_SENTFIN;
+ }
+ tp->snd_nxt += len;
+ if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
+ tp->snd_max = tp->snd_nxt;
+ /*
+ * Time this transmission if not a retransmission and
+ * not currently timing anything.
+ */
+ if (tp->t_rtt == 0) {
+ tp->t_rtt = 1;
+ tp->t_rtseq = startseq;
+ tcpstat.tcps_segstimed++;
+ }
+ }
- /*
- * Set retransmit timer if not currently set.
- * Initial value for retransmit timer to tcp_beta*tp->t_srtt.
- * Initialize shift counter which is used for exponential
- * backoff of retransmit time.
- */
- if (tp->t_timer[TCPT_REXMT] == 0 && tp->snd_nxt != tp->snd_una) {
- TCPT_RANGESET(tp->t_timer[TCPT_REXMT],
- tcp_beta * tp->t_srtt, TCPTV_MIN, TCPTV_MAX);
- tp->t_rtt = 0;
- tp->t_rxtshift = 0;
- }
+ /*
+ * Set retransmit timer if not currently set,
+ * and not doing an ack or a keep-alive probe.
+ * Initial value for retransmit timer is smoothed
+ * round-trip time + 2 * round-trip time variance.
+ * Initialize shift counter which is used for backoff
+ * of retransmit time.
+ */
+ if (tp->t_timer[TCPT_REXMT] == 0 &&
+ tp->snd_nxt != tp->snd_una) {
+ tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
+ if (tp->t_timer[TCPT_PERSIST]) {
+ tp->t_timer[TCPT_PERSIST] = 0;
+ tp->t_rxtshift = 0;
+ }
+ }
+ } else
+ if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
+ tp->snd_max = tp->snd_nxt + len;
/*
* Trace.
*/
- if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
+ if (so->so_options & SO_DEBUG)
tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0);
/*
*/
((struct ip *)ti)->ip_len = sizeof (struct tcpiphdr) + optlen + len;
((struct ip *)ti)->ip_ttl = TCP_TTL;
- if (ip_output(m, tp->t_ipopt) == 0)
- return (0);
+ error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
+ so->so_options & SO_DONTROUTE);
+ if (error)
+ return (error);
+ tcpstat.tcps_sndtotal++;
/*
* Data sent (as far as we can tell).
* If this advertises a larger window than any other segment,
* then remember the size of the advertised window.
- * Drop send for purpose of ACK requirements.
+ * Any pending ACK has now been sent.
*/
if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + win;
tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
- if (SEQ_GT(tp->snd_nxt, tp->snd_max))
- tp->snd_max = tp->snd_nxt;
- return (1);
+ if (sendalot)
+ goto again;
+ return (0);
+}
+
+/*
+ * Arm (or re-arm) the persist timer for the connection described by tp
+ * and step the backoff index t_rxtshift.  Panics if the retransmit
+ * timer is set when called.
+ */
+tcp_setpersist(tp)
+	register struct tcpcb *tp;
+{
+	register base;
+
+	/* Refuse to start persisting while a retransmit is pending. */
+	if (tp->t_timer[TCPT_REXMT] != 0)
+		panic("tcp_output REXMT");
+
+	/*
+	 * Start/restart persistence timer.  The base interval is
+	 * ((t_srtt >> 2) + t_rttvar) >> 1, multiplied by the backoff
+	 * entry selected by t_rxtshift.  TCPT_RANGESET is handed
+	 * TCPTV_PERSMIN and TCPTV_PERSMAX; presumably it bounds the
+	 * timer to that range -- confirm against tcp_timer.h.
+	 */
+	base = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
+	TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
+	    base * tcp_backoff[tp->t_rxtshift],
+	    TCPTV_PERSMIN, TCPTV_PERSMAX);
+
+	/* Move to the next backoff entry, stopping at TCP_MAXRXTSHIFT. */
+	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
+		tp->t_rxtshift += 1;
 }