slow-start changes from van
[unix-history] / usr / src / sys / netinet / tcp_output.c
/*
* Copyright (c) 1982, 1986 Regents of the University of California.
* All rights reserved. The Berkeley software License Agreement
* specifies the terms and conditions for redistribution.
*
* @(#)tcp_output.c 7.5 (Berkeley) %G%
*/
#include "param.h"
#include "systm.h"
#include "mbuf.h"
#include "protosw.h"
#include "socket.h"
#include "socketvar.h"
#include "errno.h"
#include "../net/route.h"
#include "in.h"
#include "in_pcb.h"
#include "in_systm.h"
#include "ip.h"
#include "ip_var.h"
#include "tcp.h"
#define TCPOUTFLAGS
#include "tcp_fsm.h"
#include "tcp_seq.h"
#include "tcp_timer.h"
#include "tcp_var.h"
#include "tcpip.h"
#include "tcp_debug.h"
/*
 * Option bytes sent while a connection is being set up: a single
 * maximum-segment-size option (kind TCPOPT_MAXSEG, total length 4).
 * The two value bytes start out as zero; tcp_output() stores the
 * MSS into them, in network byte order, before copying the option
 * into an outgoing segment.
 */
u_char tcp_initopt[4] = { TCPOPT_MAXSEG, 4, 0, 0 };
/*
* Tcp output routine: figure out what should be sent and send it.
*
* tp is the TCP control block of the connection to service.
*
* Returns 0 if a segment was handed to ip_output() without error or if
* there was nothing worth sending, ENOBUFS if an mbuf could not be
* allocated, or whatever error ip_output() reported.
*
* When more than t_maxseg bytes are sendable the routine loops (via
* the "again" label) and emits one segment per pass.
*/
tcp_output(tp)
register struct tcpcb *tp;
{
register struct socket *so = tp->t_inpcb->inp_socket;
register int len, win;
struct mbuf *m0;
int off, flags, error;
register struct mbuf *m;
register struct tcpiphdr *ti;
u_char *opt;
unsigned optlen = 0;
int idle, sendalot;
/*
* Locals:
* len       number of data bytes carried by this segment
* win       first the usable send window, later (after the length
*           has been settled) the receive window to advertise
* off       distance of snd_nxt from snd_una; also the offset into
*           so_snd at which the data for this segment starts
* optlen    length of the TCP options, 0 when none are sent
* idle      set when snd_max == snd_una on entry
* sendalot  set when the data was clipped to t_maxseg, so another
*           pass through "again" is wanted
*/
/*
* Determine length of data that should be transmitted,
* and flags that will be used.
* If there is some data or critical controls (SYN, RST)
* to send, then transmit; otherwise, investigate further.
*/
idle = (tp->snd_max == tp->snd_una);
again:
sendalot = 0;
off = tp->snd_nxt - tp->snd_una;
/*
* The usable send window is the smaller of snd_wnd and snd_cwnd.
*/
win = MIN(tp->snd_wnd, tp->snd_cwnd);
/*
* If in persist timeout with window of 0, send 1 byte.
* Otherwise, if window is small but nonzero
* and timer expired, we will send what we can
* and go to transmit state.
*/
if (tp->t_force) {
if (win == 0)
win = 1;
else {
tp->t_timer[TCPT_PERSIST] = 0;
tp->t_rxtshift = 0;
}
}
/*
* Buffered data that fits in the window, less the part that lies
* before snd_nxt.  The result is negative when snd_nxt is already
* beyond what may be sent.
*/
len = MIN(so->so_snd.sb_cc, win) - off;
/*
* Control flags for this segment come from the per-state table.
*/
flags = tcp_outflags[tp->t_state];
if (len < 0) {
/*
* If FIN has been sent but not acked,
* but we haven't been called to retransmit,
* len will be -1; transmit if acking, otherwise no need.
* Otherwise, window shrank after we sent into it.
* If window shrank to 0, cancel pending retransmit
* and pull snd_nxt back to (closed) window.
* We will enter persist state below.
* If the window didn't close completely,
* just wait for an ACK.
*/
if (flags & TH_FIN) {
if (tp->t_flags & TF_ACKNOW)
len = 0;
else
return (0);
} else if (win == 0) {
tp->t_timer[TCPT_REXMT] = 0;
tp->snd_nxt = tp->snd_una;
len = 0;
} else
return (0);
}
/*
* Never put more than one maximum segment into a single packet;
* remember to come back for the remainder.
*/
if (len > tp->t_maxseg) {
len = tp->t_maxseg;
sendalot = 1;
}
/*
* Hold back the FIN unless this segment reaches the end of the
* data in the send buffer.
*/
if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
flags &= ~TH_FIN;
/*
* From here on win is the free space in the receive buffer,
* i.e. the basis for the window we advertise.
*/
win = sbspace(&so->so_rcv);
/*
* If our state indicates that FIN should be sent
* and we have not yet done so, or we're retransmitting the FIN,
* then we need to send.
*/
if (flags & TH_FIN &&
((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
goto send;
/*
* Send if we owe peer an ACK.
*/
if (tp->t_flags & TF_ACKNOW)
goto send;
/*
* SYN and RST are always sent, as is a segment announcing
* an urgent pointer that lies beyond snd_una.
*/
if (flags & (TH_SYN|TH_RST))
goto send;
if (SEQ_GT(tp->snd_up, tp->snd_una))
goto send;
/*
* Sender silly window avoidance.  With data in hand, send now
* only if one of the following holds:
*  - the data fills a segment (t_maxseg, or at least TCP_MSS bytes);
*  - the segment empties the send buffer and either the
*    connection is idle or TF_NODELAY is set;
*  - we are being forced (t_force);
*  - the data is at least half of max_sndwnd, so that a peer
*    whose buffer is tiny is still served once its window is
*    half open;
*  - we are retransmitting (snd_nxt is behind snd_max), possibly
*    after the persist timer forced us to send into a small window.
* Otherwise don't bother.
*/
if (len) {
if (len == tp->t_maxseg || len >= TCP_MSS) /* a lot */
goto send;
if ((idle || tp->t_flags & TF_NODELAY) &&
len + off >= so->so_snd.sb_cc)
goto send;
if (tp->t_force)
goto send;
if (len >= tp->max_sndwnd / 2)
goto send;
if (SEQ_LT(tp->snd_nxt, tp->snd_max))
goto send;
}
/*
* Compare available window to amount of window
* known to peer (as advertised window less
* next expected input.) If the difference is 35% or more of the
* maximum possible window, then want to send a window update to peer.
*
* NOTE(review): the percentage divides by so_rcv.sb_hiwat; this
* assumes the high-water mark is never zero -- confirm.
*/
if (win > 0 &&
((100*(win-(tp->rcv_adv-tp->rcv_nxt))/so->so_rcv.sb_hiwat) >= 35))
goto send;
/*
* TCP window updates are not reliable, rather a polling protocol
* using ``persist'' packets is used to ensure receipt of window
* updates. The three ``states'' for the output side are:
* idle not doing retransmits or persists
* persisting to move a small or zero window
* (re)transmitting and thereby not persisting
*
* tp->t_timer[TCPT_PERSIST]
* is set when we are in persist state.
* tp->t_force
* is set when we are called to send a persist packet.
* tp->t_timer[TCPT_REXMT]
* is set when we are retransmitting
* The output side is idle when both timers are zero.
*
* If send window is too small, there is data to transmit, and no
* retransmit or persist is pending, then go to persist state.
* If nothing happens soon, send when timer expires:
* if window is nonzero, transmit what we can,
* otherwise force out a byte.
*/
if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
tp->t_timer[TCPT_PERSIST] == 0) {
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
/*
* No reason to send a segment, just return.
*/
return (0);
send:
/*
* Grab a header mbuf, attaching a copy of data to
* be transmitted, and initialize the header from
* the template for sends on this connection.
*/
MGET(m, M_DONTWAIT, MT_HEADER);
if (m == NULL)
return (ENOBUFS);
/*
* The header sits at offset MMAXOFF - sizeof (struct tcpiphdr)
* in the mbuf; presumably this keeps the space in front of it
* free for lower-level headers -- confirm against mbuf.h.
*/
m->m_off = MMAXOFF - sizeof (struct tcpiphdr);
m->m_len = sizeof (struct tcpiphdr);
/*
* Count the segment under the statistic that describes it
* (probe, retransmission, new data, pure ACK, control, urgent
* or window update) and, if there is data, chain a copy of it
* behind the header.
*/
if (len) {
if (tp->t_force && len == 1)
tcpstat.tcps_sndprobe++;
else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
tcpstat.tcps_sndrexmitpack++;
tcpstat.tcps_sndrexmitbyte += len;
} else {
tcpstat.tcps_sndpack++;
tcpstat.tcps_sndbyte += len;
}
m->m_next = m_copy(so->so_snd.sb_mb, off, len);
/*
* If the copy failed, carry on with a data-less segment.
* NOTE(review): the byte/packet counters above have already
* been charged with the original len, and when sendalot is set
* the "goto again" at the bottom repeats without snd_nxt having
* advanced -- worth confirming this cannot spin.
*/
if (m->m_next == 0)
len = 0;
} else if (tp->t_flags & TF_ACKNOW)
tcpstat.tcps_sndacks++;
else if (flags & (TH_SYN|TH_FIN|TH_RST))
tcpstat.tcps_sndctrl++;
else if (SEQ_GT(tp->snd_up, tp->snd_una))
tcpstat.tcps_sndurg++;
else
tcpstat.tcps_sndwinup++;
ti = mtod(m, struct tcpiphdr *);
/*
* A connection that gets this far must have a header template.
*/
if (tp->t_template == 0)
panic("tcp_output");
bcopy((caddr_t)tp->t_template, (caddr_t)ti, sizeof (struct tcpiphdr));
/*
* Fill in fields, remembering maximum advertised
* window for use in delaying messages about window sizes.
* If resending a FIN, be sure not to use a new sequence number.
* (snd_nxt is backed up by one here; the FIN handling further
* down advances it again unless we are in persist state.)
*/
if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && len == 0)
tp->snd_nxt--;
ti->ti_seq = htonl(tp->snd_nxt);
ti->ti_ack = htonl(tp->rcv_nxt);
/*
* Before ESTABLISHED, force sending of initial options
* unless TCP set to not do any options.
* The maximum segment size offered is the smaller of half the
* receive buffer's high-water mark and tcp_mss(tp); the option
* is only sent when that exceeds IP_MSS less the TCP/IP header.
* Note that the value is written into the global tcp_initopt,
* which then serves as the source of the option bytes.
* Once established, any options stored in t_tcpopt are sent instead.
*/
opt = NULL;
if (tp->t_state < TCPS_ESTABLISHED && (tp->t_flags & TF_NOOPT) == 0) {
u_short mss;
mss = MIN(so->so_rcv.sb_hiwat / 2, tcp_mss(tp));
if (mss > IP_MSS - sizeof(struct tcpiphdr)) {
opt = tcp_initopt;
optlen = sizeof (tcp_initopt);
*(u_short *)(opt + 2) = htons(mss);
}
} else if (tp->t_tcpopt) {
opt = mtod(tp->t_tcpopt, u_char *);
optlen = tp->t_tcpopt->m_len;
}
/*
* Options travel in an mbuf of their own, linked between the
* header mbuf and the data chain.  They are padded with
* TCPOPT_EOL to a multiple of four bytes, and the header's data
* offset (counted in 32-bit words) is enlarged to cover them.
*/
if (opt) {
m0 = m->m_next;
m->m_next = m_get(M_DONTWAIT, MT_DATA);
if (m->m_next == 0) {
/*
* m->m_next is null at this point, so the data chain
* (still held in m0) has to be released separately
* from the header mbuf.
*/
(void) m_free(m);
m_freem(m0);
return (ENOBUFS);
}
m->m_next->m_next = m0;
m0 = m->m_next;
m0->m_len = optlen;
bcopy((caddr_t)opt, mtod(m0, caddr_t), optlen);
opt = (u_char *)(mtod(m0, caddr_t) + optlen);
while (m0->m_len & 0x3) {
*opt++ = TCPOPT_EOL;
m0->m_len++;
}
optlen = m0->m_len;
ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2;
}
ti->ti_flags = flags;
/*
* Calculate receive window. Don't shrink window,
* but avoid silly window syndrome.
* Space smaller than both a quarter of the receive buffer and
* one maximum segment is advertised as zero; the result is then
* raised, if need be, to what the peer has already been
* promised (rcv_adv - rcv_nxt).
*/
if (win < so->so_rcv.sb_hiwat / 4 && win < tp->t_maxseg)
win = 0;
if (win < (int)(tp->rcv_adv - tp->rcv_nxt))
win = (int)(tp->rcv_adv - tp->rcv_nxt);
ti->ti_win = htons((u_short)win);
/*
* The urgent pointer is sent as an offset from this segment's
* sequence number.
*/
if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
ti->ti_flags |= TH_URG;
} else
/*
* If no urgent pointer to send, then we pull
* the urgent pointer to the left edge of the send window
* so that it doesn't drift into the send window on sequence
* number wraparound.
*/
tp->snd_up = tp->snd_una; /* drag it along */
/*
* If anything to send and we can send it all, set PUSH.
* (This will keep happy those implementations which only
* give data to the user when a buffer fills or a PUSH comes in.)
*/
if (len && off+len == so->so_snd.sb_cc)
ti->ti_flags |= TH_PUSH;
/*
* Put TCP length in extended header, and then
* checksum extended header and data.
* With neither options nor data the length copied from the
* template is left as it is.
*/
if (len + optlen)
ti->ti_len = htons((u_short)(sizeof(struct tcphdr) +
optlen + len));
ti->ti_sum = in_cksum(m, sizeof (struct tcpiphdr) + (int)optlen + len);
/*
* In transmit state, time the transmission and arrange for
* the retransmit. In persist state, just set snd_max.
* (Persist state here means t_force is set while the persist
* timer is still running.)
*/
if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
/*
* Advance snd_nxt over sequence space of this segment.
* SYN and FIN each take up one sequence number.
*/
if (flags & TH_SYN)
tp->snd_nxt++;
if (flags & TH_FIN) {
tp->snd_nxt++;
tp->t_flags |= TF_SENTFIN;
}
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
/*
* Time this transmission if not a retransmission and
* not currently timing anything.
* t_rtseq records snd_nxt - len as the sequence
* number being timed.
*/
if (tp->t_rtt == 0) {
tp->t_rtt = 1;
tp->t_rtseq = tp->snd_nxt - len;
tcpstat.tcps_segstimed++;
}
}
/*
* Set retransmit timer if not currently set,
* and not doing an ack or a keep-alive probe.
* Initial value for retransmit timer is tcp_beta*tp->t_srtt
* (with TCPTV_SRTTDFLT standing in while t_srtt is zero),
* kept within TCPTV_MIN..TCPTV_MAX.
* Initialize shift counter which is used for backoff
* of retransmit time.
* Starting the retransmit timer also cancels the persist timer.
*/
if (tp->t_timer[TCPT_REXMT] == 0 &&
tp->snd_nxt != tp->snd_una) {
TCPT_RANGESET(tp->t_timer[TCPT_REXMT],
tcp_beta * (tp->t_srtt ? tp->t_srtt : TCPTV_SRTTDFLT),
TCPTV_MIN, TCPTV_MAX);
tp->t_rxtshift = 0;
tp->t_timer[TCPT_PERSIST] = 0;
}
} else
if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
tp->snd_max = tp->snd_nxt + len;
/*
* Trace.
*/
if (so->so_options & SO_DEBUG)
tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0);
/*
* Fill in IP length and desired time to live and
* send to IP level.
* ip_len is stored here without htons(); presumably ip_output()
* expects it in host byte order -- confirm.
*/
((struct ip *)ti)->ip_len = sizeof (struct tcpiphdr) + optlen + len;
((struct ip *)ti)->ip_ttl = TCP_TTL;
error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
so->so_options & SO_DONTROUTE);
/*
* On failure the error is passed up as is; the sequence
* variables and timers adjusted above are not rolled back.
*/
if (error)
return (error);
tcpstat.tcps_sndtotal++;
/*
* Data sent (as far as we can tell).
* If this advertises a larger window than any other segment,
* then remember the size of the advertised window.
* Any pending ACK has now been sent.
*/
if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + win;
tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
/*
* More than one segment's worth was available: go round again.
*/
if (sendalot)
goto again;
return (0);
}
/*
 * Start or restart the persistence timer of tp and step its backoff.
 *
 * The timeout is tcp_beta * t_srtt, shifted left by the current
 * t_rxtshift and then forced into the range TCPTV_PERSMIN..TCPTV_MAX
 * by TCPT_RANGESET.  Afterwards t_rxtshift is bumped by one and
 * wraps to zero as soon as it reaches TCP_MAXRXTSHIFT.
 *
 * The retransmit timer must not be running when this is called;
 * if it is, the routine panics.
 */
tcp_setpersist(tp)
	register struct tcpcb *tp;
{
	int timo;

	if (tp->t_timer[TCPT_REXMT] != 0)
		panic("tcp_output REXMT");

	/* Backed-off timeout, before clamping. */
	timo = ((int)(tcp_beta * tp->t_srtt)) << tp->t_rxtshift;
	TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], timo,
	    TCPTV_PERSMIN, TCPTV_MAX);

	/* Next call backs off further, until the shift wraps around. */
	if (++tp->t_rxtshift >= TCP_MAXRXTSHIFT)
		tp->t_rxtshift = 0;
}