reorganization to move ufsmount ops to be vnode ops;
[unix-history] / usr / src / sys / netinet / tcp_output.c
CommitLineData
8ae0e4b4 1/*
1f2bdbf2 2 * Copyright (c) 1982, 1986, 1988, 1990 Regents of the University of California.
2b6b6284 3 * All rights reserved.
8ae0e4b4 4 *
dbf0c423 5 * %sccs.include.redist.c%
2b6b6284 6 *
54a4ca03 7 * @(#)tcp_output.c 7.22 (Berkeley) %G%
8ae0e4b4 8 */
76ee76df 9
20666ad3
JB
10#include "param.h"
11#include "systm.h"
4a86c86b 12#include "malloc.h"
20666ad3
JB
13#include "mbuf.h"
14#include "protosw.h"
15#include "socket.h"
16#include "socketvar.h"
17#include "errno.h"
f4d55810 18
c124e997 19#include "../net/route.h"
f4d55810 20
20666ad3 21#include "in.h"
20666ad3
JB
22#include "in_systm.h"
23#include "ip.h"
ccc60b2b 24#include "in_pcb.h"
20666ad3
JB
25#include "ip_var.h"
26#include "tcp.h"
0974b45c 27#define TCPOUTFLAGS
20666ad3
JB
28#include "tcp_fsm.h"
29#include "tcp_seq.h"
30#include "tcp_timer.h"
31#include "tcp_var.h"
32#include "tcpip.h"
33#include "tcp_debug.h"
76ee76df 34
1f2bdbf2
MK
35#ifdef notyet
36extern struct mbuf *m_copypack();
37#endif
38
8b5a83bb 39/*
77a4e3ca 40 * Initial options.
8b5a83bb 41 */
8b5a83bb 42u_char tcp_initopt[4] = { TCPOPT_MAXSEG, 4, 0x0, 0x0, };
8b5a83bb 43
ea727f86 44/*
4aed14e3 45 * Tcp output routine: figure out what should be sent and send it.
ea727f86 46 */
a6503abf 47tcp_output(tp)
53a5409e 48 register struct tcpcb *tp;
ea727f86 49{
53a5409e 50 register struct socket *so = tp->t_inpcb->inp_socket;
8129ec6e 51 register long len, win;
acad00cc 52 int off, flags, error;
a6503abf
BJ
53 register struct mbuf *m;
54 register struct tcpiphdr *ti;
8b5a83bb 55 u_char *opt;
1f2bdbf2 56 unsigned optlen, hdrlen;
acad00cc 57 int idle, sendalot;
76ee76df 58
a6503abf 59 /*
8ae6c089 60 * Determine length of data that should be transmitted,
0974b45c
BJ
61 * and flags that will be used.
62 * If there is some data or critical controls (SYN, RST)
63 * to send, then transmit; otherwise, investigate further.
a6503abf 64 */
acad00cc 65 idle = (tp->snd_max == tp->snd_una);
1f2bdbf2
MK
66 if (idle && tp->t_idle >= tp->t_rxtcur)
67 /*
68 * We have been idle for "a while" and no acks are
69 * expected to clock out any data we send --
70 * slow start to get ack "clock" running again.
71 */
72 tp->snd_cwnd = tp->t_maxseg;
2266a466
BJ
73again:
74 sendalot = 0;
a6503abf 75 off = tp->snd_nxt - tp->snd_una;
4a86c86b 76 win = min(tp->snd_wnd, tp->snd_cwnd);
e4af65f3 77
eaf69575
MK
78 /*
79 * If in persist timeout with window of 0, send 1 byte.
acad00cc
MK
80 * Otherwise, if window is small but nonzero
81 * and timer expired, we will send what we can
82 * and go to transmit state.
eaf69575
MK
83 */
84 if (tp->t_force) {
e4af65f3 85 if (win == 0)
eaf69575
MK
86 win = 1;
87 else {
88 tp->t_timer[TCPT_PERSIST] = 0;
89 tp->t_rxtshift = 0;
90 }
91 }
acad00cc 92
0974b45c 93 flags = tcp_outflags[tp->t_state];
1f2bdbf2 94 len = min(so->so_snd.sb_cc, win) - off;
e4af65f3
MK
95
96 if (len < 0) {
97 /*
dd0ed0b1
MK
98 * If FIN has been sent but not acked,
99 * but we haven't been called to retransmit,
449e69b4
MK
100 * len will be -1. Otherwise, window shrank
101 * after we sent into it. If window shrank to 0,
102 * cancel pending retransmit and pull snd_nxt
103 * back to (closed) window. We will enter persist
104 * state below. If the window didn't close completely,
dd0ed0b1 105 * just wait for an ACK.
e4af65f3 106 */
449e69b4
MK
107 len = 0;
108 if (win == 0) {
faa26a98
MK
109 tp->t_timer[TCPT_REXMT] = 0;
110 tp->snd_nxt = tp->snd_una;
449e69b4 111 }
faa26a98
MK
112 }
113 if (len > tp->t_maxseg) {
114 len = tp->t_maxseg;
2e5a76f2 115 sendalot = 1;
e4af65f3 116 }
baf677ce
MK
117 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
118 flags &= ~TH_FIN;
faa26a98 119
1f2bdbf2 120 win = sbspace(&so->so_rcv);
a6503abf 121
8ae6c089 122 /*
7d304adf
MK
123 * Sender silly window avoidance. If connection is idle
124 * and can send all data, a maximum segment,
125 * at least a maximum default-size segment do it,
8ae6c089 126 * or are forced, do it; otherwise don't bother.
18a438b6
MK
127 * If peer's buffer is tiny, then send
128 * when window is at least half open.
eaf69575
MK
129 * If retransmitting (possibly after persist timer forced us
130 * to send into a small window), then must resend.
8ae6c089
BJ
131 */
132 if (len) {
3d92549d 133 if (len == tp->t_maxseg)
8ae6c089 134 goto send;
acad00cc
MK
135 if ((idle || tp->t_flags & TF_NODELAY) &&
136 len + off >= so->so_snd.sb_cc)
8ae6c089
BJ
137 goto send;
138 if (tp->t_force)
139 goto send;
18a438b6
MK
140 if (len >= tp->max_sndwnd / 2)
141 goto send;
eaf69575
MK
142 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
143 goto send;
e4af65f3 144 }
76ee76df 145
a6503abf 146 /*
acad00cc
MK
147 * Compare available window to amount of window
148 * known to peer (as advertised window less
449e69b4 149 * next expected input). If the difference is at least two
ccc60b2b 150 * max size segments, or at least 50% of the maximum possible
449e69b4 151 * window, then want to send a window update to peer.
a6503abf 152 */
a6bbda13 153 if (win > 0) {
54a4ca03 154 long adv = win - (tp->rcv_adv - tp->rcv_nxt);
a6bbda13 155
54a4ca03 156 if (adv >= (long) (2 * tp->t_maxseg))
a6bbda13 157 goto send;
54a4ca03 158 if (2 * adv >= (long) so->so_rcv.sb_hiwat)
a6bbda13
MK
159 goto send;
160 }
a6503abf 161
1f2bdbf2
MK
162 /*
163 * Send if we owe peer an ACK.
164 */
165 if (tp->t_flags & TF_ACKNOW)
166 goto send;
167 if (flags & (TH_SYN|TH_RST))
168 goto send;
169 if (SEQ_GT(tp->snd_up, tp->snd_una))
170 goto send;
171 /*
172 * If our state indicates that FIN should be sent
173 * and we have not yet done so, or we're retransmitting the FIN,
174 * then we need to send.
175 */
176 if (flags & TH_FIN &&
177 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
178 goto send;
179
2266a466
BJ
180 /*
181 * TCP window updates are not reliable, rather a polling protocol
182 * using ``persist'' packets is used to insure receipt of window
183 * updates. The three ``states'' for the output side are:
184 * idle not doing retransmits or persists
acad00cc 185 * persisting to move a small or zero window
2266a466
BJ
186 * (re)transmitting and thereby not persisting
187 *
188 * tp->t_timer[TCPT_PERSIST]
189 * is set when we are in persist state.
190 * tp->t_force
191 * is set when we are called to send a persist packet.
192 * tp->t_timer[TCPT_REXMT]
193 * is set when we are retransmitting
194 * The output side is idle when both timers are zero.
195 *
eaf69575
MK
196 * If send window is too small, there is data to transmit, and no
197 * retransmit or persist is pending, then go to persist state.
198 * If nothing happens soon, send when timer expires:
199 * if window is nonzero, transmit what we can,
200 * otherwise force out a byte.
2266a466 201 */
eaf69575
MK
202 if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
203 tp->t_timer[TCPT_PERSIST] == 0) {
2266a466
BJ
204 tp->t_rxtshift = 0;
205 tcp_setpersist(tp);
206 }
207
a6503abf
BJ
208 /*
209 * No reason to send a segment, just return.
210 */
f1b2fa5b 211 return (0);
a6503abf
BJ
212
213send:
1f2bdbf2
MK
214 /*
215 * Before ESTABLISHED, force sending of initial options
216 * unless TCP set not to do any options.
217 * NOTE: we assume that the IP/TCP header plus TCP options
218 * always fit in a single mbuf, leaving room for a maximum
219 * link header, i.e.
220 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
221 */
222 optlen = 0;
223 hdrlen = sizeof (struct tcpiphdr);
224 if (flags & TH_SYN && (tp->t_flags & TF_NOOPT) == 0) {
225 opt = tcp_initopt;
226 optlen = sizeof (tcp_initopt);
227 hdrlen += sizeof (tcp_initopt);
228 *(u_short *)(opt + 2) = htons((u_short) tcp_mss(tp, 0));
229#ifdef DIAGNOSTIC
230 if (max_linkhdr + hdrlen > MHLEN)
231 panic("tcphdr too big");
232#endif
233 }
234
a6503abf
BJ
235 /*
236 * Grab a header mbuf, attaching a copy of data to
237 * be transmitted, and initialize the header from
238 * the template for sends on this connection.
239 */
a6503abf 240 if (len) {
3b52afc5
MK
241 if (tp->t_force && len == 1)
242 tcpstat.tcps_sndprobe++;
243 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
244 tcpstat.tcps_sndrexmitpack++;
245 tcpstat.tcps_sndrexmitbyte += len;
246 } else {
247 tcpstat.tcps_sndpack++;
248 tcpstat.tcps_sndbyte += len;
249 }
1f2bdbf2
MK
250#ifdef notyet
251 if ((m = m_copypack(so->so_snd.sb_mb, off,
252 (int)len, max_linkhdr + hdrlen)) == 0) {
253 error = ENOBUFS;
254 goto out;
255 }
256 /*
257 * m_copypack left space for our hdr; use it.
258 */
259 m->m_len += hdrlen;
260 m->m_data -= hdrlen;
261#else
262 MGETHDR(m, M_DONTWAIT, MT_HEADER);
263 if (m == NULL) {
264 error = ENOBUFS;
265 goto out;
266 }
267 m->m_data += max_linkhdr;
268 m->m_len = hdrlen;
269 if (len <= MHLEN - hdrlen - max_linkhdr) {
8129ec6e
MK
270 if (m->m_next == 0)
271 len = 0;
272 }
1f2bdbf2
MK
273#endif
274 /*
275 * If we're sending everything we've got, set PUSH.
276 * (This will keep happy those implementations which only
277 * give data to the user when a buffer fills or
278 * a PUSH comes in.)
279 */
280 if (off + len == so->so_snd.sb_cc)
281 flags |= TH_PUSH;
282 } else {
283 if (tp->t_flags & TF_ACKNOW)
284 tcpstat.tcps_sndacks++;
285 else if (flags & (TH_SYN|TH_FIN|TH_RST))
286 tcpstat.tcps_sndctrl++;
287 else if (SEQ_GT(tp->snd_up, tp->snd_una))
288 tcpstat.tcps_sndurg++;
289 else
290 tcpstat.tcps_sndwinup++;
3b52afc5 291
1f2bdbf2
MK
292 MGETHDR(m, M_DONTWAIT, MT_HEADER);
293 if (m == NULL) {
294 error = ENOBUFS;
295 goto out;
296 }
297 m->m_data += max_linkhdr;
298 m->m_len = hdrlen;
299 }
300 m->m_pkthdr.rcvif = (struct ifnet *)0;
301 ti = mtod(m, struct tcpiphdr *);
a6503abf
BJ
302 if (tp->t_template == 0)
303 panic("tcp_output");
f1b2fa5b 304 bcopy((caddr_t)tp->t_template, (caddr_t)ti, sizeof (struct tcpiphdr));
a6503abf
BJ
305
306 /*
307 * Fill in fields, remembering maximum advertised
308 * window for use in delaying messages about window sizes.
faa26a98 309 * If resending a FIN, be sure not to use a new sequence number.
a6503abf 310 */
3d92549d
MK
311 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
312 tp->snd_nxt == tp->snd_max)
faa26a98 313 tp->snd_nxt--;
acad00cc
MK
314 ti->ti_seq = htonl(tp->snd_nxt);
315 ti->ti_ack = htonl(tp->rcv_nxt);
1f2bdbf2
MK
316 if (optlen) {
317 bcopy((caddr_t)opt, (caddr_t)(ti + 1), optlen);
8b5a83bb 318 ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2;
0974b45c
BJ
319 }
320 ti->ti_flags = flags;
acad00cc
MK
321 /*
322 * Calculate receive window. Don't shrink window,
323 * but avoid silly window syndrome.
324 */
6def2330 325 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
acad00cc 326 win = 0;
1f2bdbf2
MK
327 if (win > TCP_MAXWIN)
328 win = TCP_MAXWIN;
6def2330
MK
329 if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
330 win = (long)(tp->rcv_adv - tp->rcv_nxt);
dd861483
MK
331 if (win > IP_MAXPACKET)
332 win = IP_MAXPACKET;
acad00cc 333 ti->ti_win = htons((u_short)win);
0974b45c 334 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
8011f5df 335 ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
a6503abf
BJ
336 ti->ti_flags |= TH_URG;
337 } else
338 /*
339 * If no urgent pointer to send, then we pull
340 * the urgent pointer to the left edge of the send window
341 * so that it doesn't drift into the send window on sequence
342 * number wraparound.
343 */
0974b45c 344 tp->snd_up = tp->snd_una; /* drag it along */
a6503abf
BJ
345
346 /*
347 * Put TCP length in extended header, and then
348 * checksum extended header and data.
349 */
acad00cc 350 if (len + optlen)
1f2bdbf2 351 ti->ti_len = htons((u_short)(sizeof (struct tcphdr) +
acad00cc 352 optlen + len));
1f2bdbf2 353 ti->ti_sum = in_cksum(m, (int)(hdrlen + len));
0974b45c
BJ
354
355 /*
2266a466 356 * In transmit state, time the transmission and arrange for
eaf69575 357 * the retransmit. In persist state, just set snd_max.
0974b45c 358 */
eaf69575 359 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
eb5508b2
MK
360 tcp_seq startseq = tp->snd_nxt;
361
2266a466 362 /*
8931cb5b 363 * Advance snd_nxt over sequence space of this segment.
2266a466 364 */
1f2bdbf2
MK
365 if (flags & (TH_SYN|TH_FIN)) {
366 if (flags & TH_SYN)
367 tp->snd_nxt++;
368 if (flags & TH_FIN) {
369 tp->snd_nxt++;
370 tp->t_flags |= TF_SENTFIN;
371 }
faa26a98 372 }
2266a466 373 tp->snd_nxt += len;
eb5508b2 374 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
e45e6858 375 tp->snd_max = tp->snd_nxt;
eb5508b2
MK
376 /*
377 * Time this transmission if not a retransmission and
378 * not currently timing anything.
379 */
380 if (tp->t_rtt == 0) {
381 tp->t_rtt = 1;
382 tp->t_rtseq = startseq;
383 tcpstat.tcps_segstimed++;
384 }
385 }
405c9168 386
2266a466 387 /*
eaf69575 388 * Set retransmit timer if not currently set,
6be9a225 389 * and not doing an ack or a keep-alive probe.
7cc62c26
MK
390 * Initial value for retransmit timer is smoothed
391 * round-trip time + 2 * round-trip time variance.
e4af65f3
MK
392 * Initialize shift counter which is used for backoff
393 * of retransmit time.
2266a466
BJ
394 */
395 if (tp->t_timer[TCPT_REXMT] == 0 &&
396 tp->snd_nxt != tp->snd_una) {
a6bbda13
MK
397 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
398 if (tp->t_timer[TCPT_PERSIST]) {
399 tp->t_timer[TCPT_PERSIST] = 0;
400 tp->t_rxtshift = 0;
401 }
2266a466 402 }
6be9a225 403 } else
acad00cc
MK
404 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
405 tp->snd_max = tp->snd_nxt + len;
a6503abf 406
f1dd32da
BJ
407 /*
408 * Trace.
409 */
8931cb5b 410 if (so->so_options & SO_DEBUG)
f1dd32da
BJ
411 tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0);
412
a6503abf
BJ
413 /*
414 * Fill in IP length and desired time to live and
1f2bdbf2
MK
415 * send to IP level. There should be a better way
416 * to handle ttl and tos; we could keep them in
417 * the template, but need a way to checksum without them.
a6503abf 418 */
1f2bdbf2
MK
419 m->m_pkthdr.len = hdrlen + len;
420 ((struct ip *)ti)->ip_len = m->m_pkthdr.len;
421 ((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; /* XXX */
422 ((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip.ip_tos; /* XXX */
423#if BSD >= 43
9d866d2f 424#if BSD>=43
d55475b1
MK
425 error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
426 so->so_options & SO_DONTROUTE);
1f2bdbf2
MK
427#else
428 error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route,
429 so->so_options & SO_DONTROUTE);
430#endif
9d866d2f
MK
431#else
432 error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route,
433 so->so_options & SO_DONTROUTE);
434#endif
83591b9d 435 if (error) {
1f2bdbf2 436out:
83591b9d
MK
437 if (error == ENOBUFS) {
438 tcp_quench(tp->t_inpcb);
439 return (0);
440 }
1f2bdbf2
MK
441 if ((error == EHOSTUNREACH || error == ENETDOWN)
442 && TCPS_HAVERCVDSYN(tp->t_state)) {
443 tp->t_softerror = error;
444 return (0);
445 }
8a2f82db 446 return (error);
83591b9d 447 }
3b52afc5 448 tcpstat.tcps_sndtotal++;
a6503abf
BJ
449
450 /*
451 * Data sent (as far as we can tell).
452 * If this advertises a larger window than any other segment,
4aed14e3 453 * then remember the size of the advertised window.
acad00cc 454 * Any pending ACK has now been sent.
a6503abf 455 */
be43ac7f 456 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
a6503abf 457 tp->rcv_adv = tp->rcv_nxt + win;
0974b45c 458 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
acad00cc 459 if (sendalot)
2266a466 460 goto again;
8a2f82db 461 return (0);
76ee76df 462}
2266a466
BJ
463
464tcp_setpersist(tp)
465 register struct tcpcb *tp;
466{
7cc62c26 467 register t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
2266a466
BJ
468
469 if (tp->t_timer[TCPT_REXMT])
470 panic("tcp_output REXMT");
471 /*
472 * Start/restart persistance timer.
473 */
474 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
7cc62c26 475 t * tcp_backoff[tp->t_rxtshift],
3d92549d
MK
476 TCPTV_PERSMIN, TCPTV_PERSMAX);
477 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
478 tp->t_rxtshift++;
2266a466 479}