add macro for rexmt time; use soft error if set after timeout
[unix-history] / usr / src / sys / netinet / tcp_output.c
Commit Line Data
8ae0e4b4 1/*
4a86c86b 2 * Copyright (c) 1982, 1986, 1988 Regents of the University of California.
2b6b6284 3 * All rights reserved.
8ae0e4b4 4 *
2b6b6284 5 * Redistribution and use in source and binary forms are permitted
616d42db
KB
6 * provided that the above copyright notice and this paragraph are
7 * duplicated in all such forms and that any documentation,
8 * advertising materials, and other materials related to such
9 * distribution and use acknowledge that the software was developed
10 * by the University of California, Berkeley. The name of the
11 * University may not be used to endorse or promote products derived
12 * from this software without specific prior written permission.
13 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
14 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
15 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
2b6b6284 16 *
ccc60b2b 17 * @(#)tcp_output.c 7.19 (Berkeley) %G%
8ae0e4b4 18 */
76ee76df 19
20666ad3
JB
20#include "param.h"
21#include "systm.h"
4a86c86b 22#include "malloc.h"
20666ad3
JB
23#include "mbuf.h"
24#include "protosw.h"
25#include "socket.h"
26#include "socketvar.h"
27#include "errno.h"
f4d55810 28
c124e997 29#include "../net/route.h"
f4d55810 30
20666ad3 31#include "in.h"
20666ad3
JB
32#include "in_systm.h"
33#include "ip.h"
ccc60b2b 34#include "in_pcb.h"
20666ad3
JB
35#include "ip_var.h"
36#include "tcp.h"
0974b45c 37#define TCPOUTFLAGS
20666ad3
JB
38#include "tcp_fsm.h"
39#include "tcp_seq.h"
40#include "tcp_timer.h"
41#include "tcp_var.h"
42#include "tcpip.h"
43#include "tcp_debug.h"
76ee76df 44
8b5a83bb 45/*
77a4e3ca 46 * Initial options.
8b5a83bb 47 */
8b5a83bb 48u_char tcp_initopt[4] = { TCPOPT_MAXSEG, 4, 0x0, 0x0, };
8b5a83bb 49
ea727f86 50/*
4aed14e3 51 * Tcp output routine: figure out what should be sent and send it.
ea727f86 52 */
a6503abf 53tcp_output(tp)
53a5409e 54 register struct tcpcb *tp;
ea727f86 55{
53a5409e 56 register struct socket *so = tp->t_inpcb->inp_socket;
8129ec6e 57 register long len, win;
a6503abf 58 struct mbuf *m0;
acad00cc 59 int off, flags, error;
a6503abf
BJ
60 register struct mbuf *m;
61 register struct tcpiphdr *ti;
8b5a83bb
BJ
62 u_char *opt;
63 unsigned optlen = 0;
acad00cc 64 int idle, sendalot;
76ee76df 65
a6503abf 66 /*
8ae6c089 67 * Determine length of data that should be transmitted,
0974b45c
BJ
68 * and flags that will be used.
69 * If there is some data or critical controls (SYN, RST)
70 * to send, then transmit; otherwise, investigate further.
a6503abf 71 */
acad00cc 72 idle = (tp->snd_max == tp->snd_una);
2266a466
BJ
73again:
74 sendalot = 0;
a6503abf 75 off = tp->snd_nxt - tp->snd_una;
4a86c86b 76 win = min(tp->snd_wnd, tp->snd_cwnd);
e4af65f3 77
eaf69575
MK
78 /*
79 * If in persist timeout with window of 0, send 1 byte.
acad00cc
MK
80 * Otherwise, if window is small but nonzero
81 * and timer expired, we will send what we can
82 * and go to transmit state.
eaf69575
MK
83 */
84 if (tp->t_force) {
e4af65f3 85 if (win == 0)
eaf69575
MK
86 win = 1;
87 else {
88 tp->t_timer[TCPT_PERSIST] = 0;
89 tp->t_rxtshift = 0;
90 }
91 }
acad00cc 92
4a86c86b 93 len = min(so->so_snd.sb_cc, win) - off;
0974b45c 94 flags = tcp_outflags[tp->t_state];
e4af65f3
MK
95
96 if (len < 0) {
97 /*
dd0ed0b1
MK
98 * If FIN has been sent but not acked,
99 * but we haven't been called to retransmit,
449e69b4
MK
100 * len will be -1. Otherwise, window shrank
101 * after we sent into it. If window shrank to 0,
102 * cancel pending retransmit and pull snd_nxt
103 * back to (closed) window. We will enter persist
104 * state below. If the window didn't close completely,
dd0ed0b1 105 * just wait for an ACK.
e4af65f3 106 */
449e69b4
MK
107 len = 0;
108 if (win == 0) {
faa26a98
MK
109 tp->t_timer[TCPT_REXMT] = 0;
110 tp->snd_nxt = tp->snd_una;
449e69b4 111 }
faa26a98
MK
112 }
113 if (len > tp->t_maxseg) {
114 len = tp->t_maxseg;
2e5a76f2 115 sendalot = 1;
e4af65f3 116 }
baf677ce
MK
117 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
118 flags &= ~TH_FIN;
e4af65f3 119 win = sbspace(&so->so_rcv);
acad00cc 120
faa26a98
MK
121
122 /*
123 * If our state indicates that FIN should be sent
124 * and we have not yet done so, or we're retransmitting the FIN,
125 * then we need to send.
126 */
127 if (flags & TH_FIN &&
128 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
129 goto send;
acad00cc
MK
130 /*
131 * Send if we owe peer an ACK.
132 */
e4af65f3
MK
133 if (tp->t_flags & TF_ACKNOW)
134 goto send;
faa26a98 135 if (flags & (TH_SYN|TH_RST))
acad00cc 136 goto send;
8ae6c089 137 if (SEQ_GT(tp->snd_up, tp->snd_una))
a6503abf
BJ
138 goto send;
139
8ae6c089 140 /*
7d304adf
MK
141 * Sender silly window avoidance. If connection is idle
142 * and can send all data, a maximum segment,
143 * at least a maximum default-size segment do it,
8ae6c089 144 * or are forced, do it; otherwise don't bother.
18a438b6
MK
145 * If peer's buffer is tiny, then send
146 * when window is at least half open.
eaf69575
MK
147 * If retransmitting (possibly after persist timer forced us
148 * to send into a small window), then must resend.
8ae6c089
BJ
149 */
150 if (len) {
3d92549d 151 if (len == tp->t_maxseg)
8ae6c089 152 goto send;
acad00cc
MK
153 if ((idle || tp->t_flags & TF_NODELAY) &&
154 len + off >= so->so_snd.sb_cc)
8ae6c089
BJ
155 goto send;
156 if (tp->t_force)
157 goto send;
18a438b6
MK
158 if (len >= tp->max_sndwnd / 2)
159 goto send;
eaf69575
MK
160 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
161 goto send;
e4af65f3 162 }
76ee76df 163
a6503abf 164 /*
acad00cc
MK
165 * Compare available window to amount of window
166 * known to peer (as advertised window less
449e69b4 167 * next expected input). If the difference is at least two
ccc60b2b 168 * max size segments, or at least 50% of the maximum possible
449e69b4 169 * window, then want to send a window update to peer.
a6503abf 170 */
a6bbda13
MK
171 if (win > 0) {
172 int adv = win - (tp->rcv_adv - tp->rcv_nxt);
173
ccc60b2b 174 if (adv >= 2 * tp->t_maxseg)
a6bbda13 175 goto send;
ccc60b2b 176 if (2 * adv >= so->so_rcv.sb_hiwat)
a6bbda13
MK
177 goto send;
178 }
a6503abf 179
2266a466
BJ
180 /*
181 * TCP window updates are not reliable, rather a polling protocol
182 * using ``persist'' packets is used to insure receipt of window
183 * updates. The three ``states'' for the output side are:
184 * idle not doing retransmits or persists
acad00cc 185 * persisting to move a small or zero window
2266a466
BJ
186 * (re)transmitting and thereby not persisting
187 *
188 * tp->t_timer[TCPT_PERSIST]
189 * is set when we are in persist state.
190 * tp->t_force
191 * is set when we are called to send a persist packet.
192 * tp->t_timer[TCPT_REXMT]
193 * is set when we are retransmitting
194 * The output side is idle when both timers are zero.
195 *
eaf69575
MK
196 * If send window is too small, there is data to transmit, and no
197 * retransmit or persist is pending, then go to persist state.
198 * If nothing happens soon, send when timer expires:
199 * if window is nonzero, transmit what we can,
200 * otherwise force out a byte.
2266a466 201 */
eaf69575
MK
202 if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
203 tp->t_timer[TCPT_PERSIST] == 0) {
2266a466
BJ
204 tp->t_rxtshift = 0;
205 tcp_setpersist(tp);
206 }
207
a6503abf
BJ
208 /*
209 * No reason to send a segment, just return.
210 */
f1b2fa5b 211 return (0);
a6503abf
BJ
212
213send:
214 /*
215 * Grab a header mbuf, attaching a copy of data to
216 * be transmitted, and initialize the header from
217 * the template for sends on this connection.
218 */
4a86c86b 219 MGETHDR(m, M_DONTWAIT, MT_HEADER);
60d68e9e 220 if (m == NULL)
8a2f82db 221 return (ENOBUFS);
4a86c86b 222 m->m_data += max_linkhdr;
53a5409e 223 m->m_len = sizeof (struct tcpiphdr);
4a86c86b 224 m->m_pkthdr.rcvif = (struct ifnet *)0;
8129ec6e 225 ti = mtod(m, struct tcpiphdr *);
a6503abf 226 if (len) {
3b52afc5
MK
227 if (tp->t_force && len == 1)
228 tcpstat.tcps_sndprobe++;
229 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
230 tcpstat.tcps_sndrexmitpack++;
231 tcpstat.tcps_sndrexmitbyte += len;
232 } else {
233 tcpstat.tcps_sndpack++;
234 tcpstat.tcps_sndbyte += len;
235 }
4a86c86b 236 if (len <= MHLEN - sizeof (struct tcpiphdr) - max_linkhdr) {
8129ec6e
MK
237 if (m->m_next == 0)
238 len = 0;
239 }
3b52afc5
MK
240 } else if (tp->t_flags & TF_ACKNOW)
241 tcpstat.tcps_sndacks++;
242 else if (flags & (TH_SYN|TH_FIN|TH_RST))
243 tcpstat.tcps_sndctrl++;
244 else if (SEQ_GT(tp->snd_up, tp->snd_una))
245 tcpstat.tcps_sndurg++;
246 else
247 tcpstat.tcps_sndwinup++;
248
a6503abf
BJ
249 if (tp->t_template == 0)
250 panic("tcp_output");
f1b2fa5b 251 bcopy((caddr_t)tp->t_template, (caddr_t)ti, sizeof (struct tcpiphdr));
a6503abf
BJ
252
253 /*
254 * Fill in fields, remembering maximum advertised
255 * window for use in delaying messages about window sizes.
faa26a98 256 * If resending a FIN, be sure not to use a new sequence number.
a6503abf 257 */
3d92549d
MK
258 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
259 tp->snd_nxt == tp->snd_max)
faa26a98 260 tp->snd_nxt--;
acad00cc
MK
261 ti->ti_seq = htonl(tp->snd_nxt);
262 ti->ti_ack = htonl(tp->rcv_nxt);
8b5a83bb
BJ
263 /*
264 * Before ESTABLISHED, force sending of initial options
265 * unless TCP set to not do any options.
266 */
acad00cc 267 opt = NULL;
1ccb6fcd 268 if (flags & TH_SYN && (tp->t_flags & TF_NOOPT) == 0) {
8011f5df 269 u_short mss;
99578149 270
4a86c86b 271 mss = min(so->so_rcv.sb_hiwat / 2, tcp_mss(tp));
acad00cc
MK
272 if (mss > IP_MSS - sizeof(struct tcpiphdr)) {
273 opt = tcp_initopt;
274 optlen = sizeof (tcp_initopt);
275 *(u_short *)(opt + 2) = htons(mss);
276 }
8b5a83bb 277 }
77a4e3ca 278 if (opt) {
f1b2fa5b 279 m0 = m->m_next;
cce93e4b 280 m->m_next = m_get(M_DONTWAIT, MT_DATA);
0974b45c
BJ
281 if (m->m_next == 0) {
282 (void) m_free(m);
8b5a83bb 283 m_freem(m0);
8a2f82db 284 return (ENOBUFS);
0974b45c
BJ
285 }
286 m->m_next->m_next = m0;
8b5a83bb 287 m0 = m->m_next;
8b5a83bb 288 m0->m_len = optlen;
668cc26d 289 bcopy((caddr_t)opt, mtod(m0, caddr_t), optlen);
8b5a83bb 290 opt = (u_char *)(mtod(m0, caddr_t) + optlen);
8b5a83bb
BJ
291 while (m0->m_len & 0x3) {
292 *opt++ = TCPOPT_EOL;
293 m0->m_len++;
294 }
295 optlen = m0->m_len;
296 ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2;
0974b45c
BJ
297 }
298 ti->ti_flags = flags;
acad00cc
MK
299 /*
300 * Calculate receive window. Don't shrink window,
301 * but avoid silly window syndrome.
302 */
6def2330 303 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
acad00cc 304 win = 0;
8129ec6e
MK
305 if (win > IP_MAXPACKET)
306 win = IP_MAXPACKET;
6def2330
MK
307 if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
308 win = (long)(tp->rcv_adv - tp->rcv_nxt);
dd861483
MK
309 if (win > IP_MAXPACKET)
310 win = IP_MAXPACKET;
acad00cc 311 ti->ti_win = htons((u_short)win);
0974b45c 312 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
8011f5df 313 ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
a6503abf
BJ
314 ti->ti_flags |= TH_URG;
315 } else
316 /*
317 * If no urgent pointer to send, then we pull
318 * the urgent pointer to the left edge of the send window
319 * so that it doesn't drift into the send window on sequence
320 * number wraparound.
321 */
0974b45c 322 tp->snd_up = tp->snd_una; /* drag it along */
02c1608b
BJ
323 /*
324 * If anything to send and we can send it all, set PUSH.
325 * (This will keep happy those implementations which only
5cdc4d65 326 * give data to the user when a buffer fills or a PUSH comes in.)
02c1608b 327 */
02c1608b
BJ
328 if (len && off+len == so->so_snd.sb_cc)
329 ti->ti_flags |= TH_PUSH;
a6503abf
BJ
330
331 /*
332 * Put TCP length in extended header, and then
333 * checksum extended header and data.
334 */
acad00cc
MK
335 if (len + optlen)
336 ti->ti_len = htons((u_short)(sizeof(struct tcphdr) +
337 optlen + len));
9340d736
MK
338 ti->ti_sum = in_cksum(m,
339 (int)(sizeof (struct tcpiphdr) + (int)optlen + len));
0974b45c
BJ
340
341 /*
2266a466 342 * In transmit state, time the transmission and arrange for
eaf69575 343 * the retransmit. In persist state, just set snd_max.
0974b45c 344 */
eaf69575 345 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
eb5508b2
MK
346 tcp_seq startseq = tp->snd_nxt;
347
2266a466 348 /*
8931cb5b 349 * Advance snd_nxt over sequence space of this segment.
2266a466 350 */
faa26a98
MK
351 if (flags & TH_SYN)
352 tp->snd_nxt++;
353 if (flags & TH_FIN) {
2266a466 354 tp->snd_nxt++;
faa26a98
MK
355 tp->t_flags |= TF_SENTFIN;
356 }
2266a466 357 tp->snd_nxt += len;
eb5508b2 358 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
e45e6858 359 tp->snd_max = tp->snd_nxt;
eb5508b2
MK
360 /*
361 * Time this transmission if not a retransmission and
362 * not currently timing anything.
363 */
364 if (tp->t_rtt == 0) {
365 tp->t_rtt = 1;
366 tp->t_rtseq = startseq;
367 tcpstat.tcps_segstimed++;
368 }
369 }
405c9168 370
2266a466 371 /*
eaf69575 372 * Set retransmit timer if not currently set,
6be9a225 373 * and not doing an ack or a keep-alive probe.
7cc62c26
MK
374 * Initial value for retransmit timer is smoothed
375 * round-trip time + 2 * round-trip time variance.
e4af65f3
MK
376 * Initialize shift counter which is used for backoff
377 * of retransmit time.
2266a466
BJ
378 */
379 if (tp->t_timer[TCPT_REXMT] == 0 &&
380 tp->snd_nxt != tp->snd_una) {
a6bbda13
MK
381 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
382 if (tp->t_timer[TCPT_PERSIST]) {
383 tp->t_timer[TCPT_PERSIST] = 0;
384 tp->t_rxtshift = 0;
385 }
2266a466 386 }
6be9a225 387 } else
acad00cc
MK
388 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
389 tp->snd_max = tp->snd_nxt + len;
a6503abf 390
f1dd32da
BJ
391 /*
392 * Trace.
393 */
8931cb5b 394 if (so->so_options & SO_DEBUG)
f1dd32da
BJ
395 tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0);
396
a6503abf
BJ
397 /*
398 * Fill in IP length and desired time to live and
399 * send to IP level.
400 */
8b5a83bb 401 ((struct ip *)ti)->ip_len = sizeof (struct tcpiphdr) + optlen + len;
4a86c86b
MK
402 if (m->m_flags & M_PKTHDR)
403 m->m_pkthdr.len = ((struct ip *)ti)->ip_len;
ccc60b2b 404 ((struct ip *)ti)->ip_ttl = tcp_ttl;
9d866d2f 405#if BSD>=43
d55475b1
MK
406 error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
407 so->so_options & SO_DONTROUTE);
9d866d2f
MK
408#else
409 error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route,
410 so->so_options & SO_DONTROUTE);
411#endif
83591b9d
MK
412 if (error) {
413 if (error == ENOBUFS) {
414 tcp_quench(tp->t_inpcb);
415 return (0);
416 }
8a2f82db 417 return (error);
83591b9d 418 }
3b52afc5 419 tcpstat.tcps_sndtotal++;
a6503abf
BJ
420
421 /*
422 * Data sent (as far as we can tell).
423 * If this advertises a larger window than any other segment,
4aed14e3 424 * then remember the size of the advertised window.
acad00cc 425 * Any pending ACK has now been sent.
a6503abf 426 */
be43ac7f 427 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
a6503abf 428 tp->rcv_adv = tp->rcv_nxt + win;
0974b45c 429 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
acad00cc 430 if (sendalot)
2266a466 431 goto again;
8a2f82db 432 return (0);
76ee76df 433}
2266a466
BJ
434
435tcp_setpersist(tp)
436 register struct tcpcb *tp;
437{
7cc62c26 438 register t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
2266a466
BJ
439
440 if (tp->t_timer[TCPT_REXMT])
441 panic("tcp_output REXMT");
442 /*
443 * Start/restart persistance timer.
444 */
445 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
7cc62c26 446 t * tcp_backoff[tp->t_rxtshift],
3d92549d
MK
447 TCPTV_PERSMIN, TCPTV_PERSMAX);
448 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
449 tp->t_rxtshift++;
2266a466 450}