not doing *anything* while waiting for input; add substate
[unix-history] / usr / src / sys / netinet / tcp_output.c
CommitLineData
8ae0e4b4 1/*
0880b18e 2 * Copyright (c) 1982, 1986 Regents of the University of California.
2b6b6284 3 * All rights reserved.
8ae0e4b4 4 *
2b6b6284
KB
5 * Redistribution and use in source and binary forms are permitted
6 * provided that this notice is preserved and that due credit is given
7 * to the University of California at Berkeley. The name of the University
8 * may not be used to endorse or promote products derived from this
9 * software without specific prior written permission. This software
10 * is provided ``as is'' without express or implied warranty.
11 *
9340d736 12 * @(#)tcp_output.c 7.16 (Berkeley) %G%
8ae0e4b4 13 */
76ee76df 14
20666ad3
JB
15#include "param.h"
16#include "systm.h"
17#include "mbuf.h"
18#include "protosw.h"
19#include "socket.h"
20#include "socketvar.h"
21#include "errno.h"
f4d55810 22
c124e997 23#include "../net/route.h"
f4d55810 24
20666ad3
JB
25#include "in.h"
26#include "in_pcb.h"
27#include "in_systm.h"
28#include "ip.h"
29#include "ip_var.h"
30#include "tcp.h"
0974b45c 31#define TCPOUTFLAGS
20666ad3
JB
32#include "tcp_fsm.h"
33#include "tcp_seq.h"
34#include "tcp_timer.h"
35#include "tcp_var.h"
36#include "tcpip.h"
37#include "tcp_debug.h"
76ee76df 38
8b5a83bb 39/*
77a4e3ca 40 * Initial options.
8b5a83bb 41 */
8b5a83bb 42u_char tcp_initopt[4] = { TCPOPT_MAXSEG, 4, 0x0, 0x0, };
8b5a83bb 43
ea727f86 44/*
4aed14e3 45 * Tcp output routine: figure out what should be sent and send it.
ea727f86 46 */
a6503abf 47tcp_output(tp)
53a5409e 48 register struct tcpcb *tp;
ea727f86 49{
53a5409e 50 register struct socket *so = tp->t_inpcb->inp_socket;
8129ec6e 51 register long len, win;
a6503abf 52 struct mbuf *m0;
acad00cc 53 int off, flags, error;
a6503abf
BJ
54 register struct mbuf *m;
55 register struct tcpiphdr *ti;
8b5a83bb
BJ
56 u_char *opt;
57 unsigned optlen = 0;
acad00cc 58 int idle, sendalot;
76ee76df 59
a6503abf 60 /*
8ae6c089 61 * Determine length of data that should be transmitted,
0974b45c
BJ
62 * and flags that will be used.
63 * If there is some data or critical controls (SYN, RST)
64 * to send, then transmit; otherwise, investigate further.
a6503abf 65 */
acad00cc 66 idle = (tp->snd_max == tp->snd_una);
2266a466
BJ
67again:
68 sendalot = 0;
a6503abf 69 off = tp->snd_nxt - tp->snd_una;
eaf69575 70 win = MIN(tp->snd_wnd, tp->snd_cwnd);
e4af65f3 71
eaf69575
MK
72 /*
73 * If in persist timeout with window of 0, send 1 byte.
acad00cc
MK
74 * Otherwise, if window is small but nonzero
75 * and timer expired, we will send what we can
76 * and go to transmit state.
eaf69575
MK
77 */
78 if (tp->t_force) {
e4af65f3 79 if (win == 0)
eaf69575
MK
80 win = 1;
81 else {
82 tp->t_timer[TCPT_PERSIST] = 0;
83 tp->t_rxtshift = 0;
84 }
85 }
acad00cc 86
8278ae69 87 len = MIN(so->so_snd.sb_cc, win) - off;
0974b45c 88 flags = tcp_outflags[tp->t_state];
e4af65f3
MK
89
90 if (len < 0) {
91 /*
dd0ed0b1
MK
92 * If FIN has been sent but not acked,
93 * but we haven't been called to retransmit,
449e69b4
MK
94 * len will be -1. Otherwise, window shrank
95 * after we sent into it. If window shrank to 0,
96 * cancel pending retransmit and pull snd_nxt
97 * back to (closed) window. We will enter persist
98 * state below. If the window didn't close completely,
dd0ed0b1 99 * just wait for an ACK.
e4af65f3 100 */
449e69b4
MK
101 len = 0;
102 if (win == 0) {
faa26a98
MK
103 tp->t_timer[TCPT_REXMT] = 0;
104 tp->snd_nxt = tp->snd_una;
449e69b4 105 }
faa26a98
MK
106 }
107 if (len > tp->t_maxseg) {
108 len = tp->t_maxseg;
2e5a76f2 109 sendalot = 1;
e4af65f3 110 }
baf677ce
MK
111 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
112 flags &= ~TH_FIN;
e4af65f3 113 win = sbspace(&so->so_rcv);
acad00cc 114
faa26a98
MK
115
116 /*
117 * If our state indicates that FIN should be sent
118 * and we have not yet done so, or we're retransmitting the FIN,
119 * then we need to send.
120 */
121 if (flags & TH_FIN &&
122 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
123 goto send;
acad00cc
MK
124 /*
125 * Send if we owe peer an ACK.
126 */
e4af65f3
MK
127 if (tp->t_flags & TF_ACKNOW)
128 goto send;
faa26a98 129 if (flags & (TH_SYN|TH_RST))
acad00cc 130 goto send;
8ae6c089 131 if (SEQ_GT(tp->snd_up, tp->snd_una))
a6503abf
BJ
132 goto send;
133
8ae6c089 134 /*
7d304adf
MK
135 * Sender silly window avoidance. If connection is idle
136 * and can send all data, a maximum segment,
137 * at least a maximum default-size segment do it,
8ae6c089 138 * or are forced, do it; otherwise don't bother.
18a438b6
MK
139 * If peer's buffer is tiny, then send
140 * when window is at least half open.
eaf69575
MK
141 * If retransmitting (possibly after persist timer forced us
142 * to send into a small window), then must resend.
8ae6c089
BJ
143 */
144 if (len) {
3d92549d 145 if (len == tp->t_maxseg)
8ae6c089 146 goto send;
acad00cc
MK
147 if ((idle || tp->t_flags & TF_NODELAY) &&
148 len + off >= so->so_snd.sb_cc)
8ae6c089
BJ
149 goto send;
150 if (tp->t_force)
151 goto send;
18a438b6
MK
152 if (len >= tp->max_sndwnd / 2)
153 goto send;
eaf69575
MK
154 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
155 goto send;
e4af65f3 156 }
76ee76df 157
a6503abf 158 /*
acad00cc
MK
159 * Compare available window to amount of window
160 * known to peer (as advertised window less
449e69b4
MK
161 * next expected input). If the difference is at least two
162 * max size segments or at least 35% of the maximum possible
163 * window, then want to send a window update to peer.
a6503abf 164 */
a6bbda13
MK
165 if (win > 0) {
166 int adv = win - (tp->rcv_adv - tp->rcv_nxt);
167
449e69b4 168 if (so->so_rcv.sb_cc == 0 && adv >= 2 * tp->t_maxseg)
a6bbda13 169 goto send;
449e69b4 170 if (100 * adv / so->so_rcv.sb_hiwat >= 35)
a6bbda13
MK
171 goto send;
172 }
a6503abf 173
2266a466
BJ
174 /*
175 * TCP window updates are not reliable, rather a polling protocol
176 * using ``persist'' packets is used to insure receipt of window
177 * updates. The three ``states'' for the output side are:
178 * idle not doing retransmits or persists
acad00cc 179 * persisting to move a small or zero window
2266a466
BJ
180 * (re)transmitting and thereby not persisting
181 *
182 * tp->t_timer[TCPT_PERSIST]
183 * is set when we are in persist state.
184 * tp->t_force
185 * is set when we are called to send a persist packet.
186 * tp->t_timer[TCPT_REXMT]
187 * is set when we are retransmitting
188 * The output side is idle when both timers are zero.
189 *
eaf69575
MK
190 * If send window is too small, there is data to transmit, and no
191 * retransmit or persist is pending, then go to persist state.
192 * If nothing happens soon, send when timer expires:
193 * if window is nonzero, transmit what we can,
194 * otherwise force out a byte.
2266a466 195 */
eaf69575
MK
196 if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
197 tp->t_timer[TCPT_PERSIST] == 0) {
2266a466
BJ
198 tp->t_rxtshift = 0;
199 tcp_setpersist(tp);
200 }
201
a6503abf
BJ
202 /*
203 * No reason to send a segment, just return.
204 */
f1b2fa5b 205 return (0);
a6503abf
BJ
206
207send:
208 /*
209 * Grab a header mbuf, attaching a copy of data to
210 * be transmitted, and initialize the header from
211 * the template for sends on this connection.
212 */
60d68e9e
SL
213 MGET(m, M_DONTWAIT, MT_HEADER);
214 if (m == NULL)
8a2f82db 215 return (ENOBUFS);
8129ec6e
MK
216#define MAXLINKHDR 32 /* belongs elsewhere */
217#define DATASPACE (MMAXOFF - (MMINOFF + MAXLINKHDR + sizeof (struct tcpiphdr)))
218 m->m_off = MMINOFF + MAXLINKHDR;
53a5409e 219 m->m_len = sizeof (struct tcpiphdr);
8129ec6e 220 ti = mtod(m, struct tcpiphdr *);
a6503abf 221 if (len) {
3b52afc5
MK
222 if (tp->t_force && len == 1)
223 tcpstat.tcps_sndprobe++;
224 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
225 tcpstat.tcps_sndrexmitpack++;
226 tcpstat.tcps_sndrexmitbyte += len;
227 } else {
228 tcpstat.tcps_sndpack++;
229 tcpstat.tcps_sndbyte += len;
230 }
8129ec6e 231 if (len <= DATASPACE) {
9340d736 232 m_copydata(so->so_snd.sb_mb, off, (int) len,
8129ec6e
MK
233 mtod(m, caddr_t) + sizeof(struct tcpiphdr));
234 m->m_len += len;
235 } else {
9340d736 236 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
8129ec6e
MK
237 if (m->m_next == 0)
238 len = 0;
239 }
3b52afc5
MK
240 } else if (tp->t_flags & TF_ACKNOW)
241 tcpstat.tcps_sndacks++;
242 else if (flags & (TH_SYN|TH_FIN|TH_RST))
243 tcpstat.tcps_sndctrl++;
244 else if (SEQ_GT(tp->snd_up, tp->snd_una))
245 tcpstat.tcps_sndurg++;
246 else
247 tcpstat.tcps_sndwinup++;
248
a6503abf
BJ
249 if (tp->t_template == 0)
250 panic("tcp_output");
f1b2fa5b 251 bcopy((caddr_t)tp->t_template, (caddr_t)ti, sizeof (struct tcpiphdr));
a6503abf
BJ
252
253 /*
254 * Fill in fields, remembering maximum advertised
255 * window for use in delaying messages about window sizes.
faa26a98 256 * If resending a FIN, be sure not to use a new sequence number.
a6503abf 257 */
3d92549d
MK
258 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
259 tp->snd_nxt == tp->snd_max)
faa26a98 260 tp->snd_nxt--;
acad00cc
MK
261 ti->ti_seq = htonl(tp->snd_nxt);
262 ti->ti_ack = htonl(tp->rcv_nxt);
8b5a83bb
BJ
263 /*
264 * Before ESTABLISHED, force sending of initial options
265 * unless TCP set to not do any options.
266 */
acad00cc 267 opt = NULL;
1ccb6fcd 268 if (flags & TH_SYN && (tp->t_flags & TF_NOOPT) == 0) {
8011f5df 269 u_short mss;
99578149 270
99578149 271 mss = MIN(so->so_rcv.sb_hiwat / 2, tcp_mss(tp));
acad00cc
MK
272 if (mss > IP_MSS - sizeof(struct tcpiphdr)) {
273 opt = tcp_initopt;
274 optlen = sizeof (tcp_initopt);
275 *(u_short *)(opt + 2) = htons(mss);
276 }
8b5a83bb 277 }
77a4e3ca 278 if (opt) {
f1b2fa5b 279 m0 = m->m_next;
cce93e4b 280 m->m_next = m_get(M_DONTWAIT, MT_DATA);
0974b45c
BJ
281 if (m->m_next == 0) {
282 (void) m_free(m);
8b5a83bb 283 m_freem(m0);
8a2f82db 284 return (ENOBUFS);
0974b45c
BJ
285 }
286 m->m_next->m_next = m0;
8b5a83bb 287 m0 = m->m_next;
8b5a83bb 288 m0->m_len = optlen;
668cc26d 289 bcopy((caddr_t)opt, mtod(m0, caddr_t), optlen);
8b5a83bb 290 opt = (u_char *)(mtod(m0, caddr_t) + optlen);
8b5a83bb
BJ
291 while (m0->m_len & 0x3) {
292 *opt++ = TCPOPT_EOL;
293 m0->m_len++;
294 }
295 optlen = m0->m_len;
296 ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2;
0974b45c
BJ
297 }
298 ti->ti_flags = flags;
acad00cc
MK
299 /*
300 * Calculate receive window. Don't shrink window,
301 * but avoid silly window syndrome.
302 */
6def2330 303 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
acad00cc 304 win = 0;
8129ec6e
MK
305 if (win > IP_MAXPACKET)
306 win = IP_MAXPACKET;
6def2330
MK
307 if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
308 win = (long)(tp->rcv_adv - tp->rcv_nxt);
dd861483
MK
309 if (win > IP_MAXPACKET)
310 win = IP_MAXPACKET;
acad00cc 311 ti->ti_win = htons((u_short)win);
0974b45c 312 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
8011f5df 313 ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
a6503abf
BJ
314 ti->ti_flags |= TH_URG;
315 } else
316 /*
317 * If no urgent pointer to send, then we pull
318 * the urgent pointer to the left edge of the send window
319 * so that it doesn't drift into the send window on sequence
320 * number wraparound.
321 */
0974b45c 322 tp->snd_up = tp->snd_una; /* drag it along */
02c1608b
BJ
323 /*
324 * If anything to send and we can send it all, set PUSH.
325 * (This will keep happy those implementations which only
5cdc4d65 326 * give data to the user when a buffer fills or a PUSH comes in.)
02c1608b 327 */
02c1608b
BJ
328 if (len && off+len == so->so_snd.sb_cc)
329 ti->ti_flags |= TH_PUSH;
a6503abf
BJ
330
331 /*
332 * Put TCP length in extended header, and then
333 * checksum extended header and data.
334 */
acad00cc
MK
335 if (len + optlen)
336 ti->ti_len = htons((u_short)(sizeof(struct tcphdr) +
337 optlen + len));
9340d736
MK
338 ti->ti_sum = in_cksum(m,
339 (int)(sizeof (struct tcpiphdr) + (int)optlen + len));
0974b45c
BJ
340
341 /*
2266a466 342 * In transmit state, time the transmission and arrange for
eaf69575 343 * the retransmit. In persist state, just set snd_max.
0974b45c 344 */
eaf69575 345 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
eb5508b2
MK
346 tcp_seq startseq = tp->snd_nxt;
347
2266a466 348 /*
8931cb5b 349 * Advance snd_nxt over sequence space of this segment.
2266a466 350 */
faa26a98
MK
351 if (flags & TH_SYN)
352 tp->snd_nxt++;
353 if (flags & TH_FIN) {
2266a466 354 tp->snd_nxt++;
faa26a98
MK
355 tp->t_flags |= TF_SENTFIN;
356 }
2266a466 357 tp->snd_nxt += len;
eb5508b2 358 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
e45e6858 359 tp->snd_max = tp->snd_nxt;
eb5508b2
MK
360 /*
361 * Time this transmission if not a retransmission and
362 * not currently timing anything.
363 */
364 if (tp->t_rtt == 0) {
365 tp->t_rtt = 1;
366 tp->t_rtseq = startseq;
367 tcpstat.tcps_segstimed++;
368 }
369 }
405c9168 370
2266a466 371 /*
eaf69575 372 * Set retransmit timer if not currently set,
6be9a225 373 * and not doing an ack or a keep-alive probe.
7cc62c26
MK
374 * Initial value for retransmit timer is smoothed
375 * round-trip time + 2 * round-trip time variance.
e4af65f3
MK
376 * Initialize shift counter which is used for backoff
377 * of retransmit time.
2266a466
BJ
378 */
379 if (tp->t_timer[TCPT_REXMT] == 0 &&
380 tp->snd_nxt != tp->snd_una) {
a6bbda13
MK
381 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
382 if (tp->t_timer[TCPT_PERSIST]) {
383 tp->t_timer[TCPT_PERSIST] = 0;
384 tp->t_rxtshift = 0;
385 }
2266a466 386 }
6be9a225 387 } else
acad00cc
MK
388 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
389 tp->snd_max = tp->snd_nxt + len;
a6503abf 390
f1dd32da
BJ
391 /*
392 * Trace.
393 */
8931cb5b 394 if (so->so_options & SO_DEBUG)
f1dd32da
BJ
395 tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0);
396
a6503abf
BJ
397 /*
398 * Fill in IP length and desired time to live and
399 * send to IP level.
400 */
8b5a83bb 401 ((struct ip *)ti)->ip_len = sizeof (struct tcpiphdr) + optlen + len;
a6503abf 402 ((struct ip *)ti)->ip_ttl = TCP_TTL;
9d866d2f 403#if BSD>=43
d55475b1
MK
404 error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
405 so->so_options & SO_DONTROUTE);
9d866d2f
MK
406#else
407 error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route,
408 so->so_options & SO_DONTROUTE);
409#endif
83591b9d
MK
410 if (error) {
411 if (error == ENOBUFS) {
412 tcp_quench(tp->t_inpcb);
413 return (0);
414 }
8a2f82db 415 return (error);
83591b9d 416 }
3b52afc5 417 tcpstat.tcps_sndtotal++;
a6503abf
BJ
418
419 /*
420 * Data sent (as far as we can tell).
421 * If this advertises a larger window than any other segment,
4aed14e3 422 * then remember the size of the advertised window.
acad00cc 423 * Any pending ACK has now been sent.
a6503abf 424 */
be43ac7f 425 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
a6503abf 426 tp->rcv_adv = tp->rcv_nxt + win;
0974b45c 427 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
acad00cc 428 if (sendalot)
2266a466 429 goto again;
8a2f82db 430 return (0);
76ee76df 431}
2266a466
BJ
432
433tcp_setpersist(tp)
434 register struct tcpcb *tp;
435{
7cc62c26 436 register t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
2266a466
BJ
437
438 if (tp->t_timer[TCPT_REXMT])
439 panic("tcp_output REXMT");
440 /*
441 * Start/restart persistance timer.
442 */
443 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
7cc62c26 444 t * tcp_backoff[tp->t_rxtshift],
3d92549d
MK
445 TCPTV_PERSMIN, TCPTV_PERSMAX);
446 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
447 tp->t_rxtshift++;
2266a466 448}