fix profiling
[unix-history] / usr / src / sys / netinet / tcp_subr.c
CommitLineData
8ae0e4b4 1/*
33042259 2 * Copyright (c) 1982, 1986, 1988, 1990 Regents of the University of California.
2b6b6284 3 * All rights reserved.
8ae0e4b4 4 *
dbf0c423 5 * %sccs.include.redist.c%
2b6b6284 6 *
69d96ae2 7 * @(#)tcp_subr.c 7.25 (Berkeley) %G%
8ae0e4b4 8 */
ecaa4e6f 9
85ee91bb
KS
10#include <sys/param.h>
11#include <sys/proc.h>
12#include <sys/systm.h>
13#include <sys/malloc.h>
14#include <sys/mbuf.h>
15#include <sys/socket.h>
16#include <sys/socketvar.h>
17#include <sys/protosw.h>
18#include <sys/errno.h>
5548a02f
KB
19
20#include <net/route.h>
21#include <net/if.h>
22
23#include <netinet/in.h>
24#include <netinet/in_systm.h>
25#include <netinet/ip.h>
26#include <netinet/in_pcb.h>
27#include <netinet/ip_var.h>
28#include <netinet/ip_icmp.h>
29#include <netinet/tcp.h>
30#include <netinet/tcp_fsm.h>
31#include <netinet/tcp_seq.h>
32#include <netinet/tcp_timer.h>
33#include <netinet/tcp_var.h>
34#include <netinet/tcpip.h>
ecaa4e6f 35
33042259 36/* patchable/settable parameters for tcp */
10604dba 37int tcp_ttl = TCP_TTL;
33042259
MK
38int tcp_mssdflt = TCP_MSS;
39int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
69d96ae2 40int tcp_do_rfc1323 = 1;
33042259
MK
41
42extern struct inpcb *tcp_last_inpcb;
10604dba 43
ecaa4e6f
BJ
44/*
45 * Tcp initialization
46 */
47tcp_init()
48{
49
50 tcp_iss = 1; /* wrong */
51 tcb.inp_next = tcb.inp_prev = &tcb;
9d91b170
MK
52 if (max_protohdr < sizeof(struct tcpiphdr))
53 max_protohdr = sizeof(struct tcpiphdr);
54 if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN)
55 panic("tcp_init");
ecaa4e6f
BJ
56}
57
58/*
59 * Create template to be used to send tcp packets on a connection.
60 * Call after host entry created, allocates an mbuf and fills
61 * in a skeletal tcp/ip header, minimizing the amount of work
62 * necessary when the connection is used.
63 */
64struct tcpiphdr *
65tcp_template(tp)
66 struct tcpcb *tp;
67{
68 register struct inpcb *inp = tp->t_inpcb;
69 register struct mbuf *m;
70 register struct tcpiphdr *n;
71
ece01391 72 if ((n = tp->t_template) == 0) {
9f5105e3 73 m = m_get(M_DONTWAIT, MT_HEADER);
ece01391
MK
74 if (m == NULL)
75 return (0);
ece01391
MK
76 m->m_len = sizeof (struct tcpiphdr);
77 n = mtod(m, struct tcpiphdr *);
78 }
ecaa4e6f
BJ
79 n->ti_next = n->ti_prev = 0;
80 n->ti_x1 = 0;
81 n->ti_pr = IPPROTO_TCP;
82 n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip));
83 n->ti_src = inp->inp_laddr;
84 n->ti_dst = inp->inp_faddr;
85 n->ti_sport = inp->inp_lport;
86 n->ti_dport = inp->inp_fport;
87 n->ti_seq = 0;
0974b45c 88 n->ti_ack = 0;
ecaa4e6f
BJ
89 n->ti_x2 = 0;
90 n->ti_off = 5;
91 n->ti_flags = 0;
92 n->ti_win = 0;
93 n->ti_sum = 0;
94 n->ti_urp = 0;
95 return (n);
96}
97
98/*
405c9168 99 * Send a single message to the TCP at address specified by
33042259 100 * the given TCP/IP header. If m == 0, then we make a copy
405c9168
BJ
101 * of the tcpiphdr at ti and send directly to the addressed host.
102 * This is used to force keep alive messages out using the TCP
103 * template for a connection tp->t_template. If flags are given
104 * then we send a message back to the TCP which originated the
105 * segment ti, and discard the mbuf containing it and any other
106 * attached mbufs.
107 *
108 * In any case the ack and sequence number of the transmitted
109 * segment are as specified by the parameters.
ecaa4e6f 110 */
9d91b170 111tcp_respond(tp, ti, m, ack, seq, flags)
8e65fd66 112 struct tcpcb *tp;
ecaa4e6f 113 register struct tcpiphdr *ti;
9d91b170 114 register struct mbuf *m;
0974b45c 115 tcp_seq ack, seq;
ecaa4e6f
BJ
116 int flags;
117{
37a28d38
MK
118 register int tlen;
119 int win = 0;
c124e997 120 struct route *ro = 0;
ecaa4e6f 121
c124e997 122 if (tp) {
8e65fd66 123 win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
c124e997
SL
124 ro = &tp->t_inpcb->inp_route;
125 }
9d91b170
MK
126 if (m == 0) {
127 m = m_gethdr(M_DONTWAIT, MT_HEADER);
5cdc4d65 128 if (m == NULL)
405c9168 129 return;
eeef4ac3
MK
130#ifdef TCP_COMPAT_42
131 tlen = 1;
132#else
133 tlen = 0;
134#endif
9d91b170 135 m->m_data += max_linkhdr;
405c9168
BJ
136 *mtod(m, struct tcpiphdr *) = *ti;
137 ti = mtod(m, struct tcpiphdr *);
138 flags = TH_ACK;
139 } else {
140 m_freem(m->m_next);
141 m->m_next = 0;
9d91b170 142 m->m_data = (caddr_t)ti;
405c9168 143 m->m_len = sizeof (struct tcpiphdr);
33042259 144 tlen = 0;
0974b45c 145#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
405c9168
BJ
146 xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, u_long);
147 xchg(ti->ti_dport, ti->ti_sport, u_short);
ecaa4e6f 148#undef xchg
405c9168 149 }
37a28d38
MK
150 ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + tlen));
151 tlen += sizeof (struct tcpiphdr);
152 m->m_len = tlen;
153 m->m_pkthdr.len = tlen;
154 m->m_pkthdr.rcvif = (struct ifnet *) 0;
0974b45c
BJ
155 ti->ti_next = ti->ti_prev = 0;
156 ti->ti_x1 = 0;
2c48b3f8
BJ
157 ti->ti_seq = htonl(seq);
158 ti->ti_ack = htonl(ack);
0974b45c
BJ
159 ti->ti_x2 = 0;
160 ti->ti_off = sizeof (struct tcphdr) >> 2;
ecaa4e6f 161 ti->ti_flags = flags;
69d96ae2
AC
162 if (tp)
163 ti->ti_win = htons((u_short) (win >> tp->rcv_scale));
164 else
165 ti->ti_win = htons((u_short)win);
8e65fd66 166 ti->ti_urp = 0;
69d96ae2 167 ti->ti_sum = 0;
37a28d38
MK
168 ti->ti_sum = in_cksum(m, tlen);
169 ((struct ip *)ti)->ip_len = tlen;
10604dba 170 ((struct ip *)ti)->ip_ttl = tcp_ttl;
c124e997 171 (void) ip_output(m, (struct mbuf *)0, ro, 0);
ecaa4e6f 172}
a6503abf 173
0974b45c
BJ
174/*
175 * Create a new TCP control block, making an
176 * empty reassembly queue and hooking it to the argument
177 * protocol control block.
178 */
a6503abf
BJ
179struct tcpcb *
180tcp_newtcpcb(inp)
181 struct inpcb *inp;
182{
a6503abf 183 register struct tcpcb *tp;
a6503abf 184
69d96ae2
AC
185 tp = malloc(sizeof(*tp), M_PCB, M_NOWAIT);
186 if (tp == NULL)
5cdc4d65 187 return ((struct tcpcb *)0);
69d96ae2 188 bzero((char *) tp, sizeof(struct tcpcb));
a6503abf 189 tp->seg_next = tp->seg_prev = (struct tcpiphdr *)tp;
33042259
MK
190 tp->t_maxseg = tcp_mssdflt;
191
69d96ae2 192 tp->t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
a6503abf 193 tp->t_inpcb = inp;
7cc62c26 194 /*
5ca0b868
MK
195 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
196 * rtt estimate. Set rttvar so that srtt + 2 * rttvar gives
197 * reasonable initial retransmit time.
7cc62c26 198 */
5ca0b868 199 tp->t_srtt = TCPTV_SRTTBASE;
33042259
MK
200 tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << 2;
201 tp->t_rttmin = TCPTV_MIN;
dabb0e53
MK
202 TCPT_RANGESET(tp->t_rxtcur,
203 ((TCPTV_SRTTBASE >> 2) + (TCPTV_SRTTDFLT << 2)) >> 1,
204 TCPTV_MIN, TCPTV_REXMTMAX);
69d96ae2
AC
205 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
206 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
33042259 207 inp->inp_ip.ip_ttl = tcp_ttl;
a6503abf
BJ
208 inp->inp_ppcb = (caddr_t)tp;
209 return (tp);
210}
211
0974b45c
BJ
212/*
213 * Drop a TCP connection, reporting
214 * the specified error. If connection is synchronized,
215 * then send a RST to peer.
216 */
0e3936fa 217struct tcpcb *
a6503abf 218tcp_drop(tp, errno)
0e3936fa 219 register struct tcpcb *tp;
a6503abf
BJ
220 int errno;
221{
222 struct socket *so = tp->t_inpcb->inp_socket;
223
d3504cc0 224 if (TCPS_HAVERCVDSYN(tp->t_state)) {
a6503abf 225 tp->t_state = TCPS_CLOSED;
39d536e6 226 (void) tcp_output(tp);
35f3fc10
MK
227 tcpstat.tcps_drops++;
228 } else
229 tcpstat.tcps_conndrops++;
33042259
MK
230 if (errno == ETIMEDOUT && tp->t_softerror)
231 errno = tp->t_softerror;
a6503abf 232 so->so_error = errno;
0e3936fa 233 return (tcp_close(tp));
a6503abf
BJ
234}
235
0974b45c
BJ
236/*
237 * Close a TCP control block:
238 * discard all space held by the tcp
239 * discard internet protocol block
240 * wake up any sleepers
241 */
0e3936fa 242struct tcpcb *
a6503abf
BJ
243tcp_close(tp)
244 register struct tcpcb *tp;
245{
246 register struct tcpiphdr *t;
364801f5
BJ
247 struct inpcb *inp = tp->t_inpcb;
248 struct socket *so = inp->inp_socket;
13e2480b 249 register struct mbuf *m;
33042259
MK
250#ifdef RTV_RTT
251 register struct rtentry *rt;
a6503abf 252
33042259
MK
253 /*
254 * If we sent enough data to get some meaningful characteristics,
255 * save them in the routing entry. 'Enough' is arbitrarily
1ac2096c 256 * defined as the sendpipesize (default 4K) * 16. This would
33042259
MK
257 * give us 16 rtt samples assuming we only get one sample per
258 * window (the usual case on a long haul net). 16 samples is
259 * enough for the srtt filter to converge to within 5% of the correct
260 * value; fewer samples and we could save a very bogus rtt.
261 *
262 * Don't update the default route's characteristics and don't
263 * update anything that the user "locked".
264 */
1ac2096c 265 if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) &&
33042259 266 (rt = inp->inp_route.ro_rt) &&
1ac2096c 267 ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr != INADDR_ANY) {
33042259
MK
268 register u_long i;
269
270 if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
271 i = tp->t_srtt *
272 (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
273 if (rt->rt_rmx.rmx_rtt && i)
274 /*
275 * filter this update to half the old & half
276 * the new values, converting scale.
277 * See route.h and tcp_var.h for a
278 * description of the scaling constants.
279 */
280 rt->rt_rmx.rmx_rtt =
281 (rt->rt_rmx.rmx_rtt + i) / 2;
282 else
283 rt->rt_rmx.rmx_rtt = i;
284 }
285 if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
286 i = tp->t_rttvar *
287 (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
288 if (rt->rt_rmx.rmx_rttvar && i)
289 rt->rt_rmx.rmx_rttvar =
290 (rt->rt_rmx.rmx_rttvar + i) / 2;
291 else
292 rt->rt_rmx.rmx_rttvar = i;
293 }
294 /*
295 * update the pipelimit (ssthresh) if it has been updated
296 * already or if a pipesize was specified & the threshhold
297 * got below half the pipesize. I.e., wait for bad news
298 * before we start updating, then update on both good
299 * and bad news.
300 */
301 if ((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
302 (i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh ||
303 i < (rt->rt_rmx.rmx_sendpipe / 2)) {
304 /*
305 * convert the limit from user data bytes to
306 * packets then to packet data bytes.
307 */
308 i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
309 if (i < 2)
310 i = 2;
311 i *= (u_long)(tp->t_maxseg + sizeof (struct tcpiphdr));
312 if (rt->rt_rmx.rmx_ssthresh)
313 rt->rt_rmx.rmx_ssthresh =
314 (rt->rt_rmx.rmx_ssthresh + i) / 2;
315 else
316 rt->rt_rmx.rmx_ssthresh = i;
317 }
318 }
319#endif RTV_RTT
320 /* free the reassembly queue, if any */
a6503abf 321 t = tp->seg_next;
13e2480b
SL
322 while (t != (struct tcpiphdr *)tp) {
323 t = (struct tcpiphdr *)t->ti_next;
33042259 324 m = REASS_MBUF((struct tcpiphdr *)t->ti_prev);
13e2480b
SL
325 remque(t->ti_prev);
326 m_freem(m);
327 }
0974b45c 328 if (tp->t_template)
a6503abf 329 (void) m_free(dtom(tp->t_template));
69d96ae2 330 free(tp, M_PCB);
364801f5 331 inp->inp_ppcb = 0;
4aed14e3 332 soisdisconnected(so);
33042259
MK
333 /* clobber input pcb cache if we're closing the cached connection */
334 if (inp == tcp_last_inpcb)
335 tcp_last_inpcb = &tcb;
86676257 336 in_pcbdetach(inp);
35f3fc10 337 tcpstat.tcps_closed++;
0e3936fa 338 return ((struct tcpcb *)0);
a6503abf
BJ
339}
340
a6503abf
BJ
/*
 * Placeholder: performs no work.
 */
tcp_drain()
{
}
345
be841dc3
MK
346/*
347 * Notify a tcp user of an asynchronous error;
33042259
MK
348 * store error as soft error, but wake up user
349 * (for now, won't do anything until can select for soft error).
be841dc3 350 */
33042259 351tcp_notify(inp, error)
ba200b9a 352 struct inpcb *inp;
33042259 353 int error;
be841dc3 354{
ba200b9a
MK
355 register struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
356 register struct socket *so = inp->inp_socket;
be841dc3 357
ba200b9a 358 /*
69d96ae2 359 * Ignore some errors if we are hooked up.
ba200b9a
MK
360 * If connection hasn't completed, has retransmitted several times,
361 * and receives a second error, give up now. This is better
362 * than waiting a long time to establish a connection that
363 * can never complete.
364 */
69d96ae2
AC
365 if (tp->t_state == TCPS_ESTABLISHED &&
366 (error == EHOSTUNREACH || error == ENETUNREACH ||
367 error == EHOSTDOWN)) {
368 return;
369 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
ba200b9a
MK
370 tp->t_softerror)
371 so->so_error = error;
69d96ae2 372 else
ba200b9a
MK
373 tp->t_softerror = error;
374 wakeup((caddr_t) &so->so_timeo);
375 sorwakeup(so);
376 sowwakeup(so);
be841dc3 377}
b1dd4cca
MK
378
379tcp_ctlinput(cmd, sa, ip)
72e4f44e 380 int cmd;
7c626d4d 381 struct sockaddr *sa;
b1dd4cca 382 register struct ip *ip;
a6503abf 383{
b1dd4cca
MK
384 register struct tcphdr *th;
385 extern struct in_addr zeroin_addr;
39674d5f 386 extern u_char inetctlerrmap[];
b1dd4cca 387 int (*notify)() = tcp_notify, tcp_quench();
39674d5f 388
b1dd4cca
MK
389 if (cmd == PRC_QUENCH)
390 notify = tcp_quench;
69d96ae2
AC
391 else if (!PRC_IS_REDIRECT(cmd) &&
392 ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0))
7c626d4d 393 return;
b1dd4cca
MK
394 if (ip) {
395 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
396 in_pcbnotify(&tcb, sa, th->th_dport, ip->ip_src, th->th_sport,
397 cmd, notify);
398 } else
399 in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify);
a6503abf 400}
05586739 401
9d866d2f
MK
#if BSD<43
/*
 * XXX fake routine: compiled only when BSD < 43; ignores inp and
 * does nothing.
 */
tcp_abort(inp)
	struct inpcb *inp;
{
}
#endif
410
05586739
MK
411/*
412 * When a source quench is received, close congestion window
2e5a76f2 413 * to one segment. We will gradually open it again as we proceed.
05586739
MK
414 */
415tcp_quench(inp)
416 struct inpcb *inp;
417{
418 struct tcpcb *tp = intotcpcb(inp);
419
7c626d4d 420 if (tp)
2e5a76f2 421 tp->snd_cwnd = tp->t_maxseg;
05586739 422}