/*
 * Copyright (c) 1982, 1986, 1988, 1990 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)tcp_input.c	7.25 (Berkeley) 6/30/90
 *	$Id: tcp_input.c,v 1.8 1994/01/30 01:00:28 davidg Exp $
 */
#include "../net/route.h"
/* Forward declarations for the file-local helpers used by tcp_input(). */
static void tcp_dooptions(struct tcpcb
*, struct mbuf
*, struct tcpiphdr
*);
static void tcp_pulloutofband(struct socket
*, struct tcpiphdr
*, struct mbuf
*);
static void tcp_xmit_timer(struct tcpcb
*);
/* Header-prediction hit counters (debug/statistics only; never read for logic here). */
int tcppredack
; /* XXX debugging: times hdr predict ok for acks */
int tcppreddat
; /* XXX # times header prediction ok for data packets */
/* Scratch copy of an incoming segment's pseudo-header, kept for SO_DEBUG
 * tracing (passed to tcp_trace() further down in this file). */
struct tcpiphdr tcp_saveti
;
/* One-entry PCB lookup cache: the last inpcb matched by tcp_input(),
 * primed to the head of the TCP PCB queue (tcb) — presumably so the
 * first lookup falls through to in_pcblookup(); verify against caller. */
struct inpcb
*tcp_last_inpcb
= &tcb
;
* Insert segment ti into reassembly queue of tcp with
* control block tp. Return TH_FIN if reassembly now includes
* a segment with FIN. The macro form does the common case inline
* (segment is the next to be received on an established connection,
* and the queue is empty), avoiding linkage into and removal
* from the queue and repetition of various conversions.
* Set DELACK for segments received in order, but ack immediately
* when segments are out of order (so fast retransmit can work).
#define TCP_REASS(tp, ti, m, so, flags) { \
if ((ti)->ti_seq == (tp)->rcv_nxt && \
(tp)->seg_next == (struct tcpiphdr *)(tp) && \
(tp)->t_state == TCPS_ESTABLISHED) { \
tp->t_flags |= TF_DELACK; \
(tp)->rcv_nxt += (ti)->ti_len; \
flags = (ti)->ti_flags & TH_FIN; \
tcpstat.tcps_rcvbyte += (ti)->ti_len;\
sbappend(&(so)->so_rcv, (m)); \
(flags) = tcp_reass((tp), (ti), (m)); \
tp->t_flags |= TF_ACKNOW; \
register struct tcpcb
*tp
;
register struct tcpiphdr
*ti
;
register struct tcpiphdr
*q
;
struct socket
*so
= tp
->t_inpcb
->inp_socket
;
* Call with ti==0 after become established to
* force pre-ESTABLISHED data up to user socket.
* Find a segment which begins after this one does.
for (q
= tp
->seg_next
; q
!= (struct tcpiphdr
*)tp
;
q
= (struct tcpiphdr
*)q
->ti_next
)
if (SEQ_GT(q
->ti_seq
, ti
->ti_seq
))
* If there is a preceding segment, it may provide some of
* our data already. If so, drop the data from the incoming
* segment. If it provides all of our data, drop us.
if ((struct tcpiphdr
*)q
->ti_prev
!= (struct tcpiphdr
*)tp
) {
q
= (struct tcpiphdr
*)q
->ti_prev
;
/* conversion to int (in i) handles seq wraparound */
i
= q
->ti_seq
+ q
->ti_len
- ti
->ti_seq
;
tcpstat
.tcps_rcvduppack
++;
tcpstat
.tcps_rcvdupbyte
+= ti
->ti_len
;
q
= (struct tcpiphdr
*)(q
->ti_next
);
tcpstat
.tcps_rcvoopack
++;
tcpstat
.tcps_rcvoobyte
+= ti
->ti_len
;
REASS_MBUF(ti
) = m
; /* XXX */
* While we overlap succeeding segments trim them or,
* if they are completely covered, dequeue them.
while (q
!= (struct tcpiphdr
*)tp
) {
register int i
= (ti
->ti_seq
+ ti
->ti_len
) - q
->ti_seq
;
q
= (struct tcpiphdr
*)q
->ti_next
;
m
= REASS_MBUF((struct tcpiphdr
*)q
->ti_prev
);
* Stick new segment in its place.
* Present data to user, advancing rcv_nxt through
* completed sequence space.
if (TCPS_HAVERCVDSYN(tp
->t_state
) == 0)
if (ti
== (struct tcpiphdr
*)tp
|| ti
->ti_seq
!= tp
->rcv_nxt
)
if (tp
->t_state
== TCPS_SYN_RECEIVED
&& ti
->ti_len
)
tp
->rcv_nxt
+= ti
->ti_len
;
flags
= ti
->ti_flags
& TH_FIN
;
ti
= (struct tcpiphdr
*)ti
->ti_next
;
if (so
->so_state
& SS_CANTRCVMORE
)
sbappend(&so
->so_rcv
, m
);
} while (ti
!= (struct tcpiphdr
*)tp
&& ti
->ti_seq
== tp
->rcv_nxt
);
* TCP input routine, follows pages 65-76 of the
* protocol specification dated September, 1981 very closely.
register struct tcpiphdr
*ti
;
register struct inpcb
*inp
;
register struct tcpcb
*tp
= 0;
int todrop
, acked
, ourfinisacked
, needoutput
= 0;
* Get IP and TCP header together in first mbuf.
* Note: IP leaves IP header in first mbuf.
ti
= mtod(m
, struct tcpiphdr
*);
if (iphlen
> sizeof (struct ip
))
ip_stripoptions(m
, (struct mbuf
*)0);
if (m
->m_len
< sizeof (struct tcpiphdr
)) {
if ((m
= m_pullup(m
, sizeof (struct tcpiphdr
))) == 0) {
ti
= mtod(m
, struct tcpiphdr
*);
* Checksum extended TCP header and data.
tlen
= ((struct ip
*)ti
)->ip_len
;
len
= sizeof (struct ip
) + tlen
;
ti
->ti_next
= ti
->ti_prev
= 0;
ti
->ti_len
= (u_short
)tlen
;
if (ti
->ti_sum
= in_cksum(m
, len
)) {
tcpstat
.tcps_rcvbadsum
++;
* Check that TCP offset makes sense,
* pull out TCP options and adjust length. XXX
if (off
< sizeof (struct tcphdr
) || off
> tlen
) {
tcpstat
.tcps_rcvbadoff
++;
if (off
> sizeof (struct tcphdr
)) {
if (m
->m_len
< sizeof(struct ip
) + off
) {
if ((m
= m_pullup(m
, sizeof (struct ip
) + off
)) == 0) {
ti
= mtod(m
, struct tcpiphdr
*);
om
= m_get(M_DONTWAIT
, MT_DATA
);
om
->m_len
= off
- sizeof (struct tcphdr
);
{ caddr_t op
= mtod(m
, caddr_t
) + sizeof (struct tcpiphdr
);
bcopy(op
, mtod(om
, caddr_t
), (unsigned)om
->m_len
);
m
->m_pkthdr
.len
-= om
->m_len
;
(unsigned)(m
->m_len
-sizeof (struct tcpiphdr
)));
* Convert TCP protocol specific fields to host format.
* Locate pcb for segment.
if (inp
->inp_lport
!= ti
->ti_dport
||
inp
->inp_fport
!= ti
->ti_sport
||
inp
->inp_faddr
.s_addr
!= ti
->ti_src
.s_addr
||
inp
->inp_laddr
.s_addr
!= ti
->ti_dst
.s_addr
) {
inp
= in_pcblookup(&tcb
, ti
->ti_src
, ti
->ti_sport
,
ti
->ti_dst
, ti
->ti_dport
, INPLOOKUP_WILDCARD
);
* If the state is CLOSED (i.e., TCB does not exist) then
* all data in the incoming segment is discarded.
* If the TCB exists but is in CLOSED state, it is embryonic,
* but should either do a listen or a connect soon.
if (tp
->t_state
== TCPS_CLOSED
)
if (so
->so_options
& (SO_DEBUG
|SO_ACCEPTCONN
)) {
if (so
->so_options
& SO_DEBUG
) {
if (so
->so_options
& SO_ACCEPTCONN
) {
* Mark socket as temporary until we're
* committed to keeping it. The code at
* ``drop'' and ``dropwithreset'' check the
* flag dropsocket to see if the temporary
* socket created here should be discarded.
* We mark the socket as discardable until
* we're committed to it below in TCPS_LISTEN.
inp
= (struct inpcb
*)so
->so_pcb
;
inp
->inp_laddr
= ti
->ti_dst
;
inp
->inp_lport
= ti
->ti_dport
;
inp
->inp_options
= ip_srcroute();
tp
->t_state
= TCPS_LISTEN
;
* Segment received on connection.
* Reset idle time and keep-alive timer.
tp
->t_timer
[TCPT_KEEP
] = tcp_keepidle
;
* Process options if not in LISTEN state,
* else do it below (after getting remote address).
if (om
&& tp
->t_state
!= TCPS_LISTEN
) {
tcp_dooptions(tp
, om
, ti
);
* Header prediction: check for the two common cases
* of a uni-directional data xfer. If the packet has
* no control flags, is in-sequence, the window didn't
* change and we're not retransmitting, it's a
* candidate. If the length is zero and the ack moved
* forward, we're the sender side of the xfer. Just
* free the data acked & wake any higher level process
* that was blocked waiting for space. If the length
* is non-zero and the ack didn't move, we're the
* receiver side. If we're getting packets in-order
* (the reassembly queue is empty), add the data to
* the socket buffer and note that we need a delayed ack.
if (tp
->t_state
== TCPS_ESTABLISHED
&&
(tiflags
& (TH_SYN
|TH_FIN
|TH_RST
|TH_URG
|TH_ACK
)) == TH_ACK
&&
ti
->ti_seq
== tp
->rcv_nxt
&&
ti
->ti_win
&& ti
->ti_win
== tp
->snd_wnd
&&
tp
->snd_nxt
== tp
->snd_max
) {
if (SEQ_GT(ti
->ti_ack
, tp
->snd_una
) &&
SEQ_LEQ(ti
->ti_ack
, tp
->snd_max
) &&
tp
->snd_cwnd
>= tp
->snd_wnd
) {
* this is a pure ack for outstanding data.
if (tp
->t_rtt
&& SEQ_GT(ti
->ti_ack
,tp
->t_rtseq
))
acked
= ti
->ti_ack
- tp
->snd_una
;
tcpstat
.tcps_rcvackpack
++;
tcpstat
.tcps_rcvackbyte
+= acked
;
sbdrop(&so
->so_snd
, acked
);
tp
->snd_una
= ti
->ti_ack
;
* If all outstanding data are acked, stop
* retransmit timer, otherwise restart timer
* using current (possibly backed-off) value.
* If process is waiting for space,
* wakeup/selwakeup/signal. If data
* are ready to send, let tcp_output
* decide between more output or persist.
if (tp
->snd_una
== tp
->snd_max
)
tp
->t_timer
[TCPT_REXMT
] = 0;
else if (tp
->t_timer
[TCPT_PERSIST
] == 0)
tp
->t_timer
[TCPT_REXMT
] = tp
->t_rxtcur
;
if (so
->so_snd
.sb_flags
& SB_NOTIFY
)
} else if (ti
->ti_ack
== tp
->snd_una
&&
tp
->seg_next
== (struct tcpiphdr
*)tp
&&
ti
->ti_len
<= sbspace(&so
->so_rcv
)) {
* this is a pure, in-sequence data packet
* with nothing on the reassembly queue and
* we have enough buffer space to take it.
tp
->rcv_nxt
+= ti
->ti_len
;
tcpstat
.tcps_rcvbyte
+= ti
->ti_len
;
* Drop TCP and IP headers then add data
m
->m_data
+= sizeof(struct tcpiphdr
);
m
->m_len
-= sizeof(struct tcpiphdr
);
sbappend(&so
->so_rcv
, m
);
* If this is a small packet, then ACK now - with Nagel
* congestion avoidance sender won't send more until
if ((unsigned)ti
->ti_len
< tp
->t_maxseg
) {
tp
->t_flags
|= TF_ACKNOW
;
tp
->t_flags
|= TF_DELACK
;
* Drop TCP and IP headers; TCP options were dropped above.
m
->m_data
+= sizeof(struct tcpiphdr
);
m
->m_len
-= sizeof(struct tcpiphdr
);
* Calculate amount of space in receive window,
* and then do TCP input processing.
* Receive window is amount of space in rcv queue,
* but not less than advertised window.
win
= sbspace(&so
->so_rcv
);
tp
->rcv_wnd
= max(win
, (int)(tp
->rcv_adv
- tp
->rcv_nxt
));
* If the state is LISTEN then ignore segment if it contains an RST.
* If the segment contains an ACK then it is bad and send a RST.
* If it does not contain a SYN then it is not interesting; drop it.
* Don't bother responding if the destination was a broadcast.
* Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
* tp->iss, and send a segment:
* <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
* Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
* Fill in remote peer address fields if not previously specified.
* Enter SYN_RECEIVED state, and process any other fields of this
register struct sockaddr_in
*sin
;
if ((tiflags
& TH_SYN
) == 0)
if (m
->m_flags
& M_BCAST
)
am
= m_get(M_DONTWAIT
, MT_SONAME
); /* XXX */
am
->m_len
= sizeof (struct sockaddr_in
);
sin
= mtod(am
, struct sockaddr_in
*);
sin
->sin_family
= AF_INET
;
sin
->sin_len
= sizeof(*sin
);
sin
->sin_addr
= ti
->ti_src
;
sin
->sin_port
= ti
->ti_sport
;
if (inp
->inp_laddr
.s_addr
== INADDR_ANY
)
inp
->inp_laddr
= ti
->ti_dst
;
if (in_pcbconnect(inp
, am
)) {
tp
->t_template
= tcp_template(tp
);
if (tp
->t_template
== 0) {
tp
= tcp_drop(tp
, ENOBUFS
);
dropsocket
= 0; /* socket is already gone */
tcp_dooptions(tp
, om
, ti
);
tcp_iss
+= TCP_ISSINCR
/2;
tp
->t_flags
|= TF_ACKNOW
;
tp
->t_state
= TCPS_SYN_RECEIVED
;
tp
->t_timer
[TCPT_KEEP
] = TCPTV_KEEP_INIT
;
dropsocket
= 0; /* committed to socket */
* If the state is SYN_SENT:
* if seg contains an ACK, but not for our SYN, drop the input.
* if seg contains a RST, then drop the connection.
* if seg does not contain SYN, then drop it.
* Otherwise this is an acceptable SYN segment
* initialize tp->rcv_nxt and tp->irs
* if seg contains ack then advance tp->snd_una
* if SYN has been acked change to ESTABLISHED else SYN_RCVD state
* arrange for segment to be acked (eventually)
* continue processing rest of data/controls, beginning with URG
if ((tiflags
& TH_ACK
) &&
(SEQ_LEQ(ti
->ti_ack
, tp
->iss
) ||
SEQ_GT(ti
->ti_ack
, tp
->snd_max
)))
tp
= tcp_drop(tp
, ECONNREFUSED
);
if ((tiflags
& TH_SYN
) == 0)
tp
->snd_una
= ti
->ti_ack
;
if (SEQ_LT(tp
->snd_nxt
, tp
->snd_una
))
tp
->snd_nxt
= tp
->snd_una
;
tp
->t_timer
[TCPT_REXMT
] = 0;
tp
->t_flags
|= TF_ACKNOW
;
if (tiflags
& TH_ACK
&& SEQ_GT(tp
->snd_una
, tp
->iss
)) {
tp
->t_state
= TCPS_ESTABLISHED
;
(void) tcp_reass(tp
, (struct tcpiphdr
*)0,
* if we didn't have to retransmit the SYN,
* use its rtt as our initial srtt & rtt var.
tp
->t_state
= TCPS_SYN_RECEIVED
;
* Advance ti->ti_seq to correspond to first data byte.
* If data, trim to stay within window,
* dropping FIN if necessary.
if ((u_long
)ti
->ti_len
> (u_long
)tp
->rcv_wnd
) {
todrop
= ti
->ti_len
- tp
->rcv_wnd
;
ti
->ti_len
= tp
->rcv_wnd
;
tcpstat
.tcps_rcvpackafterwin
++;
tcpstat
.tcps_rcvbyteafterwin
+= todrop
;
tp
->snd_wl1
= ti
->ti_seq
- 1;
* States other than LISTEN or SYN_SENT.
* First check that at least some bytes of segment are within
* receive window. If segment begins before rcv_nxt,
* drop leading data (and SYN); if nothing left, just ack.
todrop
= tp
->rcv_nxt
- ti
->ti_seq
;
if (todrop
> ti
->ti_len
||
todrop
== ti
->ti_len
&& (tiflags
&TH_FIN
) == 0) {
tcpstat
.tcps_rcvduppack
++;
tcpstat
.tcps_rcvdupbyte
+= ti
->ti_len
;
* If segment is just one to the left of the window,
* check two special cases:
* 1. Don't toss RST in response to 4.2-style keepalive.
* 2. If the only thing to drop is a FIN, we can drop
* it, but check the ACK or we will get into FIN
* wars if our FINs crossed (both CLOSING).
* In either case, send ACK to resynchronize,
* but keep on processing for RST or ACK.
if ((tiflags
& TH_FIN
&& todrop
== ti
->ti_len
+ 1)
|| (tiflags
& TH_RST
&& ti
->ti_seq
== tp
->rcv_nxt
- 1)
tp
->t_flags
|= TF_ACKNOW
;
tcpstat
.tcps_rcvpartduppack
++;
tcpstat
.tcps_rcvpartdupbyte
+= todrop
;
* If new data are received on a connection after the
* user processes are gone, then RST the other end.
if ((so
->so_state
& SS_NOFDREF
) &&
tp
->t_state
> TCPS_CLOSE_WAIT
&& ti
->ti_len
) {
tcpstat
.tcps_rcvafterclose
++;
* If segment ends after window, drop trailing data
* (and PUSH and FIN); if nothing left, just ACK.
todrop
= (ti
->ti_seq
+ti
->ti_len
) - (tp
->rcv_nxt
+tp
->rcv_wnd
);
tcpstat
.tcps_rcvpackafterwin
++;
if (todrop
>= ti
->ti_len
) {
tcpstat
.tcps_rcvbyteafterwin
+= ti
->ti_len
;
* If a new connection request is received
* while in TIME_WAIT, drop the old connection
* and start over if the sequence numbers
* are above the previous ones.
tp
->t_state
== TCPS_TIME_WAIT
&&
SEQ_GT(ti
->ti_seq
, tp
->rcv_nxt
)) {
iss
= tp
->rcv_nxt
+ TCP_ISSINCR
;
* If window is closed can only take segments at
* window edge, and have to drop data and PUSH from
* incoming segments. Continue processing, but
* remember to ack. Otherwise, drop segment
if (tp
->rcv_wnd
== 0 && ti
->ti_seq
== tp
->rcv_nxt
) {
tp
->t_flags
|= TF_ACKNOW
;
tcpstat
.tcps_rcvwinprobe
++;
tcpstat
.tcps_rcvbyteafterwin
+= todrop
;
tiflags
&= ~(TH_PUSH
|TH_FIN
);
* If the RST bit is set examine the state:
* If passive open, return to LISTEN state.
* If active open, inform user that connection was refused.
* ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
* Inform user that connection was reset, and close tcb.
* CLOSING, LAST_ACK, TIME_WAIT STATES
if (tiflags
&TH_RST
) switch (tp
->t_state
) {
so
->so_error
= ECONNREFUSED
;
so
->so_error
= ECONNRESET
;
tp
->t_state
= TCPS_CLOSED
;
* If a SYN is in the window, then this is an
* error and we send an RST and drop the connection.
tp
= tcp_drop(tp
, ECONNRESET
);
* If the ACK bit is off we drop the segment and return.
if ((tiflags
& TH_ACK
) == 0)
* In SYN_RECEIVED state if the ack ACKs our SYN then enter
* ESTABLISHED state and continue processing, otherwise
if (SEQ_GT(tp
->snd_una
, ti
->ti_ack
) ||
SEQ_GT(ti
->ti_ack
, tp
->snd_max
))
tp
->t_state
= TCPS_ESTABLISHED
;
(void) tcp_reass(tp
, (struct tcpiphdr
*)0, (struct mbuf
*)0);
tp
->snd_wl1
= ti
->ti_seq
- 1;
* In ESTABLISHED state: drop duplicate ACKs; ACK out of range
* ACKs. If the ack is in the range
* tp->snd_una < ti->ti_ack <= tp->snd_max
* then advance tp->snd_una to ti->ti_ack and drop
* data from the retransmission queue. If this ACK reflects
* more up to date window information we update our window information.
if (SEQ_LEQ(ti
->ti_ack
, tp
->snd_una
)) {
if (ti
->ti_len
== 0 && ti
->ti_win
== tp
->snd_wnd
) {
tcpstat
.tcps_rcvdupack
++;
* If we have outstanding data (other than
* a window probe), this is a completely
* duplicate ack (ie, window info didn't
* change), the ack is the biggest we've
* seen and we've seen exactly our rexmt
* threshhold of them, assume a packet
* has been dropped and retransmit it.
* Kludge snd_nxt & the congestion
* window so we send only this one
* We know we're losing at the current
* window size so do congestion avoidance
* (set ssthresh to half the current window
* and pull our congestion window back to
* Dup acks mean that packets have left the
* network (they're now cached at the receiver)
* so bump cwnd by the amount in the receiver
* to keep a constant cwnd packets in the
if (tp
->t_timer
[TCPT_REXMT
] == 0 ||
ti
->ti_ack
!= tp
->snd_una
)
else if (++tp
->t_dupacks
== tcprexmtthresh
) {
tcp_seq onxt
= tp
->snd_nxt
;
min(tp
->snd_wnd
, tp
->snd_cwnd
) / 2 /
tp
->snd_ssthresh
= win
* tp
->t_maxseg
;
tp
->t_timer
[TCPT_REXMT
] = 0;
tp
->snd_nxt
= ti
->ti_ack
;
tp
->snd_cwnd
= tp
->t_maxseg
;
tp
->snd_cwnd
= tp
->snd_ssthresh
+
tp
->t_maxseg
* tp
->t_dupacks
;
if (SEQ_GT(onxt
, tp
->snd_nxt
))
} else if (tp
->t_dupacks
> tcprexmtthresh
) {
tp
->snd_cwnd
+= tp
->t_maxseg
;
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
if (tp
->t_dupacks
> tcprexmtthresh
&&
tp
->snd_cwnd
> tp
->snd_ssthresh
)
tp
->snd_cwnd
= tp
->snd_ssthresh
;
if (SEQ_GT(ti
->ti_ack
, tp
->snd_max
)) {
tcpstat
.tcps_rcvacktoomuch
++;
acked
= ti
->ti_ack
- tp
->snd_una
;
tcpstat
.tcps_rcvackpack
++;
tcpstat
.tcps_rcvackbyte
+= acked
;
* If transmit timer is running and timed sequence
* number was acked, update smoothed round trip time.
* Since we now have an rtt measurement, cancel the
* timer backoff (cf., Phil Karn's retransmit alg.).
* Recompute the initial retransmit timer.
if (tp
->t_rtt
&& SEQ_GT(ti
->ti_ack
, tp
->t_rtseq
))
* If all outstanding data is acked, stop retransmit
* timer and remember to restart (more output or persist).
* If there is more data to be acked, restart retransmit
* timer, using current (possibly backed-off) value.
if (ti
->ti_ack
== tp
->snd_max
) {
tp
->t_timer
[TCPT_REXMT
] = 0;
} else if (tp
->t_timer
[TCPT_PERSIST
] == 0)
tp
->t_timer
[TCPT_REXMT
] = tp
->t_rxtcur
;
* When new data is acked, open the congestion window.
* If the window gives us less than ssthresh packets
* in flight, open exponentially (maxseg per packet).
* Otherwise open linearly: maxseg per window
* (maxseg^2 / cwnd per packet), plus a constant
* fraction of a packet (maxseg/8) to help larger windows
register u_int cw
= tp
->snd_cwnd
;
register u_int incr
= tp
->t_maxseg
;
if (cw
> tp
->snd_ssthresh
)
incr
= incr
* incr
/ cw
+ incr
/ 8;
tp
->snd_cwnd
= min(cw
+ incr
, TCP_MAXWIN
);
if (acked
> so
->so_snd
.sb_cc
) {
tp
->snd_wnd
-= so
->so_snd
.sb_cc
;
sbdrop(&so
->so_snd
, (int)so
->so_snd
.sb_cc
);
sbdrop(&so
->so_snd
, acked
);
if (so
->so_snd
.sb_flags
& SB_NOTIFY
)
tp
->snd_una
= ti
->ti_ack
;
if (SEQ_LT(tp
->snd_nxt
, tp
->snd_una
))
tp
->snd_nxt
= tp
->snd_una
;
* In FIN_WAIT_1 STATE in addition to the processing
* for the ESTABLISHED state if our FIN is now acknowledged
* If we can't receive any more
* data, then closing user can proceed.
* Starting the timer is contrary to the
* specification, but if we don't get a FIN
if (so
->so_state
& SS_CANTRCVMORE
) {
tp
->t_timer
[TCPT_2MSL
] = tcp_maxidle
;
tp
->t_state
= TCPS_FIN_WAIT_2
;
* In CLOSING STATE in addition to the processing for
* the ESTABLISHED state if the ACK acknowledges our FIN
* then enter the TIME-WAIT state, otherwise ignore
tp
->t_state
= TCPS_TIME_WAIT
;
tp
->t_timer
[TCPT_2MSL
] = 2 * TCPTV_MSL
;
* In LAST_ACK, we may still be waiting for data to drain
* and/or to be acked, as well as for the ack of our FIN.
* If our FIN is now acknowledged, delete the TCB,
* enter the closed state and return.
* In TIME_WAIT state the only thing that should arrive
* is a retransmission of the remote FIN. Acknowledge
* it and restart the finack timer.
tp
->t_timer
[TCPT_2MSL
] = 2 * TCPTV_MSL
;
* Update window information.
* Don't look at window if no ACK: TAC's send garbage on first SYN.
if ((tiflags
& TH_ACK
) &&
(SEQ_LT(tp
->snd_wl1
, ti
->ti_seq
) || tp
->snd_wl1
== ti
->ti_seq
&&
(SEQ_LT(tp
->snd_wl2
, ti
->ti_ack
) ||
tp
->snd_wl2
== ti
->ti_ack
&& ti
->ti_win
> tp
->snd_wnd
))) {
/* keep track of pure window updates */
tp
->snd_wl2
== ti
->ti_ack
&& ti
->ti_win
> tp
->snd_wnd
)
tcpstat
.tcps_rcvwinupd
++;
tp
->snd_wnd
= ti
->ti_win
;
tp
->snd_wl1
= ti
->ti_seq
;
tp
->snd_wl2
= ti
->ti_ack
;
if (tp
->snd_wnd
> tp
->max_sndwnd
)
tp
->max_sndwnd
= tp
->snd_wnd
;
* Process segments with URG.
if ((tiflags
& TH_URG
) && ti
->ti_urp
&&
TCPS_HAVERCVDFIN(tp
->t_state
) == 0) {
* This is a kludge, but if we receive and accept
* random urgent pointers, we'll crash in
* soreceive. It's hard to imagine someone
* actually wanting to send this much urgent data.
if (ti
->ti_urp
+ so
->so_rcv
.sb_cc
> SB_MAX
) {
ti
->ti_urp
= 0; /* XXX */
tiflags
&= ~TH_URG
; /* XXX */
* If this segment advances the known urgent pointer,
* then mark the data stream. This should not happen
* in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
* a FIN has been received from the remote side.
* In these states we ignore the URG.
* According to RFC961 (Assigned Protocols),
* the urgent pointer points to the last octet
* of urgent data. We continue, however,
* to consider it to indicate the first octet
* of data past the urgent section as the original
* spec states (in one of two places).
if (SEQ_GT(ti
->ti_seq
+ti
->ti_urp
, tp
->rcv_up
)) {
tp
->rcv_up
= ti
->ti_seq
+ ti
->ti_urp
;
so
->so_oobmark
= so
->so_rcv
.sb_cc
+
(tp
->rcv_up
- tp
->rcv_nxt
) - 1;
so
->so_state
|= SS_RCVATMARK
;
tp
->t_oobflags
&= ~(TCPOOB_HAVEDATA
| TCPOOB_HADDATA
);
* Remove out of band data so doesn't get presented to user.
* This can happen independent of advancing the URG pointer,
* but if two URG's are pending at once, some out-of-band
* data may creep in... ick.
if ((u_long
)ti
->ti_urp
<= (u_long
)ti
->ti_len
&& (so
->so_options
& SO_OOBINLINE
) == 0
tcp_pulloutofband(so
, ti
, m
);
* If no out of band data is expected,
* pull receive urgent pointer along
* with the receive window.
if (SEQ_GT(tp
->rcv_nxt
, tp
->rcv_up
))
tp
->rcv_up
= tp
->rcv_nxt
;
* Process the segment text, merging it into the TCP sequencing queue,
* and arranging for acknowledgment of receipt if necessary.
* This process logically involves adjusting tp->rcv_wnd as data
* is presented to the user (this happens in tcp_usrreq.c,
* case PRU_RCVD). If a FIN has already been received on this
* connection then we just ignore the text.
if ((ti
->ti_len
|| (tiflags
&TH_FIN
)) &&
TCPS_HAVERCVDFIN(tp
->t_state
) == 0) {
TCP_REASS(tp
, ti
, m
, so
, tiflags
);
* Note the amount of data that peer has sent into
* our window, in order to estimate the sender's
len
= so
->so_rcv
.sb_hiwat
- (tp
->rcv_adv
- tp
->rcv_nxt
);
* If FIN is received ACK the FIN and let the user know
* that the connection is closing.
if (TCPS_HAVERCVDFIN(tp
->t_state
) == 0) {
tp
->t_flags
|= TF_ACKNOW
;
* In SYN_RECEIVED and ESTABLISHED STATES
* enter the CLOSE_WAIT state.
tp
->t_state
= TCPS_CLOSE_WAIT
;
* If still in FIN_WAIT_1 STATE FIN has not been acked so
* enter the CLOSING state.
tp
->t_state
= TCPS_CLOSING
;
* In FIN_WAIT_2 state enter the TIME_WAIT state,
* starting the time-wait timer, turning off the other
tp
->t_state
= TCPS_TIME_WAIT
;
tp
->t_timer
[TCPT_2MSL
] = 2 * TCPTV_MSL
;
* In TIME_WAIT state restart the 2 MSL time_wait timer.
tp
->t_timer
[TCPT_2MSL
] = 2 * TCPTV_MSL
;
if (so
->so_options
& SO_DEBUG
)
tcp_trace(TA_INPUT
, ostate
, tp
, &tcp_saveti
, 0);
* If this is a small packet, then ACK now - with Nagel
* congestion avoidance sender won't send more until
if (ti
->ti_len
&& ((unsigned)ti
->ti_len
< tp
->t_maxseg
))
tp
->t_flags
|= TF_ACKNOW
;
* Return any desired output.
if (needoutput
|| (tp
->t_flags
& TF_ACKNOW
))
* Generate an ACK dropping incoming segment if it occupies
* sequence space, where the ACK reflects our state.
tp
->t_flags
|= TF_ACKNOW
;
* Generate a RST, dropping incoming segment.
* Make ACK acceptable to originator of segment.
* Don't bother to respond if destination was broadcast.
if ((tiflags
& TH_RST
) || m
->m_flags
& M_BCAST
)
tcp_respond(tp
, ti
, m
, (tcp_seq
)0, ti
->ti_ack
, TH_RST
);
tcp_respond(tp
, ti
, m
, ti
->ti_seq
+ti
->ti_len
, (tcp_seq
)0,
/* destroy temporarily created socket */
* Drop space held by incoming segment and return.
if (tp
&& (tp
->t_inpcb
->inp_socket
->so_options
& SO_DEBUG
))
tcp_trace(TA_DROP
, ostate
, tp
, &tcp_saveti
, 0);
/* destroy temporarily created socket */
tcp_dooptions(tp
, om
, ti
)
for (; cnt
> 0; cnt
-= optlen
, cp
+= optlen
) {
if (!(ti
->ti_flags
& TH_SYN
))
bcopy((char *) cp
+ 2, (char *) &mss
, sizeof(mss
));
(void) tcp_mss(tp
, mss
); /* sets t_maxseg */
* Pull out of band byte out of a segment so
* it doesn't appear in the user's data queue.
* It is still reflected in the segment length for
tcp_pulloutofband(so
, ti
, m
)
int cnt
= ti
->ti_urp
- 1;
char *cp
= mtod(m
, caddr_t
) + cnt
;
struct tcpcb
*tp
= sototcpcb(so
);
tp
->t_oobflags
|= TCPOOB_HAVEDATA
;
bcopy(cp
+1, cp
, (unsigned)(m
->m_len
- cnt
- 1));
panic("tcp_pulloutofband");
* Collect new round-trip time estimate
* and update averages and current timeout.
register struct tcpcb
*tp
;
tcpstat
.tcps_rttupdated
++;
* srtt is stored as fixed point with 3 bits after the
* binary point (i.e., scaled by 8). The following magic
* is equivalent to the smoothing algorithm in rfc793 with
* an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
* point). Adjust t_rtt to origin 0.
delta
= tp
->t_rtt
- 1 - (tp
->t_srtt
>> TCP_RTT_SHIFT
);
if ((tp
->t_srtt
+= delta
) <= 0)
* We accumulate a smoothed rtt variance (actually, a
* smoothed mean difference), then set the retransmit
* timer to smoothed rtt + 4 times the smoothed variance.
* rttvar is stored as fixed point with 2 bits after the
* binary point (scaled by 4). The following is
* equivalent to rfc793 smoothing with an alpha of .75
* (rttvar = rttvar*3/4 + |delta| / 4). This replaces
* rfc793's wired-in beta.
delta
-= (tp
->t_rttvar
>> TCP_RTTVAR_SHIFT
);
if ((tp
->t_rttvar
+= delta
) <= 0)
* No rtt measurement yet - use the unsmoothed rtt.
* Set the variance to half the rtt (so our first
* retransmit happens at 2*rtt)
tp
->t_srtt
= tp
->t_rtt
<< TCP_RTT_SHIFT
;
tp
->t_rttvar
= tp
->t_rtt
<< (TCP_RTTVAR_SHIFT
- 1);
* the retransmit should happen at rtt + 4 * rttvar.
* Because of the way we do the smoothing, srtt and rttvar
* will each average +1/2 tick of bias. When we compute
* the retransmit timer, we want 1/2 tick of rounding and
* 1 extra tick because of +-1/2 tick uncertainty in the
* firing of the timer. The bias will give us exactly the
* 1.5 tick we need. But, because the bias is
* statistical, we have to test that we don't drop below
* the minimum feasible timer (which is 2 ticks).
TCPT_RANGESET(tp
->t_rxtcur
, TCP_REXMTVAL(tp
),
tp
->t_rttmin
, TCPTV_REXMTMAX
);
* We received an ack for a packet that wasn't retransmitted;
* it is probably safe to discard any error indications we've
* received recently. This isn't quite right, but close enough
* for now (a route might have failed after we sent a segment,
* and the return path might not be symmetrical).
* Determine a reasonable value for maxseg size.
* If the route is known, check route for mtu.
* If none, use an mss that can be handled on the outgoing
* interface without forcing IP to fragment; if bigger than
* an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
* to utilize large mbufs. If no route is found, route has no mtu,
* or the destination isn't local, use a default, hopefully conservative
* size (usually 512 or the default IP max size, but no more than the mtu
* of the interface), as we can't discover anything about intervening
* gateways or networks. We also initialize the congestion/slow start
* window to be a single segment if the destination isn't local.
* While looking at the routing entry, we also initialize other path-dependent
* parameters from pre-set or cached values in the routing entry.
register struct tcpcb
*tp
;
register struct rtentry
*rt
;
extern int tcp_mssdflt
, tcp_rttdflt
;
if ((rt
= ro
->ro_rt
) == (struct rtentry
*)0) {
/* No route yet, so try to acquire one */
if (inp
->inp_faddr
.s_addr
!= INADDR_ANY
) {
ro
->ro_dst
.sa_family
= AF_INET
;
ro
->ro_dst
.sa_len
= sizeof(ro
->ro_dst
);
((struct sockaddr_in
*) &ro
->ro_dst
)->sin_addr
=
if ((rt
= ro
->ro_rt
) == (struct rtentry
*)0)
#ifdef RTV_MTU /* if route characteristics exist ... */
* While we're here, check if there's an initial rtt
* or rttvar. Convert from the route-table units
* to scaled multiples of the slow timeout timer.
if (tp
->t_srtt
== 0 && (rtt
= rt
->rt_rmx
.rmx_rtt
)) {
if (rt
->rt_rmx
.rmx_locks
& RTV_MTU
)
tp
->t_rttmin
= rtt
/ (RTM_RTTUNIT
/ PR_SLOWHZ
);
tp
->t_srtt
= rtt
/ (RTM_RTTUNIT
/ (PR_SLOWHZ
* TCP_RTT_SCALE
));
if (rt
->rt_rmx
.rmx_rttvar
)
tp
->t_rttvar
= rt
->rt_rmx
.rmx_rttvar
/
(RTM_RTTUNIT
/ (PR_SLOWHZ
* TCP_RTTVAR_SCALE
));
/* default variation is +- 1 rtt */
tp
->t_srtt
* TCP_RTTVAR_SCALE
/ TCP_RTT_SCALE
;
TCPT_RANGESET(tp
->t_rxtcur
,
((tp
->t_srtt
>> 2) + tp
->t_rttvar
) >> 1,
tp
->t_rttmin
, TCPTV_REXMTMAX
);
* if there's an mtu associated with the route, use it
mss
= rt
->rt_rmx
.rmx_mtu
- sizeof(struct tcpiphdr
);
mss
= ifp
->if_mtu
- sizeof(struct tcpiphdr
);
#if (MCLBYTES & (MCLBYTES - 1)) == 0
mss
= mss
/ MCLBYTES
* MCLBYTES
;
if (!in_localaddr(inp
->inp_faddr
))
mss
= min(mss
, tcp_mssdflt
);
* The current mss, t_maxseg, is initialized to the default value.
* If we compute a smaller value, reduce the current mss.
* If we compute a larger value, return it for use in sending
* a max seg size option, but don't store it for use
* unless we received an offer at least that large from peer.
* However, do not accept offers under 32 bytes.
mss
= max(mss
, 32); /* sanity */
if (mss
< tp
->t_maxseg
|| offer
!= 0) {
* If there's a pipesize, change the socket buffer
* to that size. Make the socket buffers an integral
* number of mss units; if the mss is larger than
* the socket buffer, decrease the mss.
if ((bufsize
= rt
->rt_rmx
.rmx_sendpipe
) == 0)
bufsize
= so
->so_snd
.sb_hiwat
;
bufsize
= min(bufsize
, SB_MAX
) / mss
* mss
;
(void) sbreserve(&so
->so_snd
, bufsize
);
if ((bufsize
= rt
->rt_rmx
.rmx_recvpipe
) == 0)
bufsize
= so
->so_rcv
.sb_hiwat
;
bufsize
= min(bufsize
, SB_MAX
) / mss
* mss
;
(void) sbreserve(&so
->so_rcv
, bufsize
);
if (rt
->rt_rmx
.rmx_ssthresh
) {
* There's some sort of gateway or interface
* buffer limit on the path. Use this to set
* the slow start threshhold, but set the
* threshold to no less than 2*mss.
tp
->snd_ssthresh
= max(2 * mss
, rt
->rt_rmx
.rmx_ssthresh
);