From 2cb6350910d5a35f07bc076fabac8207477913d5 Mon Sep 17 00:00:00 2001 From: Garrett Wollman Date: Thu, 18 Nov 1993 00:08:23 +0000 Subject: [PATCH] Added (disabled) initial support for Path MTU Discovery. Updated ICMP implementation to be closer to strict RFC 1122 compliance. --- sys/netinet/icmp_var.h | 4 +- sys/netinet/in.c | 3 +- sys/netinet/in_mtudisc.c | 393 +++++++++++++++++++++++++++++++++++++++ sys/netinet/in_pcb.c | 35 +++- sys/netinet/in_pcb.h | 21 ++- sys/netinet/in_var.h | 17 +- sys/netinet/ip_icmp.c | 168 ++++++++++++++--- sys/netinet/ip_icmp.h | 37 +++- sys/netinet/ip_input.c | 55 ++++-- sys/netinet/tcp_subr.c | 55 +++++- sys/netinet/udp_usrreq.c | 7 +- 11 files changed, 741 insertions(+), 54 deletions(-) create mode 100644 sys/netinet/in_mtudisc.c diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h index eaf426331e..c8142c6da9 100644 --- a/sys/netinet/icmp_var.h +++ b/sys/netinet/icmp_var.h @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * from: @(#)icmp_var.h 7.5 (Berkeley) 6/28/90 - * $Id: icmp_var.h,v 1.2 1993/10/16 18:25:52 rgrimes Exp $ + * $Id: icmp_var.h,v 1.3 1993/11/07 17:47:45 wollman Exp $ */ #ifndef _NETINET_ICMP_VAR_H_ @@ -46,6 +46,8 @@ struct icmpstat { int icps_error; /* # of calls to icmp_error */ int icps_oldshort; /* no error 'cuz old ip too short */ int icps_oldicmp; /* no error 'cuz old was icmp */ + int icps_oldmcast; /* no error 'cuz old was multicast */ + int icps_oldbadaddr; /* no error 'cuz old had bad address */ int icps_outhist[ICMP_MAXTYPE + 1]; /* statistics related to input messages processed */ int icps_badcode; /* icmp_code out of range */ diff --git a/sys/netinet/in.c b/sys/netinet/in.c index 2ded25c348..0366102e21 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -31,10 +31,11 @@ * SUCH DAMAGE. 
* * from: @(#)in.c 7.17 (Berkeley) 4/20/91 - * $Id: in.c,v 1.3 1993/11/07 17:47:49 wollman Exp $ + * $Id: in.c,v 1.4 1993/11/07 22:55:02 wollman Exp $ */ #include "param.h" +#include "systm.h" #include "ioctl.h" #include "mbuf.h" #include "socket.h" diff --git a/sys/netinet/in_mtudisc.c b/sys/netinet/in_mtudisc.c new file mode 100644 index 0000000000..cb52ba1d04 --- /dev/null +++ b/sys/netinet/in_mtudisc.c @@ -0,0 +1,393 @@ +/*- + * Copyright (c) 1993, University of Vermont and State + * Agricultural College. + * Copyright (c) 1993, Garrett A. Wollman. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY AND AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR AUTHORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id$ + */ + +#ifdef MTUDISC + +#include "param.h" +#include "systm.h" +#include "kernel.h" +#include "mbuf.h" +#include "socket.h" +#include "socketvar.h" +#include "in_systm.h" +#include "net/if.h" +#include "net/route.h" +#include "in.h" +#include "in_var.h" +#include "ip.h" +#include "protosw.h" +#include "in_pcb.h" + +#ifdef INET + +/* + * checkpcbs[] lists all the PCB heads that might call on the services + * of MTU discovery. + * This is really bogus 'cuz a ULP needs to both get its entry added here + * /and/ set INP_DISCOVERMTU in each PCB. + */ +extern struct inpcb tcb; /* XXX move to header file */ + +struct inpcb *checkpcbs[] = { + &tcb, + 0 +}; + +/* + * MTUTIMER1 is the number of minutes to wait after having incremented + * the MTU estimate before trying again. MTUTIMER2 is the number + * of minutes to wait after having decremented the MTU estimate + * before trying to increment it. + */ +#ifndef MTUTIMER1 +#define MTUTIMER1 2 +#endif +int in_mtutimer1 = MTUTIMER1; + +#ifndef MTUTIMER2 +#define MTUTIMER2 10 +#endif +int in_mtutimer2 = MTUTIMER2; + + +/* + * Table of likely MTU values, courtesy of RFC 1191. + * This MUST remain in sorted order. + */ +const u_short in_mtus[] = { + 65535, /* maximum */ + 32767, /* convenient power of 2 - 1 */ + 17914, /* 16Mb Token Ring */ + 16383, /* convenient power of 2 - 1 */ + 8166, /* IEEE 802.4 */ + 6288, /* convenient stopping point */ + 4352, /* FDDI */ + 3144, /* convenient stopping point */ + 2002, /* IEEE 802.5 */ + 1492, /* IEEE 802.3 */ + 1006, /* BBN 1822 */ + 508, /* ARCNET */ + 296, /* SLIP, PPP */ + 128 /* minimum we'll accept */ +}; + +#define NMTUS ((sizeof in_mtus)/(sizeof in_mtus[0])) + +/* + * Find the next MTU in the sequence from CURRENT. + * If HIGHER, increase size; else decrease. + * Return of zero means we're stuck. + * NB: We might be called with a CURRENT MTU that's not in the + * table (as, for example, when an ICMP tells us there's a problem + * and reports a max path MTU value). 
+ */ +unsigned +in_nextmtu(unsigned current, int higher) { + int i; + + for(i = 0; i < NMTUS; i++) { + if(in_mtus[i] <= (u_short)current) + break; + } + + if(i == NMTUS) { + if(higher) return in_mtus[NMTUS - 1]; + else return 0; /* error return */ + } + + /* + * Now we know that CURRENT lies somewhere in the interval + * [in_mtus[i], in_mtus[i - 1]). If we want to go higher, + * take in_mtus[i - 1] always. If we want to go lower, we + * must check the lower bound to see if it's equal, and if so, + * take in_mtus[i + 1], unless i == NMTUS - 1, in which case + * we return failure. + * Got that? + */ + if(higher) + return in_mtus[(i >= 1) ? (i - 1) : 0]; + + /* now we know it's lower */ + if(current == in_mtus[i]) { + if(i == NMTUS - 1) + return 0; + else + return in_mtus[i + 1]; + } + + return in_mtus[i]; +} + +/* + * Set up the route to do MTU discovery. This only works for host routes, + * not net routes; in any case, ALL systems should have all IP routes + * marked with RTF_CLONING (and a genmask of zero), which will do the right + * thing, and also arrange for the pre-ARPing code to get called on + * appropriate interfaces. + * + * We also go to some pains to keep listeners on the routing socket aware + * of what's going on when we fiddle the flags or metrics. I don't know + * if this is really necessary or not (or even if we're doing it in the + * right way). + */ +int in_routemtu(struct route *ro) { + if(!ro->ro_rt) + return 0; + + if((ro->ro_rt->rt_flags & (RTF_HOST | RTF_UP)) != (RTF_HOST | RTF_UP)) + return 0; + + if(ro->ro_rt->rt_rmx.rmx_mtu) { + /* + * Let the user know that we've turned on MTU discovery for this + * route entry. This doesn't do anything at present, but may + * be useful later on.
+ */ + if(!(ro->ro_rt->rt_flags & RTF_PROTO1)) { + ro->ro_rt->rt_flags |= RTF_PROTO1; + } + return 1; + } + + if(ro->ro_rt->rt_ifp && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU)) { + ro->ro_rt->rt_flags |= RTF_PROTO1; + /* + * Subtraction is necessary because the interface's MTU includes + * the interface's own headers. We subtract the header length + * provided and hope for the best. + */ + ro->ro_rt->rt_rmx.rmx_mtu = + ro->ro_rt->rt_ifp->if_mtu - ro->ro_rt->rt_ifp->if_hdrlen; + return 1; + } + return 0; +} + +/* + * Perform the PCB fiddling necessary when the route changes. + * Protect against recursion, since we might get called as a + * result of notifying someone else that the MTU is changing. + */ +void +in_pcbmtu(struct inpcb *inp) { + static int notifying = 0; + static int timerstarted = 0; + unsigned oldmtu = inp->inp_pmtu; + int oldflags = inp->inp_flags; + + if (!timerstarted) { + timeout(in_mtutimer, 0, 60 * hz); + timerstarted = 1; + } + + if (inp->inp_flags & INP_DISCOVERMTU) { + /* + * If no route present, get one. + * If there is one present, but it's marked as being `down', + * try to get another one. + */ + if(!inp->inp_route.ro_rt) + rtalloc(&inp->inp_route); + else if((inp->inp_route.ro_rt->rt_flags & RTF_UP) == 0) { + RTFREE(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = 0; + rtalloc(&inp->inp_route); + } + + if(in_routemtu(&inp->inp_route)) { + inp->inp_flags |= INP_MTUDISCOVERED; + inp->inp_pmtu = inp->inp_route.ro_rt->rt_rmx.rmx_mtu; + inp->inp_ip.ip_off |= IP_DF; + } else { + inp->inp_flags &= ~INP_MTUDISCOVERED; + inp->inp_ip.ip_off &= ~IP_DF; + } + /* + * If nothing has changed since the last value we had, + * don't waste any time notifying everybody that nothing + * has changed. + */ + if(inp->inp_pmtu != oldmtu + || (inp->inp_flags ^ oldflags)) { + notifying = 1; + /* + * If the MTU has decreased, use timer 2. + */ + inp->inp_mtutimer = + (inp->inp_pmtu < oldmtu) ? 
in_mtutimer2 : in_mtutimer1; + in_mtunotify(inp); + notifying = 0; + } + } +} + +/* + * Tell the clients that have the same destination as INP that they + * need to take a new look at the MTU value and flags. + */ +void +in_mtunotify(struct inpcb *inp) { + in_pcbnotify(inp->inp_head, &inp->inp_route.ro_dst, 0, zeroin_addr, + 0, PRC_MTUCHANGED, inp->inp_mtunotify); +} + +/* + * Adjust the MTU listed in the route on the basis of an ICMP + * Unreachable: Need Fragmentation message. + * Note that the PRC_MSGSIZE error is still delivered; this just + * makes the adjustment in the route, and depends on the ULPs which + * are required to translate PRC_MSGSIZE into an in_pcbmtu() which will + * pick up the new size. + */ +void +in_mtureduce(struct in_addr dst, unsigned newsize) { + struct route ro; + + bzero((caddr_t)&ro, sizeof ro); /* no stack garbage in the lookup key */ + ro.ro_dst.sa_family = AF_INET; + ro.ro_dst.sa_len = sizeof ro.ro_dst; + ((struct sockaddr_in *)&ro.ro_dst)->sin_addr = dst; + rtalloc(&ro); + + /* + * If there was no route, just forget about it, can't do anything. + */ + if(!ro.ro_rt) + return; + + /* + * If there was a route, but it's the wrong kind, forget it. + */ + if((ro.ro_rt->rt_flags & (RTF_UP | RTF_HOST)) != (RTF_UP | RTF_HOST)) { + RTFREE(ro.ro_rt); + return; + } + + /* + * If the MTU is locked by some outside agency, forget it. + */ + if(ro.ro_rt->rt_rmx.rmx_locks & RTV_MTU) { + RTFREE(ro.ro_rt); + return; + } + + /* + * If newsize is 0 (router lacks the MTU extension) or is below the + * smallest MTU we accept, ignore it and just go down one step. + */ + if(newsize < in_mtus[NMTUS - 1]) newsize = in_nextmtu(ro.ro_rt->rt_rmx.rmx_mtu, 0); + + if(!newsize) { + ro.ro_rt->rt_rmx.rmx_mtu = 0; /* we can't go any lower */ + RTFREE(ro.ro_rt); + return; + } + /* + * If the new MTU is greater than the old MTU, forget it. (Prevent + * denial-of-service attack.) Don't bother if the new MTU is the + * same as the old one. + */ + if(ro.ro_rt->rt_rmx.rmx_mtu <= newsize) { + RTFREE(ro.ro_rt); + return; + } + + /* + * OK, do it.
+ */ + ro.ro_rt->rt_rmx.rmx_mtu = newsize; + RTFREE(ro.ro_rt); +} + +/* + * Walk through all the PCB lists in checkpcbs[] and decrement the + * timers on the ones still participating in MTU discovery. + * If the timers reach zero, bump the MTU (clamped to the interface + * MTU), assuming the route is still good. + */ +void +in_mtutimer(caddr_t dummy1, int dummy2) { + int i; + struct inpcb *inp; + struct rtentry *rt; + int s = splnet(); + + for(i = 0; checkpcbs[i]; i++) { + inp = checkpcbs[i]; + + while(inp = inp->inp_next) { + if(inp->inp_flags & INP_MTUDISCOVERED) { + if(!inp->inp_route.ro_rt + || !(inp->inp_route.ro_rt->rt_flags & RTF_UP)) { + inp->inp_flags &= ~INP_MTUDISCOVERED; + continue; /* we'll notice it later */ + } + + if(--inp->inp_mtutimer == 0) { + in_bumpmtu(inp); + inp->inp_mtutimer = in_mtutimer1; + if(inp->inp_route.ro_rt->rt_rmx.rmx_rtt + && ((in_mtutimer1 * 60) + > (inp->inp_route.ro_rt->rt_rmx.rmx_rtt / RTM_RTTUNIT))) { + inp->inp_mtutimer = + inp->inp_route.ro_rt->rt_rmx.rmx_rtt / RTM_RTTUNIT; + } + } + } + } + } + splx(s); + timeout(in_mtutimer, (caddr_t)0, 60 * hz); +} + +/* + * Try to increase the MTU and let everyone know that it has changed. + * Must be called with a valid route in inp->inp_route. Probably + * must be at splnet(), too. + */ +void +in_bumpmtu(struct inpcb *inp) { + struct route *ro; + unsigned newmtu; + + ro = &inp->inp_route; + newmtu = in_nextmtu(inp->inp_pmtu, 1); + if(!newmtu) return; /* doing the best we can */ + if(newmtu <= ro->ro_rt->rt_ifp->if_mtu) { + if(!(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU)) { + ro->ro_rt->rt_rmx.rmx_mtu = newmtu; + in_pcbmtu(inp); + } + } +} + +#endif /* INET */ +#endif /* MTUDISC */ diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 0ac55d931f..cc21ead3b1 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
* * from: @(#)in_pcb.c 7.14 (Berkeley) 4/20/91 - * $Id$ + * $Id: in_pcb.c,v 1.2 1993/10/16 18:26:01 rgrimes Exp $ */ #include "param.h" @@ -232,6 +232,13 @@ in_pcbconnect(inp, nam) } inp->inp_faddr = sin->sin_addr; inp->inp_fport = sin->sin_port; +#ifdef MTUDISC + /* + * If the upper layer asked for PMTU discovery services, see + * if we can get an idea of what the MTU should be... + */ + in_pcbmtu(inp); +#endif /* MTUDISC */ return (0); } @@ -241,6 +248,9 @@ in_pcbdisconnect(inp) inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; +#ifdef MTUDISC + inp->inp_flags &= ~INP_MTUDISCOVERED; +#endif if (inp->inp_socket->so_state & SS_NOFDREF) in_pcbdetach(inp); } @@ -301,6 +311,7 @@ in_setpeeraddr(inp, nam) * * Must be called at splnet. */ +void in_pcbnotify(head, dst, fport, laddr, lport, cmd, notify) struct inpcb *head; struct sockaddr *dst; @@ -312,7 +323,6 @@ in_pcbnotify(head, dst, fport, laddr, lport, cmd, notify) struct in_addr faddr; int errno; int in_rtchange(); - extern u_char inetctlerrmap[]; if ((unsigned)cmd > PRC_NCMDS || dst->sa_family != AF_INET) return; @@ -324,14 +334,16 @@ in_pcbnotify(head, dst, fport, laddr, lport, cmd, notify) * Redirects go to all references to the destination, * and use in_rtchange to invalidate the route cache. * Dead host indications: notify all references to the destination. + * MTU change indications: same thing. * Otherwise, if we have knowledge of the local port and address, * deliver only to that socket. */ - if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { + if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD + || cmd == PRC_MTUCHANGED) { fport = 0; lport = 0; laddr.s_addr = 0; - if (cmd != PRC_HOSTDEAD) + if (cmd != PRC_HOSTDEAD && cmd != PRC_MTUCHANGED) notify = in_rtchange; } errno = inetctlerrmap[cmd]; @@ -357,6 +369,7 @@ in_pcbnotify(head, dst, fport, laddr, lport, cmd, notify) * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. 
*/ +void in_losing(inp) struct inpcb *inp; { @@ -372,10 +385,14 @@ in_losing(inp) (struct rtentry **)0); inp->inp_route.ro_rt = 0; rtfree(rt); + +#ifdef MTUDISC /* - * A new route can be allocated - * the next time output is attempted. + * When doing MTU discovery, we want to find out as + * quickly as possible what the MTU of the new route is. */ + in_pcbmtu(inp); +#endif /* MTUDISC */ } } @@ -389,10 +406,14 @@ in_rtchange(inp) if (inp->inp_route.ro_rt) { rtfree(inp->inp_route.ro_rt); inp->inp_route.ro_rt = 0; +#ifdef MTUDISC /* * A new route can be allocated the next time - * output is attempted. + * output is attempted, but make sure to let + * MTU discovery know about it. */ + in_pcbmtu(inp); +#endif /* MTUDISC */ } } diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 55254dc85d..b509e51678 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * from: @(#)in_pcb.h 7.6 (Berkeley) 6/28/90 - * $Id: in_pcb.h,v 1.2 1993/10/16 18:26:03 rgrimes Exp $ + * $Id: in_pcb.h,v 1.3 1993/11/07 17:47:50 wollman Exp $ */ #ifndef _NETINET_IN_PCB_H_ @@ -59,12 +59,24 @@ struct inpcb { int inp_flags; /* generic IP/datagram flags */ struct ip inp_ip; /* header prototype; should have more */ struct mbuf *inp_options; /* IP options */ +#ifdef MTUDISC + int inp_pmtu; /* path mtu if INP_MTUDISCOVERED */ + int inp_mtutimer; /* decremented once a minute to + try to increase mtu */ + int (*inp_mtunotify)(struct inpcb *, int); + /* function to call when MTU may have + * changed */ +#endif /* MTUDISC */ }; /* flags in inp_flags: */ #define INP_RECVOPTS 0x01 /* receive incoming IP options */ #define INP_RECVRETOPTS 0x02 /* receive IP options for reply */ #define INP_RECVDSTADDR 0x04 /* receive IP dst address */ +#ifdef MTUDISC +#define INP_DISCOVERMTU 0x08 /* practice Path MTU discovery */ +#define INP_MTUDISCOVERED 0x10 /* we were able to get such a route */ +#endif /* MTUDISC */ #define INP_CONTROLOPTS 
(INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR) #ifdef sotorawcb @@ -92,5 +104,10 @@ struct raw_inpcb { #ifdef KERNEL struct inpcb *in_pcblookup(); -#endif +#ifdef MTUDISC +extern void in_pcbmtu(struct inpcb *); +extern void in_mtunotify(struct inpcb *); +extern void in_bumpmtu(struct inpcb *); +#endif /* MTUDISC */ +#endif /* KERNEL */ #endif /* _NETINET_IN_PCB_H_ */ diff --git a/sys/netinet/in_var.h b/sys/netinet/in_var.h index b799b0199d..b14dc6ce51 100644 --- a/sys/netinet/in_var.h +++ b/sys/netinet/in_var.h @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * from: @(#)in_var.h 7.6 (Berkeley) 6/28/90 - * $Id: in_var.h,v 1.2 1993/10/16 18:26:07 rgrimes Exp $ + * $Id: in_var.h,v 1.3 1993/11/07 17:47:53 wollman Exp $ */ #ifndef _NETINET_IN_VAR_H_ @@ -77,5 +77,18 @@ struct in_aliasreq { extern struct in_ifaddr *in_ifaddr; struct in_ifaddr *in_iaonnetof(); extern struct ifqueue ipintrq; /* ip packet input queue */ -#endif +extern struct protosw inetsw[]; +extern u_char ip_protox[]; +extern struct in_addr in_makeaddr(); +extern struct in_ifaddr *ifptoia(struct ifnet *); +extern u_char inetctlerrmap[]; +extern struct in_addr zeroin_addr; +#ifdef MTUDISC +struct route; +extern unsigned in_nextmtu(unsigned, int); +extern int in_routemtu(struct route *); +extern void in_mtureduce(struct in_addr, unsigned); +extern void in_mtutimer(caddr_t, int); +#endif /* MTUDISC */ +#endif /* KERNEL */ #endif /* _NETINET_IN_VAR_H_ */ diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c index ba0ee06ba7..c33a88f7c2 100644 --- a/sys/netinet/ip_icmp.c +++ b/sys/netinet/ip_icmp.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
* * from: @(#)ip_icmp.c 7.15 (Berkeley) 4/20/91 - * $Id: ip_icmp.c,v 1.2 1993/10/16 18:26:11 rgrimes Exp $ + * $Id: ip_icmp.c,v 1.3 1993/11/07 17:47:56 wollman Exp $ */ #include "param.h" @@ -62,24 +62,40 @@ int icmpprintfs = 0; #endif -extern struct protosw inetsw[]; +#ifndef IPBROADCASTECHO +#define IPBROADCASTECHO 0 +#endif +int ipbroadcastecho = IPBROADCASTECHO; + +#ifndef IPMASKAGENT +#define IPMASKAGENT 0 +#endif +int ipmaskagent = IPMASKAGENT; + struct icmpstat icmpstat; +#define satosin(sa) ((struct sockaddr_in *)(sa)) +static void icmp_reflect(struct mbuf *); +static void icmp_send(struct mbuf *, struct mbuf *); + + /* * Generate an error packet of type error * in response to bad packet ip. */ -/*VARARGS3*/ -icmp_error(n, type, code, dest) +void +icmp_error(n, type, code, dest, mtu) struct mbuf *n; int type, code; struct in_addr dest; + int mtu; /* mtu for ICMP_UNREACH_NEEDFRAG */ { register struct ip *oip = mtod(n, struct ip *), *nip; register unsigned oiplen = oip->ip_hl << 2; register struct icmp *icp; register struct mbuf *m; unsigned icmplen; + u_long oaddr; #ifdef ICMPPRINTFS if (icmpprintfs) @@ -87,6 +103,61 @@ icmp_error(n, type, code, dest) #endif if (type != ICMP_REDIRECT) icmpstat.icps_error++; + + /* + * Quoth RFC 1122 (Requirements for Internet Hosts): + * + * An ICMP error message MUST NOT be sent as the result of + * receiving: + * - an ICMP error message, or + * - a datagram destined to an IP broadcast or IP multicast + * address, or + * - a datagram sent as a link-layer broadcast, or + * - a non-initial fragment, or + * - a datagram whose source address does not define a single + * host -- e.g., a zero address, a loopback address, a + * broadcast address, a multicast address, or a Class E + * address. + * + * NOTE: THESE RESTRICTIONS TAKE PRECEDENCE OVER ANY REQUIREMENT + * ELSEWHERE IN THIS DOCUMENT FOR SENDING ICMP ERROR MESSAGES.
+ */ + + oaddr = ntohl(oip->ip_src.s_addr); + + /* + * Don't send error messages to multicast or broadcast addresses. + */ + if (IN_MULTICAST(oaddr) + || oaddr == INADDR_BROADCAST + || n->m_flags & (M_BCAST | M_MCAST)) { + icmpstat.icps_oldmcast++; + goto freeit; + } + + /* + * Don't send error messages to zero addresses or class E's. + */ + if (IN_EXPERIMENTAL(oaddr) + || ! in_lnaof(oip->ip_src) + || ! in_netof(oip->ip_src)) { + icmpstat.icps_oldbadaddr++; + goto freeit; + } + + /* + * Don't send error messages to loopback addresses. + * As a special (unauthorized) exception, we check to see + * if the packet came from the loopback interface. If it + * did, then we should allow the errors through, because + * the upper layers rely on them. + */ + if(in_netof(oip->ip_src) == IN_LOOPBACKNET + && !(n->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK)) { + icmpstat.icps_oldbadaddr++; + goto freeit; + } + /* * Don't send error if not the first fragment of message. * Don't error if the old packet protocol was ICMP @@ -117,8 +188,12 @@ icmp_error(n, type, code, dest) icp->icmp_type = type; if (type == ICMP_REDIRECT) icp->icmp_gwaddr = dest; - else + else if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG) { + icp->icmp_mtu = htons(mtu); + icp->icmp_mtuvoid = 0; + } else { icp->icmp_void = 0; + } if (type == ICMP_PARAMPROB) { icp->icmp_pptr = code; code = 0; @@ -154,11 +229,11 @@ static struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET }; static struct sockaddr_in icmpdst = { sizeof (struct sockaddr_in), AF_INET }; static struct sockaddr_in icmpgw = { sizeof (struct sockaddr_in), AF_INET }; struct sockaddr_in icmpmask = { 8, 0 }; -struct in_ifaddr *ifptoia(); /* * Process a received ICMP message.
*/ +void icmp_input(m, hlen) register struct mbuf *m; int hlen; @@ -169,8 +244,6 @@ icmp_input(m, hlen) register int i; struct in_ifaddr *ia; int (*ctlfunc)(), code; - extern u_char ip_protox[]; - extern struct in_addr in_makeaddr(); /* * Locate icmp structure in mbuf, and check @@ -215,21 +288,35 @@ icmp_input(m, hlen) switch (icp->icmp_type) { case ICMP_UNREACH: - if (code > 5) + if (code > ICMP_UNREACH_MAXCODE) goto badcode; - code += PRC_UNREACH_NET; + if (code == ICMP_UNREACH_NEEDFRAG) { +#ifdef MTUDISC + /* + * Need to adjust the routing tables immediately; + * when ULP's get the PRC_MSGSIZE, it is their + * responsibility to notice it and update their + * internal ideas of MTU-derived protocol parameters. + */ + in_mtureduce(icp->icmp_ip.ip_dst, + ntohs(icp->icmp_mtu)); +#endif /* MTUDISC */ + code = PRC_MSGSIZE; + } else { + code += PRC_UNREACH_NET; + } goto deliver; case ICMP_TIMXCEED: - if (code > 1) + if (code > ICMP_TIMXCEED_MAXCODE) goto badcode; code += PRC_TIMXCEED_INTRANS; goto deliver; case ICMP_PARAMPROB: - if (code) + if (code > ICMP_PARAMPROB_MAXCODE) goto badcode; - code = PRC_PARAMPROB; + code += PRC_PARAMPROB; goto deliver; case ICMP_SOURCEQUENCH: @@ -260,7 +347,29 @@ icmp_input(m, hlen) icmpstat.icps_badcode++; break; + /* + * Always respond to pings from valid addresses. + * Don't respond to broadcast pings unless ipbroadcastecho + * is set. Don't respond to multicast pings unless + * ipbroadcastecho is set AND we support multicasting + * to begin with. (Per RFC 1122, we may choose either.)
+ */ case ICMP_ECHO: + { + u_long dstaddr = ntohl(ip->ip_dst.s_addr); /* where the ping was sent */ +#ifdef MULTICAST + if(IN_MULTICAST(dstaddr) && !ipbroadcastecho) + break; +#else /* not MULTICAST */ + if(IN_MULTICAST(dstaddr)) + break; +#endif /* not MULTICAST */ + if((dstaddr == INADDR_BROADCAST + || m->m_flags & M_BCAST) + && !ipbroadcastecho) + break; + } + icp->icmp_type = ICMP_ECHOREPLY; goto reflect; @@ -274,16 +383,14 @@ icmp_input(m, hlen) icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ goto reflect; - case ICMP_IREQ: -#define satosin(sa) ((struct sockaddr_in *)(sa)) - if (in_netof(ip->ip_src) == 0 && - (ia = ifptoia(m->m_pkthdr.rcvif))) - ip->ip_src = in_makeaddr(in_netof(IA_SIN(ia)->sin_addr), - in_lnaof(ip->ip_src)); - icp->icmp_type = ICMP_IREQREPLY; - goto reflect; - + /* + * Per RFC 1122, only respond to ICMP mask requests + * if the administrator has SPECIFICALLY CONFIGURED + * this host as an address mask agent. + */ case ICMP_MASKREQ: + if (!ipmaskagent) + break; if (icmplen < ICMP_MASKLEN || (ia = ifptoia(m->m_pkthdr.rcvif)) == 0) break; @@ -321,8 +428,21 @@ reflect: printf("redirect dst %x to %x\n", icp->icmp_ip.ip_dst, icp->icmp_gwaddr); #endif + /* + * Per RFC 1122, throw away redirects that + * suggested places we can't get to, or + * an interface other than the one the packet + * arrived on. + * + * It also says that we SHOULD throw away + * redirects that come from someone other + * than the current first-hop gateway for the + * specified destination. + * + * These are both implemented as general policy + * by rtredirect().
+ */ if (code == ICMP_REDIRECT_NET || code == ICMP_REDIRECT_TOSNET) { - u_long in_netof(); icmpsrc.sin_addr = in_makeaddr(in_netof(icp->icmp_ip.ip_dst), INADDR_ANY); in_sockmaskof(icp->icmp_ip.ip_dst, &icmpmask); @@ -370,6 +490,7 @@ freeit: /* * Reflect the ip packet back to the source */ +static void icmp_reflect(m) struct mbuf *m; { @@ -486,6 +607,7 @@ ifptoia(ifp) * Send an icmp packet back to the ip level, * after supplying a checksum. */ +static void icmp_send(m, opts) register struct mbuf *m; struct mbuf *opts; diff --git a/sys/netinet/ip_icmp.h b/sys/netinet/ip_icmp.h index 5493577262..9d954db8e8 100644 --- a/sys/netinet/ip_icmp.h +++ b/sys/netinet/ip_icmp.h @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * from: @(#)ip_icmp.h 7.5 (Berkeley) 6/28/90 - * $Id: ip_icmp.h,v 1.2 1993/10/16 18:26:12 rgrimes Exp $ + * $Id: ip_icmp.h,v 1.3 1993/11/07 17:47:57 wollman Exp $ */ #ifndef _NETINET_IP_ICMP_H_ @@ -56,6 +56,10 @@ struct icmp { n_short icd_id; n_short icd_seq; } ih_idseq; + struct ih_mtu { + n_short imt_unused; + n_short imt_nhmtu; + } ih_mtu; int ih_void; } icmp_hun; #define icmp_pptr icmp_hun.ih_pptr @@ -63,6 +67,8 @@ struct icmp { #define icmp_id icmp_hun.ih_idseq.icd_id #define icmp_seq icmp_hun.ih_idseq.icd_seq #define icmp_void icmp_hun.ih_void +#define icmp_mtu icmp_hun.ih_mtu.imt_nhmtu +#define icmp_mtuvoid icmp_hun.ih_mtu.imt_unused union { struct id_ts { n_time its_otime; @@ -110,17 +116,38 @@ struct icmp { #define ICMP_UNREACH_PORT 3 /* bad port */ #define ICMP_UNREACH_NEEDFRAG 4 /* IP_DF caused drop */ #define ICMP_UNREACH_SRCFAIL 5 /* src route failed */ +#define ICMP_UNREACH_NETUNKNOWN 6 /* dest net unknown */ +#define ICMP_UNREACH_HSTUNKNOWN 7 /* dst host unknown */ +#define ICMP_UNREACH_ISOLATED 8 /* src host isolated*/ +/* next two are for administratively prohibited */ +#define ICMP_UNREACH_NETADMIN 9 /* dest net */ +#define ICMP_UNREACH_HOSTADMIN 10 /* dest host */ +/* next two are for TOS unreachables */ +#define ICMP_UNREACH_TOSNET 11 /* dest net+tos 
*/ +#define ICMP_UNREACH_TOSHOST 12 /* dest host+tos */ +#define ICMP_UNREACH_MAXCODE 12 + #define ICMP_SOURCEQUENCH 4 /* packet lost, slow down */ + #define ICMP_REDIRECT 5 /* shorter route, codes: */ #define ICMP_REDIRECT_NET 0 /* for network */ #define ICMP_REDIRECT_HOST 1 /* for host */ #define ICMP_REDIRECT_TOSNET 2 /* for tos and net */ #define ICMP_REDIRECT_TOSHOST 3 /* for tos and host */ +#define ICMP_REDIRECT_MAXCODE 3 + #define ICMP_ECHO 8 /* echo service */ + #define ICMP_TIMXCEED 11 /* time exceeded, code: */ #define ICMP_TIMXCEED_INTRANS 0 /* ttl==0 in transit */ #define ICMP_TIMXCEED_REASS 1 /* ttl==0 in reass */ +#define ICMP_TIMXCEED_MAXCODE 1 + #define ICMP_PARAMPROB 12 /* ip header bad */ +#define ICMP_PARAMPROB_GENERAL 0 /* generic problems */ +#define ICMP_PARAMPROB_REQDOPT 1 /* option missing */ +#define ICMP_PARAMPROB_MAXCODE 1 + #define ICMP_TSTAMP 13 /* timestamp request */ #define ICMP_TSTAMPREPLY 14 /* timestamp reply */ #define ICMP_IREQ 15 /* information request */ @@ -135,4 +162,12 @@ struct icmp { (type) == ICMP_TSTAMP || (type) == ICMP_TSTAMPREPLY || \ (type) == ICMP_IREQ || (type) == ICMP_IREQREPLY || \ (type) == ICMP_MASKREQ || (type) == ICMP_MASKREPLY) + +#ifdef KERNEL +struct mbuf; +extern void icmp_error(struct mbuf *, int, int, struct in_addr, int); +extern void icmp_input(struct mbuf *, int); +n_time iptime(void); +#endif + #endif /* _NETINET_IP_ICMP_H_ */ diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index 26eda07227..6cb713cdad 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -31,7 +31,7 @@ * SUCH DAMAGE.
* * from: @(#)ip_input.c 7.19 (Berkeley) 5/25/91 - * $Id: ip_input.c,v 1.3 1993/11/07 17:47:58 wollman Exp $ + * $Id: ip_input.c,v 1.4 1993/11/12 04:03:55 wollman Exp $ */ #include "param.h" @@ -762,7 +762,10 @@ ip_dooptions(m) } else return (0); bad: - icmp_error(m, type, code); + { + static struct in_addr fake; + icmp_error(m, type, code, fake, 0); + } return (1); } @@ -912,12 +915,39 @@ ip_stripoptions(m, mopt) } u_char inetctlerrmap[PRC_NCMDS] = { - 0, 0, 0, 0, - 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, - EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, - EMSGSIZE, EHOSTUNREACH, 0, 0, - 0, 0, 0, 0, - ENOPROTOOPT + 0, /* ifdown */ + 0, /* routedead */ + 0, /* #2 */ + 0, /* quench2 */ + 0, /* quench */ + EMSGSIZE, /* msgsize */ + EHOSTDOWN, /* hostdead */ + EHOSTUNREACH, /* hostunreach */ + EHOSTUNREACH, /* unreachnet */ + EHOSTUNREACH, /* unreachhost */ + ECONNREFUSED, /* unreachproto */ + ECONNREFUSED, /* unreachport */ + EMSGSIZE, /* old needfrag */ + EHOSTUNREACH, /* srcfail */ + EHOSTUNREACH, /* netunknown */ + EHOSTUNREACH, /* hostunknown */ + EHOSTUNREACH, /* isolated */ + ECONNREFUSED, /* net admin. prohibited */ + ECONNREFUSED, /* host admin. prohibited */ + EHOSTUNREACH, /* tos net unreachable */ + EHOSTUNREACH, /* tos host unreachable */ + 0, /* redirect net */ + 0, /* redirect host */ + 0, /* redirect tosnet */ + 0, /* redirect toshost */ + 0, /* time exceeded */ + 0, /* reassembly timeout */ + ENOPROTOOPT, /* parameter problem */ + ENOPROTOOPT, /* required option missing */ + 0, /* MTU changed */ + /* NB: this means that this error will only + get propagated by in_mtunotify(), which + doesn't bother to check. 
*/ }; /* @@ -944,6 +974,7 @@ ip_forward(m, srcrt) int error, type = 0, code; struct mbuf *mcopy; struct in_addr dest; + int mtu; dest.s_addr = 0; #ifdef DIAGNOSTIC @@ -958,7 +989,7 @@ ip_forward(m, srcrt) } HTONS(ip->ip_id); if (ip->ip_ttl <= IPTTLDEC) { - icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest); + icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0); return; } ip->ip_ttl -= IPTTLDEC; @@ -976,10 +1007,12 @@ ip_forward(m, srcrt) rtalloc(&ipforward_rt); if (ipforward_rt.ro_rt == 0) { - icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest); + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); return; } rt = ipforward_rt.ro_rt; } + mtu = rt->rt_ifp->if_mtu; + /* salt away if's mtu, for cached routes too */ /* @@ -1079,5 +1112,5 @@ ip_forward(m, srcrt) code = 0; break; } - icmp_error(mcopy, type, code, dest); + icmp_error(mcopy, type, code, dest, mtu); } diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 76466f8f6a..2f1809afa9 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -31,7 +31,7 @@ * SUCH DAMAGE.
+ */ + inp->inp_mtunotify = tcp_mtuchanged; + inp->inp_flags |= INP_DISCOVERMTU; +#endif /* MTUDISC */ return (tp); } @@ -345,7 +356,7 @@ tcp_close(tp) rt->rt_rmx.rmx_ssthresh = i; } } -#endif RTV_RTT +#endif /* RTV_RTT */ /* free the reassembly queue, if any */ t = tp->seg_next; while (t != (struct tcpiphdr *)tp) { @@ -388,18 +399,54 @@ tcp_notify(inp, error) sowwakeup(inp->inp_socket); } +/* + * When we get a PRC_MSGSIZE error (generated by the ICMP layer upon + * receipt of an ICMP_UNREACH_NEEDFRAG message), we need to get the + * IP layer to check the cached MTU data that it has in its PCBs. + * If things have changed, this will cause us to receive a + * PRC_MTUCHANGED message for /every/ connection to the same + * destination; that is handled by the tcp_mtuchanged() function, + * below. + * + * In the immortal words of Ken and Dennis, ``You are not expected to + * understand this.'' + */ +int /* grrr... should be void... */ +tcp_checkmtu(struct inpcb *inp, int error) { +#ifdef MTUDISC + /* + * XXX - this should also cause an immediate retransmission and + * slow start, since we know for a fact that the message we just sent + * got dropped on the floor. For now, just do what tcp_quench does. + */ + tcp_quench(inp); + in_pcbmtu(inp); +#endif /* MTUDISC */ +} + +#ifdef MTUDISC +int /* grrr... should be void... */ +tcp_mtuchanged(struct inpcb *inp, int error) { + /* don't do anything just yet...
*/; +} +#endif /* MTUDISC */ + tcp_ctlinput(cmd, sa, ip) int cmd; struct sockaddr *sa; register struct ip *ip; { register struct tcphdr *th; - extern struct in_addr zeroin_addr; - extern u_char inetctlerrmap[]; int (*notify)() = tcp_notify, tcp_quench(); if (cmd == PRC_QUENCH) notify = tcp_quench; +#ifdef MTUDISC + else if (cmd == PRC_MSGSIZE) + notify = tcp_checkmtu; + else if (cmd == PRC_MTUCHANGED) /* just in case */ + notify = tcp_mtuchanged; +#endif /* MTUDISC */ else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip) { diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index ca9c769aad..7b981f2f0f 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * from: @(#)udp_usrreq.c 7.20 (Berkeley) 4/20/91 - * $Id: udp_usrreq.c,v 1.2 1993/10/16 18:26:43 rgrimes Exp $ + * $Id: udp_usrreq.c,v 1.3 1993/11/07 17:48:14 wollman Exp $ */ #include "param.h" @@ -172,7 +172,10 @@ udp_input(m, iphlen) } *ip = save_ip; ip->ip_len += iphlen; - icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT); + { + static struct in_addr fake; + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, fake, 0); + } return; } -- 2.20.1