1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #include <sys/types.h> 28 #include <sys/stream.h> 29 #include <sys/strsubr.h> 30 #include <sys/dlpi.h> 31 #include <sys/strsun.h> 32 #include <sys/zone.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/cmn_err.h> 36 #include <sys/debug.h> 37 #include <sys/atomic.h> 38 39 #include <sys/systm.h> 40 #include <sys/param.h> 41 #include <sys/kmem.h> 42 #include <sys/sdt.h> 43 #include <sys/socket.h> 44 #include <sys/mac.h> 45 #include <net/if.h> 46 #include <net/if_arp.h> 47 #include <net/route.h> 48 #include <sys/sockio.h> 49 #include <netinet/in.h> 50 #include <net/if_dl.h> 51 52 #include <inet/common.h> 53 #include <inet/mi.h> 54 #include <inet/mib2.h> 55 #include <inet/nd.h> 56 #include <inet/arp.h> 57 #include <inet/snmpcom.h> 58 #include <inet/kstatcom.h> 59 60 #include <netinet/igmp_var.h> 61 #include <netinet/ip6.h> 62 #include <netinet/icmp6.h> 63 #include <netinet/sctp.h> 64 65 #include <inet/ip.h> 66 #include <inet/ip_impl.h> 67 #include <inet/ip6.h> 68 #include 
<inet/ip6_asp.h> 69 #include <inet/tcp.h> 70 #include <inet/ip_multi.h> 71 #include <inet/ip_if.h> 72 #include <inet/ip_ire.h> 73 #include <inet/ip_ftable.h> 74 #include <inet/ip_rts.h> 75 #include <inet/optcom.h> 76 #include <inet/ip_ndp.h> 77 #include <inet/ip_listutils.h> 78 #include <netinet/igmp.h> 79 #include <netinet/ip_mroute.h> 80 #include <inet/ipp_common.h> 81 82 #include <net/pfkeyv2.h> 83 #include <inet/sadb.h> 84 #include <inet/ipsec_impl.h> 85 #include <inet/ipdrop.h> 86 #include <inet/ip_netinfo.h> 87 88 #include <sys/pattr.h> 89 #include <inet/ipclassifier.h> 90 #include <inet/sctp_ip.h> 91 #include <inet/sctp/sctp_impl.h> 92 #include <inet/udp_impl.h> 93 #include <sys/sunddi.h> 94 95 #include <sys/tsol/label.h> 96 #include <sys/tsol/tnet.h> 97 98 #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ 99 100 #ifdef DEBUG 101 extern boolean_t skip_sctp_cksum; 102 #endif 103 104 static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *); 105 static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *); 106 static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *); 107 static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *); 108 static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *); 109 110 /* 111 * There are two types of output functions for IP used for different 112 * purposes: 113 * - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there 114 * is no context in the form of a conn_t. However, there is a 115 * ip_xmit_attr_t that the callers use to influence interface selection 116 * (needed for ICMP echo as well as IPv6 link-locals) and IPsec. 117 * 118 * - conn_ip_output() is used when sending packets with a conn_t and 119 * ip_set_destination has been called to cache information. In that case 120 * various socket options are recorded in the ip_xmit_attr_t and should 121 * be taken into account. 122 */ 123 124 /* 125 * The caller *must* have called conn_connect() or ip_attr_connect() 126 * before calling conn_ip_output(). 
 * The caller needs to redo that each time
 * the destination IP address or port changes, as well as each time there is
 * a change to any socket option that would modify how packets are routed out
 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF).
 *
 * The ULP caller has to serialize the use of a single ip_xmit_attr_t.
 * We assert for that here.
 */
int
conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	iaflags_t	ixaflags = ixa->ixa_flags;
	ire_t		*ire;
	nce_t		*nce;
	dce_t		*dce;
	ill_t		*ill;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	int		error;

	/* We defer ipIfStatsHCOutRequests until an error or we have an ill */

	ASSERT(ixa->ixa_ire != NULL);
	/* Note there is no ixa_nce when reject and blackhole routes */
	ASSERT(ixa->ixa_dce != NULL);	/* Could be default dce */

#ifdef DEBUG
	/* Catch ULPs that fail to serialize use of this ip_xmit_attr_t */
	ASSERT(ixa->ixa_curthread == NULL);
	ixa->ixa_curthread = curthread;
#endif

	/*
	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
	 * for IGMP/MLD traffic.
	 */

	ire = ixa->ixa_ire;

	/*
	 * If the ULP says the (old) IRE resulted in reachability we
	 * record this before determining whether to use a new IRE.
	 * No locking for performance reasons.
	 */
	if (ixaflags & IXAF_REACH_CONF)
		ire->ire_badcnt = 0;

	/*
	 * Has routing changed since we cached the results of the lookup?
	 *
	 * This check captures all of:
	 * - the cached ire being deleted (by means of the special
	 *   IRE_GENERATION_CONDEMNED)
	 * - A potentially better ire being added (ire_generation being
	 *   increased)
	 * - A deletion of the nexthop ire that was used when we did the
	 *   lookup.
	 * - An addition of a potentially better nexthop ire.
	 * The last two are handled by walking and increasing the generation
	 * number on all dependent IREs in ire_flush_cache().
	 *
	 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE
	 * since we ensure that each time we set ixa_ire to such an IRE we
	 * make sure the ixa_ire_generation does not match (by using
	 * IRE_GENERATION_VERIFY).
	 */
	if (ire->ire_generation != ixa->ixa_ire_generation) {
		error = ip_verify_ire(mp, ixa);
		if (error != 0) {
			ip_drop_output("ipIfStatsOutDiscards - verify ire",
			    mp, NULL);
			goto drop;
		}
		ire = ixa->ixa_ire;
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
#ifdef DEBUG
			ASSERT(ixa->ixa_curthread == curthread);
			ixa->ixa_curthread = NULL;
#endif
			ire->ire_ob_pkt_count++;
			/* ixa_dce might be condemned; use default one */
			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa,
			    &ipst->ips_dce_default->dce_ident));
		}
		/*
		 * If the ncec changed then ip_verify_ire already set
		 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		 * so we can recheck the interface mtu.
		 */

		/*
		 * Note that ire->ire_generation could already have changed.
		 * We catch that next time we send a packet.
		 */
	}

	/*
	 * No need to lock access to ixa_nce since the ip_xmit_attr usage
	 * is single threaded.
	 */
	ASSERT(ixa->ixa_nce != NULL);
	nce = ixa->ixa_nce;
	if (nce->nce_is_condemned) {
		error = ip_verify_nce(mp, ixa);
		/*
		 * In case the ZEROCOPY capability has become unavailable, we
		 * copy the message and free the original one. We might
		 * be copying more data than needed but it doesn't hurt
		 * since such a change rarely happens.
		 */
		switch (error) {
		case 0:
			break;
		case ENOTSUP: { /* ZEROCOPY */
			mblk_t *nmp;

			if ((nmp = copymsg(mp)) != NULL) {
				freemsg(mp);
				mp = nmp;

				break;
			}
			/* copymsg failed; fall through to the drop path */
			/* FALLTHROUGH */
		}
		default:
			ip_drop_output("ipIfStatsOutDiscards - verify nce",
			    mp, NULL);
			goto drop;
		}
		ire = ixa->ixa_ire;
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
#ifdef DEBUG
			ASSERT(ixa->ixa_curthread == curthread);
			ixa->ixa_curthread = NULL;
#endif
			ire->ire_ob_pkt_count++;
			/* ixa_dce might be condemned; use default one */
			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr,
			    ixa, &ipst->ips_dce_default->dce_ident));
		}
		ASSERT(ixa->ixa_nce != NULL);
		nce = ixa->ixa_nce;

		/*
		 * Note that some other event could already have made
		 * the new nce condemned. We catch that next time we
		 * try to send a packet.
		 */
	}
	/*
	 * If there is no per-destination dce_t then we have a reference to
	 * the default dce_t (which merely contains the dce_ipid).
	 * The generation check captures both the introduction of a
	 * per-destination dce_t (e.g., due to ICMP packet too big) and
	 * any change to the per-destination dce (including it becoming
	 * condemned by use of the special DCE_GENERATION_CONDEMNED).
	 */
	dce = ixa->ixa_dce;

	/*
	 * To avoid a periodic timer to increase the path MTU we
	 * look at dce_last_change_time each time we send a packet.
	 */
	if (dce->dce_flags & DCEF_PMTU) {
		int64_t		now = LBOLT_FASTPATH64;

		if ((TICK_TO_SEC(now) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval)) {
			/*
			 * Older than 20 minutes. Drop the path MTU information.
			 * Since the path MTU changes as a result of this,
			 * twiddle ixa_dce_generation to make us go through the
			 * dce verification code in conn_ip_output.
			 */
			mutex_enter(&dce->dce_lock);
			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
			dce->dce_last_change_time = TICK_TO_SEC(now);
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		}
	}

	if (dce->dce_generation != ixa->ixa_dce_generation) {
		error = ip_verify_dce(mp, ixa);
		if (error != 0) {
			ip_drop_output("ipIfStatsOutDiscards - verify dce",
			    mp, NULL);
			goto drop;
		}
		dce = ixa->ixa_dce;

		/*
		 * Note that some other event could already have made the
		 * new dce's generation number change.
		 * We catch that next time we try to send a packet.
		 */
	}

	ill = nce->nce_ill;

	/*
	 * An initial ixa_fragsize was set in ip_set_destination
	 * and we update it if any routing changes above.
	 * A change to ill_mtu with ifconfig will increase all dce_generation
	 * so that we will detect that with the generation check. Ditto for
	 * ill_mc_mtu.
	 */

	/*
	 * Caller needs to make sure IXAF_VERIFY_SRC is not set if
	 * conn_unspec_src.
	 */
	if ((ixaflags & IXAF_VERIFY_SOURCE) &&
	    ixa->ixa_src_generation != ipst->ips_src_generation) {
		/* Check if the IP source is still assigned to the host. */
		uint_t	gen;

		if (!ip_verify_src(mp, ixa, &gen)) {
			/* Don't send a packet with a source that isn't ours */
			error = EADDRNOTAVAIL;
			ip_drop_output("ipIfStatsOutDiscards - invalid src",
			    mp, NULL);
			goto drop;
		}
		/* The source is still valid - update the generation number */
		ixa->ixa_src_generation = gen;
	}

	/*
	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
	 * can only count the use prior to fragmentation. However the MIB
	 * counters on the ill will be incremented in post fragmentation.
	 */
	ire->ire_ob_pkt_count++;
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);

	/*
	 * Based on ire_type and ire_flags call one of:
	 *	ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK
	 *	ire_send_multirt_v* - if RTF_MULTIRT
	 *	ire_send_noroute_v* - if RTF_REJECT or RTF_BLACKHOLE
	 *	ire_send_multicast_v* - for IRE_MULTICAST
	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
	 *	ire_send_wire_v* - for the rest.
	 */
#ifdef DEBUG
	ASSERT(ixa->ixa_curthread == curthread);
	ixa->ixa_curthread = NULL;
#endif
	return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident));

drop:
	/* Common exit for all error paths: account the discard and free mp */
	if (ixaflags & IXAF_IS_IPV4) {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
	} else {
		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
	}
	freemsg(mp);
#ifdef DEBUG
	ASSERT(ixa->ixa_curthread == curthread);
	ixa->ixa_curthread = NULL;
#endif
	return (error);
}

/*
 * Handle both IPv4 and IPv6. Sets the generation number
 * to allow the caller to know when to call us again.
 * Returns true if the source address in the packet is a valid source.
 * We handle callers which try to send with a zero address (since we only
 * get here if UNSPEC_SRC is not set).
 */
boolean_t
ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;

	/*
	 * Need to grab the generation number before we check to
	 * avoid a race with a change to the set of local addresses.
	 * No lock needed since the thread which updates the set of local
	 * addresses use ipif/ill locks and exit those (hence a store memory
	 * barrier) before doing the atomic increase of ips_src_generation.
	 */
	if (generationp != NULL)
		*generationp = ipst->ips_src_generation;

	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		ipha_t	*ipha = (ipha_t *)mp->b_rptr;

		if (ipha->ipha_src == INADDR_ANY)
			return (B_FALSE);

		return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
		    ipst, B_FALSE) != IPVL_BAD);
	} else {
		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
		uint_t	scopeid;

		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src))
			return (B_FALSE);

		if (ixa->ixa_flags & IXAF_SCOPEID_SET)
			scopeid = ixa->ixa_scopeid;
		else
			scopeid = 0;

		return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid,
		    ipst, B_FALSE, scopeid) != IPVL_BAD);
	}
}

/*
 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use.
 */
int
ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	uint_t		gen;
	ire_t		*ire;
	nce_t		*nce;
	int		error;
	boolean_t	multirt = B_FALSE;

	/*
	 * Redo ip_select_route.
	 * Need to grab generation number as part of the lookup to
	 * avoid race.
	 */
	error = 0;
	ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt);
	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
	if (error != 0) {
		ire_refrele(ire);
		return (error);
	}

	if (ixa->ixa_ire != NULL)
		ire_refrele_notr(ixa->ixa_ire);
#ifdef DEBUG
	/* Convert the tracked hold into an untracked one for ixa_ire */
	ire_refhold_notr(ire);
	ire_refrele(ire);
#endif
	ixa->ixa_ire = ire;
	ixa->ixa_ire_generation = gen;
	if (multirt) {
		if (ixa->ixa_flags & IXAF_IS_IPV4)
			ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
		else
			ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
	} else {
		ixa->ixa_postfragfn = ire->ire_postfragfn;
		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
	}

	/*
	 * Don't look for an nce for reject or blackhole.
	 * They have ire_generation set to IRE_GENERATION_VERIFY which
	 * makes conn_ip_output avoid references to ixa_nce.
	 */
	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY);
		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		return (0);
	}

	/* The NCE could now be different */
	nce = ire_to_nce_pkt(ire, mp);
	if (nce == NULL) {
		/*
		 * Allocation failure. Make sure we redo ire/nce selection
		 * next time we send.
		 */
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		return (ENOBUFS);
	}
	if (nce == ixa->ixa_nce) {
		/* No change */
		nce_refrele(nce);
		return (0);
	}

	/*
	 * Since the path MTU might change as a result of this
	 * route change, we twiddle ixa_dce_generation to
	 * make conn_ip_output go through the ip_verify_dce code.
	 */
	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;

	if (ixa->ixa_nce != NULL)
		nce_refrele(ixa->ixa_nce);
	ixa->ixa_nce = nce;
	return (0);
}

/*
 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use.
 */
static int
ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	ire_t		*ire = ixa->ixa_ire;
	nce_t		*nce;
	int		error = 0;
	ipha_t		*ipha = NULL;
	ip6_t		*ip6h = NULL;

	if (ire->ire_ipversion == IPV4_VERSION)
		ipha = (ipha_t *)mp->b_rptr;
	else
		ip6h = (ip6_t *)mp->b_rptr;

	nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE);
	if (nce == NULL) {
		/* Try to find a better ire */
		return (ip_verify_ire(mp, ixa));
	}

	/*
	 * The hardware offloading capabilities, for example LSO, of the
	 * interface might have changed, so do sanity verification here.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_LSO) {
		if (!ip_verify_lso(nce->nce_ill, ixa)) {
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_LSO, 0);
			error = ENOTSUP;
		}
	}

	/*
	 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with
	 * any ZEROCOPY changes. In case ZEROCOPY capability is not available
	 * any more, return error so that conn_ip_output() can take care of
	 * the ZEROCOPY message properly. It's safe to continue sending the
	 * message when ZEROCOPY newly becomes available.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) {
		if (!ip_verify_zcopy(nce->nce_ill, ixa)) {
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_ZCOPY, 0);
			if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0)
				error = ENOTSUP;
		}
	}

	/*
	 * Since the path MTU might change as a result of this
	 * change, we twiddle ixa_dce_generation to
	 * make conn_ip_output go through the ip_verify_dce code.
	 */
	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;

	nce_refrele(ixa->ixa_nce);
	ixa->ixa_nce = nce;
	return (error);
}

/*
 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use.
 */
static int
ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	dce_t		*dce;
	uint_t		gen;
	uint_t		pmtu;

	dce = dce_lookup_pkt(mp, ixa, &gen);
	ASSERT(dce != NULL);

	dce_refrele_notr(ixa->ixa_dce);
#ifdef DEBUG
	/* Convert the tracked hold into an untracked one for ixa_dce */
	dce_refhold_notr(dce);
	dce_refrele(dce);
#endif
	ixa->ixa_dce = dce;
	ixa->ixa_dce_generation = gen;

	/* Extract the (path) mtu from the dce, ncec_ill etc */
	pmtu = ip_get_pmtu(ixa);

	/*
	 * Tell ULP about PMTU changes - increase or decrease - by returning
	 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update
	 * both ixa_pmtu and ixa_fragsize appropriately.
	 *
	 * If ULP doesn't set that flag then we need to update ixa_fragsize
	 * since routing could have changed the ill after ixa_fragsize
	 * was set previously in the conn_ip_output path or in
	 * ip_set_destination.
	 *
	 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu.
	 *
	 * In the case of a path MTU increase we send the packet after the
	 * notify to the ULP.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_PMTU) {
		if (ixa->ixa_pmtu != pmtu) {
			uint_t oldmtu = ixa->ixa_pmtu;

			DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu,
			    uint32_t, ixa->ixa_pmtu);
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_PMTU, pmtu);
			if (pmtu < oldmtu)
				return (EMSGSIZE);
		}
	} else {
		ixa->ixa_fragsize = pmtu;
	}
	return (0);
}

/*
 * Verify LSO usability. Keep the return value simple to indicate whether
 * the LSO capability has changed. Handle both IPv4 and IPv6.
 */
static boolean_t
ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa)
{
	ill_lso_capab_t	*lsoc = &ixa->ixa_lso_capab;
	ill_lso_capab_t	*new_lsoc = ill->ill_lso_capab;

	if (ixa->ixa_flags & IXAF_LSO_CAPAB) {
		/*
		 * Not usable any more.
		 */
		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
		    !ILL_LSO_TCP_IPV4_USABLE(ill) :
		    !ILL_LSO_TCP_IPV6_USABLE(ill))) {
			ixa->ixa_flags &= ~IXAF_LSO_CAPAB;

			return (B_FALSE);
		}

		/*
		 * Capability has changed, refresh the copy in ixa.
		 */
		if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) {
			*lsoc = *new_lsoc;

			return (B_FALSE);
		}
	} else { /* Was not usable */
		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
		    ILL_LSO_TCP_IPV4_USABLE(ill) :
		    ILL_LSO_TCP_IPV6_USABLE(ill))) {
			*lsoc = *new_lsoc;
			ixa->ixa_flags |= IXAF_LSO_CAPAB;

			return (B_FALSE);
		}
	}

	return (B_TRUE);
}

/*
 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether
 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6.
 */
static boolean_t
ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa)
{
	if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) {
		/*
		 * Not usable any more.
		 */
		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
		    !ILL_ZCOPY_USABLE(ill)) {
			ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB;

			return (B_FALSE);
		}
	} else { /* Was not usable */
		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
		    ILL_ZCOPY_USABLE(ill)) {
			ixa->ixa_flags |= IXAF_ZCOPY_CAPAB;

			return (B_FALSE);
		}
	}

	return (B_TRUE);
}


/*
 * When there is no conn_t context, this will send a packet.
 * The caller must *not* have called conn_connect() or ip_attr_connect()
 * before calling ip_output_simple().
 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH.
 * Honors IXAF_SET_SOURCE.
 *
 * We acquire the ire and after calling ire_sendfn we release
 * the hold on the ire. Ditto for the nce and dce.
 *
 * This assumes that the caller has set the following in ip_xmit_attr_t:
 *	ixa_tsl, ixa_zoneid, and ixa_ipst must always be set.
 *	If ixa_ifindex is non-zero it means send out that ill. (If it is
 *	an upper IPMP ill we load balance across the group; if a lower we send
 *	on that lower ill without load balancing.)
 *	IXAF_IS_IPV4 must be set correctly.
 *	If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set.
 *	If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup.
 *	If neither of those two are set we do an IPsec policy lookup.
 *
 * We handle setting things like
 *	ixa_pktlen
 *	ixa_ip_hdr_length
 *	ixa->ixa_protocol
 *
 * The caller may set ixa_xmit_hint, which is used for ECMP selection and
 * transmit ring selecting in GLD.
 *
 * The caller must do an ixa_cleanup() to release any IPsec references
 * after we return.
 */
int
ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	ts_label_t	*effective_tsl = NULL;
	int		err;

	ASSERT(ixa->ixa_ipst != NULL);

	if (is_system_labeled()) {
		ip_stack_t *ipst = ixa->ixa_ipst;

		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid,
			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
			    &effective_tsl);
		} else {
			err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid,
			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
			    &effective_tsl);
		}
		if (err != 0) {
			ip2dbg(("tsol_check: label check failed (%d)\n", err));
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("tsol_check_label", mp, NULL);
			freemsg(mp);
			return (err);
		}
		if (effective_tsl != NULL) {
			/* Update the label */
			ip_xmit_attr_replace_tsl(ixa, effective_tsl);
		}
	}

	if (ixa->ixa_flags & IXAF_IS_IPV4)
		return (ip_output_simple_v4(mp, ixa));
	else
		return (ip_output_simple_v6(mp, ixa));
}

/*
 * IPv4 leg of ip_output_simple(); see the contract documented there.
 */
int
ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	ipha_t		*ipha;
	ipaddr_t	firsthop; /* In IP header */
	ipaddr_t	dst;	/* End of source route, or ipha_dst if none */
	ire_t		*ire;
	ipaddr_t	setsrc;	/* RTF_SETSRC */
	int		error;
	ill_t		*ill = NULL;
	dce_t		*dce = NULL;
	nce_t		*nce;
	iaflags_t	ixaflags = ixa->ixa_flags;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	boolean_t	repeat = B_FALSE;
	boolean_t	multirt = B_FALSE;
	int64_t		now;

	ipha = (ipha_t *)mp->b_rptr;
	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);

	/*
	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
	 * for IGMP/MLD traffic.
	 */

	/* Caller already set flags */
	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);

	ASSERT(ixa->ixa_nce == NULL);

	ixa->ixa_pktlen = ntohs(ipha->ipha_length);
	ASSERT(ixa->ixa_pktlen == msgdsize(mp));
	ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
	ixa->ixa_protocol = ipha->ipha_protocol;

	/*
	 * Assumes that source routed packets have already been massaged by
	 * the ULP (ip_massage_options) and as a result ipha_dst is the next
	 * hop in the source route. The final destination is used for IPsec
	 * policy and DCE lookup.
	 */
	firsthop = ipha->ipha_dst;
	dst = ip_get_dst(ipha);

repeat_ire:
	error = 0;
	setsrc = INADDR_ANY;
	ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL,
	    &setsrc, &error, &multirt);
	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
	if (error != 0) {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL);
		freemsg(mp);
		goto done;
	}

	if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
		/* ire_ill might be NULL hence need to skip some code */
		if (ixaflags & IXAF_SET_SOURCE)
			ipha->ipha_src = htonl(INADDR_LOOPBACK);
		ixa->ixa_fragsize = IP_MAXPACKET;
		ill = NULL;
		nce = NULL;
		ire->ire_ob_pkt_count++;
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
		/* No dce yet; use default one */
		error = (ire->ire_sendfn)(ire, mp, ipha, ixa,
		    &ipst->ips_dce_default->dce_ident);
		goto done;
	}

	/* Note that ipha_dst is only used for IRE_MULTICAST */
	nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
	if (nce == NULL) {
		/* Allocation failure? */
		ip_drop_output("ire_to_nce", mp, ill);
		freemsg(mp);
		error = ENOBUFS;
		goto done;
	}
	if (nce->nce_is_condemned) {
		nce_t *nce1;

		nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE);
		nce_refrele(nce);
		if (nce1 == NULL) {
			if (!repeat) {
				/* Try finding a better IRE */
				repeat = B_TRUE;
				ire_refrele(ire);
				goto repeat_ire;
			}
			/* Tried twice - drop packet */
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("No nce", mp, ill);
			freemsg(mp);
			error = ENOBUFS;
			goto done;
		}
		nce = nce1;
	}

	/*
	 * For multicast with multirt we have a flag passed back from
	 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
	 * possible multicast address.
	 * We also need a flag for multicast since we can't check
	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
	 */
	if (multirt) {
		ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
	} else {
		ixa->ixa_postfragfn = ire->ire_postfragfn;
		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
	}
	ASSERT(ixa->ixa_nce == NULL);
	ixa->ixa_nce = nce;

	/*
	 * Check for a dce_t with a path mtu.
	 */
	dce = dce_lookup_v4(dst, ipst, NULL);
	ASSERT(dce != NULL);

	if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
	} else if (dce->dce_flags & DCEF_PMTU) {
		/*
		 * To avoid a periodic timer to increase the path MTU we
		 * look at dce_last_change_time each time we send a packet.
		 */
		now = ddi_get_lbolt64();
		if (TICK_TO_SEC(now) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			/*
			 * Older than 20 minutes. Drop the path MTU information.
			 */
			mutex_enter(&dce->dce_lock);
			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
			dce->dce_last_change_time = TICK_TO_SEC(now);
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
			ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
		} else {
			uint_t fragsize;

			fragsize = ip_get_base_mtu(nce->nce_ill, ire);
			if (fragsize > dce->dce_pmtu)
				fragsize = dce->dce_pmtu;
			ixa->ixa_fragsize = fragsize;
		}
	} else {
		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
	}

	/*
	 * We use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
	 * interface for source address selection.
	 */
	ill = ire_nexthop_ill(ire);

	if (ixaflags & IXAF_SET_SOURCE) {
		ipaddr_t	src;

		/*
		 * We use the final destination to get
		 * correct selection for source routed packets
		 */

		/* If unreachable we have no ill but need some source */
		if (ill == NULL) {
			src = htonl(INADDR_LOOPBACK);
			error = 0;
		} else {
			error = ip_select_source_v4(ill, setsrc, dst,
			    ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst,
			    &src, NULL, NULL);
		}
		if (error != 0) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - no source",
			    mp, ill);
			freemsg(mp);
			goto done;
		}
		ipha->ipha_src = src;
	} else if (ixaflags & IXAF_VERIFY_SOURCE) {
		/* Check if the IP source is assigned to the host. */
		if (!ip_verify_src(mp, ixa, NULL)) {
			/* Don't send a packet with a source that isn't ours */
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - invalid source",
			    mp, ill);
			freemsg(mp);
			error = EADDRNOTAVAIL;
			goto done;
		}
	}


	/*
	 * Check against global IPsec policy to set the AH/ESP attributes.
	 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
	 */
	if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
		ASSERT(ixa->ixa_ipsec_policy == NULL);
		mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa);
		if (mp == NULL) {
			/* MIB and ip_drop_packet already done */
			return (EHOSTUNREACH);	/* IPsec policy failure */
		}
	}

	if (ill != NULL) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
	} else {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
	}

	/*
	 * We update the statistics on the most specific IRE i.e., the first
	 * one we found.
	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
	 * can only count the use prior to fragmentation. However the MIB
	 * counters on the ill will be incremented in post fragmentation.
	 */
	ire->ire_ob_pkt_count++;

	/*
	 * Based on ire_type and ire_flags call one of:
	 *	ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK
	 *	ire_send_multirt_v4 - if RTF_MULTIRT
	 *	ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
	 *	ire_send_multicast_v4 - for IRE_MULTICAST
	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
	 *	ire_send_wire_v4 - for the rest.
	 */
	error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident);
done:
	/* Common exit: release whichever references were acquired above */
	ire_refrele(ire);
	if (dce != NULL)
		dce_refrele(dce);
	if (ill != NULL)
		ill_refrele(ill);
	if (ixa->ixa_nce != NULL)
		nce_refrele(ixa->ixa_nce);
	ixa->ixa_nce = NULL;
	return (error);
}

/*
 * ire_sendfn() functions.
 * These functions use the following xmit_attr:
 *  - ixa_fragsize - read to determine whether or not to fragment
 *  - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
 *  - ixa_ipsec_*  are used inside IPsec
 *  - IXAF_SET_SOURCE - replace IP source in broadcast case.
 *  - IXAF_LOOPBACK_COPY - for multicast and broadcast
 */


/*
 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
 *
 * The checks for restrict_interzone_loopback are done in ire_route_recursive.
 */
/* ARGSUSED4 */
int
ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ill_t		*ill = ire->ire_ill;
	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
	uint_t		pktlen = ixa->ixa_pktlen;

	/*
	 * No fragmentation, no nce, no application of IPsec,
	 * and no ipha_ident assignment.
1088 * 1089 * Note different order between IP provider and FW_HOOKS than in 1090 * send_wire case. 1091 */ 1092 1093 /* 1094 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the 1095 * send probe, but not the receive probe. 1096 */ 1097 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1098 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1099 int, 1); 1100 1101 if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) { 1102 int error; 1103 1104 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, 1105 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); 1106 FW_HOOKS(ipst->ips_ip4_loopback_out_event, 1107 ipst->ips_ipv4firewall_loopback_out, 1108 NULL, ill, ipha, mp, mp, 0, ipst, error); 1109 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); 1110 if (mp == NULL) 1111 return (error); 1112 1113 /* 1114 * Even if the destination was changed by the filter we use the 1115 * forwarding decision that was made based on the address 1116 * in ip_output/ip_set_destination. 1117 */ 1118 /* Length could be different */ 1119 ipha = (ipha_t *)mp->b_rptr; 1120 pktlen = ntohs(ipha->ipha_length); 1121 } 1122 1123 /* 1124 * If a callback is enabled then we need to know the 1125 * source and destination zoneids for the packet. We already 1126 * have those handy. 
1127 */ 1128 if (ipst->ips_ip4_observe.he_interested) { 1129 zoneid_t szone, dzone; 1130 zoneid_t stackzoneid; 1131 1132 stackzoneid = netstackid_to_zoneid( 1133 ipst->ips_netstack->netstack_stackid); 1134 1135 if (stackzoneid == GLOBAL_ZONEID) { 1136 /* Shared-IP zone */ 1137 dzone = ire->ire_zoneid; 1138 szone = ixa->ixa_zoneid; 1139 } else { 1140 szone = dzone = stackzoneid; 1141 } 1142 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); 1143 } 1144 1145 /* Handle lo0 stats */ 1146 ipst->ips_loopback_packets++; 1147 1148 /* Map ixa to ira including IPsec policies */ 1149 ipsec_out_to_in(ixa, ill, &iras); 1150 iras.ira_pktlen = pktlen; 1151 1152 if (!IS_SIMPLE_IPH(ipha)) { 1153 ip_output_local_options(ipha, ipst); 1154 iras.ira_flags |= IRAF_IPV4_OPTIONS; 1155 } 1156 1157 if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) { 1158 int error; 1159 1160 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, 1161 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); 1162 FW_HOOKS(ipst->ips_ip4_loopback_in_event, 1163 ipst->ips_ipv4firewall_loopback_in, 1164 ill, NULL, ipha, mp, mp, 0, ipst, error); 1165 1166 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); 1167 if (mp == NULL) { 1168 ira_cleanup(&iras, B_FALSE); 1169 return (error); 1170 } 1171 /* 1172 * Even if the destination was changed by the filter we use the 1173 * forwarding decision that was made based on the address 1174 * in ip_output/ip_set_destination. 
1175 */ 1176 /* Length could be different */ 1177 ipha = (ipha_t *)mp->b_rptr; 1178 pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length); 1179 } 1180 1181 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 1182 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 1183 int, 1); 1184 1185 ire->ire_ib_pkt_count++; 1186 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 1187 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); 1188 1189 /* Destined to ire_zoneid - use that for fanout */ 1190 iras.ira_zoneid = ire->ire_zoneid; 1191 1192 if (is_system_labeled()) { 1193 iras.ira_flags |= IRAF_SYSTEM_LABELED; 1194 1195 /* 1196 * This updates ira_cred, ira_tsl and ira_free_flags based 1197 * on the label. We don't expect this to ever fail for 1198 * loopback packets, so we silently drop the packet should it 1199 * fail. 1200 */ 1201 if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) { 1202 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1203 ip_drop_input("tsol_get_pkt_label", mp, ill); 1204 freemsg(mp); 1205 return (0); 1206 } 1207 ASSERT(iras.ira_tsl != NULL); 1208 1209 /* tsol_get_pkt_label sometimes does pullupmsg */ 1210 ipha = (ipha_t *)mp->b_rptr; 1211 } 1212 1213 ip_fanout_v4(mp, ipha, &iras); 1214 1215 /* We moved any IPsec refs from ixa to iras */ 1216 ira_cleanup(&iras, B_FALSE); 1217 return (0); 1218 } 1219 1220 /* 1221 * ire_sendfn for IRE_BROADCAST 1222 * If the broadcast address is present on multiple ills and ixa_ifindex 1223 * isn't set, then we generate 1224 * a separate datagram (potentially with different source address) for 1225 * those ills. In any case, only one copy is looped back to ip_input_v4. 
 */
int
ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	irb_t		*irb = ire->ire_bucket;
	ire_t		*ire1;
	mblk_t		*mp1;
	ipha_t		*ipha1;
	iaflags_t	ixaflags = ixa->ixa_flags;
	nce_t		*nce1, *nce_orig;

	/*
	 * Unless ire_send_multirt_v4 already set a ttl, force the
	 * ttl to a smallish value.
	 */
	if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) {
		/*
		 * To avoid broadcast storms, we usually set the TTL to 1 for
		 * broadcasts. This can
		 * be overridden stack-wide through the ip_broadcast_ttl
		 * ndd tunable, or on a per-connection basis through the
		 * IP_BROADCAST_TTL socket option.
		 *
		 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4
		 * will force ttl to one after we've set this.
		 */
		if (ixaflags & IXAF_BROADCAST_TTL_SET)
			ipha->ipha_ttl = ixa->ixa_broadcast_ttl;
		else
			ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
	}
	/*
	 * Make sure we get a loopback copy (after IPsec and frag)
	 * Skip hardware checksum so that loopback copy is checksumed.
	 */
	ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;

	/* Do we need to potentially generate multiple copies? */
	if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0)
		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));

	/*
	 * Loop over all IRE_BROADCAST in the bucket (might only be one).
	 * Note that everything in the bucket has the same destination address.
	 */
	irb_refhold(irb);
	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
		/* We do the main IRE after the end of the loop */
		if (ire1 == ire)
			continue;

		/*
		 * Only IREs for the same IP address should be in the same
		 * bucket.
		 * But could have IRE_HOSTs in the case of CGTP.
		 * If we find any multirt routes we bail out of the loop
		 * and just do the single packet at the end; ip_postfrag_multirt
		 * will duplicate the packet.
		 */
		ASSERT(ire1->ire_addr == ire->ire_addr);
		if (!(ire1->ire_type & IRE_BROADCAST))
			continue;

		if (IRE_IS_CONDEMNED(ire1))
			continue;

		if (ixa->ixa_zoneid != ALL_ZONES &&
		    ire->ire_zoneid != ire1->ire_zoneid)
			continue;

		ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL);

		if (ire1->ire_flags & RTF_MULTIRT)
			break;

		/*
		 * For IPMP we only send for the ipmp_ill. arp_nce_init() will
		 * ensure that this goes out on the cast_ill.
		 */
		if (IS_UNDER_IPMP(ire1->ire_ill))
			continue;

		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
			    ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards",
			    mp, ire1->ire_ill);
			continue;
		}

		ipha1 = (ipha_t *)mp1->b_rptr;
		if (ixa->ixa_flags & IXAF_SET_SOURCE) {
			/*
			 * Need to pick a different source address for each
			 * interface. If we have a global IPsec policy and
			 * no per-socket policy then we punt to
			 * ip_output_simple_v4 using a separate ip_xmit_attr_t.
			 */
			if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) {
				ip_output_simple_broadcast(ixa, mp1);
				continue;
			}
			/* Pick a new source address for each interface */
			if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY,
			    ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst,
			    &ipha1->ipha_src, NULL, NULL) != 0) {
				BUMP_MIB(ire1->ire_ill->ill_ip_mib,
				    ipIfStatsOutDiscards);
				ip_drop_output("ipIfStatsOutDiscards - select "
				    "broadcast source", mp1, ire1->ire_ill);
				freemsg(mp1);
				continue;
			}
			/*
			 * Check against global IPsec policy to set the AH/ESP
			 * attributes. IPsec will set IXAF_IPSEC_* and
			 * ixa_ipsec_* as appropriate.
			 */
			if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
				ASSERT(ixa->ixa_ipsec_policy == NULL);
				/*
				 * NOTE(review): the policy lookup is done
				 * against the original header (ipha) while
				 * the copy (ipha1) is what is transmitted --
				 * confirm this is intentional rather than a
				 * typo for ipha1.
				 */
				mp1 = ip_output_attach_policy(mp1, ipha, NULL,
				    NULL, ixa);
				if (mp1 == NULL) {
					/*
					 * MIB and ip_drop_packet already
					 * done
					 */
					continue;
				}
			}
		}
		/* Make sure we have an NCE on this ill */
		nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr,
		    ire1->ire_type);
		if (nce1 == NULL) {
			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
			    ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - broadcast nce",
			    mp1, ire1->ire_ill);
			freemsg(mp1);
			continue;
		}
		/* Temporarily swap in the per-ill nce for this send */
		nce_orig = ixa->ixa_nce;
		ixa->ixa_nce = nce1;

		ire_refhold(ire1);
		/*
		 * Ignore any errors here. We just collect the errno for
		 * the main ire below
		 */
		(void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp);
		ire_refrele(ire1);

		ixa->ixa_nce = nce_orig;
		nce_refrele(nce1);

		/* Only the first transmission gets a loopback copy */
		ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY;
	}
	irb_refrele(irb);
	/* Finally, the main one */

	/*
	 * For IPMP we only send broadcasts on the ipmp_ill.
	 */
	if (IS_UNDER_IPMP(ire->ire_ill)) {
		freemsg(mp);
		return (0);
	}

	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
}

/*
 * Send a packet using a different source address and different
 * IPsec policy.
 */
static void
ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp)
{
	ip_xmit_attr_t ixas;

	/*
	 * Build a minimal, self-contained ip_xmit_attr_t that inherits
	 * only the caller's zone, stack, credentials and label; the
	 * simple output path then applies global IPsec policy itself.
	 */
	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
	ixas.ixa_zoneid = ixa->ixa_zoneid;
	ixas.ixa_ifindex = 0;
	ixas.ixa_ipst = ixa->ixa_ipst;
	ixas.ixa_cred = ixa->ixa_cred;
	ixas.ixa_cpid = ixa->ixa_cpid;
	ixas.ixa_tsl = ixa->ixa_tsl;
	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;

	(void) ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);
}


/*
 * Clamp the TTL of RTF_MULTIRT packets and set IXAF_NO_TTL_CHANGE so
 * later send functions don't raise it again.
 */
static void
multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;

	/* Limit the TTL on multirt packets */
	if (ire->ire_type & IRE_MULTICAST) {
		if (ipha->ipha_ttl > 1) {
			ip2dbg(("ire_send_multirt_v4: forcing multicast "
			    "multirt TTL to 1 (was %d), dst 0x%08x\n",
			    ipha->ipha_ttl, ntohl(ire->ire_addr)));
			ipha->ipha_ttl = 1;
		}
		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
	} else if ((ipst->ips_ip_multirt_ttl > 0) &&
	    (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
		ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
		/*
		 * Need to ensure we don't increase the ttl should we go through
		 * ire_send_broadcast or multicast.
		 */
		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
	}
}

/*
 * ire_sendfn for IRE_MULTICAST
 *
 * Decides whether a loopback copy of the transmitted packet is needed
 * (multicast routing, IP_MULTICAST_LOOP, other shared-IP zones), then
 * hands off to ire_send_wire_v4.
 */
int
ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ill_t		*ill = ire->ire_ill;
	iaflags_t	ixaflags = ixa->ixa_flags;

	/*
	 * The IRE_MULTICAST is the same whether or not multirt is in use.
	 * Hence we need special-case code.
	 */
	if (ixaflags & IXAF_MULTIRT_MULTICAST)
		multirt_check_v4(ire, ipha, ixa);

	/*
	 * Check if anything in ip_input_v4 wants a copy of the transmitted
	 * packet (after IPsec and fragmentation)
	 *
	 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set.
	 *    RSVP and the rsvp daemon is an example of a
	 *    protocol and user level process that
	 *    handles its own routing. Hence, it uses the
	 *    SO_DONTROUTE option to accomplish this.
	 * 2. If the sender has set IP_MULTICAST_LOOP, then we just
	 *    check whether there are any receivers for the group on the ill
	 *    (ignoring the zoneid).
	 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
	 *    any members in other shared-IP zones.
	 *    If such members exist, then we indicate that the sending zone
	 *    shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
	 *    behavior.
	 *
	 * When we loopback we skip hardware checksum to make sure loopback
	 * copy is checksumed.
	 *
	 * Note that ire_ill is the upper in the case of IPMP.
	 */
	ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
	if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
	    !(ixaflags & IXAF_DONTROUTE)) {
		ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
	} else if (ixaflags & IXAF_MULTICAST_LOOP) {
		/*
		 * If this zone or any other zone has members then loopback
		 * a copy.
		 */
		if (ill_hasmembers_v4(ill, ipha->ipha_dst))
			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
	} else if (ipst->ips_netstack->netstack_numzones > 1) {
		/*
		 * This zone should not have a copy. But there are some other
		 * zones which might have members.
		 */
		if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
		    ixa->ixa_zoneid)) {
			ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
			ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
		}
	}

	/*
	 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl,
	 * force the ttl to the IP_MULTICAST_TTL value
	 */
	if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
		ipha->ipha_ttl = ixa->ixa_multicast_ttl;
	}

	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
}

/*
 * ire_sendfn for IREs with RTF_MULTIRT
 *
 * Applies the multirt TTL clamp and then dispatches on the IRE type;
 * plain wire output is the default.
 */
int
ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;

	multirt_check_v4(ire, ipha, ixa);

	if (ire->ire_type & IRE_MULTICAST)
		return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp));
	else if (ire->ire_type & IRE_BROADCAST)
		return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp));
	else
		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
}

/*
 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
 *
 * Blackhole routes silently discard the packet (returning 0); reject and
 * no-route cases generate an ICMP unreachable and return EHOSTUNREACH.
 */
int
ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ill_t		*ill;
	ip_recv_attr_t	iras;
	boolean_t	dummy;

	/* We assign an IP ident for nice errors */
	ipha->ipha_ident = atomic_add_32_nv(identp, 1);

	BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);

	if (ire->ire_type & IRE_NOROUTE) {
		/* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
		ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0,
		    RTA_DST, ipst);
	}

	if (ire->ire_flags & RTF_BLACKHOLE) {
		ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
		freemsg(mp);
		/* No error even for local senders - silent blackhole */
		return (0);
	}
	ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);

	/*
	 * We need an ill_t for the ip_recv_attr_t even though this packet
	 * was never received and icmp_unreachable doesn't currently use
	 * ira_ill.
	 *
	 * NOTE(review): the isv6 argument tests IRAF_IS_IPV4 (a receive-attr
	 * flag) against ixa_flags; this presumably relies on IRAF_IS_IPV4
	 * and IXAF_IS_IPV4 sharing the same bit value (the shared IAF_*
	 * flags) -- confirm, or use IXAF_IS_IPV4 here for clarity.
	 */
	ill = ill_lookup_on_name("lo0", B_FALSE,
	    !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst);
	if (ill == NULL) {
		freemsg(mp);
		return (EHOSTUNREACH);
	}

	bzero(&iras, sizeof (iras));
	/* Map ixa to ira including IPsec policies */
	ipsec_out_to_in(ixa, ill, &iras);

	if (ip_source_routed(ipha, ipst)) {
		icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
	} else {
		icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
	}
	/* We moved any IPsec refs from ixa to iras */
	ira_cleanup(&iras, B_FALSE);
	ill_refrele(ill);
	return (EHOSTUNREACH);
}

/*
 * Calculate a checksum ignoring any hardware capabilities
 *
 * Returns B_FALSE if the packet was too short for the checksum. Caller
 * should free and do stats.
 */
static boolean_t
ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;
	uint_t		pktlen = ixa->ixa_pktlen;
	uint16_t	*cksump;
	uint32_t	cksum;
	uint8_t		protocol = ixa->ixa_protocol;
	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;
	ipaddr_t	dst = ipha->ipha_dst;
	ipaddr_t	src = ipha->ipha_src;

	/* Just in case it contained garbage */
	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;

	/*
	 * Calculate ULP checksum
	 */
	if (protocol == IPPROTO_TCP) {
		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_TCP_CSUM_COMP;
	} else if (protocol == IPPROTO_UDP) {
		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_UDP_CSUM_COMP;
	} else if (protocol == IPPROTO_SCTP) {
		sctp_hdr_t	*sctph;

		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
#ifdef	DEBUG
		if (!skip_sctp_cksum)
#endif
			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
		goto ip_hdr_cksum;
	} else if (protocol == IPPROTO_DCCP) {
		cksump = IPH_DCCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_DCCP_CSUM_COMP;
	} else {
		goto ip_hdr_cksum;
	}

	/* ULP puts the checksum field in the first mblk */
	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);

	/*
	 * We accumulate the pseudo header checksum in cksum.
	 * This is pretty hairy code, so watch close.  One
	 * thing to keep in mind is that UDP and TCP have
	 * stored their respective datagram lengths in their
	 * checksum fields.  This lines things up real nice.
	 */
	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);

	cksum = IP_CSUM(mp, ip_hdr_length, cksum);
	/*
	 * For UDP/IPv4 a zero means that the packet wasn't checksummed.
	 * Change to 0xffff
	 */
	if (protocol == IPPROTO_UDP && cksum == 0)
		*cksump = ~cksum;
	else
		*cksump = cksum;

	IP_STAT(ipst, ip_out_sw_cksum);
	IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen);

ip_hdr_cksum:
	/* Calculate IPv4 header checksum */
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	return (B_TRUE);
}

/*
 * Calculate the ULP checksum - try to use hardware.
 * In the case of MULTIRT, broadcast or multicast the
 * IXAF_NO_HW_CKSUM is set in which case we use software.
 *
 * If the hardware supports IP header checksum offload; then clear the
 * contents of IP header checksum field as expected by NIC.
 * Do this only if we offloaded either full or partial sum.
 *
 * Returns B_FALSE if the packet was too short for the checksum. Caller
 * should free and do stats.
 */
static boolean_t
ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
    ip_xmit_attr_t *ixa, ill_t *ill)
{
	uint_t		pktlen = ixa->ixa_pktlen;
	uint16_t	*cksump;
	uint16_t	hck_flags;
	uint32_t	cksum;
	uint8_t		protocol = ixa->ixa_protocol;
	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;

	/* Fall back to software when offload is disallowed or unavailable */
	if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
	    !dohwcksum) {
		return (ip_output_sw_cksum_v4(mp, ipha, ixa));
	}

	/*
	 * Calculate ULP checksum. Note that we don't use cksump and cksum
	 * if the ill has FULL support.
	 */
	if (protocol == IPPROTO_TCP) {
		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_TCP_CSUM_COMP;	/* Pseudo-header cksum */
	} else if (protocol == IPPROTO_UDP) {
		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_UDP_CSUM_COMP;	/* Pseudo-header cksum */
	} else if (protocol == IPPROTO_SCTP) {
		sctp_hdr_t	*sctph;

		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
#ifdef	DEBUG
		if (!skip_sctp_cksum)
#endif
			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
		goto ip_hdr_cksum;
	} else if (protocol == IPPROTO_DCCP) {
		cksump = IPH_DCCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_DCCP_CSUM_COMP;
	} else {
	ip_hdr_cksum:
		/* Calculate IPv4 header checksum */
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		return (B_TRUE);
	}

	/* ULP puts the checksum field in the first mblk */
	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);

	/*
	 * Underlying interface supports hardware checksum offload for
	 * the payload; leave the payload checksum for the hardware to
	 * calculate.  N.B: We only need to set up checksum info on the
	 * first mblk.
	 */
	hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;

	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
	if (hck_flags & HCKSUM_INET_FULL_V4) {
		/*
		 * Hardware calculates pseudo-header, header and the
		 * payload checksums, so clear the checksum field in
		 * the protocol header.
		 */
		*cksump = 0;
		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;

		ipha->ipha_hdr_checksum = 0;
		if (hck_flags & HCKSUM_IPHDRCKSUM) {
			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
		} else {
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		return (B_TRUE);
	}
	if ((hck_flags) & HCKSUM_INET_PARTIAL) {
		ipaddr_t dst = ipha->ipha_dst;
		ipaddr_t src = ipha->ipha_src;
		/*
		 * Partial checksum offload has been enabled.  Fill
		 * the checksum field in the protocol header with the
		 * pseudo-header checksum value.
		 *
		 * We accumulate the pseudo header checksum in cksum.
		 * This is pretty hairy code, so watch close.  One
		 * thing to keep in mind is that UDP and TCP have
		 * stored their respective datagram lengths in their
		 * checksum fields.  This lines things up real nice.
		 */
		cksum += (dst >> 16) + (dst & 0xFFFF) +
		    (src >> 16) + (src & 0xFFFF);
		cksum += *(cksump);
		cksum = (cksum & 0xFFFF) + (cksum >> 16);
		*(cksump) = (cksum & 0xFFFF) + (cksum >> 16);

		/*
		 * Offsets are relative to beginning of IP header.
		 */
		DB_CKSUMSTART(mp) = ip_hdr_length;
		DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha;
		DB_CKSUMEND(mp) = pktlen;
		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;

		ipha->ipha_hdr_checksum = 0;
		if (hck_flags & HCKSUM_IPHDRCKSUM) {
			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
		} else {
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		return (B_TRUE);
	}
	/* Hardware capabilities include neither full nor partial IPv4 */
	return (ip_output_sw_cksum_v4(mp, ipha, ixa));
}

/*
 * ire_sendfn for offlink and onlink destinations.
 * Also called from the multicast, broadcast, multirt send functions.
 *
 * Assumes that the caller has a hold on the ire.
 *
 * This function doesn't care if the IRE just became condemned since that
 * can happen at any time.
 */
/* ARGSUSED */
int
ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ipha_t		*ipha = (ipha_t *)iph_arg;
	iaflags_t	ixaflags = ixa->ixa_flags;
	ill_t		*ill;

	ASSERT(ixa->ixa_nce != NULL);
	ill = ixa->ixa_nce->nce_ill;

	if (ixaflags & IXAF_DONTROUTE)
		ipha->ipha_ttl = 1;

	/*
	 * Assign an ident value for this packet. There could be other
	 * threads targeting the same destination, so we have to arrange
	 * for a atomic increment.  Note that we use a 32-bit atomic add
	 * because it has better performance than its 16-bit sibling.
	 *
	 * Normally ixa_extra_ident is 0, but in the case of LSO it will
	 * be the number of TCP segments that the driver/hardware will
	 * additionally construct.
	 *
	 * If running in cluster mode and if the source address
	 * belongs to a replicated service then vector through
	 * cl_inet_ipident vector to allocate ip identifier
	 * NOTE: This is a contract private interface with the
	 * clustering group.
	 */
	if (cl_inet_ipident != NULL) {
		ipaddr_t src = ipha->ipha_src;
		ipaddr_t dst = ipha->ipha_dst;
		netstackid_t stack_id = ipst->ips_netstack->netstack_stackid;

		ASSERT(cl_inet_isclusterwide != NULL);
		if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP,
		    AF_INET, (uint8_t *)(uintptr_t)src, NULL)) {
			/*
			 * Note: not correct with LSO since we can't allocate
			 * ixa_extra_ident+1 consecutive values.
			 */
			ipha->ipha_ident = (*cl_inet_ipident)(stack_id,
			    IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src,
			    (uint8_t *)(uintptr_t)dst, NULL);
		} else {
			ipha->ipha_ident = atomic_add_32_nv(identp,
			    ixa->ixa_extra_ident + 1);
		}
	} else {
		ipha->ipha_ident = atomic_add_32_nv(identp,
		    ixa->ixa_extra_ident + 1);
	}
#ifndef _BIG_ENDIAN
	ipha->ipha_ident = htons(ipha->ipha_ident);
#endif

	/*
	 * This might set b_band, thus the IPsec and fragmentation
	 * code in IP ensures that b_band is updated in the first mblk.
	 */
	if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
		/* ip_process translates an IS_UNDER_IPMP */
		mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
		if (mp == NULL) {
			/* ip_drop_packet and MIB done */
			return (0);	/* Might just be delayed */
		}
	}

	/*
	 * Verify any IPv4 options.
	 *
	 * The presence of IP options also forces the network stack to
	 * calculate the checksum in software.  This is because:
	 *
	 * Wrap around: certain partial-checksum NICs (eri, ce) limit
	 * the size of "start offset" width to 6-bit.  This effectively
	 * sets the largest value of the offset to 64-bytes, starting
	 * from the MAC header.  When the cumulative MAC and IP headers
	 * exceed such limit, the offset will wrap around.  This causes
	 * the checksum to be calculated at the wrong place.
	 *
	 * IPv4 source routing: none of the full-checksum capable NICs
	 * is capable of correctly handling the	IPv4 source-routing
	 * option for purposes of calculating the pseudo-header; the
	 * actual destination is different from the destination in the
	 * header which is that of the next-hop.  (This case may not be
	 * true for NICs which can parse IPv6 extension headers, but
	 * we choose to simplify the implementation by not offloading
	 * checksum when they are present.)
	 */
	if (!IS_SIMPLE_IPH(ipha)) {
		ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM;
		/* An IS_UNDER_IPMP ill is ok here */
		if (ip_output_options(mp, ipha, ixa, ill)) {
			/* Packet has been consumed and ICMP error sent */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			return (EINVAL);
		}
	}

	/*
	 * To handle IPsec/iptun's labeling needs we need to tag packets
	 * while we still have ixa_tsl
	 */
	if (is_system_labeled() && ixa->ixa_tsl != NULL &&
	    (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
	    ill->ill_mactype == DL_IPV6)) {
		cred_t	*newcr;

		newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
		    KM_NOSLEEP);
		if (newcr == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - newcr",
			    mp, ill);
			freemsg(mp);
			return (ENOBUFS);
		}
		mblk_setcred(mp, newcr, NOPID);
		crfree(newcr);	/* mblk_setcred did its own crhold */
	}

	if (ixa->ixa_pktlen > ixa->ixa_fragsize ||
	    (ixaflags & IXAF_IPSEC_SECURE)) {
		uint32_t pktlen;

		pktlen = ixa->ixa_pktlen;
		if (ixaflags & IXAF_IPSEC_SECURE)
			pktlen += ipsec_out_extra_length(ixa);

		/*
		 * NOTE(review): returns without freeing mp -- confirm the
		 * caller owns the message on this error path.
		 */
		if (pktlen > IP_MAXPACKET)
			return (EMSGSIZE);

		if (ixaflags & IXAF_SET_ULP_CKSUM) {
			/*
			 * Compute ULP checksum and IP header checksum
			 * using software
			 */
			if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
				ip_drop_output("ipIfStatsOutDiscards", mp, ill);
				freemsg(mp);
				return (EINVAL);
			}
		} else {
			/* Calculate IPv4 header checksum */
			ipha->ipha_hdr_checksum = 0;
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}

		/*
		 * If this packet would generate a icmp_frag_needed
		 * message, we need to handle it before we do the IPsec
		 * processing. Otherwise, we need to strip the IPsec
		 * headers before we send up the message to the ULPs
		 * which becomes messy and difficult.
		 *
		 * We check using IXAF_DONTFRAG. The DF bit in the header
		 * is not inspected - it will be copied to any generated
		 * fragments.
		 */
		if ((pktlen > ixa->ixa_fragsize) &&
		    (ixaflags & IXAF_DONTFRAG)) {
			/* Generate ICMP and return error */
			ip_recv_attr_t	iras;

			DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen,
			    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
			    uint_t, ixa->ixa_pmtu);

			bzero(&iras, sizeof (iras));
			/* Map ixa to ira including IPsec policies */
			ipsec_out_to_in(ixa, ill, &iras);

			ip_drop_output("ICMP_FRAG_NEEDED", mp, ill);
			icmp_frag_needed(mp, ixa->ixa_fragsize, &iras);
			/* We moved any IPsec refs from ixa to iras */
			ira_cleanup(&iras, B_FALSE);
			return (EMSGSIZE);
		}
		DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen,
		    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
		    uint_t, ixa->ixa_pmtu);

		if (ixaflags & IXAF_IPSEC_SECURE) {
			/*
			 * Pass in sufficient information so that
			 * IPsec can determine whether to fragment, and
			 * which function to call after fragmentation.
			 */
			return (ipsec_out_process(mp, ixa));
		}
		return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags,
		    ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint,
		    ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
		    ixa->ixa_postfragfn, &ixa->ixa_cookie));
	}
	if (ixaflags & IXAF_SET_ULP_CKSUM) {
		/* Compute ULP checksum and IP header checksum */
		/* An IS_UNDER_IPMP ill is ok here */
		if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
			freemsg(mp);
			return (EINVAL);
		}
	} else {
		/* Calculate IPv4 header checksum */
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	}
	return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
	    ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
	    ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
}

/*
 * Send mp into ip_input
 * Common for IPv4 and IPv6
 */
void
ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
    uint_t pkt_len, zoneid_t nolzid)
{
	rtc_t		rtc;
	ill_t		*ill = nce->nce_ill;
	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
	ncec_t		*ncec;

	/* Synthesize receive attributes as if the driver looped it back */
	ncec = nce->nce_common;
	iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM |
	    IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
	if (ncec->ncec_flags & NCE_F_BCAST)
		iras.ira_flags |= IRAF_L2DST_BROADCAST;
	else if (ncec->ncec_flags & NCE_F_MCAST)
		iras.ira_flags |= IRAF_L2DST_MULTICAST;

	iras.ira_free_flags = 0;
	iras.ira_cred = NULL;
	iras.ira_cpid = NOPID;
	iras.ira_tsl = NULL;
	iras.ira_zoneid = ALL_ZONES;
	iras.ira_pktlen = pkt_len;
	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen);
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);

	if (ixaflags & IXAF_IS_IPV4)
		iras.ira_flags |= IRAF_IS_IPV4;

	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;
	iras.ira_mhip = NULL;

	/* Copy over the flag bits shared between xmit and recv attrs */
	iras.ira_flags |= ixaflags & IAF_MASK;
	iras.ira_no_loop_zoneid = nolzid;

	/* Broadcast and multicast doesn't care about the squeue */
	iras.ira_sqp = NULL;

	rtc.rtc_ire = NULL;
	if (ixaflags & IXAF_IS_IPV4) {
		ipha_t	*ipha = (ipha_t *)mp->b_rptr;

		rtc.rtc_ipaddr = INADDR_ANY;

		(*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
		if (rtc.rtc_ire != NULL) {
			ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
			ire_refrele(rtc.rtc_ire);
		}
	} else {
		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;

		rtc.rtc_ip6addr = ipv6_all_zeros;

		(*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc);
		if (rtc.rtc_ire != NULL) {
			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr));
			ire_refrele(rtc.rtc_ire);
		}
	}
	/* Any references to clean up? No hold on ira */
	if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
		ira_cleanup(&iras, B_FALSE);
}

/*
 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which
 * looks at the IXAF_LOOPBACK_COPY flag.
 * Common for IPv4 and IPv6.
 *
 * If the loopback copy fails (due to no memory) but we send the packet out
 * on the wire we return no failure. Only in the case we suppress the wire
 * sending do we take the loopback failure into account.
 *
 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy.
 * Those operations are performed on this packet in ip_xmit() and it would
 * be odd to do it twice for the same packet.
2135 */ 2136 int 2137 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2138 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2139 uintptr_t *ixacookie) 2140 { 2141 ill_t *ill = nce->nce_ill; 2142 int error = 0; 2143 2144 /* 2145 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver 2146 * had looped it back 2147 */ 2148 if (ixaflags & IXAF_LOOPBACK_COPY) { 2149 mblk_t *mp1; 2150 2151 mp1 = copymsg(mp); 2152 if (mp1 == NULL) { 2153 /* Failed to deliver the loopback copy. */ 2154 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2155 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2156 error = ENOBUFS; 2157 } else { 2158 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2159 nolzid); 2160 } 2161 } 2162 2163 /* 2164 * If TTL = 0 then only do the loopback to this host i.e. we are 2165 * done. We are also done if this was the 2166 * loopback interface since it is sufficient 2167 * to loopback one copy of a multicast packet. 2168 */ 2169 if (ixaflags & IXAF_IS_IPV4) { 2170 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2171 2172 if (ipha->ipha_ttl == 0) { 2173 ip_drop_output("multicast ipha_ttl not sent to wire", 2174 mp, ill); 2175 freemsg(mp); 2176 return (error); 2177 } 2178 } else { 2179 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2180 2181 if (ip6h->ip6_hops == 0) { 2182 ip_drop_output("multicast ipha_ttl not sent to wire", 2183 mp, ill); 2184 freemsg(mp); 2185 return (error); 2186 } 2187 } 2188 if (nce->nce_ill->ill_wq == NULL) { 2189 /* Loopback interface */ 2190 ip_drop_output("multicast on lo0 not sent to wire", mp, ill); 2191 freemsg(mp); 2192 return (error); 2193 } 2194 2195 return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2196 ixacookie)); 2197 } 2198 2199 /* 2200 * Post fragmentation function for RTF_MULTIRT routes. 2201 * Since IRE_BROADCASTs can have RTF_MULTIRT, this function 2202 * checks IXAF_LOOPBACK_COPY. 
2203 * 2204 * If no packet is sent due to failures then we return an errno, but if at 2205 * least one succeeded we return zero. 2206 */ 2207 int 2208 ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, 2209 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, 2210 uintptr_t *ixacookie) 2211 { 2212 irb_t *irb; 2213 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2214 ire_t *ire; 2215 ire_t *ire1; 2216 mblk_t *mp1; 2217 nce_t *nce1; 2218 ill_t *ill = nce->nce_ill; 2219 ill_t *ill1; 2220 ip_stack_t *ipst = ill->ill_ipst; 2221 int error = 0; 2222 int num_sent = 0; 2223 int err; 2224 uint_t ire_type; 2225 ipaddr_t nexthop; 2226 2227 ASSERT(ixaflags & IXAF_IS_IPV4); 2228 2229 /* Check for IXAF_LOOPBACK_COPY */ 2230 if (ixaflags & IXAF_LOOPBACK_COPY) { 2231 mblk_t *mp1; 2232 2233 mp1 = copymsg(mp); 2234 if (mp1 == NULL) { 2235 /* Failed to deliver the loopback copy. */ 2236 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2237 ip_drop_output("ipIfStatsOutDiscards", mp, ill); 2238 error = ENOBUFS; 2239 } else { 2240 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, 2241 nolzid); 2242 } 2243 } 2244 2245 /* 2246 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send 2247 * a copy to each one. 2248 * Use the nce (nexthop) and ipha_dst to find the ire. 2249 * 2250 * MULTIRT is not designed to work with shared-IP zones thus we don't 2251 * need to pass a zoneid or a label to the IRE lookup. 
2252 */ 2253 if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) { 2254 /* Broadcast and multicast case */ 2255 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0, 2256 NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 2257 } else { 2258 ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr); 2259 2260 /* Unicast case */ 2261 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0, 2262 NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL); 2263 } 2264 2265 if (ire == NULL || 2266 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 2267 !(ire->ire_flags & RTF_MULTIRT)) { 2268 /* Drop */ 2269 ip_drop_output("ip_postfrag_multirt didn't find route", 2270 mp, nce->nce_ill); 2271 if (ire != NULL) 2272 ire_refrele(ire); 2273 return (ENETUNREACH); 2274 } 2275 2276 irb = ire->ire_bucket; 2277 irb_refhold(irb); 2278 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 2279 /* 2280 * For broadcast we can have a mixture of IRE_BROADCAST and 2281 * IRE_HOST due to the manually added IRE_HOSTs that are used 2282 * to trigger the creation of the special CGTP broadcast routes. 2283 * Thus we have to skip if ire_type doesn't match the original. 2284 */ 2285 if (IRE_IS_CONDEMNED(ire1) || 2286 !(ire1->ire_flags & RTF_MULTIRT) || 2287 ire1->ire_type != ire->ire_type) 2288 continue; 2289 2290 /* Do the ire argument one after the loop */ 2291 if (ire1 == ire) 2292 continue; 2293 2294 ill1 = ire_nexthop_ill(ire1); 2295 if (ill1 == NULL) { 2296 /* 2297 * This ire might not have been picked by 2298 * ire_route_recursive, in which case ire_dep might 2299 * not have been setup yet. 2300 * We kick ire_route_recursive to try to resolve 2301 * starting at ire1. 
2302 */ 2303 ire_t *ire2; 2304 uint_t match_flags = MATCH_IRE_DSTONLY; 2305 2306 if (ire1->ire_ill != NULL) 2307 match_flags |= MATCH_IRE_ILL; 2308 ire2 = ire_route_recursive_impl_v4(ire1, 2309 ire1->ire_addr, ire1->ire_type, ire1->ire_ill, 2310 ire1->ire_zoneid, NULL, match_flags, 2311 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL); 2312 if (ire2 != NULL) 2313 ire_refrele(ire2); 2314 ill1 = ire_nexthop_ill(ire1); 2315 } 2316 2317 if (ill1 == NULL) { 2318 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2319 ip_drop_output("ipIfStatsOutDiscards - no ill", 2320 mp, ill); 2321 error = ENETUNREACH; 2322 continue; 2323 } 2324 2325 /* Pick the addr and type to use for arp_nce_init */ 2326 if (nce->nce_common->ncec_flags & NCE_F_BCAST) { 2327 ire_type = IRE_BROADCAST; 2328 nexthop = ire1->ire_gateway_addr; 2329 } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) { 2330 ire_type = IRE_MULTICAST; 2331 nexthop = ipha->ipha_dst; 2332 } else { 2333 ire_type = ire1->ire_type; /* Doesn't matter */ 2334 nexthop = ire1->ire_gateway_addr; 2335 } 2336 2337 /* If IPMP meta or under, then we just drop */ 2338 if (ill1->ill_grp != NULL) { 2339 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2340 ip_drop_output("ipIfStatsOutDiscards - IPMP", 2341 mp, ill1); 2342 ill_refrele(ill1); 2343 error = ENETUNREACH; 2344 continue; 2345 } 2346 2347 nce1 = arp_nce_init(ill1, nexthop, ire_type); 2348 if (nce1 == NULL) { 2349 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2350 ip_drop_output("ipIfStatsOutDiscards - no nce", 2351 mp, ill1); 2352 ill_refrele(ill1); 2353 error = ENETUNREACH; 2354 continue; 2355 } 2356 mp1 = copymsg(mp); 2357 if (mp1 == NULL) { 2358 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); 2359 ip_drop_output("ipIfStatsOutDiscards", mp, ill1); 2360 nce_refrele(nce1); 2361 ill_refrele(ill1); 2362 error = ENOBUFS; 2363 continue; 2364 } 2365 /* Preserve HW checksum for this copy */ 2366 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); 2367 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); 2368 
DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); 2369 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); 2370 DB_LSOMSS(mp1) = DB_LSOMSS(mp); 2371 2372 ire1->ire_ob_pkt_count++; 2373 err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone, 2374 0, ixacookie); 2375 if (err == 0) 2376 num_sent++; 2377 else 2378 error = err; 2379 nce_refrele(nce1); 2380 ill_refrele(ill1); 2381 } 2382 irb_refrele(irb); 2383 ire_refrele(ire); 2384 /* Finally, the main one */ 2385 err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, 2386 ixacookie); 2387 if (err == 0) 2388 num_sent++; 2389 else 2390 error = err; 2391 if (num_sent > 0) 2392 return (0); 2393 else 2394 return (error); 2395 } 2396 2397 /* 2398 * Verify local connectivity. This check is called by ULP fusion code. 2399 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if 2400 * the interface is brought down and back up. So we simply fail the local 2401 * process. The caller, TCP Fusion, should unfuse the connection. 2402 */ 2403 boolean_t 2404 ip_output_verify_local(ip_xmit_attr_t *ixa) 2405 { 2406 ire_t *ire = ixa->ixa_ire; 2407 2408 if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) 2409 return (B_FALSE); 2410 2411 return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation); 2412 } 2413 2414 /* 2415 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6. 2416 * 2417 * The caller must call ip_output_verify_local() first. This function handles 2418 * IPobs, FW_HOOKS, and/or IPsec cases sequentially. 
 */
mblk_t *
ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out,
    boolean_t hooks_in, conn_t *peer_connp)
{
	ill_t		*ill = ixa->ixa_ire->ire_ill;
	ipha_t		*ipha = NULL;
	ip6_t		*ip6h = NULL;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	iaflags_t	ixaflags = ixa->ixa_flags;
	ip_recv_attr_t	iras;
	int		error;	/* written by FW_HOOKS; not examined here */

	ASSERT(mp != NULL);

	if (ixaflags & IXAF_IS_IPV4) {
		ipha = (ipha_t *)mp->b_rptr;

		/*
		 * If a callback is enabled then we need to know the
		 * source and destination zoneids for the packet. We already
		 * have those handy.
		 */
		if (ipst->ips_ip4_observe.he_interested) {
			zoneid_t szone, dzone;
			zoneid_t stackzoneid;

			stackzoneid = netstackid_to_zoneid(
			    ipst->ips_netstack->netstack_stackid);

			if (stackzoneid == GLOBAL_ZONEID) {
				/* Shared-IP zone */
				dzone = ixa->ixa_ire->ire_zoneid;
				szone = ixa->ixa_zoneid;
			} else {
				szone = dzone = stackzoneid;
			}
			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
			    ipst);
		}
		/* DTrace probe for the send side of the loopback */
		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
		    NULL, int, 1);

		/* FW_HOOKS: LOOPBACK_OUT */
		if (hooks_out) {
			DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
			    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
			FW_HOOKS(ipst->ips_ip4_loopback_out_event,
			    ipst->ips_ipv4firewall_loopback_out,
			    NULL, ill, ipha, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
		}
		/* The hook may have consumed (dropped) the packet */
		if (mp == NULL)
			return (NULL);

		/* FW_HOOKS: LOOPBACK_IN */
		if (hooks_in) {
			DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
			    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
			FW_HOOKS(ipst->ips_ip4_loopback_in_event,
			    ipst->ips_ipv4firewall_loopback_in,
			    ill, NULL, ipha, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
		}
		/* The hook may have consumed (dropped) the packet */
		if (mp == NULL)
			return (NULL);

		/* DTrace probe for the receive side of the loopback */
		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
		    NULL, int, 1);

		/* Inbound IPsec policies */
		if (peer_connp != NULL) {
			/* Map ixa to ira including IPsec policies. */
			ipsec_out_to_in(ixa, ill, &iras);
			mp = ipsec_check_inbound_policy(mp, peer_connp, ipha,
			    NULL, &iras);
		}
	} else {
		/* IPv6 path; mirrors the IPv4 sequence above */
		ip6h = (ip6_t *)mp->b_rptr;

		/*
		 * If a callback is enabled then we need to know the
		 * source and destination zoneids for the packet. We already
		 * have those handy.
		 */
		if (ipst->ips_ip6_observe.he_interested) {
			zoneid_t szone, dzone;
			zoneid_t stackzoneid;

			stackzoneid = netstackid_to_zoneid(
			    ipst->ips_netstack->netstack_stackid);

			if (stackzoneid == GLOBAL_ZONEID) {
				/* Shared-IP zone */
				dzone = ixa->ixa_ire->ire_zoneid;
				szone = ixa->ixa_zoneid;
			} else {
				szone = dzone = stackzoneid;
			}
			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
			    ipst);
		}
		/* DTrace probe for the send side of the loopback */
		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
		    ip6h, int, 1);

		/* FW_HOOKS: LOOPBACK_OUT */
		if (hooks_out) {
			DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL,
			    ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp);
			FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
			    ipst->ips_ipv6firewall_loopback_out,
			    NULL, ill, ip6h, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
		}
		/* The hook may have consumed (dropped) the packet */
		if (mp == NULL)
			return (NULL);

		/* FW_HOOKS: LOOPBACK_IN */
		if (hooks_in) {
			DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill,
			    ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp);
			FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
			    ipst->ips_ipv6firewall_loopback_in,
			    ill, NULL, ip6h, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
		}
		/* The hook may have consumed (dropped) the packet */
		if (mp == NULL)
			return (NULL);

		/* DTrace probe for the receive side of the loopback */
		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
		    ip6h, int, 1);

		/* Inbound IPsec policies */
		if (peer_connp != NULL) {
			/* Map ixa to ira including IPsec policies. */
			ipsec_out_to_in(ixa, ill, &iras);
			mp = ipsec_check_inbound_policy(mp, peer_connp, NULL,
			    ip6h, &iras);
		}
	}

	/* NULL here means a hook or the IPsec policy check dropped it */
	if (mp == NULL) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ipIfStatsInDiscards", NULL, ill);
	}

	return (mp);
}