/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/ksynch.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/disp.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <netinet/in.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <inet/udp_impl.h>
#include <inet/kstatcom.h>

#include <inet/ilb_ip.h>
#include "ilb_alg.h"
#include "ilb_nat.h"
#include "ilb_conn.h"

/* ILB kmem cache flag */
int ilb_kmem_flags = 0;
/*
 * The default sizes of the different hash tables.  These defaults are
 * global for all stacks, but each stack allocates its own tables; only
 * the sizes are the same.
 */
static size_t ilb_rule_hash_size = 2048;

static size_t ilb_conn_hash_size = 262144;

static size_t ilb_sticky_hash_size = 262144;

/* This should be a prime number. */
static size_t ilb_nat_src_hash_size = 97;

/* Default NAT cache entry expiry time. */
static uint32_t ilb_conn_tcp_expiry = 120;
static uint32_t ilb_conn_udp_expiry = 60;

/* Default sticky entry expiry time. */
static uint32_t ilb_sticky_expiry = 60;

/* addr is assumed to be a uint8_t * to an ipaddr_t. */
#define ILB_RULE_HASH(addr, hash_size) \
        ((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \
        *(addr)) & ((hash_size) - 1))
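
/*
 * Illustrative example (not part of the module): for a VIP of
 * 192.168.10.5 stored in network byte order, (addr)[0..3] is
 * {192, 168, 10, 5}, so the macro computes
 *
 *      (5 * 29791 + 10 * 961 + 168 * 31 + 192) & (hash_size - 1)
 *
 * i.e. the usual 31-based polynomial hash over the four address bytes
 * (29791 == 31^3, 961 == 31^2).  The mask only works if hash_size is a
 * power of 2, which ilb_rule_hash_init() guarantees.
 */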

/*
 * Note on ILB delayed processing
 *
 * To avoid inline removal of some of the data structures, such as rules,
 * servers and ilb_conn_hash entries, ILB defers such processing to a taskq.
 * There are three types of ILB taskq:
 *
 * 1. rule handling: created at stack initialization time, ilb_stack_init()
 * 2. conn hash handling: created at conn hash initialization time,
 *                        ilb_conn_hash_init()
 * 3. sticky hash handling: created at sticky hash initialization time,
 *                          ilb_sticky_hash_init()
 *
 * The rule taskq is for processing rule and server removal.  When a user
 * land rule/server removal request comes in, a task is dispatched to the
 * taskq after the rule/server has been removed from all related hashes.
 * This task waits until all references to the rule/server are gone before
 * removing it, so the user land thread requesting the removal does not
 * need to wait for the removal to complete.
 *
 * The conn hash/sticky hash taskq is for processing ilb_conn_hash and
 * ilb_sticky_hash table entry removal.  There are ilb_conn_timer_size timers
 * and ilb_sticky_timer_size timers running for ilb_conn_hash and
 * ilb_sticky_hash cleanup respectively.  Each timer is responsible for one
 * portion (of the same size) of the hash table.  When a timer fires, it
 * dispatches a task to the conn hash taskq to clean up its portion of the
 * table.  This avoids inline processing of the removal.
 *
 * There is one more piece of delayed processing: the cleanup of the NAT
 * source address table.  Since that table is small, it is handled directly
 * from the timer instead of a taskq.
 */
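
/*
 * Sketch of the deferred removal pattern described above, as used by
 * ilb_rule_del() later in this file: the rule is first unlinked so that
 * no new reference can be taken, and the wait-and-free work runs in the
 * rule taskq.
 *
 *      ilb_rule_hash_del(rule);
 *      ilb_rule_g_del(ilbs, rule);
 *      (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
 *          arg, TQ_SLEEP);
 */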

/* ILB rule taskq constants. */
#define ILB_RULE_TASKQ_NUM_THR  20

/* Argument passed to ILB rule taskq routines. */
typedef struct {
        ilb_stack_t     *ilbs;
        ilb_rule_t      *rule;
} ilb_rule_tq_t;

/* kstat handling routines. */
static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *);
static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *);
static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *);
static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *,
    ilb_server_t *);

/* Rule hash handling routines. */
static void ilb_rule_hash_init(ilb_stack_t *);
static void ilb_rule_hash_fini(ilb_stack_t *);
static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *);
static void ilb_rule_hash_del(ilb_rule_t *);
static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *,
    in_port_t, zoneid_t, uint32_t, boolean_t *);

static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *);
static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *);
static void ilb_rule_del_common(ilb_stack_t *, ilb_rule_t *);
static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *,
    int *);
static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int,
    int, in_port_t, in_port_t, const in6_addr_t *);

/* Back end server handling routines. */
static void ilb_server_free(ilb_server_t *);

/* Network stack handling routines. */
static void *ilb_stack_init(netstackid_t, netstack_t *);
static void ilb_stack_shutdown(netstackid_t, void *);
static void ilb_stack_fini(netstackid_t, void *);

/* Sticky connection handling routines. */
static void ilb_rule_sticky_init(ilb_rule_t *);
static void ilb_rule_sticky_fini(ilb_rule_t *);

/* Handy macro to check for unspecified address. */
#define IS_ADDR_UNSPEC(addr)                                            \
        (IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) :  \
            IN6_IS_ADDR_UNSPECIFIED(addr))
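
/*
 * For example, IS_ADDR_UNSPEC() is true for both the IPv4-mapped any
 * address ::ffff:0.0.0.0 and the IPv6 unspecified address ::, but
 * false for a routable v4-mapped address such as ::ffff:192.168.10.5.
 */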

/*
 * Global kstat instance counter.  When a rule is created, its kstat instance
 * number is assigned by ilb_kstat_instance and ilb_kstat_instance is
 * incremented.
 */
static uint_t ilb_kstat_instance = 0;

/*
 * The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME.
 * A rule's kstat has ILB_RULE_KS_CNAME class name.
 */
#define ILB_G_KS_NAME           "global"
#define ILB_G_KS_CNAME          "kstat"
#define ILB_RULE_KS_CNAME       "rulestat"

static kstat_t *
ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs)
{
        kstat_t *ksp;
        ilb_g_kstat_t template = {
                { "num_rules",          KSTAT_DATA_UINT64, {{0}} },
                { "ip_frag_in",         KSTAT_DATA_UINT64, {{0}} },
                { "ip_frag_dropped",    KSTAT_DATA_UINT64, {{0}} }
        };

        ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME,
            ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t),
            KSTAT_FLAG_VIRTUAL, stackid);
        if (ksp == NULL)
                return (NULL);
        bcopy(&template, ilbs->ilbs_kstat, sizeof (template));
        ksp->ks_data = ilbs->ilbs_kstat;
        ksp->ks_private = (void *)(uintptr_t)stackid;

        kstat_install(ksp);
        return (ksp);
}

static void
ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs)
{
        if (ilbs->ilbs_ksp != NULL) {
                ASSERT(stackid == (netstackid_t)(uintptr_t)
                    ilbs->ilbs_ksp->ks_private);
                kstat_delete_netstack(ilbs->ilbs_ksp, stackid);
                ilbs->ilbs_ksp = NULL;
        }
}

static kstat_t *
ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule)
{
        kstat_t *ksp;
        ilb_rule_kstat_t template = {
                { "num_servers",                KSTAT_DATA_UINT64, {{0}} },
                { "bytes_not_processed",        KSTAT_DATA_UINT64, {{0}} },
                { "pkt_not_processed",          KSTAT_DATA_UINT64, {{0}} },
                { "bytes_dropped",              KSTAT_DATA_UINT64, {{0}} },
                { "pkt_dropped",                KSTAT_DATA_UINT64, {{0}} },
                { "nomem_bytes_dropped",        KSTAT_DATA_UINT64, {{0}} },
                { "nomem_pkt_dropped",          KSTAT_DATA_UINT64, {{0}} },
                { "noport_bytes_dropped",       KSTAT_DATA_UINT64, {{0}} },
                { "noport_pkt_dropped",         KSTAT_DATA_UINT64, {{0}} },
                { "icmp_echo_processed",        KSTAT_DATA_UINT64, {{0}} },
                { "icmp_dropped",               KSTAT_DATA_UINT64, {{0}} },
                { "icmp_too_big_processed",     KSTAT_DATA_UINT64, {{0}} },
                { "icmp_too_big_dropped",       KSTAT_DATA_UINT64, {{0}} }
        };

        ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
            rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED,
            NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
        if (ksp == NULL)
                return (NULL);

        bcopy(&template, &rule->ir_kstat, sizeof (template));
        ksp->ks_data = &rule->ir_kstat;
        ksp->ks_private = (void *)(uintptr_t)stackid;

        kstat_install(ksp);
        return (ksp);
}

static kstat_t *
ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule,
    ilb_server_t *server)
{
        kstat_t *ksp;
        ilb_server_kstat_t template = {
                { "bytes_processed",    KSTAT_DATA_UINT64, {{0}} },
                { "pkt_processed",      KSTAT_DATA_UINT64, {{0}} },
                { "ip_address",         KSTAT_DATA_STRING, {{0}} }
        };
        char cname_buf[KSTAT_STRLEN];

        /* 7 is strlen("-sstat") plus the terminating NUL. */
        ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN);
        (void) sprintf(cname_buf, "%s-sstat", rule->ir_name);
        ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
            server->iser_name, cname_buf, KSTAT_TYPE_NAMED,
            NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
        if (ksp == NULL)
                return (NULL);

        bcopy(&template, &server->iser_kstat, sizeof (template));
        ksp->ks_data = &server->iser_kstat;
        ksp->ks_private = (void *)(uintptr_t)stackid;

        kstat_named_setstr(&server->iser_kstat.ip_address,
            server->iser_ip_addr);
        /* We never change the IP address */
        ksp->ks_data_size += strlen(server->iser_ip_addr) + 1;

        kstat_install(ksp);
        return (ksp);
}

/* Initialize the rule hash table. */
static void
ilb_rule_hash_init(ilb_stack_t *ilbs)
{
        int i;

        /*
         * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to
         * the next power of 2.
         */
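        /*
         * For example (illustrative), a tunable setting of 2000 would be
         * bumped to 2048 here, since ILB_RULE_HASH() masks the hash value
         * with (hash_size - 1) and so needs a power of 2 table size.
         */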
        if (!ISP2(ilbs->ilbs_rule_hash_size)) {
                for (i = 0; i < 31; i++) {
                        if (ilbs->ilbs_rule_hash_size < (1 << i))
                                break;
                }
                ilbs->ilbs_rule_hash_size = 1 << i;
        }
        ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) *
            ilbs->ilbs_rule_hash_size, KM_SLEEP);
        for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
                mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL,
                    MUTEX_DEFAULT, NULL);
        }
}

/* Clean up the rule hash table. */
static void
ilb_rule_hash_fini(ilb_stack_t *ilbs)
{
        if (ilbs->ilbs_g_hash == NULL)
                return;
        kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) *
            ilbs->ilbs_rule_hash_size);
}

/* Add a rule to the rule hash table. */
static void
ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr)
{
        int i;

        i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3],
            ilbs->ilbs_rule_hash_size);
        DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i);
        mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule;
        if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL)
                ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule;
        rule->ir_hash_prev = NULL;
        ilbs->ilbs_g_hash[i].ilb_hash_rule = rule;

        rule->ir_hash = &ilbs->ilbs_g_hash[i];
        mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
}

/*
 * Remove a rule from the rule hash table.  Note that the rule is not freed
 * in this routine.
 */
static void
ilb_rule_hash_del(ilb_rule_t *rule)
{
        mutex_enter(&rule->ir_hash->ilb_hash_lock);
        if (rule->ir_hash->ilb_hash_rule == rule) {
                rule->ir_hash->ilb_hash_rule = rule->ir_hash_next;
                if (rule->ir_hash_next != NULL)
                        rule->ir_hash_next->ir_hash_prev = NULL;
        } else {
                if (rule->ir_hash_prev != NULL)
                        rule->ir_hash_prev->ir_hash_next =
                            rule->ir_hash_next;
                if (rule->ir_hash_next != NULL) {
                        rule->ir_hash_next->ir_hash_prev =
                            rule->ir_hash_prev;
                }
        }
        mutex_exit(&rule->ir_hash->ilb_hash_lock);

        rule->ir_hash_next = NULL;
        rule->ir_hash_prev = NULL;
        rule->ir_hash = NULL;
}

/*
 * Given the info of a packet, look for a match in the rule hash table.
 */
static ilb_rule_t *
ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
    in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
{
        int i;
        ilb_rule_t *rule;
        ipaddr_t v4_addr;

        *busy = B_FALSE;
        IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
        i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
        port = ntohs(port);

        mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
            rule = rule->ir_hash_next) {
                if (!rule->ir_port_range) {
                        if (rule->ir_min_port != port)
                                continue;
                } else {
                        if (port < rule->ir_min_port ||
                            port > rule->ir_max_port) {
                                continue;
                        }
                }
                if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
                    rule->ir_zoneid != zoneid) {
                        continue;
                }

                if (l3 == IPPROTO_IP) {
                        if (rule->ir_target_v4 != INADDR_ANY &&
                            rule->ir_target_v4 != v4_addr) {
                                continue;
                        }
                } else {
                        if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
                            !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
                                continue;
                        }
                }

                /*
                 * Just update the stats if the rule is disabled.
                 */
                mutex_enter(&rule->ir_lock);
                if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
                        ILB_R_KSTAT(rule, pkt_not_processed);
                        ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
                        mutex_exit(&rule->ir_lock);
                        rule = NULL;
                        break;
                } else if (rule->ir_flags & ILB_RULE_BUSY) {
                        /*
                         * If we are busy...
                         *
                         * XXX we should have a queue to postpone the
                         * packet processing.  But this requires a
                         * mechanism in IP to re-start the packet
                         * processing.  So for now, just drop the packet.
                         */
                        ILB_R_KSTAT(rule, pkt_dropped);
                        ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
                        mutex_exit(&rule->ir_lock);
                        *busy = B_TRUE;
                        rule = NULL;
                        break;
                } else {
                        rule->ir_refcnt++;
                        ASSERT(rule->ir_refcnt != 1);
                        mutex_exit(&rule->ir_lock);
                        break;
                }
        }
        mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        return (rule);
}

/*
 * Add a rule to the global rule list.  This list is for finding all rules
 * in an IP stack.  The caller is assumed to hold the ilbs_g_lock.
 */
static void
ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
        ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
        rule->ir_next = ilbs->ilbs_rule_head;
        ilbs->ilbs_rule_head = rule;
        ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
}

/* The caller is assumed to hold the ilbs_g_lock. */
static void
ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
        ilb_rule_t *tmp_rule;
        ilb_rule_t *prev_rule;

        ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
        prev_rule = NULL;
        for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
            prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) {
                if (tmp_rule == rule)
                        break;
        }
        if (tmp_rule == NULL) {
                mutex_exit(&ilbs->ilbs_g_lock);
                return;
        }
        if (prev_rule == NULL)
                ilbs->ilbs_rule_head = tmp_rule->ir_next;
        else
                prev_rule->ir_next = tmp_rule->ir_next;
        ILB_KSTAT_UPDATE(ilbs, num_rules, -1);
}

/*
 * Helper routine to calculate how many source addresses are in a given
 * range.
 */
static int64_t
num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2)
{
        int64_t ret;
        uint32_t addr1, addr2;

        /*
         * Here we assume that the number of NAT source addresses cannot
         * be so large that the most significant 2 s6_addr32 fields of
         * the start and end addresses could differ; they must be equal.
         */
        addr1 = ntohl(a1->s6_addr32[3]);
        addr2 = ntohl(a2->s6_addr32[3]);
        if (a1->s6_addr32[0] != a2->s6_addr32[0] ||
            a1->s6_addr32[1] != a2->s6_addr32[1] ||
            a1->s6_addr32[2] > a2->s6_addr32[2] ||
            (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) {
                return (-1);
        }
        if (a1->s6_addr32[2] == a2->s6_addr32[2]) {
                return (addr2 - addr1 + 1);
        } else {
                ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2]));
                ret <<= 32;
                /* The low 32 bits count up from addr1 to addr2. */
                ret = ret + addr2 - addr1;
                return (ret + 1);
        }
}
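
/*
 * For example, with a start address ending in 10.0.0.1 and an end
 * address ending in 10.0.0.10 (all other s6_addr32 fields equal),
 * num_nat_src_v6() returns 10; with the start and end swapped it
 * returns -1.
 */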

/*
 * Add an ILB rule.
 */
int
ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd)
{
        ilb_rule_t *rule;
        netstackid_t stackid;
        int ret;
        in_port_t min_port, max_port;
        int64_t num_src;

        /* Sanity checks. */
        if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6)
                return (EINVAL);

        /* Need to support SCTP... */
        if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP)
                return (EINVAL);

        /* For full NAT, the NAT source must be supplied. */
        if (cmd->topo == ILB_TOPO_IMPL_NAT) {
                if (IS_ADDR_UNSPEC(&cmd->nat_src_start) ||
                    IS_ADDR_UNSPEC(&cmd->nat_src_end)) {
                        return (EINVAL);
                }
        }

        /* Check invalid mask */
        if ((cmd->flags & ILB_RULE_STICKY) &&
            IS_ADDR_UNSPEC(&cmd->sticky_mask)) {
                return (EINVAL);
        }

        /* Port is passed in network byte order. */
        min_port = ntohs(cmd->min_port);
        max_port = ntohs(cmd->max_port);
        if (min_port > max_port)
                return (EINVAL);

        /* min_port == 0 means "all ports". Make it so */
        if (min_port == 0) {
                min_port = 1;
                max_port = 65535;
        }

        /* Funny address checking. */
        if (cmd->ip_ver == IPPROTO_IP) {
                in_addr_t v4_addr1, v4_addr2;

                v4_addr1 = cmd->vip.s6_addr32[3];
                if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
                    CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST ||
                    v4_addr1 == INADDR_ANY ||
                    !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
                        return (EINVAL);
                }

                if (cmd->topo == ILB_TOPO_IMPL_NAT) {
                        v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]);
                        v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]);
                        if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
                            (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET ||
                            v4_addr1 == INADDR_BROADCAST ||
                            v4_addr2 == INADDR_BROADCAST ||
                            v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY ||
                            CLASSD(v4_addr1) || CLASSD(v4_addr2) ||
                            !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
                            !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
                                return (EINVAL);
                        }

                        num_src = v4_addr2 - v4_addr1 + 1;
                        if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC)
                                return (EINVAL);
                }
        } else {
                if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) ||
                    IN6_IS_ADDR_MULTICAST(&cmd->vip) ||
                    IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) ||
                    IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
                        return (EINVAL);
                }

                if (cmd->topo == ILB_TOPO_IMPL_NAT) {
                        if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) ||
                            IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) ||
                            IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) ||
                            IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) ||
                            IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) ||
                            IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) ||
                            IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
                            IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
                                return (EINVAL);
                        }

                        if ((num_src = num_nat_src_v6(&cmd->nat_src_start,
                            &cmd->nat_src_end)) < 0 ||
                            num_src > ILB_MAX_NAT_SRC) {
                                return (EINVAL);
                        }
                }
        }

        mutex_enter(&ilbs->ilbs_g_lock);
        if (ilbs->ilbs_g_hash == NULL)
                ilb_rule_hash_init(ilbs);
        if (ilbs->ilbs_c2s_conn_hash == NULL) {
                ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
                ilb_conn_hash_init(ilbs);
                ilb_nat_src_init(ilbs);
        }

        /* Make sure that the new rule does not duplicate an existing one. */
        if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto,
            min_port, max_port, &cmd->vip)) {
                mutex_exit(&ilbs->ilbs_g_lock);
                return (EEXIST);
        }

        rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP);
        if (rule == NULL) {
                mutex_exit(&ilbs->ilbs_g_lock);
                return (ENOMEM);
        }

        /* ir_name is all 0 to begin with */
        (void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1);

        rule->ir_ks_instance = atomic_inc_uint_nv(&ilb_kstat_instance);
        stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
        if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) {
                ret = ENOMEM;
                goto error;
        }

        if (cmd->topo == ILB_TOPO_IMPL_NAT) {
                rule->ir_nat_src_start = cmd->nat_src_start;
                rule->ir_nat_src_end = cmd->nat_src_end;
        }

        rule->ir_ipver = cmd->ip_ver;
        rule->ir_proto = cmd->proto;
        rule->ir_topo = cmd->topo;

        rule->ir_min_port = min_port;
        rule->ir_max_port = max_port;
        if (rule->ir_min_port != rule->ir_max_port)
                rule->ir_port_range = B_TRUE;
        else
                rule->ir_port_range = B_FALSE;

        rule->ir_zoneid = zoneid;

        rule->ir_target_v6 = cmd->vip;
        rule->ir_servers = NULL;

        /*
         * The default connection drain timeout is indefinite (value 0),
         * meaning we will wait for all connections to finish.  So we
         * can assign cmd->conn_drain_timeout to it directly.
         */
        rule->ir_conn_drain_timeout = cmd->conn_drain_timeout;
        if (cmd->nat_expiry != 0) {
                rule->ir_nat_expiry = cmd->nat_expiry;
        } else {
                switch (rule->ir_proto) {
                case IPPROTO_TCP:
                        rule->ir_nat_expiry = ilb_conn_tcp_expiry;
                        break;
                case IPPROTO_UDP:
                        rule->ir_nat_expiry = ilb_conn_udp_expiry;
                        break;
                default:
                        cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p",
                            (void *)rule);
                        break;
                }
        }
        if (cmd->sticky_expiry != 0)
                rule->ir_sticky_expiry = cmd->sticky_expiry;
        else
                rule->ir_sticky_expiry = ilb_sticky_expiry;

        if (cmd->flags & ILB_RULE_STICKY) {
                rule->ir_flags |= ILB_RULE_STICKY;
                rule->ir_sticky_mask = cmd->sticky_mask;
                if (ilbs->ilbs_sticky_hash == NULL)
                        ilb_sticky_hash_init(ilbs);
        }
        if (cmd->flags & ILB_RULE_ENABLED)
                rule->ir_flags |= ILB_RULE_ENABLED;

        mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL);

        rule->ir_refcnt = 1;

        switch (cmd->algo) {
        case ILB_ALG_IMPL_ROUNDROBIN:
                if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) {
                        ret = ENOMEM;
                        goto error;
                }
                rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN;
                break;
        case ILB_ALG_IMPL_HASH_IP:
        case ILB_ALG_IMPL_HASH_IP_SPORT:
        case ILB_ALG_IMPL_HASH_IP_VIP:
                if ((rule->ir_alg = ilb_alg_hash_init(rule,
                    &cmd->algo)) == NULL) {
                        ret = ENOMEM;
                        goto error;
                }
                rule->ir_alg_type = cmd->algo;
                break;
        default:
                ret = EINVAL;
                goto error;
        }

        /* Add it to the global list and hash array at the end. */
        ilb_rule_g_add(ilbs, rule);
        ilb_rule_hash_add(ilbs, rule, &cmd->vip);

        mutex_exit(&ilbs->ilbs_g_lock);

        return (0);

error:
        mutex_exit(&ilbs->ilbs_g_lock);
        if (rule->ir_ksp != NULL) {
                /* stackid must be initialized if ir_ksp != NULL */
                kstat_delete_netstack(rule->ir_ksp, stackid);
        }
        kmem_free(rule, sizeof (ilb_rule_t));
        return (ret);
}

/*
 * The final part in deleting a rule.  Either called directly or from the
 * dispatched taskq.
 */
static void
ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule)
{
        netstackid_t stackid;
        ilb_server_t *server;

        stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;

        /*
         * Let the algorithm know that the rule is going away.  The
         * algorithm fini routine will free all its resources with this
         * rule.
         */
        tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg);

        while ((server = tmp_rule->ir_servers) != NULL) {
                mutex_enter(&server->iser_lock);
                ilb_destroy_nat_src(&server->iser_nat_src);
                if (tmp_rule->ir_conn_drain_timeout != 0) {
                        /*
                         * The garbage collection thread checks this value
                         * without grabbing a lock.  So we need to use
                         * atomic_swap_64() to make sure that the value seen
                         * by the gc thread is intact.
                         */
                        (void) atomic_swap_64(
                            (uint64_t *)&server->iser_die_time,
                            ddi_get_lbolt64() +
                            SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout));
                }
                while (server->iser_refcnt > 1)
                        cv_wait(&server->iser_cv, &server->iser_lock);
                tmp_rule->ir_servers = server->iser_next;
                kstat_delete_netstack(server->iser_ksp, stackid);
                kmem_free(server, sizeof (ilb_server_t));
        }

        ASSERT(tmp_rule->ir_ksp != NULL);
        kstat_delete_netstack(tmp_rule->ir_ksp, stackid);

        kmem_free(tmp_rule, sizeof (ilb_rule_t));
}

/* The routine executed by the delayed rule taskq. */
static void
ilb_rule_del_tq(void *arg)
{
        ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
        ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;

        mutex_enter(&rule->ir_lock);
        while (rule->ir_refcnt > 1)
                cv_wait(&rule->ir_cv, &rule->ir_lock);
        ilb_rule_del_common(ilbs, rule);
        kmem_free(arg, sizeof (ilb_rule_tq_t));
}

/* Routine to delete a rule. */
int
ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
{
        ilb_rule_t *tmp_rule;
        ilb_rule_tq_t *arg;
        int err;

        mutex_enter(&ilbs->ilbs_g_lock);
        if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name,
            &err)) == NULL) {
                mutex_exit(&ilbs->ilbs_g_lock);
                return (err);
        }

        /*
         * First remove the rule from the hash array and the global list so
         * that no one can find this rule any more.
         */
        ilb_rule_hash_del(tmp_rule);
        ilb_rule_g_del(ilbs, tmp_rule);
        mutex_exit(&ilbs->ilbs_g_lock);
        ILB_RULE_REFRELE(tmp_rule);

        /*
         * Now that no one can find this rule, we can remove it once all
         * references to it and to its list of servers are dropped.  So
         * dispatch a task to finish the deletion.  We do this instead of
         * letting the last reference holder do it, since that holder may
         * be an interrupt thread whose work we want to minimize.  Rule
         * deletion is not a critical task, so it can be delayed.
         */
        arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
        arg->ilbs = ilbs;
        arg->rule = tmp_rule;
        (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
            TQ_SLEEP);

        return (0);
}

/*
 * Given an IP address, check to see if there is a rule using this
 * as the VIP.  It can be used to check if we need to drop a fragment.
 */
boolean_t
ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule)
{
        int i;
        ilb_rule_t *rule;
        boolean_t ret = B_FALSE;

        i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3],
            ilbs->ilbs_rule_hash_size);
        mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
            rule = rule->ir_hash_next) {
                if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) {
                        mutex_enter(&rule->ir_lock);
                        if (rule->ir_flags & ILB_RULE_BUSY) {
                                mutex_exit(&rule->ir_lock);
                                break;
                        }
                        if (ret_rule != NULL) {
                                rule->ir_refcnt++;
                                mutex_exit(&rule->ir_lock);
                                *ret_rule = rule;
                        } else {
                                mutex_exit(&rule->ir_lock);
                        }
                        ret = B_TRUE;
                        break;
                }
        }
        mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        return (ret);
}

boolean_t
ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule)
{
        int i;
        ilb_rule_t *rule;
        boolean_t ret = B_FALSE;

        i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size);
        mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
            rule = rule->ir_hash_next) {
                if (rule->ir_target_v6.s6_addr32[3] == addr) {
                        mutex_enter(&rule->ir_lock);
                        if (rule->ir_flags & ILB_RULE_BUSY) {
                                mutex_exit(&rule->ir_lock);
                                break;
                        }
                        if (ret_rule != NULL) {
                                rule->ir_refcnt++;
                                mutex_exit(&rule->ir_lock);
                                *ret_rule = rule;
                        } else {
                                mutex_exit(&rule->ir_lock);
                        }
                        ret = B_TRUE;
                        break;
                }
        }
        mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        return (ret);
}
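
/*
 * Illustrative caller (an assumption for exposition, not code from this
 * module): the IP input path can use the above to decide whether an
 * incoming fragment is destined to a load balanced VIP, e.g.
 *
 *      if (ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, NULL))
 *              ... count and drop the fragment ...
 */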

static ilb_rule_t *
ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
        ilb_rule_t *tmp_rule;

        ASSERT(mutex_owned(&ilbs->ilbs_g_lock));

        for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
            tmp_rule = tmp_rule->ir_next) {
                if (tmp_rule->ir_zoneid != zoneid)
                        continue;
                if (strcasecmp(tmp_rule->ir_name, name) == 0) {
                        mutex_enter(&tmp_rule->ir_lock);
                        if (tmp_rule->ir_flags & ILB_RULE_BUSY) {
                                mutex_exit(&tmp_rule->ir_lock);
                                *err = EINPROGRESS;
                                return (NULL);
                        }
                        tmp_rule->ir_refcnt++;
                        mutex_exit(&tmp_rule->ir_lock);
                        *err = 0;
                        return (tmp_rule);
                }
        }
        *err = ENOENT;
        return (NULL);
}

/* To find a rule with a given name and zone in the global rule list. */
ilb_rule_t *
ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
        ilb_rule_t *tmp_rule;

        mutex_enter(&ilbs->ilbs_g_lock);
        tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err);
        mutex_exit(&ilbs->ilbs_g_lock);
        return (tmp_rule);
}

/* Try to match the given packet info and zone ID with a rule. */
static boolean_t
ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3,
    int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr)
{
        ilb_rule_t *tmp_rule;

        ASSERT(mutex_owned(&ilbs->ilbs_g_lock));

        for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
            tmp_rule = tmp_rule->ir_next) {
                if (tmp_rule->ir_zoneid != zoneid)
                        continue;

                /*
                 * We don't allow the same name in different rules even if all
                 * the other rule components are different.
                 */
                if (strcasecmp(tmp_rule->ir_name, name) == 0)
                        return (B_TRUE);

                if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4)
                        continue;

                /*
                 * ir_min_port and ir_max_port are the same if ir_port_range
                 * is false.  In this case, if the ir_min|max_port (same) is
                 * outside of the given port range, it is OK.  In other cases,
                 * check if min and max port are outside a rule's range.
                 */
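                /*
                 * For example, an existing rule on port 80 only
                 * (min == max == 80) does not conflict with a new rule
                 * on ports 81-90, but does conflict with one on
                 * 1-65535.
                 */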
                if (tmp_rule->ir_max_port < min_port ||
                    tmp_rule->ir_min_port > max_port) {
                        continue;
                }

                /*
                 * If l3 is IPv4, the addr passed in is assumed to be a
                 * mapped address.
                 */
                if (V6_OR_V4_INADDR_ANY(*addr) ||
                    V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) ||
                    IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) {
                        return (B_TRUE);
                }
        }
        return (B_FALSE);
}

int
ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid,
    const char *rule_name, ilb_rule_t *in_rule)
{
        ilb_rule_t *rule;
        int err;

        ASSERT((in_rule == NULL && rule_name != NULL) ||
            (in_rule != NULL && rule_name == NULL));
        if ((rule = in_rule) == NULL) {
                if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
                    &err)) == NULL) {
                        return (err);
                }
        }
        mutex_enter(&rule->ir_lock);
        rule->ir_flags |= ILB_RULE_ENABLED;
        mutex_exit(&rule->ir_lock);

        /* Only refrele if we looked up the rule ourselves. */
        if (in_rule == NULL)
                ILB_RULE_REFRELE(rule);
        return (0);
}

int
ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid,
    const char *rule_name, ilb_rule_t *in_rule)
{
        ilb_rule_t *rule;
        int err;

        ASSERT((in_rule == NULL && rule_name != NULL) ||
            (in_rule != NULL && rule_name == NULL));
        if ((rule = in_rule) == NULL) {
                if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
                    &err)) == NULL) {
                        return (err);
                }
        }
        mutex_enter(&rule->ir_lock);
        rule->ir_flags &= ~ILB_RULE_ENABLED;
        mutex_exit(&rule->ir_lock);

        /* Only refrele if we looked up the rule ourselves. */
        if (in_rule == NULL)
                ILB_RULE_REFRELE(rule);
        return (0);
}

/*
 * XXX We should probably have a walker function to walk all rules.  For
 * now, just add a simple loop for enable/disable/del.
 */
void
ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
        ilb_rule_t *rule;

        mutex_enter(&ilbs->ilbs_g_lock);
        for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
                if (rule->ir_zoneid != zoneid)
                        continue;
                /*
                 * No need to hold the rule as we are holding the global
                 * lock so it won't go away.  Ignore the return value here
                 * as the rule is provided so the call cannot fail.
                 */
                (void) ilb_rule_enable(ilbs, zoneid, NULL, rule);
        }
        mutex_exit(&ilbs->ilbs_g_lock);
}

void
ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
        ilb_rule_t *rule;

        mutex_enter(&ilbs->ilbs_g_lock);
        for (rule = ilbs->ilbs_rule_head; rule != NULL;
            rule = rule->ir_next) {
                if (rule->ir_zoneid != zoneid)
                        continue;
                (void) ilb_rule_disable(ilbs, zoneid, NULL, rule);
        }
        mutex_exit(&ilbs->ilbs_g_lock);
}

void
ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
        ilb_rule_t *rule;
        ilb_rule_tq_t *arg;

        mutex_enter(&ilbs->ilbs_g_lock);
        rule = ilbs->ilbs_rule_head;
        while (rule != NULL) {
                /* Skip rules belonging to other zones. */
                if (rule->ir_zoneid != zoneid) {
                        rule = rule->ir_next;
                        continue;
                }
                ilb_rule_hash_del(rule);
                ilb_rule_g_del(ilbs, rule);
                mutex_exit(&ilbs->ilbs_g_lock);

                arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
                arg->ilbs = ilbs;
                arg->rule = rule;
                (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
                    arg, TQ_SLEEP);

                /*
                 * The list may have changed while the global lock was
                 * dropped, so restart the walk from the head.
                 */
                mutex_enter(&ilbs->ilbs_g_lock);
                rule = ilbs->ilbs_rule_head;
        }
        mutex_exit(&ilbs->ilbs_g_lock);
}

/*
 * This is just an optimization, so don't grab the global lock.  The
 * worst case is that we miss a couple of packets.
 */
boolean_t
ilb_has_rules(ilb_stack_t *ilbs)
{
        return (ilbs->ilbs_rule_head != NULL);
}

static int
ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable)
{
        ilb_server_t *tmp_server;
        int ret;

        ASSERT((rule == NULL && rule_name != NULL) ||
            (rule != NULL && rule_name == NULL));

        if (rule == NULL) {
                if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
                    &ret)) == NULL) {
                        return (ret);
                }
        }

        /* Once we get a hold on the rule, no server can be added/deleted. */
        for (tmp_server = rule->ir_servers; tmp_server != NULL;
            tmp_server = tmp_server->iser_next) {
                if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr))
                        break;
        }
        if (tmp_server == NULL) {
                ret = ENOENT;
                goto done;
        }

        if (enable) {
                ret = rule->ir_alg->ilb_alg_server_enable(tmp_server,
                    rule->ir_alg->ilb_alg_data);
                if (ret == 0) {
                        tmp_server->iser_enabled = B_TRUE;
                        tmp_server->iser_die_time = 0;
                }
        } else {
                ret = rule->ir_alg->ilb_alg_server_disable(tmp_server,
                    rule->ir_alg->ilb_alg_data);
                if (ret == 0) {
                        tmp_server->iser_enabled = B_FALSE;
                        if (rule->ir_conn_drain_timeout != 0) {
                                (void) atomic_swap_64(
                                    (uint64_t *)&tmp_server->iser_die_time,
                                    ddi_get_lbolt64() + SEC_TO_TICK(
                                    rule->ir_conn_drain_timeout));
                        }
                }
        }

done:
        if (rule_name != NULL)
                ILB_RULE_REFRELE(rule);
        return (ret);
}

int
ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
        return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE));
}

int
ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
        return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE));
}

/*
 * Add a back end server to a rule.  If the address is IPv4, it is assumed
 * to be passed in as a mapped address.
 */
int
ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info)
{
        ilb_server_t    *server;
        netstackid_t    stackid;
        int             ret = 0;
        in_port_t       min_port, max_port;
        in_port_t       range;

        /* Port is passed in network byte order. */
        min_port = ntohs(info->min_port);
        max_port = ntohs(info->max_port);
        if (min_port > max_port)
                return (EINVAL);

        /* min_port == 0 means "all ports". Make it so */
        if (min_port == 0) {
                min_port = 1;
                max_port = 65535;
        }
        range = max_port - min_port;

        mutex_enter(&rule->ir_lock);
        /* If someone is already doing server add/del, sleep and wait. */
        while (rule->ir_flags & ILB_RULE_BUSY) {
                if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
                        mutex_exit(&rule->ir_lock);
                        return (EINTR);
                }
        }

        /*
         * Set the rule to be busy to make sure that no new packet can
         * use this rule.
         */
        rule->ir_flags |= ILB_RULE_BUSY;

        /* Now wait for all other guys to finish their work. */
        while (rule->ir_refcnt > 2) {
                if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
                        mutex_exit(&rule->ir_lock);
                        ret = EINTR;
                        goto end;
                }
        }
        mutex_exit(&rule->ir_lock);

        /* Sanity checks... */
        if ((IN6_IS_ADDR_V4MAPPED(&info->addr) &&
            rule->ir_ipver != IPPROTO_IP) ||
            (!IN6_IS_ADDR_V4MAPPED(&info->addr) &&
            rule->ir_ipver != IPPROTO_IPV6)) {
                ret = EINVAL;
                goto end;
        }

        /*
         * Check for valid port range.
         *
         * For DSR, there can be no port shifting.  Hence the server
         * specification must be the same as the rule's.
         *
         * For half-NAT/NAT, the range must either be 0 (port collapsing)
         * or it must be equal to the rule's port range.
         */
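        /*
         * For example (illustrative values): with a rule port range of
         * 5000-5009, a NAT back end server may use 6000-6009 (equal
         * span, port shifting) or a single port (port collapsing),
         * while a DSR server must use exactly 5000-5009.
         */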
1276         if (rule->ir_topo == ILB_TOPO_IMPL_DSR) {
1277                 if (rule->ir_max_port != max_port ||
1278                     rule->ir_min_port != min_port) {
1279                         ret = EINVAL;
1280                         goto end;
1281                 }
1282         } else {
1283                 if ((range != rule->ir_max_port - rule->ir_min_port) &&
1284                     range != 0) {
1285                         ret = EINVAL;
1286                         goto end;
1287                 }
1288         }
1289 
1290         /* Check for duplicate. */
1291         for (server = rule->ir_servers; server != NULL;
1292             server = server->iser_next) {
1293                 if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) ||
1294                     strcasecmp(server->iser_name, info->name) == 0) {
1295                         break;
1296                 }
1297         }
1298         if (server != NULL) {
1299                 ret = EEXIST;
1300                 goto end;
1301         }
1302 
1303         if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) {
1304                 ret = ENOMEM;
1305                 goto end;
1306         }
1307 
1308         (void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1);
1309         (void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr,
1310             sizeof (server->iser_ip_addr));
1311         stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
1312         server->iser_ksp = ilb_server_kstat_init(stackid, rule, server);
1313         if (server->iser_ksp == NULL) {
1314                 kmem_free(server, sizeof (ilb_server_t));
1315                 ret = EINVAL;
1316                 goto end;
1317         }
1318 
1319         server->iser_stackid = stackid;
1320         server->iser_addr_v6 = info->addr;
1321         server->iser_min_port = min_port;
1322         server->iser_max_port = max_port;
1323         if (min_port != max_port)
1324                 server->iser_port_range = B_TRUE;
1325         else
1326                 server->iser_port_range = B_FALSE;
1327 
1328         /*
1329          * If the rule uses NAT, find/create the NAT source entry to use
1330          * for this server.
1331          */
1332         if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
1333                 in_port_t port;
1334 
1335                 /*
1336                  * If the server uses a port range, our port allocation
1337                  * scheme needs to treat it as a wildcard.  Refer to the
1338                  * comments in ilb_nat.c about the scheme.
1339                  */
1340                 if (server->iser_port_range)
1341                         port = 0;
1342                 else
1343                         port = server->iser_min_port;
1344 
1345                 if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src,
1346                     &server->iser_addr_v6, port, &rule->ir_nat_src_start,
1347                     num_nat_src_v6(&rule->ir_nat_src_start,
1348                     &rule->ir_nat_src_end))) != 0) {
1349                         kstat_delete_netstack(server->iser_ksp, stackid);
1350                         kmem_free(server, sizeof (ilb_server_t));
1351                         goto end;
1352                 }
1353         }
1354 
1355         /*
1356          * The iser_lock is only used to protect iser_refcnt.  All the other
1357          * fields in ilb_server_t should not change, except for iser_enabled.
1358          * The worst thing that can happen if iser_enabled is messed up is
1359          * that one or two packets may not be load balanced to a server
1360          * correctly.
1361          */
1362         server->iser_refcnt = 1;
1363         server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE :
1364             B_FALSE;
1365         mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL);
1366         cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL);
1367 
1368         /* Let the load balancing algorithm know about the addition. */
1369         ASSERT(rule->ir_alg != NULL);
1370         if ((ret = rule->ir_alg->ilb_alg_server_add(server,
1371             rule->ir_alg->ilb_alg_data)) != 0) {
1372                 kstat_delete_netstack(server->iser_ksp, stackid);
1373                 kmem_free(server, sizeof (ilb_server_t));
1374                 goto end;
1375         }
1376 
1377         /*
1378          * No need to hold ir_lock since no other thread should manipulate
1379          * the following fields until ILB_RULE_BUSY is cleared.
1380          */
1381         if (rule->ir_servers == NULL) {
1382                 server->iser_next = NULL;
1383         } else {
1384                 server->iser_next = rule->ir_servers;
1385         }
1386         rule->ir_servers = server;
1387         ILB_R_KSTAT(rule, num_servers);
1388 
1389 end:
1390         mutex_enter(&rule->ir_lock);
1391         rule->ir_flags &= ~ILB_RULE_BUSY;
1392         cv_signal(&rule->ir_cv);
1393         mutex_exit(&rule->ir_lock);
1394         return (ret);
1395 }
1396 
1397 /* The routine executed by the delayed rule processing taskq. */
1398 static void
1399 ilb_server_del_tq(void *arg)
1400 {
1401         ilb_server_t *server = (ilb_server_t *)arg;
1402 
1403         mutex_enter(&server->iser_lock);
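        /*
         * Wait until the reference passed to this taskq is the only
         * one left before tearing the server down.
         */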
        while (server->iser_refcnt > 1)
                cv_wait(&server->iser_cv, &server->iser_lock);
        mutex_exit(&server->iser_lock);
        kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
        kmem_free(server, sizeof (ilb_server_t));
1408 }
1409 
1410 /*
1411  * Delete a back end server from a rule.  If the address is IPv4, it is assumed
1412  * to be passed in as a mapped address.
1413  */
1414 int
1415 ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
1416     ilb_rule_t *rule, in6_addr_t *addr)
1417 {
1418         ilb_server_t    *server;
1419         ilb_server_t    *prev_server;
1420         int             ret = 0;
1421 
1422         ASSERT((rule == NULL && rule_name != NULL) ||
1423             (rule != NULL && rule_name == NULL));
1424         if (rule == NULL) {
1425                 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1426                     &ret)) == NULL) {
1427                         return (ret);
1428                 }
1429         }
1430 
1431         mutex_enter(&rule->ir_lock);
        /* If someone is already doing a server add/del, sleep and wait. */
1433         while (rule->ir_flags & ILB_RULE_BUSY) {
1434                 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1435                         if (rule_name != NULL) {
1436                                 if (--rule->ir_refcnt <= 2)
1437                                         cv_signal(&rule->ir_cv);
1438                         }
1439                         mutex_exit(&rule->ir_lock);
1440                         return (EINTR);
1441                 }
1442         }
1443         /*
1444          * Set the rule to be busy to make sure that no new packet can
1445          * use this rule.
1446          */
1447         rule->ir_flags |= ILB_RULE_BUSY;
1448 
        /*
         * Now wait for all other users of the rule to finish; the two
         * remaining references are held by the rule list and by us.
         */
1450         while (rule->ir_refcnt > 2) {
1451                 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1452                         mutex_exit(&rule->ir_lock);
1453                         ret = EINTR;
1454                         goto end;
1455                 }
1456         }
1457         mutex_exit(&rule->ir_lock);
1458 
1459         prev_server = NULL;
1460         for (server = rule->ir_servers; server != NULL;
1461             prev_server = server, server = server->iser_next) {
1462                 if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr))
1463                         break;
1464         }
1465         if (server == NULL) {
1466                 ret = ENOENT;
1467                 goto end;
1468         }
1469 
1470         /*
1471          * Let the load balancing algorithm know about the removal.
1472          * The algorithm may disallow the removal...
1473          */
1474         if ((ret = rule->ir_alg->ilb_alg_server_del(server,
1475             rule->ir_alg->ilb_alg_data)) != 0) {
1476                 goto end;
1477         }
1478 
1479         if (prev_server == NULL)
1480                 rule->ir_servers = server->iser_next;
1481         else
1482                 prev_server->iser_next = server->iser_next;
1483 
1484         ILB_R_KSTAT_UPDATE(rule, num_servers, -1);
1485 
1486         /*
         * Mark the server as disabled so that any sticky cache entry
         * still referencing this server won't be used.
1489          */
1490         server->iser_enabled = B_FALSE;
1491 
1492         mutex_enter(&server->iser_lock);
1493 
1494         /*
         * De-allocate the NAT source array.  The individual ilb_nat_src_entry_t
1496          * may not go away if there is still a conn using it.  The NAT source
1497          * timer will do the garbage collection.
1498          */
1499         ilb_destroy_nat_src(&server->iser_nat_src);
1500 
1501         /* If there is a hard limit on when a server should die, set it. */
1502         if (rule->ir_conn_drain_timeout != 0) {
1503                 (void) atomic_swap_64((uint64_t *)&server->iser_die_time,
1504                     ddi_get_lbolt64() +
1505                     SEC_TO_TICK(rule->ir_conn_drain_timeout));
1506         }
1507 
1508         if (server->iser_refcnt > 1) {
1509                 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq,
1510                     server, TQ_SLEEP);
1511                 mutex_exit(&server->iser_lock);
1512         } else {
1513                 kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
1514                 kmem_free(server, sizeof (ilb_server_t));
1515         }
1516 
1517 end:
1518         mutex_enter(&rule->ir_lock);
1519         rule->ir_flags &= ~ILB_RULE_BUSY;
1520         if (rule_name != NULL)
1521                 rule->ir_refcnt--;
1522         cv_signal(&rule->ir_cv);
1523         mutex_exit(&rule->ir_lock);
1524         return (ret);
1525 }
1526 
1527 /*
1528  * First check if the destination of the ICMP message matches a VIP of
1529  * a rule.  If it does not, just return ILB_PASSED.
1530  *
1531  * If the destination matches a VIP:
1532  *
1533  * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end
1534  * server.
1535  *
 * For ICMP_DEST_UNREACHABLE with code fragmentation needed, look inside
 * the payload to determine which back end server this message should be
 * sent to, and NAT both the payload message and the outer IP packet.
1539  *
1540  * For other ICMP messages, drop them.
1541  */
1542 /* ARGSUSED */
1543 static int
1544 ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
1545     icmph_t *icmph, ipaddr_t *lb_dst)
1546 {
1547         ipaddr_t vip;
1548         ilb_rule_t *rule;
1549         in6_addr_t addr6;
1550 
1551         if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
1552                 return (ILB_PASSED);
1553 
1555         if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
1556                 ILB_R_KSTAT(rule, icmp_dropped);
1557                 ILB_RULE_REFRELE(rule);
1558                 return (ILB_DROPPED);
1559         }
1560 
1561         switch (icmph->icmph_type) {
1562         case ICMP_ECHO_REQUEST:
1563                 ILB_R_KSTAT(rule, icmp_echo_processed);
1564                 ILB_RULE_REFRELE(rule);
1565 
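                /*
                 * Turn the echo request into an echo reply in place:
                 * recompute the checksum (the field is zeroed first so
                 * the stale value is not folded into the new one),
                 * refresh the TTL and swap the addresses.
                 */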
1566                 icmph->icmph_type = ICMP_ECHO_REPLY;
1567                 icmph->icmph_checksum = 0;
1568                 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
1569                 ipha->ipha_ttl =
1570                     ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
1571                 *lb_dst = ipha->ipha_src;
1572                 vip = ipha->ipha_dst;
1573                 ipha->ipha_dst = ipha->ipha_src;
1574                 ipha->ipha_src = vip;
1575                 return (ILB_BALANCED);
1576         case ICMP_DEST_UNREACHABLE: {
1577                 int ret;
1578 
1579                 if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
1580                         ILB_R_KSTAT(rule, icmp_dropped);
1581                         ILB_RULE_REFRELE(rule);
1582                         return (ILB_DROPPED);
1583                 }
1584                 if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
1585                     &addr6)) {
1586                         ILB_R_KSTAT(rule, icmp_2big_processed);
1587                         ret = ILB_BALANCED;
1588                 } else {
1589                         ILB_R_KSTAT(rule, icmp_2big_dropped);
1590                         ret = ILB_DROPPED;
1591                 }
1592                 ILB_RULE_REFRELE(rule);
1593                 IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
1594                 return (ret);
1595         }
1596         default:
1597                 ILB_R_KSTAT(rule, icmp_dropped);
1598                 ILB_RULE_REFRELE(rule);
1599                 return (ILB_DROPPED);
1600         }
1601 }
1602 
1603 /* ARGSUSED */
1604 static int
1605 ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
1606     icmp6_t *icmp6, in6_addr_t *lb_dst)
1607 {
1608         ilb_rule_t *rule;
1609 
1610         if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
1611                 return (ILB_PASSED);
1612 
1613         if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
1614                 ILB_R_KSTAT(rule, icmp_dropped);
1615                 ILB_RULE_REFRELE(rule);
1616                 return (ILB_DROPPED);
1617         }
1618 
1619         switch (icmp6->icmp6_type) {
1620         case ICMP6_ECHO_REQUEST: {
1621                 int hdr_len;
1622 
1623                 ILB_R_KSTAT(rule, icmp_echo_processed);
1624                 ILB_RULE_REFRELE(rule);
1625 
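                /*
                 * Reply in place as in the v4 case.  The checksum field
                 * is seeded with the payload length so that IP_CSUM()
                 * folds it in together with the pseudo header sum from
                 * ilb_pseudo_sum_v6().
                 */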
1626                 icmp6->icmp6_type = ICMP6_ECHO_REPLY;
1627                 icmp6->icmp6_cksum = ip6h->ip6_plen;
1628                 hdr_len = (char *)icmp6 - (char *)ip6h;
1629                 icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
1630                     ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
1631                 ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
1632                 ip6h->ip6_hops =
1633                     ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
1634                 *lb_dst = ip6h->ip6_src;
1635                 ip6h->ip6_src = ip6h->ip6_dst;
1636                 ip6h->ip6_dst = *lb_dst;
1637                 return (ILB_BALANCED);
1638         }
1639         case ICMP6_PACKET_TOO_BIG: {
1640                 int ret;
1641 
1642                 if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
1643                     lb_dst)) {
1644                         ILB_R_KSTAT(rule, icmp_2big_processed);
1645                         ret = ILB_BALANCED;
1646                 } else {
1647                         ILB_R_KSTAT(rule, icmp_2big_dropped);
1648                         ret = ILB_DROPPED;
1649                 }
1650                 ILB_RULE_REFRELE(rule);
1651                 return (ret);
1652         }
1653         default:
1654                 ILB_R_KSTAT(rule, icmp_dropped);
1655                 ILB_RULE_REFRELE(rule);
1656                 return (ILB_DROPPED);
1657         }
1658 }
1659 
1660 /*
1661  * Common routine to check an incoming packet and decide what to do with it.
 * Called by ilb_check_v4|v6().
1663  */
1664 static int
1665 ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
1666     in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len,
1667     in6_addr_t *lb_dst)
1668 {
1669         in_port_t               sport, dport;
1670         tcpha_t                 *tcph;
1671         udpha_t                 *udph;
1672         ilb_rule_t              *rule;
1673         ilb_server_t            *server;
1674         boolean_t               balanced;
1675         struct ilb_sticky_s     *s = NULL;
1676         int                     ret;
1677         uint32_t                ip_sum, tp_sum;
1678         ilb_nat_info_t          info;
1679         uint16_t                nat_src_idx;
1680         boolean_t               busy;
1681 
1682         /*
         * We don't really need to switch here since both protocols' ports
         * are at the same offset.  Just prepare for future protocol
1685          * specific processing.
1686          */
1687         switch (l4) {
1688         case IPPROTO_TCP:
1689                 if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
1690                         return (ILB_DROPPED);
1691                 tcph = (tcpha_t *)tph;
1692                 sport = tcph->tha_lport;
1693                 dport = tcph->tha_fport;
1694                 break;
1695         case IPPROTO_UDP:
1696                 if (tph + sizeof (udpha_t) > mp->b_wptr)
1697                         return (ILB_DROPPED);
1698                 udph = (udpha_t *)tph;
1699                 sport = udph->uha_src_port;
1700                 dport = udph->uha_dst_port;
1701                 break;
1702         default:
1703                 return (ILB_PASSED);
1704         }
1705 
1706         /* Fast path, there is an existing conn. */
1707         if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
1708             pkt_len, lb_dst)) {
1709                 return (ILB_BALANCED);
1710         }
1711 
1712         /*
1713          * If there is no existing connection for the incoming packet, check
1714          * to see if the packet matches a rule.  If not, just let IP decide
1715          * what to do with it.
1716          *
         * Note: a reply from a back end server should not match a rule;
         * it should match an existing conn.
1719          */
1720         rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
1721             pkt_len, &busy);
1722         if (rule == NULL) {
1723                 /* If the rule is busy, just drop the packet. */
1724                 if (busy)
1725                         return (ILB_DROPPED);
1726                 else
1727                         return (ILB_PASSED);
1728         }
1729 
1730         /*
         * The packet matches a rule; use the rule's load balancing
         * algorithm to find a server.
1733          */
1734         balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
1735             rule->ir_alg->ilb_alg_data, &server);
1736         /*
1737          * This can only happen if there is no server in a rule or all
1738          * the servers are currently disabled.
1739          */
1740         if (!balanced)
1741                 goto no_server;
1742 
1743         /*
1744          * If the rule is sticky enabled, we need to check the sticky table.
1745          * If there is a sticky entry for the client, use the previous server
1746          * instead of the one found above (note that both can be the same).
1747          * If there is no entry for that client, add an entry to the sticky
1748          * table.  Both the find and add are done in ilb_sticky_find_add()
         * to avoid checking for duplicates when adding an entry.
1750          */
1751         if (rule->ir_flags & ILB_RULE_STICKY) {
1752                 in6_addr_t addr;
1753 
1754                 V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
1755                 if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
1756                     &s, &nat_src_idx)) == NULL) {
1757                         ILB_R_KSTAT(rule, nomem_pkt_dropped);
1758                         ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1759                         goto no_server;
1760                 }
1761         }
1762 
1763         /*
1764          * We are holding a reference on the rule, so the server
1765          * cannot go away.
1766          */
1767         *lb_dst = server->iser_addr_v6;
1768         ILB_S_KSTAT(server, pkt_processed);
1769         ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);
1770 
1771         switch (rule->ir_topo) {
1772         case ILB_TOPO_IMPL_NAT: {
1773                 ilb_nat_src_entry_t     *src_ent;
1774                 uint16_t                *src_idx;
1775 
1776                 /*
                 * We create a conn cache entry even if the segment is
                 * not a SYN.  The server should return a RST, and when
                 * we see that RST we destroy the entry.  Having the
                 * entry tells us how to NAT the returned RST.
1781                  */
1782                 info.vip = *dst;
1783                 info.dport = dport;
1784                 info.src = *src;
1785                 info.sport = sport;
1786 
                /* If stickiness is enabled, use the same source address. */
1788                 if (s != NULL)
1789                         src_idx = &nat_src_idx;
1790                 else
1791                         src_idx = NULL;
1792 
1793                 if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
1794                     &info.nat_src, &info.nat_sport, src_idx)) == NULL) {
1795                         if (s != NULL)
1796                                 ilb_sticky_refrele(s);
1797                         ILB_R_KSTAT(rule, pkt_dropped);
1798                         ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1799                         ILB_R_KSTAT(rule, noport_pkt_dropped);
1800                         ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len);
1801                         ret = ILB_DROPPED;
1802                         break;
1803                 }
1804                 info.src_ent = src_ent;
1805                 info.nat_dst = server->iser_addr_v6;
1806                 if (rule->ir_port_range && server->iser_port_range) {
1807                         info.nat_dport = htons(ntohs(dport) -
1808                             rule->ir_min_port + server->iser_min_port);
1809                 } else {
1810                         info.nat_dport = htons(server->iser_min_port);
1811                 }
1812 
1813                 /*
1814                  * If ilb_conn_add() fails, it will release the reference on
1815                  * sticky info and de-allocate the NAT source port allocated
1816                  * above.
1817                  */
1818                 if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
1819                     dport, &info, &ip_sum, &tp_sum, s) != 0) {
1820                         ILB_R_KSTAT(rule, pkt_dropped);
1821                         ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1822                         ILB_R_KSTAT(rule, nomem_pkt_dropped);
1823                         ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1824                         ret = ILB_DROPPED;
1825                         break;
1826                 }
1827                 ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
1828                 ret = ILB_BALANCED;
1829                 break;
1830         }
1831         case ILB_TOPO_IMPL_HALF_NAT:
1832                 info.vip = *dst;
1833                 info.nat_dst = server->iser_addr_v6;
1834                 info.dport = dport;
1835                 if (rule->ir_port_range && server->iser_port_range) {
1836                         info.nat_dport = htons(ntohs(dport) -
1837                             rule->ir_min_port + server->iser_min_port);
1838                 } else {
1839                         info.nat_dport = htons(server->iser_min_port);
1840                 }
1841 
1842                 if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
1843                     dport, &info, &ip_sum, &tp_sum, s) != 0) {
1844                         ILB_R_KSTAT(rule, pkt_dropped);
1845                         ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1846                         ILB_R_KSTAT(rule, nomem_pkt_dropped);
1847                         ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1848                         ret = ILB_DROPPED;
1849                         break;
1850                 }
1851                 ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
1852 
1853                 ret = ILB_BALANCED;
1854                 break;
1855         case ILB_TOPO_IMPL_DSR:
1856                 /*
1857                  * By decrementing the sticky refcnt, the period of
1858                  * stickiness (life time of ilb_sticky_t) will be
1859                  * from now to (now + default expiry time).
1860                  */
1861                 if (s != NULL)
1862                         ilb_sticky_refrele(s);
1863                 ret = ILB_BALANCED;
1864                 break;
1865         default:
                cmn_err(CE_PANIC, "data corruption: unknown topology: %p",
                    (void *)rule);
1868                 break;
1869         }
1870         ILB_RULE_REFRELE(rule);
1871         return (ret);
1872 
1873 no_server:
1874         /* This can only happen if there is no server available. */
1875         ILB_R_KSTAT(rule, pkt_dropped);
1876         ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1877         ILB_RULE_REFRELE(rule);
1878         return (ILB_DROPPED);
1879 }
1880 
1881 int
1882 ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4,
1883     uint8_t *tph, ipaddr_t *lb_dst)
1884 {
1885         in6_addr_t v6_src, v6_dst, v6_lb_dst;
1886         int ret;
1887 
1888         ASSERT(DB_REF(mp) == 1);
1889 
1890         if (l4 == IPPROTO_ICMP) {
1891                 return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
1892                     lb_dst));
1893         }
1894 
1895         IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src);
1896         IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst);
1897         ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha,
1898             tph, ntohs(ipha->ipha_length), &v6_lb_dst);
1899         if (ret == ILB_BALANCED)
1900                 IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst);
1901         return (ret);
1902 }
1903 
1904 int
1905 ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4,
1906     uint8_t *tph, in6_addr_t *lb_dst)
1907 {
1908         uint32_t pkt_len;
1909 
1910         ASSERT(DB_REF(mp) == 1);
1911 
1912         if (l4 == IPPROTO_ICMPV6) {
1913                 return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
1914                     lb_dst));
1915         }
1916 
1917         pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
1918         return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
1919             IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst));
1920 }
1921 
1922 void
1923 ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
1924 {
1925         ilb_rule_t *tmp_rule;
1926 
1927         mutex_enter(&ilbs->ilbs_g_lock);
1928         *num_rules = 0;
1929         for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1930             tmp_rule = tmp_rule->ir_next) {
1931                 if (tmp_rule->ir_zoneid == zoneid)
1932                         *num_rules += 1;
1933         }
1934         mutex_exit(&ilbs->ilbs_g_lock);
1935 }
1936 
1937 int
1938 ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1939     uint32_t *num_servers)
1940 {
1941         ilb_rule_t *rule;
1942         int err;
1943 
1944         if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1945                 return (err);
1946         *num_servers = rule->ir_kstat.num_servers.value.ui64;
1947         ILB_RULE_REFRELE(rule);
1948         return (0);
1949 }
1950 
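/*
 * Copy out the back end servers of the named rule.  On input,
 * *num_servers is the capacity of the servers array; on return, it is
 * the number of entries actually filled in.
 */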
1951 int
1952 ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1953     ilb_server_info_t *servers, uint32_t *num_servers)
1954 {
1955         ilb_rule_t *rule;
1956         ilb_server_t *server;
1957         size_t cnt;
1958         int err;
1959 
1960         if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1961                 return (err);
1962         for (server = rule->ir_servers, cnt = *num_servers;
1963             server != NULL && cnt > 0;
1964             server = server->iser_next, cnt--, servers++) {
1965                 (void) memcpy(servers->name, server->iser_name,
1966                     ILB_SERVER_NAMESZ);
1967                 servers->addr = server->iser_addr_v6;
1968                 servers->min_port = htons(server->iser_min_port);
1969                 servers->max_port = htons(server->iser_max_port);
1970                 servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0;
1971                 servers->err = 0;
1972         }
1973         ILB_RULE_REFRELE(rule);
1974         *num_servers -= cnt;
1975 
1976         return (0);
1977 }
1978 
1979 void
1980 ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names,
1981     char *buf)
1982 {
1983         ilb_rule_t *tmp_rule;
1984         int cnt;
1985 
1986         if (*num_names == 0)
1987                 return;
1988 
1989         mutex_enter(&ilbs->ilbs_g_lock);
1990         for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1991             tmp_rule = tmp_rule->ir_next) {
1992                 if (tmp_rule->ir_zoneid != zoneid)
1993                         continue;
1994 
1995                 (void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ);
1996                 buf += ILB_RULE_NAMESZ;
1997                 if (++cnt == *num_names)
1998                         break;
1999         }
2000         mutex_exit(&ilbs->ilbs_g_lock);
2001         *num_names = cnt;
2002 }
2003 
2004 int
2005 ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd)
2006 {
2007         ilb_rule_t *rule;
2008         int err;
2009 
2010         if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) {
2011                 return (err);
2012         }
2013 
2014         /*
2015          * Except the enabled flags, none of the following will change
2016          * in the life time of a rule.  So we don't hold the mutex when
2017          * reading them.  The worst is to report a wrong enabled flags.
2018          */
2019         cmd->ip_ver = rule->ir_ipver;
2020         cmd->proto = rule->ir_proto;
2021         cmd->min_port = htons(rule->ir_min_port);
2022         cmd->max_port = htons(rule->ir_max_port);
2023 
2024         cmd->vip = rule->ir_target_v6;
2025         cmd->algo = rule->ir_alg_type;
2026         cmd->topo = rule->ir_topo;
2027 
2028         cmd->nat_src_start = rule->ir_nat_src_start;
2029         cmd->nat_src_end = rule->ir_nat_src_end;
2030 
2031         cmd->conn_drain_timeout = rule->ir_conn_drain_timeout;
2032         cmd->nat_expiry = rule->ir_nat_expiry;
2033         cmd->sticky_expiry = rule->ir_sticky_expiry;
2034 
2035         cmd->flags = 0;
2036         if (rule->ir_flags & ILB_RULE_ENABLED)
2037                 cmd->flags |= ILB_RULE_ENABLED;
2038         if (rule->ir_flags & ILB_RULE_STICKY) {
2039                 cmd->flags |= ILB_RULE_STICKY;
2040                 cmd->sticky_mask = rule->ir_sticky_mask;
2041         }
2042 
2043         ILB_RULE_REFRELE(rule);
2044         return (0);
2045 }
2046 
2047 static void *
2048 ilb_stack_init(netstackid_t stackid, netstack_t *ns)
2049 {
2050         ilb_stack_t *ilbs;
2051         char tq_name[TASKQ_NAMELEN];
2052 
2053         ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP);
2054         ilbs->ilbs_netstack = ns;
2055 
2056         ilbs->ilbs_rule_head = NULL;
2057         ilbs->ilbs_g_hash = NULL;
2058         mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL);
2059 
2060         ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP);
        if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) {
                kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
                kmem_free(ilbs, sizeof (ilb_stack_t));
                return (NULL);
        }
2065 
2066         /*
2067          * ilbs_conn/sticky_hash related info is initialized in
2068          * ilb_conn/sticky_hash_init().
2069          */
2070         ilbs->ilbs_conn_taskq = NULL;
2071         ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
2072         ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
2073         ilbs->ilbs_c2s_conn_hash = NULL;
2074         ilbs->ilbs_s2c_conn_hash = NULL;
2075         ilbs->ilbs_conn_timer_list = NULL;
2076 
2077         ilbs->ilbs_sticky_hash = NULL;
2078         ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
2079         ilbs->ilbs_sticky_timer_list = NULL;
2080         ilbs->ilbs_sticky_taskq = NULL;
2081 
2082         /* The allocation is done later when there is a rule using NAT mode. */
2083         ilbs->ilbs_nat_src = NULL;
2084         ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
2085         mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
2086         ilbs->ilbs_nat_src_tid = 0;
2087 
2088         /* For listing the conn hash table */
2089         mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
2090         cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
2091         ilbs->ilbs_conn_list_busy = B_FALSE;
2092         ilbs->ilbs_conn_list_cur = 0;
2093         ilbs->ilbs_conn_list_connp = NULL;
2094 
2095         /* For listing the sticky hash table */
2096         mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
2097         cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
2098         ilbs->ilbs_sticky_list_busy = B_FALSE;
2099         ilbs->ilbs_sticky_list_cur = 0;
2100         ilbs->ilbs_sticky_list_curp = NULL;
2101 
2102         (void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p",
2103             (void *)ns);
2104         ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR,
2105             minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
2106 
2107         return (ilbs);
2108 }
2109 
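/*
 * Per-stack shutdown callback.  It runs before ilb_stack_fini() when a
 * netstack goes away, tearing down the hashes and all rules while the
 * taskqs created in ilb_stack_init() are still available.
 */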
2110 /* ARGSUSED */
2111 static void
2112 ilb_stack_shutdown(netstackid_t stackid, void *arg)
2113 {
2114         ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2115         ilb_rule_t *tmp_rule;
2116 
2117         ilb_sticky_hash_fini(ilbs);
2118         ilb_conn_hash_fini(ilbs);
2119         mutex_enter(&ilbs->ilbs_g_lock);
2120         while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
2121                 ilb_rule_hash_del(tmp_rule);
2122                 ilb_rule_g_del(ilbs, tmp_rule);
2123                 mutex_exit(&ilbs->ilbs_g_lock);
2124                 ilb_rule_del_common(ilbs, tmp_rule);
2125                 mutex_enter(&ilbs->ilbs_g_lock);
2126         }
2127         mutex_exit(&ilbs->ilbs_g_lock);
2128         if (ilbs->ilbs_nat_src != NULL)
2129                 ilb_nat_src_fini(ilbs);
2130 }
2131 
2132 static void
ilb_stack_fini(netstackid_t stackid, void *arg)
2134 {
2135         ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2136 
2137         ilb_rule_hash_fini(ilbs);
2138         taskq_destroy(ilbs->ilbs_rule_taskq);
2139         ilb_kstat_g_fini(stackid, ilbs);
2140         kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
2141         kmem_free(ilbs, sizeof (ilb_stack_t));
2142 }
2143 
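/*
 * Register the per-netstack callbacks with the netstack framework so
 * that ilb_stack_init() runs for each IP stack as it is created and
 * the shutdown/fini callbacks run as it is destroyed.
 */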
2144 void
2145 ilb_ddi_g_init(void)
2146 {
2147         netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown,
2148             ilb_stack_fini);
2149 }
2150 
2151 void
2152 ilb_ddi_g_destroy(void)
2153 {
2154         netstack_unregister(NS_ILB);
2155         ilb_conn_cache_fini();
2156         ilb_sticky_cache_fini();
2157 }