illumos-gate New usr/src/uts/common/io/mac/mac.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  * MAC Services Module
  28  *
  29  * The GLDv3 framework locking -  The MAC layer
  30  * --------------------------------------------
  31  *
  32  * The MAC layer is central to the GLD framework and can provide the locking
  33  * framework needed for itself and for the use of MAC clients. MAC end points
  34  * are fairly disjoint and don't share a lot of state. So a coarse grained
  35  * multi-threading scheme is to single thread all create/modify/delete or set
  36  * type of control operations on a per mac end point while allowing data threads
  37  * concurrently.
  38  *
  39  * Control operations (set) that modify a mac end point are always serialized on
  40  * a per mac end point basis. We have at most 1 such thread per mac end point
  41  * at a time.
  42  *
  43  * All other operations that are not serialized are essentially multi-threaded.
  44  * For example a control operation (get) like getting statistics which may not
  45  * care about reading values atomically or data threads sending or receiving
  46  * data. Mostly these type of operations don't modify the control state. Any
  47  * state these operations care about are protected using traditional locks.
  48  *
  49  * The perimeter only serializes serial operations. It does not imply there
  50  * aren't any other concurrent operations. However a serialized operation may
  51  * sometimes need to make sure it is the only thread. In this case it needs
  52  * to use reference counting mechanisms to cv_wait until any current data
  53  * threads are done.
  54  *
  55  * The mac layer itself does not hold any locks across a call to another layer.
  56  * The perimeter is however held across a down call to the driver to make the
  57  * whole control operation atomic with respect to other control operations.
  58  * Also the data path and get type control operations may proceed concurrently.
  59  * These operations synchronize with the single serial operation on a given mac
  60  * end point using regular locks. The perimeter ensures that conflicting
  61  * operations like say a mac_multicast_add and a mac_multicast_remove on the
  62  * same mac end point don't interfere with each other and also ensures that the
  63  * changes in the mac layer and the call to the underlying driver to say add a
  64  * multicast address are done atomically without interference from a thread
  65  * trying to delete the same address.
  66  *
  67  * For example, consider
  68  * mac_multicast_add()
  69  * {
  70  *      mac_perimeter_enter();  serialize all control operations
  71  *
  72  *      grab list lock          protect against access by data threads
  73  *      add to list
  74  *      drop list lock
  75  *
  76  *      call driver's mi_multicst
  77  *
  78  *      mac_perimeter_exit();
  79  * }
  80  *
  81  * To lessen the number of serialization locks and simplify the lock hierarchy,
  82  * we serialize all the control operations on a per mac end point by using a
  83  * single serialization lock called the perimeter. We allow recursive entry into
  84  * the perimeter to facilitate use of this mechanism by both the mac client and
  85  * the MAC layer itself.
  86  *
  87  * MAC client means an entity that does an operation on a mac handle
  88  * obtained from a mac_open/mac_client_open. Similarly MAC driver means
  89  * an entity that does an operation on a mac handle obtained from a
  90  * mac_register. An entity could be both client and driver but on different
  91  * handles eg. aggr. and should only make the corresponding mac interface calls
  92  * i.e. mac driver interface or mac client interface as appropriate for that
  93  * mac handle.
  94  *
  95  * General rules.
  96  * -------------
  97  *
  98  * R1. The lock order of upcall threads is naturally opposite to downcall
  99  * threads. Hence upcalls must not hold any locks across layers for fear of
 100  * recursive lock enter and lock order violation. This applies to all layers.
 101  *
 102  * R2. The perimeter is just another lock. Since it is held in the down
 103  * direction, acquiring the perimeter in an upcall is prohibited as it would
 104  * cause a deadlock. This applies to all layers.
 105  *
 106  * Note that upcalls that need to grab the mac perimeter (for example
 107  * mac_notify upcalls) can still achieve that by posting the request to a
 108  * thread, which can then grab all the required perimeters and locks in the
 109  * right global order. Note that in the above example the mac layer itself
 110  * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
 111  * to the client must do that. Please see the aggr code for an example.
 112  *
 113  * MAC client rules
 114  * ----------------
 115  *
 116  * R3. A MAC client may use the MAC provided perimeter facility to serialize
 117  * control operations on a per mac end point. It does this by acquiring
 118  * and holding the perimeter across a sequence of calls to the mac layer.
 119  * This ensures atomicity across the entire block of mac calls. In this
 120  * model the MAC client must not hold any client locks across the calls to
 121  * the mac layer. This model is the preferred solution.
 122  *
 123  * R4. However if a MAC client has a lot of global state across all mac end
 124  * points the per mac end point serialization may not be sufficient. In this
 125  * case the client may choose to use global locks or use its own serialization.
 126  * To avoid deadlocks, these client layer locks held across the mac calls
 127  * in the control path must never be acquired by the data path for the reason
 128  * mentioned below.
 129  *
 130  * (Assume that a control operation that holds a client lock blocks in the
 131  * mac layer waiting for upcall reference counts to drop to zero. If an upcall
 132  * data thread that holds this reference count, tries to acquire the same
 133  * client lock subsequently it will deadlock).
 134  *
 135  * A MAC client may follow either the R3 model or the R4 model, but can't
 136  * mix both. In the former, the hierarchy is Perim -> client locks, but in
 137  * the latter it is client locks -> Perim.
 138  *
 139  * R5. MAC clients must make MAC calls (excluding data calls) in a cv_wait'able
 140  * context since they may block while trying to acquire the perimeter.
 141  * In addition some calls may block waiting for upcall refcnts to come down to
 142  * zero.
 143  *
 144  * R6. MAC clients must make sure that they are single threaded and all threads
 145  * from the top (in particular data threads) have finished before calling
 146  * mac_client_close. The MAC framework does not track the number of client
 147  * threads using the mac client handle. Also mac clients must make sure
 148  * they have undone all the control operations before calling mac_client_close.
 149  * For example mac_unicast_remove/mac_multicast_remove to undo the corresponding
 150  * mac_unicast_add/mac_multicast_add.
 151  *
 152  * MAC framework rules
 153  * -------------------
 154  *
 155  * R7. The mac layer itself must not hold any mac layer locks (except the mac
 156  * perimeter) across a call to any other layer from the mac layer. The call to
 157  * any other layer could be via mi_* entry points, classifier entry points into
 158  * the driver or via upcall pointers into layers above. The mac perimeter may
 159  * be acquired or held only in the down direction, for e.g. when calling into
 160  * a mi_* driver entry point to provide atomicity of the operation.
 161  *
 162  * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
 163  * mac driver interfaces, the MAC layer must provide a cut out for control
 164  * interfaces like upcall notifications and start them in a separate thread.
 165  *
 166  * R9. Note that locking order also implies a plumbing order. For example
 167  * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
 168  * to plumb in any other order must be failed at mac_open time, otherwise it
 169  * could lead to deadlocks due to inverse locking order.
 170  *
 171  * R10. MAC driver interfaces must not block since the driver could call them
 172  * in interrupt context.
 173  *
 174  * R11. Walkers must preferably not hold any locks while calling walker
 175  * callbacks. Instead these can operate on reference counts. In simple
 176  * callbacks it may be ok to hold a lock and call the callbacks, but this is
 177  * harder to maintain in the general case of arbitrary callbacks.
 178  *
 179  * R12. The MAC layer must protect upcall notification callbacks using reference
 180  * counts rather than holding locks across the callbacks.
 181  *
 182  * R13. Given the variety of drivers, it is preferable if the MAC layer can make
 183  * sure that any pointers (such as mac ring pointers) it passes to the driver
 184  * remain valid until mac unregister time. Currently the mac layer achieves
 185  * this by using generation numbers for rings and freeing the mac rings only
 186  * at unregister time.  The MAC layer must provide a layer of indirection and
 187  * must not expose underlying driver rings or driver data structures/pointers
 188  * directly to MAC clients.
 189  *
 190  * MAC driver rules
 191  * ----------------
 192  *
 193  * R14. It would be preferable if MAC drivers don't hold any locks across any
 194  * mac call. However at a minimum they must not hold any locks across data
 195  * upcalls. They must also make sure that all references to mac data structures
 196  * are cleaned up and that it is single threaded at mac_unregister time.
 197  *
 198  * R15. MAC driver interfaces don't block and so the action may be done
 199  * asynchronously in a separate thread as for example handling notifications.
 200  * The driver must not assume that the action is complete when the call
 201  * returns.
 202  *
 203  * R16. Drivers must maintain a generation number per Rx ring, and pass it
 204  * back to mac_rx_ring(); They are expected to increment the generation
 205  * number whenever the ring's stop routine is invoked.
 206  * See comments in mac_rx_ring();
 207  *
 208  * R17. Similarly mi_stop is another synchronization point and the driver must
 209  * ensure that all upcalls are done and there won't be any future upcall
 210  * before returning from mi_stop.
 211  *
 212  * R18. The driver may assume that all set/modify control operations via
 213  * the mi_* entry points are single threaded on a per mac end point.
 214  *
 215  * Lock and Perimeter hierarchy scenarios
 216  * ---------------------------------------
 217  *
 218  * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
 219  *
 220  * ft_lock -> fe_lock [mac_flow_lookup]
 221  *
 222  * mi_rw_lock -> fe_lock [mac_bcast_send]
 223  *
 224  * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
 225  *
 226  * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
 227  *
 228  * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
 229  *
 230  * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
 231  * client to driver. In the case of clients that explicitly use the mac provided
 232  * perimeter mechanism for their serialization, the hierarchy is
 233  * Perimeter -> mac layer locks, since the client never holds any locks across
 234  * the mac calls. In the case of clients that use their own locks the hierarchy
 235  * is Client locks -> Mac Perim -> Mac layer locks. The client never explicitly
 236  * calls mac_perim_enter/exit in this case.
 237  *
 238  * Subflow creation rules
 239  * ---------------------------
 240  * o In case of a user specified cpulist present on underlying link and flows,
 241  * the flows cpulist must be a subset of the underlying link.
 242  * o In case of a user specified fanout mode present on link and flow, the
 243  * subflow fanout count has to be less than or equal to that of the
 244  * underlying link. The cpu-bindings for the subflows will be a subset of
 245  * the underlying link.
 246  * o In case no cpulist is specified on either the underlying link or the
 247  * flow, the underlying link relies on a MAC tunable to provide out of box
 248  * fanout. The subflow will have no cpulist (the subflow will be unbound).
 249  * o In case no cpulist is specified on the underlying link, a subflow can
 250  * carry either a user-specified cpulist or fanout count. The cpu-bindings
 251  * for the subflow will not adhere to the restriction that they need to be a
 252  * subset of the underlying link.
 253  * o In case the underlying link is carrying either a user specified
 254  * cpulist or fanout mode and the subflow is unspecified, the subflow will be
 255  * created unbound.
 256  * o While creating unbound subflows, bandwidth mode changes attempt to
 257  * figure a right fanout count. In such cases the fanout count will override
 258  * the unbound cpu-binding behavior.
 259  * o In addition to this, while cycling between flow and link properties, we
 260  * impose a restriction that if a link property has a subflow with
 261  * user-specified attributes, we will not allow changing the link property.
 262  * The administrator needs to reset all the user specified properties for the
 263  * subflows before attempting a link property change.
 264  * Some of the above rules can be overridden by specifying additional command
 265  * line options while creating or modifying link or subflow properties.
 266  */
 267 
 268 #include <sys/types.h>
 269 #include <sys/conf.h>
 270 #include <sys/id_space.h>
 271 #include <sys/esunddi.h>
 272 #include <sys/stat.h>
 273 #include <sys/mkdev.h>
 274 #include <sys/stream.h>
 275 #include <sys/strsun.h>
 276 #include <sys/strsubr.h>
 277 #include <sys/dlpi.h>
 278 #include <sys/list.h>
 279 #include <sys/modhash.h>
 280 #include <sys/mac_provider.h>
 281 #include <sys/mac_client_impl.h>
 282 #include <sys/mac_soft_ring.h>
 283 #include <sys/mac_stat.h>
 284 #include <sys/mac_impl.h>
 285 #include <sys/mac.h>
 286 #include <sys/dls.h>
 287 #include <sys/dld.h>
 288 #include <sys/modctl.h>
 289 #include <sys/fs/dv_node.h>
 290 #include <sys/thread.h>
 291 #include <sys/proc.h>
 292 #include <sys/callb.h>
 293 #include <sys/cpuvar.h>
 294 #include <sys/atomic.h>
 295 #include <sys/bitmap.h>
 296 #include <sys/sdt.h>
 297 #include <sys/mac_flow.h>
 298 #include <sys/ddi_intr_impl.h>
 299 #include <sys/disp.h>
 300 #include <sys/sdt.h>
 301 #include <sys/vnic.h>
 302 #include <sys/vnic_impl.h>
 303 #include <sys/vlan.h>
 304 #include <inet/ip.h>
 305 #include <inet/ip6.h>
 306 #include <sys/exacct.h>
 307 #include <sys/exacct_impl.h>
 308 #include <inet/nd.h>
 309 #include <sys/ethernet.h>
 310 #include <sys/pool.h>
 311 #include <sys/pool_pset.h>
 312 #include <sys/cpupart.h>
 313 #include <inet/wifi_ioctl.h>
 314 #include <net/wpa.h>
 315 
 316 #define IMPL_HASHSZ     67      /* prime; bucket count of i_mac_impl_hash */
 317 
 318 kmem_cache_t            *i_mac_impl_cachep;     /* mac_impl_t kmem cache */
 319 mod_hash_t              *i_mac_impl_hash;       /* string-keyed mod hash */
 320 krwlock_t               i_mac_impl_lock;        /* set up in mac_init() */
 321 uint_t                  i_mac_impl_count;       /* > 0 blocks mac_fini() */
 322 static kmem_cache_t     *mac_ring_cache;        /* mac_ring_t kmem cache */
 323 static id_space_t       *minor_ids;             /* minor number id space */
 324 static uint32_t         minor_count;            /* > 0 blocks mac_fini() */
 325 static pool_event_cb_t  mac_pool_event_reg;     /* pool event callback */
 326 
 327 /*
 328  * Logging stuff. Perhaps mac_logging_interval could be broken into
 329  * mac_flow_log_interval and mac_link_log_interval if we want to be
 330  * able to schedule them differently.
 331  */
 332 uint_t                  mac_logging_interval;
 333 boolean_t               mac_flow_log_enable;
 334 boolean_t               mac_link_log_enable;
 335 timeout_id_t            mac_logging_timer;
 336 
 337 /* for debugging, see MAC_DBG_PRT() in mac_impl.h */
 338 int mac_dbg = 0;
 339 
 340 #define MACTYPE_KMODDIR "mac"
 341 #define MACTYPE_HASHSZ  67
 342 static mod_hash_t       *i_mactype_hash;
 343 /*
 344  * i_mactype_lock synchronizes threads that obtain references to mactype_t
 345  * structures through i_mactype_getplugin().
 346  */
 347 static kmutex_t         i_mactype_lock;
 348 
 349 /*
 350  * mac_tx_percpu_cnt
 351  *
 352  * Number of per cpu locks per mac_client_impl_t. Used by the transmit side
 353  * in mac_tx to reduce lock contention. This is sized at boot time in mac_init.
 354  * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2.
 355  * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1.
 356  */
 357 int mac_tx_percpu_cnt;
 358 int mac_tx_percpu_cnt_max = 128;
 359 
 360 /*
 361  * Call back functions for the bridge module.  These are guaranteed to be valid
 362  * when holding a reference on a link or when holding mip->mi_bridge_lock and
 363  * mi_bridge_link is non-NULL.
 364  */
 365 mac_bridge_tx_t mac_bridge_tx_cb;
 366 mac_bridge_rx_t mac_bridge_rx_cb;
 367 mac_bridge_ref_t mac_bridge_ref_cb;
 368 mac_bridge_ls_t mac_bridge_ls_cb;
 369 
 370 static int i_mac_constructor(void *, void *, int);
 371 static void i_mac_destructor(void *, void *);
 372 static int i_mac_ring_ctor(void *, void *, int);
 373 static void i_mac_ring_dtor(void *, void *);
 374 static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *);
 375 void mac_tx_client_flush(mac_client_impl_t *);
 376 void mac_tx_client_block(mac_client_impl_t *);
 377 static void mac_rx_ring_quiesce(mac_ring_t *, uint_t);
 378 static int mac_start_group_and_rings(mac_group_t *);
 379 static void mac_stop_group_and_rings(mac_group_t *);
 380 static void mac_pool_event_cb(pool_event_t, int, void *);
 381 
 382 typedef struct netinfo_s {      /* NOTE(review): users are not in view */
 383         list_node_t     ni_link;        /* list linkage */
 384         void            *ni_record;     /* presumably the payload - confirm */
 385         int             ni_size;        /* presumably its size - confirm */
 386         int             ni_type;        /* presumably its kind - confirm */
 387 } netinfo_t;
 388 
 389 /*
 390  * Module initialization functions.
 391  */
 392 
 393 void
 394 mac_init(void)
 395 {
 396         mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? max_ncpus :
 397             boot_max_ncpus);
 398 
 399         /* Upper bound is mac_tx_percpu_cnt_max */
 400         if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max)
 401                 mac_tx_percpu_cnt = mac_tx_percpu_cnt_max;
 402 
 403         if (mac_tx_percpu_cnt < 1) {
 404                 /* Someone set max_tx_percpu_cnt_max to 0 or less */
 405                 mac_tx_percpu_cnt = 1;
 406         }
 407 
 408         ASSERT(mac_tx_percpu_cnt >= 1);
 409         mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1));
 410         /*
 411          * Make it of the form 2**N - 1 in the range
 412          * [0 .. mac_tx_percpu_cnt_max - 1]
 413          */
 414         mac_tx_percpu_cnt--;
 415 
 416         i_mac_impl_cachep = kmem_cache_create("mac_impl_cache",
 417             sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor,
 418             NULL, NULL, NULL, 0);
 419         ASSERT(i_mac_impl_cachep != NULL);
 420 
 421         mac_ring_cache = kmem_cache_create("mac_ring_cache",
 422             sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL,
 423             NULL, NULL, 0);
 424         ASSERT(mac_ring_cache != NULL);
 425 
 426         i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash",
 427             IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
 428             mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
 429         rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL);
 430 
 431         mac_flow_init();
 432         mac_soft_ring_init();
 433         mac_bcast_init();
 434         mac_client_init();
 435 
 436         i_mac_impl_count = 0;
 437 
 438         i_mactype_hash = mod_hash_create_extended("mactype_hash",
 439             MACTYPE_HASHSZ,
 440             mod_hash_null_keydtor, mod_hash_null_valdtor,
 441             mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
 442 
 443         /*
 444          * Allocate an id space to manage minor numbers. The range of the
 445          * space will be from MAC_MAX_MINOR+1 to MAC_PRIVATE_MINOR-1.  This
 446          * leaves half of the 32-bit minors available for driver private use.
 447          */
 448         minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1,
 449             MAC_PRIVATE_MINOR-1);
 450         ASSERT(minor_ids != NULL);
 451         minor_count = 0;
 452 
 453         /* Let's default to 20 seconds */
 454         mac_logging_interval = 20;
 455         mac_flow_log_enable = B_FALSE;
 456         mac_link_log_enable = B_FALSE;
 457         mac_logging_timer = 0;
 458 
 459         /* Register to be notified of noteworthy pools events */
 460         mac_pool_event_reg.pec_func =  mac_pool_event_cb;
 461         mac_pool_event_reg.pec_arg = NULL;
 462         pool_event_cb_register(&mac_pool_event_reg);
 463 }
 464 
 465 int
 466 mac_fini(void)
 467 {
 468 
 469         if (i_mac_impl_count > 0 || minor_count > 0)
 470                 return (EBUSY);
 471 
 472         pool_event_cb_unregister(&mac_pool_event_reg);
 473 
 474         id_space_destroy(minor_ids);
 475         mac_flow_fini();
 476 
 477         mod_hash_destroy_hash(i_mac_impl_hash);
 478         rw_destroy(&i_mac_impl_lock);
 479 
 480         mac_client_fini();
 481         kmem_cache_destroy(mac_ring_cache);
 482 
 483         mod_hash_destroy_hash(i_mactype_hash);
 484         mac_soft_ring_finish();
 485 
 486 
 487         return (0);
 488 }
 489 
 490 /*
 491  * Initialize a GLDv3 driver's device ops.  A driver that manages its own ops
 492  * (e.g. softmac) may pass in a NULL ops argument.
 493  */
 494 void
 495 mac_init_ops(struct dev_ops *ops, const char *name)
 496 {
 497         major_t major = ddi_name_to_major((char *)name);
 498 
 499         /*
 500          * By returning on error below, we are not letting the driver continue
 501          * in an undefined context.  The mac_register() function will faill if
 502          * DN_GLDV3_DRIVER isn't set.
 503          */
 504         if (major == DDI_MAJOR_T_NONE)
 505                 return;
 506         LOCK_DEV_OPS(&devnamesp[major].dn_lock);
 507         devnamesp[major].dn_flags |= (DN_GLDV3_DRIVER | DN_NETWORK_DRIVER);
 508         UNLOCK_DEV_OPS(&devnamesp[major].dn_lock);
 509         if (ops != NULL)
 510                 dld_init_ops(ops, name);
 511 }
 512 
 513 void
 514 mac_fini_ops(struct dev_ops *ops)
 515 {
 516         dld_fini_ops(ops);
 517 }
 518 
 519 /*ARGSUSED*/
 520 static int
 521 i_mac_constructor(void *buf, void *arg, int kmflag)
 522 {
 523         mac_impl_t      *mip = buf;
 524 
 525         bzero(buf, sizeof (mac_impl_t));
 526 
 527         mip->mi_linkstate = LINK_STATE_UNKNOWN;
 528 
 529         rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL);
 530         mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL);
 531         mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL);
 532         mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL);
 533 
 534         mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock;
 535         cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
 536         mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock;
 537         cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
 538 
 539         mutex_init(&mip->mi_bridge_lock, NULL, MUTEX_DEFAULT, NULL);
 540 
 541         return (0);
 542 }
 543 
 544 /*ARGSUSED*/
 545 static void
 546 i_mac_destructor(void *buf, void *arg)
 547 {
 548         mac_impl_t      *mip = buf;
 549         mac_cb_info_t   *mcbi;
 550 
 551         ASSERT(mip->mi_ref == 0);
 552         ASSERT(mip->mi_active == 0);
 553         ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
 554         ASSERT(mip->mi_devpromisc == 0);
 555         ASSERT(mip->mi_ksp == NULL);
 556         ASSERT(mip->mi_kstat_count == 0);
 557         ASSERT(mip->mi_nclients == 0);
 558         ASSERT(mip->mi_nactiveclients == 0);
 559         ASSERT(mip->mi_single_active_client == NULL);
 560         ASSERT(mip->mi_state_flags == 0);
 561         ASSERT(mip->mi_factory_addr == NULL);
 562         ASSERT(mip->mi_factory_addr_num == 0);
 563         ASSERT(mip->mi_default_tx_ring == NULL);
 564 
 565         mcbi = &mip->mi_notify_cb_info;
 566         ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0);
 567         ASSERT(mip->mi_notify_bits == 0);
 568         ASSERT(mip->mi_notify_thread == NULL);
 569         ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock);
 570         mcbi->mcbi_lockp = NULL;
 571 
 572         mcbi = &mip->mi_promisc_cb_info;
 573         ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL);
 574         ASSERT(mip->mi_promisc_list == NULL);
 575         ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock);
 576         mcbi->mcbi_lockp = NULL;
 577 
 578         ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL);
 579         ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0);
 580 
 581         rw_destroy(&mip->mi_rw_lock);
 582 
 583         mutex_destroy(&mip->mi_promisc_lock);
 584         cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv);
 585         mutex_destroy(&mip->mi_notify_lock);
 586         cv_destroy(&mip->mi_notify_cb_info.mcbi_cv);
 587         mutex_destroy(&mip->mi_ring_lock);
 588 
 589         ASSERT(mip->mi_bridge_link == NULL);
 590 }
 591 
 592 /* ARGSUSED */
 593 static int
 594 i_mac_ring_ctor(void *buf, void *arg, int kmflag)
 595 {
 596         mac_ring_t *ring = (mac_ring_t *)buf;
 597 
 598         bzero(ring, sizeof (mac_ring_t));
 599         cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL);
 600         mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL);
 601         ring->mr_state = MR_FREE;
 602         return (0);
 603 }
 604 
 605 /* ARGSUSED */
 606 static void
 607 i_mac_ring_dtor(void *buf, void *arg)
 608 {
 609         mac_ring_t *ring = (mac_ring_t *)buf;
 610 
 611         cv_destroy(&ring->mr_cv);
 612         mutex_destroy(&ring->mr_lock);
 613 }
 614 
 615 /*
 616  * Common functions to do mac callback addition and deletion. Currently this is
 617  * used by promisc callbacks and notify callbacks. List addition and deletion
 618  * need to take care of list walkers. List walkers in general, can't hold list
 619  * locks and make upcall callbacks due to potential lock order and recursive
 620  * reentry issues. Instead list walkers increment the list walker count to mark
 621  * the presence of a walker thread. Addition can be carefully done to ensure
 622  * that the list walker always sees either the old list or the new list.
 623  * However the deletion can't be done while the walker is active, instead the
 624  * deleting thread simply marks the entry as logically deleted. The last walker
 625  * physically deletes and frees up the logically deleted entries when the walk
 626  * is complete.
 627  */
 628 void
 629 mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
 630     mac_cb_t *mcb_elem)
 631 {
 632         mac_cb_t        *p;
 633         mac_cb_t        **pp;
 634 
 635         /* Verify it is not already in the list */
 636         for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
 637                 if (p == mcb_elem)
 638                         break;
 639         }
 640         VERIFY(p == NULL);
 641 
 642         /*
 643          * Add it to the head of the callback list. The membar ensures that
 644          * the following list pointer manipulations reach global visibility
 645          * in exactly the program order below.
 646          */
 647         ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
 648 
 649         mcb_elem->mcb_nextp = *mcb_head;
 650         membar_producer();
 651         *mcb_head = mcb_elem;
 652 }
 653 
 654 /*
 655  * Mark the entry as logically deleted. If there aren't any walkers unlink
 656  * from the list. In either case return the corresponding status.
 657  */
 658 boolean_t
 659 mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
 660     mac_cb_t *mcb_elem)
 661 {
 662         mac_cb_t        *p;
 663         mac_cb_t        **pp;
 664 
 665         ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
 666         /*
 667          * Search the callback list for the entry to be removed
 668          */
 669         for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
 670                 if (p == mcb_elem)
 671                         break;
 672         }
 673         VERIFY(p != NULL);
 674 
 675         /*
 676          * If there are walkers just mark it as deleted and the last walker
 677          * will remove from the list and free it.
 678          */
 679         if (mcbi->mcbi_walker_cnt != 0) {
 680                 p->mcb_flags |= MCB_CONDEMNED;
 681                 mcbi->mcbi_del_cnt++;
 682                 return (B_FALSE);
 683         }
 684 
 685         ASSERT(mcbi->mcbi_del_cnt == 0);
 686         *pp = p->mcb_nextp;
 687         p->mcb_nextp = NULL;
 688         return (B_TRUE);
 689 }
 690 
 691 /*
 692  * Wait for all pending callback removals to be completed
 693  */
 694 void
 695 mac_callback_remove_wait(mac_cb_info_t *mcbi)
 696 {
 697         ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
 698         while (mcbi->mcbi_del_cnt != 0) {
 699                 DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
 700                 cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
 701         }
 702 }
 703 
 704 /*
 705  * The last mac callback walker does the cleanup. Walk the list and unlik
 706  * all the logically deleted entries and construct a temporary list of
 707  * removed entries. Return the list of removed entries to the caller.
 708  */
 709 mac_cb_t *
 710 mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
 711 {
 712         mac_cb_t        *p;
 713         mac_cb_t        **pp;
 714         mac_cb_t        *rmlist = NULL;         /* List of removed elements */
 715         int     cnt = 0;
 716 
 717         ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
 718         ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);
 719 
 720         pp = mcb_head;
 721         while (*pp != NULL) {
 722                 if ((*pp)->mcb_flags & MCB_CONDEMNED) {
 723                         p = *pp;
 724                         *pp = p->mcb_nextp;
 725                         p->mcb_nextp = rmlist;
 726                         rmlist = p;
 727                         cnt++;
 728                         continue;
 729                 }
 730                 pp = &(*pp)->mcb_nextp;
 731         }
 732 
 733         ASSERT(mcbi->mcbi_del_cnt == cnt);
 734         mcbi->mcbi_del_cnt = 0;
 735         return (rmlist);
 736 }
 737 
 738 boolean_t
 739 mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
 740 {
 741         mac_cb_t        *mcb;
 742 
 743         /* Verify it is not already in the list */
 744         for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
 745                 if (mcb == mcb_elem)
 746                         return (B_TRUE);
 747         }
 748 
 749         return (B_FALSE);
 750 }
 751 
 752 boolean_t
 753 mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
 754 {
 755         boolean_t       found;
 756 
 757         mutex_enter(mcbi->mcbi_lockp);
 758         found = mac_callback_lookup(mcb_headp, mcb_elem);
 759         mutex_exit(mcbi->mcbi_lockp);
 760 
 761         return (found);
 762 }
 763 
 764 /* Free the list of removed callbacks */
 765 void
 766 mac_callback_free(mac_cb_t *rmlist)
 767 {
 768         mac_cb_t        *mcb;
 769         mac_cb_t        *mcb_next;
 770 
 771         for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
 772                 mcb_next = mcb->mcb_nextp;
 773                 kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
 774         }
 775 }
 776 
 777 /*
 778  * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
 779  * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
 780  * is only a single shared total walker count, and an entry can't be physically
 781  * unlinked if a walker is active on either list. The last walker does this
 782  * cleanup of logically deleted entries.
 783  */
 784 void
 785 i_mac_promisc_walker_cleanup(mac_impl_t *mip)
 786 {
 787         mac_cb_t        *rmlist;
 788         mac_cb_t        *mcb;
 789         mac_cb_t        *mcb_next;
 790         mac_promisc_impl_t      *mpip;
 791 
 792         /*
 793          * Construct a temporary list of deleted callbacks by walking the
 794          * the mi_promisc_list. Then for each entry in the temporary list,
 795          * remove it from the mci_promisc_list and free the entry.
 796          */
 797         rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info,
 798             &mip->mi_promisc_list);
 799 
 800         for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
 801                 mcb_next = mcb->mcb_nextp;
 802                 mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
 803                 VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
 804                     &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link));
 805                 mcb->mcb_flags = 0;
 806                 mcb->mcb_nextp = NULL;
 807                 kmem_cache_free(mac_promisc_impl_cache, mpip);
 808         }
 809 }
 810 
 811 void
 812 i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
 813 {
 814         mac_cb_info_t   *mcbi;
 815 
 816         /*
 817          * Signal the notify thread even after mi_ref has become zero and
 818          * mi_disabled is set. The synchronization with the notify thread
 819          * happens in mac_unregister and that implies the driver must make
 820          * sure it is single-threaded (with respect to mac calls) and that
 821          * all pending mac calls have returned before it calls mac_unregister
 822          */
 823         rw_enter(&i_mac_impl_lock, RW_READER);
 824         if (mip->mi_state_flags & MIS_DISABLED)
 825                 goto exit;
 826 
 827         /*
 828          * Guard against incorrect notifications.  (Running a newer
 829          * mac client against an older implementation?)
 830          */
 831         if (type >= MAC_NNOTE)
 832                 goto exit;
 833 
 834         mcbi = &mip->mi_notify_cb_info;
 835         mutex_enter(mcbi->mcbi_lockp);
 836         mip->mi_notify_bits |= (1 << type);
 837         cv_broadcast(&mcbi->mcbi_cv);
 838         mutex_exit(mcbi->mcbi_lockp);
 839 
 840 exit:
 841         rw_exit(&i_mac_impl_lock);
 842 }
 843 
 844 /*
 845  * Mac serialization primitives. Please see the block comment at the
 846  * top of the file.
 847  */
 848 void
 849 i_mac_perim_enter(mac_impl_t *mip)
 850 {
 851         mac_client_impl_t       *mcip;
 852 
 853         if (mip->mi_state_flags & MIS_IS_VNIC) {
 854                 /*
 855                  * This is a VNIC. Return the lower mac since that is what
 856                  * we want to serialize on.
 857                  */
 858                 mcip = mac_vnic_lower(mip);
 859                 mip = mcip->mci_mip;
 860         }
 861 
 862         mutex_enter(&mip->mi_perim_lock);
 863         if (mip->mi_perim_owner == curthread) {
 864                 mip->mi_perim_ocnt++;
 865                 mutex_exit(&mip->mi_perim_lock);
 866                 return;
 867         }
 868 
 869         while (mip->mi_perim_owner != NULL)
 870                 cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock);
 871 
 872         mip->mi_perim_owner = curthread;
 873         ASSERT(mip->mi_perim_ocnt == 0);
 874         mip->mi_perim_ocnt++;
 875 #ifdef DEBUG
 876         mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack,
 877             MAC_PERIM_STACK_DEPTH);
 878 #endif
 879         mutex_exit(&mip->mi_perim_lock);
 880 }
 881 
 882 int
 883 i_mac_perim_enter_nowait(mac_impl_t *mip)
 884 {
 885         /*
 886          * The vnic is a special case, since the serialization is done based
 887          * on the lower mac. If the lower mac is busy, it does not imply the
 888          * vnic can't be unregistered. But in the case of other drivers,
 889          * a busy perimeter or open mac handles implies that the mac is busy
 890          * and can't be unregistered.
 891          */
 892         if (mip->mi_state_flags & MIS_IS_VNIC) {
 893                 i_mac_perim_enter(mip);
 894                 return (0);
 895         }
 896 
 897         mutex_enter(&mip->mi_perim_lock);
 898         if (mip->mi_perim_owner != NULL) {
 899                 mutex_exit(&mip->mi_perim_lock);
 900                 return (EBUSY);
 901         }
 902         ASSERT(mip->mi_perim_ocnt == 0);
 903         mip->mi_perim_owner = curthread;
 904         mip->mi_perim_ocnt++;
 905         mutex_exit(&mip->mi_perim_lock);
 906 
 907         return (0);
 908 }
 909 
 910 void
 911 i_mac_perim_exit(mac_impl_t *mip)
 912 {
 913         mac_client_impl_t *mcip;
 914 
 915         if (mip->mi_state_flags & MIS_IS_VNIC) {
 916                 /*
 917                  * This is a VNIC. Return the lower mac since that is what
 918                  * we want to serialize on.
 919                  */
 920                 mcip = mac_vnic_lower(mip);
 921                 mip = mcip->mci_mip;
 922         }
 923 
 924         ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0);
 925 
 926         mutex_enter(&mip->mi_perim_lock);
 927         if (--mip->mi_perim_ocnt == 0) {
 928                 mip->mi_perim_owner = NULL;
 929                 cv_signal(&mip->mi_perim_cv);
 930         }
 931         mutex_exit(&mip->mi_perim_lock);
 932 }
 933 
 934 /*
 935  * Returns whether the current thread holds the mac perimeter. Used in making
 936  * assertions.
 937  */
 938 boolean_t
 939 mac_perim_held(mac_handle_t mh)
 940 {
 941         mac_impl_t      *mip = (mac_impl_t *)mh;
 942         mac_client_impl_t *mcip;
 943 
 944         if (mip->mi_state_flags & MIS_IS_VNIC) {
 945                 /*
 946                  * This is a VNIC. Return the lower mac since that is what
 947                  * we want to serialize on.
 948                  */
 949                 mcip = mac_vnic_lower(mip);
 950                 mip = mcip->mci_mip;
 951         }
 952         return (mip->mi_perim_owner == curthread);
 953 }
 954 
 955 /*
 956  * mac client interfaces to enter the mac perimeter of a mac end point, given
 957  * its mac handle, or macname or linkid.
 958  */
 959 void
 960 mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp)
 961 {
 962         mac_impl_t      *mip = (mac_impl_t *)mh;
 963 
 964         i_mac_perim_enter(mip);
 965         /*
 966          * The mac_perim_handle_t returned encodes the 'mip' and whether a
 967          * mac_open has been done internally while entering the perimeter.
 968          * This information is used in mac_perim_exit
 969          */
 970         MAC_ENCODE_MPH(*mphp, mip, 0);
 971 }
 972 
 973 int
 974 mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp)
 975 {
 976         int     err;
 977         mac_handle_t    mh;
 978 
 979         if ((err = mac_open(name, &mh)) != 0)
 980                 return (err);
 981 
 982         mac_perim_enter_by_mh(mh, mphp);
 983         MAC_ENCODE_MPH(*mphp, mh, 1);
 984         return (0);
 985 }
 986 
 987 int
 988 mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp)
 989 {
 990         int     err;
 991         mac_handle_t    mh;
 992 
 993         if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
 994                 return (err);
 995 
 996         mac_perim_enter_by_mh(mh, mphp);
 997         MAC_ENCODE_MPH(*mphp, mh, 1);
 998         return (0);
 999 }
1000 
1001 void
1002 mac_perim_exit(mac_perim_handle_t mph)
1003 {
1004         mac_impl_t      *mip;
1005         boolean_t       need_close;
1006 
1007         MAC_DECODE_MPH(mph, mip, need_close);
1008         i_mac_perim_exit(mip);
1009         if (need_close)
1010                 mac_close((mac_handle_t)mip);
1011 }
1012 
1013 int
1014 mac_hold(const char *macname, mac_impl_t **pmip)
1015 {
1016         mac_impl_t      *mip;
1017         int             err;
1018 
1019         /*
1020          * Check the device name length to make sure it won't overflow our
1021          * buffer.
1022          */
1023         if (strlen(macname) >= MAXNAMELEN)
1024                 return (EINVAL);
1025 
1026         /*
1027          * Look up its entry in the global hash table.
1028          */
1029         rw_enter(&i_mac_impl_lock, RW_WRITER);
1030         err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
1031             (mod_hash_val_t *)&mip);
1032 
1033         if (err != 0) {
1034                 rw_exit(&i_mac_impl_lock);
1035                 return (ENOENT);
1036         }
1037 
1038         if (mip->mi_state_flags & MIS_DISABLED) {
1039                 rw_exit(&i_mac_impl_lock);
1040                 return (ENOENT);
1041         }
1042 
1043         if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) {
1044                 rw_exit(&i_mac_impl_lock);
1045                 return (EBUSY);
1046         }
1047 
1048         mip->mi_ref++;
1049         rw_exit(&i_mac_impl_lock);
1050 
1051         *pmip = mip;
1052         return (0);
1053 }
1054 
1055 void
1056 mac_rele(mac_impl_t *mip)
1057 {
1058         rw_enter(&i_mac_impl_lock, RW_WRITER);
1059         ASSERT(mip->mi_ref != 0);
1060         if (--mip->mi_ref == 0) {
1061                 ASSERT(mip->mi_nactiveclients == 0 &&
1062                     !(mip->mi_state_flags & MIS_EXCLUSIVE));
1063         }
1064         rw_exit(&i_mac_impl_lock);
1065 }
1066 
1067 /*
1068  * Private GLDv3 function to start a MAC instance.
1069  */
1070 int
1071 mac_start(mac_handle_t mh)
1072 {
1073         mac_impl_t      *mip = (mac_impl_t *)mh;
1074         int             err = 0;
1075         mac_group_t     *defgrp;
1076 
1077         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1078         ASSERT(mip->mi_start != NULL);
1079 
1080         /*
1081          * Check whether the device is already started.
1082          */
1083         if (mip->mi_active++ == 0) {
1084                 mac_ring_t *ring = NULL;
1085 
1086                 /*
1087                  * Start the device.
1088                  */
1089                 err = mip->mi_start(mip->mi_driver);
1090                 if (err != 0) {
1091                         mip->mi_active--;
1092                         return (err);
1093                 }
1094 
1095                 /*
1096                  * Start the default tx ring.
1097                  */
1098                 if (mip->mi_default_tx_ring != NULL) {
1099 
1100                         ring = (mac_ring_t *)mip->mi_default_tx_ring;
1101                         if (ring->mr_state != MR_INUSE) {
1102                                 err = mac_start_ring(ring);
1103                                 if (err != 0) {
1104                                         mip->mi_active--;
1105                                         return (err);
1106                                 }
1107                         }
1108                 }
1109 
1110                 if ((defgrp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) {
1111                         /*
1112                          * Start the default ring, since it will be needed
1113                          * to receive broadcast and multicast traffic for
1114                          * both primary and non-primary MAC clients.
1115                          */
1116                         ASSERT(defgrp->mrg_state == MAC_GROUP_STATE_REGISTERED);
1117                         err = mac_start_group_and_rings(defgrp);
1118                         if (err != 0) {
1119                                 mip->mi_active--;
1120                                 if ((ring != NULL) &&
1121                                     (ring->mr_state == MR_INUSE))
1122                                         mac_stop_ring(ring);
1123                                 return (err);
1124                         }
1125                         mac_set_group_state(defgrp, MAC_GROUP_STATE_SHARED);
1126                 }
1127         }
1128 
1129         return (err);
1130 }
1131 
1132 /*
1133  * Private GLDv3 function to stop a MAC instance.
1134  */
1135 void
1136 mac_stop(mac_handle_t mh)
1137 {
1138         mac_impl_t      *mip = (mac_impl_t *)mh;
1139         mac_group_t     *grp;
1140 
1141         ASSERT(mip->mi_stop != NULL);
1142         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1143 
1144         /*
1145          * Check whether the device is still needed.
1146          */
1147         ASSERT(mip->mi_active != 0);
1148         if (--mip->mi_active == 0) {
1149                 if ((grp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) {
1150                         /*
1151                          * There should be no more active clients since the
1152                          * MAC is being stopped. Stop the default RX group
1153                          * and transition it back to registered state.
1154                          *
1155                          * When clients are torn down, the groups
1156                          * are release via mac_release_rx_group which
1157                          * knows the the default group is always in
1158                          * started mode since broadcast uses it. So
1159                          * we can assert that their are no clients
1160                          * (since mac_bcast_add doesn't register itself
1161                          * as a client) and group is in SHARED state.
1162                          */
1163                         ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
1164                         ASSERT(MAC_GROUP_NO_CLIENT(grp) &&
1165                             mip->mi_nactiveclients == 0);
1166                         mac_stop_group_and_rings(grp);
1167                         mac_set_group_state(grp, MAC_GROUP_STATE_REGISTERED);
1168                 }
1169 
1170                 if (mip->mi_default_tx_ring != NULL) {
1171                         mac_ring_t *ring;
1172 
1173                         ring = (mac_ring_t *)mip->mi_default_tx_ring;
1174                         if (ring->mr_state == MR_INUSE) {
1175                                 mac_stop_ring(ring);
1176                                 ring->mr_flag = 0;
1177                         }
1178                 }
1179 
1180                 /*
1181                  * Stop the device.
1182                  */
1183                 mip->mi_stop(mip->mi_driver);
1184         }
1185 }
1186 
1187 int
1188 i_mac_promisc_set(mac_impl_t *mip, boolean_t on)
1189 {
1190         int             err = 0;
1191 
1192         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1193         ASSERT(mip->mi_setpromisc != NULL);
1194 
1195         if (on) {
1196                 /*
1197                  * Enable promiscuous mode on the device if not yet enabled.
1198                  */
1199                 if (mip->mi_devpromisc++ == 0) {
1200                         err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
1201                         if (err != 0) {
1202                                 mip->mi_devpromisc--;
1203                                 return (err);
1204                         }
1205                         i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1206                 }
1207         } else {
1208                 if (mip->mi_devpromisc == 0)
1209                         return (EPROTO);
1210 
1211                 /*
1212                  * Disable promiscuous mode on the device if this is the last
1213                  * enabling.
1214                  */
1215                 if (--mip->mi_devpromisc == 0) {
1216                         err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
1217                         if (err != 0) {
1218                                 mip->mi_devpromisc++;
1219                                 return (err);
1220                         }
1221                         i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1222                 }
1223         }
1224 
1225         return (0);
1226 }
1227 
1228 /*
1229  * The promiscuity state can change any time. If the caller needs to take
1230  * actions that are atomic with the promiscuity state, then the caller needs
1231  * to bracket the entire sequence with mac_perim_enter/exit
1232  */
1233 boolean_t
1234 mac_promisc_get(mac_handle_t mh)
1235 {
1236         mac_impl_t              *mip = (mac_impl_t *)mh;
1237 
1238         /*
1239          * Return the current promiscuity.
1240          */
1241         return (mip->mi_devpromisc != 0);
1242 }
1243 
1244 /*
1245  * Invoked at MAC instance attach time to initialize the list
1246  * of factory MAC addresses supported by a MAC instance. This function
1247  * builds a local cache in the mac_impl_t for the MAC addresses
1248  * supported by the underlying hardware. The MAC clients themselves
1249  * use the mac_addr_factory*() functions to query and reserve
1250  * factory MAC addresses.
1251  */
1252 void
1253 mac_addr_factory_init(mac_impl_t *mip)
1254 {
1255         mac_capab_multifactaddr_t capab;
1256         uint8_t *addr;
1257         int i;
1258 
1259         /*
1260          * First round to see how many factory MAC addresses are available.
1261          */
1262         bzero(&capab, sizeof (capab));
1263         if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR,
1264             &capab) || (capab.mcm_naddr == 0)) {
1265                 /*
1266                  * The MAC instance doesn't support multiple factory
1267                  * MAC addresses, we're done here.
1268                  */
1269                 return;
1270         }
1271 
1272         /*
1273          * Allocate the space and get all the factory addresses.
1274          */
1275         addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP);
1276         capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr);
1277 
1278         mip->mi_factory_addr_num = capab.mcm_naddr;
1279         mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num *
1280             sizeof (mac_factory_addr_t), KM_SLEEP);
1281 
1282         for (i = 0; i < capab.mcm_naddr; i++) {
1283                 bcopy(addr + i * MAXMACADDRLEN,
1284                     mip->mi_factory_addr[i].mfa_addr,
1285                     mip->mi_type->mt_addr_length);
1286                 mip->mi_factory_addr[i].mfa_in_use = B_FALSE;
1287         }
1288 
1289         kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN);
1290 }
1291 
1292 void
1293 mac_addr_factory_fini(mac_impl_t *mip)
1294 {
1295         if (mip->mi_factory_addr == NULL) {
1296                 ASSERT(mip->mi_factory_addr_num == 0);
1297                 return;
1298         }
1299 
1300         kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num *
1301             sizeof (mac_factory_addr_t));
1302 
1303         mip->mi_factory_addr = NULL;
1304         mip->mi_factory_addr_num = 0;
1305 }
1306 
1307 /*
1308  * Reserve a factory MAC address. If *slot is set to -1, the function
1309  * attempts to reserve any of the available factory MAC addresses and
1310  * returns the reserved slot id. If no slots are available, the function
1311  * returns ENOSPC. If *slot is not set to -1, the function reserves
1312  * the specified slot if it is available, or returns EBUSY is the slot
1313  * is already used. Returns ENOTSUP if the underlying MAC does not
1314  * support multiple factory addresses. If the slot number is not -1 but
1315  * is invalid, returns EINVAL.
1316  */
1317 int
1318 mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
1319 {
1320         mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1321         mac_impl_t *mip = mcip->mci_mip;
1322         int i, ret = 0;
1323 
1324         i_mac_perim_enter(mip);
1325         /*
1326          * Protect against concurrent readers that may need a self-consistent
1327          * view of the factory addresses
1328          */
1329         rw_enter(&mip->mi_rw_lock, RW_WRITER);
1330 
1331         if (mip->mi_factory_addr_num == 0) {
1332                 ret = ENOTSUP;
1333                 goto bail;
1334         }
1335 
1336         if (*slot != -1) {
1337                 /* check the specified slot */
1338                 if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
1339                         ret = EINVAL;
1340                         goto bail;
1341                 }
1342                 if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
1343                         ret = EBUSY;
1344                         goto bail;
1345                 }
1346         } else {
1347                 /* pick the next available slot */
1348                 for (i = 0; i < mip->mi_factory_addr_num; i++) {
1349                         if (!mip->mi_factory_addr[i].mfa_in_use)
1350                                 break;
1351                 }
1352 
1353                 if (i == mip->mi_factory_addr_num) {
1354                         ret = ENOSPC;
1355                         goto bail;
1356                 }
1357                 *slot = i+1;
1358         }
1359 
1360         mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
1361         mip->mi_factory_addr[*slot-1].mfa_client = mcip;
1362 
1363 bail:
1364         rw_exit(&mip->mi_rw_lock);
1365         i_mac_perim_exit(mip);
1366         return (ret);
1367 }
1368 
1369 /*
1370  * Release the specified factory MAC address slot.
1371  */
1372 void
1373 mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
1374 {
1375         mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1376         mac_impl_t *mip = mcip->mci_mip;
1377 
1378         i_mac_perim_enter(mip);
1379         /*
1380          * Protect against concurrent readers that may need a self-consistent
1381          * view of the factory addresses
1382          */
1383         rw_enter(&mip->mi_rw_lock, RW_WRITER);
1384 
1385         ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1386         ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);
1387 
1388         mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;
1389 
1390         rw_exit(&mip->mi_rw_lock);
1391         i_mac_perim_exit(mip);
1392 }
1393 
1394 /*
1395  * Stores in mac_addr the value of the specified MAC address. Returns
1396  * 0 on success, or EINVAL if the slot number is not valid for the MAC.
1397  * The caller must provide a string of at least MAXNAMELEN bytes.
1398  */
1399 void
1400 mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
1401     uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
1402 {
1403         mac_impl_t *mip = (mac_impl_t *)mh;
1404         boolean_t in_use;
1405 
1406         ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1407 
1408         /*
1409          * Readers need to hold mi_rw_lock. Writers need to hold mac perimeter
1410          * and mi_rw_lock
1411          */
1412         rw_enter(&mip->mi_rw_lock, RW_READER);
1413         bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
1414         *addr_len = mip->mi_type->mt_addr_length;
1415         in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
1416         if (in_use && client_name != NULL) {
1417                 bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
1418                     client_name, MAXNAMELEN);
1419         }
1420         if (in_use_arg != NULL)
1421                 *in_use_arg = in_use;
1422         rw_exit(&mip->mi_rw_lock);
1423 }
1424 
1425 /*
1426  * Returns the number of factory MAC addresses (in addition to the
1427  * primary MAC address), 0 if the underlying MAC doesn't support
1428  * that feature.
1429  */
1430 uint_t
1431 mac_addr_factory_num(mac_handle_t mh)
1432 {
1433         mac_impl_t *mip = (mac_impl_t *)mh;
1434 
1435         return (mip->mi_factory_addr_num);
1436 }
1437 
1438 
1439 void
1440 mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
1441 {
1442         mac_ring_t      *ring;
1443 
1444         for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
1445                 ring->mr_flag &= ~flag;
1446 }
1447 
1448 /*
1449  * The following mac_hwrings_xxx() functions are private mac client functions
1450  * used by the aggr driver to access and control the underlying HW Rx group
1451  * and rings. In this case, the aggr driver has exclusive control of the
1452  * underlying HW Rx group/rings, it calls the following functions to
1453  * start/stop the HW Rx rings, disable/enable polling, add/remove mac'
1454  * addresses, or set up the Rx callback.
1455  */
1456 /* ARGSUSED */
1457 static void
1458 mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs,
1459     mblk_t *mp_chain, boolean_t loopback)
1460 {
1461         mac_soft_ring_set_t     *mac_srs = (mac_soft_ring_set_t *)srs;
1462         mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
1463         mac_direct_rx_t         proc;
1464         void                    *arg1;
1465         mac_resource_handle_t   arg2;
1466 
1467         proc = srs_rx->sr_func;
1468         arg1 = srs_rx->sr_arg1;
1469         arg2 = mac_srs->srs_mrh;
1470 
1471         proc(arg1, arg2, mp_chain, NULL);
1472 }
1473 
1474 /*
1475  * This function is called to get the list of HW rings that are reserved by
1476  * an exclusive mac client.
1477  *
1478  * Return value: the number of HW rings.
1479  */
1480 int
1481 mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
1482     mac_ring_handle_t *hwrh, mac_ring_type_t rtype)
1483 {
1484         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
1485         flow_entry_t            *flent = mcip->mci_flent;
1486         mac_group_t             *grp;
1487         mac_ring_t              *ring;
1488         int                     cnt = 0;
1489 
1490         if (rtype == MAC_RING_TYPE_RX) {
1491                 grp = flent->fe_rx_ring_group;
1492         } else if (rtype == MAC_RING_TYPE_TX) {
1493                 grp = flent->fe_tx_ring_group;
1494         } else {
1495                 ASSERT(B_FALSE);
1496                 return (-1);
1497         }
1498         /*
1499          * The mac client did not reserve any RX group, return directly.
1500          * This is probably because the underlying MAC does not support
1501          * any groups.
1502          */
1503         if (hwgh != NULL)
1504                 *hwgh = NULL;
1505         if (grp == NULL)
1506                 return (0);
1507         /*
1508          * This group must be reserved by this mac client.
1509          */
1510         ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
1511             (mcip == MAC_GROUP_ONLY_CLIENT(grp)));
1512 
1513         for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next, cnt++) {
1514                 ASSERT(cnt < MAX_RINGS_PER_GROUP);
1515                 hwrh[cnt] = (mac_ring_handle_t)ring;
1516         }
1517         if (hwgh != NULL)
1518                 *hwgh = (mac_group_handle_t)grp;
1519 
1520         return (cnt);
1521 }
1522 
1523 /*
1524  * This function is called to get info about Tx/Rx rings.
1525  *
1526  * Return value: returns uint_t which will have various bits set
1527  * that indicates different properties of the ring.
1528  */
1529 uint_t
1530 mac_hwring_getinfo(mac_ring_handle_t rh)
1531 {
1532         mac_ring_t *ring = (mac_ring_t *)rh;
1533         mac_ring_info_t *info = &ring->mr_info;
1534 
1535         return (info->mri_flags);
1536 }
1537 
1538 /*
1539  * Export ddi interrupt handles from the HW ring to the pseudo ring and
1540  * setup the RX callback of the mac client which exclusively controls
1541  * HW ring.
1542  */
1543 void
1544 mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh,
1545     mac_ring_handle_t pseudo_rh)
1546 {
1547         mac_ring_t              *hw_ring = (mac_ring_t *)hwrh;
1548         mac_ring_t              *pseudo_ring;
1549         mac_soft_ring_set_t     *mac_srs = hw_ring->mr_srs;
1550 
1551         if (pseudo_rh != NULL) {
1552                 pseudo_ring = (mac_ring_t *)pseudo_rh;
1553                 /* Export the ddi handles to pseudo ring */
1554                 pseudo_ring->mr_info.mri_intr.mi_ddi_handle =
1555                     hw_ring->mr_info.mri_intr.mi_ddi_handle;
1556                 pseudo_ring->mr_info.mri_intr.mi_ddi_shared =
1557                     hw_ring->mr_info.mri_intr.mi_ddi_shared;
1558                 /*
1559                  * Save a pointer to pseudo ring in the hw ring. If
1560                  * interrupt handle changes, the hw ring will be
1561                  * notified of the change (see mac_ring_intr_set())
1562                  * and the appropriate change has to be made to
1563                  * the pseudo ring that has exported the ddi handle.
1564                  */
1565                 hw_ring->mr_prh = pseudo_rh;
1566         }
1567 
1568         if (hw_ring->mr_type == MAC_RING_TYPE_RX) {
1569                 ASSERT(!(mac_srs->srs_type & SRST_TX));
1570                 mac_srs->srs_mrh = prh;
1571                 mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process;
1572         }
1573 }
1574 
1575 void
1576 mac_hwring_teardown(mac_ring_handle_t hwrh)
1577 {
1578         mac_ring_t              *hw_ring = (mac_ring_t *)hwrh;
1579         mac_soft_ring_set_t     *mac_srs;
1580 
1581         if (hw_ring == NULL)
1582                 return;
1583         hw_ring->mr_prh = NULL;
1584         if (hw_ring->mr_type == MAC_RING_TYPE_RX) {
1585                 mac_srs = hw_ring->mr_srs;
1586                 ASSERT(!(mac_srs->srs_type & SRST_TX));
1587                 mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process;
1588                 mac_srs->srs_mrh = NULL;
1589         }
1590 }
1591 
1592 int
1593 mac_hwring_disable_intr(mac_ring_handle_t rh)
1594 {
1595         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1596         mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1597 
1598         return (intr->mi_disable(intr->mi_handle));
1599 }
1600 
1601 int
1602 mac_hwring_enable_intr(mac_ring_handle_t rh)
1603 {
1604         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1605         mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1606 
1607         return (intr->mi_enable(intr->mi_handle));
1608 }
1609 
1610 int
1611 mac_hwring_start(mac_ring_handle_t rh)
1612 {
1613         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1614 
1615         MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
1616         return (0);
1617 }
1618 
1619 void
1620 mac_hwring_stop(mac_ring_handle_t rh)
1621 {
1622         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1623 
1624         mac_rx_ring_quiesce(rr_ring, MR_QUIESCE);
1625 }
1626 
1627 mblk_t *
1628 mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup)
1629 {
1630         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1631         mac_ring_info_t *info = &rr_ring->mr_info;
1632 
1633         return (info->mri_poll(info->mri_driver, bytes_to_pickup));
1634 }
1635 
1636 /*
1637  * Send packets through a selected tx ring.
1638  */
1639 mblk_t *
1640 mac_hwring_tx(mac_ring_handle_t rh, mblk_t *mp)
1641 {
1642         mac_ring_t *ring = (mac_ring_t *)rh;
1643         mac_ring_info_t *info = &ring->mr_info;
1644 
1645         ASSERT(ring->mr_type == MAC_RING_TYPE_TX &&
1646             ring->mr_state >= MR_INUSE);
1647         return (info->mri_tx(info->mri_driver, mp));
1648 }
1649 
1650 /*
1651  * Query stats for a particular rx/tx ring
1652  */
1653 int
1654 mac_hwring_getstat(mac_ring_handle_t rh, uint_t stat, uint64_t *val)
1655 {
1656         mac_ring_t      *ring = (mac_ring_t *)rh;
1657         mac_ring_info_t *info = &ring->mr_info;
1658 
1659         return (info->mri_stat(info->mri_driver, stat, val));
1660 }
1661 
1662 /*
1663  * Private function that is only used by aggr to send packets through
1664  * a port/Tx ring. Since aggr exposes a pseudo Tx ring even for ports
1665  * that does not expose Tx rings, aggr_ring_tx() entry point needs
1666  * access to mac_impl_t to send packets through m_tx() entry point.
1667  * It accomplishes this by calling mac_hwring_send_priv() function.
1668  */
1669 mblk_t *
1670 mac_hwring_send_priv(mac_client_handle_t mch, mac_ring_handle_t rh, mblk_t *mp)
1671 {
1672         mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1673         mac_impl_t *mip = mcip->mci_mip;
1674 
1675         MAC_TX(mip, rh, mp, mcip);
1676         return (mp);
1677 }
1678 
1679 int
1680 mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
1681 {
1682         mac_group_t *group = (mac_group_t *)gh;
1683 
1684         return (mac_group_addmac(group, addr));
1685 }
1686 
1687 int
1688 mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
1689 {
1690         mac_group_t *group = (mac_group_t *)gh;
1691 
1692         return (mac_group_remmac(group, addr));
1693 }
1694 
1695 /*
1696  * Set the RX group to be shared/reserved. Note that the group must be
1697  * started/stopped outside of this function.
1698  */
1699 void
1700 mac_set_group_state(mac_group_t *grp, mac_group_state_t state)
1701 {
1702         /*
1703          * If there is no change in the group state, just return.
1704          */
1705         if (grp->mrg_state == state)
1706                 return;
1707 
1708         switch (state) {
1709         case MAC_GROUP_STATE_RESERVED:
1710                 /*
1711                  * Successfully reserved the group.
1712                  *
1713                  * Given that there is an exclusive client controlling this
1714                  * group, we enable the group level polling when available,
1715                  * so that SRSs get to turn on/off individual rings they's
1716                  * assigned to.
1717                  */
1718                 ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
1719 
1720                 if (grp->mrg_type == MAC_RING_TYPE_RX &&
1721                     GROUP_INTR_DISABLE_FUNC(grp) != NULL) {
1722                         GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
1723                 }
1724                 break;
1725 
1726         case MAC_GROUP_STATE_SHARED:
1727                 /*
1728                  * Set all rings of this group to software classified.
1729                  * If the group has an overriding interrupt, then re-enable it.
1730                  */
1731                 ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
1732 
1733                 if (grp->mrg_type == MAC_RING_TYPE_RX &&
1734                     GROUP_INTR_ENABLE_FUNC(grp) != NULL) {
1735                         GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
1736                 }
1737                 /* The ring is not available for reservations any more */
1738                 break;
1739 
1740         case MAC_GROUP_STATE_REGISTERED:
1741                 /* Also callable from mac_register, perim is not held */
1742                 break;
1743 
1744         default:
1745                 ASSERT(B_FALSE);
1746                 break;
1747         }
1748 
1749         grp->mrg_state = state;
1750 }
1751 
1752 /*
1753  * Quiesce future hardware classified packets for the specified Rx ring
1754  */
1755 static void
1756 mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
1757 {
1758         ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
1759         ASSERT(ring_flag == MR_CONDEMNED || ring_flag  == MR_QUIESCE);
1760 
1761         mutex_enter(&rx_ring->mr_lock);
1762         rx_ring->mr_flag |= ring_flag;
1763         while (rx_ring->mr_refcnt != 0)
1764                 cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
1765         mutex_exit(&rx_ring->mr_lock);
1766 }
1767 
1768 /*
1769  * Please see mac_tx for details about the per cpu locking scheme
1770  */
1771 static void
1772 mac_tx_lock_all(mac_client_impl_t *mcip)
1773 {
1774         int     i;
1775 
1776         for (i = 0; i <= mac_tx_percpu_cnt; i++)
1777                 mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1778 }
1779 
1780 static void
1781 mac_tx_unlock_all(mac_client_impl_t *mcip)
1782 {
1783         int     i;
1784 
1785         for (i = mac_tx_percpu_cnt; i >= 0; i--)
1786                 mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1787 }
1788 
1789 static void
1790 mac_tx_unlock_allbutzero(mac_client_impl_t *mcip)
1791 {
1792         int     i;
1793 
1794         for (i = mac_tx_percpu_cnt; i > 0; i--)
1795                 mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1796 }
1797 
1798 static int
1799 mac_tx_sum_refcnt(mac_client_impl_t *mcip)
1800 {
1801         int     i;
1802         int     refcnt = 0;
1803 
1804         for (i = 0; i <= mac_tx_percpu_cnt; i++)
1805                 refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt;
1806 
1807         return (refcnt);
1808 }
1809 
1810 /*
1811  * Stop future Tx packets coming down from the client in preparation for
1812  * quiescing the Tx side. This is needed for dynamic reclaim and reassignment
1813  * of rings between clients
1814  */
1815 void
1816 mac_tx_client_block(mac_client_impl_t *mcip)
1817 {
1818         mac_tx_lock_all(mcip);
1819         mcip->mci_tx_flag |= MCI_TX_QUIESCE;
1820         while (mac_tx_sum_refcnt(mcip) != 0) {
1821                 mac_tx_unlock_allbutzero(mcip);
1822                 cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1823                 mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1824                 mac_tx_lock_all(mcip);
1825         }
1826         mac_tx_unlock_all(mcip);
1827 }
1828 
1829 void
1830 mac_tx_client_unblock(mac_client_impl_t *mcip)
1831 {
1832         mac_tx_lock_all(mcip);
1833         mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
1834         mac_tx_unlock_all(mcip);
1835         /*
1836          * We may fail to disable flow control for the last MAC_NOTE_TX
1837          * notification because the MAC client is quiesced. Send the
1838          * notification again.
1839          */
1840         i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
1841 }
1842 
1843 /*
1844  * Wait for an SRS to quiesce. The SRS worker will signal us when the
1845  * quiesce is done.
1846  */
1847 static void
1848 mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag)
1849 {
1850         mutex_enter(&srs->srs_lock);
1851         while (!(srs->srs_state & srs_flag))
1852                 cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
1853         mutex_exit(&srs->srs_lock);
1854 }
1855 
1856 /*
1857  * Quiescing an Rx SRS is achieved by the following sequence. The protocol
1858  * works bottom up by cutting off packet flow from the bottommost point in the
1859  * mac, then the SRS, and then the soft rings. There are 2 use cases of this
1860  * mechanism. One is a temporary quiesce of the SRS, such as say while changing
1861  * the Rx callbacks. Another use case is Rx SRS teardown. In the former case
1862  * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used
1863  * for the SRS and MR flags. In the former case the threads pause waiting for
1864  * a restart, while in the latter case the threads exit. The Tx SRS teardown
1865  * is also mostly similar to the above.
1866  *
1867  * 1. Stop future hardware classified packets at the lowest level in the mac.
1868  *    Remove any hardware classification rule (CONDEMNED case) and mark the
1869  *    rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt
1870  *    from increasing. Upcalls from the driver that come through hardware
1871  *    classification will be dropped in mac_rx from now on. Then we wait for
1872  *    the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
1873  *    sure there aren't any upcall threads from the driver through hardware
1874  *    classification. In the case of SRS teardown we also remove the
1875  *    classification rule in the driver.
1876  *
1877  * 2. Stop future software classified packets by marking the flow entry with
1878  *    FE_QUIESCE or FE_CONDEMNED as appropriate which prevents the refcnt from
1879  *    increasing. We also remove the flow entry from the table in the latter
1880  *    case. Then wait for the fe_refcnt to reach an appropriate quiescent value
1881  *    that indicates there aren't any active threads using that flow entry.
1882  *
1883  * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
1884  *    SRS worker thread, and the soft ring threads are quiesced in sequence
1885  *    with the SRS worker thread serving as a master controller. This
1886  *    mechansim is explained in mac_srs_worker_quiesce().
1887  *
1888  * The restart mechanism to reactivate the SRS and softrings is explained
1889  * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
1890  * restart sequence.
1891  */
1892 void
1893 mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
1894 {
1895         flow_entry_t    *flent = srs->srs_flent;
1896         uint_t  mr_flag, srs_done_flag;
1897 
1898         ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1899         ASSERT(!(srs->srs_type & SRST_TX));
1900 
1901         if (srs_quiesce_flag == SRS_CONDEMNED) {
1902                 mr_flag = MR_CONDEMNED;
1903                 srs_done_flag = SRS_CONDEMNED_DONE;
1904                 if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1905                         mac_srs_client_poll_disable(srs->srs_mcip, srs);
1906         } else {
1907                 ASSERT(srs_quiesce_flag == SRS_QUIESCE);
1908                 mr_flag = MR_QUIESCE;
1909                 srs_done_flag = SRS_QUIESCE_DONE;
1910                 if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1911                         mac_srs_client_poll_quiesce(srs->srs_mcip, srs);
1912         }
1913 
1914         if (srs->srs_ring != NULL) {
1915                 mac_rx_ring_quiesce(srs->srs_ring, mr_flag);
1916         } else {
1917                 /*
1918                  * SRS is driven by software classification. In case
1919                  * of CONDEMNED, the top level teardown functions will
1920                  * deal with flow removal.
1921                  */
1922                 if (srs_quiesce_flag != SRS_CONDEMNED) {
1923                         FLOW_MARK(flent, FE_QUIESCE);
1924                         mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1925                 }
1926         }
1927 
1928         /*
1929          * Signal the SRS to quiesce itself, and then cv_wait for the
1930          * SRS quiesce to complete. The SRS worker thread will wake us
1931          * up when the quiesce is complete
1932          */
1933         mac_srs_signal(srs, srs_quiesce_flag);
1934         mac_srs_quiesce_wait(srs, srs_done_flag);
1935 }
1936 
1937 /*
1938  * Remove an SRS.
1939  */
1940 void
1941 mac_rx_srs_remove(mac_soft_ring_set_t *srs)
1942 {
1943         flow_entry_t *flent = srs->srs_flent;
1944         int i;
1945 
1946         mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
1947         /*
1948          * Locate and remove our entry in the fe_rx_srs[] array, and
1949          * adjust the fe_rx_srs array entries and array count by
1950          * moving the last entry into the vacated spot.
1951          */
1952         mutex_enter(&flent->fe_lock);
1953         for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1954                 if (flent->fe_rx_srs[i] == srs)
1955                         break;
1956         }
1957 
1958         ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
1959         if (i != flent->fe_rx_srs_cnt - 1) {
1960                 flent->fe_rx_srs[i] =
1961                     flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
1962                 i = flent->fe_rx_srs_cnt - 1;
1963         }
1964 
1965         flent->fe_rx_srs[i] = NULL;
1966         flent->fe_rx_srs_cnt--;
1967         mutex_exit(&flent->fe_lock);
1968 
1969         mac_srs_free(srs);
1970 }
1971 
1972 static void
1973 mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
1974 {
1975         mutex_enter(&srs->srs_lock);
1976         srs->srs_state &= ~flag;
1977         mutex_exit(&srs->srs_lock);
1978 }
1979 
1980 void
1981 mac_rx_srs_restart(mac_soft_ring_set_t *srs)
1982 {
1983         flow_entry_t    *flent = srs->srs_flent;
1984         mac_ring_t      *mr;
1985 
1986         ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1987         ASSERT((srs->srs_type & SRST_TX) == 0);
1988 
1989         /*
1990          * This handles a change in the number of SRSs between the quiesce and
1991          * and restart operation of a flow.
1992          */
1993         if (!SRS_QUIESCED(srs))
1994                 return;
1995 
1996         /*
1997          * Signal the SRS to restart itself. Wait for the restart to complete
1998          * Note that we only restart the SRS if it is not marked as
1999          * permanently quiesced.
2000          */
2001         if (!SRS_QUIESCED_PERMANENT(srs)) {
2002                 mac_srs_signal(srs, SRS_RESTART);
2003                 mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2004                 mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2005 
2006                 mac_srs_client_poll_restart(srs->srs_mcip, srs);
2007         }
2008 
2009         /* Finally clear the flags to let the packets in */
2010         mr = srs->srs_ring;
2011         if (mr != NULL) {
2012                 MAC_RING_UNMARK(mr, MR_QUIESCE);
2013                 /* In case the ring was stopped, safely restart it */
2014                 if (mr->mr_state != MR_INUSE)
2015                         (void) mac_start_ring(mr);
2016         } else {
2017                 FLOW_UNMARK(flent, FE_QUIESCE);
2018         }
2019 }
2020 
2021 /*
2022  * Temporary quiesce of a flow and associated Rx SRS.
2023  * Please see block comment above mac_rx_classify_flow_rem.
2024  */
2025 /* ARGSUSED */
2026 int
2027 mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg)
2028 {
2029         int             i;
2030 
2031         for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
2032                 mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i],
2033                     SRS_QUIESCE);
2034         }
2035         return (0);
2036 }
2037 
2038 /*
2039  * Restart a flow and associated Rx SRS that has been quiesced temporarily
2040  * Please see block comment above mac_rx_classify_flow_rem
2041  */
2042 /* ARGSUSED */
2043 int
2044 mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg)
2045 {
2046         int             i;
2047 
2048         for (i = 0; i < flent->fe_rx_srs_cnt; i++)
2049                 mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]);
2050 
2051         return (0);
2052 }
2053 
2054 void
2055 mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on)
2056 {
2057         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
2058         flow_entry_t            *flent = mcip->mci_flent;
2059         mac_impl_t              *mip = mcip->mci_mip;
2060         mac_soft_ring_set_t     *mac_srs;
2061         int                     i;
2062 
2063         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2064 
2065         if (flent == NULL)
2066                 return;
2067 
2068         for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
2069                 mac_srs = flent->fe_rx_srs[i];
2070                 mutex_enter(&mac_srs->srs_lock);
2071                 if (on)
2072                         mac_srs->srs_state |= SRS_QUIESCE_PERM;
2073                 else
2074                         mac_srs->srs_state &= ~SRS_QUIESCE_PERM;
2075                 mutex_exit(&mac_srs->srs_lock);
2076         }
2077 }
2078 
2079 void
2080 mac_rx_client_quiesce(mac_client_handle_t mch)
2081 {
2082         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
2083         mac_impl_t              *mip = mcip->mci_mip;
2084 
2085         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2086 
2087         if (MCIP_DATAPATH_SETUP(mcip)) {
2088                 (void) mac_rx_classify_flow_quiesce(mcip->mci_flent,
2089                     NULL);
2090                 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2091                     mac_rx_classify_flow_quiesce, NULL);
2092         }
2093 }
2094 
2095 void
2096 mac_rx_client_restart(mac_client_handle_t mch)
2097 {
2098         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
2099         mac_impl_t              *mip = mcip->mci_mip;
2100 
2101         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2102 
2103         if (MCIP_DATAPATH_SETUP(mcip)) {
2104                 (void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL);
2105                 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2106                     mac_rx_classify_flow_restart, NULL);
2107         }
2108 }
2109 
2110 /*
2111  * This function only quiesces the Tx SRS and softring worker threads. Callers
2112  * need to make sure that there aren't any mac client threads doing current or
2113  * future transmits in the mac before calling this function.
2114  */
2115 void
2116 mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
2117 {
2118         mac_client_impl_t       *mcip = srs->srs_mcip;
2119 
2120         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2121 
2122         ASSERT(srs->srs_type & SRST_TX);
2123         ASSERT(srs_quiesce_flag == SRS_CONDEMNED ||
2124             srs_quiesce_flag == SRS_QUIESCE);
2125 
2126         /*
2127          * Signal the SRS to quiesce itself, and then cv_wait for the
2128          * SRS quiesce to complete. The SRS worker thread will wake us
2129          * up when the quiesce is complete
2130          */
2131         mac_srs_signal(srs, srs_quiesce_flag);
2132         mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ?
2133             SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE);
2134 }
2135 
2136 void
2137 mac_tx_srs_restart(mac_soft_ring_set_t *srs)
2138 {
2139         /*
2140          * Resizing the fanout could result in creation of new SRSs.
2141          * They may not necessarily be in the quiesced state in which
2142          * case it need be restarted
2143          */
2144         if (!SRS_QUIESCED(srs))
2145                 return;
2146 
2147         mac_srs_signal(srs, SRS_RESTART);
2148         mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2149         mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2150 }
2151 
2152 /*
2153  * Temporary quiesce of a flow and associated Rx SRS.
2154  * Please see block comment above mac_rx_srs_quiesce
2155  */
2156 /* ARGSUSED */
2157 int
2158 mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
2159 {
2160         /*
2161          * The fe_tx_srs is null for a subflow on an interface that is
2162          * not plumbed
2163          */
2164         if (flent->fe_tx_srs != NULL)
2165                 mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
2166         return (0);
2167 }
2168 
2169 /* ARGSUSED */
2170 int
2171 mac_tx_flow_restart(flow_entry_t *flent, void *arg)
2172 {
2173         /*
2174          * The fe_tx_srs is null for a subflow on an interface that is
2175          * not plumbed
2176          */
2177         if (flent->fe_tx_srs != NULL)
2178                 mac_tx_srs_restart(flent->fe_tx_srs);
2179         return (0);
2180 }
2181 
2182 static void
2183 i_mac_tx_client_quiesce(mac_client_handle_t mch, uint_t srs_quiesce_flag)
2184 {
2185         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
2186 
2187         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2188 
2189         mac_tx_client_block(mcip);
2190         if (MCIP_TX_SRS(mcip) != NULL) {
2191                 mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag);
2192                 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2193                     mac_tx_flow_quiesce, NULL);
2194         }
2195 }
2196 
2197 void
2198 mac_tx_client_quiesce(mac_client_handle_t mch)
2199 {
2200         i_mac_tx_client_quiesce(mch, SRS_QUIESCE);
2201 }
2202 
2203 void
2204 mac_tx_client_condemn(mac_client_handle_t mch)
2205 {
2206         i_mac_tx_client_quiesce(mch, SRS_CONDEMNED);
2207 }
2208 
2209 void
2210 mac_tx_client_restart(mac_client_handle_t mch)
2211 {
2212         mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
2213 
2214         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2215 
2216         mac_tx_client_unblock(mcip);
2217         if (MCIP_TX_SRS(mcip) != NULL) {
2218                 mac_tx_srs_restart(MCIP_TX_SRS(mcip));
2219                 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2220                     mac_tx_flow_restart, NULL);
2221         }
2222 }
2223 
2224 void
2225 mac_tx_client_flush(mac_client_impl_t *mcip)
2226 {
2227         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2228 
2229         mac_tx_client_quiesce((mac_client_handle_t)mcip);
2230         mac_tx_client_restart((mac_client_handle_t)mcip);
2231 }
2232 
2233 void
2234 mac_client_quiesce(mac_client_impl_t *mcip)
2235 {
2236         mac_rx_client_quiesce((mac_client_handle_t)mcip);
2237         mac_tx_client_quiesce((mac_client_handle_t)mcip);
2238 }
2239 
2240 void
2241 mac_client_restart(mac_client_impl_t *mcip)
2242 {
2243         mac_rx_client_restart((mac_client_handle_t)mcip);
2244         mac_tx_client_restart((mac_client_handle_t)mcip);
2245 }
2246 
2247 /*
2248  * Allocate a minor number.
2249  */
2250 minor_t
2251 mac_minor_hold(boolean_t sleep)
2252 {
2253         minor_t minor;
2254 
2255         /*
2256          * Grab a value from the arena.
2257          */
2258         atomic_add_32(&minor_count, 1);
2259 
2260         if (sleep)
2261                 minor = (uint_t)id_alloc(minor_ids);
2262         else
2263                 minor = (uint_t)id_alloc_nosleep(minor_ids);
2264 
2265         if (minor == 0) {
2266                 atomic_add_32(&minor_count, -1);
2267                 return (0);
2268         }
2269 
2270         return (minor);
2271 }
2272 
2273 /*
2274  * Release a previously allocated minor number.
2275  */
2276 void
2277 mac_minor_rele(minor_t minor)
2278 {
2279         /*
2280          * Return the value to the arena.
2281          */
2282         id_free(minor_ids, minor);
2283         atomic_add_32(&minor_count, -1);
2284 }
2285 
2286 uint32_t
2287 mac_no_notification(mac_handle_t mh)
2288 {
2289         mac_impl_t *mip = (mac_impl_t *)mh;
2290 
2291         return (((mip->mi_state_flags & MIS_LEGACY) != 0) ?
2292             mip->mi_capab_legacy.ml_unsup_note : 0);
2293 }
2294 
2295 /*
2296  * Prevent any new opens of this mac in preparation for unregister
2297  */
2298 int
2299 i_mac_disable(mac_impl_t *mip)
2300 {
2301         mac_client_impl_t       *mcip;
2302 
2303         rw_enter(&i_mac_impl_lock, RW_WRITER);
2304         if (mip->mi_state_flags & MIS_DISABLED) {
2305                 /* Already disabled, return success */
2306                 rw_exit(&i_mac_impl_lock);
2307                 return (0);
2308         }
2309         /*
2310          * See if there are any other references to this mac_t (e.g., VLAN's).
2311          * If so return failure. If all the other checks below pass, then
2312          * set mi_disabled atomically under the i_mac_impl_lock to prevent
2313          * any new VLAN's from being created or new mac client opens of this
2314          * mac end point.
2315          */
2316         if (mip->mi_ref > 0) {
2317                 rw_exit(&i_mac_impl_lock);
2318                 return (EBUSY);
2319         }
2320 
2321         /*
2322          * mac clients must delete all multicast groups they join before
2323          * closing. bcast groups are reference counted, the last client
2324          * to delete the group will wait till the group is physically
2325          * deleted. Since all clients have closed this mac end point
2326          * mi_bcast_ngrps must be zero at this point
2327          */
2328         ASSERT(mip->mi_bcast_ngrps == 0);
2329 
2330         /*
2331          * Don't let go of this if it has some flows.
2332          * All other code guarantees no flows are added to a disabled
2333          * mac, therefore it is sufficient to check for the flow table
2334          * only here.
2335          */
2336         mcip = mac_primary_client_handle(mip);
2337         if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) {
2338                 rw_exit(&i_mac_impl_lock);
2339                 return (ENOTEMPTY);
2340         }
2341 
2342         mip->mi_state_flags |= MIS_DISABLED;
2343         rw_exit(&i_mac_impl_lock);
2344         return (0);
2345 }
2346 
2347 int
2348 mac_disable_nowait(mac_handle_t mh)
2349 {
2350         mac_impl_t      *mip = (mac_impl_t *)mh;
2351         int err;
2352 
2353         if ((err = i_mac_perim_enter_nowait(mip)) != 0)
2354                 return (err);
2355         err = i_mac_disable(mip);
2356         i_mac_perim_exit(mip);
2357         return (err);
2358 }
2359 
2360 int
2361 mac_disable(mac_handle_t mh)
2362 {
2363         mac_impl_t      *mip = (mac_impl_t *)mh;
2364         int err;
2365 
2366         i_mac_perim_enter(mip);
2367         err = i_mac_disable(mip);
2368         i_mac_perim_exit(mip);
2369 
2370         /*
2371          * Clean up notification thread and wait for it to exit.
2372          */
2373         if (err == 0)
2374                 i_mac_notify_exit(mip);
2375 
2376         return (err);
2377 }
2378 
2379 /*
2380  * Called when the MAC instance has a non empty flow table, to de-multiplex
2381  * incoming packets to the right flow.
2382  * The MAC's rw lock is assumed held as a READER.
2383  */
2384 /* ARGSUSED */
2385 static mblk_t *
2386 mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp)
2387 {
2388         flow_entry_t    *flent = NULL;
2389         uint_t          flags = FLOW_INBOUND;
2390         int             err;
2391 
2392         /*
2393          * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN
2394          * to mac_flow_lookup() so that the VLAN packets can be successfully
2395          * passed to the non-VLAN aggregation flows.
2396          *
2397          * Note that there is possibly a race between this and
2398          * mac_unicast_remove/add() and VLAN packets could be incorrectly
2399          * classified to non-VLAN flows of non-aggregation mac clients. These
2400          * VLAN packets will be then filtered out by the mac module.
2401          */
2402         if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0)
2403                 flags |= FLOW_IGNORE_VLAN;
2404 
2405         err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent);
2406         if (err != 0) {
2407                 /* no registered receive function */
2408                 return (mp);
2409         } else {
2410                 mac_client_impl_t       *mcip;
2411 
2412                 /*
2413                  * This flent might just be an additional one on the MAC client,
2414                  * i.e. for classification purposes (different fdesc), however
2415                  * the resources, SRS et. al., are in the mci_flent, so if
2416                  * this isn't the mci_flent, we need to get it.
2417                  */
2418                 if ((mcip = flent->fe_mcip) != NULL &&
2419                     mcip->mci_flent != flent) {
2420                         FLOW_REFRELE(flent);
2421                         flent = mcip->mci_flent;
2422                         FLOW_TRY_REFHOLD(flent, err);
2423                         if (err != 0)
2424                                 return (mp);
2425                 }
2426                 (flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp,
2427                     B_FALSE);
2428                 FLOW_REFRELE(flent);
2429         }
2430         return (NULL);
2431 }
2432 
2433 mblk_t *
2434 mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
2435 {
2436         mac_impl_t      *mip = (mac_impl_t *)mh;
2437         mblk_t          *bp, *bp1, **bpp, *list = NULL;
2438 
2439         /*
2440          * We walk the chain and attempt to classify each packet.
2441          * The packets that couldn't be classified will be returned
2442          * back to the caller.
2443          */
2444         bp = mp_chain;
2445         bpp = &list;
2446         while (bp != NULL) {
2447                 bp1 = bp;
2448                 bp = bp->b_next;
2449                 bp1->b_next = NULL;
2450 
2451                 if (mac_rx_classify(mip, mrh, bp1) != NULL) {
2452                         *bpp = bp1;
2453                         bpp = &bp1->b_next;
2454                 }
2455         }
2456         return (list);
2457 }
2458 
2459 static int
2460 mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg)
2461 {
2462         mac_ring_handle_t ring = arg;
2463 
2464         if (flent->fe_tx_srs)
2465                 mac_tx_srs_wakeup(flent->fe_tx_srs, ring);
2466         return (0);
2467 }
2468 
2469 void
2470 i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring)
2471 {
2472         mac_client_impl_t       *cclient;
2473         mac_soft_ring_set_t     *mac_srs;
2474 
2475         /*
2476          * After grabbing the mi_rw_lock, the list of clients can't change.
2477          * If there are any clients mi_disabled must be B_FALSE and can't
2478          * get set since there are clients. If there aren't any clients we
2479          * don't do anything. In any case the mip has to be valid. The driver
2480          * must make sure that it goes single threaded (with respect to mac
2481          * calls) and wait for all pending mac calls to finish before calling
2482          * mac_unregister.
2483          */
2484         rw_enter(&i_mac_impl_lock, RW_READER);
2485         if (mip->mi_state_flags & MIS_DISABLED) {
2486                 rw_exit(&i_mac_impl_lock);
2487                 return;
2488         }
2489 
2490         /*
2491          * Get MAC tx srs from walking mac_client_handle list.
2492          */
2493         rw_enter(&mip->mi_rw_lock, RW_READER);
2494         for (cclient = mip->mi_clients_list; cclient != NULL;
2495             cclient = cclient->mci_client_next) {
2496                 if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL) {
2497                         mac_tx_srs_wakeup(mac_srs, ring);
2498                 } else {
2499                         /*
2500                          * Aggr opens underlying ports in exclusive mode
2501                          * and registers flow control callbacks using
2502                          * mac_tx_client_notify(). When opened in
2503                          * exclusive mode, Tx SRS won't be created
2504                          * during mac_unicast_add().
2505                          */
2506                         if (cclient->mci_state_flags & MCIS_EXCLUSIVE) {
2507                                 mac_tx_invoke_callbacks(cclient,
2508                                     (mac_tx_cookie_t)ring);
2509                         }
2510                 }
2511                 (void) mac_flow_walk(cclient->mci_subflow_tab,
2512                     mac_tx_flow_srs_wakeup, ring);
2513         }
2514         rw_exit(&mip->mi_rw_lock);
2515         rw_exit(&i_mac_impl_lock);
2516 }
2517 
2518 /* ARGSUSED */
2519 void
2520 mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
2521     boolean_t add)
2522 {
2523         mac_impl_t *mip = (mac_impl_t *)mh;
2524 
2525         i_mac_perim_enter((mac_impl_t *)mh);
2526         /*
2527          * If no specific refresh function was given then default to the
2528          * driver's m_multicst entry point.
2529          */
2530         if (refresh == NULL) {
2531                 refresh = mip->mi_multicst;
2532                 arg = mip->mi_driver;
2533         }
2534 
2535         mac_bcast_refresh(mip, refresh, arg, add);
2536         i_mac_perim_exit((mac_impl_t *)mh);
2537 }
2538 
2539 void
2540 mac_promisc_refresh(mac_handle_t mh, mac_setpromisc_t refresh, void *arg)
2541 {
2542         mac_impl_t      *mip = (mac_impl_t *)mh;
2543 
2544         /*
2545          * If no specific refresh function was given then default to the
2546          * driver's m_promisc entry point.
2547          */
2548         if (refresh == NULL) {
2549                 refresh = mip->mi_setpromisc;
2550                 arg = mip->mi_driver;
2551         }
2552         ASSERT(refresh != NULL);
2553 
2554         /*
2555          * Call the refresh function with the current promiscuity.
2556          */
2557         refresh(arg, (mip->mi_devpromisc != 0));
2558 }
2559 
2560 /*
2561  * The mac client requests that the mac not to change its margin size to
2562  * be less than the specified value.  If "current" is B_TRUE, then the client
2563  * requests the mac not to change its margin size to be smaller than the
2564  * current size. Further, return the current margin size value in this case.
2565  *
2566  * We keep every requested size in an ordered list from largest to smallest.
2567  */
2568 int
2569 mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
2570 {
2571         mac_impl_t              *mip = (mac_impl_t *)mh;
2572         mac_margin_req_t        **pp, *p;
2573         int                     err = 0;
2574 
2575         rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2576         if (current)
2577                 *marginp = mip->mi_margin;
2578 
2579         /*
2580          * If the current margin value cannot satisfy the margin requested,
2581          * return ENOTSUP directly.
2582          */
2583         if (*marginp > mip->mi_margin) {
2584                 err = ENOTSUP;
2585                 goto done;
2586         }
2587 
2588         /*
2589          * Check whether the given margin is already in the list. If so,
2590          * bump the reference count.
2591          */
2592         for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) {
2593                 if (p->mmr_margin == *marginp) {
2594                         /*
2595                          * The margin requested is already in the list,
2596                          * so just bump the reference count.
2597                          */
2598                         p->mmr_ref++;
2599                         goto done;
2600                 }
2601                 if (p->mmr_margin < *marginp)
2602                         break;
2603         }
2604 
2605 
2606         p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP);
2607         p->mmr_margin = *marginp;
2608         p->mmr_ref++;
2609         p->mmr_nextp = *pp;
2610         *pp = p;
2611 
2612 done:
2613         rw_exit(&(mip->mi_rw_lock));
2614         return (err);
2615 }
2616 
2617 /*
2618  * The mac client requests to cancel its previous mac_margin_add() request.
2619  * We remove the requested margin size from the list.
2620  */
2621 int
2622 mac_margin_remove(mac_handle_t mh, uint32_t margin)
2623 {
2624         mac_impl_t              *mip = (mac_impl_t *)mh;
2625         mac_margin_req_t        **pp, *p;
2626         int                     err = 0;
2627 
2628         rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2629         /*
2630          * Find the entry in the list for the given margin.
2631          */
2632         for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) {
2633                 if (p->mmr_margin == margin) {
2634                         if (--p->mmr_ref == 0)
2635                                 break;
2636 
2637                         /*
2638                          * There is still a reference to this address so
2639                          * there's nothing more to do.
2640                          */
2641                         goto done;
2642                 }
2643         }
2644 
2645         /*
2646          * We did not find an entry for the given margin.
2647          */
2648         if (p == NULL) {
2649                 err = ENOENT;
2650                 goto done;
2651         }
2652 
2653         ASSERT(p->mmr_ref == 0);
2654 
2655         /*
2656          * Remove it from the list.
2657          */
2658         *pp = p->mmr_nextp;
2659         kmem_free(p, sizeof (mac_margin_req_t));
2660 done:
2661         rw_exit(&(mip->mi_rw_lock));
2662         return (err);
2663 }
2664 
2665 boolean_t
2666 mac_margin_update(mac_handle_t mh, uint32_t margin)
2667 {
2668         mac_impl_t      *mip = (mac_impl_t *)mh;
2669         uint32_t        margin_needed = 0;
2670 
2671         rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2672 
2673         if (mip->mi_mmrp != NULL)
2674                 margin_needed = mip->mi_mmrp->mmr_margin;
2675 
2676         if (margin_needed <= margin)
2677                 mip->mi_margin = margin;
2678 
2679         rw_exit(&(mip->mi_rw_lock));
2680 
2681         if (margin_needed <= margin)
2682                 i_mac_notify(mip, MAC_NOTE_MARGIN);
2683 
2684         return (margin_needed <= margin);
2685 }
2686 
2687 /*
2688  * MAC Type Plugin functions.
2689  */
2690 
2691 mactype_t *
2692 mactype_getplugin(const char *pname)
2693 {
2694         mactype_t       *mtype = NULL;
2695         boolean_t       tried_modload = B_FALSE;
2696 
2697         mutex_enter(&i_mactype_lock);
2698 
2699 find_registered_mactype:
2700         if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
2701             (mod_hash_val_t *)&mtype) != 0) {
2702                 if (!tried_modload) {
2703                         /*
2704                          * If the plugin has not yet been loaded, then
2705                          * attempt to load it now.  If modload() succeeds,
2706                          * the plugin should have registered using
2707                          * mactype_register(), in which case we can go back
2708                          * and attempt to find it again.
2709                          */
2710                         if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
2711                                 tried_modload = B_TRUE;
2712                                 goto find_registered_mactype;
2713                         }
2714                 }
2715         } else {
2716                 /*
2717                  * Note that there's no danger that the plugin we've loaded
2718                  * could be unloaded between the modload() step and the
2719                  * reference count bump here, as we're holding
2720                  * i_mactype_lock, which mactype_unregister() also holds.
2721                  */
2722                 atomic_inc_32(&mtype->mt_ref);
2723         }
2724 
2725         mutex_exit(&i_mactype_lock);
2726         return (mtype);
2727 }
2728 
2729 mactype_register_t *
2730 mactype_alloc(uint_t mactype_version)
2731 {
2732         mactype_register_t *mtrp;
2733 
2734         /*
2735          * Make sure there isn't a version mismatch between the plugin and
2736          * the framework.  In the future, if multiple versions are
2737          * supported, this check could become more sophisticated.
2738          */
2739         if (mactype_version != MACTYPE_VERSION)
2740                 return (NULL);
2741 
2742         mtrp = kmem_zalloc(sizeof (mactype_register_t), KM_SLEEP);
2743         mtrp->mtr_version = mactype_version;
2744         return (mtrp);
2745 }
2746 
2747 void
2748 mactype_free(mactype_register_t *mtrp)
2749 {
2750         kmem_free(mtrp, sizeof (mactype_register_t));
2751 }
2752 
2753 int
2754 mactype_register(mactype_register_t *mtrp)
2755 {
2756         mactype_t       *mtp;
2757         mactype_ops_t   *ops = mtrp->mtr_ops;
2758 
2759         /* Do some sanity checking before we register this MAC type. */
2760         if (mtrp->mtr_ident == NULL || ops == NULL)
2761                 return (EINVAL);
2762 
2763         /*
2764          * Verify that all mandatory callbacks are set in the ops
2765          * vector.
2766          */
2767         if (ops->mtops_unicst_verify == NULL ||
2768             ops->mtops_multicst_verify == NULL ||
2769             ops->mtops_sap_verify == NULL ||
2770             ops->mtops_header == NULL ||
2771             ops->mtops_header_info == NULL) {
2772                 return (EINVAL);
2773         }
2774 
2775         mtp = kmem_zalloc(sizeof (*mtp), KM_SLEEP);
2776         mtp->mt_ident = mtrp->mtr_ident;
2777         mtp->mt_ops = *ops;
2778         mtp->mt_type = mtrp->mtr_mactype;
2779         mtp->mt_nativetype = mtrp->mtr_nativetype;
2780         mtp->mt_addr_length = mtrp->mtr_addrlen;
2781         if (mtrp->mtr_brdcst_addr != NULL) {
2782                 mtp->mt_brdcst_addr = kmem_alloc(mtrp->mtr_addrlen, KM_SLEEP);
2783                 bcopy(mtrp->mtr_brdcst_addr, mtp->mt_brdcst_addr,
2784                     mtrp->mtr_addrlen);
2785         }
2786 
2787         mtp->mt_stats = mtrp->mtr_stats;
2788         mtp->mt_statcount = mtrp->mtr_statcount;
2789 
2790         mtp->mt_mapping = mtrp->mtr_mapping;
2791         mtp->mt_mappingcount = mtrp->mtr_mappingcount;
2792 
2793         if (mod_hash_insert(i_mactype_hash,
2794             (mod_hash_key_t)mtp->mt_ident, (mod_hash_val_t)mtp) != 0) {
2795                 kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2796                 kmem_free(mtp, sizeof (*mtp));
2797                 return (EEXIST);
2798         }
2799         return (0);
2800 }
2801 
2802 int
2803 mactype_unregister(const char *ident)
2804 {
2805         mactype_t       *mtp;
2806         mod_hash_val_t  val;
2807         int             err;
2808 
2809         /*
2810          * Let's not allow MAC drivers to use this plugin while we're
2811          * trying to unregister it.  Holding i_mactype_lock also prevents a
2812          * plugin from unregistering while a MAC driver is attempting to
2813          * hold a reference to it in i_mactype_getplugin().
2814          */
2815         mutex_enter(&i_mactype_lock);
2816 
2817         if ((err = mod_hash_find(i_mactype_hash, (mod_hash_key_t)ident,
2818             (mod_hash_val_t *)&mtp)) != 0) {
2819                 /* A plugin is trying to unregister, but it never registered. */
2820                 err = ENXIO;
2821                 goto done;
2822         }
2823 
2824         if (mtp->mt_ref != 0) {
2825                 err = EBUSY;
2826                 goto done;
2827         }
2828 
2829         err = mod_hash_remove(i_mactype_hash, (mod_hash_key_t)ident, &val);
2830         ASSERT(err == 0);
2831         if (err != 0) {
2832                 /* This should never happen, thus the ASSERT() above. */
2833                 err = EINVAL;
2834                 goto done;
2835         }
2836         ASSERT(mtp == (mactype_t *)val);
2837 
2838         if (mtp->mt_brdcst_addr != NULL)
2839                 kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2840         kmem_free(mtp, sizeof (mactype_t));
2841 done:
2842         mutex_exit(&i_mactype_lock);
2843         return (err);
2844 }
2845 
2846 /*
2847  * Checks the size of the value size specified for a property as
2848  * part of a property operation. Returns B_TRUE if the size is
2849  * correct, B_FALSE otherwise.
2850  */
2851 boolean_t
2852 mac_prop_check_size(mac_prop_id_t id, uint_t valsize, boolean_t is_range)
2853 {
2854         uint_t minsize = 0;
2855 
2856         if (is_range)
2857                 return (valsize >= sizeof (mac_propval_range_t));
2858 
2859         switch (id) {
2860         case MAC_PROP_ZONE:
2861                 minsize = sizeof (dld_ioc_zid_t);
2862                 break;
2863         case MAC_PROP_AUTOPUSH:
2864                 if (valsize != 0)
2865                         minsize = sizeof (struct dlautopush);
2866                 break;
2867         case MAC_PROP_TAGMODE:
2868                 minsize = sizeof (link_tagmode_t);
2869                 break;
2870         case MAC_PROP_RESOURCE:
2871         case MAC_PROP_RESOURCE_EFF:
2872                 minsize = sizeof (mac_resource_props_t);
2873                 break;
2874         case MAC_PROP_DUPLEX:
2875                 minsize = sizeof (link_duplex_t);
2876                 break;
2877         case MAC_PROP_SPEED:
2878                 minsize = sizeof (uint64_t);
2879                 break;
2880         case MAC_PROP_STATUS:
2881                 minsize = sizeof (link_state_t);
2882                 break;
2883         case MAC_PROP_AUTONEG:
2884         case MAC_PROP_EN_AUTONEG:
2885                 minsize = sizeof (uint8_t);
2886                 break;
2887         case MAC_PROP_MTU:
2888         case MAC_PROP_LLIMIT:
2889         case MAC_PROP_LDECAY:
2890                 minsize = sizeof (uint32_t);
2891                 break;
2892         case MAC_PROP_FLOWCTRL:
2893                 minsize = sizeof (link_flowctrl_t);
2894                 break;
2895         case MAC_PROP_ADV_10GFDX_CAP:
2896         case MAC_PROP_EN_10GFDX_CAP:
2897         case MAC_PROP_ADV_1000HDX_CAP:
2898         case MAC_PROP_EN_1000HDX_CAP:
2899         case MAC_PROP_ADV_100FDX_CAP:
2900         case MAC_PROP_EN_100FDX_CAP:
2901         case MAC_PROP_ADV_100HDX_CAP:
2902         case MAC_PROP_EN_100HDX_CAP:
2903         case MAC_PROP_ADV_10FDX_CAP:
2904         case MAC_PROP_EN_10FDX_CAP:
2905         case MAC_PROP_ADV_10HDX_CAP:
2906         case MAC_PROP_EN_10HDX_CAP:
2907         case MAC_PROP_ADV_100T4_CAP:
2908         case MAC_PROP_EN_100T4_CAP:
2909                 minsize = sizeof (uint8_t);
2910                 break;
2911         case MAC_PROP_PVID:
2912                 minsize = sizeof (uint16_t);
2913                 break;
2914         case MAC_PROP_IPTUN_HOPLIMIT:
2915                 minsize = sizeof (uint32_t);
2916                 break;
2917         case MAC_PROP_IPTUN_ENCAPLIMIT:
2918                 minsize = sizeof (uint32_t);
2919                 break;
2920         case MAC_PROP_MAX_TX_RINGS_AVAIL:
2921         case MAC_PROP_MAX_RX_RINGS_AVAIL:
2922         case MAC_PROP_MAX_RXHWCLNT_AVAIL:
2923         case MAC_PROP_MAX_TXHWCLNT_AVAIL:
2924                 minsize = sizeof (uint_t);
2925                 break;
2926         case MAC_PROP_WL_ESSID:
2927                 minsize = sizeof (wl_linkstatus_t);
2928                 break;
2929         case MAC_PROP_WL_BSSID:
2930                 minsize = sizeof (wl_bssid_t);
2931                 break;
2932         case MAC_PROP_WL_BSSTYPE:
2933                 minsize = sizeof (wl_bss_type_t);
2934                 break;
2935         case MAC_PROP_WL_LINKSTATUS:
2936                 minsize = sizeof (wl_linkstatus_t);
2937                 break;
2938         case MAC_PROP_WL_DESIRED_RATES:
2939                 minsize = sizeof (wl_rates_t);
2940                 break;
2941         case MAC_PROP_WL_SUPPORTED_RATES:
2942                 minsize = sizeof (wl_rates_t);
2943                 break;
2944         case MAC_PROP_WL_AUTH_MODE:
2945                 minsize = sizeof (wl_authmode_t);
2946                 break;
2947         case MAC_PROP_WL_ENCRYPTION:
2948                 minsize = sizeof (wl_encryption_t);
2949                 break;
2950         case MAC_PROP_WL_RSSI:
2951                 minsize = sizeof (wl_rssi_t);
2952                 break;
2953         case MAC_PROP_WL_PHY_CONFIG:
2954                 minsize = sizeof (wl_phy_conf_t);
2955                 break;
2956         case MAC_PROP_WL_CAPABILITY:
2957                 minsize = sizeof (wl_capability_t);
2958                 break;
2959         case MAC_PROP_WL_WPA:
2960                 minsize = sizeof (wl_wpa_t);
2961                 break;
2962         case MAC_PROP_WL_SCANRESULTS:
2963                 minsize = sizeof (wl_wpa_ess_t);
2964                 break;
2965         case MAC_PROP_WL_POWER_MODE:
2966                 minsize = sizeof (wl_ps_mode_t);
2967                 break;
2968         case MAC_PROP_WL_RADIO:
2969                 minsize = sizeof (wl_radio_t);
2970                 break;
2971         case MAC_PROP_WL_ESS_LIST:
2972                 minsize = sizeof (wl_ess_list_t);
2973                 break;
2974         case MAC_PROP_WL_KEY_TAB:
2975                 minsize = sizeof (wl_wep_key_tab_t);
2976                 break;
2977         case MAC_PROP_WL_CREATE_IBSS:
2978                 minsize = sizeof (wl_create_ibss_t);
2979                 break;
2980         case MAC_PROP_WL_SETOPTIE:
2981                 minsize = sizeof (wl_wpa_ie_t);
2982                 break;
2983         case MAC_PROP_WL_DELKEY:
2984                 minsize = sizeof (wl_del_key_t);
2985                 break;
2986         case MAC_PROP_WL_KEY:
2987                 minsize = sizeof (wl_key_t);
2988                 break;
2989         case MAC_PROP_WL_MLME:
2990                 minsize = sizeof (wl_mlme_t);
2991                 break;
2992         case MAC_PROP_MACADDRESS:
2993                 minsize = sizeof (mac_addrprop_t);
2994         }
2995 
2996         return (valsize >= minsize);
2997 }
2998 
2999 /*
3000  * mac_set_prop() sets MAC or hardware driver properties:
3001  *
3002  * - MAC-managed properties such as resource properties include maxbw,
3003  *   priority, and cpu binding list, as well as the default port VID
3004  *   used by bridging. These properties are consumed by the MAC layer
3005  *   itself and not passed down to the driver. For resource control
3006  *   properties, this function invokes mac_set_resources() which will
3007  *   cache the property value in mac_impl_t and may call
3008  *   mac_client_set_resource() to update property value of the primary
3009  *   mac client, if it exists.
3010  *
3011  * - Properties which act on the hardware and must be passed to the
3012  *   driver, such as MTU, through the driver's mc_setprop() entry point.
3013  */
3014 int
3015 mac_set_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val,
3016     uint_t valsize)
3017 {
3018         int err = ENOTSUP;
3019         mac_impl_t *mip = (mac_impl_t *)mh;
3020 
3021         ASSERT(MAC_PERIM_HELD(mh));
3022 
3023         switch (id) {
3024         case MAC_PROP_RESOURCE: {
3025                 mac_resource_props_t *mrp;
3026 
3027                 /* call mac_set_resources() for MAC properties */
3028                 ASSERT(valsize >= sizeof (mac_resource_props_t));
3029                 mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3030                 bcopy(val, mrp, sizeof (*mrp));
3031                 err = mac_set_resources(mh, mrp);
3032                 kmem_free(mrp, sizeof (*mrp));
3033                 break;
3034         }
3035 
3036         case MAC_PROP_PVID:
3037                 ASSERT(valsize >= sizeof (uint16_t));
3038                 if (mip->mi_state_flags & MIS_IS_VNIC)
3039                         return (EINVAL);
3040                 err = mac_set_pvid(mh, *(uint16_t *)val);
3041                 break;
3042 
3043         case MAC_PROP_MTU: {
3044                 uint32_t mtu;
3045 
3046                 ASSERT(valsize >= sizeof (uint32_t));
3047                 bcopy(val, &mtu, sizeof (mtu));
3048                 err = mac_set_mtu(mh, mtu, NULL);
3049                 break;
3050         }
3051 
3052         case MAC_PROP_LLIMIT:
3053         case MAC_PROP_LDECAY: {
3054                 uint32_t learnval;
3055 
3056                 if (valsize < sizeof (learnval) ||
3057                     (mip->mi_state_flags & MIS_IS_VNIC))
3058                         return (EINVAL);
3059                 bcopy(val, &learnval, sizeof (learnval));
3060                 if (learnval == 0 && id == MAC_PROP_LDECAY)
3061                         return (EINVAL);
3062                 if (id == MAC_PROP_LLIMIT)
3063                         mip->mi_llimit = learnval;
3064                 else
3065                         mip->mi_ldecay = learnval;
3066                 err = 0;
3067                 break;
3068         }
3069 
3070         case MAC_PROP_MACADDRESS: {
3071                 mac_addrprop_t  *addrprop = val;
3072 
3073                 if (addrprop->ma_len != mip->mi_type->mt_addr_length)
3074                         return (EINVAL);
3075 
3076                 err = mac_unicast_primary_set(mh, addrprop->ma_addr);
3077                 break;
3078         }
3079 
3080         default:
3081                 /* For other driver properties, call driver's callback */
3082                 if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) {
3083                         err = mip->mi_callbacks->mc_setprop(mip->mi_driver,
3084                             name, id, valsize, val);
3085                 }
3086         }
3087         return (err);
3088 }
3089 
3090 /*
3091  * mac_get_prop() gets MAC or device driver properties.
3092  *
3093  * If the property is a driver property, mac_get_prop() calls driver's callback
3094  * entry point to get it.
3095  * If the property is a MAC property, mac_get_prop() invokes mac_get_resources()
3096  * which returns the cached value in mac_impl_t.
3097  */
3098 int
3099 mac_get_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val,
3100     uint_t valsize)
3101 {
3102         int err = ENOTSUP;
3103         mac_impl_t *mip = (mac_impl_t *)mh;
3104         uint_t  rings;
3105         uint_t  vlinks;
3106 
3107         bzero(val, valsize);
3108 
3109         switch (id) {
3110         case MAC_PROP_RESOURCE: {
3111                 mac_resource_props_t *mrp;
3112 
3113                 /* If mac property, read from cache */
3114                 ASSERT(valsize >= sizeof (mac_resource_props_t));
3115                 mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3116                 mac_get_resources(mh, mrp);
3117                 bcopy(mrp, val, sizeof (*mrp));
3118                 kmem_free(mrp, sizeof (*mrp));
3119                 return (0);
3120         }
3121         case MAC_PROP_RESOURCE_EFF: {
3122                 mac_resource_props_t *mrp;
3123 
3124                 /* If mac effective property, read from client */
3125                 ASSERT(valsize >= sizeof (mac_resource_props_t));
3126                 mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3127                 mac_get_effective_resources(mh, mrp);
3128                 bcopy(mrp, val, sizeof (*mrp));
3129                 kmem_free(mrp, sizeof (*mrp));
3130                 return (0);
3131         }
3132 
3133         case MAC_PROP_PVID:
3134                 ASSERT(valsize >= sizeof (uint16_t));
3135                 if (mip->mi_state_flags & MIS_IS_VNIC)
3136                         return (EINVAL);
3137                 *(uint16_t *)val = mac_get_pvid(mh);
3138                 return (0);
3139 
3140         case MAC_PROP_LLIMIT:
3141         case MAC_PROP_LDECAY:
3142                 ASSERT(valsize >= sizeof (uint32_t));
3143                 if (mip->mi_state_flags & MIS_IS_VNIC)
3144                         return (EINVAL);
3145                 if (id == MAC_PROP_LLIMIT)
3146                         bcopy(&mip->mi_llimit, val, sizeof (mip->mi_llimit));
3147                 else
3148                         bcopy(&mip->mi_ldecay, val, sizeof (mip->mi_ldecay));
3149                 return (0);
3150 
3151         case MAC_PROP_MTU: {
3152                 uint32_t sdu;
3153 
3154                 ASSERT(valsize >= sizeof (uint32_t));
3155                 mac_sdu_get2(mh, NULL, &sdu, NULL);
3156                 bcopy(&sdu, val, sizeof (sdu));
3157 
3158                 return (0);
3159         }
3160         case MAC_PROP_STATUS: {
3161                 link_state_t link_state;
3162 
3163                 if (valsize < sizeof (link_state))
3164                         return (EINVAL);
3165                 link_state = mac_link_get(mh);
3166                 bcopy(&link_state, val, sizeof (link_state));
3167 
3168                 return (0);
3169         }
3170 
3171         case MAC_PROP_MAX_RX_RINGS_AVAIL:
3172         case MAC_PROP_MAX_TX_RINGS_AVAIL:
3173                 ASSERT(valsize >= sizeof (uint_t));
3174                 rings = id == MAC_PROP_MAX_RX_RINGS_AVAIL ?
3175                     mac_rxavail_get(mh) : mac_txavail_get(mh);
3176                 bcopy(&rings, val, sizeof (uint_t));
3177                 return (0);
3178 
3179         case MAC_PROP_MAX_RXHWCLNT_AVAIL:
3180         case MAC_PROP_MAX_TXHWCLNT_AVAIL:
3181                 ASSERT(valsize >= sizeof (uint_t));
3182                 vlinks = id == MAC_PROP_MAX_RXHWCLNT_AVAIL ?
3183                     mac_rxhwlnksavail_get(mh) : mac_txhwlnksavail_get(mh);
3184                 bcopy(&vlinks, val, sizeof (uint_t));
3185                 return (0);
3186 
3187         case MAC_PROP_RXRINGSRANGE:
3188         case MAC_PROP_TXRINGSRANGE:
3189                 /*
3190                  * The value for these properties are returned through
3191                  * the MAC_PROP_RESOURCE property.
3192                  */
3193                 return (0);
3194 
3195         case MAC_PROP_MACADDRESS: {
3196                 mac_addrprop_t  *addrprop = val;
3197 
3198                 if (valsize < sizeof (mac_addrprop_t))
3199                         return (EINVAL);
3200                 mac_unicast_primary_get(mh, addrprop->ma_addr);
3201                 addrprop->ma_len = mip->mi_type->mt_addr_length;
3202                 return (0);
3203         }
3204 
3205         default:
3206                 break;
3207 
3208         }
3209 
3210         /* If driver property, request from driver */
3211         if (mip->mi_callbacks->mc_callbacks & MC_GETPROP) {
3212                 err = mip->mi_callbacks->mc_getprop(mip->mi_driver, name, id,
3213                     valsize, val);
3214         }
3215 
3216         return (err);
3217 }
3218 
3219 /*
3220  * Helper function to initialize the range structure for use in
3221  * mac_get_prop. If the type can be other than uint32, we can
3222  * pass that as an arg.
3223  */
3224 static void
3225 _mac_set_range(mac_propval_range_t *range, uint32_t min, uint32_t max)
3226 {
3227         range->mpr_count = 1;
3228         range->mpr_type = MAC_PROPVAL_UINT32;
3229         range->mpr_range_uint32[0].mpur_min = min;
3230         range->mpr_range_uint32[0].mpur_max = max;
3231 }
3232 
3233 /*
3234  * Returns information about the specified property, such as default
3235  * values or permissions.
3236  */
3237 int
3238 mac_prop_info(mac_handle_t mh, mac_prop_id_t id, char *name,
3239     void *default_val, uint_t default_size, mac_propval_range_t *range,
3240     uint_t *perm)
3241 {
3242         mac_prop_info_state_t state;
3243         mac_impl_t *mip = (mac_impl_t *)mh;
3244         uint_t  max;
3245 
3246         /*
3247          * A property is read/write by default unless the driver says
3248          * otherwise.
3249          */
3250         if (perm != NULL)
3251                 *perm = MAC_PROP_PERM_RW;
3252 
3253         if (default_val != NULL)
3254                 bzero(default_val, default_size);
3255 
3256         /*
3257          * First, handle framework properties for which we don't need to
3258          * involve the driver.
3259          */
3260         switch (id) {
3261         case MAC_PROP_RESOURCE:
3262         case MAC_PROP_PVID:
3263         case MAC_PROP_LLIMIT:
3264         case MAC_PROP_LDECAY:
3265                 return (0);
3266 
3267         case MAC_PROP_MAX_RX_RINGS_AVAIL:
3268         case MAC_PROP_MAX_TX_RINGS_AVAIL:
3269         case MAC_PROP_MAX_RXHWCLNT_AVAIL:
3270         case MAC_PROP_MAX_TXHWCLNT_AVAIL:
3271                 if (perm != NULL)
3272                         *perm = MAC_PROP_PERM_READ;
3273                 return (0);
3274 
3275         case MAC_PROP_RXRINGSRANGE:
3276         case MAC_PROP_TXRINGSRANGE:
3277                 /*
3278                  * Currently, we support range for RX and TX rings properties.
3279                  * When we extend this support to maxbw, cpus and priority,
3280                  * we should move this to mac_get_resources.
3281                  * There is no default value for RX or TX rings.
3282                  */
3283                 if ((mip->mi_state_flags & MIS_IS_VNIC) &&
3284                     mac_is_vnic_primary(mh)) {
3285                         /*
3286                          * We don't support setting rings for a VLAN
3287                          * data link because it shares its ring with the
3288                          * primary MAC client.
3289                          */
3290                         if (perm != NULL)
3291                                 *perm = MAC_PROP_PERM_READ;
3292                         if (range != NULL)
3293                                 range->mpr_count = 0;
3294                 } else if (range != NULL) {
3295                         if (mip->mi_state_flags & MIS_IS_VNIC)
3296                                 mh = mac_get_lower_mac_handle(mh);
3297                         mip = (mac_impl_t *)mh;
3298                         if ((id == MAC_PROP_RXRINGSRANGE &&
3299                             mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) ||
3300                             (id == MAC_PROP_TXRINGSRANGE &&
3301                             mip->mi_tx_group_type == MAC_GROUP_TYPE_STATIC)) {
3302                                 if (id == MAC_PROP_RXRINGSRANGE) {
3303                                         if ((mac_rxhwlnksavail_get(mh) +
3304                                             mac_rxhwlnksrsvd_get(mh)) <= 1) {
3305                                                 /*
3306                                                  * doesn't support groups or
3307                                                  * rings
3308                                                  */
3309                                                 range->mpr_count = 0;
3310                                         } else {
3311                                                 /*
3312                                                  * supports specifying groups,
3313                                                  * but not rings
3314                                                  */
3315                                                 _mac_set_range(range, 0, 0);
3316                                         }
3317                                 } else {
3318                                         if ((mac_txhwlnksavail_get(mh) +
3319                                             mac_txhwlnksrsvd_get(mh)) <= 1) {
3320                                                 /*
3321                                                  * doesn't support groups or
3322                                                  * rings
3323                                                  */
3324                                                 range->mpr_count = 0;
3325                                         } else {
3326                                                 /*
3327                                                  * supports specifying groups,
3328                                                  * but not rings
3329                                                  */
3330                                                 _mac_set_range(range, 0, 0);
3331                                         }
3332                                 }
3333                         } else {
3334                                 max = id == MAC_PROP_RXRINGSRANGE ?
3335                                     mac_rxavail_get(mh) + mac_rxrsvd_get(mh) :
3336                                     mac_txavail_get(mh) + mac_txrsvd_get(mh);
3337                                 if (max <= 1) {
3338                                         /*
3339                                          * doesn't support groups or
3340                                          * rings
3341                                          */
3342                                         range->mpr_count = 0;
3343                                 } else  {
3344                                         /*
3345                                          * -1 because we have to leave out the
3346                                          * default ring.
3347                                          */
3348                                         _mac_set_range(range, 1, max - 1);
3349                                 }
3350                         }
3351                 }
3352                 return (0);
3353 
3354         case MAC_PROP_STATUS:
3355                 if (perm != NULL)
3356                         *perm = MAC_PROP_PERM_READ;
3357                 return (0);
3358 
3359         case MAC_PROP_MACADDRESS: {
3360                 mac_addrprop_t  *defaddr = default_val;
3361 
3362                 if (defaddr != NULL) {
3363                         if (default_size < sizeof (mac_addrprop_t))
3364                                 return (EINVAL);
3365                         bcopy(mip->mi_info.mi_unicst_addr, defaddr->ma_addr,
3366                             mip->mi_type->mt_addr_length);
3367                         defaddr->ma_len = mip->mi_type->mt_addr_length;
3368                 }
3369                 return (0);
3370         }
3371         }
3372 
3373         /*
3374          * Get the property info from the driver if it implements the
3375          * property info entry point.
3376          */
3377         bzero(&state, sizeof (state));
3378 
3379         if (mip->mi_callbacks->mc_callbacks & MC_PROPINFO) {
3380                 state.pr_default = default_val;
3381                 state.pr_default_size = default_size;
3382 
3383                 /*
3384                  * The caller specifies the maximum number of ranges
3385                  * it can accomodate using mpr_count. We don't touch
3386                  * this value until the driver returns from its
3387                  * mc_propinfo() callback, and ensure we don't exceed
3388                  * this number of range as the driver defines
3389                  * supported range from its mc_propinfo().
3390                  *
3391                  * pr_range_cur_count keeps track of how many ranges
3392                  * were defined by the driver from its mc_propinfo()
3393                  * entry point.
3394                  *
3395                  * On exit, the user-specified range mpr_count returns
3396                  * the number of ranges specified by the driver on
3397                  * success, or the number of ranges it wanted to
3398                  * define if that number of ranges could not be
3399                  * accomodated by the specified range structure.  In
3400                  * the latter case, the caller will be able to
3401                  * allocate a larger range structure, and query the
3402                  * property again.
3403                  */
3404                 state.pr_range_cur_count = 0;
3405                 state.pr_range = range;
3406 
3407                 mip->mi_callbacks->mc_propinfo(mip->mi_driver, name, id,
3408                     (mac_prop_info_handle_t)&state);
3409 
3410                 if (state.pr_flags & MAC_PROP_INFO_RANGE)
3411                         range->mpr_count = state.pr_range_cur_count;
3412 
3413                 /*
3414                  * The operation could fail if the buffer supplied by
3415                  * the user was too small for the range or default
3416                  * value of the property.
3417                  */
3418                 if (state.pr_errno != 0)
3419                         return (state.pr_errno);
3420 
3421                 if (perm != NULL && state.pr_flags & MAC_PROP_INFO_PERM)
3422                         *perm = state.pr_perm;
3423         }
3424 
3425         /*
3426          * The MAC layer may want to provide default values or allowed
3427          * ranges for properties if the driver does not provide a
3428          * property info entry point, or that entry point exists, but
3429          * it did not provide a default value or allowed ranges for
3430          * that property.
3431          */
3432         switch (id) {
3433         case MAC_PROP_MTU: {
3434                 uint32_t sdu;
3435 
3436                 mac_sdu_get2(mh, NULL, &sdu, NULL);
3437 
3438                 if (range != NULL && !(state.pr_flags &
3439                     MAC_PROP_INFO_RANGE)) {
3440                         /* MTU range */
3441                         _mac_set_range(range, sdu, sdu);
3442                 }
3443 
3444                 if (default_val != NULL && !(state.pr_flags &
3445                     MAC_PROP_INFO_DEFAULT)) {
3446                         if (mip->mi_info.mi_media == DL_ETHER)
3447                                 sdu = ETHERMTU;
3448                         /* default MTU value */
3449                         bcopy(&sdu, default_val, sizeof (sdu));
3450                 }
3451         }
3452         }
3453 
3454         return (0);
3455 }
3456 
3457 int
3458 mac_fastpath_disable(mac_handle_t mh)
3459 {
3460         mac_impl_t      *mip = (mac_impl_t *)mh;
3461 
3462         if ((mip->mi_state_flags & MIS_LEGACY) == 0)
3463                 return (0);
3464 
3465         return (mip->mi_capab_legacy.ml_fastpath_disable(mip->mi_driver));
3466 }
3467 
3468 void
3469 mac_fastpath_enable(mac_handle_t mh)
3470 {
3471         mac_impl_t      *mip = (mac_impl_t *)mh;
3472 
3473         if ((mip->mi_state_flags & MIS_LEGACY) == 0)
3474                 return;
3475 
3476         mip->mi_capab_legacy.ml_fastpath_enable(mip->mi_driver);
3477 }
3478 
3479 void
3480 mac_register_priv_prop(mac_impl_t *mip, char **priv_props)
3481 {
3482         uint_t nprops, i;
3483 
3484         if (priv_props == NULL)
3485                 return;
3486 
3487         nprops = 0;
3488         while (priv_props[nprops] != NULL)
3489                 nprops++;
3490         if (nprops == 0)
3491                 return;
3492 
3493 
3494         mip->mi_priv_prop = kmem_zalloc(nprops * sizeof (char *), KM_SLEEP);
3495 
3496         for (i = 0; i < nprops; i++) {
3497                 mip->mi_priv_prop[i] = kmem_zalloc(MAXLINKPROPNAME, KM_SLEEP);
3498                 (void) strlcpy(mip->mi_priv_prop[i], priv_props[i],
3499                     MAXLINKPROPNAME);
3500         }
3501 
3502         mip->mi_priv_prop_count = nprops;
3503 }
3504 
3505 void
3506 mac_unregister_priv_prop(mac_impl_t *mip)
3507 {
3508         uint_t i;
3509 
3510         if (mip->mi_priv_prop_count == 0) {
3511                 ASSERT(mip->mi_priv_prop == NULL);
3512                 return;
3513         }
3514 
3515         for (i = 0; i < mip->mi_priv_prop_count; i++)
3516                 kmem_free(mip->mi_priv_prop[i], MAXLINKPROPNAME);
3517         kmem_free(mip->mi_priv_prop, mip->mi_priv_prop_count *
3518             sizeof (char *));
3519 
3520         mip->mi_priv_prop = NULL;
3521         mip->mi_priv_prop_count = 0;
3522 }
3523 
3524 /*
3525  * mac_ring_t 'mr' macros. Some rogue drivers may access ring structure
3526  * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
3527  * cases if MAC free's the ring structure after mac_stop_ring(), any
3528  * illegal access to the ring structure coming from the driver will panic
3529  * the system. In order to protect the system from such inadverent access,
3530  * we maintain a cache of rings in the mac_impl_t after they get free'd up.
3531  * When packets are received on free'd up rings, MAC (through the generation
3532  * count mechanism) will drop such packets.
3533  */
3534 static mac_ring_t *
3535 mac_ring_alloc(mac_impl_t *mip)
3536 {
3537         mac_ring_t *ring;
3538 
3539         mutex_enter(&mip->mi_ring_lock);
3540         if (mip->mi_ring_freelist != NULL) {
3541                 ring = mip->mi_ring_freelist;
3542                 mip->mi_ring_freelist = ring->mr_next;
3543                 bzero(ring, sizeof (mac_ring_t));
3544                 mutex_exit(&mip->mi_ring_lock);
3545         } else {
3546                 mutex_exit(&mip->mi_ring_lock);
3547                 ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
3548         }
3549         ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
3550         return (ring);
3551 }
3552 
3553 static void
3554 mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
3555 {
3556         ASSERT(ring->mr_state == MR_FREE);
3557 
3558         mutex_enter(&mip->mi_ring_lock);
3559         ring->mr_state = MR_FREE;
3560         ring->mr_flag = 0;
3561         ring->mr_next = mip->mi_ring_freelist;
3562         ring->mr_mip = NULL;
3563         mip->mi_ring_freelist = ring;
3564         mac_ring_stat_delete(ring);
3565         mutex_exit(&mip->mi_ring_lock);
3566 }
3567 
3568 static void
3569 mac_ring_freeall(mac_impl_t *mip)
3570 {
3571         mac_ring_t *ring_next;
3572         mutex_enter(&mip->mi_ring_lock);
3573         mac_ring_t *ring = mip->mi_ring_freelist;
3574         while (ring != NULL) {
3575                 ring_next = ring->mr_next;
3576                 kmem_cache_free(mac_ring_cache, ring);
3577                 ring = ring_next;
3578         }
3579         mip->mi_ring_freelist = NULL;
3580         mutex_exit(&mip->mi_ring_lock);
3581 }
3582 
3583 int
3584 mac_start_ring(mac_ring_t *ring)
3585 {
3586         int rv = 0;
3587 
3588         ASSERT(ring->mr_state == MR_FREE);
3589 
3590         if (ring->mr_start != NULL) {
3591                 rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
3592                 if (rv != 0)
3593                         return (rv);
3594         }
3595 
3596         ring->mr_state = MR_INUSE;
3597         return (rv);
3598 }
3599 
/*
 * Stop a ring that is currently in the MR_INUSE state.
 *
 * The driver's mr_stop() entry point is invoked when one was supplied,
 * the ring is moved back to MR_FREE, and its generation number is bumped.
 * mac_start_ring() passes the generation number to the driver on the next
 * start.
 */
void
mac_stop_ring(mac_ring_t *ring)
{
        ASSERT(ring->mr_state == MR_INUSE);

        /* mr_stop is optional; not every driver provides one. */
        if (ring->mr_stop != NULL)
                ring->mr_stop(ring->mr_driver);

        ring->mr_state = MR_FREE;

        /*
         * Increment the ring generation number for this ring.
         */
        ring->mr_gen_num++;
}
3615 
3616 int
3617 mac_start_group(mac_group_t *group)
3618 {
3619         int rv = 0;
3620 
3621         if (group->mrg_start != NULL)
3622                 rv = group->mrg_start(group->mrg_driver);
3623 
3624         return (rv);
3625 }
3626 
3627 void
3628 mac_stop_group(mac_group_t *group)
3629 {
3630         if (group->mrg_stop != NULL)
3631                 group->mrg_stop(group->mrg_driver);
3632 }
3633 
3634 /*
3635  * Called from mac_start() on the default Rx group. Broadcast and multicast
3636  * packets are received only on the default group. Hence the default group
3637  * needs to be up even if the primary client is not up, for the other groups
3638  * to be functional. We do this by calling this function at mac_start time
3639  * itself. However the broadcast packets that are received can't make their
3640  * way beyond mac_rx until a mac client creates a broadcast flow.
3641  */
3642 static int
3643 mac_start_group_and_rings(mac_group_t *group)
3644 {
3645         mac_ring_t      *ring;
3646         int             rv = 0;
3647 
3648         ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED);
3649         if ((rv = mac_start_group(group)) != 0)
3650                 return (rv);
3651 
3652         for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3653                 ASSERT(ring->mr_state == MR_FREE);
3654                 if ((rv = mac_start_ring(ring)) != 0)
3655                         goto error;
3656                 ring->mr_classify_type = MAC_SW_CLASSIFIER;
3657         }
3658         return (0);
3659 
3660 error:
3661         mac_stop_group_and_rings(group);
3662         return (rv);
3663 }
3664 
3665 /* Called from mac_stop on the default Rx group */
3666 static void
3667 mac_stop_group_and_rings(mac_group_t *group)
3668 {
3669         mac_ring_t      *ring;
3670 
3671         for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3672                 if (ring->mr_state != MR_FREE) {
3673                         mac_stop_ring(ring);
3674                         ring->mr_flag = 0;
3675                         ring->mr_classify_type = MAC_NO_CLASSIFIER;
3676                 }
3677         }
3678         mac_stop_group(group);
3679 }
3680 
3681 
/*
 * Allocate and initialize one ring of 'group'.
 *
 * The ring is obtained from mac_ring_alloc(), linked at the head of
 * group->mrg_rings, and then described by the driver through the
 * mr_rget() entry point of 'cap_rings'. 'index' is the ring index passed
 * to the driver. After the driver has filled in the ring information,
 * interrupt-handle sharing with previously initialized rings is
 * detected, the ring is put in the MR_FREE state, the group's ring count
 * is incremented, and per-ring kstats are created when the ring has a
 * stat entry point.
 *
 * Returns the newly initialized ring.
 */
static mac_ring_t *
mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index,
    mac_capab_rings_t *cap_rings)
{
        mac_ring_t *ring, *rnext;
        mac_ring_info_t ring_info;
        ddi_intr_handle_t ddi_handle;

        ring = mac_ring_alloc(mip);

        /* Prepare basic information of ring */

        /*
         * Ring index is numbered to be unique across a particular device.
         * Ring index computation makes following assumptions:
         *      - For drivers with static grouping (e.g. ixgbe, bge),
         *      ring index exchanged with the driver (e.g. during mr_rget)
         *      is unique only across the group the ring belongs to.
         *      - Drivers with dynamic grouping (e.g. nxge), start
         *      with single group (mrg_index = 0).
         *
         * Note that when group->mrg_info.mgi_count is 0 the expression
         * below reduces to 'index'.
         */
        ring->mr_index = group->mrg_index * group->mrg_info.mgi_count + index;
        ring->mr_type = group->mrg_type;
        ring->mr_gh = (mac_group_handle_t)group;

        /* Insert the new ring to the list. */
        ring->mr_next = group->mrg_rings;
        group->mrg_rings = ring;

        /* Zero to reuse the info data structure */
        bzero(&ring_info, sizeof (ring_info));

        /* Query ring information from driver */
        cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index,
            index, &ring_info, (mac_ring_handle_t)ring);

        ring->mr_info = ring_info;

        /*
         * The interrupt handle could be shared among multiple rings.
         * Thus if there is a bunch of rings that are sharing an
         * interrupt, then only one ring among the bunch will be made
         * available for interrupt re-targeting; the rest will have
         * ddi_shared flag set to TRUE and would not be available for
         * interrupt re-targeting.
         */
        if ((ddi_handle = ring_info.mri_intr.mi_ddi_handle) != NULL) {
                /*
                 * The new ring is at the head of the list, so ring->mr_next
                 * onwards are the rings of this group initialized earlier.
                 */
                rnext = ring->mr_next;
                while (rnext != NULL) {
                        if (rnext->mr_info.mri_intr.mi_ddi_handle ==
                            ddi_handle) {
                                /*
                                 * If default ring (mr_index == 0) is part
                                 * of a group of rings sharing an
                                 * interrupt, then set ddi_shared flag for
                                 * the default ring and give another ring
                                 * the chance to be re-targeted.
                                 */
                                if (rnext->mr_index == 0 &&
                                    !rnext->mr_info.mri_intr.mi_ddi_shared) {
                                        rnext->mr_info.mri_intr.mi_ddi_shared =
                                            B_TRUE;
                                } else {
                                        ring->mr_info.mri_intr.mi_ddi_shared =
                                            B_TRUE;
                                }
                                break;
                        }
                        rnext = rnext->mr_next;
                }
                /*
                 * If rnext is NULL, then no matching ddi_handle was found.
                 * Rx rings get registered first. So if this is a Tx ring,
                 * then go through all the Rx rings and see if there is a
                 * matching ddi handle.
                 */
                if (rnext == NULL && ring->mr_type == MAC_RING_TYPE_TX) {
                        mac_compare_ddi_handle(mip->mi_rx_groups,
                            mip->mi_rx_group_count, ring);
                }
        }

        /* Update ring's status */
        ring->mr_state = MR_FREE;
        ring->mr_flag = 0;

        /* Update the ring count of the group */
        group->mrg_cur_count++;

        /* Create per ring kstats */
        if (ring->mr_stat != NULL) {
                /* mr_mip is only set for rings that get kstats. */
                ring->mr_mip = mip;
                mac_ring_stat_create(ring);
        }

        return (ring);
}
3779 
3780 /*
3781  * Rings are chained together for easy regrouping.
3782  */
3783 static void
3784 mac_init_group(mac_impl_t *mip, mac_group_t *group, int size,
3785     mac_capab_rings_t *cap_rings)
3786 {
3787         int index;
3788 
3789         /*
3790          * Initialize all ring members of this group. Size of zero will not
3791          * enter the loop, so it's safe for initializing an empty group.
3792          */
3793         for (index = size - 1; index >= 0; index--)
3794                 (void) mac_init_ring(mip, group, index, cap_rings);
3795 }
3796 
3797 int
3798 mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3799 {
3800         mac_capab_rings_t       *cap_rings;
3801         mac_group_t             *group;
3802         mac_group_t             *groups;
3803         mac_group_info_t        group_info;
3804         uint_t                  group_free = 0;
3805         uint_t                  ring_left;
3806         mac_ring_t              *ring;
3807         int                     g;
3808         int                     err = 0;
3809         uint_t                  grpcnt;
3810         boolean_t               pseudo_txgrp = B_FALSE;
3811 
3812         switch (rtype) {
3813         case MAC_RING_TYPE_RX:
3814                 ASSERT(mip->mi_rx_groups == NULL);
3815 
3816                 cap_rings = &mip->mi_rx_rings_cap;
3817                 cap_rings->mr_type = MAC_RING_TYPE_RX;
3818                 break;
3819         case MAC_RING_TYPE_TX:
3820                 ASSERT(mip->mi_tx_groups == NULL);
3821 
3822                 cap_rings = &mip->mi_tx_rings_cap;
3823                 cap_rings->mr_type = MAC_RING_TYPE_TX;
3824                 break;
3825         default:
3826                 ASSERT(B_FALSE);
3827         }
3828 
3829         if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS, cap_rings))
3830                 return (0);
3831         grpcnt = cap_rings->mr_gnum;
3832 
3833         /*
3834          * If we have multiple TX rings, but only one TX group, we can
3835          * create pseudo TX groups (one per TX ring) in the MAC layer,
3836          * except for an aggr. For an aggr currently we maintain only
3837          * one group with all the rings (for all its ports), going
3838          * forwards we might change this.
3839          */
3840         if (rtype == MAC_RING_TYPE_TX &&
3841             cap_rings->mr_gnum == 0 && cap_rings->mr_rnum >  0 &&
3842             (mip->mi_state_flags & MIS_IS_AGGR) == 0) {
3843                 /*
3844                  * The -1 here is because we create a default TX group
3845                  * with all the rings in it.
3846                  */
3847                 grpcnt = cap_rings->mr_rnum - 1;
3848                 pseudo_txgrp = B_TRUE;
3849         }
3850 
3851         /*
3852          * Allocate a contiguous buffer for all groups.
3853          */
3854         groups = kmem_zalloc(sizeof (mac_group_t) * (grpcnt+ 1), KM_SLEEP);
3855 
3856         ring_left = cap_rings->mr_rnum;
3857 
3858         /*
3859          * Get all ring groups if any, and get their ring members
3860          * if any.
3861          */
3862         for (g = 0; g < grpcnt; g++) {
3863                 group = groups + g;
3864 
3865                 /* Prepare basic information of the group */
3866                 group->mrg_index = g;
3867                 group->mrg_type = rtype;
3868                 group->mrg_state = MAC_GROUP_STATE_UNINIT;
3869                 group->mrg_mh = (mac_handle_t)mip;
3870                 group->mrg_next = group + 1;
3871 
3872                 /* Zero to reuse the info data structure */
3873                 bzero(&group_info, sizeof (group_info));
3874 
3875                 if (pseudo_txgrp) {
3876                         /*
3877                          * This is a pseudo group that we created, apart
3878                          * from setting the state there is nothing to be
3879                          * done.
3880                          */
3881                         group->mrg_state = MAC_GROUP_STATE_REGISTERED;
3882                         group_free++;
3883                         continue;
3884                 }
3885                 /* Query group information from driver */
3886                 cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
3887                     (mac_group_handle_t)group);
3888 
3889                 switch (cap_rings->mr_group_type) {
3890                 case MAC_GROUP_TYPE_DYNAMIC:
3891                         if (cap_rings->mr_gaddring == NULL ||
3892                             cap_rings->mr_gremring == NULL) {
3893                                 DTRACE_PROBE3(
3894                                     mac__init__rings_no_addremring,
3895                                     char *, mip->mi_name,
3896                                     mac_group_add_ring_t,
3897                                     cap_rings->mr_gaddring,
3898                                     mac_group_add_ring_t,
3899                                     cap_rings->mr_gremring);
3900                                 err = EINVAL;
3901                                 goto bail;
3902                         }
3903 
3904                         switch (rtype) {
3905                         case MAC_RING_TYPE_RX:
3906                                 /*
3907                                  * The first RX group must have non-zero
3908                                  * rings, and the following groups must
3909                                  * have zero rings.
3910                                  */
3911                                 if (g == 0 && group_info.mgi_count == 0) {
3912                                         DTRACE_PROBE1(
3913                                             mac__init__rings__rx__def__zero,
3914                                             char *, mip->mi_name);
3915                                         err = EINVAL;
3916                                         goto bail;
3917                                 }
3918                                 if (g > 0 && group_info.mgi_count != 0) {
3919                                         DTRACE_PROBE3(
3920                                             mac__init__rings__rx__nonzero,
3921                                             char *, mip->mi_name,
3922                                             int, g, int, group_info.mgi_count);
3923                                         err = EINVAL;
3924                                         goto bail;
3925                                 }
3926                                 break;
3927                         case MAC_RING_TYPE_TX:
3928                                 /*
3929                                  * All TX ring groups must have zero rings.
3930                                  */
3931                                 if (group_info.mgi_count != 0) {
3932                                         DTRACE_PROBE3(
3933                                             mac__init__rings__tx__nonzero,
3934                                             char *, mip->mi_name,
3935                                             int, g, int, group_info.mgi_count);
3936                                         err = EINVAL;
3937                                         goto bail;
3938                                 }
3939                                 break;
3940                         }
3941                         break;
3942                 case MAC_GROUP_TYPE_STATIC:
3943                         /*
3944                          * Note that an empty group is allowed, e.g., an aggr
3945                          * would start with an empty group.
3946                          */
3947                         break;
3948                 default:
3949                         /* unknown group type */
3950                         DTRACE_PROBE2(mac__init__rings__unknown__type,
3951                             char *, mip->mi_name,
3952                             int, cap_rings->mr_group_type);
3953                         err = EINVAL;
3954                         goto bail;
3955                 }
3956 
3957 
3958                 /*
3959                  * Driver must register group->mgi_addmac/remmac() for rx groups
3960                  * to support multiple MAC addresses.
3961                  */
3962                 if (rtype == MAC_RING_TYPE_RX) {
3963                         if ((group_info.mgi_addmac == NULL) ||
3964                             (group_info.mgi_addmac == NULL)) {
3965                                 goto bail;
3966                         }
3967                 }
3968 
3969                 /* Cache driver-supplied information */
3970                 group->mrg_info = group_info;
3971 
3972                 /* Update the group's status and group count. */
3973                 mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED);
3974                 group_free++;
3975 
3976                 group->mrg_rings = NULL;
3977                 group->mrg_cur_count = 0;
3978                 mac_init_group(mip, group, group_info.mgi_count, cap_rings);
3979                 ring_left -= group_info.mgi_count;
3980 
3981                 /* The current group size should be equal to default value */
3982                 ASSERT(group->mrg_cur_count == group_info.mgi_count);
3983         }
3984 
3985         /* Build up a dummy group for free resources as a pool */
3986         group = groups + grpcnt;
3987 
3988         /* Prepare basic information of the group */
3989         group->mrg_index = -1;
3990         group->mrg_type = rtype;
3991         group->mrg_state = MAC_GROUP_STATE_UNINIT;
3992         group->mrg_mh = (mac_handle_t)mip;
3993         group->mrg_next = NULL;
3994 
3995         /*
3996          * If there are ungrouped rings, allocate a continuous buffer for
3997          * remaining resources.
3998          */
3999         if (ring_left != 0) {
4000                 group->mrg_rings = NULL;
4001                 group->mrg_cur_count = 0;
4002                 mac_init_group(mip, group, ring_left, cap_rings);
4003 
4004                 /* The current group size should be equal to ring_left */
4005                 ASSERT(group->mrg_cur_count == ring_left);
4006 
4007                 ring_left = 0;
4008 
4009                 /* Update this group's status */
4010                 mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED);
4011         } else
4012                 group->mrg_rings = NULL;
4013 
4014         ASSERT(ring_left == 0);
4015 
4016 bail:
4017 
4018         /* Cache other important information to finalize the initialization */
4019         switch (rtype) {
4020         case MAC_RING_TYPE_RX:
4021                 mip->mi_rx_group_type = cap_rings->mr_group_type;
4022                 mip->mi_rx_group_count = cap_rings->mr_gnum;
4023                 mip->mi_rx_groups = groups;
4024                 mip->mi_rx_donor_grp = groups;
4025                 if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
4026                         /*
4027                          * The default ring is reserved since it is
4028                          * used for sending the broadcast etc. packets.
4029                          */
4030                         mip->mi_rxrings_avail =
4031                             mip->mi_rx_groups->mrg_cur_count - 1;
4032                         mip->mi_rxrings_rsvd = 1;
4033                 }
4034                 /*
4035                  * The default group cannot be reserved. It is used by
4036                  * all the clients that do not have an exclusive group.
4037                  */
4038                 mip->mi_rxhwclnt_avail = mip->mi_rx_group_count - 1;
4039                 mip->mi_rxhwclnt_used = 1;
4040                 break;
4041         case MAC_RING_TYPE_TX:
4042                 mip->mi_tx_group_type = pseudo_txgrp ? MAC_GROUP_TYPE_DYNAMIC :
4043                     cap_rings->mr_group_type;
4044                 mip->mi_tx_group_count = grpcnt;
4045                 mip->mi_tx_group_free = group_free;
4046                 mip->mi_tx_groups = groups;
4047 
4048                 group = groups + grpcnt;
4049                 ring = group->mrg_rings;
4050                 /*
4051                  * The ring can be NULL in the case of aggr. Aggr will
4052                  * have an empty Tx group which will get populated
4053                  * later when pseudo Tx rings are added after
4054                  * mac_register() is done.
4055                  */
4056                 if (ring == NULL) {
4057                         ASSERT(mip->mi_state_flags & MIS_IS_AGGR);
4058                         /*
4059                          * pass the group to aggr so it can add Tx
4060                          * rings to the group later.
4061                          */
4062                         cap_rings->mr_gget(mip->mi_driver, rtype, 0, NULL,
4063                             (mac_group_handle_t)group);
4064                         /*
4065                          * Even though there are no rings at this time
4066                          * (rings will come later), set the group
4067                          * state to registered.
4068                          */
4069                         group->mrg_state = MAC_GROUP_STATE_REGISTERED;
4070                 } else {
4071                         /*
4072                          * Ring 0 is used as the default one and it could be
4073                          * assigned to a client as well.
4074                          */
4075                         while ((ring->mr_index != 0) && (ring->mr_next != NULL))
4076                                 ring = ring->mr_next;
4077                         ASSERT(ring->mr_index == 0);
4078                         mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
4079                 }
4080                 if (mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC)
4081                         mip->mi_txrings_avail = group->mrg_cur_count - 1;
4082                         /*
4083                          * The default ring cannot be reserved.
4084                          */
4085                         mip->mi_txrings_rsvd = 1;
4086                 /*
4087                  * The default group cannot be reserved. It will be shared
4088                  * by clients that do not have an exclusive group.
4089                  */
4090                 mip->mi_txhwclnt_avail = mip->mi_tx_group_count;
4091                 mip->mi_txhwclnt_used = 1;
4092                 break;
4093         default:
4094                 ASSERT(B_FALSE);
4095         }
4096 
4097         if (err != 0)
4098                 mac_free_rings(mip, rtype);
4099 
4100         return (err);
4101 }
4102 
4103 /*
4104  * The ddi interrupt handle could be shared amoung rings. If so, compare
4105  * the new ring's ddi handle with the existing ones and set ddi_shared
4106  * flag.
4107  */
4108 void
4109 mac_compare_ddi_handle(mac_group_t *groups, uint_t grpcnt, mac_ring_t *cring)
4110 {
4111         mac_group_t *group;
4112         mac_ring_t *ring;
4113         ddi_intr_handle_t ddi_handle;
4114         int g;
4115 
4116         ddi_handle = cring->mr_info.mri_intr.mi_ddi_handle;
4117         for (g = 0; g < grpcnt; g++) {
4118                 group = groups + g;
4119                 for (ring = group->mrg_rings; ring != NULL;
4120                     ring = ring->mr_next) {
4121                         if (ring == cring)
4122                                 continue;
4123                         if (ring->mr_info.mri_intr.mi_ddi_handle ==
4124                             ddi_handle) {
4125                                 if (cring->mr_type == MAC_RING_TYPE_RX &&
4126                                     ring->mr_index == 0 &&
4127                                     !ring->mr_info.mri_intr.mi_ddi_shared) {
4128                                         ring->mr_info.mri_intr.mi_ddi_shared =
4129                                             B_TRUE;
4130                                 } else {
4131                                         cring->mr_info.mri_intr.mi_ddi_shared =
4132                                             B_TRUE;
4133                                 }
4134                                 return;
4135                         }
4136                 }
4137         }
4138 }
4139 
4140 /*
4141  * Called to free all groups of particular type (RX or TX). It's assumed that
4142  * no clients are using these groups.
4143  */
4144 void
4145 mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
4146 {
4147         mac_group_t *group, *groups;
4148         uint_t group_count;
4149 
4150         switch (rtype) {
4151         case MAC_RING_TYPE_RX:
4152                 if (mip->mi_rx_groups == NULL)
4153                         return;
4154 
4155                 groups = mip->mi_rx_groups;
4156                 group_count = mip->mi_rx_group_count;
4157 
4158                 mip->mi_rx_groups = NULL;
4159                 mip->mi_rx_donor_grp = NULL;
4160                 mip->mi_rx_group_count = 0;
4161                 break;
4162         case MAC_RING_TYPE_TX:
4163                 ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);
4164 
4165                 if (mip->mi_tx_groups == NULL)
4166                         return;
4167 
4168                 groups = mip->mi_tx_groups;
4169                 group_count = mip->mi_tx_group_count;
4170 
4171                 mip->mi_tx_groups = NULL;
4172                 mip->mi_tx_group_count = 0;
4173                 mip->mi_tx_group_free = 0;
4174                 mip->mi_default_tx_ring = NULL;
4175                 break;
4176         default:
4177                 ASSERT(B_FALSE);
4178         }
4179 
4180         for (group = groups; group != NULL; group = group->mrg_next) {
4181                 mac_ring_t *ring;
4182 
4183                 if (group->mrg_cur_count == 0)
4184                         continue;
4185 
4186                 ASSERT(group->mrg_rings != NULL);
4187 
4188                 while ((ring = group->mrg_rings) != NULL) {
4189                         group->mrg_rings = ring->mr_next;
4190                         mac_ring_free(mip, ring);
4191                 }
4192         }
4193 
4194         /* Free all the cached rings */
4195         mac_ring_freeall(mip);
4196         /* Free the block of group data strutures */
4197         kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
4198 }
4199 
4200 /*
4201  * Associate a MAC address with a receive group.
4202  *
4203  * The return value of this function should always be checked properly, because
4204  * any type of failure could cause unexpected results. A group can be added
4205  * or removed with a MAC address only after it has been reserved. Ideally,
4206  * a successful reservation always leads to calling mac_group_addmac() to
4207  * steer desired traffic. Failure of adding an unicast MAC address doesn't
4208  * always imply that the group is functioning abnormally.
4209  *
4210  * Currently this function is called everywhere, and it reflects assumptions
4211  * about MAC addresses in the implementation. CR 6735196.
4212  */
4213 int
4214 mac_group_addmac(mac_group_t *group, const uint8_t *addr)
4215 {
4216         ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
4217         ASSERT(group->mrg_info.mgi_addmac != NULL);
4218 
4219         return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
4220 }
4221 
4222 /*
4223  * Remove the association between MAC address and receive group.
4224  */
4225 int
4226 mac_group_remmac(mac_group_t *group, const uint8_t *addr)
4227 {
4228         ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
4229         ASSERT(group->mrg_info.mgi_remmac != NULL);
4230 
4231         return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
4232 }
4233 
4234 /*
4235  * This is the entry point for packets transmitted through the bridging code.
4236  * If no bridge is in place, MAC_RING_TX transmits using tx ring. The 'rh'
4237  * pointer may be NULL to select the default ring.
4238  */
4239 mblk_t *
4240 mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp)
4241 {
4242         mac_handle_t mh;
4243 
4244         /*
4245          * Once we take a reference on the bridge link, the bridge
4246          * module itself can't unload, so the callback pointers are
4247          * stable.
4248          */
4249         mutex_enter(&mip->mi_bridge_lock);
4250         if ((mh = mip->mi_bridge_link) != NULL)
4251                 mac_bridge_ref_cb(mh, B_TRUE);
4252         mutex_exit(&mip->mi_bridge_lock);
4253         if (mh == NULL) {
4254                 MAC_RING_TX(mip, rh, mp, mp);
4255         } else {
4256                 mp = mac_bridge_tx_cb(mh, rh, mp);
4257                 mac_bridge_ref_cb(mh, B_FALSE);
4258         }
4259 
4260         return (mp);
4261 }
4262 
4263 /*
4264  * Find a ring from its index.
4265  */
4266 mac_ring_handle_t
4267 mac_find_ring(mac_group_handle_t gh, int index)
4268 {
4269         mac_group_t *group = (mac_group_t *)gh;
4270         mac_ring_t *ring = group->mrg_rings;
4271 
4272         for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
4273                 if (ring->mr_index == index)
4274                         break;
4275 
4276         return ((mac_ring_handle_t)ring);
4277 }
4278 /*
4279  * Add a ring to an existing group.
4280  *
4281  * The ring must be either passed directly (for example if the ring
4282  * movement is initiated by the framework), or specified through a driver
4283  * index (for example when the ring is added by the driver.
4284  *
4285  * The caller needs to call mac_perim_enter() before calling this function.
4286  */
4287 int
4288 i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
4289 {
4290         mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
4291         mac_capab_rings_t *cap_rings;
4292         boolean_t driver_call = (ring == NULL);
4293         mac_group_type_t group_type;
4294         int ret = 0;
4295         flow_entry_t *flent;
4296 
4297         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4298 
4299         switch (group->mrg_type) {
4300         case MAC_RING_TYPE_RX:
4301                 cap_rings = &mip->mi_rx_rings_cap;
4302                 group_type = mip->mi_rx_group_type;
4303                 break;
4304         case MAC_RING_TYPE_TX:
4305                 cap_rings = &mip->mi_tx_rings_cap;
4306                 group_type = mip->mi_tx_group_type;
4307                 break;
4308         default:
4309                 ASSERT(B_FALSE);
4310         }
4311 
4312         /*
4313          * There should be no ring with the same ring index in the target
4314          * group.
4315          */
4316         ASSERT(mac_find_ring((mac_group_handle_t)group,
4317             driver_call ? index : ring->mr_index) == NULL);
4318 
4319         if (driver_call) {
4320                 /*
4321                  * The function is called as a result of a request from
4322                  * a driver to add a ring to an existing group, for example
4323                  * from the aggregation driver. Allocate a new mac_ring_t
4324                  * for that ring.
4325                  */
4326                 ring = mac_init_ring(mip, group, index, cap_rings);
4327                 ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
4328         } else {
4329                 /*
4330                  * The function is called as a result of a MAC layer request
4331                  * to add a ring to an existing group. In this case the
4332                  * ring is being moved between groups, which requires
4333                  * the underlying driver to support dynamic grouping,
4334                  * and the mac_ring_t already exists.
4335                  */
4336                 ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
4337                 ASSERT(group->mrg_driver == NULL ||
4338                     cap_rings->mr_gaddring != NULL);
4339                 ASSERT(ring->mr_gh == NULL);
4340         }
4341 
4342         /*
4343          * At this point the ring should not be in use, and it should be
4344          * of the right for the target group.
4345          */
4346         ASSERT(ring->mr_state < MR_INUSE);
4347         ASSERT(ring->mr_srs == NULL);
4348         ASSERT(ring->mr_type == group->mrg_type);
4349 
4350         if (!driver_call) {
4351                 /*
4352                  * Add the driver level hardware ring if the process was not
4353                  * initiated by the driver, and the target group is not the
4354                  * group.
4355                  */
4356                 if (group->mrg_driver != NULL) {
4357                         cap_rings->mr_gaddring(group->mrg_driver,
4358                             ring->mr_driver, ring->mr_type);
4359                 }
4360 
4361                 /*
4362                  * Insert the ring ahead existing rings.
4363                  */
4364                 ring->mr_next = group->mrg_rings;
4365                 group->mrg_rings = ring;
4366                 ring->mr_gh = (mac_group_handle_t)group;
4367                 group->mrg_cur_count++;
4368         }
4369 
4370         /*
4371          * If the group has not been actively used, we're done.
4372          */
4373         if (group->mrg_index != -1 &&
4374             group->mrg_state < MAC_GROUP_STATE_RESERVED)
4375                 return (0);
4376 
4377         /*
4378          * Start the ring if needed. Failure causes to undo the grouping action.
4379          */
4380         if (ring->mr_state != MR_INUSE) {
4381                 if ((ret = mac_start_ring(ring)) != 0) {
4382                         if (!driver_call) {
4383                                 cap_rings->mr_gremring(group->mrg_driver,
4384                                     ring->mr_driver, ring->mr_type);
4385                         }
4386                         group->mrg_cur_count--;
4387                         group->mrg_rings = ring->mr_next;
4388 
4389                         ring->mr_gh = NULL;
4390 
4391                         if (driver_call)
4392                                 mac_ring_free(mip, ring);
4393 
4394                         return (ret);
4395                 }
4396         }
4397 
4398         /*
4399          * Set up SRS/SR according to the ring type.
4400          */
4401         switch (ring->mr_type) {
4402         case MAC_RING_TYPE_RX:
4403                 /*
4404                  * Setup SRS on top of the new ring if the group is
4405                  * reserved for someones exclusive use.
4406                  */
4407                 if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
4408                         mac_client_impl_t *mcip;
4409 
4410                         mcip = MAC_GROUP_ONLY_CLIENT(group);
4411                         /*
4412                          * Even though this group is reserved we migth still
4413                          * have multiple clients, i.e a VLAN shares the
4414                          * group with the primary mac client.
4415                          */
4416                         if (mcip != NULL) {
4417                                 flent = mcip->mci_flent;
4418                                 ASSERT(flent->fe_rx_srs_cnt > 0);
4419                                 mac_rx_srs_group_setup(mcip, flent, SRST_LINK);
4420                                 mac_fanout_setup(mcip, flent,
4421                                     MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver,
4422                                     mcip, NULL, NULL);
4423                         } else {
4424                                 ring->mr_classify_type = MAC_SW_CLASSIFIER;
4425                         }
4426                 }
4427                 break;
4428         case MAC_RING_TYPE_TX:
4429         {
4430                 mac_grp_client_t        *mgcp = group->mrg_clients;
4431                 mac_client_impl_t       *mcip;
4432                 mac_soft_ring_set_t     *mac_srs;
4433                 mac_srs_tx_t            *tx;
4434 
4435                 if (MAC_GROUP_NO_CLIENT(group)) {
4436                         if (ring->mr_state == MR_INUSE)
4437                                 mac_stop_ring(ring);
4438                         ring->mr_flag = 0;
4439                         break;
4440                 }
4441                 /*
4442                  * If the rings are being moved to a group that has
4443                  * clients using it, then add the new rings to the
4444                  * clients SRS.
4445                  */
4446                 while (mgcp != NULL) {
4447                         boolean_t       is_aggr;
4448 
4449                         mcip = mgcp->mgc_client;
4450                         flent = mcip->mci_flent;
4451                         is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR);
4452                         mac_srs = MCIP_TX_SRS(mcip);
4453                         tx = &mac_srs->srs_tx;
4454                         mac_tx_client_quiesce((mac_client_handle_t)mcip);
4455                         /*
4456                          * If we are  growing from 1 to multiple rings.
4457                          */
4458                         if (tx->st_mode == SRS_TX_BW ||
4459                             tx->st_mode == SRS_TX_SERIALIZE ||
4460                             tx->st_mode == SRS_TX_DEFAULT) {
4461                                 mac_ring_t      *tx_ring = tx->st_arg2;
4462 
4463                                 tx->st_arg2 = NULL;
4464                                 mac_tx_srs_stat_recreate(mac_srs, B_TRUE);
4465                                 mac_tx_srs_add_ring(mac_srs, tx_ring);
4466                                 if (mac_srs->srs_type & SRST_BW_CONTROL) {
4467                                         tx->st_mode = is_aggr ? SRS_TX_BW_AGGR :
4468                                             SRS_TX_BW_FANOUT;
4469                                 } else {
4470                                         tx->st_mode = is_aggr ? SRS_TX_AGGR :
4471                                             SRS_TX_FANOUT;
4472                                 }
4473                                 tx->st_func = mac_tx_get_func(tx->st_mode);
4474                         }
4475                         mac_tx_srs_add_ring(mac_srs, ring);
4476                         mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
4477                             mac_rx_deliver, mcip, NULL, NULL);
4478                         mac_tx_client_restart((mac_client_handle_t)mcip);
4479                         mgcp = mgcp->mgc_next;
4480                 }
4481                 break;
4482         }
4483         default:
4484                 ASSERT(B_FALSE);
4485         }
4486         /*
4487          * For aggr, the default ring will be NULL to begin with. If it
4488          * is NULL, then pick the first ring that gets added as the
4489          * default ring. Any ring in an aggregation can be removed at
4490          * any time (by the user action of removing a link) and if the
4491          * current default ring gets removed, then a new one gets
4492          * picked (see i_mac_group_rem_ring()).
4493          */
4494         if (mip->mi_state_flags & MIS_IS_AGGR &&
4495             mip->mi_default_tx_ring == NULL &&
4496             ring->mr_type == MAC_RING_TYPE_TX) {
4497                 mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
4498         }
4499 
4500         MAC_RING_UNMARK(ring, MR_INCIPIENT);
4501         return (0);
4502 }
4503 
4504 /*
4505  * Remove a ring from it's current group. MAC internal function for dynamic
4506  * grouping.
4507  *
4508  * The caller needs to call mac_perim_enter() before calling this function.
4509  */
4510 void
4511 i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
4512     boolean_t driver_call)
4513 {
4514         mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
4515         mac_capab_rings_t *cap_rings = NULL;
4516         mac_group_type_t group_type;
4517 
4518         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4519 
4520         ASSERT(mac_find_ring((mac_group_handle_t)group,
4521             ring->mr_index) == (mac_ring_handle_t)ring);
4522         ASSERT((mac_group_t *)ring->mr_gh == group);
4523         ASSERT(ring->mr_type == group->mrg_type);
4524 
4525         if (ring->mr_state == MR_INUSE)
4526                 mac_stop_ring(ring);
4527         switch (ring->mr_type) {
4528         case MAC_RING_TYPE_RX:
4529                 group_type = mip->mi_rx_group_type;
4530                 cap_rings = &mip->mi_rx_rings_cap;
4531 
4532                 /*
4533                  * Only hardware classified packets hold a reference to the
4534                  * ring all the way up the Rx path. mac_rx_srs_remove()
4535                  * will take care of quiescing the Rx path and removing the
4536                  * SRS. The software classified path neither holds a reference
4537                  * nor any association with the ring in mac_rx.
4538                  */
4539                 if (ring->mr_srs != NULL) {
4540                         mac_rx_srs_remove(ring->mr_srs);
4541                         ring->mr_srs = NULL;
4542                 }
4543 
4544                 break;
4545         case MAC_RING_TYPE_TX:
4546         {
4547                 mac_grp_client_t        *mgcp;
4548                 mac_client_impl_t       *mcip;
4549                 mac_soft_ring_set_t     *mac_srs;
4550                 mac_srs_tx_t            *tx;
4551                 mac_ring_t              *rem_ring;
4552                 mac_group_t             *defgrp;
4553                 uint_t                  ring_info = 0;
4554 
4555                 /*
4556                  * For TX this function is invoked in three
4557                  * cases:
4558                  *
4559                  * 1) In the case of a failure during the
4560                  * initial creation of a group when a share is
4561                  * associated with a MAC client. So the SRS is not
4562                  * yet setup, and will be setup later after the
4563                  * group has been reserved and populated.
4564                  *
4565                  * 2) From mac_release_tx_group() when freeing
4566                  * a TX SRS.
4567                  *
4568                  * 3) In the case of aggr, when a port gets removed,
4569                  * the pseudo Tx rings that it exposed gets removed.
4570                  *
4571                  * In the first two cases the SRS and its soft
4572                  * rings are already quiesced.
4573                  */
4574                 if (driver_call) {
4575                         mac_client_impl_t *mcip;
4576                         mac_soft_ring_set_t *mac_srs;
4577                         mac_soft_ring_t *sringp;
4578                         mac_srs_tx_t *srs_tx;
4579 
4580                         if (mip->mi_state_flags & MIS_IS_AGGR &&
4581                             mip->mi_default_tx_ring ==
4582                             (mac_ring_handle_t)ring) {
4583                                 /* pick a new default Tx ring */
4584                                 mip->mi_default_tx_ring =
4585                                     (group->mrg_rings != ring) ?
4586                                     (mac_ring_handle_t)group->mrg_rings :
4587                                     (mac_ring_handle_t)(ring->mr_next);
4588                         }
4589                         /* Presently only aggr case comes here */
4590                         if (group->mrg_state != MAC_GROUP_STATE_RESERVED)
4591                                 break;
4592 
4593                         mcip = MAC_GROUP_ONLY_CLIENT(group);
4594                         ASSERT(mcip != NULL);
4595                         ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR);
4596                         mac_srs = MCIP_TX_SRS(mcip);
4597                         ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
4598                             mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
4599                         srs_tx = &mac_srs->srs_tx;
4600                         /*
4601                          * Wakeup any callers blocked on this
4602                          * Tx ring due to flow control.
4603                          */
4604                         sringp = srs_tx->st_soft_rings[ring->mr_index];
4605                         ASSERT(sringp != NULL);
4606                         mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)sringp);
4607                         mac_tx_client_quiesce((mac_client_handle_t)mcip);
4608                         mac_tx_srs_del_ring(mac_srs, ring);
4609                         mac_tx_client_restart((mac_client_handle_t)mcip);
4610                         break;
4611                 }
4612                 ASSERT(ring != (mac_ring_t *)mip->mi_default_tx_ring);
4613                 group_type = mip->mi_tx_group_type;
4614                 cap_rings = &mip->mi_tx_rings_cap;
4615                 /*
4616                  * See if we need to take it out of the MAC clients using
4617                  * this group
4618                  */
4619                 if (MAC_GROUP_NO_CLIENT(group))
4620                         break;
4621                 mgcp = group->mrg_clients;
4622                 defgrp = MAC_DEFAULT_TX_GROUP(mip);
4623                 while (mgcp != NULL) {
4624                         mcip = mgcp->mgc_client;
4625                         mac_srs = MCIP_TX_SRS(mcip);
4626                         tx = &mac_srs->srs_tx;
4627                         mac_tx_client_quiesce((mac_client_handle_t)mcip);
4628                         /*
4629                          * If we are here when removing rings from the
4630                          * defgroup, mac_reserve_tx_ring would have
4631                          * already deleted the ring from the MAC
4632                          * clients in the group.
4633                          */
4634                         if (group != defgrp) {
4635                                 mac_tx_invoke_callbacks(mcip,
4636                                     (mac_tx_cookie_t)
4637                                     mac_tx_srs_get_soft_ring(mac_srs, ring));
4638                                 mac_tx_srs_del_ring(mac_srs, ring);
4639                         }
4640                         /*
4641                          * Additionally, if  we are left with only
4642                          * one ring in the group after this, we need
4643                          * to modify the mode etc. to. (We haven't
4644                          * yet taken the ring out, so we check with 2).
4645                          */
4646                         if (group->mrg_cur_count == 2) {
4647                                 if (ring->mr_next == NULL)
4648                                         rem_ring = group->mrg_rings;
4649                                 else
4650                                         rem_ring = ring->mr_next;
4651                                 mac_tx_invoke_callbacks(mcip,
4652                                     (mac_tx_cookie_t)
4653                                     mac_tx_srs_get_soft_ring(mac_srs,
4654                                     rem_ring));
4655                                 mac_tx_srs_del_ring(mac_srs, rem_ring);
4656                                 if (rem_ring->mr_state != MR_INUSE) {
4657                                         (void) mac_start_ring(rem_ring);
4658                                 }
4659                                 tx->st_arg2 = (void *)rem_ring;
4660                                 mac_tx_srs_stat_recreate(mac_srs, B_FALSE);
4661                                 ring_info = mac_hwring_getinfo(
4662                                     (mac_ring_handle_t)rem_ring);
4663                                 /*
4664                                  * We are  shrinking from multiple
4665                                  * to 1 ring.
4666                                  */
4667                                 if (mac_srs->srs_type & SRST_BW_CONTROL) {
4668                                         tx->st_mode = SRS_TX_BW;
4669                                 } else if (mac_tx_serialize ||
4670                                     (ring_info & MAC_RING_TX_SERIALIZE)) {
4671                                         tx->st_mode = SRS_TX_SERIALIZE;
4672                                 } else {
4673                                         tx->st_mode = SRS_TX_DEFAULT;
4674                                 }
4675                                 tx->st_func = mac_tx_get_func(tx->st_mode);
4676                         }
4677                         mac_tx_client_restart((mac_client_handle_t)mcip);
4678                         mgcp = mgcp->mgc_next;
4679                 }
4680                 break;
4681         }
4682         default:
4683                 ASSERT(B_FALSE);
4684         }
4685 
4686         /*
4687          * Remove the ring from the group.
4688          */
4689         if (ring == group->mrg_rings)
4690                 group->mrg_rings = ring->mr_next;
4691         else {
4692                 mac_ring_t *pre;
4693 
4694                 pre = group->mrg_rings;
4695                 while (pre->mr_next != ring)
4696                         pre = pre->mr_next;
4697                 pre->mr_next = ring->mr_next;
4698         }
4699         group->mrg_cur_count--;
4700 
4701         if (!driver_call) {
4702                 ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
4703                 ASSERT(group->mrg_driver == NULL ||
4704                     cap_rings->mr_gremring != NULL);
4705 
4706                 /*
4707                  * Remove the driver level hardware ring.
4708                  */
4709                 if (group->mrg_driver != NULL) {
4710                         cap_rings->mr_gremring(group->mrg_driver,
4711                             ring->mr_driver, ring->mr_type);
4712                 }
4713         }
4714 
4715         ring->mr_gh = NULL;
4716         if (driver_call)
4717                 mac_ring_free(mip, ring);
4718         else
4719                 ring->mr_flag = 0;
4720 }
4721 
4722 /*
4723  * Move a ring to the target group. If needed, remove the ring from the group
4724  * that it currently belongs to.
4725  *
4726  * The caller need to enter MAC's perimeter by calling mac_perim_enter().
4727  */
4728 static int
4729 mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
4730 {
4731         mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
4732         int rv;
4733 
4734         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4735         ASSERT(d_group != NULL);
4736         ASSERT(s_group->mrg_mh == d_group->mrg_mh);
4737 
4738         if (s_group == d_group)
4739                 return (0);
4740 
4741         /*
4742          * Remove it from current group first.
4743          */
4744         if (s_group != NULL)
4745                 i_mac_group_rem_ring(s_group, ring, B_FALSE);
4746 
4747         /*
4748          * Add it to the new group.
4749          */
4750         rv = i_mac_group_add_ring(d_group, ring, 0);
4751         if (rv != 0) {
4752                 /*
4753                  * Failed to add ring back to source group. If
4754                  * that fails, the ring is stuck in limbo, log message.
4755                  */
4756                 if (i_mac_group_add_ring(s_group, ring, 0)) {
4757                         cmn_err(CE_WARN, "%s: failed to move ring %p\n",
4758                             mip->mi_name, (void *)ring);
4759                 }
4760         }
4761 
4762         return (rv);
4763 }
4764 
4765 /*
4766  * Find a MAC address according to its value.
4767  */
4768 mac_address_t *
4769 mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
4770 {
4771         mac_address_t *map;
4772 
4773         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4774 
4775         for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
4776                 if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
4777                         break;
4778         }
4779 
4780         return (map);
4781 }
4782 
4783 /*
4784  * Check whether the MAC address is shared by multiple clients.
4785  */
4786 boolean_t
4787 mac_check_macaddr_shared(mac_address_t *map)
4788 {
4789         ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
4790 
4791         return (map->ma_nusers > 1);
4792 }
4793 
4794 /*
4795  * Remove the specified MAC address from the MAC address list and free it.
4796  */
4797 static void
4798 mac_free_macaddr(mac_address_t *map)
4799 {
4800         mac_impl_t *mip = map->ma_mip;
4801 
4802         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4803         ASSERT(mip->mi_addresses != NULL);
4804 
4805         map = mac_find_macaddr(mip, map->ma_addr);
4806 
4807         ASSERT(map != NULL);
4808         ASSERT(map->ma_nusers == 0);
4809 
4810         if (map == mip->mi_addresses) {
4811                 mip->mi_addresses = map->ma_next;
4812         } else {
4813                 mac_address_t *pre;
4814 
4815                 pre = mip->mi_addresses;
4816                 while (pre->ma_next != map)
4817                         pre = pre->ma_next;
4818                 pre->ma_next = map->ma_next;
4819         }
4820 
4821         kmem_free(map, sizeof (mac_address_t));
4822 }
4823 
4824 /*
4825  * Add a MAC address reference for a client. If the desired MAC address
4826  * exists, add a reference to it. Otherwise, add the new address by adding
4827  * it to a reserved group or setting promiscuous mode. Won't try a different
4828  * group if the group is non-NULL, so the caller must explicitly share the
4829  * default group when needed.
4830  *
4831  * Note, the primary MAC address is initialized at registration time, so
4832  * to add it to default group only need to activate it if its reference
4833  * count is still zero. Also, some drivers may not have advertised RINGS
4834  * capability.
4835  */
4836 int
4837 mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr,
4838     boolean_t use_hw)
4839 {
4840         mac_address_t *map;
4841         int err = 0;
4842         boolean_t allocated_map = B_FALSE;
4843 
4844         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4845 
4846         map = mac_find_macaddr(mip, mac_addr);
4847 
4848         /*
4849          * If the new MAC address has not been added. Allocate a new one
4850          * and set it up.
4851          */
4852         if (map == NULL) {
4853                 map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
4854                 map->ma_len = mip->mi_type->mt_addr_length;
4855                 bcopy(mac_addr, map->ma_addr, map->ma_len);
4856                 map->ma_nusers = 0;
4857                 map->ma_group = group;
4858                 map->ma_mip = mip;
4859 
4860                 /* add the new MAC address to the head of the address list */
4861                 map->ma_next = mip->mi_addresses;
4862                 mip->mi_addresses = map;
4863 
4864                 allocated_map = B_TRUE;
4865         }
4866 
4867         ASSERT(map->ma_group == NULL || map->ma_group == group);
4868         if (map->ma_group == NULL)
4869                 map->ma_group = group;
4870 
4871         /*
4872          * If the MAC address is already in use, simply account for the
4873          * new client.
4874          */
4875         if (map->ma_nusers++ > 0)
4876                 return (0);
4877 
4878         /*
4879          * Activate this MAC address by adding it to the reserved group.
4880          */
4881         if (group != NULL) {
4882                 err = mac_group_addmac(group, (const uint8_t *)mac_addr);
4883                 if (err == 0) {
4884                         map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4885                         return (0);
4886                 }
4887         }
4888 
4889         /*
4890          * The MAC address addition failed. If the client requires a
4891          * hardware classified MAC address, fail the operation.
4892          */
4893         if (use_hw) {
4894                 err = ENOSPC;
4895                 goto bail;
4896         }
4897 
4898         /*
4899          * Try promiscuous mode.
4900          *
4901          * For drivers that don't advertise RINGS capability, do
4902          * nothing for the primary address.
4903          */
4904         if ((group == NULL) &&
4905             (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
4906                 map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4907                 return (0);
4908         }
4909 
4910         /*
4911          * Enable promiscuous mode in order to receive traffic
4912          * to the new MAC address.
4913          */
4914         if ((err = i_mac_promisc_set(mip, B_TRUE)) == 0) {
4915                 map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
4916                 return (0);
4917         }
4918 
4919         /*
4920          * Free the MAC address that could not be added. Don't free
4921          * a pre-existing address, it could have been the entry
4922          * for the primary MAC address which was pre-allocated by
4923          * mac_init_macaddr(), and which must remain on the list.
4924          */
4925 bail:
4926         map->ma_nusers--;
4927         if (allocated_map)
4928                 mac_free_macaddr(map);
4929         return (err);
4930 }
4931 
4932 /*
4933  * Remove a reference to a MAC address. This may cause to remove the MAC
4934  * address from an associated group or to turn off promiscuous mode.
4935  * The caller needs to handle the failure properly.
4936  */
4937 int
4938 mac_remove_macaddr(mac_address_t *map)
4939 {
4940         mac_impl_t *mip = map->ma_mip;
4941         int err = 0;
4942 
4943         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4944 
4945         ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
4946 
4947         /*
4948          * If it's not the last client using this MAC address, only update
4949          * the MAC clients count.
4950          */
4951         if (--map->ma_nusers > 0)
4952                 return (0);
4953 
4954         /*
4955          * The MAC address is no longer used by any MAC client, so remove
4956          * it from its associated group, or turn off promiscuous mode
4957          * if it was enabled for the MAC address.
4958          */
4959         switch (map->ma_type) {
4960         case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
4961                 /*
4962                  * Don't free the preset primary address for drivers that
4963                  * don't advertise RINGS capability.
4964                  */
4965                 if (map->ma_group == NULL)
4966                         return (0);
4967 
4968                 err = mac_group_remmac(map->ma_group, map->ma_addr);
4969                 if (err == 0)
4970                         map->ma_group = NULL;
4971                 break;
4972         case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
4973                 err = i_mac_promisc_set(mip, B_FALSE);
4974                 break;
4975         default:
4976                 ASSERT(B_FALSE);
4977         }
4978 
4979         if (err != 0)
4980                 return (err);
4981 
4982         /*
4983          * We created MAC address for the primary one at registration, so we
4984          * won't free it here. mac_fini_macaddr() will take care of it.
4985          */
4986         if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
4987                 mac_free_macaddr(map);
4988 
4989         return (0);
4990 }
4991 
4992 /*
4993  * Update an existing MAC address. The caller need to make sure that the new
4994  * value has not been used.
4995  */
4996 int
4997 mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
4998 {
4999         mac_impl_t *mip = map->ma_mip;
5000         int err = 0;
5001 
5002         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5003         ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
5004 
5005         switch (map->ma_type) {
5006         case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
5007                 /*
5008                  * Update the primary address for drivers that are not
5009                  * RINGS capable.
5010                  */
5011                 if (mip->mi_rx_groups == NULL) {
5012                         err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
5013                             mac_addr);
5014                         if (err != 0)
5015                                 return (err);
5016                         break;
5017                 }
5018 
5019                 /*
5020                  * If this MAC address is not currently in use,
5021                  * simply break out and update the value.
5022                  */
5023                 if (map->ma_nusers == 0)
5024                         break;
5025 
5026                 /*
5027                  * Need to replace the MAC address associated with a group.
5028                  */
5029                 err = mac_group_remmac(map->ma_group, map->ma_addr);
5030                 if (err != 0)
5031                         return (err);
5032 
5033                 err = mac_group_addmac(map->ma_group, mac_addr);
5034 
5035                 /*
5036                  * Failure hints hardware error. The MAC layer needs to
5037                  * have error notification facility to handle this.
5038                  * Now, simply try to restore the value.
5039                  */
5040                 if (err != 0)
5041                         (void) mac_group_addmac(map->ma_group, map->ma_addr);
5042 
5043                 break;
5044         case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
5045                 /*
5046                  * Need to do nothing more if in promiscuous mode.
5047                  */
5048                 break;
5049         default:
5050                 ASSERT(B_FALSE);
5051         }
5052 
5053         /*
5054          * Successfully replaced the MAC address.
5055          */
5056         if (err == 0)
5057                 bcopy(mac_addr, map->ma_addr, map->ma_len);
5058 
5059         return (err);
5060 }
5061 
5062 /*
5063  * Freshen the MAC address with new value. Its caller must have updated the
5064  * hardware MAC address before calling this function.
5065  * This funcitons is supposed to be used to handle the MAC address change
5066  * notification from underlying drivers.
5067  */
5068 void
5069 mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
5070 {
5071         mac_impl_t *mip = map->ma_mip;
5072 
5073         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5074         ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
5075 
5076         /*
5077          * Freshen the MAC address with new value.
5078          */
5079         bcopy(mac_addr, map->ma_addr, map->ma_len);
5080         bcopy(mac_addr, mip->mi_addr, map->ma_len);
5081 
5082         /*
5083          * Update all MAC clients that share this MAC address.
5084          */
5085         mac_unicast_update_clients(mip, map);
5086 }
5087 
5088 /*
5089  * Set up the primary MAC address.
5090  */
5091 void
5092 mac_init_macaddr(mac_impl_t *mip)
5093 {
5094         mac_address_t *map;
5095 
5096         /*
5097          * The reference count is initialized to zero, until it's really
5098          * activated.
5099          */
5100         map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
5101         map->ma_len = mip->mi_type->mt_addr_length;
5102         bcopy(mip->mi_addr, map->ma_addr, map->ma_len);
5103 
5104         /*
5105          * If driver advertises RINGS capability, it shouldn't have initialized
5106          * its primary MAC address. For other drivers, including VNIC, the
5107          * primary address must work after registration.
5108          */
5109         if (mip->mi_rx_groups == NULL)
5110                 map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
5111 
5112         map->ma_mip = mip;
5113 
5114         mip->mi_addresses = map;
5115 }
5116 
5117 /*
5118  * Clean up the primary MAC address. Note, only one primary MAC address
5119  * is allowed. All other MAC addresses must have been freed appropriately.
5120  */
5121 void
5122 mac_fini_macaddr(mac_impl_t *mip)
5123 {
5124         mac_address_t *map = mip->mi_addresses;
5125 
5126         if (map == NULL)
5127                 return;
5128 
5129         /*
5130          * If mi_addresses is initialized, there should be exactly one
5131          * entry left on the list with no users.
5132          */
5133         ASSERT(map->ma_nusers == 0);
5134         ASSERT(map->ma_next == NULL);
5135 
5136         kmem_free(map, sizeof (mac_address_t));
5137         mip->mi_addresses = NULL;
5138 }
5139 
5140 /*
5141  * Logging related functions.
5142  *
5143  * Note that Kernel statistics have been extended to maintain fine
5144  * granularity of statistics viz. hardware lane, software lane, fanout
5145  * stats etc. However, extended accounting continues to support only
5146  * aggregate statistics like before.
5147  */
5148 
5149 /* Write the flow description to a netinfo_t record */
5150 static netinfo_t *
5151 mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
5152 {
5153         netinfo_t               *ninfo;
5154         net_desc_t              *ndesc;
5155         flow_desc_t             *fdesc;
5156         mac_resource_props_t    *mrp;
5157 
5158         ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5159         if (ninfo == NULL)
5160                 return (NULL);
5161         ndesc = kmem_zalloc(sizeof (net_desc_t), KM_NOSLEEP);
5162         if (ndesc == NULL) {
5163                 kmem_free(ninfo, sizeof (netinfo_t));
5164                 return (NULL);
5165         }
5166 
5167         /*
5168          * Grab the fe_lock to see a self-consistent fe_flow_desc.
5169          * Updates to the fe_flow_desc are done under the fe_lock
5170          */
5171         mutex_enter(&flent->fe_lock);
5172         fdesc = &flent->fe_flow_desc;
5173         mrp = &flent->fe_resource_props;
5174 
5175         ndesc->nd_name = flent->fe_flow_name;
5176         ndesc->nd_devname = mcip->mci_name;
5177         bcopy(fdesc->fd_src_mac, ndesc->nd_ehost, ETHERADDRL);
5178         bcopy(fdesc->fd_dst_mac, ndesc->nd_edest, ETHERADDRL);
5179         ndesc->nd_sap = htonl(fdesc->fd_sap);
5180         ndesc->nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
5181         ndesc->nd_bw_limit = mrp->mrp_maxbw;
5182         if (ndesc->nd_isv4) {
5183                 ndesc->nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
5184                 ndesc->nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
5185         } else {
5186                 bcopy(&fdesc->fd_local_addr, ndesc->nd_saddr, IPV6_ADDR_LEN);
5187                 bcopy(&fdesc->fd_remote_addr, ndesc->nd_daddr, IPV6_ADDR_LEN);
5188         }
5189         ndesc->nd_sport = htons(fdesc->fd_local_port);
5190         ndesc->nd_dport = htons(fdesc->fd_remote_port);
5191         ndesc->nd_protocol = (uint8_t)fdesc->fd_protocol;
5192         mutex_exit(&flent->fe_lock);
5193 
5194         ninfo->ni_record = ndesc;
5195         ninfo->ni_size = sizeof (net_desc_t);
5196         ninfo->ni_type = EX_NET_FLDESC_REC;
5197 
5198         return (ninfo);
5199 }
5200 
5201 /* Write the flow statistics to a netinfo_t record */
5202 static netinfo_t *
5203 mac_write_flow_stats(flow_entry_t *flent)
5204 {
5205         netinfo_t               *ninfo;
5206         net_stat_t              *nstat;
5207         mac_soft_ring_set_t     *mac_srs;
5208         mac_rx_stats_t          *mac_rx_stat;
5209         mac_tx_stats_t          *mac_tx_stat;
5210         int                     i;
5211 
5212         ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5213         if (ninfo == NULL)
5214                 return (NULL);
5215         nstat = kmem_zalloc(sizeof (net_stat_t), KM_NOSLEEP);
5216         if (nstat == NULL) {
5217                 kmem_free(ninfo, sizeof (netinfo_t));
5218                 return (NULL);
5219         }
5220 
5221         nstat->ns_name = flent->fe_flow_name;
5222         for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
5223                 mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
5224                 mac_rx_stat = &mac_srs->srs_rx.sr_stat;
5225 
5226                 nstat->ns_ibytes += mac_rx_stat->mrs_intrbytes +
5227                     mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;
5228                 nstat->ns_ipackets += mac_rx_stat->mrs_intrcnt +
5229                     mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
5230                 nstat->ns_oerrors += mac_rx_stat->mrs_ierrors;
5231         }
5232 
5233         mac_srs = (mac_soft_ring_set_t *)(flent->fe_tx_srs);
5234         if (mac_srs != NULL) {
5235                 mac_tx_stat = &mac_srs->srs_tx.st_stat;
5236 
5237                 nstat->ns_obytes = mac_tx_stat->mts_obytes;
5238                 nstat->ns_opackets = mac_tx_stat->mts_opackets;
5239                 nstat->ns_oerrors = mac_tx_stat->mts_oerrors;
5240         }
5241 
5242         ninfo->ni_record = nstat;
5243         ninfo->ni_size = sizeof (net_stat_t);
5244         ninfo->ni_type = EX_NET_FLSTAT_REC;
5245 
5246         return (ninfo);
5247 }
5248 
5249 /* Write the link description to a netinfo_t record */
5250 static netinfo_t *
5251 mac_write_link_desc(mac_client_impl_t *mcip)
5252 {
5253         netinfo_t               *ninfo;
5254         net_desc_t              *ndesc;
5255         flow_entry_t            *flent = mcip->mci_flent;
5256 
5257         ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5258         if (ninfo == NULL)
5259                 return (NULL);
5260         ndesc = kmem_zalloc(sizeof (net_desc_t), KM_NOSLEEP);
5261         if (ndesc == NULL) {
5262                 kmem_free(ninfo, sizeof (netinfo_t));
5263                 return (NULL);
5264         }
5265 
5266         ndesc->nd_name = mcip->mci_name;
5267         ndesc->nd_devname = mcip->mci_name;
5268         ndesc->nd_isv4 = B_TRUE;
5269         /*
5270          * Grab the fe_lock to see a self-consistent fe_flow_desc.
5271          * Updates to the fe_flow_desc are done under the fe_lock
5272          * after removing the flent from the flow table.
5273          */
5274         mutex_enter(&flent->fe_lock);
5275         bcopy(flent->fe_flow_desc.fd_src_mac, ndesc->nd_ehost, ETHERADDRL);
5276         mutex_exit(&flent->fe_lock);
5277 
5278         ninfo->ni_record = ndesc;
5279         ninfo->ni_size = sizeof (net_desc_t);
5280         ninfo->ni_type = EX_NET_LNDESC_REC;
5281 
5282         return (ninfo);
5283 }
5284 
5285 /* Write the link statistics to a netinfo_t record */
5286 static netinfo_t *
5287 mac_write_link_stats(mac_client_impl_t *mcip)
5288 {
5289         netinfo_t               *ninfo;
5290         net_stat_t              *nstat;
5291         flow_entry_t            *flent;
5292         mac_soft_ring_set_t     *mac_srs;
5293         mac_rx_stats_t          *mac_rx_stat;
5294         mac_tx_stats_t          *mac_tx_stat;
5295         int                     i;
5296 
5297         ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5298         if (ninfo == NULL)
5299                 return (NULL);
5300         nstat = kmem_zalloc(sizeof (net_stat_t), KM_NOSLEEP);
5301         if (nstat == NULL) {
5302                 kmem_free(ninfo, sizeof (netinfo_t));
5303                 return (NULL);
5304         }
5305 
5306         nstat->ns_name = mcip->mci_name;
5307         flent = mcip->mci_flent;
5308         if (flent != NULL)  {
5309                 for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
5310                         mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
5311                         mac_rx_stat = &mac_srs->srs_rx.sr_stat;
5312 
5313                         nstat->ns_ibytes += mac_rx_stat->mrs_intrbytes +
5314                             mac_rx_stat->mrs_pollbytes +
5315                             mac_rx_stat->mrs_lclbytes;
5316                         nstat->ns_ipackets += mac_rx_stat->mrs_intrcnt +
5317                             mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
5318                         nstat->ns_oerrors += mac_rx_stat->mrs_ierrors;
5319                 }
5320         }
5321 
5322         mac_srs = (mac_soft_ring_set_t *)(mcip->mci_flent->fe_tx_srs);
5323         if (mac_srs != NULL) {
5324                 mac_tx_stat = &mac_srs->srs_tx.st_stat;
5325 
5326                 nstat->ns_obytes = mac_tx_stat->mts_obytes;
5327                 nstat->ns_opackets = mac_tx_stat->mts_opackets;
5328                 nstat->ns_oerrors = mac_tx_stat->mts_oerrors;
5329         }
5330 
5331         ninfo->ni_record = nstat;
5332         ninfo->ni_size = sizeof (net_stat_t);
5333         ninfo->ni_type = EX_NET_LNSTAT_REC;
5334 
5335         return (ninfo);
5336 }
5337 
/*
 * State handed to the logging walkers (i_mac_impl_log() and
 * mac_log_flowinfo()) for one logging pass.
 */
typedef struct i_mac_log_state_s {
        /* B_TRUE for the final pass made when logging is being stopped */
        boolean_t       mi_last;
        /* value of mac_flow_log_enable sampled at the start of the pass */
        int             mi_fenable;
        /* value of mac_link_log_enable sampled at the start of the pass */
        int             mi_lenable;
        /* list to which the generated netinfo_t records are appended */
        list_t          *mi_list;
} i_mac_log_state_t;
5344 
5345 /*
5346  * For a given flow, if the description has not been logged before, do it now.
5347  * If it is a VNIC, then we have collected information about it from the MAC
5348  * table, so skip it.
5349  *
5350  * Called through mac_flow_walk_nolock()
5351  *
5352  * Return 0 if successful.
5353  */
5354 static int
5355 mac_log_flowinfo(flow_entry_t *flent, void *arg)
5356 {
5357         mac_client_impl_t       *mcip = flent->fe_mcip;
5358         i_mac_log_state_t       *lstate = arg;
5359         netinfo_t               *ninfo;
5360 
5361         if (mcip == NULL)
5362                 return (0);
5363 
5364         /*
5365          * If the name starts with "vnic", and fe_user_generated is true (to
5366          * exclude the mcast and active flow entries created implicitly for
5367          * a vnic, it is a VNIC flow.  i.e. vnic1 is a vnic flow,
5368          * vnic/bge1/mcast1 is not and neither is vnic/bge1/active.
5369          */
5370         if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
5371             (flent->fe_type & FLOW_USER) != 0) {
5372                 return (0);
5373         }
5374 
5375         if (!flent->fe_desc_logged) {
5376                 /*
5377                  * We don't return error because we want to continue the
5378                  * walk in case this is the last walk which means we
5379                  * need to reset fe_desc_logged in all the flows.
5380                  */
5381                 if ((ninfo = mac_write_flow_desc(flent, mcip)) == NULL)
5382                         return (0);
5383                 list_insert_tail(lstate->mi_list, ninfo);
5384                 flent->fe_desc_logged = B_TRUE;
5385         }
5386 
5387         /*
5388          * Regardless of the error, we want to proceed in case we have to
5389          * reset fe_desc_logged.
5390          */
5391         ninfo = mac_write_flow_stats(flent);
5392         if (ninfo == NULL)
5393                 return (-1);
5394 
5395         list_insert_tail(lstate->mi_list, ninfo);
5396 
5397         if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
5398                 flent->fe_desc_logged = B_FALSE;
5399 
5400         return (0);
5401 }
5402 
5403 /*
5404  * Log the description for each mac client of this mac_impl_t, if it
5405  * hasn't already been done. Additionally, log statistics for the link as
5406  * well. Walk the flow table and log information for each flow as well.
5407  * If it is the last walk (mci_last), then we turn off mci_desc_logged (and
5408  * also fe_desc_logged, if flow logging is on) since we want to log the
5409  * description if and when logging is restarted.
5410  *
5411  * Return 0 upon success or -1 upon failure
5412  */
5413 static int
5414 i_mac_impl_log(mac_impl_t *mip, i_mac_log_state_t *lstate)
5415 {
5416         mac_client_impl_t       *mcip;
5417         netinfo_t               *ninfo;
5418 
5419         i_mac_perim_enter(mip);
5420         /*
5421          * Only walk the client list for NIC and etherstub
5422          */
5423         if ((mip->mi_state_flags & MIS_DISABLED) ||
5424             ((mip->mi_state_flags & MIS_IS_VNIC) &&
5425             (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL))) {
5426                 i_mac_perim_exit(mip);
5427                 return (0);
5428         }
5429 
5430         for (mcip = mip->mi_clients_list; mcip != NULL;
5431             mcip = mcip->mci_client_next) {
5432                 if (!MCIP_DATAPATH_SETUP(mcip))
5433                         continue;
5434                 if (lstate->mi_lenable) {
5435                         if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
5436                                 ninfo = mac_write_link_desc(mcip);
5437                                 if (ninfo == NULL) {
5438                                 /*
5439                                  * We can't terminate it if this is the last
5440                                  * walk, else there might be some links with
5441                                  * mi_desc_logged set to true, which means
5442                                  * their description won't be logged the next
5443                                  * time logging is started (similarly for the
5444                                  * flows within such links). We can continue
5445                                  * without walking the flow table (i.e. to
5446                                  * set fe_desc_logged to false) because we
5447                                  * won't have written any flow stuff for this
5448                                  * link as we haven't logged the link itself.
5449                                  */
5450                                         i_mac_perim_exit(mip);
5451                                         if (lstate->mi_last)
5452                                                 return (0);
5453                                         else
5454                                                 return (-1);
5455                                 }
5456                                 mcip->mci_state_flags |= MCIS_DESC_LOGGED;
5457                                 list_insert_tail(lstate->mi_list, ninfo);
5458                         }
5459                 }
5460 
5461                 ninfo = mac_write_link_stats(mcip);
5462                 if (ninfo == NULL && !lstate->mi_last) {
5463                         i_mac_perim_exit(mip);
5464                         return (-1);
5465                 }
5466                 list_insert_tail(lstate->mi_list, ninfo);
5467 
5468                 if (lstate->mi_last)
5469                         mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
5470 
5471                 if (lstate->mi_fenable) {
5472                         if (mcip->mci_subflow_tab != NULL) {
5473                                 (void) mac_flow_walk_nolock(
5474                                     mcip->mci_subflow_tab, mac_log_flowinfo,
5475                                     lstate);
5476                         }
5477                 }
5478         }
5479         i_mac_perim_exit(mip);
5480         return (0);
5481 }
5482 
5483 /*
5484  * modhash walker function to add a mac_impl_t to a list
5485  */
5486 /*ARGSUSED*/
5487 static uint_t
5488 i_mac_impl_list_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
5489 {
5490         list_t                  *list = (list_t *)arg;
5491         mac_impl_t              *mip = (mac_impl_t *)val;
5492 
5493         if ((mip->mi_state_flags & MIS_DISABLED) == 0) {
5494                 list_insert_tail(list, mip);
5495                 mip->mi_ref++;
5496         }
5497 
5498         return (MH_WALK_CONTINUE);
5499 }
5500 
5501 void
5502 i_mac_log_info(list_t *net_log_list, i_mac_log_state_t *lstate)
5503 {
5504         list_t                  mac_impl_list;
5505         mac_impl_t              *mip;
5506         netinfo_t               *ninfo;
5507 
5508         /* Create list of mac_impls */
5509         ASSERT(RW_LOCK_HELD(&i_mac_impl_lock));
5510         list_create(&mac_impl_list, sizeof (mac_impl_t), offsetof(mac_impl_t,
5511             mi_node));
5512         mod_hash_walk(i_mac_impl_hash, i_mac_impl_list_walker, &mac_impl_list);
5513         rw_exit(&i_mac_impl_lock);
5514 
5515         /* Create log entries for each mac_impl */
5516         for (mip = list_head(&mac_impl_list); mip != NULL;
5517             mip = list_next(&mac_impl_list, mip)) {
5518                 if (i_mac_impl_log(mip, lstate) != 0)
5519                         continue;
5520         }
5521 
5522         /* Remove elements and destroy list of mac_impls */
5523         rw_enter(&i_mac_impl_lock, RW_WRITER);
5524         while ((mip = list_remove_tail(&mac_impl_list)) != NULL) {
5525                 mip->mi_ref--;
5526         }
5527         rw_exit(&i_mac_impl_lock);
5528         list_destroy(&mac_impl_list);
5529 
5530         /*
5531          * Write log entries to files outside of locks, free associated
5532          * structures, and remove entries from the list.
5533          */
5534         while ((ninfo = list_head(net_log_list)) != NULL) {
5535                 (void) exacct_commit_netinfo(ninfo->ni_record, ninfo->ni_type);
5536                 list_remove(net_log_list, ninfo);
5537                 kmem_free(ninfo->ni_record, ninfo->ni_size);
5538                 kmem_free(ninfo, sizeof (*ninfo));
5539         }
5540         list_destroy(net_log_list);
5541 }
5542 
5543 /*
5544  * The timer thread that runs every mac_logging_interval seconds and logs
5545  * link and/or flow information.
5546  */
5547 /* ARGSUSED */
5548 void
5549 mac_log_linkinfo(void *arg)
5550 {
5551         i_mac_log_state_t       lstate;
5552         list_t                  net_log_list;
5553 
5554         list_create(&net_log_list, sizeof (netinfo_t),
5555             offsetof(netinfo_t, ni_link));
5556 
5557         rw_enter(&i_mac_impl_lock, RW_READER);
5558         if (!mac_flow_log_enable && !mac_link_log_enable) {
5559                 rw_exit(&i_mac_impl_lock);
5560                 return;
5561         }
5562         lstate.mi_fenable = mac_flow_log_enable;
5563         lstate.mi_lenable = mac_link_log_enable;
5564         lstate.mi_last = B_FALSE;
5565         lstate.mi_list = &net_log_list;
5566 
5567         /* Write log entries for each mac_impl in the list */
5568         i_mac_log_info(&net_log_list, &lstate);
5569 
5570         if (mac_flow_log_enable || mac_link_log_enable) {
5571                 mac_logging_timer = timeout(mac_log_linkinfo, NULL,
5572                     SEC_TO_TICK(mac_logging_interval));
5573         }
5574 }
5575 
/*
 * Argument for i_mac_fastpath_walker(). The field order matters: the
 * structure is set up with positional initializers.
 */
typedef struct i_mac_fastpath_state_s {
        /* B_TRUE: disable fastpath on each MAC; B_FALSE: enable it */
        boolean_t       mf_disable;
        /* result of mac_fastpath_disable(); non-zero ends the walk */
        int             mf_err;
} i_mac_fastpath_state_t;
5580 
5581 /* modhash walker function to enable or disable fastpath */
5582 /*ARGSUSED*/
5583 static uint_t
5584 i_mac_fastpath_walker(mod_hash_key_t key, mod_hash_val_t *val,
5585     void *arg)
5586 {
5587         i_mac_fastpath_state_t  *state = arg;
5588         mac_handle_t            mh = (mac_handle_t)val;
5589 
5590         if (state->mf_disable)
5591                 state->mf_err = mac_fastpath_disable(mh);
5592         else
5593                 mac_fastpath_enable(mh);
5594 
5595         return (state->mf_err == 0 ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
5596 }
5597 
5598 /*
5599  * Start the logging timer.
5600  */
5601 int
5602 mac_start_logusage(mac_logtype_t type, uint_t interval)
5603 {
5604         i_mac_fastpath_state_t  dstate = {B_TRUE, 0};
5605         i_mac_fastpath_state_t  estate = {B_FALSE, 0};
5606         int                     err;
5607 
5608         rw_enter(&i_mac_impl_lock, RW_WRITER);
5609         switch (type) {
5610         case MAC_LOGTYPE_FLOW:
5611                 if (mac_flow_log_enable) {
5612                         rw_exit(&i_mac_impl_lock);
5613                         return (0);
5614                 }
5615                 /* FALLTHRU */
5616         case MAC_LOGTYPE_LINK:
5617                 if (mac_link_log_enable) {
5618                         rw_exit(&i_mac_impl_lock);
5619                         return (0);
5620                 }
5621                 break;
5622         default:
5623                 ASSERT(0);
5624         }
5625 
5626         /* Disable fastpath */
5627         mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &dstate);
5628         if ((err = dstate.mf_err) != 0) {
5629                 /* Reenable fastpath  */
5630                 mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate);
5631                 rw_exit(&i_mac_impl_lock);
5632                 return (err);
5633         }
5634 
5635         switch (type) {
5636         case MAC_LOGTYPE_FLOW:
5637                 mac_flow_log_enable = B_TRUE;
5638                 /* FALLTHRU */
5639         case MAC_LOGTYPE_LINK:
5640                 mac_link_log_enable = B_TRUE;
5641                 break;
5642         }
5643 
5644         mac_logging_interval = interval;
5645         rw_exit(&i_mac_impl_lock);
5646         mac_log_linkinfo(NULL);
5647         return (0);
5648 }
5649 
5650 /*
5651  * Stop the logging timer if both link and flow logging are turned off.
5652  */
5653 void
5654 mac_stop_logusage(mac_logtype_t type)
5655 {
5656         i_mac_log_state_t       lstate;
5657         i_mac_fastpath_state_t  estate = {B_FALSE, 0};
5658         list_t                  net_log_list;
5659 
5660         list_create(&net_log_list, sizeof (netinfo_t),
5661             offsetof(netinfo_t, ni_link));
5662 
5663         rw_enter(&i_mac_impl_lock, RW_WRITER);
5664 
5665         lstate.mi_fenable = mac_flow_log_enable;
5666         lstate.mi_lenable = mac_link_log_enable;
5667         lstate.mi_list = &net_log_list;
5668 
5669         /* Last walk */
5670         lstate.mi_last = B_TRUE;
5671 
5672         switch (type) {
5673         case MAC_LOGTYPE_FLOW:
5674                 if (lstate.mi_fenable) {
5675                         ASSERT(mac_link_log_enable);
5676                         mac_flow_log_enable = B_FALSE;
5677                         mac_link_log_enable = B_FALSE;
5678                         break;
5679                 }
5680                 /* FALLTHRU */
5681         case MAC_LOGTYPE_LINK:
5682                 if (!lstate.mi_lenable || mac_flow_log_enable) {
5683                         rw_exit(&i_mac_impl_lock);
5684                         return;
5685                 }
5686                 mac_link_log_enable = B_FALSE;
5687                 break;
5688         default:
5689                 ASSERT(0);
5690         }
5691 
5692         /* Reenable fastpath */
5693         mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate);
5694 
5695         (void) untimeout(mac_logging_timer);
5696         mac_logging_timer = 0;
5697 
5698         /* Write log entries for each mac_impl in the list */
5699         i_mac_log_info(&net_log_list, &lstate);
5700 }
5701 
5702 /*
5703  * Walk the rx and tx SRS/SRs for a flow and update the priority value.
5704  */
5705 void
5706 mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
5707 {
5708         pri_t                   pri;
5709         int                     count;
5710         mac_soft_ring_set_t     *mac_srs;
5711 
5712         if (flent->fe_rx_srs_cnt <= 0)
5713                 return;
5714 
5715         if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
5716             SRST_FLOW) {
5717                 pri = FLOW_PRIORITY(mcip->mci_min_pri,
5718                     mcip->mci_max_pri,
5719                     flent->fe_resource_props.mrp_priority);
5720         } else {
5721                 pri = mcip->mci_max_pri;
5722         }
5723 
5724         for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
5725                 mac_srs = flent->fe_rx_srs[count];
5726                 mac_update_srs_priority(mac_srs, pri);
5727         }
5728         /*
5729          * If we have a Tx SRS, we need to modify all the threads associated
5730          * with it.
5731          */
5732         if (flent->fe_tx_srs != NULL)
5733                 mac_update_srs_priority(flent->fe_tx_srs, pri);
5734 }
5735 
5736 /*
5737  * RX and TX rings are reserved according to different semantics depending
5738  * on the requests from the MAC clients and type of rings:
5739  *
5740  * On the Tx side, by default we reserve individual rings, independently from
5741  * the groups.
5742  *
5743  * On the Rx side, the reservation is at the granularity of the group
5744  * of rings, and used for v12n level 1 only. It has a special case for the
5745  * primary client.
5746  *
5747  * If a share is allocated to a MAC client, we allocate a TX group and an
5748  * RX group to the client, and assign TX rings and RX rings to these
5749  * groups according to information gathered from the driver through
5750  * the share capability.
5751  *
5752  * The foreseeable evolution of Rx rings will handle v12n level 2 and higher
5753  * to allocate individual rings out of a group and program the hw classifier
5754  * based on IP address or higher level criteria.
5755  */
5756 
5757 /*
5758  * mac_reserve_tx_ring()
5759  * Reserve a unused ring by marking it with MR_INUSE state.
5760  * As reserved, the ring is ready to function.
5761  *
5762  * Notes for Hybrid I/O:
5763  *
5764  * If a specific ring is needed, it is specified through the desired_ring
5765  * argument. Otherwise that argument is set to NULL.
5766  * If the desired ring was previous allocated to another client, this
5767  * function swaps it with a new ring from the group of unassigned rings.
5768  */
5769 mac_ring_t *
5770 mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
5771 {
5772         mac_group_t             *group;
5773         mac_grp_client_t        *mgcp;
5774         mac_client_impl_t       *mcip;
5775         mac_soft_ring_set_t     *srs;
5776 
5777         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5778 
5779         /*
5780          * Find an available ring and start it before changing its status.
5781          * The unassigned rings are at the end of the mi_tx_groups
5782          * array.
5783          */
5784         group = MAC_DEFAULT_TX_GROUP(mip);
5785 
5786         /* Can't take the default ring out of the default group */
5787         ASSERT(desired_ring != (mac_ring_t *)mip->mi_default_tx_ring);
5788 
5789         if (desired_ring->mr_state == MR_FREE) {
5790                 ASSERT(MAC_GROUP_NO_CLIENT(group));
5791                 if (mac_start_ring(desired_ring) != 0)
5792                         return (NULL);
5793                 return (desired_ring);
5794         }
5795         /*
5796          * There are clients using this ring, so let's move the clients
5797          * away from using this ring.
5798          */
5799         for (mgcp = group->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
5800                 mcip = mgcp->mgc_client;
5801                 mac_tx_client_quiesce((mac_client_handle_t)mcip);
5802                 srs = MCIP_TX_SRS(mcip);
5803                 ASSERT(mac_tx_srs_ring_present(srs, desired_ring));
5804                 mac_tx_invoke_callbacks(mcip,
5805                     (mac_tx_cookie_t)mac_tx_srs_get_soft_ring(srs,
5806                     desired_ring));
5807                 mac_tx_srs_del_ring(srs, desired_ring);
5808                 mac_tx_client_restart((mac_client_handle_t)mcip);
5809         }
5810         return (desired_ring);
5811 }
5812 
5813 /*
5814  * For a reserved group with multiple clients, return the primary client.
5815  */
5816 static mac_client_impl_t *
5817 mac_get_grp_primary(mac_group_t *grp)
5818 {
5819         mac_grp_client_t        *mgcp = grp->mrg_clients;
5820         mac_client_impl_t       *mcip;
5821 
5822         while (mgcp != NULL) {
5823                 mcip = mgcp->mgc_client;
5824                 if (mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC)
5825                         return (mcip);
5826                 mgcp = mgcp->mgc_next;
5827         }
5828         return (NULL);
5829 }
5830 
5831 /*
5832  * Hybrid I/O specifies the ring that should be given to a share.
5833  * If the ring is already used by clients, then we need to release
5834  * the ring back to the default group so that we can give it to
5835  * the share. This means the clients using this ring now get a
5836  * replacement ring. If there aren't any replacement rings, this
5837  * function returns a failure.
5838  */
5839 static int
5840 mac_reclaim_ring_from_grp(mac_impl_t *mip, mac_ring_type_t ring_type,
5841     mac_ring_t *ring, mac_ring_t **rings, int nrings)
5842 {
5843         mac_group_t             *group = (mac_group_t *)ring->mr_gh;
5844         mac_resource_props_t    *mrp;
5845         mac_client_impl_t       *mcip;
5846         mac_group_t             *defgrp;
5847         mac_ring_t              *tring;
5848         mac_group_t             *tgrp;
5849         int                     i;
5850         int                     j;
5851 
5852         mcip = MAC_GROUP_ONLY_CLIENT(group);
5853         if (mcip == NULL)
5854                 mcip = mac_get_grp_primary(group);
5855         ASSERT(mcip != NULL);
5856         ASSERT(mcip->mci_share == NULL);
5857 
5858         mrp = MCIP_RESOURCE_PROPS(mcip);
5859         if (ring_type == MAC_RING_TYPE_RX) {
5860                 defgrp = mip->mi_rx_donor_grp;
5861                 if ((mrp->mrp_mask & MRP_RX_RINGS) == 0) {
5862                         /* Need to put this mac client in the default group */
5863                         if (mac_rx_switch_group(mcip, group, defgrp) != 0)
5864                                 return (ENOSPC);
5865                 } else {
5866                         /*
5867                          * Switch this ring with some other ring from
5868                          * the default group.
5869                          */
5870                         for (tring = defgrp->mrg_rings; tring != NULL;
5871                             tring = tring->mr_next) {
5872                                 if (tring->mr_index == 0)
5873                                         continue;
5874                                 for (j = 0; j < nrings; j++) {
5875                                         if (rings[j] == tring)
5876                                                 break;
5877                                 }
5878                                 if (j >= nrings)
5879                                         break;
5880                         }
5881                         if (tring == NULL)
5882                                 return (ENOSPC);
5883                         if (mac_group_mov_ring(mip, group, tring) != 0)
5884                                 return (ENOSPC);
5885                         if (mac_group_mov_ring(mip, defgrp, ring) != 0) {
5886                                 (void) mac_group_mov_ring(mip, defgrp, tring);
5887                                 return (ENOSPC);
5888                         }
5889                 }
5890                 ASSERT(ring->mr_gh == (mac_group_handle_t)defgrp);
5891                 return (0);
5892         }
5893 
5894         defgrp = MAC_DEFAULT_TX_GROUP(mip);
5895         if (ring == (mac_ring_t *)mip->mi_default_tx_ring) {
5896                 /*
5897                  * See if we can get a spare ring to replace the default
5898                  * ring.
5899                  */
5900                 if (defgrp->mrg_cur_count == 1) {
5901                         /*
5902                          * Need to get a ring from another client, see if
5903                          * there are any clients that can be moved to
5904                          * the default group, thereby freeing some rings.
5905                          */
5906                         for (i = 0; i < mip->mi_tx_group_count; i++) {
5907                                 tgrp = &mip->mi_tx_groups[i];
5908                                 if (tgrp->mrg_state ==
5909                                     MAC_GROUP_STATE_REGISTERED) {
5910                                         continue;
5911                                 }
5912                                 mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
5913                                 if (mcip == NULL)
5914                                         mcip = mac_get_grp_primary(tgrp);
5915                                 ASSERT(mcip != NULL);
5916                                 mrp = MCIP_RESOURCE_PROPS(mcip);
5917                                 if ((mrp->mrp_mask & MRP_TX_RINGS) == 0) {
5918                                         ASSERT(tgrp->mrg_cur_count == 1);
5919                                         /*
5920                                          * If this ring is part of the
5921                                          * rings asked by the share we cannot
5922                                          * use it as the default ring.
5923                                          */
5924                                         for (j = 0; j < nrings; j++) {
5925                                                 if (rings[j] == tgrp->mrg_rings)
5926                                                         break;
5927                                         }
5928                                         if (j < nrings)
5929                                                 continue;
5930                                         mac_tx_client_quiesce(
5931                                             (mac_client_handle_t)mcip);
5932                                         mac_tx_switch_group(mcip, tgrp,
5933                                             defgrp);
5934                                         mac_tx_client_restart(
5935                                             (mac_client_handle_t)mcip);
5936                                         break;
5937                                 }
5938                         }
5939                         /*
5940                          * All the rings are reserved, can't give up the
5941                          * default ring.
5942                          */
5943                         if (defgrp->mrg_cur_count <= 1)
5944                                 return (ENOSPC);
5945                 }
5946                 /*
5947                  * Swap the default ring with another.
5948                  */
5949                 for (tring = defgrp->mrg_rings; tring != NULL;
5950                     tring = tring->mr_next) {
5951                         /*
5952                          * If this ring is part of the rings asked by the
5953                          * share we cannot use it as the default ring.
5954                          */
5955                         for (j = 0; j < nrings; j++) {
5956                                 if (rings[j] == tring)
5957                                         break;
5958                         }
5959                         if (j >= nrings)
5960                                 break;
5961                 }
5962                 ASSERT(tring != NULL);
5963                 mip->mi_default_tx_ring = (mac_ring_handle_t)tring;
5964                 return (0);
5965         }
5966         /*
5967          * The Tx ring is with a group reserved by a MAC client. See if
5968          * we can swap it.
5969          */
5970         ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
5971         mcip = MAC_GROUP_ONLY_CLIENT(group);
5972         if (mcip == NULL)
5973                 mcip = mac_get_grp_primary(group);
5974         ASSERT(mcip !=  NULL);
5975         mrp = MCIP_RESOURCE_PROPS(mcip);
5976         mac_tx_client_quiesce((mac_client_handle_t)mcip);
5977         if ((mrp->mrp_mask & MRP_TX_RINGS) == 0) {
5978                 ASSERT(group->mrg_cur_count == 1);
5979                 /* Put this mac client in the default group */
5980                 mac_tx_switch_group(mcip, group, defgrp);
5981         } else {
5982                 /*
5983                  * Switch this ring with some other ring from
5984                  * the default group.
5985                  */
5986                 for (tring = defgrp->mrg_rings; tring != NULL;
5987                     tring = tring->mr_next) {
5988                         if (tring == (mac_ring_t *)mip->mi_default_tx_ring)
5989                                 continue;
5990                         /*
5991                          * If this ring is part of the rings asked by the
5992                          * share we cannot use it for swapping.
5993                          */
5994                         for (j = 0; j < nrings; j++) {
5995                                 if (rings[j] == tring)
5996                                         break;
5997                         }
5998                         if (j >= nrings)
5999                                 break;
6000                 }
6001                 if (tring == NULL) {
6002                         mac_tx_client_restart((mac_client_handle_t)mcip);
6003                         return (ENOSPC);
6004                 }
6005                 if (mac_group_mov_ring(mip, group, tring) != 0) {
6006                         mac_tx_client_restart((mac_client_handle_t)mcip);
6007                         return (ENOSPC);
6008                 }
6009                 if (mac_group_mov_ring(mip, defgrp, ring) != 0) {
6010                         (void) mac_group_mov_ring(mip, defgrp, tring);
6011                         mac_tx_client_restart((mac_client_handle_t)mcip);
6012                         return (ENOSPC);
6013                 }
6014         }
6015         mac_tx_client_restart((mac_client_handle_t)mcip);
6016         ASSERT(ring->mr_gh == (mac_group_handle_t)defgrp);
6017         return (0);
6018 }
6019 
6020 /*
6021  * Populate a zero-ring group with rings. If the share is non-NULL,
6022  * the rings are chosen according to that share.
6023  * Invoked after allocating a new RX or TX group through
6024  * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
6025  * Returns zero on success, an errno otherwise.
6026  */
6027 int
6028 i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
6029     mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share,
6030     uint32_t ringcnt)
6031 {
6032         mac_ring_t **rings, *ring;
6033         uint_t nrings;
6034         int rv = 0, i = 0, j;
6035 
6036         ASSERT((ring_type == MAC_RING_TYPE_RX &&
6037             mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) ||
6038             (ring_type == MAC_RING_TYPE_TX &&
6039             mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC));
6040 
6041         /*
6042          * First find the rings to allocate to the group.
6043          */
6044         if (share != NULL) {
6045                 /* get rings through ms_squery() */
6046                 mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
6047                 ASSERT(nrings != 0);
6048                 rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
6049                     KM_SLEEP);
6050                 mip->mi_share_capab.ms_squery(share, ring_type,
6051                     (mac_ring_handle_t *)rings, &nrings);
6052                 for (i = 0; i < nrings; i++) {
6053                         /*
6054                          * If we have given this ring to a non-default
6055                          * group, we need to check if we can get this
6056                          * ring.
6057                          */
6058                         ring = rings[i];
6059                         if (ring->mr_gh != (mac_group_handle_t)src_group ||
6060                             ring == (mac_ring_t *)mip->mi_default_tx_ring) {
6061                                 if (mac_reclaim_ring_from_grp(mip, ring_type,
6062                                     ring, rings, nrings) != 0) {
6063                                         rv = ENOSPC;
6064                                         goto bail;
6065                                 }
6066                         }
6067                 }
6068         } else {
6069                 /*
6070                  * Pick one ring from default group.
6071                  *
6072                  * for now pick the second ring which requires the first ring
6073                  * at index 0 to stay in the default group, since it is the
6074                  * ring which carries the multicast traffic.
6075                  * We need a better way for a driver to indicate this,
6076                  * for example a per-ring flag.
6077                  */
6078                 rings = kmem_alloc(ringcnt * sizeof (mac_ring_handle_t),
6079                     KM_SLEEP);
6080                 for (ring = src_group->mrg_rings; ring != NULL;
6081                     ring = ring->mr_next) {
6082                         if (ring_type == MAC_RING_TYPE_RX &&
6083                             ring->mr_index == 0) {
6084                                 continue;
6085                         }
6086                         if (ring_type == MAC_RING_TYPE_TX &&
6087                             ring == (mac_ring_t *)mip->mi_default_tx_ring) {
6088                                 continue;
6089                         }
6090                         rings[i++] = ring;
6091                         if (i == ringcnt)
6092                                 break;
6093                 }
6094                 ASSERT(ring != NULL);
6095                 nrings = i;
6096                 /* Not enough rings as required */
6097                 if (nrings != ringcnt) {
6098                         rv = ENOSPC;
6099                         goto bail;
6100                 }
6101         }
6102 
6103         switch (ring_type) {
6104         case MAC_RING_TYPE_RX:
6105                 if (src_group->mrg_cur_count - nrings < 1) {
6106                         /* we ran out of rings */
6107                         rv = ENOSPC;
6108                         goto bail;
6109                 }
6110 
6111                 /* move receive rings to new group */
6112                 for (i = 0; i < nrings; i++) {
6113                         rv = mac_group_mov_ring(mip, new_group, rings[i]);
6114                         if (rv != 0) {
6115                                 /* move rings back on failure */
6116                                 for (j = 0; j < i; j++) {
6117                                         (void) mac_group_mov_ring(mip,
6118                                             src_group, rings[j]);
6119                                 }
6120                                 goto bail;
6121                         }
6122                 }
6123                 break;
6124 
6125         case MAC_RING_TYPE_TX: {
6126                 mac_ring_t *tmp_ring;
6127 
6128                 /* move the TX rings to the new group */
6129                 for (i = 0; i < nrings; i++) {
6130                         /* get the desired ring */
6131                         tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
6132                         if (tmp_ring == NULL) {
6133                                 rv = ENOSPC;
6134                                 goto bail;
6135                         }
6136                         ASSERT(tmp_ring == rings[i]);
6137                         rv = mac_group_mov_ring(mip, new_group, rings[i]);
6138                         if (rv != 0) {
6139                                 /* cleanup on failure */
6140                                 for (j = 0; j < i; j++) {
6141                                         (void) mac_group_mov_ring(mip,
6142                                             MAC_DEFAULT_TX_GROUP(mip),
6143                                             rings[j]);
6144                                 }
6145                                 goto bail;
6146                         }
6147                 }
6148                 break;
6149         }
6150         }
6151 
6152         /* add group to share */
6153         if (share != NULL)
6154                 mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
6155 
6156 bail:
6157         /* free temporary array of rings */
6158         kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
6159 
6160         return (rv);
6161 }
6162 
6163 void
6164 mac_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
6165 {
6166         mac_grp_client_t *mgcp;
6167 
6168         for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
6169                 if (mgcp->mgc_client == mcip)
6170                         break;
6171         }
6172 
6173         VERIFY(mgcp == NULL);
6174 
6175         mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
6176         mgcp->mgc_client = mcip;
6177         mgcp->mgc_next = grp->mrg_clients;
6178         grp->mrg_clients = mgcp;
6179 
6180 }
6181 
6182 void
6183 mac_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
6184 {
6185         mac_grp_client_t *mgcp, **pprev;
6186 
6187         for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
6188             pprev = &mgcp->mgc_next, mgcp = *pprev) {
6189                 if (mgcp->mgc_client == mcip)
6190                         break;
6191         }
6192 
6193         ASSERT(mgcp != NULL);
6194 
6195         *pprev = mgcp->mgc_next;
6196         kmem_free(mgcp, sizeof (mac_grp_client_t));
6197 }
6198 
6199 /*
6200  * mac_reserve_rx_group()
6201  *
6202  * Finds an available group and exclusively reserves it for a client.
6203  * The group is chosen to suit the flow's resource controls (bandwidth and
6204  * fanout requirements) and the address type.
6205  * If the requestor is the primary MAC then return the group with the
6206  * largest number of rings, otherwise the default ring when available.
6207  */
6208 mac_group_t *
6209 mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
6210 {
6211         mac_share_handle_t      share = mcip->mci_share;
6212         mac_impl_t              *mip = mcip->mci_mip;
6213         mac_group_t             *grp = NULL;
6214         int                     i;
6215         int                     err = 0;
6216         mac_address_t           *map;
6217         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
6218         int                     nrings;
6219         int                     donor_grp_rcnt;
6220         boolean_t               need_exclgrp = B_FALSE;
6221         int                     need_rings = 0;
6222         mac_group_t             *candidate_grp = NULL;
6223         mac_client_impl_t       *gclient;
6224         mac_resource_props_t    *gmrp;
6225         mac_group_t             *donorgrp = NULL;
6226         boolean_t               rxhw = mrp->mrp_mask & MRP_RX_RINGS;
6227         boolean_t               unspec = mrp->mrp_mask & MRP_RXRINGS_UNSPEC;
6228         boolean_t               isprimary;
6229 
6230         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
6231 
6232         isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC;
6233 
6234         /*
6235          * Check if a group already has this mac address (case of VLANs)
6236          * unless we are moving this MAC client from one group to another.
6237          */
6238         if (!move && (map = mac_find_macaddr(mip, mac_addr)) != NULL) {
6239                 if (map->ma_group != NULL)
6240                         return (map->ma_group);
6241         }
6242         if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0)
6243                 return (NULL);
6244         /*
6245          * If exclusive open, return NULL which will enable the
6246          * caller to use the default group.
6247          */
6248         if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
6249                 return (NULL);
6250 
6251         /* For dynamic groups default unspecified to 1 */
6252         if (rxhw && unspec &&
6253             mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6254                 mrp->mrp_nrxrings = 1;
6255         }
6256         /*
6257          * For static grouping we allow only specifying rings=0 and
6258          * unspecified
6259          */
6260         if (rxhw && mrp->mrp_nrxrings > 0 &&
6261             mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) {
6262                 return (NULL);
6263         }
6264         if (rxhw) {
6265                 /*
6266                  * We have explicitly asked for a group (with nrxrings,
6267                  * if unspec).
6268                  */
6269                 if (unspec || mrp->mrp_nrxrings > 0) {
6270                         need_exclgrp = B_TRUE;
6271                         need_rings = mrp->mrp_nrxrings;
6272                 } else if (mrp->mrp_nrxrings == 0) {
6273                         /*
6274                          * We have asked for a software group.
6275                          */
6276                         return (NULL);
6277                 }
6278         } else if (isprimary && mip->mi_nactiveclients == 1 &&
6279             mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6280                 /*
6281                  * If the primary is the only active client on this
6282                  * mip and we have not asked for any rings, we give
6283                  * it the default group so that the primary gets to
6284                  * use all the rings.
6285                  */
6286                 return (NULL);
6287         }
6288 
6289         /* The group that can donate rings */
6290         donorgrp = mip->mi_rx_donor_grp;
6291 
6292         /*
6293          * The number of rings that the default group can donate.
6294          * We need to leave at least one ring.
6295          */
6296         donor_grp_rcnt = donorgrp->mrg_cur_count - 1;
6297 
6298         /*
6299          * Try to exclusively reserve a RX group.
6300          *
6301          * For flows requiring HW_DEFAULT_RING (unicast flow of the primary
6302          * client), try to reserve the a non-default RX group and give
6303          * it all the rings from the donor group, except the default ring
6304          *
6305          * For flows requiring HW_RING (unicast flow of other clients), try
6306          * to reserve non-default RX group with the specified number of
6307          * rings, if available.
6308          *
6309          * For flows that have not asked for software or hardware ring,
6310          * try to reserve a non-default group with 1 ring, if available.
6311          */
6312         for (i = 1; i < mip->mi_rx_group_count; i++) {
6313                 grp = &mip->mi_rx_groups[i];
6314 
6315                 DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
6316                     int, grp->mrg_index, mac_group_state_t, grp->mrg_state);
6317 
6318                 /*
6319                  * Check if this group could be a candidate group for
6320                  * eviction if we need a group for this MAC client,
6321                  * but there aren't any. A candidate group is one
6322                  * that didn't ask for an exclusive group, but got
6323                  * one and it has enough rings (combined with what
6324                  * the donor group can donate) for the new MAC
6325                  * client
6326                  */
6327                 if (grp->mrg_state >= MAC_GROUP_STATE_RESERVED) {
6328                         /*
6329                          * If the primary/donor group is not the default
6330                          * group, don't bother looking for a candidate group.
6331                          * If we don't have enough rings we will check
6332                          * if the primary group can be vacated.
6333                          */
6334                         if (candidate_grp == NULL &&
6335                             donorgrp == MAC_DEFAULT_RX_GROUP(mip)) {
6336                                 ASSERT(!MAC_GROUP_NO_CLIENT(grp));
6337                                 gclient = MAC_GROUP_ONLY_CLIENT(grp);
6338                                 if (gclient == NULL)
6339                                         gclient = mac_get_grp_primary(grp);
6340                                 ASSERT(gclient != NULL);
6341                                 gmrp = MCIP_RESOURCE_PROPS(gclient);
6342                                 if (gclient->mci_share == NULL &&
6343                                     (gmrp->mrp_mask & MRP_RX_RINGS) == 0 &&
6344                                     (unspec ||
6345                                     (grp->mrg_cur_count + donor_grp_rcnt >=
6346                                     need_rings))) {
6347                                         candidate_grp = grp;
6348                                 }
6349                         }
6350                         continue;
6351                 }
6352                 /*
6353                  * This group could already be SHARED by other multicast
6354                  * flows on this client. In that case, the group would
6355                  * be shared and has already been started.
6356                  */
6357                 ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);
6358 
6359                 if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
6360                     (mac_start_group(grp) != 0)) {
6361                         continue;
6362                 }
6363 
6364                 if (mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC)
6365                         break;
6366                 ASSERT(grp->mrg_cur_count == 0);
6367 
6368                 /*
6369                  * Populate the group. Rings should be taken
6370                  * from the donor group.
6371                  */
6372                 nrings = rxhw ? need_rings : isprimary ? donor_grp_rcnt: 1;
6373 
6374                 /*
6375                  * If the donor group can't donate, let's just walk and
6376                  * see if someone can vacate a group, so that we have
6377                  * enough rings for this, unless we already have
6378                  * identified a candiate group..
6379                  */
6380                 if (nrings <= donor_grp_rcnt) {
6381                         err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
6382                             donorgrp, grp, share, nrings);
6383                         if (err == 0) {
6384                                 /*
6385                                  * For a share i_mac_group_allocate_rings gets
6386                                  * the rings from the driver, let's populate
6387                                  * the property for the client now.
6388                                  */
6389                                 if (share != NULL) {
6390                                         mac_client_set_rings(
6391                                             (mac_client_handle_t)mcip,
6392                                             grp->mrg_cur_count, -1);
6393                                 }
6394                                 if (mac_is_primary_client(mcip) && !rxhw)
6395                                         mip->mi_rx_donor_grp = grp;
6396                                 break;
6397                         }
6398                 }
6399 
6400                 DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
6401                     mip->mi_name, int, grp->mrg_index, int, err);
6402 
6403                 /*
6404                  * It's a dynamic group but the grouping operation
6405                  * failed.
6406                  */
6407                 mac_stop_group(grp);
6408         }
6409         /* We didn't find an exclusive group for this MAC client */
6410         if (i >= mip->mi_rx_group_count) {
6411 
6412                 if (!need_exclgrp)
6413                         return (NULL);
6414 
6415                 /*
6416                  * If we found a candidate group then we switch the
6417                  * MAC client from the candidate_group to the default
6418                  * group and give the group to this MAC client. If
6419                  * we didn't find a candidate_group, check if the
6420                  * primary is in its own group and if it can make way
6421                  * for this MAC client.
6422                  */
6423                 if (candidate_grp == NULL &&
6424                     donorgrp != MAC_DEFAULT_RX_GROUP(mip) &&
6425                     donorgrp->mrg_cur_count >= need_rings) {
6426                         candidate_grp = donorgrp;
6427                 }
6428                 if (candidate_grp != NULL) {
6429                         boolean_t       prim_grp = B_FALSE;
6430 
6431                         /*
6432                          * Switch the MAC client from the candidate group
6433                          * to the default group.. If this group was the
6434                          * donor group, then after the switch we need
6435                          * to update the donor group too.
6436                          */
6437                         grp = candidate_grp;
6438                         gclient = MAC_GROUP_ONLY_CLIENT(grp);
6439                         if (gclient == NULL)
6440                                 gclient = mac_get_grp_primary(grp);
6441                         if (grp == mip->mi_rx_donor_grp)
6442                                 prim_grp = B_TRUE;
6443                         if (mac_rx_switch_group(gclient, grp,
6444                             MAC_DEFAULT_RX_GROUP(mip)) != 0) {
6445                                 return (NULL);
6446                         }
6447                         if (prim_grp) {
6448                                 mip->mi_rx_donor_grp =
6449                                     MAC_DEFAULT_RX_GROUP(mip);
6450                                 donorgrp = MAC_DEFAULT_RX_GROUP(mip);
6451                         }
6452 
6453 
6454                         /*
6455                          * Now give this group with the required rings
6456                          * to this MAC client.
6457                          */
6458                         ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
6459                         if (mac_start_group(grp) != 0)
6460                                 return (NULL);
6461 
6462                         if (mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC)
6463                                 return (grp);
6464 
6465                         donor_grp_rcnt = donorgrp->mrg_cur_count - 1;
6466                         ASSERT(grp->mrg_cur_count == 0);
6467                         ASSERT(donor_grp_rcnt >= need_rings);
6468                         err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
6469                             donorgrp, grp, share, need_rings);
6470                         if (err == 0) {
6471                                 /*
6472                                  * For a share i_mac_group_allocate_rings gets
6473                                  * the rings from the driver, let's populate
6474                                  * the property for the client now.
6475                                  */
6476                                 if (share != NULL) {
6477                                         mac_client_set_rings(
6478                                             (mac_client_handle_t)mcip,
6479                                             grp->mrg_cur_count, -1);
6480                                 }
6481                                 DTRACE_PROBE2(rx__group__reserved,
6482                                     char *, mip->mi_name, int, grp->mrg_index);
6483                                 return (grp);
6484                         }
6485                         DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
6486                             mip->mi_name, int, grp->mrg_index, int, err);
6487                         mac_stop_group(grp);
6488                 }
6489                 return (NULL);
6490         }
6491         ASSERT(grp != NULL);
6492 
6493         DTRACE_PROBE2(rx__group__reserved,
6494             char *, mip->mi_name, int, grp->mrg_index);
6495         return (grp);
6496 }
6497 
6498 /*
6499  * mac_rx_release_group()
6500  *
6501  * This is called when there are no clients left for the group.
6502  * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
6503  * and if it is a non default group, the shares are removed and
6504  * all rings are assigned back to default group.
6505  */
6506 void
6507 mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
6508 {
6509         mac_impl_t              *mip = mcip->mci_mip;
6510         mac_ring_t              *ring;
6511 
6512         ASSERT(group != MAC_DEFAULT_RX_GROUP(mip));
6513 
6514         if (mip->mi_rx_donor_grp == group)
6515                 mip->mi_rx_donor_grp = MAC_DEFAULT_RX_GROUP(mip);
6516 
6517         /*
6518          * This is the case where there are no clients left. Any
6519          * SRS etc on this group have also be quiesced.
6520          */
6521         for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
6522                 if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
6523                         ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
6524                         /*
6525                          * Remove the SRS associated with the HW ring.
6526                          * As a result, polling will be disabled.
6527                          */
6528                         ring->mr_srs = NULL;
6529                 }
6530                 ASSERT(group->mrg_state < MAC_GROUP_STATE_RESERVED ||
6531                     ring->mr_state == MR_INUSE);
6532                 if (ring->mr_state == MR_INUSE) {
6533                         mac_stop_ring(ring);
6534                         ring->mr_flag = 0;
6535                 }
6536         }
6537 
6538         /* remove group from share */
6539         if (mcip->mci_share != NULL) {
6540                 mip->mi_share_capab.ms_sremove(mcip->mci_share,
6541                     group->mrg_driver);
6542         }
6543 
6544         if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6545                 mac_ring_t *ring;
6546 
6547                 /*
6548                  * Rings were dynamically allocated to group.
6549                  * Move rings back to default group.
6550                  */
6551                 while ((ring = group->mrg_rings) != NULL) {
6552                         (void) mac_group_mov_ring(mip, mip->mi_rx_donor_grp,
6553                             ring);
6554                 }
6555         }
6556         mac_stop_group(group);
6557         /*
6558          * Possible improvement: See if we can assign the group just released
6559          * to a another client of the mip
6560          */
6561 }
6562 
6563 /*
6564  * When we move the primary's mac address between groups, we need to also
6565  * take all the clients sharing the same mac address along with it (VLANs)
6566  * We remove the mac address for such clients from the group after quiescing
6567  * them. When we add the mac address we restart the client. Note that
6568  * the primary's mac address is removed from the group after all the
6569  * other clients sharing the address are removed. Similarly, the primary's
6570  * mac address is added before all the other client's mac address are
6571  * added. While grp is the group where the clients reside, tgrp is
6572  * the group where the addresses have to be added.
6573  */
6574 static void
6575 mac_rx_move_macaddr_prim(mac_client_impl_t *mcip, mac_group_t *grp,
6576     mac_group_t *tgrp, uint8_t *maddr, boolean_t add)
6577 {
6578         mac_impl_t              *mip = mcip->mci_mip;
6579         mac_grp_client_t        *mgcp = grp->mrg_clients;
6580         mac_client_impl_t       *gmcip;
6581         boolean_t               prim;
6582 
6583         prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
6584 
6585         /*
6586          * If the clients are in a non-default group, we just have to
6587          * walk the group's client list. If it is in the default group
6588          * (which will be shared by other clients as well, we need to
6589          * check if the unicast address matches mcip's unicast.
6590          */
6591         while (mgcp != NULL) {
6592                 gmcip = mgcp->mgc_client;
6593                 if (gmcip != mcip &&
6594                     (grp != MAC_DEFAULT_RX_GROUP(mip) ||
6595                     mcip->mci_unicast == gmcip->mci_unicast)) {
6596                         if (!add) {
6597                                 mac_rx_client_quiesce(
6598                                     (mac_client_handle_t)gmcip);
6599                                 (void) mac_remove_macaddr(mcip->mci_unicast);
6600                         } else {
6601                                 (void) mac_add_macaddr(mip, tgrp, maddr, prim);
6602                                 mac_rx_client_restart(
6603                                     (mac_client_handle_t)gmcip);
6604                         }
6605                 }
6606                 mgcp = mgcp->mgc_next;
6607         }
6608 }
6609 
6610 
6611 /*
6612  * Move the MAC address from fgrp to tgrp. If this is the primary client,
6613  * we need to take any VLANs etc. together too.
6614  */
6615 static int
6616 mac_rx_move_macaddr(mac_client_impl_t *mcip, mac_group_t *fgrp,
6617     mac_group_t *tgrp)
6618 {
6619         mac_impl_t              *mip = mcip->mci_mip;
6620         uint8_t                 maddr[MAXMACADDRLEN];
6621         int                     err = 0;
6622         boolean_t               prim;
6623         boolean_t               multiclnt = B_FALSE;
6624 
6625         mac_rx_client_quiesce((mac_client_handle_t)mcip);
6626         ASSERT(mcip->mci_unicast != NULL);
6627         bcopy(mcip->mci_unicast->ma_addr, maddr, mcip->mci_unicast->ma_len);
6628 
6629         prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
6630         if (mcip->mci_unicast->ma_nusers > 1) {
6631                 mac_rx_move_macaddr_prim(mcip, fgrp, NULL, maddr, B_FALSE);
6632                 multiclnt = B_TRUE;
6633         }
6634         ASSERT(mcip->mci_unicast->ma_nusers == 1);
6635         err = mac_remove_macaddr(mcip->mci_unicast);
6636         if (err != 0) {
6637                 mac_rx_client_restart((mac_client_handle_t)mcip);
6638                 if (multiclnt) {
6639                         mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr,
6640                             B_TRUE);
6641                 }
6642                 return (err);
6643         }
6644         /*
6645          * Program the H/W Classifier first, if this fails we need
6646          * not proceed with the other stuff.
6647          */
6648         if ((err = mac_add_macaddr(mip, tgrp, maddr, prim)) != 0) {
6649                 /* Revert back the H/W Classifier */
6650                 if ((err = mac_add_macaddr(mip, fgrp, maddr, prim)) != 0) {
6651                         /*
6652                          * This should not fail now since it worked earlier,
6653                          * should we panic?
6654                          */
6655                         cmn_err(CE_WARN,
6656                             "mac_rx_switch_group: switching %p back"
6657                             " to group %p failed!!", (void *)mcip,
6658                             (void *)fgrp);
6659                 }
6660                 mac_rx_client_restart((mac_client_handle_t)mcip);
6661                 if (multiclnt) {
6662                         mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr,
6663                             B_TRUE);
6664                 }
6665                 return (err);
6666         }
6667         mcip->mci_unicast = mac_find_macaddr(mip, maddr);
6668         mac_rx_client_restart((mac_client_handle_t)mcip);
6669         if (multiclnt)
6670                 mac_rx_move_macaddr_prim(mcip, fgrp, tgrp, maddr, B_TRUE);
6671         return (err);
6672 }
6673 
6674 /*
6675  * Switch the MAC client from one group to another. This means we need
6676  * to remove the MAC address from the group, remove the MAC client,
6677  * teardown the SRSs and revert the group state. Then, we add the client
6678  * to the destination group, set the SRSs, and add the MAC address to the
6679  * group.
6680  */
6681 int
6682 mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
6683     mac_group_t *tgrp)
6684 {
6685         int                     err;
6686         mac_group_state_t       next_state;
6687         mac_client_impl_t       *group_only_mcip;
6688         mac_client_impl_t       *gmcip;
6689         mac_impl_t              *mip = mcip->mci_mip;
6690         mac_grp_client_t        *mgcp;
6691 
6692         ASSERT(fgrp == mcip->mci_flent->fe_rx_ring_group);
6693 
6694         if ((err = mac_rx_move_macaddr(mcip, fgrp, tgrp)) != 0)
6695                 return (err);
6696 
6697         /*
6698          * The group might be reserved, but SRSs may not be set up, e.g.
6699          * primary and its vlans using a reserved group.
6700          */
6701         if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED &&
6702             MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) {
6703                 mac_rx_srs_group_teardown(mcip->mci_flent, B_TRUE);
6704         }
6705         if (fgrp != MAC_DEFAULT_RX_GROUP(mip)) {
6706                 mgcp = fgrp->mrg_clients;
6707                 while (mgcp != NULL) {
6708                         gmcip = mgcp->mgc_client;
6709                         mgcp = mgcp->mgc_next;
6710                         mac_group_remove_client(fgrp, gmcip);
6711                         mac_group_add_client(tgrp, gmcip);
6712                         gmcip->mci_flent->fe_rx_ring_group = tgrp;
6713                 }
6714                 mac_release_rx_group(mcip, fgrp);
6715                 ASSERT(MAC_GROUP_NO_CLIENT(fgrp));
6716                 mac_set_group_state(fgrp, MAC_GROUP_STATE_REGISTERED);
6717         } else {
6718                 mac_group_remove_client(fgrp, mcip);
6719                 mac_group_add_client(tgrp, mcip);
6720                 mcip->mci_flent->fe_rx_ring_group = tgrp;
6721                 /*
6722                  * If there are other clients (VLANs) sharing this address
6723                  * we should be here only for the primary.
6724                  */
6725                 if (mcip->mci_unicast->ma_nusers > 1) {
6726                         /*
6727                          * We need to move all the clients that are using
6728                          * this h/w address.
6729                          */
6730                         mgcp = fgrp->mrg_clients;
6731                         while (mgcp != NULL) {
6732                                 gmcip = mgcp->mgc_client;
6733                                 mgcp = mgcp->mgc_next;
6734                                 if (mcip->mci_unicast == gmcip->mci_unicast) {
6735                                         mac_group_remove_client(fgrp, gmcip);
6736                                         mac_group_add_client(tgrp, gmcip);
6737                                         gmcip->mci_flent->fe_rx_ring_group =
6738                                             tgrp;
6739                                 }
6740                         }
6741                 }
6742                 /*
6743                  * The default group will still take the multicast,
6744                  * broadcast traffic etc., so it won't go to
6745                  * MAC_GROUP_STATE_REGISTERED.
6746                  */
6747                 if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED)
6748                         mac_rx_group_unmark(fgrp, MR_CONDEMNED);
6749                 mac_set_group_state(fgrp, MAC_GROUP_STATE_SHARED);
6750         }
6751         next_state = mac_group_next_state(tgrp, &group_only_mcip,
6752             MAC_DEFAULT_RX_GROUP(mip), B_TRUE);
6753         mac_set_group_state(tgrp, next_state);
6754         /*
6755          * If the destination group is reserved, setup the SRSs etc.
6756          */
6757         if (tgrp->mrg_state == MAC_GROUP_STATE_RESERVED) {
6758                 mac_rx_srs_group_setup(mcip, mcip->mci_flent, SRST_LINK);
6759                 mac_fanout_setup(mcip, mcip->mci_flent,
6760                     MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver, mcip, NULL,
6761                     NULL);
6762                 mac_rx_group_unmark(tgrp, MR_INCIPIENT);
6763         } else {
6764                 mac_rx_switch_grp_to_sw(tgrp);
6765         }
6766         return (0);
6767 }
6768 
6769 /*
6770  * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
6771  * when a share was allocated to the client.
6772  */
6773 mac_group_t *
6774 mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move)
6775 {
6776         mac_impl_t              *mip = mcip->mci_mip;
6777         mac_group_t             *grp = NULL;
6778         int                     rv;
6779         int                     i;
6780         int                     err;
6781         mac_group_t             *defgrp;
6782         mac_share_handle_t      share = mcip->mci_share;
6783         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
6784         int                     nrings;
6785         int                     defnrings;
6786         boolean_t               need_exclgrp = B_FALSE;
6787         int                     need_rings = 0;
6788         mac_group_t             *candidate_grp = NULL;
6789         mac_client_impl_t       *gclient;
6790         mac_resource_props_t    *gmrp;
6791         boolean_t               txhw = mrp->mrp_mask & MRP_TX_RINGS;
6792         boolean_t               unspec = mrp->mrp_mask & MRP_TXRINGS_UNSPEC;
6793         boolean_t               isprimary;
6794 
6795         isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC;
6796         /*
6797          * When we come here for a VLAN on the primary (dladm create-vlan),
6798          * we need to pair it along with the primary (to keep it consistent
6799          * with the RX side). So, we check if the primary is already assigned
6800          * to a group and return the group if so. The other way is also
6801          * true, i.e. the VLAN is already created and now we are plumbing
6802          * the primary.
6803          */
6804         if (!move && isprimary) {
6805                 for (gclient = mip->mi_clients_list; gclient != NULL;
6806                     gclient = gclient->mci_client_next) {
6807                         if (gclient->mci_flent->fe_type & FLOW_PRIMARY_MAC &&
6808                             gclient->mci_flent->fe_tx_ring_group != NULL) {
6809                                 return (gclient->mci_flent->fe_tx_ring_group);
6810                         }
6811                 }
6812         }
6813 
6814         if (mip->mi_tx_groups == NULL || mip->mi_tx_group_count == 0)
6815                 return (NULL);
6816 
6817         /* For dynamic groups, default unspec to 1 */
6818         if (txhw && unspec &&
6819             mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6820                 mrp->mrp_ntxrings = 1;
6821         }
6822         /*
6823          * For static grouping we allow only specifying rings=0 and
6824          * unspecified
6825          */
6826         if (txhw && mrp->mrp_ntxrings > 0 &&
6827             mip->mi_tx_group_type == MAC_GROUP_TYPE_STATIC) {
6828                 return (NULL);
6829         }
6830 
6831         if (txhw) {
6832                 /*
6833                  * We have explicitly asked for a group (with ntxrings,
6834                  * if unspec).
6835                  */
6836                 if (unspec || mrp->mrp_ntxrings > 0) {
6837                         need_exclgrp = B_TRUE;
6838                         need_rings = mrp->mrp_ntxrings;
6839                 } else if (mrp->mrp_ntxrings == 0) {
6840                         /*
6841                          * We have asked for a software group.
6842                          */
6843                         return (NULL);
6844                 }
6845         }
6846         defgrp = MAC_DEFAULT_TX_GROUP(mip);
6847         /*
6848          * The number of rings that the default group can donate.
6849          * We need to leave at least one ring - the default ring - in
6850          * this group.
6851          */
6852         defnrings = defgrp->mrg_cur_count - 1;
6853 
6854         /*
6855          * Primary gets default group unless explicitly told not
6856          * to  (i.e. rings > 0).
6857          */
6858         if (isprimary && !need_exclgrp)
6859                 return (NULL);
6860 
6861         nrings = (mrp->mrp_mask & MRP_TX_RINGS) != 0 ? mrp->mrp_ntxrings : 1;
6862         for (i = 0; i <  mip->mi_tx_group_count; i++) {
6863                 grp = &mip->mi_tx_groups[i];
6864                 if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
6865                     (grp->mrg_state == MAC_GROUP_STATE_UNINIT)) {
6866                         /*
6867                          * Select a candidate for replacement if we don't
6868                          * get an exclusive group. A candidate group is one
6869                          * that didn't ask for an exclusive group, but got
6870                          * one and it has enough rings (combined with what
6871                          * the default group can donate) for the new MAC
6872                          * client.
6873                          */
6874                         if (grp->mrg_state == MAC_GROUP_STATE_RESERVED &&
6875                             candidate_grp == NULL) {
6876                                 gclient = MAC_GROUP_ONLY_CLIENT(grp);
6877                                 if (gclient == NULL)
6878                                         gclient = mac_get_grp_primary(grp);
6879                                 gmrp = MCIP_RESOURCE_PROPS(gclient);
6880                                 if (gclient->mci_share == NULL &&
6881                                     (gmrp->mrp_mask & MRP_TX_RINGS) == 0 &&
6882                                     (unspec ||
6883                                     (grp->mrg_cur_count + defnrings) >=
6884                                     need_rings)) {
6885                                         candidate_grp = grp;
6886                                 }
6887                         }
6888                         continue;
6889                 }
6890                 /*
6891                  * If the default can't donate let's just walk and
6892                  * see if someone can vacate a group, so that we have
6893                  * enough rings for this.
6894                  */
6895                 if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC ||
6896                     nrings <= defnrings) {
6897                         if (grp->mrg_state == MAC_GROUP_STATE_REGISTERED) {
6898                                 rv = mac_start_group(grp);
6899                                 ASSERT(rv == 0);
6900                         }
6901                         break;
6902                 }
6903         }
6904 
6905         /* The default group */
6906         if (i >= mip->mi_tx_group_count) {
6907                 /*
6908                  * If we need an exclusive group and have identified a
6909                  * candidate group we switch the MAC client from the
6910                  * candidate group to the default group and give the
6911                  * candidate group to this client.
6912                  */
6913                 if (need_exclgrp && candidate_grp != NULL) {
6914                         /*
6915                          * Switch the MAC client from the candidate group
6916                          * to the default group.
6917                          */
6918                         grp = candidate_grp;
6919                         gclient = MAC_GROUP_ONLY_CLIENT(grp);
6920                         if (gclient == NULL)
6921                                 gclient = mac_get_grp_primary(grp);
6922                         mac_tx_client_quiesce((mac_client_handle_t)gclient);
6923                         mac_tx_switch_group(gclient, grp, defgrp);
6924                         mac_tx_client_restart((mac_client_handle_t)gclient);
6925 
6926                         /*
6927                          * Give the candidate group with the specified number
6928                          * of rings to this MAC client.
6929                          */
6930                         ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
6931                         rv = mac_start_group(grp);
6932                         ASSERT(rv == 0);
6933 
6934                         if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC)
6935                                 return (grp);
6936 
6937                         ASSERT(grp->mrg_cur_count == 0);
6938                         ASSERT(defgrp->mrg_cur_count > need_rings);
6939 
6940                         err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX,
6941                             defgrp, grp, share, need_rings);
6942                         if (err == 0) {
6943                                 /*
6944                                  * For a share i_mac_group_allocate_rings gets
6945                                  * the rings from the driver, let's populate
6946                                  * the property for the client now.
6947                                  */
6948                                 if (share != NULL) {
6949                                         mac_client_set_rings(
6950                                             (mac_client_handle_t)mcip, -1,
6951                                             grp->mrg_cur_count);
6952                                 }
6953                                 mip->mi_tx_group_free--;
6954                                 return (grp);
6955                         }
6956                         DTRACE_PROBE3(tx__group__reserve__alloc__rings, char *,
6957                             mip->mi_name, int, grp->mrg_index, int, err);
6958                         mac_stop_group(grp);
6959                 }
6960                 return (NULL);
6961         }
6962         /*
6963          * We got an exclusive group, but it is not dynamic.
6964          */
6965         if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
6966                 mip->mi_tx_group_free--;
6967                 return (grp);
6968         }
6969 
6970         rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, defgrp, grp,
6971             share, nrings);
6972         if (rv != 0) {
6973                 DTRACE_PROBE3(tx__group__reserve__alloc__rings,
6974                     char *, mip->mi_name, int, grp->mrg_index, int, rv);
6975                 mac_stop_group(grp);
6976                 return (NULL);
6977         }
6978         /*
6979          * For a share i_mac_group_allocate_rings gets the rings from the
6980          * driver, let's populate the property for the client now.
6981          */
6982         if (share != NULL) {
6983                 mac_client_set_rings((mac_client_handle_t)mcip, -1,
6984                     grp->mrg_cur_count);
6985         }
6986         mip->mi_tx_group_free--;
6987         return (grp);
6988 }
6989 
6990 void
6991 mac_release_tx_group(mac_client_impl_t *mcip, mac_group_t *grp)
6992 {
6993         mac_impl_t              *mip = mcip->mci_mip;
6994         mac_share_handle_t      share = mcip->mci_share;
6995         mac_ring_t              *ring;
6996         mac_soft_ring_set_t     *srs = MCIP_TX_SRS(mcip);
6997         mac_group_t             *defgrp;
6998 
6999         defgrp = MAC_DEFAULT_TX_GROUP(mip);
7000         if (srs != NULL) {
7001                 if (srs->srs_soft_ring_count > 0) {
7002                         for (ring = grp->mrg_rings; ring != NULL;
7003                             ring = ring->mr_next) {
7004                                 ASSERT(mac_tx_srs_ring_present(srs, ring));
7005                                 mac_tx_invoke_callbacks(mcip,
7006                                     (mac_tx_cookie_t)
7007                                     mac_tx_srs_get_soft_ring(srs, ring));
7008                                 mac_tx_srs_del_ring(srs, ring);
7009                         }
7010                 } else {
7011                         ASSERT(srs->srs_tx.st_arg2 != NULL);
7012                         srs->srs_tx.st_arg2 = NULL;
7013                         mac_srs_stat_delete(srs);
7014                 }
7015         }
7016         if (share != NULL)
7017                 mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
7018 
7019         /* move the ring back to the pool */
7020         if (mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
7021                 while ((ring = grp->mrg_rings) != NULL)
7022                         (void) mac_group_mov_ring(mip, defgrp, ring);
7023         }
7024         mac_stop_group(grp);
7025         mip->mi_tx_group_free++;
7026 }
7027 
7028 /*
7029  * Disassociate a MAC client from a group, i.e go through the rings in the
7030  * group and delete all the soft rings tied to them.
7031  */
7032 static void
7033 mac_tx_dismantle_soft_rings(mac_group_t *fgrp, flow_entry_t *flent)
7034 {
7035         mac_client_impl_t       *mcip = flent->fe_mcip;
7036         mac_soft_ring_set_t     *tx_srs;
7037         mac_srs_tx_t            *tx;
7038         mac_ring_t              *ring;
7039 
7040         tx_srs = flent->fe_tx_srs;
7041         tx = &tx_srs->srs_tx;
7042 
7043         /* Single ring case we haven't created any soft rings */
7044         if (tx->st_mode == SRS_TX_BW || tx->st_mode == SRS_TX_SERIALIZE ||
7045             tx->st_mode == SRS_TX_DEFAULT) {
7046                 tx->st_arg2 = NULL;
7047                 mac_srs_stat_delete(tx_srs);
7048         /* Fanout case, where we have to dismantle the soft rings */
7049         } else {
7050                 for (ring = fgrp->mrg_rings; ring != NULL;
7051                     ring = ring->mr_next) {
7052                         ASSERT(mac_tx_srs_ring_present(tx_srs, ring));
7053                         mac_tx_invoke_callbacks(mcip,
7054                             (mac_tx_cookie_t)mac_tx_srs_get_soft_ring(tx_srs,
7055                             ring));
7056                         mac_tx_srs_del_ring(tx_srs, ring);
7057                 }
7058                 ASSERT(tx->st_arg2 == NULL);
7059         }
7060 }
7061 
7062 /*
7063  * Switch the MAC client from one group to another. This means we need
7064  * to remove the MAC client, teardown the SRSs and revert the group state.
7065  * Then, we add the client to the destination roup, set the SRSs etc.
7066  */
7067 void
7068 mac_tx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
7069     mac_group_t *tgrp)
7070 {
7071         mac_client_impl_t       *group_only_mcip;
7072         mac_impl_t              *mip = mcip->mci_mip;
7073         flow_entry_t            *flent = mcip->mci_flent;
7074         mac_group_t             *defgrp;
7075         mac_grp_client_t        *mgcp;
7076         mac_client_impl_t       *gmcip;
7077         flow_entry_t            *gflent;
7078 
7079         defgrp = MAC_DEFAULT_TX_GROUP(mip);
7080         ASSERT(fgrp == flent->fe_tx_ring_group);
7081 
7082         if (fgrp == defgrp) {
7083                 /*
7084                  * If this is the primary we need to find any VLANs on
7085                  * the primary and move them too.
7086                  */
7087                 mac_group_remove_client(fgrp, mcip);
7088                 mac_tx_dismantle_soft_rings(fgrp, flent);
7089                 if (mcip->mci_unicast->ma_nusers > 1) {
7090                         mgcp = fgrp->mrg_clients;
7091                         while (mgcp != NULL) {
7092                                 gmcip = mgcp->mgc_client;
7093                                 mgcp = mgcp->mgc_next;
7094                                 if (mcip->mci_unicast != gmcip->mci_unicast)
7095                                         continue;
7096                                 mac_tx_client_quiesce(
7097                                     (mac_client_handle_t)gmcip);
7098 
7099                                 gflent = gmcip->mci_flent;
7100                                 mac_group_remove_client(fgrp, gmcip);
7101                                 mac_tx_dismantle_soft_rings(fgrp, gflent);
7102 
7103                                 mac_group_add_client(tgrp, gmcip);
7104                                 gflent->fe_tx_ring_group = tgrp;
7105                                 /* We could directly set this to SHARED */
7106                                 tgrp->mrg_state = mac_group_next_state(tgrp,
7107                                     &group_only_mcip, defgrp, B_FALSE);
7108 
7109                                 mac_tx_srs_group_setup(gmcip, gflent,
7110                                     SRST_LINK);
7111                                 mac_fanout_setup(gmcip, gflent,
7112                                     MCIP_RESOURCE_PROPS(gmcip), mac_rx_deliver,
7113                                     gmcip, NULL, NULL);
7114 
7115                                 mac_tx_client_restart(
7116                                     (mac_client_handle_t)gmcip);
7117                         }
7118                 }
7119                 if (MAC_GROUP_NO_CLIENT(fgrp)) {
7120                         mac_ring_t      *ring;
7121                         int             cnt;
7122                         int             ringcnt;
7123 
7124                         fgrp->mrg_state = MAC_GROUP_STATE_REGISTERED;
7125                         /*
7126                          * Additionally, we also need to stop all
7127                          * the rings in the default group, except
7128                          * the default ring. The reason being
7129                          * this group won't be released since it is
7130                          * the default group, so the rings won't
7131                          * be stopped otherwise.
7132                          */
7133                         ringcnt = fgrp->mrg_cur_count;
7134                         ring = fgrp->mrg_rings;
7135                         for (cnt = 0; cnt < ringcnt; cnt++) {
7136                                 if (ring->mr_state == MR_INUSE &&
7137                                     ring !=
7138                                     (mac_ring_t *)mip->mi_default_tx_ring) {
7139                                         mac_stop_ring(ring);
7140                                         ring->mr_flag = 0;
7141                                 }
7142                                 ring = ring->mr_next;
7143                         }
7144                 } else if (MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) {
7145                         fgrp->mrg_state = MAC_GROUP_STATE_RESERVED;
7146                 } else {
7147                         ASSERT(fgrp->mrg_state == MAC_GROUP_STATE_SHARED);
7148                 }
7149         } else {
7150                 /*
7151                  * We could have VLANs sharing the non-default group with
7152                  * the primary.
7153                  */
7154                 mgcp = fgrp->mrg_clients;
7155                 while (mgcp != NULL) {
7156                         gmcip = mgcp->mgc_client;
7157                         mgcp = mgcp->mgc_next;
7158                         if (gmcip == mcip)
7159                                 continue;
7160                         mac_tx_client_quiesce((mac_client_handle_t)gmcip);
7161                         gflent = gmcip->mci_flent;
7162 
7163                         mac_group_remove_client(fgrp, gmcip);
7164                         mac_tx_dismantle_soft_rings(fgrp, gflent);
7165 
7166                         mac_group_add_client(tgrp, gmcip);
7167                         gflent->fe_tx_ring_group = tgrp;
7168                         /* We could directly set this to SHARED */
7169                         tgrp->mrg_state = mac_group_next_state(tgrp,
7170                             &group_only_mcip, defgrp, B_FALSE);
7171                         mac_tx_srs_group_setup(gmcip, gflent, SRST_LINK);
7172                         mac_fanout_setup(gmcip, gflent,
7173                             MCIP_RESOURCE_PROPS(gmcip), mac_rx_deliver,
7174                             gmcip, NULL, NULL);
7175 
7176                         mac_tx_client_restart((mac_client_handle_t)gmcip);
7177                 }
7178                 mac_group_remove_client(fgrp, mcip);
7179                 mac_release_tx_group(mcip, fgrp);
7180                 fgrp->mrg_state = MAC_GROUP_STATE_REGISTERED;
7181         }
7182 
7183         /* Add it to the tgroup */
7184         mac_group_add_client(tgrp, mcip);
7185         flent->fe_tx_ring_group = tgrp;
7186         tgrp->mrg_state = mac_group_next_state(tgrp, &group_only_mcip,
7187             defgrp, B_FALSE);
7188 
7189         mac_tx_srs_group_setup(mcip, flent, SRST_LINK);
7190         mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
7191             mac_rx_deliver, mcip, NULL, NULL);
7192 }
7193 
7194 /*
7195  * This is a 1-time control path activity initiated by the client (IP).
7196  * The mac perimeter protects against other simultaneous control activities,
7197  * for example an ioctl that attempts to change the degree of fanout and
7198  * increase or decrease the number of softrings associated with this Tx SRS.
7199  */
7200 static mac_tx_notify_cb_t *
7201 mac_client_tx_notify_add(mac_client_impl_t *mcip,
7202     mac_tx_notify_t notify, void *arg)
7203 {
7204         mac_cb_info_t *mcbi;
7205         mac_tx_notify_cb_t *mtnfp;
7206 
7207         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
7208 
7209         mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
7210         mtnfp->mtnf_fn = notify;
7211         mtnfp->mtnf_arg = arg;
7212         mtnfp->mtnf_link.mcb_objp = mtnfp;
7213         mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
7214         mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;
7215 
7216         mcbi = &mcip->mci_tx_notify_cb_info;
7217         mutex_enter(mcbi->mcbi_lockp);
7218         mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
7219         mutex_exit(mcbi->mcbi_lockp);
7220         return (mtnfp);
7221 }
7222 
7223 static void
7224 mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
7225 {
7226         mac_cb_info_t   *mcbi;
7227         mac_cb_t        **cblist;
7228 
7229         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
7230 
7231         if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
7232             &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
7233                 cmn_err(CE_WARN,
7234                     "mac_client_tx_notify_remove: callback not "
7235                     "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
7236                 return;
7237         }
7238 
7239         mcbi = &mcip->mci_tx_notify_cb_info;
7240         cblist = &mcip->mci_tx_notify_cb_list;
7241         mutex_enter(mcbi->mcbi_lockp);
7242         if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
7243                 kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
7244         else
7245                 mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
7246         mutex_exit(mcbi->mcbi_lockp);
7247 }
7248 
7249 /*
7250  * mac_client_tx_notify():
7251  * call to add and remove flow control callback routine.
7252  */
7253 mac_tx_notify_handle_t
7254 mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
7255     void *ptr)
7256 {
7257         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
7258         mac_tx_notify_cb_t      *mtnfp = NULL;
7259 
7260         i_mac_perim_enter(mcip->mci_mip);
7261 
7262         if (callb_func != NULL) {
7263                 /* Add a notify callback */
7264                 mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
7265         } else {
7266                 mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
7267         }
7268         i_mac_perim_exit(mcip->mci_mip);
7269 
7270         return ((mac_tx_notify_handle_t)mtnfp);
7271 }
7272 
7273 void
7274 mac_bridge_vectors(mac_bridge_tx_t txf, mac_bridge_rx_t rxf,
7275     mac_bridge_ref_t reff, mac_bridge_ls_t lsf)
7276 {
7277         mac_bridge_tx_cb = txf;
7278         mac_bridge_rx_cb = rxf;
7279         mac_bridge_ref_cb = reff;
7280         mac_bridge_ls_cb = lsf;
7281 }
7282 
7283 int
7284 mac_bridge_set(mac_handle_t mh, mac_handle_t link)
7285 {
7286         mac_impl_t *mip = (mac_impl_t *)mh;
7287         int retv;
7288 
7289         mutex_enter(&mip->mi_bridge_lock);
7290         if (mip->mi_bridge_link == NULL) {
7291                 mip->mi_bridge_link = link;
7292                 retv = 0;
7293         } else {
7294                 retv = EBUSY;
7295         }
7296         mutex_exit(&mip->mi_bridge_lock);
7297         if (retv == 0) {
7298                 mac_poll_state_change(mh, B_FALSE);
7299                 mac_capab_update(mh);
7300         }
7301         return (retv);
7302 }
7303 
7304 /*
7305  * Disable bridging on the indicated link.
7306  */
7307 void
7308 mac_bridge_clear(mac_handle_t mh, mac_handle_t link)
7309 {
7310         mac_impl_t *mip = (mac_impl_t *)mh;
7311 
7312         mutex_enter(&mip->mi_bridge_lock);
7313         ASSERT(mip->mi_bridge_link == link);
7314         mip->mi_bridge_link = NULL;
7315         mutex_exit(&mip->mi_bridge_lock);
7316         mac_poll_state_change(mh, B_TRUE);
7317         mac_capab_update(mh);
7318 }
7319 
7320 void
7321 mac_no_active(mac_handle_t mh)
7322 {
7323         mac_impl_t *mip = (mac_impl_t *)mh;
7324 
7325         i_mac_perim_enter(mip);
7326         mip->mi_state_flags |= MIS_NO_ACTIVE;
7327         i_mac_perim_exit(mip);
7328 }
7329 
7330 /*
7331  * Walk the primary VLAN clients whenever the primary's rings property
7332  * changes and update the mac_resource_props_t for the VLAN's client.
7333  * We need to do this since we don't support setting these properties
7334  * on the primary's VLAN clients, but the VLAN clients have to
7335  * follow the primary w.r.t the rings property;
7336  */
7337 void
7338 mac_set_prim_vlan_rings(mac_impl_t  *mip, mac_resource_props_t *mrp)
7339 {
7340         mac_client_impl_t       *vmcip;
7341         mac_resource_props_t    *vmrp;
7342 
7343         for (vmcip = mip->mi_clients_list; vmcip != NULL;
7344             vmcip = vmcip->mci_client_next) {
7345                 if (!(vmcip->mci_flent->fe_type & FLOW_PRIMARY_MAC) ||
7346                     mac_client_vid((mac_client_handle_t)vmcip) ==
7347                     VLAN_ID_NONE) {
7348                         continue;
7349                 }
7350                 vmrp = MCIP_RESOURCE_PROPS(vmcip);
7351 
7352                 vmrp->mrp_nrxrings =  mrp->mrp_nrxrings;
7353                 if (mrp->mrp_mask & MRP_RX_RINGS)
7354                         vmrp->mrp_mask |= MRP_RX_RINGS;
7355                 else if (vmrp->mrp_mask & MRP_RX_RINGS)
7356                         vmrp->mrp_mask &= ~MRP_RX_RINGS;
7357 
7358                 vmrp->mrp_ntxrings =  mrp->mrp_ntxrings;
7359                 if (mrp->mrp_mask & MRP_TX_RINGS)
7360                         vmrp->mrp_mask |= MRP_TX_RINGS;
7361                 else if (vmrp->mrp_mask & MRP_TX_RINGS)
7362                         vmrp->mrp_mask &= ~MRP_TX_RINGS;
7363 
7364                 if (mrp->mrp_mask & MRP_RXRINGS_UNSPEC)
7365                         vmrp->mrp_mask |= MRP_RXRINGS_UNSPEC;
7366                 else
7367                         vmrp->mrp_mask &= ~MRP_RXRINGS_UNSPEC;
7368 
7369                 if (mrp->mrp_mask & MRP_TXRINGS_UNSPEC)
7370                         vmrp->mrp_mask |= MRP_TXRINGS_UNSPEC;
7371                 else
7372                         vmrp->mrp_mask &= ~MRP_TXRINGS_UNSPEC;
7373         }
7374 }
7375 
7376 /*
7377  * We are adding or removing ring(s) from a group. The source for taking
7378  * rings is the default group. The destination for giving rings back is
7379  * the default group.
7380  */
7381 int
7382 mac_group_ring_modify(mac_client_impl_t *mcip, mac_group_t *group,
7383     mac_group_t *defgrp)
7384 {
7385         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
7386         uint_t                  modify;
7387         int                     count;
7388         mac_ring_t              *ring;
7389         mac_ring_t              *next;
7390         mac_impl_t              *mip = mcip->mci_mip;
7391         mac_ring_t              **rings;
7392         uint_t                  ringcnt;
7393         int                     i = 0;
7394         boolean_t               rx_group = group->mrg_type == MAC_RING_TYPE_RX;
7395         int                     start;
7396         int                     end;
7397         mac_group_t             *tgrp;
7398         int                     j;
7399         int                     rv = 0;
7400 
7401         /*
7402          * If we are asked for just a group, we give 1 ring, else
7403          * the specified number of rings.
7404          */
7405         if (rx_group) {
7406                 ringcnt = (mrp->mrp_mask & MRP_RXRINGS_UNSPEC) ? 1:
7407                     mrp->mrp_nrxrings;
7408         } else {
7409                 ringcnt = (mrp->mrp_mask & MRP_TXRINGS_UNSPEC) ? 1:
7410                     mrp->mrp_ntxrings;
7411         }
7412 
7413         /* don't allow modifying rings for a share for now. */
7414         ASSERT(mcip->mci_share == NULL);
7415 
7416         if (ringcnt == group->mrg_cur_count)
7417                 return (0);
7418 
7419         if (group->mrg_cur_count > ringcnt) {
7420                 modify = group->mrg_cur_count - ringcnt;
7421                 if (rx_group) {
7422                         if (mip->mi_rx_donor_grp == group) {
7423                                 ASSERT(mac_is_primary_client(mcip));
7424                                 mip->mi_rx_donor_grp = defgrp;
7425                         } else {
7426                                 defgrp = mip->mi_rx_donor_grp;
7427                         }
7428                 }
7429                 ring = group->mrg_rings;
7430                 rings = kmem_alloc(modify * sizeof (mac_ring_handle_t),
7431                     KM_SLEEP);
7432                 j = 0;
7433                 for (count = 0; count < modify; count++) {
7434                         next = ring->mr_next;
7435                         rv = mac_group_mov_ring(mip, defgrp, ring);
7436                         if (rv != 0) {
7437                                 /* cleanup on failure */
7438                                 for (j = 0; j < count; j++) {
7439                                         (void) mac_group_mov_ring(mip, group,
7440                                             rings[j]);
7441                                 }
7442                                 break;
7443                         }
7444                         rings[j++] = ring;
7445                         ring = next;
7446                 }
7447                 kmem_free(rings, modify * sizeof (mac_ring_handle_t));
7448                 return (rv);
7449         }
7450         if (ringcnt >= MAX_RINGS_PER_GROUP)
7451                 return (EINVAL);
7452 
7453         modify = ringcnt - group->mrg_cur_count;
7454 
7455         if (rx_group) {
7456                 if (group != mip->mi_rx_donor_grp)
7457                         defgrp = mip->mi_rx_donor_grp;
7458                 else
7459                         /*
7460                          * This is the donor group with all the remaining
7461                          * rings. Default group now gets to be the donor
7462                          */
7463                         mip->mi_rx_donor_grp = defgrp;
7464                 start = 1;
7465                 end = mip->mi_rx_group_count;
7466         } else {
7467                 start = 0;
7468                 end = mip->mi_tx_group_count - 1;
7469         }
7470         /*
7471          * If the default doesn't have any rings, lets see if we can
7472          * take rings given to an h/w client that doesn't need it.
7473          * For now, we just see if there is  any one client that can donate
7474          * all the required rings.
7475          */
7476         if (defgrp->mrg_cur_count < (modify + 1)) {
7477                 for (i = start; i < end; i++) {
7478                         if (rx_group) {
7479                                 tgrp = &mip->mi_rx_groups[i];
7480                                 if (tgrp == group || tgrp->mrg_state <
7481                                     MAC_GROUP_STATE_RESERVED) {
7482                                         continue;
7483                                 }
7484                                 mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
7485                                 if (mcip == NULL)
7486                                         mcip = mac_get_grp_primary(tgrp);
7487                                 ASSERT(mcip != NULL);
7488                                 mrp = MCIP_RESOURCE_PROPS(mcip);
7489                                 if ((mrp->mrp_mask & MRP_RX_RINGS) != 0)
7490                                         continue;
7491                                 if ((tgrp->mrg_cur_count +
7492                                     defgrp->mrg_cur_count) < (modify + 1)) {
7493                                         continue;
7494                                 }
7495                                 if (mac_rx_switch_group(mcip, tgrp,
7496                                     defgrp) != 0) {
7497                                         return (ENOSPC);
7498                                 }
7499                         } else {
7500                                 tgrp = &mip->mi_tx_groups[i];
7501                                 if (tgrp == group || tgrp->mrg_state <
7502                                     MAC_GROUP_STATE_RESERVED) {
7503                                         continue;
7504                                 }
7505                                 mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
7506                                 if (mcip == NULL)
7507                                         mcip = mac_get_grp_primary(tgrp);
7508                                 mrp = MCIP_RESOURCE_PROPS(mcip);
7509                                 if ((mrp->mrp_mask & MRP_TX_RINGS) != 0)
7510                                         continue;
7511                                 if ((tgrp->mrg_cur_count +
7512                                     defgrp->mrg_cur_count) < (modify + 1)) {
7513                                         continue;
7514                                 }
7515                                 /* OK, we can switch this to s/w */
7516                                 mac_tx_client_quiesce(
7517                                     (mac_client_handle_t)mcip);
7518                                 mac_tx_switch_group(mcip, tgrp, defgrp);
7519                                 mac_tx_client_restart(
7520                                     (mac_client_handle_t)mcip);
7521                         }
7522                 }
7523                 if (defgrp->mrg_cur_count < (modify + 1))
7524                         return (ENOSPC);
7525         }
7526         if ((rv = i_mac_group_allocate_rings(mip, group->mrg_type, defgrp,
7527             group, mcip->mci_share, modify)) != 0) {
7528                 return (rv);
7529         }
7530         return (0);
7531 }
7532 
7533 /*
7534  * Given the poolname in mac_resource_props, find the cpupart
7535  * that is associated with this pool.  The cpupart will be used
7536  * later for finding the cpus to be bound to the networking threads.
7537  *
7538  * use_default is set B_TRUE if pools are enabled and pool_default
7539  * is returned.  This avoids a 2nd lookup to set the poolname
7540  * for pool-effective.
7541  *
7542  * returns:
7543  *
7544  *    NULL -   pools are disabled or if the 'cpus' property is set.
7545  *    cpupart of pool_default  - pools are enabled and the pool
7546  *             is not available or poolname is blank
7547  *    cpupart of named pool    - pools are enabled and the pool
7548  *             is available.
7549  */
7550 cpupart_t *
7551 mac_pset_find(mac_resource_props_t *mrp, boolean_t *use_default)
7552 {
7553         pool_t          *pool;
7554         cpupart_t       *cpupart;
7555 
7556         *use_default = B_FALSE;
7557 
7558         /* CPUs property is set */
7559         if (mrp->mrp_mask & MRP_CPUS)
7560                 return (NULL);
7561 
7562         ASSERT(pool_lock_held());
7563 
7564         /* Pools are disabled, no pset */
7565         if (pool_state == POOL_DISABLED)
7566                 return (NULL);
7567 
7568         /* Pools property is set */
7569         if (mrp->mrp_mask & MRP_POOL) {
7570                 if ((pool = pool_lookup_pool_by_name(mrp->mrp_pool)) == NULL) {
7571                         /* Pool not found */
7572                         DTRACE_PROBE1(mac_pset_find_no_pool, char *,
7573                             mrp->mrp_pool);
7574                         *use_default = B_TRUE;
7575                         pool = pool_default;
7576                 }
7577         /* Pools property is not set */
7578         } else {
7579                 *use_default = B_TRUE;
7580                 pool = pool_default;
7581         }
7582 
7583         /* Find the CPU pset that corresponds to the pool */
7584         mutex_enter(&cpu_lock);
7585         if ((cpupart = cpupart_find(pool->pool_pset->pset_id)) == NULL) {
7586                 DTRACE_PROBE1(mac_find_pset_no_pset, psetid_t,
7587                     pool->pool_pset->pset_id);
7588         }
7589         mutex_exit(&cpu_lock);
7590 
7591         return (cpupart);
7592 }
7593 
7594 void
7595 mac_set_pool_effective(boolean_t use_default, cpupart_t *cpupart,
7596     mac_resource_props_t *mrp, mac_resource_props_t *emrp)
7597 {
7598         ASSERT(pool_lock_held());
7599 
7600         if (cpupart != NULL) {
7601                 emrp->mrp_mask |= MRP_POOL;
7602                 if (use_default) {
7603                         (void) strcpy(emrp->mrp_pool,
7604                             "pool_default");
7605                 } else {
7606                         ASSERT(strlen(mrp->mrp_pool) != 0);
7607                         (void) strcpy(emrp->mrp_pool,
7608                             mrp->mrp_pool);
7609                 }
7610         } else {
7611                 emrp->mrp_mask &= ~MRP_POOL;
7612                 bzero(emrp->mrp_pool, MAXPATHLEN);
7613         }
7614 }
7615 
7616 struct mac_pool_arg {
7617         char            mpa_poolname[MAXPATHLEN];
7618         pool_event_t    mpa_what;
7619 };
7620 
7621 /*ARGSUSED*/
7622 static uint_t
7623 mac_pool_link_update(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
7624 {
7625         struct mac_pool_arg     *mpa = arg;
7626         mac_impl_t              *mip = (mac_impl_t *)val;
7627         mac_client_impl_t       *mcip;
7628         mac_resource_props_t    *mrp, *emrp;
7629         boolean_t               pool_update = B_FALSE;
7630         boolean_t               pool_clear = B_FALSE;
7631         boolean_t               use_default = B_FALSE;
7632         cpupart_t               *cpupart = NULL;
7633 
7634         mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
7635         i_mac_perim_enter(mip);
7636         for (mcip = mip->mi_clients_list; mcip != NULL;
7637             mcip = mcip->mci_client_next) {
7638                 pool_update = B_FALSE;
7639                 pool_clear = B_FALSE;
7640                 use_default = B_FALSE;
7641                 mac_client_get_resources((mac_client_handle_t)mcip, mrp);
7642                 emrp = MCIP_EFFECTIVE_PROPS(mcip);
7643 
7644                 /*
7645                  * When pools are enabled
7646                  */
7647                 if ((mpa->mpa_what == POOL_E_ENABLE) &&
7648                     ((mrp->mrp_mask & MRP_CPUS) == 0)) {
7649                         mrp->mrp_mask |= MRP_POOL;
7650                         pool_update = B_TRUE;
7651                 }
7652 
7653                 /*
7654                  * When pools are disabled
7655                  */
7656                 if ((mpa->mpa_what == POOL_E_DISABLE) &&
7657                     ((mrp->mrp_mask & MRP_CPUS) == 0)) {
7658                         mrp->mrp_mask |= MRP_POOL;
7659                         pool_clear = B_TRUE;
7660                 }
7661 
7662                 /*
7663                  * Look for links with the pool property set and the poolname
7664                  * matching the one which is changing.
7665                  */
7666                 if (strcmp(mrp->mrp_pool, mpa->mpa_poolname) == 0) {
7667                         /*
7668                          * The pool associated with the link has changed.
7669                          */
7670                         if (mpa->mpa_what == POOL_E_CHANGE) {
7671                                 mrp->mrp_mask |= MRP_POOL;
7672                                 pool_update = B_TRUE;
7673                         }
7674                 }
7675 
7676                 /*
7677                  * This link is associated with pool_default and
7678                  * pool_default has changed.
7679                  */
7680                 if ((mpa->mpa_what == POOL_E_CHANGE) &&
7681                     (strcmp(emrp->mrp_pool, "pool_default") == 0) &&
7682                     (strcmp(mpa->mpa_poolname, "pool_default") == 0)) {
7683                         mrp->mrp_mask |= MRP_POOL;
7684                         pool_update = B_TRUE;
7685                 }
7686 
7687                 /*
7688                  * Get new list of cpus for the pool, bind network
7689                  * threads to new list of cpus and update resources.
7690                  */
7691                 if (pool_update) {
7692                         if (MCIP_DATAPATH_SETUP(mcip)) {
7693                                 pool_lock();
7694                                 cpupart = mac_pset_find(mrp, &use_default);
7695                                 mac_fanout_setup(mcip, mcip->mci_flent, mrp,
7696                                     mac_rx_deliver, mcip, NULL, cpupart);
7697                                 mac_set_pool_effective(use_default, cpupart,
7698                                     mrp, emrp);
7699                                 pool_unlock();
7700                         }
7701                         mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip),
7702                             B_FALSE);
7703                 }
7704 
7705                 /*
7706                  * Clear the effective pool and bind network threads
7707                  * to any available CPU.
7708                  */
7709                 if (pool_clear) {
7710                         if (MCIP_DATAPATH_SETUP(mcip)) {
7711                                 emrp->mrp_mask &= ~MRP_POOL;
7712                                 bzero(emrp->mrp_pool, MAXPATHLEN);
7713                                 mac_fanout_setup(mcip, mcip->mci_flent, mrp,
7714                                     mac_rx_deliver, mcip, NULL, NULL);
7715                         }
7716                         mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip),
7717                             B_FALSE);
7718                 }
7719         }
7720         i_mac_perim_exit(mip);
7721         kmem_free(mrp, sizeof (*mrp));
7722         return (MH_WALK_CONTINUE);
7723 }
7724 
7725 static void
7726 mac_pool_update(void *arg)
7727 {
7728         mod_hash_walk(i_mac_impl_hash, mac_pool_link_update, arg);
7729         kmem_free(arg, sizeof (struct mac_pool_arg));
7730 }
7731 
7732 /*
7733  * Callback function to be executed when a noteworthy pool event
7734  * takes place.
7735  */
7736 /* ARGSUSED */
7737 static void
7738 mac_pool_event_cb(pool_event_t what, poolid_t id, void *arg)
7739 {
7740         pool_t                  *pool;
7741         char                    *poolname = NULL;
7742         struct mac_pool_arg     *mpa;
7743 
7744         pool_lock();
7745         mpa = kmem_zalloc(sizeof (struct mac_pool_arg), KM_SLEEP);
7746 
7747         switch (what) {
7748         case POOL_E_ENABLE:
7749         case POOL_E_DISABLE:
7750                 break;
7751 
7752         case POOL_E_CHANGE:
7753                 pool = pool_lookup_pool_by_id(id);
7754                 if (pool == NULL) {
7755                         kmem_free(mpa, sizeof (struct mac_pool_arg));
7756                         pool_unlock();
7757                         return;
7758                 }
7759                 pool_get_name(pool, &poolname);
7760                 (void) strlcpy(mpa->mpa_poolname, poolname,
7761                     sizeof (mpa->mpa_poolname));
7762                 break;
7763 
7764         default:
7765                 kmem_free(mpa, sizeof (struct mac_pool_arg));
7766                 pool_unlock();
7767                 return;
7768         }
7769         pool_unlock();
7770 
7771         mpa->mpa_what = what;
7772 
7773         mac_pool_update(mpa);
7774 }
7775 
7776 /*
7777  * Set effective rings property. This could be called from datapath_setup/
7778  * datapath_teardown or set-linkprop.
7779  * If the group is reserved we just go ahead and set the effective rings.
7780  * Additionally, for TX this could mean the default  group has lost/gained
7781  * some rings, so if the default group is reserved, we need to adjust the
7782  * effective rings for the default group clients. For RX, if we are working
7783  * with the non-default group, we just need * to reset the effective props
7784  * for the default group clients.
7785  */
7786 void
7787 mac_set_rings_effective(mac_client_impl_t *mcip)
7788 {
7789         mac_impl_t              *mip = mcip->mci_mip;
7790         mac_group_t             *grp;
7791         mac_group_t             *defgrp;
7792         flow_entry_t            *flent = mcip->mci_flent;
7793         mac_resource_props_t    *emrp = MCIP_EFFECTIVE_PROPS(mcip);
7794         mac_grp_client_t        *mgcp;
7795         mac_client_impl_t       *gmcip;
7796 
7797         grp = flent->fe_rx_ring_group;
7798         if (grp != NULL) {
7799                 defgrp = MAC_DEFAULT_RX_GROUP(mip);
7800                 /*
7801                  * If we have reserved a group, set the effective rings
7802                  * to the ring count in the group.
7803                  */
7804                 if (grp->mrg_state == MAC_GROUP_STATE_RESERVED) {
7805                         emrp->mrp_mask |= MRP_RX_RINGS;
7806                         emrp->mrp_nrxrings = grp->mrg_cur_count;
7807                 }
7808 
7809                 /*
7810                  * We go through the clients in the shared group and
7811                  * reset the effective properties. It is possible this
7812                  * might have already been done for some client (i.e.
7813                  * if some client is being moved to a group that is
7814                  * already shared). The case where the default group is
7815                  * RESERVED is taken care of above (note in the RX side if
7816                  * there is a non-default group, the default group is always
7817                  * SHARED).
7818                  */
7819                 if (grp != defgrp || grp->mrg_state == MAC_GROUP_STATE_SHARED) {
7820                         if (grp->mrg_state == MAC_GROUP_STATE_SHARED)
7821                                 mgcp = grp->mrg_clients;
7822                         else
7823                                 mgcp = defgrp->mrg_clients;
7824                         while (mgcp != NULL) {
7825                                 gmcip = mgcp->mgc_client;
7826                                 emrp = MCIP_EFFECTIVE_PROPS(gmcip);
7827                                 if (emrp->mrp_mask & MRP_RX_RINGS) {
7828                                         emrp->mrp_mask &= ~MRP_RX_RINGS;
7829                                         emrp->mrp_nrxrings = 0;
7830                                 }
7831                                 mgcp = mgcp->mgc_next;
7832                         }
7833                 }
7834         }
7835 
7836         /* Now the TX side */
7837         grp = flent->fe_tx_ring_group;
7838         if (grp != NULL) {
7839                 defgrp = MAC_DEFAULT_TX_GROUP(mip);
7840 
7841                 if (grp->mrg_state == MAC_GROUP_STATE_RESERVED) {
7842                         emrp->mrp_mask |= MRP_TX_RINGS;
7843                         emrp->mrp_ntxrings = grp->mrg_cur_count;
7844                 } else if (grp->mrg_state == MAC_GROUP_STATE_SHARED) {
7845                         mgcp = grp->mrg_clients;
7846                         while (mgcp != NULL) {
7847                                 gmcip = mgcp->mgc_client;
7848                                 emrp = MCIP_EFFECTIVE_PROPS(gmcip);
7849                                 if (emrp->mrp_mask & MRP_TX_RINGS) {
7850                                         emrp->mrp_mask &= ~MRP_TX_RINGS;
7851                                         emrp->mrp_ntxrings = 0;
7852                                 }
7853                                 mgcp = mgcp->mgc_next;
7854                         }
7855                 }
7856 
7857                 /*
7858                  * If the group is not the default group and the default
7859                  * group is reserved, the ring count in the default group
7860                  * might have changed, update it.
7861                  */
7862                 if (grp != defgrp &&
7863                     defgrp->mrg_state == MAC_GROUP_STATE_RESERVED) {
7864                         gmcip = MAC_GROUP_ONLY_CLIENT(defgrp);
7865                         emrp = MCIP_EFFECTIVE_PROPS(gmcip);
7866                         emrp->mrp_ntxrings = defgrp->mrg_cur_count;
7867                 }
7868         }
7869         emrp = MCIP_EFFECTIVE_PROPS(mcip);
7870 }
7871 
7872 /*
7873  * Check if the primary is in the default group. If so, see if we
7874  * can give it a an exclusive group now that another client is
7875  * being configured. We take the primary out of the default group
7876  * because the multicast/broadcast packets for the all the clients
7877  * will land in the default ring in the default group which means
7878  * any client in the default group, even if it is the only on in
7879  * the group, will lose exclusive access to the rings, hence
7880  * polling.
7881  */
7882 mac_client_impl_t *
7883 mac_check_primary_relocation(mac_client_impl_t *mcip, boolean_t rxhw)
7884 {
7885         mac_impl_t              *mip = mcip->mci_mip;
7886         mac_group_t             *defgrp = MAC_DEFAULT_RX_GROUP(mip);
7887         flow_entry_t            *flent = mcip->mci_flent;
7888         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
7889         uint8_t                 *mac_addr;
7890         mac_group_t             *ngrp;
7891 
7892         /*
7893          * Check if the primary is in the default group, if not
7894          * or if it is explicitly configured to be in the default
7895          * group OR set the RX rings property, return.
7896          */
7897         if (flent->fe_rx_ring_group != defgrp || mrp->mrp_mask & MRP_RX_RINGS)
7898                 return (NULL);
7899 
7900         /*
7901          * If the new client needs an exclusive group and we
7902          * don't have another for the primary, return.
7903          */
7904         if (rxhw && mip->mi_rxhwclnt_avail < 2)
7905                 return (NULL);
7906 
7907         mac_addr = flent->fe_flow_desc.fd_dst_mac;
7908         /*
7909          * We call this when we are setting up the datapath for
7910          * the first non-primary.
7911          */
7912         ASSERT(mip->mi_nactiveclients == 2);
7913         /*
7914          * OK, now we have the primary that needs to be relocated.
7915          */
7916         ngrp =  mac_reserve_rx_group(mcip, mac_addr, B_TRUE);
7917         if (ngrp == NULL)
7918                 return (NULL);
7919         if (mac_rx_switch_group(mcip, defgrp, ngrp) != 0) {
7920                 mac_stop_group(ngrp);
7921                 return (NULL);
7922         }
7923         return (mcip);
7924 }