Print this page
5297 mptsas refhash replacement on reset can cause hang

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
          +++ new/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
↓ open down ↓ 1319 lines elided ↑ open up ↑
1320 1320          /*
1321 1321           * Initialize chip using Message Unit Reset, if allowed
1322 1322           */
1323 1323          mpt->m_softstate |= MPTSAS_SS_MSG_UNIT_RESET;
1324 1324          if (mptsas_init_chip(mpt, TRUE) == DDI_FAILURE) {
1325 1325                  mutex_exit(&mpt->m_mutex);
1326 1326                  mptsas_log(mpt, CE_WARN, "mptsas chip initialization failed");
1327 1327                  goto fail;
1328 1328          }
1329 1329  
     1330 +        mpt->m_targets = refhash_create(MPTSAS_TARGET_BUCKET_COUNT,
     1331 +            mptsas_target_addr_hash, mptsas_target_addr_cmp,
     1332 +            mptsas_target_free, sizeof (mptsas_target_t),
     1333 +            offsetof(mptsas_target_t, m_link),
     1334 +            offsetof(mptsas_target_t, m_addr), KM_SLEEP);
     1335 +
1330 1336          /*
1331 1337           * Fill in the phy_info structure and get the base WWID
1332 1338           */
1333 1339          if (mptsas_get_manufacture_page5(mpt) == DDI_FAILURE) {
1334 1340                  mptsas_log(mpt, CE_WARN,
1335 1341                      "mptsas_get_manufacture_page5 failed!");
1336 1342                  goto fail;
1337 1343          }
1338 1344  
1339 1345          if (mptsas_get_sas_io_unit_page_hndshk(mpt)) {
↓ open down ↓ 11366 lines elided ↑ open up ↑
12706 12712          /*
12707 12713           * IOC facts can change after a diag reset so all buffers that are
12708 12714           * based on these numbers must be de-allocated and re-allocated.  Get
12709 12715           * new IOC facts each time chip is initialized.
12710 12716           */
12711 12717          if (mptsas_ioc_get_facts(mpt) == DDI_FAILURE) {
12712 12718                  mptsas_log(mpt, CE_WARN, "mptsas_ioc_get_facts failed");
12713 12719                  goto fail;
12714 12720          }
12715 12721  
12716      -        mpt->m_targets = refhash_create(MPTSAS_TARGET_BUCKET_COUNT,
12717      -            mptsas_target_addr_hash, mptsas_target_addr_cmp,
12718      -            mptsas_target_free, sizeof (mptsas_target_t),
12719      -            offsetof(mptsas_target_t, m_link),
12720      -            offsetof(mptsas_target_t, m_addr), KM_SLEEP);
12721      -
12722 12722          if (mptsas_alloc_active_slots(mpt, KM_SLEEP)) {
12723 12723                  goto fail;
12724 12724          }
12725 12725          /*
12726 12726           * Allocate request message frames, reply free queue, reply descriptor
12727 12727           * post queue, and reply message frames using latest IOC facts.
12728 12728           */
12729 12729          if (mptsas_alloc_request_frames(mpt) == DDI_FAILURE) {
12730 12730                  mptsas_log(mpt, CE_WARN, "mptsas_alloc_request_frames failed");
12731 12731                  goto fail;
↓ open down ↓ 1574 lines elided ↑ open up ↑
14306 14306          ASSERT(MUTEX_HELD(&mpt->m_mutex));
14307 14307  
14308 14308          /*
14309 14309           * TODO after hard reset, update the driver data structures
14310 14310           * 1. update port/phymask mapping table mpt->m_phy_info
14311 14311           * 2. invalidate all the entries in the hash table
14312 14312           *    m_devhdl = 0xffff and m_deviceinfo = 0
14313 14313           * 3. call sas_device_page/expander_page to update hash table
14314 14314           */
14315 14315          mptsas_update_phymask(mpt);
     14316 +
14316 14317          /*
14317      -         * Invalid the existing entries
14318      -         *
14319      -         * XXX - It seems like we should just delete everything here.  We are
14320      -         * holding the lock and are about to refresh all the targets in both
14321      -         * hashes anyway.  Given the path we're in, what outstanding async
14322      -         * event could possibly be trying to reference one of these things
14323      -         * without taking the lock, and how would that be useful anyway?
     14318 +         * Remove all the devhdls for existing entries but leave their
     14319 +         * addresses alone.  In update_hashtab() below, we'll find all
     14320 +         * targets that are still present and reassociate them with
     14321 +         * their potentially new devhdls.  Leaving the targets around in
     14322 +         * this fashion allows them to be used on the tx waitq even
     14323 +         * while IOC reset is occurring.
14324 14324           */
14325 14325          for (tp = refhash_first(mpt->m_targets); tp != NULL;
14326 14326              tp = refhash_next(mpt->m_targets, tp)) {
14327 14327                  tp->m_devhdl = MPTSAS_INVALID_DEVHDL;
14328 14328                  tp->m_deviceinfo = 0;
14329 14329                  tp->m_dr_flag = MPTSAS_DR_INACTIVE;
14330 14330          }
14331 14331          for (sp = refhash_first(mpt->m_smp_targets); sp != NULL;
14332 14332              sp = refhash_next(mpt->m_smp_targets, sp)) {
14333 14333                  sp->m_devhdl = MPTSAS_INVALID_DEVHDL;
↓ open down ↓ 2028 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX