1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 #include <sys/conf.h>
  27 #include <sys/ddi.h>
  28 #include <sys/ddifm.h>
  29 #include <sys/sunddi.h>
  30 #include <sys/sunndi.h>
  31 #include <sys/stat.h>
  32 #include <sys/modctl.h>
  33 #include <sys/types.h>
  34 #include <sys/cpuvar.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/kmem.h>
  37 #include <sys/cred.h>
  38 #include <sys/ksynch.h>
  39 #include <sys/rwlock.h>
  40 #include <sys/pghw.h>
  41 #include <sys/open.h>
  42 #include <sys/policy.h>
  43 #include <sys/x86_archext.h>
  44 #include <sys/cpu_module.h>
  45 #include <qsort.h>
  46 #include <sys/pci_cfgspace.h>
  47 #include <sys/mc.h>
  48 #include <sys/mc_amd.h>
  49 #include <sys/smbios.h>
  50 #include <sys/pci.h>
  51 #include <mcamd.h>
  52 #include <mcamd_dimmcfg.h>
  53 #include <mcamd_pcicfg.h>
  54 #include <mcamd_api.h>
  55 #include <sys/fm/cpu/AMD.h>
  56 #include <sys/fm/smb/fmsmb.h>
  57 #include <sys/fm/protocol.h>
  58 #include <sys/fm/util.h>
  59 
/*
 * Set to prevent mc-amd from attaching.  The default of 0 leaves the
 * driver free to attach.
 */
int mc_no_attach = 0;
  64 
/*
 * Of the 754/939/940 packages, only socket 940 supports quadrank registered
 * dimms.  Unfortunately, no memory-controller register indicates the
 * presence of quadrank dimm support or presence (i.e., in terms of number
 * of slots per cpu, and chip-select lines per slot).  The following may be
 * set in /etc/system to indicate the presence of quadrank support on a
 * motherboard; mc_dimmlist_create consults it for socket 940 only.
 *
 * There is no need to set this for F(1207) and S1g1.
 */
int mc_quadranksupport = 0;
  75 
  76 mc_t *mc_list, *mc_last;
  77 krwlock_t mc_lock;
  78 int mc_hold_attached = 1;
  79 
  80 #define MAX(m, n) ((m) >= (n) ? (m) : (n))
  81 #define MIN(m, n) ((m) <= (n) ? (m) : (n))
  82 
/*
 * The following tunable is used to determine the DRAM scrubbing rate.
 * The values range from 0x00-0x16 as described in the BKDG.  Zero
 * disables DRAM scrubbing.  Values above zero indicate rates in descending
 * order, i.e. a larger value selects a slower scrub.
 *
 * The default value below is used on several Sun systems.  In the future
 * this code should assign values dynamically based on memory sizing.
 */
uint32_t mc_scrub_rate_dram = 0xd;      /* 64B every 163.8 us; 1GB per 45 min */
  93 
  94 enum {
  95         MC_SCRUB_BIOSDEFAULT,   /* retain system default value */
  96         MC_SCRUB_FIXED,         /* assign mc_scrub_rate_* values */
  97         MC_SCRUB_MAX            /* assign max of system and tunables */
  98 } mc_scrub_policy = MC_SCRUB_MAX;
  99 
 100 static void
 101 mc_snapshot_destroy(mc_t *mc)
 102 {
 103         ASSERT(RW_LOCK_HELD(&mc_lock));
 104 
 105         if (mc->mc_snapshot == NULL)
 106                 return;
 107 
 108         kmem_free(mc->mc_snapshot, mc->mc_snapshotsz);
 109         mc->mc_snapshot = NULL;
 110         mc->mc_snapshotsz = 0;
 111         mc->mc_snapshotgen++;
 112 }
 113 
 114 static int
 115 mc_snapshot_update(mc_t *mc)
 116 {
 117         ASSERT(RW_LOCK_HELD(&mc_lock));
 118 
 119         if (mc->mc_snapshot != NULL)
 120                 return (0);
 121 
 122         if (nvlist_pack(mc->mc_nvl, &mc->mc_snapshot, &mc->mc_snapshotsz,
 123             NV_ENCODE_XDR, KM_SLEEP) != 0)
 124                 return (-1);
 125 
 126         return (0);
 127 }
 128 
 129 static mc_t *
 130 mc_lookup_by_chipid(int chipid)
 131 {
 132         mc_t *mc;
 133 
 134         ASSERT(RW_LOCK_HELD(&mc_lock));
 135 
 136         for (mc = mc_list; mc != NULL; mc = mc->mc_next) {
 137                 if (mc->mc_props.mcp_num  == chipid)
 138                         return (mc);
 139         }
 140 
 141         return (NULL);
 142 }
 143 
 144 /*
 145  * Read config register pairs into the two arrays provided on the given
 146  * handle and at offsets as follows:
 147  *
 148  *      Index   Array r1 offset                 Array r2 offset
 149  *      0       r1addr                          r2addr
 150  *      1       r1addr + incr                   r2addr + incr
 151  *      2       r1addr + 2 * incr               r2addr + 2 * incr
 152  *      ...
 153  *      n - 1   r1addr + (n - 1) * incr         r2addr + (n - 1) * incr
 154  *
 155  * The number of registers to read into the r1 array is r1n; the number
 156  * for the r2 array is r2n.
 157  */
 158 static void
 159 mc_prop_read_pair(mc_pcicfg_hdl_t cfghdl, uint32_t *r1, off_t r1addr,
 160     int r1n, uint32_t *r2, off_t r2addr, int r2n, off_t incr)
 161 {
 162         int i;
 163 
 164         for (i = 0; i < MAX(r1n, r2n); i++, r1addr += incr, r2addr += incr) {
 165                 if (i < r1n)
 166                         r1[i] = mc_pcicfg_get32(cfghdl, r1addr);
 167                 if (i < r2n)
 168                         r2[i] = mc_pcicfg_get32(cfghdl, r2addr);
 169         }
 170 }
 171 
 172 /*ARGSUSED*/
 173 static int
 174 mc_nvl_add_socket_cb(cmi_hdl_t whdl, void *arg1, void *arg2, void *arg3)
 175 {
 176         uint32_t skt = *((uint32_t *)arg1);
 177         cmi_hdl_t *hdlp = (cmi_hdl_t *)arg2;
 178 
 179         if (cmi_hdl_getsockettype(whdl) == skt) {
 180                 cmi_hdl_hold(whdl);     /* short-term hold */
 181                 *hdlp = whdl;
 182                 return (CMI_HDL_WALK_DONE);
 183         } else {
 184                 return (CMI_HDL_WALK_NEXT);
 185         }
 186 }
 187 
 188 static void
 189 mc_nvl_add_socket(nvlist_t *nvl, mc_t *mc)
 190 {
 191         cmi_hdl_t hdl = NULL;
 192         const char *s;
 193 
 194         cmi_hdl_walk(mc_nvl_add_socket_cb, (void *)&mc->mc_socket,
 195             (void *)&hdl, NULL);
 196         if (hdl == NULL)
 197                 s = "Unknown";  /* no cpu for this chipid found */
 198         else
 199                 s = cmi_hdl_getsocketstr(hdl);
 200 
 201         (void) nvlist_add_string(nvl, "socket", s);
 202 
 203         if (hdl != NULL)
 204                 cmi_hdl_rele(hdl);
 205 }
 206 
 207 static uint32_t
 208 mc_ecc_enabled(mc_t *mc)
 209 {
 210         uint32_t rev = mc->mc_props.mcp_rev;
 211         union mcreg_nbcfg nbcfg;
 212 
 213         MCREG_VAL32(&nbcfg) = mc->mc_cfgregs.mcr_nbcfg;
 214 
 215         return (MC_REV_MATCH(rev, MC_F_REVS_BCDE) ?
 216             MCREG_FIELD_F_preF(&nbcfg, EccEn) :
 217             MCREG_FIELD_F_revFG(&nbcfg, EccEn));
 218 }
 219 
 220 static uint32_t
 221 mc_ck_enabled(mc_t *mc)
 222 {
 223         uint32_t rev = mc->mc_props.mcp_rev;
 224         union mcreg_nbcfg nbcfg;
 225 
 226         MCREG_VAL32(&nbcfg) = mc->mc_cfgregs.mcr_nbcfg;
 227 
 228         return (MC_REV_MATCH(rev, MC_F_REVS_BCDE) ?
 229             MCREG_FIELD_F_preF(&nbcfg, ChipKillEccEn) :
 230             MCREG_FIELD_F_revFG(&nbcfg, ChipKillEccEn));
 231 }
 232 
 233 static void
 234 mc_nvl_add_ecctype(nvlist_t *nvl, mc_t *mc)
 235 {
 236         (void) nvlist_add_string(nvl, "ecc-type", mc_ecc_enabled(mc) ?
 237             (mc_ck_enabled(mc) ? "ChipKill 128/16" : "Normal 64/8") : "None");
 238 }
 239 
 240 static void
 241 mc_nvl_add_prop(nvlist_t *nvl, void *node, mcamd_propcode_t code, int reqval)
 242 {
 243         int valfound;
 244         uint64_t value;
 245         const char *name = mcamd_get_propname(code);
 246 
 247         valfound = mcamd_get_numprop(NULL, (mcamd_node_t *)node, code, &value);
 248 
 249         ASSERT(name != NULL && valfound);
 250         if (name != NULL && valfound && (!reqval || value != MC_INVALNUM))
 251                 (void) nvlist_add_uint64(nvl, name, value);
 252 }
 253 
 254 static void
 255 mc_nvl_add_cslist(nvlist_t *mcnvl, mc_t *mc)
 256 {
 257         mc_cs_t *mccs = mc->mc_cslist;
 258         nvlist_t *cslist[MC_CHIP_NCS];
 259         int nelem, i;
 260 
 261         for (nelem = 0; mccs != NULL; mccs = mccs->mccs_next, nelem++) {
 262                 nvlist_t **csp = &cslist[nelem];
 263                 char csname[MCDCFG_CSNAMELEN];
 264 
 265                 (void) nvlist_alloc(csp, NV_UNIQUE_NAME, KM_SLEEP);
 266                 mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_NUM, 0);
 267                 mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_BASE_ADDR, 0);
 268                 mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_MASK, 0);
 269                 mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_SIZE, 0);
 270 
 271                 /*
 272                  * It is possible for an mc_cs_t not to have associated
 273                  * DIMM info if mcdcfg_lookup failed.
 274                  */
 275                 if (mccs->mccs_csl[0] != NULL) {
 276                         mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_CSDIMM1, 1);
 277                         mcdcfg_csname(mc->mc_socket, mccs->mccs_csl[0], csname,
 278                             sizeof (csname));
 279                         (void) nvlist_add_string(*csp, "dimm1-csname", csname);
 280                 }
 281 
 282                 if (mccs->mccs_csl[1] != NULL) {
 283                         mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_CSDIMM2, 1);
 284                         mcdcfg_csname(mc->mc_socket, mccs->mccs_csl[1], csname,
 285                             sizeof (csname));
 286                         (void) nvlist_add_string(*csp, "dimm2-csname", csname);
 287                 }
 288         }
 289 
 290         /* Add cslist nvlist array even if zero members */
 291         (void) nvlist_add_nvlist_array(mcnvl, "cslist", cslist, nelem);
 292         for (i = 0; i < nelem; i++)
 293                 nvlist_free(cslist[i]);
 294 }
 295 
 296 static void
 297 mc_nvl_add_dimmlist(nvlist_t *mcnvl, mc_t *mc)
 298 {
 299         nvlist_t *dimmlist[MC_CHIP_NDIMM];
 300         mc_dimm_t *mcd;
 301         int nelem, i;
 302 
 303         for (nelem = 0, mcd = mc->mc_dimmlist; mcd != NULL;
 304             mcd = mcd->mcd_next, nelem++) {
 305                 nvlist_t **dimmp = &dimmlist[nelem];
 306                 uint64_t csnums[MC_CHIP_DIMMRANKMAX];
 307                 char csname[4][MCDCFG_CSNAMELEN];
 308                 char *csnamep[4];
 309                 int ncs = 0;
 310 
 311                 (void) nvlist_alloc(dimmp, NV_UNIQUE_NAME, KM_SLEEP);
 312 
 313                 mc_nvl_add_prop(*dimmp, mcd, MCAMD_PROP_NUM, 1);
 314                 mc_nvl_add_prop(*dimmp, mcd, MCAMD_PROP_SIZE, 1);
 315 
 316                 for (i = 0; i < MC_CHIP_DIMMRANKMAX; i++) {
 317                         if (mcd->mcd_cs[i] != NULL) {
 318                                 csnums[ncs] =
 319                                     mcd->mcd_cs[i]->mccs_props.csp_num;
 320                                 mcdcfg_csname(mc->mc_socket, mcd->mcd_csl[i],
 321                                     csname[ncs], MCDCFG_CSNAMELEN);
 322                                 csnamep[ncs] = csname[ncs];
 323                                 ncs++;
 324                         }
 325                 }
 326 
 327                 (void) nvlist_add_uint64_array(*dimmp, "csnums", csnums, ncs);
 328                 (void) nvlist_add_string_array(*dimmp, "csnames", csnamep, ncs);
 329         }
 330 
 331         /* Add dimmlist nvlist array even if zero members */
 332         (void) nvlist_add_nvlist_array(mcnvl, "dimmlist", dimmlist, nelem);
 333         for (i = 0; i < nelem; i++)
 334                 nvlist_free(dimmlist[i]);
 335 }
 336 
 337 static void
 338 mc_nvl_add_htconfig(nvlist_t *mcnvl, mc_t *mc)
 339 {
 340         mc_cfgregs_t *mcr = &mc->mc_cfgregs;
 341         union mcreg_htroute *htrp = (union mcreg_htroute *)&mcr->mcr_htroute[0];
 342         union mcreg_nodeid *nip = (union mcreg_nodeid *)&mcr->mcr_htnodeid;
 343         union mcreg_unitid *uip = (union mcreg_unitid *)&mcr->mcr_htunitid;
 344         int ndcnt = HT_COHERENTNODES(nip);
 345         uint32_t BCRte[MC_CHIP_MAXNODES];
 346         uint32_t RPRte[MC_CHIP_MAXNODES];
 347         uint32_t RQRte[MC_CHIP_MAXNODES];
 348         nvlist_t *nvl;
 349         int i;
 350 
 351         (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
 352 
 353         (void) nvlist_add_uint32(nvl, "NodeId", MCREG_FIELD_CMN(nip, NodeId));
 354         (void) nvlist_add_uint32(nvl, "CoherentNodes", HT_COHERENTNODES(nip));
 355         (void) nvlist_add_uint32(nvl, "SbNode", MCREG_FIELD_CMN(nip, SbNode));
 356         (void) nvlist_add_uint32(nvl, "LkNode", MCREG_FIELD_CMN(nip, LkNode));
 357         (void) nvlist_add_uint32(nvl, "SystemCoreCount",
 358             HT_SYSTEMCORECOUNT(nip));
 359 
 360         (void) nvlist_add_uint32(nvl, "C0Unit", MCREG_FIELD_CMN(uip, C0Unit));
 361         (void) nvlist_add_uint32(nvl, "C1Unit", MCREG_FIELD_CMN(uip, C1Unit));
 362         (void) nvlist_add_uint32(nvl, "McUnit", MCREG_FIELD_CMN(uip, McUnit));
 363         (void) nvlist_add_uint32(nvl, "HbUnit", MCREG_FIELD_CMN(uip, HbUnit));
 364         (void) nvlist_add_uint32(nvl, "SbLink", MCREG_FIELD_CMN(uip, SbLink));
 365 
 366         if (ndcnt <= MC_CHIP_MAXNODES) {
 367                 for (i = 0; i < ndcnt; i++, htrp++) {
 368                         BCRte[i] = MCREG_FIELD_CMN(htrp, BCRte);
 369                         RPRte[i] = MCREG_FIELD_CMN(htrp, RPRte);
 370                         RQRte[i] = MCREG_FIELD_CMN(htrp, RQRte);
 371                 }
 372 
 373                 (void) nvlist_add_uint32_array(nvl, "BroadcastRoutes",
 374                     &BCRte[0], ndcnt);
 375                 (void) nvlist_add_uint32_array(nvl, "ResponseRoutes",
 376                     &RPRte[0], ndcnt);
 377                 (void) nvlist_add_uint32_array(nvl, "RequestRoutes",
 378                     &RQRte[0], ndcnt);
 379         }
 380 
 381         (void) nvlist_add_nvlist(mcnvl, "htconfig", nvl);
 382         nvlist_free(nvl);
 383 }
 384 
 385 static nvlist_t *
 386 mc_nvl_create(mc_t *mc)
 387 {
 388         nvlist_t *mcnvl;
 389 
 390         (void) nvlist_alloc(&mcnvl, NV_UNIQUE_NAME, KM_SLEEP);
 391 
 392         /*
 393          * Since this nvlist is used in populating the topo tree changes
 394          * made here may propogate through to changed property names etc
 395          * in the topo tree.  Some properties in the topo tree will be
 396          * contracted via ARC, so be careful what you change here.
 397          */
 398         (void) nvlist_add_uint8(mcnvl, MC_NVLIST_VERSTR, MC_NVLIST_VERS1);
 399 
 400         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_NUM, 0);
 401         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_REV, 0);
 402         (void) nvlist_add_string(mcnvl, "revname", mc->mc_revname);
 403         mc_nvl_add_socket(mcnvl, mc);
 404         mc_nvl_add_ecctype(mcnvl, mc);
 405 
 406         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_BASE_ADDR, 0);
 407         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_LIM_ADDR, 0);
 408         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_ILEN, 0);
 409         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_ILSEL, 0);
 410         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_CSINTLVFCTR, 0);
 411         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_DRAMHOLE_SIZE, 0);
 412         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_ACCESS_WIDTH, 0);
 413         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_CSBANKMAPREG, 0);
 414         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_BANKSWZL, 0);
 415         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_MOD64MUX, 0);
 416         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_SPARECS, 1);
 417         mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_BADCS, 1);
 418 
 419         mc_nvl_add_cslist(mcnvl, mc);
 420         mc_nvl_add_dimmlist(mcnvl, mc);
 421         mc_nvl_add_htconfig(mcnvl, mc);
 422 
 423         return (mcnvl);
 424 }
 425 
 426 /*
 427  * Link a dimm to its associated chip-selects and chip-select lines.
 428  * Total the size of all ranks of this dimm.
 429  */
 430 static void
 431 mc_dimm_csadd(mc_t *mc, mc_dimm_t *mcd, mc_cs_t *mccs, const mcdcfg_csl_t *csl)
 432 {
 433         int factor = (mc->mc_props.mcp_accwidth == 128) ? 2 : 1;
 434         uint64_t sz = 0;
 435         int i;
 436 
 437         /* Skip to first unused rank slot */
 438         for (i = 0; i < MC_CHIP_DIMMRANKMAX; i++) {
 439                 if (mcd->mcd_cs[i] == NULL) {
 440                         mcd->mcd_cs[i] = mccs;
 441                         mcd->mcd_csl[i] = csl;
 442                         sz += mccs->mccs_props.csp_size / factor;
 443                         break;
 444                 } else {
 445                         sz += mcd->mcd_cs[i]->mccs_props.csp_size / factor;
 446                 }
 447         }
 448 
 449         ASSERT(i != MC_CHIP_DIMMRANKMAX);
 450 
 451         mcd->mcd_size = sz;
 452 }
 453 
 454 /*
 455  * Create a dimm structure and call to link it to its associated chip-selects.
 456  */
 457 static mc_dimm_t *
 458 mc_dimm_create(mc_t *mc, uint_t num)
 459 {
 460         mc_dimm_t *mcd = kmem_zalloc(sizeof (mc_dimm_t), KM_SLEEP);
 461 
 462         mcd->mcd_hdr.mch_type = MC_NT_DIMM;
 463         mcd->mcd_mc = mc;
 464         mcd->mcd_num = num;
 465 
 466         return (mcd);
 467 }
 468 
 469 /*
 470  * The chip-select structure includes an array of dimms associated with
 471  * that chip-select.  This function fills that array, and also builds
 472  * the list of all dimms on this memory controller mc_dimmlist.  The
 473  * caller has filled a structure with all there is to know about the
 474  * associated dimm(s).
 475  */
 476 static void
 477 mc_csdimms_create(mc_t *mc, mc_cs_t *mccs, mcdcfg_rslt_t *rsltp)
 478 {
 479         mc_dimm_t *found[MC_CHIP_DIMMPERCS];
 480         mc_dimm_t *mcd;
 481         int nfound = 0;
 482         int i;
 483 
 484         /*
 485          * Has some other chip-select already created this dimm or dimms?
 486          * If so then link to the dimm(s) from the mccs_dimm array,
 487          * record their topo numbers in the csp_dimmnums array, and link
 488          * the dimm(s) to the additional chip-select.
 489          */
 490         for (mcd = mc->mc_dimmlist; mcd != NULL; mcd = mcd->mcd_next) {
 491                 for (i = 0; i < rsltp->ndimm; i++) {
 492                         if (mcd->mcd_num == rsltp->dimm[i].toponum)
 493                                 found[nfound++] = mcd;
 494                 }
 495         }
 496         ASSERT(nfound == 0 || nfound == rsltp->ndimm);
 497 
 498         for (i = 0; i < rsltp->ndimm; i++) {
 499                 if (nfound == 0) {
 500                         mcd = mc_dimm_create(mc, rsltp->dimm[i].toponum);
 501                         if (mc->mc_dimmlist == NULL)
 502                                 mc->mc_dimmlist = mcd;
 503                         else
 504                                 mc->mc_dimmlast->mcd_next = mcd;
 505                         mc->mc_dimmlast = mcd;
 506                 } else {
 507                         mcd = found[i];
 508                 }
 509 
 510                 mccs->mccs_dimm[i] = mcd;
 511                 mccs->mccs_csl[i] = rsltp->dimm[i].cslp;
 512                 mccs->mccs_props.csp_dimmnums[i] = mcd->mcd_num;
 513                 mc_dimm_csadd(mc, mcd, mccs, rsltp->dimm[i].cslp);
 514 
 515         }
 516 
 517         /* The rank number is constant across all constituent dimm(s) */
 518         mccs->mccs_props.csp_dimmrank = rsltp->dimm[0].cslp->csl_rank;
 519 }
 520 
 521 /*
 522  * mc_dimmlist_create is called after we have discovered all enabled
 523  * (and spare or testfailed on revs F and G) chip-selects on the
 524  * given memory controller.  For each chip-select we must derive
 525  * the associated dimms, remembering that a chip-select csbase/csmask
 526  * pair may be associated with up to 2 chip-select lines (in 128 bit mode)
 527  * and that any one dimm may be associated with 1, 2, or 4 chip-selects
 528  * depending on whether it is single, dual or quadrank.
 529  */
 530 static void
 531 mc_dimmlist_create(mc_t *mc)
 532 {
 533         union mcreg_dramcfg_hi *drcfghip =
 534             (union mcreg_dramcfg_hi *)(&mc->mc_cfgregs.mcr_dramcfghi);
 535         mc_props_t *mcp = &mc->mc_props;
 536         uint32_t rev = mcp->mcp_rev;
 537         mc_cs_t *mccs;
 538         int r4 = 0, s4 = 0;
 539 
 540         /*
 541          * Are we dealing with quadrank registered dimms?
 542          *
 543          * For socket 940 we can't tell and we'll assume we're not.
 544          * This can be over-ridden by the admin in /etc/system by setting
 545          * mc_quadranksupport nonzero.  A possible optimisation in systems
 546          * that export an SMBIOS table would be to count the number of
 547          * dimm slots per cpu - more than 4 would indicate no quadrank support
 548          * and 4 or fewer would indicate that if we see any of the upper
 549          * chip-selects enabled then a quadrank dimm is present.
 550          *
 551          * For socket F(1207) we can check a bit in the dram config high reg.
 552          *
 553          * Other socket types do not support registered dimms.
 554          */
 555         if (mc->mc_socket == X86_SOCKET_940)
 556                 r4 = mc_quadranksupport != 0;
 557         else if (mc->mc_socket == X86_SOCKET_F1207)
 558                 r4 = MCREG_FIELD_F_revFG(drcfghip, FourRankRDimm);
 559 
 560         /*
 561          * Are we dealing with quadrank SO-DIMMs?  These are supported
 562          * in AM2 and S1g1 packages only, but in all rev F/G cases we
 563          * can detect their presence via a bit in the dram config high reg.
 564          */
 565         if (MC_REV_MATCH(rev, MC_F_REVS_FG))
 566                 s4 = MCREG_FIELD_F_revFG(drcfghip, FourRankSODimm);
 567 
 568         for (mccs = mc->mc_cslist; mccs != NULL; mccs = mccs->mccs_next) {
 569                 mcdcfg_rslt_t rslt;
 570 
 571                 /*
 572                  * If lookup fails we will not create dimm structures for
 573                  * this chip-select.  In the mc_cs_t we will have both
 574                  * csp_dimmnum members set to MC_INVALNUM and patounum
 575                  * code will see from those that we do not have dimm info
 576                  * for this chip-select.
 577                  */
 578                 if (mcdcfg_lookup(rev, mcp->mcp_mod64mux, mcp->mcp_accwidth,
 579                     mccs->mccs_props.csp_num, mc->mc_socket,
 580                     r4, s4, &rslt) < 0)
 581                         continue;
 582 
 583                 mc_csdimms_create(mc, mccs, &rslt);
 584         }
 585 }
 586 
 587 static mc_cs_t *
 588 mc_cs_create(mc_t *mc, uint_t num, uint64_t base, uint64_t mask, size_t sz,
 589     int csbe, int spare, int testfail)
 590 {
 591         mc_cs_t *mccs = kmem_zalloc(sizeof (mc_cs_t), KM_SLEEP);
 592         mccs_props_t *csp = &mccs->mccs_props;
 593         int i;
 594 
 595         mccs->mccs_hdr.mch_type = MC_NT_CS;
 596         mccs->mccs_mc = mc;
 597         csp->csp_num = num;
 598         csp->csp_base = base;
 599         csp->csp_mask = mask;
 600         csp->csp_size = sz;
 601         csp->csp_csbe = csbe;
 602         csp->csp_spare = spare;
 603         csp->csp_testfail = testfail;
 604 
 605         for (i = 0; i < MC_CHIP_DIMMPERCS; i++)
 606                 csp->csp_dimmnums[i] = MC_INVALNUM;
 607 
 608         if (spare)
 609                 mc->mc_props.mcp_sparecs = num;
 610 
 611         return (mccs);
 612 }
 613 
 614 /*
 615  * For any cs# of this mc marked TestFail generate an ereport with
 616  * resource identifying the associated dimm(s).
 617  */
 618 static void
 619 mc_report_testfails(mc_t *mc)
 620 {
 621         mc_unum_t unum;
 622         mc_cs_t *mccs;
 623         int i;
 624 
 625         for (mccs = mc->mc_cslist; mccs != NULL; mccs = mccs->mccs_next) {
 626                 if (mccs->mccs_props.csp_testfail) {
 627                         unum.unum_board = 0;
 628                         unum.unum_chip = mc->mc_props.mcp_num;
 629                         unum.unum_mc = 0;
 630                         unum.unum_chan = MC_INVALNUM;
 631                         unum.unum_cs = mccs->mccs_props.csp_num;
 632                         unum.unum_rank = mccs->mccs_props.csp_dimmrank;
 633                         unum.unum_offset = MCAMD_RC_INVALID_OFFSET;
 634                         for (i = 0; i < MC_CHIP_DIMMPERCS; i++)
 635                                 unum.unum_dimms[i] = MC_INVALNUM;
 636 
 637                         mcamd_ereport_post(mc, FM_EREPORT_CPU_AMD_MC_TESTFAIL,
 638                             &unum,
 639                             FM_EREPORT_PAYLOAD_FLAGS_CPU_AMD_MC_TESTFAIL);
 640                 }
 641         }
 642 }
 643 
 644 /*
 645  * Function 0 - HyperTransport Technology Configuration
 646  */
 647 static void
 648 mc_mkprops_htcfg(mc_pcicfg_hdl_t cfghdl, mc_t *mc)
 649 {
 650         union mcreg_nodeid nodeid;
 651         off_t offset;
 652         int i;
 653 
 654         mc->mc_cfgregs.mcr_htnodeid = MCREG_VAL32(&nodeid) =
 655             mc_pcicfg_get32(cfghdl, MC_HT_REG_NODEID);
 656 
 657         mc->mc_cfgregs.mcr_htunitid = mc_pcicfg_get32(cfghdl, MC_HT_REG_UNITID);
 658 
 659         for (i = 0, offset = MC_HT_REG_RTBL_NODE_0;
 660             i < HT_COHERENTNODES(&nodeid);
 661             i++, offset += MC_HT_REG_RTBL_INCR)
 662                 mc->mc_cfgregs.mcr_htroute[i] = mc_pcicfg_get32(cfghdl, offset);
 663 }
 664 
 665 /*
 666  * Function 1 Configuration - Address Map (see BKDG 3.4.4 DRAM Address Map)
 667  *
 668  * Read the Function 1 Address Map for each potential DRAM node.  The Base
 669  * Address for a node gives the starting system address mapped at that node,
 670  * and the limit gives the last valid address mapped at that node.  Regions for
 671  * different nodes should not overlap, unless node-interleaving is enabled.
 672  * The base register also indicates the node-interleaving settings (IntlvEn).
 673  * The limit register includes IntlvSel which determines which 4K blocks will
 674  * be routed to this node and the destination node ID for addresses that fall
 675  * within the [base, limit] range - this must match the pair number.
 676  */
 677 static void
 678 mc_mkprops_addrmap(mc_pcicfg_hdl_t cfghdl, mc_t *mc)
 679 {
 680         union mcreg_drambase basereg;
 681         union mcreg_dramlimit limreg;
 682         mc_props_t *mcp = &mc->mc_props;
 683         mc_cfgregs_t *mcr = &mc->mc_cfgregs;
 684         union mcreg_dramhole hole;
 685         int nodeid = mc->mc_props.mcp_num;
 686 
 687         mcr->mcr_drambase = MCREG_VAL32(&basereg) = mc_pcicfg_get32(cfghdl,
 688             MC_AM_REG_DRAMBASE_0 + nodeid * MC_AM_REG_DRAM_INCR);
 689 
 690         mcr->mcr_dramlimit = MCREG_VAL32(&limreg) = mc_pcicfg_get32(cfghdl,
 691             MC_AM_REG_DRAMLIM_0 + nodeid * MC_AM_REG_DRAM_INCR);
 692 
 693         /*
 694          * Derive some "cooked" properties for nodes that have a range of
 695          * physical addresses that are read or write enabled and for which
 696          * the DstNode matches the node we are attaching.
 697          */
 698         if (MCREG_FIELD_CMN(&limreg, DRAMLimiti) != 0 &&
 699             MCREG_FIELD_CMN(&limreg, DstNode) == nodeid &&
 700             (MCREG_FIELD_CMN(&basereg, WE) || MCREG_FIELD_CMN(&basereg, RE))) {
 701                 mcp->mcp_base = MC_DRAMBASE(&basereg);
 702                 mcp->mcp_lim = MC_DRAMLIM(&limreg);
 703                 mcp->mcp_ilen = MCREG_FIELD_CMN(&basereg, IntlvEn);
 704                 mcp->mcp_ilsel = MCREG_FIELD_CMN(&limreg, IntlvSel);
 705         }
 706 
 707         /*
 708          * The Function 1 DRAM Hole Address Register tells us which node(s)
 709          * own the DRAM space that is hoisted above 4GB, together with the
 710          * hole base and offset for this node.  This was introduced in
 711          * revision E.
 712          */
 713         if (MC_REV_ATLEAST(mc->mc_props.mcp_rev, MC_F_REV_E)) {
 714                 mcr->mcr_dramhole = MCREG_VAL32(&hole) =
 715                     mc_pcicfg_get32(cfghdl, MC_AM_REG_HOLEADDR);
 716 
 717                 if (MCREG_FIELD_CMN(&hole, DramHoleValid))
 718                         mcp->mcp_dramhole_size = MC_DRAMHOLE_SIZE(&hole);
 719         }
 720 }
 721 
 722 /*
 723  * Read some function 3 parameters via PCI Mechanism 1 accesses (which
 724  * will serialize any NB accesses).
 725  */
 726 static void
 727 mc_getmiscctl(mc_t *mc)
 728 {
 729         uint32_t rev = mc->mc_props.mcp_rev;
 730         union mcreg_nbcfg nbcfg;
 731         union mcreg_sparectl sparectl;
 732 
 733         mc->mc_cfgregs.mcr_nbcfg = MCREG_VAL32(&nbcfg) =
 734             mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_NBCFG);
 735 
 736         if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
 737                 mc->mc_cfgregs.mcr_sparectl = MCREG_VAL32(&sparectl) =
 738                     mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL,
 739                     MC_CTL_REG_SPARECTL);
 740 
 741                 if (MCREG_FIELD_F_revFG(&sparectl, SwapDone)) {
 742                         mc->mc_props.mcp_badcs =
 743                             MCREG_FIELD_F_revFG(&sparectl, BadDramCs);
 744                 }
 745         }
 746 }
 747 
 748 static int
 749 csbasecmp(mc_cs_t **csapp, mc_cs_t **csbpp)
 750 {
 751         uint64_t basea = (*csapp)->mccs_props.csp_base;
 752         uint64_t baseb = (*csbpp)->mccs_props.csp_base;
 753 
 754         if (basea == baseb)
 755                 return (0);
 756         else if (basea < baseb)
 757                 return (-1);
 758         else
 759                 return (1);
 760 }
 761 
/*
 * The following are for use in simulating TestFail for a chip-select
 * without poking at the hardware (which tends to get upset if you do
 * since the BIOS needs to restart to map a failed cs out).  For internal
 * testing only!  Note that setting these does not give the full experience -
 * the selected chip-select *is* enabled and can give errors etc and the
 * patounum logic will get confused.
 *
 * Both default to -1.
 */
int testfail_mcnum = -1;
int testfail_csnum = -1;
 772 
 773 /*
 774  * Function 2 configuration - DRAM Controller
 775  */
 776 static void
 777 mc_mkprops_dramctl(mc_pcicfg_hdl_t cfghdl, mc_t *mc)
 778 {
 779         union mcreg_csbase base[MC_CHIP_NCS];
 780         union mcreg_csmask mask[MC_CHIP_NCS];
 781         union mcreg_dramcfg_lo drcfg_lo;
 782         union mcreg_dramcfg_hi drcfg_hi;
 783         union mcreg_drammisc drmisc;
 784         union mcreg_bankaddrmap baddrmap;
 785         mc_props_t *mcp = &mc->mc_props;
 786         mc_cfgregs_t *mcr = &mc->mc_cfgregs;
 787         int maskdivisor;
 788         int wide = 0;
 789         uint32_t rev = mc->mc_props.mcp_rev;
 790         int i;
 791         mcamd_hdl_t hdl;
 792 
 793         mcamd_mkhdl(&hdl);  /* to call into common code */
 794 
 795         /*
 796          * Read Function 2 DRAM Configuration High and Low registers.  The High
 797          * part is mostly concerned with memory clocks etc and we'll not have
 798          * any use for that.  The Low component tells us if ECC is enabled,
 799          * if we're in 64- or 128-bit MC mode, how the upper chip-selects
 800          * are mapped, which chip-select pairs are using x4 parts, etc.
 801          */
 802         MCREG_VAL32(&drcfg_lo) = mc_pcicfg_get32(cfghdl, MC_DC_REG_DRAMCFGLO);
 803         MCREG_VAL32(&drcfg_hi) = mc_pcicfg_get32(cfghdl, MC_DC_REG_DRAMCFGHI);
 804         mcr->mcr_dramcfglo = MCREG_VAL32(&drcfg_lo);
 805         mcr->mcr_dramcfghi = MCREG_VAL32(&drcfg_hi);
 806 
 807         /*
 808          * Note the DRAM controller width.  The 64/128 bit is in a different
 809          * bit position for revision F and G.
 810          */
 811         if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
 812                 wide = MCREG_FIELD_F_revFG(&drcfg_lo, Width128);
 813         } else {
 814                 wide = MCREG_FIELD_F_preF(&drcfg_lo, Width128);
 815         }
 816         mcp->mcp_accwidth = wide ? 128 : 64;
 817 
 818         /*
 819          * Read Function 2 DRAM Controller Miscellaenous Regsiter for those
 820          * revs that support it.  This include the Mod64Mux indication on
 821          * these revs - for rev E it is in DRAM config low.
 822          */
 823         if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
 824                 mcr->mcr_drammisc = MCREG_VAL32(&drmisc) =
 825                     mc_pcicfg_get32(cfghdl, MC_DC_REG_DRAMMISC);
 826                 mcp->mcp_mod64mux = MCREG_FIELD_F_revFG(&drmisc, Mod64Mux);
 827         } else if (MC_REV_MATCH(rev, MC_F_REV_E)) {
 828                 mcp->mcp_mod64mux = MCREG_FIELD_F_preF(&drcfg_lo, Mod64BitMux);
 829         }
 830 
 831         /*
 832          * Read Function 2 DRAM Bank Address Mapping.  This encodes the
 833          * type of DIMM module in use for each chip-select pair.
 834          * Prior ro revision F it also tells us whether BankSwizzle mode
 835          * is enabled - in rev F that has moved to dram config hi register.
 836          */
 837         mcp->mcp_csbankmapreg = MCREG_VAL32(&baddrmap) =
 838             mc_pcicfg_get32(cfghdl, MC_DC_REG_BANKADDRMAP);
 839 
 840         /*
 841          * Determine whether bank swizzle mode is active.  Bank swizzling was
 842          * introduced as an option in rev E,  but the bit that indicates it
 843          * is enabled has moved in revs F/G.
 844          */
 845         if (MC_REV_MATCH(rev, MC_F_REV_E)) {
 846                 mcp->mcp_bnkswzl =
 847                     MCREG_FIELD_F_preF(&baddrmap, BankSwizzleMode);
 848         } else if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
 849                 mcp->mcp_bnkswzl = MCREG_FIELD_F_revFG(&drcfg_hi,
 850                     BankSwizzleMode);
 851         }
 852 
 853         /*
 854          * Read the DRAM CS Base and DRAM CS Mask registers.  Revisions prior
 855          * to F have an equal number of base and mask registers; revision F
 856          * has twice as many base registers as masks.
 857          */
 858         maskdivisor = MC_REV_MATCH(rev, MC_F_REVS_FG) ? 2 : 1;
 859 
 860         mc_prop_read_pair(cfghdl,
 861             (uint32_t *)base, MC_DC_REG_CSBASE_0, MC_CHIP_NCS,
 862             (uint32_t *)mask, MC_DC_REG_CSMASK_0, MC_CHIP_NCS / maskdivisor,
 863             MC_DC_REG_CS_INCR);
 864 
 865         /*
 866          * Create a cs node for each enabled chip-select as well as
 867          * any appointed online spare chip-selects and for any that have
 868          * failed test.
 869          */
 870         for (i = 0; i < MC_CHIP_NCS; i++) {
 871                 mc_cs_t *mccs;
 872                 uint64_t csbase, csmask;
 873                 size_t sz;
 874                 int csbe, spare, testfail;
 875 
 876                 if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
 877                         csbe = MCREG_FIELD_F_revFG(&base[i], CSEnable);
 878                         spare = MCREG_FIELD_F_revFG(&base[i], Spare);
 879                         testfail = MCREG_FIELD_F_revFG(&base[i], TestFail);
 880                 } else {
 881                         csbe = MCREG_FIELD_F_preF(&base[i], CSEnable);
 882                         spare = 0;
 883                         testfail = 0;
 884                 }
 885 
 886                 /* Testing hook */
 887                 if (testfail_mcnum != -1 && testfail_csnum != -1 &&
 888                     mcp->mcp_num == testfail_mcnum && i == testfail_csnum) {
 889                         csbe = spare = 0;
 890                         testfail = 1;
 891                         cmn_err(CE_NOTE, "Pretending MC %d CS %d failed test",
 892                             testfail_mcnum, testfail_csnum);
 893                 }
 894 
 895                 /*
 896                  * If the chip-select is not enabled then skip it unless
 897                  * it is a designated online spare or is marked with TestFail.
 898                  */
 899                 if (!csbe && !(spare || testfail))
 900                         continue;
 901 
 902                 /*
 903                  * For an enabled or spare chip-select the Bank Address Mapping
 904                  * register will be valid as will the chip-select mask.  The
 905                  * base will not be valid but we'll read and store it anyway.
 906                  * We will not know whether the spare is already swapped in
 907                  * until MC function 3 attaches.
 908                  */
 909                 if (csbe || spare) {
 910                         if (mcamd_cs_size(&hdl, (mcamd_node_t *)mc, i, &sz) < 0)
 911                                 continue;
 912                         csbase = MC_CSBASE(&base[i], rev);
 913                         csmask = MC_CSMASK(&mask[i / maskdivisor], rev);
 914                 } else {
 915                         sz = 0;
 916                         csbase = csmask = 0;
 917                 }
 918 
 919                 mccs = mc_cs_create(mc, i, csbase, csmask, sz,
 920                     csbe, spare, testfail);
 921 
 922                 if (mc->mc_cslist == NULL)
 923                         mc->mc_cslist = mccs;
 924                 else
 925                         mc->mc_cslast->mccs_next = mccs;
 926                 mc->mc_cslast = mccs;
 927 
 928                 mccs->mccs_cfgregs.csr_csbase = MCREG_VAL32(&base[i]);
 929                 mccs->mccs_cfgregs.csr_csmask =
 930                     MCREG_VAL32(&mask[i / maskdivisor]);
 931 
 932                 /*
 933                  * Check for cs bank interleaving - some bits clear in the
 934                  * lower mask.  All banks must/will have the same lomask bits
 935                  * if cs interleaving is active.
 936                  */
 937                 if (csbe && !mcp->mcp_csintlvfctr) {
 938                         int bitno, ibits = 0;
 939                         for (bitno = MC_CSMASKLO_LOBIT(rev);
 940                             bitno <= MC_CSMASKLO_HIBIT(rev); bitno++) {
 941                                 if (!(csmask & (1 << bitno)))
 942                                         ibits++;
 943                         }
 944                         mcp->mcp_csintlvfctr = 1 << ibits;
 945                 }
 946         }
 947 
 948         /*
 949          * If there is no chip-select interleave on this node determine
 950          * whether the chip-select ranks are contiguous or if there
 951          * is a hole.
 952          */
 953         if (mcp->mcp_csintlvfctr == 1) {
 954                 mc_cs_t *csp[MC_CHIP_NCS];
 955                 mc_cs_t *mccs;
 956                 int ncsbe = 0;
 957 
 958                 for (mccs = mc->mc_cslist; mccs != NULL;
 959                     mccs = mccs->mccs_next) {
 960                         if (mccs->mccs_props.csp_csbe)
 961                                 csp[ncsbe++] = mccs;
 962                 }
 963 
 964                 if (ncsbe != 0) {
 965                         qsort((void *)csp, ncsbe, sizeof (mc_cs_t *),
 966                             (int (*)(const void *, const void *))csbasecmp);
 967 
 968                         for (i = 1; i < ncsbe; i++) {
 969                                 if (csp[i]->mccs_props.csp_base !=
 970                                     csp[i - 1]->mccs_props.csp_base +
 971                                     csp[i - 1]->mccs_props.csp_size)
 972                                         mc->mc_csdiscontig = 1;
 973                         }
 974                 }
 975         }
 976 
 977 
 978         /*
 979          * Since we do not attach to MC function 3 go ahead and read some
 980          * config parameters from it now.
 981          */
 982         mc_getmiscctl(mc);
 983 
 984         /*
 985          * Now that we have discovered all enabled/spare/testfail chip-selects
 986          * we divine the associated DIMM configuration.
 987          */
 988         mc_dimmlist_create(mc);
 989 }
 990 
/*
 * One entry per memory-controller PCI function this driver can bind to.
 * mc_attach matches the device's binding name against bm_bindnm to pick
 * the entry for the function being attached.
 */
typedef struct mc_bind_map {
        const char *bm_bindnm;   /* attachment binding name */
        enum mc_funcnum bm_func; /* PCI config space function number for bind */
        const char *bm_model;    /* value for device node model property */
        /* reads this function's config space into the mc_t; may be NULL */
        void (*bm_mkprops)(mc_pcicfg_hdl_t, mc_t *);
} mc_bind_map_t;
 997 
 998 /*
 999  * Do not attach to MC function 3 - agpgart already attaches to that.
1000  * Function 3 may be a good candidate for a nexus driver to fan it out
1001  * into virtual devices by functionality.  We will use pci_mech1_getl
1002  * to retrieve the function 3 parameters we require.
1003  */
1004 
1005 static const mc_bind_map_t mc_bind_map[] = {
1006         { MC_FUNC_HTCONFIG_BINDNM, MC_FUNC_HTCONFIG,
1007             "AMD Memory Controller (HT Configuration)", mc_mkprops_htcfg },
1008         { MC_FUNC_ADDRMAP_BINDNM, MC_FUNC_ADDRMAP,
1009             "AMD Memory Controller (Address Map)", mc_mkprops_addrmap },
1010         { MC_FUNC_DRAMCTL_BINDNM, MC_FUNC_DRAMCTL,
1011             "AMD Memory Controller (DRAM Controller & HT Trace)",
1012             mc_mkprops_dramctl },
1013         NULL
1014 };
1015 
1016 /*ARGSUSED*/
1017 static int
1018 mc_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1019 {
1020         if (otyp != OTYP_CHR)
1021                 return (EINVAL);
1022 
1023         rw_enter(&mc_lock, RW_READER);
1024         if (mc_lookup_by_chipid(getminor(*devp)) == NULL) {
1025                 rw_exit(&mc_lock);
1026                 return (EINVAL);
1027         }
1028         rw_exit(&mc_lock);
1029 
1030         return (0);
1031 }
1032 
/*
 * close entry point.  mc_open keeps no per-open state, so there is nothing
 * to undo here; always returns 0.
 */
/*ARGSUSED*/
static int
mc_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
        return (0);
}
1039 
1040 /*
1041  * Enable swap from chip-select csnum to the spare chip-select on this
1042  * memory controller (if any).
1043  */
1044 
1045 int mc_swapdonetime = 30;       /* max number of seconds to wait for SwapDone */
1046 
1047 static int
1048 mc_onlinespare(mc_t *mc, int csnum)
1049 {
1050         mc_props_t *mcp = &mc->mc_props;
1051         union mcreg_sparectl sparectl;
1052         union mcreg_scrubctl scrubctl;
1053         mc_cs_t *mccs;
1054         hrtime_t tmax;
1055         int i = 0;
1056 
1057         ASSERT(RW_WRITE_HELD(&mc_lock));
1058 
1059         if (!MC_REV_MATCH(mcp->mcp_rev, MC_F_REVS_FG))
1060                 return (ENOTSUP);       /* MC rev does not offer online spare */
1061         else if (mcp->mcp_sparecs == MC_INVALNUM)
1062                 return (ENODEV);        /* Supported, but no spare configured */
1063         else if (mcp->mcp_badcs != MC_INVALNUM)
1064                 return (EBUSY);         /* Spare already swapped in */
1065         else if (csnum == mcp->mcp_sparecs)
1066                 return (EINVAL);        /* Can't spare the spare! */
1067 
1068         for (mccs = mc->mc_cslist; mccs != NULL; mccs = mccs->mccs_next) {
1069                 if (mccs->mccs_props.csp_num == csnum)
1070                         break;
1071         }
1072         if (mccs == NULL)
1073                 return (EINVAL);        /* nominated bad CS does not exist */
1074 
1075         /*
1076          * If the DRAM Scrubber is not enabled then the swap cannot succeed.
1077          */
1078         MCREG_VAL32(&scrubctl) = mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL,
1079             MC_CTL_REG_SCRUBCTL);
1080         if (MCREG_FIELD_CMN(&scrubctl, DramScrub) == 0)
1081                 return (ENODEV);        /* DRAM scrubber not enabled */
1082 
1083         /*
1084          * Read Online Spare Comtrol Register again, just in case our
1085          * state does not reflect reality.
1086          */
1087         MCREG_VAL32(&sparectl) = mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL,
1088             MC_CTL_REG_SPARECTL);
1089 
1090         if (MCREG_FIELD_F_revFG(&sparectl, SwapDone))
1091                 return (EBUSY);
1092 
1093         /* Write to the BadDramCs field */
1094         MCREG_FIELD_F_revFG(&sparectl, BadDramCs) = csnum;
1095         mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL,
1096             MCREG_VAL32(&sparectl));
1097 
1098         /* And request that the swap to the spare start */
1099         MCREG_FIELD_F_revFG(&sparectl, SwapEn) = 1;
1100         mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL,
1101             MCREG_VAL32(&sparectl));
1102 
1103         /*
1104          * Poll for SwapDone - we have disabled notification by interrupt.
1105          * Swap takes "several CPU cycles, depending on the DRAM speed, but
1106          * is performed in the background" (Family 0Fh Bios Porting Guide).
1107          * We're in a slow ioctl path so there is no harm in waiting around
1108          * a bit - consumers of the ioctl must be aware that it may take
1109          * a moment.  We will poll for up to mc_swapdonetime seconds,
1110          * limiting that to 120s.
1111          *
1112          * The swap is performed by the DRAM scrubber (which must be enabled)
1113          * whose scrub rate is accelerated for the duration of the swap.
1114          * The maximum swap rate is 40.0ns per 64 bytes, so the maximum
1115          * supported cs size of 16GB would take 10.7s at that max rate
1116          * of 25000000 scrubs/second.
1117          */
1118         tmax = gethrtime() + MIN(mc_swapdonetime, 120) * 1000000000ULL;
1119         do {
1120                 if (i++ < 20)
1121                         delay(drv_usectohz(100000));    /* 0.1s for up to 2s */
1122                 else
1123                         delay(drv_usectohz(500000));    /* 0.5s */
1124 
1125                 MCREG_VAL32(&sparectl) = mc_pcicfg_get32_nohdl(mc,
1126                     MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);
1127         } while (!MCREG_FIELD_F_revFG(&sparectl, SwapDone) &&
1128             gethrtime() < tmax);
1129 
1130         if (!MCREG_FIELD_F_revFG(&sparectl, SwapDone))
1131                 return (ETIME);         /* Operation timed out */
1132 
1133         mcp->mcp_badcs = csnum;
1134         mc->mc_cfgregs.mcr_sparectl = MCREG_VAL32(&sparectl);
1135         mc->mc_spareswaptime = gethrtime();
1136 
1137         return (0);
1138 }
1139 
1140 /*ARGSUSED*/
1141 static int
1142 mc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
1143 {
1144         int rc = 0;
1145         mc_t *mc;
1146 
1147         if (cmd != MC_IOC_SNAPSHOT_INFO && cmd != MC_IOC_SNAPSHOT &&
1148             cmd != MC_IOC_ONLINESPARE_EN)
1149                 return (EINVAL);
1150 
1151         rw_enter(&mc_lock, RW_READER);
1152 
1153         if ((mc = mc_lookup_by_chipid(getminor(dev))) == NULL) {
1154                 rw_exit(&mc_lock);
1155                 return (EINVAL);
1156         }
1157 
1158         switch (cmd) {
1159         case MC_IOC_SNAPSHOT_INFO: {
1160                 mc_snapshot_info_t mcs;
1161 
1162                 if (mc_snapshot_update(mc) < 0) {
1163                         rw_exit(&mc_lock);
1164                         return (EIO);
1165                 }
1166 
1167                 mcs.mcs_size = mc->mc_snapshotsz;
1168                 mcs.mcs_gen = mc->mc_snapshotgen;
1169 
1170                 if (ddi_copyout(&mcs, (void *)arg, sizeof (mc_snapshot_info_t),
1171                     mode) < 0)
1172                         rc = EFAULT;
1173                 break;
1174         }
1175 
1176         case MC_IOC_SNAPSHOT:
1177                 if (mc_snapshot_update(mc) < 0) {
1178                         rw_exit(&mc_lock);
1179                         return (EIO);
1180                 }
1181 
1182                 if (ddi_copyout(mc->mc_snapshot, (void *)arg, mc->mc_snapshotsz,
1183                     mode) < 0)
1184                         rc = EFAULT;
1185                 break;
1186 
1187         case MC_IOC_ONLINESPARE_EN:
1188                 if (drv_priv(credp) != 0) {
1189                         rw_exit(&mc_lock);
1190                         return (EPERM);
1191                 }
1192 
1193                 if (!rw_tryupgrade(&mc_lock)) {
1194                         rw_exit(&mc_lock);
1195                         return (EAGAIN);
1196                 }
1197 
1198                 if ((rc = mc_onlinespare(mc, (int)arg)) == 0) {
1199                         mc_snapshot_destroy(mc);
1200                         nvlist_free(mc->mc_nvl);
1201                         mc->mc_nvl = mc_nvl_create(mc);
1202                 }
1203 
1204                 break;
1205         }
1206 
1207         rw_exit(&mc_lock);
1208 
1209         return (rc);
1210 }
1211 
/*
 * Character device entry points.  Only open, close and ioctl have
 * driver-specific implementations (mc_open, mc_close, mc_ioctl); the
 * remaining slots are filled with the nodev/nochpoll stubs, and property
 * operations go to ddi_prop_op.
 */
static struct cb_ops mc_cb_ops = {
        mc_open,
        mc_close,
        nodev,          /* not a block driver */
        nodev,          /* no print routine */
        nodev,          /* no dump routine */
        nodev,          /* no read routine */
        nodev,          /* no write routine */
        mc_ioctl,
        nodev,          /* no devmap routine */
        nodev,          /* no mmap routine */
        nodev,          /* no segmap routine */
        nochpoll,       /* no chpoll routine */
        ddi_prop_op,
        0,              /* not a STREAMS driver */
        D_NEW | D_MP,   /* safe for multi-thread/multi-processor */
};
1229 
1230 /*ARGSUSED*/
1231 static int
1232 mc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
1233 {
1234         int rc = DDI_SUCCESS;
1235         mc_t *mc;
1236 
1237         if (infocmd != DDI_INFO_DEVT2DEVINFO &&
1238             infocmd != DDI_INFO_DEVT2INSTANCE) {
1239                 *result = NULL;
1240                 return (DDI_FAILURE);
1241         }
1242 
1243         rw_enter(&mc_lock, RW_READER);
1244 
1245         if ((mc = mc_lookup_by_chipid(getminor((dev_t)arg))) == NULL ||
1246             mc->mc_funcs[MC_FUNC_DEVIMAP].mcf_devi == NULL) {
1247                 rc = DDI_FAILURE;
1248         } else if (infocmd == DDI_INFO_DEVT2DEVINFO) {
1249                 *result = mc->mc_funcs[MC_FUNC_DEVIMAP].mcf_devi;
1250         } else {
1251                 *result = (void *)(uintptr_t)
1252                     mc->mc_funcs[MC_FUNC_DEVIMAP].mcf_instance;
1253         }
1254 
1255         rw_exit(&mc_lock);
1256 
1257         return (rc);
1258 }
1259 
/*
 * FM error callback registered by mc_fm_init.  Posts PCI ereports for the
 * device via pci_ereport_post and returns the status left in
 * fmerr->fme_status.
 */
/*ARGSUSED2*/
static int
mc_fm_handle(dev_info_t *dip, ddi_fm_error_t *fmerr, const void *arg)
{
        pci_ereport_post(dip, fmerr, NULL);
        return (fmerr->fme_status);
}
1267 
1268 static void
1269 mc_fm_init(dev_info_t *dip)
1270 {
1271         int fmcap = DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE;
1272         ddi_fm_init(dip, &fmcap, NULL);
1273         pci_ereport_setup(dip);
1274         ddi_fm_handler_register(dip, mc_fm_handle, NULL);
1275 }
1276 
1277 static void
1278 mc_read_smbios(mc_t *mc, dev_info_t *dip)
1279 {
1280 
1281         uint16_t bdf;
1282         pci_regspec_t *pci_rp = NULL;
1283         uint32_t phys_hi;
1284         int m = 0;
1285         uint_t chip_inst;
1286         int rc = 0;
1287 
1288         if (ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg",
1289             (caddr_t)&pci_rp, &m) == DDI_SUCCESS) {
1290                 phys_hi = pci_rp->pci_phys_hi;
1291                 bdf = (uint16_t)(PCI_REG_BDFR_G(phys_hi) >>
1292                     PCI_REG_FUNC_SHIFT);
1293                 kmem_free(pci_rp, m);
1294                 pci_rp = NULL;
1295 
1296                 rc = fm_smb_mc_chipinst(bdf, &chip_inst);
1297                 if (rc == 0) {
1298                         mc->smb_chipid = chip_inst;
1299                 } else {
1300 #ifdef DEBUG
1301                         cmn_err(CE_NOTE, "!mc read smbios chip info failed");
1302 #endif /* DEBUG */
1303                         return;
1304                 }
1305                 mc->smb_bboard = fm_smb_mc_bboards(bdf);
1306 #ifdef DEBUG
1307                 if (mc->smb_bboard == NULL)
1308                         cmn_err(CE_NOTE,
1309                             "!mc read smbios base boards info failed");
1310 #endif /* DEBUG */
1311         }
1312 
1313         if (pci_rp != NULL)
1314                 kmem_free(pci_rp, m);
1315 }
1316 
1317 /*ARGSUSED*/
1318 static int
1319 mc_create_cb(cmi_hdl_t whdl, void *arg1, void *arg2, void *arg3)
1320 {
1321         chipid_t chipid = *((chipid_t *)arg1);
1322         cmi_hdl_t *hdlp = (cmi_hdl_t *)arg2;
1323 
1324         if (cmi_hdl_chipid(whdl) == chipid) {
1325                 cmi_hdl_hold(whdl);     /* short-term hold */
1326                 *hdlp = whdl;
1327                 return (CMI_HDL_WALK_DONE);
1328         } else {
1329                 return (CMI_HDL_WALK_NEXT);
1330         }
1331 }
1332 
1333 static mc_t *
1334 mc_create(chipid_t chipid, dev_info_t *dip)
1335 {
1336         mc_t *mc;
1337         cmi_hdl_t hdl = NULL;
1338 
1339         ASSERT(RW_WRITE_HELD(&mc_lock));
1340 
1341         /*
1342          * Find a handle for one of a chip's CPU.
1343          *
1344          * We can use one of the chip's CPUs since all cores
1345          * of a chip share the same revision and socket type.
1346          */
1347         cmi_hdl_walk(mc_create_cb, (void *)&chipid, (void *)&hdl, NULL);
1348         if (hdl == NULL)
1349                 return (NULL);  /* no cpu for this chipid found! */
1350 
1351         mc = kmem_zalloc(sizeof (mc_t), KM_SLEEP);
1352 
1353         mc->mc_hdr.mch_type = MC_NT_MC;
1354         mc->mc_props.mcp_num = chipid;
1355         mc->mc_props.mcp_sparecs = MC_INVALNUM;
1356         mc->mc_props.mcp_badcs = MC_INVALNUM;
1357 
1358         mc->mc_props.mcp_rev = cmi_hdl_chiprev(hdl);
1359         mc->mc_revname = cmi_hdl_chiprevstr(hdl);
1360         mc->mc_socket = cmi_hdl_getsockettype(hdl);
1361 
1362         mc_read_smbios(mc, dip);
1363 
1364         if (mc_list == NULL)
1365                 mc_list = mc;
1366         if (mc_last != NULL)
1367                 mc_last->mc_next = mc;
1368 
1369         mc->mc_next = NULL;
1370         mc_last = mc;
1371 
1372         cmi_hdl_rele(hdl);
1373 
1374         return (mc);
1375 }
1376 
/*
 * Pick the faster of two scrub rates.  The first is r1; the second, r2, is
 * the field extracted from register value 'cfg' as (cfg & mask) >> shift.
 * A rate of zero means scrubbing is off, in which case the other rate is
 * returned (and that may itself be zero).  When both are non-zero the
 * smaller value - which is the higher scrubbing rate - is returned.
 */
static uint32_t
mc_scrubber_max(uint32_t r1, uint32_t cfg, uint32_t mask, uint32_t shift)
{
        const uint32_t r2 = (cfg & mask) >> shift;

        if (r1 == 0)
                return (r2);
        if (r2 == 0)
                return (r1);

        return (r1 < r2 ? r1 : r2);
}
1393 
1394 
1395 /*
1396  * Enable the memory scrubber.  We must use the mc_pcicfg_{get32,put32}_nohdl
1397  * interfaces since we do not bind to function 3.
1398  */
1399 cmi_errno_t
1400 mc_scrubber_enable(mc_t *mc)
1401 {
1402         mc_props_t *mcp = &mc->mc_props;
1403         chipid_t chipid = (chipid_t)mcp->mcp_num;
1404         uint32_t rev = (uint32_t)mcp->mcp_rev;
1405         mc_cfgregs_t *mcr = &mc->mc_cfgregs;
1406         union mcreg_scrubctl scrubctl;
1407         union mcreg_dramscrublo dalo;
1408         union mcreg_dramscrubhi dahi;
1409 
1410         mcr->mcr_scrubctl = MCREG_VAL32(&scrubctl) =
1411             mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBCTL);
1412 
1413         mcr->mcr_scrubaddrlo = MCREG_VAL32(&dalo) =
1414             mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_LO);
1415 
1416         mcr->mcr_scrubaddrhi = MCREG_VAL32(&dahi) =
1417             mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_HI);
1418 
1419         if (mc_scrub_policy == MC_SCRUB_BIOSDEFAULT)
1420                 return (MCREG_FIELD_CMN(&scrubctl, DramScrub) !=
1421                     AMD_NB_SCRUBCTL_RATE_NONE ?
1422                     CMI_SUCCESS : CMIERR_MC_NOMEMSCRUB);
1423 
1424         /*
1425          * Disable DRAM scrubbing while we fiddle.
1426          */
1427         MCREG_FIELD_CMN(&scrubctl, DramScrub) = AMD_NB_SCRUBCTL_RATE_NONE;
1428         mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBCTL,
1429             MCREG_VAL32(&scrubctl));
1430 
1431         /*
1432          * Setup DRAM Scrub Address Low and High registers for the
1433          * base address of this node, and to select srubber redirect.
1434          */
1435         MCREG_FIELD_CMN(&dalo, ScrubReDirEn) = 1;
1436         MCREG_FIELD_CMN(&dalo, ScrubAddrLo) =
1437             AMD_NB_SCRUBADDR_MKLO(mcp->mcp_base);
1438 
1439         MCREG_FIELD_CMN(&dahi, ScrubAddrHi) =
1440             AMD_NB_SCRUBADDR_MKHI(mcp->mcp_base);
1441 
1442         mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_LO,
1443             MCREG_VAL32(&dalo));
1444         mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_HI,
1445             MCREG_VAL32(&dahi));
1446 
1447         if (mc_scrub_rate_dram > AMD_NB_SCRUBCTL_RATE_MAX) {
1448                 cmn_err(CE_WARN, "mc_scrub_rate_dram is too large; "
1449                     "resetting to 0x%x\n", AMD_NB_SCRUBCTL_RATE_MAX);
1450                 mc_scrub_rate_dram = AMD_NB_SCRUBCTL_RATE_MAX;
1451         }
1452 
1453         switch (mc_scrub_policy) {
1454         case MC_SCRUB_FIXED:
1455                 /* Use the system value checked above */
1456                 break;
1457 
1458         default:
1459                 cmn_err(CE_WARN, "Unknown mc_scrub_policy value %d - "
1460                     "using default policy of MC_SCRUB_MAX", mc_scrub_policy);
1461                 /*FALLTHRU*/
1462 
1463         case MC_SCRUB_MAX:
1464                 mc_scrub_rate_dram = mc_scrubber_max(mc_scrub_rate_dram,
1465                     mcr->mcr_scrubctl, AMD_NB_SCRUBCTL_DRAM_MASK,
1466                     AMD_NB_SCRUBCTL_DRAM_SHIFT);
1467                 break;
1468         }
1469 
1470         /*
1471          * OPTERON_ERRATUM_99:
1472          * This erratum applies on revisions D and earlier.
1473          * This erratum also applies on revisions E and later,
1474          * if BIOS uses chip-select hoisting instead of DRAM hole
1475          * mapping.
1476          *
1477          * Do not enable the dram scrubber if the chip-select ranges
1478          * for the node are not contiguous.
1479          */
1480         if (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE &&
1481             mc->mc_csdiscontig) {
1482                 cmn_err(CE_CONT, "?Opteron DRAM scrubber disabled on revision "
1483                     "%s chip %d because DRAM hole is present on this node",
1484                     mc->mc_revname, chipid);
1485                 mc_scrub_rate_dram = AMD_NB_SCRUBCTL_RATE_NONE;
1486         }
1487 
1488         /*
1489          * OPTERON_ERRATUM_101:
1490          * This erratum applies on revisions D and earlier.
1491          *
1492          * If the DRAM Base Address register's IntlvEn field indicates that
1493          * node interleaving is enabled, we must disable the DRAM scrubber
1494          * and return zero to indicate that Solaris should use s/w instead.
1495          */
1496         if (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE &&
1497             mcp->mcp_ilen != 0 &&
1498             !X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_E)) {
1499                 cmn_err(CE_CONT, "?Opteron DRAM scrubber disabled on revision "
1500                     "%s chip %d because DRAM memory is node-interleaved",
1501                     mc->mc_revname, chipid);
1502                 mc_scrub_rate_dram = AMD_NB_SCRUBCTL_RATE_NONE;
1503         }
1504 
1505         if (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE) {
1506                 MCREG_FIELD_CMN(&scrubctl, DramScrub) = mc_scrub_rate_dram;
1507                 mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBCTL,
1508                     MCREG_VAL32(&scrubctl));
1509         }
1510 
1511         return (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE ?
1512             CMI_SUCCESS : CMIERR_MC_NOMEMSCRUB);
1513 }
1514 
1515 /*ARGSUSED*/
1516 static int
1517 mc_attach_cb(cmi_hdl_t whdl, void *arg1, void *arg2, void *arg3)
1518 {
1519         mc_t *mc = (mc_t *)arg1;
1520         mcamd_prop_t chipid = *((mcamd_prop_t *)arg2);
1521 
1522         if (cmi_hdl_chipid(whdl) == chipid) {
1523                 mcamd_mc_register(whdl, mc);
1524         }
1525 
1526         return (CMI_HDL_WALK_NEXT);
1527 }
1528 
1529 static int mc_sw_scrub_disabled = 0;
1530 
1531 static int
1532 mc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1533 {
1534         mc_pcicfg_hdl_t cfghdl;
1535         const mc_bind_map_t *bm;
1536         const char *bindnm;
1537         char *unitstr = NULL;
1538         enum mc_funcnum func;
1539         long unitaddr;
1540         int chipid, rc;
1541         mc_t *mc;
1542 
1543         /*
1544          * This driver has no hardware state, but does
1545          * claim to have a reg property, so it will be
1546          * called on suspend.  It is probably better to
1547          * make sure it doesn't get called on suspend,
1548          * but it is just as easy to make sure we just
1549          * return DDI_SUCCESS if called.
1550          */
1551         if (cmd == DDI_RESUME)
1552                 return (DDI_SUCCESS);
1553 
1554         if (cmd != DDI_ATTACH || mc_no_attach != 0)
1555                 return (DDI_FAILURE);
1556 
1557         bindnm = ddi_binding_name(dip);
1558         for (bm = mc_bind_map; bm->bm_bindnm != NULL; bm++) {
1559                 if (strcmp(bindnm, bm->bm_bindnm) == 0) {
1560                         func = bm->bm_func;
1561                         break;
1562                 }
1563         }
1564 
1565         if (bm->bm_bindnm == NULL)
1566                 return (DDI_FAILURE);
1567 
1568         /*
1569          * We need the device number, which corresponds to the processor node
1570          * number plus 24.  The node number can then be used to associate this
1571          * memory controller device with a given processor chip.
1572          */
1573         if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip,
1574             DDI_PROP_DONTPASS, "unit-address", &unitstr) != DDI_PROP_SUCCESS) {
1575                 cmn_err(CE_WARN, "failed to find unit-address for %s", bindnm);
1576                 return (DDI_FAILURE);
1577         }
1578 
1579         rc = ddi_strtol(unitstr, NULL, 16, &unitaddr);
1580         ASSERT(rc == 0 && unitaddr >= MC_AMD_DEV_OFFSET);
1581 
1582         if (rc != 0 || unitaddr < MC_AMD_DEV_OFFSET) {
1583                 cmn_err(CE_WARN, "failed to parse unit address %s for %s\n",
1584                     unitstr, bindnm);
1585                 ddi_prop_free(unitstr);
1586                 return (DDI_FAILURE);
1587         }
1588         ddi_prop_free(unitstr);
1589 
1590         chipid = unitaddr - MC_AMD_DEV_OFFSET;
1591 
1592         rw_enter(&mc_lock, RW_WRITER);
1593 
1594         for (mc = mc_list; mc != NULL; mc = mc->mc_next) {
1595                 if (mc->mc_props.mcp_num == chipid)
1596                         break;
1597         }
1598 
1599         /* Integrate this memory controller device into existing set */
1600         if (mc == NULL) {
1601                 mc = mc_create(chipid, dip);
1602 
1603                 if (mc == NULL) {
1604                         /*
1605                          * We don't complain here because this is a legitimate
1606                          * path for MP systems.  On those machines, we'll attach
1607                          * before all CPUs have been initialized, and thus the
1608                          * chip verification in mc_create will fail.  We'll be
1609                          * reattached later for those CPUs.
1610                          */
1611                         rw_exit(&mc_lock);
1612                         return (DDI_FAILURE);
1613                 }
1614         } else {
1615                 mc_snapshot_destroy(mc);
1616         }
1617 
1618         /* Beyond this point, we're committed to creating this node */
1619 
1620         mc_fm_init(dip);
1621 
1622         ASSERT(mc->mc_funcs[func].mcf_devi == NULL);
1623         mc->mc_funcs[func].mcf_devi = dip;
1624         mc->mc_funcs[func].mcf_instance = ddi_get_instance(dip);
1625 
1626         mc->mc_ref++;
1627 
1628         /*
1629          * Add the common properties to this node, and then add any properties
1630          * that are specific to this node based upon its configuration space.
1631          */
1632         (void) ddi_prop_update_string(DDI_DEV_T_NONE,
1633             dip, "model", (char *)bm->bm_model);
1634 
1635         (void) ddi_prop_update_int(DDI_DEV_T_NONE,
1636             dip, "chip-id", mc->mc_props.mcp_num);
1637 
1638         if (bm->bm_mkprops != NULL &&
1639             mc_pcicfg_setup(mc, bm->bm_func, &cfghdl) == DDI_SUCCESS) {
1640                 bm->bm_mkprops(cfghdl, mc);
1641                 mc_pcicfg_teardown(cfghdl);
1642         }
1643 
1644         /*
1645          * If this is the last node to be attached for this memory controller,
1646          * then create the minor node, enable scrubbers, and register with
1647          * cpu module(s) for this chip.
1648          */
1649         if (func == MC_FUNC_DEVIMAP) {
1650                 mc_props_t *mcp = &mc->mc_props;
1651                 int dram_present = 0;
1652 
1653                 if (ddi_create_minor_node(dip, "mc-amd", S_IFCHR,
1654                     mcp->mcp_num, "ddi_mem_ctrl",
1655                     0) != DDI_SUCCESS) {
1656                         cmn_err(CE_WARN, "failed to create minor node for chip "
1657                             "%d memory controller\n",
1658                             (chipid_t)mcp->mcp_num);
1659                 }
1660 
1661                 /*
1662                  * Register the memory controller for every CPU of this chip.
1663                  *
1664                  * If there is memory present on this node and ECC is enabled
1665                  * attempt to enable h/w memory scrubbers for this node.
1666                  * If we are successful in enabling *any* hardware scrubbers,
1667                  * disable the software memory scrubber.
1668                  */
1669                 cmi_hdl_walk(mc_attach_cb, (void *)mc, (void *)&mcp->mcp_num,
1670                     NULL);
1671 
1672                 if (mcp->mcp_lim != mcp->mcp_base) {
1673                         /*
1674                          * This node may map non-dram memory alone, so we
1675                          * must check for an enabled chip-select to be
1676                          * sure there is dram present.
1677                          */
1678                         mc_cs_t *mccs;
1679 
1680                         for (mccs = mc->mc_cslist; mccs != NULL;
1681                             mccs = mccs->mccs_next) {
1682                                 if (mccs->mccs_props.csp_csbe) {
1683                                         dram_present = 1;
1684                                         break;
1685                                 }
1686                         }
1687                 }
1688 
1689                 if (dram_present && !mc_ecc_enabled(mc)) {
1690                         /*
1691                          * On a single chip system there is no point in
1692                          * scrubbing if there is no ECC on the single node.
1693                          * On a multichip system, necessarily Opteron using
1694                          * registered ECC-capable DIMMs, if there is memory
1695                          * present on a node but no ECC there then we'll assume
1696                          * ECC is disabled for all nodes and we will not enable
1697                          * the scrubber and will also disable the software
1698                          * memscrub thread.
1699                          */
1700                         rc = 1;
1701                 } else if (!dram_present) {
1702                         /* No memory on this node - others decide memscrub */
1703                         rc = 0;
1704                 } else {
1705                         /*
1706                          * There is memory on this node and ECC is enabled.
1707                          * Call via the cpu module to enable memory scrubbing
1708                          * on this node - we could call directly but then
1709                          * we may overlap with a request to enable chip-cache
1710                          * scrubbing.
1711                          */
1712                         rc = mc_scrubber_enable(mc);
1713                 }
1714 
1715                 if (rc == CMI_SUCCESS && !mc_sw_scrub_disabled++)
1716                         cmi_mc_sw_memscrub_disable();
1717 
1718                 mc_report_testfails(mc);
1719         }
1720 
1721         /*
1722          * Update nvlist for as far as we have gotten in attach/init.
1723          */
1724         nvlist_free(mc->mc_nvl);
1725         mc->mc_nvl = mc_nvl_create(mc);
1726 
1727         rw_exit(&mc_lock);
1728         return (DDI_SUCCESS);
1729 }
1730 
1731 /*ARGSUSED*/
1732 static int
1733 mc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1734 {
1735         /*
1736          * See the comment about suspend in
1737          * mc_attach().
1738          */
1739         if (cmd == DDI_SUSPEND)
1740                 return (DDI_SUCCESS);
1741         else
1742                 return (DDI_FAILURE);
1743 }
1744 
1745 
1746 static struct dev_ops mc_ops = {
1747         DEVO_REV,               /* devo_rev */
1748         0,                      /* devo_refcnt */
1749         mc_getinfo,             /* devo_getinfo */
1750         nulldev,                /* devo_identify */
1751         nulldev,                /* devo_probe */
1752         mc_attach,              /* devo_attach */
1753         mc_detach,              /* devo_detach */
1754         nodev,                  /* devo_reset */
1755         &mc_cb_ops,         /* devo_cb_ops */
1756         NULL,                   /* devo_bus_ops */
1757         NULL,                   /* devo_power */
1758         ddi_quiesce_not_needed,         /* devo_quiesce */
1759 };
1760 
1761 static struct modldrv modldrv = {
1762         &mod_driverops,
1763         "Memory Controller for AMD processors",
1764         &mc_ops
1765 };
1766 
1767 static struct modlinkage modlinkage = {
1768         MODREV_1,
1769         (void *)&modldrv,
1770         NULL
1771 };
1772 
1773 int
1774 _init(void)
1775 {
1776         /*
1777          * Refuse to load if there is no PCI config space support.
1778          */
1779         if (pci_getl_func == NULL)
1780                 return (ENOTSUP);
1781 
1782         rw_init(&mc_lock, NULL, RW_DRIVER, NULL);
1783         return (mod_install(&modlinkage));
1784 }
1785 
1786 int
1787 _info(struct modinfo *modinfop)
1788 {
1789         return (mod_info(&modlinkage, modinfop));
1790 }
1791 
1792 int
1793 _fini(void)
1794 {
1795         int rc;
1796 
1797         if ((rc = mod_remove(&modlinkage)) != 0)
1798                 return (rc);
1799 
1800         rw_destroy(&mc_lock);
1801         return (0);
1802 }