/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
 * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif  /* __sparc */

/*
 * Turn a byte length into a page count.  The DDI btop takes a
 * 32-bit size on 32-bit machines; this macro handles 64-bit sizes,
 * e.g. for 32-bit machines with large physical memory.
 */
#define BTOP(x) ((pgcnt_t)((x) >> _pageshift))
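
/*
 * For example, with 4K pages (_pageshift == 12), BTOP(0x400000000ULL)
 * (16GB) is 0x400000 pages -- a size that would not fit in btop's
 * 32-bit argument on a 32-bit machine.
 */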

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;      /* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define MM_KMEMLOG_NENTRIES     64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];

/*
 * On kmem/allmem writes, we log information that might be useful in the event
 * that a write is errant (that is, due to operator error) and induces a later
 * problem.  Note that (in particular) in the event of such operator-induced
 * corruption, a search over the kernel address space for the corrupted
 * address will yield the ring buffer entry that recorded the write.  And
 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
 * auditing facility and yes, we learned that the hard way: disturbingly,
 * there exist recommendations for "tuning" the system that involve writing to
 * kernel memory addresses via the kernel debugger, and -- as we discovered --
 * these can easily be applied incorrectly or unsafely, yielding an entirely
 * undebuggable "can't happen" kind of panic.
 */
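/*
 * Illustrative only (a sketch, not a supported recipe): from mdb, a
 * search of the kernel address space for the corrupted address, e.g.
 *
 *      > <corrupted-address>::kgrep | ::whatis
 *
 * should lead back to the mm_kmemlog entry whose mle_vaddr recorded
 * the write.
 */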
static void
mm_logkmem(struct uio *uio)
{
        mm_logentry_t *ent;
        proc_t *p = curthread->t_procp;

        mutex_enter(&mm_lock);

        ent = &mm_kmemlog[mm_kmemlogent++];

        if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
                mm_kmemlogent = 0;

        ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
        ent->mle_len = uio->uio_resid;
        gethrestime(&ent->mle_hrestime);
        ent->mle_hrtime = gethrtime();
        ent->mle_pid = p->p_pidp->pid_id;

        (void) strncpy(ent->mle_psargs,
            p->p_user.u_psargs, sizeof (ent->mle_psargs));

        mutex_exit(&mm_lock);
}

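/*
 * Attach: initialize mm_lock and the mm_map page window, create the
 * minor nodes (each with its own privilege requirements and default
 * mode), install the mm:phys_installed kstat, and read the
 * kmem_io_access property.
 */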
/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
        int i;
        struct mem_minor {
                char *name;
                minor_t minor;
                int privonly;
                const char *rdpriv;
                const char *wrpriv;
                mode_t priv_mode;
        } mm[] = {
                { "mem",        M_MEM,          0,      NULL,   "all",  0640 },
                { "kmem",       M_KMEM,         0,      NULL,   "all",  0640 },
                { "allkmem",    M_ALLKMEM,      0,      "all",  "all",  0600 },
                { "null",       M_NULL, PRIVONLY_DEV,   NULL,   NULL,   0666 },
                { "zero",       M_ZERO, PRIVONLY_DEV,   NULL,   NULL,   0666 },
                { "full",       M_FULL, PRIVONLY_DEV,   NULL,   NULL,   0666 },
        };
        kstat_t *ksp;

        mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
        mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

        for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
                if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
                    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
                    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
                    DDI_FAILURE) {
                        ddi_remove_minor_node(devi, NULL);
                        return (DDI_FAILURE);
                }
        }

        mm_dip = devi;

        ksp = kstat_create("mm", 0, "phys_installed", "misc",
            KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
        if (ksp != NULL) {
                ksp->ks_update = mm_kstat_update;
                ksp->ks_snapshot = mm_kstat_snapshot;
                ksp->ks_lock = &mm_lock; /* XXX - not really needed */
                kstat_install(ksp);
        }

        mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
            "kmem_io_access", 0);

        return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
        int error;

        switch (infocmd) {
        case DDI_INFO_DEVT2DEVINFO:
                *result = (void *)mm_dip;
                error = DDI_SUCCESS;
                break;
        case DDI_INFO_DEVT2INSTANCE:
                *result = (void *)0;
                error = DDI_SUCCESS;
                break;
        default:
                error = DDI_FAILURE;
        }
        return (error);
}

/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
        switch (getminor(*devp)) {
        case M_NULL:
        case M_ZERO:
        case M_FULL:
        case M_MEM:
        case M_KMEM:
        case M_ALLKMEM:
                /* standard devices */
                break;

        default:
                /* unsupported or unknown type */
                return (EINVAL);
        }
        /* must be character device */
        if (typ != OTYP_CHR)
                return (EINVAL);
        return (0);
}

struct pollhead mm_pollhd;

/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
        switch (getminor(dev)) {
        case M_NULL:
        case M_ZERO:
        case M_FULL:
        case M_MEM:
        case M_KMEM:
        case M_ALLKMEM:
                *reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
                    POLLWRNORM | POLLRDBAND | POLLWRBAND);
                /*
                 * A non-NULL pollhead pointer must be returned in case
                 * the user polls for 0 events.
                 */
                *phpp = !anyyet && !*reventsp ?
                    &mm_pollhd : (struct pollhead *)NULL;
                return (0);
        default:
                /* no other devices currently support polling */
                return (ENXIO);
        }
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
        /*
         * Report a size of zero to reduce overhead (avoids two failing
         * property lookups per stat).
         */
        return (ddi_prop_op_size(dev, dip, prop_op,
            flags, name, valuep, lengthp, 0));
}

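/*
 * Perform I/O to a single page.  If the page is ordinary memory and kpm
 * is enabled, it is mapped through the kernel physical mapping segment;
 * otherwise a translation is transiently loaded at mm_map (which is why
 * all of this is serialized by mm_lock).  Non-memory (device) pages are
 * only accessed via ddi_peekpokeio(), and only when allowio is set.
 */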
static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
        int error = 0;
        int devload = 0;
        int is_memory = pf_is_memory(pfn);
        size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
            (size_t)uio->uio_iov->iov_len);
        caddr_t va = NULL;

        mutex_enter(&mm_lock);

        if (is_memory && kpm_enable) {
                if (pp)
                        va = hat_kpm_mapin(pp, NULL);
                else
                        va = hat_kpm_mapin_pfn(pfn);
        }

        if (va == NULL) {
                hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
                    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
                    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
                va = mm_map;
                devload = 1;
        }

        if (!is_memory) {
                if (allowio) {
                        size_t c = uio->uio_iov->iov_len;

                        if (ddi_peekpokeio(NULL, uio, rw,
                            (caddr_t)(uintptr_t)uio->uio_loffset, c,
                            sizeof (int32_t)) != DDI_SUCCESS)
                                error = EFAULT;
                } else
                        error = EIO;
        } else
                error = uiomove(va + pageoff, nbytes, rw, uio);

        if (devload)
                hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
        else if (pp)
                hat_kpm_mapout(pp, NULL, va);
        else
                hat_kpm_mapout_pfn(pfn);

        mutex_exit(&mm_lock);
        return (error);
}

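/*
 * Decide whether mmrw() should try to lock the page underlying va:
 * returns non-zero if the segment mapping va reports the
 * S_CAPABILITY_NOMINFLT capability, zero otherwise (including when va
 * is not mapped at all).
 */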
static int
mmpagelock(struct as *as, caddr_t va)
{
        struct seg *seg;
        int i;

        AS_LOCK_ENTER(as, RW_READER);
        seg = as_segat(as, va);
        i = (seg != NULL) ? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
        AS_LOCK_EXIT(as);

        return (i);
}

#ifdef  __sparc

#define NEED_LOCK_KVADDR(va)    mmpagelock(&kas, va)

#else   /* __i386, __amd64 */

#define NEED_LOCK_KVADDR(va)    0

#endif  /* __sparc */

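/*
 * Common read(2)/write(2) handler for all of the mm minor devices:
 * walks the uio an iovec at a time and dispatches on the minor number.
 */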
/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
        pfn_t v;
        struct iovec *iov;
        int error = 0;
        size_t c;
        ssize_t oresid = uio->uio_resid;
        minor_t minor = getminor(dev);

        while (uio->uio_resid > 0 && error == 0) {
                iov = uio->uio_iov;
                if (iov->iov_len == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        if (uio->uio_iovcnt < 0)
                                panic("mmrw");
                        continue;
                }
                switch (minor) {

                case M_MEM:
                        memlist_read_lock();
                        if (!address_in_memlist(phys_install,
                            (uint64_t)uio->uio_loffset, 1)) {
                                memlist_read_unlock();
                                error = EFAULT;
                                break;
                        }
                        memlist_read_unlock();

                        v = BTOP((u_offset_t)uio->uio_loffset);
                        error = mmio(uio, rw, v,
                            uio->uio_loffset & PAGEOFFSET, 0, NULL);
                        break;

                case M_KMEM:
                case M_ALLKMEM:
                        {
                        page_t **ppp = NULL;
                        caddr_t vaddr = (caddr_t)uio->uio_offset;
                        int try_lock = NEED_LOCK_KVADDR(vaddr);
                        int locked = 0;

                        if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
                                break;

                        if (rw == UIO_WRITE)
                                mm_logkmem(uio);

                        /*
                         * If vaddr does not map a valid page, as_pagelock()
                         * will fail, so we can't simply check its return
                         * value and return EFAULT here as we'd like.
                         * seg_kp and seg_kpm do not properly support
                         * as_pagelock() in this context, so we avoid them
                         * via the try_lock check above.  Some day, when
                         * kernel page locking is redesigned, all this
                         * muck can be cleaned up.
                         */
                        if (try_lock)
                                locked = (as_pagelock(&kas, &ppp, vaddr,
                                    PAGESIZE, S_WRITE) == 0);

                        v = hat_getpfnum(kas.a_hat,
                            (caddr_t)(uintptr_t)uio->uio_loffset);
                        if (v == PFN_INVALID) {
                                if (locked)
                                        as_pageunlock(&kas, ppp, vaddr,
                                            PAGESIZE, S_WRITE);
                                error = EFAULT;
                                break;
                        }

                        error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
                            minor == M_ALLKMEM || mm_kmem_io_access,
                            (locked && ppp) ? *ppp : NULL);
                        if (locked)
                                as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
                                    S_WRITE);
                        }

                        break;

                case M_FULL:
                        if (rw == UIO_WRITE) {
                                error = ENOSPC;
                                break;
                        }
                        /* else it's a read, fall through to zero case */
                        /*FALLTHROUGH*/

                case M_ZERO:
                        if (rw == UIO_READ) {
                                label_t ljb;

                                if (on_fault(&ljb)) {
                                        no_fault();
                                        error = EFAULT;
                                        break;
                                }
                                uzero(iov->iov_base, iov->iov_len);
                                no_fault();
                                uio->uio_resid -= iov->iov_len;
                                uio->uio_loffset += iov->iov_len;
                                break;
                        }
                        /* else it's a write, fall through to NULL case */
                        /*FALLTHROUGH*/

                case M_NULL:
                        if (rw == UIO_READ)
                                return (0);
                        c = iov->iov_len;
                        iov->iov_base += c;
                        iov->iov_len -= c;
                        uio->uio_loffset += c;
                        uio->uio_resid -= c;
                        break;

                }
        }
        return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
        return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
        return (mmrw(dev, uio, UIO_WRITE, cred));
}

/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
        mem_vtop32_t vtop32;
#endif
        mem_vtop_t mem_vtop;
        proc_t *p;
        pfn_t pfn = (pfn_t)PFN_INVALID;
        pid_t pid = 0;
        struct as *as;
        struct seg *seg;

        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
                        return (EFAULT);
        }
#ifdef _SYSCALL32
        else {
                if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
                        return (EFAULT);
                mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
                mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

                if (mem_vtop.m_as != NULL)
                        return (EINVAL);
        }
#endif

        if (mem_vtop.m_as == &kas) {
                pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
        } else {
                if (mem_vtop.m_as == NULL) {
                        /*
                         * Assume the calling process's address space if the
                         * caller didn't specify one.
                         */
                        p = curthread->t_procp;
                        if (p == NULL)
                                return (EIO);
                        mem_vtop.m_as = p->p_as;
                }

                mutex_enter(&pidlock);
                for (p = practive; p != NULL; p = p->p_next) {
                        if (p->p_as == mem_vtop.m_as) {
                                pid = p->p_pid;
                                break;
                        }
                }
                mutex_exit(&pidlock);
                if (p == NULL)
                        return (EIO);
                p = sprlock(pid);
                if (p == NULL)
                        return (EIO);
                as = p->p_as;
                if (as == mem_vtop.m_as) {
                        mutex_exit(&p->p_lock);
                        AS_LOCK_ENTER(as, RW_READER);
                        for (seg = AS_SEGFIRST(as); seg != NULL;
                            seg = AS_SEGNEXT(as, seg))
                                if ((uintptr_t)mem_vtop.m_va -
                                    (uintptr_t)seg->s_base < seg->s_size)
                                        break;
                        if (seg != NULL)
                                pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
                        AS_LOCK_EXIT(as);
                        mutex_enter(&p->p_lock);
                }
                sprunlock(p);
        }
        mem_vtop.m_pfn = pfn;
        if (pfn == PFN_INVALID)
                return (EIO);

        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
                        return (EFAULT);
        }
#ifdef _SYSCALL32
        else {
                vtop32.m_pfn = mem_vtop.m_pfn;
                if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
                        return (EFAULT);
        }
#endif

        return (0);
}
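
/*
 * Purely illustrative sketch of a (hypothetical) userland caller --
 * roughly what libkvm's kvm_physaddr() does -- with fd opened on
 * /dev/kmem:
 *
 *      mem_vtop_t vtop;
 *
 *      vtop.m_as = NULL;               (current process's address space)
 *      vtop.m_va = va;
 *      if (ioctl(fd, MEM_VTOP, &vtop) == 0)
 *              pa = ptob(vtop.m_pfn) + ((uintptr_t)va & PAGEOFFSET);
 */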

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
        extern int page_retire_test(void);
        uint64_t pa;

        if (copyin((void *)data, &pa, sizeof (uint64_t))) {
                return (EFAULT);
        }

        switch (cmd) {
        case MEM_PAGE_ISRETIRED:
                return (page_retire_check(pa, NULL));

        case MEM_PAGE_UNRETIRE:
                return (page_unretire(pa));

        case MEM_PAGE_RETIRE:
                return (page_retire(pa, PR_FMA));

        case MEM_PAGE_RETIRE_MCE:
                return (page_retire(pa, PR_MCE));

        case MEM_PAGE_RETIRE_UE:
                return (page_retire(pa, PR_UE));

        case MEM_PAGE_GETERRORS:
                {
                        uint64_t page_errors;
                        int rc = page_retire_check(pa, &page_errors);
                        if (copyout(&page_errors, (void *)data,
                            sizeof (uint64_t))) {
                                return (EFAULT);
                        }
                        return (rc);
                }

        case MEM_PAGE_RETIRE_TEST:
                return (page_retire_test());

        }

        return (EINVAL);
}
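
/*
 * Purely illustrative sketch of a (hypothetical) consumer, with fd
 * opened on /dev/mem; every retire command takes the physical address
 * by reference:
 *
 *      uint64_t pa = ...;
 *
 *      if (ioctl(fd, MEM_PAGE_RETIRE, &pa) != 0)
 *              (retire failed or is pending; consult errno)
 */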

#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address, return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
        mem_name_t mem_name;
        void *buf;
        size_t bufsize;
        int len, err;

        if ((bufsize = cpu_get_name_bufsize()) == 0)
                return (ENOTSUP);

        if ((err = mm_read_mem_name(data, &mem_name)) != 0)
                return (err);

        buf = kmem_alloc(bufsize, KM_SLEEP);

        /*
         * Call into cpu specific code to do the lookup.
         */
        if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
            mem_name.m_addr, buf, bufsize, &len)) != 0) {
                kmem_free(buf, bufsize);
                return (err);
        }

        if (len >= mem_name.m_namelen) {
                kmem_free(buf, bufsize);
                return (ENOSPC);
        }

        if (copyoutstr(buf, (char *)mem_name.m_name,
            mem_name.m_namelen, NULL) != 0) {
                kmem_free(buf, bufsize);
                return (EFAULT);
        }

        kmem_free(buf, bufsize);
        return (0);
}

/*
 * Given a syndrome and address, return information about the
 * associated memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
        mem_info_t mem_info;
        int err;

        if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
                return (EFAULT);

        if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
            &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
            &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
                return (err);

        if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
                return (EFAULT);

        return (0);
}

/*
 * Given a memory name, return its associated serial ID.
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
        mem_name_t mem_name;
        void *buf;
        void *name;
        size_t name_len;
        size_t bufsize;
        int len, err;

        if ((bufsize = cpu_get_name_bufsize()) == 0)
                return (ENOTSUP);

        if ((err = mm_read_mem_name(data, &mem_name)) != 0)
                return (err);

        buf = kmem_alloc(bufsize, KM_SLEEP);

        if (mem_name.m_namelen > 1024)
                mem_name.m_namelen = 1024; /* cap at 1024 bytes */

        name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

        if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
            mem_name.m_namelen, &name_len)) != 0) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (err);
        }

        /*
         * Call into cpu specific code to do the lookup.
         */
        if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (err);
        }

        if (len > mem_name.m_sidlen) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (ENAMETOOLONG);
        }

        if (copyoutstr(buf, (char *)mem_name.m_sid,
            mem_name.m_sidlen, NULL) != 0) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (EFAULT);
        }

        kmem_free(buf, bufsize);
        kmem_free(name, mem_name.m_namelen);
        return (0);
}
#endif  /* __sparc */

/*
 * Private ioctls for
 *      libkvm to support kvm_physaddr().
 *      FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
        if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
            (cmd != MEM_VTOP && getminor(dev) != M_MEM))
                return (ENXIO);

        switch (cmd) {
        case MEM_VTOP:
                return (mmioctl_vtop(data));

        case MEM_PAGE_RETIRE:
        case MEM_PAGE_ISRETIRED:
        case MEM_PAGE_UNRETIRE:
        case MEM_PAGE_RETIRE_MCE:
        case MEM_PAGE_RETIRE_UE:
        case MEM_PAGE_GETERRORS:
        case MEM_PAGE_RETIRE_TEST:
                return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
        case MEM_NAME:
                return (mmioctl_get_mem_name(data));

        case MEM_INFO:
                return (mmioctl_get_mem_info(data));

        case MEM_SID:
                return (mmioctl_get_mem_sid(data));
#else
        case MEM_NAME:
        case MEM_INFO:
        case MEM_SID:
                return (ENOTSUP);
#endif  /* __sparc */
        }
        return (ENXIO);
}

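/*
 * mmap(2) entry point: translate an offset on one of the memory minor
 * devices into a page frame number.  Only /dev/mem offsets that fall
 * within the installed physical memory list are mappable here; all
 * other cases fail with -1.
 */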
/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
        pfn_t pf;
        struct memlist *pmem;
        minor_t minor = getminor(dev);

        switch (minor) {
        case M_MEM:
                pf = btop(off);
                memlist_read_lock();
                for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
                        if (pf >= BTOP(pmem->ml_address) &&
                            pf < BTOP(pmem->ml_address + pmem->ml_size)) {
                                memlist_read_unlock();
                                return (impl_obmem_pfnum(pf));
                        }
                }
                memlist_read_unlock();
                break;

        case M_KMEM:
        case M_ALLKMEM:
                /* no longer supported with KPR */
                return (-1);

        case M_FULL:
        case M_ZERO:
                /*
                 * We shouldn't be mmap'ing to /dev/zero here, as
                 * mmsegmap() should have already converted a mapping
                 * request for this device into an anonymous seg_vn
                 * mapping.
                 */
                break;

        }
        return (-1);
}

/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
        struct segvn_crargs vn_a;
        struct segdev_crargs dev_a;
        int error;
        minor_t minor;
        off_t i;

        minor = getminor(dev);

        as_rangelock(as);
        /*
         * No need to worry about vac alignment on /dev/zero
         * since this is a "clone" object that doesn't yet exist.
         */
        error = choose_addr(as, addrp, len, off,
            (minor == M_MEM) || (minor == M_KMEM), flags);
        if (error != 0) {
                as_rangeunlock(as);
                return (error);
        }

        switch (minor) {
        case M_MEM:
                /* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
                if ((flags & MAP_TYPE) != MAP_SHARED) {
                        as_rangeunlock(as);
                        return (EINVAL);
                }

                /*
                 * Check to ensure that the entire range is
                 * legal and we are not trying to map in
                 * more than the device will let us.
                 */
                for (i = 0; i < len; i += PAGESIZE) {
                        if (mmmmap(dev, off + i, maxprot) == -1) {
                                as_rangeunlock(as);
                                return (ENXIO);
                        }
                }

                /*
                 * Use seg_dev segment driver for /dev/mem mapping.
                 */
                dev_a.mapfunc = mmmmap;
                dev_a.dev = dev;
                dev_a.offset = off;
                dev_a.type = (flags & MAP_TYPE);
                dev_a.prot = (uchar_t)prot;
                dev_a.maxprot = (uchar_t)maxprot;
                dev_a.hat_attr = 0;
                /*
                 * Make /dev/mem mappings non-consistent since we can't
                 * alias pages that don't have page structs behind them,
                 * such as kernel stack pages.  If someone mmap()s a
                 * kernel stack page and we give them a TTE with the CV
                 * bit set, a line from that page can get into both alias
                 * positions of the Spitfire D$.  A snoop from another
                 * processor will then invalidate only the first position,
                 * which once sent the kernel (xc_attention) into an
                 * infinite loop at PIL 13 with no interrupts able to come
                 * in.  See bug 1203630.
                 */
                dev_a.hat_flags = HAT_LOAD_NOCONSIST;
                dev_a.devmap_data = NULL;

                error = as_map(as, *addrp, len, segdev_create, &dev_a);
                break;

        case M_ZERO:
                /*
                 * Use seg_vn segment driver for /dev/zero mapping.
                 * Passing in a NULL amp gives us the "cloning" effect.
                 */
                vn_a.vp = NULL;
                vn_a.offset = 0;
                vn_a.type = (flags & MAP_TYPE);
                vn_a.prot = prot;
                vn_a.maxprot = maxprot;
                vn_a.flags = flags & ~MAP_TYPE;
                vn_a.cred = cred;
                vn_a.amp = NULL;
                vn_a.szc = 0;
                vn_a.lgrp_mem_policy_flags = 0;
                error = as_map(as, *addrp, len, segvn_create, &vn_a);
                break;

        case M_KMEM:
        case M_ALLKMEM:
                /* No longer supported with KPR. */
                error = ENXIO;
                break;

        case M_NULL:
                /*
                 * Use seg_dev segment driver for /dev/null mapping.
                 */
                dev_a.mapfunc = mmmmap;
                dev_a.dev = dev;
                dev_a.offset = off;
                dev_a.type = 0;         /* neither PRIVATE nor SHARED */
                dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
                dev_a.hat_attr = 0;
                dev_a.hat_flags = 0;
                error = as_map(as, *addrp, len, segdev_create, &dev_a);
                break;

        default:
                error = ENXIO;
        }

        as_rangeunlock(as);
        return (error);
}

static struct cb_ops mm_cb_ops = {
        mmopen,                 /* open */
        nulldev,                /* close */
        nodev,                  /* strategy */
        nodev,                  /* print */
        nodev,                  /* dump */
        mmread,                 /* read */
        mmwrite,                /* write */
        mmioctl,                /* ioctl */
        nodev,                  /* devmap */
        mmmmap,                 /* mmap */
        mmsegmap,               /* segmap */
        mmchpoll,               /* poll */
        mmpropop,               /* prop_op */
        0,                      /* streamtab */
        D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* refcnt */
        mm_info,                /* get_dev_info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        mm_attach,              /* attach */
        nodev,                  /* detach */
        nodev,                  /* reset */
        &mm_cb_ops,             /* driver operations */
        (struct bus_ops *)0,    /* bus operations */
        NULL,                   /* power */
        ddi_quiesce_not_needed, /* quiesce */
};

static struct modldrv modldrv = {
        &mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
        MODREV_1, &modldrv, NULL
};

int
_init(void)
{
        return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
        return (mod_remove(&modlinkage));
}

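/*
 * The mm:phys_installed raw kstat exports the phys_install memlist as
 * (address, size) uint64_t pairs: mm_kstat_update() sizes the buffer,
 * mm_kstat_snapshot() fills it.
 */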
static int
mm_kstat_update(kstat_t *ksp, int rw)
{
        struct memlist *pmem;
        uint_t count;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        count = 0;
        memlist_read_lock();
        for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
                count++;
        }
        memlist_read_unlock();

        ksp->ks_ndata = count;
        ksp->ks_data_size = count * 2 * sizeof (uint64_t);

        return (0);
}

static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
        struct memlist *pmem;
        struct memunit {
                uint64_t address;
                uint64_t size;
        } *kspmem;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        ksp->ks_snaptime = gethrtime();

        kspmem = (struct memunit *)buf;
        memlist_read_lock();
        for (pmem = phys_install; pmem != NULL;
            pmem = pmem->ml_next, kspmem++) {
                if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
                        break;
                kspmem->address = pmem->ml_address;
                kspmem->size = pmem->ml_size;
        }
        memlist_read_unlock();

        return (0);
}

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
                        return (EFAULT);
        }
#ifdef  _SYSCALL32
        else {
                mem_name32_t mem_name32;

                if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
                        return (EFAULT);
                mem_name->m_addr = mem_name32.m_addr;
                mem_name->m_synd = mem_name32.m_synd;
                mem_name->m_type[0] = mem_name32.m_type[0];
                mem_name->m_type[1] = mem_name32.m_type[1];
                mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
                mem_name->m_namelen = (size_t)mem_name32.m_namelen;
                mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
                mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
        }
#endif  /* _SYSCALL32 */

        return (0);
}