/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
 * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/memlist.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif	/* __sparc */

/*
 * Turn a byte length into a pagecount.  The DDI btop takes a
 * 32-bit size on 32-bit machines; this handles 64-bit sizes for
 * large physical-memory 32-bit machines.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))
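
/*
 * For illustration only (not from the original source): with a 4 KB base
 * page (_pageshift == 12), an offset such as 0x240001000 -- a value that
 * does not fit in 32 bits -- shifts down to page frame 0x240001:
 *
 *	BTOP(0x240001000ULL) == (pgcnt_t)(0x240001000ULL >> 12) == 0x240001
 *
 * The regular btop() would have truncated such an offset on a 32-bit kernel.
 */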

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define	MM_KMEMLOG_NENTRIES	64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];

/*
 * On kmem/allkmem writes, we log information that might be useful in the event
 * that a write is errant (that is, due to operator error) and induces a later
 * problem.  Note that (in particular) in the event of such operator-induced
 * corruption, a search over the kernel address space for the corrupted
 * address will yield the ring buffer entry that recorded the write.  And
 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
 * auditing facility and yes, we learned that the hard way: disturbingly,
 * there exist recommendations for "tuning" the system that involve writing to
 * kernel memory addresses via the kernel debugger, and -- as we discovered --
 * these can easily be applied incorrectly or unsafely, yielding an entirely
 * undebuggable "can't happen" kind of panic.
 */
static void
mm_logkmem(struct uio *uio)
{
	mm_logentry_t *ent;
	proc_t *p = curthread->t_procp;

	mutex_enter(&mm_lock);

	ent = &mm_kmemlog[mm_kmemlogent++];

	if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
		mm_kmemlogent = 0;

	ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
	ent->mle_len = uio->uio_resid;
	gethrestime(&ent->mle_hrestime);
	ent->mle_hrtime = gethrtime();
	ent->mle_pid = p->p_pidp->pid_id;

	(void) strncpy(ent->mle_psargs,
	    p->p_user.u_psargs, sizeof (ent->mle_psargs));

	mutex_exit(&mm_lock);
}
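
/*
 * A hypothetical post-mortem workflow (not from the original source): from a
 * kernel debugger session ("mdb -k" on a live system or a crash dump), a
 * "::kgrep <corrupted-address>" should turn up the mm_kmemlog[] entry whose
 * mle_vaddr recorded the errant write; printing that entry (e.g. with
 * ::print mm_logentry_t) then recovers the pid, psargs and timestamps of
 * the writer.
 */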

/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int i;
	struct mem_minor {
		char *name;
		minor_t minor;
		int privonly;
		const char *rdpriv;
		const char *wrpriv;
		mode_t priv_mode;
	} mm[] = {
		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "zero",	M_ZERO,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "full",	M_FULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
	};
	kstat_t *ksp;

	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
		    DDI_FAILURE) {
			ddi_remove_minor_node(devi, NULL);
			return (DDI_FAILURE);
		}
	}

	mm_dip = devi;

	ksp = kstat_create("mm", 0, "phys_installed", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = mm_kstat_update;
		ksp->ks_snapshot = mm_kstat_snapshot;
		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}

	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "kmem_io_access", 0);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	register int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)mm_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
	switch (getminor(*devp)) {
	case M_NULL:
	case M_ZERO:
	case M_FULL:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		/* standard devices */
		break;

	default:
		/* Unsupported or unknown type */
		return (EINVAL);
	}
	/* must be character device */
	if (typ != OTYP_CHR)
		return (EINVAL);
	return (0);
}

struct pollhead	mm_pollhd;

/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	switch (getminor(dev)) {
	case M_NULL:
	case M_ZERO:
	case M_FULL:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
		/*
		 * A non-NULL pollhead pointer should be returned in case
		 * the user polls for 0 events.
		 */
		*phpp = !anyyet && !*reventsp ?
		    &mm_pollhd : (struct pollhead *)NULL;
		return (0);
	default:
		/* no other devices currently support polling */
		return (ENXIO);
	}
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
	/*
	 * Implement zero size to reduce overhead (avoid two failing
	 * property lookups per stat).
	 */
	return (ddi_prop_op_size(dev, dip, prop_op,
	    flags, name, valuep, lengthp, 0));
}

static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
	int error = 0;
	int devload = 0;
	int is_memory = pf_is_memory(pfn);
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);
	caddr_t va = NULL;

	mutex_enter(&mm_lock);

	if (is_memory && kpm_enable) {
		if (pp)
			va = hat_kpm_mapin(pp, NULL);
		else
			va = hat_kpm_mapin_pfn(pfn);
	}

	if (va == NULL) {
		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
		    (uint_t)(rw == UIO_READ ? PROT_READ :
		    PROT_READ|PROT_WRITE),
		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
		va = mm_map;
		devload = 1;
	}

	if (!is_memory) {
		if (allowio) {
			size_t c = uio->uio_iov->iov_len;

			if (ddi_peekpokeio(NULL, uio, rw,
			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
			    sizeof (int32_t)) != DDI_SUCCESS)
				error = EFAULT;
		} else
			error = EIO;
	} else
		error = uiomove(va + pageoff, nbytes, rw, uio);

	if (devload)
		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
	else if (pp)
		hat_kpm_mapout(pp, NULL, va);
	else
		hat_kpm_mapout_pfn(pfn);

	mutex_exit(&mm_lock);
	return (error);
}

static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *seg;
	int i;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, va);
	i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
	AS_LOCK_EXIT(as);

	return (i);
}

#ifdef __sparc

#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */

/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
	pfn_t v;
	struct iovec *iov;
	int error = 0;
	size_t c;
	ssize_t oresid = uio->uio_resid;
	minor_t minor = getminor(dev);

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor) {

		case M_MEM:
			memlist_read_lock();
			if (!address_in_memlist(phys_install,
			    (uint64_t)uio->uio_loffset, 1)) {
				memlist_read_unlock();
				error = EFAULT;
				break;
			}
			memlist_read_unlock();

			v = BTOP((u_offset_t)uio->uio_loffset);
			error = mmio(uio, rw, v,
			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
			break;

		case M_KMEM:
		case M_ALLKMEM:
		{
			page_t **ppp = NULL;
			caddr_t vaddr = (caddr_t)uio->uio_offset;
			int try_lock = NEED_LOCK_KVADDR(vaddr);
			int locked = 0;

			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
				break;

			if (rw == UIO_WRITE)
				mm_logkmem(uio);

			/*
			 * If vaddr does not map a valid page, as_pagelock()
			 * will return failure.  Hence we can't check the
			 * return value and return EFAULT here as we'd like.
			 * seg_kp and seg_kpm do not properly support
			 * as_pagelock() for this context so we avoid it
			 * using the try_lock set check above.  Some day when
			 * the kernel page locking gets redesigned all this
			 * muck can be cleaned up.
			 */
			if (try_lock)
				locked = (as_pagelock(&kas, &ppp, vaddr,
				    PAGESIZE, S_WRITE) == 0);

			v = hat_getpfnum(kas.a_hat,
			    (caddr_t)(uintptr_t)uio->uio_loffset);
			if (v == PFN_INVALID) {
				if (locked)
					as_pageunlock(&kas, ppp, vaddr,
					    PAGESIZE, S_WRITE);
				error = EFAULT;
				break;
			}

			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
			    minor == M_ALLKMEM || mm_kmem_io_access,
			    (locked && ppp) ? *ppp : NULL);
			if (locked)
				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
				    S_WRITE);
		}

			break;

		case M_FULL:
			if (rw == UIO_WRITE) {
				error = ENOSPC;
				break;
			}
			/* else it's a read, fall through to zero case */
			/*FALLTHROUGH*/

		case M_ZERO:
			if (rw == UIO_READ) {
				label_t ljb;

				if (on_fault(&ljb)) {
					no_fault();
					error = EFAULT;
					break;
				}
				uzero(iov->iov_base, iov->iov_len);
				no_fault();
				uio->uio_resid -= iov->iov_len;
				uio->uio_loffset += iov->iov_len;
				break;
			}
			/* else it's a write, fall through to NULL case */
			/*FALLTHROUGH*/

		case M_NULL:
			if (rw == UIO_READ)
				return (0);
			c = iov->iov_len;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_loffset += c;
			uio->uio_resid -= c;
			break;

		}
	}
	return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_WRITE, cred));
}
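
/*
 * An illustrative user-level sketch (not part of the driver): the M_ZERO,
 * M_FULL and M_NULL arms of mmrw() above are what give these devices their
 * familiar semantics -- reads of /dev/zero (and /dev/full) zero-fill the
 * caller's buffer, reads of /dev/null return EOF, writes to /dev/null are
 * silently consumed, and writes to /dev/full fail with ENOSPC:
 *
 *	char buf[8192];
 *	int zfd = open("/dev/zero", O_RDONLY);
 *	int nfd = open("/dev/null", O_RDWR);
 *
 *	(void) read(zfd, buf, sizeof (buf));	   buf is now all zeros
 *	ssize_t n = read(nfd, buf, sizeof (buf));  n == 0 (EOF)
 *	(void) write(nfd, buf, sizeof (buf));	   succeeds; data is discarded
 */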

/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
	mem_vtop32_t vtop32;
#endif
	mem_vtop_t mem_vtop;
	proc_t *p;
	pfn_t pfn = (pfn_t)PFN_INVALID;
	pid_t pid = 0;
	struct as *as;
	struct seg *seg;

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
			return (EFAULT);
		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

		if (mem_vtop.m_as != NULL)
			return (EINVAL);
	}
#endif

	if (mem_vtop.m_as == &kas) {
		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
	} else {
		if (mem_vtop.m_as == NULL) {
			/*
			 * Assume the calling process's address space if the
			 * caller didn't specify one.
			 */
			p = curthread->t_procp;
			if (p == NULL)
				return (EIO);
			mem_vtop.m_as = p->p_as;
		}

		mutex_enter(&pidlock);
		for (p = practive; p != NULL; p = p->p_next) {
			if (p->p_as == mem_vtop.m_as) {
				pid = p->p_pid;
				break;
			}
		}
		mutex_exit(&pidlock);
		if (p == NULL)
			return (EIO);
		p = sprlock(pid);
		if (p == NULL)
			return (EIO);
		as = p->p_as;
		if (as == mem_vtop.m_as) {
			mutex_exit(&p->p_lock);
			AS_LOCK_ENTER(as, RW_READER);
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg))
				if ((uintptr_t)mem_vtop.m_va -
				    (uintptr_t)seg->s_base < seg->s_size)
					break;
			if (seg != NULL)
				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
			AS_LOCK_EXIT(as);
			mutex_enter(&p->p_lock);
		}
		sprunlock(p);
	}
	mem_vtop.m_pfn = pfn;
	if (pfn == PFN_INVALID)
		return (EIO);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		vtop32.m_pfn = mem_vtop.m_pfn;
		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
			return (EFAULT);
	}
#endif

	return (0);
}
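
/*
 * An illustrative user-level sketch (not part of the driver): libkvm's
 * kvm_physaddr() is the intended consumer of MEM_VTOP, which mmioctl()
 * below accepts only on the /dev/kmem minor.  A privileged caller fills in
 * a mem_vtop_t (m_as == NULL means "the calling process's own address
 * space") and gets the page frame number back in m_pfn:
 *
 *	#include <sys/mem.h>
 *
 *	mem_vtop_t vtop;
 *	int fd = open("/dev/kmem", O_RDONLY);
 *
 *	vtop.m_as = NULL;
 *	vtop.m_va = some_mapped_address;	   hypothetical VA
 *	if (ioctl(fd, MEM_VTOP, &vtop) == 0)
 *		(void) printf("pfn %lx\n", vtop.m_pfn);
 */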

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
	extern int page_retire_test(void);
	uint64_t pa;

	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
		return (EFAULT);
	}

	switch (cmd) {
	case MEM_PAGE_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_UNRETIRE:
		return (page_unretire(pa));

	case MEM_PAGE_RETIRE:
		return (page_retire(pa, PR_FMA));

	case MEM_PAGE_RETIRE_MCE:
		return (page_retire(pa, PR_MCE));

	case MEM_PAGE_RETIRE_UE:
		return (page_retire(pa, PR_UE));

	case MEM_PAGE_GETERRORS:
		{
			uint64_t page_errors;
			int rc = page_retire_check(pa, &page_errors);
			if (copyout(&page_errors, (void *)data,
			    sizeof (uint64_t))) {
				return (EFAULT);
			}
			return (rc);
		}

	case MEM_PAGE_RETIRE_TEST:
		return (page_retire_test());

	}

	return (EINVAL);
}
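
/*
 * An illustrative user-level sketch (not part of the driver): the retire
 * ioctls are accepted only on the /dev/mem minor (see mmioctl() below) and
 * take a physical address by reference as a uint64_t.  The FMA retire agent
 * is the intended consumer; a hand-rolled caller might look like:
 *
 *	uint64_t pa = 0x12345000;	   hypothetical physical address
 *	int fd = open("/dev/mem", O_RDONLY);
 *
 *	(void) ioctl(fd, MEM_PAGE_RETIRE, &pa);
 *	if (ioctl(fd, MEM_PAGE_ISRETIRED, &pa) == 0)
 *		...			   page_retire_check() says retired
 *
 * MEM_PAGE_GETERRORS overwrites the passed-in uint64_t with the page's
 * error mask on the way out.
 */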

#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		return (err);
	}

	if (len >= mem_name.m_namelen) {
		kmem_free(buf, bufsize);
		return (ENOSPC);
	}

	if (copyoutstr(buf, (char *)mem_name.m_name,
	    mem_name.m_namelen, NULL) != 0) {
		kmem_free(buf, bufsize);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	return (0);
}

/*
 * Given a syndrome and address return information about the associated memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
	mem_info_t mem_info;
	int err;

	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
		return (EFAULT);

	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
		return (err);

	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Given a memory name, return its associated serial id
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	void *name;
	size_t name_len;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	if (mem_name.m_namelen > 1024)
		mem_name.m_namelen = 1024; /* cap at 1024 bytes */

	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
	    mem_name.m_namelen, &name_len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	if (len > mem_name.m_sidlen) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_sid,
	    mem_name.m_sidlen, NULL) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	kmem_free(name, mem_name.m_namelen);
	return (0);
}
#endif	/* __sparc */

/*
 * Private ioctls for
 *	libkvm to support kvm_physaddr().
 *	FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
		return (ENXIO);

	switch (cmd) {
	case MEM_VTOP:
		return (mmioctl_vtop(data));

	case MEM_PAGE_RETIRE:
	case MEM_PAGE_ISRETIRED:
	case MEM_PAGE_UNRETIRE:
	case MEM_PAGE_RETIRE_MCE:
	case MEM_PAGE_RETIRE_UE:
	case MEM_PAGE_GETERRORS:
	case MEM_PAGE_RETIRE_TEST:
		return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
	case MEM_NAME:
		return (mmioctl_get_mem_name(data));

	case MEM_INFO:
		return (mmioctl_get_mem_info(data));

	case MEM_SID:
		return (mmioctl_get_mem_sid(data));
#else
	case MEM_NAME:
	case MEM_INFO:
	case MEM_SID:
		return (ENOTSUP);
#endif	/* __sparc */
	}
	return (ENXIO);
}

/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
	pfn_t pf;
	struct memlist *pmem;
	minor_t minor = getminor(dev);

	switch (minor) {
	case M_MEM:
		pf = btop(off);
		memlist_read_lock();
		for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
			if (pf >= BTOP(pmem->ml_address) &&
			    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
				memlist_read_unlock();
				return (impl_obmem_pfnum(pf));
			}
		}
		memlist_read_unlock();
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* no longer supported with KPR */
		return (-1);

	case M_FULL:
	case M_ZERO:
		/*
		 * We shouldn't be mmap'ing to /dev/zero here as
		 * mmsegmap() should have already converted
		 * a mapping request for this device to a mapping
		 * using seg_vn for anonymous memory.
		 */
		break;

	}
	return (-1);
}

/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
	struct segvn_crargs vn_a;
	struct segdev_crargs dev_a;
	int error;
	minor_t minor;
	off_t i;

	minor = getminor(dev);

	as_rangelock(as);
	/*
	 * No need to worry about vac alignment on /dev/zero
	 * since this is a "clone" object that doesn't yet exist.
	 */
	error = choose_addr(as, addrp, len, off,
	    (minor == M_MEM) || (minor == M_KMEM), flags);
	if (error != 0) {
		as_rangeunlock(as);
		return (error);
	}

	switch (minor) {
	case M_MEM:
		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
		if ((flags & MAP_TYPE) != MAP_SHARED) {
			as_rangeunlock(as);
			return (EINVAL);
		}

		/*
		 * Check to ensure that the entire range is
		 * legal and we are not trying to map in
		 * more than the device will let us.
		 */
		for (i = 0; i < len; i += PAGESIZE) {
			if (mmmmap(dev, off + i, maxprot) == -1) {
				as_rangeunlock(as);
				return (ENXIO);
			}
		}

		/*
		 * Use seg_dev segment driver for /dev/mem mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = (flags & MAP_TYPE);
		dev_a.prot = (uchar_t)prot;
		dev_a.maxprot = (uchar_t)maxprot;
		dev_a.hat_attr = 0;

		/*
		 * Make /dev/mem mappings non-consistent since we can't
		 * alias pages that don't have page structs behind them,
		 * such as kernel stack pages.  If someone mmap()s a kernel
		 * stack page and if we give him a tte with cv, a line from
		 * that page can get into both pages of the spitfire d$.
		 * But a snoop from another processor will only invalidate
		 * the first page.  This later caused the kernel
		 * (xc_attention) to go into an infinite loop at pil 13 and
		 * no interrupts could come in.  See 1203630.
		 */
		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
		dev_a.devmap_data = NULL;

		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	case M_ZERO:
		/*
		 * Use seg_vn segment driver for /dev/zero mapping.
		 * Passing in a NULL amp gives us the "cloning" effect.
		 */
		vn_a.vp = NULL;
		vn_a.offset = 0;
		vn_a.type = (flags & MAP_TYPE);
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.flags = flags & ~MAP_TYPE;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		vn_a.szc = 0;
		vn_a.lgrp_mem_policy_flags = 0;
		error = as_map(as, *addrp, len, segvn_create, &vn_a);
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* No longer supported with KPR. */
		error = ENXIO;
		break;

	case M_NULL:
		/*
		 * Use seg_dev segment driver for /dev/null mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
		dev_a.hat_attr = 0;
		dev_a.hat_flags = 0;
		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	default:
		error = ENXIO;
	}

	as_rangeunlock(as);
	return (error);
}
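
/*
 * An illustrative user-level sketch (not part of the driver): the M_ZERO arm
 * above is what makes the classic pre-MAP_ANON idiom work -- a private
 * mapping of /dev/zero becomes demand-zero anonymous memory backed by seg_vn:
 *
 *	int fd = open("/dev/zero", O_RDWR);
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_PRIVATE, fd, 0);
 *
 * A mapping of /dev/mem, by contrast, must be MAP_SHARED and is set up as a
 * non-consistent seg_dev mapping of the underlying physical pages.
 */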

static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0,	/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

static int
mm_kstat_update(kstat_t *ksp, int rw)
{
	struct memlist *pmem;
	uint_t count;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	count = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
		count++;
	}
	memlist_read_unlock();

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}

static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct memlist *pmem;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL;
	    pmem = pmem->ml_next, kspmem++) {
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = pmem->ml_address;
		kspmem->size = pmem->ml_size;
	}
	memlist_read_unlock();

	return (0);
}
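
/*
 * An illustrative user-level sketch (not part of the driver): the
 * "phys_installed" kstat published in mm_attach() is a raw kstat whose data
 * is an array of { address, size } uint64_t pairs, one per phys_install
 * memlist entry.  A libkstat consumer might read it roughly like this:
 *
 *	#include <kstat.h>
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "mm", 0, "phys_installed");
 *
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		uint64_t *mem = ksp->ks_data;
 *		uint_t i;
 *
 *		for (i = 0; i < ksp->ks_ndata; i++)
 *			(void) printf("%llx %llx\n",
 *			    (u_longlong_t)mem[2 * i],
 *			    (u_longlong_t)mem[2 * i + 1]);
 *	}
 */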

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
			return (EFAULT);
	}
#ifdef	_SYSCALL32
	else {
		mem_name32_t mem_name32;

		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
			return (EFAULT);
		mem_name->m_addr = mem_name32.m_addr;
		mem_name->m_synd = mem_name32.m_synd;
		mem_name->m_type[0] = mem_name32.m_type[0];
		mem_name->m_type[1] = mem_name32.m_type[1];
		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
	}
#endif	/* _SYSCALL32 */

	return (0);
}