/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/memlist.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif	/* __sparc */

/*
 * Turn a byte length into a pagecount.  The DDI btop takes a
 * 32-bit size on 32-bit machines, this handles 64-bit sizes for
 * large physical-memory 32-bit machines.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define	MM_KMEMLOG_NENTRIES	64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];

/*
 * On kmem/allmem writes, we log information that might be useful in the event
 * that a write is errant (that is, due to operator error) and induces a later
 * problem.  Note that (in particular) in the event of such operator-induced
 * corruption, a search over the kernel address space for the corrupted
 * address will yield the ring buffer entry that recorded the write.  And
 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
 * auditing facility and yes, we learned that the hard way: disturbingly,
 * there exist recommendations for "tuning" the system that involve writing to
 * kernel memory addresses via the kernel debugger, and -- as we discovered --
 * these can easily be applied incorrectly or unsafely, yielding an entirely
 * undebuggable "can't happen" kind of panic.
 */
static void
mm_logkmem(struct uio *uio)
{
	mm_logentry_t *ent;
	proc_t *p = curthread->t_procp;

	mutex_enter(&mm_lock);

	ent = &mm_kmemlog[mm_kmemlogent++];

	if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
		mm_kmemlogent = 0;

	ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
	ent->mle_len = uio->uio_resid;
	gethrestime(&ent->mle_hrestime);
	ent->mle_hrtime = gethrtime();
	ent->mle_pid = p->p_pidp->pid_id;

	(void) strncpy(ent->mle_psargs,
	    p->p_user.u_psargs, sizeof (ent->mle_psargs));

	mutex_exit(&mm_lock);
}

/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int i;
	struct mem_minor {
		char *name;
		minor_t minor;
		int privonly;
		const char *rdpriv;
		const char *wrpriv;
		mode_t priv_mode;
	} mm[] = {
		{ "mem",	M_MEM,		0,		NULL,	"all",	0640 },
		{ "kmem",	M_KMEM,		0,		NULL,	"all",	0640 },
		{ "allkmem",	M_ALLKMEM,	0,		"all",	"all",	0600 },
		{ "null",	M_NULL,		PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "zero",	M_ZERO,		PRIVONLY_DEV,	NULL,	NULL,	0666 },
	};
	kstat_t *ksp;

	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
		    DDI_FAILURE) {
			ddi_remove_minor_node(devi, NULL);
			return (DDI_FAILURE);
		}
	}

	mm_dip = devi;

	ksp = kstat_create("mm", 0, "phys_installed", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = mm_kstat_update;
		ksp->ks_snapshot = mm_kstat_snapshot;
		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}

	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "kmem_io_access", 0);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	register int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)mm_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
	switch (getminor(*devp)) {
	case M_NULL:
	case M_ZERO:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		/* standard devices */
		break;

	default:
		/* Unsupported or unknown type */
		return (EINVAL);
	}
	/* must be character device */
	if (typ != OTYP_CHR)
		return (EINVAL);
	return (0);
}

struct pollhead	mm_pollhd;

/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	switch (getminor(dev)) {
	case M_NULL:
	case M_ZERO:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
		/*
		 * A non-NULL pollhead pointer should be returned in case
		 * the user polls for 0 events.
		 */
		*phpp = !anyyet && !*reventsp ?
		    &mm_pollhd : (struct pollhead *)NULL;
		return (0);
	default:
		/* no other devices currently support polling */
		return (ENXIO);
	}
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
	/*
	 * implement zero size to reduce overhead (avoid two failing
	 * property lookups per stat).
	 */
	return (ddi_prop_op_size(dev, dip, prop_op,
	    flags, name, valuep, lengthp, 0));
}

static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
	int error = 0;
	int devload = 0;
	int is_memory = pf_is_memory(pfn);
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);
	caddr_t va = NULL;

	mutex_enter(&mm_lock);

	if (is_memory && kpm_enable) {
		if (pp)
			va = hat_kpm_mapin(pp, NULL);
		else
			va = hat_kpm_mapin_pfn(pfn);
	}

	if (va == NULL) {
		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
		    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
		va = mm_map;
		devload = 1;
	}

	if (!is_memory) {
		if (allowio) {
			size_t c = uio->uio_iov->iov_len;

			if (ddi_peekpokeio(NULL, uio, rw,
			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
			    sizeof (int32_t)) != DDI_SUCCESS)
				error = EFAULT;
		} else
			error = EIO;
	} else
		error = uiomove(va + pageoff, nbytes, rw, uio);

	if (devload)
		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
	else if (pp)
		hat_kpm_mapout(pp, NULL, va);
	else
		hat_kpm_mapout_pfn(pfn);

	mutex_exit(&mm_lock);
	return (error);
}

static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *seg;
	int i;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, va);
	i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
	AS_LOCK_EXIT(as);

	return (i);
}

#ifdef __sparc

#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */

/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
	pfn_t v;
	struct iovec *iov;
	int error = 0;
	size_t c;
	ssize_t oresid = uio->uio_resid;
	minor_t minor = getminor(dev);

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor) {

		case M_MEM:
			memlist_read_lock();
			if (!address_in_memlist(phys_install,
			    (uint64_t)uio->uio_loffset, 1)) {
				memlist_read_unlock();
				error = EFAULT;
				break;
			}
			memlist_read_unlock();

			v = BTOP((u_offset_t)uio->uio_loffset);
			error = mmio(uio, rw, v,
			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
			break;

		case M_KMEM:
		case M_ALLKMEM:
			{
			page_t **ppp = NULL;
			caddr_t vaddr = (caddr_t)uio->uio_offset;
			int try_lock = NEED_LOCK_KVADDR(vaddr);
			int locked = 0;

			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
				break;

			if (rw == UIO_WRITE)
				mm_logkmem(uio);

			/*
			 * If vaddr does not map a valid page, as_pagelock()
			 * will return failure. Hence we can't check the
			 * return value and return EFAULT here as we'd like.
			 * seg_kp and seg_kpm do not properly support
			 * as_pagelock() for this context so we avoid it
			 * using the try_lock set check above. Some day when
			 * the kernel page locking gets redesigned all this
			 * muck can be cleaned up.
			 */
			if (try_lock)
				locked = (as_pagelock(&kas, &ppp, vaddr,
				    PAGESIZE, S_WRITE) == 0);

			v = hat_getpfnum(kas.a_hat,
			    (caddr_t)(uintptr_t)uio->uio_loffset);
			if (v == PFN_INVALID) {
				if (locked)
					as_pageunlock(&kas, ppp, vaddr,
					    PAGESIZE, S_WRITE);
				error = EFAULT;
				break;
			}

			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
			    minor == M_ALLKMEM || mm_kmem_io_access,
			    (locked && ppp) ? *ppp : NULL);
			if (locked)
				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
				    S_WRITE);
			}

			break;

		case M_ZERO:
			if (rw == UIO_READ) {
				label_t ljb;

				if (on_fault(&ljb)) {
					no_fault();
					error = EFAULT;
					break;
				}
				uzero(iov->iov_base, iov->iov_len);
				no_fault();
				uio->uio_resid -= iov->iov_len;
				uio->uio_loffset += iov->iov_len;
				break;
			}
			/* else it's a write, fall through to NULL case */
			/*FALLTHROUGH*/

		case M_NULL:
			if (rw == UIO_READ)
				return (0);
			c = iov->iov_len;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_loffset += c;
			uio->uio_resid -= c;
			break;

		}
	}
	return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_WRITE, cred));
}

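/*
 * Illustrative only (not part of the driver): a minimal user-space sketch of
 * how a sufficiently privileged consumer might read a physical address
 * through /dev/mem, which mmread()/mmrw() above service by validating the
 * file offset against phys_install and then mapping the frame in mmio().
 * The example physical address and buffer size are assumptions for the
 * sketch.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		uint64_t pa = 0x1000;		// hypothetical physical address
 *		char buf[512];
 *		int fd = open("/dev/mem", O_RDONLY);
 *
 *		if (fd == -1)
 *			return (1);
 *		// The file offset is interpreted as a physical address.
 *		if (pread(fd, buf, sizeof (buf), (off_t)pa) == -1)
 *			perror("pread");
 *		(void) close(fd);
 *		return (0);
 *	}
 */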
/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
	mem_vtop32_t vtop32;
#endif
	mem_vtop_t mem_vtop;
	proc_t *p;
	pfn_t pfn = (pfn_t)PFN_INVALID;
	pid_t pid = 0;
	struct as *as;
	struct seg *seg;

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
			return (EFAULT);
		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

		if (mem_vtop.m_as != NULL)
			return (EINVAL);
	}
#endif

	if (mem_vtop.m_as == &kas) {
		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
	} else {
		if (mem_vtop.m_as == NULL) {
			/*
			 * Assume the calling process's address space if the
			 * caller didn't specify one.
			 */
			p = curthread->t_procp;
			if (p == NULL)
				return (EIO);
			mem_vtop.m_as = p->p_as;
		}

		mutex_enter(&pidlock);
		for (p = practive; p != NULL; p = p->p_next) {
			if (p->p_as == mem_vtop.m_as) {
				pid = p->p_pid;
				break;
			}
		}
		mutex_exit(&pidlock);
		if (p == NULL)
			return (EIO);
		p = sprlock(pid);
		if (p == NULL)
			return (EIO);
		as = p->p_as;
		if (as == mem_vtop.m_as) {
			mutex_exit(&p->p_lock);
			AS_LOCK_ENTER(as, RW_READER);
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg))
				if ((uintptr_t)mem_vtop.m_va -
				    (uintptr_t)seg->s_base < seg->s_size)
					break;
			if (seg != NULL)
				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
			AS_LOCK_EXIT(as);
			mutex_enter(&p->p_lock);
		}
		sprunlock(p);
	}
	mem_vtop.m_pfn = pfn;
	if (pfn == PFN_INVALID)
		return (EIO);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		vtop32.m_pfn = mem_vtop.m_pfn;
		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
			return (EFAULT);
	}
#endif

	return (0);
}

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
	extern int page_retire_test(void);
	uint64_t pa;

	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
		return (EFAULT);
	}

	switch (cmd) {
	case MEM_PAGE_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_UNRETIRE:
		return (page_unretire(pa));

	case MEM_PAGE_RETIRE:
		return (page_retire(pa, PR_FMA));

	case MEM_PAGE_RETIRE_MCE:
		return (page_retire(pa, PR_MCE));

	case MEM_PAGE_RETIRE_UE:
		return (page_retire(pa, PR_UE));

	case MEM_PAGE_GETERRORS:
		{
			uint64_t page_errors;
			int rc = page_retire_check(pa, &page_errors);
			if (copyout(&page_errors, (void *)data,
			    sizeof (uint64_t))) {
				return (EFAULT);
			}
			return (rc);
		}

	case MEM_PAGE_RETIRE_TEST:
		return (page_retire_test());

	}

	return (EINVAL);
}

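/*
 * Illustrative only (not part of the driver): a minimal user-space sketch of
 * driving the MEM_VTOP ioctl handled by mmioctl_vtop() above.  Per the minor
 * check in mmioctl() below, MEM_VTOP is accepted on /dev/kmem, while the
 * MEM_PAGE_* retire commands are accepted on /dev/mem.  The probed address
 * and header availability in user space are assumptions for the sketch.
 *
 *	#include <sys/mem.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		mem_vtop_t vtop;
 *		int fd = open("/dev/kmem", O_RDONLY);
 *
 *		if (fd == -1)
 *			return (1);
 *		vtop.m_as = NULL;		// NULL means the caller's as
 *		vtop.m_va = (void *)&fd;	// translate a stack address
 *		if (ioctl(fd, MEM_VTOP, &vtop) == 0)
 *			(void) printf("pfn = %lu\n", (ulong_t)vtop.m_pfn);
 *		(void) close(fd);
 *		return (0);
 *	}
 */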
#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		return (err);
	}

	if (len >= mem_name.m_namelen) {
		kmem_free(buf, bufsize);
		return (ENOSPC);
	}

	if (copyoutstr(buf, (char *)mem_name.m_name,
	    mem_name.m_namelen, NULL) != 0) {
		kmem_free(buf, bufsize);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	return (0);
}

/*
 * Given a syndrome and address return information about the associated memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
	mem_info_t mem_info;
	int err;

	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
		return (EFAULT);

	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
		return (err);

	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Given a memory name, return its associated serial id
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	void *name;
	size_t name_len;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	if (mem_name.m_namelen > 1024)
		mem_name.m_namelen = 1024; /* cap at 1024 bytes */

	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
	    mem_name.m_namelen, &name_len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	if (len > mem_name.m_sidlen) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_sid,
	    mem_name.m_sidlen, NULL) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	kmem_free(name, mem_name.m_namelen);
	return (0);
}
#endif	/* __sparc */

/*
 * Private ioctls for
 *	libkvm to support kvm_physaddr().
 *	FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
		return (ENXIO);

	switch (cmd) {
	case MEM_VTOP:
		return (mmioctl_vtop(data));

	case MEM_PAGE_RETIRE:
	case MEM_PAGE_ISRETIRED:
	case MEM_PAGE_UNRETIRE:
	case MEM_PAGE_RETIRE_MCE:
	case MEM_PAGE_RETIRE_UE:
	case MEM_PAGE_GETERRORS:
	case MEM_PAGE_RETIRE_TEST:
		return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
	case MEM_NAME:
		return (mmioctl_get_mem_name(data));

	case MEM_INFO:
		return (mmioctl_get_mem_info(data));

	case MEM_SID:
		return (mmioctl_get_mem_sid(data));
#else
	case MEM_NAME:
	case MEM_INFO:
	case MEM_SID:
		return (ENOTSUP);
#endif	/* __sparc */
	}
	return (ENXIO);
}

/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
	pfn_t pf;
	struct memlist *pmem;
	minor_t minor = getminor(dev);

	switch (minor) {
	case M_MEM:
		pf = btop(off);
		memlist_read_lock();
		for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
			if (pf >= BTOP(pmem->ml_address) &&
			    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
				memlist_read_unlock();
				return (impl_obmem_pfnum(pf));
			}
		}
		memlist_read_unlock();
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* no longer supported with KPR */
		return (-1);

	case M_ZERO:
		/*
		 * We shouldn't be mmap'ing to /dev/zero here as
		 * mmsegmap() should have already converted
		 * a mapping request for this device to a mapping
		 * using seg_vn for anonymous memory.
		 */
		break;

	}
	return (-1);
}

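/*
 * Illustrative only (not part of the driver): a minimal user-space sketch of
 * the mapping style handled by the M_ZERO case of mmsegmap() below, where a
 * /dev/zero mapping is converted into an anonymous seg_vn mapping.  By
 * contrast, a /dev/mem mapping must be MAP_SHARED and is validated page by
 * page against mmmmap() above.  The mapping length is an assumption for the
 * sketch.
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int zfd = open("/dev/zero", O_RDWR);
 *		void *anon;
 *
 *		if (zfd == -1)
 *			return (1);
 *		// Private mapping of /dev/zero: anonymous, zero-filled pages.
 *		anon = mmap(NULL, 8192, PROT_READ | PROT_WRITE,
 *		    MAP_PRIVATE, zfd, 0);
 *		if (anon == MAP_FAILED)
 *			return (1);
 *		(void) munmap(anon, 8192);
 *		(void) close(zfd);
 *		return (0);
 *	}
 */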
/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
	struct segvn_crargs vn_a;
	struct segdev_crargs dev_a;
	int error;
	minor_t minor;
	off_t i;

	minor = getminor(dev);

	as_rangelock(as);
	/*
	 * No need to worry about vac alignment on /dev/zero
	 * since this is a "clone" object that doesn't yet exist.
	 */
	error = choose_addr(as, addrp, len, off,
	    (minor == M_MEM) || (minor == M_KMEM), flags);
	if (error != 0) {
		as_rangeunlock(as);
		return (error);
	}

	switch (minor) {
	case M_MEM:
		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
		if ((flags & MAP_TYPE) != MAP_SHARED) {
			as_rangeunlock(as);
			return (EINVAL);
		}

		/*
		 * Check to ensure that the entire range is
		 * legal and we are not trying to map in
		 * more than the device will let us.
		 */
		for (i = 0; i < len; i += PAGESIZE) {
			if (mmmmap(dev, off + i, maxprot) == -1) {
				as_rangeunlock(as);
				return (ENXIO);
			}
		}

		/*
		 * Use seg_dev segment driver for /dev/mem mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = (flags & MAP_TYPE);
		dev_a.prot = (uchar_t)prot;
		dev_a.maxprot = (uchar_t)maxprot;
		dev_a.hat_attr = 0;

		/*
		 * Make /dev/mem mappings non-consistent since we can't
		 * alias pages that don't have page structs behind them,
		 * such as kernel stack pages. If someone mmap()s a kernel
		 * stack page and if we give him a tte with cv, a line from
		 * that page can get into both pages of the spitfire d$.
		 * But snoop from another processor will only invalidate
		 * the first page. This later caused kernel (xc_attention)
		 * to go into an infinite loop at pil 13 and no interrupts
		 * could come in. See 1203630.
		 *
		 */
		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
		dev_a.devmap_data = NULL;

		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	case M_ZERO:
		/*
		 * Use seg_vn segment driver for /dev/zero mapping.
		 * Passing in a NULL amp gives us the "cloning" effect.
		 */
		vn_a.vp = NULL;
		vn_a.offset = 0;
		vn_a.type = (flags & MAP_TYPE);
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.flags = flags & ~MAP_TYPE;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		vn_a.szc = 0;
		vn_a.lgrp_mem_policy_flags = 0;
		error = as_map(as, *addrp, len, segvn_create, &vn_a);
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* No longer supported with KPR. */
		error = ENXIO;
		break;

	case M_NULL:
		/*
		 * Use seg_dev segment driver for /dev/null mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
		dev_a.hat_attr = 0;
		dev_a.hat_flags = 0;
		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	default:
		error = ENXIO;
	}

	as_rangeunlock(as);
	return (error);
}

static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0,	/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, { &modldrv, NULL }
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

static int
mm_kstat_update(kstat_t *ksp, int rw)
{
	struct memlist *pmem;
	uint_t count;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	count = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
		count++;
	}
	memlist_read_unlock();

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}

static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct memlist *pmem;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL;
	    pmem = pmem->ml_next, kspmem++) {
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = pmem->ml_address;
		kspmem->size = pmem->ml_size;
	}
	memlist_read_unlock();

	return (0);
}

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		mem_name32_t mem_name32;

		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
			return (EFAULT);
		mem_name->m_addr = mem_name32.m_addr;
		mem_name->m_synd = mem_name32.m_synd;
		mem_name->m_type[0] = mem_name32.m_type[0];
		mem_name->m_type[1] = mem_name32.m_type[1];
		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
	}
#endif	/* _SYSCALL32 */

	return (0);
}
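/*
 * Illustrative only (not part of the driver): a minimal user-space sketch of
 * reading the mm:0:phys_installed raw kstat exported by mm_kstat_update() and
 * mm_kstat_snapshot() above.  Each entry is a pair of uint64_t values
 * (address, size), one per phys_install memlist segment; the anonymous struct
 * below is local to the sketch, and the program would be linked with -lkstat.
 *
 *	#include <kstat.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		kstat_ctl_t *kc = kstat_open();
 *		kstat_t *ksp;
 *		struct { uint64_t address, size; } *mu;
 *		uint_t i;
 *
 *		if (kc == NULL)
 *			return (1);
 *		ksp = kstat_lookup(kc, "mm", 0, "phys_installed");
 *		if (ksp == NULL || kstat_read(kc, ksp, NULL) == -1)
 *			return (1);
 *		mu = ksp->ks_data;
 *		for (i = 0; i < ksp->ks_ndata; i++)
 *			(void) printf("%llx %llx\n",
 *			    (u_longlong_t)mu[i].address,
 *			    (u_longlong_t)mu[i].size);
 *		(void) kstat_close(kc);
 *		return (0);
 *	}
 */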