/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
 * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif	/* __sparc */

/*
 * Turn a byte length into a pagecount. The DDI btop takes a
 * 32-bit size on 32-bit machines; this handles 64-bit sizes for
 * large physical-memory 32-bit machines.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define	MM_KMEMLOG_NENTRIES	64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];

/*
 * On kmem/allmem writes, we log information that might be useful in the event
 * that a write is errant (that is, due to operator error) and induces a later
 * problem. Note that (in particular) in the event of such operator-induced
 * corruption, a search over the kernel address space for the corrupted
 * address will yield the ring buffer entry that recorded the write. And
 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
 * auditing facility and yes, we learned that the hard way: disturbingly,
 * there exist recommendations for "tuning" the system that involve writing to
 * kernel memory addresses via the kernel debugger, and -- as we discovered --
 * these can easily be applied incorrectly or unsafely, yielding an entirely
 * undebuggable "can't happen" kind of panic.
 */
static void
mm_logkmem(struct uio *uio)
{
	mm_logentry_t *ent;
	proc_t *p = curthread->t_procp;

	mutex_enter(&mm_lock);

	ent = &mm_kmemlog[mm_kmemlogent++];

	if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
		mm_kmemlogent = 0;

	ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
	ent->mle_len = uio->uio_resid;
	gethrestime(&ent->mle_hrestime);
	ent->mle_hrtime = gethrtime();
	ent->mle_pid = p->p_pidp->pid_id;

	(void) strncpy(ent->mle_psargs,
	    p->p_user.u_psargs, sizeof (ent->mle_psargs));

	mutex_exit(&mm_lock);
}

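/*
 * Attach: create the minor device nodes, install the "phys_installed"
 * kstat describing physical memory, and read the optional kmem_io_access
 * property (it allows /dev/kmem access to I/O pages; see mmio()).
 */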
/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int i;
	struct mem_minor {
		char *name;
		minor_t minor;
		int privonly;
		const char *rdpriv;
		const char *wrpriv;
		mode_t priv_mode;
	} mm[] = {
		{ "mem", M_MEM, 0, NULL, "all", 0640 },
		{ "kmem", M_KMEM, 0, NULL, "all", 0640 },
		{ "allkmem", M_ALLKMEM, 0, "all", "all", 0600 },
		{ "null", M_NULL, PRIVONLY_DEV, NULL, NULL, 0666 },
		{ "zero", M_ZERO, PRIVONLY_DEV, NULL, NULL, 0666 },
		{ "full", M_FULL, PRIVONLY_DEV, NULL, NULL, 0666 },
	};
	kstat_t *ksp;

	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
		    DDI_FAILURE) {
			ddi_remove_minor_node(devi, NULL);
			return (DDI_FAILURE);
		}
	}

	mm_dip = devi;

	ksp = kstat_create("mm", 0, "phys_installed", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = mm_kstat_update;
		ksp->ks_snapshot = mm_kstat_snapshot;
		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}

	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "kmem_io_access", 0);

	return (DDI_SUCCESS);
}


/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	register int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)mm_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

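/*
 * Open is supported only for the standard memory minors, and only as a
 * character device.
 */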
/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
	switch (getminor(*devp)) {
	case M_NULL:
	case M_ZERO:
	case M_FULL:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		/* standard devices */
		break;

	default:
		/* Unsupported or unknown type */
		return (EINVAL);
	}
	/* must be character device */
	if (typ != OTYP_CHR)
		return (EINVAL);
	return (0);
}

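/* pollhead shared by all memory device minors that support polling */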
struct pollhead mm_pollhd;

/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	switch (getminor(dev)) {
	case M_NULL:
	case M_ZERO:
	case M_FULL:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
		/*
		 * A non-NULL pollhead pointer should be returned in case
		 * the user polls for 0 events.
		 */
		*phpp = !anyyet && !*reventsp ?
		    &mm_pollhd : (struct pollhead *)NULL;
		return (0);
	default:
		/* no other devices currently support polling */
		return (ENXIO);
	}
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
	/*
	 * implement zero size to reduce overhead (avoid two failing
	 * property lookups per stat).
	 */
	return (ddi_prop_op_size(dev, dip, prop_op,
	    flags, name, valuep, lengthp, 0));
}

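/*
 * Move at most one page of data between the user's buffer and the page
 * identified by pfn. Memory pages are mapped through segkpm when it is
 * enabled, otherwise through the private mm_map mapping. Non-memory
 * (device) pages are accessed only when allowio is set, via
 * ddi_peekpokeio().
 */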
static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
	int error = 0;
	int devload = 0;
	int is_memory = pf_is_memory(pfn);
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);
	caddr_t va = NULL;

	mutex_enter(&mm_lock);

	if (is_memory && kpm_enable) {
		if (pp)
			va = hat_kpm_mapin(pp, NULL);
		else
			va = hat_kpm_mapin_pfn(pfn);
	}

	if (va == NULL) {
		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
		    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
		va = mm_map;
		devload = 1;
	}

	if (!is_memory) {
		if (allowio) {
			size_t c = uio->uio_iov->iov_len;

			if (ddi_peekpokeio(NULL, uio, rw,
			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
			    sizeof (int32_t)) != DDI_SUCCESS)
				error = EFAULT;
		} else
			error = EIO;
	} else
		error = uiomove(va + pageoff, nbytes, rw, uio);

	if (devload)
		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
	else if (pp)
		hat_kpm_mapout(pp, NULL, va);
	else
		hat_kpm_mapout_pfn(pfn);

	mutex_exit(&mm_lock);
	return (error);
}

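/*
 * Return nonzero if the segment mapping va in the given address space
 * reports the S_CAPABILITY_NOMINFLT capability; on SPARC this governs
 * whether mmrw() attempts to pin the page with as_pagelock() before
 * touching it.
 */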
static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *seg;
	int i;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, va);
	i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
	AS_LOCK_EXIT(as);

	return (i);
}

#ifdef __sparc

#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */

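/*
 * Common read/write handler for all of the memory device minors. For
 * /dev/mem, /dev/kmem and /dev/allkmem the transfer goes through mmio()
 * at most one page at a time; the null, zero and full minors are handled
 * inline.
 */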
/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
	pfn_t v;
	struct iovec *iov;
	int error = 0;
	size_t c;
	ssize_t oresid = uio->uio_resid;
	minor_t minor = getminor(dev);

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor) {

		case M_MEM:
			memlist_read_lock();
			if (!address_in_memlist(phys_install,
			    (uint64_t)uio->uio_loffset, 1)) {
				memlist_read_unlock();
				error = EFAULT;
				break;
			}
			memlist_read_unlock();

			v = BTOP((u_offset_t)uio->uio_loffset);
			error = mmio(uio, rw, v,
			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
			break;

		case M_KMEM:
		case M_ALLKMEM:
		{
			page_t **ppp = NULL;
			caddr_t vaddr = (caddr_t)uio->uio_offset;
			int try_lock = NEED_LOCK_KVADDR(vaddr);
			int locked = 0;

			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
				break;

			if (rw == UIO_WRITE)
				mm_logkmem(uio);

			/*
			 * If vaddr does not map a valid page, as_pagelock()
			 * will return failure. Hence we can't check the
			 * return value and return EFAULT here as we'd like.
			 * seg_kp and seg_kpm do not properly support
			 * as_pagelock() for this context so we avoid it
			 * using the try_lock set check above. Some day when
			 * the kernel page locking gets redesigned all this
			 * muck can be cleaned up.
			 */
			if (try_lock)
				locked = (as_pagelock(&kas, &ppp, vaddr,
				    PAGESIZE, S_WRITE) == 0);

			v = hat_getpfnum(kas.a_hat,
			    (caddr_t)(uintptr_t)uio->uio_loffset);
			if (v == PFN_INVALID) {
				if (locked)
					as_pageunlock(&kas, ppp, vaddr,
					    PAGESIZE, S_WRITE);
				error = EFAULT;
				break;
			}

			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
			    minor == M_ALLKMEM || mm_kmem_io_access,
			    (locked && ppp) ? *ppp : NULL);
			if (locked)
				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
				    S_WRITE);
		}

			break;

		case M_FULL:
			if (rw == UIO_WRITE) {
				error = ENOSPC;
				break;
			}
			/* else it's a read, fall through to zero case */
			/*FALLTHROUGH*/

		case M_ZERO:
			if (rw == UIO_READ) {
				label_t ljb;

				if (on_fault(&ljb)) {
					no_fault();
					error = EFAULT;
					break;
				}
				uzero(iov->iov_base, iov->iov_len);
				no_fault();
				uio->uio_resid -= iov->iov_len;
				uio->uio_loffset += iov->iov_len;
				break;
			}
			/* else it's a write, fall through to NULL case */
			/*FALLTHROUGH*/

		case M_NULL:
			if (rw == UIO_READ)
				return (0);
			c = iov->iov_len;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_loffset += c;
			uio->uio_resid -= c;
			break;

		}
	}
	return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_WRITE, cred));
}

/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
	mem_vtop32_t vtop32;
#endif
	mem_vtop_t mem_vtop;
	proc_t *p;
	pfn_t pfn = (pfn_t)PFN_INVALID;
	pid_t pid = 0;
	struct as *as;
	struct seg *seg;

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
			return (EFAULT);
		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

		if (mem_vtop.m_as != NULL)
			return (EINVAL);
	}
#endif

	if (mem_vtop.m_as == &kas) {
		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
	} else {
		if (mem_vtop.m_as == NULL) {
			/*
			 * Assume the calling process's address space if the
			 * caller didn't specify one.
			 */
			p = curthread->t_procp;
			if (p == NULL)
				return (EIO);
			mem_vtop.m_as = p->p_as;
		}

		mutex_enter(&pidlock);
		for (p = practive; p != NULL; p = p->p_next) {
			if (p->p_as == mem_vtop.m_as) {
				pid = p->p_pid;
				break;
			}
		}
		mutex_exit(&pidlock);
		if (p == NULL)
			return (EIO);
		p = sprlock(pid);
		if (p == NULL)
			return (EIO);
		as = p->p_as;
		if (as == mem_vtop.m_as) {
			mutex_exit(&p->p_lock);
			AS_LOCK_ENTER(as, RW_READER);
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg))
				if ((uintptr_t)mem_vtop.m_va -
				    (uintptr_t)seg->s_base < seg->s_size)
					break;
			if (seg != NULL)
				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
			AS_LOCK_EXIT(as);
			mutex_enter(&p->p_lock);
		}
		sprunlock(p);
	}
	mem_vtop.m_pfn = pfn;
	if (pfn == PFN_INVALID)
		return (EIO);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		vtop32.m_pfn = mem_vtop.m_pfn;
		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
			return (EFAULT);
	}
#endif

	return (0);
}

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
	extern int page_retire_test(void);
	uint64_t pa;

	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
		return (EFAULT);
	}

	switch (cmd) {
	case MEM_PAGE_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_UNRETIRE:
		return (page_unretire(pa));

	case MEM_PAGE_RETIRE:
		return (page_retire(pa, PR_FMA));

	case MEM_PAGE_RETIRE_MCE:
		return (page_retire(pa, PR_MCE));

	case MEM_PAGE_RETIRE_UE:
		return (page_retire(pa, PR_UE));

	case MEM_PAGE_GETERRORS:
	{
		uint64_t page_errors;
		int rc = page_retire_check(pa, &page_errors);
		if (copyout(&page_errors, (void *)data,
		    sizeof (uint64_t))) {
			return (EFAULT);
		}
		return (rc);
	}

	case MEM_PAGE_RETIRE_TEST:
		return (page_retire_test());

	}

	return (EINVAL);
}

#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		return (err);
	}

	if (len >= mem_name.m_namelen) {
		kmem_free(buf, bufsize);
		return (ENOSPC);
	}

	if (copyoutstr(buf, (char *)mem_name.m_name,
	    mem_name.m_namelen, NULL) != 0) {
		kmem_free(buf, bufsize);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	return (0);
}

/*
 * Given a syndrome and address return information about the associated memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
	mem_info_t mem_info;
	int err;

	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
		return (EFAULT);

	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
		return (err);

	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Given a memory name, return its associated serial id
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	void *name;
	size_t name_len;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	if (mem_name.m_namelen > 1024)
		mem_name.m_namelen = 1024; /* cap at 1024 bytes */

	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
	    mem_name.m_namelen, &name_len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	if (len > mem_name.m_sidlen) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_sid,
	    mem_name.m_sidlen, NULL) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	kmem_free(name, mem_name.m_namelen);
	return (0);
}
#endif	/* __sparc */

/*
 * Private ioctls for
 *	libkvm to support kvm_physaddr().
 *	FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
		return (ENXIO);

	switch (cmd) {
	case MEM_VTOP:
		return (mmioctl_vtop(data));

	case MEM_PAGE_RETIRE:
	case MEM_PAGE_ISRETIRED:
	case MEM_PAGE_UNRETIRE:
	case MEM_PAGE_RETIRE_MCE:
	case MEM_PAGE_RETIRE_UE:
	case MEM_PAGE_GETERRORS:
	case MEM_PAGE_RETIRE_TEST:
		return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
	case MEM_NAME:
		return (mmioctl_get_mem_name(data));

	case MEM_INFO:
		return (mmioctl_get_mem_info(data));

	case MEM_SID:
		return (mmioctl_get_mem_sid(data));
#else
	case MEM_NAME:
	case MEM_INFO:
	case MEM_SID:
		return (ENOTSUP);
#endif	/* __sparc */
	}
	return (ENXIO);
}

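/*
 * mmap entry point: for /dev/mem, translate the offset to a physical page
 * number if it lies within installed physical memory; mapping /dev/kmem
 * and /dev/allkmem is no longer supported, and /dev/zero mappings are
 * handled by mmsegmap().
 */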
/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
	pfn_t pf;
	struct memlist *pmem;
	minor_t minor = getminor(dev);

	switch (minor) {
	case M_MEM:
		pf = btop(off);
		memlist_read_lock();
		for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
			if (pf >= BTOP(pmem->ml_address) &&
			    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
				memlist_read_unlock();
				return (impl_obmem_pfnum(pf));
			}
		}
		memlist_read_unlock();
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* no longer supported with KPR */
		return (-1);

	case M_FULL:
	case M_ZERO:
		/*
		 * We shouldn't be mmap'ing to /dev/zero here as
		 * mmsegmap() should have already converted
		 * a mapping request for this device to a mapping
		 * using seg_vn for anonymous memory.
		 */
		break;

	}
	return (-1);
}

/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
	struct segvn_crargs vn_a;
	struct segdev_crargs dev_a;
	int error;
	minor_t minor;
	off_t i;

	minor = getminor(dev);

	as_rangelock(as);
	/*
	 * No need to worry about vac alignment on /dev/zero
	 * since this is a "clone" object that doesn't yet exist.
	 */
	error = choose_addr(as, addrp, len, off,
	    (minor == M_MEM) || (minor == M_KMEM), flags);
	if (error != 0) {
		as_rangeunlock(as);
		return (error);
	}

	switch (minor) {
	case M_MEM:
		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
		if ((flags & MAP_TYPE) != MAP_SHARED) {
			as_rangeunlock(as);
			return (EINVAL);
		}

		/*
		 * Check to ensure that the entire range is
		 * legal and we are not trying to map in
		 * more than the device will let us.
		 */
		for (i = 0; i < len; i += PAGESIZE) {
			if (mmmmap(dev, off + i, maxprot) == -1) {
				as_rangeunlock(as);
				return (ENXIO);
			}
		}

		/*
		 * Use seg_dev segment driver for /dev/mem mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = (flags & MAP_TYPE);
		dev_a.prot = (uchar_t)prot;
		dev_a.maxprot = (uchar_t)maxprot;
		dev_a.hat_attr = 0;

		/*
		 * Make /dev/mem mappings non-consistent since we can't
		 * alias pages that don't have page structs behind them,
		 * such as kernel stack pages. If someone mmap()s a kernel
		 * stack page and if we give him a tte with cv, a line from
		 * that page can get into both pages of the spitfire d$.
		 * But snoop from another processor will only invalidate
		 * the first page. This later caused kernel (xc_attention)
		 * to go into an infinite loop at pil 13 and no interrupts
		 * could come in. See 1203630.
		 *
		 */
		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
		dev_a.devmap_data = NULL;

		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	case M_ZERO:
		/*
		 * Use seg_vn segment driver for /dev/zero mapping.
		 * Passing in a NULL amp gives us the "cloning" effect.
		 */
		vn_a.vp = NULL;
		vn_a.offset = 0;
		vn_a.type = (flags & MAP_TYPE);
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.flags = flags & ~MAP_TYPE;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		vn_a.szc = 0;
		vn_a.lgrp_mem_policy_flags = 0;
		error = as_map(as, *addrp, len, segvn_create, &vn_a);
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* No longer supported with KPR. */
		error = ENXIO;
		break;

	case M_NULL:
		/*
		 * Use seg_dev segment driver for /dev/null mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
		dev_a.hat_attr = 0;
		dev_a.hat_flags = 0;
		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	default:
		error = ENXIO;
	}

	as_rangeunlock(as);
	return (error);
}

static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0,	/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

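/*
 * kstat update routine: size the "phys_installed" kstat to hold one
 * (address, size) pair for each entry on the phys_install memlist.
 */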
static int
mm_kstat_update(kstat_t *ksp, int rw)
{
	struct memlist *pmem;
	uint_t count;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	count = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
		count++;
	}
	memlist_read_unlock();

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}

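/*
 * kstat snapshot routine: copy the (address, size) pair for each
 * phys_install memlist entry into the caller's buffer, stopping when the
 * buffer sized by mm_kstat_update() is full.
 */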
static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct memlist *pmem;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL;
	    pmem = pmem->ml_next, kspmem++) {
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = pmem->ml_address;
		kspmem->size = pmem->ml_size;
	}
	memlist_read_unlock();

	return (0);
}

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		mem_name32_t mem_name32;

		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
			return (EFAULT);
		mem_name->m_addr = mem_name32.m_addr;
		mem_name->m_synd = mem_name32.m_synd;
		mem_name->m_type[0] = mem_name32.m_type[0];
		mem_name->m_type[1] = mem_name32.m_type[1];
		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
	}
#endif	/* _SYSCALL32 */

	return (0);
}