/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif	/* __sparc */

/*
 * Turn a byte length into a page count.  The DDI btop takes a 32-bit size
 * on 32-bit machines; this macro handles 64-bit sizes on 32-bit machines
 * with large physical memory.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define	MM_KMEMLOG_NENTRIES	64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];

/*
 * On kmem/allmem writes, we log information that might be useful in the event
 * that a write is errant (that is, due to operator error) and induces a later
 * problem.  Note that (in particular) in the event of such operator-induced
 * corruption, a search over the kernel address space for the corrupted
 * address will yield the ring buffer entry that recorded the write.  And
 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
 * auditing facility and yes, we learned that the hard way: disturbingly,
 * there exist recommendations for "tuning" the system that involve writing to
 * kernel memory addresses via the kernel debugger, and -- as we discovered --
 * these can easily be applied incorrectly or unsafely, yielding an entirely
 * undebuggable "can't happen" kind of panic.
 */
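/*
 * A hedged, illustrative sketch (not part of this driver): given a corrupted
 * kernel address, one plausible way to find the matching log entry from
 * "mdb -k" is a kernel-address-space search such as "<corrupted-addr>::kgrep"
 * followed by "::whatis" on the hits, one of which should land inside
 * mm_kmemlog[].  Treat the exact mdb workflow as an assumption, not doctrine.
 */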
static void
mm_logkmem(struct uio *uio)
{
	mm_logentry_t *ent;
	proc_t *p = curthread->t_procp;

	mutex_enter(&mm_lock);

	ent = &mm_kmemlog[mm_kmemlogent++];

	if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
		mm_kmemlogent = 0;

	ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
	ent->mle_len = uio->uio_resid;
	gethrestime(&ent->mle_hrestime);
	ent->mle_hrtime = gethrtime();
	ent->mle_pid = p->p_pidp->pid_id;

	(void) strncpy(ent->mle_psargs,
	    p->p_user.u_psargs, sizeof (ent->mle_psargs));

	mutex_exit(&mm_lock);
}

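/*
 * attach(9E) entry point: creates the mem, kmem, allkmem, null and zero
 * minor nodes (with the privileges and modes in the mm[] table below),
 * reserves one page of kernel VA (mm_map) for the transient mappings that
 * mmio() sets up, installs the "phys_installed" kstat, and caches the
 * "kmem_io_access" property that gates I/O-space access through /dev/kmem.
 */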
/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int i;
	struct mem_minor {
		char *name;
		minor_t minor;
		int privonly;
		const char *rdpriv;
		const char *wrpriv;
		mode_t priv_mode;
	} mm[] = {
		{ "mem", M_MEM, 0, NULL, "all", 0640 },
		{ "kmem", M_KMEM, 0, NULL, "all", 0640 },
		{ "allkmem", M_ALLKMEM, 0, "all", "all", 0600 },
		{ "null", M_NULL, PRIVONLY_DEV, NULL, NULL, 0666 },
		{ "zero", M_ZERO, PRIVONLY_DEV, NULL, NULL, 0666 },
	};
	kstat_t *ksp;

	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
		    DDI_FAILURE) {
			ddi_remove_minor_node(devi, NULL);
			return (DDI_FAILURE);
		}
	}

	mm_dip = devi;

	ksp = kstat_create("mm", 0, "phys_installed", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = mm_kstat_update;
		ksp->ks_snapshot = mm_kstat_snapshot;
		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}

	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "kmem_io_access", 0);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	register int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)mm_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
	switch (getminor(*devp)) {
	case M_NULL:
	case M_ZERO:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		/* standard devices */
		break;

	default:
		/* Unsupported or unknown type */
		return (EINVAL);
	}
	/* must be a character device */
	if (typ != OTYP_CHR)
		return (EINVAL);
	return (0);
}

struct pollhead mm_pollhd;

/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	switch (getminor(dev)) {
	case M_NULL:
	case M_ZERO:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
		/*
		 * A non-NULL pollhead pointer should be returned in case
		 * the user polls for 0 events.
		 */
		*phpp = !anyyet && !*reventsp ?
		    &mm_pollhd : (struct pollhead *)NULL;
		return (0);
	default:
		/* no other devices currently support polling */
		return (ENXIO);
	}
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
	/*
	 * implement zero size to reduce overhead (avoid two failing
	 * property lookups per stat).
	 */
	return (ddi_prop_op_size(dev, dip, prop_op,
	    flags, name, valuep, lengthp, 0));
}

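/*
 * Transfer at most one page between the caller's buffer and the page frame
 * 'pfn'.  When kpm is enabled and the frame is ordinary memory, it is mapped
 * through seg_kpm; otherwise it is transiently loaded into the reserved
 * mm_map virtual page.  Non-memory frames (device physical addresses) are
 * accessed only when 'allowio' is set, via ddi_peekpokeio().
 */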
static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
	int error = 0;
	int devload = 0;
	int is_memory = pf_is_memory(pfn);
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);
	caddr_t va = NULL;

	mutex_enter(&mm_lock);

	if (is_memory && kpm_enable) {
		if (pp)
			va = hat_kpm_mapin(pp, NULL);
		else
			va = hat_kpm_mapin_pfn(pfn);
	}

	if (va == NULL) {
		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
		    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
		va = mm_map;
		devload = 1;
	}

	if (!is_memory) {
		if (allowio) {
			size_t c = uio->uio_iov->iov_len;

			if (ddi_peekpokeio(NULL, uio, rw,
			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
			    sizeof (int32_t)) != DDI_SUCCESS)
				error = EFAULT;
		} else
			error = EIO;
	} else
		error = uiomove(va + pageoff, nbytes, rw, uio);

	if (devload)
		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
	else if (pp)
		hat_kpm_mapout(pp, NULL, va);
	else
		hat_kpm_mapout_pfn(pfn);

	mutex_exit(&mm_lock);
	return (error);
}

static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *seg;
	int i;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, va);
	i = (seg != NULL) ? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
	AS_LOCK_EXIT(as);

	return (i);
}

#ifdef __sparc

#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */

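/*
 * Common read/write path for all of the memory minor devices:
 * /dev/mem offsets are first validated against the phys_install memlist;
 * /dev/kmem and /dev/allkmem treat the offset as a kernel virtual address
 * (logging every write via mm_logkmem()); reads of /dev/zero zero-fill the
 * caller's buffer; /dev/null absorbs writes and returns EOF on reads.
 */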
/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
	pfn_t v;
	struct iovec *iov;
	int error = 0;
	size_t c;
	ssize_t oresid = uio->uio_resid;
	minor_t minor = getminor(dev);

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor) {

		case M_MEM:
			memlist_read_lock();
			if (!address_in_memlist(phys_install,
			    (uint64_t)uio->uio_loffset, 1)) {
				memlist_read_unlock();
				error = EFAULT;
				break;
			}
			memlist_read_unlock();

			v = BTOP((u_offset_t)uio->uio_loffset);
			error = mmio(uio, rw, v,
			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
			break;

		case M_KMEM:
		case M_ALLKMEM:
		{
			page_t **ppp = NULL;
			caddr_t vaddr = (caddr_t)uio->uio_offset;
			int try_lock = NEED_LOCK_KVADDR(vaddr);
			int locked = 0;

			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
				break;

			if (rw == UIO_WRITE)
				mm_logkmem(uio);

			/*
			 * If vaddr does not map a valid page, as_pagelock()
			 * will return failure, so we can't simply check its
			 * return value and return EFAULT here as we'd like.
			 * seg_kp and seg_kpm do not properly support
			 * as_pagelock() for this context, so we avoid it by
			 * using the try_lock check set up above.  Some day,
			 * when kernel page locking is redesigned, all this
			 * muck can be cleaned up.
			 */
			if (try_lock)
				locked = (as_pagelock(&kas, &ppp, vaddr,
				    PAGESIZE, S_WRITE) == 0);

			v = hat_getpfnum(kas.a_hat,
			    (caddr_t)(uintptr_t)uio->uio_loffset);
			if (v == PFN_INVALID) {
				if (locked)
					as_pageunlock(&kas, ppp, vaddr,
					    PAGESIZE, S_WRITE);
				error = EFAULT;
				break;
			}

			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
			    minor == M_ALLKMEM || mm_kmem_io_access,
			    (locked && ppp) ? *ppp : NULL);
			if (locked)
				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
				    S_WRITE);
		}

		break;

		case M_ZERO:
			if (rw == UIO_READ) {
				label_t ljb;

				if (on_fault(&ljb)) {
					no_fault();
					error = EFAULT;
					break;
				}
				uzero(iov->iov_base, iov->iov_len);
				no_fault();
				uio->uio_resid -= iov->iov_len;
				uio->uio_loffset += iov->iov_len;
				break;
			}
			/* else it's a write, fall through to NULL case */
			/*FALLTHROUGH*/

		case M_NULL:
			if (rw == UIO_READ)
				return (0);
			c = iov->iov_len;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_loffset += c;
			uio->uio_resid -= c;
			break;

		}
	}
	return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_WRITE, cred));
}

/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
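/*
 * Illustrative caller sketch only (the normal consumer is libkvm); the open
 * flags and error handling below are assumptions, not part of this driver:
 *
 *	mem_vtop_t vtop;
 *	int fd = open("/dev/kmem", O_RDONLY);
 *
 *	vtop.m_as = NULL;		(NULL means the calling process's AS;
 *					 &kas would mean the kernel's)
 *	vtop.m_va = (void *)addr;	(a mapped virtual address)
 *	if (ioctl(fd, MEM_VTOP, &vtop) == 0)
 *		... vtop.m_pfn now holds the physical page number ...
 */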
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
	mem_vtop32_t vtop32;
#endif
	mem_vtop_t mem_vtop;
	proc_t *p;
	pfn_t pfn = (pfn_t)PFN_INVALID;
	pid_t pid = 0;
	struct as *as;
	struct seg *seg;

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
			return (EFAULT);
		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

		if (mem_vtop.m_as != NULL)
			return (EINVAL);
	}
#endif

	if (mem_vtop.m_as == &kas) {
		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
	} else {
		if (mem_vtop.m_as == NULL) {
			/*
			 * Assume the calling process's address space if the
			 * caller didn't specify one.
			 */
			p = curthread->t_procp;
			if (p == NULL)
				return (EIO);
			mem_vtop.m_as = p->p_as;
		}

		mutex_enter(&pidlock);
		for (p = practive; p != NULL; p = p->p_next) {
			if (p->p_as == mem_vtop.m_as) {
				pid = p->p_pid;
				break;
			}
		}
		mutex_exit(&pidlock);
		if (p == NULL)
			return (EIO);
		p = sprlock(pid);
		if (p == NULL)
			return (EIO);
		as = p->p_as;
		if (as == mem_vtop.m_as) {
			mutex_exit(&p->p_lock);
			AS_LOCK_ENTER(as, RW_READER);
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg))
				if ((uintptr_t)mem_vtop.m_va -
				    (uintptr_t)seg->s_base < seg->s_size)
					break;
			if (seg != NULL)
				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
			AS_LOCK_EXIT(as);
			mutex_enter(&p->p_lock);
		}
		sprunlock(p);
	}
	mem_vtop.m_pfn = pfn;
	if (pfn == PFN_INVALID)
		return (EIO);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		vtop32.m_pfn = mem_vtop.m_pfn;
		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
			return (EFAULT);
	}
#endif

	return (0);
}

/*
 * Given a PA, execute the given page retire command on it.
 */
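/*
 * Illustrative sketch only (the file descriptor name and error handling are
 * assumptions): these commands are issued against /dev/mem and take a
 * pointer to a uint64_t physical address, e.g.
 *
 *	uint64_t pa = some_physical_address;
 *	error = ioctl(memfd, MEM_PAGE_RETIRE, &pa);
 *
 * MEM_PAGE_GETERRORS reuses the same buffer to return the page's error mask.
 */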
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
	extern int page_retire_test(void);
	uint64_t pa;

	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
		return (EFAULT);
	}

	switch (cmd) {
	case MEM_PAGE_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_UNRETIRE:
		return (page_unretire(pa));

	case MEM_PAGE_RETIRE:
		return (page_retire(pa, PR_FMA));

	case MEM_PAGE_RETIRE_MCE:
		return (page_retire(pa, PR_MCE));

	case MEM_PAGE_RETIRE_UE:
		return (page_retire(pa, PR_UE));

	case MEM_PAGE_GETERRORS:
	{
		uint64_t page_errors;
		int rc = page_retire_check(pa, &page_errors);
		if (copyout(&page_errors, (void *)data,
		    sizeof (uint64_t))) {
			return (EFAULT);
		}
		return (rc);
	}

	case MEM_PAGE_RETIRE_TEST:
		return (page_retire_test());

	}

	return (EINVAL);
}

#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address, return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) != 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		return (err);
	}

	if (len >= mem_name.m_namelen) {
		kmem_free(buf, bufsize);
		return (ENOSPC);
	}

	if (copyoutstr(buf, (char *)mem_name.m_name,
	    mem_name.m_namelen, NULL) != 0) {
		kmem_free(buf, bufsize);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	return (0);
}

/*
 * Given a syndrome and address, return information about the associated
 * memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
	mem_info_t mem_info;
	int err;

	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
		return (EFAULT);

	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
		return (err);

	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Given a memory name, return its associated serial id.
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	void *name;
	size_t name_len;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) != 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	if (mem_name.m_namelen > 1024)
		mem_name.m_namelen = 1024;	/* cap at 1024 bytes */

	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
	    mem_name.m_namelen, &name_len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	if (len > mem_name.m_sidlen) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_sid,
	    mem_name.m_sidlen, NULL) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	kmem_free(name, mem_name.m_namelen);
	return (0);
}
#endif	/* __sparc */

/*
 * Private ioctls for:
 *	libkvm, to support kvm_physaddr();
 *	FMA, to support page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
		return (ENXIO);

	switch (cmd) {
	case MEM_VTOP:
		return (mmioctl_vtop(data));

	case MEM_PAGE_RETIRE:
	case MEM_PAGE_ISRETIRED:
	case MEM_PAGE_UNRETIRE:
	case MEM_PAGE_RETIRE_MCE:
	case MEM_PAGE_RETIRE_UE:
	case MEM_PAGE_GETERRORS:
	case MEM_PAGE_RETIRE_TEST:
		return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
	case MEM_NAME:
		return (mmioctl_get_mem_name(data));

	case MEM_INFO:
		return (mmioctl_get_mem_info(data));

	case MEM_SID:
		return (mmioctl_get_mem_sid(data));
#else
	case MEM_NAME:
	case MEM_INFO:
	case MEM_SID:
		return (ENOTSUP);
#endif	/* __sparc */
	}
	return (ENXIO);
}

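/*
 * mmap(9E) entry point.  Offsets into /dev/mem that fall within the
 * phys_install memlist are translated to a mappable page frame number via
 * impl_obmem_pfnum(); /dev/kmem and /dev/allkmem mappings are no longer
 * supported (see the KPR note below), and everything else returns -1.
 */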
/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
	pfn_t pf;
	struct memlist *pmem;
	minor_t minor = getminor(dev);

	switch (minor) {
	case M_MEM:
		pf = btop(off);
		memlist_read_lock();
		for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
			if (pf >= BTOP(pmem->ml_address) &&
			    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
				memlist_read_unlock();
				return (impl_obmem_pfnum(pf));
			}
		}
		memlist_read_unlock();
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* no longer supported with KPR */
		return (-1);

	case M_ZERO:
		/*
		 * We shouldn't be mmap'ing to /dev/zero here as
		 * mmsegmap() should have already converted
		 * a mapping request for this device to a mapping
		 * using seg_vn for anonymous memory.
		 */
		break;

	}
	return (-1);
}

/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
	struct segvn_crargs vn_a;
	struct segdev_crargs dev_a;
	int error;
	minor_t minor;
	off_t i;

	minor = getminor(dev);

	as_rangelock(as);
	/*
	 * No need to worry about vac alignment on /dev/zero
	 * since this is a "clone" object that doesn't yet exist.
	 */
	error = choose_addr(as, addrp, len, off,
	    (minor == M_MEM) || (minor == M_KMEM), flags);
	if (error != 0) {
		as_rangeunlock(as);
		return (error);
	}

	switch (minor) {
	case M_MEM:
		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
		if ((flags & MAP_TYPE) != MAP_SHARED) {
			as_rangeunlock(as);
			return (EINVAL);
		}

		/*
		 * Check to ensure that the entire range is
		 * legal and we are not trying to map in
		 * more than the device will let us.
		 */
		for (i = 0; i < len; i += PAGESIZE) {
			if (mmmmap(dev, off + i, maxprot) == -1) {
				as_rangeunlock(as);
				return (ENXIO);
			}
		}

		/*
		 * Use seg_dev segment driver for /dev/mem mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = (flags & MAP_TYPE);
		dev_a.prot = (uchar_t)prot;
		dev_a.maxprot = (uchar_t)maxprot;
		dev_a.hat_attr = 0;

		/*
		 * Make /dev/mem mappings non-consistent since we can't
		 * alias pages that don't have page structs behind them,
		 * such as kernel stack pages.  If someone mmap()s a kernel
		 * stack page and we give them a tte with cv, a line from
		 * that page can get into both pages of the spitfire d$.
		 * A snoop from another processor will then only invalidate
		 * the first page, which once caused the kernel
		 * (xc_attention) to spin in an infinite loop at pil 13 with
		 * no interrupts able to come in.  See 1203630.
		 */
		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
		dev_a.devmap_data = NULL;

		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	case M_ZERO:
		/*
		 * Use seg_vn segment driver for /dev/zero mapping.
		 * Passing in a NULL amp gives us the "cloning" effect.
		 */
		vn_a.vp = NULL;
		vn_a.offset = 0;
		vn_a.type = (flags & MAP_TYPE);
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.flags = flags & ~MAP_TYPE;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		vn_a.szc = 0;
		vn_a.lgrp_mem_policy_flags = 0;
		error = as_map(as, *addrp, len, segvn_create, &vn_a);
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* No longer supported with KPR. */
		error = ENXIO;
		break;

	case M_NULL:
		/*
		 * Use seg_dev segment driver for /dev/null mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
		dev_a.hat_attr = 0;
		dev_a.hat_flags = 0;
		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	default:
		error = ENXIO;
	}

	as_rangeunlock(as);
	return (error);
}

static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0,	/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

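/*
 * The "phys_installed" kstat is raw and virtual: mm_kstat_update() sizes it
 * at one { address, size } pair of uint64_t values per phys_install memlist
 * entry, and mm_kstat_snapshot() copies those pairs into the caller's buffer
 * under memlist_read_lock().
 */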
static int
mm_kstat_update(kstat_t *ksp, int rw)
{
	struct memlist *pmem;
	uint_t count;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	count = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
		count++;
	}
	memlist_read_unlock();

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}

static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct memlist *pmem;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL;
	    pmem = pmem->ml_next, kspmem++) {
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = pmem->ml_address;
		kspmem->size = pmem->ml_size;
	}
	memlist_read_unlock();

	return (0);
}

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		mem_name32_t mem_name32;

		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
			return (EFAULT);
		mem_name->m_addr = mem_name32.m_addr;
		mem_name->m_synd = mem_name32.m_synd;
		mem_name->m_type[0] = mem_name32.m_type[0];
		mem_name->m_type[1] = mem_name32.m_type[1];
		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
	}
#endif	/* _SYSCALL32 */

	return (0);
}