7882 Add /dev/full, the always-full memory device
Reviewed by: Adam Stevko <adam.stevko@gmail.com>
Reviewed by: Toomas Soome <tsoome@me.com>
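
For context: /dev/full is modeled on the Linux device of the same name. Reads behave exactly like /dev/zero (the buffer is zero-filled), while every write fails with ENOSPC, which makes the device useful for testing how programs handle write errors. A minimal user-level sketch of the intended semantics follows; it is illustrative only and not part of the patch, and the device path and expected errno are assumptions based on the issue synopsis.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char buf[512];
	int fd = open("/dev/full", O_RDWR);

	if (fd < 0) {
		perror("open /dev/full");
		return (1);
	}

	(void) memset(buf, 0xff, sizeof (buf));

	/* Every write should fail with ENOSPC. */
	if (write(fd, buf, sizeof (buf)) == -1 && errno == ENOSPC)
		(void) printf("write: failed with ENOSPC, as expected\n");

	/* Reads behave like /dev/zero: the buffer comes back zeroed. */
	if (read(fd, buf, sizeof (buf)) == (ssize_t)sizeof (buf) &&
	    buf[0] == 0)
		(void) printf("read: returned a zero-filled buffer\n");

	(void) close(fd);
	return (0);
}

In the diff below, this behavior falls out of the M_FULL case in mmrw(): writes return ENOSPC directly, and reads fall through to the existing M_ZERO path, so no new zeroing code is needed.
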
--- old/usr/src/uts/common/io/mem.c
+++ new/usr/src/uts/common/io/mem.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /*
27 27 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
28 + * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
28 29 */
29 30
30 31 /*
31 32 * Memory special file
32 33 */
33 34
34 35 #include <sys/types.h>
35 36 #include <sys/param.h>
36 37 #include <sys/user.h>
37 38 #include <sys/buf.h>
38 39 #include <sys/systm.h>
39 40 #include <sys/cred.h>
40 41 #include <sys/vm.h>
41 42 #include <sys/uio.h>
42 43 #include <sys/mman.h>
43 44 #include <sys/kmem.h>
44 45 #include <vm/seg.h>
45 46 #include <vm/page.h>
46 47 #include <sys/stat.h>
47 48 #include <sys/vmem.h>
48 49 #include <sys/memlist.h>
49 50 #include <sys/bootconf.h>
50 51
51 52 #include <vm/seg_vn.h>
52 53 #include <vm/seg_dev.h>
53 54 #include <vm/seg_kmem.h>
54 55 #include <vm/seg_kp.h>
55 56 #include <vm/seg_kpm.h>
56 57 #include <vm/hat.h>
57 58
58 59 #include <sys/conf.h>
59 60 #include <sys/mem.h>
60 61 #include <sys/types.h>
61 62 #include <sys/conf.h>
62 63 #include <sys/param.h>
63 64 #include <sys/systm.h>
64 65 #include <sys/errno.h>
65 66 #include <sys/modctl.h>
66 67 #include <sys/memlist.h>
67 68 #include <sys/ddi.h>
68 69 #include <sys/sunddi.h>
69 70 #include <sys/debug.h>
70 71 #include <sys/fm/protocol.h>
71 72
72 73 #if defined(__sparc)
73 74 extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
74 75 extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
75 76 uint64_t *, int *, int *, int *);
76 77 extern size_t cpu_get_name_bufsize(void);
77 78 extern int cpu_get_mem_sid(char *, char *, int, int *);
78 79 extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
79 80 #elif defined(__x86)
80 81 #include <sys/cpu_module.h>
81 82 #endif /* __sparc */
82 83
83 84 /*
84 85 * Turn a byte length into a pagecount. The DDI btop takes a
85 86 * 32-bit size on 32-bit machines, this handles 64-bit sizes for
86 87 * large physical-memory 32-bit machines.
87 88 */
88 89 #define BTOP(x) ((pgcnt_t)((x) >> _pageshift))
89 90
90 91 static kmutex_t mm_lock;
91 92 static caddr_t mm_map;
92 93
93 94 static dev_info_t *mm_dip; /* private copy of devinfo pointer */
94 95
95 96 static int mm_kmem_io_access;
96 97
97 98 static int mm_kstat_update(kstat_t *ksp, int rw);
98 99 static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
99 100
100 101 static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);
101 102
102 103 #define MM_KMEMLOG_NENTRIES 64
103 104
104 105 static int mm_kmemlogent;
105 106 static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];
106 107
107 108 /*
108 109 * On kmem/allmem writes, we log information that might be useful in the event
109 110 * that a write is errant (that is, due to operator error) and induces a later
110 111 * problem. Note that (in particular) in the event of such operator-induced
111 112 * corruption, a search over the kernel address space for the corrupted
112 113 * address will yield the ring buffer entry that recorded the write. And
113 114 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
114 115 * auditing facility and yes, we learned that the hard way: disturbingly,
115 116 * there exist recommendations for "tuning" the system that involve writing to
116 117 * kernel memory addresses via the kernel debugger, and -- as we discovered --
117 118 * these can easily be applied incorrectly or unsafely, yielding an entirely
118 119 * undebuggable "can't happen" kind of panic.
119 120 */
120 121 static void
121 122 mm_logkmem(struct uio *uio)
122 123 {
123 124 mm_logentry_t *ent;
124 125 proc_t *p = curthread->t_procp;
125 126
126 127 mutex_enter(&mm_lock);
127 128
128 129 ent = &mm_kmemlog[mm_kmemlogent++];
129 130
130 131 if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
131 132 mm_kmemlogent = 0;
132 133
133 134 ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
134 135 ent->mle_len = uio->uio_resid;
135 136 gethrestime(&ent->mle_hrestime);
136 137 ent->mle_hrtime = gethrtime();
137 138 ent->mle_pid = p->p_pidp->pid_id;
138 139
139 140 (void) strncpy(ent->mle_psargs,
140 141 p->p_user.u_psargs, sizeof (ent->mle_psargs));
141 142
142 143 mutex_exit(&mm_lock);
143 144 }
144 145
145 146 /*ARGSUSED1*/
146 147 static int
147 148 mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
148 149 {
149 150 int i;
150 151 struct mem_minor {
151 152 char *name;
152 153 minor_t minor;
153 154 int privonly;
154 155 const char *rdpriv;
155 156 const char *wrpriv;
156 157 mode_t priv_mode;
157 158 } mm[] = {
158 159 { "mem", M_MEM, 0, NULL, "all", 0640 },
159 160 { "kmem", M_KMEM, 0, NULL, "all", 0640 },
160 161 { "allkmem", M_ALLKMEM, 0, "all", "all", 0600 },
161 162 { "null", M_NULL, PRIVONLY_DEV, NULL, NULL, 0666 },
162 163 { "zero", M_ZERO, PRIVONLY_DEV, NULL, NULL, 0666 },
164 + { "full", M_FULL, PRIVONLY_DEV, NULL, NULL, 0666 },
163 165 };
164 166 kstat_t *ksp;
165 167
166 168 mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
167 169 mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
168 170
169 171 for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
170 172 if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
171 173 mm[i].minor, DDI_PSEUDO, mm[i].privonly,
172 174 mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
173 175 DDI_FAILURE) {
174 176 ddi_remove_minor_node(devi, NULL);
175 177 return (DDI_FAILURE);
176 178 }
177 179 }
178 180
179 181 mm_dip = devi;
180 182
181 183 ksp = kstat_create("mm", 0, "phys_installed", "misc",
182 184 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
183 185 if (ksp != NULL) {
184 186 ksp->ks_update = mm_kstat_update;
185 187 ksp->ks_snapshot = mm_kstat_snapshot;
186 188 ksp->ks_lock = &mm_lock; /* XXX - not really needed */
187 189 kstat_install(ksp);
188 190 }
189 191
190 192 mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
191 193 "kmem_io_access", 0);
192 194
193 195 return (DDI_SUCCESS);
194 196 }
195 197
196 198 /*ARGSUSED*/
197 199 static int
198 200 mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
199 201 {
200 202 register int error;
201 203
202 204 switch (infocmd) {
203 205 case DDI_INFO_DEVT2DEVINFO:
204 206 *result = (void *)mm_dip;
205 207 error = DDI_SUCCESS;
206 208 break;
207 209 case DDI_INFO_DEVT2INSTANCE:
208 210 *result = (void *)0;
209 211 error = DDI_SUCCESS;
210 212 break;
211 213 default:
212 214 error = DDI_FAILURE;
213 215 }
214 216 return (error);
215 217 }
216 218
217 219 /*ARGSUSED1*/
218 220 static int
219 221 mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
220 222 {
221 223 switch (getminor(*devp)) {
222 224 case M_NULL:
223 225 case M_ZERO:
226 + case M_FULL:
224 227 case M_MEM:
225 228 case M_KMEM:
226 229 case M_ALLKMEM:
227 230 /* standard devices */
228 231 break;
229 232
230 233 default:
231 234 /* Unsupported or unknown type */
232 235 return (EINVAL);
233 236 }
234 237 /* must be character device */
235 238 if (typ != OTYP_CHR)
236 239 return (EINVAL);
237 240 return (0);
238 241 }
239 242
240 243 struct pollhead mm_pollhd;
241 244
242 245 /*ARGSUSED*/
243 246 static int
244 247 mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
245 248 struct pollhead **phpp)
246 249 {
247 250 switch (getminor(dev)) {
248 251 case M_NULL:
249 252 case M_ZERO:
253 + case M_FULL:
250 254 case M_MEM:
251 255 case M_KMEM:
252 256 case M_ALLKMEM:
253 257 *reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
254 258 POLLWRNORM | POLLRDBAND | POLLWRBAND);
255 259 /*
256 260 * A non NULL pollhead pointer should be returned in case
257 261 * user polls for 0 events.
258 262 */
259 263 *phpp = !anyyet && !*reventsp ?
260 264 &mm_pollhd : (struct pollhead *)NULL;
261 265 return (0);
262 266 default:
263 267 /* no other devices currently support polling */
264 268 return (ENXIO);
265 269 }
266 270 }
267 271
268 272 static int
269 273 mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
270 274 char *name, caddr_t valuep, int *lengthp)
271 275 {
272 276 /*
273 277 * implement zero size to reduce overhead (avoid two failing
274 278 * property lookups per stat).
275 279 */
276 280 return (ddi_prop_op_size(dev, dip, prop_op,
277 281 flags, name, valuep, lengthp, 0));
278 282 }
279 283
280 284 static int
281 285 mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
282 286 page_t *pp)
283 287 {
284 288 int error = 0;
285 289 int devload = 0;
286 290 int is_memory = pf_is_memory(pfn);
287 291 size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
288 292 (size_t)uio->uio_iov->iov_len);
289 293 caddr_t va = NULL;
290 294
291 295 mutex_enter(&mm_lock);
292 296
293 297 if (is_memory && kpm_enable) {
294 298 if (pp)
295 299 va = hat_kpm_mapin(pp, NULL);
296 300 else
297 301 va = hat_kpm_mapin_pfn(pfn);
298 302 }
299 303
300 304 if (va == NULL) {
301 305 hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
302 306 (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
303 307 HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
304 308 va = mm_map;
305 309 devload = 1;
306 310 }
307 311
308 312 if (!is_memory) {
309 313 if (allowio) {
310 314 size_t c = uio->uio_iov->iov_len;
311 315
312 316 if (ddi_peekpokeio(NULL, uio, rw,
313 317 (caddr_t)(uintptr_t)uio->uio_loffset, c,
314 318 sizeof (int32_t)) != DDI_SUCCESS)
315 319 error = EFAULT;
316 320 } else
317 321 error = EIO;
318 322 } else
319 323 error = uiomove(va + pageoff, nbytes, rw, uio);
320 324
321 325 if (devload)
322 326 hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
323 327 else if (pp)
324 328 hat_kpm_mapout(pp, NULL, va);
325 329 else
326 330 hat_kpm_mapout_pfn(pfn);
327 331
328 332 mutex_exit(&mm_lock);
329 333 return (error);
330 334 }
331 335
332 336 static int
333 337 mmpagelock(struct as *as, caddr_t va)
334 338 {
335 339 struct seg *seg;
336 340 int i;
337 341
338 342 AS_LOCK_ENTER(as, RW_READER);
339 343 seg = as_segat(as, va);
340 344 i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
341 345 AS_LOCK_EXIT(as);
342 346
343 347 return (i);
344 348 }
345 349
346 350 #ifdef __sparc
347 351
348 352 #define NEED_LOCK_KVADDR(kva) mmpagelock(&kas, kva)
349 353
350 354 #else /* __i386, __amd64 */
351 355
352 356 #define NEED_LOCK_KVADDR(va) 0
353 357
354 358 #endif /* __sparc */
355 359
356 360 /*ARGSUSED3*/
357 361 static int
358 362 mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
359 363 {
360 364 pfn_t v;
361 365 struct iovec *iov;
362 366 int error = 0;
363 367 size_t c;
364 368 ssize_t oresid = uio->uio_resid;
365 369 minor_t minor = getminor(dev);
366 370
367 371 while (uio->uio_resid > 0 && error == 0) {
368 372 iov = uio->uio_iov;
369 373 if (iov->iov_len == 0) {
370 374 uio->uio_iov++;
371 375 uio->uio_iovcnt--;
372 376 if (uio->uio_iovcnt < 0)
373 377 panic("mmrw");
374 378 continue;
375 379 }
376 380 switch (minor) {
377 381
378 382 case M_MEM:
379 383 memlist_read_lock();
380 384 if (!address_in_memlist(phys_install,
381 385 (uint64_t)uio->uio_loffset, 1)) {
382 386 memlist_read_unlock();
383 387 error = EFAULT;
384 388 break;
385 389 }
386 390 memlist_read_unlock();
387 391
388 392 v = BTOP((u_offset_t)uio->uio_loffset);
389 393 error = mmio(uio, rw, v,
390 394 uio->uio_loffset & PAGEOFFSET, 0, NULL);
391 395 break;
392 396
393 397 case M_KMEM:
394 398 case M_ALLKMEM:
395 399 {
396 400 page_t **ppp = NULL;
397 401 caddr_t vaddr = (caddr_t)uio->uio_offset;
398 402 int try_lock = NEED_LOCK_KVADDR(vaddr);
399 403 int locked = 0;
400 404
401 405 if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
402 406 break;
403 407
404 408 if (rw == UIO_WRITE)
405 409 mm_logkmem(uio);
406 410
407 411 /*
408 412 * If vaddr does not map a valid page, as_pagelock()
409 413 * will return failure. Hence we can't check the
410 414 * return value and return EFAULT here as we'd like.
411 415 * seg_kp and seg_kpm do not properly support
412 416 * as_pagelock() for this context so we avoid it
413 417 * using the try_lock set check above. Some day when
414 418 * the kernel page locking gets redesigned all this
415 419 * muck can be cleaned up.
416 420 */
417 421 if (try_lock)
418 422 locked = (as_pagelock(&kas, &ppp, vaddr,
419 423 PAGESIZE, S_WRITE) == 0);
420 424
421 425 v = hat_getpfnum(kas.a_hat,
422 426 (caddr_t)(uintptr_t)uio->uio_loffset);
423 427 if (v == PFN_INVALID) {
424 428 if (locked)
425 429 as_pageunlock(&kas, ppp, vaddr,
426 430 PAGESIZE, S_WRITE);
427 431 error = EFAULT;
428 432 break;
429 433 }
430 434
431 435 error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
432 436 minor == M_ALLKMEM || mm_kmem_io_access,
433 437 (locked && ppp) ? *ppp : NULL);
434 438 if (locked)
435 439 as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
436 440 S_WRITE);
437 441 }
438 442
439 443 break;
440 444
445 + case M_FULL:
446 + if (rw == UIO_WRITE) {
447 + error = ENOSPC;
448 + break;
449 + }
450 + /* else it's a read, fall through to zero case */
451 + /*FALLTHROUGH*/
452 +
441 453 case M_ZERO:
442 454 if (rw == UIO_READ) {
443 455 label_t ljb;
444 456
445 457 if (on_fault(&ljb)) {
446 458 no_fault();
447 459 error = EFAULT;
448 460 break;
449 461 }
450 462 uzero(iov->iov_base, iov->iov_len);
451 463 no_fault();
452 464 uio->uio_resid -= iov->iov_len;
453 465 uio->uio_loffset += iov->iov_len;
454 466 break;
455 467 }
456 468 /* else it's a write, fall through to NULL case */
457 469 /*FALLTHROUGH*/
458 470
459 471 case M_NULL:
460 472 if (rw == UIO_READ)
461 473 return (0);
462 474 c = iov->iov_len;
463 475 iov->iov_base += c;
464 476 iov->iov_len -= c;
465 477 uio->uio_loffset += c;
466 478 uio->uio_resid -= c;
467 479 break;
468 480
469 481 }
470 482 }
471 483 return (uio->uio_resid == oresid ? error : 0);
472 484 }
473 485
474 486 static int
475 487 mmread(dev_t dev, struct uio *uio, cred_t *cred)
476 488 {
477 489 return (mmrw(dev, uio, UIO_READ, cred));
478 490 }
479 491
480 492 static int
481 493 mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
482 494 {
483 495 return (mmrw(dev, uio, UIO_WRITE, cred));
484 496 }
485 497
486 498 /*
487 499 * Private ioctl for libkvm to support kvm_physaddr().
488 500 * Given an address space and a VA, compute the PA.
489 501 */
490 502 static int
491 503 mmioctl_vtop(intptr_t data)
492 504 {
493 505 #ifdef _SYSCALL32
494 506 mem_vtop32_t vtop32;
495 507 #endif
496 508 mem_vtop_t mem_vtop;
497 509 proc_t *p;
498 510 pfn_t pfn = (pfn_t)PFN_INVALID;
499 511 pid_t pid = 0;
500 512 struct as *as;
501 513 struct seg *seg;
502 514
503 515 if (get_udatamodel() == DATAMODEL_NATIVE) {
504 516 if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
505 517 return (EFAULT);
506 518 }
507 519 #ifdef _SYSCALL32
508 520 else {
509 521 if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
510 522 return (EFAULT);
511 523 mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
512 524 mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;
513 525
514 526 if (mem_vtop.m_as != NULL)
515 527 return (EINVAL);
516 528 }
517 529 #endif
518 530
519 531 if (mem_vtop.m_as == &kas) {
520 532 pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
521 533 } else {
522 534 if (mem_vtop.m_as == NULL) {
523 535 /*
524 536 * Assume the calling process's address space if the
525 537 * caller didn't specify one.
526 538 */
527 539 p = curthread->t_procp;
528 540 if (p == NULL)
529 541 return (EIO);
530 542 mem_vtop.m_as = p->p_as;
531 543 }
532 544
533 545 mutex_enter(&pidlock);
534 546 for (p = practive; p != NULL; p = p->p_next) {
535 547 if (p->p_as == mem_vtop.m_as) {
536 548 pid = p->p_pid;
537 549 break;
538 550 }
539 551 }
540 552 mutex_exit(&pidlock);
541 553 if (p == NULL)
542 554 return (EIO);
543 555 p = sprlock(pid);
544 556 if (p == NULL)
545 557 return (EIO);
546 558 as = p->p_as;
547 559 if (as == mem_vtop.m_as) {
548 560 mutex_exit(&p->p_lock);
549 561 AS_LOCK_ENTER(as, RW_READER);
550 562 for (seg = AS_SEGFIRST(as); seg != NULL;
551 563 seg = AS_SEGNEXT(as, seg))
552 564 if ((uintptr_t)mem_vtop.m_va -
553 565 (uintptr_t)seg->s_base < seg->s_size)
554 566 break;
555 567 if (seg != NULL)
556 568 pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
557 569 AS_LOCK_EXIT(as);
558 570 mutex_enter(&p->p_lock);
559 571 }
560 572 sprunlock(p);
561 573 }
562 574 mem_vtop.m_pfn = pfn;
563 575 if (pfn == PFN_INVALID)
564 576 return (EIO);
565 577
566 578 if (get_udatamodel() == DATAMODEL_NATIVE) {
567 579 if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
568 580 return (EFAULT);
569 581 }
570 582 #ifdef _SYSCALL32
571 583 else {
572 584 vtop32.m_pfn = mem_vtop.m_pfn;
573 585 if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
574 586 return (EFAULT);
575 587 }
576 588 #endif
577 589
578 590 return (0);
579 591 }
580 592
581 593 /*
582 594 * Given a PA, execute the given page retire command on it.
583 595 */
584 596 static int
585 597 mmioctl_page_retire(int cmd, intptr_t data)
586 598 {
587 599 extern int page_retire_test(void);
588 600 uint64_t pa;
589 601
590 602 if (copyin((void *)data, &pa, sizeof (uint64_t))) {
591 603 return (EFAULT);
592 604 }
593 605
594 606 switch (cmd) {
595 607 case MEM_PAGE_ISRETIRED:
596 608 return (page_retire_check(pa, NULL));
597 609
598 610 case MEM_PAGE_UNRETIRE:
599 611 return (page_unretire(pa));
600 612
601 613 case MEM_PAGE_RETIRE:
602 614 return (page_retire(pa, PR_FMA));
603 615
604 616 case MEM_PAGE_RETIRE_MCE:
605 617 return (page_retire(pa, PR_MCE));
606 618
607 619 case MEM_PAGE_RETIRE_UE:
608 620 return (page_retire(pa, PR_UE));
609 621
610 622 case MEM_PAGE_GETERRORS:
611 623 {
612 624 uint64_t page_errors;
613 625 int rc = page_retire_check(pa, &page_errors);
614 626 if (copyout(&page_errors, (void *)data,
615 627 sizeof (uint64_t))) {
616 628 return (EFAULT);
617 629 }
618 630 return (rc);
619 631 }
620 632
621 633 case MEM_PAGE_RETIRE_TEST:
622 634 return (page_retire_test());
623 635
624 636 }
625 637
626 638 return (EINVAL);
627 639 }
628 640
629 641 #ifdef __sparc
630 642 /*
631 643 * Given a syndrome, syndrome type, and address return the
632 644 * associated memory name in the provided data buffer.
633 645 */
634 646 static int
635 647 mmioctl_get_mem_name(intptr_t data)
636 648 {
637 649 mem_name_t mem_name;
638 650 void *buf;
639 651 size_t bufsize;
640 652 int len, err;
641 653
642 654 if ((bufsize = cpu_get_name_bufsize()) == 0)
643 655 return (ENOTSUP);
644 656
645 657 if ((err = mm_read_mem_name(data, &mem_name)) < 0)
646 658 return (err);
647 659
648 660 buf = kmem_alloc(bufsize, KM_SLEEP);
649 661
650 662 /*
651 663 * Call into cpu specific code to do the lookup.
652 664 */
653 665 if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
654 666 mem_name.m_addr, buf, bufsize, &len)) != 0) {
655 667 kmem_free(buf, bufsize);
656 668 return (err);
657 669 }
658 670
659 671 if (len >= mem_name.m_namelen) {
660 672 kmem_free(buf, bufsize);
661 673 return (ENOSPC);
662 674 }
663 675
664 676 if (copyoutstr(buf, (char *)mem_name.m_name,
665 677 mem_name.m_namelen, NULL) != 0) {
666 678 kmem_free(buf, bufsize);
667 679 return (EFAULT);
668 680 }
669 681
670 682 kmem_free(buf, bufsize);
671 683 return (0);
672 684 }
673 685
674 686 /*
675 687 * Given a syndrome and address return information about the associated memory.
676 688 */
677 689 static int
678 690 mmioctl_get_mem_info(intptr_t data)
679 691 {
680 692 mem_info_t mem_info;
681 693 int err;
682 694
683 695 if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
684 696 return (EFAULT);
685 697
686 698 if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
687 699 &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
688 700 &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
689 701 return (err);
690 702
691 703 if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
692 704 return (EFAULT);
693 705
694 706 return (0);
695 707 }
696 708
697 709 /*
698 710 * Given a memory name, return its associated serial id
699 711 */
700 712 static int
701 713 mmioctl_get_mem_sid(intptr_t data)
702 714 {
703 715 mem_name_t mem_name;
704 716 void *buf;
705 717 void *name;
706 718 size_t name_len;
707 719 size_t bufsize;
708 720 int len, err;
709 721
710 722 if ((bufsize = cpu_get_name_bufsize()) == 0)
711 723 return (ENOTSUP);
712 724
713 725 if ((err = mm_read_mem_name(data, &mem_name)) < 0)
714 726 return (err);
715 727
716 728 buf = kmem_alloc(bufsize, KM_SLEEP);
717 729
718 730 if (mem_name.m_namelen > 1024)
719 731 mem_name.m_namelen = 1024; /* cap at 1024 bytes */
720 732
721 733 name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);
722 734
723 735 if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
724 736 mem_name.m_namelen, &name_len)) != 0) {
725 737 kmem_free(buf, bufsize);
726 738 kmem_free(name, mem_name.m_namelen);
727 739 return (err);
728 740 }
729 741
730 742 /*
731 743 * Call into cpu specific code to do the lookup.
732 744 */
733 745 if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
734 746 kmem_free(buf, bufsize);
735 747 kmem_free(name, mem_name.m_namelen);
736 748 return (err);
737 749 }
738 750
739 751 if (len > mem_name.m_sidlen) {
740 752 kmem_free(buf, bufsize);
741 753 kmem_free(name, mem_name.m_namelen);
742 754 return (ENAMETOOLONG);
743 755 }
744 756
745 757 if (copyoutstr(buf, (char *)mem_name.m_sid,
746 758 mem_name.m_sidlen, NULL) != 0) {
747 759 kmem_free(buf, bufsize);
748 760 kmem_free(name, mem_name.m_namelen);
749 761 return (EFAULT);
750 762 }
751 763
752 764 kmem_free(buf, bufsize);
753 765 kmem_free(name, mem_name.m_namelen);
754 766 return (0);
755 767 }
756 768 #endif /* __sparc */
757 769
758 770 /*
759 771 * Private ioctls for
760 772 * libkvm to support kvm_physaddr().
761 773 * FMA support for page_retire() and memory attribute information.
762 774 */
763 775 /*ARGSUSED*/
764 776 static int
765 777 mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
766 778 {
767 779 if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
768 780 (cmd != MEM_VTOP && getminor(dev) != M_MEM))
769 781 return (ENXIO);
770 782
771 783 switch (cmd) {
772 784 case MEM_VTOP:
773 785 return (mmioctl_vtop(data));
774 786
775 787 case MEM_PAGE_RETIRE:
776 788 case MEM_PAGE_ISRETIRED:
777 789 case MEM_PAGE_UNRETIRE:
778 790 case MEM_PAGE_RETIRE_MCE:
779 791 case MEM_PAGE_RETIRE_UE:
780 792 case MEM_PAGE_GETERRORS:
781 793 case MEM_PAGE_RETIRE_TEST:
782 794 return (mmioctl_page_retire(cmd, data));
783 795
784 796 #ifdef __sparc
785 797 case MEM_NAME:
786 798 return (mmioctl_get_mem_name(data));
787 799
788 800 case MEM_INFO:
789 801 return (mmioctl_get_mem_info(data));
790 802
791 803 case MEM_SID:
792 804 return (mmioctl_get_mem_sid(data));
793 805 #else
794 806 case MEM_NAME:
795 807 case MEM_INFO:
796 808 case MEM_SID:
797 809 return (ENOTSUP);
798 810 #endif /* __sparc */
799 811 }
800 812 return (ENXIO);
801 813 }
802 814
803 815 /*ARGSUSED2*/
804 816 static int
805 817 mmmmap(dev_t dev, off_t off, int prot)
806 818 {
807 819 pfn_t pf;
808 820 struct memlist *pmem;
809 821 minor_t minor = getminor(dev);
810 822
811 823 switch (minor) {
812 824 case M_MEM:
813 825 pf = btop(off);
814 826 memlist_read_lock();
815 827 for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
816 828 if (pf >= BTOP(pmem->ml_address) &&
817 829 pf < BTOP(pmem->ml_address + pmem->ml_size)) {
818 830 memlist_read_unlock();
819 831 return (impl_obmem_pfnum(pf));
820 832 }
821 833 }
822 834 memlist_read_unlock();
823 835 break;
824 836
825 837 case M_KMEM:
826 838 case M_ALLKMEM:
827 839 /* no longer supported with KPR */
828 840 return (-1);
829 841
842 + case M_FULL:
830 843 case M_ZERO:
831 844 /*
832 845 * We shouldn't be mmap'ing to /dev/zero here as
833 846 * mmsegmap() should have already converted
834 847 * a mapping request for this device to a mapping
835 848 * using seg_vn for anonymous memory.
836 849 */
837 850 break;
838 851
839 852 }
840 853 return (-1);
841 854 }
842 855
843 856 /*
844 857 * This function is called when a memory device is mmap'ed.
845 858 * Set up the mapping to the correct device driver.
846 859 */
847 860 static int
848 861 mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
849 862 uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
850 863 {
851 864 struct segvn_crargs vn_a;
852 865 struct segdev_crargs dev_a;
853 866 int error;
854 867 minor_t minor;
855 868 off_t i;
856 869
857 870 minor = getminor(dev);
858 871
859 872 as_rangelock(as);
860 873 /*
861 874 * No need to worry about vac alignment on /dev/zero
862 875 * since this is a "clone" object that doesn't yet exist.
863 876 */
864 877 error = choose_addr(as, addrp, len, off,
865 878 (minor == M_MEM) || (minor == M_KMEM), flags);
866 879 if (error != 0) {
867 880 as_rangeunlock(as);
868 881 return (error);
869 882 }
870 883
871 884 switch (minor) {
872 885 case M_MEM:
873 886 /* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
874 887 if ((flags & MAP_TYPE) != MAP_SHARED) {
875 888 as_rangeunlock(as);
876 889 return (EINVAL);
877 890 }
878 891
879 892 /*
880 893 * Check to ensure that the entire range is
881 894 * legal and we are not trying to map in
882 895 * more than the device will let us.
883 896 */
884 897 for (i = 0; i < len; i += PAGESIZE) {
885 898 if (mmmmap(dev, off + i, maxprot) == -1) {
886 899 as_rangeunlock(as);
887 900 return (ENXIO);
888 901 }
889 902 }
890 903
891 904 /*
892 905 * Use seg_dev segment driver for /dev/mem mapping.
893 906 */
894 907 dev_a.mapfunc = mmmmap;
895 908 dev_a.dev = dev;
896 909 dev_a.offset = off;
897 910 dev_a.type = (flags & MAP_TYPE);
898 911 dev_a.prot = (uchar_t)prot;
899 912 dev_a.maxprot = (uchar_t)maxprot;
900 913 dev_a.hat_attr = 0;
901 914
902 915 /*
903 916 * Make /dev/mem mappings non-consistent since we can't
904 917 * alias pages that don't have page structs behind them,
905 918 * such as kernel stack pages. If someone mmap()s a kernel
906 919 * stack page and if we give him a tte with cv, a line from
907 920 * that page can get into both pages of the spitfire d$.
908 921 * But snoop from another processor will only invalidate
909 922 * the first page. This later caused kernel (xc_attention)
910 923 * to go into an infinite loop at pil 13 and no interrupts
911 924 * could come in. See 1203630.
912 925 *
913 926 */
914 927 dev_a.hat_flags = HAT_LOAD_NOCONSIST;
915 928 dev_a.devmap_data = NULL;
916 929
917 930 error = as_map(as, *addrp, len, segdev_create, &dev_a);
918 931 break;
919 932
920 933 case M_ZERO:
921 934 /*
922 935 * Use seg_vn segment driver for /dev/zero mapping.
923 936 * Passing in a NULL amp gives us the "cloning" effect.
924 937 */
925 938 vn_a.vp = NULL;
926 939 vn_a.offset = 0;
927 940 vn_a.type = (flags & MAP_TYPE);
928 941 vn_a.prot = prot;
929 942 vn_a.maxprot = maxprot;
930 943 vn_a.flags = flags & ~MAP_TYPE;
931 944 vn_a.cred = cred;
932 945 vn_a.amp = NULL;
933 946 vn_a.szc = 0;
934 947 vn_a.lgrp_mem_policy_flags = 0;
935 948 error = as_map(as, *addrp, len, segvn_create, &vn_a);
936 949 break;
937 950
938 951 case M_KMEM:
939 952 case M_ALLKMEM:
940 953 /* No longer supported with KPR. */
941 954 error = ENXIO;
942 955 break;
943 956
944 957 case M_NULL:
945 958 /*
946 959 * Use seg_dev segment driver for /dev/null mapping.
947 960 */
948 961 dev_a.mapfunc = mmmmap;
949 962 dev_a.dev = dev;
950 963 dev_a.offset = off;
951 964 dev_a.type = 0; /* neither PRIVATE nor SHARED */
952 965 dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
953 966 dev_a.hat_attr = 0;
954 967 dev_a.hat_flags = 0;
955 968 error = as_map(as, *addrp, len, segdev_create, &dev_a);
956 969 break;
957 970
958 971 default:
959 972 error = ENXIO;
960 973 }
961 974
962 975 as_rangeunlock(as);
963 976 return (error);
964 977 }
965 978
966 979 static struct cb_ops mm_cb_ops = {
967 980 mmopen, /* open */
968 981 nulldev, /* close */
969 982 nodev, /* strategy */
970 983 nodev, /* print */
971 984 nodev, /* dump */
972 985 mmread, /* read */
973 986 mmwrite, /* write */
974 987 mmioctl, /* ioctl */
975 988 nodev, /* devmap */
976 989 mmmmap, /* mmap */
977 990 mmsegmap, /* segmap */
978 991 mmchpoll, /* poll */
979 992 mmpropop, /* prop_op */
980 993 0, /* streamtab */
981 994 D_NEW | D_MP | D_64BIT | D_U64BIT
982 995 };
983 996
984 997 static struct dev_ops mm_ops = {
985 998 DEVO_REV, /* devo_rev, */
986 999 0, /* refcnt */
987 1000 mm_info, /* get_dev_info */
988 1001 nulldev, /* identify */
989 1002 nulldev, /* probe */
990 1003 mm_attach, /* attach */
991 1004 nodev, /* detach */
992 1005 nodev, /* reset */
993 1006 &mm_cb_ops, /* driver operations */
994 1007 (struct bus_ops *)0, /* bus operations */
995 1008 NULL, /* power */
996 1009 ddi_quiesce_not_needed, /* quiesce */
997 1010 };
998 1011
999 1012 static struct modldrv modldrv = {
1000 1013 &mod_driverops, "memory driver", &mm_ops,
1001 1014 };
1002 1015
1003 1016 static struct modlinkage modlinkage = {
1004 1017 MODREV_1, &modldrv, NULL
1005 1018 };
1006 1019
1007 1020 int
1008 1021 _init(void)
1009 1022 {
1010 1023 return (mod_install(&modlinkage));
1011 1024 }
1012 1025
1013 1026 int
1014 1027 _info(struct modinfo *modinfop)
1015 1028 {
1016 1029 return (mod_info(&modlinkage, modinfop));
1017 1030 }
1018 1031
1019 1032 int
1020 1033 _fini(void)
1021 1034 {
1022 1035 return (mod_remove(&modlinkage));
1023 1036 }
1024 1037
1025 1038 static int
1026 1039 mm_kstat_update(kstat_t *ksp, int rw)
1027 1040 {
1028 1041 struct memlist *pmem;
1029 1042 uint_t count;
1030 1043
1031 1044 if (rw == KSTAT_WRITE)
1032 1045 return (EACCES);
1033 1046
1034 1047 count = 0;
1035 1048 memlist_read_lock();
1036 1049 for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
1037 1050 count++;
1038 1051 }
1039 1052 memlist_read_unlock();
1040 1053
1041 1054 ksp->ks_ndata = count;
1042 1055 ksp->ks_data_size = count * 2 * sizeof (uint64_t);
1043 1056
1044 1057 return (0);
1045 1058 }
1046 1059
1047 1060 static int
1048 1061 mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
1049 1062 {
1050 1063 struct memlist *pmem;
1051 1064 struct memunit {
1052 1065 uint64_t address;
1053 1066 uint64_t size;
1054 1067 } *kspmem;
1055 1068
1056 1069 if (rw == KSTAT_WRITE)
1057 1070 return (EACCES);
1058 1071
1059 1072 ksp->ks_snaptime = gethrtime();
1060 1073
1061 1074 kspmem = (struct memunit *)buf;
1062 1075 memlist_read_lock();
1063 1076 for (pmem = phys_install; pmem != NULL;
1064 1077 pmem = pmem->ml_next, kspmem++) {
1065 1078 if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
1066 1079 break;
1067 1080 kspmem->address = pmem->ml_address;
1068 1081 kspmem->size = pmem->ml_size;
1069 1082 }
1070 1083 memlist_read_unlock();
1071 1084
1072 1085 return (0);
1073 1086 }
1074 1087
1075 1088 /*
1076 1089 * Read a mem_name_t from user-space and store it in the mem_name_t
1077 1090 * pointed to by the mem_name argument.
1078 1091 */
1079 1092 static int
1080 1093 mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
1081 1094 {
1082 1095 if (get_udatamodel() == DATAMODEL_NATIVE) {
1083 1096 if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
1084 1097 return (EFAULT);
1085 1098 }
1086 1099 #ifdef _SYSCALL32
1087 1100 else {
1088 1101 mem_name32_t mem_name32;
1089 1102
1090 1103 if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
1091 1104 return (EFAULT);
1092 1105 mem_name->m_addr = mem_name32.m_addr;
1093 1106 mem_name->m_synd = mem_name32.m_synd;
1094 1107 mem_name->m_type[0] = mem_name32.m_type[0];
1095 1108 mem_name->m_type[1] = mem_name32.m_type[1];
1096 1109 mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
1097 1110 mem_name->m_namelen = (size_t)mem_name32.m_namelen;
1098 1111 mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
1099 1112 mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
1100 1113 }
1101 1114 #endif /* _SYSCALL32 */
1102 1115
1103 1116 return (0);
1104 1117 }
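
A note on the mmchpoll() change above: adding M_FULL to the poll switch means the device, like the other standard mm minors, always reports itself ready for I/O. A quick hedged check of that behavior (again illustrative, with the device path assumed):

#include <poll.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct pollfd pfd;
	int fd = open("/dev/full", O_RDWR);

	if (fd < 0) {
		perror("open /dev/full");
		return (1);
	}

	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT;
	pfd.revents = 0;

	/* mmchpoll() reports M_FULL as immediately ready, so this */
	/* poll() should return 1 with both events set in revents. */
	if (poll(&pfd, 1, 0) == 1)
		(void) printf("revents = 0x%x\n", (unsigned int)pfd.revents);

	(void) close(fd);
	return (0);
}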