42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/user.h>
45 #include <sys/proc.h>
46 #include <sys/kmem.h>
47 #include <sys/vmem.h>
48 #include <sys/buf.h>
49 #include <sys/cpuvar.h>
50 #include <sys/lgrp.h>
51 #include <sys/disp.h>
52 #include <sys/vm.h>
53 #include <sys/mman.h>
54 #include <sys/vnode.h>
55 #include <sys/cred.h>
56 #include <sys/exec.h>
57 #include <sys/exechdr.h>
58 #include <sys/debug.h>
59 #include <sys/vmsystm.h>
60 #include <sys/swap.h>
61 #include <sys/dumphdr.h>
62
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_kp.h>
67 #include <vm/seg_vn.h>
68 #include <vm/page.h>
69 #include <vm/seg_kmem.h>
70 #include <vm/seg_kpm.h>
71 #include <vm/vm_dep.h>
72
73 #include <sys/cpu.h>
74 #include <sys/vm_machparam.h>
75 #include <sys/memlist.h>
76 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
77 #include <vm/hat_i86.h>
78 #include <sys/x86_archext.h>
79 #include <sys/elf_386.h>
80 #include <sys/cmn_err.h>
81 #include <sys/archsystm.h>
82 #include <sys/machsystm.h>
83
84 #include <sys/vtrace.h>
85 #include <sys/ddidmareq.h>
86 #include <sys/promif.h>
87 #include <sys/memnode.h>
88 #include <sys/stack.h>
89 #include <util/qsort.h>
90 #include <sys/taskq.h>
91
92 #ifdef __xpv
93
94 #include <sys/hypervisor.h>
95 #include <sys/xen_mmu.h>
96 #include <sys/balloon_impl.h>
97
98 /*
99 * domain 0 pages usable for DMA are pre-allocated and kept in
100 * distinct lists, ordered by increasing mfn.
101 */
102 static kmutex_t io_pool_lock;
620 }
621
622 void
623 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
624 {
625 struct proc *p = curproc;
626 caddr_t userlimit = (flags & _MAP_LOW32) ?
627 (caddr_t)_userlimit32 : p->p_as->a_userlimit;
628
629 map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
630 }
631
632 /*ARGSUSED*/
633 int
634 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
635 {
636 return (0);
637 }
638
639 /*
640 * map_addr_proc() is the routine called when the system is to
641 * choose an address for the user. We will pick an address
642 * range which is the highest available below userlimit.
643 *
644 * Every mapping will have a redzone of a single page on either side of
645 * the request. This is done to leave one page unmapped between segments.
646 * This is not required, but it's useful for the user because if their
647 * program strays across a segment boundary, it will catch a fault
648 * immediately, making debugging a little easier. Currently the redzone
649 * is mandatory.
650 *
651 * addrp is a value/result parameter.
652 * On input it is a hint from the user to be used in a completely
653 * machine dependent fashion. We decide to completely ignore this hint.
654 * If MAP_ALIGN was specified, addrp contains the minimal alignment, which
655 * must be some "power of two" multiple of pagesize.
656 *
657 * On output it is NULL if no address can be found in the current
658 * process's address space or else an address that is currently
659 * not mapped for len bytes with a page of red zone on either side.
735 * For 32-bit processes, only those which have specified
736 * MAP_ALIGN and an addr will be aligned on a larger page size.
737 * Aligning unconditionally could waste up to 1G of a 32-bit
738 * process's address space.
739 */
740 int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
741 mmu.umax_page_level;
742
743 while (lvl && len < LEVEL_SIZE(lvl))
744 --lvl;
745
746 align_amount = LEVEL_SIZE(lvl);
747 }
748 if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
749 align_amount = (uintptr_t)*addrp;
750
751 ASSERT(ISP2(align_amount));
752 ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
753
754 off = off & (align_amount - 1);
755 /*
756 * Look for a large enough hole starting below userlimit.
757 * After finding it, use the upper part.
758 */
759 if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
760 PAGESIZE, off) == 0) {
761 caddr_t as_addr;
762
763 /*
764 * addr is the highest possible address to use since we have
765 * a PAGESIZE redzone at the beginning and end.
766 */
767 addr = base + slen - (PAGESIZE + len);
768 as_addr = addr;
769 /*
770 * Round address DOWN to the alignment amount and
771 * add the offset in.
772 * If the result exceeds as_addr, the request plus its
773 * redzone would no longer fit in the gap, so we must adjust
774 * down by the alignment amount.
775 */
776 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
777 addr += (uintptr_t)off;
778 if (addr > as_addr) {
779 addr -= align_amount;
780 }
781
782 ASSERT(addr > base);
783 ASSERT(addr + len < base + slen);
784 ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
785 ((uintptr_t)(off)));
786 *addrp = addr;
787 } else {
788 *addrp = NULL; /* no more virtual space */
789 }
790 }
791
792 int valid_va_range_aligned_wraparound;
793
794 /*
795 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
796 * addresses at least "minlen" long, where the base of the range is at "off"
797 * phase from an "align" boundary and there is space for a "redzone"-sized
798 * redzone on either side of the range. On success, 1 is returned and *basep
799 * and *lenp are adjusted to describe the acceptable range (including
800 * the redzone). On failure, 0 is returned.
801 */
887 }
888
889 *basep = (caddr_t)lo;
890 *lenp = hi - lo;
891 return (1);
892 }
893
894 /*
895 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
896 * addresses at least "minlen" long. On success, 1 is returned and *basep
897 * and *lenp are adjusted to describe the acceptable range. On failure, 0
898 * is returned.
899 */
900 int
901 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
902 {
903 return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
904 }
905
906 /*
907 * Determine whether [addr, addr+len] are valid user addresses.
908 */
909 /*ARGSUSED*/
910 int
911 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
912 caddr_t userlimit)
913 {
914 caddr_t eaddr = addr + len;
915
916 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
917 return (RANGE_BADADDR);
918
919 #if defined(__amd64)
920 /*
921 * Check for the amd64 VA hole (the non-canonical address range)
922 */
923 if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
924 return (RANGE_BADADDR);
925 #endif
926
927 return (RANGE_OKAY);
928 }
929
930 /*
931 * Return 1 if the page frame is onboard memory, else 0.
932 */
933 int
934 pf_is_memory(pfn_t pf)
935 {
936 if (pfn_is_foreign(pf))
937 return (0);
938 return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
3909
3910 hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
3911 cpup->cpu_caddr2pte = 0;
3912 vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
3913 cpup->cpu_caddr2 = 0;
3914
3915 hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
3916 cpup->cpu_caddr1pte = 0;
3917 vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
3918 cpup->cpu_caddr1 = 0;
3919 }
3920
3921 /*
3922 * Function for flushing D-cache when performing module relocations
3923 * to an alternate mapping. Unnecessary on Intel / AMD platforms.
3924 */
3925 void
3926 dcache_flushall()
3927 {}
3928
3929 size_t
3930 exec_get_spslew(void)
3931 {
3932 return (0);
3933 }
3934
3935 /*
3936 * Allocate a memory page. The argument 'seed' can be any pseudo-random
3937 * number to vary where the pages come from. This is quite a hacked up
3938 * method -- it works for now, but really needs to be fixed up a bit.
3939 *
3940 * We currently use page_create_va() on the kvp with fake offsets,
3941 * segments and virt address. This is pretty bogus, but was copied from the
3942 * old hat_i86.c code. A better approach would be to specify either mnode
3943 * random or mnode local and take a page from whatever color has the MOST
3944 * available - this would have a minimal impact on page coloring.
3945 */
3946 page_t *
3947 page_get_physical(uintptr_t seed)
3948 {
3949 page_t *pp;
3950 u_offset_t offset;
3951 static struct seg tmpseg;
3952 static uintptr_t ctr = 0;
3953
3954 /*
|
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/user.h>
45 #include <sys/proc.h>
46 #include <sys/kmem.h>
47 #include <sys/vmem.h>
48 #include <sys/buf.h>
49 #include <sys/cpuvar.h>
50 #include <sys/lgrp.h>
51 #include <sys/disp.h>
52 #include <sys/vm.h>
53 #include <sys/mman.h>
54 #include <sys/vnode.h>
55 #include <sys/cred.h>
56 #include <sys/exec.h>
57 #include <sys/exechdr.h>
58 #include <sys/debug.h>
59 #include <sys/vmsystm.h>
60 #include <sys/swap.h>
61 #include <sys/dumphdr.h>
62 #include <sys/random.h>
63
64 #include <vm/hat.h>
65 #include <vm/as.h>
66 #include <vm/seg.h>
67 #include <vm/seg_kp.h>
68 #include <vm/seg_vn.h>
69 #include <vm/page.h>
70 #include <vm/seg_kmem.h>
71 #include <vm/seg_kpm.h>
72 #include <vm/vm_dep.h>
73
74 #include <sys/cpu.h>
75 #include <sys/vm_machparam.h>
76 #include <sys/memlist.h>
77 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
78 #include <vm/hat_i86.h>
79 #include <sys/x86_archext.h>
80 #include <sys/elf_386.h>
81 #include <sys/cmn_err.h>
82 #include <sys/archsystm.h>
83 #include <sys/machsystm.h>
84 #include <sys/secflags.h>
85
86 #include <sys/vtrace.h>
87 #include <sys/ddidmareq.h>
88 #include <sys/promif.h>
89 #include <sys/memnode.h>
90 #include <sys/stack.h>
91 #include <util/qsort.h>
92 #include <sys/taskq.h>
93
94 #ifdef __xpv
95
96 #include <sys/hypervisor.h>
97 #include <sys/xen_mmu.h>
98 #include <sys/balloon_impl.h>
99
100 /*
101 * domain 0 pages usable for DMA are pre-allocated and kept in
102 * distinct lists, ordered by increasing mfn.
103 */
104 static kmutex_t io_pool_lock;
622 }
623
624 void
625 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
626 {
627 struct proc *p = curproc;
628 caddr_t userlimit = (flags & _MAP_LOW32) ?
629 (caddr_t)_userlimit32 : p->p_as->a_userlimit;
630
631 map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
632 }
633
634 /*ARGSUSED*/
635 int
636 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
637 {
638 return (0);
639 }
640
641 /*
642 * The maximum amount a randomized mapping will be slewed. We should perhaps
643 * arrange things so these tunables can be separate for mmap, mmapobj, and
644 * ld.so.
645 */
646 size_t aslr_max_map_skew = 256 * 1024 * 1024; /* 256MB */
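/*
 * Tuning sketch (hypothetical, not part of the original source): as an
 * ordinary kernel global this cap could, for example, be halved from
 * /etc/system with a line such as
 *
 *	set aslr_max_map_skew = 0x8000000
 */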
647
648 /*
649 * map_addr_proc() is the routine called when the system is to
650 * choose an address for the user. We will pick an address
651 * range which is the highest available below userlimit.
652 *
653 * Every mapping will have a redzone of a single page on either side of
654 * the request. This is done to leave one page unmapped between segments.
655 * This is not required, but it's useful for the user because if their
656 * program strays across a segment boundary, it will catch a fault
657 * immediately, making debugging a little easier. Currently the redzone
658 * is mandatory.
659 *
660 * addrp is a value/result parameter.
661 * On input it is a hint from the user to be used in a completely
662 * machine dependent fashion. We decide to completely ignore this hint.
663 * If MAP_ALIGN was specified, addrp contains the minimal alignment, which
664 * must be some "power of two" multiple of pagesize.
665 *
666 * On output it is NULL if no address can be found in the current
667 * process's address space or else an address that is currently
668 * not mapped for len bytes with a page of red zone on either side.
744 * For 32-bit processes, only those which have specified
745 * MAP_ALIGN and an addr will be aligned on a larger page size.
746 * Aligning unconditionally could waste up to 1G of a 32-bit
747 * process's address space.
748 */
749 int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
750 mmu.umax_page_level;
751
752 while (lvl && len < LEVEL_SIZE(lvl))
753 --lvl;
754
755 align_amount = LEVEL_SIZE(lvl);
756 }
757 if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
758 align_amount = (uintptr_t)*addrp;
759
760 ASSERT(ISP2(align_amount));
761 ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
762
763 off = off & (align_amount - 1);
764
765 /*
766 * Look for a large enough hole starting below userlimit.
767 * After finding it, use the upper part.
768 */
769 if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
770 PAGESIZE, off) == 0) {
771 caddr_t as_addr;
772
773 /*
774 * addr is the highest possible address to use since we have
775 * a PAGESIZE redzone at the beginning and end.
776 */
777 addr = base + slen - (PAGESIZE + len);
778 as_addr = addr;
779 /*
780 * Round address DOWN to the alignment amount and
781 * add the offset in.
782 * If the result exceeds as_addr, the request plus its
783 * redzone would no longer fit in the gap, so we must adjust
784 * down by the alignment amount.
785 */
786 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
787 addr += (uintptr_t)off;
788 if (addr > as_addr) {
789 addr -= align_amount;
790 }
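/*
 * Worked example (illustrative numbers, not from the original source):
 * with base + slen == 0x20000000, len == 0x5000, align_amount == 0x10000
 * and off == 0x3000, the initial addr is 0x1fffa000; masking gives
 * 0x1fff0000 and adding off gives 0x1fff3000, which is below as_addr
 * (0x1fffa000), so no further adjustment is needed and the top redzone
 * still fits.
 */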
791
792 /*
793 * If randomization is requested, slew the allocation
794 * backwards, within the same gap, by a random amount.
795 */
796 if (flags & _MAP_RANDOMIZE) {
797 uint32_t slew;
798
799 (void) random_get_pseudo_bytes((uint8_t *)&slew,
800 sizeof (slew));
801
802 slew = slew % MIN(aslr_max_map_skew, (addr - base));
803 addr -= P2ALIGN(slew, align_amount);
804 }
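/*
 * Illustrative note (not from the original source): P2ALIGN() truncates
 * slew down to a multiple of align_amount, so the slewed address keeps
 * the requested alignment and offset, and because slew < (addr - base)
 * it also stays above base.
 */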
805
806 ASSERT(addr > base);
807 ASSERT(addr + len < base + slen);
808 ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
809 ((uintptr_t)(off)));
810 *addrp = addr;
811 } else {
812 *addrp = NULL; /* no more virtual space */
813 }
814 }
815
816 int valid_va_range_aligned_wraparound;
817
818 /*
819 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
820 * addresses at least "minlen" long, where the base of the range is at "off"
821 * phase from an "align" boundary and there is space for a "redzone"-sized
822 * redzone on either side of the range. On success, 1 is returned and *basep
823 * and *lenp are adjusted to describe the acceptable range (including
824 * the redzone). On failure, 0 is returned.
825 */
911 }
912
913 *basep = (caddr_t)lo;
914 *lenp = hi - lo;
915 return (1);
916 }
917
918 /*
919 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
920 * addresses at least "minlen" long. On success, 1 is returned and *basep
921 * and *lenp are adjusted to describe the acceptable range. On failure, 0
922 * is returned.
923 */
924 int
925 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
926 {
927 return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
928 }
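/*
 * Usage sketch (hypothetical caller, not from the original source):
 * probing whether at least two pages fit somewhere in [lo, hi),
 * searching from the low end:
 *
 *	caddr_t base = lo;
 *	size_t len = (size_t)(hi - lo);
 *
 *	if (valid_va_range(&base, &len, 2 * PAGESIZE, AH_LO) == 0)
 *		return (ENOMEM);
 *
 * On success, base and len bound an acceptable sub-range.
 */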
929
930 /*
931 * Default to forbidding the first 64k of address space. This protects most
932 * reasonably sized structures from dereferences through NULL:
933 * ((foo_t *)0)->bar
934 */
935 uintptr_t forbidden_null_mapping_sz = 0x10000;
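/*
 * Tuning sketch (hypothetical, not part of the original source): this is
 * also a plain kernel global, so the forbidden window could be shrunk to
 * 8k from /etc/system with
 *
 *	set forbidden_null_mapping_sz = 0x2000
 */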
936
937 /*
938 * Determine whether [addr, addr+len] are valid user addresses.
939 */
940 /*ARGSUSED*/
941 int
942 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
943 caddr_t userlimit)
944 {
945 caddr_t eaddr = addr + len;
946
947 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
948 return (RANGE_BADADDR);
949
950 if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
951 secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
952 return (RANGE_BADADDR);
953
954 #if defined(__amd64)
955 /*
956 * Check for the amd64 VA hole (the non-canonical address range)
957 */
958 if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
959 return (RANGE_BADADDR);
960 #endif
961
962 return (RANGE_OKAY);
963 }
964
965 /*
966 * Return 1 if the page frame is onboard memory, else 0.
967 */
968 int
969 pf_is_memory(pfn_t pf)
970 {
971 if (pfn_is_foreign(pf))
972 return (0);
973 return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
3944
3945 hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
3946 cpup->cpu_caddr2pte = 0;
3947 vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
3948 cpup->cpu_caddr2 = 0;
3949
3950 hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
3951 cpup->cpu_caddr1pte = 0;
3952 vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
3953 cpup->cpu_caddr1 = 0;
3954 }
3955
3956 /*
3957 * Function for flushing D-cache when performing module relocations
3958 * to an alternate mapping. Unnecessary on Intel / AMD platforms.
3959 */
3960 void
3961 dcache_flushall()
3962 {}
3963
3964 /*
3965 * Allocate a memory page. The argument 'seed' can be any pseudo-random
3966 * number to vary where the pages come from. This is quite a hacked up
3967 * method -- it works for now, but really needs to be fixed up a bit.
3968 *
3969 * We currently use page_create_va() on the kvp with fake offsets,
3970 * segments and virt address. This is pretty bogus, but was copied from the
3971 * old hat_i86.c code. A better approach would be to specify either mnode
3972 * random or mnode local and take a page from whatever color has the MOST
3973 * available - this would have a minimal impact on page coloring.
3974 */
3975 page_t *
3976 page_get_physical(uintptr_t seed)
3977 {
3978 page_t *pp;
3979 u_offset_t offset;
3980 static struct seg tmpseg;
3981 static uintptr_t ctr = 0;
3982
3983 /*