19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright (c) 2009, Intel Corporation
28 * All rights reserved.
29 */
30
31 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
32 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
33 /* All Rights Reserved */
34
35 /* Copyright (c) 1987, 1988 Microsoft Corporation */
36 /* All Rights Reserved */
37
38 /*
39 * Copyright 2016 Joyent, Inc.
40 */
41
42 #include <sys/errno.h>
43 #include <sys/asm_linkage.h>
44
45 #if defined(__lint)
46 #include <sys/types.h>
47 #include <sys/systm.h>
48 #else /* __lint */
49 #include "assym.h"
50 #endif /* __lint */
51
52 #define KCOPY_MIN_SIZE 128 /* Must be >= 16 bytes */
53 #define XCOPY_MIN_SIZE 128 /* Must be >= 16 bytes */
54 /*
55  * Non-temporal access (NTA) alignment requirement
56 */
57 #define NTA_ALIGN_SIZE 4 /* Must be at least 4-byte aligned */
58 #define NTA_ALIGN_MASK _CONST(NTA_ALIGN_SIZE-1)
59 #define COUNT_ALIGN_SIZE 16 /* Must be at least 16-byte aligned */
849 movzbq -0x1(%rdi), %rcx
850 mov %r8d, -0x7(%rsi)
851 mov %r10w, -0x3(%rsi)
852 mov %cl, -0x1(%rsi)
853 ret
854
855 /*
856 * For large sizes rep smovq is fastest.
857 * Transition point determined experimentally as measured on
858 * Intel Xeon processors (incl. Nehalem and previous generations) and
859 * AMD Opteron. The transition value is patched at boot time to avoid
860  * a memory reference hit.
861 */
862 .globl bcopy_patch_start
863 bcopy_patch_start:
864 cmpq $BCOPY_NHM_REP, %rdx
865 .globl bcopy_patch_end
866 bcopy_patch_end:
867
868 .p2align 4
869 .globl bcopy_ck_size
870 bcopy_ck_size:
871 cmpq $BCOPY_DFLT_REP, %rdx
872 jae L(use_rep)
873
874 /*
875 	 * Align to an 8-byte boundary. Avoids penalties from unaligned stores
876 * as well as from stores spanning cachelines.
877 */
878 test $0x7, %rsi
879 jz L(aligned_loop)
880 test $0x1, %rsi
881 jz 2f
882 movzbq (%rdi), %r8
883 dec %rdx
884 inc %rdi
885 mov %r8b, (%rsi)
886 inc %rsi
887 2:
888 test $0x2, %rsi
889 jz 4f
890 movzwq (%rdi), %r8
939 movslq (%r10,%rdx,4), %rcx
940 leaq (%rcx,%r10,1), %r10
941 jmpq *%r10
942
943 /*
944 * Use rep smovq. Clear remainder via unrolled code
945 */
946 .p2align 4
947 L(use_rep):
948 xchgq %rdi, %rsi /* %rsi = source, %rdi = destination */
949 movq %rdx, %rcx /* %rcx = count */
950 shrq $3, %rcx /* 8-byte word count */
951 rep
952 smovq
953
954 xchgq %rsi, %rdi /* %rdi = src, %rsi = destination */
955 andq $7, %rdx /* remainder */
956 jnz L(do_remainder)
957 ret
958 #undef L
959
960 #ifdef DEBUG
961 /*
962 * Setup frame on the run-time stack. The end of the input argument
963 * area must be aligned on a 16 byte boundary. The stack pointer %rsp,
964 * always points to the end of the latest allocated stack frame.
965 * panic(const char *format, ...) is a varargs function. When a
966 * function taking variable arguments is called, %rax must be set
967 * to eight times the number of floating point parameters passed
968 * to the function in SSE registers.
969 */
970 call_panic:
971 pushq %rbp /* align stack properly */
972 movq %rsp, %rbp
973 xorl %eax, %eax /* no variable arguments */
974 call panic /* %rdi = format string */
975 #endif
976 SET_SIZE(bcopy_altentry)
977 SET_SIZE(bcopy)
978
|
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright (c) 2009, Intel Corporation
28 * All rights reserved.
29 */
30
31 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
32 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
33 /* All Rights Reserved */
34
35 /* Copyright (c) 1987, 1988 Microsoft Corporation */
36 /* All Rights Reserved */
37
38 /*
39 * Copyright (c) 2018 Joyent, Inc.
40 */
41
42 #include <sys/errno.h>
43 #include <sys/asm_linkage.h>
44
45 #if defined(__lint)
46 #include <sys/types.h>
47 #include <sys/systm.h>
48 #else /* __lint */
49 #include "assym.h"
50 #endif /* __lint */
51
52 #define KCOPY_MIN_SIZE 128 /* Must be >= 16 bytes */
53 #define XCOPY_MIN_SIZE 128 /* Must be >= 16 bytes */
54 /*
55  * Non-temporal access (NTA) alignment requirement
56 */
57 #define NTA_ALIGN_SIZE 4 /* Must be at least 4-byte aligned */
58 #define NTA_ALIGN_MASK _CONST(NTA_ALIGN_SIZE-1)
59 #define COUNT_ALIGN_SIZE 16 /* Must be at least 16-byte aligned */
849 movzbq -0x1(%rdi), %rcx
850 mov %r8d, -0x7(%rsi)
851 mov %r10w, -0x3(%rsi)
852 mov %cl, -0x1(%rsi)
853 ret
854
855 /*
856 * For large sizes rep smovq is fastest.
857 * Transition point determined experimentally as measured on
858 * Intel Xeon processors (incl. Nehalem and previous generations) and
859 * AMD Opteron. The transition value is patched at boot time to avoid
860  * a memory reference hit.
861 */
862 .globl bcopy_patch_start
863 bcopy_patch_start:
864 cmpq $BCOPY_NHM_REP, %rdx
865 .globl bcopy_patch_end
866 bcopy_patch_end:
867
868 .p2align 4
869 ALTENTRY(bcopy_ck_size)
870
871 cmpq $BCOPY_DFLT_REP, %rdx
872 jae L(use_rep)
873
874 /*
875 	 * Align to an 8-byte boundary. Avoids penalties from unaligned stores
876 * as well as from stores spanning cachelines.
877 */
878 test $0x7, %rsi
879 jz L(aligned_loop)
880 test $0x1, %rsi
881 jz 2f
882 movzbq (%rdi), %r8
883 dec %rdx
884 inc %rdi
885 mov %r8b, (%rsi)
886 inc %rsi
887 2:
888 test $0x2, %rsi
889 jz 4f
890 movzwq (%rdi), %r8
939 movslq (%r10,%rdx,4), %rcx
940 leaq (%rcx,%r10,1), %r10
941 jmpq *%r10
942
943 /*
944 * Use rep smovq. Clear remainder via unrolled code
945 */
946 .p2align 4
947 L(use_rep):
948 xchgq %rdi, %rsi /* %rsi = source, %rdi = destination */
949 movq %rdx, %rcx /* %rcx = count */
950 shrq $3, %rcx /* 8-byte word count */
951 rep
952 smovq
953
954 xchgq %rsi, %rdi /* %rdi = src, %rsi = destination */
955 andq $7, %rdx /* remainder */
956 jnz L(do_remainder)
957 ret
958 #undef L
959 SET_SIZE(bcopy_ck_size)
960
961 #ifdef DEBUG
962 /*
963 * Setup frame on the run-time stack. The end of the input argument
964 * area must be aligned on a 16 byte boundary. The stack pointer %rsp,
965 * always points to the end of the latest allocated stack frame.
966 * panic(const char *format, ...) is a varargs function. When a
967 * function taking variable arguments is called, %rax must be set
968 * to eight times the number of floating point parameters passed
969 * to the function in SSE registers.
970 */
971 call_panic:
972 pushq %rbp /* align stack properly */
973 movq %rsp, %rbp
974 xorl %eax, %eax /* no variable arguments */
975 call panic /* %rdi = format string */
976 #endif
977 SET_SIZE(bcopy_altentry)
978 SET_SIZE(bcopy)
979
|