1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
27
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/sysmacros.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/debug.h>
40 #include <sys/errno.h>
41 #include <sys/time.h>
42 #include <sys/file.h>
43 #include <sys/user.h>
44 #include <sys/stream.h>
45 #include <sys/strsubr.h>
46 #include <sys/strsun.h>
47 #include <sys/sunddi.h>
48 #include <sys/esunddi.h>
49 #include <sys/flock.h>
50 #include <sys/modctl.h>
51 #include <sys/cmn_err.h>
52 #include <sys/vmsystm.h>
53 #include <sys/policy.h>
54
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57
58 #include <sys/isa_defs.h>
59 #include <sys/inttypes.h>
60 #include <sys/systm.h>
61 #include <sys/cpuvar.h>
62 #include <sys/filio.h>
63 #include <sys/sendfile.h>
64 #include <sys/ddi.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kpm.h>
68
69 #include <fs/sockfs/nl7c.h>
70 #include <fs/sockfs/sockcommon.h>
71 #include <fs/sockfs/sockfilter_impl.h>
72 #include <fs/sockfs/socktpi.h>
73
74 #ifdef SOCK_TEST
75 int do_useracc = 1; /* Controlled by setting SO_DEBUG to 4 */
76 #else
77 #define do_useracc 1
78 #endif /* SOCK_TEST */
79
80 extern int xnet_truncate_print;
81
82 extern void nl7c_init(void);
83 extern int sockfs_defer_nl7c_init;
84
85 /*
86  * Note: MSG_MAXIOVLEN serves the same purpose here that DEF_IOV_MAX
87  * serves in "fs/vncalls.c", as there isn't a formal definition of IOV_MAX.
88  */
89 #define MSG_MAXIOVLEN 16
90
91 /*
92 * Kernel component of socket creation.
93 *
94 * The socket library determines which version number to use.
95 * First the library calls this with a NULL devpath. If this fails
96 * to find a transport (using solookup) the library will look in /etc/netconfig
97 * for the appropriate transport. If one is found it will pass in the
98 * devpath for the kernel to use.
99 */
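/*
 * A rough sketch of that sequence from the library's side (pseudo-code,
 * not part of this file; the fallback condition and lookup step are
 * assumptions about libsocket, shown only for orientation):
 *
 *	fd = _so_socket(family, type, protocol, NULL, SOV_DEFAULT);
 *	if (fd == -1 && <no transport was found>) {
 *		devpath = <transport path looked up in /etc/netconfig>;
 *		fd = _so_socket(family, type, protocol, devpath,
 *		    SOV_DEFAULT);
 *	}
 */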
100 int
101 so_socket(int family, int type_w_flags, int protocol, char *devpath,
102 int version)
103 {
104 struct sonode *so;
105 vnode_t *vp;
106 struct file *fp;
107 int fd;
108 int error;
109 int type;
110
111 type = type_w_flags & SOCK_TYPE_MASK;
112 if (devpath != NULL) {
113 char *buf;
114 size_t kdevpathlen = 0;
115
116 buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
117 if ((error = copyinstr(devpath, buf,
118 MAXPATHLEN, &kdevpathlen)) != 0) {
119 kmem_free(buf, MAXPATHLEN);
120 return (set_errno(error));
121 }
122 so = socket_create(family, type, protocol, buf, NULL,
123 SOCKET_SLEEP, version, CRED(), &error);
124 kmem_free(buf, MAXPATHLEN);
125 } else {
126 so = socket_create(family, type, protocol, NULL, NULL,
127 SOCKET_SLEEP, version, CRED(), &error);
128 }
129 if (so == NULL)
130 return (set_errno(error));
131
132 /* Allocate a file descriptor for the socket */
133 vp = SOTOV(so);
134 if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
135 (void) socket_close(so, 0, CRED());
136 socket_destroy(so);
137 return (set_errno(error));
138 }
139
140 /*
141 * Now fill in the entries that falloc reserved
142 */
143 mutex_exit(&fp->f_tlock);
144 setf(fd, fp);
145 if ((type_w_flags & SOCK_CLOEXEC) != 0) {
146 f_setfd(fd, FD_CLOEXEC);
147 }
148
149 return (fd);
150 }
151
152 /*
153 * Map from a file descriptor to a socket node.
154  * Returns with the file descriptor held, i.e., the caller has to
155  * use releasef() when done with the file descriptor.
156 */
157 struct sonode *
158 getsonode(int sock, int *errorp, file_t **fpp)
159 {
160 file_t *fp;
161 vnode_t *vp;
162 struct sonode *so;
163
164 if ((fp = getf(sock)) == NULL) {
165 *errorp = EBADF;
166 eprintline(*errorp);
167 return (NULL);
168 }
169 vp = fp->f_vnode;
170 /* Check if it is a socket */
171 if (vp->v_type != VSOCK) {
172 releasef(sock);
173 *errorp = ENOTSOCK;
174 eprintline(*errorp);
175 return (NULL);
176 }
177 /*
178 * Use the stream head to find the real socket vnode.
179 * This is needed when namefs sits above sockfs.
180 */
181 if (vp->v_stream) {
182 ASSERT(vp->v_stream->sd_vnode);
183 vp = vp->v_stream->sd_vnode;
184
185 so = VTOSO(vp);
186 if (so->so_version == SOV_STREAM) {
187 releasef(sock);
188 *errorp = ENOTSOCK;
189 eprintsoline(so, *errorp);
190 return (NULL);
191 }
192 } else {
193 so = VTOSO(vp);
194 }
195 if (fpp)
196 *fpp = fp;
197 return (so);
198 }
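/*
 * Typical caller pattern (a sketch; bind() and listen() below follow it):
 *
 *	if ((so = getsonode(sock, &error, NULL)) == NULL)
 *		return (set_errno(error));
 *	... operate on so ...
 *	releasef(sock);
 */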
199
200 /*
201 * Allocate and copyin a sockaddr.
202  * Ensures NUL termination for AF_UNIX addresses by extending them
203  * with one NUL byte if need be. Verifies that the length is not
204  * excessive, to prevent an application from consuming all of kernel
205  * memory. Returns NULL when an error occurs.
206 */
207 static struct sockaddr *
208 copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
209 int *errorp)
210 {
211 char *faddr;
212 size_t namelen = (size_t)*namelenp;
213
214 ASSERT(namelen != 0);
215 if (namelen > SO_MAXARGSIZE) {
216 *errorp = EINVAL;
217 eprintsoline(so, *errorp);
218 return (NULL);
219 }
220
221 faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
222 if (copyin(name, faddr, namelen)) {
223 kmem_free(faddr, namelen);
224 *errorp = EFAULT;
225 eprintsoline(so, *errorp);
226 return (NULL);
227 }
228
229 /*
230 * Add space for NUL termination if needed.
231 * Do a quick check if the last byte is already NUL.
232 */
233 if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
234 /* Check if there is any NUL termination */
235 size_t i;
236 int foundnull = 0;
237
238 for (i = sizeof (name->sa_family); i < namelen; i++) {
239 if (faddr[i] == '\0') {
240 foundnull = 1;
241 break;
242 }
243 }
244 if (!foundnull) {
245 /* Add extra byte for NUL padding */
246 char *nfaddr;
247
248 nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
249 bcopy(faddr, nfaddr, namelen);
250 kmem_free(faddr, namelen);
251
252 /* NUL terminate */
253 nfaddr[namelen] = '\0';
254 namelen++;
255 ASSERT((socklen_t)namelen == namelen);
256 *namelenp = (socklen_t)namelen;
257 faddr = nfaddr;
258 }
259 }
260 return ((struct sockaddr *)faddr);
261 }
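/*
 * For example (a sketch): an AF_UNIX address of namelen bytes with no
 * NUL anywhere after sa_family comes back in a (namelen + 1)-byte
 * buffer whose last byte is '\0', with *namelenp bumped by one; an
 * address that already contains a NUL is returned unchanged.
 */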
262
263 /*
264 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
265 */
266 static int
267 copyout_arg(void *uaddr, socklen_t ulen, void *ulenp,
268 void *kaddr, socklen_t klen)
269 {
270 if (uaddr != NULL) {
271 if (ulen > klen)
272 ulen = klen;
273
274 if (ulen != 0) {
275 if (copyout(kaddr, uaddr, ulen))
276 return (EFAULT);
277 }
278 } else
279 ulen = 0;
280
281 if (ulenp != NULL) {
282 if (copyout(&ulen, ulenp, sizeof (ulen)))
283 return (EFAULT);
284 }
285 return (0);
286 }
287
288 /*
289 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
290 * If klen is greater than ulen it still uses the non-truncated
291 * klen to update ulenp.
292 */
293 static int
294 copyout_name(void *uaddr, socklen_t ulen, void *ulenp,
295 void *kaddr, socklen_t klen)
296 {
297 if (uaddr != NULL) {
298 if (ulen >= klen)
299 ulen = klen;
300 else if (ulen != 0 && xnet_truncate_print) {
301 printf("sockfs: truncating copyout of address using "
302 "XNET semantics for pid = %d. Lengths %d, %d\n",
303 curproc->p_pid, klen, ulen);
304 }
305
306 if (ulen != 0) {
307 if (copyout(kaddr, uaddr, ulen))
308 return (EFAULT);
309 } else
310 klen = 0;
311 } else
312 klen = 0;
313
314 if (ulenp != NULL) {
315 if (copyout(&klen, ulenp, sizeof (klen)))
316 return (EFAULT);
317 }
318 return (0);
319 }
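/*
 * Worked example of the difference from copyout_arg() (a sketch): for
 * klen == 16 and a user buffer with ulen == 8, both routines copy out
 * 8 bytes, but copyout_arg() writes 8 back through ulenp while
 * copyout_name() writes the untruncated 16, per the XNET semantics
 * noted above.
 */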
320
321 /*
322 * The socketpair() code in libsocket creates two sockets (using
323 * the /etc/netconfig fallback if needed) before calling this routine
324 * to connect the two sockets together.
325 *
326  * For a SOCK_STREAM socketpair a listener is needed - in that case this
327  * routine will create a new file descriptor as part of accepting the
328  * connection. The library socketpair() will check whether either fd in
329  * sv[] has changed, in which case it will close the changed fd.
330 *
331 * Note that this code could use the TPI feature of accepting the connection
332 * on the listening endpoint. However, that would require significant changes
333 * to soaccept.
334 */
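/*
 * From the library's side the SOCK_STREAM case looks roughly like this
 * (a sketch; the wrapper name is illustrative):
 *
 *	sv[0] = socket(AF_UNIX, SOCK_STREAM, 0);
 *	sv[1] = socket(AF_UNIX, SOCK_STREAM, 0);
 *	orig = sv[0];
 *	_so_socketpair(sv);
 *	if (sv[0] != orig)
 *		(void) close(orig);
 */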
335 int
336 so_socketpair(int sv[2])
337 {
338 int svs[2];
339 struct sonode *so1, *so2;
340 int error;
341 int orig_flags;
342 struct sockaddr_ux *name;
343 size_t namelen;
344 sotpi_info_t *sti1;
345 sotpi_info_t *sti2;
346
347 dprint(1, ("so_socketpair(%p)\n", (void *)sv));
348
349 error = useracc(sv, sizeof (svs), B_WRITE);
350 if (error && do_useracc)
351 return (set_errno(EFAULT));
352
353 if (copyin(sv, svs, sizeof (svs)))
354 return (set_errno(EFAULT));
355
356 if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
357 return (set_errno(error));
358
359 if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
360 releasef(svs[0]);
361 return (set_errno(error));
362 }
363
364 if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
365 error = EOPNOTSUPP;
366 goto done;
367 }
368
369 sti1 = SOTOTPI(so1);
370 sti2 = SOTOTPI(so2);
371
372 /*
373 * The code below makes assumptions about the "sockfs" implementation.
374 * So make sure that the correct implementation is really used.
375 */
376 ASSERT(so1->so_ops == &sotpi_sonodeops);
377 ASSERT(so2->so_ops == &sotpi_sonodeops);
378
379 if (so1->so_type == SOCK_DGRAM) {
380 /*
381 * Bind both sockets and connect them with each other.
382 * Need to allocate name/namelen for soconnect.
383 */
384 error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
385 if (error) {
386 eprintsoline(so1, error);
387 goto done;
388 }
389 error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
390 if (error) {
391 eprintsoline(so2, error);
392 goto done;
393 }
394 namelen = sizeof (struct sockaddr_ux);
395 name = kmem_alloc(namelen, KM_SLEEP);
396 name->sou_family = AF_UNIX;
397 name->sou_addr = sti2->sti_ux_laddr;
398 error = socket_connect(so1,
399 (struct sockaddr *)name,
400 (socklen_t)namelen,
401 0, _SOCONNECT_NOXLATE, CRED());
402 if (error) {
403 kmem_free(name, namelen);
404 eprintsoline(so1, error);
405 goto done;
406 }
407 name->sou_addr = sti1->sti_ux_laddr;
408 error = socket_connect(so2,
409 (struct sockaddr *)name,
410 (socklen_t)namelen,
411 0, _SOCONNECT_NOXLATE, CRED());
412 kmem_free(name, namelen);
413 if (error) {
414 eprintsoline(so2, error);
415 goto done;
416 }
417 releasef(svs[0]);
418 releasef(svs[1]);
419 } else {
420 /*
421 * Bind both sockets, with so1 being a listener.
422 * Connect so2 to so1 - nonblocking to avoid waiting for
423 * soaccept to complete.
424 * Accept a connection on so1. Pass out the new fd as sv[0].
425 * The library will detect the changed fd and close
426 * the original one.
427 */
428 struct sonode *nso;
429 struct vnode *nvp;
430 struct file *nfp;
431 int nfd;
432
433 /*
434 * We could simply call socket_listen() here (which would do the
435 * binding automatically) if the code didn't rely on passing
436 * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
437 */
438 error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
439 _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
440 CRED());
441 if (error) {
442 eprintsoline(so1, error);
443 goto done;
444 }
445 error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
446 if (error) {
447 eprintsoline(so2, error);
448 goto done;
449 }
450
451 namelen = sizeof (struct sockaddr_ux);
452 name = kmem_alloc(namelen, KM_SLEEP);
453 name->sou_family = AF_UNIX;
454 name->sou_addr = sti1->sti_ux_laddr;
455 error = socket_connect(so2,
456 (struct sockaddr *)name,
457 (socklen_t)namelen,
458 FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
459 kmem_free(name, namelen);
460 if (error != 0 && error != EINPROGRESS) {
461 eprintsoline(so2, error);
462 goto done;
463 }
465
466 error = socket_accept(so1, 0, CRED(), &nso);
467 if (error) {
468 eprintsoline(so1, error);
469 goto done;
470 }
471
472 /* Wait for so2 to reach SS_CONNECTED, ignoring signals */
473 mutex_enter(&so2->so_lock);
474 error = sowaitconnected(so2, 0, 1);
475 mutex_exit(&so2->so_lock);
476 if (error != 0) {
477 (void) socket_close(nso, 0, CRED());
478 socket_destroy(nso);
479 eprintsoline(so2, error);
480 goto done;
481 }
482
483 nvp = SOTOV(nso);
484 if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
485 (void) socket_close(nso, 0, CRED());
486 socket_destroy(nso);
487 eprintsoline(nso, error);
488 goto done;
489 }
490 /*
491 * fill in the entries that falloc reserved
492 */
493 mutex_exit(&nfp->f_tlock);
494 setf(nfd, nfp);
495
496 releasef(svs[0]);
497 releasef(svs[1]);
498
499 /*
500 * If FD_CLOEXEC was set on the file descriptor we're
501 * swapping out, we should set it on the new one too.
502 */
503 VERIFY(f_getfd_error(svs[0], &orig_flags) == 0);
504 if (orig_flags & FD_CLOEXEC) {
505 f_setfd(nfd, FD_CLOEXEC);
506 }
507
508 /*
509 * The socketpair library routine will close the original
510 * svs[0] when this code passes out a different file
511 * descriptor.
512 */
513 svs[0] = nfd;
514
515 if (copyout(svs, sv, sizeof (svs))) {
516 (void) closeandsetf(nfd, NULL);
517 eprintline(EFAULT);
518 return (set_errno(EFAULT));
519 }
520 }
521 return (0);
522
523 done:
524 releasef(svs[0]);
525 releasef(svs[1]);
526 return (set_errno(error));
527 }
528
529 int
530 bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
531 {
532 struct sonode *so;
533 int error;
534
535 dprint(1, ("bind(%d, %p, %d)\n",
536 sock, (void *)name, namelen));
537
538 if ((so = getsonode(sock, &error, NULL)) == NULL)
539 return (set_errno(error));
540
541 /* Allocate and copyin name */
542 /*
543 * X/Open test does not expect EFAULT with NULL name and non-zero
544 * namelen.
545 */
546 if (name != NULL && namelen != 0) {
547 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
548 name = copyin_name(so, name, &namelen, &error);
549 if (name == NULL) {
550 releasef(sock);
551 return (set_errno(error));
552 }
553 } else {
554 name = NULL;
555 namelen = 0;
556 }
557
558 switch (version) {
559 default:
560 error = socket_bind(so, name, namelen, 0, CRED());
561 break;
562 case SOV_XPG4_2:
563 error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
564 break;
565 case SOV_SOCKBSD:
566 error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
567 break;
568 }
569 done:
570 releasef(sock);
571 if (name != NULL)
572 kmem_free(name, (size_t)namelen);
573
574 if (error)
575 return (set_errno(error));
576 return (0);
577 }
578
579 /* ARGSUSED2 */
580 int
581 listen(int sock, int backlog, int version)
582 {
583 struct sonode *so;
584 int error;
585
586 dprint(1, ("listen(%d, %d)\n",
587 sock, backlog));
588
589 if ((so = getsonode(sock, &error, NULL)) == NULL)
590 return (set_errno(error));
591
592 error = socket_listen(so, backlog, CRED());
593
594 releasef(sock);
595 if (error)
596 return (set_errno(error));
597 return (0);
598 }
599
600 /*ARGSUSED3*/
601 int
602 accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version,
603 int flags)
604 {
605 struct sonode *so;
606 file_t *fp;
607 int error;
608 socklen_t namelen;
609 struct sonode *nso;
610 struct vnode *nvp;
611 struct file *nfp;
612 int nfd;
613 int ssflags;
614 struct sockaddr *addrp;
615 socklen_t addrlen;
616
617 dprint(1, ("accept(%d, %p, %p)\n",
618 sock, (void *)name, (void *)namelenp));
619
620 if (flags & ~(SOCK_CLOEXEC|SOCK_NONBLOCK|SOCK_NDELAY)) {
621 return (set_errno(EINVAL));
622 }
623
624 /* Translate SOCK_ flags to their SS_ variant */
625 ssflags = 0;
626 if (flags & SOCK_NONBLOCK)
627 ssflags |= SS_NONBLOCK;
628 if (flags & SOCK_NDELAY)
629 ssflags |= SS_NDELAY;
630
631 if ((so = getsonode(sock, &error, &fp)) == NULL)
632 return (set_errno(error));
633
634 if (name != NULL) {
635 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
636 if (copyin(namelenp, &namelen, sizeof (namelen))) {
637 releasef(sock);
638 return (set_errno(EFAULT));
639 }
640 if (namelen != 0) {
641 error = useracc(name, (size_t)namelen, B_WRITE);
642 if (error && do_useracc) {
643 releasef(sock);
644 return (set_errno(EFAULT));
645 }
646 } else
647 name = NULL;
648 } else {
649 namelen = 0;
650 }
651
652 /*
653 * Allocate the user fd before socket_accept() in order to
654 * catch EMFILE errors before calling socket_accept().
655 */
656 if ((nfd = ufalloc(0)) == -1) {
657 eprintsoline(so, EMFILE);
658 releasef(sock);
659 return (set_errno(EMFILE));
660 }
661 error = socket_accept(so, fp->f_flag, CRED(), &nso);
662 if (error) {
663 setf(nfd, NULL);
664 releasef(sock);
665 return (set_errno(error));
666 }
667
668 nvp = SOTOV(nso);
669
670 ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
671 if (namelen != 0) {
672 addrlen = so->so_max_addr_len;
673 addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);
674
675 if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
676 &addrlen, B_TRUE, CRED())) == 0) {
677 error = copyout_name(name, namelen, namelenp,
678 addrp, addrlen);
679 } else {
680 ASSERT(error == EINVAL || error == ENOTCONN);
681 error = ECONNABORTED;
682 }
683 kmem_free(addrp, so->so_max_addr_len);
684 }
685
686 if (error) {
687 setf(nfd, NULL);
688 (void) socket_close(nso, 0, CRED());
689 socket_destroy(nso);
690 releasef(sock);
691 return (set_errno(error));
692 }
693 if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
694 setf(nfd, NULL);
695 (void) socket_close(nso, 0, CRED());
696 socket_destroy(nso);
697 eprintsoline(so, error);
698 releasef(sock);
699 return (set_errno(error));
700 }
701 /*
702 * fill in the entries that falloc reserved
703 */
704 nfp->f_vnode = nvp;
705 mutex_exit(&nfp->f_tlock);
706 setf(nfd, nfp);
707
708 /*
709 * Act on SOCK_CLOEXEC from flags
710 */
711 if (flags & SOCK_CLOEXEC) {
712 f_setfd(nfd, FD_CLOEXEC);
713 }
714
715 /*
716 * Copy FNDELAY and FNONBLOCK from listener to acceptor
717 * and from ssflags
718 */
719 if ((ssflags | so->so_state) & (SS_NDELAY|SS_NONBLOCK)) {
720 uint_t oflag = nfp->f_flag;
721 int arg = 0;
722
723 if ((ssflags | so->so_state) & SS_NONBLOCK)
724 arg |= FNONBLOCK;
725 else if ((ssflags | so->so_state) & SS_NDELAY)
726 arg |= FNDELAY;
727
728 /*
729 * This code is a simplification of the F_SETFL code in fcntl()
730 * Ignore any errors from VOP_SETFL.
731 */
732 if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred, NULL))
733 != 0) {
734 eprintsoline(so, error);
735 error = 0;
736 } else {
737 mutex_enter(&nfp->f_tlock);
738 nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
739 nfp->f_flag |= arg;
740 mutex_exit(&nfp->f_tlock);
741 }
742 }
743 releasef(sock);
744 return (nfd);
745 }
746
747 int
748 connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
749 {
750 struct sonode *so;
751 file_t *fp;
752 int error;
753
754 dprint(1, ("connect(%d, %p, %d)\n",
755 sock, (void *)name, namelen));
756
757 if ((so = getsonode(sock, &error, &fp)) == NULL)
758 return (set_errno(error));
759
760 /* Allocate and copyin name */
761 if (namelen != 0) {
762 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
763 name = copyin_name(so, name, &namelen, &error);
764 if (name == NULL) {
765 releasef(sock);
766 return (set_errno(error));
767 }
768 } else
769 name = NULL;
770
771 error = socket_connect(so, name, namelen, fp->f_flag,
772 (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
773 releasef(sock);
774 if (name)
775 kmem_free(name, (size_t)namelen);
776 if (error)
777 return (set_errno(error));
778 return (0);
779 }
780
781 /*ARGSUSED2*/
782 int
783 shutdown(int sock, int how, int version)
784 {
785 struct sonode *so;
786 int error;
787
788 dprint(1, ("shutdown(%d, %d)\n",
789 sock, how));
790
791 if ((so = getsonode(sock, &error, NULL)) == NULL)
792 return (set_errno(error));
793
794 error = socket_shutdown(so, how, CRED());
795
796 releasef(sock);
797 if (error)
798 return (set_errno(error));
799 return (0);
800 }
801
802 /*
803 * Common receive routine.
804 */
805 static ssize_t
806 recvit(int sock,
807 struct nmsghdr *msg,
808 struct uio *uiop,
809 int flags,
810 socklen_t *namelenp,
811 socklen_t *controllenp,
812 int *flagsp)
813 {
814 struct sonode *so;
815 file_t *fp;
816 void *name;
817 socklen_t namelen;
818 void *control;
819 socklen_t controllen;
820 ssize_t len;
821 int error;
822
823 if ((so = getsonode(sock, &error, &fp)) == NULL)
824 return (set_errno(error));
825
826 len = uiop->uio_resid;
827 uiop->uio_fmode = fp->f_flag;
828 uiop->uio_extflg = UIO_COPY_CACHED;
829
830 name = msg->msg_name;
831 namelen = msg->msg_namelen;
832 control = msg->msg_control;
833 controllen = msg->msg_controllen;
834
835 msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
836 MSG_DONTWAIT | MSG_XPG4_2);
837
838 error = socket_recvmsg(so, msg, uiop, CRED());
839 if (error) {
840 releasef(sock);
841 return (set_errno(error));
842 }
843 lwp_stat_update(LWP_STAT_MSGRCV, 1);
844 releasef(sock);
845
846 error = copyout_name(name, namelen, namelenp,
847 msg->msg_name, msg->msg_namelen);
848 if (error)
849 goto err;
850
851 if (flagsp != NULL) {
852 /*
853 * Clear internal flag.
854 */
855 msg->msg_flags &= ~MSG_XPG4_2;
856
857 /*
858 * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
859 * when controllen is zero and there is control data to
860 * copy out.
861 */
862 if (controllen != 0 &&
863 (msg->msg_controllen > controllen || control == NULL)) {
864 dprint(1, ("recvit: CTRUNC %d %d %p\n",
865 msg->msg_controllen, controllen, control));
866
867 msg->msg_flags |= MSG_CTRUNC;
868 }
869 if (copyout(&msg->msg_flags, flagsp,
870 sizeof (msg->msg_flags))) {
871 error = EFAULT;
872 goto err;
873 }
874 }
875 /*
876 * Note: This MUST be done last. There can be no "goto err" after this
877 * point since it could make so_closefds run twice on some part
878 * of the file descriptor array.
879 */
880 if (controllen != 0) {
881 if (!(flags & MSG_XPG4_2)) {
882 /*
883 * Good old msg_accrights can only return a multiple
884 * of 4 bytes.
885 */
886 controllen &= ~((int)sizeof (uint32_t) - 1);
887 }
888 error = copyout_arg(control, controllen, controllenp,
889 msg->msg_control, msg->msg_controllen);
890 if (error)
891 goto err;
892
893 if (msg->msg_controllen > controllen || control == NULL) {
894 if (control == NULL)
895 controllen = 0;
896 so_closefds(msg->msg_control, msg->msg_controllen,
897 !(flags & MSG_XPG4_2), controllen);
898 }
899 }
900 if (msg->msg_namelen != 0)
901 kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
902 if (msg->msg_controllen != 0)
903 kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
904 return (len - uiop->uio_resid);
905
906 err:
907 /*
908 * If we fail and the control part contains file descriptors
909 * we have to close the fd's.
910 */
911 if (msg->msg_controllen != 0)
912 so_closefds(msg->msg_control, msg->msg_controllen,
913 !(flags & MSG_XPG4_2), 0);
914 if (msg->msg_namelen != 0)
915 kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
916 if (msg->msg_controllen != 0)
917 kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
918 return (set_errno(error));
919 }
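/*
 * For example (a sketch): a caller supplying controllen == 32 for a
 * message carrying 48 bytes of control data gets the first 32 bytes,
 * sees MSG_CTRUNC set in the returned flags, and any file descriptors
 * in the truncated tail are closed by the so_closefds() call above.
 */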
920
921 /*
922 * Native system call
923 */
924 ssize_t
925 recv(int sock, void *buffer, size_t len, int flags)
926 {
927 struct nmsghdr lmsg;
928 struct uio auio;
929 struct iovec aiov[1];
930
931 dprint(1, ("recv(%d, %p, %ld, %d)\n",
932 sock, buffer, len, flags));
933
934 if ((ssize_t)len < 0) {
935 return (set_errno(EINVAL));
936 }
937
938 aiov[0].iov_base = buffer;
939 aiov[0].iov_len = len;
940 auio.uio_loffset = 0;
941 auio.uio_iov = aiov;
942 auio.uio_iovcnt = 1;
943 auio.uio_resid = len;
944 auio.uio_segflg = UIO_USERSPACE;
945 auio.uio_limit = 0;
946
947 lmsg.msg_namelen = 0;
948 lmsg.msg_controllen = 0;
949 lmsg.msg_flags = 0;
950 return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
951 }
952
953 ssize_t
954 recvfrom(int sock, void *buffer, size_t len, int flags,
955 struct sockaddr *name, socklen_t *namelenp)
956 {
957 struct nmsghdr lmsg;
958 struct uio auio;
959 struct iovec aiov[1];
960
961 dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
962 sock, buffer, len, flags, (void *)name, (void *)namelenp));
963
964 if ((ssize_t)len < 0) {
965 return (set_errno(EINVAL));
966 }
967
968 aiov[0].iov_base = buffer;
969 aiov[0].iov_len = len;
970 auio.uio_loffset = 0;
971 auio.uio_iov = aiov;
972 auio.uio_iovcnt = 1;
973 auio.uio_resid = len;
974 auio.uio_segflg = UIO_USERSPACE;
975 auio.uio_limit = 0;
976
977 lmsg.msg_name = (char *)name;
978 if (namelenp != NULL) {
979 if (copyin(namelenp, &lmsg.msg_namelen,
980 sizeof (lmsg.msg_namelen)))
981 return (set_errno(EFAULT));
982 } else {
983 lmsg.msg_namelen = 0;
984 }
985 lmsg.msg_controllen = 0;
986 lmsg.msg_flags = 0;
987
988 return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
989 }
990
991 /*
992 * Uses the MSG_XPG4_2 flag to determine if the caller is using
993 * struct omsghdr or struct nmsghdr.
994 */
995 ssize_t
996 recvmsg(int sock, struct nmsghdr *msg, int flags)
997 {
998 STRUCT_DECL(nmsghdr, u_lmsg);
999 STRUCT_HANDLE(nmsghdr, umsgptr);
1000 struct nmsghdr lmsg;
1001 struct uio auio;
1002 struct iovec aiov[MSG_MAXIOVLEN];
1003 int iovcnt;
1004 ssize_t len;
1005 int i;
1006 int *flagsp;
1007 model_t model;
1008
1009 dprint(1, ("recvmsg(%d, %p, %d)\n",
1010 sock, (void *)msg, flags));
1011
1012 model = get_udatamodel();
1013 STRUCT_INIT(u_lmsg, model);
1014 STRUCT_SET_HANDLE(umsgptr, model, msg);
1015
1016 if (flags & MSG_XPG4_2) {
1017 if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
1018 return (set_errno(EFAULT));
1019 flagsp = STRUCT_FADDR(umsgptr, msg_flags);
1020 } else {
1021 /*
1022 * Assumes that nmsghdr and omsghdr are identically shaped
1023 * except for the added msg_flags field.
1024 */
1025 if (copyin(msg, STRUCT_BUF(u_lmsg),
1026 SIZEOF_STRUCT(omsghdr, model)))
1027 return (set_errno(EFAULT));
1028 STRUCT_FSET(u_lmsg, msg_flags, 0);
1029 flagsp = NULL;
1030 }
1031
1032 /*
1033 * The code below will kmem_alloc memory and hang it
1034 * off the msg_control and msg_name fields. This forces
1035 * us to copy the structure into its native form.
1036 */
1037 lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1038 lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1039 lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1040 lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1041 lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1042 lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1043 lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1044
1045 iovcnt = lmsg.msg_iovlen;
1046
1047 if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
1048 return (set_errno(EMSGSIZE));
1049 }
1050
1051 #ifdef _SYSCALL32_IMPL
1052 /*
1053 * 32-bit callers need to have their iovec expanded, while ensuring
1054 * that they can't move more than 2Gbytes of data in a single call.
1055 */
1056 if (model == DATAMODEL_ILP32) {
1057 struct iovec32 aiov32[MSG_MAXIOVLEN];
1058 ssize32_t count32;
1059
1060 if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1061 iovcnt * sizeof (struct iovec32)))
1062 return (set_errno(EFAULT));
1063
1064 count32 = 0;
1065 for (i = 0; i < iovcnt; i++) {
1066 ssize32_t iovlen32;
1067
1068 iovlen32 = aiov32[i].iov_len;
1069 count32 += iovlen32;
1070 if (iovlen32 < 0 || count32 < 0)
1071 return (set_errno(EINVAL));
1072 aiov[i].iov_len = iovlen32;
1073 aiov[i].iov_base =
1074 (caddr_t)(uintptr_t)aiov32[i].iov_base;
1075 }
1076 } else
1077 #endif /* _SYSCALL32_IMPL */
1078 if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
1079 return (set_errno(EFAULT));
1080 }
1081 len = 0;
1082 for (i = 0; i < iovcnt; i++) {
1083 ssize_t iovlen = aiov[i].iov_len;
1084 len += iovlen;
1085 if (iovlen < 0 || len < 0) {
1086 return (set_errno(EINVAL));
1087 }
1088 }
1089 auio.uio_loffset = 0;
1090 auio.uio_iov = aiov;
1091 auio.uio_iovcnt = iovcnt;
1092 auio.uio_resid = len;
1093 auio.uio_segflg = UIO_USERSPACE;
1094 auio.uio_limit = 0;
1095
1096 if (lmsg.msg_control != NULL &&
1097 (do_useracc == 0 ||
1098 useracc(lmsg.msg_control, lmsg.msg_controllen,
1099 B_WRITE) != 0)) {
1100 return (set_errno(EFAULT));
1101 }
1102
1103 return (recvit(sock, &lmsg, &auio, flags,
1104 STRUCT_FADDR(umsgptr, msg_namelen),
1105 STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
1106 }
1107
1108 /*
1109 * Common send function.
1110 */
1111 static ssize_t
1112 sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
1113 {
1114 struct sonode *so;
1115 file_t *fp;
1116 void *name;
1117 socklen_t namelen;
1118 void *control;
1119 socklen_t controllen;
1120 ssize_t len;
1121 int error;
1122
1123 if ((so = getsonode(sock, &error, &fp)) == NULL)
1124 return (set_errno(error));
1125
1126 uiop->uio_fmode = fp->f_flag;
1127
1128 if (so->so_family == AF_UNIX)
1129 uiop->uio_extflg = UIO_COPY_CACHED;
1130 else
1131 uiop->uio_extflg = UIO_COPY_DEFAULT;
1132
1133 /* Allocate and copyin name and control */
1134 name = msg->msg_name;
1135 namelen = msg->msg_namelen;
1136 if (name != NULL && namelen != 0) {
1137 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1138 name = copyin_name(so,
1139 (struct sockaddr *)name,
1140 &namelen, &error);
1141 if (name == NULL)
1142 goto done3;
1143 /* copyin_name null terminates addresses for AF_UNIX */
1144 msg->msg_namelen = namelen;
1145 msg->msg_name = name;
1146 } else {
1147 msg->msg_name = name = NULL;
1148 msg->msg_namelen = namelen = 0;
1149 }
1150
1151 control = msg->msg_control;
1152 controllen = msg->msg_controllen;
1153 if ((control != NULL) && (controllen != 0)) {
1154 /*
1155 * Verify that the length is not excessive to prevent
1156 * an application from consuming all of kernel memory.
1157 */
1158 if (controllen > SO_MAXARGSIZE) {
1159 error = EINVAL;
1160 goto done2;
1161 }
1162 control = kmem_alloc(controllen, KM_SLEEP);
1163
1164 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1165 if (copyin(msg->msg_control, control, controllen)) {
1166 error = EFAULT;
1167 goto done1;
1168 }
1169 msg->msg_control = control;
1170 } else {
1171 msg->msg_control = control = NULL;
1172 msg->msg_controllen = controllen = 0;
1173 }
1174
1175 len = uiop->uio_resid;
1176 msg->msg_flags = flags;
1177
1178 error = socket_sendmsg(so, msg, uiop, CRED());
1179 done1:
1180 if (control != NULL)
1181 kmem_free(control, controllen);
1182 done2:
1183 if (name != NULL)
1184 kmem_free(name, namelen);
1185 done3:
1186 if (error != 0) {
1187 releasef(sock);
1188 return (set_errno(error));
1189 }
1190 lwp_stat_update(LWP_STAT_MSGSND, 1);
1191 releasef(sock);
1192 return (len - uiop->uio_resid);
1193 }
1194
1195 /*
1196 * Native system call
1197 */
1198 ssize_t
1199 send(int sock, void *buffer, size_t len, int flags)
1200 {
1201 struct nmsghdr lmsg;
1202 struct uio auio;
1203 struct iovec aiov[1];
1204
1205 dprint(1, ("send(%d, %p, %ld, %d)\n",
1206 sock, buffer, len, flags));
1207
1208 if ((ssize_t)len < 0) {
1209 return (set_errno(EINVAL));
1210 }
1211
1212 aiov[0].iov_base = buffer;
1213 aiov[0].iov_len = len;
1214 auio.uio_loffset = 0;
1215 auio.uio_iov = aiov;
1216 auio.uio_iovcnt = 1;
1217 auio.uio_resid = len;
1218 auio.uio_segflg = UIO_USERSPACE;
1219 auio.uio_limit = 0;
1220
1221 lmsg.msg_name = NULL;
1222 lmsg.msg_control = NULL;
1223 if (!(flags & MSG_XPG4_2)) {
1224 /*
1225 * In order to be compatible with the libsocket/sockmod
1226 * implementation we set EOR for all send* calls.
1227 */
1228 flags |= MSG_EOR;
1229 }
1230 return (sendit(sock, &lmsg, &auio, flags));
1231 }
1232
1233 /*
1234 * Uses the MSG_XPG4_2 flag to determine if the caller is using
1235 * struct omsghdr or struct nmsghdr.
1236 */
1237 ssize_t
1238 sendmsg(int sock, struct nmsghdr *msg, int flags)
1239 {
1240 struct nmsghdr lmsg;
1241 STRUCT_DECL(nmsghdr, u_lmsg);
1242 struct uio auio;
1243 struct iovec aiov[MSG_MAXIOVLEN];
1244 int iovcnt;
1245 ssize_t len;
1246 int i;
1247 model_t model;
1248
1249 dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));
1250
1251 model = get_udatamodel();
1252 STRUCT_INIT(u_lmsg, model);
1253
1254 if (flags & MSG_XPG4_2) {
1255 if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1256 STRUCT_SIZE(u_lmsg)))
1257 return (set_errno(EFAULT));
1258 } else {
1259 /*
1260 * Assumes that nmsghdr and omsghdr are identically shaped
1261 * except for the added msg_flags field.
1262 */
1263 if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1264 SIZEOF_STRUCT(omsghdr, model)))
1265 return (set_errno(EFAULT));
1266 /*
1267 * In order to be compatible with the libsocket/sockmod
1268 * implementation we set EOR for all send* calls.
1269 */
1270 flags |= MSG_EOR;
1271 }
1272
1273 /*
1274 * The code below will kmem_alloc memory and hang it
1275 * off the msg_control and msg_name fields. This forces
1276 * us to copy the structure into its native form.
1277 */
1278 lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1279 lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1280 lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1281 lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1282 lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1283 lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1284 lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1285
1286 iovcnt = lmsg.msg_iovlen;
1287
1288 if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
1289 /*
1290 * Unless this is XPG 4.2 we allow iovcnt == 0 to
1291 * be compatible with SunOS 4.X and 4.4BSD.
1292 */
1293 if (iovcnt != 0 || (flags & MSG_XPG4_2))
1294 return (set_errno(EMSGSIZE));
1295 }
1296
1297 #ifdef _SYSCALL32_IMPL
1298 /*
1299 * 32-bit callers need to have their iovec expanded, while ensuring
1300 * that they can't move more than 2Gbytes of data in a single call.
1301 */
1302 if (model == DATAMODEL_ILP32) {
1303 struct iovec32 aiov32[MSG_MAXIOVLEN];
1304 ssize32_t count32;
1305
1306 if (iovcnt != 0 &&
1307 copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1308 iovcnt * sizeof (struct iovec32)))
1309 return (set_errno(EFAULT));
1310
1311 count32 = 0;
1312 for (i = 0; i < iovcnt; i++) {
1313 ssize32_t iovlen32;
1314
1315 iovlen32 = aiov32[i].iov_len;
1316 count32 += iovlen32;
1317 if (iovlen32 < 0 || count32 < 0)
1318 return (set_errno(EINVAL));
1319 aiov[i].iov_len = iovlen32;
1320 aiov[i].iov_base =
1321 (caddr_t)(uintptr_t)aiov32[i].iov_base;
1322 }
1323 } else
1324 #endif /* _SYSCALL32_IMPL */
1325 if (iovcnt != 0 &&
1326 copyin(lmsg.msg_iov, aiov,
1327 (unsigned)iovcnt * sizeof (struct iovec))) {
1328 return (set_errno(EFAULT));
1329 }
1330 len = 0;
1331 for (i = 0; i < iovcnt; i++) {
1332 ssize_t iovlen = aiov[i].iov_len;
1333 len += iovlen;
1334 if (iovlen < 0 || len < 0) {
1335 return (set_errno(EINVAL));
1336 }
1337 }
1338 auio.uio_loffset = 0;
1339 auio.uio_iov = aiov;
1340 auio.uio_iovcnt = iovcnt;
1341 auio.uio_resid = len;
1342 auio.uio_segflg = UIO_USERSPACE;
1343 auio.uio_limit = 0;
1344
1345 return (sendit(sock, &lmsg, &auio, flags));
1346 }
1347
1348 ssize_t
1349 sendto(int sock, void *buffer, size_t len, int flags,
1350 struct sockaddr *name, socklen_t namelen)
1351 {
1352 struct nmsghdr lmsg;
1353 struct uio auio;
1354 struct iovec aiov[1];
1355
1356 dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
1357 sock, buffer, len, flags, (void *)name, namelen));
1358
1359 if ((ssize_t)len < 0) {
1360 return (set_errno(EINVAL));
1361 }
1362
1363 aiov[0].iov_base = buffer;
1364 aiov[0].iov_len = len;
1365 auio.uio_loffset = 0;
1366 auio.uio_iov = aiov;
1367 auio.uio_iovcnt = 1;
1368 auio.uio_resid = len;
1369 auio.uio_segflg = UIO_USERSPACE;
1370 auio.uio_limit = 0;
1371
1372 lmsg.msg_name = (char *)name;
1373 lmsg.msg_namelen = namelen;
1374 lmsg.msg_control = NULL;
1375 if (!(flags & MSG_XPG4_2)) {
1376 /*
1377 * In order to be compatible with the libsocket/sockmod
1378 * implementation we set EOR for all send* calls.
1379 */
1380 flags |= MSG_EOR;
1381 }
1382 return (sendit(sock, &lmsg, &auio, flags));
1383 }
1384
1385 /*ARGSUSED3*/
1386 int
1387 getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
1388 {
1389 struct sonode *so;
1390 int error;
1391 socklen_t namelen;
1392 socklen_t sock_addrlen;
1393 struct sockaddr *sock_addrp;
1394
1395 dprint(1, ("getpeername(%d, %p, %p)\n",
1396 sock, (void *)name, (void *)namelenp));
1397
1398 if ((so = getsonode(sock, &error, NULL)) == NULL)
1399 goto bad;
1400
1401 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1402 if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1403 (name == NULL && namelen != 0)) {
1404 error = EFAULT;
1405 goto rel_out;
1406 }
1407 sock_addrlen = so->so_max_addr_len;
1408 sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1409
1410 if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
1411 B_FALSE, CRED())) == 0) {
1412 ASSERT(sock_addrlen <= so->so_max_addr_len);
1413 error = copyout_name(name, namelen, namelenp,
1414 (void *)sock_addrp, sock_addrlen);
1415 }
1416 kmem_free(sock_addrp, so->so_max_addr_len);
1417 rel_out:
1418 releasef(sock);
1419 bad: return (error != 0 ? set_errno(error) : 0);
1420 }
1421
1422 /*ARGSUSED3*/
1423 int
1424 getsockname(int sock, struct sockaddr *name,
1425 socklen_t *namelenp, int version)
1426 {
1427 struct sonode *so;
1428 int error;
1429 socklen_t namelen, sock_addrlen;
1430 struct sockaddr *sock_addrp;
1431
1432 dprint(1, ("getsockname(%d, %p, %p)\n",
1433 sock, (void *)name, (void *)namelenp));
1434
1435 if ((so = getsonode(sock, &error, NULL)) == NULL)
1436 goto bad;
1437
1438 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1439 if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1440 (name == NULL && namelen != 0)) {
1441 error = EFAULT;
1442 goto rel_out;
1443 }
1444
1445 sock_addrlen = so->so_max_addr_len;
1446 sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1447 if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
1448 CRED())) == 0) {
1449 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1450 ASSERT(sock_addrlen <= so->so_max_addr_len);
1451 error = copyout_name(name, namelen, namelenp,
1452 (void *)sock_addrp, sock_addrlen);
1453 }
1454 kmem_free(sock_addrp, so->so_max_addr_len);
1455 rel_out:
1456 releasef(sock);
1457 bad: return (error != 0 ? set_errno(error) : 0);
1458 }
1459
1460 /*ARGSUSED5*/
1461 int
1462 getsockopt(int sock,
1463 int level,
1464 int option_name,
1465 void *option_value,
1466 socklen_t *option_lenp,
1467 int version)
1468 {
1469 struct sonode *so;
1470 socklen_t optlen, optlen_res;
1471 void *optval;
1472 int error;
1473
1474 dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
1475 sock, level, option_name, option_value, (void *)option_lenp));
1476
1477 if ((so = getsonode(sock, &error, NULL)) == NULL)
1478 return (set_errno(error));
1479
1480 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1481 if (copyin(option_lenp, &optlen, sizeof (optlen))) {
1482 releasef(sock);
1483 return (set_errno(EFAULT));
1484 }
1485 /*
1486 * Verify that the length is not excessive to prevent
1487 * an application from consuming all of kernel memory.
1488 */
1489 if (optlen > SO_MAXARGSIZE) {
1490 error = EINVAL;
1491 releasef(sock);
1492 return (set_errno(error));
1493 }
1494 optval = kmem_alloc(optlen, KM_SLEEP);
1495 optlen_res = optlen;
1496 error = socket_getsockopt(so, level, option_name, optval,
1497 &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
1498 CRED());
1499 releasef(sock);
1500 if (error) {
1501 kmem_free(optval, optlen);
1502 return (set_errno(error));
1503 }
1504 error = copyout_arg(option_value, optlen, option_lenp,
1505 optval, optlen_res);
1506 kmem_free(optval, optlen);
1507 if (error)
1508 return (set_errno(error));
1509 return (0);
1510 }
1511
1512 /*ARGSUSED5*/
1513 int
1514 setsockopt(int sock,
1515 int level,
1516 int option_name,
1517 void *option_value,
1518 socklen_t option_len,
1519 int version)
1520 {
1521 struct sonode *so;
1522 intptr_t buffer[2];
1523 void *optval = NULL;
1524 int error;
1525
1526 dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
1527 sock, level, option_name, option_value, option_len));
1528
1529 if ((so = getsonode(sock, &error, NULL)) == NULL)
1530 return (set_errno(error));
1531
1532 if (option_value != NULL) {
1533 if (option_len != 0) {
1534 /*
1535 * Verify that the length is not excessive to prevent
1536 * an application from consuming all of kernel memory.
1537 */
1538 if (option_len > SO_MAXARGSIZE) {
1539 error = EINVAL;
1540 goto done2;
1541 }
1542 optval = option_len <= sizeof (buffer) ?
1543 &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
1544 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1545 if (copyin(option_value, optval, (size_t)option_len)) {
1546 error = EFAULT;
1547 goto done1;
1548 }
1549 }
1550 } else
1551 option_len = 0;
1552
1553 error = socket_setsockopt(so, level, option_name, optval,
1554 (t_uscalar_t)option_len, CRED());
1555 done1:
1556 if (optval != buffer)
1557 kmem_free(optval, (size_t)option_len);
1558 done2:
1559 releasef(sock);
1560 if (error)
1561 return (set_errno(error));
1562 return (0);
1563 }
1564
1565 static int
1566 sockconf_add_sock(int family, int type, int protocol, char *name)
1567 {
1568 int error = 0;
1569 char *kdevpath = NULL;
1570 char *kmodule = NULL;
1571 char *buf = NULL;
1572 size_t pathlen = 0;
1573 struct sockparams *sp;
1574
1575 if (name == NULL)
1576 return (EINVAL);
1577 /*
1578 * Copyin the name.
1579 * This also makes it possible to check for overly long pathnames.
1580 * Compress the space needed for the name before passing it
1581 * to soconfig - soconfig will store the string until
1582 * the configuration is removed.
1583 */
1584 buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1585 if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
1586 kmem_free(buf, MAXPATHLEN);
1587 return (error);
1588 }
1589 if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
1590 /* For device */
1591
1592 /*
1593 * Special handling for NCA:
1594 *
1595 * DEV_NCA is never opened even if an application
1596 * requests for AF_NCA. The device opened is instead a
1597 * predefined AF_INET transport (NCA_INET_DEV).
1598 *
1599 * Prior to Volo (PSARC/2007/587) NCA would determine
1600 * the device using a lookup, which worked then because
1601 * all protocols were based on TPI. Since TPI is no
1602 * longer the default, we have to explicitly state
1603 * which device to use.
1604 */
1605 if (strcmp(buf, NCA_DEV) == 0) {
1606 /* only support entry <28, 2, 0> */
1607 if (family != AF_NCA || type != SOCK_STREAM ||
1608 protocol != 0) {
1609 kmem_free(buf, MAXPATHLEN);
1610 return (EINVAL);
1611 }
1612
1613 pathlen = strlen(NCA_INET_DEV) + 1;
1614 kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1615 bcopy(NCA_INET_DEV, kdevpath, pathlen);
1616 kdevpath[pathlen - 1] = '\0';
1617 } else {
1618 kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1619 bcopy(buf, kdevpath, pathlen);
1620 kdevpath[pathlen - 1] = '\0';
1621 }
1622 } else {
1623 /* For socket module */
1624 kmodule = kmem_alloc(pathlen, KM_SLEEP);
1625 bcopy(buf, kmodule, pathlen);
1626 kmodule[pathlen - 1] = '\0';
1627 pathlen = 0;
1628 }
1629 kmem_free(buf, MAXPATHLEN);
1630
1631 /* sockparams_create frees mod name and devpath upon failure */
1632 sp = sockparams_create(family, type, protocol, kmodule,
1633 kdevpath, pathlen, 0, KM_SLEEP, &error);
1634 if (sp != NULL) {
1635 error = sockparams_add(sp);
1636 if (error != 0)
1637 sockparams_destroy(sp);
1638 }
1639
1640 return (error);
1641 }
1642
1643 static int
1644 sockconf_remove_sock(int family, int type, int protocol)
1645 {
1646 return (sockparams_delete(family, type, protocol));
1647 }
1648
1649 static int
1650 sockconfig_remove_filter(const char *uname)
1651 {
1652 char kname[SOF_MAXNAMELEN];
1653 size_t len;
1654 int error;
1655 sof_entry_t *ent;
1656
1657 if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
1658 return (error);
1659
1660 ent = sof_entry_remove_by_name(kname);
1661 if (ent == NULL)
1662 return (ENXIO);
1663
1664 mutex_enter(&ent->sofe_lock);
1665 ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
1666 if (ent->sofe_refcnt == 0) {
1667 mutex_exit(&ent->sofe_lock);
1668 sof_entry_free(ent);
1669 } else {
1670 /* let the last socket free the filter */
1671 ent->sofe_flags |= SOFEF_CONDEMED;
1672 mutex_exit(&ent->sofe_lock);
1673 }
1674
1675 return (0);
1676 }
1677
1678 static int
1679 sockconfig_add_filter(const char *uname, void *ufilpropp)
1680 {
1681 struct sockconfig_filter_props filprop;
1682 sof_entry_t *ent;
1683 int error;
1684 size_t tuplesz, len;
1685 char hintbuf[SOF_MAXNAMELEN];
1686
1687 ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
1688 mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);
1689
1690 if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
1691 &len)) != 0) {
1692 sof_entry_free(ent);
1693 return (error);
1694 }
1695
1696 if (get_udatamodel() == DATAMODEL_NATIVE) {
1697 if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
1698 sof_entry_free(ent);
1699 return (EFAULT);
1700 }
1701 }
1702 #ifdef _SYSCALL32_IMPL
1703 else {
1704 struct sockconfig_filter_props32 filprop32;
1705
1706 if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
1707 sof_entry_free(ent);
1708 return (EFAULT);
1709 }
1710 filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
1711 filprop.sfp_autoattach = filprop32.sfp_autoattach;
1712 filprop.sfp_hint = filprop32.sfp_hint;
1713 filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
1714 filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
1715 filprop.sfp_socktuple =
1716 (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
1717 }
1718 #endif /* _SYSCALL32_IMPL */
1719
1720 if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
1721 sizeof (ent->sofe_modname), &len)) != 0) {
1722 sof_entry_free(ent);
1723 return (error);
1724 }
1725
1726 /*
1727 * A filter must specify at least one socket tuple.
1728 */
1729 if (filprop.sfp_socktuple_cnt == 0 ||
1730 filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
1731 sof_entry_free(ent);
1732 return (EINVAL);
1733 }
1734 ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
1735 ent->sofe_hint = filprop.sfp_hint;
1736
1737 /*
1738 * Verify the hint, and copy in the hint argument, if necessary.
1739 */
1740 switch (ent->sofe_hint) {
1741 case SOF_HINT_BEFORE:
1742 case SOF_HINT_AFTER:
1743 if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
1744 sizeof (hintbuf), &len)) != 0) {
1745 sof_entry_free(ent);
1746 return (error);
1747 }
1748 ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
1749 bcopy(hintbuf, ent->sofe_hintarg, len);
1750 /* FALLTHRU */
1751 case SOF_HINT_TOP:
1752 case SOF_HINT_BOTTOM:
1753 /* hints cannot be used with programmatic filters */
1754 if (ent->sofe_flags & SOFEF_PROG) {
1755 sof_entry_free(ent);
1756 return (EINVAL);
1757 }
1758 break;
1759 case SOF_HINT_NONE:
1760 break;
1761 default:
1762 /* bad hint value */
1763 sof_entry_free(ent);
1764 return (EINVAL);
1765 }
1766
1767 ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
1768 tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
1769 ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);
1770
1771 if (get_udatamodel() == DATAMODEL_NATIVE) {
1772 if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
1773 tuplesz)) {
1774 sof_entry_free(ent);
1775 return (EFAULT);
1776 }
1777 }
1778 #ifdef _SYSCALL32_IMPL
1779 else {
1780 int i;
1781 caddr_t data = (caddr_t)filprop.sfp_socktuple;
1782 sof_socktuple_t *tup = ent->sofe_socktuple;
1783 sof_socktuple32_t tup32;
1784
1786 for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
1787 ASSERT(tup < ent->sofe_socktuple + ent->sofe_socktuple_cnt);
1788
1789 if (copyin(data, &tup32, sizeof (tup32)) != 0) {
1790 sof_entry_free(ent);
1791 return (EFAULT);
1792 }
1793 tup->sofst_family = tup32.sofst_family;
1794 tup->sofst_type = tup32.sofst_type;
1795 tup->sofst_protocol = tup32.sofst_protocol;
1796
1797 data += sizeof (tup32);
1798 }
1799 }
1800 #endif /* _SYSCALL32_IMPL */
1801
1802 /* Sockets can start using the filter as soon as the filter is added */
1803 if ((error = sof_entry_add(ent)) != 0)
1804 sof_entry_free(ent);
1805
1806 return (error);
1807 }
1808
1809 /*
1810 * Socket configuration system call. It is used to add and remove
1811 * socket types.
1812 */
1813 int
1814 sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
1815 {
1816 int error = 0;
1817
1818 if (secpolicy_net_config(CRED(), B_FALSE) != 0)
1819 return (set_errno(EPERM));
1820
1821 if (sockfs_defer_nl7c_init) {
1822 nl7c_init();
1823 sockfs_defer_nl7c_init = 0;
1824 }
1825
1826 switch (cmd) {
1827 case SOCKCONFIG_ADD_SOCK:
1828 error = sockconf_add_sock((int)(uintptr_t)arg1,
1829 (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
1830 break;
1831 case SOCKCONFIG_REMOVE_SOCK:
1832 error = sockconf_remove_sock((int)(uintptr_t)arg1,
1833 (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
1834 break;
1835 case SOCKCONFIG_ADD_FILTER:
1836 error = sockconfig_add_filter((const char *)arg1, arg2);
1837 break;
1838 case SOCKCONFIG_REMOVE_FILTER:
1839 error = sockconfig_remove_filter((const char *)arg1);
1840 break;
1841 default:
1842 #ifdef DEBUG
1843 cmn_err(CE_NOTE, "sockconfig: unknown subcommand %d", cmd);
1844 #endif
1845 error = EINVAL;
1846 break;
1847 }
1848
1849 if (error != 0) {
1850 eprintline(error);
1851 return (set_errno(error));
1852 }
1853 return (0);
1854 }
1855
1856
1857 /*
1858 * Sendfile is implemented through two schemes, direct I/O or by
1859 * caching in the filesystem page cache. We cache the input file by
1860 * default and use direct I/O only if sendfile_max_size is set
1861 * appropriately as explained below. Note that this logic is consistent
1862 * with other filesystems where caching is turned on by default
1863 * unless explicitly turned off by using the DIRECTIO ioctl.
1864 *
1865 * We choose a slightly different scheme here. One can turn off
1866 * caching by setting sendfile_max_size to 0. One can also enable
1867 * caching of files <= sendfile_max_size by setting sendfile_max_size
1868 * to an appropriate value. By default sendfile_max_size is set to the
1869 * maximum value so that all files are cached. In the future, we may
1870 * provide better interfaces for caching files.
1871 *
1872 * Sendfile through Direct I/O (Zero copy)
1873 * --------------------------------------
1874 *
1875 * As disks are normally slower than the network, we can't have a
1876 * single thread that reads the disk and writes to the network. We
1877 * need to have parallelism. This is done by having the sendfile
1878 * thread create another thread that reads from the filesystem
1879 * and queues it for network processing. In this scheme, the data
1880 * is never copied anywhere i.e it is zero copy unlike the other
1881 * scheme.
1882 *
1883 * We have a sendfile queue (snfq) where each sendfile
1884 * request (snf_req_t) is queued for processing by a thread. The number
1885 * of threads is adjusted dynamically, and threads exit if they idle
1886 * beyond a specified amount of time. When each request (snf_req_t) is
1887 * processed by a thread, it produces a number of mblk_t structures to
1888 * be consumed by the sendfile thread. snf_deque and snf_enque are
1889 * used for consuming and producing mblks. Size of the filesystem
1890 * read is determined by the tunable (sendfile_read_size). A single
1891 * mblk holds sendfile_read_size worth of data (except the last
1892 * read of the file) which is sent down as a whole to the network.
1893 * sendfile_read_size is set to 1 MB as this seems to be the optimal
1894 * value for the UFS filesystem backed by a striped storage array.
1895 *
1896 * Synchronisation between read (producer) and write (consumer) threads.
1897 * --------------------------------------------------------------------
1898 *
1899 * sr_lock protects sr_mp_head and sr_mp_tail. The lock is held while
1900 * adding and deleting items in this list. An error can happen at any
1901 * time during read or write. There could be unprocessed mblks in the
1902 * sr_mp_XXX list when a read or write error occurs. Whenever an error
1903 * is encountered, we need two things to happen:
1904 *
1905 * a) One of the threads need to clean the mblks.
1906 * b) When one thread encounters an error, the other should stop.
1907 *
1908 * For (a), we don't want to penalize the reader thread as it could do
1909 * some useful work processing other requests. For (b), the error can
1910 * be detected by examining sr_read_error or sr_write_error.
1911 * sr_lock protects sr_read_error and sr_write_error. If both reader and
1912 * writer encounters error, we need to report the write error back to
1913 * the application as that's what would have happened if the operations
1914 * were done sequentially. With this in mind, the following should work:
1915 *
1916 * - Check for errors before read or write.
1917 * - If the reader encounters error, set the error in sr_read_error.
1918 * Check sr_write_error; if it is set, send cv_signal as the writer is
1919 * waiting for the reader to complete. If it is not set, the writer
1920 * is either running sinking data to the network or blocked
1921 * because of flow control. For handling the latter case, we
1922 * always send a signal. In any case, it will examine sr_read_error
1923 * and return. sr_read_error is marked with SR_READ_DONE to tell
1924 * the writer that the reader is done in all the cases.
1925 * - If the writer encounters error, set the error in sr_write_error.
1926 * The reader thread is either blocked because of flow control or
1927 * running reading data from the disk. For the former, we need to
1928 * wakeup the thread. Again to keep it simple, we always wake up
1929 * the reader thread. Then, wait for the read thread to complete
1930 * if it is not done yet. Cleanup and return.
1931 *
1932 * High and low water marks for the read thread.
1933 * --------------------------------------------
1934 *
1935 * If sendfile() is used to send data over a slow network, we need to
1936 * make sure that the read thread does not produce data at a faster
1937 * rate than the network. This can happen if the disk is faster than
1938 * the network. In such a case, we don't want to build a very large queue.
1939 * But we would still like to get all of the network throughput possible.
1940 * This implies that network should never block waiting for data.
1941 * As there are a lot of disk throughput/network throughput combinations
1942 * possible, it is difficult to come up with an accurate number.
1943 * A typical 10K RPM disk has a max seek latency of 17ms and a rotational
1944 * latency of 3ms for reading a disk block. Thus, the total latency to
1945 * initiate a new read, transfer data from the disk and queue it for
1946 * transmission would be at most about 25ms. Today's max network transfer
1947 * rate is 100MB/sec. If the thread is blocked because of flow
1948 * control, it would take 25ms to get new data ready for transmission.
1949 * We have to make sure that the network is not idling while we are
1950 * initiating new transfers. So, at 100MB/sec, to keep the network busy we
1951 * would need 2.5MB of data. Rounding up, we keep the low water mark at 3MB.
1952 * We need to pick a high water mark so that the woken up thread would
1953 * do considerable work before blocking again to prevent thrashing. Currently,
1954 * we pick this to be 10 times that of the low water mark.
1955 *
1956 * Sendfile with segmap caching (One copy from page cache to mblks).
1957 * ----------------------------------------------------------------
1958 *
1959 * We use the segmap cache for caching the file, if the size of file
1960 * is <= sendfile_max_size. In this case we don't use threads as VM
1961 * is reasonably fast enough to keep up with the network. If the underlying
1962 * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
1963 * of data into segmap space, and use the virtual address from segmap
1964 * directly through desballoc() to avoid copy. Once the transport is done
1965 * with the data, the mapping will be released through segmap_release()
1966 * called by the call-back routine.
1967 *
1968 * If zero-copy is not allowed by the transport, we simply call VOP_READ()
1969 * to copy the data from the filesystem into our temporary network buffer.
1970 *
1971 * To disable caching, set sendfile_max_size to 0.
1972 */
1973
1974 uint_t sendfile_read_size = 1024 * 1024;
1975 #define SENDFILE_REQ_LOWAT (3 * 1024 * 1024)
1976 uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
1977 uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;
1978 struct sendfile_stats sf_stats;
1979 struct sendfile_queue *snfq;
1980 clock_t snfq_timeout;
1981 off64_t sendfile_max_size;
1982
1983 static void snf_enque(snf_req_t *, mblk_t *);
1984 static mblk_t *snf_deque(snf_req_t *);
1985
1986 void
1987 sendfile_init(void)
1988 {
1989 snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);
1990
1991 mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
1992 cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
1993 snfq->snfq_max_threads = max_ncpus;
1994 snfq_timeout = SNFQ_TIMEOUT;
1995 /* Cache all files by default. */
1996 sendfile_max_size = MAXOFFSET_T;
1997 }
1998
/*
 * Queues an mblk_t for network processing. Blocks while the amount of
 * queued data exceeds sr_hiwat, unless the writer has reported an error.
 */
2002 static void
2003 snf_enque(snf_req_t *sr, mblk_t *mp)
2004 {
2005 mp->b_next = NULL;
2006 mutex_enter(&sr->sr_lock);
2007 if (sr->sr_mp_head == NULL) {
2008 sr->sr_mp_head = sr->sr_mp_tail = mp;
2009 cv_signal(&sr->sr_cv);
2010 } else {
2011 sr->sr_mp_tail->b_next = mp;
2012 sr->sr_mp_tail = mp;
2013 }
2014 sr->sr_qlen += MBLKL(mp);
2015 while ((sr->sr_qlen > sr->sr_hiwat) &&
2016 (sr->sr_write_error == 0)) {
2017 sf_stats.ss_full_waits++;
2018 cv_wait(&sr->sr_cv, &sr->sr_lock);
2019 }
2020 mutex_exit(&sr->sr_lock);
2021 }
2022
/*
 * Dequeues an mblk_t for network processing; blocks until data is
 * available or the reader is done. Signals the reader once the queue
 * drains below sr_lowat.
 */
2026 static mblk_t *
2027 snf_deque(snf_req_t *sr)
2028 {
2029 mblk_t *mp;
2030
2031 mutex_enter(&sr->sr_lock);
	/*
	 * If we have encountered a read error, or the read is complete
	 * and there are no more mblks, return NULL. We need to check
	 * for a NULL sr_mp_head as well, since the reads could have
	 * completed with nothing more to come.
	 */
2039 if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
2040 ((sr->sr_read_error & SR_READ_DONE) &&
2041 sr->sr_mp_head == NULL)) {
2042 mutex_exit(&sr->sr_lock);
2043 return (NULL);
2044 }
2045 /*
	 * To start with, neither SR_READ_DONE is marked nor is an
	 * error set. When we wake up from cv_wait, the following are
	 * the possibilities:
2049 *
2050 * a) sr_read_error is zero and mblks are queued.
2051 * b) sr_read_error is set to SR_READ_DONE
2052 * and mblks are queued.
2053 * c) sr_read_error is set to SR_READ_DONE
2054 * and no mblks.
2055 * d) sr_read_error is set to some error other
2056 * than SR_READ_DONE.
2057 */
2058
2059 while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
2060 sf_stats.ss_empty_waits++;
2061 cv_wait(&sr->sr_cv, &sr->sr_lock);
2062 }
2063 /* Handle (a) and (b) first - the normal case. */
2064 if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
2065 (sr->sr_mp_head != NULL)) {
2066 mp = sr->sr_mp_head;
2067 sr->sr_mp_head = mp->b_next;
2068 sr->sr_qlen -= MBLKL(mp);
2069 if (sr->sr_qlen < sr->sr_lowat)
2070 cv_signal(&sr->sr_cv);
2071 mutex_exit(&sr->sr_lock);
2072 mp->b_next = NULL;
2073 return (mp);
2074 }
2075 /* Handle (c) and (d). */
2076 mutex_exit(&sr->sr_lock);
2077 return (NULL);
2078 }
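
/*
 * Together, snf_enque() and snf_deque() form the producer/consumer
 * queue between the read thread and the writer. A condensed sketch of
 * their use (this is what snf_async_read() and snf_direct_io() below
 * actually do):
 *
 *	reader:	while (data remains && sr->sr_write_error == 0)
 *			snf_enque(sr, mp);	(blocks above sr_hiwat)
 *	writer:	while ((mp = snf_deque(sr)) != NULL)
 *			socket_sendmblk(...);
 */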
2079
2080 /*
2081 * Reads data from the filesystem and queues it for network processing.
2082 */
2083 void
2084 snf_async_read(snf_req_t *sr)
2085 {
2086 size_t iosize;
2087 u_offset_t fileoff;
2088 u_offset_t size;
2089 int ret_size;
2090 int error;
2091 file_t *fp;
2092 mblk_t *mp;
2093 struct vnode *vp;
2094 int extra = 0;
2095 int maxblk = 0;
2096 int wroff = 0;
2097 struct sonode *so;
2098
2099 fp = sr->sr_fp;
2100 size = sr->sr_file_size;
2101 fileoff = sr->sr_file_off;
2102
	/*
	 * Ignore the error for filesystems that don't support DIRECTIO.
	 */
2106 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
2107 kcred, NULL, NULL);
2108
2109 vp = sr->sr_vp;
2110 if (vp->v_type == VSOCK) {
2111 stdata_t *stp;
2112
2113 /*
2114 * Get the extra space to insert a header and a trailer.
2115 */
2116 so = VTOSO(vp);
2117 stp = vp->v_stream;
2118 if (stp == NULL) {
2119 wroff = so->so_proto_props.sopp_wroff;
2120 maxblk = so->so_proto_props.sopp_maxblk;
2121 extra = wroff + so->so_proto_props.sopp_tail;
2122 } else {
2123 wroff = (int)(stp->sd_wroff);
2124 maxblk = (int)(stp->sd_maxblk);
2125 extra = wroff + (int)(stp->sd_tail);
2126 }
2127 }
2128
2129 while ((size != 0) && (sr->sr_write_error == 0)) {
2130
2131 iosize = (int)MIN(sr->sr_maxpsz, size);
2132
2133 /*
2134 * Socket filters can limit the mblk size,
2135 * so limit reads to maxblk if there are
2136 * filters present.
2137 */
2138 if (vp->v_type == VSOCK &&
2139 so->so_filter_active > 0 && maxblk != INFPSZ)
2140 iosize = (int)MIN(iosize, maxblk);
2141
2142 if (is_system_labeled()) {
2143 mp = allocb_cred(iosize + extra, CRED(),
2144 curproc->p_pid);
2145 } else {
2146 mp = allocb(iosize + extra, BPRI_MED);
2147 }
2148 if (mp == NULL) {
2149 error = EAGAIN;
2150 break;
2151 }
2152
2153 mp->b_rptr += wroff;
2154
2155 ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);
2156
		/* Error or reached EOF? */
2158 if ((error != 0) || (ret_size == 0)) {
2159 freeb(mp);
2160 break;
2161 }
2162 mp->b_wptr = mp->b_rptr + ret_size;
2163
2164 snf_enque(sr, mp);
2165 size -= ret_size;
2166 fileoff += ret_size;
2167 }
2168 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
2169 kcred, NULL, NULL);
2170 mutex_enter(&sr->sr_lock);
2171 sr->sr_read_error = error;
2172 sr->sr_read_error |= SR_READ_DONE;
2173 cv_signal(&sr->sr_cv);
2174 mutex_exit(&sr->sr_lock);
2175 }
2176
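/*
 * Service thread for the sendfile request queue: picks requests off
 * snfq and runs snf_async_read() on each, exiting after the queue has
 * been empty for snfq_timeout ticks.
 */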
2177 void
2178 snf_async_thread(void)
2179 {
2180 snf_req_t *sr;
2181 callb_cpr_t cprinfo;
2182 clock_t time_left = 1;
2183
2184 CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");
2185
2186 mutex_enter(&snfq->snfq_lock);
2187 for (;;) {
		/*
		 * If we didn't find an entry, block until woken up,
		 * and then look through the queue again.
		 */
2192 while ((sr = snfq->snfq_req_head) == NULL) {
2193 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2194 if (time_left <= 0) {
2195 snfq->snfq_svc_threads--;
2196 CALLB_CPR_EXIT(&cprinfo);
2197 thread_exit();
2198 /* NOTREACHED */
2199 }
2200 snfq->snfq_idle_cnt++;
2201
2202 time_left = cv_reltimedwait(&snfq->snfq_cv,
2203 &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
2204 snfq->snfq_idle_cnt--;
2205
2206 CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
2207 }
2208 snfq->snfq_req_head = sr->sr_next;
2209 snfq->snfq_req_cnt--;
2210 mutex_exit(&snfq->snfq_lock);
2211 snf_async_read(sr);
2212 mutex_enter(&snfq->snfq_lock);
2213 }
2214 }
2215
2216
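/*
 * Builds a read request for the given file range and queues it on snfq,
 * creating an additional service thread first if the existing ones are
 * all busy (up to snfq_max_threads). Despite its name, this function
 * only creates a thread when one is needed.
 */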
2217 snf_req_t *
2218 create_thread(int operation, struct vnode *vp, file_t *fp,
2219 u_offset_t fileoff, u_offset_t size)
2220 {
2221 snf_req_t *sr;
2222 stdata_t *stp;
2223
2224 sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);
2225
2226 sr->sr_vp = vp;
2227 sr->sr_fp = fp;
2228 stp = vp->v_stream;
2229
	/*
	 * Store sd_qn_maxpsz into sr_maxpsz while we have the stream head.
	 * The stream might be closed before the thread returns from
	 * snf_async_read().
	 */
2234 if (stp != NULL && stp->sd_qn_maxpsz > 0) {
2235 sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
2236 } else {
2237 sr->sr_maxpsz = MAXBSIZE;
2238 }
2239
2240 sr->sr_operation = operation;
2241 sr->sr_file_off = fileoff;
2242 sr->sr_file_size = size;
2243 sr->sr_hiwat = sendfile_req_hiwat;
2244 sr->sr_lowat = sendfile_req_lowat;
2245 mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
2246 cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
	/*
	 * See whether we need another thread to service this request.
	 * If there are at least as many queued requests as idle threads
	 * (snfq_req_cnt >= snfq_idle_cnt), create a new thread, provided
	 * we do not exceed snfq_max_threads.
	 */
2253 mutex_enter(&snfq->snfq_lock);
2254 if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
2255 snfq->snfq_svc_threads < snfq->snfq_max_threads) {
2256 (void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
2257 TS_RUN, minclsyspri);
2258 snfq->snfq_svc_threads++;
2259 }
2260 if (snfq->snfq_req_head == NULL) {
2261 snfq->snfq_req_head = snfq->snfq_req_tail = sr;
2262 cv_signal(&snfq->snfq_cv);
2263 } else {
2264 snfq->snfq_req_tail->sr_next = sr;
2265 snfq->snfq_req_tail = sr;
2266 }
2267 snfq->snfq_req_cnt++;
2268 mutex_exit(&snfq->snfq_lock);
2269 return (sr);
2270 }
2271
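/*
 * Writer side of the direct i/o path: hands the file read off to a
 * service thread via create_thread() and sends the queued mblks down
 * the socket, implementing the error protocol described in the big
 * comment above.
 */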
2272 int
2273 snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
2274 ssize_t *count)
2275 {
2276 snf_req_t *sr;
2277 mblk_t *mp;
2278 int iosize;
2279 int error = 0;
2280 short fflag;
2281 struct vnode *vp;
2282 int ksize;
2283 struct nmsghdr msg;
2284
2285 ksize = 0;
2286 *count = 0;
2287 bzero(&msg, sizeof (msg));
2288
2289 vp = fp->f_vnode;
2290 fflag = fp->f_flag;
2291 if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
2292 return (EAGAIN);
2293
	/*
	 * We check for a read error in snf_deque. It already has to
	 * check for a successful SR_READ_DONE and return NULL, so we
	 * might as well make the additional error check there.
	 */
2299 while ((mp = snf_deque(sr)) != NULL) {
2300
2301 if (ISSIG(curthread, JUSTLOOKING)) {
2302 freeb(mp);
2303 error = EINTR;
2304 break;
2305 }
2306 iosize = MBLKL(mp);
2307
2308 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2309
2310 if (error != 0) {
2311 if (mp != NULL)
2312 freeb(mp);
2313 break;
2314 }
2315 ksize += iosize;
2316 }
2317 *count = ksize;
2318
2319 mutex_enter(&sr->sr_lock);
2320 sr->sr_write_error = error;
	/* See the big comment above on why we cv_signal here. */
2322 cv_signal(&sr->sr_cv);
2323
2324 /* Wait for the reader to complete always. */
2325 while (!(sr->sr_read_error & SR_READ_DONE)) {
2326 cv_wait(&sr->sr_cv, &sr->sr_lock);
2327 }
2328 /* If there is no write error, check for read error. */
2329 if (error == 0)
2330 error = (sr->sr_read_error & ~SR_READ_DONE);
2331
2332 if (error != 0) {
2333 mblk_t *next_mp;
2334
2335 mp = sr->sr_mp_head;
2336 while (mp != NULL) {
2337 next_mp = mp->b_next;
2338 mp->b_next = NULL;
2339 freeb(mp);
2340 mp = next_mp;
2341 }
2342 }
2343 mutex_exit(&sr->sr_lock);
2344 kmem_free(sr, sizeof (snf_req_t));
2345 return (error);
2346 }
2347
/* Maximum no. of pages allocated by vpm for sendfile at a time */
2349 #define SNF_VPMMAXPGS (VPMMAXPGS/2)
2350
/*
 * Maximum no. of elements in the list returned by vpm, including
 * NULL for the last entry
 */
2355 #define SNF_MAXVMAPS (SNF_VPMMAXPGS + 1)
2356
typedef struct {
	unsigned int	snfv_ref;	/* refs held by in-flight mblks */
	frtn_t		snfv_frtn;	/* free routine passed to esballoca */
	vnode_t		*snfv_vp;	/* held vnode of the source file */
	struct vmap	snfv_vml[SNF_MAXVMAPS];	/* NULL-terminated mappings */
} snf_vmap_desbinfo;
2363
typedef struct {
	frtn_t		snfi_frtn;	/* free routine passed to esballoca */
	caddr_t		snfi_base;	/* segmap base address */
	uint_t		snfi_mapoff;	/* data offset within the mapping */
	size_t		snfi_len;	/* page-rounded length SOFTLOCKed */
	vnode_t		*snfi_vp;	/* held vnode of the source file */
} snf_smap_desbinfo;
2371
/*
 * The callback function used for vpm-mapped mblks. It is called when the
 * last ref of the mblk is dropped, which normally occurs when TCP receives
 * the ack, but it can also be the driver due to lazy reclaim.
 */
2377 void
2378 snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
2379 {
2380 ASSERT(snfv->snfv_ref != 0);
2381 if (atomic_add_32_nv(&snfv->snfv_ref, -1) == 0) {
2382 vpm_unmap_pages(snfv->snfv_vml, S_READ);
2383 VN_RELE(snfv->snfv_vp);
2384 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2385 }
2386 }
2387
/*
 * The callback function used for segmap'ped mblks. It is called when the
 * last ref of the mblk is dropped, which normally occurs when TCP receives
 * the ack, but it can also be the driver due to lazy reclaim.
 */
2393 void
2394 snf_smap_desbfree(snf_smap_desbinfo *snfi)
2395 {
2396 if (! IS_KPM_ADDR(snfi->snfi_base)) {
2397 /*
2398 * We don't need to call segmap_fault(F_SOFTUNLOCK) for
2399 * segmap_kpm as long as the latter never falls back to
2400 * "use_segmap_range". (See segmap_getmapflt().)
2401 *
		 * Using S_OTHER saves a redundant hat_setref() in
		 * segmap_unlock().
2404 */
2405 (void) segmap_fault(kas.a_hat, segkmap,
2406 (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
2407 snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
2408 F_SOFTUNLOCK, S_OTHER);
2409 }
2410 (void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
2411 VN_RELE(snfi->snfi_vp);
2412 kmem_free(snfi, sizeof (*snfi));
2413 }
2414
2415 /*
 * Use segmap or vpm instead of bcopy to send down a desballoca'ed mblk.
 * When segmap is used, the mblk contains a segmap slot of no more
 * than MAXBSIZE.
 *
 * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
 * in each iteration and sent by socket_sendmblk until an error occurs or
 * the requested size has been transferred. An mblk is esballoca'ed from
 * each mapped page and a chain of these mblks is sent to the transport
 * layer. vpm will be called to unmap the pages when all mblks have been
 * freed by free_func.
 *
 * At the end of the whole sendfile() operation, we wait until the data
 * from the last mblk is ack'ed by the transport before returning so that
 * the caller of sendfile() can safely modify the file content.
2430 */
2431 int
2432 snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t total_size,
2433 ssize_t *count, boolean_t nowait)
2434 {
2435 caddr_t base;
2436 int mapoff;
2437 vnode_t *vp;
2438 mblk_t *mp = NULL;
2439 int chain_size;
2440 int error;
2441 clock_t deadlk_wait;
2442 short fflag;
2443 int ksize;
2444 struct vattr va;
2445 boolean_t dowait = B_FALSE;
2446 struct nmsghdr msg;
2447
2448 vp = fp->f_vnode;
2449 fflag = fp->f_flag;
2450 ksize = 0;
2451 bzero(&msg, sizeof (msg));
2452
2453 for (;;) {
2454 if (ISSIG(curthread, JUSTLOOKING)) {
2455 error = EINTR;
2456 break;
2457 }
2458
2459 if (vpm_enable) {
2460 snf_vmap_desbinfo *snfv;
2461 mblk_t *nmp;
2462 int mblk_size;
2463 int maxsize;
2464 int i;
2465
2466 mapoff = fileoff & PAGEOFFSET;
2467 maxsize = MIN((SNF_VPMMAXPGS * PAGESIZE), total_size);
2468
2469 snfv = kmem_zalloc(sizeof (snf_vmap_desbinfo),
2470 KM_SLEEP);
2471
2472 /*
2473 * Get vpm mappings for maxsize with read access.
2474 * If the pages aren't available yet, we get
			 * EDEADLK, so wait and try again a little later using
2476 * an increasing wait. We might be here a long time.
2477 *
2478 * If delay_sig returns EINTR, be sure to exit and
2479 * pass it up to the caller.
2480 */
2481 deadlk_wait = 0;
2482 while ((error = vpm_map_pages(fvp, fileoff,
2483 (size_t)maxsize, (VPM_FETCHPAGE), snfv->snfv_vml,
2484 SNF_MAXVMAPS, NULL, S_READ)) == EDEADLK) {
2485 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2486 if ((error = delay_sig(deadlk_wait)) != 0) {
2487 break;
2488 }
2489 }
2490 if (error != 0) {
2491 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2492 error = (error == EINTR) ? EINTR : EIO;
2493 goto out;
2494 }
2495 snfv->snfv_frtn.free_func = snf_vmap_desbfree;
2496 snfv->snfv_frtn.free_arg = (caddr_t)snfv;
2497
2498 /* Construct the mblk chain from the page mappings */
2499 chain_size = 0;
2500 for (i = 0; (snfv->snfv_vml[i].vs_addr != NULL) &&
2501 total_size > 0; i++) {
2502 ASSERT(chain_size < maxsize);
2503 mblk_size = MIN(snfv->snfv_vml[i].vs_len -
2504 mapoff, total_size);
2505 nmp = esballoca(
2506 (uchar_t *)snfv->snfv_vml[i].vs_addr +
2507 mapoff, mblk_size, BPRI_HI,
2508 &snfv->snfv_frtn);
2509
2510 /*
2511 * We return EAGAIN after unmapping the pages
				 * if we cannot allocate the head of the
2513 * chain. Otherwise, we continue sending the
2514 * mblks constructed so far.
2515 */
2516 if (nmp == NULL) {
2517 if (i == 0) {
2518 vpm_unmap_pages(snfv->snfv_vml,
2519 S_READ);
2520 kmem_free(snfv,
2521 sizeof (snf_vmap_desbinfo));
2522 error = EAGAIN;
2523 goto out;
2524 }
2525 break;
2526 }
2527 /* Mark this dblk with the zero-copy flag */
2528 nmp->b_datap->db_struioflag |= STRUIO_ZC;
2529 nmp->b_wptr += mblk_size;
2530 chain_size += mblk_size;
2531 fileoff += mblk_size;
2532 total_size -= mblk_size;
2533 snfv->snfv_ref++;
2534 mapoff = 0;
2535 if (i > 0)
2536 linkb(mp, nmp);
2537 else
2538 mp = nmp;
2539 }
2540 VN_HOLD(fvp);
2541 snfv->snfv_vp = fvp;
2542 } else {
			/* vpm not supported; fall back to segmap */
2544 snf_smap_desbinfo *snfi;
2545
2546 mapoff = fileoff & MAXBOFFSET;
2547 chain_size = MAXBSIZE - mapoff;
2548 if (chain_size > total_size)
2549 chain_size = total_size;
2550 /*
			 * We don't forcefault because we'll call
			 * segmap_fault(F_SOFTLOCK) next.
2553 *
2554 * S_READ will get the ref bit set (by either
2555 * segmap_getmapflt() or segmap_fault()) and page
2556 * shared locked.
2557 */
2558 base = segmap_getmapflt(segkmap, fvp, fileoff,
2559 chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);
2560
2561 snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
			snfi->snfi_len = (size_t)roundup(mapoff + chain_size,
			    PAGESIZE) - (mapoff & PAGEMASK);
2564 /*
2565 * We must call segmap_fault() even for segmap_kpm
			 * because that's how errors get returned.
2567 * (segmap_getmapflt() never fails but segmap_fault()
2568 * does.)
2569 *
2570 * If the pages aren't available yet, we get
			 * EDEADLK, so wait and try again a little later using
2572 * an increasing wait. We might be here a long time.
2573 *
2574 * If delay_sig returns EINTR, be sure to exit and
2575 * pass it up to the caller.
2576 */
2577 deadlk_wait = 0;
2578 while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
2579 segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
2580 mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
2581 S_READ))) == EDEADLK) {
2582 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2583 if ((error = delay_sig(deadlk_wait)) != 0) {
2584 break;
2585 }
2586 }
2587 if (error != 0) {
2588 (void) segmap_release(segkmap, base, 0);
2589 kmem_free(snfi, sizeof (*snfi));
2590 error = (error == EINTR) ? EINTR : EIO;
2591 goto out;
2592 }
2593 snfi->snfi_frtn.free_func = snf_smap_desbfree;
2594 snfi->snfi_frtn.free_arg = (caddr_t)snfi;
2595 snfi->snfi_base = base;
2596 snfi->snfi_mapoff = mapoff;
2597 mp = esballoca((uchar_t *)base + mapoff, chain_size,
2598 BPRI_HI, &snfi->snfi_frtn);
2599
2600 if (mp == NULL) {
2601 (void) segmap_fault(kas.a_hat, segkmap,
2602 (caddr_t)(uintptr_t)(((uintptr_t)base +
2603 mapoff) & PAGEMASK), snfi->snfi_len,
2604 F_SOFTUNLOCK, S_OTHER);
2605 (void) segmap_release(segkmap, base, 0);
2606 kmem_free(snfi, sizeof (*snfi));
2607 freemsg(mp);
2608 error = EAGAIN;
2609 goto out;
2610 }
2611 VN_HOLD(fvp);
2612 snfi->snfi_vp = fvp;
2613 mp->b_wptr += chain_size;
2614
2615 /* Mark this dblk with the zero-copy flag */
2616 mp->b_datap->db_struioflag |= STRUIO_ZC;
2617 fileoff += chain_size;
2618 total_size -= chain_size;
2619 }
2620
2621 if (total_size == 0 && !nowait) {
2622 ASSERT(!dowait);
2623 dowait = B_TRUE;
2624 mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
2625 }
2626 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2627 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2628 if (error != 0) {
2629 /*
2630 * mp contains the mblks that were not sent by
			 * socket_sendmblk. Use its size to update *count.
2632 */
2633 *count = ksize + (chain_size - msgdsize(mp));
2634 if (mp != NULL)
2635 freemsg(mp);
2636 return (error);
2637 }
2638 ksize += chain_size;
2639 if (total_size == 0)
2640 goto done;
2641
2642 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2643 va.va_mask = AT_SIZE;
2644 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2645 if (error)
2646 break;
2647 /* Read as much as possible. */
2648 if (fileoff >= va.va_size)
2649 break;
2650 if (total_size + fileoff > va.va_size)
2651 total_size = va.va_size - fileoff;
2652 }
2653 out:
2654 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2655 done:
2656 *count = ksize;
2657 if (dowait) {
2658 stdata_t *stp;
2659
2660 stp = vp->v_stream;
2661 if (stp == NULL) {
2662 struct sonode *so;
2663 so = VTOSO(vp);
2664 error = so_zcopy_wait(so);
2665 } else {
2666 mutex_enter(&stp->sd_lock);
2667 while (!(stp->sd_flag & STZCNOTIFY)) {
2668 if (cv_wait_sig(&stp->sd_zcopy_wait,
2669 &stp->sd_lock) == 0) {
2670 error = EINTR;
2671 break;
2672 }
2673 }
2674 stp->sd_flag &= ~STZCNOTIFY;
2675 mutex_exit(&stp->sd_lock);
2676 }
2677 }
2678 return (error);
2679 }
2680
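/*
 * One-copy path: reads the file through the page cache with VOP_READ()
 * into freshly allocated mblks and sends them with socket_sendmblk().
 * Used when zero-copy is not enabled or not possible.
 */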
2681 int
2682 snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
2683 uint_t maxpsz, ssize_t *count)
2684 {
2685 struct vnode *vp;
2686 mblk_t *mp;
2687 int iosize;
2688 int extra = 0;
2689 int error;
2690 short fflag;
2691 int ksize;
2692 int ioflag;
2693 struct uio auio;
2694 struct iovec aiov;
2695 struct vattr va;
2696 int maxblk = 0;
2697 int wroff = 0;
2698 struct sonode *so;
2699 struct nmsghdr msg;
2700
2701 vp = fp->f_vnode;
2702 if (vp->v_type == VSOCK) {
2703 stdata_t *stp;
2704
2705 /*
2706 * Get the extra space to insert a header and a trailer.
2707 */
2708 so = VTOSO(vp);
2709 stp = vp->v_stream;
2710 if (stp == NULL) {
2711 wroff = so->so_proto_props.sopp_wroff;
2712 maxblk = so->so_proto_props.sopp_maxblk;
2713 extra = wroff + so->so_proto_props.sopp_tail;
2714 } else {
2715 wroff = (int)(stp->sd_wroff);
2716 maxblk = (int)(stp->sd_maxblk);
2717 extra = wroff + (int)(stp->sd_tail);
2718 }
2719 }
2720 bzero(&msg, sizeof (msg));
2721 fflag = fp->f_flag;
2722 ksize = 0;
2723 auio.uio_iov = &aiov;
2724 auio.uio_iovcnt = 1;
2725 auio.uio_segflg = UIO_SYSSPACE;
2726 auio.uio_llimit = MAXOFFSET_T;
2727 auio.uio_fmode = fflag;
2728 auio.uio_extflg = UIO_COPY_CACHED;
2729 ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
2730 /* If read sync is not asked for, filter sync flags */
2731 if ((ioflag & FRSYNC) == 0)
2732 ioflag &= ~(FSYNC|FDSYNC);
2733 for (;;) {
2734 if (ISSIG(curthread, JUSTLOOKING)) {
2735 error = EINTR;
2736 break;
2737 }
2738 iosize = (int)MIN(maxpsz, size);
2739
2740 /*
2741 * Socket filters can limit the mblk size,
2742 * so limit reads to maxblk if there are
2743 * filters present.
2744 */
2745 if (vp->v_type == VSOCK &&
2746 so->so_filter_active > 0 && maxblk != INFPSZ)
2747 iosize = (int)MIN(iosize, maxblk);
2748
2749 if (is_system_labeled()) {
2750 mp = allocb_cred(iosize + extra, CRED(),
2751 curproc->p_pid);
2752 } else {
2753 mp = allocb(iosize + extra, BPRI_MED);
2754 }
2755 if (mp == NULL) {
2756 error = EAGAIN;
2757 break;
2758 }
2759
2760 mp->b_rptr += wroff;
2761
2762 aiov.iov_base = (caddr_t)mp->b_rptr;
2763 aiov.iov_len = iosize;
2764 auio.uio_loffset = fileoff;
2765 auio.uio_resid = iosize;
2766
2767 error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
2768 iosize -= auio.uio_resid;
2769
2770 if (error == EINTR && iosize != 0)
2771 error = 0;
2772
2773 if (error != 0 || iosize == 0) {
2774 freeb(mp);
2775 break;
2776 }
2777 mp->b_wptr = mp->b_rptr + iosize;
2778
2779 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2780
2781 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2782
2783 if (error != 0) {
2784 *count = ksize;
2785 if (mp != NULL)
2786 freeb(mp);
2787 return (error);
2788 }
2789 ksize += iosize;
2790 size -= iosize;
2791 if (size == 0)
2792 goto done;
2793
2794 fileoff += iosize;
2795 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2796 va.va_mask = AT_SIZE;
2797 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2798 if (error)
2799 break;
2800 /* Read as much as possible. */
2801 if (fileoff >= va.va_size)
2802 size = 0;
2803 else if (size + fileoff > va.va_size)
2804 size = va.va_size - fileoff;
2805 }
2806 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2807 done:
2808 *count = ksize;
2809 return (error);
2810 }
2811
2812 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
2813 /*
 * Largefile support for 32-bit applications only.
2815 */
2816 int
2817 sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
2818 ssize32_t *count32)
2819 {
2820 ssize32_t sfv_len;
2821 u_offset_t sfv_off, va_size;
2822 struct vnode *vp, *fvp, *realvp;
2823 struct vattr va;
2824 stdata_t *stp;
2825 ssize_t count = 0;
2826 int error = 0;
2827 boolean_t dozcopy = B_FALSE;
2828 uint_t maxpsz;
2829
2830 sfv_len = (ssize32_t)sfv->sfv_len;
2831 if (sfv_len < 0) {
2832 error = EINVAL;
2833 goto out;
2834 }
2835
	if (sfv_len == 0)
		goto out;
2837
2838 sfv_off = (u_offset_t)sfv->sfv_off;
2839
2840 /* Same checks as in pread */
2841 if (sfv_off > MAXOFFSET_T) {
2842 error = EINVAL;
2843 goto out;
2844 }
2845 if (sfv_off + sfv_len > MAXOFFSET_T)
2846 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
2847
2848 /*
2849 * There are no more checks on sfv_len. So, we cast it to
2850 * u_offset_t and share the snf_direct_io/snf_cache code between
2851 * 32 bit and 64 bit.
2852 *
2853 * TODO: should do nbl_need_check() like read()?
2854 */
2855 if (sfv_len > sendfile_max_size) {
2856 sf_stats.ss_file_not_cached++;
2857 error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
2858 &count);
2859 goto out;
2860 }
2861 fvp = rfp->f_vnode;
2862 if (VOP_REALVP(fvp, &realvp, NULL) == 0)
2863 fvp = realvp;
2864 /*
2865 * Grab the lock as a reader to prevent the file size
2866 * from changing underneath.
2867 */
2868 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2869 va.va_mask = AT_SIZE;
2870 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2871 va_size = va.va_size;
2872 if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
2873 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2874 goto out;
2875 }
2876 /* Read as much as possible. */
2877 if (sfv_off + sfv_len > va_size)
2878 sfv_len = va_size - sfv_off;
2879
2880 vp = fp->f_vnode;
2881 stp = vp->v_stream;
2882 /*
2883 * When the NOWAIT flag is not set, we enable zero-copy only if the
2884 * transfer size is large enough. This prevents performance loss
2885 * when the caller sends the file piece by piece.
2886 */
2887 if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
2888 (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
2889 !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
2890 uint_t copyflag;
2891 copyflag = stp != NULL ? stp->sd_copyflag :
2892 VTOSO(vp)->so_proto_props.sopp_zcopyflag;
2893 if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
2894 int on = 1;
2895
2896 if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
2897 SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
2898 dozcopy = B_TRUE;
2899 } else {
2900 dozcopy = copyflag & STZCVMSAFE;
2901 }
2902 }
2903 if (dozcopy) {
2904 sf_stats.ss_file_segmap++;
2905 error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2906 &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
2907 } else {
2908 if (vp->v_type == VSOCK && stp == NULL) {
2909 sonode_t *so = VTOSO(vp);
2910 maxpsz = so->so_proto_props.sopp_maxpsz;
2911 } else if (stp != NULL) {
2912 maxpsz = stp->sd_qn_maxpsz;
2913 } else {
2914 maxpsz = maxphys;
2915 }
2916
2917 if (maxpsz == INFPSZ)
2918 maxpsz = maxphys;
2919 else
2920 maxpsz = roundup(maxpsz, MAXBSIZE);
2921 sf_stats.ss_file_cached++;
2922 error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2923 maxpsz, &count);
2924 }
2925 out:
2926 releasef(sfv->sfv_fd);
2927 *count32 = (ssize32_t)count;
2928 return (error);
2929 }
2930 #endif
2931
2932 #ifdef _SYSCALL32_IMPL
2933 /*
2934 * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
2935 * ssize_t rather than ssize32_t; see the comments above read32 for details.
2936 */
2937
2938 ssize_t
2939 recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2940 {
2941 return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2942 }
2943
2944 ssize_t
2945 recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2946 caddr32_t name, caddr32_t namelenp)
2947 {
2948 return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2949 (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
2950 }
2951
2952 ssize_t
2953 send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2954 {
2955 return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2956 }
2957
2958 ssize_t
2959 sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2960 caddr32_t name, socklen_t namelen)
2961 {
2962 return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2963 (void *)(uintptr_t)name, namelen));
2964 }
2965 #endif /* _SYSCALL32_IMPL */
2966
2967 /*
2968 * Function wrappers (mostly around the sonode switch) for
2969 * backward compatibility.
2970 */
2971
2972 int
2973 soaccept(struct sonode *so, int fflag, struct sonode **nsop)
2974 {
2975 return (socket_accept(so, fflag, CRED(), nsop));
2976 }
2977
2978 int
2979 sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
2980 int backlog, int flags)
2981 {
2982 int error;
2983
2984 error = socket_bind(so, name, namelen, flags, CRED());
2985 if (error == 0 && backlog != 0)
2986 return (socket_listen(so, backlog, CRED()));
2987
2988 return (error);
2989 }
2990
2991 int
2992 solisten(struct sonode *so, int backlog)
2993 {
2994 return (socket_listen(so, backlog, CRED()));
2995 }
2996
2997 int
2998 soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
2999 int fflag, int flags)
3000 {
3001 return (socket_connect(so, name, namelen, fflag, flags, CRED()));
3002 }
3003
3004 int
3005 sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
3006 {
3007 return (socket_recvmsg(so, msg, uiop, CRED()));
3008 }
3009
3010 int
3011 sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
3012 {
3013 return (socket_sendmsg(so, msg, uiop, CRED()));
3014 }
3015
3016 int
3017 soshutdown(struct sonode *so, int how)
3018 {
3019 return (socket_shutdown(so, how, CRED()));
3020 }
3021
3022 int
3023 sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
3024 socklen_t *optlenp, int flags)
3025 {
3026 return (socket_getsockopt(so, level, option_name, optval, optlenp,
3027 flags, CRED()));
3028 }
3029
3030 int
3031 sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
3032 t_uscalar_t optlen)
3033 {
3034 return (socket_setsockopt(so, level, option_name, optval, optlen,
3035 CRED()));
3036 }
3037
3038 /*
3039 * Because this is backward compatibility interface it only needs to be
3040 * able to handle the creation of TPI sockfs sockets.
3041 */
3042 struct sonode *
3043 socreate(struct sockparams *sp, int family, int type, int protocol, int version,
3044 int *errorp)
3045 {
3046 struct sonode *so;
3047
3048 ASSERT(sp != NULL);
3049
3050 so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol,
3051 version, SOCKET_SLEEP, errorp, CRED());
3052 if (so == NULL) {
3053 SOCKPARAMS_DEC_REF(sp);
3054 } else {
3055 if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
3056 /* Cannot fail, only bumps so_count */
3057 (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);
3058 } else {
3059 socket_destroy(so);
3060 so = NULL;
3061 }
3062 }
3063 return (so);
3064 }